::p_load(tidyr,plyr, dplyr, lattice, stargazer,readr, sjPlot) pacman
Introducción al análisis longitudinal
Correspondiente a la sesión del jueves, 5 de junio de 2025
Introducción análisis longitudinal
Datos
- National Youth Survey (Raudenbusch & Chan, 1992)
- Encuesta anual desde los 11 años en adelante, 5 olas, una al año (i.e. hasta 15 años)
- Medida: instrumento de tolerancia TOL a conductas desviadas, promedio 9 items, likert 1-4 desde “muy mal” hasta “nada mal”. Por ejemplo “fumar marihuana”, “copiar en una prueba” .Mayores puntajes indican mayor tolerancia a desviación.
- Predictores: sexo MALE=1 y autoreporte de exposición a conductas desviadas a los 11 años EXPOSURE. Esta variable se construye estimando la proporción de amigos cercanos que se han visto envueltos en actividades de los mismos 9 items de autoreporte (1=ninguno, 4=todos), los cuales se promedian.
Lectura datos nivel-persona
<- read.csv("https://github.com/cursos-metodos-facso/multinivel-facso/raw/refs/heads/main/assignment/data/tolerance1.txt")
tolerance print(tolerance,row.names=FALSE)
id tol11 tol12 tol13 tol14 tol15 male exposure
9 2.23 1.79 1.90 2.12 2.66 0 1.54
45 1.12 1.45 1.45 1.45 1.99 1 1.16
268 1.45 1.34 1.99 1.79 1.34 1 0.90
314 1.22 1.22 1.55 1.12 1.12 0 0.81
442 1.45 1.99 1.45 1.67 1.90 0 1.13
514 1.34 1.67 2.23 2.12 2.44 1 0.90
569 1.79 1.90 1.90 1.99 1.99 0 1.99
624 1.12 1.12 1.22 1.12 1.22 1 0.98
723 1.22 1.34 1.12 1.00 1.12 0 0.81
918 1.00 1.00 1.22 1.99 1.22 0 1.21
949 1.99 1.55 1.12 1.45 1.55 1 0.93
978 1.22 1.34 2.12 3.46 3.32 1 1.59
1105 1.34 1.90 1.99 1.90 2.12 1 1.38
1542 1.22 1.22 1.99 1.79 2.12 0 1.44
1552 1.00 1.12 2.23 1.55 1.55 0 1.04
1653 1.11 1.11 1.34 1.55 2.12 0 1.25
stargazer(tolerance, type="text")
=========================================
Statistic N Mean St. Dev. Min Max
-----------------------------------------
id 16 762.812 516.239 9 1,653
tol11 16 1.364 0.353 1.000 2.230
tol12 16 1.441 0.321 1.000 1.990
tol13 16 1.676 0.406 1.120 2.230
tol14 16 1.754 0.575 1.000 3.460
tol15 16 1.861 0.618 1.120 3.320
male 16 0.438 0.512 0 1
exposure 16 1.191 0.329 0.810 1.990
-----------------------------------------
Transformación datos período-persona función pivot_longer()
de la librería tidyr
attach(tolerance)
# Pasar a long
<- pivot_longer(
tolerance.pp data = tolerance,
cols = tol11:tol15,
names_to = "variable",
values_to = "value",
names_transform = list(variable = as.factor)
)
# Algunas comparaciones entre wide y long
names(tolerance)
[1] "id" "tol11" "tol12" "tol13" "tol14" "tol15" "male"
[8] "exposure"
head(tolerance)
id tol11 tol12 tol13 tol14 tol15 male exposure
1 9 2.23 1.79 1.90 2.12 2.66 0 1.54
2 45 1.12 1.45 1.45 1.45 1.99 1 1.16
3 268 1.45 1.34 1.99 1.79 1.34 1 0.90
4 314 1.22 1.22 1.55 1.12 1.12 0 0.81
5 442 1.45 1.99 1.45 1.67 1.90 0 1.13
6 514 1.34 1.67 2.23 2.12 2.44 1 0.90
dim(tolerance)
[1] 16 8
names(tolerance.pp)
[1] "id" "male" "exposure" "variable" "value"
head(tolerance.pp)
# A tibble: 6 × 5
id male exposure variable value
<int> <int> <dbl> <fct> <dbl>
1 9 0 1.54 tol11 2.23
2 9 0 1.54 tol12 1.79
3 9 0 1.54 tol13 1.9
4 9 0 1.54 tol14 2.12
5 9 0 1.54 tol15 2.66
6 45 1 1.16 tol11 1.12
dim(tolerance.pp)
[1] 80 5
%>% slice(1:32) # muestra dos primeras medidas de tolerancia (16 casos primera, 16 segunda), ordenado por mediciones en lugar de casos tolerance.pp
# A tibble: 32 × 5
id male exposure variable value
<int> <int> <dbl> <fct> <dbl>
1 9 0 1.54 tol11 2.23
2 9 0 1.54 tol12 1.79
3 9 0 1.54 tol13 1.9
4 9 0 1.54 tol14 2.12
5 9 0 1.54 tol15 2.66
6 45 1 1.16 tol11 1.12
7 45 1 1.16 tol12 1.45
8 45 1 1.16 tol13 1.45
9 45 1 1.16 tol14 1.45
10 45 1 1.16 tol15 1.99
# ℹ 22 more rows
# Sort by id
= arrange(tolerance.pp, id)
tolerance.pp %>% slice(1:15) # muestra tres primeras ID con sus correspondientes 5 mediciones tolerance.pp
# A tibble: 15 × 5
id male exposure variable value
<int> <int> <dbl> <fct> <dbl>
1 9 0 1.54 tol11 2.23
2 9 0 1.54 tol12 1.79
3 9 0 1.54 tol13 1.9
4 9 0 1.54 tol14 2.12
5 9 0 1.54 tol15 2.66
6 45 1 1.16 tol11 1.12
7 45 1 1.16 tol12 1.45
8 45 1 1.16 tol13 1.45
9 45 1 1.16 tol14 1.45
10 45 1 1.16 tol15 1.99
11 268 1 0.9 tol11 1.45
12 268 1 0.9 tol12 1.34
13 268 1 0.9 tol13 1.99
14 268 1 0.9 tol14 1.79
15 268 1 0.9 tol15 1.34
# adecuar a cap. 2 Singer & Willet
$age=as.numeric(tolerance.pp$variable)+10 # para identificar edad
tolerance.pp= dplyr::rename(tolerance.pp,tolerance=value)
tolerance.pp names(tolerance.pp)
[1] "id" "male" "exposure" "variable" "tolerance" "age"
attach(tolerance.pp)
Plot trayectorias
xyplot(tolerance ~ age | id, data=tolerance.pp, as.table=T)
Matriz de correlaciones
options(digits=2)
=cor(tolerance[ , 2:6]) cortol
stargazer(cortol, type = "html")
tol11 | tol12 | tol13 | tol14 | tol15 | |
tol11 | 1 | 0.660 | 0.062 | 0.140 | 0.260 |
tol12 | 0.660 | 1 | 0.250 | 0.210 | 0.390 |
tol13 | 0.062 | 0.250 | 1 | 0.590 | 0.570 |
tol14 | 0.140 | 0.210 | 0.590 | 1 | 0.820 |
tol15 | 0.260 | 0.390 | 0.570 | 0.820 | 1 |
Plot trayectorias no paramétricas
xyplot(tolerance~age | id, data=tolerance.pp,
prepanel = function(x, y) prepanel.loess(x, y, family="gaussian"),
xlab = "Age", ylab = "Tolerance",
panel = function(x, y) {
panel.xyplot(x, y)
panel.loess(x,y, family="gaussian") },
ylim=c(0, 4), as.table=T)
Aproximación de modelos individuales (regresiones)
$agec=tolerance.pp$age-11 # centrar
tolerance.pp
# Generar regresiones individuales
by(tolerance.pp, id, function(x) summary(lm(tolerance ~ agec, data=x)))
Análisis de interceptos, pendientes y
Interceptos
<- by(tolerance.pp, id, function(data)
int coefficients(lm(tolerance ~ agec, data = data))[[1]])
<- unlist(int)
int names(int) <- NULL
intsummary(int)
stem(int, scale=3)
Pendientes/tasa de cambio
<- by(tolerance.pp, id, function(data)
rate coefficients(lm(tolerance ~ age, data = data))[[2]])
<- unlist(rate)
rate names(rate) <- NULL
summary(rate)
stem(rate, scale=2)
<- by(tolerance.pp, id, function(data)
rsq summary(lm(tolerance ~ agec, data = data))$r.squared)
<- unlist(rsq)
rsq names(rsq) <- NULL
summary(rsq)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0154 0.2505 0.3917 0.4910 0.7971 0.8864
stem(rsq, scale=2)
The decimal point is 1 digit(s) to the left of the |
0 | 27
1 | 3
2 | 55
3 | 113
4 | 5
5 |
6 | 8
7 | 78
8 | 6889
Plot de trayectorias individuales
xyplot(tolerance ~ age | id, data=tolerance.pp,
panel = function(x, y){
panel.xyplot(x, y)
panel.lmline(x, y)
ylim=c(0, 4), as.table=T) },
Plot de trayectorias brutas conjuntas
interaction.plot(tolerance.pp$age, tolerance.pp$id, tolerance.pp$tolerance)
Plot de trayectorias lineales conjuntas
# fitting the linear model by id
<- by(tolerance.pp, id, function(bydata) fitted.values(lm(tolerance ~ age, data=bydata)))
fit <- unlist(fit)
fit
# plotting the linear fit by id
interaction.plot(tolerance.pp$age, tolerance.pp$id, fit, xlab="age", ylab="tolerance")
Análisis del efecto de variables de nivel 2 (sexo)
Hombres
# fitting the linear model by id, males only
<- tolerance.pp[male==1 , ]
tolm <- by(tolm, tolm$id, function(bydata) fitted.values(lm(tolerance ~ age, data=bydata)))
fitmlist <- unlist(fitmlist)
fitm
#appending the average for the whole group
<- fitted( lm(tolerance ~ age, data=tolm) )
lm.m names(lm.m) <- NULL
<- c(fitm, lm.m[1:5])
fit.m2 <- c(tolm$age, seq(11,15))
age.m <- c(tolm$id, rep(111, 5))
id.m
#plotting the linear fit by id, males
#id.m=111 denotes the average value for males
interaction.plot(age.m, id.m, fit.m2, ylim=c(0, 4), xlab="AGE", ylab="TOLERANCE", lwd=1, main="Males")
Mujeres
#fitting the linear model by id, females only
<- tolerance.pp[tolerance.pp$male==0 , ]
tol.pp.fm <- by(tol.pp.fm, tol.pp.fm$id,
fit.fm function(data) fitted.values(lm(tolerance ~ age, data=data)))
<- unlist(fit.fm)
fit.fm1 names(fit.fm1) <- NULL
#appending the average for the whole group
<- fitted( lm(tolerance ~ age, data=tol.pp.fm) )
lm.fm names(lm.fm) <- NULL
<- c(fit.fm1, lm.fm[1:5])
fit.fm2 <- c(tol.pp.fm$age, seq(11,15))
age.fm <- c(tol.pp.fm$id, rep(1111, 5))
id.fm
#plotting the linear fit by id, females
#id.fm=1111 denotes the average for females
interaction.plot(age.fm, id.fm, fit.fm2, ylim=c(0, 4), xlab="AGE", ylab="TOLERANCE", lwd=1,main="Females" )
Análisis longitudinal II: Estimación
Sesión basada principalmente en Singer & Willett ALDA (Applied Longitudinal Data Analysis), capítulos 2 & 3
Cargar paquetes
::p_load(
pacman
haven,
lme4,
dplyr,
lattice,
texreg )
Ejemplo Singer & Willett cap. 3
- Datos: estudio sobre efecto de intervencíon temprana en habilidades cognitivas
- 3 mediciones a 103 niños de familias de bajos ingresos
- A los 6 meses, la mitad (58) fue asignado a una intervención de estimulación cognitiva
- Mediciones a los 12, 18 y 24 meses
- Variable dependiente: puntaje en habilidades cognitivas (cog)
Lectura de datos
<-read_stata("https://github.com/cursos-metodos-facso/multinivel-facso/raw/refs/heads/main/assignment/data/earlyint_pp.dta")
early.intsummary(early.int)
id cog age program time
Min. : 68 Min. : 57 Min. :1.0 Min. :0.000 Min. :0.0
1st Qu.:102 1st Qu.: 92 1st Qu.:1.0 1st Qu.:0.000 1st Qu.:0.0
Median :144 Median :103 Median :1.5 Median :1.000 Median :0.5
Mean :474 Mean :103 Mean :1.5 Mean :0.563 Mean :0.5
3rd Qu.:940 3rd Qu.:113 3rd Qu.:2.0 3rd Qu.:1.000 3rd Qu.:1.0
Max. :985 Max. :137 Max. :2.0 Max. :1.000 Max. :1.0
Exploración con casos seleccionados
$id early.int
[1] 68 68 68 70 70 70 71 71 71 72 72 72 75 75 75 76 76 76
[19] 77 77 77 79 79 79 80 80 80 81 81 81 83 83 83 84 84 84
[37] 86 86 86 87 87 87 89 89 89 90 90 90 91 91 91 92 92 92
[55] 93 93 93 96 96 96 97 97 97 98 98 98 99 99 99 100 100 100
[73] 101 101 101 102 102 102 105 105 105 106 106 106 107 107 107 109 109 109
[91] 110 110 110 111 111 111 112 112 112 113 113 113 114 114 114 115 115 115
[109] 116 116 116 117 117 117 120 120 120 121 121 121 122 122 122 125 125 125
[127] 126 126 126 127 127 127 128 128 128 129 129 129 134 134 134 135 135 135
[145] 136 136 136 137 137 137 143 143 143 144 144 144 145 145 145 146 146 146
[163] 148 148 148 149 149 149 151 151 151 152 152 152 902 902 902 904 904 904
[181] 906 906 906 908 908 908 909 909 909 911 911 911 913 913 913 916 916 916
[199] 917 917 917 919 919 919 924 924 924 925 925 925 926 926 926 929 929 929
[217] 931 931 931 934 934 934 935 935 935 936 936 936 938 938 938 940 940 940
[235] 941 941 941 942 942 942 944 944 944 947 947 947 949 949 949 950 950 950
[253] 953 953 953 960 960 960 964 964 964 966 966 966 967 967 967 968 968 968
[271] 969 969 969 970 970 970 971 971 971 972 972 972 973 973 973 975 975 975
[289] 976 976 976 977 977 977 979 979 979 980 980 980 982 982 982 984 984 984
[307] 985 985 985
attr(,"format.stata")
[1] "%9.0g"
%>% group_by(id) %>% summarise(mean(program)) early.int
# A tibble: 103 × 2
id `mean(program)`
<dbl> <dbl>
1 68 1
2 70 1
3 71 1
4 72 1
5 75 1
6 76 1
7 77 1
8 79 1
9 80 1
10 81 1
# ℹ 93 more rows
<- early.int %>% filter(id >= 68 & id <=75 | id >= 902 & id <=906)
early.int1
early.int1
# A tibble: 24 × 5
id cog age program time
<dbl> <dbl> <dbl> <dbl> <dbl>
1 68 103 1 1 0
2 68 119 1.5 1 0.5
3 68 96 2 1 1
4 70 106 1 1 0
5 70 107 1.5 1 0.5
6 70 96 2 1 1
7 71 112 1 1 0
8 71 86 1.5 1 0.5
9 71 73 2 1 1
10 72 100 1 1 0
# ℹ 14 more rows
xyplot(cog~age | id, data=early.int1,
panel = function(x, y){
panel.xyplot(x, y)
panel.lmline(x, y)
ylim=c(50, 150), as.table=T) },
Estimación de modelo lineal por id y plot
<- by(early.int, early.int$id, function(data) fitted.values(lm(cog ~ age, data=data)))
fit <- unlist(fit)
fit1 names(fit1) <- NULL
#plotting the linear fit by id
xyplot(cog ~ age, groups=id, data=early.int, type="r", ylim=c(50,150))
Plot según participación en programa de estimulación de habilidades cognitivas
#plotting the linear fit by idm program = 0
=subset(early.int, program<1)
early.int0xyplot(cog ~ age, groups=id, data=early.int0, type="r", ylim=c(50,150), main = "Program=0")
#plotting the linear fit by idm program = 1
=subset(early.int, program>0)
early.int01xyplot(cog ~ age, groups=id, data=early.int01, type="r", ylim=c(50,150), main = "Program=1")
Estimación multinivel
<- lmer(cog ~ 1 + (1|id), data=early.int, REML=FALSE)
results0
::ICC(results0) reghelper
[1] 0.34
Modelos con variable nivel 2, variable nivel 1 e interacción
<- lmer(cog ~ 1 + program + (1|id), data=early.int, REML=FALSE)
results1 summary(results1)
<- lmer(cog ~ 1 + program + time+ (1|id), data=early.int, REML=FALSE)
results2 summary(results2)
<- lmer(cog ~ time*program + (time|id), data=early.int, REML=FALSE)
results3 summary(results3)
htmlreg(list(results0, results1,results2, results3))
Model 1 | Model 2 | Model 3 | Model 4 | |
---|---|---|---|---|
(Intercept) | 102.62*** | 97.27*** | 106.36*** | 107.84*** |
(1.16) | (1.61) | (1.72) | (2.03) | |
program | 9.49*** | 9.49*** | 6.85* | |
(2.14) | (2.14) | (2.71) | ||
time | -18.17*** | -21.13*** | ||
(1.24) | (1.88) | |||
time:program | 5.27* | |||
(2.51) | ||||
AIC | 2551.12 | 2535.12 | 2389.84 | 2385.94 |
BIC | 2562.32 | 2550.05 | 2408.50 | 2415.81 |
Log Likelihood | -1272.56 | -1263.56 | -1189.92 | -1184.97 |
Num. obs. | 309 | 309 | 309 | 309 |
Num. groups: id | 103 | 103 | 103 | 103 |
Var: id (Intercept) | 84.35 | 62.19 | 89.69 | 123.97 |
Var: Residual | 161.50 | 161.50 | 79.01 | 74.76 |
Var: id time | 10.10 | |||
Cov: id (Intercept) time | -35.38 | |||
***p < 0.001; **p < 0.01; *p < 0.05 |
Gráfico de interacción
<-fitted.values (results3)
ainteraction.plot(early.int$age, early.int$program, a, xlab="AGE", ylab="COG",
ylim=c(50, 150), lwd=4, lty=1, col=4:5)