This example reproduces some of the analyses in the lecture notes for the SEM model of health care utilization.
library(lavaan) # model fitting
library(semPlot) # path diagrams
library(dplyr) # data manipulation
healthdat <- read.table("http://euclid.psych.yorku.ca/datavis/courses/factor/data/healthutil.txt")
names(healthdat)
## [1] "age" "stress" "esteem" "marriage" "control" "physical"
## [7] "mental" "druguse" "drvisits"
=~
specifies measurement equations for the latent variables; ~
for regressions; ~~
for correlations or covariances
health.mod1 <- '
# latent variables
Self =~ esteem + marriage + control
Ill =~ physical + mental
Util =~ druguse + drvisits
# regressions
Ill ~ age + stress + Self
Util ~ age + stress + Ill
# correlations (phi)
Self ~~ age + stress
age ~~ stress
'
health.sem1 <- lavaan::sem(health.mod1,
data=healthdat,
estimator="ML",
fixed.x=FALSE)
Quick look at fit measures. This model doesn’t fit well by any criteria
fitMeasures(health.sem1, c("chisq", "df", "pvalue", "cfi", "rmsea"))
## chisq df pvalue cfi rmsea
## 111.521 20.000 0.000 0.878 0.101
For more detailed output, use summary()
summary(health.sem1, standardized=TRUE, fit.measures=TRUE)
## lavaan 0.6-3 ended normally after 104 iterations
##
## Optimization method NLMINB
## Number of free parameters 25
##
## Number of observations 445
##
## Estimator ML
## Model Fit Test Statistic 111.521
## Degrees of freedom 20
## P-value (Chi-square) 0.000
##
## Model test baseline model:
##
## Minimum Function Test Statistic 784.615
## Degrees of freedom 36
## P-value 0.000
##
## User model versus baseline model:
##
## Comparative Fit Index (CFI) 0.878
## Tucker-Lewis Index (TLI) 0.780
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -9120.139
## Loglikelihood unrestricted model (H1) -9064.378
##
## Number of free parameters 25
## Akaike (AIC) 18290.278
## Bayesian (BIC) 18392.730
## Sample-size adjusted Bayesian (BIC) 18313.391
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.101
## 90 Percent Confidence Interval 0.084 0.120
## P-value RMSEA <= 0.05 0.000
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.065
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## Self =~
## esteem 1.000 2.552 0.644
## marriage 1.510 0.294 5.132 0.000 3.853 0.432
## control 0.266 0.051 5.260 0.000 0.678 0.535
## Ill =~
## physical 1.000 1.873 0.798
## mental 1.450 0.128 11.326 0.000 2.717 0.646
## Util =~
## druguse 1.000 5.198 0.569
## drvisits 0.055 0.006 9.833 0.000 0.288 0.698
##
## Regressions:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## Ill ~
## age 0.119 0.046 2.582 0.010 0.063 0.141
## stress 0.692 0.079 8.717 0.000 0.370 0.482
## Self 0.201 0.055 3.691 0.000 0.274 0.274
## Util ~
## age 0.037 0.125 0.296 0.767 0.007 0.016
## stress 0.482 0.264 1.823 0.068 0.093 0.121
## Ill 2.451 0.302 8.108 0.000 0.883 0.883
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## Self ~~
## age -0.578 0.359 -1.607 0.108 -0.226 -0.102
## stress -0.010 0.209 -0.048 0.961 -0.004 -0.003
## age ~~
## stress -0.842 0.143 -5.886 0.000 -0.842 -0.291
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .esteem 9.201 1.351 6.808 0.000 9.201 0.586
## .marriage 64.572 5.257 12.283 0.000 64.572 0.813
## .control 1.145 0.116 9.879 0.000 1.145 0.713
## .physical 2.000 0.282 7.093 0.000 2.000 0.363
## .mental 10.290 0.871 11.808 0.000 10.290 0.582
## .druguse 56.380 4.505 12.516 0.000 56.380 0.676
## .drvisits 0.087 0.010 9.160 0.000 0.087 0.513
## age 4.943 0.331 14.916 0.000 4.943 1.000
## stress 1.701 0.114 14.916 0.000 1.701 1.000
## Self 6.511 1.475 4.415 0.000 1.000 1.000
## .Ill 2.528 0.350 7.233 0.000 0.721 0.721
## .Util 3.044 2.633 1.156 0.248 0.113 0.113
Look for the largest ones in absolute value
resid(health.sem1, type='cor')$cov
## esteem marrig contrl physcl mental drugus drvsts age stress
## esteem 0.000
## marriage 0.020 0.000
## control 0.000 -0.029 0.000
## physical -0.054 -0.010 0.004 0.000
## mental 0.115 0.168 0.196 -0.017 0.000
## druguse -0.133 0.028 0.001 -0.011 0.023 0.000
## drvisits -0.118 -0.032 -0.041 0.054 -0.070 0.000 0.000
## age 0.082 -0.054 -0.082 0.037 -0.073 -0.010 0.006 0.000
## stress -0.100 0.112 0.066 -0.051 0.101 0.019 -0.012 0.000 0.000
Just look for the largest ones and only for residual correlations
modindices(health.sem1, minimum.value=20, op="~~")
## lhs op rhs mi epc sepc.lv sepc.all sepc.nox
## 66 physical ~~ drvisits 35.327 0.278 0.278 0.666 0.666
## 70 mental ~~ drvisits 21.326 -0.341 -0.341 -0.360 -0.360
semPlot::semPaths
draws reasonable path diagrams, with many, many options. I always have to play around with these to get something relatively pleasing.
semPaths(health.sem1,
what="std",
style="lisrel",
color=list(lat="pink", man="lightblue"),
nCharNodes=8,
sizeMan=8, sizeLat=10)
semPaths(health.sem1,
what="std",
color=list(lat="pink", man="lightblue"),
nCharNodes=0,
sizeMan=10, sizeLat=8,
edge.label.cex=1,
rotation=2,
residuals=FALSE,
layout="tree2", label.norm="OOOOO")
health.mod2 <- '
# latent variables
Self =~ esteem + marriage + control
Ill =~ physical + mental
Util =~ druguse + drvisits
# regressions
Ill ~ age + stress + Self
Util ~ age + stress + Ill
Self ~~ age + stress
# correlations (phi)
age ~~ stress
physical ~~ drvisits
'
# or, more simply: just add this covariance to Model 1
health.mod2 <- paste(health.mod1,
'physical ~~ drvisits
')
cat(health.mod2)
##
## # latent variables
## Self =~ esteem + marriage + control
## Ill =~ physical + mental
## Util =~ druguse + drvisits
## # regressions
## Ill ~ age + stress + Self
## Util ~ age + stress + Ill
## # correlations (phi)
## Self ~~ age + stress
## age ~~ stress
## physical ~~ drvisits
health.sem2 <- lavaan::sem(health.mod2,
data=healthdat,
estimator="ML",
fixed.x=FALSE)
With one more parameter, this model fits better, even if there is still a lack of fit measured by the \(\chi^2\) test
fitMeasures(health.sem2, c("chisq", "df", "pvalue", "cfi", "rmsea"))
## chisq df pvalue cfi rmsea
## 68.568 19.000 0.000 0.934 0.077
summary(health.sem2, standardized=TRUE, fit.measures=FALSE)
## lavaan 0.6-3 ended normally after 113 iterations
##
## Optimization method NLMINB
## Number of free parameters 26
##
## Number of observations 445
##
## Estimator ML
## Model Fit Test Statistic 68.568
## Degrees of freedom 19
## P-value (Chi-square) 0.000
##
## Parameter Estimates:
##
## Information Expected
## Information saturated (h1) model Structured
## Standard Errors Standard
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## Self =~
## esteem 1.000 2.426 0.612
## marriage 1.607 0.282 5.705 0.000 3.900 0.438
## control 0.291 0.047 6.168 0.000 0.707 0.558
## Ill =~
## physical 1.000 1.456 0.619
## mental 2.402 0.251 9.555 0.000 3.496 0.832
## Util =~
## druguse 1.000 5.394 0.591
## drvisits 0.044 0.006 7.884 0.000 0.237 0.580
##
## Regressions:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## Ill ~
## age 0.077 0.035 2.230 0.026 0.053 0.118
## stress 0.552 0.073 7.543 0.000 0.379 0.494
## Self 0.279 0.054 5.164 0.000 0.466 0.466
## Util ~
## age 0.124 0.136 0.917 0.359 0.023 0.051
## stress 1.102 0.289 3.811 0.000 0.204 0.266
## Ill 2.413 0.343 7.026 0.000 0.651 0.651
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## Self ~~
## age -0.618 0.345 -1.789 0.074 -0.255 -0.115
## stress 0.032 0.200 0.161 0.872 0.013 0.010
## age ~~
## stress -0.842 0.143 -5.886 0.000 -0.842 -0.291
## .physical ~~
## .drvisits 0.274 0.040 6.855 0.000 0.274 0.447
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .esteem 9.824 1.127 8.719 0.000 9.824 0.625
## .marriage 64.213 5.124 12.531 0.000 64.213 0.809
## .control 1.105 0.109 10.154 0.000 1.105 0.688
## .physical 3.413 0.298 11.458 0.000 3.413 0.617
## .mental 5.444 1.126 4.836 0.000 5.444 0.308
## .druguse 54.309 5.094 10.661 0.000 54.309 0.651
## .drvisits 0.110 0.010 10.965 0.000 0.110 0.663
## age 4.943 0.331 14.916 0.000 4.943 1.000
## stress 1.701 0.114 14.916 0.000 1.701 1.000
## Self 5.887 1.229 4.788 0.000 1.000 1.000
## .Ill 1.201 0.218 5.518 0.000 0.567 0.567
## .Util 10.299 3.346 3.078 0.002 0.354 0.354
Print the standardized solution, but just for the latent variables and regressions
standardizedSolution(health.sem2) %>%
filter(op != "~~") %>%
print(digits=3)
## lhs op rhs est.std se z pvalue ci.lower ci.upper
## 1 Self =~ esteem 0.6121 0.0560 10.927 0.00e+00 0.5023 0.722
## 2 Self =~ marriage 0.4376 0.0554 7.899 2.89e-15 0.3290 0.546
## 3 Self =~ control 0.5582 0.0552 10.116 0.00e+00 0.4500 0.666
## 4 Ill =~ physical 0.6189 0.0403 15.345 0.00e+00 0.5399 0.698
## 5 Ill =~ mental 0.8318 0.0391 21.292 0.00e+00 0.7552 0.908
## 6 Util =~ druguse 0.5906 0.0475 12.422 0.00e+00 0.4974 0.684
## 7 Util =~ drvisits 0.5801 0.0470 12.347 0.00e+00 0.4881 0.672
## 8 Ill ~ age 0.1182 0.0521 2.270 2.32e-02 0.0161 0.220
## 9 Ill ~ stress 0.4943 0.0496 9.964 0.00e+00 0.3971 0.592
## 10 Ill ~ Self 0.4657 0.0610 7.629 2.38e-14 0.3460 0.585
## 11 Util ~ age 0.0513 0.0558 0.918 3.59e-01 -0.0582 0.161
## 12 Util ~ stress 0.2665 0.0680 3.916 8.99e-05 0.1331 0.400
## 13 Util ~ Ill 0.6513 0.0685 9.512 0.00e+00 0.5171 0.786
These models are nested, so the \(\chi^2\) test for their difference indicates whether model 2 is a significant improvement.
anova(health.sem1, health.sem2)
## Chi Square Difference Test
##
## Df AIC BIC Chisq Chisq diff Df diff Pr(>Chisq)
## health.sem2 19 18249 18356 68.568
## health.sem1 20 18290 18393 111.521 42.953 1 5.607e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1