lme4
and tidyverse
R packages and references:
lme4
package for (Restricted) Maximum Likelihood Estimation of linear multilevel models (Bates et al. 2015; Bates 2012) and (2) integrating lme4
with the tidyverse
, a collection of R packages for data science (including dplyr
, ggplot2
, and purrr
) with a common language and set of principles (Wickham and Grolemund 2017).MathAchieve
and MathAchSchool
data frames in the nlme
package. They derive from the “High School and Beyond” survey of 7185 students in 160 U.S. high schools, including 70 Catholic and 90 public schools (Fox 2016b; Raudenbush and Bryk 2002). See the links and references above for more documentation on these data.Further readings and resources:
To take this workshop you need to:
Top menu > Tools > Install packages...
.Clone
button > Download ZIP > Then unzip the folder in your computer.Once in class, go to the workshop project folder (point 5 above) and double-click on the workshop R project file (Multilevel_with_R.Rproj
) in it. This will open RStudio.
broom.mixed
car
for tests of significance.ggeffects
to calculate and visualize predicted values.lme4
for specifying and estimating multilevel models.lmerTest
for tests of significance on multilevel models.mathach
).ses.dev
).mean.ses
) and school sector (Public vs Catholic, sector
).# Packages
library(tidyverse)
library(car)
library(lme4)
library(lmerTest)
library(ggeffects)
library(magrittr)
library(broom)
library(broom.mixed)
# Data
load("./Data/multilevel_data.rda")
# View the data
stud_data
## # A tibble: 7,185 × 5
## school minority sex ses mathach
## <chr> <fct> <fct> <dbl> <dbl>
## 1 1224 No Female -1.53 5.88
## 2 1224 No Female -0.588 19.7
## 3 1224 No Male -0.528 20.3
## 4 1224 No Male -0.668 8.78
## 5 1224 No Male -0.158 17.9
## 6 1224 No Male 0.022 4.58
## 7 1224 No Female -0.618 -2.83
## 8 1224 No Male -0.998 0.523
## 9 1224 No Female -0.888 1.53
## 10 1224 No Male -0.458 21.5
## # … with 7,175 more rows
school_data
## # A tibble: 160 × 2
## school sector
## * <chr> <fct>
## 1 1224 Public
## 2 1288 Public
## 3 1296 Public
## 4 1308 Catholic
## 5 1317 Catholic
## 6 1358 Public
## 7 1374 Public
## 8 1433 Catholic
## 9 1436 Catholic
## 10 1461 Public
## # … with 150 more rows
# How many schools?
stud_data %>%
pull(school) %>%
n_distinct
## [1] 160
# How many students per school (school size)?
stud_data %>%
count(school)
## # A tibble: 160 × 2
## school n
## <chr> <int>
## 1 1224 47
## 2 1288 25
## 3 1296 48
## 4 1308 20
## 5 1317 48
## 6 1358 30
## 7 1374 28
## 8 1433 35
## 9 1436 44
## 10 1461 33
## # … with 150 more rows
# Sort by school size
stud_data %>%
count(school, sort = TRUE)
## # A tibble: 160 × 2
## school n
## <chr> <int>
## 1 2305 67
## 2 5619 66
## 3 4292 65
## 4 3610 64
## 5 4042 64
## 6 8857 64
## 7 4530 63
## 8 1477 62
## 9 2277 61
## 10 4642 61
## # … with 150 more rows
# Average school size
stud_data %>%
count(school) %>%
summarise(mean(n))
## # A tibble: 1 × 1
## `mean(n)`
## <dbl>
## 1 44.9
# Calculate mean student SES in each school
stud_data %<>% # Note marittr %<>% operator: pipe + assign
group_by(school) %>% # Group data frame by school for mutate
mutate(mean.ses= mean(ses)) %>% # Create mean(ses) by school
ungroup # Ungroup data frame
stud_data
## # A tibble: 7,185 × 6
## school minority sex ses mathach mean.ses
## <chr> <fct> <fct> <dbl> <dbl> <dbl>
## 1 1224 No Female -1.53 5.88 -0.434
## 2 1224 No Female -0.588 19.7 -0.434
## 3 1224 No Male -0.528 20.3 -0.434
## 4 1224 No Male -0.668 8.78 -0.434
## 5 1224 No Male -0.158 17.9 -0.434
## 6 1224 No Male 0.022 4.58 -0.434
## 7 1224 No Female -0.618 -2.83 -0.434
## 8 1224 No Male -0.998 0.523 -0.434
## 9 1224 No Female -0.888 1.53 -0.434
## 10 1224 No Male -0.458 21.5 -0.434
## # … with 7,175 more rows
# Create student's SES deviation from school mean
(stud_data %<>%
mutate(ses.dev = ses - mean.ses))
## # A tibble: 7,185 × 7
## school minority sex ses mathach mean.ses ses.dev
## <chr> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 1224 No Female -1.53 5.88 -0.434 -1.09
## 2 1224 No Female -0.588 19.7 -0.434 -0.154
## 3 1224 No Male -0.528 20.3 -0.434 -0.0936
## 4 1224 No Male -0.668 8.78 -0.434 -0.234
## 5 1224 No Male -0.158 17.9 -0.434 0.276
## 6 1224 No Male 0.022 4.58 -0.434 0.456
## 7 1224 No Female -0.618 -2.83 -0.434 -0.184
## 8 1224 No Male -0.998 0.523 -0.434 -0.564
## 9 1224 No Female -0.888 1.53 -0.434 -0.454
## 10 1224 No Male -0.458 21.5 -0.434 -0.0236
## # … with 7,175 more rows
# Join with more school-level variables (sector)
df <- left_join(stud_data, school_data, by="school")
df
## # A tibble: 7,185 × 8
## school minority sex ses mathach mean.ses ses.dev sector
## <chr> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 1224 No Female -1.53 5.88 -0.434 -1.09 Public
## 2 1224 No Female -0.588 19.7 -0.434 -0.154 Public
## 3 1224 No Male -0.528 20.3 -0.434 -0.0936 Public
## 4 1224 No Male -0.668 8.78 -0.434 -0.234 Public
## 5 1224 No Male -0.158 17.9 -0.434 0.276 Public
## 6 1224 No Male 0.022 4.58 -0.434 0.456 Public
## 7 1224 No Female -0.618 -2.83 -0.434 -0.184 Public
## 8 1224 No Male -0.998 0.523 -0.434 -0.564 Public
## 9 1224 No Female -0.888 1.53 -0.434 -0.454 Public
## 10 1224 No Male -0.458 21.5 -0.434 -0.0236 Public
## # … with 7,175 more rows
# Groups (school ID) should be a factor (i.e., categorical variable)
df %<>%
mutate(school = factor(school))
# Frequencies of school sector:
# N schools
school_data %>%
count(sector)
## # A tibble: 2 × 2
## sector n
## <fct> <int>
## 1 Public 90
## 2 Catholic 70
# N students
df %>%
count(sector)
## # A tibble: 2 × 2
## sector n
## <fct> <int>
## 1 Public 3642
## 2 Catholic 3543
# Sample 20 random IDs of catholic schools
set.seed(1129)
school.IDs <- school_data %>%
filter(sector=="Catholic") %>%
sample_n(20) %>%
pull(school)
school.IDs
## [1] "5667" "4223" "9198" "4253" "5650" "8150" "5761" "2658" "3838" "8193"
## [11] "4292" "7364" "3992" "5404" "5192" "1462" "4042" "3020" "7172" "1477"
# Filter data to just those school IDs
(df.cat <- df %>%
filter(school %in% school.IDs))
## # A tibble: 1,011 × 8
## school minority sex ses mathach mean.ses ses.dev sector
## <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 1462 Yes Male 0.162 1.97 -0.669 0.831 Catholic
## 2 1462 Yes Male -0.758 7.72 -0.669 -0.0886 Catholic
## 3 1462 Yes Male -0.708 19.0 -0.669 -0.0386 Catholic
## 4 1462 Yes Male -0.818 20.1 -0.669 -0.149 Catholic
## 5 1462 Yes Male -1.92 22.3 -0.669 -1.25 Catholic
## 6 1462 Yes Male -0.948 14.3 -0.669 -0.279 Catholic
## 7 1462 Yes Male -0.798 10.2 -0.669 -0.129 Catholic
## 8 1462 Yes Male -1.46 9.61 -0.669 -0.789 Catholic
## 9 1462 Yes Male -0.158 6.12 -0.669 0.511 Catholic
## 10 1462 Yes Male -0.378 23.8 -0.669 0.291 Catholic
## # … with 1,001 more rows
# We can put everything into a single pipe to have shorter code
set.seed(1129)
df.cat <- school_data %>%
filter(sector=="Catholic") %>%
sample_n(20) %>%
pull(school) %>%
{filter(df, school %in% .)} # Note the {} brackets so that "." is not used as first argument in filter()
# Same thing for Public schools
set.seed(1129)
df.pub <- school_data %>%
filter(sector=="Public") %>%
sample_n(20) %>%
pull(school) %>%
{filter(df, school %in% .)}
# Plot SES vs math score in each of the 20 Catholic schools
# Data and variables
p <- ggplot(df.cat, aes(x=ses.dev, y=mathach)) +
# Scatterplot geom
geom_point(shape=1) +
# Un-comment following line to add loess smothing line
# geom_smooth(method="loess", color= "blue", se=FALSE) +
# linear regression line
geom_smooth(method="lm", color= "red", se=FALSE) +
# Facet by school
facet_wrap(~ school, nrow=4, ncol=5) +
# Black/white theme
theme_bw()
# See the plot
p
## `geom_smooth()` using formula 'y ~ x'
# Same as above for Public schools
ggplot(df.pub, aes(x=ses.dev, y=mathach)) +
# Scatterplot geom
geom_point(shape=1) +
# Un-comment following line to add loess smothing line
# geom_smooth(method="loess", color= "blue", se=FALSE) +
# linear regression line
geom_smooth(method="lm", color= "red", se=FALSE) +
# Facet by school
facet_wrap(~ school, nrow=4, ncol=5) +
# Black/white theme
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
nested.df
) with each school’s row including the data frame for that school’s students.nested.df
and purrr::map
to estimate a separate linear regression model of math achievement on SES in each school.# Estimate a linear model for mathach as predicted by ses.dev, separately in
# each of the 160 schools.
# First, nest the student data frame by school
nested.df <- df %>%
group_by(school) %>%
nest()
# This creates a school-level data frame (data row = school), with each school's
# row containing the student-level data frame for that school.
nested.df
## # A tibble: 160 × 2
## # Groups: school [160]
## school data
## <fct> <list>
## 1 1224 <tibble [47 × 7]>
## 2 1288 <tibble [25 × 7]>
## 3 1296 <tibble [48 × 7]>
## 4 1308 <tibble [20 × 7]>
## 5 1317 <tibble [48 × 7]>
## 6 1358 <tibble [30 × 7]>
## 7 1374 <tibble [28 × 7]>
## 8 1433 <tibble [35 × 7]>
## 9 1436 <tibble [44 × 7]>
## 10 1461 <tibble [33 × 7]>
## # … with 150 more rows
# What do the numbers of rows and columns indicate in each school's data frame?
# (e.g. [47 x 7])
# The column nested.df$data is a list of data frames, one for each school
class(nested.df$data)
## [1] "list"
length(nested.df$data)
## [1] 160
# For example, look at the data for school 1224:
## Still nested
nested.df %>%
filter(school=="1224") %>%
dplyr::select(school, data)
## # A tibble: 1 × 2
## # Groups: school [1]
## school data
## <fct> <list>
## 1 1224 <tibble [47 × 7]>
## Unnested
nested.df %>%
filter(school=="1224") %>%
dplyr::select(school, data) %>%
unnest(cols = c(data))
## # A tibble: 47 × 8
## # Groups: school [1]
## school minority sex ses mathach mean.ses ses.dev sector
## <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 1224 No Female -1.53 5.88 -0.434 -1.09 Public
## 2 1224 No Female -0.588 19.7 -0.434 -0.154 Public
## 3 1224 No Male -0.528 20.3 -0.434 -0.0936 Public
## 4 1224 No Male -0.668 8.78 -0.434 -0.234 Public
## 5 1224 No Male -0.158 17.9 -0.434 0.276 Public
## 6 1224 No Male 0.022 4.58 -0.434 0.456 Public
## 7 1224 No Female -0.618 -2.83 -0.434 -0.184 Public
## 8 1224 No Male -0.998 0.523 -0.434 -0.564 Public
## 9 1224 No Female -0.888 1.53 -0.434 -0.454 Public
## 10 1224 No Male -0.458 21.5 -0.434 -0.0236 Public
## # … with 37 more rows
# Or get the first element of nested.df$data
nested.df$data[[1]]
## # A tibble: 47 × 7
## minority sex ses mathach mean.ses ses.dev sector
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 No Female -1.53 5.88 -0.434 -1.09 Public
## 2 No Female -0.588 19.7 -0.434 -0.154 Public
## 3 No Male -0.528 20.3 -0.434 -0.0936 Public
## 4 No Male -0.668 8.78 -0.434 -0.234 Public
## 5 No Male -0.158 17.9 -0.434 0.276 Public
## 6 No Male 0.022 4.58 -0.434 0.456 Public
## 7 No Female -0.618 -2.83 -0.434 -0.184 Public
## 8 No Male -0.998 0.523 -0.434 -0.564 Public
## 9 No Female -0.888 1.53 -0.434 -0.454 Public
## 10 No Male -0.458 21.5 -0.434 -0.0236 Public
## # … with 37 more rows
# Equivalently, with tidyverse syntax
nested.df %>%
pull(data) %>%
extract2(1)
## # A tibble: 47 × 7
## minority sex ses mathach mean.ses ses.dev sector
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 No Female -1.53 5.88 -0.434 -1.09 Public
## 2 No Female -0.588 19.7 -0.434 -0.154 Public
## 3 No Male -0.528 20.3 -0.434 -0.0936 Public
## 4 No Male -0.668 8.78 -0.434 -0.234 Public
## 5 No Male -0.158 17.9 -0.434 0.276 Public
## 6 No Male 0.022 4.58 -0.434 0.456 Public
## 7 No Female -0.618 -2.83 -0.434 -0.184 Public
## 8 No Male -0.998 0.523 -0.434 -0.564 Public
## 9 No Female -0.888 1.53 -0.434 -0.454 Public
## 10 No Male -0.458 21.5 -0.434 -0.0236 Public
## # … with 37 more rows
# Using the nested data frame, we can now fit a separate linear model in each
# school's data frame (each element of nested.df$data)
lmodels <- nested.df %>%
# Get all school data frames
pull(data) %>%
# Run lm() on each via map
purrr::map(~ lm(mathach ~ ses.dev, data= .x))
# Note the formula notation in purrr::map(), where .x indicates each element of
# nested.df$data.
# lmodels is now a list of fitted linear models, one for each school
head(lmodels)
## [[1]]
##
## Call:
## lm(formula = mathach ~ ses.dev, data = .x)
##
## Coefficients:
## (Intercept) ses.dev
## 9.715 2.509
##
##
## [[2]]
##
## Call:
## lm(formula = mathach ~ ses.dev, data = .x)
##
## Coefficients:
## (Intercept) ses.dev
## 13.511 3.255
##
##
## [[3]]
##
## Call:
## lm(formula = mathach ~ ses.dev, data = .x)
##
## Coefficients:
## (Intercept) ses.dev
## 7.636 1.076
##
##
## [[4]]
##
## Call:
## lm(formula = mathach ~ ses.dev, data = .x)
##
## Coefficients:
## (Intercept) ses.dev
## 16.256 0.126
##
##
## [[5]]
##
## Call:
## lm(formula = mathach ~ ses.dev, data = .x)
##
## Coefficients:
## (Intercept) ses.dev
## 13.178 1.274
##
##
## [[6]]
##
## Call:
## lm(formula = mathach ~ ses.dev, data = .x)
##
## Coefficients:
## (Intercept) ses.dev
## 11.206 5.068
class(lmodels)
## [1] "list"
length(lmodels)
## [1] 160
# Instead of creating this list as a separate object, we can create it as a new
# column in nested.df, adding each model to its school's row in nested.df.
nested.df %<>%
mutate(model = purrr::map(data,
~ lm(mathach ~ ses.dev, data= .x)))
# Result
nested.df
## # A tibble: 160 × 3
## # Groups: school [160]
## school data model
## <fct> <list> <list>
## 1 1224 <tibble [47 × 7]> <lm>
## 2 1288 <tibble [25 × 7]> <lm>
## 3 1296 <tibble [48 × 7]> <lm>
## 4 1308 <tibble [20 × 7]> <lm>
## 5 1317 <tibble [48 × 7]> <lm>
## 6 1358 <tibble [30 × 7]> <lm>
## 7 1374 <tibble [28 × 7]> <lm>
## 8 1433 <tibble [35 × 7]> <lm>
## 9 1436 <tibble [44 × 7]> <lm>
## 10 1461 <tibble [33 × 7]> <lm>
## # … with 150 more rows
# The third column of nested.df ($model) now includes the fitted linear model
# for each school (each row).
# E.g., model for school 1224
nested.df %>%
filter(school=="1224") %>%
pull(model) %>%
extract2(1)
##
## Call:
## lm(formula = mathach ~ ses.dev, data = .x)
##
## Coefficients:
## (Intercept) ses.dev
## 9.715 2.509
# Get intercept and slope for model of school 1224
nested.df %>%
filter(school=="1224") %>%
pull(model) %>%
# This is needed to get the lm object out of the list object
extract2(1) %>%
coef
## (Intercept) ses.dev
## 9.715447 2.508582
# Or, for tidy output
nested.df %>%
filter(school=="1224") %>%
pull(model) %>%
extract2(1) %>%
broom::tidy()
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 9.72 1.10 8.87 1.94e-11
## 2 ses.dev 2.51 1.77 1.42 1.62e- 1
# Let's apply the same code within mutate to get tidy estimation results for all
# models (all schools)
nested.df %<>%
mutate(model.results = purrr::map(model,
broom::tidy)
)
nested.df
## # A tibble: 160 × 4
## # Groups: school [160]
## school data model model.results
## <fct> <list> <list> <list>
## 1 1224 <tibble [47 × 7]> <lm> <tibble [2 × 5]>
## 2 1288 <tibble [25 × 7]> <lm> <tibble [2 × 5]>
## 3 1296 <tibble [48 × 7]> <lm> <tibble [2 × 5]>
## 4 1308 <tibble [20 × 7]> <lm> <tibble [2 × 5]>
## 5 1317 <tibble [48 × 7]> <lm> <tibble [2 × 5]>
## 6 1358 <tibble [30 × 7]> <lm> <tibble [2 × 5]>
## 7 1374 <tibble [28 × 7]> <lm> <tibble [2 × 5]>
## 8 1433 <tibble [35 × 7]> <lm> <tibble [2 × 5]>
## 9 1436 <tibble [44 × 7]> <lm> <tibble [2 × 5]>
## 10 1461 <tibble [33 × 7]> <lm> <tibble [2 × 5]>
## # … with 150 more rows
# Unnest to view the results
nested.df %>%
unnest(model.results)
## # A tibble: 320 × 8
## # Groups: school [160]
## school data model term estimate std.error statistic p.value
## <fct> <list> <list> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1224 <tibble [47 × 7]> <lm> (Inter… 9.72 1.10 8.87 1.94e-11
## 2 1224 <tibble [47 × 7]> <lm> ses.dev 2.51 1.77 1.42 1.62e- 1
## 3 1288 <tibble [25 × 7]> <lm> (Inter… 13.5 1.36 9.91 9.12e-10
## 4 1288 <tibble [25 × 7]> <lm> ses.dev 3.26 2.08 1.57 1.31e- 1
## 5 1296 <tibble [48 × 7]> <lm> (Inter… 7.64 0.774 9.86 6.27e-13
## 6 1296 <tibble [48 × 7]> <lm> ses.dev 1.08 1.21 0.890 3.78e- 1
## 7 1308 <tibble [20 × 7]> <lm> (Inter… 16.3 1.40 11.6 9.02e-10
## 8 1308 <tibble [20 × 7]> <lm> ses.dev 0.126 3.00 0.0420 9.67e- 1
## 9 1317 <tibble [48 × 7]> <lm> (Inter… 13.2 0.790 16.7 3.97e-21
## 10 1317 <tibble [48 × 7]> <lm> ses.dev 1.27 1.44 0.887 3.80e- 1
## # … with 310 more rows
# Let's keep the part we're interested in
lm.coeff <- nested.df %>%
unnest(model.results) %>%
dplyr::select(school, term, estimate)
lm.coeff
## # A tibble: 320 × 3
## # Groups: school [160]
## school term estimate
## <fct> <chr> <dbl>
## 1 1224 (Intercept) 9.72
## 2 1224 ses.dev 2.51
## 3 1288 (Intercept) 13.5
## 4 1288 ses.dev 3.26
## 5 1296 (Intercept) 7.64
## 6 1296 ses.dev 1.08
## 7 1308 (Intercept) 16.3
## 8 1308 ses.dev 0.126
## 9 1317 (Intercept) 13.2
## 10 1317 ses.dev 1.27
## # … with 310 more rows
# Reshape and rename
lm.coeff %<>%
# Model intercept and slope into two columns
pivot_wider(names_from = term, values_from = estimate) %>%
# Rename columns
dplyr::select(school, intercept = `(Intercept)`, slope = ses.dev)
lm.coeff
## # A tibble: 160 × 3
## # Groups: school [160]
## school intercept slope
## <fct> <dbl> <dbl>
## 1 1224 9.72 2.51
## 2 1288 13.5 3.26
## 3 1296 7.64 1.08
## 4 1308 16.3 0.126
## 5 1317 13.2 1.27
## 6 1358 11.2 5.07
## 7 1374 9.73 3.85
## 8 1433 19.7 1.85
## 9 1436 18.1 1.60
## 10 1461 16.8 6.27
## # … with 150 more rows
# Bring school sector and mean.ses into this data frame.
# Create data frame with school ID, school sector, mean.ses
lm.df <- df %>%
dplyr::select(school, sector, mean.ses) %>%
distinct
lm.df
## # A tibble: 160 × 3
## school sector mean.ses
## <fct> <fct> <dbl>
## 1 1224 Public -0.434
## 2 1288 Public 0.122
## 3 1296 Public -0.426
## 4 1308 Catholic 0.528
## 5 1317 Catholic 0.345
## 6 1358 Public -0.0197
## 7 1374 Public -0.0126
## 8 1433 Catholic 0.712
## 9 1436 Catholic 0.563
## 10 1461 Public 0.677
## # … with 150 more rows
# Join with lm.coeff
(lm.df %<>%
left_join(lm.coeff, by="school")
)
## # A tibble: 160 × 5
## school sector mean.ses intercept slope
## <fct> <fct> <dbl> <dbl> <dbl>
## 1 1224 Public -0.434 9.72 2.51
## 2 1288 Public 0.122 13.5 3.26
## 3 1296 Public -0.426 7.64 1.08
## 4 1308 Catholic 0.528 16.3 0.126
## 5 1317 Catholic 0.345 13.2 1.27
## 6 1358 Public -0.0197 11.2 5.07
## 7 1374 Public -0.0126 9.73 3.85
## 8 1433 Catholic 0.712 19.7 1.85
## 9 1436 Catholic 0.563 18.1 1.60
## 10 1461 Public 0.677 16.8 6.27
## # … with 150 more rows
# Now we can get a boxplot of school intercepts by school sector
ggplot(lm.df, aes(x= sector, y= intercept)) + geom_boxplot()
# And a boxplot of school SES slopes by school sector
ggplot(lm.df, aes(x= sector, y= slope)) + geom_boxplot()
# Or both sets of estimates in a scatterplot by school sector
ggplot(lm.df, aes(x= intercept, y= slope)) +
# Scatterplot
geom_point() +
# Linear regression line
geom_smooth(method="lm") +
# Facet by sector
facet_wrap(~ sector) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
# Scatterplot of school intercepts by school mean.ses, by sector
ggplot(lm.df, aes(x= mean.ses, y= intercept)) +
geom_point() +
geom_smooth(method="loess") +
facet_wrap(~ sector) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
# Scatterplot of school slopes by school mean.ses, by sector
ggplot(lm.df, aes(x= mean.ses, y= slope)) +
geom_point() +
geom_smooth(method="loess") +
facet_wrap(~ sector) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
nested.df$data
, what do the numbers of rows and columns indicate in each school’s data frame?mod1
.
matchach
variation between variation at the student level (between students) and variation at the school level (between schools).mathach
is modeled as resulting from a school random effect (group level) plus a student random effect (individual or residual level): these are called \(u_i\) and \(e_i\), respectively, by Rasbash, Steele, and Reckie (2008).mod1
is estimated via Restricted Maximum Likelihood (REML).mod1
with the same null linear model (no predictor) without taking into account student clustering in schools (i.e., a single-level model).mathach
variation that is attributable to the second level, that is, to between-school differences.mathach
of two random students from the same school).# Let's start with a simple variance components model
mod1 <- lmer(mathach ~ 1 + (1 | school),
data=df)
# Model 1 is estimated with Restricted Maximum Likelihood (REML) by default.
# You can set REML=FALSE to use Maximum Likelihood (ML) instead.
# See results.
# lmerTest augmented summary function
summary(mod1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: mathach ~ 1 + (1 | school)
## Data: df
##
## REML criterion at convergence: 47116.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0631 -0.7539 0.0267 0.7606 2.7426
##
## Random effects:
## Groups Name Variance Std.Dev.
## school (Intercept) 8.614 2.935
## Residual 39.148 6.257
## Number of obs: 7185, groups: school, 160
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 12.6370 0.2444 156.6473 51.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Results in tidy format
(mod1.res <- tidy(mod1))
## # A tibble: 3 × 8
## effect group term estimate std.error statistic df p.value
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 fixed <NA> (Intercept) 12.6 0.244 51.7 157. 2.34e-100
## 2 ran_pars school sd__(Intercep… 2.93 NA NA NA NA
## 3 ran_pars Residual sd__Observati… 6.26 NA NA NA NA
# To test for significance of school effect, let's estimate a null single-level
# model
mod1_sl <- lm(mathach ~ 1,
data=df)
# Compare the two models with Likelihood Ratio Test (LRT).
anova(mod1, mod1_sl)
## refitting model(s) with ML (instead of REML)
## Data: df
## Models:
## mod1_sl: mathach ~ 1
## mod1: mathach ~ 1 + (1 | school)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## mod1_sl 2 48104 48117 -24050 48100
## mod1 3 47122 47142 -23558 47116 983.92 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Note that by default, anova() refits the models with ML so the LRT is correct.
# School-level variance estimate
(sigma2_u <- mod1.res %>%
filter(effect == "ran_pars", group == "school") %>%
# Standard deviation
pull(estimate) %>%
# ^2 = Variance
.^2)
## [1] 8.614025
# Estimate for residual, individual-level variance
(sigma2_e <- mod1.res %>%
filter(effect == "ran_pars", group == "Residual") %>%
# Standard deviation
pull(estimate) %>%
# ^2 = Variance
.^2)
## [1] 39.14832
# Variance Partition Coefficient
sigma2_u/(sigma2_u + sigma2_e)
## [1] 0.1803518
mathach
?mathach
variation is explained by the school level? How does this compare with the correlation between math scores of two random students in the same school?mod2
, a random intercept model with fixed slope for individual student SES.Intercept
and ses.dev
slope) and for random-effect parameters.mathach
is now explained in part by individual student SES (ses.dev
), and its unexplained variation is partitioned between individual and school random effects.
mod1
, indicating that a relatively higher proportion of the (unexplained) mathach
variation is now explained by the school level: in other words, student SES absorbs part of the student-level variation which the previous mod1
attributed to individual random effects (\(e_i\)).lmerTest
’s modification of the summary
function to show p-values for single coefficient estimates based on t-tests with Satterthwaite’s method.summary
function to our estimated models.car::Anova
function on the model object (see discussion by Fox and Weisberg 2018).# Random intercept model with fixed slope for individual student SES.
mod2 <- lmer(mathach ~ 1 + ses.dev + (1 | school),
data=df)
# See results.
summary(mod2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: mathach ~ 1 + ses.dev + (1 | school)
## Data: df
##
## REML criterion at convergence: 46724
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0969 -0.7322 0.0194 0.7572 2.9147
##
## Random effects:
## Groups Name Variance Std.Dev.
## school (Intercept) 8.672 2.945
## Residual 37.010 6.084
## Number of obs: 7185, groups: school, 160
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 12.6361 0.2445 156.7405 51.68 <2e-16 ***
## ses.dev 2.1912 0.1087 7022.0237 20.17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## ses.dev 0.000
# Another option to test significance of single coefficient estimates.
Anova(mod2)
## Analysis of Deviance Table (Type II Wald chisquare tests)
##
## Response: mathach
## Chisq Df Pr(>Chisq)
## ses.dev 406.68 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Kenward-Roger "F" tests with Satterthwaite degrees of freedom, which in this
# case have very close results (but take much longer to calculate)
# Anova(mod2, test="F")
# Tidy results
(mod2.res <- tidy(mod2))
## # A tibble: 4 × 8
## effect group term estimate std.error statistic df p.value
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 fixed <NA> (Intercept) 12.6 0.244 51.7 157. 2.29e-100
## 2 fixed <NA> ses.dev 2.19 0.109 20.2 7022. 5.77e- 88
## 3 ran_pars school sd__(Intercep… 2.94 NA NA NA NA
## 4 ran_pars Residual sd__Observati… 6.08 NA NA NA NA
# See the estimates for the population-level fixed effects: intercept and SES
# slope
mod2.res %>%
filter(effect == "fixed")
## # A tibble: 2 × 8
## effect group term estimate std.error statistic df p.value
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 fixed <NA> (Intercept) 12.6 0.244 51.7 157. 2.29e-100
## 2 fixed <NA> ses.dev 2.19 0.109 20.2 7022. 5.77e- 88
# See the estimates for the variance component parameters (aka random-effect
# parameters)
mod2.res %>%
filter(effect == "ran_pars")
## # A tibble: 2 × 8
## effect group term estimate std.error statistic df p.value
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ran_pars school sd__(Intercept) 2.94 NA NA NA NA
## 2 ran_pars Residual sd__Observation 6.08 NA NA NA NA
# School-level variance estimate
(sigma2_u <- mod2.res %>%
filter(effect == "ran_pars", group == "school") %>%
# Standard deviation
pull(estimate) %>%
# ^2 = Variance
.^2)
## [1] 8.672289
# Residual, individual-level variance
(sigma2_e <- mod2.res %>%
filter(effect == "ran_pars", group == "Residual") %>%
# Standard deviation
pull(estimate) %>%
# ^2 = Variance
.^2)
## [1] 37.01041
# Variance Partition Coefficient
sigma2_u/(sigma2_u + sigma2_e)
## [1] 0.1898375
mathach
variation not explained by ses.dev
is attributable to the school level?
mod1
?mod1
?mod3
, a random slope model with in which the effect of individual student SES on math score is allowed to vary between schools.lme4
model results show the correlation \(\rho_{u01}\) (not the covariance) between random intercept and random slope: to get the covariance just multiply the correlation times the two standard deviations (\(\rho_{u01}*\sigma_{u0}*\sigma_{u1}\)).# Random slope model with SES slope allowed to vary by school
mod3 <- lmer(mathach ~ 1 + ses.dev + (1 + ses.dev | school),
data=df)
# Results
summary(mod3)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: mathach ~ 1 + ses.dev + (1 + ses.dev | school)
## Data: df
##
## REML criterion at convergence: 46714.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.09680 -0.73193 0.01855 0.75386 2.89924
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## school (Intercept) 8.681 2.9464
## ses.dev 0.694 0.8331 0.02
## Residual 36.700 6.0581
## Number of obs: 7185, groups: school, 160
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 12.6362 0.2445 156.7512 51.68 <2e-16 ***
## ses.dev 2.1932 0.1283 155.2166 17.10 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## ses.dev 0.009
# Tidy results
(mod3.res <- tidy(mod3))
## # A tibble: 6 × 8
## effect group term estimate std.error statistic df p.value
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 fixed <NA> (Intercept) 12.6 0.245 51.7 157. 2.29e-100
## 2 fixed <NA> ses.dev 2.19 0.128 17.1 155. 1.58e- 37
## 3 ran_pars school sd__(Intercep… 2.95 NA NA NA NA
## 4 ran_pars school cor__(Interce… 0.0191 NA NA NA NA
## 5 ran_pars school sd__ses.dev 0.833 NA NA NA NA
## 6 ran_pars Residual sd__Observati… 6.06 NA NA NA NA
# School-level variance of random intercept
mod3.res %>%
filter(effect == "ran_pars", group == "school", term == "sd__(Intercept)") %>%
# Standard deviation
pull(estimate) %>%
# ^2 = Variance
.^2
## [1] 8.681044
# School-level variance of random SES slope
mod3.res %>%
filter(effect == "ran_pars", group == "school", term == "sd__ses.dev") %>%
# Standard deviation
pull(estimate) %>%
# ^2 = Variance
.^2
## [1] 0.6939974
# School-level correlation between random intercept and random SES slope
mod3.res %>%
filter(effect == "ran_pars", group == "school", term == "cor__(Intercept).ses.dev") %>%
# Standard deviation
pull(estimate)
## [1] 0.0190622
mean.ses
and sector
.
mean.ses
and sector
as both main effects and interactions with ses.dev
(see derivation in slides): mod4
.ses.dev
slope fixed (i.e., a random-intercept model): mod5
.mod4
explains significantly more variation in the dependent variable compared to the simpler, more parsimonious mod5
(Likelihood Ratio Test).
ses.dev
slope is fixed across schools), so we choose mod5
over mod4
.mod5
we obtain predicted values of student mathach
as a function of student ses.dev
, given different contexts (i.e., different fixed values of school’s mean.ses
and sector
). We then plot these results.# Estimate the model
mod4 <- lmer(mathach ~ 1 + mean.ses*ses.dev + sector*ses.dev
+ (1 + ses.dev | school),
data=df)
# See results
summary(mod4)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: mathach ~ 1 + mean.ses * ses.dev + sector * ses.dev + (1 + ses.dev |
## school)
## Data: df
##
## REML criterion at convergence: 46503.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.15926 -0.72319 0.01704 0.75444 2.95822
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## school (Intercept) 2.380 1.5426
## ses.dev 0.101 0.3179 0.39
## Residual 36.721 6.0598
## Number of obs: 7185, groups: school, 160
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 12.1279 0.1993 159.8955 60.856 < 2e-16 ***
## mean.ses 5.3329 0.3692 150.9859 14.446 < 2e-16 ***
## ses.dev 2.9450 0.1556 139.4992 18.928 < 2e-16 ***
## sectorCatholic 1.2266 0.3063 149.6127 4.005 9.74e-05 ***
## mean.ses:ses.dev 1.0393 0.2989 160.4375 3.477 0.000652 ***
## ses.dev:sectorCatholic -1.6427 0.2398 143.2292 -6.851 2.01e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) men.ss ses.dv sctrCt mn.s:.
## mean.ses 0.256
## ses.dev 0.075 0.019
## sectorCthlc -0.699 -0.356 -0.053
## mn.ss:ss.dv 0.019 0.074 0.293 -0.026
## ss.dv:sctrC -0.052 -0.027 -0.696 0.077 -0.351
tidy(mod4)
## # A tibble: 10 × 8
## effect group term estimate std.error statistic df p.value
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 fixed <NA> (Intercept) 12.1 0.199 60.9 160. 1.70e-112
## 2 fixed <NA> mean.ses 5.33 0.369 14.4 151. 2.95e- 30
## 3 fixed <NA> ses.dev 2.95 0.156 18.9 139. 2.32e- 40
## 4 fixed <NA> sectorCathol… 1.23 0.306 4.00 150. 9.74e- 5
## 5 fixed <NA> mean.ses:ses… 1.04 0.299 3.48 160. 6.52e- 4
## 6 fixed <NA> ses.dev:sect… -1.64 0.240 -6.85 143. 2.01e- 10
## 7 ran_pars school sd__(Interce… 1.54 NA NA NA NA
## 8 ran_pars school cor__(Interc… 0.391 NA NA NA NA
## 9 ran_pars school sd__ses.dev 0.318 NA NA NA NA
## 10 ran_pars Residual sd__Observat… 6.06 NA NA NA NA
# Test for random slope for ses.dev: compare random intercept vs random slope
# models with same fixed effects.
# Estimate same model as mod4, but without random slope (only random intercept).
mod5 <- lmer(mathach ~ 1 + mean.ses*ses.dev + sector*ses.dev
+ (1 | school), data=df)
# See results
summary(mod5)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: mathach ~ 1 + mean.ses * ses.dev + sector * ses.dev + (1 | school)
## Data: df
##
## REML criterion at convergence: 46504.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1701 -0.7249 0.0148 0.7542 2.9655
##
## Random effects:
## Groups Name Variance Std.Dev.
## school (Intercept) 2.375 1.541
## Residual 36.766 6.064
## Number of obs: 7185, groups: school, 160
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 12.1282 0.1992 160.0061 60.885 < 2e-16 ***
## mean.ses 5.3367 0.3690 151.0718 14.463 < 2e-16 ***
## ses.dev 2.9421 0.1512 7018.2611 19.457 < 2e-16 ***
## sectorCatholic 1.2245 0.3061 149.6953 4.000 9.92e-05 ***
## mean.ses:ses.dev 1.0444 0.2910 7018.2611 3.589 0.000335 ***
## ses.dev:sectorCatholic -1.6422 0.2331 7018.2611 -7.045 2.03e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) men.ss ses.dv sctrCt mn.s:.
## mean.ses 0.256
## ses.dev 0.000 0.000
## sectorCthlc -0.699 -0.356 0.000
## mn.ss:ss.dv 0.000 0.000 0.295 0.000
## ss.dv:sctrC 0.000 0.000 -0.696 0.000 -0.351
tidy(mod5)
## # A tibble: 8 × 8
## effect group term estimate std.error statistic df p.value
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 fixed <NA> (Intercept) 12.1 0.199 60.9 160. 1.40e-112
## 2 fixed <NA> mean.ses 5.34 0.369 14.5 151. 2.62e- 30
## 3 fixed <NA> ses.dev 2.94 0.151 19.5 7018. 3.58e- 82
## 4 fixed <NA> sectorCatholic 1.22 0.306 4.00 150. 9.92e- 5
## 5 fixed <NA> mean.ses:ses.… 1.04 0.291 3.59 7018. 3.35e- 4
## 6 fixed <NA> ses.dev:secto… -1.64 0.233 -7.05 7018. 2.03e- 12
## 7 ran_pars school sd__(Intercep… 1.54 NA NA NA NA
## 8 ran_pars Residual sd__Observati… 6.06 NA NA NA NA
# Test for random slope: LRT comparing random slope model vs random intercept model
anova(mod5, mod4)
## refitting model(s) with ML (instead of REML)
## Data: df
## Models:
## mod5: mathach ~ 1 + mean.ses * ses.dev + sector * ses.dev + (1 | school)
## mod4: mathach ~ 1 + mean.ses * ses.dev + sector * ses.dev + (1 + ses.dev | school)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## mod5 8 46513 46568 -23249 46497
## mod4 10 46516 46585 -23248 46496 1.0016 2 0.6061
# Based on the value of the observed Chisq statistic and its p-value, we keep
# the random intercept model with fixed slope for ses.dev: mod5
# Get mod5 predicted values by ses.dev, at different levels of school sector and
# mean.ses.
pred.val <- ggpredict(mod5, terms = c("ses.dev", "mean.ses [-1:0.5 by=0.5]", "sector")) %>%
as_tibble() %>%
dplyr::rename(ses.dev = x, mathach = predicted, mean.ses = group, sector = facet)
pred.val
## # A tibble: 64 × 7
## ses.dev mathach std.error conf.low conf.high mean.ses sector
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct>
## 1 -4 -0.799 1.20 -3.15 1.56 -1 Public
## 2 -4 6.99 1.54 3.97 10.0 -1 Catholic
## 3 -4 -0.220 0.743 -1.68 1.24 -0.5 Public
## 4 -4 7.57 1.03 5.55 9.60 -0.5 Catholic
## 5 -4 0.360 0.637 -0.888 1.61 0 Public
## 6 -4 8.15 0.706 6.77 9.54 0 Catholic
## 7 -4 0.939 1.00 -1.03 2.90 0.5 Public
## 8 -4 8.73 0.823 7.12 10.3 0.5 Catholic
## 9 -3 1.10 0.934 -0.733 2.93 -1 Public
## 10 -3 7.25 1.20 4.90 9.60 -1 Catholic
## # … with 54 more rows
# Plot these predicted effects
ggplot(pred.val) +
# Lines of mathach by ses.dev, grouped/colored by sector (Public vs Catholic)
geom_line(aes(y = mathach, x = ses.dev, group = sector, color = sector)) +
# Different panels for different mean.ses values
facet_grid(~ mean.ses) +
theme(legend.position="bottom")
mod4
but not in mod5
? Why?mod4
and mod5
?mod5
? What is the (fixed) effect of ses.dev
on mathach
? How does this change in Catholic vs Public schools? How does this change in schools whose student population has higher SES on average?mean.ses
) for Public and Catholic schools.# Estimated coefficients in selected model
mod5.res <- tidy(mod5)
# Get fixed parameters from model
mod5.res %>%
filter(effect == "fixed")
## # A tibble: 6 × 8
## effect group term estimate std.error statistic df p.value
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 fixed <NA> (Intercept) 12.1 0.199 60.9 160. 1.40e-112
## 2 fixed <NA> mean.ses 5.34 0.369 14.5 151. 2.62e- 30
## 3 fixed <NA> ses.dev 2.94 0.151 19.5 7018. 3.58e- 82
## 4 fixed <NA> sectorCatholic 1.22 0.306 4.00 150. 9.92e- 5
## 5 fixed <NA> mean.ses:ses.dev 1.04 0.291 3.59 7018. 3.35e- 4
## 6 fixed <NA> ses.dev:sectorCatho… -1.64 0.233 -7.05 7018. 2.03e- 12
# Fixed intercept
mod5.res %>%
filter(effect == "fixed", term == "(Intercept)") %>%
pull(estimate)
## [1] 12.12821
# Note that because Public schools are the reference category, for Catholic
# schools (dummy variable=1) we have to add the sectorCatholic parameter to get
# the actual fixed intercept.
mod5.res %>%
filter(effect == "fixed", term == "sectorCatholic") %>%
pull(estimate)
## [1] 1.224529
# Save the fixed intercepts.
# Fixed intercept for public schools.
(fixed_int_pub <- mod5.res %>%
filter(effect == "fixed", term == "(Intercept)") %>%
pull(estimate))
## [1] 12.12821
# sectorCatholic fixed effect.
(fixed_slo_cat <- mod5.res %>%
filter(effect == "fixed", term == "sectorCatholic") %>%
pull(estimate))
## [1] 1.224529
# Fixed intercept for Catholic schools: sum.
(fixed_int_cat <- fixed_int_pub + fixed_slo_cat)
## [1] 13.35274
# lme4::ranef can calculate estimates (conditional modes) for the intercept random effect
# for each school
lme4::ranef(mod5) %>%
str
## List of 1
## $ school:'data.frame': 160 obs. of 1 variable:
## ..$ (Intercept): num [1:160] -0.0712 0.4531 -1.6798 0.0479 -1.5259 ...
## ..- attr(*, "postVar")= num [1, 1, 1:160] 0.588 0.908 0.579 1.036 0.579 ...
## - attr(*, "class")= chr "ranef.mer"
ranef(mod5)$school %>%
head
## (Intercept)
## 1224 -0.07116497
## 1288 0.45311618
## 1296 -1.67981589
## 1308 0.04791777
## 1317 -1.52592390
## 1358 -0.53895146
# Save as data frame
(school.effects <- ranef(mod5)$school %>%
as_tibble(rownames = "school") %>%
# These are called u_j in certain textbooks.
rename(u_j = `(Intercept)`)
)
## # A tibble: 160 × 2
## school u_j
## <chr> <dbl>
## 1 1224 -0.0712
## 2 1288 0.453
## 3 1296 -1.68
## 4 1308 0.0479
## 5 1317 -1.53
## 6 1358 -0.539
## 7 1374 -1.50
## 8 1433 1.78
## 9 1436 1.30
## 10 1461 0.748
## # … with 150 more rows
# Note that school is character in this data frame: convert to factor.
school.effects %<>%
mutate(school = factor(school))
# Add school sector and mean.ses to the data frame of random effects.
# Remember we have these variables here:
lm.df
## # A tibble: 160 × 5
## school sector mean.ses intercept slope
## <fct> <fct> <dbl> <dbl> <dbl>
## 1 1224 Public -0.434 9.72 2.51
## 2 1288 Public 0.122 13.5 3.26
## 3 1296 Public -0.426 7.64 1.08
## 4 1308 Catholic 0.528 16.3 0.126
## 5 1317 Catholic 0.345 13.2 1.27
## 6 1358 Public -0.0197 11.2 5.07
## 7 1374 Public -0.0126 9.73 3.85
## 8 1433 Catholic 0.712 19.7 1.85
## 9 1436 Catholic 0.563 18.1 1.60
## 10 1461 Public 0.677 16.8 6.27
## # … with 150 more rows
# Join
school.effects %<>%
left_join(lm.df, by="school") %>%
dplyr::select(school, u_j, sector, mean.ses)
school.effects
## # A tibble: 160 × 4
## school u_j sector mean.ses
## <fct> <dbl> <fct> <dbl>
## 1 1224 -0.0712 Public -0.434
## 2 1288 0.453 Public 0.122
## 3 1296 -1.68 Public -0.426
## 4 1308 0.0479 Catholic 0.528
## 5 1317 -1.53 Catholic 0.345
## 6 1358 -0.539 Public -0.0197
## 7 1374 -1.50 Public -0.0126
## 8 1433 1.78 Catholic 0.712
## 9 1436 1.30 Catholic 0.563
## 10 1461 0.748 Public 0.677
## # … with 150 more rows
# Add a column with the fixed intercept estimate.
# Remember this is different for Catholic vs Public schools as per calculations
# above: we use dplyr::case_when() to set the fixed intercept estimate
# for each school depending on sector.
school.effects %<>%
mutate(fixed.int = case_when(
sector == "Public" ~ fixed_int_pub,
sector == "Catholic" ~ fixed_int_cat
))
# View
school.effects
## # A tibble: 160 × 5
## school u_j sector mean.ses fixed.int
## <fct> <dbl> <fct> <dbl> <dbl>
## 1 1224 -0.0712 Public -0.434 12.1
## 2 1288 0.453 Public 0.122 12.1
## 3 1296 -1.68 Public -0.426 12.1
## 4 1308 0.0479 Catholic 0.528 13.4
## 5 1317 -1.53 Catholic 0.345 13.4
## 6 1358 -0.539 Public -0.0197 12.1
## 7 1374 -1.50 Public -0.0126 12.1
## 8 1433 1.78 Catholic 0.712 13.4
## 9 1436 1.30 Catholic 0.563 13.4
## 10 1461 0.748 Public 0.677 12.1
## # … with 150 more rows
# Now we can add the fixed intercept estimate to the school random effect estimate
# to get each school's estimated realization of the random intercept.
school.effects %<>%
mutate(ran.int = fixed.int + u_j)
# View
school.effects
## # A tibble: 160 × 6
## school u_j sector mean.ses fixed.int ran.int
## <fct> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 1224 -0.0712 Public -0.434 12.1 12.1
## 2 1288 0.453 Public 0.122 12.1 12.6
## 3 1296 -1.68 Public -0.426 12.1 10.4
## 4 1308 0.0479 Catholic 0.528 13.4 13.4
## 5 1317 -1.53 Catholic 0.345 13.4 11.8
## 6 1358 -0.539 Public -0.0197 12.1 11.6
## 7 1374 -1.50 Public -0.0126 12.1 10.6
## 8 1433 1.78 Catholic 0.712 13.4 15.1
## 9 1436 1.30 Catholic 0.563 13.4 14.7
## 10 1461 0.748 Public 0.677 12.1 12.9
## # … with 150 more rows
# "Best" schools: highest value of random intercept (average math score at mean
# values of predictors).
school.effects %>%
arrange(desc(ran.int))
## # A tibble: 160 × 6
## school u_j sector mean.ses fixed.int ran.int
## <fct> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 3427 4.21 Catholic 0.153 13.4 17.6
## 2 7688 3.17 Catholic 0.186 13.4 16.5
## 3 8628 3.13 Catholic -0.140 13.4 16.5
## 4 8193 2.81 Catholic -0.177 13.4 16.2
## 5 9198 2.08 Catholic 0.492 13.4 15.4
## 6 2655 3.05 Public -0.702 12.1 15.2
## 7 2629 1.80 Catholic -0.138 13.4 15.2
## 8 1433 1.78 Catholic 0.712 13.4 15.1
## 9 4292 1.70 Catholic -0.486 13.4 15.1
## 10 2526 1.54 Catholic 0.327 13.4 14.9
## # … with 150 more rows
# Best among Catholic schools
school.effects %>%
filter(sector=="Catholic") %>%
arrange(desc(ran.int))
## # A tibble: 70 × 6
## school u_j sector mean.ses fixed.int ran.int
## <fct> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 3427 4.21 Catholic 0.153 13.4 17.6
## 2 7688 3.17 Catholic 0.186 13.4 16.5
## 3 8628 3.13 Catholic -0.140 13.4 16.5
## 4 8193 2.81 Catholic -0.177 13.4 16.2
## 5 9198 2.08 Catholic 0.492 13.4 15.4
## 6 2629 1.80 Catholic -0.138 13.4 15.2
## 7 1433 1.78 Catholic 0.712 13.4 15.1
## 8 4292 1.70 Catholic -0.486 13.4 15.1
## 9 2526 1.54 Catholic 0.327 13.4 14.9
## 10 3838 1.50 Catholic 0.145 13.4 14.9
## # … with 60 more rows
# Best among Public schools
school.effects %>%
filter(sector=="Public") %>%
arrange(desc(ran.int))
## # A tibble: 90 × 6
## school u_j sector mean.ses fixed.int ran.int
## <fct> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 2655 3.05 Public -0.702 12.1 15.2
## 2 6170 2.11 Public -0.302 12.1 14.2
## 3 6089 2.03 Public 0.0859 12.1 14.2
## 4 4420 1.98 Public -0.224 12.1 14.1
## 5 8357 1.80 Public -0.107 12.1 13.9
## 6 5640 1.55 Public -0.177 12.1 13.7
## 7 2336 1.53 Public 0.442 12.1 13.7
## 8 1942 1.53 Public 0.682 12.1 13.7
## 9 7697 1.49 Public 0.258 12.1 13.6
## 10 4642 1.48 Public 0.115 12.1 13.6
## # … with 80 more rows
# Visualize random intercept distribution by sector
ggplot(school.effects, aes(x=sector, y= ran.int)) + geom_boxplot()
# Visualize random intercepts by mean.ses in Catholic vs Public schools
ggplot(school.effects,
aes(x=mean.ses, y=ran.int, color = sector)) +
# Scatterplot geom
geom_point(shape=1) +
# linear regression line
geom_smooth(method="lm", se=FALSE) +
# Black/white theme
theme_bw()
## `geom_smooth()` using formula 'y ~ x'