We use the Lalonde data that we encountered in HW2 and will encounter again in HW4. Here we focus on the observational data of the job training program. The job training program took place in 1976, and we are interested in its impact on 1978 earnings.
We obtain the data from the R package MatchIt. This is a subset of the data that you will deal with in HW4.
## Load MatchIt, which ships the Lalonde observational dataset
library("MatchIt")
data("lalonde")
## Print the data: one row per individual; `treat` is the job-training
## indicator and `re78` (column 9) is the 1978 earnings outcome
lalonde
– Add linear terms
Set re74 as the base covariate that we know we need to include a priori.
## Propensity-score models for treatment assignment.
## Column 9 (re78, the earnings outcome) is dropped so it never enters the model.
## `temp`: the richest allowed model — every remaining covariate as a linear term.
temp <- glm(treat ~ ., family = "binomial", data = lalonde[, -9])
## `temp0`: the minimal model containing only re74, the covariate chosen a priori.
temp0 <- glm(treat ~ re74, family = "binomial", data = lalonde[, -9])
library(MASS)
Attaching package: ‘MASS’
The following object is masked from ‘package:dplyr’:
select
## Forward selection by AIC: start from the a-priori model (temp0) and allow
## growth up to the full linear model (temp).
## Set trace = FALSE for more concise messages.
model <- stepAIC(
  temp0,
  scope = list(lower = formula(temp0), upper = formula(temp)),
  direction = "forward"
)
Start: AIC=708.54
treat ~ re74
Df Deviance AIC
+ race 2 504.05 512.05
+ married 1 675.29 681.29
<none> 704.54 708.54
+ nodegree 1 702.77 708.77
+ educ 1 702.85 708.85
+ re75 1 704.15 710.15
+ age 1 704.23 710.23
Step: AIC=512.05
treat ~ re74 + race
Df Deviance AIC
+ married 1 496.34 506.34
+ educ 1 501.25 511.25
<none> 504.05 512.05
+ re75 1 503.72 513.72
+ nodegree 1 503.82 513.82
+ age 1 503.94 513.94
Step: AIC=506.34
treat ~ re74 + race + married
Df Deviance AIC
<none> 496.34 506.34
+ educ 1 494.55 506.55
+ re75 1 494.92 506.92
+ nodegree 1 495.94 507.94
+ age 1 496.00 508.00
## Inspect the AIC-selected linear propensity-score model
summary(model)
Call:
glm(formula = treat ~ re74 + race + married, family = "binomial",
data = lalonde[, -9])
Deviance Residuals:
Min 1Q Median 3Q Max
-1.5694 -0.4638 -0.3033 0.8308 2.6076
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 8.863e-01 1.565e-01 5.663 1.48e-08 ***
re74 -4.949e-05 2.275e-05 -2.176 0.02958 *
racehispan -2.150e+00 3.596e-01 -5.979 2.25e-09 ***
racewhite -3.062e+00 2.826e-01 -10.833 < 2e-16 ***
married -7.262e-01 2.631e-01 -2.760 0.00578 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 751.49 on 613 degrees of freedom
Residual deviance: 496.34 on 609 degrees of freedom
AIC: 506.34
Number of Fisher Scoring iterations: 5
– Add interactions
## Candidate scope for the second round of forward selection: all pairwise
## interactions among the selected covariates plus a quadratic term in re74
tempfull <- glm(treat ~(race + married + re74)^2 + I(re74^2), data = lalonde[, -9], family = "binomial")
library(MASS)
## Forward AIC selection again, starting from the linear model chosen above
## and allowed to grow toward tempfull
model <- stepAIC(model, scope=list(upper = formula(tempfull), lower = formula(model)), direction = "forward")
Start: AIC=502.57
treat ~ re74 + race + married + I(re74^2) + re74:race
Df Deviance AIC
<none> 486.57 502.57
+ married:re74 1 485.92 503.92
+ race:married 2 485.13 505.13
## Inspect the model after adding the interaction and quadratic terms
summary(model)
Call:
glm(formula = treat ~ re74 + race + married + I(re74^2) + re74:race,
family = "binomial", data = lalonde)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.5612 -0.5542 -0.2251 0.8372 3.0364
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 8.683e-01 1.664e-01 5.218 1.81e-07 ***
re74 -1.148e-04 6.260e-05 -1.834 0.0667 .
racehispan -2.034e+00 4.175e-01 -4.871 1.11e-06 ***
racewhite -2.642e+00 3.183e-01 -8.300 < 2e-16 ***
married -6.451e-01 2.665e-01 -2.420 0.0155 *
I(re74^2) 4.976e-09 3.270e-09 1.522 0.1281
re74:racehispan -2.781e-05 7.658e-05 -0.363 0.7165
re74:racewhite -1.644e-04 9.291e-05 -1.770 0.0768 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 751.49 on 613 degrees of freedom
Residual deviance: 486.57 on 606 degrees of freedom
AIC: 502.57
Number of Fisher Scoring iterations: 7
## Linearized propensity scores: predict() on a glm defaults to the link
## (log-odds) scale, which is exactly the linearized propensity score.
lps <- predict(model)

## Overlay histograms of the scores by treatment status to assess overlap.
library(ggplot2)
plot.df <- data.frame(lps = lps, treated = as.factor(lalonde$treat))
ggplot(plot.df, aes(x = lps, fill = treated, color = treated)) +
  geom_histogram(position = "identity", alpha = 0.5) +
  xlab("Linearized propensity score")
Stratifying the units into K = 5 equal-sized strata is problematic due to poor overlap.
Trimming to improve overlap
## Trimming to improve overlap.
## Refit the selected specification on the full data to obtain estimated
## propensity scores (fitted values are on the probability scale).
eps <- glm(treat ~ re74 + race + married + I(re74^2) + re74:race, data = lalonde, family = "binomial")$fitted.values
## Drop units with extreme scores (outside [0.1, 0.9]) ...
rm.idx <- which(eps < 0.1 | eps > 0.9)
## ... and units outside the common support: treated units whose score exceeds
## every control score, and controls whose score is below every treated score.
idx.treated <- which((lalonde$treat == 1) & (eps > max(eps[lalonde$treat == 0])))
idx.control <- which((lalonde$treat == 0) & (eps < min(eps[lalonde$treat == 1])))
rm.idx <- union(rm.idx, c(idx.treated, idx.control))
## BUG FIX: x[-integer(0)] is equivalent to x[integer(0)] and selects ZERO
## rows, so subsetting unconditionally would wipe out the data whenever no
## unit needs trimming. Only subset when there is something to remove.
if (length(rm.idx) > 0) {
  lalonde <- lalonde[-rm.idx, ]
  eps <- eps[-rm.idx]
}
lalonde
## First, create a data frame to store the current grouping information:
## e = estimated propensity score, treat = treatment indicator,
## b = block label (every unit starts in a single block)
temp <- data.frame(e = eps, treat = lalonde$treat,
                   b = 1)
## Stop splitting once every within-block |t| statistic is below this cutoff
t.max <- 1.96
# check whether the |t| statistic is above t.max for the first iteration
t.stat <- t.test(x = temp$e[temp$treat == 1], y = temp$e[temp$treat == 0], var.equal = TRUE)$statistic
## FIX: take abs() so a negative t statistic also triggers splitting —
## the loop-end update uses abs(), and the sign of t depends only on which
## group happens to have the larger mean score. Also use `<-`, not `=`.
condition <- abs(t.stat) > t.max
# minimum n.t or n.c allowed in each block
size <- 3
size.new <- 20
# continue until all t statistics are below t.max
set.seed(0)
while (condition)
{
  # calculate size in each group
  # we want to not split a block if it is too small
  b <- max(temp$b)
  ## A block is ignored (never split) when either arm has fewer than `size`
  ## units or the block holds fewer than 2 * size.new units in total.
  ignore <- sapply(seq_len(b), function(j) {
    n.t <- sum(temp$treat == 1 & temp$b == j)
    n.c <- sum(temp$treat == 0 & temp$b == j)
    return(n.t < size | n.c < size | (n.t + n.c < size.new * 2))
  })
  # split unbalanced blocks into more blocks
  split <- which((abs(t.stat) > t.max) & (!ignore))
  if (length(split) == 0)
    break
  ## Keep a snapshot of the block labels: assignments change inside the loop
  ## below, but all membership tests refer to the labels at the start of
  ## this iteration.
  b.current <- temp$b
  for (j in split)
  {
    ## Split block j at the median propensity score into a lower and an
    ## upper stratum.
    cutoff <- median(temp$e[b.current == j])
    ## extract the index of units strictly below / above the cutoff
    idx.s <- which(b.current == j & temp$e < cutoff)
    idx.l <- which(b.current == j & temp$e > cutoff)
    ## randomly put half of the ties into one category.
    ## BUG FIX: sample(x) on a length-1 numeric x means sample.int(x), i.e.
    ## a permutation of 1:x — a single tied row index k would silently be
    ## replaced by the unrelated units 1..k. Permute only when there is
    ## more than one tie. (Note: this changes RNG consumption relative to
    ## the recorded output below, so exact block counts may differ.)
    idx.e <- which(temp$e == cutoff & b.current == j)
    if (length(idx.e) > 1)
      idx.e <- sample(idx.e)
    n.tie <- length(idx.e)
    if (n.tie >= 1) {
      if (n.tie > 1)
        idx.s <- c(idx.s, idx.e[1:round(n.tie/2)])
      idx.l <- c(idx.l, idx.e[(round(n.tie/2)+ 1):n.tie])
    }
    ## we split only when each new stratum keeps more than `size` treated
    ## AND control units; this also guarantees t.test below never sees a
    ## group with fewer than 2 observations
    if (sum(temp$treat[idx.s]==1) > size && sum(temp$treat[idx.s]==0) > size &&
        sum(temp$treat[idx.l]==1) > size && sum(temp$treat[idx.l]==0) > size) {
      # anything above the current block has to be moved up 1
      temp$b[b.current > j] <- temp$b[b.current > j] + 1
      ## the upper new stratum also has its block index increased by 1
      temp$b[idx.l] <- temp$b[idx.l] + 1
    }
    ## We don't do anything if we do not want to split
  }
  # recompute the t statistic for each (possibly new) block
  b <- max(temp$b)
  t.stat <- sapply(seq_len(b), function(j) {
    t.test(x = temp$e[temp$treat == 1 & temp$b == j],
           y = temp$e[temp$treat == 0 & temp$b == j], var.equal = TRUE)$statistic
  })
  ## Update condition
  # keep iterating while ANY block is above t.max; blocks that are too
  # small are filtered out by `ignore` at the top, so the loop terminates
  condition <- any(abs(t.stat) > t.max)
}
## Attach the final block labels to the data and cross-tabulate
## treatment status by block
lalonde$blocks <- temp$b
table(lalonde[, c("treat", "blocks")])
blocks
treat 1 2 3 4 5
0 72 36 18 7 67
1 7 12 8 19 130
## check the range of estimated propensity scores within each block
## (one column per block: row 1 = min, row 2 = max; adjacent blocks share
## a boundary because splits are made at the within-block median)
sapply(1:max(lalonde$blocks), function(j) range(temp$e[temp$b == j]))
[,1] [,2] [,3] [,4] [,5]
[1,] 0.1011357 0.1451386 0.2376577 0.4672575 0.5555612
[2,] 0.1451386 0.2376577 0.4567325 0.5555612 0.7043876
## CI from Neyman: for each block, the difference in mean 1978 earnings
## (re78) between treated and control, plus the Neyman variance estimate
## s_t^2 / n_t + s_c^2 / n_c.
est_per_block <- sapply(1:max(lalonde$blocks), function(idx) {
  in.block <- lalonde$blocks == idx
  y.t <- lalonde$re78[in.block & lalonde$treat == 1]
  y.c <- lalonde$re78[in.block & lalonde$treat == 0]
  ## row 1: difference-in-means estimate; row 2: estimated variance
  c(mean(y.t) - mean(y.c),
    var(y.t) / length(y.t) + var(y.c) / length(y.c))
})
colnames(est_per_block) <- 1:max(lalonde$blocks)
rownames(est_per_block) <- c("mean", "var")
est_per_block
1 2 3 4 5
mean 2252.462 -1206.69 3752.993 -980.4199 1318.621
var 7019811.047 2558729.60 11917739.160 12153089.3809 1021087.344
## Weight each block by its share of the trimmed sample
wts <- table(lalonde$blocks) / length(lalonde$blocks)
## Point estimate: weighted average of the within-block estimates;
## variance: weighted (by wts^2) sum of the within-block variances
tau_hat <- sum(est_per_block[1, ] * wts)
sd_tau_hat <- sqrt(sum(est_per_block[2, ] * wts^2))
## 95% normal-approximation confidence interval
ci <- tau_hat + c(-1, 1) * 1.96 * sd_tau_hat
result <- c(est = tau_hat, sd = sd_tau_hat, CI_lower = ci[1], CI_upper = ci[2])
result
est sd CI_lower CI_upper
1201.8050 864.2807 -492.1852 2895.7952