We still use the lalonde data from the MatchIt package and use the
propensity score model that we found out in R example 5
library("MatchIt")
data("lalonde")
model <- glm(treat ~ re74 + race + married + I(re74^2) + re74:race, data = lalonde, family = "binomial")
eps <- predict(model, type = "response")
lalonde
Check weights and covariates balancing before trimming
Then we perform a first check of the weights by drawing the histogram
or boxplot of the weights. At this moment, the figures are not very
informative as there are units with extremely large weights.
## Calculate the raw weights (before normalization)
## Weights to estimate ATE
n.treated <- sum(lalonde$treat == 1)
n.control <- sum(lalonde$treat == 0)
weights <- ifelse(lalonde$treat == 1, 1/eps, 1/(1 - eps))
## Check the weights histogram
library(ggplot2)
temp.data <- data.frame(weights = weights, treated = as.factor(lalonde$treat))
ggplot(temp.data, aes(x = weights, fill = treated, color = treated)) +
geom_histogram(alpha = 0.5, position = "identity") +
xlab("Weights")
## We can also draw boxplots
ggplot(temp.data, aes(x = treated, y = weights, color = treated)) +
geom_boxplot()
- Check covariate balancing. If we do not perform any trimming and
just look at the full data, we can check covariate balancing from the
love plot or summary data of the standardized mean difference. Here, we
need to write our own function to draw the love plot.
## Need to change race (categorical) into indicators (numerical)
lalonde$black <- lalonde$race == "black"
lalonde$hispan <- lalonde$race == "hispan"
lalonde$white <- lalonde$race == "white"
## Draw love plot
love.plot = function(cov, treat, ## cov is the matrix of covariates and treat is a vector of treatment assignment
weights = rep(1, length(treat)),
plot = F)
{
## mean with normalized weights \sum w_i x_i / (\sum w_i)
treat.means <- colSums(cov[treat == 1,] * weights[treat == 1])/sum(weights[treat == 1])
treat.var <- colSums(t(t(cov[treat == 1,]) - treat.means)^2 *
weights[treat == 1])/sum(weights[treat == 1])
control.means <- colSums(cov[treat == 0,] * weights[treat == 0])/sum(weights[treat == 0])
control.var <- colSums(t(t(cov[treat == 0,]) - control.means)^2 *
weights[treat == 0])/sum(weights[treat == 0])
## the standardized mean differences for every covariate
smd <- (treat.means - control.means)/sqrt((treat.var + control.var)/2)
names(smd) <- colnames(cov)
if (plot == T) {
plot.data <- data.frame(smd = smd, covariates = names(smd))
range <- max(abs(smd))
ggplot(plot.data) + geom_point(aes(x = as.numeric(smd), y = covariates)) +
geom_vline(xintercept = 0) + xlim(-range, range) +
labs(x = 'Standardized Difference in Means')
}
return(smd)
}
colnames(lalonde)
[1] "treat" "age" "educ" "race" "married" "nodegree" "re74"
[8] "re75" "re78" "black" "hispan" "white"
raw.smd <- love.plot(lalonde[, c(2:3, 5:9, 10:12)], lalonde$treat)
weighted.smd <- love.plot(lalonde[, c(2:3, 5:9, 10:12)], lalonde$treat, weights = weights)
plot.data <- data.frame(smd = c(raw.smd, weighted.smd),
covariates = c(names(raw.smd), names(weighted.smd)),
category = c(rep("Original", length(raw.smd)), rep("IPW", length(weighted.smd))))
range <- max(abs(plot.data$smd))
ggplot(plot.data) + geom_point(aes(x = as.numeric(smd), y = covariates, color = category)) +
geom_vline(xintercept = c(-0.1, -0.05, 0, 0.05, 0.1),
linetype = c("solid", "dashed", "solid", "dashed", "solid")) +
xlim(-range, range) +
labs(x = 'Standardized Difference in Means')
Check weights and covariates balancing after trimming
Now we perform trimming and check covariate balancing again
## Histogram of estimated propensity score
temp.data <- data.frame(eps = eps, treated = as.factor(lalonde$treat))
ggplot(temp.data, aes(x = eps, fill = treated, color = treated)) +
geom_histogram(alpha = 0.5, position = "identity") + xlim(c(0, 1)) +
ggtitle("Histogram of eps before trimming")
rm.idx <- which(eps < 0.1 | eps > 0.9)
## remove control units
length(rm.idx)
[1] 237
## Check the histogram of eps again
ggplot(temp.data[-rm.idx, ], aes(x = eps, fill = treated, color = treated)) +
geom_histogram(alpha = 0.5, position = "identity") + xlim(c(0, 1)) +
ggtitle("Histogram of eps after trimming")
temp.data <- data.frame(weights = weights, treated = as.factor(lalonde$treat))
## We can also draw boxplots
ggplot(temp.data[-rm.idx, ], aes(x = treated, y = weights, color = treated)) +
geom_boxplot()
Check covariate balancing after trimming. Covariate balancing is much
better.
raw.smd <- love.plot(lalonde[-rm.idx, c(2:3, 5:9, 10:12)], lalonde$treat[-rm.idx])
weighted.smd <- love.plot(lalonde[-rm.idx, c(2:3, 5:9, 10:12)], lalonde$treat[-rm.idx],
weights = weights[-rm.idx])
plot.data <- data.frame(smd = c(raw.smd, weighted.smd),
covariates = c(names(raw.smd), names(weighted.smd)),
category = c(rep("Original", length(raw.smd)), rep("IPW", length(weighted.smd))))
range <- max(abs(plot.data$smd))
ggplot(plot.data) + geom_point(aes(x = as.numeric(smd), y = covariates, color = category)) +
geom_vline(xintercept = c(-0.1, -0.05, 0, 0.05, 0.1),
linetype = c("solid", "dashed", "solid", "dashed", "solid")) +
xlim(-range, range) +
labs(x = 'Standardized Difference in Means')
Estimate causal effect by weighted least square regression and
estimate the variance by Sandwich estimator
lm.result <- lm(re78 ~ treat, weights = weights[-rm.idx], data = lalonde[-rm.idx, ])
summary(lm.result)
Call:
lm(formula = re78 ~ treat, data = lalonde[-rm.idx, ], weights = weights[-rm.idx])
Weighted Residuals:
Min 1Q Median 3Q Max
-17233 -6644 -2553 4239 64304
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5135.9 480.0 10.700 <2e-16 ***
treat 1203.2 676.4 1.779 0.0761 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9335 on 375 degrees of freedom
Multiple R-squared: 0.008368, Adjusted R-squared: 0.005723
F-statistic: 3.164 on 1 and 375 DF, p-value: 0.07607
library(sandwich)
tau_hat <- lm.result$coefficients[2]
SE <- sqrt(diag(vcovHC(lm.result, type = "HC2")))[2]
## get the 95% CI
result <- c(tau_hat, SE, c(tau_hat- 1.96 * SE, tau_hat + 1.96 * SE))
names(result) <- c("est", "sd", "CI_lower", "CI_upper")
result
est sd CI_lower CI_upper
1203.2270 768.3267 -302.6934 2709.1475
---
title: "R Example 7: Inverse propensity score weighting"
output: html_notebook
---



We still use the lalonde data from the MatchIt package and use the propensity score model that we found out in R example 5

```{r}
library("MatchIt")
data("lalonde")

model <- glm(treat ~ re74 + race + married + I(re74^2) + re74:race, data = lalonde, family = "binomial")
eps <- predict(model, type = "response")

lalonde
```

## Check weights and covariates balancing before trimming

Then we perform a first check of the weights by drawing the histogram or boxplot of the weights. At this moment, the figures are not very informative as there are units with extremely large weights.
```{r}
## Calculate the raw weights (before normalization)
## Weights to estimate ATE
n.treated <- sum(lalonde$treat == 1)
n.control <- sum(lalonde$treat == 0)
weights <- ifelse(lalonde$treat == 1, 1/eps, 1/(1 - eps))

## Check the weights histogram
library(ggplot2)
temp.data <- data.frame(weights = weights, treated = as.factor(lalonde$treat))
ggplot(temp.data, aes(x = weights, fill = treated, color = treated)) + 
  geom_histogram(alpha = 0.5, position = "identity") + 
  xlab("Weights") 

## We can also draw boxplots
ggplot(temp.data, aes(x = treated, y = weights, color = treated)) + 
  geom_boxplot() 
```

- Check covariate balancing.
If we do not perform any trimming and just look at the full data, we can check covariate balancing from the love plot or summary data of the standardized mean difference. Here, we need to write our own function to draw the love plot. 
```{r}
## Need to change race (categorical) into indicators (numerical)
lalonde$black <- lalonde$race == "black"
lalonde$hispan <- lalonde$race == "hispan"
lalonde$white <- lalonde$race == "white"

## Draw love plot
love.plot = function(cov, treat,  ## cov is the matrix of covariates and treat is a vector of treatment assignment
                     weights = rep(1, length(treat)),
                     plot = F) 
{
    
    ## mean with normalized weights \sum w_i x_i / (\sum w_i)
  treat.means <- colSums(cov[treat == 1,] * weights[treat == 1])/sum(weights[treat == 1])
  treat.var <- colSums(t(t(cov[treat == 1,]) - treat.means)^2 *
                          weights[treat == 1])/sum(weights[treat == 1])
  
  control.means <- colSums(cov[treat == 0,] * weights[treat == 0])/sum(weights[treat == 0])
  control.var <- colSums(t(t(cov[treat == 0,]) - control.means)^2 *
                          weights[treat == 0])/sum(weights[treat == 0])
  
  ## the standardized mean differences for every covariate
  smd <- (treat.means - control.means)/sqrt((treat.var + control.var)/2)
  names(smd) <- colnames(cov)
  
  if (plot == T) {
    plot.data <- data.frame(smd = smd, covariates = names(smd))
    range <- max(abs(smd))
    ggplot(plot.data) + geom_point(aes(x = as.numeric(smd), y = covariates)) +
      geom_vline(xintercept = 0) + xlim(-range, range) +
      labs(x = 'Standardized Difference in Means')
  }
  return(smd)
}

colnames(lalonde)
raw.smd <- love.plot(lalonde[, c(2:3, 5:9, 10:12)], lalonde$treat)
weighted.smd <- love.plot(lalonde[, c(2:3, 5:9, 10:12)], lalonde$treat, weights = weights)


plot.data <- data.frame(smd = c(raw.smd, weighted.smd), 
                        covariates = c(names(raw.smd), names(weighted.smd)),
                        category = c(rep("Original", length(raw.smd)), rep("IPW", length(weighted.smd))))
range <- max(abs(plot.data$smd))

ggplot(plot.data) + geom_point(aes(x = as.numeric(smd), y = covariates, color = category)) +
      geom_vline(xintercept = c(-0.1, -0.05, 0, 0.05, 0.1),
                 linetype = c("solid", "dashed", "solid", "dashed", "solid")) + 
      xlim(-range, range) +
      labs(x = 'Standardized Difference in Means')
```

## Check weights and covariates balancing after trimming

Now we perform trimming and check covariate balancing again
```{r}
## Histogram of estimated propensity score
temp.data <- data.frame(eps = eps, treated = as.factor(lalonde$treat))
ggplot(temp.data, aes(x = eps, fill = treated, color = treated)) + 
  geom_histogram(alpha = 0.5, position = "identity") + xlim(c(0, 1)) +
  ggtitle("Histogram of eps before trimming")

rm.idx <- which(eps < 0.1 | eps > 0.9)
## remove control units
length(rm.idx)

## Check the histogram of eps again
ggplot(temp.data[-rm.idx, ], aes(x = eps, fill = treated, color = treated)) + 
  geom_histogram(alpha = 0.5, position = "identity") + xlim(c(0, 1)) +
  ggtitle("Histogram of eps after trimming")


temp.data <- data.frame(weights = weights, treated = as.factor(lalonde$treat))
## We can also draw boxplots
ggplot(temp.data[-rm.idx, ], aes(x = treated, y = weights, color = treated)) + 
  geom_boxplot() 
```

Check covariate balancing after trimming. Covariate balancing is much better.
```{r}
raw.smd <- love.plot(lalonde[-rm.idx, c(2:3, 5:9, 10:12)], lalonde$treat[-rm.idx])
weighted.smd <- love.plot(lalonde[-rm.idx, c(2:3, 5:9, 10:12)], lalonde$treat[-rm.idx], 
                          weights = weights[-rm.idx])
plot.data <- data.frame(smd = c(raw.smd, weighted.smd), 
                        covariates = c(names(raw.smd), names(weighted.smd)),
                        category = c(rep("Original", length(raw.smd)), rep("IPW", length(weighted.smd))))
range <- max(abs(plot.data$smd))
ggplot(plot.data) + geom_point(aes(x = as.numeric(smd), y = covariates, color = category)) +
      geom_vline(xintercept = c(-0.1, -0.05, 0, 0.05, 0.1),
                 linetype = c("solid", "dashed", "solid", "dashed", "solid")) + 
      xlim(-range, range) +
      labs(x = 'Standardized Difference in Means')
```

## Estimate causal effect by weighted least square regression and estimate the variance by Sandwich estimator

```{r}
lm.result <- lm(re78 ~ treat, weights = weights[-rm.idx], data = lalonde[-rm.idx, ])
summary(lm.result)

library(sandwich)
tau_hat <- lm.result$coefficients[2]
SE <- sqrt(diag(vcovHC(lm.result, type = "HC2")))[2]

## get the 95% CI
result <- c(tau_hat, SE, c(tau_hat- 1.96 * SE, tau_hat + 1.96 * SE))
names(result) <- c("est", "sd", "CI_lower", "CI_upper")
result
```