This vignette walks through a concrete replication exercise:
re-estimating the Duflo (2001) INPRES specification with
Rfuzzydid.
The key idea is to recover the wage return to schooling from cohort-by-district variation in treatment intensity, using the fuzzy DID estimators in de Chaisemartin and D’Haultfoeuille (2018).
The replication starts from inpresdata.dta, which is not
bundled with the package to keep the source tarball small. The file
originates from David Roodman’s replication materials (droodman/Duflo-2001).
In the companion archive README (droodman/Duflo-2001),
he notes that the main 1995 SUPAS file is “not contained in this
repository” and links to that downloadable location. Download
inpresdata.dta from Roodman’s droodman/Duflo2001
repository when running this vignette locally.
Following the original design, we keep two cohort windows:
The t indicator equals 1 for the later cohort.
# Define cohort groups
cohort0 <- raw$p504thn >= 57 & raw$p504thn <= 62
cohort1 <- raw$p504thn >= 68 & raw$p504thn <= 72
# Keep observations with non-missing outcome and treatment
keep <- !is.na(raw$lhwage) & !is.na(raw$yeduc) & (cohort0 | cohort1)
dat <- raw[keep, c("birthpl", "lhwage", "yeduc", "p504thn")]
# Time indicator: t = 1 for later cohort
dat$t <- as.integer(dat$p504thn >= 68 & dat$p504thn <= 72)The original Stata workflow builds “super-groups” based on how schooling changes within district between the two cohort windows. We follow the same logic with a chi-square screen and the sign of the change in mean schooling:
pval_threshold <- 0.5
districts <- sort(unique(dat$birthpl))
gstar_map <- setNames(integer(length(districts)), as.character(districts))
for (g in districts) {
i <- dat$birthpl == g
sub <- dat[i, ]
tab <- table(sub$yeduc, sub$t)
pval <- suppressWarnings(chisq.test(tab)$p.value)
evol <- mean(sub$yeduc[sub$t == 1]) - mean(sub$yeduc[sub$t == 0])
if (!is.na(pval) && pval < pval_threshold) {
gstar_map[[as.character(g)]] <- if (evol > 0) 1L else if (evol < 0) -1L else 0L
} else {
gstar_map[[as.character(g)]] <- 0L
}
}
dat$gstar <- unname(gstar_map[as.character(dat$birthpl)])This produces three groups:
gstar = 1: Districts where education increased
(treatment intensity rose)gstar = -1: Districts where education decreasedgstar = 0: Districts with no significant changeAs in the paper’s setup, we estimate two arms separately and combine them after:
To aggregate, we weight each arm by:
p_inc, p_dec)dD_inc,
dD_dec)The decreasing arm uses -dD_dec so both weights are on a
comparable “increase in treatment” scale.
# Treatment intensity weights
dD_inc <- mean(arm_inc_data$yeduc[arm_inc_data$G == 1 & arm_inc_data$t == 1]) -
mean(arm_inc_data$yeduc[arm_inc_data$G == 1 & arm_inc_data$t == 0]) -
mean(arm_inc_data$yeduc[arm_inc_data$G == 0 & arm_inc_data$t == 1]) +
mean(arm_inc_data$yeduc[arm_inc_data$G == 0 & arm_inc_data$t == 0])
dD_dec <- mean(arm_dec_data$yeduc[arm_dec_data$G == 1 & arm_dec_data$t == 1]) -
mean(arm_dec_data$yeduc[arm_dec_data$G == 1 & arm_dec_data$t == 0]) -
mean(arm_dec_data$yeduc[arm_dec_data$G == 0 & arm_dec_data$t == 1]) +
mean(arm_dec_data$yeduc[arm_dec_data$G == 0 & arm_dec_data$t == 0])
p_inc <- mean(dat$gstar == 1)
p_dec <- mean(dat$gstar == -1)
w_inc <- p_inc * dD_inc
w_dec <- p_dec * (-dD_dec)
# Extract point estimates
did_inc <- fit_inc$late$estimate[fit_inc$late$estimator == "W_DID"]
tc_inc <- fit_inc$late$estimate[fit_inc$late$estimator == "W_TC"]
cic_inc <- fit_inc$late$estimate[fit_inc$late$estimator == "W_CIC"]
did_dec <- fit_dec$late$estimate[fit_dec$late$estimator == "W_DID"]
tc_dec <- fit_dec$late$estimate[fit_dec$late$estimator == "W_TC"]
cic_dec <- fit_dec$late$estimate[fit_dec$late$estimator == "W_CIC"]
# Weighted aggregation
did_agg <- (did_inc * w_inc + did_dec * w_dec) / (w_inc + w_dec)
tc_agg <- (tc_inc * w_inc + tc_dec * w_dec) / (w_inc + w_dec)
cic_agg <- (cic_inc * w_inc + cic_dec * w_dec) / (w_inc + w_dec)
cat("Aggregate Estimates:\n")
cat(sprintf("DID = %.6f\n", did_agg))
cat(sprintf("TC = %.6f\n", tc_agg))
cat(sprintf("CIC = %.6f\n", cic_agg))The following one-shot Stata code starts from
inpresdata.dta and reproduces the arm-level and aggregate
point estimates:
The Stata snippet assumes the Stata fuzzydid command is
already installed locally; Rfuzzydid no longer ships the
.ado sources.
clear all
set more off
local pval_threshold 0.5
use inpresdata.dta, clear
keep if lhwage < . & yeduc < . & (inrange(p504thn,57,62) | inrange(p504thn,68,72))
gen byte t = inrange(p504thn,68,72)
gen gstar = .
levelsof birthpl, local(districts)
local pval_threshold = 0.05
quietly foreach g of local districts {
tab yeduc t if birthpl == `g', chi2
scalar p = r(p)
summarize yeduc if t == 1 & birthpl == `g', meanonly
scalar m1 = r(mean)
summarize yeduc if t == 0 & birthpl == `g', meanonly
scalar m0 = r(mean)
scalar evol = m1 - m0
replace gstar = (scalar(evol) > 0 & scalar(p) < `pval_threshold') - (scalar(evol) < 0 & scalar(p) < `pval_threshold') if birthpl == `g'
}
gen byte inc = (gstar == 1)
gen byte dec = (gstar == -1)
summarize inc, meanonly
scalar p_inc = r(mean)
summarize dec, meanonly
scalar p_dec = r(mean)
* Arm g*=1 vs g*=0
preserve
keep if gstar >= 0
gen byte G = (gstar == 1)
fuzzydid lhwage G t yeduc, did tc cic newcateg(5 8 11 14 1000) nose
matrix M = e(b_LATE)
scalar did_inc = M[1,1]
scalar tc_inc = M[2,1]
scalar cic_inc = M[3,1]
summarize yeduc if G == 1 & t == 1, meanonly
scalar D11 = r(mean)
summarize yeduc if G == 1 & t == 0, meanonly
scalar D10 = r(mean)
summarize yeduc if G == 0 & t == 1, meanonly
scalar D01 = r(mean)
summarize yeduc if G == 0 & t == 0, meanonly
scalar D00 = r(mean)
scalar dD_inc = (D11 - D10) - (D01 - D00)
restore
* Arm g*=-1 vs g*=0
preserve
keep if gstar <= 0
gen byte G = (gstar == -1)
fuzzydid lhwage G t yeduc, did tc cic newcateg(5 8 11 14 1000) nose
matrix M = e(b_LATE)
scalar did_dec = M[1,1]
scalar tc_dec = M[2,1]
scalar cic_dec = M[3,1]
summarize yeduc if G == 1 & t == 1, meanonly
scalar D11m = r(mean)
summarize yeduc if G == 1 & t == 0, meanonly
scalar D10m = r(mean)
summarize yeduc if G == 0 & t == 1, meanonly
scalar D01m = r(mean)
summarize yeduc if G == 0 & t == 0, meanonly
scalar D00m = r(mean)
scalar dD_dec_pos = -((D11m - D10m) - (D01m - D00m))
restore
scalar w_inc = p_inc * dD_inc
scalar w_dec = p_dec * dD_dec_pos
scalar did_agg = (did_inc * w_inc + did_dec * w_dec) / (w_inc + w_dec)
scalar tc_agg = (tc_inc * w_inc + tc_dec * w_dec) / (w_inc + w_dec)
scalar cic_agg = (cic_inc * w_inc + cic_dec * w_dec) / (w_inc + w_dec)
display "Arm g*=1: DID = " %9.6f did_inc " ; TC = " %9.6f tc_inc " ; CIC = " %9.6f cic_inc
display "Arm g*=-1: DID = " %9.6f did_dec " ; TC = " %9.6f tc_dec " ; CIC = " %9.6f cic_dec
display "AGG DID = " %9.6f did_agg
display "AGG TC = " %9.6f tc_agg
display "AGG CIC = " %9.6f cic_aggRunning that code in Stata reports:
g*=1: DID = 0.123679191, TC =
0.079868219, CIC = 0.074316049g*=-1: DID = 0.117024113, TC =
0.109567707, CIC = 0.1143018550.122244473, TC =
0.086270906, CIC = 0.082936285The next chunk checks parity directly in R:
stata_vals <- c(
did_inc = 0.123679191,
tc_inc = 0.079868219,
cic_inc = 0.074316049,
did_dec = 0.117024113,
tc_dec = 0.109567707,
cic_dec = 0.114301855,
did_agg = 0.122244473,
tc_agg = 0.086270906,
cic_agg = 0.082936285
)
r_vals <- c(
did_inc = did_inc,
tc_inc = tc_inc,
cic_inc = cic_inc,
did_dec = did_dec,
tc_dec = tc_dec,
cic_dec = cic_dec,
did_agg = did_agg,
tc_agg = tc_agg,
cic_agg = cic_agg
)
cmp <- data.frame(
estimate = names(stata_vals),
R = unname(r_vals),
Stata = unname(stata_vals),
abs_diff = abs(unname(r_vals) - unname(stata_vals))
)
knitr::kable(cmp, digits = 6)Point estimates match to numerical tolerance. Confidence intervals can still differ when bootstrap is enabled, because draws and RNG streams differ across software.
Duflo, E. (2001). Schooling and Labor Market Consequences of School Construction in Indonesia. American Economic Review, 91(4): 795-813.
de Chaisemartin, C. and D’Haultfoeuille, X. (2018). Fuzzy Differences-in-Differences. Review of Economic Studies, 85(2): 999-1028. doi:10.1093/restud/rdx049.
de Chaisemartin, C., D’Haultfoeuille, X., and Guyonvarch, Y. (2018). Fuzzy Differences-in-Differences with Stata. Stata Journal. doi:10.1177/1536867X19854019.