The post motivated by a tweetorial from Darren Dahly

In an experiment, do we adjust for covariates that differ between treatment levels measured pre-experiment (“imbalance” in random assignment), where a difference is inferred from a t-test with p < 0.05? Or do we adjust for all covariates, regardless of differences pre-test? Or do we adjust only for covariates that have sustantial correlation with the outcome? Or do we not adjust at all?

The original tweet focussed on Randomized Clinical Trials, which typically have large sample size. Here I simulate experimental biology, which typically has much smaller n.

library(ggplot2)
library(GGally)
library(data.table)

source("../R/fake_x.R") # bookdown

Fake data

Generate \(p\) correlated variables and assign the first to the response (\(\mathbf{y}\)) and the rest to the covariates (\(\mathbf{X}\)). Construct a treatment variable and effect and add this to the response.

n <- 100 # per treatment level - this is modified below
p <- 3 # number of covariates (columns of the data)
pp1 <- p+1
beta_0 <- 0 # intercept

niter <- 2000 # modified below
measure_cols <- c("no_adjust", "imbalance", "all_covariates", "weak_covariates", "strong_covariates")

xcols <- paste0("X", 1:p)
build_ycols <- c("Y_o", xcols)
cor_ycols <- c("Y", xcols)

b_mat <- data.table(NULL)
se_mat <- data.table(NULL)
p_mat <- data.table(NULL)
ci_mat <- data.table(NULL)

for(beta_1 in c(0, 0.2, 0.8)){ # treatment effect on standardized scale
  beta <- c(beta_0, beta_1)
  for(n in c(6, 10, 50)){
    # larger iterations with smaller n
    niter <- round((3*10^4)/sqrt(n), 0)
    niter <- 2000
    
    # repopulate with NA each n
    b <- se <- pval <- ci <- matrix(NA, nrow=niter, ncol=length(measure_cols))
    colnames(b) <- colnames(se) <- colnames(pval) <- colnames(ci) <- measure_cols
    
    Treatment <- rep(c("Cn", "Tr"), each=n)
    X <- model.matrix(formula("~ Treatment"))
    
    for(iter in 1:niter){
      # generate p random, correlated variables. The first is assigned to Y
      fake_data <- fake.X(n*2, pp1, fake.eigenvectors(pp1), fake.eigenvalues(pp1))
      colnames(fake_data) <- build_ycols
      
      # resacale so that var(Y) = 1, where Y is the first column
      fake_data <- fake_data/sd(fake_data[,1])
      
      fake_data <- data.table(fake_data)
      
      # view the scatterplots
      #gg <- ggpairs(X,progress = ggmatrix_progress(clear = FALSE))
      show_it <- FALSE
      if(show_it ==TRUE){
        gg <- ggpairs(fake_data)
        print(gg, progress = F)
      }
      
      # add the treatment effect
      fake_data[, Y:=Y_o + X%*%beta]
      fake_data[, Treatment:=Treatment]
      
      # model 1 - just the treatment
      fit1 <- lm(Y ~ Treatment, data=fake_data)
      res <- coef(summary(fit1))["TreatmentTr", ]
      b[iter, 1] <- res["Estimate"]
      se[iter, 1] <- res["Std. Error"]
      pval[iter, 1] <-res["Pr(>|t|)"]
      ci_i <- confint(fit1)["TreatmentTr",]
      ci[iter, 1] <- ifelse(beta_1 >= ci_i[1] & beta_1 <= ci_i[2], 1, 0)
      res1 <- copy(res)
      
      # model 2 - adjust for imablance
      inc_xcols <- NULL
      for(i in 1:p){
        formula <- paste0(xcols[i], " ~ Treatment")
        fit2a <- lm(formula, data=fake_data)
        if(coef(summary(fit2a))["TreatmentTr", "Pr(>|t|)"] < 0.05){
          inc_xcols <- c(inc_xcols, xcols[i])
        }
      }
      if(length(inc_xcols) > 0){ # if any signifianct effects refit, otherwise use old fit
        formula <- paste0("Y ~ Treatment + ", paste(inc_xcols, collapse=" + "))
        fit2b <- lm(formula, data=fake_data)
        res <- coef(summary(fit2b))["TreatmentTr", ]
        ci_i <- confint(fit2b)["TreatmentTr",]
      }else{
        res <- res1
      }
      b[iter, 2] <- res["Estimate"]
      se[iter, 2] <- res["Std. Error"]
      pval[iter, 2] <-res["Pr(>|t|)"]
      ci[iter, 2] <- ifelse(beta_1 >= ci_i[1] & beta_1 <= ci_i[2], 1, 0)
      
      
      # model 3- adjust for covariates
      (ycor <- abs(cor(fake_data[, .SD, .SDcols=cor_ycols])[2:pp1, 1]))
      mean(ycor)
      
      j <- 2
      for(target_cor in c(0, .2, .4)){
        j <- j+1
        if(target_cor == 0.2){
          inc <- which(ycor < target_cor) # include only weak covariates
        }else{
          inc <- which(ycor > target_cor) # include all OR strong covariates
        }
        if(length(inc) > 0){  # if matches refit, otherwise use old fit
          inc_xcols <- xcols[inc]
          formula <- paste0("Y ~ Treatment + ", paste(inc_xcols, collapse=" + "))
          fit3 <- lm(formula, data=fake_data)
          res <- coef(summary(fit3))["TreatmentTr", ]
          ci_i <- confint(fit3)["TreatmentTr",]
        }else{
          res <- res1
        }
        b[iter, j] <- res["Estimate"]
        se[iter, j] <- res["Std. Error"]
        pval[iter, j] <-res["Pr(>|t|)"]
        ci[iter, j] <- ifelse(beta_1 >= ci_i[1] & beta_1 <= ci_i[2], 1, 0)
      }
    }  
    b_mat <- rbind(b_mat, data.table(n=n, beta_1=beta_1, b))
    se_mat <- rbind(se_mat, data.table(n=n, beta_1=beta_1, se))
    p_mat <- rbind(p_mat, data.table(n=n, beta_1=beta_1, pval))
    ci_mat <- rbind(ci_mat, data.table(n=n, beta_1=beta_1, ci))
  }
}

p_long <- melt(p_mat, measure.vars=measure_cols, variable.name="method", value.name="p")
ci_long <- melt(ci_mat, measure.vars=measure_cols, variable.name="method", value.name="covers")
b_long <- melt(b_mat, measure.vars=measure_cols, variable.name="method", value.name="b")
se_long <- melt(se_mat, measure.vars=measure_cols, variable.name="method", value.name="se")


#ci_long[, .(coverage=sum(covers)/niter), by=.(method, n, beta_1)]

Distribution of estimates

pd <- position_dodge(0.8)
gg <- ggplot(data=b_long, aes(x=factor(n), y=b, fill=method)) +
  geom_boxplot(position=pd) +
  xlab("sample size (per treatment level)") +
  NULL
gg

Distribution of SE of estimate

pd <- position_dodge(0.8)
gg <- ggplot(data=se_long, aes(x=factor(n), y=se, fill=method)) +
  geom_boxplot(position=pd) +
  xlab("sample size (per treatment level)") +
  NULL
gg

Type I error

# type I
p_sum <- p_long[, .(error=sum(p < 0.05)/niter), by=.(method, n, beta_1)]
pd <- position_dodge(0.8)
gg <- ggplot(data=p_sum[beta_1==0], aes(x=factor(n), y=error, color=method, group=method)) +
  geom_point(position=pd) +
  geom_line(position=pd) + 
  xlab("sample size (per treatment level)") +
  ylab("Type I error") +
  # facet_grid(.~beta_1) +
  NULL
gg

Power

# power
# need p_sum from above
gg <- ggplot(data=p_sum[beta_1!=0], aes(x=factor(n), y=error, color=method)) +
  geom_point(position=pd) +
  xlab("sample size (per treatment level)") +
  ylab("Power") +
  facet_grid(.~beta_1) +
  NULL
gg

Sign error

# sign error
p_long2 <- cbind(p_long, b=b_long[, b])
sign_error <- p_long2[beta_1 > 0, .(error=sum(p < 0.1 & b < 0)/niter), by=.(method, n, beta_1)]
gg <- ggplot(data=sign_error, aes(x=factor(n), y=error, color=method)) +
  geom_point(position=pd) +
  xlab("sample size (per treatment level)") +
  ylab("Sign error") +
  facet_grid(.~beta_1) +
  NULL
gg