A skeleton simulation of different strategies for NHST for count data if all we care about is a p-value, as in bench biology where p-values are used to simply give one confidence that something didn’t go terribly wrong (similar to doing experiments in triplicate – it’s not the effect size that matters only “we have experimental evidence of a replicatable effect”)

load libraries

library(ggplot2)
library(MASS)
library(data.table)
do_sim <- function(){
  set.seed(1)
  niter <- 1000
  methods <- c("t", "Welch", "log", "Wilcoxan", "nb")
  p_table <- matrix(NA, nrow=niter, ncol=length(methods))
  colnames(p_table) <- methods
  
  res_table <- data.table(NULL)
  beta_0_list <- c(4, 10, 100) # control count
  theta_list <- c(0.5, 1, 100) # dispersion
  effect_list <- c(1:3, 5) # relative effect size will be 0%, 50%, 100%, 200%
  n_list <- c(5, 10, 20, 40) # sample size
  for(theta in theta_list){
    for(beta_0 in beta_0_list){
      for(effect in effect_list){
        beta_1 <- (effect-1)*beta_0/2 # 0% 50% 100%
        beta <- c(beta_0, beta_1)
        for(n in n_list){
          treatment <- rep(c("Cn", "Trt"), each=n)
          X <- model.matrix(~treatment)
          mu <- (X%*%beta)[,1]
          fd <- data.table(treatment=treatment, y=NA)
          for(iter in 1:niter){
            fd[, y:=rnegbin(n=n*2, mu=mu, theta=theta)]
            fd[, log_yp1:=log10(y+1)]
            p.t <- t.test(y~treatment, data=fd, var.equal=TRUE)$p.value
            p.welch <- t.test(y~treatment, data=fd, var.equal=FALSE)$p.value
            p.log <- t.test(log_yp1~treatment, data=fd, var.equal=TRUE)$p.value
            p.wilcox <- wilcox.test(y~treatment, data=fd, exact=FALSE)$p.value
            fit <- glm.nb(y~treatment, data=fd)
            p.nb <- coef(summary(fit))["treatmentTrt", "Pr(>|z|)"]
            p_table[iter,] <- c(p.t, p.welch, p.log, p.wilcox, p.nb)
          }
          p_sum <- apply(p_table, 2, function(x) length(which(x <= 0.05))/niter)
          res_table <- rbind(res_table, data.table(beta_0=beta_0,
                                                   beta_1=beta_1,
                                                   n=n,
                                                   theta=theta,
                                                   t(p_sum)))
        }
      }
    }
  }
  return(res_table)
}

do_it <- FALSE # if FALSE the results are available as a file
if(do_it==TRUE){
  res_table <- do_sim()
  write.table(res_table, "../output/glm-t-wilcoxon.txt", row.names = FALSE, quote=FALSE)
}
res_table <- fread("../output/glm-t-wilcoxon.txt")
#res_table
res <- melt(res_table, 
            id.vars=c("beta_0", "beta_1", "n", "theta"),
            measure.vars=c("t", "Welch", "log", "Wilcoxan", "nb"),
            variable.name="model",
            value.name="frequency")
# res[, beta_0:=factor(beta_0)]
# res[, beta_1:=factor(beta_1)]
# res[, theta:=factor(theta)]
# res[, n:=factor(n)]
gg <- ggplot(data=res[beta_1==0], aes(x=n, y=frequency, group=model, color=model)) +
  geom_line() +
  facet_grid(beta_0 ~ theta, labeller=label_both) +
  NULL
gg

b0_levels <- unique(res$beta_0)
# small count
gg1 <- ggplot(data=res[beta_0==b0_levels[1] & beta_1 > 0], aes(x=n, y=frequency, group=model, color=model)) +
  geom_line() +
  facet_grid(beta_1 ~ theta, labeller=label_both) +
  NULL

# large count
gg2 <- ggplot(data=res[beta_0==b0_levels[3] & beta_1 > 0], aes(x=n, y=frequency, group=model, color=model)) +
  geom_line() +
  facet_grid(beta_1 ~ theta, labeller=label_both) +
  NULL

gg1

gg2