This document uses Fanny’s original simulation code in islam_sims_fc2.Rmd and has been adapted by Koen to adopt the new simulation framework.
## /home/sturm/projects/2019/zinbwaveZinger/work/conda/zinbwavezinger-a9754a3992265e758d80763df15636e8-c03ddbbe523231fddb66dd969db6608d/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
## _np_qint8 = np.dtype([("qint8", np.int8, 1)])
## /home/sturm/projects/2019/zinbwaveZinger/work/conda/zinbwavezinger-a9754a3992265e758d80763df15636e8-c03ddbbe523231fddb66dd969db6608d/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
## _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
## /home/sturm/projects/2019/zinbwaveZinger/work/conda/zinbwavezinger-a9754a3992265e758d80763df15636e8-c03ddbbe523231fddb66dd969db6608d/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
## _np_qint16 = np.dtype([("qint16", np.int16, 1)])
## /home/sturm/projects/2019/zinbwaveZinger/work/conda/zinbwavezinger-a9754a3992265e758d80763df15636e8-c03ddbbe523231fddb66dd969db6608d/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
## _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
## /home/sturm/projects/2019/zinbwaveZinger/work/conda/zinbwavezinger-a9754a3992265e758d80763df15636e8-c03ddbbe523231fddb66dd969db6608d/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
## _np_qint32 = np.dtype([("qint32", np.int32, 1)])
## /home/sturm/projects/2019/zinbwaveZinger/work/conda/zinbwavezinger-a9754a3992265e758d80763df15636e8-c03ddbbe523231fddb66dd969db6608d/lib/python3.7/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
## np_resource = np.dtype([("resource", np.ubyte, 1)])
## /home/sturm/projects/2019/zinbwaveZinger/work/conda/zinbwavezinger-a9754a3992265e758d80763df15636e8-c03ddbbe523231fddb66dd969db6608d/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
## _np_qint8 = np.dtype([("qint8", np.int8, 1)])
## /home/sturm/projects/2019/zinbwaveZinger/work/conda/zinbwavezinger-a9754a3992265e758d80763df15636e8-c03ddbbe523231fddb66dd969db6608d/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
## _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
## /home/sturm/projects/2019/zinbwaveZinger/work/conda/zinbwavezinger-a9754a3992265e758d80763df15636e8-c03ddbbe523231fddb66dd969db6608d/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
## _np_qint16 = np.dtype([("qint16", np.int16, 1)])
## /home/sturm/projects/2019/zinbwaveZinger/work/conda/zinbwavezinger-a9754a3992265e758d80763df15636e8-c03ddbbe523231fddb66dd969db6608d/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
## _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
## /home/sturm/projects/2019/zinbwaveZinger/work/conda/zinbwavezinger-a9754a3992265e758d80763df15636e8-c03ddbbe523231fddb66dd969db6608d/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
## _np_qint32 = np.dtype([("qint32", np.int32, 1)])
## /home/sturm/projects/2019/zinbwaveZinger/work/conda/zinbwavezinger-a9754a3992265e758d80763df15636e8-c03ddbbe523231fddb66dd969db6608d/lib/python3.7/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
## np_resource = np.dtype([("resource", np.ubyte, 1)])
import pandas as pd
import numpy as np
# import scqtl
from diffxpy.api.stats import wald_test, wald_test_chisq
import diffxpy.api as de
import statsmodels.api as smThe goal of this document is to reproduce Figure 1 from our paper. A scRNA-seq dataset from Islam dataset is simulated using zingeR simulation framework. We evaluate the performance of different DE methods.
The scRNA-seq simulation is based on the Islam mouse dataset, which compares 48 embryonic stem cells to 44 embryonic fibroblasts in mouse. Reference is Saiful Islam, Una Kjallquist, Annalena Moliner, Pawel Zajac, Jian-Bing Fan, Peter Lonnerberg, and Sten Linnarsson. Characterization of the single-cell transcriptional landscape by highly multiplex RNA-seq. Genome research.
if(params$dataset == 'islam') {
#code from https://github.com/statOmics/zingeR/blob/master/vignettes/zingeRVignette_v2.Rmd
data(islamEset, package = "zingeR")
islamHlp=exprs(islamEset)[9:nrow(exprs(islamEset)),] #first 8 are spike-ins.
cellType=pData(islamEset)[,"cellType"]
paramsIslam = getDatasetMoMPositive(counts = islamHlp)
save(paramsIslam, file="./paramsIslam.rda")
# rename objects
dataset_hlp = islamHlp
dataset_params = paramsIslam
dataset_var = cellType
rm(islamHlp, paramsIslam, cellType)
} else if(params$dataset == 'trapnell') {
# from 'preprocessTrapnellData.R'
trapnellAssay72 <- readRDS("../../../datasets/GSE52529-GPL11154.rds")
trapnellAssay72 = updateObject(trapnellAssay72)
trapnellAssay <- readRDS("../../../datasets/GSE52529-GPL16791.rds")
trapnellAssay = updateObject(trapnellAssay)
trapnellAssay48 <- trapnellAssay[,colData(trapnellAssay)[,"characteristics_ch1.1"] == "hour post serum-switch: 48"]
countsTrapnell72 <- round(assay(experiments(trapnellAssay72)$gene,"count"))
id48=colData(trapnellAssay)[,"characteristics_ch1.1"] == "hour post serum-switch: 48"
countsTrapnell48 <- round(assay(experiments(trapnellAssay)$gene[,id48],"count"))
#wells containing debris
debris72 = colData(trapnellAssay72)[,"characteristics_ch1.2"]=="debris: TRUE"
debris48 = colData(trapnellAssay48)[,"characteristics_ch1.2"]=="debris: TRUE"
#wells that did not contain one cell
one72 = colData(trapnellAssay72)[,"characteristics_ch1.4"]!="cells in well: 1"
one48 = colData(trapnellAssay48)[,"characteristics_ch1.4"]!="cells in well: 1"
# remove
countsTrapnell72 = countsTrapnell72[,(!debris72 & !one72)]
countsTrapnell48 = countsTrapnell48[,(!debris48 & !one48)]
countsTrapnell <- cbind(countsTrapnell48,countsTrapnell72)
countsTrapnell <- countsTrapnell[rowSums(countsTrapnell>0)>9,] #expression in at least 10 out of 149 samples. Remains 24,576 genes and 149 samples.
rm(trapnellAssay)
timePoint=factor(c(rep(48,85),rep(72,64)))
paramsTrapnell = getDatasetMoMPositive(counts = countsTrapnell)
save(paramsTrapnell, file="./paramsTrapnell.rda")
# rename objects
dataset_hlp = countsTrapnell
dataset_params = paramsTrapnell
dataset_var = timePoint
rm(countsTrapnell, paramsTrapnell, timePoint)
}
# load("./paramsIslam.rda")nSamples=as.numeric(params$n_samples)
grp=as.factor(rep(0:1, each = nSamples/2)) #two-group comparison
nTags=10000 #nr of features
set.seed(11)
DEind = sample(1:nTags,floor(nTags*.1),replace=FALSE) #10% DE
fcSim=(2 + rexp(length(DEind), rate = 1/2)) #fold changes
libSizes=sample(colSums(dataset_hlp),nSamples,replace=TRUE) #library sizes
sim_data <- NBsimSingleCell(foldDiff = fcSim, ind = DEind,
dataset = dataset_hlp, nTags = nTags,
group = grp,
verbose = TRUE, params = dataset_params,
lib.size = libSizes, normalizeLambda=TRUE)
sim_data$counts[1:5,1:5]## sample1 sample2 sample3 sample4 sample5
## ids1 0 0 13 46 31
## ids2 31 30 7 19 0
## ids3 0 22 25 0 0
## ids4 0 0 0 0 0
## ids5 43 0 193 214 172
# BCV plots
dOrig=suppressWarnings(edgeR::calcNormFactors(DGEList(dataset_hlp)))
dOrig=estimateGLMTagwiseDisp(estimateGLMCommonDisp(dOrig, design=model.matrix(~dataset_var), interval=c(0,10)),prior.df=0)
d=suppressWarnings(edgeR::calcNormFactors(DGEList(sim_data$counts)))
d=estimateGLMTagwiseDisp(estimateGLMCommonDisp(d, design=model.matrix(~grp), interval=c(0,10)),prior.df=0)
par(mfrow=c(1,2))
plotBCV(dOrig,ylim=c(0,13), xlim=c(2,16), main="real dataset")
plotBCV(d,ylim=c(0,13), xlim=c(2,16), main="simulated dataset")par(mfrow=c(1,1))
# association of library size with zeros
plot(x=colSums(dataset_hlp), y=colMeans(dataset_hlp==0), xlab="Log library size", ylab="Fraction of zeros", ylim=c(0.2,1))
points(x=colSums(sim_data$counts), y=colMeans(sim_data$counts==0), col=2)
legend("bottomleft", c("real", "simulated"), col=1:2, pch=1)# association of aveLogCPM with zeros
plot(x=edgeR::aveLogCPM(dataset_hlp), y=rowMeans(dataset_hlp==0), xlab="Average log CPM", ylab="Fraction of zeros", ylim=c(0,1), col=alpha(1,1/2), pch=19, cex=.3)
points(x=edgeR::aveLogCPM(sim_data$counts), y=rowMeans(sim_data$counts==0),col=alpha(2,1/2),pch=19,cex=.3)
legend("bottomleft", c("real", "simulated"), col=1:2, pch=16)edgeR <- function(counts, group, ylim = NULL, xlim = NULL){
d <- DGEList(counts)
d <- suppressWarnings(edgeR::calcNormFactors(d))
design <- model.matrix(~group)
d <- estimateDisp(d, design)
plotBCV(d, ylim = ylim, main = 'edgeR', xlim = xlim)
fit <- glmFit(d,design)
lrt <- glmLRT(fit, coef = 2)
pval <- lrt$table$PValue
padj <- p.adjust(pval, "BH")
cbind(pval = pval, padj = padj)
}edger_ngenes <- function(counts, group, ylim = NULL, xlim = NULL){
d <- DGEList(counts)
d <- suppressWarnings(edgeR::calcNormFactors(d))
cdr <- colSums(counts > 0)
design <- model.matrix(~ group + cdr)
d <- estimateDisp(d, design)
plotBCV(d, ylim = ylim, main = 'edgeR', xlim = xlim)
fit <- glmFit(d,design)
lrt <- glmLRT(fit, coef = 2)
pval <- lrt$table$PValue
padj <- p.adjust(pval, "BH")
cbind(pval = pval, padj = padj)
}DESeq2 <- function(counts, group, ylim = NULL, xlim = NULL){
colData <- data.frame(group = group)
dse <- DESeqDataSetFromMatrix(countData=counts, colData=colData, design=~group)
colData(dse)$group <- as.factor(colData(dse)$group)
dse <- DESeq2::estimateSizeFactors(dse, type="poscounts")
dse <- estimateDispersions(dse, minmu=1e-3)
dse <- nbinomWaldTest(dse, minmu=1e-3, betaPrior=TRUE)
rr <- results(dse)
cbind(pval = rr$pvalue, padj = rr$padj)
}limma <- function(counts, group, ylim = NULL, xlim = NULL){
design <- model.matrix(~ group)
nf <- suppressWarnings(edgeR::calcNormFactors(counts))
y <- voom(counts, design, plot = FALSE, lib.size = colSums(counts) * nf)
fit <- lmFit(y, design)
fit <- eBayes(fit)
tt <- topTable(fit, coef = 2, n = nrow(counts), sort.by = "none")
pval <- tt$P.Value
padj <- tt$adj.P.Val
cbind(pval = pval, padj = padj)
}We encounter errors with the latest version of scde, as documented here: https://groups.google.com/forum/#!topic/singlecellstats/rbFUTOQ9wu4. We followed the guidelines suggested by the authors and work with version 1.99.2.
scde <- function(counts, group, ylim=NULL, xlim=NULL){
counts = matrix(as.integer(counts),nrow=nrow(counts),ncol=ncol(counts))
if(is.null(colnames(counts))) colnames(counts)=paste0("sample",1:ncol(counts))
require(scde)
# calculate error models
o.ifm <- scde.error.models(counts = counts, groups = group, n.cores = 1, threshold.segmentation = TRUE, save.crossfit.plots = FALSE, save.model.plots = FALSE, verbose = 0)
# estimate gene expression prior
o.prior <- scde.expression.prior(models = o.ifm, counts = counts, length.out = 400, show.plot = FALSE)
# calculate differential expression
ediff <- scde.expression.difference(o.ifm, counts, o.prior, groups = group, n.randomizations = 150, n.cores = 1, verbose = 0)
lfc <- ediff$mle
pval=(1-pnorm(abs(ediff$Z)))*2
padj=(1-pnorm(abs(ediff$cZ)))*2
out = cbind(pval,padj,lfc)
return(out)
}### copied code from FPR_mocks.Rmd on September 14, 2017.
MAST <- function(counts, group, ylim = NULL, xlim = NULL){
require(MAST)
tpm <- counts*1e6/colSums(counts)
tpm <- log2(tpm+1)
sca <- FromMatrix(tpm, cData=data.frame(group=group))
#sca <- FromMatrix(counts, cData=data.frame(group=group))
# Adaptive thresholding from MAST vignette
freq_expressed <- 0.2
thres <- thresholdSCRNACountMatrix(assay(sca), nbins = 10, min_per_bin = 50, conditions = group)
#par(mfrow=c(5,4))
#plot(thres)
assays(sca) <- list(thresh=thres$counts_threshold, tpm=assay(sca))
expressed_genes <- freq(sca) > freq_expressed
sca <- sca[expressed_genes,]
ngeneson <- apply(counts,2,function(x) mean(x>0))
CD <- colData(sca)
CD$ngeneson <- ngeneson
CD$cngeneson <- CD$ngeneson-mean(ngeneson)
colData(sca) <- CD
## differential expression
fit <- zlm(~ cngeneson + group , sca = sca)
lrFit <- lrTest(fit, 'group')
pval <- lrFit[, 'hurdle', 'Pr(>Chisq)']
padj <- p.adjust(pval, method = "BH")
### MAST filtered the genes, so make a list that is consistent with the original count matrix.
pvalAll = vector(length=nrow(counts))
pvalAll[] = 1
names(pvalAll)=rownames(counts)
pvalAll[match(names(pval),names(pvalAll))] = pval
padjAll = vector(length=nrow(counts))
padjAll[] = 1
names(padjAll)=rownames(counts)
padjAll[match(names(padj),names(padjAll))] = padj
out = cbind(pval = pvalAll, padj = padjAll, logfc = NA)
return(out)
}NODES <- function(counts, group, xlim, ylim){
require(NODES)
g=ifelse(group==0,"A","B")
colnames(counts)=g
normCounts=pQ(counts)
res=NODES::NODES(data=normCounts,group=colnames(normCounts))
pval=vector(length=nrow(counts))
names(pval)=rownames(counts)
pval[rownames(normCounts)]=res$Fisher
pval[is.na(pval)]=1
padj=p.adjust(pval,"BH")
lfc=NA
out=cbind(pval,padj,lfc)
return(out)
}metagenomeSeq <- function(counts, group, xlim, ylim){
require(metagenomeSeq)
design <- model.matrix(~group)
pheno <- AnnotatedDataFrame(data.frame(group=group))
rownames(pheno) <- colnames(counts)
p <- cumNormStatFast(counts)
dat <- newMRexperiment(counts=counts, phenoData=pheno, featureData = NULL, libSize = colSums(counts), normFactors = metagenomeSeq::calcNormFactors(counts, p=p))
fit <- fitZig(dat,design)
lfc <- fit$eb$coefficients[,"group1"]
pval <- fit$eb$p.value[,"group1"]
padj <- p.adjust(pval)
out <- cbind(pval,padj,lfc)
return(out)
}Seurat <- function(counts, group, xlim=NULL, ylim=NULL){
require(Seurat)
seur = CreateSeuratObject(counts, project=paste0("cell",as.character(group)), display.progress = FALSE)
res <- FindMarkers(seur, ident.1 = "cell0", ident.2 = "cell1", print.bar=FALSE)
pval = res$p_val[match(rownames(counts),rownames(res))]
padj = p.adjust(pval,"BH")
out=cbind(pval,padj)
return(out)
}zingeR_edgeR <- function(counts, group, ylim = NULL, xlim = NULL){
#require(zingeR)
d <- DGEList(counts)
d <- suppressWarnings(edgeR::calcNormFactors(d))
design <- model.matrix(~ group)
weights <- zingeR::zeroWeightsLS(counts = d$counts, design = design, maxit = 300,
normalization = "TMM", verbose = F)
d$weights <- weights
d <- estimateDisp(d, design)
plotBCV(d, ylim = ylim, main = 'zingeR', xlim = xlim)
fit <- glmFit(d,design)
lrt <- zinbwave::glmWeightedF(fit, coef = 2, independentFiltering = TRUE)
cbind(pval = lrt$table$PValue, padj =lrt$table$padjFilter)
}zingeR_DESeq2 <- function(counts, group, ylim = NULL, xlim = NULL){
#require(zingeR)
colData <- data.frame(group = group)
design <- model.matrix(~ group)
dse <- DESeqDataSetFromMatrix(countData = counts, colData = colData,
design = ~group)
weights <- zingeR::zeroWeightsLS(counts = counts, design = design, maxit = 300,
normalization = "DESeq2_poscounts", colData = colData,
designFormula = ~group, verbose = F)
assays(dse)[["weights"]] <- weights
dse <- DESeq2::estimateSizeFactors(dse, type="poscounts")
dse <- estimateDispersions(dse)
dse <- nbinomWaldTest(dse, betaPrior = TRUE, useT = TRUE,
df = rowSums(weights) - 2)
rr <- results(dse)
cbind(pval = rr$pvalue, padj = rr$padj)
}We compute the same weights as zingeR (i.e. posterior probabilities that a count belongs to the count component given that the count and library size is observed), but using the ZINB-WaVE estimation procedure. See more details here (http://www.biorxiv.org/content/early/2017/04/06/125112).
zinbwave_edgeR <- function(counts, group, zinb, ylim = NULL, xlim = NULL, main = 'ZINB-WaVE'){
d=DGEList(counts)
d=suppressWarnings(edgeR::calcNormFactors(d))
design=model.matrix(~group)
weights <- computeObservationalWeights(zinb, d$counts)
d$weights <- weights
d=estimateDisp(d, design)
plotBCV(d, ylim = ylim, main = main)
fit=glmFit(d,design)
lrt=glmWeightedF(fit,coef=2, independentFiltering = TRUE)
cbind(pval = lrt$table$PValue, padj =lrt$table$padjFilter)
}zinbwave_DESeq2 <- function(counts, group, zinb){
colData=data.frame(group=group)
design=model.matrix(~group)
dse=DESeqDataSetFromMatrix(countData=counts, colData=colData, design=~group)
weights <- computeObservationalWeights(zinb, counts(dse))
assays(dse)[["weights"]]=weights
dse = DESeq2::estimateSizeFactors(dse, type="poscounts")
dse = estimateDispersions(dse, minmu=1e-3)
dse <- nbinomWaldTest(dse, minmu=1e-3, betaPrior=TRUE, useT=TRUE, df=rowSums(weights)-2)
res = results(dse)
cbind(pval = res$pvalue, padj = res$padj)
}zinbwave_limma <- function(counts, group, zinb){
design <- model.matrix(~group)
nf <- edgeR::calcNormFactors(counts)
zeroWeights <- computeObservationalWeights(zinb, counts)
y <- voom(counts, design, plot=FALSE, lib.size = colSums(counts)*nf,
weights = zeroWeights)
y$weights <- y$weights * zeroWeights
fit <- lmFit(y, design, weights=y$weights)
fit$df.residual <- rowSums(zeroWeights) - ncol(design)
fit <- eBayes(fit)
tt <- topTable(fit,coef=2,n=nrow(counts), sort.by = "none")
pval <- tt$P.Value
baseMean = unname(rowMeans(sweep(counts,2,nf,FUN="*")))
hlp <- pvalueAdjustment_kvdb(baseMean=baseMean, pValue=pval)
padj <- hlp$padj
cbind(pval = pval, padj = padj)
}def wald_test(counts, group):
test = de.test.wald(
data=counts.values,
formula_loc="~ 1 + group",
factor_loc_totest="group",
sample_description=pd.DataFrame().assign(group=group),
batch_size=100,
training_strategy="DEFAULT",
dtype="float64",
gene_names=counts.index
)
return pd.DataFrame().assign(
pval=test.pval,
padj=test.qval
)
def lrt_test(counts, group):
test = de.test.lrt(
data=counts.values,
# TODO check what's correct. This is even inconsistent within diffxpy.
full_formula_scale="~ 1",
full_formula_loc="~ 1 + group",
reduced_formula_scale="~ 1",
reduced_formula_loc="~ 1",
sample_description=pd.DataFrame().assign(group=group),
batch_size=100,
training_strategy="DEFAULT",
dtype="float64",
gene_names=counts.index
)
def wald_test_counts(counts, group):
n_genes = np.sum(counts > 0, axis=1)
test = de.test.wald(
data=counts.values,
formula_loc="~ group + n_genes",
factor_loc_totest=["group"],
coef_to_test="group",
sample_description=pd.DataFrame().assign(group=pd.Categorical(group), n_genes=np.float64(n_genes.values)),
batch_size=100,
training_strategy="DEFAULT",
dtype="float64",
gene_names=counts.index
)
return pd.DataFrame().assign(
pval=test.pval,
padj=test.qval
)
# def test_scqtl(counts, group):
# umi = counts.values.T
# num_samples = umi.shape[0]
# size_factors = np.ones(len(group))
# test = de.test.wald(
# data=umi.astype(np.float64),
# formula_loc="~ 1 + group",
# factor_loc_totest="group",
# sample_description=pd.DataFrame().assign(group=group),
# batch_size=100,
# training_strategy="DEFAULT",
# dtype="float64",
# size_factors=size_factors,
# gene_names=counts.index
# )
# design = test.model_estim.input_data.design_loc
# size_factor = size_factors.reshape(1, num_samples).T
# onehot = np.ones((num_samples, 1))
# init = scqtl.tf.fit(umi=umi.astype(np.float32),
# onehot=onehot.astype(np.float32),
# design=design.astype(np.float32),
# size_factor=size_factor.astype(np.float32),
# learning_rate=1e-3,
# max_epochs=8000,
# verbose=True
# )
# res = scqtl.tf.fit(umi=umi.astype(np.float32),
# onehot=onehot.astype(np.float32),
# design=design.astype(np.float32),
# size_factor=size_factor.astype(np.float32),
# learning_rate=1e-3,
# max_epochs=8000,
# warm_start=init[:3],
# verbose=True
# )
# wald_zinb = wald_test(res[0][0, :], np.maximum(res[1][0, :], 1e-299))
# return wald_zinbimport warnings
import sys
from multiprocessing import Pool
warnings.filterwarnings('ignore')
def _fit_model(data, model_class):
model = model_class.from_formula(
"gene ~ 1 + group",
data,
missing='raise'
)
try:
res = model.fit(
method='nm',
disp=False,
maxiter=5000
)
test = res.wald_test("group[T.1]=0")
return test.pvalue
except:
return np.nan
def fit_models(count_matrix, group, model_class):
p = Pool(16)
assert len(group) == count_matrix.shape[1]
_pvals = []
pvals = []
for i in range(count_matrix.shape[0]):
if not i % 200:
sys.stdout.write("model {}\r".format(i))
sys.stdout.flush()
data = pd.DataFrame().assign(gene=count_matrix.values[i, :]).assign(group=group)
_pvals.append(p.apply_async(_fit_model, (data, model_class)))
for i, r in enumerate(_pvals):
if not i % 200:
sys.stdout.write("gene: {}\r".format(i))
sys.stdout.flush()
pvals.append(r.get())
p.close()
return np.array(pvals)diffxpy_wald = function(counts, group, ylim = NULL, xlim=NULL) {
res = py$wald_test(as.data.frame(t(counts)), group)
}
diffxpy_lrt = function(counts, group, ylim = NULL, xlim=NULL) {
res = py$lrt_test(as.data.frame(t(counts)), group)
}
diffxpy_wald_counts = function(counts, group, ylim = NULL, xlim=NULL) {
res = py$wald_test_counts(as.data.frame(t(counts)), group)
}
# scqtl_wald = function(counts, group, ylim=NULL, xlim=NULL) {
# pval = py$test_scqtl(as.data.frame(counts), group)
# data.frame(pval=pval, padj=p.adjust(pval, method="BH"))
# }
statsmodels_nb = function(counts, group, ylim=NULL, xlim=NULL) {
pval = py$fit_models(as.data.frame(counts), group, py$sm$NegativeBinomial)
data.frame(pval=pval, padj=p.adjust(pval, method="BH"))
}
statsmodels_zip = function(counts, group, ylim=NULL, xlim=NULL) {
pval = py$fit_models(as.data.frame(counts), group, py$sm$ZeroInflatedPoisson)
data.frame(pval=pval, padj=p.adjust(pval, method="BH"))
}
statsmodels_zinb = function(counts, group, ylim=NULL, xlim=NULL) {
pval = py$fit_models(as.data.frame(counts), group, py$sm$ZeroInflatedNegativeBinomialP)
data.frame(pval=pval, padj=p.adjust(pval, method="BH"))
}counts = sim_data$counts
myfct = list(DESeq2 = DESeq2,
edgeR = edgeR,
limmavoom = limma,
MAST = MAST
# NODES = NODES,
# scde = scde,
# metagenomeSeq = metagenomeSeq
)
# if we additionally load Seurat, too many packages are loaded and the DLL limit is reached. We ran Seurat in a separate session and will add it in this session.
par(mfrow = c(2,2))
ylim = c(0, 11)
xlim = c(0, 16)
res = lapply(names(myfct), function(fname){
fct = myfct[[fname]]
message(paste0("running ", fname))
fct(counts = counts, group = grp, ylim = ylim, xlim = xlim)
})## (-1e-10,0.353] (0.353,1.48] (1.48,2.35] (2.35,3.53] (3.53,5.13]
## 1.250673 1.250673 1.250673 1.250673 1.250673
## (5.13,7.3] (7.3,10.2] (10.2,14.2] (14.2,19.6]
## 3.277946 3.365603 7.057327 9.403185
# load("seuratResIslam.rda")
# res[[8]] = seuratRes
# names(res)[8] = "Seurat"
#res[['ZINB-WaVE_DESeq2_common']] = zinbwave_DESeq2(counts, grp, zinb_c)
res[['ZINB-WaVE_edgeR_common']] = zinbwave_edgeR(counts, grp, zinb_c, ylim=ylim, main = 'ZINB-WaVE, common dispersion', xlim = xlim)
#res[['ZINB-WaVE_limmavoom_common']] = zinbwave_limma(counts, grp, zinb_c)
#res[['ZINB-WaVE_DESeq2_genewise']] = zinbwave_DESeq2(counts, grp, zinb_g)
#res[['ZINB-WaVE_edgeR_genewise']] = zinbwave_edgeR(counts, grp, zinb_g, ylim=ylim, main = 'ZINB-WaVE, genewise dispersion', xlim = xlim)
#res[['ZINB-WaVE_limmavoom_genewise']] = zinbwave_limma(counts, grp, zinb_g)
#par(mfrow = c(1,1))
res[["zingeR_edgeR"]] = zingeR_edgeR(counts=sim_data$counts, group=grp)## converged.
#res[["zingeR_DESeq2"]] = zingeR_DESeq2(counts=sim_data$counts, group=grp)
save(res, file="./intermediate_res.rda")
# load("./intermediate_res.rda")res[["diffxpy_wald"]] = diffxpy_wald(counts=sim_data$counts, group=grp)
# Does not work, assertion error.
# res[["diffxpy_lrt"]] = diffxpy_lrt(counts=sim_data$counts, group=grp)# Does not work
# Error in py_call_impl(callable, dots$args, dots$keywords) :
# ValueError: constrained design matrix is not full rank: 79 80
# res[["diffxpy_wald_ngenes"]] = diffxpy_wald_counts(counts=sim_data$counts, group=grp)
invisible() res[["statsmodels_nb"]] = statsmodels_nb(counts=sim_data$counts, group=grp)
res[["statsmodels_zip"]] = statsmodels_zip(counts=sim_data$counts, group=grp)
res[["statsmodels_zinb"]] = statsmodels_zinb(counts=sim_data$counts, group=grp)# res[["scqtl"]] = scqtl_wald(counts=sim_data$counts, group=grp)
# res[["scqtl"]] = dplyr::rename(res[['scqtl']], padj=qval)res = lapply(res, as.data.frame)
names(res) = c(names(myfct), names(res)[5:length(res)])
for(k in 1:length(res)) res[[k]]$padj[is.na(res[[k]]$padj)] = 1
# load("../../results/resIslamFc2.rda")
save(res, file="resIslamFc2.tmp.rda")computeZinbwaveWeights <- function(zinb, counts){
mu <- getMu(zinb)
pi <- getPi(zinb)
theta <- getTheta(zinb)
theta_mat <- matrix(rep(theta, each = ncol(counts)), ncol = nrow(counts))
nb_part <- dnbinom(t(counts), size = theta_mat, mu = mu)
zinb_part <- pi * ( t(counts) == 0 ) + (1 - pi) * nb_part
zinbwg <- ( (1 - pi) * nb_part ) / zinb_part
t(zinbwg)
}
zinbwave_c_weights <- computeZinbwaveWeights(zinb_c, counts)
zinbwave_g_weights <- computeZinbwaveWeights(zinb_g, counts)par(mfrow=c(1,2))
#hist(zingeR_edgeR_weights, main='zingeR_edgeR', xlab = 'Weights')
#hist(zingeR_DESeq2_weights, main='zingeR_DESeq2', xlab = 'Weights')
hist(zinbwave_c_weights, main ='ZINB-WaVE, common dispersion', xlab = 'Weights')
hist(zinbwave_g_weights, main ='ZINB-WaVE, genewise dispersion', xlab = 'Weights')qqplot(zinbwave_c_weights, zinbwave_g_weights, type = 'o',
main = '',
xlab = 'ZINB-WaVE weights, common dispersion',
ylab = 'ZINB-WaVE weights, genewise dispersion')
abline(a=0,b=1)listRates = lapply(res, function(y){
y = as.data.frame(y)
nDE = sum(y[['padj']] <= 0.05, na.rm = TRUE)
TPR = mean(sim_data$indDE %in% which( y$padj <= 0.05))
FDR = mean(which(y$padj <= 0.05) %in% sim_data$indNonDE)
c(nDE = nDE, TPR = TPR, FDR = FDR)
})
df = do.call(rbind, listRates)
df = as.data.frame(df)
df$Method = names(res)
df$nDE = as.integer(df$nDE)
df$TPR = round(df$TPR*100, 1)
df$FDR = round(df$FDR*100, 1)
df = df[,c('Method', 'nDE', 'TPR', 'FDR')]
colnames(df) = c('Method', 'nDE', 'TPR(%)', 'FDR(%)')
rownames(df) = NULL
kable(df)| Method | nDE | TPR(%) | FDR(%) |
|---|---|---|---|
| DESeq2 | 122 | 12.2 | 0.0 |
| edgeR | 78 | 7.1 | 9.0 |
| limmavoom | 21 | 2.0 | 4.8 |
| MAST | 249 | 23.8 | 4.4 |
| ZINB-WaVE_edgeR_common | 578 | 53.6 | 7.3 |
| zingeR_edgeR | 376 | 36.0 | 4.3 |
| edger_ngenes | 99 | 9.3 | 6.1 |
| diffxpy_wald | 84 | 7.7 | 8.3 |
| statsmodels_nb | 85 | 7.7 | 9.4 |
| statsmodels_zip | 7676 | 94.1 | 87.7 |
| statsmodels_zinb | 753 | 55.4 | 26.4 |
#all methods
pp = COBRAData(pval = as.data.frame(do.call(cbind, lapply(res, '[[', 1))),
padj = as.data.frame(do.call(cbind, lapply(res, '[[', 2))),
truth = data.frame(status = trueDE))
cobraperf <- calculate_performance(pp, binary_truth = "status", thrs = 0.05)
colors=c(limmavoom="blue", "edger_ngenes2"="darkolivegreen4", "edger_ngenes"="deeppink2", edgeR="red", "ZINB-WaVE_edgeR_common"="salmon", DESeq2="brown", "ZINB-WaVE_DESeq2_genewise"="darkkhaki", MAST="darkturquoise", metagenomeSeq="forestgreen", scde="grey", NODES="black", Seurat="dodgerblue", "zingeR_edgeR"="hotpink1", zingeR_DESeq2="darkolivegreen4",
diffxpy_wald="dodgerblue", statsmodels_nb='darkseagreen', statsmodels_zip='steelblue', statsmodels_zinb='darkslategray3')
#iCOBRA converts '-' to '.'. Redo this.
cobraNames = sort(names(cobraperf@overlap)[1:(ncol(cobraperf@overlap)-1)])
cobraNames = gsub(x=cobraNames, pattern=".", fixed=TRUE, replacement="-")
colsCobra=colors[match(cobraNames,names(colors))]
cobraplot <- prepare_data_for_plot(cobraperf, colorscheme=colsCobra)## Warning in define_colors(cobraperf = cobraperf, palette = colorscheme,
## facetted = facetted, : too few colors provided, 2 random colors will be
## added
## Warning: Removed 11 rows containing missing values (geom_path).
# #only common disp ZINB-WaVE
# pvalDf = as.data.frame(do.call(cbind, lapply(res, '[[', 1)))
# padjDf = as.data.frame(do.call(cbind, lapply(res, '[[', 2)))
# pvalDfCommon = pvalDf[,-grep(x=colnames(pvalDf), pattern="genewise")]
# padjDfCommon = padjDf[,-grep(x=colnames(padjDf), pattern="genewise")]
# pp = COBRAData(pval = pvalDfCommon,
# padj = padjDfCommon,
# truth = data.frame(status = trueDE))
# cobraperf <- calculate_performance(pp, binary_truth = "status", thrs = 0.05)
# colors=c(limmavoom="blue", "ZINB-WaVE_limmavoom_common"="steelblue", "ZINB-WaVE_limmavoom_genewise"="darkslategray3", edgeR="red", "ZINB-WaVE_edgeR_common"="salmon", "ZINB-WaVE_edgeR_genewise"="deeppink2", DESeq2="brown", "ZINB-WaVE_DESeq2_common"="darkseagreen", "ZINB-WaVE_DESeq2_genewise"="darkkhaki", MAST="darkturquoise", metagenomeSeq="forestgreen", scde="grey", NODES="black", Seurat="dodgerblue", "zingeR_edgeR"="hotpink1", zingeR_DESeq2="darkolivegreen4")
# #iCOBRA converts '-' to '.'. Redo this.
# cobraNames = sort(names(cobraperf@overlap)[1:(ncol(cobraperf@overlap)-1)])
# cobraNames = gsub(x=cobraNames, pattern=".", fixed=TRUE, replacement="-")
# colsCobra=colors[match(cobraNames,names(colors))]
# cobraplot <- prepare_data_for_plot(cobraperf, colorscheme=colsCobra)
# save(cobraplot,file="cobraplotIslam.rda")
# plot_fdrtprcurve(cobraplot, pointsize=1)
#
# #only common disp ZINB-WaVE, no ZINB-WaVE_limma-voom
# pvalDfCommon2 = pvalDfCommon[,-grep(x=colnames(pvalDfCommon), pattern="ZINB-WaVE_limmavoom")]
# padjDfCommon2 = padjDfCommon[,-grep(x=colnames(padjDfCommon), pattern="ZINB-WaVE_limmavoom")]
# pp = COBRAData(pval = pvalDfCommon2,
# padj = padjDfCommon2,
# truth = data.frame(status = trueDE))
# cobraperf <- calculate_performance(pp, binary_truth = "status", thrs = 0.05)
# colors=c(limmavoom="blue", "ZINB-WaVE_limmavoom_common"="steelblue", "ZINB-WaVE_limmavoom_genewise"="darkslategray3", edgeR="red", "ZINB-WaVE_edgeR_common"="salmon", "ZINB-WaVE_edgeR_genewise"="deeppink2", DESeq2="brown", "ZINB-WaVE_DESeq2_common"="darkseagreen", "ZINB-WaVE_DESeq2_genewise"="darkkhaki", MAST="darkturquoise", metagenomeSeq="forestgreen", scde="grey", NODES="black", Seurat="dodgerblue", "zingeR_edgeR"="hotpink1", zingeR_DESeq2="darkolivegreen4", diffxpy_wald="dodgerblue")
# #iCOBRA converts '-' to '.'. Redo this.
# cobraNames = sort(names(cobraperf@overlap)[1:(ncol(cobraperf@overlap)-1)])
# cobraNames = gsub(x=cobraNames, pattern=".", fixed=TRUE, replacement="-")
# colsCobra=colors[match(cobraNames,names(colors))]
# cobraplot <- prepare_data_for_plot(cobraperf, colorscheme=colsCobra)
# #save(cobraplot,file="cobraplotIslamNoLimma.rda")
# plot_fdrtprcurve(cobraplot, pointsize=1) +xlab("FDP")
# res10 = res[1:10]
# names(res10) = gsub('_common', '', names(res10))
# pp = COBRAData(pval = as.data.frame(do.call(cbind, lapply(res10, '[[', 1))),
# padj = as.data.frame(do.call(cbind, lapply(res10, '[[', 2))),
# truth = data.frame(status = trueDE))
# cobraperf <- calculate_performance(pp, binary_truth = "status", thrs = 0.05)
#
# reds = brewer.pal(11, "RdYlGn")[1:3]
# blues = rev(brewer.pal(11, "RdYlBu"))[1:3]
# brown = brewer.pal(8, "Dark2")[4]
# greens = rev(brewer.pal(11, "PiYG"))[1:3]
# mycol = c(blues[1], greens[1], reds[1], brown, blues[2], greens[2], reds[2],
# blues[3], greens[3], reds[3], 'black')
# names(mycol) = c(names(res10), 'truth')
# names(cobraperf@overlap) = names(mycol)
# colsCobra <- mycol[match(sort(names(cobraperf@overlap)[1:(ncol(cobraperf@overlap)-1)]), names(mycol))]
# cobraplot <- prepare_data_for_plot(cobraperf, colorscheme = colsCobra,
# facetted = FALSE)
#
# p1 <- plot_fdrtprcurve(cobraplot, plottype = c("curve", "points"), pointsize = 1,
# linewidth = .5, xaxisrange = c(0, .5)) +
# theme(axis.text.x = element_text(size = 10),
# axis.text.y = element_text(size = 10),
# axis.title.x = element_text(size = 15),
# axis.title.y = element_text(size = 15),
# legend.text=element_text(size=7)) + theme(legend.position="none")
#
# orderLegend = c(2, 9, 6, 1, 8, 5, 3, 10, 7, 4)
# p2 <- plot_fdrtprcurve(cobraplot, plottype = c("curve", "points"), pointsize = 1,
# linewidth = .5, xaxisrange = c(0, .5)) +
# theme(legend.text=element_text(size=7)) +
# scale_color_manual(labels = names(colsCobra)[orderLegend],
# values = unname(colsCobra)[orderLegend],
# name = 'Method')
# legend <- get_legend(p2)
#
# #pdf("../../draftOverleaf/10963157vrrwqjqjdrnf/performanceIslamfc2.pdf")
# plot_grid(p1, legend, nrow = 1, ncol = 2, rel_widths = c(1, .4))
# #dev.off()# png("~/Dropbox/phdKoen/singleCell/zinbwaveZinger/plots2/pvalsIslamSim.png", width=9,height=9, units="in", res=300)
ylim = c(0, 3700)
par(mfrow = c(4,4), mar=c(3,2,1,1))
hist = lapply(1:length(res), function(i){
hist(res[[i]][,'pval'], main = names(res)[i], ylim = ylim, xlab = 'pvalues', breaks=seq(0,1,0.05))
})
# dev.off()