DESeq2差异表达分析

tximport导入表达定量文件

获取转录本ID所对应的基因Id

1
2
3
4
library(GenomicFeatures)
txd <- makeTxDbFromGFF(file='gencode.v44.annotation.gtf',format='gtf',dataSource='gencode.v44',organism ='Homo sapiens')
txTogene <- AnnotationDbi::select(txd, keys = keys(txd, 'TXNAME'), keytype = 'TXNAME', columns = c('TXNAME', 'GENEID'))
write.csv(txTogene, "tx2gene.csv")

获取转录本ID所对应的基因Symbol

grep "transcript_id" gencode.v44.annotation.gtf |cut -f 9|cut -f 2,4 -d ";" |cut -f 3,5 -d " "|sed 's/"//g' |sed 's/; /\t/g' |uniq > txtogene.txt

获取基因ID所对应的基因Symbol

grep "transcript_id" gencode.v44.annotation.gtf |cut -f 9|cut -f 1,2,4 -d ";" |cut -f 2,6 -d " "|sed 's/"//g' |sed 's/; /\t/g' |uniq > gene2gene.txt

Salmon导入

1
2
3
4
5
6
7
8
9
10
11
12
library(tximport)

tx2gene <- read.csv("tx2gene.csv")
salmonOut <- "/home/deseq2/salmon"
sample <- c("A","B","C")

data <- file.path(salmonOut, sample, "quant.sf")
names(data) <- sample
txi <- tximport(data, type = 'salmon', tx2gene = tx2gene)

# View expression matrix
head(txi$counts)

Kallisto导入

1
2
3
4
data <- file.path(kallistoOut, sample, "abundance.h5")
names(data) <- sample
# if not set ignoreAfterBar = TRUE, the gene will contain the "|" and the long string.
txi <- tximport(data, type = 'kallisto', tx2gene = tx2gene, ignoreAfterBar = TRUE)

RSEM导入

1
2
3
4
5
6
7
data <- file.path(rsemOut, paste0(sample, ".genes.results"))
names(data) <- sample
txi <- tximport(data, type = 'rsem', txIn = TRUE, txOut = TRUE)
# Fix the error: all(lengths >0) is not TRUE
txi$abundance = txi$abundance[apply(txi$length,1,function(row) all(row!=0)),]
txi$counts = txi$counts[apply(txi$length,1,function(row) all(row!=0)),]
txi$length = txi$length[apply(txi$length,1,function(row) all(row!=0)),]

tximport Reference: https://bioconductor.org/packages/release/bioc/vignettes/tximport/inst/doc/tximport.html

表达计数矩阵导入

1
2
3
4
5
6
7
8
9
mat = order(rowMeans(data[,-1]),decreasing = T)
mat_order = data[mat,]
keep = !duplicated(mat_order$Gene)
exp_mat = mat_order[keep,]
name = exp_mat[,1]
exp_mat[,1] = NULL
txi = as.matrix(exp_mat)
txi = apply(txi, 2, as.integer)
rownames(txi) = name
  • data为行为基因列为样本的文件
  • 重复的基因名去重
  • DESeq2输入的为未标准化的count矩阵, count为整数

DESeq2差异表达分析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
library(DESeq2)

sample <- c("A","B","C","D")
group <- c("control","control","case","case")
grouplist <- data.frame(sample, group)

dds <- DESeqDataSetFromTximport(txi, colData = grouplist, design = ~group) # From tximport
dds <- DESeqDataSetFromMatrix(countData = txi, colData = grouplist, design = ~group) # From matrix
dds <- DESeq(dds)

# Select the contrast result
res <- results(dds, contrast = c("group", "case", "control"))
res <- res[order(res$padj),]
res <- subset(res, padj < 0.1)
res <- na.omit(res)
write.table(res, "deg.txt", sep = "\t", quote = F, row.names = F)

Reference: https://www.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html

Author: Giftbear
Link: https://giftbear.github.io/2023/12/01/DESeq2差异表达分析/
Copyright Notice: All articles in this blog are licensed under CC BY-NC-SA 4.0 unless stating additionally.