ChIP-Seq Pipeline

References:

  1. http://biocluster.ucr.edu/~rkaundal/workshops/R_feb2016/ChIPseq/ChIPseq.html
  2. https://github.com/ENCODE-DCC/chip-seq-pipeline2

Quality Control–Fastp

1
2
3
4
fastp \
-i raw_R1.fq.gz -I raw_R2.fq.gz \
-o filter_R1.fq.gz -O filter_R2.fq.gz \
-h sample.QC.html

Alignment–Bowtie2

1
2
3
4
5
6
7
8
# Build index
bowtie2-build GRCh38.p14.genome.fa /opt/genome/hg38/bowtie2_index

# Alignment
bowtie2 \
-p 4 -X 2000 \
-x /opt/genome/hg38/bowtie2_index \
-1 filter_R1.fq.gz -2 filter_R2.fq.gz | samtools sort -@ 4 -O bam -o sample.raw.bam

Bam Filter

1
2
3
4
5
6
7
8
9
10
11
12
13
14
samtools view -F 1804 -f 2 -q 30 -u sample.raw.bam | samtools sort -n -o sample.f1.bam

samtools fixmate -r sample.f1.bam sample.fixmate.bam

samtools view -F 1804 -f 2 -u sample.fixmate.bam | samtools sort -o sample.f2.bam

java -Xmx4G -jar picard.jar MarkDuplicates \
INPUT=sample.f2.bam \
OUTPUT=sample.mkdup.bam \
METRICS_FILE=sample.metri.txt \
VALIDATION_STRINGENCY=LENIENT ASSUME_SORTED=true REMOVE_DUPLICATES=false

samtools view -F 1804 -f 2 -b sample.mkdup.bam > sample.filter.bam
samtools index sample.filter.bam

Bam to Bigwig–Deeptools

1
2
3
4
bamCoverage \
-b sample.filter.bam \
-o sample.bw \
-of bigwig --normalizeUsing BPM --binSize 5 --ignoreDuplicates -p 4

Calling Peaks–MACS3

1
2
3
4
5
6
7
8
macs3 predictd -i sample.filter.bam # extsize = (# predicted fragment length is 211 bps from process log)

macs3 callpeak \
-t sample.filter.bam \
-c input.filter.bam \
-n sample \
--outdir /opt/chip/sample \
-f BAMPE -g hs -B --nomodel --extsize 190

Differential Peaks

Without replicates–MACS3 bdgdiff

1
2
3
4
5
6
7
8
9
macs3 bdgdiff \
--t1 c1_treat_pileup.bdg \
--c1 c1_control_lambda.bdg \
--t2 c2_treat_pileup.bdg \
--c2 c2_control_lambda.bdg \
--d1 17249333 \ # c1 fragments after filtering in control (From process log)
--d2 20109621 \ # c2 fragments after filtering in control
-g 60 -l 190 \
--o-prefix sample

With replicates–DiffBind

  • DiffBind Installation: conda install -c bioconda bioconductor-diffbind

Peak Annotation

Signal around gene–Deeptools

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# 1. Compute matrix
computeMatrix scale-regions \
--skipZeros --missingDataAsZero -p 4 \
-b 3000 -a 3000 \
--regionBodyLength 5000 \
-S H3K27ac.bw H3K4me3.bw INPUT.bw \
-R gencode.v44.annotation.gtf --metagene \
-o gene.gz \
--outFileSortedRegions gene.bed

# 2. Plot heatmap
plotHeatmap --matrixFile gene.gz -out gene.png --colorMap Blues Blues Greens Greens Oranges --heatmapWidth 8

# Get protein coding bed
grep "protein_coding" gencode.v44.annotation.gtf | perl -alne '{next unless $F[2] eq "gene"; /gene_name \"(.*?)\";/; print "$F[0]\t$F[3]\t$F[4]\t$1"}' > protein_coding.bed

Signal around peak center–Deeptools

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# 1. Merge peaks (summits)
cat *summits.bed > merge_summits.bed
sort -k1,1 -k2,2n merge_summits.bed > sort_merge_summits.bed
bedtools merge -i sort_merge_summits.bed > merged_summits.bed

# 1. Merge peaks (summits 250bp)
for i in `ls *summits.bed`; do awk 'BGEIN{OFS="\t"}{print $1"\t"$2-250"\t"$3+250}' $i > 250_$i; done
cat 250*summits.bed | awk '{if($2<0){print $1"\t"0"\t"$3} else {print $0}}' | sortBed -i - | bedtools merge -i - > merged_summits.bed

# 2. Compute matrix
computeMatrix reference-point --referencePoint center \
--skipZeros --missingDataAsZero -p 4 \
-b 3000 -a 3000 \
-S H3K27ac.bw H3K4me3.bw INPUT.bw \
-R merged_summits.bed \
-o merged_summits_center.gz \
--outFileSortedRegions merged_summits_center.bed

# 3. Plot heatmap
plotHeatmap --matrixFile merged_summits_center.gz -out merged_summits_center.png --startLabel "Peak Start" --endLabel "Peak End" --xAxisLabel "" --regionsLabel "Peaks" --heatmapWidth 6

Peak annotation–ChIPseeker

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
library(ChIPseeker)
library(GenomicFeatures)
library(tidyverse)

txdb <- makeTxDbFromGFF('gencode.v44.annotation.gff3')
saveDb(txdb, file="gencode.v44.sqlite")
hg38 <- loadDb("gencode.v44.sqlite")

samplefiles <- list.files(".", pattern = ".broadPeak", full.names = T)
samplefiles <- as.list(samplefiles)
names(samplefiles) <- c("H3K27ac", "H3K4me3")
peakAnnoList <- lapply(samplefiles, annotatePeak, TxDb=hg38,
tssRegion=c(-3000, 3000), addFlankGeneInfo = TRUE, flankDistance = 5000)
H3K27ac_anno <- as.data.frame(peakAnnoList[["H3K27ac"]]@anno)

tx2gene <- read.table("gene2gene.txt", header = T)
tx2gene <- tx2gene %>% as.data.frame()

H3K27ac_gi <- unique(H3K27ac_anno$geneId)
H3K27ac_gs <- tx2gene %>%
filter(geneId %in% H3K27ac_gi)
m <- match(H3K27ac_anno$geneId, H3K27ac_gs$geneId)
H3K27ac_anno <- cbind(H3K27ac_anno[,1:18], geneSymbol=H3K27ac_gs$symbol[m], H3K27ac_anno[,19:21])
write.table(H3K27ac_anno, file="H3K27ac_peakAnn.txt", sep="\t", quote=F, row.names=F)

peak1 <- readPeakFile('sample1_peaks.narrowPeak')
peak2 <- readPeakFile('sample2_peaks.narrowPeak')
peaks <- list(sample1 = peak1, sample2 = peak2)
plotPeakProf2(peaks, upstream = 3000, downstream = 3000, conf = 0.95,
by = "gene", type = "start_site", TxDb = hg38, facet="row")
plotPeakProf2(peaks, upstream = rel(0.2), downstream = rel(0.2),
conf = 0.95, by = "gene", type = "body", nbin = 800,
TxDb = hg38, facet="row")
peakAnnoList <- lapply(peaks, annotatePeak, TxDb=hg38,
tssRegion=c(-3000, 3000), annoDb="org.Hs.eg.db") # annoDb="org.Hs.eg.db" will produce the gene symbol
plotAnnoBar(peakAnnoList)
plotDistToTSS(peakAnnoList)
upsetplot(peakAnnoList[1], vennpie=TRUE)
write.table(peakAnnoList[1], file = 'sample1_peak_ann.txt', sep = '\t', quote = FALSE, row.names = FALSE)

Calculate peak intensity and enrichment

  1. gunzip center.mat.gz
  2. sed -i '1d' center.mat
  3. cut -f 297-317 center.mat|awk '{sum = 0; for (i = 1; i <= NF; i++) sum += $i; sum /= NF; print sum}' > sample1_intensity.txt
  4. cut -f 197-297,317-417 center.mat|awk '{sum = 0; for (i = 1; i <= NF; i++) sum += $i; sum /= NF; print sum}' > sample1_enrichment.txt
  5. paste sample1_intensity.txt sample1_enrichment.txt > temp1.txt
  6. awk '{if($2==0){print $1"\t0"}else{print $1"\t"$1/$2}}' temp1.txt > sample1.txt
  7. cut -f 1-4 center.mat > rowname.txt
  8. paste rowname.txt sample1.txt > peak_intensity.txt
Author: Giftbear
Link: https://giftbear.github.io/2024/02/27/ChIP-Seq_Pipeline/
Copyright Notice: All articles in this blog are licensed under CC BY-NC-SA 4.0 unless stating additionally.