CUTTag Pipeline

References: https://yezhengstat.github.io/CUTTag_tutorial/

Alignment–Bowtie2

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Build hg38 index
bowtie2-build GRCh38.p14.genome.fa /opt/database/hg38
# Build E.coli index
bowtie2-build Escherichia_coli_str_k_12_substr_mg1655_gca_000005845.ASM584v2.dna.chromosome.Chromosome.fa /opt/database/ecoli

# Alignment
bowtie2 --end-to-end --very-sensitive --no-mixed --no-discordant --phred33 \
-I 10 -X 700 -p 4 \
-x /opt/database/hg38 \
-1 sample.R1.fastq.gz -2 sample.R2.fastq.gz \
-S sample.sam &> sample.txt

bowtie2 --end-to-end --very-sensitive --no-mixed --no-discordant --phred33 \
-I 10 -X 700 -p 4 \
-x /opt/database/ecoli \
-1 sample.R1.fastq.gz -2 sample.R2.fastq.gz \
-S sample.spikeIn.sam &> sample.spikeIn.txt

Bam convert to bedgraph

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
samtools view -bS -F 0x04 sample.sam > sample.mapped.bam
bedtools bamtobed -bedpe -i sample.mapped.bam > sample.bed
awk '$1==$4 && $6-$2 < 1000 {print $0}' sample.bed > sample.clean.bed
cut -f 1,2,6 sample.clean.bed | sort -k1,1 -k2,2n -k3,3n > sample.fragments.bed

# Spike-in calibration (Optional)
seqDepthDouble=`samtools view -F 0x04 sample.spikeIn.sam | wc -l`
seqDepth=$((seqDepthDouble/2))
echo $seqDepth > sample.spikeIn.seqDepth

seqDepth=`cat sample.spikeIn.seqDepth`
if [[ "$seqDepth" -gt "1" ]]
then
scale_factor=`echo "10000 / $seqDepth" | bc -l`
bedtools genomecov -bg -scale $scale_factor \
-i sample.fragments.bed \
-g GRCh38.p14.genome.size > sample.fragments.normalized.bedgraph
fi
  • Get GRCh38.p14.genome.size: samtools faidx GRCh38.p14.genome.fa;cut -f 1,2 GRCh38.p14.genome.fa.fai > GRCh38.p14.genome.size

Calling peaks–SEACR

1
2
3
4
bash /opt/SEACR-master/SEACR_1.3.sh \
sample.fragments.normalized.bedgraph \
0.01 \
non stringent sample_seacr_peak

Visualization–Deeptools

Bam convert to bigwig

1
2
3
samtools sort -o sample.sorted.bam sample.mapped.bam                                                     
samtools index sample.sorted.bam
bamCoverage -b sample.sorted.bam -o sample.bw -of bigwig --normalizeUsing BPM --binSize 5 --ignoreDuplicates -p 8

Signal around gene

1
2
3
4
5
6
7
8
9
computeMatrix scale-regions -S sample.bw \
-R /opt/database/gencode.v44.annotation.gtf \
--beforeRegionStartLength 3000 \
--regionBodyLength 5000 \
--afterRegionStartLength 3000 \
--skipZeros --missingDataAsZero --metagene -p 8 \
-o gene_mat.gz

plotHeatmap -m gene_mat.gz -out gene_enrichment.png --sortUsing sum --heatmapWidth 8

Signal around peak center

1
2
3
4
5
6
7
8
9
awk '{split($6, summit, ":"); split(summit[2], region, "-"); print summit[1]"\t"region[1]"\t"region[2]}' sample_seacr_peak.stringent.bed > sample.summitRegion.bed

computeMatrix reference-point --referencePoint center \
-S sample.bw \
-R sample.summitRegion.bed \
-a 3000 -b 3000 \
--skipZeros --missingDataAsZero -p 8 \
-o sample_mat.gz
plotHeatmap -m sample_mat.gz -out sample_heatmap.png --sortUsing sum --startLabel "Peak Start" --endLabel "Peak End" --xAxisLabel "" --regionsLabel "Peaks"

Peak annotation–ChIPseeker

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
library(ChIPseeker)
library(GenomicFeatures)
library(tidyverse)

hg38 <- loadDb("gencode.v44.sqlite")

peak1 <- readPeakFile('sample1_seacr_peak.stringent.bed')
peak2 <- readPeakFile('sample2_seacr_peak.stringent.bed')

peaks <- list(sample1 = peak1, sample2 = peak2)

plotPeakProf2(peaks, upstream = 3000, downstream = 3000, conf = 0.95,
by = "gene", type = "start_site", TxDb = hg38, facet="row")

plotPeakProf2(peaks, upstream = rel(0.2), downstream = rel(0.2),
conf = 0.95, by = "gene", type = "body", nbin = 800,
TxDb = hg38, facet="row")

peakAnnoList <- lapply(peaks, annotatePeak, TxDb=hg38,
tssRegion=c(-3000, 3000), annoDb="org.Hs.eg.db")
write.table(peakAnnoList[1], file = 'sample1_peak.txt', sep = '\t', quote = FALSE, row.names = FALSE)
write.table(peakAnnoList[2], file = 'sample2_peak.txt', sep = '\t', quote = FALSE, row.names = FALSE)

plotAnnoBar(peakAnnoList)
plotDistToTSS(peakAnnoList)
Author: Giftbear
Link: https://giftbear.github.io/2024/03/15/CUTTag_Pipeline/
Copyright Notice: All articles in this blog are licensed under CC BY-NC-SA 4.0 unless stating additionally.