还是之前拿到的转录组数据,只有表达量矩阵是不行的,作为 RNA-seq 下游分析重要的一步,差异表达分析将不同处理下的基因表达数据同对照组做统计分析,筛选出具有显著表达变化的基因集,这一步对后面的研究是很有必要的。
首先加载 R 包:
library(DESeq2)
读入表达矩阵,以基因 ID 为行名:
counts = read.csv(
'gene2.counts',
header = T,
sep = '\t',
row.names = "Geneid",
comment.char = '#',
check.name = F
)
我这里拿到的表达矩阵的数据格式是这样的:
# Program:featureCounts v2.0.6; Command:"featureCounts" "-T" "5" "-t" "exon" "-g" "Name" "-a" "Lolium_perenne.gff3" "-o" "gene.counts" "-p" "C0_1" "C0_2" "C0_3" "C50_1" "C50_2" "C50_3" "C500_1" "C500_2" "C500_3"
Geneid Chr Start End Strand Length C0_1 C0_2 C0_3 C50_1 C50_2 C50_3 C500_1 C500_2 C500_3
KYUSt_chr1.5-E1 1 53340 53891 + 552 233 146 195 201 189 264 177 326 243
KYUSt_chr1.6-E1 1 54648 55664 - 1017 2 4 8 12 6 3 20 47 17
KYUSt_chr1.7-E1 1 59936 60283 + 348 3 0 6 7 0 0 1 0 0
KYUSt_chr1.8-E1 1 61782 61841 + 60 0 0 0 0 0 0 2 0 0
KYUSt_chr1.8-E2 1 62373 62621 + 249 9 21 81 34 17 8 57 12 9
KYUSt_chr1.9-E1 1 63456 63489 + 34 0 0 0 0 0 0 0 0 0
KYUSt_chr1.9-E2 1 63904 63983 + 80 0 0 0 0 0 0 0 0 0
KYUSt_chr1.9-E3 1 64100 64177 + 78 0 0 0 0 0 0 0 0 0
KYUSt_chr1.9-E4 1 64286 64361 + 76 0 0 0 0 0 0 0 0 0
对数据要先进行一些预处理,比如删掉前五行不需要的数据:
counts = counts[,-c(1:5)]
我的这个数据里面有很多表达量很低或者都是 0 的,这种数据自然是直接删掉为好,只保留相加大于 10 的数据:
counts = counts[rowSums(counts)>10, ]
创建样本信息数据框:
samples = data.frame(
sampleID = c("C0_1", "C0_2", "C0_3", "C50_1", "C50_2", "C50_3", "C500_1", "C500_2", "C500_3"),
sample = c("sample1", "sample1", "sample1", "sample2", "sample2", "sample2", "sample3", "sample3", "sample3")
)
按照 sampleID 更改 samples 的行名:
rownames(samples) = samples$sampleID
将样本类型列转换为因子型数据,并设定默认排序为 1=sample1, 2=sample2, 3=sample3:
samples$sample = factor(samples$sample, levels = c('sample1', 'sample2', 'sample3'))
将读入的基因表达矩阵和样本信息构建为 DESeqDataSet 对象,使 DESeq2 包可读:
dds = DESeqDataSetFromMatrix(countData = counts, colData = samples, design = ~sample)
进行差异分析,计算差异倍数和 p 值:
dds_count <- DESeq(dds, fitType = 'mean', minReplicatesForReplace = 7, parallel = FALSE)
因为我这里有三组样本,分别是 C0、C50、C500,于是分为两组进行计算:
sampl1_vs_sample2 <- results(dds_count, contrast = c('sample', 'sample1', 'sample2'))
sampl1_vs_sample3 <- results(dds_count, contrast = c('sample', 'sample2', 'sample3'))
最后将两组结果转为数据框并且写入文件:
result1 <- data.frame(sampl1_vs_sample2, stringsAsFactors = FALSE, check.names = FALSE)
result2 <- data.frame(sampl1_vs_sample3, stringsAsFactors = FALSE, check.names = FALSE)
write.table(result1, 'sampl1_vs_sample2.DESeq2.txt', col.names = NA, sep = '\t', quote = FALSE)
write.table(result2, 'sampl1_vs_sample3.DESeq2.txt', col.names = NA, sep = '\t', quote = FALSE)
最后得到的数据格式如下:
baseMean log2FoldChange lfcSE stat pvalue padj
KYUSt_chr1.5-E1 213.082745455399 -0.159514113054599 0.32122397700057 -0.496582212025585 0.619483699498433 1
KYUSt_chr1.6-E1 12.858979620238 -0.583646140965058 0.84821388423366 -0.688088407668978 0.491397110057002 1
KYUSt_chr1.7-E1 1.93773722098783 0.307122539555924 2.44861358460283 0.125427115771613 0.900185422338187 NA
KYUSt_chr1.8-E2 27.843256050414 0.92386601716311 1.12261950538863 0.822955607602139 0.41053323848971 0.993851867813233
KYUSt_chr1.107-E3 9624.18154923748 1.2105461981764 0.753264345021032 1.60706690311036 0.108039692660408 0.826100628829101
KYUSt_chr1.107-E2 5756.42276327944 0.282255247362519 0.52083207523245 0.541931384000394 0.587865775653251 1
KYUSt_chr1.107-E1 6541.69156640286 0.396931212730972 0.672807064222869 0.589962908890457 0.555215516638021 1
KYUSt_chr1.117-E1 87.1271890512162 0.356042837051628 0.492560961499353 0.722840145446841 0.469778099923088 1
然后使用 ggplot2 包绘制火山图:
library(ggplot2)
data <- read.table('./sampl1_vs_sample2.DESeq2.txt', header = TRUE)
data <- as.data.frame(data)
设置 pvalue 和 logFC 的阈值:
cut_off_pvalue = 0.0000001
cut_off_logFC = 1
据阈值分为上调基因 down 和下调基因 down,无差异基因为 stable,并保存到 change 列:
data$change <- ifelse (
data$pvalue < cut_off_pvalue & abs(data$log2FoldChange) >= cut_off_logFC,
ifelse(data$log2FoldChange > cut_off_logFC, 'Up', 'Down'),
'Stable'
)
p <- ggplot(
data, # 使用的数据框
aes(
x = log2FoldChange, # x 轴表示基因的对数倍变化
y = -log10(pvalue), # y 轴表示-p 值的对数
colour = change, # 根据变化情况着色
)) +
geom_point(
alpha = 0.4, # 设置散点的透明度
size = 3.5, # 设置散点的大小
) +
scale_color_manual(values = c("#546de5", "#d2dae2", "#ff4757")) + # 手动设置颜色
geom_vline(xintercept = c(-1, 1), lty = 4, col = "black", lwd = 0.8) + # 添加垂直虚线
geom_hline(yintercept = -log10(cut_off_pvalue), lty = 4, col = "black", lwd = 0.8) + # 添加水平虚线
labs(
x = "log2(fold change)", # x 轴标签
y = "-log10 (p-value)" # y 轴标签
) +
theme_bw() + # 使用白底主题
theme(
plot.title = element_text(hjust = 0.5), # 设置标题居中
legend.position = "right", # 设置图例位置
legend.title = element_blank() # 隐藏图例标题
)
plot <- p
plot # 绘图并展示
将绘图结果保存为文件:
ggsave(plot, filename = "volcano.png", width = 10, height = 6, dpi = 300)