A pipeline for RNASeq data analysis in R

Team in Bioinformatics

Posted by Yajin on October 26, 2020

Bioinfomatics Group 2

RNA-seq analysis

陈卓、莫亚金、王艺、危子航

共同出品


本文通过R语言,以TCGA中下载的关于肝癌和正常组织的RNA-seq表达矩阵为例,进行差异表达分析的流程演示。

具体流程和过程中生成的文件如下图:

pipeline

通过这一套流程,我们希望找出癌变组织与正常组织之间的差异基因、各个样本所属的类群、差异基因的蛋白质互作关系,并将这些结果可视化呈现,最终寻找在癌变组织与正常组织之间起到核心功能的重要差异基因,以及这些样本中是否有值得关注的新的组织类型。

需要的R package:

1
2
3
4
5
6
7
8
library(edgeR)
library(limma)
library(ggplot2)
library(ggThemeAssist)#非必需
library(ggrepel)
library(tidyverse)
library(pheatmap)
library(monocle)

数据预处理

去除重复值、注释基因名,为后续寻找差异基因做准备

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#读入数据
data_count<-read.csv("D:/更换数据/更换数据/modified HTseq-counts data delete 150 tumor samples.csv")
gene_id<-read.table("D:/更换数据/更换数据/biomart_export.txt/biomart_export.txt",sep=",",header = TRUE)

#去掉gene_id(按照ensembl_id)中的重复名
index<-duplicated(gene_id$Gene.stable.ID)
gene_id_clean<-gene_id[!index,]
gene_id_converse<-select(gene_id_clean,"Gene.stable.ID","Gene.name")

#ID转换
ensemble_id<-substr(data_count[,1],1,15)
data_count[,1]<-ensemble_id
data_clean<-merge(x=data_count,y=gene_id_converse,by.x="Ensembl_ID",by.y="Gene.stable.ID",all=FALSE)

#去掉data_clean(按照gene_name)中的重复名 ‘MATR3’, ‘PRAMEF7’, ‘TMSB15B’
index1<-duplicated(data_clean$Gene.name)
data_clean<-data_clean[!index1,]
row.names(data_clean)<-data_clean$Gene.name
data_clean<-data_clean[,-1]
data_clean<-data_clean[,-275]


#区分control与tumor组
n<-c()
m<-c()
for(i in 1:274){
  if(substr(colnames(data_clean)[i],14,15)==11){
    n<-append(n,colnames(data_clean)[i])}
  else
    m<-append(m,colnames(data_clean)[i])
}

data_control<-select(data_clean,n)
data_tumor<-select(data_clean,m)
data_input<-cbind(data_control,data_tumor)

write.csv(data_input,"D:/更换数据/data_input.csv")

输出:整理好的counts列表

用edgeR寻找差异基因

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
##读入数据
data_input<-read.csv("D:/更换数据/data_input.csv")
rownames(data_input)<-data_input$X
data_input<-data_input[,-1]


##设置分组信息
group_list<-factor(c(rep("normal",50),rep("tumor",224)))

##去除低表达量的gene,保留在至少在两个样本中cpm值大约1的基因
keep <- rowSums(cpm(data_input)>1) >= 2

##创建DEGList类型变量
DEG<-DGEList(counts = data_input,group = group_list)

write.csv(DEG$counts,"D:/更换数据/data_.csv")

##计算标准化因子
DEG<-calcNormFactors(DEG)

##计算离散度
DEG<-estimateCommonDisp(DEG)
DEG<-estimateTagwiseDisp(DEG)

##输出标准化基因表达矩阵
newData=DEG$pseudo.counts
normalizeExp=newData
write.table(normalizeExp,file="normalizeExp.txt",sep="\t",quote=F,col.names=F)
#显著性检验
et<-exactTest(DEG)
tTag<-topTags(et,n=nrow(data_input))
ordered_tags=topTags(et,n=50000)
allDiff=ordered_tags$table
allDiff=allDiff[is.na(allDiff$FDR)==FALSE,]
diff=allDiff
####---------可选---------##差异基因

padj=0.001    ###FDR cut off阈值
foldChange=2 ###差异基因Log2FC变化的阈值(绝对值)

diffSig = diff[(diff$FDR < padj & (diff$logFC>foldChange | diff$logFC<(-foldChange))),]
write.table(diffSig, file="diffSig.xls",sep="\t",quote=F)
diffUp = diff[(diff$FDR < padj & (diff$logFC>foldChange)),]
write.table(diffUp, file="up.xls",sep="\t",quote=F)
diffDown = diff[(diff$FDR < padj & (diff$logFC<(-foldChange))),]
write.table(diffDown, file="down.xls",sep="\t",quote=F)
####-----------------------------------

normalizeExp=as.data.frame(rbind(id=colnames(newData),newData))
write.table(normalizeExp,file="normalizeExp.txt",sep="\t",quote=F,col.names=F)   #输出所有基因校正后的表达值(normalizeExp.txt)
diffExp=rbind(id=colnames(newData),newData[rownames(diffSig),])
write.table(diffExp,file="diffmRNAExp.txt",sep="\t",quote=F,col.names=F)         #输出差异基因校正后的表达值(diffmRNAExp.txt)

输出:差异表达基因的列表、标准化后的counts列表

火山图可视化差异分析结果

标记各个基因(上调、下调、无显著性差异),阈值可根据实际情况自行修改

1
2
3
4
5
6
#读取差异表达分析后的结果
#logFC:log10Flodchange;FDR:矫正后的p值;X:基因的列名
hs<-read.csv("fold_change.csv")
#标记差异基因
hs$threshold= factor(ifelse(hs$FDR<0.001 & abs(hs$logFC)>=2,ifelse(hs$logFC>=2,'UP','Down'),'NoSignificance'),
                     levels = c('UP','Down','NoSignificance'))

破产版

1
2
3
4
5
6
7
8
9
10
ggplot(hs,aes(logFC,-log10(FDR),color=threshold))+
  geom_point(size=3,alpha=0.6)+
  geom_vline(xintercept = c(-2,2),linetype=2)+
  geom_hline(yintercept = -log10(0.001),linetype=2)+
  scale_color_manual(values = c("#DC143C","#00008B","#808080"))+
  labs(title = "Volcano Plot",
       y = "-log10(Qvalue)", x = "log10(FoldChagne)",colour = "Type")+
  geom_text_repel(data = hs[-log10(hs$FDR)>50 & abs(hs$logFC)>=2,],
                  aes(label=X),color="black",size=3,)+#标记基因名,FDR<10^(-50)且|Log2FC|>2的
  theme_bw()

1

进化版

1
2
3
4
5
6
7
8
9
10
11
12
13
###-----colorpicker
CPCOLS <- c("#C90C64", "#080FF2", "#AD9F9F")

ggplot(hs,aes(logFC,-log10(FDR),color=threshold))+
  geom_point(size=3,alpha=0.6)+
  geom_vline(xintercept = c(-2,2),linetype=2)+
  geom_hline(yintercept = -log10(0.001),linetype=2)+
  scale_color_manual(values = c("#C90C64", "#080FF2", "#AD9F9F"))+
  labs(title = "Volcano Plot",
       y = "-log10(Qvalue)", x = "log10(FoldChagne)",colour = "Type")+
  geom_text_repel(data = hs[-log10(hs$FDR)>50 & abs(hs$logFC)>=2,],
                  aes(label=X),color="black",size=3,)+#标记基因名,FDR<10^(-50)且|Log2FC|>2的
  theme_bw()

2

专业豪华版

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
ggplot(hs,aes(logFC,-log10(FDR),color=threshold))+
  geom_point(size=3,alpha=0.6)+
  geom_vline(xintercept = c(-2,2),linetype=2)+
  geom_hline(yintercept = -log10(0.001),linetype=2)+
  geom_hline(yintercept = 50,linetype=2)+
  scale_color_manual(values = c("#C90C64", "#080FF2", "#AD9F9F"))+
  labs(title = "Volcano Plot",
       y = "-log10(Qvalue)", x = "log10(FoldChagne)",colour = "Type")+
  geom_text_repel(data = hs[-log10(hs$FDR)>50 & abs(hs$logFC)>=2,],
                  aes(label=X),color="black",size=3,family='serif')+
  scale_x_continuous(limits=c(-6,6))+theme_classic()+
  theme(plot.title =element_text(hjust = 0.5,family = 'serif',size=20),
        axis.title.x = element_text(family='serif',size=18),
        axis.title.y = element_text(family='serif',size=18),
        legend.title = element_text(family='serif',size=18),
        legend.text = element_text(family='serif',size=15))

火山图

通过热图观察整体数据情况

由于本次数据中有很多个样本,故可以用ggplot()+geom_point()来进行一个简易热图的绘制

简易版

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#数据筛选
#读取的是标准化后的表达矩阵
nor<-read.table('diffmRNAExp.txt',sep=',')
#选择有差异的基因
diff<-hs[which(!(hs$threshold=='NoSignificance')),] #选择有差异的基因
hmmtrix<-nor[which(nor$genes %in% diff$X),] 
#给基因排个序
hmmtrix$genes%<>%
  factor(levels = diff$X)

#转换为长列表
lhm<-gather(hmmtrix,"sample","value",-genes)
#给样本排个序
colname<-colnames(nor)
col<-colname[2:275]
lhm$sample%<>%
  factor(levels = col)

#画图
ggplot(lhm,aes(sample,genes,color=value))+
  geom_point(shape="square")+
  scale_colour_gradient2(low = "#009ACD",mid = "white",high = "#EE7600",midpoint = 3,) +
  theme(panel.grid.major = element_line(linetype = "blank"), 
    panel.grid.minor = element_line(linetype = "blank"), 
    axis.text.x = element_text(size = 1), 
    axis.text.y = element_text(size = 3)) +
  labs(x = "samples", subtitle = "Heat map") 

4

含聚类的Pro版

这个需要一个pheatmap的package来实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
a=read.table('diffmRNAExp.txt',sep='\t',header=T,stringsAsFactors = F)
heatmapData=a
rownames(heatmapData)=heatmapData[,1]
heatmapData=heatmapData[,-1]
rm(a)
hmExp=log10(heatmapData+0.000001)

##设置分组信息
rownames(group_dataframe)=colnames(heatmapData)
group_dataframe$standard=0
group_dataframe=group_dataframe[,-1]
library(pheatmap)
library(ggplot2)
range(hmExp) ###从-6-6.312
p=pheatmap(hmExp,annotation_col=group_dataframe,
           fontsize_col = 1,
           fontsize_row = .5,
           cutree_rows = 5,
           cluster_rows = TRUE,
           cluster_cols = TRUE,
           clustering_method = 'complete',
           clustering_distance_rows = 'manhattan',
           clustering_distance_cols = 'manhattan',
           display_numbers = F,
           main = 'Tumors vs Normal')
###参数解释:
###annotation_col: 分组标记
###cluster_row 是否按照行聚类
###cluster_cols 是否按照列聚类
###cutree_rows 人为划分行聚类
###clustering_methods: 聚类方法
###clustering_distance_rows/cols:聚类显示方法
###display_numbers:是否显示数字
###main 标题

5

通过聚类降维观察整体数据情况

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
data <- read.delim("D:/VSCode_WorkSpace/temp_pre/data/data_input.csv",sep=",")
cell_name <-read.delim("D:/VSCode_WorkSpace/temp_pre/data/cell_name.csv",sep=",", row.names = 1)
gene_name <-read.delim("D:/VSCode_WorkSpace/temp_pre/data/gene_name.csv",sep=",", row.names = 1)
HSMM <- ceiling(data[,-1])
rownames(HSMM) <- data[,1]
##读取艺的DE结果并构建后续分析使用的HSMMdataframe,并且读取cell_name和gene_name两个dataframe

library(monocle)
phenoData <- cell_name
featureData <- gene_name
pd <- new("AnnotatedDataFrame", data = phenoData)
fd <- new("AnnotatedDataFrame", data = featureData)
##调用monocle包,将读入的三个dataframe赋值给包内变量
HSMM <- newCellDataSet(as.matrix(HSMM),
                       phenoData = pd, 
                       featureData = fd,
                       expressionFamily = negbinomial.size())
HSMM <- estimateSizeFactors(HSMM)
HSMM <- estimateDispersions(HSMM)
##调用包内函数估计矩阵分布并normalize

HSMM <- detectGenes(HSMM, min_expr = 0.1)
##设置最小表达阈值

BMP10_id <- row.names(subset(fData(HSMM),gene_short_name =="BMP10"))
LIN28B_id <- row.names(subset(fData(HSMM),gene_short_name =="LIN28B"))
COX7B2_id <- row.names(subset(fData(HSMM),gene_short_name =="COX7B2"))
cth <- newCellTypeHierarchy()
cth <- addCellType(cth,"BMP10_Type",classify_func = function(x){x[BMP10_id,]>=1})
cth <- addCellType(cth,"LIN28B_Type",classify_func = function(x){x[LIN28B_id,]>=1 & x[BMP10_id,]<1})
cth <- addCellType(cth,"COX7B2_Type",classify_func = function(x){x[COX7B2_id,]>=1 & x[BMP10_id,]<1 & x[LIN28B_id,]<1})
##根据DE结果或者先验知识申明marker基因,设置分类函数

HSMM <- classifyCells(HSMM, cth, 0.1)
table(pData(HSMM)$CellType)
pie <- ggplot(pData(HSMM), 
              aes(x=factor(1),fill=factor(CellType))) +geom_bar(width = 1)
pie + coord_polar(theta="y")+
  theme(axis.title.x = element_blank(),axis.title.y = element_blank())
##根据分类函数将sample分类

提取可用于聚类的基因:

图片1

图片2

BMP10、COX7B2、LIN28B有很高的覆盖率,说明这几个基因可以用来“区分”不同的样品类群

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
disp_table <- dispersionTable(HSMM)
unsup_clustering_genes <- subset(disp_table, mean_expression >= 0.1)
HSMM <- setOrderingFilter(HSMM,unsup_clustering_genes$gene_id)
plot_ordering_genes(HSMM)
##聚类之前对plot的细胞进行质控
plot_pc_variance_explained(HSMM, return_all = F)
##查看每个主成分占variance比例
HSMM <- reduceDimension(HSMM, max_components = 3, num_dim = 15, reduction_method = 'tSNE', verbose = T)
HSMM <- clusterCells(HSMM, num_clusters = 7)
plot_cell_clusters(HSMM,x=1,y=2, color_by = "Cluster") + facet_wrap(~CellType)
plot_cell_clusters(HSMM, 1, 2,  color = "CellType", markers = c("BMP10","LIN28B","COX7B2"))
plot_cell_clusters(HSMM, 1, 2, 3, color="celltype")
##分别画三种tSNE图

expressed_genes <- row.names(subset(fData(HSMM),
                                    num_cells_expressed >= 5))
diff_test_res <- differentialGeneTest(HSMM[expressed_genes,],
                                      fullModelFormulaStr = "~celltype")
sig_genes <- subset(diff_test_res, qval < 0.1)
sig_genes[1:6,c("gene_short_name", "pval", "qval")]
##非监督地定义差异表达基因

ordering_genes <- row.names (subset(diff_test_res, qval < 0.01))
HSMM <- setOrderingFilter(HSMM, ordering_genes)
plot_ordering_genes(HSMM)
##画时序图之前的质控
HSMM <- reduceDimension(
    HSMM,
    max_components = 2,
    method = 'DDRTree')
HSMM <- orderCells(HSMM)
plot_cell_trajectory(HSMM)
##画时序图

8

图片4

拟时序分析结果:

图片6

差异基因互作的网络图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# 筛选差异基因列表
## 设置数据载入接口和阈值
fold_change_data=read.csv('fold_change.csv',header=T)
FDR_cut_threshold=0.01
FC_cut_threshold=2 ###绝对值

## 数据排序
head(fold_change_data)
library(dplyr)
fold_change_data=arrange(fold_change_data,FDR)
head(fold_change_data) ###再查看一次顺序有没有变化

## 根据threshold筛选数据
fold_change_data=fold_change_data[which((fold_change_data$FDR<=FDR_cut_threshold)&(abs(fold_change_data$logFC)>=FC_cut_threshold)), ]
## 检验筛选结果
a_test=ifelse((fold_change_data$FDR>FDR_cut_threshold)|(abs(fold_change_data$logFC)<FC_cut_threshold),1,0)
sum(a_test)    ### 输出结果为0,说明没有残留FDR>FDR_cut_threshold以及|logFC|小于FC_cut_threshold的情况

## 获得列表
write.csv(fold_change_data,'FC_FDR_filter_genelist.csv')
#---------------------------------------------------------------------
# 根据差异基因列表获得相应的蛋白互作网络
## 蛋白互作关系列表请自行下载,不推荐使用STRING

## 设置数据载入接口
pro_inter1=read.csv('bioplex_293T.csv',header=T,sep=',')
pro_inter2=read.csv('Bioplex_HCT116.csv',header=T,sep=',')
A_node=5 ###节点A基因名所在的列
B_node=6 ###节点B基因名所在的列
num_edge=9 ###A与B连接的数值所在的列

## 提取所需列并合并数据(节点和连接数值)
head(pro_inter1)
head(pro_inter2)
pro_inter1=pro_inter1[,c(A_node,B_node,num_edge)] ###提取节点的Gene Name和连接edge数值
pro_inter2=pro_inter2[,c(A_node,B_node,num_edge)]
mypro_inter=rbind(pro_inter1,pro_inter2)
View(mypro_inter)
mypro_inter=mypro_inter[which(mypro_inter$pInt>0.98),]###选择互作可信度高的结果

## 读取差异基因列表
gene_list=read.csv=read.csv('FC_FDR_filter_genelist.csv',header=T,sep=',')
View(gene_list)
## 匹配蛋白互作数据
gene_list_inter=mypro_inter[which(mypro_inter$SymbolA%in%gene_list$X),]
gene_list_inter2=mypro_inter[which(mypro_inter$SymbolB%in%gene_list$X),]
gene_list_inter=rbind(gene_list_inter,gene_list_inter2)
View(gene_list_inter)
mypro_inter=gene_list_inter
View(mypro_inter)
## 删除无用数据,扩大内存
rm(gene_list)
rm(gene_list_inter2)
rm(pro_inter1)
rm(pro_inter2)


## 过滤重复互作
### 额外增加一列作为检验是否重复
mypro_inter$dup_or_not=0

### For循环开始查找重复互作关系(对于AP-MS数据可以不执行这一步,因为节点A一般作为bait获得蛋白互作数据,因此A和B是有实验意义的,这种时候互作关系得到重复可以表明其在实验中的重复性)
n_rows_of_data=nrow(mypro_inter)
for (i in 1:(n_rows_of_data-1)) {
  for (n in (i+1):n_rows_of_data) {
    if (((mypro_inter[i,1]==mypro_inter[n,1])&(mypro_inter[i,2]==mypro_inter[n,2]))|((mypro_inter[i,2]==mypro_inter[n,1])&(mypro_inter[i,1]==mypro_inter[n,2]))){
      mypro_inter[n,4]=1
    }
    print(i)
    print(n)
  }
}
write.csv(mypro_inter,'mypro_inter_nodup.csv') ### 保存结果以防丢失
### 观察结果
mypro_inter_nodup=read.csv('mypro_inter_nodup.csv',header=T)
View(mypro_inter_nodup)
sum(mypro_inter_nodup$dup_or_not)###输出122对互作是重复的
mypro_inter_nodup=subset(mypro_inter_nodup,dup_or_not!=1)
dim(mypro_inter_nodup) ###查看后发现删除了122行重复互作关系

###整理结果,剔除无用列
mypro_inter_nodup=mypro_inter_nodup[,-c(1,5)]
write.csv(mypro_inter_nodup,'mypro_inter_nodup.csv') ##保存结果

#------------------------------------------------------------------
#------------------------------------------------------------------
#注释节点方便后面画图

##读取文件
mypro_inter_nodup=read.csv('mypro_inter_nodup.csv')
mypro_inter_nodup=mypro_inter_nodup[,-1]
gene_list=read.csv('FC_FDR_filter_genelist.csv',header=T,sep=',')## 准备数据
Uniprot_data=read.csv('Uniprot data.csv',header=T)
View(Uniprot_data)
## 注释节点
mypro_inter_nodup$A_in_difflist=ifelse((mypro_inter_nodup$SymbolA) %in% gene_list$X,1,0)
mypro_inter_nodup$B_in_difflist=ifelse((mypro_inter_nodup$SymbolB) %in% gene_list$X,1,0)

## 蛋白质家族注释
prt_family='Krueppel C2H2-type zinc-finger protein family'
Uniprot_family=Uniprot_data[which(Uniprot_data$Protein.families==prt_family),]
mypro_inter_nodup$A_is_ZNF=ifelse((mypro_inter_nodup$SymbolA %in% Uniprot_family$Gene.names...primary..),1,0) ###注释A节点是否为ZNF家族蛋白
mypro_inter_nodup$B_is_ZNF=ifelse((mypro_inter_nodup$SymbolB %in% Uniprot_family$Gene.names...primary..),1,0) ###注释B节点
View(mypro_inter_nodup)

## 连接类型注释
a=mypro_inter_nodup$A_in_difflist+mypro_inter_nodup$B_in_difflist
mypro_inter_nodup$edge_about_list=a ###判断连接中与差异基因列表重合的节点数
b=mypro_inter_nodup$A_is_ZNF+mypro_inter_nodup$B_is_ZNF
mypro_inter_nodup$edge_about_ZNF=b
write.table(mypro_inter_nodup,'data_for_cytoscape.txt',sep='\t')###输出文件
### 开始使用Cytoscape绘图

由于接下来的步骤不在R中进行,具体图片绘制方法详见视频【xxxxx】

7bb8b8ca410a4621740cd01b0cfe3ab