library(edgeR)
options(stringsAsFactors = F)
library(airway)#气道平滑肌细胞RNA—seq
help(airway)#数据包中的help信息可以提供很多有用的信息,如参数意思
data("airway")
#获取基因counts矩阵
expr<-assay(airway)
?assay
expr[1:5,1:5]
#获取分组信息
group_list<-colData(airway)$dex
group_list
library(clusterProfiler)#Y叔开发的包(重点介绍)ID转换
library(org.Hs.eg.db)#人类基因组注释包
keytypes(org.Hs.eg.db)#查看支持ID的转换类型
#转换ID
geneid<-bitr(rownames(expr),fromType = "ENSEMBL",toType = "SYMBOL",
OrgDb = "org.Hs.eg.db",drop=T)#去除空值
geneid
head(geneid)
#注释ID
expr<-expr[rownames(expr)%in%geneid[,1],]
dim(expr)
geneid<-geneid[match(rownames(expr),geneid[,1]),]
rownames(expr)<-geneid$SYMBOL
head(expr)
expr[1:5,1:5]
#去重(chong)
table(duplicated(rownames(expr)))#统计重复基因数目
dim(expr)
#对重复基因名取平均表达量
if(sum(duplicated(rownames(expr)))>0)#判断是否重复
expr1<-avereps(expr,ID=rownames(expr))
dim(expr1)
#过滤低表达基因
apply(expr1,2,fivenum)#查看样本表达基因
apply(expr1,2,function(x)sum(x>1))
#过滤方法1---自定义
expr2<-expr1[apply(expr1,1,function(x) sum(x>1)>1),]
dim(expr2)
save(group_list,expr2,file="expr.Rdata")
#过滤方法2
x<-edgeR::DGEList(counts=expr1)#构建DGEList
dim(x)
keep.exprs<-edgeR::filterByExpr(x,group=group_list)#判断基因是否低表达
x<-x[keep.exprs,,keep.lib.sizes=F]
dim(x)
head(keep.exprs)
dim(x)
#可视化过滤方法2中数据
library(RColorBrewer)
x<-edgeR::DGEList(counts=expr1,group=factor(group_list))
dim(x)
L<-mean(x$samples$lib.size)/10^6
M<-median(x$samples$lib.size)/10^6
lcpm<-cpm(x,log=T,prior.count = 2)#log2(CPM+2/L)
head(lcpm)
dim(x)
lcpm.cutoff<-log2(10/M+2/L)#过滤阈值
nsamples<-ncol(x)#提取样本数
col<-brewer.pal(nsamples,"Paired")#颜色
par(mfrow=c(1,2))
#过滤前图形
plot(density(lcpm[,1]),col=col[1],lwd=2,ylim=c(0,0.26),las=2,main="",
xlab="")
abline(v=lcpm.cutoff,lty=5)#阈值线
for(i in 2:nsamples){
den<-density(lcpm[,i])
lines(den$x,den$y,col=col[i],lwd=2)
}
title(main="A.Raw data",xlab="log-cpm")
legend("topright",colnames(x),text.col=col,bty="n",cex=0.5)
#过滤后图形
keep.exprs<-edgeR::filterByExpr(x,group=group_list)
x<-x[keep.exprs,,keep.lib.sizes=F]#过滤
dim(x)
Lcpm<-cpm(x,log=T,prior.count = 2)
head(Lcpm)
plot(density(Lcpm[,1]),col=col[1],lwd=2,ylim=c(0,0.26),las=2,main="",
xlab="")
abline(v=lcpm.cutoff,lty=5)#阈值线
for(i in 2:nsamples){
den<-density(Lcpm[,i])
lines(den$x,den$y,col=col[i],lwd=2)
}
title(main="B.Filter data",xlab="log-cpm")
legend("topright",colnames(x),text.col=col,bty="n",cex=0.5)
#无监督聚类MDS图
par(mfrow=c(1,1))
lcpm<-cpm(x,log=T)
col.group<-group_list
levels(col.group)<-brewer.pal(nlevels(col.group),"Set1")
col.group<-as.character(col.group)
plotMDS(lcpm,col=col.group,labels=group_list)
library(limma)
options(stringsAsFactors = F)
load("expr.RData")#过滤方法1下的基因表达矩阵
data<-expr2
dim(data)
# 分组矩阵
group_list = factor(group_list)
design <- model.matrix(~0+group_list)
rownames(design) = colnames(data)
colnames(design) <- levels(group_list)
# 差异比较矩阵
cont.matrix <- makeContrasts(contrasts = c('trt-untrt'),
levels = design)
#建模前归一化
#方法一:通过箱线图查看数据整齐与否后决定是否归一化
library(RColorBrewer)
dge <- DGEList(counts = data)
col <- brewer.pal(ncol(dge$counts), "Paired")
par(mfrow=c(2,2))
boxplot(dge$counts,outline=F, col=col)
title(main="A. Unnormalised ",ylab="raw count")
boxplot(calcNormFactors(dge, method = "TMM")$counts,outline=F,col=col)
title(main="B. TMM ",ylab="raw count")
boxplot(cpm(dge$counts),outline=F, col=col)
title(main="C. CPM ",ylab="cpm")
boxplot(cpm(dge$counts,log=TRUE),outline=F, col=col)
title(main="D. Log-CPM ",ylab="log-cpm")
从上面的图形可看出log-cpm归一化效果最好,若直接用log-cpm化效果不佳,可先通过TMM标准化在进行log-cpm化.
#方法2:用limma包中的voom进行进行归一化
dge <- DGEList(counts = data)
dge <- calcNormFactors(dge)
v <- voom(dge, design, plot=T) #会自动计算log(cpm)值
voom图:显示一个均值与方差之间递减的趋势,生物差异高的实验通常会到更平坦的趋势,其方差在高表达处稳定;生物差异低的实验很可能会急剧下降的趋势.
#建模---使用的为voom归一化后的数据
#拟合线性模型
fit<-lmFit(v,design)
#针对给定的对比计算估计系数以及标准误差
fit2<-contrasts.fit(fit,cont.matrix)
#计算t,F,差异表达倍数的对数
fit2<-eBayes(fit2)
plotSA(fit2)
#提取差异基因方法1
#用decideTests
#查看差异基因数目
summary(decideTests(fit2,lfc=1,p.value = 0.05))
#提取
allDEG<-topTable(fit2,coef="trt-untrt",n=Inf)
allDEG<-na.omit(allDEG)
dt<-decideTests(fit2)
summary(dt)
de.common<-which(dt[,1]!=0)
diff<-allDEG[de.common,]
head(dt)
dim(diff)
##检查
#将基因按照校正p值从小到大排列输出所有结果
df <- topTreat(fit2, coef=1, n=Inf)
head(df)
#基因集检验
library(clusterProfiler)
library(org.Hs.eg.db)
load("human_c2_v5p2.rdata")
#使用voom归一化的数据
geneid<-bitr(rownames(v), fromType='SYMBOL', toType='ENTREZID',
OrgDb='org.Hs.eg.db', drop = TRUE)
idx <- ids2indices(Hs.c2,id=geneid$ENTREZID) #匹配基因集
cam<- camera(v,idx,design,contrast=cont.matrix)
head(idx,3)
head(cam)
getwd()
#提取2 自定义阈值
allDEG <- topTable(fit2, coef = 'trt-untrt', n = Inf)
allDEG <- na.omit(allDEG)
padj = 0.05
foldChange= 1
diff_signif = allDEG[(allDEG$adj.P.Val < padj & abs(allDEG$logFC)>foldChange),]
diff_signif = diff_signif[order(diff_signif$logFC),]
save(diff_signif, file = 'limma_diff.Rdata')
dim(diff_signif)
head(diff_signif)
a<-diff_signif
#火山图
load(file="limma_diff.Rdata")
library(ggplot2)
diff_signif[which(diff_signif$P.Value<0.05&
diff_signif$logFC>=2),"diff"]<-"up"
diff_signif[which(diff_signif$P.Value<0.05&
diff_signif$logFC<=-2),"diff"]<-"down"
diff_signif[!(diff_signif$diff %in% c("up","down")),"diff"]<-"no"
p1<-ggplot(diff_signif,aes(x=-logFC,y=-log10(P.Value)))+
geom_point(aes(col=diff),size=1.5)+
scale_color_manual(limits=c("up","down","no"),
values=c("blue","red","gray40"),labels=c("enouts","douts","nouts"))
p1
#allsample_top100sd_count_heatmap
load("expr.Rdata")
library(pheatmap)
head(expr2)
cg<-names(tail(apply(expr2,1,sd),100))#取表达量标准差前100的行名
head(cg)
pheatmap(expr2[cg,])
#中心化、标准化allsample_top100sd_cutree1.pheatmap
table(group_list)
n<-t(scale(t(expr2[cg,])))
n[1:3,1:3]
n[n>2]=2
n[n<-2]=-2
ac<-data.frame(group_list)
#判断ac的行数是否等于n的列数
dim(ac)
dim(n)
rownames(ac)<-colnames(n)
pheatmap(n,show_rownames = F,show_colnames = F,annotation_col = ac)
#对表达量标准差前100的基因信息进行聚类
hc<-hclust(dist(t(n)))
plot(hc)
#剪枝
c<-cutree(hc,2)
group_list1<-as.factor(c)#转换成因子
table(group_list1)#统计样本分组信息
table(group_list1,group_list)#得到混淆矩阵
ac1<-data.frame(g=group_list1)
rownames(ac1)<-colnames(n)
pheatmap(n,show_rownames = F,show_colnames = T,annotation_col = ac1)
基因富集分析是分析基因表达信息的一种方法,富集是指将基因按照先验知识,也就是基因组注释信息进行分类。 人类有约30,000个基因,总碱基对的数量约32亿。目前约有3.2亿可能的碱基对变异情况,而每两个人之间的差异约为2千万个碱基对,也就是总碱基对的0.6%。换句话说,人与人之间的基因序列相似度高达99%以上。这些细微的差别,导致了我们长得不同,性格也不同。那么怎么更好的理解这些不同呢?可以按照功能、通路等性质将基因做划分,这也是基因富集分析的重要作用之一。
作者:师师 链接:https://www.zhihu.com/question/30778984/answer/91262467 来源:知乎 著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。
#富集图
library(clusterProfiler)#用来富集分析的包
load("limma_diff.Rdata")
dim(diff_signif)
library(clusterProfiler)
library(org.Hs.eg.db)
library(topGO)
library(pathview)
c<-diff_signif
#将gene symbol 转换成行名
c$SYMBOL<-rownames(c)
head(c)
rownames(c)<-1:660
head(c)
class(c$SYMBOL)
bitr(geneID, fromType, toType, OrgDb, drop = TRUE); geneid :基因ID输入 ; fromtype : 输入ID型;toType:输出ID型;orgdb :注释数据库)
#数据转换
test1 = bitr(unique(c$SYMBOL), fromType="SYMBOL",
toType= "ENTREZID", OrgDb="org.Hs.eg.db")
head(test1)
dim(test1)
GO数据库总共有三大类,分别是生物学过程(Biological Process,BP)、细胞定位(Cellular Component,CC)、分子功能(Molecular Function,MF),各自描述了基因产物可能行使的分子功能,所处的细胞环境,以及参与的生物学过程.GO数据库中一个基本的概念是节点,比如"Cell"、"Fibroblast Growth Factor Receptor Binding"或者"Signal Transduction",同时有一个唯一的编号,如"GO:nnnnnnnn ".基于鉴定到的蛋白ID,采用mapping(比对)的方式从Uniprot数据库中 得到蛋白质的GO数据库注释信息,对蛋白质进行功能分类注释.
#GO分析
#groupGO富集分析
ggo <- groupGO(gene = test1$ENTREZID, OrgDb = org.Hs.eg.db,
ont = "CC",level = 3,readable = TRUE)
head(ggo)
#enrichGO富集分析
ego_ALL <- enrichGO(gene = test1$ENTREZID,
OrgDb = org.Hs.eg.db,
ont = "ALL",
pAdjustMethod = "BH",
pvalueCutoff = 1,
qvalueCutoff = 1,
readable = T)
head(ego_ALL,2)
#setReadble函数进行转换
ego_MF <- enrichGO(gene = test1$ENTREZID,
OrgDb = org.Hs.eg.db,ont = "MF",
pAdjustMethod = "BH",pvalueCutoff = 1,
qvalueCutoff = 1,readable = FALSE)#ont可以选择"All","MF","BP","CC"
ego_MF1 <- setReadable(ego_MF, OrgDb = org.Hs.eg.db)
head(ego_MF,2)
#可视化--点图
dotplot(ego_MF,title="EnrichmentGO_MF_dot")#点图,按富集的数从大到小的
#条状图,按p从小到大排,绘制前20个Term
barplot(ego_MF, showCategory=20,title="EnrichmentGO_MF")
plotGOgraph(ego_MF)#差异表达蛋白的topGO分子功能有向无环图
#KEGG分析
#候选基因进行通路分析
k<-enrichKEGG(gene=test1$ENTREZID,organism = 'hsa',
pvalueCutoff = 1)
head(k,2)
#KEGG气泡图
dotplot(k,title="enrich keggdotplot")
#pathview通路可视化
library(pathview)
pathview(test1$ENTREZID,pathway.id = "04933",species = "hsa")
#加载差异表达矩阵
load("limma_diff.Rdata")
head(diff_signif,3)
dim(diff_signif)
#设定阈值计算基因上调下调数目
a=diff_signif
logFC_t=1.5
a$g<-ifelse(a$P.Value>0.05,'stable',ifelse( a$logFC > logFC_t,'UP',
ifelse(a$logFC < -logFC_t,'DOWN','stable')))
table(a$g)
head(a)
#将gene SYMBOL加入差异表达矩阵
a$symbol<-rownames(a)
head(a)
rownames(a)<-1:660
head(a)
#ID转换
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
df <- bitr(unique(a$symbol), fromType = "SYMBOL",
toType = c( "ENTREZID"),
OrgDb = org.Hs.eg.db)
dim(df)
head(df,2)
#把数据DEG,df通过,DEG的'symbol'列,df的'SYMBOL'列连接在一起,转化ID
DEG<-a
DEG=merge(DEG,df,by.y='SYMBOL',by.x='symbol')
head(DEG)
save(DEG,file = 'anno_DEG.Rdata')
#得出差异基因
gene_up= DEG[DEG$g == 'UP','ENTREZID'] #选出上调基因ID
gene_down=DEG[DEG$g == 'DOWN','ENTREZID'] #选出下调基因ID
gene_diff=c(gene_up,gene_down)#得出上下调基因ID
gene_all=as.character(DEG[ ,'ENTREZID'] )#得出所有基因ID
head(gene_all,3)
#geneList
data(geneList, package="DOSE")#得出geneList数据
head(geneList,4)#查看
boxplot(geneList)
boxplot(DEG$logFC)
#得到排序后的geneList
geneList<-DEG$logFC#赋值
names(geneList)<-DEG$ENTREZID
geneList<-sort(geneList,decreasing = T)#从大到小排序
head(geneList)
#KEGG
kk.up <- enrichKEGG(gene = gene_up,
organism = 'hsa',
universe= gene_all,
pvalueCutoff = 1,
qvalueCutoff =1)
head(kk.up,3)
dotplot(kk.up )
kk.down<- enrichKEGG(gene = gene_down,
organism = 'hsa',
universe = gene_all,
pvalueCutoff = 1,
qvalueCutoff =1)
dotplot(kk.down )
kk.diff<-enrichKEGG(gene=gene_diff,organism = 'hsa',pvalueCutoff = 0.05)
dotplot(kk.diff)
kegg_diff_dt <- as.data.frame(kk.diff)
kegg_down_dt <- as.data.frame(kk.down)
kegg_up_dt <- as.data.frame(kk.up)
down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.05,]
down_kegg$group=-1
up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.05,]
up_kegg$group=1
b<-rbind(down_kegg,up_kegg)
head(b)
p<-ggplot(b,aes(x=GeneRatio,y=Description))+geom_point(aes(col=pvalue,size=Count))+
facet_grid(~group)
p+scale_color_gradient2(low="navyblue",mid="blue",high="yellow")
head(kegg_diff_dt,3)
j<-ggplot(kegg_diff_dt,aes(x=GeneRatio,y=Description))+geom_point(aes(col=pvalue,size=Count))
j+scale_color_gradient2(low="navyblue",mid="blue",high="yellow")
kk_gse <- gseKEGG(geneList = geneList,
organism = 'hsa',
pvalueCutoff = 1,
verbose = FALSE)
head(kk_gse)[,1:6]
gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,]
down_kegg$group=-1
up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,]
up_kegg$group=1
head(up_kegg)
h<-rbind(down_kegg,up_kegg)
m<-ggplot(h,aes(x=setSize,y=Description))+geom_point(aes(col=pvalue,size=enrichmentScore))+ facet_grid(~group)
m+scale_color_gradient2(low="navyblue",mid="blue",high="yellow")