R Packages for DGE

limma edgeR DESeq clusterProfiler

Bioconductor

http://www.bioconductor.org/

数据预处理

In [5]:
library(edgeR)
options(stringsAsFactors = F)
library(airway)#气道平滑肌细胞RNA—seq
help(airway)#数据包中的help信息可以提供很多有用的信息,如参数意思
data("airway")
#获取基因counts矩阵
expr<-assay(airway)
?assay
expr[1:5,1:5]
#获取分组信息
group_list<-colData(airway)$dex
group_list
Loading required package: limma


Attaching package: 'limma'


The following object is masked from 'package:BiocGenerics':

    plotMA


Loading required package: SummarizedExperiment

Loading required package: GenomicRanges

Loading required package: GenomeInfoDb

Warning message:
"package 'GenomeInfoDb' was built under R version 3.6.3"
Loading required package: DelayedArray

Warning message:
"package 'DelayedArray' was built under R version 3.6.3"
Loading required package: matrixStats

Warning message:
"package 'matrixStats' was built under R version 3.6.3"

Attaching package: 'matrixStats'


The following objects are masked from 'package:Biobase':

    anyMissing, rowMedians


Loading required package: BiocParallel


Attaching package: 'DelayedArray'


The following objects are masked from 'package:matrixStats':

    colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges


The following object is masked from 'package:clusterProfiler':

    simplify


The following objects are masked from 'package:base':

    aperm, apply, rowsum


A matrix: 5 × 5 of type int
SRR1039508SRR1039509SRR1039512SRR1039513SRR1039516
ENSG000000000036794488734081138
ENSG00000000005 0 0 0 0 0
ENSG00000000419467515621365 587
ENSG00000000457260211263164 245
ENSG00000000460 60 55 40 35 78
  1. untrt
  2. trt
  3. untrt
  4. trt
  5. untrt
  6. trt
  7. untrt
  8. trt
Levels:
  1. 'trt'
  2. 'untrt'

注释基因

In [6]:
library(clusterProfiler)#Y叔开发的包(重点介绍)ID转换
library(org.Hs.eg.db)#人类基因组注释包
keytypes(org.Hs.eg.db)#查看支持ID的转换类型
#转换ID
geneid<-bitr(rownames(expr),fromType = "ENSEMBL",toType = "SYMBOL",
             OrgDb = "org.Hs.eg.db",drop=T)#去除空值
geneid
head(geneid)
  1. 'ACCNUM'
  2. 'ALIAS'
  3. 'ENSEMBL'
  4. 'ENSEMBLPROT'
  5. 'ENSEMBLTRANS'
  6. 'ENTREZID'
  7. 'ENZYME'
  8. 'EVIDENCE'
  9. 'EVIDENCEALL'
  10. 'GENENAME'
  11. 'GO'
  12. 'GOALL'
  13. 'IPI'
  14. 'MAP'
  15. 'OMIM'
  16. 'ONTOLOGY'
  17. 'ONTOLOGYALL'
  18. 'PATH'
  19. 'PFAM'
  20. 'PMID'
  21. 'PROSITE'
  22. 'REFSEQ'
  23. 'SYMBOL'
  24. 'UCSCKG'
  25. 'UNIGENE'
  26. 'UNIPROT'
'select()' returned 1:many mapping between keys and columns

Warning message in bitr(rownames(expr), fromType = "ENSEMBL", toType = "SYMBOL", :
"59.46% of input gene IDs are fail to map..."
A data.frame: 26268 × 2
ENSEMBLSYMBOL
<chr><chr>
1ENSG00000000003TSPAN6
2ENSG00000000005TNMD
3ENSG00000000419DPM1
4ENSG00000000457SCYL3
5ENSG00000000460C1orf112
6ENSG00000000938FGR
7ENSG00000000971CFH
8ENSG00000001036FUCA2
9ENSG00000001084GCLC
10ENSG00000001167NFYA
11ENSG00000001460STPG1
12ENSG00000001461NIPAL3
13ENSG00000001497LAS1L
14ENSG00000001561ENPP4
15ENSG00000001617SEMA3F
16ENSG00000001626CFTR
17ENSG00000001629ANKIB1
18ENSG00000001630CYP51A1
19ENSG00000001631KRIT1
20ENSG00000002016RAD52
22ENSG00000002330BAD
23ENSG00000002549LAP3
24ENSG00000002586CD99
25ENSG00000002587HS3ST1
26ENSG00000002726AOC1
27ENSG00000002745WNT16
28ENSG00000002746HECW1
29ENSG00000002822MAD1L1
30ENSG00000002834LASP1
31ENSG00000002919SNX11
.........
63496ENSG00000273032DGCR5
63497ENSG00000273033LINC02035
63509ENSG00000273045C2orf15
63510ENSG00000273046HOXC5
63525ENSG00000273061CDC37L1-DT
63543ENSG00000273079GRIN2B
63549ENSG00000273085OR52E1
63550ENSG00000273086TAS2R9
63556ENSG00000273092TAS2R20
63575ENSG00000273111LYPD4
63589ENSG00000273125LINC01990
63593ENSG00000273129PACERR
63600ENSG00000273136NBPF26
63631ENSG00000273167SPATA13
63637ENSG00000273173SNURF
63677ENSG00000273213H3-2
63702ENSG00000273238TMEM271
63729ENSG00000273265CNNM3-DT
63738ENSG00000273274ZBTB8B
63758ENSG00000273294C1QTNF3-AMACR
63761ENSG00000273297LOC105375519
63775ENSG00000273311DGCR11
63777ENSG00000273313RBAKDN
63790ENSG00000273326TAS2R7
63808ENSG00000273344PAXIP1-AS1
63860ENSG00000273396LINC01396
63861ENSG00000273397NDUFA6
63873ENSG00000273409LINC02712
63895ENSG00000273431TAS2R50
63921ENSG00000273457TAS2R13
A data.frame: 6 × 2
ENSEMBLSYMBOL
<chr><chr>
1ENSG00000000003TSPAN6
2ENSG00000000005TNMD
3ENSG00000000419DPM1
4ENSG00000000457SCYL3
5ENSG00000000460C1orf112
6ENSG00000000938FGR
In [7]:
#注释ID
expr<-expr[rownames(expr)%in%geneid[,1],]
dim(expr)
geneid<-geneid[match(rownames(expr),geneid[,1]),]
rownames(expr)<-geneid$SYMBOL
head(expr)
expr[1:5,1:5]
  1. 25988
  2. 8
A matrix: 6 × 8 of type int
SRR1039508SRR1039509SRR1039512SRR1039513SRR1039516SRR1039517SRR1039520SRR1039521
TSPAN667944887340811381047770572
TNMD 0 0 0 0 0 0 0 0
DPM1467515621365 587 799417508
SCYL3260211263164 245 331233229
C1orf112 60 55 40 35 78 63 76 60
FGR 0 0 2 0 1 0 0 0
A matrix: 5 × 5 of type int
SRR1039508SRR1039509SRR1039512SRR1039513SRR1039516
TSPAN66794488734081138
TNMD 0 0 0 0 0
DPM1467515621365 587
SCYL3260211263164 245
C1orf112 60 55 40 35 78
In [8]:
#去重(chong)
table(duplicated(rownames(expr)))#统计重复基因数目
dim(expr)
#对重复基因名取平均表达量
if(sum(duplicated(rownames(expr)))>0)#判断是否重复
  expr1<-avereps(expr,ID=rownames(expr))
dim(expr1)
FALSE  TRUE 
24811  1177 
  1. 25988
  2. 8
  1. 24811
  2. 8
In [9]:
#过滤低表达基因
apply(expr1,2,fivenum)#查看样本表达基因
apply(expr1,2,function(x)sum(x>1))
#过滤方法1---自定义
expr2<-expr1[apply(expr1,1,function(x) sum(x>1)>1),]
dim(expr2)
save(group_list,expr2,file="expr.Rdata")
A matrix: 5 × 8 of type dbl
SRR1039508SRR1039509SRR1039512SRR1039513SRR1039516SRR1039517SRR1039520SRR1039521
0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0
24 20 32 16 27 31 25 22
473 418 540 312 545 659 427 447
297906255662513766273878397791401539378834372489
SRR1039508
15694
SRR1039509
15591
SRR1039512
16068
SRR1039513
15177
SRR1039516
15951
SRR1039517
16090
SRR1039520
15734
SRR1039521
15557
  1. 16777
  2. 8
In [10]:
#过滤方法2
x<-edgeR::DGEList(counts=expr1)#构建DGEList
dim(x)
keep.exprs<-edgeR::filterByExpr(x,group=group_list)#判断基因是否低表达
x<-x[keep.exprs,,keep.lib.sizes=F]
dim(x)
head(keep.exprs)
dim(x)
  1. 24811
  2. 8
  1. 13501
  2. 8
TSPAN6
TRUE
TNMD
FALSE
DPM1
TRUE
SCYL3
TRUE
C1orf112
TRUE
FGR
FALSE
  1. 13501
  2. 8
In [11]:
#可视化过滤方法2中数据
library(RColorBrewer)
x<-edgeR::DGEList(counts=expr1,group=factor(group_list))
dim(x)
L<-mean(x$samples$lib.size)/10^6
M<-median(x$samples$lib.size)/10^6
lcpm<-cpm(x,log=T,prior.count = 2)#log2(CPM+2/L)
head(lcpm)
dim(x)
lcpm.cutoff<-log2(10/M+2/L)#过滤阈值
nsamples<-ncol(x)#提取样本数
col<-brewer.pal(nsamples,"Paired")#颜色
par(mfrow=c(1,2))
#过滤前图形
plot(density(lcpm[,1]),col=col[1],lwd=2,ylim=c(0,0.26),las=2,main="",
     xlab="")
abline(v=lcpm.cutoff,lty=5)#阈值线
for(i in 2:nsamples){
  den<-density(lcpm[,i])
  lines(den$x,den$y,col=col[i],lwd=2)
}
title(main="A.Raw data",xlab="log-cpm")
legend("topright",colnames(x),text.col=col,bty="n",cex=0.5)
#过滤后图形
keep.exprs<-edgeR::filterByExpr(x,group=group_list)
x<-x[keep.exprs,,keep.lib.sizes=F]#过滤
dim(x)
Lcpm<-cpm(x,log=T,prior.count = 2)
head(Lcpm)
plot(density(Lcpm[,1]),col=col[1],lwd=2,ylim=c(0,0.26),las=2,main="",
     xlab="")
abline(v=lcpm.cutoff,lty=5)#阈值线
for(i in 2:nsamples){
  den<-density(Lcpm[,i])
  lines(den$x,den$y,col=col[i],lwd=2)
}
title(main="B.Filter data",xlab="log-cpm")
legend("topright",colnames(x),text.col=col,bty="n",cex=0.5)
  1. 24811
  2. 8
A matrix: 6 × 8 of type dbl
SRR1039508SRR1039509SRR1039512SRR1039513SRR1039516SRR1039517SRR1039520SRR1039521
TSPAN6 5.141473 4.672112 5.2001552 4.839259 5.638631 5.172082 5.420317 4.841827
TNMD-3.367032-3.367032-3.3670321-3.367032-3.367032-3.367032-3.367032-3.367032
DPM1 4.603281 4.872472 4.7103095 4.679162 4.686195 4.783291 4.538267 4.671255
SCYL3 3.762943 3.592004 3.4780315 3.531635 3.433178 3.519089 3.703294 3.528445
C1orf112 1.681397 1.684784 0.8293573 1.347256 1.809396 1.176655 2.109043 1.629866
FGR-3.367032-3.367032-2.4665716-3.367032-2.830181-3.367032-3.367032-3.367032
  1. 24811
  2. 8
  1. 13501
  2. 8
A matrix: 6 × 8 of type dbl
SRR1039508SRR1039509SRR1039512SRR1039513SRR1039516SRR1039517SRR1039520SRR1039521
TSPAN65.1424494.6730675.20119224.8401055.6396655.1730125.4216564.842878
DPM14.6042574.8734274.71134654.6800084.6872294.7842224.5396064.672305
SCYL33.7639193.5929593.47906843.5324823.4342123.5200203.7046323.529495
C1orf1121.6823741.6857410.83039341.3481081.8104301.1775892.1103761.630916
CFH7.3987117.7059858.02077538.2171828.1995078.5662058.1677828.643355
FUCA26.2179185.9151066.18852165.9480535.9625505.6307606.2398565.795687
In [15]:
#无监督聚类MDS图
par(mfrow=c(1,1))
lcpm<-cpm(x,log=T)
col.group<-group_list
levels(col.group)<-brewer.pal(nlevels(col.group),"Set1")

col.group<-as.character(col.group)
plotMDS(lcpm,col=col.group,labels=group_list)
Warning message in brewer.pal(nlevels(col.group), "Set1"):
"minimal value for n is 3, returning requested palette with 3 different levels
"

差异性分析(Limma)

In [12]:
library(limma)
options(stringsAsFactors = F)
load("expr.RData")#过滤方法1下的基因表达矩阵
data<-expr2
dim(data)
  1. 16777
  2. 8
In [14]:
# 分组矩阵
group_list = factor(group_list)
design <- model.matrix(~0+group_list)
rownames(design) = colnames(data)
colnames(design) <- levels(group_list)
In [15]:
# 差异比较矩阵
cont.matrix <- makeContrasts(contrasts = c('trt-untrt'), 
                             levels = design)
In [16]:
#建模前归一化
#方法一:通过箱线图查看数据整齐与否后决定是否归一化
library(RColorBrewer)
dge <- DGEList(counts = data)
col <- brewer.pal(ncol(dge$counts), "Paired")
par(mfrow=c(2,2))
boxplot(dge$counts,outline=F, col=col)
title(main="A. Unnormalised ",ylab="raw count")
boxplot(calcNormFactors(dge, method = "TMM")$counts,outline=F,col=col)
title(main="B. TMM ",ylab="raw count")
boxplot(cpm(dge$counts),outline=F, col=col)
title(main="C. CPM ",ylab="cpm")
boxplot(cpm(dge$counts,log=TRUE),outline=F, col=col)
title(main="D. Log-CPM ",ylab="log-cpm")

结论

从上面的图形可看出log-cpm归一化效果最好,若直接用log-cpm化效果不佳,可先通过TMM标准化在进行log-cpm化.

In [17]:
#方法2:用limma包中的voom进行进行归一化
dge <- DGEList(counts = data)
dge <- calcNormFactors(dge)
v <- voom(dge, design, plot=T) #会自动计算log(cpm)值

voom图:显示一个均值与方差之间递减的趋势,生物差异高的实验通常会到更平坦的趋势,其方差在高表达处稳定;生物差异低的实验很可能会急剧下降的趋势.

In [18]:
#建模---使用的为voom归一化后的数据
#拟合线性模型
fit<-lmFit(v,design)
#针对给定的对比计算估计系数以及标准误差
fit2<-contrasts.fit(fit,cont.matrix)
#计算t,F,差异表达倍数的对数
fit2<-eBayes(fit2)
plotSA(fit2)
In [19]:
#提取差异基因方法1
#用decideTests
#查看差异基因数目
summary(decideTests(fit2,lfc=1,p.value = 0.05))
#提取
allDEG<-topTable(fit2,coef="trt-untrt",n=Inf)
allDEG<-na.omit(allDEG)
dt<-decideTests(fit2)
summary(dt)
de.common<-which(dt[,1]!=0)
diff<-allDEG[de.common,]
head(dt)
dim(diff)
       trt-untrt
Down         317
NotSig     16117
Up           343
       trt-untrt
Down         811
NotSig     14965
Up          1001
A TestResults: 6 × 1 of type dbl
trt-untrt
TSPAN60
DPM10
SCYL30
C1orf1120
CFH0
FUCA20
  1. 1812
  2. 6
In [30]:
##检查
#将基因按照校正p值从小到大排列输出所有结果
df <- topTreat(fit2, coef=1, n=Inf) 
head(df)
#基因集检验
library(clusterProfiler)
library(org.Hs.eg.db)
load("human_c2_v5p2.rdata")
#使用voom归一化的数据
geneid<-bitr(rownames(v), fromType='SYMBOL', toType='ENTREZID',
             OrgDb='org.Hs.eg.db', drop = TRUE) 
idx <- ids2indices(Hs.c2,id=geneid$ENTREZID) #匹配基因集
cam<- camera(v,idx,design,contrast=cont.matrix)   
head(idx,3)
head(cam)
A data.frame: 6 × 6
logFCAveExprtP.Valueadj.P.ValB
<dbl><dbl><dbl><dbl><dbl><dbl>
SPARCL14.5749654.25407918.785131.305430e-080.00013202859.324703
PHC21.3823526.92608616.836713.459629e-080.00013202859.394810
STOM1.4420848.94320516.732093.656354e-080.00013202859.177518
PER13.1872804.50759716.720643.678642e-080.00013202859.038168
MT2A2.2013737.11124716.594203.934805e-080.00013202859.260883
DUSP12.9505626.73163016.178334.927101e-080.00013777009.059292
'select()' returned 1:many mapping between keys and columns

$KEGG_GLYCOLYSIS_GLUCONEOGENESIS
  1. 120
  2. 891
  3. 903
  4. 1067
  5. 1150
  6. 1321
  7. 1720
  8. 2144
  9. 2332
  10. 2701
  11. 2937
  12. 3113
  13. 3131
  14. 3195
  15. 3395
  16. 3429
  17. 3440
  18. 3443
  19. 3452
  20. 4119
  21. 5507
  22. 5593
  23. 5720
  24. 5919
  25. 6321
  26. 6367
  27. 7033
  28. 7142
  29. 7291
  30. 7921
  31. 7968
  32. 8157
  33. 8378
  34. 8521
  35. 8812
  36. 9708
  37. 9759
  38. 10075
  39. 10077
  40. 10375
  41. 10568
  42. 10895
  43. 11006
  44. 11060
  45. 11162
  46. 12605
  47. 13104
  48. 13406
  49. 13482
  50. 13740
  51. 13793
  52. 15019
  53. 15925
$KEGG_CITRATE_CYCLE_TCA_CYCLE
  1. 304
  2. 750
  3. 918
  4. 1109
  5. 1720
  6. 1728
  7. 2055
  8. 2144
  9. 2236
  10. 2835
  11. 4081
  12. 4325
  13. 4623
  14. 5556
  15. 5593
  16. 6206
  17. 6583
  18. 7165
  19. 7608
  20. 7968
  21. 9370
  22. 10007
  23. 10375
  24. 11063
  25. 11262
  26. 12264
  27. 13647
  28. 14184
$KEGG_PENTOSE_PHOSPHATE_PATHWAY
  1. 374
  2. 581
  3. 891
  4. 1321
  5. 2296
  6. 2701
  7. 3195
  8. 5380
  9. 6321
  10. 7033
  11. 7094
  12. 7667
  13. 7921
  14. 8157
  15. 8249
  16. 8892
  17. 9480
  18. 9759
  19. 10568
  20. 10870
  21. 11713
  22. 13699
  23. 15421
A data.frame: 6 × 4
NGenesDirectionPValueFDR
<dbl><chr><dbl><dbl>
BIOCARTA_ACTINY_PATHWAY19Up3.896315e-050.1839451
REACTOME_SEMA4D_IN_SEMAPHORIN_SIGNALING28Up5.361421e-040.5368448
CHEN_LVAD_SUPPORT_OF_FAILING_HEART_UP99Up6.463547e-040.5368448
DAVIES_MULTIPLE_MYELOMA_VS_MGUS_DN21Up7.632493e-040.5368448
STEGER_ADIPOGENESIS_UP15Up8.956429e-040.5368448
JECHLINGER_EPITHELIAL_TO_MESENCHYMAL_TRANSITION_DN61Up9.174624e-040.5368448
In [23]:
getwd()
'C:/Users/31207'
In [25]:
#提取2 自定义阈值
allDEG <- topTable(fit2, coef = 'trt-untrt', n = Inf)
allDEG <- na.omit(allDEG)
padj = 0.05
foldChange= 1
diff_signif = allDEG[(allDEG$adj.P.Val < padj & abs(allDEG$logFC)>foldChange),]                    
diff_signif = diff_signif[order(diff_signif$logFC),]
save(diff_signif, file = 'limma_diff.Rdata')
dim(diff_signif)
head(diff_signif)
a<-diff_signif
  1. 660
  2. 6
A data.frame: 6 × 6
logFCAveExprtP.Valueadj.P.ValB
<dbl><dbl><dbl><dbl><dbl><dbl>
LINC00906-4.387631-1.15315187 -3.6838664.918911e-030.0472920174-1.9708390
MCHR1-4.207932-2.72930547 -8.6337291.088733e-050.0015639303 0.4953098
LRRTM2-4.169559-0.02831165 -8.2945871.511796e-050.0018649561 2.1409081
VCAM1-3.686535 3.65813483-10.6919941.814316e-060.0006764174 5.4886506
VASH2-3.627419-1.00508844 -8.4038561.358500e-050.0017917458 1.6860112
SMTNL2-3.522934 0.41710895 -3.6958344.827700e-030.0469016124-1.7974586

数据可视化分析之火山图&热图&富集图

In [21]:
#火山图
load(file="limma_diff.Rdata")
library(ggplot2)
diff_signif[which(diff_signif$P.Value<0.05&
              diff_signif$logFC>=2),"diff"]<-"up"
diff_signif[which(diff_signif$P.Value<0.05&
                 diff_signif$logFC<=-2),"diff"]<-"down"
diff_signif[!(diff_signif$diff %in% c("up","down")),"diff"]<-"no"
p1<-ggplot(diff_signif,aes(x=-logFC,y=-log10(P.Value)))+
  geom_point(aes(col=diff),size=1.5)+
  scale_color_manual(limits=c("up","down","no"),
 values=c("blue","red","gray40"),labels=c("enouts","douts","nouts"))
p1

基因表达量热图

In [1]:
#allsample_top100sd_count_heatmap
load("expr.Rdata")
library(pheatmap)
head(expr2)
cg<-names(tail(apply(expr2,1,sd),100))#取表达量标准差前100的行名
head(cg)
pheatmap(expr2[cg,])
A matrix: 6 × 8 of type dbl
SRR1039508SRR1039509SRR1039512SRR1039513SRR1039516SRR1039517SRR1039520SRR1039521
TSPAN6 679 448 873 4081138 1047 770 572
DPM1 467 515 621 365 587 799 417 508
SCYL3 260 211 263 164 245 331 233 229
C1orf112 60 55 40 35 78 63 76 60
CFH325136796177425267211102751767995
FUCA2143310621733 8811424 143913591109
  1. 'FENDRR'
  2. 'LOC644656'
  3. 'HAVCR1P1'
  4. 'LOC93429'
  5. 'MIR4453HG'
  6. 'LOC105372482'
In [2]:
#中心化、标准化allsample_top100sd_cutree1.pheatmap
table(group_list)
n<-t(scale(t(expr2[cg,])))
n[1:3,1:3]
n[n>2]=2
n[n<-2]=-2
ac<-data.frame(group_list)
#判断ac的行数是否等于n的列数
dim(ac)
dim(n)
rownames(ac)<-colnames(n)
pheatmap(n,show_rownames = F,show_colnames = F,annotation_col = ac)
group_list
  trt untrt 
    4     4 
A matrix: 3 × 3 of type dbl
SRR1039508SRR1039509SRR1039512
FENDRR0.3229845-0.3497569-0.08615209
LOC6446561.7677670-1.0606602 1.29636243
HAVCR1P11.1692679-0.7015608-0.70156076
  1. 8
  2. 1
  1. 100
  2. 8
In [3]:
#对表达量标准差前100的基因信息进行聚类
hc<-hclust(dist(t(n)))
plot(hc)
In [30]:
#剪枝
c<-cutree(hc,2)
group_list1<-as.factor(c)#转换成因子
table(group_list1)#统计样本分组信息
table(group_list1,group_list)#得到混淆矩阵
ac1<-data.frame(g=group_list1)
rownames(ac1)<-colnames(n)
pheatmap(n,show_rownames = F,show_colnames = T,annotation_col = ac1)
group_list1
1 2 
5 3 
           group_list
group_list1 trt untrt
          1   1     4
          2   3     0

富集分析

基因富集分析是分析基因表达信息的一种方法,富集是指将基因按照先验知识,也就是基因组注释信息进行分类。 人类有约30,000个基因,总碱基对的数量约32亿。目前约有3.2亿可能的碱基对变异情况,而每两个人之间的差异约为2千万个碱基对,也就是总碱基对的0.6%。换句话说,人与人之间的基因序列相似度高达99%以上。这些细微的差别,导致了我们长得不同,性格也不同。那么怎么更好的理解这些不同呢?可以按照功能、通路等性质将基因做划分,这也是基因富集分析的重要作用之一。

作者:师师 链接:https://www.zhihu.com/question/30778984/answer/91262467 来源:知乎 著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。

In [29]:
#富集图
library(clusterProfiler)#用来富集分析的包
In [3]:
load("limma_diff.Rdata")
dim(diff_signif)
library(clusterProfiler)
library(org.Hs.eg.db)
library(topGO)
library(pathview)
c<-diff_signif
#将gene symbol 转换成行名
c$SYMBOL<-rownames(c)
head(c)
rownames(c)<-1:660
head(c)
class(c$SYMBOL)
  1. 660
  2. 6
A data.frame: 6 × 7
logFCAveExprtP.Valueadj.P.ValBSYMBOL
<dbl><dbl><dbl><dbl><dbl><dbl><chr>
LINC00906-4.387631-1.15315187 -3.6838664.918911e-030.0472920174-1.9708390LINC00906
MCHR1-4.207932-2.72930547 -8.6337291.088733e-050.0015639303 0.4953098MCHR1
LRRTM2-4.169559-0.02831165 -8.2945871.511796e-050.0018649561 2.1409081LRRTM2
VCAM1-3.686535 3.65813483-10.6919941.814316e-060.0006764174 5.4886506VCAM1
VASH2-3.627419-1.00508844 -8.4038561.358500e-050.0017917458 1.6860112VASH2
SMTNL2-3.522934 0.41710895 -3.6958344.827700e-030.0469016124-1.7974586SMTNL2
A data.frame: 6 × 7
logFCAveExprtP.Valueadj.P.ValBSYMBOL
<dbl><dbl><dbl><dbl><dbl><dbl><chr>
1-4.387631-1.15315187 -3.6838664.918911e-030.0472920174-1.9708390LINC00906
2-4.207932-2.72930547 -8.6337291.088733e-050.0015639303 0.4953098MCHR1
3-4.169559-0.02831165 -8.2945871.511796e-050.0018649561 2.1409081LRRTM2
4-3.686535 3.65813483-10.6919941.814316e-060.0006764174 5.4886506VCAM1
5-3.627419-1.00508844 -8.4038561.358500e-050.0017917458 1.6860112VASH2
6-3.522934 0.41710895 -3.6958344.827700e-030.0469016124-1.7974586SMTNL2
'character'

bitr功能为ID转换

bitr(geneID, fromType, toType, OrgDb, drop = TRUE); geneid :基因ID输入 ; fromtype : 输入ID型;toType:输出ID型;orgdb :注释数据库)

In [5]:
#数据转换
test1 = bitr(unique(c$SYMBOL), fromType="SYMBOL", 
             toType= "ENTREZID", OrgDb="org.Hs.eg.db")
head(test1)
dim(test1)
'select()' returned 1:1 mapping between keys and columns

A data.frame: 6 × 2
SYMBOLENTREZID
<chr><chr>
1LINC00906148145
2MCHR1 2847
3LRRTM2 26045
4VCAM1 7412
5VASH2 79805
6SMTNL2 342527
  1. 660
  2. 2

GO dataset Analysis

GO数据库总共有三大类,分别是生物学过程(Biological Process,BP)、细胞定位(Cellular Component,CC)、分子功能(Molecular Function,MF),各自描述了基因产物可能行使的分子功能,所处的细胞环境,以及参与的生物学过程.GO数据库中一个基本的概念是节点,比如"Cell"、"Fibroblast Growth Factor Receptor Binding"或者"Signal Transduction",同时有一个唯一的编号,如"GO:nnnnnnnn ".基于鉴定到的蛋白ID,采用mapping(比对)的方式从Uniprot数据库中 得到蛋白质的GO数据库注释信息,对蛋白质进行功能分类注释.

In [3]:
#GO分析
#groupGO富集分析
ggo <- groupGO(gene = test1$ENTREZID, OrgDb = org.Hs.eg.db, 
               ont = "CC",level = 3,readable = TRUE)
head(ggo)
Loading required package: DOSE

DOSE v3.12.0  For help: https://guangchuangyu.github.io/software/DOSE

If you use DOSE in published research, please cite:
Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609


A data.frame: 6 × 5
IDDescriptionCountGeneRatiogeneID
<fct><fct><int><fct><chr>
GO:0005886GO:0005886plasma membrane 229229/660MCHR1/LRRTM2/VCAM1/GRIN2A/SLITRK6/AQP3/ASIC2/GPR68/AIF1L/TNFSF15/NPC1L1/IGSF9/EFR3B/GPR20/GPR20/KCTD12/RGS6/IL1RN/FLRT3/PLPPR4/AMOT/RND2/CAMK2A/SLC6A9/RAB11FIP4/SIGLEC15/GFRA2/SLC6A6/BDKRB2/INPP5D/INPP5D/RASD2/P2RX7/CXCL12/CPNE9/RTKN2/BDKRB1/TNFSF9/ADAM12/CLDN11/OMG/DNM1/TNFSF13B/EPHB2/ADCY4/ADCY4/FAIM2/TNFRSF11B/SLC51B/PRKG2/FNDC5/KIF18A/CDON/CIT/TSPAN32/RRAD/EPHB3/CUBN/GPC2/CNIH3/SEMA6D/S1PR1/ACVR1C/TSPAN11/EPHA5/ACKR4/TENM4/PLXDC1/LYPD3/LDB2/RHOJ/CORO1A/HMOX1/RAP2B/KCNE3/GDF5/PCDHGC5/LHFPL2/SCN1B/BRCA1/SLC39A10/SLC7A5/SEMA3A/PRKCE/TMEM119/TNIK/SGCD/TNFSF10/RAPGEF4/TRIB3/FRK/CD37/NEDD4L/MARCKSL1/ROBO1/OSBPL3/LGR4/RAET1G/RASL12/SPTBN4/EPB41/FRRS1L/COLQ/UNC5B/DRP2/ABCC6/ABCC6/IRAK2/ARHGEF2/ANK2/SLC41A2/IL17RB/OSBPL7/LZTS1/ITGA2/STIM2/PLPP3/DNM3/NRP2/TLCD2/TLCD2/ABCA6/MORF4L2/NF2/IRAK3/CLIC6/RHOBTB3/PPP1CB/CRIM1/CRIM1/KCNS3/NAMPT/ARHGEF28/ANPEP/STK17B/AP1M1/PNPLA2/ADH1C/MFGE8/AGTRAP/CARMIL1/TM4SF1/AMPH/KANK1/ADH1B/TMEM47/CYTH3/ANXA4/CFLAR/SLC5A6/JADE1/ITGA8/CD82/SLC38A2/RHOB/MMP15/C1QTNF1/ING2/ATP10A/SCARF1/SCARF1/ATP2A2/ABCA4/LIMS2/CPM/INPP5A/SLC7A6/EDA/EPHB6/EPHB6/NEGR1/STOM/TGFBR2/RAC3/MYADM/HMMR/CDC42EP3/DNAJB4/PIK3R1/STEAP1/P2RY14/ACSL1/PDLIM5/NEDD9/MYO16/MYO16/MMD/DIO2/GPRC5B/VIPR2/SORBS1/THBS1/CORIN/TJP2/TRPC6/STEAP2/RASGRP2/NEXN/LAMA2/DCXR/KLF9/IRS2/SORT1/FZD5/NKD1/GPR150/KCNQ3/IYD/HSPA2/ERRFI1/HIF3A/FZD8/ERVMER34-1/KCNK6/PLXNA4/DPEP1/GLUL/C7/CACNB2/FPR1/SYN2/SAMHD1/ITGA10/CD163/SLC16A12/ADRA1B/GUCY2D/ZBTB16/ALOX15B
GO:0005628GO:0005628prospore membrane 00/660
GO:0005789GO:0005789endoplasmic reticulum membrane 3030/660 LCTL/HSD11B2/PLA2G4A/FLRT3/MEST/SCD/CH25H/CNIH3/HMOX1/TMEM119/GRAMD4/OSBPL3/ABCC6/ABCC6/TMCC2/OSBPL7/STIM2/PNPLA2/PREB/PCYT2/AGTRAP/YIF1B/GPAT3/CYP19A1/ATP10A/ATP2A2/EDA/ACSL1/MCTP1/SORT1
GO:0019867GO:0019867outer membrane 88/660 FOXO3/STARD7/ACSL1/SEPTIN4/SEPTIN4/NAV3/MAOA/GUCY2D
GO:0031090GO:0031090organelle membrane 8989/660 MCHR1/SLC7A14/NPC1L1/DOC2A/PLA2G4A/LFNG/RND2/CAMK2A/SLC6A9/RAB11FIP4/CHDH/P2RX7/SYN3/CBFA2T3/TMEM35A/PRKG2/FNDC5/ST8SIA1/CUBN/CNIH3/CORO1A/RAP2B/LHFPL2/CKMT2/GRAMD4/ROBO1/OSBPL3/ABCC6/ABCC6/IRAK2/GALNT16/SH3BP5/COX11/CTSC/SQOR/RHOBTB3/ANPEP/AP1M1/PREB/AGTRAP/YIF1B/GASK1B/CALCOCO2/ULK2/AMPH/CYTH3/ANXA4/SLC5A6/HIGD1A/RHOB/COL7A1/AHCTF1/FOXO3/STARD7/SCARF1/SCARF1/GCHFR/NR4A1/ATP2A2/ABCA4/STOM/BDH1/BDH1/CHSY3/RAB11FIP1/CHST7/STEAP1/GCH1/ACSL1/GALNT15/MMD/GPRC5B/SUN2/MCTP1/SEPTIN4/SEPTIN4/STEAP2/NAV3/RNF144B/SORT1/FZD5/IYD/FPR1/MAOA/SYN2/CD163/ADRA1B/GUCY2D/PRODH
GO:0034357GO:0034357photosynthetic membrane 00/660
In [8]:
#enrichGO富集分析
ego_ALL <- enrichGO(gene = test1$ENTREZID, 
                    OrgDb = org.Hs.eg.db, 
                    ont = "ALL", 
                    pAdjustMethod = "BH", 
                    pvalueCutoff = 1, 
                    qvalueCutoff = 1,
                    readable = T)
head(ego_ALL,2)
A data.frame: 2 × 10
ONTOLOGYIDDescriptionGeneRatioBgRatiopvaluep.adjustqvaluegeneIDCount
<fct><chr><chr><chr><chr><dbl><dbl><dbl><chr><int>
GO:0007411BPGO:0007411axon guidance 29/558276/186704.630024e-091.135587e-059.608369e-06EGR2/IGSF9/RPS6KA5/FLRT3/GFRA2/CXCL12/EPHB2/EPHB3/SEMA6D/EPHA5/VEGFA/SCN1B/SEMA3A/ROBO1/DOK6/SPTBN4/UNC5B/NRP2/APBB2/EPHB6/PIK3R1/GDNF/NOG/GDF7/NEXN/LAMA2/NR4A3/IRS2/PLXNA429
GO:0097485BPGO:0097485neuron projection guidance29/558277/186705.029174e-091.135587e-059.608369e-06EGR2/IGSF9/RPS6KA5/FLRT3/GFRA2/CXCL12/EPHB2/EPHB3/SEMA6D/EPHA5/VEGFA/SCN1B/SEMA3A/ROBO1/DOK6/SPTBN4/UNC5B/NRP2/APBB2/EPHB6/PIK3R1/GDNF/NOG/GDF7/NEXN/LAMA2/NR4A3/IRS2/PLXNA429
In [9]:
#setReadble函数进行转换
ego_MF <- enrichGO(gene = test1$ENTREZID, 
                    OrgDb = org.Hs.eg.db,ont = "MF", 
                   pAdjustMethod = "BH",pvalueCutoff = 1,
                   qvalueCutoff = 1,readable = FALSE)#ont可以选择"All","MF","BP","CC"
ego_MF1 <- setReadable(ego_MF, OrgDb = org.Hs.eg.db)
head(ego_MF,2)
A data.frame: 2 × 9
IDDescriptionGeneRatioBgRatiopvaluep.adjustqvaluegeneIDCount
<chr><chr><chr><chr><dbl><dbl><dbl><chr><int>
GO:1901681GO:1901681sulfur compound binding22/553250/176971.307357e-050.0055509360.0051086244856/10468/134526/7422/27010/8292/3673/8828/283/307/4329/875/59271/284654/3491/1311/7057/11096/9510/1301/83716/4316 22
GO:0003779GO:0003779actin binding 31/553431/176971.550541e-050.0055509360.00510862479805/83543/286187/63876/81930/10391/11151/8507/5581/283431/65108/57731/2035/4771/9645/283/10810/25802/85464/3983/10611/23026/23500/171024/9788/10580/7225/91624/121512/84940/78331

GO levels 分类统计图

In [6]:
#可视化--点图
dotplot(ego_MF,title="EnrichmentGO_MF_dot")#点图,按富集的数从大到小的
In [7]:
#条状图,按p从小到大排,绘制前20个Term
barplot(ego_MF, showCategory=20,title="EnrichmentGO_MF")
In [7]:
plotGOgraph(ego_MF)#差异表达蛋白的topGO分子功能有向无环图
groupGOTerms: 	GOBPTerm, GOMFTerm, GOCCTerm environments built.


Building most specific GOs .....

	( 1129 GO terms found. )


Build GO DAG topology ..........

	( 1129 GO terms and 1465 relations. )


Annotating nodes ...............

	( 17697 genes annotated to the GO terms. )

Loading required package: Rgraphviz

Loading required package: grid


Attaching package: 'grid'


The following object is masked from 'package:topGO':

    depth



Attaching package: 'Rgraphviz'


The following objects are masked from 'package:IRanges':

    from, to


The following objects are masked from 'package:S4Vectors':

    from, to


$dag
A graphNEL graph with directed edges
Number of Nodes = 22 
Number of Edges = 24 

$complete.dag
[1] "A graph with 22 nodes."
In [11]:
#KEGG分析
#候选基因进行通路分析
k<-enrichKEGG(gene=test1$ENTREZID,organism = 'hsa',
              pvalueCutoff = 1)
head(k,2)
A data.frame: 2 × 9
IDDescriptionGeneRatioBgRatiopvaluep.adjustqvaluegeneIDCount
<chr><chr><chr><chr><dbl><dbl><dbl><chr><int>
hsa04933hsa04933AGE-RAGE signaling pathway in diabetic complications13/277100/80483.49086e-050.0045492910.0037338687412/8503/7042/1277/7422/3725/5581/1282/7048/5295/1286/1285/2308 13
hsa04068hsa04068FoxO signaling pathway 15/277131/80484.06305e-050.0045492910.0037338688503/7042/1901/1030/8743/10769/1026/114907/2309/7048/4616/5295/51422/8660/230815
In [ ]:
#KEGG气泡图
dotplot(k,title="enrich keggdotplot")
In [ ]:
#pathview通路可视化
library(pathview)
pathview(test1$ENTREZID,pathway.id = "04933",species = "hsa")

细化富集分析处理过程

  1. 读取差异基因表达矩阵
  2. 设定阈值计算基因上调下调数量
  3. ID转换
  4. 得出差异基因(得到排序的geneList)
In [12]:
#加载差异表达矩阵
load("limma_diff.Rdata")
head(diff_signif,3)
dim(diff_signif)
A data.frame: 3 × 6
logFCAveExprtP.Valueadj.P.ValB
<dbl><dbl><dbl><dbl><dbl><dbl>
LINC00906-4.387631-1.15315187-3.6838664.918911e-030.047292017-1.9708390
MCHR1-4.207932-2.72930547-8.6337291.088733e-050.001563930 0.4953098
LRRTM2-4.169559-0.02831165-8.2945871.511796e-050.001864956 2.1409081
  1. 660
  2. 6
In [13]:
#设定阈值计算基因上调下调数目
a=diff_signif
logFC_t=1.5
a$g<-ifelse(a$P.Value>0.05,'stable',ifelse( a$logFC > logFC_t,'UP',
                      ifelse(a$logFC < -logFC_t,'DOWN','stable')))
table(a$g)            
head(a)
  DOWN stable     UP 
   160    326    174 
A data.frame: 6 × 7
logFCAveExprtP.Valueadj.P.ValBg
<dbl><dbl><dbl><dbl><dbl><dbl><chr>
LINC00906-4.387631-1.15315187 -3.6838664.918911e-030.0472920174-1.9708390DOWN
MCHR1-4.207932-2.72930547 -8.6337291.088733e-050.0015639303 0.4953098DOWN
LRRTM2-4.169559-0.02831165 -8.2945871.511796e-050.0018649561 2.1409081DOWN
VCAM1-3.686535 3.65813483-10.6919941.814316e-060.0006764174 5.4886506DOWN
VASH2-3.627419-1.00508844 -8.4038561.358500e-050.0017917458 1.6860112DOWN
SMTNL2-3.522934 0.41710895 -3.6958344.827700e-030.0469016124-1.7974586DOWN
In [14]:
#将gene SYMBOL加入差异表达矩阵
a$symbol<-rownames(a)
head(a)
rownames(a)<-1:660
head(a)
A data.frame: 6 × 8
logFCAveExprtP.Valueadj.P.ValBgsymbol
<dbl><dbl><dbl><dbl><dbl><dbl><chr><chr>
LINC00906-4.387631-1.15315187 -3.6838664.918911e-030.0472920174-1.9708390DOWNLINC00906
MCHR1-4.207932-2.72930547 -8.6337291.088733e-050.0015639303 0.4953098DOWNMCHR1
LRRTM2-4.169559-0.02831165 -8.2945871.511796e-050.0018649561 2.1409081DOWNLRRTM2
VCAM1-3.686535 3.65813483-10.6919941.814316e-060.0006764174 5.4886506DOWNVCAM1
VASH2-3.627419-1.00508844 -8.4038561.358500e-050.0017917458 1.6860112DOWNVASH2
SMTNL2-3.522934 0.41710895 -3.6958344.827700e-030.0469016124-1.7974586DOWNSMTNL2
A data.frame: 6 × 8
logFCAveExprtP.Valueadj.P.ValBgsymbol
<dbl><dbl><dbl><dbl><dbl><dbl><chr><chr>
1-4.387631-1.15315187 -3.6838664.918911e-030.0472920174-1.9708390DOWNLINC00906
2-4.207932-2.72930547 -8.6337291.088733e-050.0015639303 0.4953098DOWNMCHR1
3-4.169559-0.02831165 -8.2945871.511796e-050.0018649561 2.1409081DOWNLRRTM2
4-3.686535 3.65813483-10.6919941.814316e-060.0006764174 5.4886506DOWNVCAM1
5-3.627419-1.00508844 -8.4038561.358500e-050.0017917458 1.6860112DOWNVASH2
6-3.522934 0.41710895 -3.6958344.827700e-030.0469016124-1.7974586DOWNSMTNL2
In [16]:
#ID转换
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
df <- bitr(unique(a$symbol), fromType = "SYMBOL",
           toType = c( "ENTREZID"),
           OrgDb = org.Hs.eg.db)
dim(df)
head(df,2)
'select()' returned 1:1 mapping between keys and columns

  1. 660
  2. 2
A data.frame: 2 × 2
SYMBOLENTREZID
<chr><chr>
1LINC00906148145
2MCHR1 2847
In [18]:
#把数据DEG,df通过,DEG的'symbol'列,df的'SYMBOL'列连接在一起,转化ID
DEG<-a
DEG=merge(DEG,df,by.y='SYMBOL',by.x='symbol')
head(DEG)
save(DEG,file = 'anno_DEG.Rdata')
A data.frame: 6 × 9
symbollogFCAveExprtP.Valueadj.P.ValBgENTREZID
<chr><dbl><dbl><dbl><dbl><dbl><dbl><chr><chr>
1AASS 1.0896005.0494166 7.2088994.662372e-050.0033175260 2.4220641stable10157
2ABAT 1.1507614.519235514.7730381.098921e-070.0001601153 8.3045285stable18
3ABCA10-1.5780022.7125774-5.6816562.851423e-040.0086040149 0.7782505DOWN 10349
4ABCA4 1.3380170.7579106 3.8463933.820855e-030.0407517394-1.5991555stable24
5ABCA6 1.0186466.7918822 7.1249355.113057e-050.0034731907 2.3143131stable23460
6ABCC6 -1.0410612.5046354-4.6723671.119258e-030.0195805999-0.5592302stable368
In [27]:
#得出差异基因
gene_up= DEG[DEG$g == 'UP','ENTREZID'] #选出上调基因ID
gene_down=DEG[DEG$g == 'DOWN','ENTREZID'] #选出下调基因ID
gene_diff=c(gene_up,gene_down)#得出上下调基因ID
gene_all=as.character(DEG[ ,'ENTREZID'] )#得出所有基因ID
head(gene_all,3)
  1. '10157'
  2. '18'
  3. '10349'
In [22]:
#geneList
data(geneList, package="DOSE")#得出geneList数据
head(geneList,4)#查看
4312
4.57261268231107
8318
4.51459371540294
10874
4.41821798112707
55143
4.14407518193211
In [23]:
boxplot(geneList)
boxplot(DEG$logFC)
In [24]:
#得到排序后的geneList
geneList<-DEG$logFC#赋值
names(geneList)<-DEG$ENTREZID
geneList<-sort(geneList,decreasing = T)#从大到小排序
head(geneList)
247
8.02863104170142
7704
7.40167100084314
65997
4.89487840146106
8404
4.57496462683689
5625
4.53141336224362
28999
4.42381460169865

KEGG pathway analysis

In [29]:
#KEGG
kk.up <- enrichKEGG(gene = gene_up,
                    organism = 'hsa',
                    universe= gene_all,
                    pvalueCutoff = 1,
                    qvalueCutoff =1)
head(kk.up,3)
A data.frame: 3 × 9
IDDescriptionGeneRatioBgRatiopvaluep.adjustqvaluegeneIDCount
<chr><chr><chr><chr><dbl><dbl><dbl><chr><int>
hsa05165hsa05165Human papillomavirus infection10/7520/2770.020029370.78114560.78009141285/1286/1311/2308/7855/8325/8515/3908/5295/705710
hsa04512hsa04512ECM-receptor interaction 6/75 11/2770.046119580.89933190.89811821285/1286/1311/8515/3908/7057 6
hsa04151hsa04151PI3K-Akt signaling pathway 8/75 19/2770.106421490.94462000.94334521285/1286/1311/27006/8515/3908/5295/7057 8
In [30]:
dotplot(kk.up )
In [31]:
kk.down<- enrichKEGG(gene         = gene_down,
                    organism     = 'hsa',
                    universe     = gene_all,
                    pvalueCutoff = 1,
                    qvalueCutoff =1)
dotplot(kk.down )
In [33]:
kk.diff<-enrichKEGG(gene=gene_diff,organism = 'hsa',pvalueCutoff = 0.05)
dotplot(kk.diff)
In [34]:
kegg_diff_dt <- as.data.frame(kk.diff)
kegg_down_dt <- as.data.frame(kk.down)
kegg_up_dt <- as.data.frame(kk.up)
down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.05,]
down_kegg$group=-1
up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.05,]
up_kegg$group=1
b<-rbind(down_kegg,up_kegg)
head(b)
A data.frame: 6 × 10
IDDescriptionGeneRatioBgRatiopvaluep.adjustqvaluegeneIDCountgroup
<chr><chr><chr><chr><dbl><dbl><dbl><chr><int><dbl>
hsa04060hsa04060Cytokine-cytokine receptor interaction 13/5922/2775.927126e-050.0022523080.0021212876387/9518/3592/3557/146433/83729/3976/7042/4982/10673/9966/8744/8548013-1
hsa04750hsa04750Inflammatory mediator regulation of TRP channels7/59 10/2771.023191e-030.0194406320.018309736196883/40/623/624/815/8503/5321 7-1
hsa04020hsa04020Calcium signaling pathway 6/59 10/2777.656089e-030.0969771220.091335794196883/623/624/815/2903/5027 6-1
hsa04080hsa04080Neuroactive ligand-receptor interaction 5/59 10/2773.919920e-020.3723924080.350729692623/624/2903/2847/5027 5-1
hsa05165hsa05165Human papillomavirus infection 10/7520/2772.002937e-020.7811455560.7800913781285/1286/1311/2308/7855/8325/8515/3908/5295/7057 10 1
hsa04512hsa04512ECM-receptor interaction 6/75 11/2774.611958e-020.8993318710.8981181981285/1286/1311/8515/3908/7057 6 1
In [35]:
p<-ggplot(b,aes(x=GeneRatio,y=Description))+geom_point(aes(col=pvalue,size=Count))+
  facet_grid(~group)
p+scale_color_gradient2(low="navyblue",mid="blue",high="yellow")
In [40]:
head(kegg_diff_dt,3)
j<-ggplot(kegg_diff_dt,aes(x=GeneRatio,y=Description))+geom_point(aes(col=pvalue,size=Count))
j+scale_color_gradient2(low="navyblue",mid="blue",high="yellow")
A data.frame: 3 × 9
IDDescriptionGeneRatioBgRatiopvaluep.adjustqvaluegeneIDCount
<chr><chr><chr><chr><dbl><dbl><dbl><chr><int>
hsa04213hsa04213Longevity regulating pathway - multiple species8/134 62/8058 7.536378e-060.0018916310.0015786732308/3306/8660/5295/51422/196883/3310/8503 8
hsa05144hsa05144Malaria 7/134 50/8058 1.654599e-050.0020765220.0017329751311/7057/3039/3040/3592/7042/7412 7
hsa04060hsa04060Cytokine-cytokine receptor interaction 15/134294/80581.027727e-040.0085986470.007176057151449/3625/6387/9518/3592/3557/146433/83729/3976/7042/4982/10673/9966/8744/8548015

GSEA

In [43]:
kk_gse <- gseKEGG(geneList     = geneList,
                    organism     = 'hsa',
                    pvalueCutoff = 1,
                    verbose      = FALSE)
  head(kk_gse)[,1:6]
A data.frame: 6 × 6
IDDescriptionsetSizeenrichmentScoreNESpvalue
<chr><chr><int><dbl><dbl><dbl>
hsa04060hsa04060Cytokine-cytokine receptor interaction 22-0.4723285-2.1222020.002398082
hsa04750hsa04750Inflammatory mediator regulation of TRP channels10-0.6112319-2.0113710.004618938
hsa04512hsa04512ECM-receptor interaction 11 0.5139663 1.6604490.040636042
hsa04020hsa04020Calcium signaling pathway 10-0.4466430-1.4697610.071593533
hsa05165hsa05165Human papillomavirus infection 20 0.3666823 1.4433390.079861111
hsa04510hsa04510Focal adhesion 16 0.3521086 1.2979900.162544170
In [44]:
gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
In [45]:
down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,]
down_kegg$group=-1
  up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,]
up_kegg$group=1
head(up_kegg)
A data.frame: 1 × 12
IDDescriptionsetSizeenrichmentScoreNESpvaluep.adjustqvaluesrankleading_edgecore_enrichmentgroup
<chr><chr><int><dbl><dbl><dbl><dbl><dbl><dbl><chr><chr><dbl>
hsa04512hsa04512ECM-receptor interaction110.51396631.6604490.040636040.52826860.5132974248tags=82%, list=38%, signal=52%8515/1285/3908/7057/1286/1311/3161/1282/85161
In [46]:
h<-rbind(down_kegg,up_kegg)
m<-ggplot(h,aes(x=setSize,y=Description))+geom_point(aes(col=pvalue,size=enrichmentScore))+ facet_grid(~group)
m+scale_color_gradient2(low="navyblue",mid="blue",high="yellow")