添加 RNA-arry.R

添加RNA基因芯片数据分析模板 Signed-off-by: 生信分析 <bioinfo@baihub.cn>
2024-12-11 02:46:58 +08:00 · 2024-12-11 02:46:58 +08:00 · c7b7f636a9
commit c7b7f636a9
parent f53e29348c
1 changed files with 70 additions and 0 deletions
--- a/RNA-arry.R
+++ b/RNA-arry.R
@ -0,0 +1,70 @@
+## Written By LiShang
+## Notice：适用于基因芯片平台
+
+#加载依赖包
+library(GEOquery)
+library(limma)
+library(ggplot2)
+library(patchwork)
+library(stringr)
+library(dplyr)
+
+##############################从GEO下载并提取数据##############################
+
+setwd("~/arry")
+#下载芯片数据
+gse <- getGEO("GSE15852", destdir = ".")[[1]]
+#提取基因表达矩阵、样本分组信息
+exp <- exprs(gse)
+grp <- pData(gse)
+
+###表达矩阵数据前处理
+#将表达矩阵的行名（探针ID）转换为Gene Symbol
+#方法一：直接从GEO拿数据，好处是方便快捷，通用性高
+gene_symbols <- fData(gse)[,c("ID","Gene Symbol")]
+gene_symbols <- setNames(gene_symbols$`Gene Symbol`,gene_symbols$ID)[rownames(exp)]
+#方法二：通过芯片提供的R包拿数据，数据不如GEO的全，好处是基因名短
+#install.packages("hgu133a.db")
+library(hgu133a.db)
+gene_symbols <- toTable(hgu133aSYMBOL)[,c("probe_id","symbol")]
+gene_symbols <- setNames(gene_symbols$symbol,gene_symbols$probe_id)[rownames(exp)]
+
+#合并相同基因的多个表达值(平均数法)
+exp <- aggregate(exp, by = list(gene_symbols), FUN = mean)
+rownames(exp) <- exp$Group.1
+exp <- exp[, -1]
+
+###样本分组数据前处理
+#将样本按pData$title分为normal组和cancer组，并转换为factor
+grp <- grp[colnames(exp),]
+grp <- ifelse(str_detect(grp$title,"Normal"),"normal","cancer") %>%
+  factor(c("normal","cancer"))
+
+############################样本质量控制与标准化################################
+
+pca_plot1 <- as.data.frame(prcomp(t(exp))$x) %>%
+  ggplot(aes(x = PC1, y = PC2, colour = grp)) +
+  geom_point() +
+  stat_ellipse(level = 0.95, show.legend = F) +
+  theme_bw() +
+  theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank()) +
+  ggtitle("Before Normalize")
+
+exp <- normalizeBetweenArrays(exp, method="quantile")
+if(max(exp)>50) exp <- log2(exp + 1)
+
+pca_plot2 <- as.data.frame(prcomp(t(exp))$x) %>%
+  ggplot(aes(x = PC1, y = PC2, colour = grp)) +
+  geom_point() +
+  stat_ellipse(level = 0.95, show.legend = F) +
+  theme_bw() +
+  theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank()) +
+  ggtitle("After Normalize")
+
+pca_plot1 + theme(legend.position = "none") + pca_plot2
+
+##############################差异基因表达分析##################################
+
+fit <- lmFit(exp, model.matrix(~grp))
+fit <- eBayes(fit)
+deg <- topTable(fit, coef="grpcancer", adjust.method="fdr", number=Inf)