RNA/RNA-arry.R
生信分析 c7b7f636a9 添加 RNA-arry.R
添加RNA基因芯片数据分析模板

Signed-off-by: 生信分析 <bioinfo@baihub.cn>
2024-12-11 02:46:58 +08:00

70 lines
2.4 KiB
R
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

## Written By LiShang
## Notice适用于基因芯片平台
#加载依赖包
library(GEOquery)
library(limma)
library(ggplot2)
library(patchwork)
library(stringr)
library(dplyr)
##############################从GEO下载并提取数据##############################
setwd("~/arry")
#下载芯片数据
gse <- getGEO("GSE15852", destdir = ".")[[1]]
#提取基因表达矩阵、样本分组信息
exp <- exprs(gse)
grp <- pData(gse)
###表达矩阵数据前处理
#将表达矩阵的行名探针ID转换为Gene Symbol
#方法一直接从GEO拿数据好处是方便快捷通用性高
gene_symbols <- fData(gse)[,c("ID","Gene Symbol")]
gene_symbols <- setNames(gene_symbols$`Gene Symbol`,gene_symbols$ID)[rownames(exp)]
#方法二通过芯片提供的R包拿数据数据不如GEO的全好处是基因名短
#install.packages("hgu133a.db")
library(hgu133a.db)
gene_symbols <- toTable(hgu133aSYMBOL)[,c("probe_id","symbol")]
gene_symbols <- setNames(gene_symbols$symbol,gene_symbols$probe_id)[rownames(exp)]
#合并相同基因的多个表达值(平均数法)
exp <- aggregate(exp, by = list(gene_symbols), FUN = mean)
rownames(exp) <- exp$Group.1
exp <- exp[, -1]
###样本分组数据前处理
#将样本按pData$title分为normal组和cancer组并转换为factor
grp <- grp[colnames(exp),]
grp <- ifelse(str_detect(grp$title,"Normal"),"normal","cancer") %>%
  factor(c("normal","cancer"))
############################样本质量控制与标准化################################
pca_plot1 <- as.data.frame(prcomp(t(exp))$x) %>%
  ggplot(aes(x = PC1, y = PC2, colour = grp)) +
  geom_point() +
  stat_ellipse(level = 0.95, show.legend = F) +
  theme_bw() +
  theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank()) +
  ggtitle("Before Normalize")
exp <- normalizeBetweenArrays(exp, method="quantile")
if(max(exp)>50) exp <- log2(exp + 1)
pca_plot2 <- as.data.frame(prcomp(t(exp))$x) %>%
  ggplot(aes(x = PC1, y = PC2, colour = grp)) +
  geom_point() +
  stat_ellipse(level = 0.95, show.legend = F) +
  theme_bw() +
  theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank()) +
  ggtitle("After Normalize")
pca_plot1 + theme(legend.position = "none") + pca_plot2
##############################差异基因表达分析##################################
fit <- lmFit(exp, model.matrix(~grp))
fit <- eBayes(fit)
deg <- topTable(fit, coef="grpcancer", adjust.method="fdr", number=Inf)