diff --git a/RNA-arry.R b/RNA-arry.R new file mode 100644 index 0000000..5e3adcd --- /dev/null +++ b/RNA-arry.R @@ -0,0 +1,70 @@ +## Written By LiShang +## Notice:适用于基因芯片平台 + +#加载依赖包 +library(GEOquery) +library(limma) +library(ggplot2) +library(patchwork) +library(stringr) +library(dplyr) + +##############################从GEO下载并提取数据############################## + +setwd("~/arry") +#下载芯片数据 +gse <- getGEO("GSE15852", destdir = ".")[[1]] +#提取基因表达矩阵、样本分组信息 +exp <- exprs(gse) +grp <- pData(gse) + +###表达矩阵数据前处理 +#将表达矩阵的行名(探针ID)转换为Gene Symbol +#方法一:直接从GEO拿数据,好处是方便快捷,通用性高 +gene_symbols <- fData(gse)[,c("ID","Gene Symbol")] +gene_symbols <- setNames(gene_symbols$`Gene Symbol`,gene_symbols$ID)[rownames(exp)] +#方法二:通过芯片提供的R包拿数据,数据不如GEO的全,好处是基因名短 +#install.packages("hgu133a.db") +library(hgu133a.db) +gene_symbols <- toTable(hgu133aSYMBOL)[,c("probe_id","symbol")] +gene_symbols <- setNames(gene_symbols$symbol,gene_symbols$probe_id)[rownames(exp)] + +#合并相同基因的多个表达值(平均数法) +exp <- aggregate(exp, by = list(gene_symbols), FUN = mean) +rownames(exp) <- exp$Group.1 +exp <- exp[, -1] + +###样本分组数据前处理 +#将样本按pData$title分为normal组和cancer组,并转换为factor +grp <- grp[colnames(exp),] +grp <- ifelse(str_detect(grp$title,"Normal"),"normal","cancer") %>% +  factor(c("normal","cancer")) + +############################样本质量控制与标准化################################ + +pca_plot1 <- as.data.frame(prcomp(t(exp))$x) %>% +  ggplot(aes(x = PC1, y = PC2, colour = grp)) + +  geom_point() + +  stat_ellipse(level = 0.95, show.legend = F) + +  theme_bw() + +  theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank()) + +  ggtitle("Before Normalize") + +exp <- normalizeBetweenArrays(exp, method="quantile") +if(max(exp)>50) exp <- log2(exp + 1) + +pca_plot2 <- as.data.frame(prcomp(t(exp))$x) %>% +  ggplot(aes(x = PC1, y = PC2, colour = grp)) + +  geom_point() + +  stat_ellipse(level = 0.95, show.legend = F) + +  theme_bw() + +  theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank()) + +  ggtitle("After Normalize") + +pca_plot1 + theme(legend.position = "none") + pca_plot2 + +##############################差异基因表达分析################################## + +fit <- lmFit(exp, model.matrix(~grp)) +fit <- eBayes(fit) +deg <- topTable(fit, coef="grpcancer", adjust.method="fdr", number=Inf) \ No newline at end of file