## Written By LiShang ## Notice: 适用于基因芯片平台 #加载依赖包 library(GEOquery) library(limma) library(ggplot2) library(patchwork) library(stringr) library(dplyr) ##############################从GEO下载并提取数据############################## setwd("~/arry") #下载芯片数据 gse <- getGEO("GSE15852", destdir = ".")[[1]] #提取基因表达矩阵、样本分组信息 exp <- exprs(gse) grp <- pData(gse) ###表达矩阵数据前处理 #将表达矩阵的行名(探针ID)转换为Gene Symbol #方法一: 直接从GEO拿数据, 好处是方便快捷, 通用性高 gene_symbols <- fData(gse)[,c("ID","Gene Symbol")] gene_symbols <- setNames(gene_symbols$`Gene Symbol`,gene_symbols$ID)[rownames(exp)] #方法二: 通过芯片提供的R包拿数据, 数据不如GEO的全, 好处是基因名短 #install.packages("hgu133a.db") library(hgu133a.db) gene_symbols <- toTable(hgu133aSYMBOL)[,c("probe_id","symbol")] gene_symbols <- setNames(gene_symbols$symbol,gene_symbols$probe_id)[rownames(exp)] #合并相同基因的多个表达值(平均数法) exp <- aggregate(exp, by = list(gene_symbols), FUN = mean) rownames(exp) <- exp$Group.1 exp <- exp[, -1] ###样本分组数据前处理 #将样本按pData$title分为normal组和cancer组, 并转换为factor grp <- grp[colnames(exp),] grp <- ifelse(str_detect(grp$title,"Normal"),"normal","cancer") %>% factor(c("normal","cancer")) ############################样本质量控制与标准化################################ pca_plot1 <- as.data.frame(prcomp(t(exp))$x) %>% ggplot(aes(x = PC1, y = PC2, colour = grp)) + geom_point() + stat_ellipse(level = 0.95, show.legend = F) + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank()) + ggtitle("Before Normalize") exp <- normalizeBetweenArrays(exp, method="quantile") if(max(exp)>50) exp <- log2(exp + 1) pca_plot2 <- as.data.frame(prcomp(t(exp))$x) %>% ggplot(aes(x = PC1, y = PC2, colour = grp)) + geom_point() + stat_ellipse(level = 0.95, show.legend = F) + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank()) + ggtitle("After Normalize") pca_plot1 + theme(legend.position = "none") + pca_plot2 ##############################差异基因表达分析################################## fit <- lmFit(exp, model.matrix(~grp)) fit <- eBayes(fit) deg <- topTable(fit, coef="grpcancer", adjust.method="fdr", number=Inf)