# SCRIPT ONE: COELIAC DISEASE EXAMPLES (GENE SET ENRICHMENT ANALYSIS) # by Eugenio Del Prete # 1. Set your work folder setwd(".../workfolder") # 2. Load libraries for script one library(RCurl) library(GEOquery) library(limma) library(topGO) library(genefilter) # First Example # A) GSE11501, GPL6104 (Primary Leukocytes) # 3. Download the GEO dataset url <- "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE11nnn/GSE11501/matrix/" dataset <- getURL(url, ftp.use.epsv = FALSE, dirlistonly = TRUE) dataset <- unlist(strsplit(dataset, "\r\n")) for (ds in dataset) { download.file(paste(url,ds, sep = ""), paste(getwd(), "/",ds,sep = "")) } # 4. Convert download dataset in usable class gse <- getGEO(filename = "GSE11501_series_matrix.txt.gz") # 5. Create matrix design for limma and calculate differential expression d <- factor(c(rep('CD', 110),rep('CTRL',22))) mod <- model.matrix(~0+d) fit_1 <- lmFit(gse, mod) contr <- makeContrasts(dCD-dCTRL, levels = mod) fit_2 <- contrasts.fit(fit = fit_1, contrasts = contr) fit_3 <- eBayes(fit_2) # 6. Create statistics table and subtable table_result <- topTable(fit_3, n = length(fit_3), adjust="BH", sort.by = "logFC") subtable_result <- subset(table_result, select=c("ID","Symbol","adj.P.Val","P.Value","t","B","logFC")) subtable_result <- subtable_result[is.na(subtable_result$ID)==0,] geneList <- subtable_result$logFC names(geneList) <- subtable_result$Symbol # 7. Save subtable in Excel format write.csv2(subtable_result,"GSE11501_table.csv") # nrow(subtable_result[subtable_result$P.Value<0.05,]) # 8. Choose the LogFC threshold topDiffGenes <- function(allScore){ return(abs(allScore)>1.00) } # 9. Create topGO class with annotation CD_GOdata <- new("topGOdata", description = "cd study", ontology = "BP", allGenes = geneList, geneSel = topDiffGenes, annot = annFUN.org, ID = "symbol", mapping = "org.Hs.eg.db", nodeSize = 10) # 10. Show genes and GO terms n_sg <- sum(topDiffGenes(geneList)) sg <- sigGenes(CD_GOdata) ug <- usedGO(CD_GOdata) # 11. Perform Fisher's Exact Test (and Kolmogorov-Smirnov for evaluation, not described) resultFisher <- runTest(CD_GOdata, algorithm = "classic", statistic = "fisher") resultKS <- runTest(CD_GOdata, algorithm = "classic", statistic = "ks") # 12. Compare the tests (not described) #pvalFis <- score(resultFisher) #pvalKS <- score(resultKS,whichGO = names(pvalFis)) #cor_pval <- cor(pvalFis,pvalKS) # 13. Create GO terms tree allRes <- GenTable(CD_GOdata, classic = resultFisher, KS = resultKS, orderBy = "classic", topNodes = 30) showSigOfNodes(CD_GOdata,score(resultFisher),firstSigNodes = 5, useInfo = "all") printGraph(CD_GOdata,resultFisher,firstSigNodes = 5, fn.prefix = "CD1_GSE11501", useInfo = "all", pdfSW = TRUE) # 14. Create text file for the correspondence GO terms - genes (this file is mandatory for the script two) terms <- allRes$GO.ID genes <- genesInTerm(CD_GOdata,terms) for (i in 1:length(terms)) { term <- terms[i] genes_term <- genes[term][[1]] # find the genes that are in the list of genes of interest fact <- genes_term %in% sg genes_term_2 <- genes_term[fact == TRUE] genes_term_2 <- paste(genes_term_2, collapse=',') cat(term,"genes:",genes_term_2,"\n", append = TRUE, file = "CD1_GSE11501_correspondence.txt" ) } #-------------------------------------------------------------------------------------- # Second Example # B) GSE87629, GPL6947 (B-T lymphocytes) url <- "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE87nnn/GSE87629/matrix/" dataset <- getURL(url, ftp.use.epsv = FALSE, dirlistonly = TRUE) dataset <- unlist(strsplit(dataset, "\r\n")) for (ds in dataset) { download.file(paste(url,ds, sep = ""), paste(getwd(), "/",ds,sep = "")) } gse <- getGEO(filename = "GSE87629-GPL6947_series_matrix.txt.gz") gse <- gse[,sampleNames(gse)!="GSM2335908"] d <- factor(c('CD','CTRL','CD','CTRL','CD','CTRL','CD','CTRL','CD','CTRL', 'CD','CTRL','CD','CTRL','CD','CTRL','CD','CTRL','CD','CTRL', 'CD','CTRL','CD','CTRL','CD','CTRL','CD','CTRL','CD','CTRL', 'CD','CTRL','CD','CTRL','CD','CTRL','CD','CTRL')) mod <- model.matrix(~0+d) fit_1 <- lmFit(gse, mod) contr <- makeContrasts(dCD-dCTRL, levels = mod) fit_2 <- contrasts.fit(fit = fit_1, contrasts = contr) fit_3 <- eBayes(fit_2) table_result <- topTable(fit_3, n = length(fit_3), adjust="BH", sort.by = "logFC") subtable_result <- subset(table_result, select=c("ID","Symbol","adj.P.Val","P.Value","t","B","logFC")) subtable_result <- subtable_result[is.na(subtable_result$ID)==0,] geneList <- subtable_result$logFC names(geneList) <- subtable_result$Symbol write.csv2(subtable_result,"GSE87629_table.csv") # nrow(subtable_result[subtable_result$P.Value<0.05,]) topDiffGenes <- function(allScore){ return(abs(allScore)>1.00) } CD_GOdata <- new("topGOdata", description = "cd study", ontology = "BP", allGenes = geneList, geneSel = topDiffGenes, annot = annFUN.org, ID = "symbol", mapping = "org.Hs.eg.db", nodeSize = 10) n_sg <- sum(topDiffGenes(geneList)) sg <- sigGenes(CD_GOdata) ug <- usedGO(CD_GOdata) resultFisher <- runTest(CD_GOdata, algorithm = "classic", statistic = "fisher") resultKS <- runTest(CD_GOdata, algorithm = "classic", statistic = "ks") #pvalFis <- score(resultFisher) #pvalKS <- score(resultKS,whichGO = names(pvalFis)) #cor_pval <- cor(pvalFis,pvalKS) allRes <- GenTable(CD_GOdata, classic = resultFisher, KS = resultKS, orderBy = "classic", topNodes = 30) showSigOfNodes(CD_GOdata,score(resultFisher),firstSigNodes = 5, useInfo = "all") printGraph(CD_GOdata,resultFisher,firstSigNodes = 5, fn.prefix = "CD1_GSE87629", useInfo = "all", pdfSW = TRUE) terms <- allRes$GO.ID genes <- genesInTerm(CD_GOdata,terms) for (i in 1:length(terms)) { term <- terms[i] genes_term <- genes[term][[1]] # find the genes that are in the list of genes of interest fact <- genes_term %in% sg genes_term_2 <- genes_term[fact == TRUE] genes_term_2 <- paste(genes_term_2, collapse=',') cat(term,"genes:",genes_term_2,"\n", append = TRUE, file = "CD1_GSE87629_correspondence.txt" ) }