# SCRIPT ONE: COELIAC DISEASE EXAMPLES (GENE SET ENRICHMENT ANALYSIS) 
# by Eugenio Del Prete

# 1. Set your work folder
setwd(".../workfolder")

# 2. Load libraries for script one
library(RCurl)
library(GEOquery)
library(limma)
library(topGO)
library(genefilter)

# First Example
# A) GSE11501, GPL6104 (Primary Leukocytes)

# 3. Download the GEO dataset 
url <- "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE11nnn/GSE11501/matrix/"
dataset <- getURL(url, ftp.use.epsv = FALSE, dirlistonly = TRUE)
dataset <- unlist(strsplit(dataset, "\r\n"))
for (ds in dataset) {
    download.file(paste(url,ds, sep = ""), paste(getwd(), "/",ds,sep = ""))
}

# 4. Convert download dataset in usable class
gse <- getGEO(filename = "GSE11501_series_matrix.txt.gz")

# 5. Create matrix design for limma and calculate differential expression 
d <- factor(c(rep('CD', 110),rep('CTRL',22)))
mod <- model.matrix(~0+d)
fit_1 <- lmFit(gse, mod)
contr <- makeContrasts(dCD-dCTRL, levels = mod)
fit_2 <- contrasts.fit(fit = fit_1, contrasts = contr)
fit_3 <- eBayes(fit_2)

# 6. Create statistics table and subtable
table_result <- topTable(fit_3, n = length(fit_3), adjust="BH", sort.by = "logFC")
subtable_result <- subset(table_result, select=c("ID","Symbol","adj.P.Val","P.Value","t","B","logFC"))
subtable_result <- subtable_result[is.na(subtable_result$ID)==0,]
geneList <- subtable_result$logFC
names(geneList) <- subtable_result$Symbol

# 7. Save subtable in Excel format
write.csv2(subtable_result,"GSE11501_table.csv")
# nrow(subtable_result[subtable_result$P.Value<0.05,])

# 8. Choose the LogFC threshold 
topDiffGenes <- function(allScore){
    return(abs(allScore)>1.00)
}

# 9. Create topGO class with annotation
CD_GOdata <- new("topGOdata",
                 description = "cd study",
                 ontology = "BP", 
                 allGenes = geneList,
                 geneSel = topDiffGenes,
                 annot = annFUN.org, 
                 ID = "symbol", 
                 mapping = "org.Hs.eg.db",
                 nodeSize = 10)

# 10. Show genes and GO terms
n_sg <- sum(topDiffGenes(geneList))
sg <- sigGenes(CD_GOdata)
ug <- usedGO(CD_GOdata)

# 11. Perform Fisher's Exact Test (and Kolmogorov-Smirnov for evaluation, not described)
resultFisher <- runTest(CD_GOdata, algorithm = "classic", statistic = "fisher")
resultKS <- runTest(CD_GOdata, algorithm = "classic", statistic = "ks")

# 12. Compare the tests (not described)
#pvalFis <- score(resultFisher)
#pvalKS <- score(resultKS,whichGO = names(pvalFis))
#cor_pval <- cor(pvalFis,pvalKS)

# 13. Create GO terms tree
allRes <- GenTable(CD_GOdata, classic = resultFisher, KS = resultKS, orderBy = "classic", topNodes = 30)
showSigOfNodes(CD_GOdata,score(resultFisher),firstSigNodes = 5, useInfo = "all")
printGraph(CD_GOdata,resultFisher,firstSigNodes = 5, fn.prefix = "CD1_GSE11501", useInfo = "all", pdfSW = TRUE)

# 14. Create text file for the correspondence GO terms - genes (this file is mandatory for the script two)
terms <- allRes$GO.ID
genes <- genesInTerm(CD_GOdata,terms)
for (i in 1:length(terms))
{
    term <- terms[i]
    genes_term <- genes[term][[1]]
    # find the genes that are in the list of genes of interest
    fact <- genes_term %in% sg
    genes_term_2 <- genes_term[fact == TRUE]
    genes_term_2 <- paste(genes_term_2, collapse=',')
    cat(term,"genes:",genes_term_2,"\n", append = TRUE, file = "CD1_GSE11501_correspondence.txt" )
}

#--------------------------------------------------------------------------------------
# Second Example
# B) GSE87629, GPL6947 (B-T lymphocytes)

url <- "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE87nnn/GSE87629/matrix/"
dataset <- getURL(url, ftp.use.epsv = FALSE, dirlistonly = TRUE)
dataset <- unlist(strsplit(dataset, "\r\n"))
for (ds in dataset) {
    download.file(paste(url,ds, sep = ""), paste(getwd(), "/",ds,sep = ""))
}

gse <- getGEO(filename = "GSE87629-GPL6947_series_matrix.txt.gz")
gse <- gse[,sampleNames(gse)!="GSM2335908"]

d <- factor(c('CD','CTRL','CD','CTRL','CD','CTRL','CD','CTRL','CD','CTRL',
              'CD','CTRL','CD','CTRL','CD','CTRL','CD','CTRL','CD','CTRL',
              'CD','CTRL','CD','CTRL','CD','CTRL','CD','CTRL','CD','CTRL',
              'CD','CTRL','CD','CTRL','CD','CTRL','CD','CTRL'))
mod <- model.matrix(~0+d)
fit_1 <- lmFit(gse, mod)
contr <- makeContrasts(dCD-dCTRL, levels = mod)
fit_2 <- contrasts.fit(fit = fit_1, contrasts = contr)
fit_3 <- eBayes(fit_2)

table_result <- topTable(fit_3, n = length(fit_3), adjust="BH", sort.by = "logFC")
subtable_result <- subset(table_result, select=c("ID","Symbol","adj.P.Val","P.Value","t","B","logFC"))
subtable_result <- subtable_result[is.na(subtable_result$ID)==0,]
geneList <- subtable_result$logFC
names(geneList) <- subtable_result$Symbol

write.csv2(subtable_result,"GSE87629_table.csv")
# nrow(subtable_result[subtable_result$P.Value<0.05,])

topDiffGenes <- function(allScore){
    return(abs(allScore)>1.00)
}

CD_GOdata <- new("topGOdata",
                 description = "cd study",
                 ontology = "BP", 
                 allGenes = geneList,
                 geneSel = topDiffGenes,
                 annot = annFUN.org, 
                 ID = "symbol", 
                 mapping = "org.Hs.eg.db",
                 nodeSize = 10)

n_sg <- sum(topDiffGenes(geneList))
sg <- sigGenes(CD_GOdata)
ug <- usedGO(CD_GOdata)

resultFisher <- runTest(CD_GOdata, algorithm = "classic", statistic = "fisher")
resultKS <- runTest(CD_GOdata, algorithm = "classic", statistic = "ks")

#pvalFis <- score(resultFisher)
#pvalKS <- score(resultKS,whichGO = names(pvalFis))
#cor_pval <- cor(pvalFis,pvalKS)

allRes <- GenTable(CD_GOdata, classic = resultFisher, KS = resultKS, orderBy = "classic", topNodes = 30)
showSigOfNodes(CD_GOdata,score(resultFisher),firstSigNodes = 5, useInfo = "all")
printGraph(CD_GOdata,resultFisher,firstSigNodes = 5, fn.prefix = "CD1_GSE87629", useInfo = "all", pdfSW = TRUE)

terms <- allRes$GO.ID
genes <- genesInTerm(CD_GOdata,terms)
for (i in 1:length(terms))
{
    term <- terms[i]
    genes_term <- genes[term][[1]]
    # find the genes that are in the list of genes of interest
    fact <- genes_term %in% sg
    genes_term_2 <- genes_term[fact == TRUE]
    genes_term_2 <- paste(genes_term_2, collapse=',')
    cat(term,"genes:",genes_term_2,"\n", append = TRUE, file = "CD1_GSE87629_correspondence.txt" )
}