# SCRIPT TWO: PATHWAY AND MATRICES (SEMANTIC SIMILIARY) # by Eugenio Del Prete # 1. Set your work folder setwd(".../workfolder") # 2. Load libraries for script two library(GOSemSim) library(readtext) library(stringr) library(factoextra) library(dendextend) library(corrplot) library(RColorBrewer) # 3. List the available correspondence files path <- c(".../Output_Txt") files <- readtext(path) files_split <- strsplit(files[,1],"_") id <- unlist(lapply(files_split, function(x) paste0(x[1],"_",x[2]))) # 4. Create list of genes related to each dataset go_term <- lapply(files[,2], function(x) str_extract_all(x,"GO:.{7}")) go_term_sub <- lapply(go_term, function(x) x[[1]][1:5]) names(go_term_sub) <- id gene_grasp <- function(text_gene){ aux_1 <- str_replace_all(text_gene,"GO:.{7} |\n|genes:", "") aux_2 <- strsplit(aux_1, " ") aux_3 <- strsplit(aux_2[[1]][2:6],",") aux_4 <- unique(unlist(aux_3)) return(aux_4) } gene_term_sub <- lapply(files[,2],gene_grasp) names(gene_term_sub) <- id # saveRDS(gene_term_sub, file = "gene_term_sub_5.Rda") # 5. Select the gene ontology and prepare the annotation hsGO <- godata('org.Hs.eg.db', ont="BP") hsGO2 <- godata('org.Hs.eg.db', keytype = "SYMBOL", ont="BP", computeIC=FALSE) # 6. Create semantic similarity matrices (GO terms, genes) len <- length(id) go_sem_sim_mat = gene_sem_sim_mat <- matrix(data = 0, nrow = len, ncol = len) rownames(go_sem_sim_mat) = rownames(gene_sem_sim_mat) <- id colnames(go_sem_sim_mat) = colnames(gene_sem_sim_mat) <- id for(k in 1:len){ for(kk in 1:len){ go_sem_sim_mat[k,kk] <- mgoSim(go_term_sub[[k]], go_term_sub[[kk]], semData=hsGO, measure="Wang", combine="BMA") gene_sem_sim_mat[k,kk] <- clusterSim(gene_term_sub[[k]], gene_term_sub[[kk]], semData=hsGO2, measure="Wang", combine="BMA") cat("k =",k,"kk =",kk,"per =", round(((k-1)*len+kk)/len^2,digits = 3)*100,"%\n") } } #saveRDS(go_sem_sim_mat,file="go_sem_sim_mat_5.Rda") #saveRDS(gene_sem_sim_mat,file="gene_sem_sim_mat_5.Rda") # 7. Plot the semantic similarity matrices library(DOSE) pdf(file = "go_mat_5.pdf", width = 10) simplot(go_sem_sim_mat, color.low="white", color.high="red", labs=TRUE, digits=2, labs.size=2, font.size=12, xlab="", ylab="") dev.off() pdf(file = "gene_mat_5.pdf", width = 10) simplot(gene_sem_sim_mat, color.low="white", color.high="red", labs=TRUE, digits=2, labs.size=2, font.size=12, xlab="", ylab="") dev.off() do_id <- c("DOID:986","DOID:11390","DOID:13375","DOID:2508","DOID:9810", "DOID:7188","DOID:10608", "DOID:10223","DOID:12236","DOID:0060054", "DOID:7148","DOID:676","DOID:12306") do_ac <- c("AA","ARa","ARb","ARc","ARd","AT","CD", "DM","PB","PN","RAa","RAb","VI") do_sem_sim_mat <- doSim(do_id,do_id,measure="Wang") rownames(do_sem_sim_mat) = colnames(do_sem_sim_mat) <- do_ac #saveRDS(do_sem_sim_mat,file="do_sem_sim_mat.Rda") pdf(file = "do_mat.pdf", width = 10) simplot(do_sem_sim_mat, color.low="white", color.high="red", labs=TRUE, digits=2, labs.size=2, font.size=20, xlab="", ylab="") dev.off() # 8. Create KEGG Enrichment graph library(clusterProfiler) corrisp <- lapply(gene_term_sub,function(x) bitr(x, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")) corrisp <- lapply(corrisp,"[","ENTREZID") corrisp <- lapply(corrisp, unlist) names(corrisp) = substr(names(corrisp),1,3) ccluster <- compareCluster(geneCluster = corrisp, fun = "enrichKEGG") pdf(file = "enrich_KEGG_5.pdf", width = 15) dotplot(ccluster, font.size = 9) dev.off()