# Script:       Step2_EnrichmentAnalysis.R
# Description:  In this script, we will explore the differential gene 
#               expression dataset comparing lung cancer vs. healthy tissue
#               samples. The RNA-sequencing dataset was retrieved from 
#               TCGA (The Cancer Genome Atlas) and pre-processed in R. 
#               Differential gene expression analysis was performed with the 
#               DESeq2 R-package. 
# Version: 3.0
# Last updated: 2025-06-24
# Author: mkutmon & peiprJS

# #############################################
# GENE ONTOLOGY ENRICHMENT ANALYSIS
# #############################################

# We will first explore what kind of biological processes are affected by 
# performing a Gene Ontology enrichment analysis. 

# We will start by looking at processes that are up-regulated
res.go.up <- clusterProfiler::enrichGO(genes.up$GeneID, OrgDb = "org.Hs.eg.db", 
                   keyType="ENSEMBL", universe=data$GeneID, ont = "BP", 
                   pAdjustMethod = "BH", qvalueCutoff = 0.05, 
                   minGSSize = 20, maxGSSize = 400)
res.go.up.df <- as.data.frame(res.go.up)
write.table(res.go.up.df, file=paste0(out.folder,"go-up.txt"), sep="\t", row.names = FALSE, col.names = TRUE, quote = FALSE)

res.go.down <- clusterProfiler::enrichGO(genes.down$GeneID, OrgDb = "org.Hs.eg.db", 
                                       keyType="ENSEMBL", universe=data$GeneID, ont = "BP", 
                                       pAdjustMethod = "BH", qvalueCutoff = 0.05, 
                                       minGSSize = 20, maxGSSize = 400)
res.go.down.df <- as.data.frame(res.go.down)
write.table(res.go.down.df, file=paste0(out.folder,"go-down.txt"), sep="\t", row.names = FALSE, col.names = TRUE, quote = FALSE)

# Both lists are very long and difficult to interpret. Let's see if the 
# treeplots (discussed in workshop 1) help with the interpretation:
# we first calculate the similarity between the processes
res.go.up.sim <- enrichplot::pairwise_termsim(res.go.up)
res.go.down.sim <- enrichplot::pairwise_termsim(res.go.down)

# then we visualize them in a treeplot (this will only look at the top 100 terms)
treeplot(res.go.up.sim, label_format = 0.5, showCategory = 100, cluster.params = list(n = 15, label_words_n = 0))

treeplot(res.go.down.sim, label_format = 0.5, showCategory = 100, cluster.params = list(n = 15, label_words_n = 0))

# we will also save files that has a nicely readable figure
filename <- paste0(out.folder,"GO_Upregulated_Treeplot.png")
png(filename , width = 3000, height = 4000, res = 150)
plot(treeplot(res.go.up.sim, label_format = 0.5, showCategory = 100, cluster.params = list(n = 15, label_words_n = 0)))
dev.off()
## png 
##   2
filename <- paste0(out.folder,"GO_Downregulated_Treeplot.png")
png(filename , width = 3000, height = 4000, res = 150)
plot(treeplot(res.go.down.sim, label_format = 0.5, showCategory = 100, cluster.params = list(n = 15, label_words_n = 0)))
dev.off()
## png 
##   2
# #############################################
# PATHWAY ENRICHMENT ANALYSIS
# #############################################

# We will now explore what kind of pathways are affected by 
# performing a pathway enrichment analysis. 

# Let's retrieve information about the human pathways in WikiPathways
genesets.wp <- msigdbr(species = "Homo sapiens", subcollection = "CP:WIKIPATHWAYS") %>% dplyr::select(gs_name, ensembl_gene)

# Let's first look for altered pathways (up- and down-regulated genes together)
# Most often when looking at pathways, we look for up- and down-regulated genes together
res.wp <- clusterProfiler::enricher(degs$GeneID, TERM2GENE = genesets.wp, pAdjustMethod = "fdr", pvalueCutoff = 0.1, minGSSize = 5, maxGSSize = 400)
res.wp.df <- as.data.frame(res.wp)
write.table(res.wp.df, file=paste0(out.folder,"wp-degs.txt"), sep="\t", row.names = FALSE, col.names = TRUE, quote = FALSE)


# For exploration, we can also check if there are pathways that are enriched for up- or down-regulated genes separately
res.wp.up <- clusterProfiler::enricher(genes.up$GeneID, TERM2GENE = genesets.wp, pAdjustMethod = "fdr", pvalueCutoff = 0.1, minGSSize = 5, maxGSSize = 400)
res.wp.up.df <- as.data.frame(res.wp.up)
write.table(res.wp.up.df, file=paste0(out.folder,"wp-up.txt"), sep="\t", row.names = FALSE, col.names = TRUE, quote = FALSE)

res.wp.down <- clusterProfiler::enricher(genes.down$GeneID, TERM2GENE = genesets.wp, pAdjustMethod = "fdr", pvalueCutoff = 0.1, minGSSize = 5, maxGSSize = 400)
res.wp.down.df <- as.data.frame(res.wp.down)
write.table(res.wp.down.df, file=paste0(out.folder,"wp-down.txt"), sep="\t", row.names = FALSE, col.names = TRUE, quote = FALSE)


# Let's see if the treeplots (discussed in workshop 1) help with the interpretation:
# we first calculate the similarity between the processes
res.wp.sim <- enrichplot::pairwise_termsim(res.wp)
res.wp.up.sim <- enrichplot::pairwise_termsim(res.wp.up)
res.wp.down.sim <- enrichplot::pairwise_termsim(res.wp.down)

# then we visualize them in a treeplot
treeplot(res.wp.sim, label_format = 0.5, showCategory = 100, cluster.params = list(label_words_n = 0))

treeplot(res.wp.up.sim, label_format = 0.5, showCategory = 100, cluster.params = list(label_words_n = 0))

treeplot(res.wp.down.sim, label_format = 0.5, showCategory = 100, cluster.params = list(label_words_n = 0))

# we will also save files that has a nicely readable figure

filename <- paste0(out.folder,"WP_DEGs_Treeplot.png")
png(filename , width = 3000, height = 2000, res = 150)
plot(treeplot(res.wp.sim, label_format = 0.5, showCategory = 100, cluster.params = list(label_words_n = 0)))
dev.off()
## png 
##   2
filename <- paste0(out.folder,"WP_Upregulated_Treeplot.png")
png(filename , width = 3000, height = 2000, res = 150)
plot(treeplot(res.wp.up.sim, label_format = 0.5, showCategory = 100, cluster.params = list(label_words_n = 0)))
dev.off()
## png 
##   2
filename <- paste0(out.folder,"WP_Downregulated_Treeplot.png")
png(filename , width = 3000, height = 2000, res = 150)
plot(treeplot(res.wp.down.sim, label_format = 0.5, showCategory = 100, cluster.params = list(label_words_n = 0)))
dev.off()
## png 
##   2