# Script:       Step1_DataExploration.R
# Description:  In this script, we will explore the differential gene 
#               expression dataset comparing lung cancer vs. healthy tissue
#               samples. The RNA-sequencing dataset was retrieved from 
#               TCGA (The Cancer Genome Atlas) and pre-processed in R. 
#               Differential gene expression analysis was performed with the 
#               DESeq2 R-package. 
# Version: 3.0
# Last updated: 2025-06-24
# Author: mkutmon & peiprJS

# #############################################
# DIFFERENTIALLY EXPRESSED GENES
# #############################################

# Let's see how many genes are differentially expressed in our dataset
degs <- data[abs(data$log2FC) > log2fc.cutoff & data$adj.P.Value < pvalue.cutoff,]

# let's write out the table with all differentially expressed genes
write.table(degs, file=paste0(out.folder,"degs.tsv"), row.names = FALSE, sep="\t", quote = FALSE)

log2fc.cutoff <- -1
genes.down <- data[data$log2FC < log2fc.cutoff & data$adj.P.Value < pvalue.cutoff,]
write.table(degs, file=paste0(out.folder,"degs_down.tsv"), row.names = FALSE, sep="\t", quote = FALSE)

log2fc.cutoff <- 1
genes.up <- data[data$log2FC > log2fc.cutoff & data$adj.P.Value < pvalue.cutoff,]
write.table(degs, file=paste0(out.folder,"degs_up.tsv"), row.names = FALSE, sep="\t", quote = FALSE)

# #############################################
# VOLCANO PLOT
# #############################################

# Let's create a Volcano plot to get a better understand of the intensity and
# direction of the changed genes

EnhancedVolcano(data, title = paste0("Breast cancer vs. Healthy (",nrow(degs), " DEGs)"), lab = data$GeneName, x = "log2FC", y = "adj.P.Value", pCutoff = pvalue.cutoff, FCcutoff = log2fc.cutoff, labSize = 3, xlim = c(-15,15), ylim=c(0,4.5))

# the code below saves the figure in a file in our output folder
filename <- paste0(out.folder,"volcano-plot.png")
png(filename , width = 2000, height = 1500, res = 150)
EnhancedVolcano(data, title = paste0("Breast cancer vs. Healthy (",nrow(degs), " DEGs)"), lab = data$GeneName, x = "log2FC", y = "adj.P.Value", pCutoff = pvalue.cutoff, FCcutoff = log2fc.cutoff, labSize = 3, xlim = c(-15,15), ylim=c(0,4.5))
dev.off()
## png 
##   2