knitr::opts_chunk$set(echo = TRUE)

In this document, we’ll walk through our gene expression analysis process. Beginning from the selection of dataset of interest to the visualization of our results. The codes and outputs will be shown.

# STEP 1: Read in data and assign to variable

gbm_data <- read.csv("glioblastoma.csv", row.names=1)

head(gbm_data)
            TCGA.19.4065.02A.11R.2005.01 TCGA.19.0957.02A.11R.2005.01

ENSG00000272398 763 4526 ENSG00000135439 2759 8384 ENSG00000130348 939 850 ENSG00000198719 231 1266 ENSG00000169429 540 512 ENSG00000171608 1282 720 TCGA.06.0152.02A.01R.2005.01 TCGA.14.1402.02A.01R.2005.01 ENSG00000272398 683 1820 ENSG00000135439 2763 294 ENSG00000130348 1250 1398 ENSG00000198719 817 459 ENSG00000169429 655 2891 ENSG00000171608 1694 264 TCGA.14.0736.02A.01R.2005.01 TCGA.06.5410.01A.01R.1849.01 ENSG00000272398 3113 284 ENSG00000135439 715 819 ENSG00000130348 519 982 ENSG00000198719 114 305 ENSG00000169429 1194 41877 ENSG00000171608 342 2286 TCGA.19.5960.01A.11R.1850.01 TCGA.14.0781.01B.01R.1849.01 ENSG00000272398 491251 949 ENSG00000135439 83504 771 ENSG00000130348 59301 1060 ENSG00000198719 165404 337 ENSG00000169429 135 7248 ENSG00000171608 413 1181 TCGA.02.2483.01A.01R.1849.01 TCGA.06.2570.01A.01R.1849.01 ENSG00000272398 22400 9955 ENSG00000135439 1504 21358 ENSG00000130348 1553 1716 ENSG00000198719 5404 4069 ENSG00000169429 1911 3736 ENSG00000171608 31702 1190

# STEP 2: Generate heatmaps and visualize clusters
library(gplots)
library(RColorBrewer)

# a- Define a color palette for heatmap
# Diverging color palettes

colors_diverging <- colorRampPalette(c("Blue", "White", "Brown"))(100)
heatmap.2(as.matrix(gbm_data), trace = 'none',
          scale='row', dendogram = 'col',
          Colv = TRUE, Rowv = FALSE,
          cexCol = 0.6,
          col = colors_diverging,
          main = "Diverging color scale")

# Sequential color palettes
heatmap.2(as.matrix(gbm_data), trace = 'none',
          scale='row', dendogram = 'col',
          Colv = TRUE, Rowv = FALSE,
          cexCol = 0.6,
          col = colorRampPalette(brewer.pal(11, "Blues"))(100),
          main = "Sequential color scale")

# b- Cluster by genes, samples and both
# Cluster by both genes and samples
heatmap.2(as.matrix(gbm_data), trace = 'none',
          scale='row', dendrogram = 'both',
          Colv = TRUE, Rowv = TRUE,
          cexCol = 0.6,
          col = colorRampPalette(brewer.pal(9, "YlGnBu"))(100),
          main = "Clustering by genes and samples")

# Cluster by rows (genes)
heatmap.2(as.matrix(gbm_data), trace = 'none',
          scale='row', dendrogram = 'row',
          Colv = FALSE, Rowv = TRUE,
          cexCol = 0.6,
          col = colorRampPalette(brewer.pal(9, "YlGnBu"))(100),
          main = "Clustering by genes")

# Cluster by columns (samples)
heatmap.2(as.matrix(gbm_data), trace = 'none',
          scale='row', dendrogram = 'col',
          Colv = TRUE, Rowv = FALSE,
          cexCol = 0.6,
          col = colorRampPalette(brewer.pal(9, "YlGnBu"))(100),
          main = "Clustering by samples")

# STEP 3: Subset genes that are significantly upregulated and downregulated

# a- First, calculate fold change and pvalue

# Divide the groups based on the clusters from the heatmap

# Get column names
colnames(gbm_data)

[1] “TCGA.19.4065.02A.11R.2005.01” “TCGA.19.0957.02A.11R.2005.01” [3] “TCGA.06.0152.02A.01R.2005.01” “TCGA.14.1402.02A.01R.2005.01” [5] “TCGA.14.0736.02A.01R.2005.01” “TCGA.06.5410.01A.01R.1849.01” [7] “TCGA.19.5960.01A.11R.1850.01” “TCGA.14.0781.01B.01R.1849.01” [9] “TCGA.02.2483.01A.01R.1849.01” “TCGA.06.2570.01A.01R.1849.01”

# Select groups by index positions, group A and group B
groupA <- c(1,2,3,4,5)
groupB <- c(6,7,8,9,10)

groupA_data <- gbm_data[, groupA]
groupB_data <- gbm_data[, groupB]

# View data
View(groupA_data)
View(groupB_data)

# Get means of both groups to calculate fold change
groupA_mean <- rowMeans(groupA_data)
groupB_mean <- rowMeans(groupB_data)

# Calculate log fold change
logFC <- log2(groupB_mean+0.5) - log2(groupA_mean+0.5)


# Calculate p values using Wilcoxon test
pvalues <- apply(gbm_data, 1, function(row) {
  wilcox.test(row[1:5], row[6:10])$p.value
})

plot(logFC, -log10(pvalues))

# Create a dataframe that has the logFC and pvalue for each gene
library(dplyr)
DEG <- tibble(logFC, pvalues)
View(DEG)
rownames(DEG) <- rownames(gbm_data)

# b- Subset upregulated and downregulated genes
upreg_genes <- DEG %>% filter(logFC >= 1.5 & pvalues<= 0.05)
View(upreg_genes)

downreg_genes <- DEG %>% filter(logFC <= -1.5 & pvalues<= 0.05)
View(downreg_genes)                              

# writing the up and downregulated genes 
write.csv(upreg_genes,"upregul_genes.csv",row.names=TRUE)  
write.csv(downreg_genes,"downregul_genes.csv",row.names=TRUE)

# c- Get gene symbol from Ensembl
# Connect to Ensembl
library(biomaRt)
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
ensembl_ids <- rownames(DEG)

gene_annotations <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  filters = "ensembl_gene_id",                     
  values = ensembl_ids,                           
  mart = ensembl                                    
)

head(gene_annotations)

ensembl_gene_id hgnc_symbol 1 ENSG00000006128 TAC1 2 ENSG00000006453 BAIAP2L1 3 ENSG00000006611 USH1C 4 ENSG00000027644 INSRR 5 ENSG00000034239 CLXN 6 ENSG00000039537 C6

# Create a list of gene annotations
keys <- gene_annotations$ensembl_gene_id
values <- gene_annotations$hgnc_symbol

l <- list()
for (i in 1:length(keys)){
  l[keys[i]] <- values[i]
}

# Read in upregulated genes file and assign to variable
upreg_genes_with_symbol <- read.csv("upregul_genes.csv", row.names = 1)


# add new column "symbol" to upreg_genes_with_symbol
upreg_genes_with_symbol$symbol <- unlist(l[rownames(upreg_genes_with_symbol)], use.names = FALSE)

# Read in downregulated genes file and assign to variable
downreg_genes_with_symbol <- read.csv("downregul_genes.csv", row.names = 1)


# add new column "symbol" to downreg_genes_with_symbol
downreg_genes_with_symbol$symbol <- unlist(l[rownames(downreg_genes_with_symbol)], use.names = FALSE)

write.csv(upreg_genes_with_symbol, "upregul_genes_with_symb.csv", row.names=FALSE)
write.csv(downreg_genes_with_symbol, "downregul_genes_with_symb.csv", row.names=FALSE)

View(downreg_genes_with_symbol)
View(upreg_genes_with_symbol)
# STEP 4/5- Functional enrichment and Visualisation of top pathways

# Import upregulated enriched pathways (ShinyGO)
library(tibble)
upreg_pathways <- read.csv("upreg_top_pathways.csv")

# Getting the name of the pathways only in new column
upreg_pathways <- upreg_pathways %>%
  mutate(Pathways_names = trimws(sub("GO:........","",upreg_pathways$Pathway)))


# Lollipop plot of the top 5 upregulated enriched pathways color scaling the
# points according to - log 10 of the p_val
library(ggplot2)

ggplot(upreg_pathways[c(1:5),], aes(x = Pathways_names, y = nGenes)) + # defining the data and x,y axes
  geom_segment(aes(x = Pathways_names, xend = Pathways_names,
                   y = 0, yend = nGenes),
               color="black") +  # drawing segments of the lollipop in black color
  geom_point(aes(color=-log10(Enrichment.FDR)),
             size=8) + # Drawing the points of the lollipop and color scale it by - log 10 of the p_val
  scale_color_gradient(low = "blue",high = "red")+ # Specify the color scaling from blue to red
  coord_flip()+ # Flipping the axes for better visualization 
  theme_grey()+ # Using grey theme 
  labs(title = "Top 5 upregulated pathways: Gene Number and Significance Levels",
       x = "pathway",
       y = "gene_num",
       color="-log10 p_val")+    # specifying the labels for each element in the plot
  theme(plot.title = element_text(size = 20),
        axis.title = element_text(size = 16),
        axis.text = element_text(size = 14)) # Adjusting text sizes in the plot

For more details on the tools and libraries used for each step in this analysis,
kindly check the README file in the Stage 2 folder in this repo.