Show code

library(SingleCellExperiment)
library(here)
library(scater)
library(scran)
library(ggplot2)
library(cowplot)
library(edgeR)
library(Glimma)
library(BiocParallel)
library(patchwork)
library(janitor)
library(pheatmap)
library(batchelor)
library(rmarkdown)
library(BiocStyle)
library(readxl)
library(dplyr)
library(tidyr)
library(ggrepel)
library(magrittr)

knitr::opts_chunk$set(fig.path = "C094_Pellicci.single-cell.annotate.whole_cell_files/")

Preparing the data

We start from the cell selected SingleCellExperiment object created in ‘Merging cells for Pellicci gamma-delta T-cell dataset (whole cell)’.

Show code

sce <- readRDS(here("data", "SCEs", "C094_Pellicci.single-cell.merged.whole_cell.SCE.rds"))

# pre-create directories for saving export, or error (dir not exists)
dir.create(here("data", "marker_genes", "whole_cell"), recursive = TRUE)
dir.create(here("output", "marker_genes", "whole_cell"), recursive = TRUE)

# remove "Unknown" (as it is not informative at all)
sce <- sce[, sce$stage != "Unknown"]
colData(sce) <- droplevels(colData(sce))

# Some useful colours
plate_number_colours <- setNames(
  unique(sce$colours$plate_number_colours),
  unique(names(sce$colours$plate_number_colours)))
plate_number_colours <- plate_number_colours[levels(sce$plate_number)]
tissue_colours <- setNames(
  unique(sce$colours$tissue_colours),
  unique(names(sce$colours$tissue_colours)))
tissue_colours <- tissue_colours[levels(sce$tissue)]
donor_colours <- setNames(
  unique(sce$colours$donor_colours),
  unique(names(sce$colours$donor_colours)))
donor_colours <- donor_colours[levels(sce$donor)]
stage_colours <- setNames(
  unique(sce$colours$stage_colours),
  unique(names(sce$colours$stage_colours)))
stage_colours <- stage_colours[levels(sce$stage)]
group_colours <- setNames(
  unique(sce$colours$group_colours),
  unique(names(sce$colours$group_colours)))
group_colours <- group_colours[levels(sce$group)]
cluster_colours <- setNames(
  unique(sce$colours$cluster_colours),
  unique(names(sce$colours$cluster_colours)))
cluster_colours <- cluster_colours[levels(sce$cluster)]

# Some useful gene sets
mito_set <- rownames(sce)[any(rowData(sce)$ENSEMBL.SEQNAME == "MT")]
ribo_set <- grep("^RP(S|L)", rownames(sce), value = TRUE)
# NOTE: A more curated approach for identifying ribosomal protein genes
#       (https://github.com/Bioconductor/OrchestratingSingleCellAnalysis-base/blob/ae201bf26e3e4fa82d9165d8abf4f4dc4b8e5a68/feature-selection.Rmd#L376-L380)
library(msigdbr)
c2_sets <- msigdbr(species = "Homo sapiens", category = "C2")
ribo_set <- union(
  ribo_set,
  c2_sets[c2_sets$gs_name == "KEGG_RIBOSOME", ]$gene_symbol)
ribo_set <- intersect(ribo_set, rownames(sce))
sex_set <- rownames(sce)[any(rowData(sce)$ENSEMBL.SEQNAME %in% c("X", "Y"))]
pseudogene_set <- rownames(sce)[
  any(grepl("pseudogene", rowData(sce)$ENSEMBL.GENEBIOTYPE))]
# NOTE: not suggest to narrow down into protein coding genes (pcg) as it remove all significant candidate in most of the comparison !!!
protein_coding_gene_set <- rownames(sce)[
  any(grepl("protein_coding", rowData(sce)$ENSEMBL.GENEBIOTYPE))]

Show code

# include part of the FACS data (for plot of heatmap)
facs <- t(assays(altExp(sce, "FACS"))$pseudolog)
facs_markers <- grep("V525_50_A_CD4_BV510|B530_30_A_CD161_FITC", colnames(facs), value = TRUE)
facs_selected <- facs[,facs_markers]
colnames(facs_selected) <- c("CD161", "CD4")
colData(sce) <- cbind(colData(sce), facs_selected)

Re-clustering

NOTE: Based on our explorative data analyses (EDA) on the whole cell, we conclude the optimal number of clusters for demonstrating the heterogeneity of the dataset, we therefore re-cluster in here. Also, as indicated by Dan during our online meeting on 11 Aug 2011, we need to use different numbering and colouring for clusters in different subsets of the dataset (to prepare for publication), we perform all these by the following script.

Show code

set.seed(4759)
snn_gr <- buildSNNGraph(sce, use.dimred = "corrected", k=5)
clusters <- igraph::cluster_louvain(snn_gr)
sce$cluster <- factor(clusters$membership)

# NOTE: no re-numbering of the clusters is needed for "whole cell"
# NOTE: re-colouring is needed in here as the original "tableau10medium" does not have enough number of colours
cluster_colours <- setNames(
  scater:::.get_palette("tableau10medium")[seq_len(nlevels(sce$cluster))],
  levels(sce$cluster))
sce$colours$cluster_colours <- cluster_colours[sce$cluster]

After the re-clustering, there are 5 clusters for whole cell of the dataset.

Data overview

Show code

p1 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "cluster", theme_size = 7, point_size = 0.2) +
  scale_colour_manual(values = cluster_colours, name = "cluster")
p2 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "stage", theme_size = 7, point_size = 0.2) +
  scale_colour_manual(values = stage_colours, name = "stage")
p3 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "plate_number", theme_size = 7, point_size = 0.2) +
  scale_colour_manual(values = plate_number_colours, name = "plate_number")
p4 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "tissue", theme_size = 7, point_size = 0.2) +
  scale_colour_manual(values = tissue_colours, name = "tissue")
p5 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "donor", theme_size = 7, point_size = 0.2) +
  scale_colour_manual(values = donor_colours, name = "donor")
p6 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "group", theme_size = 7, point_size = 0.2) +
  scale_colour_manual(values = group_colours, name = "group")
(p1 | p2) / (p3 | p4) / (p5 | p6)

Figure 1: UMAP plot, where each point represents a cell and is coloured according to the legend.

Show code

# summary - stacked barplot
p1 <- ggcells(sce) +
  geom_bar(aes(x = cluster, fill = cluster)) +
  coord_flip() +
  ylab("Number of samples") +
  theme_cowplot(font_size = 8) +
  scale_fill_manual(values = cluster_colours) +
  geom_text(stat='count', aes(x = cluster, label=..count..), hjust=1.5, size=2)
p2 <- ggcells(sce) +
  geom_bar(
    aes(x = cluster, fill = stage),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = stage_colours) +
  theme_cowplot(font_size = 8)
p3 <- ggcells(sce) +
  geom_bar(
    aes(x = cluster, fill = plate_number),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = plate_number_colours) +
  theme_cowplot(font_size = 8)
p4 <- ggcells(sce) +
  geom_bar(
    aes(x = cluster, fill = tissue),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = tissue_colours) +
  theme_cowplot(font_size = 8)
p5 <- ggcells(sce) +
  geom_bar(
    aes(x = cluster, fill = donor),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = donor_colours) +
  theme_cowplot(font_size = 8)
p6 <- ggcells(sce) +
  geom_bar(
    aes(x = cluster, fill = group),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = group_colours) +
  theme_cowplot(font_size = 8)
(p1 | p2) / (p3 | p4) / (p5 | p6)

Figure 2: Breakdown of clusters by experimental factors.

NOTE: Considering the fact that SingleR with use of the annotation reference (Monaco Immune Cell Data) most relevant to the gamma-delta T cells (even annotated at cell level) could not further sub-classify the developmental stage/subtype of them (either annotating cluster as Th1 cell-/Naive CD8/CD4 T cell or Vd2gd T cells-alike) [ref: EDA_annotation_SingleR_MI_fine_cell_level.R], we decide to characterize the clusters by manual detection and curation of specific marker genes directly.

Marker gene detection

To interpret our clustering results, we identify the genes that drive separation between clusters. These marker genes allow us to assign biological meaning to each cluster based on their functional annotation. In the most obvious case, the marker genes for each cluster are a priori associated with particular cell types, allowing us to treat the clustering as a proxy for cell type identity. The same principle can be applied to more subtle differences in activation status or differentiation state.

Identification of marker genes is usually based around the retrospective detection of differential expression between clusters¹. Genes that are more strongly DE are more likely to have driven cluster separation in the first place. The top DE genes are likely to be good candidate markers as they can effectively distinguish between cells in different clusters.

The Welch t-test is an obvious choice of statistical method to test for differences in expression between clusters. It is quickly computed and has good statistical properties for large numbers of cells (Soneson and Robinson 2018).

Show code

# block on plate
sce$block <- paste0(sce$plate_number)

Cluster `1` vs. `2` vs. `3` vs. `4` vs. `5`

Here we look for the unique up-regulated markers of each cluster when compared to the all remaining ones. For instance, unique markers of cluster 1 refer to the genes significantly up-regulated in all of these comparisons: cluster 2 vs. 1 and cluster 3 vs. 1 and cluster 4 vs. 1 and cluster 5 vs. 1.

Show code

###################################
# (M1) raw unique
#
# cluster 1 (i.e. S3.mix.higher.thymus
# cluster 2 (i.e. S3.mix.with.blood.1
# cluster 3 (i.e. S3.mix.with.blood.2
# cluster 4 (i.e. S3.mix.with.blood.3
# cluster 5 (i.e. mostly S1 and S2

# find unique DE ./. clusters
uniquely_up <- findMarkers(
  sce,
  groups = sce$cluster,
  block = sce$block,
  pval.type = "all",
  direction = "up")

Show code

# export DGE lists
saveRDS(
  uniquely_up,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_1_vs_2_vs_3_vs_4_vs_5.rds"),
  compress = "xz")

dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "cluster_1_vs_2_vs_3_vs_4_vs_5"), recursive = TRUE)

vs_pair <- c("1", "2", "3", "4", "5")

message("Writing 'uniquely_up (cluster_1_vs_2_vs_3_vs_4_vs_5)' marker genes to file.")
for (n in names(uniquely_up)) {
  message(n)
  gzout <- gzfile(
    description = here(
      "output",
      "marker_genes",
      "whole_cell",
      "uniquely_up",
      "cluster_1_vs_2_vs_3_vs_4_vs_5",
      paste0("cluster_",
             vs_pair[which(names(uniquely_up) %in% n)],
             "_vs_",
             vs_pair[-which(names(uniquely_up) %in% n)][1],
             "_vs_",
             vs_pair[-which(names(uniquely_up) %in% n)][2],
             "_vs_",
             vs_pair[-which(names(uniquely_up) %in% n)][3],
             "_vs_",
             vs_pair[-which(names(uniquely_up) %in% n)][4],
             ".uniquely_up.csv.gz")),
    open = "wb")
  write.table(
    x = uniquely_up[[n]] %>%
      as.data.frame() %>%
      tibble::rownames_to_column("gene_ID"),
    file = gzout,
    sep = ",",
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE)
  close(gzout)
}

Show code

# NOTE: The following is a workaround to the lack of support for tabsets in 
#       distill (see https://github.com/rstudio/distill/issues/11 and 
#       https://github.com/rstudio/distill/issues/11#issuecomment-692142414 in 
#       particular).
xaringanExtra::use_panelset()

`Cluster 1`

Show code

##########################################
# look at cluster 1 (i.e. S3.mix.higher.thymus)
chosen <- "1"
cluster1_uniquely_up <- uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(S3.mix.higher.thymus)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# cluster1_uniquely_up <- cluster1_uniquely_up[intersect(protein_coding_gene_set, rownames(cluster1_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
cluster1_uniquely_up_noiseR <- cluster1_uniquely_up[setdiff(rownames(cluster1_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(cluster1_uniquely_up_noiseR) %in% "CD4"),
       cluster1_uniquely_up_noiseR[which(rownames(cluster1_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(cluster1_uniquely_up_noiseR) %in% "KLRB1"),
       cluster1_uniquely_up_noiseR[which(rownames(cluster1_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only
best_set <- cluster1_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  sce,
  features = rownames(best_set),
  columns = order(
    sce$cluster,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$group,
    sce$plate_number,
    sce$CD4,
    sce$CD161),
  colour_columns_by = c(
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  # annotation_row = data.frame(
  #   Sig = factor(
  #     ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
  #     # TODO: temp trick to deal with the row-colouring problem
  #     # levels = c("Yes", "No")),
  #     levels = c("Yes")),
  #   row.names = rownames(best_set)),
  main = paste0("Cluster ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title

Figure 3: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title

`Cluster 2`

Show code

##########################################
# look at cluster 2 (i.e. S3.mix.with.blood.1)
chosen <- "2"
cluster2_uniquely_up <- uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(S3.mix.with.blood.1)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# cluster2_uniquely_up <- cluster2_uniquely_up[intersect(protein_coding_gene_set, rownames(cluster2_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
cluster2_uniquely_up_noiseR <- cluster2_uniquely_up[setdiff(rownames(cluster2_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(cluster2_uniquely_up_noiseR) %in% "CD4"),
       cluster2_uniquely_up_noiseR[which(rownames(cluster2_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(cluster2_uniquely_up_noiseR) %in% "KLRB1"),
       cluster2_uniquely_up_noiseR[which(rownames(cluster2_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only
best_set <- cluster2_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  sce,
  features = rownames(best_set),
  columns = order(
    sce$cluster,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$group,
    sce$plate_number,
    sce$CD4,
    sce$CD161),
  colour_columns_by = c(
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  # annotation_row = data.frame(
  #   Sig = factor(
  #     ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
  #     # TODO: temp trick to deal with the row-colouring problem
  #     # levels = c("Yes", "No")),
  #     levels = c("Yes")),
  #   row.names = rownames(best_set)),
  main = paste0("Cluster ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 4: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title

`Cluster 3`

Show code

##########################################
# look at cluster 3 (i.e. S3.mix.with.blood.2)
chosen <- "3"
cluster3_uniquely_up <- uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(S3.mix.with.blood.2)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# cluster3_uniquely_up <- cluster3_uniquely_up[intersect(protein_coding_gene_set, rownames(cluster3_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
cluster3_uniquely_up_noiseR <- cluster3_uniquely_up[setdiff(rownames(cluster3_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(cluster3_uniquely_up_noiseR) %in% "CD4"),
       cluster3_uniquely_up_noiseR[which(rownames(cluster3_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(cluster3_uniquely_up_noiseR) %in% "KLRB1"),
       cluster3_uniquely_up_noiseR[which(rownames(cluster3_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only
best_set <- cluster3_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  sce,
  features = rownames(best_set),
  columns = order(
    sce$cluster,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$group,
    sce$plate_number,
    sce$CD4,
    sce$CD161),
  colour_columns_by = c(
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 5: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title

`Cluster 4`

Show code

##########################################
# look at cluster 4 (i.e. S3.mix.with.blood.3)
chosen <- "4"
cluster4_uniquely_up <- uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(S3.mix.with.blood.3)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# cluster4_uniquely_up <- cluster4_uniquely_up[intersect(protein_coding_gene_set, rownames(cluster4_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
cluster4_uniquely_up_noiseR <- cluster4_uniquely_up[setdiff(rownames(cluster4_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(cluster4_uniquely_up_noiseR) %in% "CD4"),
       cluster4_uniquely_up_noiseR[which(rownames(cluster4_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(cluster4_uniquely_up_noiseR) %in% "KLRB1"),
       cluster4_uniquely_up_noiseR[which(rownames(cluster4_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only
best_set <- cluster4_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  sce,
  features = rownames(best_set),
  columns = order(
    sce$cluster,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$group,
    sce$plate_number,
    sce$CD4,
    sce$CD161),
  colour_columns_by = c(
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 6: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title

`Cluster 5`

Show code

##########################################
# look at cluster 5 (i.e. mostly.S1.S2.mix)
chosen <- "5"
cluster5_uniquely_up <- uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(mostly.S1.S2.mix)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# cluster5_uniquely_up <- cluster5_uniquely_up[intersect(protein_coding_gene_set, rownames(cluster5_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
cluster5_uniquely_up_noiseR <- cluster5_uniquely_up[setdiff(rownames(cluster5_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(cluster5_uniquely_up_noiseR) %in% "CD4"),
       cluster5_uniquely_up_noiseR[which(rownames(cluster5_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(cluster5_uniquely_up_noiseR) %in% "KLRB1"),
       cluster5_uniquely_up_noiseR[which(rownames(cluster5_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only
best_set <- cluster5_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  sce,
  features = rownames(best_set),
  columns = order(
    sce$cluster,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$group,
    sce$plate_number,
    sce$CD4,
    sce$CD161),
  colour_columns_by = c(
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 7: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title

DGE lists of these comparisons are available in output/marker_genes/whole_cell/uniquely_up/cluster_1_vs_2_vs_3_vs_4_vs_5/.

Summary: Stemmed from the fact that the cluster 1, 2, and 4 are too similar to each other, except for cluster 3 (characterized by up-regulation of IL7R and LTB) and 5 (lots of strong and unique markers such as TCF7, LEF1, SOX4, etc.), we cannot determine sufficient number of marker genes for characterizing all other clusters.

With this regard, apart from making “all pairwise comparisons” between “all clusters” to pinpoint DE unique to each cluster as above, we took an alternative path and determined the DE unique to only the “selected pairwise comparisons” between “clusters” below. Say, for cluster 3, we determine markers that is significantly up-regulated in at least one of these comparisons: cluster 1 vs. 3 or cluster 2 vs. 3 or cluster 4 vs. 3 or cluster 5 vs. 3.

Besides, we also look into the the pairwise comparisons between the interesting “cluster-groups.” For instance, cluster 5 is the only cluster relatively distant from the main groups in the UMAP (Figure 1), thus it would be interesting to know what drive cluster 5 away from the merged group of cluster 1, 2, 3 and 4, or how cluster 1 (S3.mix.higher.thymus) is different from the cluster 2, 3 and 4 (S3.mix.with.blood), etc.

Here are the list of pairwise comparisons and what they are anticipated to achieve when compared:

fx: how clusters of the S3-mix different from that of the S1-S2-mix

1 vs 5 (A vs B)
2 vs 5 (C vs D)
3 vs 5 (E vs F)
4 vs 5 (G vs H)
2_3_4 vs 5 (I vs J)

fx: difference among the clusters of S3-mix (higher thymus vs with blood)

2 vs 1 (K vs L)
3 vs 1 (M vs N)
4 vs 1 (O vs P)
2_3_4 vs 1 (Q vs R)

fx: how special is cluster `3` in the clusters of S3-mix

2 vs 3 (S vs T)
4 vs 3 (U vs V)
2_4 vs 3 (W vs X)

FX: difference between cluster `2` and cluster `4` (in S3-mix with blood)

2 vs 4 (Y vs Z)

Selected pairwise comparisons

Show code

# NOTE: The following is a workaround to the lack of support for tabsets in 
#       distill (see https://github.com/rstudio/distill/issues/11 and 
#       https://github.com/rstudio/distill/issues/11#issuecomment-692142414 in 
#       particular).
xaringanExtra::use_panelset()

`cluster_1_vs_cluster_5`

Show code

#########
# A vs B
#########

##########################################################################
# cluster 1 (i.e. S3.mix.higher.thymus) vs cluster 5 (i.e. mostly.S1.S2.mix)

# checkpoint
cp <- sce
# exclude cells of uninterested cluster from cp
cp <- cp[, cp$cluster == "1" | cp$cluster == "5"]
colData(cp) <- droplevels(colData(cp))

# classify cluster-group for comparison
cp$vs1 <- factor(ifelse(cp$cluster == 1, "A", "B"))

# set vs colours
vs1_colours <- setNames(
  palette.colors(nlevels(cp$vs1), "Set1"),
  levels(cp$vs1))
cp$colours$vs1_colours <- vs1_colours[cp$vs1]

# find unique DE ./. cluster-groups
vs1_uniquely_up <- findMarkers(
  cp,
  groups = cp$vs1,
  block = cp$block,
  pval.type = "all",
  direction = "up")

# export DGE lists
saveRDS(
  vs1_uniquely_up,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_1_vs_5.rds"),
  compress = "xz")

dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "cluster_1_vs_5"), recursive = TRUE)

vs_pair <- c("1", "5")

message("Writing 'uniquely_up (cluster_1_vs_5)' marker genes to file.")
for (n in names(vs1_uniquely_up)) {
  message(n)
  gzout <- gzfile(
    description = here(
      "output",
      "marker_genes",
      "whole_cell",
      "uniquely_up",
      "cluster_1_vs_5",
      paste0("cluster_",
             vs_pair[which(names(vs1_uniquely_up) %in% n)],
             "_vs_",
             vs_pair[-which(names(vs1_uniquely_up) %in% n)][1],
             ".uniquely_up.csv.gz")),
    open = "wb")
  write.table(
    x = vs1_uniquely_up[[n]] %>%
      as.data.frame() %>%
      tibble::rownames_to_column("gene_ID"),
    file = gzout,
    sep = ",",
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE)
  close(gzout)
}

Show code

##########################################################
# look at cluster-group A / cluster 1 (i.e. S3.mix.higher.thymus)
chosen <- "A"
A_uniquely_up <- vs1_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 1; S3.mix.higher.thymus)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# A_uniquely_up_pcg <- A_uniquely_up[intersect(protein_coding_gene_set, rownames(A_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
A_uniquely_up_noiseR <- A_uniquely_up[setdiff(rownames(A_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(A_uniquely_up_noiseR) %in% "CD4"),
       A_uniquely_up_noiseR[which(rownames(A_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(A_uniquely_up_noiseR) %in% "KLRB1"),
       A_uniquely_up_noiseR[which(rownames(A_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- A_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs1,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs1",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs1 = vs1_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 8: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

Show code

###########################################################
# look at cluster-group B / cluster 5 (i.e. mostly.S1.S2.mix)
chosen <- "B"
B_uniquely_up <- vs1_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 5; mostly.S1.S2.mix)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# B_uniquely_up_pcg <- B_uniquely_up[intersect(protein_coding_gene_set, rownames(B_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
B_uniquely_up_noiseR <- B_uniquely_up[setdiff(rownames(B_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(B_uniquely_up_noiseR) %in% "CD4"),
       B_uniquely_up_noiseR[which(rownames(B_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(B_uniquely_up_noiseR) %in% "KLRB1"),
       B_uniquely_up_noiseR[which(rownames(B_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- B_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs1,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs1",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs1 = vs1_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 9: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

DGE lists of these comparisons are available in output/marker_genes/whole_cell/uniquely_up/cluster_1_vs_5/.

SUMMARY: 1 vs 5 (A vs B)

cluster 1 (S3.mix.higher.thymus >>> lots of markers (e.g. HLA, interferon, HKG7, GZMK, IL7R, etc)
cluster 5 (mostly.S1.S2.mix >>> lot of markers as well (e.g. TCF7, LEF1, SOX4, H3F#A)
COMMENT: S3.mix.higher.thymus is clearly different from S1-S2-mix

`cluster_2_vs_cluster_5`

Show code

#########
# C vs D
#########

##########################################################################################
# cluster 2 (i.e. S3.mix.with.blood.1) vs cluster 5 (i.e. mostly.S1.S2.mix)

# checkpoint
cp <- sce
# exclude cells of uninterested cluster from cp
cp <- cp[, cp$cluster == "2" | cp$cluster == "5"]
colData(cp) <- droplevels(colData(cp))

# classify cluster-group for comparison
cp$vs2 <- factor(ifelse(cp$cluster == 2, "C", "D"))

# set vs colours
vs2_colours <- setNames(
  palette.colors(nlevels(cp$vs2), "Set1"),
  levels(cp$vs2))
cp$colours$vs2_colours <- vs2_colours[cp$vs2]

# find unique DE ./. cluster-groups
vs2_uniquely_up <- findMarkers(
  cp,
  groups = cp$vs2,
  block = cp$block,
  pval.type = "all",
  direction = "up")

# export DGE lists
saveRDS(
  vs2_uniquely_up,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_2_vs_5.rds"),
  compress = "xz")

dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "cluster_2_vs_5"), recursive = TRUE)

vs_pair <- c("2", "5")

message("Writing 'uniquely_up (cluster_2_vs_5)' marker genes to file.")
for (n in names(vs2_uniquely_up)) {
  message(n)
  gzout <- gzfile(
    description = here(
      "output",
      "marker_genes",
      "whole_cell",
      "uniquely_up",
      "cluster_2_vs_5",
      paste0("cluster_",
             vs_pair[which(names(vs2_uniquely_up) %in% n)],
             "_vs_",
             vs_pair[-which(names(vs2_uniquely_up) %in% n)][1],
             ".uniquely_up.csv.gz")),
    open = "wb")
  write.table(
    x = vs2_uniquely_up[[n]] %>%
      as.data.frame() %>%
      tibble::rownames_to_column("gene_ID"),
    file = gzout,
    sep = ",",
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE)
  close(gzout)
}

Show code

###############################################################
# look at cluster-group C / cluster 2 (i.e. S3.mix.with.blood.1)
chosen <- "C"
C_uniquely_up <- vs2_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 2; S3.mix.with.blood.1)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# C_uniquely_up_pcg <- C_uniquely_up[intersect(protein_coding_gene_set, rownames(C_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
C_uniquely_up_noiseR <- C_uniquely_up[setdiff(rownames(C_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(C_uniquely_up_noiseR) %in% "CD4"),
       C_uniquely_up_noiseR[which(rownames(C_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(C_uniquely_up_noiseR) %in% "KLRB1"),
       C_uniquely_up_noiseR[which(rownames(C_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- C_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs2,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs2",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs2 = vs2_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 10: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

Show code

##########################################################
# look at cluster-group D / cluster 5 (i.e. mostly.S1.S2.mix)
chosen <- "D"
D_uniquely_up <- vs2_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 5; mostly.S1.S2.mix)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# D_uniquely_up_pcg <- D_uniquely_up[intersect(protein_coding_gene_set, rownames(D_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
D_uniquely_up_noiseR <- D_uniquely_up[setdiff(rownames(D_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(D_uniquely_up_noiseR) %in% "CD4"),
       D_uniquely_up_noiseR[which(rownames(D_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(D_uniquely_up_noiseR) %in% "KLRB1"),
       D_uniquely_up_noiseR[which(rownames(D_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- D_uniquely_up_noiseR[1:25, ]

Show code

## heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs2,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs2",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs2 = vs2_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 11: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

DGE lists of these comparisons are available in output/marker_genes/whole_cell/uniquely_up/cluster_2_vs_5/.

SUMMARY: 2 vs 5 (C vs D)

cluster 2 (S3.mix.with.blood.1 >>> lots of markers (e.g.IL32, NKG7, GZMA, HLA, CCL5, etc)
cluster 5 (mostly.S1.S2.mix >>> lots of markers as well (e.g. TCF7, LEF1, DGKA, TOX2, etc)
COMMENT: S3.mix.with.blood.1 is also clearly different from mostly.S1.S2.mix

`cluster_3_vs_cluster_5`

Show code

#########
# E vs F
#########

##########################################################################################
# cluster 3 (i.e. S3.mix.with.blood.2) vs cluster 5 (i.e. mostly.S1.S2.mix)

# checkpoint
cp <- sce
# exclude cells of uninterested cluster from cp
cp <- cp[, cp$cluster == "3" | cp$cluster == "5"]
colData(cp) <- droplevels(colData(cp))

# classify cluster-group for comparison
cp$vs3 <- factor(ifelse(cp$cluster == 3, "E", "F"))

# set vs colours
vs3_colours <- setNames(
  palette.colors(nlevels(cp$vs3), "Set1"),
  levels(cp$vs3))
cp$colours$vs3_colours <- vs3_colours[cp$vs3]

# find unique DE ./. cluster-groups
vs3_uniquely_up <- findMarkers(
  cp,
  groups = cp$vs3,
  block = cp$block,
  pval.type = "all",
  direction = "up")

# export DGE lists
saveRDS(
  vs3_uniquely_up,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_3_vs_5.rds"),
  compress = "xz")

dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "cluster_3_vs_5"), recursive = TRUE)

vs_pair <- c("3", "5")

message("Writing 'uniquely_up (cluster_3_vs_5)' marker genes to file.")
for (n in names(vs3_uniquely_up)) {
  message(n)
  gzout <- gzfile(
    description = here(
      "output",
      "marker_genes",
      "whole_cell",
      "uniquely_up",
      "cluster_3_vs_5",
      paste0("cluster_",
             vs_pair[which(names(vs3_uniquely_up) %in% n)],
             "_vs_",
             vs_pair[-which(names(vs3_uniquely_up) %in% n)][1],
             ".uniquely_up.csv.gz")),
    open = "wb")
  write.table(
    x = vs3_uniquely_up[[n]] %>%
      as.data.frame() %>%
      tibble::rownames_to_column("gene_ID"),
    file = gzout,
    sep = ",",
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE)
  close(gzout)
}

Show code

##########################################################
# look at cluster-group E / cluster 3 (i.e. S3.mix.with.blood.2)
chosen <- "E"
E_uniquely_up <- vs3_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 3; S3.mix.with.blood.2)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# E_uniquely_up_pcg <- E_uniquely_up[intersect(protein_coding_gene_set, rownames(E_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
E_uniquely_up_noiseR <- E_uniquely_up[setdiff(rownames(E_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(E_uniquely_up_noiseR) %in% "CD4"),
       E_uniquely_up_noiseR[which(rownames(E_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(E_uniquely_up_noiseR) %in% "KLRB1"),
       E_uniquely_up_noiseR[which(rownames(E_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- E_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs3,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs3",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs3 = vs3_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 12: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

Show code

##########################################################
# look at cluster-group F / cluster 5 (i.e. mostly.S1.S2.mix)
chosen <- "F"
F_uniquely_up <- vs3_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 5; mostly.S1.S2.mix)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# F_uniquely_up_pcg <- F_uniquely_up[intersect(protein_coding_gene_set, rownames(F_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
F_uniquely_up_noiseR <- F_uniquely_up[setdiff(rownames(F_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(F_uniquely_up_noiseR) %in% "CD4"),
       F_uniquely_up_noiseR[which(rownames(F_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(D_uniquely_up_noiseR) %in% "KLRB1"),
       F_uniquely_up_noiseR[which(rownames(F_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- F_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs3,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs3",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs3 = vs3_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 13: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

DGE lists of these comparisons are available in output/marker_genes/whole_cell/uniquely_up/cluster_3_vs_5/.

SUMMARY: 3 vs 5 (E vs F)

cluster 3 (S3.mix.with.blood.2 >>> number of markers (e.g. interferon, HLA, SYNE2, LTB, interleukin, etc.)
cluster 5 (mostly.S1.S2.mix >>> lot of markers (e.g.LEF1, SOX4, H3F3A, TOX2, etc
COMMENT: S3.mix.with.blood.2 is also clearly different from mostly.S1.S2.mix

`cluster_4_vs_cluster_5`

Show code

#########
# G vs H
#########

##########################################################################################
# cluster 4 (i.e. S3.mix.with.blood.3) vs cluster 5 (i.e. mostly.S1.S2.mix)

# checkpoint
cp <- sce
# exclude cells of uninterested cluster from cp
cp <- cp[, cp$cluster == "4" | cp$cluster == "5"]
colData(cp) <- droplevels(colData(cp))

# classify cluster-group for comparison
cp$vs4 <- factor(ifelse(cp$cluster == 4, "G", "H"))

# set vs colours
vs4_colours <- setNames(
  palette.colors(nlevels(cp$vs4), "Set1"),
  levels(cp$vs4))
cp$colours$vs4_colours <- vs4_colours[cp$vs4]

# find unique DE ./. cluster-groups
vs4_uniquely_up <- findMarkers(
  cp,
  groups = cp$vs4,
  block = cp$block,
  pval.type = "all",
  direction = "up")

# export DGE lists
saveRDS(
  vs4_uniquely_up,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_4_vs_5.rds"),
  compress = "xz")

dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "cluster_4_vs_5"), recursive = TRUE)

vs_pair <- c("4", "5")

message("Writing 'uniquely_up (cluster_4_vs_5)' marker genes to file.")
for (n in names(vs4_uniquely_up)) {
  message(n)
  gzout <- gzfile(
    description = here(
      "output",
      "marker_genes",
      "whole_cell",
      "uniquely_up",
      "cluster_4_vs_5",
      paste0("cluster_",
             vs_pair[which(names(vs4_uniquely_up) %in% n)],
             "_vs_",
             vs_pair[-which(names(vs4_uniquely_up) %in% n)][1],
             ".uniquely_up.csv.gz")),
    open = "wb")
  write.table(
    x = vs4_uniquely_up[[n]] %>%
      as.data.frame() %>%
      tibble::rownames_to_column("gene_ID"),
    file = gzout,
    sep = ",",
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE)
  close(gzout)
}

Show code

###############################################################
# look at cluster-group G / cluster 4 (i.e. S3.mix.with.blood.3)
chosen <- "G"
G_uniquely_up <- vs4_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 4; S3.mix.with.blood.3)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# G_uniquely_up_pcg <- G_uniquely_up[intersect(protein_coding_gene_set, rownames(G_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
G_uniquely_up_noiseR <- G_uniquely_up[setdiff(rownames(G_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(G_uniquely_up_noiseR) %in% "CD4"),
       G_uniquely_up_noiseR[which(rownames(G_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(G_uniquely_up_noiseR) %in% "KLRB1"),
       G_uniquely_up_noiseR[which(rownames(G_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- G_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs4,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs4",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs4 = vs4_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 14: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

Show code

##########################################################
# look at cluster-group H / cluster 5 (i.e. mostly.S1.S2.mix)
chosen <- "H"
H_uniquely_up <- vs4_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 5; mostly.S1.S2.mix)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# H_uniquely_up_pcg <- H_uniquely_up[intersect(protein_coding_gene_set, rownames(H_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
H_uniquely_up_noiseR <- H_uniquely_up[setdiff(rownames(H_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(H_uniquely_up_noiseR) %in% "CD4"),
       H_uniquely_up_noiseR[which(rownames(H_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(H_uniquely_up_noiseR) %in% "KLRB1"),
       H_uniquely_up_noiseR[which(rownames(H_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- H_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs4,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs4",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs4 = vs4_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 15: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

DGE lists of these comparisons are available in output/marker_genes/whole_cell/uniquely_up/cluster_4_vs_5/.

SUMMARY: 4 vs 5 (G vs H)

cluster 4 (S3.mix.with.blood.3 >>> lots of markers (e.g. CCL5, HLA, interleukin, NKG7, etc.)
cluster 5 (mostly.S1.S2.mix >>> lots of markers (e.g. TCF7, SOX4, TOX2, LEF1)
COMMENT: S3.mix.with.blood.3 is also clearly different from mostly.S1.S2.mix

`cluster_2_3_4_vs_cluster_5`

Show code

#########
# I vs J
#########

##########################################################################################
# cluster 2_3_4 (i.e. S3.mix.with.blood) vs cluster 5 (i.e. mostly.S1.S2.mix)

# checkpoint
cp <- sce
# exclude cells of uninterested cluster from cp
cp <- cp[, cp$cluster == "2" | cp$cluster == "3" | cp$cluster == "4" | cp$cluster == "5"]
colData(cp) <- droplevels(colData(cp))

# classify cluster-group for comparison
cp$vs5 <- factor(ifelse(cp$cluster == 2 | cp$cluster == 3 | cp$cluster == 4 , "I", "J"))

# set vs colours
vs5_colours <- setNames(
  palette.colors(nlevels(cp$vs5), "Set1"),
  levels(cp$vs5))
cp$colours$vs5_colours <- vs5_colours[cp$vs5]

# find unique DE ./. cluster-groups
vs5_uniquely_up <- findMarkers(
  cp,
  groups = cp$vs5,
  block = cp$block,
  pval.type = "all",
  direction = "up")

# export DGE lists
saveRDS(
  vs5_uniquely_up,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_2_3_4_vs_5.rds"),
  compress = "xz")

dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "cluster_2_3_4_vs_5"), recursive = TRUE)

vs_pair <- c("2_3_4", "5")

message("Writing 'uniquely_up (cluster_2_3_4_vs_5)' marker genes to file.")
for (n in names(vs5_uniquely_up)) {
  message(n)
  gzout <- gzfile(
    description = here(
      "output",
      "marker_genes",
      "whole_cell",
      "uniquely_up",
      "cluster_2_3_4_vs_5",
      paste0("cluster_",
             vs_pair[which(names(vs5_uniquely_up) %in% n)],
             "_vs_",
             vs_pair[-which(names(vs5_uniquely_up) %in% n)][1],
             ".uniquely_up.csv.gz")),
    open = "wb")
  write.table(
    x = vs5_uniquely_up[[n]] %>%
      as.data.frame() %>%
      tibble::rownames_to_column("gene_ID"),
    file = gzout,
    sep = ",",
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE)
  close(gzout)
}

Show code

###############################################################
# look at cluster-group I / cluster 2_3_4 (i.e. S3.mix.with.blood)
chosen <- "I"
I_uniquely_up <- vs5_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 2_3_4; S3.mix.with.blood)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# I_uniquely_up_pcg <- I_uniquely_up[intersect(protein_coding_gene_set, rownames(I_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
I_uniquely_up_noiseR <- I_uniquely_up[setdiff(rownames(I_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(I_uniquely_up_noiseR) %in% "CD4"),
       I_uniquely_up_noiseR[which(rownames(I_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(I_uniquely_up_noiseR) %in% "KLRB1"),
       I_uniquely_up_noiseR[which(rownames(I_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- I_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs5,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs5",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs5 = vs5_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

(#fig:heat-uniquely-up-logExp-cluster-2_3_4-vs-5)Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

Show code

##########################################################
# look at cluster-group J / cluster 5 (i.e. mostly.S1.S2.mix)
chosen <- "J"
J_uniquely_up <- vs5_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 2_3_4; mostly.S1.S2.mix)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# J_uniquely_up_pcg <- J_uniquely_up[intersect(protein_coding_gene_set, rownames(J_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
J_uniquely_up_noiseR <- J_uniquely_up[setdiff(rownames(J_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(J_uniquely_up_noiseR) %in% "CD4"),
       J_uniquely_up_noiseR[which(rownames(J_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(J_uniquely_up_noiseR) %in% "KLRB1"),
       J_uniquely_up_noiseR[which(rownames(J_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- J_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs5,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs5",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs5 = vs5_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

(#fig:heat-uniquely-up-logExp-cluster-5-vs-2_3_4)Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

DGE lists of these comparisons are available in output/marker_genes/whole_cell/uniquely_up/cluster_2_3_4_vs_5/.

SUMMARY: 2_3_4 vs 5 (I vs J)

cluster 2_3_4 (S3.mix.with.blood >>> lots of markers (e.g. CCL5, HLA, interleukin, NKG7, SYNE2, etc.)
cluster 5 (mostly.S1.S2.mix >>> lots of markers as well (e.g. FYB1, TCF7, CD3D, LEF1, SOX4, DGKA, etc.)
COMMENT: S3.mix.with.blood in bulk are clearly different from mostly.S1.S2.mix

`cluster_2_vs_cluster_1`

Show code

#########
# K vs L
#########

##########################################################################################
# cluster 2 (i.e. S3.mix.with.blood.1) vs cluster 1 (i.e. S3.mix.higher.thymus)

# checkpoint
cp <- sce
# exclude cells of uninterested cluster from cp
cp <- cp[, cp$cluster == "2" | cp$cluster == "1"]
colData(cp) <- droplevels(colData(cp))

# classify cluster-group for comparison
cp$vs6 <- factor(ifelse(cp$cluster == 2, "K", "L"))

# set vs colours
vs6_colours <- setNames(
  palette.colors(nlevels(cp$vs6), "Set1"),
  levels(cp$vs6))
cp$colours$vs6_colours <- vs6_colours[cp$vs6]

# find unique DE ./. cluster-groups
vs6_uniquely_up <- findMarkers(
  cp,
  groups = cp$vs6,
  block = cp$block,
  pval.type = "all",
  direction = "up")

# export DGE lists
saveRDS(
  vs6_uniquely_up,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_2_vs_1.rds"),
  compress = "xz")

dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "cluster_2_vs_1"), recursive = TRUE)

vs_pair <- c("2", "1")

message("Writing 'uniquely_up (cluster_2_vs_1)' marker genes to file.")
for (n in names(vs6_uniquely_up)) {
  message(n)
  gzout <- gzfile(
    description = here(
      "output",
      "marker_genes",
      "whole_cell",
      "uniquely_up",
      "cluster_2_vs_1",
      paste0("cluster_",
             vs_pair[which(names(vs6_uniquely_up) %in% n)],
             "_vs_",
             vs_pair[-which(names(vs6_uniquely_up) %in% n)][1],
             ".uniquely_up.csv.gz")),
    open = "wb")
  write.table(
    x = vs6_uniquely_up[[n]] %>%
      as.data.frame() %>%
      tibble::rownames_to_column("gene_ID"),
    file = gzout,
    sep = ",",
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE)
  close(gzout)
}

Show code

###############################################################
# look at cluster-group K / cluster 2 (i.e. S3.mix.with.blood.1)
chosen <- "K"
K_uniquely_up <- vs6_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 2; S3.mix.with.blood.1)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# K_uniquely_up_pcg <- K_uniquely_up[intersect(protein_coding_gene_set, rownames(K_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
K_uniquely_up_noiseR <- K_uniquely_up[setdiff(rownames(K_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(K_uniquely_up_noiseR) %in% "CD4"),
       K_uniquely_up_noiseR[which(rownames(K_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(K_uniquely_up_noiseR) %in% "KLRB1"),
       K_uniquely_up_noiseR[which(rownames(K_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- K_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs6,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs6",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs6 = vs6_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 16: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

Show code

##########################################################
# look at cluster-group L / cluster 1 (i.e. S3.mix.higher.thymus)
chosen <- "L"
L_uniquely_up <- vs6_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 1; S3.mix.higher.thymus)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# L_uniquely_up_pcg <- L_uniquely_up[intersect(protein_coding_gene_set, rownames(L_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
L_uniquely_up_noiseR <- L_uniquely_up[setdiff(rownames(L_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(L_uniquely_up_noiseR) %in% "CD4"),
       L_uniquely_up_noiseR[which(rownames(L_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(L_uniquely_up_noiseR) %in% "KLRB1"),
       L_uniquely_up_noiseR[which(rownames(L_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- L_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs6,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs6",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs6 = vs6_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 17: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

DGE lists of these comparisons are available in output/marker_genes/whole_cell/uniquely_up/cluster_2_vs_1/.

SUMMARY: 2 vs 1 (K vs L)

cluster 2 (S3.mix.with.blood.1 >>> number of DE (e.g. CCL5, COX14, GOLGA7, etc.)
cluster 1 (S3.mix.higher.thymus >>> only 1 DE, i.e. IL7R (i.e. 1>2)
COMMENT: S3.mix.higher.thymus unique by having higher IL7R, but S3.mix.with.blood.1 with number of other unique markers

`cluster_3_vs_cluster_1`

Show code

#########
# M vs N
#########

##########################################################################################
# cluster 3 (i.e. S3.mix.with.blood.2) vs cluster 1 (i.e. S3.mix.higher.thymus)

# checkpoint
cp <- sce
# exclude cells of uninterested cluster from cp
cp <- cp[, cp$cluster == "3" | cp$cluster == "1"]
colData(cp) <- droplevels(colData(cp))

# classify cluster-group for comparison
cp$vs7 <- factor(ifelse(cp$cluster == 3, "M", "N"))

# set vs colours
vs7_colours <- setNames(
  palette.colors(nlevels(cp$vs7), "Set1"),
  levels(cp$vs7))
cp$colours$vs7_colours <- vs7_colours[cp$vs7]

# find unique DE ./. cluster-groups
vs7_uniquely_up <- findMarkers(
  cp,
  groups = cp$vs7,
  block = cp$block,
  pval.type = "all",
  direction = "up")

# export DGE lists
saveRDS(
  vs7_uniquely_up,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_3_vs_1.rds"),
  compress = "xz")

dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "cluster_3_vs_1"), recursive = TRUE)

vs_pair <- c("3", "1")

message("Writing 'uniquely_up (cluster_3_vs_1)' marker genes to file.")
for (n in names(vs7_uniquely_up)) {
  message(n)
  gzout <- gzfile(
    description = here(
      "output",
      "marker_genes",
      "whole_cell",
      "uniquely_up",
      "cluster_3_vs_1",
      paste0("cluster_",
             vs_pair[which(names(vs7_uniquely_up) %in% n)],
             "_vs_",
             vs_pair[-which(names(vs7_uniquely_up) %in% n)][1],
             ".uniquely_up.csv.gz")),
    open = "wb")
  write.table(
    x = vs7_uniquely_up[[n]] %>%
      as.data.frame() %>%
      tibble::rownames_to_column("gene_ID"),
    file = gzout,
    sep = ",",
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE)
  close(gzout)
}

Show code

###############################################################
# look at cluster-group M / cluster 3 (i.e. S3.mix.with.blood.2)
chosen <- "M"
M_uniquely_up <- vs7_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 3; S3.mix.with.blood.2)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# M_uniquely_up_pcg <- M_uniquely_up[intersect(protein_coding_gene_set, rownames(M_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
M_uniquely_up_noiseR <- M_uniquely_up[setdiff(rownames(M_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(M_uniquely_up_noiseR) %in% "CD4"),
       M_uniquely_up_noiseR[which(rownames(M_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(M_uniquely_up_noiseR) %in% "KLRB1"),
       M_uniquely_up_noiseR[which(rownames(M_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- M_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs7,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs7",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs7 = vs7_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 18: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

Show code

##########################################################
# look at cluster-group N / cluster 1 (i.e. S3.mix.higher.thymus)
chosen <- "N"
N_uniquely_up <- vs7_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 1; S3.mix.higher.thymus)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# N_uniquely_up_pcg <- N_uniquely_up[intersect(protein_coding_gene_set, rownames(N_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
N_uniquely_up_noiseR <- N_uniquely_up[setdiff(rownames(N_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(N_uniquely_up_noiseR) %in% "CD4"),
       N_uniquely_up_noiseR[which(rownames(N_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(N_uniquely_up_noiseR) %in% "KLRB1"),
       N_uniquely_up_noiseR[which(rownames(N_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- N_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs7,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs7",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs7 = vs7_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 19: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

DGE lists of these comparisons are available in output/marker_genes/whole_cell/uniquely_up/cluster_3_vs_1/.

SUMMARY: 3 vs 1 (M vs N)

cluster 3 (S3.mix.with.blood.2 >>> a few markers (e.g. IL7R, LTB, etc)
cluster 1 (S3.mix.higher.thymus >>> 1 marker (i.e. CCL5)
COMMENT: S3.mix.higher.thymus unique by higher CCL5, S3.mix.with.blood.2 is unique by having higher IR7R (i.e. 3 > 1; thus: 3 > 1 > 2 !? > check by violin plot)

`cluster_4_vs_cluster_1`

Show code

#########
# O vs P
#########

##########################################################################################
# cluster 4 (i.e. S3.mix.with.blood.3) vs cluster 1 (i.e. S3.mix.higher.thymus)

# checkpoint
cp <- sce
# exclude cells of uninterested cluster from cp
cp <- cp[, cp$cluster == "4" | cp$cluster == "1"]
colData(cp) <- droplevels(colData(cp))

# classify cluster-group for comparison
cp$vs8 <- factor(ifelse(cp$cluster == 4, "O", "P"))

# set vs colours
vs8_colours <- setNames(
  palette.colors(nlevels(cp$vs8), "Set1"),
  levels(cp$vs8))
cp$colours$vs8_colours <- vs8_colours[cp$vs8]

# find unique DE ./. cluster-groups
vs8_uniquely_up <- findMarkers(
  cp,
  groups = cp$vs8,
  block = cp$block,
  pval.type = "all",
  direction = "up")

# export DGE lists
saveRDS(
  vs8_uniquely_up,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_4_vs_1.rds"),
  compress = "xz")

dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "cluster_4_vs_1"), recursive = TRUE)

vs_pair <- c("3", "1")

message("Writing 'uniquely_up (cluster_4_vs_1)' marker genes to file.")
for (n in names(vs8_uniquely_up)) {
  message(n)
  gzout <- gzfile(
    description = here(
      "output",
      "marker_genes",
      "whole_cell",
      "uniquely_up",
      "cluster_4_vs_1",
      paste0("cluster_",
             vs_pair[which(names(vs8_uniquely_up) %in% n)],
             "_vs_",
             vs_pair[-which(names(vs8_uniquely_up) %in% n)][1],
             ".uniquely_up.csv.gz")),
    open = "wb")
  write.table(
    x = vs8_uniquely_up[[n]] %>%
      as.data.frame() %>%
      tibble::rownames_to_column("gene_ID"),
    file = gzout,
    sep = ",",
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE)
  close(gzout)
}

Show code

###############################################################
# look at cluster-group O / cluster 4 (i.e. S3.mix.with.blood.3)
chosen <- "O"
O_uniquely_up <- vs8_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 4; S3.mix.with.blood.3)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# O_uniquely_up_pcg <- O_uniquely_up[intersect(protein_coding_gene_set, rownames(O_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
O_uniquely_up_noiseR <- O_uniquely_up[setdiff(rownames(O_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(O_uniquely_up_noiseR) %in% "CD4"),
       O_uniquely_up_noiseR[which(rownames(O_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(O_uniquely_up_noiseR) %in% "KLRB1"),
       O_uniquely_up_noiseR[which(rownames(O_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- O_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs8,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs8",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs8 = vs8_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 20: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

Show code

##########################################################
# look at cluster-group P / cluster 1 (i.e. S3.mix.higher.thymus)
chosen <- "P"
P_uniquely_up <- vs8_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 1;  S3.mix.higher.thymus)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# P_uniquely_up_pcg <- P_uniquely_up[intersect(protein_coding_gene_set, rownames(P_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
P_uniquely_up_noiseR <- P_uniquely_up[setdiff(rownames(P_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(P_uniquely_up_noiseR) %in% "CD4"),
       P_uniquely_up_noiseR[which(rownames(P_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(P_uniquely_up_noiseR) %in% "KLRB1"),
       P_uniquely_up_noiseR[which(rownames(P_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- P_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs8,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs8",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  # annotation_row = data.frame(
  #   Sig = factor(
  #     ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
  #     # TODO: temp trick to deal with the row-colouring problem
  #     # levels = c("Yes", "No")),
  #     levels = c("Yes")),
  #   row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs8 = vs8_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 21: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

DGE lists of these comparisons are available in output/marker_genes/whole_cell/uniquely_up/cluster_4_vs_1/.

SUMMARY: 4 vs 1 (O vs P)

cluster 4 (S3.mix.with.blood.3 >>> several marker but mostly CCL5
cluster 1 (S3.mix.higher.thymus >>> visually yes, but statistically no
COMMENT: S3.mix.with.blood.3 is similar to S3.mix.higher.thymus but with higher CCL5 expression

`cluster_2_3_4_vs_cluster_1`

Show code

#########
# Q vs R
#########

##########################################################################################
# cluster 2_3_4 (i.e. S3.mix.with.blood) vs cluster 1 (i.e. S3.mix.higher.thymus)

# checkpoint
cp <- sce
# exclude cells of uninterested cluster from cp
cp <- cp[, cp$cluster == "2" | cp$cluster == "3" | cp$cluster == "4" | cp$cluster == "1"]
colData(cp) <- droplevels(colData(cp))

# classify cluster-group for comparison
cp$vs9 <- factor(ifelse(cp$cluster == 2 | cp$cluster == 3 | cp$cluster == 4 , "Q", "R"))

# set vs colours
vs9_colours <- setNames(
  palette.colors(nlevels(cp$vs9), "Set1"),
  levels(cp$vs9))
cp$colours$vs9_colours <- vs9_colours[cp$vs9]

# find unique DE ./. cluster-groups
vs9_uniquely_up <- findMarkers(
  cp,
  groups = cp$vs9,
  block = cp$block,
  pval.type = "all",
  direction = "up")

# export DGE lists
saveRDS(
  vs9_uniquely_up,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_2_3_4_vs_1.rds"),
  compress = "xz")

dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "cluster_2_3_4_vs_1"), recursive = TRUE)

vs_pair <- c("2_3_4", "1")

message("Writing 'uniquely_up (cluster_2_3_4_vs_1)' marker genes to file.")
for (n in names(vs9_uniquely_up)) {
  message(n)
  gzout <- gzfile(
    description = here(
      "output",
      "marker_genes",
      "whole_cell",
      "uniquely_up",
      "cluster_2_3_4_vs_1",
      paste0("cluster_",
             vs_pair[which(names(vs9_uniquely_up) %in% n)],
             "_vs_",
             vs_pair[-which(names(vs9_uniquely_up) %in% n)][1],
             ".uniquely_up.csv.gz")),
    open = "wb")
  write.table(
    x = vs9_uniquely_up[[n]] %>%
      as.data.frame() %>%
      tibble::rownames_to_column("gene_ID"),
    file = gzout,
    sep = ",",
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE)
  close(gzout)
}

Show code

###############################################################
# look at cluster-group Q / cluster 2_3_4 (i.e. S3.mix.with.blood)
chosen <- "Q"
Q_uniquely_up <- vs9_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 2_3_4; S3.mix.with.blood)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# Q_uniquely_up_pcg <- Q_uniquely_up[intersect(protein_coding_gene_set, rownames(Q_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
Q_uniquely_up_noiseR <- Q_uniquely_up[setdiff(rownames(Q_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(Q_uniquely_up_noiseR) %in% "CD4"),
       Q_uniquely_up_noiseR[which(rownames(Q_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(Q_uniquely_up_noiseR) %in% "KLRB1"),
       Q_uniquely_up_noiseR[which(rownames(Q_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- Q_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs9,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs9",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs9 = vs9_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

(#fig:heat-uniquely-up-logExp-cluster-2_3_4-vs-1)Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

Show code

##########################################################
# look at cluster-group R / cluster 1 (i.e. S3.mix.higher.thymus)
chosen <- "R"
R_uniquely_up <- vs9_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 1; S3.mix.higher.thymus)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# R_uniquely_up_pcg <- R_uniquely_up[intersect(protein_coding_gene_set, rownames(R_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
R_uniquely_up_noiseR <- R_uniquely_up[setdiff(rownames(R_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(R_uniquely_up_noiseR) %in% "CD4"),
       R_uniquely_up_noiseR[which(rownames(R_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(R_uniquely_up_noiseR) %in% "KLRB1"),
       R_uniquely_up_noiseR[which(rownames(R_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- R_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs9,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs9",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs9 = vs9_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

(#fig:heat-uniquely-up-logExp-cluster-1-vs-2_3_4)Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

DGE lists of these comparisons are available in output/marker_genes/whole_cell/uniquely_up/cluster_2_3_4_vs_1/.

SUMMARY: 2_3_4 vs 1 (Q vs R)

cluster 2_3_4 (S3.mix.with.blood >>> got lots of markers >>> e.g. CCL5, PSMF1, PRKCB
cluster 1 (S3.mix.higher.thymus >>> visually yes, but statistically no
COMMENT: again, S3.mix.with.blood is similar to S3.mix.higher.thymus but with higher CCL5 expression

`cluster_2_vs_cluster_3`

Show code

#########
# S vs T
#########

##########################################################################################
# cluster 2 (i.e. S3.mix.with.blood.1) vs cluster 3 (i.e. S3.mix.with.blood.2)

# checkpoint
cp <- sce
# exclude cells of uninterested cluster from cp
cp <- cp[, cp$cluster == "2" | cp$cluster == "3"]
colData(cp) <- droplevels(colData(cp))

# classify cluster-group for comparison
cp$vs10 <- factor(ifelse(cp$cluster == 2, "S", "T"))

# set vs colours
vs10_colours <- setNames(
  palette.colors(nlevels(cp$vs10), "Set1"),
  levels(cp$vs10))
cp$colours$vs10_colours <- vs10_colours[cp$vs10]

# find unique DE ./. cluster-groups
vs10_uniquely_up <- findMarkers(
  cp,
  groups = cp$vs10,
  block = cp$block,
  pval.type = "all",
  direction = "up")

# export DGE lists
saveRDS(
  vs10_uniquely_up,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_2_vs_3.rds"),
  compress = "xz")

dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "cluster_2_vs_3"), recursive = TRUE)

vs_pair <- c("2", "3")

message("Writing 'uniquely_up (cluster_2_vs_3)' marker genes to file.")
for (n in names(vs10_uniquely_up)) {
  message(n)
  gzout <- gzfile(
    description = here(
      "output",
      "marker_genes",
      "whole_cell",
      "uniquely_up",
      "cluster_2_vs_3",
      paste0("cluster_",
             vs_pair[which(names(vs10_uniquely_up) %in% n)],
             "_vs_",
             vs_pair[-which(names(vs10_uniquely_up) %in% n)][1],
             ".uniquely_up.csv.gz")),
    open = "wb")
  write.table(
    x = vs10_uniquely_up[[n]] %>%
      as.data.frame() %>%
      tibble::rownames_to_column("gene_ID"),
    file = gzout,
    sep = ",",
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE)
  close(gzout)
}

Show code

###############################################################
# look at cluster-group S / cluster 2 (i.e. S3.mix.with.blood.1)
chosen <- "S"
S_uniquely_up <- vs10_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 2; S3.mix.with.blood.1)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# S_uniquely_up_pcg <- S_uniquely_up[intersect(protein_coding_gene_set, rownames(S_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
S_uniquely_up_noiseR <- S_uniquely_up[setdiff(rownames(S_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(S_uniquely_up_noiseR) %in% "CD4"),
       S_uniquely_up_noiseR[which(rownames(S_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(S_uniquely_up_noiseR) %in% "KLRB1"),
       S_uniquely_up_noiseR[which(rownames(S_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- S_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs10,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs10",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs10 = vs10_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 22: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

Show code

##########################################################
# look at cluster-group T / cluster 3 (i.e. S3.mix.with.blood.2)
chosen <- "T"
T_uniquely_up <- vs10_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 3; S3.mix.with.blood.2)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# T_uniquely_up_pcg <- T_uniquely_up[intersect(protein_coding_gene_set, rownames(T_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
T_uniquely_up_noiseR <- T_uniquely_up[setdiff(rownames(T_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(T_uniquely_up_noiseR) %in% "CD4"),
       T_uniquely_up_noiseR[which(rownames(T_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(T_uniquely_up_noiseR) %in% "KLRB1"),
       T_uniquely_up_noiseR[which(rownames(T_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- T_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs10,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs10",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs10 = vs10_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 23: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

DGE lists of these comparisons are available in output/marker_genes/whole_cell/uniquely_up/cluster_2_vs_3/.

SUMMARY: 2 vs 3 (S vs T)

cluster 2 (S3.mix.with.blood.1 >>> statistically got a number of sig markers, but visually not quite clear …
cluster 3 (S3.mix.with.blood.2 >>> no marker
COMMENT: S3.mix.with.blood.1 is differed from S3.mix.with.blood.2 by the more frequent expression of higher sig markers

`cluster_4_vs_cluster_3`

Show code

#########
# U vs V
#########

##########################################################################################
# cluster 4 (i.e. S3.mix.with.blood.3) vs cluster 3 (i.e. S3.mix.with.blood.2)

# checkpoint
cp <- sce
# exclude cells of uninterested cluster from cp
cp <- cp[, cp$cluster == "4" | cp$cluster == "3"]
colData(cp) <- droplevels(colData(cp))

# classify cluster-group for comparison
cp$vs11 <- factor(ifelse(cp$cluster == 4 , "U", "V"))

# set vs colours
vs11_colours <- setNames(
  palette.colors(nlevels(cp$vs11), "Set1"),
  levels(cp$vs11))
cp$colours$vs11_colours <- vs11_colours[cp$vs11]

# find unique DE ./. cluster-groups
vs11_uniquely_up <- findMarkers(
  cp,
  groups = cp$vs11,
  block = cp$block,
  pval.type = "all",
  direction = "up")

# export DGE lists
saveRDS(
  vs11_uniquely_up,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_4_vs_3.rds"),
  compress = "xz")

dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "cluster_4_vs_3"), recursive = TRUE)

vs_pair <- c("4", "3")

message("Writing 'uniquely_up (cluster_4_vs_3)' marker genes to file.")
for (n in names(vs11_uniquely_up)) {
  message(n)
  gzout <- gzfile(
    description = here(
      "output",
      "marker_genes",
      "whole_cell",
      "uniquely_up",
      "cluster_4_vs_3",
      paste0("cluster_",
             vs_pair[which(names(vs11_uniquely_up) %in% n)],
             "_vs_",
             vs_pair[-which(names(vs11_uniquely_up) %in% n)][1],
             ".uniquely_up.csv.gz")),
    open = "wb")
  write.table(
    x = vs11_uniquely_up[[n]] %>%
      as.data.frame() %>%
      tibble::rownames_to_column("gene_ID"),
    file = gzout,
    sep = ",",
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE)
  close(gzout)
}

Show code

###############################################################
# look at cluster-group U / cluster 4 (i.e. S3.mix.with.blood.3)
chosen <- "U"
U_uniquely_up <- vs11_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 4; S3.mix.with.blood.3)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# U_uniquely_up_pcg <- U_uniquely_up[intersect(protein_coding_gene_set, rownames(U_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
U_uniquely_up_noiseR <- U_uniquely_up[setdiff(rownames(U_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(U_uniquely_up_noiseR) %in% "CD4"),
       U_uniquely_up_noiseR[which(rownames(U_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(U_uniquely_up_noiseR) %in% "KLRB1"),
       U_uniquely_up_noiseR[which(rownames(U_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- U_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs11,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs11",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs11 = vs11_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 24: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

Show code

##########################################################
# look at cluster-group V / cluster 3 (i.e. S3.mix.with.blood.2)
chosen <- "V"
V_uniquely_up <- vs11_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 3; S3.mix.with.blood.2)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# V_uniquely_up_pcg <- V_uniquely_up[intersect(protein_coding_gene_set, rownames(V_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
V_uniquely_up_noiseR <- V_uniquely_up[setdiff(rownames(V_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(V_uniquely_up_noiseR) %in% "CD4"),
       V_uniquely_up_noiseR[which(rownames(V_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(V_uniquely_up_noiseR) %in% "KLRB1"),
       V_uniquely_up_noiseR[which(rownames(V_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- V_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs11,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs11",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs11 = vs11_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 25: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

DGE lists of these comparisons are available in output/marker_genes/whole_cell/uniquely_up/cluster_4_vs_3/.

SUMMARY: 4 vs 3 (U vs V)

cluster 4 (S3.mix.with.blood.3 >>> number of markers (CCL5, NKG7)
cluster 3 (S3.mix.with.blood.2 >>> a few markers (e.g IL7R, LTB, IFITM)
COMMENT: S3.mix.with.blood.3 is unique by CCL5 and NKG7 expression, S3.mix.with.blood.2 is with higher IL7R expression

`cluster_2_4_vs_cluster_3`

Show code

#########
# W vs X
#########

##########################################################################################
# cluster 2_4 (i.e. S3.mix.with.blood.1.3) vs cluster 3 (i.e. S3.mix.with.blood.2)

# checkpoint
cp <- sce
# exclude cells of uninterested cluster from cp
cp <- cp[, cp$cluster == "2" | cp$cluster == "4" | cp$cluster == "3"]
colData(cp) <- droplevels(colData(cp))

# classify cluster-group for comparison
cp$vs12 <- factor(ifelse(cp$cluster == 2 | cp$cluster == 4 , "W", "X"))

# set vs colours
vs12_colours <- setNames(
  palette.colors(nlevels(cp$vs12), "Set1"),
  levels(cp$vs12))
cp$colours$vs12_colours <- vs12_colours[cp$vs12]

# find unique DE ./. cluster-groups
vs12_uniquely_up <- findMarkers(
  cp,
  groups = cp$vs12,
  block = cp$block,
  pval.type = "all",
  direction = "up")

# export DGE lists
saveRDS(
  vs12_uniquely_up,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_2_4_vs_3.rds"),
  compress = "xz")

dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "cluster_2_4_vs_3"), recursive = TRUE)

vs_pair <- c("4", "3")

message("Writing 'uniquely_up (cluster_2_4_vs_3)' marker genes to file.")
for (n in names(vs12_uniquely_up)) {
  message(n)
  gzout <- gzfile(
    description = here(
      "output",
      "marker_genes",
      "whole_cell",
      "uniquely_up",
      "cluster_2_4_vs_3",
      paste0("cluster_",
             vs_pair[which(names(vs12_uniquely_up) %in% n)],
             "_vs_",
             vs_pair[-which(names(vs12_uniquely_up) %in% n)][1],
             ".uniquely_up.csv.gz")),
    open = "wb")
  write.table(
    x = vs12_uniquely_up[[n]] %>%
      as.data.frame() %>%
      tibble::rownames_to_column("gene_ID"),
    file = gzout,
    sep = ",",
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE)
  close(gzout)
}

Show code

###############################################################
# look at cluster-group W / cluster 2_4 (i.e. S3.mix.with.blood.1.3)
chosen <- "W"
W_uniquely_up <- vs12_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 2_4; S3.mix.with.blood.1.3)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# W_uniquely_up_pcg <- W_uniquely_up[intersect(protein_coding_gene_set, rownames(W_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
W_uniquely_up_noiseR <- W_uniquely_up[setdiff(rownames(W_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(W_uniquely_up_noiseR) %in% "CD4"),
       W_uniquely_up_noiseR[which(rownames(W_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(W_uniquely_up_noiseR) %in% "KLRB1"),
       W_uniquely_up_noiseR[which(rownames(W_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- W_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs12,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs12",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs12 = vs12_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

(#fig:heat-uniquely-up-logExp-cluster-2_4-vs-3)Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

Show code

##########################################################
# look at cluster-group X / cluster 3 (i.e. S3.mix.with.blood.2)
chosen <- "X"
X_uniquely_up <- vs12_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 3; S3.mix.with.blood.2)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# X_uniquely_up_pcg <- X_uniquely_up[intersect(protein_coding_gene_set, rownames(X_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
X_uniquely_up_noiseR <- X_uniquely_up[setdiff(rownames(X_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(X_uniquely_up_noiseR) %in% "CD4"),
       X_uniquely_up_noiseR[which(rownames(X_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(X_uniquely_up_noiseR) %in% "KLRB1"),
       X_uniquely_up_noiseR[which(rownames(X_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- X_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs12,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs12",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs12 = vs12_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

(#fig:heat-uniquely-up-logExp-cluster-3-vs-2_4)Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

DGE lists of these comparisons are available in output/marker_genes/whole_cell/uniquely_up/cluster_2_4_vs_3/.

SUMMARY: 2_4 vs 3 (W vs X)

cluster 2_4 (S3.mix.with.blood.1.3 >>> a few markers (e.g. CCL5 and NKG7, CST7, etc)
cluster 3 (S3.mix.with.blood.2, special >>> a few markers (e.g. IL7R, LTB, IFITM)
COMMENT: S3.mix.with.blood.1.3 is unique by CCL5 and NKG7, S3.mix.with.blood.2 is unique by higher expression of IL7R, LTB, IFITM

`cluster_2_vs_cluster_4`

Show code

#########
# Y vs Z
#########

##########################################################################################
# cluster 2 (i.e. S3.mix.with.blood.1) vs cluster 4 (i.e. S3.mix.with.blood.3)

# checkpoint
cp <- sce
# exclude cells of uninterested cluster from cp
cp <- cp[, cp$cluster == "2" | cp$cluster == "4"]
colData(cp) <- droplevels(colData(cp))

# classify cluster-group for comparison
cp$vs13 <- factor(ifelse(cp$cluster == 4, "Y", "Z"))

# set vs colours
vs13_colours <- setNames(
  palette.colors(nlevels(cp$vs13), "Set1"),
  levels(cp$vs13))
cp$colours$vs13_colours <- vs13_colours[cp$vs13]

# find unique DE ./. cluster-groups
vs13_uniquely_up <- findMarkers(
  cp,
  groups = cp$vs13,
  block = cp$block,
  pval.type = "all",
  direction = "up")

# export DGE lists
saveRDS(
  vs13_uniquely_up,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_2_vs_4.rds"),
  compress = "xz")

dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "cluster_2_vs_4"), recursive = TRUE)

vs_pair <- c("4", "3")

message("Writing 'uniquely_up (cluster_2_vs_4)' marker genes to file.")
for (n in names(vs13_uniquely_up)) {
  message(n)
  gzout <- gzfile(
    description = here(
      "output",
      "marker_genes",
      "whole_cell",
      "uniquely_up",
      "cluster_2_vs_4",
      paste0("cluster_",
             vs_pair[which(names(vs13_uniquely_up) %in% n)],
             "_vs_",
             vs_pair[-which(names(vs13_uniquely_up) %in% n)][1],
             ".uniquely_up.csv.gz")),
    open = "wb")
  write.table(
    x = vs13_uniquely_up[[n]] %>%
      as.data.frame() %>%
      tibble::rownames_to_column("gene_ID"),
    file = gzout,
    sep = ",",
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE)
  close(gzout)
}

Show code

###############################################################
# look at cluster-group Y / cluster 2 (i.e. S3.mix.with.blood.1)
chosen <- "Y"
Y_uniquely_up <- vs13_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 2; S3.mix.with.blood.1)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# Y_uniquely_up_pcg <- Y_uniquely_up[intersect(protein_coding_gene_set, rownames(Y_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
Y_uniquely_up_noiseR <- Y_uniquely_up[setdiff(rownames(Y_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(Y_uniquely_up_noiseR) %in% "CD4"),
       Y_uniquely_up_noiseR[which(rownames(Y_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(Y_uniquely_up_noiseR) %in% "KLRB1"),
       Y_uniquely_up_noiseR[which(rownames(Y_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- Y_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs13,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs13",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    Sig = factor(
      ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
      # TODO: temp trick to deal with the row-colouring problem
      # levels = c("Yes", "No")),
      levels = c("Yes")),
    row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs13 = vs13_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 26: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

Show code

##########################################################
# look at cluster-group Z / cluster 4 (i.e. S3.mix.with.blood.3)
chosen <- "Z"
Z_uniquely_up <- vs13_uniquely_up[[chosen]]

# add description for the chosen cluster-group
x <- "(cluster 4; S3.mix.with.blood.3)"

# look only at protein coding gene (pcg)
# NOTE: not suggest to narrow down into pcg as it remove all significant candidates (FDR << 0.05) !
# Z_uniquely_up_pcg <- Z_uniquely_up[intersect(protein_coding_gene_set, rownames(Z_uniquely_up)), ]

# get rid of noise (i.e. pseudo, ribo, mito, sex) that collaborator not interested in
Z_uniquely_up_noiseR <- Z_uniquely_up[setdiff(rownames(Z_uniquely_up), c(pseudogene_set, mito_set, ribo_set, sex_set)), ]

# see if key marker, "CD4 and/or ""KLRB1/CD161"", contain in the DE list + if it is "significant (i.e FDR <0.05)
y <- c("CD4",
       which(rownames(Z_uniquely_up_noiseR) %in% "CD4"),
       Z_uniquely_up_noiseR[which(rownames(Z_uniquely_up_noiseR) %in% "CD4"), ]$FDR < 0.05)
z <- c("KLRB1/CD161",
       which(rownames(Z_uniquely_up_noiseR) %in% "KLRB1"),
       Z_uniquely_up_noiseR[which(rownames(Z_uniquely_up_noiseR) %in% "KLRB1"), ]$FDR < 0.05)

# top25 only + gene-of-interest
best_set <- Z_uniquely_up_noiseR[1:25, ]

Show code

# heatmap
plotHeatmap(
  cp,
  features = rownames(best_set),
  columns = order(
    cp$vs13,
    cp$cluster,
    cp$stage,
    cp$tissue,
    cp$donor,
    cp$group,
    cp$plate_number,
    cp$CD4,
    cp$CD161),
  colour_columns_by = c(
    "vs13",
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  # annotation_row = data.frame(
  #   Sig = factor(
  #     ifelse(best_set[, "FDR"] < 0.05, "Yes", "No"),
  #     # TODO: temp trick to deal with the row-colouring problem
  #     # levels = c("Yes", "No")),
  #     levels = c("Yes")),
  #   row.names = rownames(best_set)),
  main = paste0("Cluster-group: ", chosen, " ", x, " - \n",
                y[1], "_top ", y[2], "_significance: ", y[3], " ; \n",
                z[1], "_top ", z[2], "_significance: ", z[3]),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    vs13 = vs13_colours,
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 27: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range. Ranking of CD4 and CD161/KLRB1 from top of the DGE list sorted in ascending order of FDR and their statistical significance (TRUE = FDR < 0.05) are provided in the title.

DGE lists of these comparisons are available in output/marker_genes/whole_cell/uniquely_up/cluster_2_vs_4/.

SUMMARY: 2 vs 4 (Y vs Z)

cluster 2 (S3.mix.with.blood.1 >>> beside the NPIPB gene family, this cluster does have a few more markers, e.g. ANKRD36, AC009022.1
cluster 4 (S3.mix.with.blood.3 >>> no sign marker found
COMMENT: gene expression fingerprints seem to share between S3.mix.with.blood.1 and S3.mix.with.blood.3, with marker like, ANKRD36 and AC009022.1, expressed on only S3.mix.with.blood.1 for driving them apart in clustering

Selected pairwise comparisons - integrated

To have a comprehensive overview, the above analyses were summarized in the following heatmap per cluster:

Show code

# NOTE: The following is a workaround to the lack of support for tabsets in 
#       distill (see https://github.com/rstudio/distill/issues/11 and 
#       https://github.com/rstudio/distill/issues/11#issuecomment-692142414 in 
#       particular).
xaringanExtra::use_panelset()

`Cluster_5_integrated`

Show code

############
# cluster 5
############

chosen <- "5"
# add description for the chosen cluster-group
x <- "(mostly.S1.S2.mix)"
# retain only significant markers (FDR<0.05) + keep only required output columns
B_uniquely_up_noiseR_sig <- B_uniquely_up_noiseR[B_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
D_uniquely_up_noiseR_sig <- D_uniquely_up_noiseR[D_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
F_uniquely_up_noiseR_sig <- F_uniquely_up_noiseR[F_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
H_uniquely_up_noiseR_sig <- H_uniquely_up_noiseR[H_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
J_uniquely_up_noiseR_sig <- J_uniquely_up_noiseR[J_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
cluster5_uniquely_up_noiseR_sig <- cluster5_uniquely_up_noiseR[cluster5_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
# add top column
B_uniquely_up_noiseR_sig$top <- 1:nrow(B_uniquely_up_noiseR_sig)
D_uniquely_up_noiseR_sig$top <- 1:nrow(D_uniquely_up_noiseR_sig)
F_uniquely_up_noiseR_sig$top <- 1:nrow(F_uniquely_up_noiseR_sig)
H_uniquely_up_noiseR_sig$top <- 1:nrow(H_uniquely_up_noiseR_sig)
J_uniquely_up_noiseR_sig$top <- 1:nrow(J_uniquely_up_noiseR_sig)
cluster5_uniquely_up_noiseR_sig$top <- 1:nrow(cluster5_uniquely_up_noiseR_sig)
# unify S4 objects, sort by top (ascending) then FDR (ascending), keep only first unique entry for each marker
BDFHJ5_uniquely_up_noiseR_sig <- rbind2(B_uniquely_up_noiseR_sig,
                                        D_uniquely_up_noiseR_sig,
                                        F_uniquely_up_noiseR_sig,
                                        H_uniquely_up_noiseR_sig,
                                        J_uniquely_up_noiseR_sig,
                                        cluster5_uniquely_up_noiseR_sig)
BDFHJ5_uniquely_up_noiseR_sig_sort <- BDFHJ5_uniquely_up_noiseR_sig[with(BDFHJ5_uniquely_up_noiseR_sig, order(top, FDR)), ]
BDFHJ5_uniquely_up_noiseR_sig_sort_uniq <- BDFHJ5_uniquely_up_noiseR_sig_sort[unique(rownames(BDFHJ5_uniquely_up_noiseR_sig_sort)), ]
# # de-select unannotated/ not well-characterised genes
# deselected <- c("NPIPB13", "NPIPB3", "NPIPB11", "NPIPB5", "NPIPB4", "EEF1A1", "ACTG1", "ACTB", "IFITM1")
deselected <- c("MALAT1")
BDFHJ5_uniquely_up_noiseR_sig_sort_uniq_selected <- BDFHJ5_uniquely_up_noiseR_sig_sort_uniq[!(rownames(BDFHJ5_uniquely_up_noiseR_sig_sort_uniq) %in% deselected), ]

# export DGE lists
saveRDS(
  BDFHJ5_uniquely_up_noiseR_sig_sort_uniq_selected,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_5_integrated.rds"),
  compress = "xz")
dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "integrated"), recursive = TRUE)
message("Writing 'uniquely_up (cluster_5_integrated)' marker genes to file.")
gzout <- gzfile(
  description = here(
    "output",
    "marker_genes",
    "whole_cell",
    "uniquely_up",
    "integrated",
    paste0("cluster_5_integrated",
           ".uniquely_up.csv.gz")),
  open = "wb")
write.table(
  x = BDFHJ5_uniquely_up_noiseR_sig_sort_uniq_selected %>%
    as.data.frame() %>%
    tibble::rownames_to_column("gene_ID"),
  file = gzout,
  sep = ",",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE)
close(gzout)
# top only + gene-of-interest
best_set <- BDFHJ5_uniquely_up_noiseR_sig_sort_uniq_selected[1:50, ]

Show code

# heatmap
plotHeatmap(
  sce,
  features = rownames(best_set),
  columns = order(
    sce$cluster,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$group,
    sce$plate_number,
    sce$CD4,
    sce$CD161),
  colour_columns_by = c(
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    # TODO: temp trick to deal with the row-colouring problem
    cluster5.vs.1 = factor(ifelse(rownames(best_set) %in% rownames(B_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster5.vs.2 = factor(ifelse(rownames(best_set) %in% rownames(D_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster5.vs.3 = factor(ifelse(rownames(best_set) %in% rownames(F_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster5.vs.4 = factor(ifelse(rownames(best_set) %in% rownames(H_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster5.vs.2_3_4 = factor(ifelse(rownames(best_set) %in% rownames(J_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster5.vs.1.vs.2.vs.3.vs.4 = factor(ifelse(rownames(best_set) %in% rownames(cluster5_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),    
    row.names = rownames(best_set)),
  main = paste0("Cluster ", chosen, " ", x),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 28: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

`Cluster_4_integrated`

Show code

############
# cluster 4
############

chosen <- "4"
# add description for the chosen cluster-group
x <- "(S3.mix.with.blood.3)"
# retain only significant markers (FDR<0.05) + keep only required output columns
G_uniquely_up_noiseR_sig <- G_uniquely_up_noiseR[G_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
O_uniquely_up_noiseR_sig <- O_uniquely_up_noiseR[O_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
U_uniquely_up_noiseR_sig <- U_uniquely_up_noiseR[U_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
Z_uniquely_up_noiseR_sig <- Z_uniquely_up_noiseR[Z_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
cluster4_uniquely_up_noiseR_sig <- cluster4_uniquely_up_noiseR[cluster4_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
# add top column
G_uniquely_up_noiseR_sig$top <- 1:nrow(G_uniquely_up_noiseR_sig)
O_uniquely_up_noiseR_sig$top <- 1:nrow(O_uniquely_up_noiseR_sig)
U_uniquely_up_noiseR_sig$top <- 1:nrow(U_uniquely_up_noiseR_sig)
# TODO: fix error - exclude list from rbind2 if empty
# Z_uniquely_up_noiseR_sig$top <- 1:nrow(Z_uniquely_up_noiseR_sig)
cluster4_uniquely_up_noiseR_sig$top <- 1:nrow(cluster4_uniquely_up_noiseR_sig)
# unify S4 objects, sort by top (ascending) then FDR (ascending), keep only first unique entry for each marker
GOUZ4_uniquely_up_noiseR_sig <- rbind2(G_uniquely_up_noiseR_sig,
                                       O_uniquely_up_noiseR_sig,
                                       U_uniquely_up_noiseR_sig,
                                       Z_uniquely_up_noiseR_sig,
                                       cluster4_uniquely_up_noiseR_sig)
GOUZ4_uniquely_up_noiseR_sig_sort <- GOUZ4_uniquely_up_noiseR_sig[with(GOUZ4_uniquely_up_noiseR_sig, order(top, FDR)), ]
GOUZ4_uniquely_up_noiseR_sig_sort_uniq <- GOUZ4_uniquely_up_noiseR_sig_sort[unique(rownames(GOUZ4_uniquely_up_noiseR_sig_sort)), ]
# # de-select unannotated/ not well-characterised genes
# deselected <- c("NPIPB13", "NPIPB3", "NPIPB11", "NPIPB5", "NPIPB4", "EEF1A1", "ACTG1", "ACTB", "IFITM1")
deselected <- c("MALAT1")
GOUZ4_uniquely_up_noiseR_sig_sort_uniq_selected <- GOUZ4_uniquely_up_noiseR_sig_sort_uniq[!(rownames(GOUZ4_uniquely_up_noiseR_sig_sort_uniq) %in% deselected), ]

# export DGE lists
saveRDS(
  GOUZ4_uniquely_up_noiseR_sig_sort_uniq_selected,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_4_integrated.rds"),
  compress = "xz")
dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "integrated"), recursive = TRUE)
message("Writing 'uniquely_up (cluster_4_integrated)' marker genes to file.")
gzout <- gzfile(
  description = here(
    "output",
    "marker_genes",
    "whole_cell",
    "uniquely_up",
    "integrated",
    paste0("cluster_4_integrated",
           ".uniquely_up.csv.gz")),
  open = "wb")
write.table(
  x = GOUZ4_uniquely_up_noiseR_sig_sort_uniq_selected %>%
    as.data.frame() %>%
    tibble::rownames_to_column("gene_ID"),
  file = gzout,
  sep = ",",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE)
close(gzout)
# top only + gene-of-interest
best_set <- GOUZ4_uniquely_up_noiseR_sig_sort_uniq_selected[1:50, ]

Show code

# heatmap
plotHeatmap(
  sce,
  features = rownames(best_set),
  columns = order(
    sce$cluster,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$group,
    sce$plate_number,
    sce$CD4,
    sce$CD161),
  colour_columns_by = c(
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    # TODO: temp trick to deal with the row-colouring problem
    cluster4.vs.1 = factor(ifelse(rownames(best_set) %in% rownames(O_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster4.vs.3 = factor(ifelse(rownames(best_set) %in% rownames(U_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster4.vs.5 = factor(ifelse(rownames(best_set) %in% rownames(G_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    # TODO: work out how to remove `fill` from `not_DE`
    # cluster4.vs.2 = factor(ifelse(rownames(best_set) %in% rownames(Z_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster4.vs.1.vs.2.vs.3.vs.5 = factor(ifelse(rownames(best_set) %in% rownames(cluster4_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),    
    row.names = rownames(best_set)),
  main = paste0("Cluster ", chosen, " ", x),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 29: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

`Cluster_3_integrated`

Show code

############
# cluster 3
############

chosen <- "3"
# add description for the chosen cluster-group
x <- "(S3.mix.with.blood.2)"
# retain only significant markers (FDR<0.05) + keep only required output columns
E_uniquely_up_noiseR_sig <- E_uniquely_up_noiseR[E_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
M_uniquely_up_noiseR_sig <- M_uniquely_up_noiseR[M_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
T_uniquely_up_noiseR_sig <- T_uniquely_up_noiseR[T_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
V_uniquely_up_noiseR_sig <- V_uniquely_up_noiseR[V_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
X_uniquely_up_noiseR_sig <- X_uniquely_up_noiseR[X_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
cluster3_uniquely_up_noiseR_sig <- cluster3_uniquely_up_noiseR[cluster3_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
# add top column
E_uniquely_up_noiseR_sig$top <- 1:nrow(E_uniquely_up_noiseR_sig)
M_uniquely_up_noiseR_sig$top <- 1:nrow(M_uniquely_up_noiseR_sig)
T_uniquely_up_noiseR_sig$top <- 1:nrow(T_uniquely_up_noiseR_sig)
V_uniquely_up_noiseR_sig$top <- 1:nrow(V_uniquely_up_noiseR_sig)
X_uniquely_up_noiseR_sig$top <- 1:nrow(X_uniquely_up_noiseR_sig)
cluster3_uniquely_up_noiseR_sig$top <- 1:nrow(cluster3_uniquely_up_noiseR_sig)
# unify S4 objects, sort by top (ascending) then FDR (ascending), keep only first unique entry for each marker
EMTVX3_uniquely_up_noiseR_sig <- rbind2(E_uniquely_up_noiseR_sig,
                                        M_uniquely_up_noiseR_sig,
                                        T_uniquely_up_noiseR_sig,
                                        V_uniquely_up_noiseR_sig,
                                        X_uniquely_up_noiseR_sig,
                                        cluster3_uniquely_up_noiseR_sig)
EMTVX3_uniquely_up_noiseR_sig_sort <- EMTVX3_uniquely_up_noiseR_sig[with(EMTVX3_uniquely_up_noiseR_sig, order(top, FDR)), ]
EMTVX3_uniquely_up_noiseR_sig_sort_uniq <- EMTVX3_uniquely_up_noiseR_sig_sort[unique(rownames(EMTVX3_uniquely_up_noiseR_sig_sort)), ]
# # de-select unannotated/ not well-characterised genes
# deselected <- c("NPIPB13", "NPIPB3", "NPIPB11", "NPIPB5", "NPIPB4", "EEF1A1", "ACTG1", "ACTB", "IFITM1")
deselected <- c("MALAT1")
EMTVX3_uniquely_up_noiseR_sig_sort_uniq_selected <- EMTVX3_uniquely_up_noiseR_sig_sort_uniq[!(rownames(EMTVX3_uniquely_up_noiseR_sig_sort_uniq) %in% deselected), ]

# export DGE lists
saveRDS(
  EMTVX3_uniquely_up_noiseR_sig_sort_uniq_selected,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_3_integrated.rds"),
  compress = "xz")
dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "integrated"), recursive = TRUE)
message("Writing 'uniquely_up (cluster_3_integrated)' marker genes to file.")
gzout <- gzfile(
  description = here(
    "output",
    "marker_genes",
    "whole_cell",
    "uniquely_up",
    "integrated",
    paste0("cluster_3_integrated",
           ".uniquely_up.csv.gz")),
  open = "wb")
write.table(
  x = EMTVX3_uniquely_up_noiseR_sig_sort_uniq_selected %>%
    as.data.frame() %>%
    tibble::rownames_to_column("gene_ID"),
  file = gzout,
  sep = ",",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE)
close(gzout)
# top only + gene-of-interest
# NOTE: there are only 19 markers
# best_set <- EMTVX3_uniquely_up_noiseR_sig_sort_uniq_selected[1:50, ]
best_set <- EMTVX3_uniquely_up_noiseR_sig_sort_uniq_selected[1:dim(EMTVX3_uniquely_up_noiseR_sig_sort_uniq_selected)[1], ]

Show code

# heatmap
plotHeatmap(
  sce,
  features = rownames(best_set),
  columns = order(
    sce$cluster,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$group,
    sce$plate_number,
    sce$CD4,
    sce$CD161),
  colour_columns_by = c(
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    # TODO: temp trick to deal with the row-colouring problem
    cluster3.vs.1 = factor(ifelse(rownames(best_set) %in% rownames(E_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster3.vs.3 = factor(ifelse(rownames(best_set) %in% rownames(M_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster3.vs.4 = factor(ifelse(rownames(best_set) %in% rownames(T_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster3.vs.5 = factor(ifelse(rownames(best_set) %in% rownames(V_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster3.vs.5 = factor(ifelse(rownames(best_set) %in% rownames(X_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster3.vs.1.vs.2.vs.4.vs.5 = factor(ifelse(rownames(best_set) %in% rownames(cluster3_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),    
    row.names = rownames(best_set)),
  main = paste0("Cluster ", chosen, " ", x),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 30: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

`Cluster_2_integrated`

Show code

############
# cluster 2
############

chosen <- "2"
# add description for the chosen cluster-group
x <- "(S3.mix.with.blood.1)"
# retain only significant markers (FDR<0.05) + keep only required output columns
C_uniquely_up_noiseR_sig <- C_uniquely_up_noiseR[C_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
K_uniquely_up_noiseR_sig <- K_uniquely_up_noiseR[K_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
S_uniquely_up_noiseR_sig <- S_uniquely_up_noiseR[S_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
Y_uniquely_up_noiseR_sig <- Y_uniquely_up_noiseR[Y_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
cluster2_uniquely_up_noiseR_sig <- cluster2_uniquely_up_noiseR[cluster2_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
# add top column
C_uniquely_up_noiseR_sig$top <- 1:nrow(C_uniquely_up_noiseR_sig)
K_uniquely_up_noiseR_sig$top <- 1:nrow(K_uniquely_up_noiseR_sig)
S_uniquely_up_noiseR_sig$top <- 1:nrow(S_uniquely_up_noiseR_sig)
Y_uniquely_up_noiseR_sig$top <- 1:nrow(Y_uniquely_up_noiseR_sig)
# TODO: fix error - exclude list from rbind2 if empty
# cluster2_uniquely_up_noiseR_sig$top <- 1:nrow(cluster2_uniquely_up_noiseR_sig)
# unify S4 objects, sort by top (ascending) then FDR (ascending), keep only first unique entry for each marker
CKSY2_uniquely_up_noiseR_sig <- rbind2(C_uniquely_up_noiseR_sig,
                                       K_uniquely_up_noiseR_sig,
                                       S_uniquely_up_noiseR_sig,
                                       Y_uniquely_up_noiseR_sig,
                                       cluster2_uniquely_up_noiseR_sig)
CKSY2_uniquely_up_noiseR_sig_sort <- CKSY2_uniquely_up_noiseR_sig[with(CKSY2_uniquely_up_noiseR_sig, order(top, FDR)), ]
CKSY2_uniquely_up_noiseR_sig_sort_uniq <- CKSY2_uniquely_up_noiseR_sig_sort[unique(rownames(CKSY2_uniquely_up_noiseR_sig_sort)), ]
# # de-select unannotated/ not well-characterised genes
# deselected <- c("NPIPB13", "NPIPB3", "NPIPB11", "NPIPB5", "NPIPB4", "EEF1A1", "ACTG1", "ACTB", "IFITM1")
deselected <- c("MALAT1")
CKSY2_uniquely_up_noiseR_sig_sort_uniq_selected <- CKSY2_uniquely_up_noiseR_sig_sort_uniq[!(rownames(CKSY2_uniquely_up_noiseR_sig_sort_uniq) %in% deselected), ]

# export DGE lists
saveRDS(
  CKSY2_uniquely_up_noiseR_sig_sort_uniq_selected,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_2_integrated.rds"),
  compress = "xz")
dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "integrated"), recursive = TRUE)
message("Writing 'uniquely_up (cluster_2_integrated)' marker genes to file.")
gzout <- gzfile(
  description = here(
    "output",
    "marker_genes",
    "whole_cell",
    "uniquely_up",
    "integrated",
    paste0("cluster_2_integrated",
           ".uniquely_up.csv.gz")),
  open = "wb")
write.table(
  x = CKSY2_uniquely_up_noiseR_sig_sort_uniq_selected %>%
    as.data.frame() %>%
    tibble::rownames_to_column("gene_ID"),
  file = gzout,
  sep = ",",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE)
close(gzout)
# top only + gene-of-interest
best_set <- CKSY2_uniquely_up_noiseR_sig_sort_uniq_selected[1:50, ]

Show code

# heatmap
plotHeatmap(
  sce,
  features = rownames(best_set),
  columns = order(
    sce$cluster,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$group,
    sce$plate_number,
    sce$CD4,
    sce$CD161),
  colour_columns_by = c(
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    # TODO: temp trick to deal with the row-colouring problem
    cluster2.vs.1 = factor(ifelse(rownames(best_set) %in% rownames(K_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster2.vs.3 = factor(ifelse(rownames(best_set) %in% rownames(S_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster2.vs.4 = factor(ifelse(rownames(best_set) %in% rownames(Y_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster2.vs.5 = factor(ifelse(rownames(best_set) %in% rownames(C_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    # TODO: work out how to remove `fill` from `not_DE`
    # cluster2.vs.1.vs.3.vs.4.vs.5 = factor(ifelse(rownames(best_set) %in% rownames(cluster2_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),    
    row.names = rownames(best_set)),
  main = paste0("Cluster ", chosen, " ", x),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 31: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

`Cluster_1_integrated`

Show code

############
# cluster 1
############

chosen <- "1"
# add description for the chosen cluster-group
x <- "(S3.mix.higher.thymus)"
# retain only significant markers (FDR<0.05) + keep only required output columns
A_uniquely_up_noiseR_sig <- A_uniquely_up_noiseR[A_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
L_uniquely_up_noiseR_sig <- L_uniquely_up_noiseR[L_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
N_uniquely_up_noiseR_sig <- N_uniquely_up_noiseR[N_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
P_uniquely_up_noiseR_sig <- P_uniquely_up_noiseR[P_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
R_uniquely_up_noiseR_sig <- R_uniquely_up_noiseR[R_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
cluster1_uniquely_up_noiseR_sig <- cluster1_uniquely_up_noiseR[cluster1_uniquely_up_noiseR$FDR<0.05,][ ,c(1:3)]
# add top column
A_uniquely_up_noiseR_sig$top <- 1:nrow(A_uniquely_up_noiseR_sig)
L_uniquely_up_noiseR_sig$top <- 1:nrow(L_uniquely_up_noiseR_sig)
N_uniquely_up_noiseR_sig$top <- 1:nrow(N_uniquely_up_noiseR_sig)
# TODO: fix error - exclude list from rbind2 if empty
# P_uniquely_up_noiseR_sig$top <- 1:nrow(P_uniquely_up_noiseR_sig)
# R_uniquely_up_noiseR_sig$top <- 1:nrow(R_uniquely_up_noiseR_sig)
# cluster1_uniquely_up_noiseR_sig$top <- 1:nrow(cluster1_uniquely_up_noiseR_sig)

# unify S4 objects, sort by top (ascending) then FDR (ascending), keep only first unique entry for each marker
# TODO: fix why rbind2 cannot accept "empty" table only in this case 
# ALNPR1_uniquely_up_noiseR_sig <- rbind2(A_uniquely_up_noiseR_sig,
#                                         L_uniquely_up_noiseR_sig,
#                                         N_uniquely_up_noiseR_sig,
#                                         P_uniquely_up_noiseR_sig,
#                                         R_uniquely_up_noiseR_sig,
#                                         cluster1_uniquely_up_noiseR_sig)
ALNPR1_uniquely_up_noiseR_sig <- rbind2(A_uniquely_up_noiseR_sig,
                                        L_uniquely_up_noiseR_sig,
                                        N_uniquely_up_noiseR_sig)
ALNPR1_uniquely_up_noiseR_sig_sort <- ALNPR1_uniquely_up_noiseR_sig[with(ALNPR1_uniquely_up_noiseR_sig, order(top, FDR)), ]
ALNPR1_uniquely_up_noiseR_sig_sort_uniq <- ALNPR1_uniquely_up_noiseR_sig_sort[unique(rownames(ALNPR1_uniquely_up_noiseR_sig_sort)), ]
# de-select unannotated/ not well-characterised genes
# deselected <- c("NPIPB13", "NPIPB3", "NPIPB11", "NPIPB5", "NPIPB4", "EEF1A1", "ACTG1", "ACTB", "IFITM1")
deselected <- c("MALAT1")
ALNPR1_uniquely_up_noiseR_sig_sort_uniq_selected <- ALNPR1_uniquely_up_noiseR_sig_sort_uniq[!(rownames(ALNPR1_uniquely_up_noiseR_sig_sort_uniq) %in% deselected), ]

# export DGE lists
saveRDS(
  ALNPR1_uniquely_up_noiseR_sig_sort_uniq_selected,
  here("data", "marker_genes", "whole_cell", "C094_Pellicci.uniquely_up.cluster_1_integrated.rds"),
  compress = "xz")
dir.create(here("output", "marker_genes", "whole_cell", "uniquely_up", "integrated"), recursive = TRUE)
message("Writing 'uniquely_up (cluster_1_integrated)' marker genes to file.")
gzout <- gzfile(
  description = here(
    "output",
    "marker_genes",
    "whole_cell",
    "uniquely_up",
    "integrated",
    paste0("cluster_1_integrated",
           ".uniquely_up.csv.gz")),
  open = "wb")
write.table(
  x = ALNPR1_uniquely_up_noiseR_sig_sort_uniq_selected %>%
    as.data.frame() %>%
    tibble::rownames_to_column("gene_ID"),
  file = gzout,
  sep = ",",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE)
close(gzout)
# top only + gene-of-interest
best_set <- ALNPR1_uniquely_up_noiseR_sig_sort_uniq_selected[1:50, ]

Show code

# heatmap
plotHeatmap(
  sce,
  features = rownames(best_set),
  columns = order(
    sce$cluster,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$group,
    sce$plate_number,
    sce$CD4,
    sce$CD161),
  colour_columns_by = c(
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number",
    "CD4",
    "CD161"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  annotation_row = data.frame(
    # TODO: temp trick to deal with the row-colouring problem
    cluster1.vs.2 = factor(ifelse(rownames(best_set) %in% rownames(L_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster1.vs.3 = factor(ifelse(rownames(best_set) %in% rownames(N_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    cluster1.vs.5 = factor(ifelse(rownames(best_set) %in% rownames(A_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    # TODO: work out how to remove `fill` from `not_DE`
    # cluster1.vs.4 = factor(ifelse(rownames(best_set) %in% rownames(P_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE", "not DE")),
    # cluster1.vs.2_3_4 = factor(ifelse(rownames(best_set) %in% rownames(R_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE")),
    # cluster1.vs.2.vs.3.vs.4.vs.5 = factor(ifelse(rownames(best_set) %in% rownames(cluster1_uniquely_up_noiseR_sig), "DE", "not DE"), levels = c("DE", "not DE")),    
    row.names = rownames(best_set)),
  main = paste0("Cluster ", chosen, " ", x),
  column_annotation_colors = list(
    # Sig = c("Yes" = "red", "No" = "lightgrey"),
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 32: Heatmap of log-expression values in each sample for the top uniquely upregulated marker genes. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

DGE lists of these comparisons are available in output/marker_genes/whole_cell/uniquely_up/integrated/.

Summary:

In the current setup, stages of sample are defined by expression of two protein surface markers, CD4 and CD161. However, in the clustering of single cells, grouping of cells are purely based on their overall gene expression. Unless the heterogeneity between our targeted groups to be compared is really high and forming clear-cut clusters, we cannot pursue for comparison of clear-cut group by comparing clusters (say, pure thymus.s3 vs pure blood.s3 as in minibulk). But still, we can find gamma delta T cells exist in the same sub-stage (in terms of their gene expression) that shared between both tissue-of-origin.

Consistent with the mini-bulk outcome, single cell dataset shows that S3 cells (from either thymus or blood) is significantly different from S1-S2-mix. For the S3-mix cells, besides the S3.mix.higher.thymus, we provide evidence to support that S3.mix.with.blood could be subdivided into about 2-3 subgroups (depending on whether you trust “NPIP3 family member and ANKRD36” as markers): i.e. blood.s3.normal (cluster 2, 4) and blood.s3.special (cluster 3).

In general, each of these 3 clusters are characterized by: cluster 2: higher NKG7 (CCL5) cluster 3: highest IL7R and LTB (IFITM3) cluster 4: higher NPIP3 and ANKRD36 (CCL5, GZMK)

In comparison, for cluster 1 (S3.mix.higher.thymus): No obvious distinguishing feature and expression of most genes are sitting in the middle (slightly higher in expression of IFTIM3 and GZMK ?); interim development stage mostly in thymus ?

Supplemental info

Show code

############################################
# Comparison of key markers expression by clusters

assay(sce, "log2cpm") <- edgeR::cpm(
  counts(sce),
  log = TRUE,
  lib.size = colSums(counts(sce)))

Show code

plot_grid(
  
  plotExpression(
    sce,
    features = "IL7R",
    x = "cluster",
    exprs_values = "log2cpm",
    colour_by = "cluster") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_colour_manual(values = cluster_colours, name = "cluster"),
  
  plotExpression(
    sce,
    features = "CCL5",
    x = "cluster",
    exprs_values = "log2cpm",
    colour_by = "cluster") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_colour_manual(values = cluster_colours, name = "cluster"),
  
  plotExpression(
    sce,
    features = "LTB",
    x = "cluster",
    exprs_values = "log2cpm",
    colour_by = "cluster") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_colour_manual(values = cluster_colours, name = "cluster"),
  
  plotExpression(
    sce,
    features = "NKG7",
    x = "cluster",
    exprs_values = "log2cpm",
    colour_by = "cluster") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_colour_manual(values = cluster_colours, name = "cluster"),
  
  plotExpression(
    sce,
    features = "GZMK",
    x = "cluster",
    exprs_values = "log2cpm",
    colour_by = "cluster") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_colour_manual(values = cluster_colours, name = "cluster"),
  
  plotExpression(
    sce,
    features = "IFITM3",
    x = "cluster",
    exprs_values = "log2cpm",
    colour_by = "cluster") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_colour_manual(values = cluster_colours, name = "cluster"),
  
  plotExpression(
    sce,
    features = "ANKRD36",
    x = "cluster",
    exprs_values = "log2cpm",
    colour_by = "cluster") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_colour_manual(values = cluster_colours, name = "cluster"),
  
  plotExpression(
    sce,
    features = "NPIPB3",
    x = "cluster",
    exprs_values = "log2cpm",
    colour_by = "cluster") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_colour_manual(values = cluster_colours, name = "cluster"),
  
  ncol = 2)

Expression of minibulk DE in the single-cell dataset

Show code

# laod package for read in csv.gz
library(data.table)
library(R.utils)

# read in
a <- fread(here("output", "DEGs", "excluding_blood_1-3", "Thymus.S2_vs_Thymus.S1.aggregated_tech_reps.DEGs.csv.gz"))
b <- fread(here("output", "DEGs", "excluding_blood_1-3", "Thymus.S3_vs_Blood.S3.aggregated_tech_reps.DEGs.csv.gz"))
c <- fread(here("output", "DEGs", "excluding_blood_1-3", "Thymus.S3_vs_Thymus.S1.aggregated_tech_reps.DEGs.csv.gz"))
d <- fread(here("output", "DEGs", "excluding_blood_1-3", "Thymus.S3_vs_Thymus.S2.aggregated_tech_reps.DEGs.csv.gz"))

# extract DEGlist (FDR < 0.05)
minibulkDEG.a <- a$ENSEMBL.GENENAME[a$FDR<0.05]
minibulkDEG.b <- b$ENSEMBL.GENENAME[b$FDR<0.05]
minibulkDEG.c <- c$ENSEMBL.GENENAME[c$FDR<0.05]
minibulkDEG.d <- d$ENSEMBL.GENENAME[d$FDR<0.05]

# divide the DE by UP and DOWN
minibulkDEG.a.up <- a$ENSEMBL.GENENAME[a$FDR<0.05 & a$logFC >0]
minibulkDEG.b.up <- b$ENSEMBL.GENENAME[b$FDR<0.05 & b$logFC >0]
minibulkDEG.c.up <- c$ENSEMBL.GENENAME[c$FDR<0.05 & c$logFC >0]
minibulkDEG.d.up <- d$ENSEMBL.GENENAME[d$FDR<0.05 & d$logFC >0]

minibulkDEG.a.down <- a$ENSEMBL.GENENAME[a$FDR<0.05 & a$logFC <0]
minibulkDEG.b.down <- b$ENSEMBL.GENENAME[b$FDR<0.05 & b$logFC <0]
minibulkDEG.c.down <- c$ENSEMBL.GENENAME[c$FDR<0.05 & c$logFC <0]
minibulkDEG.d.down <- d$ENSEMBL.GENENAME[d$FDR<0.05 & d$logFC <0]

# keep only unique markers
uniq.minibulkDEG.a <- Reduce(setdiff, list(minibulkDEG.a,
                                           minibulkDEG.b,
                                           minibulkDEG.c,
                                           minibulkDEG.d))
uniq.minibulkDEG.b <- Reduce(setdiff, list(minibulkDEG.b,
                                           minibulkDEG.a,
                                           minibulkDEG.c,
                                           minibulkDEG.d))
uniq.minibulkDEG.c <- Reduce(setdiff, list(minibulkDEG.c,
                                           minibulkDEG.a,
                                           minibulkDEG.b,
                                           minibulkDEG.d))
uniq.minibulkDEG.d <- Reduce(setdiff, list(minibulkDEG.d,
                                           minibulkDEG.a,
                                           minibulkDEG.b,
                                           minibulkDEG.c))

uniq.minibulkDEG.a.up <- Reduce(setdiff, list(minibulkDEG.a.up,
                                              minibulkDEG.b.up,
                                              minibulkDEG.c.up,
                                              minibulkDEG.d.up))
uniq.minibulkDEG.b.up <- Reduce(setdiff, list(minibulkDEG.b.up,
                                              minibulkDEG.a.up,
                                              minibulkDEG.c.up,
                                              minibulkDEG.d.up))
uniq.minibulkDEG.c.up <- Reduce(setdiff, list(minibulkDEG.c.up,
                                              minibulkDEG.a.up,
                                              minibulkDEG.b.up,
                                              minibulkDEG.d.up))
uniq.minibulkDEG.d.up <- Reduce(setdiff, list(minibulkDEG.d.up,
                                              minibulkDEG.a.up,
                                              minibulkDEG.b.up,
                                              minibulkDEG.c.up))

uniq.minibulkDEG.a.down <- Reduce(setdiff, list(minibulkDEG.a.down,
                                              minibulkDEG.b.down,
                                              minibulkDEG.c.down,
                                              minibulkDEG.d.down))
uniq.minibulkDEG.b.down <- Reduce(setdiff, list(minibulkDEG.b.down,
                                              minibulkDEG.a.down,
                                              minibulkDEG.c.down,
                                              minibulkDEG.d.down))
uniq.minibulkDEG.c.down <- Reduce(setdiff, list(minibulkDEG.c.down,
                                              minibulkDEG.a.down,
                                              minibulkDEG.b.down,
                                              minibulkDEG.d.down))
uniq.minibulkDEG.d.down <- Reduce(setdiff, list(minibulkDEG.d.down,
                                              minibulkDEG.a.down,
                                              minibulkDEG.b.down,
                                              minibulkDEG.c.down))

# check number of unique minibulkDEG in each
length(uniq.minibulkDEG.a)

[1] 3

Show code

length(uniq.minibulkDEG.b)

[1] 11

Show code

length(uniq.minibulkDEG.c)

[1] 511

Show code

length(uniq.minibulkDEG.d)

[1] 37

Show code

length(uniq.minibulkDEG.a.up)

[1] 2

Show code

length(uniq.minibulkDEG.b.up)

[1] 7

Show code

length(uniq.minibulkDEG.c.up)

[1] 222

Show code

length(uniq.minibulkDEG.d.up)

[1] 18

Show code

length(uniq.minibulkDEG.a.down)

[1] 2

Show code

length(uniq.minibulkDEG.b.down)

[1] 4

Show code

length(uniq.minibulkDEG.c.down)

[1] 289

Show code

length(uniq.minibulkDEG.d.down)

[1] 20

Show code

# keep only top50
top.uniq.minibulkDEG.a <- if(length(uniq.minibulkDEG.a) >=25){uniq.minibulkDEG.a[1:25]} else {uniq.minibulkDEG.a}
top.uniq.minibulkDEG.b <- if(length(uniq.minibulkDEG.b) >=25){uniq.minibulkDEG.b[1:25]} else {uniq.minibulkDEG.b}
top.uniq.minibulkDEG.c <- if(length(uniq.minibulkDEG.c) >=25){uniq.minibulkDEG.c[1:25]} else {uniq.minibulkDEG.c}
top.uniq.minibulkDEG.d <- if(length(uniq.minibulkDEG.d) >=25){uniq.minibulkDEG.d[1:25]} else {uniq.minibulkDEG.d}

top.uniq.minibulkDEG.a.up <- if(length(uniq.minibulkDEG.a.up) >=25){uniq.minibulkDEG.a[1:25]} else {uniq.minibulkDEG.a.up}
top.uniq.minibulkDEG.b.up <- if(length(uniq.minibulkDEG.b.up) >=25){uniq.minibulkDEG.b[1:25]} else {uniq.minibulkDEG.b.up}
top.uniq.minibulkDEG.c.up <- if(length(uniq.minibulkDEG.c.up) >=25){uniq.minibulkDEG.c[1:25]} else {uniq.minibulkDEG.c.up}
top.uniq.minibulkDEG.d.up <- if(length(uniq.minibulkDEG.d.up) >=25){uniq.minibulkDEG.d[1:25]} else {uniq.minibulkDEG.d.up}

top.uniq.minibulkDEG.a.down <- if(length(uniq.minibulkDEG.a.down) >=25){uniq.minibulkDEG.a[1:25]} else {uniq.minibulkDEG.a.down}
top.uniq.minibulkDEG.b.down <- if(length(uniq.minibulkDEG.b.down) >=25){uniq.minibulkDEG.b[1:25]} else {uniq.minibulkDEG.b.down}
top.uniq.minibulkDEG.c.down <- if(length(uniq.minibulkDEG.c.down) >=25){uniq.minibulkDEG.c[1:25]} else {uniq.minibulkDEG.c.down}
top.uniq.minibulkDEG.d.down <- if(length(uniq.minibulkDEG.d.down) >=25){uniq.minibulkDEG.d[1:25]} else {uniq.minibulkDEG.d.down}

# feature
minibulk_markers <- c(top.uniq.minibulkDEG.a,
                      top.uniq.minibulkDEG.b,
                      top.uniq.minibulkDEG.c,
                      top.uniq.minibulkDEG.d)

minibulk_markers_up <- c(top.uniq.minibulkDEG.a.up,
                         top.uniq.minibulkDEG.b.up,
                         top.uniq.minibulkDEG.c.up,
                         top.uniq.minibulkDEG.d.up)

minibulk_markers_down <- c(top.uniq.minibulkDEG.a.down,
                           top.uniq.minibulkDEG.b.down,
                           top.uniq.minibulkDEG.c.down,
                           top.uniq.minibulkDEG.d.down)

Show code

# NOTE: The following is a workaround to the lack of support for tabsets in 
#       distill (see https://github.com/rstudio/distill/issues/11 and 
#       https://github.com/rstudio/distill/issues/11#issuecomment-692142414 in 
#       particular).
xaringanExtra::use_panelset()

`Minibulk DE_ALL_by cluster`

Show code

library(scater)
plotHeatmap(
  sce,
  features = minibulk_markers,
  columns = order(
    sce$cluster,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$group,
    sce$plate_number),
  colour_columns_by = c(
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  # TODO: temp trick to deal with the row-colouring problem
  annotation_row = data.frame(
    thymus.s2.vs.thymus.s1 = factor(ifelse(minibulk_markers %in% top.uniq.minibulkDEG.a, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.blood.s3 = factor(ifelse(minibulk_markers %in% top.uniq.minibulkDEG.b, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s1 = factor(ifelse(minibulk_markers %in% top.uniq.minibulkDEG.c, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s2 = factor(ifelse(minibulk_markers %in% top.uniq.minibulkDEG.c, "DE", "not DE"), levels = c("DE")),
    row.names = minibulk_markers),
  main = "Minibulk DE expression (UP and DOWN) in whole cell (with cells divided by clusters)",
  column_annotation_colors = list(
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Heatmap of log-expression values in each sample for the marker genes identified from the mini-bulk analysis. Cells are ordered by `cluster`. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

Figure 33: Heatmap of log-expression values in each sample for the marker genes identified from the mini-bulk analysis. Cells are ordered by cluster. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

`Minibulk DE_ALL_by group`

Show code

library(scater)
plotHeatmap(
  sce,
  features = minibulk_markers,
  columns = order(
    sce$group,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$plate_number),
  colour_columns_by = c(
    "group",
    "stage",
    "tissue",
    "donor",
    "plate_number"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  # TODO: temp trick to deal with the row-colouring problem
  annotation_row = data.frame(
    thymus.s2.vs.thymus.s1 = factor(ifelse(minibulk_markers %in% top.uniq.minibulkDEG.a, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.blood.s3 = factor(ifelse(minibulk_markers %in% top.uniq.minibulkDEG.b, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s1 = factor(ifelse(minibulk_markers %in% top.uniq.minibulkDEG.c, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s2 = factor(ifelse(minibulk_markers %in% top.uniq.minibulkDEG.c, "DE", "not DE"), levels = c("DE")),
    row.names = minibulk_markers),
  main = "Minibulk DE expression (UP and DOWN) in whole cell (with cells divided by groups)",
  column_annotation_colors = list(
    group = group_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Heatmap of log-expression values in each sample for the marker genes identified from the mini-bulk analysis. Cells are ordered by `group`. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

Figure 34: Heatmap of log-expression values in each sample for the marker genes identified from the mini-bulk analysis. Cells are ordered by group. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

`Minibulk DE_ALL_by group (clust_col)`

Show code

library(scater)
plotHeatmap(
  sce,
  features = minibulk_markers,
  columns = order(
    sce$group,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$plate_number),
  colour_columns_by = c(
    "group",
    "stage",
    "tissue",
    "donor",
    "plate_number"),
  cluster_cols = TRUE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  # TODO: temp trick to deal with the row-colouring problem
  annotation_row = data.frame(
    thymus.s2.vs.thymus.s1 = factor(ifelse(minibulk_markers %in% top.uniq.minibulkDEG.a, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.blood.s3 = factor(ifelse(minibulk_markers %in% top.uniq.minibulkDEG.b, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s1 = factor(ifelse(minibulk_markers %in% top.uniq.minibulkDEG.c, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s2 = factor(ifelse(minibulk_markers %in% top.uniq.minibulkDEG.c, "DE", "not DE"), levels = c("DE")),
    row.names = minibulk_markers),
  main = "Minibulk DE expression (UP and DOWN) in whole cell (with cells divided by groups, column-clustered)",
  column_annotation_colors = list(
    group = group_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Heatmap of log-expression values in each sample for the marker genes identified from the mini-bulk analysis. Cells are ordered by `group` (column-clustered). Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

Figure 35: Heatmap of log-expression values in each sample for the marker genes identified from the mini-bulk analysis. Cells are ordered by group (column-clustered). Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

`Minibulk DE_UP_by cluster`

Show code

library(scater)
plotHeatmap(
  sce,
  features = minibulk_markers_up,
  columns = order(
    sce$cluster,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$group,
    sce$plate_number),
  colour_columns_by = c(
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  # TODO: temp trick to deal with the row-colouring problem
  annotation_row = data.frame(
    thymus.s2.vs.thymus.s1 = factor(ifelse(minibulk_markers_up %in% top.uniq.minibulkDEG.a.up, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.blood.s3 = factor(ifelse(minibulk_markers_up %in% top.uniq.minibulkDEG.b.up, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s1 = factor(ifelse(minibulk_markers_up %in% top.uniq.minibulkDEG.c.up, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s2 = factor(ifelse(minibulk_markers_up %in% top.uniq.minibulkDEG.c.up, "DE", "not DE"), levels = c("DE")),
    row.names = minibulk_markers_up),
  main = "Minibulk DE expression (UP only) in whole cell (with cells divided by clusters)",
  column_annotation_colors = list(
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Heatmap of log-expression values in each sample for the marker genes (up-regulated) identified from the mini-bulk analysis. Cells are ordered by `cluster`. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

Figure 36: Heatmap of log-expression values in each sample for the marker genes (up-regulated) identified from the mini-bulk analysis. Cells are ordered by cluster. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

`Minibulk DE_UP_by group`

Show code

library(scater)
plotHeatmap(
  sce,
  features = minibulk_markers_up,
  columns = order(
    sce$group,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$plate_number),
  colour_columns_by = c(
    "group",
    "stage",
    "tissue",
    "donor",
    "plate_number"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  # TODO: temp trick to deal with the row-colouring problem
  annotation_row = data.frame(
    thymus.s2.vs.thymus.s1 = factor(ifelse(minibulk_markers_up %in% top.uniq.minibulkDEG.a.up, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.blood.s3 = factor(ifelse(minibulk_markers_up %in% top.uniq.minibulkDEG.b.up, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s1 = factor(ifelse(minibulk_markers_up %in% top.uniq.minibulkDEG.c.up, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s2 = factor(ifelse(minibulk_markers_up %in% top.uniq.minibulkDEG.c.up, "DE", "not DE"), levels = c("DE")),
    row.names = minibulk_markers_up),
  main = "Minibulk DE expression (UP only) in whole cell (with cells divided by groups)",
  column_annotation_colors = list(
    group = group_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Heatmap of log-expression values in each sample for the marker genes (up-regulated) identified from the mini-bulk analysis. Cells are ordered by `group`. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

Figure 37: Heatmap of log-expression values in each sample for the marker genes (up-regulated) identified from the mini-bulk analysis. Cells are ordered by group. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

`Minibulk DE_UP_by group (clust_col)`

Show code

library(scater)
plotHeatmap(
  sce,
  features = minibulk_markers_up,
  columns = order(
    sce$group,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$plate_number),
  colour_columns_by = c(
    "group",
    "stage",
    "tissue",
    "donor",
    "plate_number"),
  cluster_cols = TRUE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  # TODO: temp trick to deal with the row-colouring problem
  annotation_row = data.frame(
    thymus.s2.vs.thymus.s1 = factor(ifelse(minibulk_markers_up %in% top.uniq.minibulkDEG.a.up, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.blood.s3 = factor(ifelse(minibulk_markers_up %in% top.uniq.minibulkDEG.b.up, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s1 = factor(ifelse(minibulk_markers_up %in% top.uniq.minibulkDEG.c.up, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s2 = factor(ifelse(minibulk_markers_up %in% top.uniq.minibulkDEG.c.up, "DE", "not DE"), levels = c("DE")),
    row.names = minibulk_markers_up),
  main = "Minibulk DE expression (UP only) in whole cell (with cells divided by groups, column-clustered)",
  column_annotation_colors = list(
    group = group_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 38: Heatmap of log-expression values in each sample for the marker genes (up-regulated) identified from the mini-bulk analysis. Cells are ordered by group (column-clustered). Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

`Minibulk DE_DOWN_by cluster`

Show code

library(scater)
plotHeatmap(
  sce,
  features = minibulk_markers_down,
  columns = order(
    sce$cluster,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$group,
    sce$plate_number),
  colour_columns_by = c(
    "cluster",
    "stage",
    "tissue",
    "donor",
    "group",
    "plate_number"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  # TODO: temp trick to deal with the row-colouring problem
  annotation_row = data.frame(
    thymus.s2.vs.thymus.s1 = factor(ifelse(minibulk_markers_down %in% top.uniq.minibulkDEG.a.down, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.blood.s3 = factor(ifelse(minibulk_markers_down %in% top.uniq.minibulkDEG.b.down, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s1 = factor(ifelse(minibulk_markers_down %in% top.uniq.minibulkDEG.c.down, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s2 = factor(ifelse(minibulk_markers_down %in% top.uniq.minibulkDEG.c.down, "DE", "not DE"), levels = c("DE")),
    row.names = minibulk_markers_down),
  main = "Minibulk DE expression (DOWN only) in whole cell (with cells divided by clusters)",
  column_annotation_colors = list(
    cluster = cluster_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    group = group_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Heatmap of log-expression values in each sample for the marker genes (down-regulated) identified from the mini-bulk analysis. Cells are ordered by `cluster`. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

Figure 39: Heatmap of log-expression values in each sample for the marker genes (down-regulated) identified from the mini-bulk analysis. Cells are ordered by cluster. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

`Minibulk DE_DOWN_by group`

Show code

library(scater)
plotHeatmap(
  sce,
  features = minibulk_markers_down,
  columns = order(
    sce$group,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$plate_number),
  colour_columns_by = c(
    "group",
    "stage",
    "tissue",
    "donor",
    "plate_number"),
  cluster_cols = FALSE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  # TODO: temp trick to deal with the row-colouring problem
  annotation_row = data.frame(
    thymus.s2.vs.thymus.s1 = factor(ifelse(minibulk_markers_down %in% top.uniq.minibulkDEG.a.down, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.blood.s3 = factor(ifelse(minibulk_markers_down %in% top.uniq.minibulkDEG.b.down, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s1 = factor(ifelse(minibulk_markers_down %in% top.uniq.minibulkDEG.c.down, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s2 = factor(ifelse(minibulk_markers_down %in% top.uniq.minibulkDEG.c.down, "DE", "not DE"), levels = c("DE")),
    row.names = minibulk_markers_down),
  main = "Minibulk DE expression (DOWN only) in whole cell (with cells divided by groups)",
  column_annotation_colors = list(
    grodown = group_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 40: Heatmap of log-expression values in each sample for the marker genes (down-regulated) identified from the mini-bulk analysis. Cells are ordered by grodown. Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

`Minibulk DE_DOWN_by group (clust_col)`

Show code

library(scater)
plotHeatmap(
  sce,
  features = minibulk_markers_down,
  columns = order(
    sce$group,
    sce$stage,
    sce$tissue,
    sce$donor,
    sce$plate_number),
  colour_columns_by = c(
    "group",
    "stage",
    "tissue",
    "donor",
    "plate_number"),
  cluster_cols = TRUE,
  center = TRUE,
  symmetric = TRUE,
  zlim = c(-3, 3),
  show_colnames = FALSE,
  # TODO: temp trick to deal with the row-colouring problem
  annotation_row = data.frame(
    thymus.s2.vs.thymus.s1 = factor(ifelse(minibulk_markers_down %in% top.uniq.minibulkDEG.a.down, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.blood.s3 = factor(ifelse(minibulk_markers_down %in% top.uniq.minibulkDEG.b.down, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s1 = factor(ifelse(minibulk_markers_down %in% top.uniq.minibulkDEG.c.down, "DE", "not DE"), levels = c("DE")),
    thymus.s3.vs.thymus.s2 = factor(ifelse(minibulk_markers_down %in% top.uniq.minibulkDEG.c.down, "DE", "not DE"), levels = c("DE")),
    row.names = minibulk_markers_down),
  main = "Minibulk DE expression (DOWN only) in whole cell (with cells divided by groups, column-clustered)",
  column_annotation_colors = list(
    grodown = group_colours,
    stage = stage_colours,
    tissue = tissue_colours,
    donor = donor_colours,
    plate_number = plate_number_colours),
  color = hcl.colors(101, "Blue-Red 3"),
  fontsize = 7)

Figure 41: Heatmap of log-expression values in each sample for the marker genes (down-regulated) identified from the mini-bulk analysis. Cells are ordered by grodown (column-clustered). Each column is a sample, each row a gene. Colours are capped at -3 and 3 to preserve dynamic range.

Session info

The analysis and this document were prepared using the following software (click triangle to expand)

Show code

sessioninfo::session_info()

─ Session info ─────────────────────────────────────────────────────
 setting  value                       
 version  R version 4.0.3 (2020-10-10)
 os       CentOS Linux 7 (Core)       
 system   x86_64, linux-gnu           
 ui       X11                         
 language (EN)                        
 collate  en_US.UTF-8                 
 ctype    en_US.UTF-8                 
 tz       Australia/Melbourne         
 date     2021-09-29                  

─ Packages ─────────────────────────────────────────────────────────
 ! package              * version  date       lib
 P annotate               1.68.0   2020-10-27 [?]
 P AnnotationDbi          1.52.0   2020-10-27 [?]
 P assertthat             0.2.1    2019-03-21 [?]
 P batchelor            * 1.6.3    2021-04-16 [?]
 P beachmat               2.6.4    2020-12-20 [?]
 P beeswarm               0.3.1    2021-03-07 [?]
 P Biobase              * 2.50.0   2020-10-27 [?]
 P BiocGenerics         * 0.36.0   2020-10-27 [?]
 P BiocManager            1.30.12  2021-03-28 [?]
 P BiocNeighbors          1.8.2    2020-12-07 [?]
 P BiocParallel         * 1.24.1   2020-11-06 [?]
 P BiocSingular           1.6.0    2020-10-27 [?]
 P BiocStyle            * 2.18.1   2020-11-24 [?]
 P bit                    4.0.4    2020-08-04 [?]
 P bit64                  4.0.5    2020-08-30 [?]
 P bitops                 1.0-6    2013-08-17 [?]
 P blob                   1.2.1    2020-01-20 [?]
 P bluster                1.0.0    2020-10-27 [?]
 P bslib                  0.2.4    2021-01-25 [?]
 P cachem                 1.0.4    2021-02-13 [?]
 P cellranger             1.1.0    2016-07-27 [?]
 P cli                    2.4.0    2021-04-05 [?]
 P colorspace             2.0-0    2020-11-11 [?]
 P cowplot              * 1.1.1    2020-12-30 [?]
 P crayon                 1.4.1    2021-02-08 [?]
 P data.table           * 1.14.0   2021-02-21 [?]
 P DBI                    1.1.1    2021-01-15 [?]
 P DelayedArray           0.16.3   2021-03-24 [?]
 P DelayedMatrixStats     1.12.3   2021-02-03 [?]
 P DESeq2                 1.30.1   2021-02-19 [?]
 P digest                 0.6.27   2020-10-24 [?]
 P distill                1.2      2021-01-13 [?]
 P downlit                0.2.1    2020-11-04 [?]
 P dplyr                * 1.0.5    2021-03-05 [?]
 P dqrng                  0.2.1    2019-05-17 [?]
 P edgeR                * 3.32.1   2021-01-14 [?]
 P ellipsis               0.3.1    2020-05-15 [?]
 P evaluate               0.14     2019-05-28 [?]
 P fansi                  0.4.2    2021-01-15 [?]
 P farver                 2.1.0    2021-02-28 [?]
 P fastmap                1.1.0    2021-01-25 [?]
 P genefilter             1.72.1   2021-01-21 [?]
 P geneplotter            1.68.0   2020-10-27 [?]
 P generics               0.1.0    2020-10-31 [?]
 P GenomeInfoDb         * 1.26.4   2021-03-10 [?]
 P GenomeInfoDbData       1.2.4    2020-10-20 [?]
 P GenomicRanges        * 1.42.0   2020-10-27 [?]
 P ggbeeswarm             0.6.0    2017-08-07 [?]
 P ggplot2              * 3.3.3    2020-12-30 [?]
 P ggrepel              * 0.9.1    2021-01-15 [?]
 P Glimma               * 2.0.0    2020-10-27 [?]
 P glue                   1.4.2    2020-08-27 [?]
 P gridExtra              2.3      2017-09-09 [?]
 P gtable                 0.3.0    2019-03-25 [?]
 P here                 * 1.0.1    2020-12-13 [?]
 P highr                  0.9      2021-04-16 [?]
 P htmltools              0.5.1.1  2021-01-22 [?]
 P htmlwidgets            1.5.3    2020-12-10 [?]
 P httr                   1.4.2    2020-07-20 [?]
 P igraph                 1.2.6    2020-10-06 [?]
 P IRanges              * 2.24.1   2020-12-12 [?]
 P irlba                  2.3.3    2019-02-05 [?]
 P janitor              * 2.1.0    2021-01-05 [?]
 P jquerylib              0.1.3    2020-12-17 [?]
 P jsonlite               1.7.2    2020-12-09 [?]
 P knitr                  1.33     2021-04-24 [?]
 P labeling               0.4.2    2020-10-20 [?]
 P lattice                0.20-41  2020-04-02 [3]
 P lifecycle              1.0.0    2021-02-15 [?]
 P limma                * 3.46.0   2020-10-27 [?]
 P locfit                 1.5-9.4  2020-03-25 [?]
 P lubridate              1.7.10   2021-02-26 [?]
 P magrittr             * 2.0.1    2020-11-17 [?]
 P Matrix                 1.2-18   2019-11-27 [3]
 P MatrixGenerics       * 1.2.1    2021-01-30 [?]
 P matrixStats          * 0.58.0   2021-01-29 [?]
 P memoise                2.0.0    2021-01-26 [?]
 P msigdbr              * 7.2.1    2020-10-02 [?]
 P munsell                0.5.0    2018-06-12 [?]
 P patchwork            * 1.1.1    2020-12-17 [?]
 P pheatmap             * 1.0.12   2019-01-04 [?]
 P pillar                 1.5.1    2021-03-05 [?]
 P pkgconfig              2.0.3    2019-09-22 [?]
 P purrr                  0.3.4    2020-04-17 [?]
 P R.methodsS3          * 1.8.1    2020-08-26 [?]
 P R.oo                 * 1.24.0   2020-08-26 [?]
 P R.utils              * 2.10.1   2020-08-26 [?]
 P R6                     2.5.0    2020-10-28 [?]
 P RColorBrewer           1.1-2    2014-12-07 [?]
 P Rcpp                   1.0.6    2021-01-15 [?]
 P RCurl                  1.98-1.3 2021-03-16 [?]
 P readxl               * 1.3.1    2019-03-13 [?]
 P ResidualMatrix         1.0.0    2020-10-27 [?]
 P rlang                  0.4.11   2021-04-30 [?]
 P rmarkdown            * 2.7      2021-02-19 [?]
 P rprojroot              2.0.2    2020-11-15 [?]
 P RSQLite                2.2.5    2021-03-27 [?]
 P rsvd                   1.0.3    2020-02-17 [?]
 P S4Vectors            * 0.28.1   2020-12-09 [?]
 P sass                   0.3.1    2021-01-24 [?]
 P scales                 1.1.1    2020-05-11 [?]
 P scater               * 1.18.6   2021-02-26 [?]
 P scran                * 1.18.5   2021-02-04 [?]
 P scuttle                1.0.4    2020-12-17 [?]
 P sessioninfo            1.1.1    2018-11-05 [?]
 P SingleCellExperiment * 1.12.0   2020-10-27 [?]
 P snakecase              0.11.0   2019-05-25 [?]
 P sparseMatrixStats      1.2.1    2021-02-02 [?]
 P statmod                1.4.35   2020-10-19 [?]
 P stringi                1.7.3    2021-07-16 [?]
 P stringr                1.4.0    2019-02-10 [?]
 P SummarizedExperiment * 1.20.0   2020-10-27 [?]
 P survival               3.2-7    2020-09-28 [3]
 P tibble                 3.1.0    2021-02-25 [?]
 P tidyr                * 1.1.3    2021-03-03 [?]
 P tidyselect             1.1.0    2020-05-11 [?]
 P utf8                   1.2.1    2021-03-12 [?]
 P vctrs                  0.3.7    2021-03-29 [?]
 P vipor                  0.4.5    2017-03-22 [?]
 P viridis                0.5.1    2018-03-29 [?]
 P viridisLite            0.3.0    2018-02-01 [?]
 P withr                  2.4.1    2021-01-26 [?]
 P xaringanExtra          0.5.4    2021-08-04 [?]
 P xfun                   0.24     2021-06-15 [?]
 P XML                    3.99-0.6 2021-03-16 [?]
 P xtable                 1.8-4    2019-04-21 [?]
 P XVector                0.30.0   2020-10-27 [?]
 P yaml                   2.2.1    2020-02-01 [?]
 P zlibbioc               1.36.0   2020-10-27 [?]
 source                                  
 Bioconductor                            
 Bioconductor                            
 CRAN (R 4.0.0)                          
 Bioconductor                            
 Bioconductor                            
 CRAN (R 4.0.3)                          
 Bioconductor                            
 Bioconductor                            
 CRAN (R 4.0.3)                          
 Bioconductor                            
 Bioconductor                            
 Bioconductor                            
 Bioconductor                            
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.0)                          
 Bioconductor                            
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 Bioconductor                            
 Bioconductor                            
 Bioconductor                            
 CRAN (R 4.0.2)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.0)                          
 Bioconductor                            
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 Bioconductor                            
 Bioconductor                            
 CRAN (R 4.0.3)                          
 Bioconductor                            
 Bioconductor                            
 Bioconductor                            
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 Bioconductor                            
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.2)                          
 Bioconductor                            
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 Bioconductor                            
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 Bioconductor                            
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.2)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.2)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.0)                          
 Bioconductor                            
 CRAN (R 4.0.5)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.0)                          
 Bioconductor                            
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.0)                          
 Bioconductor                            
 Bioconductor                            
 Bioconductor                            
 CRAN (R 4.0.0)                          
 Bioconductor                            
 CRAN (R 4.0.0)                          
 Bioconductor                            
 CRAN (R 4.0.2)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.0)                          
 Bioconductor                            
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.0)                          
 CRAN (R 4.0.3)                          
 Github (gadenbuie/xaringanExtra@5e2d80b)
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.3)                          
 CRAN (R 4.0.0)                          
 Bioconductor                            
 CRAN (R 4.0.0)                          
 Bioconductor                            

[1] /stornext/Projects/score/Analyses/C094_Pellicci/renv/library/R-4.0/x86_64-pc-linux-gnu
[2] /tmp/RtmpP41dVI/renv-system-library
[3] /stornext/System/data/apps/R/R-4.0.3/lib64/R/library

 P ── Loaded and on-disk path mismatch.

Soneson, C., and M. D. Robinson. 2018. “Bias, Robustness and Scalability in Single-Cell Differential Expression Analysis.” Nat. Methods 15 (4): 255–61. https://www.ncbi.nlm.nih.gov/pubmed/29481549.

Differential expression analyses is performed on the original log-expression values. We do not use the MNN-corrected values here except to obtain the clusters to be characterized.↩︎

Annotating Pellicci gamma-delta T-cell single-cell dataset (whole cell)

Preparing the data

Re-clustering

Data overview

Marker gene detection

Cluster 1 vs. 2 vs. 3 vs. 4 vs. 5

Cluster 1

Cluster 2

Cluster 3

Cluster 4

Cluster 5

fx: how clusters of the S3-mix different from that of the S1-S2-mix

fx: difference among the clusters of S3-mix (higher thymus vs with blood)

fx: how special is cluster 3 in the clusters of S3-mix

FX: difference between cluster 2 and cluster 4 (in S3-mix with blood)

Selected pairwise comparisons

cluster_1_vs_cluster_5

SUMMARY: 1 vs 5 (A vs B)

cluster_2_vs_cluster_5

SUMMARY: 2 vs 5 (C vs D)

cluster_3_vs_cluster_5

SUMMARY: 3 vs 5 (E vs F)

cluster_4_vs_cluster_5

SUMMARY: 4 vs 5 (G vs H)

cluster_2_3_4_vs_cluster_5

SUMMARY: 2_3_4 vs 5 (I vs J)

cluster_2_vs_cluster_1

SUMMARY: 2 vs 1 (K vs L)

cluster_3_vs_cluster_1

SUMMARY: 3 vs 1 (M vs N)

cluster_4_vs_cluster_1

SUMMARY: 4 vs 1 (O vs P)

cluster_2_3_4_vs_cluster_1

SUMMARY: 2_3_4 vs 1 (Q vs R)

cluster_2_vs_cluster_3

SUMMARY: 2 vs 3 (S vs T)

cluster_4_vs_cluster_3

SUMMARY: 4 vs 3 (U vs V)

cluster_2_4_vs_cluster_3

SUMMARY: 2_4 vs 3 (W vs X)

cluster_2_vs_cluster_4

SUMMARY: 2 vs 4 (Y vs Z)

Selected pairwise comparisons - integrated

Cluster_5_integrated

Cluster_4_integrated

Cluster_3_integrated

Cluster_2_integrated

Cluster_1_integrated

Supplemental info

Expression of minibulk DE in the single-cell dataset

Minibulk DE_ALL_by cluster

Minibulk DE_ALL_by group

Minibulk DE_ALL_by group (clust_col)

Minibulk DE_UP_by cluster

Minibulk DE_UP_by group

Minibulk DE_UP_by group (clust_col)

Minibulk DE_DOWN_by cluster

Minibulk DE_DOWN_by group

Minibulk DE_DOWN_by group (clust_col)

Session info

References

Cluster `1` vs. `2` vs. `3` vs. `4` vs. `5`

`Cluster 1`

`Cluster 2`

`Cluster 3`

`Cluster 4`

`Cluster 5`

fx: how special is cluster `3` in the clusters of S3-mix

FX: difference between cluster `2` and cluster `4` (in S3-mix with blood)

`cluster_1_vs_cluster_5`

`cluster_2_vs_cluster_5`

`cluster_3_vs_cluster_5`

`cluster_4_vs_cluster_5`

`cluster_2_3_4_vs_cluster_5`

`cluster_2_vs_cluster_1`

`cluster_3_vs_cluster_1`

`cluster_4_vs_cluster_1`

`cluster_2_3_4_vs_cluster_1`

`cluster_2_vs_cluster_3`

`cluster_4_vs_cluster_3`

`cluster_2_4_vs_cluster_3`

`cluster_2_vs_cluster_4`

`Cluster_5_integrated`

`Cluster_4_integrated`

`Cluster_3_integrated`

`Cluster_2_integrated`

`Cluster_1_integrated`

`Minibulk DE_ALL_by cluster`

`Minibulk DE_ALL_by group`

`Minibulk DE_ALL_by group (clust_col)`

`Minibulk DE_UP_by cluster`

`Minibulk DE_UP_by group`

`Minibulk DE_UP_by group (clust_col)`

`Minibulk DE_DOWN_by cluster`

`Minibulk DE_DOWN_by group`

`Minibulk DE_DOWN_by group (clust_col)`