Merging cells for Pellicci gamma-delta T-cell single-cell dataset (thymus only)

William Ho (Single Cell Open Research Endeavour (SCORE), WEHI)https://www.wehi.edu.au/people/rory-bowden/4536/wehi-advanced-genomics-facility
2021-09-29

Preparing the data

We start from the preprocessed SingleCellExperiment object created in ‘Selection of biologically relevant cells for Pellicci gamma-delta T-cell single-cell dataset’.

Show code
library(SingleCellExperiment)
library(here)
library(scater)
library(scran)
library(ggplot2)
library(cowplot)
library(edgeR)
library(Glimma)
library(BiocParallel)
library(patchwork)
library(pheatmap)
library(janitor)
library(distill)
source(here("code", "helper_functions.R"))
# NOTE: Using multiple cores seizes up my laptop. Can use more on unix box.
options("mc.cores" = ifelse(Sys.info()[["nodename"]] == "PC1331", 2L, 8L))
register(MulticoreParam(workers = getOption("mc.cores")))
knitr::opts_chunk$set(fig.path = "C094_Pellicci.single-cell.merge.thymus_only_files/")
Show code
sce <- readRDS(here("data", "SCEs", "C094_Pellicci.single-cell.cell_selected.SCE.rds"))

# Some useful colours
plate_number_colours <- setNames(
  unique(sce$colours$plate_number_colours),
  unique(names(sce$colours$plate_number_colours)))
plate_number_colours <- plate_number_colours[levels(sce$plate_number)]
tissue_colours <- setNames(
  unique(sce$colours$tissue_colours),
  unique(names(sce$colours$tissue_colours)))
tissue_colours <- tissue_colours[levels(sce$tissue)]
donor_colours <- setNames(
  unique(sce$colours$donor_colours),
  unique(names(sce$colours$donor_colours)))
donor_colours <- donor_colours[levels(sce$donor)]
stage_colours <- setNames(
  unique(sce$colours$stage_colours),
  unique(names(sce$colours$stage_colours)))
stage_colours <- stage_colours[levels(sce$stage)]

# also define group (i.e. tissue.stage)
sce$group <- factor(paste0(sce$tissue, ".", sce$stage))
group_colours <- setNames(
  Polychrome::kelly.colors(nlevels(sce$group) + 1)[-1],
  levels(sce$group))
sce$colours$group_colours <- group_colours[sce$group]

# Some useful gene sets
mito_set <- rownames(sce)[any(rowData(sce)$ENSEMBL.SEQNAME == "MT")]
ribo_set <- grep("^RP(S|L)", rownames(sce), value = TRUE)
# NOTE: A more curated approach for identifying ribosomal protein genes 
#       (https://github.com/Bioconductor/OrchestratingSingleCellAnalysis-base/blob/ae201bf26e3e4fa82d9165d8abf4f4dc4b8e5a68/feature-selection.Rmd#L376-L380)
library(msigdbr)
c2_sets <- msigdbr(species = "Homo sapiens", category = "C2")
ribo_set <- union(
  ribo_set,
  c2_sets[c2_sets$gs_name == "KEGG_RIBOSOME", ]$gene_symbol)
ribo_set <- intersect(ribo_set, rownames(sce))
sex_set <- rownames(sce)[any(rowData(sce)$ENSEMBL.SEQNAME %in% c("X", "Y"))]
pseudogene_set <- rownames(sce)[
  any(grepl("pseudogene", rowData(sce)$ENSEMBL.GENEBIOTYPE))]
# NOTE: get rid of psuedogene seems not to be good enough for HVG determination of this dataset
protein_coding_gene_set <- rownames(sce)[
  any(grepl("protein_coding", rowData(sce)$ENSEMBL.GENEBIOTYPE))]

Re-processing

We re-process the cells from thymus only remained in the dataset.

Show code
sce <- sce[, sce$tissue == "Thymus"]
sce <- sce[, !sce$stage == "Unknown"]
colData(sce) <- droplevels(colData(sce))

NOTE: as advised by Dan during the meeting on 15 Jul 2021, we should proceed with HVG determination (and thus clustering) based on protein coding genes only (instead of excluding ribosomal, mitochondrial and pseudogenes only).

Show code
sce$batch <- sce$plate_number
var_fit <- modelGeneVarWithSpikes(sce, "ERCC", block = sce$batch)
hvg <- getTopHVGs(var_fit, var.threshold = 0)
hvg <- intersect(hvg, protein_coding_gene_set)

set.seed(67726)
sce <- denoisePCA(
  sce, 
  var_fit, 
  subset.row = hvg,
  BSPARAM = BiocSingular::IrlbaParam(deferred = TRUE))

set.seed(853)
sce <- runUMAP(sce, dimred = "PCA")

set.seed(4759)
snn_gr_0 <- buildSNNGraph(sce, use.dimred = "PCA")
clusters_0 <- igraph::cluster_louvain(snn_gr_0)
sce$cluster_0 <- factor(clusters_0$membership)
cluster_colours_0 <- setNames(
  scater:::.get_palette("tableau10medium")[seq_len(nlevels(sce$cluster_0))],
  levels(sce$cluster_0))
sce$colours$cluster_colours_0 <- cluster_colours_0[sce$cluster_0]

There are 4 clusters detected, shown on the UMAP plot Figure 1 and broken down by experimental factors in Figure 2.

Show code
p1 <- ggcells(sce, aes(x = UMAP.1, y = UMAP.2)) +
  geom_point(aes(colour = cluster_0), size = 0.25) +
  scale_colour_manual(values = cluster_colours_0) + 
  theme_cowplot(font_size = 8) + 
  xlab("Dimension 1") + 
  ylab("Dimension 2")

p2 <- ggcells(sce, aes(x = UMAP.1, y = UMAP.2)) +
  geom_point(aes(colour = stage), size = 0.25) +
  scale_colour_manual(values = stage_colours) + 
  theme_cowplot(font_size = 8) + 
  xlab("Dimension 1") + 
  ylab("Dimension 2")

p3 <- ggcells(sce, aes(x = UMAP.1, y = UMAP.2)) +
  geom_point(aes(colour = plate_number), size = 0.25) +
  scale_colour_manual(values = plate_number_colours) + 
  theme_cowplot(font_size = 8) + 
  xlab("Dimension 1") + 
  ylab("Dimension 2")

p4 <- ggcells(sce, aes(x = UMAP.1, y = UMAP.2)) +
  geom_point(aes(colour = tissue), size = 0.25) +
  scale_colour_manual(values = tissue_colours) + 
  theme_cowplot(font_size = 8) + 
  xlab("Dimension 1") + 
  ylab("Dimension 2")

p5 <- ggcells(sce, aes(x = UMAP.1, y = UMAP.2)) +
  geom_point(aes(colour = donor), size = 0.25) +
  scale_colour_manual(values = donor_colours) + 
  theme_cowplot(font_size = 8) + 
  xlab("Dimension 1") + 
  ylab("Dimension 2")

p6 <- ggcells(sce, aes(x = UMAP.1, y = UMAP.2)) +
  geom_point(aes(colour = group), size = 0.25) +
  scale_colour_manual(values = group_colours) + 
  theme_cowplot(font_size = 8) + 
  xlab("Dimension 1") + 
  ylab("Dimension 2")

(p1 | p2) / (p3 | p4) / (p5 | p6)
UMAP plot, where each point represents a cell and is coloured according to the legend.

Figure 1: UMAP plot, where each point represents a cell and is coloured according to the legend.

Show code
p1 <- ggcells(sce) + 
  geom_bar(aes(x = cluster_0, fill = cluster_0)) +
  coord_flip() +
  ylab("Number of samples") +
  theme_cowplot(font_size = 8) +
  scale_fill_manual(values = cluster_colours_0) +
  geom_text(stat='count', aes(x = cluster_0, label=..count..), hjust=1.5, size=2)

p2 <- ggcells(sce) + 
  geom_bar(
    aes(x = cluster_0, fill = stage),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = stage_colours) +
  theme_cowplot(font_size = 8)

p3 <- ggcells(sce) + 
  geom_bar(
    aes(x = cluster_0, fill = plate_number),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = plate_number_colours) +
  theme_cowplot(font_size = 8)

p4 <- ggcells(sce) + 
  geom_bar(
    aes(x = cluster_0, fill = tissue),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = tissue_colours) +
  theme_cowplot(font_size = 8)

p5 <- ggcells(sce) + 
  geom_bar(
    aes(x = cluster_0, fill = donor),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = donor_colours) +
  theme_cowplot(font_size = 8)

p6 <- ggcells(sce) + 
  geom_bar(
    aes(x = cluster_0, fill = group),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = group_colours) +
  theme_cowplot(font_size = 8)

(p1 | p2) / (p3 | p4) / (p5 | p6)
Breakdown of clusters by experimental factors.

Figure 2: Breakdown of clusters by experimental factors.

NOTE for the unmerged data:

Data integration

Motivation

Large single-cell RNA sequencing (scRNA-seq) projects usually need to generate data across multiple batches due to logistical constraints. However, the processing of different batches is often subject to uncontrollable differences, e.g., changes in operator, differences in reagent quality. This results in systematic differences in the observed expression in cells from different batches, which we refer to as “batch effects.” Batch effects are problematic as they can be major drivers of heterogeneity in the data, masking the relevant biological differences and complicating interpretation of the results.

Computational correction of these effects is critical for eliminating batch-to-batch variation, allowing data across multiple batches to be combined for common downstream analysis. However, existing methods based on linear models (Ritchie et al. 2015; Leek et al. 2012) assume that the composition of cell populations are either known or the same across batches. To overcome these limitations, bespoke methods have been developed for batch correction of single-cell data (Haghverdi et al. 2018; Butler et al. 2018; Lin et al. 2019) that do not require a priori knowledge about the composition of the population. This allows them to be used in workflows for exploratory analyses of scRNA-seq data where such knowledge is usually unavailable.

We will use the Mutual Nearest Neighbours (MNN) approach of Haghverdi et al. (2018), as implemented in the batchelor package, to perform data integration. The MNN approach does not rely on pre-defined or equal population compositions across batches, only requiring that a subset of the population be shared between batches.

MNN correction

Show code
p1 <- ggcells(sce) +
  geom_bar(aes(x = plate_number, fill = plate_number)) +
  coord_flip() +
  ylab("Number of samples") +
  theme_cowplot(font_size = 8) +
  scale_fill_manual(values = plate_number_colours) +
  geom_text(stat='count', aes(x = plate_number, label=..count..), hjust=1.5, size=2)

p2 <- ggcells(sce) +
  geom_bar(
    aes(x = plate_number, fill = stage),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = stage_colours) +
  theme_cowplot(font_size = 8)

p3 <- ggcells(sce) +
  geom_bar(
    aes(x = plate_number, fill = tissue),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = tissue_colours) +
  theme_cowplot(font_size = 8)

p4 <- ggcells(sce) +
  geom_bar(
    aes(x = plate_number, fill = donor),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = donor_colours) +
  theme_cowplot(font_size = 8)

p5 <- ggcells(sce) +
  geom_bar(
    aes(x = plate_number, fill = group),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = group_colours) +
  theme_cowplot(font_size = 8)

(p1 | p2) / (p3 | p4) / (p5 | plot_spacer()) 
Breakdown of clusters by experimental factors.

Figure 3: Breakdown of clusters by experimental factors.

We treat each plate as a batch and also test to manually provide the merge order.

Show code
library(batchelor)
set.seed(1819)
# manual merge
# NOTE: we tried a merge order (list(list("LCE508", "LCE511", "LCE512", "LCE513", "LCE514"), "LCE509")), which led to significant variance loss; to reduced var loss of LCE509, which contained different proportion of cells from each sample, we decide to merge LCE509 after the merge of LCE503 + LCE504 (as indicated in the auto-merge)
mnn_out <- fastMNN(
  multiBatchNorm(sce, batch = sce$batch),
  batch = sce$batch,
  cos.norm = FALSE,
  d = ncol(reducedDim(sce, "PCA")),
  auto.merge = FALSE,
  merge.order = list(list("LCE513", "LCE514", "LCE509"), list("LCE508", "LCE511", "LCE512")),
  subset.row = hvg)

One useful diagnostic of the MNN algorithm is the proportion of variance within each batch that is lost during MNN correction1. Large proportions of lost variance (\(>10 \%\)) suggest that correction is removing genuine biological heterogeneity. This would occur due to violations of the assumption of orthogonality between the batch effect and the biological subspace (Haghverdi et al. 2018).

The manual merge order we chosen (Table 1) lead to an acceptable loss of biological variance from the dataset.

Show code
tab <- metadata(mnn_out)$merge.info$lost.var
knitr::kable(
  100 * tab,
  digits = 1,
  caption = "Percentage of estimated biological variation lost within each plate at each step of the merge (manual). Ideally, all these values should be small (e.g., < 5%).")
Table 1: Percentage of estimated biological variation lost within each plate at each step of the merge (manual). Ideally, all these values should be small (e.g., < 5%).
LCE508 LCE509 LCE511 LCE512 LCE513 LCE514
3.7 0.0 4.0 0.0 0.0 0.0
3.2 0.0 2.6 4.4 0.0 0.0
0.0 0.0 0.0 0.0 3.0 3.4
0.0 5.3 0.0 0.0 3.2 3.1
5.7 3.3 4.6 4.0 4.1 3.8
Show code
reducedDim(sce, "corrected") <- reducedDim(mnn_out, "corrected")

# generate UMAP
set.seed(1248)
sce <- runUMAP(sce, dimred = "corrected", name = "UMAP_corrected")

# re-clustering after each MNN correction
set.seed(4759)

snn_gr <- buildSNNGraph(sce, use.dimred = "corrected")
clusters <- igraph::cluster_louvain(snn_gr)
sce$cluster <- factor(clusters$membership)
cluster_colours <- setNames(
  scater:::.get_palette("tableau10medium")[seq_len(nlevels(sce$cluster))],
  levels(sce$cluster))
sce$colours$cluster_colours <- cluster_colours[sce$cluster]

Figure 4 shows an overview of comparisons between the unmerged and merged data (manual merge) broken down by different experimental factors.

Show code
p1 <- plotReducedDim(sce, "UMAP", colour_by = "plate_number", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = plate_number_colours, name = "plate_number")
p2 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "plate_number", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = plate_number_colours, name = "plate_number")

p3 <- plotReducedDim(sce, "UMAP", colour_by = "cluster_0", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = cluster_colours_0, name = "cluster_0")
p4 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "cluster", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = cluster_colours, name = "cluster")

p5 <- plotReducedDim(sce, "UMAP", colour_by = "stage", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = stage_colours, name = "stage")
p6 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "stage", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = stage_colours, name = "stage")

p7 <- plotReducedDim(sce, "UMAP", colour_by = "tissue", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = tissue_colours, name = "tissue")
p8 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "tissue", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = tissue_colours, name = "tissue")

p9 <- plotReducedDim(sce, "UMAP", colour_by = "donor", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = donor_colours, name = "donor")
p10 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "donor", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = donor_colours, name = "donor")

p11 <- plotReducedDim(sce, "UMAP", colour_by = "group", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = group_colours, name = "group")
p12 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "group", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = group_colours, name = "group")

p1 + p2 + p3 + p4 + 
  p5 + p6 + p7 + p8 + 
  p9 + p10 + p11 + p12 + 
  # plot_layout(ncol = 2, guides = "collect")
  plot_layout(ncol = 2)
Comparison between batch-uncorrected data (left column) and -corrected data by manual merge orders (right column).

Figure 4: Comparison between batch-uncorrected data (left column) and -corrected data by manual merge orders (right column).

To get an insight, here we focus on the uncorrected and the corrected data by the manual merge, then break down the UMAP plot above further by plate_number (Figure 5) and stage (Figure 6).

From the perspective of correcting the batch effect, the MNN correction can effectively alleviate the plate-specific grouping of cells (Figure 5).

From the perspective of preserving the biological-feature-of-interest, after the batch correction, we can still observe: the “shifting” of the transcriptomic stages of cells P6 to P7, then P8 (Figure 6)

Show code
umap_df <- makePerCellDF(sce)
bg <- dplyr::select(umap_df, -plate_number)

plot_grid(

ggplot(aes(x = UMAP.1, y = UMAP.2), data = umap_df) +
  geom_point(data = bg, colour = scales::alpha("grey", 0.5), size = 0.125) +
  geom_point(aes(colour = plate_number), alpha = 1, size = 0.5) +
  scale_fill_manual(values = plate_number_colours, name = "plate_number") +
  scale_colour_manual(values = plate_number_colours, name = "plate_number") +
  theme_cowplot(font_size = 10) +
  xlab("UMAP 1") +
  ylab("UMAP 2") +
  facet_wrap(~plate_number, ncol = 3) +
  guides(colour = guide_legend(override.aes = list(size = 2, alpha = 1))) + 
  guides(colour = FALSE),

ggplot(aes(x = UMAP_corrected.1, y = UMAP_corrected.2), data = umap_df) +
  geom_point(data = bg, colour = scales::alpha("grey", 0.5), size = 0.125) +
  geom_point(aes(colour = plate_number), alpha = 1, size = 0.5) +
  scale_fill_manual(values = plate_number_colours, name = "plate_number") +
  scale_colour_manual(values = plate_number_colours, name = "plate_number") +
  theme_cowplot(font_size = 10) +
  xlab("UMAP 1") +
  ylab("UMAP 2") +
  facet_wrap(~plate_number, ncol = 3) +
  guides(colour = guide_legend(override.aes = list(size = 2, alpha = 1))) + 
  guides(colour = FALSE),

ncol= 2,
align ="h"
)
UMAP plot of the dataset. Each point represents a cell and each panel highlights cells from a particular `plate_number` when data is unmerged (left) and merged by manual merge 2 (right).

Figure 5: UMAP plot of the dataset. Each point represents a cell and each panel highlights cells from a particular plate_number when data is unmerged (left) and merged by manual merge 2 (right).

Show code
umap_df <- makePerCellDF(sce)
bg <- dplyr::select(umap_df, -stage)

plot_grid(

ggplot(aes(x = UMAP.1, y = UMAP.2), data = umap_df) +
  geom_point(data = bg, colour = scales::alpha("grey", 0.5), size = 0.125) +
  geom_point(aes(colour = stage), alpha = 1, size = 0.5) +
  scale_fill_manual(
    values = stage_colours,
    name = "stage") +
  scale_colour_manual(
    values = stage_colours,
    name = "stage") +
  theme_cowplot(font_size = 10) +
  xlab("UMAP 1") +
  ylab("UMAP 2") +
  facet_wrap(~stage, ncol = 3) +
  guides(colour = guide_legend(override.aes = list(size = 2, alpha = 1))) + 
  guides(colour = FALSE),

ggplot(aes(x = UMAP_corrected.1, y = UMAP_corrected.2), data = umap_df) +
  geom_point(data = bg, colour = scales::alpha("grey", 0.5), size = 0.125) +
  geom_point(aes(colour = stage), alpha = 1, size = 0.5) +
  scale_fill_manual(
    values = stage_colours,
    name = "stage") +
  scale_colour_manual(
    values = stage_colours,
    name = "stage") +
  theme_cowplot(font_size = 10) +
  xlab("UMAP 1") +
  ylab("UMAP 2") +
  facet_wrap(~stage, ncol = 3) +
  guides(colour = guide_legend(override.aes = list(size = 2, alpha = 1))) + 
  guides(colour = FALSE),

ncol= 2,
align ="h"
)
UMAP plot of the dataset. Each point represents a cell and each panel highlights cells from a particular `stage` when data is unmerged (left) and merged by manual merge 2 (right).

Figure 6: UMAP plot of the dataset. Each point represents a cell and each panel highlights cells from a particular stage when data is unmerged (left) and merged by manual merge 2 (right).

Taken altogether, we conclude that the data corrected by the manual merge as the best possible merge order for data integration of this dataset and proceed.

To sumup, there are 3 clusters detected, broken down by experimental factors in Figure 7 and shown on the UMAP plot Figure 8.

Show code
p1 <- ggcells(sce) + 
  geom_bar(aes(x = cluster, fill = cluster)) +
  coord_flip() +
  ylab("Number of samples") +
  theme_cowplot(font_size = 8) +
  scale_fill_manual(values = cluster_colours) +
  geom_text(stat='count', aes(x = cluster, label=..count..), hjust=1.5, size=2)

p2 <- ggcells(sce) + 
  geom_bar(
    aes(x = cluster, fill = stage),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = stage_colours) +
  theme_cowplot(font_size = 8)

p3 <- ggcells(sce) + 
  geom_bar(
    aes(x = cluster, fill = plate_number),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = plate_number_colours) +
  theme_cowplot(font_size = 8)

p4 <- ggcells(sce) + 
  geom_bar(
    aes(x = cluster, fill = tissue),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = tissue_colours) +
  theme_cowplot(font_size = 8)

p5 <- ggcells(sce) + 
  geom_bar(
    aes(x = cluster, fill = donor),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = donor_colours) +
  theme_cowplot(font_size = 8)

p6 <- ggcells(sce) + 
  geom_bar(
    aes(x = cluster, fill = group),
    position = position_fill(reverse = TRUE)) +
  coord_flip() +
  ylab("Frequency") +
  scale_fill_manual(values = group_colours) +
  theme_cowplot(font_size = 8)

(p1 | p2) / (p3 | p4) / (p5 | p6) 
Breakdown of clusters by experimental factors.

Figure 7: Breakdown of clusters by experimental factors.

Show code
p1 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "cluster", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = cluster_colours, name = "cluster")

p2 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "stage", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = stage_colours, name = "stage")

p3 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "plate_number", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = plate_number_colours, name = "plate_number")

p4 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "tissue", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = tissue_colours, name = "tissue")

p5 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "donor", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = donor_colours, name = "donor")

p6 <- plotReducedDim(sce, "UMAP_corrected", colour_by = "group", theme_size = 7, point_size = 0.2) + 
  scale_colour_manual(values = group_colours, name = "group")

(p1 | p2) / (p3 | p4) / (p5 | p6) 
UMAP plot, where each point represents a cell and is coloured according to the legend.

Figure 8: UMAP plot, where each point represents a cell and is coloured according to the legend.

Figure 9 overlays the index sorting data on the UMAP plots.

Show code
tmp <- sce
colData(tmp) <- cbind(colData(tmp), t(assays(altExp(tmp, "FACS"))$raw))

facs_markers <- grep("^F|^S|^V|^B|^R|^V", colnames(colData(tmp)), value = TRUE)

corrected_umap <- cbind(
  data.frame(
    UMAP.1 = reducedDim(tmp, "UMAP_corrected")[, 1],
    UMAP.2 = reducedDim(tmp, "UMAP_corrected")[, 2]),
  as.data.frame(colData(tmp)))

p <- lapply(facs_markers, function(m) {
  p1 <- ggplot(aes(x = UMAP.1, y = UMAP.2), data = corrected_umap) +
    geom_point(aes_string(colour = m, fill = m), alpha = 1, size = 1) +
    scale_fill_viridis_c(trans = ifelse(
      grepl("^FSC|^SSC", m), 
      "identity",
      scales::pseudo_log_trans(sigma = 150 / 2)),
      name = "") +
    scale_colour_viridis_c(trans = ifelse(
      grepl("^FSC|^SSC", m), 
      "identity",
      scales::pseudo_log_trans(sigma = 150 / 2)),
      name = "") +
    theme_cowplot(font_size = 10) + 
    xlab("Dimension 1") + 
    ylab("Dimension 2") + 
    ggtitle(ifelse(grepl("^FSC|^SSC", m), "Raw", "Pseudo-logged"), m)
  
  corrected_umap$scaled_rank <- corrected_umap %>%
    dplyr::group_by(plate_number) %>%
    dplyr::mutate(
      rank = rank(!!sym(m), na.last = "keep"),
      max_rank = max(rank, na.rm = TRUE),
      scaled_rank = rank / max_rank) %>%
    dplyr::pull(scaled_rank)
  
  p2 <- ggplot(aes(x = UMAP.1, y = UMAP.2), data = corrected_umap) +
    geom_point(aes(colour = scaled_rank, fill = scaled_rank), alpha = 1, size = 1) +
    scale_fill_viridis_c(name = "") +
    scale_colour_viridis_c(name = "") +
    theme_cowplot(font_size = 10) + 
    xlab("Dimension 1") + 
    ylab("Dimension 2") + 
    ggtitle("Scaled rank", m)
  list(p1, p2)
})
multiplot(
  plotlist = unlist(p, recursive = FALSE),
  layout = matrix(seq_len(length(p) * 2), ncol = 2, byrow = TRUE))
Overlay of index sorting data on UMAP plot. For each marker, the left-hand plot shows the 'raw' or 'pseudo-logged' fluorescence intensity and the right-side plots the 'scaled rank' of the raw intensity. The pseudo-log transformation is a transformation mapping numbers to a signed logarithmic scale with a smooth transition to linear scale around 0. This transformation is commonly used when plotting fluorescence intensities from FACS. The scaled rank is applied within each mouse and assigns the maximum fluorescence intensity a value of one and the minimum fluorescence intensities a value of zero. It can be thought of as a crude normalization of the FACS data that allows us to compare fluorescence intensities from different mice.

Figure 9: Overlay of index sorting data on UMAP plot. For each marker, the left-hand plot shows the ‘raw’ or ‘pseudo-logged’ fluorescence intensity and the right-side plots the ‘scaled rank’ of the raw intensity. The pseudo-log transformation is a transformation mapping numbers to a signed logarithmic scale with a smooth transition to linear scale around 0. This transformation is commonly used when plotting fluorescence intensities from FACS. The scaled rank is applied within each mouse and assigns the maximum fluorescence intensity a value of one and the minimum fluorescence intensities a value of zero. It can be thought of as a crude normalization of the FACS data that allows us to compare fluorescence intensities from different mice.

Concluding remarks

Show code
saveRDS(
  sce,
  here("data", "SCEs", "C094_Pellicci.single-cell.merged.thymus_only.SCE.rds"),
  compress = "xz")

The processed SingleCellExperiment object is available (see data/SCEs/C094_Pellicci.single-cell.merged.thymus_only.SCE.rds). This will be used in downstream analyses, e.g., identifying cluster marker genes and refining the cell labels.

Additional information

The following are available on request:

Session info

The analysis and this document were prepared using the following software (click triangle to expand)
Show code
sessioninfo::session_info()
─ Session info ─────────────────────────────────────────────────────
 setting  value                       
 version  R version 4.0.3 (2020-10-10)
 os       CentOS Linux 7 (Core)       
 system   x86_64, linux-gnu           
 ui       X11                         
 language (EN)                        
 collate  en_US.UTF-8                 
 ctype    en_US.UTF-8                 
 tz       Australia/Melbourne         
 date     2021-09-29                  

─ Packages ─────────────────────────────────────────────────────────
 ! package              * version  date       lib source        
 P annotate               1.68.0   2020-10-27 [?] Bioconductor  
 P AnnotationDbi          1.52.0   2020-10-27 [?] Bioconductor  
 P assertthat             0.2.1    2019-03-21 [?] CRAN (R 4.0.0)
 P batchelor            * 1.6.3    2021-04-16 [?] Bioconductor  
 P beachmat               2.6.4    2020-12-20 [?] Bioconductor  
 P beeswarm               0.3.1    2021-03-07 [?] CRAN (R 4.0.3)
 P Biobase              * 2.50.0   2020-10-27 [?] Bioconductor  
 P BiocGenerics         * 0.36.0   2020-10-27 [?] Bioconductor  
 P BiocManager            1.30.12  2021-03-28 [?] CRAN (R 4.0.3)
 P BiocNeighbors          1.8.2    2020-12-07 [?] Bioconductor  
 P BiocParallel         * 1.24.1   2020-11-06 [?] Bioconductor  
 P BiocSingular           1.6.0    2020-10-27 [?] Bioconductor  
 P BiocStyle              2.18.1   2020-11-24 [?] Bioconductor  
 P bit                    4.0.4    2020-08-04 [?] CRAN (R 4.0.0)
 P bit64                  4.0.5    2020-08-30 [?] CRAN (R 4.0.0)
 P bitops                 1.0-6    2013-08-17 [?] CRAN (R 4.0.0)
 P blob                   1.2.1    2020-01-20 [?] CRAN (R 4.0.0)
 P bluster                1.0.0    2020-10-27 [?] Bioconductor  
 P bslib                  0.2.4    2021-01-25 [?] CRAN (R 4.0.3)
 P cachem                 1.0.4    2021-02-13 [?] CRAN (R 4.0.3)
 P cli                    2.4.0    2021-04-05 [?] CRAN (R 4.0.3)
 P colorspace             2.0-0    2020-11-11 [?] CRAN (R 4.0.3)
 P cowplot              * 1.1.1    2020-12-30 [?] CRAN (R 4.0.3)
 P crayon                 1.4.1    2021-02-08 [?] CRAN (R 4.0.3)
 P DBI                    1.1.1    2021-01-15 [?] CRAN (R 4.0.3)
 P DelayedArray           0.16.3   2021-03-24 [?] Bioconductor  
 P DelayedMatrixStats     1.12.3   2021-02-03 [?] Bioconductor  
 P DESeq2                 1.30.1   2021-02-19 [?] Bioconductor  
 P digest                 0.6.27   2020-10-24 [?] CRAN (R 4.0.2)
 P distill              * 1.2      2021-01-13 [?] CRAN (R 4.0.3)
 P downlit                0.2.1    2020-11-04 [?] CRAN (R 4.0.3)
 P dplyr                  1.0.5    2021-03-05 [?] CRAN (R 4.0.3)
 P dqrng                  0.2.1    2019-05-17 [?] CRAN (R 4.0.0)
 P edgeR                * 3.32.1   2021-01-14 [?] Bioconductor  
 P ellipsis               0.3.1    2020-05-15 [?] CRAN (R 4.0.0)
 P evaluate               0.14     2019-05-28 [?] CRAN (R 4.0.0)
 P fansi                  0.4.2    2021-01-15 [?] CRAN (R 4.0.3)
 P farver                 2.1.0    2021-02-28 [?] CRAN (R 4.0.3)
 P fastmap                1.1.0    2021-01-25 [?] CRAN (R 4.0.3)
 P FNN                    1.1.3    2019-02-15 [?] CRAN (R 4.0.0)
 P genefilter             1.72.1   2021-01-21 [?] Bioconductor  
 P geneplotter            1.68.0   2020-10-27 [?] Bioconductor  
 P generics               0.1.0    2020-10-31 [?] CRAN (R 4.0.3)
 P GenomeInfoDb         * 1.26.4   2021-03-10 [?] Bioconductor  
 P GenomeInfoDbData       1.2.4    2020-10-20 [?] Bioconductor  
 P GenomicRanges        * 1.42.0   2020-10-27 [?] Bioconductor  
 P ggbeeswarm             0.6.0    2017-08-07 [?] CRAN (R 4.0.0)
 P ggplot2              * 3.3.3    2020-12-30 [?] CRAN (R 4.0.3)
 P Glimma               * 2.0.0    2020-10-27 [?] Bioconductor  
 P glue                   1.4.2    2020-08-27 [?] CRAN (R 4.0.0)
 P gridExtra              2.3      2017-09-09 [?] CRAN (R 4.0.0)
 P gtable                 0.3.0    2019-03-25 [?] CRAN (R 4.0.0)
 P here                 * 1.0.1    2020-12-13 [?] CRAN (R 4.0.3)
 P highr                  0.9      2021-04-16 [?] CRAN (R 4.0.3)
 P htmltools              0.5.1.1  2021-01-22 [?] CRAN (R 4.0.3)
 P htmlwidgets            1.5.3    2020-12-10 [?] CRAN (R 4.0.3)
 P httr                   1.4.2    2020-07-20 [?] CRAN (R 4.0.0)
 P igraph                 1.2.6    2020-10-06 [?] CRAN (R 4.0.2)
 P IRanges              * 2.24.1   2020-12-12 [?] Bioconductor  
 P irlba                  2.3.3    2019-02-05 [?] CRAN (R 4.0.0)
 P janitor              * 2.1.0    2021-01-05 [?] CRAN (R 4.0.3)
 P jquerylib              0.1.3    2020-12-17 [?] CRAN (R 4.0.3)
 P jsonlite               1.7.2    2020-12-09 [?] CRAN (R 4.0.3)
 P knitr                  1.33     2021-04-24 [?] CRAN (R 4.0.3)
 P labeling               0.4.2    2020-10-20 [?] CRAN (R 4.0.0)
 P lattice                0.20-41  2020-04-02 [3] CRAN (R 4.0.3)
 P lifecycle              1.0.0    2021-02-15 [?] CRAN (R 4.0.3)
 P limma                * 3.46.0   2020-10-27 [?] Bioconductor  
 P locfit                 1.5-9.4  2020-03-25 [?] CRAN (R 4.0.0)
 P lubridate              1.7.10   2021-02-26 [?] CRAN (R 4.0.3)
 P magrittr               2.0.1    2020-11-17 [?] CRAN (R 4.0.3)
 P Matrix                 1.2-18   2019-11-27 [3] CRAN (R 4.0.3)
 P MatrixGenerics       * 1.2.1    2021-01-30 [?] Bioconductor  
 P matrixStats          * 0.58.0   2021-01-29 [?] CRAN (R 4.0.3)
 P memoise                2.0.0    2021-01-26 [?] CRAN (R 4.0.3)
 P msigdbr              * 7.2.1    2020-10-02 [?] CRAN (R 4.0.2)
 P munsell                0.5.0    2018-06-12 [?] CRAN (R 4.0.0)
 P patchwork            * 1.1.1    2020-12-17 [?] CRAN (R 4.0.3)
 P pheatmap             * 1.0.12   2019-01-04 [?] CRAN (R 4.0.0)
 P pillar                 1.5.1    2021-03-05 [?] CRAN (R 4.0.3)
 P pkgconfig              2.0.3    2019-09-22 [?] CRAN (R 4.0.0)
 P Polychrome             1.2.6    2020-11-11 [?] CRAN (R 4.0.3)
 P purrr                  0.3.4    2020-04-17 [?] CRAN (R 4.0.0)
 P R6                     2.5.0    2020-10-28 [?] CRAN (R 4.0.2)
 P RColorBrewer           1.1-2    2014-12-07 [?] CRAN (R 4.0.0)
 P Rcpp                   1.0.6    2021-01-15 [?] CRAN (R 4.0.3)
 P RCurl                  1.98-1.3 2021-03-16 [?] CRAN (R 4.0.3)
 P ResidualMatrix         1.0.0    2020-10-27 [?] Bioconductor  
 P rlang                  0.4.11   2021-04-30 [?] CRAN (R 4.0.5)
 P rmarkdown              2.7      2021-02-19 [?] CRAN (R 4.0.3)
 P rprojroot              2.0.2    2020-11-15 [?] CRAN (R 4.0.3)
 P RSpectra               0.16-0   2019-12-01 [?] CRAN (R 4.0.0)
 P RSQLite                2.2.5    2021-03-27 [?] CRAN (R 4.0.3)
 P rsvd                   1.0.3    2020-02-17 [?] CRAN (R 4.0.0)
 P S4Vectors            * 0.28.1   2020-12-09 [?] Bioconductor  
 P sass                   0.3.1    2021-01-24 [?] CRAN (R 4.0.3)
 P scales                 1.1.1    2020-05-11 [?] CRAN (R 4.0.0)
 P scater               * 1.18.6   2021-02-26 [?] Bioconductor  
 P scatterplot3d          0.3-41   2018-03-14 [?] CRAN (R 4.0.0)
 P scran                * 1.18.5   2021-02-04 [?] Bioconductor  
 P scuttle                1.0.4    2020-12-17 [?] Bioconductor  
 P sessioninfo            1.1.1    2018-11-05 [?] CRAN (R 4.0.0)
 P SingleCellExperiment * 1.12.0   2020-10-27 [?] Bioconductor  
 P snakecase              0.11.0   2019-05-25 [?] CRAN (R 4.0.0)
 P sparseMatrixStats      1.2.1    2021-02-02 [?] Bioconductor  
 P statmod                1.4.35   2020-10-19 [?] CRAN (R 4.0.2)
 P stringi                1.7.3    2021-07-16 [?] CRAN (R 4.0.3)
 P stringr                1.4.0    2019-02-10 [?] CRAN (R 4.0.0)
 P SummarizedExperiment * 1.20.0   2020-10-27 [?] Bioconductor  
 P survival               3.2-7    2020-09-28 [3] CRAN (R 4.0.3)
 P tibble                 3.1.0    2021-02-25 [?] CRAN (R 4.0.3)
 P tidyselect             1.1.0    2020-05-11 [?] CRAN (R 4.0.0)
 P utf8                   1.2.1    2021-03-12 [?] CRAN (R 4.0.3)
 P uwot                   0.1.10   2020-12-15 [?] CRAN (R 4.0.3)
 P vctrs                  0.3.7    2021-03-29 [?] CRAN (R 4.0.3)
 P vipor                  0.4.5    2017-03-22 [?] CRAN (R 4.0.0)
 P viridis                0.5.1    2018-03-29 [?] CRAN (R 4.0.0)
 P viridisLite            0.3.0    2018-02-01 [?] CRAN (R 4.0.0)
 P withr                  2.4.1    2021-01-26 [?] CRAN (R 4.0.3)
 P xfun                   0.24     2021-06-15 [?] CRAN (R 4.0.3)
 P XML                    3.99-0.6 2021-03-16 [?] CRAN (R 4.0.3)
 P xtable                 1.8-4    2019-04-21 [?] CRAN (R 4.0.0)
 P XVector                0.30.0   2020-10-27 [?] Bioconductor  
 P yaml                   2.2.1    2020-02-01 [?] CRAN (R 4.0.0)
 P zlibbioc               1.36.0   2020-10-27 [?] Bioconductor  

[1] /stornext/Projects/score/Analyses/C094_Pellicci/renv/library/R-4.0/x86_64-pc-linux-gnu
[2] /tmp/RtmpP41dVI/renv-system-library
[3] /stornext/System/data/apps/R/R-4.0.3/lib64/R/library

 P ── Loaded and on-disk path mismatch.
Butler, A., P. Hoffman, P. Smibert, E. Papalexi, and R. Satija. 2018. “Integrating Single-Cell Transcriptomic Data Across Different Conditions, Technologies, and Species.” Nat. Biotechnol. 36 (5): 411–20.
Haghverdi, L., A. T. L. Lun, M. D. Morgan, and J. C. Marioni. 2018. “Batch Effects in Single-Cell RNA-Sequencing Data Are Corrected by Matching Mutual Nearest Neighbors.” Nat. Biotechnol., April.
Leek, J. T., W. E. Johnson, H. S. Parker, A. E. Jaffe, and J. D. Storey. 2012. “The Sva Package for Removing Batch Effects and Other Unwanted Variation in High-Throughput Experiments.” Bioinformatics 28 (6): 882–83.
Lin, Y., S. Ghazanfar, K. Y. X. Wang, J. A. Gagnon-Bartsch, K. K. Lo, X. Su, Z. G. Han, et al. 2019. “scMerge Leverages Factor Analysis, Stable Expression, and Pseudoreplication to Merge Multiple Single-Cell RNA-Seq Datasets.” Proc. Natl. Acad. Sci. U.S.A. 116 (20): 9775–84.
Ritchie, M. E., B. Phipson, D. Wu, Y. Hu, C. W. Law, W. Shi, and G. K. Smyth. 2015. limma powers differential expression analyses for RNA-sequencing and microarray studies.” Nucleic Acids Res. 43 (7): e47.

  1. Specifically, this refers to the within-batch variance that is removed during orthogonalization with respect to the average correction vector at each merge step.↩︎

References