Batch integration

Removing batch effects while preserving biological variation

6 datasets · 19 methods · 7 control methods · 13 metrics

Info

Task info Method info Metric info Dataset info Results

As single-cell technologies advance, single-cell datasets are growing both in size and complexity. Especially in consortia such as the Human Cell Atlas, individual studies combine data from multiple labs, each sequencing multiple individuals possibly with different technologies. This gives rise to complex batch effects in the data that must be computationally removed to perform a joint analysis. These batch integration methods must remove the batch effect while not removing relevant biological information. Currently, over 200 tools exist that aim to remove batch effects from scRNA-seq datasets (Zappia, Phipson, and Oshlack 2018). These methods balance the removal of batch effects with the conservation of nuanced biological information in different ways. This abundance of tools has complicated batch integration method choice, leading to several benchmarks on this topic (Luecken et al. 2021a; Tran et al. 2020; Chazarra-Gil et al. 2021; Mereu et al. 2020). Yet, benchmarks use different metrics, method implementations and datasets. Here we build a living benchmarking task for batch integration methods with the vision of improving the consistency of method evaluation.

In this task we evaluate batch integration methods on their ability to remove batch effects in the data while conserving variation attributed to biological effects. As input, methods require either normalised or unnormalised data with multiple batches and consistent cell type labels. The batch integrated output can be a feature matrix, a low dimensional embedding and/or a neighbourhood graph. The respective batch-integrated representation is then evaluated using sets of metrics that capture how well batch effects are removed and whether biological variance is conserved. We have based this particular task on the latest, and most extensive benchmark of single-cell data integration methods.

Summary

/**
 * Average the `score` fields of a list of result records.
 *
 * A score that is undefined or NaN counts as 0; every other score is clamped
 * to the [0, 1] interval before the values are handed to d3.mean.
 *
 * @param {Array<Object>} obj - records carrying a `score` field
 * @returns {*} the result of d3.mean over the cleaned scores
 */
function aggregate_scores(obj) {
  const cleaned = obj.map(({score}) => {
    const is_missing = score === undefined || isNaN(score);
    return is_missing ? 0 : Math.min(1, Math.max(0, score));
  });
  return d3.mean(cleaned);
}

/**
 * Turn a list of row objects into one object of column arrays.
 *
 * The columns are the union of the keys of all rows, in first-seen order, so a
 * key that is missing from the first row is no longer dropped. Rows that lack
 * a key contribute `undefined` at their position. An empty list yields `{}`
 * instead of throwing.
 *
 * @param {Array<Object>} list - row objects
 * @returns {Object} object mapping each key to the array of per-row values
 */
function transpose_list_of_objects(list) {
  // A Set keeps insertion order, so columns follow the order in which keys first appear.
  const keys = new Set(list.flatMap(row => Object.keys(row)));
  return Object.fromEntries([...keys].map(key => [key, list.map(row => row[key])]));
}

/**
 * Format a duration given in seconds as a short label such as "42s" or "3h".
 *
 * Rules are tried in order and the first whose upper bound exceeds `time`
 * wins. A value that is below none of the bounds is labelled ">7d"; this
 * includes NaN and undefined, because every `<` comparison with them is false.
 *
 * @param {number} time - duration in seconds
 * @returns {string} the label
 */
function label_time(time) {
  const rules = [
    [1e-5, () => "0s"],
    [1, () => "<1s"],
    [60, t => `${Math.floor(t)}s`],
    [3600, t => `${Math.floor(t / 60)}m`],
    [3600 * 24, t => `${Math.floor(t / 3600)}h`],
    [3600 * 24 * 7, t => `${Math.floor(t / 3600 / 24)}d`]
  ];
  const match = rules.find(([upper_bound]) => time < upper_bound);
  return match ? match[1](time) : ">7d";
}

/**
 * Format an amount given in megabytes as a short label such as "512M" or "2G".
 *
 * With `include_mb` set to false, anything below 1000 MB is reported as "<1G".
 * A value below none of the bounds (including NaN and undefined, for which
 * every `<` comparison is false) is labelled ">1P".
 *
 * @param {number} x_mb - amount in megabytes
 * @param {boolean} [include_mb=true] - whether to show sub-gigabyte amounts in MB
 * @returns {string} the label
 */
function label_memory(x_mb, include_mb = true) {
  const below_gigabyte = x_mb < 1e3;
  if (below_gigabyte && !include_mb) return "<1G";
  if (x_mb < 1) return "<1M";
  if (below_gigabyte) return `${Math.round(x_mb)}M`;

  const larger_units = [
    [1e6, 1e3, "G"],
    [1e9, 1e6, "T"]
  ];
  for (const [upper_bound, divisor, suffix] of larger_units) {
    if (x_mb < upper_bound) return `${Math.round(x_mb / divisor)}${suffix}`;
  }
  return ">1P";
}

/**
 * Mean of the entries of `x` that are not NaN (as judged by the global isNaN,
 * so undefined entries are dropped as well).
 *
 * @param {Array} x - values to average
 * @returns {*} the result of d3.mean over the kept entries
 */
function mean_na_rm(x) {
  const is_present = value => !isNaN(value);
  const present = x.filter(is_present);
  return d3.mean(present);
}

// Ids that are listed in the info tables and also occur in `results`,
// kept in the order of the info tables.
poss_dataset_ids = dataset_info
  .filter(({dataset_id}) => results.map(r => r.dataset_id).includes(dataset_id))
  .map(({dataset_id}) => dataset_id)
poss_method_ids = method_info
  .filter(({method_id}) => results.map(r => r.method_id).includes(method_id))
  .map(({method_id}) => method_id)
// A metric counts as present when it is a key of any result's scaled_scores.
poss_metric_ids = metric_info
  .filter(({metric_id}) => results.flatMap(r => Object.keys(r.scaled_scores)).includes(metric_id))
  .map(({metric_id}) => metric_id)

// Whether the results carry the optional `resources` / `exit_codes` sections.
// Only the first result is inspected; an empty results table has neither
// section (previously results[0] was undefined and the lookup threw).
has_resources = results.length > 0 && results[0].hasOwnProperty("resources")
has_exit_codes = results.length > 0 && results[0].hasOwnProperty("exit_codes")

// Long-format scores: one record per (method, dataset, metric) taken from each
// result's scaled_scores, restricted to the currently selected ids.
results_long = results
  .flatMap(({method_id, dataset_id, scaled_scores}) =>
    Object.entries(scaled_scores).map(([metric_id, score]) => ({method_id, dataset_id, metric_id, score}))
  )
  .filter(row =>
    method_ids.includes(row.method_id) &&
    metric_ids.includes(row.metric_id) &&
    dataset_ids.includes(row.dataset_id)
  )

// Overall score per method: aggregate_scores over all of the method's rows in results_long.
overall = d3.groups(results_long, row => row.method_id)
  .map(group => {
    const [method_id, rows] = group
    return {method_id, mean_score: aggregate_scores(rows)}
  })

// Per method, one aggregated score per dataset, stored under "dataset_<dataset_id>".
per_dataset = d3.groups(results_long, row => row.method_id)
  .map(([method_id, method_rows]) => {
    const dataset_scores = Object.fromEntries(
      d3.groups(method_rows, row => row.dataset_id)
        .map(([dataset_id, rows]) => ["dataset_" + dataset_id, aggregate_scores(rows)])
    )
    return {method_id, ...dataset_scores}
  })

// Per method, one aggregated score per metric, stored under "metric_<metric_id>".
per_metric = d3.groups(results_long, row => row.method_id)
  .map(([method_id, method_rows]) => {
    const metric_scores = Object.fromEntries(
      d3.groups(method_rows, row => row.metric_id)
        .map(([metric_id, rows]) => ["metric_" + metric_id, aggregate_scores(rows)])
    )
    return {method_id, ...metric_scores}
  })

results_resources = {
  let results_resources = null

  if (has_resources) {
    results_resources = results.flatMap(d => {
      return ({
        method_id: d.method_id,
        dataset_id: d.dataset_id,
        ...d.resources
      })
    }).filter(d => method_ids.includes(d.method_id) && dataset_ids.includes(d.dataset_id))
  }

  return results_resources
}

resources = {
  let resources = null

  if (has_resources) {
    resources = d3.groups(results_resources, d => d.method_id)
      .map(([method_id, values]) => {
        const mean_peak_memory_mb = mean_na_rm(values.map(d => d.peak_memory_mb))
        const mean_disk_read_mb = mean_na_rm(values.map(d => d.disk_read_mb))
        const mean_disk_write_mb = mean_na_rm(values.map(d => d.disk_write_mb))
        const mean_duration_sec = mean_na_rm(values.map(d => d.duration_sec))

        return ({
          method_id,
          mean_cpu_pct: mean_na_rm(values.map(d => d.cpu_pct)),
          mean_peak_memory_mb,
          mean_peak_memory_log: -Math.log10(mean_peak_memory_mb),
          mean_peak_memory_str: " " + label_memory(mean_peak_memory_mb) + " ",
          mean_disk_read_mb: mean_na_rm(values.map(d => d.disk_read_mb)),
          mean_disk_read_log: -Math.log10(mean_disk_read_mb),
          mean_disk_read_str: " " + label_memory(mean_disk_read_mb) + " ",
          mean_disk_write_mb: mean_na_rm(values.map(d => d.disk_write_mb)),
          mean_disk_write_log: -Math.log10(mean_disk_write_mb),
          mean_disk_write_str: " " + label_memory(mean_disk_write_mb) + " ",
          mean_duration_sec,
          mean_duration_log: -Math.log10(mean_duration_sec),
          mean_duration_str: " " + label_time(mean_duration_sec) + " "
        })
      })
  }

  return resources
}

exit_codes = {
  let exit_codes = null

  if (has_exit_codes) {
    exit_codes = results.flatMap(d => {
      return ({
        method_id: d.method_id,
        dataset_id: d.dataset_id,
        exit_codes: Object.values(d.exit_codes)
      })
    }).filter(d => method_ids.includes(d.method_id) && dataset_ids.includes(d.dataset_id))
  } else {
    exit_codes = results_resources.flatMap(d => {
      let exit_code = d.exit_code
      if (exit_code === undefined) {
        // If there is not exit code, assume the method ran successfully
        exit_code = 0
      }

      return ({
        method_id: d.method_id,
        dataset_id: d.dataset_id,
        exit_codes: [exit_code]
      })
    }).filter(d => method_ids.includes(d.method_id) && dataset_ids.includes(d.dataset_id))
  }

  return exit_codes
}

// Per method, the fraction of exit codes falling into each category. The
// fractions are listed in the same order as palettes.error_reason.names.
error_reasons = d3.groups(exit_codes, d => d.method_id)
  .map(([method_id, values]) => {
    const all_codes = values.flatMap(d => d.exit_codes)

    if (all_codes.length === 0) {
      return {method_id, error_reason: []}
    }

    // Positive codes with their own category: 137 (memory), 143 (time), 99 (not applicable).
    const dedicated_codes = [137, 143, 99]
    // d3.mean over a boolean accessor gives the fraction of codes matching it.
    const fraction = predicate => d3.mean(all_codes, predicate)

    const error_pct_oom = fraction(d => d === 137)
    const error_pct_timeout = fraction(d => d === 143)
    const error_pct_na = fraction(d => d === 99)
    // Counted directly rather than as "positive minus the dedicated ones":
    // the subtraction could produce a tiny negative fraction through
    // floating-point rounding (e.g. 0.3 - 0.1 - 0.2).
    const error_pct_error = fraction(d => d > 0 && !dedicated_codes.includes(d))
    const error_pct_unknown = fraction(d => d < 0)
    const error_pct_ok = fraction(d => d === 0)
    return ({
      method_id,
      error_reason: [
        error_pct_oom,
        error_pct_timeout,
        error_pct_error,
        error_pct_unknown,
        error_pct_na,
        error_pct_ok
      ],
    })
  })

// One summary row per displayed method: name, overall score, per-dataset and
// per-metric scores, error fractions and (when available) resource usage,
// sorted by descending overall score.
summary_all = method_info
  .filter(d => show_con || !d.is_baseline)
  .filter(d => method_ids.includes(d.method_id))
  // A selected method has no entry in `overall` when the current dataset and
  // metric filters leave it without any score; skip it instead of failing on
  // the lookup below.
  .filter(d => overall.some(o => o.method_id === d.method_id))
  .map(method => {
    const method_id = method.method_id
    const method_name = method.method_name
    const mean_score = overall.find(d => d.method_id === method_id).mean_score
    const datasets = per_dataset.find(d => d.method_id === method_id)
    const metrics = per_metric.find(d => d.method_id === method_id)
    const error_reasons_ = error_reasons.find(d => d.method_id === method_id)

    // Spreading an undefined lookup result adds no keys.
    let summary = {
      method_id,
      method_name,
      mean_score,
      ...datasets,
      ...metrics,
      ...error_reasons_
    }

    if (has_resources) {
      const resources_ = resources.find(d => d.method_id === method_id)
      summary = {...summary, ...resources_}
    }
    return summary
  })
  .sort((a, b) => b.mean_score - a.mean_score)

// make sure the first entry contains all columns
column_info = {
  let column_info = [
    {
      id: "method_name",
      name: "Name",
      label: null,
      group: "method",
      geom: "text",
      palette: null
    },
    {
      id: "mean_score",
      name: "Score",
      group: "overall",
      geom: "bar",
      palette: "overall"
    },
    {
      id: "error_reason",
      name: "Error reason",
      group: "overall",
      geom: "pie",
      palette: "error_reason"
    },
    ...dataset_info
      .filter(d => dataset_ids.includes(d.dataset_id))
      .map(
        d => ({
          id: "dataset_" + d.dataset_id,
          name: d.dataset_name,
          group: "dataset",
          geom: "funkyrect",
          palette: "dataset"
        })
      )
      .sort((a, b) => a.name.localeCompare(b.name)),
    ...metric_info
      .filter(d => metric_ids.includes(d.metric_id))
      .map(
        d => ({
          id: "metric_" + d.metric_id,
          name: d.metric_name,
          group: "metric",
          geom: "funkyrect",
          palette: "metric"
        })
      )
      .sort((a, b) => a.name.localeCompare(b.name)),
  ]

  if (has_resources) {
    column_info.push(
      {
        id: "mean_cpu_pct",
        name: "%CPU",
        group: "resources",
        geom: "funkyrect",
        palette: "resources"
      },
      {
        id: "mean_peak_memory_log",
        name: "Peak memory",
        label: "mean_peak_memory_str",
        group: "resources",
        geom: "rect",
        palette: "resources"
      },
      {
        id: "mean_disk_read_log",
        name: "Disk read",
        label: "mean_disk_read_str",
        group: "resources",
        geom: "rect",
        palette: "resources"
      },
      {
        id: "mean_disk_write_log",
        name: "Disk write",
        label: "mean_disk_write_str",
        group: "resources",
        geom: "rect",
        palette: "resources"
      },
      {
        id: "mean_duration_log",
        name: "Duration",
        label: "mean_duration_str",
        group: "resources",
        geom: "rect",
        palette: "resources"
      }
    )
  }

  column_info = column_info.map(d => {
    if (d.id === "method_name") {
      return {...d, options: {width: 15, hjust: 0}}
    } else if (d.id === "is_baseline") {
      return {...d, options: {width: 1}}
    } else if (d.geom === "bar") {
      return {...d, options: {width: 4}}
    } else {
      return d
    }
  })

  return column_info
}

column_groups = {
  let column_groups = [
    {
      group: "method",
      palette: null,
      level1: ""
    },
    {
      group: "overall",
      palette: "overall",
      level1: "Overall"
    },
    {
      group: "error_reason",
      palette: "error_reason",
      level1: "Error reason"
    },
    {
      group: "dataset",
      palette: "dataset",
      level1: dataset_info.length >= 3 ? "Datasets" : ""
    },
    {
      group: "metric",
      palette: "metric",
      level1: metric_info.length >= 3 ? "Metrics" : ""
    }
  ]

  if (has_resources) {
    column_groups.push(
      {group: "resources", palette: "resources", level1: "Resources"}
    )
  }

  return column_groups
}

// Palette per column group. The object literal is parenthesised so that it is
// read as an expression rather than as a block.
palettes = ({
  overall: "Greys",
  dataset: "Blues",
  metric: "Reds",
  resources: "YlOrBr",
  // colors[i] belongs to names[i]; the order matches the error_reason fractions.
  error_reason: {
    colors: [
      "#8DD3C7",
      "#FFFFB3",
      "#BEBADA",
      "#fdb462",
      "#999999",
      "#FFFFFF"
    ],
    names: [
      "Memory limit exceeded",
      "Time limit exceeded",
      "Execution error",
      "Unknown error",
      "Not applicable",
      "No error"
    ]
  }
})

// Render the summary heatmap. The row lists (summary_all, column_info,
// column_groups) are transposed into objects of column arrays before being
// passed in.
funkyheatmap(
    transpose_list_of_objects(summary_all),
    transpose_list_of_objects(column_info),
    // NOTE(review): presumably the row info, left empty here — confirm against funkyheatmapjs
    [],
    transpose_list_of_objects(column_groups),
    // NOTE(review): presumably the row groups, left empty here — confirm against funkyheatmapjs
    [],
    palettes,
    {
        fontSize: 14,
        rowHeight: 26,
        rootStyle: 'max-width: none',
        // driven by the "Color by rank" toggle
        colorByRank: color_by_rank,
        // theme colors are taken from CSS custom properties (--bs-*)
        theme: {
            oddRowBackground: 'var(--bs-body-bg)',
            evenRowBackground: 'var(--bs-button-hover)',
            textColor: 'var(--bs-body-color)',
            strokeColor: 'var(--bs-body-color)',
            headerColor: 'var(--bs-body-color)',
            hoverColor: 'var(--bs-body-color)'
        }
    },
    // driven by the "Minmax column" toggle
    scale_column
);

Figure 1: Overview of the results per method. This figure shows the mean of the scaled scores (group Overall), the mean scores per dataset (group Dataset) and the mean scores per metric (group Metric).

Display settings

// Display toggles. color_by_rank and scale_column are passed to the
// funkyheatmap call; show_con decides whether methods flagged is_baseline are
// kept in summary_all.
viewof color_by_rank = Inputs.toggle({label: "Color by rank:", value: true})
viewof scale_column = Inputs.toggle({label: "Minmax column:", value: false})
viewof show_con = Inputs.toggle({label: "Show control methods:", value: true})

Filter datasets

// Checkbox filter over the datasets that occur in the results
// (poss_dataset_ids). Options are labelled with the dataset name and yield
// the dataset id; `value` is given every dataset id from dataset_info.
viewof dataset_ids = Inputs.checkbox(
  dataset_info.filter(d => poss_dataset_ids.includes(d.dataset_id)),
  {
    keyof: d => d.dataset_name,
    valueof: d => d.dataset_id,
    value: dataset_info.map(d => d.dataset_id),
    label: "Datasets:"
  }
)

Filter methods

// Checkbox filter over the methods that occur in the results
// (poss_method_ids). Options are labelled with the method name and yield the
// method id; `value` is given every method id from method_info.
viewof method_ids = Inputs.checkbox(
  method_info.filter(d => poss_method_ids.includes(d.method_id)),
  {
    keyof: d => d.method_name,
    valueof: d => d.method_id,
    value: method_info.map(d => d.method_id),
    label: "Methods:"
  }
)

Filter metrics

// Checkbox filter over the metrics that occur in the results
// (poss_metric_ids). Options are labelled with the metric name and yield the
// metric id; `value` is given every metric id from metric_info.
viewof metric_ids = Inputs.checkbox(
  metric_info.filter(d => poss_metric_ids.includes(d.metric_id)),
  {
    keyof: d => d.metric_name,
    valueof: d => d.metric_id,
    value: metric_info.map(d => d.metric_id),
    label: "Metrics:"
  }
)

// Load d3 v7, expose it and `_` on window, then import funkyheatmapjs 0.2.5
// from unpkg and keep the module's default export.
// NOTE(review): presumably funkyheatmapjs reads d3 and `_` as globals, and `_`
// is provided by the notebook runtime — confirm.
funkyheatmap = (await require('d3@7').then(d3 => {
  window.d3 = d3;
  window._ = _;
  return import('https://unpkg.com/funkyheatmapjs@0.2.5');
})).default;

Results

Results table of the scores per method, dataset and metric (after scaling). Use the filters to make a custom subselection of methods and datasets. The “Overall mean” dataset is the mean value across all datasets.

Dataset info

Show

Mouse Pancreatic Islet Atlas

Source dataset · Data source · 23-01-2025

Mouse pancreatic islet scRNA-seq atlas across sexes, ages, and stress conditions including diabetes (Hrovatin et al. 2023).

To better understand pancreatic β-cell heterogeneity we generated a mouse pancreatic islet atlas capturing a wide range of biological conditions. The atlas contains scRNA-seq datasets of over 300,000 mouse pancreatic islet cells, of which more than 100,000 are β-cells, from nine datasets with 56 samples, including two previously unpublished datasets. The samples vary in sex, age (ranging from embryonic to aged), chemical stress, and disease status (including T1D NOD model development and two T2D models, mSTZ and db/db) together with different diabetes treatments. Additional information about data fields is available in anndata uns field ‘field_descriptions’ and on https://github.com/theislab/mm_pancreas_atlas_rep/blob/main/resources/cellxgene.md.

Tabula Sapiens

Source dataset · Data source · 23-01-2025

A multiple-organ, single-cell transcriptomic atlas of humans (Jones et al. 2022).

Tabula Sapiens is a benchmark, first-draft human cell atlas of nearly 500,000 cells from 24 organs of 15 normal human subjects. This work is the product of the Tabula Sapiens Consortium. Taking the organs from the same individual controls for genetic background, age, environment, and epigenetic effects and allows detailed analysis and comparison of cell types that are shared between tissues. Our work creates a detailed portrait of cell types as well as their distribution and variation in gene expression across tissues and within the endothelial, epithelial, stromal and immune compartments.

GTEX v9

Source dataset · Data source · 23-01-2025 · 969.19 MiB

Single-nucleus cross-tissue molecular reference maps to decipher disease gene function (Eraslan et al. 2022).

Understanding the function of genes and their regulation in tissue homeostasis and disease requires knowing the cellular context in which genes are expressed in tissues across the body. Single cell genomics allows the generation of detailed cellular atlases in human tissues, but most efforts are focused on single tissue types. Here, we establish a framework for profiling multiple tissues across the human body at single-cell resolution using single nucleus RNA-Seq (snRNA-seq), and apply it to 8 diverse, archived, frozen tissue types (three donors per tissue). We apply four snRNA-seq methods to each of 25 samples from 16 donors, generating a cross-tissue atlas of 209,126 nuclei profiles, and benchmark them vs. scRNA-seq of comparable fresh tissues. We use a conditional variational autoencoder (cVAE) to integrate an atlas across tissues, donors, and laboratory methods. We highlight shared and tissue-specific features of tissue-resident immune cells, identifying tissue-restricted and non-restricted resident myeloid populations. These include a cross-tissue conserved dichotomy between LYVE1- and HLA class II-expressing macrophages, and the broad presence of LAM-like macrophages across healthy tissues that is also observed in disease. For rare, monogenic muscle diseases, we identify cell types that likely underlie the neuromuscular, metabolic, and immune components of these diseases, and biological processes involved in their pathology. For common complex diseases and traits analyzed by GWAS, we identify the cell types and gene modules that potentially underlie disease mechanisms. The experimental and analytical frameworks we describe will enable the generation of large-scale studies of how cellular and molecular processes vary across individuals and populations.

HypoMap

Source dataset · Data source · 23-01-2025

A unified single cell gene expression atlas of the murine hypothalamus (Steuernagel et al. 2022).

The hypothalamus plays a key role in coordinating fundamental body functions. Despite recent progress in single-cell technologies, a unified catalogue and molecular characterization of the heterogeneous cell types and, specifically, neuronal subtypes in this brain region are still lacking. Here we present an integrated reference atlas “HypoMap” of the murine hypothalamus consisting of 384,925 cells, with the ability to incorporate new additional experiments. We validate HypoMap by comparing data collected from SmartSeq2 and bulk RNA sequencing of selected neuronal cell types with different degrees of cellular heterogeneity.

Diabetic Kidney Disease

Source dataset · Data source · 23-01-2025 · 398.37 MiB

Multimodal single cell sequencing implicates chromatin accessibility and genetic background in diabetic kidney disease progression (Wilson et al. 2022).

Multimodal single cell sequencing is a powerful tool for interrogating cell-specific changes in transcription and chromatin accessibility. We performed single nucleus RNA (snRNA-seq) and assay for transposase accessible chromatin sequencing (snATAC-seq) on human kidney cortex from donors with and without diabetic kidney disease (DKD) to identify altered signaling pathways and transcription factors associated with DKD. Both snRNA-seq and snATAC-seq had an increased proportion of VCAM1+ injured proximal tubule cells (PT_VCAM1) in DKD samples. PT_VCAM1 has a pro-inflammatory expression signature and transcription factor motif enrichment implicated NFkB signaling. We used stratified linkage disequilibrium score regression to partition heritability of kidney-function-related traits using publicly-available GWAS summary statistics. Cell-specific PT_VCAM1 peaks were enriched for heritability of chronic kidney disease (CKD), suggesting that genetic background may regulate chromatin accessibility and DKD progression. snATAC-seq found cell-specific differentially accessible regions (DAR) throughout the nephron that change accessibility in DKD and these regions were enriched for glucocorticoid receptor (GR) motifs. Changes in chromatin accessibility were associated with decreased expression of insulin receptor, increased gluconeogenesis, and decreased expression of the GR cytosolic chaperone, FKBP5, in the diabetic proximal tubule. Cleavage under targets and release using nuclease (CUT&RUN) profiling of GR binding in bulk kidney cortex and an in vitro model of the proximal tubule (RPTEC) showed that DAR co-localize with GR binding sites. CRISPRi silencing of GR response elements (GRE) in the FKBP5 gene body reduced FKBP5 expression in RPTEC, suggesting that reduced FKBP5 chromatin accessibility in DKD may alter cellular response to GR. 
We developed an open-source tool for single cell allele specific analysis (SALSA) to model the effect of genetic background on gene expression. Heterozygous germline single nucleotide variants (SNV) in proximal tubule ATAC peaks were associated with allele-specific chromatin accessibility and differential expression of target genes within cis-coaccessibility networks. Partitioned heritability of proximal tubule ATAC peaks with a predicted allele-specific effect was enriched for eGFR, suggesting that genetic background may modify DKD progression in a cell-specific manner.

Immune Cell Atlas

Source dataset · Data source · 23-01-2025

Cross-tissue immune cell analysis reveals tissue-specific features in humans (Domínguez Conde et al. 2022).

Despite their crucial role in health and disease, our knowledge of immune cells within human tissues remains limited. We surveyed the immune compartment of 16 tissues from 12 adult donors by single-cell RNA sequencing and VDJ sequencing generating a dataset of ~360,000 cells. To systematically resolve immune cell heterogeneity across tissues, we developed CellTypist, a machine learning tool for rapid and precise cell type annotation. Using this approach, combined with detailed curation, we determined the tissue distribution of finely phenotyped immune cell types, revealing hitherto unappreciated tissue-specific features and clonal architecture of T and B cells. Our multitissue approach lays the foundation for identifying highly resolved immune cell types by leveraging a common reference dataset, tissue-integrated expression analysis, and antigen receptor sequencing.

Method info

Show

batchelor fastMNN

Documentation · Repository · Source Code · Container · build_main

Fast mutual nearest neighbors correction (Haghverdi et al. 2018)

The fastMNN() approach is much simpler than the original mnnCorrect() algorithm, and proceeds in several steps.

Perform a multi-sample PCA on the (cosine-)normalized expression values to reduce dimensionality.
Identify MNN pairs in the low-dimensional space between a reference batch and a target batch.
Remove variation along the average batch vector in both reference and target batches.
Correct the cells in the target batch towards the reference, using locally weighted correction vectors.
Merge the corrected target batch with the reference, and repeat with the next target batch.

batchelor mnnCorrect

Documentation · Repository · Source Code · Container · build_main

Mutual nearest neighbors correction (Haghverdi et al. 2018)

Correct for batch effects in single-cell expression data using the mutual nearest neighbors method.

BBKNN

Documentation · Repository · Source Code · Container · build_main

BBKNN creates k nearest neighbours graph by identifying neighbours within batches, then combining and processing them with UMAP for visualization (Polański et al. 2019)

“BBKNN or batch balanced k nearest neighbours graph is built for each cell by identifying its k nearest neighbours within each defined batch separately, creating independent neighbour sets for each cell in each batch. These sets are then combined and processed with the UMAP algorithm for visualisation.”

Combat

Documentation · Repository · Source Code · Container · build_main

Adjusting batch effects in microarray expression data using empirical Bayes methods (Johnson, Li, and Rabinovic 2006)

“An Empirical Bayes (EB) approach to correct for batch effects. It estimates batch-specific parameters by pooling information across genes in each batch and shrinks the estimates towards the overall mean of the batch effect estimates across all genes. These parameters are then used to adjust the data for batch effects, leading to more accurate and reproducible results.”

Geneformer

Documentation · Repository · Source Code · Container · build_main

Geneformer is a foundation transformer model pretrained on a large-scale corpus of single cell transcriptomes (Theodoris et al. 2023) (Chen et al. 2024)

Geneformer is a foundation transformer model pretrained on a large-scale corpus of single cell transcriptomes to enable context-aware predictions in network biology. For this task, Geneformer is used to create a batch-corrected cell embedding.

Harmony

Documentation · Repository · Source Code · Container · build_main

Fast, sensitive and accurate integration of single-cell data with Harmony (Korsunsky et al. 2019)

Harmony is a general-purpose R package with an efficient algorithm for integrating multiple data sets. It is especially useful for large single-cell datasets such as single-cell RNA-seq.

Harmonypy

Documentation · Repository · Source Code · Container · build_main

harmonypy is a port of the harmony R package by Ilya Korsunsky (Korsunsky et al. 2019)

Harmony is a general-purpose R package with an efficient algorithm for integrating multiple data sets. It is especially useful for large single-cell datasets such as single-cell RNA-seq.

LIGER

Documentation · Repository · Source Code · Container · build_main

Linked Inference of Genomic Experimental Relationships (Welch et al. 2019)

LIGER or linked inference of genomic experimental relationships uses iNMF deriving and implementing a novel coordinate descent algorithm to efficiently do the factorization. Joint clustering is performed and factor loadings are normalised.

mnnpy

Documentation · Repository · Source Code · Container · build_main

Batch effect correction by matching mutual nearest neighbors, Python implementation (Kang 2022)

An implementation of MNN correct in python featuring low memory usage, full multicore support and compatibility with the scanpy framework.

Batch effect correction by matching mutual nearest neighbors (Haghverdi et al, 2018) has been implemented as a function ‘mnnCorrect’ in the R package scran. Sadly it’s extremely slow for big datasets and doesn’t make full use of the parallel architecture of modern CPUs.

This project is a python implementation of the MNN correct algorithm which takes advantage of python’s extendability and hackability. It seamlessly integrates with the scanpy framework and has multicore support in its bones.

pyliger

Documentation · Repository · Source Code · Container · build_main

Python implementation of LIGER (Linked Inference of Genomic Experimental Relationships) (Welch et al. 2019)

LIGER (installed as rliger) is a package for integrating and analyzing multiple single-cell datasets, developed by the Macosko lab and maintained/extended by the Welch lab. It relies on integrative non-negative matrix factorization to identify shared and dataset-specific factors.

SCALEX

Documentation · Repository · Source Code · Container · build_main

Online single-cell data integration through projecting heterogeneous datasets into a common cell-embedding space (Xiong et al. 2022)

SCALEX is a method for integrating heterogeneous single-cell data online using a VAE framework. Its generalised encoder disentangles batch-related components from batch-invariant biological components, which are then projected into a common cell-embedding space.

Scanorama

Documentation · Repository · Source Code · Container · build_main

Efficient integration of heterogeneous single-cell transcriptomes using Scanorama (Hie, Bryson, and Berger 2019)

Scanorama enables batch-correction and integration of heterogeneous scRNA-seq datasets. It is designed to be used in scRNA-seq pipelines downstream of noise-reduction methods, including those for imputation and highly-variable gene filtering. The results from Scanorama integration and batch correction can then be used as input to other tools for scRNA-seq clustering, visualization, and analysis.

scANVI

Documentation · Repository · Source Code · Container · build_main

scANVI is a deep learning method that considers cell type labels (Lopez et al. 2018)

scANVI (single-cell ANnotation using Variational Inference; Python class SCANVI) is a semi-supervised model for single-cell transcriptomics data. In a sense, it can be seen as a scVI extension that can leverage the cell type knowledge for a subset of the cells present in the data sets to infer the states of the rest of the cells.

scGPT (fine-tuned)

Documentation · Repository · Source Code · Container · build_main

A foundation model for single-cell biology (fine-tuned) (Cui et al. 2024)

scGPT is a foundation model for single-cell biology based on a generative pre-trained transformer and trained on a repository of over 33 million cells.

Here, we fine-tune the pre-trained model for the batch integration task.

scGPT (zero shot)

Documentation · Repository · Source Code · Container · build_main

A foundation model for single-cell biology (zero shot) (Cui et al. 2024)

scGPT is a foundation model for single-cell biology based on a generative pre-trained transformer and trained on a repository of over 33 million cells.

Here, we use zero-shot output from a pre-trained model to get an integrated embedding for the batch integration task.

SCimilarity

Documentation · Repository · Source Code · Container · build_main

SCimilarity provides unifying representation of single cell expression profiles (Heimberg et al. 2023)

SCimilarity is a unifying representation of single cell expression profiles that quantifies similarity between expression states and generalizes to represent new studies without additional training

scPRINT

Documentation · Repository · Source Code · Container · build_main

scPRINT is a large transformer model built for the inference of gene networks (Kalfon et al. 2024)

scPRINT is a large transformer model built for the inference of gene networks (connections between genes explaining the cell’s expression profile) from scRNAseq data.

It uses novel encoding and decoding of the cell expression profile and new pre-training methodologies to learn a cell model.

scPRINT can be used to perform the following analyses:

expression denoising: increase the resolution of your scRNAseq data
cell embedding: generate a low-dimensional representation of your dataset
label prediction: predict the cell type, disease, sequencer, sex, and ethnicity of your cells
gene network inference: generate a gene network from any cell or cell cluster in your scRNAseq dataset

scVI

Documentation · Repository · Source Code · Container · build_main

scVI combines a variational autoencoder with a hierarchical Bayesian model (Lopez et al. 2018)

scVI combines a variational autoencoder with a hierarchical Bayesian model. It uses the negative binomial distribution to describe gene expression of each cell, conditioned on unobserved factors and the batch variable. ScVI is run as implemented in Luecken et al.

UCE

Documentation · Repository · Source Code · Container · build_main

UCE offers a unified biological latent space that can represent any cell (Rosen et al. 2023)

Universal Cell Embedding (UCE) is a single-cell foundation model that offers a unified biological latent space that can represent any cell, regardless of tissue or species.

Control method info

Show

Embed cell types

Documentation · Repository · Source Code · Container · build_main

Cells are embedded as a one-hot encoding of celltype labels

Perfect embedding by celltype with jitter

Documentation · Repository · Source Code · Container · build_main

Cells are embedded as a one-hot encoding of celltype labels, with a small amount of random noise added to the embedding

No integration

Documentation · Repository · Source Code · Container · build_main

Original feature space is not modified

No integration by Batch

Documentation · Repository · Source Code · Container · build_main

Cells are embedded by computing PCA independently on each batch

Shuffle integration

Documentation · Repository · Source Code · Container · build_main

Integrations are randomly permuted

Shuffle integration by batch

Documentation · Repository · Source Code · Container · build_main

Integrations are randomly permuted within each batch

Shuffle integration by cell type

Documentation · Repository · Source Code · Container · build_main

Integrations are randomly permuted within each cell type

Metric info

Show

ASW batch

Source code · Container

Modified average silhouette width (ASW) of batch (Luecken et al. 2021b).

We consider the absolute silhouette width, $s(i)$, on batch labels per cell $i$. Here, 0 indicates that batches are well mixed, and any deviation from 0 indicates a batch effect: $s_{\mathrm{batch}}(i) = |s(i)|$.

To ensure higher scores indicate better batch mixing, these scores are scaled by subtracting them from 1. As we expect batches to integrate within cell identity clusters, we compute the $\mathrm{batchASW}_j$ score for each cell label $j$ separately, using the equation: $\mathrm{batchASW}_j = \frac{1}{|C_j|} \sum_{i \in C_j} \left(1 - s_{\mathrm{batch}}(i)\right)$,

where $C_j$ is the set of cells with the cell label $j$ and $|C_j|$ denotes the number of cells in that set.

To obtain the final batchASW score, the label-specific $\mathrm{batchASW}_j$ scores are averaged: $\mathrm{batchASW} = \frac{1}{|M|} \sum_{j \in M} \mathrm{batchASW}_j$.

Here, M is the set of unique cell labels.

ASW Label

Source code · Container

Average silhouette of cell identity labels (cell types) (Luecken et al. 2021b).

For the bio-conservation score, the ASW was computed on cell identity labels and scaled to a value between 0 and 1 using the equation: $\mathrm{celltypeASW} = (\mathrm{ASW}_C + 1)/2$,

where $C$ denotes the set of all cell identity labels. For information about the batch silhouette score, see the ASW batch metric above.

Cell Cycle Conservation

Source code · Container

Cell cycle conservation score based on principal component regression on cell cycle gene scores (Luecken et al. 2021b).

The cell-cycle conservation score evaluates how well the cell-cycle effect can be captured before and after integration. We computed cell-cycle scores using Scanpy’s score_cell_cycle function with a reference gene set from Tirosh et al for the respective cell-cycle phases. We used the same set of cell-cycle genes for mouse and human data (using capitalization to convert between the gene symbols). We then computed the variance contribution of the resulting S and G2/M phase scores using principal component regression (Principal component regression), which was performed for each batch separately. The differences in variance before, $\mathrm{Var}_{\mathrm{before}}$, and after, $\mathrm{Var}_{\mathrm{after}}$, integration were aggregated into a final score between 0 and 1, using the equation: $\mathrm{CC\,conservation} = 1 - \frac{|\mathrm{Var}_{\mathrm{after}} - \mathrm{Var}_{\mathrm{before}}|}{\mathrm{Var}_{\mathrm{before}}}$.

In this equation, values close to 0 indicate lower conservation and 1 indicates complete conservation of the variance explained by cell cycle. In other words, the variance remains unchanged within each batch for complete conservation, while any deviation from the preintegration variance contribution reduces the score.

ARI

Source code · Container

Adjusted Rand Index compares clustering overlap, correcting for random labels and considering correct overlaps and disagreements (Luecken et al. 2021b) (Hubert and Arabie 1985).

The Adjusted Rand Index (ARI) compares the overlap of two clusterings; it considers both correct clustering overlaps while also counting correct disagreements between two clusterings. We compared the cell-type labels with the NMI-optimized Louvain clustering computed on the integrated dataset. The adjustment of the Rand index corrects for randomly correct labels. An ARI of 0 or 1 corresponds to random labeling or a perfect match, respectively.

NMI

Source code · Container

NMI compares overlap by scaling using mean entropy terms and optimizing Louvain clustering to obtain the best match between clusters and labels (Amelio and Pizzuti 2015) (Luecken et al. 2021b).

Normalized Mutual Information (NMI) compares the overlap of two clusterings. We used NMI to compare the cell-type labels with Louvain clusters computed on the integrated dataset. The overlap was scaled using the mean of the entropy terms for cell-type and cluster labels. Thus, NMI scores of 0 or 1 correspond to uncorrelated clustering or a perfect match, respectively. We performed optimized Louvain clustering for this metric to obtain the best match between clusters and labels.

Graph Connectivity

Source code · Container

Connectivity of the subgraph per cell type label (Luecken et al. 2021b).

The graph connectivity metric assesses whether the kNN graph representation, G, of the integrated data directly connects all cells with the same cell identity label. For each cell identity label c, we created the subset kNN graph G(Nc;Ec) to contain only cells from a given label. Using these subset kNN graphs, we computed the graph connectivity score using the equation:

$gc = \frac{1}{|C|} \sum_{c \in C} \frac{|\mathrm{LCC}(G(N_c; E_c))|}{|N_c|}$.

Here, $C$ represents the set of cell identity labels, $|\mathrm{LCC}()|$ is the number of nodes in the largest connected component of the graph, and $|N_c|$ is the number of nodes with cell identity $c$. The resultant score has a range of $(0; 1]$, where 1 indicates that all cells with the same cell identity are connected in the integrated kNN graph, and the lowest possible score indicates a graph where no cell is connected. As this score is computed on the kNN graph, it can be used to evaluate all integration outputs.

HVG overlap

Source code · Container

Overlap of highly variable genes per batch before and after integration (Luecken et al. 2021b).

The HVG conservation score is a proxy for the preservation of the biological signal. If the data integration method returned a corrected data matrix, we computed the number of HVGs before and after correction for each batch via Scanpy’s highly_variable_genes function (using the ‘cell ranger’ flavor). If available, we computed 500 HVGs per batch. If fewer than 500 genes were present in the integrated object for a batch, the number of HVGs was set to half the total genes in that batch. The overlap coefficient is as follows: $\mathrm{overlap}(X, Y) = \frac{|X \cap Y|}{\min(|X|, |Y|)}$,

where X and Y denote the fraction of preserved informative genes. The overall HVG score is the mean of the per-batch HVG overlap coefficients.

Isolated label ASW

Source code · Container

Evaluate how well isolated labels separate by average silhouette width (Luecken et al. 2021b).

Isolated cell labels are defined as the labels present in the least number of batches in the integration task. The score evaluates how well these isolated labels separate from other cell identities.

The isolated label ASW score is obtained by computing the ASW of isolated versus non-isolated labels on the PCA embedding (ASW metric above) and scaling this score to be between 0 and 1. The final score for each metric version consists of the mean isolated score of all isolated labels.

Isolated label F1 score

Source code · Container

Evaluate how well isolated labels coincide with clusters (Luecken et al. 2021b).

We developed two isolated label scores to evaluate how well the data integration methods dealt with cell identity labels shared by few batches. Specifically, we identified isolated cell labels as the labels present in the least number of batches in the integration task. The score evaluates how well these isolated labels separate from other cell identities. We implemented the isolated label metric in two versions: (1) the best clustering of the isolated label (F1 score) and (2) the global ASW of the isolated label. For the cluster-based score, we first optimize the cluster assignment of the isolated label using the F1 score across Louvain clustering resolutions ranging from 0.1 to 2 in resolution steps of 0.1. The optimal F1 score for the isolated label is then used as the metric score. The F1 score is a weighted mean of precision and recall given by the equation: $F_1 = 2 \times \frac{\mathrm{precision} \times \mathrm{recall}}{\mathrm{precision} + \mathrm{recall}}$.

It returns a value between 0 and 1, where 1 shows that all of the isolated label cells and no others are captured in the cluster. For the isolated label ASW score, we compute the ASW of isolated versus nonisolated labels on the PCA embedding (ASW metric above) and scale this score to be between 0 and 1. The final score for each metric version consists of the mean isolated score of all isolated labels.

kBET

Source code · Container

kBET algorithm to determine how well batches are mixed within a cell type (Luecken et al. 2021b).

The kBET algorithm (v.0.99.6, release 4c9dafa) determines whether the label composition of a k nearest neighborhood of a cell is similar to the expected (global) label composition (Buettner et al., Nat Meth 2019). The test is repeated for a random subset of cells, and the results are summarized as a rejection rate over all tested neighborhoods. Thus, kBET works on a kNN graph.

We compute kNN graphs where k = 50 for joint embeddings and corrected feature outputs via Scanpy preprocessing steps. To test for technical effects and to account for cell-type frequency shifts across datasets, we applied kBET separately on the batch variable for each cell identity label. Using the kBET defaults, a k equal to the median of the number of cells per batch within each label is used for this computation. Additionally, we set the minimum and maximum thresholds of k to 10 and 100, respectively. As kNN graphs that have been subset by cell identity labels may no longer be connected, we compute kBET per connected component. If >25% of cells were assigned to connected components too small for kBET computation (smaller than k × 3), we assigned a kBET score of 1 to denote poor batch removal. Subsequently, kBET scores for each label were averaged and subtracted from 1 to give a final kBET score.

In Open Problems we do not run kBET on graph outputs to avoid computation-intensive diffusion processes being run.

iLISI

Source code · Container

Local inverse Simpson’s Index (Luecken et al. 2021b).

Local Inverse Simpson’s Index metrics adapted from Korsunsky et al. 2019 to run on all full feature, embedding and kNN integration outputs via shortest path-based distance computation on single-cell kNN graphs. The metric assesses whether clusters of cells in a single-cell RNA-seq dataset are well-mixed across a categorical batch variable.

The original LISI score ranges from 0 to the number of categories, with the latter indicating good cell mixing. This is rescaled to a score between 0 and 1.

cLISI

Source code · Container

Local inverse Simpson’s Index (Luecken et al. 2021b).

The original LISI score ranges from 0 to the number of categories, with the latter indicating good cell mixing. This is rescaled to a score between 0 and 1.

PCR

Source code · Container

Compare explained variance by batch before and after integration (Luecken et al. 2021b).

Principal component regression, derived from PCA, has previously been used to quantify batch removal. Briefly, the $R^2$ was calculated from a linear regression of the covariate of interest (for example, the batch variable $B$) onto each principal component. The variance contribution of the batch effect per principal component was then calculated as the product of the variance explained by the $i$th principal component (PC) and the corresponding $R^2(\mathrm{PC}_i \mid B)$. The sum across all variance contributions by the batch effects in all principal components gives the total variance explained by the batch variable as follows: $\mathrm{Var}(C \mid B) = \sum_{i=1}^{G} \mathrm{Var}(C \mid \mathrm{PC}_i) \times R^2(\mathrm{PC}_i \mid B)$,

where $\mathrm{Var}(C \mid \mathrm{PC}_i)$ is the variance of the data matrix $C$ explained by the $i$th principal component.

Quality control results

Show

Category	Name	Value	Condition	Severity
Scaling	Best score bbknn ilisi	35.0348000	best_score <= 2	✗✗✗
Raw results	Method 'scgpt_finetuned' %missing	1.0000000	pct_missing <= .1	✗✗✗
Raw results	Method 'scprint' %missing	1.0000000	pct_missing <= .1	✗✗✗
Raw results	Method 'batchelor_mnn_correct' %missing	0.8589744	pct_missing <= .1	✗✗✗
Raw results	Method 'mnnpy' %missing	0.8589744	pct_missing <= .1	✗✗✗
Raw results	Metric 'hvg_overlap' %missing	0.7564103	pct_missing <= .1	✗✗✗
Raw results	Method 'bbknn' %missing	0.5512821	pct_missing <= .1	✗✗✗
Raw results	Method 'geneformer' %missing	0.4102564	pct_missing <= .1	✗✗✗
Raw results	Method 'scgpt_zeroshot' %missing	0.4102564	pct_missing <= .1	✗✗✗
Raw results	Method 'scimilarity' %missing	0.4102564	pct_missing <= .1	✗✗✗
Raw results	Metric 'kbet' %missing	0.4038462	pct_missing <= .1	✗✗✗
Raw results	Dataset 'cellxgene_census/hypomap' %missing	0.3786982	pct_missing <= .1	✗✗✗
Raw results	Metric 'isolated_label_asw' %missing	0.3653846	pct_missing <= .1	✗✗✗
Raw results	Dataset 'cellxgene_census/mouse_pancreas_atlas' %missing	0.3579882	pct_missing <= .1	✗✗✗
Raw results	Metric 'isolated_label_f1' %missing	0.3333333	pct_missing <= .1	✗✗✗
Dataset info	Pct 'task_id' missing	1.0000000	percent_missing(dataset_info, field)	✗✗
Method info	Pct 'paper_reference' missing	0.7307692	percent_missing(method_info, field)	✗✗
Metric info	Pct 'paper_reference' missing	1.0000000	percent_missing(metric_info, field)	✗✗
Raw results	Dataset 'cellxgene_census/dkd' %missing	0.2810651	pct_missing <= .1	✗✗
Raw results	Metric 'asw_label' %missing	0.2243590	pct_missing <= .1	✗✗
Raw results	Dataset 'cellxgene_census/tabula_sapiens' %missing	0.2218935	pct_missing <= .1	✗✗
Raw results	Dataset 'cellxgene_census/gtex_v9' %missing	0.2189349	pct_missing <= .1	✗✗
Raw results	Dataset 'cellxgene_census/immune_cell_atlas' %missing	0.2189349	pct_missing <= .1	✗✗
Raw results	Metric 'asw_batch' %missing	0.2179487	pct_missing <= .1	✗✗
Raw results	Metric 'cell_cycle_conservation' %missing	0.2179487	pct_missing <= .1	✗✗
Raw results	Metric 'pcr' %missing	0.2179487	pct_missing <= .1	✗✗
Raw results	Metric 'ari' %missing	0.1794872	pct_missing <= .1	✗
Raw results	Metric 'clisi' %missing	0.1794872	pct_missing <= .1	✗
Raw results	Metric 'graph_connectivity' %missing	0.1794872	pct_missing <= .1	✗
Raw results	Metric 'ilisi' %missing	0.1794872	pct_missing <= .1	✗
Raw results	Metric 'nmi' %missing	0.1794872	pct_missing <= .1	✗
Scaling	Worst score scanorama hvg_overlap	-1.5279000	worst_score >= -1	✗
Raw results	Method 'uce' %missing	0.1410256	pct_missing <= .1	✗
Raw results	Method 'batchelor_fastmnn' %missing	0.1282051	pct_missing <= .1	✗
Raw results	Method 'harmony' %missing	0.1282051	pct_missing <= .1	✗
Raw results	Method 'harmonypy' %missing	0.1282051	pct_missing <= .1	✗
Raw results	Method 'liger' %missing	0.1282051	pct_missing <= .1	✗
Raw results	Method 'pyliger' %missing	0.1282051	pct_missing <= .1	✗
Raw results	Method 'scanvi' %missing	0.1282051	pct_missing <= .1	✗
Raw results	Method 'scvi' %missing	0.1282051	pct_missing <= .1	✗
Scaling	Worst score scalex hvg_overlap	-1.2449000	worst_score >= -1	✗
Raw results	Method 'embed_cell_types' %missing	0.1153846	pct_missing <= .1	✗
Raw results	Method 'embed_cell_types_jittered' %missing	0.1153846	pct_missing <= .1	✗
Raw results	Method 'no_integration' %missing	0.1153846	pct_missing <= .1	✗
Raw results	Method 'no_integration_batch' %missing	0.1153846	pct_missing <= .1	✗

Normalisation visualisation

Show

Authors

Michaela Mueller (maintainer, author) ,
Malte Luecken (author) ,
Daniel Strobl (author) ,
Robrecht Cannoodt (contributor) ,
Scott Gigante (contributor) ,
Kai Waldrant (contributor) ,
Nartin Kim (contributor) ,

References

Amelio, Alessia, and Clara Pizzuti. 2015. “Is Normalized Mutual Information a Fair Measure for Comparing Community Detection Methods?” In Proceedings of the 2015 IEEE/ACM International Conference on Advances in Social Networks Analysis and Mining 2015. ASONAM ’15. ACM. https://doi.org/10.1145/2808797.2809344.

Chazarra-Gil, Ruben, Stijn van Dongen, Vladimir Yu Kiselev, and Martin Hemberg. 2021. “Flexible Comparison of Batch Correction Methods for Single-Cell RNA-Seq Using BatchBench.” Nucleic Acids Research 49 (7): e42–42. https://doi.org/10.1093/nar/gkab004.

Chen, Han, Madhavan S Venkatesh, Javier Gomez Ortega, Siddharth V Mahesh, Tarak N Nandi, Ravi K Madduri, Karin Pelka, and Christina V Theodoris. 2024. “Quantized Multi-Task Learning for Context-Specific Representations of Gene Network Dynamics,” August. https://doi.org/10.1101/2024.08.16.608180.

Cui, Haotian, Chloe Wang, Hassaan Maan, Kuan Pang, Fengning Luo, Nan Duan, and Bo Wang. 2024. “scGPT: Toward Building a Foundation Model for Single-Cell Multi-Omics Using Generative AI.” Nature Methods 21 (8): 1470–80. https://doi.org/10.1038/s41592-024-02201-0.

Domínguez Conde, C., C. Xu, L. B. Jarvis, D. B. Rainbow, S. B. Wells, T. Gomes, S. K. Howlett, et al. 2022. “Cross-Tissue Immune Cell Analysis Reveals Tissue-Specific Features in Humans.” Science 376 (6594). https://doi.org/10.1126/science.abl5197.

Eraslan, Gökcen, Eugene Drokhlyansky, Shankara Anand, Evgenij Fiskin, Ayshwarya Subramanian, Michal Slyper, Jiali Wang, et al. 2022. “Single-Nucleus Cross-Tissue Molecular Reference Maps Toward Understanding Disease Gene Function.” Science 376 (6594). https://doi.org/10.1126/science.abl4290.

Haghverdi, Laleh, Aaron T L Lun, Michael D Morgan, and John C Marioni. 2018. “Batch Effects in Single-Cell RNA-Sequencing Data Are Corrected by Matching Mutual Nearest Neighbors.” Nature Biotechnology 36 (5): 421–27. https://doi.org/10.1038/nbt.4091.

Heimberg, Graham, Tony Kuo, Daryle DePianto, Tobias Heigl, Nathaniel Diamant, Omar Salem, Gabriele Scalia, et al. 2023. “Scalable Querying of Human Cell Atlases via a Foundational Model Reveals Commonalities Across Fibrosis-Associated Macrophages,” July. https://doi.org/10.1101/2023.07.18.549537.

Hie, Brian, Bryan Bryson, and Bonnie Berger. 2019. “Efficient Integration of Heterogeneous Single-Cell Transcriptomes Using Scanorama.” Nature Biotechnology 37 (6): 685–91. https://doi.org/10.1038/s41587-019-0113-3.

Hrovatin, Karin, Aimée Bastidas-Ponce, Mostafa Bakhti, Luke Zappia, Maren Büttner, Ciro Sallino, Michael Sterr, et al. 2023. “Delineating Mouse β-Cell Identity During Lifetime and in Diabetes with a Single Cell Atlas.” bioRxiv. https://doi.org/10.1101/2022.12.22.521557.

Hubert, Lawrence, and Phipps Arabie. 1985. “Comparing Partitions.” Journal of Classification 2 (1): 193–218. https://doi.org/10.1007/bf01908075.

Johnson, W. Evan, Cheng Li, and Ariel Rabinovic. 2006. “Adjusting Batch Effects in Microarray Expression Data Using Empirical Bayes Methods.” Biostatistics 8 (1): 118–27. https://doi.org/10.1093/biostatistics/kxj037.

Jones, Robert C., Jim Karkanias, Mark A. Krasnow, Angela Oliveira Pisco, Stephen R. Quake, Julia Salzman, Nir Yosef, et al. 2022. “The Tabula Sapiens: A Multiple-Organ, Single-Cell Transcriptomic Atlas of Humans.” Science 376 (6594). https://doi.org/10.1126/science.abl4896.

Kalfon, Jérémie, Jules Samaran, Gabriel Peyré, and Laura Cantini. 2024. “scPRINT: Pre-Training on 50 Million Cells Allows Robust Gene Network Predictions,” July. https://doi.org/10.1101/2024.07.29.605556.

Kang, Chris. 2022. “Mnnpy.” GitHub Repository. https://github.com/chriscainx/mnnpy; GitHub.

Korsunsky, Ilya, Nghia Millard, Jean Fan, Kamil Slowikowski, Fan Zhang, Kevin Wei, Yuriy Baglaenko, Michael Brenner, Po-ru Loh, and Soumya Raychaudhuri. 2019. “Fast, Sensitive and Accurate Integration of Single-Cell Data with Harmony.” Nature Methods 16 (12): 1289–96. https://doi.org/10.1038/s41592-019-0619-0.

Lopez, Romain, Jeffrey Regier, Michael B. Cole, Michael I. Jordan, and Nir Yosef. 2018. “Deep Generative Modeling for Single-Cell Transcriptomics.” Nature Methods 15 (12): 1053–58. https://doi.org/10.1038/s41592-018-0229-2.

Luecken, Malte D., M. Büttner, K. Chaichoompu, A. Danese, M. Interlandi, M. F. Mueller, D. C. Strobl, et al. 2021a. “Benchmarking Atlas-Level Data Integration in Single-Cell Genomics.” Nature Methods 19 (1): 41–50. https://doi.org/10.1038/s41592-021-01336-8.

———, et al. 2021b. “Benchmarking Atlas-Level Data Integration in Single-Cell Genomics.” Nature Methods 19 (1): 41–50. https://doi.org/10.1038/s41592-021-01336-8.

Mereu, Elisabetta, Atefeh Lafzi, Catia Moutinho, Christoph Ziegenhain, Davis J McCarthy, Adrian Alvarez-Varela, Eduard Batlle, et al. 2020. “Benchmarking Single-Cell RNA-Sequencing Protocols for Cell Atlas Projects.” Nature Biotechnology 38 (6): 747–55. https://doi.org/10.1038/s41587-020-0469-4.

Polański, Krzysztof, Matthew D Young, Zhichao Miao, Kerstin B Meyer, Sarah A Teichmann, and Jong-Eun Park. 2019. “BBKNN: Fast Batch Alignment of Single Cell Transcriptomes.” Edited by Bonnie Berger. Bioinformatics 36 (3): 964–65. https://doi.org/10.1093/bioinformatics/btz625.

Rosen, Yanay, Yusuf Roohani, Ayush Agrawal, Leon Samotorcan, Tabula Sapiens Consortium, Stephen R. Quake, and Jure Leskovec. 2023. “Universal Cell Embeddings: A Foundation Model for Cell Biology,” November. https://doi.org/10.1101/2023.11.28.568918.

Steuernagel, Lukas, Brian Y. H. Lam, Paul Klemm, Georgina K. C. Dowsett, Corinna A. Bauder, John A. Tadross, Tamara Sotelo Hitschfeld, et al. 2022. “HypoMap—a Unified Single-Cell Gene Expression Atlas of the Murine Hypothalamus.” Nature Metabolism 4 (10): 1402–19. https://doi.org/10.1038/s42255-022-00657-y.

Theodoris, Christina V., Ling Xiao, Anant Chopra, Mark D. Chaffin, Zeina R. Al Sayed, Matthew C. Hill, Helene Mantineo, et al. 2023. “Transfer Learning Enables Predictions in Network Biology.” Nature 618 (7965): 616–24. https://doi.org/10.1038/s41586-023-06139-9.

Tran, Hoa Thi Nhu, Kok Siong Ang, Marion Chevrier, Xiaomeng Zhang, Nicole Yee Shin Lee, Michelle Goh, and Jinmiao Chen. 2020. “A Benchmark of Batch-Effect Correction Methods for Single-Cell RNA Sequencing Data.” Genome Biology 21 (1). https://doi.org/10.1186/s13059-019-1850-9.

Welch, Joshua D., Velina Kozareva, Ashley Ferreira, Charles Vanderburg, Carly Martin, and Evan Z. Macosko. 2019. “Single-Cell Multi-Omic Integration Compares and Contrasts Features of Brain Cell Identity.” Cell 177 (7): 1873–1887.e17. https://doi.org/10.1016/j.cell.2019.05.006.

Wilson, Parker C., Yoshiharu Muto, Haojia Wu, Anil Karihaloo, Sushrut S. Waikar, and Benjamin D. Humphreys. 2022. “Multimodal Single Cell Sequencing Implicates Chromatin Accessibility and Genetic Background in Diabetic Kidney Disease Progression.” Nature Communications 13 (1). https://doi.org/10.1038/s41467-022-32972-z.

Xiong, Lei, Kang Tian, Yuzhe Li, Weixi Ning, Xin Gao, and Qiangfeng Cliff Zhang. 2022. “Online Single-Cell Data Integration Through Projecting Heterogeneous Datasets into a Common Cell-Embedding Space.” Nature Communications 13 (1). https://doi.org/10.1038/s41467-022-33758-z.

Zappia, Luke, Belinda Phipson, and Alicia Oshlack. 2018. “Exploring the Single-Cell RNA-Seq Analysis Landscape with the scRNA-Tools Database.” Edited by Dina Schneidman. PLOS Computational Biology 14 (6): e1006245. https://doi.org/10.1371/journal.pcbi.1006245.