Dimensionality reduction for visualisation

Reduction of high-dimensional datasets to 2D for visualization & interpretation

4 datasets · 23 methods · 3 control methods · 10 metrics

Info

Task info Method info Metric info Dataset info Results

Dimensionality reduction is one of the key challenges in single-cell data representation. Routine single-cell RNA sequencing (scRNA-seq) experiments measure cells in roughly 20,000-30,000 dimensions (i.e., features - mostly gene transcripts but also other functional elements encoded in mRNA such as lncRNAs). Since its inception, scRNA-seq experiments have been growing in terms of the number of cells measured. Originally, cutting-edge SmartSeq experiments would yield a few hundred cells, at best. Now, it is not uncommon to see experiments that yield over 100,000 cells or even > 1 million cells.

Each feature in a dataset functions as a single dimension. While each of the ~30,000 dimensions measured in each cell contribute to an underlying data structure, the overall structure of the data is challenging to display in few dimensions due to data sparsity and the “curse of dimensionality” (distances in high dimensional data don’t distinguish data points well). Thus, we need to find a way to dimensionally reduce the data for visualization and interpretation.

Summary

function aggregate_scores(obj) {
  return d3.mean(obj.map(val => {
    if (val.score === undefined || isNaN(val.score)) return 0;
    return Math.min(1, Math.max(0, val.score))
  }));
}

function transpose_list_of_objects(list) {
  return Object.fromEntries(Object.keys(list[0]).map(key => [key, list.map(d => d[key])]))
}

function label_time(time) {
  if (time < 1e-5) return "0s";
  if (time < 1) return "<1s";
  if (time < 60) return `${Math.floor(time)}s`;
  if (time < 3600) return `${Math.floor(time / 60)}m`;
  if (time < 3600 * 24) return `${Math.floor(time / 3600)}h`;
  if (time < 3600 * 24 * 7) return `${Math.floor(time / 3600 / 24)}d`;
  return ">7d"; // Assuming missing values are encoded as NaN
}

function label_memory(x_mb, include_mb = true) {
  if (!include_mb && x_mb < 1e3) return "<1G";
  if (x_mb < 1) return "<1M";
  if (x_mb < 1e3) return `${Math.round(x_mb)}M`;
  if (x_mb < 1e6) return `${Math.round(x_mb / 1e3)}G`;
  if (x_mb < 1e9) return `${Math.round(x_mb / 1e6)}T`;
  return ">1P";
}

function mean_na_rm(x) {
  return d3.mean(x.filter(d => !isNaN(d)));
}

aggregate_scores = ƒ(obj)

transpose_list_of_objects = ƒ(list)

label_time = ƒ(time)

label_memory = ƒ(…)

mean_na_rm = ƒ(x)

poss_dataset_ids = dataset_info
  .map(d => d.dataset_id)
  .filter(d => results.map(r => r.dataset_id).includes(d))
poss_method_ids = method_info
  .map(d => d.method_id)
  .filter(d => results.map(r => r.method_id).includes(d))
poss_metric_ids = metric_info
  .map(d => d.metric_id)
  .filter(d => results.map(r => Object.keys(r.scaled_scores)).flat().includes(d))

poss_dataset_ids = Array(4) ["mouse_hspc_nestorowa2016", "olsson_2016_mouse_blood", "tenx_5k_pbmc", "zebrafish_labs"]

poss_method_ids = Array(26) ["densmap_logCP10k", "densmap_logCP10k_1kHVG", "densmap_pca_logCP10k", "densmap_pca_logCP10k_1kHVG", "diffusion_map", "neuralee_default", "neuralee_logCP10k_1kHVG", "pca_logCP10k", "pca_logCP10k_1kHVG", "phate_default", "phate_logCP10k", "phate_logCP10k_1kHVG", "phate_sqrt", "pymde_distances_log_cp10k", "pymde_distances_log_cp10k_hvg", "pymde_neighbors_log_cp10k", "pymde_neighbors_log_cp10k_hvg", "random_features", "spectral_features", "true_features", …]

poss_metric_ids = Array(10) ["continuity", "density_preservation", "distance_correlation", "distance_correlation_spectral", "lcmc", "qglobal", "qlocal", "qnn", "qnn_auc", "trustworthiness"]

has_resources = results[0].hasOwnProperty("resources")
has_exit_codes = results[0].hasOwnProperty("exit_codes")

results_long = results.flatMap(d => {
  return Object.entries(d.scaled_scores).map(([metric_id, value]) =>
    ({
      method_id: d.method_id,
      dataset_id: d.dataset_id,
      metric_id: metric_id,
      score: value
    })
  )
}).filter(d => method_ids.includes(d.method_id) && metric_ids.includes(d.metric_id) && dataset_ids.includes(d.dataset_id))

overall = d3.groups(results_long, d => d.method_id)
  .map(([method_id, values]) => ({method_id, mean_score: aggregate_scores(values)}))

per_dataset = d3.groups(results_long, d => d.method_id)
  .map(([method_id, values]) => {
    const datasets = d3.groups(values, d => d.dataset_id)
      .map(([dataset_id, values]) => ({["dataset_" + dataset_id]: aggregate_scores(values)}))
      .reduce((a, b) => ({...a, ...b}), {})
    return {method_id, ...datasets}
  })

per_metric = d3.groups(results_long, d => d.method_id)
  .map(([method_id, values]) => {
    const metrics = d3.groups(values, d => d.metric_id)
      .map(([metric_id, values]) => ({["metric_" + metric_id]: aggregate_scores(values)}))
      .reduce((a, b) => ({...a, ...b}), {})
    return {method_id, ...metrics}
  })

results_resources = {
  let results_resources = null

  if (has_resources) {
    results_resources = results.flatMap(d => {
      return ({
        method_id: d.method_id,
        dataset_id: d.dataset_id,
        ...d.resources
      })
    }).filter(d => method_ids.includes(d.method_id) && dataset_ids.includes(d.dataset_id))
  }

  return results_resources
}

resources = {
  let resources = null

  if (has_resources) {
    resources = d3.groups(results_resources, d => d.method_id)
      .map(([method_id, values]) => {
        const mean_peak_memory_mb = mean_na_rm(values.map(d => d.peak_memory_mb))
        const mean_disk_read_mb = mean_na_rm(values.map(d => d.disk_read_mb))
        const mean_disk_write_mb = mean_na_rm(values.map(d => d.disk_write_mb))
        const mean_duration_sec = mean_na_rm(values.map(d => d.duration_sec))

        return ({
          method_id,
          mean_cpu_pct: mean_na_rm(values.map(d => d.cpu_pct)),
          mean_peak_memory_mb,
          mean_peak_memory_log: -Math.log10(mean_peak_memory_mb),
          mean_peak_memory_str: " " + label_memory(mean_peak_memory_mb) + " ",
          mean_disk_read_mb: mean_na_rm(values.map(d => d.disk_read_mb)),
          mean_disk_read_log: -Math.log10(mean_disk_read_mb),
          mean_disk_read_str: " " + label_memory(mean_disk_read_mb) + " ",
          mean_disk_write_mb: mean_na_rm(values.map(d => d.disk_write_mb)),
          mean_disk_write_log: -Math.log10(mean_disk_write_mb),
          mean_disk_write_str: " " + label_memory(mean_disk_write_mb) + " ",
          mean_duration_sec,
          mean_duration_log: -Math.log10(mean_duration_sec),
          mean_duration_str: " " + label_time(mean_duration_sec) + " "
        })
      })
  }

  return resources
}

exit_codes = {
  let exit_codes = null

  if (has_exit_codes) {
    exit_codes = results.flatMap(d => {
      return ({
        method_id: d.method_id,
        dataset_id: d.dataset_id,
        exit_codes: Object.values(d.exit_codes)
      })
    }).filter(d => method_ids.includes(d.method_id) && dataset_ids.includes(d.dataset_id))
  } else {
    exit_codes = results_resources.flatMap(d => {
      let exit_code = d.exit_code
      if (exit_code === undefined) {
        // If there is not exit code, assume the method ran successfully
        exit_code = 0
      }

      return ({
        method_id: d.method_id,
        dataset_id: d.dataset_id,
        exit_codes: [exit_code]
      })
    }).filter(d => method_ids.includes(d.method_id) && dataset_ids.includes(d.dataset_id))
  }

  return exit_codes
}

error_reasons = d3.groups(exit_codes, d => d.method_id)
  .map(([method_id, values]) => {
    const all_codes = values.flatMap(d => d.exit_codes)

    if (all_codes.length === 0) {
      return {method_id, error_reason: []}
    }

    const error_pct_oom = d3.mean(all_codes, d => d === 137)
    const error_pct_timeout = d3.mean(all_codes, d => d === 143)
    const error_pct_na = d3.mean(all_codes, d => d === 99)
    const error_pct_error = d3.mean(all_codes, d => d > 0) - error_pct_oom - error_pct_timeout - error_pct_na
    const error_pct_unknown = d3.mean(all_codes, d => d < 0)
    const error_pct_ok = d3.mean(all_codes, d => d === 0)
    return ({
      method_id,
      error_reason: [
        error_pct_oom,
        error_pct_timeout,
        error_pct_error,
        error_pct_unknown,
        error_pct_na,
        error_pct_ok
      ],
    })
  })

summary_all = method_info
  .filter(d => show_con || !d.is_baseline)
  .filter(d => method_ids.includes(d.method_id))
  .map(method => {
    const method_id = method.method_id
    const method_name = method.method_name
    const mean_score = overall.find(d => d.method_id === method_id).mean_score
    const datasets = per_dataset.find(d => d.method_id === method_id)
    const metrics = per_metric.find(d => d.method_id === method_id)
    const error_reasons_ = error_reasons.find(d => d.method_id === method_id)

    let summary = {
      method_id,
      method_name,
      mean_score,
      ...datasets,
      ...metrics,
      ...error_reasons_
    }

    if (has_resources) {
      const resources_ = resources.find(d => d.method_id === method_id)
      summary = {...summary, ...resources_}
    }
    return summary
  })
  .sort((a, b) => b.mean_score - a.mean_score)

// make sure the first entry contains all columns
column_info = {
  let column_info = [
    {
      id: "method_name",
      name: "Name",
      label: null,
      group: "method",
      geom: "text",
      palette: null
    },
    {
      id: "mean_score",
      name: "Score",
      group: "overall",
      geom: "bar",
      palette: "overall"
    },
    {
      id: "error_reason",
      name: "Error reason",
      group: "overall",
      geom: "pie",
      palette: "error_reason"
    },
    ...dataset_info
      .filter(d => dataset_ids.includes(d.dataset_id))
      .map(
        d => ({
          id: "dataset_" + d.dataset_id,
          name: d.dataset_name,
          group: "dataset",
          geom: "funkyrect",
          palette: "dataset"
        })
      )
      .sort((a, b) => a.name.localeCompare(b.name)),
    ...metric_info
      .filter(d => metric_ids.includes(d.metric_id))
      .map(
        d => ({
          id: "metric_" + d.metric_id,
          name: d.metric_name,
          group: "metric",
          geom: "funkyrect",
          palette: "metric"
        })
      )
      .sort((a, b) => a.name.localeCompare(b.name)),
  ]

  if (has_resources) {
    column_info.push(
      {
        id: "mean_cpu_pct",
        name: "%CPU",
        group: "resources",
        geom: "funkyrect",
        palette: "resources"
      },
      {
        id: "mean_peak_memory_log",
        name: "Peak memory",
        label: "mean_peak_memory_str",
        group: "resources",
        geom: "rect",
        palette: "resources"
      },
      {
        id: "mean_disk_read_log",
        name: "Disk read",
        label: "mean_disk_read_str",
        group: "resources",
        geom: "rect",
        palette: "resources"
      },
      {
        id: "mean_disk_write_log",
        name: "Disk write",
        label: "mean_disk_write_str",
        group: "resources",
        geom: "rect",
        palette: "resources"
      },
      {
        id: "mean_duration_log",
        name: "Duration",
        label: "mean_duration_str",
        group: "resources",
        geom: "rect",
        palette: "resources"
      }
    )
  }

  column_info = column_info.map(d => {
    if (d.id === "method_name") {
      return {...d, options: {width: 15, hjust: 0}}
    } else if (d.id === "is_baseline") {
      return {...d, options: {width: 1}}
    } else if (d.geom === "bar") {
      return {...d, options: {width: 4}}
    } else {
      return d
    }
  })

  return column_info
}

column_groups = {
  let column_groups = [
    {
      group: "method",
      palette: null,
      level1: ""
    },
    {
      group: "overall",
      palette: "overall",
      level1: "Overall"
    },
    {
      group: "error_reason",
      palette: "error_reason",
      level1: "Error reason"
    },
    {
      group: "dataset",
      palette: "dataset",
      level1: dataset_info.length >= 3 ? "Datasets" : ""
    },
    {
      group: "metric",
      palette: "metric",
      level1: metric_info.length >= 3 ? "Metrics" : ""
    }
  ]

  if (has_resources) {
    column_groups.push(
      {group: "resources", palette: "resources", level1: "Resources"}
    )
  }

  return column_groups
}

palettes = [
  {
    overall: "Greys",
    dataset: "Blues",
    metric: "Reds",
    resources: "YlOrBr",
    error_reason: {
      colors: ["#8DD3C7", "#FFFFB3", "#BEBADA", "#fdb462", "#999999", "#FFFFFF"],
      names: [
        "Memory limit exceeded",
        "Time limit exceeded",
        "Execution error",
        "Unknown error",
        "Not applicable",
        "No error"
      ]
    }
  }
][0]

has_resources = true

has_exit_codes = false

results_long = Array(884) [Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, …]

overall = Array(26) [Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, …]

per_dataset = Array(26) [Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, …]

per_metric = Array(26) [Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, …]

results_resources = Array(104) [Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, …]

resources = Array(26) [Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, …]

exit_codes = Array(104) [Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, …]

error_reasons = Array(26) [Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, …]

summary_all = Array(26) [Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, …]

column_info = Array(22) [Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, Object, …]

column_groups = Array(6) [Object, Object, Object, Object, Object, Object]

palettes = Object {overall: "Greys", dataset: "Blues", metric: "Reds", resources: "YlOrBr", error_reason: Object}

funkyheatmap(
    transpose_list_of_objects(summary_all),
    transpose_list_of_objects(column_info),
    [],
    transpose_list_of_objects(column_groups),
    [],
    palettes,
    {
        fontSize: 14,
        rowHeight: 26,
        rootStyle: 'max-width: none',
        colorByRank: color_by_rank,
        theme: {
            oddRowBackground: 'var(--bs-body-bg)',
            evenRowBackground: 'var(--bs-button-hover)',
            textColor: 'var(--bs-body-color)',
            strokeColor: 'var(--bs-body-color)',
            headerColor: 'var(--bs-body-color)',
            hoverColor: 'var(--bs-body-color)'
        }
    },
    scale_column
);

Figure 1: Overview of the results per method. This figures shows the mean of the scaled scores (group Overall), the mean scores per dataset (group Dataset) and the mean scores per metric (group Metric).

Display settings

viewof color_by_rank = Inputs.toggle({label: "Color by rank:", value: true})
viewof scale_column = Inputs.toggle({label: "Minmax column:", value: false})
viewof show_con = Inputs.toggle({label: "Show control methods:", value: true})

color_by_rank = true

scale_column = false

show_con = true

Filter datasets

viewof dataset_ids = Inputs.checkbox(
  dataset_info.filter(d => poss_dataset_ids.includes(d.dataset_id)),
  {
    keyof: d => d.dataset_name,
    valueof: d => d.dataset_id,
    value: dataset_info.map(d => d.dataset_id),
    label: "Datasets:"
  }
)

dataset_ids = Array(4) ["mouse_hspc_nestorowa2016", "olsson_2016_mouse_blood", "tenx_5k_pbmc", "zebrafish_labs"]

Filter methods

viewof method_ids = Inputs.checkbox(
  method_info.filter(d => poss_method_ids.includes(d.method_id)),
  {
    keyof: d => d.method_name,
    valueof: d => d.method_id,
    value: method_info.map(d => d.method_id),
    label: "Methods:"
  }
)

Filter metrics

viewof metric_ids = Inputs.checkbox(
  metric_info.filter(d => poss_metric_ids.includes(d.metric_id)),
  {
    keyof: d => d.metric_name,
    valueof: d => d.metric_id,
    value: metric_info.map(d => d.metric_id),
    label: "Metrics:"
  }
)

funkyheatmap = (await require('d3@7').then(d3 => {
  window.d3 = d3;
  window._ = _;
  return import('https://unpkg.com/funkyheatmapjs@0.2.5');
})).default;

Results

Results table of the scores per method, dataset and metric (after scaling). Use the filters to make a custom subselection of methods and datasets. The “Overall mean” dataset is the mean value across all datasets.

Filters Active - 1


densMAP (logCP10k)5
densMAP (logCP10k, 1kHVG)5
densMAP PCA (logCP10k)5
densMAP PCA (logCP10k, 1kHVG)5
Diffusion maps5
NeuralEE (CPU) (Default)5
NeuralEE (CPU) (logCP10k, 1kHVG)5
PCA (logCP10k)5
PCA (logCP10k, 1kHVG)5
PHATE (default)5
PHATE (gamma=0)5
PHATE (logCP10k)5
PHATE (logCP10k, 1kHVG)5
PyMDE Preserve Distances (logCP10k)5
PyMDE Preserve Distances (logCP10k, 1kHVG)5
PyMDE Preserve Neighbors (logCP10k)5
PyMDE Preserve Neighbors (logCP10k, 1kHVG)5
t-SNE (logCP10k)5
t-SNE (logCP10k, 1kHVG)5
UMAP (logCP10k)5
UMAP (logCP10k, 1kHVG)5
UMAP PCA (logCP10k)5
UMAP PCA (logCP10k, 1kHVG)5


5k Peripheral blood mononuclear cells23
Mouse hematopoietic stem cell differentiation23
Mouse myeloid lineage differentiation23
Overall mean23
Zebrafish23

Method	Dataset	Mean score	continuity	Density preservation	Distance correlation	Distance correlation (spectral)	local continuity meta criterion	global property	local property	co-KNN size	co-KNN AUC	trustworthiness	Runtime (s)	CPU (%)	Memory (GB)

Method	Dataset	Mean score	continuity	Density preservation	Distance correlation	Distance correlation (spectral)	local continuity meta criterion	global property	local property	co-KNN size	co-KNN AUC	trustworthiness	Runtime (s)	CPU (%)	Memory (GB)
densMAP (logCP10k)	Overall mean	0.43	0.65	0.65	0.56	-0.03	0.15	0.53	0.27	0.17	0.49	0.79	859	144	2.07
densMAP PCA (logCP10k)	Overall mean	0.37	0.62	0.47	0.35	-0.03	0.15	0.51	0.28	0.17	0.47	0.84	600	162	0.97
PyMDE Preserve Distances (logCP10k)	Overall mean	0.35	0.59	0.29	0.59	0.03	0.08	0.52	0.20	0.10	0.47	0.75	1,076	399	19.00
densMAP PCA (logCP10k, 1kHVG)	Overall mean	0.34	0.60	0.33	0.29	0.05	0.13	0.49	0.26	0.15	0.45	0.81	509	78	0.79
t-SNE (logCP10k)	Overall mean	0.34	0.62	0.12	0.36	-0.01	0.16	0.51	0.26	0.18	0.48	0.85	862	293	1.19
densMAP (logCP10k, 1kHVG)	Overall mean	0.33	0.60	0.38	0.28	-0.02	0.12	0.49	0.24	0.14	0.45	0.80	610	79	0.84
NeuralEE (CPU) (logCP10k, 1kHVG)	Overall mean	0.33	0.62	0.15	0.36	0.10	0.12	0.52	0.25	0.14	0.47	0.78	751	246	0.97
PyMDE Preserve Neighbors (logCP10k, 1kHVG)	Overall mean	0.32	0.61	0.01	0.34	0.08	0.13	0.51	0.26	0.15	0.47	0.81	691	168	1.00
t-SNE (logCP10k, 1kHVG)	Overall mean	0.32	0.61	0.07	0.29	0.09	0.13	0.50	0.25	0.15	0.46	0.82	750	192	1.12
PyMDE Preserve Neighbors (logCP10k)	Overall mean	0.32	0.62	0.01	0.36	-0.02	0.15	0.51	0.27	0.17	0.47	0.84	852	230	1.18
UMAP (logCP10k)	Overall mean	0.32	0.65	0.09	0.42	-0.06	0.14	0.52	0.26	0.16	0.47	0.79	748	145	2.02
UMAP PCA (logCP10k)	Overall mean	0.31	0.60	0.05	0.36	-0.06	0.15	0.51	0.27	0.16	0.47	0.84	547	196	0.97
PHATE (logCP10k, 1kHVG)	Overall mean	0.31	0.60	-0.02	0.33	0.07	0.12	0.51	0.26	0.13	0.46	0.80	461	256	0.86
UMAP PCA (logCP10k, 1kHVG)	Overall mean	0.31	0.61	-0.00	0.32	0.06	0.12	0.50	0.25	0.14	0.46	0.82	542	79	0.78
PHATE (gamma=0)	Overall mean	0.30	0.60	0.03	0.30	-0.04	0.12	0.50	0.25	0.14	0.46	0.83	560	264	1.03
PCA (logCP10k, 1kHVG)	Overall mean	0.29	0.59	0.16	0.27	0.09	0.12	0.50	0.24	0.14	0.45	0.74	371	42	0.74
UMAP (logCP10k, 1kHVG)	Overall mean	0.28	0.61	-0.02	0.28	-0.01	0.12	0.50	0.24	0.14	0.45	0.81	455	95	0.81
PyMDE Preserve Distances (logCP10k, 1kHVG)	Overall mean	0.28	0.53	0.22	0.33	0.12	0.07	0.51	0.20	0.09	0.45	0.73	921	334	18.99
NeuralEE (CPU) (Default)	Overall mean	0.28	0.60	-0.06	0.35	-0.01	0.13	0.51	0.25	0.15	0.47	0.79	736	199	5.18
PCA (logCP10k)	Overall mean	0.28	0.60	0.03	0.43	-0.14	0.11	0.50	0.24	0.13	0.46	0.77	349	90	0.89
PHATE (default)	Overall mean	0.28	0.60	0.03	0.27	-0.09	0.12	0.49	0.24	0.14	0.45	0.82	471	173	1.03
PHATE (logCP10k)	Overall mean	0.27	0.62	-0.07	0.30	-0.08	0.13	0.50	0.26	0.14	0.46	0.82	565	202	0.88
Diffusion maps	Overall mean	0.23	0.36	-0.06	0.35	0.04	0.06	0.51	0.21	0.08	0.45	0.75	656	242	2.01

Dataset info

Show

Mouse hematopoietic stem cell differentiation

Source dataset · Data source

1.6k hematopoietic stem and progenitor cells from mouse bone marrow. Sequenced by Smart-seq2. 1920 cells x 43258 features with 3 cell type labels (Nestorowa et al. 2016).

Mouse myeloid lineage differentiation

Source dataset · Data source

Myeloid lineage differentiation from mouse blood. Sequenced by SMARTseq in 2016 by Olsson et al. 660 cells x 112815 features with 4 cell type labels (Olsson et al. 2016).

5k Peripheral blood mononuclear cells

Source dataset · Data source

5k Peripheral Blood Mononuclear Cells (PBMCs) from a healthy donor. Sequenced on 10X v3 chemistry in July 2019 by 10X Genomics. 5247 cells x 20822 features with no cell type labels (10x Genomics 2019).

Zebrafish

Source dataset · Data source

90k cells from zebrafish embryos throughout the first day of development, with and without a knockout of chordin, an important developmental gene. Dimensions: 26022 cells, 25258 genes. 24 cell types (avg. 1084±1156 cells per cell type) (Wagner et al. 2018).

Method info

Show

densMAP (logCP10k)

Repository · Source Code · Container · v1.0.0

densMAP is a modification of UMAP that adds an extra cost term in order to preserve information about the relative local density of the data. It is performed on the same inputs as UMAP (Narayan, Berger, and Cho 2021)

Diffusion maps uses an affinity matrix to describe the similarity between data points, which is then transformed into a graph Laplacian. The eigenvalue-weighted eigenvectors of the graph Laplacian are then used to create the embedding. Diffusion maps is calculated on the logCPM expression matrix (Coifman and Lafon 2006)

NeuralEE (CPU) (Default)

Repository · Source Code · Container · v1.0.0

NeuralEE is a neural network implementation of elastic embedding. It is a non-linear method that preserves pairwise distances between data points. NeuralEE uses a neural network to optimize an objective function that measures the difference between pairwise distances in the original high-dimensional space and the two-dimensional space. It is computed on both the recommended input from the package authors of 500 HVGs selected from a logged expression matrix (without sequencing depth scaling) and the default logCPM matrix with 1000 HVGs (Xiong et al. 2020)

NeuralEE (CPU) (logCP10k, 1kHVG)

Repository · Source Code · Container · v1.0.0

PCA (logCP10k)

Repository · Source Code · Container · v1.0.0

PCA or “Principal Component Analysis” is a linear method that finds orthogonal directions in the data that capture the most variance. The first two principal components are chosen as the two-dimensional embedding. We select only the first two principal components as the two-dimensional embedding. PCA is calculated on the logCPM expression matrix with and without selecting 1000 HVGs (Pearson 1901)

PCA (logCP10k, 1kHVG)

Repository · Source Code · Container · v1.0.0

PHATE (default)

Repository · Source Code · Container · v1.0.0

PHATE or “Potential of Heat - diffusion for Affinity - based Transition Embedding” uses the potential of heat diffusion to preserve trajectories in a dataset via a diffusion process. It is an affinity - based method that creates an embedding by finding the dominant eigenvalues of a Markov transition matrix. We evaluate several variants including using the recommended square - root transformed CPM matrix as input, this input with the gamma parameter set to zero and the normal logCPM transformed matrix with and without HVG selection (Moon et al. 2019)

PHATE (logCP10k, 1kHVG)

Repository · Source Code · Container · v1.0.0

PHATE (logCP10k)

Repository · Source Code · Container · v1.0.0

PHATE (gamma=0)

Repository · Source Code · Container · v1.0.0

PyMDE Preserve Distances (logCP10k)

Repository · Source Code · Container · v1.0.0

PyMDE is a Python implementation of minimum-distortion embedding. It is a non-linear method that preserves distances between cells or neighborhoods in the high-dimensional space. It is computed with options to preserve distances between cells or neighbourhoods and with the logCPM matrix with and without HVG selection as input (Agrawal, Ali, and Boyd 2021)

PyMDE Preserve Distances (logCP10k, 1kHVG)

Repository · Source Code · Container · v1.0.0

PyMDE Preserve Neighbors (logCP10k)

Repository · Source Code · Container · v1.0.0

PyMDE Preserve Neighbors (logCP10k, 1kHVG)

Repository · Source Code · Container · v1.0.0

t-SNE (logCP10k)

Repository · Source Code · Container · v1.0.0

t-SNE or t-distributed Stochastic Neighbor Embedding converts similarities between data points to joint probabilities and tries to minimize the Kullback-Leibler divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data. We use the implementation in the scanpy package with the result of PCA on the logCPM expression matrix (with and without HVG selection) (van der Maaten and Hinton 2008)

t-SNE (logCP10k, 1kHVG)

Repository · Source Code · Container · v1.0.0

UMAP (logCP10k)

Repository · Source Code · Container · v1.0.0

UMAP or Uniform Manifold Approximation and Projection is an algorithm for dimension reduction based on manifold learning techniques and ideas from topological data analysis. We perform UMAP on the logCPM expression matrix before and after HVG selection and with and without PCA as a pre-processing step (McInnes, Healy, and Melville 2018)

UMAP (logCP10k, 1kHVG)

Repository · Source Code · Container · v1.0.0

UMAP PCA (logCP10k)

Repository · Source Code · Container · v1.0.0

UMAP PCA (logCP10k, 1kHVG)

Repository · Source Code · Container · v1.0.0

Control method info

Show

Random Features

Repository · Source Code · Container · v1.0.0

Randomly generated two-dimensional coordinates from a normal distribution (Open Problems for Single Cell Analysis Consortium 2022)

Spectral Features

Repository · Source Code · Container · v1.0.0

Use 1000-dimensional diffusions maps as an embedding (Open Problems for Single Cell Analysis Consortium 2022)

True Features

Repository · Source Code · Container · v1.0.0

Use of the original feature inputs as the ‘embedding’ (Open Problems for Single Cell Analysis Consortium 2022)

Metric info

Show

continuity

Source code · Container

Continuity measures error of hard extrusions based on nearest neighbor coranking (Zhang, Shang, and Zhang 2021).

Density preservation

Source code · Container

Similarity between local densities in the high-dimensional data and the reduced data (Narayan, Berger, and Cho 2021).

Distance correlation

Source code · Container

Spearman correlation between all pairwise Euclidean distances in the original and dimension-reduced data (Schober, Boer, and Schwarte 2018).

Distance correlation (spectral)

Source code · Container

Spearman correlation between all pairwise diffusion distances in the original and dimension-reduced data (Coifman and Lafon 2006).

local continuity meta criterion

Source code · Container

The local continuity meta criterion is the co-KNN size with baseline removal which favors locality (Zhang, Shang, and Zhang 2021).

global property

Source code · Container

The global property metric is a summary of the global co-KNN (Zhang, Shang, and Zhang 2021).

local property

Source code · Container

The local property metric is a summary of the local co-KNN (Zhang, Shang, and Zhang 2021).

co-KNN size

Source code · Container

co-KNN size counts how many points are in both k-nearest neighbors before and after the dimensionality reduction (Zhang, Shang, and Zhang 2021).

co-KNN AUC

Source code · Container

co-KNN AUC is area under the co-KNN curve (Zhang, Shang, and Zhang 2021).

trustworthiness

Source code · Container

a measurement of similarity between the rank of each point’s nearest neighbors in the high-dimensional data and the reduced data (Venna and Kaski 2001).

Quality control results

Show

Category	Name	Value	Condition	Severity
Raw results	Dataset 'zebrafish_labs' %missing	0.60	pct_missing <= .1	✗✗✗
Raw results	Metric 'continuity' %missing	0.25	pct_missing <= .1	✗✗
Raw results	Metric 'lcmc' %missing	0.25	pct_missing <= .1	✗✗
Raw results	Metric 'qglobal' %missing	0.25	pct_missing <= .1	✗✗
Raw results	Metric 'qlocal' %missing	0.25	pct_missing <= .1	✗✗
Raw results	Metric 'qnn' %missing	0.25	pct_missing <= .1	✗✗
Raw results	Metric 'qnn_auc' %missing	0.25	pct_missing <= .1	✗✗
Raw results	Method 'densmap_logCP10k' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'densmap_logCP10k_1kHVG' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'densmap_pca_logCP10k' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'densmap_pca_logCP10k_1kHVG' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'diffusion_map' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'neuralee_default' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'neuralee_logCP10k_1kHVG' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'pca_logCP10k' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'pca_logCP10k_1kHVG' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'phate_default' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'phate_logCP10k' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'phate_logCP10k_1kHVG' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'phate_sqrt' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'pymde_distances_log_cp10k' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'pymde_distances_log_cp10k_hvg' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'pymde_neighbors_log_cp10k' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'pymde_neighbors_log_cp10k_hvg' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'random_features' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'spectral_features' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'true_features' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'tsne_logCP10k' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'tsne_logCP10k_1kHVG' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'umap_logCP10k' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'umap_logCP10k_1kHVG' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'umap_pca_logCP10k' %missing	0.15	pct_missing <= .1	✗
Raw results	Method 'umap_pca_logCP10k_1kHVG' %missing	0.15	pct_missing <= .1	✗

Normalisation visualisation

Show

Warning: Removed 1 row containing missing values (`geom_path()`).

Warning: Removed 156 rows containing missing values (`geom_point()`).

Authors

Luke Zappia (maintainer, author) ,
Michael Vinyard (author)
Michal Klein (author)
Scott Gigante (author) ,
Ben DeMeo (author)
Robrecht Cannoodt (author) ,
Kai Waldrant (contributor) ,
Sai Nirmayi Yasa (contributor) ,
Juan A. Cordero Varela (contributor) ,

References

10x Genomics. 2019. “5k Peripheral Blood Mononuclear Cells (PBMCs) from a Healthy Donor with a Panel of TotalSeq-b Antibodies (V3 Chemistry).” https://www.10xgenomics.com/resources/datasets/5-k-peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-with-cell-surface-proteins-v-3-chemistry-3-1-standard-3-1-0.

Agrawal, Akshay, Alnur Ali, and Stephen Boyd. 2021. “Minimum-Distortion Embedding.” Foundations and Trends in Machine Learning 14 (3): 211–378. https://doi.org/10.1561/2200000090.

Coifman, Ronald R., and Stéphane Lafon. 2006. “Diffusion Maps.” Applied and Computational Harmonic Analysis 21 (1): 5–30. https://doi.org/10.1016/j.acha.2006.04.006.

McInnes, Leland, John Healy, and James Melville. 2018. “UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction.” arXiv. https://doi.org/10.48550/arxiv.1802.03426.

Moon, Kevin R., David van Dijk, Zheng Wang, Scott Gigante, Daniel B. Burkhardt, William S. Chen, Kristina Yim, et al. 2019. “Visualizing Structure and Transitions in High-Dimensional Biological Data.” Nature Biotechnology 37 (12): 1482–92. https://doi.org/10.1038/s41587-019-0336-3.

Narayan, Ashwin, Bonnie Berger, and Hyunghoon Cho. 2021. “Assessing Single-Cell Transcriptomic Variability Through Density-Preserving Data Visualization.” Nature Biotechnology 39 (6): 765–74. https://doi.org/10.1038/s41587-020-00801-7.

Nestorowa, Sonia, Fiona K. Hamey, Blanca Pijuan Sala, Evangelia Diamanti, Mairi Shepherd, Elisa Laurenti, Nicola K. Wilson, David G. Kent, and Berthold Göttgens. 2016. “A Single-Cell Resolution Map of Mouse Hematopoietic Stem and Progenitor Cell Differentiation.” Blood 128 (8): e20–31. https://doi.org/10.1182/blood-2016-05-716480.

Olsson, Andre, Meenakshi Venkatasubramanian, Viren K. Chaudhri, Bruce J. Aronow, Nathan Salomonis, Harinder Singh, and H. Leighton Grimes. 2016. “Single-Cell Analysis of Mixed-Lineage States Leading to a Binary Cell Fate Choice.” Nature 537 (7622): 698–702. https://doi.org/10.1038/nature19348.

Open Problems for Single Cell Analysis Consortium. 2022. “Open Problems.” https://openproblems.bio.

Pearson, Karl. 1901. “On Lines and Planes of Closest Fit to Systems of Points in Space.” The London, Edinburgh, and Dublin Philosophical Magazine and Journal of Science 2 (11): 559–72. https://doi.org/10.1080/14786440109462720.

Schober, Patrick, Christa Boer, and Lothar A. Schwarte. 2018. “Correlation Coefficients.” Anesthesia & Analgesia 126 (5): 1763–68. https://doi.org/10.1213/ane.0000000000002864.

van der Maaten, Laurens, and Geoffrey Hinton. 2008. “Visualizing Data Using t-SNE.” Journal of Machine Learning Research 9 (86): 2579–2605. http://jmlr.org/papers/v9/vandermaaten08a.html.

Venna, Jarkko, and Samuel Kaski. 2001. “Neighborhood Preservation in Nonlinear Projection Methods: An Experimental Study.” In Artificial Neural Networks ICANN 2001, 485–91. Springer Berlin Heidelberg. https://doi.org/{10.1007/3-540-44668-0\_68}.

Wagner, Daniel E., Caleb Weinreb, Zach M. Collins, James A. Briggs, Sean G. Megason, and Allon M. Klein. 2018. “Single-Cell Mapping of Gene Expression Landscapes and Lineage in the Zebrafish Embryo.” Science 360 (6392): 981–87. https://doi.org/10.1126/science.aar4362.

Xiong, Jiankang, Fuzhou Gong, Lin Wan, and Liang Ma. 2020. “NeuralEE: A GPU-Accelerated Elastic Embedding Dimensionality Reduction Method for Visualizing Large-Scale scRNA-Seq Data.” Frontiers in Genetics 11. https://doi.org/10.3389/fgene.2020.00786.

Zhang, Yinsheng, Qian Shang, and Guoming Zhang. 2021. “pyDRMetrics - a Python Toolkit for Dimensionality Reduction Quality Assessment.” Heliyon 7 (2): e06199. https://doi.org/10.1016/j.heliyon.2021.e06199.

Info

Summary

Results

Dataset info

Mouse hematopoietic stem cell differentiation

Mouse myeloid lineage differentiation

5k Peripheral blood mononuclear cells

Zebrafish

Method info

densMAP (logCP10k)

densMAP (logCP10k, 1kHVG)

densMAP PCA (logCP10k)

densMAP PCA (logCP10k, 1kHVG)

Diffusion maps

NeuralEE (CPU) (Default)

NeuralEE (CPU) (logCP10k, 1kHVG)

PCA (logCP10k)

PCA (logCP10k, 1kHVG)

PHATE (default)

PHATE (logCP10k, 1kHVG)

PHATE (logCP10k)

PHATE (gamma=0)

PyMDE Preserve Distances (logCP10k)

PyMDE Preserve Distances (logCP10k, 1kHVG)

PyMDE Preserve Neighbors (logCP10k)

PyMDE Preserve Neighbors (logCP10k, 1kHVG)

t-SNE (logCP10k)

t-SNE (logCP10k, 1kHVG)

UMAP (logCP10k)

UMAP (logCP10k, 1kHVG)

UMAP PCA (logCP10k)

UMAP PCA (logCP10k, 1kHVG)

Control method info

Random Features

Spectral Features

True Features

Metric info

continuity

Density preservation

Distance correlation

Distance correlation (spectral)

local continuity meta criterion

global property

local property

co-KNN size

co-KNN AUC

trustworthiness

Quality control results

Normalisation visualisation

Authors

References