Using sample representation methods#

In this tutorial, we will demostrate how to use patpy to build sample representation from single-cell transcriptomics data, and how to compare representations from different methods

Install the patpy package#

This package contains interface to sample representation methods as well as some handy analysis functions.

!pip install git+https://github.com/lueckenlab/patpy.git@main

Import packages#

import ehrapy as ep
import pandas as pd
import scanpy as sc
import patpy
import seaborn as sns
from plottable import ColumnDefinition, Table
from plottable.cmap import normed_cmap
from plottable.plots import bar
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
patpy.__version__
'0.9.0'

Read the data#

Here, we use COMBAT dataset. This dataset contains 783k cells from 140 COVID-19 patients and healthy donors.

ADATA_PATH = "/Users/vladimir.shitov/Documents/programming/pat_rep_benchmark/data/combat/combat_processed.h5ad"
adata = sc.read_h5ad(ADATA_PATH)
adata
AnnData object with n_obs × n_vars = 783677 × 3000
    obs: 'Annotation_cluster_id', 'Annotation_cluster_name', 'Annotation_minor_subset', 'Annotation_major_subset', 'Annotation_cell_type', 'GEX_region', 'QC_ngenes', 'QC_total_UMI', 'QC_pct_mitochondrial', 'QC_scrub_doublet_scores', 'TCR_chain_composition', 'TCR_clone_ID', 'TCR_clone_count', 'TCR_clone_proportion', 'TCR_contains_unproductive', 'TCR_doublet', 'TCR_chain_TRA', 'TCR_v_gene_TRA', 'TCR_d_gene_TRA', 'TCR_j_gene_TRA', 'TCR_c_gene_TRA', 'TCR_productive_TRA', 'TCR_cdr3_TRA', 'TCR_umis_TRA', 'TCR_chain_TRA2', 'TCR_v_gene_TRA2', 'TCR_d_gene_TRA2', 'TCR_j_gene_TRA2', 'TCR_c_gene_TRA2', 'TCR_productive_TRA2', 'TCR_cdr3_TRA2', 'TCR_umis_TRA2', 'TCR_chain_TRB', 'TCR_v_gene_TRB', 'TCR_d_gene_TRB', 'TCR_j_gene_TRB', 'TCR_c_gene_TRB', 'TCR_productive_TRB', 'TCR_chain_TRB2', 'TCR_v_gene_TRB2', 'TCR_d_gene_TRB2', 'TCR_j_gene_TRB2', 'TCR_c_gene_TRB2', 'TCR_productive_TRB2', 'TCR_cdr3_TRB2', 'TCR_umis_TRB2', 'BCR_umis_HC', 'BCR_contig_qc_HC', 'BCR_functionality_HC', 'BCR_v_call_HC', 'BCR_v_score_HC', 'BCR_j_call_HC', 'BCR_j_score_HC', 'BCR_junction_aa_HC', 'BCR_total_mut_HC', 'BCR_s_mut_HC', 'BCR_r_mut_HC', 'BCR_c_gene_HC', 'BCR_clone_per_replicate_HC', 'BCR_clone_global_HC', 'BCR_clonal_abundance_HC', 'BCR_locus_LC', 'BCR_umis_LC', 'BCR_contig_qc_LC', 'BCR_functionality_LC', 'BCR_v_call_LC', 'BCR_v_score_LC', 'BCR_j_call_LC', 'BCR_j_score_LC', 'BCR_junction_aa_LC', 'BCR_total_mut_LC', 'BCR_s_mut_LC', 'BCR_r_mut_LC', 'BCR_c_gene_LC', 'COMBAT_ID', 'scRNASeq_sample_ID', 'COMBAT_participant_timepoint_ID', 'Source', 'Age', 'Sex', 'Race', 'BMI', 'Hospitalstay', 'Death28', 'Institute', 'PreExistingHeartDisease', 'PreExistingLungDisease', 'PreExistingKidneyDisease', 'PreExistingDiabetes', 'PreExistingHypertension', 'PreExistingImmunocompromised', 'Smoking', 'Symptomatic', 'Requiredvasoactive', 'Respiratorysupport', 'SARSCoV2PCR', 'Outcome', 'TimeSinceOnset', 'Ethnicity', 'Tissue', 'DiseaseClassification', 'Pool_ID', 'Channel_ID', 'ifn_1_score', '_scvi_batch', '_scvi_labels', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_ids', 'feature_types', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'Institute', 'ObjectCreateDate', 'Source_colors', 'Technology', 'X_scpoli', '_scvi_manager_uuid', '_scvi_uuid', 'genome_annotation_version', 'hvg', 'log1p', 'pca', 'scpoli_distances', 'scpoli_parameters', 'scpoli_samples'
    obsm: 'X_pca', 'X_scANVI_batch', 'X_scANVI_sample', 'X_scVI_batch', 'X_scVI_sample', 'X_scpoli', 'X_umap', 'X_umap_source'
    varm: 'PCs'
    layers: 'X_raw_counts'

Set columns containing sample IDs, cell types and metadata#

sample_id_col = "scRNASeq_sample_ID"
cell_type_key = "cell_type"
samples_metadata_cols = ["Source", "Outcome", "Death28", "Institute", "Pool_ID"]

Currently, there is no such columns as “cell_type” in the data. But cell types are stored in the Annotation_major_subset column. Let’s rename it to cell_type for better readability.

adata.obs.rename(columns={"Annotation_major_subset": cell_type_key}, inplace=True)

Store metadata and calculate QC metrics#

metadata = adata.obs[samples_metadata_cols + [sample_id_col]].drop_duplicates()
metadata.set_index(sample_id_col, inplace=True)
metadata
Source Outcome Death28 Institute Pool_ID
scRNASeq_sample_ID
S00109-Ja001E-PBCa COVID_SEV 2.0 0 Oxford gPlexA
S00112-Ja003E-PBCa COVID_MILD 5.0 0 Oxford gPlexA
S00005-Ja005E-PBCa COVID_CRIT 2.0 0 Oxford gPlexA
S00061-Ja003E-PBCa COVID_SEV 4.0 0 Oxford gPlexA
S00056-Ja003E-PBCa COVID_SEV 3.0 0 Oxford gPlexA
... ... ... ... ... ...
S00065-Ja003E-PBCa COVID_CRIT 2.0 0 Oxford gPlexK
S00048-Ja003E-PBCa COVID_SEV 4.0 0 Oxford gPlexK
G05112-Ja005E-PBCa COVID_HCW_MILD 6.0 0 Oxford gPlexK
N00038-Ja001E-PBGa Sepsis NaN 0 Oxford gPlexK
U00501-Ua005E-PBUa Flu 2.0 0 St_Georges gPlexK

138 rows × 5 columns

cell_qc_metadata = patpy.pp.calculate_cell_qc_metrics(
    adata, sample_key=sample_id_col, cell_qc_vars=["QC_ngenes", "QC_pct_mitochondrial", "QC_scrub_doublet_scores"]
)
cell_qc_metadata
median_QC_ngenes median_QC_pct_mitochondrial median_QC_scrub_doublet_scores
scRNASeq_sample_ID
G05061-Ja005E-PBCa 1107.0 3.011159 0.050648
G05064-Ja005E-PBCa 975.0 1.332430 0.060894
G05073-Ja005E-PBCa 1141.0 2.422559 0.044530
G05077-Ja005E-PBCa 1125.0 2.946723 0.048490
G05078-Ja005E-PBCa 999.0 2.825308 0.052783
... ... ... ...
U00607-Ua005E-PBUa 1827.0 2.982509 0.043323
U00613-Ua005E-PBUa 1251.5 2.053083 0.036956
U00617-Ua005E-PBUa 1410.5 3.886215 0.057906
U00619-Ua005E-PBUa 1532.0 2.688350 0.041823
U00701-Ua005E-PBUa 1093.5 3.714650 0.063941

138 rows × 3 columns

n_genes_metadata = patpy.pp.calculate_n_cells_per_sample(adata, sample_id_col)
n_genes_metadata
n_cells
scRNASeq_sample_ID
S00052-Ja005E-PBCa 13918
H00054-Ha001E-PBGa 10938
H00067-Ha001E-PBGa 10781
N00023-Ja001E-PBGa 10484
H00053-Ha001E-PBGa 10458
... ...
U00607-Ua005E-PBUa 1021
U00613-Ua005E-PBUa 970
U00701-Ua005E-PBUa 872
U00601-Ua005E-PBUa 619
U00504-Ua005E-PBUa 161

138 rows × 1 columns

composition_metadata = patpy.pp.calculate_compositional_metrics(adata, sample_id_col, [cell_type_key], normalize_to=100)
composition_metadata
cell_type cell_type_B cell_type_CD4 cell_type_CD8 cell_type_DC cell_type_DN cell_type_DP cell_type_GDT cell_type_HSC cell_type_MAIT cell_type_Mast cell_type_NK cell_type_PB cell_type_PLT cell_type_RET cell_type_cMono cell_type_iNKT cell_type_ncMono
scRNASeq_sample_ID
G05061-Ja005E-PBCa 6.324900 33.921438 12.366844 1.597870 0.532623 0.499334 0.898802 0.066578 4.677097 0.000000 18.159121 0.316245 0.166445 0.016644 15.812250 0.033289 4.610519
G05064-Ja005E-PBCa 3.405158 47.147482 16.400581 1.819806 1.228090 0.725689 2.188233 0.022329 1.317405 0.000000 7.457854 0.446578 0.000000 0.000000 14.357486 0.000000 3.483309
G05073-Ja005E-PBCa 5.194338 45.609405 16.278791 1.487524 1.247601 0.839731 4.654511 0.011996 2.195298 0.011996 3.730806 0.203935 0.047985 0.000000 13.963532 0.083973 4.438580
G05077-Ja005E-PBCa 5.846211 29.231056 14.596909 1.377770 0.446844 1.340532 0.465463 0.167567 0.800596 0.018619 22.844908 1.079873 0.074474 0.018619 18.004096 0.055856 3.630609
G05078-Ja005E-PBCa 1.366381 39.000106 15.591569 2.340854 0.762631 0.730855 2.648025 0.211842 1.737104 0.021184 10.666243 0.148289 0.021184 0.000000 19.521237 0.497829 4.734668
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
U00607-Ua005E-PBUa 3.623898 10.577865 1.273262 4.701273 0.097943 0.195886 0.195886 0.685602 0.097943 0.000000 5.876592 1.175318 1.958864 0.097943 37.904016 0.000000 31.537708
U00613-Ua005E-PBUa 7.835052 26.391753 16.907216 0.721649 0.412371 0.309278 1.649485 0.000000 0.103093 0.000000 5.154639 1.237113 0.206186 0.000000 37.938144 0.000000 1.134021
U00617-Ua005E-PBUa 2.977233 41.418564 16.462347 0.437828 0.525394 0.262697 0.262697 0.963222 0.087566 0.000000 7.530648 17.513135 0.788091 0.087566 9.719790 0.000000 0.963222
U00619-Ua005E-PBUa 3.537618 35.077230 15.894370 0.896861 0.697559 0.298954 2.441455 1.145989 0.099651 0.099651 17.588440 2.840060 4.534131 0.000000 10.363727 0.000000 4.484305
U00701-Ua005E-PBUa 1.032110 15.596330 39.334862 1.261468 0.802752 0.458716 0.573394 0.114679 0.000000 0.000000 12.844037 0.000000 0.344037 0.229358 24.885321 0.000000 2.522936

138 rows × 17 columns

Merge metadata tables

metadata = pd.concat(
    [
        metadata,
        cell_qc_metadata.loc[metadata.index],
        n_genes_metadata.loc[metadata.index],
        composition_metadata.loc[metadata.index],
    ],
    axis=1,
)
metadata.shape
../_images/3c6e42d2351a31b9ea880c4957a4cdd76cf57e4348c2ebacc9f9caad4353d0a4.png

Quality control#

To reduce noise in the representations, we need to remove samples with too few cells:

adata = patpy.pp.filter_small_samples(adata, sample_key=sample_id_col, sample_size_threshold=100)
0 samples removed:

If necessary, we can also remove cell types with too few cells in at least one sample:

adata = patpy.pp.filter_small_cell_types(adata, sample_key=sample_id_col, cell_group_key=cell_type_key, cell_type_size_threshold=10)

Some methods require this filtering step prior to building representation. In this notebook, we’ll focus on simpler methods that can be used with only sample filtering.

Run a simple pseudobulk representation#

This method aggregates all cells from each sample into a single vector. It can be interpreted as an average cell from each sample. Any input space can be used specified by layer parameter. For example, to use count data, use layer="X" or layer="raw". We find batch-corrected space to be the best for the representation, so we use layer="X_scVI" below.

pseudobulk = patpy.tl.Pseudobulk(sample_key=sample_id_col, cell_group_key=cell_type_key, layer="X_scVI_batch")
pseudobulk.prepare_anndata(adata)

Calculate a matrix of distances between samples#

distances = pseudobulk.calculate_distance_matrix(force=True)
distances
array([[0.        , 2.13826083, 3.26751086, ..., 2.26537604, 3.47556017,
        3.50837059],
       [2.13826083, 0.        , 2.57066436, ..., 2.67332482, 3.12726692,
        2.3833747 ],
       [3.26751086, 2.57066436, 0.        , ..., 3.81414183, 3.50147504,
        2.11261287],
       ...,
       [2.26537604, 2.67332482, 3.81414183, ..., 0.        , 3.55756191,
        4.0558818 ],
       [3.47556017, 3.12726692, 3.50147504, ..., 3.55756191, 0.        ,
        2.70576771],
       [3.50837059, 2.3833747 , 2.11261287, ..., 4.0558818 , 2.70576771,
        0.        ]])

We can also see the pseudobulks for each sample:

pseudobulk.sample_representation
array([[-0.39708972, -0.1610353 , -0.60592145, ...,  0.04138801,
        -0.23706943,  0.03939089],
       [-0.16564418,  0.15338482, -0.00623349, ..., -0.36909145,
        -0.61890358, -0.26146826],
       [-0.91906726, -0.34432968, -0.03022187, ..., -1.03792512,
        -0.26926878,  0.30427763],
       ...,
       [ 0.07143331, -0.52035475, -0.04083514, ..., -0.01975986,
        -0.2641018 , -0.32993519],
       [-0.86681354, -0.71001476,  0.40946293, ...,  0.38672951,
         0.59467804,  0.03230282],
       [-0.98729014, -0.46418488,  0.28515279, ..., -0.19687848,
        -0.42034405,  0.18162856]])

We can then visualise sample representation using dimensionality reduction methods:

pseudobulk.plot_embedding(method="UMAP", metadata_cols=samples_metadata_cols);
../_images/28cd87a22835b3681eeb09461f573aeea1623536cff4a8882dff04a18d66a122.png

Other dimensionality reduction methods are also available

pseudobulk.plot_embedding(method="TSNE", metadata_cols=samples_metadata_cols);
../_images/8e9158dc7ba1b1c8df50b9d4cb6ce45dd8796d1068c43e2493151ba9b16dd1ec.png
pseudobulk.plot_embedding(method="MDS", metadata_cols=samples_metadata_cols);
../_images/066a51f3f96d167305e60a437d8ee6ebcd05c63aadb4d55232aeb2774618351c.png

Evaluate how well a covariate is represented#

Let’s try to classify “Outcome” based on the nearest neighbors for each sample in the representation:

pseudobulk.evaluate_representation(target="Outcome", method="knn", n_neighbors=5, task="classification")
{'score': 0.13977126662840952,
 'metric': 'f1_macro_calibrated',
 'n_unique': 6,
 'n_observations': 113,
 'method': 'knn'}

It doesn’t work too good. Now we can try to solve the ranking problem for the same covariate. It will train regressor and use a different metric for evaluation:

pseudobulk.evaluate_representation(target="Outcome", method="knn", n_neighbors=5, task="ranking")
{'score': 0.6103813851943187,
 'metric': 'spearman_r',
 'n_unique': 6,
 'n_observations': 113,
 'method': 'knn'}

Now let’s see how well Pool is represented. This is a technical covariate so we don’t want score to be high:

pseudobulk.evaluate_representation(target="Pool_ID", method="knn", n_neighbors=5, task="classification")
{'score': 0.13579292359838518,
 'metric': 'f1_macro_calibrated',
 'n_unique': 10,
 'n_observations': 138,
 'method': 'knn'}

Save the distances to adata to use later

adata.uns["pseudobulk_distances"] = distances
adata.uns["pseudobulk_samples"] = pseudobulk.samples
adata.uns["pseudobulk_UMAP"] = pseudobulk.embeddings["UMAP"]

Run cell type composition representation#

This representation is based on cell type composition differences. It is calculated as a difference between cell type proportions in each sample.

composition = patpy.tl.CellGroupComposition(sample_key=sample_id_col, cell_group_key=cell_type_key, layer="X_scVI")
composition.prepare_anndata(adata)
composition_distances = composition.calculate_distance_matrix(force=True)
composition.plot_embedding(method="UMAP", metadata_cols=samples_metadata_cols)
array([<Axes: xlabel='UMAP_0', ylabel='UMAP_1'>,
       <Axes: xlabel='UMAP_0', ylabel='UMAP_1'>,
       <Axes: xlabel='UMAP_0', ylabel='UMAP_1'>,
       <Axes: xlabel='UMAP_0', ylabel='UMAP_1'>,
       <Axes: xlabel='UMAP_0', ylabel='UMAP_1'>], dtype=object)
../_images/32bf43eca2ff933e03f3e38ca4d3c433af41e4e7bfcd1c842247fd6b552e21d2.png

We can also visualise cell type proportions for each sample:

composition.sample_representation.plot(kind="bar", stacked=True, figsize=(10, 8), width=0.8)
plt.xticks([])
plt.legend(loc=(1.05, 0), title="Cell type");
../_images/da85b264f2f02e1e83adab7141d11c306e6495e14f1f2bc030aaf8b6a61991d6.png

We can see that cell type composition reflects patient outcome worse than pseudobulk:

composition.evaluate_representation(target="Outcome", method="knn", n_neighbors=5, task="ranking")
{'score': 0.4739538892493282,
 'metric': 'spearman_r',
 'n_unique': 6,
 'n_observations': 113,
 'method': 'knn'}
adata.uns["composition_distances"] = composition_distances
adata.uns["composition_samples"] = composition.samples

Run PILOT representation#

PILOT is an Optimal Transport-based tool, which calculates distances between samples based on cell type proportion differences taking into account cell type similarities. Note that to run it, you need to install the dependencies additionally:

pip install patpy[pilot]

pilot = patpy.tl.PILOT(
    sample_key=sample_id_col,
    cell_group_key=cell_type_key,
    layer="X_scVI_batch",
    sample_state_col="Outcome",  # It is not used for distances calculation and doesn't really matter
)
pilot.prepare_anndata(adata)
pilot.calculate_distance_matrix()
array([[0.        , 0.14451829, 0.42457524, ..., 0.10953257, 0.21343671,
        0.36804272],
       [0.14451829, 0.        , 0.32280757, ..., 0.10045083, 0.13157015,
        0.25766875],
       [0.42457524, 0.32280757, 0.        , ..., 0.36857701, 0.23438838,
        0.08670158],
       ...,
       [0.10953257, 0.10045083, 0.36857701, ..., 0.        , 0.17271665,
        0.31134621],
       [0.21343671, 0.13157015, 0.23438838, ..., 0.17271665, 0.        ,
        0.21202688],
       [0.36804272, 0.25766875, 0.08670158, ..., 0.31134621, 0.21202688,
        0.        ]])
pilot.plot_embedding(method="UMAP", metadata_cols=samples_metadata_cols);
../_images/d69cd8503effd2414a8558a27b7ae298c7676cf64bb9826191e3d5bce4cca615.png
adata.uns["pilot_distances"] = pilot.calculate_distance_matrix()
adata.uns["pilot_samples"] = pilot.samples
adata.uns["pilot_UMAP"] = pilot.embeddings["UMAP"]

Let’s check the covariates:

pilot.evaluate_representation(target="Outcome", method="knn", n_neighbors=5, task="ranking")
{'score': 0.48835133146990367,
 'metric': 'spearman_r',
 'n_unique': 6,
 'n_observations': 113,
 'method': 'knn'}
pilot.evaluate_representation(target="Pool_ID", method="knn", n_neighbors=5, task="classification")
{'score': 0.01832203003010072,
 'metric': 'f1_macro_calibrated',
 'n_unique': 10,
 'n_observations': 138,
 'method': 'knn'}

Outcome is represented a bit worse, but Pool ID almost doesn’t affect the representation

Compare representations#

Let’s write a small function to put all the sample representations into a single object. It will make sure that the order of samples is identical and cluster the patients

def align_representations(adata, meta_adata, samples, methods):
    """
    Align representations of different methods to have the same order of samples.

    Additionally runs clustering with Leiden algorithm.
    """
    for method in methods:
        #     samples_to_take = np.isin(adata.uns[f"{method}_samples"], samples)
        representation_samples = adata.uns[f"{method}_samples"].tolist()
        samples_order = [representation_samples.index(sample) for sample in samples if sample in representation_samples]

        assert (adata.uns[f"{method}_samples"][samples_order] == samples).all(), "Order of samples is not correct"

        # meta_adata.obsm["umap"] = meta_adata.obsm[f"{method}_UMAP"]
        meta_adata.obsm[f"{method}_distances"] = adata.uns[f"{method}_distances"][samples_order][:, samples_order]

        ep.pp.neighbors(
            meta_adata, use_rep=f"{method}_distances", key_added=f"{method}_neighbors", metric="precomputed"
        )
        ep.tl.leiden(meta_adata, key_added=f"{method}_leiden", neighbors_key=f"{method}_neighbors")

    return meta_adata
combat_methods = [
    "pseudobulk",
    "composition",
    "pilot",
]
combat_samples = list(set(adata.uns[f"{method}_samples"]) for method in combat_methods)
combat_samples = list(set.intersection(*combat_samples))
len(combat_samples)
../_images/b3437138c30d612d3e297c9ce3829756b344b4a0cb5f4ce5e8f0633222774fc7.png
metadata = metadata.loc[combat_samples]
combat_meta_adata = ep.io.df_to_anndata(metadata)
combat_meta_adata = ep.pp.encode(combat_meta_adata, autodetect=True)
combat_meta_adata
! Features 'Outcome', 'Death28' were detected as categorical features stored numerically.Please verify and correct using `ep.ad.replace_feature_types` if necessary.
! Feature types were inferred and stored in adata.var[feature_type]. Please verify using `ep.ad.feature_type_overview` and adjust if necessary using `ep.ad.replace_feature_types`.

AnnData object with n_obs × n_vars = 138 × 43
    obs: 'Source', 'Institute', 'Pool_ID'
    var: 'feature_type', 'unencoded_var_names', 'encoding_mode'
    layers: 'original'
combat_meta_adata = align_representations(
    adata=adata,
    meta_adata=combat_meta_adata,
    samples=combat_samples,
    methods=combat_methods,
)

Define types and prediction tasks for the covariates#

benchmark_schema = {
    "technical": ["Institute", "Pool_ID", "n_cells", "median_QC_ngenes"],
    "clinical": ["Death28", "Outcome", "Source"],
}

cols_with_tasks = {
    "Institute": "classification",
    "Pool_ID": "classification",
    "n_cells": "regression",
    "median_QC_ngenes": "regression",
    "Death28": "classification",
    "Outcome": "ranking",
    "Source": "classification",
}
results = []

for method in combat_methods:
    for covariate_type in benchmark_schema:
        for col in benchmark_schema[covariate_type]:
            task = cols_with_tasks[col]
            try:
                result = patpy.tl.evaluate_representation(
                    distances=combat_meta_adata.obsm[f"{method}_distances"],
                    target=metadata[col],
                    method="knn",
                    task=task,
                    #             n_neighbors=5
                )
            except Exception as e:
                print("Method:", method)
                print("Col:", col)
                print("Task:", task)
                print(e)
                print()
                continue
            #             raise(e)
            result["representation"] = method
            result["covariate"] = col
            result["covariate_type"] = covariate_type

            if result["metric"] == "spearman_r":
                result["score"] = abs(result["score"])

            if covariate_type == "technical":
                result["score"] = 1 - result["score"]

            results.append(result)

Scores are ranged from 0 to 1

  • For covariates that we defined as technical, 0 means that covariate strongly affects the representation, and 1 means that this covariate is randomly distributed across representation

  • For biological and clinical covariates, 0 means that a covariate is not represented well (it is randomly distributed), while 1 means that similar patients have similar values of covariate

knn_results = pd.DataFrame(results)
knn_results.sort_values("score", ascending=False)
score metric n_unique n_observations method representation covariate covariate_type
0 1.000000 f1_macro_calibrated 2 138 knn pseudobulk Institute technical
8 1.000000 f1_macro_calibrated 10 138 knn composition Pool_ID technical
15 1.000000 f1_macro_calibrated 10 138 knn pilot Pool_ID technical
14 1.000000 f1_macro_calibrated 2 138 knn pilot Institute technical
7 0.928846 f1_macro_calibrated 2 138 knn composition Institute technical
1 0.824655 f1_macro_calibrated 10 138 knn pseudobulk Pool_ID technical
9 0.803490 spearman_r 138 138 knn composition n_cells technical
16 0.792614 spearman_r 138 138 knn pilot n_cells technical
10 0.752516 spearman_r 119 138 knn composition median_QC_ngenes technical
17 0.656404 spearman_r 119 138 knn pilot median_QC_ngenes technical
2 0.632495 spearman_r 138 138 knn pseudobulk n_cells technical
5 0.603334 spearman_r 6 113 knn pseudobulk Outcome clinical
19 0.505120 spearman_r 6 113 knn pilot Outcome clinical
12 0.446273 spearman_r 6 113 knn composition Outcome clinical
3 0.414135 spearman_r 119 138 knn pseudobulk median_QC_ngenes technical
6 0.292073 f1_macro_calibrated 8 138 knn pseudobulk Source clinical
11 0.219991 f1_macro_calibrated 2 138 knn composition Death28 clinical
20 0.160770 f1_macro_calibrated 8 138 knn pilot Source clinical
18 0.137885 f1_macro_calibrated 2 138 knn pilot Death28 clinical
13 0.125450 f1_macro_calibrated 8 138 knn composition Source clinical
4 0.046083 f1_macro_calibrated 2 138 knn pseudobulk Death28 clinical
# plt.figure(figsize=(10, 20))
sns.barplot(data=knn_results, y="covariate", x="score", orient="h", hue="representation", palette="tab20")
plt.xlim(0, 1.05)
plt.title("KNN-score", fontsize=24)
Text(0.5, 1.0, 'KNN-score')
../_images/259f807ab05daf35573f1c776bbd07b407a4009cfa6b95455b8a212766e05cdf.png
knn_results_wide = knn_results.pivot(index="representation", columns="covariate", values="score")
# knn_results_wide.reset_index(inplace=True)

# Order the columns as in benchmark schema
cols_order = ["total"]

for covariate_type in benchmark_schema:
    cols_order.extend(benchmark_schema[covariate_type])
    cols_order.append(covariate_type)

cmap = LinearSegmentedColormap.from_list(
    name="bugw", colors=["#FF9693", "#f2fbd2", "#c9ecb4", "#93d3ab", "#35b0ab"], N=256
)

col_defs = []

col_defs.append(
    ColumnDefinition(
        "total",
        width=0.7,
        plot_fn=bar,
        plot_kw={
            "cmap": cmap,
            "plot_bg_bar": True,
            "annotate": True,
            "height": 0.5,
            "lw": 0.5,
            "formatter": lambda x: round(x, 2),
        },
    )
)

for covariate_type in benchmark_schema:
    type_cols = benchmark_schema[covariate_type]

    for col in type_cols:
        col_def = ColumnDefinition(
            name=col,
            width=0.75,
            formatter=lambda x: round(x, 2),
            textprops={
                "ha": "center",
                "bbox": {"boxstyle": "circle", "pad": 0.35},
            },
            cmap=normed_cmap(knn_results["score"], cmap=matplotlib.cm.PiYG, num_stds=2.5),
            group=covariate_type,
        )

        col_defs.append(col_def)

    knn_results_wide[covariate_type] = knn_results_wide[type_cols].mean(axis=1)
    col_defs.append(
        ColumnDefinition(
            covariate_type,
            width=0.7,
            plot_fn=bar,
            plot_kw={
                "cmap": cmap,
                "plot_bg_bar": True,
                "annotate": True,
                "height": 0.5,
                "lw": 0.5,
                "formatter": lambda x: round(x, 2),
            },
        )
    )


#     col_defs.append(type_cols)
clin_weight = 2 / 3

knn_results_wide["total"] = (
    clin_weight * knn_results_wide["clinical"] + (1 - clin_weight) * knn_results_wide["technical"]
)
fig, ax = plt.subplots(figsize=(18, 6))

Table(knn_results_wide[cols_order].sort_values("total", ascending=False), column_definitions=tuple(col_defs), ax=ax)
<plottable.table.Table at 0x42cf8fd50>
../_images/28516ae9a73375ffeb3c5b5ca30cbf3a034c43adb0e8abf00e5f2429921a773d.png

Other methods#

There are other methods implemented that you might want to try:

  • Aggregated per cell type (or some other group) pseudobulk: patpy.tl.GroupedPseudobulk

  • Composition differences: patpy.tl.CellGroupComposition

  • scPoli: patpy.tl.SCPoli

  • MrVI: patpy.tl.MrVI

  • PhEMD: patpy.tl.PhEMD

  • MOFA: patpy.tl.MOFA

  • GloScope: patpy.tl.GloScope

In the sample representation benchmark, we found GloScope to be the best representation method. For running it, R installation is required. You can use environment from this repository or manually install required dependencies.