Source code for scripts.metrics.score_bulk

import pathlib as pl
from argparse import ArgumentParser
from typing import Literal, Tuple

import numpy as np
import pandas as pd

# Type aliases restricting string options to a closed set of allowed values.
# NOTE(review): neither alias is referenced by the code visible in this module;
# the names suggest a sample-normalisation method and a scorer name - confirm
# whether they are still needed.
_NORMMETHOD = Literal["rank", "log_rank", "log", "custom"]
_SCORERNAME = Literal["ssgsea", "average"]



[docs]
class AvgBulkScorer:
    """Scorer that uses the mean bulk expression of a signature's genes as a
    proxy of the signature score.

    The mean is taken per sample over the signature genes that are present in
    the bulk matrix, optionally after standardizing each gene across samples.
    """

    def __init__(self, std: bool):
        """
        Args:

            std: if True, the bulk gex will be standardized gene by gene
                (mean subtracted, divided by the standard deviation across
                samples) before the average is computed
        Returns:

            None

        """
        self.std = std

    def score(self, bulk_values: pd.DataFrame, metasig: np.ndarray) -> pd.Series:
        """The main scoring function

        Args:

            bulk_values: a df of size (n_samples, n_genes) with the bulk gene expression,
                genes being the columns
            metasig: the genes representing the signature to score; genes that are
                not a column of bulk_values are ignored
        Returns:

            a series with the score for each row (sample) of bulk_values

        Raises:

            ValueError: if no gene of the signature is a column of bulk_values

        """
        common_genes = bulk_values.columns.intersection(metasig)
        if common_genes.empty:
            raise ValueError(
                "There is no common gene between the metasignature and the bulk DataFrame"
            )

        expression = bulk_values.loc[:, common_genes]
        if self.std:
            # Column-wise standardization: mean() and std() reduce over the
            # samples, so every gene is rescaled independently of the others.
            expression = (expression - expression.mean()) / expression.std()
        # Average over the signature genes, one value per sample.
        return expression.mean(axis=1)





[docs]
def get_data(
        bulk_file: pl.Path,
        metasignature_file: pl.Path,
        truesignature_file: pl.Path,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Helper function to load the input CSV files

    Args:

        bulk_file: path to file with the bulk gex; its first column is used as the index
        metasignature_file: path to the file with the metasignature genes
        truesignature_file: path to the file with the true signature genes

    Returns:

        a tuple (bulk_values, metasignatures, truesignatures) of dataframes; rows of
            bulk_values whose index label repeats an earlier one are dropped

    """

    raw_bulk = pd.read_csv(bulk_file, index_col=0)
    # Keep only the first row seen for each index label.
    is_repeat = raw_bulk.index.duplicated()
    unique_bulk = raw_bulk.loc[~is_repeat]

    # Read in the same order as before: metasignatures, then true signatures.
    metasig_table, truesig_table = (
        pd.read_csv(path) for path in (metasignature_file, truesignature_file)
    )

    return unique_bulk, metasig_table, truesig_table




[docs]
def get_all_scores(
        bulk_values: pd.DataFrame,
        metasignatures: pd.DataFrame,
        truesignatures: pd.DataFrame
) -> pd.DataFrame:
    """Computes the scores for all metasignatures and true signatures using
        the average scorer on standardized bulk gex

    Args:

        bulk_values: df of bulk gex of size (n_samples, n_genes)
        metasignatures: df with n_metasignatures columns, containing in each column the
            list of genes that constitute the metasignature
        truesignatures: df with n_true signatures columns, containing in each column the
            list of genes that constitute the true signature

    Returns:

        the dataframe with one row per sample and one column per signature; true
            signatures come first, then metasignatures

    Raises:

        ValueError: raised by the scorer if a signature shares no gene with bulk_values

    """

    scorer = AvgBulkScorer(std=True)

    all_scores = {}
    # True signatures are scored first; if a metasignature reuses the name of a
    # true signature its score replaces the earlier one under that key.
    for signatures in (truesignatures, metasignatures):
        for sig in signatures.columns:
            # Missing entries (presumably padding of shorter gene lists in the
            # csv - confirm) are not genes, so drop them. to_numpy() replaces
            # Series.ravel(), which is deprecated in recent pandas.
            genes = signatures[sig].dropna().to_numpy()
            all_scores[sig] = scorer.score(bulk_values=bulk_values, metasig=genes)

    return pd.concat(all_scores, axis=1)




[docs]
def score_dataset(
        bulk_file: pl.Path,
        metasignature_file: pl.Path,
        truesignature_file: pl.Path
) -> pd.DataFrame:
    """Main function, loads the input files and computes the bulk score for
        metasignatures and reference signatures

    Args:

        bulk_file: path to file with the bulk gex
        metasignature_file: path to the file with the metasignature genes
        truesignature_file: path to the file with the true signature genes

    Returns:

        the dataframe of scores produced by get_all_scores for the loaded data

    """

    # Step 1: read the three tables from disk.
    bulk, metasig_table, truesig_table = get_data(
        bulk_file=bulk_file,
        metasignature_file=metasignature_file,
        truesignature_file=truesignature_file,
    )

    # Step 2: score every signature on the bulk data and hand the result back.
    return get_all_scores(
        bulk_values=bulk,
        metasignatures=metasig_table,
        truesignatures=truesig_table,
    )




[docs]
def get_args():
    """Parses the command line arguments of the script

    All three options are mandatory: a missing one makes argparse print the
    usage message and exit, instead of handing ``None`` to the file loading code.

    Returns:

        the argparse namespace with the string attributes data_path,
            annotation_path and metasig_path

    """
    parser = ArgumentParser()
    parser.add_argument(
        "--data-path", "-d", type=str, required=True,
        help="path to the file with the bulk gex",
    )
    parser.add_argument(
        "--annotation-path", "-a", type=str, required=True,
        help="path to the file with the true signature genes",
    )
    parser.add_argument(
        "--metasig-path", "-m", type=str, required=True,
        help="path to the file with the metasignature genes",
    )

    return parser.parse_args()




[docs]
def main():
    """Script entry point: parses the command line and scores the dataset"""
    cli = get_args()
    inputs = {
        "bulk_file": cli.data_path,
        "truesignature_file": cli.annotation_path,
        "metasignature_file": cli.metasig_path,
    }
    # NOTE(review): the resulting scores are neither saved, printed nor
    # returned here - confirm whether an output step is missing.
    scores = score_dataset(**inputs)



# Run the command-line entry point only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()