Source code for scripts.metrics.score_bulk

import pathlib as pl
from argparse import ArgumentParser
from typing import Literal, Tuple

import numpy as np
import pandas as pd

# Literal type alias enumerating four method names.
# NOTE(review): not referenced by any code visible in this file; presumably the
# sample-normalisation options of an ssgsea scorer -- confirm before relying on it.
_NORMMETHOD = Literal["rank", "log_rank", "log", "custom"]
# Literal type alias enumerating two scorer names.
# NOTE(review): not referenced by any code visible in this file, and only an
# average-based scorer is defined here -- confirm "ssgsea" is still supported.
_SCORERNAME = Literal["ssgsea", "average"]


class AvgBulkScorer:
    """Signature scorer based on the mean bulk expression of the signature genes.

    For every sample, the score is the average expression over the signature
    genes that are present in the bulk table, optionally computed on
    per-gene standardized values.
    """

    def __init__(self, std: bool):
        """Store the scorer configuration.

        Args:
            std: if True, every selected gene column is standardized (column
                mean subtracted, then divided by the column standard
                deviation) before the per-sample average is taken
        """
        self.std = std

    def score(self, bulk_values: pd.DataFrame, metasig: np.ndarray) -> pd.Series:
        """Score one signature on every sample of the bulk table.

        Args:
            bulk_values: bulk gene expression, one row per sample and one
                column per gene
            metasig: gene names making up the signature; names that are not
                columns of ``bulk_values`` are ignored

        Returns:
            A series indexed like ``bulk_values`` holding one score per sample.

        Raises:
            ValueError: if none of the signature genes is a column of
                ``bulk_values``.
        """
        shared_genes = bulk_values.columns.intersection(metasig)
        if shared_genes.empty:
            raise ValueError(
                "There is no common gene between the metasignature and the bulk DataFrame"
            )

        expression = bulk_values.loc[:, shared_genes]
        if self.std:
            # Standardize gene by gene (column-wise) so that every gene
            # contributes on a comparable scale to the average.
            gene_means = expression.mean()
            gene_stds = expression.std()
            expression = (expression - gene_means) / gene_stds
        return expression.mean(axis=1)
def get_data(
    bulk_file: pl.Path,
    metasignature_file: pl.Path,
    truesignature_file: pl.Path,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read the three CSV inputs needed for scoring.

    Args:
        bulk_file: path to the CSV with the bulk gene expression; its first
            column is used as the row index
        metasignature_file: path to the CSV with the metasignature genes
        truesignature_file: path to the CSV with the true signature genes

    Returns:
        A tuple ``(bulk_values, metasignatures, truesignatures)``. In
        ``bulk_values`` only the first row is kept for every repeated index
        label; the two signature tables are returned exactly as read.
    """
    raw_bulk = pd.read_csv(bulk_file, index_col=0)
    # duplicated() flags every occurrence after the first one, so negating
    # the mask keeps the first row of each index label.
    is_repeat = raw_bulk.index.duplicated()
    deduplicated = raw_bulk.loc[~is_repeat]

    meta_table = pd.read_csv(metasignature_file)
    true_table = pd.read_csv(truesignature_file)
    return deduplicated, meta_table, true_table
def get_all_scores(
    bulk_values: pd.DataFrame, metasignatures: pd.DataFrame, truesignatures: pd.DataFrame
) -> pd.DataFrame:
    """Score every true signature and every metasignature on the bulk data.

    Scoring uses ``AvgBulkScorer`` with standardization enabled.

    Args:
        bulk_values: df of bulk gex of size (n_samples, n_genes)
        metasignatures: df with n_metasignatures columns, each column holding
            the list of genes that constitute one metasignature
        truesignatures: df with n_true_signatures columns, each column holding
            the list of genes that constitute one true signature

    Returns:
        A dataframe with one row per sample and one column per signature,
        true signatures first and metasignatures after. When a metasignature
        has the same name as a true signature, the metasignature score
        replaces the true signature score.

    Raises:
        ValueError: raised by the scorer when a signature shares no gene with
            the columns of ``bulk_values``.
    """
    scorer = AvgBulkScorer(std=True)

    all_scores = {}
    # True signatures are processed first so that the column order (and the
    # overwrite-on-name-clash behaviour) stays the same as before.
    for signatures in (truesignatures, metasignatures):
        for sig in signatures.columns:
            # Signature columns of unequal length are padded with NaN when the
            # table is read from CSV; the padding is not a gene, so drop it.
            # to_numpy() replaces Series.ravel(), deprecated since pandas 2.2.
            genes = signatures[sig].dropna().to_numpy()
            all_scores[sig] = scorer.score(bulk_values=bulk_values, metasig=genes)

    return pd.concat(all_scores, axis=1)
def score_dataset(
    bulk_file: pl.Path, metasignature_file: pl.Path, truesignature_file: pl.Path
) -> pd.DataFrame:
    """Load the input files and compute every signature score.

    This is the end-to-end entry point: it reads the inputs with ``get_data``
    and forwards them to ``get_all_scores``.

    Args:
        bulk_file: path to file with the bulk gex
        metasignature_file: path to the file with the metasignature genes
        truesignature_file: path to the file with the true signature genes

    Returns:
        The dataframe produced by ``get_all_scores``: one row per sample and
        one column per true signature and per metasignature.
    """
    loaded = get_data(
        bulk_file=bulk_file,
        metasignature_file=metasignature_file,
        truesignature_file=truesignature_file,
    )
    bulk_values, metasignatures, truesignatures = loaded

    return get_all_scores(
        bulk_values=bulk_values,
        metasignatures=metasignatures,
        truesignatures=truesignatures,
    )
def get_args():
    """Parse the command-line options of the script.

    Three optional string options are recognised: ``--data-path``/``-d``,
    ``--annotation-path``/``-a`` and ``--metasig-path``/``-m``. An option
    that is not supplied is left as ``None``.

    Returns:
        The argparse namespace with attributes ``data_path``,
        ``annotation_path`` and ``metasig_path``.
    """
    option_flags = (
        ("--data-path", "-d"),
        ("--annotation-path", "-a"),
        ("--metasig-path", "-m"),
    )

    parser = ArgumentParser()
    for long_flag, short_flag in option_flags:
        parser.add_argument(long_flag, short_flag, type=str)
    return parser.parse_args()
def main():
    """Run the scoring pipeline with paths taken from the command line.

    The ``--data-path`` option is used as the bulk file, ``--annotation-path``
    as the true signature file and ``--metasig-path`` as the metasignature
    file.
    """
    cli = get_args()
    # NOTE(review): the computed scores are neither returned nor written
    # anywhere by this function -- confirm whether an output step is missing.
    score_dataset(
        bulk_file=cli.data_path,
        metasignature_file=cli.metasig_path,
        truesignature_file=cli.annotation_path,
    )
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()