# Source code for scripts.metrics.marker_overlap
import logging
import scanpy as sc
import pandas as pd
import argparse
import numpy as np
# Give the root logger a default handler (basicConfig does nothing when the
# root logger already has handlers) and let INFO-level records through.
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)

# Logger named after this module.
_LOGGER = logging.getLogger(__name__)
# [docs]
def get_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the marker-overlap script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``.

    Returns:
        Namespace with the string attributes ``input``, ``output``,
        ``data_path`` and ``annotation_path``.

    Raises:
        SystemExit: Raised by argparse when a required option is missing
            or an unknown option is supplied.
    """
    parser = argparse.ArgumentParser()
    # All four paths are mandatory. Left optional, a missing value would
    # arrive as None: the readers then fail with an unrelated-looking error
    # and ``DataFrame.to_csv(None)`` returns a string instead of writing.
    parser.add_argument(
        "--input", "-i", type=str, required=True,
        help="Path to meta signatures stored in .csv.",
    )
    parser.add_argument(
        "--output", "-o", type=str, required=True,
        help="Final results.",
    )
    parser.add_argument(
        "--data-path", "-d", type=str, required=True,
        help="Path to the anndata.",
    )
    # The value is read with pandas.read_csv, so it names a file.
    parser.add_argument(
        "--annotation-path", "-a", type=str, required=True,
        help="Path to the .csv file holding all the known signatures.",
    )
    return parser.parse_args(argv)
# [docs]
def get_overlap(gt_signatures: pd.DataFrame, signatures: pd.DataFrame, var_names: pd.Index) -> pd.DataFrame:
    """Return the fraction of each known signature recovered by each signature.

    Every column of ``gt_signatures`` and of ``signatures`` is treated as a
    gene list; missing values (padding of shorter columns) are ignored.
    A known signature is first restricted to the genes found in
    ``var_names``. The score of a signature against it is the number of
    shared genes divided by the size of that restricted list.

    Args:
        gt_signatures: Known signatures, one gene list per column.
        signatures: Signatures to score, one gene list per column.
        var_names: Gene names available in the dataset. Any iterable of
            hashable names is accepted.

    Returns:
        Float DataFrame indexed by the columns of ``signatures`` with one
        column per column of ``gt_signatures``. A column is NaN when none
        of the known signature's genes occurs in ``var_names``, because
        the fraction is undefined there.
    """
    available = set(var_names)
    # Build each signature's gene set once instead of once per known signature.
    signature_sets = [set(genes.dropna()) for _, genes in signatures.items()]

    # Filled positionally; NaN marks "undefined" until a value is computed.
    fractions = np.full((len(signature_sets), gt_signatures.shape[1]), np.nan)
    for col, (gt_name, gt_genes) in enumerate(gt_signatures.items()):
        reference = available.intersection(gt_genes.dropna())
        if not reference:
            # Dividing by an empty reference would raise ZeroDivisionError.
            logging.getLogger(__name__).warning(
                "No gene of known signature %r is present in var_names; "
                "its overlap is reported as NaN.",
                gt_name,
            )
            continue
        for row, genes in enumerate(signature_sets):
            fractions[row, col] = len(genes & reference) / len(reference)

    return pd.DataFrame(fractions, index=signatures.columns, columns=gt_signatures.columns)
# [docs]
def main() -> None:
    """Run the script: load the inputs, score the overlap, write the CSV.

    Reads the known signatures and the signatures to score with
    ``pandas.read_csv``, takes the gene names from the ``var_names`` of the
    AnnData file, and writes the table returned by ``get_overlap`` to the
    output path.
    """
    cli = get_args()

    known = pd.read_csv(cli.annotation_path)
    candidates = pd.read_csv(cli.input)
    adata = sc.read_h5ad(cli.data_path)

    get_overlap(known, candidates, adata.var_names).to_csv(cli.output)
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()