# BassanilabNeoRanking / utils / GlobalParameters.py
# Commit 34d27b9 (NeoDiscoveryAdmin): Added and modified code files, test data, and related documentation
import os
import os.path
import warnings
from typing import Final, Optional
def _env_dir(var_name: str) -> str:
    """
    Return the directory stored in environment variable ``var_name``.

    ``os.getenv`` returns None for an unset variable, and passing None to ``os.path.join`` raises an
    opaque TypeError while the class body below is being executed (i.e. at import time). To keep the
    module importable, an unset variable triggers a RuntimeWarning and the current working directory
    is returned instead.

    Args:
        var_name (str): name of the environment variable to read

    Returns:
        str: value of the environment variable, or the current working directory if it is not set
    """
    value = os.getenv(var_name)
    if value is not None:
        return value
    warnings.warn(
        f"Environment variable {var_name} is not set; falling back to the current working directory.",
        RuntimeWarning,
        stacklevel=2,
    )
    return os.getcwd()


class GlobalParameters:
    """
    Class to store global parameters

    Attributes:
        base_dir (str): base directory of project files (environment variable NEORANKING_RESOURCE; current
                        working directory if the variable is not set)
        code_dir (str): directory read from environment variable NEORANKING_CODE (current working directory
                        if the variable is not set); parent of the 'Data' folder holding gartner_nmer_rank_file
        data_dir (str): directory that holds data files
        plot_dir (str): directory that holds figure files
        classifier_result_dir (str): directory that holds classifier result files
        classifier_model_dir (str): directory that holds classifier model files
        neopep_data_org_file (str): tab file containing all neo-peptide data
        mutation_data_org_file (str): tab file containing all mutation data
        neopep_data_ml_sel_file (str): tab file containing rows of neo-peptide data selected for ML
        mutation_data_ml_sel_file (str): tab file containing rows of mutation data selected for ML
        neopep_data_ml_file (str): tab file containing neo-peptide data normalized for ML
        mutation_data_ml_file (str): tab file containing mutation data normalized for ML
        neopep_data_plot_file (str): tab file containing neo-peptide data normalized for histogram and scatter plots
        mutation_data_plot_file (str): tab file containing mutation data normalized for histogram and scatter plots
        cat_to_num_info_files (dict[str, dict[str, str]]): dictionary with file names for imputation of categorical
                                                           variables
        tesla_result_file (str): results from TESLA paper containing FR, TTIF, and AUPRC scores of different groups
        gartner_nmer_train_file (str): training data matrix from Gartner et al with mutation features and
                                       immunogenicity annotation downloaded from figshare link provided in
                                       Gartner et al
        gartner_nmer_test_file (str): testing data matrix from Gartner et al with mutation features and
                                      immunogenicity annotation downloaded from figshare link provided in
                                      Gartner et al
        gartner_nmer_rank_file (str): file containing the ranking of mutations in NCI_test obtained by Gartner et al.
        hlaI_allele_file (str): file containing the HLA class I alleles of all patients
        datasets (list[str]): datasets used in this study ['NCI', 'NCI_train', 'NCI_test', 'TESLA', 'HiTIDE']
        datasets_encoding (list[str]): datasets used for encoding categorical values ['NCI', 'NCI_train']
        peptide_types (list[str]): peptide types ['neopep', 'mutation']
        objectives (list[str]): objectives for data normalization ['ml', 'plot']
        response_types (list[str]): immunogenicity measurement response types ['CD8', 'negative', 'not_tested']
        mutation_types (list[str]): mutation types to include ['SNV', 'INSERTION', 'DELETION', 'FSS']
        classifiers (list[str]): classifiers used in this study
        aas (list[str]): list of amino acids
        ml_features_neopep (list[str]): list of features used for classification of neo-peptides
        features_neopep (list[str]): list of features for neo-peptides
        feature_types_neopep (dict[str, any]): types of features_neopep
        ml_feature_mv_neopep (dict[str, str]): order of features_neopep values (used for missing value imputation)
        ml_features_mutation (list[str]): list of features used for classification of mutations
        features_mutation (list[str]): list of features for mutations
        feature_types_mutation (dict[str, any]): types of features_mutation
        ml_feature_mv_mutation (dict[str, str | float]): order of features_mutation values (used for missing value
                                                         imputation); 'CCF' maps to the number 0.9 instead of a
                                                         string
        nr_hyperopt_rep (int): number of replicate hyperopt runs
        nr_hyperopt_iter (int): number of hyperopt iterations
        nr_hyperopt_cv (int): number of hyperopt cross-validation folds
        neopep_alpha (float): value of alpha in rank_score function used for training neo-peptides
        mutation_alpha (float): value of alpha in rank_score function used for training mutations
        normalizer (str): normalizer to be used ('q': quantile, 'p': power, 'z': standard, 'i': minmax, 'l': log,
                          'a': asinh, 'n': none)
        nr_non_immuno_neopeps (int): nr non-immunogenic peptides sampled
        cat_type (str): conversion of categorical to numerical values. either 'float' or 'int'
        max_netmhc_rank (int): maximal netmhc rank for neo-peptide. -1 if no filter applied
        excluded_genes (list): peptides of these genes are excluded from prioritization
        plot_file_formats (list[str]): file format names for plots ['pdf', 'svg', 'png']
        plot_normalization (dict): feature normalization for plots only (not for ML)
        plot_feature_names (dict): feature names used in plots
        color_immunogenic (str): color used to represent immunogenic peptides in plots
        color_negative (str): color used to represent non-immunogenic peptides in plots
    """

    #
    # Directories and files
    #
    # _env_dir never returns None, so the os.path.join calls below cannot fail with a TypeError at import time.
    base_dir: Final[str] = _env_dir('NEORANKING_RESOURCE')
    code_dir: Final[str] = _env_dir('NEORANKING_CODE')
    data_dir: Final[str] = os.path.join(base_dir, "data")
    plot_dir: Final[str] = os.path.join(base_dir, "plots")
    classifier_result_dir: Final[str] = os.path.join(base_dir, "classifier_results")
    classifier_model_dir: Final[str] = os.path.join(base_dir, "classifier_models")
    neopep_data_org_file: Final[str] = os.path.join(data_dir, "Neopep_data_org.txt")
    mutation_data_org_file: Final[str] = os.path.join(data_dir, "Mutation_data_org.txt")
    neopep_data_ml_sel_file: Final[str] = os.path.join(data_dir, "Neopep_data_ml_sel.txt")
    mutation_data_ml_sel_file: Final[str] = os.path.join(data_dir, "Mutation_data_ml_sel.txt")
    neopep_data_ml_file: Final[str] = os.path.join(data_dir, "Neopep_data_ml_norm.txt")
    mutation_data_ml_file: Final[str] = os.path.join(data_dir, "Mutation_data_ml_norm.txt")
    neopep_data_plot_file: Final[str] = os.path.join(data_dir, "Neopep_data_plot_norm.txt")
    mutation_data_plot_file: Final[str] = os.path.join(data_dir, "Mutation_data_plot_norm.txt")
    # Indexed as cat_to_num_info_files[peptide_type][dataset]
    cat_to_num_info_files: Final[dict] = {
        'neopep': {
            'NCI_train': os.path.join(data_dir, 'cat_encoding', 'Cat_to_num_info_neopep_NCI_train.txt'),
            'NCI': os.path.join(data_dir, 'cat_encoding', 'Cat_to_num_info_neopep_NCI_all.txt'),
        },
        'mutation': {
            'NCI_train': os.path.join(data_dir, 'cat_encoding', 'Cat_to_num_info_mutation_NCI_train.txt'),
            'NCI': os.path.join(data_dir, 'cat_encoding', 'Cat_to_num_info_mutation_NCI_all.txt'),
        },
    }
    tesla_result_file: Final[str] = os.path.join(data_dir, "mmc5.xlsx")
    gartner_nmer_train_file: Final[str] = os.path.join(data_dir, 'NmersTrainingSet.txt')
    gartner_nmer_test_file: Final[str] = os.path.join(data_dir, 'NmersTestingSet.txt')
    # Path components are passed separately so that os.path.join inserts the platform separator.
    gartner_nmer_rank_file: Final[str] = os.path.join(code_dir, 'Data', 'Gartner_nmers_ranking.txt')
    hlaI_allele_file: Final[str] = os.path.join(data_dir, 'hla', 'HLA_allotypes.txt')

    #
    # Datasets and categories
    #
    datasets: Final[list] = ['NCI', 'NCI_train', 'NCI_test', 'TESLA', 'HiTIDE']
    datasets_encoding: Final[list] = ['NCI', 'NCI_train']
    peptide_types: Final[list] = ['neopep', 'mutation']
    objectives: Final[list] = ['ml', 'plot']
    response_types: Final[list] = ['CD8', 'negative', 'not_tested']
    mutation_types: Final[list] = ['SNV', 'INSERTION', 'DELETION', 'FSS']
    aas: Final[list] = \
        ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

    #
    # Machine learning settings
    #
    # NOTE(review): classifiers and the nr_hyperopt_* values were not declared Final in the original;
    # presumably callers may override them - confirm before marking them Final.
    classifiers: list = ['SVM', 'SVM-lin', 'RF', 'CART', 'ADA', 'LR', 'NNN', 'XGBoost']
    neopep_alpha: Final[float] = 0.005
    mutation_alpha: Final[float] = 0.05
    nr_hyperopt_rep: int = 10
    nr_hyperopt_iter: int = 200
    nr_hyperopt_cv: int = 5
    normalizer: Final[str] = 'n'
    nr_non_immuno_neopeps: Final[int] = 500000
    cat_type: Final[str] = 'float'  # either float or int
    max_netmhc_rank: Final[int] = 20
    excluded_genes: Final[list] = ['HLA-A', 'HLA-B', 'HLA-C', 'HLA-DRB1', 'HLA-DRB3', 'HLA-DRB4', 'HLA-DRB5',
                                   'HLA-DPA1', 'HLA-DPB1', 'HLA-DQA1', 'HLA-DQB1', 'HLA-DMA', 'TRBV3', 'TRBV5',
                                   'TRBV6', 'TRBV6-1', 'TRBV10', 'TRBV10-1', 'TRBV11', 'TRAV12', 'KRT1', 'PRSS3']

    #
    # Neo-peptide features
    #
    # Neo Test
    ml_features_neopep: Final[list] = \
        [
            'mutant_other_significant_alleles', 'mutant_rank', 'mutant_rank_PRIME',
            'mutant_rank_netMHCpan',
            'mut_Rank_Stab', 'mut_netchop_score_ct',
            'TAP_score',
            'seq_len']

    # Annotation columns followed by the ML feature columns
    features_neopep: Final[list] = \
        ['patient', 'dataset', 'train_test', 'response_type', 'Nb_Samples', 'Sample_Tissue', 'Cancer_Type',
         'chromosome', 'genomic_coord', 'ref', 'alt', 'gene', 'protein_coord', 'aa_mutant', 'aa_wt',
         'pep_mut_start', 'TumorContent', 'Zygosity', 'mutation_type'] + ml_features_neopep

    # Neo Test
    feature_types_neopep: Final[dict] = {
        'patient': 'str',
        'dataset': 'category',
        'train_test': 'category',
        'response_type': 'category',
        'Nb_Samples': 'str',
        'Sample_Tissue': 'str',
        'Cancer_Type': 'str',
        'chromosome': 'str',
        'genomic_coord': 'int64',
        'ref': 'str',
        'alt': 'str',
        'gene': 'str',
        'protein_coord': 'int32',
        'aa_mutant': 'category',
        'aa_wt': 'category',
        'mutant_seq': 'str',
        'wt_seq': 'str',
        'pep_mut_start': 'int8',
        'TumorContent': 'float64',
        'Zygosity': 'category',
        'mutation_type': 'category',
        'mutant_rank': 'float64',
        'mutant_rank_netMHCpan': 'float64',
        'mutant_rank_PRIME': 'float64',
        'mut_Rank_Stab': 'float64',
        'TAP_score': 'float64',
        'mut_netchop_score_ct': 'float64',
        'mutant_other_significant_alleles': 'int8',
        'seq_len': 'category'
    }

    # Neo Test
    ml_feature_mv_neopep: Final[dict] = {
        'mutant_rank': 'max',
        'mutant_rank_netMHCpan': 'max',
        'mutant_rank_PRIME': 'max',
        'mut_Rank_Stab': 'max',
        'TAP_score': 'min',
        'mut_netchop_score_ct': 'min',
        'mutant_other_significant_alleles': 'min',
    }

    #
    # Mutation features
    #
    ml_features_mutation: Final[list] = \
        ['CCF', 'Clonality', 'Zygosity', 'Sample_Tissue_expression_GTEx',
         'TCGA_Cancer_expression', 'rnaseq_TPM', 'rnaseq_alt_support',
         'MIN_MUT_RANK_CI_MIXMHC', 'COUNT_MUT_RANK_CI_MIXMHC',
         'WT_BEST_RANK_CI_MIXMHC', 'MIN_MUT_RANK_CI_PRIME',
         'COUNT_MUT_RANK_CI_PRIME', 'WT_BEST_RANK_CI_PRIME',
         'COUNT_MUT_RANK_CI_netMHCpan', 'CSCAPE_score', 'gene_driver_Intogen',
         'nb_mutations_in_gene_Intogen', 'nb_same_mutation_Intogen',
         'mutation_driver_statement_Intogen', 'GTEx_all_tissues_expression_mean',
         'bestWTMatchScore_I', 'bestWTMatchOverlap_I', 'bestMutationScore_I',
         'bestWTPeptideCount_I', 'mut_Rank_EL_0', 'wt_Rank_EL_0',
         'mut_Rank_EL_1', 'wt_Rank_EL_1', 'mut_Rank_EL_2', 'wt_Rank_EL_2',
         'mut_Rank_Stab_0', 'mut_Rank_Stab_1', 'mut_Rank_Stab_2',
         'mut_netchop_score', 'mut_TAP_score_0', 'next_best_BA_mut_ranks',
         'DAI_0', 'DAI_1', 'DAI_2']

    # Annotation columns followed by the ML feature columns
    features_mutation: Final[list] = \
        ['patient', 'dataset', 'train_test', 'response_type', 'Nb_Samples', 'Sample_Tissue', 'Cancer_Type',
         'chromosome', 'genomic_coord', 'ref', 'alt', 'gene', 'protein_coord', 'aa_mutant', 'aa_wt', 'pep_mut_start',
         'TumorContent', 'mutation_type'] + ml_features_mutation

    feature_types_mutation: Final[dict] = {
        'patient': 'category',
        'dataset': 'category',
        'train_test': 'category',
        'response_type': 'category',
        'Nb_Samples': 'str',
        'Sample_Tissue': 'str',
        'Cancer_Type': 'str',
        'chromosome': 'str',
        'genomic_coord': 'int64',
        'ref': 'str',
        'alt': 'str',
        'gene': 'str',
        'protein_coord': 'int32',
        'aa_mutant': 'category',
        'aa_wt': 'category',
        'mutant_seq': 'str',
        'wt_seq': 'str',
        'pep_mut_start': 'int8',
        'TumorContent': 'float64',
        'CCF': 'float64',
        'Clonality': 'category',
        'Zygosity': 'category',
        'mutation_type': 'category',
        'nb_same_mutation_Intogen': 'float64',
        'nb_mutations_in_gene_Intogen': 'float64',
        'mutation_driver_statement_Intogen': 'category',
        'gene_driver_Intogen': 'category',
        'rnaseq_TPM': 'float64',
        'TCGA_Cancer_expression': 'float64',
        'bestMutationScore_I': 'float64',
        'bestWTPeptideCount_I': 'int32',
        'bestWTMatchScore_I': 'float64',
        'bestWTMatchOverlap_I': 'float64',
        'rnaseq_alt_support': 'float64',
        'CSCAPE_score': 'float64',
        'GTEx_all_tissues_expression_mean': 'float64',
        'Sample_Tissue_expression_GTEx': 'float64',
        'COUNT_MUT_RANK_CI_MIXMHC': 'int32',
        'COUNT_MUT_RANK_CI_PRIME': 'int32',
        'COUNT_MUT_RANK_CI_netMHCpan': 'int32',
        'MIN_MUT_RANK_CI_MIXMHC': 'float64',
        'WT_BEST_RANK_CI_MIXMHC': 'float64',
        'MIN_MUT_RANK_CI_PRIME': 'float64',
        'WT_BEST_RANK_CI_PRIME': 'float64',
        'next_best_BA_mut_ranks': 'float64',
        'mut_Rank_EL_0': 'float64',
        'mut_Rank_EL_1': 'float64',
        'mut_Rank_EL_2': 'float64',
        'wt_Rank_EL_0': 'float64',
        'wt_Rank_EL_1': 'float64',
        'wt_Rank_EL_2': 'float64',
        'mut_Rank_Stab_0': 'float64',
        'mut_Rank_Stab_1': 'float64',
        'mut_Rank_Stab_2': 'float64',
        'DAI_0': 'float64',
        'DAI_1': 'float64',
        'DAI_2': 'float64',
        'mut_TAP_score_0': 'float64',
        'mut_netchop_score': 'float64'
    }

    # All values are strings except 'CCF', which maps to the number 0.9
    ml_feature_mv_mutation: Final[dict] = {
        'nb_same_mutation_Intogen': 'min',
        'nb_mutations_in_gene_Intogen': 'min',
        'rnaseq_TPM': 'min',
        'TCGA_Cancer_expression': 'min',
        'bestMutationScore_I': 'min',
        'bestWTPeptideCount_I': 'min',
        'bestWTMatchScore_I': 'min',
        'bestWTMatchOverlap_I': 'min',
        'rnaseq_alt_support': 'min',
        'CCF': 0.9,
        'CSCAPE_score': 'min',
        'GTEx_all_tissues_expression_mean': 'min',
        'Sample_Tissue_expression_GTEx': 'min',
        'COUNT_MUT_RANK_CI_MIXMHC': 'min',
        'COUNT_MUT_RANK_CI_PRIME': 'min',
        'COUNT_MUT_RANK_CI_netMHCpan': 'min',
        'MIN_MUT_RANK_CI_MIXMHC': 'max',
        'WT_BEST_RANK_CI_MIXMHC': 'max',
        'MIN_MUT_RANK_CI_PRIME': 'max',
        'WT_BEST_RANK_CI_PRIME': 'max',
        'next_best_BA_mut_ranks': 'max',
        'mut_Rank_EL_0': 'max',
        'mut_Rank_EL_1': 'max',
        'mut_Rank_EL_2': 'max',
        'wt_Rank_EL_0': 'max',
        'wt_Rank_EL_1': 'max',
        'wt_Rank_EL_2': 'max',
        'mut_Rank_Stab_0': 'max',
        'mut_Rank_Stab_1': 'max',
        'mut_Rank_Stab_2': 'max',
        'DAI_0': 'cnt',
        'DAI_1': 'cnt',
        'DAI_2': 'cnt',
        'mut_TAP_score_0': 'min',
        'mut_netchop_score': 'min'
    }

    #
    # Visualization
    #
    color_immunogenic: str = 'darkorange'
    color_negative: str = 'royalblue'
    plot_file_formats: list = ['pdf', 'svg', 'png']
    # Values are normalizer codes as listed for the `normalizer` attribute in the class docstring
    plot_normalization: Final[dict] = \
        {'mutant_rank_PRIME': 'l', 'wt_best_rank_PRIME': 'l', 'mutant_rank': 'l', 'wt_best_rank': 'l',
         'mutant_rank_netMHCpan': 'l', 'wt_best_rank_netMHCpan': 'l', 'mut_Rank_Stab': 'l', 'wt_Rank_Stab': 'l',
         'mut_Stab_Score': 'n', 'wt_Stab_Score': 'n', 'TAP_score': 'n', 'mut_netchop_score_ct': 'n',
         'mut_binding_score': 'n', 'mut_is_binding_pos': 'n', 'pep_mut_start': 'i', 'mut_aa_coeff': 'n', 'DAI': 'n',
         'rnaseq_TPM': 'a', 'rnaseq_alt_support': 'n', 'GTEx_all_tissues_expression_mean': 'a',
         'Sample_Tissue_expression_GTEx': 'a', 'TCGA_Cancer_expression': 'a', 'bestWTMatchScore_I': 'a',
         'bestWTMatchOverlap_I': 'n', 'bestMutationScore_I': 'a', 'bestWTPeptideCount_I': 'a', 'bestWTMatchType_I': 'n',
         'mutant_other_significant_alleles': 'n', 'CSCAPE_score': 'n', 'Clonality': 'n',
         'CCF': 'n', 'nb_same_mutation_Intogen': 'a', 'nb_mutations_in_gene_Intogen': 'a',
         'nb_mutations_same_position_Intogen': 'a', 'mutation_driver_statement_Intogen': 'n',
         'gene_driver_Intogen': 'n', 'DAI_NetMHC': 'n', 'DAI_MixMHC': 'n', 'DAI_NetStab': 'n',
         'DAI_MixMHC_mbp': 'n', 'seq_len': 'n', 'DAI_aa_coeff': 'n', 'mut_Rank_EL_0': 'l',
         'mut_Rank_EL_1': 'l', 'mut_Rank_EL_2': 'l', 'wt_Rank_EL_0': 'l', 'wt_Rank_EL_1': 'l', 'wt_Rank_EL_2': 'l',
         'mut_Rank_Stab_0': 'l', 'mut_Rank_Stab_1': 'l', 'mut_Rank_Stab_2': 'l', 'DAI_0': 'n', 'DAI_1': 'n',
         'DAI_2': 'n', 'mut_TAP_score_0': 'n', 'mut_netchop_score': 'n', 'COUNT_MUT_RANK_CI_MIXMHC': 'n',
         'COUNT_MUT_RANK_CI_PRIME': 'n', 'COUNT_MUT_RANK_CI_netMHCpan': 'n', 'mut_nr_strong_binders_0': 'n',
         'mut_nr_weak_binding_alleles_0': 'n', 'MIN_MUT_RANK_CI_MIXMHC': 'l', 'WT_BEST_RANK_CI_MIXMHC': 'l',
         'MIN_MUT_RANK_CI_PRIME': 'l', 'WT_BEST_RANK_CI_PRIME': 'l', 'next_best_BA_mut_ranks': 'l'
         }

    plot_feature_names: Final[dict] = \
        {'mutant_rank': 'MixMHCpred Rank', 'mutant_rank_netMHCpan': 'NetMHCpan Rank', 'mutant_rank_PRIME': 'PRIME Rank',
         'mut_Rank_Stab': 'NetStab Rank', 'TAP_score': 'NetTAP Score', 'mut_netchop_score_ct': 'NetChop CT Score',
         'mut_binding_score': 'MixMHCpred Score at Mutation', 'mut_is_binding_pos': 'Mutation at Anchor',
         'pep_mut_start': 'Mutation Position', 'mut_aa_coeff': 'PRIME Coeff at Mutation',
         'DAI_NetMHC': 'NetMHCpan log_Rank DAI', 'DAI_MixMHC': 'MixMHCpred log_Rank DAI',
         'DAI_NetStab': 'NetStab log_Rank DAI', 'mutant_other_significant_alleles': 'Number Binding Alleles',
         'DAI_MixMHC_mbp': 'MixMHCpred Score DAI', 'rnaseq_TPM': 'RNAseq Expression(TPM)',
         'rnaseq_alt_support': 'RNAseq Mutation Coverage',
         'GTEx_all_tissues_expression_mean': 'GTEx Mean Tissue Expression',
         'Sample_Tissue_expression_GTEx': 'GTEx Sample Tissue Expression',
         'TCGA_Cancer_expression': 'TCGA Cancer Expression',
         'bestWTMatchScore_I': 'ipMSDB Peptide Score', 'bestWTMatchOverlap_I': 'ipMSDB Peptide Overlap',
         'bestMutationScore_I': 'ipMSDB Mutation Score', 'bestWTPeptideCount_I': 'ipMSDB Peptide Count',
         'bestWTMatchType_I': 'ipMSDB Peptide Match Type', 'CSCAPE_score': 'CSCAPE Score', 'Zygosity': 'Zygosity',
         'Clonality': 'Clonality', 'CCF': 'Cancer Cell Fraction',
         'nb_same_mutation_Intogen': 'Intogen Same Mutation Count',
         'nb_mutations_in_gene_Intogen': 'Intogen Gene Mutation Count',
         'nb_mutations_same_position_Intogen': 'Intogen Mutation Same Position Count',
         'mutation_driver_statement_Intogen': 'Intogen Mutation Driver Statement',
         'gene_driver_Intogen': 'Gene Driver Intogen', 'pep_mut_start_9': 'Mutation Position Length 9',
         'pep_mut_start_10': 'Mutation Position Length 10', 'pep_mut_start_11': 'Mutation Position Length 11',
         'pep_mut_start_12': 'Mutation Position Length 12', 'seq_len': 'Peptide Length',
         'DAI_aa_coeff': 'PRIME Coefficient DAI', 'COUNT_MUT_RANK_CI_MIXMHC': 'MixMHCpred Binding Peptide Count',
         'COUNT_MUT_RANK_CI_PRIME': 'PRIME Binding Peptide Count',
         'COUNT_MUT_RANK_CI_netMHCpan': 'NetMHC Binding Peptide Count',
         'MIN_MUT_RANK_CI_MIXMHC': 'Minimal Mut MixMHCpred Rank', 'MIN_MUT_RANK_CI_PRIME': 'Minimal Mut PRIME Rank',
         'WT_BEST_RANK_CI_MIXMHC': 'Minimal WT MixMHCpred Rank', 'WT_BEST_RANK_CI_PRIME': 'Minimal WT PRIME Rank',
         'next_best_BA_mut_ranks': 'Second Mut BA rank', 'mut_Rank_EL_0': 'Best Mut EL Rank',
         'mut_Rank_EL_1': 'Second Mut EL Rank', 'mut_Rank_EL_2': 'Third Mut EL Rank', 'wt_Rank_EL_0': 'Best WT EL Rank',
         'wt_Rank_EL_1': 'Second WT EL Rank', 'wt_Rank_EL_2': 'Third WT EL Rank',
         'mut_Rank_Stab_0': 'Best Mut Stab Rank',
         'mut_Rank_Stab_1': 'Second Mut Stab Rank', 'mut_Rank_Stab_2': 'Third Mut Stab Rank',
         'DAI_0': 'BEST EL Rank DAI',
         'DAI_1': 'Second EL Rank DAI', 'DAI_2': 'Third EL Rank DAI', 'mut_TAP_score_0': 'Best Mut TAP Score',
         'mut_netchop_score': 'Best Mut NetChop Score'
         }

    @staticmethod
    def get_cat_to_num_info_file(dataset: str, peptide_type: str) -> Optional[str]:
        """
        Look up the categorical-to-numerical encoding info file for a dataset and peptide type.

        Args:
            dataset (str): dataset id. Only datasets listed in datasets_encoding have an encoding file
            peptide_type (str): key of cat_to_num_info_files, i.e. 'neopep' or 'mutation'

        Returns:
            str | None: path of the encoding info file, or None if dataset is not in datasets_encoding

        Raises:
            KeyError: if dataset is in datasets_encoding but peptide_type is not a key of cat_to_num_info_files
        """
        # The dataset is checked first, so an unsupported dataset yields None whatever peptide_type is.
        if dataset not in GlobalParameters.datasets_encoding:
            return None
        return GlobalParameters.cat_to_num_info_files[peptide_type][dataset]