BassanilabNeoRanking / utils /GlobalParameters.py

Added and modified code files, test data, and related documentation

34d27b9 4 months ago

20.6 kB

	import os
	import os.path
	from typing import Final


	def _env_dir(var_name: str) -> str:
	    """
	    Return the directory stored in the environment variable var_name.

	    If the variable is not set, a RuntimeWarning is issued and the current directory (os.curdir) is
	    returned, so that importing this module does not fail with a TypeError raised by os.path.join(None, ...).
	    A variable that is set (even to an empty string) is returned unchanged.

	    Args:
	        var_name (str): name of the environment variable to read
	    Returns:
	        str: value of the environment variable, or os.curdir if it is not set
	    """
	    import warnings  # local import keeps the file-level import block untouched

	    value = os.getenv(var_name)
	    if value is None:
	        warnings.warn(
	            f"Environment variable {var_name} is not set; falling back to the current directory.",
	            RuntimeWarning, stacklevel=2)
	        return os.curdir
	    return value


	class GlobalParameters:
	    """
	    Class to store global parameters
	    Attributes:
	        base_dir (str): base directory of project files (environment variable NEORANKING_RESOURCE,
	                        current directory if the variable is not set)
	        code_dir (str): base directory of the code (environment variable NEORANKING_CODE,
	                        current directory if the variable is not set)
	        data_dir (str): directory that holds data files
	        plot_dir (str): directory that holds figure files
	        classifier_result_dir (str): directory that holds classifier result files
	        classifier_model_dir (str): directory that holds classifier model files
	        neopep_data_org_file (str): tab file containing all neo-peptide data
	        mutation_data_org_file (str): tab file containing all mutation data
	        neopep_data_ml_sel_file (str): tab file containing rows of neo-peptide data selected for ML
	        mutation_data_ml_sel_file (str): tab file containing rows of mutation data selected for ML
	        neopep_data_ml_file (str): tab file containing neo-peptide data normalized for ML
	        mutation_data_ml_file (str): tab file containing mutation data normalized for ML
	        neopep_data_plot_file (str): tab file containing neo-peptide data normalized for histogram and scatter plots
	        mutation_data_plot_file (str): tab file containing mutation data normalized for histogram and scatter plots
	        cat_to_num_info_files (dict[str, dict[str, str]]): file names with the categorical-to-numerical encoding
	                        information, keyed by peptide type and then by dataset
	        tesla_result_file (str): results from TESLA paper containing FR, TTIF, and AUPRC scores of different groups
	        gartner_nmer_train_file (str): training data matrix from Gartner et al with mutation features and
	                        immunogenicity annotation downloaded from figshare link provided in Gartner et al
	        gartner_nmer_test_file (str): testing data matrix from Gartner et al with mutation features and
	                        immunogenicity annotation downloaded from figshare link provided in Gartner et al
	        gartner_nmer_rank_file (str): file containing the ranking of mutations in NCI_test obtained by Gartner et al.
	        hlaI_allele_file (str): file containing the HLA class I alleles of all patients
	        datasets (list[str]): datasets used in this study ['NCI', 'NCI_train', 'NCI_test', 'TESLA', 'HiTIDE']
	        datasets_encoding (list[str]): datasets used for encoding categorical values ['NCI', 'NCI_train']
	        peptide_types (list[str]): peptide types ['neopep', 'mutation']
	        objectives (list[str]): objectives for data normalization ['ml', 'plot']
	        response_types (list[str]): immunogenicity measurement response types ['CD8', 'negative', 'not_tested']
	        mutation_types (list[str]): mutation types to include ['SNV', 'INSERTION', 'DELETION', 'FSS']
	        classifiers (list[str]): classifiers used in this study
	        aas (list[str]): list of amino acids
	        ml_features_neopep (list[str]): list of features used for classification of neo-peptides
	        features_neopep (list[str]): list of features for neo-peptides
	        feature_types_neopep (dict[str, str]): types (dtype names) of features_neopep
	        ml_feature_mv_neopep (dict[str, str]): order of features_neopep values (used for missing value imputation)
	        ml_features_mutation (list[str]): list of features used for classification of mutations
	        features_mutation (list[str]): list of features for mutations
	        feature_types_mutation (dict[str, str]): types (dtype names) of features_mutation
	        ml_feature_mv_mutation (dict[str, str | float]): order of features_mutation values, or a numeric constant
	                        (used for missing value imputation)
	        nr_hyperopt_rep (int): number of replicate hyperopt runs
	        nr_hyperopt_iter (int): number of hyperopt iterations
	        nr_hyperopt_cv (int): number of hyperopt cross-validation folds
	        neopep_alpha (float): value of alpha in rank_score function used for training neo-peptides
	        mutation_alpha (float): value of alpha in rank_score function used for training mutations
	        normalizer (str): normalizer to be used ('q': quantile, 'p': power, 'z': standard, 'i': minmax, 'l': log,
	                        'a': asinh, 'n': none)
	        nr_non_immuno_neopeps (int): nr non-immunogenic peptides sampled
	        cat_type (str): conversion of categorical to numerical values. either 'float' or 'int'
	        max_netmhc_rank (int): maximal netmhc rank for neo-peptide. -1 if no filter applied
	        excluded_genes (list): peptides of these genes are excluded from prioritization
	        plot_normalization (dict): feature normalization for plots only (not for ML)
	        plot_feature_names (dict): feature names used in plots
	        plot_file_formats (list[str]): file formats of plots ['pdf', 'svg', 'png']
	        color_immunogenic (str): color used to represent immunogenic peptides in plots
	        color_negative (str): color used to represent non-immunogenic peptides in plots
	    """

	    #
	    # Directories
	    #
	    # Both roots come from the environment; _env_dir never returns None, so the joins below cannot fail.
	    base_dir: Final[str] = _env_dir('NEORANKING_RESOURCE')
	    code_dir: Final[str] = _env_dir('NEORANKING_CODE')
	    data_dir: Final[str] = os.path.join(base_dir, "data")
	    plot_dir: Final[str] = os.path.join(base_dir, "plots")
	    classifier_result_dir: Final[str] = os.path.join(base_dir, "classifier_results")
	    classifier_model_dir: Final[str] = os.path.join(base_dir, "classifier_models")

	    #
	    # Data files
	    #
	    neopep_data_org_file: Final[str] = os.path.join(data_dir, "Neopep_data_org.txt")
	    mutation_data_org_file: Final[str] = os.path.join(data_dir, "Mutation_data_org.txt")
	    neopep_data_ml_sel_file: Final[str] = os.path.join(data_dir, "Neopep_data_ml_sel.txt")
	    mutation_data_ml_sel_file: Final[str] = os.path.join(data_dir, "Mutation_data_ml_sel.txt")
	    neopep_data_ml_file: Final[str] = os.path.join(data_dir, "Neopep_data_ml_norm.txt")
	    mutation_data_ml_file: Final[str] = os.path.join(data_dir, "Mutation_data_ml_norm.txt")
	    neopep_data_plot_file: Final[str] = os.path.join(data_dir, "Neopep_data_plot_norm.txt")
	    mutation_data_plot_file: Final[str] = os.path.join(data_dir, "Mutation_data_plot_norm.txt")

	    # Outer key: peptide type, inner key: dataset (only the datasets listed in datasets_encoding)
	    cat_to_num_info_files: Final[dict] = \
	        {
	            'neopep': {'NCI_train': os.path.join(data_dir, 'cat_encoding', 'Cat_to_num_info_neopep_NCI_train.txt'),
	                       'NCI': os.path.join(data_dir, 'cat_encoding', 'Cat_to_num_info_neopep_NCI_all.txt')},
	            'mutation': {'NCI_train': os.path.join(data_dir, 'cat_encoding', 'Cat_to_num_info_mutation_NCI_train.txt'),
	                         'NCI': os.path.join(data_dir, 'cat_encoding', 'Cat_to_num_info_mutation_NCI_all.txt')}
	        }

	    tesla_result_file: Final[str] = os.path.join(data_dir, "mmc5.xlsx")
	    gartner_nmer_train_file: Final[str] = os.path.join(data_dir, 'NmersTrainingSet.txt')
	    gartner_nmer_test_file: Final[str] = os.path.join(data_dir, 'NmersTestingSet.txt')
	    # NOTE: this is the only file located below code_dir instead of data_dir
	    gartner_nmer_rank_file: Final[str] = os.path.join(code_dir, 'Data/Gartner_nmers_ranking.txt')
	    hlaI_allele_file: Final[str] = os.path.join(data_dir, 'hla', 'HLA_allotypes.txt')

	    #
	    # Allowed values
	    #
	    datasets: Final[list] = ['NCI', 'NCI_train', 'NCI_test', 'TESLA', 'HiTIDE']
	    datasets_encoding: Final[list] = ['NCI', 'NCI_train']
	    peptide_types: Final[list] = ['neopep', 'mutation']
	    objectives: Final[list] = ['ml', 'plot']
	    response_types: Final[list] = ['CD8', 'negative', 'not_tested']
	    mutation_types: Final[list] = ['SNV', 'INSERTION', 'DELETION', 'FSS']

	    aas: Final[list] = \
	        ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

	    #
	    # Machine learning settings
	    #
	    classifiers = ['SVM', 'SVM-lin', 'RF', 'CART', 'ADA', 'LR', 'NNN', 'XGBoost']
	    neopep_alpha: Final[float] = 0.005
	    mutation_alpha: Final[float] = 0.05
	    nr_hyperopt_rep = 10
	    nr_hyperopt_iter = 200
	    nr_hyperopt_cv = 5
	    normalizer: Final[str] = 'n'
	    nr_non_immuno_neopeps: Final[int] = 500000
	    cat_type: Final[str] = 'float'  # either float or int
	    max_netmhc_rank: Final[int] = 20

	    excluded_genes: Final[list] = ['HLA-A', 'HLA-B', 'HLA-C', 'HLA-DRB1', 'HLA-DRB3', 'HLA-DRB4', 'HLA-DRB5',
	                                   'HLA-DPA1', 'HLA-DPB1', 'HLA-DQA1', 'HLA-DQB1', 'HLA-DMA', 'TRBV3', 'TRBV5',
	                                   'TRBV6', 'TRBV6-1', 'TRBV10', 'TRBV10-1', 'TRBV11', 'TRAV12', 'KRT1', 'PRSS3']

	    #
	    # Neo-peptide features
	    #
	    # Neo Test
	    ml_features_neopep: Final[list] = \
	        [
	            'mutant_other_significant_alleles', 'mutant_rank', 'mutant_rank_PRIME',
	            'mutant_rank_netMHCpan',
	            'mut_Rank_Stab', 'mut_netchop_score_ct',
	            'TAP_score',
	            'seq_len']

	    # Annotation columns followed by the ML features
	    features_neopep: Final[list] = \
	        ['patient', 'dataset', 'train_test', 'response_type', 'Nb_Samples', 'Sample_Tissue', 'Cancer_Type',
	         'chromosome', 'genomic_coord', 'ref', 'alt', 'gene', 'protein_coord', 'aa_mutant', 'aa_wt',
	         'pep_mut_start', 'TumorContent', 'Zygosity', 'mutation_type'] + ml_features_neopep

	    # Neo Test
	    feature_types_neopep: Final[dict] = {
	        'patient': 'str',
	        'dataset': 'category',
	        'train_test': 'category',
	        'response_type': 'category',
	        'Nb_Samples': 'str',
	        'Sample_Tissue': 'str',
	        'Cancer_Type': 'str',
	        'chromosome': 'str',
	        'genomic_coord': 'int64',
	        'ref': 'str',
	        'alt': 'str',
	        'gene': 'str',
	        'protein_coord': 'int32',
	        'aa_mutant': 'category',
	        'aa_wt': 'category',
	        'mutant_seq': 'str',
	        'wt_seq': 'str',
	        'pep_mut_start': 'int8',
	        'TumorContent': 'float64',
	        'Zygosity': 'category',
	        'mutation_type': 'category',
	        'mutant_rank': 'float64',
	        'mutant_rank_netMHCpan': 'float64',
	        'mutant_rank_PRIME': 'float64',
	        'mut_Rank_Stab': 'float64',
	        'TAP_score': 'float64',
	        'mut_netchop_score_ct': 'float64',
	        'mutant_other_significant_alleles': 'int8',
	        'seq_len': 'category'
	    }

	    # Neo Test
	    ml_feature_mv_neopep: Final[dict] = {
	        'mutant_rank': 'max',
	        'mutant_rank_netMHCpan': 'max',
	        'mutant_rank_PRIME': 'max',
	        'mut_Rank_Stab': 'max',
	        'TAP_score': 'min',
	        'mut_netchop_score_ct': 'min',
	        'mutant_other_significant_alleles': 'min',
	    }

	    #
	    # Mutation features
	    #
	    ml_features_mutation: Final[list] = \
	        ['CCF', 'Clonality', 'Zygosity', 'Sample_Tissue_expression_GTEx',
	         'TCGA_Cancer_expression', 'rnaseq_TPM', 'rnaseq_alt_support',
	         'MIN_MUT_RANK_CI_MIXMHC', 'COUNT_MUT_RANK_CI_MIXMHC',
	         'WT_BEST_RANK_CI_MIXMHC', 'MIN_MUT_RANK_CI_PRIME',
	         'COUNT_MUT_RANK_CI_PRIME', 'WT_BEST_RANK_CI_PRIME',
	         'COUNT_MUT_RANK_CI_netMHCpan', 'CSCAPE_score', 'gene_driver_Intogen',
	         'nb_mutations_in_gene_Intogen', 'nb_same_mutation_Intogen',
	         'mutation_driver_statement_Intogen', 'GTEx_all_tissues_expression_mean',
	         'bestWTMatchScore_I', 'bestWTMatchOverlap_I', 'bestMutationScore_I',
	         'bestWTPeptideCount_I', 'mut_Rank_EL_0', 'wt_Rank_EL_0',
	         'mut_Rank_EL_1', 'wt_Rank_EL_1', 'mut_Rank_EL_2', 'wt_Rank_EL_2',
	         'mut_Rank_Stab_0', 'mut_Rank_Stab_1', 'mut_Rank_Stab_2',
	         'mut_netchop_score', 'mut_TAP_score_0', 'next_best_BA_mut_ranks',
	         'DAI_0', 'DAI_1', 'DAI_2']

	    # Annotation columns followed by the ML features
	    features_mutation: Final[list] = \
	        ['patient', 'dataset', 'train_test', 'response_type', 'Nb_Samples', 'Sample_Tissue', 'Cancer_Type',
	         'chromosome', 'genomic_coord', 'ref', 'alt', 'gene', 'protein_coord', 'aa_mutant', 'aa_wt', 'pep_mut_start',
	         'TumorContent', 'mutation_type'] + ml_features_mutation

	    feature_types_mutation: Final[dict] = {
	        'patient': 'category',
	        'dataset': 'category',
	        'train_test': 'category',
	        'response_type': 'category',
	        'Nb_Samples': 'str',
	        'Sample_Tissue': 'str',
	        'Cancer_Type': 'str',
	        'chromosome': 'str',
	        'genomic_coord': 'int64',
	        'ref': 'str',
	        'alt': 'str',
	        'gene': 'str',
	        'protein_coord': 'int32',
	        'aa_mutant': 'category',
	        'aa_wt': 'category',
	        'mutant_seq': 'str',
	        'wt_seq': 'str',
	        'pep_mut_start': 'int8',
	        'TumorContent': 'float64',
	        'CCF': 'float64',
	        'Clonality': 'category',
	        'Zygosity': 'category',
	        'mutation_type': 'category',
	        'nb_same_mutation_Intogen': 'float64',
	        'nb_mutations_in_gene_Intogen': 'float64',
	        'mutation_driver_statement_Intogen': 'category',
	        'gene_driver_Intogen': 'category',
	        'rnaseq_TPM': 'float64',
	        'TCGA_Cancer_expression': 'float64',
	        'bestMutationScore_I': 'float64',
	        'bestWTPeptideCount_I': 'int32',
	        'bestWTMatchScore_I': 'float64',
	        'bestWTMatchOverlap_I': 'float64',
	        'rnaseq_alt_support': 'float64',
	        'CSCAPE_score': 'float64',
	        'GTEx_all_tissues_expression_mean': 'float64',
	        'Sample_Tissue_expression_GTEx': 'float64',
	        'COUNT_MUT_RANK_CI_MIXMHC': 'int32',
	        'COUNT_MUT_RANK_CI_PRIME': 'int32',
	        'COUNT_MUT_RANK_CI_netMHCpan': 'int32',
	        'MIN_MUT_RANK_CI_MIXMHC': 'float64',
	        'WT_BEST_RANK_CI_MIXMHC': 'float64',
	        'MIN_MUT_RANK_CI_PRIME': 'float64',
	        'WT_BEST_RANK_CI_PRIME': 'float64',
	        'next_best_BA_mut_ranks': 'float64',
	        'mut_Rank_EL_0': 'float64',
	        'mut_Rank_EL_1': 'float64',
	        'mut_Rank_EL_2': 'float64',
	        'wt_Rank_EL_0': 'float64',
	        'wt_Rank_EL_1': 'float64',
	        'wt_Rank_EL_2': 'float64',
	        'mut_Rank_Stab_0': 'float64',
	        'mut_Rank_Stab_1': 'float64',
	        'mut_Rank_Stab_2': 'float64',
	        'DAI_0': 'float64',
	        'DAI_1': 'float64',
	        'DAI_2': 'float64',
	        'mut_TAP_score_0': 'float64',
	        'mut_netchop_score': 'float64'
	    }

	    # NOTE: 'CCF' maps to a numeric constant, every other entry to a string code
	    ml_feature_mv_mutation: Final[dict] = {
	        'nb_same_mutation_Intogen': 'min',
	        'nb_mutations_in_gene_Intogen': 'min',
	        'rnaseq_TPM': 'min',
	        'TCGA_Cancer_expression': 'min',
	        'bestMutationScore_I': 'min',
	        'bestWTPeptideCount_I': 'min',
	        'bestWTMatchScore_I': 'min',
	        'bestWTMatchOverlap_I': 'min',
	        'rnaseq_alt_support': 'min',
	        'CCF': 0.9,
	        'CSCAPE_score': 'min',
	        'GTEx_all_tissues_expression_mean': 'min',
	        'Sample_Tissue_expression_GTEx': 'min',
	        'COUNT_MUT_RANK_CI_MIXMHC': 'min',
	        'COUNT_MUT_RANK_CI_PRIME': 'min',
	        'COUNT_MUT_RANK_CI_netMHCpan': 'min',
	        'MIN_MUT_RANK_CI_MIXMHC': 'max',
	        'WT_BEST_RANK_CI_MIXMHC': 'max',
	        'MIN_MUT_RANK_CI_PRIME': 'max',
	        'WT_BEST_RANK_CI_PRIME': 'max',
	        'next_best_BA_mut_ranks': 'max',
	        'mut_Rank_EL_0': 'max',
	        'mut_Rank_EL_1': 'max',
	        'mut_Rank_EL_2': 'max',
	        'wt_Rank_EL_0': 'max',
	        'wt_Rank_EL_1': 'max',
	        'wt_Rank_EL_2': 'max',
	        'mut_Rank_Stab_0': 'max',
	        'mut_Rank_Stab_1': 'max',
	        'mut_Rank_Stab_2': 'max',
	        'DAI_0': 'cnt',
	        'DAI_1': 'cnt',
	        'DAI_2': 'cnt',
	        'mut_TAP_score_0': 'min',
	        'mut_netchop_score': 'min'
	    }

	    #
	    # Visualization
	    #
	    color_immunogenic = 'darkorange'
	    color_negative = 'royalblue'
	    plot_file_formats = ['pdf', 'svg', 'png']

	    # Values are the one-letter normalizer codes listed for the normalizer attribute
	    plot_normalization: Final[dict] = \
	        {'mutant_rank_PRIME': 'l', 'wt_best_rank_PRIME': 'l', 'mutant_rank': 'l', 'wt_best_rank': 'l',
	         'mutant_rank_netMHCpan': 'l', 'wt_best_rank_netMHCpan': 'l', 'mut_Rank_Stab': 'l', 'wt_Rank_Stab': 'l',
	         'mut_Stab_Score': 'n', 'wt_Stab_Score': 'n', 'TAP_score': 'n', 'mut_netchop_score_ct': 'n',
	         'mut_binding_score': 'n', 'mut_is_binding_pos': 'n', 'pep_mut_start': 'i', 'mut_aa_coeff': 'n', 'DAI': 'n',
	         'rnaseq_TPM': 'a', 'rnaseq_alt_support': 'n', 'GTEx_all_tissues_expression_mean': 'a',
	         'Sample_Tissue_expression_GTEx': 'a', 'TCGA_Cancer_expression': 'a', 'bestWTMatchScore_I': 'a',
	         'bestWTMatchOverlap_I': 'n', 'bestMutationScore_I': 'a', 'bestWTPeptideCount_I': 'a', 'bestWTMatchType_I': 'n',
	         'mutant_other_significant_alleles': 'n', 'CSCAPE_score': 'n', 'Clonality': 'n',
	         'CCF': 'n', 'nb_same_mutation_Intogen': 'a', 'nb_mutations_in_gene_Intogen': 'a',
	         'nb_mutations_same_position_Intogen': 'a', 'mutation_driver_statement_Intogen': 'n',
	         'gene_driver_Intogen': 'n', 'DAI_NetMHC': 'n', 'DAI_MixMHC': 'n', 'DAI_NetStab': 'n',
	         'DAI_MixMHC_mbp': 'n', 'seq_len': 'n', 'DAI_aa_coeff': 'n', 'mut_Rank_EL_0': 'l',
	         'mut_Rank_EL_1': 'l', 'mut_Rank_EL_2': 'l', 'wt_Rank_EL_0': 'l', 'wt_Rank_EL_1': 'l', 'wt_Rank_EL_2': 'l',
	         'mut_Rank_Stab_0': 'l', 'mut_Rank_Stab_1': 'l', 'mut_Rank_Stab_2': 'l', 'DAI_0': 'n', 'DAI_1': 'n',
	         'DAI_2': 'n', 'mut_TAP_score_0': 'n', 'mut_netchop_score': 'n', 'COUNT_MUT_RANK_CI_MIXMHC': 'n',
	         'COUNT_MUT_RANK_CI_PRIME': 'n', 'COUNT_MUT_RANK_CI_netMHCpan': 'n', 'mut_nr_strong_binders_0': 'n',
	         'mut_nr_weak_binding_alleles_0': 'n', 'MIN_MUT_RANK_CI_MIXMHC': 'l', 'WT_BEST_RANK_CI_MIXMHC': 'l',
	         'MIN_MUT_RANK_CI_PRIME': 'l', 'WT_BEST_RANK_CI_PRIME': 'l', 'next_best_BA_mut_ranks': 'l'
	         }

	    plot_feature_names: Final[dict] = \
	        {'mutant_rank': 'MixMHCpred Rank', 'mutant_rank_netMHCpan': 'NetMHCpan Rank', 'mutant_rank_PRIME': 'PRIME Rank',
	         'mut_Rank_Stab': 'NetStab Rank', 'TAP_score': 'NetTAP Score', 'mut_netchop_score_ct': 'NetChop CT Score',
	         'mut_binding_score': 'MixMHCpred Score at Mutation', 'mut_is_binding_pos': 'Mutation at Anchor',
	         'pep_mut_start': 'Mutation Position', 'mut_aa_coeff': 'PRIME Coeff at Mutation',
	         'DAI_NetMHC': 'NetMHCpan log_Rank DAI', 'DAI_MixMHC': 'MixMHCpred log_Rank DAI',
	         'DAI_NetStab': 'NetStab log_Rank DAI', 'mutant_other_significant_alleles': 'Number Binding Alleles',
	         'DAI_MixMHC_mbp': 'MixMHCpred Score DAI', 'rnaseq_TPM': 'RNAseq Expression(TPM)',
	         'rnaseq_alt_support': 'RNAseq Mutation Coverage',
	         'GTEx_all_tissues_expression_mean': 'GTEx Mean Tissue Expression',
	         'Sample_Tissue_expression_GTEx': 'GTEx Sample Tissue Expression',
	         'TCGA_Cancer_expression': 'TCGA Cancer Expression',
	         'bestWTMatchScore_I': 'ipMSDB Peptide Score', 'bestWTMatchOverlap_I': 'ipMSDB Peptide Overlap',
	         'bestMutationScore_I': 'ipMSDB Mutation Score', 'bestWTPeptideCount_I': 'ipMSDB Peptide Count',
	         'bestWTMatchType_I': 'ipMSDB Peptide Match Type', 'CSCAPE_score': 'CSCAPE Score', 'Zygosity': 'Zygosity',
	         'Clonality': 'Clonality', 'CCF': 'Cancer Cell Fraction',
	         'nb_same_mutation_Intogen': 'Intogen Same Mutation Count',
	         'nb_mutations_in_gene_Intogen': 'Intogen Gene Mutation Count',
	         'nb_mutations_same_position_Intogen': 'Intogen Mutation Same Position Count',
	         'mutation_driver_statement_Intogen': 'Intogen Mutation Driver Statement',
	         'gene_driver_Intogen': 'Gene Driver Intogen', 'pep_mut_start_9': 'Mutation Position Length 9',
	         'pep_mut_start_10': 'Mutation Position Length 10', 'pep_mut_start_11': 'Mutation Position Length 11',
	         'pep_mut_start_12': 'Mutation Position Length 12', 'seq_len': 'Peptide Length',
	         'DAI_aa_coeff': 'PRIME Coefficient DAI', 'COUNT_MUT_RANK_CI_MIXMHC': 'MixMHCpred Binding Peptide Count',
	         'COUNT_MUT_RANK_CI_PRIME': 'PRIME Binding Peptide Count',
	         'COUNT_MUT_RANK_CI_netMHCpan': 'NetMHC Binding Peptide Count',
	         'MIN_MUT_RANK_CI_MIXMHC': 'Minimal Mut MixMHCpred Rank', 'MIN_MUT_RANK_CI_PRIME': 'Minimal Mut PRIME Rank',
	         'WT_BEST_RANK_CI_MIXMHC': 'Minimal WT MixMHCpred Rank', 'WT_BEST_RANK_CI_PRIME': 'Minimal WT PRIME Rank',
	         'next_best_BA_mut_ranks': 'Second Mut BA rank', 'mut_Rank_EL_0': 'Best Mut EL Rank',
	         'mut_Rank_EL_1': 'Second Mut EL Rank', 'mut_Rank_EL_2': 'Third Mut EL Rank', 'wt_Rank_EL_0': 'Best WT EL Rank',
	         'wt_Rank_EL_1': 'Second WT EL Rank', 'wt_Rank_EL_2': 'Third WT EL Rank',
	         'mut_Rank_Stab_0': 'Best Mut Stab Rank',
	         'mut_Rank_Stab_1': 'Second Mut Stab Rank', 'mut_Rank_Stab_2': 'Third Mut Stab Rank',
	         'DAI_0': 'BEST EL Rank DAI',
	         'DAI_1': 'Second EL Rank DAI', 'DAI_2': 'Third EL Rank DAI', 'mut_TAP_score_0': 'Best Mut TAP Score',
	         'mut_netchop_score': 'Best Mut NetChop Score'
	         }

	    @staticmethod
	    def get_cat_to_num_info_file(dataset: str, peptide_type: str):
	        """
	        Look up the categorical-to-numerical encoding file for a dataset and peptide type.

	        Args:
	            dataset (str): dataset id; only datasets listed in datasets_encoding have an encoding file
	            peptide_type (str): key of cat_to_num_info_files ('neopep' or 'mutation')
	        Returns:
	            str: path of the encoding file, or None if dataset is not in datasets_encoding
	        Raises:
	            KeyError: if dataset is in datasets_encoding but peptide_type is not a key of cat_to_num_info_files
	        """
	        # The dataset is checked first, so an unknown peptide_type only raises for encoding datasets
	        if dataset not in GlobalParameters.datasets_encoding:
	            return None
	        return GlobalParameters.cat_to_num_info_files[peptide_type][dataset]