Spaces:
Running
Running
| """Configuration utils.""" | |
| import os | |
| import json | |
| import pandas as pd | |
| from pytoda.transforms import Compose | |
| from pytoda.smiles.transforms import SMILESToTokenIndexes, LeftPadding, Canonicalization | |
| from cos import ensure_filepath_from_uri, COS_BUCKET_URI | |
| from utils import load | |
def _bucket_path(filename):
    """Resolve *filename* inside the COS bucket to a local file path."""
    return ensure_filepath_from_uri(os.path.join(COS_BUCKET_URI, filename))


# model files
MODEL_WEIGHTS_URI = _bucket_path("model.pt")
MODEL_PARAMS_URI = _bucket_path("model.json")
# SMILES language file
SMILES_LANGUAGE_URI = _bucket_path("smiles_language.pkl")
# gene expression file
GENE_EXPRESSION_URI = _bucket_path("gene_expression.csv.zip")
# genes file
GENES_URI = _bucket_path("genes.pkl")
# genes standardization parameters
GENE_EXPRESSION_STANDARDIZATION_URI = _bucket_path("gene_expression_standardization.pkl")
# load the model hyper-parameters
# NOTE: be explicit about the encoding — JSON is UTF-8 text and the platform
# default encoding is not guaranteed to match.
with open(MODEL_PARAMS_URI, encoding="utf-8") as fp:
    MODEL_PARAMS = json.load(fp)
# maximum tokenized SMILES length accepted by the model
MAX_LENGTH = MODEL_PARAMS["smiles_padding_length"]
# load SMILES language (pickled object; presumably a pytoda
# SMILESLanguage — confirm against the bucket artifact)
SMILES_LANGUAGE = load(SMILES_LANGUAGE_URI)
# load gene expression profiles from the zipped CSV
GENE_EXPRESSION = pd.read_csv(GENE_EXPRESSION_URI, compression="zip", low_memory=False)
# load the gene panel expected by the model
GENES = load(GENES_URI)
# load gene standardization parameters
GENE_STANDARDIZATION_PARAMETERS = load(GENE_EXPRESSION_STANDARDIZATION_URI)
# SMILES pre-processing pipeline: canonicalize, map tokens to indices,
# then left-pad every sequence to the model's fixed input length.
SMILES_TRANSFORMS = [
    Canonicalization(),
    SMILESToTokenIndexes(smiles_language=SMILES_LANGUAGE),
    LeftPadding(
        padding_length=MAX_LENGTH,
        padding_index=SMILES_LANGUAGE.padding_index,
    ),
]
# single callable applying the transforms above in order
SMILES_TOKENIZE_FN = Compose(SMILES_TRANSFORMS)
# prepare default gene expression data
# NOTE: transpose + reindex guarantees every gene in GENES is present;
# genes missing from the CSV are filled with 0.0
GENE_EXPRESSION_DATA = GENE_EXPRESSION.T.reindex(GENES).fillna(0.0).T.values
# NOTE: drop only the gene columns that actually exist in the frame,
# leaving the non-gene (metadata) columns behind
_gene_columns = set(GENES).intersection(GENE_EXPRESSION.columns)
to_drop = list(_gene_columns)
GENE_EXPRESSION_METADATA = GENE_EXPRESSION.drop(to_drop, axis=1)
# the full frame is no longer needed; free the memory
del GENE_EXPRESSION
# housekeeping
# default lifetime of cached results: one week, expressed in seconds
_DEFAULT_RESULTS_EXPIRATION = 60 * 60 * 24 * 7
# overridable via the environment; always exposed as a float
RESULTS_EXPIRATION_SECONDS = float(
    os.environ.get("PACCMANN_RESULTS_EXPIRATION_SECONDS", _DEFAULT_RESULTS_EXPIRATION)
)
# SMILES parameters
# TODO: think whether we should enforce canonicalization
# these keys are copied verbatim from the model hyper-parameters
_SHARED_SMILES_KEYS = (
    "canonical",
    "kekulize",
    "all_bonds_explicit",
    "all_hs_explicit",
    "randomize",
    "remove_bonddir",
)
CANON = {key: MODEL_PARAMS[key] for key in _SHARED_SMILES_KEYS}
# the maximum length lives under a different name in the params file
CANON["smiles_maximum_length"] = MODEL_PARAMS["smiles_padding_length"]