Spaces:
Running on Zero
Running on Zero
Commit ·
6bb1bdf
1
Parent(s): 50fe1a2
finish the basic function
Browse files- app.py +185 -4
- dataset_descriptions.json +112 -0
- utils.py +286 -0
app.py
CHANGED
|
@@ -1,7 +1,188 @@
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
def
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
from huggingface_hub import HfApi, get_collection, list_collections
|
| 3 |
+
from utils import MolecularPropertyPredictionModel, task_types, dataset_descriptions
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import os
|
| 6 |
|
| 7 |
+
def get_models():
    """Discover the molecular-property-prediction adapters in the ChemFM collection.

    Returns:
        dict mapping the short model name (last path component of the repo id)
        to the full Hugging Face repo id.

    Raises:
        ValueError: if a model in the collection has no registered task type or
            dataset description, so metadata mismatches fail at startup rather
            than at prediction time.
    """
    # This is the collection id for the molecular property prediction models.
    collection = get_collection("ChemFM/molecular-property-prediction-6710141ffc31f31a47d6fc0c")
    models = dict()
    for item in collection.items:
        if item.item_type != "model":
            continue
        item_name = item.item_id.split("/")[-1]
        models[item_name] = item.item_id
        # Explicit raises instead of `assert`: asserts are stripped under `python -O`.
        if item_name not in task_types:
            raise ValueError(f"{item_name} is not in the task_types")
        if item_name not in dataset_descriptions:
            raise ValueError(f"{item_name} is not in the dataset_descriptions")

    return models
| 19 |
|
| 20 |
+
# Resolve the published adapters once at startup; `properties` drives the UI dropdown.
candidate_models = get_models()
properties = list(candidate_models.keys())
# Single shared model instance; LoRA adapters are hot-swapped per request.
model = MolecularPropertyPredictionModel()
|
| 23 |
+
|
| 24 |
+
def get_description(property_name):
    """Return the human-readable description for the selected property."""
    description = dataset_descriptions[property_name]
    return description
|
| 26 |
+
|
| 27 |
+
def predict_single_label(smiles, property_name):
    """Predict one property value for a single SMILES string.

    Args:
        smiles: molecule SMILES string entered by the user.
        property_name: key into `candidate_models` / `task_types`.

    Returns:
        (prediction, status_message); prediction is "NA" on any failure.
    """
    adapter_id = candidate_models[property_name]
    info = model.swith_adapter(property_name, adapter_id)

    # Map each adapter-switch outcome to a user-facing status message; only
    # "keep" and "switched" are allowed to proceed to prediction.
    status_messages = {
        "keep": "Adapter is the same as the current one",
        "switched": "Adapter is switched successfully",
        "error": "Adapter is not found",
    }
    running_status = status_messages.get(info, "Unknown error")
    if info not in ("keep", "switched"):
        return "NA", running_status

    prediction = model.predict_single_smiles(smiles, task_types[property_name])
    if prediction is None:
        return "NA", "Invalid SMILES string"

    # Round regression outputs for display; classification labels pass through.
    if isinstance(prediction, float):
        prediction = round(prediction, 3)

    return prediction, "Prediction is done"
| 56 |
+
|
| 57 |
+
def predict_file(file, property_name):
    """Run batch prediction over an uploaded CSV of SMILES.

    Args:
        file: path of the uploaded file (already checked by `validate_file`
            to be a csv containing a "smiles" column).
        property_name: key into `candidate_models` / `task_types`.

    Returns:
        (predict_button_update, download_button_update, file_path, status),
        matching the Gradio outputs wired in `build_inference`.
    """
    adapter_id = candidate_models[property_name]
    info = model.swith_adapter(property_name, adapter_id)

    # Map each adapter-switch outcome to a user-facing status message; only
    # "keep" and "switched" are allowed to proceed to prediction.
    status_messages = {
        "keep": "Adapter is the same as the current one",
        "switched": "Adapter is switched successfully",
        "error": "Adapter is not found",
    }
    running_status = status_messages.get(info, "Unknown error")
    if info not in ("keep", "switched"):
        return None, None, file, running_status

    # The file was validated on upload, so it contains a "smiles" column.
    df = pd.read_csv(file)
    df = model.predict_file(df, task_types[property_name])

    # Save the predictions to disk with a "_prediction" suffix so the
    # download button can serve the file and `clear_file` can find it later.
    prediction_file = file.replace(".csv", "_prediction.csv") if file.endswith(".csv") else file.replace(".smi", "_prediction.csv")
    df.to_csv(prediction_file, index=False)

    return gr.update(visible=False), gr.DownloadButton(label="Download", value=prediction_file, visible=True), prediction_file, "Prediction is done"
| 87 |
+
|
| 88 |
+
def validate_file(file):
    """Validate an uploaded molecule file before enabling batch prediction.

    Only ".csv" files that contain a "smiles" column and at most 100 rows are
    accepted (".smi" uploads are currently rejected).

    Returns:
        (status_message, file_or_None, predict_button_update, download_button_update)
    """
    def _reject(message):
        # Clear the file input and keep the predict/download buttons hidden.
        return message, None, gr.update(visible=False), gr.update(visible=False)

    try:
        if file.endswith(".csv"):
            df = pd.read_csv(file)
            if "smiles" not in df.columns:
                return _reject("Invalid file content. The csv file must contain column named 'smiles'")
            # Number of molecules, checked against the space's limit below.
            length = len(df["smiles"])
        else:
            # ".smi" (and any other extension) is not supported.
            return _reject("Invalid file extension")
    except Exception:
        return _reject("Invalid file content.")

    if length > 100:
        return _reject("The space does not support the file containing more than 100 SMILES")

    return "Valid file", file, gr.update(visible=True), gr.update(visible=False)
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def raise_error(status):
    """Surface any non-"Valid file" validation status as a Gradio error dialog."""
    if status == "Valid file":
        return None
    raise gr.Error(status)
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def clear_file(download_button):
    """Clean up when the file input is cleared.

    Deletes the last prediction file (whose path is held by the download
    button) and, if present, the original upload it was derived from.

    Args:
        download_button: current download-button value, i.e. the prediction
            file path (or None if no prediction was produced).

    Returns:
        Updates hiding the predict/download buttons and clearing the file input.
    """
    prediction_path = download_button
    if prediction_path and os.path.exists(prediction_path):
        os.remove(prediction_path)
        # The upload shares the prediction file's stem; try both source extensions.
        for original_data_file in (prediction_path.replace("_prediction.csv", ".csv"),
                                   prediction_path.replace("_prediction.csv", ".smi")):
            if os.path.exists(original_data_file):
                os.remove(original_data_file)

    return gr.update(visible=False), gr.update(visible=False), None
| 144 |
+
|
| 145 |
+
def build_inference():
    """Assemble the Gradio Blocks UI and wire up all event handlers."""
    with gr.Blocks() as demo:
        # Property selector with a read-only description of the selected task.
        dropdown = gr.Dropdown(properties, label="Property", value=properties[0])
        description_box = gr.Textbox(label="Property description", lines=5,
                                     interactive=False,
                                     value=dataset_descriptions[properties[0]])
        # Single-SMILES input, prediction label, and a status console.
        with gr.Row(equal_height=True):
            with gr.Column():
                textbox = gr.Textbox(label="Molecule SMILES", type="text", placeholder="Provide a SMILES string here",
                                     lines=1)
                predict_single_smiles_button = gr.Button("Predict", size='sm')
                prediction = gr.Label("Prediction will appear here")

            running_terminal_label = gr.Textbox(label="Running status", type="text", placeholder=None, lines=10, interactive=False)

        # Batch prediction widgets; predict/download stay hidden until a valid
        # file is uploaded / a prediction file exists.
        input_file = gr.File(label="Molecule file",
                             file_count='single',
                             file_types=[".smi", ".csv"], height=300)
        predict_file_button = gr.Button("Predict", size='sm', visible=False)
        download_button = gr.DownloadButton("Download", size='sm', visible=False)

        # dropdown change event: refresh the description text
        dropdown.change(get_description, inputs=dropdown, outputs=description_box)
        # predict single button click event
        predict_single_smiles_button.click(predict_single_label, inputs=[textbox, dropdown], outputs=[prediction, running_terminal_label])
        # input file upload event: validate, then pop an error dialog on failure
        file_status = gr.State()
        input_file.upload(fn=validate_file, inputs=input_file, outputs=[file_status, input_file, predict_file_button, download_button]).success(raise_error, inputs=file_status, outputs=file_status)
        # input file clear event: delete temp files and hide the buttons
        input_file.clear(fn=clear_file, inputs=[download_button], outputs=[predict_file_button, download_button, input_file])
        # predict file button click event
        predict_file_button.click(predict_file, inputs=[input_file, dropdown], outputs=[predict_file_button, download_button, input_file, running_terminal_label])

    return demo
| 183 |
+
|
| 184 |
+
|
| 185 |
+
# Build the UI at import time (Hugging Face Spaces imports this module and
# serves `demo`); launching directly is only needed for local runs.
demo = build_inference()

if __name__ == '__main__':
    demo.launch()
|
dataset_descriptions.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"ADMET_Caco2_Wang": {
|
| 3 |
+
"task_type": "regression",
|
| 4 |
+
"description": "predict drug permeability, measured in cm/s, using the Caco-2 cell line as an in vitro model to simulate human intestinal tissue permeability",
|
| 5 |
+
"num_molecules": 906
|
| 6 |
+
},
|
| 7 |
+
"ADMET_Bioavailability_Ma": {
|
| 8 |
+
"task_type": "classification",
|
| 9 |
+
"description": "predict oral bioavailability with binary labels, indicating the rate and extent a drug becomes available at its site of action",
|
| 10 |
+
"num_molecules": 640
|
| 11 |
+
},
|
| 12 |
+
"ADMET_Lipophilicity_AstraZeneca": {
|
| 13 |
+
"task_type": "regression",
|
| 14 |
+
"description": "predict lipophilicity with continuous labels, measured as a log-ratio, indicating a drug's ability to dissolve in lipid environments",
|
| 15 |
+
"num_molecules": 4200
|
| 16 |
+
},
|
| 17 |
+
"ADMET_Solubility_AqSolDB": {
|
| 18 |
+
"task_type": "regression",
|
| 19 |
+
"description": "predict aqueous solubility with continuous labels, measured in log mol/L, indicating a drug's ability to dissolve in water",
|
| 20 |
+
"num_molecules": 9982
|
| 21 |
+
},
|
| 22 |
+
"ADMET_HIA_Hou": {
|
| 23 |
+
"task_type": "classification",
|
| 24 |
+
"description": "predict human intestinal absorption (HIA) with binary labels, indicating a drug's ability to be absorbed into the bloodstream",
|
| 25 |
+
"num_molecules": 578
|
| 26 |
+
},
|
| 27 |
+
"ADMET_Pgp_Broccatelli": {
|
| 28 |
+
"task_type": "classification",
|
| 29 |
+
"description": "predict P-glycoprotein (Pgp) inhibition with binary labels, indicating a drug's potential to alter bioavailability and overcome multidrug resistance",
|
| 30 |
+
"num_molecules": 1212
|
| 31 |
+
},
|
| 32 |
+
"ADMET_BBB_Martins": {
|
| 33 |
+
"task_type": "classification",
|
| 34 |
+
"description": "predict blood-brain barrier permeability with binary labels, indicating a drug's ability to penetrate the barrier to reach the brain",
|
| 35 |
+
"num_molecules": 1915
|
| 36 |
+
},
|
| 37 |
+
"ADMET_PPBR_AZ": {
|
| 38 |
+
"task_type": "regression",
|
| 39 |
+
"description": "predict plasma protein binding rate with continuous labels, indicating the percentage of a drug bound to plasma proteins in the blood",
|
| 40 |
+
"num_molecules": 1797
|
| 41 |
+
},
|
| 42 |
+
"ADMET_VDss_Lombardo": {
|
| 43 |
+
"task_type": "regression",
|
| 44 |
+
"description": "predict the volume of distribution at steady state (VDss), indicating drug concentration in tissues versus blood",
|
| 45 |
+
"num_molecules": 1130
|
| 46 |
+
},
|
| 47 |
+
"ADMET_CYP2C9_Veith": {
|
| 48 |
+
"task_type": "classification",
|
| 49 |
+
"description": "predict CYP2C9 inhibition with binary labels, indicating the drug's ability to inhibit the CYP2C9 enzyme involved in metabolism",
|
| 50 |
+
"num_molecules": 12092
|
| 51 |
+
},
|
| 52 |
+
"ADMET_CYP2D6_Veith": {
|
| 53 |
+
"task_type": "classification",
|
| 54 |
+
"description": "predict CYP2D6 inhibition with binary labels, indicating the drug's potential to inhibit the CYP2D6 enzyme involved in metabolism",
|
| 55 |
+
"num_molecules": 13130
|
| 56 |
+
},
|
| 57 |
+
"ADMET_CYP3A4_Veith": {
|
| 58 |
+
"task_type": "classification",
|
| 59 |
+
"description": "predict CPY3A4 inhibition with binary labels, indicating the drug's ability to inhibit the CPY3A4 enzyme involved in metabolism",
|
| 60 |
+
"num_molecules": 12328
|
| 61 |
+
},
|
| 62 |
+
"ADMET_CYP2C9_Substrate_CarbonMangels": {
|
| 63 |
+
"task_type": "classification",
|
| 64 |
+
"description": "predict whether a drug is a substrate of the CYP2C9 enzyme with binary labels, indicating its potential to be metabolized",
|
| 65 |
+
"num_molecules": 666
|
| 66 |
+
},
|
| 67 |
+
"ADMET_CYP2D6_Substrate_CarbonMangels": {
|
| 68 |
+
"task_type": "classification",
|
| 69 |
+
"description": "predict whether a drug is a substrate of the CYP2D6 enzyme with binary labels, indicating its potential to be metabolized",
|
| 70 |
+
"num_molecules": 664
|
| 71 |
+
},
|
| 72 |
+
"ADMET_CYP3A4_Substrate_CarbonMangels": {
|
| 73 |
+
"task_type": "classification",
|
| 74 |
+
"description": "predict whether a drug is a substrate of the CYP3A4 enzyme with binary labels, indicating its potential to be metabolized",
|
| 75 |
+
"num_molecules": 667
|
| 76 |
+
},
|
| 77 |
+
"ADMET_Half_Life_Obach": {
|
| 78 |
+
"task_type": "regression",
|
| 79 |
+
"description": "predict the half-life duration of a drug, measured in hours, indicating the time for its concentration to reduce by half",
|
| 80 |
+
"num_molecules": 667
|
| 81 |
+
},
|
| 82 |
+
"ADMET_Clearance_Hepatocyte_AZ": {
|
| 83 |
+
"task_type": "regression",
|
| 84 |
+
"description": "predict drug clearance, measured in \u03bcL/min/10^6 cells, from hepatocyte experiments, indicating the rate at which the drug is removed from body",
|
| 85 |
+
"num_molecules": 1020
|
| 86 |
+
},
|
| 87 |
+
"ADMET_Clearance_Microsome_AZ": {
|
| 88 |
+
"task_type": "regression",
|
| 89 |
+
"description": "predict drug clearance, measured in mL/min/g, from microsome experiments, indicating the rate at which the drug is removed from body",
|
| 90 |
+
"num_molecules": 1102
|
| 91 |
+
},
|
| 92 |
+
"ADMET_LD50_Zhu": {
|
| 93 |
+
"task_type": "regression",
|
| 94 |
+
"description": "predict the acute toxicity of a drug, measured as the dose leading to lethal effects in log(kg/mol)",
|
| 95 |
+
"num_molecules": 7385
|
| 96 |
+
},
|
| 97 |
+
"ADMET_hERG": {
|
| 98 |
+
"task_type": "classification",
|
| 99 |
+
"description": "predict whether a drug blocks the hERG channel, which is crucial for heart rhythm, potentially leading to adverse effects",
|
| 100 |
+
"num_molecules": 648
|
| 101 |
+
},
|
| 102 |
+
"ADMET_AMES": {
|
| 103 |
+
"task_type": "classification",
|
| 104 |
+
"description": "predict whether a drug is mutagenic with binary labels, indicating its ability to induce genetic alterations",
|
| 105 |
+
"num_molecules": 7255
|
| 106 |
+
},
|
| 107 |
+
"ADMET_DILI": {
|
| 108 |
+
"task_type": "classification",
|
| 109 |
+
"description": "predict whether a drug can cause liver injury with binary labels, indicating its potential for hepatotoxicity",
|
| 110 |
+
"num_molecules": 475
|
| 111 |
+
}
|
| 112 |
+
}
|
utils.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
|
| 2 |
+
from typing import Optional, Dict, Sequence, List
|
| 3 |
+
import transformers
|
| 4 |
+
from peft import PeftModel
|
| 5 |
+
import torch
|
| 6 |
+
from torch.nn.utils.rnn import pad_sequence
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from datasets import Dataset
|
| 10 |
+
from tqdm import tqdm
|
| 11 |
+
import numpy as np
|
| 12 |
+
from huggingface_hub import hf_hub_download
|
| 13 |
+
import os
|
| 14 |
+
import pickle
|
| 15 |
+
from sklearn import preprocessing
|
| 16 |
+
import json
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
from rdkit import RDLogger, Chem
|
| 20 |
+
# Suppress RDKit INFO messages
|
| 21 |
+
RDLogger.DisableLog('rdApp.*')
|
| 22 |
+
|
| 23 |
+
# we have a dictionary to store the task types of the models
|
| 24 |
+
# Task type ("regression" or "classification") per adapter repo name.
# NOTE(review): only two adapters are listed, but app.py's get_models()
# requires an entry for every model in the collection — confirm this dict
# covers all published adapters.
task_types = {
    "admet_ppbr_az": "regression",
    "admet_half_life_obach": "regression",
}
| 28 |
+
|
| 29 |
+
# Read the dataset metadata and build the lowercase-keyed description strings
# shown in the UI.
with open("dataset_descriptions.json", "r") as f:
    dataset_description_temp = json.load(f)

dataset_descriptions = {
    name.lower(): (
        f"{name.lower()} is a {info['task_type']} task, "
        f"where the goal is to {info['description']}."
    )
    for name, info in dataset_description_temp.items()
}
| 40 |
+
|
| 41 |
+
class Scaler:
    """Target scaler: shift to non-negative, optional log10, then standardize.

    Targets are shifted by the minimum seen during `fit` (if negative),
    optionally mapped through log10(y + 1), and standardized with sklearn's
    StandardScaler. `inverse_transform` undoes the whole pipeline.
    """

    def __init__(self, log=False):
        self.log = log        # whether to apply the log10(y + 1) transform
        self.offset = None    # shift making the training targets non-negative
        self.scaler = None    # fitted sklearn StandardScaler

    def fit(self, y):
        """Fit the offset and the standardizer on the training targets."""
        # Shift so the smallest value maps to zero (no-op when all are >= 0).
        self.offset = np.min([np.min(y), 0.0])
        shifted = y.reshape(-1, 1) - self.offset

        if self.log:
            shifted = np.log10(shifted + 1.0)

        self.scaler = preprocessing.StandardScaler().fit(shifted)

    def transform(self, y):
        """Apply offset, optional log, and standardization to `y`."""
        shifted = y.reshape(-1, 1) - self.offset

        if self.log:
            shifted = np.log10(shifted + 1.0)

        return self.scaler.transform(shifted)

    def inverse_transform(self, y_scale):
        """Map standardized predictions back to the original target space."""
        values = self.scaler.inverse_transform(y_scale.reshape(-1, 1))

        if self.log:
            values = 10.0**values - 1.0

        return values + self.offset
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
    non_special_tokens = None,
):
    """Resize tokenizer and embedding.

    Adds the given special (and optional non-special) tokens to the tokenizer,
    resizes the model's input embeddings to match, and initializes any new
    embedding rows with the mean of the pre-existing ones.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    # Register the new tokens. Guard against `non_special_tokens=None`, which
    # `add_tokens` does not accept. The return values are deliberately unused:
    # the authoritative new-token count is recomputed from len(tokenizer) below.
    tokenizer.add_special_tokens(special_tokens_dict)
    tokenizer.add_tokens(non_special_tokens or [])
    num_old_tokens = model.get_input_embeddings().weight.shape[0]
    num_new_tokens = len(tokenizer) - num_old_tokens
    if num_new_tokens == 0:
        return

    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings_data = model.get_input_embeddings().weight.data

        # Initialize the new rows with the mean of the existing embeddings so
        # fine-tuning starts from a sensible point rather than random values.
        input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings_data[-num_new_tokens:] = input_embeddings_avg
    print(f"Resized tokenizer and embedding from {num_old_tokens} to {len(tokenizer)} tokens.")
| 105 |
+
|
| 106 |
+
@dataclass
class DataCollator(object):
    """Collate SMILES examples into padded token batches for the model.

    Each SMILES is canonicalized with RDKit and wrapped between
    `molecule_start_str` and `end_str` before tokenization.
    """
    tokenizer: transformers.PreTrainedTokenizer
    source_max_len: int        # truncation length for the tokenized sources
    molecule_start_str: str    # marker prepended before each molecule
    end_str: str               # marker appended after each molecule

    def augment_molecule(self, molecule: str) -> str:
        # NOTE(review): `self.sme` is never defined on this class, so calling
        # this would raise AttributeError — it looks like leftover training
        # code (SMILES enumeration/augmentation); confirm before use.
        return self.sme.augment([molecule])[0]

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Build input_ids / attention_mask tensors from a list of examples."""
        sources = []
        targets = []

        for example in instances:
            smiles = example['smiles'].strip()
            # Canonicalize. Assumes the SMILES was validated upstream:
            # MolFromSmiles returns None for invalid input and MolToSmiles
            # would then raise.
            smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles))

            # get the properties except the smiles and mol_id cols
            #props = [example[col] if example[col] is not None else np.nan for col in sorted(example.keys()) if col not in ['smiles', 'is_aug']]
            source = f"{self.molecule_start_str}{smiles}{self.end_str}"
            sources.append(source)

        # Tokenize (no special tokens: the markers above already frame the input)
        tokenized_sources_with_prompt = self.tokenizer(
            sources,
            max_length=self.source_max_len,
            truncation=True,
            add_special_tokens=False,
        )
        input_ids = [torch.tensor(tokenized_source) for tokenized_source in tokenized_sources_with_prompt['input_ids']]
        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)

        data_dict = {
            'input_ids': input_ids,
            # Mask out the padding positions added by pad_sequence.
            'attention_mask': input_ids.ne(self.tokenizer.pad_token_id),
        }

        return data_dict
| 146 |
+
|
| 147 |
+
class MolecularPropertyPredictionModel():
    """ChemFM-3B sequence-classification model with hot-swappable LoRA adapters.

    One base model is loaded on CPU at construction; per-property LoRA
    adapters (and, for regression tasks, their target scalers) are loaded on
    demand via `swith_adapter`.
    """

    def __init__(self):
        # Name of the currently loaded adapter (None until the first switch).
        self.adapter_name = None

        # Cache of downloaded scaler paths per adapter, so the same scaler is
        # not downloaded from the Hub more than once.
        self.adapter_scaler_path = dict()

        DEFAULT_PAD_TOKEN = "[PAD]"

        # load the base model
        config = AutoConfig.from_pretrained(
            "ChemFM/ChemFM-3B",
            num_labels=1,
            finetuning_task="classification",  # this is not about our task type
            trust_remote_code=True,
        )

        self.base_model = AutoModelForSequenceClassification.from_pretrained(
            "ChemFM/ChemFM-3B",
            config=config,
            device_map="cpu",
            trust_remote_code=True,
        )

        # load the tokenizer (shared by all adapters)
        self.tokenizer = AutoTokenizer.from_pretrained(
            "ChemFM/admet_ppbr_az",
            trust_remote_code=True,
        )
        special_tokens_dict = dict(pad_token=DEFAULT_PAD_TOKEN)
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=special_tokens_dict,
            tokenizer=self.tokenizer,
            model=self.base_model
        )
        self.base_model.config.pad_token_id = self.tokenizer.pad_token_id

        self.data_collator = DataCollator(
            tokenizer=self.tokenizer,
            source_max_len=512,
            molecule_start_str="<molstart>",
            end_str="<eos>",
        )

    def swith_adapter(self, adapter_name, adapter_id):
        """Load the LoRA adapter (and scaler, if any) for `adapter_name`.

        Returns:
            "keep":     adapter is the same as the current one
            "switched": adapter was switched successfully
            "error":    adapter (or its scaler) could not be loaded
        """
        if adapter_name == self.adapter_name:
            return "keep"
        # switch adapter
        try:
            self.lora_model = PeftModel.from_pretrained(self.base_model, adapter_id)
            if adapter_name not in self.adapter_scaler_path:
                self.adapter_scaler_path[adapter_name] = hf_hub_download(adapter_id, filename="scaler.pkl")
            if os.path.exists(self.adapter_scaler_path[adapter_name]):
                with open(self.adapter_scaler_path[adapter_name], "rb") as f:
                    self.scaler = pickle.load(f)
            else:
                self.scaler = None

            # Record the new name only after everything loaded successfully,
            # so a failed switch cannot masquerade as "keep" on a retry.
            self.adapter_name = adapter_name
            return "switched"
        except Exception:
            # Adapter repo missing, download failure, or bad scaler file.
            return "error"

    def predict(self, valid_df, task_type):
        """Run the current adapter over a dataframe of valid SMILES.

        Args:
            valid_df: dataframe whose 'smiles' entries are all valid.
            task_type: "regression" or "classification".

        Returns:
            numpy array of shape (n, 1): values (inverse-scaled when a scaler
            is available) for regression, 0/1 labels for classification.
        """
        test_dataset = Dataset.from_pandas(valid_df)
        # construct the dataloader
        test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=4,
            collate_fn=self.data_collator,
        )

        y_pred = []
        for i, batch in tqdm(enumerate(test_loader), total=len(test_loader), desc="Evaluating"):
            with torch.no_grad():
                batch = {k: v.to(self.lora_model.device) for k, v in batch.items()}
                outputs = self.lora_model(**batch)
                if task_type == "regression":
                    y_pred.append(outputs.logits.cpu().detach().numpy())
                else:
                    # Binary label thresholded from the single logit.
                    y_pred.append((torch.sigmoid(outputs.logits) > 0.5).cpu().detach().numpy())

        y_pred = np.concatenate(y_pred, axis=0)
        if task_type == "regression" and self.scaler is not None:
            # Map standardized model outputs back to the original target units.
            y_pred = self.scaler.inverse_transform(y_pred)

        return y_pred

    def predict_single_smiles(self, smiles, task_type):
        """Predict one SMILES; returns a scalar, or None if the SMILES is invalid."""
        assert task_type in ["regression", "classification"]

        # check the SMILES string is valid
        if not Chem.MolFromSmiles(smiles):
            return None

        valid_df = pd.DataFrame([smiles], columns=['smiles'])
        results = self.predict(valid_df, task_type)
        return results.item()

    def predict_file(self, df, task_type):
        """Predict every valid SMILES in `df`; invalid rows get NaN.

        Args:
            df: dataframe with a 'smiles' column (may contain invalid entries).
            task_type: "regression" or "classification".

        Returns:
            The input dataframe with an added 'prediction' column, original
            row order preserved.
        """
        # Materialize the row order as a temporary 'index' column.
        df = df.reset_index()
        # Partition rows into valid and invalid SMILES.
        valid_idx = []
        invalid_idx = []
        for idx, smiles in enumerate(df['smiles']):
            if Chem.MolFromSmiles(smiles):
                valid_idx.append(idx)
            else:
                invalid_idx.append(idx)
        valid_df = df.loc[valid_idx]
        # get the smiles list
        valid_df_smiles = valid_df['smiles'].tolist()

        input_df = pd.DataFrame(valid_df_smiles, columns=['smiles'])
        results = self.predict(input_df, task_type)

        # add the results to the dataframe; invalid rows get NaN
        df.loc[valid_idx, 'prediction'] = results
        df.loc[invalid_idx, 'prediction'] = np.nan

        # drop the temporary index column
        df = df.drop(columns=['index'])

        return df