wangjin2000 committed
Commit 2334c10 · verified · 1 Parent(s): d17119b

Create app.py

Files changed (1)
1. app.py +383 -0
app.py ADDED
@@ -0,0 +1,383 @@
#ref: https://huggingface.co/blog/AmelieSchreiber/esmbind
import gradio as gr

import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
#import wandb
import numpy as np
import torch
import torch.nn as nn
import pickle
from datetime import datetime
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    roc_auc_score,
    matthews_corrcoef
)
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)

from datasets import Dataset
from accelerate import Accelerator
# Imports specific to the custom PEFT LoRA model
from peft import PeftModel, get_peft_model, LoraConfig, TaskType

from plot_pdb import plot_struc

def suggest(option):
    if option == "Plastic degradation protein":
        suggestion = "MGSSHHHHHHSSGLVPRGSHMRGPNPTAASLEASAGPFTVRSFTVSRPSGYGAGTVYYPTNAGGTVGAIAIVPGYTARQSSIKWWGPRLASHGFVVITIDTNSTLDQPSSRSSQQMAALRQVASLNGTSSSPIYGKVDTARMGVMGWSMGGGGSLISAANNPSLKAAAPQAPWDSSTNFSSVTVPTLIFACENDSIAPVNSSALPIYDSMSRNAKQFLEINGGSHSCANSGNSNQALIGKKGVAWMKRFMDNDTRYSTFACENPNSTRVSDFRTANCSLEDPAANKARKEAELAAATAEQ"
    elif option == "Default protein":
        #suggestion = "MAPLRKTYVLKLYVAGNTPNSVRALKTLNNILEKEFKGVYALKVIDVLKNPQLAEEDKILATPTLAKVLPPPVRRIIGDLSNREKVLIGLDLLYEEIGDQAEDDLGLE"
        suggestion = "MAVPETRPNHTIYINNLNEKIKKDELKKSLHAIFSRFGQILDILVSRSLKMRGQAFVIFKEVSSATNALRSMQGFPFYDKPMRIQYAKTDSDIIAKMKGT"
    elif option == "Antifreeze protein":
        suggestion = "QCTGGADCTSCTGACTGCGNCPNAVTCTNSQHCVKANTCTGSTDCNTAQTCTNSKDCFEANTCTDSTNCYKATACTNSSGCPGH"
    elif option == "AI Generated protein":
        suggestion = "MSGMKKLYEYTVTTLDEFLEKLKEFILNTSKDKIYKLTITNPKLIKDIGKAIAKAAEIADVDPKEIEEMIKAVEENELTKLVITIEQTDDKYVIKVELENEDGLVHSFEIYFKNKEEMEKFLELLEKLISKLSGS"
    elif option == "7-bladed propeller fold":
        suggestion = "VKLAGNSSLCPINGWAVYSKDNSIRIGSKGDVFVIREPFISCSHLECRTFFLTQGALLNDKHSNGTVKDRSPHRTLMSCPVGEAPSPYNSRFESVAWSASACHDGTSWLTIGISGPDNGAVAVLKYNGIITDTIKSWRNNILRTQESECACVNGSCFTVMTDGPSNGQASYKIFKMEKGKVVKSVELDAPNYHYEECSCYPNAGEITCVCRDNWHGSNRPWVSFNQNLEYQIGYICSGVFGDNPRPNDGTGSCGPVSSNGAYGVKGFSFKYGNGVWIGRTKSTNSRSGFEMIWDPNGWTETDSSFSVKQDIVAITDWSGYSGSFVQHPELTGLDCIRPCFWVELIRGRPKESTIWTSGSSISFCGVNSDTVGWSWPDGAELPFTIDK"
    else:
        suggestion = ""
    return suggestion

# Helper Functions and Data Preparation
def truncate_labels(labels, max_length):
    """Truncate labels to the specified max_length."""
    return [label[:max_length] for label in labels]
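# A quick illustration of the truncation (values are made up):
#   >>> truncate_labels([[0, 1, 0, 1], [1, 0]], 3)
#   [[0, 1, 0], [1, 0]]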

def compute_metrics(p):
    """Compute metrics for evaluation."""
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove padding (-100 labels)
    predictions = predictions[labels != -100].flatten()
    labels = labels[labels != -100].flatten()

    # Compute accuracy
    accuracy = accuracy_score(labels, predictions)

    # Compute precision, recall, F1 score, and AUC
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    auc = roc_auc_score(labels, predictions)

    # Compute MCC
    mcc = matthews_corrcoef(labels, predictions)

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1, 'auc': auc, 'mcc': mcc}
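# Note: the AUC above is computed from hard argmax predictions. A smoother
# alternative (not what this app does) would rank by the class-1 probability,
# e.g. softmax(logits, axis=-1)[..., 1] restricted to non-padding positions.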

def compute_loss(model, inputs):
    """Custom compute_loss function using the global class weights."""
    logits = model(**inputs).logits
    labels = inputs["labels"]
    loss_fct = nn.CrossEntropyLoss(weight=class_weights)
    active_loss = inputs["attention_mask"].view(-1) == 1
    active_logits = logits.view(-1, model.config.num_labels)
    active_labels = torch.where(
        active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
    )
    loss = loss_fct(active_logits, active_labels)
    return loss

# Define Custom Trainer Class
# Because of the imbalance between non-binding and binding residues, we train
# with class weights, which requires a custom weighted trainer.
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        loss = compute_loss(model, inputs)
        return (loss, outputs) if return_outputs else loss
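# For intuition: 'balanced' class weights are n_samples / (n_classes * count).
# Illustrative numbers only (not taken from the training data):
#   >>> compute_class_weight(class_weight='balanced', classes=[0, 1], y=[0] * 90 + [1] * 10)
#   array([0.5556, 5.0])   # the rare binding class is weighted ~9x higher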

# Predict binding site with finetuned PEFT model
def predict_bind(base_model_path, PEFT_model_path, input_seq):
    # Load the model
    base_model = AutoModelForTokenClassification.from_pretrained(base_model_path)
    loaded_model = PeftModel.from_pretrained(base_model, PEFT_model_path)

    # Ensure the model is in evaluation mode
    loaded_model.eval()

    # Tokenization
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)

    # Tokenize the sequence
    inputs = tokenizer(input_seq, return_tensors="pt", truncation=True, max_length=1024, padding='max_length')

    # Run the model
    with torch.no_grad():
        logits = loaded_model(**inputs).logits

    # Get predictions
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])  # Convert input ids back to tokens
    predictions = torch.argmax(logits, dim=2)

    binding_site = []
    pos = 0
    # Collect (and print) the predicted binding residues, skipping special tokens
    for token, prediction in zip(tokens, predictions[0].numpy()):
        if token not in ['<pad>', '<cls>', '<eos>']:
            pos += 1
            # print((pos, token, id2label[prediction]))  # debug: log every residue
            if prediction == 1:
                print((pos, token, id2label[prediction]))
                binding_site.append([pos, token, id2label[prediction]])

    return binding_site
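# Example call (model names are the dropdown options defined below):
#   predict_bind("facebook/esm2_t6_8M_UR50D",
#                "wangjin2000/esm2_t6_8M-lora-binding-sites_2024-07-02_09-26-54",
#                input_seq)
# returns a list of [position, residue, "Binding site"] entries.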

# fine-tuning function
def train_function_no_sweeps(base_model_path): #, train_dataset, test_dataset):

    # Set the LoRA config
    config = {
        "lora_alpha": 1, # try 0.5, 1, 2, ..., 16
        "lora_dropout": 0.2,
        "lr": 5.701568055793089e-04,
        "lr_scheduler_type": "cosine",
        "max_grad_norm": 0.5,
        "num_train_epochs": 1, # 3, jw 20240628
        "per_device_train_batch_size": 12,
        "r": 2,
        "weight_decay": 0.2,
        # Add other hyperparameters as needed
    }

    base_model = AutoModelForTokenClassification.from_pretrained(base_model_path, num_labels=len(id2label), id2label=id2label, label2id=label2id)

    # Tokenization
    tokenizer = AutoTokenizer.from_pretrained(base_model_path) #("facebook/esm2_t12_35M_UR50D")

    train_tokenized = tokenizer(train_sequences, padding=True, truncation=True, max_length=max_sequence_length, return_tensors="pt", is_split_into_words=False)
    test_tokenized = tokenizer(test_sequences, padding=True, truncation=True, max_length=max_sequence_length, return_tensors="pt", is_split_into_words=False)

    train_dataset = Dataset.from_dict({k: v for k, v in train_tokenized.items()}).add_column("labels", train_labels)
    test_dataset = Dataset.from_dict({k: v for k, v in test_tokenized.items()}).add_column("labels", test_labels)

    # Convert the model into a PeftModel
    peft_config = LoraConfig(
        task_type=TaskType.TOKEN_CLS,
        inference_mode=False,
        r=config["r"],
        lora_alpha=config["lora_alpha"],
        target_modules=["query", "key", "value"], # also try "dense_h_to_4h" and "dense_4h_to_h"
        lora_dropout=config["lora_dropout"],
        bias="none" # or "all" or "lora_only"
    )
    base_model = get_peft_model(base_model, peft_config)
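    # Optional sanity check via the PEFT helper, which reports how small the
    # trainable LoRA parameter set is relative to the frozen backbone:
    #   base_model.print_trainable_parameters()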

    # Use the accelerator
    base_model = accelerator.prepare(base_model)
    train_dataset = accelerator.prepare(train_dataset)
    test_dataset = accelerator.prepare(test_dataset)

    model_name_base = base_model_path.split("/")[1]
    timestamp = datetime.now().strftime('%Y-%m-%d_%H')

    # Training setup
    training_args = TrainingArguments(
        output_dir=f"{model_name_base}-lora-binding-sites_{timestamp}",
        learning_rate=config["lr"],
        lr_scheduler_type=config["lr_scheduler_type"],
        gradient_accumulation_steps=1,
        max_grad_norm=config["max_grad_norm"],
        per_device_train_batch_size=config["per_device_train_batch_size"],
        per_device_eval_batch_size=config["per_device_train_batch_size"],
        num_train_epochs=config["num_train_epochs"],
        weight_decay=config["weight_decay"],
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        push_to_hub=True, #jw 20240701 False,
        logging_dir=None,
        logging_first_step=False,
        logging_steps=200,
        save_total_limit=7,
        no_cuda=False,
        seed=8893,
        fp16=True,
        #report_to='wandb'
        report_to="none",
        hub_token=HF_TOKEN, #jw 20240701
    )

    # Initialize Trainer
    trainer = WeightedTrainer(
        model=base_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )

    # Train and Save Model
    trainer.train()
    save_path = training_args.output_dir
    trainer.save_model(save_path)

    return save_path

# Constants & Globals
HF_TOKEN = os.environ.get("HF_token")
print("HF_TOKEN set:", HF_TOKEN is not None)  # avoid printing the secret itself

MODEL_OPTIONS = [
    "facebook/esm2_t6_8M_UR50D",
    "facebook/esm2_t12_35M_UR50D",
    "facebook/esm2_t33_650M_UR50D",
] # models users can choose from

PEFT_MODEL_OPTIONS = [
    "wangjin2000/esm2_t6_8M-lora-binding-sites_2024-07-02_09-26-54",
    "AmelieSchreiber/esm2_t12_35M_lora_binding_sites_v2_cp3",
] # finetuned models


# Load the data from pickle files (replace with your local paths)
with open("./datasets/train_sequences_chunked_by_family.pkl", "rb") as f:
    train_sequences = pickle.load(f)

with open("./datasets/test_sequences_chunked_by_family.pkl", "rb") as f:
    test_sequences = pickle.load(f)

with open("./datasets/train_labels_chunked_by_family.pkl", "rb") as f:
    train_labels = pickle.load(f)

with open("./datasets/test_labels_chunked_by_family.pkl", "rb") as f:
    test_labels = pickle.load(f)

max_sequence_length = 1000

# Directly truncate the entire list of labels
train_labels = truncate_labels(train_labels, max_sequence_length)
test_labels = truncate_labels(test_labels, max_sequence_length)

# Compute Class Weights
classes = [0, 1]
flat_train_labels = [label for sublist in train_labels for label in sublist]
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=flat_train_labels)
accelerator = Accelerator()
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(accelerator.device)

# Define label mappings
id2label = {0: "No binding site", 1: "Binding site"}
label2id = {v: k for k, v in id2label.items()}

'''
# debug result
debug_result = saved_path #predictions #class_weights
'''

demo = gr.Blocks(title="DEMO FOR ESM2Bind")

with demo:
    gr.Markdown("# DEMO FOR ESM2Bind")
    #gr.Textbox(debug_result)

    with gr.Column():
        gr.Markdown("## Select a base model and a corresponding PEFT finetuned model")

        with gr.Row():
            with gr.Column(scale=5, variant="compact"):
                base_model_name = gr.Dropdown(
                    choices=MODEL_OPTIONS,
                    value=MODEL_OPTIONS[0],
                    label="Base Model Name",
                    interactive=True,
                )
                PEFT_model_name = gr.Dropdown(
                    choices=PEFT_MODEL_OPTIONS,
                    value=PEFT_MODEL_OPTIONS[0],
                    label="PEFT Model Name",
                    interactive=True,
                )
            with gr.Column(scale=5, variant="compact"):
                name = gr.Dropdown(
                    label="Choose a Sample Protein",
                    value="Default protein",
                    choices=["Default protein", "Antifreeze protein", "Plastic degradation protein", "AI Generated protein", "7-bladed propeller fold", "custom"]
                )
        gr.Markdown(
            "## Predict binding site and plot structure for the selected protein sequence:"
        )
        with gr.Row():
            with gr.Column(variant="compact", scale=8):
                input_seq = gr.Textbox(
                    lines=1,
                    max_lines=12,
                    label="Protein sequence to be predicted:",
                    value="MAVPETRPNHTIYINNLNEKIKKDELKKSLHAIFSRFGQILDILVSRSLKMRGQAFVIFKEVSSATNALRSMQGFPFYDKPMRIQYAKTDSDIIAKMKGT",
                    placeholder="Paste your protein sequence here...",
                    interactive=True,
                )
                text_pos = gr.Textbox(
                    lines=1,
                    max_lines=12,
                    label="Sequence Position:",
                    placeholder=
                    "012345678911234567892123456789312345678941234567895123456789612345678971234567898123456789912345678901234567891123456789",
                    interactive=False,
                )
            with gr.Column(variant="compact", scale=2):
                predict_btn = gr.Button(
                    value="Predict binding site",
                    interactive=True,
                    variant="primary",
                )
                plot_struc_btn = gr.Button(value="Plot ESMFold Predicted Structure", variant="primary")
        with gr.Row():
            with gr.Column(variant="compact", scale=5):
                output_text = gr.Textbox(
                    lines=1,
                    max_lines=12,
                    label="Output",
                    placeholder="Output",
                )
            with gr.Column(variant="compact", scale=5):
                finetune_button = gr.Button(
                    value="Finetune Pre-trained Model",
                    interactive=True,
                    variant="primary",
                )
        with gr.Row():
            output_viewer = gr.HTML()
            output_file = gr.File(
                label="Download as Text File",
                file_count="single",
                type="filepath",
                interactive=False,
            )

    # select protein sample
    name.change(fn=suggest, inputs=name, outputs=input_seq)

    # "Predict binding site" actions
    predict_btn.click(
        fn=predict_bind,
        inputs=[base_model_name, PEFT_model_name, input_seq],
        outputs=[output_text],
    )

    # "Finetune Pre-trained Model" actions
    finetune_button.click(
        fn=train_function_no_sweeps,
        inputs=[base_model_name],
        outputs=[output_text],
    )

    # plot protein structure
    plot_struc_btn.click(fn=plot_struc, inputs=input_seq, outputs=[output_file, output_viewer])


demo.launch()
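
# When running locally rather than in a Space, a shareable public link can be
# requested with the standard Gradio flag (optional): demo.launch(share=True)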