wangjin2000 committed on
Commit
cdb0d81
·
verified ·
1 Parent(s): 519edc2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -137
app.py CHANGED
@@ -3,10 +3,13 @@ import gradio as gr
3
 
4
  import os
5
  from transformers import Trainer, TrainingArguments, AutoTokenizer, EsmForMaskedLM, TrainerCallback
6
- from torch.utils.data import DataLoader, Dataset, RandomSampler
7
- import pandas as pd
8
  import torch
 
9
  from torch.optim import AdamW
 
 
 
 
10
  #import wandb
11
  import numpy as np
12
  from datetime import datetime
@@ -114,17 +117,25 @@ def compute_pseudo_perplexity(model, tokenizer, protein_seq, binder_seq):
114
  sequence = protein_seq + binder_seq
115
  original_input = tokenizer.encode(sequence, return_tensors='pt').to(model.device)
116
  length_of_binder = len(binder_seq)
117
-
 
 
 
118
  # Prepare a batch with each row having one masked token from the binder sequence
119
  masked_inputs = original_input.repeat(length_of_binder, 1)
120
  positions_to_mask = torch.arange(-length_of_binder - 1, -1, device=model.device)
121
-
 
 
122
  masked_inputs[torch.arange(length_of_binder), positions_to_mask] = tokenizer.mask_token_id
123
-
 
124
  # Prepare labels for the masked tokens
125
  labels = torch.full_like(masked_inputs, -100)
 
126
  labels[torch.arange(length_of_binder), positions_to_mask] = original_input[0, positions_to_mask]
127
-
 
128
  # Get model predictions and calculate loss
129
  with torch.no_grad():
130
  outputs = model(masked_inputs, labels=labels)
@@ -135,37 +146,7 @@ def compute_pseudo_perplexity(model, tokenizer, protein_seq, binder_seq):
135
  pseudo_perplexity = np.exp(avg_loss)
136
  return pseudo_perplexity
137
 
138
- # Alternative implementation: Use Loop
139
- def compute_pseudo_perplexity2(model, tokenizer, protein_seq, binder_seq):
140
- sequence = protein_seq + binder_seq
141
- tensor_input = tokenizer.encode(sequence, return_tensors='pt').to(model.device)
142
- total_loss = 0
143
-
144
- # Loop through each token in the binder sequence
145
- for i in range(-len(binder_seq)-1, -1):
146
- # Create a copy of the original tensor
147
- masked_input = tensor_input.clone()
148
-
149
- # Mask one token at a time
150
- masked_input[0, i] = tokenizer.mask_token_id
151
- # Create labels
152
- labels = torch.full(tensor_input.shape, -100).to(model.device)
153
- labels[0, i] = tensor_input[0, i]
154
-
155
- # Get model prediction and loss
156
- with torch.no_grad():
157
- outputs = model(masked_input, labels=labels)
158
- total_loss += outputs.loss.item()
159
-
160
- # Calculate the average loss
161
- avg_loss = total_loss / len(binder_seq)
162
-
163
- # Calculate pseudo perplexity
164
- pseudo_perplexity = np.exp(avg_loss)
165
- return pseudo_perplexity
166
-
167
-
168
- def generate_peptide_for_single_sequence(protein_seq, peptide_length = 15, top_k = 3, num_binders = 4):
169
 
170
  peptide_length = int(peptide_length)
171
  top_k = int(top_k)
@@ -212,9 +193,36 @@ def generate_peptide(input_seqs, peptide_length=15, top_k=3, num_binders=4):
212
  for binder, ppl in binders:
213
  results.append([seq, binder, ppl])
214
  return pd.DataFrame(results, columns=['Input Sequence', 'Binder', 'Pseudo Perplexity'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  def suggest(option):
216
- if option == "Plastic degradation protein":
217
- suggestion = "MGSSHHHHHHSSGLVPRGSHMRGPNPTAASLEASAGPFTVRSFTVSRPSGYGAGTVYYPTNAGGTVGAIAIVPGYTARQSSIKWWGPRLASHGFVVITIDTNSTLDQPSSRSSQQMAALRQVASLNGTSSSPIYGKVDTARMGVMGWSMGGGGSLISAANNPSLKAAAPQAPWDSSTNFSSVTVPTLIFACENDSIAPVNSSALPIYDSMSRNAKQFLEINGGSHSCANSGNSNQALIGKKGVAWMKRFMDNDTRYSTFACENPNSTRVSDFRTANCSLEDPAANKARKEAELAAATAEQ"
218
  elif option == "Default protein":
219
  #suggestion = "MAPLRKTYVLKLYVAGNTPNSVRALKTLNNILEKEFKGVYALKVIDVLKNPQLAEEDKILATPTLAKVLPPPVRRIIGDLSNREKVLIGLDLLYEEIGDQAEDDLGLE"
220
  suggestion = "MAVPETRPNHTIYINNLNEKIKKDELKKSLHAIFSRFGQILDILVSRSLKMRGQAFVIFKEVSSATNALRSMQGFPFYDKPMRIQYAKTDSDIIAKMKGT"
@@ -227,97 +235,6 @@ def suggest(option):
227
  else:
228
  suggestion = ""
229
  return suggestion
230
-
231
- # Helper Functions and Data Preparation
232
- def truncate_labels(labels, max_length):
233
- """Truncate labels to the specified max_length."""
234
- return [label[:max_length] for label in labels]
235
-
236
- def compute_metrics(p):
237
- """Compute metrics for evaluation."""
238
- predictions, labels = p
239
- predictions = np.argmax(predictions, axis=2)
240
-
241
- # Remove padding (-100 labels)
242
- predictions = predictions[labels != -100].flatten()
243
- labels = labels[labels != -100].flatten()
244
-
245
- # Compute accuracy
246
- accuracy = accuracy_score(labels, predictions)
247
-
248
- # Compute precision, recall, F1 score, and AUC
249
- precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
250
- auc = roc_auc_score(labels, predictions)
251
-
252
- # Compute MCC
253
- mcc = matthews_corrcoef(labels, predictions)
254
-
255
- return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1, 'auc': auc, 'mcc': mcc}
256
-
257
- def compute_loss(model, inputs):
258
- """Custom compute_loss function."""
259
- logits = model(**inputs).logits
260
- labels = inputs["labels"]
261
- loss_fct = nn.CrossEntropyLoss(weight=class_weights)
262
- active_loss = inputs["attention_mask"].view(-1) == 1
263
- active_logits = logits.view(-1, model.config.num_labels)
264
- active_labels = torch.where(
265
- active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
266
- )
267
- loss = loss_fct(active_logits, active_labels)
268
- return loss
269
-
270
- # Predict binding site with finetuned PEFT model
271
- def predict_bind(base_model_path,PEFT_model_path,input_seq):
272
- # Load the model
273
- base_model = AutoModelForTokenClassification.from_pretrained(base_model_path)
274
- loaded_model = PeftModel.from_pretrained(base_model, PEFT_model_path)
275
-
276
- # Ensure the model is in evaluation mode
277
- loaded_model.eval()
278
-
279
- # Tokenization
280
- tokenizer = AutoTokenizer.from_pretrained(base_model_path)
281
-
282
- # Tokenize the sequence
283
- inputs = tokenizer(input_seq, return_tensors="pt", truncation=True, max_length=1024, padding='max_length')
284
-
285
- # Run the model
286
- with torch.no_grad():
287
- logits = loaded_model(**inputs).logits
288
-
289
- # Get predictions
290
- tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]) # Convert input ids back to tokens
291
- predictions = torch.argmax(logits, dim=2)
292
-
293
- binding_site=[]
294
- pos = 0
295
- # Print the predicted labels for each token
296
- for token, prediction in zip(tokens, predictions[0].numpy()):
297
- if token not in ['<pad>', '<cls>', '<eos>']:
298
- pos += 1
299
- print((pos, token, id2label[prediction]))
300
- if prediction == 1:
301
- print((pos, token, id2label[prediction]))
302
- binding_site.append([pos, token, id2label[prediction]])
303
-
304
- return binding_site
305
-
306
- MODEL_OPTIONS = [
307
- "facebook/esm2_t6_8M_UR50D",
308
- "facebook/esm2_t12_35M_UR50D",
309
- "facebook/esm2_t33_650M_UR50D",
310
- ] # models users can choose from
311
-
312
- PEFT_MODEL_OPTIONS = [
313
- "wangjin2000/esm2_t6_8M-lora-binding-sites_2024-07-02_09-26-54",
314
- "AmelieSchreiber/esm2_t12_35M_lora_binding_sites_v2_cp3",
315
- ] # finetuned models
316
-
317
- '''
318
- # debug result
319
- dubug_result = saved_path #predictions #class_weights
320
- '''
321
 
322
  demo = gr.Blocks(title="ESM2 for Protein-Protein Interaction (ESM2PPI)")
323
 
@@ -345,8 +262,8 @@ with demo:
345
  with gr.Column(scale=5, variant="compact"):
346
  name = gr.Dropdown(
347
  label="Choose a Sample Protein",
348
- value="Default protein",
349
- choices=["Default protein", "Antifreeze protein", "Plastic degradation protein", "AI Generated protein", "7-bladed propeller fold", "custom"]
350
  )
351
  gr.Markdown(
352
  "## Predict binding site and Plot structure for selected protein sequence:"
@@ -356,8 +273,8 @@ with demo:
356
  input_seq = gr.Textbox(
357
  lines=1,
358
  max_lines=12,
359
- label="Protein sequency to be predicted:",
360
- value="MAVPETRPNHTIYINNLNEKIKKDELKKSLHAIFSRFGQILDILVSRSLKMRGQAFVIFKEVSSATNALRSMQGFPFYDKPMRIQYAKTDSDIIAKMKGT",
361
  placeholder="Paste your protein sequence here...",
362
  interactive = True,
363
  )
@@ -371,7 +288,7 @@ with demo:
371
  )
372
  with gr.Column(variant="compact", scale = 2):
373
  predict_btn = gr.Button(
374
- value="Predict binding site",
375
  interactive=True,
376
  variant="primary",
377
  )
@@ -402,9 +319,9 @@ with demo:
402
  # select protein sample
403
  name.change(fn=suggest, inputs=name, outputs=input_seq)
404
 
405
- # "Predict binding site" actions
406
  predict_btn.click(
407
- fn = predict_bind,
408
  inputs=[base_model_name,PEFT_model_name,input_seq],
409
  outputs = [output_text],
410
  )
 
3
 
4
  import os
5
  from transformers import Trainer, TrainingArguments, AutoTokenizer, EsmForMaskedLM, TrainerCallback
 
 
6
  import torch
7
+ from torch.utils.data import DataLoader, Dataset, RandomSampler
8
  from torch.optim import AdamW
9
+ from torch.distributions import Categorical
10
+
11
+ import pandas as pd
12
+
13
  #import wandb
14
  import numpy as np
15
  from datetime import datetime
 
117
  sequence = protein_seq + binder_seq
118
  original_input = tokenizer.encode(sequence, return_tensors='pt').to(model.device)
119
  length_of_binder = len(binder_seq)
120
+ print("length of original_input:",len(original_input))
121
+ print("length of binder:",length_of_binder)
122
+ print("original_input:",original_input)
123
+
124
  # Prepare a batch with each row having one masked token from the binder sequence
125
  masked_inputs = original_input.repeat(length_of_binder, 1)
126
  positions_to_mask = torch.arange(-length_of_binder - 1, -1, device=model.device)
127
+ print("masked_inputs:",masked_inputs)
128
+ print("positions_to_mask:",positions_to_mask)
129
+
130
  masked_inputs[torch.arange(length_of_binder), positions_to_mask] = tokenizer.mask_token_id
131
+ print("masked_inputs tokens:",masked_inputs[torch.arange(length_of_binder), positions_to_mask])
132
+
133
  # Prepare labels for the masked tokens
134
  labels = torch.full_like(masked_inputs, -100)
135
+ print("labels:",labels)
136
  labels[torch.arange(length_of_binder), positions_to_mask] = original_input[0, positions_to_mask]
137
+ print("labels 137:",labels)
138
+
139
  # Get model predictions and calculate loss
140
  with torch.no_grad():
141
  outputs = model(masked_inputs, labels=labels)
 
146
  pseudo_perplexity = np.exp(avg_loss)
147
  return pseudo_perplexity
148
 
149
+ def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_length = 15, top_k = 3, num_binders = 4):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
  peptide_length = int(peptide_length)
152
  top_k = int(top_k)
 
193
  for binder, ppl in binders:
194
  results.append([seq, binder, ppl])
195
  return pd.DataFrame(results, columns=['Input Sequence', 'Binder', 'Pseudo Perplexity'])
196
+
197
# Predict peptide binder with finetuned model
def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_length=15, top_k=3, num_binders=4):
    """Generate candidate peptide binders for one or more protein sequences.

    Args:
        base_model_path: hub path / local dir of the base model; only its
            tokenizer is used here.
        finetuned_model_path: hub path / local dir of the finetuned
            masked-LM checkpoint used for generation.
        input_seqs: a single protein sequence (str) or a list of sequences.
        peptide_length: length of each generated binder peptide.
        top_k: sampling breadth forwarded to the per-sequence generator.
        num_binders: number of binder candidates per input sequence.

    Returns:
        pandas.DataFrame with columns ['Binder', 'Pseudo Perplexity'] when
        given a single sequence, or
        ['Input Sequence', 'Binder', 'Pseudo Perplexity'] when given a list.

    Raises:
        TypeError: if input_seqs is neither a str nor a list.
    """
    # Load the finetuned masked language model
    loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path)

    # Ensure the model is in evaluation mode (disables dropout etc.)
    loaded_model.eval()

    # Tokenizer always comes from the base model path
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)

    # NOTE(fix): the previous version assigned `resuls_df`, printed
    # `results_df`, and returned `result_df` — two of those names were
    # undefined, so the function always raised NameError. One consistent
    # name is used throughout now.
    if isinstance(input_seqs, str):  # Single sequence
        binders = generate_peptide_for_single_sequence(
            loaded_model, tokenizer, input_seqs, peptide_length, top_k, num_binders
        )
        results_df = pd.DataFrame(binders, columns=['Binder', 'Pseudo Perplexity'])
    elif isinstance(input_seqs, list):  # List of sequences
        results = []
        for seq in input_seqs:
            binders = generate_peptide_for_single_sequence(
                loaded_model, tokenizer, seq, peptide_length, top_k, num_binders
            )
            for binder, ppl in binders:
                results.append([seq, binder, ppl])
        results_df = pd.DataFrame(results, columns=['Input Sequence', 'Binder', 'Pseudo Perplexity'])
    else:
        raise TypeError("input_seqs must be a str or a list of str")

    return results_df
222
+
223
  def suggest(option):
224
+ if option == "Protein:P63279":
225
+ suggestion = "MSGIALSRLAQERKAWRKDHPFGFVAVPTKNPDGTMNLMNWECAIPGKKGTPWEGGLFKLRMLFKDDYPSSPPKCKFEPPLFHPNVYPSGTVCLSILEEDKDWRPAITIKQILLGIQELLNEPNIQDPAQAEAYTIYCQNRVEYEKRVRAQAKKFAPS"
226
  elif option == "Default protein":
227
  #suggestion = "MAPLRKTYVLKLYVAGNTPNSVRALKTLNNILEKEFKGVYALKVIDVLKNPQLAEEDKILATPTLAKVLPPPVRRIIGDLSNREKVLIGLDLLYEEIGDQAEDDLGLE"
228
  suggestion = "MAVPETRPNHTIYINNLNEKIKKDELKKSLHAIFSRFGQILDILVSRSLKMRGQAFVIFKEVSSATNALRSMQGFPFYDKPMRIQYAKTDSDIIAKMKGT"
 
235
  else:
236
  suggestion = ""
237
  return suggestion
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
 
239
  demo = gr.Blocks(title="ESM2 for Protein-Protein Interaction (ESM2PPI)")
240
 
 
262
  with gr.Column(scale=5, variant="compact"):
263
  name = gr.Dropdown(
264
  label="Choose a Sample Protein",
265
+ value="Protein:P63279",
266
+ choices=["Default protein", "Antifreeze protein", "Protein:P63279", "AI Generated protein", "7-bladed propeller fold", "custom"]
267
  )
268
  gr.Markdown(
269
  "## Predict binding site and Plot structure for selected protein sequence:"
 
273
  input_seq = gr.Textbox(
274
  lines=1,
275
  max_lines=12,
276
+ label="Protein:P63279 to be predicted:",
277
+ value="MSGIALSRLAQERKAWRKDHPFGFVAVPTKNPDGTMNLMNWECAIPGKKGTPWEGGLFKLRMLFKDDYPSSPPKCKFEPPLFHPNVYPSGTVCLSILEEDKDWRPAITIKQILLGIQELLNEPNIQDPAQAEAYTIYCQNRVEYEKRVRAQAKKFAPS",
278
  placeholder="Paste your protein sequence here...",
279
  interactive = True,
280
  )
 
288
  )
289
  with gr.Column(variant="compact", scale = 2):
290
  predict_btn = gr.Button(
291
+ value="Predict peptide sequence",
292
  interactive=True,
293
  variant="primary",
294
  )
 
319
  # select protein sample
320
  name.change(fn=suggest, inputs=name, outputs=input_seq)
321
 
322
+ # "Predict peptide sequence" actions
323
  predict_btn.click(
324
+ fn = predict_peptide,
325
  inputs=[base_model_name,PEFT_model_name,input_seq],
326
  outputs = [output_text],
327
  )