Spaces:

wangjin2000
/

ESM2PPI

Paused

App Files Files Community

wangjin2000 committed on Oct 29, 2024

Commit

db7f9c6

verified ·

1 Parent(s): 18227f6

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -17

app.py CHANGED Viewed

@@ -36,13 +36,14 @@ PEFT_MODEL_OPTIONS = [
 #build datasets
 class ProteinDataset(Dataset):
-    def __init__(self, file, tokenizer):
         data = pd.read_csv(file)
         self.tokenizer = tokenizer
         self.proteins = data["Receptor Sequence"].tolist()
         self.peptides = data["Binder"].tolist()
         #self.proteins = data["P_Sequence"].tolist()  #header defined by Lin Qiao
         #self.peptides = data["p_Sequence"].tolist()
     def __len__(self):
         return len(self.proteins)
@@ -55,14 +56,14 @@ class ProteinDataset(Dataset):
         complex_seq = protein_seq + masked_peptide
         # Tokenize and pad the complex sequence
-        complex_input = self.tokenizer(complex_seq, return_tensors="pt", padding="max_length", max_length = 552, truncation=True)
         input_ids = complex_input["input_ids"].squeeze()
         attention_mask = complex_input["attention_mask"].squeeze()
         # Create labels (tokens for ground truth AAs)
         label_seq = protein_seq + peptide_seq
-        labels = self.tokenizer(label_seq, return_tensors="pt", padding="max_length", max_length = 552, truncation=True)["input_ids"].squeeze()
         # Set non-masked positions in the labels tensor to -100
         labels = torch.where(input_ids == self.tokenizer.mask_token_id, labels, -100)
@@ -70,7 +71,7 @@ class ProteinDataset(Dataset):
         return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
 # fine-tuning function
-def finetune(base_model_path):   #, train_dataset, test_dataset):
     #load base model
     base_model = EsmForMaskedLM.from_pretrained(base_model_path)
@@ -78,9 +79,8 @@ def finetune(base_model_path):   #, train_dataset, test_dataset):
     # Tokenization
     tokenizer = AutoTokenizer.from_pretrained(base_model_path) #("facebook/esm2_t12_35M_UR50D")
-    train_dataset = ProteinDataset("./datasets/pepnn_train.csv", tokenizer)
-    test_dataset = ProteinDataset("./datasets/pepnn_test.csv", tokenizer)
-    print("line 84 testset:",test_dataset)
     model_name_base = base_model_path.split("/")[1]
     timestamp = datetime.now().strftime('%Y-%m-%d_%H')
@@ -179,7 +179,7 @@ def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_
     return binders_with_ppl
 # Predict peptide binder with finetuned model
-def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_length=15, top_k=3, num_binders=4):
     # Load the model
     loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path)
@@ -211,7 +211,7 @@ def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_l
     return results_df, PPC
-def predict_peptide_from_file(base_model_path, finetuned_model_path, input_seqs, peptide_length=15, top_k=3, num_binders=4):
     # Load the model
     loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path)
@@ -221,6 +221,12 @@ def predict_peptide_from_file(base_model_path, finetuned_model_path, input_seqs,
     # Tokenization
     tokenizer = AutoTokenizer.from_pretrained(base_model_path)
     if isinstance(input_seqs, str):  # Single sequence
         binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, input_seqs, peptide_length, top_k, num_binders)
         results_df = pd.DataFrame(binders, columns=['Binder', 'Pseudo Perplexity'])
@@ -296,7 +302,7 @@ with demo:
                         file_count="single",
                         file_types=[".tsv", ".csv"],
                         type="filepath",
-                        height=20,
                     )
         gr.Markdown(
                 "## Predict peptide sequence:"
@@ -321,11 +327,17 @@ with demo:
                 )
             with gr.Column(variant="compact", scale = 2):
                     predict_btn = gr.Button(
-                        value="Predict peptide sequence",
                         interactive=True,
                         variant="primary",
                     )
-                    plot_struc_btn = gr.Button(value = "Plot ESMFold Predicted Structure ", variant="primary")
         with gr.Row():
             with gr.Column(variant="compact", scale = 5):
                 output_text = gr.Textbox(
@@ -340,11 +352,6 @@ with demo:
                     interactive=True,
                     variant="primary",
                 )
-                predict_file_btn = gr.Button(
-                    value="Predict peptide from a local file",
-                    interactive=True,
-                    variant="primary",
-                )
         with gr.Row():
             output_viewer = gr.HTML()
             output_file = gr.File(

 #build datasets
 class ProteinDataset(Dataset):
+    def __init__(self, file, tokenizer, peptide_length):
         data = pd.read_csv(file)
         self.tokenizer = tokenizer
         self.proteins = data["Receptor Sequence"].tolist()
         self.peptides = data["Binder"].tolist()
         #self.proteins = data["P_Sequence"].tolist()  #header defined by Lin Qiao
         #self.peptides = data["p_Sequence"].tolist()
+        self.max_length_pm = 500 + 2 + peptide_length  #assume the max length of protein is 500
     def __len__(self):
         return len(self.proteins)
         complex_seq = protein_seq + masked_peptide
         # Tokenize and pad the complex sequence
+        complex_input = self.tokenizer(complex_seq, return_tensors="pt", padding="max_length", max_length = self.max_length_pm, truncation=True)
         input_ids = complex_input["input_ids"].squeeze()
         attention_mask = complex_input["attention_mask"].squeeze()
         # Create labels (tokens for ground truth AAs)
         label_seq = protein_seq + peptide_seq
+        labels = self.tokenizer(label_seq, return_tensors="pt", padding="max_length", max_length = self.max_length_pm, truncation=True)["input_ids"].squeeze()
         # Set non-masked positions in the labels tensor to -100
         labels = torch.where(input_ids == self.tokenizer.mask_token_id, labels, -100)
         return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
 # fine-tuning function
+def finetune(base_model_path, peptide_length):   #, train_dataset, test_dataset):
     #load base model
     base_model = EsmForMaskedLM.from_pretrained(base_model_path)
     # Tokenization
     tokenizer = AutoTokenizer.from_pretrained(base_model_path) #("facebook/esm2_t12_35M_UR50D")
+    train_dataset = ProteinDataset("./datasets/pepnn_train.csv", tokenizer, peptide_length)
+    test_dataset = ProteinDataset("./datasets/pepnn_test.csv", tokenizer, peptide_length)
     model_name_base = base_model_path.split("/")[1]
     timestamp = datetime.now().strftime('%Y-%m-%d_%H')
     return binders_with_ppl
 # Predict peptide binder with finetuned model
+def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_length=15,  num_binders=4, top_k=3):
     # Load the model
     loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path)
     return results_df, PPC
+def predict_peptide_from_file(base_model_path, finetuned_model_path, file_obj, peptide_length=15, num_binders=4, top_k=3):
     # Load the model
     loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path)
     # Tokenization
     tokenizer = AutoTokenizer.from_pretrained(base_model_path)
+    eval_dataset = ProteinDataset(file_obj, tokenizer, peptide_length)
+    print("eval_dataset:",eval_dataset)
+    input_seqs = eval_dataset["input_ids"]
+    print("line 228 - input_seqs:",input_seqs)
     if isinstance(input_seqs, str):  # Single sequence
         binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, input_seqs, peptide_length, top_k, num_binders)
         results_df = pd.DataFrame(binders, columns=['Binder', 'Pseudo Perplexity'])
                         file_count="single",
                         file_types=[".tsv", ".csv"],
                         type="filepath",
+                        height=10,
                     )
         gr.Markdown(
                 "## Predict peptide sequence:"
                 )
             with gr.Column(variant="compact", scale = 2):
                     predict_btn = gr.Button(
+                        value="Predict peptide sequence from a protein sequence",
+                        interactive=True,
+                        variant="primary",
+                    )
+                    plot_struc_btn = gr.Button(value = "Plot ESMFold predicted structure ", variant="primary")
+                    predict_file_btn = gr.Button(
+                        value="Predict peptide from a local file",
                         interactive=True,
                         variant="primary",
                     )
         with gr.Row():
             with gr.Column(variant="compact", scale = 5):
                 output_text = gr.Textbox(
                     interactive=True,
                     variant="primary",
                 )
         with gr.Row():
             output_viewer = gr.HTML()
             output_file = gr.File(