wangjin2000 commited on
Commit
c7d6740
·
verified ·
1 Parent(s): 8b7118b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -18
app.py CHANGED
@@ -193,7 +193,7 @@ def compute_plddt_iptm(protein_seq, binder_seq):
193
 
194
  return avg_plddt, ptm
195
 
196
- def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_length = 15, top_k = 3, num_binders = 5):
197
  start = time.time()
198
 
199
  peptide_length = int(peptide_length)
@@ -208,8 +208,8 @@ def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_
208
  # Generate binder
209
  masked_peptide = '<mask>' * peptide_length
210
  input_sequence = protein_seq + masked_peptide
211
- #inputs = tokenizer(input_sequence, return_tensors="pt").to(model.device)
212
- inputs = tokenizer(input_sequence, return_tensors="pt").to(device)
213
  print("198:model.device in generate_:",model.device)
214
 
215
  with torch.no_grad():
@@ -229,8 +229,10 @@ def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_
229
  ppl_value = compute_pseudo_perplexity(model, tokenizer, protein_seq, generated_binder)
230
 
231
  # Get PLDDT from ESMFold model
232
- #plddt_value, iPTM_value = compute_plddt_iptm(protein_seq, generated_binder) #too time-consuming
233
- #plddt_value, iPTM_value = [0, 0]
 
 
234
 
235
  # Add the generated binder and its PPL to the results list
236
  binders_with_ppl_plddt.append([generated_binder, ppl_value, plddt_value, iPTM_value])
@@ -242,9 +244,9 @@ def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_
242
  return binders_with_ppl_plddt
243
 
244
  # Predict peptide binder with finetuned model
245
- def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_length=15, num_binders=4, top_k=3):
246
  # Load the model
247
- loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path).to(device)
248
 
249
  # Ensure the model is in evaluation mode
250
  loaded_model.eval()
@@ -253,13 +255,13 @@ def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_l
253
  tokenizer = AutoTokenizer.from_pretrained(base_model_path)
254
 
255
  if isinstance(input_seqs, str): # Single sequence
256
- binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, input_seqs, peptide_length, top_k, num_binders)
257
  results_df = pd.DataFrame(binders, columns=['Binder', 'PPL', 'pLDDT', 'iPTM'])
258
 
259
  elif isinstance(input_seqs, list): # List of sequences
260
  results = []
261
  for seq in input_seqs:
262
- binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, seq, peptide_length, top_k, num_binders)
263
  for binder, ppl, plddt, iptm in binders:
264
  results.append([seq, binder, ppl, plddt, iptm])
265
  results_df = pd.DataFrame(results, columns=['Input Sequence', 'Binder', 'PPL', 'pLDDT', 'iPTM'])
@@ -272,10 +274,10 @@ def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_l
272
 
273
  return results_df, PPC
274
 
275
- def predict_peptide_from_file(base_model_path, finetuned_model_path, file_obj, max_peptide_length=15, num_binders=5, top_k=3):
276
  start = time.time()
277
  # Load the model
278
- loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path).to(device)
279
 
280
  # Ensure the model is in evaluation mode
281
  loaded_model.eval()
@@ -296,14 +298,16 @@ def predict_peptide_from_file(base_model_path, finetuned_model_path, file_obj, m
296
  peptide_length = min([len(peptide_seq), max_peptide_length]) # use the same length of ground truth peptide length for prediction limited to max_peptide_length
297
 
298
  #get metrics for ground truth peptide
299
- ppl = compute_pseudo_perplexity(loaded_model, tokenizer, protein_seq, peptide_seq)
300
- plddt, iptm = compute_plddt_iptm(protein_seq, peptide_seq) #too time-consuming
301
- #plddt, iptm = [0, 0]
 
 
302
 
303
  results.append([protein_seq, peptide_seq, ppl, plddt, iptm, 1]) # flag 1 for ground truth peptide
304
 
305
  #predict peptides
306
- binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, protein_seq, peptide_length, top_k, num_binders)
307
 
308
  for binder, ppl, plddt, iptm in binders:
309
  results.append([protein_seq, binder, ppl, plddt, iptm, 0]) # flag 0 for generated peptide
@@ -367,7 +371,7 @@ with demo:
367
  with gr.Row():
368
  peptide_length=gr.Slider(minimum=10, maximum=100, step=1, label="Peptide Maximum Length", value=15)
369
  num_pred_peptides=gr.Slider(minimum=1, maximum=10, step=1, label="Number of Predicted Peptides", value=5)
370
- plddt_iptm_yes=gr.Radio(["yes", "no"],label="Compute pLDDT and iPTM", value="no")
371
  with gr.Column(scale=5, variant="compact"):
372
  name = gr.Dropdown(
373
  label="Choose a Sample Protein",
@@ -444,14 +448,14 @@ with demo:
444
  # "Predict peptide sequence" actions
445
  predict_btn.click(
446
  fn = predict_peptide,
447
- inputs=[base_model_name,PEFT_model_name,input_seq,peptide_length,num_pred_peptides],
448
  outputs = [output_text, input_seq],
449
  )
450
 
451
  # "Predict peptide from a local file" actions
452
  predict_file_btn.click(
453
  fn = predict_peptide_from_file,
454
- inputs=[base_model_name,PEFT_model_name,uploaded_file,peptide_length,num_pred_peptides],
455
  outputs = [output_file],
456
  )
457
 
 
193
 
194
  return avg_plddt, ptm
195
 
196
+ def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_length = 15, top_k = 3, num_binders = 5, plddt_iptm_yes="no"):
197
  start = time.time()
198
 
199
  peptide_length = int(peptide_length)
 
208
  # Generate binder
209
  masked_peptide = '<mask>' * peptide_length
210
  input_sequence = protein_seq + masked_peptide
211
+ inputs = tokenizer(input_sequence, return_tensors="pt").to(model.device)
212
+ #inputs = tokenizer(input_sequence, return_tensors="pt").to(device)
213
  print("198:model.device in generate_:",model.device)
214
 
215
  with torch.no_grad():
 
229
  ppl_value = compute_pseudo_perplexity(model, tokenizer, protein_seq, generated_binder)
230
 
231
  # Get PLDDT from ESMFold model
232
+ if plddt_iptm_yes=="yes":
233
+ plddt_value, iPTM_value = compute_plddt_iptm(protein_seq, generated_binder) #too time-consuming
234
+ else:
235
+ plddt_value, iPTM_value = [0, 0]
236
 
237
  # Add the generated binder and its PPL to the results list
238
  binders_with_ppl_plddt.append([generated_binder, ppl_value, plddt_value, iPTM_value])
 
244
  return binders_with_ppl_plddt
245
 
246
  # Predict peptide binder with finetuned model
247
+ def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_length=15, num_binders=4, plddt_iptm_yes="no", top_k=3):
248
  # Load the model
249
+ loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path) #.to(device) inference use cpu
250
 
251
  # Ensure the model is in evaluation mode
252
  loaded_model.eval()
 
255
  tokenizer = AutoTokenizer.from_pretrained(base_model_path)
256
 
257
  if isinstance(input_seqs, str): # Single sequence
258
+ binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, input_seqs, peptide_length, top_k, num_binders, plddt_iptm_yes)
259
  results_df = pd.DataFrame(binders, columns=['Binder', 'PPL', 'pLDDT', 'iPTM'])
260
 
261
  elif isinstance(input_seqs, list): # List of sequences
262
  results = []
263
  for seq in input_seqs:
264
+ binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, seq, peptide_length, top_k, num_binders, plddt_iptm_yes)
265
  for binder, ppl, plddt, iptm in binders:
266
  results.append([seq, binder, ppl, plddt, iptm])
267
  results_df = pd.DataFrame(results, columns=['Input Sequence', 'Binder', 'PPL', 'pLDDT', 'iPTM'])
 
274
 
275
  return results_df, PPC
276
 
277
+ def predict_peptide_from_file(base_model_path, finetuned_model_path, file_obj, max_peptide_length=15, num_binders=5, plddt_iptm_yes="no", top_k=3):
278
  start = time.time()
279
  # Load the model
280
+ loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path) #.to(device)
281
 
282
  # Ensure the model is in evaluation mode
283
  loaded_model.eval()
 
298
  peptide_length = min([len(peptide_seq), max_peptide_length]) # use the same length of ground truth peptide length for prediction limited to max_peptide_length
299
 
300
  #get metrics for ground truth peptide
301
+ ppl = compute_pseudo_perplexity(loaded_model, tokenizer, protein_seq, peptide_seq)
302
+ if plddt_iptm_yes=="yes":
303
+ plddt, iptm = compute_plddt_iptm(protein_seq, peptide_seq) #too time-consuming
304
+ else:
305
+ plddt, iptm = [0, 0]
306
 
307
  results.append([protein_seq, peptide_seq, ppl, plddt, iptm, 1]) # flag 1 for ground truth peptide
308
 
309
  #predict peptides
310
+ binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, protein_seq, peptide_length, top_k, num_binders, plddt_iptm_yes)
311
 
312
  for binder, ppl, plddt, iptm in binders:
313
  results.append([protein_seq, binder, ppl, plddt, iptm, 0]) # flag 0 for generated peptide
 
371
  with gr.Row():
372
  peptide_length=gr.Slider(minimum=10, maximum=100, step=1, label="Peptide Maximum Length", value=15)
373
  num_pred_peptides=gr.Slider(minimum=1, maximum=10, step=1, label="Number of Predicted Peptides", value=5)
374
+ plddt_iptm_yes=gr.Radio(["yes", "no"],label="Compute pLDDT and iPTM (slow!)", value="no")
375
  with gr.Column(scale=5, variant="compact"):
376
  name = gr.Dropdown(
377
  label="Choose a Sample Protein",
 
448
  # "Predict peptide sequence" actions
449
  predict_btn.click(
450
  fn = predict_peptide,
451
+ inputs=[base_model_name,PEFT_model_name,input_seq,peptide_length,num_pred_peptides,plddt_iptm_yes],
452
  outputs = [output_text, input_seq],
453
  )
454
 
455
  # "Predict peptide from a local file" actions
456
  predict_file_btn.click(
457
  fn = predict_peptide_from_file,
458
+ inputs=[base_model_name,PEFT_model_name,uploaded_file,peptide_length,num_pred_peptides,plddt_iptm_yes],
459
  outputs = [output_file],
460
  )
461