Spaces:

wangjin2000
/

ESM2PPI

Paused

wangjin2000 committed on Nov 5, 2024

Commit

4a62eed

verified ·

1 Parent(s): 6a8b48e

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -238,7 +238,7 @@ def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_l
     return results_df, PPC
-def predict_peptide_from_file(base_model_path, finetuned_model_path, file_obj, peptide_length=15, num_binders=4, top_k=3):
     # Load the model
     loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path)
@@ -253,17 +253,23 @@ def predict_peptide_from_file(base_model_path, finetuned_model_path, file_obj, p
     results = []
     for i, row in input.iterrows():
-        seq = row['Receptor Sequence']
-        binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, seq, peptide_length, top_k, num_binders)
-        results_idf = pd.DataFrame(binders, columns=['Binder', 'PPL', 'pLDDT', 'iPTM'])
-        peptide_lp = results_idf['Binder'][results_idf['PPL'].idxmin()] #Choosing the one with the lowest perplexity
-        #for binder, ppl, plddt, iptm in binders:
-        results.append([seq, peptide_lp])
-        print("263: results: ", results)
-        #peptide_lp = results_i['Binder'][results_df['PPL'].idxmin()] #Choosing the one with the lowest perplexity
-    results_df = pd.DataFrame(results, columns=['Input Sequence', 'Binder', 'PPL', 'pLDDT', 'iPTM'])
     timestamp = datetime.now().strftime('%Y-%m-%d_%H')
     outpath = (

     return results_df, PPC
+def predict_peptide_from_file(base_model_path, finetuned_model_path, file_obj, max_peptide_length=15, num_binders=4, top_k=3):
     # Load the model
     loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path)
     results = []
     for i, row in input.iterrows():
+        protein_seq = row['Receptor Sequence']
+        peptide_seq = row['Binder']
+        peptide_length = min([len(peptide_seq) max_peptide_length])  # use the same length of ground truth peptide length for prediction limited to max_peptide_length
+        #get metrics for ground truth peptide
+        ppl_value = compute_pseudo_perplexity(loaded_model, tokenizer, protein_seq, peptide_seq)
+        plddt_value, iPTM_value = compute_plddt_iptm(protein_seq, peptide_seq)
+        results.append([seq, binder, ppl, plddt, iptm, 1]) # flag 1 for ground truth peptide
+        #predict peptides
+        binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, protein_seq, peptide_length, top_k, num_binders)
+        for binder, ppl, plddt, iptm in binders:
+            results.append([seq, binder, ppl, plddt, iptm, 0]) # flag 0 for generated peptide
+    results_df = pd.DataFrame(results, columns=['Input Sequence', 'Binder', 'PPL', 'pLDDT', 'iPTM', 'GT_Flag'])
     timestamp = datetime.now().strftime('%Y-%m-%d_%H')
     outpath = (