wangjin2000 commited on
Commit
c7d6740
·
verified ·
1 Parent(s): 8b7118b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -18
app.py CHANGED
@@ -193,7 +193,7 @@ def compute_plddt_iptm(protein_seq, binder_seq):
193
 
194
  return avg_plddt, ptm
195
 
196
- def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_length = 15, top_k = 3, num_binders = 5):
197
  start = time.time()
198
 
199
  peptide_length = int(peptide_length)
@@ -208,8 +208,8 @@ def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_
208
  # Generate binder
209
  masked_peptide = '<mask>' * peptide_length
210
  input_sequence = protein_seq + masked_peptide
211
- #inputs = tokenizer(input_sequence, return_tensors="pt").to(model.device)
212
- inputs = tokenizer(input_sequence, return_tensors="pt").to(device)
213
  print("198:model.device in generate_:",model.device)
214
 
215
  with torch.no_grad():
@@ -229,8 +229,10 @@ def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_
229
  ppl_value = compute_pseudo_perplexity(model, tokenizer, protein_seq, generated_binder)
230
 
231
  # Get PLDDT from ESMFold model
232
- #plddt_value, iPTM_value = compute_plddt_iptm(protein_seq, generated_binder) #too time-consuming
233
- #plddt_value, iPTM_value = [0, 0]
 
 
234
 
235
  # Add the generated binder and its PPL to the results list
236
  binders_with_ppl_plddt.append([generated_binder, ppl_value, plddt_value, iPTM_value])
@@ -242,9 +244,9 @@ def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_
242
  return binders_with_ppl_plddt
243
 
244
  # Predict peptide binder with finetuned model
245
- def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_length=15, num_binders=4, top_k=3):
246
  # Load the model
247
- loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path).to(device)
248
 
249
  # Ensure the model is in evaluation mode
250
  loaded_model.eval()
@@ -253,13 +255,13 @@ def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_l
253
  tokenizer = AutoTokenizer.from_pretrained(base_model_path)
254
 
255
  if isinstance(input_seqs, str): # Single sequence
256
- binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, input_seqs, peptide_length, top_k, num_binders)
257
  results_df = pd.DataFrame(binders, columns=['Binder', 'PPL', 'pLDDT', 'iPTM'])
258
 
259
  elif isinstance(input_seqs, list): # List of sequences
260
  results = []
261
  for seq in input_seqs:
262
- binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, seq, peptide_length, top_k, num_binders)
263
  for binder, ppl, plddt, iptm in binders:
264
  results.append([seq, binder, ppl, plddt, iptm])
265
  results_df = pd.DataFrame(results, columns=['Input Sequence', 'Binder', 'PPL', 'pLDDT', 'iPTM'])
@@ -272,10 +274,10 @@ def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_l
272
 
273
  return results_df, PPC
274
 
275
- def predict_peptide_from_file(base_model_path, finetuned_model_path, file_obj, max_peptide_length=15, num_binders=5, top_k=3):
276
  start = time.time()
277
  # Load the model
278
- loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path).to(device)
279
 
280
  # Ensure the model is in evaluation mode
281
  loaded_model.eval()
@@ -296,14 +298,16 @@ def predict_peptide_from_file(base_model_path, finetuned_model_path, file_obj, m
296
  peptide_length = min([len(peptide_seq), max_peptide_length]) # use the same length of ground truth peptide length for prediction limited to max_peptide_length
297
 
298
  #get metrics for ground truth peptide
299
- ppl = compute_pseudo_perplexity(loaded_model, tokenizer, protein_seq, peptide_seq)
300
- plddt, iptm = compute_plddt_iptm(protein_seq, peptide_seq) #too time-consuming
301
- #plddt, iptm = [0, 0]
 
 
302
 
303
  results.append([protein_seq, peptide_seq, ppl, plddt, iptm, 1]) # flag 1 for ground truth peptide
304
 
305
  #predict peptides
306
- binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, protein_seq, peptide_length, top_k, num_binders)
307
 
308
  for binder, ppl, plddt, iptm in binders:
309
  results.append([protein_seq, binder, ppl, plddt, iptm, 0]) # flag 0 for generated peptide
@@ -367,7 +371,7 @@ with demo:
367
  with gr.Row():
368
  peptide_length=gr.Slider(minimum=10, maximum=100, step=1, label="Peptide Maximum Length", value=15)
369
  num_pred_peptides=gr.Slider(minimum=1, maximum=10, step=1, label="Number of Predicted Peptides", value=5)
370
- plddt_iptm_yes=gr.Radio(["yes", "no"],label="Compute pLDDT and iPTM", value="no")
371
  with gr.Column(scale=5, variant="compact"):
372
  name = gr.Dropdown(
373
  label="Choose a Sample Protein",
@@ -444,14 +448,14 @@ with demo:
444
  # "Predict peptide sequence" actions
445
  predict_btn.click(
446
  fn = predict_peptide,
447
- inputs=[base_model_name,PEFT_model_name,input_seq,peptide_length,num_pred_peptides],
448
  outputs = [output_text, input_seq],
449
  )
450
 
451
  # "Predict peptide from a local file" actions
452
  predict_file_btn.click(
453
  fn = predict_peptide_from_file,
454
- inputs=[base_model_name,PEFT_model_name,uploaded_file,peptide_length,num_pred_peptides],
455
  outputs = [output_file],
456
  )
457
 
 
193
 
194
  return avg_plddt, ptm
195
 
196
+ def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_length = 15, top_k = 3, num_binders = 5, plddt_iptm_yes="no"):
197
  start = time.time()
198
 
199
  peptide_length = int(peptide_length)
 
208
  # Generate binder
209
  masked_peptide = '<mask>' * peptide_length
210
  input_sequence = protein_seq + masked_peptide
211
+ inputs = tokenizer(input_sequence, return_tensors="pt").to(model.device)
212
+ #inputs = tokenizer(input_sequence, return_tensors="pt").to(device)
213
  print("198:model.device in generate_:",model.device)
214
 
215
  with torch.no_grad():
 
229
  ppl_value = compute_pseudo_perplexity(model, tokenizer, protein_seq, generated_binder)
230
 
231
  # Get PLDDT from ESMFold model
232
+ if plddt_iptm_yes=="yes":
233
+ plddt_value, iPTM_value = compute_plddt_iptm(protein_seq, generated_binder) #too time-consuming
234
+ else:
235
+ plddt_value, iPTM_value = [0, 0]
236
 
237
  # Add the generated binder and its PPL to the results list
238
  binders_with_ppl_plddt.append([generated_binder, ppl_value, plddt_value, iPTM_value])
 
244
  return binders_with_ppl_plddt
245
 
246
  # Predict peptide binder with finetuned model
247
+ def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_length=15, num_binders=4, plddt_iptm_yes="no", top_k=3):
248
  # Load the model
249
+ loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path) #.to(device) inference use cpu
250
 
251
  # Ensure the model is in evaluation mode
252
  loaded_model.eval()
 
255
  tokenizer = AutoTokenizer.from_pretrained(base_model_path)
256
 
257
  if isinstance(input_seqs, str): # Single sequence
258
+ binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, input_seqs, peptide_length, top_k, num_binders, plddt_iptm_yes)
259
  results_df = pd.DataFrame(binders, columns=['Binder', 'PPL', 'pLDDT', 'iPTM'])
260
 
261
  elif isinstance(input_seqs, list): # List of sequences
262
  results = []
263
  for seq in input_seqs:
264
+ binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, seq, peptide_length, top_k, num_binders, plddt_iptm_yes)
265
  for binder, ppl, plddt, iptm in binders:
266
  results.append([seq, binder, ppl, plddt, iptm])
267
  results_df = pd.DataFrame(results, columns=['Input Sequence', 'Binder', 'PPL', 'pLDDT', 'iPTM'])
 
274
 
275
  return results_df, PPC
276
 
277
+ def predict_peptide_from_file(base_model_path, finetuned_model_path, file_obj, max_peptide_length=15, num_binders=5, plddt_iptm_yes="no", top_k=3):
278
  start = time.time()
279
  # Load the model
280
+ loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path) #.to(device)
281
 
282
  # Ensure the model is in evaluation mode
283
  loaded_model.eval()
 
298
  peptide_length = min([len(peptide_seq), max_peptide_length]) # use the same length of ground truth peptide length for prediction limited to max_peptide_length
299
 
300
  #get metrics for ground truth peptide
301
+ ppl = compute_pseudo_perplexity(loaded_model, tokenizer, protein_seq, peptide_seq)
302
+ if plddt_iptm_yes=="yes":
303
+ plddt, iptm = compute_plddt_iptm(protein_seq, peptide_seq) #too time-consuming
304
+ else:
305
+ plddt, iptm = [0, 0]
306
 
307
  results.append([protein_seq, peptide_seq, ppl, plddt, iptm, 1]) # flag 1 for ground truth peptide
308
 
309
  #predict peptides
310
+ binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, protein_seq, peptide_length, top_k, num_binders, plddt_iptm_yes)
311
 
312
  for binder, ppl, plddt, iptm in binders:
313
  results.append([protein_seq, binder, ppl, plddt, iptm, 0]) # flag 0 for generated peptide
 
371
  with gr.Row():
372
  peptide_length=gr.Slider(minimum=10, maximum=100, step=1, label="Peptide Maximum Length", value=15)
373
  num_pred_peptides=gr.Slider(minimum=1, maximum=10, step=1, label="Number of Predicted Peptides", value=5)
374
+ plddt_iptm_yes=gr.Radio(["yes", "no"],label="Compute pLDDT and iPTM (slow!)", value="no")
375
  with gr.Column(scale=5, variant="compact"):
376
  name = gr.Dropdown(
377
  label="Choose a Sample Protein",
 
448
  # "Predict peptide sequence" actions
449
  predict_btn.click(
450
  fn = predict_peptide,
451
+ inputs=[base_model_name,PEFT_model_name,input_seq,peptide_length,num_pred_peptides,plddt_iptm_yes],
452
  outputs = [output_text, input_seq],
453
  )
454
 
455
  # "Predict peptide from a local file" actions
456
  predict_file_btn.click(
457
  fn = predict_peptide_from_file,
458
+ inputs=[base_model_name,PEFT_model_name,uploaded_file,peptide_length,num_pred_peptides,plddt_iptm_yes],
459
  outputs = [output_file],
460
  )
461