Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -193,7 +193,7 @@ def compute_plddt_iptm(protein_seq, binder_seq):
|
|
| 193 |
|
| 194 |
return avg_plddt, ptm
|
| 195 |
|
| 196 |
-
def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_length = 15, top_k = 3, num_binders = 5):
|
| 197 |
start = time.time()
|
| 198 |
|
| 199 |
peptide_length = int(peptide_length)
|
|
@@ -208,8 +208,8 @@ def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_
|
|
| 208 |
# Generate binder
|
| 209 |
masked_peptide = '<mask>' * peptide_length
|
| 210 |
input_sequence = protein_seq + masked_peptide
|
| 211 |
-
|
| 212 |
-
inputs = tokenizer(input_sequence, return_tensors="pt").to(device)
|
| 213 |
print("198:model.device in generate_:",model.device)
|
| 214 |
|
| 215 |
with torch.no_grad():
|
|
@@ -229,8 +229,10 @@ def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_
|
|
| 229 |
ppl_value = compute_pseudo_perplexity(model, tokenizer, protein_seq, generated_binder)
|
| 230 |
|
| 231 |
# Get PLDDT from ESMFold model
|
| 232 |
-
|
| 233 |
-
|
|
|
|
|
|
|
| 234 |
|
| 235 |
# Add the generated binder and its PPL to the results list
|
| 236 |
binders_with_ppl_plddt.append([generated_binder, ppl_value, plddt_value, iPTM_value])
|
|
@@ -242,9 +244,9 @@ def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_
|
|
| 242 |
return binders_with_ppl_plddt
|
| 243 |
|
| 244 |
# Predict peptide binder with finetuned model
|
| 245 |
-
def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_length=15, num_binders=4, top_k=3):
|
| 246 |
# Load the model
|
| 247 |
-
loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path).to(device)
|
| 248 |
|
| 249 |
# Ensure the model is in evaluation mode
|
| 250 |
loaded_model.eval()
|
|
@@ -253,13 +255,13 @@ def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_l
|
|
| 253 |
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
|
| 254 |
|
| 255 |
if isinstance(input_seqs, str): # Single sequence
|
| 256 |
-
binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, input_seqs, peptide_length, top_k, num_binders)
|
| 257 |
results_df = pd.DataFrame(binders, columns=['Binder', 'PPL', 'pLDDT', 'iPTM'])
|
| 258 |
|
| 259 |
elif isinstance(input_seqs, list): # List of sequences
|
| 260 |
results = []
|
| 261 |
for seq in input_seqs:
|
| 262 |
-
binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, seq, peptide_length, top_k, num_binders)
|
| 263 |
for binder, ppl, plddt, iptm in binders:
|
| 264 |
results.append([seq, binder, ppl, plddt, iptm])
|
| 265 |
results_df = pd.DataFrame(results, columns=['Input Sequence', 'Binder', 'PPL', 'pLDDT', 'iPTM'])
|
|
@@ -272,10 +274,10 @@ def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_l
|
|
| 272 |
|
| 273 |
return results_df, PPC
|
| 274 |
|
| 275 |
-
def predict_peptide_from_file(base_model_path, finetuned_model_path, file_obj, max_peptide_length=15, num_binders=5, top_k=3):
|
| 276 |
start = time.time()
|
| 277 |
# Load the model
|
| 278 |
-
loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path).to(device)
|
| 279 |
|
| 280 |
# Ensure the model is in evaluation mode
|
| 281 |
loaded_model.eval()
|
|
@@ -296,14 +298,16 @@ def predict_peptide_from_file(base_model_path, finetuned_model_path, file_obj, m
|
|
| 296 |
peptide_length = min([len(peptide_seq), max_peptide_length]) # use the same length of ground truth peptide length for prediction limited to max_peptide_length
|
| 297 |
|
| 298 |
#get metrics for ground truth peptide
|
| 299 |
-
ppl = compute_pseudo_perplexity(loaded_model, tokenizer, protein_seq, peptide_seq
|
| 300 |
-
|
| 301 |
-
|
|
|
|
|
|
|
| 302 |
|
| 303 |
results.append([protein_seq, peptide_seq, ppl, plddt, iptm, 1]) # flag 1 for ground truth peptide
|
| 304 |
|
| 305 |
#predict peptides
|
| 306 |
-
binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, protein_seq, peptide_length, top_k, num_binders)
|
| 307 |
|
| 308 |
for binder, ppl, plddt, iptm in binders:
|
| 309 |
results.append([protein_seq, binder, ppl, plddt, iptm, 0]) # flag 0 for generated peptide
|
|
@@ -367,7 +371,7 @@ with demo:
|
|
| 367 |
with gr.Row():
|
| 368 |
peptide_length=gr.Slider(minimum=10, maximum=100, step=1, label="Peptide Maximum Length", value=15)
|
| 369 |
num_pred_peptides=gr.Slider(minimum=1, maximum=10, step=1, label="Number of Predicted Peptides", value=5)
|
| 370 |
-
plddt_iptm_yes=gr.Radio(["yes", "no"],label="Compute pLDDT and iPTM", value="no")
|
| 371 |
with gr.Column(scale=5, variant="compact"):
|
| 372 |
name = gr.Dropdown(
|
| 373 |
label="Choose a Sample Protein",
|
|
@@ -444,14 +448,14 @@ with demo:
|
|
| 444 |
# "Predict peptide sequence" actions
|
| 445 |
predict_btn.click(
|
| 446 |
fn = predict_peptide,
|
| 447 |
-
inputs=[base_model_name,PEFT_model_name,input_seq,peptide_length,num_pred_peptides],
|
| 448 |
outputs = [output_text, input_seq],
|
| 449 |
)
|
| 450 |
|
| 451 |
# "Predict peptide from a local file" actions
|
| 452 |
predict_file_btn.click(
|
| 453 |
fn = predict_peptide_from_file,
|
| 454 |
-
inputs=[base_model_name,PEFT_model_name,uploaded_file,peptide_length,num_pred_peptides],
|
| 455 |
outputs = [output_file],
|
| 456 |
)
|
| 457 |
|
|
|
|
| 193 |
|
| 194 |
return avg_plddt, ptm
|
| 195 |
|
| 196 |
+
def generate_peptide_for_single_sequence(model, tokenizer, protein_seq, peptide_length = 15, top_k = 3, num_binders = 5, plddt_iptm_yes="no"):
|
| 197 |
start = time.time()
|
| 198 |
|
| 199 |
peptide_length = int(peptide_length)
|
|
|
|
| 208 |
# Generate binder
|
| 209 |
masked_peptide = '<mask>' * peptide_length
|
| 210 |
input_sequence = protein_seq + masked_peptide
|
| 211 |
+
inputs = tokenizer(input_sequence, return_tensors="pt").to(model.device)
|
| 212 |
+
#inputs = tokenizer(input_sequence, return_tensors="pt").to(device)
|
| 213 |
print("198:model.device in generate_:",model.device)
|
| 214 |
|
| 215 |
with torch.no_grad():
|
|
|
|
| 229 |
ppl_value = compute_pseudo_perplexity(model, tokenizer, protein_seq, generated_binder)
|
| 230 |
|
| 231 |
# Get PLDDT from ESMFold model
|
| 232 |
+
if plddt_iptm_yes=="yes":
|
| 233 |
+
plddt, iptm = compute_plddt_iptm(protein_seq, peptide_seq) #too time-consuming
|
| 234 |
+
else:
|
| 235 |
+
plddt, iptm = [0, 0]
|
| 236 |
|
| 237 |
# Add the generated binder and its PPL to the results list
|
| 238 |
binders_with_ppl_plddt.append([generated_binder, ppl_value, plddt_value, iPTM_value])
|
|
|
|
| 244 |
return binders_with_ppl_plddt
|
| 245 |
|
| 246 |
# Predict peptide binder with finetuned model
|
| 247 |
+
def predict_peptide(base_model_path, finetuned_model_path, input_seqs, peptide_length=15, num_binders=4, top_k=3, plddt_iptm_yes="no"):
|
| 248 |
# Load the model
|
| 249 |
+
loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path) #.to(device) inference use cpu
|
| 250 |
|
| 251 |
# Ensure the model is in evaluation mode
|
| 252 |
loaded_model.eval()
|
|
|
|
| 255 |
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
|
| 256 |
|
| 257 |
if isinstance(input_seqs, str): # Single sequence
|
| 258 |
+
binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, input_seqs, peptide_length, top_k, num_binders, plddt_iptm_yes)
|
| 259 |
results_df = pd.DataFrame(binders, columns=['Binder', 'PPL', 'pLDDT', 'iPTM'])
|
| 260 |
|
| 261 |
elif isinstance(input_seqs, list): # List of sequences
|
| 262 |
results = []
|
| 263 |
for seq in input_seqs:
|
| 264 |
+
binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, seq, peptide_length, top_k, num_binders, plddt_iptm_yes)
|
| 265 |
for binder, ppl, plddt, iptm in binders:
|
| 266 |
results.append([seq, binder, ppl, plddt, iptm])
|
| 267 |
results_df = pd.DataFrame(results, columns=['Input Sequence', 'Binder', 'PPL', 'pLDDT', 'iPTM'])
|
|
|
|
| 274 |
|
| 275 |
return results_df, PPC
|
| 276 |
|
| 277 |
+
def predict_peptide_from_file(base_model_path, finetuned_model_path, file_obj, max_peptide_length=15, num_binders=5, top_k=3, plddt_iptm_yes="no"):
|
| 278 |
start = time.time()
|
| 279 |
# Load the model
|
| 280 |
+
loaded_model = AutoModelForMaskedLM.from_pretrained(finetuned_model_path) #.to(device)
|
| 281 |
|
| 282 |
# Ensure the model is in evaluation mode
|
| 283 |
loaded_model.eval()
|
|
|
|
| 298 |
peptide_length = min([len(peptide_seq), max_peptide_length]) # use the same length of ground truth peptide length for prediction limited to max_peptide_length
|
| 299 |
|
| 300 |
#get metrics for ground truth peptide
|
| 301 |
+
ppl = compute_pseudo_perplexity(loaded_model, tokenizer, protein_seq, peptide_seq)
|
| 302 |
+
if plddt_iptm_yes=="yes":
|
| 303 |
+
plddt, iptm = compute_plddt_iptm(protein_seq, peptide_seq) #too time-consuming
|
| 304 |
+
else:
|
| 305 |
+
plddt, iptm = [0, 0]
|
| 306 |
|
| 307 |
results.append([protein_seq, peptide_seq, ppl, plddt, iptm, 1]) # flag 1 for ground truth peptide
|
| 308 |
|
| 309 |
#predict peptides
|
| 310 |
+
binders = generate_peptide_for_single_sequence(loaded_model, tokenizer, protein_seq, peptide_length, top_k, num_binders, plddt_iptm_yes)
|
| 311 |
|
| 312 |
for binder, ppl, plddt, iptm in binders:
|
| 313 |
results.append([protein_seq, binder, ppl, plddt, iptm, 0]) # flag 0 for generated peptide
|
|
|
|
| 371 |
with gr.Row():
|
| 372 |
peptide_length=gr.Slider(minimum=10, maximum=100, step=1, label="Peptide Maximum Length", value=15)
|
| 373 |
num_pred_peptides=gr.Slider(minimum=1, maximum=10, step=1, label="Number of Predicted Peptides", value=5)
|
| 374 |
+
plddt_iptm_yes=gr.Radio(["yes", "no"],label="Compute pLDDT and iPTM (slow!)", value="no")
|
| 375 |
with gr.Column(scale=5, variant="compact"):
|
| 376 |
name = gr.Dropdown(
|
| 377 |
label="Choose a Sample Protein",
|
|
|
|
| 448 |
# "Predict peptide sequence" actions
|
| 449 |
predict_btn.click(
|
| 450 |
fn = lambda base_model, peft_model, seq, length, n_binders, plddt_flag: predict_peptide(base_model, peft_model, seq, length, n_binders, plddt_iptm_yes=plddt_flag), # Gradio passes inputs positionally; route the radio value to plddt_iptm_yes instead of top_k
|
| 451 |
+
inputs=[base_model_name,PEFT_model_name,input_seq,peptide_length,num_pred_peptides,plddt_iptm_yes],
|
| 452 |
outputs = [output_text, input_seq],
|
| 453 |
)
|
| 454 |
|
| 455 |
# "Predict peptide from a local file" actions
|
| 456 |
predict_file_btn.click(
|
| 457 |
fn = lambda base_model, peft_model, file_obj, max_length, n_binders, plddt_flag: predict_peptide_from_file(base_model, peft_model, file_obj, max_length, n_binders, plddt_iptm_yes=plddt_flag), # Gradio passes inputs positionally; route the radio value to plddt_iptm_yes instead of top_k
|
| 458 |
+
inputs=[base_model_name,PEFT_model_name,uploaded_file,peptide_length,num_pred_peptides,plddt_iptm_yes],
|
| 459 |
outputs = [output_file],
|
| 460 |
)
|
| 461 |
|