# B3PPs_Predict / app.py
# Bhadralab's picture
# Update app.py
# da1f78f verified
import os
import re
import tempfile
from io import StringIO

import gradio as gr
import pandas as pd
import torch
from transformers import EsmForSequenceClassification, EsmTokenizer
# --- Load tokenizer & model ---
# The tokenizer comes from the public ESM-2 checkpoint; the classifier weights
# are read from the local "model/best_model5" directory.
tokenizer = EsmTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D", do_lower_case=False)

# Load the classification model and switch it to evaluation mode.
model = EsmForSequenceClassification.from_pretrained("model/best_model5")
model.eval()
# --- FASTA Reader ---
def read_fasta(fasta_string):
    """Parse FASTA text into parallel lists of header lines and sequences.

    Blank lines are ignored. A line starting with ">" opens a new record and
    is kept verbatim (including the ">") as that record's header. Every other
    line is sequence data for the most recent header; consecutive sequence
    lines are concatenated into one sequence.

    Args:
        fasta_string: The FASTA-formatted text to parse.

    Returns:
        A ``(headers, sequences)`` tuple of two lists of equal length, where
        ``sequences[i]`` belongs to ``headers[i]``.

    Raises:
        ValueError: If a sequence line contains anything other than the 20
            uppercase natural amino-acid letters, if a (possibly multi-line)
            sequence is longer than 30 characters, if the number of headers
            and sequences differ, or if sequence data appears before the
            first header.
    """
    max_length = 30
    residue_pattern = re.compile(r'^[ACDEFGHIKLMNPQRSTVWY]+$')

    # Each record is [header_or_None, list_of_sequence_lines]. A header of
    # None marks sequence data that appeared before any header line, so it is
    # never silently attached to a header that follows it.
    records = []
    for raw_line in StringIO(fasta_string):
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith(">"):
            records.append([line, []])
            continue

        if not residue_pattern.match(line):
            raise ValueError(
                "Invalid FASTA format: Only natural amino acids (ACDEFGHIKLMNPQRSTVWY) allowed."
            )

        if not records:
            records.append([None, []])
        chunks = records[-1][1]
        chunks.append(line)

        # Enforce the limit on the whole sequence, not on each physical line,
        # so a sequence wrapped over several lines cannot bypass it.
        sequence = "".join(chunks)
        if len(sequence) > max_length:
            raise ValueError(
                f"Sequence too long: '{sequence}' ({len(sequence)} > {max_length} characters)."
            )

    headers = [header for header, _ in records if header is not None]
    sequences = ["".join(chunks) for _, chunks in records if chunks]

    if len(headers) != len(sequences):
        raise ValueError(
            f"FASTA parsing error: Found {len(headers)} headers but {len(sequences)} sequences. "
            "Each header must be followed by a sequence."
        )

    # Equal counts do not guarantee correct pairing: headerless sequence data
    # followed by an empty header would otherwise line up by accident.
    if any(header is None or not chunks for header, chunks in records):
        raise ValueError(
            "FASTA parsing error: Sequence data found before the first header. "
            "Each sequence must be preceded by its own header line."
        )

    return headers, sequences
def predict_peptide_class(sequences):
    """Score peptide sequences with the module-level tokenizer and model.

    Args:
        sequences: Iterable of peptide sequences; each item is converted to
            ``str`` before tokenization.

    Returns:
        A ``(probs, classes)`` tuple. ``probs`` is a NumPy array holding the
        sigmoid of column 1 of the model logits for each sequence, and
        ``classes`` is a list with "B3PP" where that value is above 0.5 and
        "Non-B3PP" otherwise.
    """
    peptide_strings = list(map(str, sequences))

    # NOTE(review): max_length limits the tokenized length; if the tokenizer
    # adds special tokens, peptides at the 30-character limit would be
    # truncated here -- confirm against the training setup.
    encoded = tokenizer(
        peptide_strings,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=30,
    )

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        output = model(**encoded)
        # NOTE(review): sigmoid (not softmax) is applied before taking
        # column 1 -- presumably matches how the checkpoint was trained; verify.
        positive_scores = torch.sigmoid(output.logits)[:, 1]

    probs = positive_scores.cpu().numpy()
    classes = []
    for score in probs:
        classes.append("B3PP" if score > 0.5 else "Non-B3PP")
    return probs, classes
def predict_from_fasta(fasta_input):
    """Process FASTA input and return predictions as a table plus a CSV file.

    Args:
        fasta_input: FASTA-formatted text.

    Returns:
        A ``(df, csv_path)`` tuple. On success ``df`` has the columns
        "Header", "Sequence", "Probability" (formatted to two decimals) and
        "Predicted Class", and ``csv_path`` is the path of a CSV file with
        the same content. On failure ``df`` has a single "Error" column with
        the message and ``csv_path`` is ``None``. Exceptions are not
        propagated to the caller.
    """
    try:
        headers, sequences = read_fasta(fasta_input)
        if not sequences:
            return pd.DataFrame({"Error": ["No valid sequences found."]}), None

        probs, classes = predict_peptide_class(sequences)
        df = pd.DataFrame({
            "Header": headers,
            "Sequence": sequences,
            "Probability": [f"{p:.2f}" for p in probs],
            "Predicted Class": classes,
        })

        # Write into a fresh temporary directory per call: a fixed path in
        # the working directory is shared by all requests, so concurrent
        # users could overwrite each other's results. The file name stays
        # "predictions.csv".
        csv_path = os.path.join(tempfile.mkdtemp(prefix="b3pp_"), "predictions.csv")
        df.to_csv(csv_path, index=False)
        return df, csv_path
    except ValueError as e:
        # Validation problems are shown to the user verbatim.
        return pd.DataFrame({"Error": [str(e)]}), None
    except Exception as e:
        # Top-level boundary: surface any other failure in the table
        # instead of raising.
        return pd.DataFrame({"Error": [f"Unexpected error: {str(e)}"]}), None
# Gradio UI: one FASTA text box in, a predictions table and a CSV download out.
iface = gr.Interface(
    title="B3PP Predictor",
    description=(
        "Submit peptide sequences in FASTA format to determine their potential "
        "as blood-brain barrier penetration peptides. "
        "Sequences must consist exclusively of natural amino acids in uppercase "
        "letters, with a maximum length of 30 characters."
    ),
    fn=predict_from_fasta,
    inputs=gr.Textbox(
        placeholder="Paste your peptide sequences in FASTA format here",
        lines=10,
    ),
    outputs=[
        gr.Dataframe(label="Predictions"),
        gr.File(label="Download CSV"),
    ],
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()