Spaces:

Bhadralab
/

B3PPs_Predict

Sleeping

App Files Community

Bhadralab committed on Sep 15, 2025

Commit

c925c32

verified ·

1 Parent(s): bbd8ca0

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -3

app.py CHANGED Viewed

@@ -1,3 +1,21 @@
 def read_fasta(fasta_string):
     """Parses FASTA format input and returns headers + sequences with validation."""
     sequences = []
@@ -16,11 +34,11 @@ def read_fasta(fasta_string):
         else:
             if not re.match(r'^[ACDEFGHIKLMNPQRSTVWY]+$', line):
                 raise ValueError(
-                    "Invalid FASTA format: Sequences must contain only natural amino acids (ACDEFGHIKLMNPQRSTVWY)."
                 )
             if len(line) > 30:
                 raise ValueError(
-                    f"Sequence too long: '{line}' (Max 30 characters allowed)."
                 )
             seq_buffer.append(line)
@@ -36,7 +54,23 @@ def read_fasta(fasta_string):
     return headers, sequences
 def predict_from_fasta(fasta_input):
     """Processes FASTA input and returns predictions in a table format."""
     try:
@@ -45,7 +79,6 @@ def predict_from_fasta(fasta_input):
             return pd.DataFrame({"Error": ["No valid sequences found."]})
         probs, classes = predict_peptide_class(sequences)
         return pd.DataFrame({
             "Header": headers,
             "Sequence": sequences,
@@ -57,3 +90,22 @@ def predict_from_fasta(fasta_input):
         return pd.DataFrame({"Error": [str(e)]})
     except Exception as e:
         return pd.DataFrame({"Error": [f"Unexpected error: {str(e)}"]})

+import re
+import torch
+import pandas as pd
+import gradio as gr
+from io import StringIO
+from transformers import EsmForSequenceClassification, EsmTokenizer
+# --- Load tokenizer & model (both built once at import time and reused by predict_peptide_class) ---
+tokenizer = EsmTokenizer.from_pretrained(
+    "facebook/esm2_t6_8M_UR50D",  # Hub ID the tokenizer files are loaded from
+    do_lower_case=False  # NOTE(review): presumably keeps residue letters uppercase; confirm EsmTokenizer honors this kwarg
+)
+# Load your fine-tuned model directory (must contain config.json, pytorch_model.bin, tokenizer files).
+# The path is relative, so it is resolved against the process working directory.
+model = EsmForSequenceClassification.from_pretrained("model/best_model5")
+model.eval()  # switch the module to evaluation mode; this app only runs inference
+# --- FASTA Reader ---
 def read_fasta(fasta_string):
     """Parses FASTA format input and returns headers + sequences with validation."""
     sequences = []
         else:
             if not re.match(r'^[ACDEFGHIKLMNPQRSTVWY]+$', line):
                 raise ValueError(
+                    "Invalid FASTA format: Only natural amino acids (ACDEFGHIKLMNPQRSTVWY) allowed."
                 )
             if len(line) > 30:
                 raise ValueError(
+                    f"Sequence too long: '{line}' ({len(line)} > 30 characters)."
                 )
             seq_buffer.append(line)
     return headers, sequences
+# --- Prediction ---
+def predict_peptide_class(sequences):
+    """Predicts peptide classifications in batch."""
+    inputs = tokenizer(
+        sequences,               # <- flat list of strings
+        return_tensors='pt',
+        padding=True,
+        truncation=True,
+        max_length=30
+    )
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    probs = torch.sigmoid(logits)[:, 1].cpu().numpy()
+    classes = ["B3PP" if p > 0.5 else "Non-B3PP" for p in probs]
+    return probs, classes
+# --- Combined Handler ---
 def predict_from_fasta(fasta_input):
     """Processes FASTA input and returns predictions in a table format."""
     try:
             return pd.DataFrame({"Error": ["No valid sequences found."]})
         probs, classes = predict_peptide_class(sequences)
         return pd.DataFrame({
             "Header": headers,
             "Sequence": sequences,
         return pd.DataFrame({"Error": [str(e)]})
     except Exception as e:
         return pd.DataFrame({"Error": [f"Unexpected error: {str(e)}"]})
+# --- Gradio UI: one text box in, one results table out, handled by predict_from_fasta ---
+iface = gr.Interface(
+    fn=predict_from_fasta,  # receives the text box contents and returns a pandas DataFrame
+    inputs=gr.Textbox(
+        lines=10,
+        placeholder="Paste your peptide sequences in FASTA format here"
+    ),
+    outputs=gr.Dataframe(),  # shows the returned DataFrame (predictions, or a single "Error" column)
+    title="B3PP Predictor",
+    description=(
+        "Submit peptide sequences in FASTA format to determine their potential as "
+        "blood-brain barrier penetration peptides. Sequences must consist exclusively "
+        "of natural amino acids in uppercase letters, with a maximum length of 30 characters."
+    )
+)
+if __name__ == "__main__":  # launch the app only when the file is run as a script
+    iface.launch()