Spaces:

Bhadralab
/

B3PPs_Predict

Sleeping

App Files Files Community

Bhadralab committed on Sep 15, 2025

Commit

bbd8ca0

verified ·

1 Parent(s): a34c6e6

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -59

app.py CHANGED Viewed

@@ -1,27 +1,8 @@
-import re
-import torch
-import pandas as pd
-import gradio as gr
-from io import StringIO
-from transformers import EsmForSequenceClassification, EsmTokenizer
-# --- Load tokenizer & model ---
-tokenizer = EsmTokenizer.from_pretrained(
-    "facebook/esm2_t6_8M_UR50D",
-    do_lower_case=False
-)
-# Load your fine-tuned model directory
-model = EsmForSequenceClassification.from_pretrained("model/best_model5")
-model.eval()
-# --- FASTA Reader ---
 def read_fasta(fasta_string):
-    """Parses FASTA format input and returns sequences with validation."""
     sequences = []
     headers = []
     seq_buffer = []
-    header = None
     for line in StringIO(fasta_string):
         line = line.strip()
@@ -31,15 +12,12 @@ def read_fasta(fasta_string):
             if seq_buffer:
                 sequences.append("".join(seq_buffer))
                 seq_buffer.clear()
-            header = line
-            headers.append(header)
         else:
-            # Check amino acids
             if not re.match(r'^[ACDEFGHIKLMNPQRSTVWY]+$', line):
                 raise ValueError(
                     "Invalid FASTA format: Sequences must contain only natural amino acids (ACDEFGHIKLMNPQRSTVWY)."
                 )
-            # Length check
             if len(line) > 30:
                 raise ValueError(
                     f"Sequence too long: '{line}' (Max 30 characters allowed)."
@@ -49,25 +27,16 @@ def read_fasta(fasta_string):
     if seq_buffer:
         sequences.append("".join(seq_buffer))
     return headers, sequences
-# --- Prediction ---
-def predict_peptide_class(sequences):
-    """Predicts peptide classifications in batch."""
-    inputs = tokenizer(
-        sequences,
-        return_tensors='pt',
-        padding=True,
-        truncation=True,
-        max_length=30
-    )
-    with torch.no_grad():
-        logits = model(**inputs).logits
-    probs = torch.sigmoid(logits)[:, 1].cpu().numpy()
-    classes = ["B3PP" if p > 0.5 else "Non-B3PP" for p in probs]
-    return probs, classes
-# --- Combined Handler ---
 def predict_from_fasta(fasta_input):
     """Processes FASTA input and returns predictions in a table format."""
     try:
@@ -76,6 +45,7 @@ def predict_from_fasta(fasta_input):
             return pd.DataFrame({"Error": ["No valid sequences found."]})
         probs, classes = predict_peptide_class(sequences)
         return pd.DataFrame({
             "Header": headers,
             "Sequence": sequences,
@@ -87,22 +57,3 @@ def predict_from_fasta(fasta_input):
         return pd.DataFrame({"Error": [str(e)]})
     except Exception as e:
         return pd.DataFrame({"Error": [f"Unexpected error: {str(e)}"]})
-# --- Gradio UI ---
-iface = gr.Interface(
-    fn=predict_from_fasta,
-    inputs=gr.Textbox(
-        lines=10,
-        placeholder="Paste your peptide sequences in FASTA format here"
-    ),
-    outputs=gr.Dataframe(),
-    title="B3PP Predictor",
-    description=(
-        "Submit peptide sequences in FASTA format to determine their potential as "
-        "blood-brain barrier penetration peptides. Sequences must consist exclusively "
-        "of natural amino acids in uppercase letters, with a maximum length of 30 characters."
-    )
-)
-if __name__ == "__main__":
-    iface.launch()

 def read_fasta(fasta_string):
+    """Parses FASTA format input and returns headers + sequences with validation."""
     sequences = []
     headers = []
     seq_buffer = []
     for line in StringIO(fasta_string):
         line = line.strip()
             if seq_buffer:
                 sequences.append("".join(seq_buffer))
                 seq_buffer.clear()
+            headers.append(line)
         else:
             if not re.match(r'^[ACDEFGHIKLMNPQRSTVWY]+$', line):
                 raise ValueError(
                     "Invalid FASTA format: Sequences must contain only natural amino acids (ACDEFGHIKLMNPQRSTVWY)."
                 )
             if len(line) > 30:
                 raise ValueError(
                     f"Sequence too long: '{line}' (Max 30 characters allowed)."
     if seq_buffer:
         sequences.append("".join(seq_buffer))
+    # Ensure same number of headers and sequences
+    if len(headers) != len(sequences):
+        raise ValueError(
+            f"FASTA parsing error: Found {len(headers)} headers but {len(sequences)} sequences. "
+            "Each header must be followed by a sequence."
+        )
     return headers, sequences
 def predict_from_fasta(fasta_input):
     """Processes FASTA input and returns predictions in a table format."""
     try:
             return pd.DataFrame({"Error": ["No valid sequences found."]})
         probs, classes = predict_peptide_class(sequences)
         return pd.DataFrame({
             "Header": headers,
             "Sequence": sequences,
         return pd.DataFrame({"Error": [str(e)]})
     except Exception as e:
         return pd.DataFrame({"Error": [f"Unexpected error: {str(e)}"]})