Bhadralab committed on
Commit
c925c32
·
verified ·
1 Parent(s): bbd8ca0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -3
app.py CHANGED
@@ -1,3 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  def read_fasta(fasta_string):
2
  """Parses FASTA format input and returns headers + sequences with validation."""
3
  sequences = []
@@ -16,11 +34,11 @@ def read_fasta(fasta_string):
16
  else:
17
  if not re.match(r'^[ACDEFGHIKLMNPQRSTVWY]+$', line):
18
  raise ValueError(
19
- "Invalid FASTA format: Sequences must contain only natural amino acids (ACDEFGHIKLMNPQRSTVWY)."
20
  )
21
  if len(line) > 30:
22
  raise ValueError(
23
- f"Sequence too long: '{line}' (Max 30 characters allowed)."
24
  )
25
  seq_buffer.append(line)
26
 
@@ -36,7 +54,23 @@ def read_fasta(fasta_string):
36
 
37
  return headers, sequences
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
 
40
  def predict_from_fasta(fasta_input):
41
  """Processes FASTA input and returns predictions in a table format."""
42
  try:
@@ -45,7 +79,6 @@ def predict_from_fasta(fasta_input):
45
  return pd.DataFrame({"Error": ["No valid sequences found."]})
46
 
47
  probs, classes = predict_peptide_class(sequences)
48
-
49
  return pd.DataFrame({
50
  "Header": headers,
51
  "Sequence": sequences,
@@ -57,3 +90,22 @@ def predict_from_fasta(fasta_input):
57
  return pd.DataFrame({"Error": [str(e)]})
58
  except Exception as e:
59
  return pd.DataFrame({"Error": [f"Unexpected error: {str(e)}"]})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import torch
3
+ import pandas as pd
4
+ import gradio as gr
5
+ from io import StringIO
6
+ from transformers import EsmForSequenceClassification, EsmTokenizer
7
+
8
+ # --- Load tokenizer & model ---
9
# Hub id of the base ESM-2 checkpoint that supplies the tokenizer.
_TOKENIZER_CHECKPOINT = "facebook/esm2_t6_8M_UR50D"

# Local directory holding the fine-tuned classifier weights and config.
# NOTE(review): the tokenizer is loaded from the hub checkpoint above, not
# from this directory, so tokenizer files stored here are not read by this code.
_MODEL_DIR = "model/best_model5"

tokenizer = EsmTokenizer.from_pretrained(_TOKENIZER_CHECKPOINT, do_lower_case=False)

model = EsmForSequenceClassification.from_pretrained(_MODEL_DIR)
# Switch to inference mode once at import time.
model.eval()
17
+
18
+ # --- FASTA Reader ---
19
  def read_fasta(fasta_string):
20
  """Parses FASTA format input and returns headers + sequences with validation."""
21
  sequences = []
 
34
  else:
35
  if not re.match(r'^[ACDEFGHIKLMNPQRSTVWY]+$', line):
36
  raise ValueError(
37
+ "Invalid FASTA format: Only natural amino acids (ACDEFGHIKLMNPQRSTVWY) allowed."
38
  )
39
  if len(line) > 30:
40
  raise ValueError(
41
+ f"Sequence too long: '{line}' ({len(line)} > 30 characters)."
42
  )
43
  seq_buffer.append(line)
44
 
 
54
 
55
  return headers, sequences
56
 
57
+ # --- Prediction ---
58
def predict_peptide_class(sequences):
    """Classify a batch of peptide sequences as B3PP or Non-B3PP.

    Args:
        sequences: Flat list of amino-acid strings, passed to the tokenizer
            as one batch.

    Returns:
        A ``(probs, classes)`` pair: ``probs`` is a NumPy array holding the
        sigmoid of the logit at column index 1 for each sequence, and
        ``classes`` is a list with ``"B3PP"`` where that value exceeds 0.5
        and ``"Non-B3PP"`` otherwise.
    """
    # NOTE(review): max_length presumably counts the tokenizer's special
    # tokens as well, in which case 29-30 residue inputs (allowed by the
    # FASTA validation) would be truncated here. Confirm against the
    # settings used when the model was fine-tuned.
    encoded = tokenizer(
        sequences,
        max_length=30,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )

    with torch.no_grad():
        output = model(**encoded)
        # NOTE(review): sigmoid is applied per logit and column 1 is read as
        # the positive-class score. If the head was trained as a two-way
        # softmax classifier this is not a normalised probability; verify.
        positive_scores = torch.sigmoid(output.logits)[:, 1]
        probs = positive_scores.cpu().numpy()

    classes = []
    for score in probs:
        if score > 0.5:
            classes.append("B3PP")
        else:
            classes.append("Non-B3PP")
    return probs, classes
72
 
73
+ # --- Combined Handler ---
74
  def predict_from_fasta(fasta_input):
75
  """Processes FASTA input and returns predictions in a table format."""
76
  try:
 
79
  return pd.DataFrame({"Error": ["No valid sequences found."]})
80
 
81
  probs, classes = predict_peptide_class(sequences)
 
82
  return pd.DataFrame({
83
  "Header": headers,
84
  "Sequence": sequences,
 
90
  return pd.DataFrame({"Error": [str(e)]})
91
  except Exception as e:
92
  return pd.DataFrame({"Error": [f"Unexpected error: {str(e)}"]})
93
+
94
+ # --- Gradio UI ---
95
# Help text shown with the interface; assembled from adjacent string literals.
_DESCRIPTION = (
    "Submit peptide sequences in FASTA format to determine their potential as "
    "blood-brain barrier penetration peptides. Sequences must consist exclusively "
    "of natural amino acids in uppercase letters, with a maximum length of 30 characters."
)

# Multi-line text box where the user pastes FASTA records.
_fasta_box = gr.Textbox(
    placeholder="Paste your peptide sequences in FASTA format here",
    lines=10,
)

# Wire the FASTA handler to a text input and a dataframe output.
iface = gr.Interface(
    fn=predict_from_fasta,
    inputs=_fasta_box,
    outputs=gr.Dataframe(),
    title="B3PP Predictor",
    description=_DESCRIPTION,
)


if __name__ == "__main__":
    # Launch the UI only when this file is run as a script.
    iface.launch()