Bhadralab committed on
Commit
bbd8ca0
·
verified ·
1 Parent(s): a34c6e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -59
app.py CHANGED
@@ -1,27 +1,8 @@
1
- import re
2
- import torch
3
- import pandas as pd
4
- import gradio as gr
5
- from io import StringIO
6
- from transformers import EsmForSequenceClassification, EsmTokenizer
7
-
8
- # --- Load tokenizer & model ---
9
- tokenizer = EsmTokenizer.from_pretrained(
10
- "facebook/esm2_t6_8M_UR50D",
11
- do_lower_case=False
12
- )
13
-
14
- # Load your fine-tuned model directory
15
- model = EsmForSequenceClassification.from_pretrained("model/best_model5")
16
- model.eval()
17
-
18
- # --- FASTA Reader ---
19
  def read_fasta(fasta_string):
20
- """Parses FASTA format input and returns sequences with validation."""
21
  sequences = []
22
  headers = []
23
  seq_buffer = []
24
- header = None
25
 
26
  for line in StringIO(fasta_string):
27
  line = line.strip()
@@ -31,15 +12,12 @@ def read_fasta(fasta_string):
31
  if seq_buffer:
32
  sequences.append("".join(seq_buffer))
33
  seq_buffer.clear()
34
- header = line
35
- headers.append(header)
36
  else:
37
- # Check amino acids
38
  if not re.match(r'^[ACDEFGHIKLMNPQRSTVWY]+$', line):
39
  raise ValueError(
40
  "Invalid FASTA format: Sequences must contain only natural amino acids (ACDEFGHIKLMNPQRSTVWY)."
41
  )
42
- # Length check
43
  if len(line) > 30:
44
  raise ValueError(
45
  f"Sequence too long: '{line}' (Max 30 characters allowed)."
@@ -49,25 +27,16 @@ def read_fasta(fasta_string):
49
  if seq_buffer:
50
  sequences.append("".join(seq_buffer))
51
 
 
 
 
 
 
 
 
52
  return headers, sequences
53
 
54
- # --- Prediction ---
55
- def predict_peptide_class(sequences):
56
- """Predicts peptide classifications in batch."""
57
- inputs = tokenizer(
58
- sequences,
59
- return_tensors='pt',
60
- padding=True,
61
- truncation=True,
62
- max_length=30
63
- )
64
- with torch.no_grad():
65
- logits = model(**inputs).logits
66
- probs = torch.sigmoid(logits)[:, 1].cpu().numpy()
67
- classes = ["B3PP" if p > 0.5 else "Non-B3PP" for p in probs]
68
- return probs, classes
69
 
70
- # --- Combined Handler ---
71
  def predict_from_fasta(fasta_input):
72
  """Processes FASTA input and returns predictions in a table format."""
73
  try:
@@ -76,6 +45,7 @@ def predict_from_fasta(fasta_input):
76
  return pd.DataFrame({"Error": ["No valid sequences found."]})
77
 
78
  probs, classes = predict_peptide_class(sequences)
 
79
  return pd.DataFrame({
80
  "Header": headers,
81
  "Sequence": sequences,
@@ -87,22 +57,3 @@ def predict_from_fasta(fasta_input):
87
  return pd.DataFrame({"Error": [str(e)]})
88
  except Exception as e:
89
  return pd.DataFrame({"Error": [f"Unexpected error: {str(e)}"]})
90
-
91
- # --- Gradio UI ---
92
- iface = gr.Interface(
93
- fn=predict_from_fasta,
94
- inputs=gr.Textbox(
95
- lines=10,
96
- placeholder="Paste your peptide sequences in FASTA format here"
97
- ),
98
- outputs=gr.Dataframe(),
99
- title="B3PP Predictor",
100
- description=(
101
- "Submit peptide sequences in FASTA format to determine their potential as "
102
- "blood-brain barrier penetration peptides. Sequences must consist exclusively "
103
- "of natural amino acids in uppercase letters, with a maximum length of 30 characters."
104
- )
105
- )
106
-
107
- if __name__ == "__main__":
108
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  def read_fasta(fasta_string):
2
+ """Parses FASTA format input and returns headers + sequences with validation."""
3
  sequences = []
4
  headers = []
5
  seq_buffer = []
 
6
 
7
  for line in StringIO(fasta_string):
8
  line = line.strip()
 
12
  if seq_buffer:
13
  sequences.append("".join(seq_buffer))
14
  seq_buffer.clear()
15
+ headers.append(line)
 
16
  else:
 
17
  if not re.match(r'^[ACDEFGHIKLMNPQRSTVWY]+$', line):
18
  raise ValueError(
19
  "Invalid FASTA format: Sequences must contain only natural amino acids (ACDEFGHIKLMNPQRSTVWY)."
20
  )
 
21
  if len(line) > 30:
22
  raise ValueError(
23
  f"Sequence too long: '{line}' (Max 30 characters allowed)."
 
27
  if seq_buffer:
28
  sequences.append("".join(seq_buffer))
29
 
30
+ # Ensure same number of headers and sequences
31
+ if len(headers) != len(sequences):
32
+ raise ValueError(
33
+ f"FASTA parsing error: Found {len(headers)} headers but {len(sequences)} sequences. "
34
+ "Each header must be followed by a sequence."
35
+ )
36
+
37
  return headers, sequences
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
 
40
  def predict_from_fasta(fasta_input):
41
  """Processes FASTA input and returns predictions in a table format."""
42
  try:
 
45
  return pd.DataFrame({"Error": ["No valid sequences found."]})
46
 
47
  probs, classes = predict_peptide_class(sequences)
48
+
49
  return pd.DataFrame({
50
  "Header": headers,
51
  "Sequence": sequences,
 
57
  return pd.DataFrame({"Error": [str(e)]})
58
  except Exception as e:
59
  return pd.DataFrame({"Error": [f"Unexpected error: {str(e)}"]})