m0ksh committed on
Commit
b7e5b63
·
verified ·
1 Parent(s): 68a01ab

Sync from GitHub (preserve manual model files)

Browse files
StreamlitApp/StreamlitApp.py CHANGED
@@ -1,5 +1,4 @@
1
- """Main Streamlit entrypoint wiring Predict, Analyze, Optimize, Visualize, and t-SNE pages."""
2
-
3
  import streamlit as st
4
  import pandas as pd
5
  import numpy as np
@@ -43,19 +42,13 @@ except Exception:
43
 
44
 
45
  def _tooltip_label(label: str, tooltip_text: str) -> None:
46
- """
47
- Render a label with a hover tooltip using HTML 'title' attribute.
48
- """
49
  safe = _html.escape(tooltip_text, quote=True)
50
  st.markdown(f"{label} <span title='{safe}' style='cursor:help;color:#666'>(i)</span>", unsafe_allow_html=True)
51
 
52
 
53
  def _try_copy_to_clipboard(text: str) -> None:
54
- """
55
- Best-effort clipboard copy (server-side only).
56
- Avoids streamlit.components.html — iframe/JS can fail on Hugging Face Spaces
57
- (TypeError: Failed to fetch dynamically imported module for static/js chunks).
58
- """
59
  if pyperclip is not None:
60
  try:
61
  pyperclip.copy(text)
 
1
+ # Main Streamlit entrypoint wiring Predict, Analyze, Optimize, Visualize, and t-SNE pages.
 
2
  import streamlit as st
3
  import pandas as pd
4
  import numpy as np
 
42
 
43
 
44
  def _tooltip_label(label: str, tooltip_text: str) -> None:
45
+ # Render a label with a lightweight HTML hover tooltip.
 
 
46
  safe = _html.escape(tooltip_text, quote=True)
47
  st.markdown(f"{label} <span title='{safe}' style='cursor:help;color:#666'>(i)</span>", unsafe_allow_html=True)
48
 
49
 
50
  def _try_copy_to_clipboard(text: str) -> None:
51
+ # Best-effort server-side clipboard copy (browser copy is intentionally avoided).
 
 
 
 
52
  if pyperclip is not None:
53
  try:
54
  pyperclip.copy(text)
StreamlitApp/utils/analyze.py CHANGED
@@ -1,16 +1,15 @@
1
- """Sequence composition and physicochemical property helpers."""
2
-
3
  from collections import Counter
4
 
5
  def aa_composition(sequence):
6
- """Return normalized frequencies for the 20 canonical amino acids."""
7
  amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
8
  counts = Counter(sequence)
9
  total = len(sequence)
10
  return {aa: counts.get(aa, 0) / total for aa in amino_acids}
11
 
12
  def compute_properties(sequence):
13
- """Compute simple length, mass, hydrophobicity, and net-charge signals."""
14
  aa_weights = {'A': 89.1, 'R': 174.2, 'N': 132.1, 'D': 133.1, 'C': 121.2,
15
  'E': 147.1, 'Q': 146.2, 'G': 75.1, 'H': 155.2, 'I': 131.2,
16
  'L': 131.2, 'K': 146.2, 'M': 149.2, 'F': 165.2, 'P': 115.1,
 
1
+ # Sequence composition and physicochemical property helpers.
 
2
  from collections import Counter
3
 
4
  def aa_composition(sequence):
5
+ # Return normalized frequencies for the 20 canonical amino acids.
6
  amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
7
  counts = Counter(sequence)
8
  total = len(sequence)
9
  return {aa: counts.get(aa, 0) / total for aa in amino_acids}
10
 
11
  def compute_properties(sequence):
12
+ # Compute simple length, mass, hydrophobicity, and net-charge signals.
13
  aa_weights = {'A': 89.1, 'R': 174.2, 'N': 132.1, 'D': 133.1, 'C': 121.2,
14
  'E': 147.1, 'Q': 146.2, 'G': 75.1, 'H': 155.2, 'I': 131.2,
15
  'L': 131.2, 'K': 146.2, 'M': 149.2, 'F': 165.2, 'P': 115.1,
StreamlitApp/utils/optimize.py CHANGED
@@ -1,5 +1,4 @@
1
- """Heuristic mutation search used by the Optimize page."""
2
-
3
  import random
4
  from utils.predict import predict_amp
5
 
@@ -10,7 +9,7 @@ POSITIVE = set("KRH")
10
  NEGATIVE = set("DE")
11
 
12
  def mutate_residue(residue):
13
- """Return a candidate replacement residue and rationale."""
14
  if residue in POSITIVE:
15
  return residue, "Retained strong positive residue"
16
  elif residue in NEGATIVE:
@@ -23,10 +22,7 @@ def mutate_residue(residue):
23
  return random.choice(list(HYDROPHOBIC)), "Adjusted physicochemical profile"
24
 
25
  def optimize_sequence(seq, model, max_rounds=20, confidence_threshold=0.001):
26
- """
27
- Iteratively optimize sequence to increase AMP probability.
28
- Tries mutating all positions per round and accepts the best change.
29
- """
30
  current_seq = seq
31
  label, conf = predict_amp(current_seq, model)
32
  best_conf = conf
 
1
+ # Heuristic mutation search used by the Optimize page.
 
2
  import random
3
  from utils.predict import predict_amp
4
 
 
9
  NEGATIVE = set("DE")
10
 
11
  def mutate_residue(residue):
12
+ # Return a candidate replacement residue and rationale.
13
  if residue in POSITIVE:
14
  return residue, "Retained strong positive residue"
15
  elif residue in NEGATIVE:
 
22
  return random.choice(list(HYDROPHOBIC)), "Adjusted physicochemical profile"
23
 
24
  def optimize_sequence(seq, model, max_rounds=20, confidence_threshold=0.001):
25
+ # Iteratively improve AMP probability by accepting the best mutation per round.
 
 
 
26
  current_seq = seq
27
  label, conf = predict_amp(current_seq, model)
28
  best_conf = conf
StreamlitApp/utils/peptide_extras.py CHANGED
@@ -1,8 +1,5 @@
1
- """
2
- Optional peptide UI helpers: 3D approximation (py3Dmol), known-AMP similarity, residue highlighting.
3
-
4
- Does not modify model loading or prediction logic.
5
- """
6
  from __future__ import annotations
7
 
8
  import csv
@@ -28,10 +25,7 @@ def _amp_data_csv_path() -> pathlib.Path:
28
 
29
 
30
  def _load_known_amps_from_csv() -> List[str]:
31
- """
32
- Load unique sequences labeled as AMP (label == 1) from Data/ampData.csv.
33
- Sequences are uppercased for consistent similarity matching.
34
- """
35
  path = _amp_data_csv_path()
36
  if not path.exists():
37
  return list(_FALLBACK_KNOWN_AMPS)
@@ -113,7 +107,7 @@ _ONE_TO_THREE = {
113
 
114
 
115
  def sequence_similarity(seq1: str, seq2: str) -> float:
116
- """Position-wise match rate normalized by max length (as specified)."""
117
  if not seq1 or not seq2:
118
  return 0.0
119
  matches = sum(1 for a, b in zip(seq1, seq2) if a == b)
@@ -121,7 +115,7 @@ def sequence_similarity(seq1: str, seq2: str) -> float:
121
 
122
 
123
  def find_most_similar(sequence: str) -> Tuple[Optional[str], float]:
124
- """Return the closest known AMP and simple position-match similarity score."""
125
  if not sequence or not KNOWN_AMPS:
126
  return None, 0.0
127
  seq = "".join(c for c in sequence.upper() if not c.isspace())
@@ -138,7 +132,7 @@ def find_most_similar(sequence: str) -> Tuple[Optional[str], float]:
138
 
139
 
140
  def get_residue_color(aa: str) -> str:
141
- """Map one-letter residue to a py3Dmol color name (single-letter, uppercased)."""
142
  ch = aa.upper() if aa else ""
143
  positive = ["K", "R", "H"]
144
  negative = ["D", "E"]
@@ -153,7 +147,7 @@ def get_residue_color(aa: str) -> str:
153
 
154
 
155
  def residue_color_mpl(aa: str) -> str:
156
- """Matplotlib-compatible hex colors matching `get_residue_color` categories (high-contrast for plots)."""
157
  cat = get_residue_color(aa)
158
  return {
159
  "blue": "#1D4ED8",
@@ -198,10 +192,7 @@ COMPACT_MAP_LEGEND: str = """
198
 
199
 
200
  def plot_helical_wheel(sequence: str, figsize: Tuple[float, float] = (6.2, 6.2)) -> Any:
201
- """
202
- Detailed helical wheel (matplotlib polar): radial spokes, sequence-order connectors (i→i+1),
203
- and colored residue disks — same chemistry classes as 3D / HTML maps (high-contrast colors).
204
- """
205
  import matplotlib.pyplot as plt
206
  from matplotlib import patheffects as pe
207
 
@@ -289,7 +280,7 @@ def plot_helical_wheel(sequence: str, figsize: Tuple[float, float] = (6.2, 6.2))
289
 
290
 
291
  def get_residue_style(aa: str) -> str:
292
- """Inline styles for sequence map colors aligned with wheel / 3D categories (high contrast)."""
293
  positive = ["K", "R", "H"]
294
  negative = ["D", "E"]
295
  hydrophobic = ["A", "V", "I", "L", "M", "F", "W", "Y"]
@@ -303,7 +294,7 @@ def get_residue_style(aa: str) -> str:
303
 
304
 
305
  def build_importance_map_html(sequence: str) -> str:
306
- """Build HTML for residue importance highlighting (escape non-AA safely)."""
307
  import html as html_mod
308
 
309
  # Emit one colored <span> per residue for inline sequence highlighting.
@@ -318,10 +309,7 @@ def build_importance_map_html(sequence: str) -> str:
318
 
319
 
320
  def generate_helix_pdb(sequence: str, smooth: bool = False) -> str:
321
- """
322
- Generate a minimal PDB string (helix-like CA trace).
323
- When smooth=True, apply light coordinate smoothing for a softer backbone path.
324
- """
325
  pdb_lines: List[str] = []
326
  atom_index = 1
327
  clean = "".join(c for c in sequence.upper() if not c.isspace())
@@ -375,12 +363,7 @@ def render_3d_structure(
375
  enhanced: bool = False,
376
  spin: bool = False,
377
  ) -> bool:
378
- """
379
- Render py3Dmol view: gray stick backbone + colored spheres per residue (CA-only PDB).
380
- When enhanced=True: smoother helix path, slightly larger spheres, more labels.
381
- When spin=True: enable viewer spin (off by default).
382
- Not a real folded structure — helix-like CA trace only.
383
- """
384
  import streamlit.components.v1 as components
385
 
386
  # Input sanitization keeps renderer stable across pasted FASTA/text snippets.
 
1
+ # Optional peptide UI helpers: 3D approximation (py3Dmol), known-AMP similarity, and residue highlighting.
2
+ # This module is UI-oriented and does not alter model loading or prediction logic.
 
 
 
3
  from __future__ import annotations
4
 
5
  import csv
 
25
 
26
 
27
  def _load_known_amps_from_csv() -> List[str]:
28
+ # Load unique AMP-labeled sequences from CSV and normalize to uppercase.
 
 
 
29
  path = _amp_data_csv_path()
30
  if not path.exists():
31
  return list(_FALLBACK_KNOWN_AMPS)
 
107
 
108
 
109
  def sequence_similarity(seq1: str, seq2: str) -> float:
110
+ # Compute simple position-wise match score normalized by the longer sequence.
111
  if not seq1 or not seq2:
112
  return 0.0
113
  matches = sum(1 for a, b in zip(seq1, seq2) if a == b)
 
115
 
116
 
117
  def find_most_similar(sequence: str) -> Tuple[Optional[str], float]:
118
+ # Return the closest known AMP and its simple position-match similarity score.
119
  if not sequence or not KNOWN_AMPS:
120
  return None, 0.0
121
  seq = "".join(c for c in sequence.upper() if not c.isspace())
 
132
 
133
 
134
  def get_residue_color(aa: str) -> str:
135
+ # Map one-letter residue codes to py3Dmol color categories.
136
  ch = aa.upper() if aa else ""
137
  positive = ["K", "R", "H"]
138
  negative = ["D", "E"]
 
147
 
148
 
149
  def residue_color_mpl(aa: str) -> str:
150
+ # Return high-contrast Matplotlib colors that mirror the 3D residue categories.
151
  cat = get_residue_color(aa)
152
  return {
153
  "blue": "#1D4ED8",
 
192
 
193
 
194
  def plot_helical_wheel(sequence: str, figsize: Tuple[float, float] = (6.2, 6.2)) -> Any:
195
+ # Build a detailed helical wheel with spokes, sequence connectors, and color-coded residues.
 
 
 
196
  import matplotlib.pyplot as plt
197
  from matplotlib import patheffects as pe
198
 
 
280
 
281
 
282
  def get_residue_style(aa: str) -> str:
283
+ # Return inline CSS style for sequence-map residue coloring.
284
  positive = ["K", "R", "H"]
285
  negative = ["D", "E"]
286
  hydrophobic = ["A", "V", "I", "L", "M", "F", "W", "Y"]
 
294
 
295
 
296
  def build_importance_map_html(sequence: str) -> str:
297
+ # Build safe HTML spans for residue-by-residue chemical highlighting.
298
  import html as html_mod
299
 
300
  # Emit one colored <span> per residue for inline sequence highlighting.
 
309
 
310
 
311
  def generate_helix_pdb(sequence: str, smooth: bool = False) -> str:
312
+ # Generate a minimal CA-only helix-like PDB approximation, with optional smoothing.
 
 
 
313
  pdb_lines: List[str] = []
314
  atom_index = 1
315
  clean = "".join(c for c in sequence.upper() if not c.isspace())
 
363
  enhanced: bool = False,
364
  spin: bool = False,
365
  ) -> bool:
366
+ # Render CA-only py3Dmol structure with category coloring and optional enhanced styling/spin.
 
 
 
 
 
367
  import streamlit.components.v1 as components
368
 
369
  # Input sanitization keeps renderer stable across pasted FASTA/text snippets.
StreamlitApp/utils/predict.py CHANGED
@@ -1,5 +1,4 @@
1
- """Model loading, sequence encoding, and AMP inference helpers."""
2
-
3
  import pathlib
4
  import numpy as np
5
  import torch
@@ -24,7 +23,7 @@ class FastMLP(nn.Module):
24
 
25
  @st.cache_resource
26
  def load_model():
27
- """Load model weights once per Streamlit process."""
28
  # Always resolve relative to the StreamlitApp folder, not the process CWD.
29
  streamlitapp_dir = pathlib.Path(__file__).resolve().parent.parent
30
  repo_root = streamlitapp_dir.parent
@@ -51,10 +50,7 @@ def load_model():
51
  return model
52
 
53
  def encode_sequence(seq, max_len=51):
54
- """
55
- Converts amino acid sequence to flattened one-hot vector
56
- padded/truncated to match model input_dim (1024)
57
- """
58
  amino_acids = "ACDEFGHIKLMNPQRSTVWY"
59
  aa_to_idx = {aa: i for i, aa in enumerate(amino_acids)}
60
 
@@ -72,10 +68,7 @@ def encode_sequence(seq, max_len=51):
72
  return flat
73
 
74
  def predict_amp(sequence, model):
75
- """
76
- Takes an amino acid sequence string and the loaded model,
77
- returns ("AMP"/"Non-AMP") and probability
78
- """
79
  x = torch.tensor(encode_sequence(sequence), dtype=torch.float32).unsqueeze(0)
80
 
81
  # Sigmoid(logit) gives AMP probability in [0, 1].
 
1
+ # Model loading, sequence encoding, and AMP inference helpers.
 
2
  import pathlib
3
  import numpy as np
4
  import torch
 
23
 
24
  @st.cache_resource
25
  def load_model():
26
+ # Load model weights once per Streamlit process.
27
  # Always resolve relative to the StreamlitApp folder, not the process CWD.
28
  streamlitapp_dir = pathlib.Path(__file__).resolve().parent.parent
29
  repo_root = streamlitapp_dir.parent
 
50
  return model
51
 
52
  def encode_sequence(seq, max_len=51):
53
+ # Convert sequence to a padded/truncated flattened one-hot vector (1024 dims).
 
 
 
54
  amino_acids = "ACDEFGHIKLMNPQRSTVWY"
55
  aa_to_idx = {aa: i for i, aa in enumerate(amino_acids)}
56
 
 
68
  return flat
69
 
70
  def predict_amp(sequence, model):
71
+ # Run AMP inference and return predicted label plus AMP probability.
 
 
 
72
  x = torch.tensor(encode_sequence(sequence), dtype=torch.float32).unsqueeze(0)
73
 
74
  # Sigmoid(logit) gives AMP probability in [0, 1].
StreamlitApp/utils/rateLimit.py CHANGED
@@ -1,5 +1,4 @@
1
- """Simple in-memory sliding-window rate limiter."""
2
-
3
  import time
4
  from collections import deque
5
 
 
1
+ # Simple in-memory sliding-window rate limiter.
 
2
  import time
3
  from collections import deque
4
 
StreamlitApp/utils/ui_helpers.py CHANGED
@@ -1,5 +1,4 @@
1
- """UI-facing formatting and summary helpers shared across pages."""
2
-
3
  import html as _html
4
  from typing import Dict, List, Tuple, Optional
5
 
@@ -7,9 +6,7 @@ from utils.analyze import compute_properties
7
 
8
 
9
  def predicted_confidence(row: Dict) -> Optional[float]:
10
- """
11
- Convert stored model probability (AMP probability) into "confidence of the predicted label".
12
- """
13
  if not row:
14
  return None
15
  pred = row.get("Prediction")
@@ -39,13 +36,7 @@ def heuristic_reason_for_profile(charge: float, hydro_fraction: float) -> str:
39
 
40
 
41
  def choose_top_candidate(predictions: List[Dict]) -> Optional[Dict]:
42
- """
43
- Return dict with top-candidate info:
44
- - sequence
45
- - predicted_confidence (AMP-prob for AMP rows, else Non-AMP prob)
46
- - label
47
- - reason (heuristic based on computed properties)
48
- """
49
  if not predictions:
50
  return None
51
 
@@ -85,9 +76,7 @@ def choose_top_candidate(predictions: List[Dict]) -> Optional[Dict]:
85
 
86
 
87
  def mutation_heatmap_html(original: str, final: str) -> str:
88
- """
89
- Compare residues position-by-position. Changed residues are highlighted in red.
90
- """
91
  orig = original or ""
92
  fin = final or ""
93
  max_len = max(len(orig), len(fin))
@@ -138,9 +127,7 @@ def _ideal_distance_to_interval(value: float, low: float, high: float) -> float:
138
 
139
 
140
  def optimization_summary(orig_seq: str, orig_conf: float, final_seq: str, final_conf: float) -> Dict:
141
- """
142
- Compute small summary signals for the Optimize page.
143
- """
144
  orig_seq = orig_seq or ""
145
  final_seq = final_seq or ""
146
 
@@ -197,9 +184,7 @@ def sequence_length_warning(seq: str) -> Optional[str]:
197
 
198
 
199
  def sequence_health_label(conf_prob: float, charge: float, hydro_fraction: float) -> Tuple[str, str]:
200
- """
201
- Returns: (label, color_css)
202
- """
203
  # Very high model confidence is treated as strong even outside ideal property ranges.
204
  if conf_prob >= 0.9:
205
  return "Strong AMP candidate", "#2ca02c"
 
1
+ # UI-facing formatting and summary helpers shared across pages.
 
2
  import html as _html
3
  from typing import Dict, List, Tuple, Optional
4
 
 
6
 
7
 
8
  def predicted_confidence(row: Dict) -> Optional[float]:
9
+ # Convert AMP probability into confidence of the predicted class.
 
 
10
  if not row:
11
  return None
12
  pred = row.get("Prediction")
 
36
 
37
 
38
  def choose_top_candidate(predictions: List[Dict]) -> Optional[Dict]:
39
+ # Select best candidate row and attach a short profile-based reason.
 
 
 
 
 
 
40
  if not predictions:
41
  return None
42
 
 
76
 
77
 
78
  def mutation_heatmap_html(original: str, final: str) -> str:
79
+ # Highlight per-position residue changes between original and final sequences.
 
 
80
  orig = original or ""
81
  fin = final or ""
82
  max_len = max(len(orig), len(fin))
 
127
 
128
 
129
  def optimization_summary(orig_seq: str, orig_conf: float, final_seq: str, final_conf: float) -> Dict:
130
+ # Compute confidence and property deltas for the Optimize summary panel.
 
 
131
  orig_seq = orig_seq or ""
132
  final_seq = final_seq or ""
133
 
 
184
 
185
 
186
  def sequence_health_label(conf_prob: float, charge: float, hydro_fraction: float) -> Tuple[str, str]:
187
+ # Return a short quality label plus color for Analyze page status display.
 
 
188
  # Very high model confidence is treated as strong even outside ideal property ranges.
189
  if conf_prob >= 0.9:
190
  return "Strong AMP candidate", "#2ca02c"
StreamlitApp/utils/visualize.py CHANGED
@@ -1,5 +1,4 @@
1
- """Legacy t-SNE helper retained for ad-hoc embedding previews."""
2
-
3
  import pandas as pd
4
  import matplotlib.pyplot as plt
5
  from sklearn.manifold import TSNE
@@ -9,7 +8,7 @@ import numpy as np
9
  from utils.predict import encode_sequence
10
 
11
  def tsne_visualization(sequences, model):
12
- """Project model embeddings into 2D and render a quick scatter plot."""
13
  st.info("Generating embeddings... this may take a moment.")
14
  embeddings = []
15
  for seq in sequences:
 
1
+ # Legacy t-SNE helper retained for ad-hoc embedding previews.
 
2
  import pandas as pd
3
  import matplotlib.pyplot as plt
4
  from sklearn.manifold import TSNE
 
8
  from utils.predict import encode_sequence
9
 
10
  def tsne_visualization(sequences, model):
11
+ # Project model embeddings into 2D and render a quick scatter plot.
12
  st.info("Generating embeddings... this may take a moment.")
13
  embeddings = []
14
  for seq in sequences: