m0ksh committed on
Commit
69969dc
·
verified ·
1 Parent(s): e118453

Sync from GitHub (preserve manual model files)

Browse files
StreamlitApp/StreamlitApp.py CHANGED
@@ -25,6 +25,7 @@ from utils.ui_helpers import (
25
  build_analysis_summary_text,
26
  )
27
  from utils.peptide_extras import (
 
28
  find_most_similar,
29
  build_importance_map_html,
30
  render_3d_structure,
@@ -416,6 +417,10 @@ elif page == "Analyze":
416
  st.markdown(build_importance_map_html(sequence), unsafe_allow_html=True)
417
 
418
  st.subheader("Most Similar Known AMP")
 
 
 
 
419
  match_seq, sim_score = find_most_similar(sequence)
420
  if match_seq is not None:
421
  st.write(f"Sequence: **{match_seq}**")
 
25
  build_analysis_summary_text,
26
  )
27
  from utils.peptide_extras import (
28
+ KNOWN_AMPS,
29
  find_most_similar,
30
  build_importance_map_html,
31
  render_3d_structure,
 
417
  st.markdown(build_importance_map_html(sequence), unsafe_allow_html=True)
418
 
419
  st.subheader("Most Similar Known AMP")
420
+ st.caption(
421
+ f"Compared against **{len(KNOWN_AMPS)}** unique AMP sequences from the training set "
422
+ f"(`Data/ampData.csv`, label = 1)."
423
+ )
424
  match_seq, sim_score = find_most_similar(sequence)
425
  if match_seq is not None:
426
  st.write(f"Sequence: **{match_seq}**")
StreamlitApp/utils/peptide_extras.py CHANGED
@@ -5,17 +5,62 @@ Does not modify model loading or prediction logic.
5
  """
6
  from __future__ import annotations
7
 
 
8
  import math
 
9
  from typing import List, Optional, Tuple
10
 
11
- # Small reference set of known AMP sequences (for similarity display only).
12
- KNOWN_AMPS: List[str] = [
13
  "KWKLFKKIGAVLKVL",
14
  "GIGKFLHSAKKFGKAFVGEIMNS",
15
  "LLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLV",
16
  "KLFKKILKYL",
17
  "FLPLLAGLAANFLPKIFCKITRKC",
18
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  # One-letter -> three-letter (for minimal PDB lines for py3Dmol).
21
  _ONE_TO_THREE = {
@@ -53,10 +98,13 @@ def sequence_similarity(seq1: str, seq2: str) -> float:
53
  def find_most_similar(sequence: str) -> Tuple[Optional[str], float]:
54
  if not sequence or not KNOWN_AMPS:
55
  return None, 0.0
 
 
 
56
  best_seq = KNOWN_AMPS[0]
57
- best_score = sequence_similarity(sequence, KNOWN_AMPS[0])
58
  for amp in KNOWN_AMPS[1:]:
59
- score = sequence_similarity(sequence, amp)
60
  if score > best_score:
61
  best_score = score
62
  best_seq = amp
 
5
  """
6
  from __future__ import annotations
7
 
8
+ import csv
9
  import math
10
+ import pathlib
11
  from typing import List, Optional, Tuple
12
 
13
# Fallback if `Data/ampData.csv` is missing (e.g. local dev without Data/).
_FALLBACK_KNOWN_AMPS: Tuple[str, ...] = (
    "KWKLFKKIGAVLKVL",
    "GIGKFLHSAKKFGKAFVGEIMNS",
    "LLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLV",
    "KLFKKILKYL",
    "FLPLLAGLAANFLPKIFCKITRKC",
)


def _amp_data_csv_path() -> pathlib.Path:
    """Return the default location of the AMP data CSV.

    The path is built from this module's resolved location:
    StreamlitApp/utils/peptide_extras.py -> repo root is ``parents[2]``,
    and the CSV is ``Data/ampData.csv`` beneath it.
    """
    return pathlib.Path(__file__).resolve().parents[2] / "Data" / "ampData.csv"


def _load_known_amps_from_csv(path: Optional[pathlib.Path] = None) -> List[str]:
    """Load unique sequences labeled as AMP (label == 1) from a CSV file.

    Sequences are stripped and uppercased for consistent similarity matching;
    duplicates are dropped while keeping first-seen order.

    Args:
        path: CSV file to read. Defaults to the location returned by
            ``_amp_data_csv_path()``.

    Returns:
        The list of AMP sequences, or a copy of ``_FALLBACK_KNOWN_AMPS`` when
        the file is missing or unreadable, is not valid UTF-8, cannot be
        parsed as CSV, has no ``sequence`` column, or yields no label-1 rows.
    """
    csv_path = _amp_data_csv_path() if path is None else pathlib.Path(path)

    # A dict keeps insertion order and de-duplicates in one structure.
    unique: dict[str, None] = {}
    try:
        # "utf-8-sig" drops a leading BOM (common in spreadsheet exports);
        # with plain "utf-8" the first header would read "\ufeffsequence" and
        # the column check below would silently reject a valid file.
        with csv_path.open(newline="", encoding="utf-8-sig") as f:
            reader = csv.DictReader(f)
            if not reader.fieldnames or "sequence" not in reader.fieldnames:
                return list(_FALLBACK_KNOWN_AMPS)
            for row in reader:
                # A short row yields None for the missing column; str() turns
                # that into "None", which is simply not "1".
                if str(row.get("label", "")).strip() != "1":
                    continue
                seq = (row.get("sequence") or "").strip().upper()
                if seq:
                    unique[seq] = None
    except (OSError, UnicodeError, csv.Error):
        # Missing/unreadable file, bad encoding, or malformed CSV: this runs
        # at import time, so degrade to the built-in list instead of raising.
        return list(_FALLBACK_KNOWN_AMPS)

    return list(unique) if unique else list(_FALLBACK_KNOWN_AMPS)
60
+
61
+
62
+ # Known AMP pool for similarity search (from ampData.csv label=1, or fallback list).
63
+ KNOWN_AMPS: List[str] = _load_known_amps_from_csv()
64
 
65
  # One-letter -> three-letter (for minimal PDB lines for py3Dmol).
66
  _ONE_TO_THREE = {
 
98
  def find_most_similar(sequence: str) -> Tuple[Optional[str], float]:
99
  if not sequence or not KNOWN_AMPS:
100
  return None, 0.0
101
+ seq = "".join(c for c in sequence.upper() if not c.isspace())
102
+ if not seq:
103
+ return None, 0.0
104
  best_seq = KNOWN_AMPS[0]
105
+ best_score = sequence_similarity(seq, KNOWN_AMPS[0])
106
  for amp in KNOWN_AMPS[1:]:
107
+ score = sequence_similarity(seq, amp)
108
  if score > best_score:
109
  best_score = score
110
  best_seq = amp