Spaces:

m0ksh
/

PeptideAI

Sleeping

App Files Community

m0ksh committed on Mar 22

Commit

69969dc

verified ·

1 Parent(s): e118453

Sync from GitHub (preserve manual model files)

Browse files

Files changed (2) hide show

StreamlitApp/StreamlitApp.py +5 -0
StreamlitApp/utils/peptide_extras.py +53 -5

StreamlitApp/StreamlitApp.py CHANGED Viewed

@@ -25,6 +25,7 @@ from utils.ui_helpers import (
     build_analysis_summary_text,
 )
 from utils.peptide_extras import (
     find_most_similar,
     build_importance_map_html,
     render_3d_structure,
@@ -416,6 +417,10 @@ elif page == "Analyze":
             st.markdown(build_importance_map_html(sequence), unsafe_allow_html=True)
             st.subheader("Most Similar Known AMP")
             match_seq, sim_score = find_most_similar(sequence)
             if match_seq is not None:
                 st.write(f"Sequence: **{match_seq}**")

     build_analysis_summary_text,
 )
 from utils.peptide_extras import (
+    KNOWN_AMPS,
     find_most_similar,
     build_importance_map_html,
     render_3d_structure,
             st.markdown(build_importance_map_html(sequence), unsafe_allow_html=True)
             st.subheader("Most Similar Known AMP")
+            st.caption(
+                f"Compared against **{len(KNOWN_AMPS)}** unique AMP sequences from the training set "
+                f"(`Data/ampData.csv`, label = 1)."
+            )
             match_seq, sim_score = find_most_similar(sequence)
             if match_seq is not None:
                 st.write(f"Sequence: **{match_seq}**")

StreamlitApp/utils/peptide_extras.py CHANGED Viewed

@@ -5,17 +5,62 @@ Does not modify model loading or prediction logic.
 """
 from __future__ import annotations
 import math
 from typing import List, Optional, Tuple
-# Small reference set of known AMP sequences (for similarity display only).
-KNOWN_AMPS: List[str] = [
     "KWKLFKKIGAVLKVL",
     "GIGKFLHSAKKFGKAFVGEIMNS",
     "LLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLV",
     "KLFKKILKYL",
     "FLPLLAGLAANFLPKIFCKITRKC",
-]
 # One-letter -> three-letter (for minimal PDB lines for py3Dmol).
 _ONE_TO_THREE = {
@@ -53,10 +98,13 @@ def sequence_similarity(seq1: str, seq2: str) -> float:
 def find_most_similar(sequence: str) -> Tuple[Optional[str], float]:
     if not sequence or not KNOWN_AMPS:
         return None, 0.0
     best_seq = KNOWN_AMPS[0]
-    best_score = sequence_similarity(sequence, KNOWN_AMPS[0])
     for amp in KNOWN_AMPS[1:]:
-        score = sequence_similarity(sequence, amp)
         if score > best_score:
             best_score = score
             best_seq = amp

 """
 from __future__ import annotations
+import csv
 import math
+import pathlib
 from typing import List, Optional, Tuple
+# Fallback if `Data/ampData.csv` is missing (e.g. local dev without Data/).
+_FALLBACK_KNOWN_AMPS: Tuple[str, ...] = (
     "KWKLFKKIGAVLKVL",
     "GIGKFLHSAKKFGKAFVGEIMNS",
     "LLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLV",
     "KLFKKILKYL",
     "FLPLLAGLAANFLPKIFCKITRKC",
+)
+def _amp_data_csv_path() -> pathlib.Path:
+    # StreamlitApp/utils/peptide_extras.py -> repo root is parents[2]
+    return pathlib.Path(__file__).resolve().parents[2] / "Data" / "ampData.csv"
+def _load_known_amps_from_csv() -> List[str]:
+    """
+    Load unique sequences labeled as AMP (label == 1) from Data/ampData.csv.
+    Sequences are uppercased for consistent similarity matching.
+    """
+    path = _amp_data_csv_path()
+    if not path.exists():
+        return list(_FALLBACK_KNOWN_AMPS)
+    seen: set[str] = set()
+    amps: List[str] = []
+    try:
+        with path.open(newline="", encoding="utf-8") as f:
+            reader = csv.DictReader(f)
+            if not reader.fieldnames or "sequence" not in reader.fieldnames:
+                return list(_FALLBACK_KNOWN_AMPS)
+            for row in reader:
+                label = str(row.get("label", "")).strip()
+                if label != "1":
+                    continue
+                raw = (row.get("sequence") or "").strip()
+                if not raw:
+                    continue
+                seq = raw.upper()
+                if seq in seen:
+                    continue
+                seen.add(seq)
+                amps.append(seq)
+    except Exception:
+        return list(_FALLBACK_KNOWN_AMPS)
+    return amps if amps else list(_FALLBACK_KNOWN_AMPS)
+# Known AMP pool for similarity search (from ampData.csv label=1, or fallback list).
+KNOWN_AMPS: List[str] = _load_known_amps_from_csv()
 # One-letter -> three-letter (for minimal PDB lines for py3Dmol).
 _ONE_TO_THREE = {
 def find_most_similar(sequence: str) -> Tuple[Optional[str], float]:
     if not sequence or not KNOWN_AMPS:
         return None, 0.0
+    seq = "".join(c for c in sequence.upper() if not c.isspace())
+    if not seq:
+        return None, 0.0
     best_seq = KNOWN_AMPS[0]
+    best_score = sequence_similarity(seq, KNOWN_AMPS[0])
     for amp in KNOWN_AMPS[1:]:
+        score = sequence_similarity(seq, amp)
         if score > best_score:
             best_score = score
             best_seq = amp