Spaces:

m0ksh
/

PeptideAI

Sleeping

App Files Community

m0ksh committed on Mar 23

Commit

b7e5b63

verified ·

1 Parent(s): 68a01ab

Sync from GitHub (preserve manual model files)

Browse files

Files changed (8) hide show

StreamlitApp/StreamlitApp.py +3 -10
StreamlitApp/utils/analyze.py +3 -4
StreamlitApp/utils/optimize.py +3 -7
StreamlitApp/utils/peptide_extras.py +12 -29
StreamlitApp/utils/predict.py +4 -11
StreamlitApp/utils/rateLimit.py +1 -2
StreamlitApp/utils/ui_helpers.py +6 -21
StreamlitApp/utils/visualize.py +2 -3

StreamlitApp/StreamlitApp.py CHANGED Viewed

@@ -1,5 +1,4 @@
-"""Main Streamlit entrypoint wiring Predict, Analyze, Optimize, Visualize, and t-SNE pages."""
 import streamlit as st
 import pandas as pd
 import numpy as np
@@ -43,19 +42,13 @@ except Exception:
 def _tooltip_label(label: str, tooltip_text: str) -> None:
-    """
-    Render a label with a hover tooltip using HTML 'title' attribute.
-    """
     safe = _html.escape(tooltip_text, quote=True)
     st.markdown(f"{label} <span title='{safe}' style='cursor:help;color:#666'>(i)</span>", unsafe_allow_html=True)
 def _try_copy_to_clipboard(text: str) -> None:
-    """
-    Best-effort clipboard copy (server-side only).
-    Avoids streamlit.components.html — iframe/JS can fail on Hugging Face Spaces
-    (TypeError: Failed to fetch dynamically imported module for static/js chunks).
-    """
     if pyperclip is not None:
         try:
             pyperclip.copy(text)

+# Main Streamlit entrypoint wiring Predict, Analyze, Optimize, Visualize, and t-SNE pages.
 import streamlit as st
 import pandas as pd
 import numpy as np
 def _tooltip_label(label: str, tooltip_text: str) -> None:
+    # Render a label with a lightweight HTML hover tooltip.
     safe = _html.escape(tooltip_text, quote=True)
     st.markdown(f"{label} <span title='{safe}' style='cursor:help;color:#666'>(i)</span>", unsafe_allow_html=True)
 def _try_copy_to_clipboard(text: str) -> None:
+    # Best-effort server-side clipboard copy (browser copy is intentionally avoided).
     if pyperclip is not None:
         try:
             pyperclip.copy(text)

StreamlitApp/utils/analyze.py CHANGED Viewed

@@ -1,16 +1,15 @@
-"""Sequence composition and physicochemical property helpers."""
 from collections import Counter
 def aa_composition(sequence):
-    """Return normalized frequencies for the 20 canonical amino acids."""
     amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
     counts = Counter(sequence)
     total = len(sequence)
     return {aa: counts.get(aa, 0) / total for aa in amino_acids}
 def compute_properties(sequence):
-    """Compute simple length, mass, hydrophobicity, and net-charge signals."""
     aa_weights = {'A': 89.1, 'R': 174.2, 'N': 132.1, 'D': 133.1, 'C': 121.2,
                   'E': 147.1, 'Q': 146.2, 'G': 75.1, 'H': 155.2, 'I': 131.2,
                   'L': 131.2, 'K': 146.2, 'M': 149.2, 'F': 165.2, 'P': 115.1,

+# Sequence composition and physicochemical property helpers.
 from collections import Counter
 def aa_composition(sequence):
+    # Return normalized frequencies for the 20 canonical amino acids.
     amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
     counts = Counter(sequence)
     total = len(sequence)
     return {aa: counts.get(aa, 0) / total for aa in amino_acids}
 def compute_properties(sequence):
+    # Compute simple length, mass, hydrophobicity, and net-charge signals.
     aa_weights = {'A': 89.1, 'R': 174.2, 'N': 132.1, 'D': 133.1, 'C': 121.2,
                   'E': 147.1, 'Q': 146.2, 'G': 75.1, 'H': 155.2, 'I': 131.2,
                   'L': 131.2, 'K': 146.2, 'M': 149.2, 'F': 165.2, 'P': 115.1,

StreamlitApp/utils/optimize.py CHANGED Viewed

@@ -1,5 +1,4 @@
-"""Heuristic mutation search used by the Optimize page."""
 import random
 from utils.predict import predict_amp
@@ -10,7 +9,7 @@ POSITIVE = set("KRH")
 NEGATIVE = set("DE")
 def mutate_residue(residue):
-    """Return a candidate replacement residue and rationale."""
     if residue in POSITIVE:
         return residue, "Retained strong positive residue"
     elif residue in NEGATIVE:
@@ -23,10 +22,7 @@ def mutate_residue(residue):
         return random.choice(list(HYDROPHOBIC)), "Adjusted physicochemical profile"
 def optimize_sequence(seq, model, max_rounds=20, confidence_threshold=0.001):
-    """
-    Iteratively optimize sequence to increase AMP probability.
-    Tries mutating all positions per round and accepts the best change.
-    """
     current_seq = seq
     label, conf = predict_amp(current_seq, model)
     best_conf = conf

+# Heuristic mutation search used by the Optimize page.
 import random
 from utils.predict import predict_amp
 NEGATIVE = set("DE")
 def mutate_residue(residue):
+    # Return a candidate replacement residue and rationale.
     if residue in POSITIVE:
         return residue, "Retained strong positive residue"
     elif residue in NEGATIVE:
         return random.choice(list(HYDROPHOBIC)), "Adjusted physicochemical profile"
 def optimize_sequence(seq, model, max_rounds=20, confidence_threshold=0.001):
+    # Iteratively improve AMP probability by accepting the best mutation per round.
     current_seq = seq
     label, conf = predict_amp(current_seq, model)
     best_conf = conf

StreamlitApp/utils/peptide_extras.py CHANGED Viewed

@@ -1,8 +1,5 @@
-"""
-Optional peptide UI helpers: 3D approximation (py3Dmol), known-AMP similarity, residue highlighting.
-Does not modify model loading or prediction logic.
-"""
 from __future__ import annotations
 import csv
@@ -28,10 +25,7 @@ def _amp_data_csv_path() -> pathlib.Path:
 def _load_known_amps_from_csv() -> List[str]:
-    """
-    Load unique sequences labeled as AMP (label == 1) from Data/ampData.csv.
-    Sequences are uppercased for consistent similarity matching.
-    """
     path = _amp_data_csv_path()
     if not path.exists():
         return list(_FALLBACK_KNOWN_AMPS)
@@ -113,7 +107,7 @@ _ONE_TO_THREE = {
 def sequence_similarity(seq1: str, seq2: str) -> float:
-    """Position-wise match rate normalized by max length (as specified)."""
     if not seq1 or not seq2:
         return 0.0
     matches = sum(1 for a, b in zip(seq1, seq2) if a == b)
@@ -121,7 +115,7 @@ def sequence_similarity(seq1: str, seq2: str) -> float:
 def find_most_similar(sequence: str) -> Tuple[Optional[str], float]:
-    """Return the closest known AMP and simple position-match similarity score."""
     if not sequence or not KNOWN_AMPS:
         return None, 0.0
     seq = "".join(c for c in sequence.upper() if not c.isspace())
@@ -138,7 +132,7 @@ def find_most_similar(sequence: str) -> Tuple[Optional[str], float]:
 def get_residue_color(aa: str) -> str:
-    """Map one-letter residue to a py3Dmol color name (single-letter, uppercased)."""
     ch = aa.upper() if aa else ""
     positive = ["K", "R", "H"]
     negative = ["D", "E"]
@@ -153,7 +147,7 @@ def get_residue_color(aa: str) -> str:
 def residue_color_mpl(aa: str) -> str:
-    """Matplotlib-compatible hex colors matching `get_residue_color` categories (high-contrast for plots)."""
     cat = get_residue_color(aa)
     return {
         "blue": "#1D4ED8",
@@ -198,10 +192,7 @@ COMPACT_MAP_LEGEND: str = """
 def plot_helical_wheel(sequence: str, figsize: Tuple[float, float] = (6.2, 6.2)) -> Any:
-    """
-    Detailed helical wheel (matplotlib polar): radial spokes, sequence-order connectors (i→i+1),
-    and colored residue disks — same chemistry classes as 3D / HTML maps (high-contrast colors).
-    """
     import matplotlib.pyplot as plt
     from matplotlib import patheffects as pe
@@ -289,7 +280,7 @@ def plot_helical_wheel(sequence: str, figsize: Tuple[float, float] = (6.2, 6.2))
 def get_residue_style(aa: str) -> str:
-    """Inline styles for sequence map — colors aligned with wheel / 3D categories (high contrast)."""
     positive = ["K", "R", "H"]
     negative = ["D", "E"]
     hydrophobic = ["A", "V", "I", "L", "M", "F", "W", "Y"]
@@ -303,7 +294,7 @@ def get_residue_style(aa: str) -> str:
 def build_importance_map_html(sequence: str) -> str:
-    """Build HTML for residue importance highlighting (escape non-AA safely)."""
     import html as html_mod
     # Emit one colored <span> per residue for inline sequence highlighting.
@@ -318,10 +309,7 @@ def build_importance_map_html(sequence: str) -> str:
 def generate_helix_pdb(sequence: str, smooth: bool = False) -> str:
-    """
-    Generate a minimal PDB string (helix-like CA trace).
-    When smooth=True, apply light coordinate smoothing for a softer backbone path.
-    """
     pdb_lines: List[str] = []
     atom_index = 1
     clean = "".join(c for c in sequence.upper() if not c.isspace())
@@ -375,12 +363,7 @@ def render_3d_structure(
     enhanced: bool = False,
     spin: bool = False,
 ) -> bool:
-    """
-    Render py3Dmol view: gray stick backbone + colored spheres per residue (CA-only PDB).
-    When enhanced=True: smoother helix path, slightly larger spheres, more labels.
-    When spin=True: enable viewer spin (off by default).
-    Not a real folded structure — helix-like CA trace only.
-    """
     import streamlit.components.v1 as components
     # Input sanitization keeps renderer stable across pasted FASTA/text snippets.

+# Optional peptide UI helpers: 3D approximation (py3Dmol), known-AMP similarity, and residue highlighting.
+# This module is UI-oriented and does not alter model loading or prediction logic.
 from __future__ import annotations
 import csv
 def _load_known_amps_from_csv() -> List[str]:
+    # Load unique AMP-labeled sequences from CSV and normalize to uppercase.
     path = _amp_data_csv_path()
     if not path.exists():
         return list(_FALLBACK_KNOWN_AMPS)
 def sequence_similarity(seq1: str, seq2: str) -> float:
+    # Compute simple position-wise match score normalized by the longer sequence.
     if not seq1 or not seq2:
         return 0.0
     matches = sum(1 for a, b in zip(seq1, seq2) if a == b)
 def find_most_similar(sequence: str) -> Tuple[Optional[str], float]:
+    # Return the closest known AMP and its simple position-match similarity score.
     if not sequence or not KNOWN_AMPS:
         return None, 0.0
     seq = "".join(c for c in sequence.upper() if not c.isspace())
 def get_residue_color(aa: str) -> str:
+    # Map one-letter residue codes to py3Dmol color categories.
     ch = aa.upper() if aa else ""
     positive = ["K", "R", "H"]
     negative = ["D", "E"]
 def residue_color_mpl(aa: str) -> str:
+    # Return high-contrast Matplotlib colors that mirror the 3D residue categories.
     cat = get_residue_color(aa)
     return {
         "blue": "#1D4ED8",
 def plot_helical_wheel(sequence: str, figsize: Tuple[float, float] = (6.2, 6.2)) -> Any:
+    # Build a detailed helical wheel with spokes, sequence connectors, and color-coded residues.
     import matplotlib.pyplot as plt
     from matplotlib import patheffects as pe
 def get_residue_style(aa: str) -> str:
+    # Return inline CSS style for sequence-map residue coloring.
     positive = ["K", "R", "H"]
     negative = ["D", "E"]
     hydrophobic = ["A", "V", "I", "L", "M", "F", "W", "Y"]
 def build_importance_map_html(sequence: str) -> str:
+    # Build safe HTML spans for residue-by-residue chemical highlighting.
     import html as html_mod
     # Emit one colored <span> per residue for inline sequence highlighting.
 def generate_helix_pdb(sequence: str, smooth: bool = False) -> str:
+    # Generate a minimal CA-only helix-like PDB approximation, with optional smoothing.
     pdb_lines: List[str] = []
     atom_index = 1
     clean = "".join(c for c in sequence.upper() if not c.isspace())
     enhanced: bool = False,
     spin: bool = False,
 ) -> bool:
+    # Render CA-only py3Dmol structure with category coloring and optional enhanced styling/spin.
     import streamlit.components.v1 as components
     # Input sanitization keeps renderer stable across pasted FASTA/text snippets.

StreamlitApp/utils/predict.py CHANGED Viewed

@@ -1,5 +1,4 @@
-"""Model loading, sequence encoding, and AMP inference helpers."""
 import pathlib
 import numpy as np
 import torch
@@ -24,7 +23,7 @@ class FastMLP(nn.Module):
 @st.cache_resource
 def load_model():
-    """Load model weights once per Streamlit process."""
     # Always resolve relative to the StreamlitApp folder, not the process CWD.
     streamlitapp_dir = pathlib.Path(__file__).resolve().parent.parent
     repo_root = streamlitapp_dir.parent
@@ -51,10 +50,7 @@ def load_model():
     return model
 def encode_sequence(seq, max_len=51):
-    """
-    Converts amino acid sequence to flattened one-hot vector
-    padded/truncated to match model input_dim (1024)
-    """
     amino_acids = "ACDEFGHIKLMNPQRSTVWY"
     aa_to_idx = {aa: i for i, aa in enumerate(amino_acids)}
@@ -72,10 +68,7 @@ def encode_sequence(seq, max_len=51):
     return flat
 def predict_amp(sequence, model):
-    """
-    Takes an amino acid sequence string and the loaded model,
-    returns ("AMP"/"Non-AMP") and probability
-    """
     x = torch.tensor(encode_sequence(sequence), dtype=torch.float32).unsqueeze(0)
     # Sigmoid(logit) gives AMP probability in [0, 1].

+# Model loading, sequence encoding, and AMP inference helpers.
 import pathlib
 import numpy as np
 import torch
 @st.cache_resource
 def load_model():
+    # Load model weights once per Streamlit process.
     # Always resolve relative to the StreamlitApp folder, not the process CWD.
     streamlitapp_dir = pathlib.Path(__file__).resolve().parent.parent
     repo_root = streamlitapp_dir.parent
     return model
 def encode_sequence(seq, max_len=51):
+    # Convert sequence to a padded/truncated flattened one-hot vector (1024 dims).
     amino_acids = "ACDEFGHIKLMNPQRSTVWY"
     aa_to_idx = {aa: i for i, aa in enumerate(amino_acids)}
     return flat
 def predict_amp(sequence, model):
+    # Run AMP inference and return predicted label plus AMP probability.
     x = torch.tensor(encode_sequence(sequence), dtype=torch.float32).unsqueeze(0)
     # Sigmoid(logit) gives AMP probability in [0, 1].

StreamlitApp/utils/rateLimit.py CHANGED Viewed

@@ -1,5 +1,4 @@
-"""Simple in-memory sliding-window rate limiter."""
 import time
 from collections import deque

+# Simple in-memory sliding-window rate limiter.
 import time
 from collections import deque

StreamlitApp/utils/ui_helpers.py CHANGED Viewed

@@ -1,5 +1,4 @@
-"""UI-facing formatting and summary helpers shared across pages."""
 import html as _html
 from typing import Dict, List, Tuple, Optional
@@ -7,9 +6,7 @@ from utils.analyze import compute_properties
 def predicted_confidence(row: Dict) -> Optional[float]:
-    """
-    Convert stored model probability (AMP probability) into "confidence of the predicted label".
-    """
     if not row:
         return None
     pred = row.get("Prediction")
@@ -39,13 +36,7 @@ def heuristic_reason_for_profile(charge: float, hydro_fraction: float) -> str:
 def choose_top_candidate(predictions: List[Dict]) -> Optional[Dict]:
-    """
-    Return dict with top-candidate info:
-      - sequence
-      - predicted_confidence (AMP-prob for AMP rows, else Non-AMP prob)
-      - label
-      - reason (heuristic based on computed properties)
-    """
     if not predictions:
         return None
@@ -85,9 +76,7 @@ def choose_top_candidate(predictions: List[Dict]) -> Optional[Dict]:
 def mutation_heatmap_html(original: str, final: str) -> str:
-    """
-    Compare residues position-by-position. Changed residues are highlighted in red.
-    """
     orig = original or ""
     fin = final or ""
     max_len = max(len(orig), len(fin))
@@ -138,9 +127,7 @@ def _ideal_distance_to_interval(value: float, low: float, high: float) -> float:
 def optimization_summary(orig_seq: str, orig_conf: float, final_seq: str, final_conf: float) -> Dict:
-    """
-    Compute small summary signals for the Optimize page.
-    """
     orig_seq = orig_seq or ""
     final_seq = final_seq or ""
@@ -197,9 +184,7 @@ def sequence_length_warning(seq: str) -> Optional[str]:
 def sequence_health_label(conf_prob: float, charge: float, hydro_fraction: float) -> Tuple[str, str]:
-    """
-    Returns: (label, color_css)
-    """
     # Very high model confidence is treated as strong even outside ideal property ranges.
     if conf_prob >= 0.9:
         return "Strong AMP candidate", "#2ca02c"

+# UI-facing formatting and summary helpers shared across pages.
 import html as _html
 from typing import Dict, List, Tuple, Optional
 def predicted_confidence(row: Dict) -> Optional[float]:
+    # Convert AMP probability into confidence of the predicted class.
     if not row:
         return None
     pred = row.get("Prediction")
 def choose_top_candidate(predictions: List[Dict]) -> Optional[Dict]:
+    # Select best candidate row and attach a short profile-based reason.
     if not predictions:
         return None
 def mutation_heatmap_html(original: str, final: str) -> str:
+    # Highlight per-position residue changes between original and final sequences.
     orig = original or ""
     fin = final or ""
     max_len = max(len(orig), len(fin))
 def optimization_summary(orig_seq: str, orig_conf: float, final_seq: str, final_conf: float) -> Dict:
+    # Compute confidence and property deltas for the Optimize summary panel.
     orig_seq = orig_seq or ""
     final_seq = final_seq or ""
 def sequence_health_label(conf_prob: float, charge: float, hydro_fraction: float) -> Tuple[str, str]:
+    # Return a short quality label plus color for Analyze page status display.
     # Very high model confidence is treated as strong even outside ideal property ranges.
     if conf_prob >= 0.9:
         return "Strong AMP candidate", "#2ca02c"

StreamlitApp/utils/visualize.py CHANGED Viewed

@@ -1,5 +1,4 @@
-"""Legacy t-SNE helper retained for ad-hoc embedding previews."""
 import pandas as pd
 import matplotlib.pyplot as plt
 from sklearn.manifold import TSNE
@@ -9,7 +8,7 @@ import numpy as np
 from utils.predict import encode_sequence
 def tsne_visualization(sequences, model):
-    """Project model embeddings into 2D and render a quick scatter plot."""
     st.info("Generating embeddings... this may take a moment.")
     embeddings = []
     for seq in sequences:

+# Legacy t-SNE helper retained for ad-hoc embedding previews.
 import pandas as pd
 import matplotlib.pyplot as plt
 from sklearn.manifold import TSNE
 from utils.predict import encode_sequence
 def tsne_visualization(sequences, model):
+    # Project model embeddings into 2D and render a quick scatter plot.
     st.info("Generating embeddings... this may take a moment.")
     embeddings = []
     for seq in sequences: