Spaces:

ChatterjeeLab
/

PeptiVerse

Running

App Files Files Community

yinuozhang commited on Nov 25, 2025

Commit

3aedb16

1 Parent(s): 9e9ca0b

add tango

Browse files

Files changed (5) hide show

app.py +407 -30
description.md +18 -0
tango_x86_64_release +3 -0
tokenizer/__pycache__/__init__.cpython-310.pyc +0 -0
tokenizer/__pycache__/my_tokenizers.cpython-310.pyc +0 -0

app.py CHANGED Viewed

@@ -11,10 +11,17 @@ from pathlib import Path
 import json
 import time
 from typing import List, Dict, Any, Tuple, Optional
 from huggingface_hub import snapshot_download
 from pathlib import Path
 import os
 def pick_assets_root() -> Path:
     # HF Spaces container uses /home/user; detect via SPACE_ID or existence
@@ -58,8 +65,8 @@ for k, v in {
 ASSETS_MODELS = ASSETS / "models";        ASSETS_MODELS.mkdir(parents=True, exist_ok=True)
 ASSETS_DATA   = ASSETS / "training_data"; ASSETS_DATA.mkdir(parents=True, exist_ok=True)
-MODEL_REPO = "ChatterjeeLab/Classifier_Weight"
-DATASET_REPO = "ChatterjeeLab/Classifier_Weight"
 def fetch_models_and_data():
     snapshot_download(
@@ -106,6 +113,135 @@ def is_smiles_like(s: str) -> bool:
     maybe_smiles_chars = set("=#()[]+\\/-@1234567890")
     return (any(ch in maybe_smiles_chars for ch in s) or not is_aa_sequence_like(s)) and len(s) >= 2
 # ==================== Model Classes ====================
 # --- add this utility somewhere above UnifiedPeptidePredictor ---
@@ -157,7 +293,7 @@ from transformers import AutoModelForMaskedLM
 class PeptideCLMFeaturizer:
     """
     Mean-pool hidden states from PeptideCLM-23M-all for SMILES tokens produced by SMILES_SPE_Tokenizer.
-    Use the SAME tokenizer files, max_length, and pooling you used in training your XGB models.
     """
     def __init__(self, vocab_path: str, splits_path: str, device: torch.device, max_length: int = 256):
         self.device = device
@@ -535,7 +671,6 @@ class TrainingDataManager:
             if len(vals) == 0:
                 return None
-            # Use the conventional log Peff unit; keep your prior display threshold (-4.0) or set median
             threshold_default = float(np.median(vals))
             return {
                 "values": vals,
@@ -764,7 +899,7 @@ class UnifiedPeptidePredictor:
         # Model registry
         self.models = {}
         self.model_configs = self.get_model_configs()
         # Data manager
         self.data_manager = TrainingDataManager(data_dir=ASSETS_DATA)
         self._protein_cache = {}
@@ -837,8 +972,8 @@ class UnifiedPeptidePredictor:
                 'path': 'models/best_model_nonfouling.json',
                 'unit': 'Probability',
                 'display_name': '👯 Non-Fouling',
-                'positive_label': 'Non-fouling',
-                'negative_label': 'Fouling'
             },
             'nonfouling_smiles': {
                 'type': 'xgboost',
@@ -846,8 +981,8 @@ class UnifiedPeptidePredictor:
                 'path': 'models/nonfouling-xgboost_smiles.json',
                 'unit': 'Probability',
                 'display_name': '👯 Non-Fouling',
-                'positive_label': 'Non-fouling',
-                'negative_label': 'Fouling'
             },
             'binding_affinity': {
                 'type': 'binding',
@@ -859,12 +994,22 @@ class UnifiedPeptidePredictor:
             'binding_affinity_smiles': {
                 'type': 'binding_smiles',
                 'input': 'sequence+smiles',
-                'path': 'models/binding_affinity_smiles.pt',
                 'unit': 'Probability',
                 'display_name': '🔗 Binding Affinity (SMILES)'
             },
         }
     def load_all_models(self):
         """Load all available models"""
         for name, config in self.model_configs.items():
@@ -1076,7 +1221,109 @@ class UnifiedPeptidePredictor:
     def _features_from_smiles_peptclm(self, s: str) -> np.ndarray:
         return self.smiles_featurizer.embed_list([s])[0]
 # ==================== Gradio Interface ====================
@@ -1102,6 +1349,10 @@ def predict_properties(
     half_life: bool,
     nonfouling: bool,
     binding_affinity: bool,
     progress=gr.Progress()
 ):
     """Main prediction function"""
@@ -1127,11 +1378,10 @@ def predict_properties(
     # Collect selected properties
     selected_properties = []
-    # Map UI checkboxes to your internal model keys
     checkbox_to_keys = {
         'hemolysis':       ['hemolysis_seq', 'hemolysis_smiles'],
         'solubility':      ['solubility_seq', 'solubility_smiles'],
-        'permeability':    ['permeability_smiles'],  # only smiles in your current config
         'half_life':       ['half_life_seq', 'binding_affinity_smiles'],
         'nonfouling':      ['nonfouling_seq', 'nonfouling_smiles'],       # adjust if you have a real cytotox model
     }
@@ -1192,7 +1442,88 @@ def predict_properties(
                     })
                 except Exception as e:
                     print(f"Error predicting {prop}: {e}")
     # Handle binding affinity separately
     if binding_affinity and input_text:
         # Sequence–Sequence binding
@@ -1209,12 +1540,27 @@ def predict_properties(
                             protein_seq,
                             binder_seq
                         )
                         results.append({
-                            'Sequence': f"Protein–{binder_seq[:20]}...",
-                            'Property': pred.model_configs['binding_affinity']['display_name'],
-                            'Prediction': binding_class,  # e.g., Tight/Medium/Weak
                             'Value': f"{affinity:.3f}",
-                            'Unit': pred.model_configs['binding_affinity']['unit']
                         })
                 except Exception as e:
                     print(f"Error in sequence binding prediction: {e}")
@@ -1237,12 +1583,27 @@ def predict_properties(
                         protein_seq,
                         smi
                     )
                     results.append({
-                        'Sequence': f"Protein–{smi[:20]}...",
-                        'Property': pred.model_configs['binding_affinity_smiles']['display_name'],
-                        'Prediction': label,             # Tight (���7.5) / Medium (6.0–7.5) / Weak (<6.0)
                         'Value': f"{affinity:.3f}",
-                        'Unit': pred.model_configs['binding_affinity_smiles']['unit'],
                     })
             except Exception as e:
                 print(f"Error in SMILES binding prediction: {e}")
@@ -1336,7 +1697,7 @@ def load_example(example_name):
     return "", ""
 def on_example_change(name: str):
-    binder, protein = load_example(name)   # your helper above
     show_protein = (name == "Protein-Peptide")
     return (
         gr.update(value=binder),                         # input_text
@@ -1376,7 +1737,7 @@ h1 {
     text-align: center;
     margin-bottom: 10px !important;
 }
-h2 {
     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
     -webkit-background-clip: text;
     -webkit-text-fill-color: transparent;
@@ -1403,7 +1764,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
     gr.Markdown(
         """
         # 🌐 PeptiVerse
-        ## \t  Peptide Property Predictions
         """
     )
@@ -1452,13 +1813,29 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
                 with gr.Column(scale=1):
                     with gr.Group():
                         gr.Markdown("### ⚙️ Select Properties")
                         with gr.Accordion("Sequence Properties", open=True):
                             hemolysis = gr.Checkbox(label="🩸 Hemolysis ↓", value=True)
                             solubility = gr.Checkbox(label="💧 Solubility ↑", value=True)
                             permeability = gr.Checkbox(label="🪣 Permeability ↑", value=False)
                             half_life = gr.Checkbox(label="⏱️ Half-life ↑", value=False)
-                            nonfouling = gr.Checkbox(label="👯 Non-Fouling ↑", value=False)
                         with gr.Accordion("Binding Prediction", open=False):
                             binding_affinity = gr.Checkbox(label="🔗 Binding Affinity ↑", value=False)
                             gr.Markdown("*Requires protein sequence input*")
@@ -1468,7 +1845,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
                 with gr.Column(scale=1):
                     property_selector = gr.Dropdown(
                         choices=["hemolysis", "solubility", "permeability", "half_life (smiles)",
-                                "nonfouling", "binding_affinity"],
                         label="Select Property",
                         value="hemolysis"
                     )
@@ -1550,7 +1927,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
             input_text, input_type, protein_seq,
             hemolysis, solubility, permeability,
             half_life, nonfouling,
-            binding_affinity
         ],
         outputs=[results_df, status_output]
     )

 import json
 import time
 from typing import List, Dict, Any, Tuple, Optional
+import subprocess
+from collections import defaultdict
 from huggingface_hub import snapshot_download
 from pathlib import Path
 import os
+try:
+    from Bio.SeqUtils.ProtParam import ProteinAnalysis
+    BIOPYTHON_AVAILABLE = True
+except ImportError:
+    BIOPYTHON_AVAILABLE = False
+    print("BioPython not available. Using fallback for pI/charge calculations.")
 def pick_assets_root() -> Path:
     # HF Spaces container uses /home/user; detect via SPACE_ID or existence
 ASSETS_MODELS = ASSETS / "models";        ASSETS_MODELS.mkdir(parents=True, exist_ok=True)
 ASSETS_DATA   = ASSETS / "training_data"; ASSETS_DATA.mkdir(parents=True, exist_ok=True)
+MODEL_REPO = "ChatterjeeLab/Classifier_Weight"       # model repo
+DATASET_REPO = "ChatterjeeLab/Classifier_Weight"        # dataset repo (create this)
 def fetch_models_and_data():
     snapshot_download(
     maybe_smiles_chars = set("=#()[]+\\/-@1234567890")
     return (any(ch in maybe_smiles_chars for ch in s) or not is_aa_sequence_like(s)) and len(s) >= 2
+# ==================== Sequence Analysis ====================
class SequenceAnalyzer:
    """Calculate physicochemical properties of peptide sequences.

    Each public calculator first tries BioPython's ``ProteinAnalysis`` (when the
    module-level ``BIOPYTHON_AVAILABLE`` flag is true) and falls back to a
    self-contained approximation if BioPython is unavailable or raises.
    """

    # pKa values for the termini and ionizable side chains (fallback path)
    PKA_VALUES = {
        'N_term': 9.6,
        'C_term': 2.3,
        'D': 3.9,  # Aspartic acid
        'E': 4.2,  # Glutamic acid
        'H': 6.0,  # Histidine
        'C': 8.3,  # Cysteine
        'Y': 10.1,  # Tyrosine
        'K': 10.5,  # Lysine
        'R': 12.5,  # Arginine
    }

    # Side chains counted as positive when protonated.
    _BASIC_RESIDUES = frozenset('KRH')
    # Side chains counted as negative when deprotonated.
    _ACIDIC_RESIDUES = frozenset('DECY')

    # Per-residue weights used by the fallback molecular-weight estimate;
    # one water is subtracted per peptide bond.
    _RESIDUE_WEIGHTS = {
        'A': 89.1, 'C': 121.2, 'D': 133.1, 'E': 147.1, 'F': 165.2,
        'G': 75.1, 'H': 155.2, 'I': 131.2, 'K': 146.2, 'L': 131.2,
        'M': 149.2, 'N': 132.1, 'P': 115.1, 'Q': 146.2, 'R': 174.2,
        'S': 105.1, 'T': 119.1, 'V': 117.1, 'W': 204.2, 'Y': 181.2,
    }
    _WATER_WEIGHT = 18.0

    # Kyte-Doolittle hydropathy scale used by the fallback GRAVY calculation.
    _KYTE_DOOLITTLE = {
        'A': 1.8, 'C': 2.5, 'D': -3.5, 'E': -3.5, 'F': 2.8,
        'G': -0.4, 'H': -3.2, 'I': 4.5, 'K': -3.9, 'L': 3.8,
        'M': 1.9, 'N': -3.5, 'P': -1.6, 'Q': -3.5, 'R': -4.5,
        'S': -0.8, 'T': -0.7, 'V': 4.2, 'W': -0.9, 'Y': -1.3,
    }

    @staticmethod
    def _try_biopython(sequence: str, method: str, *args: float) -> Optional[float]:
        """Return ``ProteinAnalysis(sequence).<method>(*args)``, or None.

        None means "use the fallback": BioPython is not installed, or it raised
        for this sequence.
        """
        if not BIOPYTHON_AVAILABLE:
            return None
        try:
            return getattr(ProteinAnalysis(sequence), method)(*args)
        except Exception:
            # Deliberately best-effort, but no longer a bare ``except:`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            return None

    @classmethod
    def calculate_net_charge(cls, sequence: str, pH: float = 7.0) -> float:
        """Calculate net charge at the given pH (Henderson-Hasselbalch).

        The fallback sums the fractional charge of both termini and of every
        K/R/H (positive) and D/E/C/Y (negative) residue, rounded to 2 decimals.
        Residues without an entry in ``PKA_VALUES`` contribute nothing.
        """
        value = cls._try_biopython(sequence, 'charge_at_pH', pH)
        if value is not None:
            return value

        pka = cls.PKA_VALUES
        # Termini: protonated N-terminus is +1, deprotonated C-terminus is -1.
        charge = 1 / (1 + 10 ** (pH - pka['N_term']))
        charge -= 1 / (1 + 10 ** (pka['C_term'] - pH))
        for aa in sequence:
            if aa in cls._BASIC_RESIDUES:
                charge += 1 / (1 + 10 ** (pH - pka[aa]))
            elif aa in cls._ACIDIC_RESIDUES:
                charge -= 1 / (1 + 10 ** (pka[aa] - pH))
        return round(charge, 2)

    @classmethod
    def calculate_isoelectric_point(cls, sequence: str) -> float:
        """Calculate the theoretical pI.

        The fallback bisects pH over [0, 14] until the interval is narrower
        than 0.01 or the net charge is within 0.01 of zero.
        """
        value = cls._try_biopython(sequence, 'isoelectric_point')
        if value is not None:
            return value

        pH_min, pH_max = 0.0, 14.0
        epsilon = 0.01
        while (pH_max - pH_min) > epsilon:
            pH_mid = (pH_min + pH_max) / 2
            charge = cls.calculate_net_charge(sequence, pH_mid)
            if abs(charge) < epsilon:
                return round(pH_mid, 2)
            # Net charge falls as pH rises: still positive means pI is higher.
            if charge > 0:
                pH_min = pH_mid
            else:
                pH_max = pH_mid
        return round((pH_min + pH_max) / 2, 2)

    @classmethod
    def calculate_molecular_weight(cls, sequence: str) -> float:
        """Calculate molecular weight in Da; an empty sequence weighs 0.0.

        The fallback sums per-residue weights (unknown residues count as 0) and
        subtracts one water per peptide bond, rounded to 1 decimal.
        """
        if not sequence:
            # Without this guard "minus (len - 1) waters" would ADD one water
            # and report a positive weight for an empty sequence.
            return 0.0

        value = cls._try_biopython(sequence, 'molecular_weight')
        if value is not None:
            return value

        mw = sum(cls._RESIDUE_WEIGHTS.get(aa, 0) for aa in sequence)
        mw -= cls._WATER_WEIGHT * (len(sequence) - 1)
        return round(mw, 1)

    @classmethod
    def calculate_hydrophobicity(cls, sequence: str) -> float:
        """Calculate GRAVY (grand average of hydropathy); 0.0 for an empty sequence.

        The fallback averages Kyte-Doolittle values over the full sequence
        length (unknown residues count as 0), rounded to 2 decimals.
        """
        if not sequence:
            return 0.0

        value = cls._try_biopython(sequence, 'gravy')
        if value is not None:
            return value

        total = sum(cls._KYTE_DOOLITTLE.get(aa, 0) for aa in sequence)
        return round(total / len(sequence), 2)
 # ==================== Model Classes ====================
 # --- add this utility somewhere above UnifiedPeptidePredictor ---
 class PeptideCLMFeaturizer:
     """
     Mean-pool hidden states from PeptideCLM-23M-all for SMILES tokens produced by SMILES_SPE_Tokenizer.
+    Use the SAME tokenizer files, max_length, and pooling you used in training XGB models.
     """
     def __init__(self, vocab_path: str, splits_path: str, device: torch.device, max_length: int = 256):
         self.device = device
             if len(vals) == 0:
                 return None
             threshold_default = float(np.median(vals))
             return {
                 "values": vals,
         # Model registry
         self.models = {}
         self.model_configs = self.get_model_configs()
+        self.sequence_analyzer = SequenceAnalyzer()
         # Data manager
         self.data_manager = TrainingDataManager(data_dir=ASSETS_DATA)
         self._protein_cache = {}
                 'path': 'models/best_model_nonfouling.json',
                 'unit': 'Probability',
                 'display_name': '👯 Non-Fouling',
+                'positive_label': 'Non-toxic',
+                'negative_label': 'Toxic'
             },
             'nonfouling_smiles': {
                 'type': 'xgboost',
                 'path': 'models/nonfouling-xgboost_smiles.json',
                 'unit': 'Probability',
                 'display_name': '👯 Non-Fouling',
+                'positive_label': 'Stable',
+                'negative_label': 'Unstable'
             },
             'binding_affinity': {
                 'type': 'binding',
             'binding_affinity_smiles': {
                 'type': 'binding_smiles',
                 'input': 'sequence+smiles',
+                'path': 'models/binding-affinity_smiles.pt',
                 'unit': 'Probability',
                 'display_name': '🔗 Binding Affinity (SMILES)'
             },
         }
+    def analyze_sequence(self, sequence: str, pH: float = 7.0) -> Dict[str, Any]:
+        """Comprehensive sequence analysis including charge, pI, and aggregation"""
+        results = {}
+        # Basic properties
+        results['length'] = len(sequence)
+        results['molecular_weight'] = self.sequence_analyzer.calculate_molecular_weight(sequence)
+        results['net_charge'] = self.sequence_analyzer.calculate_net_charge(sequence, pH)
+        results['isoelectric_point'] = self.sequence_analyzer.calculate_isoelectric_point(sequence)
+        results['hydrophobicity'] = self.sequence_analyzer.calculate_hydrophobicity(sequence)
+        return results
     def load_all_models(self):
         """Load all available models"""
         for name, config in self.model_configs.items():
     def _features_from_smiles_peptclm(self, s: str) -> np.ndarray:
         return self.smiles_featurizer.embed_list([s])[0]
+    @staticmethod
+    def affinity_to_nM(affinity: float) -> float:
+        """
+        Convert model affinity score (pKd / pKi / pIC50 style: -log10(K [M]))
+        to an approximate concentration in nM.
+        """
+        # K [M] = 10^(-affinity); then convert M -> nM (1e9 factor)
+        return 10.0 ** (-float(affinity)) * 1e9
# ==================== TANGO INTEGRATION ====================
# The TANGO binary is expected in the same folder as this script. __file__ is
# not defined in every execution context, so fall back to the resolved
# current directory in that case.
try:
    HERE = Path(__file__).resolve().parent
except NameError:
    HERE = Path(".").resolve()

TANGO_EXE = str(HERE.joinpath("tango_x86_64_release"))

# Default TANGO command-line parameters, passed as key=value pairs.
# Callers may override any of them per run.
DEFAULT_TANGO_PARAMS = dict(
    nt="N",
    ct="N",
    ph="7.0",
    te="310",  # Kelvin (~37 °C)
    io="0.05",
    tf="0",
    stab="-10",
    conc="0.0001",
)
+def _parse_tango_keyvals(text: str) -> dict:
+    """
+    Parse lines like:
+    'AGG 0 AMYLO 6.41e-13 TURN 7.06 HELIX 0 HELAGG 0 BETA 19.67'
+    into {'AMYLO': [...], 'BETA': [...], ...}
+    """
+    buckets = defaultdict(list)
+    for line in text.splitlines():
+        pairs = re.findall(
+            r'\b(AGG|AMYLO|TURN|HELIX|HELAGG|BETA)\s+([+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)\b',
+            line
+        )
+        for k, v in pairs:
+            try:
+                buckets[k].append(float(v))
+            except ValueError:
+                pass
+    return dict(buckets)
+def _agg(vals, how="sum"):
+    if not vals:
+        return None
+    if how == "sum":
+        return float(sum(vals))
+    if how == "max":
+        return float(max(vals))
+    if how == "mean":
+        return float(sum(vals) / len(vals))
+    return None
def run_tango_for_sequence(
    seq: str,
    pH_value: str | float,
    ident: str = "seq",
    params: dict | None = None,
    exe: str = TANGO_EXE,
) -> dict:
    """
    Run TANGO on a single sequence and return:
      - amyloid aggregation (AMYLO sum/max)
      - β-sheet aggregation (BETA sum/max)
      - total aggregation (AGG sum)
      - the raw combined stdout/stderr text under "raw_output"

    Args:
        seq: Sequence handed to TANGO as ``seq=<seq>``.
        pH_value: pH for the run; always overrides any "ph" entry in ``params``.
        ident: Run identifier passed as TANGO's first argument.
        params: Optional overrides merged on top of DEFAULT_TANGO_PARAMS.
        exe: Path to the TANGO executable.

    Returns:
        Dict of aggregated scores. A score is None when TANGO printed no value
        for it, including when the executable could not be started.
    """
    merged = {**DEFAULT_TANGO_PARAMS, **(params or {})}
    merged["ph"] = str(pH_value)

    # SECURITY: seq comes from user input. Pass an argv list (shell=False) so
    # shell metacharacters in it are never interpreted. The original shell
    # string wrote k="v"; the shell stripped those quotes, so plain k=v here
    # gives TANGO the same arguments for ordinary values.
    cmd = [exe, ident, *(f"{k}={v}" for k, v in merged.items()), f"seq={seq}"]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True)
        stdout, stderr = proc.stdout, proc.stderr
    except OSError as err:
        # Under a shell, a missing or non-executable binary only produced an
        # error message on stderr; keep that non-raising behavior.
        stdout, stderr = "", str(err)

    out = (stdout or "") + (("\n[STDERR]\n" + stderr) if stderr else "")
    buckets = _parse_tango_keyvals(out)
    amylo_vals = buckets.get("AMYLO", [])
    beta_vals = buckets.get("BETA", [])
    agg_vals = buckets.get("AGG", [])
    return {
        "tango_amylo_max": _agg(amylo_vals, "max"),
        "tango_amylo_sum": _agg(amylo_vals, "sum"),
        "tango_beta_max": _agg(beta_vals, "max"),
        "tango_beta_sum": _agg(beta_vals, "sum"),
        "tango_agg_sum": _agg(agg_vals, "sum"),
        "raw_output": out.strip(),
    }
 # ==================== Gradio Interface ====================
     half_life: bool,
     nonfouling: bool,
     binding_affinity: bool,
+    tango_amyloid: bool,
+    tango_beta: bool,
+    include_physicochemical: bool,
+    pH_value: float,
     progress=gr.Progress()
 ):
     """Main prediction function"""
     # Collect selected properties
     selected_properties = []
     checkbox_to_keys = {
         'hemolysis':       ['hemolysis_seq', 'hemolysis_smiles'],
         'solubility':      ['solubility_seq', 'solubility_smiles'],
+        'permeability':    ['permeability_smiles'],
         'half_life':       ['half_life_seq', 'binding_affinity_smiles'],
         'nonfouling':      ['nonfouling_seq', 'nonfouling_smiles'],       # adjust if you have a real cytotox model
     }
                     })
                 except Exception as e:
                     print(f"Error predicting {prop}: {e}")
+            if input_type == "Sequence":
+                if include_physicochemical:
+                    seq_display = seq[:30] + '...' if len(seq) > 30 else seq
+                    progress((seq_idx + 0.3) / len(lines), f"Calculating physicochemical properties...")
+                    analysis = pred.analyze_sequence(seq, pH_value)
+                    results.append({
+                        'Sequence': seq_display,
+                        'Property': '📏 Length',
+                        'Prediction': '',
+                        'Value': str(analysis['length']),
+                        'Unit': 'aa'
+                    })
+                    results.append({
+                        'Sequence': seq_display,
+                        'Property': '⚖️ Molecular Weight',
+                        'Prediction': '',
+                        'Value': f"{analysis['molecular_weight']:.1f}",
+                        'Unit': 'Da'
+                    })
+                    results.append({
+                        'Sequence': seq_display,
+                        'Property': f'⚡ Net Charge (pH {pH_value})',
+                        'Prediction': '',
+                        'Value': f"{analysis['net_charge']:.2f}",
+                        'Unit': ''
+                    })
+                    results.append({
+                        'Sequence': seq_display,
+                        'Property': '🎯 Isoelectric Point',
+                        'Prediction': '',
+                        'Value': f"{analysis['isoelectric_point']:.2f}",
+                        'Unit': 'pH'
+                    })
+                    hydro = analysis['hydrophobicity']
+                    if hydro <= -4.5:
+                        hydro_label = "Hydrophilic"
+                    elif hydro >= 4.5:
+                        hydro_label = "Hydrophobic"
+                    else:
+                        hydro_label = "Intermediate"
+                    results.append({
+                        'Sequence': seq_display,
+                        'Property': '💦 Hydrophobicity (GRAVY)',
+                        'Prediction': hydro_label,
+                        'Value': f"{hydro:.2f}",
+                        'Unit': 'GRAVY (Kyte-Doolittle)',
+                    })
+            if input_type == "Sequence" and (tango_amyloid or tango_beta):
+                try:
+                    # Run once per sequence
+                    tango_res = run_tango_for_sequence(
+                        seq,
+                        pH_value=pH_value,
+                        ident=f"seq{seq_idx+1}",
+                        params=None  # override pH/te here if you want
+                    )
+                    short_seq = seq[:30] + '...' if len(seq) > 30 else seq
+                    if tango_amyloid and tango_res["tango_amylo_sum"] is not None:
+                        results.append({
+                            'Sequence': short_seq,
+                            'Property': "🧱 TANGO Amyloid Aggregation",
+                            'Prediction': "",
+                            'Value': f"{tango_res['tango_amylo_sum']:.3f}",
+                            'Unit': "TANGO (sum)"
+                        })
+                    if tango_beta and tango_res["tango_beta_sum"] is not None:
+                        results.append({
+                            'Sequence': short_seq,
+                            'Property': "🧬 TANGO β-sheet Aggregation",
+                            'Prediction': "",
+                            'Value': f"{tango_res['tango_beta_sum']:.3f}",
+                            'Unit': "TANGO (sum)"
+                        })
+                except Exception as e:
+                    print(f"Error running TANGO for sequence {seq_idx+1}: {e}")
     # Handle binding affinity separately
     if binding_affinity and input_text:
         # Sequence–Sequence binding
                             protein_seq,
                             binder_seq
                         )
+                        kd_nM = pred.affinity_to_nM(affinity)
+                        seq_label = f"Protein–{binder_seq[:20]}..."
+                        prop_base = pred.model_configs['binding_affinity']['display_name']
+                        # Row 1: affinity score (pKd-like)
                         results.append({
+                            'Sequence': seq_label,
+                            'Property': f"{prop_base} (score)",
+                            'Prediction': binding_class,
                             'Value': f"{affinity:.3f}",
+                            'Unit': "Affinity score (pKd-like)",
+                        })
+                        # Row 2: converted Kd in nM
+                        results.append({
+                            'Sequence': seq_label,
+                            'Property': f"{prop_base} (Kd est.)",
+                            'Prediction': binding_class,
+                            'Value': f"{kd_nM:.3g}",
+                            'Unit': "nM (Kd/Ki/IC50)",
                         })
                 except Exception as e:
                     print(f"Error in sequence binding prediction: {e}")
                         protein_seq,
                         smi
                     )
+                    kd_nM = pred.affinity_to_nM(affinity)
+                    seq_label = f"Protein–{smi[:20]}..."
+                    prop_base = pred.model_configs['binding_affinity_smiles']['display_name']
+                    # Row 1: affinity score (pKd-like)
                     results.append({
+                        'Sequence': seq_label,
+                        'Property': f"{prop_base} (score)",
+                        'Prediction': label,   # Tight / Medium / Weak
                         'Value': f"{affinity:.3f}",
+                        'Unit': "Affinity score (pKd-like)",
+                    })
+                    # Row 2: converted Kd in nM
+                    results.append({
+                        'Sequence': seq_label,
+                        'Property': f"{prop_base} (Kd est.)",
+                        'Prediction': label,
+                        'Value': f"{kd_nM:.3g}",
+                        'Unit': "nM (Kd/Ki/IC50)",
                     })
             except Exception as e:
                 print(f"Error in SMILES binding prediction: {e}")
     return "", ""
 def on_example_change(name: str):
+    binder, protein = load_example(name)
     show_protein = (name == "Protein-Peptide")
     return (
         gr.update(value=binder),                         # input_text
     text-align: center;
     margin-bottom: 10px !important;
 }
+h3 {
     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
     -webkit-background-clip: text;
     -webkit-text-fill-color: transparent;
     gr.Markdown(
         """
         # 🌐 PeptiVerse
+        ### \t  Peptide Property Predictions
         """
     )
                 with gr.Column(scale=1):
                     with gr.Group():
                         gr.Markdown("### ⚙️ Select Properties")
+                        with gr.Accordion("Physicochemical Properties", open=True):
+                            include_physicochemical = gr.Checkbox(
+                                label="🧪 Calculate Basic Properties",
+                                value=True,
+                                info="MW, net charge, pI, hydrophobicity"
+                            )
+                            pH_value = gr.Slider(
+                                minimum=0,
+                                maximum=14,
+                                value=7.0,
+                                step=0.1,
+                                label="pH for Net Charge",
+                                info="Physiological pH is ~7.4"
+                            )
                         with gr.Accordion("Sequence Properties", open=True):
                             hemolysis = gr.Checkbox(label="🩸 Hemolysis ↓", value=True)
                             solubility = gr.Checkbox(label="💧 Solubility ↑", value=True)
                             permeability = gr.Checkbox(label="🪣 Permeability ↑", value=False)
                             half_life = gr.Checkbox(label="⏱️ Half-life ↑", value=False)
+                            nonfouling = gr.Checkbox(label="👯 Non-Fouling ↑", value=False)
+                            tango_amyloid = gr.Checkbox(label="🧱 TANGO Amyloid Aggregation ↓", value=False)
+                            tango_beta = gr.Checkbox(label="🧬 TANGO β-sheet Aggregation ↓", value=False)
                         with gr.Accordion("Binding Prediction", open=False):
                             binding_affinity = gr.Checkbox(label="🔗 Binding Affinity ↑", value=False)
                             gr.Markdown("*Requires protein sequence input*")
                 with gr.Column(scale=1):
                     property_selector = gr.Dropdown(
                         choices=["hemolysis", "solubility", "permeability", "half_life (smiles)",
+                                "nonfouling", "binding_affinity", "tango_amyloid", "tango_beta"],
                         label="Select Property",
                         value="hemolysis"
                     )
             input_text, input_type, protein_seq,
             hemolysis, solubility, permeability,
             half_life, nonfouling,
+            binding_affinity, tango_amyloid, tango_beta, include_physicochemical, pH_value,
         ],
         outputs=[results_df, status_output]
     )

description.md CHANGED Viewed

@@ -10,6 +10,7 @@ Our models are trained on curated datasets from multiple sources:
 - **Size:** 9,316 peptides, with 19.6% being positive (hemolytic) and 80.4% being negative (nonhemolytic)
 - **Description:** Probability of peptide disrupting red blood cell membranes.
 - **Download:** [hemo-positive.npz](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/hemo-positive.npz)
 #### Solubility Dataset
 - **Primary Source:** [PROSO-II](https://febs.onlinelibrary.wiley.com/doi/abs/10.1111/j.1742-4658.2012.08603.x)
@@ -45,6 +46,7 @@ Our models are trained on curated datasets from multiple sources:
 - **Description:** Binding probability normalized in PepLand already. It's a combination of IC50/EC50.
 - **Quality:** Binding class cutoffs: Tight ≥ 7.5, Medium 6.0–7.5, Weak < 6.0
 - **Download:** [binding_affinity_training_data.csv](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/c-binding.csv)
 ### Model Architecture
@@ -56,6 +58,22 @@ Our models are trained on curated datasets from multiple sources:
 ### Model Training and Weight Hosting
 - [Classifier_weights](https://huggingface.co/ChatterjeeLab/Classifier_Weight)
 ### Citation
 If you use this tool, please cite:

 - **Size:** 9,316 peptides, with 19.6% being positive (hemolytic) and 80.4% being negative (nonhemolytic)
 - **Description:** Probability of peptide disrupting red blood cell membranes.
 - **Download:** [hemo-positive.npz](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/hemo-positive.npz)
+- **Interpretation:** HC50 is the concentration (x µg/mL) at which 50% of red blood cells are lysed. A peptide with HC50 < 100 µM is considered hemolytic; otherwise it is considered non-hemolytic.
 #### Solubility Dataset
 - **Primary Source:** [PROSO-II](https://febs.onlinelibrary.wiley.com/doi/abs/10.1111/j.1742-4658.2012.08603.x)
 - **Description:** Binding probability normalized in PepLand already. It's a combination of IC50/EC50.
 - **Quality:** Binding class cutoffs: Tight ≥ 7.5, Medium 6.0–7.5, Weak < 6.0
 - **Download:** [binding_affinity_training_data.csv](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/c-binding.csv)
+- **Interpretation:** Affinity measure = -log10(K), where K is the binding constant (Kd/Ki/IC50) expressed in molar units (M).
 ### Model Architecture
 ### Model Training and Weight Hosting
 - [Classifier_weights](https://huggingface.co/ChatterjeeLab/Classifier_Weight)
+### 🧪 Physicochemical Properties
+#### Net Charge Calculation
+- Uses Henderson-Hasselbalch equation
+- pH-dependent calculation
+- Considers all ionizable groups (K, R, H, D, E, C, Y, termini)
+#### Isoelectric Point (pI)
+- Bisection method to find pH where net charge = 0
+- Precision: ±0.01 pH units
+#### Hydrophobicity (GRAVY)
+- Grand Average of Hydropathy
+- Uses Kyte-Doolittle scale
+- Range: -4.5 (hydrophilic) to +4.5 (hydrophobic)
 ### Citation
 If you use this tool, please cite:

tango_x86_64_release ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d0e381c28f847487069b0df29bb9d4f766391066710500d3170ecb73d9f31dbf
+size 211205

tokenizer/__pycache__/__init__.cpython-310.pyc CHANGED Viewed

Binary files a/tokenizer/__pycache__/__init__.cpython-310.pyc and b/tokenizer/__pycache__/__init__.cpython-310.pyc differ

tokenizer/__pycache__/my_tokenizers.cpython-310.pyc CHANGED Viewed

Binary files a/tokenizer/__pycache__/my_tokenizers.cpython-310.pyc and b/tokenizer/__pycache__/my_tokenizers.cpython-310.pyc differ