Spaces:
Running
Running
Commit
Β·
3aedb16
1
Parent(s):
9e9ca0b
add tango
Browse files- app.py +407 -30
- description.md +18 -0
- tango_x86_64_release +3 -0
- tokenizer/__pycache__/__init__.cpython-310.pyc +0 -0
- tokenizer/__pycache__/my_tokenizers.cpython-310.pyc +0 -0
app.py
CHANGED
|
@@ -11,10 +11,17 @@ from pathlib import Path
|
|
| 11 |
import json
|
| 12 |
import time
|
| 13 |
from typing import List, Dict, Any, Tuple, Optional
|
| 14 |
-
|
|
|
|
| 15 |
from huggingface_hub import snapshot_download
|
| 16 |
from pathlib import Path
|
| 17 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
def pick_assets_root() -> Path:
|
| 20 |
# HF Spaces container uses /home/user; detect via SPACE_ID or existence
|
|
@@ -58,8 +65,8 @@ for k, v in {
|
|
| 58 |
ASSETS_MODELS = ASSETS / "models"; ASSETS_MODELS.mkdir(parents=True, exist_ok=True)
|
| 59 |
ASSETS_DATA = ASSETS / "training_data"; ASSETS_DATA.mkdir(parents=True, exist_ok=True)
|
| 60 |
|
| 61 |
-
MODEL_REPO = "ChatterjeeLab/Classifier_Weight"
|
| 62 |
-
DATASET_REPO = "ChatterjeeLab/Classifier_Weight"
|
| 63 |
|
| 64 |
def fetch_models_and_data():
|
| 65 |
snapshot_download(
|
|
@@ -106,6 +113,135 @@ def is_smiles_like(s: str) -> bool:
|
|
| 106 |
maybe_smiles_chars = set("=#()[]+\\/-@1234567890")
|
| 107 |
return (any(ch in maybe_smiles_chars for ch in s) or not is_aa_sequence_like(s)) and len(s) >= 2
|
| 108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
# ==================== Model Classes ====================
|
| 110 |
|
| 111 |
# --- add this utility somewhere above UnifiedPeptidePredictor ---
|
|
@@ -157,7 +293,7 @@ from transformers import AutoModelForMaskedLM
|
|
| 157 |
class PeptideCLMFeaturizer:
|
| 158 |
"""
|
| 159 |
Mean-pool hidden states from PeptideCLM-23M-all for SMILES tokens produced by SMILES_SPE_Tokenizer.
|
| 160 |
-
Use the SAME tokenizer files, max_length, and pooling you used in training
|
| 161 |
"""
|
| 162 |
def __init__(self, vocab_path: str, splits_path: str, device: torch.device, max_length: int = 256):
|
| 163 |
self.device = device
|
|
@@ -535,7 +671,6 @@ class TrainingDataManager:
|
|
| 535 |
if len(vals) == 0:
|
| 536 |
return None
|
| 537 |
|
| 538 |
-
# Use the conventional log Peff unit; keep your prior display threshold (-4.0) or set median
|
| 539 |
threshold_default = float(np.median(vals))
|
| 540 |
return {
|
| 541 |
"values": vals,
|
|
@@ -764,7 +899,7 @@ class UnifiedPeptidePredictor:
|
|
| 764 |
# Model registry
|
| 765 |
self.models = {}
|
| 766 |
self.model_configs = self.get_model_configs()
|
| 767 |
-
|
| 768 |
# Data manager
|
| 769 |
self.data_manager = TrainingDataManager(data_dir=ASSETS_DATA)
|
| 770 |
self._protein_cache = {}
|
|
@@ -837,8 +972,8 @@ class UnifiedPeptidePredictor:
|
|
| 837 |
'path': 'models/best_model_nonfouling.json',
|
| 838 |
'unit': 'Probability',
|
| 839 |
'display_name': 'π― Non-Fouling',
|
| 840 |
-
'positive_label': 'Non-
|
| 841 |
-
'negative_label': '
|
| 842 |
},
|
| 843 |
'nonfouling_smiles': {
|
| 844 |
'type': 'xgboost',
|
|
@@ -846,8 +981,8 @@ class UnifiedPeptidePredictor:
|
|
| 846 |
'path': 'models/nonfouling-xgboost_smiles.json',
|
| 847 |
'unit': 'Probability',
|
| 848 |
'display_name': 'π― Non-Fouling',
|
| 849 |
-
'positive_label': '
|
| 850 |
-
'negative_label': '
|
| 851 |
},
|
| 852 |
'binding_affinity': {
|
| 853 |
'type': 'binding',
|
|
@@ -859,12 +994,22 @@ class UnifiedPeptidePredictor:
|
|
| 859 |
'binding_affinity_smiles': {
|
| 860 |
'type': 'binding_smiles',
|
| 861 |
'input': 'sequence+smiles',
|
| 862 |
-
'path': 'models/
|
| 863 |
'unit': 'Probability',
|
| 864 |
'display_name': 'π Binding Affinity (SMILES)'
|
| 865 |
},
|
| 866 |
}
|
| 867 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 868 |
def load_all_models(self):
|
| 869 |
"""Load all available models"""
|
| 870 |
for name, config in self.model_configs.items():
|
|
@@ -1076,7 +1221,109 @@ class UnifiedPeptidePredictor:
|
|
| 1076 |
|
| 1077 |
def _features_from_smiles_peptclm(self, s: str) -> np.ndarray:
|
| 1078 |
return self.smiles_featurizer.embed_list([s])[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1079 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1080 |
|
| 1081 |
# ==================== Gradio Interface ====================
|
| 1082 |
|
|
@@ -1102,6 +1349,10 @@ def predict_properties(
|
|
| 1102 |
half_life: bool,
|
| 1103 |
nonfouling: bool,
|
| 1104 |
binding_affinity: bool,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1105 |
progress=gr.Progress()
|
| 1106 |
):
|
| 1107 |
"""Main prediction function"""
|
|
@@ -1127,11 +1378,10 @@ def predict_properties(
|
|
| 1127 |
# Collect selected properties
|
| 1128 |
selected_properties = []
|
| 1129 |
|
| 1130 |
-
# Map UI checkboxes to your internal model keys
|
| 1131 |
checkbox_to_keys = {
|
| 1132 |
'hemolysis': ['hemolysis_seq', 'hemolysis_smiles'],
|
| 1133 |
'solubility': ['solubility_seq', 'solubility_smiles'],
|
| 1134 |
-
'permeability': ['permeability_smiles'],
|
| 1135 |
'half_life': ['half_life_seq', 'binding_affinity_smiles'],
|
| 1136 |
'nonfouling': ['nonfouling_seq', 'nonfouling_smiles'], # adjust if you have a real cytotox model
|
| 1137 |
}
|
|
@@ -1192,7 +1442,88 @@ def predict_properties(
|
|
| 1192 |
})
|
| 1193 |
except Exception as e:
|
| 1194 |
print(f"Error predicting {prop}: {e}")
|
| 1195 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1196 |
# Handle binding affinity separately
|
| 1197 |
if binding_affinity and input_text:
|
| 1198 |
# SequenceβSequence binding
|
|
@@ -1209,12 +1540,27 @@ def predict_properties(
|
|
| 1209 |
protein_seq,
|
| 1210 |
binder_seq
|
| 1211 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1212 |
results.append({
|
| 1213 |
-
'Sequence':
|
| 1214 |
-
'Property':
|
| 1215 |
-
'Prediction': binding_class,
|
| 1216 |
'Value': f"{affinity:.3f}",
|
| 1217 |
-
'Unit':
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1218 |
})
|
| 1219 |
except Exception as e:
|
| 1220 |
print(f"Error in sequence binding prediction: {e}")
|
|
@@ -1237,12 +1583,27 @@ def predict_properties(
|
|
| 1237 |
protein_seq,
|
| 1238 |
smi
|
| 1239 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1240 |
results.append({
|
| 1241 |
-
'Sequence':
|
| 1242 |
-
'Property':
|
| 1243 |
-
'Prediction': label,
|
| 1244 |
'Value': f"{affinity:.3f}",
|
| 1245 |
-
'Unit':
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1246 |
})
|
| 1247 |
except Exception as e:
|
| 1248 |
print(f"Error in SMILES binding prediction: {e}")
|
|
@@ -1336,7 +1697,7 @@ def load_example(example_name):
|
|
| 1336 |
return "", ""
|
| 1337 |
|
| 1338 |
def on_example_change(name: str):
|
| 1339 |
-
binder, protein = load_example(name)
|
| 1340 |
show_protein = (name == "Protein-Peptide")
|
| 1341 |
return (
|
| 1342 |
gr.update(value=binder), # input_text
|
|
@@ -1376,7 +1737,7 @@ h1 {
|
|
| 1376 |
text-align: center;
|
| 1377 |
margin-bottom: 10px !important;
|
| 1378 |
}
|
| 1379 |
-
|
| 1380 |
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 1381 |
-webkit-background-clip: text;
|
| 1382 |
-webkit-text-fill-color: transparent;
|
|
@@ -1403,7 +1764,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
|
|
| 1403 |
gr.Markdown(
|
| 1404 |
"""
|
| 1405 |
# π PeptiVerse
|
| 1406 |
-
|
| 1407 |
"""
|
| 1408 |
)
|
| 1409 |
|
|
@@ -1452,13 +1813,29 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
|
|
| 1452 |
with gr.Column(scale=1):
|
| 1453 |
with gr.Group():
|
| 1454 |
gr.Markdown("### βοΈ Select Properties")
|
| 1455 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1456 |
with gr.Accordion("Sequence Properties", open=True):
|
| 1457 |
hemolysis = gr.Checkbox(label="π©Έ Hemolysis β", value=True)
|
| 1458 |
solubility = gr.Checkbox(label="π§ Solubility β", value=True)
|
| 1459 |
permeability = gr.Checkbox(label="πͺ£ Permeability β", value=False)
|
| 1460 |
half_life = gr.Checkbox(label="β±οΈ Half-life β", value=False)
|
| 1461 |
-
nonfouling = gr.Checkbox(label="π― Non-Fouling β", value=False)
|
|
|
|
|
|
|
| 1462 |
with gr.Accordion("Binding Prediction", open=False):
|
| 1463 |
binding_affinity = gr.Checkbox(label="π Binding Affinity β", value=False)
|
| 1464 |
gr.Markdown("*Requires protein sequence input*")
|
|
@@ -1468,7 +1845,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
|
|
| 1468 |
with gr.Column(scale=1):
|
| 1469 |
property_selector = gr.Dropdown(
|
| 1470 |
choices=["hemolysis", "solubility", "permeability", "half_life (smiles)",
|
| 1471 |
-
"nonfouling", "binding_affinity"],
|
| 1472 |
label="Select Property",
|
| 1473 |
value="hemolysis"
|
| 1474 |
)
|
|
@@ -1550,7 +1927,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
|
|
| 1550 |
input_text, input_type, protein_seq,
|
| 1551 |
hemolysis, solubility, permeability,
|
| 1552 |
half_life, nonfouling,
|
| 1553 |
-
binding_affinity
|
| 1554 |
],
|
| 1555 |
outputs=[results_df, status_output]
|
| 1556 |
)
|
|
|
|
| 11 |
import json
|
| 12 |
import time
|
| 13 |
from typing import List, Dict, Any, Tuple, Optional
|
| 14 |
+
import subprocess
|
| 15 |
+
from collections import defaultdict
|
| 16 |
from huggingface_hub import snapshot_download
|
| 17 |
from pathlib import Path
|
| 18 |
import os
|
| 19 |
+
try:
|
| 20 |
+
from Bio.SeqUtils.ProtParam import ProteinAnalysis
|
| 21 |
+
BIOPYTHON_AVAILABLE = True
|
| 22 |
+
except ImportError:
|
| 23 |
+
BIOPYTHON_AVAILABLE = False
|
| 24 |
+
print("BioPython not available. Using fallback for pI/charge calculations.")
|
| 25 |
|
| 26 |
def pick_assets_root() -> Path:
|
| 27 |
# HF Spaces container uses /home/user; detect via SPACE_ID or existence
|
|
|
|
| 65 |
ASSETS_MODELS = ASSETS / "models"; ASSETS_MODELS.mkdir(parents=True, exist_ok=True)
|
| 66 |
ASSETS_DATA = ASSETS / "training_data"; ASSETS_DATA.mkdir(parents=True, exist_ok=True)
|
| 67 |
|
| 68 |
+
MODEL_REPO = "ChatterjeeLab/Classifier_Weight" # model repo
|
| 69 |
+
DATASET_REPO = "ChatterjeeLab/Classifier_Weight" # dataset repo (create this)
|
| 70 |
|
| 71 |
def fetch_models_and_data():
|
| 72 |
snapshot_download(
|
|
|
|
| 113 |
maybe_smiles_chars = set("=#()[]+\\/-@1234567890")
|
| 114 |
return (any(ch in maybe_smiles_chars for ch in s) or not is_aa_sequence_like(s)) and len(s) >= 2
|
| 115 |
|
| 116 |
+
# ==================== Sequence Analysis ====================
|
| 117 |
+
|
| 118 |
+
class SequenceAnalyzer:
    """Calculate physicochemical properties of peptide sequences.

    Each public ``calculate_*`` method first tries BioPython's
    ``ProteinAnalysis`` when the module-level ``BIOPYTHON_AVAILABLE`` flag is
    true. If BioPython is unavailable or raises, the matching ``_fallback_*``
    approximation defined on this class is used instead. Sequences are
    upper-cased before analysis so lower-case one-letter codes are scored the
    same way as upper-case ones.
    """

    # pKa values for amino acids (used by the fallback charge model)
    PKA_VALUES = {
        'N_term': 9.6,
        'C_term': 2.3,
        'D': 3.9,   # Aspartic acid
        'E': 4.2,   # Glutamic acid
        'H': 6.0,   # Histidine
        'C': 8.3,   # Cysteine
        'Y': 10.1,  # Tyrosine
        'K': 10.5,  # Lysine
        'R': 12.5,  # Arginine
    }

    # Ionisable side chains grouped by charge sign. Tuples (not sets) keep the
    # floating-point summation order fixed from run to run.
    _BASIC_RESIDUES = ('K', 'R', 'H')        # +1 when protonated
    _ACIDIC_RESIDUES = ('D', 'E', 'C', 'Y')  # -1 when deprotonated

    # Free amino-acid masses (Da) used by the fallback molecular weight.
    _AA_WEIGHTS = {
        'A': 89.1, 'C': 121.2, 'D': 133.1, 'E': 147.1, 'F': 165.2,
        'G': 75.1, 'H': 155.2, 'I': 131.2, 'K': 146.2, 'L': 131.2,
        'M': 149.2, 'N': 132.1, 'P': 115.1, 'Q': 146.2, 'R': 174.2,
        'S': 105.1, 'T': 119.1, 'V': 117.1, 'W': 204.2, 'Y': 181.2,
    }
    _WATER_WEIGHT = 18.0

    # Kyte-Doolittle scale
    _KYTE_DOOLITTLE = {
        'A': 1.8, 'C': 2.5, 'D': -3.5, 'E': -3.5, 'F': 2.8,
        'G': -0.4, 'H': -3.2, 'I': 4.5, 'K': -3.9, 'L': 3.8,
        'M': 1.9, 'N': -3.5, 'P': -1.6, 'Q': -3.5, 'R': -4.5,
        'S': -0.8, 'T': -0.7, 'V': 4.2, 'W': -0.9, 'Y': -1.3,
    }

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _via_biopython(sequence: str, method_name: str, *args):
        """Return ``ProteinAnalysis(sequence).<method_name>(*args)`` or None.

        None means "use the fallback": BioPython is not installed, or it
        raised for this sequence. The fallback is a deliberate best-effort,
        so any ordinary exception is absorbed here; ``except Exception``
        (rather than a bare ``except``) lets KeyboardInterrupt and SystemExit
        propagate.
        """
        if not BIOPYTHON_AVAILABLE:
            return None
        try:
            return getattr(ProteinAnalysis(sequence), method_name)(*args)
        except Exception:
            return None

    @classmethod
    def _raw_fallback_charge(cls, sequence: str, pH: float) -> float:
        """Unrounded Henderson-Hasselbalch net charge of ``sequence`` at ``pH``."""
        sequence = sequence.upper()
        pka = cls.PKA_VALUES

        # One free N-terminus (positive) and one free C-terminus (negative).
        charge = 1 / (1 + 10 ** (pH - pka['N_term']))
        charge -= 1 / (1 + 10 ** (pka['C_term'] - pH))

        for aa in cls._BASIC_RESIDUES:
            count = sequence.count(aa)
            if count:
                charge += count / (1 + 10 ** (pH - pka[aa]))
        for aa in cls._ACIDIC_RESIDUES:
            count = sequence.count(aa)
            if count:
                charge -= count / (1 + 10 ** (pka[aa] - pH))
        return charge

    # ------------------------------------------------------------------
    # Fallback implementations (no BioPython required)
    # ------------------------------------------------------------------
    @classmethod
    def _fallback_net_charge(cls, sequence: str, pH: float = 7.0) -> float:
        """Fallback net charge, rounded to two decimals."""
        return round(cls._raw_fallback_charge(sequence, pH), 2)

    @classmethod
    def _fallback_isoelectric_point(cls, sequence: str) -> float:
        """Fallback pI: bisect the fallback charge curve over pH 0-14.

        The bisection runs on the *unrounded* charge until the pH bracket is
        narrower than 0.01. It does not stop early when the charge merely
        gets close to zero: peptides have a wide pH plateau where the charge
        is almost zero, and stopping there reports an arbitrary point on the
        plateau rather than the true zero crossing.
        """
        pH_min, pH_max = 0.0, 14.0
        epsilon = 0.01

        while (pH_max - pH_min) > epsilon:
            pH_mid = (pH_min + pH_max) / 2
            # Net charge falls monotonically as pH rises.
            if cls._raw_fallback_charge(sequence, pH_mid) > 0:
                pH_min = pH_mid
            else:
                pH_max = pH_mid

        return round((pH_min + pH_max) / 2, 2)

    @classmethod
    def _fallback_molecular_weight(cls, sequence: str) -> float:
        """Fallback molecular weight (Da), rounded to one decimal.

        Sums free amino-acid masses and subtracts one water per peptide bond.
        Residues missing from the table contribute a mass of 0.
        """
        sequence = sequence.upper()
        mw = sum(cls._AA_WEIGHTS.get(aa, 0) for aa in sequence)
        # Subtract water for peptide bonds
        mw -= cls._WATER_WEIGHT * (len(sequence) - 1)
        return round(mw, 1)

    @classmethod
    def _fallback_hydrophobicity(cls, sequence: str) -> float:
        """Fallback GRAVY: mean Kyte-Doolittle score, rounded to two decimals.

        Returns 0 for an empty sequence; residues missing from the scale
        score 0 but still count towards the length.
        """
        sequence = sequence.upper()
        if not sequence:
            return 0
        total = sum(cls._KYTE_DOOLITTLE.get(aa, 0) for aa in sequence)
        return round(total / len(sequence), 2)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    @classmethod
    def calculate_net_charge(cls, sequence: str, pH: float = 7.0) -> float:
        """Calculate net charge at given pH using Henderson-Hasselbalch equation"""
        sequence = sequence.upper()
        value = cls._via_biopython(sequence, "charge_at_pH", pH)
        if value is not None:
            return value
        return cls._fallback_net_charge(sequence, pH)

    @classmethod
    def calculate_isoelectric_point(cls, sequence: str) -> float:
        """Calculate theoretical pI (BioPython, else bisection on the fallback charge)"""
        sequence = sequence.upper()
        value = cls._via_biopython(sequence, "isoelectric_point")
        if value is not None:
            return value
        return cls._fallback_isoelectric_point(sequence)

    @classmethod
    def calculate_molecular_weight(cls, sequence: str) -> float:
        """Calculate molecular weight"""
        sequence = sequence.upper()
        value = cls._via_biopython(sequence, "molecular_weight")
        if value is not None:
            return value
        return cls._fallback_molecular_weight(sequence)

    @classmethod
    def calculate_hydrophobicity(cls, sequence: str) -> float:
        """Calculate GRAVY (grand average of hydropathy)"""
        sequence = sequence.upper()
        value = cls._via_biopython(sequence, "gravy")
        if value is not None:
            return value
        return cls._fallback_hydrophobicity(sequence)
|
| 244 |
+
|
| 245 |
# ==================== Model Classes ====================
|
| 246 |
|
| 247 |
# --- add this utility somewhere above UnifiedPeptidePredictor ---
|
|
|
|
| 293 |
class PeptideCLMFeaturizer:
|
| 294 |
"""
|
| 295 |
Mean-pool hidden states from PeptideCLM-23M-all for SMILES tokens produced by SMILES_SPE_Tokenizer.
|
| 296 |
+
Use the SAME tokenizer files, max_length, and pooling you used in training XGB models.
|
| 297 |
"""
|
| 298 |
def __init__(self, vocab_path: str, splits_path: str, device: torch.device, max_length: int = 256):
|
| 299 |
self.device = device
|
|
|
|
| 671 |
if len(vals) == 0:
|
| 672 |
return None
|
| 673 |
|
|
|
|
| 674 |
threshold_default = float(np.median(vals))
|
| 675 |
return {
|
| 676 |
"values": vals,
|
|
|
|
| 899 |
# Model registry
|
| 900 |
self.models = {}
|
| 901 |
self.model_configs = self.get_model_configs()
|
| 902 |
+
self.sequence_analyzer = SequenceAnalyzer()
|
| 903 |
# Data manager
|
| 904 |
self.data_manager = TrainingDataManager(data_dir=ASSETS_DATA)
|
| 905 |
self._protein_cache = {}
|
|
|
|
| 972 |
'path': 'models/best_model_nonfouling.json',
|
| 973 |
'unit': 'Probability',
|
| 974 |
'display_name': 'π― Non-Fouling',
|
| 975 |
+
'positive_label': 'Non-toxic',
|
| 976 |
+
'negative_label': 'Toxic'
|
| 977 |
},
|
| 978 |
'nonfouling_smiles': {
|
| 979 |
'type': 'xgboost',
|
|
|
|
| 981 |
'path': 'models/nonfouling-xgboost_smiles.json',
|
| 982 |
'unit': 'Probability',
|
| 983 |
'display_name': 'π― Non-Fouling',
|
| 984 |
+
'positive_label': 'Stable',
|
| 985 |
+
'negative_label': 'Unstable'
|
| 986 |
},
|
| 987 |
'binding_affinity': {
|
| 988 |
'type': 'binding',
|
|
|
|
| 994 |
'binding_affinity_smiles': {
|
| 995 |
'type': 'binding_smiles',
|
| 996 |
'input': 'sequence+smiles',
|
| 997 |
+
'path': 'models/binding-affinity_smiles.pt',
|
| 998 |
'unit': 'Probability',
|
| 999 |
'display_name': 'π Binding Affinity (SMILES)'
|
| 1000 |
},
|
| 1001 |
}
|
| 1002 |
+
def analyze_sequence(self, sequence: str, pH: float = 7.0) -> Dict[str, Any]:
    """Collect basic physicochemical descriptors for one sequence.

    Delegates to ``self.sequence_analyzer`` and returns a dict with the keys
    ``length``, ``molecular_weight``, ``net_charge`` (evaluated at ``pH``),
    ``isoelectric_point`` and ``hydrophobicity``, in that order.
    """
    analyzer = self.sequence_analyzer
    return {
        'length': len(sequence),
        'molecular_weight': analyzer.calculate_molecular_weight(sequence),
        'net_charge': analyzer.calculate_net_charge(sequence, pH),
        'isoelectric_point': analyzer.calculate_isoelectric_point(sequence),
        'hydrophobicity': analyzer.calculate_hydrophobicity(sequence),
    }
|
| 1013 |
def load_all_models(self):
|
| 1014 |
"""Load all available models"""
|
| 1015 |
for name, config in self.model_configs.items():
|
|
|
|
| 1221 |
|
| 1222 |
def _features_from_smiles_peptclm(self, s: str) -> np.ndarray:
|
| 1223 |
return self.smiles_featurizer.embed_list([s])[0]
|
| 1224 |
+
|
| 1225 |
+
@staticmethod
def affinity_to_nM(affinity: float) -> float:
    """Turn a -log10(K [M]) style affinity score into a concentration in nM.

    The score is treated as pKd / pKi / pIC50, i.e. K in molar equals
    10 ** (-affinity). The result is therefore only an approximate
    concentration.
    """
    molar = 10.0 ** (-float(affinity))
    # 1 M == 1e9 nM
    return molar * 1e9
|
| 1233 |
+
|
| 1234 |
|
| 1235 |
+
# ==================== TANGO INTEGRATION ====================
|
| 1236 |
+
|
| 1237 |
+
# The TANGO binary is expected to sit next to this script. When __file__ is
# not defined, fall back to the current working directory.
try:
    HERE = Path(__file__).resolve().parent
except NameError:
    HERE = Path(".").resolve()

TANGO_EXE = str(HERE.joinpath("tango_x86_64_release"))

# Default TANGO parameters (adjust if you like). run_tango_for_sequence passes
# each entry on the TANGO command line as key=value and overrides "ph" with
# the value supplied by its caller.
DEFAULT_TANGO_PARAMS = dict(
    nt="N",
    ct="N",
    ph="7.0",
    te="310",  # Kelvin (~37 °C)
    io="0.05",
    tf="0",
    stab="-10",
    conc="0.0001",
)
|
| 1256 |
+
|
| 1257 |
+
def _parse_tango_keyvals(text: str) -> dict:
|
| 1258 |
+
"""
|
| 1259 |
+
Parse lines like:
|
| 1260 |
+
'AGG 0 AMYLO 6.41e-13 TURN 7.06 HELIX 0 HELAGG 0 BETA 19.67'
|
| 1261 |
+
into {'AMYLO': [...], 'BETA': [...], ...}
|
| 1262 |
+
"""
|
| 1263 |
+
buckets = defaultdict(list)
|
| 1264 |
+
for line in text.splitlines():
|
| 1265 |
+
pairs = re.findall(
|
| 1266 |
+
r'\b(AGG|AMYLO|TURN|HELIX|HELAGG|BETA)\s+([+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)\b',
|
| 1267 |
+
line
|
| 1268 |
+
)
|
| 1269 |
+
for k, v in pairs:
|
| 1270 |
+
try:
|
| 1271 |
+
buckets[k].append(float(v))
|
| 1272 |
+
except ValueError:
|
| 1273 |
+
pass
|
| 1274 |
+
return dict(buckets)
|
| 1275 |
+
|
| 1276 |
+
def _agg(vals, how="sum"):
|
| 1277 |
+
if not vals:
|
| 1278 |
+
return None
|
| 1279 |
+
if how == "sum":
|
| 1280 |
+
return float(sum(vals))
|
| 1281 |
+
if how == "max":
|
| 1282 |
+
return float(max(vals))
|
| 1283 |
+
if how == "mean":
|
| 1284 |
+
return float(sum(vals) / len(vals))
|
| 1285 |
+
return None
|
| 1286 |
+
|
| 1287 |
+
def run_tango_for_sequence(
    seq: str,
    pH_value: str | float,
    ident: str = "seq",
    params: dict | None = None,
    exe: str = TANGO_EXE,
) -> dict:
    """Run TANGO on a single sequence and summarise its output.

    Args:
        seq: Peptide sequence passed to TANGO as ``seq=...``.
        pH_value: pH to run at; always overrides any ``"ph"`` entry in
            ``params`` or in the defaults.
        ident: Identifier passed as TANGO's first argument.
        params: Optional overrides merged on top of ``DEFAULT_TANGO_PARAMS``.
        exe: Path to the TANGO executable.

    Returns:
        A dict with:
          - ``tango_amylo_max`` / ``tango_amylo_sum``: max / sum of AMYLO values
          - ``tango_beta_max`` / ``tango_beta_sum``: max / sum of BETA values
          - ``tango_agg_sum``: sum of AGG values
          - ``raw_output``: stripped stdout, followed by stderr when present
        Each aggregate is ``None`` when the key never appears in the output,
        which includes the case where the executable could not be started.
    """
    merged = {**DEFAULT_TANGO_PARAMS, **(params or {})}
    merged["ph"] = pH_value

    # Build an argv list and run it WITHOUT a shell. `seq` comes from user
    # input, so interpolating it into a shell string would allow command
    # injection. The literal double quotes of the old shell form
    # (key="value") are dropped: the shell used to strip them, so TANGO
    # receives the same key=value arguments as before.
    cmd = [exe, ident]
    cmd.extend(f"{key}={value}" for key, value in merged.items())
    cmd.append(f"seq={seq}")

    try:
        proc = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as err:
        # Executable missing or not runnable. Under shell=True this surfaced
        # as shell stderr rather than an exception, so keep returning a dict.
        out = f"\n[STDERR]\n{err}"
    else:
        out = (proc.stdout or "") + (("\n[STDERR]\n" + proc.stderr) if proc.stderr else "")

    buckets = _parse_tango_keyvals(out)
    amylo_vals = buckets.get("AMYLO", [])
    beta_vals = buckets.get("BETA", [])
    agg_vals = buckets.get("AGG", [])

    return {
        "tango_amylo_max": _agg(amylo_vals, "max"),
        "tango_amylo_sum": _agg(amylo_vals, "sum"),
        "tango_beta_max": _agg(beta_vals, "max"),
        "tango_beta_sum": _agg(beta_vals, "sum"),
        "tango_agg_sum": _agg(agg_vals, "sum"),
        "raw_output": out.strip(),
    }
|
| 1327 |
|
| 1328 |
# ==================== Gradio Interface ====================
|
| 1329 |
|
|
|
|
| 1349 |
half_life: bool,
|
| 1350 |
nonfouling: bool,
|
| 1351 |
binding_affinity: bool,
|
| 1352 |
+
tango_amyloid: bool,
|
| 1353 |
+
tango_beta: bool,
|
| 1354 |
+
include_physicochemical: bool,
|
| 1355 |
+
pH_value: float,
|
| 1356 |
progress=gr.Progress()
|
| 1357 |
):
|
| 1358 |
"""Main prediction function"""
|
|
|
|
| 1378 |
# Collect selected properties
|
| 1379 |
selected_properties = []
|
| 1380 |
|
|
|
|
| 1381 |
checkbox_to_keys = {
|
| 1382 |
'hemolysis': ['hemolysis_seq', 'hemolysis_smiles'],
|
| 1383 |
'solubility': ['solubility_seq', 'solubility_smiles'],
|
| 1384 |
+
'permeability': ['permeability_smiles'],
|
| 1385 |
'half_life': ['half_life_seq', 'binding_affinity_smiles'],
|
| 1386 |
'nonfouling': ['nonfouling_seq', 'nonfouling_smiles'], # adjust if you have a real cytotox model
|
| 1387 |
}
|
|
|
|
| 1442 |
})
|
| 1443 |
except Exception as e:
|
| 1444 |
print(f"Error predicting {prop}: {e}")
|
| 1445 |
+
if input_type == "Sequence":
|
| 1446 |
+
if include_physicochemical:
|
| 1447 |
+
seq_display = seq[:30] + '...' if len(seq) > 30 else seq
|
| 1448 |
+
progress((seq_idx + 0.3) / len(lines), f"Calculating physicochemical properties...")
|
| 1449 |
+
analysis = pred.analyze_sequence(seq, pH_value)
|
| 1450 |
+
|
| 1451 |
+
results.append({
|
| 1452 |
+
'Sequence': seq_display,
|
| 1453 |
+
'Property': 'π Length',
|
| 1454 |
+
'Prediction': '',
|
| 1455 |
+
'Value': str(analysis['length']),
|
| 1456 |
+
'Unit': 'aa'
|
| 1457 |
+
})
|
| 1458 |
+
results.append({
|
| 1459 |
+
'Sequence': seq_display,
|
| 1460 |
+
'Property': 'βοΈ Molecular Weight',
|
| 1461 |
+
'Prediction': '',
|
| 1462 |
+
'Value': f"{analysis['molecular_weight']:.1f}",
|
| 1463 |
+
'Unit': 'Da'
|
| 1464 |
+
})
|
| 1465 |
+
results.append({
|
| 1466 |
+
'Sequence': seq_display,
|
| 1467 |
+
'Property': f'β‘ Net Charge (pH {pH_value})',
|
| 1468 |
+
'Prediction': '',
|
| 1469 |
+
'Value': f"{analysis['net_charge']:.2f}",
|
| 1470 |
+
'Unit': ''
|
| 1471 |
+
})
|
| 1472 |
+
results.append({
|
| 1473 |
+
'Sequence': seq_display,
|
| 1474 |
+
'Property': 'π― Isoelectric Point',
|
| 1475 |
+
'Prediction': '',
|
| 1476 |
+
'Value': f"{analysis['isoelectric_point']:.2f}",
|
| 1477 |
+
'Unit': 'pH'
|
| 1478 |
+
})
|
| 1479 |
+
hydro = analysis['hydrophobicity']
|
| 1480 |
+
if hydro <= -4.5:
|
| 1481 |
+
hydro_label = "Hydrophilic"
|
| 1482 |
+
elif hydro >= 4.5:
|
| 1483 |
+
hydro_label = "Hydrophobic"
|
| 1484 |
+
else:
|
| 1485 |
+
hydro_label = "Intermediate"
|
| 1486 |
+
|
| 1487 |
+
results.append({
|
| 1488 |
+
'Sequence': seq_display,
|
| 1489 |
+
'Property': 'π¦ Hydrophobicity (GRAVY)',
|
| 1490 |
+
'Prediction': hydro_label,
|
| 1491 |
+
'Value': f"{hydro:.2f}",
|
| 1492 |
+
'Unit': 'GRAVY (Kyte-Doolittle)',
|
| 1493 |
+
})
|
| 1494 |
+
if input_type == "Sequence" and (tango_amyloid or tango_beta):
|
| 1495 |
+
try:
|
| 1496 |
+
# Run once per sequence
|
| 1497 |
+
tango_res = run_tango_for_sequence(
|
| 1498 |
+
seq,
|
| 1499 |
+
pH_value=pH_value,
|
| 1500 |
+
ident=f"seq{seq_idx+1}",
|
| 1501 |
+
params=None # override pH/te here if you want
|
| 1502 |
+
)
|
| 1503 |
+
|
| 1504 |
+
short_seq = seq[:30] + '...' if len(seq) > 30 else seq
|
| 1505 |
+
|
| 1506 |
+
if tango_amyloid and tango_res["tango_amylo_sum"] is not None:
|
| 1507 |
+
results.append({
|
| 1508 |
+
'Sequence': short_seq,
|
| 1509 |
+
'Property': "π§± TANGO Amyloid Aggregation",
|
| 1510 |
+
'Prediction': "",
|
| 1511 |
+
'Value': f"{tango_res['tango_amylo_sum']:.3f}",
|
| 1512 |
+
'Unit': "TANGO (sum)"
|
| 1513 |
+
})
|
| 1514 |
+
|
| 1515 |
+
if tango_beta and tango_res["tango_beta_sum"] is not None:
|
| 1516 |
+
results.append({
|
| 1517 |
+
'Sequence': short_seq,
|
| 1518 |
+
'Property': "𧬠TANGO β-sheet Aggregation",
|
| 1519 |
+
'Prediction': "",
|
| 1520 |
+
'Value': f"{tango_res['tango_beta_sum']:.3f}",
|
| 1521 |
+
'Unit': "TANGO (sum)"
|
| 1522 |
+
})
|
| 1523 |
+
|
| 1524 |
+
except Exception as e:
|
| 1525 |
+
print(f"Error running TANGO for sequence {seq_idx+1}: {e}")
|
| 1526 |
+
|
| 1527 |
# Handle binding affinity separately
|
| 1528 |
if binding_affinity and input_text:
|
| 1529 |
# SequenceβSequence binding
|
|
|
|
| 1540 |
protein_seq,
|
| 1541 |
binder_seq
|
| 1542 |
)
|
| 1543 |
+
kd_nM = pred.affinity_to_nM(affinity)
|
| 1544 |
+
|
| 1545 |
+
seq_label = f"Proteinβ{binder_seq[:20]}..."
|
| 1546 |
+
prop_base = pred.model_configs['binding_affinity']['display_name']
|
| 1547 |
+
|
| 1548 |
+
# Row 1: affinity score (pKd-like)
|
| 1549 |
results.append({
|
| 1550 |
+
'Sequence': seq_label,
|
| 1551 |
+
'Property': f"{prop_base} (score)",
|
| 1552 |
+
'Prediction': binding_class,
|
| 1553 |
'Value': f"{affinity:.3f}",
|
| 1554 |
+
'Unit': "Affinity score (pKd-like)",
|
| 1555 |
+
})
|
| 1556 |
+
|
| 1557 |
+
# Row 2: converted Kd in nM
|
| 1558 |
+
results.append({
|
| 1559 |
+
'Sequence': seq_label,
|
| 1560 |
+
'Property': f"{prop_base} (Kd est.)",
|
| 1561 |
+
'Prediction': binding_class,
|
| 1562 |
+
'Value': f"{kd_nM:.3g}",
|
| 1563 |
+
'Unit': "nM (Kd/Ki/IC50)",
|
| 1564 |
})
|
| 1565 |
except Exception as e:
|
| 1566 |
print(f"Error in sequence binding prediction: {e}")
|
|
|
|
| 1583 |
protein_seq,
|
| 1584 |
smi
|
| 1585 |
)
|
| 1586 |
+
kd_nM = pred.affinity_to_nM(affinity)
|
| 1587 |
+
|
| 1588 |
+
seq_label = f"Proteinβ{smi[:20]}..."
|
| 1589 |
+
prop_base = pred.model_configs['binding_affinity_smiles']['display_name']
|
| 1590 |
+
|
| 1591 |
+
# Row 1: affinity score (pKd-like)
|
| 1592 |
results.append({
|
| 1593 |
+
'Sequence': seq_label,
|
| 1594 |
+
'Property': f"{prop_base} (score)",
|
| 1595 |
+
'Prediction': label, # Tight / Medium / Weak
|
| 1596 |
'Value': f"{affinity:.3f}",
|
| 1597 |
+
'Unit': "Affinity score (pKd-like)",
|
| 1598 |
+
})
|
| 1599 |
+
|
| 1600 |
+
# Row 2: converted Kd in nM
|
| 1601 |
+
results.append({
|
| 1602 |
+
'Sequence': seq_label,
|
| 1603 |
+
'Property': f"{prop_base} (Kd est.)",
|
| 1604 |
+
'Prediction': label,
|
| 1605 |
+
'Value': f"{kd_nM:.3g}",
|
| 1606 |
+
'Unit': "nM (Kd/Ki/IC50)",
|
| 1607 |
})
|
| 1608 |
except Exception as e:
|
| 1609 |
print(f"Error in SMILES binding prediction: {e}")
|
|
|
|
| 1697 |
return "", ""
|
| 1698 |
|
| 1699 |
def on_example_change(name: str):
|
| 1700 |
+
binder, protein = load_example(name)
|
| 1701 |
show_protein = (name == "Protein-Peptide")
|
| 1702 |
return (
|
| 1703 |
gr.update(value=binder), # input_text
|
|
|
|
| 1737 |
text-align: center;
|
| 1738 |
margin-bottom: 10px !important;
|
| 1739 |
}
|
| 1740 |
+
h3 {
|
| 1741 |
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 1742 |
-webkit-background-clip: text;
|
| 1743 |
-webkit-text-fill-color: transparent;
|
|
|
|
| 1764 |
gr.Markdown(
|
| 1765 |
"""
|
| 1766 |
# π PeptiVerse
|
| 1767 |
+
### \t Peptide Property Predictions
|
| 1768 |
"""
|
| 1769 |
)
|
| 1770 |
|
|
|
|
| 1813 |
with gr.Column(scale=1):
|
| 1814 |
with gr.Group():
|
| 1815 |
gr.Markdown("### βοΈ Select Properties")
|
| 1816 |
+
with gr.Accordion("Physicochemical Properties", open=True):
|
| 1817 |
+
include_physicochemical = gr.Checkbox(
|
| 1818 |
+
label="π§ͺ Calculate Basic Properties",
|
| 1819 |
+
value=True,
|
| 1820 |
+
info="MW, net charge, pI, hydrophobicity"
|
| 1821 |
+
)
|
| 1822 |
+
|
| 1823 |
+
pH_value = gr.Slider(
|
| 1824 |
+
minimum=0,
|
| 1825 |
+
maximum=14,
|
| 1826 |
+
value=7.0,
|
| 1827 |
+
step=0.1,
|
| 1828 |
+
label="pH for Net Charge",
|
| 1829 |
+
info="Physiological pH is ~7.4"
|
| 1830 |
+
)
|
| 1831 |
with gr.Accordion("Sequence Properties", open=True):
|
| 1832 |
hemolysis = gr.Checkbox(label="π©Έ Hemolysis β", value=True)
|
| 1833 |
solubility = gr.Checkbox(label="π§ Solubility β", value=True)
|
| 1834 |
permeability = gr.Checkbox(label="πͺ£ Permeability β", value=False)
|
| 1835 |
half_life = gr.Checkbox(label="β±οΈ Half-life β", value=False)
|
| 1836 |
+
nonfouling = gr.Checkbox(label="π― Non-Fouling β", value=False)
|
| 1837 |
+
tango_amyloid = gr.Checkbox(label="π§± TANGO Amyloid Aggregation β", value=False)
|
| 1838 |
+
tango_beta = gr.Checkbox(label="𧬠TANGO Ξ²-sheet Aggregation β", value=False)
|
| 1839 |
with gr.Accordion("Binding Prediction", open=False):
|
| 1840 |
binding_affinity = gr.Checkbox(label="π Binding Affinity β", value=False)
|
| 1841 |
gr.Markdown("*Requires protein sequence input*")
|
|
|
|
| 1845 |
with gr.Column(scale=1):
|
| 1846 |
property_selector = gr.Dropdown(
|
| 1847 |
choices=["hemolysis", "solubility", "permeability", "half_life (smiles)",
|
| 1848 |
+
"nonfouling", "binding_affinity", "tango_amyloid", "tango_beta"],
|
| 1849 |
label="Select Property",
|
| 1850 |
value="hemolysis"
|
| 1851 |
)
|
|
|
|
| 1927 |
input_text, input_type, protein_seq,
|
| 1928 |
hemolysis, solubility, permeability,
|
| 1929 |
half_life, nonfouling,
|
| 1930 |
+
binding_affinity, tango_amyloid, tango_beta, include_physicochemical, pH_value,
|
| 1931 |
],
|
| 1932 |
outputs=[results_df, status_output]
|
| 1933 |
)
|
description.md
CHANGED
|
@@ -10,6 +10,7 @@ Our models are trained on curated datasets from multiple sources:
|
|
| 10 |
- **Size:** 9,316 peptides, with 19.6% being positive (hemolytic) and 80.4% being negative (nonhemolytic)
|
| 11 |
- **Description:** Probability of peptide disrupting red blood cell membranes.
|
| 12 |
- **Download:** [hemo-positive.npz](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/hemo-positive.npz)
|
|
|
|
| 13 |
|
| 14 |
#### Solubility Dataset
|
| 15 |
- **Primary Source:** [PROSO-II](https://febs.onlinelibrary.wiley.com/doi/abs/10.1111/j.1742-4658.2012.08603.x)
|
|
@@ -45,6 +46,7 @@ Our models are trained on curated datasets from multiple sources:
|
|
| 45 |
- **Description:** Binding probability normalized in PepLand already. It's a combination of IC50/EC50.
|
| 46 |
- **Quality:** Binding class cutoffs: Tight ≥ 7.5, Medium 6.0–7.5, Weak < 6.0
|
| 47 |
- **Download:** [binding_affinity_training_data.csv](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/c-binding.csv)
|
|
|
|
| 48 |
|
| 49 |
### Model Architecture
|
| 50 |
|
|
@@ -56,6 +58,22 @@ Our models are trained on curated datasets from multiple sources:
|
|
| 56 |
### Model Training and Weight Hosting
|
| 57 |
- [Classifier_weights](https://huggingface.co/ChatterjeeLab/Classifier_Weight)
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
### Citation
|
| 60 |
|
| 61 |
If you use this tool, please cite:
|
|
|
|
| 10 |
- **Size:** 9,316 peptides, with 19.6% being positive (hemolytic) and 80.4% being negative (nonhemolytic)
|
| 11 |
- **Description:** Probability of peptide disrupting red blood cell membranes.
|
| 12 |
- **Download:** [hemo-positive.npz](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/hemo-positive.npz)
|
| 13 |
+
- **Interpretation:** HC50 is the concentration (x ug/ml) at which 50% of red blood cells are lysed. If HC50 < 100 uM, the peptide is considered hemolytic; otherwise it is considered non-hemolytic.
|
| 14 |
|
| 15 |
#### Solubility Dataset
|
| 16 |
- **Primary Source:** [PROSO-II](https://febs.onlinelibrary.wiley.com/doi/abs/10.1111/j.1742-4658.2012.08603.x)
|
|
|
|
| 46 |
- **Description:** Binding probability normalized in PepLand already. It's a combination of IC50/EC50.
|
| 47 |
- **Quality:** Binding class cutoffs: Tight ≥ 7.5, Medium 6.0–7.5, Weak < 6.0
|
| 48 |
- **Download:** [binding_affinity_training_data.csv](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/c-binding.csv)
|
| 49 |
+
- **Interpretation:** Affinity_measure = -log_10(K).
|
| 50 |
|
| 51 |
### Model Architecture
|
| 52 |
|
|
|
|
| 58 |
### Model Training and Weight Hosting
|
| 59 |
- [Classifier_weights](https://huggingface.co/ChatterjeeLab/Classifier_Weight)
|
| 60 |
|
| 61 |
+
### 🧪 Physicochemical Properties
|
| 62 |
+
|
| 63 |
+
#### Net Charge Calculation
|
| 64 |
+
- Uses Henderson-Hasselbalch equation
|
| 65 |
+
- pH-dependent calculation
|
| 66 |
+
- Considers all ionizable groups (K, R, H, D, E, C, Y, termini)
|
| 67 |
+
|
| 68 |
+
#### Isoelectric Point (pI)
|
| 69 |
+
- Bisection method to find pH where net charge = 0
|
| 70 |
+
- Precision: ±0.01 pH units
|
| 71 |
+
|
| 72 |
+
#### Hydrophobicity (GRAVY)
|
| 73 |
+
- Grand Average of Hydropathy
|
| 74 |
+
- Uses Kyte-Doolittle scale
|
| 75 |
+
- Range: -4.5 (hydrophilic) to +4.5 (hydrophobic)
|
| 76 |
+
|
| 77 |
### Citation
|
| 78 |
|
| 79 |
If you use this tool, please cite:
|
tango_x86_64_release
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0e381c28f847487069b0df29bb9d4f766391066710500d3170ecb73d9f31dbf
|
| 3 |
+
size 211205
|
tokenizer/__pycache__/__init__.cpython-310.pyc
CHANGED
|
Binary files a/tokenizer/__pycache__/__init__.cpython-310.pyc and b/tokenizer/__pycache__/__init__.cpython-310.pyc differ
|
|
|
tokenizer/__pycache__/my_tokenizers.cpython-310.pyc
CHANGED
|
Binary files a/tokenizer/__pycache__/my_tokenizers.cpython-310.pyc and b/tokenizer/__pycache__/my_tokenizers.cpython-310.pyc differ
|
|
|