yinuozhang commited on
Commit
3aedb16
Β·
1 Parent(s): 9e9ca0b
app.py CHANGED
@@ -11,10 +11,17 @@ from pathlib import Path
11
  import json
12
  import time
13
  from typing import List, Dict, Any, Tuple, Optional
14
-
 
15
  from huggingface_hub import snapshot_download
16
  from pathlib import Path
17
  import os
 
 
 
 
 
 
18
 
19
  def pick_assets_root() -> Path:
20
  # HF Spaces container uses /home/user; detect via SPACE_ID or existence
@@ -58,8 +65,8 @@ for k, v in {
58
  ASSETS_MODELS = ASSETS / "models"; ASSETS_MODELS.mkdir(parents=True, exist_ok=True)
59
  ASSETS_DATA = ASSETS / "training_data"; ASSETS_DATA.mkdir(parents=True, exist_ok=True)
60
 
61
- MODEL_REPO = "ChatterjeeLab/Classifier_Weight"
62
- DATASET_REPO = "ChatterjeeLab/Classifier_Weight"
63
 
64
  def fetch_models_and_data():
65
  snapshot_download(
@@ -106,6 +113,135 @@ def is_smiles_like(s: str) -> bool:
106
  maybe_smiles_chars = set("=#()[]+\\/-@1234567890")
107
  return (any(ch in maybe_smiles_chars for ch in s) or not is_aa_sequence_like(s)) and len(s) >= 2
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  # ==================== Model Classes ====================
110
 
111
  # --- add this utility somewhere above UnifiedPeptidePredictor ---
@@ -157,7 +293,7 @@ from transformers import AutoModelForMaskedLM
157
  class PeptideCLMFeaturizer:
158
  """
159
  Mean-pool hidden states from PeptideCLM-23M-all for SMILES tokens produced by SMILES_SPE_Tokenizer.
160
- Use the SAME tokenizer files, max_length, and pooling you used in training your XGB models.
161
  """
162
  def __init__(self, vocab_path: str, splits_path: str, device: torch.device, max_length: int = 256):
163
  self.device = device
@@ -535,7 +671,6 @@ class TrainingDataManager:
535
  if len(vals) == 0:
536
  return None
537
 
538
- # Use the conventional log Peff unit; keep your prior display threshold (-4.0) or set median
539
  threshold_default = float(np.median(vals))
540
  return {
541
  "values": vals,
@@ -764,7 +899,7 @@ class UnifiedPeptidePredictor:
764
  # Model registry
765
  self.models = {}
766
  self.model_configs = self.get_model_configs()
767
-
768
  # Data manager
769
  self.data_manager = TrainingDataManager(data_dir=ASSETS_DATA)
770
  self._protein_cache = {}
@@ -837,8 +972,8 @@ class UnifiedPeptidePredictor:
837
  'path': 'models/best_model_nonfouling.json',
838
  'unit': 'Probability',
839
  'display_name': 'πŸ‘― Non-Fouling',
840
- 'positive_label': 'Non-fouling',
841
- 'negative_label': 'Fouling'
842
  },
843
  'nonfouling_smiles': {
844
  'type': 'xgboost',
@@ -846,8 +981,8 @@ class UnifiedPeptidePredictor:
846
  'path': 'models/nonfouling-xgboost_smiles.json',
847
  'unit': 'Probability',
848
  'display_name': 'πŸ‘― Non-Fouling',
849
- 'positive_label': 'Non-fouling',
850
- 'negative_label': 'Fouling'
851
  },
852
  'binding_affinity': {
853
  'type': 'binding',
@@ -859,12 +994,22 @@ class UnifiedPeptidePredictor:
859
  'binding_affinity_smiles': {
860
  'type': 'binding_smiles',
861
  'input': 'sequence+smiles',
862
- 'path': 'models/binding_affinity_smiles.pt',
863
  'unit': 'Probability',
864
  'display_name': 'πŸ”— Binding Affinity (SMILES)'
865
  },
866
  }
867
-
 
 
 
 
 
 
 
 
 
 
868
  def load_all_models(self):
869
  """Load all available models"""
870
  for name, config in self.model_configs.items():
@@ -1076,7 +1221,109 @@ class UnifiedPeptidePredictor:
1076
 
1077
  def _features_from_smiles_peptclm(self, s: str) -> np.ndarray:
1078
  return self.smiles_featurizer.embed_list([s])[0]
 
 
 
 
 
 
 
 
 
 
1079
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1080
 
1081
  # ==================== Gradio Interface ====================
1082
 
@@ -1102,6 +1349,10 @@ def predict_properties(
1102
  half_life: bool,
1103
  nonfouling: bool,
1104
  binding_affinity: bool,
 
 
 
 
1105
  progress=gr.Progress()
1106
  ):
1107
  """Main prediction function"""
@@ -1127,11 +1378,10 @@ def predict_properties(
1127
  # Collect selected properties
1128
  selected_properties = []
1129
 
1130
- # Map UI checkboxes to your internal model keys
1131
  checkbox_to_keys = {
1132
  'hemolysis': ['hemolysis_seq', 'hemolysis_smiles'],
1133
  'solubility': ['solubility_seq', 'solubility_smiles'],
1134
- 'permeability': ['permeability_smiles'], # only smiles in your current config
1135
  'half_life': ['half_life_seq', 'binding_affinity_smiles'],
1136
  'nonfouling': ['nonfouling_seq', 'nonfouling_smiles'], # adjust if you have a real cytotox model
1137
  }
@@ -1192,7 +1442,88 @@ def predict_properties(
1192
  })
1193
  except Exception as e:
1194
  print(f"Error predicting {prop}: {e}")
1195
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1196
  # Handle binding affinity separately
1197
  if binding_affinity and input_text:
1198
  # Sequence–Sequence binding
@@ -1209,12 +1540,27 @@ def predict_properties(
1209
  protein_seq,
1210
  binder_seq
1211
  )
 
 
 
 
 
 
1212
  results.append({
1213
- 'Sequence': f"Protein–{binder_seq[:20]}...",
1214
- 'Property': pred.model_configs['binding_affinity']['display_name'],
1215
- 'Prediction': binding_class, # e.g., Tight/Medium/Weak
1216
  'Value': f"{affinity:.3f}",
1217
- 'Unit': pred.model_configs['binding_affinity']['unit']
 
 
 
 
 
 
 
 
 
1218
  })
1219
  except Exception as e:
1220
  print(f"Error in sequence binding prediction: {e}")
@@ -1237,12 +1583,27 @@ def predict_properties(
1237
  protein_seq,
1238
  smi
1239
  )
 
 
 
 
 
 
1240
  results.append({
1241
- 'Sequence': f"Protein–{smi[:20]}...",
1242
- 'Property': pred.model_configs['binding_affinity_smiles']['display_name'],
1243
- 'Prediction': label, # Tight (οΏ½οΏ½οΏ½7.5) / Medium (6.0–7.5) / Weak (<6.0)
1244
  'Value': f"{affinity:.3f}",
1245
- 'Unit': pred.model_configs['binding_affinity_smiles']['unit'],
 
 
 
 
 
 
 
 
 
1246
  })
1247
  except Exception as e:
1248
  print(f"Error in SMILES binding prediction: {e}")
@@ -1336,7 +1697,7 @@ def load_example(example_name):
1336
  return "", ""
1337
 
1338
  def on_example_change(name: str):
1339
- binder, protein = load_example(name) # your helper above
1340
  show_protein = (name == "Protein-Peptide")
1341
  return (
1342
  gr.update(value=binder), # input_text
@@ -1376,7 +1737,7 @@ h1 {
1376
  text-align: center;
1377
  margin-bottom: 10px !important;
1378
  }
1379
- h2 {
1380
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
1381
  -webkit-background-clip: text;
1382
  -webkit-text-fill-color: transparent;
@@ -1403,7 +1764,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
1403
  gr.Markdown(
1404
  """
1405
  # 🌐 PeptiVerse
1406
- ## \t Peptide Property Predictions
1407
  """
1408
  )
1409
 
@@ -1452,13 +1813,29 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
1452
  with gr.Column(scale=1):
1453
  with gr.Group():
1454
  gr.Markdown("### βš™οΈ Select Properties")
1455
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1456
  with gr.Accordion("Sequence Properties", open=True):
1457
  hemolysis = gr.Checkbox(label="🩸 Hemolysis ↓", value=True)
1458
  solubility = gr.Checkbox(label="πŸ’§ Solubility ↑", value=True)
1459
  permeability = gr.Checkbox(label="πŸͺ£ Permeability ↑", value=False)
1460
  half_life = gr.Checkbox(label="⏱️ Half-life ↑", value=False)
1461
- nonfouling = gr.Checkbox(label="πŸ‘― Non-Fouling ↑", value=False)
 
 
1462
  with gr.Accordion("Binding Prediction", open=False):
1463
  binding_affinity = gr.Checkbox(label="πŸ”— Binding Affinity ↑", value=False)
1464
  gr.Markdown("*Requires protein sequence input*")
@@ -1468,7 +1845,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
1468
  with gr.Column(scale=1):
1469
  property_selector = gr.Dropdown(
1470
  choices=["hemolysis", "solubility", "permeability", "half_life (smiles)",
1471
- "nonfouling", "binding_affinity"],
1472
  label="Select Property",
1473
  value="hemolysis"
1474
  )
@@ -1550,7 +1927,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
1550
  input_text, input_type, protein_seq,
1551
  hemolysis, solubility, permeability,
1552
  half_life, nonfouling,
1553
- binding_affinity
1554
  ],
1555
  outputs=[results_df, status_output]
1556
  )
 
11
  import json
12
  import time
13
  from typing import List, Dict, Any, Tuple, Optional
14
+ import subprocess
15
+ from collections import defaultdict
16
  from huggingface_hub import snapshot_download
17
  from pathlib import Path
18
  import os
19
+ try:
20
+ from Bio.SeqUtils.ProtParam import ProteinAnalysis
21
+ BIOPYTHON_AVAILABLE = True
22
+ except ImportError:
23
+ BIOPYTHON_AVAILABLE = False
24
+ print("BioPython not available. Using fallback for pI/charge calculations.")
25
 
26
  def pick_assets_root() -> Path:
27
  # HF Spaces container uses /home/user; detect via SPACE_ID or existence
 
65
  ASSETS_MODELS = ASSETS / "models"; ASSETS_MODELS.mkdir(parents=True, exist_ok=True)
66
  ASSETS_DATA = ASSETS / "training_data"; ASSETS_DATA.mkdir(parents=True, exist_ok=True)
67
 
68
+ MODEL_REPO = "ChatterjeeLab/Classifier_Weight" # model repo
69
+ DATASET_REPO = "ChatterjeeLab/Classifier_Weight" # dataset repo (create this)
70
 
71
  def fetch_models_and_data():
72
  snapshot_download(
 
113
  maybe_smiles_chars = set("=#()[]+\\/-@1234567890")
114
  return (any(ch in maybe_smiles_chars for ch in s) or not is_aa_sequence_like(s)) and len(s) >= 2
115
 
116
+ # ==================== Sequence Analysis ====================
117
+
118
class SequenceAnalyzer:
    """Calculate physicochemical properties of peptide sequences.

    Each public classmethod delegates to BioPython's ``ProteinAnalysis`` when
    the module-level ``BIOPYTHON_AVAILABLE`` flag is true. If BioPython is
    unavailable, or raises for the given sequence, the built-in approximation
    defined in this class is used instead.
    """

    # pKa values for the ionizable groups used by the fallback charge model
    PKA_VALUES = {
        'N_term': 9.6,
        'C_term': 2.3,
        'D': 3.9,   # Aspartic acid
        'E': 4.2,   # Glutamic acid
        'H': 6.0,   # Histidine
        'C': 8.3,   # Cysteine
        'Y': 10.1,  # Tyrosine
        'K': 10.5,  # Lysine
        'R': 12.5,  # Arginine
    }

    # Side chains that carry +1 when protonated (basic) or -1 when
    # deprotonated (acidic) in the fallback charge model.
    _BASIC_RESIDUES = frozenset('KRH')
    _ACIDIC_RESIDUES = frozenset('DECY')

    # Approximate per-amino-acid masses (Da) used by the fallback MW estimate;
    # one water is subtracted per peptide bond.
    _RESIDUE_WEIGHTS = {
        'A': 89.1, 'C': 121.2, 'D': 133.1, 'E': 147.1, 'F': 165.2,
        'G': 75.1, 'H': 155.2, 'I': 131.2, 'K': 146.2, 'L': 131.2,
        'M': 149.2, 'N': 132.1, 'P': 115.1, 'Q': 146.2, 'R': 174.2,
        'S': 105.1, 'T': 119.1, 'V': 117.1, 'W': 204.2, 'Y': 181.2,
    }
    _WATER_WEIGHT = 18.0

    # Kyte-Doolittle hydropathy scale
    _KYTE_DOOLITTLE = {
        'A': 1.8, 'C': 2.5, 'D': -3.5, 'E': -3.5, 'F': 2.8,
        'G': -0.4, 'H': -3.2, 'I': 4.5, 'K': -3.9, 'L': 3.8,
        'M': 1.9, 'N': -3.5, 'P': -1.6, 'Q': -3.5, 'R': -4.5,
        'S': -0.8, 'T': -0.7, 'V': 4.2, 'W': -0.9, 'Y': -1.3,
    }

    @staticmethod
    def _from_biopython(sequence: str, method_name: str, *args):
        """Return ``ProteinAnalysis(sequence).<method_name>(*args)``.

        Returns ``None`` when BioPython is not available or the call fails, so
        that the caller can use its own fallback calculation.
        """
        if not BIOPYTHON_AVAILABLE:
            return None
        try:
            return getattr(ProteinAnalysis(sequence), method_name)(*args)
        except Exception:
            # Deliberate best effort: any BioPython failure (e.g. a sequence it
            # cannot handle) falls through to the local approximation.
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return None

    @classmethod
    def calculate_net_charge(cls, sequence: str, pH: float = 7.0) -> float:
        """Calculate net charge at the given pH (Henderson-Hasselbalch).

        The fallback sums the fractional charge of both termini and of every
        K/R/H (positive) and D/E/C/Y (negative) side chain, rounded to two
        decimals. Residue letters are matched case-insensitively.
        """
        value = cls._from_biopython(sequence, 'charge_at_pH', pH)
        if value is not None:
            return value

        pka = cls.PKA_VALUES
        # N-terminus (positive when protonated)
        charge = 1 / (1 + 10 ** (pH - pka['N_term']))
        # C-terminus (negative when deprotonated)
        charge -= 1 / (1 + 10 ** (pka['C_term'] - pH))

        # The lookup tables are keyed by upper-case one-letter codes.
        for aa in sequence.upper():
            if aa in cls._BASIC_RESIDUES:
                charge += 1 / (1 + 10 ** (pH - pka[aa]))
            elif aa in cls._ACIDIC_RESIDUES:
                charge -= 1 / (1 + 10 ** (pka[aa] - pH))

        return round(charge, 2)

    @classmethod
    def calculate_isoelectric_point(cls, sequence: str) -> float:
        """Calculate the theoretical pI.

        The fallback bisects the pH interval [0, 14] until it is narrower than
        0.01 or the net charge magnitude drops below 0.01, and returns the pH
        rounded to two decimals.
        """
        value = cls._from_biopython(sequence, 'isoelectric_point')
        if value is not None:
            return value

        pH_min, pH_max = 0.0, 14.0
        epsilon = 0.01

        while (pH_max - pH_min) > epsilon:
            pH_mid = (pH_min + pH_max) / 2
            charge = cls.calculate_net_charge(sequence, pH_mid)

            if abs(charge) < epsilon:
                return round(pH_mid, 2)

            # Net charge falls as pH rises, so a positive charge means the pI
            # lies above the midpoint.
            if charge > 0:
                pH_min = pH_mid
            else:
                pH_max = pH_mid

        return round((pH_min + pH_max) / 2, 2)

    @classmethod
    def calculate_molecular_weight(cls, sequence: str) -> float:
        """Calculate molecular weight in Da.

        The fallback adds approximate amino-acid masses and subtracts one
        water per peptide bond. Unknown letters contribute 0. An empty
        sequence yields 0.0 (previously the water correction produced +18.0).
        """
        value = cls._from_biopython(sequence, 'molecular_weight')
        if value is not None:
            return value

        if not sequence:
            return 0.0

        weights = cls._RESIDUE_WEIGHTS
        mw = sum(weights.get(aa, 0) for aa in sequence.upper())
        # Subtract water for peptide bonds
        mw -= cls._WATER_WEIGHT * (len(sequence) - 1)
        return round(mw, 1)

    @classmethod
    def calculate_hydrophobicity(cls, sequence: str) -> float:
        """Calculate GRAVY (grand average of hydropathy).

        The fallback averages Kyte-Doolittle values over the sequence length,
        rounded to two decimals. Unknown letters contribute 0 and an empty
        sequence yields 0.
        """
        value = cls._from_biopython(sequence, 'gravy')
        if value is not None:
            return value

        if len(sequence) == 0:
            return 0

        scale = cls._KYTE_DOOLITTLE
        total = sum(scale.get(aa, 0) for aa in sequence.upper())
        return round(total / len(sequence), 2)
244
+
245
  # ==================== Model Classes ====================
246
 
247
  # --- add this utility somewhere above UnifiedPeptidePredictor ---
 
293
  class PeptideCLMFeaturizer:
294
  """
295
  Mean-pool hidden states from PeptideCLM-23M-all for SMILES tokens produced by SMILES_SPE_Tokenizer.
296
+ Use the SAME tokenizer files, max_length, and pooling you used in training XGB models.
297
  """
298
  def __init__(self, vocab_path: str, splits_path: str, device: torch.device, max_length: int = 256):
299
  self.device = device
 
671
  if len(vals) == 0:
672
  return None
673
 
 
674
  threshold_default = float(np.median(vals))
675
  return {
676
  "values": vals,
 
899
  # Model registry
900
  self.models = {}
901
  self.model_configs = self.get_model_configs()
902
+ self.sequence_analyzer = SequenceAnalyzer()
903
  # Data manager
904
  self.data_manager = TrainingDataManager(data_dir=ASSETS_DATA)
905
  self._protein_cache = {}
 
972
  'path': 'models/best_model_nonfouling.json',
973
  'unit': 'Probability',
974
  'display_name': 'πŸ‘― Non-Fouling',
975
+ 'positive_label': 'Non-toxic',
976
+ 'negative_label': 'Toxic'
977
  },
978
  'nonfouling_smiles': {
979
  'type': 'xgboost',
 
981
  'path': 'models/nonfouling-xgboost_smiles.json',
982
  'unit': 'Probability',
983
  'display_name': 'πŸ‘― Non-Fouling',
984
+ 'positive_label': 'Stable',
985
+ 'negative_label': 'Unstable'
986
  },
987
  'binding_affinity': {
988
  'type': 'binding',
 
994
  'binding_affinity_smiles': {
995
  'type': 'binding_smiles',
996
  'input': 'sequence+smiles',
997
+ 'path': 'models/binding-affinity_smiles.pt',
998
  'unit': 'Probability',
999
  'display_name': 'πŸ”— Binding Affinity (SMILES)'
1000
  },
1001
  }
1002
def analyze_sequence(self, sequence: str, pH: float = 7.0) -> Dict[str, Any]:
    """Collect the basic physicochemical descriptors of a sequence.

    Args:
        sequence: Peptide sequence handed to ``self.sequence_analyzer``.
        pH: pH at which the net charge is evaluated.

    Returns:
        A dict with the keys ``length``, ``molecular_weight``, ``net_charge``,
        ``isoelectric_point`` and ``hydrophobicity`` (in that order).
    """
    analyzer = self.sequence_analyzer
    return {
        'length': len(sequence),
        'molecular_weight': analyzer.calculate_molecular_weight(sequence),
        'net_charge': analyzer.calculate_net_charge(sequence, pH),
        'isoelectric_point': analyzer.calculate_isoelectric_point(sequence),
        'hydrophobicity': analyzer.calculate_hydrophobicity(sequence),
    }
1013
  def load_all_models(self):
1014
  """Load all available models"""
1015
  for name, config in self.model_configs.items():
 
1221
 
1222
  def _features_from_smiles_peptclm(self, s: str) -> np.ndarray:
1223
  return self.smiles_featurizer.embed_list([s])[0]
1224
+
1225
@staticmethod
def affinity_to_nM(affinity: float) -> float:
    """Turn a -log10(K [M]) affinity score into a concentration in nM.

    The score is treated as pKd / pKi / pIC50 style, so the constant in molar
    units is ``10 ** -affinity``; multiplying by 1e9 converts M to nM.
    """
    molar = 10.0 ** (-float(affinity))
    return molar * 1e9
1233
+
1234
 
1235
+ # ==================== TANGO INTEGRATION ====================
1236
+
1237
# The TANGO executable is expected in the same folder as this script.
try:
    HERE = Path(__file__).resolve().parent
except NameError:
    # __file__ is undefined (e.g. interactive session): use the working dir.
    HERE = Path.cwd().resolve()

TANGO_EXE = str(HERE.joinpath("tango_x86_64_release"))

# Default TANGO command-line parameters (all values are passed as strings).
DEFAULT_TANGO_PARAMS = dict(
    nt="N",
    ct="N",
    ph="7.0",
    te="310",  # Kelvin (~37 °C)
    io="0.05",
    tf="0",
    stab="-10",
    conc="0.0001",
)
1256
+
1257
+ def _parse_tango_keyvals(text: str) -> dict:
1258
+ """
1259
+ Parse lines like:
1260
+ 'AGG 0 AMYLO 6.41e-13 TURN 7.06 HELIX 0 HELAGG 0 BETA 19.67'
1261
+ into {'AMYLO': [...], 'BETA': [...], ...}
1262
+ """
1263
+ buckets = defaultdict(list)
1264
+ for line in text.splitlines():
1265
+ pairs = re.findall(
1266
+ r'\b(AGG|AMYLO|TURN|HELIX|HELAGG|BETA)\s+([+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)\b',
1267
+ line
1268
+ )
1269
+ for k, v in pairs:
1270
+ try:
1271
+ buckets[k].append(float(v))
1272
+ except ValueError:
1273
+ pass
1274
+ return dict(buckets)
1275
+
1276
+ def _agg(vals, how="sum"):
1277
+ if not vals:
1278
+ return None
1279
+ if how == "sum":
1280
+ return float(sum(vals))
1281
+ if how == "max":
1282
+ return float(max(vals))
1283
+ if how == "mean":
1284
+ return float(sum(vals) / len(vals))
1285
+ return None
1286
+
1287
def run_tango_for_sequence(
    seq: str,
    pH_value: str | float,
    ident: str = "seq",
    params: dict | None = None,
    exe: str = TANGO_EXE,
) -> dict:
    """Run TANGO on a single sequence and summarize its score columns.

    Args:
        seq: Sequence passed to TANGO as ``seq=...``.
        pH_value: pH passed as the ``ph`` parameter; it overrides both the
            default and any ``ph`` entry in ``params``.
        ident: Identifier passed as TANGO's first argument.
        params: Optional overrides merged on top of ``DEFAULT_TANGO_PARAMS``.
        exe: Path of the TANGO executable.

    Returns:
        A dict with:
        - ``tango_amylo_max`` / ``tango_amylo_sum``: amyloid aggregation (AMYLO)
        - ``tango_beta_max`` / ``tango_beta_sum``: β-sheet aggregation (BETA)
        - ``tango_agg_sum``: summed AGG column
        - ``raw_output``: stdout, followed by a ``[STDERR]`` section if any
        Each numeric entry is ``None`` when the column was not found.
    """
    merged = {**DEFAULT_TANGO_PARAMS, **(params or {})}
    merged["ph"] = pH_value

    # Build an argv list and run it without a shell. `seq` comes from user
    # input, so it must never be interpolated into a shell command line.
    # The values are unquoted here because no shell is present to strip
    # quotes; TANGO receives the same key=value tokens as before.
    argv = [exe, ident]
    argv.extend(f"{key}={value}" for key, value in merged.items())
    argv.append(f"seq={seq}")

    try:
        proc = subprocess.run(argv, capture_output=True, text=True, check=False)
    except OSError as err:
        # Missing or non-executable binary: report it through raw_output, as
        # the former shell invocation did via stderr, instead of raising.
        out = f"\n[STDERR]\n{err}"
    else:
        out = (proc.stdout or "") + (("\n[STDERR]\n" + proc.stderr) if proc.stderr else "")

    buckets = _parse_tango_keyvals(out)
    amylo_vals = buckets.get("AMYLO", [])
    beta_vals = buckets.get("BETA", [])
    agg_vals = buckets.get("AGG", [])

    return {
        "tango_amylo_max": _agg(amylo_vals, "max"),
        "tango_amylo_sum": _agg(amylo_vals, "sum"),
        "tango_beta_max": _agg(beta_vals, "max"),
        "tango_beta_sum": _agg(beta_vals, "sum"),
        "tango_agg_sum": _agg(agg_vals, "sum"),
        "raw_output": out.strip(),
    }
1327
 
1328
  # ==================== Gradio Interface ====================
1329
 
 
1349
  half_life: bool,
1350
  nonfouling: bool,
1351
  binding_affinity: bool,
1352
+ tango_amyloid: bool,
1353
+ tango_beta: bool,
1354
+ include_physicochemical: bool,
1355
+ pH_value: float,
1356
  progress=gr.Progress()
1357
  ):
1358
  """Main prediction function"""
 
1378
  # Collect selected properties
1379
  selected_properties = []
1380
 
 
1381
  checkbox_to_keys = {
1382
  'hemolysis': ['hemolysis_seq', 'hemolysis_smiles'],
1383
  'solubility': ['solubility_seq', 'solubility_smiles'],
1384
+ 'permeability': ['permeability_smiles'],
1385
  'half_life': ['half_life_seq', 'binding_affinity_smiles'],
1386
  'nonfouling': ['nonfouling_seq', 'nonfouling_smiles'], # adjust if you have a real cytotox model
1387
  }
 
1442
  })
1443
  except Exception as e:
1444
  print(f"Error predicting {prop}: {e}")
1445
+ if input_type == "Sequence":
1446
+ if include_physicochemical:
1447
+ seq_display = seq[:30] + '...' if len(seq) > 30 else seq
1448
+ progress((seq_idx + 0.3) / len(lines), f"Calculating physicochemical properties...")
1449
+ analysis = pred.analyze_sequence(seq, pH_value)
1450
+
1451
+ results.append({
1452
+ 'Sequence': seq_display,
1453
+ 'Property': 'πŸ“ Length',
1454
+ 'Prediction': '',
1455
+ 'Value': str(analysis['length']),
1456
+ 'Unit': 'aa'
1457
+ })
1458
+ results.append({
1459
+ 'Sequence': seq_display,
1460
+ 'Property': 'βš–οΈ Molecular Weight',
1461
+ 'Prediction': '',
1462
+ 'Value': f"{analysis['molecular_weight']:.1f}",
1463
+ 'Unit': 'Da'
1464
+ })
1465
+ results.append({
1466
+ 'Sequence': seq_display,
1467
+ 'Property': f'⚑ Net Charge (pH {pH_value})',
1468
+ 'Prediction': '',
1469
+ 'Value': f"{analysis['net_charge']:.2f}",
1470
+ 'Unit': ''
1471
+ })
1472
+ results.append({
1473
+ 'Sequence': seq_display,
1474
+ 'Property': '🎯 Isoelectric Point',
1475
+ 'Prediction': '',
1476
+ 'Value': f"{analysis['isoelectric_point']:.2f}",
1477
+ 'Unit': 'pH'
1478
+ })
1479
+ hydro = analysis['hydrophobicity']
1480
+ if hydro <= -4.5:
1481
+ hydro_label = "Hydrophilic"
1482
+ elif hydro >= 4.5:
1483
+ hydro_label = "Hydrophobic"
1484
+ else:
1485
+ hydro_label = "Intermediate"
1486
+
1487
+ results.append({
1488
+ 'Sequence': seq_display,
1489
+ 'Property': 'πŸ’¦ Hydrophobicity (GRAVY)',
1490
+ 'Prediction': hydro_label,
1491
+ 'Value': f"{hydro:.2f}",
1492
+ 'Unit': 'GRAVY (Kyte-Doolittle)',
1493
+ })
1494
+ if input_type == "Sequence" and (tango_amyloid or tango_beta):
1495
+ try:
1496
+ # Run once per sequence
1497
+ tango_res = run_tango_for_sequence(
1498
+ seq,
1499
+ pH_value=pH_value,
1500
+ ident=f"seq{seq_idx+1}",
1501
+ params=None # override pH/te here if you want
1502
+ )
1503
+
1504
+ short_seq = seq[:30] + '...' if len(seq) > 30 else seq
1505
+
1506
+ if tango_amyloid and tango_res["tango_amylo_sum"] is not None:
1507
+ results.append({
1508
+ 'Sequence': short_seq,
1509
+ 'Property': "🧱 TANGO Amyloid Aggregation",
1510
+ 'Prediction': "",
1511
+ 'Value': f"{tango_res['tango_amylo_sum']:.3f}",
1512
+ 'Unit': "TANGO (sum)"
1513
+ })
1514
+
1515
+ if tango_beta and tango_res["tango_beta_sum"] is not None:
1516
+ results.append({
1517
+ 'Sequence': short_seq,
1518
+ 'Property': "🧬 TANGO β-sheet Aggregation",
1519
+ 'Prediction': "",
1520
+ 'Value': f"{tango_res['tango_beta_sum']:.3f}",
1521
+ 'Unit': "TANGO (sum)"
1522
+ })
1523
+
1524
+ except Exception as e:
1525
+ print(f"Error running TANGO for sequence {seq_idx+1}: {e}")
1526
+
1527
  # Handle binding affinity separately
1528
  if binding_affinity and input_text:
1529
  # Sequence–Sequence binding
 
1540
  protein_seq,
1541
  binder_seq
1542
  )
1543
+ kd_nM = pred.affinity_to_nM(affinity)
1544
+
1545
+ seq_label = f"Protein–{binder_seq[:20]}..."
1546
+ prop_base = pred.model_configs['binding_affinity']['display_name']
1547
+
1548
+ # Row 1: affinity score (pKd-like)
1549
  results.append({
1550
+ 'Sequence': seq_label,
1551
+ 'Property': f"{prop_base} (score)",
1552
+ 'Prediction': binding_class,
1553
  'Value': f"{affinity:.3f}",
1554
+ 'Unit': "Affinity score (pKd-like)",
1555
+ })
1556
+
1557
+ # Row 2: converted Kd in nM
1558
+ results.append({
1559
+ 'Sequence': seq_label,
1560
+ 'Property': f"{prop_base} (Kd est.)",
1561
+ 'Prediction': binding_class,
1562
+ 'Value': f"{kd_nM:.3g}",
1563
+ 'Unit': "nM (Kd/Ki/IC50)",
1564
  })
1565
  except Exception as e:
1566
  print(f"Error in sequence binding prediction: {e}")
 
1583
  protein_seq,
1584
  smi
1585
  )
1586
+ kd_nM = pred.affinity_to_nM(affinity)
1587
+
1588
+ seq_label = f"Protein–{smi[:20]}..."
1589
+ prop_base = pred.model_configs['binding_affinity_smiles']['display_name']
1590
+
1591
+ # Row 1: affinity score (pKd-like)
1592
  results.append({
1593
+ 'Sequence': seq_label,
1594
+ 'Property': f"{prop_base} (score)",
1595
+ 'Prediction': label, # Tight / Medium / Weak
1596
  'Value': f"{affinity:.3f}",
1597
+ 'Unit': "Affinity score (pKd-like)",
1598
+ })
1599
+
1600
+ # Row 2: converted Kd in nM
1601
+ results.append({
1602
+ 'Sequence': seq_label,
1603
+ 'Property': f"{prop_base} (Kd est.)",
1604
+ 'Prediction': label,
1605
+ 'Value': f"{kd_nM:.3g}",
1606
+ 'Unit': "nM (Kd/Ki/IC50)",
1607
  })
1608
  except Exception as e:
1609
  print(f"Error in SMILES binding prediction: {e}")
 
1697
  return "", ""
1698
 
1699
  def on_example_change(name: str):
1700
+ binder, protein = load_example(name)
1701
  show_protein = (name == "Protein-Peptide")
1702
  return (
1703
  gr.update(value=binder), # input_text
 
1737
  text-align: center;
1738
  margin-bottom: 10px !important;
1739
  }
1740
+ h3 {
1741
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
1742
  -webkit-background-clip: text;
1743
  -webkit-text-fill-color: transparent;
 
1764
  gr.Markdown(
1765
  """
1766
  # 🌐 PeptiVerse
1767
+ ### \t Peptide Property Predictions
1768
  """
1769
  )
1770
 
 
1813
  with gr.Column(scale=1):
1814
  with gr.Group():
1815
  gr.Markdown("### βš™οΈ Select Properties")
1816
+ with gr.Accordion("Physicochemical Properties", open=True):
1817
+ include_physicochemical = gr.Checkbox(
1818
+ label="πŸ§ͺ Calculate Basic Properties",
1819
+ value=True,
1820
+ info="MW, net charge, pI, hydrophobicity"
1821
+ )
1822
+
1823
+ pH_value = gr.Slider(
1824
+ minimum=0,
1825
+ maximum=14,
1826
+ value=7.0,
1827
+ step=0.1,
1828
+ label="pH for Net Charge",
1829
+ info="Physiological pH is ~7.4"
1830
+ )
1831
  with gr.Accordion("Sequence Properties", open=True):
1832
  hemolysis = gr.Checkbox(label="🩸 Hemolysis ↓", value=True)
1833
  solubility = gr.Checkbox(label="πŸ’§ Solubility ↑", value=True)
1834
  permeability = gr.Checkbox(label="πŸͺ£ Permeability ↑", value=False)
1835
  half_life = gr.Checkbox(label="⏱️ Half-life ↑", value=False)
1836
+ nonfouling = gr.Checkbox(label="πŸ‘― Non-Fouling ↑", value=False)
1837
+ tango_amyloid = gr.Checkbox(label="🧱 TANGO Amyloid Aggregation ↓", value=False)
1838
+ tango_beta = gr.Checkbox(label="🧬 TANGO Ξ²-sheet Aggregation ↓", value=False)
1839
  with gr.Accordion("Binding Prediction", open=False):
1840
  binding_affinity = gr.Checkbox(label="πŸ”— Binding Affinity ↑", value=False)
1841
  gr.Markdown("*Requires protein sequence input*")
 
1845
  with gr.Column(scale=1):
1846
  property_selector = gr.Dropdown(
1847
  choices=["hemolysis", "solubility", "permeability", "half_life (smiles)",
1848
+ "nonfouling", "binding_affinity", "tango_amyloid", "tango_beta"],
1849
  label="Select Property",
1850
  value="hemolysis"
1851
  )
 
1927
  input_text, input_type, protein_seq,
1928
  hemolysis, solubility, permeability,
1929
  half_life, nonfouling,
1930
+ binding_affinity, tango_amyloid, tango_beta, include_physicochemical, pH_value,
1931
  ],
1932
  outputs=[results_df, status_output]
1933
  )
description.md CHANGED
@@ -10,6 +10,7 @@ Our models are trained on curated datasets from multiple sources:
10
  - **Size:** 9,316 peptides, with 19.6% being positive (hemolytic) and 80.4% being negative (nonhemolytic)
11
  - **Description:** Probability of peptide disrupting red blood cell membranes.
12
  - **Download:** [hemo-positive.npz](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/hemo-positive.npz)
 
13
 
14
  #### Solubility Dataset
15
  - **Primary Source:** [PROSO-II](https://febs.onlinelibrary.wiley.com/doi/abs/10.1111/j.1742-4658.2012.08603.x)
@@ -45,6 +46,7 @@ Our models are trained on curated datasets from multiple sources:
45
  - **Description:** Binding probability normalized in PepLand already. It's a combination of IC50/EC50.
46
  - **Quality:** Binding class cutoffs: Tight ≥ 7.5, Medium 6.0–7.5, Weak < 6.0
47
  - **Download:** [binding_affinity_training_data.csv](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/c-binding.csv)
 
48
 
49
  ### Model Architecture
50
 
@@ -56,6 +58,22 @@ Our models are trained on curated datasets from multiple sources:
56
  ### Model Training and Weight Hosting
57
  - [Classifier_weights](https://huggingface.co/ChatterjeeLab/Classifier_Weight)
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  ### Citation
60
 
61
  If you use this tool, please cite:
 
10
  - **Size:** 9,316 peptides, with 19.6% being positive (hemolytic) and 80.4% being negative (nonhemolytic)
11
  - **Description:** Probability of peptide disrupting red blood cell membranes.
12
  - **Download:** [hemo-positive.npz](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/hemo-positive.npz)
13
+ - **Interpretation:** HC50 is the concentration (x µg/mL) at which 50% of red blood cells are lysed. If HC50 < 100 µM, the peptide is considered hemolytic; otherwise it is considered non-hemolytic.
14
 
15
  #### Solubility Dataset
16
  - **Primary Source:** [PROSO-II](https://febs.onlinelibrary.wiley.com/doi/abs/10.1111/j.1742-4658.2012.08603.x)
 
46
  - **Description:** Binding probability normalized in PepLand already. It's a combination of IC50/EC50.
47
  - **Quality:** Binding class cutoffs: Tight ≥ 7.5, Medium 6.0–7.5, Weak < 6.0
48
  - **Download:** [binding_affinity_training_data.csv](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/c-binding.csv)
49
+ - **Interpretation:** Affinity measure $= -\log_{10}(K)$, where $K$ is the binding constant (Kd/Ki/IC50) in molar units.
50
 
51
  ### Model Architecture
52
 
 
58
  ### Model Training and Weight Hosting
59
  - [Classifier_weights](https://huggingface.co/ChatterjeeLab/Classifier_Weight)
60
 
61
+ ### 🧪 Physicochemical Properties
62
+
63
+ #### Net Charge Calculation
64
+ - Uses Henderson-Hasselbalch equation
65
+ - pH-dependent calculation
66
+ - Considers all ionizable groups (K, R, H, D, E, C, Y, termini)
67
+
68
+ #### Isoelectric Point (pI)
69
+ - Bisection method to find pH where net charge = 0
70
+ - Precision: ±0.01 pH units
71
+
72
+ #### Hydrophobicity (GRAVY)
73
+ - Grand Average of Hydropathy
74
+ - Uses Kyte-Doolittle scale
75
+ - Range: -4.5 (hydrophilic) to +4.5 (hydrophobic)
76
+
77
  ### Citation
78
 
79
  If you use this tool, please cite:
tango_x86_64_release ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0e381c28f847487069b0df29bb9d4f766391066710500d3170ecb73d9f31dbf
3
+ size 211205
tokenizer/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/tokenizer/__pycache__/__init__.cpython-310.pyc and b/tokenizer/__pycache__/__init__.cpython-310.pyc differ
 
tokenizer/__pycache__/my_tokenizers.cpython-310.pyc CHANGED
Binary files a/tokenizer/__pycache__/my_tokenizers.cpython-310.pyc and b/tokenizer/__pycache__/my_tokenizers.cpython-310.pyc differ