Spaces:
Running
Running
Commit
·
82cd634
1
Parent(s):
4e0581e
upload model path
Browse files- app.py +215 -33
- description.md +10 -15
app.py
CHANGED
|
@@ -12,7 +12,74 @@ import json
|
|
| 12 |
import time
|
| 13 |
from typing import List, Dict, Any, Tuple, Optional
|
| 14 |
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
try:
|
| 17 |
from rdkit import Chem
|
| 18 |
from rdkit.Chem import Descriptors, AllChem
|
|
@@ -357,14 +424,13 @@ class PeptideCNN(nn.Module):
|
|
| 357 |
# ==================== Data Management ====================
|
| 358 |
|
| 359 |
class TrainingDataManager:
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
self.data_dir = Path(__file__).resolve().parent / data_dir
|
| 363 |
self.data_dir.mkdir(exist_ok=True)
|
| 364 |
self.statistics = self.load_statistics()
|
| 365 |
|
| 366 |
def _load_half_life_csv(self):
|
| 367 |
-
csv_path = self.data_dir / "half_life_smiles.csv"
|
| 368 |
if not csv_path.exists():
|
| 369 |
return None
|
| 370 |
try:
|
|
@@ -397,8 +463,8 @@ class TrainingDataManager:
|
|
| 397 |
Returns: {'values': y, 'unit': 'Class (0=neg, 1=pos)', 'kind': 'binary', 'n_pos': int, 'n_neg': int}
|
| 398 |
or None if missing.
|
| 399 |
"""
|
| 400 |
-
pos_path = self.data_dir / f"{prefix}-positive.npz"
|
| 401 |
-
neg_path = self.data_dir / f"{prefix}-negative.npz"
|
| 402 |
if not pos_path.exists() or not neg_path.exists():
|
| 403 |
return None
|
| 404 |
try:
|
|
@@ -420,6 +486,68 @@ class TrainingDataManager:
|
|
| 420 |
except Exception as e:
|
| 421 |
print(f"[TrainingDataManager] binary load error for '{prefix}': {e}")
|
| 422 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
|
| 424 |
def load_statistics(self):
|
| 425 |
"""Load pre-computed statistics for each property"""
|
|
@@ -472,7 +600,7 @@ class TrainingDataManager:
|
|
| 472 |
# Overlay real half-life
|
| 473 |
hl = self._load_half_life_csv()
|
| 474 |
if hl is not None:
|
| 475 |
-
stats["half_life"].update(hl)
|
| 476 |
|
| 477 |
# Overlay real solubility from sol-* (binary)
|
| 478 |
sol = self._load_binary_pair("sol")
|
|
@@ -487,6 +615,14 @@ class TrainingDataManager:
|
|
| 487 |
hemo = self._load_binary_pair("hemo")
|
| 488 |
if hemo is not None:
|
| 489 |
stats["hemolysis"].update(hemo)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
|
| 491 |
return stats
|
| 492 |
|
|
@@ -520,16 +656,39 @@ class TrainingDataManager:
|
|
| 520 |
# continuous
|
| 521 |
fig = go.Figure()
|
| 522 |
fig.add_trace(go.Histogram(x=vals, nbinsx=50, name="Training Data"))
|
|
|
|
|
|
|
| 523 |
if "threshold" in s and s["threshold"] is not None:
|
| 524 |
fig.add_vline(
|
| 525 |
-
x=s["threshold"],
|
| 526 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 527 |
)
|
|
|
|
|
|
|
| 528 |
if current_value is not None:
|
| 529 |
fig.add_vline(
|
| 530 |
-
x=current_value,
|
| 531 |
-
|
|
|
|
|
|
|
|
|
|
| 532 |
)
|
|
|
|
| 533 |
fig.update_layout(
|
| 534 |
title=f"{property_name.replace('_', ' ').title()} Distribution",
|
| 535 |
xaxis_title=s.get("unit", ""),
|
|
@@ -539,6 +698,7 @@ class TrainingDataManager:
|
|
| 539 |
)
|
| 540 |
return fig
|
| 541 |
|
|
|
|
| 542 |
def get_property_info(self, property_name):
|
| 543 |
if property_name not in self.statistics:
|
| 544 |
return None
|
|
@@ -606,7 +766,7 @@ class UnifiedPeptidePredictor:
|
|
| 606 |
self.model_configs = self.get_model_configs()
|
| 607 |
|
| 608 |
# Data manager
|
| 609 |
-
self.data_manager = TrainingDataManager()
|
| 610 |
self._protein_cache = {}
|
| 611 |
# Load models
|
| 612 |
self.load_all_models()
|
|
@@ -617,7 +777,7 @@ class UnifiedPeptidePredictor:
|
|
| 617 |
'hemolysis_seq': {
|
| 618 |
'type': 'xgboost',
|
| 619 |
'input': 'sequence',
|
| 620 |
-
'path': 'best_model_hemolysis.json',
|
| 621 |
'inverse_score': False,
|
| 622 |
'unit': 'Probability',
|
| 623 |
'display_name': 'π©Έ Hemolysis',
|
|
@@ -627,7 +787,7 @@ class UnifiedPeptidePredictor:
|
|
| 627 |
'hemolysis_smiles': {
|
| 628 |
'type': 'xgboost',
|
| 629 |
'input': 'smiles',
|
| 630 |
-
'path': 'hemolysis-xgboost_smiles.json',
|
| 631 |
'inverse_score': False,
|
| 632 |
'unit': 'Probability',
|
| 633 |
'display_name': 'π©Έ Hemolysis',
|
|
@@ -637,7 +797,7 @@ class UnifiedPeptidePredictor:
|
|
| 637 |
'solubility_seq': {
|
| 638 |
'type': 'xgboost',
|
| 639 |
'input': 'sequence',
|
| 640 |
-
'path': 'best_model_solubility.json',
|
| 641 |
'unit': 'Probability',
|
| 642 |
'display_name': 'π§ Solubility',
|
| 643 |
'positive_label': 'Soluble',
|
|
@@ -646,7 +806,7 @@ class UnifiedPeptidePredictor:
|
|
| 646 |
'solubility_smiles': {
|
| 647 |
'type': 'xgboost',
|
| 648 |
'input': 'smiles',
|
| 649 |
-
'path': 'solubility-xgboost_smiles.json',
|
| 650 |
'unit': 'Probability',
|
| 651 |
'display_name': 'π§ Solubility',
|
| 652 |
'positive_label': 'Soluble',
|
|
@@ -655,7 +815,7 @@ class UnifiedPeptidePredictor:
|
|
| 655 |
'permeability_smiles': {
|
| 656 |
'type': 'xgboost',
|
| 657 |
'input': 'smiles',
|
| 658 |
-
'path': 'permeability-xgboost_smiles.json',
|
| 659 |
'unit': 'Probability',
|
| 660 |
'display_name': 'πͺ£ Permeability',
|
| 661 |
'positive_label': 'Permeable',
|
|
@@ -664,7 +824,7 @@ class UnifiedPeptidePredictor:
|
|
| 664 |
'half_life_seq': {
|
| 665 |
'type': 'pytorch_cnn',
|
| 666 |
'input': 'sequence',
|
| 667 |
-
'path': 'best_model_half_life.pth',
|
| 668 |
'transform': lambda x: 10**x,
|
| 669 |
'unit': 'hours',
|
| 670 |
'display_name': 'β±οΈ Half-life',
|
|
@@ -674,7 +834,7 @@ class UnifiedPeptidePredictor:
|
|
| 674 |
'nonfouling_seq': {
|
| 675 |
'type': 'xgboost',
|
| 676 |
'input': 'sequence',
|
| 677 |
-
'path': 'best_model_nonfouling.json',
|
| 678 |
'unit': 'Probability',
|
| 679 |
'display_name': 'π― Non-Fouling',
|
| 680 |
'positive_label': 'Non-toxic',
|
|
@@ -683,7 +843,7 @@ class UnifiedPeptidePredictor:
|
|
| 683 |
'nonfouling_smiles': {
|
| 684 |
'type': 'xgboost',
|
| 685 |
'input': 'smiles',
|
| 686 |
-
'path': 'nonfouling-xgboost_smiles.json',
|
| 687 |
'unit': 'Probability',
|
| 688 |
'display_name': 'π― Non-Fouling',
|
| 689 |
'positive_label': 'Stable',
|
|
@@ -692,14 +852,14 @@ class UnifiedPeptidePredictor:
|
|
| 692 |
'binding_affinity': {
|
| 693 |
'type': 'binding',
|
| 694 |
'input': 'dual_sequence',
|
| 695 |
-
'path': 'binding_affinity_unpooled.pt',
|
| 696 |
'unit': 'Probability',
|
| 697 |
'display_name': 'π Binding Affinity'
|
| 698 |
},
|
| 699 |
'binding_affinity_smiles': {
|
| 700 |
'type': 'binding_smiles',
|
| 701 |
'input': 'sequence+smiles',
|
| 702 |
-
'path': 'binding-affinity_smiles.pt',
|
| 703 |
'unit': 'Probability',
|
| 704 |
'display_name': 'π Binding Affinity (SMILES)'
|
| 705 |
},
|
|
@@ -927,7 +1087,7 @@ def initialize():
|
|
| 927 |
"""Initialize the predictor"""
|
| 928 |
global predictor
|
| 929 |
if predictor is None:
|
| 930 |
-
predictor = UnifiedPeptidePredictor(model_dir=
|
| 931 |
return predictor
|
| 932 |
|
| 933 |
|
|
@@ -1175,7 +1335,21 @@ def load_example(example_name):
|
|
| 1175 |
return examples[example_name][0], ""
|
| 1176 |
return "", ""
|
| 1177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1178 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1179 |
# ==================== Gradio App ====================
|
| 1180 |
|
| 1181 |
custom_css = """
|
|
@@ -1202,7 +1376,18 @@ h1 {
|
|
| 1202 |
text-align: center;
|
| 1203 |
margin-bottom: 10px !important;
|
| 1204 |
}
|
| 1205 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1206 |
table {
|
| 1207 |
font-size: 14px !important;
|
| 1208 |
}
|
|
@@ -1217,8 +1402,8 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
|
|
| 1217 |
# Header
|
| 1218 |
gr.Markdown(
|
| 1219 |
"""
|
| 1220 |
-
#
|
| 1221 |
-
### Peptide Property Predictions
|
| 1222 |
"""
|
| 1223 |
)
|
| 1224 |
|
|
@@ -1319,7 +1504,6 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
|
|
| 1319 |
# Results Section
|
| 1320 |
with gr.Group():
|
| 1321 |
gr.Markdown("### π Results")
|
| 1322 |
-
gr.Markdown("*Click on property names to view distribution plots*")
|
| 1323 |
|
| 1324 |
results_df = gr.Dataframe(
|
| 1325 |
headers=["Sequence", "Property", "Prediction", "Value", "Unit"],
|
|
@@ -1340,8 +1524,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
|
|
| 1340 |
"""
|
| 1341 |
---
|
| 1342 |
<div style='text-align: center; color: #6b7280;'>
|
| 1343 |
-
<p>
|
| 1344 |
-
<p style='font-size: 0.9em;'>Click on property names in results to view training data distributions</p>
|
| 1345 |
</div>
|
| 1346 |
"""
|
| 1347 |
)
|
|
@@ -1349,7 +1532,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
|
|
| 1349 |
# Event Handlers
|
| 1350 |
def update_visibility(binding_checked):
    """Return a Gradio update whose ``visible`` flag mirrors ``binding_checked``."""
    visibility_update = gr.update(visible=binding_checked)
    return visibility_update
|
| 1352 |
-
|
| 1353 |
binding_affinity.change(
|
| 1354 |
update_visibility,
|
| 1355 |
inputs=[binding_affinity],
|
|
@@ -1357,11 +1540,10 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
|
|
| 1357 |
)
|
| 1358 |
|
| 1359 |
example_dropdown.change(
|
| 1360 |
-
|
| 1361 |
inputs=[example_dropdown],
|
| 1362 |
outputs=[input_text, protein_seq]
|
| 1363 |
)
|
| 1364 |
-
|
| 1365 |
predict_btn.click(
|
| 1366 |
predict_properties,
|
| 1367 |
inputs=[
|
|
|
|
| 12 |
import time
|
| 13 |
from typing import List, Dict, Any, Tuple, Optional
|
| 14 |
|
| 15 |
+
from huggingface_hub import snapshot_download
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
import os
|
| 18 |
+
|
| 19 |
+
def pick_assets_root() -> Path:
    """Pick the directory that holds downloaded models, data and caches.

    Candidates are tried in this order:

    1. ``HF_ASSETS_DIR`` -- an explicit override that wins over every
       auto-detected location.
    2. ``/home/user/assets`` -- when ``SPACE_ID`` is set (Hugging Face
       Spaces) or ``/home/user`` exists.
    3. ``~/assets``, ``./assets`` and ``/tmp/assets`` as local fallbacks.

    Returns:
        The first candidate directory that already exists or could be created.

    Raises:
        OSError: If the directory named by ``HF_ASSETS_DIR`` cannot be created.
        RuntimeError: If none of the auto-detected candidates can be created.
    """
    # The override is checked first; otherwise it could never take effect on
    # a machine where the Spaces location is usable. A failure here is
    # surfaced instead of silently falling back to another directory.
    override = os.environ.get("HF_ASSETS_DIR")
    if override:
        root = Path(override)
        root.mkdir(parents=True, exist_ok=True)
        return root

    candidates = []
    spaces_root = Path("/home/user/assets")
    if os.environ.get("SPACE_ID") or spaces_root.parent.exists():
        candidates.append(spaces_root)
    candidates += [Path.home() / "assets", Path.cwd() / "assets", Path("/tmp/assets")]

    for candidate in candidates:
        try:
            candidate.mkdir(parents=True, exist_ok=True)
        except OSError:
            # Not creatable here (permissions, read-only FS, ...): try the next one.
            continue
        return candidate
    raise RuntimeError("No writable assets directory found.")
|
| 43 |
+
|
| 44 |
+
# Root directory under which models, training data and caches are stored.
ASSETS = pick_assets_root()

# Point the Hugging Face / XDG caches and the temp dir at sub-directories of
# ASSETS. setdefault leaves a value that is already present in the
# environment untouched; the default directory is created either way.
for k, v in (
    ("HF_HOME", str(ASSETS / "hf")),
    ("HUGGINGFACE_HUB_CACHE", str(ASSETS / "hf" / "cache")),
    ("TRANSFORMERS_CACHE", str(ASSETS / "transformers")),
    ("HF_DATASETS_CACHE", str(ASSETS / "hf" / "datasets")),
    ("XDG_CACHE_HOME", str(ASSETS / "xdg")),
    ("TMPDIR", str(ASSETS / "tmp")),
):
    os.environ.setdefault(k, v)
    Path(v).mkdir(parents=True, exist_ok=True)

# Download targets for model weights and training data.
ASSETS_MODELS = ASSETS / "models"
ASSETS_MODELS.mkdir(parents=True, exist_ok=True)
ASSETS_DATA = ASSETS / "training_data"
ASSETS_DATA.mkdir(parents=True, exist_ok=True)

# Hub repositories to pull from (both currently name the same repository).
MODEL_REPO = "ChatterjeeLab/Classifier_Weight"
DATASET_REPO = "ChatterjeeLab/Classifier_Weight"
|
| 63 |
+
|
| 64 |
+
def fetch_models_and_data():
    """Download model weights and training data from the Hugging Face Hub.

    Two snapshots are pulled: one from ``MODEL_REPO`` into ``ASSETS_MODELS``
    and one from ``DATASET_REPO`` into ``ASSETS_DATA``. ``allow_patterns``
    restricts each download to the listed file types under the repository's
    ``models/`` and ``training_data/`` folders respectively.
    """
    model_patterns = [
        "models/*.pt", "models/*.pth", "models/*.ckpt", "models/*.safetensors",
        "models/*.json", "models/*.yaml", "models/*.yml",
    ]
    data_patterns = ["training_data/*.csv", "training_data/*.npz", "training_data/*.md"]

    # Neither download passes repo_type, so both rely on snapshot_download's default.
    downloads = (
        (MODEL_REPO, ASSETS_MODELS, model_patterns),
        (DATASET_REPO, ASSETS_DATA, data_patterns),
    )
    for repo, target_dir, patterns in downloads:
        snapshot_download(
            repo_id=repo,
            local_dir=str(target_dir),
            local_dir_use_symlinks=True,
            allow_patterns=patterns,
        )
|
| 80 |
+
|
| 81 |
+
fetch_models_and_data()
|
| 82 |
+
|
| 83 |
try:
|
| 84 |
from rdkit import Chem
|
| 85 |
from rdkit.Chem import Descriptors, AllChem
|
|
|
|
| 424 |
# ==================== Data Management ====================
|
| 425 |
|
| 426 |
class TrainingDataManager:
|
| 427 |
+
def __init__(self, data_dir=ASSETS_DATA):
    """Set up the data directory and load the per-property statistics.

    Args:
        data_dir: Directory holding the training data; defaults to
            ``ASSETS_DATA``. It is created if it does not exist yet.
    """
    self.data_dir = Path(data_dir)
    # parents=True so that a nested, not-yet-existing data_dir is created
    # instead of raising FileNotFoundError.
    self.data_dir.mkdir(parents=True, exist_ok=True)
    self.statistics = self.load_statistics()
|
| 431 |
|
| 432 |
def _load_half_life_csv(self):
|
| 433 |
+
csv_path = self.data_dir / "training_data/half_life_smiles.csv"
|
| 434 |
if not csv_path.exists():
|
| 435 |
return None
|
| 436 |
try:
|
|
|
|
| 463 |
Returns: {'values': y, 'unit': 'Class (0=neg, 1=pos)', 'kind': 'binary', 'n_pos': int, 'n_neg': int}
|
| 464 |
or None if missing.
|
| 465 |
"""
|
| 466 |
+
pos_path = self.data_dir / f"training_data/{prefix}-positive.npz"
|
| 467 |
+
neg_path = self.data_dir / f"training_data/{prefix}-negative.npz"
|
| 468 |
if not pos_path.exists() or not neg_path.exists():
|
| 469 |
return None
|
| 470 |
try:
|
|
|
|
| 486 |
except Exception as e:
|
| 487 |
print(f"[TrainingDataManager] binary load error for '{prefix}': {e}")
|
| 488 |
return None
|
| 489 |
+
|
| 490 |
+
def _load_binding_affinity_csv(self):
|
| 491 |
+
"""
|
| 492 |
+
Read c-binding.csv and return the raw affinity values (pKd/pKi-like, i.e., -log scale).
|
| 493 |
+
No filtering/clipping β only numeric conversion with NaNs dropped so plotting works.
|
| 494 |
+
"""
|
| 495 |
+
csv_path = self.data_dir / "training_data/c-binding.csv"
|
| 496 |
+
if not csv_path.exists():
|
| 497 |
+
return None
|
| 498 |
+
try:
|
| 499 |
+
df = pd.read_csv(csv_path)
|
| 500 |
+
if "affinity" not in df.columns:
|
| 501 |
+
raise ValueError("CSV must contain an 'affinity' column.")
|
| 502 |
+
|
| 503 |
+
vals = pd.to_numeric(df["affinity"], errors="coerce").dropna().to_numpy()
|
| 504 |
+
if len(vals) == 0:
|
| 505 |
+
return None
|
| 506 |
+
|
| 507 |
+
return {
|
| 508 |
+
"values": vals,
|
| 509 |
+
"description": "Proteinβligand binding affinity normalized",
|
| 510 |
+
"unit": "score",
|
| 511 |
+
"threshold": 7.5, # main threshold (tight)
|
| 512 |
+
"threshold_secondary": 6.0, # weak threshold
|
| 513 |
+
"kind": "continuous",
|
| 514 |
+
"download_link": str(csv_path),
|
| 515 |
+
}
|
| 516 |
+
except Exception as e:
|
| 517 |
+
print(f"[TrainingDataManager] binding-affinity load error: {e}")
|
| 518 |
+
return None
|
| 519 |
+
|
| 520 |
+
def _load_permeability_pampa_csv(self):
|
| 521 |
+
"""
|
| 522 |
+
Load PAMPA permeability values from training_data/nc-CPP-processed.csv.
|
| 523 |
+
Expects columns: 'SMILES','PAMPA'. We only parse PAMPA as float; NaNs are dropped.
|
| 524 |
+
No filtering/clipping.
|
| 525 |
+
"""
|
| 526 |
+
csv_path = self.data_dir / "training_data/nc-CPP-processed.csv"
|
| 527 |
+
if not csv_path.exists():
|
| 528 |
+
return None
|
| 529 |
+
try:
|
| 530 |
+
df = pd.read_csv(csv_path)
|
| 531 |
+
if "PAMPA" not in df.columns:
|
| 532 |
+
raise ValueError("CSV must contain a 'PAMPA' column.")
|
| 533 |
+
|
| 534 |
+
vals = pd.to_numeric(df["PAMPA"], errors="coerce").dropna().to_numpy()
|
| 535 |
+
if len(vals) == 0:
|
| 536 |
+
return None
|
| 537 |
+
|
| 538 |
+
# Use the conventional log Peff unit; keep your prior display threshold (-4.0) or set median
|
| 539 |
+
threshold_default = float(np.median(vals))
|
| 540 |
+
return {
|
| 541 |
+
"values": vals,
|
| 542 |
+
"description": "Cell membrane permeability measurements",
|
| 543 |
+
"unit": "log Peff",
|
| 544 |
+
"threshold": threshold_default,
|
| 545 |
+
"kind": "continuous",
|
| 546 |
+
"download_link": str(csv_path),
|
| 547 |
+
}
|
| 548 |
+
except Exception as e:
|
| 549 |
+
print(f"[TrainingDataManager] permeability PAMPA load error: {e}")
|
| 550 |
+
return None
|
| 551 |
|
| 552 |
def load_statistics(self):
|
| 553 |
"""Load pre-computed statistics for each property"""
|
|
|
|
| 600 |
# Overlay real half-life
|
| 601 |
hl = self._load_half_life_csv()
|
| 602 |
if hl is not None:
|
| 603 |
+
stats["half_life (smiles)"].update(hl)
|
| 604 |
|
| 605 |
# Overlay real solubility from sol-* (binary)
|
| 606 |
sol = self._load_binary_pair("sol")
|
|
|
|
| 615 |
hemo = self._load_binary_pair("hemo")
|
| 616 |
if hemo is not None:
|
| 617 |
stats["hemolysis"].update(hemo)
|
| 618 |
+
|
| 619 |
+
ba = self._load_binding_affinity_csv()
|
| 620 |
+
if ba is not None:
|
| 621 |
+
stats["binding_affinity"].update(ba)
|
| 622 |
+
|
| 623 |
+
pampa = self._load_permeability_pampa_csv()
|
| 624 |
+
if pampa is not None:
|
| 625 |
+
stats["permeability"].update(pampa)
|
| 626 |
|
| 627 |
return stats
|
| 628 |
|
|
|
|
| 656 |
# continuous
|
| 657 |
fig = go.Figure()
|
| 658 |
fig.add_trace(go.Histogram(x=vals, nbinsx=50, name="Training Data"))
|
| 659 |
+
|
| 660 |
+
# Primary threshold (if any)
|
| 661 |
if "threshold" in s and s["threshold"] is not None:
|
| 662 |
fig.add_vline(
|
| 663 |
+
x=float(s["threshold"]),
|
| 664 |
+
line_dash="dash",
|
| 665 |
+
line_color="purple" if property_name == "binding_affinity" else "red",
|
| 666 |
+
annotation_text=(
|
| 667 |
+
"Tight threshold: {:.3f}".format(float(s["threshold"]))
|
| 668 |
+
if property_name == "binding_affinity"
|
| 669 |
+
else "Threshold: {:.3f}".format(float(s["threshold"]))
|
| 670 |
+
),
|
| 671 |
+
)
|
| 672 |
+
|
| 673 |
+
# Secondary threshold for binding (weak)
|
| 674 |
+
if property_name == "binding_affinity" and "threshold_secondary" in s and s["threshold_secondary"] is not None:
|
| 675 |
+
fig.add_vline(
|
| 676 |
+
x=float(s["threshold_secondary"]),
|
| 677 |
+
line_dash="dash",
|
| 678 |
+
line_color="orange",
|
| 679 |
+
annotation_text="Weak threshold: {:.3f}".format(float(s["threshold_secondary"])),
|
| 680 |
)
|
| 681 |
+
|
| 682 |
+
# Current value
|
| 683 |
if current_value is not None:
|
| 684 |
fig.add_vline(
|
| 685 |
+
x=float(current_value),
|
| 686 |
+
line_dash="solid",
|
| 687 |
+
line_color="green",
|
| 688 |
+
line_width=3,
|
| 689 |
+
annotation_text=f"Your Result: {float(current_value):.3f}",
|
| 690 |
)
|
| 691 |
+
|
| 692 |
fig.update_layout(
|
| 693 |
title=f"{property_name.replace('_', ' ').title()} Distribution",
|
| 694 |
xaxis_title=s.get("unit", ""),
|
|
|
|
| 698 |
)
|
| 699 |
return fig
|
| 700 |
|
| 701 |
+
|
| 702 |
def get_property_info(self, property_name):
|
| 703 |
if property_name not in self.statistics:
|
| 704 |
return None
|
|
|
|
| 766 |
self.model_configs = self.get_model_configs()
|
| 767 |
|
| 768 |
# Data manager
|
| 769 |
+
self.data_manager = TrainingDataManager(data_dir=ASSETS_DATA)
|
| 770 |
self._protein_cache = {}
|
| 771 |
# Load models
|
| 772 |
self.load_all_models()
|
|
|
|
| 777 |
'hemolysis_seq': {
|
| 778 |
'type': 'xgboost',
|
| 779 |
'input': 'sequence',
|
| 780 |
+
'path': 'models/best_model_hemolysis.json',
|
| 781 |
'inverse_score': False,
|
| 782 |
'unit': 'Probability',
|
| 783 |
'display_name': 'π©Έ Hemolysis',
|
|
|
|
| 787 |
'hemolysis_smiles': {
|
| 788 |
'type': 'xgboost',
|
| 789 |
'input': 'smiles',
|
| 790 |
+
'path': 'models/hemolysis-xgboost_smiles.json',
|
| 791 |
'inverse_score': False,
|
| 792 |
'unit': 'Probability',
|
| 793 |
'display_name': 'π©Έ Hemolysis',
|
|
|
|
| 797 |
'solubility_seq': {
|
| 798 |
'type': 'xgboost',
|
| 799 |
'input': 'sequence',
|
| 800 |
+
'path': 'models/best_model_solubility.json',
|
| 801 |
'unit': 'Probability',
|
| 802 |
'display_name': 'π§ Solubility',
|
| 803 |
'positive_label': 'Soluble',
|
|
|
|
| 806 |
'solubility_smiles': {
|
| 807 |
'type': 'xgboost',
|
| 808 |
'input': 'smiles',
|
| 809 |
+
'path': 'models/solubility-xgboost_smiles.json',
|
| 810 |
'unit': 'Probability',
|
| 811 |
'display_name': 'π§ Solubility',
|
| 812 |
'positive_label': 'Soluble',
|
|
|
|
| 815 |
'permeability_smiles': {
|
| 816 |
'type': 'xgboost',
|
| 817 |
'input': 'smiles',
|
| 818 |
+
'path': 'models/permeability-xgboost_smiles.json',
|
| 819 |
'unit': 'Probability',
|
| 820 |
'display_name': 'πͺ£ Permeability',
|
| 821 |
'positive_label': 'Permeable',
|
|
|
|
| 824 |
'half_life_seq': {
|
| 825 |
'type': 'pytorch_cnn',
|
| 826 |
'input': 'sequence',
|
| 827 |
+
'path': 'models/best_model_half_life.pth',
|
| 828 |
'transform': lambda x: 10**x,
|
| 829 |
'unit': 'hours',
|
| 830 |
'display_name': 'β±οΈ Half-life',
|
|
|
|
| 834 |
'nonfouling_seq': {
|
| 835 |
'type': 'xgboost',
|
| 836 |
'input': 'sequence',
|
| 837 |
+
'path': 'models/best_model_nonfouling.json',
|
| 838 |
'unit': 'Probability',
|
| 839 |
'display_name': 'π― Non-Fouling',
|
| 840 |
'positive_label': 'Non-toxic',
|
|
|
|
| 843 |
'nonfouling_smiles': {
|
| 844 |
'type': 'xgboost',
|
| 845 |
'input': 'smiles',
|
| 846 |
+
'path': 'models/nonfouling-xgboost_smiles.json',
|
| 847 |
'unit': 'Probability',
|
| 848 |
'display_name': 'π― Non-Fouling',
|
| 849 |
'positive_label': 'Stable',
|
|
|
|
| 852 |
'binding_affinity': {
|
| 853 |
'type': 'binding',
|
| 854 |
'input': 'dual_sequence',
|
| 855 |
+
'path': 'models/binding_affinity_unpooled.pt',
|
| 856 |
'unit': 'Probability',
|
| 857 |
'display_name': 'π Binding Affinity'
|
| 858 |
},
|
| 859 |
'binding_affinity_smiles': {
|
| 860 |
'type': 'binding_smiles',
|
| 861 |
'input': 'sequence+smiles',
|
| 862 |
+
'path': 'models/binding-affinity_smiles.pt',
|
| 863 |
'unit': 'Probability',
|
| 864 |
'display_name': 'π Binding Affinity (SMILES)'
|
| 865 |
},
|
|
|
|
| 1087 |
"""Initialize the predictor"""
|
| 1088 |
global predictor
|
| 1089 |
if predictor is None:
|
| 1090 |
+
predictor = UnifiedPeptidePredictor(model_dir=ASSETS_MODELS)
|
| 1091 |
return predictor
|
| 1092 |
|
| 1093 |
|
|
|
|
| 1335 |
return examples[example_name][0], ""
|
| 1336 |
return "", ""
|
| 1337 |
|
| 1338 |
+
def on_example_change(name: str):
    """Fill the input fields for the example chosen in the dropdown.

    Args:
        name: Key of the selected example, passed on to ``load_example``.

    Returns:
        A pair of ``gr.update`` objects: the first sets the value of the
        input text, the second sets the protein-sequence value and makes
        that field visible only for the "Protein-Peptide" example.
    """
    binder_value, protein_value = load_example(name)
    input_update = gr.update(value=binder_value)
    protein_update = gr.update(
        value=protein_value,
        visible=name == "Protein-Peptide",
    )
    return input_update, protein_update
|
| 1345 |
|
| 1346 |
+
def on_example_load(name: str):
    """Return the field updates for example ``name``.

    This handler produces exactly what ``on_example_change`` produces, so it
    delegates to it instead of repeating the logic; the two cannot drift
    apart.

    Args:
        name: Key of the example to load.

    Returns:
        The ``(input_text update, protein_seq update)`` pair returned by
        ``on_example_change``.
    """
    return on_example_change(name)
|
| 1353 |
# ==================== Gradio App ====================
|
| 1354 |
|
| 1355 |
custom_css = """
|
|
|
|
| 1376 |
text-align: center;
|
| 1377 |
margin-bottom: 10px !important;
|
| 1378 |
}
|
| 1379 |
+
h3 {
|
| 1380 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 1381 |
+
-webkit-background-clip: text;
|
| 1382 |
+
-webkit-text-fill-color: transparent;
|
| 1383 |
+
text-align: center !important;
|
| 1384 |
+
font-size: 1.3em !important;
|
| 1385 |
+
margin-top: -5px !important;
|
| 1386 |
+
}
|
| 1387 |
+
.gr-form {
|
| 1388 |
+
border-radius: 12px !important;
|
| 1389 |
+
border-color: #e5e7eb !important;
|
| 1390 |
+
}
|
| 1391 |
table {
|
| 1392 |
font-size: 14px !important;
|
| 1393 |
}
|
|
|
|
| 1402 |
# Header
|
| 1403 |
gr.Markdown(
|
| 1404 |
"""
|
| 1405 |
+
# π PeptiVerse
|
| 1406 |
+
### \t Peptide Property Predictions
|
| 1407 |
"""
|
| 1408 |
)
|
| 1409 |
|
|
|
|
| 1504 |
# Results Section
|
| 1505 |
with gr.Group():
|
| 1506 |
gr.Markdown("### π Results")
|
|
|
|
| 1507 |
|
| 1508 |
results_df = gr.Dataframe(
|
| 1509 |
headers=["Sequence", "Property", "Prediction", "Value", "Unit"],
|
|
|
|
| 1524 |
"""
|
| 1525 |
---
|
| 1526 |
<div style='text-align: center; color: #6b7280;'>
|
| 1527 |
+
<p>Please Cite Us.</p>
|
|
|
|
| 1528 |
</div>
|
| 1529 |
"""
|
| 1530 |
)
|
|
|
|
| 1532 |
# Event Handlers
|
| 1533 |
def update_visibility(binding_checked):
    """Return a Gradio update whose ``visible`` flag mirrors ``binding_checked``."""
    visibility_update = gr.update(visible=binding_checked)
    return visibility_update
|
| 1535 |
+
|
| 1536 |
binding_affinity.change(
|
| 1537 |
update_visibility,
|
| 1538 |
inputs=[binding_affinity],
|
|
|
|
| 1540 |
)
|
| 1541 |
|
| 1542 |
example_dropdown.change(
|
| 1543 |
+
on_example_change,
|
| 1544 |
inputs=[example_dropdown],
|
| 1545 |
outputs=[input_text, protein_seq]
|
| 1546 |
)
|
|
|
|
| 1547 |
predict_btn.click(
|
| 1548 |
predict_properties,
|
| 1549 |
inputs=[
|
description.md
CHANGED
|
@@ -5,29 +5,29 @@
|
|
| 5 |
Our models are trained on curated datasets from multiple sources:
|
| 6 |
|
| 7 |
#### Hemolysis Dataset
|
| 8 |
-
- **Primary Source:** [
|
| 9 |
-
- **Secondary Source:**
|
| 10 |
- **Size:** 9,316 peptides, with 19.6% being positive (hemolytic) and 80.4% being negative (nonhemolytic)
|
| 11 |
- **Description:** Probability of peptide disrupting red blood cell membranes.
|
| 12 |
- **Download:** [hemolysis_training_data.csv](#)
|
| 13 |
|
| 14 |
#### Solubility Dataset
|
| 15 |
-
- **Primary Source:** [
|
| 16 |
-
- **Secondary Source:**
|
| 17 |
- **Size:** 18,453 sequences, with 47.6% being labeled as positives and 52.4% being labeled as negatives
|
| 18 |
- **Description:** Probability of peptide remaining dissolved in aqueous conditions.
|
| 19 |
- **Download:** [solubility_training_data.csv](#)
|
| 20 |
|
| 21 |
#### Non-Fouling Dataset
|
| 22 |
-
- **Primary Source:** [
|
| 23 |
-
- **Secondary Source:** [
|
| 24 |
- **Size:** 3,600 positive, 13,585 negative
|
| 25 |
- **Description:** A nonfouling peptide resists nonspecific interactions and protein adsorption.
|
| 26 |
- **Download:** [nonfouling_training_data.csv](#)
|
| 27 |
|
| 28 |
#### Permeability Dataset
|
| 29 |
-
- **Primary Source:** [
|
| 30 |
-
- **Secondary Source:**
|
| 31 |
- **Size:** 1,162 positive and 1,162 negative canonical samples (drawn from 22 relevant cell-penetrating peptide databases identified by compiling the literature on existing cell-penetrating peptide prediction models); CycPeptMPDB provides an additional 7,334 cyclic peptides
|
| 32 |
- **Description:** Probability of peptide penetrating the cell membrane.
|
| 33 |
- **Download:** [permeability_training_data.csv](#)
|
|
@@ -57,14 +57,9 @@ Our models are trained on curated datasets from multiple sources:
|
|
| 57 |
|
| 58 |
If you use this tool, please cite:
|
| 59 |
```
|
| 60 |
-
|
| 61 |
-
title={PeptiProp: Unified Platform for Peptide Property Prediction},
|
| 62 |
-
author={Your Name et al.},
|
| 63 |
-
journal={Journal Name},
|
| 64 |
-
year={2024}
|
| 65 |
-
}
|
| 66 |
```
|
| 67 |
|
| 68 |
### Contact
|
| 69 |
|
| 70 |
-
For questions or collaborations: [
|
|
|
|
| 5 |
Our models are trained on curated datasets from multiple sources:
|
| 6 |
|
| 7 |
#### Hemolysis Dataset
|
| 8 |
+
- **Primary Source:** [the Database of Antimicrobial Activity and Structure of Peptides (DBAASPv3)](https://academic.oup.com/nar/article-abstract/49/D1/D288/5957160)
|
| 9 |
+
- **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
|
| 10 |
- **Size:** 9,316 peptides, with 19.6% being positive (hemolytic) and 80.4% being negative (nonhemolytic)
|
| 11 |
- **Description:** Probability of peptide disrupting red blood cell membranes.
|
| 12 |
- **Download:** [hemolysis_training_data.csv](#)
|
| 13 |
|
| 14 |
#### Solubility Dataset
|
| 15 |
+
- **Primary Source:** [PROSO-II](https://febs.onlinelibrary.wiley.com/doi/abs/10.1111/j.1742-4658.2012.08603.x)
|
| 16 |
+
- **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
|
| 17 |
- **Size:** 18,453 sequences, with 47.6% being labeled as positives and 52.4% being labeled as negatives
|
| 18 |
- **Description:** Probability of peptide remaining dissolved in aqueous conditions.
|
| 19 |
- **Download:** [solubility_training_data.csv](#)
|
| 20 |
|
| 21 |
#### Non-Fouling Dataset
|
| 22 |
+
- **Primary Source:** [Classifying antimicrobial and multifunctional peptides with Bayesian network models](https://doi.org/10.1002/pep2.24079)
|
| 23 |
+
- **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
|
| 24 |
- **Size:** 3,600 positive, 13,585 negative
|
| 25 |
- **Description:** A nonfouling peptide resists nonspecific interactions and protein adsorption.
|
| 26 |
- **Download:** [nonfouling_training_data.csv](#)
|
| 27 |
|
| 28 |
#### Permeability Dataset
|
| 29 |
+
- **Primary Source:** [CycPeptMPDB](https://pubs.acs.org/doi/abs/10.1021/acs.jcim.2c01573), [PAMPA](https://doi.org/10.1517/17425255.1.2.325)
|
| 30 |
+
- **Secondary Source:** [PepLand](https://arxiv.org/abs/2311.04419)
|
| 31 |
- **Size:** 1,162 positive and 1,162 negative canonical samples (drawn from 22 relevant cell-penetrating peptide databases identified by compiling the literature on existing cell-penetrating peptide prediction models); CycPeptMPDB provides an additional 7,334 cyclic peptides
|
| 32 |
- **Description:** Probability of peptide penetrating the cell membrane.
|
| 33 |
- **Download:** [permeability_training_data.csv](#)
|
|
|
|
| 57 |
|
| 58 |
If you use this tool, please cite:
|
| 59 |
```
|
| 60 |
+
place holder
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
```
|
| 62 |
|
| 63 |
### Contact
|
| 64 |
|
| 65 |
+
For questions or collaborations: [yzhang@u.duke.nus.edu](mailto:yzhang@u.duke.nus.edu)
|