Spaces:

ChatterjeeLab
/

PeptiVerse

Running

App Files Files Community

yinuozhang committed on Nov 10, 2025

Commit

82cd634

1 Parent(s): 4e0581e

upload model path

Browse files

Files changed (2) hide show

app.py +215 -33
description.md +10 -15

app.py CHANGED Viewed

@@ -12,7 +12,74 @@ import json
 import time
 from typing import List, Dict, Any, Tuple, Optional
-# Try to import RDKit for SMILES support
 try:
     from rdkit import Chem
     from rdkit.Chem import Descriptors, AllChem
@@ -357,14 +424,13 @@ class PeptideCNN(nn.Module):
 # ==================== Data Management ====================
 class TrainingDataManager:
-    """Manage training data statistics and distributions"""
-    def __init__(self, data_dir="training_data"):
-        self.data_dir = Path(__file__).resolve().parent / data_dir
         self.data_dir.mkdir(exist_ok=True)
         self.statistics = self.load_statistics()
     def _load_half_life_csv(self):
-        csv_path = self.data_dir / "half_life_smiles.csv"
         if not csv_path.exists():
             return None
         try:
@@ -397,8 +463,8 @@ class TrainingDataManager:
         Returns: {'values': y, 'unit': 'Class (0=neg, 1=pos)', 'kind': 'binary', 'n_pos': int, 'n_neg': int}
         or None if missing.
         """
-        pos_path = self.data_dir / f"{prefix}-positive.npz"
-        neg_path = self.data_dir / f"{prefix}-negative.npz"
         if not pos_path.exists() or not neg_path.exists():
             return None
         try:
@@ -420,6 +486,68 @@ class TrainingDataManager:
         except Exception as e:
             print(f"[TrainingDataManager] binary load error for '{prefix}': {e}")
             return None
     def load_statistics(self):
         """Load pre-computed statistics for each property"""
@@ -472,7 +600,7 @@ class TrainingDataManager:
         # Overlay real half-life
         hl = self._load_half_life_csv()
         if hl is not None:
-            stats["half_life"].update(hl)
         # Overlay real solubility from sol-* (binary)
         sol = self._load_binary_pair("sol")
@@ -487,6 +615,14 @@ class TrainingDataManager:
         hemo = self._load_binary_pair("hemo")
         if hemo is not None:
             stats["hemolysis"].update(hemo)
         return stats
@@ -520,16 +656,39 @@ class TrainingDataManager:
         # continuous
         fig = go.Figure()
         fig.add_trace(go.Histogram(x=vals, nbinsx=50, name="Training Data"))
         if "threshold" in s and s["threshold"] is not None:
             fig.add_vline(
-                x=s["threshold"], line_dash="dash", line_color="red",
-                annotation_text=f"Threshold: {s['threshold']:.3f}"
             )
         if current_value is not None:
             fig.add_vline(
-                x=current_value, line_dash="solid", line_color="green", line_width=3,
-                annotation_text=f"Your Result: {current_value:.3f}"
             )
         fig.update_layout(
             title=f"{property_name.replace('_', ' ').title()} Distribution",
             xaxis_title=s.get("unit", ""),
@@ -539,6 +698,7 @@ class TrainingDataManager:
         )
         return fig
     def get_property_info(self, property_name):
         if property_name not in self.statistics:
             return None
@@ -606,7 +766,7 @@ class UnifiedPeptidePredictor:
         self.model_configs = self.get_model_configs()
         # Data manager
-        self.data_manager = TrainingDataManager()
         self._protein_cache = {}
         # Load models
         self.load_all_models()
@@ -617,7 +777,7 @@ class UnifiedPeptidePredictor:
             'hemolysis_seq': {
                 'type': 'xgboost',
                 'input': 'sequence',
-                'path': 'best_model_hemolysis.json',
                 'inverse_score': False,
                 'unit': 'Probability',
                 'display_name': '🩸 Hemolysis',
@@ -627,7 +787,7 @@ class UnifiedPeptidePredictor:
             'hemolysis_smiles': {
                 'type': 'xgboost',
                 'input': 'smiles',
-                'path': 'hemolysis-xgboost_smiles.json',
                 'inverse_score': False,
                 'unit': 'Probability',
                 'display_name': '🩸 Hemolysis',
@@ -637,7 +797,7 @@ class UnifiedPeptidePredictor:
             'solubility_seq': {
                 'type': 'xgboost',
                 'input': 'sequence',
-                'path': 'best_model_solubility.json',
                 'unit': 'Probability',
                 'display_name': '💧 Solubility',
                 'positive_label': 'Soluble',
@@ -646,7 +806,7 @@ class UnifiedPeptidePredictor:
             'solubility_smiles': {
                 'type': 'xgboost',
                 'input': 'smiles',
-                'path': 'solubility-xgboost_smiles.json',
                 'unit': 'Probability',
                 'display_name': '💧 Solubility',
                 'positive_label': 'Soluble',
@@ -655,7 +815,7 @@ class UnifiedPeptidePredictor:
             'permeability_smiles': {
                 'type': 'xgboost',
                 'input': 'smiles',
-                'path': 'permeability-xgboost_smiles.json',
                 'unit': 'Probability',
                 'display_name': '🪣 Permeability',
                 'positive_label': 'Permeable',
@@ -664,7 +824,7 @@ class UnifiedPeptidePredictor:
             'half_life_seq': {
                 'type': 'pytorch_cnn',
                 'input': 'sequence',
-                'path': 'best_model_half_life.pth',
                 'transform': lambda x: 10**x,
                 'unit': 'hours',
                 'display_name': '⏱️ Half-life',
@@ -674,7 +834,7 @@ class UnifiedPeptidePredictor:
             'nonfouling_seq': {
                 'type': 'xgboost',
                 'input': 'sequence',
-                'path': 'best_model_nonfouling.json',
                 'unit': 'Probability',
                 'display_name': '👯 Non-Fouling',
                 'positive_label': 'Non-toxic',
@@ -683,7 +843,7 @@ class UnifiedPeptidePredictor:
             'nonfouling_smiles': {
                 'type': 'xgboost',
                 'input': 'smiles',
-                'path': 'nonfouling-xgboost_smiles.json',
                 'unit': 'Probability',
                 'display_name': '👯 Non-Fouling',
                 'positive_label': 'Stable',
@@ -692,14 +852,14 @@ class UnifiedPeptidePredictor:
             'binding_affinity': {
                 'type': 'binding',
                 'input': 'dual_sequence',
-                'path': 'binding_affinity_unpooled.pt',
                 'unit': 'Probability',
                 'display_name': '🔗 Binding Affinity'
             },
             'binding_affinity_smiles': {
                 'type': 'binding_smiles',
                 'input': 'sequence+smiles',
-                'path': 'binding-affinity_smiles.pt',
                 'unit': 'Probability',
                 'display_name': '🔗 Binding Affinity (SMILES)'
             },
@@ -927,7 +1087,7 @@ def initialize():
     """Initialize the predictor"""
     global predictor
     if predictor is None:
-        predictor = UnifiedPeptidePredictor(model_dir="models")
     return predictor
@@ -1175,7 +1335,21 @@ def load_example(example_name):
             return examples[example_name][0], ""
     return "", ""
 # ==================== Gradio App ====================
 custom_css = """
@@ -1202,7 +1376,18 @@ h1 {
     text-align: center;
     margin-bottom: 10px !important;
 }
 table {
     font-size: 14px !important;
 }
@@ -1217,8 +1402,8 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
     # Header
     gr.Markdown(
         """
-        # ☄️ PeptiVerse
-        ### Peptide Property Predictions
         """
     )
@@ -1319,7 +1504,6 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
     # Results Section
     with gr.Group():
         gr.Markdown("### 📊 Results")
-        gr.Markdown("*Click on property names to view distribution plots*")
         results_df = gr.Dataframe(
             headers=["Sequence", "Property", "Prediction", "Value", "Unit"],
@@ -1340,8 +1524,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
         """
         ---
         <div style='text-align: center; color: #6b7280;'>
-            <p>Models: ESM2-650M embeddings + XGBoost/CNN classifiers</p>
-            <p style='font-size: 0.9em;'>Click on property names in results to view training data distributions</p>
         </div>
         """
     )
@@ -1349,7 +1532,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
     # Event Handlers
     def update_visibility(binding_checked):
         return gr.update(visible=binding_checked)
     binding_affinity.change(
         update_visibility,
         inputs=[binding_affinity],
@@ -1357,11 +1540,10 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as de
     )
     example_dropdown.change(
-        load_example,
         inputs=[example_dropdown],
         outputs=[input_text, protein_seq]
     )
     predict_btn.click(
         predict_properties,
         inputs=[

 import time
 from typing import List, Dict, Any, Tuple, Optional
+from huggingface_hub import snapshot_download
+from pathlib import Path
+import os
+def pick_assets_root() -> Path:
+    """Choose and create the directory that holds models, data and caches.
+
+    Resolution order:
+      1. ``HF_ASSETS_DIR`` when set. An explicit override always wins, and a
+         failure to create it propagates instead of silently falling back.
+      2. ``/home/user/assets`` when ``SPACE_ID`` is set or ``/home/user`` exists.
+      3. The first creatable of ``~/assets``, ``./assets`` and ``/tmp/assets``.
+
+    Returns:
+        The chosen directory; it exists when this function returns.
+
+    Raises:
+        OSError: ``HF_ASSETS_DIR`` is set but the directory cannot be created.
+        RuntimeError: none of the candidate directories can be created.
+    """
+    # The manual override is checked first. It used to be consulted only after
+    # the Spaces path, so it was ignored whenever /home/user existed.
+    override = os.environ.get("HF_ASSETS_DIR")
+    if override:
+        root = Path(override)
+        root.mkdir(parents=True, exist_ok=True)
+        return root
+    candidates = []
+    spaces_root = Path("/home/user/assets")
+    if os.environ.get("SPACE_ID") or spaces_root.parent.exists():
+        candidates.append(spaces_root)
+    candidates += [Path.home() / "assets", Path.cwd() / "assets", Path("/tmp/assets")]
+    for candidate in candidates:
+        try:
+            candidate.mkdir(parents=True, exist_ok=True)
+        except OSError:
+            continue  # cannot be created here; try the next location
+        return candidate
+    raise RuntimeError("No writable assets directory found.")
+# Root directory for everything the app downloads or caches.
+ASSETS = pick_assets_root()
+# Put all caches on the same writable disk. A value already present in the
+# environment wins (setdefault), so the directory created below is the one that
+# is actually in effect, not the unused default.
+# NOTE(review): huggingface_hub is imported before this point and may already
+# have resolved its cache locations at import time, in which case the HF_*
+# defaults set here would not affect it - confirm, and move this setup above
+# that import if so.
+_CACHE_ENV_DEFAULTS = {
+    "HF_HOME": str(ASSETS / "hf"),
+    "HUGGINGFACE_HUB_CACHE": str(ASSETS / "hf" / "cache"),
+    "TRANSFORMERS_CACHE": str(ASSETS / "transformers"),
+    "HF_DATASETS_CACHE": str(ASSETS / "hf" / "datasets"),
+    "XDG_CACHE_HOME": str(ASSETS / "xdg"),
+    "TMPDIR": str(ASSETS / "tmp"),
+}
+for _env_name, _env_default in _CACHE_ENV_DEFAULTS.items():
+    _env_value = os.environ.setdefault(_env_name, _env_default)
+    try:
+        Path(_env_value).mkdir(parents=True, exist_ok=True)
+    except OSError as exc:
+        if _env_value == _env_default:
+            raise  # our own default must be creatable, exactly as before
+        # An externally supplied location is left for its owner to manage.
+        print(f"[assets] could not create {_env_name}={_env_value!r}: {exc}")
+# Download targets for model weights and training data.
+ASSETS_MODELS = ASSETS / "models"
+ASSETS_MODELS.mkdir(parents=True, exist_ok=True)
+ASSETS_DATA = ASSETS / "training_data"
+ASSETS_DATA.mkdir(parents=True, exist_ok=True)
+# Both names currently point at the same Hub repository.
+MODEL_REPO = "ChatterjeeLab/Classifier_Weight"
+DATASET_REPO = "ChatterjeeLab/Classifier_Weight"
+def fetch_models_and_data():
+    """Download model weights and training data from the Hub into the assets tree.
+
+    Makes two ``snapshot_download`` calls, in order: ``MODEL_REPO`` into
+    ``ASSETS_MODELS``, then ``DATASET_REPO`` into ``ASSETS_DATA``, each limited
+    by its ``allow_patterns`` list. Exceptions raised by ``snapshot_download``
+    are not caught here.
+    """
+    model_patterns = [
+        "models/*.pt", "models/*.pth", "models/*.ckpt", "models/*.safetensors",
+        "models/*.json", "models/*.yaml", "models/*.yml",
+    ]
+    data_patterns = ["training_data/*.csv", "training_data/*.npz", "training_data/*.md"]
+    # Neither download passes repo_type, so the library default applies to both.
+    # NOTE(review): local_dir_use_symlinks is reportedly deprecated in recent
+    # huggingface_hub releases - confirm against the pinned version.
+    downloads = (
+        (MODEL_REPO, ASSETS_MODELS, model_patterns),
+        (DATASET_REPO, ASSETS_DATA, data_patterns),
+    )
+    for repo_id, target_dir, patterns in downloads:
+        snapshot_download(
+            repo_id=repo_id,
+            local_dir=str(target_dir),
+            local_dir_use_symlinks=True,
+            allow_patterns=patterns,
+        )
+fetch_models_and_data()
 try:
     from rdkit import Chem
     from rdkit.Chem import Descriptors, AllChem
 # ==================== Data Management ====================
 class TrainingDataManager:
+    def __init__(self, data_dir=ASSETS_DATA):
+        self.data_dir = Path(data_dir)
         self.data_dir.mkdir(exist_ok=True)
         self.statistics = self.load_statistics()
     def _load_half_life_csv(self):
+        csv_path = self.data_dir / "training_data/half_life_smiles.csv"
         if not csv_path.exists():
             return None
         try:
         Returns: {'values': y, 'unit': 'Class (0=neg, 1=pos)', 'kind': 'binary', 'n_pos': int, 'n_neg': int}
         or None if missing.
         """
+        pos_path = self.data_dir / f"training_data/{prefix}-positive.npz"
+        neg_path = self.data_dir / f"training_data/{prefix}-negative.npz"
         if not pos_path.exists() or not neg_path.exists():
             return None
         try:
         except Exception as e:
             print(f"[TrainingDataManager] binary load error for '{prefix}': {e}")
             return None
+    def _load_binding_affinity_csv(self):
+        """Build the binding-affinity statistics entry from ``c-binding.csv``.
+
+        Reads ``<data_dir>/training_data/c-binding.csv`` and converts its
+        ``affinity`` column to numbers. Entries that cannot be converted are
+        dropped; the remaining values are neither filtered nor clipped.
+
+        Returns:
+            A dict with ``values`` (NumPy array), ``description``, ``unit``,
+            ``threshold`` (7.5), ``threshold_secondary`` (6.0), ``kind`` and
+            ``download_link``. ``None`` when the file is missing or no numeric
+            value remains. Also ``None`` when the column is absent or reading
+            fails; in those cases the error is printed, not raised.
+        """
+        source = self.data_dir / "training_data/c-binding.csv"
+        if not source.exists():
+            return None
+        try:
+            table = pd.read_csv(source)
+            if "affinity" not in table.columns:
+                raise ValueError("CSV must contain an 'affinity' column.")
+            affinities = pd.to_numeric(table["affinity"], errors="coerce").dropna().to_numpy()
+        except Exception as err:
+            print(f"[TrainingDataManager] binding-affinity load error: {err}")
+            return None
+        if len(affinities) == 0:
+            return None
+        entry = {
+            "values": affinities,
+            "description": "Protein–ligand binding affinity normalized",
+            "unit": "score",
+            "threshold": 7.5,               # main threshold (tight)
+            "threshold_secondary": 6.0,     # weak threshold
+            "kind": "continuous",
+            "download_link": str(source),
+        }
+        return entry
+    def _load_permeability_pampa_csv(self):
+        """Build the permeability statistics entry from ``nc-CPP-processed.csv``.
+
+        Reads ``<data_dir>/training_data/nc-CPP-processed.csv`` and converts its
+        ``PAMPA`` column to numbers. Entries that cannot be converted are
+        dropped; the remaining values are neither filtered nor clipped.
+
+        Returns:
+            A dict with ``values`` (NumPy array), ``description``, ``unit``,
+            ``threshold``, ``kind`` and ``download_link``. ``None`` when the
+            file is missing or no numeric value remains. Also ``None`` when the
+            column is absent or reading fails; in those cases the error is
+            printed, not raised.
+        """
+        source = self.data_dir / "training_data/nc-CPP-processed.csv"
+        if not source.exists():
+            return None
+        try:
+            table = pd.read_csv(source)
+            if "PAMPA" not in table.columns:
+                raise ValueError("CSV must contain a 'PAMPA' column.")
+            pampa_values = pd.to_numeric(table["PAMPA"], errors="coerce").dropna().to_numpy()
+            if len(pampa_values) == 0:
+                return None
+            # The display threshold is the median of the loaded values.
+            median_threshold = float(np.median(pampa_values))
+        except Exception as err:
+            print(f"[TrainingDataManager] permeability PAMPA load error: {err}")
+            return None
+        entry = {
+            "values": pampa_values,
+            "description": "Cell membrane permeability measurements",
+            "unit": "log Peff",
+            "threshold": median_threshold,
+            "kind": "continuous",
+            "download_link": str(source),
+        }
+        return entry
     def load_statistics(self):
         """Load pre-computed statistics for each property"""
         # Overlay real half-life
         hl = self._load_half_life_csv()
         if hl is not None:
+            stats["half_life (smiles)"].update(hl)
         # Overlay real solubility from sol-* (binary)
         sol = self._load_binary_pair("sol")
         hemo = self._load_binary_pair("hemo")
         if hemo is not None:
             stats["hemolysis"].update(hemo)
+        ba = self._load_binding_affinity_csv()
+        if ba is not None:
+            stats["binding_affinity"].update(ba)
+        pampa = self._load_permeability_pampa_csv()
+        if pampa is not None:
+            stats["permeability"].update(pampa)
         return stats
         # continuous
         fig = go.Figure()
         fig.add_trace(go.Histogram(x=vals, nbinsx=50, name="Training Data"))
+        # Primary threshold (if any)
         if "threshold" in s and s["threshold"] is not None:
             fig.add_vline(
+                x=float(s["threshold"]),
+                line_dash="dash",
+                line_color="purple" if property_name == "binding_affinity" else "red",
+                annotation_text=(
+                    "Tight threshold: {:.3f}".format(float(s["threshold"]))
+                    if property_name == "binding_affinity"
+                    else "Threshold: {:.3f}".format(float(s["threshold"]))
+                ),
+            )
+        # Secondary threshold for binding (weak)
+        if property_name == "binding_affinity" and "threshold_secondary" in s and s["threshold_secondary"] is not None:
+            fig.add_vline(
+                x=float(s["threshold_secondary"]),
+                line_dash="dash",
+                line_color="orange",
+                annotation_text="Weak threshold: {:.3f}".format(float(s["threshold_secondary"])),
             )
+        # Current value
         if current_value is not None:
             fig.add_vline(
+                x=float(current_value),
+                line_dash="solid",
+                line_color="green",
+                line_width=3,
+                annotation_text=f"Your Result: {float(current_value):.3f}",
             )
         fig.update_layout(
             title=f"{property_name.replace('_', ' ').title()} Distribution",
             xaxis_title=s.get("unit", ""),
         )
         return fig
     def get_property_info(self, property_name):
         if property_name not in self.statistics:
             return None
         self.model_configs = self.get_model_configs()
         # Data manager
+        self.data_manager = TrainingDataManager(data_dir=ASSETS_DATA)
         self._protein_cache = {}
         # Load models
         self.load_all_models()
             'hemolysis_seq': {
                 'type': 'xgboost',
                 'input': 'sequence',
+                'path': 'models/best_model_hemolysis.json',
                 'inverse_score': False,
                 'unit': 'Probability',
                 'display_name': '🩸 Hemolysis',
             'hemolysis_smiles': {
                 'type': 'xgboost',
                 'input': 'smiles',
+                'path': 'models/hemolysis-xgboost_smiles.json',
                 'inverse_score': False,
                 'unit': 'Probability',
                 'display_name': '🩸 Hemolysis',
             'solubility_seq': {
                 'type': 'xgboost',
                 'input': 'sequence',
+                'path': 'models/best_model_solubility.json',
                 'unit': 'Probability',
                 'display_name': '💧 Solubility',
                 'positive_label': 'Soluble',
             'solubility_smiles': {
                 'type': 'xgboost',
                 'input': 'smiles',
+                'path': 'models/solubility-xgboost_smiles.json',
                 'unit': 'Probability',
                 'display_name': '💧 Solubility',
                 'positive_label': 'Soluble',
             'permeability_smiles': {
                 'type': 'xgboost',
                 'input': 'smiles',
+                'path': 'models/permeability-xgboost_smiles.json',
                 'unit': 'Probability',
                 'display_name': '🪣 Permeability',
                 'positive_label': 'Permeable',
             'half_life_seq': {
                 'type': 'pytorch_cnn',
                 'input': 'sequence',
+                'path': 'models/best_model_half_life.pth',
                 'transform': lambda x: 10**x,
                 'unit': 'hours',
                 'display_name': '⏱️ Half-life',
             'nonfouling_seq': {
                 'type': 'xgboost',
                 'input': 'sequence',
+                'path': 'models/best_model_nonfouling.json',
                 'unit': 'Probability',
                 'display_name': '👯 Non-Fouling',
                 'positive_label': 'Non-toxic',
             'nonfouling_smiles': {
                 'type': 'xgboost',
                 'input': 'smiles',
+                'path': 'models/nonfouling-xgboost_smiles.json',
                 'unit': 'Probability',
                 'display_name': '👯 Non-Fouling',
                 'positive_label': 'Stable',
             'binding_affinity': {
                 'type': 'binding',
                 'input': 'dual_sequence',
+                'path': 'models/binding_affinity_unpooled.pt',
                 'unit': 'Probability',
                 'display_name': '🔗 Binding Affinity'
             },
             'binding_affinity_smiles': {
                 'type': 'binding_smiles',
                 'input': 'sequence+smiles',
+                'path': 'models/binding-affinity_smiles.pt',
                 'unit': 'Probability',
                 'display_name': '🔗 Binding Affinity (SMILES)'
             },
     """Initialize the predictor"""
     global predictor
     if predictor is None:
+        predictor = UnifiedPeptidePredictor(model_dir=ASSETS_MODELS)
     return predictor
             return examples[example_name][0], ""
     return "", ""
+def on_example_change(name: str):
+    """Produce the component updates for the example chosen in the dropdown.
+
+    Returns a 2-tuple of ``gr.update`` objects. The first sets the first value
+    returned by ``load_example(name)``. The second sets the second value and is
+    made visible only when ``name`` is ``"Protein-Peptide"``.
+    """
+    binder, protein = load_example(name)
+    binder_update = gr.update(value=binder)
+    protein_update = gr.update(
+        value=protein,
+        visible=(name == "Protein-Peptide"),
+    )
+    return (binder_update, protein_update)
+def on_example_load(name: str):
+    """Produce the component updates for the example ``name``.
+
+    Returns the same 2-tuple of ``gr.update`` objects as ``on_example_change``:
+    the first value from ``load_example(name)``, then the second value with
+    ``visible`` true only for ``"Protein-Peptide"``.
+    """
+    return on_example_change(name)
 # ==================== Gradio App ====================
 custom_css = """
     text-align: center;
     margin-bottom: 10px !important;
 }
+h3 {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+    text-align: center !important;
+    font-size: 1.3em !important;
+    margin-top: -5px !important;
+}
+.gr-form {
+    border-radius: 12px !important;
+    border-color: #e5e7eb !important;
+}
 table {
     font-size: 14px !important;
 }
     # Header
     gr.Markdown(
         """
+        # 🌐 PeptiVerse
+        ### \t  Peptide Property Predictions
         """
     )
     # Results Section
     with gr.Group():
         gr.Markdown("### 📊 Results")
         results_df = gr.Dataframe(
             headers=["Sequence", "Property", "Prediction", "Value", "Unit"],
         """
         ---
         <div style='text-align: center; color: #6b7280;'>
+            <p>Please Cite Us.</p>
         </div>
         """
     )
     # Event Handlers
     def update_visibility(binding_checked):
         return gr.update(visible=binding_checked)
     binding_affinity.change(
         update_visibility,
         inputs=[binding_affinity],
     )
     example_dropdown.change(
+        on_example_change,
         inputs=[example_dropdown],
         outputs=[input_text, protein_seq]
     )
     predict_btn.click(
         predict_properties,
         inputs=[

description.md CHANGED Viewed

@@ -5,29 +5,29 @@
 Our models are trained on curated datasets from multiple sources:
 #### Hemolysis Dataset
-- **Primary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
-- **Secondary Source:** the Database of Antimicrobial Activity and Structure of Peptides (DBAASPv3)
 - **Size:** 9,316 peptides, with 19.6% being positive (hemolytic) and 80.4% being negative (nonhemolytic)
 - **Description:** Probability of peptide disrupting red blood cell membranes.
 - **Download:** [hemolysis_training_data.csv](#)
 #### Solubility Dataset
-- **Primary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
-- **Secondary Source:** PROSO-II
 - **Size:** 18,453 sequences, with 47.6% being labeled as positives and 52.4% being labeled as negatives
 - **Description:** Probability of peptide remaining dissolved in aqueous conditions.
 - **Download:** [solubility_training_data.csv](#)
 #### Non-Fouling Dataset
-- **Primary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
-- **Secondary Source:** [Classifying antimicrobial and multifunctional peptides with Bayesian network models](https://doi.org/10.1002/pep2.24079)
 - **Size:** 3,600 positive, 13,585 negative
 - **Description:** A nonfouling peptide resists nonspecific interactions and protein adsorption.
 - **Download:** [nonfouling_training_data.csv](#)
 #### Permeability Dataset
-- **Primary Source:** [PepLand](https://arxiv.org/abs/2311.04419)
-- **Secondary Source:** CycPeptMPDB
 - **Size:** 1,162 positive and 1,162 negative canonical samples (drawn from 22 relevant cell-penetrating peptide databases compiled from the literature on existing cell-penetrating peptide prediction models); CycPeptMPDB provides an extra 7,334 cyclic peptides
 - **Description:** Probability of peptide penetrating the cell membrane.
 - **Download:** [permeability_training_data.csv](#)
@@ -57,14 +57,9 @@ Our models are trained on curated datasets from multiple sources:
 If you use this tool, please cite:
 ```
-@article{peptiprop2024,
-    title={PeptiProp: Unified Platform for Peptide Property Prediction},
-    author={Your Name et al.},
-    journal={Journal Name},
-    year={2024}
-}
 ```
 ### Contact
-For questions or collaborations: [contact@example.com](mailto:contact@example.com)

 Our models are trained on curated datasets from multiple sources:
 #### Hemolysis Dataset
+- **Primary Source:** [the Database of Antimicrobial Activity and Structure of Peptides (DBAASPv3)](https://academic.oup.com/nar/article-abstract/49/D1/D288/5957160)
+- **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
 - **Size:** 9,316 peptides, with 19.6% being positive (hemolytic) and 80.4% being negative (nonhemolytic)
 - **Description:** Probability of peptide disrupting red blood cell membranes.
 - **Download:** [hemolysis_training_data.csv](#)
 #### Solubility Dataset
+- **Primary Source:** [PROSO-II](https://febs.onlinelibrary.wiley.com/doi/abs/10.1111/j.1742-4658.2012.08603.x)
+- **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
 - **Size:** 18,453 sequences, with 47.6% being labeled as positives and 52.4% being labeled as negatives
 - **Description:** Probability of peptide remaining dissolved in aqueous conditions.
 - **Download:** [solubility_training_data.csv](#)
 #### Non-Fouling Dataset
+- **Primary Source:** [Classifying antimicrobial and multifunctional peptides with Bayesian network models](https://doi.org/10.1002/pep2.24079)
+- **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
 - **Size:** 3,600 positive, 13,585 negative
 - **Description:** A nonfouling peptide resists nonspecific interactions and protein adsorption.
 - **Download:** [nonfouling_training_data.csv](#)
 #### Permeability Dataset
+- **Primary Source:** [CycPeptMPDB](https://pubs.acs.org/doi/abs/10.1021/acs.jcim.2c01573), [PAMPA](https://doi.org/10.1517/17425255.1.2.325)
+- **Secondary Source:** [PepLand](https://arxiv.org/abs/2311.04419)
 - **Size:** 1,162 positive and 1,162 negative canonical samples (drawn from 22 relevant cell-penetrating peptide databases compiled from the literature on existing cell-penetrating peptide prediction models); CycPeptMPDB provides an extra 7,334 cyclic peptides
 - **Description:** Probability of peptide penetrating the cell membrane.
 - **Download:** [permeability_training_data.csv](#)
 If you use this tool, please cite:
 ```
+place holder
 ```
 ### Contact
+For questions or collaborations: [yzhang@u.duke.nus.edu](mailto:yzhang@u.duke.nus.edu)