yinuozhang committed
Commit c1bbdd6 · 1 Parent(s): 9c11751
app.py ADDED
@@ -0,0 +1,1398 @@
+ import gradio as gr
+ import pandas as pd
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import xgboost as xgb
+ from transformers import AutoTokenizer, AutoModel, AutoConfig, EsmModel, EsmTokenizer
+ import plotly.graph_objects as go
+ from pathlib import Path
+ import json
+ import re
+ import time
+ from typing import List, Dict, Any, Tuple, Optional
+
+ # Try to import RDKit for SMILES support
+ try:
+     from rdkit import Chem
+     from rdkit.Chem import Descriptors, AllChem
+     RDKIT_AVAILABLE = True
+ except ImportError:
+     RDKIT_AVAILABLE = False
+     print("RDKit not available. SMILES input will be disabled.")
+
+ AA_RE = re.compile(r'^[ACDEFGHIKLMNPQRSTVWYBXZJUO\-]+$', re.IGNORECASE)
+
+ def is_aa_sequence_like(s: str) -> bool:
+     s = s.strip().replace(" ", "")
+     if not s:
+         return False
+     # Very lenient: allow AA letters + optional '-' for readability
+     return bool(AA_RE.fullmatch(s)) and any(c.isalpha() for c in s)
+
+ def is_smiles_like(s: str) -> bool:
+     s = s.strip()
+     if not s:
+         return False
+     # Heuristic: SMILES often contains these symbols; also reject if it looks like pure AA
+     maybe_smiles_chars = set("=#()[]+\\/-@1234567890")
+     return (any(ch in maybe_smiles_chars for ch in s) or not is_aa_sequence_like(s)) and len(s) >= 2
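+ # A quick illustration of the two heuristics above (hypothetical inputs):
+ #   is_aa_sequence_like("GIVEQCCTSICSLYQLENYCN")  -> True  (AA letters only)
+ #   is_smiles_like("CC(=O)NC1=CC=C(O)C=C1")       -> True  (contains ring digits/brackets)
+ #   is_smiles_like("GIVEQ")                       -> False (looks like a pure AA string)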
+
+ # ==================== Model Classes ====================
+
+ # Checkpoint-loading utility used by UnifiedPeptidePredictor.load_all_models below.
+ def load_cnn_weights_safely(model: nn.Module, ckpt_path: Path, device: torch.device):
+     """
+     Load a CNN checkpoint that might include old ESM weights, DDP prefixes, or different wrappers.
+     Strips unknown prefixes and ignores non-matching keys gracefully.
+     """
+     ckpt = torch.load(ckpt_path, map_location=device)
+
+     # 1) Extract a state dict from various formats
+     if isinstance(ckpt, dict) and any(k in ckpt for k in ["state_dict", "model_state_dict", "weights"]):
+         sd = ckpt.get("state_dict") or ckpt.get("model_state_dict") or ckpt.get("weights")
+     elif isinstance(ckpt, dict):
+         # Probably already a state_dict
+         sd = ckpt
+     else:
+         # Possibly a full pickled model; try to read its state_dict
+         try:
+             sd = ckpt.state_dict()
+         except Exception as e:
+             raise RuntimeError(f"Unsupported checkpoint format at {ckpt_path}: {type(ckpt)}") from e
+
+     # 2) Normalize keys: strip the DDP 'module.' prefix and drop old ESM-containing parameters
+     cleaned = {}
+     for k, v in sd.items():
+         k2 = k
+         if k2.startswith("module."):
+             k2 = k2[len("module."):]
+         # drop anything from the embedded ESM or other now-missing submodules
+         if k2.startswith("esm_model.") or k2.startswith("esm.") or k2.startswith("backbone.esm."):
+             continue
+         cleaned[k2] = v
+
+     # 3) Load non-strictly so extra/missing heads don't crash
+     missing, unexpected = model.load_state_dict(cleaned, strict=False)
+
+     # Log what happened so the load can be verified
+     if unexpected:
+         print(f"[load_cnn_weights_safely] Unexpected keys ignored: {sorted(unexpected)[:6]}{'...' if len(unexpected) > 6 else ''}")
+     if missing:
+         print(f"[load_cnn_weights_safely] Missing keys not found in checkpoint: {sorted(missing)[:6]}{'...' if len(missing) > 6 else ''}")
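+
+ # Typical call site (mirrors UnifiedPeptidePredictor.load_all_models further below):
+ #   model = PeptideCNN().to(device)
+ #   load_cnn_weights_safely(model, Path("models/best_model_half_life.pth"), device)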
+
+
+ # ====== PeptideCLM SMILES featurizer ======
+ from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
+ from transformers import AutoModelForMaskedLM
+
+ class PeptideCLMFeaturizer:
+     """
+     Mean-pool hidden states from PeptideCLM-23M-all for SMILES tokens produced by SMILES_SPE_Tokenizer.
+     Use the SAME tokenizer files, max_length, and pooling that were used when training the XGBoost models.
+     """
+     def __init__(self, vocab_path: str, splits_path: str, device: torch.device, max_length: int = 256):
+         self.device = device
+         self.max_length = max_length
+         self.tok = SMILES_SPE_Tokenizer(vocab_path, splits_path)
+         self.model = AutoModelForMaskedLM.from_pretrained("aaronfeller/PeptideCLM-23M-all").roformer.to(device).eval()
+
+     @torch.no_grad()
+     def embed_list(self, smiles_list: list[str]) -> np.ndarray:
+         feats = []
+         for s in smiles_list:
+             toks = self.tok(s, return_tensors="pt", truncation=True, padding=True)
+             toks = {k: v.to(self.device) for k, v in toks.items()}
+             out = self.model(**toks).last_hidden_state   # [1, L, H]
+             mask = toks["attention_mask"].unsqueeze(-1)  # [1, L, 1]
+             pooled = (out * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
+             feats.append(pooled.squeeze(0).float().cpu().numpy())
+         return np.stack(feats, axis=0)  # [N, H]
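+
+ # Sketch of expected usage (paths are the ones UnifiedPeptidePredictor passes in below;
+ # the 768-dim hidden size is an assumption based on the smiles_dim=768 default used later):
+ #   featurizer = PeptideCLMFeaturizer("tokenizer/new_vocab.txt", "tokenizer/new_splits.txt", device)
+ #   feats = featurizer.embed_list(["CC(=O)O"])  # -> np.ndarray of shape (1, H), H = 768 here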
+
+
+ class UnpooledBindingPredictor(nn.Module):
+     """Binding affinity predictor with cross-attention mechanism"""
+     def __init__(self,
+                  esm_model_name="facebook/esm2_t33_650M_UR50D",
+                  hidden_dim=512,
+                  kernel_sizes=[3, 5, 7],
+                  n_heads=8,
+                  n_layers=3,
+                  dropout=0.1,
+                  freeze_esm=True):
+         super().__init__()
+
+         # Use these everywhere for consistency
+         self.tight_threshold = 7.5
+         self.weak_threshold = 6.0
+
+         self.esm_model = AutoModel.from_pretrained(esm_model_name)
+         self.config = AutoConfig.from_pretrained(esm_model_name)
+         if freeze_esm:
+             for p in self.esm_model.parameters():
+                 p.requires_grad = False
+
+         esm_dim = self.config.hidden_size
+         out_ch = 64
+         self.protein_conv_layers = nn.ModuleList([
+             nn.Conv1d(esm_dim, out_ch, k, padding='same') for k in kernel_sizes
+         ])
+         self.binder_conv_layers = nn.ModuleList([
+             nn.Conv1d(esm_dim, out_ch, k, padding='same') for k in kernel_sizes
+         ])
+         total = out_ch * len(kernel_sizes) * 2
+
+         self.protein_projection = nn.Linear(total, hidden_dim)
+         self.binder_projection = nn.Linear(total, hidden_dim)
+         self.protein_norm = nn.LayerNorm(hidden_dim)
+         self.binder_norm = nn.LayerNorm(hidden_dim)
+
+         self.cross_attention_layers = nn.ModuleList([
+             nn.ModuleDict({
+                 'attention': nn.MultiheadAttention(hidden_dim, n_heads, dropout=dropout),
+                 'norm1': nn.LayerNorm(hidden_dim),
+                 'ffn': nn.Sequential(
+                     nn.Linear(hidden_dim, hidden_dim * 4),
+                     nn.ReLU(),
+                     nn.Dropout(dropout),
+                     nn.Linear(hidden_dim * 4, hidden_dim),
+                 ),
+                 'norm2': nn.LayerNorm(hidden_dim),
+             }) for _ in range(n_layers)
+         ])
+
+         self.shared_head = nn.Sequential(
+             nn.Linear(hidden_dim * 2, hidden_dim),
+             nn.ReLU(),
+             nn.Dropout(dropout),
+         )
+         self.regression_head = nn.Linear(hidden_dim, 1)
+         self.classification_head = nn.Linear(hidden_dim, 3)
+
+     def get_binding_class(self, affinity: torch.Tensor | float) -> torch.LongTensor | int:
+         """
+         0: tight  (>= tight_threshold)
+         1: medium [weak_threshold, tight_threshold)
+         2: weak   (< weak_threshold)
+         """
+         if isinstance(affinity, torch.Tensor):
+             tight = affinity >= self.tight_threshold
+             weak = affinity < self.weak_threshold
+             medium = ~(tight | weak)
+             classes = torch.zeros_like(affinity, dtype=torch.long)
+             classes[medium] = 1
+             classes[weak] = 2
+             return classes
+         else:
+             if affinity >= self.tight_threshold:
+                 return 0
+             elif affinity < self.weak_threshold:
+                 return 2
+             else:
+                 return 1
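+
+     # Worked example with the thresholds above:
+     #   get_binding_class(8.1) -> 0 (tight),  get_binding_class(6.8) -> 1 (medium),
+     #   get_binding_class(5.2) -> 2 (weak)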
+
+     def compute_embeddings(self, input_ids, attention_mask=None):
+         out = self.esm_model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
+         return out.last_hidden_state
+
+     def process_sequence(self, unpooled_emb, conv_layers, attention_mask=None):
+         x = unpooled_emb.transpose(1, 2)  # [B, C_in=E, L]
+         conv_outputs = [F.relu(conv(x)) for conv in conv_layers]  # list of [B, C_out, L]
+         conv_output = torch.cat(conv_outputs, dim=1)  # [B, sumC, L]
+         if attention_mask is not None:
+             mask = attention_mask.unsqueeze(1).expand(-1, conv_output.size(1), -1)
+             masked = conv_output.masked_fill(mask == 0, float('-inf'))
+             max_pooled = masked.max(dim=2)[0]
+             sum_pooled = (conv_output * mask).sum(dim=2)
+             denom = mask.sum(dim=2).clamp(min=1.0)
+             avg_pooled = sum_pooled / denom
+         else:
+             max_pooled = conv_output.max(dim=2)[0]
+             avg_pooled = conv_output.mean(dim=2)
+         return torch.cat([max_pooled, avg_pooled], dim=1)  # [B, 2*sumC]
+
+     def forward(self, protein_input_ids, binder_input_ids, protein_mask=None, binder_mask=None):
+         protein_unpooled = self.compute_embeddings(protein_input_ids, protein_mask)
+         binder_unpooled = self.compute_embeddings(binder_input_ids, binder_mask)
+         protein_features = self.process_sequence(protein_unpooled, self.protein_conv_layers, protein_mask)
+         binder_features = self.process_sequence(binder_unpooled, self.binder_conv_layers, binder_mask)
+         protein = self.protein_norm(self.protein_projection(protein_features))
+         binder = self.binder_norm(self.binder_projection(binder_features))
+
+         # make them "sequence length 1" for MHA (L, B, D)
+         protein = protein.unsqueeze(0).transpose(0, 1)
+         binder = binder.unsqueeze(0).transpose(0, 1)
+         for layer in self.cross_attention_layers:
+             p_attn = layer['attention'](protein, binder, binder)[0]
+             protein = layer['norm1'](protein + p_attn)
+             protein = layer['norm2'](protein + layer['ffn'](protein))
+             b_attn = layer['attention'](binder, protein, protein)[0]
+             binder = layer['norm1'](binder + b_attn)
+             binder = layer['norm2'](binder + layer['ffn'](binder))
+
+         protein_pool = protein.mean(dim=0).squeeze(0)
+         binder_pool = binder.mean(dim=0).squeeze(0)
+         shared = self.shared_head(torch.cat([protein_pool, binder_pool], dim=-1))
+         reg = self.regression_head(shared)          # [1]
+         logits = self.classification_head(shared)   # [3]
+         return reg, logits
+
+
+ # ------- SMILES + Protein binding model (reg + 3-class) -------
+ class ImprovedBindingPredictor(nn.Module):
+     def __init__(self, esm_dim=1280, smiles_dim=768, hidden_dim=512, n_heads=8, n_layers=3, dropout=0.1):
+         super().__init__()
+         self.tight_threshold = 7.5
+         self.weak_threshold = 6.0
+
+         self.smiles_projection = nn.Linear(smiles_dim, hidden_dim)
+         self.protein_projection = nn.Linear(esm_dim, hidden_dim)
+         self.protein_norm = nn.LayerNorm(hidden_dim)
+         self.smiles_norm = nn.LayerNorm(hidden_dim)
+
+         self.cross_attention_layers = nn.ModuleList([
+             nn.ModuleDict({
+                 'attention': nn.MultiheadAttention(hidden_dim, n_heads, dropout=dropout),
+                 'norm1': nn.LayerNorm(hidden_dim),
+                 'ffn': nn.Sequential(
+                     nn.Linear(hidden_dim, hidden_dim * 4),
+                     nn.ReLU(),
+                     nn.Dropout(dropout),
+                     nn.Linear(hidden_dim * 4, hidden_dim),
+                 ),
+                 'norm2': nn.LayerNorm(hidden_dim),
+             }) for _ in range(n_layers)
+         ])
+
+         self.shared_head = nn.Sequential(
+             nn.Linear(hidden_dim * 2, hidden_dim),
+             nn.ReLU(),
+             nn.Dropout(dropout),
+         )
+         self.regression_head = nn.Linear(hidden_dim, 1)
+         self.classification_head = nn.Linear(hidden_dim, 3)
+
+     def get_binding_class(self, affinity):
+         """Convert affinity values to class indices
+         0: tight binding (>= 7.5)
+         1: medium binding (6.0-7.5)
+         2: weak binding (< 6.0)
+         """
+         if isinstance(affinity, torch.Tensor):
+             tight_mask = affinity >= self.tight_threshold
+             weak_mask = affinity < self.weak_threshold
+             medium_mask = ~(tight_mask | weak_mask)
+
+             classes = torch.zeros_like(affinity, dtype=torch.long)
+             classes[medium_mask] = 1
+             classes[weak_mask] = 2
+             return classes
+         else:
+             if affinity >= self.tight_threshold:
+                 return 0  # tight binding
+             elif affinity < self.weak_threshold:
+                 return 2  # weak binding
+             else:
+                 return 1  # medium binding
+
+     def forward(self, protein_emb: torch.Tensor, smiles_emb: torch.Tensor):
+         # protein_emb: [1, E], smiles_emb: [1, H]
+         protein = self.protein_norm(self.protein_projection(protein_emb))  # [1, D]
+         smiles = self.smiles_norm(self.smiles_projection(smiles_emb))      # [1, D]
+
+         # Treat each embedding as a length-1 token sequence; MHA still works (QKV dims match)
+         protein = protein.unsqueeze(0)     # [1, 1, D]
+         smiles = smiles.unsqueeze(0)       # [1, 1, D]
+         protein = protein.transpose(0, 1)  # MHA wants [L, B, D]
+         smiles = smiles.transpose(0, 1)
+
+         for layer in self.cross_attention_layers:
+             attn_p = layer['attention'](protein, smiles, smiles)[0]
+             protein = layer['norm1'](protein + attn_p)
+             protein = layer['norm2'](protein + layer['ffn'](protein))
+
+             attn_s = layer['attention'](smiles, protein, protein)[0]
+             smiles = layer['norm1'](smiles + attn_s)
+             smiles = layer['norm2'](smiles + layer['ffn'](smiles))
+
+         # pool over L (it's 1, so mean == squeeze)
+         protein_pool = protein.mean(dim=0).squeeze(0)  # [D]
+         smiles_pool = smiles.mean(dim=0).squeeze(0)    # [D]
+
+         combined = torch.cat([protein_pool, smiles_pool], dim=-1)  # [2D]
+         shared = self.shared_head(combined)
+         reg = self.regression_head(shared)          # scalar pKd/pKi
+         logits = self.classification_head(shared)   # 3-class
+         return reg, logits
+
+
+ class PeptideCNN(nn.Module):
+     """CNN model for single peptide property prediction"""
+     def __init__(self, input_dim=1280, hidden_dims=None, output_dim=160, dropout_rate=0.3):
+         super().__init__()
+         if hidden_dims is None:
+             hidden_dims = [input_dim // 2, input_dim // 4]
+
+         self.conv1 = nn.Conv1d(input_dim, hidden_dims[0], kernel_size=3, padding=1)
+         self.conv2 = nn.Conv1d(hidden_dims[0], hidden_dims[1], kernel_size=5, padding=1)
+         self.fc = nn.Linear(hidden_dims[1], output_dim)
+         self.dropout = nn.Dropout(dropout_rate)
+         self.predictor = nn.Linear(output_dim, 1)
+
+     def forward(self, esm_embeddings, return_features=False):
+         x = esm_embeddings.permute(0, 2, 1)
+         x = F.relu(self.conv1(x))
+         x = self.dropout(x)
+         x = F.relu(self.conv2(x))
+         x = self.dropout(x)
+         x = x.permute(0, 2, 1)
+         x = x.mean(dim=1)
+         features = self.fc(x)
+         if return_features:
+             return features
+         return self.predictor(features)
+
+
+ # ==================== Data Management ====================
+
+ class TrainingDataManager:
+     """Manage training data statistics and distributions"""
+     def __init__(self, data_dir="training_data"):
+         self.data_dir = Path(__file__).resolve().parent / data_dir
+         self.data_dir.mkdir(exist_ok=True)
+         self.statistics = self.load_statistics()
+
+     def _load_half_life_csv(self):
+         csv_path = self.data_dir / "half_life_smiles.csv"
+         if not csv_path.exists():
+             return None
+         try:
+             df = pd.read_csv(csv_path)
+             if "log_hour" in df.columns:
+                 vals = pd.to_numeric(df["log_hour"], errors="coerce").dropna().to_numpy()
+             else:
+                 if "half_life_hours" not in df.columns:
+                     if "half_life" in df.columns:
+                         df["half_life_hours"] = pd.to_numeric(df["half_life"], errors="coerce") / 3600.0
+                     else:
+                         raise ValueError("CSV must contain 'log_hour' or 'half_life_hours' (or 'half_life').")
+                 hh = pd.to_numeric(df["half_life_hours"], errors="coerce")
+                 vals = np.log10(hh.replace(0, np.nan)).dropna().to_numpy()
+             if len(vals) == 0:
+                 return None
+             return {
+                 "values": vals,
+                 "unit": "log10(hours)",
+                 "threshold": float(np.median(vals)),  # median on log scale
+                 "kind": "continuous",
+             }
+         except Exception as e:
+             print(f"[TrainingDataManager] half-life load error: {e}")
+             return None
+
+     def _load_binary_pair(self, prefix: str):
+         """
+         Load binary labels from <prefix>-positive.npz and <prefix>-negative.npz
+         Returns: {'values': y, 'unit': 'Class (0=neg, 1=pos)', 'kind': 'binary', 'n_pos': int, 'n_neg': int}
+         or None if missing.
+         """
+         pos_path = self.data_dir / f"{prefix}-positive.npz"
+         neg_path = self.data_dir / f"{prefix}-negative.npz"
+         if not pos_path.exists() or not neg_path.exists():
+             return None
+         try:
+             with np.load(pos_path) as pos:
+                 pos_data = pos["arr_0"]
+             with np.load(neg_path) as neg:
+                 neg_data = neg["arr_0"]
+             y = np.concatenate(
+                 [np.ones(len(pos_data), dtype=int), np.zeros(len(neg_data), dtype=int)],
+                 axis=0
+             )
+             return {
+                 "values": y,
+                 "unit": "Class (0=neg, 1=pos)",
+                 "kind": "binary",
+                 "n_pos": int(len(pos_data)),
+                 "n_neg": int(len(neg_data)),
+             }
+         except Exception as e:
+             print(f"[TrainingDataManager] binary load error for '{prefix}': {e}")
+             return None
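+
+     # Assumed on-disk layout for the binary overlays used in load_statistics below:
+     #   training_data/sol-positive.npz  + training_data/sol-negative.npz   (solubility)
+     #   training_data/nf-positive.npz   + training_data/nf-negative.npz    (non-fouling)
+     #   training_data/hemo-positive.npz + training_data/hemo-negative.npz  (hemolysis)
+     # each holding a single positional array ("arr_0", the np.savez default).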
+
+     def load_statistics(self):
+         """Load pre-computed statistics for each property"""
+         stats = {
+             'hemolysis': {
+                 'values': np.random.beta(2, 5, 1000),
+                 'description': 'Probability of peptide disrupting red blood cell membranes.',
+                 'unit': 'Probability',
+                 'threshold': 0.5,
+                 'download_link': '#'
+             },
+             'solubility': {
+                 'values': np.random.normal(5, 2, 1000),
+                 'description': 'Probability of peptide remaining dissolved in aqueous conditions.',
+                 'unit': 'Probability',
+                 'threshold': 0.5,
+                 'download_link': '#'
+             },
+             'binding_affinity': {
+                 'values': np.random.normal(7, 1.5, 1000),
+                 'description': 'Protein-peptide binding affinity',
+                 'unit': 'pKd',
+                 'threshold': 7.5,
+                 'download_link': '#'
+             },
+             # Keyed as 'half_life' so _base_stat_key('half_life_seq') and the
+             # overlay below both resolve; the underlying CSV is SMILES-based.
+             'half_life': {
+                 # will be overwritten below if the CSV exists
+                 'values': np.random.lognormal(2, 1, 1000),
+                 'description': 'Serum half-life from clinical and preclinical studies',
+                 'unit': 'Hours',
+                 'threshold': 2.0,  # hours (default fallback)
+                 'download_link': '#'
+             },
+             'nonfouling': {
+                 'values': np.random.lognormal(4, 1, 1000),
+                 'description': 'A nonfouling peptide resists nonspecific interactions and protein adsorption.',
+                 'unit': 'Probability',
+                 'threshold': 0.5,
+                 'download_link': '#'
+             },
+             'permeability': {
+                 'values': np.random.normal(-4, 1, 1000),
+                 'description': 'Probability of peptide penetrating the cell membrane.',
+                 'unit': 'Probability',
+                 'threshold': 0.5,
+                 'download_link': '#'
+             }
+         }
+
+         # Overlay real half-life
+         hl = self._load_half_life_csv()
+         if hl is not None:
+             stats["half_life"].update(hl)
+
+         # Overlay real solubility from sol-* (binary)
+         sol = self._load_binary_pair("sol")
+         if sol is not None:
+             stats["solubility"].update(sol)
+
+         # Overlay real non-fouling from nf-* (binary)
+         nf = self._load_binary_pair("nf")
+         if nf is not None:
+             stats["nonfouling"].update(nf)
+
+         hemo = self._load_binary_pair("hemo")
+         if hemo is not None:
+             stats["hemolysis"].update(hemo)
+
+         return stats
+
+     def get_distribution_plot(self, property_name, current_value=None):
+         if property_name not in self.statistics:
+             return None
+         s = self.statistics[property_name]
+         vals = np.asarray(s["values"])
+         kind = s.get("kind", "continuous")
+
+         if kind == "binary":
+             n0 = int((vals == 0).sum())
+             n1 = int((vals == 1).sum())
+             total = max(n0 + n1, 1)
+             fig = go.Figure()
+             fig.add_trace(go.Bar(x=["Negative (0)", "Positive (1)"], y=[n0, n1]))
+             fig.update_layout(
+                 title=f"{property_name.replace('_', ' ').title()} — Class Balance",
+                 xaxis_title="Class",
+                 yaxis_title="Count",
+                 height=400,
+                 showlegend=False,
+                 annotations=[
+                     dict(x="Negative (0)", y=n0, text=f"{n0} ({n0/total:.1%})", showarrow=False, yshift=8),
+                     dict(x="Positive (1)", y=n1, text=f"{n1} ({n1/total:.1%})", showarrow=False, yshift=8),
+                 ],
+             )
+             return fig
+
+         # continuous
+         fig = go.Figure()
+         fig.add_trace(go.Histogram(x=vals, nbinsx=50, name="Training Data"))
+         if "threshold" in s and s["threshold"] is not None:
+             fig.add_vline(
+                 x=s["threshold"], line_dash="dash", line_color="red",
+                 annotation_text=f"Threshold: {s['threshold']:.3f}"
+             )
+         if current_value is not None:
+             fig.add_vline(
+                 x=current_value, line_dash="solid", line_color="green", line_width=3,
+                 annotation_text=f"Your Result: {current_value:.3f}"
+             )
+         fig.update_layout(
+             title=f"{property_name.replace('_', ' ').title()} Distribution",
+             xaxis_title=s.get("unit", ""),
+             yaxis_title="Count",
+             height=400,
+             showlegend=False,
+         )
+         return fig
+
+     def get_property_info(self, property_name):
+         if property_name not in self.statistics:
+             return None
+         s = self.statistics[property_name]
+         vals = np.asarray(s["values"])
+         kind = s.get("kind", "continuous")
+
+         info = {
+             "description": s.get("description", ""),
+             "unit": s.get("unit", ""),
+             "n_samples": int(len(vals)),
+             "mean": float(np.mean(vals)),
+             "std": float(np.std(vals)),
+             "min": float(np.min(vals)),
+             "max": float(np.max(vals)),
+             "percentiles": {},
+         }
+
+         if kind == "binary":
+             info["n_neg"] = int((vals == 0).sum())
+             info["n_pos"] = int((vals == 1).sum())
+         else:
+             pct = np.percentile(vals, [10, 25, 50, 75, 90])
+             info["percentiles"] = {
+                 "10%": float(pct[0]),
+                 "25%": float(pct[1]),
+                 "50% (median)": float(pct[2]),
+                 "75%": float(pct[3]),
+                 "90%": float(pct[4]),
+             }
+         return info
+
+
+
+ def _base_stat_key(model_key: str) -> str:
+     # strip modality suffixes to find stats in TrainingDataManager
+     for suf in ("_seq", "_smiles"):
+         if model_key.endswith(suf):
+             return model_key[:-len(suf)]
+     return model_key
+
+ # ==================== Unified Predictor ====================
+
+ class UnifiedPeptidePredictor:
+     """Main predictor handling all model types"""
+
+     def __init__(self, model_dir="models"):
+         self.model_dir = Path(model_dir)
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+         # Initialize tokenizer and ESM model
+         print("Loading ESM model...")
+         self.tokenizer = EsmTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
+         self.esm_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D")
+         self.esm_model.to(self.device)
+         self.esm_model.eval()
+
+         self.tokenizer_dir = Path("tokenizer")
+         self.smiles_featurizer = PeptideCLMFeaturizer(
+             vocab_path=f"{self.tokenizer_dir}/new_vocab.txt",
+             splits_path=f"{self.tokenizer_dir}/new_splits.txt",
+             device=self.device,
+         )
+         # Model registry
+         self.models = {}
+         self.model_configs = self.get_model_configs()
+
+         # Data manager
+         self.data_manager = TrainingDataManager()
+         self._protein_cache = {}
+         # Load models
+         self.load_all_models()
+
+     def get_model_configs(self):
+         """Define model configurations"""
+         return {
+             'hemolysis_seq': {
+                 'type': 'xgboost',
+                 'input': 'sequence',
+                 'path': 'best_model_hemolysis.json',
+                 'inverse_score': False,
+                 'unit': 'Probability',
+                 'display_name': '🩸 Hemolysis',
+                 'positive_label': 'Non-hemolytic',
+                 'negative_label': 'Hemolytic'
+             },
+             'hemolysis_smiles': {
+                 'type': 'xgboost',
+                 'input': 'smiles',
+                 'path': 'hemolysis-xgboost_smiles.json',
+                 'inverse_score': False,
+                 'unit': 'Probability',
+                 'display_name': '🩸 Hemolysis',
+                 'positive_label': 'Non-hemolytic',
+                 'negative_label': 'Hemolytic'
+             },
+             'solubility_seq': {
+                 'type': 'xgboost',
+                 'input': 'sequence',
+                 'path': 'best_model_solubility.json',
+                 'unit': 'Probability',
+                 'display_name': '💧 Solubility',
+                 'positive_label': 'Soluble',
+                 'negative_label': 'Insoluble'
+             },
+             'solubility_smiles': {
+                 'type': 'xgboost',
+                 'input': 'smiles',
+                 'path': 'solubility-xgboost_smiles.json',
+                 'unit': 'Probability',
+                 'display_name': '💧 Solubility',
+                 'positive_label': 'Soluble',
+                 'negative_label': 'Insoluble'
+             },
+             'permeability_smiles': {
+                 'type': 'xgboost',
+                 'input': 'smiles',
+                 'path': 'permeability-xgboost_smiles.json',
+                 'unit': 'Probability',
+                 'display_name': '🪣 Permeability',
+                 'positive_label': 'Permeable',
+                 'negative_label': 'Impermeable'
+             },
+             'half_life_seq': {
+                 'type': 'pytorch_cnn',
+                 'input': 'sequence',
+                 'path': 'best_model_half_life.pth',
+                 'transform': lambda x: 10**x,  # model predicts log10(hours)
+                 'unit': 'hours',
+                 'display_name': '⏱️ Half-life',
+                 'positive_label': 'Stable',
+                 'negative_label': 'Unstable'
+             },
+             'nonfouling_seq': {
+                 'type': 'xgboost',
+                 'input': 'sequence',
+                 'path': 'best_model_nonfouling.json',
+                 'unit': 'Probability',
+                 'display_name': '👯 Non-Fouling',
+                 'positive_label': 'Non-fouling',
+                 'negative_label': 'Fouling'
+             },
+             'nonfouling_smiles': {
+                 'type': 'xgboost',
+                 'input': 'smiles',
+                 'path': 'nonfouling-xgboost_smiles.json',
+                 'unit': 'Probability',
+                 'display_name': '👯 Non-Fouling',
+                 'positive_label': 'Non-fouling',
+                 'negative_label': 'Fouling'
+             },
+             'binding_affinity': {
+                 'type': 'binding',
+                 'input': 'dual_sequence',
+                 'path': 'binding_affinity_unpooled.pt',
+                 'unit': 'pKd',
+                 'display_name': '🔗 Binding Affinity'
+             },
+             'binding_affinity_smiles': {
+                 'type': 'binding_smiles',
+                 'input': 'sequence+smiles',
+                 'path': 'binding-affinity_smiles.pt',
+                 'unit': 'pKd',
+                 'display_name': '🔗 Binding Affinity (SMILES)'
+             },
+         }
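+
+     # To register an additional property, add an entry to the dict above and drop the
+     # matching checkpoint into models/. A hypothetical example:
+     #   'toxicity_seq': {'type': 'xgboost', 'input': 'sequence',
+     #                    'path': 'best_model_toxicity.json', 'unit': 'Probability',
+     #                    'display_name': 'Toxicity',
+     #                    'positive_label': 'Non-toxic', 'negative_label': 'Toxic'},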
+
+     def load_all_models(self):
+         """Load all available models"""
+         for name, config in self.model_configs.items():
+             model_path = self.model_dir / config['path']
+
+             if not model_path.exists():
+                 print(f"Warning: Model {name} not found at {model_path}")
+                 continue
+
+             try:
+                 if config['type'] == 'xgboost':
+                     self.models[name] = xgb.Booster(model_file=str(model_path))
+
+                 elif config['type'] == 'pytorch_cnn':
+                     model = PeptideCNN().to(self.device)
+                     ckpt_path = model_path  # Path from config
+                     load_cnn_weights_safely(model, ckpt_path, self.device)
+                     model.eval()
+                     self.models[name] = model
+
+                 elif config['type'] == 'binding':
+                     checkpoint = torch.load(model_path, map_location=self.device, weights_only=False)
+                     model = UnpooledBindingPredictor(
+                         hidden_dim=384,
+                         kernel_sizes=[3, 5, 7],
+                         n_heads=8,
+                         n_layers=4,
+                         dropout=0.14561457009902096,
+                         freeze_esm=True
+                     ).to(self.device)
+                     model.load_state_dict(checkpoint['model_state_dict'])
+                     model.eval()
+                     self.models[name] = model
+
+                 elif config['type'] == 'binding_smiles':
+                     ckpt = torch.load(model_path, map_location=self.device, weights_only=False)
+                     model = ImprovedBindingPredictor(
+                         esm_dim=1280, smiles_dim=768, hidden_dim=512, n_heads=8, n_layers=3, dropout=0.1
+                     ).to(self.device)
+                     model.load_state_dict(ckpt['model_state_dict'])
+                     model.eval()
+                     self.models[name] = model
+
+                 print(f"✓ Loaded {name}")
+
+             except Exception as e:
+                 print(f"Error loading {name}: {e}")
+
+     def _protein_embed_mean(self, protein_seq: str) -> torch.Tensor:
+         """Mean-pool ESM last_hidden_state -> [1, 1280]"""
+         toks = self.tokenizer(protein_seq, return_tensors="pt", padding=True, truncation=True, max_length=1024)
+         toks = {k: v.to(self.device) for k, v in toks.items()}
+         with torch.no_grad():
+             out = self.esm_model(**toks).last_hidden_state               # [1, L, E]
+         mask = toks['attention_mask'].unsqueeze(-1)                      # [1, L, 1]
+         pooled = (out * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)  # [1, E]
+         return pooled
+
+     def _get_protein_vec(self, protein_seq: str) -> torch.Tensor:
+         key = protein_seq.strip()
+         if key in self._protein_cache:
+             return self._protein_cache[key]
+         vec = self._protein_embed_mean(key)
+         self._protein_cache[key] = vec
+         return vec
+
+     def _smiles_embed_mean(self, smiles: str) -> torch.Tensor:
+         vec = self.smiles_featurizer.embed_list([smiles])[0]       # np [H]
+         return torch.from_numpy(vec).to(self.device).unsqueeze(0)  # [1, H]
+
+     def predict_property(self, model, config, value: str, input_type: str):
+         """
+         value: either an AA sequence (Sequence mode) or a SMILES string (SMILES mode)
+         """
+         if config['type'] == 'xgboost':
+             if input_type == 'SMILES':
+                 if config.get('input') != 'smiles':
+                     raise RuntimeError(f"Model {config['display_name']} expects sequence, not SMILES.")
+                 feats = self._features_from_smiles_peptclm(value)[None, ...]  # [1, D]
+             else:
+                 if config.get('input') == 'smiles':
+                     raise RuntimeError(f"Model {config['display_name']} expects SMILES, not sequence.")
+                 # ESM mean-pooled features
+                 toks = self.tokenizer(value, return_tensors="pt", padding=True, truncation=True, max_length=512)
+                 toks = {k: v.to(self.device) for k, v in toks.items()}
+                 with torch.no_grad():
+                     out = self.esm_model(**toks).last_hidden_state
+                 mask = toks["attention_mask"].unsqueeze(-1)
+                 pooled = (out * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
+                 feats = pooled.float().cpu().numpy()  # [1, 1280]
+             # Optional safety check
+             expected = model.num_features()
+             if feats.shape[1] != expected:
+                 raise RuntimeError(f"Feature dim mismatch: got {feats.shape[1]}, booster expects {expected}")
+             dmat = xgb.DMatrix(feats)
+             pred = model.predict(dmat)[0]
+             if config.get('inverse_score', False):
+                 pred = 1 - pred
+             return float(pred)
+
+         elif config['type'] == 'pytorch_cnn':
+             if input_type == 'SMILES':
+                 raise RuntimeError(f"{config['display_name']} (CNN) expects AA sequence, not SMILES.")
+             toks = self.tokenizer(value, return_tensors="pt", padding=True, truncation=True, max_length=512)
+             toks = {k: v.to(self.device) for k, v in toks.items()}
+             with torch.no_grad():
+                 out = self.esm_model(**toks).last_hidden_state
+                 y = model(out).squeeze().item()
+             if 'transform' in config:
+                 y = config['transform'](y)
+             return float(y)
+
+         else:
+             raise NotImplementedError(config['type'])
+
+     def predict_sequence_property(self, model, config, sequence):
+         """Predict property from sequence"""
+         inputs = self.tokenizer(
+             sequence,
+             return_tensors="pt",
+             padding=True,
+             truncation=True,
+             max_length=512
+         )
+         inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+         with torch.no_grad():
+             outputs = self.esm_model(**inputs)
+             embeddings = outputs.last_hidden_state
+
+         if config['type'] == 'xgboost':
+             attention_mask = inputs['attention_mask']
+             masked_embeddings = embeddings * attention_mask.unsqueeze(-1)
+             sum_embeddings = masked_embeddings.sum(dim=1)
+             seq_lengths = attention_mask.sum(dim=1, keepdim=True)
+             mean_embeddings = sum_embeddings / seq_lengths
+             features = mean_embeddings.cpu().numpy()
+
+             dmatrix = xgb.DMatrix(features)
+             prediction = model.predict(dmatrix)[0]
+
+             if config.get('inverse_score', False):
+                 prediction = 1 - prediction
+
+         elif config['type'] == 'pytorch_cnn':
+             prediction = model(embeddings).squeeze().item()
+
+             if 'transform' in config:
+                 prediction = config['transform'](prediction)
+
+         return prediction
+
+     def predict_binding(self, model, protein_seq, binder_seq, prefer_thresholds: bool = True):
+         """Predict (affinity, class_label). If prefer_thresholds=True, the label is derived from model.tight/weak thresholds."""
+         protein_tokens = self.tokenizer(
+             protein_seq, return_tensors="pt",
+             padding="max_length", max_length=1024, truncation=True
+         )
+         binder_tokens = self.tokenizer(
+             binder_seq, return_tensors="pt",
+             padding="max_length", max_length=1024, truncation=True
+         )
+         protein_ids = protein_tokens['input_ids'].to(self.device)
+         protein_mask = protein_tokens['attention_mask'].to(self.device)
+         binder_ids = binder_tokens['input_ids'].to(self.device)
+         binder_mask = binder_tokens['attention_mask'].to(self.device)
+
+         with torch.no_grad():
+             reg, logits = model(protein_ids, binder_ids, protein_mask, binder_mask)
+         affinity = float(reg.squeeze().item())
+         # 1) threshold-based class:
+         cls_by_thr = int(model.get_binding_class(affinity))
+         # 2) logits-based class:
+         cls_by_logit = int(torch.argmax(logits, dim=-1).item())
+
+         # choose which one to show
+         cls_idx = cls_by_thr if prefer_thresholds else cls_by_logit
+
+         # decorate with explicit cutoffs for UI clarity
+         if cls_idx == 0:
+             label = f"Tight (≥ {model.tight_threshold:.1f})"
+         elif cls_idx == 1:
+             label = f"Medium ({model.weak_threshold:.1f}–{model.tight_threshold:.1f})"
+         else:
+             label = f"Weak (< {model.weak_threshold:.1f})"
+
+         return affinity, label
+
+     def predict_binding_smiles(self, model, protein_seq: str, smiles_str: str, prefer_thresholds: bool = True) -> tuple[float, str]:
+         prot_vec = self._get_protein_vec(protein_seq)     # [1, 1280]
+         smiles_vec = self._smiles_embed_mean(smiles_str)  # [1, 768]
+         with torch.no_grad():
+             reg, logits = model(prot_vec, smiles_vec)
+         affinity = float(reg.squeeze().item())
+         cls_by_thr = int(model.get_binding_class(affinity))
+         cls_by_logit = int(torch.argmax(logits, dim=-1).item())
+
+         cls_idx = cls_by_thr if prefer_thresholds else cls_by_logit
+
+         if cls_idx == 0:
+             label = f"Tight (≥ {model.tight_threshold:.1f})"
+         elif cls_idx == 1:
+             label = f"Medium ({model.weak_threshold:.1f}–{model.tight_threshold:.1f})"
+         else:
+             label = f"Weak (< {model.weak_threshold:.1f})"
+         return affinity, label
+
+     def _features_from_smiles_peptclm(self, s: str) -> np.ndarray:
+         return self.smiles_featurizer.embed_list([s])[0]
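+
+     # Usage sketch (assumes the checkpoints in models/ loaded successfully; initialize()
+     # is defined below):
+     #   p = initialize()
+     #   aff, label = p.predict_binding(p.models['binding_affinity'], protein_seq, binder_seq)
+     #   aff, label = p.predict_binding_smiles(p.models['binding_affinity_smiles'], protein_seq, smiles)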
+
+
+ # ==================== Gradio Interface ====================
+
+ # Global predictor
+ predictor = None
+
+ def initialize():
+     """Initialize the predictor"""
+     global predictor
+     if predictor is None:
+         predictor = UnifiedPeptidePredictor(model_dir="models")
+     return predictor
+
+
+ def predict_properties(
+     input_text: str,
+     input_type: str,
+     protein_seq: str,
+     # Individual property checkboxes
+     hemolysis: bool,
+     solubility: bool,
+     permeability: bool,
+     half_life: bool,
+     nonfouling: bool,
+     binding_affinity: bool,
+     progress=gr.Progress()
+ ):
+     """Main prediction function"""
+
+     if not input_text or not input_text.strip():
+         return None, "⚠️ Please provide an input sequence"
+
+     lines = [s.strip() for s in input_text.split("\n") if s.strip()]
+
+     if input_type == "SMILES":
+         bad = [s for s in lines if not is_smiles_like(s)]
+         if bad:
+             return None, f"⚠️ You selected SMILES but {len(bad)} input line(s) don't look like SMILES. Example bad line: {bad[0][:60]}"
+         if binding_affinity and not protein_seq:
+             return None, "⚠️ For SMILES binding, please provide a protein sequence in the 'Protein Sequence' box."
+     else:
+         bad = [s for s in lines if not is_aa_sequence_like(s)]
+         if bad:
+             return None, f"⚠️ You selected Sequence but {len(bad)} input line(s) don't look like amino-acid sequences. Example bad line: {bad[0][:60]}"
+     pred = initialize()
+     results = []
+
+     # Map UI checkboxes to internal model keys (binding is handled separately below)
+     checkbox_to_keys = {
+         'hemolysis': ['hemolysis_seq', 'hemolysis_smiles'],
+         'solubility': ['solubility_seq', 'solubility_smiles'],
+         'permeability': ['permeability_smiles'],  # only a SMILES model in the current config
+         'half_life': ['half_life_seq'],           # no SMILES half-life model is registered yet
+         'nonfouling': ['nonfouling_seq', 'nonfouling_smiles'],
+     }
+     selected_properties = []
+     for ui_name, is_selected in {
+         'hemolysis': hemolysis,
+         'solubility': solubility,
+         'permeability': permeability,
+         'half_life': half_life,
+         'nonfouling': nonfouling,
+     }.items():
+         if not is_selected:
+             continue
+         # choose the variant that matches the current input type
+         keys = checkbox_to_keys.get(ui_name, [])
+         for key in keys:
+             if key in pred.model_configs:
+                 expected_input = pred.model_configs[key].get('input', 'sequence')
+                 if (input_type == 'SMILES' and expected_input == 'smiles') or \
+                    (input_type == 'Sequence' and expected_input == 'sequence'):
+                     if key in pred.models:
+                         selected_properties.append(key)
+
+     # Process sequences for regular properties
+     if selected_properties:
+         sequences = lines  # already parsed and validated above
+
+         for seq_idx, seq in enumerate(sequences):
+             progress((seq_idx + 1) / len(sequences), f"Processing sequence {seq_idx + 1}/{len(sequences)}")
+
+             for prop in selected_properties:
+                 config = pred.model_configs[prop]
+                 model = pred.models[prop]
+
+                 try:
+                     value = pred.predict_property(model, config, seq, input_type)
+
+                     stat_key = _base_stat_key(prop)
+                     threshold = pred.data_manager.statistics.get(stat_key, {}).get('threshold')
+                     if threshold is not None:
+                         # which direction?
+                         if stat_key in ['hemolysis']:  # lower is better
+                             label = config['positive_label'] if value < threshold else config['negative_label']
+                         else:  # higher is better by default for these properties
+                             label = config['positive_label'] if value > threshold else config['negative_label']
+                     else:
+                         label = ""
+
+                     # Clickable property name (not yet wired into the dataframe; see the JS stub below)
+                     prop_display = f'<a href="#" onclick="show_distribution(\'{prop}\', {value})">{config["display_name"]}</a>'
+
+                     results.append({
+                         'Sequence': seq[:30] + '...' if len(seq) > 30 else seq,
+                         'Property': config["display_name"],
+                         'Prediction': label,
+                         'Value': f"{value:.3f}",
+                         'Unit': config['unit']
+                     })
+                 except Exception as e:
+                     print(f"Error predicting {prop}: {e}")
+
+     # Handle binding affinity separately
+     if binding_affinity and input_text:
+         # Sequence–Sequence binding
+         if input_type == "Sequence":
+             if 'binding_affinity' in pred.models:
+                 progress(0.9, "Predicting binding affinity (sequence) ...")
+                 if not protein_seq:
+                     return None, "⚠️ Please provide a protein sequence for binding prediction."
+                 try:
+                     for binder_seq in lines:
+                         affinity, binding_class = pred.predict_binding(
+                             pred.models['binding_affinity'],
+                             protein_seq,
+                             binder_seq
+                         )
+                         results.append({
+                             'Sequence': f"Protein–{binder_seq[:20]}...",
+                             'Property': pred.model_configs['binding_affinity']['display_name'],
+                             'Prediction': binding_class,  # e.g., Tight/Medium/Weak
+                             'Value': f"{affinity:.3f}",
+                             'Unit': pred.model_configs['binding_affinity']['unit']
+                         })
+                 except Exception as e:
+                     print(f"Error in sequence binding prediction: {e}")
+
+         # Sequence + SMILES binding
+         else:  # input_type == "SMILES"
+             if 'binding_affinity_smiles' not in pred.models:
+                 return None, "⚠️ SMILES binding model not loaded. Please add the checkpoint to models/ and restart."
+             if not protein_seq:
+                 return None, "⚠️ For SMILES binding, please provide a protein sequence."
+             # quick AA check for protein_seq
+             if not is_aa_sequence_like(protein_seq):
+                 return None, "⚠️ The provided protein sequence does not look like an amino-acid sequence."
+             progress(0.9, "Predicting binding affinity (SMILES) ...")
+             try:
+                 for smi in lines:
+                     affinity, label = pred.predict_binding_smiles(
+                         pred.models['binding_affinity_smiles'],
+                         protein_seq,
+                         smi
+                     )
+                     results.append({
+                         'Sequence': f"Protein–{smi[:20]}...",
+                         'Property': pred.model_configs['binding_affinity_smiles']['display_name'],
+                         'Prediction': label,  # Tight (≥7.5) / Medium (6.0–7.5) / Weak (<6.0)
+                         'Value': f"{affinity:.3f}",
+                         'Unit': pred.model_configs['binding_affinity_smiles']['unit'],
+                     })
+             except Exception as e:
+                 print(f"Error in SMILES binding prediction: {e}")
+
+     if not results:
+         return None, "⚠️ Please select at least one property to predict"
+
+     # Create summary
+     n_sequences = len(set(r['Sequence'] for r in results))
+     n_properties = len(set(r['Property'] for r in results))
+
+     status = f"✅ Completed {len(results)} predictions ({n_sequences} sequence(s), {n_properties} properties)"
+     if binding_affinity:
+         status += " \n**Binding class cutoffs:** Tight ≥ 7.5, Medium 6.0–7.5, Weak < 6.0"
+
+     return pd.DataFrame(results), status
+
+
+ def show_distribution(property_name, predicted_value=None):
+     """Show distribution plot + info for selected property."""
+     pred = initialize()
+     if not property_name:
+         return None, "Select a property to view its distribution."
+
+     # Get the first property if a list was passed
+     prop = property_name[0] if isinstance(property_name, list) else property_name
+
+     # Generate the plot (works for both binary & continuous)
+     fig = pred.data_manager.get_distribution_plot(prop, predicted_value)
+
+     # Build info panel with correct fields per kind
+     stats = pred.data_manager.statistics.get(prop, {})
+     kind = stats.get("kind", "continuous")
+     info = pred.data_manager.get_property_info(prop)
+
+     if not info:
+         return fig, "No information available for this property."
+
+     title = prop.replace('_', ' ').title()
+
+     if kind == "binary":
+         n_pos = info.get("n_pos", int((stats.get("values") == 1).sum() if "values" in stats else 0))
+         n_neg = info.get("n_neg", int((stats.get("values") == 0).sum() if "values" in stats else 0))
+         total = max(n_pos + n_neg, 1)
+         info_text = f"""
+ ### {title} Information
+
+ **Description:** {info.get('description', '')}
+
+ **Statistics (Binary):**
+ - Samples: {info['n_samples']:,}
+ - Positives (1): {n_pos:,} ({n_pos/total:.1%})
+ - Negatives (0): {n_neg:,} ({n_neg/total:.1%})
+ """
+     else:
+         p = info.get("percentiles", {})
+         info_text = f"""
+ ### {title} Information
+
+ **Description:** {info.get('description', '')}
+
+ **Statistics:**
+ - Samples: {info['n_samples']:,}
+ - Mean: {info['mean']:.3f} {info['unit']}
+ - Std Dev: {info['std']:.3f}
+ - Range: [{info['min']:.3f}, {info['max']:.3f}]
+
+ **Percentiles:**
+ - 10%: {p.get('10%', float('nan')):.3f}
+ - 25%: {p.get('25%', float('nan')):.3f}
+ - 50% (median): {p.get('50% (median)', float('nan')):.3f}
+ - 75%: {p.get('75%', float('nan')):.3f}
+ - 90%: {p.get('90%', float('nan')):.3f}
+ """
+
+     return fig, info_text
+
+
+ def load_example(example_name):
+     """Load example sequences"""
+     examples = {
+         "T7": ("HAIYPRH", ""),
+         "Protein-Peptide": ("MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLST", "GIVEQCCTSICSLYQLENYCN")
+     }
+
+     if example_name in examples:
+         if example_name == "Protein-Peptide":
+             return examples[example_name][1], examples[example_name][0]  # Binder, Protein
+         else:
+             return examples[example_name][0], ""
+     return "", ""
+
+
+ # ==================== Gradio App ====================
+
+ custom_css = """
+ .gradio-container {
+     font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+ }
+
+ .gr-button-primary {
+     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+     border: none !important;
+     color: white !important;
+ }
+
+ .gr-button-primary:hover {
+     transform: translateY(-1px);
+     box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4);
+ }
+
+ h1 {
+     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+     -webkit-background-clip: text;
+     -webkit-text-fill-color: transparent;
+     font-size: 2.5em !important;
+     text-align: center;
+     margin-bottom: 10px !important;
+ }
+
+ table {
+     font-size: 14px !important;
+ }
+
+ .property-result:hover {
+     background: #f0f0f0;
+     cursor: pointer;
+ }
+ """
+
+ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo")) as demo:
+
+     # Header
+     gr.Markdown(
+         """
+         # ☄️ PeptiVerse
+         ### Peptide Property Predictions
+         """
+     )
+
+     with gr.Tabs():
+         # Main Prediction Tab
+         with gr.TabItem("🔬 Predict"):
+             with gr.Row():
+                 # Input Section
+                 with gr.Column(scale=1):
+                     with gr.Group():
+                         gr.Markdown("### 📝 Input")
+
+                         input_type = gr.Radio(
+                             ["Sequence", "SMILES"],
+                             label="Input Type",
+                             value="Sequence"
+                         )
+
+                         input_text = gr.Textbox(
+                             label="Peptide Sequence(s) / Binder",
+                             placeholder="Enter amino acid sequence(s), one per line",
+                             lines=6
+                         )
+
+                         protein_seq = gr.Textbox(
+                             label="Protein Sequence (for binding prediction)",
+                             placeholder="Enter protein sequence for binding affinity prediction",
+                             lines=3,
+                             visible=False
+                         )
+
+                         gr.Markdown("**Examples:**")
+                         example_dropdown = gr.Dropdown(
+                             choices=["T7", "Protein-Peptide"],
+                             label="Load Example",
+                             interactive=True
+                         )
+
+                         file_input = gr.File(
+                             label="Or Upload File",
+                             file_types=[".txt", ".fasta", ".fa"],
+                             visible=False
+                         )
+
+                 # Property Selection
+                 with gr.Column(scale=1):
+                     with gr.Group():
+                         gr.Markdown("### ⚙️ Select Properties")
+
+                         with gr.Accordion("Sequence Properties", open=True):
+                             hemolysis = gr.Checkbox(label="🩸 Hemolysis ↓", value=True)
+                             solubility = gr.Checkbox(label="💧 Solubility ↑", value=True)
+                             permeability = gr.Checkbox(label="🪣 Permeability ↑", value=False)
+                             half_life = gr.Checkbox(label="⏱️ Half-life ↑", value=False)
+                             nonfouling = gr.Checkbox(label="👯 Non-Fouling ↑", value=False)
+                         with gr.Accordion("Binding Prediction", open=False):
+                             binding_affinity = gr.Checkbox(label="🔗 Binding Affinity ↑", value=False)
+                             gr.Markdown("*Requires protein sequence input*")
+
+         # Distribution Analysis Tab
+         with gr.TabItem("📊 Distributions"):
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     property_selector = gr.Dropdown(
+                         choices=["hemolysis", "solubility", "permeability", "half_life",
+                                  "nonfouling", "binding_affinity"],
+                         label="Select Property",
+                         value="hemolysis"
+                     )
+                     test_value = gr.Number(label="Test Value among Distribution", value=None)
+                     show_dist_btn = gr.Button("Show Distribution")
+
+                 with gr.Column(scale=2):
+                     dist_plot_tab = gr.Plot(label="Score Distribution")
+                     dist_info_tab = gr.Markdown()
+
+         # Data Documentation Tab
+         with gr.TabItem("📚 Documentation"):
+             file_path = "description.md"
+             markdown_content = "*Documentation file not found.*"  # fallback so gr.Markdown below never sees an undefined name
+             try:
+                 with open(file_path, "r", encoding="utf-8") as f:
+                     markdown_content = f.read()
+             except FileNotFoundError:
+                 print(f"Error: The file '{file_path}' was not found.")
+             except Exception as e:
+                 print(f"An error occurred: {e}")
+             gr.Markdown(markdown_content)
+
+     # Action Buttons
+     with gr.Row():
+         clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+         predict_btn = gr.Button("🚀 Predict Properties", variant="primary", scale=2)
+
+     # Status
+     status_output = gr.Markdown("")
+
+     # Results Section
+     with gr.Group():
+         gr.Markdown("### 📊 Results")
+         gr.Markdown("*Click on property names to view distribution plots*")
+
+         results_df = gr.Dataframe(
+             headers=["Sequence", "Property", "Prediction", "Value", "Unit"],
+             datatype=["str", "str", "str", "str", "str"],
+             interactive=False
+         )
+
+     # Hidden components for distribution modal
+     with gr.Row(visible=False) as distribution_row:
+         with gr.Column():
+             selected_property = gr.Textbox(visible=False)
+             dist_plot_modal = gr.Plot()      # renamed to avoid clashing with the tab-level plot
+             dist_info_modal = gr.Markdown()  # renamed to avoid clashing with the tab-level info
+             close_btn = gr.Button("Close")
+
+     # Footer
+     gr.Markdown(
+         """
+         ---
+         <div style='text-align: center; color: #6b7280;'>
+             <p>Models: ESM2-650M embeddings + XGBoost/CNN classifiers</p>
+             <p style='font-size: 0.9em;'>Click on property names in results to view training data distributions</p>
+         </div>
+         """
+     )
+
+     # Event Handlers
+     def update_visibility(binding_checked):
+         return gr.update(visible=binding_checked)
+
+     binding_affinity.change(
+         update_visibility,
+         inputs=[binding_affinity],
+         outputs=[protein_seq]
+     )
+
+     example_dropdown.change(
+         load_example,
+         inputs=[example_dropdown],
+         outputs=[input_text, protein_seq]
+     )
+
+     predict_btn.click(
+         predict_properties,
+         inputs=[
+             input_text, input_type, protein_seq,
+             hemolysis, solubility, permeability,
+             half_life, nonfouling,
+             binding_affinity
+         ],
+         outputs=[results_df, status_output]
+     )
+
+     clear_btn.click(
+         lambda: ("", "", None, ""),
+         outputs=[input_text, protein_seq, results_df, status_output]
+     )
+
+     # Add JavaScript for clickable property names
+     demo.load(js="""
+     function show_distribution(property, value) {
+         // This would open a modal with the distribution
+         console.log('Show distribution for', property, 'with value', value);
+     }
+     """)
+     show_dist_btn.click(
+         show_distribution,
+         inputs=[property_selector, test_value],
+         outputs=[dist_plot_tab, dist_info_tab]
+     )
+
+ if __name__ == "__main__":
+     print("Initializing models...")
+     initialize()
+     print("Ready!")
+     demo.launch(share=True)
description.md ADDED
@@ -0,0 +1,70 @@
1
+ ## Data Sources and Methods
2
+
3
+ ### Training Data Collection
4
+
5
+ Our models are trained on curated datasets from multiple sources:
6
+
7
+ #### Hemolysis Dataset
8
+ - **Primary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
9
+ - **Secondary Source:** the Database of Antimicrobial Activity and Structure of Peptides (DBAASPv3)
10
+ - **Size:** 9,316 peptides, with 19.6% being positive (hemolytic) and 80.4% being negative (nonhemolytic)
11
+ - **Description:** Probability of peptide disrupting red blood cell membranes.
12
+ - **Download:** [hemolysis_training_data.csv](#)
13
+
14
+ #### Solubility Dataset
15
+ - **Primary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
16
+ - **Secondary Source:** PROSO-II
17
+ - **Size:** 18,453 sequences, with 47.6% being labeled as positives and 52.4% being labeled as negatives
18
+ - **Description:** Probability of peptide remaining dissolved in aqueous conditions.
19
+ - **Download:** [solubility_training_data.csv](#)
20
+
21
+ #### Non-Fouling Dataset
22
+ - **Primary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
23
+ - **Secondary Source:** [Classifying antimicrobial and multifunctional peptides with Bayesian network models](https://doi.org/10.1002/pep2.24079)
24
+ - **Size:** 3,600 positive, 13,585 negative
25
+ - **Description:** A nonfouling peptide resists nonspecific interactions and protein adsorption.
26
+ - **Download:** [solubility_training_data.csv](#)
27
+
28
+ #### Permeability Dataset
29
+ - **Primary Source:** [PepLand](https://arxiv.org/abs/2311.04419)
30
+ - **Secondary Source:** CycPeptMPDB
31
+ - **Size:** 1162 positive and negative for nanonical samples each (22 relevant cell-penetrating peptide databases by compiling literature on existing cell-penetrating peptide prediction models ); CycPeptMPDB provides extra 7334 cyclic peptides
32
+ - **Description:** Probability of peptide penetrating the cell membrane.
33
+ - **Download:** [binding_affinity_training_data.csv](#)
34
+
35
+ #### Half-life Dataset
36
+ - **Primary Source:** [Thpdb2](https://doi.org/10.1016/j.drudis.2024.104047), [PepTherDia](https://doi.org/10.1016/j.drudis.2021.02.019), [peplife](https://www.nature.com/articles/srep36617)
37
+ - **Size:** 105 wild-type peptides (275 when noncanonical peptides are included); human data only
38
+ - **Clean-ups:** All half-life values are transformed to log(hours); see the sketch below
39
+ - **Download:** [half_life_training_data.csv](#)
40
+
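+ 
+ As a reference for reproducing this step, here is a minimal sketch of the transform; the logarithm base and the `half_life_hours` column name are assumptions, not taken from the source notes:
+ 
+ ```python
+ import numpy as np
+ import pandas as pd
+ 
+ df = pd.read_csv("training_data/half_life_smiles.csv")  # file shipped in this repo via LFS
+ # Half-lives span several orders of magnitude, so models train on the log scale.
+ df["log_half_life"] = np.log10(df["half_life_hours"])  # base 10 assumed; column name illustrative
+ ```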
41
+
42
+ #### Binding Affinity Dataset
43
+ - **Primary Source:** [PepLand](https://arxiv.org/abs/2311.04419)
44
+ - **Size:** 1,781 protein-peptide complexes, covering canonical and non-canonical peptides
45
+ - **Description:** Probability of peptide-protein binding; affinity values come pre-normalized from PepLand, combining IC50/EC50 measurements.
46
+ - **Quality:** Binding class cutoffs on the normalized affinity scale: Tight ≥ 7.5, Medium 6.0–7.5, Weak < 6.0 (see the sketch below)
47
+ - **Download:** [binding_affinity_training_data.csv](#)
48
+
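+ 
+ The cutoffs above map directly to a labeling function. A minimal sketch, assuming the normalized affinity behaves like a pIC50/pEC50-style value (the source does not state the exact scale):
+ 
+ ```python
+ def binding_class(affinity: float) -> str:
+     """Assign a binding class from a PepLand-normalized affinity value."""
+     if affinity >= 7.5:   # Tight >= 7.5
+         return "Tight"
+     if affinity >= 6.0:   # Medium 6.0-7.5
+         return "Medium"
+     return "Weak"         # Weak < 6.0
+ 
+ assert binding_class(7.5) == "Tight" and binding_class(6.0) == "Medium" and binding_class(5.9) == "Weak"
+ ```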
49
+ ### Model Architecture
50
+
51
+ - **Sequence Embeddings:** ESM-2 650M parameter model
52
+ - **XGBoost Models:** Gradient boosting on pooled ESM embeddings (see the sketch after this list)
53
+ - **CNN Models:** 1D convolutional networks with attention mechanisms
54
+ - **Binding Model:** Cross-attention between protein and peptide representations
55
+
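+ 
+ A minimal sketch of the embedding-plus-XGBoost path, assuming the public `facebook/esm2_t33_650M_UR50D` checkpoint and mean pooling over residues; the app's actual pooling and hyperparameters may differ:
+ 
+ ```python
+ import torch
+ import xgboost as xgb
+ from transformers import EsmModel, EsmTokenizer
+ 
+ tokenizer = EsmTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
+ model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").eval()
+ 
+ def embed(sequence: str) -> torch.Tensor:
+     """Mean-pool the last hidden state into one 1280-d vector per peptide."""
+     inputs = tokenizer(sequence, return_tensors="pt")
+     with torch.no_grad():
+         hidden = model(**inputs).last_hidden_state  # (1, seq_len, 1280)
+     return hidden.mean(dim=1).squeeze(0)
+ 
+ # A property head is then gradient boosting over the pooled vectors, e.g.:
+ # clf = xgb.XGBClassifier(n_estimators=500)
+ # clf.fit(train_embeddings, train_labels)
+ ```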
56
+ ### Citation
57
+
58
+ If you use this tool, please cite:
59
+ ```
60
+ @article{peptiprop2024,
61
+ title={PeptiProp: Unified Platform for Peptide Property Prediction},
62
+ author={Your Name et al.},
63
+ journal={Journal Name},
64
+ year={2024}
65
+ }
66
+ ```
67
+
68
+ ### Contact
69
+
70
+ For questions or collaborations: [contact@example.com](mailto:contact@example.com)
requirements.txt ADDED
@@ -0,0 +1,11 @@
1
+ gradio>=4.0.0
2
+ pandas>=2.0.0
3
+ numpy>=1.24.0
4
+ plotly>=5.14.0
5
+ torch>=2.0.0
6
+ transformers==4.46.0
7
+ scikit-learn>=1.3.0
8
+ biopython>=1.81
9
+ rdkit>=2023.3.1
10
+ seaborn
11
+ SmilesPE
tokenizer/__init__.py ADDED
File without changes
tokenizer/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (136 Bytes).
 
tokenizer/__pycache__/my_tokenizers.cpython-310.pyc ADDED
Binary file (16.2 kB). View file
 
tokenizer/my_tokenizers.py ADDED
@@ -0,0 +1,424 @@
1
+ import collections
2
+ import os
3
+ import re
4
+ from typing import List, Optional
5
+ from transformers import PreTrainedTokenizer
6
+ from SmilesPE.tokenizer import SPE_Tokenizer
7
+ import torch
8
+
9
+ def load_vocab(vocab_file):
10
+ """Loads a vocabulary file into a dictionary."""
11
+ vocab = collections.OrderedDict()
12
+ with open(vocab_file, "r", encoding="utf-8") as reader:
13
+ tokens = reader.readlines()
14
+ for index, token in enumerate(tokens):
15
+ token = token.rstrip("\n")
16
+ vocab[token] = index
17
+ return vocab
18
+
19
+ class Atomwise_Tokenizer(object):
20
+ """Run atom-level SMILES tokenization"""
21
+
22
+ def __init__(self):
23
+ """ Constructs a atom-level Tokenizer.
24
+ """
25
+ # self.regex_pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
26
+ self.regex_pattern = r"(\([^\(\)]{0,4}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/\/?|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
27
+
28
+ self.regex = re.compile(self.regex_pattern)
29
+
30
+ def tokenize(self, text):
31
+ """ Basic Tokenization of a SMILES.
32
+ """
33
+ tokens = [token for token in self.regex.findall(text)]
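+ # e.g. "CC(=O)O" -> ['C', 'C', '(=O)', 'O']; short parenthesized groups match as single tokens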
34
+ return tokens
35
+
36
+ class SMILES_SPE_Tokenizer(PreTrainedTokenizer):
37
+ r"""
38
+ Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
39
+ This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
40
+ should refer to the superclass for more information regarding methods.
41
+ Args:
42
+ vocab_file (:obj:`string`):
43
+ File containing the vocabulary.
44
+ spe_file (:obj:`string`):
45
+ File containing the trained SMILES Pair Encoding vocabulary.
46
+ unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
47
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
48
+ token instead.
49
+ sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
50
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
51
+ for sequence classification or for a text and a question for question answering.
52
+ It is also used as the last token of a sequence built with special tokens.
53
+ pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
54
+ The token used for padding, for example when batching sequences of different lengths.
55
+ cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
56
+ The classifier token which is used when doing sequence classification (classification of the whole
57
+ sequence instead of per-token classification). It is the first token of the sequence when built with
58
+ special tokens.
59
+ mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
60
+ The token used for masking values. This is the token used when training this model with masked language
61
+ modeling. This is the token which the model will try to predict.
62
+ """
63
+
64
+ def __init__(self, vocab_file, spe_file,
65
+ unk_token="[UNK]",
66
+ sep_token="[SEP]",
67
+ pad_token="[PAD]",
68
+ cls_token="[CLS]",
69
+ mask_token="[MASK]",
70
+ **kwargs):
71
+ if not os.path.isfile(vocab_file):
72
+ raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file))
73
+ if not os.path.isfile(spe_file):
74
+ raise ValueError("Can't find a SPE vocabulary file at path '{}'.".format(spe_file))
75
+
76
+ self.vocab = load_vocab(vocab_file)
77
+ self.spe_vocab = open(spe_file, 'r', encoding='utf-8')
78
+ self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
79
+ self.spe_tokenizer = SPE_Tokenizer(self.spe_vocab)
80
+
81
+ super().__init__(
82
+ unk_token=unk_token,
83
+ sep_token=sep_token,
84
+ pad_token=pad_token,
85
+ cls_token=cls_token,
86
+ mask_token=mask_token,
87
+ **kwargs)
88
+
89
+ @property
90
+ def vocab_size(self):
91
+ return len(self.vocab)
92
+
93
+ def get_vocab(self):
94
+ return dict(self.vocab, **self.added_tokens_encoder)
95
+
96
+ def _tokenize(self, text):
97
+ return self.spe_tokenizer.tokenize(text).split(' ')
98
+
99
+ def _convert_token_to_id(self, token):
100
+ """ Converts a token (str) in an id using the vocab. """
101
+ return self.vocab.get(token, self.vocab.get(self.unk_token))
102
+
103
+ # changed encode and decode functions
104
+ def encode(self, token_array):
105
+ token_ids = []
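+ # ids follow new_vocab.txt ordering: 0=[PAD], 1=[UNK], 2=[CLS], 3=[SEP], 4=[MASK]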
106
+ token_ids.append(2)
107
+ for token in token_array:
108
+ id = self._convert_token_to_id(token)
109
+ token_ids.append(id)
110
+ token_ids.append(3)
111
+ token_ids = torch.tensor([token_ids])
112
+ attn_mask = torch.ones_like(token_ids)
113
+ return {'input_ids': token_ids, 'attention_mask': attn_mask}
114
+
115
+ def decode(self, token_ids, skip_special_tokens=True):
116
+ token_ids = token_ids.squeeze(0).cpu().tolist()
117
+ token_array = []
118
+ for idx in token_ids:
119
+ if idx == 3: # Stop decoding when token ID 3 is encountered
120
+ break
121
+ if skip_special_tokens and idx in self.all_special_ids:
122
+ continue
123
+ token = self._convert_id_to_token(idx)
124
+ token_array.append(token)
125
+ sequence = "".join(token_array)
126
+ return sequence
127
+
128
+ def batch_decode(self, batch_token_ids, skip_special_tokens=True):
129
+ sequences = []
130
+ for token_ids in batch_token_ids:
131
+ sequences.append(self.decode(token_ids))
132
+ return sequences
133
+
134
+ def get_token_split(self, token_ids):
135
+ if isinstance(token_ids, torch.Tensor):
136
+ token_ids = token_ids.cpu().tolist()
137
+
138
+ token_array = []
139
+ for seq_ids in token_ids:
140
+ seq_array = []
141
+ for id in seq_ids:
142
+ token = self._convert_id_to_token(id)
143
+ seq_array.append(token)
144
+ token_array.append(seq_array)
145
+
146
+ return token_array
147
+
148
+ def _convert_id_to_token(self, index):
149
+ """Converts an index (integer) in a token (str) using the vocab."""
150
+ return self.ids_to_tokens.get(index, self.unk_token)
151
+
152
+ def convert_tokens_to_string(self, tokens):
153
+ """ Converts a sequence of tokens (string) in a single string. """
154
+ out_string = " ".join(tokens).replace(" ##", "").strip()
155
+ return out_string
156
+
157
+ def build_inputs_with_special_tokens(
158
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
159
+ ) -> List[int]:
160
+ """
161
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks
162
+ by concatenating and adding special tokens.
163
+ A BERT sequence has the following format:
164
+ - single sequence: ``[CLS] X [SEP]``
165
+ - pair of sequences: ``[CLS] A [SEP] B [SEP]``
166
+ Args:
167
+ token_ids_0 (:obj:`List[int]`):
168
+ List of IDs to which the special tokens will be added
169
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
170
+ Optional second list of IDs for sequence pairs.
171
+ Returns:
172
+ :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
173
+ """
174
+ if token_ids_1 is None:
175
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
176
+ cls = [self.cls_token_id]
177
+ sep = [self.sep_token_id]
178
+ return cls + token_ids_0 + sep + token_ids_1 + sep
179
+
180
+ def get_special_tokens_mask(
181
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
182
+ ) -> List[int]:
183
+ """
184
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
185
+ special tokens using the tokenizer ``prepare_for_model`` method.
186
+ Args:
187
+ token_ids_0 (:obj:`List[int]`):
188
+ List of ids.
189
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
190
+ Optional second list of IDs for sequence pairs.
191
+ already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
192
+ Set to True if the token list is already formatted with special tokens for the model
193
+ Returns:
194
+ :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
195
+ """
196
+
197
+ if already_has_special_tokens:
198
+ if token_ids_1 is not None:
199
+ raise ValueError(
200
+ "You should not supply a second sequence if the provided sequence of "
201
+ "ids is already formated with special tokens for the model."
202
+ )
203
+ return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
204
+
205
+ if token_ids_1 is not None:
206
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
207
+ return [1] + ([0] * len(token_ids_0)) + [1]
208
+
209
+ def create_token_type_ids_from_sequences(
210
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
211
+ ) -> List[int]:
212
+ """
213
+ Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
214
+ A BERT sequence pair mask has the following format:
215
+ ::
216
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
217
+ | first sequence | second sequence |
218
+ if token_ids_1 is None, only returns the first portion of the mask (0's).
219
+ Args:
220
+ token_ids_0 (:obj:`List[int]`):
221
+ List of ids.
222
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
223
+ Optional second list of IDs for sequence pairs.
224
+ Returns:
225
+ :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
226
+ sequence(s).
227
+ """
228
+ sep = [self.sep_token_id]
229
+ cls = [self.cls_token_id]
230
+ if token_ids_1 is None:
231
+ return len(cls + token_ids_0 + sep) * [0]
232
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
233
+
234
+ def save_vocabulary(self, vocab_path):
235
+ """
236
+ Save the vocabulary (a copy of the original file, one token per line in id order) to the given path.
237
+ Args:
238
+ vocab_path (:obj:`str`):
239
+ The directory in which to save the vocabulary.
240
+ Returns:
241
+ :obj:`Tuple(str)`: Paths to the files saved.
242
+ """
243
+ index = 0
244
+ vocab_file = vocab_path
245
+ with open(vocab_file, "w", encoding="utf-8") as writer:
246
+ for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
247
+ if index != token_index:
248
+ index = token_index
249
+ writer.write(token + "\n")
250
+ index += 1
251
+ return (vocab_file,)
252
+
253
+ class SMILES_Atomwise_Tokenizer(PreTrainedTokenizer):
254
+ r"""
255
+ Constructs an atom-level SMILES tokenizer that splits a SMILES string into per-atom and per-bond tokens.
256
+ This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
257
+ should refer to the superclass for more information regarding methods.
258
+ Args:
259
+ vocab_file (:obj:`string`):
260
+ File containing the vocabulary.
261
+ unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
262
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
263
+ token instead.
264
+ sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
265
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
266
+ for sequence classification or for a text and a question for question answering.
267
+ It is also used as the last token of a sequence built with special tokens.
268
+ pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
269
+ The token used for padding, for example when batching sequences of different lengths.
270
+ cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
271
+ The classifier token which is used when doing sequence classification (classification of the whole
272
+ sequence instead of per-token classification). It is the first token of the sequence when built with
273
+ special tokens.
274
+ mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
275
+ The token used for masking values. This is the token used when training this model with masked language
276
+ modeling. This is the token which the model will try to predict.
277
+ """
278
+
279
+ def __init__(
280
+ self,
281
+ vocab_file,
282
+ unk_token="[UNK]",
283
+ sep_token="[SEP]",
284
+ pad_token="[PAD]",
285
+ cls_token="[CLS]",
286
+ mask_token="[MASK]",
287
+ **kwargs
288
+ ):
289
+ super().__init__(
290
+ unk_token=unk_token,
291
+ sep_token=sep_token,
292
+ pad_token=pad_token,
293
+ cls_token=cls_token,
294
+ mask_token=mask_token,
295
+ **kwargs,
296
+ )
297
+
298
+ if not os.path.isfile(vocab_file):
299
+ raise ValueError(
300
+ "Can't find a vocabulary file at path '{}'.".format(vocab_file)
301
+ )
302
+ self.vocab = load_vocab(vocab_file)
303
+ self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
304
+ self.tokenizer = Atomwise_Tokenizer()
305
+
306
+ @property
307
+ def vocab_size(self):
308
+ return len(self.vocab)
309
+
310
+ def get_vocab(self):
311
+ return dict(self.vocab, **self.added_tokens_encoder)
312
+
313
+
314
+ def _tokenize(self, text):
315
+ return self.tokenizer.tokenize(text)
316
+
317
+ def _convert_token_to_id(self, token):
318
+ """ Converts a token (str) in an id using the vocab. """
319
+ return self.vocab.get(token, self.vocab.get(self.unk_token))
320
+
321
+ def _convert_id_to_token(self, index):
322
+ """Converts an index (integer) in a token (str) using the vocab."""
323
+ return self.ids_to_tokens.get(index, self.unk_token)
324
+
325
+ def convert_tokens_to_string(self, tokens):
326
+ """ Converts a sequence of tokens (string) in a single string. """
327
+ out_string = " ".join(tokens).replace(" ##", "").strip()
328
+ return out_string
329
+
330
+ def build_inputs_with_special_tokens(
331
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
332
+ ) -> List[int]:
333
+ """
334
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks
335
+ by concatenating and adding special tokens.
336
+ A BERT sequence has the following format:
337
+ - single sequence: ``[CLS] X [SEP]``
338
+ - pair of sequences: ``[CLS] A [SEP] B [SEP]``
339
+ Args:
340
+ token_ids_0 (:obj:`List[int]`):
341
+ List of IDs to which the special tokens will be added
342
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
343
+ Optional second list of IDs for sequence pairs.
344
+ Returns:
345
+ :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
346
+ """
347
+ if token_ids_1 is None:
348
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
349
+ cls = [self.cls_token_id]
350
+ sep = [self.sep_token_id]
351
+ return cls + token_ids_0 + sep + token_ids_1 + sep
352
+
353
+ def get_special_tokens_mask(
354
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
355
+ ) -> List[int]:
356
+ """
357
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
358
+ special tokens using the tokenizer ``prepare_for_model`` method.
359
+ Args:
360
+ token_ids_0 (:obj:`List[int]`):
361
+ List of ids.
362
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
363
+ Optional second list of IDs for sequence pairs.
364
+ already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
365
+ Set to True if the token list is already formatted with special tokens for the model
366
+ Returns:
367
+ :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
368
+ """
369
+
370
+ if already_has_special_tokens:
371
+ if token_ids_1 is not None:
372
+ raise ValueError(
373
+ "You should not supply a second sequence if the provided sequence of "
374
+ "ids is already formated with special tokens for the model."
375
+ )
376
+ return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
377
+
378
+ if token_ids_1 is not None:
379
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
380
+ return [1] + ([0] * len(token_ids_0)) + [1]
381
+
382
+ def create_token_type_ids_from_sequences(
383
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
384
+ ) -> List[int]:
385
+ """
386
+ Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
387
+ A BERT sequence pair mask has the following format:
388
+ ::
389
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
390
+ | first sequence | second sequence |
391
+ if token_ids_1 is None, only returns the first portion of the mask (0's).
392
+ Args:
393
+ token_ids_0 (:obj:`List[int]`):
394
+ List of ids.
395
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
396
+ Optional second list of IDs for sequence pairs.
397
+ Returns:
398
+ :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
399
+ sequence(s).
400
+ """
401
+ sep = [self.sep_token_id]
402
+ cls = [self.cls_token_id]
403
+ if token_ids_1 is None:
404
+ return len(cls + token_ids_0 + sep) * [0]
405
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
406
+
407
+ def save_vocabulary(self, vocab_path):
408
+ """
409
+ Save the vocabulary (a copy of the original file, one token per line in id order) to the given path.
410
+ Args:
411
+ vocab_path (:obj:`str`):
412
+ The directory in which to save the vocabulary.
413
+ Returns:
414
+ :obj:`Tuple(str)`: Paths to the files saved.
415
+ """
416
+ index = 0
417
+ vocab_file = vocab_path
418
+ with open(vocab_file, "w", encoding="utf-8") as writer:
419
+ for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
420
+ if index != token_index:
421
+ index = token_index
422
+ writer.write(token + "\n")
423
+ index += 1
424
+ return (vocab_file,)
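+ 
+ # Minimal usage sketch (paths assumed; new_vocab.txt and new_splits.txt live in this folder):
+ # tok = SMILES_SPE_Tokenizer("tokenizer/new_vocab.txt", "tokenizer/new_splits.txt")
+ # batch = tok.encode(tok._tokenize("CC(=O)Nc1ccccc1"))   # {'input_ids': tensor, 'attention_mask': tensor}
+ # smiles = tok.decode(batch["input_ids"])                # decodes back, stopping at the [SEP] id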
tokenizer/new_splits.txt ADDED
@@ -0,0 +1,159 @@
1
+ c 1
2
+ c 2
3
+ c 3
4
+ c 4
5
+ c 5
6
+ c 6
7
+ c 7
8
+ c 8
9
+ c 9
10
+ ( c1
11
+ ( c2
12
+ c1 )
13
+ c2 )
14
+ n 1
15
+ n 2
16
+ n 3
17
+ n 4
18
+ n 5
19
+ n 6
20
+ n 7
21
+ n 8
22
+ n 9
23
+ ( n1
24
+ ( n2
25
+ n1 )
26
+ n2 )
27
+ O 1
28
+ O 2
29
+ O 3
30
+ O 4
31
+ O 5
32
+ O 6
33
+ O 7
34
+ O 8
35
+ O 9
36
+ ( O1
37
+ ( O2
38
+ O2 )
39
+ O2 )
40
+ = O
41
+ = C
42
+ = c
43
+ = N
44
+ = n
45
+ =C C
46
+ =C N
47
+ =C c
48
+ =c c
49
+ =N C
50
+ =N c
51
+ =n C
52
+ =n c
53
+ # N
54
+ # C
55
+ #N C
56
+ #C C
57
+ #C N
58
+ #N N
59
+ ( C
60
+ C )
61
+ ( O
62
+ O )
63
+ ( N
64
+ N )
65
+ Br c
66
+ ( =O
67
+ (=O )
68
+ C (=O)
69
+ C =O
70
+ C =N
71
+ C #N
72
+ C #C
73
+ C C
74
+ CC C
75
+ CC N
76
+ CC O
77
+ CC S
78
+ CC c
79
+ CC n
80
+ C N
81
+ CN C
82
+ CN c
83
+ C O
84
+ CO C
85
+ CO N
86
+ CO c
87
+ C S
88
+ CS C
89
+ CS S
90
+ CS c
91
+ C c
92
+ Cl c
93
+ C n
94
+ F c
95
+ N C
96
+ NC C
97
+ NC c
98
+ N N
99
+ N O
100
+ N c
101
+ N n
102
+ O C
103
+ OC C
104
+ OC O
105
+ OC c
106
+ O N
107
+ O O
108
+ O c
109
+ S C
110
+ SC C
111
+ SC c
112
+ S S
113
+ S c
114
+ c c
115
+ cc c
116
+ cc n
117
+ cc o
118
+ cc s
119
+ cc cc
120
+ c n
121
+ cn c
122
+ cn n
123
+ c o
124
+ co c
125
+ c s
126
+ cs c
127
+ cs n
128
+ n c
129
+ nc c
130
+ nc n
131
+ nc o
132
+ nc s
133
+ n n
134
+ nn c
135
+ nn n
136
+ n o
137
+ no c
138
+ no n
139
+ n s
140
+ ns c
141
+ ns n
142
+ o c
143
+ oc c
144
+ o n
145
+ s c
146
+ sc c
147
+ sc n
148
+ s n
149
+ N P
150
+ P N
151
+ C P
152
+ P C
153
+ N S
154
+ S N
155
+ C S
156
+ S C
157
+ S P
158
+ P S
159
+ C I
tokenizer/new_vocab.txt ADDED
@@ -0,0 +1,587 @@
1
+ [PAD]
2
+ [UNK]
3
+ [CLS]
4
+ [SEP]
5
+ [MASK]
6
+ #
7
+ %
8
+ (
9
+ )
10
+ +
11
+ -
12
+ /
13
+ 0
14
+ 1
15
+ 2
16
+ 3
17
+ 4
18
+ 5
19
+ 6
20
+ 7
21
+ 8
22
+ 9
23
+ =
24
+ @
25
+ A
26
+ B
27
+ Br
28
+ Brc
29
+ C
30
+ CC
31
+ CCC
32
+ CCN
33
+ CCO
34
+ CCS
35
+ CCc
36
+ CCn
37
+ CN
38
+ CNC
39
+ CNc
40
+ CO
41
+ COC
42
+ CON
43
+ COc
44
+ CS
45
+ CSC
46
+ CSS
47
+ CSc
48
+ Cc
49
+ Cl
50
+ Clc
51
+ Cn
52
+ F
53
+ Fc
54
+ H
55
+ I
56
+ K
57
+ L
58
+ M
59
+ N
60
+ NC
61
+ NCC
62
+ NCc
63
+ NN
64
+ NO
65
+ Nc
66
+ Nn
67
+ O
68
+ OC
69
+ OCC
70
+ OCO
71
+ OCc
72
+ ON
73
+ OO
74
+ Oc
75
+ P
76
+ R
77
+ S
78
+ SC
79
+ SCC
80
+ SCc
81
+ SS
82
+ Sc
83
+ T
84
+ X
85
+ Z
86
+ [
87
+ \\
88
+ (/
89
+ ]
90
+ a
91
+ b
92
+ c
93
+ cc
94
+ ccc
95
+ cccc
96
+ ccn
97
+ cco
98
+ ccs
99
+ cn
100
+ cnc
101
+ cnn
102
+ co
103
+ coc
104
+ cs
105
+ csc
106
+ csn
107
+ e
108
+ g
109
+ i
110
+ l
111
+ n
112
+ nc
113
+ ncc
114
+ ncn
115
+ nco
116
+ ncs
117
+ nn
118
+ nnc
119
+ nnn
120
+ no
121
+ noc
122
+ non
123
+ ns
124
+ nsc
125
+ nsn
126
+ o
127
+ oc
128
+ occ
129
+ on
130
+ p
131
+ r
132
+ s
133
+ sc
134
+ scc
135
+ scn
136
+ sn
137
+ t
138
+ c1
139
+ c2
140
+ c3
141
+ c4
142
+ c5
143
+ c6
144
+ c7
145
+ c8
146
+ c9
147
+ n1
148
+ n2
149
+ n3
150
+ n4
151
+ n5
152
+ n6
153
+ n7
154
+ n8
155
+ n9
156
+ O1
157
+ O2
158
+ O3
159
+ O4
160
+ O5
161
+ O6
162
+ O7
163
+ O8
164
+ O9
165
+ (c1
166
+ (c2
167
+ c1)
168
+ c2)
169
+ (n1
170
+ (n2
171
+ n1)
172
+ n2)
173
+ (O1
174
+ (O2
175
+ O2)
176
+ =O
177
+ =C
178
+ =c
179
+ =N
180
+ =n
181
+ =CC
182
+ =CN
183
+ =Cc
184
+ =cc
185
+ =NC
186
+ =Nc
187
+ =nC
188
+ =nc
189
+ #C
190
+ #CC
191
+ #CN
192
+ #N
193
+ #NC
194
+ #NN
195
+ (C
196
+ C)
197
+ (O
198
+ O)
199
+ (N
200
+ N)
201
+ NP
202
+ PN
203
+ CP
204
+ PC
205
+ NS
206
+ SN
207
+ SP
208
+ PS
209
+ C(=O)
210
+ (/Br)
211
+ (/C#N)
212
+ (/C)
213
+ (/C=N)
214
+ (/C=O)
215
+ (/CBr)
216
+ (/CC)
217
+ (/CCC)
218
+ (/CCF)
219
+ (/CCN)
220
+ (/CCO)
221
+ (/CCl)
222
+ (/CI)
223
+ (/CN)
224
+ (/CO)
225
+ (/CS)
226
+ (/Cl)
227
+ (/F)
228
+ (/I)
229
+ (/N)
230
+ (/NC)
231
+ (/NCC)
232
+ (/NO)
233
+ (/O)
234
+ (/OC)
235
+ (/OCC)
236
+ (/S)
237
+ (/SC)
238
+ (=C)
239
+ (=C/C)
240
+ (=C/F)
241
+ (=C/I)
242
+ (=C/N)
243
+ (=C/O)
244
+ (=CBr)
245
+ (=CC)
246
+ (=CCF)
247
+ (=CCN)
248
+ (=CCO)
249
+ (=CCl)
250
+ (=CF)
251
+ (=CI)
252
+ (=CN)
253
+ (=CO)
254
+ (=C\\C)
255
+ (=C\\F)
256
+ (=C\\I)
257
+ (=C\\N)
258
+ (=C\\O)
259
+ (=N)
260
+ (=N/C)
261
+ (=N/N)
262
+ (=N/O)
263
+ (=NBr)
264
+ (=NC)
265
+ (=NCC)
266
+ (=NCl)
267
+ (=NN)
268
+ (=NO)
269
+ (=NOC)
270
+ (=N\\C)
271
+ (=N\\N)
272
+ (=N\\O)
273
+ (=O)
274
+ (=S)
275
+ (B)
276
+ (Br)
277
+ (C#C)
278
+ (C#CC)
279
+ (C#CI)
280
+ (C#CO)
281
+ (C#N)
282
+ (C#SN)
283
+ (C)
284
+ (C=C)
285
+ (C=CF)
286
+ (C=CI)
287
+ (C=N)
288
+ (C=NN)
289
+ (C=NO)
290
+ (C=O)
291
+ (C=S)
292
+ (CBr)
293
+ (CC#C)
294
+ (CC#N)
295
+ (CC)
296
+ (CC=C)
297
+ (CC=O)
298
+ (CCBr)
299
+ (CCC)
300
+ (CCCC)
301
+ (CCCF)
302
+ (CCCI)
303
+ (CCCN)
304
+ (CCCO)
305
+ (CCCS)
306
+ (CCCl)
307
+ (CCF)
308
+ (CCI)
309
+ (CCN)
310
+ (CCNC)
311
+ (CCNN)
312
+ (CCNO)
313
+ (CCO)
314
+ (CCOC)
315
+ (CCON)
316
+ (CCS)
317
+ (CCSC)
318
+ (CCl)
319
+ (CF)
320
+ (CI)
321
+ (CN)
322
+ (CN=O)
323
+ (CNC)
324
+ (CNCC)
325
+ (CNCO)
326
+ (CNN)
327
+ (CNNC)
328
+ (CNO)
329
+ (CNOC)
330
+ (CO)
331
+ (COC)
332
+ (COCC)
333
+ (COCI)
334
+ (COCN)
335
+ (COCO)
336
+ (COF)
337
+ (CON)
338
+ (COO)
339
+ (CS)
340
+ (CSC)
341
+ (CSCC)
342
+ (CSCF)
343
+ (CSO)
344
+ (Cl)
345
+ (F)
346
+ (I)
347
+ (N)
348
+ (N=N)
349
+ (N=NO)
350
+ (N=O)
351
+ (N=S)
352
+ (NBr)
353
+ (NC#N)
354
+ (NC)
355
+ (NC=N)
356
+ (NC=O)
357
+ (NC=S)
358
+ (NCBr)
359
+ (NCC)
360
+ (NCCC)
361
+ (NCCF)
362
+ (NCCN)
363
+ (NCCO)
364
+ (NCCS)
365
+ (NCCl)
366
+ (NCNC)
367
+ (NCO)
368
+ (NCS)
369
+ (NCl)
370
+ (NN)
371
+ (NN=O)
372
+ (NNC)
373
+ (NO)
374
+ (NOC)
375
+ (O)
376
+ (OC#N)
377
+ (OC)
378
+ (OC=C)
379
+ (OC=O)
380
+ (OC=S)
381
+ (OCBr)
382
+ (OCC)
383
+ (OCCC)
384
+ (OCCF)
385
+ (OCCI)
386
+ (OCCN)
387
+ (OCCO)
388
+ (OCCS)
389
+ (OCCl)
390
+ (OCF)
391
+ (OCI)
392
+ (OCO)
393
+ (OCOC)
394
+ (OCON)
395
+ (OCSC)
396
+ (OCl)
397
+ (OI)
398
+ (ON)
399
+ (OO)
400
+ (OOC)
401
+ (OOCC)
402
+ (OOSN)
403
+ (OSC)
404
+ (P)
405
+ (S)
406
+ (SC#N)
407
+ (SC)
408
+ (SCC)
409
+ (SCCC)
410
+ (SCCF)
411
+ (SCCN)
412
+ (SCCO)
413
+ (SCCS)
414
+ (SCCl)
415
+ (SCF)
416
+ (SCN)
417
+ (SCOC)
418
+ (SCSC)
419
+ (SCl)
420
+ (SI)
421
+ (SN)
422
+ (SN=O)
423
+ (SO)
424
+ (SOC)
425
+ (SOOO)
426
+ (SS)
427
+ (SSC)
428
+ (SSCC)
429
+ ([At])
430
+ ([O-])
431
+ ([O])
432
+ ([S-])
433
+ (\\Br)
434
+ (\\C#N)
435
+ (\\C)
436
+ (\\C=N)
437
+ (\\C=O)
438
+ (\\CBr)
439
+ (\\CC)
440
+ (\\CCC)
441
+ (\\CCO)
442
+ (\\CCl)
443
+ (\\CF)
444
+ (\\CN)
445
+ (\\CNC)
446
+ (\\CO)
447
+ (\\COC)
448
+ (\\Cl)
449
+ (\\F)
450
+ (\\I)
451
+ (\\N)
452
+ (\\NC)
453
+ (\\NCC)
454
+ (\\NN)
455
+ (\\NO)
456
+ (\\NOC)
457
+ (\\O)
458
+ (\\OC)
459
+ (\\OCC)
460
+ (\\ON)
461
+ (\\S)
462
+ (\\SC)
463
+ (\\SCC)
464
+ [Ag+]
465
+ [Ag-4]
466
+ [Ag]
467
+ [Al-3]
468
+ [Al]
469
+ [As+]
470
+ [AsH3]
471
+ [AsH]
472
+ [As]
473
+ [At]
474
+ [B-]
475
+ [B@-]
476
+ [B@@-]
477
+ [BH-]
478
+ [BH2-]
479
+ [BH3-]
480
+ [B]
481
+ [Ba]
482
+ [Br+2]
483
+ [BrH]
484
+ [Br]
485
+ [C+]
486
+ [C-]
487
+ [C@@H]
488
+ [C@@]
489
+ [C@H]
490
+ [C@]
491
+ [CH-]
492
+ [CH2]
493
+ [CH3]
494
+ [CH]
495
+ [C]
496
+ [CaH2]
497
+ [Ca]
498
+ [Cl+2]
499
+ [Cl+3]
500
+ [Cl+]
501
+ [Cs]
502
+ [FH]
503
+ [F]
504
+ [H]
505
+ [He]
506
+ [I+2]
507
+ [I+3]
508
+ [I+]
509
+ [IH]
510
+ [I]
511
+ [K]
512
+ [Kr]
513
+ [Li+]
514
+ [LiH]
515
+ [MgH2]
516
+ [Mg]
517
+ [N+]
518
+ [N-]
519
+ [N@+]
520
+ [N@@+]
521
+ [N@@]
522
+ [N@]
523
+ [NH+]
524
+ [NH-]
525
+ [NH2+]
526
+ [NH3]
527
+ [NH]
528
+ [N]
529
+ [Na]
530
+ [O+]
531
+ [O-]
532
+ [OH+]
533
+ [OH2]
534
+ [OH]
535
+ [O]
536
+ [P+]
537
+ [P@+]
538
+ [P@@+]
539
+ [P@@]
540
+ [P@]
541
+ [PH2]
542
+ [PH]
543
+ [P]
544
+ [Ra]
545
+ [Rb]
546
+ [S+]
547
+ [S-]
548
+ [S@+]
549
+ [S@@+]
550
+ [S@@]
551
+ [S@]
552
+ [SH+]
553
+ [SH2]
554
+ [SH]
555
+ [S]
556
+ [Se+]
557
+ [Se-2]
558
+ [SeH2]
559
+ [SeH]
560
+ [Se]
561
+ [Si@]
562
+ [SiH2]
563
+ [SiH]
564
+ [Si]
565
+ [SrH2]
566
+ [TeH]
567
+ [Te]
568
+ [Xe]
569
+ [Zn+2]
570
+ [Zn-2]
571
+ [Zn]
572
+ [b-]
573
+ [c+]
574
+ [c-]
575
+ [cH-]
576
+ [cH]
577
+ [c]
578
+ [n+]
579
+ [n-]
580
+ [nH]
581
+ [n]
582
+ [o+]
583
+ [s+]
584
+ [se+]
585
+ [se]
586
+ [te+]
587
+ [te]
training_data/half_life_smiles.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d90293170442bc81af2cf9f64656c40bf884733947ca52b2f9308f42220680c3
3
+ size 174323
training_data/hemo-negative.npz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f83aad41f160deb6401bc0801bddc931488da6e1785749e6f72de6d0f154a37f
3
+ size 109451
training_data/hemo-positive.npz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96cb24d5a7617f7e211cd48d2b0b424a46affa95716b96058058902068068d27
3
+ size 27840
training_data/nf-negative.npz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e506e52e101308dd3882ca6bd45833a6e0837f9f240aa85d575c2a41e305b854
3
+ size 21845190
training_data/nf-positive.npz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78caae183fe840b145275d9486a3f94a963989deb9d55a57995653bf1d497bf2
3
+ size 41326
training_data/sol-negative.npz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3b6d380024e0483e15e3e219a7cbf23f4d178d823287cef24bc1bd918a817b6
3
+ size 15469064
training_data/sol-positive.npz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46169267fd0d37d8a063a4e9fc1cdd9b701a9211b1f16515e3d569fcf2d4d859
3
+ size 14056264