MitoInteract v1 - Pearson R=-0.9107

Browse files

Files changed (5) hide show

README.md +94 -0
config.json +24 -0
full_model.pt +3 -0
mitointeract_weights.pt +3 -0
model.py +112 -0

README.md ADDED Viewed

	@@ -0,0 +1,94 @@

+---
+license: mit
+tags:
+- biology
+- protein
+- drug-target-interaction
+- mitochondria
+- apoptosis
+- binding-affinity
+- esm2
+- chemberta
+datasets:
+- jglaser/binding_affinity
+metrics:
+- pearsonr
+- spearmanr
+- rmse
+- mae
+---
+# MitoInteract: Protein-Molecule Binding Affinity Prediction for Mitochondrial Apoptosis Research
+## Overview
+MitoInteract is a dual-encoder model that predicts **binding affinity (pKd)** between any protein and any molecule.
+It combines:
+- **ESM-2 35M** (`facebook/esm2_t12_35M_UR50D`, protein encoder; this is the checkpoint named in `config.json`) for protein sequence understanding
+- **ChemBERTa** (molecule encoder) for SMILES-based molecular representation
+- **Bidirectional cross-attention** fusion layer
+- **4-layer MLP** regression head
+## Intended Use
+This model is designed for **mitochondrial apoptosis research**, enabling researchers to:
+- Predict how ceramides interact with mitochondrial membrane proteins (VDAC1, VDAC2)
+- Screen BCL-2 family protein interactions with BH3 mimetic drugs (venetoclax, navitoclax, ABT-737)
+- Explore protein-lipid interactions in the apoptosis pathway
+- Run in-silico binding experiments before wet-lab validation
+## Quick Start
+```python
+from model import load_model, predict_binding
+# Load model
+model, config = load_model("full_model.pt", device="cuda")
+# Predict ceramide C16 binding to VDAC1
+result = predict_binding(
+    model,
+    protein_seq="MPPYLTFGLKAGALLPLTLPYVRAEAVTKLKLTLNAFEGASK...",  # VDAC1
+    smiles="CCCCCCCCCCCCCCCC(=O)N[C@@H](CO)[C@H](O)/C=C/CCCCCCCCCCCCC",  # Ceramide C16
+    device="cuda"
+)
+print(f"Predicted pKd: {result['pKd']:.3f}")
+print(f"Predicted Kd: {result['Kd_uM']:.3f} µM")
+```
+## Key Apoptosis Targets
+| Protein | Role in Apoptosis |
+|---------|-------------------|
+| BCL-2 | Anti-apoptotic, prevents MOMP |
+| BCL-XL | Anti-apoptotic, sequesters BAX/BAK |
+| BAX | Pro-apoptotic, forms pores in outer membrane |
+| BAK | Pro-apoptotic, oligomerizes in membrane |
+| VDAC1 | Voltage-dependent anion channel, ceramide target |
+| Cytochrome c | Released during MOMP, activates caspase cascade |
+## Key Molecules
+| Molecule | Role |
+|----------|------|
+| Ceramide C16 | Lipid mediator, promotes MOMP via VDAC |
+| Ceramide C2 | Short-chain ceramide analog |
+| Venetoclax | BCL-2 inhibitor (FDA-approved) |
+| Navitoclax | BCL-2/BCL-XL dual inhibitor |
+| ABT-737 | BCL-2/BCL-XL/BCL-w inhibitor |
+| Cardiolipin | Mitochondrial inner membrane lipid |
+## Training Details
+- **Dataset**: jglaser/binding_affinity (1.9M protein-ligand pairs)
+- **Architecture**: ESM-2 35M (`esm2_t12_35M_UR50D`, frozen) + ChemBERTa (frozen) + Cross-Attention + MLP
+- **Training**: AdamW, lr=1e-3, cosine schedule, early stopping
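+- **Best Validation Pearson R**: -0.9107 (note the negative sign: predictions were anti-correlated with the labels; the bundled `config.json` limits this run to 32 training / 16 validation samples and 2 epochs, so treat this figure as a pipeline smoke-test result rather than a benchmark)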
+## Citation
+Based on:
+- BAPULM (arxiv:2411.04150) - frozen encoder + MLP pattern
+- SSM-DTA (arxiv:2206.09818) - CLS cross-attention fusion
+- ESM-2 (arxiv:2202.03555) - protein language model
+- ChemBERTa (arxiv:2010.09885) - molecular language model

config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "esm_model": "facebook/esm2_t12_35M_UR50D",
+  "mol_model": "seyonec/ChemBERTa-zinc-base-v1",
+  "protein_dim": 480,
+  "mol_dim": 768,
+  "proj_dim": 256,
+  "n_heads": 8,
+  "dropout": 0.1,
+  "freeze_encoders": true,
+  "max_prot_len": 512,
+  "max_mol_len": 200,
+  "max_train_samples": 32,
+  "max_val_samples": 16,
+  "val_split": 0.05,
+  "batch_size": 4,
+  "lr": 0.001,
+  "weight_decay": 0.01,
+  "epochs": 2,
+  "warmup_steps": 500,
+  "grad_clip": 1.0,
+  "patience": 5,
+  "hub_model_id": "ethanolivertroy/MitoInteract",
+  "output_dir": "/app/mitointeract_output"
+}

full_model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9a3a4e5808ff918d4e5b24d2bfc96c4c90e63be9616b8833afcb09feec33324b
+size 315671990

mitointeract_weights.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:086de9279feeb3da3ae89826d3eb1d76ac9a2fbf3bc7ecbd6c3fcd492b604c2e
+size 5129631

model.py ADDED Viewed

	@@ -0,0 +1,112 @@

+"""
+MitoInteract Model Class Definition
+Copy this file to load the model for inference.
+"""
+import torch
+import torch.nn as nn
+from transformers import EsmModel, EsmTokenizer, AutoModel, AutoTokenizer
+class MitoInteract(nn.Module):
+    def __init__(
+        self,
+        esm_model_name="facebook/esm2_t33_650M_UR50D",
+        mol_model_name="seyonec/ChemBERTa-zinc-base-v1",
+        protein_dim=1280,
+        mol_dim=768,
+        proj_dim=256,
+        n_heads=8,
+        dropout=0.1,
+        freeze_encoders=True,
+    ):
+        super().__init__()
+        self.freeze_encoders = freeze_encoders
+        self.esm = EsmModel.from_pretrained(esm_model_name)
+        self.protein_dim = protein_dim
+        self.mol_encoder = AutoModel.from_pretrained(mol_model_name)
+        self.mol_dim = mol_dim
+        if freeze_encoders:
+            for p in self.esm.parameters(): p.requires_grad = False
+            for p in self.mol_encoder.parameters(): p.requires_grad = False
+        self.prot_proj = nn.Sequential(
+            nn.Linear(protein_dim, proj_dim), nn.LayerNorm(proj_dim), nn.ReLU(), nn.Dropout(dropout))
+        self.mol_proj = nn.Sequential(
+            nn.Linear(mol_dim, proj_dim), nn.LayerNorm(proj_dim), nn.ReLU(), nn.Dropout(dropout))
+        self.cross_attn_mol2prot = nn.MultiheadAttention(proj_dim, n_heads, dropout=dropout, batch_first=True)
+        self.cross_attn_prot2mol = nn.MultiheadAttention(proj_dim, n_heads, dropout=dropout, batch_first=True)
+        self.ln_mol2prot = nn.LayerNorm(proj_dim)
+        self.ln_prot2mol = nn.LayerNorm(proj_dim)
+        fused_dim = proj_dim * 2
+        self.mlp = nn.Sequential(
+            nn.Linear(fused_dim, 512), nn.BatchNorm1d(512), nn.ReLU(), nn.Dropout(dropout),
+            nn.Linear(512, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(dropout),
+            nn.Linear(256, 128), nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(dropout),
+            nn.Linear(128, 1))
+    def encode_protein(self, input_ids, attention_mask):
+        ctx = torch.no_grad() if self.freeze_encoders else torch.enable_grad()
+        with ctx:
+            out = self.esm(input_ids=input_ids, attention_mask=attention_mask)
+        mask = attention_mask.unsqueeze(-1).float()
+        pooled = (out.last_hidden_state * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
+        return pooled, out.last_hidden_state
+    def encode_molecule(self, input_ids, attention_mask):
+        ctx = torch.no_grad() if self.freeze_encoders else torch.enable_grad()
+        with ctx:
+            out = self.mol_encoder(input_ids=input_ids, attention_mask=attention_mask)
+        return out.pooler_output, out.last_hidden_state
+    def forward(self, prot_input_ids, prot_attention_mask, mol_input_ids, mol_attention_mask):
+        prot_pooled, prot_seq = self.encode_protein(prot_input_ids, prot_attention_mask)
+        mol_pooled, mol_seq = self.encode_molecule(mol_input_ids, mol_attention_mask)
+        prot_seq_proj = self.prot_proj(prot_seq)
+        mol_seq_proj = self.mol_proj(mol_seq)
+        prot_q = self.prot_proj(prot_pooled).unsqueeze(1)
+        mol_q = self.mol_proj(mol_pooled).unsqueeze(1)
+        prot_pad_mask = (prot_attention_mask == 0)
+        mol_pad_mask = (mol_attention_mask == 0)
+        h_prot2mol, _ = self.cross_attn_prot2mol(prot_q, mol_seq_proj, mol_seq_proj, key_padding_mask=mol_pad_mask)
+        h_mol2prot, _ = self.cross_attn_mol2prot(mol_q, prot_seq_proj, prot_seq_proj, key_padding_mask=prot_pad_mask)
+        h_prot2mol = self.ln_prot2mol(h_prot2mol.squeeze(1))
+        h_mol2prot = self.ln_mol2prot(h_mol2prot.squeeze(1))
+        fused = torch.cat([h_prot2mol, h_mol2prot], dim=-1)
+        return self.mlp(fused).squeeze(-1)
+def load_model(checkpoint_path, device="cpu"):
+    """Load trained MitoInteract model."""
+    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
+    config = checkpoint["config"]
+    model = MitoInteract(
+        esm_model_name=config["esm_model"],
+        mol_model_name=config["mol_model"],
+        protein_dim=config["protein_dim"],
+        mol_dim=config["mol_dim"],
+        proj_dim=config["proj_dim"],
+        n_heads=config["n_heads"],
+        dropout=config["dropout"],
+        freeze_encoders=True,
+    )
+    model.load_state_dict(checkpoint["model_state_dict"])
+    model.eval()
+    return model, config
+def predict_binding(model, protein_seq, smiles, device="cpu"):
+    """Predict binding affinity (pKd) for a protein-molecule pair."""
+    prot_tokenizer = EsmTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
+    mol_tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
+    prot_enc = prot_tokenizer(protein_seq, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    mol_enc = mol_tokenizer(smiles, return_tensors="pt", padding=True, truncation=True, max_length=200)
+    model = model.to(device)
+    with torch.no_grad():
+        pKd = model(
+            prot_enc["input_ids"].to(device), prot_enc["attention_mask"].to(device),
+            mol_enc["input_ids"].to(device), mol_enc["attention_mask"].to(device),
+        )
+    pKd_val = pKd.item()
+    Kd_uM = 10 ** (-pKd_val) * 1e6
+    return {"pKd": pKd_val, "Kd_uM": Kd_uM}