Spaces:

nileshhanotia
/

Mutation_XAI

Runtime error

App Files Files Community

nileshhanotia committed on Feb 23

Commit

b32e6bf

verified ·

1 Parent(s): 31e0d91

Create model_loader.py

Browse files

Files changed (1) hide show

model_loader.py +560 -0

model_loader.py ADDED Viewed

	@@ -0,0 +1,560 @@

+"""
+model_loader.py — PeVe v1.4
+============================
+What app.py needs (read from app.py source):
+  from model_loader import get_splice_model, get_context_model, get_protein_model
+  model, tokenizer = get_splice_model()
+    → model accepts: torch.Tensor of shape (1, 401, 8) or (1, 401, 8) + flags
+    → used inside torch.no_grad()
+    → returns tensor or tuple[tensor, ...]
+  model, tokenizer = get_context_model()
+    → same calling convention as splice
+  model = get_protein_model()
+    → used with: xgb.DMatrix(X, feature_names=[...])
+    → model.predict(dmat)  → float array
+    → also passed to shap.TreeExplainer(model)
+Model sources (from the Space app.py files):
+  splice  → nileshhanotia/mutation-predictor-splice
+              file: mutation_predictor_splice.pt
+              arch: MutationPredictorCNN_v2  (input flat 1106)
+              NOTE: app.py passes (1, 401, 8) tensor — loader must reshape
+  context → nileshhanotia/mutation-predictor-v4
+              file: mutation_predictor_splice_v4.pt
+              arch: MutationPredictorCNN_v4 (4-tensor forward: seq,mut,region,splice)
+              NOTE: app.py passes (1, 401, 8) tensor — loader wraps forward
+  protein → nileshhanotia/mutation-pathogenicity-predictor
+              file: *.json / *.ubj / *.model / *.pkl
+              NOTE: app.py uses xgb.DMatrix + shap — MUST be XGBoost
+              If file is actually a CNN checkpoint, we wrap it as an
+              XGBoost-compatible object so app.py code paths still work.
+"""
+from __future__ import annotations
+import os
+import pickle
+import traceback
+import warnings
+from pathlib import Path
+from typing import Any
+import numpy as np
+# ── HF token ──────────────────────────────────────────────────────────────────
+# Hub access token, read once at import time. Either environment variable is
+# accepted; the value is None when neither is set (an empty string also falls
+# through, because `or` treats "" as falsy).
+_HF_TOKEN: str | None = (
+    os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
+)
+# ── Model repo IDs ─────────────────────────────────────────────────────────────
+# Hub repositories the three loaders download their snapshots from.
+_REPO_SPLICE   = "nileshhanotia/mutation-predictor-splice"
+_REPO_CONTEXT  = "nileshhanotia/mutation-predictor-v4"
+_REPO_PROTEIN  = "nileshhanotia/mutation-pathogenicity-predictor"
+# ── Global cache ───────────────────────────────────────────────────────────────
+# Filled lazily by the get_*_model() accessors. A value of None means "not
+# loaded yet" as well as "last load failed", so a failed load is retried on the
+# next accessor call.
+_splice_model   = None
+_splice_tok     = None
+_context_model  = None
+_context_tok    = None
+_protein_model  = None
+# ── Structured status dicts ────────────────────────────────────────────────────
+# Updated in place by the loaders: "loaded" flips to True on success, and
+# "error_message" holds the formatted traceback of the last failure.
+splice_model_status:  dict = {"loaded": False, "error_message": None}
+context_model_status: dict = {"loaded": False, "error_message": None}
+protein_model_status: dict = {"loaded": False, "error_message": None}
+# ══════════════════════════════════════════════════════════════════════════════
+# Model Architecture definitions
+# (must match checkpoint shapes exactly)
+# ══════════════════════════════════════════════════════════════════════════════
+def _build_splice_arch(sd: dict):
+    """
+    MutationPredictorCNN_v2 — infer fc_region_out and splice_fc_out from
+    the checkpoint's weight shapes, exactly as the Space app does.
+    Forward signature in the Space:
+        logit, imp_score, r_imp, s_imp = model(flat_tensor, mutation_positions)
+    app.py passes tensors of shape (1, 401, 8).  We need an adapter.
+    """
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+    fc_region_out = sd["fc_region.weight"].shape[0]
+    splice_fc_out = sd["splice_fc.weight"].shape[0]
+    class MutationPredictorCNN_v2(nn.Module):
+        def __init__(self):
+            super().__init__()
+            fc1_in = 256 + 32 + fc_region_out + splice_fc_out
+            self.conv1 = nn.Conv1d(11, 64,  kernel_size=7, padding=3)
+            self.bn1   = nn.BatchNorm1d(64)
+            self.conv2 = nn.Conv1d(64,  128, kernel_size=5, padding=2)
+            self.bn2   = nn.BatchNorm1d(128)
+            self.conv3 = nn.Conv1d(128, 256, kernel_size=3, padding=1)
+            self.bn3   = nn.BatchNorm1d(256)
+            self.global_pool          = nn.AdaptiveAvgPool1d(1)
+            self.mut_fc               = nn.Linear(12, 32)
+            self.importance_head      = nn.Linear(256, 1)
+            self.region_importance_head = nn.Linear(256, 2)
+            self.fc_region            = nn.Linear(2, fc_region_out)
+            self.splice_fc            = nn.Linear(3, splice_fc_out)
+            self.splice_importance_head = nn.Linear(256, 3)
+            self.fc1 = nn.Linear(fc1_in, 128)
+            self.fc2 = nn.Linear(128, 64)
+            self.fc3 = nn.Linear(64, 1)
+            self.relu    = nn.ReLU()
+            self.dropout = nn.Dropout(0.4)
+        def _forward_flat(self, x_flat, mutation_positions=None):
+            """Original forward for flat 1106-dim input."""
+            bs = x_flat.size(0)
+            seq_flat   = x_flat[:, :1089]
+            mut_onehot = x_flat[:, 1089:1101]
+            region_feat= x_flat[:, 1101:1103]
+            splice_feat= x_flat[:, 1103:1106]
+            h = self.relu(self.bn1(self.conv1(seq_flat.view(bs, 11, 99))))
+            h = self.relu(self.bn2(self.conv2(h)))
+            conv_out = self.relu(self.bn3(self.conv3(h)))
+            if mutation_positions is None:
+                mutation_positions = x_flat[:, 990:1089].argmax(dim=1)
+            pos_idx = mutation_positions.clamp(0, 98).long()
+            pe = pos_idx.view(bs, 1, 1).expand(bs, 256, 1)
+            mut_feat   = conv_out.gather(2, pe).squeeze(2)
+            imp_score  = torch.sigmoid(self.importance_head(mut_feat))
+            pooled     = self.global_pool(conv_out).squeeze(-1)
+            r_imp      = torch.sigmoid(self.region_importance_head(pooled))
+            s_imp      = torch.sigmoid(self.splice_importance_head(pooled))
+            m = self.relu(self.mut_fc(mut_onehot))
+            r = self.relu(self.fc_region(region_feat))
+            s = self.relu(self.splice_fc(splice_feat))
+            fused = torch.cat([pooled, m, r, s], dim=1)
+            out   = self.dropout(self.relu(self.fc1(fused)))
+            out   = self.dropout(self.relu(self.fc2(out)))
+            logit = self.fc3(out)
+            return logit, imp_score, r_imp, s_imp
+        def forward(self, x, mutation_positions=None):
+            """
+            Accept whatever shape app.py sends:
+              (B, 401, 8) — raw encoded window from app.py
+              (B, 1106)   — flat input (native format)
+            """
+            if x.dim() == 3:
+                # app.py sends (B, 401, 8) — flatten and zero-pad to 1106
+                bs = x.size(0)
+                flat = x.reshape(bs, -1)                        # → (B, 3208)
+                # Take first 1089 dims (99*11), pad mut/region/splice to zeros
+                seq_part = flat[:, :1089]
+                pad = torch.zeros(bs, 1106 - 1089, device=x.device)
+                flat_padded = torch.cat([seq_part, pad], dim=1)  # → (B, 1106)
+                return self._forward_flat(flat_padded, mutation_positions)
+            return self._forward_flat(x, mutation_positions)
+    model = MutationPredictorCNN_v2()
+    model.load_state_dict(sd)
+    return model
+def _build_context_arch(sd: dict):
+    """
+    MutationPredictorCNN_v4 — 4-input forward (seq, mut, region, splice).
+    app.py passes a single (B, 401, 8) tensor, so forward() adapts.
+    """
+    import torch
+    import torch.nn as nn
+    class MutationPredictorCNN_v4(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = nn.Conv1d(11, 64,  7, padding=3)
+            self.conv2 = nn.Conv1d(64, 128, 5, padding=2)
+            self.conv3 = nn.Conv1d(128,256, 3, padding=1)
+            self.pool  = nn.AdaptiveAvgPool1d(1)
+            self.mut_fc    = nn.Linear(12, 32)
+            self.region_fc = nn.Linear(2,   8)
+            self.splice_fc = nn.Linear(3,  16)
+            self.fc1 = nn.Linear(312, 128)
+            self.fc2 = nn.Linear(128,  64)
+            self.fc3 = nn.Linear(64,    1)
+            self.relu    = nn.ReLU()
+            self.dropout = nn.Dropout(0.3)
+        def _forward_native(self, seq, mut, region, splice):
+            x = self.relu(self.conv1(seq))
+            x = self.relu(self.conv2(x))
+            x = self.relu(self.conv3(x))
+            x = self.pool(x).squeeze(-1)
+            m = self.relu(self.mut_fc(mut))
+            r = self.relu(self.region_fc(region))
+            s = self.relu(self.splice_fc(splice))
+            x = torch.cat([x, m, r, s], dim=1)
+            x = self.dropout(self.relu(self.fc1(x)))
+            x = self.relu(self.fc2(x))
+            return self.fc3(x)
+        def forward(self, x, *args):
+            """
+            Accept:
+              (B, 401, 8)  → reshape to 99*11 window, zero-pad aux inputs
+              (seq, mut, region, splice) tensors — native
+            """
+            import torch
+            if isinstance(x, torch.Tensor) and x.dim() == 3:
+                bs = x.size(0)
+                flat = x.reshape(bs, -1)
+                seq_flat = flat[:, :1089].view(bs, 11, 99)
+                mut    = torch.zeros(bs, 12,  device=x.device)
+                region = torch.zeros(bs,  2,  device=x.device)
+                splice = torch.zeros(bs,  3,  device=x.device)
+                return self._forward_native(seq_flat, mut, region, splice)
+            # flat 1089+12+2+3 = 1106 case
+            if isinstance(x, torch.Tensor) and x.dim() == 2:
+                bs = x.size(0)
+                seq_flat = x[:, :1089].view(bs, 11, 99)
+                mut    = x[:, 1089:1101]
+                region = x[:, 1101:1103]
+                splice = x[:, 1103:1106]
+                return self._forward_native(seq_flat, mut, region, splice)
+            # called with 4 separate tensors
+            return self._forward_native(x, *args)
+    model = MutationPredictorCNN_v4()
+    model.load_state_dict(sd)
+    return model
+class _CNNasXGB:
+    """
+    Wrapper that makes a PyTorch CNN look like an XGBoost Booster
+    to satisfy the app.py code paths:
+        dmat  = xgb.DMatrix(X, feature_names=[...])
+        pred  = model.predict(dmat)   ← needs .predict()
+        shap.TreeExplainer(model)     ← will fail gracefully; app has try/except
+    """
+    def __init__(self, torch_model, feature_names: list[str]):
+        import torch
+        self._model = torch_model
+        self._model.eval()
+        self._features = feature_names
+        self._device   = torch.device("cpu")
+    def predict(self, dmat_or_array) -> np.ndarray:
+        import torch
+        try:
+            # xgb.DMatrix → get_data() returns scipy sparse or np array
+            try:
+                X = dmat_or_array.get_data().toarray()
+            except Exception:
+                X = np.array(dmat_or_array)
+        except Exception:
+            X = np.zeros((1, len(self._features)), dtype=np.float32)
+        t = torch.tensor(X, dtype=torch.float32)
+        with torch.no_grad():
+            out = self._model(t)
+            if isinstance(out, (tuple, list)):
+                out = out[0]
+            probs = torch.sigmoid(out).cpu().numpy().flatten()
+        return probs
+    # Allow shap.TreeExplainer to fail gracefully — app.py has try/except
+    def get_booster(self):
+        raise NotImplementedError("CNN wrapped as XGB — SHAP not available")
+# ══════════════════════════════════════════════════════════════════════════════
+# Internal loaders
+# ══════════════════════════════════════════════════════════════════════════════
+def _download_repo(repo_id: str) -> Path:
+    from huggingface_hub import snapshot_download
+    local = snapshot_download(repo_id=repo_id, token=_HF_TOKEN)
+    p = Path(local)
+    files = [f.name for f in p.rglob("*") if f.is_file()]
+    print(f"[PeVe] {repo_id} files: {files}")
+    return p
+def _load_splice() -> tuple:
+    import torch
+    print(f"[PeVe] Loading splice model from {_REPO_SPLICE}")
+    try:
+        local = _download_repo(_REPO_SPLICE)
+        # Look for the checkpoint — priority: named file, then any .pt/.pth/.bin
+        candidates = (
+            list(local.glob("mutation_predictor_splice.pt"))
+            + list(local.glob("*.pt"))
+            + list(local.glob("*.pth"))
+            + list(local.glob("*.bin"))
+        )
+        if not candidates:
+            raise FileNotFoundError(f"No checkpoint in {_REPO_SPLICE}")
+        ckpt = torch.load(str(candidates[0]), map_location="cpu", weights_only=False)
+        sd   = ckpt.get("model_state_dict", ckpt)
+        model = _build_splice_arch(sd)
+        model.eval()
+        val_acc = ckpt.get("val_accuracy", "n/a")
+        print(f"[PeVe] ✓ splice loaded ({candidates[0].name}) val_acc={val_acc}")
+        splice_model_status.update({"loaded": True, "error_message": None})
+        return model, None
+    except Exception:
+        tb = traceback.format_exc()
+        print(f"[PeVe] ✗ splice load failed:\n{tb}")
+        splice_model_status.update({"loaded": False, "error_message": tb})
+        return None, None
+def _load_context() -> tuple:
+    import torch
+    print(f"[PeVe] Loading context model from {_REPO_CONTEXT}")
+    try:
+        local = _download_repo(_REPO_CONTEXT)
+        candidates = (
+            list(local.glob("mutation_predictor_splice_v4.pt"))
+            + list(local.glob("*.pt"))
+            + list(local.glob("*.pth"))
+            + list(local.glob("*.bin"))
+        )
+        if not candidates:
+            raise FileNotFoundError(f"No checkpoint in {_REPO_CONTEXT}")
+        sd = torch.load(str(candidates[0]), map_location="cpu", weights_only=False)
+        if isinstance(sd, dict) and "model_state_dict" in sd:
+            sd = sd["model_state_dict"]
+        model = _build_context_arch(sd)
+        model.eval()
+        print(f"[PeVe] ✓ context loaded ({candidates[0].name})")
+        context_model_status.update({"loaded": True, "error_message": None})
+        return model, None
+    except Exception:
+        tb = traceback.format_exc()
+        print(f"[PeVe] ✗ context load failed:\n{tb}")
+        context_model_status.update({"loaded": False, "error_message": tb})
+        return None, None
+def _load_protein():
+    import xgboost as xgb
+    import torch
+    print(f"[PeVe] Loading protein model from {_REPO_PROTEIN}")
+    _FEAT = ["gnomAD_AF", "Grantham", "Charge_change",
+             "Hydrophobicity_diff", "Protein_pos_norm", "VEP_IMPACT"]
+    try:
+        local = _download_repo(_REPO_PROTEIN)
+        # ── Try XGBoost formats first ──────────────────────────────────────
+        for ext in ["*.json", "*.ubj", "*.model"]:
+            for p in local.glob(ext):
+                try:
+                    m = xgb.Booster()
+                    m.load_model(str(p))
+                    print(f"[PeVe] ✓ protein loaded as XGBoost Booster ({p.name})")
+                    protein_model_status.update({"loaded": True, "error_message": None})
+                    return m
+                except Exception as e:
+                    print(f"[PeVe]   xgb.Booster failed for {p.name}: {e}")
+        # ── Try pickle ────────────────────────────────────────────────────
+        for p in local.glob("*.pkl"):
+            try:
+                with open(p, "rb") as f:
+                    m = pickle.load(f)
+                print(f"[PeVe] ✓ protein loaded via pickle ({p.name})")
+                protein_model_status.update({"loaded": True, "error_message": None})
+                return m
+            except Exception as e:
+                print(f"[PeVe]   pickle failed for {p.name}: {e}")
+        # ── Fallback: PyTorch checkpoint — wrap as XGB-compatible ─────────
+        for ext in ["*.pt", "*.pth", "*.bin"]:
+            for p in local.glob(ext):
+                try:
+                    ckpt = torch.load(str(p), map_location="cpu", weights_only=False)
+                    sd   = ckpt.get("model_state_dict", ckpt) if isinstance(ckpt, dict) else ckpt
+                    if isinstance(sd, dict):
+                        # Try MutationPredictorCNN (protein space model.py)
+                        from torch import nn
+                        class MutationPredictorCNN(nn.Module):
+                            def __init__(self):
+                                super().__init__()
+                                self.conv1 = nn.Conv1d(11, 64, kernel_size=7, padding=3)
+                                self.bn1   = nn.BatchNorm1d(64)
+                                self.conv2 = nn.Conv1d(64, 128, kernel_size=5, padding=2)
+                                self.bn2   = nn.BatchNorm1d(128)
+                                self.conv3 = nn.Conv1d(128, 256, kernel_size=3, padding=1)
+                                self.bn3   = nn.BatchNorm1d(256)
+                                self.adaptive_pool   = nn.AdaptiveAvgPool1d(1)
+                                self.mut_fc          = nn.Linear(12, 32)
+                                self.fc1 = nn.Linear(288, 128)
+                                self.fc2 = nn.Linear(128, 64)
+                                self.fc3 = nn.Linear(64, 1)
+                                self.importance_head = nn.Linear(256, 1)
+                            def forward(self, x):
+                                import torch.nn.functional as F
+                                bs = x.size(0)
+                                mut_type  = x[:, 1089:1101]
+                                x_seq     = x[:, :1089].view(bs, 11, 99)
+                                x_conv    = F.relu(self.bn1(self.conv1(x_seq)))
+                                x_conv    = nn.MaxPool1d(2, 2)(x_conv)
+                                x_conv    = F.relu(self.bn2(self.conv2(x_conv)))
+                                x_conv    = nn.MaxPool1d(2, 2)(x_conv)
+                                x_conv    = F.relu(self.bn3(self.conv3(x_conv)))
+                                x_conv    = nn.MaxPool1d(2, 2)(x_conv)
+                                x_conv    = self.adaptive_pool(x_conv)
+                                conv_feat = x_conv.view(bs, 256)
+                                mut_feat  = F.relu(self.mut_fc(mut_type))
+                                combined  = torch.cat([conv_feat, mut_feat], dim=1)
+                                x_out     = F.relu(self.fc1(combined))
+                                x_out     = F.relu(self.fc2(x_out))
+                                cls       = torch.sigmoid(self.fc3(x_out))
+                                imp       = torch.sigmoid(self.importance_head(conv_feat))
+                                return cls, imp
+                        torch_model = MutationPredictorCNN()
+                        torch_model.load_state_dict(sd)
+                        torch_model.eval()
+                        wrapped = _CNNasXGB(torch_model, _FEAT)
+                        print(f"[PeVe] ✓ protein loaded as CNN→XGB wrapper ({p.name})")
+                        protein_model_status.update({"loaded": True, "error_message": None})
+                        return wrapped
+                    else:
+                        # Raw nn.Module saved whole
+                        wrapped = _CNNasXGB(sd, _FEAT)
+                        protein_model_status.update({"loaded": True, "error_message": None})
+                        return wrapped
+                except Exception as e:
+                    print(f"[PeVe]   torch fallback failed for {p.name}: {e}")
+        raise FileNotFoundError("No loadable model file found in protein repo")
+    except Exception:
+        tb = traceback.format_exc()
+        print(f"[PeVe] ✗ protein load failed:\n{tb}")
+        protein_model_status.update({"loaded": False, "error_message": tb})
+        return None
+# ══════════════════════════════════════════════════════════════════════════════
+# Public API  (names that app.py imports)
+# ══════════════════════════════════════════════════════════════════════════════
+def get_splice_model() -> tuple:
+    """Returns (model, tokenizer). model(tensor) → (logit, imp, r_imp, s_imp)"""
+    global _splice_model, _splice_tok
+    if _splice_model is None:
+        _splice_model, _splice_tok = _load_splice()
+    return _splice_model, _splice_tok
+def get_context_model() -> tuple:
+    """Returns (model, tokenizer). model(tensor) → logit tensor"""
+    global _context_model, _context_tok
+    if _context_model is None:
+        _context_model, _context_tok = _load_context()
+    return _context_model, _context_tok
+def get_protein_model():
+    """Returns XGBoost Booster (or CNN wrapper). model.predict(dmat) → float array"""
+    global _protein_model
+    if _protein_model is None:
+        _protein_model = _load_protein()
+    return _protein_model
+def get_model_status() -> dict:
+    return {
+        "splice":  dict(splice_model_status),
+        "context": dict(context_model_status),
+        "protein": dict(protein_model_status),
+    }
+# ══════════════════════════════════════════════════════════════════════════════
+# Test block
+# ══════════════════════════════════════════════════════════════════════════════
+def test_model_loading() -> dict:
+    import torch
+    print("[PeVe] ── test_model_loading() ──")
+    sm, _  = get_splice_model()
+    cm, _  = get_context_model()
+    pm     = get_protein_model()
+    results = {}
+    # Test splice
+    try:
+        if sm is not None:
+            dummy = torch.zeros(1, 1106)
+            out = sm(dummy)
+            results["splice"] = f"✓ output shapes: {[o.shape for o in out]}"
+        else:
+            results["splice"] = "✗ model is None"
+    except Exception as e:
+        results["splice"] = f"✗ forward failed: {e}"
+    # Test context
+    try:
+        if cm is not None:
+            dummy = torch.zeros(1, 1106)
+            out = cm(dummy)
+            results["context"] = f"✓ output shape: {out.shape if hasattr(out,'shape') else type(out)}"
+        else:
+            results["context"] = "✗ model is None"
+    except Exception as e:
+        results["context"] = f"✗ forward failed: {e}"
+    # Test protein
+    try:
+        if pm is not None:
+            import xgboost as xgb
+            feat = ["gnomAD_AF","Grantham","Charge_change",
+                    "Hydrophobicity_diff","Protein_pos_norm","VEP_IMPACT"]
+            X = np.array([[0.001, 100.0, 0.0, 0.5, 0.5, 2.0]], dtype=np.float32)
+            dmat = xgb.DMatrix(X, feature_names=feat)
+            pred = pm.predict(dmat)
+            results["protein"] = f"✓ prediction: {pred}"
+        else:
+            results["protein"] = "✗ model is None"
+    except Exception as e:
+        results["protein"] = f"✗ predict failed: {e}"
+    status = get_model_status()
+    final = {
+        "model_status": status,
+        "forward_tests": results,
+        "all_loaded": all(v["loaded"] for v in status.values()),
+    }
+    import json
+    print(json.dumps(final, indent=2, default=str))
+    return final
+# Running this file as a script performs the load + dummy-forward smoke test
+# and prints the resulting report.
+if __name__ == "__main__":
+    test_model_loading()