# Source: Hugging Face Spaces — nileshhanotia / Mutation_XAI (Space status: Sleeping)
# File size: 24,735 Bytes


"""
model_loader.py — PeVe v1.4
============================

What app.py needs (read from app.py source):

  from model_loader import get_splice_model, get_context_model, get_protein_model

  model, tokenizer = get_splice_model()
    → model accepts: torch.Tensor of shape (1, 401, 8) or (1, 401, 8) + flags
    → used inside torch.no_grad()
    → returns tensor or tuple[tensor, ...]

  model, tokenizer = get_context_model()
    → same calling convention as splice

  model = get_protein_model()
    → used with: xgb.DMatrix(X, feature_names=[...])
    → model.predict(dmat)  → float array
    → also passed to shap.TreeExplainer(model)

Model sources (from the Space app.py files):
  splice  → nileshhanotia/mutation-predictor-splice
              file: mutation_predictor_splice.pt
              arch: MutationPredictorCNN_v2  (input flat 1106)
              NOTE: app.py passes (1, 401, 8) tensor — loader must reshape

  context → nileshhanotia/mutation-predictor-v4
              file: mutation_predictor_splice_v4.pt
              arch: MutationPredictorCNN_v4 (4-tensor forward: seq,mut,region,splice)
              NOTE: app.py passes (1, 401, 8) tensor — loader wraps forward

  protein → nileshhanotia/mutation-pathogenicity-predictor
              file: *.json / *.ubj / *.model / *.pkl
              NOTE: app.py uses xgb.DMatrix + shap — MUST be XGBoost
              If file is actually a CNN checkpoint, we wrap it as an
              XGBoost-compatible object so app.py code paths still work.
"""
from __future__ import annotations

import os
import pickle
import traceback
import warnings
from pathlib import Path
from typing import Any

import numpy as np

# ── HF token ──────────────────────────────────────────────────────────────────
_HF_TOKEN: str | None = (
    os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
)

# ── Model repo IDs ─────────────────────────────────────────────────────────────
_REPO_SPLICE   = "nileshhanotia/mutation-predictor-splice"
_REPO_CONTEXT  = "nileshhanotia/mutation-predictor-v4"
_REPO_PROTEIN  = "nileshhanotia/mutation-pathogenicity-predictor"

# ── Global cache ───────────────────────────────────────────────────────────────
_splice_model   = None
_splice_tok     = None
_context_model  = None
_context_tok    = None
_protein_model  = None

# ── Structured status dicts ────────────────────────────────────────────────────
splice_model_status:  dict = {"loaded": False, "error_message": None}
context_model_status: dict = {"loaded": False, "error_message": None}
protein_model_status: dict = {"loaded": False, "error_message": None}


# ══════════════════════════════════════════════════════════════════════════════
# Model Architecture definitions
# (must match checkpoint shapes exactly)
# ══════════════════════════════════════════════════════════════════════════════

def _build_splice_arch(sd: dict):
    """
    Build MutationPredictorCNN_v2 and load the checkpoint weights into it.

    The output widths of ``fc_region`` and ``splice_fc`` are not fixed; they
    are read from the weight shapes stored in ``sd`` so the network matches
    the checkpoint it is about to load.

    Args:
        sd: state dict of the network. Must contain "fc_region.weight" and
            "splice_fc.weight" (used for shape inference) and every other
            parameter/buffer of the model, because the load is strict.

    Returns:
        The model with weights loaded. The mode is not changed here; callers
        are expected to call ``.eval()`` themselves.

    Raises:
        KeyError: if one of the two shape-defining weights is missing.
        RuntimeError: from ``load_state_dict`` on any key or shape mismatch.

    Forward returns ``(logit, imp_score, r_imp, s_imp)``; only ``logit`` is
    un-activated, the three importance outputs have passed through a sigmoid.
    """
    import torch
    import torch.nn as nn

    # Layout of the native flat input: 1106 values per sample.
    seq_channels, seq_len = 11, 99
    seq_end = seq_channels * seq_len    # 1089: flattened 11 x 99 sequence grid
    mut_end = seq_end + 12              # 1101: 12-wide mutation vector
    region_end = mut_end + 2            # 1103: 2 region features
    flat_dim = region_end + 3           # 1106: 3 splice features
    conv_channels = 256

    fc_region_out = sd["fc_region.weight"].shape[0]
    splice_fc_out = sd["splice_fc.weight"].shape[0]

    class MutationPredictorCNN_v2(nn.Module):
        def __init__(self):
            super().__init__()
            # Width of the fused vector: pooled conv + mut_fc + region + splice.
            fc1_in = conv_channels + 32 + fc_region_out + splice_fc_out
            self.conv1 = nn.Conv1d(seq_channels, 64, kernel_size=7, padding=3)
            self.bn1   = nn.BatchNorm1d(64)
            self.conv2 = nn.Conv1d(64,  128, kernel_size=5, padding=2)
            self.bn2   = nn.BatchNorm1d(128)
            self.conv3 = nn.Conv1d(128, conv_channels, kernel_size=3, padding=1)
            self.bn3   = nn.BatchNorm1d(conv_channels)
            self.global_pool            = nn.AdaptiveAvgPool1d(1)
            self.mut_fc                 = nn.Linear(12, 32)
            self.importance_head        = nn.Linear(conv_channels, 1)
            self.region_importance_head = nn.Linear(conv_channels, 2)
            self.fc_region              = nn.Linear(2, fc_region_out)
            self.splice_fc              = nn.Linear(3, splice_fc_out)
            self.splice_importance_head = nn.Linear(conv_channels, 3)
            self.fc1 = nn.Linear(fc1_in, 128)
            self.fc2 = nn.Linear(128, 64)
            self.fc3 = nn.Linear(64, 1)
            self.relu    = nn.ReLU()
            self.dropout = nn.Dropout(0.4)

        def _forward_flat(self, x_flat, mutation_positions=None):
            """Original forward for flat 1106-dim input of shape (B, 1106)."""
            bs = x_flat.size(0)
            seq_flat    = x_flat[:, :seq_end]
            mut_onehot  = x_flat[:, seq_end:mut_end]
            region_feat = x_flat[:, mut_end:region_end]
            splice_feat = x_flat[:, region_end:flat_dim]

            h = seq_flat.view(bs, seq_channels, seq_len)
            h = self.relu(self.bn1(self.conv1(h)))
            h = self.relu(self.bn2(self.conv2(h)))
            conv_out = self.relu(self.bn3(self.conv3(h)))

            if mutation_positions is None:
                # Default position = argmax over the last of the 11 channels
                # (flat columns 990..1088); presumably a mutation-position
                # mask — TODO confirm against the training encoder.
                mutation_positions = x_flat[:, seq_end - seq_len:seq_end].argmax(dim=1)
            pos_idx = mutation_positions.clamp(0, seq_len - 1).long()
            # Pick the conv feature column at the mutation position per sample.
            gather_idx = pos_idx.view(bs, 1, 1).expand(bs, conv_channels, 1)
            mut_feat  = conv_out.gather(2, gather_idx).squeeze(2)
            imp_score = torch.sigmoid(self.importance_head(mut_feat))

            pooled = self.global_pool(conv_out).squeeze(-1)
            r_imp  = torch.sigmoid(self.region_importance_head(pooled))
            s_imp  = torch.sigmoid(self.splice_importance_head(pooled))

            m = self.relu(self.mut_fc(mut_onehot))
            r = self.relu(self.fc_region(region_feat))
            s = self.relu(self.splice_fc(splice_feat))
            fused = torch.cat([pooled, m, r, s], dim=1)
            out   = self.dropout(self.relu(self.fc1(fused)))
            out   = self.dropout(self.relu(self.fc2(out)))
            logit = self.fc3(out)
            return logit, imp_score, r_imp, s_imp

        def forward(self, x, mutation_positions=None):
            """
            Accept whatever shape app.py sends:
              (B, 401, 8) — raw encoded window from app.py
              (B, 1106)   — flat input (native format)

            For 3-D input the tensor is flattened per sample, its first 1089
            values are reinterpreted as the 11 x 99 sequence grid, and the
            mutation/region/splice slots are filled with zeros.
            NOTE(review): this is a shape adapter only; whether the first
            1089 values of a 401 x 8 window are a meaningful 11 x 99 encoding
            cannot be verified from this file.
            """
            if x.dim() == 3:
                bs = x.size(0)
                seq_part = x.reshape(bs, -1)[:, :seq_end]
                # new_zeros keeps the padding on x's device AND in x's dtype,
                # so concatenation never silently promotes the input.
                pad = x.new_zeros(bs, flat_dim - seq_end)
                x = torch.cat([seq_part, pad], dim=1)       # → (B, 1106)
            return self._forward_flat(x, mutation_positions)

    model = MutationPredictorCNN_v2()
    model.load_state_dict(sd)
    return model


def _build_context_arch(sd: dict):
    """
    Build MutationPredictorCNN_v4 and load the checkpoint weights into it.

    The network natively takes four tensors (seq, mut, region, splice).
    ``forward`` additionally adapts the single-tensor calls made by app.py.

    The output widths of ``mut_fc``, ``region_fc`` and ``splice_fc`` are read
    from ``sd`` when the corresponding weights are present (falling back to
    32 / 8 / 16), so the fused layer size always matches the checkpoint.

    Args:
        sd: state dict holding every parameter of the network (strict load).

    Returns:
        The model with weights loaded; the mode is left for the caller to set.

    Raises:
        RuntimeError: from ``load_state_dict`` on any key or shape mismatch.
    """
    import torch
    import torch.nn as nn

    seq_channels, seq_len = 11, 99
    seq_end = seq_channels * seq_len    # 1089
    mut_end = seq_end + 12              # 1101
    region_end = mut_end + 2            # 1103
    flat_dim = region_end + 3           # 1106

    def _width(key: str, default: int) -> int:
        """Output width of a Linear layer as stored in the checkpoint."""
        weight = sd.get(key)
        return default if weight is None else int(weight.shape[0])

    mut_out = _width("mut_fc.weight", 32)
    region_out = _width("region_fc.weight", 8)
    splice_out = _width("splice_fc.weight", 16)

    class MutationPredictorCNN_v4(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = nn.Conv1d(seq_channels, 64, 7, padding=3)
            self.conv2 = nn.Conv1d(64, 128, 5, padding=2)
            self.conv3 = nn.Conv1d(128, 256, 3, padding=1)
            self.pool  = nn.AdaptiveAvgPool1d(1)
            self.mut_fc    = nn.Linear(12, mut_out)
            self.region_fc = nn.Linear(2, region_out)
            self.splice_fc = nn.Linear(3, splice_out)
            # 312 inputs with the default 32 / 8 / 16 branch widths.
            self.fc1 = nn.Linear(256 + mut_out + region_out + splice_out, 128)
            self.fc2 = nn.Linear(128, 64)
            self.fc3 = nn.Linear(64, 1)
            self.relu    = nn.ReLU()
            self.dropout = nn.Dropout(0.3)

        def _forward_native(self, seq, mut, region, splice):
            """Native 4-tensor forward; returns the raw logit, shape (B, 1)."""
            x = self.relu(self.conv1(seq))
            x = self.relu(self.conv2(x))
            x = self.relu(self.conv3(x))
            x = self.pool(x).squeeze(-1)
            m = self.relu(self.mut_fc(mut))
            r = self.relu(self.region_fc(region))
            s = self.relu(self.splice_fc(splice))
            x = torch.cat([x, m, r, s], dim=1)
            x = self.dropout(self.relu(self.fc1(x)))
            x = self.relu(self.fc2(x))
            return self.fc3(x)

        def forward(self, x, *args):
            """
            Accept:
              (seq, mut, region, splice) — native call, seq is (B, 11, ...)
              (B, 401, 8)-style 3-D tensor → first 1089 flattened values are
                  viewed as (B, 11, 99); mut/region/splice are zeros
              (B, 1106) flat tensor → sliced into the four native inputs

            A native call is recognised by exactly three extra arguments plus
            a 3-D first tensor whose channel dimension matches conv1. Without
            this check the 3-D adapter branch would swallow the native call
            and silently replace the auxiliary tensors with zeros. Any other
            extra arguments on a 3-D / 2-D tensor are ignored, as before.
            """
            if isinstance(x, torch.Tensor):
                is_native = (
                    len(args) == 3
                    and x.dim() == 3
                    and x.size(1) == seq_channels
                )
                if x.dim() == 3 and not is_native:
                    bs = x.size(0)
                    seq = x.reshape(bs, -1)[:, :seq_end].view(bs, seq_channels, seq_len)
                    # new_zeros keeps device and dtype aligned with the input.
                    return self._forward_native(
                        seq,
                        x.new_zeros(bs, 12),
                        x.new_zeros(bs, 2),
                        x.new_zeros(bs, 3),
                    )
                # flat 1089+12+2+3 = 1106 case
                if x.dim() == 2:
                    bs = x.size(0)
                    return self._forward_native(
                        x[:, :seq_end].view(bs, seq_channels, seq_len),
                        x[:, seq_end:mut_end],
                        x[:, mut_end:region_end],
                        x[:, region_end:flat_dim],
                    )
            # called with 4 separate tensors
            return self._forward_native(x, *args)

    model = MutationPredictorCNN_v4()
    model.load_state_dict(sd)
    return model


class _CNNasXGB:
    """
    Wrapper that makes a PyTorch CNN look like an XGBoost Booster
    to satisfy the app.py code paths:
        dmat  = xgb.DMatrix(X, feature_names=[...])
        pred  = model.predict(dmat)   ← needs .predict()
        shap.TreeExplainer(model)     ← will fail gracefully; app has try/except

    The wrapped model is switched to eval mode on construction. ``predict``
    applies a sigmoid to the model's (first) output, so the wrapped model is
    expected to return a raw logit.
    """

    def __init__(self, torch_model, feature_names: list[str]):
        import torch
        self._model = torch_model
        self._model.eval()
        self._features = feature_names
        self._device   = torch.device("cpu")

    @staticmethod
    def _to_dense(data) -> np.ndarray:
        """
        Convert a DMatrix-like object, a sparse matrix or an array-like into
        a dense 2-D float32 array.

        ``get_data()`` may hand back either a sparse matrix (has ``toarray``)
        or an already dense array, so both are handled. A 1-D input is
        treated as a single row.
        """
        getter = getattr(data, "get_data", None)
        if callable(getter):
            data = getter()
        if hasattr(data, "toarray"):
            data = data.toarray()
        return np.atleast_2d(np.asarray(data, dtype=np.float32))

    def predict(self, dmat_or_array) -> np.ndarray:
        """
        Run the wrapped model and return sigmoid probabilities as a flat
        1-D float array.

        Input that cannot be converted to a numeric matrix is replaced by a
        single all-zero row (one column per feature name) and a warning is
        emitted, so the best-effort fallback is visible rather than silent.
        If the model returns a tuple/list, only its first element is used.
        """
        import torch
        try:
            X = self._to_dense(dmat_or_array)
        except Exception as exc:
            warnings.warn(
                f"_CNNasXGB.predict: could not read input ({exc!r}); "
                "falling back to an all-zero feature row",
                RuntimeWarning,
                stacklevel=2,
            )
            X = np.zeros((1, len(self._features)), dtype=np.float32)

        t = torch.tensor(X, dtype=torch.float32)
        with torch.no_grad():
            out = self._model(t)
            if isinstance(out, (tuple, list)):
                out = out[0]
            probs = torch.sigmoid(out).cpu().numpy().flatten()
        return probs

    # Allow shap.TreeExplainer to fail gracefully — app.py has try/except
    def get_booster(self):
        """Always raises: there is no tree booster behind this wrapper."""
        raise NotImplementedError("CNN wrapped as XGB — SHAP not available")

# ══════════════════════════════════════════════════════════════════════════════
# Internal loaders
# ══════════════════════════════════════════════════════════════════════════════

def _download_repo(repo_id: str) -> Path:
    """
    Fetch a snapshot of ``repo_id`` from the Hugging Face Hub (using the
    module-level token), log the names of all files it contains, and return
    the local snapshot directory.
    """
    from huggingface_hub import snapshot_download

    snapshot_dir = Path(snapshot_download(repo_id=repo_id, token=_HF_TOKEN))

    # File names only (no directories), collected recursively for the log line.
    files = []
    for entry in snapshot_dir.rglob("*"):
        if entry.is_file():
            files.append(entry.name)

    print(f"[PeVe] {repo_id} files: {files}")
    return snapshot_dir


def _load_splice() -> tuple:
    """
    Download the splice checkpoint, build the network and load its weights.

    Returns:
        ``(model, None)`` on success — the second slot is the tokenizer
        placeholder and is always None — or ``(None, None)`` on failure.

    Exceptions are not propagated: any failure (including a missing torch
    installation) is printed and its traceback stored in
    ``splice_model_status["error_message"]``.
    """
    print(f"[PeVe] Loading splice model from {_REPO_SPLICE}")
    try:
        # Imported inside the try so an ImportError is reported through the
        # status dict like every other load failure.
        import torch

        local = _download_repo(_REPO_SPLICE)

        # Look for the checkpoint — priority: named file, then any .pt/.pth/.bin
        # (sorted, so the fallback pick does not depend on filesystem order).
        candidates = list(local.glob("mutation_predictor_splice.pt"))
        for pattern in ("*.pt", "*.pth", "*.bin"):
            candidates.extend(sorted(local.glob(pattern)))
        if not candidates:
            raise FileNotFoundError(f"No checkpoint in {_REPO_SPLICE}")

        # SECURITY: weights_only=False unpickles arbitrary objects from the
        # downloaded file; only safe while the repository is trusted.
        ckpt = torch.load(str(candidates[0]), map_location="cpu", weights_only=False)
        # Either a training checkpoint wrapping the weights, or a bare state dict.
        sd   = ckpt.get("model_state_dict", ckpt)

        model = _build_splice_arch(sd)
        model.eval()
        val_acc = ckpt.get("val_accuracy", "n/a")
        print(f"[PeVe] ✓ splice loaded ({candidates[0].name}) val_acc={val_acc}")
        splice_model_status.update({"loaded": True, "error_message": None})
        return model, None

    except Exception:
        tb = traceback.format_exc()
        print(f"[PeVe] ✗ splice load failed:\n{tb}")
        splice_model_status.update({"loaded": False, "error_message": tb})
        return None, None


def _load_context() -> tuple:
    """
    Download the context (v4) checkpoint, build the network and load weights.

    Returns:
        ``(model, None)`` on success — the second slot is the tokenizer
        placeholder and is always None — or ``(None, None)`` on failure.

    Exceptions are not propagated: any failure (including a missing torch
    installation) is printed and its traceback stored in
    ``context_model_status["error_message"]``.
    """
    print(f"[PeVe] Loading context model from {_REPO_CONTEXT}")
    try:
        # Imported inside the try so an ImportError is reported through the
        # status dict like every other load failure.
        import torch

        local = _download_repo(_REPO_CONTEXT)

        # Named file first, then any .pt/.pth/.bin in a deterministic order.
        candidates = list(local.glob("mutation_predictor_splice_v4.pt"))
        for pattern in ("*.pt", "*.pth", "*.bin"):
            candidates.extend(sorted(local.glob(pattern)))
        if not candidates:
            raise FileNotFoundError(f"No checkpoint in {_REPO_CONTEXT}")

        # SECURITY: weights_only=False unpickles arbitrary objects from the
        # downloaded file; only safe while the repository is trusted.
        sd = torch.load(str(candidates[0]), map_location="cpu", weights_only=False)
        # Unwrap a training checkpoint; a bare state dict is used as-is.
        if isinstance(sd, dict) and "model_state_dict" in sd:
            sd = sd["model_state_dict"]

        model = _build_context_arch(sd)
        model.eval()
        print(f"[PeVe] ✓ context loaded ({candidates[0].name})")
        context_model_status.update({"loaded": True, "error_message": None})
        return model, None

    except Exception:
        tb = traceback.format_exc()
        print(f"[PeVe] ✗ context load failed:\n{tb}")
        context_model_status.update({"loaded": False, "error_message": tb})
        return None, None


def _load_protein():
    """
    Load the protein model, trying formats in this order:

      1. native XGBoost files (*.json, *.ubj, *.model) → ``xgb.Booster``
      2. pickle files (*.pkl) → whatever object was pickled
      3. PyTorch checkpoints (*.pt, *.pth, *.bin) → ``_CNNasXGB`` wrapper

    Returns:
        The loaded model, or None when nothing could be loaded.

    Exceptions are not propagated: per-file failures are printed and the
    next candidate is tried; a total failure (including a missing xgboost or
    torch installation) is printed and its traceback stored in
    ``protein_model_status["error_message"]``.
    """
    print(f"[PeVe] Loading protein model from {_REPO_PROTEIN}")

    _FEAT = ["gnomAD_AF", "Grantham", "Charge_change",
             "Hydrophobicity_diff", "Protein_pos_norm", "VEP_IMPACT"]

    try:
        # Imported inside the try so an ImportError is reported through the
        # status dict like every other load failure.
        import xgboost as xgb

        local = _download_repo(_REPO_PROTEIN)

        # ── Try XGBoost formats first ──────────────────────────────────────
        for ext in ("*.json", "*.ubj", "*.model"):
            for p in sorted(local.glob(ext)):
                try:
                    m = xgb.Booster()
                    m.load_model(str(p))
                except Exception as e:
                    print(f"[PeVe]   xgb.Booster failed for {p.name}: {e}")
                    continue
                print(f"[PeVe] ✓ protein loaded as XGBoost Booster ({p.name})")
                protein_model_status.update({"loaded": True, "error_message": None})
                return m

        # ── Try pickle ────────────────────────────────────────────────────
        # SECURITY: pickle.load executes arbitrary code from the downloaded
        # file; only safe while the repository is trusted.
        for p in sorted(local.glob("*.pkl")):
            try:
                with open(p, "rb") as f:
                    m = pickle.load(f)
            except Exception as e:
                print(f"[PeVe]   pickle failed for {p.name}: {e}")
                continue
            print(f"[PeVe] ✓ protein loaded via pickle ({p.name})")
            protein_model_status.update({"loaded": True, "error_message": None})
            return m

        # ── Fallback: PyTorch checkpoint — wrap as XGB-compatible ─────────
        # torch is only needed from here on, so it is imported lazily.
        import torch

        for ext in ("*.pt", "*.pth", "*.bin"):
            for p in sorted(local.glob(ext)):
                try:
                    # SECURITY: weights_only=False unpickles arbitrary objects.
                    ckpt = torch.load(str(p), map_location="cpu", weights_only=False)
                    sd   = ckpt.get("model_state_dict", ckpt) if isinstance(ckpt, dict) else ckpt

                    if isinstance(sd, dict):
                        # Try MutationPredictorCNN (protein space model.py)
                        torch_model = _build_protein_arch(sd)
                        torch_model.eval()
                        wrapped = _CNNasXGB(torch_model, _FEAT)
                        print(f"[PeVe] ✓ protein loaded as CNN→XGB wrapper ({p.name})")
                    else:
                        # Raw nn.Module saved whole
                        wrapped = _CNNasXGB(sd, _FEAT)
                    protein_model_status.update({"loaded": True, "error_message": None})
                    return wrapped

                except Exception as e:
                    print(f"[PeVe]   torch fallback failed for {p.name}: {e}")

        raise FileNotFoundError("No loadable model file found in protein repo")

    except Exception:
        tb = traceback.format_exc()
        print(f"[PeVe] ✗ protein load failed:\n{tb}")
        protein_model_status.update({"loaded": False, "error_message": tb})
        return None


def _build_protein_arch(sd: dict):
    """
    Build the MutationPredictorCNN used by the protein fallback path and load
    ``sd`` into it (strict load; raises RuntimeError on any mismatch).

    ``forward`` takes a flat (B, >=1101) tensor and returns
    ``(logit, importance)``. The classification output is deliberately the
    RAW logit: ``_CNNasXGB.predict`` applies the sigmoid itself, so applying
    one here as well would squash every prediction into (0.5, 0.73).

    NOTE(review): the wrapper is fed the 6 tabular protein features while this
    network slices columns up to 1101 — the forward pass will raise on such
    input. Left as-is; confirm the intended input encoding.
    """
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class MutationPredictorCNN(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = nn.Conv1d(11, 64, kernel_size=7, padding=3)
            self.bn1   = nn.BatchNorm1d(64)
            self.conv2 = nn.Conv1d(64, 128, kernel_size=5, padding=2)
            self.bn2   = nn.BatchNorm1d(128)
            self.conv3 = nn.Conv1d(128, 256, kernel_size=3, padding=1)
            self.bn3   = nn.BatchNorm1d(256)
            self.adaptive_pool   = nn.AdaptiveAvgPool1d(1)
            self.mut_fc          = nn.Linear(12, 32)
            self.fc1 = nn.Linear(288, 128)
            self.fc2 = nn.Linear(128, 64)
            self.fc3 = nn.Linear(64, 1)
            self.importance_head = nn.Linear(256, 1)

        def forward(self, x):
            bs = x.size(0)
            mut_type = x[:, 1089:1101]
            h = x[:, :1089].view(bs, 11, 99)
            # Functional pooling: same result as a fresh nn.MaxPool1d(2, 2)
            # per call, without building a module on every forward pass.
            h = F.max_pool1d(F.relu(self.bn1(self.conv1(h))), 2, 2)
            h = F.max_pool1d(F.relu(self.bn2(self.conv2(h))), 2, 2)
            h = F.max_pool1d(F.relu(self.bn3(self.conv3(h))), 2, 2)
            conv_feat = self.adaptive_pool(h).view(bs, 256)
            mut_feat  = F.relu(self.mut_fc(mut_type))
            combined  = torch.cat([conv_feat, mut_feat], dim=1)
            hidden    = F.relu(self.fc1(combined))
            hidden    = F.relu(self.fc2(hidden))
            logit     = self.fc3(hidden)
            imp       = torch.sigmoid(self.importance_head(conv_feat))
            return logit, imp

    model = MutationPredictorCNN()
    model.load_state_dict(sd)
    return model


# ══════════════════════════════════════════════════════════════════════════════
# Public API  (names that app.py imports)
# ══════════════════════════════════════════════════════════════════════════════

def get_splice_model() -> tuple:
    """
    Return the cached ``(model, tokenizer)`` pair for the splice model,
    loading it on first use. model(tensor) → (logit, imp, r_imp, s_imp).

    While the cached model is None (never loaded, or the last load failed)
    every call attempts the load again.
    """
    global _splice_model, _splice_tok
    if _splice_model is not None:
        return _splice_model, _splice_tok
    _splice_model, _splice_tok = _load_splice()
    return _splice_model, _splice_tok


def get_context_model() -> tuple:
    """
    Return the cached ``(model, tokenizer)`` pair for the context model,
    loading it on first use. model(tensor) → logit tensor.

    While the cached model is None (never loaded, or the last load failed)
    every call attempts the load again.
    """
    global _context_model, _context_tok
    if _context_model is not None:
        return _context_model, _context_tok
    _context_model, _context_tok = _load_context()
    return _context_model, _context_tok


def get_protein_model():
    """
    Return the cached protein model (XGBoost Booster or CNN wrapper), loading
    it on first use. model.predict(dmat) → float array.

    While the cached model is None (never loaded, or the last load failed)
    every call attempts the load again.
    """
    global _protein_model
    if _protein_model is not None:
        return _protein_model
    _protein_model = _load_protein()
    return _protein_model


def get_model_status() -> dict:
    """
    Return a snapshot of the three load-status dicts keyed by model name.

    Each value is a shallow copy, so callers can modify the result without
    touching the module-level status dicts.
    """
    sources = (
        ("splice", splice_model_status),
        ("context", context_model_status),
        ("protein", protein_model_status),
    )
    return {label: dict(state) for label, state in sources}


# ══════════════════════════════════════════════════════════════════════════════
# Test block
# ══════════════════════════════════════════════════════════════════════════════

def test_model_loading() -> dict:
    """
    Smoke-test all three models: load each one, run a single dummy input
    through it, print a JSON report and return that report as a dict.

    The report has three keys: "model_status" (snapshot of the status dicts),
    "forward_tests" (one human-readable line per model) and "all_loaded".
    Failures of the individual forward/predict calls are captured in the
    report instead of being raised.
    """
    import torch
    print("[PeVe] ── test_model_loading() ──")

    # Load everything up front, in the same order the report lists them.
    splice_net, _ = get_splice_model()
    context_net, _ = get_context_model()
    protein_net = get_protein_model()

    report = {}

    # Test splice — native flat input, one sample
    if splice_net is None:
        report["splice"] = "✗ model is None"
    else:
        try:
            outputs = splice_net(torch.zeros(1, 1106))
            shapes = [o.shape for o in outputs]
            report["splice"] = f"✓ output shapes: {shapes}"
        except Exception as exc:
            report["splice"] = f"✗ forward failed: {exc}"

    # Test context — same flat dummy input
    if context_net is None:
        report["context"] = "✗ model is None"
    else:
        try:
            result = context_net(torch.zeros(1, 1106))
            described = result.shape if hasattr(result, "shape") else type(result)
            report["context"] = f"✓ output shape: {described}"
        except Exception as exc:
            report["context"] = f"✗ forward failed: {exc}"

    # Test protein — one row of the six tabular features via a DMatrix
    if protein_net is None:
        report["protein"] = "✗ model is None"
    else:
        try:
            import xgboost as xgb
            columns = ["gnomAD_AF", "Grantham", "Charge_change",
                       "Hydrophobicity_diff", "Protein_pos_norm", "VEP_IMPACT"]
            row = np.array([[0.001, 100.0, 0.0, 0.5, 0.5, 2.0]], dtype=np.float32)
            prediction = protein_net.predict(xgb.DMatrix(row, feature_names=columns))
            report["protein"] = f"✓ prediction: {prediction}"
        except Exception as exc:
            report["protein"] = f"✗ predict failed: {exc}"

    status = get_model_status()
    final = {
        "model_status": status,
        "forward_tests": report,
        "all_loaded": all(entry["loaded"] for entry in status.values()),
    }

    import json
    print(json.dumps(final, indent=2, default=str))
    return final


# Script entry point: when this file is executed directly (not imported),
# run the loading smoke test, which prints its JSON report.
if __name__ == "__main__":
    test_model_loading()