Upload runs/exp_oracle_v3_binary7_separate_fast_h100/bundle_separate_oracle.py with huggingface_hub

Browse files

Files changed (1) hide show

runs/exp_oracle_v3_binary7_separate_fast_h100/bundle_separate_oracle.py +259 -0

runs/exp_oracle_v3_binary7_separate_fast_h100/bundle_separate_oracle.py ADDED Viewed

	@@ -0,0 +1,259 @@

+"""Bundle 7 separate per-cell binary classifiers into one oracle.pt and
+compute the joint top-1 accuracy on the held-out test set.
+Joint inference: for a batch of sequences, run all 7 networks in
+parallel; sigmoid each output; argmax over the 7 sigmoid scores gives
+the predicted cell type. This is the LEONINE-FID-protocol joint
+classifier.
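+Bundle format (as written by main()):
+  state["per_cell"][cell_name] = state_dict for that cell's network
+  state["config"] = shared DeepSTARR hyper-parameters (each network is
+                    built with cell_types=(cell_name,) for a
+                    single-output head); also carries
+                    "task" = "classifier_binary7_separate" and the
+                    ordered "cell_types" list
+  state["metrics"] = joint test-set metrics from evaluate_joint()
+  state["per_cell_train_metrics"][cell_name] = that cell's best_auroc
+  state["oracle"] = "deepstarr_7cell_separate"
+To use as a drop-in oracle: rebuild each network with
+build_one_cell_model(), load its state_dict from state["per_cell"], and
+wrap the seven models in SeparateBinaryOracle, whose .forward(seqs)
+returns (B, 7) logits.
+NOTE: no load_separate_oracle() helper is defined in this file.
+"""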
+from __future__ import annotations
+import json
+import sys
+import time
+from pathlib import Path
+from typing import Sequence
+sys.path.insert(0, "/workspace/biomodel_reasoning_calling_study2/regureasoner_loop")
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from regureasoner.benchmarks.oracles.deepstarr_7cell import (
+    DeepSTARR7Cell, DeepSTARR7CellConfig,
+)
+# vectorized one-hot
+sys.path.insert(0, "/workspace/biomodel_reasoning_calling_study2/regureasoner_loop/scripts")
+from train_oracle_binary7_fast import one_hot_batch_fast  # noqa: E402
+# Canonical cell-type order. Column c of every (B, 7) oracle output and
+# the argmax-derived class ids in evaluate_joint() follow this order.
+CELL_TYPES = ("Ex", "In", "OPC", "Ast", "Oli", "Mic", "End")
+# Reverse lookup: cell-type name -> its index in CELL_TYPES.
+CELL_TO_IDX = {c: i for i, c in enumerate(CELL_TYPES)}
+def build_one_cell_model(target_cell: str, input_length: int = 600) -> DeepSTARR7Cell:
+    """Same architecture as scripts/train_oracle_binary_one_cell.py used
+    (xlarge config + 1-output head)."""
+    cfg = DeepSTARR7CellConfig(
+        cell_types=(target_cell,),
+        input_length=input_length,
+        fc_dim=1024, dropout=0.3,
+        conv_channels=(256, 256, 128, 120),
+        conv_kernels=(7, 5, 5, 3),
+    )
+    model = DeepSTARR7Cell(cfg)
+    fc_in = model.head.in_features
+    model.head = nn.Linear(fc_in, 1)
+    return model
+class SeparateBinaryOracle(nn.Module):
+    """Wrapper that holds 7 per-cell binary classifiers and produces
+    (B, 7) sigmoid scores via stacking.
+    Exposes encoder/dense/head so it works with the existing
+    oracle_aux_loss.py pathway WITHOUT changes:
+      h = oracle.encoder(soft_dna)   # we don't actually use this; instead
+                                      we route soft_dna through each cell's
+                                      forward via a custom .head() impl.
+      h = oracle.dense(h)
+      logits = oracle.head(h)        # (B, 7) — joint scores
+    To do this without forcing a Frankenstein "fake encoder", we expose:
+      .config.input_length, .config.cell_types, .num_cell_types
+    and patch oracle_aux_loss.py to call .forward(soft_dna) directly when
+    the oracle is a SeparateBinaryOracle. Simpler than mocking encoder/
+    dense/head.
+    """
+    def __init__(self, per_cell_models: dict, input_length: int = 600):
+        super().__init__()
+        # Order matters for argmax: keep CELL_TYPES order
+        self.cell_types = list(CELL_TYPES)
+        # Use ModuleDict so PyTorch tracks them
+        self.per_cell = nn.ModuleDict({c: per_cell_models[c] for c in CELL_TYPES})
+        self.num_cell_types = 7
+        # native_hidden = the per-cell penultimate width (used by FID's
+        # gaussian-stats step on the embed output)
+        first_net = next(iter(self.per_cell.values()))
+        # DeepSTARR7Cell.dense ends with an Identity-ish nn.Sequential whose
+        # last Linear has out_features = fc_dim. Pull from there.
+        self.native_hidden = int(first_net.config.fc_dim)
+        # Compatibility with downstream
+        from types import SimpleNamespace
+        self.config = SimpleNamespace(
+            input_length=input_length,
+            cell_types=tuple(CELL_TYPES),
+        )
+    def forward(self, soft_dna):
+        """Two input modes — both produce (B, 7) raw logits:
+          (1) soft_dna: (B, 4, L) tensor — the differentiable aux-loss path
+          (2) seqs:     Sequence[str]     — match OracleProtocol used by
+              the lab's existing celltype_specificity/compute_fid metrics
+              which expect oracle.forward(List[str]) → (B, C).
+        """
+        from regureasoner.benchmarks.oracles.base import one_hot_dna  # noqa
+        if isinstance(soft_dna, torch.Tensor):
+            x = soft_dna
+        else:
+            # Tokenize strings to (B, 4, L)
+            seqs = list(soft_dna)
+            device = next(self.parameters()).device
+            x = torch.stack([one_hot_dna(s, self.config.input_length) for s in seqs])
+            x = x.to(device)
+        outs = []
+        for c in CELL_TYPES:
+            net = self.per_cell[c]
+            h = net.encoder(x).flatten(1)
+            h = net.dense(h)
+            logit = net.head(h).squeeze(-1)  # (B,)
+            outs.append(logit)
+        return torch.stack(outs, dim=-1)  # (B, 7)
+    @torch.no_grad()
+    def embed(self, seqs):
+        """OracleProtocol-compatible: (B,) strings → (B, fc_dim) penultimate
+        feature. Average per-cell penultimate activations gives a single
+        general-purpose embedding for FID."""
+        from regureasoner.benchmarks.oracles.base import one_hot_dna  # noqa
+        self.eval()
+        device = next(self.parameters()).device
+        x = torch.stack([one_hot_dna(s, self.config.input_length) for s in seqs]).to(device)
+        feats = []
+        for c in CELL_TYPES:
+            net = self.per_cell[c]
+            h = net.encoder(x).flatten(1)
+            h = net.dense(h)
+            feats.append(h)
+        return torch.stack(feats, dim=0).mean(dim=0)  # (B, fc_dim)
+def evaluate_joint(oracle: SeparateBinaryOracle, eval_jsonl: str, device) -> dict:
+    oracle.eval()
+    rows = [json.loads(l) for l in open(eval_jsonl)]
+    preds, targets = [], []
+    bs = 256
+    for i in range(0, len(rows), bs):
+        chunk = rows[i:i+bs]
+        x = one_hot_batch_fast([r["sequence"] for r in chunk],
+                               oracle.config.input_length).to(device)
+        with torch.no_grad():
+            with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+                logits = oracle(x)  # (B, 7)
+        preds.append(torch.sigmoid(logits.float()).cpu().numpy())
+        for r in chunk:
+            ca = r["cell_activities"]
+            if len(ca) > 7: ca = ca[:7]
+            targets.append(int(np.argmax(ca)))
+    preds = np.concatenate(preds); targets = np.asarray(targets)
+    pred_idx = preds.argmax(axis=1); top1 = float((pred_idx == targets).mean())
+    pcr={}; pca={}
+    for c, name in enumerate(CELL_TYPES):
+        mask = targets == c
+        pcr[name] = float((pred_idx[mask]==c).mean()) if mask.any() else float("nan")
+        scores = preds[:, c]; labels = (targets == c).astype(int)
+        pos = scores[labels == 1]; neg = scores[labels == 0]
+        if len(pos) and len(neg):
+            all_s = np.concatenate([pos, neg])
+            ranks = (-all_s).argsort().argsort() + 1
+            n_pos = len(pos); n_neg = len(neg)
+            U = ranks[:n_pos].sum() - n_pos*(n_pos+1)/2
+            pca[name] = float(1.0 - (U/(n_pos*n_neg)))
+        else:
+            pca[name] = float("nan")
+    return {
+        "joint_top1": top1,
+        "mean_auroc": float(np.nanmean(list(pca.values()))),
+        "per_cell_recall": pcr,
+        "per_cell_auroc": pca,
+    }
+def main():
+    out_dir = Path("/workspace/dnathinker/runs/exp_oracle_v3_binary7_separate_fast_h100")
+    print(f"[load] bundling {out_dir}")
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    per_cell_state: dict = {}
+    per_cell_metrics: dict = {}
+    per_cell_models = {}
+    for cell in CELL_TYPES:
+        ck = out_dir / cell / "oracle.pt"
+        if not ck.exists():
+            print(f"  missing {cell} ckpt: {ck}")
+            continue
+        s = torch.load(ck, map_location="cpu", weights_only=False)
+        per_cell_state[cell] = s["state"]
+        per_cell_metrics[cell] = s.get("best_auroc")
+        m = build_one_cell_model(cell, input_length=600)
+        m.load_state_dict(s["state"], strict=True)
+        m = m.to(device).eval()
+        per_cell_models[cell] = m
+        print(f"  {cell}: best_auroc={s.get('best_auroc'):.4f}")
+    if len(per_cell_models) < 7:
+        print(f"[error] only got {len(per_cell_models)} of 7 cells; aborting bundle")
+        sys.exit(1)
+    oracle = SeparateBinaryOracle(per_cell_models).to(device)
+    print(f"\n[eval] computing joint top-1 on test set")
+    eval_jsonl = "/workspace/dnathinker/data/oracle/oracle_test.7cell.500.jsonl"
+    metrics = evaluate_joint(oracle, eval_jsonl, device)
+    print(f"  joint_top1   = {metrics['joint_top1']:.4f}")
+    print(f"  mean_auroc   = {metrics['mean_auroc']:.4f}")
+    print(f"  per_cell recall: {metrics['per_cell_recall']}")
+    print(f"  per_cell AUROC : {metrics['per_cell_auroc']}")
+    bundle = {
+        "per_cell": per_cell_state,
+        "config": {
+            "task": "classifier_binary7_separate",
+            "cell_types": list(CELL_TYPES),
+            "input_length": 600,
+            "deepstarr_fc_dim": 1024,
+            "deepstarr_dropout": 0.3,
+            "deepstarr_conv_channels": [256, 256, 128, 120],
+            "deepstarr_conv_kernels": [7, 5, 5, 3],
+            "deepstarr_pool_kernels": [3, 3, 3, 3],
+            "fc_dim": 1024, "dropout": 0.3,
+            "conv_channels": [256, 256, 128, 120],
+            "conv_kernels": [7, 5, 5, 3],
+            "pool_kernels": [3, 3, 3, 3],
+        },
+        "metrics": metrics,
+        "per_cell_train_metrics": per_cell_metrics,
+        "oracle": "deepstarr_7cell_separate",
+    }
+    bundle_path = out_dir / "oracle.pt"
+    torch.save(bundle, bundle_path)
+    print(f"\n[done] saved bundle to {bundle_path}")
+    print(f"  joint_top1 = {metrics['joint_top1']:.4f}")
+    print(f"  mean_auroc = {metrics['mean_auroc']:.4f}")
+    # Also save metrics.json for the meta indexer
+    with open(out_dir / "metrics.json", "w") as f:
+        json.dump({"joint_top1": metrics["joint_top1"],
+                   "mean_auroc": metrics["mean_auroc"],
+                   "per_cell": metrics["per_cell_auroc"],
+                   "per_cell_train": per_cell_metrics}, f, indent=2)
+    with open(out_dir / "_arch_meta.json", "w") as f:
+        json.dump({"task": "oracle_classifier_separate",
+                   "kind": "v3_binary7_separate",
+                   "label": f"LEONINE-strict 7 separate networks (joint top1 {metrics['joint_top1']:.3f})"},
+                  f)
+# Script entry point: run the bundling pipeline only when executed directly.
+if __name__ == "__main__":
+    main()