Cactus-Compute
/

gemma4-e2b-grouped-k192-router

+#!/usr/bin/env python3
+"""
+rung6_moe_g4.py — Gemma-4 E2B port of rung6_moe.py.
+Same MECE MoE approach, adapted for Gemma-4's heterogeneous MLP widths:
+  - Layers 0-14:  D_FFN=6144 (INTERMEDIATE)
+  - Layers 15-34: D_FFN=12288 (INTERMEDIATE_WIDE)
+Per-layer A logits have different row counts; PRUNE_K is per-layer.
+Architecture:
+    - Frozen base weights: W_gate, W_up, W_down (per-layer, variable D_FFN)
+    - Trainable per-layer:
+        * Assignment logits A ∈ R^{D_FFN_i, K}
+        * Router W_r ∈ R^{D_MODEL, K_spec}   (K_spec = K - K_const)
+    - Expert k's soft mask: m_k[j] = softmax(A[j,:] / tau)[k]
+    - τ anneals 1.0 → 0.01
+    - Per-token forward:
+        1. Apply K_const always-on experts' combined soft mask to h
+        2. Route top-K_active specialist experts via W_r (+ noise)
+        3. Add selected specialist masks to combined mask (softmax-weighted within top-K)
+        4. h = gelu(gate) * up * combined_mask; y = W_down @ h
+    - Aux losses: Switch balance (α_b=0.01) + router z-loss (α_z=0.001)
+Usage:
+    # fix_both-style Gemma-4 launch
+    python rung6_moe_g4.py --phase g4_fixboth \
+        --K 8 --K_const 2 --K_active_spec 2 \
+        --init taylor --loss ce \
+        --int4_qat --int4_group_size 32 \
+        --calib_path 3BASiL/calibration_data/gemma4_e2b_it_bulk_50k.jsonl \
+        --eval_calib_path 3BASiL/calibration_data/gemma4_e2b_it_final_50k.jsonl \
+        --diverse_calib_path 3BASiL/calibration_data/diverse_wikitext.jsonl \
+        --kl_base_lambda 0.5 --kl_base_temp 8.0 \
+        --w_drift_lambda 1e-6 \
+        --max_steps 2000 --save_checkpoint ckpts/g4_fixboth.pt
+Output:
+    logs/rung6_moe_<phase>_results.json
+"""
+import argparse
+import json
+import math
+import os
+import time
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.optim import AdamW
+try:
+    import bitsandbytes as bnb
+    _HAS_BNB = True
+except ImportError:
+    _HAS_BNB = False
+from torch.optim.lr_scheduler import CosineAnnealingLR
+from gemma4_hf import (
+    load_gemma4 as load_model,
+    N_LAYERS,
+    HIDDEN_SIZE as D_MODEL,
+    DEVICE,
+    DTYPE,
+    INTERMEDIATE,
+    INTERMEDIATE_WIDE,
+    DOUBLE_WIDE_START,
+)
+from moe_recovery import (
+    recover_modules_via_generic_pipeline,
+    finetune_moe_per_layer,
+)
# Default calibration corpus; override via --calib_path.
CALIB_DATA_PATH = "3BASiL/calibration_data/gemma4_e2b_it_final_50k.jsonl"

# Gemma-4 baselines are TBD — left at 0 so the reported diff prints as "+ppl".
BASELINE_PPL = 0.0
CLEAN_PPL = 0.0

# MAX_SEQ_LEN is the padded length of every record. Each record becomes exactly
# one sequence, so every sequence starts with BOS + the chat-template scaffold
# (no mid-document chunks that lose the BOS / scaffold context). 2048 covers
# ~70% of `final.jsonl` records fully; longer records are truncated (prompt +
# the response prefix that fits). This removes the eval unfairness where
# mid-document chunks lacked BOS — the base model lost context while the
# trained student had memorized the chunked positions.
MAX_SEQ_LEN = 2048
SEQ_LEN = MAX_SEQ_LEN  # back-compat alias (eval/train loops use SEQ_LEN)

# Gemma-4 E2B (4.65B) is ~17× larger than Gemma-3 (270M), hence batch size 1;
# 1 × 16 accumulation steps = 16 effective, keeping optimizer-step cadence similar.
BATCH = 1
GRAD_ACCUM = 16

EVAL_BATCHES = 0  # 0 = no cap; eval scans every chunk in the eval split
LR = 1e-4
NOISE_SCALE = 0.020264
PRUNE_P = 0.40  # 40% kept (same per-token sparsity target as Gemma-3)
def _d_ffn_at(layer_idx: int) -> int:
    """Look up the FFN intermediate width for layer ``layer_idx``.

    Layers at or past ``DOUBLE_WIDE_START`` use ``INTERMEDIATE_WIDE``; all
    earlier layers use ``INTERMEDIATE``.
    """
    if layer_idx >= DOUBLE_WIDE_START:
        return INTERMEDIATE_WIDE
    return INTERMEDIATE
def _prune_k_at(layer_idx: int) -> int:
    """Target number of active neurons for layer ``layer_idx``.

    This is the layer's FFN width scaled by ``PRUNE_P`` and truncated to an int.
    """
    layer_width = _d_ffn_at(layer_idx)
    return int(layer_width * PRUNE_P)
+# ─────────────────────────── Int4 QAT (Phase I) ────────────────────────
def quantize_int4_groupwise_ste(w, group_size=32):
    """Fake-quantize ``w`` to symmetric groupwise int4 with a straight-through gradient.

    Symmetric int4 uses the integer range [-7, 7] (-8 is skipped to stay
    symmetric). Each run of ``group_size`` consecutive values along the last
    dim shares one scale ``max|w| / 7``. When the last dim is not a multiple of
    ``group_size`` the final group is zero-padded for the computation and the
    padding is stripped from the result.

    Args:
        w: 2-D weight ``[out_dim, in_dim]`` (the ``nn.Linear.weight`` layout).
        group_size: number of in-features per scale group (default 32).

    Returns:
        Tensor with the shape and dtype of ``w``. Its value is exactly the
        dequantized weight; its gradient w.r.t. ``w`` is the identity (STE).

    Raises:
        ValueError: if ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError(f"group_size must be >= 1, got {group_size}")
    out_dim, in_dim = w.shape
    # Quantization math runs in fp32 (and outside autograd) so bf16/fp16
    # weights do not lose precision in the scale / round steps.
    w_fp32 = w.detach().float()
    pad = (group_size - in_dim % group_size) % group_size
    if pad:
        w_fp32 = F.pad(w_fp32, (0, pad))
    n_groups = (in_dim + pad) // group_size
    w_g = w_fp32.reshape(out_dim, n_groups, group_size)
    # clamp_min keeps all-zero groups from producing a zero scale (division by 0).
    max_abs = w_g.abs().amax(dim=-1, keepdim=True).clamp_min(1e-6)
    scale = max_abs / 7.0                                         # [out_dim, n_groups, 1]
    w_int = torch.round(w_g / scale).clamp(-7, 7)
    w_deq = (w_int * scale).reshape(out_dim, -1)                  # [out_dim, in_dim+pad]
    if pad:
        w_deq = w_deq[:, :in_dim]
    w_deq = w_deq.to(w.dtype)
    # STE. `w - w.detach()` is exactly zero in every dtype, so the forward value
    # is bit-exact w_deq while the gradient flows to w unchanged. The previous
    # form `w + (w_deq - w).detach()` rounds twice in low precision and could
    # return values one ulp off the quantization grid.
    return w_deq + (w - w.detach())
class Int4QuantLinear(nn.Linear):
    """``nn.Linear`` variant whose forward pass runs on an int4 fake-quantized weight.

    Because this only subclasses ``nn.Linear`` and adds no parameters, its
    state_dict keys (``.weight``, ``.bias``) match a plain ``nn.Linear`` and
    checkpoints are cross-loadable. The stored weight stays in floating point;
    quantization is applied on the fly in ``forward`` via
    ``quantize_int4_groupwise_ste``.
    """

    # Scale-group width; 32 is the GGUF Q4_0 / Q4_K block size used by the
    # deploy-time inference kernels.
    _group_size = 32

    def forward(self, x):
        """Apply the linear map using the fake-quantized weight and the stored bias."""
        return F.linear(
            x,
            quantize_int4_groupwise_ste(self.weight, self._group_size),
            self.bias,
        )
def apply_int4_inplace(model, group_size=32,
                       target_substrings=("gate_proj", "up_proj", "down_proj",
                                          "q_proj", "k_proj", "v_proj", "o_proj")):
    """Snap the stored weights of matching Linear layers onto the int4 grid, once.

    This is the deployment-simulation counterpart of ``wrap_int4``: instead of
    fake-quantizing on every forward, the floating-point weight itself is
    overwritten with its dequantized int4 value, so no quantization work
    remains at run time.

    A module is touched when it is an ``nn.Linear``, is not already an
    ``Int4QuantLinear``, and its qualified name contains one of
    ``target_substrings``.

    Returns:
        Number of weights that were overwritten.
    """
    def _should_snap(qualified_name, module):
        if not isinstance(module, nn.Linear) or isinstance(module, Int4QuantLinear):
            return False
        return any(tag in qualified_name for tag in target_substrings)

    n_snapped = 0
    with torch.no_grad():
        for qualified_name, module in model.named_modules():
            if not _should_snap(qualified_name, module):
                continue
            snapped = quantize_int4_groupwise_ste(module.weight, group_size).detach()
            module.weight.data.copy_(snapped)
            n_snapped += 1
    return n_snapped
def apply_gaussian_noise_inplace(model, noise_scale,
                                 target_substrings=("gate_proj", "up_proj", "down_proj",
                                                    "q_proj", "k_proj", "v_proj", "o_proj"),
                                 seed=0):
    """Add zero-mean Gaussian noise with std ``noise_scale × w.std()`` to target weights IN-PLACE.

    Gaussian proxy for quantization noise. For int4 group=32, the analytically
    equivalent noise_scale ≈ 0.129 (from σ_q/σ_w ≈ √((max_abs/7)²/12)/σ_w with
    max_abs ≈ σ_w·√(2·ln group_size)).

    A module is perturbed when it is an ``nn.Linear``, is not an
    ``Int4QuantLinear``, and its qualified name contains one of
    ``target_substrings``.

    Args:
        model: module tree to perturb.
        noise_scale: noise std as a multiple of each weight's own std.
        target_substrings: name fragments selecting which Linear layers to touch.
        seed: seed for the noise generator(s).

    Returns:
        Number of weights that were modified.
    """
    # One seeded generator per device the weights actually live on. A single
    # generator bound to the global DEVICE makes torch.randn fail as soon as a
    # weight sits on a different device (CPU offload, multi-GPU sharding).
    generators = {}
    count = 0
    with torch.no_grad():
        for name, mod in model.named_modules():
            if not isinstance(mod, nn.Linear):
                continue
            if isinstance(mod, Int4QuantLinear):
                # Skip to avoid compounding noise with fake-quant in forward (ambiguous semantics).
                continue
            if not any(t in name for t in target_substrings):
                continue
            w = mod.weight.data
            gen = generators.get(w.device)
            if gen is None:
                gen = torch.Generator(device=w.device)
                gen.manual_seed(seed)
                generators[w.device] = gen
            # Statistics and sampling in fp32; cast only when adding to the weight.
            std_w = w.float().std()
            noise = torch.randn(w.shape, generator=gen, device=w.device,
                                dtype=torch.float32) * std_w * noise_scale
            w.add_(noise.to(w.dtype))
            count += 1
    return count
class LoRALinear(nn.Module):
    """Frozen ``nn.Linear`` (incl. ``Int4QuantLinear``) plus a trainable rank-r LoRA delta.

    forward(x) = base(x) + (alpha / rank) * lora_b(lora_a(x))

    ``lora_a`` is initialized Kaiming-uniform and ``lora_b`` is zero, so the
    initial output equals the base output. Both adapters are created on the
    base weight's device and dtype.
    """

    def __init__(self, base_linear: nn.Linear, rank: int, alpha: float):
        super().__init__()
        self.base = base_linear
        # Only the LoRA factors train; the wrapped layer is frozen.
        for p in self.base.parameters():
            p.requires_grad_(False)
        in_dim, out_dim = base_linear.in_features, base_linear.out_features
        self.lora_a = nn.Linear(in_dim, rank, bias=False,
                                device=base_linear.weight.device, dtype=base_linear.weight.dtype)
        self.lora_b = nn.Linear(rank, out_dim, bias=False,
                                device=base_linear.weight.device, dtype=base_linear.weight.dtype)
        nn.init.kaiming_uniform_(self.lora_a.weight, a=5 ** 0.5)
        nn.init.zeros_(self.lora_b.weight)
        self.scale = alpha / rank

    def forward(self, x):
        """Return the base output plus the scaled low-rank correction."""
        return self.base(x) + self.lora_b(self.lora_a(x)) * self.scale


def wrap_lora(model, rank: int, alpha: float,
              target_substrings=("gate_proj", "up_proj", "down_proj",
                                 "q_proj", "k_proj", "v_proj", "o_proj")):
    """Replace target Linear modules with ``LoRALinear``. Base is frozen; only LoRA A/B train.

    Run AFTER wrap_int4 so the base inside LoRALinear is the int4-quantized Linear.

    The call is idempotent: the ``base`` / ``lora_a`` / ``lora_b`` children of
    an existing ``LoRALinear`` are never wrapped again, even though their
    qualified names still contain the target substring.

    Returns:
        ``(count, n_params)`` — number of modules wrapped by this call and the
        total number of LoRA parameters it added.
    """
    count = 0
    n_params = 0
    # Snapshot the module list first: the tree is mutated while we walk it.
    for name, mod in list(model.named_modules()):
        if not isinstance(mod, nn.Linear):
            continue
        if not any(t in name for t in target_substrings):
            continue
        parent_name, _, attr = name.rpartition(".")
        parent = model.get_submodule(parent_name) if parent_name else model
        if isinstance(parent, LoRALinear):
            # Internals of an already-wrapped layer. (LoRALinear itself is not
            # an nn.Linear, so checking `mod` alone can never detect this.)
            continue
        new_mod = LoRALinear(mod, rank=rank, alpha=alpha)
        setattr(parent, attr, new_mod)
        n_params += sum(p.numel() for p in new_mod.lora_a.parameters()) + \
                    sum(p.numel() for p in new_mod.lora_b.parameters())
        count += 1
    return count, n_params
def wrap_int4(model, target_substrings=("gate_proj", "up_proj", "down_proj",
                                        "q_proj", "k_proj", "v_proj", "o_proj")):
    """Swap every matching ``nn.Linear`` for an ``Int4QuantLinear`` that reuses its tensors.

    A module matches when it is an ``nn.Linear``, is not already an
    ``Int4QuantLinear``, and its qualified name contains one of
    ``target_substrings``. State-dict keys are unchanged and the replacement
    holds the very same weight/bias Parameter objects (no copy).

    Returns:
        Number of modules that were replaced.
    """
    n_wrapped = 0
    # Materialize the traversal up front; modules are replaced during the loop.
    for qualified_name, linear in list(model.named_modules()):
        eligible = (
            isinstance(linear, nn.Linear)
            and not isinstance(linear, Int4QuantLinear)  # already wrapped
            and any(tag in qualified_name for tag in target_substrings)
        )
        if not eligible:
            continue
        has_bias = linear.bias is not None
        replacement = Int4QuantLinear(
            linear.in_features,
            linear.out_features,
            bias=has_bias,
            device=linear.weight.device,
            dtype=linear.weight.dtype,
        )
        # Hand over the existing Parameters rather than copying, so optimizer
        # state and gradient flow attached to them are preserved.
        replacement.weight = linear.weight
        if has_bias:
            replacement.bias = linear.bias
        owner_path, _, leaf = qualified_name.rpartition(".")
        owner = model.get_submodule(owner_path) if owner_path else model
        setattr(owner, leaf, replacement)
        n_wrapped += 1
    return n_wrapped
+# ────────────────────────────── utilities ──────────────────────────────
def corrupt_model(model, noise_scale=NOISE_SCALE, seed=42):
    """Perturb every floating-point parameter in place with seeded Gaussian noise.

    Each parameter ``p`` receives ``randn_like(p) * p.std() * noise_scale``.
    The noise is drawn on CPU from one generator seeded with ``seed`` and then
    moved to the parameter's device.

    Parameters that cannot be perturbed meaningfully are left untouched:
      * non-floating-point parameters (``torch.randn`` cannot sample them);
      * parameters with fewer than two elements, whose ``std()`` is NaN and
        would otherwise overwrite the parameter with NaN.

    Args:
        model: module whose parameters are modified.
        noise_scale: noise std as a fraction of each parameter's own std.
        seed: seed for the CPU noise generator.
    """
    rng = torch.Generator()
    rng.manual_seed(seed)
    with torch.no_grad():
        for p in model.parameters():
            if not p.is_floating_point():
                continue
            # Always draw, even for parameters skipped below, so the random
            # stream seen by all other parameters does not shift.
            noise = torch.randn(p.shape, generator=rng, dtype=p.dtype).to(p.device)
            if p.numel() < 2:
                continue
            p.add_(noise * p.std() * noise_scale)
    print(f"  Corrupted model with noise_scale={noise_scale}")
def _pack_sequence(ids, n_mask, pad_id):
    """Pad ``ids`` (already truncated to <= MAX_SEQ_LEN) and build shifted labels.

    ``labels[t]`` is the next-token target paired with ``logits[t]``:
      * the first ``n_mask`` labels are -100 (unscored targets),
      * ``labels[n_mask .. n-2] = ids[n_mask+1 .. n-1]`` (scored targets),
      * ``labels[n-1 ..]`` are -100 (no next token inside the record, then pad).

    Returns a dict of two ``torch.long`` tensors of length MAX_SEQ_LEN.
    """
    n = len(ids)
    pad_len = MAX_SEQ_LEN - n
    labels_list = [-100] * n_mask + list(ids[n_mask + 1:n]) + [-100] * (pad_len + 1)
    assert len(labels_list) == MAX_SEQ_LEN, \
        f"label len {len(labels_list)} != MAX_SEQ_LEN {MAX_SEQ_LEN}"
    return {
        "input_ids": torch.tensor(list(ids) + [pad_id] * pad_len, dtype=torch.long),
        "labels":    torch.tensor(labels_list, dtype=torch.long),
    }


def load_seqs(tokenizer, split="train", calib_path=None, raw_text=False):
    """Load tokenized sequences from a JSONL calibration file.

    80/20 train/eval split within the file (first 80% of records = train).
    Use split='all' to return all records (useful when train path and eval
    path differ — no need to withhold). Pass `calib_path` to override the
    default ``CALIB_DATA_PATH``. Blank lines in the file are ignored.

    Format: ONE sequence per record, length MAX_SEQ_LEN, padded with
    pad_token_id (0 if the tokenizer has none). Long records are NOT chunked
    into several MAX_SEQ_LEN pieces, because mid-document chunks lack the
    BOS + chat scaffold; they are truncated to MAX_SEQ_LEN instead. Records
    shorter than 32 tokens are skipped.

    Chat mode (default): each record needs 'prompt' and 'response'. Only
    response tokens are scored; prompt/scaffold targets are -100. Records whose
    prompt alone fills the window, or that end up with no scored position, are
    skipped. If the chat template already emitted a BOS and the tokenizer
    prepends another one, the duplicate leading BOS is dropped so every
    sequence starts with a single BOS.

    raw_text=True: each record needs a 'text' (or 'content') field; no chat
    template is applied and every non-pad position is scored — suitable for
    KL-to-teacher regularization on a pretraining-style corpus.

    Returns:
        list of dicts with 'input_ids' and 'labels' LongTensors of length
        MAX_SEQ_LEN; ``labels[t]`` is the target for ``logits[t]``.
    """
    path = calib_path or CALIB_DATA_PATH
    # Explicit UTF-8: the platform default encoding may not be.
    with open(path, encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]
    if split != "all":
        n_train = int(len(records) * 0.8)
        records = records[:n_train] if split == "train" else records[n_train:]
    pad_id = tokenizer.pad_token_id or 0

    if raw_text:
        # Pretraining-style: no prompt mask — every token is informative.
        seqs = []
        for r in records:
            text = r.get("text") or r.get("content") or ""
            if not text:
                continue
            ids = tokenizer.encode(text, add_special_tokens=True)
            if len(ids) < 32:
                continue
            seqs.append(_pack_sequence(ids[:MAX_SEQ_LEN], 0, pad_id))
        return seqs

    bos_id = getattr(tokenizer, "bos_token_id", None)

    def _encode_single_bos(text):
        """Encode with special tokens, collapsing a doubled leading BOS into one."""
        ids = tokenizer.encode(text, add_special_tokens=True)
        if bos_id is not None and len(ids) >= 2 and ids[0] == bos_id and ids[1] == bos_id:
            # The template text already started with BOS and encode() added a second.
            ids = ids[1:]
        return ids

    # Chat-template format: mask user-prompt tokens with -100 in labels so only
    # assistant response tokens are scored (CE training and PPL eval).
    seqs = []
    for r in records:
        msgs = [{"role": "user", "content": r["prompt"]},
                {"role": "model", "content": r["response"]}]
        try:
            text      = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False)
            user_text = tokenizer.apply_chat_template([msgs[0]], tokenize=False, add_generation_prompt=True)
        except Exception:
            # Best-effort fallback for tokenizers without a usable chat template.
            text      = f"{r['prompt']}\n{r['response']}"
            user_text = f"{r['prompt']}\n"
        ids = _encode_single_bos(text)
        # n_user = number of leading tokens that are user prompt + chat scaffold;
        # positions [n_user, len(ids)) are the assistant response.
        n_user = len(_encode_single_bos(user_text))
        # Drop records where the prompt alone fills the window.
        if n_user >= MAX_SEQ_LEN:
            continue
        ids = ids[:MAX_SEQ_LEN]
        n = len(ids)
        if n < 32:
            continue
        if n_user >= n:
            # Prompt prefix covers the whole record: nothing left to score.
            continue
        # max(..., 0) keeps an empty prompt prefix (n_user == 0) well-formed.
        seqs.append(_pack_sequence(ids, max(n_user - 1, 0), pad_id))
    return seqs
def kl_loss(s_logits, t_logits, temp=1.0, mask=None):
    """Temperature-scaled forward KL, KL(teacher || student), with an optional position mask.

    Both logit tensors are divided by ``temp`` before the softmax and the
    result is multiplied by ``temp**2``.

    Args:
        s_logits: student logits, vocab on the last dim.
        t_logits: teacher logits, same shape as ``s_logits``.
        temp: softmax temperature.
        mask: optional [B, T] mask of positions to score. Bool is expected;
            0/1 integer or float masks are accepted and converted.

    Returns:
        Scalar loss. Without ``mask`` this is
        ``F.kl_div(reduction='batchmean') * temp**2``. With ``mask`` the KL of
        the selected positions is summed and still divided by the batch size
        (``s_logits.shape[0]``), i.e. unselected positions contribute 0 and the
        loss keeps the same per-batch magnitude as the unmasked form.
    """
    s_log  = F.log_softmax(s_logits / temp, dim=-1)
    t_prob = F.softmax(t_logits / temp, dim=-1)
    if mask is None:
        return F.kl_div(s_log, t_prob, reduction="batchmean") * (temp ** 2)
    # Per-(B,T) vocab-summed KL; keep batchmean (sum / batch) semantics over the masked subset.
    elem = F.kl_div(s_log, t_prob, reduction="none").sum(dim=-1)  # [B, T]
    # A non-bool tensor used as an index would be read as integer positions,
    # not as a mask — force boolean selection.
    keep = mask.to(torch.bool)
    return elem[keep].sum() / s_logits.shape[0] * (temp ** 2)
def ce_loss(s_logits, labels):
    """Mean token-level cross-entropy of ``s_logits`` against ``labels``.

    Leading dims are flattened so the logits become ``[N, vocab]`` and the
    labels ``[N]``; positions labelled -100 are ignored.
    """
    vocab_size = s_logits.size(-1)
    flat_logits = s_logits.reshape(-1, vocab_size)
    flat_labels = labels.reshape(-1)
    return F.cross_entropy(flat_logits, flat_labels, ignore_index=-100)
@torch.no_grad()
def eval_ppl(model, tokenizer, calib_path=None, max_seqs=None):
    """Perplexity of ``model`` on the eval split returned by ``load_seqs``.

    Sequences are scored one at a time. When ``max_seqs`` is a positive number
    only the first ``max_seqs`` sequences (load order, deterministic) are used;
    a non-zero ``EVAL_BATCHES`` additionally caps the number of batches. The
    model is switched to eval mode.

    Returns:
        exp(total NLL / number of scored tokens), or ``inf`` when no token was scored.
    """
    eval_seqs = load_seqs(tokenizer, "eval", calib_path=calib_path)
    if max_seqs is not None and max_seqs > 0:
        eval_seqs = eval_seqs[:max_seqs]
    batches = torch.utils.data.DataLoader(eval_seqs, batch_size=1)
    nll_sum = 0.0
    n_scored = 0
    model.eval()
    for batch_no, batch in enumerate(batches):
        if EVAL_BATCHES and batch_no >= EVAL_BATCHES:
            break
        input_ids = batch["input_ids"].to(DEVICE)
        # Drop the last position from both labels and logits before scoring.
        targets = batch["labels"][:, :-1].to(DEVICE)
        logits = model(input_ids)
        vocab_size = logits.size(-1)
        batch_nll = F.cross_entropy(
            logits[:, :-1].reshape(-1, vocab_size),
            targets.reshape(-1),
            ignore_index=-100,
            reduction="sum",
        )
        nll_sum += batch_nll.item()
        n_scored += (targets != -100).sum().item()
    if n_scored > 0:
        return math.exp(nll_sum / n_scored)
    return float("inf")
+# ──────────────────────── initialization helpers ────────────────────────
def compute_taylor_saliency(model, tokenizer, n_batches=8, calib_path=None):
    """First-order Taylor saliency of every FFN neuron, per layer.

    For each layer the output of ``mlp.gate_proj`` is captured with a forward
    hook; after backpropagating the CE loss, ``|out * dL/dout|`` is averaged
    over the first two dims (batch and position) and accumulated, then divided
    by the number of batches seen.

    Gradients are temporarily enabled on every model parameter. The original
    ``requires_grad`` flags are restored, parameter grads cleared and the
    forward hooks removed on exit — including when an exception is raised.

    Args:
        model: model exposing ``model.layers[i].mlp.gate_proj``.
        tokenizer: forwarded to ``load_seqs``.
        n_batches: maximum number of batches to accumulate over.
        calib_path: optional calibration file, forwarded to ``load_seqs``.

    Returns:
        list[N_LAYERS] of ``[D_FFN_i]`` tensors on DEVICE.
    """
    model.eval()
    # Snapshot freeze state; temporarily unfreeze for Taylor computation
    prev_grad = [p.requires_grad for p in model.parameters()]
    hooks = []
    try:
        for p in model.parameters():
            p.requires_grad_(True)
        scores = [torch.zeros(_d_ffn_at(i), device=DEVICE) for i in range(N_LAYERS)]
        seqs = load_seqs(tokenizer, "train", calib_path=calib_path)[:n_batches * BATCH]
        loader = torch.utils.data.DataLoader(seqs, batch_size=BATCH)
        caches = [None] * N_LAYERS

        def make_hook(i):
            def hook(mod, inp, out):
                caches[i] = out
                # Non-leaf activation: ask autograd to keep its .grad.
                out.retain_grad()
            return hook

        for i, layer in enumerate(model.layers):
            hooks.append(layer.mlp.gate_proj.register_forward_hook(make_hook(i)))
        n_seen = 0
        for batch in loader:
            ids = batch["input_ids"].to(DEVICE)
            labels = batch["labels"][:, :-1].to(DEVICE)
            logits = model(ids)
            loss = F.cross_entropy(
                logits[:, :-1].reshape(-1, logits.size(-1)),
                labels.reshape(-1), ignore_index=-100)
            loss.backward()
            for i in range(N_LAYERS):
                if caches[i] is not None and caches[i].grad is not None:
                    s = (caches[i].detach() * caches[i].grad.detach()).abs().mean(dim=(0, 1))
                    scores[i] += s
                # Release the activation (and its grad) before the next batch.
                caches[i] = None
            model.zero_grad(set_to_none=True)
            n_seen += 1
            if n_seen >= n_batches:
                break
        return [s.detach() / max(n_seen, 1) for s in scores]
    finally:
        # Hooks must not outlive this call, even on failure — a leaked hook
        # would call retain_grad() on every later forward pass.
        for h in hooks:
            h.remove()
        # Restore original freeze state
        for p, g in zip(model.parameters(), prev_grad):
            p.requires_grad_(g)
        model.zero_grad(set_to_none=True)
def _taylor_assignment_init(scores, d_ffn, prune_k, K, K_const, core_frac):
    """Build one layer's [d_ffn, K] assignment logits from per-neuron saliency scores."""
    order = scores.argsort(descending=True)  # high-saliency first
    # Softer init (±2.0) so τ-anneal 1.0→0.01 has dynamic range
    A = torch.full((d_ffn, K), -2.0)

    def _assign_round_robin(neurons, n_cols, col_offset, value):
        # Neuron at position `rank` gets column col_offset + rank % n_cols.
        if neurons.numel() == 0:
            return
        cols = torch.arange(neurons.numel()) % n_cols + col_offset
        A[neurons, cols] = value

    K_spec = K - K_const
    if K_const > 0:
        # Top core_frac of the prune_k active neurons go to the always-on
        # experts. With no specialists (K == K_const) all of them do —
        # otherwise the specialist round-robin would take `rank % 0`.
        n_core = int(prune_k * core_frac) if K_spec > 0 else prune_k
        _assign_round_robin(order[:n_core], K_const, 0, 2.0)
        if K_spec > 0:
            _assign_round_robin(order[n_core:prune_k], K_spec, K_const, 2.0)
    else:
        _assign_round_robin(order[:prune_k], K, 0, 2.0)
    # Low-saliency neurons: uniform mild bias, τ-anneal drives assignment
    _assign_round_robin(order[prune_k:], K, 0, 0.0)
    return A


def init_assignment_logits(init_mode, K, K_const, taylor_scores=None, core_frac=0.5):
    """Return per-layer list of A ∈ [D_FFN_i, K] init tensors (on CPU).

    Modes:
      - random: A ~ N(0, 0.5²). std=0.5 gives softmax(A/1.0) a mild bias but
        softmax(A/0.01) near one-hot at the end of training.
      - taylor: neurons are ranked by ``taylor_scores[i]``. All logits start
        at -2.0. The top ``prune_k_i`` neurons get one logit of +2.0: the
        first ``core_frac`` of them round-robin over the K_const always-on
        experts, the rest round-robin over the specialists. Remaining neurons
        get one logit of 0.0, round-robin over all K experts. With
        K_const == 0 the top ``prune_k_i`` neurons round-robin over all K.

    Raises:
        AssertionError: taylor mode without ``taylor_scores``.
        NotImplementedError: init_mode 'em' or 'kmeans'.
        ValueError: any other unknown init_mode.
    """
    As = []
    for i in range(N_LAYERS):
        d_ffn_i = _d_ffn_at(i)
        prune_k_i = _prune_k_at(i)
        if init_mode == "random":
            A = torch.randn(d_ffn_i, K) * 0.5
        elif init_mode == "taylor":
            assert taylor_scores is not None, "taylor init requires scores"
            A = _taylor_assignment_init(taylor_scores[i].cpu(), d_ffn_i, prune_k_i,
                                        K, K_const, core_frac)
        elif init_mode == "em" or init_mode == "kmeans":
            raise NotImplementedError(
                f"init={init_mode} requires a precomputed file; use --init random|taylor for now")
        else:
            raise ValueError(f"Unknown init_mode: {init_mode}")
        As.append(A)
    return As
def _specialist_gate_centroids(model, layer_idx, layer_A, K_const, K_spec):
    """Mean W_gate row of each specialist's argmax-assigned neurons, as [D_MODEL, K_spec]."""
    W_gate = model.layers[layer_idx].mlp.gate_proj.weight.detach().float().cpu()  # [D_FFN, D_MODEL]
    owner = layer_A.cpu().argmax(dim=-1)  # [D_FFN], values in [0, K)
    centroids = torch.zeros(D_MODEL, K_spec)
    for k in range(K_spec):
        members = owner == (K_const + k)  # specialist k lives at index K_const + k
        # An expert that owns no neuron falls back to the mean over all rows.
        rows = W_gate[members] if members.any() else W_gate
        centroids[:, k] = rows.mean(dim=0)
    return centroids


def init_router_weights(init_mode, model, init_As, K, K_const, scale_multiplier=1.0):
    """Build the per-layer router initializations W_r ∈ [D_MODEL, K_spec].

    A ``None`` entry means "let MoEMLP use its default random init". When
    there are no specialists (K == K_const) every entry is ``None``.

    Modes:
      - random: list of Nones (MoEMLP's default N(0, 0.02) init is used).
      - zero:   all-zero W_r (uniform routing at init).
      - centroid: column k is the mean W_gate row of specialist k's
                  argmax-assigned neurons, L2-normalized per column and scaled
                  to magnitude 0.02 (comparable to the default random init).
      - scaled_centroid: same centroids, NOT normalized, multiplied by
                  ``scale_multiplier`` — the router inherits the base model's
                  weight scale.

    Raises:
        ValueError: unknown ``init_mode``.
    """
    K_spec = K - K_const
    if K_spec == 0:
        return [None] * N_LAYERS
    W_rs = []
    for i in range(N_LAYERS):
        if init_mode == "random":
            W_r = None
        elif init_mode == "zero":
            W_r = torch.zeros(D_MODEL, K_spec)
        elif init_mode in ("centroid", "scaled_centroid"):
            centroids = _specialist_gate_centroids(model, i, init_As[i], K_const, K_spec)
            if init_mode == "centroid":
                # Unit direction per column, small magnitude.
                W_r = F.normalize(centroids, dim=0) * 0.02
            else:
                W_r = centroids * scale_multiplier
        else:
            raise ValueError(f"Unknown W_r init_mode: {init_mode}")
        W_rs.append(W_r)
    return W_rs
+# ───────────────────────── MoE MLP module ─────────────────────────────
class MoEMLP(nn.Module):
    """
    K experts defined as soft masks over the FFN neurons of one base MLP.

    Masks come from per-neuron assignment logits A ∈ [D_FFN, K]: a softmax over
    experts (MECE mode, "softmax") or independent sigmoids paired with an
    orthogonality loss ("sigmoid_ortho"). The first K_const experts are always
    on; the remaining K_spec = K - K_const specialists are selected per token
    by top-K_active_spec routing through W_r ∈ [D_MODEL, K_spec].

    The gate/up/down projections are shared with ``base_mlp`` (not copied) and
    frozen when ``freeze_base`` is true. ``tau`` is the mask temperature and is
    expected to be updated from outside during training.
    """
    def __init__(self, base_mlp, K, K_const, K_active_spec, mece_mode, init_A,
                 noise_std=1.0, freeze_base=True, init_W_r=None):
        super().__init__()
        self.gate_proj = base_mlp.gate_proj
        self.up_proj   = base_mlp.up_proj
        self.down_proj = base_mlp.down_proj
        if freeze_base:
            for proj in (self.gate_proj, self.up_proj, self.down_proj):
                for p in proj.parameters():
                    p.requires_grad_(False)
        self.K = K
        self.K_const = K_const
        self.K_spec = K - K_const
        self.K_active_spec = K_active_spec  # # of specialist experts fired per token
        self.mece_mode = mece_mode          # "softmax" | "sigmoid_ortho"
        self.noise_std = noise_std
        self.tau = 1.0
        # Assignment logits
        self.A = nn.Parameter(init_A.to(DEVICE).float())
        if self.K_spec > 0:
            if init_W_r is not None:
                self.W_r = nn.Parameter(init_W_r.to(DEVICE).float())
            else:
                self.W_r = nn.Parameter(torch.zeros(D_MODEL, self.K_spec, device=DEVICE, dtype=torch.float32))
                nn.init.normal_(self.W_r, std=0.02)
        else:
            self.register_parameter("W_r", None)
        # Diagnostics cache (populated by forward, consumed by aux_loss)
        self._last_logits = None
        self._last_top_idx = None

    def _expert_masks(self):
        """Return [K, D_FFN] — each expert's soft mask at the current temperature.

        The temperature is floored at 1e-3 to keep the division finite.
        """
        if self.mece_mode == "softmax":
            probs = F.softmax(self.A / max(self.tau, 1e-3), dim=-1)   # [D_FFN, K]
            return probs.T.contiguous()                                # [K, D_FFN]
        elif self.mece_mode == "sigmoid_ortho":
            return torch.sigmoid(self.A / max(self.tau, 1e-3)).T.contiguous()
        else:
            raise ValueError(self.mece_mode)

    def forward(self, x):
        """Masked gated MLP: down_proj(gelu_tanh(gate(x)) * up(x) * combined_mask).

        ``combined_mask`` is the sum of the always-on expert masks plus, per
        token, the softmax-weighted masks of the top-k routed specialists.
        Router logits are computed in fp32; in training mode Gaussian noise of
        std ``noise_std / sqrt(K_spec)`` is added to them before top-k.
        """
        gate_raw = self.gate_proj(x)                              # [B, T, D_FFN]
        gate_act = F.gelu(gate_raw, approximate="tanh")
        up_act   = self.up_proj(x)
        h_pre    = gate_act * up_act                              # [B, T, D_FFN]
        masks = self._expert_masks()                              # [K, D_FFN]
        # Always-on core contribution
        d_ffn = self.A.shape[0]                                   # per-layer D_FFN
        if self.K_const > 0:
            const_mask = masks[:self.K_const].sum(dim=0)          # [D_FFN]
        else:
            const_mask = torch.zeros(d_ffn, device=x.device, dtype=torch.float32)
        # Routed specialist contribution
        if self.K_spec > 0:
            logits = x.to(torch.float32) @ self.W_r               # [B, T, K_spec]
            if self.training and self.noise_std > 0:
                logits = logits + torch.randn_like(logits) * (self.noise_std / (self.K_spec ** 0.5))
            self._last_logits = logits
            k_act = min(self.K_active_spec, self.K_spec)
            top_vals, top_idx = logits.topk(k_act, dim=-1)        # [B, T, k_act]
            self._last_top_idx = top_idx
            top_w = F.softmax(top_vals, dim=-1)                   # [B, T, k_act]
            spec_masks = masks[self.K_const:]                     # [K_spec, D_FFN]
            # Dense routing weights: top_w at the selected experts, 0 elsewhere.
            # `route_w @ spec_masks` equals Σ_j top_w[..., j] · spec_masks[top_idx[..., j]]
            # without materializing (and saving for backward) a
            # [B, T, k_act, D_FFN] gather.
            route_w = torch.zeros_like(logits).scatter(-1, top_idx, top_w)  # [B, T, K_spec]
            spec_combined = route_w @ spec_masks                  # [B, T, D_FFN]
            combined = const_mask.view(1, 1, -1) + spec_combined
        else:
            combined = const_mask.view(1, 1, -1).expand_as(h_pre)
        h = h_pre * combined.to(x.dtype)
        return self.down_proj(h)

    def aux_loss(self, alpha_b=0.01, alpha_z=0.001):
        """Switch balance loss (on specialists only) + router z-loss.

        Uses the router logits and top-k indices cached by the most recent
        forward pass; returns 0 when there are no specialists or no forward
        has run yet.

        Balance (Switch/GShard top-k generalization):
          f_k = fraction of routing slots assigned to expert k
              = (tokens_selecting_k / total_tokens) / k_act
          p_k = mean softmax probability for expert k
          L   = α_b * K_spec * Σ f_k p_k   (Σ f_k p_k = 1/K_spec when uniform).
        z-loss: α_z * mean(logsumexp(logits)²).
        """
        if self.K_spec == 0 or self._last_logits is None:
            return torch.tensor(0.0, device=DEVICE)
        logits = self._last_logits                                 # [B, T, K_spec]
        probs  = F.softmax(logits, dim=-1)
        top_idx = self._last_top_idx                               # [B, T, k_act]
        hot = F.one_hot(top_idx, self.K_spec).float().sum(dim=-2)  # [B, T, K_spec]
        # Normalize by the number of experts actually selected per token
        # (forward clamps it to K_spec), so Σ_k f_k = 1 even when
        # K_active_spec > K_spec.
        k_act = top_idx.shape[-1]
        f_k = hot.mean(dim=(0, 1)) / max(k_act, 1)                 # [K_spec]
        p_k = probs.mean(dim=(0, 1))                               # [K_spec]
        balance = alpha_b * self.K_spec * (f_k * p_k).sum()
        lse = torch.logsumexp(logits, dim=-1)                      # [B, T]
        z_loss = alpha_z * (lse ** 2).mean()
        return balance + z_loss

    def orth_loss(self):
        """For sigmoid_ortho mode: penalize pairwise expert mask overlap (0 in other modes).

        Rows of the mask matrix are L2-normalized; the loss is the sum of
        squared off-diagonal Gram entries divided by K·(K-1).
        """
        if self.mece_mode != "sigmoid_ortho":
            return torch.tensor(0.0, device=DEVICE)
        masks = self._expert_masks()                               # [K, D_FFN]
        # L2-normalize rows, then off-diagonal Gram
        mn = F.normalize(masks, dim=-1)
        gram = mn @ mn.T                                           # [K, K]
        K = gram.size(0)
        off = gram - torch.eye(K, device=gram.device)
        # 1e-8 guards the K == 1 case (K·(K-1) == 0).
        return (off ** 2).sum() / (K * (K - 1) + 1e-8)
+# ───────────────────────────── training ─────────────────────────────
+def get_tau(step, max_steps, tau_start, tau_end, hold_frac=0.2):
+    """Linear anneal over first (1-hold_frac) of steps, then hold at tau_end.
+    This prevents τ-anneal shock — the model needs time to adapt to hard masks."""
+    anneal_steps = max(1, int(max_steps * (1 - hold_frac)))
+    if step >= anneal_steps:
+        return tau_end
+    frac = step / max(1, anneal_steps - 1)
+    return tau_start + frac * (tau_end - tau_start)
+def install_moe(model, K, K_const, K_active_spec, mece_mode, init_As, noise_std,
+                freeze_base=True, init_W_rs=None):
+    mlp_modules = []
+    if init_W_rs is None:
+        init_W_rs = [None] * N_LAYERS
+    for i in range(N_LAYERS):
+        new_mlp = MoEMLP(
+            base_mlp=model.layers[i].mlp,
+            K=K, K_const=K_const, K_active_spec=K_active_spec,
+            mece_mode=mece_mode, init_A=init_As[i], noise_std=noise_std,
+            freeze_base=freeze_base, init_W_r=init_W_rs[i])
+        model.layers[i].mlp = new_mlp
+        mlp_modules.append(new_mlp)
+    return mlp_modules
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--phase", type=str, default="A1")
+    parser.add_argument("--K", type=int, default=4)
+    parser.add_argument("--K_const", type=int, default=0)
+    parser.add_argument("--K_active_spec", type=int, default=-1,
+                        help="# specialists fired per token. Default = round(K_spec * 0.40 / (1 - K_const/K * 0.40)); falls back to max(1, round(K_spec*0.5))")
+    parser.add_argument("--loss", choices=["kl", "ce"], default="kl")
+    parser.add_argument("--init", choices=["random", "taylor", "em", "kmeans"], default="random")
+    parser.add_argument("--core_frac", type=float, default=0.5,
+                        help="Fraction of PRUNE_K active neurons to concentrate in K_const core (Taylor init only)")
+    parser.add_argument("--mece_mode", choices=["softmax", "sigmoid_ortho"], default="softmax")
+    parser.add_argument("--tau_start", type=float, default=1.0)
+    parser.add_argument("--tau_end", type=float, default=0.01)
+    parser.add_argument("--tau_hold_frac", type=float, default=0.2,
+                        help="Fraction of max_steps to HOLD at tau_end after annealing. Default 0.2 = "
+                             "anneal over first 80%, hold last 20%. For long continuation runs, set "
+                             "to e.g. 0.857 to give just 5k anneal steps and 30k hard-tau steps "
+                             "(out of 35k total).")
+    parser.add_argument("--max_steps", type=int, default=2000)
+    parser.add_argument("--lr", type=float, default=LR)
+    parser.add_argument("--alpha_b", type=float, default=0.01)
+    parser.add_argument("--alpha_z", type=float, default=0.001)
+    parser.add_argument("--alpha_orth", type=float, default=0.01)
+    parser.add_argument("--noise_std", type=float, default=1.0)
+    parser.add_argument("--eval_every", type=int, default=200)
+    parser.add_argument("--optimizer", choices=["adamw", "adamw8bit"], default="adamw",
+                        help="adamw8bit uses bitsandbytes 8-bit optimizer — saves ~28GB "
+                             "optimizer state on 4.65B model, required to --unfreeze_base on H100 80GB")
+    parser.add_argument("--freeze_embeddings", action="store_true",
+                        help="Freeze embed_tokens (+tied lm_head) and embed_tokens_per_layer. "
+                             "For Gemma-4 E2B these are 2.75B of 5.1B params and embed_tokens_per_layer "
+                             "is a single 2.35B-element tensor that exceeds bnb 8bit kernel limits. "
+                             "Freezing them makes --unfreeze_base feasible with plain fp32 AdamW on "
+                             "the remaining ~2.35B params (~19GB state, fits 80GB).")
+    parser.add_argument("--use_lora", action="store_true",
+                        help="Wrap target Linears with LoRALinear (frozen base + trainable rank-r delta). "
+                             "Use INSTEAD of full base fine-tuning. Combines naturally with --int4_qat: "
+                             "LoRA wraps the int4-quantized Linear. Trains only ~10-30M LoRA params + MoE.")
+    parser.add_argument("--lora_rank", type=int, default=16,
+                        help="LoRA rank (low-dim adapter dim). Typical: 8 (less capacity, less overfit), "
+                             "16 (default), 32 (more capacity).")
+    parser.add_argument("--lora_alpha", type=float, default=16.0,
+                        help="LoRA scaling factor; effective scale = alpha/rank. Default 16/16 = 1.0.")
+    parser.add_argument("--W_r_init", choices=["random", "zero", "centroid", "scaled_centroid"], default="random",
+                        help="Router W_r init: random (default), zero (uniform routing), "
+                             "centroid (mean W_gate row per Taylor-assigned expert, L2-normalized to 0.02 mag), "
+                             "scaled_centroid (mean W_gate row per expert, NOT normalized, scaled by --W_r_scale).")
+    parser.add_argument("--W_r_scale", type=float, default=1.0,
+                        help="Multiplier for scaled_centroid init. W_r = scale × mean(W_gate per expert). "
+                             "Values ~0.1–10 control how 'loud' the router is relative to base weight scale.")
+    parser.add_argument("--W_r_lr_mult", type=float, default=1.0,
+                        help="Learning rate multiplier for router W_r params (and A logits). "
+                             "E.g., 5.0 trains the router 5× faster than base weights. The router "
+                             "is ~0.03% of total params and has a specific job — higher LR can "
+                             "help it converge quickly without destabilizing base-weight training.")
+    parser.add_argument("--freeze_A", action="store_true",
+                        help="Freeze assignment logits A (only router + optionally base train)")
+    parser.add_argument("--unfreeze_base", action="store_true",
+                        help="Train base weights (W_gate/W_up/W_down, attn, norms). Default freezes them.")
+    parser.add_argument("--save_checkpoint", type=str, default="",
+                        help="Save final student state_dict to this path (.pt)")
+    parser.add_argument("--save_every", type=int, default=0,
+                        help="If >0 and --save_checkpoint set, also save an intermediate ckpt every "
+                             "N max_steps. Filename: <save_checkpoint stem>_step<N>.pt. Use for long "
+                             "runs where you may want to early-stop without losing progress.")
+    parser.add_argument("--shuffle_seed", type=int, default=0,
+                        help="Seed for the dataloader shuffle. Same seed → same record order. Use a "
+                             "different seed in continuation runs to expose the model to a new ordering "
+                             "of the dataset.")
+    parser.add_argument("--data_skip", type=int, default=0,
+                        help="Discard first N samples of the (shuffled) dataloader stream before "
+                             "training. Combine with same --shuffle_seed as a previous run to start "
+                             "where it left off — model sees fresh records first.")
+    parser.add_argument("--load_checkpoint", type=str, default="",
+                        help="Load student state_dict from this path BEFORE training (warm-start). "
+                             "Must be from a prior rung6_moe.py run with matching architecture.")
+    parser.add_argument("--calib_path", type=str, default=CALIB_DATA_PATH,
+                        help="Path to JSONL calibration data for TRAINING. Default: final.jsonl (640 records). "
+                             "Use bulk.jsonl (~12k records) or trajectories_25k.jsonl (25k) for more data.")
+    parser.add_argument("--eval_calib_path", type=str, default="",
+                        help="Path to JSONL calibration data for EVAL. Default: same as --calib_path. "
+                             "Set to final.jsonl for consistent eval across curriculum phases.")
+    parser.add_argument("--int4_qat", action="store_true",
+                        help="Enable int4 QAT: wrap target Linears (MLP + attention) with Int4QuantLinear "
+                             "so forward uses fake-quantized weights (groupwise STE, group_size=128).")
+    parser.add_argument("--int4_group_size", type=int, default=32,
+                        help="Groupwise int4 quant group size. Default 32 matches GGUF Q4_0/Q4_K deploy block size. "
+                             "128 is another common choice (AWQ-style) with less storage overhead but larger quant error.")
+    parser.add_argument("--eval_only", action="store_true",
+                        help="Skip training; just eval after setup (init + optional checkpoint load + optional "
+                             "int4 wrap). Useful for measuring untrained-int4 baseline or a specific checkpoint's "
+                             "eval PPL at tau_end without further optimization.")
+    # Knowledge preservation fixes
+    parser.add_argument("--diverse_calib_path", type=str, default="",
+                        help="Path to JSONL (raw 'text' field) for periodic KL-to-base preservation batches. "
+                             "Usually wikitext or similar pretraining-distribution text.")
+    parser.add_argument("--diverse_every_n", type=int, default=4,
+                        help="Every N optimizer steps, replace the normal CE batch with a KL-to-teacher pass "
+                             "on diverse data. Default 4 = ~25%% of batches.")
+    parser.add_argument("--main_kl_temp", type=float, default=1.0,
+                        help="Softmax temperature for the MAIN loss when --loss kl. "
+                             "T>1 softens teacher's argmax commitment. Useful for knowledge "
+                             "retention but too high (>5) can destabilize Gemma-4 training "
+                             "due to low teacher entropy.")
+    parser.add_argument("--kl_base_lambda", type=float, default=0.5,
+                        help="Scalar on the diverse-batch KL-to-teacher loss.")
+    parser.add_argument("--kl_base_temp", type=float, default=2.0,
+                        help="Softmax temperature for KL-to-teacher. >1 softens distributions, recovering "
+                             "tail mass — important when teacher entropy is low (e.g., Gemma-4 E2B). "
+                             "Try 2-5 for Gemma-3, 5-10 for Gemma-4.")
+    parser.add_argument("--w_drift_lambda", type=float, default=0.0,
+                        help="L2-to-base weight-drift penalty: λ × Σ ‖W_student − W_teacher‖² over trainable "
+                             "base weights (excluding MoE .A and .W_r). Prevents catastrophic forgetting by "
+                             "anchoring weights to base. Typical: 1e-6 to 1e-4.")
+    parser.add_argument("--real_int4_inplace", action="store_true",
+                        help="After load_checkpoint, snap target Linear weights to int4 grid in-place (no STE, "
+                             "no runtime overhead). Simulates deployment — forward uses plain nn.Linear with "
+                             "already-quantized weights. Combine with --eval_only for the real-int4 benchmark.")
+    parser.add_argument("--gaussian_noise_scale", type=float, default=0.0,
+                        help="Add N(0, scale × p.std()) Gaussian noise to target Linear weights in-place. "
+                             "Default 0.0 = disabled. 0.129 is the analytical int4 group=32 equivalent.")
+    # ── Activation-MSE recovery (mechanism A: generic per-module) ──
+    parser.add_argument("--recovery_steps", type=int, default=0,
+                        help="If >0: run module_recovery.recover_modules_sequentially on every per-layer "
+                             "MLP after install_moe + wrap_int4 (+ wrap_lora) and BEFORE main training. "
+                             "Default 0 = disabled.")
+    parser.add_argument("--recovery_lr", type=float, default=1e-4,
+                        help="LR for the generic recovery AdamW (only A and W_r receive grad — base "
+                             "and LoRA params are not in the trainable set during recovery).")
+    parser.add_argument("--recovery_n_batches", type=int, default=8,
+                        help="# calibration batches sampled from --calib_path for generic recovery.")
+    # ── Activation-MSE recovery (mechanism B: specialized MoE per-layer) ──
+    parser.add_argument("--moe_recovery_seconds_per_layer", type=float, default=0.0,
+                        help="If >0: run finetune_moe_per_layer for this many wall-clock seconds per "
+                             "MLP layer. Pre-caches teacher (X, Y), optimizes A and W_r only. "
+                             "Default 0 = disabled.")
+    parser.add_argument("--moe_recovery_lr", type=float, default=1e-3,
+                        help="LR for the specialized per-layer recovery (A and W_r are tiny — 1e-3 is fine).")
+    parser.add_argument("--moe_recovery_n_calib_records", type=int, default=32,
+                        help="# calibration records (single-sequence, len MAX_SEQ_LEN) cached for the "
+                             "specialized recovery. Memory ≈ 2 × N × MAX_SEQ_LEN × hidden × 2 bytes.")
+    parser.add_argument("--moe_recovery_use_student_inputs", type=lambda s: s.lower() in ("1", "true", "yes"),
+                        default=True,
+                        help="If True (default), refresh student X between layers so each layer sees "
+                             "error-corrected upstream activations. If False, use teacher X throughout "
+                             "(matches Sunday's original pipeline).")
+    parser.add_argument("--moe_recovery_optimizer", choices=["adam", "muon"], default="adam",
+                        help="Specialized recovery optimizer. 'muon' uses muon.MuonWithAdam (matrix-aware "
+                             "Newton-Schulz). A and W_r are both 2D so Muon-eligible.")
+    parser.add_argument("--moe_recovery_noise_std", type=float, default=-1.0,
+                        help="Override MoEMLP router noise during recovery. -1.0 = keep current "
+                             "MoEMLP setting (default 1.0 from MoE training convention). 0.0 = "
+                             "deterministic routing for clean per-step loss + meaningful best-state "
+                             "tracking + train/deploy match. Higher = more router exploration.")
+    args = parser.parse_args()
+    if not args.eval_calib_path:
+        args.eval_calib_path = args.calib_path
+    K_spec = args.K - args.K_const
+    assert K_spec >= 0 and args.K_const >= 0 and args.K >= 1
+    if args.K_active_spec < 0:
+        # Target per-token sparsity = 40% of D_FFN = PRUNE_K neurons.
+        # Each expert covers ~D_FFN/K neurons at MECE. K_const always fires (D_FFN/K * K_const).
+        # Need K_active_spec such that (K_const + K_active_spec) * D_FFN/K ≈ PRUNE_K
+        # → K_active_spec = round(K * PRUNE_P - K_const)
+        k_act = max(1, round(args.K * PRUNE_P) - args.K_const) if K_spec > 0 else 0
+        args.K_active_spec = k_act
+    assert args.K_active_spec <= K_spec
+    os.makedirs("logs", exist_ok=True)
+    print(f"=== Rung 6 MoE — phase={args.phase} ===")
+    print(f"  K={args.K}  K_const={args.K_const}  K_spec={K_spec}  K_active_spec={args.K_active_spec}")
+    print(f"  mece_mode={args.mece_mode}  init={args.init}  loss={args.loss}")
+    print(f"  tau: {args.tau_start} → {args.tau_end} over {args.max_steps} steps")
+    # Gemma-4 has two MLP widths (6144 / 12288). Report both layer types' active budgets.
+    ratio = (args.K_const + args.K_active_spec) / args.K
+    for width_name, d in (("narrow (layers 0-14)", INTERMEDIATE),
+                          ("wide   (layers 15+)", INTERMEDIATE_WIDE)):
+        eff_active = ratio * d
+        prune_k = int(d * PRUNE_P)
+        print(f"  {width_name}: active ~{eff_active:.0f}/{d}  "
+              f"(40% target = {prune_k}; diff = {eff_active - prune_k:+.0f})")
+    print(f"  freeze_A={args.freeze_A}  unfreeze_base={args.unfreeze_base}  W_r_init={args.W_r_init}")
+    if args.load_checkpoint: print(f"  load_checkpoint={args.load_checkpoint}")
+    if args.save_checkpoint: print(f"  save_checkpoint={args.save_checkpoint}")
+    print(f"Loading teacher & student on {DEVICE}...")
+    teacher, tokenizer = load_model()
+    teacher.eval()
+    for p in teacher.parameters(): p.requires_grad_(False)
+    student, _ = load_model()
+    # Note: NO corruption — rung 6 uses the CLEAN IT model.
+    freeze_base = not args.unfreeze_base
+    if freeze_base:
+        for p in student.parameters(): p.requires_grad_(False)  # freeze base first
+    # If unfreeze_base: leave requires_grad=True on all params (default)
+    # Embedding freeze for Gemma-4 (selectively keep embed_tokens and embed_tokens_per_layer
+    # frozen even when the rest of the base is training). Required for Gemma-4 4.65B on 80GB:
+    # embed_tokens_per_layer alone is a single 2.35B tensor that breaks bnb 8bit kernels, and
+    # embeddings rarely need to move for MoE-preservation work anyway.
+    if args.freeze_embeddings:
+        n_frozen = 0
+        for name, p in student.named_parameters():
+            if "embed_tokens" in name:  # catches embed_tokens and embed_tokens_per_layer (and tied lm_head)
+                p.requires_grad_(False)
+                n_frozen += p.numel()
+        print(f"  Froze embeddings: {n_frozen/1e9:.2f}B params (embed_tokens, embed_tokens_per_layer, tied lm_head)")
+    # Initialization
+    taylor_scores = None
+    if args.init == "taylor" and not args.load_checkpoint:
+        print("Computing Taylor saliency for init...")
+        taylor_scores = compute_taylor_saliency(student, tokenizer, n_batches=8, calib_path=args.calib_path)
+    init_As = init_assignment_logits(args.init if not args.load_checkpoint else "random",
+                                     args.K, args.K_const, taylor_scores, core_frac=args.core_frac)
+    init_W_rs = init_router_weights(args.W_r_init, student, init_As, args.K, args.K_const,
+                                    scale_multiplier=args.W_r_scale)
+    mlp_modules = install_moe(
+        student, K=args.K, K_const=args.K_const,
+        K_active_spec=args.K_active_spec, mece_mode=args.mece_mode,
+        init_As=init_As, noise_std=args.noise_std,
+        freeze_base=freeze_base, init_W_rs=init_W_rs)
+    # Optionally freeze A (only router trains) — done AFTER install_moe
+    if args.freeze_A:
+        for m in mlp_modules:
+            m.A.requires_grad_(False)
+        print("  A frozen — only router W_r (and base if --unfreeze_base) trains")
+    # Load warm-start checkpoint BEFORE computing trainable params
+    if args.load_checkpoint:
+        print(f"  Loading checkpoint from {args.load_checkpoint}...")
+        ckpt = torch.load(args.load_checkpoint, map_location=DEVICE)
+        state = ckpt.get('student_state', ckpt) if isinstance(ckpt, dict) else ckpt
+        missing, unexpected = student.load_state_dict(state, strict=False)
+        print(f"    missing={len(missing)} unexpected={len(unexpected)}")
+    # Int4 QAT: wrap target Linears AFTER state_dict load (keys unchanged — subclass of nn.Linear).
+    # Must happen BEFORE optimizer creation so parameter references are stable.
+    if args.int4_qat:
+        Int4QuantLinear._group_size = args.int4_group_size
+        n_wrapped = wrap_int4(student)
+        print(f"  Int4 QAT: wrapped {n_wrapped} nn.Linear modules (group_size={args.int4_group_size}, "
+              f"range [-7, 7]). Forward uses fake-quant; backward is STE through fp weight.")
+    # LoRA: wrap target Linears (incl. Int4QuantLinear) with LoRALinear so base is frozen
+    # and only LoRA A/B + MoE A logits/W_r train. Apply AFTER int4 so the base inside LoRA
+    # is the int4-quantized Linear (deploy-realistic).
+    if args.use_lora:
+        # Pure-LoRA semantics: freeze ALL base params (including attention, norms, scalars
+        # not in LoRA target list). MoE A/W_r and the LoRA adapters added by wrap_lora are
+        # the only trainable things. Overrides --unfreeze_base.
+        for name, p in student.named_parameters():
+            if not (name.endswith(".A") or name.endswith(".W_r")):
+                p.requires_grad_(False)
+        n_wrapped, n_lora_params = wrap_lora(student, rank=args.lora_rank, alpha=args.lora_alpha)
+        print(f"  LoRA: wrapped {n_wrapped} Linears with rank={args.lora_rank} alpha={args.lora_alpha} "
+              f"(trainable LoRA params: {n_lora_params/1e6:.2f}M)")
+    # Real int4 quantization in-place (deploy simulation — no runtime quant overhead).
+    if args.real_int4_inplace:
+        n_q = apply_int4_inplace(student, group_size=args.int4_group_size)
+        print(f"  Real int4 inplace: quantized {n_q} Linear weights to int4 grid "
+              f"(group_size={args.int4_group_size}); weights now on-grid, regular nn.Linear forward.")
+    # Gaussian-proxy noise benchmark.
+    if args.gaussian_noise_scale > 0:
+        n_g = apply_gaussian_noise_inplace(student, noise_scale=args.gaussian_noise_scale)
+        print(f"  Gaussian noise inplace: added N(0, {args.gaussian_noise_scale} × p.std()) "
+              f"to {n_g} Linear weights.")
+    # ────────── Activation-MSE recovery (mechanism A: generic) ──────────
+    # Runs AFTER install_moe + wrap_int4 (+ wrap_lora) so the recovered student
+    # is the deployed one (int4 fake-quant in the loop, MoE routing engaged at
+    # tau_end). Trainable params during recovery: same as training (i.e., A,
+    # W_r — base is frozen unless --unfreeze_base, in which case it'd also move,
+    # but we explicitly want only A/W_r so we do NOT alter requires_grad here).
+    if args.recovery_steps > 0:
+        # Hard routing during recovery — match deploy-time temperature.
+        for m in mlp_modules: m.tau = args.tau_end
+        # Optionally override router noise during recovery (default -1 = leave as-is).
+        prev_noise = [getattr(m, "noise_std", None) for m in mlp_modules]
+        if args.moe_recovery_noise_std >= 0:
+            for m in mlp_modules:
+                if hasattr(m, "noise_std"): m.noise_std = args.moe_recovery_noise_std
+        print(f"\n  [recovery A] generic recover_modules_sequentially "
+              f"steps={args.recovery_steps}  lr={args.recovery_lr}  "
+              f"n_batches={args.recovery_n_batches}  tau={args.tau_end}  "
+              f"noise={args.moe_recovery_noise_std if args.moe_recovery_noise_std >= 0 else 'unchanged'}")
+        # Restrict trainable set to MoE params (A, W_r) for the duration of
+        # recovery. Snapshot prior requires_grad so we can restore it for main
+        # training (e.g., LoRA adapters that should keep training afterwards).
+        prev_requires_grad = {n: p.requires_grad for n, p in student.named_parameters()}
+        # Restrict to A/W_r — but RESPECT --freeze_A: don't enable A if it was
+        # frozen pre-recovery. Same for W_r (in case caller froze it).
+        for n, p in student.named_parameters():
+            is_moe = n.endswith(".A") or n.endswith(".W_r")
+            p.requires_grad_(is_moe and prev_requires_grad[n])
+        # Pull `recovery_n_batches` calibration batches (input_ids only).
+        rec_seqs = load_seqs(tokenizer, "train", calib_path=args.calib_path)
+        rec_seqs = rec_seqs[:args.recovery_n_batches * BATCH]
+        rec_loader = torch.utils.data.DataLoader(rec_seqs, batch_size=BATCH)
+        rec_input_ids = [batch["input_ids"] for batch in rec_loader][:args.recovery_n_batches]
+        if not rec_input_ids:
+            print("  [recovery A] no calibration data — skipping")
+        else:
+            n_train_per_mlp = sum(
+                p.numel() for n, p in mlp_modules[0].named_parameters(recurse=False)
+                if p.requires_grad and n in ("A", "W_r")
+            )
+            print(f"  [recovery A] per-layer MoE trainable params: {n_train_per_mlp}")
+            rec_results = recover_modules_via_generic_pipeline(
+                student=student, teacher=teacher,
+                calibration_input_ids=rec_input_ids,
+                n_layers=N_LAYERS,
+                steps=args.recovery_steps,
+                lr=args.recovery_lr,
+                device=DEVICE,
+            )
+            for r in rec_results:
+                print(f"    {r['name']}  in_mse={r['input_mse']:.4e}  "
+                      f"out_pre={r['output_mse_before']:.4e}  out_post={r['output_mse_after']:.4e}")
+        # Restore prior requires_grad state, tau, and noise.
+        for n, p in student.named_parameters():
+            p.requires_grad_(prev_requires_grad[n])
+        for m in mlp_modules: m.tau = args.tau_start
+        if args.moe_recovery_noise_std >= 0:
+            for m, n in zip(mlp_modules, prev_noise):
+                if hasattr(m, "noise_std") and n is not None: m.noise_std = n
+    # ────────── Activation-MSE recovery (mechanism B: specialized MoE) ──────────
+    # Pre-cache (X, Y) per layer once via teacher forward, then per-layer
+    # time-budgeted optimization of A and W_r only with student-input
+    # propagation between layers.
+    if args.moe_recovery_seconds_per_layer > 0:
+        # Hard routing during recovery — match deploy-time temperature.
+        for m in mlp_modules: m.tau = args.tau_end
+        print(f"\n  [recovery B] finetune_moe_per_layer "
+              f"sec/layer={args.moe_recovery_seconds_per_layer}  "
+              f"lr={args.moe_recovery_lr}  n_calib={args.moe_recovery_n_calib_records}  "
+              f"use_student_inputs={args.moe_recovery_use_student_inputs}  "
+              f"opt={args.moe_recovery_optimizer}  tau={args.tau_end}")
+        moe_rec_seqs = load_seqs(tokenizer, "train", calib_path=args.calib_path)
+        moe_rec_seqs = moe_rec_seqs[:args.moe_recovery_n_calib_records * BATCH]
+        moe_rec_loader = torch.utils.data.DataLoader(moe_rec_seqs, batch_size=BATCH)
+        moe_rec_input_ids = [b["input_ids"] for b in moe_rec_loader][:args.moe_recovery_n_calib_records]
+        if not moe_rec_input_ids:
+            print("  [recovery B] no calibration data — skipping")
+        else:
+            n_train_per_mlp = sum(
+                p.numel() for n, p in mlp_modules[0].named_parameters(recurse=False)
+                if p.requires_grad and n in ("A", "W_r")
+            )
+            print(f"  [recovery B] per-layer MoE trainable params: {n_train_per_mlp}")
+            moe_rec_results = finetune_moe_per_layer(
+                student=student, teacher=teacher,
+                calibration_input_ids=moe_rec_input_ids,
+                n_layers=N_LAYERS,
+                seconds_per_layer=args.moe_recovery_seconds_per_layer,
+                lr=args.moe_recovery_lr,
+                optimizer=args.moe_recovery_optimizer,
+                use_student_inputs=args.moe_recovery_use_student_inputs,
+                device=DEVICE,
+                tau_end=args.tau_end,
+                noise_std=(None if args.moe_recovery_noise_std < 0 else args.moe_recovery_noise_std),
+            )
+            # Restore tau to start for main training.
+            for m in mlp_modules: m.tau = args.tau_start
+    trainable_params = [p for p in student.parameters() if p.requires_grad]
+    n_train = sum(p.numel() for p in trainable_params)
+    moe_params_max = sum(_d_ffn_at(i) * args.K for i in range(N_LAYERS)) \
+                     + N_LAYERS * D_MODEL * max(K_spec, 0)
+    trainable_base = sum(p.numel() for n, p in student.named_parameters()
+                         if p.requires_grad and not (n.endswith(".A") or n.endswith(".W_r")))
+    trainable_moe = sum(p.numel() for n, p in student.named_parameters()
+                        if p.requires_grad and (n.endswith(".A") or n.endswith(".W_r")))
+    print(f"  Trainable params: {n_train/1e6:.3f}M  "
+          f"(MoE: {trainable_moe/1e6:.3f}M / max {moe_params_max/1e6:.3f}M, "
+          f"base trainable: {trainable_base/1e6:.2f}M)")
+    if freeze_base and not args.freeze_A:
+        assert trainable_base == 0, f"freeze_base=True but {trainable_base} base params are trainable"
+        assert trainable_moe <= moe_params_max * 1.01, "Too many MoE params trainable"
+    if args.freeze_A:
+        assert trainable_moe <= N_LAYERS * D_MODEL * max(K_spec, 0) * 1.01, \
+            "freeze_A=True but A appears to be trainable"
+    # Eval-only mode: skip training entirely, jump to final eval at tau_end.
+    if args.eval_only:
+        print(f"  Eval-only mode — skipping training, evaluating at tau={args.tau_end}")
+        print(f"  Eval data: {args.eval_calib_path}")
+        for m in mlp_modules: m.tau = args.tau_end
+        final_ppl = eval_ppl(student, tokenizer, calib_path=args.eval_calib_path)
+        print(f"\n=== Eval-only PPL (tau={args.tau_end}): {final_ppl:.4f}  "
+              f"baseline(bottom60 CE)={BASELINE_PPL:.4f}  clean={CLEAN_PPL:.4f} ===")
+        out = {
+            "phase": args.phase, "config": vars(args),
+            "final_ppl": final_ppl,
+            "baseline_ppl": BASELINE_PPL, "clean_ppl": CLEAN_PPL,
+            "ppl_curve": [], "eval_only": True,
+        }
+        os.makedirs("logs", exist_ok=True)
+        out_path = f"logs/rung6_moe_{args.phase}_results.json"
+        with open(out_path, "w") as f:
+            json.dump(out, f, indent=2)
+        print(f"Saved to {out_path}")
+        return
+    # Split params into MoE (A + W_r) vs base for per-group LR.
+    # --W_r_lr_mult multiplies the MoE group's LR relative to base_params' args.lr.
+    moe_group_params = [p for n, p in student.named_parameters()
+                        if p.requires_grad and (n.endswith(".A") or n.endswith(".W_r"))]
+    base_group_params = [p for n, p in student.named_parameters()
+                         if p.requires_grad and not (n.endswith(".A") or n.endswith(".W_r"))]
+    param_groups = [
+        {"params": base_group_params, "lr": args.lr},
+        {"params": moe_group_params,  "lr": args.lr * args.W_r_lr_mult},
+    ]
+    print(f"  LR: base={args.lr:.2e}  MoE(A+W_r)={args.lr * args.W_r_lr_mult:.2e}  "
+          f"(multiplier={args.W_r_lr_mult})")
+    if args.optimizer == "adamw8bit":
+        if not _HAS_BNB:
+            raise RuntimeError("bitsandbytes not installed — pip install bitsandbytes")
+        # Paged variant handles huge tensors (Gemma-4's embed_tokens_per_layer is 2.35B params,
+        # exceeds non-paged bnb kernel grid limits → "invalid configuration argument").
+        optimizer = bnb.optim.PagedAdamW8bit(param_groups, weight_decay=0.01)
+        print(f"  Using bnb.optim.PagedAdamW8bit (~28GB optimizer-state savings, "
+              f"paged to handle Gemma-4's 2.35B embed_tokens_per_layer)")
+    else:
+        optimizer = AdamW(param_groups, weight_decay=0.01)
+    scheduler = CosineAnnealingLR(optimizer, T_max=args.max_steps, eta_min=args.lr * 0.1)
+    print(f"  Train data: {args.calib_path}")
+    print(f"  Eval  data: {args.eval_calib_path}")
+    # When train and eval paths differ, use ALL records of train file (no need to withhold 20%
+    # since eval comes from a separate file).
+    train_split = "all" if args.calib_path != args.eval_calib_path else "train"
+    seqs = load_seqs(tokenizer, train_split, calib_path=args.calib_path)
+    print(f"  Loaded {len(seqs)} train sequences of {MAX_SEQ_LEN} tokens = {len(seqs)*MAX_SEQ_LEN/1e6:.2f}M tokens"
+          f"  (split={train_split})")
+    # Deterministic shuffle: same --shuffle_seed reproduces the same record order.
+    # Use a different seed in continuation runs to expose model to NEW orderings of
+    # the dataset (avoids replaying the same trajectory the prior run already trained on).
+    g = torch.Generator(); g.manual_seed(args.shuffle_seed)
+    loader = torch.utils.data.DataLoader(seqs, BATCH, shuffle=True, generator=g)
+    loader_iter = iter(loader)
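+    # Optional skip: discard the first N items of the shuffled loader stream before training
+    # begins. Each next() on the loader yields one batch (BATCH sequences), so N counts batches.
+    # Useful when a previous run with the same shuffle_seed already consumed N batches.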
+    if args.data_skip > 0:
+        skipped = 0
+        for _ in range(args.data_skip):
+            try:
+                next(loader_iter); skipped += 1
+            except StopIteration:
+                loader_iter = iter(loader)
+                next(loader_iter); skipped += 1
+        print(f"  Skipped first {skipped} samples (data_skip={args.data_skip})")
+    # Optional knowledge-preservation: load diverse corpus + cache teacher base params.
+    diverse_loader_iter = None
+    diverse_dataset_obj = None
+    if args.diverse_calib_path:
+        print(f"  Diverse corpus (KL-to-base): {args.diverse_calib_path}")
+        diverse_seqs = load_seqs(tokenizer, "all", calib_path=args.diverse_calib_path, raw_text=True)
+        print(f"    {len(diverse_seqs)} sequences, every {args.diverse_every_n} steps, "
+              f"λ={args.kl_base_lambda}, T={args.kl_base_temp}")
+        diverse_dataset_obj = torch.utils.data.DataLoader(diverse_seqs, BATCH, shuffle=True)
+        diverse_loader_iter = iter(diverse_dataset_obj)
+    teacher_param_map = None
+    if args.w_drift_lambda > 0:
+        print(f"  W-drift penalty active: λ={args.w_drift_lambda} on trainable base params")
+        teacher_param_map = {n: p.detach() for n, p in teacher.named_parameters()}
+    step, accum_loss = 0, 0.0
+    optimizer.zero_grad()
+    t0 = time.time()
+    curve = []
+    while step < args.max_steps:
+        tau = get_tau(step, args.max_steps, args.tau_start, args.tau_end, hold_frac=args.tau_hold_frac)
+        for m in mlp_modules: m.tau = tau
+        student.train()
+        use_diverse = (diverse_loader_iter is not None and step > 0 and (step % args.diverse_every_n == 0))
+        if use_diverse:
+            # Pretraining-distribution preservation batch: KL-to-teacher at temperature T.
+            try: batch = next(diverse_loader_iter)
+            except StopIteration:
+                diverse_loader_iter = iter(diverse_dataset_obj); batch = next(diverse_loader_iter)
+            ids = batch["input_ids"].to(DEVICE)
+            with torch.no_grad():
+                t_logits = teacher(ids)
+            s_logits = student(ids)
+            # High-temperature KL: softens sharp teacher distributions to carry tail signal.
+            main_loss = args.kl_base_lambda * kl_loss(s_logits[:, :-1], t_logits[:, :-1], temp=args.kl_base_temp)
+        else:
+            # Normal CE/KL batch on IT trajectories.
+            try: batch = next(loader_iter)
+            except StopIteration:
+                loader_iter = iter(loader); batch = next(loader_iter)
+            ids = batch["input_ids"].to(DEVICE)
+            labels = batch["labels"][:, :-1].to(DEVICE)
+            with torch.no_grad():
+                t_logits = teacher(ids)
+            s_logits = student(ids)
+            if args.loss == "kl":
+                # Mask = positions where labels != -100 (i.e., assistant response only).
+                # Same masking we apply to CE — keeps "don't train on prompt tokens" consistent.
+                kl_mask = (labels != -100)
+                main_loss = kl_loss(s_logits[:, :-1], t_logits[:, :-1],
+                                    temp=args.main_kl_temp, mask=kl_mask)
+            else:
+                main_loss = ce_loss(s_logits[:, :-1], labels)
+        # Aux losses apply on every batch — functions of module state, not batch content.
+        aux = sum(m.aux_loss(args.alpha_b, args.alpha_z) for m in mlp_modules)
+        orth = sum(m.orth_loss() for m in mlp_modules) * args.alpha_orth
+        # Optional: weight-drift penalty on trainable base params (EWC-lite).
+        drift = torch.tensor(0.0, device=DEVICE)
+        if args.w_drift_lambda > 0:
+            for n, p in student.named_parameters():
+                if not p.requires_grad: continue
+                if n.endswith(".A") or n.endswith(".W_r"): continue
+                t = teacher_param_map.get(n) if teacher_param_map is not None else None
+                if t is not None and t.shape == p.shape:
+                    drift = drift + ((p - t) ** 2).sum()
+            drift = drift * args.w_drift_lambda
+        loss = (main_loss + aux + orth + drift) / GRAD_ACCUM
+        loss.backward()
+        accum_loss += loss.item()
+        if (step + 1) % GRAD_ACCUM == 0:
+            torch.nn.utils.clip_grad_norm_(trainable_params, 1.0)
+            optimizer.step(); scheduler.step(); optimizer.zero_grad()
+        if (step + 1) % args.eval_every == 0:
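+            # Diagnostic metrics: assignment entropy at the current τ (clamped to >= 1e-3), and
+            # pairwise Jaccard overlap from the argmax-based hard assignment (independent of τ).
+            # NOTE(review): the hard assignment is one-hot per neuron, so expert sets are disjoint
+            # and the off-diagonal Jaccard appears to be 0 by construction — confirm this is intended.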
+            with torch.no_grad():
+                avg_entropy = 0.0; avg_jaccard = 0.0
+                for m in mlp_modules:
+                    probs = F.softmax(m.A / max(tau, 1e-3), dim=-1)       # [D_FFN, K]
+                    ent = -(probs * (probs.clamp_min(1e-8)).log()).sum(-1).mean().item()
+                    avg_entropy += ent
+                    # Hard assignment: each neuron → argmax expert
+                    hard = F.one_hot(probs.argmax(dim=-1), args.K).float().T  # [K, D_FFN]
+                    inter = hard @ hard.T                                     # [K, K]
+                    sz = hard.sum(dim=-1, keepdim=True)                       # [K, 1]
+                    union = sz + sz.T - inter
+                    jac_off = (inter / union.clamp_min(1.0))
+                    jac_off = jac_off - torch.diag(torch.diag(jac_off))       # zero diagonal
+                    avg_jaccard += jac_off.sum().item() / (args.K * (args.K - 1) + 1e-8)
+                avg_entropy /= len(mlp_modules)
+                avg_jaccard /= len(mlp_modules)
+            ppl = eval_ppl(student, tokenizer, calib_path=args.eval_calib_path)
+            curve.append({
+                "step": step + 1, "ppl": ppl, "tau": tau,
+                "assign_entropy": avg_entropy, "jaccard": avg_jaccard,
+            })
+            print(f"  step={step+1:4d}  tau={tau:.4f}  loss={accum_loss*GRAD_ACCUM:.4f}  "
+                  f"ppl={ppl:.4f}  H(A)={avg_entropy:.3f}  Jac={avg_jaccard:.4f}  "
+                  f"t={time.time()-t0:.0f}s")
+            accum_loss = 0.0
+        # Intermediate ckpt save (--save_every) — single rolling file, OVERWRITES previous.
+        # Filename: <save_checkpoint stem>_intermediate.pt — only one extra ckpt on disk
+        # at any time. Read 'step' field of the saved dict to know which step it was at.
+        if args.save_every and args.save_checkpoint and (step + 1) % args.save_every == 0:
+            stem, ext = os.path.splitext(args.save_checkpoint)
+            inter_path = f"{stem}_intermediate{ext}"
+            os.makedirs(os.path.dirname(inter_path) or ".", exist_ok=True)
+            torch.save({
+                'student_state': student.state_dict(),
+                'config': vars(args),
+                'step': step + 1,
+            }, inter_path)
+            print(f"  [intermediate] overwrote {inter_path} (step {step+1})")
+        step += 1
+    # Final eval at tau_end
+    for m in mlp_modules: m.tau = args.tau_end
+    final_ppl = eval_ppl(student, tokenizer, calib_path=args.eval_calib_path)
+    print(f"\n=== Final PPL (tau={args.tau_end}): {final_ppl:.4f}  "
+          f"baseline(bottom60 CE)={BASELINE_PPL:.4f}  clean={CLEAN_PPL:.4f} ===")
+    out = {
+        "phase": args.phase, "config": vars(args),
+        "final_ppl": final_ppl,
+        "baseline_ppl": BASELINE_PPL, "clean_ppl": CLEAN_PPL,
+        "ppl_curve": curve,
+    }
+    os.makedirs("logs", exist_ok=True)
+    out_path = f"logs/rung6_moe_{args.phase}_results.json"
+    with open(out_path, "w") as f:
+        json.dump(out, f, indent=2)
+    print(f"Saved to {out_path}")
+    if args.save_checkpoint:
+        os.makedirs(os.path.dirname(args.save_checkpoint) or ".", exist_ok=True)
+        torch.save({
+            'student_state': student.state_dict(),
+            'config': vars(args),
+            'final_ppl': final_ppl,
+        }, args.save_checkpoint)
+        print(f"Saved checkpoint to {args.save_checkpoint}")
+if __name__ == "__main__":  # true only when this file is executed as a script, not when imported
+    main()  # run the script entry point