ataeff
/

janus4

Model card Files Files and versions

xet

Community

ataeff committed on Mar 25

Commit

9ee0dae

verified ·

1 Parent(s): b3a256b

Upload janus/janus_gpt_v4_lowrank.py with huggingface_hub

Browse files

Files changed (1) hide show

janus/janus_gpt_v4_lowrank.py +654 -0

janus/janus_gpt_v4_lowrank.py ADDED Viewed

	@@ -0,0 +1,654 @@

+"""
+Janus 285M GPT — nanochat fork with 3-way hybrid attention.
+Architecture delta from nanochat's gpt.py:
+  1. MLP: ReLU^2 -> SwiGLU (w_gate, w_up, w_down)
+  2. Attention: CausalSelfAttention -> JanusHybridAttention
+     - Standard QKV (FA3/SDPA, RoPE, QK-norm)
+     - RRPRAM: positional resonance via Wr[H, E, T_r], linear, non-quadratic
+     - Janus echo: Wj^T * Wj self-resonance
+     - Learned per-head 3-way gate: softmax([3]) blends the three pathways
+  3. No value_embeds / ve_gate (nanochat feature not used in Janus)
+Everything else from nanochat is preserved:
+  - resid_lambdas, x0_lambdas (per-layer residual scaling)
+  - smear_gate, smear_lambda (bigram token mixing)
+  - backout_lambda (mid-layer subtraction)
+  - RoPE, QK-norm, softcap=15, non-parametric RMSNorm
+  - Sliding window attention support
+Confirmed against checkpoint keys from janus_285m_base_final.pt:
+  V=32000, E=640, H=10, D=64, B=20, M=1664, T=1024, ~285M params
+"""
+from functools import partial
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from nanochat.common import get_dist_info, print0, COMPUTE_DTYPE
+from nanochat.optim import MuonAdamW, DistMuonAdamW
+from nanochat.flash_attention import flash_attn
+@dataclass
+class JanusConfig:
+    """Hyperparameters for the Janus model defined in this file.
+    The defaults correspond to the shapes listed in the module docstring
+    (V=32000, E=640, H=10, B=20, M=1664, T=1024).
+    """
+    sequence_len: int = 1024   # context length (T); also the full ("L") attention window, and the rotary table is built for 10x this
+    vocab_size: int = 32000    # tokenizer vocabulary (V); embedding tables are padded up, logits are cropped back to this
+    n_layer: int = 20          # number of transformer blocks (B)
+    n_head: int = 10       # number of query heads (H)
+    n_kv_head: int = 10    # same as n_head for Janus (no GQA)
+    n_embd: int = 640      # embedding dim (E)
+    mlp_hidden: int = 1664 # SwiGLU intermediate dim (M) — NOT 4*n_embd
+    rrpram_T: int = 1024   # RRPRAM positional dimension (T_r, same as sequence_len)
+    rrpram_rank: int = 64  # low-rank factorization rank (0 = full rank for backward compat)
+    # Sliding window attention pattern string, tiled across layers.
+    # Characters are matched case-insensitively: "L" = full window, "S" = short window.
+    # The last layer always gets the full window regardless of the pattern.
+    window_pattern: str = "L"  # Janus used full context (no sliding window)
+def norm(x):
+    """RMS-normalize ``x`` over its last dimension, with no learnable scale.
+    Uses ``F.rms_norm`` when this PyTorch build provides it; otherwise the
+    mean square is computed in float32 (eps=1e-6) and the result is cast
+    back to the input dtype.
+    """
+    if not hasattr(F, 'rms_norm'):
+        # Older PyTorch: do the reduction by hand
+        mean_square = x.float().pow(2).mean(-1, keepdim=True)
+        rescaled = x * torch.rsqrt(mean_square + 1e-6)
+        return rescaled.to(x.dtype)
+    feature_dim = x.size(-1)
+    return F.rms_norm(x, (feature_dim,))
+class Linear(nn.Linear):
+    """nn.Linear that casts its parameters to the input dtype in forward.
+    The stored weight (and bias, if the layer was built with one) keep their
+    own dtype; only the copies used for this matmul are cast, so the output
+    dtype follows ``x``.
+    """
+    def forward(self, x):
+        weight = self.weight.to(dtype=x.dtype)
+        # Every layer in this file is built with bias=False, but a bias that
+        # was requested must still be applied rather than silently dropped.
+        bias = None if self.bias is None else self.bias.to(dtype=x.dtype)
+        return F.linear(x, weight, bias)
+def apply_rotary_emb(x, cos, sin):
+    """Apply a rotary position embedding to a 4-D tensor.
+    The last dimension of ``x`` is split into a first and a second half, and
+    each (first, second) pair is rotated using ``cos`` / ``sin``, which must
+    broadcast against one half. The output has the same shape as ``x``.
+    """
+    assert x.ndim == 4  # (B, T, H, D)
+    half = x.shape[3] // 2
+    first, second = x[..., :half], x[..., half:]
+    rotated_first = first * cos + second * sin
+    rotated_second = second * cos - first * sin
+    return torch.cat((rotated_first, rotated_second), dim=3)
+class JanusHybridAttention(nn.Module):
+    """
+    3-way hybrid attention: QKV + RRPRAM + Janus echo, blended by a learned per-head gate.
+    Pathway 1 - QKV (standard):
+        Causal scaled dot-product attention with RoPE and QK-norm, computed by
+        the flash_attn wrapper imported from nanochat (optionally against a KV cache).
+    Pathway 2 - RRPRAM (positional resonance):
+        score[b,h,t] = head_dim**-0.5 * sum_e x[b,t,e] * Wr[h,e,t]
+        Wr is stored low-rank as wr_a[H,E,R] @ wr_b[H,R,T_r] when
+        config.rrpram_rank > 0, otherwise as one full wr[H,E,T_r] tensor.
+        Every query position i applies a softmax to the same key scores over j <= i.
+        Values: separate wvr projection.
+    Pathway 3 - Janus echo (self-resonance):
+        echo      = wj(x)                 (= x @ wj.weight.T)
+        echo_back = echo @ wj.weight
+        score[t]  = 15 * tanh(dot(x[t], echo_back[t]) / sqrt(E) / 15)
+        attn[i,j] = softmax over j <= i of score[i] * score[j], shared by all heads
+        Values: echo itself, split into heads.
+    Gate: nn.Parameter [H, 3], softmax per head, blends the three pathway outputs.
+    NOTE: only pathway 1 reads the KV cache. Pathways 2 and 3 attend over the
+    tokens passed to the current call only, and RRPRAM indexes its positional
+    table from 0 for that chunk.
+    Args:
+        config: provides n_head, n_kv_head, n_embd, rrpram_rank and rrpram_T.
+        layer_idx: index of this layer; selects the layer's slot in the KV cache.
+    """
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.n_head = config.n_head
+        self.n_kv_head = config.n_kv_head
+        self.n_embd = config.n_embd
+        self.head_dim = self.n_embd // self.n_head
+        assert self.n_embd % self.n_head == 0
+        # Pathway 1: Standard QKV
+        self.c_q = Linear(self.n_embd, self.n_head * self.head_dim, bias=False)
+        self.c_k = Linear(self.n_embd, self.n_kv_head * self.head_dim, bias=False)
+        self.c_v = Linear(self.n_embd, self.n_kv_head * self.head_dim, bias=False)
+        # Pathway 2: RRPRAM
+        self.rrpram_rank = config.rrpram_rank
+        if config.rrpram_rank > 0:
+            # Low-rank factorization: Wr ≈ wr_a @ wr_b
+            # Was [H, E, T_r] = 6.5M per layer. Now wr_a[H,E,R]+wr_b[H,R,T] ≈ 1.1M
+            self.wr_a = nn.Parameter(torch.zeros(config.n_head, config.n_embd, config.rrpram_rank))
+            self.wr_b = nn.Parameter(torch.zeros(config.n_head, config.rrpram_rank, config.rrpram_T))
+        else:
+            # Full rank (backward compat with v3 checkpoints)
+            self.wr = nn.Parameter(torch.zeros(config.n_head, config.n_embd, config.rrpram_T))
+        # Separate value projection for RRPRAM
+        self.wvr = Linear(self.n_embd, self.n_embd, bias=False)
+        # Pathway 3: Janus echo (W^T * W self-resonance)
+        self.wj = Linear(self.n_embd, self.n_embd, bias=False)
+        # Per-head 3-way gate: [H, 3], stored at exactly that shape (no padding is applied)
+        self.gate = nn.Parameter(torch.zeros(config.n_head, 3))
+        # Output projection
+        self.c_proj = Linear(self.n_embd, self.n_embd, bias=False)
+    def _rrpram_attention(self, x, vr, B, T, H, D):
+        """
+        RRPRAM pathway: position-indexed key scores, causally softmaxed.
+        x: (B, T, E) — input (after norm)
+        vr: (B, T, H, D) — RRPRAM values
+        score[b,h,t] = D**-0.5 * sum_e x[b,t,e] * Wr[h,e,t], where Wr is either
+        the full ``wr`` tensor or the product wr_a @ wr_b.
+        Returns (B, H, T, D).
+        Raises ValueError if T exceeds the positional table length.
+        The scores are linear in T, but the causal softmax below still
+        materializes a (B, H, T, T) attention tensor.
+        """
+        sc = (D ** -0.5)
+        table = self.wr_b if self.rrpram_rank > 0 else self.wr
+        max_positions = table.shape[-1]
+        if T > max_positions:
+            raise ValueError(
+                f"RRPRAM supports at most {max_positions} positions per call, got T={T}"
+            )
+        # Compute per-position scores: (B, H, T)
+        if self.rrpram_rank > 0:
+            wr_a = self.wr_a.to(x.dtype)                  # (H, E, R)
+            wr_b_slice = self.wr_b[:, :, :T].to(x.dtype)  # (H, R, T)
+            # The time axis must survive the first projection. Reducing over
+            # it would make every key score a function of all positions,
+            # future ones included, and would no longer equal x @ (wr_a @ wr_b).
+            projected = torch.einsum('bte,her->bhtr', x, wr_a)  # (B, H, T, R)
+            scores = torch.einsum('bhtr,hrt->bht', projected, wr_b_slice) * sc  # (B, H, T)
+        else:
+            # Full rank (backward compat)
+            wr_slice = self.wr[:, :, :T].to(x.dtype)  # (H, E, T)
+            scores = torch.einsum('bte,het->bht', x, wr_slice) * sc  # (B, H, T)
+        # Build causal attention from broadcast scores:
+        # attn[i, j] = score[j] for j <= i, -inf for j > i
+        causal_mask = torch.triu(
+            torch.ones(T, T, device=x.device, dtype=torch.bool), diagonal=1
+        )  # True where j > i
+        attn = scores.unsqueeze(2).expand(B, H, T, T)  # (B, H, T, T)
+        attn = attn.masked_fill(causal_mask.unsqueeze(0).unsqueeze(0), float('-inf'))
+        attn = F.softmax(attn.float(), dim=-1).to(x.dtype)
+        # Apply to values: (B, H, T, T) @ (B, H, T, D) -> (B, H, T, D)
+        vr_t = vr.transpose(1, 2)  # (B, T, H, D) -> (B, H, T, D)
+        return torch.matmul(attn, vr_t)  # (B, H, T, D)
+    def _janus_echo_attention(self, x, B, T, H, D):
+        """
+        Janus echo pathway: W^T * W self-resonance.
+        x: (B, T, E) — input (after norm)
+        echo = wj(x) = x @ wj.weight.T           — (B, T, E)
+        echo_back = echo @ wj.weight             — (B, T, E)
+        score[t] = dot(x[t], echo_back[t]) / sqrt(E), soft-capped at 15
+        attn[i,j] = softmax over j <= i of score[i] * score[j]
+        values = echo reshaped to (B, T, H, D)
+        Returns (B, H, T, D); the same attention map is applied to every head.
+        """
+        E = self.n_embd
+        echo = self.wj(x)  # (B, T, E)
+        # wj.weight is stored [out, in], so a plain matmul with it applies
+        # the transpose of the map used to produce echo.
+        echo_back = torch.matmul(echo, self.wj.weight.to(echo.dtype))  # (B, T, E)
+        # Self-resonance scores (capped to prevent bf16 overflow in outer product)
+        scores = (x * echo_back).sum(dim=-1) / (E ** 0.5)  # (B, T)
+        scores = 15.0 * torch.tanh(scores / 15.0)  # softcap like logits
+        # Build attention: attn[i,j] = score[i] * score[j] (with causal mask)
+        causal_mask = torch.triu(
+            torch.ones(T, T, device=x.device, dtype=torch.bool), diagonal=1
+        )
+        attn = scores.unsqueeze(-1) * scores.unsqueeze(-2)  # (B, T, T)
+        attn = attn.masked_fill(causal_mask.unsqueeze(0), float('-inf'))
+        attn = F.softmax(attn.float(), dim=-1).to(x.dtype)
+        # Values: echo reshaped to (B, T, H, D) -> (B, H, T, D)
+        jv = echo.view(B, T, H, D).transpose(1, 2)  # (B, H, T, D)
+        # Attention is (B, T, T), need (B, H, T, T) for per-head application
+        attn = attn.unsqueeze(1).expand(B, H, T, T)  # (B, H, T, T)
+        return torch.matmul(attn, jv)  # (B, H, T, D)
+    def forward(self, x, cos_sin, window_size, kv_cache):
+        """
+        Run the three pathways on x (B, T, E) and return their gated blend,
+        projected through c_proj, as (B, T, E).
+        cos_sin: (cos, sin) rotary tables for the positions of this call.
+        window_size: passed through to the flash-attention call.
+        kv_cache: None, or a cache object used by the QKV pathway; the last
+        layer advances it by T after attending.
+        """
+        B, T, C = x.size()
+        H = self.n_head
+        D = self.head_dim
+        # === Pathway 1: Standard QKV attention ===
+        q = self.c_q(x).view(B, T, H, D)
+        k = self.c_k(x).view(B, T, self.n_kv_head, D)
+        v = self.c_v(x).view(B, T, self.n_kv_head, D)
+        # RoPE
+        cos, sin = cos_sin
+        q, k = apply_rotary_emb(q, cos, sin), apply_rotary_emb(k, cos, sin)
+        # QK norm (from nanochat), followed by a fixed 1.2 scale on both
+        q, k = norm(q), norm(k)
+        q = q * 1.2
+        k = k * 1.2
+        # Flash Attention
+        if kv_cache is None:
+            qkv_out = flash_attn.flash_attn_func(q, k, v, causal=True, window_size=window_size)
+        else:
+            k_cache, v_cache = kv_cache.get_layer_cache(self.layer_idx)
+            qkv_out = flash_attn.flash_attn_with_kvcache(
+                q, k_cache, v_cache, k=k, v=v,
+                cache_seqlens=kv_cache.cache_seqlens,
+                causal=True, window_size=window_size,
+            )
+            # The cache position is shared by all layers, so only the last one advances it
+            if self.layer_idx == kv_cache.n_layers - 1:
+                kv_cache.advance(T)
+        # qkv_out: (B, T, H, D) -> (B, H, T, D)
+        qkv_out = qkv_out.transpose(1, 2)
+        # === Pathway 2: RRPRAM ===
+        vr = self.wvr(x).view(B, T, H, D)
+        rrpram_out = self._rrpram_attention(x, vr, B, T, H, D)  # (B, H, T, D)
+        # === Pathway 3: Janus echo ===
+        janus_out = self._janus_echo_attention(x, B, T, H, D)  # (B, H, T, D)
+        # === 3-way gate blending ===
+        # gate: [H, 3] -> softmax -> [H, 3]
+        g = F.softmax(self.gate.float(), dim=-1).to(x.dtype)  # (H, 3)
+        # g[:, 0] = QKV weight, g[:, 1] = RRPRAM weight, g[:, 2] = Janus weight
+        # Reshape for broadcasting: (1, H, 1, 1) per pathway
+        g0 = g[:, 0].view(1, H, 1, 1)
+        g1 = g[:, 1].view(1, H, 1, 1)
+        g2 = g[:, 2].view(1, H, 1, 1)
+        blended = g0 * qkv_out + g1 * rrpram_out + g2 * janus_out  # (B, H, T, D)
+        # (B, H, T, D) -> (B, T, H, D) -> (B, T, E)
+        y = blended.transpose(1, 2).contiguous().view(B, T, -1)
+        y = self.c_proj(y)
+        return y
+class SwiGLU_MLP(nn.Module):
+    """Gated feed-forward layer: out = w_down(SiLU(w_gate(x)) * w_up(x)).
+    All three projections are bias-free; the hidden width is config.mlp_hidden.
+    """
+    def __init__(self, config):
+        super().__init__()
+        model_dim, hidden_dim = config.n_embd, config.mlp_hidden
+        self.w_gate = Linear(model_dim, hidden_dim, bias=False)
+        self.w_up = Linear(model_dim, hidden_dim, bias=False)
+        self.w_down = Linear(hidden_dim, model_dim, bias=False)
+    def forward(self, x):
+        activated_gate = F.silu(self.w_gate(x))
+        gated_hidden = activated_gate * self.w_up(x)
+        return self.w_down(gated_hidden)
+class Block(nn.Module):
+    """One transformer layer: hybrid attention, then SwiGLU MLP.
+    Each sub-layer reads a normalized copy of the stream and adds its output
+    back onto the un-normalized stream.
+    """
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        self.attn = JanusHybridAttention(config, layer_idx)
+        self.mlp = SwiGLU_MLP(config)
+    def forward(self, x, cos_sin, window_size, kv_cache):
+        attn_delta = self.attn(norm(x), cos_sin, window_size, kv_cache)
+        x = x + attn_delta
+        mlp_delta = self.mlp(norm(x))
+        return x + mlp_delta
+class JanusGPT(nn.Module):
+    """
+    Janus 285M: nanochat GPT with 3-way hybrid attention and SwiGLU MLP.
+    Preserves all nanochat mechanisms:
+    - resid_lambdas, x0_lambdas (per-layer residual scaling)
+    - smear_gate, smear_lambda (bigram token mixing)
+    - backout_lambda (mid-layer subtraction)
+    - Softcap=15 on logits
+    - Non-parametric RMSNorm
+    Removed from nanochat:
+    - value_embeds / ve_gate (not used in Janus)
+    Args:
+        config: a JanusConfig (or any object with the same attributes).
+        pad_vocab_size_to: the embedding and lm_head tables are rounded up to
+            a multiple of this; forward() crops logits back to vocab_size.
+    Construction only allocates parameters; call init_weights() to set them.
+    """
+    def __init__(self, config, pad_vocab_size_to=64):
+        super().__init__()
+        self.config = config
+        self.window_sizes = self._compute_window_sizes(config)
+        padded_vocab_size = ((config.vocab_size + pad_vocab_size_to - 1) // pad_vocab_size_to) * pad_vocab_size_to
+        if padded_vocab_size != config.vocab_size:
+            print0(f"Padding vocab_size from {config.vocab_size} to {padded_vocab_size} for efficiency")
+        self.transformer = nn.ModuleDict({
+            "wte": nn.Embedding(padded_vocab_size, config.n_embd),
+            "h": nn.ModuleList([Block(config, layer_idx) for layer_idx in range(config.n_layer)]),
+        })
+        self.lm_head = Linear(config.n_embd, padded_vocab_size, bias=False)
+        # Per-layer learnable scalars (from nanochat)
+        self.resid_lambdas = nn.Parameter(torch.ones(config.n_layer))
+        self.x0_lambdas = nn.Parameter(torch.zeros(config.n_layer))
+        # Smear: mix previous token's embedding into current token.
+        # The gate reads only the first 24 embedding channels.
+        self.smear_gate = Linear(24, 1, bias=False)
+        self.smear_lambda = nn.Parameter(torch.zeros(1))
+        # Backout: subtract cached mid-layer residual
+        self.backout_lambda = nn.Parameter(0.2 * torch.ones(1))
+        # Rotary embeddings, precomputed for 10x the configured sequence length.
+        # Non-persistent buffers: they are not saved in the state dict.
+        self.rotary_seq_len = config.sequence_len * 10
+        head_dim = config.n_embd // config.n_head
+        cos, sin = self._precompute_rotary_embeddings(self.rotary_seq_len, head_dim)
+        self.register_buffer("cos", cos, persistent=False)
+        self.register_buffer("sin", sin, persistent=False)
+    @torch.no_grad()
+    def init_weights(self):
+        """Initialize all weights. Matches nanochat conventions + Janus-specific init.
+        Output projections (attn.c_proj, mlp.w_down) start at zero, so every
+        block initially contributes nothing to the residual stream.
+        """
+        # Embedding and unembedding
+        torch.nn.init.normal_(self.transformer.wte.weight, mean=0.0, std=0.8)
+        torch.nn.init.normal_(self.lm_head.weight, mean=0.0, std=0.001)
+        # Transformer blocks
+        n_embd = self.config.n_embd
+        s = 3**0.5 * n_embd**-0.5  # uniform(-s, s) has std n_embd**-0.5
+        for block in self.transformer.h:
+            # QKV projections
+            torch.nn.init.uniform_(block.attn.c_q.weight, -s, s)
+            torch.nn.init.uniform_(block.attn.c_k.weight, -s, s)
+            torch.nn.init.uniform_(block.attn.c_v.weight, -s, s)
+            torch.nn.init.zeros_(block.attn.c_proj.weight)
+            # RRPRAM: Wr init with small values (positional patterns need to learn from data)
+            if hasattr(block.attn, 'wr_a'):
+                torch.nn.init.normal_(block.attn.wr_a, mean=0.0, std=0.01)
+                torch.nn.init.normal_(block.attn.wr_b, mean=0.0, std=0.01)
+            else:
+                torch.nn.init.normal_(block.attn.wr, mean=0.0, std=0.01)
+            # RRPRAM value projection
+            torch.nn.init.uniform_(block.attn.wvr.weight, -s, s)
+            # Janus echo projection
+            torch.nn.init.uniform_(block.attn.wj.weight, -s, s)
+            # Gate: init biased toward QKV (standard attention gets most weight early on)
+            # [H, 3]: column 0 = QKV (larger), columns 1,2 = RRPRAM, Janus (smaller)
+            block.attn.gate.data[:, 0] = 1.0   # QKV dominant
+            block.attn.gate.data[:, 1] = -0.5   # RRPRAM starts lower
+            block.attn.gate.data[:, 2] = -0.5   # Janus starts lower
+            # After softmax: ~0.69 QKV, ~0.15 RRPRAM, ~0.15 Janus
+            # SwiGLU MLP
+            torch.nn.init.uniform_(block.mlp.w_gate.weight, -s * 0.4, s * 0.4)
+            torch.nn.init.uniform_(block.mlp.w_up.weight, -s * 0.4, s * 0.4)
+            torch.nn.init.zeros_(block.mlp.w_down.weight)
+        # Per-layer scalars (from nanochat): linear ramps from first to last layer
+        n_layer = self.config.n_layer
+        for i in range(n_layer):
+            self.resid_lambdas.data[i] = 1.15 - (0.10 * i / max(n_layer - 1, 1))
+        for i in range(n_layer):
+            self.x0_lambdas.data[i] = 0.20 - (0.15 * i / max(n_layer - 1, 1))
+        # Rotary embeddings: rebuilt here on the embedding table's current device
+        head_dim = self.config.n_embd // self.config.n_head
+        cos, sin = self._precompute_rotary_embeddings(self.rotary_seq_len, head_dim)
+        self.cos, self.sin = cos, sin
+        # Cast embeddings to COMPUTE_DTYPE
+        if COMPUTE_DTYPE != torch.float16:
+            self.transformer.wte.to(dtype=COMPUTE_DTYPE)
+    def _precompute_rotary_embeddings(self, seq_len, head_dim, base=100000, device=None):
+        """Return (cos, sin) rotary tables of shape (1, seq_len, 1, head_dim // 2).
+        Computed in float32 and cast to COMPUTE_DTYPE. ``device`` defaults to
+        the device of the token embedding table.
+        """
+        if device is None:
+            device = self.transformer.wte.weight.device
+        channel_range = torch.arange(0, head_dim, 2, dtype=torch.float32, device=device)
+        inv_freq = 1.0 / (base ** (channel_range / head_dim))
+        t = torch.arange(seq_len, dtype=torch.float32, device=device)
+        freqs = torch.outer(t, inv_freq)
+        cos, sin = freqs.cos(), freqs.sin()
+        cos, sin = cos.to(COMPUTE_DTYPE), sin.to(COMPUTE_DTYPE)
+        # Add batch and head axes so the tables broadcast against (B, T, H, D/2)
+        cos, sin = cos[None, :, None, :], sin[None, :, None, :]
+        return cos, sin
+    def _compute_window_sizes(self, config):
+        """Expand config.window_pattern into one (window, 0) tuple per layer.
+        The pattern is upper-cased and tiled across layers. "L" maps to
+        sequence_len; "S" maps to a quarter of it rounded up to a multiple
+        of 128. The final layer is always given the long window.
+        """
+        pattern = config.window_pattern.upper()
+        assert all(c in "SL" for c in pattern), f"Invalid window_pattern: {pattern}"
+        long_window = config.sequence_len
+        short_window = -(-long_window // 4 // 128) * 128  # ceil(long/4) to a multiple of 128
+        char_to_window = {"L": (long_window, 0), "S": (short_window, 0)}
+        window_sizes = []
+        for layer_idx in range(config.n_layer):
+            char = pattern[layer_idx % len(pattern)]
+            window_sizes.append(char_to_window[char])
+        window_sizes[-1] = (long_window, 0)
+        return window_sizes
+    def get_device(self):
+        """Return the device holding the token embedding table."""
+        return self.transformer.wte.weight.device
+    def estimate_flops(self):
+        """Estimated FLOPs per token (forward + backward).
+        Counts 6 FLOPs per parameter, excluding the embedding table and the
+        scalar/smear parameters, plus a per-layer attention term for each of
+        the three pathways. The RRPRAM and echo terms are rough allowances.
+        """
+        nparams = sum(p.numel() for p in self.parameters())
+        nparams_exclude = (self.transformer.wte.weight.numel() +
+                          self.resid_lambdas.numel() + self.x0_lambdas.numel() +
+                          self.smear_gate.weight.numel() + self.smear_lambda.numel() +
+                          self.backout_lambda.numel())
+        h, q, t = self.config.n_head, self.config.n_embd // self.config.n_head, self.config.sequence_len
+        attn_flops = 0
+        for window_size in self.window_sizes:
+            window = window_size[0]
+            effective_seq = t if window < 0 else min(window, t)
+            # QKV attention FLOPs
+            attn_flops += 12 * h * q * effective_seq
+            # RRPRAM FLOPs (rough allowance)
+            attn_flops += 4 * h * q * effective_seq
+            # Janus echo FLOPs (rough allowance)
+            attn_flops += 4 * h * q * effective_seq
+        num_flops_per_token = 6 * (nparams - nparams_exclude) + attn_flops
+        return num_flops_per_token
+    def num_scaling_params(self):
+        """Parameter counts for scaling law analysis.
+        Returns a dict with keys 'wte', 'lm_head', 'transformer_matrices',
+        'scalars' and 'total'; asserts that the parts add up to the model total.
+        """
+        wte = sum(p.numel() for p in self.transformer.wte.parameters())
+        lm_head = sum(p.numel() for p in self.lm_head.parameters())
+        transformer_matrices = sum(p.numel() for p in self.transformer.h.parameters())
+        scalars = (self.resid_lambdas.numel() + self.x0_lambdas.numel() +
+                   self.smear_gate.weight.numel() + self.smear_lambda.numel() +
+                   self.backout_lambda.numel())
+        total = wte + lm_head + transformer_matrices + scalars
+        assert total == sum(p.numel() for p in self.parameters()), "Parameter count mismatch"
+        return {
+            'wte': wte,
+            'lm_head': lm_head,
+            'transformer_matrices': transformer_matrices,
+            'scalars': scalars,
+            'total': total,
+        }
+    def setup_optimizer(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02,
+                        weight_decay=0.0, scalar_lr=0.5,
+                        rrpram_lr_scale=0.5, janus_lr_scale=0.5, gate_lr=0.1):
+        """
+        Setup MuonAdamW optimizer with Janus-specific parameter groups.
+        Extra groups vs nanochat:
+        - wr / wr_a / wr_b (RRPRAM): AdamW at embedding LR * rrpram_lr_scale (3D tensors, not suitable for Muon)
+        - wj (Janus echo): its own Muon group at matrix_lr * janus_lr_scale
+        - gate: AdamW at gate_lr with no weight decay (learned blending, keep stable)
+        All other 2D matrices (QKV, c_proj, wvr, MLP) use Muon at matrix_lr,
+        one group per distinct shape.
+        NOTE: janus_lr_scale used to be accepted but ignored, so wj trained at
+        matrix_lr inside the shared Muon groups. Pass janus_lr_scale=1.0 to
+        keep that learning rate. Because wj now has a group of its own,
+        optimizer state saved with the old grouping will not line up.
+        Returns the optimizer; every group also records its starting LR under
+        "initial_lr".
+        """
+        model_dim = self.config.n_embd
+        ddp, rank, local_rank, world_size = get_dist_info()
+        # Collect all parameters by role
+        # Standard matrix params (QKV, projections, MLP): go to Muon
+        standard_matrix_params = []
+        # Janus-specific params: separate groups
+        wr_params = []      # RRPRAM positional patterns (3D, AdamW)
+        wj_params = []      # Janus echo projection (2D, Muon with separate LR)
+        wvr_params = []     # RRPRAM value projection (2D, Muon)
+        gate_params = []    # Per-head 3-way gate (2D small, AdamW)
+        for block in self.transformer.h:
+            # Standard attention matrices -> Muon
+            standard_matrix_params.extend([
+                block.attn.c_q.weight,
+                block.attn.c_k.weight,
+                block.attn.c_v.weight,
+                block.attn.c_proj.weight,
+            ])
+            # SwiGLU MLP -> Muon
+            standard_matrix_params.extend([
+                block.mlp.w_gate.weight,
+                block.mlp.w_up.weight,
+                block.mlp.w_down.weight,
+            ])
+            # RRPRAM Wr -> AdamW (3D tensor, Muon needs 2D)
+            if hasattr(block.attn, 'wr_a'):
+                wr_params.append(block.attn.wr_a)
+                wr_params.append(block.attn.wr_b)
+            else:
+                wr_params.append(block.attn.wr)
+            # RRPRAM Wvr -> Muon (standard 2D matrix)
+            wvr_params.append(block.attn.wvr.weight)
+            # Janus echo Wj -> Muon with separate LR
+            wj_params.append(block.attn.wj.weight)
+            # Gate -> AdamW (small 2D [H, 3])
+            gate_params.append(block.attn.gate)
+        embedding_params = list(self.transformer.wte.parameters())
+        lm_head_params = list(self.lm_head.parameters())
+        resid_params = [self.resid_lambdas]
+        x0_params = [self.x0_lambdas]
+        smear_params = [self.smear_gate.weight, self.smear_lambda, self.backout_lambda]
+        # Verify all params are accounted for (compares counts only)
+        all_params_list = (standard_matrix_params + wr_params + wj_params + wvr_params +
+                          gate_params + embedding_params + lm_head_params +
+                          resid_params + x0_params + smear_params)
+        model_params = list(self.parameters())
+        assert len(model_params) == len(all_params_list), \
+            f"Parameter count mismatch: model has {len(model_params)}, grouped {len(all_params_list)}"
+        # Scale LR proportional to 1/sqrt(dmodel/768)
+        dmodel_lr_scale = (model_dim / 768) ** -0.5
+        print0(f"Scaling AdamW LR by 1/sqrt({model_dim}/768) = {dmodel_lr_scale:.6f}")
+        param_groups = [
+            # AdamW groups
+            dict(kind='adamw', params=lm_head_params,
+                 lr=unembedding_lr * dmodel_lr_scale, betas=(0.8, 0.96), eps=1e-10, weight_decay=0.01),
+            dict(kind='adamw', params=embedding_params,
+                 lr=embedding_lr * dmodel_lr_scale, betas=(0.8, 0.995), eps=1e-10, weight_decay=0.001),
+            dict(kind='adamw', params=resid_params,
+                 lr=scalar_lr * 0.01, betas=(0.8, 0.95), eps=1e-10, weight_decay=0.05),
+            dict(kind='adamw', params=x0_params,
+                 lr=scalar_lr, betas=(0.96, 0.95), eps=1e-10, weight_decay=0.0),
+            dict(kind='adamw', params=smear_params,
+                 lr=0.2, betas=(0.8, 0.95), eps=1e-10, weight_decay=0.0),
+            # Janus-specific AdamW groups
+            dict(kind='adamw', params=wr_params,
+                 lr=embedding_lr * dmodel_lr_scale * rrpram_lr_scale,
+                 betas=(0.9, 0.999), eps=1e-10, weight_decay=0.01),
+            dict(kind='adamw', params=gate_params,
+                 lr=gate_lr, betas=(0.9, 0.99), eps=1e-10, weight_decay=0.0),
+        ]
+        # Muon groups: group by shape for stacking. wj is kept apart from the
+        # other matrices so that janus_lr_scale actually reaches it.
+        muon_sets = [
+            (standard_matrix_params + wvr_params, matrix_lr),
+            (wj_params, matrix_lr * janus_lr_scale),
+        ]
+        for muon_params, muon_lr in muon_sets:
+            for shape in sorted({p.shape for p in muon_params}):
+                group_params = [p for p in muon_params if p.shape == shape]
+                param_groups.append(dict(
+                    kind='muon', params=group_params, lr=muon_lr,
+                    momentum=0.95, ns_steps=5, beta2=0.9, weight_decay=weight_decay,
+                ))
+        Factory = DistMuonAdamW if ddp else MuonAdamW
+        optimizer = Factory(param_groups)
+        for group in optimizer.param_groups:
+            group["initial_lr"] = group["lr"]
+        return optimizer
+    def forward(self, idx, targets=None, kv_cache=None, loss_reduction='mean'):
+        """
+        Run the model on token ids idx of shape (B, T).
+        Returns the cross-entropy loss when targets is given (entries equal to
+        -1 are ignored), otherwise float32 logits of shape (B, T, vocab_size).
+        Without a kv_cache, T must be greater than 1. With a kv_cache, rotary
+        positions start at kv_cache.get_pos() and the cache's prev_embedding
+        carries the last token embedding across calls for the smear step.
+        """
+        B, T = idx.size()
+        # Rotary embeddings
+        assert T <= self.cos.size(1)
+        assert idx.device == self.cos.device
+        T0 = 0 if kv_cache is None else kv_cache.get_pos()
+        cos_sin = self.cos[:, T0:T0+T], self.sin[:, T0:T0+T]
+        # Embed
+        x = self.transformer.wte(idx)
+        x = x.to(COMPUTE_DTYPE)
+        x = norm(x)
+        # Smear (from nanochat): add a gated copy of the previous token's embedding
+        if kv_cache is None:
+            assert T > 1
+            gate = self.smear_lambda.to(x.dtype) * torch.sigmoid(self.smear_gate(x[:, 1:, :24]))
+            x = torch.cat([x[:, :1], x[:, 1:] + gate * x[:, :-1]], dim=1)
+        else:
+            # Remember this call's last (pre-smear) embedding for the next call
+            x_pre_smear = kv_cache.prev_embedding
+            kv_cache.prev_embedding = x[:, -1:, :]
+            if T > 1:
+                gate = self.smear_lambda.to(x.dtype) * torch.sigmoid(self.smear_gate(x[:, 1:, :24]))
+                x = torch.cat([x[:, :1], x[:, 1:] + gate * x[:, :-1]], dim=1)
+            elif x_pre_smear is not None:
+                gate = self.smear_lambda.to(x.dtype) * torch.sigmoid(self.smear_gate(x[:, :, :24]))
+                x = x + gate * x_pre_smear
+        # Forward the transformer
+        x0 = x
+        n_layer = self.config.n_layer
+        backout_layer = n_layer // 2
+        x_backout = None
+        for i, block in enumerate(self.transformer.h):
+            # Rescale the stream and mix the initial embedding back in before each block
+            x = self.resid_lambdas[i] * x + self.x0_lambdas[i] * x0
+            x = block(x, cos_sin, self.window_sizes[i], kv_cache)
+            if i == backout_layer:
+                x_backout = x
+        # Backout subtraction
+        if x_backout is not None:
+            x = x - self.backout_lambda.to(x.dtype) * x_backout
+        x = norm(x)
+        # Logits with softcap; the padded vocabulary columns are dropped first
+        softcap = 15
+        logits = self.lm_head(x)
+        logits = logits[..., :self.config.vocab_size]
+        logits = logits.float()
+        logits = softcap * torch.tanh(logits / softcap)
+        if targets is not None:
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1),
+                                   ignore_index=-1, reduction=loss_reduction)
+            return loss
+        else:
+            return logits
+    @torch.inference_mode()
+    def generate(self, tokens, max_tokens, temperature=1.0, top_k=None, seed=42):
+        """
+        Naive autoregressive sampling for a single sequence; yields one token id per step.
+        tokens: prompt as a list of token ids.
+        temperature: 0 selects the argmax; otherwise sampling uses a generator seeded with seed.
+        top_k: if set and positive, only the top_k highest logits stay eligible.
+        No KV cache is used: each step re-runs forward() on the whole sequence
+        so far. forward() asserts T > 1 on that path, so the prompt needs at
+        least two tokens, and the sequence is never cropped as it grows.
+        """
+        assert isinstance(tokens, list)
+        device = self.get_device()
+        rng = None
+        if temperature > 0:
+            rng = torch.Generator(device=device)
+            rng.manual_seed(seed)
+        ids = torch.tensor([tokens], dtype=torch.long, device=device)
+        for _ in range(max_tokens):
+            logits = self.forward(ids)
+            logits = logits[:, -1, :]
+            if top_k is not None and top_k > 0:
+                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                logits[logits < v[:, [-1]]] = -float('Inf')
+            if temperature > 0:
+                logits = logits / temperature
+                probs = F.softmax(logits, dim=-1)
+                next_ids = torch.multinomial(probs, num_samples=1, generator=rng)
+            else:
+                next_ids = torch.argmax(logits, dim=-1, keepdim=True)
+            ids = torch.cat((ids, next_ids), dim=1)
+            token = next_ids.item()
+            yield token