Cactus-Compute
/

gemma4-e2b-grouped-k96

+"""
+Gemma 4 E2B — clean PyTorch forward pass (text model only).
+Architecture:
+  - 35 decoder layers, hidden_size=1536, vocab=262144
+  - 8 Q heads, 1 KV head (MQA)
+  - Sliding attention layers (0-3, 5-8, 10-13, 15-18, 20-23, 25-28, 30-33):
+      head_dim=256, sliding_window=512, rope_theta=10000
+  - Full attention layers (every 5th: 4,9,14,19,24,29,34):
+      head_dim=512, partial_rotary_factor=0.25 (only first 128 of 512 dims rotated),
+      rope_theta=1000000
+  - MLP: GeGLU; intermediate_size=6144 for layers 0-14, 12288 (double-wide) for layers 15-34
+  - Per-layer auxiliary stream (full details below)
+  - layer_scalar: per-layer learned scalar multiplied onto residual contributions
+  - QK RMSNorm before RoPE, attn_scale=1.0
+  - Final: RMSNorm + tied lm_head + logit softcapping at 30.0
+Per-layer auxiliary stream:
+  Model-level (computed once, before all layers):
+    1. embed_tokens_per_layer(input_ids)          → [B, T, 35*256]  (vocab lookup)
+       scaled by sqrt(256) = 16
+    2. per_layer_model_projection(x_embed)         → [B, T, 35*256]  (project hidden→aux)
+       scaled by hidden_size**-0.5
+    3. per_layer_projection_norm (RMSNorm(256)) on the projection slice per layer
+    4. Combine: per_layer_inputs = (embed_aux + proj_aux) * (1/sqrt(2))
+       reshaped to [B, T, 35, 256]
+  Per-layer (at layer i):
+    per_layer_input_i = per_layer_inputs[:, :, i, :]      # [B, T, 256]
+    x_normed = input_layernorm(x)
+    gate  = gelu_tanh(per_layer_input_gate(x_normed))     # [B, T, 256]
+    gated = gate * per_layer_input_i                      # [B, T, 256]
+    out   = per_layer_projection(gated)                   # [B, T, 1536]  (256→1536)
+    x     = x + layer_scalar * post_per_layer_input_norm(out)
+  Weight shapes in checkpoint:
+    per_layer_model_projection.weight : [8960, 1536]   (Linear 1536→8960)
+    per_layer_projection_norm.weight  : [256]           (RMSNorm on 256-dim slices)
+    layers.i.per_layer_input_gate.weight  : [256, 1536] (Linear 1536→256)
+    layers.i.per_layer_projection.weight  : [1536, 256] (Linear 256→1536)
+    layers.i.post_per_layer_input_norm.weight : [1536]  (RMSNorm on 1536-dim output)
+"""
+import math
+import os
+from pathlib import Path
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from safetensors import safe_open
+from transformers import AutoTokenizer
+# ── device / dtype ────────────────────────────────────────────────────────────
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DTYPE  = torch.bfloat16
+# ── model path ────────────────────────────────────────────────────────────────
+# Try known HF repo caches in order; first one that exists wins. Override with
+# $GEMMA4_HF_REPO to point at an arbitrary repo cache (e.g., "google/gemma-4-e2b-it").
+_HUB_ROOT = Path(os.path.expanduser("~/.cache/huggingface/hub"))
+_REPO_CANDIDATES = (
+    os.environ.get("GEMMA4_HF_REPO", ""),
+    "gg-hf-gg/gemma-4-E2B",
+    "google/gemma-4-e2b-it",
+)
+def _resolve_model_paths():
+    """Return (snapshot_dir, safetensors_path). Picks first available repo+snapshot
+    that actually contains a .safetensors file. Iterates ALL snapshots per repo
+    before moving to the next repo — iterdir() order is not deterministic and HF
+    may keep multiple snapshots where only one has weights blob-resolved.
+    """
+    for repo in _REPO_CANDIDATES:
+        if not repo:
+            continue
+        repo_cache = _HUB_ROOT / ("models--" + repo.replace("/", "--"))
+        snap_root = repo_cache / "snapshots"
+        if not snap_root.is_dir():
+            continue
+        for snap in sorted(p for p in snap_root.iterdir() if p.is_dir()):
+            # Prefer model.safetensors (single-file) else any .safetensors
+            sft = snap / "model.safetensors"
+            if not sft.exists():
+                candidates = sorted(snap.glob("*.safetensors"))
+                if not candidates:
+                    continue
+                sft = candidates[0]
+            return snap, sft
+    raise FileNotFoundError(
+        "No Gemma-4 E2B HF cache found. Tried: " + ", ".join(r for r in _REPO_CANDIDATES if r)
+        + ". Run `hf download google/gemma-4-e2b-it` or set GEMMA4_HF_REPO."
+    )
# Resolved once, when the module is imported; the resolver raises
# FileNotFoundError if none of the candidate repos is cached with weights.
(MODEL_DIR, SAFETENSORS_BLOB) = _resolve_model_paths()
# ── architecture constants ────────────────────────────────────────────────────
# Overall model size.
N_LAYERS    = 35
HIDDEN_SIZE = 1536
VOCAB_SIZE  = 262144

# Attention geometry: 8 query heads sharing a single KV head.
N_Q_HEADS      = 8
N_KV_HEADS     = 1
HEAD_DIM_SLIDE = 256   # head dim of sliding-window layers
HEAD_DIM_FULL  = 512   # head dim of full-attention layers
SLIDING_WINDOW = 512
ATTN_SCALE     = 1.0   # QK are RMSNorm'd, so no sqrt(d) scaling needed

# Rotary embedding settings per layer type.
ROPE_THETA_SLIDE = 1e4
ROPE_THETA_FULL  = 1e6
PARTIAL_ROT_FULL = 0.25  # only floor(512 * 0.25) = 128 dims of a full-attention head get RoPE

# MLP widths: layers below DOUBLE_WIDE_START use INTERMEDIATE, the rest
# (15-34) use the double-wide size.
DOUBLE_WIDE_START = 15
INTERMEDIATE      = 6144
INTERMEDIATE_WIDE = 2 * INTERMEDIATE  # 12288

# Per-layer auxiliary stream.
PER_LAYER_DIM         = 256                  # width of the stream for one layer
PER_LAYER_PROJ_SCALE  = HIDDEN_SIZE ** -0.5  # applied to per_layer_model_projection output
PER_LAYER_INPUT_SCALE = math.sqrt(0.5)       # 1/sqrt(2), mixes embed aux + model projection

# Normalisation and output head.
RMS_EPS   = 1e-6
LOGIT_CAP = 30.0

# Every 5th layer attends globally (0-indexed: 4, 9, 14, 19, 24, 29, 34).
FULL_ATTN_LAYERS = frozenset(i for i in range(N_LAYERS) if i % 5 == 4)
def is_full_attention(layer_idx: int) -> bool:
    """Tell whether a decoder layer attends globally instead of within a window.

    The answer is a membership test against FULL_ATTN_LAYERS, so any index
    outside that set (including out-of-range values) reports False.
    """
    uses_global_attention = layer_idx in FULL_ATTN_LAYERS
    return uses_global_attention
# ── RMSNorm ───────────────────────────────────────────────────────────────────
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learned gain.

    The gain multiplies the normalised values directly (``weight * normed``)
    and starts out as all ones.
    """

    def __init__(self, dim: int):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Statistics and scaling run in float32; only the final result is cast
        # back to the caller's dtype.
        hidden = x.float()
        mean_square = hidden.pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + RMS_EPS)
        scaled = hidden * inv_rms * self.weight.float()
        return scaled.to(x.dtype)
+# ── RoPE ─────────────────────────────────────────────────────────────────────
+def build_rope_freqs(
+    head_dim: int,
+    max_seq: int,
+    theta: float,
+    device: torch.device,
+    n_rot_pairs: int | None = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Build cos/sin tables of shape [max_seq, head_dim].
+    For full-attention layers with partial rotation, only the first
+    n_rot_pairs*2 positions carry actual frequencies; the rest are zeros
+    (NoPE — no positional encoding for those dims).
+    Args:
+        head_dim:    total head dimension
+        max_seq:     maximum sequence length to precompute
+        theta:       RoPE base frequency
+        device:      target device
+        n_rot_pairs: if set, only compute real freqs for this many pairs;
+                     remaining dims get freq=0 (cos=1, sin=0 → identity).
+    """
+    half = head_dim // 2
+    if n_rot_pairs is None:
+        n_rot_pairs = half
+    # Build frequencies only for the pairs that actually rotate
+    inv_freq = 1.0 / (theta ** (
+        torch.arange(0, n_rot_pairs, device=device).float() / half
+    ))  # shape [n_rot_pairs]
+    # Pad with zeros for the remaining pairs (NoPE: cos=1, sin=0)
+    if n_rot_pairs < half:
+        inv_freq = torch.cat([
+            inv_freq,
+            torch.zeros(half - n_rot_pairs, device=device),
+        ])  # [half]
+    t = torch.arange(max_seq, device=device).float()
+    freqs = torch.outer(t, inv_freq)          # [T, half]
+    freqs = torch.cat([freqs, freqs], dim=-1) # [T, head_dim]
+    return freqs.cos(), freqs.sin()
def apply_rope(
    x: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
) -> torch.Tensor:
    """
    Rotate ``x`` with precomputed rotary tables ("split halves" layout).

    The last axis is split into a lower and an upper half; the result is
    ``x * cos + concat(-upper, lower) * sin``. Only the first T rows of the
    tables are used, where T is the size of axis 2 of ``x``, and the tables
    are cast to ``x.dtype`` before the multiply.

    Args:
        x:   [B, H, T, head_dim]
        cos: [T_max, head_dim] with T_max >= T
        sin: [T_max, head_dim]
    """
    seq_len = x.shape[2]
    half = x.shape[-1] // 2
    lower = x[..., :half]
    upper = x[..., half:]
    rotated = torch.cat([-upper, lower], dim=-1)
    # [None, None] prepends the batch and head axes: [T, D] -> [1, 1, T, D].
    cos_t = cos[:seq_len][None, None].to(x.dtype)
    sin_t = sin[:seq_len][None, None].to(x.dtype)
    return x * cos_t + rotated * sin_t
# ── Attention ─────────────────────────────────────────────────────────────────
class Attention(nn.Module):
    """
    Multi-query self-attention: N_Q_HEADS query heads share N_KV_HEADS KV head.

    The layer index decides the flavour:
      - full-attention layers use HEAD_DIM_FULL and plain causal masking;
      - all other layers use HEAD_DIM_SLIDE and a causal mask restricted to
        the last SLIDING_WINDOW positions.
    Queries and keys are RMS-normalised per head before RoPE is applied.
    """

    def __init__(self, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        self.full_attn = is_full_attention(layer_idx)
        self.head_dim = HEAD_DIM_FULL if self.full_attn else HEAD_DIM_SLIDE

        q_width = N_Q_HEADS * self.head_dim
        kv_width = N_KV_HEADS * self.head_dim
        self.q_proj = nn.Linear(HIDDEN_SIZE, q_width, bias=False)
        self.k_proj = nn.Linear(HIDDEN_SIZE, kv_width, bias=False)
        self.v_proj = nn.Linear(HIDDEN_SIZE, kv_width, bias=False)
        self.o_proj = nn.Linear(q_width, HIDDEN_SIZE, bias=False)
        self.q_norm = RMSNorm(self.head_dim)
        self.k_norm = RMSNorm(self.head_dim)

    def forward(
        self,
        x: torch.Tensor,    # [B, T, D]
        cos: torch.Tensor,  # [T, head_dim]
        sin: torch.Tensor,
    ) -> torch.Tensor:
        bsz, seq_len, _ = x.shape
        hd = self.head_dim

        def split_heads(t: torch.Tensor, n_heads: int) -> torch.Tensor:
            # [B, T, n_heads * hd] -> [B, n_heads, T, hd]
            return t.view(bsz, seq_len, n_heads, hd).transpose(1, 2)

        q = split_heads(self.q_proj(x), N_Q_HEADS)
        k = split_heads(self.k_proj(x), N_KV_HEADS)
        v = split_heads(self.v_proj(x), N_KV_HEADS)

        # Per-head QK normalisation first, rotary embedding second.
        q = apply_rope(self.q_norm(q), cos, sin)
        k = apply_rope(self.k_norm(k), cos, sin)

        # Broadcast the KV head across all query heads (MQA).
        k = k.expand(bsz, N_Q_HEADS, seq_len, hd)
        v = v.expand(bsz, N_Q_HEADS, seq_len, hd)

        sdpa_kwargs: dict = {"scale": ATTN_SCALE}
        if self.full_attn:
            # Plain causal attention over the whole prefix.
            sdpa_kwargs["is_causal"] = True
        else:
            # lag[i, j] = i - j for query position i and key position j.
            # A key is visible iff it is not in the future (lag >= 0) and
            # lies inside the window (lag < SLIDING_WINDOW).
            pos = torch.arange(seq_len, device=x.device)
            lag = pos[:, None] - pos[None, :]
            sdpa_kwargs["attn_mask"] = (lag >= 0) & (lag < SLIDING_WINDOW)  # [T_q, T_k]

        out = F.scaled_dot_product_attention(q, k, v, **sdpa_kwargs)
        merged = out.transpose(1, 2).contiguous().view(bsz, seq_len, N_Q_HEADS * hd)
        return self.o_proj(merged)
# ── MLP (GeGLU) ───────────────────────────────────────────────────────────────
class MLP(nn.Module):
    """
    Gated feed-forward block: down(gelu_tanh(gate(x)) * up(x)).

    The hidden width depends on the layer index: INTERMEDIATE for layers below
    DOUBLE_WIDE_START, INTERMEDIATE_WIDE from DOUBLE_WIDE_START onwards.
    """

    def __init__(self, layer_idx: int):
        super().__init__()
        width = INTERMEDIATE if layer_idx < DOUBLE_WIDE_START else INTERMEDIATE_WIDE
        self.gate_proj = nn.Linear(HIDDEN_SIZE, width, bias=False)
        self.up_proj   = nn.Linear(HIDDEN_SIZE, width, bias=False)
        self.down_proj = nn.Linear(width, HIDDEN_SIZE, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        activated = F.gelu(self.gate_proj(x), approximate="tanh")
        gated = activated * self.up_proj(x)
        return self.down_proj(gated)
# ── Decoder layer ─────────────────────────────────────────────────────────────
class Gemma4TextLayer(nn.Module):
    """
    One Gemma 4 decoder layer.

    A forward call adds three contributions to the residual stream, in this
    order, each multiplied by the learned ``layer_scalar``:

      1. Per-layer auxiliary injection, driven by per_layer_input [B,T,256]:
           gate = gelu_tanh(per_layer_input_gate(input_layernorm(x)))  # [B,T,256]
           out  = per_layer_projection(gate * per_layer_input)         # [B,T,1536]
           x    = x + layer_scalar * post_per_layer_input_norm(out)
      2. Self-attention:
           x = x + layer_scalar * post_attention_layernorm(
                   self_attn(input_layernorm(x)))
      3. MLP:
           x = x + layer_scalar * post_feedforward_layernorm(
                   mlp(pre_feedforward_layernorm(x)))

    input_layernorm is therefore applied twice: once to the incoming stream
    for the gate, and again after the injection to feed attention.
    """

    def __init__(self, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx

        # Sub-blocks; both pick their own sizes from the layer index.
        self.self_attn = Attention(layer_idx)
        self.mlp = MLP(layer_idx)

        # Norms around attention and MLP, plus one on the auxiliary output.
        self.input_layernorm            = RMSNorm(HIDDEN_SIZE)
        self.post_attention_layernorm   = RMSNorm(HIDDEN_SIZE)
        self.pre_feedforward_layernorm  = RMSNorm(HIDDEN_SIZE)
        self.post_feedforward_layernorm = RMSNorm(HIDDEN_SIZE)
        self.post_per_layer_input_norm  = RMSNorm(HIDDEN_SIZE)

        # Auxiliary-stream projections:
        #   per_layer_input_gate: Linear(1536→256), weight=[256, 1536]
        #   per_layer_projection: Linear(256→1536), weight=[1536, 256]
        self.per_layer_input_gate = nn.Linear(HIDDEN_SIZE, PER_LAYER_DIM, bias=False)
        self.per_layer_projection = nn.Linear(PER_LAYER_DIM, HIDDEN_SIZE, bias=False)

        # One learned scalar that scales every residual contribution of this layer.
        self.layer_scalar = nn.Parameter(torch.ones(1))

    def forward(
        self,
        x: torch.Tensor,               # [B, T, D]
        cos: torch.Tensor,             # RoPE tables for this layer type
        sin: torch.Tensor,
        per_layer_input: torch.Tensor, # [B, T, 256] combined embed+projection for this layer
    ) -> torch.Tensor:
        scale = self.layer_scalar.to(x.dtype)

        # ── 1. Per-layer auxiliary stream injection ──────────────────────────
        # The gate uses the model's hidden activation (gelu_pytorch_tanh); the
        # original author notes this follows the Gemma3n reference.
        gate_in = self.input_layernorm(x)
        gate = F.gelu(self.per_layer_input_gate(gate_in), approximate="tanh")  # [B,T,256]
        aux = self.per_layer_projection(gate * per_layer_input)               # [B,T,1536]
        x = x + scale * self.post_per_layer_input_norm(aux)

        # ── 2. Self-attention ────────────────────────────────────────────────
        # input_layernorm is re-applied to the stream that now includes the injection.
        attn_out = self.self_attn(self.input_layernorm(x), cos, sin)
        x = x + scale * self.post_attention_layernorm(attn_out)

        # ── 3. MLP ───────────────────────────────────────────────────────────
        mlp_out = self.mlp(self.pre_feedforward_layernorm(x))
        return x + scale * self.post_feedforward_layernorm(mlp_out)
# ── Full model ─────────────────────────────────────────────────────────────────
class Gemma4ForCausalLM(nn.Module):
    """
    Gemma 4 E2B text model with a causal-LM head (no vision/audio).

    The output projection reuses ``embed_tokens.weight`` (tied embeddings) and
    the logits are soft-capped: ``LOGIT_CAP * tanh(logits / LOGIT_CAP)``.

    Per-layer auxiliary inputs are computed once per forward pass, before the
    decoder layers run:
      - embed_tokens_per_layer lookup, scaled by sqrt(PER_LAYER_DIM): [B,T,35*256]
      - per_layer_model_projection, Linear(1536→35*256), applied to the scaled
        token embeddings and multiplied by PER_LAYER_PROJ_SCALE
      - per_layer_projection_norm, RMSNorm(256), on each layer slice of the projection
      - combine: per_layer_inputs = (embed_aux + proj_normed) * PER_LAYER_INPUT_SCALE
    """

    def __init__(self):
        super().__init__()
        # Token embeddings
        self.embed_tokens           = nn.Embedding(VOCAB_SIZE, HIDDEN_SIZE)
        self.embed_tokens_per_layer = nn.Embedding(VOCAB_SIZE, N_LAYERS * PER_LAYER_DIM)
        # Final norm
        self.norm = RMSNorm(HIDDEN_SIZE)
        # Transformer layers
        self.layers = nn.ModuleList([Gemma4TextLayer(i) for i in range(N_LAYERS)])
        # Model-level per-layer projection (hidden → all layer aux dims at once)
        # weight shape: [35*256, 1536] = [8960, 1536]
        self.per_layer_model_projection = nn.Linear(
            HIDDEN_SIZE, N_LAYERS * PER_LAYER_DIM, bias=False
        )
        # Norm applied to per-layer projection slices [256]
        self.per_layer_projection_norm = RMSNorm(PER_LAYER_DIM)
        # RoPE tables: plain attributes (not buffers), built lazily by
        # _ensure_rope on the device of the first input and rebuilt whenever a
        # longer sequence or a different device shows up.
        self._rope_slide_cos: torch.Tensor | None = None
        self._rope_slide_sin: torch.Tensor | None = None
        self._rope_full_cos:  torch.Tensor | None = None
        self._rope_full_sin:  torch.Tensor | None = None
        self._rope_seq:       int = 0

    @staticmethod
    def is_full_attention(layer_idx: int) -> bool:
        """Class-level alias of the module-level is_full_attention()."""
        return is_full_attention(layer_idx)

    def _ensure_rope(self, seq_len: int, device: torch.device) -> None:
        """Make sure RoPE tables covering ``seq_len`` positions exist on ``device``.

        Tables are rebuilt when none exist yet, when the cached ones are too
        short, or when they live on a different device than requested. The
        tables are not registered buffers, so they do not follow
        ``model.to(...)`` and would otherwise be reused on the wrong device.
        """
        cached = self._rope_slide_cos
        if (
            cached is not None
            and self._rope_seq >= seq_len
            and cached.device == device
        ):
            return
        # Precompute at least 2048 positions so short prompts share one table.
        max_seq = max(seq_len, 2048)
        # Sliding layers: head_dim=256, full rotation
        cs, sn = build_rope_freqs(HEAD_DIM_SLIDE, max_seq, ROPE_THETA_SLIDE, device)
        self._rope_slide_cos = cs
        self._rope_slide_sin = sn
        # Full-attention layers: head_dim=512, partial_rotary_factor=0.25.
        # 512 * 0.25 = 128 dims rotated = 64 rotation pairs (half=256, 64 of 256 pairs).
        n_rot = int(HEAD_DIM_FULL * PARTIAL_ROT_FULL) // 2  # = 64
        cf, sf = build_rope_freqs(
            HEAD_DIM_FULL, max_seq, ROPE_THETA_FULL, device, n_rot_pairs=n_rot
        )
        self._rope_full_cos = cf
        self._rope_full_sin = sf
        self._rope_seq = max_seq

    def _compute_per_layer_inputs(
        self, input_ids: torch.Tensor, x_embed: torch.Tensor
    ) -> torch.Tensor:
        """
        Precompute the per-layer auxiliary inputs for all layers.

        Args:
            input_ids: [B, T] token ids
            x_embed:   [B, T, HIDDEN_SIZE] scaled token embeddings
        Returns:
            per_layer_inputs: [B, T, N_LAYERS, PER_LAYER_DIM]
        """
        B, T = input_ids.shape
        # 1. Token-based per-layer embeddings (vocabulary lookup)
        # Scaled by sqrt(PER_LAYER_DIM)=16, matching Gemma3n's ScaledWordEmbedding convention
        embed_aux = self.embed_tokens_per_layer(input_ids).to(x_embed.dtype)
        embed_aux = embed_aux * math.sqrt(PER_LAYER_DIM)           # scale by sqrt(256)=16
        # embed_aux: [B, T, 35*256]  reshape → [B, T, 35, 256]
        embed_aux = embed_aux.view(B, T, N_LAYERS, PER_LAYER_DIM)
        # 2. Hidden-state projection: project x_embed to [B, T, 35*256]
        proj_all  = self.per_layer_model_projection(x_embed)  # [B, T, 35*256]
        proj_all  = proj_all * PER_LAYER_PROJ_SCALE            # scale by 1/sqrt(hidden)
        proj_all  = proj_all.view(B, T, N_LAYERS, PER_LAYER_DIM)
        # Apply RMSNorm(256) to each layer slice
        proj_all  = self.per_layer_projection_norm(proj_all)   # broadcast over [B,T,N]
        # 3. Combine: (embed_aux + proj_normed) * (1/sqrt(2))
        per_layer_inputs = (embed_aux + proj_all) * PER_LAYER_INPUT_SCALE
        return per_layer_inputs  # [B, T, 35, 256]

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """
        Run the full text model on a batch of token ids.

        Positions always start at 0 (no KV cache / position offset).

        Args:
            input_ids: [B, T] long tensor
        Returns:
            logits: [B, T, vocab_size] with softcapping applied
        """
        _, T = input_ids.shape
        self._ensure_rope(T, input_ids.device)
        # Token embeddings scaled by sqrt(hidden_size)
        x = self.embed_tokens(input_ids) * math.sqrt(HIDDEN_SIZE)  # [B,T,D]
        # Compute per-layer auxiliary inputs (uses unmodified x_embed)
        per_layer_inputs = self._compute_per_layer_inputs(input_ids, x)
        for i, layer in enumerate(self.layers):
            per_layer_i = per_layer_inputs[:, :, i, :]  # [B, T, 256]
            # Each layer type has its own RoPE table (head dim and theta differ).
            if is_full_attention(i):
                cos, sin = self._rope_full_cos, self._rope_full_sin
            else:
                cos, sin = self._rope_slide_cos, self._rope_slide_sin
            x = layer(x, cos, sin, per_layer_i)
        x = self.norm(x)
        # Tied lm_head: F.linear(x, embed_tokens.weight)
        logits = F.linear(x, self.embed_tokens.weight.to(x.dtype))  # [B,T,V]
        # Logit softcapping
        logits = LOGIT_CAP * torch.tanh(logits / LOGIT_CAP)
        return logits

    @classmethod
    def load_weights(
        cls,
        safetensors_path: str | Path,
        device: str = "cpu",
    ) -> "Gemma4ForCausalLM":
        """
        Build a model and fill it from a single safetensors file.

        Only tensors whose name starts with "model.language_model." are used;
        that prefix is stripped so that
            model.language_model.X  →  self.X
        Loading is non-strict: missing and unexpected keys are only reported
        on stdout, so parameters absent from the file keep their initial
        values. The model is cast to DTYPE before it is returned.

        Args:
            safetensors_path: checkpoint file to read.
            device: device the checkpoint tensors are read onto. The module
                    itself is built with the default device and is not moved
                    here; callers move it afterwards.
        """
        model  = cls()
        path   = str(safetensors_path)
        prefix = "model.language_model."
        state  = {}
        with safe_open(path, framework="pt", device=device) as f:
            for key in f.keys():
                if not key.startswith(prefix):
                    continue
                local_key = key[len(prefix):]  # strip "model.language_model."
                state[local_key] = f.get_tensor(key)
        missing, unexpected = model.load_state_dict(state, strict=False)
        if missing:
            print(f"[load_weights] {len(missing)} missing keys (first 5): {missing[:5]}")
        if unexpected:
            print(f"[load_weights] {len(unexpected)} unexpected keys (first 5): {unexpected[:5]}")
        model = model.to(dtype=DTYPE)
        return model
# ── Convenience loader ─────────────────────────────────────────────────────────
def load_gemma4(
    device: str | None = None,
) -> tuple[Gemma4ForCausalLM, AutoTokenizer]:
    """
    Load the Gemma 4 E2B weights and the matching tokenizer from the local cache.

    Args:
        device: where to place the model; falls back to DEVICE when None.
    Returns:
        (model, tokenizer)  — model is in eval mode on the chosen device.
    """
    target = DEVICE if device is None else device

    print(f"Loading Gemma 4 E2B from {SAFETENSORS_BLOB} ...")
    loaded = Gemma4ForCausalLM.load_weights(SAFETENSORS_BLOB, device=target)
    model = loaded.to(target).eval()

    # The tokenizer is read from the same snapshot directory, offline only.
    print(f"Loading tokenizer from {MODEL_DIR} ...")
    tokenizer = AutoTokenizer.from_pretrained(str(MODEL_DIR), local_files_only=True)
    return model, tokenizer
+# ── PPL evaluation ─────────────────────────────────────────────────────────────
+def ppl_on_text(
+    model: Gemma4ForCausalLM,
+    tokenizer: AutoTokenizer,
+    text: str,
+    device: str | None = None,
+    max_length: int = 1024,
+) -> float:
+    """
+    Compute token-level perplexity on `text`.
+    Args:
+        model:      Gemma4ForCausalLM in eval mode
+        tokenizer:  matching AutoTokenizer
+        text:       input string
+        device:     device for inference (defaults to DEVICE)
+        max_length: truncate to this many tokens
+    Returns:
+        perplexity (float)
+    """
+    if device is None:
+        device = DEVICE
+    enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
+    input_ids = enc["input_ids"].to(device)
+    with torch.no_grad():
+        logits = model(input_ids)           # [1, T, V]
+    # Shift: predict token t+1 from position t
+    shift_logits = logits[0, :-1, :]        # [T-1, V]
+    shift_labels = input_ids[0, 1:]         # [T-1]
+    log_probs = F.log_softmax(shift_logits.float(), dim=-1)
+    nll = -log_probs.gather(1, shift_labels.unsqueeze(1)).squeeze(1).mean()
+    return nll.exp().item()
# ── main ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Smoke test: load the cached model, score one fixed English passage and
    # print its perplexity next to the expected range.
    _WIKI_TEXT = "".join((
        "The transformer architecture was introduced in the paper ",
        "'Attention Is All You Need' by Vaswani et al. in 2017. ",
        "It relies entirely on self-attention mechanisms, dispensing with ",
        "recurrence and convolutions entirely. Transformers have since become ",
        "the dominant architecture for natural language processing, powering ",
        "models such as BERT, GPT, T5, and the Gemma family. ",
        "The key innovation is the multi-head attention mechanism, which allows ",
        "the model to jointly attend to information from different representation ",
        "subspaces at different positions. This is complemented by position-wise ",
        "feed-forward networks and residual connections with layer normalisation. ",
        "Large language models built on this architecture are trained on massive ",
        "corpora using next-token prediction (autoregressive language modelling) ",
        "or masked language modelling. They exhibit emergent capabilities such as ",
        "few-shot and zero-shot generalisation across a wide variety of tasks.",
    ))

    model, tokenizer = load_gemma4()
    ppl = ppl_on_text(model, tokenizer, _WIKI_TEXT)
    print(f"\nPerplexity on sample text: {ppl:.2f}  (target: ~17–18 for bfloat16)")