fix: critical T5 conditioner key sanitization and metadata

- extract_t5.py: sanitize .layer.0. → .layer_0. and .layer.1. → .layer_1.
for MLX ModuleParameters.unflattened() compatibility. Without this fix,
all 24 T5 transformer block weights remain at random initialization.
Also adds automatic T5 variant detection (small/base/large).

- config.json: correct T5 metadata from t5-small to t5-large
(d_model=1024, 24 layers, 16 heads, d_ff=4096); also set the T5
dropout_rate to 0.0 and the EnCodec num_filters to 32.

- README.md: fix text encoder reference, add T5 extraction section,
MLX key sanitization note, T5 unscaled attention note, update Swift usage.

- verify_t5.py: new file — Python MLX reference implementation for
verifying T5 encoder output against the Swift implementation.

Files changed (4) hide show

README.md +25 -5
config.json +9 -9
extract_t5.py +271 -0
verify_t5.py +274 -0

README.md CHANGED Viewed

@@ -21,15 +21,28 @@ This is the MLX-native port of [facebook/audiogen-medium](https://huggingface.co
 - **Parameters**: ~1.5B (LM) + EnCodec compression model
 - **Sampling rate**: 16 kHz
 - **Frame rate**: 50 Hz (4 codebooks, delayed pattern)
-- **Text encoder**: T5-small (loaded separately)
 - **Max duration**: 10 seconds (configurable)
 ## Files
-- `config.json` — Model configuration
 - `model.safetensors` — LM + EnCodec weights
 - `model.safetensors.index.json` — Weight index (for sharded variants)
-- `tokenizer.json` / `tokenizer_config.json` — T5 tokenizer files
 ## Usage (Swift/MLX)
@@ -40,15 +53,22 @@ let model = try await AudioGenModel.fromPretrained(
     modelFolder: modelURL,
     t5Folder: t5URL
 )
-let audio = try await model.generateAudio(
-    description: "dog barking",
     duration: 5.0,
     cfgCoef: 3.0,
     temperature: 1.0,
     topK: 250
 )
 ```
 ## License
 This model is published under the [CC-BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/) license (non-commercial use only), following the original [AudioGen license](https://huggingface.co/facebook/audiogen-medium).

 - **Parameters**: ~1.5B (LM) + EnCodec compression model
 - **Sampling rate**: 16 kHz
 - **Frame rate**: 50 Hz (4 codebooks, delayed pattern)
+- **Text encoder**: T5-large (d_model=1024, 24 layers, 16 heads)
 - **Max duration**: 10 seconds (configurable)
 ## Files
+- `config.json` — Model configuration (includes `t5_model_name` reference)
 - `model.safetensors` — LM + EnCodec weights
 - `model.safetensors.index.json` — Weight index (for sharded variants)
+### T5 Conditioner (extracted separately)
+The T5-large text encoder weights are not included in this repository. Use `extract_t5.py` to extract them from the original `facebook/audiogen-medium` checkpoint:
+```bash
+python extract_t5.py --output /path/to/audiogen-mlx/t5
+```
+This produces a `t5/` directory with `config.json`, `model.safetensors`, and tokenizer files.
+> **Note**: The T5 safetensors keys use MLX-compatible naming (`.layer_0.` / `.layer_1.`
+> instead of HuggingFace's `.layer.0.` / `.layer.1.`). This is required because MLX's
+> `ModuleParameters.unflattened()` splits keys on every dot, so `layer.0` would be parsed
+> as a nested `layer` → `0` path instead of a single sub-module name.
 ## Usage (Swift/MLX)
     modelFolder: modelURL,
     t5Folder: t5URL
 )
+let tokens = try await model.generate(
+    descriptions: ["dog barking"],
     duration: 5.0,
     cfgCoef: 3.0,
     temperature: 1.0,
     topK: 250
 )
+let audio = model.decode(tokens: tokens)
 ```
+## T5 Attention
+T5's self-attention does **not** scale scores by `1/sqrt(d_k)`. This is a deliberate design choice in the T5 architecture, not an omission — do not add scaling in the inference code.
 ## License
 This model is published under the [CC-BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/) license (non-commercial use only), following the original [AudioGen license](https://huggingface.co/facebook/audiogen-medium).

config.json CHANGED Viewed

@@ -36,14 +36,14 @@
   "duration": 10.0,
   "numSamples": 1,
   "specialToken": 2048,
-  "tokenizer": "t5-small",
-  "t5_model_name": "t5-small",
   "clsToken": 2048,
   "padToken": 2048,
   "encodec": {
     "model_type": "encodec",
     "audio_channels": 1,
-    "num_filters": 64,
     "kernel_size": 7,
     "num_residual_layers": 1,
     "dilation_growth_rate": 2,
@@ -67,15 +67,15 @@
     "use_conv_shortcut": false
   },
   "t5": {
-    "model_name": "t5-small",
-    "d_model": 512,
     "d_kv": 64,
-    "d_ff": 2048,
-    "num_layers": 8,
-    "num_heads": 6,
     "relative_attention_num_buckets": 32,
     "relative_attention_max_distance": 128,
-    "dropout_rate": 0.1,
     "layer_norm_epsilon": 1e-06,
     "feed_forward_proj": "relu",
     "vocab_size": 32128,

   "duration": 10.0,
   "numSamples": 1,
   "specialToken": 2048,
+  "tokenizer": "t5-large",
+  "t5_model_name": "t5-large",
   "clsToken": 2048,
   "padToken": 2048,
   "encodec": {
     "model_type": "encodec",
     "audio_channels": 1,
+    "num_filters": 32,
     "kernel_size": 7,
     "num_residual_layers": 1,
     "dilation_growth_rate": 2,
     "use_conv_shortcut": false
   },
   "t5": {
+    "model_name": "t5-large",
+    "d_model": 1024,
     "d_kv": 64,
+    "d_ff": 4096,
+    "num_layers": 24,
+    "num_heads": 16,
     "relative_attention_num_buckets": 32,
     "relative_attention_max_distance": 128,
+    "dropout_rate": 0.0,
     "layer_norm_epsilon": 1e-06,
     "feed_forward_proj": "relu",
     "vocab_size": 32128,

extract_t5.py ADDED Viewed

	@@ -0,0 +1,271 @@

+#!/usr/bin/env python3
+"""
+Extract T5 conditioner weights from facebook/audiogen-medium for MLX.
+The original AudioGen model bundles a frozen T5 text encoder and a trained
+output projection inside condition_provider.*. The main MLX conversion strips
+these keys. This script extracts them into a t5/ subdirectory that the MLX
+AudioGen loader expects.
+Usage:
+    # Automatic: downloads state_dict.bin from HuggingFace (kept in the local HF cache), then extracts
+    python extract_t5.py --output /path/to/audiogen-mlx/t5
+    # Manual: use a local state_dict.bin you already downloaded
+    python extract_t5.py --lm /path/to/state_dict.bin --output /path/to/audiogen-mlx/t5
+Output (in --output directory):
+    config.json             T5 encoder config (derived from weight shapes)
+    model.safetensors       T5 encoder weights + output_proj
+    tokenizer.json          Downloaded from google-t5/t5-small
+    tokenizer_config.json   Downloaded from google-t5/t5-small
+Requirements:
+    pip install torch safetensors huggingface_hub
+"""
+import argparse
+import json
+import os
+import struct
+import tempfile
+import shutil
+import torch
+from safetensors.torch import save_file
+from huggingface_hub import hf_hub_download
+# Key prefix of the T5 encoder weights inside the AudioGen LM state dict;
+# extract_t5_weights() strips it to recover HuggingFace-style T5 keys.
+T5_PREFIX = "condition_provider.conditioners.description.model."
+# Key prefix of the conditioner's output projection; extract_t5_weights()
+# re-homes these tensors under "output_proj.".
+OUTPUT_PROJ_PREFIX = "condition_provider.conditioners.description.output_proj."
+def load_lm_state(path):
+    """Load the LM state dict from a PyTorch checkpoint."""
+    ckpt = torch.load(path, map_location="cpu", weights_only=True)
+    if "best_state" in ckpt:
+        return ckpt["best_state"]
+    return ckpt
+def extract_t5_weights(lm_state):
+    """Extract T5 encoder and output_proj weights from the LM state dict."""
+    t5_weights = {}
+    output_proj = {}
+    other_cp = []
+    for key, tensor in lm_state.items():
+        if not key.startswith("condition_provider."):
+            continue
+        if key.startswith(T5_PREFIX):
+            # Strip prefix to get standard HuggingFace T5 key format
+            new_key = key[len(T5_PREFIX):]
+            t5_weights[new_key] = tensor
+        elif key.startswith(OUTPUT_PROJ_PREFIX):
+            # output_proj.weight / output_proj.bias
+            new_key = key[len(OUTPUT_PROJ_PREFIX):]
+            output_proj[f"output_proj.{new_key}"] = tensor
+        else:
+            other_cp.append(key)
+    return t5_weights, output_proj, other_cp
+def sanitize_keys_for_mlx(weights):
+    """Rename T5 weight keys for MLX compatibility.
+    HuggingFace T5 uses keys like "encoder.block.0.layer.0.SelfAttention.q.weight"
+    where "layer.0" and "layer.1" are sub-module names. MLX's
+    ModuleParameters.unflattened() splits on ALL dots, which misparses "layer.0"
+    as {"layer": {"0": ...}} instead of treating it as a single key.
+    This renames ".layer.0." to ".layer_0." and ".layer.1." to ".layer_1." so
+    the keys work correctly with MLX's parameter loading.
+    """
+    sanitized = {}
+    for key, value in weights.items():
+        new_key = key
+        new_key = new_key.replace(".layer.0.", ".layer_0.")
+        new_key = new_key.replace(".layer.1.", ".layer_1.")
+        sanitized[new_key] = value
+    return sanitized
+def infer_t5_config(t5_weights):
+    """Determine T5 architecture from weight shapes."""
+    # shared.weight: [vocab_size, d_model]
+    shared = t5_weights.get("shared.weight")
+    if shared is None:
+        raise ValueError("Cannot find shared.weight in T5 weights")
+    vocab_size = shared.shape[0]
+    d_model = shared.shape[1]
+    # Find q projection to determine d_kv and num_heads
+    q_weight = t5_weights.get("encoder.block.0.layer.0.SelfAttention.q.weight")
+    if q_weight is None:
+        raise ValueError("Cannot find SelfAttention.q.weight")
+    # q.weight: [num_heads * d_kv, d_model]
+    total_kv = q_weight.shape[0]
+    # Find DenseReluDense.wi to determine d_ff
+    wi = t5_weights.get("encoder.block.0.layer.1.DenseReluDense.wi.weight")
+    if wi is None:
+        raise ValueError("Cannot find DenseReluDense.wi.weight")
+    d_ff = wi.shape[0]
+    # Count encoder layers
+    num_layers = 0
+    while f"encoder.block.{num_layers}.layer.0.SelfAttention.q.weight" in t5_weights:
+        num_layers += 1
+    # Determine d_kv and num_heads
+    # Standard T5 d_kv values: 64 (all sizes)
+    d_kv = 64
+    num_heads = total_kv // d_kv
+    # Check relative_attention_bias
+    rab = t5_weights.get(
+        "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"
+    )
+    num_buckets = rab.shape[0] if rab is not None else 32
+    # Determine T5 variant name from d_model
+    t5_variant = "t5-unknown"
+    if d_model == 512:
+        t5_variant = "t5-small"
+    elif d_model == 768:
+        t5_variant = "t5-base"
+    elif d_model == 1024:
+        t5_variant = "t5-large"
+    elif d_model == 4096:
+        t5_variant = "t5-3b"
+    config = {
+        "architectures": ["T5EncoderModel"],
+        "model_name": t5_variant,
+        "d_model": d_model,
+        "d_kv": d_kv,
+        "d_ff": d_ff,
+        "num_heads": num_heads,
+        "num_layers": num_layers,
+        "vocab_size": vocab_size,
+        "relative_attention_num_buckets": num_buckets,
+        "relative_attention_max_distance": 128,
+        "dropout_rate": 0.0,
+        "layer_norm_epsilon": 1e-6,
+        "feed_forward_proj": "relu",
+        "tie_word_embeddings": True,
+        "decoder_start_token_id": 0,
+        "model_type": "t5",
+    }
+    return config
+def download_tokenizer(output_dir):
+    """Download T5 tokenizer files from HuggingFace.
+    All T5 model sizes share the same SentencePiece tokenizer (32128 tokens),
+    so we download from t5-small for convenience.
+    """
+    repo = "google-t5/t5-small"
+    for filename in ["tokenizer.json", "tokenizer_config.json"]:
+        path = hf_hub_download(repo_id=repo, filename=filename)
+        dst = os.path.join(output_dir, filename)
+        shutil.copy2(path, dst)
+        print(f"  Copied {filename}")
+def main():
+    parser = argparse.ArgumentParser(
+        description="Extract T5 conditioner from facebook/audiogen-medium"
+    )
+    parser.add_argument(
+        "--lm",
+        help="Path to local state_dict.bin (skips download)",
+    )
+    parser.add_argument(
+        "--output",
+        required=True,
+        help="Output directory for T5 weights (e.g. /path/to/model/t5)",
+    )
+    args = parser.parse_args()
+    os.makedirs(args.output, exist_ok=True)
+    # Get the state dict
+    if args.lm:
+        lm_path = args.lm
+        print(f"Loading local checkpoint: {lm_path}")
+    else:
+        print("Downloading facebook/audiogen-medium state_dict.bin ...")
+        lm_path = hf_hub_download(
+            repo_id="facebook/audiogen-medium",
+            filename="state_dict.bin",
+        )
+        print(f"  Downloaded to cache: {lm_path}")
+    print("Loading state dict ...")
+    lm_state = load_lm_state(lm_path)
+    print("Extracting T5 weights ...")
+    t5_weights, output_proj, other_cp = extract_t5_weights(lm_state)
+    print(f"  T5 encoder keys: {len(t5_weights)}")
+    print(f"  Output projection keys: {len(output_proj)}")
+    if other_cp:
+        print(f"  Other condition_provider keys (skipped): {len(other_cp)}")
+    if not t5_weights:
+        print("ERROR: No T5 weights found in checkpoint!")
+        return
+    # Infer T5 architecture
+    config = infer_t5_config(t5_weights)
+    print(f"  T5 config: {config['model_name']} — d_model={config['d_model']}, "
+          f"num_heads={config['num_heads']}, "
+          f"num_layers={config['num_layers']}, "
+          f"d_ff={config['d_ff']}, "
+          f"vocab_size={config['vocab_size']}")
+    if output_proj:
+        proj_w = output_proj.get("output_proj.weight")
+        if proj_w is not None:
+            print(f"  Output projection: {list(proj_w.shape)} "
+                  f"(T5 d_model={proj_w.shape[1]} → LM dim={proj_w.shape[0]})")
+    # Sanitize keys for MLX compatibility before saving
+    sanitized_t5 = sanitize_keys_for_mlx(t5_weights)
+    print(f"  Sanitized {len(sanitized_t5)} T5 keys (.layer.N. → .layer_N.)")
+    # Combine sanitized T5 weights + output_proj into one safetensors
+    all_weights = {}
+    all_weights.update(sanitized_t5)
+    all_weights.update(output_proj)
+    # Save safetensors
+    st_path = os.path.join(args.output, "model.safetensors")
+    print(f"Saving {len(all_weights)} tensors to {st_path} ...")
+    save_file(all_weights, st_path)
+    total_bytes = os.path.getsize(st_path)
+    print(f"  Size: {total_bytes / 1e6:.1f} MB")
+    # Save config
+    config_path = os.path.join(args.output, "config.json")
+    with open(config_path, "w") as f:
+        json.dump(config, f, indent=2)
+    print(f"Saved config.json")
+    # Download tokenizer
+    print("Downloading T5 tokenizer ...")
+    download_tokenizer(args.output)
+    print(f"\nDone! T5 conditioner saved to: {args.output}")
+    print("Files:", sorted(os.listdir(args.output)))
+if __name__ == "__main__":
+    main()

verify_t5.py ADDED Viewed

	@@ -0,0 +1,274 @@

+#!/usr/bin/env python3
+"""Verify T5 encoder output against Swift implementation.
+Loads the same T5 safetensors weights, runs the encoder on the same tokens,
+and prints output stats for comparison with the Swift logs.
+"""
+import math
+import mlx.core as mx
+import mlx.nn as nn
+import json
+from pathlib import Path
+# Default location of the extracted T5 directory; this script reads its
+# config.json and *.safetensors files from here.
+MODEL_DIR = Path.home() / "Library/Application Support/Velvox/Models/audiogen-mlx/t5"
+# ── T5 LayerNorm (RMSNorm, no centering) ──
+class T5LayerNorm(nn.Module):
+    def __init__(self, dims, eps=1e-6):
+        super().__init__()
+        self.weight = mx.ones((dims,))
+        self.eps = eps
+    def __call__(self, x):
+        y = x.astype(mx.float32)
+        y = y * mx.rsqrt(mx.mean(y * y, axis=-1, keepdims=True) + self.eps)
+        return self.weight * y.astype(x.dtype)
+# ── T5 DenseReluDense ──
+class T5DenseActDense(nn.Module):
+    def __init__(self, d_model, d_ff, act="relu"):
+        super().__init__()
+        self.wi = nn.Linear(d_model, d_ff, bias=False)
+        self.wo = nn.Linear(d_ff, d_model, bias=False)
+        self.act = act
+    def __call__(self, x):
+        h = self.wi(x)
+        h = nn.relu(h) if self.act == "relu" else nn.gelu(h)
+        return self.wo(h)
+# ── T5 Attention (NO sqrt(d_k) scaling — this is T5's design) ──
+class T5Attention(nn.Module):
+    """T5 multi-head self-attention with an optional relative position bias.
+    Scores are the raw ``q @ k`` products (no ``1/sqrt(d_k)`` scaling) plus the
+    position bias. All four projections are bias-free linear layers. Only an
+    instance built with ``has_relative_attention_bias=True`` owns the bias
+    embedding table; other instances rely on a ``position_bias`` passed in.
+    """
+    def __init__(self, config, has_relative_attention_bias=False):
+        super().__init__()
+        self.num_heads = config["num_heads"]
+        self.d_kv = config["d_kv"]
+        self.d_model = config["d_model"]
+        self.has_relative_attention_bias = has_relative_attention_bias
+        self.num_buckets = config["relative_attention_num_buckets"]
+        self.max_distance = config.get("relative_attention_max_distance", 128)
+        self.q = nn.Linear(self.d_model, self.num_heads * self.d_kv, bias=False)
+        self.k = nn.Linear(self.d_model, self.num_heads * self.d_kv, bias=False)
+        self.v = nn.Linear(self.d_model, self.num_heads * self.d_kv, bias=False)
+        self.o = nn.Linear(self.num_heads * self.d_kv, self.d_model, bias=False)
+        if has_relative_attention_bias:
+            # One learned bias per (bucket, head).
+            self.relative_attention_bias = nn.Embedding(self.num_buckets, self.num_heads)
+    @staticmethod
+    def _relative_position_bucket(rp, bidirectional=True, num_buckets=32, max_distance=128):
+        """Map relative positions ``rp`` (key index minus query index) to bucket ids.
+        Distances below ``max_exact`` each get their own bucket; larger ones
+        are spaced logarithmically up to ``max_distance`` and clamped to the
+        last bucket. In bidirectional mode the buckets are split in half and
+        positive offsets use the upper half.
+        """
+        nb = num_buckets
+        result = mx.zeros(rp.shape, dtype=mx.int32)
+        if bidirectional:
+            nb = nb // 2
+            # Offset of nb for positive rp, 0 otherwise; replaces the zeros above.
+            is_pos = mx.where(rp > 0, mx.array(nb, dtype=mx.int32), mx.array(0, dtype=mx.int32))
+            result = is_pos
+            rp = mx.abs(rp)
+        else:
+            # Keep only non-positive offsets, as a non-negative distance.
+            rp = -mx.minimum(rp, mx.zeros_like(rp))
+        max_exact = nb // 2
+        is_small = rp < max_exact
+        large_rp = rp.astype(mx.float32)
+        # Evaluated for every position; entries where is_small holds are
+        # discarded by the mx.where below.
+        log_ratio = mx.log(large_rp / max_exact) / math.log(max_distance / max_exact)
+        large_bucket = (log_ratio * (nb - max_exact)).astype(mx.int32) + max_exact
+        clamped = mx.minimum(large_bucket, mx.array(nb - 1, dtype=mx.int32))
+        buckets = mx.where(is_small, rp.astype(mx.int32), clamped)
+        return result + buckets
+    def compute_bias(self, q_len, k_len):
+        """Return the position bias shaped [1, H, Q, K], or None without a bias table."""
+        if not self.has_relative_attention_bias:
+            return None
+        ctx = mx.arange(q_len, dtype=mx.int32)
+        mem = mx.arange(k_len, dtype=mx.int32)
+        # rp[i, j] = j - i (key position minus query position), as float32.
+        rp = mem.reshape(1, -1).astype(mx.float32) - ctx.reshape(-1, 1).astype(mx.float32)
+        rp_bucket = self._relative_position_bucket(
+            rp, bidirectional=True,
+            num_buckets=self.num_buckets, max_distance=self.max_distance
+        )
+        flat = rp_bucket.reshape(-1)
+        bias_flat = self.relative_attention_bias(flat)
+        bias = bias_flat.reshape(q_len, k_len, self.num_heads)
+        bias = bias.transpose(2, 0, 1)[None, :, :, :]  # [1, H, Q, K]
+        return bias
+    def __call__(self, hidden, mask=None, position_bias=None):
+        """Attend over ``hidden`` ([B, T, d_model]) and return the same shape.
+        NOTE(review): ``mask`` is accepted but never applied in this method, so
+        padded positions would not be masked out.
+        """
+        B, T, _ = hidden.shape
+        q = self.q(hidden).reshape(B, T, self.num_heads, self.d_kv)
+        k = self.k(hidden).reshape(B, T, self.num_heads, self.d_kv)
+        v = self.v(hidden).reshape(B, T, self.num_heads, self.d_kv)
+        q = q.transpose(0, 2, 1, 3)  # [B, H, T, d]
+        k = k.transpose(0, 2, 3, 1)  # [B, H, d, T]
+        v = v.transpose(0, 2, 1, 3)  # [B, H, T, d]
+        # T5: NO scaling by 1/sqrt(d_k)
+        scores = q @ k
+        # Without a supplied bias, compute one locally (None if this layer has no table).
+        if position_bias is None:
+            position_bias = self.compute_bias(T, T)
+        if position_bias is not None:
+            scores = scores + position_bias
+        # Softmax runs in float32, then is cast back to the scores dtype.
+        weights = mx.softmax(scores.astype(mx.float32), axis=-1).astype(scores.dtype)
+        out = (weights @ v).transpose(0, 2, 1, 3).reshape(B, T, -1)
+        return self.o(out)
+# ── T5 Block ──
+class T5Block(nn.Module):
+    def __init__(self, config, has_relative_attention_bias=False):
+        super().__init__()
+        self.self_attn = T5Attention(config, has_relative_attention_bias)
+        self.layer_norm_sa = T5LayerNorm(config["d_model"], config.get("layer_norm_epsilon", 1e-6))
+        self.ff = T5DenseActDense(config["d_model"], config["d_ff"], config.get("feed_forward_proj", "relu"))
+        self.layer_norm_ff = T5LayerNorm(config["d_model"], config.get("layer_norm_epsilon", 1e-6))
+    def __call__(self, x, mask=None, position_bias=None):
+        normed = self.layer_norm_sa(x)
+        attn_out = self.self_attn(normed, mask=mask, position_bias=position_bias)
+        x = x + attn_out
+        normed = self.layer_norm_ff(x)
+        ff_out = self.ff(normed)
+        x = x + ff_out
+        return x
+# ── T5 Encoder ──
+class T5Encoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.shared = nn.Embedding(config["vocab_size"], config["d_model"])
+        self.blocks = [T5Block(config, has_relative_attention_bias=(i == 0))
+                       for i in range(config["num_layers"])]
+        self.final_layer_norm = T5LayerNorm(config["d_model"], config.get("layer_norm_epsilon", 1e-6))
+    def __call__(self, input_ids):
+        x = self.shared(input_ids)
+        # Compute position bias from first block, reuse for all
+        pos_bias = self.blocks[0].self_attn.compute_bias(x.shape[1], x.shape[1])
+        for block in self.blocks:
+            x = block(x, position_bias=pos_bias)
+        return self.final_layer_norm(x)
+def load_and_remap_weights(t5_dir):
+    """Load safetensors and remap HuggingFace T5 keys to our module structure."""
+    import glob
+    safetensors_files = sorted(glob.glob(str(t5_dir / "*.safetensors")))
+    all_weights = {}
+    for f in safetensors_files:
+        w = mx.load(f)
+        all_weights.update(w)
+    # Separate output_proj from T5 weights
+    output_proj_w = all_weights.pop("output_proj.weight", None)
+    output_proj_b = all_weights.pop("output_proj.bias", None)
+    # Remap HuggingFace keys to our module structure
+    remapped = {}
+    for key, val in all_weights.items():
+        new_key = key
+        # shared.weight → shared.weight (OK)
+        # encoder.block.N.layer.0.SelfAttention.X → blocks.N.self_attn.X
+        new_key = new_key.replace("encoder.block.", "blocks.")
+        new_key = new_key.replace(".layer.0.SelfAttention.", ".self_attn.")
+        new_key = new_key.replace(".layer.0.layer_norm.", ".layer_norm_sa.")
+        new_key = new_key.replace(".layer.1.DenseReluDense.", ".ff.")
+        new_key = new_key.replace(".layer.1.layer_norm.", ".layer_norm_ff.")
+        # encoder.final_layer_norm → final_layer_norm
+        new_key = new_key.replace("encoder.final_layer_norm.", "final_layer_norm.")
+        remapped[new_key] = val
+    return remapped, output_proj_w, output_proj_b
+def main():
+    print("=" * 60)
+    print("T5 Encoder Verification (MLX Python reference)")
+    print("=" * 60)
+    # Load config
+    with open(t5_dir / "config.json") as f:
+        config = json.load(f)
+    print(f"Config: d_model={config['d_model']} layers={config['num_layers']} "
+          f"heads={config['num_heads']} d_kv={config['d_kv']} d_ff={config['d_ff']}")
+    # Build model
+    encoder = T5Encoder(config)
+    # Load weights
+    t5_dir_p = MODEL_DIR
+    weights, proj_w, proj_b = load_and_remap_weights(t5_dir_p)
+    # Apply weights
+    encoder.load_weights(list(weights.items()))
+    # Build output_proj
+    output_proj = None
+    if proj_w is not None:
+        output_proj = nn.Linear(proj_w.shape[1], proj_w.shape[0])
+        proj_params = [("weight", proj_w)]
+        if proj_b is not None:
+            proj_params.append(("bias", proj_b))
+        output_proj.load_weights(proj_params)
+        print(f"output_proj: {proj_w.shape[1]} → {proj_w.shape[0]}")
+    # Test prompts with known token IDs from Swift logs
+    test_cases = [
+        ("dog barking", [1782, 21696, 53, 1]),
+        ("cars in the street", [2948, 16, 8, 2815, 1]),
+        ("A metro train leaving the platform", [71, 12810, 2412, 3140, 8, 1585, 1]),
+    ]
+    for prompt, token_ids in test_cases:
+        print(f"\n--- '{prompt}' ---")
+        print(f"Tokens: {token_ids}")
+        input_ids = mx.array([token_ids], dtype=mx.int32)
+        features = encoder(input_ids)
+        mx.eval(features)
+        print(f"Encoder output: shape={features.shape} "
+              f"min={features.min().item():.7f} max={features.max().item():.7f} "
+              f"sum={features.sum().item():.4f}")
+        # Per-position stats
+        for i in range(features.shape[1]):
+            pos_feat = features[0, i]
+            print(f"  pos[{i}]: min={pos_feat.min().item():.5f} "
+                  f"max={pos_feat.max().item():.5f} "
+                  f"mean={pos_feat.mean().item():.5f}")
+        if output_proj is not None:
+            projected = output_proj(features)
+            mx.eval(projected)
+            print(f"After output_proj: shape={projected.shape} "
+                  f"min={projected.min().item():.7f} max={projected.max().item():.7f} "
+                  f"sum={projected.sum().item():.4f}")
+if __name__ == "__main__":
+    t5_dir = MODEL_DIR
+    if not t5_dir.exists():
+        print(f"T5 directory not found: {t5_dir}")
+        exit(1)
+    main()