Daniel Rothmann commited on
Commit ·
fad9fad
1
Parent(s): a2c97d7
WIP audio decoder
Browse files- KanadeDecoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- KanadeDecoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- KanadeDecoder.mlpackage/Manifest.json +18 -0
- PlaprePicoDecode.mlpackage/Data/com.apple.CoreML/model.mlmodel +2 -2
- PlaprePicoDecode.mlpackage/Manifest.json +8 -8
- Vocoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- Vocoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- Vocoder.mlpackage/Manifest.json +18 -0
- scripts/convert.py +9 -3
- scripts/convert_kanade.py +711 -0
- scripts/model_wrapper.py +7 -2
- scripts/test_generate.py +250 -0
KanadeDecoder.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a72aeec4e105b9d593a721317e9fce1ca7783e21293e82f898d810c6bf1c1fe
|
| 3 |
+
size 178115
|
KanadeDecoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d2922387d7a2ef3f41db7a069ca9be2d313250137841ffbe8ab7b912bddd96a
|
| 3 |
+
size 364866112
|
KanadeDecoder.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"3D07005F-6244-406D-9DD3-91CF5F26CCAE": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Specification",
|
| 7 |
+
"name": "model.mlmodel",
|
| 8 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 9 |
+
},
|
| 10 |
+
"FD090485-11AF-465F-8569-E149E7086201": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Weights",
|
| 13 |
+
"name": "weights",
|
| 14 |
+
"path": "com.apple.CoreML/weights"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "3D07005F-6244-406D-9DD3-91CF5F26CCAE"
|
| 18 |
+
}
|
PlaprePicoDecode.mlpackage/Data/com.apple.CoreML/model.mlmodel
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0cbc60dac941edc9fbe212c52c4a96677e6ca547d575bdf461d19695e35de86a
|
| 3 |
+
size 579443
|
PlaprePicoDecode.mlpackage/Manifest.json
CHANGED
|
@@ -1,18 +1,18 @@
|
|
| 1 |
{
|
| 2 |
"fileFormatVersion": "1.0.0",
|
| 3 |
"itemInfoEntries": {
|
| 4 |
-
"
|
| 5 |
-
"author": "com.apple.CoreML",
|
| 6 |
-
"description": "CoreML Model Weights",
|
| 7 |
-
"name": "weights",
|
| 8 |
-
"path": "com.apple.CoreML/weights"
|
| 9 |
-
},
|
| 10 |
-
"E087C383-13E2-4E2C-B87A-990925041088": {
|
| 11 |
"author": "com.apple.CoreML",
|
| 12 |
"description": "CoreML Model Specification",
|
| 13 |
"name": "model.mlmodel",
|
| 14 |
"path": "com.apple.CoreML/model.mlmodel"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
}
|
| 16 |
},
|
| 17 |
-
"rootModelIdentifier": "
|
| 18 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"fileFormatVersion": "1.0.0",
|
| 3 |
"itemInfoEntries": {
|
| 4 |
+
"668A6F00-934D-4D44-9C27-7881268451D9": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"author": "com.apple.CoreML",
|
| 6 |
"description": "CoreML Model Specification",
|
| 7 |
"name": "model.mlmodel",
|
| 8 |
"path": "com.apple.CoreML/model.mlmodel"
|
| 9 |
+
},
|
| 10 |
+
"BA90DDE9-E076-4B65-A23D-91E3BFAD284D": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Weights",
|
| 13 |
+
"name": "weights",
|
| 14 |
+
"path": "com.apple.CoreML/weights"
|
| 15 |
}
|
| 16 |
},
|
| 17 |
+
"rootModelIdentifier": "668A6F00-934D-4D44-9C27-7881268451D9"
|
| 18 |
}
|
Vocoder.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88536c7f82ce5963c40ab46ab192452ddd1af731ecd4e08a40ea827fc544fbb6
|
| 3 |
+
size 1298694
|
Vocoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2f1b0ee1106eb66c74b00639159b27c910123caa778ffb2b7b4ece2eb88a180c
|
| 3 |
+
size 85215120
|
Vocoder.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"1865D6B1-DF08-4C5C-8B25-53058EF04D75": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Specification",
|
| 7 |
+
"name": "model.mlmodel",
|
| 8 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 9 |
+
},
|
| 10 |
+
"6D12622B-E675-4537-9163-574EA27CA0C1": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Weights",
|
| 13 |
+
"name": "weights",
|
| 14 |
+
"path": "com.apple.CoreML/weights"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "1865D6B1-DF08-4C5C-8B25-53058EF04D75"
|
| 18 |
+
}
|
scripts/convert.py
CHANGED
|
@@ -203,16 +203,17 @@ def convert_decode(model: PlaprePicoDecode, output_dir: Path):
|
|
| 203 |
causal_mask = torch.full((1, 1, 1, MAX_CONTEXT), float("-inf"), dtype=torch.float16)
|
| 204 |
causal_mask[0, 0, 0, :PREFILL_SEQ_LEN] = 0.0
|
| 205 |
|
| 206 |
-
# Pre-sliced RoPE for a single position (caller computes these)
|
| 207 |
cos = torch.zeros(1, 1, 1, HEAD_DIM, dtype=torch.float16)
|
| 208 |
sin = torch.zeros(1, 1, 1, HEAD_DIM, dtype=torch.float16)
|
| 209 |
|
| 210 |
-
# One-hot position mask for cache update (caller builds this)
|
| 211 |
update_mask = torch.zeros(1, 1, MAX_CONTEXT, 1, dtype=torch.float16)
|
| 212 |
update_mask[0, 0, PREFILL_SEQ_LEN, 0] = 1.0
|
| 213 |
|
|
|
|
|
|
|
|
|
|
| 214 |
with torch.no_grad():
|
| 215 |
-
traced = torch.jit.trace(model, (input_ids, causal_mask, cos, sin, update_mask))
|
| 216 |
|
| 217 |
print("Converting decode to CoreML...")
|
| 218 |
mlmodel = ct.convert(
|
|
@@ -231,6 +232,11 @@ def convert_decode(model: PlaprePicoDecode, output_dir: Path):
|
|
| 231 |
shape=(1, 1, MAX_CONTEXT, 1),
|
| 232 |
dtype=np.float16,
|
| 233 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
],
|
| 235 |
outputs=[ct.TensorType(name="logits", dtype=np.float16)],
|
| 236 |
states=build_kv_cache_states(),
|
|
|
|
| 203 |
causal_mask = torch.full((1, 1, 1, MAX_CONTEXT), float("-inf"), dtype=torch.float16)
|
| 204 |
causal_mask[0, 0, 0, :PREFILL_SEQ_LEN] = 0.0
|
| 205 |
|
|
|
|
| 206 |
cos = torch.zeros(1, 1, 1, HEAD_DIM, dtype=torch.float16)
|
| 207 |
sin = torch.zeros(1, 1, 1, HEAD_DIM, dtype=torch.float16)
|
| 208 |
|
|
|
|
| 209 |
update_mask = torch.zeros(1, 1, MAX_CONTEXT, 1, dtype=torch.float16)
|
| 210 |
update_mask[0, 0, PREFILL_SEQ_LEN, 0] = 1.0
|
| 211 |
|
| 212 |
+
# Pre-projected speaker hidden: (1, 1, HIDDEN_SIZE) — zeros for non-speaker steps
|
| 213 |
+
speaker_hidden = torch.zeros(1, 1, HIDDEN_SIZE, dtype=torch.float16)
|
| 214 |
+
|
| 215 |
with torch.no_grad():
|
| 216 |
+
traced = torch.jit.trace(model, (input_ids, causal_mask, cos, sin, update_mask, speaker_hidden))
|
| 217 |
|
| 218 |
print("Converting decode to CoreML...")
|
| 219 |
mlmodel = ct.convert(
|
|
|
|
| 232 |
shape=(1, 1, MAX_CONTEXT, 1),
|
| 233 |
dtype=np.float16,
|
| 234 |
),
|
| 235 |
+
ct.TensorType(
|
| 236 |
+
name="speaker_hidden",
|
| 237 |
+
shape=(1, 1, HIDDEN_SIZE),
|
| 238 |
+
dtype=np.float16,
|
| 239 |
+
),
|
| 240 |
],
|
| 241 |
outputs=[ct.TensorType(name="logits", dtype=np.float16)],
|
| 242 |
states=build_kv_cache_states(),
|
scripts/convert_kanade.py
ADDED
|
@@ -0,0 +1,711 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Convert Kanade decoder and HiFT vocoder to CoreML.
|
| 4 |
+
|
| 5 |
+
These are non-autoregressive models (single forward pass), so conversion
|
| 6 |
+
is simpler than the LLM — no KV cache or StateType needed.
|
| 7 |
+
|
| 8 |
+
Two models are produced:
|
| 9 |
+
- KanadeDecoder.mlpackage: audio token indices + speaker embedding → mel spectrogram
|
| 10 |
+
- HiFTVocoder.mlpackage: mel spectrogram → PCM waveform
|
| 11 |
+
|
| 12 |
+
Usage:
|
| 13 |
+
python scripts/convert_kanade.py [--output-dir PATH] [--num-tokens 100]
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import argparse
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
|
| 19 |
+
import numpy as np
|
| 20 |
+
import torch
|
| 21 |
+
import torch.nn as nn
|
| 22 |
+
import torch.nn.functional as F
|
| 23 |
+
import coremltools as ct
|
| 24 |
+
from kanade_tokenizer import KanadeModel, load_vocoder
|
| 25 |
+
import kanade_tokenizer.module.transformer as kanade_transformer
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# ── Monkey-patch Kanade's complex RoPE with real-valued version ───────────
|
| 29 |
+
|
| 30 |
+
def _apply_rotary_emb_real(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
|
| 31 |
+
"""Real-valued RoPE replacement for Kanade's complex-number version.
|
| 32 |
+
Converts complex freqs_cis to cos/sin and applies split-half rotation.
|
| 33 |
+
"""
|
| 34 |
+
# freqs_cis is complex: (seq_len, head_dim/2)
|
| 35 |
+
cos = freqs_cis.real # (seq_len, head_dim/2)
|
| 36 |
+
sin = freqs_cis.imag
|
| 37 |
+
# Broadcast to match x shape: (bsz, seq_len, n_heads, head_dim)
|
| 38 |
+
# x has head_dim, cos/sin have head_dim/2 — need to double them
|
| 39 |
+
cos = torch.cat([cos, cos], dim=-1) # (seq_len, head_dim)
|
| 40 |
+
sin = torch.cat([sin, sin], dim=-1)
|
| 41 |
+
# Reshape for broadcast: (1, seq_len, 1, head_dim)
|
| 42 |
+
cos = cos.unsqueeze(0).unsqueeze(2)
|
| 43 |
+
sin = sin.unsqueeze(0).unsqueeze(2)
|
| 44 |
+
# Split-half rotation
|
| 45 |
+
half = x.shape[-1] // 2
|
| 46 |
+
x1 = x[..., :half]
|
| 47 |
+
x2 = x[..., half:]
|
| 48 |
+
rotated = torch.cat((-x2, x1), dim=-1)
|
| 49 |
+
return (x * cos + rotated * sin).type_as(x)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _apply_rotary_emb_precomputed(x: torch.Tensor, freqs_cos_sin: torch.Tensor) -> torch.Tensor:
|
| 53 |
+
"""Real-valued RoPE using precomputed cos/sin stored as (seq_len, head_dim).
|
| 54 |
+
head_dim is always 64, hardcoded to avoid dynamic size ops.
|
| 55 |
+
"""
|
| 56 |
+
cos = freqs_cos_sin[..., :32]
|
| 57 |
+
sin = freqs_cos_sin[..., 32:]
|
| 58 |
+
cos = torch.cat([cos, cos], dim=-1)
|
| 59 |
+
sin = torch.cat([sin, sin], dim=-1)
|
| 60 |
+
cos = cos.unsqueeze(0).unsqueeze(2)
|
| 61 |
+
sin = sin.unsqueeze(0).unsqueeze(2)
|
| 62 |
+
x1 = x[..., :32]
|
| 63 |
+
x2 = x[..., 32:]
|
| 64 |
+
rotated = torch.cat((-x2, x1), dim=-1)
|
| 65 |
+
return (x * cos + rotated * sin).type_as(x)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _patched_attention_forward_v2(self, x, freqs_cis, mask, return_kv=False):
|
| 69 |
+
"""Attention forward with real-valued RoPE and explicit matmul."""
|
| 70 |
+
bsz, seqlen, _ = x.shape
|
| 71 |
+
xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
|
| 72 |
+
|
| 73 |
+
xq = xq.view(bsz, seqlen, self.n_heads, self.head_dim)
|
| 74 |
+
xk = xk.view(bsz, seqlen, self.n_heads, self.head_dim)
|
| 75 |
+
xv = xv.view(bsz, seqlen, self.n_heads, self.head_dim)
|
| 76 |
+
|
| 77 |
+
if freqs_cis is not None:
|
| 78 |
+
xq = _apply_rotary_emb_precomputed(xq, freqs_cis[:seqlen])
|
| 79 |
+
xk = _apply_rotary_emb_precomputed(xk, freqs_cis[:seqlen])
|
| 80 |
+
|
| 81 |
+
xq = xq.transpose(1, 2)
|
| 82 |
+
xk = xk.transpose(1, 2)
|
| 83 |
+
xv = xv.transpose(1, 2)
|
| 84 |
+
|
| 85 |
+
attn_weights = torch.matmul(xq, xk.transpose(2, 3)) * self.scale
|
| 86 |
+
if mask is not None:
|
| 87 |
+
attn_weights = attn_weights + mask
|
| 88 |
+
if self.causal:
|
| 89 |
+
causal_mask = torch.triu(
|
| 90 |
+
torch.full((seqlen, seqlen), float("-inf"), device=x.device), diagonal=1
|
| 91 |
+
)
|
| 92 |
+
attn_weights = attn_weights + causal_mask
|
| 93 |
+
attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(xq.dtype)
|
| 94 |
+
output = torch.matmul(attn_weights, xv)
|
| 95 |
+
|
| 96 |
+
# 12 heads * 64 head_dim = 768
|
| 97 |
+
output = output.transpose(1, 2).contiguous().reshape(bsz, seqlen, 768)
|
| 98 |
+
output = self.wo(output)
|
| 99 |
+
|
| 100 |
+
if return_kv:
|
| 101 |
+
return output, (xk, xv)
|
| 102 |
+
return output
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def _convert_freqs_cis_to_real(transformer_module):
|
| 106 |
+
"""Replace complex freqs_cis buffer with real-valued cos/sin concatenation."""
|
| 107 |
+
if hasattr(transformer_module, 'freqs_cis') and transformer_module.freqs_cis is not None:
|
| 108 |
+
fc = transformer_module.freqs_cis # (max_len, head_dim/2) complex
|
| 109 |
+
cos = fc.real.float() # (max_len, head_dim/2)
|
| 110 |
+
sin = fc.imag.float()
|
| 111 |
+
real_freqs = torch.cat([cos, sin], dim=-1) # (max_len, head_dim)
|
| 112 |
+
# Replace the buffer
|
| 113 |
+
del transformer_module.freqs_cis
|
| 114 |
+
transformer_module.register_buffer('freqs_cis', real_freqs)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def patch_kanade_for_coreml(kanade: KanadeModel):
|
| 118 |
+
"""Apply monkey-patches to make Kanade traceable by coremltools."""
|
| 119 |
+
kanade_transformer.Attention.forward = _patched_attention_forward_v2
|
| 120 |
+
# Convert complex freqs_cis to real in all transformers
|
| 121 |
+
for name, module in kanade.named_modules():
|
| 122 |
+
if isinstance(module, kanade_transformer.Transformer):
|
| 123 |
+
_convert_freqs_cis_to_real(module)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
class KanadeDecoderWrapper(nn.Module):
|
| 127 |
+
"""Wraps Kanade's decode pipeline for tracing.
|
| 128 |
+
|
| 129 |
+
Pipeline: token indices → quantizer decode → mel_prenet → upsample →
|
| 130 |
+
mel_decoder (conditioned on speaker) → mel_postnet → mel
|
| 131 |
+
"""
|
| 132 |
+
|
| 133 |
+
def __init__(self, kanade: KanadeModel, num_tokens: int):
|
| 134 |
+
super().__init__()
|
| 135 |
+
self.local_quantizer = kanade.local_quantizer
|
| 136 |
+
self.mel_prenet = kanade.mel_prenet
|
| 137 |
+
self.mel_conv_upsample = kanade.mel_conv_upsample
|
| 138 |
+
self.mel_decoder = kanade.mel_decoder
|
| 139 |
+
self.mel_postnet = kanade.mel_postnet
|
| 140 |
+
self.num_tokens = num_tokens
|
| 141 |
+
# Precompute mel_length for this token count
|
| 142 |
+
self.mel_length = kanade._calculate_target_mel_length(
|
| 143 |
+
kanade._calculate_original_audio_length(num_tokens)
|
| 144 |
+
)
|
| 145 |
+
|
| 146 |
+
def forward(
|
| 147 |
+
self,
|
| 148 |
+
token_indices: torch.Tensor,
|
| 149 |
+
speaker_embedding: torch.Tensor,
|
| 150 |
+
) -> torch.Tensor:
|
| 151 |
+
"""
|
| 152 |
+
Args:
|
| 153 |
+
token_indices: (num_tokens,) int32 — Kanade codebook indices (0-12799)
|
| 154 |
+
speaker_embedding: (1, 128) float32 — speaker embedding
|
| 155 |
+
|
| 156 |
+
Returns:
|
| 157 |
+
mel: (1, 80, mel_length) float32
|
| 158 |
+
"""
|
| 159 |
+
# Quantizer decode: indices → content embedding
|
| 160 |
+
content_emb = self.local_quantizer.decode(token_indices) # (num_tokens, 768)
|
| 161 |
+
content_emb = content_emb.unsqueeze(0) # (1, num_tokens, 768)
|
| 162 |
+
|
| 163 |
+
# Mel prenet (transformer)
|
| 164 |
+
local_latent = self.mel_prenet(content_emb)
|
| 165 |
+
|
| 166 |
+
# Upsample to mel length
|
| 167 |
+
if self.mel_conv_upsample is not None:
|
| 168 |
+
local_latent = self.mel_conv_upsample(
|
| 169 |
+
local_latent.transpose(1, 2)
|
| 170 |
+
).transpose(1, 2)
|
| 171 |
+
local_latent = F.interpolate(
|
| 172 |
+
local_latent.transpose(1, 2), size=self.mel_length, mode="nearest"
|
| 173 |
+
).transpose(1, 2)
|
| 174 |
+
|
| 175 |
+
# Mel decoder (conditioned on speaker)
|
| 176 |
+
mel = self.mel_decoder(local_latent, condition=speaker_embedding.unsqueeze(1))
|
| 177 |
+
mel = mel.transpose(1, 2) # (1, 80, mel_length)
|
| 178 |
+
|
| 179 |
+
# Postnet
|
| 180 |
+
mel = self.mel_postnet(mel)
|
| 181 |
+
return mel
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
class FullVocoderWrapper(nn.Module):
|
| 185 |
+
"""Complete mel → waveform pipeline: F0 prediction + source gen + HiFT decode + iSTFT.
|
| 186 |
+
|
| 187 |
+
Noise is replaced with zeros for deterministic tracing.
|
| 188 |
+
"""
|
| 189 |
+
|
| 190 |
+
def __init__(self, vocoder, num_stft_frames: int):
|
| 191 |
+
super().__init__()
|
| 192 |
+
self.vocoder = vocoder
|
| 193 |
+
self.num_stft_frames = num_stft_frames
|
| 194 |
+
n_fft = vocoder.istft_n_fft # 16
|
| 195 |
+
hop_len = vocoder.istft_hop_len # 4
|
| 196 |
+
|
| 197 |
+
# iDFT basis
|
| 198 |
+
n = torch.arange(n_fft, dtype=torch.float32)
|
| 199 |
+
k = torch.arange(n_fft, dtype=torch.float32)
|
| 200 |
+
angles = 2.0 * torch.pi * n.unsqueeze(1) * k.unsqueeze(0) / n_fft
|
| 201 |
+
self.register_buffer("idft_cos", torch.cos(angles) / n_fft)
|
| 202 |
+
self.register_buffer("idft_sin", torch.sin(angles) / n_fft)
|
| 203 |
+
self.register_buffer("window", vocoder.stft_window.clone())
|
| 204 |
+
|
| 205 |
+
# Source generation constants
|
| 206 |
+
self.sampling_rate = vocoder.m_source.l_sin_gen.sampling_rate
|
| 207 |
+
self.harmonic_num = vocoder.m_source.l_sin_gen.harmonic_num # 8
|
| 208 |
+
self.sine_amp = vocoder.m_source.l_sin_gen.sine_amp # 0.1
|
| 209 |
+
self.upsample_scale = vocoder.m_source.l_sin_gen.upsample_scale # 480
|
| 210 |
+
|
| 211 |
+
# Harmonic multipliers: [1, 2, ..., 9]
|
| 212 |
+
self.register_buffer(
|
| 213 |
+
"harmonic_muls",
|
| 214 |
+
torch.arange(1, self.harmonic_num + 2, dtype=torch.float32),
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
# l_linear and l_tanh from m_source
|
| 218 |
+
self.source_linear = vocoder.m_source.l_linear
|
| 219 |
+
self.source_tanh = vocoder.m_source.l_tanh
|
| 220 |
+
|
| 221 |
+
self.n_fft = n_fft
|
| 222 |
+
self.hop_len = hop_len
|
| 223 |
+
self.n_fft_half = n_fft // 2 + 1
|
| 224 |
+
|
| 225 |
+
def _generate_source(self, f0: torch.Tensor) -> torch.Tensor:
|
| 226 |
+
"""f0: (1, mel_length) → source_stft: (1, 18, stft_frames)"""
|
| 227 |
+
# Upsample f0: (1, mel_length) → (1, 1, mel_length) → nearest → (1, 1, audio_length)
|
| 228 |
+
f0_up = F.interpolate(
|
| 229 |
+
f0.unsqueeze(1), scale_factor=float(self.upsample_scale), mode="nearest"
|
| 230 |
+
).squeeze(1) # (1, audio_length)
|
| 231 |
+
|
| 232 |
+
# Generate harmonics: f0 * [1..9]
|
| 233 |
+
# f0_up: (1, L) → (1, L, 1) * (9,) → (1, L, 9)
|
| 234 |
+
fn = f0_up.unsqueeze(-1) * self.harmonic_muls.unsqueeze(0).unsqueeze(0)
|
| 235 |
+
|
| 236 |
+
# Phase accumulation: cumsum(f/sr) * 2pi
|
| 237 |
+
rad = (fn / self.sampling_rate) # instantaneous frequency in cycles per sample
|
| 238 |
+
phase = torch.cumsum(rad, dim=1) * 2.0 * torch.pi # (1, L, 9)
|
| 239 |
+
|
| 240 |
+
# Sine waves
|
| 241 |
+
sines = torch.sin(phase) * self.sine_amp # (1, L, 9)
|
| 242 |
+
|
| 243 |
+
# UV mask (voiced/unvoiced)
|
| 244 |
+
uv = (f0_up > 0).float().unsqueeze(-1) # (1, L, 1)
|
| 245 |
+
|
| 246 |
+
# Apply UV (no noise — zeros instead of randn for tracing)
|
| 247 |
+
sines = sines * uv # (1, L, 9)
|
| 248 |
+
|
| 249 |
+
# l_linear + tanh: (1, L, 9) → linear → (1, L, 1) → tanh
|
| 250 |
+
source = self.source_tanh(self.source_linear(sines)) # (1, L, 1)
|
| 251 |
+
source = source.squeeze(-1) # (1, L)
|
| 252 |
+
|
| 253 |
+
# Manual STFT (torch.stft/unfold not CoreML-compatible)
|
| 254 |
+
# n_fft=16, hop=4. With center padding, we get num_stft_frames frames.
|
| 255 |
+
# Pad source: reflect pad n_fft//2 on each side
|
| 256 |
+
padded = F.pad(source, (self.n_fft // 2, self.n_fft // 2), mode="reflect")
|
| 257 |
+
# padded: (1, L + n_fft) where L = audio_length
|
| 258 |
+
|
| 259 |
+
# Extract overlapping frames using conv1d with identity kernel
|
| 260 |
+
# This replaces unfold: conv1d with (n_fft, 1, n_fft) identity kernel, stride=hop
|
| 261 |
+
# Equivalent to: frames[i] = padded[i*hop : i*hop + n_fft]
|
| 262 |
+
eye_kernel = torch.eye(self.n_fft, dtype=source.dtype, device=source.device).unsqueeze(1)
|
| 263 |
+
# padded: (1, L+16) → (1, 1, L+16) for conv1d
|
| 264 |
+
frames = F.conv1d(padded.unsqueeze(1), eye_kernel, stride=self.hop_len)
|
| 265 |
+
# frames: (1, 16, num_frames)
|
| 266 |
+
frames = frames * self.window.unsqueeze(0).unsqueeze(-1) # window each frame
|
| 267 |
+
# Transpose to (1, num_frames, 16) for matmul
|
| 268 |
+
frames = frames.transpose(1, 2)
|
| 269 |
+
|
| 270 |
+
# DFT via matmul
|
| 271 |
+
dft_cos = self.idft_cos[:self.n_fft_half, :] * self.n_fft # undo 1/N normalization
|
| 272 |
+
dft_sin = self.idft_sin[:self.n_fft_half, :] * self.n_fft
|
| 273 |
+
s_real = torch.matmul(frames, dft_cos.T) # (1, NF, 9)
|
| 274 |
+
s_imag = -torch.matmul(frames, dft_sin.T) # (1, NF, 9)
|
| 275 |
+
source_stft = torch.cat([s_real.transpose(1, 2), s_imag.transpose(1, 2)], dim=1)
|
| 276 |
+
return source_stft
|
| 277 |
+
|
| 278 |
+
def _istft_overlap_add(self, x: torch.Tensor) -> torch.Tensor:
|
| 279 |
+
"""x: (1, 18, num_frames) conv_post output → waveform (1, samples)"""
|
| 280 |
+
magnitude = torch.exp(x[:, :self.n_fft_half, :])
|
| 281 |
+
phase = torch.sin(x[:, self.n_fft_half:, :])
|
| 282 |
+
|
| 283 |
+
real_half = magnitude * torch.cos(phase)
|
| 284 |
+
imag_half = magnitude * torch.sin(phase)
|
| 285 |
+
|
| 286 |
+
real_mirror = torch.flip(real_half[:, 1:self.n_fft_half - 1, :], dims=[1])
|
| 287 |
+
imag_mirror = -torch.flip(imag_half[:, 1:self.n_fft_half - 1, :], dims=[1])
|
| 288 |
+
real_full = torch.cat([real_half, real_mirror], dim=1)
|
| 289 |
+
imag_full = torch.cat([imag_half, imag_mirror], dim=1)
|
| 290 |
+
|
| 291 |
+
real_t = real_full.transpose(1, 2)
|
| 292 |
+
imag_t = imag_full.transpose(1, 2)
|
| 293 |
+
segments = torch.matmul(real_t, self.idft_cos.T) - torch.matmul(imag_t, self.idft_sin.T)
|
| 294 |
+
|
| 295 |
+
NF = self.num_stft_frames
|
| 296 |
+
segments = segments * self.window.unsqueeze(0).unsqueeze(0)
|
| 297 |
+
seg = segments.squeeze(0)
|
| 298 |
+
seg_chunks = seg.reshape(NF, 4, 4)
|
| 299 |
+
|
| 300 |
+
b0 = seg_chunks[:, 0, :].reshape(-1)
|
| 301 |
+
b1 = seg_chunks[:, 1, :].reshape(-1)
|
| 302 |
+
b2 = seg_chunks[:, 2, :].reshape(-1)
|
| 303 |
+
b3 = seg_chunks[:, 3, :].reshape(-1)
|
| 304 |
+
|
| 305 |
+
F4 = NF * 4
|
| 306 |
+
padded_samples = NF * 4 + 12
|
| 307 |
+
output = torch.zeros(padded_samples)
|
| 308 |
+
output[0:F4] = output[0:F4] + b0
|
| 309 |
+
output[4:F4 + 4] = output[4:F4 + 4] + b1
|
| 310 |
+
output[8:F4 + 8] = output[8:F4 + 8] + b2
|
| 311 |
+
output[12:F4 + 12] = output[12:F4 + 12] + b3
|
| 312 |
+
|
| 313 |
+
win_sq = self.window * self.window
|
| 314 |
+
win_chunks = win_sq.reshape(4, 4)
|
| 315 |
+
w0 = win_chunks[0].repeat(NF)
|
| 316 |
+
w1 = win_chunks[1].repeat(NF)
|
| 317 |
+
w2 = win_chunks[2].repeat(NF)
|
| 318 |
+
w3 = win_chunks[3].repeat(NF)
|
| 319 |
+
|
| 320 |
+
wnorm = torch.zeros(padded_samples)
|
| 321 |
+
wnorm[0:F4] = wnorm[0:F4] + w0
|
| 322 |
+
wnorm[4:F4 + 4] = wnorm[4:F4 + 4] + w1
|
| 323 |
+
wnorm[8:F4 + 8] = wnorm[8:F4 + 8] + w2
|
| 324 |
+
wnorm[12:F4 + 12] = wnorm[12:F4 + 12] + w3
|
| 325 |
+
|
| 326 |
+
output = output / (wnorm + 1e-8)
|
| 327 |
+
pad = 8
|
| 328 |
+
trimmed_len = (NF - 1) * 4
|
| 329 |
+
output = output[pad:pad + trimmed_len]
|
| 330 |
+
output = torch.clamp(output, -0.99, 0.99)
|
| 331 |
+
return output.unsqueeze(0)
|
| 332 |
+
|
| 333 |
+
def forward(self, mel: torch.Tensor) -> torch.Tensor:
|
| 334 |
+
"""mel: (1, 80, T) → waveform: (1, samples)"""
|
| 335 |
+
# F0 prediction
|
| 336 |
+
f0 = self.vocoder.f0_predictor(mel) # (1, T)
|
| 337 |
+
|
| 338 |
+
# Source generation
|
| 339 |
+
source_stft = self._generate_source(f0)
|
| 340 |
+
|
| 341 |
+
# HiFT decode
|
| 342 |
+
x = self.vocoder.conv_pre(mel)
|
| 343 |
+
for i in range(self.vocoder.num_upsamples):
|
| 344 |
+
x = F.leaky_relu(x, self.vocoder.lrelu_slope)
|
| 345 |
+
x = self.vocoder.ups[i](x)
|
| 346 |
+
if i == self.vocoder.num_upsamples - 1:
|
| 347 |
+
x = self.vocoder.reflection_pad(x)
|
| 348 |
+
si = self.vocoder.source_downs[i](source_stft)
|
| 349 |
+
si = self.vocoder.source_resblocks[i](si)
|
| 350 |
+
x = x + si
|
| 351 |
+
xs = None
|
| 352 |
+
for j in range(self.vocoder.num_kernels):
|
| 353 |
+
if xs is None:
|
| 354 |
+
xs = self.vocoder.resblocks[i * self.vocoder.num_kernels + j](x)
|
| 355 |
+
else:
|
| 356 |
+
xs += self.vocoder.resblocks[i * self.vocoder.num_kernels + j](x)
|
| 357 |
+
x = xs / self.vocoder.num_kernels
|
| 358 |
+
|
| 359 |
+
x = F.leaky_relu(x)
|
| 360 |
+
x = self.vocoder.conv_post(x)
|
| 361 |
+
|
| 362 |
+
return self._istft_overlap_add(x)
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
class F0PredictorWrapper(nn.Module):
|
| 366 |
+
"""Wraps HiFT's f0 predictor: mel → f0."""
|
| 367 |
+
|
| 368 |
+
def __init__(self, vocoder):
|
| 369 |
+
super().__init__()
|
| 370 |
+
self.f0_predictor = vocoder.f0_predictor
|
| 371 |
+
|
| 372 |
+
def forward(self, mel: torch.Tensor) -> torch.Tensor:
|
| 373 |
+
"""mel: (1, 80, T) → f0: (1, 1, T)"""
|
| 374 |
+
return self.f0_predictor(mel)
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
class HiFTDecodeWrapper(nn.Module):
    """Wraps HiFT's decode stage: mel + source_stft → waveform.

    Includes a manual iSTFT implementation using matmul with a precomputed
    DFT basis matrix, so the entire pipeline runs inside CoreML (no FFT op).
    The number of STFT frames is fixed at construction time so the traced
    graph has fully static shapes.
    """

    def __init__(self, vocoder, num_stft_frames: int):
        """
        Args:
            vocoder: HiFT vocoder whose submodules (conv_pre, ups, resblocks,
                source_downs, source_resblocks, conv_post) are reused directly.
            num_stft_frames: Fixed frame count of the output STFT; baked in
                because tracing cannot handle dynamic shapes here.
        """
        super().__init__()
        self.vocoder = vocoder
        self.num_stft_frames = num_stft_frames  # hardcoded for tracing
        n_fft = vocoder.istft_n_fft  # 16
        hop_len = vocoder.istft_hop_len  # 4

        # Precompute DFT basis for iSTFT: (n_fft, n_fft) real-valued IDFT matrix.
        # X[k] = sum_n x[n] * exp(-j*2pi*n*k/N) → x[n] = (1/N) * sum_k X[k] * exp(j*2pi*n*k/N)
        n = torch.arange(n_fft, dtype=torch.float32)
        k = torch.arange(n_fft, dtype=torch.float32)
        angles = 2.0 * torch.pi * n.unsqueeze(1) * k.unsqueeze(0) / n_fft  # (n_fft, n_fft)
        # cos/sin basis for real/imag parts; 1/N normalization folded into the basis.
        self.register_buffer("idft_cos", torch.cos(angles) / n_fft)  # (n_fft, n_fft)
        self.register_buffer("idft_sin", torch.sin(angles) / n_fft)  # (n_fft, n_fft)

        # Analysis/synthesis window used for the overlap-add reconstruction.
        self.register_buffer("window", vocoder.stft_window.clone())
        self.n_fft = n_fft
        self.hop_len = hop_len
        self.n_fft_half = n_fft // 2 + 1  # 9 (one-sided spectrum size)

    def forward(self, mel: torch.Tensor, source_stft: torch.Tensor) -> torch.Tensor:
        """
        Args:
            mel: (1, 80, T) float32
            source_stft: (1, 18, T') float32 — real+imag STFT of source signal

        Returns:
            waveform: (1, samples) float32, clamped to (-0.99, 0.99)
        """
        # --- HiFT generator body: upsample mel while injecting the source STFT ---
        x = self.vocoder.conv_pre(mel)
        for i in range(self.vocoder.num_upsamples):
            x = F.leaky_relu(x, self.vocoder.lrelu_slope)
            x = self.vocoder.ups[i](x)
            if i == self.vocoder.num_upsamples - 1:
                x = self.vocoder.reflection_pad(x)

            # Downsample the source STFT to the current resolution and add it in.
            si = self.vocoder.source_downs[i](source_stft)
            si = self.vocoder.source_resblocks[i](si)
            x = x + si

            # Average the parallel multi-kernel residual blocks (MRF).
            xs = None
            for j in range(self.vocoder.num_kernels):
                if xs is None:
                    xs = self.vocoder.resblocks[i * self.vocoder.num_kernels + j](x)
                else:
                    xs += self.vocoder.resblocks[i * self.vocoder.num_kernels + j](x)
            x = xs / self.vocoder.num_kernels

        x = F.leaky_relu(x)
        x = self.vocoder.conv_post(x)  # (1, 18, num_frames)

        # Split head output into log-magnitude and phase channels.
        # NOTE(review): sin() on the raw phase channel, then cos/sin again below,
        # mirrors the upstream HiFT head — confirm against the reference vocoder.
        magnitude = torch.exp(x[:, :self.n_fft_half, :])  # (1, 9, num_frames)
        phase = torch.sin(x[:, self.n_fft_half:, :])  # (1, 9, num_frames)

        # Convert polar (magnitude, phase) to rectangular real/imag.
        real_half = magnitude * torch.cos(phase)  # (1, 9, num_frames)
        imag_half = magnitude * torch.sin(phase)

        # Mirror to full spectrum (Hermitian symmetry of a real signal's DFT):
        # real: [r0, r1, ..., r8, r7, r6, ..., r1]
        # imag: [i0, i1, ..., i8, -i7, -i6, ..., -i1]
        real_mirror = torch.flip(real_half[:, 1:self.n_fft_half - 1, :], dims=[1])
        imag_mirror = -torch.flip(imag_half[:, 1:self.n_fft_half - 1, :], dims=[1])
        real_full = torch.cat([real_half, real_mirror], dim=1)  # (1, 16, num_frames)
        imag_full = torch.cat([imag_half, imag_mirror], dim=1)  # (1, 16, num_frames)

        # iDFT via matmul: segments[frame, n] = sum_k (real[frame,k]*idft_cos[n,k]
        #                                             - imag[frame,k]*idft_sin[n,k])
        # Transpose to (1, num_frames, 16) so the contraction is over k.
        real_t = real_full.transpose(1, 2)  # (1, num_frames, 16)
        imag_t = imag_full.transpose(1, 2)
        segments = torch.matmul(real_t, self.idft_cos.T) - torch.matmul(imag_t, self.idft_sin.T)
        # segments: (1, num_frames, 16) — one 16-sample time-domain frame per STFT frame.

        # --- Overlap-add with window, using only static slicing ---
        # n_fft=16, hop=4 → each output sample is covered by exactly 4 frames.
        NF = self.num_stft_frames  # hardcoded constant for tracing
        segments = segments * self.window.unsqueeze(0).unsqueeze(0)  # (1, NF, 16)
        seg = segments.squeeze(0)  # (NF, 16)

        # Split each 16-sample frame into 4 hop-sized chunks of 4 samples.
        seg_chunks = seg.reshape(NF, 4, 4)  # (F, 4_blocks, 4_samples)

        # Chunk b of frame f lands at output[(f+b)*4 : (f+b)*4 + 4].
        # For a fixed b, the F chunks are contiguous in the output, shifted by b*4,
        # so the whole overlap-add reduces to 4 static shifted adds.
        padded_samples = NF * 4 + 12  # (NF-1)*4 + 16

        b0 = seg_chunks[:, 0, :].reshape(-1)  # (F*4,) → output[0 : F*4]
        b1 = seg_chunks[:, 1, :].reshape(-1)  # (F*4,) → output[4 : F*4 + 4]
        b2 = seg_chunks[:, 2, :].reshape(-1)  # (F*4,) → output[8 : F*4 + 8]
        b3 = seg_chunks[:, 3, :].reshape(-1)  # (F*4,) → output[12 : F*4 + 12]

        F4 = NF * 4
        output = torch.zeros(padded_samples)
        output[0:F4] = output[0:F4] + b0
        output[4:F4 + 4] = output[4:F4 + 4] + b1
        output[8:F4 + 8] = output[8:F4 + 8] + b2
        output[12:F4 + 12] = output[12:F4 + 12] + b3

        # Window-squared normalization, accumulated with the same 4 shifted adds.
        win_sq = self.window * self.window  # (16,)
        win_chunks = win_sq.reshape(4, 4)  # (4_blocks, 4_samples)
        w0 = win_chunks[0].repeat(NF)
        w1 = win_chunks[1].repeat(NF)
        w2 = win_chunks[2].repeat(NF)
        w3 = win_chunks[3].repeat(NF)

        wnorm = torch.zeros(padded_samples)
        wnorm[0:F4] = wnorm[0:F4] + w0
        wnorm[4:F4 + 4] = wnorm[4:F4 + 4] + w1
        wnorm[8:F4 + 8] = wnorm[8:F4 + 8] + w2
        wnorm[12:F4 + 12] = wnorm[12:F4 + 12] + w3

        # Epsilon guards against division by zero at the padded edges.
        output = output / (wnorm + 1e-8)

        # Trim centered-STFT padding: n_fft//2 = 8 samples from the start.
        pad = 8
        trimmed_len = (NF - 1) * 4  # expected output length
        output = output[pad:pad + trimmed_len]
        output = torch.clamp(output, -0.99, 0.99)
        return output.unsqueeze(0)  # (1, samples)
|
| 525 |
+
|
| 526 |
+
|
| 527 |
+
def convert_kanade_decoder(kanade: KanadeModel, num_tokens: int, output_dir: Path):
    """Trace the Kanade decoder and export it as KanadeDecoder.mlpackage.

    Args:
        kanade: Loaded Kanade model; wrapped for tracing with a fixed token count.
        num_tokens: Number of audio tokens baked into the traced graph.
        output_dir: Directory that receives the mlpackage.
    """
    wrapper = KanadeDecoderWrapper(kanade, num_tokens).eval().float()
    print(f"Tracing Kanade decoder (num_tokens={num_tokens}, mel_length={wrapper.mel_length})...")

    # Example inputs for tracing: sequential token ids plus a random speaker vector.
    example_tokens = torch.arange(num_tokens, dtype=torch.int32)
    example_speaker = torch.randn(1, 128, dtype=torch.float32)

    with torch.no_grad():
        # Sanity-check the forward pass before committing to a trace.
        mel_out = wrapper(example_tokens, example_speaker)
        print(f"  Output mel shape: {mel_out.shape}")

    traced = torch.jit.trace(wrapper, (example_tokens, example_speaker))

    print("Converting Kanade decoder to CoreML...")
    mlmodel = ct.convert(
        traced,
        inputs=[
            ct.TensorType(name="token_indices", shape=(num_tokens,), dtype=np.int32),
            ct.TensorType(name="speaker_embedding", shape=(1, 128), dtype=np.float32),
        ],
        outputs=[ct.TensorType(name="mel", dtype=np.float32)],
        compute_precision=ct.precision.FLOAT32,
        minimum_deployment_target=ct.target.iOS17,
    )

    destination = output_dir / "KanadeDecoder.mlpackage"
    mlmodel.save(str(destination))
    print(f"Saved Kanade decoder to {destination}")
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
def convert_f0_predictor(vocoder, mel_length: int, output_dir: Path):
    """Trace HiFT's f0 predictor and export it as F0Predictor.mlpackage.

    Args:
        vocoder: HiFT vocoder providing the f0_predictor submodule.
        mel_length: Fixed mel time dimension baked into the traced graph.
        output_dir: Directory that receives the mlpackage.
    """
    wrapper = F0PredictorWrapper(vocoder).eval().float()
    print(f"Tracing F0 predictor (mel_length={mel_length})...")

    example_mel = torch.randn(1, 80, mel_length, dtype=torch.float32)

    with torch.no_grad():
        # Sanity-check the forward pass, then trace with the same example input.
        predicted = wrapper(example_mel)
        print(f"  Output f0 shape: {predicted.shape}")
        traced = torch.jit.trace(wrapper, (example_mel,))

    print("Converting F0 predictor to CoreML...")
    mlmodel = ct.convert(
        traced,
        inputs=[
            ct.TensorType(name="mel", shape=(1, 80, mel_length), dtype=np.float32),
        ],
        outputs=[ct.TensorType(name="f0", dtype=np.float32)],
        compute_precision=ct.precision.FLOAT32,
        minimum_deployment_target=ct.target.iOS17,
    )

    destination = output_dir / "F0Predictor.mlpackage"
    mlmodel.save(str(destination))
    print(f"Saved F0 predictor to {destination}")
|
| 585 |
+
|
| 586 |
+
|
| 587 |
+
def convert_hift_decode(vocoder, mel_length: int, output_dir: Path):
    """Convert HiFT decode stage to CoreML.

    Source signal STFT must be computed externally (Swift side); this export
    covers only mel + source_stft → waveform.

    Args:
        vocoder: HiFT vocoder whose decode submodules are wrapped for tracing.
        mel_length: Fixed mel time dimension baked into the traced graph.
        output_dir: Directory that receives HiFTDecode.mlpackage.
    """
    # Compute source_stft shape: run f0 predictor + source module to get it.
    # The frame count must be known up front so HiFTDecodeWrapper can unroll
    # its static overlap-add.
    mel = torch.randn(1, 80, mel_length, dtype=torch.float32)
    with torch.no_grad():
        f0 = vocoder.f0_predictor(mel)
        s = vocoder.f0_upsamp(f0[:, None]).transpose(1, 2)
        s, _, _ = vocoder.m_source(s)
        s = s.transpose(1, 2)
        s_stft_real, s_stft_imag = vocoder._stft(s.squeeze(1))
        # Stack real and imaginary parts along channels (9 + 9 = 18).
        source_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
    num_stft_frames = source_stft.shape[2]
    print(f"  Source STFT shape: {source_stft.shape} ({num_stft_frames} frames)")

    wrapper = HiFTDecodeWrapper(vocoder, num_stft_frames).eval().float()

    print(f"Tracing HiFT decode (mel_length={mel_length})...")
    with torch.no_grad():
        waveform = wrapper(mel, source_stft)
        print(f"  Output waveform shape: {waveform.shape}")
        traced = torch.jit.trace(wrapper, (mel, source_stft))

    print("Converting HiFT decode to CoreML...")
    source_stft_channels = source_stft.shape[1]
    source_stft_time = source_stft.shape[2]
    mlmodel = ct.convert(
        traced,
        inputs=[
            ct.TensorType(name="mel", shape=(1, 80, mel_length), dtype=np.float32),
            ct.TensorType(
                name="source_stft",
                shape=(1, source_stft_channels, source_stft_time),
                dtype=np.float32,
            ),
        ],
        outputs=[ct.TensorType(name="waveform", dtype=np.float32)],
        compute_precision=ct.precision.FLOAT32,
        minimum_deployment_target=ct.target.iOS17,
    )

    out_path = output_dir / "HiFTDecode.mlpackage"
    mlmodel.save(str(out_path))
    print(f"Saved HiFT decode to {out_path}")
|
| 633 |
+
|
| 634 |
+
|
| 635 |
+
def main():
    """CLI entry point: load Kanade + HiFT and export the CoreML packages."""
    parser = argparse.ArgumentParser(description="Convert Kanade + HiFT to CoreML")
    parser.add_argument(
        "--output-dir", type=str,
        default=str(Path(__file__).parent.parent),
        help="Output directory",
    )
    parser.add_argument(
        "--num-tokens", type=int, default=100,
        help="Fixed number of audio tokens (determines mel length)",
    )
    args = parser.parse_args()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("Loading Kanade model...")
    kanade = KanadeModel.from_pretrained("frothywater/kanade-25hz-clean").eval().float()
    patch_kanade_for_coreml(kanade)
    vocoder = load_vocoder(kanade.config.vocoder_name).eval().float()

    # Mel length is a fixed function of the token count; needed because the
    # exported graphs use fully static shapes.
    mel_length = kanade._calculate_target_mel_length(
        kanade._calculate_original_audio_length(args.num_tokens)
    )

    # Fix: these two banners had no placeholders, so the f-prefix was noise (F541).
    print("\n=== Converting Kanade decoder ===")
    convert_kanade_decoder(kanade, args.num_tokens, output_dir)

    print("\n=== Converting full vocoder (mel → waveform) ===")
    convert_full_vocoder(vocoder, mel_length, output_dir)

    print("\nDone!")
    print(f"  KanadeDecoder: {args.num_tokens} tokens → mel (80, {mel_length})")
    print(f"  Vocoder: mel (80, {mel_length}) → waveform")
|
| 669 |
+
|
| 670 |
+
|
| 671 |
+
def convert_full_vocoder(vocoder, mel_length: int, output_dir: Path):
    """Convert complete mel→waveform vocoder to CoreML.

    Args:
        vocoder: HiFT vocoder to wrap and trace end to end.
        mel_length: Fixed mel time dimension baked into the traced graph.
        output_dir: Directory that receives Vocoder.mlpackage.
    """
    # Get num_stft_frames by running a dummy forward through the source path;
    # the wrapper needs it for its static overlap-add.
    mel = torch.randn(1, 80, mel_length, dtype=torch.float32)
    with torch.no_grad():
        f0 = vocoder.f0_predictor(mel)
        s = vocoder.f0_upsamp(f0[:, None]).transpose(1, 2)
        s, _, _ = vocoder.m_source(s)
        s = s.transpose(1, 2)
        sr, si = vocoder._stft(s.squeeze(1))
    num_stft_frames = sr.shape[2]
    print(f"  STFT frames: {num_stft_frames}")

    wrapper = FullVocoderWrapper(vocoder, num_stft_frames).eval().float()

    print(f"Tracing full vocoder (mel_length={mel_length})...")
    # Replace randn_like with zeros so tracing is deterministic.
    # Fix: restore the monkeypatch in a finally-block — previously an exception
    # during tracing would leave torch.randn_like patched for the whole process.
    orig_randn = torch.randn_like
    torch.randn_like = lambda x, **kw: torch.zeros_like(x)
    try:
        with torch.no_grad():
            wav = wrapper(mel)
            print(f"  Output waveform: {wav.shape}")
            traced = torch.jit.trace(wrapper, (mel,))
    finally:
        torch.randn_like = orig_randn

    print("Converting full vocoder to CoreML...")
    mlmodel = ct.convert(
        traced,
        inputs=[ct.TensorType(name="mel", shape=(1, 80, mel_length), dtype=np.float32)],
        outputs=[ct.TensorType(name="waveform", dtype=np.float32)],
        compute_precision=ct.precision.FLOAT32,
        minimum_deployment_target=ct.target.iOS17,
    )

    out_path = output_dir / "Vocoder.mlpackage"
    mlmodel.save(str(out_path))
    print(f"Saved vocoder to {out_path}")
|
| 708 |
+
|
| 709 |
+
|
| 710 |
+
# Script entry point: run the full conversion pipeline.
if __name__ == "__main__":
    main()
|
scripts/model_wrapper.py
CHANGED
|
@@ -118,8 +118,8 @@ class PlaprePicoPrefill(nn.Module):
|
|
| 118 |
class PlaprePicoDecode(nn.Module):
|
| 119 |
"""Generates one token at a time using the KV cache.
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
|
| 124 |
Inputs:
|
| 125 |
input_ids: (1, 1) int32
|
|
@@ -127,6 +127,7 @@ class PlaprePicoDecode(nn.Module):
|
|
| 127 |
cos: (1, 1, 1, 64) float16 — RoPE cos for current position
|
| 128 |
sin: (1, 1, 1, 64) float16 — RoPE sin for current position
|
| 129 |
update_mask: (1, 1, 2048, 1) float16 — one-hot at current position
|
|
|
|
| 130 |
|
| 131 |
State buffers:
|
| 132 |
k_cache_0..29, v_cache_0..29: (1, 3, 2048, 64) float16
|
|
@@ -174,8 +175,12 @@ class PlaprePicoDecode(nn.Module):
|
|
| 174 |
cos: torch.Tensor,
|
| 175 |
sin: torch.Tensor,
|
| 176 |
update_mask: torch.Tensor,
|
|
|
|
| 177 |
) -> torch.Tensor:
|
| 178 |
hidden = self.embed_tokens(input_ids) # (1, 1, 576)
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
for i, layer in enumerate(self.layers):
|
| 181 |
k_cache = getattr(self, f"k_cache_{i}")
|
|
|
|
| 118 |
class PlaprePicoDecode(nn.Module):
|
| 119 |
"""Generates one token at a time using the KV cache.
|
| 120 |
|
| 121 |
+
Also used for token-by-token prefill. For the speaker token (position 0),
|
| 122 |
+
pass a non-zero speaker_hidden to replace the token embedding.
|
| 123 |
|
| 124 |
Inputs:
|
| 125 |
input_ids: (1, 1) int32
|
|
|
|
| 127 |
cos: (1, 1, 1, 64) float16 — RoPE cos for current position
|
| 128 |
sin: (1, 1, 1, 64) float16 — RoPE sin for current position
|
| 129 |
update_mask: (1, 1, 2048, 1) float16 — one-hot at current position
|
| 130 |
+
speaker_hidden: (1, 1, 576) float16 — pre-projected speaker embedding, or zeros
|
| 131 |
|
| 132 |
State buffers:
|
| 133 |
k_cache_0..29, v_cache_0..29: (1, 3, 2048, 64) float16
|
|
|
|
| 175 |
cos: torch.Tensor,
|
| 176 |
sin: torch.Tensor,
|
| 177 |
update_mask: torch.Tensor,
|
| 178 |
+
speaker_hidden: torch.Tensor,
|
| 179 |
) -> torch.Tensor:
|
| 180 |
hidden = self.embed_tokens(input_ids) # (1, 1, 576)
|
| 181 |
+
# Speaker conditioning: caller passes pre-projected (1,1,576) for position 0,
|
| 182 |
+
# zeros for all other positions. Additive — zeros are a no-op.
|
| 183 |
+
hidden = hidden + speaker_hidden
|
| 184 |
|
| 185 |
for i, layer in enumerate(self.layers):
|
| 186 |
k_cache = getattr(self, f"k_cache_{i}")
|
scripts/test_generate.py
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
End-to-end test: generate Danish speech using our custom PyTorch wrappers
|
| 4 |
+
(the same code converted to CoreML), decode with Kanade, save as WAV.
|
| 5 |
+
|
| 6 |
+
Usage:
|
| 7 |
+
python scripts/test_generate.py [--text "Hej verden"] [--speaker tor] [--output test.wav]
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import argparse
|
| 11 |
+
import json
|
| 12 |
+
import sys
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
|
| 15 |
+
import numpy as np
|
| 16 |
+
import torch
|
| 17 |
+
import torch.nn.functional as F
|
| 18 |
+
import soundfile as sf
|
| 19 |
+
|
| 20 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 21 |
+
|
| 22 |
+
from attention import precompute_rope_frequencies
|
| 23 |
+
from model_wrapper import (
|
| 24 |
+
PlaprePicoPrefill,
|
| 25 |
+
PlaprePicoDecode,
|
| 26 |
+
NUM_LAYERS,
|
| 27 |
+
MAX_CONTEXT,
|
| 28 |
+
HEAD_DIM,
|
| 29 |
+
PREFILL_SEQ_LEN,
|
| 30 |
+
SPEAKER_DIM,
|
| 31 |
+
)
|
| 32 |
+
from convert import load_weights, populate_weights
|
| 33 |
+
|
| 34 |
+
# Token-id layout of the Plapre vocabulary.
# Audio tokens occupy [AUDIO_TOKEN_OFFSET, 20801]; Kanade index = id - offset
# (see decode_audio). EOS doubles as padding and the position-0 placeholder
# that prefill replaces with the projected speaker embedding (see generate).
AUDIO_TOKEN_OFFSET = 8002
AUDIO_MARKER_TOKEN = 8001
TEXT_MARKER_TOKEN = 8000
EOS_TOKEN = 2
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def load_speaker(speakers_path: Path, name: str) -> torch.Tensor:
    """Load the embedding for *name* from a speakers JSON file.

    Returns:
        A (1, D) float16 tensor ready to feed the model.

    Raises:
        ValueError: if *name* is not a key in the JSON file.
    """
    speakers = json.loads(Path(speakers_path).read_text())
    if name not in speakers:
        raise ValueError(f"Speaker '{name}' not found. Available: {list(speakers.keys())}")
    embedding = torch.tensor(speakers[name], dtype=torch.float16)
    return embedding.unsqueeze(0)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def sample(logits: torch.Tensor, temperature: float, top_k: int, top_p: float) -> int:
    """Sample a token id from 1-D logits with temperature, top-k and top-p filtering.

    temperature <= 0 degenerates to greedy argmax; top_k <= 0 disables the
    top-k filter.
    """
    if temperature <= 0:
        return int(logits.argmax())

    scaled = logits.float() / temperature

    # Top-k: keep only the k largest logits, -inf everywhere else.
    if top_k > 0:
        kept_vals, kept_idx = torch.topk(scaled, top_k)
        filtered = torch.full_like(scaled, float("-inf"))
        filtered.scatter_(0, kept_idx, kept_vals)
    else:
        filtered = scaled

    probs = F.softmax(filtered, dim=-1)

    # Top-p (nucleus): zero out tokens once the cumulative mass *before* them
    # exceeds top_p, then renormalize. The most probable token always survives.
    sorted_probs, sorted_idx = torch.sort(probs, descending=True)
    cumulative = torch.cumsum(sorted_probs, dim=0)
    drop = cumulative - sorted_probs > top_p
    sorted_probs = torch.where(drop, torch.zeros_like(sorted_probs), sorted_probs)
    sorted_probs = sorted_probs / sorted_probs.sum()

    choice = torch.multinomial(sorted_probs, 1)
    return int(sorted_idx[choice])
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def generate(
    prefill_model: PlaprePicoPrefill,
    decode_model: PlaprePicoDecode,
    text: str,
    speaker_embedding: torch.Tensor,
    tokenizer_path: Path,
    max_tokens: int,
    temperature: float,
    top_k: int,
    top_p: float,
) -> list[int]:
    """Tokenize *text*, prefill the LM, then autoregressively decode token ids.

    Returns the full list of sampled token ids (audio tokens plus a possible
    trailing EOS). Mutates the KV-cache buffers of both models.
    """
    from tokenizers import Tokenizer
    tokenizer = Tokenizer.from_file(str(tokenizer_path))
    token_ids = tokenizer.encode(text).ids

    # Plapre format: [placeholder, <text>, tokens..., <audio>]
    # Position 0 placeholder gets replaced by speaker_proj output below.
    input_ids_list = [EOS_TOKEN] + [TEXT_MARKER_TOKEN] + token_ids + [AUDIO_MARKER_TOKEN]
    input_len = len(input_ids_list)
    print(f"Input ({input_len} tokens): {input_ids_list}")

    # Pad to the fixed prefill length (EOS as padding).
    padded_ids = torch.full((1, PREFILL_SEQ_LEN), EOS_TOKEN, dtype=torch.int32)
    for i, tid in enumerate(input_ids_list):
        padded_ids[0, i] = tid

    # Causal mask: only real tokens (0..input_len-1) attend; padding rows stay -inf.
    causal_mask = torch.full(
        (1, 1, PREFILL_SEQ_LEN, MAX_CONTEXT), float("-inf"), dtype=torch.float16
    )
    for i in range(input_len):
        causal_mask[0, 0, i, :i + 1] = 0.0

    # === Prefill ===
    # We can't get logits at an arbitrary position from the wrapper (it returns pos -1).
    # So run the layers manually to read logits at input_len - 1.
    print("Running prefill...")
    with torch.no_grad():
        hidden = prefill_model.embed_tokens(padded_ids)
        # Replace the position-0 embedding with the projected speaker embedding.
        spk = prefill_model.speaker_proj(speaker_embedding).unsqueeze(1)
        hidden = torch.cat([spk, hidden[:, 1:, :]], dim=1)

        cos = prefill_model.rope_cos
        sin = prefill_model.rope_sin

        for i, layer in enumerate(prefill_model.layers):
            k_cache = getattr(prefill_model, f"k_cache_{i}")
            v_cache = getattr(prefill_model, f"v_cache_{i}")
            hidden, k_new, v_new = layer(hidden, cos, sin, causal_mask, k_cache, v_cache)
            # Update caches on the model so decode can copy them.
            setattr(prefill_model, f"k_cache_{i}", k_new)
            setattr(prefill_model, f"v_cache_{i}", v_new)

        hidden = prefill_model.norm(hidden)
        # Tied LM head: project with the embedding matrix at the last real position.
        logits = F.linear(hidden[0, input_len - 1, :], prefill_model.embed_tokens.weight)

    generated = []
    next_token = sample(logits, temperature, top_k, top_p)
    generated.append(next_token)
    print(f"  Token 0: {next_token}")

    # === Copy KV cache to decode model ===
    with torch.no_grad():
        for i in range(NUM_LAYERS):
            getattr(decode_model, f"k_cache_{i}").copy_(getattr(prefill_model, f"k_cache_{i}"))
            getattr(decode_model, f"v_cache_{i}").copy_(getattr(prefill_model, f"v_cache_{i}"))

    # === Decode loop ===
    # RoPE tables for all positions; sliced per step below.
    cos_full, sin_full = precompute_rope_frequencies(HEAD_DIM, MAX_CONTEXT, 100000.0)
    cos_full = cos_full.half()
    sin_full = sin_full.half()

    print("Decoding...")
    for step in range(1, max_tokens):
        pos = input_len + step - 1

        decode_ids = torch.tensor([[next_token]], dtype=torch.int32)
        # Attend to everything up to and including the current position.
        decode_mask = torch.full((1, 1, 1, MAX_CONTEXT), float("-inf"), dtype=torch.float16)
        decode_mask[0, 0, 0, :pos + 1] = 0.0
        pos_cos = cos_full[:, :, pos:pos + 1, :]
        pos_sin = sin_full[:, :, pos:pos + 1, :]
        # One-hot mask telling the model which cache slot to write.
        update_mask = torch.zeros(1, 1, MAX_CONTEXT, 1, dtype=torch.float16)
        update_mask[0, 0, pos, 0] = 1.0

        with torch.no_grad():
            # NOTE(review): the updated PlaprePicoDecode.forward appears to take an
            # additional speaker_hidden argument — confirm this call site matches
            # the current signature (pass zeros for non-speaker positions?).
            logits = decode_model(decode_ids, decode_mask, pos_cos, pos_sin, update_mask)

        next_token = sample(logits[0, 0], temperature, top_k, top_p)
        generated.append(next_token)

        if next_token == EOS_TOKEN:
            print(f"  EOS at step {step}")
            break
        if step % 25 == 0:
            # 25 audio tokens per second of audio.
            print(f"  Step {step}: ({step / 25:.1f}s of audio)")

    return generated
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def decode_audio(tokens: list[int], speaker_embedding: torch.Tensor) -> np.ndarray:
    """Decode generated LM tokens into a waveform via Kanade + the HiFT vocoder."""
    from kanade_tokenizer import KanadeModel, load_vocoder, vocode

    # Keep only ids inside the audio-token vocabulary range.
    audio_tokens = [t for t in tokens if AUDIO_TOKEN_OFFSET <= t <= 20801]
    if not audio_tokens:
        raise ValueError("No audio tokens generated!")

    kanade_indices = torch.tensor([t - AUDIO_TOKEN_OFFSET for t in audio_tokens])
    print(f"Decoding {len(kanade_indices)} audio tokens ({len(kanade_indices) / 25:.1f}s)...")

    model = KanadeModel.from_pretrained("frothywater/kanade-25hz-clean").eval()
    vocoder = load_vocoder(model.config.vocoder_name)

    with torch.no_grad():
        global_emb = speaker_embedding.squeeze(0).float()
        mel_spec = model.decode(global_embedding=global_emb, content_token_indices=kanade_indices)
        waveform = vocode(vocoder, mel_spec.unsqueeze(0))

    return waveform.squeeze().cpu().numpy()
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def main():
    """Generate speech with the custom PyTorch wrappers and save it as a WAV file."""
    parser = argparse.ArgumentParser(description="Generate Danish speech (custom model)")
    parser.add_argument("--text", type=str, default="Hej, mit navn er Daniel.")
    parser.add_argument("--speaker", type=str, default="tor")
    parser.add_argument("--output", type=str, default="test.wav")
    parser.add_argument("--max-tokens", type=int, default=500)
    parser.add_argument("--temperature", type=float, default=0.8)
    parser.add_argument("--top-k", type=int, default=50)
    parser.add_argument("--top-p", type=float, default=0.95)
    parser.add_argument("--model-dir", type=str, default=None)
    args = parser.parse_args()

    # Resolve the model directory: explicit flag, local HF cache, or fresh download.
    if args.model_dir:
        model_dir = Path(args.model_dir)
    else:
        cache = Path.home() / ".cache/huggingface/hub/models--syvai--plapre-pico"
        snapshots = cache / "snapshots"
        if snapshots.exists():
            model_dir = next(snapshots.iterdir())
        else:
            from huggingface_hub import snapshot_download
            model_dir = Path(snapshot_download("syvai/plapre-pico"))

    repo_root = Path(__file__).parent.parent

    def resolve(filename: str) -> Path:
        # Prefer a copy checked into the repo; fall back to the model snapshot.
        candidate = repo_root / filename
        return candidate if candidate.exists() else model_dir / filename

    speaker_embedding = load_speaker(resolve("speakers.json"), args.speaker)
    print(f"Speaker: {args.speaker}")

    tokenizer_path = resolve("tokenizer.json")

    # Build our custom prefill/decode models and load the pretrained weights.
    weights = load_weights(model_dir)

    prefill = PlaprePicoPrefill()
    populate_weights(prefill, weights, is_prefill=True)
    prefill = prefill.half().eval()

    decode = PlaprePicoDecode()
    populate_weights(decode, weights, is_prefill=False)
    decode = decode.half().eval()

    # Generate token ids, then vocode them to audio.
    tokens = generate(
        prefill, decode, args.text, speaker_embedding, tokenizer_path,
        args.max_tokens, args.temperature, args.top_k, args.top_p,
    )

    audio_count = sum(1 for t in tokens if AUDIO_TOKEN_OFFSET <= t <= 20801)
    print(f"\nGenerated {len(tokens)} tokens: {audio_count} audio ({audio_count / 25:.1f}s)")
    print(f"First 20: {tokens[:20]}")

    waveform = decode_audio(tokens, speaker_embedding)
    sf.write(args.output, waveform, 24000)
    print(f"Saved {len(waveform) / 24000:.1f}s audio to {args.output}")
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
# Script entry point: run generation end to end.
if __name__ == "__main__":
    main()
|