tantk committed on
Commit
30cbe32
·
verified ·
1 Parent(s): a0d2600

Upload voxtral_inference.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. voxtral_inference.py +708 -0
voxtral_inference.py ADDED
@@ -0,0 +1,708 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Voxtral Realtime 4B inference engine.
3
+
4
+ Loads directly from Mistral-format consolidated.safetensors — no transformers
5
+ dependency. Adapted from voxtral.c/python_simple_implementation.py with CUDA
6
+ and FP16 support for T4 GPUs.
7
+ """
8
+
9
+ import json
10
+ import math
11
+ import os
12
+ import base64
13
+ from typing import Iterator
14
+
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn as nn
18
+ import torch.nn.functional as F
19
+ from safetensors import safe_open
20
+
21
# ============================================================================
# Config (from params.json)
# ============================================================================

# Encoder (Whisper-style causal audio transformer)
ENC_DIM = 1280            # model width
ENC_LAYERS = 32           # transformer layers
ENC_HEADS = 32            # attention heads
ENC_HEAD_DIM = 64         # per-head dim (ENC_DIM / ENC_HEADS)
ENC_HIDDEN = 5120         # FFN hidden size
ENC_KV_HEADS = 32         # same as ENC_HEADS -> no GQA in the encoder
ENC_WINDOW = 750          # sliding attention window, in positions
ENC_NORM_EPS = 1e-5
ENC_ROPE_THETA = 1_000_000.0

# Decoder (Mistral-style text transformer)
DEC_DIM = 3072
DEC_LAYERS = 26
DEC_HEADS = 32
DEC_HEAD_DIM = 128
DEC_HIDDEN = 9216
DEC_KV_HEADS = 8          # GQA: 32 / 8 = 4 query heads per KV head
DEC_WINDOW = 8192         # sliding attention window; also caps the KV cache
DEC_NORM_EPS = 1e-5
DEC_ROPE_THETA = 1_000_000.0
VOCAB_SIZE = 131072

# Audio front-end
SAMPLE_RATE = 16000       # Hz
FRAME_RATE = 12.5         # audio tokens per second
NUM_MEL_BINS = 128
HOP_LENGTH = 160          # STFT hop: 10 ms at 16 kHz
WINDOW_SIZE = 400         # STFT window: 25 ms at 16 kHz
GLOBAL_LOG_MEL_MAX = 1.5  # fixed log-mel ceiling used as the clamp reference
DOWNSAMPLE_FACTOR = 4     # encoder frames stacked into one decoder token

# Ada norm (time-conditioned RMS scaling inside the decoder FFN)
ADA_NORM_DIM = 32

# Streaming
N_LEFT_PAD_TOKENS = 32        # zero-audio tokens prepended before the clip
TRANSCRIPTION_DELAY_MS = 480  # transcription delay budget, in milliseconds

# Special tokens
TOKEN_BOS = 1
TOKEN_EOS = 2
TOKEN_STREAMING_PAD = 32
TOKEN_BEGIN_AUDIO = 25
TOKEN_AUDIO = 24

# Derived constants
RAW_AUDIO_LENGTH_PER_TOK = int(SAMPLE_RATE // FRAME_RATE)  # 1280 samples per token
AUDIO_LENGTH_PER_TOK = RAW_AUDIO_LENGTH_PER_TOK // HOP_LENGTH  # 8 mel frames per token
74
+
75
+
76
def _num_delay_tokens():
    """Number of decoder tokens covered by TRANSCRIPTION_DELAY_MS of audio.

    Converts the delay to samples, then to mel frames (HOP_LENGTH samples
    each), then rounds up to whole audio tokens (AUDIO_LENGTH_PER_TOK frames
    each). Returns 6 for the default 480 ms delay.
    """
    delay_samples = int(TRANSCRIPTION_DELAY_MS / 1000.0 * SAMPLE_RATE)
    if delay_samples % HOP_LENGTH == 0:
        frames = delay_samples // HOP_LENGTH
    else:
        # NOTE(review): `ceil(x - 1)` is an odd rounding for the non-aligned
        # case; presumably it mirrors the reference implementation. It is
        # unreachable for the current 480 ms delay (7680 % 160 == 0) — confirm
        # before changing TRANSCRIPTION_DELAY_MS.
        frames = math.ceil(delay_samples / HOP_LENGTH - 1)
    return math.ceil(frames / AUDIO_LENGTH_PER_TOK)
84
+
85
+
86
# Tokens of audio lookahead implied by TRANSCRIPTION_DELAY_MS (6 for 480 ms).
N_DELAY_TOKENS = _num_delay_tokens()
# Right padding: the delay tokens plus one, plus 10 extra tokens of slack.
N_RIGHT_PAD_TOKENS = (N_DELAY_TOKENS + 1) + 10  # 17
88
+
89
+ # ============================================================================
90
+ # Mel filter bank
91
+ # ============================================================================
92
+
93
+
94
+ def _hertz_to_mel(freq):
95
+ min_log_hertz = 1000.0
96
+ min_log_mel = 15.0
97
+ logstep = 27.0 / np.log(6.4)
98
+ mels = 3.0 * freq / 200.0
99
+ if isinstance(freq, np.ndarray):
100
+ log_region = freq >= min_log_hertz
101
+ mels[log_region] = min_log_mel + np.log(freq[log_region] / min_log_hertz) * logstep
102
+ elif freq >= min_log_hertz:
103
+ mels = min_log_mel + np.log(freq / min_log_hertz) * logstep
104
+ return mels
105
+
106
+
107
+ def _mel_to_hertz(mels):
108
+ min_log_hertz = 1000.0
109
+ min_log_mel = 15.0
110
+ logstep = np.log(6.4) / 27.0
111
+ freq = 200.0 * mels / 3.0
112
+ log_region = mels >= min_log_mel
113
+ freq[log_region] = min_log_hertz * np.exp(logstep * (mels[log_region] - min_log_mel))
114
+ return freq
115
+
116
+
117
def _compute_mel_filters():
    """Build the [201, 128] triangular mel filter bank.

    Rows index FFT bins (1 + WINDOW_SIZE // 2 of them), columns index the
    NUM_MEL_BINS mel channels. Filters are Slaney-normalized so each
    triangle carries comparable energy.
    """
    n_fft_bins = 1 + WINDOW_SIZE // 2  # 201
    bin_hz = np.linspace(0, SAMPLE_RATE // 2, n_fft_bins)

    # Band edges: equally spaced on the mel scale between 0 and 8 kHz.
    mel_edges = np.linspace(_hertz_to_mel(0.0), _hertz_to_mel(8000.0), NUM_MEL_BINS + 2)
    edge_hz = _mel_to_hertz(mel_edges)
    widths = np.diff(edge_hz)

    deltas = np.expand_dims(edge_hz, 0) - np.expand_dims(bin_hz, 1)
    rising = -deltas[:, :-2] / widths[:-1]
    falling = deltas[:, 2:] / widths[1:]
    bank = np.maximum(np.zeros(1), np.minimum(rising, falling))

    # Slaney normalization: scale by 2 / bandwidth of each triangle.
    norm = 2.0 / (edge_hz[2:NUM_MEL_BINS + 2] - edge_hz[:NUM_MEL_BINS])
    bank *= np.expand_dims(norm, 0)
    return bank  # [201, 128]
132
+
133
+
134
+ # ============================================================================
135
+ # Mel spectrogram
136
+ # ============================================================================
137
+
138
+
139
def _compute_mel_spectrogram(audio, mel_filters, device):
    """Log-mel spectrogram of a waveform.

    audio: 1-D float tensor on `device`; mel_filters: [freq_bins, mel_bins]
    on the same device. Returns a [NUM_MEL_BINS, frames] tensor, shifted and
    scaled into the model's expected range.
    """
    win = torch.hann_window(WINDOW_SIZE, device=device)
    spec = torch.stft(audio, WINDOW_SIZE, HOP_LENGTH, window=win, return_complex=True)
    # Drop the trailing STFT frame, then take power.
    power = spec[..., :-1].abs() ** 2
    mels = mel_filters.T @ power
    log_mels = torch.clamp(mels, min=1e-10).log10()
    # Floor 8 log10-units below the fixed global max (not the per-clip max),
    # so normalization is identical across streaming chunks.
    floor = torch.tensor(GLOBAL_LOG_MEL_MAX, device=device) - 8.0
    log_mels = torch.maximum(log_mels, floor)
    return (log_mels + 4.0) / 4.0  # [128, frames]
149
+
150
+
151
+ # ============================================================================
152
+ # Audio streaming padding
153
+ # ============================================================================
154
+
155
+
156
def _pad_audio_streaming(audio_array):
    """Zero-pad raw audio for streaming inference.

    Left: N_LEFT_PAD_TOKENS tokens' worth of samples. Right: alignment up to
    a whole number of audio tokens, plus N_RIGHT_PAD_TOKENS extra tokens.
    """
    tok_samples = RAW_AUDIO_LENGTH_PER_TOK
    remainder = len(audio_array) % tok_samples
    align = (tok_samples - remainder) % tok_samples
    left = N_LEFT_PAD_TOKENS * tok_samples
    right = align + N_RIGHT_PAD_TOKENS * tok_samples
    return np.pad(audio_array, (left, right))
163
+
164
+
165
+ # ============================================================================
166
+ # Weight loading helpers
167
+ # ============================================================================
168
+
169
+
170
+ def _get_weight(sf_file, name, device, dtype=None):
171
+ t = sf_file.get_tensor(name)
172
+ if t.dtype == torch.bfloat16:
173
+ t = t.float()
174
+ t = t.to(device)
175
+ if dtype is not None:
176
+ t = t.to(dtype)
177
+ return t
178
+
179
+
180
def _get_weight_optional(sf_file, name, device, dtype=None):
    """Like _get_weight, but returns None instead of raising.

    Best-effort lookup: any failure (typically a name missing from the
    checkpoint) is treated as the tensor being absent.
    """
    try:
        tensor = _get_weight(sf_file, name, device, dtype)
    except Exception:
        return None
    return tensor
185
+
186
+
187
+ def _permute_qk_weight(w, n_heads, head_dim):
188
+ attn_in = n_heads * head_dim
189
+ attn_out = w.shape[1]
190
+ return (
191
+ w.view(n_heads, head_dim // 2, 2, attn_out)
192
+ .transpose(1, 2)
193
+ .reshape(attn_in, attn_out)
194
+ )
195
+
196
+
197
+ def _permute_qk_bias(b, n_heads, head_dim):
198
+ attn_in = n_heads * head_dim
199
+ return (
200
+ b.view(n_heads, head_dim // 2, 2)
201
+ .transpose(1, 2)
202
+ .reshape(attn_in)
203
+ )
204
+
205
+
206
+ # ============================================================================
207
+ # RMSNorm
208
+ # ============================================================================
209
+
210
+
211
+ class _RMSNorm(nn.Module):
212
+ def __init__(self, weight, eps=1e-5):
213
+ super().__init__()
214
+ self.weight = weight
215
+ self.eps = eps
216
+
217
+ def forward(self, x):
218
+ rms = torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + self.eps)
219
+ return (x.float() * rms * self.weight.float()).to(x.dtype)
220
+
221
+
222
+ # ============================================================================
223
+ # RoPE
224
+ # ============================================================================
225
+
226
+
227
+ def _compute_rope_freqs(positions, head_dim, theta, device):
228
+ freqs = 1.0 / (theta ** (torch.arange(0, head_dim, 2, device=device).float() / head_dim))
229
+ angles = positions.float().unsqueeze(-1) * freqs.unsqueeze(0)
230
+ return torch.cos(angles), torch.sin(angles)
231
+
232
+
233
+ def _apply_rope(x, cos_f, sin_f, n_heads, head_dim, is_neox_style=False):
234
+ seq_len = x.shape[0]
235
+ x = x.view(seq_len, n_heads, head_dim)
236
+ cos_f = cos_f.unsqueeze(1)
237
+ sin_f = sin_f.unsqueeze(1)
238
+
239
+ if is_neox_style:
240
+ x1, x2 = x.chunk(2, dim=-1)
241
+ o1 = x1 * cos_f - x2 * sin_f
242
+ o2 = x2 * cos_f + x1 * sin_f
243
+ out = torch.cat([o1, o2], dim=-1)
244
+ else:
245
+ x1 = x[..., ::2]
246
+ x2 = x[..., 1::2]
247
+ o1 = x1 * cos_f - x2 * sin_f
248
+ o2 = x2 * cos_f + x1 * sin_f
249
+ out = torch.stack([o1, o2], dim=-1).flatten(-2)
250
+
251
+ return out.view(seq_len, n_heads * head_dim)
252
+
253
+
254
+ # ============================================================================
255
+ # Causal Attention
256
+ # ============================================================================
257
+
258
+
259
+ def _causal_attention(q, k, v, n_heads, n_kv_heads, head_dim, window,
260
+ q_start_pos=0, kv_start_pos=0):
261
+ seq_q = q.shape[0]
262
+ seq_kv = k.shape[0]
263
+ gqa_ratio = n_heads // n_kv_heads
264
+ device = q.device
265
+ orig_dtype = q.dtype
266
+
267
+ q = q.view(seq_q, n_heads, head_dim).transpose(0, 1).unsqueeze(0)
268
+ k = k.view(seq_kv, n_kv_heads, head_dim).transpose(0, 1).unsqueeze(0)
269
+ v = v.view(seq_kv, n_kv_heads, head_dim).transpose(0, 1).unsqueeze(0)
270
+
271
+ if gqa_ratio > 1:
272
+ k = k.repeat_interleave(gqa_ratio, dim=1)
273
+ v = v.repeat_interleave(gqa_ratio, dim=1)
274
+
275
+ qi_abs = (q_start_pos + torch.arange(seq_q, device=device)).unsqueeze(1)
276
+ kv_abs = (kv_start_pos + torch.arange(seq_kv, device=device)).unsqueeze(0)
277
+ attn_mask = (kv_abs <= qi_abs) & (kv_abs >= (qi_abs - (window - 1)))
278
+
279
+ out = F.scaled_dot_product_attention(
280
+ q.float(), k.float(), v.float(),
281
+ attn_mask=attn_mask.unsqueeze(0).unsqueeze(0),
282
+ scale=1.0 / math.sqrt(head_dim),
283
+ dropout_p=0.0,
284
+ ).to(orig_dtype)
285
+
286
+ return out.squeeze(0).transpose(0, 1).contiguous().view(seq_q, n_heads * head_dim)
287
+
288
+
289
+ # ============================================================================
290
+ # Causal Conv1d
291
+ # ============================================================================
292
+
293
+
294
+ def _causal_conv1d(x, weight, bias, stride):
295
+ kernel_size = weight.shape[2]
296
+ effective_ks = kernel_size
297
+ padding_total = effective_ks - stride
298
+
299
+ n_frames = (x.shape[-1] - effective_ks + padding_total) / stride + 1
300
+ target_length = (math.ceil(n_frames) - 1) * stride + (effective_ks - padding_total)
301
+ extra_padding = int(target_length - x.shape[-1])
302
+
303
+ x = F.pad(x, (padding_total, extra_padding), mode='constant')
304
+ return F.conv1d(x, weight, bias, stride=stride)
305
+
306
+
307
+ # ============================================================================
308
+ # TimeEmbedding
309
+ # ============================================================================
310
+
311
+
312
+ def _compute_time_embedding(t_value, dim, device, theta=10000.0):
313
+ half_dim = dim // 2
314
+ inv_freq = torch.exp(
315
+ -math.log(theta) * torch.arange(half_dim, device=device).float() / half_dim
316
+ )
317
+ emb = t_value * inv_freq
318
+ return torch.cat([emb.cos(), emb.sin()])
319
+
320
+
321
+ # ============================================================================
322
+ # Encoder forward
323
+ # ============================================================================
324
+
325
+
326
def _encoder_forward(mel, sf_file, device, compute_dtype):
    """Run the Whisper-style audio encoder, streaming weights from disk.

    mel: [128, frames] log-mel tensor on `device`. Returns [seq, 1280] on
    `device`, where seq is the stride-2 conv output length truncated to a
    multiple of DOWNSAMPLE_FACTOR.

    Weights are fetched per layer via _get_weight rather than held resident,
    trading speed for memory. RMSNorms run with float32 weights (no dtype
    cast); matmuls run in `compute_dtype`.
    """
    prefix = "mm_streams_embeddings.embedding_module.whisper_encoder"

    # Two causal convs: stride 1 then stride 2 (halves the frame rate).
    mel_3d = mel.unsqueeze(0)
    conv0_w = _get_weight(sf_file, f"{prefix}.conv_layers.0.conv.weight", device, compute_dtype)
    conv0_b = _get_weight(sf_file, f"{prefix}.conv_layers.0.conv.bias", device, compute_dtype)
    conv1_w = _get_weight(sf_file, f"{prefix}.conv_layers.1.conv.weight", device, compute_dtype)
    conv1_b = _get_weight(sf_file, f"{prefix}.conv_layers.1.conv.bias", device, compute_dtype)

    h = F.gelu(_causal_conv1d(mel_3d.to(compute_dtype), conv0_w, conv0_b, stride=1))
    h = F.gelu(_causal_conv1d(h, conv1_w, conv1_b, stride=2))
    h = h.squeeze(0).transpose(0, 1)  # [seq, 1280]
    conv_len = h.shape[0]

    # Drop leading frames so the adapter's reshape by DOWNSAMPLE_FACTOR is exact.
    trunc = conv_len % DOWNSAMPLE_FACTOR
    if trunc > 0:
        h = h[trunc:]
    seq_len = h.shape[0]

    # RoPE tables are shared by every layer (positions are fixed).
    positions = torch.arange(seq_len, device=device)
    rope_cos, rope_sin = _compute_rope_freqs(positions, ENC_HEAD_DIM, ENC_ROPE_THETA, device)

    for layer in range(ENC_LAYERS):
        lp = f"{prefix}.transformer.layers.{layer}"

        # --- attention sub-block (pre-norm, residual) ---
        attn_norm_w = _get_weight(sf_file, f"{lp}.attention_norm.weight", device)
        norm = _RMSNorm(attn_norm_w, ENC_NORM_EPS)
        x_norm = norm(h).to(compute_dtype)

        wq = _get_weight(sf_file, f"{lp}.attention.wq.weight", device, compute_dtype)
        wq_b = _get_weight(sf_file, f"{lp}.attention.wq.bias", device, compute_dtype)
        wk = _get_weight(sf_file, f"{lp}.attention.wk.weight", device, compute_dtype)
        wv = _get_weight(sf_file, f"{lp}.attention.wv.weight", device, compute_dtype)
        wv_b = _get_weight(sf_file, f"{lp}.attention.wv.bias", device, compute_dtype)
        wo = _get_weight(sf_file, f"{lp}.attention.wo.weight", device, compute_dtype)
        wo_b = _get_weight(sf_file, f"{lp}.attention.wo.bias", device, compute_dtype)

        # Note: q and v carry biases; k does not (matches checkpoint layout).
        q = F.linear(x_norm, wq, wq_b)
        k = F.linear(x_norm, wk)
        v = F.linear(x_norm, wv, wv_b)

        q = _apply_rope(q, rope_cos, rope_sin, ENC_HEADS, ENC_HEAD_DIM, is_neox_style=False)
        k = _apply_rope(k, rope_cos, rope_sin, ENC_KV_HEADS, ENC_HEAD_DIM, is_neox_style=False)

        attn_out = _causal_attention(q, k, v, ENC_HEADS, ENC_KV_HEADS, ENC_HEAD_DIM, ENC_WINDOW)

        h = h + F.linear(attn_out, wo, wo_b)

        # --- SwiGLU feed-forward sub-block (pre-norm, residual) ---
        ffn_norm_w = _get_weight(sf_file, f"{lp}.ffn_norm.weight", device)
        ffn_norm = _RMSNorm(ffn_norm_w, ENC_NORM_EPS)
        x_norm = ffn_norm(h).to(compute_dtype)

        w1 = _get_weight(sf_file, f"{lp}.feed_forward.w1.weight", device, compute_dtype)
        w2 = _get_weight(sf_file, f"{lp}.feed_forward.w2.weight", device, compute_dtype)
        w2_b = _get_weight(sf_file, f"{lp}.feed_forward.w2.bias", device, compute_dtype)
        w3 = _get_weight(sf_file, f"{lp}.feed_forward.w3.weight", device, compute_dtype)

        gate = F.silu(F.linear(x_norm, w1))
        up = F.linear(x_norm, w3)
        h = h + F.linear(gate * up, w2, w2_b)

    final_norm_w = _get_weight(sf_file, f"{prefix}.transformer.norm.weight", device)
    final_norm = _RMSNorm(final_norm_w, ENC_NORM_EPS)
    h = final_norm(h)

    return h  # [seq, 1280]
393
+
394
+
395
+ # ============================================================================
396
+ # Adapter forward
397
+ # ============================================================================
398
+
399
+
400
def _adapter_forward(enc_out, sf_file, device, compute_dtype):
    """Project encoder features into the decoder's embedding space.

    Stacks DOWNSAMPLE_FACTOR consecutive encoder frames into one vector and
    applies a two-layer GELU MLP: [seq, 1280] -> [seq // 4, 3072].
    """
    prefix = "mm_streams_embeddings.embedding_module"
    proj_in = _get_weight(sf_file, f"{prefix}.audio_language_projection.0.weight", device, compute_dtype)
    proj_out = _get_weight(sf_file, f"{prefix}.audio_language_projection.2.weight", device, compute_dtype)

    n_frames = enc_out.shape[0]
    stacked = enc_out.reshape(n_frames // DOWNSAMPLE_FACTOR, ENC_DIM * DOWNSAMPLE_FACTOR)

    hidden = F.gelu(F.linear(stacked.to(compute_dtype), proj_in))
    return F.linear(hidden, proj_out)  # [seq/4, 3072]
413
+
414
+
415
+ # ============================================================================
416
+ # Decoder
417
+ # ============================================================================
418
+
419
+
420
class _Decoder:
    """Mistral-style text decoder with a sliding-window KV cache.

    All layer weights are loaded eagerly at construction and kept on
    `device` in `compute_dtype`. The token-embedding matrix is tied: it is
    used both for input lookup and as the output head in forward_one().
    The KV cache maps layer index -> (k_cache, v_cache) and is capped at
    DEC_WINDOW positions per layer.
    """

    def __init__(self, sf_file, device, compute_dtype):
        self.sf = sf_file
        self.device = device
        self.compute_dtype = compute_dtype
        self.tok_embeddings = _get_weight(
            sf_file,
            "mm_streams_embeddings.embedding_module.tok_embeddings.weight",
            device, compute_dtype,
        )
        # No dtype arg: norm weights stay float32 (RMSNorm computes in fp32).
        self.final_norm = _get_weight(sf_file, "norm.weight", device)
        self.kv_cache = {}

        self.layers = []
        for i in range(DEC_LAYERS):
            self.layers.append(self._load_layer(i))

    def _load_layer(self, i):
        """Load every weight tensor of decoder layer `i` into a dict."""
        sf = self.sf
        lp = f"layers.{i}"
        device = self.device
        dtype = self.compute_dtype

        return {
            'attention_norm': _get_weight(sf, f"{lp}.attention_norm.weight", device),
            'ffn_norm': _get_weight(sf, f"{lp}.ffn_norm.weight", device),
            'wq': _get_weight(sf, f"{lp}.attention.wq.weight", device, dtype),
            'wk': _get_weight(sf, f"{lp}.attention.wk.weight", device, dtype),
            'wv': _get_weight(sf, f"{lp}.attention.wv.weight", device, dtype),
            'wo': _get_weight(sf, f"{lp}.attention.wo.weight", device, dtype),
            'w1': _get_weight(sf, f"{lp}.feed_forward.w1.weight", device, dtype),
            'w2': _get_weight(sf, f"{lp}.feed_forward.w2.weight", device, dtype),
            'w3': _get_weight(sf, f"{lp}.feed_forward.w3.weight", device, dtype),
            # Two-layer MLP that turns the time embedding into a per-channel
            # FFN-input scale (ada RMS norm conditioning).
            'ada_down': _get_weight(sf, f"{lp}.ada_rms_norm_t_cond.0.weight", device, dtype),
            'ada_up': _get_weight(sf, f"{lp}.ada_rms_norm_t_cond.2.weight", device, dtype),
        }

    def embed_token(self, token_id):
        """Embedding vector for a single token id."""
        return self.tok_embeddings[token_id]

    def embed_tokens(self, token_ids):
        """Embedding matrix for a 1-D tensor of token ids."""
        return self.tok_embeddings[token_ids]

    def _layer_forward(self, h, layer_idx, pos, kv_seq_len, t_cond=None):
        """Run one decoder layer on `h` starting at absolute position `pos`,
        appending this step's K/V to the layer's cache.

        NOTE(review): `kv_seq_len` is accepted but never read — the cache
        length is derived from the stored tensors instead.
        """
        L = self.layers[layer_idx]
        seq_len = h.shape[0]
        dtype = self.compute_dtype
        device = self.device

        if h.dtype != dtype:
            h = h.to(dtype)

        # --- attention sub-block (pre-norm, residual) ---
        norm = _RMSNorm(L['attention_norm'], DEC_NORM_EPS)
        x_norm = norm(h).to(dtype)

        q = F.linear(x_norm, L['wq'])
        k = F.linear(x_norm, L['wk'])
        v = F.linear(x_norm, L['wv'])

        # RoPE is applied in float32, then cast back to the compute dtype.
        positions = torch.arange(pos, pos + seq_len, device=device)
        rope_cos, rope_sin = _compute_rope_freqs(positions, DEC_HEAD_DIM, DEC_ROPE_THETA, device)
        q = _apply_rope(q.float(), rope_cos, rope_sin, DEC_HEADS, DEC_HEAD_DIM, is_neox_style=False).to(dtype)
        k = _apply_rope(k.float(), rope_cos, rope_sin, DEC_KV_HEADS, DEC_HEAD_DIM, is_neox_style=False).to(dtype)

        # Append to the cache, then trim to the last DEC_WINDOW positions.
        if layer_idx not in self.kv_cache:
            k_cache = k
            v_cache = v
        else:
            k_cache, v_cache = self.kv_cache[layer_idx]
            k_cache = torch.cat([k_cache, k], dim=0)
            v_cache = torch.cat([v_cache, v], dim=0)

        if k_cache.shape[0] > DEC_WINDOW:
            k_cache = k_cache[-DEC_WINDOW:]
            v_cache = v_cache[-DEC_WINDOW:]

        self.kv_cache[layer_idx] = (k_cache, v_cache)
        full_k, full_v = self.kv_cache[layer_idx]

        # Absolute position of the cache's first entry: the newest query sits
        # at pos + seq_len - 1 and the cache ends at that same position.
        kv_start_pos = (pos + seq_len - 1) - (full_k.shape[0] - 1)
        attn_out = _causal_attention(
            q, full_k, full_v,
            DEC_HEADS, DEC_KV_HEADS, DEC_HEAD_DIM,
            DEC_WINDOW,
            q_start_pos=pos,
            kv_start_pos=kv_start_pos,
        )

        attn_proj = F.linear(attn_out, L['wo'])
        h = h + attn_proj

        # --- SwiGLU feed-forward sub-block (pre-norm, residual) ---
        ffn_norm = _RMSNorm(L['ffn_norm'], DEC_NORM_EPS)
        h_norm = ffn_norm(h).to(dtype)

        if t_cond is not None:
            # Ada conditioning: scale the normed FFN input by (1 + s(t_cond)).
            t_cond_dt = t_cond.to(dtype)
            ada_hidden = F.gelu(F.linear(t_cond_dt, L['ada_down']))
            ada_scale = F.linear(ada_hidden, L['ada_up'])
            h_norm = h_norm * (1 + ada_scale.unsqueeze(0))

        gate = F.silu(F.linear(h_norm, L['w1']))
        up = F.linear(h_norm, L['w3'])
        h = h + F.linear(gate * up, L['w2'])

        return h

    def prefill(self, input_embeds, t_cond):
        """Reset the KV cache and run the whole prompt through every layer.

        Returns the final hidden states (no norm / LM head applied).
        """
        self.kv_cache = {}
        h = input_embeds.to(self.compute_dtype)
        seq_len = h.shape[0]

        for layer in range(DEC_LAYERS):
            h = self._layer_forward(h, layer, 0, seq_len, t_cond=t_cond)

        return h

    def forward_one(self, embed, pos, t_cond):
        """Decode a single step at absolute position `pos`.

        embed: [DEC_DIM] or [1, DEC_DIM]. Returns float32 logits of shape
        [VOCAB_SIZE] computed against the tied embedding matrix.
        """
        h = embed.unsqueeze(0) if embed.dim() == 1 else embed
        h = h.to(self.compute_dtype)

        for layer in range(DEC_LAYERS):
            h = self._layer_forward(h, layer, pos, pos + 1, t_cond=t_cond)

        norm = _RMSNorm(self.final_norm, DEC_NORM_EPS)
        h = norm(h)

        # Tied LM head: project onto the embedding matrix in float32.
        logits = F.linear(h.float().squeeze(0), self.tok_embeddings.float())
        return logits
548
+
549
+
550
+ # ============================================================================
551
+ # Tokenizer
552
+ # ============================================================================
553
+
554
+
555
+ def _load_tokenizer(model_dir):
556
+ tekken_path = os.path.join(model_dir, "tekken.json")
557
+ with open(tekken_path, "r", encoding="utf-8") as f:
558
+ data = json.load(f)
559
+
560
+ vocab = data["vocab"]
561
+ config = data.get("config", {})
562
+ n_special = int(config.get("default_num_special_tokens", 1000))
563
+ special_ids = {int(st["rank"]) for st in data.get("special_tokens", []) if "rank" in st}
564
+
565
+ bytes_cache = {}
566
+
567
+ def token_bytes(token_id: int) -> bytes:
568
+ b = bytes_cache.get(token_id)
569
+ if b is not None:
570
+ return b
571
+ if token_id < 0:
572
+ bytes_cache[token_id] = b""
573
+ return b""
574
+ if token_id < n_special or token_id in special_ids:
575
+ bytes_cache[token_id] = b""
576
+ return b""
577
+ vocab_id = token_id - n_special
578
+ if vocab_id < 0 or vocab_id >= len(vocab):
579
+ bytes_cache[token_id] = b""
580
+ return b""
581
+ b = base64.b64decode(vocab[vocab_id]["token_bytes"])
582
+ bytes_cache[token_id] = b
583
+ return b
584
+
585
+ def decode(token_ids):
586
+ out = bytearray()
587
+ for token_id in map(int, token_ids):
588
+ if token_id < n_special or token_id in special_ids:
589
+ continue
590
+ out += token_bytes(token_id)
591
+ return out.decode("utf-8", errors="replace")
592
+
593
+ return decode
594
+
595
+
596
+ # ============================================================================
597
+ # VoxtralModel — singleton inference engine
598
+ # ============================================================================
599
+
600
+
601
class VoxtralModel:
    """Load Voxtral from Mistral-format safetensors and run inference on CUDA.

    Audio flow: pad -> log-mel -> Whisper-style encoder -> adapter (x4
    downsample into decoder space) -> decoder with greedy argmax decoding.
    At every decoder step the audio embedding for that position is added to
    the previous token's text embedding (time-aligned streaming decoding).
    """

    def __init__(self, model_dir: str):
        self.model_dir = model_dir
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # FP16 for T4 (no good bf16 support); float32 on CPU
        self.compute_dtype = torch.float16 if self.device.type == "cuda" else torch.float32

        sf_path = os.path.join(model_dir, "consolidated.safetensors")
        self._sf_file = safe_open(sf_path, framework="pt")

        # Precompute mel filters on device
        self._mel_filters = torch.tensor(
            _compute_mel_filters(), dtype=torch.float32, device=self.device
        )

        # Preload decoder (holds all layer weights on GPU)
        self._decoder = _Decoder(self._sf_file, self.device, self.compute_dtype)

        # Load tokenizer
        self._decode = _load_tokenizer(model_dir)

    def _prepare(self, audio_16k: np.ndarray):
        """Audio array -> (adapter_out, prompt_ids, t_cond) all on device.

        `audio_16k` is assumed to be 16 kHz mono float samples — TODO confirm
        callers resample before calling. prompt_ids covers BOS plus one
        streaming-pad token per left-pad/delay position.
        """
        prompt_ids = [TOKEN_BOS] + [TOKEN_STREAMING_PAD] * (N_LEFT_PAD_TOKENS + N_DELAY_TOKENS)
        padded = _pad_audio_streaming(audio_16k).astype(np.float32)

        audio_tensor = torch.tensor(padded, dtype=torch.float32, device=self.device)
        mel = _compute_mel_spectrogram(audio_tensor, self._mel_filters, self.device)

        # Drop one leading frame when odd, so the encoder's stride-2 conv
        # sees an even frame count.
        if mel.shape[1] % 2 != 0:
            mel = mel[:, 1:]

        with torch.no_grad():
            enc_out = _encoder_forward(mel, self._sf_file, self.device, self.compute_dtype)
            adapter_out = _adapter_forward(enc_out, self._sf_file, self.device, self.compute_dtype)

        # Condition the decoder's ada-norm on the delay (in tokens).
        t_cond = _compute_time_embedding(float(N_DELAY_TOKENS), DEC_DIM, self.device)

        return adapter_out, prompt_ids, t_cond

    def transcribe(self, audio_16k: np.ndarray) -> str:
        """Full pipeline: 16 kHz float32 mono audio -> transcribed text.

        Greedy decoding; one text token is produced per audio token position
        until the audio is exhausted or EOS is emitted.
        """
        adapter_out, prompt_ids, t_cond = self._prepare(audio_16k)

        n_audio = adapter_out.shape[0]
        L = len(prompt_ids)

        # Prompt embeddings: audio embedding + text embedding, per position.
        prompt_ids_t = torch.tensor(prompt_ids, dtype=torch.long, device=self.device)
        prefix_text_embeds = self._decoder.embed_tokens(prompt_ids_t)
        prefix_embeds = adapter_out[:L] + prefix_text_embeds

        with torch.no_grad():
            # Prefill everything except the last prompt position, then decode
            # the first real token from that last position.
            if L > 1:
                _ = self._decoder.prefill(prefix_embeds[:-1], t_cond)
            logits = self._decoder.forward_one(prefix_embeds[-1], pos=L - 1, t_cond=t_cond)
            token = int(logits.argmax().item())

        generated = [token]

        with torch.no_grad():
            for pos in range(L, n_audio):
                if token == TOKEN_EOS:
                    break
                # Each step feeds this position's audio embedding plus the
                # previously generated token's embedding.
                embed = adapter_out[pos] + self._decoder.embed_token(token)
                logits = self._decoder.forward_one(embed, pos=pos, t_cond=t_cond)
                token = int(logits.argmax().item())
                generated.append(token)

        # Strip a trailing EOS before detokenizing.
        if generated and generated[-1] == TOKEN_EOS:
            generated = generated[:-1]

        return self._decode(generated).strip()

    def transcribe_stream(self, audio_16k: np.ndarray) -> Iterator[str]:
        """Streaming pipeline: yields decoded text fragments as tokens are generated.

        Same decoding loop as transcribe(), but each non-special token is
        detokenized and yielded immediately (special tokens decode to "",
        which is filtered out).
        """
        adapter_out, prompt_ids, t_cond = self._prepare(audio_16k)

        n_audio = adapter_out.shape[0]
        L = len(prompt_ids)

        prompt_ids_t = torch.tensor(prompt_ids, dtype=torch.long, device=self.device)
        prefix_text_embeds = self._decoder.embed_tokens(prompt_ids_t)
        prefix_embeds = adapter_out[:L] + prefix_text_embeds

        with torch.no_grad():
            if L > 1:
                _ = self._decoder.prefill(prefix_embeds[:-1], t_cond)
            logits = self._decoder.forward_one(prefix_embeds[-1], pos=L - 1, t_cond=t_cond)
            token = int(logits.argmax().item())

        if token != TOKEN_EOS:
            text = self._decode([token])
            if text:
                yield text

        with torch.no_grad():
            for pos in range(L, n_audio):
                if token == TOKEN_EOS:
                    break
                embed = adapter_out[pos] + self._decoder.embed_token(token)
                logits = self._decoder.forward_one(embed, pos=pos, t_cond=t_cond)
                token = int(logits.argmax().item())
                if token != TOKEN_EOS:
                    text = self._decode([token])
                    if text:
                        yield text