NariLabs committed on
Commit aa16b75 · verified · 1 Parent(s): ff767d4

Upload folder using huggingface_hub

__init__.py ADDED
@@ -0,0 +1,20 @@
from .config import DiaConfig, load_config
from .core.model import Dia2Model
from .engine import Dia2
from .generation import (
    GenerationConfig,
    GenerationResult,
    PrefixConfig,
    SamplingConfig,
)

__all__ = [
    "DiaConfig",
    "Dia2Model",
    "load_config",
    "GenerationConfig",
    "GenerationResult",
    "PrefixConfig",
    "SamplingConfig",
    "Dia2",
]
assets.py ADDED
@@ -0,0 +1,65 @@
from __future__ import annotations

import json
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

from huggingface_hub import hf_hub_download

ASSET_MANIFEST = os.environ.get("DIA2_ASSET_MANIFEST", "dia2_assets.json")


@dataclass(frozen=True)
class AssetBundle:
    config_path: str
    weights_path: str
    tokenizer_id: Optional[str]
    mimi_id: Optional[str]
    repo_id: Optional[str]


def resolve_assets(
    *,
    repo: Optional[str],
    config_path: Optional[str | Path],
    weights_path: Optional[str | Path],
    manifest_name: Optional[str] = None,
) -> AssetBundle:
    repo_id = repo
    manifest_name = manifest_name or ASSET_MANIFEST
    if repo_id and (config_path or weights_path):
        raise ValueError("Provide either repo or config+weights, not both")
    if config_path is None or weights_path is None:
        if repo_id is None:
            raise ValueError("Must specify repo or config+weights")
        manifest = load_manifest(repo_id, manifest_name)
        config_name = manifest.get("config", "config.json")
        weights_name = manifest.get("weights", "model.safetensors")
        config_local = hf_hub_download(repo_id, config_name)
        weights_local = hf_hub_download(repo_id, weights_name)
        return AssetBundle(
            config_path=config_local,
            weights_path=weights_local,
            tokenizer_id=manifest.get("tokenizer") or repo_id,
            mimi_id=manifest.get("mimi"),
            repo_id=repo_id,
        )
    return AssetBundle(str(config_path), str(weights_path), None, None, repo_id)


def load_manifest(repo_id: str, manifest_name: str) -> dict:
    if not manifest_name:
        return {}
    try:
        path = hf_hub_download(repo_id, manifest_name)
    except Exception:
        return {}
    try:
        return json.loads(Path(path).read_text())
    except json.JSONDecodeError:
        return {}


__all__ = ["AssetBundle", "ASSET_MANIFEST", "resolve_assets", "load_manifest"]
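
A minimal usage sketch for resolve_assets. The repo id comes from the CLI help text above; the top-level package name "dia2" is an assumption, and the manifest lookup silently falls back to config.json / model.safetensors when no dia2_assets.json exists in the repo.

from dia2.assets import resolve_assets  # "dia2" package name is an assumption

# Resolve from a Hugging Face repo (downloads config/weights, reads the optional manifest).
bundle = resolve_assets(repo="nari-labs/Dia2-2B", config_path=None, weights_path=None)
print(bundle.config_path, bundle.weights_path, bundle.tokenizer_id)

# Or point at local files instead of a repo (passing both repo and paths raises ValueError).
local = resolve_assets(repo=None, config_path="config.json", weights_path="model.safetensors")
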
audio/__init__.py ADDED
@@ -0,0 +1,13 @@
from .codec import MimiCodec, DEFAULT_MIMI_MODEL_ID, MimiConfig
from .grid import delay_frames, undelay_frames, mask_audio_logits, fill_audio_channels, write_wav

__all__ = [
    "MimiCodec",
    "DEFAULT_MIMI_MODEL_ID",
    "MimiConfig",
    "delay_frames",
    "undelay_frames",
    "mask_audio_logits",
    "fill_audio_channels",
    "write_wav",
]
audio/codec.py ADDED
@@ -0,0 +1,58 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Optional

import torch
from torch import nn
from transformers import MimiModel


DEFAULT_MIMI_MODEL_ID = "kyutai/mimi"


@dataclass(frozen=True)
class MimiConfig:
    model_id: str = DEFAULT_MIMI_MODEL_ID
    dtype: Optional[torch.dtype] = None


class MimiCodec(nn.Module):
    """Thin wrapper around transformers' MimiModel for decoding audio tokens."""

    def __init__(self, model: MimiModel, device: torch.device) -> None:
        super().__init__()
        self.model = model
        self.device = device
        cfg = getattr(model, "config", None)
        self.sample_rate = getattr(cfg, "sampling_rate", 24000)
        self.frame_rate = getattr(cfg, "frame_rate", 12.5)
        self.samples_per_frame = int(round(self.sample_rate / self.frame_rate)) if self.frame_rate else 0

    @classmethod
    def from_pretrained(
        cls,
        model_id: str = DEFAULT_MIMI_MODEL_ID,
        *,
        device: torch.device,
        dtype: Optional[torch.dtype] = None,
    ) -> "MimiCodec":
        model = MimiModel.from_pretrained(
            model_id,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
        )
        model = model.to(device)
        model.eval()
        return cls(model, device)

    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        codes = codes.to(self.device)
        with torch.inference_mode():
            audio, _ = self.model.decode(codes, return_dict=False)
        return torch.clamp(audio, -1.0, 1.0)

    def encode(self, audio: torch.Tensor, *, return_dict: bool = False):
        audio = audio.to(self.device)
        with torch.inference_mode():
            return self.model.encode(audio, return_dict=return_dict)
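
A decode-only sketch of the wrapper above. The number of codebooks (8) and frame count are placeholders, not values taken from the shipped model; the package import path is an assumption.

import torch
from dia2.audio.codec import MimiCodec  # package path is an assumption

codec = MimiCodec.from_pretrained(device=torch.device("cpu"))
# codes: (batch, num_codebooks, frames) integer token grid; all-zero tokens are placeholders
codes = torch.zeros(1, 8, 25, dtype=torch.long)
waveform = codec.decode(codes)  # waveform clamped to [-1, 1]
print(codec.sample_rate, codec.samples_per_frame, waveform.shape)
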
audio/grid.py ADDED
@@ -0,0 +1,79 @@
from __future__ import annotations

from pathlib import Path
from typing import Sequence

import numpy as np
import torch


def delay_frames(aligned: torch.Tensor, delays: Sequence[int], pad_id: int) -> torch.Tensor:
    channels, total = aligned.shape
    max_delay = max(delays) if delays else 0
    out = aligned.new_full((channels, total + max_delay), pad_id)
    for idx, delay in enumerate(delays):
        out[idx, delay : delay + total] = aligned[idx]
    return out


def undelay_frames(delayed: torch.Tensor, delays: Sequence[int], pad_id: int) -> torch.Tensor:
    channels, total = delayed.shape
    max_delay = max(delays) if delays else 0
    target = max(0, total - max_delay)
    out = delayed.new_full((channels, target), pad_id)
    for idx, delay in enumerate(delays):
        out[idx] = delayed[idx, delay : delay + target]
    return out


def mask_audio_logits(logits: torch.Tensor, pad_idx: int, bos_idx: int) -> torch.Tensor:
    if logits.shape[-1] == 0:
        return logits
    max_idx = logits.shape[-1] - 1
    targets = [idx for idx in (pad_idx, bos_idx) if 0 <= idx <= max_idx]
    if not targets:
        return logits
    masked = logits.clone()
    neg_inf = torch.finfo(masked.dtype).min
    for idx in targets:
        masked[..., idx] = neg_inf
    return masked


def fill_audio_channels(
    delays: Sequence[int],
    constants,
    step: int,
    step_tokens: torch.Tensor,
    audio_buf: torch.Tensor,
) -> None:
    for cb, delay in enumerate(delays):
        idx = step - delay
        in_bounds = idx >= 0 and step < audio_buf.shape[-1]
        if in_bounds:
            step_tokens[:, 2 + cb, 0] = audio_buf[:, cb, step]
        else:
            step_tokens[:, 2 + cb, 0] = constants.audio_bos


def write_wav(path: str | Path, audio: np.ndarray, sample_rate: int) -> None:
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    audio = np.clip(audio, -1.0, 1.0)
    pcm16 = (audio * 32767.0).astype(np.int16)
    import wave

    with wave.open(str(path), "wb") as handle:
        handle.setnchannels(1)
        handle.setsampwidth(2)
        handle.setframerate(sample_rate)
        handle.writeframes(pcm16.tobytes())


__all__ = [
    "delay_frames",
    "undelay_frames",
    "mask_audio_logits",
    "fill_audio_channels",
    "write_wav",
]
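
A small round-trip sketch for the delay grid helpers: delay_frames shifts each codebook row right by its delay (padding the vacated slots), and undelay_frames inverts it exactly. Only the package import path is an assumption.

import torch
from dia2.audio.grid import delay_frames, undelay_frames  # package path is an assumption

PAD = -1
aligned = torch.arange(12).view(3, 4)        # 3 codebooks, 4 frames
delays = [0, 1, 2]
delayed = delay_frames(aligned, delays, PAD)  # shape (3, 4 + max_delay), PAD fills the shifted slots
restored = undelay_frames(delayed, delays, PAD)
assert torch.equal(restored, aligned)
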
cli.py ADDED
@@ -0,0 +1,122 @@
from __future__ import annotations

import argparse

import torch

from .engine import Dia2
from .generation import (
    build_generation_config,
    load_script_text,
    validate_generation_params,
)


def main() -> None:
    parser = argparse.ArgumentParser(description="Generate audio with Dia2")
    parser.add_argument("--config", help="Path to config.json (overrides repo lookup)")
    parser.add_argument(
        "--weights", help="Path to model.safetensors (overrides repo lookup)"
    )
    parser.add_argument(
        "--hf",
        required=False,
        help="Hugging Face repo id to download config/weights from (e.g. nari-labs/Dia2-2B)",
    )
    parser.add_argument(
        "--input", default="input.txt", help="Script text file (default: input.txt)"
    )
    parser.add_argument("output", help="Output WAV path")
    parser.add_argument(
        "--device",
        default=None,
        help="Computation device (defaults to cuda if available, else cpu)",
    )
    parser.add_argument(
        "--dtype",
        choices=["auto", "float32", "bfloat16"],
        default="bfloat16",
        help="Computation dtype (default: bfloat16)",
    )
    parser.add_argument("--topk", type=int, default=50)
    parser.add_argument("--temperature", type=float, default=0.8)
    parser.add_argument("--cfg", type=float, default=1.0)
    parser.add_argument("--tokenizer", help="Tokenizer repo or local path override")
    parser.add_argument(
        "--mimi", help="Mimi repo id override (defaults to config/assets)"
    )
    parser.add_argument("--prefix-speaker-1", help="Prefix audio file for speaker 1")
    parser.add_argument("--prefix-speaker-2", help="Prefix audio file for speaker 2")
    parser.add_argument(
        "--include-prefix",
        action="store_true",
        help="Keep prefix audio in the final waveform (default: trimmed)",
    )
    parser.add_argument(
        "--verbose", action="store_true", help="Print generation progress logs"
    )
    parser.add_argument(
        "--cuda-graph",
        action="store_true",
        help="Run generation with CUDA graph capture",
    )
    args = parser.parse_args()

    device = args.device
    if device is None or device == "auto":
        device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = args.dtype or "bfloat16"

    repo = args.hf
    if repo:
        dia = Dia2(
            repo=repo,
            device=device,
            dtype=dtype,
            tokenizer_id=args.tokenizer,
            mimi_id=args.mimi,
        )
    elif args.config and args.weights:
        dia = Dia2.from_local(
            config_path=args.config,
            weights_path=args.weights,
            device=device,
            dtype=dtype,
            tokenizer_id=args.tokenizer,
            mimi_id=args.mimi,
        )
    else:
        raise ValueError("Provide --hf or both --config and --weights")

    script = load_script_text(args.input)
    temperature, top_k, cfg_scale = validate_generation_params(
        temperature=args.temperature,
        top_k=args.topk,
        cfg_scale=args.cfg,
    )
    config = build_generation_config(
        temperature=temperature,
        top_k=top_k,
        cfg_scale=cfg_scale,
    )
    overrides = {}
    if args.cuda_graph:
        overrides["use_cuda_graph"] = True
    if args.prefix_speaker_1:
        overrides["prefix_speaker_1"] = args.prefix_speaker_1
    if args.prefix_speaker_2:
        overrides["prefix_speaker_2"] = args.prefix_speaker_2
    if args.include_prefix:
        overrides["include_prefix"] = True

    dia.generate(
        script,
        config=config,
        output_wav=args.output,
        verbose=args.verbose,
        **overrides,
    )


if __name__ == "__main__":
    main()
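
An invocation sketch for the CLI, written in Python so it stays self-contained; the flags mirror the parser above, while the import path ("dia2.cli") and entry-point name are assumptions about how the package is installed.

import sys
from dia2.cli import main  # module path is an assumption

# Equivalent to: <cli> --hf nari-labs/Dia2-2B --input input.txt --cfg 2.0 --verbose out.wav
sys.argv = [
    "dia2",
    "--hf", "nari-labs/Dia2-2B",
    "--input", "input.txt",
    "--cfg", "2.0",
    "--verbose",
    "out.wav",
]
main()
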
config.py ADDED
@@ -0,0 +1,180 @@
from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional


@dataclass(frozen=True)
class DataConfig:
    channels: int
    text_vocab_size: int
    audio_vocab_size: int
    action_vocab_size: int
    text_pad_token_id: int
    text_new_word_token_id: int
    text_zero_token_id: int
    audio_pad_token_id: int
    audio_bos_token_id: int
    action_pad_token_id: int
    action_new_word_token_id: int
    delay_pattern: List[int]
    first_word_min_start: int
    max_pad: int
    second_stream_ahead: int
    tokenizer_path: Optional[str] = None


@dataclass(frozen=True)
class DecoderConfig:
    n_layer: int
    n_embd: int
    n_hidden: int
    gqa_query_heads: int
    kv_heads: int
    gqa_head_dim: int
    dropout: float
    low_rank_dim: int | None = None


@dataclass(frozen=True)
class DepformerConfig:
    n_layer: int
    n_embd: int
    n_hidden: int
    gqa_query_heads: int
    kv_heads: int
    gqa_head_dim: int
    apply_rope: bool
    text_embedding: bool
    mlp_activations: List[str]


@dataclass(frozen=True)
class LinearHeadConfig:
    mlp_activations: List[str]


@dataclass(frozen=True)
class ModelConfig:
    decoder: DecoderConfig
    depformer: DepformerConfig
    linear: LinearHeadConfig
    dropout: float
    rope_min_timescale: int
    rope_max_timescale: int
    normalization_layer_epsilon: float


@dataclass(frozen=True)
class RuntimeConfig:
    weights_schedule: List[int]
    max_context_steps: int


@dataclass(frozen=True)
class AssetsConfig:
    tokenizer: Optional[str]
    mimi: Optional[str]


@dataclass(frozen=True)
class DiaConfig:
    data: DataConfig
    model: ModelConfig
    runtime: RuntimeConfig
    assets: AssetsConfig


def _resolve_runtime(block: dict | None, data_cfg: DataConfig) -> RuntimeConfig:
    block = block or {}
    weights_schedule = block.get("weights_schedule")
    if weights_schedule is None:
        audio_channels = max(0, data_cfg.channels - 2)
        weights_schedule = list(range(max(audio_channels - 1, 0)))
    max_context = block.get("max_context_steps", 1500)
    return RuntimeConfig(
        weights_schedule=list(weights_schedule),
        max_context_steps=int(max_context),
    )


def load_config(path: str | Path) -> DiaConfig:
    cfg = json.loads(Path(path).read_text())
    data = cfg["data"]
    model = cfg["model"]
    runtime_cfg_raw = cfg.get("runtime")
    if runtime_cfg_raw is None:
        raise ValueError(f"Config '{path}' is missing a runtime block")

    decoder_cfg = DecoderConfig(
        n_layer=model["decoder"]["n_layer"],
        n_embd=model["decoder"]["n_embd"],
        n_hidden=model["decoder"]["n_hidden"],
        gqa_query_heads=model["decoder"]["gqa_query_heads"],
        kv_heads=model["decoder"]["kv_heads"],
        gqa_head_dim=model["decoder"]["gqa_head_dim"],
        dropout=model.get("dropout", 0.0),
        low_rank_dim=model["decoder"].get("low_rank_dim"),
    )

    depformer_cfg = DepformerConfig(
        n_layer=model["depformer"]["n_layer"],
        n_embd=model["depformer"]["n_embd"],
        n_hidden=model["depformer"]["n_hidden"],
        gqa_query_heads=model["depformer"]["gqa_query_heads"],
        kv_heads=model["depformer"]["kv_heads"],
        gqa_head_dim=model["depformer"]["gqa_head_dim"],
        apply_rope=model["depformer"].get("apply_rope", True),
        text_embedding=model["depformer"].get("text_embedding", True),
        mlp_activations=model["depformer"].get("mlp_activations", ["silu", "linear"]),
    )

    data_cfg = DataConfig(
        channels=data["channels"],
        text_vocab_size=data["text_vocab_size"],
        audio_vocab_size=data["audio_vocab_size"],
        action_vocab_size=data["action_vocab_size"],
        text_pad_token_id=data["text_pad_token_id"],
        text_new_word_token_id=data["text_new_word_token_id"],
        text_zero_token_id=data.get("text_zero_token_id", 7),
        audio_pad_token_id=data.get("audio_pad_token_id", data["audio_vocab_size"] - 1),
        audio_bos_token_id=data.get("audio_bos_token_id", data["audio_vocab_size"] - 2),
        action_pad_token_id=data["action_pad_token_id"],
        action_new_word_token_id=data["action_new_word_token_id"],
        delay_pattern=list(data.get("delay_pattern", [])),
        first_word_min_start=data.get("first_word_min_start", 0),
        max_pad=data.get("max_pad", 0),
        second_stream_ahead=data.get("second_stream_ahead", 0),
        tokenizer_path=data.get("tokenizer_path"),
    )

    runtime_cfg = _resolve_runtime(runtime_cfg_raw, data_cfg)

    linear_cfg = LinearHeadConfig(
        mlp_activations=model.get("linear", {}).get("mlp_activations", ["silu", "linear"]),
    )

    model_cfg = ModelConfig(
        decoder=decoder_cfg,
        depformer=depformer_cfg,
        linear=linear_cfg,
        dropout=model.get("dropout", 0.0),
        rope_min_timescale=model.get("rope_min_timescale", 1),
        rope_max_timescale=model.get("rope_max_timescale", 10000),
        normalization_layer_epsilon=model.get("normalization_layer_epsilon", 1e-5),
    )

    assets_raw = cfg.get("assets") or {}
    assets_cfg = AssetsConfig(
        tokenizer=assets_raw.get("tokenizer") or data_cfg.tokenizer_path,
        mimi=assets_raw.get("mimi"),
    )

    return DiaConfig(
        data=data_cfg,
        model=model_cfg,
        runtime=runtime_cfg,
        assets=assets_cfg,
    )
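
A sketch of the smallest config.json that load_config accepts, exercising only the required keys; every numeric value below is illustrative and unrelated to the shipped checkpoint, and the package import path is an assumption.

import json
import tempfile

from dia2.config import load_config  # package path is an assumption

minimal = {
    "data": {
        "channels": 10,  # 2 text/action streams + 8 audio codebooks (illustrative)
        "text_vocab_size": 256,
        "audio_vocab_size": 2050,
        "action_vocab_size": 4,
        "text_pad_token_id": 0,
        "text_new_word_token_id": 1,
        "action_pad_token_id": 0,
        "action_new_word_token_id": 1,
    },
    "model": {
        "decoder": {"n_layer": 2, "n_embd": 64, "n_hidden": 128,
                    "gqa_query_heads": 4, "kv_heads": 1, "gqa_head_dim": 16},
        "depformer": {"n_layer": 1, "n_embd": 64, "n_hidden": 128,
                      "gqa_query_heads": 4, "kv_heads": 1, "gqa_head_dim": 16},
    },
    "runtime": {"max_context_steps": 1500},
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as handle:
    json.dump(minimal, handle)

cfg = load_config(handle.name)
print(cfg.runtime.weights_schedule)  # defaults to range(audio_channels - 1) when unset
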
core/__init__.py ADDED
@@ -0,0 +1,10 @@
from .model import Dia2Model, DecodeState
from .transformer import TransformerDecoder
from .depformer import Depformer

__all__ = [
    "Dia2Model",
    "DecodeState",
    "TransformerDecoder",
    "Depformer",
]
core/cache.py ADDED
@@ -0,0 +1,106 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import List

import torch


@dataclass
class CacheSlot:
    keys: torch.Tensor
    values: torch.Tensor

    def __post_init__(self) -> None:
        self.max_steps = self.keys.shape[2]
        self.head_dim = self.keys.shape[3]
        self.flat_heads = self.keys.shape[0] * self.keys.shape[1]
        device = self.keys.device
        self.length = torch.zeros((), dtype=torch.long, device=device)
        self.positions = torch.arange(self.max_steps, dtype=torch.long, device=device)

    @classmethod
    def allocate(
        cls,
        *,
        batch_size: int,
        heads: int,
        max_steps: int,
        head_dim: int,
        device: torch.device,
        dtype: torch.dtype,
    ) -> "CacheSlot":
        keys = torch.zeros(batch_size, heads, max_steps, head_dim, device=device, dtype=dtype)
        values = torch.zeros_like(keys)
        return cls(keys, values)

    def reset(self) -> None:
        self.length.zero_()

    def write_and_view(
        self,
        key_chunk: torch.Tensor,
        value_chunk: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        step = key_chunk.shape[2]
        start = self.length
        indices = self.positions[:step] + start
        expanded = indices.unsqueeze(0).expand(self.flat_heads, -1)

        flat_keys = self.keys.view(self.flat_heads, self.max_steps, self.head_dim)
        flat_values = self.values.view(self.flat_heads, self.max_steps, self.head_dim)
        flat_key_chunk = key_chunk.reshape(self.flat_heads, step, self.head_dim)
        flat_value_chunk = value_chunk.reshape(self.flat_heads, step, self.head_dim)
        scatter_index = expanded.unsqueeze(-1).expand_as(flat_key_chunk)
        flat_keys.scatter_(1, scatter_index, flat_key_chunk)
        flat_values.scatter_(1, scatter_index, flat_value_chunk)

        self.length.add_(step)
        bool_mask = (self.positions >= self.length).view(1, 1, 1, self.max_steps)
        mask_dtype = self.keys.dtype
        mask_value = torch.finfo(mask_dtype).min
        attn_mask = torch.zeros_like(bool_mask, dtype=mask_dtype)
        attn_mask = attn_mask.masked_fill(bool_mask, mask_value)
        return self.keys, self.values, attn_mask


class KVCache:
    def __init__(self, slots: List[CacheSlot]) -> None:
        self.slots = slots

    @classmethod
    def allocate(
        cls,
        *,
        num_layers: int,
        batch_size: int,
        heads: int,
        max_steps: int,
        head_dim: int,
        device: torch.device,
        dtype: torch.dtype,
    ) -> "KVCache":
        slots = [
            CacheSlot.allocate(
                batch_size=batch_size,
                heads=heads,
                max_steps=max_steps,
                head_dim=head_dim,
                device=device,
                dtype=dtype,
            )
            for _ in range(num_layers)
        ]
        return cls(slots)

    def get_slot(self, index: int) -> CacheSlot:
        return self.slots[index]

    def reset(self) -> None:
        for slot in self.slots:
            slot.reset()

    clear = reset


__all__ = ["CacheSlot", "KVCache"]
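
A small sketch of one cache slot in action: a single decode step is scattered into the preallocated key/value buffers, and the returned additive mask hides the positions that have not been written yet. Only the package import path is an assumption.

import torch
from dia2.core.cache import CacheSlot  # package path is an assumption

slot = CacheSlot.allocate(
    batch_size=1, heads=2, max_steps=8, head_dim=4,
    device=torch.device("cpu"), dtype=torch.float32,
)
k = torch.randn(1, 2, 1, 4)  # one decode step: (batch, kv_heads, 1, head_dim)
v = torch.randn(1, 2, 1, 4)
keys, values, mask = slot.write_and_view(k, v)
# one position written, seven still masked out with the dtype's most negative value
print(int(slot.length), mask.shape, (mask < 0).sum().item())  # 1, (1, 1, 1, 8), 7
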
core/depformer.py ADDED
@@ -0,0 +1,264 @@
from __future__ import annotations

from typing import Optional, Tuple

import torch
from torch import nn
import torch.nn.functional as F

from ..config import DiaConfig
from .cache import KVCache
from .layers import MultiStreamEmbedding, Mlp, RotaryEmbedding
from .precision import Precision


class ScheduleAttention(nn.Module):
    """Depformer attention that mirrors dia_v2 ScheduleAttention."""

    def __init__(self, config: DiaConfig, compute_dtype: torch.dtype) -> None:
        super().__init__()
        dep_cfg = config.model.depformer
        runtime = config.runtime
        self.schedule = runtime.weights_schedule
        self.num_query_heads = dep_cfg.gqa_query_heads
        self.num_kv_heads = dep_cfg.kv_heads
        self.head_dim = dep_cfg.gqa_head_dim
        self.num_gqa_groups = self.num_query_heads // max(self.num_kv_heads, 1)
        self.apply_rope = dep_cfg.apply_rope
        self.used_ids = sorted(set(self.schedule))
        self.compute_dtype = compute_dtype

        self.in_proj = nn.ModuleDict(
            {
                str(i): nn.Linear(
                    dep_cfg.n_embd,
                    3 * self.num_query_heads * self.head_dim,
                    bias=False,
                )
                for i in self.used_ids
            }
        )
        self.out_proj = nn.ModuleDict(
            {
                str(i): nn.Linear(
                    self.num_query_heads * self.head_dim,
                    dep_cfg.n_embd,
                    bias=False,
                )
                for i in self.used_ids
            }
        )
        eps = config.model.normalization_layer_epsilon
        self.q_norm = nn.RMSNorm(self.head_dim, eps=eps, dtype=torch.float32)
        self.k_norm = nn.RMSNorm(self.head_dim, eps=eps, dtype=torch.float32)

        if self.apply_rope:
            self.rotary = RotaryEmbedding(
                self.head_dim,
                config.model.rope_min_timescale,
                config.model.rope_max_timescale,
            )
            stage_count = max(len(self.schedule), 1)
            self.register_buffer(
                "stage_positions",
                torch.arange(stage_count, dtype=torch.long).view(stage_count, 1),
                persistent=False,
            )
        else:
            self.rotary = None
            self.register_buffer(
                "stage_positions",
                torch.zeros(0, 1, dtype=torch.long),
                persistent=False,
            )

    def forward_incremental(
        self,
        x_t: torch.Tensor,
        stage_index: int,
        cache_slot,
    ) -> Tuple[torch.Tensor, object]:
        bsz, seq, _ = x_t.shape
        if seq != 1:
            raise ValueError("ScheduleAttention expects seq len 1 during decoding")
        orig_dtype = x_t.dtype
        module_index = self.schedule[stage_index]
        proj = self.in_proj[str(module_index)](x_t.to(torch.float32))
        proj = proj.view(bsz, seq, 3, self.num_query_heads, self.head_dim).to(self.compute_dtype)

        q_proj = self.q_norm(proj[:, :, 0])
        k_proj = self.k_norm(proj[:, :, 1])
        v_proj = proj[:, :, 2]

        if self.apply_rope:
            pos_ids = self.stage_positions[stage_index : stage_index + 1]
            if pos_ids.device != x_t.device:
                pos_ids = pos_ids.to(x_t.device)
            q_proj = self.rotary(q_proj, pos_ids)
            k_proj = self.rotary(k_proj, pos_ids)

        q = q_proj.transpose(1, 2)
        k = k_proj.transpose(1, 2)
        v = v_proj.transpose(1, 2)

        if cache_slot is not None:
            k, v, attn_mask = cache_slot.write_and_view(k, v)
        else:
            attn_mask = None

        attn = F.scaled_dot_product_attention(
            q,
            k,
            v,
            scale=1.0,
            attn_mask=attn_mask,
            enable_gqa=self.num_gqa_groups > 1,
        )
        attn = attn.transpose(1, 2).contiguous()
        flat = attn.reshape(bsz, seq, self.num_query_heads * self.head_dim)
        out = self.out_proj[str(module_index)](flat.to(torch.float32))
        return out.to(orig_dtype), cache_slot


class DepformerLayer(nn.Module):
    def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
        super().__init__()
        dep_cfg = config.model.depformer
        eps = config.model.normalization_layer_epsilon
        self.pre_norm = nn.RMSNorm(dep_cfg.n_embd, eps=eps, dtype=torch.float32)
        self.post_norm = nn.RMSNorm(dep_cfg.n_embd, eps=eps, dtype=torch.float32)
        self.self_attention = ScheduleAttention(config, compute_dtype)
        self.mlp = Mlp(
            dep_cfg.n_embd,
            dep_cfg.n_hidden,
            compute_dtype,
            tuple(config.model.depformer.mlp_activations),
        )

    def decode_step(
        self,
        x_t: torch.Tensor,
        stage_index: int,
        cache_slot,
    ) -> Tuple[torch.Tensor, object]:
        residual = x_t
        x_norm = self.pre_norm(x_t)
        sa_out, _ = self.self_attention.forward_incremental(x_norm, stage_index, cache_slot)
        x = residual + sa_out
        residual2 = x
        x_norm2 = self.post_norm(x)
        mlp_out = self.mlp(x_norm2)
        return residual2 + mlp_out, cache_slot


class Depformer(nn.Module):
    def __init__(self, config: DiaConfig, precision: Precision):
        super().__init__()
        self.config = config
        self.precision = precision
        dep_cfg = config.model.depformer
        data_cfg = config.data
        runtime = config.runtime

        self.num_audio_channels = max(0, data_cfg.channels - 2)
        self.num_depth = max(self.num_audio_channels - 1, 0)
        self.weights_schedule = runtime.weights_schedule

        self.audio_embeds = nn.ModuleList(
            [nn.Embedding(data_cfg.audio_vocab_size, dep_cfg.n_embd) for _ in range(self.num_depth)]
        )
        if dep_cfg.text_embedding:
            self.text_embed = MultiStreamEmbedding(
                data_cfg.text_vocab_size,
                dep_cfg.n_embd,
                pad_id=data_cfg.text_pad_token_id,
                output_dtype=precision.compute,
            )
        else:
            self.text_embed = None

        used_ids = sorted(set(self.weights_schedule))
        self.depformer_in = nn.ModuleDict(
            {
                str(i): nn.Linear(
                    config.model.decoder.n_embd,
                    dep_cfg.n_embd,
                    bias=False,
                )
                for i in used_ids
            }
        )

        self.layers = nn.ModuleList([DepformerLayer(config, precision.compute) for _ in range(dep_cfg.n_layer)])
        self.norm = nn.RMSNorm(dep_cfg.n_embd, eps=config.model.normalization_layer_epsilon)
        self.logits_dtype = precision.logits
        self.logits = nn.ModuleList(
            [
                nn.Linear(dep_cfg.n_embd, data_cfg.audio_vocab_size, bias=False)
                for _ in range(self.num_depth)
            ]
        )
        self.audio_vocab_limit = min(data_cfg.audio_pad_token_id, data_cfg.audio_bos_token_id)

    def init_cache(self, batch_size: int, device: torch.device, max_steps: int) -> KVCache:
        heads = self.layers[0].self_attention.num_kv_heads
        head_dim = self.layers[0].self_attention.head_dim
        return KVCache.allocate(
            num_layers=len(self.layers),
            batch_size=batch_size,
            heads=heads,
            max_steps=max_steps,
            head_dim=head_dim,
            device=device,
            dtype=self.precision.compute,
        )

    def forward_step(
        self,
        prev_audio: torch.Tensor,
        transformer_out: torch.Tensor,
        stage_index: int,
        cache: KVCache,
        main_text: Optional[torch.Tensor],
        second_text: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, KVCache]:
        self._validate_inputs(stage_index, cache)
        return self._forward_stage(stage_index, prev_audio, transformer_out, cache, main_text, second_text)

    def _forward_stage(
        self,
        stage_index: int,
        prev_audio: torch.Tensor,
        transformer_out: torch.Tensor,
        cache: KVCache,
        main_text: Optional[torch.Tensor],
        second_text: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, KVCache]:
        prev_audio = prev_audio.long()
        weight_idx = self.weights_schedule[stage_index]
        token_emb = self.audio_embeds[stage_index](prev_audio[:, None]).to(self.precision.compute)
        if stage_index == 0 and self.text_embed is not None:
            if main_text is None or second_text is None:
                raise ValueError("stage 0 requires text tokens")
            token_emb = token_emb + self.text_embed(main_text[:, None], second_text[:, None])

        dep_in = self.depformer_in[str(weight_idx)](transformer_out.to(torch.float32))
        dep_in = dep_in.to(self.precision.compute)
        dep_in = dep_in + token_emb.to(dep_in.dtype)
        x = dep_in
        for idx, layer in enumerate(self.layers):
            slot = cache.get_slot(idx)
            x, _ = layer.decode_step(x, stage_index, slot)

        hidden = self.norm(x)
        logits = self.logits[stage_index](hidden.to(torch.float32))
        logits = logits.to(self.logits_dtype)
        logits = logits.unsqueeze(1)
        logits = logits[..., : self.audio_vocab_limit]
        return logits, cache

    def _validate_inputs(self, stage_index: int, cache: KVCache | None) -> None:
        if stage_index < 0 or stage_index >= self.num_depth:
            raise ValueError(f"stage_index {stage_index} out of range (depth={self.num_depth})")
        if cache is None:
            raise ValueError("depformer cache must be initialized")
core/layers.py ADDED
@@ -0,0 +1,209 @@
from __future__ import annotations

import math
from dataclasses import dataclass
from typing import List, Optional, Sequence, Tuple, Union

import torch
from torch import nn
import torch.nn.functional as F

from ..config import DiaConfig  # used in type hints below


class RotaryEmbedding(nn.Module):
    def __init__(self, head_dim: int, min_timescale: int, max_timescale: int):
        super().__init__()
        if head_dim % 2 != 0:
            raise ValueError("RoPE dimension must be even")
        half_dim = head_dim // 2
        fraction = (2.0 * torch.arange(0, half_dim)) / head_dim
        timescale = min_timescale * (max_timescale / min_timescale) ** fraction
        inv_freq = 1.0 / timescale
        self.register_buffer("inv_freq", inv_freq.to(torch.float32), persistent=False)

    def forward(self, x: torch.Tensor, position_ids: torch.Tensor) -> torch.Tensor:
        pos = position_ids.to(self.inv_freq.dtype)
        freqs = torch.einsum("...i,j->...ij", pos, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        while emb.dim() < x.dim():
            emb = emb.unsqueeze(-2)
        cos = emb.cos().to(x.dtype)
        sin = emb.sin().to(x.dtype)
        x1, x2 = torch.chunk(x, 2, dim=-1)
        rotated = torch.cat((-x2, x1), dim=-1)
        return (x * cos) + (rotated * sin)


def _rotate_half(x: torch.Tensor) -> torch.Tensor:
    x1 = x[..., ::2]
    x2 = x[..., 1::2]
    return torch.stack((-x2, x1), dim=-1).reshape_as(x)


def _get_activation(name: str) -> nn.Module:
    name = name.lower()
    if name in ("silu", "swish", "swiglu"):
        return nn.SiLU()
    if name in ("gelu", "geglu"):
        return nn.GELU()
    if name == "relu":
        return nn.ReLU()
    if name == "linear":
        return nn.Identity()
    raise ValueError(f"Unsupported activation {name}")


@dataclass
class AttentionShape:
    dim: int
    heads: int
    kv_heads: int
    head_dim: int
    rope_min: int
    rope_max: int
    apply_rope: bool


class Attention(nn.Module):
    """Byte-for-byte port of dia_v2 Attention.forward_incremental."""

    def __init__(self, config: DiaConfig, dim: int, compute_dtype: torch.dtype) -> None:
        super().__init__()
        dec = config.model.decoder
        self.num_query_heads = dec.gqa_query_heads
        self.num_kv_heads = dec.kv_heads
        self.head_dim = dec.gqa_head_dim
        self.num_gqa_groups = self.num_query_heads // max(self.num_kv_heads, 1)
        self.compute_dtype = compute_dtype
        self.q_proj = nn.Linear(dim, self.num_query_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(dim, self.num_kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(dim, self.num_kv_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_query_heads * self.head_dim, dim, bias=False)
        eps = config.model.normalization_layer_epsilon
        self.q_norm = nn.RMSNorm(self.head_dim, eps=eps, dtype=torch.float32)
        self.k_norm = nn.RMSNorm(self.head_dim, eps=eps, dtype=torch.float32)
        self.rotary = RotaryEmbedding(
            self.head_dim,
            config.model.rope_min_timescale,
            config.model.rope_max_timescale,
        )

    def forward_incremental(
        self,
        x: torch.Tensor,
        pos: Optional[torch.Tensor],
        cache_slot,
    ) -> Tuple[torch.Tensor, object]:
        B, T, _ = x.shape
        if T != 1:
            raise ValueError("Attention expects sequence length 1 during decoding")
        orig_dtype = x.dtype
        q_proj = self._project_heads(self.q_proj, x, self.num_query_heads)
        k_proj = self._project_heads(self.k_proj, x, self.num_kv_heads)
        v_proj = self._project_heads(self.v_proj, x, self.num_kv_heads)
        q_proj = self.q_norm(q_proj)
        k_proj = self.k_norm(k_proj)
        if pos is not None:
            q_proj = self.rotary(q_proj, pos)
            k_proj = self.rotary(k_proj, pos)
        q = q_proj.transpose(1, 2)
        k = k_proj.transpose(1, 2)
        v = v_proj.transpose(1, 2)
        if cache_slot is not None:
            k_cache, v_cache, attn_mask = cache_slot.write_and_view(k, v)
        else:
            k_cache, v_cache = k, v
            attn_mask = None
        attn = F.scaled_dot_product_attention(
            q,
            k_cache,
            v_cache,
            scale=1.0,
            attn_mask=attn_mask,
            enable_gqa=self.num_gqa_groups > 1,
        )
        attn = attn.transpose(1, 2).contiguous()
        flat = attn.reshape(B, T, self.num_query_heads * self.head_dim)
        out = self.o_proj(flat.to(torch.float32))
        return out.to(orig_dtype), cache_slot

    def _project_heads(self, layer: nn.Linear, x: torch.Tensor, heads: int) -> torch.Tensor:
        proj = layer(x.to(torch.float32))
        B, T, _ = proj.shape
        proj = proj.view(B, T, heads, self.head_dim)
        return proj.to(self.compute_dtype)

    def forward(
        self,
        x: torch.Tensor,
        positions: Optional[torch.Tensor],
        cache=None,
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        return self.forward_incremental(x, positions, cache)


class MultiStreamEmbedding(nn.Module):
    """Port of dia_v2 MultiStreamEmbed."""

    def __init__(
        self,
        vocab_size: int,
        dim: int,
        pad_id: int,
        *,
        output_dtype: torch.dtype,
        low_rank_dim: Optional[int] = None,
    ) -> None:
        super().__init__()
        self.pad_id = pad_id
        self.dtype = output_dtype
        base_dim = low_rank_dim if low_rank_dim is not None else dim
        self.embedding = nn.Embedding(vocab_size, base_dim)
        self.main_proj = nn.Linear(base_dim, dim, bias=False)
        self.second_proj = nn.Linear(base_dim, dim, bias=False)

    def forward(self, main_inputs: torch.Tensor, second_inputs: torch.Tensor) -> torch.Tensor:
        main_inputs = main_inputs.long()
        second_inputs = second_inputs.long()
        if self.pad_id is not None:
            second_is_pad = second_inputs == self.pad_id
        else:
            second_is_pad = torch.zeros_like(second_inputs, dtype=torch.bool)
        use_second = ~second_is_pad
        emb_main = self.embedding(main_inputs)
        emb_second = self.embedding(second_inputs)
        out_main = self.main_proj(emb_main.to(torch.float32))
        out_second = self.second_proj(emb_second.to(torch.float32))
        zeros = torch.zeros_like(out_second)
        y = out_main + torch.where(use_second.unsqueeze(-1), out_second, zeros)
        target_dtype = self.dtype if self.dtype is not None else y.dtype
        return y.to(target_dtype)


class Mlp(nn.Module):
    """Port of dia_v2 MlpBlock (two-activation gated MLP)."""

    def __init__(
        self,
        dim: int,
        hidden: int,
        compute_dtype: torch.dtype,
        activations: Sequence[str],
    ) -> None:
        super().__init__()
        if len(activations) != 2:
            raise ValueError("Mlp expects two activation functions.")
        self.dtype = compute_dtype
        self.hidden = hidden
        self.branch_count = len(activations)
        self.wi = nn.Linear(dim, self.branch_count * hidden, bias=False)
        self.wo = nn.Linear(hidden, dim, bias=False)
        self.activation_fns = [_get_activation(activations[0]), _get_activation(activations[1])]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        proj = self.wi(x.to(torch.float32))
        proj = proj.view(*x.shape[:-1], self.branch_count, self.hidden).to(self.dtype)
        gate, up = proj.unbind(dim=-2)
        hidden = self.activation_fns[0](gate) * self.activation_fns[1](up)
        out = self.wo(hidden.to(torch.float32))
        return out.to(self.dtype)
core/model.py ADDED
@@ -0,0 +1,72 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Optional

import torch
from torch import nn

from ..config import DiaConfig
from .cache import KVCache
from .depformer import Depformer
from .precision import Precision
from .transformer import TransformerDecoder


@dataclass
class DecodeState:
    transformer: KVCache
    depformer: KVCache


class Dia2Model(nn.Module):
    def __init__(self, config: DiaConfig, precision: Precision):
        super().__init__()
        self.config = config
        self.precision = precision
        self.transformer = TransformerDecoder(config, precision)
        self.depformer = Depformer(config, precision)
        self._cast_norms_to_compute()

    def init_state(self, batch_size: int, device: torch.device, max_steps: int) -> DecodeState:
        transformer_cache = self.transformer.init_cache(batch_size, device, max_steps)
        depformer_cache = self.depformer.init_cache(batch_size, device, self.depformer.num_depth)
        return DecodeState(transformer_cache, depformer_cache)

    def step_text(
        self,
        tokens: torch.Tensor,
        positions: torch.Tensor,
        state: DecodeState,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        hidden, action, cb0, cache = self.transformer.forward_step(tokens, positions, state.transformer)
        state.transformer = cache
        return hidden, action, cb0

    def step_audio_stage(
        self,
        stage_index: int,
        prev_audio: torch.Tensor,
        transformer_hidden: torch.Tensor,
        state: DecodeState,
        main_text: Optional[torch.Tensor],
        second_text: Optional[torch.Tensor],
    ) -> torch.Tensor:
        cache = state.depformer
        logits, new_cache = self.depformer.forward_step(
            prev_audio,
            transformer_hidden,
            stage_index,
            cache,
            main_text,
            second_text,
        )
        state.depformer = new_cache
        return logits

    def _cast_norms_to_compute(self) -> None:
        """Cast RMSNorm weights/biases to the compute dtype to avoid bf16 warnings."""

        def _convert(module: nn.Module) -> None:
            if isinstance(module, nn.RMSNorm):
                module.to(self.precision.compute)

        self.apply(_convert)
core/precision.py ADDED
@@ -0,0 +1,23 @@
from __future__ import annotations

from dataclasses import dataclass

import torch


@dataclass(frozen=True)
class Precision:
    compute: torch.dtype
    logits: torch.dtype


def resolve_precision(kind: str | None, device: torch.device) -> Precision:
    normalized = (kind or "auto").lower()
    if normalized == "auto":
        normalized = "bfloat16" if device.type == "cuda" else "float32"
    if normalized == "bfloat16":
        compute = torch.bfloat16 if device.type == "cuda" else torch.float32
        return Precision(compute=compute, logits=torch.float32)
    if normalized == "float32":
        return Precision(compute=torch.float32, logits=torch.float32)
    raise ValueError(f"Unsupported dtype '{kind}'")
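
A quick sketch of how the precision resolver behaves on different devices; the package import path is an assumption.

import torch
from dia2.core.precision import resolve_precision  # package path is an assumption

print(resolve_precision("auto", torch.device("cpu")))       # compute=float32, logits=float32
print(resolve_precision("bfloat16", torch.device("cuda")))  # compute=bfloat16, logits=float32
print(resolve_precision("bfloat16", torch.device("cpu")))   # bf16 request falls back to float32 compute
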
core/transformer.py ADDED
@@ -0,0 +1,140 @@
from __future__ import annotations

from typing import Optional, Tuple

import torch
from torch import nn
import torch.nn.functional as F

from ..config import DiaConfig
from .cache import KVCache
from .precision import Precision
from .layers import (
    AttentionShape,
    MultiStreamEmbedding,
    Mlp,
    Attention,
)


class TransformerDecoder(nn.Module):
    """Inference-time port of dia_v2.model.Transformer."""

    def __init__(self, config: DiaConfig, precision: Precision):
        super().__init__()
        self.config = config
        self.precision = precision
        data_cfg = config.data
        dec_cfg = config.model.decoder

        self.audio_embeds = nn.ModuleList(
            [
                nn.Embedding(
                    data_cfg.audio_vocab_size,
                    dec_cfg.n_embd,
                )
                for _ in range(max(0, data_cfg.channels - 2))
            ]
        )
        self.text_embed = MultiStreamEmbedding(
            data_cfg.text_vocab_size,
            dec_cfg.n_embd,
            pad_id=data_cfg.text_pad_token_id,
            output_dtype=self.precision.compute,
            low_rank_dim=dec_cfg.low_rank_dim,
        )
        self.layers = nn.ModuleList([DecoderLayer(config, precision) for _ in range(dec_cfg.n_layer)])
        self.norm = nn.RMSNorm(dec_cfg.n_embd, eps=config.model.normalization_layer_epsilon, dtype=torch.float32)

        self.action_head = nn.Linear(dec_cfg.n_embd, data_cfg.action_vocab_size, bias=False)
        self.cb0_head = nn.Linear(dec_cfg.n_embd, data_cfg.audio_vocab_size, bias=False)

    def init_cache(self, batch_size: int, device: torch.device, max_steps: int) -> KVCache:
        heads = self.layers[0].attn.num_kv_heads
        head_dim = self.layers[0].attn.head_dim
        return KVCache.allocate(
            num_layers=len(self.layers),
            batch_size=batch_size,
            heads=heads,
            max_steps=max_steps,
            head_dim=head_dim,
            device=device,
            dtype=self.precision.compute,
        )

    def forward_step(
        self,
        tokens: torch.Tensor,
        positions: torch.Tensor,
        cache: KVCache,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, KVCache]:
        if cache is None:
            raise ValueError("Transformer cache must be initialized")

        B, C, T1 = tokens.shape
        if T1 != 1:
            raise ValueError("forward_step expects sequence length 1")
        num_audio_channels = max(0, C - 2)

        hidden_t = self.text_embed(tokens[:, 0, :], tokens[:, 1, :])
        for idx in range(num_audio_channels):
            audio_emb = self.audio_embeds[idx](tokens[:, idx + 2, :])
            hidden_t.add_(audio_emb)
        hidden_t = hidden_t.to(self.precision.compute)

        x = hidden_t
        for idx, layer in enumerate(self.layers):
            slot = cache.get_slot(idx)
            x, _ = layer.decode_step(x, positions, slot)

        hidden_norm = self.norm(x)
        action_logits = self.action_head(hidden_norm.to(torch.float32)).to(self.precision.logits)
        cb0_logits = self.cb0_head(hidden_norm.to(torch.float32)).to(self.precision.logits)
        return hidden_norm, action_logits, cb0_logits, cache

    def _embed(self, tokens: torch.Tensor) -> torch.Tensor:
        B, C, T1 = tokens.shape
        if T1 != 1:
            raise ValueError("_embed expects sequence length 1")
        num_audio_channels = max(0, C - 2)
        text_hidden = self.text_embed(tokens[:, 0, :], tokens[:, 1, :])
        audio_terms: list[torch.Tensor] = []
        for idx in range(num_audio_channels):
            audio_emb = self.audio_embeds[idx](tokens[:, idx + 2, :])
            audio_terms.append(audio_emb)
        hidden = text_hidden
        for term in audio_terms:
            hidden = hidden + term
        final = hidden.to(self.precision.compute)
        return final


class DecoderLayer(nn.Module):
    def __init__(self, config: DiaConfig, precision: Precision):
        super().__init__()
        dec = config.model.decoder
        eps = config.model.normalization_layer_epsilon
        self.pre_norm = nn.RMSNorm(dec.n_embd, eps=eps, dtype=torch.float32)
        self.attn = Attention(config, dec.n_embd, precision.compute)
        self.post_norm = nn.RMSNorm(dec.n_embd, eps=eps, dtype=torch.float32)
        self.mlp = Mlp(
            dec.n_embd,
            dec.n_hidden,
            precision.compute,
            tuple(config.model.linear.mlp_activations),
        )

    def decode_step(
        self,
        x: torch.Tensor,
        pos: torch.Tensor,
        cache_slot,
    ) -> Tuple[torch.Tensor, object]:
        residual = x
        x_norm = self.pre_norm(x)
        attn_out, _ = self.attn(x_norm, pos, cache_slot)
        x = residual + attn_out
        residual2 = x
        x_norm2 = self.post_norm(x)
        mlp_out = self.mlp(x_norm2)
        return residual2 + mlp_out, cache_slot
engine.py ADDED
@@ -0,0 +1,230 @@
from __future__ import annotations

from pathlib import Path
from typing import Optional, Sequence

from .assets import resolve_assets
from .runtime.context import RuntimeContext, build_runtime
from .runtime.generator import (
    build_initial_state,
    decode_audio,
    run_generation_loop,
    warmup_with_prefix,
)
from .runtime.script_parser import parse_script
from .audio.grid import undelay_frames, write_wav
from .runtime.voice_clone import build_prefix_plan
from .generation import (
    GenerationConfig,
    GenerationResult,
    merge_generation_config,
    normalize_script,
)
from .runtime.logger import RuntimeLogger


class Dia2:
    def __init__(
        self,
        *,
        repo: Optional[str] = None,
        config_path: Optional[str | Path] = None,
        weights_path: Optional[str | Path] = None,
        tokenizer_id: Optional[str | Path] = None,
        mimi_id: Optional[str] = None,
        device: str = "cuda",
        dtype: str = "auto",
        default_config: Optional[GenerationConfig] = None,
    ) -> None:
        bundle = resolve_assets(
            repo=repo,
            config_path=config_path,
            weights_path=weights_path,
        )
        self._config_path = bundle.config_path
        self._weights_path = bundle.weights_path
        self._tokenizer_id = (str(tokenizer_id) if tokenizer_id else None) or bundle.tokenizer_id
        self._repo_id = bundle.repo_id
        self._mimi_id = mimi_id or bundle.mimi_id
        self.device = device
        self._dtype_pref = dtype or "auto"
        self.default_config = default_config or GenerationConfig()
        self._runtime: Optional[RuntimeContext] = None

    @classmethod
    def from_repo(
        cls,
        repo: str,
        *,
        device: str = "cuda",
        dtype: str = "auto",
        tokenizer_id: Optional[str] = None,
        mimi_id: Optional[str] = None,
    ) -> "Dia2":
        return cls(repo=repo, device=device, dtype=dtype, tokenizer_id=tokenizer_id, mimi_id=mimi_id)

    @classmethod
    def from_local(
        cls,
        config_path: str | Path,
        weights_path: str | Path,
        *,
        device: str = "cuda",
        dtype: str = "auto",
        tokenizer_id: Optional[str | Path] = None,
        mimi_id: Optional[str] = None,
    ) -> "Dia2":
        return cls(
            config_path=config_path,
            weights_path=weights_path,
            tokenizer_id=tokenizer_id,
            device=device,
            dtype=dtype,
            mimi_id=mimi_id,
        )

    def set_device(self, device: str, *, dtype: Optional[str] = None) -> None:
        desired_dtype = dtype or self._dtype_pref
        if self.device == device and desired_dtype == self._dtype_pref:
            return
        self.device = device
        self._dtype_pref = desired_dtype
        self._runtime = None

    def close(self) -> None:
        self._runtime = None

    def _ensure_runtime(self) -> RuntimeContext:
        if self._runtime is None:
            self._runtime = self._build_runtime()
        return self._runtime

    def generate(
        self,
        script: str | Sequence[str],
        *,
        config: Optional[GenerationConfig] = None,
        output_wav: Optional[str | Path] = None,
        prefix_speaker_1: Optional[str] = None,
        prefix_speaker_2: Optional[str] = None,
        include_prefix: Optional[bool] = None,
        verbose: bool = False,
        **overrides,
    ):
        runtime = self._ensure_runtime()
        logger = RuntimeLogger(verbose)
        merged_overrides = dict(overrides)
        if prefix_speaker_1 is not None:
            merged_overrides["prefix_speaker_1"] = prefix_speaker_1
        if prefix_speaker_2 is not None:
            merged_overrides["prefix_speaker_2"] = prefix_speaker_2
        if include_prefix is not None:
            merged_overrides["include_prefix"] = include_prefix
        merged = merge_generation_config(base=config or self.default_config, overrides=merged_overrides)
        max_context = runtime.config.runtime.max_context_steps
        text = normalize_script(script)
        prefix_plan = build_prefix_plan(runtime, merged.prefix)
        entries = []
        if prefix_plan is not None:
            entries.extend(prefix_plan.entries)
        entries.extend(parse_script([text], runtime.tokenizer, runtime.constants, runtime.frame_rate))
        runtime.machine.initial_padding = merged.initial_padding
        logger.event(
            f"starting generation: max_context={max_context} cfg_scale={merged.cfg_scale:.2f} "
            f"device={self.device} dtype={self._dtype_pref}"
        )
        state = runtime.machine.new_state(entries)
        cfg_active = merged.cfg_scale != 1.0
        if cfg_active:
            logger.event(f"classifier-free guidance enabled (scale={merged.cfg_scale:.2f})")
        else:
            logger.event("classifier-free guidance disabled (scale=1.0)")
        gen_state = build_initial_state(
            runtime,
            prefix=prefix_plan,
        )
        include_prefix_audio = bool(prefix_plan and merged.prefix and merged.prefix.include_audio)
        start_step = 0
        if prefix_plan is not None:
            logger.event(f"warming up with prefix ({prefix_plan.aligned_frames} frames)")
            start_step = warmup_with_prefix(runtime, prefix_plan, state, gen_state)
            if include_prefix_audio:
                logger.event("prefix audio will be kept in output")
            else:
                logger.event("prefix audio trimmed from output")
        first_word_frame, audio_buf = run_generation_loop(
            runtime,
            state=state,
            generation=gen_state,
            config=merged,
            start_step=start_step,
            logger=logger,
        )
        aligned = undelay_frames(audio_buf[0], runtime.audio_delays, runtime.constants.audio_pad).unsqueeze(0)
        crop = 0 if include_prefix_audio else max(first_word_frame, 0)
        if crop > 0 and crop < aligned.shape[-1]:
            aligned = aligned[:, :, crop:]
        elif crop >= aligned.shape[-1]:
            crop = 0
        logger.event(f"decoding {aligned.shape[-1]} Mimi frames")
        waveform = decode_audio(runtime, aligned)
        if output_wav is not None:
            write_wav(str(output_wav), waveform.detach().cpu().numpy(), runtime.mimi.sample_rate)
            duration = waveform.shape[-1] / max(runtime.mimi.sample_rate, 1)
            logger.event(f"saved {output_wav} ({duration:.2f}s)")
        frame_rate = max(runtime.frame_rate, 1.0)
        prefix_entry_count = len(prefix_plan.entries) if prefix_plan is not None else 0
        transcript_entries = state.transcript
        if prefix_plan is not None and not include_prefix_audio:
            if len(transcript_entries) > prefix_entry_count:
                transcript_entries = transcript_entries[prefix_entry_count:]
            else:
                transcript_entries = []
        timestamps = []
        for word, step in transcript_entries:
            adj = step - crop
            if adj < 0:
                continue
            timestamps.append((word, adj / frame_rate))
        logger.event(f"generation finished in {logger.elapsed():.2f}s")
        return GenerationResult(aligned, waveform, runtime.mimi.sample_rate, timestamps)

    def save_wav(self, script: str | Sequence[str], path: str | Path, **kwargs):
        return self.generate(script, output_wav=path, **kwargs)

    @property
    def sample_rate(self) -> int:
        return self._ensure_runtime().mimi.sample_rate

    @property
    def tokenizer_id(self) -> Optional[str]:
        if self._tokenizer_id:
            return self._tokenizer_id
        if self._runtime is not None:
            return getattr(self._runtime.tokenizer, "name_or_path", None)
        return self._repo_id

    @property
    def dtype(self) -> str:
        return self._dtype_pref

    @property
    def max_context_steps(self) -> int:
        return self._ensure_runtime().config.runtime.max_context_steps

    @property
    def repo(self) -> Optional[str]:
        return self._repo_id

    def _build_runtime(self) -> RuntimeContext:
        runtime, tokenizer_ref, mimi_ref = build_runtime(
            config_path=self._config_path,
            weights_path=self._weights_path,
            tokenizer_id=self._tokenizer_id,
            repo_id=self._repo_id,
            mimi_id=self._mimi_id,
            device=self.device,
            dtype_pref=self._dtype_pref,
        )
        self._tokenizer_id = tokenizer_ref
        self._mimi_id = mimi_ref
        return runtime
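
A minimal end-to-end usage sketch for the high-level Dia2 API. The top-level package name ("dia2") and the "[S1]"/"[S2]" speaker tags in the script are assumptions; the repo id, device, dtype, and sampling values mirror the CLI defaults above.

from dia2 import Dia2, GenerationConfig, SamplingConfig  # package name is an assumption

dia = Dia2(repo="nari-labs/Dia2-2B", device="cuda", dtype="bfloat16")
config = GenerationConfig(
    audio=SamplingConfig(temperature=0.8, top_k=50),
    cfg_scale=2.0,
)
# script format with [S1]/[S2] speaker tags is an assumption
result = dia.generate(
    "[S1] Hello there. [S2] Hi, how are you?",
    config=config,
    output_wav="out.wav",
    verbose=True,
)
print(result.sample_rate, result.timestamps[:5])
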
generation.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from typing import List, Mapping, Optional, Sequence, Tuple
7
+
8
+ import torch
9
+
10
+
11
+ @dataclass(frozen=True)
12
+ class SamplingConfig:
13
+ temperature: float = 0.8
14
+ top_k: int = 50
15
+
16
+
17
+ def _default_text_sampling() -> SamplingConfig:
18
+ return SamplingConfig(temperature=0.6, top_k=50)
19
+
20
+
21
+ def _default_audio_sampling() -> SamplingConfig:
22
+ return SamplingConfig(temperature=0.8, top_k=50)
23
+
24
+
25
+ @dataclass(frozen=True)
26
+ class PrefixConfig:
27
+ speaker_1: Optional[str] = None
28
+ speaker_2: Optional[str] = None
29
+ include_audio: bool = False
30
+
31
+
32
+ @dataclass(frozen=True)
33
+ class GenerationConfig:
34
+ text: SamplingConfig = field(default_factory=_default_text_sampling)
35
+ audio: SamplingConfig = field(default_factory=_default_audio_sampling)
36
+ cfg_scale: float = 2.0
37
+ cfg_filter_k: int = 50
38
+ initial_padding: int = 2
39
+ prefix: Optional["PrefixConfig"] = None
40
+ use_cuda_graph: bool = False
41
+
42
+
43
+ @dataclass(frozen=True)
44
+ class GenerationResult:
45
+ audio_tokens: torch.Tensor
46
+ waveform: torch.Tensor
47
+ sample_rate: int
48
+ timestamps: List[Tuple[str, float]]
49
+
50
+
51
+ def normalize_script(script: str | Sequence[str]) -> str:
52
+ if isinstance(script, str):
53
+ return script.strip()
54
+ return "\n".join(line.strip() for line in script)
55
+
56
+
57
+ def load_script_text(path: str | Path) -> str:
58
+ if str(path) == "-":
59
+ return sys.stdin.read().strip()
60
+ path_obj = Path(path)
61
+ if path_obj.exists():
62
+ return path_obj.read_text().strip()
63
+ return str(path).strip()
64
+
65
+
66
+ def validate_generation_params(
67
+ *,
68
+ temperature: float,
69
+ top_k: int,
70
+ cfg_scale: float,
71
+ ) -> tuple[float, int, float]:
72
+ if temperature <= 0:
73
+ raise ValueError("temperature must be positive")
74
+ if top_k <= 0:
75
+ raise ValueError("top_k must be positive")
76
+ if cfg_scale <= 0:
77
+ raise ValueError("cfg_scale must be positive")
78
+ return temperature, top_k, cfg_scale
79
+
80
+
81
+ def build_generation_config(
82
+ *,
83
+ temperature: float,
84
+ top_k: int,
85
+ cfg_scale: float,
86
+ ) -> GenerationConfig:
87
+ sampling = SamplingConfig(temperature=temperature, top_k=top_k)
88
+ return GenerationConfig(
89
+ text=sampling,
90
+ audio=sampling,
91
+ cfg_scale=cfg_scale,
92
+ )
93
+
94
+
95
+ def merge_generation_config(
96
+ *,
97
+ base: GenerationConfig,
98
+ overrides: Mapping[str, object],
99
+ ) -> GenerationConfig:
100
+ clean_overrides = {k: v for k, v in overrides.items() if v is not None}
101
+ text_temp = clean_overrides.pop("temp_text", None)
102
+ text_topk = clean_overrides.pop("topk_text", None)
103
+ audio_temp = clean_overrides.pop("temp_audio", None)
104
+ audio_topk = clean_overrides.pop("topk_audio", None)
105
+ prefix_speaker_1 = clean_overrides.pop("prefix_speaker_1", None)
106
+ prefix_speaker_2 = clean_overrides.pop("prefix_speaker_2", None)
107
+ include_prefix = clean_overrides.pop("include_prefix", None)
108
+
109
+ text_sampling = base.text
110
+ if text_temp is not None or text_topk is not None:
111
+ text_sampling = SamplingConfig(
112
+ temperature=text_temp if text_temp is not None else text_sampling.temperature,
113
+ top_k=text_topk if text_topk is not None else text_sampling.top_k,
114
+ )
115
+
116
+ audio_sampling = base.audio
117
+ if audio_temp is not None or audio_topk is not None:
118
+ audio_sampling = SamplingConfig(
119
+ temperature=audio_temp if audio_temp is not None else audio_sampling.temperature,
120
+ top_k=audio_topk if audio_topk is not None else audio_sampling.top_k,
121
+ )
122
+
123
+ prefix_cfg = base.prefix
124
+ if (
125
+ prefix_speaker_1 is not None
126
+ or prefix_speaker_2 is not None
127
+ or include_prefix is not None
128
+ or prefix_cfg is not None
129
+ ):
130
+ prefix_cfg = prefix_cfg or PrefixConfig()
131
+ prefix_cfg = PrefixConfig(
132
+ speaker_1=prefix_speaker_1 if prefix_speaker_1 is not None else prefix_cfg.speaker_1,
133
+ speaker_2=prefix_speaker_2 if prefix_speaker_2 is not None else prefix_cfg.speaker_2,
134
+ include_audio=include_prefix if include_prefix is not None else prefix_cfg.include_audio,
135
+ )
136
+
137
+ return GenerationConfig(
138
+ text=text_sampling,
139
+ audio=audio_sampling,
140
+ cfg_scale=clean_overrides.pop("cfg_scale", base.cfg_scale),
141
+ cfg_filter_k=clean_overrides.pop("cfg_filter_k", base.cfg_filter_k),
142
+ initial_padding=clean_overrides.pop("initial_padding", base.initial_padding),
143
+ prefix=prefix_cfg,
144
+ use_cuda_graph=clean_overrides.pop("use_cuda_graph", base.use_cuda_graph),
145
+ )
146
+
147
+
148
+ __all__ = [
149
+ "SamplingConfig",
150
+ "GenerationConfig",
151
+ "GenerationResult",
152
+ "PrefixConfig",
153
+ "normalize_script",
154
+ "load_script_text",
155
+ "validate_generation_params",
156
+ "build_generation_config",
157
+ "merge_generation_config",
158
+ ]
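The helpers above compose a GenerationConfig from CLI-style overrides. A minimal sketch of how they fit together, assuming both functions are in scope; the override keys are the ones merge_generation_config recognises and the values are illustrative:

    base = build_generation_config(temperature=0.7, top_k=40, cfg_scale=2.0)
    cfg = merge_generation_config(
        base=base,
        overrides={
            "temp_text": 0.6,        # adjusts text sampling only
            "topk_audio": 64,        # adjusts audio sampling only
            "include_prefix": True,  # creates a PrefixConfig when none exists
            "cfg_scale": None,       # None overrides are dropped; base value kept
        },
    )
    assert cfg.text.temperature == 0.6 and cfg.audio.top_k == 64
    assert cfg.cfg_scale == 2.0 and cfg.prefix.include_audio is True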
runtime/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ from .state_machine import Entry, StateMachine, TokenIds
2
+
3
+ __all__ = [
4
+ "Entry",
5
+ "StateMachine",
6
+ "TokenIds",
7
+ ]
runtime/audio_io.py ADDED
@@ -0,0 +1,69 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Union
5
+
6
+ import numpy as np
7
+ import sphn
8
+ import torch
9
+ import torch.nn.functional as F
10
+
11
+ from ..audio import MimiCodec
12
+
13
+ PathLike = Union[str, Path]
14
+
15
+
16
+ def load_mono_audio(path: PathLike, target_sr: int) -> np.ndarray:
17
+ """Read an audio file, convert to mono float32, and resample to target_sr."""
18
+ path = str(path)
19
+ try:
20
+ audio, sr = sphn.read_wav(path)
21
+ except Exception:
22
+ import soundfile as sf # Local fallback
23
+
24
+ audio, sr = sf.read(path, dtype="float32", always_2d=False)
25
+ audio = np.asarray(audio, dtype=np.float32)
26
+ if audio.ndim == 2:
27
+ audio = audio.mean(axis=1)
28
+ if sr != target_sr:
29
+ if hasattr(sphn, "resample_audio"):
30
+ audio = sphn.resample_audio(audio, sr, target_sr).astype(np.float32)
31
+ else:
32
+ audio = _resample_linear(audio, sr, target_sr)
33
+ return audio
34
+
35
+
36
+ def audio_to_tensor(audio: np.ndarray, device: torch.device) -> torch.Tensor:
37
+ """Convert mono PCM samples into shape [1, 1, T] tensor."""
38
+ tensor = torch.from_numpy(audio).to(device)
39
+ if tensor.dim() == 1:
40
+ tensor = tensor.unsqueeze(0)
41
+ if tensor.dim() == 2:
42
+ tensor = tensor.unsqueeze(0)
43
+ return tensor
44
+
45
+
46
+ def encode_audio_tokens(mimi: MimiCodec, audio: np.ndarray) -> torch.Tensor:
47
+ """Encode PCM audio into Mimi codebook tokens [C, T]."""
48
+ waveform = audio_to_tensor(audio, mimi.device)
49
+ with torch.inference_mode():
50
+ codes, *_ = mimi.encode(waveform, return_dict=False)
51
+ if isinstance(codes, (tuple, list)):
52
+ codes = codes[0]
53
+ # Mimi.encode returns [B, num_codebooks, T]; select batch 0.
54
+ codes = codes[0].to(torch.long)
55
+ return codes
56
+
57
+
58
+ def _resample_linear(audio: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray:
59
+ if src_sr == dst_sr:
60
+ return audio.astype(np.float32)
61
+ length = audio.shape[0]
62
+ new_length = max(1, int(round(length * dst_sr / src_sr)))
63
+ tensor = torch.from_numpy(audio.astype(np.float32)).unsqueeze(0).unsqueeze(0)
64
+ with torch.no_grad():
65
+ resampled = F.interpolate(tensor, size=new_length, mode="linear", align_corners=False)
66
+ return resampled.squeeze(0).squeeze(0).cpu().numpy().astype(np.float32)
67
+
68
+
69
+ __all__ = ["load_mono_audio", "audio_to_tensor", "encode_audio_tokens"]
runtime/context.py ADDED
@@ -0,0 +1,138 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Optional
6
+ import warnings
7
+
8
+ import torch
9
+ from safetensors.torch import load_file
10
+ from transformers import AutoTokenizer, PreTrainedTokenizerBase
11
+
12
+ from ..config import DiaConfig, load_config
13
+ from ..core.model import Dia2Model
14
+ from ..core.precision import Precision, resolve_precision
15
+ from ..audio import MimiCodec, DEFAULT_MIMI_MODEL_ID
16
+ from .state_machine import StateMachine, TokenIds
17
+
18
+
19
+ @dataclass
20
+ class RuntimeContext:
21
+ config: DiaConfig
22
+ model: Dia2Model
23
+ precision: Precision
24
+ tokenizer: PreTrainedTokenizerBase
25
+ mimi: MimiCodec
26
+ device: torch.device
27
+ machine: StateMachine
28
+ transformer_step: callable
29
+ depformer_step: callable
30
+ constants: TokenIds
31
+ audio_delays: list[int]
32
+ audio_delay_tensor: torch.Tensor
33
+ frame_rate: float
34
+
35
+
36
+ def build_runtime(
37
+ *,
38
+ config_path: str | Path,
39
+ weights_path: str | Path,
40
+ tokenizer_id: Optional[str],
41
+ repo_id: Optional[str],
42
+ mimi_id: Optional[str],
43
+ device: str,
44
+ dtype_pref: str,
45
+ ) -> tuple[RuntimeContext, str, str]:
46
+ device_obj = torch.device(device)
47
+ if device_obj.type == "cuda":
48
+ cuda_matmul = torch.backends.cuda.matmul
49
+ if hasattr(cuda_matmul, "fp32_precision"):
50
+ cuda_matmul.fp32_precision = "tf32"
51
+ with warnings.catch_warnings():
52
+ warnings.filterwarnings(
53
+ "ignore",
54
+ message="Please use the new API settings",
55
+ )
56
+ torch.backends.cuda.matmul.allow_tf32 = True
57
+ else: # pragma: no cover - compatibility with older PyTorch
58
+ torch.backends.cuda.matmul.allow_tf32 = True
59
+
60
+ # Handle cuDNN conv TF32 settings (check if conv attribute exists first)
61
+ if hasattr(torch.backends.cudnn, "conv"):
62
+ cudnn_conv = torch.backends.cudnn.conv
63
+ if hasattr(cudnn_conv, "fp32_precision"):
64
+ cudnn_conv.fp32_precision = "tf32"
65
+ with warnings.catch_warnings():
66
+ warnings.filterwarnings(
67
+ "ignore",
68
+ message="Please use the new API settings",
69
+ )
70
+ torch.backends.cudnn.allow_tf32 = True
71
+ else:
72
+ torch.backends.cudnn.allow_tf32 = True
73
+ else:
74
+ # For older PyTorch versions without the conv attribute
75
+ torch.backends.cudnn.allow_tf32 = True
76
+ precision = resolve_precision(dtype_pref, device_obj)
77
+ config = load_config(config_path)
78
+ model = Dia2Model(config, precision)
79
+ state = load_file(str(weights_path))
80
+ model.load_state_dict(state)
81
+ model = model.to(device_obj)
82
+
83
+ tokenizer_ref = tokenizer_id or config.assets.tokenizer or repo_id
84
+ if tokenizer_ref is None:
85
+ raise ValueError("Tokenizer id is missing. Provide --tokenizer or add assets.tokenizer to the config.")
86
+ tokenizer = AutoTokenizer.from_pretrained(
87
+ tokenizer_ref,
88
+ use_fast=False,
89
+ trust_remote_code=True,
90
+ )
91
+
92
+ mimi_ref = mimi_id or config.assets.mimi or DEFAULT_MIMI_MODEL_ID
93
+ mimi = MimiCodec.from_pretrained(mimi_ref, device=device_obj)
94
+
95
+ data_cfg = config.data
96
+ constants = TokenIds(
97
+ card=data_cfg.text_vocab_size,
98
+ new_word=data_cfg.text_new_word_token_id,
99
+ pad=data_cfg.text_pad_token_id,
100
+ bos=getattr(tokenizer, "bos_token_id", 1) or 1,
101
+ zero=data_cfg.text_zero_token_id,
102
+ spk1=tokenizer.convert_tokens_to_ids("[S1]") if "[S1]" in tokenizer.get_vocab() else data_cfg.text_new_word_token_id,
103
+ spk2=tokenizer.convert_tokens_to_ids("[S2]") if "[S2]" in tokenizer.get_vocab() else data_cfg.text_new_word_token_id,
104
+ audio_pad=data_cfg.audio_pad_token_id,
105
+ audio_bos=data_cfg.audio_bos_token_id,
106
+ )
107
+ machine = StateMachine(
108
+ token_ids=constants,
109
+ second_stream_ahead=data_cfg.second_stream_ahead,
110
+ max_padding=6,
111
+ initial_padding=0,
112
+ )
113
+ audio_delays = list(data_cfg.delay_pattern)
114
+ audio_delay_tensor = torch.tensor(audio_delays, device=device_obj, dtype=torch.long) if audio_delays else torch.empty(0, dtype=torch.long, device=device_obj)
115
+ frame_rate = getattr(mimi, "frame_rate", 75.0)
116
+
117
+ runtime = RuntimeContext(
118
+ config=config,
119
+ precision=precision,
120
+ model=model,
121
+ tokenizer=tokenizer,
122
+ mimi=mimi,
123
+ device=device_obj,
124
+ machine=machine,
125
+ constants=constants,
126
+ audio_delays=audio_delays,
127
+ audio_delay_tensor=audio_delay_tensor,
128
+ frame_rate=frame_rate,
129
+ transformer_step=model.transformer.forward_step,
130
+ depformer_step=model.depformer.forward_step,
131
+ )
132
+ return runtime, tokenizer_ref, mimi_ref
133
+
134
+
135
+ __all__ = [
136
+ "RuntimeContext",
137
+ "build_runtime",
138
+ ]
runtime/generator.py ADDED
@@ -0,0 +1,420 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+
8
+ from ..core.cache import KVCache
9
+ from ..core.model import DecodeState
10
+ from ..generation import GenerationConfig
11
+ from ..audio.grid import delay_frames, mask_audio_logits, undelay_frames
12
+ from .context import RuntimeContext
13
+ from .state_machine import State, TokenIds
14
+ from .guidance import apply_classifier_guidance, sample_audio_logits
15
+ from .sampler import sample_token
16
+ from .voice_clone import PrefixPlan
17
+ from .logger import RuntimeLogger
18
+
19
+ _GRAPH_CUBLAS_READY = False
20
+
21
+
22
+ def _ensure_graph_cublas_ready(device: torch.device) -> None:
23
+ global _GRAPH_CUBLAS_READY
24
+ if _GRAPH_CUBLAS_READY or device.type != "cuda":
25
+ return
26
+ tmp = torch.empty((1, 1), device=device, dtype=torch.float32)
27
+ torch.matmul(tmp, tmp)
28
+ torch.cuda.synchronize()
29
+ _GRAPH_CUBLAS_READY = True
30
+ @dataclass
31
+ class GenerationState:
32
+ decode: DecodeState
33
+ step_tokens: torch.Tensor
34
+ audio_buf: torch.Tensor
35
+
36
+ def trim_audio(self, limit: int, pad_token: int, ungenerated: int) -> torch.Tensor:
37
+ trimmed = self.audio_buf[:, :, :limit]
38
+ pad = torch.full_like(trimmed, pad_token)
39
+ trimmed = torch.where(trimmed == ungenerated, pad, trimmed)
40
+ self.audio_buf = trimmed
41
+ return trimmed
42
+
43
+ @property
44
+ def transformer_cache(self) -> KVCache:
45
+ return self.decode.transformer
46
+
47
+ @transformer_cache.setter
48
+ def transformer_cache(self, cache: KVCache) -> None:
49
+ self.decode.transformer = cache
50
+
51
+ @property
52
+ def depformer_cache(self) -> KVCache:
53
+ return self.decode.depformer
54
+
55
+ @depformer_cache.setter
56
+ def depformer_cache(self, cache: KVCache) -> None:
57
+ self.decode.depformer = cache
58
+
59
+ def reset_dep_cache(self) -> None:
60
+ self.decode.depformer.reset()
61
+
62
+
63
+ @dataclass
64
+ class NetworkBuffers:
65
+ text: torch.Tensor
66
+ cb0: torch.Tensor
67
+ dep: list[torch.Tensor]
68
+
69
+
70
+ def _allocate_network_buffers(runtime: RuntimeContext, branches: int) -> NetworkBuffers:
71
+ device = runtime.device
72
+ logits_dtype = runtime.precision.logits
73
+ data_cfg = runtime.config.data
74
+ text_logits = torch.empty((branches, 1, data_cfg.action_vocab_size), dtype=logits_dtype, device=device)
75
+ cb0_logits = torch.empty((branches, 1, data_cfg.audio_vocab_size), dtype=logits_dtype, device=device)
76
+ dep_vocab = runtime.model.depformer.audio_vocab_limit or data_cfg.audio_vocab_size
77
+ dep_logits = [
78
+ torch.empty((branches, 1, 1, dep_vocab), dtype=logits_dtype, device=device)
79
+ for _ in range(runtime.model.depformer.num_depth)
80
+ ]
81
+ return NetworkBuffers(text=text_logits, cb0=cb0_logits, dep=dep_logits)
82
+
83
+
84
+ def build_initial_state(
85
+ runtime: RuntimeContext,
86
+ *,
87
+ prefix: PrefixPlan | None = None,
88
+ ) -> GenerationState:
89
+ dep_q = runtime.model.depformer.num_audio_channels
90
+ channels = 2 + dep_q
91
+ branches = 2
92
+ token_ids = runtime.constants
93
+ step_tokens = torch.full(
94
+ (branches, channels, 1),
95
+ token_ids.pad,
96
+ dtype=torch.long,
97
+ device=runtime.device,
98
+ )
99
+ step_tokens[0, 0, 0] = token_ids.bos
100
+ step_tokens[0, 1, 0] = token_ids.pad
101
+ step_tokens[1, 0, 0] = token_ids.zero
102
+ step_tokens[1, 1, 0] = token_ids.pad
103
+ prefix_len = 0
104
+ if prefix is not None:
105
+ delayed = delay_frames(prefix.aligned_tokens, runtime.audio_delays, token_ids.audio_pad)
106
+ prefix_len = delayed.shape[1]
107
+ limit = runtime.config.runtime.max_context_steps
108
+ total_steps = max(limit + prefix_len + 1, limit)
109
+ decode_state = runtime.model.init_state(branches, runtime.device, total_steps)
110
+ audio_buf = torch.full(
111
+ (branches, dep_q, total_steps),
112
+ token_ids.ungenerated,
113
+ dtype=torch.long,
114
+ device=runtime.device,
115
+ )
116
+ if prefix is not None:
117
+ delayed = delay_frames(prefix.aligned_tokens, runtime.audio_delays, token_ids.audio_pad).to(runtime.device)
118
+ audio_buf[0, :, : delayed.shape[1]] = delayed
119
+ if branches > 1:
120
+ audio_buf[1:, :, : delayed.shape[1]] = delayed
121
+ return GenerationState(decode_state, step_tokens, audio_buf)
122
+
123
+
124
+ def _fill_audio_channels(
125
+ step_tokens: torch.Tensor,
126
+ audio_buf: torch.Tensor,
127
+ delays: torch.Tensor,
128
+ step: int,
129
+ bos_token: int,
130
+ ) -> None:
131
+ channels = delays.numel()
132
+ if channels == 0:
133
+ return
134
+ target = step_tokens[:, 2 : 2 + channels, 0]
135
+ if step < audio_buf.shape[-1]:
136
+ target.copy_(audio_buf[:, :channels, step])
137
+ else:
138
+ target.fill_(bos_token)
139
+ mask = delays > step
140
+ if mask.any().item():
141
+ target[:, mask] = bos_token
142
+
143
+
144
+ def _execute_transformer_step(
145
+ step_tokens: torch.Tensor,
146
+ positions_view: torch.Tensor,
147
+ generation: GenerationState,
148
+ transformer_step,
149
+ buffers: NetworkBuffers,
150
+ ) -> torch.Tensor:
151
+ hidden_t, text_logits_t, cb0_logits_t, present = transformer_step(
152
+ step_tokens,
153
+ positions_view,
154
+ generation.transformer_cache,
155
+ )
156
+ buffers.text.copy_(text_logits_t)
157
+ buffers.cb0.copy_(cb0_logits_t)
158
+ generation.transformer_cache = present
159
+ return hidden_t
160
+
161
+
162
+ def _execute_depformer_stage(
163
+ stage_index: int,
164
+ prev_audio: torch.Tensor,
165
+ hidden_t: torch.Tensor,
166
+ generation: GenerationState,
167
+ depformer_step,
168
+ main_tokens: Optional[torch.Tensor],
169
+ second_tokens: Optional[torch.Tensor],
170
+ buffers: NetworkBuffers,
171
+ ) -> None:
172
+ logits_stage, dep_present = depformer_step(
173
+ prev_audio=prev_audio,
174
+ transformer_out=hidden_t,
175
+ stage_index=stage_index,
176
+ cache=generation.depformer_cache,
177
+ main_text=main_tokens if stage_index == 0 else None,
178
+ second_text=second_tokens if stage_index == 0 else None,
179
+ )
180
+ target = buffers.dep[stage_index]
181
+ if logits_stage.shape != target.shape:
182
+ raise RuntimeError(
183
+ f"depformer logits shape mismatch: {logits_stage.shape} vs {target.shape}"
184
+ )
185
+ target.copy_(logits_stage)
186
+ generation.depformer_cache = dep_present
187
+
188
+
189
+
190
+
191
+ def run_generation_loop(
192
+ runtime: RuntimeContext,
193
+ *,
194
+ state: State,
195
+ generation: GenerationState,
196
+ config: GenerationConfig,
197
+ start_step: int = 0,
198
+ logger: RuntimeLogger | None = None,
199
+ ) -> tuple[Optional[int], torch.Tensor]:
200
+ step_tokens = generation.step_tokens
201
+ audio_buf = generation.audio_buf
202
+ branches = step_tokens.shape[0]
203
+ max_context = runtime.config.runtime.max_context_steps
204
+ if max_context <= 0:
205
+ raise ValueError("Runtime configuration must specify a positive max_context_steps")
206
+ positions = torch.empty(1, 1, dtype=torch.long, device=runtime.device)
207
+ main_tokens = torch.empty(branches, dtype=torch.long, device=runtime.device)
208
+ aux_tokens = torch.empty(branches, dtype=torch.long, device=runtime.device)
209
+ cfg_active = config.cfg_scale != 1.0
210
+ token_ids = runtime.constants
211
+ delay_tensor = runtime.audio_delay_tensor
212
+ max_delay = int(delay_tensor.max().item()) if delay_tensor.numel() else 0
213
+ flush_tail = max_delay + getattr(runtime.machine, "max_padding", 0)
214
+ first_word_frame: Optional[int] = None
215
+ eos_cutoff: Optional[int] = None
216
+ last_step = start_step - 1
217
+ use_graph = bool(config.use_cuda_graph and runtime.device.type == "cuda")
218
+ transformer_step = runtime.transformer_step
219
+ depformer_step = runtime.depformer_step
220
+ buffers = _allocate_network_buffers(runtime, branches)
221
+ positions_view = positions.expand(branches, -1)
222
+ transformer_capture = None
223
+ dep_captures: list[dict] | None = None
224
+ if use_graph:
225
+ _ensure_graph_cublas_ready(runtime.device)
226
+ processed_steps = 0
227
+ report_interval = 12
228
+ with torch.inference_mode():
229
+ for offset in range(max_context):
230
+ t = start_step + offset
231
+ if eos_cutoff is not None and t >= eos_cutoff:
232
+ break
233
+ if t + 1 >= audio_buf.shape[-1]:
234
+ break
235
+ generation.reset_dep_cache()
236
+ positions.fill_(t)
237
+ _fill_audio_channels(step_tokens, audio_buf, delay_tensor, t, token_ids.audio_bos)
238
+ if branches > 1:
239
+ step_tokens[1:, 0, 0] = token_ids.zero
240
+ step_tokens[1:, 1, 0] = token_ids.pad
241
+ if use_graph:
242
+ if transformer_capture is None:
243
+ torch.cuda.synchronize()
244
+ graph = torch.cuda.CUDAGraph()
245
+ with torch.cuda.graph(graph):
246
+ hidden_ref = _execute_transformer_step(
247
+ step_tokens,
248
+ positions_view,
249
+ generation,
250
+ transformer_step,
251
+ buffers,
252
+ )
253
+ transformer_capture = (graph, hidden_ref)
254
+ if runtime.model.depformer.num_depth > 0:
255
+ dep_captures = []
256
+ for idx in range(runtime.model.depformer.num_depth):
257
+ capture = {
258
+ "graph": torch.cuda.CUDAGraph(),
259
+ "captured": False,
260
+ "prev_audio": torch.empty((branches,), dtype=torch.long, device=runtime.device),
261
+ "main_tokens": torch.empty((branches,), dtype=torch.long, device=runtime.device) if idx == 0 else None,
262
+ "second_tokens": torch.empty((branches,), dtype=torch.long, device=runtime.device) if idx == 0 else None,
263
+ }
264
+ dep_captures.append(capture)
265
+ else:
266
+ transformer_capture[0].replay()
267
+ hidden_t = transformer_capture[1]
268
+ else:
269
+ hidden_t = _execute_transformer_step(
270
+ step_tokens,
271
+ positions_view,
272
+ generation,
273
+ transformer_step,
274
+ buffers,
275
+ )
276
+
277
+ guided_text = apply_classifier_guidance(buffers.text, cfg_active, config.cfg_scale, config.cfg_filter_k)
278
+ if guided_text.shape[0] > 1:
279
+ guided_text = guided_text[:1]
280
+ text_token = sample_token(
281
+ guided_text,
282
+ temp=config.text.temperature,
283
+ top_k=config.text.top_k,
284
+ ).item()
285
+
286
+ main_token, aux_token, _ = runtime.machine.process(t, state, text_token)
287
+ second_token = aux_token if aux_token != -1 else token_ids.pad
288
+ if first_word_frame is None and main_token == token_ids.new_word:
289
+ first_word_frame = t - config.initial_padding
290
+ step_tokens[:, 0, 0] = main_token
291
+ step_tokens[:, 1, 0] = second_token
292
+
293
+ guided_cb0 = apply_classifier_guidance(buffers.cb0, cfg_active, config.cfg_scale, config.cfg_filter_k)
294
+ if guided_cb0.shape[0] > 1:
295
+ guided_cb0 = guided_cb0[:1]
296
+ masked_cb0 = mask_audio_logits(guided_cb0, token_ids.audio_pad, token_ids.audio_bos)
297
+ codebook_token = sample_audio_logits(masked_cb0, config.audio.temperature, config.audio.top_k)
298
+ audio_buf[:, 0, t + 1] = codebook_token
299
+
300
+ prev_audio = codebook_token.expand(branches)
301
+ main_tokens.fill_(main_token)
302
+ aux_tokens.fill_(second_token)
303
+ for stage in range(runtime.model.depformer.num_depth):
304
+ if use_graph and dep_captures is not None:
305
+ capture = dep_captures[stage]
306
+ capture["prev_audio"].copy_(prev_audio)
307
+ if capture["main_tokens"] is not None and stage == 0:
308
+ capture["main_tokens"].copy_(main_tokens)
309
+ capture["second_tokens"].copy_(aux_tokens)
310
+ if not capture["captured"]:
311
+ torch.cuda.synchronize()
312
+ with torch.cuda.graph(capture["graph"]):
313
+ _execute_depformer_stage(
314
+ stage_index=stage,
315
+ prev_audio=capture["prev_audio"],
316
+ hidden_t=hidden_t,
317
+ generation=generation,
318
+ depformer_step=depformer_step,
319
+ main_tokens=capture["main_tokens"],
320
+ second_tokens=capture["second_tokens"],
321
+ buffers=buffers,
322
+ )
323
+ capture["captured"] = True
324
+ else:
325
+ capture["graph"].replay()
326
+ else:
327
+ _execute_depformer_stage(
328
+ stage_index=stage,
329
+ prev_audio=prev_audio,
330
+ hidden_t=hidden_t,
331
+ generation=generation,
332
+ depformer_step=depformer_step,
333
+ main_tokens=main_tokens,
334
+ second_tokens=aux_tokens,
335
+ buffers=buffers,
336
+ )
337
+ dep_logits = apply_classifier_guidance(buffers.dep[stage], cfg_active, config.cfg_scale, config.cfg_filter_k)
338
+ if dep_logits.shape[0] > 1:
339
+ dep_logits = dep_logits[:1]
340
+ stage_token = sample_audio_logits(
341
+ dep_logits,
342
+ config.audio.temperature,
343
+ config.audio.top_k,
344
+ )
345
+ audio_buf[:, stage + 1, t + 1] = stage_token
346
+ prev_audio = stage_token.expand(branches)
347
+ last_step = t
348
+ if eos_cutoff is None and state.end_step is not None:
349
+ eos_cutoff = state.end_step + flush_tail
350
+ processed_steps = offset + 1
351
+ if logger and processed_steps % report_interval == 0:
352
+ logger.progress(processed_steps, max_context)
353
+
354
+ if logger and processed_steps and processed_steps % report_interval != 0:
355
+ logger.progress(processed_steps, max_context)
356
+
357
+ if first_word_frame is None:
358
+ first_word_frame = start_step
359
+ if last_step < start_step:
360
+ limit = min(start_step + 1, audio_buf.shape[-1])
361
+ else:
362
+ limit = min(last_step + 2, audio_buf.shape[-1])
363
+ trimmed = generation.trim_audio(limit, token_ids.audio_pad, token_ids.ungenerated)
364
+ return first_word_frame, trimmed
365
+
366
+
367
+ def decode_audio(runtime: RuntimeContext, tokens: torch.Tensor) -> torch.Tensor:
368
+ if tokens.shape[-1] == 0:
369
+ return torch.zeros(0, device=runtime.device)
370
+ with torch.inference_mode():
371
+ pcm = runtime.mimi.decode(tokens.to(runtime.device))
372
+ return pcm[0, 0]
373
+
374
+ def warmup_with_prefix(
375
+ runtime: RuntimeContext,
376
+ plan: PrefixPlan,
377
+ state: State,
378
+ generation: GenerationState,
379
+ ) -> int:
380
+ step_tokens = generation.step_tokens
381
+ model_state = generation.decode
382
+ branches = step_tokens.shape[0]
383
+ device = runtime.device
384
+ tokens = plan.aligned_tokens.to(device)
385
+ new_word_steps = set(plan.new_word_steps)
386
+ positions = torch.empty(1, 1, dtype=torch.long, device=device)
387
+
388
+ with torch.inference_mode():
389
+ for t in range(plan.aligned_frames):
390
+ positions.fill_(t)
391
+ channels = tokens.shape[0]
392
+ for cb in range(channels):
393
+ delay = runtime.audio_delays[cb] if cb < len(runtime.audio_delays) else 0
394
+ idx = t - delay
395
+ value = tokens[cb, idx] if idx >= 0 else runtime.constants.audio_bos
396
+ step_tokens[:, 2 + cb, 0] = value
397
+ hidden, text_logits, cb0_logits, present = runtime.model.transformer.forward_step(
398
+ step_tokens,
399
+ positions.expand(branches, -1),
400
+ model_state.transformer,
401
+ )
402
+ model_state.transformer = present
403
+
404
+ forced = runtime.constants.new_word if t in new_word_steps else runtime.constants.pad
405
+ main_token, aux_token, _ = runtime.machine.process(t, state, forced, is_forced=True)
406
+ second_token = runtime.constants.pad if aux_token == -1 else aux_token
407
+ step_tokens[0, 0, 0] = main_token
408
+ step_tokens[0, 1, 0] = second_token
409
+ if branches > 1:
410
+ step_tokens[1:, 0, 0] = runtime.constants.zero
411
+ step_tokens[1:, 1, 0] = runtime.constants.pad
412
+
413
+ return max(plan.aligned_frames - 1, 0)
+
+
414
+ __all__ = [
415
+ "build_initial_state",
416
+ "run_generation_loop",
417
+ "decode_audio",
418
+ "warmup_with_prefix",
419
+ "GenerationState",
420
+ ]
runtime/guidance.py ADDED
@@ -0,0 +1,38 @@
1
+ from __future__ import annotations
2
+
3
+ import torch
4
+
5
+ from .sampler import sample_token
6
+
7
+
8
+ def apply_classifier_guidance(
9
+ logits: torch.Tensor,
10
+ cfg_active: bool,
11
+ scale: float,
12
+ top_k: int,
13
+ ) -> torch.Tensor:
14
+ if not cfg_active:
15
+ return logits
16
+ conditional = logits[0:1]
17
+ unconditional = logits[1:2]
18
+ cond32 = conditional.to(torch.float32)
19
+ uncond32 = unconditional.to(torch.float32)
20
+ guided = torch.lerp(uncond32, cond32, scale)
21
+ if top_k > 0 and guided.shape[-1] > 0:
22
+ k = min(top_k, guided.shape[-1])
23
+ threshold = torch.topk(guided, k=k, dim=-1, sorted=True).values[..., -1:]
24
+ mask = guided >= threshold
25
+ neg_inf = torch.full_like(cond32, float("-inf"))
26
+ cond32 = torch.where(mask, cond32, neg_inf)
27
+ return cond32.to(conditional.dtype)
28
+
29
+
30
+ def sample_audio_logits(logits: torch.Tensor, temp: float, top_k: int) -> torch.Tensor:
31
+ """Sample a single audio token (shape [1]) from logits."""
32
+ return (
33
+ sample_token(
34
+ logits,
35
+ temp=temp,
36
+ top_k=top_k,
37
+ ).view(1)
38
+ )
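A small sketch of the guidance helper on a two-branch batch (row 0 conditional, row 1 unconditional); the logits are made up and the shapes mirror what the generation loop passes in:

    import torch

    logits = torch.tensor([[[2.0, 0.0, -1.0, 0.5]],   # conditional branch
                           [[1.0, 0.0,  1.0, 0.0]]])  # unconditional branch
    guided = apply_classifier_guidance(logits, cfg_active=True, scale=2.0, top_k=2)
    # guided keeps only the conditional row: the lerp uncond + 2.0 * (cond - uncond)
    # picks the top-2 positions, and everything else is masked to -inf.
    token = sample_audio_logits(guided, temp=0.8, top_k=2)
    print(guided.shape, token.shape)                  # torch.Size([1, 1, 4]) torch.Size([1])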
runtime/logger.py ADDED
@@ -0,0 +1,33 @@
1
+ from __future__ import annotations
2
+
3
+ import time
+ from typing import Optional
+
+
4
+ class RuntimeLogger:
5
+ def __init__(self, enabled: bool) -> None:
6
+ self.enabled = enabled
7
+ self.start_time = time.perf_counter()
8
+ self.last_time = self.start_time
9
+ self.last_step = 0
10
+
11
+ def event(self, message: str) -> None:
12
+ if self.enabled:
13
+ print(f"[dia2] {message}")
14
+
15
+ def progress(self, step: int, total: Optional[int] = None) -> None:
16
+ if not self.enabled:
17
+ return
18
+ now = time.perf_counter()
19
+ delta_t = max(now - self.last_time, 1e-6)
20
+ delta_steps = max(step - self.last_step, 1)
21
+ speed = delta_steps / delta_t
22
+ if total is None:
23
+ self.event(f"step {step} :: {speed:.1f} toks/s")
24
+ else:
25
+ self.event(f"step {step}/{total} :: {speed:.1f} toks/s")
26
+ self.last_time = now
27
+ self.last_step = step
28
+
29
+ def elapsed(self) -> float:
30
+ return time.perf_counter() - self.start_time
31
+
32
+
33
+ __all__ = ["RuntimeLogger"]
runtime/sampler.py ADDED
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ import torch
4
+
5
+
6
+ def sample_token(
7
+ logits: torch.Tensor,
8
+ *,
9
+ temp: float,
10
+ top_k: int = 0,
11
+ ) -> torch.Tensor:
12
+ logits32 = logits.to(torch.float32)
13
+ if temp <= 0.0:
14
+ return torch.argmax(logits32, dim=-1, keepdim=True)
15
+ probs = torch.softmax(logits32 / max(temp, 1e-6), dim=-1)
16
+ probs = torch.nan_to_num(probs, nan=0.0, posinf=0.0, neginf=0.0)
17
+ probs = torch.clamp_min(probs, 0.0)
18
+ flat = probs.reshape(-1, probs.shape[-1])
19
+ norm = flat.sum(dim=-1, keepdim=True)
20
+ zero_mask = norm <= 0
21
+ norm = norm.clamp_min(1e-12)
22
+ flat = flat / norm
23
+ if zero_mask.any():
24
+ filler = torch.zeros_like(flat)
25
+ filler[..., 0] = 1.0
26
+ mask = zero_mask.expand_as(flat)
27
+ flat = torch.where(mask, filler, flat)
28
+ vocab = flat.shape[-1]
29
+ if top_k > 0 and top_k < vocab:
30
+ topv, indices = torch.topk(flat, top_k, dim=-1)
31
+ topv = topv / topv.sum(dim=-1, keepdim=True).clamp_min(1e-12)
32
+ draws = torch.multinomial(topv, num_samples=1)
33
+ picks = torch.gather(indices, dim=-1, index=draws)
34
+ else:
35
+ picks = torch.multinomial(flat, num_samples=1)
36
+ picks = picks.reshape(*probs.shape[:-1], 1)
37
+ return picks
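For reference, a toy call showing the shapes sample_token expects and returns; the logits are random and the vocabulary size is arbitrary:

    import torch

    logits = torch.randn(1, 1, 2048)                    # [branches, steps, vocab]
    greedy = sample_token(logits, temp=0.0, top_k=0)    # temp <= 0 falls back to argmax
    sampled = sample_token(logits, temp=0.8, top_k=50)  # renormalised top-k multinomial
    print(greedy.shape, sampled.shape)                  # both torch.Size([1, 1, 1])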
runtime/script_parser.py ADDED
@@ -0,0 +1,69 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import List, Optional, Sequence
5
+
6
+ from .state_machine import Entry
7
+
8
+
9
+ def parse_script(
10
+ script: Sequence[str],
11
+ tokenizer,
12
+ constants,
13
+ frame_rate: float,
14
+ ) -> List[Entry]:
15
+ entries: List[Entry] = []
16
+ speaker_tokens = [constants.spk1, constants.spk2]
17
+ padding_between = 1
18
+ event_re = re.compile(r"(?:<break\s+time=\"([0-9]+(?:\.[0-9]*)?)s\"\s*/?>)|(?:\s+)")
19
+ last_speaker_idx = [None]
20
+
21
+ def add_entry(idx: int, word: str, *, pending: Optional[int], first_content: List[bool]):
22
+ tokens: List[int]
23
+ if pending is not None:
24
+ prefix = "[S1]" if pending == constants.spk1 else "[S2]"
25
+ tokens = tokenizer.encode(f"{prefix} {word}", add_special_tokens=False)
26
+ else:
27
+ tokens = tokenizer.encode(word, add_special_tokens=False)
28
+ if first_content[0]:
29
+ if speaker_tokens:
30
+ speaker_idx = idx % len(speaker_tokens)
31
+ speaker_token = speaker_tokens[speaker_idx]
32
+ if speaker_token is not None and last_speaker_idx[0] != speaker_idx:
33
+ if not tokens or tokens[0] != speaker_token:
34
+ tokens.insert(0, speaker_token)
35
+ last_speaker_idx[0] = speaker_idx
36
+ first_content[0] = False
37
+ padding = max(0, padding_between + len(tokens) - 1)
38
+ entries.append(Entry(tokens=tokens, text=word, padding=padding))
39
+
40
+ for idx, line in enumerate(script):
41
+ normalized = line.replace("’", "'").replace(":", " ")
42
+ remaining = normalized
43
+ first_content = [True]
44
+ pending_speaker: Optional[int] = None
45
+ while remaining:
46
+ match = event_re.search(remaining)
47
+ if match is None:
48
+ segment = remaining
49
+ remaining = ""
50
+ else:
51
+ segment = remaining[: match.start()]
52
+ remaining = remaining[match.end() :]
53
+ if segment:
54
+ for raw_word in segment.split():
55
+ if raw_word in ("[S1]", "[S2]"):
56
+ pending_speaker = (
57
+ constants.spk1 if raw_word == "[S1]" else constants.spk2
58
+ )
59
+ continue
60
+ add_entry(idx, raw_word, pending=pending_speaker, first_content=first_content)
61
+ pending_speaker = None
62
+ if match and match.group(1):
63
+ seconds = float(match.group(1))
64
+ padding = int(round(seconds * frame_rate))
65
+ if padding > 0:
66
+ entries.append(Entry(tokens=[], text="", padding=padding))
67
+ if remaining:
68
+ continue
69
+ return entries
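A sketch of parse_script on a tiny two-speaker line with a break tag. The tokenizer stub and the spk1/spk2 ids are placeholders, not the real assets:

    from types import SimpleNamespace

    class StubTokenizer:
        def encode(self, text, add_special_tokens=False):
            return [len(text)]       # fake: one token per word

    consts = SimpleNamespace(spk1=3995, spk2=3996)
    script = ['[S1] Hello there. <break time="0.5s"/> [S2] Hi!']
    entries = parse_script(script, StubTokenizer(), consts, frame_rate=12.5)
    for e in entries:
        print(repr(e.text), e.tokens, e.padding)   # the break shows up as an empty entry with padding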
runtime/state_machine.py ADDED
@@ -0,0 +1,170 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import deque
4
+ from dataclasses import dataclass, field
5
+ from typing import Deque, Iterable, List, Sequence, Tuple
6
+
7
+
8
+ @dataclass
9
+ class TokenIds:
10
+ card: int
11
+ new_word: int
12
+ pad: int
13
+ bos: int
14
+ zero: int
15
+ spk1: int
16
+ spk2: int
17
+ audio_pad: int
18
+ audio_bos: int
19
+ ungenerated: int = -2
20
+
21
+
22
+ @dataclass
23
+ class Entry:
24
+ tokens: List[int]
25
+ text: str
26
+ padding: int = 0
27
+
28
+
29
+ @dataclass
30
+ class State:
31
+ entries: Deque[Entry]
32
+ padding_budget: int
33
+ forced_padding: int
34
+ pending_tokens: Deque[int] = field(default_factory=deque)
35
+ lookahead_tokens: Deque[int] = field(default_factory=deque)
36
+ end_step: int | None = None
37
+ consumption_times: List[int] = field(default_factory=list)
38
+ transcript: List[Tuple[str, int]] = field(default_factory=list)
39
+
40
+ def peek_tokens(self, count: int) -> List[int]:
41
+ """Return tokens from upcoming entries (used for second-stream lookahead)."""
42
+ assert count > 0
43
+ for entry in self.entries:
44
+ if entry.tokens:
45
+ count -= 1
46
+ if count == 0:
47
+ return entry.tokens
48
+ return []
49
+
50
+
51
+ class StateMachine:
52
+ def __init__(
53
+ self,
54
+ token_ids: TokenIds,
55
+ *,
56
+ second_stream_ahead: int = 0,
57
+ max_padding: int = 6,
58
+ initial_padding: int = 0,
59
+ ) -> None:
60
+ self.token_ids = token_ids
61
+ self.second_stream_ahead = second_stream_ahead
62
+ self.max_padding = max_padding
63
+ self.initial_padding = initial_padding
64
+
65
+ def new_state(self, entries: Iterable[Entry]) -> State:
66
+ return State(
67
+ entries=deque(entries),
68
+ padding_budget=self.initial_padding,
69
+ forced_padding=self.initial_padding,
70
+ )
71
+
72
+ def process(
73
+ self,
74
+ step: int,
75
+ state: State,
76
+ token: int,
77
+ is_forced: bool = False,
78
+ ) -> Tuple[int, int, bool]:
79
+ token = self._sanitize_token(token)
80
+ token = self._enforce_token_constraints(state, token, is_forced)
81
+ token, consumed_new_word = self._handle_new_word(step, state, token)
82
+ output_token = self._select_output_token(state, token)
83
+ final_main, final_second = self._maybe_multiplex_second_stream(
84
+ state, output_token
85
+ )
86
+ return final_main, final_second, consumed_new_word
87
+
88
+ def _sanitize_token(self, token: int) -> int:
89
+ if token == 1:
90
+ token = self.token_ids.new_word
91
+ elif token == 0:
92
+ token = self.token_ids.pad
93
+ if token not in (self.token_ids.new_word, self.token_ids.pad):
94
+ return self.token_ids.pad
95
+ return token
96
+
97
+ def _enforce_token_constraints(
98
+ self, state: State, token: int, is_forced: bool
99
+ ) -> int:
100
+ if state.pending_tokens:
101
+ return self.token_ids.pad
102
+ if is_forced:
103
+ return token
104
+ if state.forced_padding > 0:
105
+ if token != self.token_ids.pad:
106
+ token = self.token_ids.pad
107
+ return token
108
+ if state.padding_budget <= 0 and token != self.token_ids.new_word:
109
+ return self.token_ids.new_word
110
+ return token
111
+
112
+ def _handle_new_word(
113
+ self, step: int, state: State, token: int
114
+ ) -> Tuple[int, bool]:
115
+ if token != self.token_ids.new_word:
116
+ return token, False
117
+ if state.entries:
118
+ entry = state.entries.popleft()
119
+ state.consumption_times.append(step)
120
+ if entry.tokens:
121
+ state.transcript.append((entry.text, step))
122
+ state.pending_tokens.extend(entry.tokens)
123
+ if self.second_stream_ahead:
124
+ state.lookahead_tokens.extend(
125
+ state.peek_tokens(self.second_stream_ahead)
126
+ )
127
+ state.padding_budget = self.max_padding
128
+ else:
129
+ token = self.token_ids.pad
130
+ state.forced_padding = entry.padding
131
+ return token, True
132
+ token = self.token_ids.pad
133
+ if self.second_stream_ahead and state.end_step is None:
134
+ token = self.token_ids.new_word
135
+ if state.end_step is None:
136
+ state.end_step = step
137
+ return token, False
138
+
139
+ def _select_output_token(self, state: State, token: int) -> int:
140
+ if token == self.token_ids.pad:
141
+ if state.padding_budget > 0:
142
+ state.padding_budget -= 1
143
+ if state.forced_padding > 0:
144
+ state.forced_padding -= 1
145
+ if state.pending_tokens:
146
+ return state.pending_tokens.popleft()
147
+ return self.token_ids.pad
148
+ if token == self.token_ids.new_word:
149
+ return self.token_ids.new_word
150
+ if token == self.token_ids.zero:
151
+ return token
152
+ raise RuntimeError(f"Invalid token {token}")
153
+
154
+ def _maybe_multiplex_second_stream(
155
+ self, state: State, output: int
156
+ ) -> Tuple[int, int]:
157
+ if not self.second_stream_ahead:
158
+ return output, output
159
+ second = -1
160
+ if output == self.token_ids.new_word:
161
+ second = self.token_ids.new_word
162
+ if state.pending_tokens:
163
+ output = state.pending_tokens.popleft()
164
+ else:
165
+ output = self.token_ids.pad
166
+ elif state.lookahead_tokens:
167
+ second = state.lookahead_tokens.popleft()
168
+ else:
169
+ second = self.token_ids.pad
170
+ return output, second
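A minimal sketch of the state machine consuming a two-word script. The TokenIds values are placeholders (not the real vocabulary); the loop always proposes a new word and lets the machine decide what to emit:

    ids = TokenIds(card=4000, new_word=3998, pad=3999, bos=1, zero=3997,
                   spk1=3995, spk2=3996, audio_pad=2048, audio_bos=2049)
    machine = StateMachine(ids, second_stream_ahead=0, max_padding=6, initial_padding=0)
    state = machine.new_state([
        Entry(tokens=[101, 102], text="hello", padding=1),
        Entry(tokens=[103], text="world", padding=1),
    ])
    for step in range(8):
        main, second, consumed = machine.process(step, state, ids.new_word)
        print(step, main, second, consumed)
    print(state.transcript, state.end_step)        # word/step pairs, then the step where the script ended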
runtime/voice_clone.py ADDED
@@ -0,0 +1,190 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Callable, List, Optional, Sequence, TYPE_CHECKING
5
+
6
+ import numpy as np
7
+ import torch
8
+
9
+ from ..generation import PrefixConfig
10
+ from .audio_io import encode_audio_tokens, load_mono_audio
11
+ from .state_machine import Entry
12
+
13
+ if TYPE_CHECKING: # pragma: no cover
14
+ from .context import RuntimeContext
15
+
16
+
17
+ @dataclass
18
+ class WhisperWord:
19
+ text: str
20
+ start: float
21
+ end: float
22
+
23
+
24
+ @dataclass
25
+ class PrefixPlan:
26
+ entries: List[Entry]
27
+ new_word_steps: List[int]
28
+ aligned_tokens: torch.Tensor
29
+ aligned_frames: int
30
+
31
+
32
+ def build_prefix_plan(
33
+ runtime: "RuntimeContext",
34
+ prefix: Optional[PrefixConfig],
35
+ *,
36
+ transcribe_fn: Optional[Callable[[str, torch.device], List[WhisperWord]]] = None,
37
+ load_audio_fn: Optional[Callable[[str, int], np.ndarray]] = None,
38
+ encode_fn: Optional[Callable[[np.ndarray], torch.Tensor]] = None,
39
+ ) -> Optional[PrefixPlan]:
40
+ if prefix is None:
41
+ return None
42
+ if not prefix.speaker_1:
43
+ if prefix.speaker_2:
44
+ raise ValueError("speaker_2 requires speaker_1 to be provided")
45
+ return None
46
+
47
+ transcribe = transcribe_fn or (lambda path, device: transcribe_words(path, device))
48
+ load_audio = load_audio_fn or (lambda path, sr: load_mono_audio(path, sr))
49
+ encode_audio = encode_fn or (lambda audio: encode_audio_tokens(runtime.mimi, audio))
50
+
51
+ entries1, steps1, tokens1 = _process_prefix_audio(
52
+ runtime=runtime,
53
+ audio_path=prefix.speaker_1,
54
+ speaker_token=runtime.constants.spk1,
55
+ transcribe=transcribe,
56
+ load_audio=load_audio,
57
+ encode_audio=encode_audio,
58
+ )
59
+ offset = 3 # Match legacy BOS/PAD offset
60
+ entries = list(entries1)
61
+ new_word_steps = [step + offset for step in steps1]
62
+ audio_tokens = tokens1.to(runtime.device)
63
+
64
+ if prefix.speaker_2:
65
+ entries2, steps2, tokens2 = _process_prefix_audio(
66
+ runtime=runtime,
67
+ audio_path=prefix.speaker_2,
68
+ speaker_token=runtime.constants.spk2,
69
+ transcribe=transcribe,
70
+ load_audio=load_audio,
71
+ encode_audio=encode_audio,
72
+ )
73
+ spk1_frames = audio_tokens.shape[-1]
74
+ new_word_steps.extend(step + spk1_frames for step in steps2)
75
+ entries.extend(entries2)
76
+ audio_tokens = torch.cat([audio_tokens, tokens2.to(runtime.device)], dim=1)
77
+
78
+ return PrefixPlan(
79
+ entries=entries,
80
+ new_word_steps=new_word_steps,
81
+ aligned_tokens=audio_tokens,
82
+ aligned_frames=audio_tokens.shape[-1],
83
+ )
84
+
85
+
86
+ def _process_prefix_audio(
87
+ runtime: "RuntimeContext",
88
+ audio_path: str,
89
+ speaker_token: int,
90
+ *,
91
+ transcribe: Callable[[str, torch.device], List[WhisperWord]],
92
+ load_audio: Callable[[str, int], np.ndarray],
93
+ encode_audio: Callable[[np.ndarray], torch.Tensor],
94
+ ) -> tuple[List[Entry], List[int], torch.Tensor]:
95
+ words = transcribe(audio_path, runtime.device)
96
+ entries, steps = words_to_entries(
97
+ words=words,
98
+ tokenizer=runtime.tokenizer,
99
+ speaker_token=speaker_token,
100
+ frame_rate=runtime.frame_rate,
101
+ )
102
+ audio = load_audio(audio_path, runtime.mimi.sample_rate)
103
+ tokens = encode_audio(audio)
104
+ return entries, steps, tokens
105
+
106
+
107
+ def transcribe_words(
108
+ audio_path: str,
109
+ device: torch.device,
110
+ language: Optional[str] = None,
111
+ ) -> List[WhisperWord]:
112
+ import whisper_timestamped as wts # Imported lazily
113
+
114
+ model = wts.load_model("openai/whisper-large-v3", device=str(device))
115
+ result = wts.transcribe(model, audio_path, language=language)
116
+
117
+ words: List[WhisperWord] = []
118
+ for segment in result.get("segments", []):
119
+ for word in segment.get("words", []):
120
+ text = (word.get("text") or word.get("word") or "").strip()
121
+ if not text:
122
+ continue
123
+ words.append(
124
+ WhisperWord(
125
+ text=text,
126
+ start=float(word.get("start", 0.0)),
127
+ end=float(word.get("end", 0.0)),
128
+ )
129
+ )
130
+ return words
131
+
132
+
133
+ def words_to_entries(
134
+ *,
135
+ words: Sequence[WhisperWord],
136
+ tokenizer,
137
+ speaker_token: int,
138
+ frame_rate: float,
139
+ ) -> tuple[List[Entry], List[int]]:
140
+ entries: List[Entry] = []
141
+ new_word_steps: List[int] = []
142
+ if not words:
143
+ return entries, new_word_steps
144
+
145
+ convert = getattr(tokenizer, "convert_tokens_to_ids", None)
146
+ speaker_prefix: Optional[str] = None
147
+ if callable(convert):
148
+ s1_id = convert("[S1]")
149
+ s2_id = convert("[S2]")
150
+ if speaker_token == s1_id:
151
+ speaker_prefix = "[S1]"
152
+ elif speaker_token == s2_id:
153
+ speaker_prefix = "[S2]"
154
+ pending_prefix: Optional[str] = speaker_prefix
155
+ current_pos = 0
156
+
157
+ for idx, word in enumerate(words):
158
+ tokens = _encode_word(word.text, tokenizer, pending_prefix)
159
+ pending_prefix = None
160
+ start_frame = max(current_pos + 1, int(round(word.start * frame_rate)))
161
+ end_frame = start_frame + len(tokens)
162
+ new_word_steps.append(start_frame - 1)
163
+
164
+ if idx < len(words) - 1:
165
+ next_start = int(round(words[idx + 1].start * frame_rate))
166
+ next_word_start = max(end_frame + 1, next_start)
167
+ else:
168
+ end_time = int(round(words[-1].end * frame_rate))
169
+ next_word_start = max(end_frame + 1, end_time)
170
+
171
+ padding = max(0, next_word_start - start_frame - 1)
172
+ entries.append(Entry(tokens=tokens, text=word.text, padding=padding))
173
+ current_pos = end_frame
174
+
175
+ return entries, new_word_steps
176
+
177
+
178
+ def _encode_word(text: str, tokenizer, prefix: Optional[str]) -> List[int]:
179
+ if prefix:
180
+ return tokenizer.encode(f"{prefix} {text}", add_special_tokens=False)
181
+ return tokenizer.encode(text, add_special_tokens=False)
182
+
183
+
184
+ __all__ = [
185
+ "PrefixPlan",
186
+ "WhisperWord",
187
+ "build_prefix_plan",
188
+ "transcribe_words",
189
+ "words_to_entries",
190
+ ]
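A sketch of the word-alignment helper with fabricated Whisper timings and a stub tokenizer; both are illustrative stand-ins rather than real assets:

    class StubTokenizer:
        def encode(self, text, add_special_tokens=False):
            return [ord(c) for c in text]          # fake: one token per character

    words = [
        WhisperWord(text="hello", start=0.10, end=0.40),
        WhisperWord(text="there", start=0.55, end=0.90),
    ]
    entries, steps = words_to_entries(
        words=words,
        tokenizer=StubTokenizer(),
        speaker_token=0,                           # stub lacks convert_tokens_to_ids, so no speaker prefix
        frame_rate=12.5,                           # illustrative frame rate (Hz)
    )
    print(steps)                                   # frame index where each word starts
    print([(e.text, e.padding) for e in entries])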