Prince-1
/

VibeVoice

+#!/usr/bin/env python3
+"""
+convert_to_onnx.py - Export VibeVoice ASR components to ONNX opset 20.
+Exports (written to --output-dir, default: onnx_outputs/):
+  acoustic_encoder.onnx    audio [B,1,T] -> acoustic_latent [B,F,64]
+  acoustic_decoder.onnx    acoustic_latent [B,64,F] -> audio [B,1,T]
+  semantic_encoder.onnx    audio [B,1,T] -> semantic_latent [B,F,128]
+  acoustic_connector.onnx  acoustic_latent [B,F,64] -> lm_features [B,F,3584]
+  semantic_connector.onnx  semantic_latent [B,F,128] -> lm_features [B,F,3584]
+  diffusion_head.onnx      (noisy[N,L], timesteps[N], condition[N,H]) -> predicted[N,L]
+  llm_embed_tokens.onnx    token_ids [B,T] -> embeddings [B,T,3584]
+  lm_head.onnx             hidden_states [B,T,3584] -> logits [B,T,152064]
+Architecture facts (from content/ configs):
+  Encoder ratios (applied order) : 2, 2, 4, 5, 5, 8  (reversed from config [8,5,5,4,2,2])
+  Total hop length               : 2*2*4*5*5*8 = 3200 samples  (~133.3 ms at 24 kHz)
+  Acoustic VAE dim               : 64
+  Semantic VAE dim               : 128
+  LM hidden size (Qwen2.5-7B)    : 3584
+  Vocab size                     : 152 064
+Reference input size (REF_AUDIO_LEN = 48 000 samples = 2 s at 24 kHz):
+  This length gives an exact integer frame count at EVERY downsampling stage,
+  so no extra padding is baked into the ONNX graph as a constant.
+  For variable-length inference pad audio to a multiple of the total hop length, OR
+  use --dynamo to export with fully dynamic shapes.
+Usage:
+  python convert_to_onnx.py
+  python convert_to_onnx.py --output-dir onnx_out --device cpu
+  python convert_to_onnx.py --skip-llm        # skip 7 B LLM (saves ~30 GB RAM)
+  python convert_to_onnx.py --dynamo           # use torch.onnx.dynamo_export
+  python convert_to_onnx.py --components acoustic_encoder acoustic_connector
+"""
+from __future__ import annotations
+import sys
+import os
+import logging
+import argparse
+import warnings
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+warnings.filterwarnings("ignore", category=FutureWarning)
+warnings.filterwarnings("ignore", message=".*Torch was not compiled with flash attention.*")
+import torch
+import torch.nn as nn
+# ---------------------------------------------------------------------------
+# Paths
+# ---------------------------------------------------------------------------
ROOT = Path(__file__).parent.resolve()
CONTENT = ROOT / "content"            # checkpoints / configs read by the loaders below
VIBEVOICE_SRC = ROOT / "VibeVoice"    # source tree that provides the `vibevoice` package
# Put the bundled source tree on sys.path so `import vibevoice...` works
# without installing the package.
if str(VIBEVOICE_SRC) not in sys.path:
    sys.path.insert(0, str(VIBEVOICE_SRC))
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
OPSET = 20
SAMPLE_RATE = 24_000          # Hz - fixed by the VibeVoice architecture
# Total encoder downsampling factor = product of the stage strides
# (2, 2, 4, 5, 5, 8).  That product is 3200, not 1600 as previously
# hard-coded, so it is spelled out here to keep value and comment in sync.
HOP_LENGTH = 2 * 2 * 4 * 5 * 5 * 8   # = 3200 samples per latent frame
# 48 000 samples = 2 s at 24 kHz = 15 hops.  Because it is a multiple of
# HOP_LENGTH, every downsampling stage (strides 2,2,4,5,5,8) yields an exact
# integer frame count.  The intent is that no extra padding is needed for the
# reference clip, so no padding constant gets baked into the ONNX graph.
# (Any multiple of HOP_LENGTH has the same divisibility property.)
REF_AUDIO_LEN = 48_000
ACOUSTIC_VAE_DIM = 64
SEMANTIC_VAE_DIM = 128
LM_HIDDEN = 3584
LM_VOCAB = 152_064
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s  %(levelname)-7s  %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# 1. Register VibeVoice custom classes with Transformers AutoModel
+# ---------------------------------------------------------------------------
def _register_vibevoice():
    """Import the VibeVoice classes and register them with Transformers AutoModel.

    Each (config class, model class) pair is passed to ``AutoModel.register``
    so that ``AutoModel.from_pretrained`` can resolve the custom tokenizer and
    diffusion-head checkpoints.

    Returns:
        A 6-tuple: the three config classes (acoustic tokenizer, semantic
        tokenizer, diffusion head) followed by the three matching model
        classes, in that order.

    Raises:
        ImportError: if ``vibevoice`` or ``transformers`` cannot be imported.
    """
    from vibevoice.modular.configuration_vibevoice import (
        VibeVoiceAcousticTokenizerConfig,
        VibeVoiceSemanticTokenizerConfig,
        VibeVoiceDiffusionHeadConfig,
    )
    from vibevoice.modular.modular_vibevoice_tokenizer import (
        VibeVoiceAcousticTokenizerModel,
        VibeVoiceSemanticTokenizerModel,
    )
    from vibevoice.modular.modular_vibevoice_diffusion_head import VibeVoiceDiffusionHead
    from transformers.models.auto import AutoModel
    for cfg, mdl in [
        (VibeVoiceAcousticTokenizerConfig, VibeVoiceAcousticTokenizerModel),
        (VibeVoiceSemanticTokenizerConfig, VibeVoiceSemanticTokenizerModel),
        (VibeVoiceDiffusionHeadConfig, VibeVoiceDiffusionHead),
    ]:
        try:
            AutoModel.register(cfg, mdl)
        except ValueError as exc:
            # Registration is best-effort: a ValueError here normally means the
            # pair is already registered (e.g. vibevoice registered itself on
            # import).  Keep going, but leave a trace instead of swallowing
            # every exception type silently.
            log.debug("AutoModel.register(%s) skipped: %s", cfg.__name__, exc)
    log.info("VibeVoice model classes registered with AutoModel")
    return (
        VibeVoiceAcousticTokenizerConfig,
        VibeVoiceSemanticTokenizerConfig,
        VibeVoiceDiffusionHeadConfig,
        VibeVoiceAcousticTokenizerModel,
        VibeVoiceSemanticTokenizerModel,
        VibeVoiceDiffusionHead,
    )
+# ---------------------------------------------------------------------------
+# 2. ONNX-friendly wrapper modules
+# ---------------------------------------------------------------------------
class AcousticEncoderONNX(nn.Module):
    """Export wrapper around the acoustic tokenizer's encoder.

    Maps audio [B,1,T] to latents [B,F,64]: the wrapped encoder is called
    with the audio tensor only, and its channel-first output is returned
    frame-major (last two axes swapped).
    """

    def __init__(self, tokenizer: nn.Module):
        super().__init__()
        # Only the encoder sub-module is kept; the rest of the tokenizer is
        # not referenced by this wrapper.
        self.encoder = tokenizer.encoder

    def forward(self, audio: torch.Tensor) -> torch.Tensor:
        channel_first = self.encoder(audio)            # [B, vae_dim, F]
        return channel_first.permute(0, 2, 1)          # [B, F, vae_dim]
class AcousticDecoderONNX(nn.Module):
    """Export wrapper around the acoustic tokenizer's decoder.

    Maps latents [B,64,F] to audio [B,1,T].  A frame-major input [B,F,64] is
    also accepted: whenever axis 1 is not ``vae_dim`` the last two axes are
    swapped before decoding.

    NOTE(review): the layout test is an ordinary Python ``if`` on the input
    shape.  When this module is exported by tracing, presumably only the
    branch taken for the sample input ends up in the graph - confirm before
    relying on both layouts in the exported model.
    """

    def __init__(self, tokenizer: nn.Module):
        super().__init__()
        self.decoder = tokenizer.decoder
        self.vae_dim = tokenizer.config.vae_dim

    def forward(self, latents: torch.Tensor) -> torch.Tensor:
        channel_first = latents.shape[1] == self.vae_dim
        if not channel_first:
            # Treat the input as [B, F, vae_dim] and move channels to axis 1.
            latents = latents.permute(0, 2, 1)
        audio = self.decoder(latents)                  # [B, 1, T]
        return audio
class SemanticEncoderONNX(nn.Module):
    """Export wrapper around the semantic tokenizer's encoder.

    Maps audio [B,1,T] to latents [B,F,128] by calling the wrapped encoder
    and swapping the last two axes of its output.
    """

    def __init__(self, tokenizer: nn.Module):
        super().__init__()
        self.encoder = tokenizer.encoder

    def forward(self, audio: torch.Tensor) -> torch.Tensor:
        channel_first = self.encoder(audio)            # [B, 128, F]
        frame_major = channel_first.permute(0, 2, 1)   # [B, F, 128]
        return frame_major
class SpeechConnectorONNX(nn.Module):
    """Pass-through export wrapper for a SpeechConnector module.

    The connector (described upstream as Linear -> RMSNorm -> Linear) is
    stored as ``self.connector`` and applied unchanged to the input.
    """

    def __init__(self, connector: nn.Module):
        super().__init__()
        self.connector = connector

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        projected = self.connector(features)
        return projected
class DiffusionHeadONNX(nn.Module):
    """Export wrapper that calls the diffusion head with three positional tensors.

    ``forward`` takes the noisy latent, the timesteps and the conditioning
    tensor, and forwards them to the wrapped head in that order.
    """

    def __init__(self, head: nn.Module):
        super().__init__()
        self.head = head

    def forward(
        self,
        noisy_latent: torch.Tensor,   # [N, latent_size]
        timesteps: torch.Tensor,       # [N]  float
        condition: torch.Tensor,       # [N, hidden_size]
    ) -> torch.Tensor:
        head_inputs = (noisy_latent, timesteps, condition)
        return self.head(*head_inputs)
class LLMEmbedTokensONNX(nn.Module):
    """Export wrapper for the token embedding table.

    input_ids [B,T] -> embeddings [B,T,H]; the wrapped module is applied
    as-is.
    """

    def __init__(self, embed_tokens: nn.Module):
        super().__init__()
        self.embed_tokens = embed_tokens

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        embeddings = self.embed_tokens(input_ids)
        return embeddings
class LMHeadONNX(nn.Module):
    """Export wrapper for the LM output projection.

    hidden_states [B,T,H] -> logits [B,T,V]; the wrapped module is applied
    as-is.
    """

    def __init__(self, lm_head: nn.Module):
        super().__init__()
        self.lm_head = lm_head

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        logits = self.lm_head(hidden_states)
        return logits
+# ---------------------------------------------------------------------------
+# 3. Core export helper
+# ---------------------------------------------------------------------------
def _export_onnx(
    model: nn.Module,
    sample_args: tuple,
    out_path: Path,
    input_names: List[str],
    output_names: List[str],
    dynamic_axes: Optional[Dict] = None,
    use_dynamo: bool = False,
) -> None:
    """Export *model* to ONNX opset 20 at *out_path* and validate the result.

    Args:
        model: module to export; it is switched to eval mode first.
        sample_args: example positional inputs handed to the exporter.
        out_path: destination ``.onnx`` file.
        input_names: graph input names, in positional order.
        output_names: graph output names.
        dynamic_axes: per-tensor dynamic-axis mapping.  Only forwarded to the
            traditional exporter; the dynamo path does not receive it.
        use_dynamo: choose ``_export_dynamo`` instead of ``_export_traditional``.

    Raises:
        Whatever the exporter or ``onnx.checker.check_model`` raises.
    """
    import onnx
    model.eval()
    with torch.no_grad():
        if use_dynamo:
            _export_dynamo(model, sample_args, out_path, input_names, output_names)
        else:
            _export_traditional(
                model, sample_args, out_path,
                input_names, output_names, dynamic_axes or {},
            )
    # Validate by *path* rather than by loading a ModelProto first: the ONNX
    # checker cannot validate an in-memory proto larger than 2 GB (the fp32
    # embedding table and LM head are each above that), whereas the path form
    # works for any size and avoids reading all weights back into RAM.
    onnx.checker.check_model(str(out_path))
    # Size of the .onnx file itself; weights stored as external data files
    # next to it (if any) are not included in this figure.
    size_mb = out_path.stat().st_size / 1e6
    log.info("  [OK]  %-38s  %.1f MB", out_path.name, size_mb)
def _export_traditional(
    model, sample_args, out_path, input_names, output_names, dynamic_axes
):
    """Export with the classic ``torch.onnx.export`` call.

    Writes the graph to *out_path* at opset ``OPSET`` with constant folding
    enabled and parameters embedded, using the given tensor names and
    *dynamic_axes* mapping.  The export runs under ``torch.no_grad()``.
    """
    export_kwargs = dict(
        opset_version=OPSET,
        input_names=input_names,
        output_names=output_names,
        dynamic_axes=dynamic_axes,
        do_constant_folding=True,
        export_params=True,
    )
    destination = str(out_path)
    with torch.no_grad():
        torch.onnx.export(model, sample_args, destination, **export_kwargs)
def _export_dynamo(model, sample_args, out_path, input_names, output_names):
    """Export through the dynamo-based ONNX exporter, chosen by PyTorch version.

    * PyTorch >= 2.6: ``torch.onnx.export(..., dynamo=True)`` with the given
      input/output names.
    * PyTorch 2.1 - 2.5: ``torch.onnx.dynamo_export`` followed by ``save``;
      *input_names* / *output_names* are not used on this branch.
    * Older versions: ``RuntimeError``.

    NOTE(review): no dynamic-shape specification is passed on either branch,
    so whether the resulting graph actually has dynamic dimensions depends on
    the exporter's defaults - confirm.
    NOTE(review): confirm that ``torch.onnx.ExportOptions`` accepts
    ``opset_version`` on the 2.1 - 2.5 releases targeted by the legacy branch.

    Raises:
        RuntimeError: if the detected PyTorch version is below 2.1.
    """
    # Leading "major.minor" components; any non-numeric component is dropped.
    leading_parts = torch.__version__.split(".")[:2]
    pt_ver = tuple(map(int, filter(str.isdigit, leading_parts)))
    if pt_ver < (2, 1):
        raise RuntimeError(
            f"--dynamo requires PyTorch >= 2.1; found {torch.__version__}"
        )
    destination = str(out_path)
    if pt_ver >= (2, 6):
        # Unified API (PyTorch ≥ 2.6)
        torch.onnx.export(
            model,
            sample_args,
            destination,
            dynamo=True,
            opset_version=OPSET,
            input_names=input_names,
            output_names=output_names,
        )
        return
    # Legacy dynamo API (PyTorch 2.1 – 2.5)
    legacy_options = torch.onnx.ExportOptions(opset_version=OPSET)
    program = torch.onnx.dynamo_export(
        model, *sample_args, export_options=legacy_options
    )
    program.save(destination)
+# ---------------------------------------------------------------------------
+# 4. Model loading helpers
+# ---------------------------------------------------------------------------
def _load_pth_state(path: Path) -> Dict:
    """Load a .pth file onto the CPU and unwrap common wrapper dicts.

    If the loaded object is a dict holding a nested dict under
    ``"state_dict"``, ``"model"`` or ``"model_state_dict"`` (checked in that
    order), the first such nested dict is returned; otherwise the loaded dict
    itself is returned.

    Raises:
        TypeError: if the file does not contain a dict.  Callers iterate the
            result as a state dict, so failing here gives a clear message
            instead of an obscure error further down.
    """
    # SECURITY: weights_only=False lets torch.load unpickle arbitrary objects,
    # which can execute code embedded in the file.  Only use this on
    # checkpoints from a trusted source.  NOTE(review): switch to
    # weights_only=True if the checkpoints hold nothing but tensors.
    sd = torch.load(str(path), map_location="cpu", weights_only=False)
    if isinstance(sd, dict):
        for wrap_key in ("state_dict", "model", "model_state_dict"):
            inner = sd.get(wrap_key)
            if isinstance(inner, dict):
                sd = inner
                break
    if not isinstance(sd, dict):
        raise TypeError(
            f"{path} does not contain a state dict (got {type(sd).__name__})"
        )
    return sd
def _strip_prefix(sd: Dict, prefix: str) -> Dict:
    """Return a copy of *sd* with *prefix* removed from the keys that start with it.

    Keys that do not start with *prefix* are copied unchanged.  If two keys
    become equal after stripping, the later one in iteration order wins.
    """
    cut = len(prefix)
    renamed = {}
    for key, value in sd.items():
        new_key = key[cut:] if key.startswith(prefix) else key
        renamed[new_key] = value
    return renamed
def _load_acoustic_tokenizer(device: torch.device):
    """Load the acoustic tokenizer from ``CONTENT / "acoustic"``.

    The model is loaded through ``AutoModel.from_pretrained`` in float32 with
    ``trust_remote_code=True``, moved to *device* and put in eval mode.
    """
    from transformers import AutoModel
    checkpoint_dir = str(CONTENT / "acoustic")
    tokenizer = AutoModel.from_pretrained(
        checkpoint_dir,
        trust_remote_code=True,
        torch_dtype=torch.float32,
    )
    tokenizer = tokenizer.to(device)
    tokenizer = tokenizer.eval()
    log.info("  Acoustic tokenizer loaded  (VAE dim=%d)", tokenizer.config.vae_dim)
    return tokenizer
def _load_semantic_tokenizer(device: torch.device):
    """Load the semantic tokenizer from ``CONTENT / "semantic"``.

    The model is loaded through ``AutoModel.from_pretrained`` in float32 with
    ``trust_remote_code=True``, moved to *device* and put in eval mode.
    """
    from transformers import AutoModel
    checkpoint_dir = str(CONTENT / "semantic")
    tokenizer = AutoModel.from_pretrained(
        checkpoint_dir,
        trust_remote_code=True,
        torch_dtype=torch.float32,
    )
    tokenizer = tokenizer.to(device)
    tokenizer = tokenizer.eval()
    log.info("  Semantic tokenizer loaded  (VAE dim=%d)", tokenizer.config.vae_dim)
    return tokenizer
def _load_connector(
    path: Path,
    input_dim: int,
    output_dim: int,
    device: torch.device,
) -> nn.Module:
    """Build a ``SpeechConnector(input_dim, output_dim)`` and load its weights.

    The state dict at *path* may be a bare connector state dict or carry one
    of the known full-model prefixes.  When a prefix is found, only the keys
    under that prefix are kept (with the prefix removed), so a checkpoint
    that also contains other sub-modules still loads under ``strict=True``.
    Prefixes that contain the file's stem (e.g. ``acoustic_connector`` for
    ``acoustic_connector.pth``) are tried first, so the acoustic prefix is
    not picked by accident when loading the semantic connector.

    Returns:
        The connector on *device*, in eval mode.

    Raises:
        RuntimeError: from ``load_state_dict`` if the keys or shapes do not
            match the connector.
    """
    from vibevoice.modular.modeling_vibevoice import SpeechConnector
    connector = SpeechConnector(input_dim, output_dim).to(device)
    sd = _load_pth_state(path)
    known_prefixes = (
        "model.acoustic_connector.", "model.semantic_connector.",
        "acoustic_connector.",       "semantic_connector.",
    )
    # Stable sort: prefixes naming this file's connector come first, the
    # remaining ones keep their original relative order.
    ordered_prefixes = sorted(known_prefixes, key=lambda p: path.stem not in p)
    for prefix in ordered_prefixes:
        scoped = {
            k[len(prefix):]: v for k, v in sd.items() if k.startswith(prefix)
        }
        if scoped:
            # Drop everything outside the matched prefix; leftover keys from
            # other sub-modules would make the strict load below fail.
            sd = scoped
            break
    connector.load_state_dict(sd, strict=True)
    connector.eval()
    log.info("  Connector loaded from %s  (%d -> %d)", path.name, input_dim, output_dim)
    return connector
def _infer_diffusion_head_config(sd: Dict):
    """Infer a VibeVoiceDiffusionHeadConfig from state-dict tensor shapes.

    * ``hidden_size`` / ``latent_size`` come from the shape of the tensor
      whose key ends with ``noisy_images_proj.weight``.
    * ``head_layers`` is the number of distinct ``layers.<idx>`` entries that
      own a ``norm.weight`` tensor (at least 1).  The match tolerates a key
      prefix, like the other lookups in this function, and counts each layer
      index once even if several of its keys match.
    * ``head_ffn_ratio`` is rows-of-``ffn.gate_proj.weight`` / hidden_size,
      or 3.0 when no such tensor exists.

    Raises:
        KeyError: if no ``noisy_images_proj.weight`` tensor is present.
    """
    import re
    from vibevoice.modular.configuration_vibevoice import VibeVoiceDiffusionHeadConfig
    # Find noisy_images_proj.weight regardless of prefix
    proj_w = next(
        (v for k, v in sd.items() if k.endswith("noisy_images_proj.weight")),
        None,
    )
    if proj_w is None:
        raise KeyError(
            "'noisy_images_proj.weight' not found in diffusion head state dict. "
            f"Available keys (first 10): {list(sd.keys())[:10]}"
        )
    hidden_size, latent_size = proj_w.shape
    # Count layers by collecting the distinct indices of per-layer norm weights.
    layer_norm_key = re.compile(r"(?:^|\.)layers\.(\d+)\.(?:.*\.)?norm\.weight")
    layer_ids = {m.group(1) for m in map(layer_norm_key.search, sd) if m}
    head_layers = max(len(layer_ids), 1)
    # Infer FFN ratio
    ffn_w = next((v for k, v in sd.items() if k.endswith("ffn.gate_proj.weight")), None)
    head_ffn_ratio = (ffn_w.shape[0] / hidden_size) if ffn_w is not None else 3.0
    cfg = VibeVoiceDiffusionHeadConfig(
        hidden_size=hidden_size,
        latent_size=latent_size,
        head_layers=head_layers,
        head_ffn_ratio=head_ffn_ratio,
    )
    log.info(
        "  Diffusion head config  hidden=%d  latent=%d  layers=%d  ffn_ratio=%.1f",
        hidden_size, latent_size, head_layers, head_ffn_ratio,
    )
    return cfg
def _load_diffusion_head(path: Path, device: torch.device):
    """Load the diffusion head from a .pth file.

    If the state dict carries a ``model.prediction_head.`` or
    ``prediction_head.`` prefix, only the keys under that prefix are kept
    (with the prefix removed), so a checkpoint that also contains other
    sub-modules still loads under ``strict=True``.  The head's config is
    inferred from the resulting tensor shapes.

    Returns:
        ``(head, cfg)`` - the head on *device* in eval mode, and the inferred
        config.

    Raises:
        KeyError: from config inference when the expected projection tensor
            is missing.
        RuntimeError: from ``load_state_dict`` on key/shape mismatch.
    """
    from vibevoice.modular.modular_vibevoice_diffusion_head import VibeVoiceDiffusionHead
    sd = _load_pth_state(path)
    for prefix in ("model.prediction_head.", "prediction_head."):
        scoped = {
            k[len(prefix):]: v for k, v in sd.items() if k.startswith(prefix)
        }
        if scoped:
            # Keys outside the matched prefix belong to other modules and
            # would be rejected by the strict load below.
            sd = scoped
            break
    cfg = _infer_diffusion_head_config(sd)
    head = VibeVoiceDiffusionHead(cfg).to(device)
    head.load_state_dict(sd, strict=True)
    head.eval()
    return head, cfg
def _load_llm_embed_and_head(device: torch.device):
    """Return the LLM's ``embed_tokens`` and ``lm_head`` modules.

    The complete causal LM under ``CONTENT / "llm"`` is first loaded on the
    CPU in float32; the two sub-modules are then moved to *device* and put in
    eval mode, and the reference to the full model is dropped.

    Returns:
        ``(embed_tokens, lm_head)``.
    """
    from transformers import AutoModelForCausalLM
    log.info("  Loading Qwen2.5-7B  (embed_tokens + lm_head only - may take a few minutes) …")
    load_options = dict(
        torch_dtype=torch.float32,
        device_map="cpu",
        low_cpu_mem_usage=True,
    )
    full_model = AutoModelForCausalLM.from_pretrained(
        str(CONTENT / "llm"), **load_options
    )
    embed_tokens = full_model.model.embed_tokens.to(device).eval()
    lm_head = full_model.lm_head.to(device).eval()
    # Drop the only reference to the full model held by this function.
    del full_model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    log.info("  Qwen2.5-7B  embed_tokens + lm_head ready")
    return embed_tokens, lm_head
+# ---------------------------------------------------------------------------
+# 5. Per-component export functions
+# ---------------------------------------------------------------------------
def _dynamic_axes_audio():
    """Return a fresh dynamic-axes mapping for the acoustic encoder export.

    ``audio`` varies along axes 0 (batch) and 2 (time); ``acoustic_latent``
    along axes 0 (batch) and 1 (frames).
    """
    audio_axes = {0: "batch", 2: "time"}
    latent_axes = {0: "batch", 1: "frames"}
    return {"audio": audio_axes, "acoustic_latent": latent_axes}
def export_acoustic_encoder(out_dir: Path, device: torch.device, dynamo: bool) -> None:
    """Export the acoustic tokenizer encoder to ``acoustic_encoder.onnx``.

    The example input is random audio of shape [1, 1, REF_AUDIO_LEN] on
    *device*.  Graph input is ``audio``, graph output ``acoustic_latent``.
    """
    log.info("Exporting acoustic_encoder.onnx …")
    tokenizer = _load_acoustic_tokenizer(device)
    module = AcousticEncoderONNX(tokenizer).to(device)
    example_audio = torch.randn(1, 1, REF_AUDIO_LEN, device=device)
    target = out_dir / "acoustic_encoder.onnx"
    _export_onnx(
        module,
        (example_audio,),
        target,
        input_names=["audio"],
        output_names=["acoustic_latent"],
        dynamic_axes=_dynamic_axes_audio(),
        use_dynamo=dynamo,
    )
def export_acoustic_decoder(out_dir: Path, device: torch.device, dynamo: bool) -> None:
    """Export the acoustic tokenizer decoder to ``acoustic_decoder.onnx``.

    The example input is a random channel-first latent of shape
    [1, ACOUSTIC_VAE_DIM, REF_AUDIO_LEN // HOP_LENGTH] on *device*.  Graph
    input is ``acoustic_latent``, graph output ``audio``.
    """
    log.info("Exporting acoustic_decoder.onnx …")
    tokenizer = _load_acoustic_tokenizer(device)
    module = AcousticDecoderONNX(tokenizer).to(device)
    # Number of latent frames corresponding to the reference clip.
    n_frames = REF_AUDIO_LEN // HOP_LENGTH
    example_latents = torch.randn(1, ACOUSTIC_VAE_DIM, n_frames, device=device)
    axes = {
        "acoustic_latent": {0: "batch", 2: "frames"},
        "audio":            {0: "batch", 2: "time"},
    }
    _export_onnx(
        module,
        (example_latents,),
        out_dir / "acoustic_decoder.onnx",
        input_names=["acoustic_latent"],
        output_names=["audio"],
        dynamic_axes=axes,
        use_dynamo=dynamo,
    )
def export_semantic_encoder(out_dir: Path, device: torch.device, dynamo: bool) -> None:
    """Export the semantic tokenizer encoder to ``semantic_encoder.onnx``.

    The example input is random audio of shape [1, 1, REF_AUDIO_LEN] on
    *device*.  Graph input is ``audio``, graph output ``semantic_latent``.
    """
    log.info("Exporting semantic_encoder.onnx …")
    tokenizer = _load_semantic_tokenizer(device)
    module = SemanticEncoderONNX(tokenizer).to(device)
    example_audio = torch.randn(1, 1, REF_AUDIO_LEN, device=device)
    axes = {
        "audio":           {0: "batch", 2: "time"},
        "semantic_latent": {0: "batch", 1: "frames"},
    }
    _export_onnx(
        module,
        (example_audio,),
        out_dir / "semantic_encoder.onnx",
        input_names=["audio"],
        output_names=["semantic_latent"],
        dynamic_axes=axes,
        use_dynamo=dynamo,
    )
def export_acoustic_connector(out_dir: Path, device: torch.device, dynamo: bool) -> None:
    """Export the acoustic connector to ``acoustic_connector.onnx``.

    Weights come from ``CONTENT / "acoustic_connector.pth"``
    (ACOUSTIC_VAE_DIM -> LM_HIDDEN).  The example input is a random
    frame-major latent of shape
    [1, REF_AUDIO_LEN // HOP_LENGTH, ACOUSTIC_VAE_DIM].
    """
    log.info("Exporting acoustic_connector.onnx …")
    weights_path = CONTENT / "acoustic_connector.pth"
    connector = _load_connector(weights_path, ACOUSTIC_VAE_DIM, LM_HIDDEN, device)
    module = SpeechConnectorONNX(connector).to(device)
    n_frames = REF_AUDIO_LEN // HOP_LENGTH
    example_latents = torch.randn(1, n_frames, ACOUSTIC_VAE_DIM, device=device)
    axes = {
        "acoustic_latent":  {0: "batch", 1: "frames"},
        "acoustic_features": {0: "batch", 1: "frames"},
    }
    _export_onnx(
        module,
        (example_latents,),
        out_dir / "acoustic_connector.onnx",
        input_names=["acoustic_latent"],
        output_names=["acoustic_features"],
        dynamic_axes=axes,
        use_dynamo=dynamo,
    )
def export_semantic_connector(out_dir: Path, device: torch.device, dynamo: bool) -> None:
    """Export the semantic connector to ``semantic_connector.onnx``.

    Weights come from ``CONTENT / "semantic_connector.pth"``
    (SEMANTIC_VAE_DIM -> LM_HIDDEN).  The example input is a random
    frame-major latent of shape
    [1, REF_AUDIO_LEN // HOP_LENGTH, SEMANTIC_VAE_DIM].
    """
    log.info("Exporting semantic_connector.onnx …")
    weights_path = CONTENT / "semantic_connector.pth"
    connector = _load_connector(weights_path, SEMANTIC_VAE_DIM, LM_HIDDEN, device)
    module = SpeechConnectorONNX(connector).to(device)
    n_frames = REF_AUDIO_LEN // HOP_LENGTH
    example_latents = torch.randn(1, n_frames, SEMANTIC_VAE_DIM, device=device)
    axes = {
        "semantic_latent":  {0: "batch", 1: "frames"},
        "semantic_features": {0: "batch", 1: "frames"},
    }
    _export_onnx(
        module,
        (example_latents,),
        out_dir / "semantic_connector.onnx",
        input_names=["semantic_latent"],
        output_names=["semantic_features"],
        dynamic_axes=axes,
        use_dynamo=dynamo,
    )
def export_diffusion_head(out_dir: Path, device: torch.device, dynamo: bool) -> None:
    """Export the diffusion head to ``diffusion_head.onnx``.

    Weights come from ``CONTENT / "head.pth"``.  Example inputs use a batch
    of 4: a random noisy latent [4, latent_size], float timesteps [4] drawn
    from 0..999, and a random condition [4, hidden_size].  The leading axis
    of every input and of the output is exported as dynamic axis ``N``.
    """
    log.info("Exporting diffusion_head.onnx …")
    head, cfg = _load_diffusion_head(CONTENT / "head.pth", device)
    module = DiffusionHeadONNX(head).to(device)
    batch = 4  # batch of latent tokens
    # Keep the creation order noisy -> timesteps -> condition so the random
    # draws are identical for a given RNG state.
    example_noisy = torch.randn(batch, cfg.latent_size, device=device)
    example_steps = torch.randint(0, 1000, (batch,), dtype=torch.float32, device=device)
    example_cond = torch.randn(batch, cfg.hidden_size, device=device)
    tensor_names = ["noisy_latent", "timesteps", "condition", "predicted_noise"]
    axes = {tensor_name: {0: "N"} for tensor_name in tensor_names}
    _export_onnx(
        module,
        (example_noisy, example_steps, example_cond),
        out_dir / "diffusion_head.onnx",
        input_names=tensor_names[:3],
        output_names=tensor_names[3:],
        dynamic_axes=axes,
        use_dynamo=dynamo,
    )
def export_llm_parts(out_dir: Path, device: torch.device, dynamo: bool) -> None:
    """Export ``llm_embed_tokens.onnx`` and ``lm_head.onnx``.

    Both modules come from a single call to ``_load_llm_embed_and_head``.
    Example inputs use batch 1 and sequence length 32: random token ids below
    LM_VOCAB for the embedding table, random hidden states of width LM_HIDDEN
    for the LM head.  Batch and sequence axes are exported as dynamic.
    """
    log.info("Exporting llm_embed_tokens.onnx …")
    embed_tokens, lm_head = _load_llm_embed_and_head(device)
    batch_seq_axes = {0: "batch", 1: "seq"}
    example_ids = torch.randint(0, LM_VOCAB, (1, 32), device=device)
    _export_onnx(
        LLMEmbedTokensONNX(embed_tokens),
        (example_ids,),
        out_dir / "llm_embed_tokens.onnx",
        input_names=["input_ids"],
        output_names=["embeddings"],
        dynamic_axes={
            "input_ids": dict(batch_seq_axes),
            "embeddings": dict(batch_seq_axes),
        },
        use_dynamo=dynamo,
    )
    log.info("Exporting lm_head.onnx …")
    example_hidden = torch.randn(1, 32, LM_HIDDEN, device=device)
    _export_onnx(
        LMHeadONNX(lm_head),
        (example_hidden,),
        out_dir / "lm_head.onnx",
        input_names=["hidden_states"],
        output_names=["logits"],
        dynamic_axes={
            "hidden_states": dict(batch_seq_axes),
            "logits": dict(batch_seq_axes),
        },
        use_dynamo=dynamo,
    )
+# ---------------------------------------------------------------------------
+# 6. CLI
+# ---------------------------------------------------------------------------
# Component name -> export function.  Insertion order is the order in which
# components are exported when several are selected.
EXPORT_FNS = dict(
    (
        ("acoustic_encoder", export_acoustic_encoder),
        ("acoustic_decoder", export_acoustic_decoder),
        ("semantic_encoder", export_semantic_encoder),
        ("acoustic_connector", export_acoustic_connector),
        ("semantic_connector", export_semantic_connector),
        ("diffusion_head", export_diffusion_head),
        ("llm", export_llm_parts),
    )
)
# Valid values for --components, in export order.
ALL_COMPONENTS = list(EXPORT_FNS)
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser used by ``main``."""
    parser = argparse.ArgumentParser(
        description="Export VibeVoice ASR components to ONNX opset 20",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--output-dir",
        default="onnx_outputs",
        help="Directory where ONNX files are written (default: onnx_outputs/)",
    )
    parser.add_argument(
        "--device",
        default="cpu",
        help="PyTorch device string, e.g. 'cpu' or 'cuda:0' (default: cpu)",
    )
    parser.add_argument(
        "--skip-llm",
        action="store_true",
        help="Skip llm_embed_tokens + lm_head (saves ~28 GB RAM for the 7 B LLM)",
    )
    parser.add_argument(
        "--dynamo",
        action="store_true",
        help=(
            "Use torch.onnx.dynamo_export for fully dynamic shapes "
            "(requires PyTorch >= 2.1). Slower but handles variable audio lengths."
        ),
    )
    parser.add_argument(
        "--components",
        nargs="+",
        choices=ALL_COMPONENTS,
        help="Subset of components to export (default: all)",
    )
    return parser


def main() -> int:
    """Command-line entry point: export the selected components.

    Parses the arguments, creates the output directory, checks that ``onnx``
    is importable, registers the VibeVoice classes, then runs the export
    function of every selected component in ``ALL_COMPONENTS`` order.  A
    failing component is logged with its traceback and does not stop the
    remaining exports.

    Returns:
        0 when every selected component exported, 1 when ``onnx`` is missing
        or at least one export failed.
    """
    args = _build_arg_parser().parse_args()
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    device = torch.device(args.device)
    log.info(
        "VibeVoice ASR -> ONNX opset %d  |  device=%s  |  output=%s  |  dynamo=%s",
        OPSET, device, out_dir, args.dynamo,
    )
    log.info("PyTorch %s", torch.__version__)
    # Dependency check
    try:
        import onnx
        log.info("onnx %s", onnx.__version__)
    except ImportError:
        log.error("'onnx' not installed. Run:  pip install onnx onnxruntime")
        return 1
    _register_vibevoice()
    # Determine which components to export
    requested = set(args.components or ALL_COMPONENTS)
    if args.skip_llm:
        requested.discard("llm")
    selected = [name for name in ALL_COMPONENTS if name in requested]
    succeeded: List[str] = []
    failed: List[str] = []
    for name in selected:
        export_fn = EXPORT_FNS[name]
        try:
            export_fn(out_dir, device, args.dynamo)
        except Exception as exc:
            # Top-level boundary: record the failure (with traceback) and
            # carry on with the remaining components.
            log.exception("FAILED  %s: %s", name, exc)
            failed.append(name)
        else:
            succeeded.append(name)
    log.info("")
    log.info("=== Summary ===")
    log.info("Succeeded : %s", ", ".join(succeeded) if succeeded else "(none)")
    if failed:
        log.warning("Failed    : %s", ", ".join(failed))
    log.info("Output dir: %s", out_dir.resolve())
    if failed:
        return 1
    log.info("")
    log.info("Inference note:")
    log.info(
        "  Tokenizer encoders were exported with REF_AUDIO_LEN=%d samples (%g s).",
        REF_AUDIO_LEN, REF_AUDIO_LEN / SAMPLE_RATE,
    )
    log.info(
        "  For variable-length inference, pad audio to multiples of %d samples "
        "(%g ms) before feeding to acoustic_encoder / semantic_encoder.",
        HOP_LENGTH, HOP_LENGTH / SAMPLE_RATE * 1000,
    )
    log.info(
        "  Or re-export with --dynamo for fully dynamic shape support."
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())