zhu-han committed on
Commit
aa79b9c
·
verified ·
1 Parent(s): 72536e4

Upload 48 files

Browse files
Files changed (48) hide show
  1. app.py +128 -0
  2. omnivoice/__init__.py +28 -0
  3. omnivoice/cli/__init__.py +0 -0
  4. omnivoice/cli/demo.py +533 -0
  5. omnivoice/cli/infer.py +157 -0
  6. omnivoice/cli/infer_batch.py +523 -0
  7. omnivoice/cli/train.py +74 -0
  8. omnivoice/data/__init__.py +0 -0
  9. omnivoice/data/batching.py +166 -0
  10. omnivoice/data/collator.py +92 -0
  11. omnivoice/data/dataset.py +551 -0
  12. omnivoice/data/processor.py +258 -0
  13. omnivoice/eval/__init__.py +4 -0
  14. omnivoice/eval/models/ecapa_tdnn_wavlm.py +374 -0
  15. omnivoice/eval/models/utmos.py +370 -0
  16. omnivoice/eval/mos/utmos.py +299 -0
  17. omnivoice/eval/speaker_similarity/sim.py +321 -0
  18. omnivoice/eval/utils.py +80 -0
  19. omnivoice/eval/wer/common.py +88 -0
  20. omnivoice/eval/wer/fleurs.py +517 -0
  21. omnivoice/eval/wer/hubert.py +318 -0
  22. omnivoice/eval/wer/minimax.py +596 -0
  23. omnivoice/eval/wer/norm_config_module.py +291 -0
  24. omnivoice/eval/wer/punctuations.lst +188 -0
  25. omnivoice/eval/wer/seedtts.py +413 -0
  26. omnivoice/eval/wer/sensevoice.py +344 -0
  27. omnivoice/eval/wer/text_norm_omni.py +113 -0
  28. omnivoice/models/__init__.py +0 -0
  29. omnivoice/models/omnivoice.py +1502 -0
  30. omnivoice/scripts/__init__.py +0 -0
  31. omnivoice/scripts/denoise_audio.py +1048 -0
  32. omnivoice/scripts/extract_audio_tokens.py +625 -0
  33. omnivoice/scripts/extract_audio_tokens_add_noise.py +825 -0
  34. omnivoice/scripts/jsonl_to_webdataset.py +439 -0
  35. omnivoice/training/__init__.py +0 -0
  36. omnivoice/training/builder.py +180 -0
  37. omnivoice/training/checkpoint.py +180 -0
  38. omnivoice/training/config.py +98 -0
  39. omnivoice/training/trainer.py +342 -0
  40. omnivoice/utils/__init__.py +0 -0
  41. omnivoice/utils/audio.py +355 -0
  42. omnivoice/utils/common.py +56 -0
  43. omnivoice/utils/data_utils.py +63 -0
  44. omnivoice/utils/duration.py +282 -0
  45. omnivoice/utils/lang_map.py +698 -0
  46. omnivoice/utils/text.py +219 -0
  47. omnivoice/utils/voice_design.py +66 -0
  48. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
HuggingFace Space entry point for OmniVoice demo.

The Spaces runtime imports this module to find the module-level ``demo``
object, so the model is loaded once at import time and the UI is built
from the shared implementation in ``omnivoice.cli.demo``.
"""

import logging
import os
import tempfile
from typing import Any, Dict

import torch
import torchaudio

from omnivoice import OmniVoice, OmniVoiceGenerationConfig
from omnivoice.cli.demo import build_demo

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# ---------------------------------------------------------------------------
# Hardware detection
# ---------------------------------------------------------------------------
# NOTE(review): unlike omnivoice.cli.demo.get_best_device, Apple MPS is not
# considered here — presumably because HF Spaces only offer CUDA or CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {DEVICE}")

# ---------------------------------------------------------------------------
# Model loading
# ---------------------------------------------------------------------------
# The checkpoint can be overridden via the OMNIVOICE_MODEL env variable.
CHECKPOINT = os.environ.get("OMNIVOICE_MODEL", "k2-fsa/OmniVoice")

logger.info(f"Loading model from {CHECKPOINT} on {DEVICE} ...")
model = OmniVoice.from_pretrained(
    CHECKPOINT,
    device_map=DEVICE,
    dtype=torch.float16,
    # ASR enables auto-transcription of reference audio when the user
    # leaves the reference text empty (see the demo UI help text).
    load_asr=True,
)
logger.info("Model loaded on %s.", DEVICE)
# Output sampling rate of the model; reused when saving generated audio.
sampling_rate = model.sampling_rate

47
def _gen_core(
    text,
    language,
    ref_audio,
    instruct,
    num_step,
    guidance_scale,
    denoise,
    speed,
    duration,
    preprocess_prompt,
    postprocess_output,
    mode,
    ref_text=None,
):
    """Shared generation core for the Space UI.

    Builds an ``OmniVoiceGenerationConfig`` from raw widget values, assembles
    the keyword arguments for ``model.generate`` according to ``mode``
    ("clone" uses ``ref_audio``/``ref_text``, "design" uses ``instruct``),
    and writes the result to a temporary WAV file.

    Returns:
        Tuple ``(out_path, status)``: the path of the generated WAV file
        (or ``None`` on failure) and a human-readable status message.
    """
    if not text or not text.strip():
        return None, "Please enter the text to synthesize."

    # Fall back to sensible defaults when widgets hand us None.
    gen_config = OmniVoiceGenerationConfig(
        num_step=int(num_step or 32),
        guidance_scale=float(guidance_scale) if guidance_scale is not None else 2.0,
        denoise=bool(denoise) if denoise is not None else True,
        preprocess_prompt=bool(preprocess_prompt),
        postprocess_output=bool(postprocess_output),
    )

    # "Auto" in the dropdown means: let the model detect the language.
    lang = language if (language and language != "Auto") else None

    kw: Dict[str, Any] = dict(
        text=text.strip(), language=lang, generation_config=gen_config
    )

    # Speed/duration are only forwarded when they deviate from defaults;
    # a positive duration overrides speed inside the model.
    if speed is not None and float(speed) != 1.0:
        kw["speed"] = float(speed)
    if duration is not None and float(duration) > 0:
        kw["duration"] = float(duration)

    if mode == "clone":
        if not ref_audio:
            return None, "Please upload a reference audio."
        kw["voice_clone_prompt"] = model.create_voice_clone_prompt(
            ref_audio=ref_audio,
            ref_text=ref_text,
        )

    if mode == "design":
        if instruct and instruct.strip():
            kw["instruct"] = instruct.strip()

    try:
        audio = model.generate(**kw)
        # Create the output file only after generation succeeded, so a
        # failed run does not leave an orphan temp file behind. The context
        # manager closes the handle immediately; delete=False keeps the
        # file on disk for Gradio to serve.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            out_path = f.name
        torchaudio.save(out_path, audio[0], sampling_rate)
    except Exception as e:
        # Surface the failure to the UI instead of crashing the worker.
        return None, f"Error: {type(e).__name__}: {e}"

    return out_path, "Done."
104
+
105
+
106
# ---------------------------------------------------------------------------
# ZeroGPU wrapper
# ---------------------------------------------------------------------------
# On ZeroGPU Spaces the `spaces` package is importable and GPU work must run
# inside a @spaces.GPU()-decorated function. Elsewhere the import fails and
# generate_fn stays None, in which case build_demo falls back to its own
# internal generation core.
generate_fn = None
try:
    import spaces

    @spaces.GPU()
    def _gen_gpu(*args, **kwargs):
        # Thin pass-through so the decorator wraps the real core above.
        return _gen_core(*args, **kwargs)

    generate_fn = _gen_gpu
    logger.info("Using spaces.GPU() wrapper.")
except ImportError:
    logger.info("spaces module not found, running without GPU wrapper.")

# ---------------------------------------------------------------------------
# Build and launch demo — reuses the full UI from omnivoice.cli.demo
# ---------------------------------------------------------------------------
demo = build_demo(model, CHECKPOINT, generate_fn=generate_fn)

if __name__ == "__main__":
    demo.queue().launch()
omnivoice/__init__.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import warnings
from importlib.metadata import PackageNotFoundError, version

# Silence known-noisy warnings from third-party dependencies before the
# heavy imports below trigger them.
warnings.filterwarnings("ignore", module="torchaudio")
warnings.filterwarnings(
    "ignore",
    category=SyntaxWarning,
    message="invalid escape sequence",
    module="pydub.utils",
)
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    module="torch.distributed.algorithms.ddp_comm_hooks",
)

# Resolve the installed package version; fall back when running from a
# source tree that was never pip-installed.
try:
    __version__ = version("omnivoice")
except PackageNotFoundError:
    __version__ = "0.0.0"

# Re-export the public API at package level (import kept after the
# warning filters on purpose).
from omnivoice.models.omnivoice import (  # noqa: E402
    OmniVoice,
    OmniVoiceConfig,
    OmniVoiceGenerationConfig,
)

__all__ = ["OmniVoice", "OmniVoiceConfig", "OmniVoiceGenerationConfig"]
omnivoice/cli/__init__.py ADDED
File without changes
omnivoice/cli/demo.py ADDED
@@ -0,0 +1,533 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ """
18
+ Gradio demo for OmniVoice.
19
+
20
+ Supports voice cloning and voice design.
21
+
22
+ Usage:
23
+ omnivoice-demo --model /path/to/checkpoint --port 8000
24
+ """
25
+
26
+ import argparse
27
+ import logging
28
+ from typing import Any, Dict
29
+
30
+ import gradio as gr
31
+ import numpy as np
32
+ import torch
33
+
34
+ from omnivoice import OmniVoice, OmniVoiceGenerationConfig
35
+ from omnivoice.utils.lang_map import LANG_NAMES, lang_display_name
36
+
37
+
38
def get_best_device():
    """Pick the preferred torch device, in order: CUDA, Apple MPS, CPU."""
    candidates = (
        ("cuda", torch.cuda.is_available),
        ("mps", torch.backends.mps.is_available),
    )
    for name, is_available in candidates:
        if is_available():
            return name
    return "cpu"
45
+
46
+
47
# ---------------------------------------------------------------------------
# Language list — all 600+ supported languages
# ---------------------------------------------------------------------------
# "Auto" is prepended so the UI can offer automatic language detection;
# the remaining entries are human-readable display names, sorted.
_ALL_LANGUAGES = ["Auto"] + sorted(lang_display_name(n) for n in LANG_NAMES)


# ---------------------------------------------------------------------------
# Voice Design instruction templates
# ---------------------------------------------------------------------------
# Each option is displayed as "English / 中文".
# The model expects English for accents and Chinese for dialects.
# These labels are split on " / " by _build_instruct in build_demo, so the
# separator must stay exactly " / ".
_CATEGORIES = {
    "Gender / 性别": ["Male / 男", "Female / 女"],
    "Age / 年龄": [
        "Child / 儿童",
        "Teenager / 少年",
        "Young Adult / 青年",
        "Middle-aged / 中年",
        "Elderly / 老年",
    ],
    "Pitch / 音调": [
        "Very Low Pitch / 极低音调",
        "Low Pitch / 低音调",
        "Moderate Pitch / 中音调",
        "High Pitch / 高音调",
        "Very High Pitch / 极高音调",
    ],
    "Style / 风格": ["Whisper / 耳语"],
    "English Accent / 英文口音": [
        "American Accent / 美式口音",
        "Australian Accent / 澳大利亚口音",
        "British Accent / 英国口音",
        "Chinese Accent / 中国口音",
        "Canadian Accent / 加拿大口音",
        "Indian Accent / 印度口音",
        "Korean Accent / 韩国口音",
        "Portuguese Accent / 葡萄牙口音",
        "Russian Accent / 俄罗斯口音",
        "Japanese Accent / 日本口音",
    ],
    "Chinese Dialect / 中文方言": [
        "Henan Dialect / 河南话",
        "Shaanxi Dialect / 陕西话",
        "Sichuan Dialect / 四川话",
        "Guizhou Dialect / 贵州话",
        "Yunnan Dialect / 云南话",
        "Guilin Dialect / 桂林话",
        "Jinan Dialect / 济南话",
        "Shijiazhuang Dialect / 石家庄话",
        "Gansu Dialect / 甘肃话",
        "Ningxia Dialect / 宁夏话",
        "Qingdao Dialect / 青岛话",
        "Northeast Dialect / 东北话",
    ],
}

# Extra per-category hints rendered as the dropdown `info` text in the UI.
_ATTR_INFO = {
    "English Accent / 英文口音": "Only effective for English speech.",
    "Chinese Dialect / 中文方言": "Only effective for Chinese speech.",
}
107
+
108
+ # ---------------------------------------------------------------------------
109
+ # Argument parser
110
+ # ---------------------------------------------------------------------------
111
+
112
+
113
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``omnivoice-demo`` CLI."""
    parser = argparse.ArgumentParser(
        prog="omnivoice-demo",
        description="Launch a Gradio demo for OmniVoice.",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    add = parser.add_argument
    add(
        "--model",
        default="k2-fsa/OmniVoice",
        help="Model checkpoint path or HuggingFace repo id.",
    )
    add(
        "--device",
        default=None,
        help="Device to use. Auto-detected if not specified.",
    )
    add("--ip", default="0.0.0.0", help="Server IP (default: 0.0.0.0).")
    add("--port", type=int, default=7860, help="Server port (default: 7860).")
    add("--root-path", default=None, help="Root path for reverse proxy.")
    add("--share", action="store_true", default=False, help="Create public link.")
    return parser
140
+
141
+
142
+ # ---------------------------------------------------------------------------
143
+ # Build demo
144
+ # ---------------------------------------------------------------------------
145
+
146
+
147
def build_demo(
    model: OmniVoice,
    checkpoint: str,
    generate_fn=None,
) -> gr.Blocks:
    """Construct the Gradio Blocks UI (Voice Clone + Voice Design tabs).

    Args:
        model: A loaded OmniVoice model used for generation.
        checkpoint: Checkpoint path / repo id; currently unused by the UI.
        generate_fn: Optional replacement for the internal generation core
            (e.g. a spaces.GPU()-wrapped function on ZeroGPU Spaces). It
            must accept the same positional/keyword arguments as the
            internal ``_gen_core`` below.

    Returns:
        The assembled ``gr.Blocks`` demo; the caller launches it.
    """

    sampling_rate = model.sampling_rate

    # -- shared generation core --
    def _gen_core(
        text,
        language,
        ref_audio,
        instruct,
        num_step,
        guidance_scale,
        denoise,
        speed,
        duration,
        preprocess_prompt,
        postprocess_output,
        mode,
        ref_text=None,
    ):
        # Returns (audio, status): audio is a (sampling_rate, int16 ndarray)
        # tuple for gr.Audio(type="numpy"), or None on failure.
        if not text or not text.strip():
            return None, "Please enter the text to synthesize."

        # Fall back to defaults when widgets hand us None.
        gen_config = OmniVoiceGenerationConfig(
            num_step=int(num_step or 32),
            guidance_scale=float(guidance_scale) if guidance_scale is not None else 2.0,
            denoise=bool(denoise) if denoise is not None else True,
            preprocess_prompt=bool(preprocess_prompt),
            postprocess_output=bool(postprocess_output),
        )

        # "Auto" in the dropdown means: let the model detect the language.
        lang = language if (language and language != "Auto") else None

        kw: Dict[str, Any] = dict(
            text=text.strip(), language=lang, generation_config=gen_config
        )

        # Only forward speed/duration when they deviate from the defaults;
        # a positive duration overrides speed inside the model.
        if speed is not None and float(speed) != 1.0:
            kw["speed"] = float(speed)
        if duration is not None and float(duration) > 0:
            kw["duration"] = float(duration)

        if mode == "clone":
            if not ref_audio:
                return None, "Please upload a reference audio."
            kw["voice_clone_prompt"] = model.create_voice_clone_prompt(
                ref_audio=ref_audio,
                ref_text=ref_text,
            )

        if mode == "design":
            if instruct and instruct.strip():
                kw["instruct"] = instruct.strip()

        try:
            audio = model.generate(**kw)
        except Exception as e:
            # Surface the failure in the Status textbox instead of crashing.
            return None, f"Error: {type(e).__name__}: {e}"

        waveform = audio[0].squeeze(0).numpy()  # (T,)
        # Scale float waveform to int16 PCM for the numpy Audio component.
        waveform = (waveform * 32767).astype(np.int16)
        return (sampling_rate, waveform), "Done."

    # Allow external wrappers (e.g. spaces.GPU for ZeroGPU Spaces)
    _gen = generate_fn if generate_fn is not None else _gen_core

    # =====================================================================
    # UI
    # =====================================================================
    theme = gr.themes.Soft(
        font=["Inter", "Arial", "sans-serif"],
    )
    css = """
    .gradio-container {max-width: 100% !important; font-size: 16px !important;}
    .gradio-container h1 {font-size: 1.5em !important;}
    .gradio-container .prose {font-size: 1.1em !important;}
    .compact-audio audio {height: 60px !important;}
    .compact-audio .waveform {min-height: 80px !important;}
    """

    # Reusable: language dropdown component
    def _lang_dropdown(label="Language (optional) / 语种 (可选)", value="Auto"):
        return gr.Dropdown(
            label=label,
            choices=_ALL_LANGUAGES,
            value=value,
            allow_custom_value=False,
            interactive=True,
            info="Keep as Auto to auto-detect the language.",
        )

    # Reusable: optional generation settings accordion.
    # Returns the widgets in the order (ns, gs, dn, sp, du, pp, po), which
    # the click handlers below rely on.
    def _gen_settings():
        with gr.Accordion("Generation Settings (optional)", open=False):
            sp = gr.Slider(
                0.7,
                1.3,
                value=1.0,
                step=0.05,
                label="Speed",
                info="1.0 = normal. >1 faster, <1 slower. Ignored if Duration is set.",
            )
            du = gr.Number(
                value=None,
                label="Duration (seconds)",
                info=(
                    "Leave empty to use speed."
                    " Set a fixed duration to override speed."
                ),
            )
            ns = gr.Slider(
                4,
                64,
                value=32,
                step=1,
                label="Inference Steps",
                info="Default: 32. Lower = faster, higher = better quality.",
            )
            dn = gr.Checkbox(
                label="Denoise",
                value=True,
                info="Default: enabled. Uncheck to disable denoising.",
            )
            gs = gr.Slider(
                0.0,
                4.0,
                value=2.0,
                step=0.1,
                label="Guidance Scale (CFG)",
                info="Default: 2.0.",
            )
            pp = gr.Checkbox(
                label="Preprocess Prompt",
                value=True,
                info="apply silence removal and trimming to the reference "
                "audio, add punctuation in the end of reference text (if not already)",
            )
            po = gr.Checkbox(
                label="Postprocess Output",
                value=True,
                info="Remove long silences from generated audio.",
            )
        return ns, gs, dn, sp, du, pp, po

    with gr.Blocks(theme=theme, css=css, title="OmniVoice Demo") as demo:
        gr.Markdown(
            """
            # OmniVoice Demo

            State-of-the-art text-to-speech model for **600+ languages**, supporting:

            - **Voice Clone** — Clone any voice from a reference audio
            - **Voice Design** — Create custom voices with speaker attributes

            Built with [OmniVoice](https://github.com/k2-fsa/OmniVoice)
            by Xiaomi Next-gen Kaldi team.
            """
        )

        with gr.Tabs():
            # ==============================================================
            # Voice Clone
            # ==============================================================
            with gr.TabItem("Voice Clone"):
                with gr.Row():
                    with gr.Column(scale=1):
                        vc_text = gr.Textbox(
                            label="Text to Synthesize / 待合成文本",
                            lines=4,
                            placeholder="Enter the text you want to synthesize...",
                        )
                        vc_ref_audio = gr.Audio(
                            label="Reference Audio / 参考音频",
                            type="filepath",
                            elem_classes="compact-audio",
                        )
                        gr.Markdown(
                            "<span style='font-size:0.85em;color:#888;'>"
                            "Recommended: 3–10 seconds audio. "
                            "</span>"
                        )
                        vc_ref_text = gr.Textbox(
                            label=("Reference Text (optional)" " / 参考音频文本(可选)"),
                            lines=2,
                            placeholder="Transcript of the reference audio. Leave empty"
                            " to auto-transcribe via ASR models.",
                        )
                        vc_lang = _lang_dropdown("Language (optional) / 语种 (可选)")
                        (
                            vc_ns,
                            vc_gs,
                            vc_dn,
                            vc_sp,
                            vc_du,
                            vc_pp,
                            vc_po,
                        ) = _gen_settings()
                        vc_btn = gr.Button("Generate / 生成", variant="primary")
                    with gr.Column(scale=1):
                        vc_audio = gr.Audio(
                            label="Output Audio / 合成结果",
                            type="numpy",
                        )
                        vc_status = gr.Textbox(label="Status / 状态", lines=2)

                # Adapter: maps the clone tab's widgets onto _gen's signature
                # (instruct is None in clone mode).
                def _clone_fn(
                    text, lang, ref_aud, ref_text, ns, gs, dn, sp, du, pp, po
                ):
                    return _gen(
                        text,
                        lang,
                        ref_aud,
                        None,
                        ns,
                        gs,
                        dn,
                        sp,
                        du,
                        pp,
                        po,
                        mode="clone",
                        ref_text=ref_text or None,
                    )

                vc_btn.click(
                    _clone_fn,
                    inputs=[
                        vc_text,
                        vc_lang,
                        vc_ref_audio,
                        vc_ref_text,
                        vc_ns,
                        vc_gs,
                        vc_dn,
                        vc_sp,
                        vc_du,
                        vc_pp,
                        vc_po,
                    ],
                    outputs=[vc_audio, vc_status],
                )

            # ==============================================================
            # Voice Design
            # ==============================================================
            with gr.TabItem("Voice Design"):
                with gr.Row():
                    with gr.Column(scale=1):
                        vd_text = gr.Textbox(
                            label="Text to Synthesize / 待合成文本",
                            lines=4,
                            placeholder="Enter the text you want to synthesize...",
                        )
                        vd_lang = _lang_dropdown()

                        # One dropdown per attribute category; "Auto" means
                        # the attribute is left unspecified.
                        _AUTO = "Auto"
                        vd_groups = []
                        for _cat, _choices in _CATEGORIES.items():
                            vd_groups.append(
                                gr.Dropdown(
                                    label=_cat,
                                    choices=[_AUTO] + _choices,
                                    value=_AUTO,
                                    info=_ATTR_INFO.get(_cat),
                                )
                            )

                        (
                            vd_ns,
                            vd_gs,
                            vd_dn,
                            vd_sp,
                            vd_du,
                            vd_pp,
                            vd_po,
                        ) = _gen_settings()
                        vd_btn = gr.Button("Generate / 生成", variant="primary")
                    with gr.Column(scale=1):
                        vd_audio = gr.Audio(
                            label="Output Audio / 合成结果",
                            type="numpy",
                        )
                        vd_status = gr.Textbox(label="Status / 状态", lines=2)

                def _build_instruct(groups):
                    """Extract instruct text from UI dropdowns.

                    Language unification and validation is handled by
                    _resolve_instruct inside _preprocess_all.
                    """
                    selected = [g for g in groups if g and g != "Auto"]
                    if not selected:
                        return None
                    parts = []
                    for v in selected:
                        if " / " in v:
                            en, zh = v.split(" / ", 1)
                            # Dialects have no English equivalent
                            if "Dialect" in v.split(" / ")[0]:
                                parts.append(zh.strip())
                            else:
                                parts.append(en.strip())
                        else:
                            parts.append(v)
                    return ", ".join(parts)

                # Adapter: design mode has no reference audio; the instruct
                # string is assembled from the attribute dropdowns.
                def _design_fn(text, lang, ns, gs, dn, sp, du, pp, po, *groups):
                    return _gen(
                        text,
                        lang,
                        None,
                        _build_instruct(groups),
                        ns,
                        gs,
                        dn,
                        sp,
                        du,
                        pp,
                        po,
                        mode="design",
                    )

                vd_btn.click(
                    _design_fn,
                    inputs=[
                        vd_text,
                        vd_lang,
                        vd_ns,
                        vd_gs,
                        vd_dn,
                        vd_sp,
                        vd_du,
                        vd_pp,
                        vd_po,
                    ]
                    + vd_groups,
                    outputs=[vd_audio, vd_status],
                )

    return demo
491
+
492
+
493
+ # ---------------------------------------------------------------------------
494
+ # Main
495
+ # ---------------------------------------------------------------------------
496
+
497
+
498
def main(argv=None) -> int:
    """CLI entry point: parse arguments, load the model, launch the demo.

    Args:
        argv: Optional argument list (defaults to ``sys.argv[1:]``);
            useful for testing.

    Returns:
        Process exit code (0 on success).
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(name)s %(levelname)s: %(message)s",
    )
    parser = build_parser()
    args = parser.parse_args(argv)

    device = args.device or get_best_device()

    checkpoint = args.model
    if not checkpoint:
        # Defensive: --model has a default, but an explicit empty value
        # (e.g. --model "") should show usage instead of crashing later.
        parser.print_help()
        return 0
    logging.info(f"Loading model from {checkpoint}, device={device} ...")
    model = OmniVoice.from_pretrained(
        checkpoint,
        device_map=device,
        dtype=torch.float16,
        load_asr=True,
    )
    # Use logging (not print) for consistency with the rest of this module.
    logging.info("Model loaded.")

    demo = build_demo(model, checkpoint)

    demo.queue().launch(
        server_name=args.ip,
        server_port=args.port,
        share=args.share,
        root_path=args.root_path,
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
omnivoice/cli/infer.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Single-item inference CLI for OmniVoice.
2
+
3
+ Generates audio from a single text input using voice cloning,
4
+ voice design, or auto voice.
5
+
6
+ Usage:
7
+ # Voice cloning
8
+ omnivoice-infer --model k2-fsa/OmniVoice \
9
+ --text "Hello, this is a text for text-to-speech." \
10
+ --ref_audio ref.wav --ref_text "Reference transcript." --output out.wav
11
+
12
+ # Voice design
13
+ omnivoice-infer --model k2-fsa/OmniVoice \
14
+ --text "Hello, this is a text for text-to-speech." \
15
+ --instruct "male, British accent" --output out.wav
16
+
17
+ # Auto voice
18
+ omnivoice-infer --model k2-fsa/OmniVoice \
19
+ --text "Hello, this is a text for text-to-speech." --output out.wav
20
+ """
21
+
22
+ import argparse
23
+ import logging
24
+
25
+ import torch
26
+ import torchaudio
27
+
28
+ from omnivoice.models.omnivoice import OmniVoice
29
+ from omnivoice.utils.common import str2bool
30
+
31
+
32
def get_best_device():
    """Pick the best available inference device: CUDA > Apple MPS > CPU."""
    if torch.cuda.is_available():
        return "cuda"
    return "mps" if torch.backends.mps.is_available() else "cpu"
39
+
40
+
41
def get_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for single-item OmniVoice inference."""
    parser = argparse.ArgumentParser(
        description="OmniVoice single-item inference",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--model",
        type=str,
        default="k2-fsa/OmniVoice",
        help="Model checkpoint path or HuggingFace repo id.",
    )
    parser.add_argument(
        "--text",
        type=str,
        required=True,
        help="Text to synthesize.",
    )
    parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="Output WAV file path.",
    )
    # Voice cloning
    parser.add_argument(
        "--ref_audio",
        type=str,
        default=None,
        help="Reference audio file path for voice cloning.",
    )
    parser.add_argument(
        "--ref_text",
        type=str,
        default=None,
        help="Reference text describing the reference audio.",
    )
    # Voice design
    parser.add_argument(
        "--instruct",
        type=str,
        default=None,
        help="Style instruction for voice design mode.",
    )
    parser.add_argument(
        "--language",
        type=str,
        default=None,
        help="Language name (e.g. 'English') or code (e.g. 'en').",
    )
    # Generation parameters
    parser.add_argument("--num_step", type=int, default=32)
    parser.add_argument("--guidance_scale", type=float, default=2.0)
    parser.add_argument("--speed", type=float, default=1.0)
    parser.add_argument(
        "--duration",
        type=float,
        default=None,
        help="Fixed output duration in seconds. If set, overrides the "
        "model's duration estimation. The speed factor is automatically "
        "adjusted to match while preserving language-aware pacing.",
    )
    # NOTE(review): the knobs below are forwarded verbatim to
    # model.generate in main(); their exact semantics are defined in
    # omnivoice/models/omnivoice.py — not documented here.
    parser.add_argument("--t_shift", type=float, default=0.1)
    parser.add_argument("--denoise", type=str2bool, default=True)
    parser.add_argument(
        "--postprocess_output",
        type=str2bool,
        default=True,
    )
    parser.add_argument("--layer_penalty_factor", type=float, default=5.0)
    parser.add_argument("--position_temperature", type=float, default=5.0)
    parser.add_argument("--class_temperature", type=float, default=0.0)
    parser.add_argument(
        "--device",
        type=str,
        default=None,
        help="Device to use for inference. Auto-detected if not specified.",
    )
    return parser
119
+
120
+
121
def main():
    """Run one TTS generation from the command line and save the WAV."""
    log_format = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO, force=True)

    args = get_parser().parse_args()
    device = args.device or get_best_device()

    logging.info(f"Loading model from {args.model} on {device} ...")
    model = OmniVoice.from_pretrained(
        args.model, device_map=device, dtype=torch.float16
    )

    logging.info(f"Generating audio for: {args.text[:80]}...")
    # Every generation option maps 1:1 onto an argparse attribute of the
    # same name, so the call is assembled from a single tuple of names.
    option_names = (
        "text",
        "language",
        "ref_audio",
        "ref_text",
        "instruct",
        "duration",
        "num_step",
        "guidance_scale",
        "speed",
        "t_shift",
        "denoise",
        "postprocess_output",
        "layer_penalty_factor",
        "position_temperature",
        "class_temperature",
    )
    audios = model.generate(**{name: getattr(args, name) for name in option_names})

    torchaudio.save(args.output, audios[0], model.sampling_rate)
    logging.info(f"Saved to {args.output}")


if __name__ == "__main__":
    main()
omnivoice/cli/infer_batch.py ADDED
@@ -0,0 +1,523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Batch inference CLI for OmniVoice.
19
+
20
+ Distributes TTS generation across multiple GPUs for large-scale tasks.
21
+ Reads a JSONL test list, generates audio in parallel, and saves results.
22
+
23
+ Usage:
24
+ omnivoice-infer-batch --model k2-fsa/OmniVoice \
25
+ --test_list test.jsonl --res_dir results/
26
+
27
+ Test list format (JSONL, one JSON object per line):
28
+ Required fields: "id", "text"
29
+ Voice cloning: "ref_audio", "ref_text"
30
+ Voice design: "instruct"
31
+ Optional: "language_id", "language_name", "duration", "speed"
32
+ """
33
+
34
+ import argparse
35
+ import logging
36
+ import multiprocessing as mp
37
+ import os
38
+ import signal
39
+ import time
40
+ import traceback
41
+ from concurrent.futures import ProcessPoolExecutor, as_completed
42
+ from typing import List, Optional, Tuple
43
+
44
+ import torch
45
+ import torchaudio
46
+ from tqdm import tqdm
47
+
48
+ from omnivoice.models.omnivoice import OmniVoice
49
+ from omnivoice.utils.audio import load_audio
50
+ from omnivoice.utils.common import str2bool
51
+ from omnivoice.utils.data_utils import read_test_list
52
+ from omnivoice.utils.duration import RuleDurationEstimator
53
+
54
+
55
def get_best_device():
    """Auto-detect the best available device and its usable device count.

    Returns:
        A ``(device_name, device_count)`` tuple. Only CUDA can report more
        than one device; MPS and CPU always report 1.
    """
    if torch.cuda.is_available():
        return "cuda", torch.cuda.device_count()
    fallback = "mps" if torch.backends.mps.is_available() else "cpu"
    return fallback, 1
62
+
63
+
64
# Per-process model handle; populated by ``process_init`` inside each worker.
worker_model = None
# Sample rate (Hz) used when loading reference audio for duration estimates
# and for the warmup dummy clip.
SAMPLING_RATE = 24000
66
+
67
+
68
def get_parser():
    """Build the CLI argument parser for OmniVoice batch inference.

    Flags cover the model location, the JSONL test manifest, decoding
    hyper-parameters, per-GPU parallelism, batching strategy, and audio
    pre/post-processing toggles.
    """
    parser = argparse.ArgumentParser(description="Infer OmniVoice Model")
    # (flag, add_argument kwargs) pairs, registered in order below.
    arg_table = [
        (
            "--model",
            dict(
                type=str,
                default="k2-fsa/OmniVoice",
                help="Path to the model checkpoint (local dir or HF repo id). "
                "Audio tokenizer is expected at <checkpoint>/audio_tokenizer/.",
            ),
        ),
        (
            "--test_list",
            dict(
                type=str,
                required=True,
                help="Path to the JSONL file containing test samples. "
                'Each line is a JSON object: {"id": "name", "text": "...", '
                '"ref_audio": "/path.wav", "ref_text": "...", '
                '"language_id": "en", "language_name": "English", '
                '"duration": 10.0, "speed": 1.2}. '
                "language_id, language_name, duration, and speed are optional.",
            ),
        ),
        (
            "--res_dir",
            dict(
                type=str,
                required=True,
                help="Directory to save the generated audio files.",
            ),
        ),
        (
            "--num_step",
            dict(
                type=int,
                default=32,
                help="Number of steps for iterative decoding.",
            ),
        ),
        (
            "--guidance_scale",
            dict(
                type=float,
                default=2.0,
                help="Scale for Classifier-Free Guidance.",
            ),
        ),
        (
            "--t_shift",
            dict(
                type=float,
                default=0.1,
                help="Shift t to smaller ones if t_shift < 1.0",
            ),
        ),
        (
            "--nj_per_gpu",
            dict(
                type=int,
                default=1,
                help="Number of worker processes to spawn per GPU.",
            ),
        ),
        (
            "--audio_chunk_duration",
            dict(
                type=float,
                default=15.0,
                help="Maximum duration of audio chunk (in seconds) for splitting. "
                '"Not split" if <= 0.',
            ),
        ),
        (
            "--audio_chunk_threshold",
            dict(
                type=float,
                default=30.0,
                help=(
                    "The duration threshold (in seconds) to decide"
                    " whether to split audio into chunks."
                ),
            ),
        ),
        (
            "--batch_duration",
            dict(
                type=float,
                default=1000.0,
                help="Maximum total duration (reference + generated) per batch (seconds). "
                "Only effective for parallel_chunk / no chunk mode.",
            ),
        ),
        (
            "--batch_size",
            dict(
                type=int,
                default=0,
                help="Fixed batch size (number of samples per batch). "
                "If > 0, use fixed-size batching instead of duration-based batching.",
            ),
        ),
        (
            "--warmup",
            dict(
                type=int,
                default=0,
                help="Number of dummy inference runs per worker before real inference "
                "starts, to warm up CUDA kernels and caches.",
            ),
        ),
        (
            "--preprocess_prompt",
            dict(
                type=str2bool,
                default=True,
                help="Whether to preprocess reference audio (silence removal, trimming). "
                "Set to False to keep raw audio.",
            ),
        ),
        (
            "--postprocess_output",
            dict(
                type=str2bool,
                default=True,
                help="Whether to post-process generated audio (remove silence).",
            ),
        ),
        (
            "--layer_penalty_factor",
            dict(
                type=float,
                default=5.0,
                help="The penalty factor for layer-wise sampling.",
            ),
        ),
        (
            "--position_temperature",
            dict(
                type=float,
                default=5.0,
                help="The temperature for position selection.",
            ),
        ),
        (
            "--class_temperature",
            dict(
                type=float,
                default=0.0,
                help="The temperature for class token sampling.",
            ),
        ),
        (
            "--denoise",
            dict(
                type=str2bool,
                default=True,
                help="Whether to add <|denoise|> token in the reference.",
            ),
        ),
        (
            "--lang_id",
            dict(
                type=str,
                default=None,
                help="Language id to use when test_list JSONL entries do not contain "
                "language_id/language_name fields. If provided, both language_id and "
                "language_name will be set to this value.",
            ),
        ),
    ]
    for flag, kwargs in arg_table:
        parser.add_argument(flag, **kwargs)
    return parser
201
+
202
+
203
def process_init(rank_queue, model_checkpoint, warmup=0):
    """Initializer for each worker process.

    Loads model (with tokenizers and duration estimator) onto a specific GPU
    via ``OmniVoice.from_pretrained()``.

    Args:
        rank_queue: Shared queue of ``(device_type, device_id)`` tuples; each
            worker pops exactly one entry to claim its device.
        model_checkpoint: Local directory or HF repo id of the checkpoint.
        warmup: Number of dummy ``generate()`` calls to run before real work.
    """
    global worker_model

    # Cap thread counts so multiple workers per node do not oversubscribe CPUs.
    torch.set_num_threads(2)
    torch.set_num_interop_threads(2)

    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] "
        "[Worker %(process)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)

    # Claim a device assignment from the shared queue.
    rank = rank_queue.get()
    device_type, device_id = rank
    if device_type == "cpu":
        worker_device = "cpu"
    elif device_type == "mps":
        worker_device = "mps"
    else:
        worker_device = f"cuda:{device_id}"

    logging.info(f"Initializing worker on device: {worker_device}")

    worker_model = OmniVoice.from_pretrained(
        model_checkpoint,
        device_map=worker_device,
        dtype=torch.float16,
    )

    if warmup > 0:
        # Tiny dummy generations to compile CUDA kernels and fill caches
        # before any timed inference runs.
        logging.info(f"Running {warmup} warmup iterations on {worker_device}")
        dummy_ref_audio = (
            torch.randn(1, SAMPLING_RATE),
            SAMPLING_RATE,
        )  # 1s of random noise used as the reference clip
        for i in range(warmup):
            worker_model.generate(
                text=["hello"],
                language=["en"],
                ref_audio=[dummy_ref_audio],
                ref_text=["hello"],
            )
        logging.info(f"Warmup complete on {worker_device}")

    logging.info(f"Worker on {worker_device} initialized successfully.")
253
+
254
+
255
def estimate_sample_total_duration(
    duration_estimator: RuleDurationEstimator,
    text: str,
    ref_text: str,
    ref_audio_path: str,
    gen_duration: Optional[float] = None,
) -> float:
    """Estimate the combined reference + generated duration of one sample.

    Args:
        duration_estimator: Rule-based estimator used when no explicit
            generation duration is supplied.
        text: Target text to synthesize.
        ref_text: Transcript of the reference audio.
        ref_audio_path: Path to the reference audio file.
        gen_duration: Optional explicit generation duration (seconds).

    Returns:
        Reference duration plus (given or estimated) generation duration,
        in seconds.
    """
    reference = load_audio(ref_audio_path, SAMPLING_RATE)
    reference_seconds = reference.shape[-1] / SAMPLING_RATE

    if gen_duration is None:
        # Predict the synthesis length from text lengths and reference pace.
        gen_duration = duration_estimator.estimate_duration(
            text, ref_text, reference_seconds, low_threshold=2.0
        )

    return reference_seconds + gen_duration
272
+
273
+
274
def cluster_samples_by_duration(
    samples: List[Tuple],
    duration_estimator: RuleDurationEstimator,
    batch_duration: float,
) -> List[List[Tuple]]:
    """Greedily group samples into batches capped by total estimated duration.

    Samples are sorted by estimated (reference + generated) duration,
    longest first, then packed sequentially; any sample exceeding
    ``batch_duration`` on its own becomes a singleton batch.
    """
    measured = []
    for sample in samples:
        _, ref_text, ref_audio_path, text, _, _, dur, _ = sample
        est = estimate_sample_total_duration(
            duration_estimator,
            text,
            ref_text,
            ref_audio_path,
            gen_duration=dur,
        )
        measured.append((sample, est))

    measured.sort(key=lambda item: item[1], reverse=True)

    batches: List[List[Tuple]] = []
    pending: List[Tuple] = []
    pending_total = 0.0

    for sample, est in measured:
        if est > batch_duration:
            # Oversized samples always run alone.
            batches.append([sample])
        elif pending_total + est <= batch_duration:
            pending.append(sample)
            pending_total += est
        else:
            # Current batch is full; start a new one with this sample.
            batches.append(pending)
            pending = [sample]
            pending_total = est

    if pending:
        batches.append(pending)

    logging.info(f"Clustered {len(samples)} samples into {len(batches)} batches")
    return batches
314
+
315
+
316
def cluster_samples_by_batch_size(
    samples: List[Tuple],
    duration_estimator: RuleDurationEstimator,
    batch_size: int,
) -> List[List[Tuple]]:
    """Split samples into fixed-size batches, sorted by duration to minimize padding."""

    def _estimated_total(sample: Tuple) -> float:
        # Estimated reference + generated duration for sorting.
        _, ref_text, ref_audio_path, text, _, _, dur, _ = sample
        return estimate_sample_total_duration(
            duration_estimator,
            text,
            ref_text,
            ref_audio_path,
            gen_duration=dur,
        )

    # Longest-first ordering keeps similar-length samples adjacent, so each
    # fixed-size slice pads as little as possible.
    ordered = sorted(samples, key=_estimated_total, reverse=True)

    batches = [
        ordered[start : start + batch_size]
        for start in range(0, len(ordered), batch_size)
    ]
    logging.info(
        f"Split {len(samples)} samples into {len(batches)} batches "
        f"(fixed batch_size={batch_size}, sorted by duration)"
    )
    return batches
346
+
347
+
348
def run_inference_batch(
    batch_samples: List[Tuple],
    res_dir: str,
    **gen_kwargs,
) -> List[Tuple]:
    """Generate audio for one batch on this worker and save the WAV files.

    Args:
        batch_samples: Tuples of (save_name, ref_text, ref_audio_path, text,
            language_id, language_name, duration, speed).
        res_dir: Directory where generated ``<save_name>.wav`` files are written.
        **gen_kwargs: Extra keyword arguments forwarded to
            ``worker_model.generate()``.

    Returns:
        A list of (save_name, per-sample synthesis time, audio duration,
        "success") tuples.
    """
    global worker_model

    # Column-wise views of the batch tuples.
    names = [s[0] for s in batch_samples]
    ref_texts = [s[1] for s in batch_samples]
    ref_paths = [s[2] for s in batch_samples]
    texts = [s[3] for s in batch_samples]
    lang_ids = [s[4] for s in batch_samples]
    durations = [s[6] for s in batch_samples]
    speeds = [s[7] for s in batch_samples]

    tic = time.time()
    audios = worker_model.generate(
        text=texts,
        language=lang_ids,
        ref_audio=ref_paths,
        ref_text=ref_texts,
        # Pass the lists only if at least one entry is set.
        duration=durations if any(d is not None for d in durations) else None,
        speed=speeds if any(s is not None for s in speeds) else None,
        **gen_kwargs,
    )
    # Synthesis time is amortized evenly across the batch.
    per_sample_time = (time.time() - tic) / len(batch_samples)

    results = []
    for name, audio in zip(names, audios):
        out_path = os.path.join(res_dir, name + ".wav")
        torchaudio.save(out_path, audio, worker_model.sampling_rate)
        seconds = audio.shape[-1] / worker_model.sampling_rate
        results.append((name, per_sample_time, seconds, "success"))

    return results
400
+
401
+
402
def main():
    """Driver: parse args, spawn per-GPU workers, dispatch batches, report RTF."""
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
    # "spawn" gives each worker a clean interpreter (and CUDA context).
    mp.set_start_method("spawn", force=True)

    args = get_parser().parse_args()
    os.makedirs(args.res_dir, exist_ok=True)

    device_type, num_devices = get_best_device()
    if device_type == "cpu":
        logging.warning(
            "No GPU found. Falling back to CPU inference. This might be slow."
        )

    num_processes = num_devices * args.nj_per_gpu
    logging.info(
        f"Using {device_type} ({num_devices} device(s))."
        f" Spawning {num_processes} worker processes."
    )

    # Each worker pops one (device_type, device_id) pair from this queue in
    # process_init to claim its device; nj_per_gpu workers share each device.
    manager = mp.Manager()
    rank_queue = manager.Queue()
    for rank in list(range(num_devices)) * args.nj_per_gpu:
        rank_queue.put((device_type, rank))

    samples_raw = read_test_list(args.test_list)
    samples = []
    for s in samples_raw:
        # --lang_id, when given, overrides per-sample language fields.
        if args.lang_id is not None:
            lang_id = args.lang_id
            lang_name = args.lang_id
        else:
            lang_id = s.get("language_id")
            lang_name = s.get("language_name")
        # Tuple layout consumed by the cluster_* and run_inference_batch fns.
        samples.append(
            (
                s["id"],
                s["ref_text"],
                s["ref_audio"],
                s["text"],
                lang_id,
                lang_name,
                s.get("duration"),
                s.get("speed"),
            )
        )

    total_synthesis_time = []
    total_audio_duration = []

    try:
        with ProcessPoolExecutor(
            max_workers=num_processes,
            initializer=process_init,
            initargs=(rank_queue, args.model, args.warmup),
        ) as executor:
            futures = []

            # parallel_chunk / no chunk
            logging.info("Running batch inference")

            duration_estimator = RuleDurationEstimator()
            if args.batch_size > 0:
                batches = cluster_samples_by_batch_size(
                    samples, duration_estimator, args.batch_size
                )
            else:
                batches = cluster_samples_by_duration(
                    samples, duration_estimator, args.batch_duration
                )

            # NOTE(review): vars(args) forwards EVERY CLI flag (model,
            # test_list, nj_per_gpu, batch_size, ...) into
            # run_inference_batch's **gen_kwargs and from there into
            # worker_model.generate(); this assumes generate() accepts or
            # ignores those extra keys -- confirm against its signature.
            args_dict = vars(args)

            for batch in batches:
                futures.append(
                    executor.submit(
                        run_inference_batch, batch_samples=batch, **args_dict
                    )
                )

            for future in tqdm(
                as_completed(futures), total=len(futures), desc="Processing samples"
            ):
                try:
                    result = future.result()
                    for s_name, synth_time, audio_dur, status in result:
                        total_synthesis_time.append(synth_time)
                        total_audio_duration.append(audio_dur)
                        rtf = synth_time / audio_dur if audio_dur > 0 else float("inf")
                        logging.debug(
                            f"Processed {s_name}: Audio Duration={audio_dur:.2f}s, "
                            f"Synthesis Time={synth_time:.2f}s, RTF={rtf:.4f}"
                        )
                except Exception as e:
                    # A failed batch is logged but does not abort the run.
                    logging.error(f"Failed to process sample: {e}")
                    detailed_error = traceback.format_exc()
                    logging.error(f"Detailed error: {detailed_error}")

    except (Exception, KeyboardInterrupt) as e:
        logging.critical(
            f"An unrecoverable error occurred: {e}. Terminating all processes."
        )
        detailed_error_info = traceback.format_exc()
        logging.error(f"--- DETAILED TRACEBACK ---\n{detailed_error_info}")
        # Hard-kill the whole process group so no orphaned GPU workers survive.
        os.killpg(os.getpgid(os.getpid()), signal.SIGKILL)

    # Aggregate per-sample stats into run totals for the summary.
    total_synthesis_time = sum(total_synthesis_time)
    total_audio_duration = sum(total_audio_duration)
    logging.info("--- Summary ---")
    logging.info(f"Total audio duration: {total_audio_duration:.2f}s")
    logging.info(f"Total synthesis time: {total_synthesis_time:.2f}s")
    if total_audio_duration > 0:
        average_rtf = total_synthesis_time / total_audio_duration
        logging.info(f"Average RTF: {average_rtf:.4f}")
    else:
        logging.warning("No speech was generated. RTF cannot be computed.")

    logging.info("Done!")
520
+
521
+
522
# Script entry point for ``omnivoice-infer-batch``.
if __name__ == "__main__":
    main()
omnivoice/cli/train.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Training CLI for OmniVoice.
19
+
20
+ Launches distributed training via HuggingFace Accelerate.
21
+ Supports pre-training on Emilia data and finetuning on custom data.
22
+
23
+ Usage:
24
+ accelerate launch --gpu_ids 0,1,2,3 --num_processes 4 \\
25
+ -m omnivoice.cli.train \\
26
+ --train_config train_config.json \\
27
+ --data_config data_config.json \\
28
+ --output_dir output/
29
+
30
+ See examples/run_emilia.sh and examples/run_finetune.sh for full pipelines.
31
+ """
32
+
33
+ import argparse
34
+
35
+ from omnivoice.training.builder import build_dataloaders, build_model_and_tokenizer
36
+ from omnivoice.training.config import TrainingConfig
37
+ from omnivoice.training.trainer import OmniTrainer
38
+
39
+
40
def main():
    """Parse CLI arguments, build model and data pipelines, and start training."""
    parser = argparse.ArgumentParser(description="OmniVoice Training Entry Point")
    parser.add_argument(
        "--train_config", type=str, required=True, help="Path to config JSON"
    )
    parser.add_argument(
        "--output_dir", type=str, required=True, help="Where to save checkpoints"
    )
    parser.add_argument(
        "--data_config", type=str, required=True, help="Path to data config JSON"
    )
    args = parser.parse_args()

    # Load the training configuration and record CLI overrides on it.
    config = TrainingConfig.from_json(args.train_config)
    config.output_dir = args.output_dir
    config.data_config = args.data_config

    # Assemble model, tokenizer, and data loaders.
    model, tokenizer = build_model_and_tokenizer(config)
    train_loader, eval_loader = build_dataloaders(config, tokenizer)

    # Hand everything to the trainer and run the loop.
    trainer = OmniTrainer(
        model=model,
        config=config,
        train_dataloader=train_loader,
        eval_dataloader=eval_loader,
        tokenizer=tokenizer,
    )
    trainer.train()
+
72
+
73
+ if __name__ == "__main__":
74
+ main()
omnivoice/data/__init__.py ADDED
File without changes
omnivoice/data/batching.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Batching strategies for streaming/iterable datasets.
19
+
20
+ Provides length-based grouping and packing for efficient training with
21
+ variable-length audio.
22
+
23
+ Key classes:
24
+ - ``PackingIterableDataset``: Packs multiple samples into fixed-length sequences
25
+ for training. Used by ``omnivoice.training.builder``.
26
+ - ``StreamLengthGroupDataset``: Groups samples by length into buckets. Used by
27
+ data processing scripts (e.g. ``omnivoice/scripts/``).
28
+ """
29
+
30
+ import bisect
31
+ import logging
32
+ from typing import Any, Dict, Iterator, List, Optional
33
+
34
+ import numpy as np
35
+
36
+ from omnivoice.data.dataset import IterableDataReader, WrappedIterableDataset
37
+
38
+
39
class StreamLengthGroupDataset(WrappedIterableDataset):
    """A streaming dataset that groups samples by their lengths into buckets.
    Only support audio data for now.

    Samples are binned by audio duration into equal-width buckets so each
    yielded batch contains similar-length clips, keeping padding waste low.
    """

    def __init__(
        self,
        dataset: IterableDataReader,
        batch_duration: float,
        min_length: float = 0.5,
        max_length: float = 30.0,
        num_buckets: int = 20,
        audio_key: str = "audio",
        drop_last: bool = False,
        max_sample: Optional[int] = None,
    ):
        """
        Args:
            dataset: Source reader; must expose ``sample_rate`` and yield
                dicts containing ``audio_key``.
            batch_duration: Padded-duration budget per batch (seconds),
                measured as longest-sample length times batch size.
            min_length: Samples shorter than this (seconds) are skipped.
            max_length: Samples longer than this (seconds) are skipped.
            num_buckets: Number of equal-width duration buckets spanning
                [min_length, max_length].
            audio_key: Dict key holding the audio tensor in each sample.
            drop_last: If True, discard partially-filled buckets at stream end.
            max_sample: Optional hard cap on samples per batch.
        """
        self.dataset = dataset
        self.batch_duration = batch_duration
        self.min_length = min_length
        self.max_length = max_length
        self.num_buckets = num_buckets
        self.audio_key = audio_key
        self.drop_last = drop_last
        # float("inf") disables the per-batch sample cap.
        self.max_sample = max_sample if max_sample is not None else float("inf")

        # Upper duration edge of each bucket (left edge min_length excluded).
        self.boundaries = np.linspace(min_length, max_length, num_buckets + 1)[1:]

    def set_epoch(self, epoch: int):
        """
        Set the epoch for shuffling.
        """
        self.dataset.set_epoch(epoch)

    def _get_bucket_id(self, length: float) -> int:
        # Index of the first bucket whose upper edge is >= length.
        return bisect.bisect_left(self.boundaries, length)

    def __iter__(self) -> Iterator[List[Dict[str, Any]]]:
        """Yield batches of samples grouped by duration bucket."""
        # One open batch per bucket, plus the longest duration seen in it
        # (the padded batch cost is driven by that maximum).
        buckets = [[] for _ in range(self.num_buckets)]
        bucket_max_len = [0.0] * self.num_buckets

        for sample in self.dataset:
            audio = sample[self.audio_key]
            duration = audio.size(-1) / self.dataset.sample_rate

            # Discard clips outside the accepted duration range.
            if duration < self.min_length or duration > self.max_length:
                # logging.warning(f"Skipping sample with duration {duration:.2f}s")
                continue

            b_id = self._get_bucket_id(duration)
            buckets[b_id].append(sample)

            if duration > bucket_max_len[b_id]:
                bucket_max_len[b_id] = duration

            # Flush once the padded size (max length x count, with a +1
            # lookahead for the next sample) would meet the duration budget,
            # or the per-batch sample cap is reached.
            if (
                bucket_max_len[b_id] * (len(buckets[b_id]) + 1) >= self.batch_duration
                or len(buckets[b_id]) >= self.max_sample
            ):
                yield buckets[b_id]
                buckets[b_id] = []
                bucket_max_len[b_id] = 0.0

        if not self.drop_last:
            # Emit whatever remains in each bucket at end of stream.
            for b_idx, bucket in enumerate(buckets):
                if bucket:
                    yield bucket
                    buckets[b_idx] = []
107
+
108
class PackingIterableDataset(WrappedIterableDataset):
    """
    An IterableDataset that dynamically processes samples using a processor
    and packs them into batches based on the real token count.

    Args:
        dataset (Iterable): The raw dataset to process.
        processor (Callable): A processor to process each sample.
        batch_tokens (int): Maximum number of tokens per batch.
    """

    def __init__(
        self,
        dataset: IterableDataReader,
        processor: Any,
        batch_tokens: int,
    ):
        self.dataset = dataset
        self.processor = processor
        self.batch_tokens = batch_tokens
        self.skip_batches = 0

    def set_epoch(self, epoch: int):
        """
        Set the epoch for shuffling.
        """
        self.dataset.set_epoch(epoch)

    def __iter__(self) -> Iterator[List[Dict[str, Any]]]:
        """Yield lists of processed samples whose lengths sum to at most
        ``batch_tokens``."""
        batch: List[Dict[str, Any]] = []
        tokens_in_batch = 0

        for raw_sample in self.dataset:
            # Run the processor; a failing sample is logged and skipped.
            try:
                item = self.processor(raw_sample)
            except Exception as e:
                logging.warning(f"Error processing sample {raw_sample}: {e}")
                continue

            n_tokens = item["length"]

            # A single over-long sample can never fit in any batch; drop it.
            if n_tokens > self.batch_tokens:
                continue

            # Flush the open batch when this sample would overflow the budget.
            if tokens_in_batch + n_tokens > self.batch_tokens:
                yield batch
                batch = []
                tokens_in_batch = 0

            batch.append(item)
            tokens_in_batch += n_tokens

        # Emit the final partial batch, if any.
        if batch:
            yield batch
omnivoice/data/collator.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Data collator with packing for efficient training.
19
+
20
+ Packs multiple samples into a single sequence of fixed length (``batch_tokens``)
21
+ to maximize GPU utilization, instead of padding each sample individually.
22
+ Used by ``omnivoice.training.builder`` to create the collate function.
23
+ """
24
+
25
+ from typing import Any, Dict, List
26
+
27
+ import torch
28
+
29
+
30
class PackingDataCollator:
    """Concatenates pre-processed samples into one packed sequence.

    All samples in a list are joined along the time axis and right-padded to
    exactly ``batch_tokens`` positions, producing tensors with a leading
    batch dimension of 1.
    """

    def __init__(self, processor, batch_tokens: int):
        self.batch_tokens = batch_tokens
        # Provides the pad token id via ``processor.text_tokenizer``.
        self.processor = processor

    def __call__(self, processed_samples: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Pack ``processed_samples`` into a single fixed-length sequence.

        Returns a dict with "input_ids"/"labels" of shape [1, C, L],
        "audio_mask"/"position_ids"/"document_ids" of shape [1, L], where
        L == ``batch_tokens`` and C is the number of codebook layers.
        """
        # Concatenate every field along the time axis.
        packed_ids = torch.cat([s["input_ids"] for s in processed_samples], dim=1)
        packed_labels = torch.cat([s["labels"] for s in processed_samples], dim=1)
        packed_mask = torch.cat([s["audio_mask"] for s in processed_samples], dim=0)
        # Positions restart at 0 for each packed sample.
        packed_positions = torch.cat(
            [torch.arange(s["length"], dtype=torch.long) for s in processed_samples],
            dim=0,
        )
        # Sample index per position, so attention can stay within documents.
        doc_ids = torch.cat(
            [
                torch.full((s["length"],), idx, dtype=torch.int32)
                for idx, s in enumerate(processed_samples)
            ],
            dim=0,
        )

        pad_fn = torch.nn.functional.pad
        pad = self.batch_tokens - packed_ids.shape[1]

        packed_ids = pad_fn(
            packed_ids,
            pad=(0, pad),
            value=self.processor.text_tokenizer.pad_token_id,
        )
        packed_labels = pad_fn(packed_labels, pad=(0, pad), value=-100)
        packed_mask = pad_fn(packed_mask, pad=(0, pad), value=False)
        packed_positions = pad_fn(packed_positions, pad=(0, pad), value=0)
        doc_ids = pad_fn(doc_ids, pad=(0, pad), value=-1)

        return {
            "input_ids": packed_ids.unsqueeze(0),  # [1, C, L]
            "labels": packed_labels.unsqueeze(0),  # [1, C, L]
            "audio_mask": packed_mask.unsqueeze(0),  # [1, L]
            "position_ids": packed_positions.unsqueeze(0),  # [1, L]
            "document_ids": doc_ids.unsqueeze(0),  # [1, L]
        }
omnivoice/data/dataset.py ADDED
@@ -0,0 +1,551 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Dataset and data-loading utilities for training and evaluation.
19
+
20
+ Provides WebDataset-based iterable datasets, manifest parsing, and audio/token
21
+ loading. Used by ``omnivoice.training.builder.build_dataloaders()`` to construct
22
+ train and eval data loaders.
23
+
24
+ Key functions:
25
+ - ``prepare_data_manifests_from_json()``: Parses a data config JSON into train/dev
26
+ manifests.
27
+
28
+ Key classes:
29
+ - ``WebDatasetReader``: Reads audio/text pairs from WebDataset tar shards as an
30
+ iterable dataset.
31
+ - ``MuxWebDatasetReader``: Multiplexes multiple WebDataset readers for
32
+ multilingual data.
33
+ - ``JsonlDatasetReader``: Reads audio/text pairs from a JSONL manifest file.
34
+ Used by data processing scripts (e.g. ``omnivoice/scripts/``).
35
+ - ``SampleDecoder``: Decodes individual samples (audio or tokens + labels).
36
+ """
37
+
38
+ import io
39
+ import json
40
+ import logging
41
+ import os
42
+ import random
43
+ from typing import Any, Dict, Iterator, List, Optional, Tuple
44
+
45
+ import torch
46
+ import torch.distributed as dist
47
+ import torchaudio
48
+ import webdataset as wds
49
+ from torch.utils.data import IterableDataset
50
+
51
+
52
+ def load_audio_webdataset(data, sample_rate: int = 24000, device="cpu"):
53
+ """
54
+ Load audio from bytes data and resample to the target sample rate if needed.
55
+ Return a tensor of shape (1, num_samples)
56
+ """
57
+ audio, sr = torchaudio.load(io.BytesIO(data))
58
+ audio = audio.to(device)
59
+ if audio.size(dim=0) > 1:
60
+ audio = torch.mean(audio, dim=0)
61
+ if sr != sample_rate:
62
+ audio = torchaudio.functional.resample(audio, sr, sample_rate)
63
+ return audio
64
+
65
+
66
+ def prepare_data_manifests_from_json(
67
+ data_config: str,
68
+ ) -> Tuple[List[Tuple[str, str, int, float]], List[Tuple[str, str, int, float]]]:
69
+ """
70
+ Prepare data manifests from a json file.
71
+ A typical multilingual json file is in the following format:
72
+ {
73
+ "train":
74
+ [
75
+ {
76
+ "language_id": "en",
77
+ "manifest_path": [
78
+ "/Emilia/EN/data.lst"
79
+ ],
80
+ "repeat": 1
81
+ },
82
+ {
83
+ "language_id": "zh",
84
+ "manifest_path": [
85
+ "/Emilia/ZH/data.lst"
86
+ ],
87
+ "repeat": 1
88
+ }
89
+ ],
90
+ "dev":
91
+ [
92
+ {
93
+ "language_id": "en",
94
+ "manifest_path": [
95
+ "/Emilia/EN-dev/data.lst"
96
+ ],
97
+ "repeat": 1
98
+ },
99
+ {
100
+ "language_id": "zh",
101
+ "manifest_path": [
102
+ "/Emilia/ZH-dev/data.lst"
103
+ ],
104
+ "repeat": 1
105
+ }
106
+ ]
107
+ }
108
+
109
+ "language_id" is not used, just for better organization of multilingual data.
110
+ "repeat" is an optional field, default to 1, which indicates how many times
111
+ the manifest should be repeated.
112
+
113
+ The simplist format is like:
114
+ {
115
+ "train":
116
+ [
117
+ {
118
+ "manifest_path": [
119
+ "/Emilia/EN/data.lst",
120
+ "/Emilia/ZH/data.lst"
121
+ ],
122
+ }
123
+ ],
124
+ "dev":
125
+ [
126
+ {
127
+ "manifest_path": [
128
+ "/Emilia/EN-dev/data.lst",
129
+ "/Emilia/ZH-dev/data.lst"
130
+ ],
131
+ }
132
+ ]
133
+
134
+ data.lst format (items separated by space):
135
+ /path/to/data.tar /path/to/label.jsonl num_items num_seconds
136
+ """
137
+ train_manifests = []
138
+ dev_manifests = []
139
+ with open(data_config, "r", encoding="utf-8") as f:
140
+ data = json.load(f)
141
+ for item in data["train"]:
142
+ manifest_paths = item["manifest_path"]
143
+ repeat = item.get("repeat", 1)
144
+ for manifest_path in manifest_paths:
145
+ # assert manifest_path is a file
146
+ assert os.path.isfile(manifest_path), f"{manifest_path} is not a file."
147
+ train_manifests.extend(
148
+ webdataset_manifest_reader(manifest_path) * repeat
149
+ )
150
+ if "dev" in data:
151
+ for item in data["dev"]:
152
+ manifest_paths = item["manifest_path"]
153
+ repeat = item.get("repeat", 1)
154
+ for manifest_path in manifest_paths:
155
+ dev_manifests.extend(
156
+ webdataset_manifest_reader(manifest_path) * repeat
157
+ )
158
+ return train_manifests, dev_manifests
159
+
160
+
161
def webdataset_manifest_reader(
    manifest_path: str,
) -> List[Tuple[str, str, int, float]]:
    """
    Read a manifest file containing webdataset tar paths and label jsonl paths.

    Each non-empty line in the manifest file is in the format of:
        /path/to/data.tar /path/to/label.jsonl num_items num_seconds

    Args:
        manifest_path: Path to the manifest file.

    Returns:
        A list of ``(tar_path, label_jsonl_path, num_items, num_seconds)``
        tuples, with ``num_items`` converted to int and ``num_seconds``
        converted to float.

    Raises:
        ValueError: If a non-empty line does not have exactly four fields.
    """
    manifests = []
    with open(manifest_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Blank lines are allowed and skipped.
            if not line:
                continue
            parts = line.split()
            if len(parts) != 4:
                raise ValueError(
                    f"Invalid manifest line: {line}. "
                    f"Each line must contain "
                    "tar_path, label_jsonl_path, num_items, num_seconds."
                )
            tar_path, label_jsonl_path, num_items, num_seconds = (
                parts[0],
                parts[1],
                int(parts[2]),
                float(parts[3]),
            )
            manifests.append((tar_path, label_jsonl_path, num_items, num_seconds))
    return manifests
190
+
191
+
192
class SampleDecoder:
    """
    Decode a sample from webdataset, including loading audio/tokens and fetching label.
    """

    def __init__(
        self,
        tar_to_label: Dict,
        sample_rate: int = 24000,
        audio_format: Optional[Tuple[str]] = None,
        normalize_audio: bool = True,
    ):
        """
        Args:
            tar_to_label:
                A dict mapping from audio tar file to label tar file.
            sample_rate:
                Target sample rate for audio. Required if audio is loaded.
            audio_format:
                Tuple of audio file extensions to look for in the sample;
                defaults to ("flac", "wav", "mp3").
            normalize_audio:
                When True, peak-normalize loaded audio to 0.9.
        """
        self.tar_to_label = tar_to_label
        self.sample_rate = sample_rate
        self.normalize_audio = normalize_audio
        # Per-shard label lookup, created lazily and cached across calls.
        self.label_dataset = None
        self.audio_format = (
            ("flac", "wav", "mp3") if audio_format is None else audio_format
        )

    def __call__(self, sample):
        """Decode one webdataset sample into audio (or tokens) and its label."""
        decoded = {}
        shard_url = sample["__url__"]
        sample_key = sample["__key__"]

        label_path = self.tar_to_label[shard_url]
        # Shards are read sequentially, so reload the label file only when
        # the current shard changes.
        if self.label_dataset is None or self.label_dataset.path != label_path:
            self.label_dataset = LabelDataset(label_path)

        audio = torch.empty(0)
        if "npy" in sample:
            # Pre-extracted audio tokens take precedence over raw audio.
            decoded["audio_tokens"] = torch.from_numpy(sample["npy"])
        else:
            for ext in self.audio_format:
                if ext not in sample:
                    continue
                # load audio (1, num_samples)
                audio = load_audio_webdataset(
                    sample[ext], sample_rate=self.sample_rate
                )
                if self.normalize_audio:
                    audio = (audio / (audio.abs().max() + 1e-7)) * 0.9
                break
        decoded["audio"] = audio
        decoded["audio_duration"] = audio.size(-1) / self.sample_rate

        decoded["label"] = self.label_dataset[sample_key]
        return decoded
253
+
254
+
255
class LabelDataset:
    def __init__(self, jsonl_path: str):
        """
        In-memory lookup of labels keyed by sample id.

        Args:
            jsonl_path:
                Path to the jsonl file containing labels.
                Each non-empty line is a JSON object of the form:
                    {"id": "sample id", "text": "transcription text"}
                Lines without an "id" field are ignored.
        """
        self.path = jsonl_path
        self._labels = {}
        if not os.path.exists(jsonl_path):
            raise FileNotFoundError(f"Label jsonl file {jsonl_path} does not exist.")
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for raw in f:
                raw = raw.strip()
                if not raw:
                    continue
                record = json.loads(raw)
                if "id" in record:
                    self._labels[record["id"]] = record

    def __getitem__(self, key):
        return self._labels[key]
280
+
281
+
282
class IterableDataReader:
    "Interfaces for classes reading data."

    # Sample rate (Hz) of the audio yielded by this reader.
    sample_rate: int

    def set_epoch(self, epoch: int):
        """Notify the reader of the current epoch (e.g. to reseed shuffling)."""
        raise NotImplementedError

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        """Yield one decoded sample dict at a time."""
        raise NotImplementedError

    def __len__(self) -> int:
        """Return the total number of samples available from this reader."""
        raise NotImplementedError
295
+
296
+
297
class WrappedIterableDataset(IterableDataset):
    "IterableDataset interfaces in this project."

    def set_epoch(self, epoch: int):
        """Notify the dataset of the current epoch (e.g. to reseed shuffling)."""
        raise NotImplementedError

    def __iter__(self) -> Iterator[List[Dict[str, Any]]]:
        """Yield batches; each batch is a list of per-sample dicts."""
        raise NotImplementedError
305
+
306
+
307
class WebDatasetReader(IterableDataReader):
    """Stream decoded samples from WebDataset tar shards with paired labels."""

    def __init__(
        self,
        manifests: List[Tuple[str, str, int, float]],
        evaluation: bool = False,
        shuffle_buffer_size: int = 20000,
        sample_rate: int = 24000,
    ):
        """
        Args:
            manifests:
                ``(tar_path, label_jsonl_path, num_items, num_seconds)``
                tuples describing each shard.
            evaluation:
                When True, disable shard and sample shuffling.
            shuffle_buffer_size:
                Size of the in-memory sample shuffle buffer.
            sample_rate:
                Target audio sample rate, forwarded to the decoder.
        """
        self.shuffle_buffer_size = shuffle_buffer_size
        self.evaluation = evaluation
        self.epoch = 0

        self.orig_urls = [tar for tar, _, _, _ in manifests]
        self.tar_to_label = {tar: label for tar, label, _, _ in manifests}
        self.num_items = sum(count for _, _, count, _ in manifests)
        self.num_seconds = float(sum(secs for _, _, _, secs in manifests))
        self.urls = list(self.orig_urls)
        self.sample_decoder = SampleDecoder(
            tar_to_label=self.tar_to_label,
            sample_rate=sample_rate,
        )
        self.sample_rate = sample_rate

    def set_epoch(self, epoch: int):
        """
        Set the epoch for shuffling.
        """
        self.epoch = epoch
        self.urls = list(self.orig_urls)
        if not self.evaluation:
            # Deterministic shard order per epoch, same across ranks.
            random.Random(epoch).shuffle(self.urls)

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        source = wds.WebDataset(
            self.urls,
            shardshuffle=False,  # shard order is controlled by set_epoch
            workersplitter=wds.split_by_worker,
            nodesplitter=wds.split_by_node,
        )

        stream = source.decode().map(self.sample_decoder)
        if not self.evaluation:
            stream = stream.shuffle(self.shuffle_buffer_size, seed=self.epoch)
        return iter(stream)

    def __len__(self) -> int:
        """Total sample count across all shards, as declared by the manifests."""
        return self.num_items
360
+
361
+
362
class JsonlDatasetReader(IterableDataReader):
    """Read raw JSONL and load audio files, matching WebDatasetReader output format.

    Each JSONL line should be a JSON object with at least:
        {"id": "...", "audio_path": "/path/to/audio.wav", ...}

    Yields dicts of the form: {"audio": Tensor(1, T), "label": dict}
    """

    def __init__(
        self,
        jsonl_path: str,
        sample_rate: int = 24_000,
        shuffle: bool = True,
        shuffle_seed: int = 42,
        normalize_audio: bool = True,
    ):
        """
        Args:
            jsonl_path: Path to the jsonl metadata file.
            sample_rate: Target sample rate; audio is resampled if needed.
            shuffle: Shuffle all entries (reads the whole file into memory);
                otherwise entries are streamed lazily in file order.
            shuffle_seed: Seed for the shuffle; replaced by ``set_epoch``.
            normalize_audio: Peak-normalize each waveform to 0.9.
        """
        self.jsonl_path = jsonl_path
        self.sample_rate = sample_rate
        self.shuffle = shuffle
        self.shuffle_seed = shuffle_seed
        self.normalize_audio = normalize_audio

    def set_epoch(self, epoch: int):
        # Reseed with the epoch so each epoch sees a different ordering.
        self.shuffle_seed = epoch

    def _read_lines(self) -> list[dict]:
        """Load all entries into memory and (optionally) shuffle them."""
        entries = []
        with open(self.jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    entries.append(json.loads(line))
        if self.shuffle:
            # Use a private RNG instead of random.seed() so we do not
            # clobber the process-global random state shared by other
            # components (the resulting permutation is identical).
            random.Random(self.shuffle_seed).shuffle(entries)
            logging.info(
                f"Shuffled {len(entries)} JSONL entries (seed={self.shuffle_seed})"
            )
        return entries

    def _stream_lines(self):
        """Lazily yield entries in file order (no shuffling)."""
        with open(self.jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    yield json.loads(line)

    def __iter__(self):
        source = self._read_lines() if self.shuffle else self._stream_lines()

        # Split data across distributed ranks (multi-GPU / DDP)
        if dist.is_initialized():
            rank = dist.get_rank()
            world_size = dist.get_world_size()
            source = [item for i, item in enumerate(source) if i % world_size == rank]

        # Split data across DataLoader workers to avoid duplication
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is not None:
            source = (
                item
                for i, item in enumerate(source)
                if i % worker_info.num_workers == worker_info.id
            )

        for meta in source:
            audio_path = meta.get("audio_path")
            if not audio_path or not os.path.exists(audio_path):
                logging.warning(
                    f"Skipping {meta.get('id', '?')}: audio_path missing or not found"
                )
                continue
            try:
                waveform, sr = torchaudio.load(audio_path)
                # Downmix multi-channel audio to mono.
                if waveform.shape[0] > 1:
                    waveform = waveform.mean(dim=0, keepdim=True)
                if sr != self.sample_rate:
                    waveform = torchaudio.functional.resample(
                        waveform, sr, self.sample_rate
                    )
                if self.normalize_audio:
                    waveform = (waveform / (waveform.abs().max() + 1e-7)) * 0.9
                meta["audio_duration"] = waveform.shape[1] / self.sample_rate
                yield {"audio": waveform, "label": meta}
            except Exception as e:
                # Best-effort: corrupt/unreadable audio is skipped, not fatal.
                logging.warning(f"Skipping {meta.get('id', '?')}: {e}")
449
+
450
+
451
class MuxWebDatasetReader(IterableDataReader):
    """Multiplex several readers into one weighted, interleaved sample stream."""

    def __init__(
        self,
        readers: List[WebDatasetReader],
        weights: Optional[List[float]] = None,
        stop_early: bool = False,
        seed: int = 0,
    ):
        """
        Args:
            readers: Source readers to interleave.
            weights: Per-reader sampling weights; when None, readers are
                weighted by their lengths.
            stop_early: Stop as soon as the first reader is exhausted.
            seed: Seed for the multiplexing RNG.
        """
        self.readers = readers
        self.stop_early = stop_early
        self.mux_iterator = LazyIteratorMultiplexer(
            *readers,
            stop_early=stop_early,
            weights=weights,
            seed=seed,
        )

    def set_epoch(self, epoch: int):
        """
        Set the epoch for shuffling.
        """
        # Propagate the epoch to every underlying reader.
        for rdr in self.readers:
            rdr.set_epoch(epoch)

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        return iter(self.mux_iterator)
477
+
478
+
479
class LazyIteratorMultiplexer:
    """
    A wrapper over multiple iterators that enables to combine
    lazy manifests in Lhotse. During iteration, unlike
    :class:`.LazyIteratorChain`,
    :class:`.LazyIteratorMultiplexer` at each step randomly
    selects the iterable used to yield an item.

    Since the iterables might be of different length, we provide
    a ``weights`` parameter to let the user decide which iterables
    should be sampled more frequently than others.
    When an iterable is exhausted, we will keep sampling from the other iterables, until
    we exhaust them all, unless ``stop_early`` is set to ``True``.
    """

    def __init__(
        self,
        # String (forward-reference) annotation so this class does not depend
        # on IterableDataReader being defined first; any sized iterable works.
        *iterators: "IterableDataReader",
        stop_early: bool = False,
        weights: Optional[List[float]] = None,
        seed: int = 0,
    ) -> None:
        """
        Args:
            iterators: Two or more iterables to multiplex.
            stop_early: Stop when ANY iterable is exhausted (instead of ALL).
            weights: Relative sampling weights, one per iterable. When None,
                weights default to the iterables' lengths (or uniform when
                lengths are unavailable).
            seed: Seed for the selection RNG; a fresh RNG is created on every
                iteration, so re-iterating yields the same order.
        """
        self.iterators = list(iterators)
        self.stop_early = stop_early
        self.seed = seed

        assert (
            len(self.iterators) > 1
        ), "There have to be at least two iterables to multiplex."

        if weights is None:
            # Default: sample each iterable proportionally to its length when
            # all lengths are known, otherwise uniformly.
            if all(hasattr(it, "__len__") for it in self.iterators):
                lengths = [len(it) for it in self.iterators]
                total_length = sum(lengths)
                self.weights = [length / total_length for length in lengths]
            else:
                self.weights = [1] * len(self.iterators)
        else:
            self.weights = weights

        assert len(self.iterators) == len(self.weights)

    def __iter__(self):
        """Yield items by randomly alternating between the source iterables."""
        rng = random.Random(self.seed)
        iters = [iter(it) for it in self.iterators]
        exhausted = [False for _ in range(len(iters))]

        def should_continue():
            # stop_early: stop once ANY source runs dry;
            # otherwise keep going until ALL sources run dry.
            if self.stop_early:
                return not any(exhausted)
            else:
                return not all(exhausted)

        while should_continue():
            # Restrict the draw to the iterables that still have items.
            active_indexes, active_weights = zip(
                *[
                    (i, w)
                    for i, (is_exhausted, w) in enumerate(zip(exhausted, self.weights))
                    if not is_exhausted
                ]
            )
            idx = rng.choices(active_indexes, weights=active_weights, k=1)[0]
            selected = iters[idx]
            try:
                item = next(selected)
                yield item
            except StopIteration:
                exhausted[idx] = True
                continue

    def __len__(self) -> int:
        """Combined length of all source iterables."""
        return sum(len(iterator) for iterator in self.iterators)
omnivoice/data/processor.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Training sample processor for OmniVoice.
19
+
20
+ Converts raw audio/text samples into model-ready tensors: applies prompt/mask
21
+ tokenization, randomly drops conditioning, and injects language/instruct tokens.
22
+ Used by ``omnivoice.training.builder`` to build the data pipeline.
23
+
24
+ Contains two processor classes:
25
+ - ``OmniVoiceSampleProcessor``: Full processor used for training.
26
+ - ``OmniVoiceSimpleSampleProcessor``: Simplified processor (not used for training).
27
+ """
28
+
29
+ import random
30
+ from typing import Any, Dict
31
+
32
+ import torch
33
+
34
+
35
class OmniVoiceSampleProcessor:
    """
    Handles the logic of processing a raw sample into tensors
    (masking, tokenization, etc.).

    Builds the [style | text | audio] token sequence for one training
    sample, masks a random portion of the audio tokens, and produces loss
    labels (-100 everywhere the loss must be skipped).
    """

    def __init__(
        self,
        text_tokenizer: Any,
        num_channels: int,
        audio_mask_id: int,
        prompt_ratio_range: tuple,
        mask_ratio_range: tuple,
        drop_cond_ratio: float,
        language_ratio: float,
        use_pinyin_ratio: float,
        instruct_ratio: float,
        only_instruct_ratio: float,
    ):
        """
        Args:
            text_tokenizer: HF-style tokenizer; called with
                ``return_tensors="pt"`` and read via ``.input_ids``.
            num_channels: Number of parallel token channels (text rows are
                repeated across channels).
            audio_mask_id: Token id written into masked audio positions.
            prompt_ratio_range: (lo, hi) range of the audio-prefix prompt ratio.
            mask_ratio_range: (lo, hi) range of the per-token mask probability.
            drop_cond_ratio: Probability of dropping ALL conditioning
                (prompt + text + style), for classifier-free guidance style
                training.
            language_ratio: Probability of including the language id in the
                style prompt.
            use_pinyin_ratio: Probability of using the pinyin transcript when
                one is available.
            instruct_ratio: Probability of including the instruction text.
            only_instruct_ratio: Given an instruction is used, probability of
                dropping the audio prompt (instruction-only conditioning).
        """
        self.text_tokenizer = text_tokenizer
        self.num_channels = num_channels
        self.audio_mask_id = audio_mask_id
        self.prompt_ratio_range = prompt_ratio_range
        self.mask_ratio_range = mask_ratio_range
        self.drop_cond_ratio = drop_cond_ratio

        self.language_ratio = language_ratio
        self.use_pinyin_ratio = use_pinyin_ratio
        self.instruct_ratio = instruct_ratio
        self.only_instruct_ratio = only_instruct_ratio

    def __call__(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one sample (with "label" and "audio_tokens") into
        ``{"input_ids": [C, L], "labels": [C, L], "audio_mask": [L],
        "length": int}``.

        Uses the global ``random`` state; call order of the draws below is
        part of the reproducibility contract.
        """

        # clean_start_token_idx is only used for prompt denoising training,
        # where the prompt region is augmented with noises and the model
        # needs to learn to recover the clean prompt.
        # clean_start_token_idx indicates the start index of the clean generated token.
        if "clean_start_token_idx" in sample["label"]:
            drop_cond = False
        else:
            drop_cond = random.uniform(0, 1) < self.drop_cond_ratio

        if drop_cond:
            # Unconditional branch: no prompt, no text, no style info.
            prompt_ratio = 0.0
            drop_text = True
            use_language = False
            use_instruct = False
        else:
            prompt_ratio = random.uniform(*self.prompt_ratio_range)
            drop_text = False
            use_language = random.uniform(0, 1) < self.language_ratio
            use_instruct = random.uniform(0, 1) < self.instruct_ratio
            # Occasionally condition on the instruction alone (no audio prompt).
            if use_instruct and random.uniform(0, 1) < self.only_instruct_ratio:
                prompt_ratio = 0.0

        mask_ratio = random.uniform(*self.mask_ratio_range)

        # --- Style ---
        # Dropped fields are encoded literally as the string "None".
        style = ""
        if use_language:
            language = sample["label"].get("language_id", "None")
        else:
            language = "None"
        if use_instruct:
            instruct = sample["label"].get("instruct", "None")
        else:
            instruct = "None"

        if "clean_start_token_idx" in sample["label"]:
            style += "<|denoise|>"

        style += f"<|lang_start|>{language}<|lang_end|>"
        style += f"<|instruct_start|>{instruct}<|instruct_end|>"

        # Repeat the (1, L) token row across all channels -> (C, L).
        style_inputs = self.text_tokenizer(style, return_tensors="pt").input_ids.repeat(
            self.num_channels, 1
        )
        style_labels = torch.full(
            style_inputs.shape, -100
        )  # Style prompt does not compute loss

        # --- Text ---
        if (
            "text_pinyin" in sample["label"]
            and random.uniform(0, 1) < self.use_pinyin_ratio
        ):
            text = sample["label"]["text_pinyin"]
        else:
            text = sample["label"]["text"]
        text_inputs = self.text_tokenizer(
            f"<|text_start|>{text}<|text_end|>", return_tensors="pt"
        ).input_ids.repeat(self.num_channels, 1)
        text_labels = torch.full(text_inputs.shape, -100)  # Text does not compute loss

        # --- Audio ---
        audio_tokens = sample["audio_tokens"].long()

        # Masking Logic
        # The first `prompt_length` audio tokens act as the (unmasked) prompt.
        if "clean_start_token_idx" in sample["label"]:
            prompt_length = sample["label"]["clean_start_token_idx"]
        else:
            prompt_length = int(audio_tokens.shape[1] * prompt_ratio)

        audio_inputs = audio_tokens.clone()
        audio_labels = audio_tokens.clone()

        # Apply masking
        # maskable_region is only used for its shape when drawing the mask.
        maskable_region = audio_tokens[:, prompt_length:]
        token_mask = torch.rand(maskable_region.shape) < mask_ratio
        audio_inputs[:, prompt_length:][token_mask] = self.audio_mask_id
        audio_labels[:, prompt_length:][
            ~token_mask
        ] = -100  # Only compute loss on masked tokens
        if not drop_cond:
            audio_labels[:, :prompt_length] = -100  # No loss on prompt region

        # --- Concatenation ---
        if drop_text:
            # Unconditional sample: the sequence is audio only.
            input_ids = audio_inputs
            labels = audio_labels
            total_length = input_ids.shape[1]
            audio_mask = torch.ones(total_length, dtype=torch.bool)
        else:
            input_ids = torch.cat([style_inputs, text_inputs, audio_inputs], dim=1)
            labels = torch.cat([style_labels, text_labels, audio_labels], dim=1)
            total_length = input_ids.shape[1]
            audio_start_idx = style_inputs.shape[1] + text_inputs.shape[1]
            audio_mask = torch.zeros(total_length, dtype=torch.bool)
            audio_mask[audio_start_idx:] = True

        return_dict = {
            "input_ids": input_ids,  # [C, L]
            "labels": labels,  # [C, L]
            "audio_mask": audio_mask,  # [L]
            "length": total_length,
        }

        return return_dict
173
+
174
+
175
class OmniVoiceSimpleSampleProcessor:
    """
    Handles the logic of processing a raw sample into tensors
    (masking, tokenization, etc.).
    This is a simpler version that does not include language, instructions,
    or denoising prompts.
    We do not use it for training as OmniVoiceSampleProcessor can cover this case.
    We keep it as a reference implementation for users to understand the basic logics.
    """

    def __init__(
        self,
        text_tokenizer: Any,
        num_channels: int,
        audio_mask_id: int,
        prompt_ratio_range: tuple,
        mask_ratio_range: tuple,
        drop_cond_ratio: float,
    ):
        """
        Args:
            text_tokenizer: HF-style tokenizer; called with
                ``return_tensors="pt"`` and read via ``.input_ids``.
            num_channels: Number of parallel token channels.
            audio_mask_id: Token id written into masked audio positions.
            prompt_ratio_range: (lo, hi) range of the audio-prefix prompt ratio.
            mask_ratio_range: (lo, hi) range of the per-token mask probability.
            drop_cond_ratio: Probability of dropping all conditioning.
        """
        self.text_tokenizer = text_tokenizer
        self.num_channels = num_channels
        self.audio_mask_id = audio_mask_id
        self.prompt_ratio_range = prompt_ratio_range
        self.mask_ratio_range = mask_ratio_range
        self.drop_cond_ratio = drop_cond_ratio

    def __call__(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one sample (with "label" and "audio_tokens") into
        ``{"input_ids": [C, L], "labels": [C, L], "audio_mask": [L],
        "length": int}``. Uses the global ``random`` state.
        """
        drop_cond = random.uniform(0, 1) < self.drop_cond_ratio
        mask_ratio = random.uniform(*self.mask_ratio_range)

        if drop_cond:
            # Unconditional branch: no audio prompt (text is dropped below).
            prompt_ratio = 0.0
        else:
            prompt_ratio = random.uniform(*self.prompt_ratio_range)

        # --- Text ---
        text = sample["label"]["text"]
        text_inputs = self.text_tokenizer(
            f"<|text_start|>{text}<|text_end|>", return_tensors="pt"
        ).input_ids.repeat(self.num_channels, 1)
        text_labels = torch.full(text_inputs.shape, -100)  # Text does not compute loss

        # --- Audio ---
        audio_tokens = sample["audio_tokens"].long()

        # Masking Logic
        # The first `prompt_length` audio tokens act as the (unmasked) prompt.
        prompt_length = int(audio_tokens.shape[1] * prompt_ratio)
        audio_inputs = audio_tokens.clone()
        audio_labels = audio_tokens.clone()

        # Apply masking
        # maskable_region is only used for its shape when drawing the mask.
        maskable_region = audio_tokens[:, prompt_length:]
        token_mask = torch.rand(maskable_region.shape) < mask_ratio
        audio_inputs[:, prompt_length:][token_mask] = self.audio_mask_id
        audio_labels[:, prompt_length:][
            ~token_mask
        ] = -100  # Only compute loss on masked tokens

        if not drop_cond:
            # No loss on prompt region
            audio_labels[:, :prompt_length] = -100

        # --- Concatenation ---
        if drop_cond:
            # Unconditional sample: the sequence is audio only.
            input_ids = audio_inputs
            labels = audio_labels
            total_length = input_ids.shape[1]
            audio_mask = torch.ones(total_length, dtype=torch.bool)
        else:
            input_ids = torch.cat([text_inputs, audio_inputs], dim=1)
            labels = torch.cat([text_labels, audio_labels], dim=1)
            total_length = input_ids.shape[1]
            audio_start_idx = text_inputs.shape[1]
            audio_mask = torch.zeros(total_length, dtype=torch.bool)
            audio_mask[audio_start_idx:] = True

        return_dict = {
            "input_ids": input_ids,  # [C, L]
            "labels": labels,  # [C, L]
            "audio_mask": audio_mask,  # [L]
            "length": total_length,
        }

        return return_dict
omnivoice/eval/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
import warnings

# NOTE(review): this filter silences ALL UserWarnings process-wide, not only
# the zhconv ones it targets (which are irrelevant to WER calculation);
# scope it with `module="zhconv"` if other warnings should remain visible.
warnings.filterwarnings("ignore", category=UserWarning)
omnivoice/eval/models/ecapa_tdnn_wavlm.py ADDED
@@ -0,0 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ import os
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+
24
+
25
class ECAPA_TDNN_WAVLM(nn.Module):
    """ECAPA-TDNN speaker-embedding network on top of WavLM SSL features.

    A WavLM model (loaded via torch hub, from s3prl or a local checkpoint)
    turns raw waveforms into per-layer hidden states; a learned
    softmax-weighted sum of those layers is instance-normalized and fed to an
    ECAPA-TDNN backbone (Conv1d stem, three SE-Res2Blocks, attentive stats
    pooling) that outputs a fixed-size speaker embedding.
    """

    def __init__(
        self,
        feat_dim=80,
        channels=512,
        emb_dim=192,
        global_context_att=False,
        sr=16000,
        ssl_model_path=None,
    ):
        # feat_dim: dimension of the (weighted) SSL feature fed to layer1.
        # channels: width of the TDNN layers.
        # emb_dim: dimension of the output speaker embedding.
        # global_context_att: use global context stats in attentive pooling.
        # sr: expected input sample rate in Hz (used for the probe in
        #     get_feat_num).
        # ssl_model_path: directory containing a local `wavlm_large.pt`
        #     checkpoint; when None, the model is fetched from the s3prl hub.
        super().__init__()
        self.sr = sr

        if ssl_model_path is None:
            self.feature_extract = torch.hub.load("s3prl/s3prl", "wavlm_large")
        else:
            self.feature_extract = torch.hub.load(
                os.path.dirname(ssl_model_path),
                "wavlm_local",
                source="local",
                ckpt=os.path.join(ssl_model_path, "wavlm_large.pt"),
            )

        # Disable the `fp32_attention` flag on layers 23 and 11 of a 24-layer
        # encoder when the attribute exists.
        # NOTE(review): presumably for dtype consistency / speed under mixed
        # precision — confirm against the s3prl WavLM implementation.
        if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(
            self.feature_extract.model.encoder.layers[23].self_attn,
            "fp32_attention",
        ):
            self.feature_extract.model.encoder.layers[
                23
            ].self_attn.fp32_attention = False
        if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(
            self.feature_extract.model.encoder.layers[11].self_attn,
            "fp32_attention",
        ):
            self.feature_extract.model.encoder.layers[
                11
            ].self_attn.fp32_attention = False

        # One learnable weight per SSL hidden-state layer; softmax-normalized
        # in get_feat to form a convex combination of layers.
        self.feat_num = self.get_feat_num()
        self.feature_weight = nn.Parameter(torch.zeros(self.feat_num))

        self.instance_norm = nn.InstanceNorm1d(feat_dim)
        # self.channels = [channels] * 4 + [channels * 3]
        self.channels = [channels] * 4 + [1536]

        self.layer1 = Conv1dReluBn(feat_dim, self.channels[0], kernel_size=5, padding=2)
        self.layer2 = SE_Res2Block(
            self.channels[0],
            self.channels[1],
            kernel_size=3,
            stride=1,
            padding=2,
            dilation=2,
            scale=8,
            se_bottleneck_dim=128,
        )
        self.layer3 = SE_Res2Block(
            self.channels[1],
            self.channels[2],
            kernel_size=3,
            stride=1,
            padding=3,
            dilation=3,
            scale=8,
            se_bottleneck_dim=128,
        )
        self.layer4 = SE_Res2Block(
            self.channels[2],
            self.channels[3],
            kernel_size=3,
            stride=1,
            padding=4,
            dilation=4,
            scale=8,
            se_bottleneck_dim=128,
        )

        # self.conv = nn.Conv1d(self.channels[-1], self.channels[-1], kernel_size=1)
        # The 1x1 conv takes the concatenation of layer2/3/4 outputs.
        cat_channels = channels * 3
        self.conv = nn.Conv1d(cat_channels, self.channels[-1], kernel_size=1)
        self.pooling = AttentiveStatsPool(
            self.channels[-1],
            attention_channels=128,
            global_context_att=global_context_att,
        )
        self.bn = nn.BatchNorm1d(self.channels[-1] * 2)
        self.linear = nn.Linear(self.channels[-1] * 2, emb_dim)

    def get_feat_num(self):
        """Probe the SSL model with 1 s of random audio and return the number
        of hidden-state tensors it produces (1 when a single tensor)."""
        self.feature_extract.eval()
        wav = [torch.randn(self.sr).to(next(self.feature_extract.parameters()).device)]
        with torch.no_grad():
            features = self.feature_extract(wav)
        select_feature = features["hidden_states"]
        if isinstance(select_feature, (list, tuple)):
            return len(select_feature)
        else:
            return 1

    def get_feat(self, x):
        """Return softmax-weighted, instance-normalized SSL features.

        Args:
            x: batch of waveforms (iterable of per-utterance tensors).
        Returns:
            Feature tensor of shape (batch, feat_dim, frames) — assuming the
            SSL hidden states are (batch, frames, feat_dim); confirm.
        """
        # The SSL extractor itself is not fine-tuned (no gradients).
        with torch.no_grad():
            x = self.feature_extract([sample for sample in x])

        x = x["hidden_states"]
        if isinstance(x, (list, tuple)):
            x = torch.stack(x, dim=0)
        else:
            x = x.unsqueeze(0)
        # Softmax over the layer axis -> convex combination of layers.
        norm_weights = (
            F.softmax(self.feature_weight, dim=-1)
            .unsqueeze(-1)
            .unsqueeze(-1)
            .unsqueeze(-1)
        )
        x = (norm_weights * x).sum(dim=0)
        # NOTE(review): the +1e-6 presumably avoids exactly-zero inputs to
        # InstanceNorm — confirm.
        x = torch.transpose(x, 1, 2) + 1e-6

        x = self.instance_norm(x)
        return x

    def forward(self, x):
        """Compute speaker embeddings (batch, emb_dim) for waveforms ``x``."""
        x = self.get_feat(x)

        out1 = self.layer1(x)
        out2 = self.layer2(out1)
        out3 = self.layer3(out2)
        out4 = self.layer4(out3)

        # Multi-layer feature aggregation: concat the SE-Res2Block outputs.
        out = torch.cat([out2, out3, out4], dim=1)
        out = F.relu(self.conv(out))
        out = self.bn(self.pooling(out))
        out = self.linear(out)

        return out
159
+
160
+
161
+ # part of the code is borrowed from https://github.com/lawlict/ECAPA-TDNN
162
+
163
+ """ Res2Conv1d + BatchNorm1d + ReLU
164
+ """
165
+
166
+
167
class Res2Conv1dReluBn(nn.Module):
    """Res2Net-style multi-scale Conv1d stack (conv -> ReLU -> BN per split).

    The input channels are split into ``scale`` groups of equal width; each
    group (except, for scale > 1, the last one which passes through
    unchanged) is convolved after being summed with the previous group's
    output. Requires in_channels == out_channels == ``channels``.
    """

    def __init__(
        self,
        channels,
        kernel_size=1,
        stride=1,
        padding=0,
        dilation=1,
        bias=True,
        scale=4,
    ):
        super().__init__()
        assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
        self.scale = scale
        self.width = channels // scale
        self.nums = scale if scale == 1 else scale - 1

        self.convs = nn.ModuleList(
            nn.Conv1d(
                self.width,
                self.width,
                kernel_size,
                stride,
                padding,
                dilation,
                bias=bias,
            )
            for _ in range(self.nums)
        )
        self.bns = nn.ModuleList(
            nn.BatchNorm1d(self.width) for _ in range(self.nums)
        )

    def forward(self, x):
        chunks = torch.split(x, self.width, 1)
        pieces = []
        running = None
        for branch, (conv, bn) in enumerate(zip(self.convs, self.bns)):
            # Each branch processes its chunk plus the previous branch output.
            running = chunks[branch] if branch == 0 else running + chunks[branch]
            # Order: conv -> relu -> bn
            running = bn(F.relu(conv(running)))
            pieces.append(running)
        if self.scale != 1:
            # The last chunk bypasses convolution entirely.
            pieces.append(chunks[self.nums])
        return torch.cat(pieces, dim=1)
223
+
224
+
225
+ """ Conv1d + BatchNorm1d + ReLU
226
+ """
227
+
228
+
229
class Conv1dReluBn(nn.Module):
    """Conv1d followed by ReLU and then BatchNorm1d."""

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=1,
        stride=1,
        padding=0,
        dilation=1,
        bias=True,
    ):
        super().__init__()
        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding,
            dilation,
            bias=bias,
        )
        self.bn = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        activated = F.relu(self.conv(x))
        return self.bn(activated)
254
+
255
+
256
+ """ The SE connection of 1D case.
257
+ """
258
+
259
+
260
class SE_Connect(nn.Module):
    """Squeeze-and-Excitation gate for 1D feature maps.

    Squeezes over time (mean), passes through a bottleneck MLP with a
    sigmoid, and rescales the input channels by the resulting gate.
    """

    def __init__(self, channels, se_bottleneck_dim=128):
        super().__init__()
        self.linear1 = nn.Linear(channels, se_bottleneck_dim)
        self.linear2 = nn.Linear(se_bottleneck_dim, channels)

    def forward(self, x):
        # Global average over the time axis -> (batch, channels).
        squeezed = x.mean(dim=2)
        gate = torch.sigmoid(self.linear2(F.relu(self.linear1(squeezed))))
        # Broadcast the per-channel gate back over time.
        return x * gate.unsqueeze(2)
273
+
274
+
275
+ """ SE-Res2Block of the ECAPA-TDNN architecture.
276
+ """
277
+
278
+
279
+ # def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale):
280
+ # return nn.Sequential(
281
+ # Conv1dReluBn(channels, 512, kernel_size=1, stride=1, padding=0),
282
+ # Res2Conv1dReluBn(512, kernel_size, stride, padding, dilation, scale=scale),
283
+ # Conv1dReluBn(512, channels, kernel_size=1, stride=1, padding=0),
284
+ # SE_Connect(channels)
285
+ # )
286
+
287
+
288
class SE_Res2Block(nn.Module):
    """SE-Res2Block of the ECAPA-TDNN architecture.

    1x1 Conv -> Res2 dilated conv -> 1x1 Conv -> SE gate, plus a residual
    connection (projected through a 1x1 conv when channel counts differ).
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        dilation,
        scale,
        se_bottleneck_dim,
    ):
        super().__init__()
        self.Conv1dReluBn1 = Conv1dReluBn(
            in_channels, out_channels, kernel_size=1, stride=1, padding=0
        )
        self.Res2Conv1dReluBn = Res2Conv1dReluBn(
            out_channels, kernel_size, stride, padding, dilation, scale=scale
        )
        self.Conv1dReluBn2 = Conv1dReluBn(
            out_channels, out_channels, kernel_size=1, stride=1, padding=0
        )
        self.SE_Connect = SE_Connect(out_channels, se_bottleneck_dim)

        # 1x1 projection for the skip path when the channel count changes.
        self.shortcut = (
            nn.Conv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
            )
            if in_channels != out_channels
            else None
        )

    def forward(self, x):
        identity = x if self.shortcut is None else self.shortcut(x)
        out = self.Conv1dReluBn1(x)
        out = self.Res2Conv1dReluBn(out)
        out = self.Conv1dReluBn2(out)
        out = self.SE_Connect(out)
        return out + identity
331
+
332
+
333
+ """ Attentive weighted mean and standard deviation pooling.
334
+ """
335
+
336
+
337
class AttentiveStatsPool(nn.Module):
    """Attentive weighted mean and standard deviation pooling.

    Computes per-frame attention weights, then returns the attention-weighted
    mean and standard deviation concatenated along the channel axis,
    producing a (batch, 2 * in_dim) output.
    """

    def __init__(self, in_dim, attention_channels=128, global_context_att=False):
        super().__init__()
        self.global_context_att = global_context_att

        # Use Conv1d with stride == 1 rather than Linear,
        # then we don't need to transpose inputs.
        if global_context_att:
            # With global context the attention input is [x; mean; std].
            self.linear1 = nn.Conv1d(
                in_dim * 3, attention_channels, kernel_size=1
            )  # equals W and b in the paper
        else:
            self.linear1 = nn.Conv1d(
                in_dim, attention_channels, kernel_size=1
            )  # equals W and b in the paper
        self.linear2 = nn.Conv1d(
            attention_channels, in_dim, kernel_size=1
        )  # equals V and k in the paper

    def forward(self, x):
        if self.global_context_att:
            ctx_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
            ctx_std = torch.sqrt(
                torch.var(x, dim=-1, keepdim=True) + 1e-10
            ).expand_as(x)
            attn_in = torch.cat((x, ctx_mean, ctx_std), dim=1)
        else:
            attn_in = x

        # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
        scores = torch.tanh(self.linear1(attn_in))
        alpha = torch.softmax(self.linear2(scores), dim=2)
        mean = (alpha * x).sum(dim=2)
        # Weighted E[x^2] - (E[x])^2; clamp guards against tiny negatives.
        residuals = (alpha * (x**2)).sum(dim=2) - mean**2
        std = torch.sqrt(residuals.clamp(min=1e-9))
        return torch.cat([mean, std], dim=1)
omnivoice/eval/models/utmos.py ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ UTMOS strong model.
20
+ Implementation from https://github.com/tarepan/SpeechMOS
21
+
22
+ """
23
+
24
+ import math
25
+ from typing import List, Optional, Tuple
26
+
27
+ import torch
28
+ import torch.nn.functional as F
29
+ from torch import Tensor, nn
30
+
31
+
32
class UTMOS22Strong(nn.Module):
    """Saeki 2022 `UTMOS strong learner` inference model
    (phoneme encoder omitted)."""

    # Fixed feature sizes of the released checkpoint.
    _SSL_DIM = 768
    _DOMAIN_DIM = 128
    _JUDGE_DIM = 128
    _RNN_DIM = 512
    _PROJ_DIM = 2048

    def __init__(self):
        """Build the SSL backbone, frozen domain/judge embeddings, BLSTM and
        projection head."""
        super().__init__()  # pyright: ignore [reportUnknownMemberType]

        concat_dim = self._SSL_DIM + self._DOMAIN_DIM + self._JUDGE_DIM

        # SSL feature extractor.
        self.wav2vec2 = Wav2Vec2Model()
        # Data-domain / judge-id embeddings: values come from the checkpoint
        # and are kept frozen at inference.
        self.domain_emb = nn.Parameter(
            data=torch.empty(1, self._DOMAIN_DIM), requires_grad=False
        )
        self.judge_emb = nn.Parameter(
            data=torch.empty(1, self._JUDGE_DIM), requires_grad=False
        )
        self.blstm = nn.LSTM(
            input_size=concat_dim,
            hidden_size=self._RNN_DIM,
            batch_first=True,
            bidirectional=True,
        )
        self.projection = nn.Sequential(
            nn.Linear(self._RNN_DIM * 2, self._PROJ_DIM),
            nn.ReLU(),
            nn.Linear(self._PROJ_DIM, 1),
        )

    def forward(self, wave: Tensor, sr: int) -> Tensor:  # pylint: disable=invalid-name
        """wave-to-score :: (B, T) -> (B,)"""
        # SSL feature extraction :: (B, T) -> (B, Frame, Feat)
        units = self.wav2vec2(wave)
        bsz, n_frames, _ = units.size()

        # Expand the single domain/judge embedding over batch and time
        # :: (1, Feat) -> (B, Frame, Feat).
        domain = self.domain_emb.unsqueeze(1).expand(bsz, n_frames, -1)
        judge = self.judge_emb.unsqueeze(1).expand(bsz, n_frames, -1)

        # Concatenate along the feature axis, then score each frame with the
        # BLSTM followed by the MLP head :: (B, Frame, Feat) -> (B, Frame, 1).
        features = torch.cat([units, domain, judge], dim=2)
        rnn_out, _ = self.blstm(features)
        frame_scores = self.projection(rnn_out)

        # Time-average to an utterance score, then map the normalized output
        # onto the 1-5 MOS scale :: (B, Frame, 1) -> (B,).
        return frame_scores.mean(dim=1).squeeze(1) * 2 + 3
94
+
95
+
96
class Wav2Vec2Model(nn.Module):
    """Wav2Vec2 backbone: conv feature encoder + transformer context network."""

    def __init__(self):
        super().__init__()  # pyright: ignore [reportUnknownMemberType]

        conv_dim, model_dim = 512, 768
        # Seven conv layers as (channels, kernel, stride):
        # one (512, 10, 5), four (512, 3, 2), two (512, 2, 2).
        conv_spec = [(conv_dim, 10, 5)]
        conv_spec += [(conv_dim, 3, 2)] * 4
        conv_spec += [(conv_dim, 2, 2)] * 2

        self.feature_extractor = ConvFeatureExtractionModel(
            conv_layers=conv_spec
        )  # pyright: ignore [reportGeneralTypeIssues]
        self.layer_norm = nn.LayerNorm(conv_dim)
        self.post_extract_proj = nn.Linear(conv_dim, model_dim)
        self.dropout_input = nn.Dropout(0.1)
        self.encoder = TransformerEncoder(model_dim)

        # Remnant of the training-time code; kept so checkpoint keys resolve.
        self.mask_emb = nn.Parameter(torch.FloatTensor(model_dim))

    def forward(self, source: Tensor):
        """FeatureEncoder + ContextTransformer :: (B, T) -> (B, Frame, Feat)."""
        # Feature encoding :: (B, T) -> (B, Feat, Frame) -> (B, Frame, Feat)
        feats = self.feature_extractor(source).transpose(1, 2)
        feats = self.post_extract_proj(self.layer_norm(feats))

        # Context transformer over the normalized, projected features.
        return self.encoder(feats)
131
+
132
+
133
class ConvFeatureExtractionModel(nn.Module):
    """Feature Encoder: a stack of strided 1-D convolutions over the raw
    waveform.

    Args:
        conv_layers: one (out_channels, kernel_size, stride) tuple per layer.
    """

    def __init__(self, conv_layers: List[Tuple[int, int, int]]):
        super().__init__()  # pyright: ignore [reportUnknownMemberType]

        def block(
            n_in: int, n_out: int, k: int, stride: int, is_group_norm: bool = False
        ):
            # Conv -> (optional GroupNorm) -> GELU. The p=0.0 Dropout is kept
            # only to preserve the original module layout / checkpoint keys.
            if is_group_norm:
                return nn.Sequential(
                    nn.Conv1d(n_in, n_out, k, stride=stride, bias=False),
                    nn.Dropout(p=0.0),
                    # Fix: normalize over this block's own output channels
                    # (`n_out`) rather than the enclosing loop's `dim`, which
                    # previously worked only by accident of closure evaluation
                    # order (n_out == dim at the single call site).
                    nn.GroupNorm(n_out, n_out, affine=True),
                    nn.GELU(),
                )
            else:
                return nn.Sequential(
                    nn.Conv1d(n_in, n_out, k, stride=stride, bias=False),
                    nn.Dropout(p=0.0),
                    nn.GELU(),
                )

        in_d = 1
        self.conv_layers = nn.ModuleList()
        for i, params in enumerate(conv_layers):
            (dim, k, stride) = params
            # Only the first layer uses GroupNorm (wav2vec2 convention).
            self.conv_layers.append(block(in_d, dim, k, stride, is_group_norm=i == 0))
            in_d = dim

    def forward(self, series: Tensor) -> Tensor:
        """:: (B, T) -> (B, Feat, Frame)"""
        series = series.unsqueeze(1)  # add the channel dim: (B, 1, T)
        for conv in self.conv_layers:
            series = conv(series)

        return series
171
+
172
+
173
class TransformerEncoder(nn.Module):
    """Transformer context network with a convolutional relative positional
    embedding (wav2vec2-style)."""

    def build_encoder_layer(self, feat: int):
        """Build one post-LN self-attention layer with the fixed
        wav2vec2-base hyper-parameters used by the UTMOS checkpoint."""
        return TransformerSentenceEncoderLayer(
            embedding_dim=feat,
            ffn_embedding_dim=3072,
            num_attention_heads=12,
            activation_fn="gelu",
            dropout=0.1,
            attention_dropout=0.1,
            activation_dropout=0.0,
            layer_norm_first=False,
        )

    def __init__(self, feat: int):
        super().__init__()  # pyright: ignore [reportUnknownMemberType]

        # Sequence length is padded to a multiple of this before the layers run.
        self.required_seq_len_multiple = 2

        # Grouped conv over time provides positional information; SamePad
        # trims the extra trailing frame produced by the even kernel.
        self.pos_conv = nn.Sequential(
            *[
                nn.utils.weight_norm(
                    nn.Conv1d(feat, feat, kernel_size=128, padding=128 // 2, groups=16),
                    name="weight",
                    dim=2,
                ),
                SamePad(128),
                nn.GELU(),
            ]
        )
        self.layer_norm = nn.LayerNorm(feat)
        self.layers = nn.ModuleList([self.build_encoder_layer(feat) for _ in range(12)])

    def forward(self, x: Tensor) -> Tensor:
        """:: (B, T, Feat) -> (B, T, Feat)"""
        # Add the conv positional embedding (computed in (B, Feat, T) layout).
        x_conv = self.pos_conv(x.transpose(1, 2)).transpose(1, 2)
        x = x + x_conv

        x = self.layer_norm(x)

        # pad to the sequence length dimension
        x, pad_length = pad_to_multiple(
            x, self.required_seq_len_multiple, dim=-2, value=0
        )
        if pad_length > 0:
            # Mask the appended frames so attention ignores them.
            padding_mask = x.new_zeros((x.size(0), x.size(1)), dtype=torch.bool)
            padding_mask[:, -pad_length:] = True
        else:
            # No padding added: pad_to_multiple(None, ...) returns (None, 0),
            # i.e. no mask is used.
            padding_mask, _ = pad_to_multiple(
                None, self.required_seq_len_multiple, dim=-1, value=True
            )

        # :: (B, T, Feat) -> (T, B, Feat)
        x = x.transpose(0, 1)
        for layer in self.layers:
            x = layer(x, padding_mask)
        # :: (T, B, Feat) -> (B, T, Feat)
        x = x.transpose(0, 1)

        # undo padding
        if pad_length > 0:
            x = x[:, :-pad_length]

        return x
239
+
240
+
241
class SamePad(nn.Module):
    """Trim the extra trailing frame left by symmetric padding of an
    even-sized convolution kernel, restoring the original length."""

    def __init__(self, kernel_size: int):
        super().__init__()  # pyright: ignore [reportUnknownMemberType]
        # Odd kernels would need no trimming; this module only handles even.
        assert kernel_size % 2 == 0, "`SamePad` now support only even kernel."

    def forward(self, x: Tensor) -> Tensor:
        """:: (B, C, T+1) -> (B, C, T) — drop the last time step."""
        trimmed = x[:, :, :-1]
        return trimmed
250
+
251
+
252
def pad_to_multiple(
    x: Optional[Tensor], multiple: int, dim: int = -1, value: float = 0
) -> Tuple[Optional[Tensor], int]:
    """Pad tensor ``x`` at the tail of dimension ``dim`` up to the next
    multiple of ``multiple``.

    Args:
        x: tensor to pad, or None (passed through unchanged).
        multiple: target divisor of the padded size.
        dim: dimension to pad, given as a negative index (the pad-offset
            computation below relies on that).
        value: fill value for the padding.

    Returns:
        ``(padded_or_original_tensor_or_None, number_of_padded_elements)``.
    """
    if x is None:
        return None, 0
    tsz = x.size(dim)
    # Exact integer arithmetic; the previous float `ceil(tsz / multiple)`
    # could lose precision for very long sequences.
    remainder = -tsz % multiple
    if remainder == 0:
        return x, 0
    # F.pad takes pads from the last dimension backwards, two entries
    # (left, right) per dimension; dimensions after `dim` get (0, 0).
    pad_offset = (0,) * (-1 - dim) * 2

    return F.pad(x, (*pad_offset, 0, remainder), value=value), remainder
266
+
267
+
268
class TransformerSentenceEncoderLayer(nn.Module):
    """Transformer Encoder Layer used in BERT/XLM style pre-trained models.

    Post-LN variant: LayerNorm is applied after each residual connection
    (``layer_norm_first`` must be False) and the FFN activation is GELU.
    """

    def __init__(
        self,
        embedding_dim: int,
        ffn_embedding_dim: int,
        num_attention_heads: int,
        activation_fn: str,
        dropout: float,
        attention_dropout: float,
        activation_dropout: float,
        layer_norm_first: bool,
    ) -> None:
        super().__init__()  # pyright: ignore [reportUnknownMemberType]

        # Only the configuration used by the UTMOS checkpoint is supported.
        assert layer_norm_first is False, "`layer_norm_first` is fixed to `False`"
        assert activation_fn == "gelu", "`activation_fn` is fixed to `gelu`"

        feat = embedding_dim

        self.self_attn = MultiheadAttention(
            feat, num_attention_heads, attention_dropout
        )
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(activation_dropout)
        self.dropout3 = nn.Dropout(dropout)
        self.fc1 = nn.Linear(feat, ffn_embedding_dim)
        self.fc2 = nn.Linear(ffn_embedding_dim, feat)
        self.self_attn_layer_norm = nn.LayerNorm(feat)
        self.final_layer_norm = nn.LayerNorm(feat)

    def forward(self, x: Tensor, self_attn_padding_mask: Optional[Tensor]):
        """:: (T, B, Feat) -> (T, B, Feat)"""
        # Res[Attn-Do]-LN
        residual = x
        x = self.self_attn(x, x, x, self_attn_padding_mask)
        x = self.dropout1(x)
        x = residual + x
        x = self.self_attn_layer_norm(x)

        # Res[SegFC-GELU-Do-SegFC-Do]-LN
        residual = x
        x = F.gelu(self.fc1(x))  # pyright: ignore [reportUnknownMemberType]
        x = self.dropout2(x)
        x = self.fc2(x)
        x = self.dropout3(x)
        x = residual + x
        x = self.final_layer_norm(x)

        return x
318
+
319
+
320
class MultiheadAttention(nn.Module):
    """Multi-headed attention built on separate Q/K/V projection weights,
    delegating the actual computation to the functional fused kernel."""

    def __init__(self, embed_dim: int, num_heads: int, dropout: float):
        super().__init__()  # pyright: ignore [reportUnknownMemberType]

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.p_dropout = dropout
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=True)
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=True)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=True)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)

    def forward(
        self,
        query: Tensor,
        key: Tensor,
        value: Tensor,
        key_padding_mask: Optional[Tensor],
    ) -> Tensor:
        """
        Args:
            query :: (T, B, Feat)
            key_padding_mask :: (B, src_len) - mask to exclude keys that are pads
            , where padding elements are indicated by 1s.
        """
        # The fused kernel wants one packed bias vector even with separate
        # projection weights.
        packed_bias = torch.cat(
            (self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)
        )
        mask = key_padding_mask.bool() if key_padding_mask is not None else None

        # `use_separate_proj_weight=True` means `in_proj_weight` is ignored,
        # so an empty placeholder tensor is passed; `training=False` disables
        # attention dropout at inference.
        attn_out, _ = F.multi_head_attention_forward(
            query=query,
            key=key,
            value=value,
            embed_dim_to_check=self.embed_dim,
            num_heads=self.num_heads,
            in_proj_weight=torch.empty([0]),
            in_proj_bias=packed_bias,
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=self.p_dropout,
            out_proj_weight=self.out_proj.weight,
            out_proj_bias=self.out_proj.bias,
            training=False,
            key_padding_mask=mask,
            need_weights=False,
            use_separate_proj_weight=True,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
        )
        return attn_out
omnivoice/eval/mos/utmos.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ Calculate UTMOS score with automatic Mean Opinion Score (MOS) prediction system
20
+ """
21
+ import argparse
22
+ import logging
23
+ import multiprocessing as mp
24
+ import os
25
+ import sys
26
+ import traceback
27
+ import warnings
28
+ from concurrent.futures import ProcessPoolExecutor, as_completed
29
+
30
+ import numpy as np
31
+ import torch
32
+ from tqdm import tqdm
33
+
34
+ from omnivoice.eval.models.utmos import UTMOS22Strong
35
+ from omnivoice.eval.utils import load_waveform
36
+ from omnivoice.utils.data_utils import read_test_list
37
+
38
+ warnings.filterwarnings("ignore")
39
+
40
+ # Global variables for workers
41
+ worker_model = None
42
+ worker_device = None
43
+ worker_sr = 16000
44
+
45
+
46
def get_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser for the UTMOS evaluation script."""
    parser = argparse.ArgumentParser(
        description="Calculate UTMOS score using UTMOS22Strong model."
    )
    parser.add_argument(
        "--wav-path",
        type=str,
        required=True,
        help="Path to the directory containing evaluated speech files.",
    )
    parser.add_argument(
        "--test-list",
        type=str,
        required=True,
        help="Path to the JSONL test list. Each line is a JSON object "
        "with fields: id, text, ref_audio, ref_text, language_id, language_name.",
    )
    parser.add_argument(
        "--model-dir",
        type=str,
        required=True,
        help="Local path of our evaluation model repository."
        "Download from https://huggingface.co/k2-fsa/TTS_eval_models."
        "Will use 'tts_eval_models/mos/utmos22_strong_step7459_v1.pt'"
        " in this script",
    )
    parser.add_argument(
        "--extension",
        type=str,
        default="wav",
        help="Extension of the speech files. Default: wav",
    )
    parser.add_argument(
        "--decode-path",
        type=str,
        default=None,
        help="Path to the output file where UTMOS information will be saved. "
        "If not provided, results are only printed to console.",
    )
    parser.add_argument(
        "--nj-per-gpu",
        type=int,
        default=1,
        help="Number of worker processes to spawn per GPU.",
    )
    return parser
92
+
93
+
94
def get_device(rank: int = 0) -> torch.device:
    """Bind the current process to CUDA device ``rank`` and return it.

    Raises:
        AssertionError: if no CUDA device is available.
    """
    assert torch.cuda.is_available(), "CUDA is required but not available."
    chosen = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(rank)
    return chosen
99
+
100
+
101
def worker_init(
    rank_queue,
    model_path,
):
    """Initialize worker process with model and device.

    Pops a GPU rank from the shared queue, loads the UTMOS22Strong checkpoint
    and populates the module-level globals used by ``run_utmos_worker``.

    Args:
        rank_queue: multiprocessing queue of pre-assigned GPU ranks; a falsy
            value means rank -1 is used.
        model_path: path to the UTMOS22Strong state-dict checkpoint (.pt).
    """
    global worker_model, worker_device, worker_sr

    # Limit CPU threads per worker
    torch.set_num_threads(2)

    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] [Worker %(process)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)

    # Each worker takes its own GPU rank from the shared queue.
    rank = rank_queue.get() if rank_queue else -1

    worker_device = get_device(rank)
    worker_sr = 16000

    logging.debug(f"Initializing UTMOS worker on {worker_device}")

    # Initialize Model
    worker_model = UTMOS22Strong()
    try:
        # Load weights to CPU first, then move to device
        state_dict = torch.load(model_path, map_location="cpu")
        worker_model.load_state_dict(state_dict)
    except Exception as e:
        logging.error(f"Failed to load model from {model_path}: {e}")
        raise

    worker_model.to(worker_device)
    worker_model.eval()
133
+
134
+
135
@torch.no_grad()
def run_utmos_worker(file_idx, wav_path, language_name):
    """Score one audio file with the worker's UTMOS model.

    Returns:
        ``(file_idx, wav_path, language_name, result, status)`` where
        ``result`` is the UTMOS score on success or an error message on
        failure, and ``status`` is "success" or "error".
    """
    try:
        if not os.path.exists(wav_path):
            return file_idx, wav_path, language_name, f"File not found: {wav_path}", "error"

        # Load and preprocess waveform (resampled to the worker's 16 kHz rate).
        speech = load_waveform(wav_path, worker_sr, device=worker_device)

        # Compute score
        # UTMOS expects input shape (Batch, Time)
        score = worker_model(speech.unsqueeze(0), worker_sr)

        return file_idx, wav_path, language_name, score.item(), "success"

    except Exception as e:
        # Return the error instead of raising so the pool keeps running.
        error_detail = (
            f"Error processing {wav_path}: {str(e)}\n"
            f"Traceback:\n{traceback.format_exc()}"
        )
        return file_idx, wav_path, language_name, error_detail, "error"
157
+
158
+
159
def main():
    """Entry point: score every file in the test list with UTMOS in parallel
    across GPUs, then report per-language and overall averages."""
    parser = get_parser()
    args = parser.parse_args()

    # Main process thread setting
    torch.set_num_threads(2)

    # "spawn" is required so each worker initializes CUDA independently.
    mp.set_start_method("spawn", force=True)

    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)

    # Validate inputs
    if not os.path.isdir(args.wav_path):
        logging.error(f"Invalid directory: {args.wav_path}")
        sys.exit(1)

    model_path = os.path.join(args.model_dir, "mos/utmos22_strong_step7459_v1.pt")
    if not os.path.exists(model_path):
        logging.error(f"Model file not found at {model_path}")
        sys.exit(1)

    # Scan directory for files
    logging.info(f"Calculating UTMOS for {args.wav_path}")

    # Build (wav_path, language_name) pairs from the JSONL test list.
    wav_files = []
    try:
        samples = read_test_list(args.test_list)
        for s in samples:
            language_name = s.get("language_name") or "unknown"
            eval_wav_path = os.path.join(args.wav_path, f"{s['id']}.{args.extension}")
            wav_files.append((eval_wav_path, language_name))
    except Exception as e:
        raise ValueError(f"Error reading test list {args.test_list}: {e}")

    # Setup Parallel Processing
    num_gpus = torch.cuda.device_count()
    assert num_gpus > 0, "No GPU found. GPU is required."
    total_procs = num_gpus * args.nj_per_gpu

    logging.info(
        f"Starting evaluation with {total_procs} processes on {num_gpus} GPUs."
    )

    # Each worker pops one rank, so ranks are spread evenly over the GPUs.
    manager = mp.Manager()
    rank_queue = manager.Queue()

    for rank in list(range(num_gpus)) * args.nj_per_gpu:
        rank_queue.put(rank)

    scores = []

    # NOTE(review): fout is not closed on the sys.exit(1) error path below;
    # process exit releases it, but a try/finally would be cleaner.
    fout = None
    if args.decode_path:
        os.makedirs(os.path.dirname(args.decode_path), exist_ok=True)
        fout = open(args.decode_path, "w", encoding="utf8")
        logging.info(f"Saving detailed UTMOS results to: {args.decode_path}")
        fout.write("Name\tUTMOS\n")

    try:
        with ProcessPoolExecutor(
            max_workers=total_procs,
            initializer=worker_init,
            initargs=(
                rank_queue,
                model_path,
            ),
        ) as executor:
            futures = []
            for i, (wav_path, language_name) in enumerate(wav_files):
                futures.append(
                    executor.submit(run_utmos_worker, i, wav_path, language_name)
                )

            pbar = tqdm(
                as_completed(futures), total=len(wav_files), desc="Evaluating UTMOS"
            )
            # NOTE(review): lang_stats is bound inside this try-block; code
            # after it relies on the except-branch always exiting the process.
            lang_stats = {}
            for future in pbar:
                idx, path, language_name, result, status = future.result()
                if status == "success":
                    if language_name not in lang_stats:
                        lang_stats[language_name] = []
                    lang_stats[language_name].append(result)
                    scores.append(result)
                    if fout:
                        if language_name == "unknown":
                            fout.write(f"{os.path.basename(path)}\t{result:.2f}\n")
                        else:
                            fout.write(
                                f"{language_name}\t{os.path.basename(path)}\t{result:.2f}\n"
                            )
                else:
                    pbar.write(f"!!! FAILED [File {idx}]: {path} | {result}")

    except (Exception, KeyboardInterrupt) as e:
        logging.critical(
            f"An unrecoverable error occurred: {e}. Terminating all processes."
        )
        detailed_error_info = traceback.format_exc()
        logging.error(f"--- DETAILED TRACEBACK ---\n{detailed_error_info}")
        sys.exit(1)

    print("-" * 50)

    # Per-language breakdown plus macro-average, only when multilingual.
    if len(lang_stats) > 1:
        lang_scores = []
        for lang in sorted(lang_stats.keys()):
            l_scores = lang_stats[lang]
            l_avg = np.mean(l_scores)
            lang_scores.append(l_scores)
            l_count = len(l_scores)
            logging.info(f"[{lang}] UTMOS score: {l_avg:.3f} ({l_count} samples)")
            if fout:
                fout.write(f"[{lang}] UTMOS: {l_avg:.3f} ({l_count} samples)\n")
        logging.info(
            f"Macro-average UTMOS over {len(lang_stats)} languages: "
            f"{np.mean([np.mean(ls) for ls in lang_scores]):.3f}"
        )
        if fout:
            fout.write(
                f"\nMacro-average UTMOS over {len(lang_stats)} languages: "
                f"{np.mean([np.mean(ls) for ls in lang_scores]):.3f}\n"
            )

    # Overall (micro) average across every successfully scored file.
    if scores:
        avg_score = np.mean(scores)
        logging.info(f"Processed {len(scores)}/{len(wav_files)} files.")
        logging.info(f"UTMOS score: {avg_score:.2f}")
        if fout:
            fout.write(f"\nAverage UTMOS: {avg_score:.2f}\n")
    else:
        logging.error("No valid scores computed.")
    print("-" * 50)

    if fout:
        fout.close()


if __name__ == "__main__":
    main()
omnivoice/eval/speaker_similarity/sim.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ Computes speaker similarity (SIM-o) using a WavLM-based
20
+ ECAPA-TDNN speaker verification model.
21
+ """
22
+ import argparse
23
+ import logging
24
+ import multiprocessing as mp
25
+ import os
26
+ import sys
27
+ import traceback
28
+ import warnings
29
+ from concurrent.futures import ProcessPoolExecutor, as_completed
30
+
31
+ import numpy as np
32
+ import torch
33
+ from tqdm import tqdm
34
+
35
+ from omnivoice.eval.models.ecapa_tdnn_wavlm import ECAPA_TDNN_WAVLM
36
+ from omnivoice.eval.utils import load_waveform
37
+ from omnivoice.utils.data_utils import read_test_list
38
+
39
+ warnings.filterwarnings("ignore")
40
+
41
+ # Global variables for workers
42
+ worker_model = None
43
+ worker_device = None
44
+ worker_sr = 16000
45
+
46
+
47
def get_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser for the SIM-o evaluation script."""
    parser = argparse.ArgumentParser(
        description="Calculate speaker similarity (SIM-o) score."
    )
    parser.add_argument(
        "--wav-path",
        type=str,
        required=True,
        help="Path to the directory containing evaluated speech files.",
    )
    parser.add_argument(
        "--test-list",
        type=str,
        required=True,
        help="Path to the JSONL test list. Each line is a JSON object "
        "with fields: id, text, ref_audio, ref_text, language_id, language_name.",
    )
    parser.add_argument(
        "--model-dir",
        type=str,
        required=True,
        help="Local path of our evaluation model repository."
        "Download from https://huggingface.co/k2-fsa/TTS_eval_models."
        "Will use 'tts_eval_models/speaker_similarity/wavlm_large_finetune.pth'"
        "and 'tts_eval_models/speaker_similarity/wavlm_large/' in this script",
    )
    parser.add_argument(
        "--extension",
        type=str,
        default="wav",
        help="Extension of the speech files.",
    )
    parser.add_argument(
        "--decode-path",
        type=str,
        default=None,
        help="Path to the output file where SIM-o information will be saved. "
        "If not provided, results are only printed to console.",
    )
    parser.add_argument(
        "--nj-per-gpu",
        type=int,
        default=1,
        help="Number of worker processes to spawn per GPU.",
    )
    return parser
93
+
94
+
95
def get_device(rank: int = 0) -> torch.device:
    """Bind the current process to CUDA device ``rank`` and return it.

    Raises:
        AssertionError: if no CUDA device is available.
    """
    assert torch.cuda.is_available(), "CUDA is required but not available."
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(rank)
    return device
100
+
101
+
102
def worker_init(
    rank_queue,
    sv_model_path,
    ssl_model_path,
):
    """Initialize worker process with model and device.

    Pops a GPU rank from the shared queue, builds the WavLM-based ECAPA-TDNN
    speaker-verification model and populates the module-level globals used by
    ``get_embedding`` / ``run_similarity_worker``.

    Args:
        rank_queue: multiprocessing queue of pre-assigned GPU ranks; a falsy
            value means rank -1 is used.
        sv_model_path: path to the fine-tuned speaker-verification checkpoint.
        ssl_model_path: directory of the WavLM-large SSL model.
    """
    global worker_model, worker_device, worker_sr

    # Limit CPU threads per worker.
    torch.set_num_threads(2)

    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] [Worker %(process)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)

    rank = rank_queue.get() if rank_queue else -1

    worker_device = get_device(rank)
    worker_sr = 16000

    logging.debug(f"Initializing SIM-o worker on {worker_device}")
    # Temporarily suppress INFO logs to hide verbose WavLM config
    logging.disable(logging.INFO)

    # Initialize Model
    try:
        worker_model = ECAPA_TDNN_WAVLM(
            feat_dim=1024,
            channels=512,
            emb_dim=256,
            sr=worker_sr,
            ssl_model_path=ssl_model_path,
        )
        # Load the checkpoint on CPU; strict=False tolerates extra/missing keys.
        state_dict = torch.load(
            sv_model_path, map_location=lambda storage, loc: storage
        )
        worker_model.load_state_dict(state_dict["model"], strict=False)
        worker_model.to(worker_device)
        worker_model.eval()
    finally:
        # Restore normal logging
        logging.disable(logging.NOTSET)
143
+
144
@torch.no_grad()
def get_embedding(wav_path: str) -> torch.Tensor:
    """Extract the speaker embedding for a single file using the worker's
    model (audio resampled to the worker's 16 kHz rate, truncated to 120 s)."""
    speech = load_waveform(wav_path, worker_sr, device=worker_device, max_seconds=120)
    return worker_model([speech])
149
+
150
+
151
def run_similarity_worker(line_idx, sample, wav_dir, extension):
    """Compute SIM-o for one (reference, synthesized) pair.

    Args:
        line_idx: index of the sample in the test list.
        sample: dict with at least "id" and "ref_audio" (optionally
            "language_name").
        wav_dir: directory containing the synthesized wavs.
        extension: filename extension of the synthesized wavs.

    Returns:
        ``(line_idx, context, result, status)``: on success ``context`` is
        ``(ref_path, eval_path, language_name)`` and ``result`` is the cosine
        similarity; on error ``context`` describes the sample and ``result``
        carries the error message.
    """
    try:
        wav_name = sample["id"]
        ref_wav_path = sample["ref_audio"]
        language_name = sample.get("language_name") or "unknown"
        eval_wav_path = os.path.join(wav_dir, f"{wav_name}.{extension}")

        # Fix: put the error message in the *result* slot and the sample in
        # the *context* slot (matching the exception branch below), so that
        # main()'s failure log no longer prints "Error: None" for missing
        # files.
        if not os.path.exists(ref_wav_path):
            return line_idx, str(sample), f"Reference not found: {ref_wav_path}", "error"
        if not os.path.exists(eval_wav_path):
            return line_idx, str(sample), f"Eval wav not found: {eval_wav_path}", "error"

        # Compute embeddings pair-wise
        ref_emb = get_embedding(ref_wav_path)
        eval_emb = get_embedding(eval_wav_path)

        # Cosine Similarity between the two speaker embeddings (SIM-o).
        similarity = torch.nn.functional.cosine_similarity(ref_emb, eval_emb, dim=-1)

        return (
            line_idx,
            (ref_wav_path, eval_wav_path, language_name),
            similarity.item(),
            "success",
        )

    except Exception as e:
        # Return the error instead of raising so the pool keeps running.
        error_detail = f"Error: {str(e)}\nTraceback:\n{traceback.format_exc()}"
        return line_idx, str(sample), error_detail, "error"
181
+
182
+
183
def main():
    """Entry point: compute SIM-o for every (reference, synthesized) pair in
    the test list in parallel across GPUs, then report per-language and
    overall averages."""
    parser = get_parser()
    args = parser.parse_args()

    # Main process thread setting
    torch.set_num_threads(2)

    # "spawn" is required so each worker initializes CUDA independently.
    mp.set_start_method("spawn", force=True)

    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)

    # Prepare paths
    sv_model_path = os.path.join(
        args.model_dir, "speaker_similarity/wavlm_large_finetune.pth"
    )
    ssl_model_path = os.path.join(args.model_dir, "speaker_similarity/wavlm_large/")

    if not os.path.exists(sv_model_path) or not os.path.exists(ssl_model_path):
        logging.error("Model files not found. Please check --model-dir.")
        sys.exit(1)

    logging.info(f"Calculating SIM-o for {args.wav_path}")
    # Read list
    samples = read_test_list(args.test_list)

    # Setup Parallel Processing
    num_gpus = torch.cuda.device_count()
    assert num_gpus > 0, "No GPU found. GPU is required."
    total_procs = num_gpus * args.nj_per_gpu

    logging.info(
        f"Starting evaluation with {total_procs} processes " f"on {num_gpus} GPUs."
    )

    # Each worker pops one rank, so ranks are spread evenly over the GPUs.
    manager = mp.Manager()
    rank_queue = manager.Queue()

    for rank in list(range(num_gpus)) * args.nj_per_gpu:
        rank_queue.put(rank)

    scores = []

    # NOTE(review): fout is not closed on the sys.exit(1) error path below;
    # process exit releases it, but a try/finally would be cleaner.
    fout = None
    if args.decode_path:
        os.makedirs(os.path.dirname(args.decode_path), exist_ok=True)
        fout = open(args.decode_path, "w", encoding="utf8")
        logging.info(f"Saving detailed SIM-o results to: {args.decode_path}")
        fout.write("Prompt-path\tEval-path\tSIM-o\n")

    try:
        with ProcessPoolExecutor(
            max_workers=total_procs,
            initializer=worker_init,
            initargs=(
                rank_queue,
                sv_model_path,
                ssl_model_path,
            ),
        ) as executor:
            futures = []
            for i, sample in enumerate(samples):
                futures.append(
                    executor.submit(
                        run_similarity_worker, i, sample, args.wav_path, args.extension
                    )
                )

            pbar = tqdm(
                as_completed(futures), total=len(samples), desc="Evaluating SIM-o"
            )

            # NOTE(review): lang_stats is bound inside this try-block; code
            # after it relies on the except-branch always exiting the process.
            lang_stats = {}

            for future in pbar:
                idx, context, result, status = future.result()
                if status == "success":
                    prompt_path, eval_path, lang = context
                    scores.append(result)

                    # Accumulate per-language
                    if lang not in lang_stats:
                        lang_stats[lang] = []
                    lang_stats[lang].append(result)

                    if fout:
                        if lang == "unknown":
                            fout.write(f"{prompt_path}\t{eval_path}\t{result:.2f}\n")
                        else:
                            # context[0]/context[1] are the same values as
                            # prompt_path/eval_path unpacked above.
                            fout.write(
                                f"{lang}\t{context[0]}\t{context[1]}\t{result:.2f}\n"
                            )
                else:
                    pbar.write(f"!!! FAILED [Line {idx}]: {context} | Error: {result}")

    except (Exception, KeyboardInterrupt) as e:
        logging.critical(
            f"An unrecoverable error occurred: {e}. " f"Terminating all processes."
        )
        detailed_error_info = traceback.format_exc()
        logging.error(f"--- DETAILED TRACEBACK ---\n{detailed_error_info}")
        sys.exit(1)

    print("-" * 50)
    # Per-language breakdown plus macro-average, only when multilingual.
    if len(lang_stats) > 1:
        lang_scores = []
        for lang in sorted(lang_stats.keys()):
            l_scores = lang_stats[lang]
            l_avg = np.mean(l_scores)
            lang_scores.append(l_scores)
            l_count = len(l_scores)
            logging.info(f"[{lang}] SIM-o score: {l_avg:.3f} ({l_count} pairs)")
            if fout:
                fout.write(f"[{lang}] SIM-o: {l_avg:.3f} ({l_count} pairs)\n")
        logging.info(
            f"Macro-average SIM-o over {len(lang_stats)} languages: "
            f"{np.mean([np.mean(ls) for ls in lang_scores]):.3f}"
        )
        if fout:
            fout.write(
                f"\nMacro-average SIM-o over {len(lang_stats)} languages: "
                f"{np.mean([np.mean(ls) for ls in lang_scores]):.3f}\n"
            )

    # Overall (micro) average across every successfully scored pair.
    if scores:
        avg_score = np.mean(scores)
        logging.info(f"Processed {len(scores)}/{len(samples)} pairs.")
        logging.info(f"SIM-o score: {avg_score:.3f}")
        if fout:
            fout.write(f"\nAverage SIM-o: {avg_score:.3f}\n")
    else:
        logging.error("No valid scores computed.")
    if fout:
        fout.close()
    print("-" * 50)


if __name__ == "__main__":
    main()
omnivoice/eval/utils.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ import logging
19
+ from typing import Optional
20
+
21
+ import librosa
22
+ import soundfile as sf
23
+ import torch
24
+
25
+
26
def load_waveform(
    fname: str,
    sample_rate: int,
    dtype: str = "float32",
    device: torch.device = torch.device("cpu"),
    return_numpy: bool = False,
    max_seconds: Optional[float] = None,
) -> torch.Tensor:
    """Read an audio file and return a mono waveform at ``sample_rate``.

    Args:
        fname: Path to the audio file.
        sample_rate: Target sample rate; the audio is resampled when the
            file's native rate differs.
        dtype: Numeric dtype passed to ``soundfile.read``.
        device: Device the returned tensor is moved to (unused when
            ``return_numpy`` is True).
        return_numpy: If True, return a NumPy array instead of a tensor.
        max_seconds: Optional cap on the audio length in seconds; longer
            audio is truncated (with a warning).

    Returns:
        1-D waveform of shape ``(num_samples,)`` — a NumPy array when
        ``return_numpy`` is True, otherwise a torch tensor on ``device``.

    Notes:
        - Stereo input is down-mixed to mono by averaging the channels.
    """
    audio, native_sr = sf.read(fname, dtype=dtype)

    # Down-mix multi-channel audio to a single channel.
    if len(audio.shape) == 2:
        audio = audio.mean(1)

    # Bring the waveform to the requested sample rate.
    if native_sr != sample_rate:
        audio = librosa.resample(audio, orig_sr=native_sr, target_sr=sample_rate)

    # Optionally cap the length to protect downstream models from OOM.
    if max_seconds is not None:
        limit = int(sample_rate * max_seconds)
        if len(audio) > limit:
            audio = audio[:limit]
            logging.warning(
                f"Wav file {fname} is longer than {max_seconds}s, "
                f"truncated to {max_seconds}s to avoid OOM."
            )

    if return_numpy:
        return audio

    return torch.from_numpy(audio).to(device)
omnivoice/eval/wer/common.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ Shared utilities for WER evaluation scripts.
20
+ """
21
+ import logging
22
+
23
+ import numpy as np
24
+ from jiwer import compute_measures
25
+
26
+
27
def process_one(hypothesis: str, truth: str, post_process, lang: str = None) -> dict:
    """Compute WER statistics for a single hypothesis/reference pair.

    Args:
        hypothesis: The transcribed text from the ASR model.
        truth: The ground-truth transcript.
        post_process: Normalization callable supplied by the caller.
            Invoked as ``post_process(text, lang)`` when ``lang`` is given,
            otherwise as ``post_process(text)``.
        lang: Optional language code forwarded to ``post_process``.

    Returns:
        dict with keys:
            - ``truth`` / ``hypo``: normalized reference / hypothesis text.
            - ``wer``: word error rate.
            - ``substitutions`` / ``deletions`` / ``insertions``: edit counts.
            - ``word_num``: word count of the normalized reference.
    """
    if lang is None:
        ref = post_process(truth)
        hyp = post_process(hypothesis)
    else:
        ref = post_process(truth, lang)
        hyp = post_process(hypothesis, lang)

    stats = compute_measures(ref, hyp)
    return {
        "truth": ref,
        "hypo": hyp,
        "wer": stats["wer"],
        "substitutions": stats["substitutions"],
        "deletions": stats["deletions"],
        "insertions": stats["insertions"],
        "word_num": len(ref.split(" ")),
    }
66
+
67
+
68
def log_metrics(fout, prefix, i_list, d_list, s_list, w_total, ndigits=2):
    """Log weighted WER for one subset of results and return the WER (%).

    Args:
        fout: Open file handle for the decode log, or a falsy value to
            skip file output.
        prefix: Label printed in front of each line (e.g. "[en]").
        i_list / d_list / s_list: Per-utterance insertion / deletion /
            substitution counts.
        w_total: Total reference word count for the subset.
        ndigits: Rounding precision for the WER percentage.
    """
    total_ins = np.sum(i_list)
    total_del = np.sum(d_list)
    total_sub = np.sum(s_list)
    # Weighted WER: total edits over total reference words.
    wer_pct = round((total_sub + total_del + total_ins) / w_total * 100, ndigits)

    logging.info(f"{prefix} WER: {wer_pct}%")
    logging.info(
        f"{prefix} Errors: {total_ins} ins, {total_del} del, "
        f"{total_sub} sub / {w_total} words"
    )
    if fout:
        fout.write(f"{prefix} WER: {wer_pct}%\n")
        fout.write(
            f"{prefix} Errors: {total_ins} ins, {total_del} del, "
            f"{total_sub} sub / {w_total} words\n"
        )
    return wer_pct
omnivoice/eval/wer/fleurs.py ADDED
@@ -0,0 +1,517 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Computes word error rate (WER) for FLEURS multilingual evaluation.
19
+
20
+ Uses omnilingual-asr for ASR transcription across 100+ languages.
21
+ Requires a separate environment with ``omnilingual_asr`` installed.
22
+
23
+ Usage:
24
+ python3 omnivoice/eval/wer/fleurs.py \\
25
+ --wav-path results/fleurs \\
26
+ --test-list test.jsonl \\
27
+ --decode-path results/fleurs.wer.log \\
28
+ --model-card omniASR_LLM_Unlimited_7B_v2 \\
29
+ --chunk-size 100 --batch-size 50
30
+ """
31
+ import argparse
32
+ import logging
33
+ import multiprocessing as mp
34
+ import os
35
+ import re
36
+ import sys
37
+ import traceback
38
+ import types
39
+ from collections import defaultdict
40
+ from concurrent.futures import ProcessPoolExecutor, as_completed
41
+ from pathlib import Path
42
+ from typing import List, Union
43
+
44
+ import numpy as np
45
+ import torch
46
+ from tqdm import tqdm
47
+
48
+ try:
49
+ from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline
50
+ from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs
51
+ except ImportError:
52
+ logging.error("Please install omnilingual_asr first.")
53
+ exit(1)
54
+
55
+ # omnilingual-asr may pull a transformers version that lacks
56
+ # HiggsAudioV2TokenizerModel. Pre-register stubs to bypass
57
+ # omnivoice/__init__.py heavy imports.
58
+ if "omnivoice" not in sys.modules:
59
+ _root = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", ".."))
60
+ for _name in (
61
+ "omnivoice",
62
+ "omnivoice.eval",
63
+ "omnivoice.eval.wer",
64
+ "omnivoice.utils",
65
+ ):
66
+ if _name not in sys.modules:
67
+ _m = types.ModuleType(_name)
68
+ _m.__path__ = [os.path.join(_root, *_name.split(".")[1:])]
69
+ _m.__package__ = _name
70
+ sys.modules[_name] = _m
71
+
72
+ from omnivoice.eval.wer.common import log_metrics, process_one
73
+ from omnivoice.eval.wer.text_norm_omni import text_normalize
74
+ from omnivoice.utils.data_utils import read_test_list
75
+
76
+ # --- Global variables for worker processes ---
77
+ worker_pipe = None
78
+ worker_device = None
79
+
80
+
81
# fix mismatched language codes between OmniVoice and the Omnilingual-ASR
# model: keys are OmniVoice language ids whose code differs from the ASR
# model's convention; values are the ids to look up instead.
rename = {
    "et": "ekk",
    "ms": "zsm",
    "sw": "swh",
    "npi": "nep",
}
88
+
89
+
90
def read_language_mapping_from_tsv(
    mapping_path: Path,
) -> dict[str, Union[str, List[str]]]:
    """Build an ISO 639-3 -> mixed-id mapping from a 4-column TSV file.

    The file must contain a header row followed by rows of
    ``mixed_id<TAB>language_name<TAB>iso_639_3_id<TAB>duration``.
    """
    mapping = {}
    with open(mapping_path, "r", encoding="utf-8") as fin:
        fin.readline()  # discard the header row
        for row in fin:
            mixed_id, _language_name, iso_id, _duration = row.strip().split("\t")
            mapping[iso_id] = mixed_id
    return mapping
101
+
102
+
103
# Repo-level TSV mapping ISO 639-3 codes to OmniVoice "mixed" language ids
# (path resolved relative to this file).
iso_639_3_id_to_mixed_id = read_language_mapping_from_tsv(
    Path(f"{os.path.dirname(__file__)}/../../../docs/lang_id_name_map.tsv")
)

# Map each OmniVoice mixed id (or bare ISO code, when no mixed id is known)
# to the omnilingual-asr language token, e.g. "eng_Latn".
mixed_id_to_omnilingual_asr_lang = {}

for lang in supported_langs:
    # Skip the "cmn_Hant" token so only one Mandarin entry is kept
    # (presumably the Simplified-script one — TODO confirm intent).
    if lang in ("cmn_Hant",):
        continue
    iso_639_3_lang_code = lang.split("_")[0]
    if iso_639_3_lang_code in iso_639_3_id_to_mixed_id:
        mixed_id = iso_639_3_id_to_mixed_id[iso_639_3_lang_code]
        mixed_id_to_omnilingual_asr_lang[mixed_id] = lang
    else:
        # No mixed id known: fall back to keying by the bare ISO code.
        mixed_id_to_omnilingual_asr_lang[iso_639_3_lang_code] = lang
118
+
119
+
120
def clean_cjk_spaces(text):
    """Strip spaces adjacent to Chinese/Japanese characters.

    Spaces between non-CJK tokens (English words, Korean, numbers) are
    preserved but collapsed to single spaces.
    """
    # Unicode ranges covered:
    #   \u4e00-\u9fff  CJK Unified Ideographs (Chinese)
    #   \u3040-\u309f  Hiragana (Japanese)
    #   \u30a0-\u30ff  Katakana (Japanese)
    #   \u3000-\u303f  CJK Symbols and Punctuation
    cjk = r"\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f"

    # Drop whitespace sitting between two CJK characters: "我 爱 你" -> "我爱你".
    text = re.sub(f"([{cjk}])\\s+([{cjk}])", r"\1\2", text)

    # Drop whitespace on either side of a CJK character that borders
    # non-CJK text: "我 爱 you" -> "我爱you".
    text = re.sub(f"([{cjk}])\\s+", r"\1", text)
    text = re.sub(f"\\s+([{cjk}])", r"\1", text)

    # Collapse remaining whitespace runs (e.g. between English words).
    return re.sub(r"\s+", " ", text).strip()
146
+
147
+
148
def get_parser():
    """Build the CLI argument parser for FLEURS WER evaluation.

    Returns:
        argparse.ArgumentParser: Parser exposing wav/decode paths, the
        OmniASR model card, test list, language filter, and worker/batch
        options.
    """
    parser = argparse.ArgumentParser(
        # Fixed description: this script decodes with OmniASR; the old text
        # ("Computes WER with Whisper.") was copy-pasted from another script.
        description="Computes WER with OmniASR.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    parser.add_argument(
        "--wav-path",
        type=str,
        required=True,
        help="Path to the directory containing speech files.",
    )

    parser.add_argument(
        "--extension",
        type=str,
        default="wav",
        help="Extension of the speech files. Default: wav",
    )

    parser.add_argument(
        "--decode-path",
        type=str,
        default=None,
        help="Path to the output file where WER information will be saved. "
        "If not provided, results are only printed to console.",
    )
    parser.add_argument(
        "--model-card",
        type=str,
        default="omniASR_LLM_7B",
        help="Model card name for OmniASR (e.g., omniASR_LLM_7B) or local path.",
    )
    parser.add_argument(
        "--test-list",
        type=str,
        default="test.jsonl",
        help="path of the JSONL test list. Each line is a JSON object "
        "with fields: id, text, ref_audio, ref_text, language_id, language_name.",
    )
    parser.add_argument(
        "--lang",
        type=str,
        default=None,
        help="""Language code to evaluate (e.g., 'en' for English, 'zh' for Chinese).
    If not provided, the script will evaluate all languages found in the test list.
    If specified, only samples of the given language will be evaluated.
    """,
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=8,
        help="Batch size for decoding with the Hugging Face pipeline.",
    )
    parser.add_argument(
        "--nj-per-gpu", type=int, default=1, help="Number of workers per GPU."
    )
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=300,
        help="Number of samples per task chunk sent to workers.",
    )
    return parser
213
+
214
+
215
def load_omni_model(model_card, device):
    """Instantiate the OmniASR inference pipeline.

    Returns the pipeline on success, or None (after logging the error)
    when construction fails.
    """
    logging.info(f"Loading OmniASR model ({model_card}) on {device}...")
    try:
        return ASRInferencePipeline(model_card=model_card, device=str(device))
    except Exception as e:
        logging.error(f"Failed to load OmniASR pipeline: {e}")
        return None
223
+
224
+
225
def process_init(rank_queue, model_card):
    """Worker initializer: claim a GPU rank and load the OmniASR pipeline.

    The pipeline and device are stored in module globals so that
    ``run_eval_worker`` can reuse them within the same process.
    """
    global worker_pipe, worker_device

    # Keep per-worker CPU threading bounded; many workers share the host.
    torch.set_num_threads(2)

    try:
        gpu_rank = rank_queue.get(timeout=10)
    except Exception:
        raise RuntimeError("Failed to get GPU rank from queue.")

    assert torch.cuda.is_available(), "CUDA is required but not available."
    worker_device = torch.device(f"cuda:{gpu_rank}")
    torch.cuda.set_device(gpu_rank)

    logging.info(f"Initializing worker on device: {worker_device}")

    try:
        worker_pipe = load_omni_model(model_card, worker_device)
        if worker_pipe is None:
            # A None pipeline is treated as a fatal init failure.
            raise RuntimeError("Model loading failed.")
    except Exception as e:
        logging.critical(f"Failed to load model on {worker_device}: {e}")
        raise e
253
+
254
+
255
def post_process(text: str, lang: str) -> str:
    """Normalize text into space-separated characters for scoring.

    Args:
        text: The input text to be processed.
        lang: Language token; only the first three characters (the ISO
            639-3 code, e.g. 'eng' from 'eng_Latn') are used.

    Returns:
        Normalized text in which every character is its own token and
        original word boundaries are marked with '|'.
    """
    iso3 = lang[:3]
    normalized = text_normalize(
        text,
        iso_code=iso3,
        lower_case=True,
        remove_numbers=False,
        remove_brackets=False,
    )
    normalized = clean_cjk_spaces(normalized)
    # Character-level tokenization: keep word boundaries visible as '|'.
    return " ".join(normalized.replace(" ", "|"))
277
+
278
+
279
def run_eval_worker(data_chunk, language, batch_size):
    """Transcribe one chunk with the worker's OmniASR pipeline and score it.

    Relies on the module-global ``worker_pipe`` populated by
    ``process_init``. Returns a list of per-utterance metric dicts, or an
    empty list when the pipeline is missing or an error occurs.
    """
    global worker_pipe
    if worker_pipe is None:
        logging.error("Worker pipeline is not initialized!")
        return []

    chunk_metrics = []
    try:
        wav_files = [item["wav_path"] for item in data_chunk]

        # Per-item language codes, falling back to the chunk-level language
        # when a sample does not carry its own.
        langs = [item.get("lang_id", language) for item in data_chunk]

        # The pipeline returns one transcription string per input file.
        hyps = worker_pipe.transcribe(
            wav_files, lang=langs, batch_size=batch_size
        )

        for idx, hyp in enumerate(hyps):
            item = data_chunk[idx]
            metrics = process_one(
                hyp, item["truth_text"], post_process, item.get("lang_id")
            )
            metrics["wav_path"] = item["wav_path"]
            metrics["lang_name"] = item.get("lang_name")
            chunk_metrics.append(metrics)

    except Exception:
        logging.error(
            f"Worker failed on chunk (Lang: {language}):\n{traceback.format_exc()}"
        )
        return []

    return chunk_metrics
325
+
326
+
327
def main():
    """Run multi-GPU FLEURS WER evaluation and aggregate per-language stats."""
    parser = get_parser()
    args = parser.parse_args()

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
        force=True,
    )

    # 1. Prepare Data: group test samples by language name.
    logging.info("Reading test list...")
    data_by_lang = defaultdict(list)
    total_files = 0
    wav_root = Path(args.wav_path)

    samples = read_test_list(args.test_list)
    for s in samples:
        wav_path = str(wav_root / f"{s['id']}.{args.extension}")
        if not os.path.exists(wav_path):
            logging.warning(f"File missing: {wav_path}")
            continue

        # Translate the OmniVoice language id into the omnilingual-asr token.
        # NOTE(review): a language_id absent from the mapping (including the
        # "unknown" fallback) raises KeyError here — confirm test lists
        # always carry mapped ids.
        lang_id = s.get("language_id") or "unknown"
        if lang_id in rename:
            lang_id = mixed_id_to_omnilingual_asr_lang[rename[lang_id]]
        else:
            lang_id = mixed_id_to_omnilingual_asr_lang[lang_id]
        item = {
            "wav_path": wav_path,
            "truth_text": s["text"],
            "lang_id": lang_id,
            "lang_name": s.get("language_name") or "unknown",
        }
        # Optional single-language filter (matches the original id, not the
        # translated ASR token).
        if args.lang and s.get("language_id") != args.lang:
            continue

        data_by_lang[s.get("language_name") or "unknown"].append(item)

        total_files += 1

    logging.info(f"Total files: {total_files} in {len(data_by_lang)} languages.")

    # 2. Worker config: one queue entry per (GPU, slot) pair.
    num_gpus = torch.cuda.device_count()
    assert num_gpus > 0, "No GPU found. GPU is required."
    total_workers = num_gpus * args.nj_per_gpu

    mp.set_start_method("spawn", force=True)
    manager = mp.Manager()
    rank_queue = manager.Queue()

    for _ in range(args.nj_per_gpu):
        for rank in range(num_gpus):
            rank_queue.put(rank)

    # 3. Scheduling: Split languages into chunks
    # This prevents one huge language from blocking a worker for too long,
    # allows better load balancing across the pool.
    tasks = []
    chunk_size = args.chunk_size

    for lang_name, items in data_by_lang.items():
        # Slicing the list into chunks
        for i in range(0, len(items), chunk_size):
            chunk = items[i : i + chunk_size]
            tasks.append({"chunk": chunk, "lang": lang_name})

    logging.info(
        f"Split data into {len(tasks)} chunks (size ~{chunk_size}). Spawning {total_workers} workers."
    )

    # 4. Execution: fan chunks out to the process pool.
    results = []

    with ProcessPoolExecutor(
        max_workers=total_workers,
        initializer=process_init,
        initargs=(rank_queue, args.model_card),
    ) as executor:

        futures = []
        for task in tasks:
            futures.append(
                executor.submit(
                    run_eval_worker, task["chunk"], task["lang"], args.batch_size
                )
            )

        # Unified progress bar (advances by completed utterances, not chunks).
        with tqdm(total=total_files, desc="Eval Progress", dynamic_ncols=True) as pbar:
            for future in as_completed(futures):
                try:
                    chunk_metrics = future.result()
                    results.extend(chunk_metrics)
                    pbar.update(len(chunk_metrics))
                except Exception as e:
                    logging.error(f"Task failed: {e}")

    # 5. Metrics Aggregation
    wers, inses, deles, subses = [], [], [], []
    word_nums = 0

    # Store metrics per language
    lang_stats = {}

    fout = None
    if args.decode_path:
        # NOTE(review): fails if decode_path has no directory component
        # (os.path.dirname returns "").
        os.makedirs(os.path.dirname(args.decode_path), exist_ok=True)
        logging.info(f"Saving detailed WER results to: {args.decode_path}")
        fout = open(args.decode_path, "w", encoding="utf-8")

    for res in results:
        wers.append(float(res["wer"]))
        inses.append(float(res["insertions"]))
        deles.append(float(res["deletions"]))
        subses.append(float(res["substitutions"]))
        word_nums += res["word_num"]

        if fout:
            fout.write(
                f"{res['wav_path']}\t{res['wer']}\t{res['truth']}\t"
                f"{res['hypo']}\t{res['insertions']}\t{res['deletions']}\t"
                f"{res['substitutions']}\n"
            )
        lang_name = res["lang_name"]

        # Per language stats
        if lang_name not in lang_stats:
            lang_stats[lang_name] = {
                "inses": [],
                "deles": [],
                "subses": [],
                "word_nums": 0,
            }
        lang_stats[lang_name]["inses"].append(float(res["insertions"]))
        lang_stats[lang_name]["deles"].append(float(res["deletions"]))
        lang_stats[lang_name]["subses"].append(float(res["substitutions"]))
        lang_stats[lang_name]["word_nums"] += res["word_num"]

    print("-" * 50)
    # Log per-language stats
    per_lang_wers = []
    for lang in sorted(lang_stats.keys()):
        stats = lang_stats[lang]
        if stats["word_nums"] > 0:
            lang_wer = log_metrics(
                fout,
                f"[{lang}]",
                stats["inses"],
                stats["deles"],
                stats["subses"],
                stats["word_nums"],
            )
            per_lang_wers.append(lang_wer)
    print("-" * 50)

    # Log Macro-average WER (unweighted mean of per-language WERs).
    if len(per_lang_wers) > 1:
        macro_wer = np.mean(per_lang_wers)
        logging.info(
            f"Macro-average WER over {len(per_lang_wers)} languages: {macro_wer:.2f}%"
        )
        if fout:
            fout.write(
                f"Macro-average WER over {len(per_lang_wers)} languages: {macro_wer:.2f}%\n"
            )
        # Summary buckets: how many languages fall under each WER threshold.
        count_le_5 = sum(1 for w in per_lang_wers if w <= 5.0)
        count_le_10 = sum(1 for w in per_lang_wers if w <= 10.0)
        count_le_20 = sum(1 for w in per_lang_wers if w <= 20.0)

        stats_msg = (
            f"Languages with WER/CER <= 5%: {count_le_5}/{len(per_lang_wers)}\n"
            f"Languages with WER/CER <= 10%: {count_le_10}/{len(per_lang_wers)}\n"
            f"Languages with WER/CER <= 20%: {count_le_20}/{len(per_lang_wers)}"
        )

        logging.info("\n" + stats_msg)
        if fout:
            fout.write(stats_msg + "\n")

    # Log overall stats (word-count-weighted over all utterances).
    if word_nums > 0:
        log_metrics(fout, "Overall", inses, deles, subses, word_nums)

    if fout:
        fout.close()
514
+
515
+
516
+ if __name__ == "__main__":
517
+ main()
omnivoice/eval/wer/hubert.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ Computes word error rate (WER) with Hubert models for LibriSpeech test sets.
20
+ """
21
+ import argparse
22
+ import logging
23
+ import multiprocessing as mp
24
+ import os
25
+ import re
26
+ import traceback
27
+ from concurrent.futures import ProcessPoolExecutor, as_completed
28
+ from pathlib import Path
29
+
30
+ import numpy as np
31
+ import torch
32
+ from tqdm import tqdm
33
+
34
+ from omnivoice.eval.utils import load_waveform
35
+ from omnivoice.eval.wer.common import process_one
36
+ from omnivoice.utils.data_utils import read_test_list
37
+
38
+ # --- Global variables for worker processes ---
39
+ worker_pipe = None
40
+ worker_device = None
41
+
42
+
43
def get_parser():
    """Build the CLI argument parser for Hubert-based WER evaluation."""
    p = argparse.ArgumentParser(
        description="Computes WER with Hubert-based ASR model.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    p.add_argument(
        "--wav-path",
        type=str,
        required=True,
        help="Path to the directory containing speech files.",
    )
    p.add_argument(
        "--extension",
        type=str,
        default="wav",
        help="Extension of the speech files. Default: wav",
    )
    p.add_argument(
        "--decode-path",
        type=str,
        default=None,
        help="Path to the output file where WER information will be saved. "
        "If not provided, results are only printed to console.",
    )
    p.add_argument(
        "--model-dir",
        type=str,
        required=True,
        help="Local path of our evaluation model repository."
        "Download from https://huggingface.co/k2-fsa/TTS_eval_models."
        "Will use 'tts_eval_models/wer/hubert-large-ls960-ft/'"
        " in this script",
    )
    p.add_argument(
        "--test-list",
        type=str,
        default="transcript.jsonl",
        help="path of the JSONL test list. Each line is a JSON object "
        "with fields: id, text, ref_audio, ref_text, language_id, language_name.",
    )
    p.add_argument(
        "--batch-size",
        type=int,
        default=16,
        help="Batch size for decoding with the Hugging Face pipeline.",
    )
    p.add_argument(
        "--nj-per-gpu", type=int, default=1, help="Number of workers per GPU."
    )
    return p
93
+
94
+
95
def process_init(rank_queue, model_dir):
    """Worker initializer: claim a GPU rank and load the Hubert ASR pipeline.

    The pipeline and device are stored in module globals so that
    ``run_eval_worker`` can reuse them within the same process.
    """
    global worker_pipe, worker_device

    # Keep per-worker CPU threading bounded; many workers share the host.
    torch.set_num_threads(2)

    try:
        gpu_rank = rank_queue.get(timeout=10)
    except Exception:
        raise RuntimeError("Failed to get GPU rank from queue.")

    assert torch.cuda.is_available(), "CUDA is required but not available."
    worker_device = torch.device(f"cuda:{gpu_rank}")
    torch.cuda.set_device(gpu_rank)

    logging.info(f"Initializing worker on device: {worker_device}")

    try:
        worker_pipe = load_hubert_model(model_dir, worker_device)
        if worker_pipe is None:
            # A None pipeline is treated as a fatal init failure.
            raise RuntimeError("Model loading failed.")
    except Exception as e:
        logging.critical(f"Failed to load model on {worker_device}: {e}")
        raise e
118
+
119
+
120
def load_hubert_model(model_dir, device):
    """Create a transformers ASR pipeline from the local Hubert checkpoint.

    Args:
        model_dir: Root of the TTS_eval_models repository; the checkpoint is
            expected under ``wer/hubert-large-ls960-ft/``.
        device: Device for the pipeline.

    Returns:
        The pipeline, or None (after logging an error) when the checkpoint
        directory does not exist.
    """
    model_path = os.path.join(model_dir, "wer/hubert-large-ls960-ft/")
    if not os.path.exists(model_path):
        logging.error(
            f"Hubert model not found at {model_path}. "
            "Please download from https://huggingface.co/k2-fsa/TTS_eval_models"
        )
        return None

    logging.debug(f"Loading Hubert-based ASR model on {device}...")
    # Imported lazily so the existence check above stays cheap.
    import transformers

    # Suppress transformers logging
    transformers.logging.set_verbosity_error()

    return transformers.pipeline(
        "automatic-speech-recognition",
        model=model_path,
        device=device,
        tokenizer=model_path,
    )
142
+
143
+
144
def post_process(text: str) -> str:
    """Normalize English text for WER scoring.

    Lowercases, unifies curly apostrophes, replaces everything except
    alphanumerics and apostrophes with spaces, and collapses whitespace.
    """
    # Unify curly apostrophes so contractions survive the punctuation strip.
    normalized = text.replace("‘", "'").replace("’", "'")
    normalized = re.sub(r"[^a-zA-Z0-9']", " ", normalized.lower())
    return re.sub(r"\s+", " ", normalized).strip()
157
+
158
+
159
def run_eval_worker(data_chunk, batch_size):
    """Transcribe one chunk of wavs with the worker's pipeline and score WER.

    Uses the module-global ``worker_pipe`` set up by ``process_init``.
    Returns a list of per-utterance metric dicts; an empty list when the
    pipeline is missing or any error occurs.
    """
    global worker_pipe
    if worker_pipe is None:
        logging.error("Worker pipeline is not initialized!")
        return []

    metrics_buffer = []
    try:
        # Decode audio up front at 16 kHz; the HF pipeline accepts raw arrays.
        dataset = [
            {
                "array": load_waveform(
                    item["wav_path"], sample_rate=16000, return_numpy=True
                ),
                "sampling_rate": 16000,
            }
            for item in data_chunk
        ]
        # NOTE(review): these look like Whisper-style generation kwargs; a
        # Hubert CTC pipeline may ignore or reject them — confirm.
        generate_kwargs = {"language": "english", "task": "transcribe"}

        iterator = worker_pipe(
            dataset, generate_kwargs=generate_kwargs, batch_size=batch_size
        )

        # Pipeline outputs arrive in input order, so index back into the chunk.
        for i, out in enumerate(iterator):
            hypothesis = out["text"].strip()
            ref_item = data_chunk[i]
            truth = ref_item["truth_text"]
            wav_path = ref_item["wav_path"]

            m = process_one(hypothesis, truth, post_process)
            m["wav_path"] = wav_path
            metrics_buffer.append(m)

    except Exception:
        # Fail soft: log the traceback and drop the whole chunk.
        logging.error(f"Worker failed on chunk:\n{traceback.format_exc()}")
        return []

    return metrics_buffer
197
+
198
+
199
def main():
    """Run multi-GPU Hubert WER evaluation over a JSONL test list."""
    parser = get_parser()
    args = parser.parse_args()

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
        force=True,
    )

    logging.info(f"Calculating WER for {args.wav_path}")

    # Collect (wav, reference) pairs, skipping missing files.
    data_list = []
    samples = read_test_list(args.test_list)
    for s in samples:
        wav_full_path = str(Path(args.wav_path) / (s["id"] + "." + args.extension))
        if not os.path.exists(wav_full_path):
            logging.warning(f"File missing: {wav_full_path}")
            continue
        data_list.append(
            {
                "wav_path": wav_full_path,
                "truth_text": s["text"],
            }
        )
    total_files = len(data_list)

    # Worker config: one queue entry per (GPU, slot) pair.
    num_gpus = torch.cuda.device_count()
    assert num_gpus > 0, "No GPU found. GPU is required."
    total_workers = num_gpus * args.nj_per_gpu

    mp.set_start_method("spawn", force=True)
    manager = mp.Manager()
    rank_queue = manager.Queue()

    for _ in range(args.nj_per_gpu):
        for rank in range(num_gpus):
            rank_queue.put(rank)

    # One chunk per batch so each task maps to a single pipeline call.
    chunk_size = max(1, args.batch_size)
    tasks = [data_list[i : i + chunk_size] for i in range(0, total_files, chunk_size)]

    logging.info(
        f"Split data into {len(tasks)} chunks (size ~{chunk_size}). "
        f"Spawning {total_workers} workers."
    )

    results = []

    with ProcessPoolExecutor(
        max_workers=total_workers,
        initializer=process_init,
        initargs=(rank_queue, args.model_dir),
    ) as executor:

        futures = []
        for chunk in tasks:
            futures.append(executor.submit(run_eval_worker, chunk, args.batch_size))

        # Progress advances by completed utterances, not chunks.
        with tqdm(total=total_files, desc="Eval Progress", dynamic_ncols=True) as pbar:
            for future in as_completed(futures):
                chunk_metrics = future.result()
                results.extend(chunk_metrics)
                pbar.update(len(chunk_metrics))

    wers, inses, deles, subses = [], [], [], []
    word_nums = 0

    fout = None
    if args.decode_path:
        # NOTE(review): fails if decode_path has no directory component
        # (os.path.dirname returns "").
        os.makedirs(os.path.dirname(args.decode_path), exist_ok=True)
        fout = open(args.decode_path, "w", encoding="utf8")
        logging.info(f"Saving detailed WER results to: {args.decode_path}")
        fout.write(
            "Name\tWER\tTruth\tHypothesis\tInsertions\tDeletions\tSubstitutions\n"
        )

    # Accumulate error counts; write one TSV row per utterance if requested.
    for res in results:
        wers.append(float(res["wer"]))
        inses.append(float(res["insertions"]))
        deles.append(float(res["deletions"]))
        subses.append(float(res["substitutions"]))
        word_nums += res["word_num"]

        if fout:
            fout.write(
                f"{res['wav_path']}\t{res['wer']}\t{res['truth']}\t"
                f"{res['hypo']}\t{res['insertions']}\t{res['deletions']}\t"
                f"{res['substitutions']}\n"
            )

    # Word-count-weighted WER over the whole set (NaN when nothing scored).
    wer_weighted = (
        round(
            (np.sum(subses) + np.sum(deles) + np.sum(inses)) / word_nums * 100, 2
        )
        if word_nums > 0
        else float("nan")
    )

    inse_sum = np.sum(inses)
    dele_sum = np.sum(deles)
    subs_sum = np.sum(subses)

    print("-" * 50)
    logging.info(f"Processed {len(results)}/{total_files} files.")
    wer_info = f"WER: {wer_weighted}%"
    detailed_info = (
        f"Errors: {inse_sum} ins, {dele_sum} del, {subs_sum} sub / {word_nums} words"
    )
    logging.info(wer_info)
    logging.info(detailed_info)
    print("-" * 50)

    if fout:
        fout.write(wer_info + "\n" + detailed_info + "\n")
        fout.close()
315
+
316
+
317
+ if __name__ == "__main__":
318
+ main()
omnivoice/eval/wer/minimax.py ADDED
@@ -0,0 +1,596 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ Computes word error rate (WER) with Whisper-large-v3 for English and
20
+ Paraformer for Chinese. Intended to evaluate WERs on Seed-TTS test sets.
21
+ """
22
+ import argparse
23
+ import logging
24
+ import multiprocessing as mp
25
+ import os
26
+ import traceback
27
+ from collections import defaultdict
28
+ from concurrent.futures import ProcessPoolExecutor, as_completed
29
+ from pathlib import Path
30
+ from typing import List, Union
31
+
32
+ import numpy as np
33
+ import torch
34
+ import zhconv
35
+ from tqdm import tqdm
36
+
37
+ from omnivoice.eval.utils import load_waveform
38
+ from omnivoice.eval.wer.common import log_metrics, process_one
39
+ from omnivoice.eval.wer.text_norm_omni import text_normalize
40
+ from omnivoice.utils.data_utils import read_test_list
41
+
42
# --- Global variables for worker processes ---
# Populated independently inside each worker process by the
# ProcessPoolExecutor initializers (process_init / process_init_paraformer);
# they are per-process state, never shared between processes.
worker_pipe = None  # Whisper ASR pipeline, set by process_init
worker_paraformer = None  # FunASR Paraformer model, set by process_init_paraformer
worker_device = None  # torch.device claimed in _worker_setup
46
+
47
+
48
def read_language_mapping_from_tsv(
    mapping_path: Path,
) -> dict[str, Union[str, List[str]]]:
    """Read the language-ID mapping table from a tab-separated file.

    The file is expected to contain a header row followed by rows of
    ``mixed_id<TAB>language_name<TAB>iso_639_3_id<TAB>duration``.

    Args:
        mapping_path: Path to the TSV mapping file.

    Returns:
        Mapping from a "mixed" language id (e.g. ``zh``) to its
        ISO 639-3 code (e.g. ``zho``).
    """
    language_mapping: dict[str, str] = {}
    with open(mapping_path, "r", encoding="utf-8") as f:
        _ = f.readline()  # Skip header
        for line_no, line in enumerate(f, start=2):
            line = line.strip()
            if not line:
                # Tolerate blank (e.g. trailing) lines instead of raising
                # ValueError on the strict tuple unpacking.
                continue
            parts = line.split("\t")
            if len(parts) < 3:
                # Malformed row: warn and skip rather than abort the whole load.
                logging.warning(
                    "Skipping malformed line %d in %s: %r",
                    line_no,
                    mapping_path,
                    line,
                )
                continue
            mixed_id, _language_name, iso_639_3_id = parts[:3]
            language_mapping[mixed_id] = iso_639_3_id
    return language_mapping
59
+
60
+
61
# Module-level lookup table: "mixed" language id -> ISO 639-3 code,
# loaded once at import time from the repository's docs directory.
mixed_id_to_iso_639_3_id = read_language_mapping_from_tsv(
    Path(f"{os.path.dirname(__file__)}/../../../docs/lang_id_name_map.tsv")
)
64
+
65
+
66
def get_parser():
    """Build the command-line argument parser for this WER evaluation script."""
    parser = argparse.ArgumentParser(
        description="Computes WER with Whisper.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    add = parser.add_argument

    add(
        "--wav-path",
        type=str,
        required=True,
        help="Path to the directory containing speech files.",
    )
    add(
        "--extension",
        type=str,
        default="wav",
        help="Extension of the speech files. Default: wav",
    )
    add(
        "--decode-path",
        type=str,
        default=None,
        help=(
            "Path to the output file where WER information will be saved. "
            "If not provided, results are only printed to console."
        ),
    )
    add(
        "--model-dir",
        type=str,
        required=True,
        help=(
            "Local path of evaluation models repository. "
            "Download from https://huggingface.co/k2-fsa/TTS_eval_models. "
        ),
    )
    add(
        "--test-list",
        type=str,
        default="test.jsonl",
        help=(
            "path of the JSONL test list. Each line is a JSON object "
            "with fields: id, text, ref_audio, ref_text, language_id, language_name."
        ),
    )
    add(
        "--lang",
        type=str,
        default=None,
        help="""Language code to evaluate (e.g., 'en' for English, 'zh' for Chinese).
If not provided, the script will evaluate all languages found in the test list.
If specified, only samples of the given language will be evaluated.
""",
    )
    add(
        "--batch-size",
        type=int,
        default=16,
        help="Batch size for decoding with the Hugging Face pipeline.",
    )
    add("--nj-per-gpu", type=int, default=1, help="Number of workers per GPU.")
    add(
        "--chunk-size",
        type=int,
        default=10,
        help="Number of samples per task chunk sent to workers.",
    )
    return parser
132
+
133
+
134
def load_whisper_model(model_dir, device):
    """Create a Whisper ASR pipeline from a local checkpoint directory.

    Returns ``None`` (after logging an error) when the checkpoint directory
    is missing; otherwise a Hugging Face ASR pipeline placed on ``device``.
    """
    model_path = os.path.join(model_dir, "wer/whisper-large-v3/")
    if not os.path.exists(model_path):
        logging.error(f"Whisper model not found at {model_path}.")
        return None

    import transformers

    # Suppress transformers logging
    transformers.logging.set_verbosity_error()

    logging.info(f"Loading Whisper model on {device}...")
    # Half precision only pays off on GPU; stay in fp32 on CPU.
    use_fp16 = "cuda" in str(device)
    return transformers.pipeline(
        "automatic-speech-recognition",
        model=model_path,
        chunk_length_s=30,
        dtype=torch.float16 if use_fp16 else torch.float32,
        device=device,
    )
154
+
155
+
156
def load_paraformer_model(model_dir, device):
    """Load the FunASR Paraformer model used for Chinese transcription.

    Returns ``None`` (after logging an error) if the checkpoint directory is
    missing. Global logging is temporarily disabled while FunASR initializes
    because it is very chatty at load time.
    """
    model_path = os.path.join(model_dir, "wer/paraformer-zh/")
    if not os.path.exists(model_path):
        logging.error(f"Paraformer model not found at {model_path}.")
        return None

    logging.info(f"Loading Paraformer model on {device}...")

    # Remember the current global "disable" level so it can be restored.
    previous_level = logging.root.manager.disable
    logging.disable(logging.CRITICAL)
    try:
        from funasr import AutoModel

        model = AutoModel(
            model=model_path,
            device=str(device),
            disable_update=True,
            disable_pbar=True,
            verbose=False,
        )
    finally:
        # Always restore logging, even if model construction fails.
        logging.disable(previous_level)

    return model
181
+
182
+
183
+ def _worker_setup(rank_queue):
184
+ """Common worker setup: get rank, configure device and threads."""
185
+ global worker_device
186
+
187
+ torch.set_num_threads(2)
188
+
189
+ try:
190
+ rank = rank_queue.get(timeout=10)
191
+ except Exception:
192
+ raise RuntimeError("Failed to get GPU rank from queue.")
193
+
194
+ assert torch.cuda.is_available(), "CUDA is required but not available."
195
+ worker_device = torch.device(f"cuda:{rank}")
196
+ torch.cuda.set_device(rank)
197
+
198
+ logging.info(f"Initializing worker on device: {worker_device}")
199
+
200
+
201
def process_init(rank_queue, model_dir):
    """ProcessPoolExecutor initializer for Whisper workers.

    Binds the worker to a GPU via ``_worker_setup`` and populates the
    module-global ``worker_pipe`` with a loaded Whisper pipeline.
    """
    global worker_pipe

    _worker_setup(rank_queue)

    try:
        worker_pipe = load_whisper_model(model_dir, worker_device)
        if worker_pipe is None:
            raise RuntimeError("Whisper model loading failed.")
    except Exception as err:
        # Log loudly: a failed initializer takes down the whole pool.
        logging.critical(f"Failed to load Whisper model on {worker_device}: {err}")
        raise err
214
+
215
+
216
def process_init_paraformer(rank_queue, model_dir):
    """ProcessPoolExecutor initializer for Paraformer (Chinese) workers.

    Binds the worker to a GPU via ``_worker_setup`` and populates the
    module-global ``worker_paraformer`` with a loaded FunASR model.
    """
    global worker_paraformer

    _worker_setup(rank_queue)

    try:
        worker_paraformer = load_paraformer_model(model_dir, worker_device)
        if worker_paraformer is None:
            raise RuntimeError("Paraformer model loading failed.")
    except Exception as err:
        # Log loudly: a failed initializer takes down the whole pool.
        logging.critical(f"Failed to load Paraformer model on {worker_device}: {err}")
        raise err
229
+
230
+
231
def post_process(text: str, lang: str) -> str:
    """
    Cleans and normalizes text for WER calculation.

    Args:
        text (str): The input text to be processed.
        lang (str): The language of the input text ("unknown" skips
            language-specific normalization).

    Returns:
        str: The cleaned and normalized text.
    """
    if lang != "unknown":
        # Apply the omnilingual normalizer under the mapped ISO 639-3 code.
        text = text_normalize(
            text,
            iso_code=mixed_id_to_iso_639_3_id[lang],
            lower_case=True,
            remove_numbers=False,
            remove_brackets=False,
        )

    # Fold Traditional into Simplified so references and hypotheses agree.
    if lang in ("zh", "yue"):
        text = zhconv.convert(text, "zh-cn")

    # Processing spaces for languages using CER (consistent with the practice
    # in paper Minimax-Speech), specifically: zh, yue, ja, ko, th, arb, vi, hi, el.
    if lang in ("zh", "yue", "ja"):
        # Spaces carry no meaning: drop them, then split into characters.
        text = " ".join(text.replace(" ", ""))
    elif lang in ("ko", "th", "arb", "vi", "hi", "el"):
        # Spaces are meaningful: keep them as '|', then split into characters.
        text = " ".join(text.replace(" ", "|"))

    return text.lower().strip()
267
+
268
+
269
class SpeechEvalDataset(torch.utils.data.Dataset):
    """Map-style dataset that lazily loads 16 kHz waveforms for ASR eval.

    Each element is a dict with the decoded audio (``array``), its sample
    rate, and the reference transcript (``truth_text``).
    """

    SAMPLE_RATE = 16000  # target sample rate expected by the ASR models

    def __init__(self, data_list):
        # Each entry must provide "wav_path" and "truth_text".
        self.data_list = data_list

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, index):
        entry = self.data_list[index]
        audio = load_waveform(
            entry["wav_path"], sample_rate=self.SAMPLE_RATE, return_numpy=True
        )
        return {
            "array": audio,
            "sampling_rate": self.SAMPLE_RATE,
            "truth_text": entry["truth_text"],
        }
284
+
285
+
286
def run_eval_worker(data_chunk, language, batch_size):
    """
    Worker function to process a chunk of data.
    Uses the global worker_pipe initialized by process_init.

    Returns a list of per-utterance metric dicts; an empty list on any
    failure (the error is logged, not raised, so the pool keeps going).
    """
    global worker_pipe
    if worker_pipe is None:
        logging.error("Worker pipeline is not initialized!")
        return []

    metrics_buffer = []
    try:
        dataset = SpeechEvalDataset(data_chunk)

        # Only force a decoding language when we actually know it.
        if language != "unknown":
            generate_kwargs = {"language": language, "task": "transcribe"}
        else:
            generate_kwargs = {"task": "transcribe"}

        # The pipeline yields one result per dataset item, in order, so the
        # idx-th output lines up with data_chunk[idx].
        outputs = worker_pipe(
            dataset, generate_kwargs=generate_kwargs, batch_size=batch_size
        )

        for idx, out in enumerate(outputs):
            ref_item = data_chunk[idx]
            metric = process_one(
                out["text"].strip(),
                ref_item["truth_text"],
                post_process,
                ref_item.get("lang_id"),
            )
            metric["wav_path"] = ref_item["wav_path"]
            metric["lang_name"] = ref_item.get("lang_name")
            metrics_buffer.append(metric)

    except Exception:
        logging.error(
            f"Worker failed on chunk (Lang: {language}):\n{traceback.format_exc()}"
        )
        return []

    return metrics_buffer
331
+
332
+
333
def run_eval_worker_paraformer(data_chunk, batch_size):
    """
    Worker function for Chinese evaluation using Paraformer.
    Uses the global worker_paraformer initialized by process_init_paraformer.

    Returns a list of per-utterance metric dicts; an empty list on any
    failure (the error is logged, not raised, so the pool keeps going).
    """
    global worker_paraformer
    if worker_paraformer is None:
        logging.error("Paraformer worker pipeline is not initialized!")
        return []

    metrics_buffer = []
    try:
        wav_paths = [item["wav_path"] for item in data_chunk]

        # Decode in mini-batches; funasr returns results in input order.
        for start in range(0, len(wav_paths), batch_size):
            batch_paths = wav_paths[start : start + batch_size]
            outputs = worker_paraformer.generate(
                input=batch_paths, batch_size=batch_size, disable_pbar=True
            )

            for offset, out in enumerate(outputs):
                ref_item = data_chunk[start + offset]
                metric = process_one(
                    out["text"], ref_item["truth_text"], post_process, "zh"
                )
                metric["wav_path"] = ref_item["wav_path"]
                metric["lang_name"] = ref_item.get("lang_name")
                metrics_buffer.append(metric)

    except Exception:
        logging.error(f"Paraformer worker failed on chunk:\n{traceback.format_exc()}")
        return []

    return metrics_buffer
370
+
371
+
372
def main():
    """Entry point: multi-GPU WER evaluation over a JSONL test list.

    Routes Chinese samples to Paraformer and everything else to Whisper,
    runs each model in its own process pool (one sequential pool at a
    time), then aggregates per-language and overall WER.
    """
    parser = get_parser()
    args = parser.parse_args()

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
        force=True,
    )

    # 1. Prepare Data: group samples by language name, skipping synthesized
    #    wavs that are missing on disk (and, if --lang is set, other languages).
    logging.info("Reading test list...")
    data_by_lang = defaultdict(list)
    total_files = 0
    wav_root = Path(args.wav_path)

    samples = read_test_list(args.test_list)
    for s in samples:
        wav_path = str(wav_root / f"{s['id']}.{args.extension}")
        if not os.path.exists(wav_path):
            logging.warning(f"File missing: {wav_path}")
            continue

        # Fall back to "unknown" when the test list omits language fields.
        lang_id = s.get("language_id") or "unknown"
        lang_name = s.get("language_name") or "unknown"

        item = {
            "wav_path": wav_path,
            "truth_text": s["text"],
            "lang_id": lang_id,
            "lang_name": lang_name,
        }
        if args.lang and s.get("language_id") != args.lang:
            continue

        data_by_lang[lang_name].append(item)
        total_files += 1

    logging.info(f"Total files: {total_files} in {len(data_by_lang)} languages.")

    # 2. Worker config: nj-per-gpu workers on each visible GPU.
    num_gpus = torch.cuda.device_count()
    assert num_gpus > 0, "No GPU found. GPU is required."
    total_workers = num_gpus * args.nj_per_gpu

    # "spawn" is required so each worker gets a clean CUDA context.
    mp.set_start_method("spawn", force=True)
    manager = mp.Manager()

    # 3. Scheduling: Split data into Chinese (Paraformer) and non-Chinese (Whisper)
    zh_items = []
    non_zh_items = []
    for lang_name, items in data_by_lang.items():
        lang_id = items[0].get("lang_id", "") if items else ""
        if lang_name == "Chinese" or (lang_id and lang_id.startswith("zh")):
            zh_items.extend(items)
        else:
            non_zh_items.extend(items)

    chunk_size = args.chunk_size

    # Items were grouped per language above, so each Whisper chunk is
    # (at most, up to chunk boundaries) single-language; the chunk's first
    # item determines the decoding language passed to the worker.
    whisper_tasks = []
    for i in range(0, len(non_zh_items), chunk_size):
        chunk = non_zh_items[i : i + chunk_size]
        lang_name = chunk[0].get("lang_name", "unknown")
        whisper_tasks.append({"chunk": chunk, "lang": lang_name})

    paraformer_tasks = []
    for i in range(0, len(zh_items), chunk_size):
        paraformer_tasks.append(zh_items[i : i + chunk_size])

    logging.info(
        f"Whisper tasks: {len(whisper_tasks)} chunks ({len(non_zh_items)} files). "
        f"Paraformer tasks: {len(paraformer_tasks)} chunks ({len(zh_items)} files). "
        f"Spawning {total_workers} workers per pool."
    )

    # 4. Execution — run Whisper and Paraformer pools sequentially
    #    (sequential pools avoid both models competing for GPU memory).
    results = []

    # 4a. Whisper pool for non-Chinese languages
    if whisper_tasks:
        # Pre-fill the rank queue; each worker initializer pops one rank.
        whisper_rank_queue = manager.Queue()
        for _ in range(args.nj_per_gpu):
            for rank in range(num_gpus):
                whisper_rank_queue.put(rank)

        with ProcessPoolExecutor(
            max_workers=total_workers,
            initializer=process_init,
            initargs=(whisper_rank_queue, args.model_dir),
        ) as executor:

            futures = []
            for task in whisper_tasks:
                futures.append(
                    executor.submit(
                        run_eval_worker, task["chunk"], task["lang"], args.batch_size
                    )
                )

            with tqdm(
                total=len(non_zh_items),
                desc="Whisper Eval",
                dynamic_ncols=True,
            ) as pbar:
                for future in as_completed(futures):
                    try:
                        chunk_metrics = future.result()
                        results.extend(chunk_metrics)
                        pbar.update(len(chunk_metrics))
                    except Exception as e:
                        logging.error(f"Whisper task failed: {e}")

    # 4b. Paraformer pool for Chinese
    if paraformer_tasks:
        para_rank_queue = manager.Queue()
        for _ in range(args.nj_per_gpu):
            for rank in range(num_gpus):
                para_rank_queue.put(rank)

        with ProcessPoolExecutor(
            max_workers=total_workers,
            initializer=process_init_paraformer,
            initargs=(para_rank_queue, args.model_dir),
        ) as executor:

            futures = []
            for chunk in paraformer_tasks:
                futures.append(
                    executor.submit(run_eval_worker_paraformer, chunk, args.batch_size)
                )

            with tqdm(
                total=len(zh_items),
                desc="Paraformer Eval",
                dynamic_ncols=True,
            ) as pbar:
                for future in as_completed(futures):
                    try:
                        chunk_metrics = future.result()
                        results.extend(chunk_metrics)
                        pbar.update(len(chunk_metrics))
                    except Exception as e:
                        logging.error(f"Paraformer task failed: {e}")

    # 5. Metrics Aggregation
    # NOTE(review): `wers` is collected but never read below — the reported
    # numbers are recomputed from ins/del/sub counts and word totals.
    wers, inses, deles, subses = [], [], [], []
    word_nums = 0

    # Store metrics per language
    lang_stats = {}

    fout = None
    if args.decode_path:
        os.makedirs(os.path.dirname(args.decode_path), exist_ok=True)
        logging.info(f"Saving detailed WER results to: {args.decode_path}")
        fout = open(args.decode_path, "w", encoding="utf-8")

    for res in results:
        wers.append(float(res["wer"]))
        inses.append(float(res["insertions"]))
        deles.append(float(res["deletions"]))
        subses.append(float(res["substitutions"]))
        word_nums += res["word_num"]

        if fout:
            fout.write(
                f"{res['wav_path']}\t{res['wer']}\t{res['truth']}\t"
                f"{res['hypo']}\t{res['insertions']}\t{res['deletions']}\t"
                f"{res['substitutions']}\n"
            )
        lang_name = res["lang_name"]

        # Per language stats
        if lang_name not in lang_stats:
            lang_stats[lang_name] = {
                "inses": [],
                "deles": [],
                "subses": [],
                "word_nums": 0,
            }
        lang_stats[lang_name]["inses"].append(float(res["insertions"]))
        lang_stats[lang_name]["deles"].append(float(res["deletions"]))
        lang_stats[lang_name]["subses"].append(float(res["substitutions"]))
        lang_stats[lang_name]["word_nums"] += res["word_num"]

    print("-" * 50)
    # Log per-language stats
    per_lang_wers = []
    for lang in sorted(lang_stats.keys()):
        stats = lang_stats[lang]
        if stats["word_nums"] > 0:
            lang_wer = log_metrics(
                fout,
                f"[{lang}]",
                stats["inses"],
                stats["deles"],
                stats["subses"],
                stats["word_nums"],
                ndigits=3,
            )
            per_lang_wers.append(lang_wer)
    print("-" * 50)

    # Log Macro-average WER (unweighted mean of per-language WERs)
    if len(per_lang_wers) > 1:
        macro_wer = np.mean(per_lang_wers)
        logging.info(
            f"Macro-average WER over {len(per_lang_wers)} languages: {macro_wer:.2f}%"
        )
        if fout:
            fout.write(
                f"Macro-average WER over {len(per_lang_wers)} languages: {macro_wer:.2f}%\n"
            )

    # Log overall stats (micro-average, weighted by word counts)
    if word_nums > 0:
        log_metrics(fout, "Overall", inses, deles, subses, word_nums)

    if fout:
        fout.close()
593
+
594
+
595
+ if __name__ == "__main__":
596
+ main()
omnivoice/eval/wer/norm_config_module.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ """
9
+ This module defines the normalization configuration for WER evaluation.
10
+ Copied from https://github.com/facebookresearch/omnilingual-asr/blob/81f51e224ce9e74b02cc2a3eaf21b2d91d743455/workflows/dataprep/norm_config_module.py
11
+ """
12
+
13
+ # type: ignore
14
+ import os
15
+ import re
16
+
17
+ colon = ":"
18
+ comma = ","
19
+ exclamation_mark = "!"
20
+ period = re.escape(".")
21
+ question_mark = re.escape("?")
22
+ semicolon = ";"
23
+
24
+ left_curly_bracket = "{"
25
+ right_curly_bracket = "}"
26
+ quotation_mark = '"'
27
+
28
+ basic_punc = (
29
+ period
30
+ + question_mark
31
+ + comma
32
+ + colon
33
+ + exclamation_mark
34
+ + left_curly_bracket
35
+ + right_curly_bracket
36
+ )
37
+
38
+ # General punc unicode block (0x2000-0x206F)
39
+ zero_width_space = r"\u200B"
40
+ zero_width_nonjoiner = r"\u200C"
41
+ left_to_right_mark = r"\u200E"
42
+ right_to_left_mark = r"\u200F"
43
+ left_to_right_embedding = r"\u202A"
44
+ pop_directional_formatting = r"\u202C"
45
+
46
+ # Here are some commonly ill-typed versions of apostrophe
47
+ right_single_quotation_mark = r"\u2019"
48
+ left_single_quotation_mark = r"\u2018"
49
+
50
+ # Language specific definitions
51
+ # Spanish
52
+ inverted_exclamation_mark = r"\u00A1"
53
+ inverted_question_mark = r"\u00BF"
54
+
55
+
56
+ # Hindi
57
+ hindi_danda = "\u0964"
58
+
59
+ # Egyptian Arabic
60
+ # arabic_percent = r"\u066A"
61
+ arabic_comma = r"\u060C"
62
+ arabic_question_mark = r"\u061F"
63
+ arabic_semicolon = r"\u061B"
64
+ arabic_diacritics = r"\u064B-\u0652"
65
+
66
+
67
+ arabic_subscript_alef_and_inverted_damma = r"\u0656-\u0657"
68
+
69
+
70
+ # Chinese
71
+ full_stop = r"\u3002"
72
+ full_comma = r"\uFF0C"
73
+ full_exclamation_mark = r"\uFF01"
74
+ full_question_mark = r"\uFF1F"
75
+ full_semicolon = r"\uFF1B"
76
+ full_colon = r"\uFF1A"
77
+ full_parentheses = r"\uFF08\uFF09"
78
+ quotation_mark_horizontal = r"\u300C-\u300F"
79
+ quotation_mark_vertical = r"\uFF41-\uFF44"
80
+ title_marks = r"\u3008-\u300B"
81
+ wavy_low_line = r"\uFE4F"
82
+ ellipsis = r"\u22EF"
83
+ enumeration_comma = r"\u3001"
84
+ hyphenation_point = r"\u2027"
85
+ forward_slash = r"\uFF0F"
86
+ wavy_dash = r"\uFF5E"
87
+ box_drawings_light_horizontal = r"\u2500"
88
+ fullwidth_low_line = r"\uFF3F"
89
+ chinese_punc = (
90
+ full_stop
91
+ + full_comma
92
+ + full_exclamation_mark
93
+ + full_question_mark
94
+ + full_semicolon
95
+ + full_colon
96
+ + full_parentheses
97
+ + quotation_mark_horizontal
98
+ + quotation_mark_vertical
99
+ + title_marks
100
+ + wavy_low_line
101
+ + ellipsis
102
+ + enumeration_comma
103
+ + hyphenation_point
104
+ + forward_slash
105
+ + wavy_dash
106
+ + box_drawings_light_horizontal
107
+ + fullwidth_low_line
108
+ )
109
+
110
+ # Armenian
111
+ armenian_apostrophe = r"\u055A"
112
+ emphasis_mark = r"\u055B"
113
+ exclamation_mark = r"\u055C"
114
+ armenian_comma = r"\u055D"
115
+ armenian_question_mark = r"\u055E"
116
+ abbreviation_mark = r"\u055F"
117
+ armenian_full_stop = r"\u0589"
118
+ armenian_punc = (
119
+ armenian_apostrophe
120
+ + emphasis_mark
121
+ + exclamation_mark
122
+ + armenian_comma
123
+ + armenian_question_mark
124
+ + abbreviation_mark
125
+ + armenian_full_stop
126
+ )
127
+
128
+ lesser_than_symbol = r"&lt;"
129
+ greater_than_symbol = r"&gt;"
130
+
131
+ lesser_than_sign = r"\u003c"
132
+ greater_than_sign = r"\u003e"
133
+
134
+ nbsp_written_form = r"&nbsp"
135
+
136
+ # Quotation marks
137
+ left_double_quotes = r"\u201c"
138
+ right_double_quotes = r"\u201d"
139
+ left_double_angle = r"\u00ab"
140
+ right_double_angle = r"\u00bb"
141
+ left_single_angle = r"\u2039"
142
+ right_single_angle = r"\u203a"
143
+ low_double_quotes = r"\u201e"
144
+ low_single_quotes = r"\u201a"
145
+ high_double_quotes = r"\u201f"
146
+ high_single_quotes = r"\u201b"
147
+
148
+ all_punct_quotes = (
149
+ left_double_quotes
150
+ + right_double_quotes
151
+ + left_double_angle
152
+ + right_double_angle
153
+ + left_single_angle
154
+ + right_single_angle
155
+ + low_double_quotes
156
+ + low_single_quotes
157
+ + high_double_quotes
158
+ + high_single_quotes
159
+ + right_single_quotation_mark
160
+ + left_single_quotation_mark
161
+ )
162
+ mapping_quotes = (
163
+ "["
164
+ + high_single_quotes
165
+ + right_single_quotation_mark
166
+ + left_single_quotation_mark
167
+ + "]"
168
+ )
169
+
170
+
171
+ # Digits
172
+
173
+ english_digits = r"\u0030-\u0039"
174
+ bengali_digits = r"\u09e6-\u09ef"
175
+ khmer_digits = r"\u17e0-\u17e9"
176
+ devanagari_digits = r"\u0966-\u096f"
177
+ oriya_digits = r"\u0b66-\u0b6f"
178
+ extended_arabic_indic_digits = r"\u06f0-\u06f9"
179
+ kayah_li_digits = r"\ua900-\ua909"
180
+ fullwidth_digits = r"\uff10-\uff19"
181
+ malayam_digits = r"\u0d66-\u0d6f"
182
+ myanmar_digits = r"\u1040-\u1049"
183
+ roman_numeral = r"\u2170-\u2179"
184
+ nominal_digit_shapes = r"\u206f"
185
+
186
# Load punctuations
# Each non-comment line of punctuations.lst is "<char>\t<count>\t<name>...";
# only the first tab field (the character to strip) is used. The file
# contains non-ASCII punctuation, so the encoding must be pinned to UTF-8
# rather than depending on the locale default.
with open(
    f"{os.path.dirname(__file__)}/punctuations.lst", "r", encoding="utf-8"
) as punc_f:
    punc_list = [
        line
        for line in punc_f  # iterate lazily; readlines() is unnecessary
        if line.strip() and not line.strip().startswith("#")
    ]

# the first character in the tab separated line is the punc to be removed;
# concatenate the escaped characters into a character-class fragment.
punct_pattern = "".join(re.escape(punc.split("\t")[0]) for punc in punc_list)
198
+
199
+ shared_digits = (
200
+ english_digits
201
+ + bengali_digits
202
+ + khmer_digits
203
+ + devanagari_digits
204
+ + oriya_digits
205
+ + extended_arabic_indic_digits
206
+ + kayah_li_digits
207
+ + fullwidth_digits
208
+ + malayam_digits
209
+ + myanmar_digits
210
+ + roman_numeral
211
+ + nominal_digit_shapes
212
+ )
213
+
214
+ shared_punc_list = (
215
+ basic_punc
216
+ + all_punct_quotes
217
+ + greater_than_sign
218
+ + lesser_than_sign
219
+ + inverted_question_mark
220
+ + full_stop
221
+ + semicolon
222
+ + armenian_punc
223
+ + inverted_exclamation_mark
224
+ + arabic_comma
225
+ + enumeration_comma
226
+ + hindi_danda
227
+ + quotation_mark
228
+ + arabic_semicolon
229
+ + arabic_question_mark
230
+ + chinese_punc
231
+ + punct_pattern
232
+ )
233
+
234
+ shared_mappping = {
235
+ lesser_than_symbol: "",
236
+ greater_than_symbol: "",
237
+ nbsp_written_form: "",
238
+ r"(\S+)" + mapping_quotes + r"(\S+)": r"\1'\2",
239
+ }
240
+
241
+ shared_deletion_list = (
242
+ left_to_right_mark
243
+ + zero_width_nonjoiner
244
+ + arabic_subscript_alef_and_inverted_damma
245
+ + zero_width_space
246
+ + arabic_diacritics
247
+ + pop_directional_formatting
248
+ + right_to_left_mark
249
+ + left_to_right_embedding
250
+ )
251
+
252
# Per-language normalization configs. "*" is the default; language entries
# are derived from it and then customized.
norm_config = {
    "*": {
        "lower_case": True,
        "punc_set": shared_punc_list,
        "del_set": shared_deletion_list,
        "mapping": shared_mappping,
        "digit_set": shared_digits,
        "unicode_norm": "NFKC",
        "rm_diacritics": False,
    }
}


def _derive_config(base):
    """Return a copy of *base* that is safe for per-language customization.

    ``dict.copy`` is shallow, so without also cloning the nested
    ``"mapping"`` dict, a per-language tweak such as the Arabic
    alef-wasla mapping below would silently leak into every other
    config (including the ``"*"`` default).
    """
    derived = base.copy()
    derived["mapping"] = dict(base["mapping"])
    return derived


# =============== Mongolian ===============#

norm_config["mon"] = _derive_config(norm_config["*"])
# add soft hyphen to punc list to match with fleurs
norm_config["mon"]["del_set"] += r"\u00AD"

norm_config["khk"] = _derive_config(norm_config["mon"])

# =============== Hebrew ===============#

norm_config["heb"] = _derive_config(norm_config["*"])
# add "HEBREW POINT" symbols to match with fleurs
norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"

# =============== Thai ===============#

norm_config["tha"] = _derive_config(norm_config["*"])
# add "Zero width joiner" symbols to match with fleurs
norm_config["tha"]["punc_set"] += r"\u200D"

# =============== Arabic ===============#
norm_config["ara"] = _derive_config(norm_config["*"])
# Map alef-wasla to plain alef for Arabic only; with the cloned mapping
# dict this no longer mutates the shared "*" mapping.
norm_config["ara"]["mapping"]["ٱ"] = "ا"
norm_config["arb"] = _derive_config(norm_config["ara"])

# =============== Javanese ===============#
norm_config["jav"] = _derive_config(norm_config["*"])
norm_config["jav"]["rm_diacritics"] = True
omnivoice/eval/wer/punctuations.lst ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+  7355 INVALID UNICODE 0x81
2
+  5265 INVALID UNICODE 0x90
3
+  75 INVALID UNICODE 0x8
4
+  31 INVALID UNICODE 0x8d
5
+ ” 3 INVALID UNICODE 0x94
6
+  2 INVALID UNICODE 0x8f
7
+  2 INVALID UNICODE 0x1a
8
+  1 INVALID UNICODE 0x9d
9
+ “ 1 INVALID UNICODE 0x93
10
+ ’ 1 INVALID UNICODE 0x92
11
+  8647 INVALID UNICODE 0xe295
12
+  6650 INVALID UNICODE 0xf21d
13
+  6234 INVALID UNICODE 0xf62d
14
+  4815 INVALID UNICODE 0xf173
15
+  4789 INVALID UNICODE 0xe514
16
+  4409 INVALID UNICODE 0xe293
17
+  3881 INVALID UNICODE 0xf523
18
+  3788 INVALID UNICODE 0xe233
19
+  2448 INVALID UNICODE 0xf50f
20
+  2177 INVALID UNICODE 0xe232
21
+  1955 INVALID UNICODE 0xea7b
22
+  1926 INVALID UNICODE 0xf172
23
+  973 INVALID UNICODE 0xe290
24
+  972 INVALID UNICODE 0xf519
25
+  661 INVALID UNICODE 0xe292
26
+  591 INVALID UNICODE 0xe328
27
+  509 INVALID UNICODE 0xe2fa
28
+  458 INVALID UNICODE 0xe234
29
+  446 INVALID UNICODE 0xe043
30
+  419 INVALID UNICODE 0xe040
31
+  399 INVALID UNICODE 0xe2fb
32
+  387 INVALID UNICODE 0xe32b
33
+  381 INVALID UNICODE 0xe236
34
+  374 INVALID UNICODE 0xf511
35
+  314 INVALID UNICODE 0xe517
36
+  296 INVALID UNICODE 0xe2fe
37
+  293 INVALID UNICODE 0xe492
38
+  291 INVALID UNICODE 0xf52d
39
+  289 INVALID UNICODE 0xe2fc
40
+  195 INVALID UNICODE 0xf521
41
+  190 INVALID UNICODE 0xe516
42
+  182 INVALID UNICODE 0xe041
43
+  178 INVALID UNICODE 0xf529
44
+  113 INVALID UNICODE 0xe2f9
45
+  87 INVALID UNICODE 0xe2d9
46
+  78 INVALID UNICODE 0xe32a
47
+  76 INVALID UNICODE 0xe291
48
+  74 INVALID UNICODE 0xe296
49
+  66 INVALID UNICODE 0xe518
50
+  52 INVALID UNICODE 0xe32c
51
+  46 INVALID UNICODE 0xe2db
52
+  41 INVALID UNICODE 0xe231
53
+  34 INVALID UNICODE 0xf522
54
+  33 INVALID UNICODE 0xf518
55
+  32 INVALID UNICODE 0xf513
56
+  27 INVALID UNICODE 0xe32d
57
+  25 INVALID UNICODE 0xe32e
58
+  23 INVALID UNICODE 0xe06b
59
+  15 INVALID UNICODE 0xea01
60
+  12 INVALID UNICODE 0xe294
61
+  11 INVALID UNICODE 0xe203
62
+  8 INVALID UNICODE 0xf218
63
+  7 INVALID UNICODE 0xe070
64
+  7 INVALID UNICODE 0xe013
65
+  5 INVALID UNICODE 0xe2de
66
+  4 INVALID UNICODE 0xe493
67
+  3 INVALID UNICODE 0xf7e8
68
+  3 INVALID UNICODE 0xf7d0
69
+  3 INVALID UNICODE 0xe313
70
+  2 INVALID UNICODE 0xe329
71
+  2 INVALID UNICODE 0xe06d
72
+  2 INVALID UNICODE 0xe003
73
+  1 INVALID UNICODE 0xf50e
74
+  1 INVALID UNICODE 0xf171
75
+  1 INVALID UNICODE 0xe01d
76
+  71 NOMINAL DIGIT SHAPES 0x206f
77
+ ⁠ 3 WORD JOINER 0x2060
78
+ ― 126545 HORIZONTAL BAR 0x2015
79
+ ־ 1028 HEBREW PUNCTUATION MAQAF 0x5be
80
+ ) 98429 RIGHT PARENTHESIS 0x29
81
+ ] 27108 RIGHT SQUARE BRACKET 0x5d
82
+ ⌋ 1567 RIGHT FLOOR 0x230b
83
+ 〕 97 RIGHT TORTOISE SHELL BRACKET 0x3015
84
+ 】 36 RIGHT BLACK LENTICULAR BRACKET 0x3011
85
+ ﴾ 14 ORNATE LEFT PARENTHESIS 0xfd3e
86
+ & 170517 AMPERSAND 0x26
87
+ ། 106330 TIBETAN MARK SHAD 0xf0d
88
+ ። 90203 ETHIOPIC FULL STOP 0x1362
89
+ ፥ 60484 ETHIOPIC COLON 0x1365
90
+ ༌ 60464 TIBETAN MARK DELIMITER TSHEG BSTAR 0xf0c
91
+ ။ 51567 MYANMAR SIGN SECTION 0x104b
92
+ / 46929 SOLIDUS 0x2f
93
+ ၊ 38042 MYANMAR SIGN LITTLE SECTION 0x104a
94
+ · 37985 MIDDLE DOT 0xb7
95
+ ‸ 36310 CARET 0x2038
96
+ * 34793 ASTERISK 0x2a
97
+ ۔ 32432 ARABIC FULL STOP 0x6d4
98
+ ፤ 31906 ETHIOPIC SEMICOLON 0x1364
99
+ ၏ 21519 MYANMAR SYMBOL GENITIVE 0x104f
100
+ ។ 20834 KHMER SIGN KHAN 0x17d4
101
+ ꓾ 15773 LISU PUNCTUATION COMMA 0xa4fe
102
+ ᙮ 13473 CANADIAN SYLLABICS FULL STOP 0x166e
103
+ ꤯ 12892 KAYAH LI SIGN SHYA 0xa92f
104
+ ⵰ 11478 TIFINAGH SEPARATOR MARK 0x2d70
105
+ ꓿ 11118 LISU PUNCTUATION FULL STOP 0xa4ff
106
+ ॥ 10763 DEVANAGARI DOUBLE DANDA 0x965
107
+ ؞ 10403 ARABIC TRIPLE DOT PUNCTUATION MARK 0x61e
108
+ ၍ 8936 MYANMAR SYMBOL COMPLETED 0x104d
109
+ · 8431 GREEK ANO TELEIA 0x387
110
+ † 7477 DAGGER 0x2020
111
+ ၌ 6632 MYANMAR SYMBOL LOCATIVE 0x104c
112
+ ፣ 5719 ETHIOPIC COMMA 0x1363
113
+ ៖ 5528 KHMER SIGN CAMNUC PII KUUH 0x17d6
114
+ ꤮ 4791 KAYAH LI SIGN CWI 0xa92e
115
+ ※ 3439 REFERENCE MARK 0x203b
116
+ ፦ 2727 ETHIOPIC PREFACE COLON 0x1366
117
+ • 1749 BULLET 0x2022
118
+ ¶ 1507 PILCROW SIGN 0xb6
119
+ ၎ 1386 MYANMAR SYMBOL AFOREMENTIONED 0x104e
120
+ ﹖ 1224 SMALL QUESTION MARK 0xfe56
121
+ ; 975 GREEK QUESTION MARK 0x37e
122
+ … 827 HORIZONTAL ELLIPSIS 0x2026
123
+ % 617 PERCENT SIGN 0x25
124
+ ・ 468 KATAKANA MIDDLE DOT 0x30fb
125
+ ༎ 306 TIBETAN MARK NYIS SHAD 0xf0e
126
+ ‡ 140 DOUBLE DAGGER 0x2021
127
+ # 137 NUMBER SIGN 0x23
128
+ @ 125 COMMERCIAL AT 0x40
129
+ ፡ 121 ETHIOPIC WORDSPACE 0x1361
130
+ ៚ 55 KHMER SIGN KOOMUUT 0x17da
131
+ ៕ 49 KHMER SIGN BARIYOOSAN 0x17d5
132
+ ﹐ 10 SMALL COMMA 0xfe50
133
+ ༅ 6 TIBETAN MARK CLOSING YIG MGO SGAB MA 0xf05
134
+ ༄ 6 TIBETAN MARK INITIAL YIG MGO MDUN MA 0xf04
135
+ . 2 FULLWIDTH FULL STOP 0xff0e
136
+ ﹗ 2 SMALL EXCLAMATION MARK 0xfe57
137
+ ﹕ 2 SMALL COLON 0xfe55
138
+ ‰ 2 PER MILLE SIGN 0x2030
139
+ ・ 1 HALFWIDTH KATAKANA MIDDLE DOT 0xff65
140
+ ( 98504 LEFT PARENTHESIS 0x28
141
+ [ 27245 LEFT SQUARE BRACKET 0x5b
142
+ ⌊ 1567 LEFT FLOOR 0x230a
143
+ 〔 95 LEFT TORTOISE SHELL BRACKET 0x3014
144
+ 【 36 LEFT BLACK LENTICULAR BRACKET 0x3010
145
+ ﴿ 14 ORNATE RIGHT PARENTHESIS 0xfd3f
146
+ _ 4851 LOW LINE 0x5f
147
+ $ 72 DOLLAR SIGN 0x24
148
+ € 14 EURO SIGN 0x20ac
149
+ £ 2 POUND SIGN 0xa3
150
+ ~ 27462 TILDE 0x7e
151
+ = 11450 EQUALS SIGN 0x3d
152
+ | 8430 VERTICAL LINE 0x7c
153
+ − 3971 MINUS SIGN 0x2212
154
+ ≫ 1904 MUCH GREATER-THAN 0x226b
155
+ ≪ 1903 MUCH LESS-THAN 0x226a
156
+ + 1450 PLUS SIGN 0x2b
157
+ < 345 FULLWIDTH LESS-THAN SIGN 0xff1c
158
+ > 344 FULLWIDTH GREATER-THAN SIGN 0xff1e
159
+ ¬ 5 NOT SIGN 0xac
160
+ × 4 MULTIPLICATION SIGN 0xd7
161
+ → 2 RIGHTWARDS ARROW 0x2192
162
+ ᙭ 537 CANADIAN SYLLABICS CHI SIGN 0x166d
163
+ ° 499 DEGREE SIGN 0xb0
164
+ ႟ 421 MYANMAR SYMBOL SHAN EXCLAMATION 0x109f
165
+ � 192 REPLACEMENT CHARACTER 0xfffd
166
+ ⌟ 54 BOTTOM RIGHT CORNER 0x231f
167
+ ⌞ 54 BOTTOM LEFT CORNER 0x231e
168
+ © 2 COPYRIGHT SIGN 0xa9
169
+   40 NARROW NO-BREAK SPACE 0x202f
170
+   1 SIX-PER-EM SPACE 0x2006
171
+ ˜ 40261 SMALL TILDE 0x2dc
172
+ ^ 6469 CIRCUMFLEX ACCENT 0x5e
173
+ ¯ 20 MACRON 0xaf
174
+ ˇ 191442 CARON 0x2c7
175
+ ⁿ 38144 SUPERSCRIPT LATIN SMALL LETTER N 0x207f
176
+ ـ 9440 ARABIC TATWEEL 0x640
177
+ ๆ 6766 THAI CHARACTER MAIYAMOK 0xe46
178
+ ៗ 3310 KHMER SIGN LEK TOO 0x17d7
179
+ 々 678 IDEOGRAPHIC ITERATION MARK 0x3005
180
+ ໆ 430 LAO KO LA 0xec6
181
+ ー 319 KATAKANA-HIRAGANA PROLONGED SOUND MARK 0x30fc
182
+ ⁱ 137 SUPERSCRIPT LATIN SMALL LETTER I 0x2071
183
+ ৷ 11056 BENGALI CURRENCY NUMERATOR FOUR 0x9f7
184
+ ⅓ 26 VULGAR FRACTION ONE THIRD 0x2153
185
+ ½ 26 VULGAR FRACTION ONE HALF 0xbd
186
+ ¼ 4 VULGAR FRACTION ONE QUARTER 0xbc
187
+ ⅟ 1 FRACTION NUMERATOR ONE 0x215f
188
+ ⁄ 57 FRACTION SLASH 0x2044
omnivoice/eval/wer/seedtts.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ Computes word error rate (WER) with Whisper-large-v3 for English and
20
+ Paraformer for Chinese. Intended to evaluate WERs on Seed-TTS test sets.
21
+ """
22
+ import argparse
23
+ import logging
24
+ import multiprocessing as mp
25
+ import os
26
+ import string
27
+ import traceback
28
+ from concurrent.futures import ProcessPoolExecutor, as_completed
29
+ from pathlib import Path
30
+
31
+ import numpy as np
32
+ import torch
33
+ import zhconv
34
+ from tqdm import tqdm
35
+ from zhon.hanzi import punctuation
36
+
37
+ from omnivoice.eval.utils import load_waveform
38
+ from omnivoice.eval.wer.common import process_one
39
+ from omnivoice.utils.data_utils import read_test_list
40
+
41
+ # --- Global variables for worker processes ---
42
+ worker_pipe = None
43
+ worker_device = None
44
+
45
+
46
def get_parser():
    """Build the command-line interface for the Seed-TTS WER evaluation.

    Returns:
        argparse.ArgumentParser: parser exposing wav/model paths, language,
        batching, and worker-count options.
    """
    parser = argparse.ArgumentParser(
        description="Computes WER with Whisper/Paraformer.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # Bind once; every option below goes through the same entry point.
    add = parser.add_argument
    add(
        "--wav-path",
        type=str,
        required=True,
        help="Path to the directory containing speech files.",
    )
    add(
        "--extension",
        type=str,
        default="wav",
        help="Extension of the speech files. Default: wav",
    )
    add(
        "--decode-path",
        type=str,
        default=None,
        help="Path to the output file where WER information will be saved. "
        "If not provided, results are only printed to console.",
    )
    add(
        "--model-dir",
        type=str,
        required=True,
        help="Local path of evaluation models repository. "
        "Download from https://huggingface.co/k2-fsa/TTS_eval_models. "
        "This script expects 'tts_eval_models/wer/whisper-large-v3/' for English "
        "and 'tts_eval_models/wer/paraformer-zh/' for Chinese within this directory.",
    )
    add(
        "--test-list",
        type=str,
        default="test.jsonl",
        help="path of the JSONL test list. Each line is a JSON object "
        "with fields: id, text, ref_audio, ref_text, language_id, language_name.",
    )
    add(
        "--lang",
        type=str,
        choices=["zh", "en"],
        required=True,
        help="Language of the audio and transcripts for "
        "decoding ('zh' for Chinese or 'en' for English).",
    )
    add(
        "--batch-size",
        type=int,
        default=16,
        help="Batch size for decoding with the Hugging Face pipeline.",
    )
    add("--nj-per-gpu", type=int, default=1, help="Number of workers per GPU.")
    return parser
104
+
105
+
106
def load_whisper_model(model_dir, device):
    """Load a local Whisper-large-v3 ASR pipeline for English decoding.

    Args:
        model_dir: Root directory of the evaluation-models repository;
            the model is expected under ``wer/whisper-large-v3/``.
        device: Target device (torch.device or string such as "cuda:0").

    Returns:
        A transformers automatic-speech-recognition pipeline, or ``None``
        if the model directory does not exist.
    """
    model_path = os.path.join(model_dir, "wer/whisper-large-v3/")
    if not os.path.exists(model_path):
        logging.error(f"Whisper model not found at {model_path}.")
        return None

    logging.debug(f"Loading Whisper model on {device}...")

    # Imported lazily so CPU-only tooling does not pay the import cost.
    import transformers

    # Suppress transformers logging
    transformers.logging.set_verbosity_error()

    pipe = transformers.pipeline(
        "automatic-speech-recognition",
        model=model_path,
        # fp16 on GPU for speed, fp32 on CPU.
        # NOTE(review): the `dtype` kwarg assumes a recent transformers
        # release — older versions call this `torch_dtype`; confirm against
        # the pinned transformers version.
        dtype=torch.float16 if "cuda" in str(device) else torch.float32,
        device=device,
    )
    return pipe
126
+
127
+
128
def load_paraformer_model(model_dir, device):
    """Load a local Paraformer ASR model (FunASR) for Chinese decoding.

    Args:
        model_dir: Root directory of the evaluation-models repository;
            the model is expected under ``wer/paraformer-zh/``.
        device: Target device (torch.device or string).

    Returns:
        A FunASR ``AutoModel`` instance, or ``None`` when the model
        directory does not exist.
    """
    model_path = os.path.join(model_dir, "wer/paraformer-zh/")
    if not os.path.exists(model_path):
        logging.error(f"Paraformer model not found at {model_path}.")
        return None

    logging.debug(f"Loading Paraformer model on {device}...")

    # FunASR logs noisily at load time; silence everything while it
    # initializes and restore the previous disable level afterwards.
    previous_level = logging.root.manager.disable
    logging.disable(logging.CRITICAL)

    try:
        from funasr import AutoModel

        # FunASR AutoModel accepts "cuda:0" string or torch.device
        asr_model = AutoModel(
            model=model_path,
            device=str(device),
            disable_update=True,
            disable_pbar=True,
            verbose=False,
        )
    finally:
        logging.disable(previous_level)

    return asr_model
154
+
155
+
156
def post_process(text: str, lang: str) -> str:
    """
    Cleans and normalizes text for WER calculation.

    Strips Chinese and ASCII punctuation (apostrophes are kept, since they
    carry meaning in English contractions), normalizes whitespace, then
    applies per-language casing/segmentation.

    Args:
        text (str): The input text to be processed.
        lang (str): The language of the input text ("zh" or "en").

    Returns:
        str: The cleaned and normalized text.

    Raises:
        NotImplementedError: For any language other than "zh" or "en".
    """
    punctuation_all = punctuation + string.punctuation
    for x in punctuation_all:
        if x == "'":
            continue
        text = text.replace(x, "")

    # Normalize ideographic (full-width) spaces, then collapse the runs of
    # consecutive spaces left behind by punctuation removal. The previous
    # single replace() could leave double spaces, which produce empty
    # tokens when the scorer splits on whitespace.
    text = text.replace("\u3000", " ")
    while "  " in text:
        text = text.replace("  ", " ")

    if lang == "zh":
        # Chinese WER is computed per character: space-separate every char.
        text = " ".join([x for x in text])
    elif lang == "en":
        text = text.lower()
    else:
        raise NotImplementedError
    return text
181
+
182
+
183
def process_init(rank_queue, model_dir, lang):
    """
    Initializer for each worker process.
    Loads model onto a specific GPU, once per process.

    Args:
        rank_queue: Multiprocessing queue of GPU ranks; each worker pops one
            so workers spread evenly across the available GPUs.
        model_dir: Root directory of the evaluation-models repository.
        lang: "en" loads Whisper, "zh" loads Paraformer.
    """
    global worker_pipe, worker_device

    # Parallelism comes from processes; keep per-process threading small.
    torch.set_num_threads(2)

    try:
        rank = rank_queue.get(timeout=10)
    except Exception:
        raise RuntimeError("Failed to get GPU rank from queue.")

    assert torch.cuda.is_available(), "CUDA is required but not available."
    worker_device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(rank)

    logging.info(f"Initializing worker on device: {worker_device}")

    try:
        if lang == "en":
            worker_pipe = load_whisper_model(model_dir, worker_device)
        elif lang == "zh":
            worker_pipe = load_paraformer_model(model_dir, worker_device)
        # Either loader returns None when model files are missing; fail the
        # worker loudly instead of decoding with a null pipeline.
        if worker_pipe is None:
            raise RuntimeError("Model loading failed.")
    except Exception as e:
        logging.critical(f"Failed to load model on {worker_device}: {e}")
        raise e
213
+
214
+
215
def run_eval_worker(data_chunk, lang, batch_size):
    """
    Worker function to process a chunk of data.
    Uses the global worker_pipe initialized by process_init.

    Args:
        data_chunk: List of dicts with "wav_path" and "truth_text" keys.
        lang: "en" (Whisper pipeline path) or "zh" (Paraformer path).
        batch_size: Decoding batch size.

    Returns:
        List of per-utterance metric dicts from ``process_one`` (each
        augmented with "wav_path"); an empty list on any failure so the
        pool can keep processing other chunks.
    """
    global worker_pipe
    if worker_pipe is None:
        logging.error("Worker pipeline is not initialized!")
        return []

    metrics_buffer = []
    try:
        if lang == "en":
            # Load waveforms as arrays, truncating to 30s
            # (Whisper's maximum input window).
            dataset = [
                {
                    "array": load_waveform(
                        item["wav_path"], sample_rate=16000, return_numpy=True
                    )[: 16000 * 30],
                    "sampling_rate": 16000,
                }
                for item in data_chunk
            ]
            generate_kwargs = {"language": "english", "task": "transcribe"}

            iterator = worker_pipe(
                dataset, generate_kwargs=generate_kwargs, batch_size=batch_size
            )

            # The pipeline yields outputs in input order, so index i maps
            # each hypothesis back to its reference item.
            for i, out in enumerate(iterator):
                hypothesis = out["text"].strip()
                ref_item = data_chunk[i]
                truth = ref_item["truth_text"]
                wav_path = ref_item["wav_path"]

                m = process_one(hypothesis, truth, post_process, lang)
                m["wav_path"] = wav_path
                metrics_buffer.append(m)

        elif lang == "zh":
            wav_paths = [item["wav_path"] for item in data_chunk]

            # FunASR takes file paths directly; batch manually.
            for i in range(0, len(wav_paths), batch_size):
                batch_paths = wav_paths[i : i + batch_size]
                res_batch = worker_pipe.generate(
                    input=batch_paths, batch_size=batch_size, disable_pbar=True
                )

                for j, res in enumerate(res_batch):
                    # Normalize traditional characters to simplified so the
                    # hypothesis matches the reference convention.
                    hypothesis = zhconv.convert(res["text"], "zh-cn")
                    ref_item = data_chunk[i + j]
                    truth = ref_item["truth_text"]
                    wav_path = ref_item["wav_path"]

                    m = process_one(hypothesis, truth, post_process, lang)
                    m["wav_path"] = wav_path
                    metrics_buffer.append(m)

    except Exception:
        logging.error(
            f"Worker failed on chunk (Lang: {lang}):\n{traceback.format_exc()}"
        )
        return []

    return metrics_buffer
280
+
281
+
282
def main():
    """Entry point: fan WAV files out to GPU workers and aggregate WER.

    Pipeline: read the JSONL test list, split existing files into chunks,
    decode them in a ProcessPoolExecutor (one model per worker, GPUs shared
    round-robin via a rank queue), then report both the Seed-TTS-style
    average-of-per-utterance-WERs and the corpus-weighted WER.
    """
    parser = get_parser()
    args = parser.parse_args()

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
        force=True,
    )

    logging.info(f"Calculating WER for {args.wav_path}")

    # 1. Prepare Data: keep only samples whose audio actually exists.
    logging.info("Reading test list...")
    data_list = []
    samples = read_test_list(args.test_list)
    for s in samples:
        wav_path = str(Path(args.wav_path) / f"{s['id']}.{args.extension}")
        if not os.path.exists(wav_path):
            logging.warning(f"File missing: {wav_path}")
            continue
        data_list.append({"wav_path": wav_path, "truth_text": s["text"]})
    total_files = len(data_list)
    logging.info(f"Total files: {total_files}.")

    # 2. Worker config
    num_gpus = torch.cuda.device_count()
    assert num_gpus > 0, "No GPU found. GPU is required."
    total_workers = num_gpus * args.nj_per_gpu

    mp.set_start_method("spawn", force=True)
    manager = mp.Manager()
    rank_queue = manager.Queue()

    # Each worker pops one rank: nj_per_gpu workers land on every GPU.
    for _ in range(args.nj_per_gpu):
        for rank in range(num_gpus):
            rank_queue.put(rank)

    # 3. Scheduling: Split data into chunks for better load balancing
    chunk_size = max(1, args.batch_size)
    tasks = []
    for i in range(0, total_files, chunk_size):
        tasks.append(data_list[i : i + chunk_size])

    logging.info(
        f"Split data into {len(tasks)} chunks (size ~{chunk_size}). "
        f"Spawning {total_workers} workers."
    )

    # 4. Execution
    results = []

    with ProcessPoolExecutor(
        max_workers=total_workers,
        initializer=process_init,
        initargs=(rank_queue, args.model_dir, args.lang),
    ) as executor:

        futures = []
        for chunk in tasks:
            futures.append(
                executor.submit(run_eval_worker, chunk, args.lang, args.batch_size)
            )

        # Unified progress bar
        with tqdm(total=total_files, desc="Eval Progress", dynamic_ncols=True) as pbar:
            for future in as_completed(futures):
                try:
                    chunk_metrics = future.result()
                    results.extend(chunk_metrics)
                    pbar.update(len(chunk_metrics))
                except Exception as e:
                    logging.error(f"Task failed: {e}")

    # 5. Aggregation
    wers, inses, deles, subses = [], [], [], []
    word_nums = 0

    fout = None
    if args.decode_path:
        # FIX: os.makedirs("") raises FileNotFoundError when decode_path
        # is a bare filename with no directory component.
        out_dir = os.path.dirname(args.decode_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        fout = open(args.decode_path, "w", encoding="utf8")
        logging.info(f"Saving detailed WER results to: {args.decode_path}")
        fout.write(
            "Name\tWER\tTruth\tHypothesis\tInsertions\tDeletions\tSubstitutions\n"
        )

    for res in results:
        wers.append(float(res["wer"]))
        inses.append(float(res["insertions"]))
        deles.append(float(res["deletions"]))
        subses.append(float(res["substitutions"]))
        word_nums += res["word_num"]

        if fout:
            fout.write(
                f"{res['wav_path']}\t{res['wer']}\t{res['truth']}\t"
                f"{res['hypo']}\t{res['insertions']}\t{res['deletions']}\t"
                f"{res['substitutions']}\n"
            )

    # Seed-TTS convention: mean of per-utterance WERs.
    wer_avg = round(np.mean(wers) * 100, 2) if wers else float("nan")
    # Conventional corpus-level WER: total errors over total words.
    wer_weighted = (
        round(
            (np.sum(subses) + np.sum(deles) + np.sum(inses)) / word_nums * 100, 2
        )
        if word_nums > 0
        else float("nan")
    )

    inse_sum = np.sum(inses)
    dele_sum = np.sum(deles)
    subs_sum = np.sum(subses)

    print("-" * 50)
    logging.info(f"Processed {len(results)}/{total_files} files.")
    seedtts_wer_info = f"Seed-TTS WER (Avg of WERs): {wer_avg}%"
    wer_info = f"WER (Weighted): {wer_weighted}%"
    detailed_info = (
        f"Errors: {inse_sum} ins, {dele_sum} del, {subs_sum} sub / {word_nums} words"
    )
    logging.info(seedtts_wer_info)
    logging.info(wer_info)
    logging.info(detailed_info)
    print("-" * 50)

    if fout:
        fout.write(seedtts_wer_info + "\n" + wer_info + "\n" + detailed_info + "\n")
        fout.close()
410
+
411
+
412
# Script entry point.
if __name__ == "__main__":
    main()
omnivoice/eval/wer/sensevoice.py ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ Computes Character Error Rate (CER) for Cantonese (yue) using SenseVoiceSmall.
20
+ """
21
+
22
+ import argparse
23
+ import logging
24
+ import multiprocessing as mp
25
+ import os
26
+ import re
27
+ import traceback
28
+ from concurrent.futures import ProcessPoolExecutor, as_completed
29
+ from pathlib import Path
30
+
31
+ import cn2an
32
+ import torch
33
+ import zhconv
34
+ from tqdm import tqdm
35
+
36
+ from omnivoice.eval.wer.common import log_metrics, process_one
37
+ from omnivoice.eval.wer.text_norm_omni import text_normalize
38
+ from omnivoice.utils.data_utils import read_test_list
39
+
40
+ # --- Global variables for worker processes ---
41
+ worker_sensevoice = None
42
+ worker_device = None
43
+
44
+
45
def get_parser():
    """Build the command-line interface for the Cantonese CER evaluation.

    Returns:
        argparse.ArgumentParser: parser exposing wav/model paths, batching,
        chunking, and worker-count options.
    """
    parser = argparse.ArgumentParser(
        description="Computes CER for Cantonese using SenseVoiceSmall.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # Bind once; every option below goes through the same entry point.
    add = parser.add_argument
    add(
        "--wav-path",
        type=str,
        required=True,
        help="Path to the directory containing speech files.",
    )
    add(
        "--extension",
        type=str,
        default="wav",
        help="Extension of the speech files. Default: wav",
    )
    add(
        "--decode-path",
        type=str,
        default=None,
        help="Path to the output file where CER information will be saved. ",
    )
    add(
        "--model-dir",
        type=str,
        required=True,
        help="Local path of evaluation models repository. ",
    )
    add(
        "--test-list",
        type=str,
        default="test.jsonl",
        help="path of the JSONL test list.",
    )
    add(
        "--batch-size",
        type=int,
        default=16,
        help="Batch size for decoding.",
    )
    add("--nj-per-gpu", type=int, default=1, help="Number of workers per GPU.")
    add(
        "--chunk-size",
        type=int,
        default=10,
        help="Number of samples per task chunk sent to workers.",
    )
    return parser
99
+
100
+
101
def load_sensevoice_model(model_dir, device):
    """Load the SenseVoiceSmall ASR model, preferring the local copy.

    FIX: the previous implementation checked the local ``model_path`` and
    warned when missing, but then always loaded the hub identifier
    ``iic/SenseVoiceSmall`` — ignoring the directory that ``--model-dir``
    points at. Now the local model is used when present, and the hub id is
    only a fallback.

    Args:
        model_dir: Root directory of the evaluation-models repository;
            the model is expected under ``wer/SenseVoiceSmall``.
        device: Target device (torch.device or string).

    Returns:
        A FunASR ``AutoModel`` instance.
    """
    model_path = os.path.join(model_dir, "wer/SenseVoiceSmall")
    if not os.path.exists(model_path):
        logging.warning(
            f"SenseVoiceSmall not found at {model_path}. "
            f"Please ensure it is present in eval models."
        )
        # Fall back to the model-hub identifier so evaluation can still run.
        model_path = "iic/SenseVoiceSmall"

    logging.info(f"Loading SenseVoice model on {device}...")

    # FunASR logs noisily at load time; silence and restore afterwards.
    previous_level = logging.root.manager.disable
    logging.disable(logging.CRITICAL)

    try:
        from funasr import AutoModel

        model = AutoModel(
            model=model_path,
            device=str(device),
            disable_update=True,
            disable_pbar=True,
            verbose=False,
        )
    finally:
        logging.disable(previous_level)

    return model
129
+
130
+
131
def _worker_setup(rank_queue):
    """Pin the current worker process to one GPU drawn from the shared queue.

    Sets the module-global ``worker_device`` as a side effect.
    """
    global worker_device

    # Parallelism comes from processes; keep per-process threading small.
    torch.set_num_threads(2)

    try:
        gpu_index = rank_queue.get(timeout=10)
    except Exception:
        raise RuntimeError("Failed to get GPU rank from queue.")

    assert torch.cuda.is_available(), "CUDA is required but not available."
    worker_device = torch.device(f"cuda:{gpu_index}")
    torch.cuda.set_device(gpu_index)

    logging.info(f"Initializing worker on device: {worker_device}")
146
+
147
+
148
def process_init_sensevoice(rank_queue, model_dir):
    """ProcessPoolExecutor initializer: bind a GPU, then load SenseVoice.

    Stores the loaded model in the module-global ``worker_sensevoice`` so
    ``run_eval_worker_sensevoice`` can reuse it for every chunk handled by
    this process.
    """
    global worker_sensevoice

    _worker_setup(rank_queue)

    try:
        worker_sensevoice = load_sensevoice_model(model_dir, worker_device)
        # Guard against a loader that returned nothing usable.
        if worker_sensevoice is None:
            raise RuntimeError("SenseVoice model loading failed.")
    except Exception as e:
        logging.critical(f"Failed to load SenseVoice model on {worker_device}: {e}")
        raise e
160
+
161
+
162
def post_process(text: str, lang: str) -> str:
    """Normalize a transcript for Cantonese CER scoring.

    Applies the shared multilingual normalizer, converts traditional to
    simplified characters, spells arabic numerals as Chinese characters,
    then space-separates every character for per-character scoring.
    """
    assert lang == "yue", "this script is designed for Cantonese (yue) evaluation only."

    normalized = text_normalize(
        text,
        iso_code="yue",
        lower_case=True,
        remove_numbers=False,
        remove_brackets=False,
    )
    simplified = zhconv.convert(normalized, "zh-cn")
    spelled = cn2an.transform(simplified, "an2cn")

    # Drop existing spaces, then reinsert one between every character.
    characters = [ch for ch in spelled.replace(" ", "")]
    return " ".join(characters).lower().strip()
183
+
184
+
185
def run_eval_worker_sensevoice(data_chunk, batch_size):
    """Transcribe one chunk of wav files with SenseVoice and score CER.

    Runs inside a worker process and relies on the module-global model
    created by ``process_init_sensevoice``.

    Args:
        data_chunk: List of dicts with "wav_path", "truth_text", and
            optional "lang_name" keys.
        batch_size: Decoding batch size.

    Returns:
        List of per-utterance metric dicts (augmented with "wav_path" and
        "lang_name"); an empty list on any failure so the pool keeps going.
    """
    global worker_sensevoice
    if worker_sensevoice is None:
        logging.error("SenseVoice worker pipeline is not initialized!")
        return []

    metrics_buffer = []
    try:
        wav_paths = [item["wav_path"] for item in data_chunk]

        for i in range(0, len(wav_paths), batch_size):
            batch_paths = wav_paths[i : i + batch_size]

            # SenseVoice generate call, target lang mapped to yue
            res_batch = worker_sensevoice.generate(
                input=batch_paths,
                batch_size=batch_size,
                language="yue",
                use_itn=False,
                disable_pbar=True,
            )

            for j, res in enumerate(res_batch):
                hypothesis = res["text"]
                # SenseVoice may format output with language tags,
                # cleaning basic tags if any (e.g. "<|yue|>").
                hypothesis = re.sub(r"<\|[^|]*\|>", "", hypothesis).strip()

                # i + j maps each result back to its reference item.
                ref_item = data_chunk[i + j]
                truth = ref_item["truth_text"]
                wav_path = ref_item["wav_path"]
                lang_name = ref_item.get("lang_name")

                m = process_one(hypothesis, truth, post_process, "yue")
                m["wav_path"] = wav_path
                m["lang_name"] = lang_name
                metrics_buffer.append(m)

    except Exception:
        logging.error(f"SenseVoice worker failed on chunk:\n{traceback.format_exc()}")
        return []

    return metrics_buffer
228
+
229
+
230
def main():
    """Entry point: evaluate Cantonese CER with SenseVoice over GPU workers.

    Pipeline: read the JSONL test list, keep only ``language_id == "yue"``
    samples whose audio exists, split them into chunks, decode in a
    ProcessPoolExecutor (one SenseVoice model per worker, GPUs shared
    round-robin via a rank queue), then aggregate and report CER.
    """
    parser = get_parser()
    args = parser.parse_args()

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
        force=True,
    )

    logging.info("Reading test list and filtering for Cantonese (yue)...")
    yue_items = []
    wav_root = Path(args.wav_path)

    samples = read_test_list(args.test_list)
    for s in samples:
        lang_id = s.get("language_id", "")
        if lang_id != "yue":
            continue

        wav_path = str(wav_root / f"{s['id']}.{args.extension}")
        if not os.path.exists(wav_path):
            logging.warning(f"File missing: {wav_path}")
            continue

        yue_items.append(
            {
                "wav_path": wav_path,
                "truth_text": s["text"],
                "lang_id": "yue",
                "lang_name": s.get("language_name", "Cantonese"),
            }
        )

    logging.info(f"Total Cantonese files found: {len(yue_items)}.")
    if len(yue_items) == 0:
        logging.warning("No files to evaluate. Exiting.")
        return

    num_gpus = torch.cuda.device_count()
    assert num_gpus > 0, "No GPU found. GPU is required."
    total_workers = num_gpus * args.nj_per_gpu

    mp.set_start_method("spawn", force=True)
    manager = mp.Manager()

    # Split into fixed-size chunks for load balancing across workers.
    chunk_size = args.chunk_size
    tasks = []
    for i in range(0, len(yue_items), chunk_size):
        tasks.append(yue_items[i : i + chunk_size])

    results = []
    rank_queue = manager.Queue()
    # Each worker pops one rank: nj_per_gpu workers land on every GPU.
    for _ in range(args.nj_per_gpu):
        for rank in range(num_gpus):
            rank_queue.put(rank)

    with ProcessPoolExecutor(
        max_workers=total_workers,
        initializer=process_init_sensevoice,
        initargs=(rank_queue, args.model_dir),
    ) as executor:

        futures = []
        for chunk in tasks:
            futures.append(
                executor.submit(run_eval_worker_sensevoice, chunk, args.batch_size)
            )

        with tqdm(
            total=len(yue_items),
            desc="SenseVoice Eval (Cantonese)",
            dynamic_ncols=True,
        ) as pbar:
            for future in as_completed(futures):
                try:
                    chunk_metrics = future.result()
                    results.extend(chunk_metrics)
                    pbar.update(len(chunk_metrics))
                except Exception as e:
                    logging.error(f"Task failed: {e}")

    # Metrics Aggregation
    inses, deles, subses = [], [], []
    word_nums = 0

    fout = None
    if args.decode_path:
        # FIX: os.makedirs("") raises FileNotFoundError when decode_path
        # is a bare filename with no directory component.
        out_dir = os.path.dirname(args.decode_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        logging.info(f"Saving detailed CER results to: {args.decode_path}")
        fout = open(args.decode_path, "w", encoding="utf-8")

    for res in results:
        inses.append(float(res["insertions"]))
        deles.append(float(res["deletions"]))
        subses.append(float(res["substitutions"]))
        word_nums += res["word_num"]

        if fout:
            fout.write(
                f"{res['wav_path']}\t{res['wer']}\t{res['truth']}\t"
                f"{res['hypo']}\t{res['insertions']}\t{res['deletions']}\t"
                f"{res['substitutions']}\n"
            )

    print("-" * 50)
    if word_nums > 0:
        log_metrics(fout, "[yue] Cantonese", inses, deles, subses, word_nums)

    if fout:
        fout.close()
341
+
342
+
# Script entry point.
if __name__ == "__main__":
    main()
omnivoice/eval/wer/text_norm_omni.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ """
9
+ This module contains the text normalization function for WER evaluation.
10
+ Copied from https://github.com/facebookresearch/omnilingual-asr/blob/81f51e224ce9e74b02cc2a3eaf21b2d91d743455/workflows/dataprep/text_tools.py
11
+ """
12
+
13
+ import re
14
+ import unicodedata
15
+
16
+ from unidecode import unidecode
17
+
18
+ import omnivoice.eval.wer.norm_config_module as norm_config_module
19
+
20
+ norm_config = norm_config_module.norm_config # type: ignore
21
+
22
+
23
def text_normalize(
    text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False
):
    """Given a text, normalize it by changing to lower case, removing
    punctuations, removing words that only contain digits and removing
    extra spaces.

    Args:
        text: The string to be normalized.
        iso_code: Language ISO code used to select a normalization config;
            unknown codes fall back to the "*" default config.
        lower_case: Whether lower-casing is applied (only if the selected
            config also enables it).
        remove_numbers: Boolean flag to specify if words containing only
            digits should be removed.
        remove_brackets: Whether all parenthesized spans are removed (spans
            containing digits are always removed).

    Returns:
        normalized_text: the string after all normalization.
    """
    # Merge the language-specific entries over the "*" defaults into a
    # fresh dict. FIX: the previous field-by-field back-fill mutated the
    # shared module-level `norm_config` entry in place, and omitted
    # "rm_diacritics" from the fill list, so a per-language config lacking
    # that key raised KeyError below.
    config = dict(norm_config["*"])
    config.update(norm_config.get(iso_code, {}))

    text = unicodedata.normalize(config["unicode_norm"], text)

    # Convert to lower case (only when both caller and config allow it).
    if config["lower_case"] and lower_case:
        text = text.lower()

    # brackets

    # always remove text inside brackets with numbers in them.
    # Usually corresponds to "(Sam 23:17)"
    text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
    if remove_brackets:
        text = re.sub(r"\([^\)]*\)", " ", text)

    # Apply per-language regex mappings.
    for old, new in config["mapping"].items():
        text = re.sub(old, new, text)

    # Replace punctuations with space.
    punct_pattern = r"[" + config["punc_set"] + "]"
    normalized_text = re.sub(punct_pattern, " ", text)

    # Remove characters in delete list outright.
    delete_pattern = r"[" + config["del_set"] + "]"
    normalized_text = re.sub(delete_pattern, "", normalized_text)

    # Remove words containing only digits.
    # We check for 3 cases: a) text starts with a number b) a number is
    # present somewhere in the middle c) the text ends with a number.
    # For each case we use lookaround patterns to require surrounding
    # whitespace; lookarounds enable overlapping matches to be replaced.
    if remove_numbers:
        digits_pattern = "[" + config["digit_set"] + "]+"
        complete_digit_pattern = (
            r"^"
            + digits_pattern
            + r"(?=\s)|(?<=\s)"
            + digits_pattern
            + r"(?=\s)|(?<=\s)"
            + digits_pattern
            + "$"
        )
        normalized_text = re.sub(complete_digit_pattern, " ", normalized_text)

    # Optionally strip diacritics via ASCII transliteration.
    if config.get("rm_diacritics"):
        normalized_text = unidecode(normalized_text)

    # Remove extra spaces
    normalized_text = re.sub(r"\s+", " ", normalized_text).strip()

    return normalized_text
omnivoice/models/__init__.py ADDED
File without changes
omnivoice/models/omnivoice.py ADDED
@@ -0,0 +1,1502 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Core OmniVoice model implementation.
19
+
20
+ Defines the ``OmniVoice`` model class, generation config, and inference pipeline.
21
+ This is the main entry point for both inference and training:
22
+
23
+ - **Inference**: ``OmniVoice.from_pretrained()`` loads the model, then
24
+ ``model.generate()`` supports voice cloning, voice design, and auto voice.
25
+ - **Training**: ``model.forward()`` computes the training loss; the model is
26
+ built and used by ``omnivoice.training.builder`` and ``omnivoice.training.trainer``.
27
+
28
+ """
29
+
30
+ import difflib
31
+ import logging
32
+ import math
33
+ import os
34
+ import re
35
+ from dataclasses import dataclass, fields
36
+ from functools import partial
37
+ from typing import Any, List, Optional, Union
38
+
39
+ import torch
40
+ import torch.nn as nn
41
+ import torch.nn.functional as F
42
+ import torchaudio
43
+ from torch.nn.attention.flex_attention import create_block_mask
44
+ from transformers import (
45
+ AutoFeatureExtractor,
46
+ AutoModel,
47
+ AutoTokenizer,
48
+ HiggsAudioV2TokenizerModel,
49
+ PretrainedConfig,
50
+ PreTrainedModel,
51
+ )
52
+ from transformers.modeling_outputs import ModelOutput
53
+ from transformers.models.auto import CONFIG_MAPPING, AutoConfig
54
+
55
+ from omnivoice.utils.audio import (
56
+ cross_fade_chunks,
57
+ fade_and_pad_audio,
58
+ load_audio,
59
+ remove_silence,
60
+ trim_long_audio,
61
+ )
62
+ from omnivoice.utils.duration import RuleDurationEstimator
63
+ from omnivoice.utils.lang_map import LANG_IDS, LANG_NAMES
64
+ from omnivoice.utils.text import add_punctuation, chunk_text_punctuation
65
+ from omnivoice.utils.voice_design import (
66
+ _INSTRUCT_ALL_VALID,
67
+ _INSTRUCT_EN_TO_ZH,
68
+ _INSTRUCT_MUTUALLY_EXCLUSIVE,
69
+ _INSTRUCT_VALID_EN,
70
+ _INSTRUCT_VALID_ZH,
71
+ _INSTRUCT_ZH_TO_EN,
72
+ _ZH_RE,
73
+ )
74
+
75
+ logger = logging.getLogger(__name__)
76
+
77
+
78
+ # ---------------------------------------------------------------------------
79
+ # Dataclasses
80
+ # ---------------------------------------------------------------------------
81
+
82
+
83
@dataclass
class VoiceClonePrompt:
    """Pre-encoded voice-cloning prompt, reusable across `generate` calls.

    Built by ``OmniVoice.create_voice_clone_prompt`` from a reference audio
    clip and its transcript.
    """

    # Discrete audio codes of the reference clip, shape (num_codebooks, num_frames).
    ref_audio_tokens: torch.Tensor  # (C, T)
    # Transcript of the reference audio (auto-transcribed if not user-provided).
    ref_text: str
    # RMS amplitude of the original reference waveform, measured before any
    # loudness boost; used later to scale the generated audio back down.
    ref_rms: float
88
+
89
+
90
@dataclass
class OmniVoiceGenerationConfig:
    """Hyper-parameters controlling OmniVoice inference.

    The fields mirror the keyword arguments accepted by
    ``OmniVoice.generate`` — see its docstring for per-field semantics.
    """

    num_step: int = 32  # iterative decoding steps
    guidance_scale: float = 2.0  # classifier-free guidance scale
    t_shift: float = 0.1  # time-step shift (smaller -> emphasise low-SNR steps)
    layer_penalty_factor: float = 5.0  # bias earlier codebook layers to unmask first
    position_temperature: float = 5.0  # temperature for position selection
    class_temperature: float = 0.0  # temperature for token sampling (0 = greedy)
    denoise: bool = True  # prepend the <|denoise|> token
    preprocess_prompt: bool = True  # silence-trim / punctuate the reference prompt
    postprocess_output: bool = True  # remove silence, fade and pad the output
    audio_chunk_duration: float = 15.0  # seconds of audio per text chunk
    audio_chunk_threshold: float = 30.0  # chunk only when estimate exceeds this

    @classmethod
    def from_dict(cls, kwargs_dict):
        """Build a config from a dict, keeping only known field names.

        Unknown keys are ignored so that callers may forward arbitrary
        ``**kwargs``, but they are logged — otherwise a typo such as
        ``num_steps`` (instead of ``num_step``) would be silently dropped.
        """
        valid_keys = {f.name for f in fields(cls)}
        unknown = set(kwargs_dict) - valid_keys
        if unknown:
            # Surface likely typos instead of swallowing them silently.
            logging.getLogger(__name__).warning(
                "Ignoring unknown generation options: %s", sorted(unknown)
            )
        filtered = {k: v for k, v in kwargs_dict.items() if k in valid_keys}
        return cls(**filtered)
109
+
110
+
111
@dataclass
class GenerationTask:
    """A (possibly batched) generation request: one entry per output item.

    All list fields are parallel, indexed by item; ``speed`` is optional and,
    when present, also parallel.
    """

    batch_size: int
    texts: List[str]
    target_lens: List[int]
    langs: List[Optional[str]]
    instructs: List[Optional[str]]
    ref_texts: List[Optional[str]]
    ref_audio_tokens: List[Optional[torch.Tensor]]
    ref_rms: List[Optional[float]]
    speed: Optional[List[float]] = None

    def get_indices(self, config: OmniVoiceGenerationConfig, frame_rate: int):
        """Partition item indices into (short, long) by estimated token count.

        Items at or below ``audio_chunk_threshold`` seconds worth of frames
        are "short"; the rest are "long" and will be chunked.
        """
        limit = int(config.audio_chunk_threshold * frame_rate)
        short_idx: List[int] = []
        long_idx: List[int] = []
        for idx, length in enumerate(self.target_lens):
            (short_idx if length <= limit else long_idx).append(idx)
        return short_idx, long_idx

    def slice_task(self, indices: List[int]):
        """Return a sub-task with only the given items, or None if empty."""
        if not indices:
            return None

        def pick(seq):
            # Gather the selected items, preserving the order of `indices`.
            return [seq[i] for i in indices]

        return GenerationTask(
            batch_size=len(indices),
            texts=pick(self.texts),
            target_lens=pick(self.target_lens),
            langs=pick(self.langs),
            instructs=pick(self.instructs),
            ref_texts=pick(self.ref_texts),
            ref_audio_tokens=pick(self.ref_audio_tokens),
            ref_rms=pick(self.ref_rms),
            speed=pick(self.speed) if self.speed else None,
        )
143
+
144
+
145
@dataclass
class OmniVoiceModelOutput(ModelOutput):
    """Output of :meth:`OmniVoice.forward`."""

    # Weighted cross-entropy training loss (None when no labels are given).
    loss: Optional[torch.Tensor] = None
    # Audio-token logits of shape (batch, num_codebooks, seq_len, vocab).
    logits: Optional[torch.Tensor] = None
149
+
150
+
151
+ # ---------------------------------------------------------------------------
152
+ # Config & Model
153
+ # ---------------------------------------------------------------------------
154
+
155
+
156
class OmniVoiceConfig(PretrainedConfig):
    """Configuration for :class:`OmniVoice`.

    Args:
        audio_vocab_size: Size of each audio codebook vocabulary, including
            the mask token (1024 codes + 1 mask id by default).
        audio_mask_id: ID of the mask token inside each codebook.
        num_audio_codebook: Number of audio codebook layers.
        audio_codebook_weights: Per-codebook loss weights; the default
            decreases with codebook depth.
        llm_config: Config (or plain dict) of the backbone LLM.
    """

    model_type = "omnivoice"
    sub_configs = {"llm_config": AutoConfig}

    def __init__(
        self,
        audio_vocab_size: int = 1025,
        audio_mask_id: int = 1024,
        num_audio_codebook: int = 8,
        audio_codebook_weights: Optional[list[float]] = None,
        llm_config: Optional[Union[dict, PretrainedConfig]] = None,
        **kwargs,
    ):
        # Re-hydrate a serialized (dict) llm_config into its concrete
        # config class via the transformers model-type registry.
        if isinstance(llm_config, dict):
            llm_config = CONFIG_MAPPING[llm_config["model_type"]](**llm_config)

        self.llm_config = llm_config

        # PretrainedConfig consumes the remaining generic kwargs.
        super().__init__(**kwargs)
        self.audio_vocab_size = audio_vocab_size
        self.audio_mask_id = audio_mask_id
        self.num_audio_codebook = num_audio_codebook
        # None default avoids a shared mutable default argument.
        if audio_codebook_weights is None:
            audio_codebook_weights = [8, 8, 6, 6, 4, 4, 2, 2]
        self.audio_codebook_weights = audio_codebook_weights
182
+
183
+
184
class OmniVoice(PreTrainedModel):
    """OmniVoice speech-generation model.

    Wraps an LLM backbone with multi-codebook audio embeddings and
    per-codebook prediction heads; supports voice cloning, voice design,
    and auto-voice generation (see :meth:`generate`).
    """

    # Attention back-end capabilities advertised to transformers.
    _supports_flex_attn = True
    _supports_flash_attn_2 = True
    config_class = OmniVoiceConfig
188
+
189
    def __init__(self, config: OmniVoiceConfig, llm: Optional[PreTrainedModel] = None):
        """Build the model: LLM backbone + audio embedding table + audio heads.

        Args:
            config: Model configuration.
            llm: Optional pre-built backbone; if None, one is created from
                ``config.llm_config``.
        """
        super().__init__(config)

        if llm is not None:
            # If an LLM instance is provided, use it directly
            # (skipping config-based init).
            self.llm = llm
        else:
            # Otherwise, initialize the LLM from the config.
            self.llm = AutoModel.from_config(self.config.llm_config)

        # One shared embedding table for all codebooks: layer k occupies the
        # ID range [k * audio_vocab_size, (k + 1) * audio_vocab_size).
        self.audio_embeddings = nn.Embedding(
            config.num_audio_codebook * config.audio_vocab_size,
            self.config.llm_config.hidden_size,
        )
        # Per-layer ID offsets; registered as a buffer so they follow the
        # model's device/dtype moves and checkpointing.
        self.register_buffer(
            "codebook_layer_offsets",
            torch.arange(config.num_audio_codebook) * config.audio_vocab_size,
        )

        # Single projection producing logits for every codebook at once.
        self.audio_heads = nn.Linear(
            self.config.llm_config.hidden_size,
            config.num_audio_codebook * config.audio_vocab_size,
            bias=False,
        )

        # Normalise the per-codebook loss weights to sum to 1.
        self.normalized_audio_codebook_weights = [
            w / sum(config.audio_codebook_weights)
            for w in config.audio_codebook_weights
        ]

        self.post_init()

        # Inference-only attributes (set by from_pretrained when not in train mode)
        self.text_tokenizer = None
        self.audio_tokenizer = None
        self.duration_estimator = None
        self.sampling_rate = None
        self._asr_pipe = None
228
+
229
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        """Load OmniVoice plus (unless training) its inference components.

        Extra keyword arguments, popped before delegating to
        ``PreTrainedModel.from_pretrained``:
            train: If True, load only the core model (no tokenizers / ASR).
            load_asr: If True, also load the Whisper ASR model.
            asr_model_name: HF model name for the ASR model.
        """
        train_mode = kwargs.pop("train", False)
        load_asr = kwargs.pop("load_asr", False)
        asr_model_name = kwargs.pop("asr_model_name", "openai/whisper-large-v3-turbo")

        # Suppress noisy INFO logs from transformers/huggingface_hub during loading
        _prev_disable = logging.root.manager.disable
        logging.disable(logging.INFO)

        try:
            model = super().from_pretrained(
                pretrained_model_name_or_path, *args, **kwargs
            )

            if not train_mode:
                # Resolve local path for audio tokenizer subdirectory
                if os.path.isdir(pretrained_model_name_or_path):
                    resolved_path = pretrained_model_name_or_path
                else:
                    from huggingface_hub import snapshot_download

                    resolved_path = snapshot_download(pretrained_model_name_or_path)

                model.text_tokenizer = AutoTokenizer.from_pretrained(
                    pretrained_model_name_or_path
                )

                audio_tokenizer_path = os.path.join(resolved_path, "audio_tokenizer")

                if not os.path.isdir(audio_tokenizer_path):
                    # Fallback to the HuggingFace Hub path of transformers'
                    # HiggsAudioV2Tokenizer if the local subdirectory doesn't exist.
                    audio_tokenizer_path = "eustlb/higgs-audio-v2-tokenizer"

                # higgs-audio-v2-tokenizer does not support MPS (output channels > 65536)
                tokenizer_device = (
                    "cpu" if str(model.device).startswith("mps") else model.device
                )
                model.audio_tokenizer = HiggsAudioV2TokenizerModel.from_pretrained(
                    audio_tokenizer_path, device_map=tokenizer_device
                )
                model.feature_extractor = AutoFeatureExtractor.from_pretrained(
                    audio_tokenizer_path
                )

                model.sampling_rate = model.feature_extractor.sampling_rate

                model.duration_estimator = RuleDurationEstimator()

                if load_asr:
                    model.load_asr_model(model_name=asr_model_name)
        finally:
            # Restore the caller's logging disable level even on failure.
            logging.disable(_prev_disable)

        return model
285
+
286
+ # -------------------------------------------------------------------
287
+ # ASR support (optional, for auto-transcription)
288
+ # -------------------------------------------------------------------
289
+
290
+ def load_asr_model(self, model_name: str = "openai/whisper-large-v3-turbo"):
291
+ """Load a Whisper ASR model for reference audio transcription.
292
+
293
+ Args:
294
+ model_name: HuggingFace model name for the Whisper model.
295
+ """
296
+ from transformers import pipeline as hf_pipeline
297
+
298
+ logger.info("Loading ASR model %s ...", model_name)
299
+ asr_dtype = (
300
+ torch.float16 if str(self.device).startswith("cuda") else torch.float32
301
+ )
302
+ self._asr_pipe = hf_pipeline(
303
+ "automatic-speech-recognition",
304
+ model=model_name,
305
+ dtype=asr_dtype,
306
+ device_map=self.device,
307
+ )
308
+ logger.info("ASR model loaded on %s.", self.device)
309
+
310
+ @torch.inference_mode()
311
+ def transcribe(
312
+ self,
313
+ audio: Union[str, tuple[torch.Tensor, int]],
314
+ ) -> str:
315
+ """Transcribe audio using the loaded Whisper ASR model.
316
+
317
+ Args:
318
+ audio: File path or (waveform, sample_rate) tuple.
319
+
320
+ Returns:
321
+ Transcribed text.
322
+ """
323
+ if self._asr_pipe is None:
324
+ raise RuntimeError(
325
+ "ASR model is not loaded. Call model.load_asr_model() first."
326
+ )
327
+
328
+ if isinstance(audio, str):
329
+ return self._asr_pipe(audio)["text"].strip()
330
+ else:
331
+ waveform, sr = audio
332
+ if waveform.dim() == 1:
333
+ waveform = waveform.unsqueeze(0)
334
+ if waveform.size(0) > 1:
335
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
336
+ audio_input = {
337
+ "array": waveform.squeeze(0).cpu().numpy(),
338
+ "sampling_rate": sr,
339
+ }
340
+ return self._asr_pipe(audio_input)["text"].strip()
341
+
342
    def get_input_embeddings(self):
        """Return the text token embedding table of the backbone LLM."""
        return self.llm.get_input_embeddings()
344
+
345
    def set_input_embeddings(self, value):
        """Replace the text token embedding table of the backbone LLM."""
        self.llm.set_input_embeddings(value)
347
+
348
+ def _prepare_embed_inputs(
349
+ self, input_ids: torch.Tensor, audio_mask: torch.Tensor
350
+ ) -> torch.Tensor:
351
+ """
352
+ Prepares embeddings from input_ids of shape (batch_size, layers, seq_length).
353
+ Embedding shape is (batch_size, seq_length, hidden_size).
354
+ """
355
+ text_embeds = self.get_input_embeddings()(input_ids[:, 0, :])
356
+
357
+ # Apply shift to audio IDs based on codebook layer
358
+ # audio_ids: [Batch, 8, Seq]
359
+ # codebook_layer_offsets: [1, 8, 1]
360
+ # Result: Layer 0 ID Layer 1 ID + Layer 2 ID + 2050...
361
+ shifted_ids = (
362
+ input_ids * audio_mask.unsqueeze(1)
363
+ ) + self.codebook_layer_offsets.view(1, -1, 1)
364
+
365
+ # input: [Batch, 8, Seq] -> output: [Batch, Seq, Hidden]
366
+ audio_embeds = self.audio_embeddings(shifted_ids).sum(dim=1)
367
+
368
+ return torch.where(audio_mask.unsqueeze(-1), audio_embeds, text_embeds)
369
+
370
    def forward(
        self,
        input_ids: torch.LongTensor,
        audio_mask: torch.Tensor,
        labels: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        document_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ):
        """Run the backbone and audio heads; compute the loss if labels given.

        Args:
            input_ids: (batch, num_codebooks, seq_len) token IDs; row 0 also
                holds the text tokens for non-audio positions.
            audio_mask: (batch, seq_len) mask marking audio positions.
            labels: (batch, num_codebooks, seq_len) targets; -100 is ignored.
            attention_mask: Optional precomputed attention mask; if None and
                ``document_ids`` is given, a packed block mask is built.
            document_ids: Per-position document IDs for packed sequences.
            position_ids: Optional position IDs forwarded to the LLM.

        Returns:
            :class:`OmniVoiceModelOutput` with ``logits`` of shape
            (batch, num_codebooks, seq_len, vocab) and an optional ``loss``.
        """

        inputs_embeds = self._prepare_embed_inputs(input_ids, audio_mask)

        if attention_mask is None and document_ids is not None:
            # Packed-sequence training: build a FlexAttention block mask so
            # tokens attend only within their own document.
            # NOTE(review): _get_packed_mask is defined elsewhere in this
            # file; using document_ids[0] assumes one shared packing layout
            # for the whole batch — confirm against the collator.
            attention_mask = create_block_mask(
                _get_packed_mask(
                    document_ids[0].to(inputs_embeds.device),
                ),
                B=None,
                H=None,
                Q_LEN=input_ids.size(-1),
                KV_LEN=input_ids.size(-1),
                _compile=True,
                device=inputs_embeds.device,
            )

        llm_outputs = self.llm(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            return_dict=True,
            position_ids=position_ids,
        )
        hidden_states = llm_outputs[0]

        loss = None

        # Project to all codebooks at once. Shape: [B, S, C * Vocab]
        batch_size, seq_len, _ = hidden_states.shape
        logits_flat = self.audio_heads(hidden_states)
        # Shape: [B, S, C, Vocab] -> [B, C, S, Vocab]
        audio_logits = logits_flat.view(
            batch_size,
            seq_len,
            self.config.num_audio_codebook,
            self.config.audio_vocab_size,
        ).permute(0, 2, 1, 3)

        if labels is not None:

            # audio_logits.permute(0, 3, 1, 2):
            #   [Batch, Layer, Seq, Vocab] -> [Batch, Vocab, Layer, Seq]
            # per_token_loss shape: [Batch, Layer, Seq]; -100 entries contribute 0
            per_token_loss = torch.nn.functional.cross_entropy(
                audio_logits.permute(0, 3, 1, 2),
                labels,
                reduction="none",
                ignore_index=-100,
            )
            # valid_mask shape: [Batch, Layer, Seq]
            valid_mask = (labels != -100).float()

            # Mean loss per codebook layer; the clamp avoids 0/0 when a layer
            # has no valid targets.
            # layer_means shape: [num_layers]
            layer_means = (per_token_loss * valid_mask).sum(
                dim=(0, 2)
            ) / valid_mask.sum(dim=(0, 2)).clamp(min=1.0)

            # Weighted sum over layers (weights normalised in __init__).
            weights = torch.tensor(
                self.normalized_audio_codebook_weights, device=audio_logits.device
            )
            loss = (layer_means * weights).sum()

        return OmniVoiceModelOutput(
            loss=loss,
            logits=audio_logits,
        )
444
+
445
    def supported_language_ids(self) -> set[str]:
        """Return the set of supported language IDs (e.g. ``"en"``)."""
        return LANG_IDS
448
+
449
    def supported_language_names(self) -> set[str]:
        """Return the set of supported language names (e.g. ``"English"``)."""
        return LANG_NAMES
452
+
453
+ # -------------------------------------------------------------------
454
+ # Inference API
455
+ # -------------------------------------------------------------------
456
+
457
    @torch.inference_mode()
    def generate(
        self,
        text: Union[str, list[str]],
        language: Union[str, list[str], None] = None,
        ref_text: Union[str, list[str], None] = None,
        ref_audio: Union[
            str,
            list[str],
            tuple[torch.Tensor, int],
            list[tuple[torch.Tensor, int]],
            None,
        ] = None,
        voice_clone_prompt: Union[
            VoiceClonePrompt, list[VoiceClonePrompt], None
        ] = None,
        instruct: Union[str, list[str], None] = None,
        duration: Union[float, list[Optional[float]], None] = None,
        speed: Union[float, list[Optional[float]], None] = None,
        generation_config: Optional[OmniVoiceGenerationConfig] = None,
        **kwargs,
    ) -> list[torch.Tensor]:
        """Generate speech audio given text in various modes.

        Supports three modes:

        1. **Voice clone** — clone the voice style from the reference audio.
           Should provide ``voice_clone_prompt`` (from
           :meth:`create_voice_clone_prompt`) or ``ref_text`` + ``ref_audio``.
        2. **Voice design** — provide ``instruct`` text describing
           the desired voice style; no reference audio needed.
        3. **Auto** — provide neither; the model picks a voice itself.

        Args:
            text: Target text (single string or list for batch).
            language: Language name (e.g. ``"English"``) or code
                (e.g. ``"en"``). ``None`` for language-agnostic mode.
                Performance is slightly better if you specify the language.
            ref_text: Optional reference text for voice cloning mode.
            ref_audio: Optional reference audio for voice cloning mode.
                Can be a file path or a (waveform, sample_rate) tuple.
            voice_clone_prompt: Reusable prompt from :meth:`create_voice_clone_prompt`.
                If provided, it overrides ``ref_text`` and ``ref_audio``.
            instruct: Style instruction for voice design mode.
            duration: Fixed output duration in seconds. If a single float,
                applies to all items; if a list, one value per item.
                ``None`` (default) lets the model estimate duration from text.
                Overrides ``speed`` when both are provided.
            speed: Speaking speed factor. ``> 1.0`` for faster, ``< 1.0`` for
                slower. If a list, one value per item. ``None`` (default) uses
                the model's default estimation.
            generation_config: Explicit config object. If provided, takes
                precedence over ``**kwargs``.
            **kwargs: Generation config or its fields:
                denoise: Whether to prepend the ``<|denoise|>`` token.
                num_step: Number of iterative decoding steps.
                guidance_scale: Classifier-free guidance scale.
                t_shift: Time-step shift (smaller → emphasise low-SNR).
                postprocess_output: Post-process output (remove silence, fade-in/out, pad edges).
                layer_penalty_factor: Penalty encouraging earlier codebook
                    layers to unmask first.
                position_temperature: Temperature for position selection.
                class_temperature: Temperature for token sampling (0 = greedy).
                audio_chunk_duration: If > 0, split long text into chunks of
                    this duration (seconds) and generate chunk by chunk.
                audio_chunk_threshold: Only apply chunking if estimated audio
                    duration exceeds this threshold (seconds).
        Returns:
            ``audios`` a list of 2-D ``torch.Tensor``, with the shape (1, T) and sampling rate
            consistent with the model's audio tokenizer (usually 24000 Hz).
        """

        if self.audio_tokenizer is None or self.text_tokenizer is None:
            raise RuntimeError(
                "Model is not loaded with audio/text tokenizers. Make sure you "
                "loaded the model with OmniVoice.from_pretrained()."
            )
        # An explicit generation_config wins; otherwise build one from kwargs.
        gen_config = (
            generation_config
            if generation_config is not None
            else OmniVoiceGenerationConfig.from_dict(kwargs)
        )

        self.eval()

        # Normalise all per-call arguments into one batched task
        # (creates/encodes voice-clone prompts, estimates target lengths).
        full_task = self._preprocess_all(
            text=text,
            language=language,
            ref_text=ref_text,
            ref_audio=ref_audio,
            voice_clone_prompt=voice_clone_prompt,
            instruct=instruct,
            preprocess_prompt=gen_config.preprocess_prompt,
            speed=speed,
            duration=duration,
        )

        # Split items by estimated length: short ones decode in one pass,
        # long ones are chunked.
        short_idx, long_idx = full_task.get_indices(
            gen_config, self.audio_tokenizer.config.frame_rate
        )

        results = [None] * full_task.batch_size

        if short_idx:
            short_task = full_task.slice_task(short_idx)
            short_results = self._generate_iterative(short_task, gen_config)
            for idx, res in zip(short_idx, short_results):
                results[idx] = res

        if long_idx:
            long_task = full_task.slice_task(long_idx)
            long_results = self._generate_chunked(long_task, gen_config)
            for idx, res in zip(long_idx, long_results):
                results[idx] = res

        # Decode tokens to waveforms in the original input order.
        generated_audios = []
        for i in range(full_task.batch_size):
            assert results[i] is not None, f"Result {i} was not generated"
            generated_audios.append(
                self._decode_and_post_process(
                    results[i], full_task.ref_rms[i], gen_config  # type: ignore[arg-type]
                )
            )

        return generated_audios
582
+
583
    def create_voice_clone_prompt(
        self,
        ref_audio: Union[str, tuple[torch.Tensor, int]],
        ref_text: Optional[str] = None,
        preprocess_prompt: bool = True,
    ) -> VoiceClonePrompt:
        """Create a reusable voice clone prompt from reference audio.

        Args:
            ref_audio: File path (str) or ``(waveform, sample_rate)`` tuple.
                waveform should be a 1-D or 2-D torch.Tensor (channels x samples).
            ref_text: Transcript of the reference audio. If ``None``, the
                ASR model will be used to auto-transcribe (must call
                :meth:`load_asr_model` first).
            preprocess_prompt: If ``True`` (default), apply silence removal and
                trimming to the reference audio, add punctuation in the end
                of reference text (if not already)

        Returns:
            A :class:`VoiceClonePrompt` that can be passed to :meth:`generate`.
        """
        if self.audio_tokenizer is None:
            raise RuntimeError(
                "Audio tokenizer is not loaded. Make sure you loaded the model "
                "with OmniVoice.from_pretrained()."
            )

        # Load / normalise the reference to mono at the model's sampling rate.
        if isinstance(ref_audio, str):
            ref_wav = load_audio(ref_audio, self.sampling_rate)
        else:
            waveform, sr = ref_audio
            if waveform.dim() == 1:
                waveform = waveform.unsqueeze(0)
            if waveform.size(0) > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            if sr != self.sampling_rate:
                waveform = torchaudio.functional.resample(
                    waveform, sr, self.sampling_rate
                )
            ref_wav = waveform

        # Boost very quiet references up to RMS 0.1; the original ref_rms is
        # kept so _post_process_audio can scale the output back down.
        ref_rms = torch.sqrt(torch.mean(torch.square(ref_wav))).item()
        if 0 < ref_rms < 0.1:
            ref_wav = ref_wav * 0.1 / ref_rms

        if preprocess_prompt:
            # Trim long reference audio (>20s) by splitting at the largest silence gap.
            # Skip trimming when ref_text is user-provided, otherwise the
            # trimmed audio will no longer match the full transcript.
            if ref_text is None:
                ref_wav = trim_long_audio(ref_wav, self.sampling_rate)
            elif ref_wav.size(-1) / self.sampling_rate > 20.0:
                logger.warning(
                    "Reference audio is %.1fs long (>20s) and ref_text was "
                    "provided, so automatic trimming is skipped. A long reference "
                    "may cause slower generation and degraded quality.",
                    ref_wav.size(-1) / self.sampling_rate,
                )

            ref_wav = remove_silence(
                ref_wav,
                self.sampling_rate,
                mid_sil=200,
                lead_sil=100,
                trail_sil=200,
            )
            if ref_wav.size(-1) == 0:
                raise ValueError(
                    "Reference audio is empty after silence removal. "
                    "Try setting preprocess_prompt=False."
                )

        # Auto-transcribe if ref_text not provided
        if ref_text is None:
            if self._asr_pipe is None:
                logger.info("ASR model not loaded yet, loading on-the-fly ...")
                self.load_asr_model()
            ref_text = self.transcribe((ref_wav, self.sampling_rate))
            logger.debug("Auto-transcribed ref_text: %s", ref_text)

        # Clip to a whole number of tokenizer hops before encoding.
        chunk_size = self.audio_tokenizer.config.hop_length
        clip_size = int(ref_wav.size(-1) % chunk_size)
        ref_wav = ref_wav[:, :-clip_size] if clip_size > 0 else ref_wav
        ref_audio_tokens = self.audio_tokenizer.encode(
            ref_wav.unsqueeze(0).to(self.audio_tokenizer.device),
        ).audio_codes.squeeze(0)  # (C, T)

        if preprocess_prompt:
            ref_text = add_punctuation(ref_text)

        return VoiceClonePrompt(
            ref_audio_tokens=ref_audio_tokens,
            ref_text=ref_text,
            ref_rms=ref_rms,
        )
680
+
681
+ def _decode_and_post_process(
682
+ self,
683
+ tokens: Union[torch.Tensor, List[torch.Tensor]],
684
+ rms: Union[float, None],
685
+ gen_config: OmniVoiceGenerationConfig,
686
+ ) -> torch.Tensor:
687
+ """
688
+ Args:
689
+ tokens: Audio tokens — either a single tensor of shape
690
+ (num_codebooks, seq_len) or a list of chunk tensors.
691
+ rms: RMS of the reference audio for volume adjustment.
692
+ gen_config: Generation config for post-processing options.
693
+ Returns:
694
+ Decoded and post-processed audio tensor of shape (1, T).
695
+ """
696
+ tokenizer_device = self.audio_tokenizer.device
697
+ if isinstance(tokens, list):
698
+ chunk_audios = [
699
+ self.audio_tokenizer.decode(t.to(tokenizer_device).unsqueeze(0))
700
+ .audio_values[0]
701
+ .cpu()
702
+ for t in tokens
703
+ ]
704
+ audio_waveform = cross_fade_chunks(chunk_audios, self.sampling_rate)
705
+ else:
706
+ audio_waveform = (
707
+ self.audio_tokenizer.decode(tokens.to(tokenizer_device).unsqueeze(0))
708
+ .audio_values[0]
709
+ .cpu()
710
+ )
711
+
712
+ return self._post_process_audio(
713
+ audio_waveform,
714
+ postprocess_output=gen_config.postprocess_output,
715
+ ref_rms=rms,
716
+ )
717
+
718
    def _post_process_audio(
        self,
        generated_audio: torch.Tensor,
        postprocess_output: bool,
        ref_rms: Union[float, None],
    ) -> torch.Tensor:
        """Optionally remove long silences, adjust volume, and add edge padding.

        Args:
            generated_audio: Audio tensor of shape (1, T).
            postprocess_output: If True, remove long silences and apply fade/pad.
            ref_rms: RMS of the reference audio for volume normalisation.
        Returns:
            Processed audio tensor of shape (1, T).
        """
        # NOTE(review): the volume and fade/pad steps below are gated by
        # postprocess_output together with silence removal, matching the
        # generate() docstring for this flag — confirm intended nesting.
        if postprocess_output:
            generated_audio = remove_silence(
                generated_audio,
                self.sampling_rate,
                mid_sil=500,
                lead_sil=100,
                trail_sil=100,
            )

            # A quiet reference was boosted to RMS 0.1 before encoding;
            # scale the output back to the original loudness.
            if ref_rms is not None and ref_rms < 0.1:
                generated_audio = generated_audio * ref_rms / 0.1
            elif ref_rms is None:
                # No reference audio (voice design): peak-normalize to 0.5
                # to avoid clipping while keeping a comfortable volume level.
                peak = generated_audio.abs().max()
                if peak > 1e-6:
                    generated_audio = generated_audio / peak * 0.5

            generated_audio = fade_and_pad_audio(
                generated_audio,
                sample_rate=self.sampling_rate,
            )
        return generated_audio
756
+
757
    def _generate_chunked(
        self, task: GenerationTask, gen_config: OmniVoiceGenerationConfig
    ) -> List[List[torch.Tensor]]:
        """Generate long audio by splitting text into chunks and batching.

        Each item in the returned list corresponds to one input and contains
        a list of audio token tensors — one per text chunk.

        Args:
            task: A :class:`GenerationTask` with one or more items whose
                estimated audio exceeds ``audio_chunk_threshold``.
            gen_config: Generation config (``audio_chunk_duration`` controls
                chunk size).
        Returns:
            Per-item list of chunk token-tensor lists.
        """
        # Chunk each item's text; chunk length in characters is derived from
        # the per-item tokens-per-character ratio so that each chunk yields
        # roughly audio_chunk_duration seconds of audio.
        all_chunks = []
        for i in range(task.batch_size):
            avg_tokens_per_char = task.target_lens[i] / len(task.texts[i])
            text_chunk_len = int(
                gen_config.audio_chunk_duration
                * self.audio_tokenizer.config.frame_rate
                / avg_tokens_per_char
            )
            chunks = chunk_text_punctuation(
                text=task.texts[i],
                chunk_len=text_chunk_len,
                min_chunk_len=3,
            )
            logger.debug(f"Item {i} chunked into {len(chunks)} pieces: {chunks}")
            all_chunks.append(chunks)

        has_ref = [t is not None for t in task.ref_audio_tokens]
        assert all(has_ref) or not any(has_ref), (
            "Chunked inference requires all items to either have or not have "
            "ref_audio. Mixed ref/non-ref is not supported."
        )

        max_num_chunks = max(len(c) for c in all_chunks)

        # chunk_results[item_idx] = list of generated token tensors per chunk
        chunk_results = [[] for _ in range(task.batch_size)]

        def _run_batch(indices, texts, ref_audios, ref_texts):
            # Generate one chunk for each listed item as a single batch and
            # append the results to chunk_results (closure over the outer list).
            # Note the index pairing below: j indexes the chunk-local lists
            # (texts/ref_texts/ref_audios), i indexes the original task.
            speed_list = task.speed
            target_lens = [
                self._estimate_target_tokens(
                    texts[j],
                    ref_texts[j],
                    ref_audios[j].size(-1) if ref_audios[j] is not None else None,
                    speed=speed_list[i] if speed_list else 1.0,
                )
                for j, i in enumerate(indices)
            ]
            sub_task = GenerationTask(
                batch_size=len(indices),
                texts=texts,
                target_lens=target_lens,
                langs=[task.langs[i] for i in indices],
                instructs=[task.instructs[i] for i in indices],
                ref_texts=ref_texts,
                ref_audio_tokens=ref_audios,
                ref_rms=[task.ref_rms[i] for i in indices],
                speed=[task.speed[i] for i in indices] if task.speed else None,
            )
            gen_tokens = self._generate_iterative(sub_task, gen_config)
            for j, idx in enumerate(indices):
                chunk_results[idx].append(gen_tokens[j])

        if all(has_ref):
            # All items have reference audio.
            # We still sequentially generate chunks within each item, but we
            # batch across items for the same chunk index. This allows to keep
            # the VRAM usage manageable while still benefiting from batching.
            for ci in range(max_num_chunks):
                indices = [i for i in range(task.batch_size) if ci < len(all_chunks[i])]
                if not indices:
                    continue
                _run_batch(
                    indices,
                    texts=[all_chunks[i][ci] for i in indices],
                    ref_audios=[task.ref_audio_tokens[i] for i in indices],
                    ref_texts=[task.ref_texts[i] for i in indices],
                )
        else:
            # No reference audio — generate chunk 0 for all items first,
            # then use chunk 0 output as reference for all subsequent chunks.
            indices_0 = [i for i in range(task.batch_size) if len(all_chunks[i]) > 0]
            _run_batch(
                indices_0,
                texts=[all_chunks[i][0] for i in indices_0],
                ref_audios=[None] * len(indices_0),
                ref_texts=[None] * len(indices_0),
            )
            first_chunk_map = {idx: chunk_results[idx][0] for idx in indices_0}

            # Batch all remaining chunks, using chunk 0 as fixed reference
            for ci in range(1, max_num_chunks):
                indices = [i for i in range(task.batch_size) if ci < len(all_chunks[i])]
                if not indices:
                    continue
                _run_batch(
                    indices,
                    texts=[all_chunks[i][ci] for i in indices],
                    ref_audios=[first_chunk_map[i] for i in indices],
                    ref_texts=[all_chunks[i][0] for i in indices],
                )

        return chunk_results
867
+
868
    def _preprocess_all(
        self,
        text: Union[str, list[str]],
        language: Union[str, list[str], None] = None,
        ref_text: Union[str, list[str], None] = None,
        ref_audio: Union[
            str,
            list[str],
            tuple[torch.Tensor, int],
            list[tuple[torch.Tensor, int]],
            None,
        ] = None,
        voice_clone_prompt: Union[
            VoiceClonePrompt, list[VoiceClonePrompt], None
        ] = None,
        instruct: Union[str, list[str], None] = None,
        preprocess_prompt: bool = True,
        speed: Union[float, list[Optional[float]], None] = None,
        duration: Union[float, list[Optional[float]], None] = None,
    ) -> GenerationTask:
        """Normalise all user-facing generation arguments into a GenerationTask.

        Scalars are broadcast to per-item lists of length ``batch_size``
        (derived from ``text``), languages and instructs are validated,
        voice-clone prompts are built from raw reference audio when needed,
        and per-item target token counts are estimated (optionally overridden
        by an exact ``duration``).

        Args:
            text: Target text(s); a single string is treated as batch size 1.
            language: Language ID(s)/name(s), resolved via _resolve_language.
            ref_text: Reference transcript(s); ignored if voice_clone_prompt
                is given.
            ref_audio: Reference audio path(s) or (waveform, sample_rate)
                tuple(s); ignored if voice_clone_prompt is given.
            voice_clone_prompt: Pre-built prompt(s); takes precedence over
                ref_text/ref_audio.
            instruct: Voice-design instruction(s), validated per item.
            preprocess_prompt: Forwarded to create_voice_clone_prompt.
            speed: Global or per-item speed factor(s); entries may be None.
            duration: Global or per-item exact duration(s) in seconds;
                overrides speed for items where it is set.

        Returns:
            A GenerationTask with per-item texts, target lengths, languages,
            instructions, reference data, and effective speed factors.
        """

        if isinstance(text, str):
            text_list = [text]
        else:
            assert isinstance(
                text, list
            ), "text should be a string or a list of strings"
            text_list = text
        batch_size = len(text_list)

        language_list = self._ensure_list(language, batch_size)
        language_list = [_resolve_language(lang) for lang in language_list]
        instruct_list = self._ensure_list(instruct, batch_size)
        for i, s in enumerate(instruct_list):
            if s is None:
                continue
            # Instructs are rendered in Chinese when the target text itself
            # contains CJK characters (unless overridden by dialect/accent).
            use_zh = bool(text_list[i] and _ZH_RE.search(text_list[i]))
            instruct_list[i] = _resolve_instruct(s, use_zh=use_zh)

        if voice_clone_prompt is not None and (
            ref_text is not None or ref_audio is not None
        ):
            logger.warning(
                "Both voice_clone_prompt and ref_text/ref_audio are provided. "
                "ref_text/ref_audio will be ignored."
            )
        if voice_clone_prompt is None and ref_audio is not None:
            # If voice_clone_prompt is not provided, create it from
            # ref_audio (ref_text will be auto-transcribed if not given).
            ref_text_list = self._ensure_list(ref_text, batch_size, auto_repeat=False)
            ref_audio_list = self._ensure_list(ref_audio, batch_size, auto_repeat=False)

            # NOTE(review): with auto_repeat=False these two lists may have
            # different lengths (1 vs batch_size); the loop below indexes
            # ref_audio_list by the length of ref_text_list — confirm callers
            # always pass matching shapes.
            voice_clone_prompt = []
            for i in range(len(ref_text_list)):
                voice_clone_prompt.append(
                    self.create_voice_clone_prompt(
                        ref_audio=ref_audio_list[i],
                        ref_text=ref_text_list[i],
                        preprocess_prompt=preprocess_prompt,
                    )
                )

        voice_clone_prompt_list = self._ensure_list(voice_clone_prompt, batch_size)
        if voice_clone_prompt_list[0] is not None:
            ref_text_list = [vc.ref_text for vc in voice_clone_prompt_list]
            ref_audio_tokens_list = [
                vc.ref_audio_tokens for vc in voice_clone_prompt_list
            ]
            ref_rms_list = [vc.ref_rms for vc in voice_clone_prompt_list]
        else:
            ref_text_list = [None] * batch_size
            ref_audio_tokens_list = [None] * batch_size
            ref_rms_list = [None] * batch_size

        # Normalize speed/duration to per-item lists (may contain None).
        if speed is not None:
            if isinstance(speed, (int, float)):
                user_speed = [float(speed)] * batch_size
            else:
                user_speed = list(speed)
        else:
            user_speed = None

        if duration is not None:
            if isinstance(duration, (int, float)):
                durations = [float(duration)] * batch_size
            else:
                durations = list(duration)
        else:
            durations = None

        num_target_tokens_list = []
        for i in range(batch_size):
            # duration[i] overrides speed for estimation: use speed=1.0
            # to get the raw estimate, then override target_lens below.
            has_dur = durations is not None and durations[i] is not None
            item_speed = 1.0 if has_dur else (user_speed[i] if user_speed else 1.0)
            est = self._estimate_target_tokens(
                text_list[i],
                ref_text_list[i],
                ref_audio_tokens_list[i].size(-1)
                if ref_audio_tokens_list[i] is not None
                else None,
                speed=item_speed,
            )
            num_target_tokens_list.append(est)

        # Per-item duration overrides: set target_lens to exact frame count
        # and compute speed ratio so chunked generation scales proportionally.
        speed_list: Optional[List[float]] = None
        if durations is not None:
            frame_rate = self.audio_tokenizer.config.frame_rate
            speed_list = []
            for i in range(batch_size):
                if durations[i] is not None:
                    target_tokens = max(1, int(durations[i] * frame_rate))
                    est = num_target_tokens_list[i]
                    # speed > 1 means "faster than estimated" (fewer frames).
                    speed_list.append(est / target_tokens if target_tokens > 0 else 1.0)
                    num_target_tokens_list[i] = target_tokens
                else:
                    s = user_speed[i] if user_speed else None
                    speed_list.append(s if s is not None else 1.0)
        elif user_speed is not None:
            speed_list = [s if s is not None else 1.0 for s in user_speed]

        return GenerationTask(
            batch_size=batch_size,
            texts=text_list,
            target_lens=num_target_tokens_list,
            langs=language_list,
            instructs=instruct_list,
            ref_texts=ref_text_list,
            ref_audio_tokens=ref_audio_tokens_list,
            ref_rms=ref_rms_list,
            speed=speed_list,
        )
1004
+
1005
+ def _estimate_target_tokens(self, text, ref_text, num_ref_audio_tokens, speed=1.0):
1006
+ """Estimate number of target audio tokens."""
1007
+ if num_ref_audio_tokens is None or ref_text is None or len(ref_text) == 0:
1008
+ # Fall back to a simple heuristic
1009
+ ref_text = "Nice to meet you."
1010
+ num_ref_audio_tokens = 25
1011
+
1012
+ est = self.duration_estimator.estimate_duration(
1013
+ text, ref_text, num_ref_audio_tokens
1014
+ )
1015
+ if speed > 0 and speed != 1.0:
1016
+ est = est / speed
1017
+ return max(1, int(est))
1018
+
1019
+ def _ensure_list(
1020
+ self, x: Union[Any, List[Any]], batch_size: int, auto_repeat: bool = True
1021
+ ) -> List[Any]:
1022
+ x_list = x if isinstance(x, list) else [x]
1023
+ if len(x_list) not in (
1024
+ 1,
1025
+ batch_size,
1026
+ ):
1027
+ raise ValueError(
1028
+ f"should be either the number of the text or 1, but got {len(x_list)}"
1029
+ )
1030
+ if auto_repeat and len(x_list) == 1 and batch_size is not None:
1031
+ x_list = x_list * batch_size
1032
+ return x_list
1033
+
1034
    def _prepare_inference_inputs(
        self,
        text: str,
        num_target_tokens: int,
        ref_text: Optional[str] = None,
        ref_audio_tokens: Optional[torch.Tensor] = None,
        lang: Optional[str] = None,
        instruct: Optional[str] = None,
        denoise: bool = True,
    ):
        """Prepare input_ids and audio masks for inference.

        The conditional sequence is laid out as:
        ``[style tokens][text tokens][optional ref audio tokens][MASK target]``
        with every text/style token replicated across all codebook rows.

        Args:
            text: Target text to generate.
            num_target_tokens: Number of audio tokens to generate.
            ref_text: Optional reference text for voice cloning.
            ref_audio_tokens: Optional reference audio tokens for voice cloning.
                with shape (C, T).
            lang: Optional language ID.
            instruct: Optional style instruction for voice design.
            denoise: Whether to include the <|denoise|> token.

        Returns:
            Dict with "input_ids" of shape (1, C, N_total) and a boolean
            "audio_mask" of shape (1, N_total) that is True over the audio
            region (reference audio + masked target).
        """

        # Build style tokens: <|denoise|> + <|lang_start|>...<|lang_end|>
        # + <|instruct_start|>...<|instruct_end|>
        style_text = ""
        if denoise:
            style_text += "<|denoise|>"
        # Missing language/instruct are encoded as the literal string "None".
        lang_str = lang if lang else "None"
        instruct_str = instruct if instruct else "None"
        style_text += f"<|lang_start|>{lang_str}<|lang_end|>"
        style_text += f"<|instruct_start|>{instruct_str}<|instruct_end|>"

        style_tokens = (
            self.text_tokenizer(style_text, return_tensors="pt")
            .input_ids.repeat(self.config.num_audio_codebook, 1)
            .unsqueeze(0)
        ).to(
            self.device
        )  # [1, C, N1]

        # Build text tokens (reference transcript prepended when cloning).
        full_text = _combine_text(ref_text=ref_text, text=text)
        text_tokens = (
            self.text_tokenizer(
                f"<|text_start|>{full_text}<|text_end|>",
                return_tensors="pt",
            )
            .input_ids.repeat(self.config.num_audio_codebook, 1)
            .unsqueeze(0)
        ).to(
            self.device
        )  # [1, C, N2]

        # Target: all MASK
        target_audio_tokens = torch.full(
            (1, self.config.num_audio_codebook, num_target_tokens),
            self.config.audio_mask_id,
            dtype=torch.long,
            device=self.device,
        )

        # Conditional input
        parts = [style_tokens, text_tokens]
        if ref_audio_tokens is not None:
            parts.append(ref_audio_tokens.unsqueeze(0).to(self.device))
        parts.append(target_audio_tokens)
        cond_input_ids = torch.cat(parts, dim=2)

        # Audio region starts at the reference audio (if any), otherwise at
        # the masked target.
        cond_total_length = cond_input_ids.shape[2]
        cond_audio_start_idx = cond_total_length - num_target_tokens
        if ref_audio_tokens is not None:
            cond_audio_start_idx -= ref_audio_tokens.size(-1)

        cond_audio_mask = torch.zeros(
            1, cond_total_length, dtype=torch.bool, device=self.device
        )
        cond_audio_mask[0, cond_audio_start_idx:] = True

        return {
            "input_ids": cond_input_ids,
            "audio_mask": cond_audio_mask,
        }
1116
+
1117
    def _generate_iterative(
        self, task: GenerationTask, gen_config: OmniVoiceGenerationConfig
    ) -> List[torch.Tensor]:
        """N-step iterative unmasked decoding.

        Runs classifier-free guidance by stacking conditional rows (0..B-1)
        and unconditional rows (B..2B-1, target region only) into one padded
        forward pass per step, then unmasks a scheduled number of positions
        per item by confidence.

        Args:
            task: A :class:`GenerationTask` containing batch texts, target
                lengths, languages, instructions, and optional reference data.
            gen_config: A :class:`OmniVoiceGenerationConfig` controlling
                decoding steps, guidance, temperatures, etc.
        Returns:
            List of generated audio token tensors of shape (C, T) (one per
            input text).
        """

        B = task.batch_size

        inputs_list = [
            self._prepare_inference_inputs(
                task.texts[i],
                task.target_lens[i],
                task.ref_texts[i],
                task.ref_audio_tokens[i],
                task.langs[i],
                task.instructs[i],
                gen_config.denoise,
            )
            for i in range(B)
        ]

        c_lens = [inp["input_ids"].size(2) for inp in inputs_list]
        max_c_len = max(c_lens)
        pad_id = self.config.audio_mask_id  # Or any other tokens

        # Batched buffers hold cond + uncond rows; padding positions are
        # excluded via the per-item block-diagonal attention mask below.
        batch_input_ids = torch.full(
            (2 * B, self.config.num_audio_codebook, max_c_len),
            pad_id,
            dtype=torch.long,
            device=self.device,
        )
        batch_audio_mask = torch.zeros(
            (2 * B, max_c_len), dtype=torch.bool, device=self.device
        )
        batch_attention_mask = torch.zeros(
            (2 * B, 1, max_c_len, max_c_len), dtype=torch.bool, device=self.device
        )

        for i, inp in enumerate(inputs_list):
            c_len, u_len = c_lens[i], task.target_lens[i]

            # Cond (0 ~ B-1)
            batch_input_ids[i, :, :c_len] = inp["input_ids"]
            batch_audio_mask[i, :c_len] = inp["audio_mask"]
            batch_attention_mask[i, :, :c_len, :c_len] = True

            # Uncond (B ~ 2B-1): only the trailing target region, no text/style.
            batch_input_ids[B + i, :, :u_len] = inp["input_ids"][..., -u_len:]
            batch_audio_mask[B + i, :u_len] = inp["audio_mask"][..., -u_len:]
            batch_attention_mask[B + i, :, :u_len, :u_len] = True

        # Working copy of target tokens, all MASK at the start.
        tokens = torch.full(
            (B, self.config.num_audio_codebook, max(task.target_lens)),
            self.config.audio_mask_id,
            dtype=torch.long,
            device=self.device,
        )

        timesteps = _get_time_steps(
            t_start=0.0,
            t_end=1.0,
            num_step=gen_config.num_step + 1,
            t_shift=gen_config.t_shift,
        ).tolist()
        # Per-item unmask schedule: number of positions revealed per step,
        # proportional to timestep deltas; the last step takes the remainder.
        schedules = []
        for t_len in task.target_lens:
            total_mask = t_len * self.config.num_audio_codebook
            rem = total_mask
            sched = []
            for step in range(gen_config.num_step):
                num = (
                    rem
                    if step == gen_config.num_step - 1
                    else min(
                        math.ceil(total_mask * (timesteps[step + 1] - timesteps[step])),
                        rem,
                    )
                )
                sched.append(int(num))
                rem -= int(num)
            schedules.append(sched)

        # Codebook index per row, used to bias selection toward lower layers.
        layer_ids = torch.arange(
            self.config.num_audio_codebook, device=self.device
        ).view(1, -1, 1)

        for step in range(gen_config.num_step):
            batch_logits = self(
                input_ids=batch_input_ids,
                audio_mask=batch_audio_mask,
                attention_mask=batch_attention_mask,
            ).logits.to(torch.float32)

            for i in range(B):
                k = schedules[i][step]
                if k <= 0:
                    continue

                c_len, t_len = c_lens[i], task.target_lens[i]

                # Extract real target Logits
                # [1, C, T, V]
                c_logits = batch_logits[i : i + 1, :, c_len - t_len : c_len, :]
                u_logits = batch_logits[B + i : B + i + 1, :, :t_len, :]

                pred_tokens, scores = self._predict_tokens_with_scoring(
                    c_logits, u_logits, gen_config
                )

                # Penalise higher codebook layers so they unmask later.
                scores = scores - (layer_ids * gen_config.layer_penalty_factor)

                if gen_config.position_temperature > 0.0:
                    scores = _gumbel_sample(scores, gen_config.position_temperature)

                # Only still-masked positions compete for unmasking.
                sample_tokens = tokens[i : i + 1, :, :t_len]
                scores.masked_fill_(
                    sample_tokens != self.config.audio_mask_id, -float("inf")
                )

                _, topk_idx = torch.topk(scores.flatten(), k)
                flat_tokens = sample_tokens.flatten()
                flat_tokens[topk_idx] = pred_tokens.flatten()[topk_idx]
                sample_tokens.copy_(flat_tokens.view_as(sample_tokens))

                # Update individual slices into batched structure
                tokens[i : i + 1, :, :t_len] = sample_tokens
                batch_input_ids[i : i + 1, :, c_len - t_len : c_len] = sample_tokens
                batch_input_ids[B + i : B + i + 1, :, :t_len] = sample_tokens

        return [tokens[i, :, : task.target_lens[i]] for i in range(B)]
1256
+
1257
    def _predict_tokens_with_scoring(self, c_logits, u_logits, gen_config):
        """Predict tokens from conditional/unconditional logits with CFG.

        Applies classifier-free guidance (when guidance_scale != 0), forbids
        the MASK id, then either samples with Gumbel noise over top-k-filtered
        logits (class_temperature > 0) or takes the greedy argmax.

        Args:
            c_logits: Conditional logits, shape (1, C, T, V).
            u_logits: Unconditional logits, same shape.
            gen_config: Generation config (guidance_scale, class_temperature).

        Returns:
            Tuple of (pred_tokens, confidence_scores); confidence is the max
            log-probability per position (pre-sampling).
        """
        if gen_config.guidance_scale != 0:
            c_log_probs = F.log_softmax(c_logits, dim=-1)
            u_log_probs = F.log_softmax(u_logits, dim=-1)
            # Guided distribution: cond + scale * (cond - uncond), renormalised.
            log_probs = torch.log_softmax(
                c_log_probs + gen_config.guidance_scale * (c_log_probs - u_log_probs),
                dim=-1,
            )
        else:
            log_probs = F.log_softmax(c_logits, dim=-1)

        # Never predict the MASK token itself.
        log_probs[..., self.config.audio_mask_id] = -float("inf")

        if gen_config.class_temperature > 0.0:
            # Sample from the top 10% of the vocabulary via Gumbel-argmax.
            filtered_probs = _filter_top_k(log_probs, ratio=0.1)
            pred_tokens = _gumbel_sample(
                filtered_probs, gen_config.class_temperature
            ).argmax(dim=-1)
        else:
            pred_tokens = log_probs.argmax(dim=-1)

        confidence_scores = log_probs.max(dim=-1)[0]

        return pred_tokens, confidence_scores
1281
+
1282
+
1283
+ # ---------------------------------------------------------------------------
1284
+ # Standalone helpers
1285
+ # ---------------------------------------------------------------------------
1286
+
1287
+
1288
def _get_packed_mask(document_ids):
    # Bind ``document_ids`` so the returned callable matches the
    # (b, h, q_idx, kv_idx) mask-mod signature used by the attention code.
    return partial(_mask_mod_packed, document_ids)
1290
+
1291
+
1292
+ def _mask_mod_packed(document_ids, b, h, q_idx, kv_idx):
1293
+ # 1. Sequence Packing Logic: Tokens must belong to the same document.
1294
+ # Note: The doc_id for padding tokens is -1, which will automatically not match
1295
+ # (if handled correctly) or be ignored.
1296
+ same_doc = document_ids[q_idx] == document_ids[kv_idx]
1297
+ return same_doc
1298
+
1299
+
1300
def _resolve_language(language: Optional[str]) -> Union[str, None]:
    """Resolve a language ID or full language name to a canonical language ID.

    Returns None (language-agnostic mode) when the input is None, the string
    "none" (any case), or an unrecognised value; the last case logs a warning.
    """
    from omnivoice.utils.lang_map import LANG_IDS, LANG_NAME_TO_ID

    if language is None:
        return None
    if language.lower() == "none":
        return None
    # Exact language ID match (case-sensitive).
    if language in LANG_IDS:
        return language
    # Full language name match (case-insensitive).
    lowered = language.lower()
    if lowered in LANG_NAME_TO_ID:
        return LANG_NAME_TO_ID[lowered]
    logger.warning(
        f"Language '{language}' is not recognized. "
        f"Please use a valid language ID (e.g., 'en', 'zh', 'ja', 'de') "
        f"or a full language name (e.g., 'English', 'Chinese', 'Japanese'). "
        f"See supported_language_ids() or supported_language_names() for details. "
        f"Falling back to None (language-agnostic mode)."
    )
    return None
1318
+
1319
+
1320
def _resolve_instruct(
    instruct: Optional[str], use_zh: bool = False
) -> Union[str, None]:
    """Validate and normalise a voice-design instruct string.

    Supported instruct items (case-insensitive for English):

    English (comma + space separated):
        gender: male, female
        age: child, teenager, young adult, middle-aged, elderly
        pitch: very low pitch, low pitch, moderate pitch,
            high pitch, very high pitch
        style: whisper
        accent: american accent, british accent, australian accent, ...

    Chinese (full-width comma separated):
        gender: 男, 女
        age: 儿童, 少年, 青年, 中年, 老年
        pitch: 极低音调, 低音调, 中音调, 高音调, 极高音调
        style: 耳语
        dialect: 河南话, 陕西话, 四川话, 贵州话, 云南话,
            桂林话, 济南话, 石家庄话, 甘肃话, 宁夏话,
            青岛话, 东北话

    Minor issues (auto-fixed):
        - Wrong separator (half-width comma in Chinese instruct or
          full-width comma in English instruct)
        - Leading / trailing commas

    Major issues (raise ``ValueError``):
        - Unsupported or misspelled instruct items
        - Suggestions are offered for close matches

    Args:
        instruct: Raw instruct string, or ``None``.
        use_zh: If True, normalise all items to Chinese (used when the
            synthesis text contains Chinese and no accent is specified).

    Returns:
        Normalised instruct string, or ``None``.

    Raises:
        ValueError: if any instruct item is unsupported or misspelled.
    """
    if instruct is None:
        return None

    instruct_str = instruct.strip()
    if not instruct_str:
        return None

    # Split on both half-width and full-width commas
    raw_items = re.split(r"\s*[,,]\s*", instruct_str)
    raw_items = [x for x in raw_items if x]

    # Validate each item against the combined EN+ZH vocabulary.
    unknown = []
    normalised = []
    for raw in raw_items:
        n = raw.strip().lower()
        if n in _INSTRUCT_ALL_VALID:
            normalised.append(n)
        else:
            # Offer a fuzzy suggestion for likely misspellings.
            sug = difflib.get_close_matches(n, _INSTRUCT_ALL_VALID, n=1, cutoff=0.6)
            unknown.append((raw, n, sug[0] if sug else None))

    if unknown:
        lines = []
        for raw, n, sug in unknown:
            if sug:
                lines.append(f" '{raw}' -> '{n}' (unsupported; did you mean '{sug}'?)")
            else:
                lines.append(f" '{raw}' -> '{n}' (unsupported)")
        err = (
            f"Unsupported instruct items found in {instruct_str}:\n"
            + "\n".join(lines)
            + "\n\nValid English items: "
            + ", ".join(sorted(_INSTRUCT_VALID_EN))
            + "\nValid Chinese items: "
            + ",".join(sorted(_INSTRUCT_VALID_ZH))
            + "\n\nTip: Use only English or only Chinese instructs. "
            "English instructs should use comma + space (e.g. "
            "'male, indian accent'),\nChinese instructs should use full-width "
            "comma (e.g. '男,河南话')."
        )
        raise ValueError(err)

    # --- Language consistency: dialect forces Chinese, accent forces English ---
    has_dialect = any(n.endswith("话") for n in normalised)
    has_accent = any(" accent" in n for n in normalised)

    if has_dialect and has_accent:
        raise ValueError(
            "Cannot mix Chinese dialect and English accent in a single instruct. "
            "Dialects are for Chinese speech, accents for English speech."
        )

    if has_dialect:
        use_zh = True
    elif has_accent:
        use_zh = False

    # --- Unify to single language ---
    if use_zh:
        normalised = [_INSTRUCT_EN_TO_ZH.get(n, n) for n in normalised]
    else:
        normalised = [_INSTRUCT_ZH_TO_EN.get(n, n) for n in normalised]

    # --- Category conflict check ---
    conflicts = []
    for cat in _INSTRUCT_MUTUALLY_EXCLUSIVE:
        hits = [n for n in normalised if n in cat]
        if len(hits) > 1:
            conflicts.append(hits)
    if conflicts:
        parts = []
        for group in conflicts:
            parts.append(" vs ".join(f"'{x}'" for x in group))
        raise ValueError(
            "Conflicting instruct items within the same category: "
            + "; ".join(parts)
            + ". Each category (gender, age, pitch, style, accent, dialect) "
            "allows at most one item."
        )

    # Determine separator based on language (full-width comma for Chinese).
    has_zh = any(any("\u4e00" <= c <= "\u9fff" for c in n) for n in normalised)
    separator = "," if has_zh else ", "

    return separator.join(normalised)
1450
+
1451
+
1452
+ def _filter_top_k(logits: torch.Tensor, ratio: float = 0.1) -> torch.Tensor:
1453
+ k = math.ceil(ratio * logits.shape[-1])
1454
+ val, ind = logits.topk(k, dim=-1)
1455
+ probs = torch.full_like(logits, float("-inf"))
1456
+ probs.scatter_(-1, ind, val)
1457
+ return probs
1458
+
1459
+
1460
+ def _gumbel_sample(logits: torch.Tensor, temperature: float) -> torch.Tensor:
1461
+ scaled_logits = logits / temperature
1462
+ u = torch.rand_like(scaled_logits)
1463
+ gumbel_noise = -torch.log(-torch.log(u + 1e-10) + 1e-10)
1464
+ return scaled_logits + gumbel_noise
1465
+
1466
+
1467
+ def _get_time_steps(
1468
+ t_start: float = 0.0,
1469
+ t_end: float = 1.0,
1470
+ num_step: int = 10,
1471
+ t_shift: float = 1.0,
1472
+ device: torch.device = torch.device("cpu"),
1473
+ ) -> torch.Tensor:
1474
+ timesteps = torch.linspace(t_start, t_end, num_step + 1).to(device)
1475
+ timesteps = t_shift * timesteps / (1 + (t_shift - 1) * timesteps)
1476
+ return timesteps
1477
+
1478
+
1479
+ def _combine_text(text, ref_text: Optional[str] = None) -> str:
1480
+
1481
+ # combine with reference text if not None
1482
+ if ref_text:
1483
+ full_text = ref_text.strip() + " " + text.strip()
1484
+ else:
1485
+ full_text = text.strip()
1486
+
1487
+ # replace \n with .
1488
+ full_text = re.sub(r"[ \t]*\r?\n[\s]*", ".", full_text)
1489
+
1490
+ # remove spaces around chinese characters
1491
+ chinese_range = r"[\u4e00-\u9fff]"
1492
+ pattern = rf"(?<={chinese_range})\s+|\s+(?={chinese_range})"
1493
+ full_text = re.sub(pattern, "", full_text)
1494
+ return full_text
1495
+
1496
+
1497
+ # ---------------------------------------------------------------------------
1498
+ # Register with HuggingFace Auto classes
1499
+ # ---------------------------------------------------------------------------
1500
+
1501
+ AutoConfig.register("omnivoice", OmniVoiceConfig)
1502
+ AutoModel.register(OmniVoiceConfig, OmniVoice)
omnivoice/scripts/__init__.py ADDED
File without changes
omnivoice/scripts/denoise_audio.py ADDED
@@ -0,0 +1,1048 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Denoise audio with Sidon and pack results into WebDataset shards.
19
+
20
+ Supports two input modes:
21
+
22
+ 1. WebDataset manifest (data.lst):
23
+ python denoise_audio.py \
24
+ --input_manifest data.lst \
25
+ --tar_output_pattern output/audios/shard-%06d.tar \
26
+ --jsonl_output_pattern output/txts/shard-%06d.jsonl \
27
+ --feature_extractor_path sidon-v0.1/feature_extractor_cuda.pt \
28
+ --decoder_path sidon-v0.1/decoder_cuda.pt
29
+
30
+ 2. Raw JSONL (each line: {"id": "...", "audio_path": "...", ...}):
31
+ python denoise_audio.py \
32
+ --input_jsonl data.jsonl \
33
+ --tar_output_pattern output/audios/shard-%06d.tar \
34
+ --jsonl_output_pattern output/txts/shard-%06d.jsonl \
35
+ --feature_extractor_path sidon-v0.1/feature_extractor_cuda.pt \
36
+ --decoder_path sidon-v0.1/decoder_cuda.pt
37
+
38
+ Output structure:
39
+ output_dir/
40
+ ├── audios/ # WebDataset tar shards (.flac audio + .json metadata)
41
+ │ ├── shard_000000.tar
42
+ │ └── ...
43
+ ├── txts/ # Per-shard JSONL metadata
44
+ │ ├── shard_000000.jsonl
45
+ │ └── ...
46
+ ├── data.lst # Manifest: <tar_path> <jsonl_path> <sample_count> <total_duration>
47
+ └── errors.jsonl # Failed samples with error details
48
+ """
49
+
50
+ from __future__ import annotations
51
+
52
+ import argparse
53
+ import io
54
+ import json
55
+ import logging
56
+ import os
57
+ import pickle
58
+ import struct
59
+ import subprocess
60
+ import sys
61
+ import threading
62
+ from concurrent.futures import FIRST_COMPLETED, Future, wait
63
+ from dataclasses import dataclass
64
+ from pathlib import Path
65
+ from typing import Any, Dict, List, Optional, Sequence, Union
66
+
67
+ import numpy as np
68
+ import torch
69
+ import torchaudio
70
+ import webdataset as wds
71
+ from torch.utils.data import DataLoader
72
+ from tqdm.auto import tqdm
73
+
74
+ from omnivoice.data.batching import StreamLengthGroupDataset
75
+ from omnivoice.data.dataset import JsonlDatasetReader, WebDatasetReader
76
+ from omnivoice.utils.common import str2bool
77
+
78
+ SIDON_INPUT_SAMPLE_RATE = 16_000
79
+ SIDON_OUTPUT_SAMPLE_RATE = 48_000
80
+
81
+
82
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for the denoise-and-shard script.

    Groups: input selection (manifest vs raw JSONL), output shard patterns,
    Sidon model paths, duration filtering, dynamic batching, multi-machine
    distribution, per-GPU parallelism, JSONL shuffling, and error handling.
    The module docstring is reused as the program description.
    """
    parser = argparse.ArgumentParser(description=__doc__)

    # ── Input (mutually exclusive) ──
    parser.add_argument(
        "--input_manifest",
        default=None,
        help="WebDataset manifest (data.lst). Each line: "
        "<tar_path> <jsonl_path> <num_items> <duration>",
    )
    parser.add_argument(
        "--input_jsonl",
        default=None,
        help="Raw JSONL file. Each line: " '{"id": "...", "audio_path": "...", ...}',
    )

    # ── Output ──
    parser.add_argument(
        "--tar_output_pattern",
        default=None,
        help="Tar shard pattern, e.g. output/audios/shard_%%06d.tar",
    )
    parser.add_argument(
        "--jsonl_output_pattern",
        default=None,
        help="JSONL shard pattern, e.g. output/txts/shard_%%06d.jsonl",
    )
    parser.add_argument(
        "--samples_per_shard",
        type=int,
        default=1_000,
        help="Maximum records per output shard",
    )

    # ── Model ──
    parser.add_argument(
        "--feature_extractor_path",
        default=None,
        help="Path to feature_extractor_cuda.pt",
    )
    parser.add_argument(
        "--decoder_path",
        default=None,
        help="Path to decoder_cuda.pt",
    )
    parser.add_argument(
        "--target_sample_rate",
        type=int,
        default=24_000,
        help="Sample rate of the denoised output audio",
    )

    # ── Filtering ──
    parser.add_argument(
        "--min_length",
        type=float,
        default=0.0,
        help="Minimum audio duration in seconds",
    )
    parser.add_argument(
        "--max_length",
        type=float,
        default=80.0,
        help="Maximum audio duration in seconds",
    )

    # ── Batching ──
    parser.add_argument(
        "--batch_duration",
        type=float,
        default=200.0,
        help="Target batch duration in seconds for dynamic batching",
    )
    parser.add_argument(
        "--max_sample",
        type=int,
        default=32,
        help="Maximum samples per batch for dynamic batching",
    )

    # ── Distributed ──
    parser.add_argument(
        "--num_machines",
        type=int,
        default=1,
        help="Total number of machines for distributed runs",
    )
    parser.add_argument(
        "--machine_index",
        type=int,
        default=0,
        help="Zero-based machine index when distributing across multiple "
        "machines (e.g. 0, 1, ... num_machines-1)",
    )

    # ── Parallelism ──
    parser.add_argument(
        "--nj_per_gpu",
        type=int,
        default=1,
        help="Worker processes per GPU (default 1)",
    )
    parser.add_argument(
        "--loader_workers",
        type=int,
        default=16,
        help="PyTorch DataLoader worker threads",
    )

    # ── Data order (JSONL mode) ──
    parser.add_argument(
        "--shuffle",
        type=str2bool,
        default=True,
        help="Shuffle JSONL entries",
    )
    parser.add_argument(
        "--shuffle_seed",
        type=int,
        default=42,
        help="Seed for JSONL shuffle",
    )

    # ── Error handling ──
    parser.add_argument(
        "--skip_errors",
        action="store_true",
        help="Skip items that fail to denoise instead of aborting",
    )
    # Internal flag set when this script re-invokes itself as a worker
    # subprocess; hidden from --help via SUPPRESS.
    parser.add_argument(
        "--_subprocess_worker",
        action="store_true",
        help=argparse.SUPPRESS,
    )
    return parser
217
+
218
+
219
+ # ---------------------------------------------------------------------------
220
+ # Utilities
221
+ # ---------------------------------------------------------------------------
222
+
223
+
224
def count_lines(path: str) -> int:
    """Count newlines efficiently by reading binary chunks."""
    total = 0
    with open(path, "rb") as handle:
        while True:
            block = handle.read(1 << 20)  # 1 MiB per read
            if not block:
                break
            total += block.count(b"\n")
    return total
231
+
232
+
233
+ PaddingStrategy = Union[bool, str]
234
+ ReturnType = Union[torch.Tensor, np.ndarray]
235
+
236
+
237
def extract_seamless_m4t_features(
    raw_speech: Union[torch.Tensor, List[float], List[torch.Tensor], List[List[float]]],
    sampling_rate: int = 16000,
    num_mel_bins: int = 80,
    frame_length: int = 25,
    frame_shift: int = 10,
    preemphasis_coefficient: float = 0.97,
    dither: float = 0.0,
    window_type: str = "povey",
    do_normalize_per_mel_bins: bool = True,
    stride: int = 2,
    padding: PaddingStrategy = "longest",
    max_length: Optional[int] = None,
    pad_to_multiple_of: Optional[int] = 2,
    return_tensors: Optional[str] = "pt",
    return_attention_mask: bool = True,
    padding_value: float = 0.0,
    device: torch.device = torch.device("cpu"),
) -> Dict[str, ReturnType]:
    """Extract SeamlessM4T features using Torch-only operators.

    Computes Kaldi-style fbank features per waveform, optionally normalises
    each mel bin (zero mean, unit variance over time), pads the batch, then
    stacks every ``stride`` consecutive frames into one feature vector
    (halving the time axis for stride=2).

    Args:
        raw_speech: One waveform or a list of waveforms (tensors or float
            lists); multi-channel input uses only the first channel.
        sampling_rate: Input sample rate in Hz.
        num_mel_bins / frame_length / frame_shift / preemphasis_coefficient /
            dither / window_type: Passed through to torchaudio's Kaldi fbank.
        do_normalize_per_mel_bins: Per-bin mean/variance normalisation.
        stride: Number of frames stacked per output step.
        padding: "longest" pads to the batch max; otherwise ``max_length``
            must be given.
        max_length / pad_to_multiple_of / padding_value: Padding controls.
        return_tensors: "pt" (default) or "np" to convert outputs to NumPy.
        return_attention_mask: Whether to include the (strided) mask.
        device: Device on which to build tensors.

    Returns:
        Dict with "input_features" of shape (B, T//stride, mel*stride) and,
        if requested, "attention_mask" subsampled to the strided frames.
    """
    if not isinstance(raw_speech, list):
        raw_speech = [raw_speech]

    processed_speech = [
        torch.as_tensor(sample, dtype=torch.float32, device=device)
        for sample in raw_speech
    ]

    features: List[torch.Tensor] = []
    for waveform in processed_speech:
        # Keep only the first channel of multi-channel audio.
        if waveform.ndim > 1:
            waveform = waveform[0]
        waveform_tensor = waveform.unsqueeze(0)
        feature = torchaudio.compliance.kaldi.fbank(
            waveform=waveform_tensor,
            sample_frequency=sampling_rate,
            num_mel_bins=num_mel_bins,
            frame_length=frame_length,
            frame_shift=frame_shift,
            dither=dither,
            preemphasis_coefficient=preemphasis_coefficient,
            remove_dc_offset=True,
            window_type=window_type,
            use_energy=False,
            energy_floor=1.192092955078125e-07,
        )
        features.append(feature.squeeze(0))

    if do_normalize_per_mel_bins:
        normalised: List[torch.Tensor] = []
        for feature in features:
            mean = feature.mean(0, keepdim=True)
            var = feature.var(0, keepdim=True)
            normalised.append((feature - mean) / torch.sqrt(var + 1e-5))
        features = normalised

    def _pad_batch(
        features: List[torch.Tensor],
        padding_strategy: PaddingStrategy = "longest",
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        padding_value: float = 0.0,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Pad a list of (T_i, D) features to a common (B, T, D) tensor with
        # an int64 attention mask marking real frames.
        if padding_strategy == "longest":
            target_length = max(f.shape[0] for f in features)
        elif max_length is not None:
            target_length = max_length
        else:
            raise ValueError(
                "max_length must be provided when padding_strategy is not 'longest'"
            )

        if pad_to_multiple_of is not None:
            # Round up so target_length is divisible by pad_to_multiple_of.
            target_length = (
                (target_length + pad_to_multiple_of - 1)
                // pad_to_multiple_of
                * pad_to_multiple_of
            )

        batch_size = len(features)
        feature_dim = features[0].shape[1]
        device = features[0].device

        padded_features = torch.full(
            (batch_size, target_length, feature_dim),
            padding_value,
            dtype=torch.float32,
            device=device,
        )
        attention_mask = torch.zeros(
            (batch_size, target_length),
            dtype=torch.int64,
            device=device,
        )

        for index, feature_tensor in enumerate(features):
            seq_len = feature_tensor.shape[0]
            padded_features[index, :seq_len] = feature_tensor
            attention_mask[index, :seq_len] = 1

        return padded_features, attention_mask

    input_features, attention_mask = _pad_batch(
        features,
        padding_strategy=padding,
        max_length=max_length,
        pad_to_multiple_of=pad_to_multiple_of,
        padding_value=padding_value,
    )

    # Truncate so the frame count is divisible by stride, then stack every
    # `stride` frames into a single, wider feature vector.
    batch_size, num_frames, num_channels = input_features.shape
    new_num_frames = (num_frames // stride) * stride
    input_features = input_features[:, :new_num_frames, :]
    if return_attention_mask:
        attention_mask = attention_mask[:, :new_num_frames]

    input_features = input_features.reshape(
        batch_size, new_num_frames // stride, num_channels * stride
    )

    output: Dict[str, ReturnType] = {"input_features": input_features}
    if return_attention_mask:
        # Subsample the mask to one entry per strided frame.
        output["attention_mask"] = attention_mask[:, 1::stride]

    if return_tensors == "np":
        for key, value in output.items():
            output[key] = value.cpu().numpy()  # type: ignore[assignment]

    return output
366
+
367
+
368
def serialise_flac(key: str, waveform: torch.Tensor, sample_rate: int) -> dict:
    """Encode *waveform* as 16-bit FLAC bytes in a WebDataset record.

    Returns a dict with the sample ``__key__`` and the encoded ``flac`` payload.
    """
    pcm = waveform.to(dtype=torch.float32).cpu()
    # torchaudio.save expects a (channels, samples) tensor.
    if pcm.ndim == 1:
        pcm = pcm.unsqueeze(0)
    sink = io.BytesIO()
    torchaudio.save(sink, pcm, sample_rate, format="flac", bits_per_sample=16)
    return {"__key__": key, "flac": sink.getvalue()}
375
+
376
+
377
+ def _normalise_value(value: Any) -> Any:
378
+ """Convert tensors and NumPy scalars to serialisable Python objects."""
379
+ if isinstance(value, torch.Tensor):
380
+ if value.ndim == 0:
381
+ return value.item()
382
+ return value.cpu().tolist()
383
+ if isinstance(value, np.generic):
384
+ return value.item()
385
+ if isinstance(value, np.ndarray):
386
+ return value.tolist()
387
+ return value
388
+
389
+
390
def _encode_metadata(metadata: dict[str, Any]) -> bytes:
    """Serialise *metadata* to UTF-8 JSON bytes, dropping None-valued keys."""
    cleaned: dict[str, Any] = {
        key: _normalise_value(value)
        for key, value in metadata.items()
        if value is not None
    }
    return json.dumps(cleaned, ensure_ascii=False).encode("utf-8")
397
+
398
+
399
+ # ---------------------------------------------------------------------------
400
+ # Denoising model
401
+ # ---------------------------------------------------------------------------
402
+
403
+
404
class SpeechDenoisingProcessor:
    """Run the TorchScripted feature extractor and decoder.

    Loads two ``torch.jit`` artefacts: a feature encoder that maps
    SeamlessM4T fbank features to hidden states, and a decoder that maps
    hidden states back to a denoised waveform (produced at
    SIDON_OUTPUT_SAMPLE_RATE, judging by the length computation below).
    """

    def __init__(
        self,
        feature_extractor_path: str,
        decoder_path: str,
        device: str,
    ) -> None:
        # map_location places the scripted modules directly on the target
        # device at load time; both run in eval mode only.
        self.device = torch.device(device)
        self.feature_extractor = torch.jit.load(
            feature_extractor_path, map_location=self.device
        )
        self.decoder = torch.jit.load(decoder_path, map_location=self.device)
        self.feature_extractor.eval()
        self.decoder.eval()

    @torch.inference_mode()
    def process(self, waveform: torch.Tensor, sample_rate: int) -> torch.Tensor:
        # Single-sample convenience wrapper around process_batch.
        return self.process_batch([waveform], [sample_rate])[0]

    @torch.inference_mode()
    def process_batch(
        self,
        waveforms: Sequence[torch.Tensor] | torch.Tensor,
        sample_rates: Optional[Sequence[int]] = None,
        expected_lengths: Optional[Sequence[int]] = None,
    ) -> List[torch.Tensor]:
        """Denoise a batch of waveforms.

        Args:
            waveforms: batched waveform tensor (or sequence of 1-D tensors).
                NOTE(review): the F.pad call below requires a single tensor —
                confirm callers always pass the padded batch tensor.
            sample_rates: per-sample input rates; required only when
                ``expected_lengths`` is not given.
            expected_lengths: desired output length (in samples at the output
                rate) for each item; outputs are padded/trimmed to match.

        Returns:
            One denoised 1-D waveform tensor per input sample.
        """
        if expected_lengths is None:
            # Derive target output lengths from input durations; this branch
            # requires sample_rates to be provided.
            expected_lengths: list[int] = []
            for waveform, sample_rate in zip(waveforms, sample_rates):
                duration_seconds = waveform.shape[-1] / float(sample_rate)
                expected_lengths.append(
                    int(round(duration_seconds * SIDON_OUTPUT_SAMPLE_RATE))
                )
        # Append 24 000 trailing zero samples (presumably 1 s at the input
        # rate — TODO confirm) so the model sees trailing context; the output
        # is trimmed back to expected_lengths below.
        waveforms = torch.nn.functional.pad(waveforms, (0, 24000))

        features = extract_seamless_m4t_features(
            [x for x in waveforms],
            return_tensors="pt",
            padding_value=1.0,
            device=self.device,
        )
        feature_tensor = self.feature_extractor(
            features["input_features"].to(self.device)
        )["last_hidden_state"]
        # Decoder consumes (batch, hidden, frames); results come back on CPU.
        restored_waveforms = self.decoder(feature_tensor.transpose(1, 2)).cpu()

        results: List[torch.Tensor] = []
        for sample_idx, sample in enumerate(restored_waveforms):
            restored_waveform = sample.view(-1)
            target_length = expected_lengths[sample_idx]
            current_length = restored_waveform.shape[-1]
            # Pad with zeros or trim so each output matches its expected
            # length exactly; target_length <= 0 disables the adjustment.
            if target_length > 0 and current_length != target_length:
                diff = target_length - current_length
                if diff > 0:
                    restored_waveform = torch.nn.functional.pad(
                        restored_waveform, (0, diff)
                    )
                elif diff < 0:
                    restored_waveform = restored_waveform[:target_length]
            results.append(restored_waveform.contiguous())

        return results
468
+
469
+
470
+ # ---------------------------------------------------------------------------
471
+ # Batch collation
472
+ # ---------------------------------------------------------------------------
473
+
474
+
475
class CollateFunction:
    """Collate a list of samples into a padded batch."""

    def __init__(
        self,
        sample_rate: int,
        skip_errors: bool,
    ) -> None:
        # Sample rate used to convert waveform lengths into durations.
        self.sample_rate = sample_rate
        # Retained for interface parity with the pipeline configuration.
        self.skip_errors = skip_errors

    def __call__(self, samples: Sequence[dict[str, Any]]) -> CollatedBatch:
        audios = [item["audio"] for item in samples]
        labels = [item["label"] for item in samples]
        # Stack variable-length mono waveforms into a zero-padded batch.
        padded = torch.nn.utils.rnn.pad_sequence(
            [audio.squeeze(0) for audio in audios], batch_first=True
        )
        return CollatedBatch(
            keys=[label["id"] for label in labels],
            waveforms=padded,
            durations=[audio.size(-1) / self.sample_rate for audio in audios],
            metadata=labels,
        )
502
+
503
+
504
@dataclass
class CollatedBatch:
    """Batch payload returned by the DataLoader collate function."""

    # Sample identifiers, one per item in the batch.
    keys: list[str]
    # Zero-padded waveform batch of shape (batch, max_samples). The collate
    # function assigns the output of torch.nn.utils.rnn.pad_sequence here, so
    # this is a single stacked tensor — the previous list[torch.Tensor]
    # annotation did not match the value actually stored.
    waveforms: torch.Tensor
    # Original per-sample durations in seconds (computed before padding).
    durations: list[float]
    # Raw metadata ("label") dict for each sample.
    metadata: list[dict[str, Any]]

    @property
    def size(self) -> int:
        """Number of samples in the batch."""
        return len(self.keys)
516
+
517
+
518
+ # ---------------------------------------------------------------------------
519
+ # Subprocess-based GPU worker pool
520
+ # ---------------------------------------------------------------------------
521
+ #
522
+ # Problem: PyTorch ≥2.8 caches CUDA device state at import time. Neither
523
+ # forkserver nor spawn lets us change CUDA_VISIBLE_DEVICES *before* the CUDA
524
+ # runtime captures the device list. The only reliable approach is to launch
525
+ # each worker as a **subprocess** with CUDA_VISIBLE_DEVICES set in the
526
+ # subprocess environment, guaranteeing it takes effect before `import torch`.
527
+ #
528
+ # Protocol (parent ↔ child, length-prefixed pickle over stdin/stdout):
529
+ # Parent → child: 4-byte LE uint32 length + pickle(CollatedBatch)
530
+ # Child → parent: 4-byte LE uint32 length + pickle(result dict)
531
+ # Shutdown signal: 4 zero bytes (length == 0)
532
+
533
+
534
+ def _subprocess_recv():
535
+ """Read a length-prefixed pickled object from stdin. Returns None on shutdown."""
536
+ raw = sys.stdin.buffer.read(4)
537
+ if len(raw) < 4:
538
+ return None
539
+ (length,) = struct.unpack("<I", raw)
540
+ if length == 0:
541
+ return None
542
+ data = sys.stdin.buffer.read(length)
543
+ return pickle.loads(data)
544
+
545
+
546
+ def _subprocess_send(obj):
547
+ """Send a pickled object with a 4-byte length prefix to stdout."""
548
+ data = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
549
+ sys.stdout.buffer.write(struct.pack("<I", len(data)))
550
+ sys.stdout.buffer.write(data)
551
+ sys.stdout.buffer.flush()
552
+
553
+
554
def subprocess_worker_main():
    """Entry point for a GPU worker subprocess.

    Expected environment: CUDA_VISIBLE_DEVICES already set by the parent.
    Receives initargs via stdin, then processes batches in a loop.
    Each request/response is a length-prefixed pickle (see the protocol
    comment above); a zero-length frame shuts the worker down.
    """
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] [Worker PID %(process)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)

    # The first stdin message carries the model paths (sent by _GPUWorker).
    initargs = _subprocess_recv()
    feature_extractor_path, decoder_path = initargs

    # With CUDA_VISIBLE_DEVICES restricted by the parent, device index 0 is
    # the single physical GPU assigned to this worker.
    device = "cpu"
    if torch.cuda.is_available():
        torch.cuda.set_device(0)
        device = "cuda:0"
    else:
        logging.warning("CUDA not available in worker subprocess.")

    logging.info(
        f"Worker PID={os.getpid()}, "
        f"CUDA_VISIBLE_DEVICES={os.environ.get('CUDA_VISIBLE_DEVICES')}, device={device}"
    )

    processor = SpeechDenoisingProcessor(
        feature_extractor_path=feature_extractor_path,
        decoder_path=decoder_path,
        device=device,
    )

    # Process batches until shutdown signal
    while True:
        msg = _subprocess_recv()
        if msg is None:
            break
        req_id = msg["_req_id"]
        batch = msg["_batch"]
        try:
            # Expected output lengths are derived from original durations at
            # the model's output rate, so padding added in collation is cut.
            cleaned_waveforms = processor.process_batch(
                batch.waveforms,
                expected_lengths=[
                    round(d * SIDON_OUTPUT_SAMPLE_RATE) for d in batch.durations
                ],
            )
            # Move to CPU so the tensors can be pickled back to the parent.
            cleaned_cpu = [w.cpu() for w in cleaned_waveforms]
            result = {
                "_req_id": req_id,
                "status": "success",
                "keys": batch.keys,
                "results": cleaned_cpu,
                "metadata": batch.metadata,
                "size": batch.size,
            }
        except Exception as e:
            # Whole-batch failure: report it and let the parent decide
            # whether to skip (--skip_errors) or abort the run.
            result = {
                "_req_id": req_id,
                "status": "error",
                "keys": batch.keys,
                "error": str(e),
                "size": batch.size,
            }
        _subprocess_send(result)
616
+
617
+
618
class _GPUWorker:
    """Handle to a single GPU worker subprocess.

    Spawns ``python -m omnivoice.scripts.denoise_audio --_subprocess_worker``
    with CUDA_VISIBLE_DEVICES pinned in the child's environment (so it takes
    effect before the child imports torch), then exchanges length-prefixed
    pickles over the child's stdin/stdout.
    """

    def __init__(self, physical_gpu_id, feature_extractor_path, decoder_path):
        # physical_gpu_id None means no restriction (CPU fallback pool).
        env = os.environ.copy()
        if physical_gpu_id is not None:
            env["CUDA_VISIBLE_DEVICES"] = str(physical_gpu_id)
        self.proc = subprocess.Popen(
            [
                sys.executable,
                "-m",
                "omnivoice.scripts.denoise_audio",
                "--_subprocess_worker",
            ],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            env=env,
        )
        # Send init args
        init_data = pickle.dumps(
            (feature_extractor_path, decoder_path), protocol=pickle.HIGHEST_PROTOCOL
        )
        self.proc.stdin.write(struct.pack("<I", len(init_data)))
        self.proc.stdin.write(init_data)
        self.proc.stdin.flush()
        # Serialises writes to the child's stdin pipe across threads.
        self._lock = threading.Lock()

    def submit(self, batch_with_id):
        """Send a batch dict (containing _req_id + _batch) for processing."""
        with self._lock:
            data = pickle.dumps(batch_with_id, protocol=pickle.HIGHEST_PROTOCOL)
            self.proc.stdin.write(struct.pack("<I", len(data)))
            self.proc.stdin.write(data)
            self.proc.stdin.flush()

    def read_result(self):
        """Blocking read for one result.

        Returns None on EOF (child exited) or a zero-length frame.
        """
        raw = self.proc.stdout.read(4)
        if len(raw) < 4:
            return None
        (length,) = struct.unpack("<I", raw)
        if length == 0:
            return None
        data = self.proc.stdout.read(length)
        return pickle.loads(data)

    def shutdown(self):
        """Send shutdown signal and wait for process."""
        try:
            with self._lock:
                # A zero-length frame is the agreed shutdown signal.
                self.proc.stdin.write(struct.pack("<I", 0))
                self.proc.stdin.flush()
        except Exception:
            # Child may already have exited and closed its stdin; best-effort.
            pass
        self.proc.wait(timeout=30)
673
+
674
+
675
class GPUWorkerPool:
    """Pool of GPU worker subprocesses with round-robin task submission.

    One daemon reader thread per worker resolves Futures as results arrive,
    so submission order and completion order may differ.
    """

    def __init__(self, pool_specs, feature_extractor_path, decoder_path):
        """
        Args:
            pool_specs: list of (physical_gpu_id, num_workers) tuples.
            feature_extractor_path: path to JIT feature extractor.
            decoder_path: path to JIT decoder.
        """
        self.workers: list[_GPUWorker] = []
        for physical_gpu_id, num_workers in pool_specs:
            for _ in range(num_workers):
                self.workers.append(
                    _GPUWorker(physical_gpu_id, feature_extractor_path, decoder_path)
                )
        # Round-robin cursor over self.workers.
        self._rr = 0
        # In-flight requests: req_id -> Future, guarded by _futures_lock.
        self._futures: dict[int, Future] = {}
        self._futures_lock = threading.Lock()
        self._next_id = 0
        # Start reader threads for each worker
        self._reader_threads = []
        for worker in self.workers:
            t = threading.Thread(target=self._reader_loop, args=(worker,), daemon=True)
            t.start()
            self._reader_threads.append(t)

    def _reader_loop(self, worker):
        # Drain results from one worker until its stdout hits EOF.
        while True:
            result = worker.read_result()
            if result is None:
                break
            req_id = result.pop("_req_id", None)
            with self._futures_lock:
                fut = self._futures.pop(req_id, None)
                if fut is not None:
                    fut.set_result(result)

    def submit(self, batch) -> Future:
        """Dispatch *batch* to the next worker; returns a Future for its result."""
        worker = self.workers[self._rr % len(self.workers)]
        self._rr += 1
        with self._futures_lock:
            req_id = self._next_id
            self._next_id += 1
            fut = Future()
            self._futures[req_id] = fut
        batch_dict = {
            "_req_id": req_id,
            "_batch": batch,
        }
        worker.submit(batch_dict)
        return fut

    def shutdown(self):
        # Signal every worker, then give the reader threads a bounded wait;
        # they are daemons, so a stuck thread cannot block interpreter exit.
        for worker in self.workers:
            worker.shutdown()
        for t in self._reader_threads:
            t.join(timeout=5)
733
+
734
+
735
+ # ---------------------------------------------------------------------------
736
+ # Main
737
+ # ---------------------------------------------------------------------------
738
+
739
+
740
def main() -> None:
    """CLI entry point for the denoising pipeline.

    Streams audio from a WebDataset manifest or raw JSONL, denoises batches
    on a pool of GPU worker subprocesses, and writes the cleaned audio back
    out as WebDataset tar shards plus per-shard JSONL metadata and a
    ``data.lst`` manifest. With ``--_subprocess_worker`` it instead runs the
    worker loop (see subprocess_worker_main).
    """
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
    parser = build_parser()
    args = parser.parse_args()

    # ── Subprocess worker mode ──
    if args._subprocess_worker:
        subprocess_worker_main()
        return

    # Validate input arguments
    assert args.tar_output_pattern is not None, "--tar_output_pattern is required."
    assert args.jsonl_output_pattern is not None, "--jsonl_output_pattern is required."
    assert bool(args.input_manifest) != bool(
        args.input_jsonl
    ), "Exactly one of --input_manifest or --input_jsonl must be provided."

    if args.num_machines > 1:
        assert (
            0 <= args.machine_index < args.num_machines
        ), f"machine_index {args.machine_index} must be in [0, {args.num_machines})"

    # ── Build base dataset and count total samples ──
    if args.input_jsonl:
        logging.info(f"Input mode: raw JSONL ({args.input_jsonl})")
        total_samples = count_lines(args.input_jsonl)
        base_dataset = JsonlDatasetReader(
            args.input_jsonl,
            sample_rate=SIDON_INPUT_SAMPLE_RATE,
            shuffle=args.shuffle,
            shuffle_seed=args.shuffle_seed,
        )
        loader_workers = args.loader_workers
    else:
        logging.info(f"Input mode: WebDataset manifest ({args.input_manifest})")
        manifest_num_lines = count_lines(args.input_manifest)
        loader_workers = min(args.loader_workers, manifest_num_lines)
        total_samples = 0
        manifests = []
        with open(args.input_manifest, "r", encoding="utf-8") as f:
            # Manifest line format: <tar_path> <jsonl_path> <count> <duration>
            for line_id, line in tqdm(
                enumerate(f),
                total=manifest_num_lines,
                desc="Calculating dataset length",
            ):
                items = line.strip().split(" ")
                tar_path, jsonl_path, num_items, duration = (
                    items[0],
                    items[1],
                    int(items[2]),
                    float(items[3]),
                )
                assert os.path.exists(tar_path), f"File {tar_path} does not exist."
                assert os.path.exists(jsonl_path), f"File {jsonl_path} does not exist."
                assert jsonl_path.endswith(
                    ".jsonl"
                ), f"File {jsonl_path} is not a .jsonl file."
                # Shard-level round-robin split across machines.
                if (
                    args.num_machines > 1
                    and line_id % args.num_machines != args.machine_index
                ):
                    continue
                total_samples += num_items
                manifests.append((tar_path, jsonl_path, num_items, duration))
        logging.info(
            f"Total shards: {manifest_num_lines}, "
            f"Shards for current index: {len(manifests)}"
        )
        base_dataset = WebDatasetReader(
            manifests=manifests,
            sample_rate=SIDON_INPUT_SAMPLE_RATE,
            evaluation=True,
        )

    # ── Dynamic batching + DataLoader ──
    batched_dataset = StreamLengthGroupDataset(
        dataset=base_dataset,
        batch_duration=args.batch_duration,
        max_sample=args.max_sample,
        min_length=args.min_length,
        max_length=args.max_length,
    )

    collate_fn = CollateFunction(
        skip_errors=args.skip_errors,
        sample_rate=SIDON_INPUT_SAMPLE_RATE,
    )

    # batch_size=None: the dataset already yields pre-batched lists.
    dataloader = DataLoader(
        dataset=batched_dataset,
        batch_size=None,
        collate_fn=collate_fn,
        num_workers=loader_workers,
        prefetch_factor=10 if loader_workers > 0 else None,
        pin_memory=True,
        persistent_workers=loader_workers > 0,
    )

    # ── Multi-GPU process pool ──
    num_devices = torch.cuda.device_count()
    if num_devices == 0:
        logging.warning("No GPUs detected - using CPU for processing")
        num_processes = args.nj_per_gpu
    else:
        num_processes = num_devices * args.nj_per_gpu
    logging.info(
        f"GPU count: {num_devices}, Processes per GPU: {args.nj_per_gpu}, "
        f"Total processes: {num_processes}"
    )

    # Build a list of (physical_gpu_id, num_workers) for each pool.
    # When num_devices == 0 we use a single CPU pool.
    if num_devices == 0:
        pool_specs = [(None, num_processes)]
    else:
        pool_specs = [(gpu_id, args.nj_per_gpu) for gpu_id in range(num_devices)]

    # ── Output paths ──
    tar_output_pattern = str(Path(args.tar_output_pattern).expanduser())
    jsonl_output_pattern = str(Path(args.jsonl_output_pattern).expanduser())
    Path(tar_output_pattern).parent.mkdir(parents=True, exist_ok=True)
    Path(jsonl_output_pattern).parent.mkdir(parents=True, exist_ok=True)

    # errors.jsonl and data.lst live two levels above the shard patterns
    # (alongside the audios/ and txts/ directories).
    output_dir = Path(tar_output_pattern).parent.parent
    error_log_path = str(output_dir / "errors.jsonl")
    manifest_path = str(output_dir / "data.lst")

    # Dedicated logger so per-sample failures go to errors.jsonl as raw JSON.
    error_logger = logging.getLogger("error_log")
    error_logger.setLevel(logging.ERROR)
    error_logger.handlers.clear()
    error_fh = logging.FileHandler(error_log_path, mode="w", encoding="utf-8")
    error_fh.setFormatter(logging.Formatter("%(message)s"))
    error_logger.addHandler(error_fh)

    # ── Progress and shard tracking ──
    processed_count = 0
    error_count = 0
    write_error_count = 0
    failed_ids = []
    shard_idx = 0
    shard_sample_count = 0
    shard_duration = 0.0
    samples_per_shard = args.samples_per_shard
    # shard_manifest: shard index -> (tar_path, jsonl_path, count, duration)
    shard_manifest = {}
    target_sample_rate = args.target_sample_rate

    tar_writer = None
    jsonl_file = None

    def open_new_shard():
        # Close the current shard (if any), record it in shard_manifest,
        # then open the next tar + jsonl pair.
        nonlocal tar_writer, jsonl_file, shard_idx, shard_sample_count, shard_duration
        if tar_writer is not None:
            tar_writer.close()
        if jsonl_file is not None:
            jsonl_file.close()
        if shard_idx > 0 and shard_sample_count > 0:
            prev_idx = shard_idx - 1
            shard_manifest[prev_idx] = (
                os.path.abspath(tar_output_pattern % prev_idx),
                os.path.abspath(jsonl_output_pattern % prev_idx),
                shard_sample_count,
                shard_duration,
            )
        tar_fname = tar_output_pattern % shard_idx
        jsonl_fname = jsonl_output_pattern % shard_idx
        tar_writer = wds.TarWriter(tar_fname)
        jsonl_file = open(jsonl_fname, "w", encoding="utf-8")
        shard_idx += 1
        shard_sample_count = 0
        shard_duration = 0.0

    def write_sample(key, waveform, metadata):
        # Resample (if requested), peak-normalise, and append one cleaned
        # sample to the open shard; failures are logged, not raised.
        nonlocal shard_sample_count, write_error_count, shard_duration
        assert tar_writer is not None and jsonl_file is not None
        try:
            if target_sample_rate != SIDON_OUTPUT_SAMPLE_RATE:
                waveform = torchaudio.functional.resample(
                    waveform,
                    orig_freq=SIDON_OUTPUT_SAMPLE_RATE,
                    new_freq=target_sample_rate,
                )
            # Peak-normalise to 0.6 full scale; epsilon guards silent audio.
            waveform = (waveform / (waveform.abs().max() + 1e-7)) * 0.6

            record = serialise_flac(key, waveform, target_sample_rate)
            jsonl_record = _encode_metadata(metadata)
            tar_writer.write(record)
            jsonl_file.write(jsonl_record.decode("utf-8") + "\n")
            shard_sample_count += 1
            shard_duration += metadata.get("audio_duration", 0.0)
        except Exception as exc:
            write_error_count += 1
            failed_ids.append(key)
            error_logger.error(
                json.dumps({"id": key, "reason": str(exc)}, ensure_ascii=False)
            )
            logging.error(f"Write failed for sample {key}: {exc}")

    def handle_result(result):
        # Route one worker result: write successes sample-by-sample (rolling
        # shards), log failures, and abort unless --skip_errors is set.
        nonlocal processed_count, error_count
        if result["status"] == "success":
            for key, cleaned, metadata in zip(
                result["keys"], result["results"], result["metadata"]
            ):
                if tar_writer is None or shard_sample_count >= samples_per_shard:
                    open_new_shard()
                write_sample(key, cleaned, metadata)
                processed_count += 1
        else:
            error_count += result["size"]
            failed_ids.extend(result["keys"])
            for key in result["keys"]:
                error_logger.error(
                    json.dumps(
                        {"id": key, "reason": result["error"]},
                        ensure_ascii=False,
                    )
                )
            if not args.skip_errors:
                raise RuntimeError(
                    f"Batch starting with {result['keys'][0]} failed - terminating"
                )
            logging.warning(
                f"Skipping failed batch starting with {result['keys'][0]}: "
                f"{result['error']}"
            )

    # ── Main processing loop ──
    main_progress = tqdm(total=total_samples, desc="Denoising Audio")

    # Launch subprocess-based GPU workers. CUDA_VISIBLE_DEVICES is set in the
    # subprocess Popen environment so it takes effect before import torch.
    pool = GPUWorkerPool(pool_specs, args.feature_extractor_path, args.decoder_path)
    logging.info(f"Submitting tasks... ({num_processes} subprocess workers)")
    try:
        futures = set()
        # Backpressure: keep at most 2 batches in flight per worker.
        max_pending = num_processes * 2

        def drain_completed():
            # Block for at least one completed future, write its results,
            # and refresh the progress bar.
            nonlocal futures
            done, _ = wait(futures, return_when=FIRST_COMPLETED)
            for f in done:
                futures.discard(f)
                result = f.result()
                main_progress.update(result["size"])
                handle_result(result)
            main_progress.set_postfix(
                OK=processed_count,
                Err=error_count,
            )

        for batch in dataloader:
            if batch.size == 0:
                continue
            if len(futures) >= max_pending:
                drain_completed()
            futures.add(pool.submit(batch))

        logging.info("Processing remaining pending batches...")
        while futures:
            drain_completed()

    except Exception:
        logging.error("Critical error during processing", exc_info=True)
        raise
    finally:
        # Always stop workers and flush/record the final open shard.
        pool.shutdown()
        main_progress.close()
        if tar_writer is not None:
            tar_writer.close()
        if jsonl_file is not None:
            jsonl_file.close()
        if shard_idx > 0 and shard_sample_count > 0:
            last_idx = shard_idx - 1
            shard_manifest[last_idx] = (
                os.path.abspath(tar_output_pattern % last_idx),
                os.path.abspath(jsonl_output_pattern % last_idx),
                shard_sample_count,
                shard_duration,
            )

    # ── Write manifest (data.lst) ──
    with open(manifest_path, "w", encoding="utf-8") as mf:
        for idx in sorted(shard_manifest.keys()):
            tar_path, jsonl_path, count, duration = shard_manifest[idx]
            mf.write(f"{tar_path} {jsonl_path} {count} {duration:.3f}\n")

    # ── Summary ──
    total_failed = error_count + write_error_count
    filtered_and_skipped = total_samples - processed_count - total_failed
    logging.info(
        f"Processing Complete - Successful: {processed_count}, Failed: {total_failed}, "
        f"Filtered/Skipped: {filtered_and_skipped}, Shards written: {shard_idx}"
    )
    logging.info(f"Manifest written to: {manifest_path} ({len(shard_manifest)} shards)")
    if total_failed > 0:
        logging.info(f"Error details: {error_log_path}")
    if failed_ids and args.skip_errors:
        logging.warning(
            f"Failed sample IDs (count: {len(failed_ids)}): {failed_ids[:100]}..."
        )
    if write_error_count > 0 and not args.skip_errors:
        raise RuntimeError(
            f"{write_error_count} samples failed to write - check logs for details"
        )
1045
+
1046
+
1047
+ if __name__ == "__main__":
1048
+ main()
omnivoice/scripts/extract_audio_tokens.py ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ Extract audio tokens from audio data and pack them into WebDataset shards.
20
+
21
+ Supports two input modes:
22
+
23
+ 1. WebDataset manifest (data.lst):
24
+ python extract_audio_tokens.py \
25
+ --input_manifest data.lst \
26
+ --tar_output_pattern output/audios/shard-%06d.tar \
27
+ --jsonl_output_pattern output/txts/shard-%06d.jsonl
28
+
29
+ 2. Raw JSONL (each line: {"id": "...", "audio_path": "...", "text": "...", ...}):
30
+ python extract_audio_tokens.py \
31
+ --input_jsonl data.jsonl \
32
+ --tar_output_pattern output/audios/shard-%06d.tar \
33
+ --jsonl_output_pattern output/txts/shard-%06d.jsonl
34
+
35
+ Output structure:
36
+ output_dir/
37
+ ├── audios/ # WebDataset tar shards (.npy audio tokens + .json metadata)
38
+ │ ├── shard_000000.tar
39
+ │ └── ...
40
+ ├── txts/ # Per-shard JSONL metadata
41
+ │ ├── shard_000000.jsonl
42
+ │ └── ...
43
+ ├── data.lst # Manifest: <tar_path> <jsonl_path> <sample_count> <total_duration>
44
+ └── errors.jsonl # Failed samples with error details
45
+ """
46
+
47
+ import argparse
48
+ import io
49
+ import json
50
+ import logging
51
+ import multiprocessing as mp
52
+ import os
53
+ import warnings
54
+ from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait
55
+ from pathlib import Path
56
+ from typing import Any
57
+
58
+ import numpy as np
59
+ import torch
60
+ import webdataset as wds
61
+ from torch.utils.data import DataLoader, IterableDataset
62
+ from tqdm.auto import tqdm
63
+ from transformers import AutoFeatureExtractor, HiggsAudioV2TokenizerModel
64
+
65
+ from omnivoice.data.dataset import JsonlDatasetReader, WebDatasetReader
66
+ from omnivoice.utils.common import str2bool
67
+
68
+ warnings.filterwarnings(
69
+ "ignore", category=FutureWarning, module="torch.nn.utils.weight_norm"
70
+ )
71
+
72
+ HIGGS_INPUT_SAMPLE_RATE = 24_000
73
+
74
+
75
+ # Global variables: Store tokenizer and device for each worker process
76
+ worker_tokenizer = None
77
+ worker_feature_extractor = None
78
+
79
+
80
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for audio-token extraction.

    Exactly one of --input_manifest / --input_jsonl is expected by the
    caller; the module docstring is reused as the --help description.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--input_manifest",
        default=None,
        help="Path to input dataset manifest (data.lst).",
    )
    parser.add_argument(
        "--input_jsonl",
        default=None,
        help="Path to raw JSONL file (alternative to --input_manifest).",
    )
    parser.add_argument(
        "--tar_output_pattern",
        required=True,
        help="Tar shard pattern passed to WebDataset",
    )
    parser.add_argument(
        "--jsonl_output_pattern",
        required=True,
        help="Jsonl shard pattern passed to WebDataset",
    )
    parser.add_argument(
        "--samples_per_shard",
        type=int,
        default=1000,
        help="Maximum records per shard",
    )
    parser.add_argument(
        "--min_num_shards",
        type=int,
        default=32,
        help="Minimum number of output shards (use to ensure "
        "shard count >= num_gpu * num_workers)",
    )
    parser.add_argument(
        "--tokenizer_path",
        type=str,
        default="eustlb/higgs-audio-v2-tokenizer",
        help="Path to audio tokenizer.",
    )
    parser.add_argument(
        "--skip_errors", action="store_true", help="Skip items that fail to process"
    )
    parser.add_argument(
        "--min_length",
        type=float,
        default=0.0,
        help="Minimum audio duration in seconds (e.g. 2.0)",
    )
    parser.add_argument(
        "--max_length",
        type=float,
        default=float("inf"),
        help="Maximum audio duration in seconds (e.g. 15.0)",
    )
    parser.add_argument(
        "--num_machines",
        type=int,
        default=1,
        help="Total number of machines for distributed runs",
    )
    parser.add_argument(
        "--machine_index",
        type=int,
        default=0,
        help="Zero-based machine index when distributing across multiple "
        "machines (e.g. 0, 1, ... num_machines-1)",
    )
    parser.add_argument(
        "--nj_per_gpu",
        type=int,
        default=3,
        help="Number of worker processes to spawn per GPU.",
    )
    parser.add_argument(
        "--loader_workers",
        type=int,
        default=24,
        help="Number of DataLoader workers for streaming IterableDataset.",
    )
    parser.add_argument(
        "--shuffle",
        type=str2bool,
        default=True,
        help="Shuffle data by default.",
    )
    parser.add_argument(
        "--shuffle-seed",
        type=int,
        default=42,
        help="Random seed for shuffle (default: 42).",
    )
    return parser
174
+
175
+
176
def count_lines(path):
    """Count the number of lines in the file at *path*.

    Reads in 1 MiB binary chunks so very large manifests are counted
    without loading them into memory. Unlike a plain newline count, a
    final line that lacks a trailing newline is still counted (the old
    implementation silently dropped it, skewing totals used for the
    progress bar and shard estimation).
    """
    total = 0
    last_chunk = b""
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            total += chunk.count(b"\n")
            last_chunk = chunk
    # Count a trailing, unterminated line that has no "\n" of its own.
    if last_chunk and not last_chunk.endswith(b"\n"):
        total += 1
    return total
179
+
180
+
181
def serialise_numpy(key: str, tokens: np.ndarray) -> dict:
    """Pack *tokens* into a WebDataset record with an ``.npy``-encoded payload."""
    with io.BytesIO() as sink:
        np.save(sink, tokens)
        payload = sink.getvalue()
    return {"__key__": key, "npy": payload}
185
+
186
+
187
def process_init(rank_queue, tokenizer_path):
    """
    Initialization function for each worker process.
    Assigns a specific GPU to the process and loads the tokenizer.

    Args:
        rank_queue: Shared queue of GPU indices; a worker blocks until it
            receives one. A rank of -1 means "use CPU".
        tokenizer_path: HuggingFace model path/name of the audio tokenizer.

    Side effects:
        Populates the module-level ``worker_tokenizer`` and
        ``worker_feature_extractor`` globals used by ``process_single_sample``.
    """
    global worker_tokenizer, worker_feature_extractor

    # Configure worker process logging
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d]"
        " [Worker %(process)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)

    # Get assigned GPU rank
    rank = rank_queue.get()
    # Determine device
    if rank != -1 and torch.cuda.is_available():
        worker_device = torch.device(f"cuda:{rank}")
    else:
        worker_device = torch.device("cpu")

    logging.debug(f"Worker process initialized with device: {worker_device}")
    # Load tokenizer onto the specified device
    worker_feature_extractor = AutoFeatureExtractor.from_pretrained(tokenizer_path)
    worker_tokenizer = HiggsAudioV2TokenizerModel.from_pretrained(
        tokenizer_path, device_map=worker_device
    )
    logging.debug(f"Tokenizer loaded successfully on device {worker_device}")
216
+
217
+
218
def process_single_sample(sample: dict[str, Any]) -> dict[str, Any]:
    """
    Single-sample processing function executed in worker processes.
    Skips invalid samples during streaming processing.

    Args:
        sample: Dict with an ``audio`` waveform tensor of shape (1, T) and a
            ``label`` dict that must carry an ``id`` field.

    Returns:
        A result dict with ``status`` ("success"/"error"), the sample ``key``,
        the extracted ``audio_tokens`` (int16 ndarray on success, else None),
        the updated ``metadata`` (``label`` with ``num_tokens`` added), and
        ``error_msg``.

    Requires the module-level ``worker_feature_extractor`` /
    ``worker_tokenizer`` globals initialized by ``process_init``.
    """
    try:
        audio_tensor = sample.get("audio", None)  # shape (1, T)
        if audio_tensor is None:
            raise ValueError("Sample missing 'audio' field")

        with torch.inference_mode():
            key = sample["label"]["id"]
            inputs = worker_feature_extractor(
                raw_audio=audio_tensor.squeeze(0).numpy(),
                sampling_rate=HIGGS_INPUT_SAMPLE_RATE,
                return_tensors="pt",
            ).to(worker_tokenizer.device)
            audio_tokens = worker_tokenizer.encode(
                inputs["input_values"],
            ).audio_codes.squeeze(0)

        # The tokenizer is expected to emit 8 codebooks of equal length.
        assert len(audio_tokens.shape) == 2
        assert audio_tokens.size(0) == 8

        num_tokens = audio_tokens.size(1)
        metadata = sample["label"]
        metadata["num_tokens"] = num_tokens

        # Convert to numpy format for subsequent serialization (int16 to save space)
        audio_tokens_np = audio_tokens.to(torch.int16).cpu().numpy()

        return {
            "status": "success",
            "key": key,
            "audio_tokens": audio_tokens_np,
            "metadata": metadata,
            "error_msg": None,
        }
    except Exception as e:
        sample_id = sample.get("label", {}).get("id", "unknown")
        logging.error(f"Failed to process sample {sample_id}: {e}")
        return {
            "status": "error",
            "key": sample_id,
            "audio_tokens": None,
            "metadata": None,
            "error_msg": str(e),
        }
266
+
267
+
268
+ def _normalise_value(value: Any) -> Any:
269
+ """Convert tensors and NumPy scalars to serialisable Python objects."""
270
+ if isinstance(value, torch.Tensor):
271
+ if value.ndim == 0:
272
+ return value.item()
273
+ return value.cpu().tolist()
274
+ if isinstance(value, np.generic):
275
+ return value.item()
276
+ if isinstance(value, np.ndarray):
277
+ return value.tolist()
278
+ return value
279
+
280
+
281
def _encode_metadata(metadata: dict[str, Any]) -> bytes:
    """Serialise *metadata* to UTF-8 JSON bytes, dropping None-valued entries."""
    cleaned: dict[str, Any] = {
        key: _normalise_value(value)
        for key, value in metadata.items()
        if value is not None
    }
    return json.dumps(cleaned, ensure_ascii=False).encode("utf-8")
288
+
289
+
290
class StreamingLengthFilteredDataset(IterableDataset):
    """Streaming wrapper that drops samples whose duration is out of range.

    Durations are computed as ``audio_samples / sr``; samples that raise
    during inspection are skipped with a warning instead of aborting the
    stream.
    """

    def __init__(
        self,
        base_iterable,
        min_len: float,
        max_len: float,
        sr: int,
    ):
        self.base_iterable = base_iterable
        self.min_len = min_len
        self.max_len = max_len
        self.sr = sr
        # NOTE: with multiple DataLoader workers this counter is per-worker.
        self.filtered_count = 0

    def __iter__(self):
        """Yield only samples whose duration lies within [min_len, max_len]."""
        for item in self.base_iterable:
            try:
                seconds = item["audio"].size(-1) / self.sr
                if not (self.min_len <= seconds <= self.max_len):
                    self.filtered_count += 1
                    logging.warning(
                        f"Filtered sample (duration out of range): "
                        f"{item['label']['id']} ({seconds:.2f}s)"
                    )
                    continue
                yield item
            except Exception as e:
                logging.warning(f"Skipped invalid sample during streaming: {e}")
                continue
320
+
321
+
322
def main() -> None:
    """Entry point: stream audio, tokenize it in a GPU worker pool, write shards.

    Pipeline:
      1. Parse args and build a streaming dataset from either a raw JSONL
         file or a WebDataset manifest (optionally split across machines).
      2. Filter samples by duration and feed them through a DataLoader.
      3. Submit each sample to a ProcessPoolExecutor whose workers each own
         one GPU (assigned via a shared rank queue) and run the tokenizer.
      4. Write results into rotating tar/jsonl shards, then emit a data.lst
         manifest and an errors.jsonl log.
    """
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
    parser = build_parser()
    args = parser.parse_args()
    # Spawn (not fork) so CUDA can be initialized safely in workers.
    mp.set_start_method("spawn", force=True)

    # Validate input arguments
    assert bool(args.input_manifest) != bool(
        args.input_jsonl
    ), "Exactly one of --input_manifest or --input_jsonl must be provided."

    if args.num_machines > 1:
        assert (
            0 <= args.machine_index < args.num_machines
        ), f"machine_index {args.machine_index} must be in [0, {args.num_machines})"

    # Build base dataset and count total samples based on input mode
    if args.input_jsonl:
        logging.info(f"Input mode: raw JSONL ({args.input_jsonl})")
        total_samples = count_lines(args.input_jsonl)
        base_dataset = JsonlDatasetReader(
            args.input_jsonl,
            sample_rate=HIGGS_INPUT_SAMPLE_RATE,
            shuffle=args.shuffle,
            shuffle_seed=args.shuffle_seed,
        )
        loader_workers = args.loader_workers
    else:
        logging.info(f"Input mode: WebDataset manifest ({args.input_manifest})")
        manifest_num_lines = count_lines(args.input_manifest)
        # One loader worker per shard at most.
        loader_workers = min(args.loader_workers, manifest_num_lines)
        total_samples = 0
        manifests = []
        with open(args.input_manifest, "r", encoding="utf-8") as f:
            for line_id, line in tqdm(
                enumerate(f),
                total=manifest_num_lines,
                desc="Calculating dataset length",
            ):
                # Manifest line format: <tar> <jsonl> <num_items> <duration>
                items = line.strip().split(" ")
                tar_path, jsonl_path, num_items, duration = (
                    items[0],
                    items[1],
                    int(items[2]),
                    float(items[3]),
                )
                assert os.path.exists(tar_path), f"File {tar_path} does not exist."
                assert os.path.exists(jsonl_path), f"File {jsonl_path} does not exist."
                assert jsonl_path.endswith(
                    ".jsonl"
                ), f"File {jsonl_path} is not a .jsonl file."
                # Round-robin shard assignment across machines.
                if (
                    args.num_machines > 1
                    and line_id % args.num_machines != args.machine_index
                ):
                    continue
                total_samples += num_items
                manifests.append((tar_path, jsonl_path, num_items, duration))
        logging.info(
            f"Total shards: {manifest_num_lines}, "
            f"Shards for current index: {len(manifests)}"
        )
        base_dataset = WebDatasetReader(
            manifests=manifests,
            sample_rate=HIGGS_INPUT_SAMPLE_RATE,
            evaluation=True,
        )

    # Adjust samples_per_shard if min_num_shards would be violated
    samples_per_shard = args.samples_per_shard
    if total_samples > 0:
        estimated_shards = max(
            1, (total_samples + samples_per_shard - 1) // samples_per_shard
        )
        if estimated_shards < args.min_num_shards:
            samples_per_shard = max(1, total_samples // args.min_num_shards)
            logging.info(
                f"Adjusted samples_per_shard from {args.samples_per_shard} to "
                f"{samples_per_shard} to meet min_num_shards={args.min_num_shards} "
                f"(total_samples={total_samples})"
            )

    # Apply length filter and create DataLoader
    filtered_dataset = StreamingLengthFilteredDataset(
        base_iterable=base_dataset,
        min_len=args.min_length,
        max_len=args.max_length,
        sr=HIGGS_INPUT_SAMPLE_RATE,
    )
    # batch_size=None: pass individual samples straight through.
    dataloader = DataLoader(
        dataset=filtered_dataset,
        batch_size=None,
        num_workers=loader_workers,
        persistent_workers=loader_workers > 0,
        pin_memory=False,
    )

    # Configure multi-GPU multi-process setup
    num_devices = torch.cuda.device_count()
    if num_devices == 0:
        logging.warning("No GPUs detected - using CPU for processing")
        num_processes = args.nj_per_gpu
    else:
        num_processes = num_devices * args.nj_per_gpu
    logging.info(
        f"GPU count: {num_devices}, Processes per GPU: {args.nj_per_gpu}, "
        f"Total processes: {num_processes}"
    )

    # Shared GPU rank queue for process assignment
    manager = mp.Manager()
    rank_queue = manager.Queue()
    for rank in list(range(num_devices)) * args.nj_per_gpu:
        rank_queue.put(rank)
    if num_devices == 0:
        # -1 tells process_init to fall back to CPU.
        for _ in range(num_processes):
            rank_queue.put(-1)

    # Prepare output paths
    tar_output_pattern = str(Path(args.tar_output_pattern).expanduser())
    jsonl_output_pattern = str(Path(args.jsonl_output_pattern).expanduser())
    Path(tar_output_pattern).parent.mkdir(parents=True, exist_ok=True)
    Path(jsonl_output_pattern).parent.mkdir(parents=True, exist_ok=True)

    # Determine output directory from tar_output_pattern
    output_dir = Path(tar_output_pattern).parent.parent
    error_log_path = str(output_dir / "errors.jsonl")
    manifest_path = str(output_dir / "data.lst")

    # Setup error logger (writes to errors.jsonl)
    error_logger = logging.getLogger("error_log")
    error_logger.setLevel(logging.ERROR)
    error_logger.handlers.clear()
    error_fh = logging.FileHandler(error_log_path, mode="w", encoding="utf-8")
    error_fh.setFormatter(logging.Formatter("%(message)s"))
    error_logger.addHandler(error_fh)

    # Progress and error tracking
    processed_count = 0
    error_count = 0
    write_error_count = 0
    failed_ids = []
    shard_idx = 0
    shard_sample_count = 0
    shard_duration = 0.0
    shard_manifest = {}  # shard_idx -> (tar_path, jsonl_path, count, duration)

    tar_writer = None
    jsonl_file = None

    def open_new_shard():
        # Close the current shard (if any), record its manifest entry, and
        # open the next tar/jsonl pair.
        nonlocal tar_writer, jsonl_file, shard_idx, shard_sample_count, shard_duration
        if tar_writer is not None:
            tar_writer.close()
        if jsonl_file is not None:
            jsonl_file.close()
        # Record manifest for the previous shard
        if shard_idx > 0 and shard_sample_count > 0:
            prev_idx = shard_idx - 1
            shard_manifest[prev_idx] = (
                os.path.abspath(tar_output_pattern % prev_idx),
                os.path.abspath(jsonl_output_pattern % prev_idx),
                shard_sample_count,
                shard_duration,
            )
        tar_fname = tar_output_pattern % shard_idx
        jsonl_fname = jsonl_output_pattern % shard_idx
        tar_writer = wds.TarWriter(tar_fname)
        jsonl_file = open(jsonl_fname, "w", encoding="utf-8")
        shard_idx += 1
        shard_sample_count = 0
        shard_duration = 0.0

    def write_sample(key, audio_tokens_np, metadata):
        # Append one record to the current shard; write failures are logged
        # to errors.jsonl and counted, not raised here.
        nonlocal shard_sample_count, write_error_count, shard_duration
        assert tar_writer is not None and jsonl_file is not None
        try:
            token_record = serialise_numpy(key, audio_tokens_np)
            json_record = _encode_metadata(metadata)
            tar_writer.write(token_record)
            jsonl_file.write(json_record.decode("utf-8") + "\n")
            shard_sample_count += 1
            shard_duration += metadata.get("audio_duration", 0.0)
        except Exception as exc:
            write_error_count += 1
            failed_ids.append(key)
            error_logger.error(
                json.dumps({"id": key, "reason": str(exc)}, ensure_ascii=False)
            )
            logging.error(f"Write failed for sample {key}: {exc}")

    def handle_result(result):
        # Route a worker result to the shard writer or the error log.
        nonlocal processed_count, error_count
        if result["status"] == "success":
            # Rotate shard if needed
            if tar_writer is None or shard_sample_count >= samples_per_shard:
                open_new_shard()
            write_sample(result["key"], result["audio_tokens"], result["metadata"])
            processed_count += 1
        else:
            error_count += 1
            failed_ids.append(result["key"])
            error_logger.error(
                json.dumps(
                    {"id": result["key"], "reason": result["error_msg"]},
                    ensure_ascii=False,
                )
            )
            if not args.skip_errors:
                raise RuntimeError(
                    f"Sample {result['key']} processing failed due "
                    f"to {result['error_msg']} - terminating"
                )
            logging.warning(
                f"Skipping failed sample {result['key']}: {result['error_msg']}"
            )

    main_progress = tqdm(total=total_samples, desc="Extracting Audio Tokens")

    try:
        with ProcessPoolExecutor(
            max_workers=num_processes,
            initializer=process_init,
            initargs=(rank_queue, args.tokenizer_path),
        ) as executor:
            logging.info(f"Submitting tasks... ({num_processes} workers)")
            futures = set()
            # Back-pressure: cap in-flight futures to bound memory use.
            max_pending = num_processes * 10

            def drain_completed():
                """Wait for at least one future to complete, process all done."""
                nonlocal futures
                done, _ = wait(futures, return_when=FIRST_COMPLETED)
                for f in done:
                    futures.discard(f)
                    result = f.result()
                    main_progress.update(1)
                    handle_result(result)
                main_progress.set_postfix(
                    Samples=processed_count,
                    Errors=error_count,
                )

            # Stream samples from DataLoader
            for sample in dataloader:
                if len(futures) >= max_pending:
                    drain_completed()

                future = executor.submit(process_single_sample, sample)
                futures.add(future)

            # Process remaining futures
            logging.info("Processing remaining pending samples...")
            while futures:
                drain_completed()

    except Exception:
        logging.error("Critical error during processing", exc_info=True)
        raise
    finally:
        main_progress.close()
        if tar_writer is not None:
            tar_writer.close()
        if jsonl_file is not None:
            jsonl_file.close()
        # Record the last shard in the manifest
        if shard_idx > 0 and shard_sample_count > 0:
            last_idx = shard_idx - 1
            shard_manifest[last_idx] = (
                os.path.abspath(tar_output_pattern % last_idx),
                os.path.abspath(jsonl_output_pattern % last_idx),
                shard_sample_count,
                shard_duration,
            )

    # Write manifest file (data.lst)
    with open(manifest_path, "w", encoding="utf-8") as mf:
        for idx in sorted(shard_manifest.keys()):
            tar_path, jsonl_path, count, duration = shard_manifest[idx]
            mf.write(f"{tar_path} {jsonl_path} {count} {duration:.3f}\n")

    # Output final statistics
    total_failed = error_count + write_error_count
    filtered_and_skipped = total_samples - processed_count - total_failed
    logging.info(
        f"Processing Complete - Successful: {processed_count}, Failed: {total_failed}, "
        f"Filtered/Skipped: {filtered_and_skipped}, Shards written: {shard_idx}"
    )
    logging.info(f"Manifest written to: {manifest_path} ({len(shard_manifest)} shards)")
    if total_failed > 0:
        logging.info(f"Error details: {error_log_path}")
    if failed_ids and args.skip_errors:
        logging.warning(
            f"Failed sample IDs (count: {len(failed_ids)}): {failed_ids[:100]}..."
        )
    if write_error_count > 0 and not args.skip_errors:
        raise RuntimeError(
            f"{write_error_count} samples failed to write - check logs for details"
        )
621
+ )
622
+
623
+
624
# Script entry point: run token extraction when executed directly.
if __name__ == "__main__":
    main()
omnivoice/scripts/extract_audio_tokens_add_noise.py ADDED
@@ -0,0 +1,825 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ Extract audio tokens from audio data and pack them into WebDataset shards.
20
+
21
+ Extends ``extract_audio_tokens.py`` with optional noise and reverberation
22
+ augmentation on the prompt (reference) portion of the audio. Requires a
23
+ noise manifest and/or RIR manifest.
24
+
25
+ Supports two input modes:
26
+
27
+ 1. WebDataset manifest (data.lst):
28
+ python extract_audio_tokens_add_noise.py \\
29
+ --input_manifest data.lst \\
30
+ --noise_manifest noise.lst \\
31
+ --tar_output_pattern output/audios/shard-%06d.tar \\
32
+ --jsonl_output_pattern output/txts/shard-%06d.jsonl
33
+
34
+ 2. Raw JSONL (each line: {"id": "...", "audio_path": "...", "text": "...", ...}):
35
+ python extract_audio_tokens_add_noise.py \\
36
+ --input_jsonl data.jsonl \\
37
+ --noise_manifest noise.lst \\
38
+ --tar_output_pattern output/audios/shard-%06d.tar \\
39
+ --jsonl_output_pattern output/txts/shard-%06d.jsonl
40
+
41
+ Output structure:
42
+ output_dir/
43
+ ├── audios/ # WebDataset tar shards (.npy audio tokens + .json metadata)
44
+ │ ├── shard_000000.tar
45
+ │ └── ...
46
+ ├── txts/ # Per-shard JSONL metadata
47
+ │ ├── shard_000000.jsonl
48
+ │ └── ...
49
+ ├── data.lst # Manifest: <tar_path> <jsonl_path> <sample_count> <total_duration>
50
+ └── errors.jsonl # Failed samples with error details
51
+ """
52
+
53
+ import argparse
54
+ import io
55
+ import json
56
+ import logging
57
+ import math
58
+ import multiprocessing as mp
59
+ import os
60
+ import random
61
+ import warnings
62
+ from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait
63
+ from pathlib import Path
64
+ from typing import Any
65
+
66
+ import numpy as np
67
+ import torch
68
+ import torch.nn.functional as F
69
+ import torchaudio
70
+ import webdataset as wds
71
+ from torch.utils.data import DataLoader, IterableDataset
72
+ from tqdm.auto import tqdm
73
+ from transformers import AutoFeatureExtractor, HiggsAudioV2TokenizerModel
74
+
75
+ from omnivoice.data.dataset import JsonlDatasetReader, WebDatasetReader
76
+ from omnivoice.utils.common import str2bool
77
+
78
+ warnings.filterwarnings(
79
+ "ignore", category=FutureWarning, module="torch.nn.utils.weight_norm"
80
+ )
81
+
82
+ HIGGS_INPUT_SAMPLE_RATE = 24_000
83
+
84
+ # Global variables: Store tokenizer and device for each worker process
85
+ worker_tokenizer = None
86
+ worker_feature_extractor = None
87
+ worker_noise_sampler = None
88
+ worker_rir_sampler = None
89
+
90
+
91
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser.

    Exactly one of ``--input_manifest`` or ``--input_jsonl`` must be
    supplied (validated in ``main``). ``--noise_manifest`` and
    ``--rir_manifest`` optionally enable noise / reverberation
    augmentation on the prompt portion of each audio sample.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--input_manifest",
        default=None,
        help="Path to input dataset manifest (data.lst).",
    )
    parser.add_argument(
        "--input_jsonl",
        default=None,
        help="Path to raw JSONL file (alternative to --input_manifest).",
    )
    parser.add_argument(
        "--tar_output_pattern",
        required=True,
        help="Tar shard pattern passed to WebDataset",
    )
    parser.add_argument(
        "--jsonl_output_pattern",
        required=True,
        help="Jsonl shard pattern passed to WebDataset",
    )
    parser.add_argument(
        "--samples_per_shard",
        type=int,
        default=1000,
        help="Maximum records per shard",
    )
    parser.add_argument(
        "--min_num_shards",
        type=int,
        default=32,
        help="Minimum number of output shards (use to ensure "
        "shard count >= num_gpu * num_workers)",
    )
    parser.add_argument(
        "--tokenizer_path",
        type=str,
        default="eustlb/higgs-audio-v2-tokenizer",
        help="Path to audio tokenizer.",
    )
    parser.add_argument(
        "--skip_errors", action="store_true", help="Skip items that fail to process"
    )
    parser.add_argument(
        "--min_length",
        type=float,
        default=0.0,
        help="Minimum audio duration in seconds (e.g. 2.0)",
    )
    parser.add_argument(
        "--max_length",
        type=float,
        default=float("inf"),
        help="Maximum audio duration in seconds (e.g. 15.0)",
    )
    parser.add_argument(
        "--num_machines",
        type=int,
        default=1,
        help="Total number of machines for distributed runs",
    )
    parser.add_argument(
        "--machine_index",
        type=int,
        default=0,
        help="Zero-based machine index when distributing across multiple "
        "machines (e.g. 0, 1, ... num_machines-1)",
    )
    parser.add_argument(
        "--nj_per_gpu",
        type=int,
        default=3,
        help="Number of worker processes to spawn per GPU.",
    )
    parser.add_argument(
        "--loader_workers",
        type=int,
        default=24,
        help="Number of DataLoader workers for streaming IterableDataset.",
    )
    parser.add_argument(
        "--shuffle",
        type=str2bool,
        default=True,
        help="Shuffle data by default.",
    )
    parser.add_argument(
        "--shuffle-seed",
        type=int,
        default=42,
        help="Random seed for shuffle (default: 42).",
    )
    parser.add_argument(
        "--noise_manifest",
        default=None,
        help="Path to noise manifest (list of tar files). Enables prompt noise augmentation.",
    )
    parser.add_argument(
        "--rir_manifest",
        default=None,
        help="Path to RIR manifest (list of tar files). Enables prompt reverb augmentation.",
    )
    return parser
195
+
196
+
197
def count_lines(path):
    """Count the number of lines in the file at *path*.

    Reads in 1 MiB binary chunks so very large manifests are counted
    without loading them into memory. Unlike a plain newline count, a
    final line that lacks a trailing newline is still counted (the old
    implementation silently dropped it, skewing totals used for the
    progress bar and shard estimation).
    """
    total = 0
    last_chunk = b""
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            total += chunk.count(b"\n")
            last_chunk = chunk
    # Count a trailing, unterminated line that has no "\n" of its own.
    if last_chunk and not last_chunk.endswith(b"\n"):
        total += 1
    return total
200
+
201
+
202
def serialise_numpy(key: str, tokens: np.ndarray) -> dict:
    """Wrap *tokens* as a WebDataset record whose payload is ``.npy`` bytes."""
    with io.BytesIO() as sink:
        np.save(sink, tokens)
        encoded = sink.getvalue()
    return {"__key__": key, "npy": encoded}
206
+
207
+
208
def _load_aug_audio(data, sample_rate=24000):
    """Simple audio loader for augmentation files.

    Decodes raw encoded audio bytes (wav/flac/mp3 payloads from a
    WebDataset shard), downmixes multi-channel input to mono by averaging,
    and resamples to *sample_rate* if needed.

    Args:
        data: Raw audio file bytes.
        sample_rate: Target sampling rate in Hz.

    Returns:
        Waveform tensor of shape (1, T).
    """
    with io.BytesIO(data) as b:
        wav, sr = torchaudio.load(b)
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)
    if sr != sample_rate:
        wav = torchaudio.functional.resample(wav, sr, sample_rate)
    return wav
217
+
218
+
219
class SimpleWorkerSampler:
    """A lightweight infinite sampler for noise/RIR within a worker process.

    Builds a shuffled, endlessly repeating WebDataset pipeline over the
    given tar shards and hands out fixed-length waveform segments.
    """

    def __init__(self, tar_paths, sample_rate=24000):
        # NOTE(review): `.decode()` may already decode known audio
        # extensions, while `_decode` passes sample[ext] to a bytes-based
        # loader — confirm the payload is still raw bytes at that point.
        self.dataset = (
            wds.WebDataset(
                tar_paths, shardshuffle=True, nodesplitter=None, workersplitter=None
            )
            .decode()
            .map(lambda s: self._decode(s, sample_rate))
            .select(lambda x: x is not None)
            .shuffle(100)
            .repeat()
        )
        self.iterator = iter(self.dataset)

    def _decode(self, sample, sample_rate):
        # Return the first recognised audio payload; None drops the sample.
        for ext in ["wav", "flac", "mp3"]:
            if ext in sample:
                return _load_aug_audio(sample[ext], sample_rate)
        return None

    def sample_segment(self, target_len, allow_repeat=True):
        """Get a random segment of noise matching the target length.

        Args:
            target_len: Desired segment length in samples.
            allow_repeat: Tile short clips up to *target_len*; when False,
                a clip shorter than *target_len* is returned as-is.

        Returns:
            Waveform tensor of shape (1, target_len) (or shorter when
            ``allow_repeat`` is False).
        """
        try:
            audio = next(self.iterator)
        except StopIteration:
            # .repeat() should make the stream infinite; restart defensively.
            self.iterator = iter(self.dataset)
            audio = next(self.iterator)

        cur_len = audio.size(-1)
        if cur_len < target_len and allow_repeat:
            if cur_len > 0:
                num_repeats = math.ceil(target_len / cur_len)
                audio = audio.repeat(1, num_repeats)
            else:
                # Degenerate empty clip: fall back to silence.
                audio = F.pad(audio, (0, target_len), mode="constant")
            cur_len = audio.size(-1)

        if cur_len > target_len:
            # Random crop to the requested length.
            start = random.randint(0, cur_len - target_len)
            audio = audio[..., start : start + target_len]

        return audio
263
+
264
+
265
+ def _convolve1d(signal: torch.Tensor, kernel: torch.Tensor) -> torch.Tensor:
266
+ m = signal.size(-1)
267
+ n = kernel.size(-1)
268
+ padded_size = m + n - 1
269
+ f_signal = torch.fft.rfft(signal, n=padded_size)
270
+ f_kernel = torch.fft.rfft(kernel, n=padded_size)
271
+ f_result = f_signal * f_kernel
272
+ result = torch.fft.irfft(f_result, n=padded_size)
273
+ return result[:padded_size]
274
+
275
+
276
def _apply_rir(audio, rir, mix_ratio=0.5):
    """Convolve *audio* with a room impulse response and mix wet/dry signals.

    Args:
        audio: Dry waveform tensor of shape (1, T).
        rir: Impulse-response tensor of shape (1, L).
        mix_ratio: Weight of the reverberant (wet) signal in the output.

    Returns:
        Tensor of shape (1, T) with reverberation applied.
    """
    # Attenuation applied to the RIR before convolution — presumably
    # compensating for int16-scaled RIR recordings; TODO confirm.
    rir_scaling_factor = 0.5**15
    N_in = audio.shape[-1]
    rir_d = rir[0, :] * rir_scaling_factor
    aug_d = _convolve1d(audio[0], rir_d)
    # Align the output with the RIR's direct-path peak so the reverberant
    # signal is not delayed relative to the dry one.
    shift_index = torch.argmax(torch.abs(rir_d))
    end_index = shift_index + N_in
    if end_index > aug_d.shape[0]:
        augmented = F.pad(aug_d[shift_index:], (0, end_index - aug_d.shape[0]))
    else:
        augmented = aug_d[shift_index:end_index]
    # Rescale the wet signal so its energy matches the dry signal.
    power_before = torch.sum(audio[0] ** 2)
    power_after = torch.sum(augmented**2)
    if power_after > 0:
        augmented *= torch.sqrt(power_before / power_after)
    mixed = (1 - mix_ratio) * audio[0] + mix_ratio * augmented
    return mixed.unsqueeze(0)
293
+
294
+
295
def process_init(rank_queue, tokenizer_path, noise_manifest=None, rir_manifest=None):
    """
    Initialization function for each worker process.
    Assigns a specific GPU to the process and loads the tokenizer.

    Args:
        rank_queue: Shared queue of GPU indices; -1 means "use CPU".
        tokenizer_path: HuggingFace model path/name of the audio tokenizer.
        noise_manifest: Optional manifest whose first whitespace-separated
            column per line is a noise tar path; enables the module-level
            ``worker_noise_sampler``.
        rir_manifest: Same format for RIR tars; enables the module-level
            ``worker_rir_sampler``.

    Side effects:
        Populates the module-level ``worker_tokenizer``,
        ``worker_feature_extractor``, ``worker_noise_sampler`` and
        ``worker_rir_sampler`` globals. Sampler failures are non-fatal and
        simply leave the corresponding augmentation disabled.
    """
    global worker_tokenizer, worker_feature_extractor, worker_noise_sampler, worker_rir_sampler

    # Configure worker process logging
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d]"
        " [Worker %(process)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)

    # Get assigned GPU rank
    rank = rank_queue.get()
    # Determine device
    if rank != -1 and torch.cuda.is_available():
        worker_device = torch.device(f"cuda:{rank}")
    else:
        worker_device = torch.device("cpu")

    logging.debug(f"Worker process initialized with device: {worker_device}")
    # Load tokenizer onto the specified device
    worker_feature_extractor = AutoFeatureExtractor.from_pretrained(tokenizer_path)
    worker_tokenizer = HiggsAudioV2TokenizerModel.from_pretrained(
        tokenizer_path, device_map=worker_device
    )
    logging.debug(f"Tokenizer loaded successfully on device {worker_device}")

    # Initialize augmentation samplers (optional)
    if noise_manifest:
        try:
            with open(noise_manifest, "r") as f:
                tars = [l.strip().split()[0] for l in f if l.strip()]
                worker_noise_sampler = SimpleWorkerSampler(
                    tars, sample_rate=HIGGS_INPUT_SAMPLE_RATE
                )
            logging.debug("Noise sampler initialized.")
        except Exception as e:
            logging.warning(f"Failed to load noise manifest: {e}")

    if rir_manifest:
        try:
            with open(rir_manifest, "r") as f:
                tars = [l.strip().split()[0] for l in f if l.strip()]
                worker_rir_sampler = SimpleWorkerSampler(
                    tars, sample_rate=HIGGS_INPUT_SAMPLE_RATE
                )
            logging.debug("RIR sampler initialized.")
        except Exception as e:
            logging.warning(f"Failed to load RIR manifest: {e}")
347
+
348
+
349
def _augment_prompt(audio_tensor: torch.Tensor) -> tuple[torch.Tensor, int]:
    """Apply noise/reverb augmentation to the front portion of audio.

    Returns the augmented audio and the sample index where clean audio starts.

    A random 10%-30% leading slice (the "prompt" region) is corrupted using
    the module-level ``worker_noise_sampler`` / ``worker_rir_sampler``
    globals set up by ``process_init``; either may be None.
    """
    # Pre-normalization
    max_val = audio_tensor.abs().max() + 1e-7
    audio_tensor = (audio_tensor / max_val) * 0.6

    total_len = audio_tensor.size(-1)
    ratio = random.uniform(0.1, 0.3)
    # NOTE(review): split_idx can be 0 for extremely short inputs, making
    # the RMS divisions below degenerate — presumably prevented upstream by
    # the min_length filter; confirm.
    split_idx = int(total_len * ratio)
    front_part = audio_tensor[:, :split_idx].clone()

    # Apply noise
    if worker_noise_sampler is not None:
        noise = worker_noise_sampler.sample_segment(split_idx)
        snr_db = random.uniform(5, 15)
        sig_rms = front_part.norm(p=2) / (split_idx**0.5)
        noise_rms = noise.norm(p=2) / (split_idx**0.5)
        if noise_rms > 1e-9:
            # Amplitude-domain SNR: scale noise so 20*log10(sig/noise) == snr_db.
            snr = 10 ** (snr_db / 20)
            scale = sig_rms / (snr * noise_rms + 1e-8)
            front_part = front_part + noise * scale

    # Apply RIR (30% probability)
    if worker_rir_sampler is not None and random.random() < 0.3:
        rir = worker_rir_sampler.sample_segment(split_idx, allow_repeat=False)
        reverb_amt = random.uniform(0.3, 1.0)
        try:
            front_part = _apply_rir(front_part, rir, reverb_amt)
        except Exception as e:
            # Reverb is best-effort; keep the noise-only prompt on failure.
            logging.warning(f"RIR failed: {e}")

    # Merge back
    if front_part.device != audio_tensor.device:
        front_part = front_part.to(audio_tensor.device)
    audio_tensor[:, :split_idx] = front_part

    # Post-normalization
    max_val = audio_tensor.abs().max() + 1e-7
    audio_tensor = (audio_tensor / max_val) * 0.9

    return audio_tensor, split_idx
393
+
394
+
395
def process_single_sample(sample: dict[str, Any]) -> dict[str, Any]:
    """
    Single-sample processing function executed in worker processes.
    Skips invalid samples during streaming processing.

    When a noise and/or RIR sampler was initialized by ``process_init``,
    the prompt portion of the audio is corrupted first and the metadata
    gains ``clean_start_token_idx`` marking where clean tokens begin.

    Args:
        sample: Dict with an ``audio`` waveform tensor of shape (1, T) and a
            ``label`` dict that must carry an ``id`` field.

    Returns:
        A result dict with ``status`` ("success"/"error"), the sample ``key``,
        the extracted ``audio_tokens`` (int16 ndarray on success, else None),
        the updated ``metadata``, and ``error_msg``.
    """
    try:
        audio_tensor = sample.get("audio", None)  # shape (1, T)
        if audio_tensor is None:
            raise ValueError("Sample missing 'audio' field")

        # Apply prompt augmentation if noise/rir samplers are available
        enable_aug = worker_noise_sampler is not None or worker_rir_sampler is not None
        clean_sample_idx = 0
        if enable_aug:
            audio_tensor, clean_sample_idx = _augment_prompt(audio_tensor)

        with torch.inference_mode():
            key = sample["label"]["id"]

            inputs = worker_feature_extractor(
                raw_audio=audio_tensor.squeeze(0).numpy(),
                sampling_rate=HIGGS_INPUT_SAMPLE_RATE,
                return_tensors="pt",
            ).to(worker_tokenizer.device)
            audio_tokens = worker_tokenizer.encode(
                inputs["input_values"],
            ).audio_codes.squeeze(0)

        # The tokenizer is expected to emit 8 codebooks of equal length.
        assert len(audio_tokens.shape) == 2
        assert audio_tokens.size(0) == 8

        num_tokens = audio_tokens.size(1)
        metadata = sample["label"]
        metadata["num_tokens"] = num_tokens

        if enable_aug:
            # Convert the waveform split point to a token index using the
            # tokenizer's hop length (samples per token).
            clean_token_idx = math.ceil(
                clean_sample_idx / worker_tokenizer.config.hop_length
            )
            metadata["clean_start_token_idx"] = clean_token_idx

        # Convert to numpy format for subsequent serialization (int16 to save space)
        audio_tokens_np = audio_tokens.to(torch.int16).cpu().numpy()

        return {
            "status": "success",
            "key": key,
            "audio_tokens": audio_tokens_np,
            "metadata": metadata,
            "error_msg": None,
        }
    except Exception as e:
        sample_id = sample.get("label", {}).get("id", "unknown")
        logging.error(f"Failed to process sample {sample_id}: {e}")
        return {
            "status": "error",
            "key": sample_id,
            "audio_tokens": None,
            "metadata": None,
            "error_msg": str(e),
        }
456
+
457
+
458
+ def _normalise_value(value: Any) -> Any:
459
+ """Convert tensors and NumPy scalars to serialisable Python objects."""
460
+ if isinstance(value, torch.Tensor):
461
+ if value.ndim == 0:
462
+ return value.item()
463
+ return value.cpu().tolist()
464
+ if isinstance(value, np.generic):
465
+ return value.item()
466
+ if isinstance(value, np.ndarray):
467
+ return value.tolist()
468
+ return value
469
+
470
+
471
def _encode_metadata(metadata: dict[str, Any]) -> bytes:
    """Serialise *metadata* as UTF-8 JSON bytes, dropping None-valued keys."""
    cleaned: dict[str, Any] = {
        key: _normalise_value(value)
        for key, value in metadata.items()
        if value is not None
    }
    return json.dumps(cleaned, ensure_ascii=False).encode("utf-8")
478
+
479
+
480
class StreamingLengthFilteredDataset(IterableDataset):
    """Iterable wrapper that drops samples whose duration is out of range.

    Filtering happens lazily while streaming; out-of-range samples are
    counted and logged, and malformed samples are skipped with a warning
    instead of raising.
    """

    def __init__(
        self,
        base_iterable,
        min_len: float,
        max_len: float,
        sr: int,
    ):
        self.base_iterable = base_iterable
        self.min_len = min_len
        self.max_len = max_len
        self.sr = sr
        # Number of samples rejected by the duration filter so far.
        self.filtered_count = 0

    def __iter__(self):
        """Stream samples one by one and filter on the fly."""
        for item in self.base_iterable:
            try:
                seconds = item["audio"].size(-1) / self.sr
                if not (self.min_len <= seconds <= self.max_len):
                    self.filtered_count += 1
                    logging.warning(
                        f"Filtered sample (duration out of range): "
                        f"{item['label']['id']} ({seconds:.2f}s)"
                    )
                    continue
            except Exception as e:
                logging.warning(f"Skipped invalid sample during streaming: {e}")
                continue
            yield item
510
+
511
+
512
def main() -> None:
    """Entry point: stream audio, tokenize it in a worker pool, write shards.

    Pipeline: input (raw JSONL or WebDataset manifest) -> duration filter ->
    DataLoader -> ProcessPoolExecutor workers (tokenization, optional prompt
    augmentation) -> rotating tar/jsonl shards -> ``data.lst`` manifest and
    ``errors.jsonl`` error log.
    """
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
    parser = build_parser()
    args = parser.parse_args()
    # NOTE(review): "spawn" presumably required so workers can initialise
    # CUDA safely — confirm before changing the start method.
    mp.set_start_method("spawn", force=True)

    # Validate input arguments
    assert bool(args.input_manifest) != bool(
        args.input_jsonl
    ), "Exactly one of --input_manifest or --input_jsonl must be provided."

    if args.num_machines > 1:
        assert (
            0 <= args.machine_index < args.num_machines
        ), f"machine_index {args.machine_index} must be in [0, {args.num_machines})"

    # Build base dataset and count total samples based on input mode
    if args.input_jsonl:
        logging.info(f"Input mode: raw JSONL ({args.input_jsonl})")
        total_samples = count_lines(args.input_jsonl)
        base_dataset = JsonlDatasetReader(
            args.input_jsonl,
            sample_rate=HIGGS_INPUT_SAMPLE_RATE,
            shuffle=args.shuffle,
            shuffle_seed=args.shuffle_seed,
        )
        loader_workers = args.loader_workers
    else:
        logging.info(f"Input mode: WebDataset manifest ({args.input_manifest})")
        manifest_num_lines = count_lines(args.input_manifest)
        loader_workers = min(args.loader_workers, manifest_num_lines)
        total_samples = 0
        manifests = []
        with open(args.input_manifest, "r", encoding="utf-8") as f:
            for line_id, line in tqdm(
                enumerate(f),
                total=manifest_num_lines,
                desc="Calculating dataset length",
            ):
                # Manifest line format: <tar_path> <jsonl_path> <count> <dur>
                items = line.strip().split(" ")
                tar_path, jsonl_path, num_items, duration = (
                    items[0],
                    items[1],
                    int(items[2]),
                    float(items[3]),
                )
                assert os.path.exists(tar_path), f"File {tar_path} does not exist."
                assert os.path.exists(jsonl_path), f"File {jsonl_path} does not exist."
                assert jsonl_path.endswith(
                    ".jsonl"
                ), f"File {jsonl_path} is not a .jsonl file."
                # Round-robin shard assignment across machines.
                if (
                    args.num_machines > 1
                    and line_id % args.num_machines != args.machine_index
                ):
                    continue
                total_samples += num_items
                manifests.append((tar_path, jsonl_path, num_items, duration))
        logging.info(
            f"Total shards: {manifest_num_lines}, "
            f"Shards for current index: {len(manifests)}"
        )
        base_dataset = WebDatasetReader(
            manifests=manifests,
            sample_rate=HIGGS_INPUT_SAMPLE_RATE,
            evaluation=True,
        )

    # Apply length filter and create DataLoader
    filtered_dataset = StreamingLengthFilteredDataset(
        base_iterable=base_dataset,
        min_len=args.min_length,
        max_len=args.max_length,
        sr=HIGGS_INPUT_SAMPLE_RATE,
    )
    dataloader = DataLoader(
        dataset=filtered_dataset,
        batch_size=None,
        num_workers=loader_workers,
        persistent_workers=loader_workers > 0,
        pin_memory=False,
    )

    # Adjust samples_per_shard if min_num_shards would be violated
    samples_per_shard = args.samples_per_shard
    if total_samples > 0:
        estimated_shards = max(
            1, (total_samples + samples_per_shard - 1) // samples_per_shard
        )
        if estimated_shards < args.min_num_shards:
            samples_per_shard = max(1, total_samples // args.min_num_shards)
            logging.info(
                f"Adjusted samples_per_shard from {args.samples_per_shard} to "
                f"{samples_per_shard} to meet min_num_shards={args.min_num_shards} "
                f"(total_samples={total_samples})"
            )

    # Configure multi-GPU multi-process setup
    num_devices = torch.cuda.device_count()
    if num_devices == 0:
        logging.warning("No GPUs detected - using CPU for processing")
        num_processes = args.nj_per_gpu
    else:
        num_processes = num_devices * args.nj_per_gpu
    logging.info(
        f"GPU count: {num_devices}, Processes per GPU: {args.nj_per_gpu}, "
        f"Total processes: {num_processes}"
    )
    if args.noise_manifest or args.rir_manifest:
        logging.info(
            f"Prompt augmentation enabled - "
            f"noise: {args.noise_manifest or 'off'}, rir: {args.rir_manifest or 'off'}"
        )

    # Shared GPU rank queue for process assignment; each worker pops one
    # rank in its initializer.
    manager = mp.Manager()
    rank_queue = manager.Queue()
    for rank in list(range(num_devices)) * args.nj_per_gpu:
        rank_queue.put(rank)
    if num_devices == 0:
        # CPU-only: -1 signals the worker to skip CUDA device placement.
        for _ in range(num_processes):
            rank_queue.put(-1)

    # Prepare output paths
    tar_output_pattern = str(Path(args.tar_output_pattern).expanduser())
    jsonl_output_pattern = str(Path(args.jsonl_output_pattern).expanduser())
    Path(tar_output_pattern).parent.mkdir(parents=True, exist_ok=True)
    Path(jsonl_output_pattern).parent.mkdir(parents=True, exist_ok=True)

    # Determine output directory from tar_output_pattern
    output_dir = Path(tar_output_pattern).parent.parent
    error_log_path = str(output_dir / "errors.jsonl")
    manifest_path = str(output_dir / "data.lst")

    # Setup error logger (writes to errors.jsonl)
    error_logger = logging.getLogger("error_log")
    error_logger.setLevel(logging.ERROR)
    error_logger.handlers.clear()
    error_fh = logging.FileHandler(error_log_path, mode="w", encoding="utf-8")
    error_fh.setFormatter(logging.Formatter("%(message)s"))
    error_logger.addHandler(error_fh)

    # Progress and error tracking
    processed_count = 0
    error_count = 0
    write_error_count = 0
    failed_ids = []
    shard_idx = 0
    shard_sample_count = 0
    shard_duration = 0.0
    shard_manifest = {}  # shard_idx -> (tar_path, jsonl_path, count, duration)

    tar_writer = None
    jsonl_file = None

    def open_new_shard():
        # Close the current shard (if any), record it in the manifest,
        # and open the next tar/jsonl pair.
        nonlocal tar_writer, jsonl_file, shard_idx, shard_sample_count, shard_duration
        if tar_writer is not None:
            tar_writer.close()
        if jsonl_file is not None:
            jsonl_file.close()
        # Record manifest for the previous shard
        if shard_idx > 0 and shard_sample_count > 0:
            prev_idx = shard_idx - 1
            shard_manifest[prev_idx] = (
                os.path.abspath(tar_output_pattern % prev_idx),
                os.path.abspath(jsonl_output_pattern % prev_idx),
                shard_sample_count,
                shard_duration,
            )
        tar_fname = tar_output_pattern % shard_idx
        jsonl_fname = jsonl_output_pattern % shard_idx
        tar_writer = wds.TarWriter(tar_fname)
        jsonl_file = open(jsonl_fname, "w", encoding="utf-8")
        shard_idx += 1
        shard_sample_count = 0
        shard_duration = 0.0

    def write_sample(key, audio_tokens_np, metadata):
        # Append one token record + metadata line to the current shard;
        # write failures are logged and counted, not raised here.
        nonlocal shard_sample_count, write_error_count, shard_duration
        assert tar_writer is not None and jsonl_file is not None
        try:
            token_record = serialise_numpy(key, audio_tokens_np)
            json_record = _encode_metadata(metadata)
            tar_writer.write(token_record)
            jsonl_file.write(json_record.decode("utf-8") + "\n")
            shard_sample_count += 1
            shard_duration += metadata.get("audio_duration", 0.0)
        except Exception as exc:
            write_error_count += 1
            failed_ids.append(key)
            error_logger.error(
                json.dumps({"id": key, "reason": str(exc)}, ensure_ascii=False)
            )
            logging.error(f"Write failed for sample {key}: {exc}")

    def handle_result(result):
        # Route one worker result to the shard writer or the error log.
        nonlocal processed_count, error_count
        if result["status"] == "success":
            # Rotate shard if needed
            if tar_writer is None or shard_sample_count >= samples_per_shard:
                open_new_shard()
            write_sample(result["key"], result["audio_tokens"], result["metadata"])
            processed_count += 1
        else:
            error_count += 1
            failed_ids.append(result["key"])
            error_logger.error(
                json.dumps(
                    {"id": result["key"], "reason": result["error_msg"]},
                    ensure_ascii=False,
                )
            )
            if not args.skip_errors:
                raise RuntimeError(
                    f"Sample {result['key']} processing failed due "
                    f"to {result['error_msg']} - terminating"
                )
            logging.warning(
                f"Skipping failed sample {result['key']}: {result['error_msg']}"
            )

    main_progress = tqdm(total=total_samples, desc="Extracting Audio Tokens")

    try:
        with ProcessPoolExecutor(
            max_workers=num_processes,
            initializer=process_init,
            initargs=(
                rank_queue,
                args.tokenizer_path,
                args.noise_manifest,
                args.rir_manifest,
            ),
        ) as executor:
            logging.info(f"Submitting tasks... ({num_processes} workers)")
            futures = set()
            # Bound the in-flight queue to cap memory held by pending samples.
            max_pending = num_processes * 10

            def drain_completed():
                """Wait for at least one future to complete, process all done."""
                nonlocal futures
                done, _ = wait(futures, return_when=FIRST_COMPLETED)
                for f in done:
                    futures.discard(f)
                    result = f.result()
                    main_progress.update(1)
                    handle_result(result)
                    main_progress.set_postfix(
                        Samples=processed_count,
                        Errors=error_count,
                    )

            # Stream samples from DataLoader
            for sample in dataloader:
                if len(futures) >= max_pending:
                    drain_completed()

                future = executor.submit(process_single_sample, sample)
                futures.add(future)

            # Process remaining futures
            logging.info("Processing remaining pending samples...")
            while futures:
                drain_completed()

    except Exception:
        logging.error("Critical error during processing", exc_info=True)
        raise
    finally:
        main_progress.close()
        if tar_writer is not None:
            tar_writer.close()
        if jsonl_file is not None:
            jsonl_file.close()
        # Record the last shard in the manifest
        if shard_idx > 0 and shard_sample_count > 0:
            last_idx = shard_idx - 1
            shard_manifest[last_idx] = (
                os.path.abspath(tar_output_pattern % last_idx),
                os.path.abspath(jsonl_output_pattern % last_idx),
                shard_sample_count,
                shard_duration,
            )

    # Write manifest file (data.lst)
    with open(manifest_path, "w", encoding="utf-8") as mf:
        for idx in sorted(shard_manifest.keys()):
            tar_path, jsonl_path, count, duration = shard_manifest[idx]
            mf.write(f"{tar_path} {jsonl_path} {count} {duration:.3f}\n")

    # Output final statistics
    total_failed = error_count + write_error_count
    filtered_and_skipped = total_samples - processed_count - total_failed
    logging.info(
        f"Processing Complete - Successful: {processed_count}, Failed: {total_failed}, "
        f"Filtered/Skipped: {filtered_and_skipped}, Shards written: {shard_idx}"
    )
    logging.info(f"Manifest written to: {manifest_path} ({len(shard_manifest)} shards)")
    if total_failed > 0:
        logging.info(f"Error details: {error_log_path}")
    if failed_ids and args.skip_errors:
        logging.warning(
            f"Failed sample IDs (count: {len(failed_ids)}): {failed_ids[:100]}..."
        )
    if write_error_count > 0 and not args.skip_errors:
        raise RuntimeError(
            f"{write_error_count} samples failed to write - check logs for details"
        )
822
+
823
+
824
# Script entry point.
if __name__ == "__main__":
    main()
omnivoice/scripts/jsonl_to_webdataset.py ADDED
@@ -0,0 +1,439 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ Pack a JSONL audio dataset into a customed WebDataset shards
20
+ (paired .tar and .jsonl files).
21
+
22
+ Usage:
23
+ python jsonl_to_webdataset.py \
24
+ --input data.jsonl \
25
+ --output output_dir/ \
26
+ --workers 16 \
27
+ --threads 4 \
28
+ --shard-size 1000 \
29
+ --sr 24000
30
+
31
+ Input JSONL format (one JSON object per line):
32
+ {"id": "utt_001", "audio_path": "/data/wavs/001.wav", "text": "hello world", ...}
33
+
34
+ Required fields: "id", "audio_path", "text"
35
+ All other fields are preserved in the output metadata.
36
+
37
+ Output structure:
38
+ output_dir/
39
+ ├── audios/ # WebDataset tar shards
40
+ │ ├── shard_000000.tar
41
+ │ ├── shard_000001.tar
42
+ │ └── ...
43
+ ├── txts/ # Per-shard JSONL metadata (with audio_duration added)
44
+ │ ├── shard_000000.jsonl
45
+ │ ├── shard_000001.jsonl
46
+ │ └── ...
47
+ ├── data.lst # Manifest: <tar_path> <jsonl_path> <sample_count> <total_duration>
48
+ └── errors.jsonl # Failed samples with error details
49
+ """
50
+
51
+ import argparse
52
+ import io
53
+ import json
54
+ import logging
55
+ import multiprocessing as mp
56
+ import os
57
+ import random
58
+ from concurrent.futures import (
59
+ FIRST_COMPLETED,
60
+ ProcessPoolExecutor,
61
+ ThreadPoolExecutor,
62
+ as_completed,
63
+ wait,
64
+ )
65
+ from itertools import islice
66
+ from pathlib import Path
67
+
68
+ import torchaudio
69
+ import webdataset as wds
70
+ from tqdm import tqdm
71
+
72
+ from omnivoice.utils.common import str2bool
73
+
74
+
75
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for the JSONL -> WebDataset packing script."""
    parser = argparse.ArgumentParser(
        description="Pack JSONL audio dataset into WebDataset shards."
    )
    add = parser.add_argument
    add("--input", type=str, default="data.jsonl", help="Path to input JSONL file")
    add("--output", type=str, default="emilia", help="Path to output directory")
    add(
        "--workers",
        type=int,
        default=16,
        help="Number of worker processes (default: 16)",
    )
    add("--threads", type=int, default=4, help="Number of threads per worker process.")
    add(
        "--shard-size",
        type=int,
        default=1000,
        help="Number of samples per shard (default: 1000)",
    )
    add("--sr", type=int, default=24000, help="Target sample rate (default: 24000)")
    add("--shuffle", type=str2bool, default=True, help="Shuffle data by default.")
    add(
        "--shuffle-seed",
        type=int,
        default=42,
        help="Random seed for shuffle (default: 42)",
    )
    add(
        "--min-duration",
        type=float,
        default=None,
        help="Filter out samples shorter than this (seconds).",
    )
    add(
        "--max-duration",
        type=float,
        default=None,
        help="Filter out samples >= this duration (seconds).",
    )
    return parser
134
+
135
+
136
def read_jsonl(file_path):
    """Yield one parsed JSON object per non-empty line of *file_path*."""
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw in handle:
            stripped = raw.strip()
            if not stripped:
                continue
            yield json.loads(stripped)
142
+
143
+
144
def chunked_reader(iterator, chunk_size):
    """Yield successive lists of at most *chunk_size* items from *iterator*."""
    stream = iter(iterator)
    while True:
        batch = list(islice(stream, chunk_size))
        if not batch:
            return
        yield batch
148
+
149
+
150
def process_audio_item(meta, target_sr):
    """Load, resample, and FLAC-encode a single audio record.

    Returns ``{"ok": (wds_sample, meta)}`` on success — *meta* gains an
    ``audio_duration`` field — or ``{"error": {...}}`` describing the
    failure, so callers never see an exception from this function.
    """
    key = meta.get("id")
    audio_path = meta.get("audio_path")

    if not key or not audio_path:
        return {
            "error": {
                "id": key,
                "audio_path": audio_path,
                "reason": "missing id or audio_path",
            }
        }

    try:
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"{audio_path} not found")

        waveform, sr = torchaudio.load(audio_path)
        # Duration is computed from the original waveform, before resampling
        # (the value is identical either way).
        meta["audio_duration"] = waveform.shape[1] / sr

        if target_sr and sr != target_sr:
            waveform = torchaudio.functional.resample(waveform, sr, target_sr)
            sr = target_sr

        # Encode as 16-bit FLAC in memory.
        buf = io.BytesIO()
        torchaudio.save(buf, waveform, sr, format="flac", bits_per_sample=16)

        return {"ok": ({"__key__": key, "flac": buf.getvalue()}, meta)}
    except Exception as e:
        return {"error": {"id": key, "audio_path": audio_path, "reason": str(e)}}
188
+
189
+
190
def process_single_shard(
    shard_idx,
    records,
    output_tar_pattern,
    output_jsonl_pattern,
    target_sr,
    num_threads=4,
    min_duration=None,
    max_duration=None,
):
    """Encode one shard's records into a paired tar + jsonl file.

    Audio decoding/encoding is fanned out over a thread pool; the duration
    filter is applied after decoding (using the measured audio_duration).
    Shards that end up empty are deleted from disk.

    Returns:
        (shard_idx, processed_count, error_count, filtered_count,
         total_duration, errors) where ``errors`` is a list of error dicts
        from ``process_audio_item``.
    """
    tar_fname = output_tar_pattern % shard_idx
    jsonl_fname = output_jsonl_pattern % shard_idx

    processed_count = 0
    filtered_count = 0
    error_count = 0
    total_duration = 0.0
    errors = []

    with wds.TarWriter(tar_fname) as sink, open(
        jsonl_fname, "w", encoding="utf-8"
    ) as jsonl_f:

        with ThreadPoolExecutor(max_workers=num_threads) as thread_pool:
            futures = []

            for meta in records:
                f = thread_pool.submit(process_audio_item, meta, target_sr)
                futures.append(f)

            # NOTE: as_completed yields in completion order, so samples may
            # be written in a different order than they appear in `records`.
            for f in as_completed(futures):
                result = f.result()

                if "error" in result:
                    error_count += 1
                    errors.append(result["error"])
                    continue

                sample, meta = result["ok"]
                dur = meta.get("audio_duration", 0.0)

                # Duration filtering (based on actual audio_duration computed above)
                if min_duration is not None and dur < min_duration:
                    filtered_count += 1
                    continue
                if max_duration is not None and dur >= max_duration:
                    filtered_count += 1
                    continue

                sink.write(sample)

                jsonl_f.write(json.dumps(meta, ensure_ascii=False) + "\n")

                total_duration += dur
                processed_count += 1

    # Clean up empty shard files
    if processed_count == 0:
        for p in (tar_fname, jsonl_fname):
            if os.path.exists(p):
                os.remove(p)

    return (
        shard_idx,
        processed_count,
        error_count,
        filtered_count,
        total_duration,
        errors,
    )
260
+
261
+
262
def count_lines(path):
    """Count newline characters in *path*, scanning it in 1 MiB binary chunks."""
    total = 0
    with open(path, "rb") as handle:
        while chunk := handle.read(1 << 20):
            total += chunk.count(b"\n")
    return total
265
+
266
+
267
def pack_dataset(
    input_jsonl,
    output_dir,
    samples_per_shard=5000,
    num_workers=16,
    target_sr=24000,
    threads_per_worker=4,
    shuffle=False,
    shuffle_seed=None,
    min_duration=None,
    max_duration=None,
):
    """Pack a JSONL audio dataset into paired WebDataset tar/jsonl shards.

    Shards are processed in parallel by a process pool (each worker fans out
    per-sample audio work over its own thread pool). Writes ``audios/*.tar``
    and ``txts/*.jsonl`` under *output_dir*, plus a ``data.lst`` manifest
    and an ``errors.jsonl`` error log.
    """
    input_path = Path(input_jsonl)
    output_dir = Path(output_dir)
    output_tar_dir = output_dir / "audios"
    output_tar_dir.mkdir(parents=True, exist_ok=True)
    output_jsonl_dir = output_dir / "txts"
    output_jsonl_dir.mkdir(parents=True, exist_ok=True)

    output_tar_pattern = str(output_tar_dir / "shard-%06d.tar")
    output_jsonl_pattern = str(output_jsonl_dir / "shard-%06d.jsonl")

    error_log_path = str(output_dir / "errors.jsonl")

    # Setup error logger
    error_logger = logging.getLogger("error_log")
    error_logger.setLevel(logging.ERROR)
    error_logger.handlers.clear()
    fh = logging.FileHandler(error_log_path, mode="w", encoding="utf-8")
    fh.setFormatter(logging.Formatter("%(message)s"))
    error_logger.addHandler(fh)

    shard_manifest = {}

    print(f"Reading input: {input_path}")
    print(f"Output dir: {output_dir}")
    print(f"Strategy: {num_workers} Processes x {threads_per_worker} Threads")

    if shuffle:
        # Shuffling requires materialising the whole JSONL in memory first.
        print("Load input dataset...")
        entries = list(read_jsonl(input_path))
        random.seed(shuffle_seed)
        random.shuffle(entries)
        print(f"Shuffled {len(entries)} entries (seed={shuffle_seed})")
        total_lines = len(entries)
        chunk_gen = chunked_reader(iter(entries), samples_per_shard)
    else:
        print("Calculating total lines...")
        total_lines = count_lines(input_path)
        chunk_gen = chunked_reader(read_jsonl(input_path), samples_per_shard)

    if min_duration is not None or max_duration is not None:
        print(
            f"Duration filter: [{min_duration or 0:.2f}s"
            f", {max_duration or float('inf'):.1f}s) (applied after audio decoding)"
        )

    total_shards_est = (total_lines + samples_per_shard - 1) // samples_per_shard
    print(f"Total samples: {total_lines}, Estimated shards: {total_shards_est}")

    with ProcessPoolExecutor(max_workers=num_workers) as executor:

        futures = set()

        shard_idx = 0
        total_processed = 0
        total_errors = 0
        total_filtered = 0

        pbar = tqdm(
            total=total_shards_est,
            desc="Shards Processed",
            unit="shard",
        )

        def submit_next_chunks(limit):
            """Pull up to `limit` chunks from generator, submit them."""
            nonlocal shard_idx
            submitted = 0
            for chunk in chunk_gen:
                f = executor.submit(
                    process_single_shard,
                    shard_idx,
                    chunk,
                    output_tar_pattern,
                    output_jsonl_pattern,
                    target_sr,
                    threads_per_worker,
                    min_duration,
                    max_duration,
                )
                futures.add(f)
                shard_idx += 1
                submitted += 1
                if submitted >= limit:
                    break

        # Prime the pool with two chunks per worker, then refill one chunk
        # per completed shard to bound memory usage.
        submit_next_chunks(num_workers * 2)

        while futures:
            done, _ = wait(futures, return_when=FIRST_COMPLETED)

            for f in done:
                futures.remove(f)

                try:
                    s_idx, p_count, e_count, f_count, s_duration, errors = f.result()
                    total_processed += p_count
                    total_errors += e_count
                    total_filtered += f_count

                    # Write error log
                    for err in errors:
                        err["shard_idx"] = s_idx
                        error_logger.error(json.dumps(err, ensure_ascii=False))

                    # Only non-empty shards enter the manifest (empty ones
                    # were already deleted by the worker).
                    if p_count > 0:
                        tar_abs = os.path.abspath(output_tar_pattern % s_idx)
                        jsonl_abs = os.path.abspath(output_jsonl_pattern % s_idx)
                        shard_manifest[s_idx] = (
                            tar_abs,
                            jsonl_abs,
                            p_count,
                            s_duration,
                        )

                    pbar.set_postfix(
                        {
                            "Samples": total_processed,
                            "Filtered": total_filtered,
                            "Errors": total_errors,
                        }
                    )
                    pbar.update(1)
                except Exception as e:
                    print(f"Shard task failed: {e}")

                submit_next_chunks(1)

        pbar.close()

    # Write final manifest file (data.lst)
    manifest_path = str(output_dir / "data.lst")
    with open(manifest_path, "w", encoding="utf-8") as mf:
        for idx in sorted(shard_manifest.keys()):
            tar_path, jsonl_path, count, duration = shard_manifest[idx]
            mf.write(f"{tar_path} {jsonl_path} {count} {duration:.3f}\n")

    print(f"\nDone! Output saved to {output_dir}")
    print(f"Successfully packed: {total_processed}")
    print(f"Filtered by duration: {total_filtered}")
    print(f"Failed: {total_errors}")
    print(f"Manifest written to: {manifest_path} ({len(shard_manifest)} shards)")
    if total_errors > 0:
        print(f"Error details: {error_log_path}")
422
+
423
+
424
# Script entry point: parse CLI args and run the packing pipeline.
if __name__ == "__main__":
    # NOTE(review): "spawn" presumably chosen because workers import
    # torch/torchaudio, which are not fork-safe everywhere — confirm
    # before changing the start method.
    mp.set_start_method("spawn", force=True)

    args = build_parser().parse_args()
    pack_dataset(
        input_jsonl=args.input,
        output_dir=args.output,
        samples_per_shard=args.shard_size,
        num_workers=args.workers,
        target_sr=args.sr,
        threads_per_worker=args.threads,
        shuffle=args.shuffle,
        shuffle_seed=args.shuffle_seed,
        min_duration=args.min_duration,
        max_duration=args.max_duration,
    )
omnivoice/training/__init__.py ADDED
File without changes
omnivoice/training/builder.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Builders for constructing training components.
19
+
20
+ Provides factory functions to assemble the model, tokenizer, and data loaders
21
+ from a ``TrainingConfig``. Called by ``omnivoice.cli.train`` to set up training.
22
+
23
+ Key functions:
24
+ - ``build_model_and_tokenizer()``: Loads the model and text tokenizer.
25
+ - ``build_dataloaders()``: Builds packed train/eval data loaders
26
+ from a data config JSON.
27
+ """
28
+
29
+ import logging
30
+ from functools import partial
31
+ from typing import Tuple
32
+
33
+ import torch
34
+ from torch.utils.data import DataLoader
35
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
36
+ from transformers import logging as hf_logging
37
+ from transformers.trainer_utils import seed_worker
38
+
39
+ from omnivoice.data.batching import PackingIterableDataset
40
+ from omnivoice.data.collator import PackingDataCollator
41
+ from omnivoice.data.dataset import WebDatasetReader, prepare_data_manifests_from_json
42
+ from omnivoice.data.processor import OmniVoiceSampleProcessor
43
+ from omnivoice.models.omnivoice import OmniVoice, OmniVoiceConfig
44
+ from omnivoice.training.config import TrainingConfig
45
+
46
+ logger = logging.getLogger(__name__)
47
+
48
+
49
def build_model_and_tokenizer(
    config: TrainingConfig,
) -> Tuple[OmniVoice, AutoTokenizer]:
    """Load Tokenizer and Model, handle resizing and special tokens.

    When ``config.init_from_checkpoint`` is set, both tokenizer and model
    weights come from that checkpoint; otherwise the tokenizer and a fresh
    LLM backbone are loaded from ``config.llm_name_or_path`` and wrapped in
    a new ``OmniVoice``. Embeddings are resized if special tokens were added.
    """
    logger.info("Initializing Model & Tokenizer...")

    # 1. Tokenizer
    tokenizer_path = (
        config.init_from_checkpoint
        if config.init_from_checkpoint
        else config.llm_name_or_path
    )
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Control tokens used by the OmniVoice prompt format.
    new_tokens = [
        "<|denoise|>",
        "<|lang_start|>",
        "<|lang_end|>",
        "<|instruct_start|>",
        "<|instruct_end|>",
        "<|text_start|>",
        "<|text_end|>",
    ]

    # Only add tokens not already present (idempotent when resuming from a
    # checkpoint whose tokenizer already contains them).
    tokens_to_add = [t for t in new_tokens if t not in tokenizer.get_vocab()]
    if tokens_to_add:
        tokenizer.add_special_tokens({"additional_special_tokens": tokens_to_add})

    # 2. Model
    if config.init_from_checkpoint:
        logger.info(f"Loading weights from {config.init_from_checkpoint}")
        model = OmniVoice.from_pretrained(
            config.init_from_checkpoint,
            attn_implementation="flex_attention",
            dtype=torch.float32,
            train=True,
        )
    else:
        llm_config = AutoConfig.from_pretrained(config.llm_name_or_path)

        ov_config = OmniVoiceConfig(
            audio_vocab_size=config.audio_vocab_size,
            audio_mask_id=config.audio_mask_id,
            num_audio_codebook=config.num_audio_codebook,
            audio_codebook_weights=config.audio_codebook_weights,
            llm_config=llm_config,
        )

        original_level = hf_logging.get_verbosity()
        hf_logging.set_verbosity_error()  # suppress expected lm_head.weight warnings

        llm = AutoModel.from_pretrained(
            config.llm_name_or_path,
            attn_implementation="flex_attention",
            dtype=torch.float32,
        )

        hf_logging.set_verbosity(original_level)
        model = OmniVoice(config=ov_config, llm=llm)

    # 3. Resize Embeddings (keep the model vocab in sync with the tokenizer)
    if len(tokenizer) != model.config.llm_config.vocab_size:
        model.llm.resize_token_embeddings(len(tokenizer))
        model.config.llm_config.vocab_size = len(tokenizer)

    # 4. Config IDs
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.bos_token_id = tokenizer.bos_token_id
    model.config.eos_token_id = tokenizer.eos_token_id

    return model, tokenizer
121
+
122
+
123
def build_dataloaders(
    config: TrainingConfig, tokenizer: AutoTokenizer
) -> Tuple[DataLoader, DataLoader]:
    """Setup Data Pipeline: Manifests -> WDS -> Packing -> Loaders.

    Returns ``(train_loader, eval_loader)``. Note: despite the annotation,
    ``eval_loader`` is ``None`` when the data config declares no dev
    manifests — callers must handle that case.
    """
    logger.info("Initializing Data Readers...")

    # Per-sample preprocessing (text tokenisation, masking/prompt ratios).
    processor = OmniVoiceSampleProcessor(
        text_tokenizer=tokenizer,
        num_channels=config.num_audio_codebook,
        audio_mask_id=config.audio_mask_id,
        prompt_ratio_range=config.prompt_ratio_range,
        mask_ratio_range=config.mask_ratio_range,
        drop_cond_ratio=config.drop_cond_ratio,
        language_ratio=config.language_ratio,
        use_pinyin_ratio=config.use_pinyin_ratio,
        instruct_ratio=config.instruct_ratio,
        only_instruct_ratio=config.only_instruct_ratio,
    )

    train_manifests, dev_manifests = prepare_data_manifests_from_json(
        config.data_config
    )
    raw_train_ds = WebDatasetReader(manifests=train_manifests, evaluation=False)

    train_dataset = PackingIterableDataset(raw_train_ds, processor, config.batch_tokens)

    collate_fn = PackingDataCollator(processor, config.batch_tokens)

    # Seed each loader worker deterministically, offset by the DDP rank
    # (rank 0 when torch.distributed is not initialised).
    init_fn = partial(
        seed_worker,
        num_workers=config.num_workers,
        rank=torch.distributed.get_rank() if torch.distributed.is_initialized() else 0,
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=None,  # Each item is a batch packed to the target batch_tokens
        num_workers=config.num_workers,
        collate_fn=collate_fn,
        worker_init_fn=init_fn,
        pin_memory=True,
        prefetch_factor=4,
    )

    eval_loader = None
    if dev_manifests:
        raw_dev_ds = WebDatasetReader(manifests=dev_manifests, evaluation=True)
        dev_dataset = PackingIterableDataset(raw_dev_ds, processor, config.batch_tokens)
        eval_loader = DataLoader(
            dev_dataset,
            batch_size=None,  # Each item is a batch packed to the target batch_tokens
            num_workers=1,
            collate_fn=collate_fn,
            pin_memory=True,
            prefetch_factor=2,
        )

    return train_loader, eval_loader
omnivoice/training/checkpoint.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Checkpoint saving, resuming, and training logging.
19
+
20
+ Provides utilities for saving/loading training checkpoints and logging metrics
21
+ to console and trackers (TensorBoard/WandB). Used by ``OmniTrainer``.
22
+
23
+ Key components:
24
+ - ``TrainLogger``: Logs training metrics to console and Accelerate trackers.
25
+ - ``save_checkpoint()``: Saves model, optimizer, and scheduler state.
26
+ - ``load_checkpoint()``: Restores training state from a checkpoint directory.
27
+ """
28
+
29
+ import logging
30
+ import os
31
+ import shutil
32
+ import time
33
+ from typing import Any, Dict, Optional
34
+
35
+ import torch
36
+ from accelerate import Accelerator
37
+ from tqdm.auto import tqdm
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+
42
class TrainLogger:
    """Console and tracker logging helper for the training loop.

    Owns a tqdm progress bar (created on the main process only) and forwards
    metric dicts to the Accelerate trackers (TensorBoard/WandB).
    """

    def __init__(self, accelerator: Accelerator, total_steps: int, logging_steps: int):
        self.accelerator = accelerator
        self.total_steps = total_steps
        # Stored for reference; the periodic logging cadence itself is driven
        # by the trainer calling log_metrics(), not by this class.
        self.logging_steps = logging_steps
        self.start_time = None
        self.progress_bar = None

    def start(self, start_step: int = 0):
        """Record the wall-clock start time and create the progress bar."""
        self.start_time = time.time()

        if not self.accelerator.is_main_process:
            return
        self.progress_bar = tqdm(
            total=self.total_steps,
            initial=start_step,
            desc="Training",
            dynamic_ncols=True,
            disable=not self.accelerator.is_local_main_process,
        )

    def update(
        self, step: int, loss: Optional[float] = None, lr: Optional[float] = None
    ):
        """Advance the bar by one step and refresh its live loss/lr postfix."""
        if not self.progress_bar:
            return

        self.progress_bar.update(1)

        live_stats = {}
        if loss is not None:
            live_stats["loss"] = f"{loss:.4f}"
        if lr is not None:
            live_stats["lr"] = f"{lr:.2e}"

        if live_stats:
            self.progress_bar.set_postfix(live_stats)

    def log_metrics(self, step: int, metrics: Dict[str, Any]):
        """Send *metrics* to the trackers and echo a one-line console summary."""
        # Always log to trackers (TensorBoard, etc.), on every rank.
        self.accelerator.log(metrics, step=step)

        if not self.accelerator.is_main_process:
            return

        pieces = []
        for name, value in metrics.items():
            if isinstance(value, float):
                fixed = f"{value:.4f}"
                # Tiny-but-nonzero floats would render as 0.0000; switch to
                # scientific notation so they stay readable.
                if fixed == "0.0000" and value != 0:
                    pieces.append(f"{name}: {value:.2e}")
                else:
                    pieces.append(f"{name}: {fixed}")
            else:
                pieces.append(f"{name}: {value}")

        msg = f"Step {step} | " + " | ".join(pieces)
        # tqdm.write keeps the progress bar intact; fall back to the module
        # logger when no bar exists.
        if self.progress_bar:
            self.progress_bar.write(msg)
        else:
            logger.info(msg)

    def close(self):
        """Dispose of the progress bar, if one was created."""
        if self.progress_bar:
            self.progress_bar.close()
116
+
117
+
118
def save_checkpoint(
    accelerator: Accelerator,
    model: torch.nn.Module,
    tokenizer: Any,
    output_dir: str,
    step: int,
    keep_last_n: int = 3,
):
    """Persist the full training state under ``output_dir/checkpoint-{step}``.

    Writes three things: the Accelerate state (optimizer, scheduler, RNG,
    scaler), the unwrapped model in HuggingFace format, and the tokenizer.
    Afterwards, old checkpoints are rotated so at most *keep_last_n* remain.

    Args:
        accelerator: The active ``Accelerator`` instance.
        model: The (possibly wrapped) model being trained.
        tokenizer: Tokenizer with a ``save_pretrained`` method.
        output_dir: Root directory holding all checkpoints.
        step: Current global step; used in the checkpoint folder name.
        keep_last_n: How many checkpoints to keep; <= 0 disables rotation.
    """
    checkpoint_dir = os.path.join(output_dir, f"checkpoint-{step}")

    # 1. Accelerate state: optimizer, scheduler, RNG, grad scaler.
    accelerator.save_state(checkpoint_dir)

    # 2. Model in HF format (config.json + weights), saved from the
    #    unwrapped module so DDP/DeepSpeed wrappers don't leak into the files.
    accelerator.unwrap_model(model).save_pretrained(
        checkpoint_dir,
        is_main_process=accelerator.is_main_process,
        save_function=accelerator.save,
    )

    # 3. Tokenizer (main process only — it's identical on every rank).
    if accelerator.is_main_process:
        tokenizer.save_pretrained(checkpoint_dir)

    logger.info(f"Saved checkpoint to {checkpoint_dir}")

    # 4. Rotation: drop the oldest checkpoints beyond keep_last_n.
    if not (accelerator.is_main_process and keep_last_n > 0):
        return

    existing = [
        name
        for name in os.listdir(output_dir)
        if name.startswith("checkpoint-")
        and os.path.isdir(os.path.join(output_dir, name))
    ]
    # Order by the numeric step suffix, oldest first.
    existing.sort(key=lambda name: int(name.split("-")[-1]))

    if len(existing) > keep_last_n:
        for stale in existing[:-keep_last_n]:
            shutil.rmtree(os.path.join(output_dir, stale))
            logger.info(f"Removed old checkpoint {stale}")
165
+
166
+
167
def load_checkpoint(accelerator: Accelerator, checkpoint_path: str):
    """Restore training state previously written by ``accelerator.save_state``.

    Args:
        accelerator: The active ``Accelerator`` instance.
        checkpoint_path: Directory of the checkpoint (e.g. ``.../checkpoint-500``).

    Returns:
        The step number parsed from the directory name's ``-N`` suffix, or 0
        when the name carries no numeric suffix.
    """
    logger.info(f"Resuming from {checkpoint_path}")
    accelerator.load_state(checkpoint_path)

    # Infer the global step from the "checkpoint-N" naming convention.
    try:
        tail = os.path.basename(os.path.normpath(checkpoint_path))
        return int(tail.split("-")[-1])
    except ValueError:
        return 0
omnivoice/training/config.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Training configuration dataclass.
19
+
20
+ Defines ``TrainingConfig``, a dataclass that holds all hyperparameters and paths
21
+ for training. Loaded from a JSON config file via ``TrainingConfig.from_json()``
22
+ in ``omnivoice.cli.train``.
23
+ """
24
+
25
+ import json
26
+ from dataclasses import asdict, dataclass, field
27
+ from typing import List, Optional, Tuple
28
+
29
+
30
+ @dataclass
31
+ class TrainingConfig:
32
+ # Key Paths
33
+ output_dir: Optional[str] = None
34
+ data_config: Optional[str] = None
35
+
36
+ # Model Specific
37
+ llm_name_or_path: str = "Qwen/Qwen3-0.6B"
38
+ audio_vocab_size: int = 1025 # valid vocab size + 1 (mask token)
39
+ audio_mask_id: int = 1024 # 1024 is the 1025-th token
40
+ num_audio_codebook: int = 8
41
+
42
+ # Model Training Specific
43
+ audio_codebook_weights: List[float | int] = field(
44
+ default_factory=lambda: [8, 8, 6, 6, 4, 4, 2, 2]
45
+ )
46
+ drop_cond_ratio: float = 0.1
47
+ prompt_ratio_range: Tuple[float, float] = field(default_factory=lambda: (0.0, 0.3))
48
+ mask_ratio_range: Tuple[float, float] = field(default_factory=lambda: (0.0, 1.0))
49
+ language_ratio: float = 0.8
50
+ use_pinyin_ratio: float = 0.3
51
+ instruct_ratio: float = 1.0
52
+ only_instruct_ratio: float = 0.5
53
+
54
+ # Init settings
55
+ resume_from_checkpoint: Optional[str] = None
56
+ init_from_checkpoint: Optional[str] = None
57
+
58
+ # Training Hyperparams
59
+ learning_rate: float = 1e-4
60
+ weight_decay: float = 0.01
61
+ max_grad_norm: float = 1.0
62
+ steps: int = 300000
63
+ seed: int = 42
64
+ lr_scheduler_type: str = "cosine"
65
+ warmup_type: str = "ratio"
66
+ warmup_ratio: float = 0.03
67
+ warmup_steps: int = 2000
68
+
69
+ # Data
70
+ batch_tokens: int = 8192
71
+ gradient_accumulation_steps: int = 1
72
+ num_workers: int = 8
73
+
74
+ # System
75
+ mixed_precision: str = "bf16"
76
+ allow_tf32: bool = True
77
+ use_deepspeed: bool = False
78
+ deepspeed_config: Optional[str] = None
79
+
80
+ # Logging
81
+ logging_steps: int = 100
82
+ eval_steps: int = 1000
83
+ save_steps: int = 10000
84
+ keep_last_n_checkpoints: int = -1
85
+
86
+ @classmethod
87
+ def from_json(cls, json_path: str):
88
+ with open(json_path, "r") as f:
89
+ cfg_dict = json.load(f)
90
+ valid_keys = cls.__annotations__.keys()
91
+ filtered_dict = {k: v for k, v in cfg_dict.items() if k in valid_keys}
92
+ instance = cls(**filtered_dict)
93
+ return instance
94
+
95
+ def save_to_json(self, json_path: str):
96
+ data = asdict(self)
97
+ with open(json_path, "w") as f:
98
+ json.dump(data, f, indent=4)
omnivoice/training/trainer.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Training loop for OmniVoice.
19
+
20
+ Wraps the HuggingFace Accelerate training loop with checkpoint saving/resuming,
21
+ evaluation, gradient accumulation, and learning rate scheduling.
22
+ Launched via ``omnivoice.cli.train``.
23
+ """
24
+
25
+ import logging
26
+ import math
27
+ import os
28
+ import sys
29
+ import time
30
+ from datetime import timedelta
31
+ from typing import Any, Optional
32
+
33
+ import torch
34
+ from accelerate import Accelerator, DistributedDataParallelKwargs
35
+ from accelerate.utils import DeepSpeedPlugin, InitProcessGroupKwargs, set_seed
36
+ from torch.utils.data import DataLoader
37
+ from transformers import (
38
+ get_cosine_schedule_with_warmup,
39
+ get_constant_schedule_with_warmup,
40
+ )
41
+
42
+ from omnivoice.training.checkpoint import TrainLogger, load_checkpoint
43
+ from omnivoice.training.checkpoint import save_checkpoint as engine_save_checkpoint
44
+
45
+ logger = logging.getLogger(__name__)
46
+
47
+
48
class OmniTrainer:
    """Step-based training loop for OmniVoice built on HuggingFace Accelerate.

    Handles accelerator/DeepSpeed setup, optimizer and LR-scheduler creation,
    gradient accumulation, periodic logging/evaluation, and checkpoint
    save/resume. The loop is bounded by optimizer steps (``config.steps``),
    not epochs; the train dataloader is re-iterated whenever exhausted.
    """

    def __init__(
        self,
        model: torch.nn.Module,
        config: Any,  # TrainingConfig
        train_dataloader: DataLoader,
        eval_dataloader: Optional[DataLoader] = None,
        tokenizer: Optional[Any] = None,
        optimizer: Optional[torch.optim.Optimizer] = None,
        lr_scheduler: Optional[Any] = None,
    ):
        self.config = config
        self.model = model
        self.tokenizer = tokenizer
        # NOTE(review): the dataloaders are NOT passed through
        # accelerator.prepare() below — presumably the packed iterable
        # dataset already shards per rank; confirm with the data pipeline.
        self.train_dataloader = train_dataloader
        self.eval_dataloader = eval_dataloader

        # 1. Initialize Accelerator
        self.accelerator = self._init_accelerator()

        # 2. Setup Optimizer & Scheduler if not provided
        if optimizer is None:
            self.optimizer, self.lr_scheduler = self.create_optimizer_and_scheduler()
        else:
            self.optimizer = optimizer
            self.lr_scheduler = lr_scheduler

        # 3. DeepSpeed Hack (Batch Size fix)
        # The DataLoader uses batch_size=None (pre-packed batches), which
        # DeepSpeed cannot infer; force the micro batch size to 1.
        if self.accelerator.distributed_type == "DEEPSPEED":
            self.accelerator.state.deepspeed_plugin.deepspeed_config[
                "train_micro_batch_size_per_gpu"
            ] = 1

        # 4. Prepare with Accelerator
        (self.model, self.optimizer, self.lr_scheduler,) = self.accelerator.prepare(
            self.model,
            self.optimizer,
            self.lr_scheduler,
        )

        # Number of completed optimizer steps (not micro-batches).
        self.global_step = 0
        self.epoch = 0

    def _init_accelerator(self) -> Accelerator:
        """Initialize Accelerator, DeepSpeed, and Logging."""
        # TF32 setup
        if getattr(self.config, "allow_tf32", False):
            torch.set_float32_matmul_precision("high")

        # Init handlers
        ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=False)
        # Long NCCL timeout: packed-data ranks may finish steps at uneven pace.
        init_kwargs = InitProcessGroupKwargs(timeout=timedelta(minutes=60))

        # DeepSpeed setup
        deepspeed_plugin = None
        if self.config.use_deepspeed and self.config.deepspeed_config:
            if not os.path.exists(self.config.deepspeed_config):
                raise FileNotFoundError(
                    f"DeepSpeed config not found: {self.config.deepspeed_config}"
                )
            deepspeed_plugin = DeepSpeedPlugin(
                hf_ds_config=self.config.deepspeed_config,
                gradient_accumulation_steps=self.config.gradient_accumulation_steps,
                gradient_clipping=self.config.max_grad_norm,
            )

        accelerator = Accelerator(
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            mixed_precision=self.config.mixed_precision,
            log_with="tensorboard",
            project_dir=self.config.output_dir,
            # The scheduler is stepped manually once per optimizer step.
            step_scheduler_with_optimizer=False,
            kwargs_handlers=[ddp_kwargs, init_kwargs],
            deepspeed_plugin=deepspeed_plugin,
            split_batches=False,
        )

        # Logging setup: main rank logs to stdout + train.log; other ranks
        # only surface errors.
        if accelerator.is_main_process:
            os.makedirs(self.config.output_dir, exist_ok=True)
            # Try to save config if it has the method
            if hasattr(self.config, "save_to_json"):
                self.config.save_to_json(
                    os.path.join(self.config.output_dir, "initial_config.json")
                )

            logging.basicConfig(
                format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
                datefmt="%m/%d/%Y %H:%M:%S",
                level=logging.INFO,
                handlers=[
                    logging.StreamHandler(sys.stdout),
                    logging.FileHandler(
                        os.path.join(self.config.output_dir, "train.log")
                    ),
                ],
            )
        else:
            logging.basicConfig(level=logging.ERROR)

        logger.info(f"Loaded Config: {self.config}")
        set_seed(self.config.seed)
        accelerator.init_trackers("tensorboard")
        return accelerator

    def create_optimizer_and_scheduler(self):
        """Default AdamW + configurable LR Scheduler.

        Warmup length comes from ``warmup_ratio`` (of total steps) when
        ``warmup_type == "ratio"``, otherwise from ``warmup_steps``.
        Scheduler is constant-with-warmup when ``lr_scheduler_type`` is
        "constant", cosine-with-warmup otherwise.
        """
        optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=self.config.learning_rate,
            weight_decay=self.config.weight_decay,
        )

        if self.config.warmup_type == "ratio":
            final_warmup_steps = math.ceil(self.config.steps * self.config.warmup_ratio)
        else:
            final_warmup_steps = self.config.warmup_steps

        if self.config.lr_scheduler_type == "constant":
            lr_scheduler = get_constant_schedule_with_warmup(
                optimizer=optimizer,
                num_warmup_steps=final_warmup_steps,
            )
        else:
            lr_scheduler = get_cosine_schedule_with_warmup(
                optimizer=optimizer,
                num_warmup_steps=final_warmup_steps,
                num_training_steps=self.config.steps,
            )
        return optimizer, lr_scheduler

    def save_checkpoint(self, step):
        """Wrapper for engine save_checkpoint.

        Delegates to ``omnivoice.training.checkpoint.save_checkpoint`` and
        additionally drops a copy of the training config into the
        checkpoint directory.
        """
        engine_save_checkpoint(
            self.accelerator,
            self.model,
            self.tokenizer,
            self.config.output_dir,
            step,
            self.config.keep_last_n_checkpoints,
        )
        # Save config copy for convenience
        if self.accelerator.is_main_process and hasattr(self.config, "save_to_json"):
            checkpoint_dir = os.path.join(self.config.output_dir, f"checkpoint-{step}")
            self.config.save_to_json(os.path.join(checkpoint_dir, "train_config.json"))

    def load_checkpoint(self, checkpoint_path):
        """Wrapper for loading.

        Restores the Accelerate state and sets ``self.global_step`` from the
        checkpoint directory's ``-N`` suffix (0 when it cannot be parsed).
        """
        step = load_checkpoint(self.accelerator, checkpoint_path)
        self.global_step = step
        logger.info(f"Resumed from step {self.global_step}")
        return step

    def evaluate(self):
        """Evaluation loop.

        Computes the mean eval loss per rank, then averages those per-rank
        means across processes. Note this weights every rank equally
        regardless of how many eval batches it saw. Returns a metrics dict
        (empty when no eval dataloader is configured).
        """
        if self.eval_dataloader is None:
            return {}

        self.model.eval()
        logger.info(f"Running evaluation at step {self.global_step}...")

        local_loss_sum = torch.tensor(0.0, device=self.accelerator.device)
        eval_count = 0

        with torch.no_grad():
            for eval_batch in self.eval_dataloader:
                outputs = self.model(**eval_batch)
                local_loss_sum += outputs.loss.detach()
                eval_count += 1

        # Guard against a rank that received zero eval batches.
        if eval_count > 0:
            local_mean = local_loss_sum / eval_count
        else:
            local_mean = torch.tensor(0.0, device=self.accelerator.device)

        all_means = self.accelerator.gather(local_mean)
        final_eval_loss = all_means.mean().item()

        eval_metrics = {"eval/loss": final_eval_loss}
        self.accelerator.log(eval_metrics, step=self.global_step)
        logger.info(f"Eval Loss: {final_eval_loss:.4f}")

        self.accelerator.wait_for_everyone()
        self.model.train()
        return eval_metrics

    def train(self):
        """Main training loop.

        Runs until ``config.steps`` optimizer steps have completed, cycling
        the dataloader across epochs as needed. Logging, evaluation, and
        checkpointing fire on their configured step intervals; a final
        checkpoint is always written at the end.
        """
        logger.info("Starting Training Loop...")

        # Resume if configured
        if self.config.resume_from_checkpoint:
            self.load_checkpoint(self.config.resume_from_checkpoint)

        # Handle IterableDataset Epochs
        if hasattr(self.train_dataloader.dataset, "set_epoch"):
            self.train_dataloader.dataset.set_epoch(self.epoch)

        # Logger
        train_logger = TrainLogger(
            self.accelerator, self.config.steps, self.config.logging_steps
        )
        train_logger.start(self.global_step)

        self.model.train()
        train_iterator = iter(self.train_dataloader)

        logging_start_time = time.time()
        logging_start_step = self.global_step
        # Cumulative (un-averaged) training loss; interval averages are
        # derived by differencing against logging_loss_scalar below.
        tr_loss = torch.tensor(0.0).to(self.accelerator.device)
        logging_loss_scalar = 0.0

        while self.global_step < self.config.steps:
            try:
                batch = next(train_iterator)
            except StopIteration:
                # Dataloader exhausted: advance the epoch and start over.
                self.epoch += 1
                logger.info(f"Epoch {self.epoch} starting. Resetting dataloader...")
                if hasattr(self.train_dataloader.dataset, "set_epoch"):
                    self.train_dataloader.dataset.set_epoch(self.epoch)

                train_iterator = iter(self.train_dataloader)
                batch = next(train_iterator)

            # accumulate() no-syncs gradients until the accumulation
            # boundary; sync_gradients below is True only on that boundary.
            with self.accelerator.accumulate(self.model):
                outputs = self.model(**batch)
                loss = outputs.loss
                tr_loss += loss.detach()
                self.accelerator.backward(loss)

                if self.accelerator.sync_gradients:
                    # Clipping
                    grad_norm = 0.0
                    if self.config.max_grad_norm > 0:
                        grad_norm = self.accelerator.clip_grad_norm_(
                            self.model.parameters(), self.config.max_grad_norm
                        )
                        # clip_grad_norm_ may return None (e.g. under
                        # DeepSpeed, which clips internally).
                        grad_norm = (
                            grad_norm.item() if grad_norm is not None else 0.0
                        )

                    self.optimizer.step()
                    self.lr_scheduler.step()
                    self.optimizer.zero_grad()
                    self.global_step += 1

                    # Logging
                    current_lr = self.lr_scheduler.get_last_lr()[0]
                    train_logger.update(
                        step=self.global_step, loss=loss.item(), lr=current_lr
                    )

                    if self.global_step % self.config.logging_steps == 0:
                        elapsed = time.time() - logging_start_time
                        steps_per_sec = (
                            (self.global_step - logging_start_step) / elapsed
                            if elapsed > 0
                            else 0
                        )

                        # Average loss over the interval: cumulative delta
                        # divided by micro-batches in the interval.
                        tr_loss_scalar = self.accelerator.gather(tr_loss).mean().item()
                        current_interval_loss = tr_loss_scalar - logging_loss_scalar
                        avg_loss = current_interval_loss / (
                            self.config.logging_steps
                            * self.config.gradient_accumulation_steps
                        )
                        logging_loss_scalar = tr_loss_scalar

                        logs = {
                            "train/loss": avg_loss,
                            "train/learning_rate": current_lr,
                            "train/grad_norm": grad_norm,
                            "train/epoch": self.epoch,
                            "train/steps_per_sec": steps_per_sec,
                        }
                        train_logger.log_metrics(step=self.global_step, metrics=logs)

                        logging_start_time = time.time()
                        logging_start_step = self.global_step

                    # Evaluate
                    if (
                        self.eval_dataloader is not None
                        and self.global_step % self.config.eval_steps == 0
                    ):
                        self.evaluate()

                    # Save
                    if self.global_step % self.config.save_steps == 0:
                        self.save_checkpoint(self.global_step)

        # Final Save
        self.save_checkpoint(self.global_step)
        train_logger.close()
        self.accelerator.end_training()
omnivoice/utils/__init__.py ADDED
File without changes
omnivoice/utils/audio.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Audio I/O and processing utilities.
19
+
20
+ Provides functions for loading, resampling, silence removal, chunking,
21
+ cross-fading, and format conversion. Used by ``OmniVoice.generate()`` during
22
+ inference post-processing.
23
+ """
24
+
25
+ import numpy as np
26
+ import torch
27
+ import torchaudio
28
+ from pydub import AudioSegment
29
+ from pydub.silence import detect_leading_silence, detect_nonsilent, split_on_silence
30
+
31
+
32
def load_audio(audio_path: str, sampling_rate: int):
    """Read an audio file, resample to *sampling_rate*, and downmix to mono.

    Decoding first goes through torchaudio; when that fails (unsupported
    container/codec), it falls back to pydub/ffmpeg.

    Args:
        audio_path: Path of the audio file.
        sampling_rate: Target sampling rate in Hz.

    Returns:
        Waveform tensor of shape (1, T) at the target sampling rate.
    """
    try:
        waveform, source_rate = torchaudio.load(audio_path)
    except (RuntimeError, OSError):
        # torchaudio could not decode; go through pydub/ffmpeg instead.
        segment = AudioSegment.from_file(audio_path)
        # NOTE(review): the 32768 divisor assumes 16-bit samples — confirm
        # for sources with a different sample width.
        samples = np.array(segment.get_array_of_samples()).astype(np.float32) / 32768.0
        if segment.channels == 1:
            waveform = torch.from_numpy(samples).unsqueeze(0)
        else:
            # Interleaved multi-channel data -> (C, T)
            waveform = torch.from_numpy(samples.reshape(-1, segment.channels).T)
        source_rate = segment.frame_rate

    if source_rate != sampling_rate:
        waveform = torchaudio.functional.resample(
            waveform,
            orig_freq=source_rate,
            new_freq=sampling_rate,
        )
    # Downmix to mono by averaging channels.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    return waveform
66
+
67
+
68
def remove_silence(
    audio: torch.Tensor,
    sampling_rate: int,
    mid_sil: int = 300,
    lead_sil: int = 100,
    trail_sil: int = 300,
):
    """Collapse long internal silences and trim silence at both edges.

    Args:
        audio: Tensor of shape (C, T).
        sampling_rate: Sampling rate of *audio*.
        mid_sil: Internal silences longer than this many ms are collapsed;
            values <= 0 disable middle-silence removal.
        lead_sil: Leading silence kept at the start, in ms.
        trail_sil: Trailing silence kept at the end, in ms.

    Returns:
        Tensor of shape (C, T') with silences removed, where C is the number
        of channels and T' the remaining number of samples.
    """
    segment = tensor_to_audiosegment(audio, sampling_rate)

    if mid_sil > 0:
        # Break the audio at silences longer than mid_sil ms, keeping up to
        # mid_sil ms of silence padding around each voiced span.
        voiced_spans = split_on_silence(
            segment,
            min_silence_len=mid_sil,
            silence_thresh=-50,
            keep_silence=mid_sil,
            seek_step=10,
        )

        # Stitch the voiced spans back together.
        segment = AudioSegment.silent(duration=0)
        for span in voiced_spans:
            segment += span

    # Trim edge silence, keeping lead_sil / trail_sil ms of margin.
    segment = remove_silence_edges(segment, lead_sil, trail_sil, -50)

    return audiosegment_to_tensor(segment)
113
+
114
+
115
def remove_silence_edges(
    audio: AudioSegment,
    lead_sil: int = 100,
    trail_sil: int = 300,
    silence_threshold: float = -50,
):
    """Trim silence from both ends of *audio*, keeping short margins.

    Args:
        audio: Input AudioSegment.
        lead_sil: Milliseconds of leading silence kept before the first sound.
        trail_sil: Milliseconds of trailing silence kept after the last sound.
        silence_threshold: Level (dBFS) below which audio counts as silence.

    Returns:
        The trimmed AudioSegment.
    """
    # Leading edge: drop everything before (first sound - lead_sil) ms.
    cut = detect_leading_silence(audio, silence_threshold=silence_threshold)
    audio = audio[max(0, cut - lead_sil):]

    # Trailing edge: reverse, trim the now-leading silence, reverse back.
    audio = audio.reverse()
    cut = detect_leading_silence(audio, silence_threshold=silence_threshold)
    audio = audio[max(0, cut - trail_sil):]
    return audio.reverse()
146
+
147
+
148
def audiosegment_to_tensor(aseg):
    """Convert a pydub ``AudioSegment`` into a float32 tensor of shape (C, T).

    Sample values are scaled into the [-1, 1] range.
    """
    # NOTE(review): the 32768 divisor assumes 16-bit samples
    # (sample_width == 2) — confirm for other widths.
    samples = np.array(aseg.get_array_of_samples()).astype(np.float32) / 32768.0

    if aseg.channels == 1:
        # Mono: (T,) -> (1, T)
        return torch.from_numpy(samples).unsqueeze(0)

    # Multi-channel: interleaved samples -> (C, T)
    return torch.from_numpy(samples.reshape(-1, aseg.channels).T)
166
+
167
+
168
def tensor_to_audiosegment(tensor, sample_rate):
    """Convert a float tensor in [-1, 1] into a 16-bit pydub ``AudioSegment``.

    Args:
        tensor: Tensor of shape (C, T), where C is the number of channels
            and T the number of samples.
        sample_rate: Audio sample rate in Hz.
    """
    assert isinstance(tensor, torch.Tensor)

    # Scale to int16 PCM (pydub's common format), clipping out-of-range values.
    pcm = (tensor.cpu().numpy() * 32768.0).clip(-32768, 32767).astype(np.int16)

    # pydub expects interleaved frames (L R L R ...) for multi-channel audio.
    if pcm.shape[0] > 1:
        pcm = pcm.transpose(1, 0).flatten()

    return AudioSegment(
        data=pcm.tobytes(),
        sample_width=2,  # 16-bit PCM
        frame_rate=sample_rate,
        channels=tensor.shape[0],
    )
202
+
203
+
204
def fade_and_pad_audio(
    audio: torch.Tensor,
    pad_duration: float = 0.1,
    fade_duration: float = 0.1,
    sample_rate: int = 24000,
) -> torch.Tensor:
    """Apply fade-in/out ramps and wrap the signal in silence on both sides.

    Prevents click/pop artifacts caused by abrupt starts and ends. The input
    tensor is never modified; a processed copy is returned.

    Args:
        audio: Tensor of shape (C, T) containing audio data.
        pad_duration: Pure silence added to each end, in seconds.
        fade_duration: Length of the fade-in/out ramps, in seconds.
        sample_rate: Audio sampling rate in Hz.

    Returns:
        Processed tensor of shape (C, T_new).
    """
    if audio.shape[-1] == 0:
        return audio

    n_fade = int(fade_duration * sample_rate)
    n_pad = int(pad_duration * sample_rate)
    out = audio.clone()

    if n_fade > 0:
        # Never let the two ramps overlap: cap at half the signal length.
        ramp_len = min(n_fade, out.shape[-1] // 2)
        if ramp_len > 0:
            rise = torch.linspace(
                0, 1, ramp_len, device=out.device, dtype=out.dtype
            )[None, :]
            fall = torch.linspace(
                1, 0, ramp_len, device=out.device, dtype=out.dtype
            )[None, :]
            out[..., :ramp_len] = out[..., :ramp_len] * rise
            out[..., -ramp_len:] = out[..., -ramp_len:] * fall

    if n_pad > 0:
        pad = torch.zeros(
            (out.shape[0], n_pad),
            dtype=out.dtype,
            device=out.device,
        )
        out = torch.cat([pad, out, pad], dim=-1)

    return out
254
+
255
+
256
def trim_long_audio(
    audio: torch.Tensor,
    sampling_rate: int,
    max_duration: float = 15.0,
    min_duration: float = 3.0,
    trim_threshold: float = 20.0,
) -> torch.Tensor:
    """Shorten over-long audio by cutting at a silence gap near *max_duration*.

    Audio at or below *trim_threshold* seconds is returned untouched.

    Args:
        audio: Tensor of shape (C, T).
        sampling_rate: Sampling rate of *audio*.
        max_duration: Upper bound for the kept portion, in seconds.
        min_duration: Lower bound for the kept portion, in seconds.
        trim_threshold: Only trim audio longer than this (seconds).

    Returns:
        The (possibly trimmed) audio tensor.
    """
    if audio.size(-1) / sampling_rate <= trim_threshold:
        return audio

    segment = tensor_to_audiosegment(audio, sampling_rate)
    speech_regions = detect_nonsilent(
        segment, min_silence_len=100, silence_thresh=-40, seek_step=10
    )
    if not speech_regions:
        return audio

    limit_ms = int(max_duration * 1000)
    floor_ms = int(min_duration * 1000)

    # Pick the latest speech-region start that still fits under the limit,
    # so the cut lands in a silence gap rather than mid-speech.
    split_ms = 0
    for region_start, region_end in speech_regions:
        if split_ms < region_start <= limit_ms:
            split_ms = region_start
        if region_end > limit_ms:
            break

    # No suitable gap (or split is too early): hard-cut at the limit.
    if split_ms < floor_ms:
        split_ms = min(limit_ms, len(segment))

    return audiosegment_to_tensor(segment[:split_ms])
304
+
305
+
306
def cross_fade_chunks(
    chunks: list[torch.Tensor],
    sample_rate: int,
    silence_duration: float = 0.3,
) -> torch.Tensor:
    """Join audio chunks with faded edges around a short silent gap.

    Every boundary becomes: fade-out tail -> silence buffer -> fade-in head,
    which avoids click artifacts from hard concatenation. Input chunks are
    not modified.

    Args:
        chunks: List of audio tensors, each of shape (C, T).
        sample_rate: Audio sample rate in Hz.
        silence_duration: Total gap duration at each boundary, in seconds.

    Returns:
        Merged audio tensor of shape (C, T_total).
    """
    if len(chunks) == 1:
        return chunks[0]

    # Split the gap budget into thirds: fade-out, silence, fade-in.
    gap_total = int(silence_duration * sample_rate)
    ramp_len = gap_total // 3
    gap_len = ramp_len  # length of the silent middle section
    out = chunks[0].clone()

    for incoming in chunks[1:]:
        device, dtype = out.device, out.dtype

        # Taper the tail of everything merged so far (in place on our copy).
        tail = min(ramp_len, out.size(-1))
        if tail > 0:
            fall = torch.linspace(1, 0, tail, device=device, dtype=dtype)[None, :]
            out[..., -tail:] = out[..., -tail:] * fall

        # Taper the head of the incoming chunk on a private copy.
        head_chunk = incoming.clone()
        head = min(ramp_len, head_chunk.size(-1))
        if head > 0:
            rise = torch.linspace(0, 1, head, device=device, dtype=dtype)[None, :]
            head_chunk[..., :head] = head_chunk[..., :head] * rise

        gap = torch.zeros(chunks[0].shape[0], gap_len, device=device, dtype=dtype)
        out = torch.cat([out, gap, head_chunk], dim=-1)

    return out
omnivoice/utils/common.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Shared utility functions."""
19
+
20
+ import argparse
21
+ import random
22
+
23
+ import numpy as np
24
+ import torch
25
+
26
+
27
def str2bool(v):
    """argparse ``type=`` helper that parses a boolean flag value.

    Accepted spellings (case-insensitive):

    - yes, true, t, y, 1  -> True
    - no, false, f, n, 0  -> False

    A real ``bool`` is passed through unchanged.  Any other value raises
    ``argparse.ArgumentTypeError``.

    See https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse # noqa
    """
    if isinstance(v, bool):
        return v
    lowered = v.lower()
    if lowered in ("yes", "true", "t", "y", "1"):
        return True
    if lowered in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")
44
+
45
+
46
def fix_random_seed(random_seed: int):
    """Seed all relevant RNGs for reproducibility.

    Seeds the ``random`` module, NumPy's global generator, and torch's
    default generator with the same value.
    """
    for seeder in (random.seed, np.random.seed, torch.random.manual_seed):
        seeder(random_seed)
    # Ensure deterministic ID creation
    local_rng = random.Random()
    local_rng.seed(random_seed)
omnivoice/utils/data_utils.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Data utilities for batch inference and evaluation.
19
+
20
+ Provides ``read_test_list()`` to parse JSONL test list files used by
21
+ ``omnivoice.cli.infer_batch`` and evaluation scripts.
22
+ """
23
+
24
+ import json
25
+ import logging
26
+ from pathlib import Path
27
+
28
+
29
def read_test_list(path):
    """Parse a JSONL test list file into a list of sample dicts.

    Every non-empty line must be a JSON object; malformed lines are logged
    and skipped.  Each returned dict carries the fields

        id, text, ref_audio, ref_text, language_id, language_name,
        duration, speed

    with ``None`` substituted for anything missing from the source line.
    """
    fields = (
        "id",
        "text",
        "ref_audio",
        "ref_text",
        "language_id",
        "language_name",
        "duration",
        "speed",
    )
    samples = []
    with Path(path).open("r", encoding="utf-8") as fin:
        for line_no, raw in enumerate(fin, 1):
            line = raw.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                logging.warning(f"Skipping malformed JSON at line {line_no}: {line}")
                continue
            samples.append({key: obj.get(key) for key in fields})
    return samples
omnivoice/utils/duration.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Text duration estimation for TTS generation.
19
+
20
+ Provides ``RuleDurationEstimator``, which estimates audio duration from text
21
+ using character phonetic weights across 600+ languages. Used by
22
+ ``OmniVoice.generate()`` to determine output length when no duration is specified.
23
+ """
24
+
25
+ import bisect
26
+ import unicodedata
27
+ from functools import lru_cache
28
+ from typing import Optional
29
+
30
+
31
class RuleDurationEstimator:
    """Estimate speech duration from raw text via per-script character weights.

    Each character is mapped to a phonetic weight relative to a Latin letter
    (``self.weights``), with the character's script resolved by Unicode
    category checks plus a binary search over ``self.ranges``.  A reference
    (text, duration) pair calibrates the speaking rate, which is then applied
    to the target text in :meth:`estimate_duration`.
    """

    def __init__(self) -> None:
        # ==========================================
        # 1. Phonetic Weights Table
        # ==========================================
        # The weight represents the relative speaking time compared to
        # a standard Latin letter.
        # Benchmark: 1.0 = One Latin Character (~40-50ms)
        self.weights = {
            # --- Logographic (1 char = full syllable/word) ---
            "cjk": 3.0,  # Chinese, Japanese Kanji, etc.
            # --- Syllabic / Blocks
            "hangul": 2.5,  # Korean Hangul
            "kana": 2.2,  # Japanese Hiragana/Katakana
            "ethiopic": 3.0,  # Amharic/Ge'ez
            "yi": 3.0,  # Yi script
            # --- Abugida (Consonant-Vowel complexes) ---
            "indic": 1.8,  # Hindi, Bengali, Tamil, etc.
            "thai_lao": 1.5,  # Thai, Lao
            "khmer_myanmar": 1.8,  # Khmer, Myanmar
            # --- Abjad (Consonant-heavy) ---
            "arabic": 1.5,  # Arabic, Persian, Urdu
            "hebrew": 1.5,  # Hebrew
            # --- Alphabet (Segmental) ---
            "latin": 1.0,  # English, Spanish, French, Vietnamese, etc. (Baseline)
            "cyrillic": 1.0,  # Russian, Ukrainian
            "greek": 1.0,  # Greek
            "armenian": 1.0,  # Armenian
            "georgian": 1.0,  # Georgian
            # --- Symbols & Misc ---
            "punctuation": 0.5,  # Pause capability
            "space": 0.2,  # Word boundary/Breath (0.05 / 0.22)
            "digit": 3.5,  # Numbers
            "mark": 0.0,  # Diacritics/Accents (Silent modifiers)
            "default": 1.0,  # Fallback for unknown scripts
        }

        # ==========================================
        # 2. Unicode Range Mapping
        # ==========================================
        # Format: (End_Codepoint, Type_Key)
        # Each entry covers codepoints up to and including End_Codepoint,
        # starting just past the previous entry's end.
        # Used for fast binary search (bisect).
        self.ranges = [
            (0x02AF, "latin"),  # Latin (Basic, Supplement, Ext, IPA)
            (0x03FF, "greek"),  # Greek & Coptic
            (0x052F, "cyrillic"),  # Cyrillic
            (0x058F, "armenian"),  # Armenian
            (0x05FF, "hebrew"),  # Hebrew
            (0x077F, "arabic"),  # Arabic, Syriac, Arabic Supplement
            (0x089F, "arabic"),  # Arabic Extended-B (+ Syriac Supp)
            (0x08FF, "arabic"),  # Arabic Extended-A
            (0x097F, "indic"),  # Devanagari
            (0x09FF, "indic"),  # Bengali
            (0x0A7F, "indic"),  # Gurmukhi
            (0x0AFF, "indic"),  # Gujarati
            (0x0B7F, "indic"),  # Oriya
            (0x0BFF, "indic"),  # Tamil
            (0x0C7F, "indic"),  # Telugu
            (0x0CFF, "indic"),  # Kannada
            (0x0D7F, "indic"),  # Malayalam
            (0x0DFF, "indic"),  # Sinhala
            (0x0EFF, "thai_lao"),  # Thai & Lao
            (0x0FFF, "indic"),  # Tibetan (Abugida)
            (0x109F, "khmer_myanmar"),  # Myanmar
            (0x10FF, "georgian"),  # Georgian
            (0x11FF, "hangul"),  # Hangul Jamo
            (0x137F, "ethiopic"),  # Ethiopic
            (0x139F, "ethiopic"),  # Ethiopic Supplement
            (0x13FF, "default"),  # Cherokee
            (0x167F, "default"),  # Canadian Aboriginal Syllabics
            (0x169F, "default"),  # Ogham
            (0x16FF, "default"),  # Runic
            (0x171F, "default"),  # Tagalog (Baybayin)
            (0x173F, "default"),  # Hanunoo
            (0x175F, "default"),  # Buhid
            (0x177F, "default"),  # Tagbanwa
            (0x17FF, "khmer_myanmar"),  # Khmer
            (0x18AF, "default"),  # Mongolian
            (0x18FF, "default"),  # Canadian Aboriginal Syllabics Ext
            (0x194F, "indic"),  # Limbu
            (0x19DF, "indic"),  # Tai Le & New Tai Lue
            (0x19FF, "khmer_myanmar"),  # Khmer Symbols
            (0x1A1F, "indic"),  # Buginese
            (0x1AAF, "indic"),  # Tai Tham
            (0x1B7F, "indic"),  # Balinese
            (0x1BBF, "indic"),  # Sundanese
            (0x1BFF, "indic"),  # Batak
            (0x1C4F, "indic"),  # Lepcha
            (0x1C7F, "indic"),  # Ol Chiki (Santali)
            (0x1C8F, "cyrillic"),  # Cyrillic Extended-C
            (0x1CBF, "georgian"),  # Georgian Extended
            (0x1CCF, "indic"),  # Sundanese Supplement
            (0x1CFF, "indic"),  # Vedic Extensions
            (0x1D7F, "latin"),  # Phonetic Extensions
            (0x1DBF, "latin"),  # Phonetic Extensions Supplement
            (0x1DFF, "default"),  # Combining Diacritical Marks Supplement
            (0x1EFF, "latin"),  # Latin Extended Additional (Vietnamese)
            (0x309F, "kana"),  # Hiragana
            (0x30FF, "kana"),  # Katakana
            (0x312F, "cjk"),  # Bopomofo (Pinyin)
            (0x318F, "hangul"),  # Hangul Compatibility Jamo
            (0x9FFF, "cjk"),  # CJK Unified Ideographs (Main)
            (0xA4CF, "yi"),  # Yi Syllables
            (0xA4FF, "default"),  # Lisu
            (0xA63F, "default"),  # Vai
            (0xA69F, "cyrillic"),  # Cyrillic Extended-B
            (0xA6FF, "default"),  # Bamum
            (0xA7FF, "latin"),  # Latin Extended-D
            (0xA82F, "indic"),  # Syloti Nagri
            (0xA87F, "default"),  # Phags-pa
            (0xA8DF, "indic"),  # Saurashtra
            (0xA8FF, "indic"),  # Devanagari Extended
            (0xA92F, "indic"),  # Kayah Li
            (0xA95F, "indic"),  # Rejang
            (0xA97F, "hangul"),  # Hangul Jamo Extended-A
            (0xA9DF, "indic"),  # Javanese
            (0xA9FF, "khmer_myanmar"),  # Myanmar Extended-B
            (0xAA5F, "indic"),  # Cham
            (0xAA7F, "khmer_myanmar"),  # Myanmar Extended-A
            (0xAADF, "indic"),  # Tai Viet
            (0xAAFF, "indic"),  # Meetei Mayek Extensions
            (0xAB2F, "ethiopic"),  # Ethiopic Extended-A
            (0xAB6F, "latin"),  # Latin Extended-E
            (0xABBF, "default"),  # Cherokee Supplement
            (0xABFF, "indic"),  # Meetei Mayek
            (0xD7AF, "hangul"),  # Hangul Syllables
            (0xFAFF, "cjk"),  # CJK Compatibility
            (0xFDFF, "arabic"),  # Arabic Presentation Forms-A
            (0xFE6F, "default"),  # Variation Selectors
            (0xFEFF, "arabic"),  # Arabic Presentation Forms-B
            (0xFFEF, "latin"),  # Fullwidth Latin
        ]
        # Sorted upper bounds extracted for bisect in _get_char_weight.
        self.breakpoints = [r[0] for r in self.ranges]

    # NOTE(review): lru_cache on an instance method keys on ``self`` and keeps
    # the instance alive for the cache's lifetime; acceptable if the estimator
    # is a long-lived singleton — confirm usage pattern.
    @lru_cache(maxsize=4096)
    def _get_char_weight(self, char: str) -> float:
        """Determines the weight of a single character."""
        code = ord(char)
        # Fast path: plain ASCII letters and space, no category lookup needed.
        if (65 <= code <= 90) or (97 <= code <= 122):
            return self.weights["latin"]
        if code == 32:
            return self.weights["space"]

        # Ignore arabic Tatweel (a purely visual elongation character)
        if code == 0x0640:
            return self.weights["mark"]

        category = unicodedata.category(char)

        # Combining marks (Mn/Mc/Me) are silent modifiers.
        if category.startswith("M"):
            return self.weights["mark"]

        # Punctuation and symbols (includes emoji, category "So").
        if category.startswith("P") or category.startswith("S"):
            return self.weights["punctuation"]

        # Separators (spaces other than ASCII 0x20).
        if category.startswith("Z"):
            return self.weights["space"]

        # Digits and other numeric characters are spoken as full words.
        if category.startswith("N"):
            return self.weights["digit"]

        # 3. Binary search for the Unicode block.  Punctuation/symbols were
        # already handled above, so the ranges only ever see script letters.
        idx = bisect.bisect_left(self.breakpoints, code)
        if idx < len(self.ranges):
            script_type = self.ranges[idx][1]
            return self.weights.get(script_type, self.weights["default"])

        # 4. Handle upper planes (CJK Ext B/C/D, Historic scripts)
        if code > 0x20000:
            return self.weights["cjk"]

        return self.weights["default"]

    def calculate_total_weight(self, text: str) -> float:
        """Sums up the normalized weights for a string."""
        return sum(self._get_char_weight(c) for c in text)

    def estimate_duration(
        self,
        target_text: str,
        ref_text: str,
        ref_duration: float,
        low_threshold: Optional[float] = 50,
        boost_strength: float = 3,
    ) -> float:
        """Estimate how long ``target_text`` takes to speak, calibrated on a
        reference utterance.

        The reference pair fixes a weight-per-duration speaking rate, which
        is applied to the target's total weight.  Short estimates are boosted
        toward ``low_threshold`` with a power curve.

        Args:
            target_text (str): The text for which we want to estimate the duration.
            ref_text (str): The reference text that was used to measure
                the ref_duration.
            ref_duration (float): The actual duration it took
                to speak the ref_text.
            low_threshold (float): The minimum duration threshold below which the
                estimation will be considered unreliable.
            boost_strength (float): Controls the power-curve boost for short
                durations. Higher values boost small durations more aggressively.
                1 = no boost (linear), 2 = sqrt-like

        Returns:
            float: The estimated duration for the target_text based
            on the ref_text and ref_duration.  Returns 0.0 when the
            reference is unusable (non-positive duration, empty text,
            or zero total weight).
        """
        # NOTE(review): low_threshold=50 shares ref_duration's unit — looks
        # tuned for frame-like units rather than seconds; confirm with callers.
        if ref_duration <= 0 or not ref_text:
            return 0.0

        ref_weight = self.calculate_total_weight(ref_text)
        if ref_weight == 0:
            return 0.0

        # Speaking rate of the reference: weight units per duration unit.
        speed_factor = ref_weight / ref_duration
        target_weight = self.calculate_total_weight(target_text)

        estimated_duration = target_weight / speed_factor
        # Power-curve boost: pulls short estimates up toward low_threshold
        # (x/T)**(1/k) * T is identity at x == T and > x for x < T, k > 1.
        if low_threshold is not None and estimated_duration < low_threshold:
            alpha = 1.0 / boost_strength
            return low_threshold * (estimated_duration / low_threshold) ** alpha
        else:
            return estimated_duration
250
+
251
+
252
# ==========================================
# Example Usage
# ==========================================
if __name__ == "__main__":
    duration_estimator = RuleDurationEstimator()

    # A short English utterance anchors the speaking rate.
    reference_text = "Hello, world."
    reference_duration = 1.5

    samples = [
        ("Hindi (With complex marks)", "नमस्ते दुनिया"),
        ("Arabic (With vowels)", "مَرْحَبًا بِالْعَالَم"),
        ("Vietnamese (Lots of diacritics)", "Chào thế giới"),
        ("Chinese", "你好,世界!"),
        ("Mixed Emoji", "Hello 🌍! This is fun 🎉"),
    ]

    separator = "-" * 30
    print("--- Reference ---")
    print(f"Reference Text: '{reference_text}'")
    print(f"Reference Duration: {reference_duration}s")
    print(separator)

    for label, sample_text in samples:
        estimated = duration_estimator.estimate_duration(
            sample_text, reference_text, reference_duration
        )
        total_weight = duration_estimator.calculate_total_weight(sample_text)

        print(f"[{label}]")
        print(f"Text: {sample_text}")
        print(f"Total Weight: {total_weight:.2f}")
        print(f"Estimated Duration: {estimated:.2f} s")
        print(separator)
omnivoice/utils/lang_map.py ADDED
@@ -0,0 +1,698 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Language name to ISO 639-3 code mapping.
19
+
20
+ Auto-generated from ``docs/lang_id_name_map.tsv``. Provides ``LANG_NAME_TO_ID``
21
+ (for resolving language names to codes) and ``LANG_IDS`` (the set of supported
22
+ ISO 639-3 codes). Used by ``OmniVoice.generate()`` to resolve user-provided
23
+ language names.
24
+ """
25
+
26
+ # Auto-generated from docs/lang_id_name_map.tsv
27
+ # Maps lowercase language name -> language ID code
28
+
29
+ LANG_NAME_TO_ID = {
30
+ "abadi": "kbt",
31
+ "abkhazian": "ab",
32
+ "abron": "abr",
33
+ "abua": "abn",
34
+ "adamawa fulfulde": "fub",
35
+ "adyghe": "ady",
36
+ "afade": "aal",
37
+ "afrikaans": "af",
38
+ "agwagwune": "yay",
39
+ "aja (benin)": "ajg",
40
+ "akebu": "keu",
41
+ "alago": "ala",
42
+ "albanian": "sq",
43
+ "algerian arabic": "arq",
44
+ "algerian saharan arabic": "aao",
45
+ "ambo-pasco quechua": "qva",
46
+ "ambonese malay": "abs",
47
+ "amdo tibetan": "adx",
48
+ "amharic": "am",
49
+ "anaang": "anw",
50
+ "angika": "anp",
51
+ "antankarana malagasy": "xmv",
52
+ "aragonese": "an",
53
+ "arbëreshë albanian": "aae",
54
+ "arequipa-la unión quechua": "qxu",
55
+ "armenian": "hy",
56
+ "ashe": "ahs",
57
+ "ashéninka perené": "prq",
58
+ "askopan": "eiv",
59
+ "assamese": "as",
60
+ "asturian": "ast",
61
+ "atayal": "tay",
62
+ "awak": "awo",
63
+ "ayacucho quechua": "quy",
64
+ "azerbaijani": "az",
65
+ "baatonum": "bba",
66
+ "bacama": "bcy",
67
+ "bade": "bde",
68
+ "bafia": "ksf",
69
+ "bafut": "bfd",
70
+ "bagirmi fulfulde": "fui",
71
+ "bago-kusuntu": "bqg",
72
+ "baharna arabic": "abv",
73
+ "bakoko": "bkh",
74
+ "balanta-ganja": "bjt",
75
+ "balti": "bft",
76
+ "bamenyam": "bce",
77
+ "bamun": "bax",
78
+ "bangwinji": "bsj",
79
+ "banjar": "bjn",
80
+ "bankon": "abb",
81
+ "baoulé": "bci",
82
+ "bara malagasy": "bhr",
83
+ "barok": "bjk",
84
+ "basa (cameroon)": "bas",
85
+ "basa (nigeria)": "bzw",
86
+ "bashkir": "ba",
87
+ "basque": "eu",
88
+ "batak mandailing": "btm",
89
+ "batanga": "bnm",
90
+ "bateri": "btv",
91
+ "bats": "bbl",
92
+ "bayot": "bda",
93
+ "bebele": "beb",
94
+ "belarusian": "be",
95
+ "bengali": "bn",
96
+ "betawi": "bew",
97
+ "bhili": "bhb",
98
+ "bhojpuri": "bho",
99
+ "bilur": "bxf",
100
+ "bima": "bhp",
101
+ "bodo": "brx",
102
+ "boghom": "bux",
103
+ "bokyi": "bky",
104
+ "bomu": "bmq",
105
+ "bondei": "bou",
106
+ "borgu fulfulde": "fue",
107
+ "bosnian": "bs",
108
+ "brahui": "brh",
109
+ "braj": "bra",
110
+ "breton": "br",
111
+ "buduma": "bdm",
112
+ "buginese": "bug",
113
+ "bukharic": "bhh",
114
+ "bulgarian": "bg",
115
+ "bulu (cameroon)": "bum",
116
+ "bundeli": "bns",
117
+ "bunun": "bnn",
118
+ "bura-pabir": "bwr",
119
+ "burak": "bys",
120
+ "burmese": "my",
121
+ "burushaski": "bsk",
122
+ "cacaloxtepec mixtec": "miu",
123
+ "cajatambo north lima quechua": "qvl",
124
+ "cakfem-mushere": "cky",
125
+ "cameroon pidgin": "wes",
126
+ "campidanese sardinian": "sro",
127
+ "cantonese": "yue",
128
+ "catalan": "ca",
129
+ "cebuano": "ceb",
130
+ "cen": "cen",
131
+ "central kurdish": "ckb",
132
+ "central nahuatl": "nhn",
133
+ "central pame": "pbs",
134
+ "central pashto": "pst",
135
+ "central puebla nahuatl": "ncx",
136
+ "central tarahumara": "tar",
137
+ "central yupik": "esu",
138
+ "central-eastern niger fulfulde": "fuq",
139
+ "chadian arabic": "shu",
140
+ "chichewa": "ny",
141
+ "chichicapan zapotec": "zpv",
142
+ "chiga": "cgg",
143
+ "chimalapa zoque": "zoh",
144
+ "chimborazo highland quichua": "qug",
145
+ "chinese": "zh",
146
+ "chiquián ancash quechua": "qxa",
147
+ "chitwania tharu": "the",
148
+ "chokwe": "cjk",
149
+ "chuvash": "cv",
150
+ "cibak": "ckl",
151
+ "coastal konjo": "kjc",
152
+ "copainalá zoque": "zoc",
153
+ "cornish": "kw",
154
+ "corongo ancash quechua": "qwa",
155
+ "croatian": "hr",
156
+ "cross river mbembe": "mfn",
157
+ "cuyamecalco mixtec": "xtu",
158
+ "czech": "cs",
159
+ "dadiya": "dbd",
160
+ "dagbani": "dag",
161
+ "dameli": "dml",
162
+ "danish": "da",
163
+ "dargwa": "dar",
164
+ "dazaga": "dzg",
165
+ "deccan": "dcc",
166
+ "degema": "deg",
167
+ "dera (nigeria)": "kna",
168
+ "dghwede": "dgh",
169
+ "dhatki": "mki",
170
+ "dhivehi": "dv",
171
+ "dhofari arabic": "adf",
172
+ "dijim-bwilim": "cfa",
173
+ "dogri": "dgo",
174
+ "domaaki": "dmk",
175
+ "dotyali": "dty",
176
+ "duala": "dua",
177
+ "dutch": "nl",
178
+ "dũya": "ldb",
179
+ "dyula": "dyu",
180
+ "eastern balochi": "bgp",
181
+ "eastern bolivian guaraní": "gui",
182
+ "eastern egyptian bedawi arabic": "avl",
183
+ "eastern krahn": "kqo",
184
+ "eastern mari": "mhr",
185
+ "eastern yiddish": "ydd",
186
+ "ebrié": "ebr",
187
+ "eggon": "ego",
188
+ "egyptian arabic": "arz",
189
+ "ejagham": "etu",
190
+ "eleme": "elm",
191
+ "eloyi": "afo",
192
+ "embu": "ebu",
193
+ "english": "en",
194
+ "erzya": "myv",
195
+ "esan": "ish",
196
+ "esperanto": "eo",
197
+ "estonian": "et",
198
+ "eton (cameroon)": "eto",
199
+ "ewondo": "ewo",
200
+ "extremaduran": "ext",
201
+ "fang (equatorial guinea)": "fan",
202
+ "fanti": "fat",
203
+ "farefare": "gur",
204
+ "fe'fe'": "fmp",
205
+ "filipino": "fil",
206
+ "filomena mata-coahuitlán totonac": "tlp",
207
+ "finnish": "fi",
208
+ "fipa": "fip",
209
+ "french": "fr",
210
+ "fulah": "ff",
211
+ "galician": "gl",
212
+ "gambian wolof": "wof",
213
+ "ganda": "lg",
214
+ "garhwali": "gbm",
215
+ "gawar-bati": "gwt",
216
+ "gawri": "gwc",
217
+ "gbagyi": "gbr",
218
+ "gbari": "gby",
219
+ "geji": "gyz",
220
+ "gen": "gej",
221
+ "georgian": "ka",
222
+ "german": "de",
223
+ "geser-gorom": "ges",
224
+ "gheg albanian": "aln",
225
+ "ghomálá'": "bbj",
226
+ "gidar": "gid",
227
+ "glavda": "glw",
228
+ "goan konkani": "gom",
229
+ "goaria": "gig",
230
+ "goemai": "ank",
231
+ "gola": "gol",
232
+ "greek": "el",
233
+ "guarani": "gn",
234
+ "guduf-gava": "gdf",
235
+ "guerrero amuzgo": "amu",
236
+ "gujarati": "gu",
237
+ "gujari": "gju",
238
+ "gulf arabic": "afb",
239
+ "gurgula": "ggg",
240
+ "gusii": "guz",
241
+ "gusilay": "gsl",
242
+ "gweno": "gwe",
243
+ "güilá zapotec": "ztu",
244
+ "hadothi": "hoj",
245
+ "hahon": "hah",
246
+ "haitian": "ht",
247
+ "hakha chin": "cnh",
248
+ "hakö": "hao",
249
+ "halia": "hla",
250
+ "hausa": "ha",
251
+ "hawaiian": "haw",
252
+ "hazaragi": "haz",
253
+ "hebrew": "he",
254
+ "hemba": "hem",
255
+ "herero": "hz",
256
+ "highland konjo": "kjk",
257
+ "hijazi arabic": "acw",
258
+ "hindi": "hi",
259
+ "huarijio": "var",
260
+ "huautla mazatec": "mau",
261
+ "huaxcaleca nahuatl": "nhq",
262
+ "huba": "hbb",
263
+ "huitepec mixtec": "mxs",
264
+ "hula": "hul",
265
+ "hungarian": "hu",
266
+ "hunjara-kaina ke": "hkk",
267
+ "hwana": "hwo",
268
+ "ibibio": "ibb",
269
+ "icelandic": "is",
270
+ "idakho-isukha-tiriki": "ida",
271
+ "idoma": "idu",
272
+ "igbo": "ig",
273
+ "igo": "ahl",
274
+ "ikposo": "kpo",
275
+ "ikwere": "ikw",
276
+ "imbabura highland quichua": "qvi",
277
+ "indonesian": "id",
278
+ "indus kohistani": "mvy",
279
+ "interlingua (international auxiliary language association)": "ia",
280
+ "inupiaq": "ik",
281
+ "irish": "ga",
282
+ "iron ossetic": "os",
283
+ "isekiri": "its",
284
+ "isoko": "iso",
285
+ "italian": "it",
286
+ "ito": "itw",
287
+ "itzá": "itz",
288
+ "ixtayutla mixtec": "vmj",
289
+ "izon": "ijc",
290
+ "jambi malay": "jax",
291
+ "japanese": "ja",
292
+ "jaqaru": "jqr",
293
+ "jauja wanca quechua": "qxw",
294
+ "jaunsari": "jns",
295
+ "javanese": "jv",
296
+ "jiba": "juo",
297
+ "jju": "kaj",
298
+ "judeo-moroccan arabic": "aju",
299
+ "juxtlahuaca mixtec": "vmc",
300
+ "kabardian": "kbd",
301
+ "kabras": "lkb",
302
+ "kabuverdianu": "kea",
303
+ "kabyle": "kab",
304
+ "kachi koli": "gjk",
305
+ "kairak": "ckr",
306
+ "kalabari": "ijn",
307
+ "kalasha": "kls",
308
+ "kalenjin": "kln",
309
+ "kalkoti": "xka",
310
+ "kamba": "kam",
311
+ "kamo": "kcq",
312
+ "kanauji": "bjj",
313
+ "kanembu": "kbl",
314
+ "kannada": "kn",
315
+ "karekare": "kai",
316
+ "kashmiri": "ks",
317
+ "kathoriya tharu": "tkt",
318
+ "kati": "bsh",
319
+ "kazakh": "kk",
320
+ "keiyo": "eyo",
321
+ "khams tibetan": "khg",
322
+ "khana": "ogo",
323
+ "khetrani": "xhe",
324
+ "khmer": "km",
325
+ "khowar": "khw",
326
+ "kinga": "zga",
327
+ "kinnauri": "kfk",
328
+ "kinyarwanda": "rw",
329
+ "kirghiz": "ky",
330
+ "kirya-konzəl": "fkk",
331
+ "kochila tharu": "thq",
332
+ "kohistani shina": "plk",
333
+ "kohumono": "bcs",
334
+ "kok borok": "trp",
335
+ "kol (papua new guinea)": "kol",
336
+ "kom (cameroon)": "bkm",
337
+ "koma": "kmy",
338
+ "konkani": "knn",
339
+ "konzo": "koo",
340
+ "korean": "ko",
341
+ "korwa": "kfp",
342
+ "kota (india)": "kfe",
343
+ "koti": "eko",
344
+ "kuanua": "ksd",
345
+ "kuanyama": "kj",
346
+ "kui (india)": "uki",
347
+ "kulung (nigeria)": "bbu",
348
+ "kuot": "kto",
349
+ "kushi": "kuh",
350
+ "kwambi": "kwm",
351
+ "kwasio": "nmg",
352
+ "lala-roba": "lla",
353
+ "lamang": "hia",
354
+ "lao": "lo",
355
+ "larike-wakasihu": "alo",
356
+ "lasi": "lss",
357
+ "latgalian": "ltg",
358
+ "latvian": "lv",
359
+ "levantine arabic": "apc",
360
+ "liana-seti": "ste",
361
+ "liberia kpelle": "xpe",
362
+ "liberian english": "lir",
363
+ "libyan arabic": "ayl",
364
+ "ligurian": "lij",
365
+ "lijili": "mgi",
366
+ "lingala": "ln",
367
+ "lithuanian": "lt",
368
+ "loarki": "lrk",
369
+ "logooli": "rag",
370
+ "logudorese sardinian": "src",
371
+ "loja highland quichua": "qvj",
372
+ "loloda": "loa",
373
+ "longuda": "lnu",
374
+ "loxicha zapotec": "ztp",
375
+ "luba-lulua": "lua",
376
+ "luo": "luo",
377
+ "lushai": "lus",
378
+ "luxembourgish": "lb",
379
+ "maasina fulfulde": "ffm",
380
+ "maba (chad)": "mde",
381
+ "macedo-romanian": "rup",
382
+ "macedonian": "mk",
383
+ "mada (cameroon)": "mxu",
384
+ "mafa": "maf",
385
+ "maithili": "mai",
386
+ "malay": "ms",
387
+ "malayalam": "ml",
388
+ "mali": "gcc",
389
+ "malinaltepec me'phaa": "tcf",
390
+ "maltese": "mt",
391
+ "mandara": "tbf",
392
+ "mandjak": "mfv",
393
+ "manggarai": "mqy",
394
+ "manipuri": "mni",
395
+ "mansoanka": "msw",
396
+ "manx": "gv",
397
+ "maori": "mi",
398
+ "marathi": "mr",
399
+ "marghi central": "mrt",
400
+ "marghi south": "mfm",
401
+ "maria (india)": "mrr",
402
+ "marwari (pakistan)": "mve",
403
+ "masana": "mcn",
404
+ "masikoro malagasy": "msh",
405
+ "matsés": "mcf",
406
+ "mazaltepec zapotec": "zpy",
407
+ "mazatlán mazatec": "vmz",
408
+ "mazatlán mixe": "mzl",
409
+ "mbe": "mfo",
410
+ "mbo (cameroon)": "mbo",
411
+ "mbum": "mdd",
412
+ "medumba": "byv",
413
+ "mekeo": "mek",
414
+ "meru": "mer",
415
+ "mesopotamian arabic": "acm",
416
+ "mewari": "mtr",
417
+ "min nan chinese": "nan",
418
+ "mingrelian": "xmf",
419
+ "mitlatongo mixtec": "vmm",
420
+ "miya": "mkf",
421
+ "mokpwe": "bri",
422
+ "moksha": "mdf",
423
+ "mom jango": "ver",
424
+ "mongolian": "mn",
425
+ "moroccan arabic": "ary",
426
+ "motu": "meu",
427
+ "mpiemo": "mcx",
428
+ "mpumpong": "mgg",
429
+ "mundang": "mua",
430
+ "mungaka": "mhk",
431
+ "musey": "mse",
432
+ "musgu": "mug",
433
+ "musi": "mui",
434
+ "naba": "mne",
435
+ "najdi arabic": "ars",
436
+ "nalik": "nal",
437
+ "nawdm": "nmz",
438
+ "ndonga": "ng",
439
+ "neapolitan": "nap",
440
+ "nepali": "npi",
441
+ "ngamo": "nbh",
442
+ "ngas": "anc",
443
+ "ngiemboon": "nnh",
444
+ "ngizim": "ngi",
445
+ "ngomba": "jgo",
446
+ "ngombale": "nla",
447
+ "nigerian fulfulde": "fuv",
448
+ "nigerian pidgin": "pcm",
449
+ "nimadi": "noe",
450
+ "nobiin": "fia",
451
+ "north mesopotamian arabic": "ayp",
452
+ "north moluccan malay": "max",
453
+ "northern betsimisaraka malagasy": "bmm",
454
+ "northern hindko": "hno",
455
+ "northern kurdish": "kmr",
456
+ "northern pame": "pmq",
457
+ "northern pashto": "pbu",
458
+ "northern uzbek": "uzn",
459
+ "northwest gbaya": "gya",
460
+ "norwegian": "no",
461
+ "norwegian bokmål": "nb",
462
+ "norwegian nynorsk": "nn",
463
+ "notsi": "ncf",
464
+ "nyankpa": "yes",
465
+ "nyungwe": "nyu",
466
+ "nzanyi": "nja",
467
+ "nüpode huitoto": "hux",
468
+ "occitan": "oc",
469
+ "od": "odk",
470
+ "odia": "ory",
471
+ "odual": "odu",
472
+ "omani arabic": "acx",
473
+ "orizaba nahuatl": "nlv",
474
+ "orma": "orc",
475
+ "ormuri": "oru",
476
+ "oromo": "om",
477
+ "pahari-potwari": "phr",
478
+ "paiwan": "pwn",
479
+ "panjabi": "pa",
480
+ "papuan malay": "pmy",
481
+ "parkari koli": "kvx",
482
+ "pedi": "nso",
483
+ "pero": "pip",
484
+ "persian": "fa",
485
+ "petats": "pex",
486
+ "phalura": "phl",
487
+ "piemontese": "pms",
488
+ "piya-kwonci": "piy",
489
+ "plateau malagasy": "plt",
490
+ "polish": "pl",
491
+ "poqomam": "poc",
492
+ "portuguese": "pt",
493
+ "pulaar": "fuc",
494
+ "pular": "fuf",
495
+ "puno quechua": "qxp",
496
+ "pushto": "ps",
497
+ "pökoot": "pko",
498
+ "qaqet": "byx",
499
+ "quiotepec chinantec": "chq",
500
+ "rana tharu": "thr",
501
+ "rangi": "lag",
502
+ "rapoisi": "kyx",
503
+ "ratahan": "rth",
504
+ "rayón zoque": "zor",
505
+ "romanian": "ro",
506
+ "romansh": "rm",
507
+ "rombo": "rof",
508
+ "rotokas": "roo",
509
+ "rukai": "dru",
510
+ "russian": "ru",
511
+ "sacapulteco": "quv",
512
+ "saidi arabic": "aec",
513
+ "sakalava malagasy": "skg",
514
+ "sakizaya": "szy",
515
+ "saleman": "sau",
516
+ "samba daka": "ccg",
517
+ "samba leko": "ndi",
518
+ "san felipe otlaltepec popoloca": "pow",
519
+ "san francisco del mar huave": "hue",
520
+ "san juan atzingo popoloca": "poe",
521
+ "san martín itunyoso triqui": "trq",
522
+ "san miguel el grande mixtec": "mig",
523
+ "sansi": "ssi",
524
+ "sanskrit": "sa",
525
+ "santa ana de tusi pasco quechua": "qxt",
526
+ "santa catarina albarradas zapotec": "ztn",
527
+ "santali": "sat",
528
+ "santiago del estero quichua": "qus",
529
+ "saposa": "sps",
530
+ "saraiki": "skr",
531
+ "sardinian": "sc",
532
+ "saya": "say",
533
+ "sediq": "trv",
534
+ "serbian": "sr",
535
+ "seri": "sei",
536
+ "shina": "scl",
537
+ "shona": "sn",
538
+ "siar-lak": "sjr",
539
+ "sibe": "nco",
540
+ "sicilian": "scn",
541
+ "sihuas ancash quechua": "qws",
542
+ "sikkimese": "sip",
543
+ "sinaugoro": "snc",
544
+ "sindhi": "sd",
545
+ "sindhi bhil": "sbn",
546
+ "sinhala": "si",
547
+ "sinicahua mixtec": "xti",
548
+ "sipacapense": "qum",
549
+ "siwai": "siw",
550
+ "slovak": "sk",
551
+ "slovenian": "sl",
552
+ "solos": "sol",
553
+ "somali": "so",
554
+ "soninke": "snk",
555
+ "south giziga": "giz",
556
+ "south ucayali ashéninka": "cpy",
557
+ "southeastern nochixtlán mixtec": "mxy",
558
+ "southern betsimisaraka malagasy": "bzc",
559
+ "southern pashto": "pbt",
560
+ "southern pastaza quechua": "qup",
561
+ "soyaltepec mazatec": "vmp",
562
+ "spanish": "es",
563
+ "standard arabic": "arb",
564
+ "standard moroccan tamazight": "zgh",
565
+ "sudanese arabic": "apd",
566
+ "sulka": "sua",
567
+ "svan": "sva",
568
+ "swahili": "sw",
569
+ "swedish": "sv",
570
+ "tae'": "rob",
571
+ "tahaggart tamahaq": "thv",
572
+ "taita": "dav",
573
+ "tajik": "tg",
574
+ "tamil": "ta",
575
+ "tandroy-mahafaly malagasy": "tdx",
576
+ "tangale": "tan",
577
+ "tanosy malagasy": "txy",
578
+ "tarok": "yer",
579
+ "tatar": "tt",
580
+ "tedaga": "tuq",
581
+ "telugu": "te",
582
+ "tem": "kdh",
583
+ "teop": "tio",
584
+ "tepeuxila cuicatec": "cux",
585
+ "tepinapa chinantec": "cte",
586
+ "tera": "ttr",
587
+ "terei": "buo",
588
+ "termanu": "twu",
589
+ "tesaka malagasy": "tkg",
590
+ "tetelcingo nahuatl": "nhg",
591
+ "teutila cuicatec": "cut",
592
+ "thai": "th",
593
+ "tibetan": "bo",
594
+ "tidaá mixtec": "mtx",
595
+ "tidore": "tvo",
596
+ "tigak": "tgc",
597
+ "tigre": "tig",
598
+ "tigrinya": "ti",
599
+ "tilquiapan zapotec": "zts",
600
+ "tinputz": "tpz",
601
+ "tlacoapa me'phaa": "tpl",
602
+ "tlacoatzintepec chinantec": "ctl",
603
+ "tlingit": "tli",
604
+ "toki pona": "tok",
605
+ "tomoip": "tqp",
606
+ "tondano": "tdn",
607
+ "tonsea": "txs",
608
+ "tooro": "ttj",
609
+ "torau": "ttu",
610
+ "torwali": "trw",
611
+ "tsimihety malagasy": "xmw",
612
+ "tsotso": "lto",
613
+ "tswana": "tn",
614
+ "tugen": "tuy",
615
+ "tuki": "bag",
616
+ "tula": "tul",
617
+ "tulu": "tcy",
618
+ "tunen": "tvu",
619
+ "tungag": "lcm",
620
+ "tunisian arabic": "aeb",
621
+ "tupuri": "tui",
622
+ "turkana": "tuv",
623
+ "turkish": "tr",
624
+ "turkmen": "tk",
625
+ "tututepec mixtec": "mtu",
626
+ "twi": "tw",
627
+ "ubaghara": "byc",
628
+ "uighur": "ug",
629
+ "ukrainian": "uk",
630
+ "umbundu": "umb",
631
+ "upper sorbian": "hsb",
632
+ "urdu": "ur",
633
+ "ushojo": "ush",
634
+ "uzbek": "uz",
635
+ "vai": "vai",
636
+ "vietnamese": "vi",
637
+ "votic": "vot",
638
+ "võro": "vro",
639
+ "waci gbe": "wci",
640
+ "wadiyara koli": "kxp",
641
+ "waja": "wja",
642
+ "wakhi": "wbl",
643
+ "wanga": "lwg",
644
+ "wapan": "juk",
645
+ "warji": "wji",
646
+ "welsh": "cy",
647
+ "wemale": "weo",
648
+ "western frisian": "fy",
649
+ "western highland purepecha": "pua",
650
+ "western juxtlahuaca mixtec": "jmx",
651
+ "western maninkakan": "mlq",
652
+ "western mari": "mrj",
653
+ "western niger fulfulde": "fuh",
654
+ "western panjabi": "pnb",
655
+ "wolof": "wo",
656
+ "wuzlam": "udl",
657
+ "xanaguía zapotec": "ztg",
658
+ "xhosa": "xh",
659
+ "yace": "ekr",
660
+ "yakut": "sah",
661
+ "yalahatan": "jal",
662
+ "yanahuanca pasco quechua": "qur",
663
+ "yangben": "yav",
664
+ "yaqui": "yaq",
665
+ "yauyos quechua": "qux",
666
+ "yekhee": "ets",
667
+ "yiddish": "yi",
668
+ "yidgha": "ydg",
669
+ "yoruba": "yo",
670
+ "yutanduchi mixtec": "mab",
671
+ "zacatlán-ahuacatlán-tepetzintla nahuatl": "nhi",
672
+ "zarma": "dje",
673
+ "zaza": "zza",
674
+ "zulu": "zu",
675
+ "ömie": "aom",
676
+ }
677
+
678
# Membership-test views over the name<->ID table: iterating a dict yields its
# keys, so set(LANG_NAME_TO_ID) == set(LANG_NAME_TO_ID.keys()).
LANG_NAMES = set(LANG_NAME_TO_ID)
LANG_IDS = set(LANG_NAME_TO_ID.values())
681
# Canonical display casings that plain str.title() would get wrong:
# apostrophes (title() capitalizes after them), non-ASCII letters, and small
# connective words ("de", "del") that must stay lowercase.
_TITLE_EXCEPTIONS = {
    "fe'fe'": "Fe'fe'",
    "dũya": "Dũya",
    "santiago del estero quichua": "Santiago del Estero Quichua",
    "santa ana de tusi pasco quechua": "Santa Ana de Tusi Pasco Quechua",
    "malinaltepec me'phaa": "Malinaltepec Me'phaa",
    "tlacoapa me'phaa": "Tlacoapa Me'phaa",
}


def lang_display_name(name: str) -> str:
    """Return a display-friendly version of a lowercase language name.

    Looks the name up in ``_TITLE_EXCEPTIONS`` first (apostrophes and small
    words like "de"/"del" that ``str.title()`` mishandles); otherwise falls
    back to ``str.title()``.
    """
    try:
        return _TITLE_EXCEPTIONS[name]
    except KeyError:
        return name.title()
omnivoice/utils/text.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Text processing utilities for TTS inference.
19
+
20
+ Provides:
21
+ - ``chunk_text_punctuation()``: Splits long text into model-friendly chunks at
22
+ sentence boundaries, with abbreviation-aware punctuation splitting.
23
+ - ``add_punctuation()``: Appends missing end punctuation (Chinese or English).
24
+ """
25
+
26
+ from typing import List, Optional
27
+
28
+
29
# Characters at which chunk_text_punctuation() may end a sentence
# (ASCII plus fullwidth CJK variants).
SPLIT_PUNCTUATION = set(".,;:!?。,;:!?")
# Quote/bracket closers that must stay attached to the sentence they close
# rather than begin a new chunk.
CLOSING_MARKS = set("\"'""')]》》>」】")
31
+
32
# Characters accepted by add_punctuation() as an existing sentence-final mark.
# Covers ASCII punctuation, typographic quotes, closing brackets, and
# fullwidth CJK punctuation.
END_PUNCTUATION = {
    ";",
    ":",
    ",",
    ".",
    "!",
    "?",
    "…",
    ")",
    "]",
    "}",
    '"',
    "'",
    """,
    "'",
    ";",
    ":",
    ",",
    "。",
    "!",
    "?",
    "、",
    "……",
    ")",
    "】",
    """,
    "'",
}
60
+
61
+
62
# Words ending in "." that do NOT terminate a sentence; the chunker checks
# the word preceding a period against this set before splitting.
ABBREVIATIONS = set(
    "Mr. Mrs. Ms. Dr. Prof. Sr. Jr. Rev. Fr. Hon. Pres. Gov. Capt. Gen. "
    "Sen. Rep. Col. Maj. Lt. Cmdr. Sgt. Cpl. Co. Corp. Inc. Ltd. Est. "
    "Dept. St. Ave. Blvd. Rd. Mt. Ft. No. Jan. Feb. Mar. Apr. Aug. Sep. "
    "Sept. Oct. Nov. Dec. i.e. e.g. vs. Vs. Etc. approx. fig. def.".split()
)
117
+
118
+
119
def chunk_text_punctuation(
    text: str,
    chunk_len: int,
    min_chunk_len: Optional[int] = None,
) -> List[str]:
    """Split ``text`` into chunks of roughly ``chunk_len`` characters.

    The text is first cut into sentences at punctuation marks — skipping
    periods that end a known abbreviation (e.g. "Mr.", "No.") — then
    consecutive sentences are packed greedily into chunks of at most
    ``chunk_len`` characters. When ``min_chunk_len`` is given, undersized
    chunks are folded into a neighbouring chunk afterwards.
    """
    # Phase 1: cut the character stream into sentences at punctuation.
    sentences: List[List[str]] = []
    pending: List[str] = []

    for ch in text:
        # Punctuation cannot start a sentence: glue it onto the previous one.
        glue = (
            not pending
            and sentences
            and (ch in SPLIT_PUNCTUATION or ch in CLOSING_MARKS)
        )
        if glue:
            sentences[-1].append(ch)
        else:
            pending.append(ch)

        # End the pending sentence at split punctuation. NOTE: this runs even
        # when the character was glued above (pending is then empty), which
        # intentionally appends an empty sentinel sentence so that runs of
        # punctuation partition the same way as in the original behaviour.
        if ch in SPLIT_PUNCTUATION:
            ends_abbreviation = False
            if ch == ".":
                joined = "".join(pending).strip()
                # A period that finishes an abbreviation keeps the sentence open.
                if joined and joined.split()[-1] in ABBREVIATIONS:
                    ends_abbreviation = True
            if not ends_abbreviation:
                sentences.append(pending)
                pending = []

    # Trailing characters without end punctuation still form a sentence.
    if pending:
        sentences.append(pending)

    # Phase 2: greedily pack sentences into chunks of <= chunk_len characters.
    # A single sentence longer than chunk_len becomes a chunk on its own.
    packed: List[List[str]] = []
    buf: List[str] = []
    for sentence in sentences:
        if len(buf) + len(sentence) <= chunk_len:
            buf.extend(sentence)
        else:
            if buf:
                packed.append(buf)
            buf = sentence
    if buf:
        packed.append(buf)

    # Phase 3: fold undersized chunks into a neighbour. A short FIRST chunk
    # absorbs the second chunk; any other short chunk is appended to the
    # chunk before it.
    if min_chunk_len is None:
        sized = packed
    else:
        first_is_short = bool(packed) and len(packed[0]) < min_chunk_len
        sized = []
        for idx, chunk in enumerate(packed):
            if idx == 1 and first_is_short:
                sized[-1].extend(chunk)
            elif len(chunk) >= min_chunk_len or not sized:
                sized.append(chunk)
            else:
                sized[-1].extend(chunk)

    # Join, strip, and drop whitespace-only chunks.
    result = []
    for chunk in sized:
        joined = "".join(chunk).strip()
        if joined:
            result.append(joined)
    return result
205
+
206
+
207
def add_punctuation(text: str):
    """Append a sentence-final mark to ``text`` if it does not already end with one.

    Returns the stripped text; a fullwidth "。" is used when the text contains
    CJK ideographs, an ASCII "." otherwise.
    """
    stripped = text.strip()

    if not stripped:
        return stripped

    if stripped[-1] in END_PUNCTUATION:
        return stripped

    # Choose the mark by script: CJK ideograph range U+4E00..U+9FFF.
    has_chinese = any("\u4e00" <= ch <= "\u9fff" for ch in stripped)
    return stripped + ("。" if has_chinese else ".")
omnivoice/utils/voice_design.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
3
+ #
4
+ # See ../../LICENSE for clarification regarding multiple authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Voice-design instruct constants for TTS inference.
19
+
20
+ Defines speaker attribute tags (gender, age, pitch, accent, dialect) and
21
+ translation/validation utilities between English and Chinese. Used by
22
+ ``OmniVoice.generate()`` for voice design mode.
23
+ """
24
+
25
+ import re
26
+
27
# Matches any CJK unified ideograph; used to classify a tag as Chinese or English.
_ZH_RE = re.compile(r'[\u4e00-\u9fff]')

# Category = set of {english: chinese, ...} items that are mutually exclusive.
# Accent (EN-only) and dialect (ZH-only) are stored as flat sets below.
# NOTE: ordering matters — the accent and dialect sets must remain the last
# two entries, because they are retrieved via [-2] / [-1] indexing below.
_INSTRUCT_CATEGORIES = [
    {"male": "男", "female": "女"},
    {"child": "儿童", "teenager": "少年", "young adult": "青年",
     "middle-aged": "中年", "elderly": "老年"},
    {"very low pitch": "极低音调", "low pitch": "低音调",
     "moderate pitch": "中音调", "high pitch": "高音调",
     "very high pitch": "极高音调"},
    {"whisper": "耳语"},
    # Accent (English-only, no Chinese counterpart)
    {"american accent", "british accent", "australian accent",
     "chinese accent", "canadian accent", "indian accent",
     "korean accent", "portuguese accent", "russian accent", "japanese accent"},
    # Dialect (Chinese-only, no English counterpart)
    {"河南话", "陕西话", "四川话", "贵州话", "云南话", "桂林话",
     "济南话", "石家庄话", "甘肃话", "宁夏话", "青岛话", "东北话"},
]

# English->Chinese tag translations and the reverse, built from the dict
# categories above.
_INSTRUCT_EN_TO_ZH = {}
_INSTRUCT_ZH_TO_EN = {}
# One set per category holding every surface form (EN keys and ZH values);
# tags within the same set are mutually exclusive.
_INSTRUCT_MUTUALLY_EXCLUSIVE = []
for _cat in _INSTRUCT_CATEGORIES:
    if isinstance(_cat, dict):
        _INSTRUCT_EN_TO_ZH.update(_cat)
        _INSTRUCT_ZH_TO_EN.update({v: k for k, v in _cat.items()})
        # Iterating a dict yields its keys, so this is EN keys | ZH values.
        _INSTRUCT_MUTUALLY_EXCLUSIVE.append(set(_cat) | set(_cat.values()))
    else:
        # Flat accent/dialect sets are used as-is.
        _INSTRUCT_MUTUALLY_EXCLUSIVE.append(set(_cat))

# Every accepted tag in either language (translatable pairs plus the
# untranslatable accent and dialect tags).
_INSTRUCT_ALL_VALID = (
    set(_INSTRUCT_EN_TO_ZH) | set(_INSTRUCT_ZH_TO_EN)
    | _INSTRUCT_MUTUALLY_EXCLUSIVE[-2]  # accents
    | _INSTRUCT_MUTUALLY_EXCLUSIVE[-1]  # dialects
)

# Valid tags partitioned by script: no CJK characters -> English, else Chinese.
_INSTRUCT_VALID_EN = frozenset(i for i in _INSTRUCT_ALL_VALID if not _ZH_RE.search(i))
_INSTRUCT_VALID_ZH = frozenset(i for i in _INSTRUCT_ALL_VALID if _ZH_RE.search(i))
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://download.pytorch.org/whl/cu128
2
+ torch==2.8.0
3
+ torchaudio==2.8.0
4
+ transformers==5.3
5
+ accelerate
6
+ pydub
7
+ soundfile
8
+ numpy
9
+ gradio
10
+ hf_transfer