Hak5 committed 30 days ago

Commit

7496177

verified ·

1 Parent(s): 9c4c740

Add bundled AVoice runtime for HF-only inference

Browse files

Files changed (21) hide show

runtime/LICENSE +339 -0
runtime/README.md +3 -0
runtime/THIRD_PARTY_NOTICES.md +14 -0
runtime/omnivoice/__init__.py +40 -0
runtime/omnivoice/cli/__init__.py +0 -0
runtime/omnivoice/cli/infer.py +157 -0
runtime/omnivoice/models/__init__.py +0 -0
runtime/omnivoice/models/omnivoice.py +1610 -0
runtime/omnivoice/server/__init__.py +2 -0
runtime/omnivoice/server/app.py +506 -0
runtime/omnivoice/server/prefetch.py +71 -0
runtime/omnivoice/utils/__init__.py +0 -0
runtime/omnivoice/utils/armenian_text.py +450 -0
runtime/omnivoice/utils/audio.py +343 -0
runtime/omnivoice/utils/common.py +56 -0
runtime/omnivoice/utils/duration.py +282 -0
runtime/omnivoice/utils/lang_map.py +698 -0
runtime/omnivoice/utils/text.py +219 -0
runtime/omnivoice/utils/voice_design.py +68 -0
runtime/pyproject.toml +21 -0
runtime/requirements.txt +11 -0

runtime/LICENSE ADDED Viewed

	@@ -0,0 +1,339 @@

+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+                            Preamble
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+  The precise terms and conditions for copying, distribution and
+modification follow.
+                    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+                            NO WARRANTY
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+                     END OF TERMS AND CONDITIONS
+            How to Apply These Terms to Your New Programs
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+Also add information on how to contact you by electronic and paper mail.
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.

runtime/README.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # AVoice Runtime
2	+
3	+ Bundled runtime for HF-only AVoice-TTS inference.

runtime/THIRD_PARTY_NOTICES.md ADDED Viewed

	@@ -0,0 +1,14 @@

+# Third-Party Notices
+## OmniVoice
+This repository vendors the PyTorch implementation from:
+- Project: `k2-fsa/OmniVoice`
+- Source: https://github.com/k2-fsa/OmniVoice
+- Vendored commit: `7a68a5cffa71b904a862f4870b246966deebadf7`
+- License: Apache License 2.0
+The vendored code lives in `omnivoice/`. Local Armenian-specific changes are
+kept in this repository so training, inference, tokenization, and model changes
+can be edited without depending on an installed `omnivoice` wheel.

runtime/omnivoice/__init__.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import warnings
+from importlib.metadata import PackageNotFoundError, version
+warnings.filterwarnings("ignore", module="torchaudio")
+warnings.filterwarnings(
+    "ignore",
+    category=SyntaxWarning,
+    message="invalid escape sequence",
+    module="pydub.utils",
+)
+warnings.filterwarnings(
+    "ignore",
+    category=FutureWarning,
+    module="torch.distributed.algorithms.ddp_comm_hooks",
+)
+try:
+    __version__ = version("avoice")
+except PackageNotFoundError:
+    __version__ = "0.0.0"
+__all__ = ["OmniVoice", "OmniVoiceConfig", "OmniVoiceGenerationConfig"]
+def __getattr__(name):
+    if name not in __all__:
+        raise AttributeError(f"module 'omnivoice' has no attribute {name!r}")
+    from omnivoice.models.omnivoice import (
+        OmniVoice,
+        OmniVoiceConfig,
+        OmniVoiceGenerationConfig,
+    )
+    values = {
+        "OmniVoice": OmniVoice,
+        "OmniVoiceConfig": OmniVoiceConfig,
+        "OmniVoiceGenerationConfig": OmniVoiceGenerationConfig,
+    }
+    return values[name]

runtime/omnivoice/cli/__init__.py ADDED Viewed

File without changes

runtime/omnivoice/cli/infer.py ADDED Viewed

	@@ -0,0 +1,157 @@

+"""Single-item inference CLI for OmniVoice.
+Generates audio from a single text input using voice cloning,
+voice design, or auto voice.
+Usage:
+    # Voice cloning
+    omnivoice-infer --model Hak5/AVoice \
+        --text "Hello, this is a text for text-to-speech." \
+        --ref_audio ref.wav --ref_text "Reference transcript." --output out.wav
+    # Voice design
+    omnivoice-infer --model Hak5/AVoice \
+        --text "Hello, this is a text for text-to-speech." \
+        --instruct "male, British accent" --output out.wav
+    # Auto voice
+    omnivoice-infer --model Hak5/AVoice \
+        --text "Hello, this is a text for text-to-speech." --output out.wav
+"""
+import argparse
+import logging
+import torch
+import soundfile as sf
+from omnivoice.models.omnivoice import OmniVoice
+from omnivoice.utils.common import str2bool
+def get_best_device():
+    """Auto-detect the best available device: CUDA > MPS > CPU."""
+    if torch.cuda.is_available():
+        return "cuda"
+    if torch.backends.mps.is_available():
+        return "mps"
+    return "cpu"
+def get_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        description="OmniVoice single-item inference",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="Hak5/AVoice",
+        help="Model checkpoint path or HuggingFace repo id.",
+    )
+    parser.add_argument(
+        "--text",
+        type=str,
+        required=True,
+        help="Text to synthesize.",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        required=True,
+        help="Output WAV file path.",
+    )
+    # Voice cloning
+    parser.add_argument(
+        "--ref_audio",
+        type=str,
+        default=None,
+        help="Reference audio file path for voice cloning.",
+    )
+    parser.add_argument(
+        "--ref_text",
+        type=str,
+        default=None,
+        help="Reference text describing the reference audio.",
+    )
+    # Voice design
+    parser.add_argument(
+        "--instruct",
+        type=str,
+        default=None,
+        help="Style instruction for voice design mode.",
+    )
+    parser.add_argument(
+        "--language",
+        type=str,
+        default=None,
+        help="Language name (e.g. 'English') or code (e.g. 'en').",
+    )
+    # Generation parameters
+    parser.add_argument("--num_step", type=int, default=32)
+    parser.add_argument("--guidance_scale", type=float, default=2.0)
+    parser.add_argument("--speed", type=float, default=1.0)
+    parser.add_argument(
+        "--duration",
+        type=float,
+        default=None,
+        help="Fixed output duration in seconds. If set, overrides the "
+        "model's duration estimation. The speed factor is automatically "
+        "adjusted to match while preserving language-aware pacing.",
+    )
+    parser.add_argument("--t_shift", type=float, default=0.1)
+    parser.add_argument("--denoise", type=str2bool, default=True)
+    parser.add_argument(
+        "--postprocess_output",
+        type=str2bool,
+        default=True,
+    )
+    parser.add_argument("--layer_penalty_factor", type=float, default=5.0)
+    parser.add_argument("--position_temperature", type=float, default=5.0)
+    parser.add_argument("--class_temperature", type=float, default=0.0)
+    parser.add_argument(
+        "--device",
+        type=str,
+        default=None,
+        help="Device to use for inference. Auto-detected if not specified.",
+    )
+    return parser
+def main():
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
+    args = get_parser().parse_args()
+    device = args.device or get_best_device()
+    logging.info(f"Loading model from {args.model} on {device} ...")
+    dtype = torch.float16 if device == "cuda" else torch.float32
+    model = OmniVoice.from_pretrained(args.model, device_map=device, dtype=dtype)
+    logging.info(f"Generating audio for: {args.text[:80]}...")
+    audios = model.generate(
+        text=args.text,
+        language=args.language,
+        ref_audio=args.ref_audio,
+        ref_text=args.ref_text,
+        instruct=args.instruct,
+        duration=args.duration,
+        num_step=args.num_step,
+        guidance_scale=args.guidance_scale,
+        speed=args.speed,
+        t_shift=args.t_shift,
+        denoise=args.denoise,
+        postprocess_output=args.postprocess_output,
+        layer_penalty_factor=args.layer_penalty_factor,
+        position_temperature=args.position_temperature,
+        class_temperature=args.class_temperature,
+    )
+    sf.write(args.output, audios[0], model.sampling_rate)
+    logging.info(f"Saved to {args.output}")
+# Run the CLI only when this module is executed as a script, not on import.
+if __name__ == "__main__":
+    main()

runtime/omnivoice/models/__init__.py ADDED Viewed

File without changes

runtime/omnivoice/models/omnivoice.py ADDED Viewed

	@@ -0,0 +1,1610 @@

+#!/usr/bin/env python3
+# Copyright    2026  Xiaomi Corp.        (authors:  Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Core OmniVoice model implementation for AVoice inference.
+``OmniVoice.from_pretrained()`` loads a local or Hugging Face checkpoint, then
+``model.generate()`` synthesizes audio from text with optional voice cloning,
+voice design, and Armenian text normalization.
+"""
+import difflib
+import logging
+import math
+import os
+import re
+from dataclasses import dataclass, fields
+from functools import partial
+from typing import Any, List, Optional, Union
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchaudio
+try:
+    from torch.nn.attention.flex_attention import create_block_mask
+    _flex_attention_available = True
+except ImportError:
+    _flex_attention_available = False
+from transformers import (
+    AutoFeatureExtractor,
+    AutoModel,
+    AutoTokenizer,
+    HiggsAudioV2TokenizerModel,
+    PretrainedConfig,
+    PreTrainedModel,
+)
+from transformers.modeling_outputs import ModelOutput
+from transformers.models.auto import CONFIG_MAPPING, AutoConfig
+from omnivoice.utils.audio import (
+    cross_fade_chunks,
+    fade_and_pad_audio,
+    load_audio,
+    remove_silence,
+    trim_long_audio,
+)
+from omnivoice.utils.armenian_text import normalize_for_tts
+from omnivoice.utils.duration import RuleDurationEstimator
+from omnivoice.utils.lang_map import LANG_IDS, LANG_NAMES
+from omnivoice.utils.text import add_punctuation, chunk_text_punctuation
+from omnivoice.utils.voice_design import (
+    _INSTRUCT_ALL_VALID,
+    _INSTRUCT_EN_TO_ZH,
+    _INSTRUCT_MUTUALLY_EXCLUSIVE,
+    _INSTRUCT_VALID_EN,
+    _INSTRUCT_VALID_ZH,
+    _INSTRUCT_ZH_TO_EN,
+    _ZH_RE,
+)
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Dataclasses
+# ---------------------------------------------------------------------------
+@dataclass
+class VoiceClonePrompt:
+    """Bundle of reference data used to condition a voice-cloning request.
+    Holds the tokenized reference audio, the text associated with it, and an
+    RMS value for the reference.
+    """
+    # Tokenized reference audio, laid out as (C, T).
+    # NOTE(review): C is presumably the codebook axis and T the frame axis - confirm.
+    ref_audio_tokens: torch.Tensor  # (C, T)
+    # Text associated with the reference audio.
+    ref_text: str
+    # RMS value of the reference; presumably its loudness level - TODO confirm
+    # against the code that builds and consumes this prompt.
+    ref_rms: float
+@dataclass
+class OmniVoiceGenerationConfig:
+    num_step: int = 32
+    guidance_scale: float = 2.0
+    t_shift: float = 0.1
+    layer_penalty_factor: float = 5.0
+    position_temperature: float = 5.0
+    class_temperature: float = 0.0
+    denoise: bool = True
+    preprocess_prompt: bool = True
+    postprocess_output: bool = True
+    audio_chunk_duration: float = 15.0
+    audio_chunk_threshold: float = 30.0
+    @classmethod
+    def from_dict(cls, kwargs_dict):
+        valid_keys = {f.name for f in fields(cls)}
+        filtered = {k: v for k, v in kwargs_dict.items() if k in valid_keys}
+        return cls(**filtered)
+@dataclass
+class GenerationTask:
+    batch_size: int
+    texts: List[str]
+    target_lens: List[int]
+    langs: List[Optional[str]]
+    instructs: List[Optional[str]]
+    ref_texts: List[Optional[str]]
+    ref_audio_tokens: List[Optional[torch.Tensor]]
+    ref_rms: List[Optional[float]]
+    speed: Optional[List[float]] = None
+    def get_indices(self, config: OmniVoiceGenerationConfig, frame_rate: int):
+        threshold = int(config.audio_chunk_threshold * frame_rate)
+        short_idx = [i for i, l in enumerate(self.target_lens) if l <= threshold]
+        long_idx = [i for i, l in enumerate(self.target_lens) if l > threshold]
+        return short_idx, long_idx
+    def slice_task(self, indices: List[int]):
+        if not indices:
+            return None
+        return GenerationTask(
+            batch_size=len(indices),
+            texts=[self.texts[i] for i in indices],
+            target_lens=[self.target_lens[i] for i in indices],
+            langs=[self.langs[i] for i in indices],
+            instructs=[self.instructs[i] for i in indices],
+            ref_texts=[self.ref_texts[i] for i in indices],
+            ref_audio_tokens=[self.ref_audio_tokens[i] for i in indices],
+            ref_rms=[self.ref_rms[i] for i in indices],
+            speed=[self.speed[i] for i in indices] if self.speed else None,
+        )
+@dataclass
+class OmniVoiceModelOutput(ModelOutput):
+    """Output container for the model; every field is optional and defaults to ``None``.
+    NOTE(review): the meaning of each field below is inferred from its name;
+    confirm against the code that builds this object (not visible in this span).
+    """
+    # Presumably the scalar training loss - TODO confirm.
+    loss: Optional[torch.Tensor] = None
+    # Presumably the prediction logits - TODO confirm.
+    logits: Optional[torch.Tensor] = None
+    # Presumably per-codebook-layer loss values - TODO confirm.
+    layer_losses: Optional[torch.Tensor] = None
+    # Presumably per-codebook-layer token counts - TODO confirm.
+    layer_token_counts: Optional[torch.Tensor] = None
+# ---------------------------------------------------------------------------
+# Config & Model
+# ---------------------------------------------------------------------------
+class OmniVoiceConfig(PretrainedConfig):
+    """Configuration for the OmniVoice model.
+    Stores the audio-codebook settings and the configuration of the wrapped
+    LLM backbone.
+    Args:
+        audio_vocab_size: Stored as ``audio_vocab_size`` (default 1025).
+        audio_mask_id: Stored as ``audio_mask_id`` (default 1024).
+        num_audio_codebook: Stored as ``num_audio_codebook`` (default 8).
+        audio_codebook_weights: Per-codebook weights; when ``None`` the list
+            ``[8, 8, 6, 6, 4, 4, 2, 2]`` is used.
+        llm_config: Backbone configuration, either a config object or a dict
+            containing a ``"model_type"`` key.
+        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
+    """
+    model_type = "omnivoice"
+    sub_configs = {"llm_config": AutoConfig}
+    def __init__(
+        self,
+        audio_vocab_size: int = 1025,
+        audio_mask_id: int = 1024,
+        num_audio_codebook: int = 8,
+        audio_codebook_weights: Optional[list[float]] = None,
+        llm_config: Optional[Union[dict, PretrainedConfig]] = None,
+        **kwargs,
+    ):
+        # A dict is turned into the config class registered for its
+        # "model_type" in CONFIG_MAPPING.
+        if isinstance(llm_config, dict):
+            llm_config = CONFIG_MAPPING[llm_config["model_type"]](**llm_config)
+        # NOTE(review): llm_config is assigned before super().__init__();
+        # presumably the base class expects sub-configs to exist already -
+        # keep this order unless verified otherwise.
+        self.llm_config = llm_config
+        super().__init__(**kwargs)
+        self.audio_vocab_size = audio_vocab_size
+        self.audio_mask_id = audio_mask_id
+        self.num_audio_codebook = num_audio_codebook
+        # The default has 8 entries, matching the default num_audio_codebook.
+        if audio_codebook_weights is None:
+            audio_codebook_weights = [8, 8, 6, 6, 4, 4, 2, 2]
+        self.audio_codebook_weights = audio_codebook_weights
+def _resolve_model_path(name_or_path: str) -> str:
+    if os.path.isdir(name_or_path):
+        return name_or_path
+    from huggingface_hub import snapshot_download
+    return snapshot_download(name_or_path)
+class OmniVoice(PreTrainedModel):
+    _supports_flex_attn = True
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    config_class = OmniVoiceConfig
+    def __init__(self, config: OmniVoiceConfig, llm: Optional[PreTrainedModel] = None):
+        super().__init__(config)
+        if llm is not None:
+            # If an LLM instance is provided, use it directly
+            # (skipping config-based init).
+            self.llm = llm
+        else:
+            # Otherwise, initialize the LLM from the config.
+            self.llm = AutoModel.from_config(self.config.llm_config)
+        self.audio_embeddings = nn.Embedding(
+            config.num_audio_codebook * config.audio_vocab_size,
+            self.config.llm_config.hidden_size,
+        )
+        self.register_buffer(
+            "codebook_layer_offsets",
+            torch.arange(config.num_audio_codebook) * config.audio_vocab_size,
+        )
+        self.audio_heads = nn.Linear(
+            self.config.llm_config.hidden_size,
+            config.num_audio_codebook * config.audio_vocab_size,
+            bias=False,
+        )
+        self.normalized_audio_codebook_weights = [
+            w / sum(config.audio_codebook_weights)
+            for w in config.audio_codebook_weights
+        ]
+        self.post_init()
+        # Inference-only attributes (set by from_pretrained when not in train mode)
+        self.text_tokenizer = None
+        self.audio_tokenizer = None
+        self.duration_estimator = None
+        self.sampling_rate = None
+        self._asr_pipe = None
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+        train_mode = kwargs.pop("train", False)
+        load_asr = kwargs.pop("load_asr", False)
+        asr_model_name = kwargs.pop("asr_model_name", "openai/whisper-large-v3-turbo")
+        # Suppress noisy INFO logs from transformers/huggingface_hub during loading
+        _prev_disable = logging.root.manager.disable
+        logging.disable(logging.INFO)
+        try:
+            # Resolve to local path first; download only if not already cached
+            resolved_path = _resolve_model_path(pretrained_model_name_or_path)
+            model = super().from_pretrained(resolved_path, *args, **kwargs)
+            if not train_mode:
+                model.text_tokenizer = AutoTokenizer.from_pretrained(resolved_path)
+                audio_tokenizer_path = os.path.join(resolved_path, "audio_tokenizer")
+                if not os.path.isdir(audio_tokenizer_path):
+                    audio_tokenizer_path = _resolve_model_path(
+                        "eustlb/higgs-audio-v2-tokenizer"
+                    )
+                # higgs-audio-v2-tokenizer does not support MPS
+                # (output channels > 65536)
+                tokenizer_device = (
+                    "cpu" if str(model.device).startswith("mps") else model.device
+                )
+                model.audio_tokenizer = HiggsAudioV2TokenizerModel.from_pretrained(
+                    audio_tokenizer_path, device_map=tokenizer_device
+                )
+                model.feature_extractor = AutoFeatureExtractor.from_pretrained(
+                    audio_tokenizer_path
+                )
+                model.sampling_rate = model.feature_extractor.sampling_rate
+                model.duration_estimator = RuleDurationEstimator()
+                if load_asr:
+                    model.load_asr_model(model_name=asr_model_name)
+        finally:
+            logging.disable(_prev_disable)
+        return model
+    # -------------------------------------------------------------------
+    # ASR support (optional, for auto-transcription)
+    # -------------------------------------------------------------------
+    def load_asr_model(self, model_name: str = "openai/whisper-large-v3-turbo"):
+        """Load a Whisper ASR model for reference audio transcription.
+        Args:
+            model_name: HuggingFace model name or local path for the Whisper model.
+        """
+        from transformers import pipeline as hf_pipeline
+        logger.info("Loading ASR model %s ...", model_name)
+        asr_dtype = (
+            torch.float16 if str(self.device).startswith("cuda") else torch.float32
+        )
+        model_name = _resolve_model_path(model_name)
+        self._asr_pipe = hf_pipeline(
+            "automatic-speech-recognition",
+            model=model_name,
+            dtype=asr_dtype,
+            device_map=self.device,
+        )
+        logger.info("ASR model loaded on %s.", self.device)
+    @torch.inference_mode()
+    def transcribe(
+        self,
+        audio: Union[str, tuple],
+    ) -> str:
+        """Transcribe audio using the loaded Whisper ASR model.
+        Args:
+            audio: File path or ``(waveform, sample_rate)`` tuple.
+                Waveform can be a numpy array or torch.Tensor of shape
+                ``(1, T)`` or ``(T,)``.
+        Returns:
+            Transcribed text.
+        """
+        if self._asr_pipe is None:
+            raise RuntimeError(
+                "ASR model is not loaded. Call model.load_asr_model() first."
+            )
+        if isinstance(audio, str):
+            return self._asr_pipe(audio)["text"].strip()
+        else:
+            waveform, sr = audio
+            if isinstance(waveform, torch.Tensor):
+                waveform = waveform.cpu().numpy()
+            waveform = np.squeeze(waveform)  # (1, T) or (T,) → (T,)
+            audio_input = {
+                "array": waveform,
+                "sampling_rate": sr,
+            }
+            return self._asr_pipe(audio_input)["text"].strip()
+    def get_input_embeddings(self):
+        """Return the input embedding module of the wrapped LLM."""
+        return self.llm.get_input_embeddings()
+    def set_input_embeddings(self, value):
+        """Replace the input embedding module of the wrapped LLM with ``value``."""
+        self.llm.set_input_embeddings(value)
+    def _prepare_embed_inputs(
+        self, input_ids: torch.Tensor, audio_mask: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Prepares embeddings from input_ids of shape (batch_size, layers, seq_length).
+        Embedding shape is (batch_size, seq_length, hidden_size).
+        """
+        text_embeds = self.get_input_embeddings()(input_ids[:, 0, :])
+        # Apply shift to audio IDs based on codebook layer
+        # audio_ids: [Batch, 8, Seq]
+        # codebook_layer_offsets: [1, 8, 1]
+        # Result: Layer 0 ID Layer 1 ID + Layer 2 ID + 2050...
+        shifted_ids = (
+            input_ids * audio_mask.unsqueeze(1)
+        ) + self.codebook_layer_offsets.view(1, -1, 1)
+        # input: [Batch, 8, Seq] -> output: [Batch, Seq, Hidden]
+        audio_embeds = self.audio_embeddings(shifted_ids).sum(dim=1)
+        return torch.where(audio_mask.unsqueeze(-1), audio_embeds, text_embeds)
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        audio_mask: torch.Tensor,
+        labels: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        document_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+    ):
+        """Run the LLM backbone and predict logits for every audio codebook layer.
+        Args:
+            input_ids: Token IDs of shape ``(batch, layers, seq)``.
+            audio_mask: Mask marking audio positions; presumably boolean of
+                shape ``(batch, seq)`` -- confirm against callers.
+            labels: Optional targets of shape ``(batch, layers, seq)``;
+                positions equal to ``-100`` are ignored by the loss.
+            attention_mask: Optional attention mask forwarded to the LLM.
+            document_ids: Optional document IDs. Only consulted when
+                ``attention_mask`` is ``None``; ``document_ids[0]`` is then
+                used to build a flex-attention block mask.
+            position_ids: Optional position IDs forwarded to the LLM.
+        Returns:
+            :class:`OmniVoiceModelOutput` with ``logits`` of shape
+            ``(batch, layers, seq, vocab)``. ``loss``, ``layer_losses`` and
+            ``layer_token_counts`` are ``None`` unless ``labels`` is given.
+        Raises:
+            RuntimeError: If a block mask is required but flex_attention is
+                not available.
+        """
+        inputs_embeds = self._prepare_embed_inputs(input_ids, audio_mask)
+        # Without an explicit mask, derive a block mask from the document IDs.
+        if attention_mask is None and document_ids is not None:
+            if not _flex_attention_available:
+                raise RuntimeError(
+                    "flex_attention is not available in the current environment. "
+                    "If you do not need flex_attention, set "
+                    '"attn_implementation": "sdpa" in your training config.'
+                )
+            attention_mask = create_block_mask(
+                _get_packed_mask(
+                    document_ids[0].to(inputs_embeds.device),
+                ),
+                B=None,
+                H=None,
+                Q_LEN=input_ids.size(-1),
+                KV_LEN=input_ids.size(-1),
+                _compile=True,
+                device=inputs_embeds.device,
+            )
+        llm_outputs = self.llm(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            return_dict=True,
+            position_ids=position_ids,
+        )
+        hidden_states = llm_outputs[0]
+        loss = None
+        layer_losses = None
+        layer_token_counts = None
+        batch_size, seq_len, _ = hidden_states.shape
+        # logits_flat shape: [B, S, C * Vocab]
+        logits_flat = self.audio_heads(hidden_states)
+        # Shape: [B, S, C, Vocab] -> [B, C, S, Vocab]
+        audio_logits = logits_flat.view(
+            batch_size,
+            seq_len,
+            self.config.num_audio_codebook,
+            self.config.audio_vocab_size,
+        ).permute(0, 2, 1, 3)
+        if labels is not None:
+            # audio_logits.permute(0, 3, 1, 2):
+            # [Batch, Layer, Seq, Vocab] -> [Batch, Vocab, Layer, Seq]
+            # per_token_loss shape: [Batch, Layer, Seq], ignoring -100 labels
+            per_token_loss = torch.nn.functional.cross_entropy(
+                audio_logits.permute(0, 3, 1, 2),
+                labels,
+                reduction="none",
+                ignore_index=-100,
+            )
+            # valid_mask shape: [Batch, Layer, Seq]
+            valid_mask = (labels != -100).float()
+            # Count of supervised positions per codebook layer.
+            layer_token_counts = valid_mask.sum(dim=(0, 2))
+            # layer_losses shape: [num_layers]; the clamp avoids dividing by
+            # zero for a layer that has no supervised positions.
+            layer_losses = (per_token_loss * valid_mask).sum(
+                dim=(0, 2)
+            ) / layer_token_counts.clamp(min=1.0)
+            weights = torch.tensor(
+                self.normalized_audio_codebook_weights, device=audio_logits.device
+            )
+            # Total loss is the weighted sum of the per-layer mean losses.
+            loss = (layer_losses * weights).sum()
+        return OmniVoiceModelOutput(
+            loss=loss,
+            logits=audio_logits,
+            layer_losses=layer_losses,
+            layer_token_counts=layer_token_counts,
+        )
+    def supported_language_ids(self) -> set[str]:
+        """Return the set of supported language IDs (the module-level ``LANG_IDS``)."""
+        return LANG_IDS
+    def supported_language_names(self) -> set[str]:
+        """Return the set of supported language names (the module-level ``LANG_NAMES``)."""
+        return LANG_NAMES
+    # -------------------------------------------------------------------
+    # Inference API
+    # -------------------------------------------------------------------
+    @torch.inference_mode()
+    def generate(
+        self,
+        text: Union[str, list[str]],
+        language: Union[str, list[str], None] = None,
+        ref_text: Union[str, list[str], None] = None,
+        ref_audio: Union[
+            str,
+            list[str],
+            tuple[torch.Tensor, int],
+            list[tuple[torch.Tensor, int]],
+            None,
+        ] = None,
+        voice_clone_prompt: Union[
+            VoiceClonePrompt, list[VoiceClonePrompt], None
+        ] = None,
+        instruct: Union[str, list[str], None] = None,
+        duration: Union[float, list[Optional[float]], None] = None,
+        speed: Union[float, list[Optional[float]], None] = None,
+        generation_config: Optional[OmniVoiceGenerationConfig] = None,
+        **kwargs,
+    ) -> list[np.ndarray]:
+        """Generate speech audio given text in various modes.
+        Supports three modes:
+        1. **Voice clone** — clone the voice style from the reference audio.
+            Should provide ``voice_clone_prompt`` (from
+           :meth:`create_voice_clone_prompt`) or ``ref_text`` + ``ref_audio``.
+        2. **Voice design** — provide ``instruct`` text describing
+           the desired voice style; no reference audio needed.
+        3. **Auto** — provide neither; the model picks a voice itself.
+        Args:
+            text: Target text (single string or list for batch).
+            language: Language name (e.g. ``"English"``) or code
+                (e.g. ``"en"``). ``None`` for language-agnostic mode.
+                Performance is slightly better if you specify the language.
+            ref_text: Optional reference text for voice cloning mode.
+            ref_audio: Optional reference audio for voice cloning mode.
+                Can be a file path or a (waveform, sample_rate) tuple.
+            voice_clone_prompt: Reusable prompt from :meth:`create_voice_clone_prompt`.
+                If provided, it overrides ``ref_text`` and ``ref_audio``.
+            instruct: Style instruction for voice design mode.
+            duration: Fixed output duration in seconds. If a single float,
+                applies to all items; if a list, one value per item.
+                ``None`` (default) lets the model estimate duration from text.
+                Overrides ``speed`` when both are provided.
+            speed: Speaking speed factor. ``> 1.0`` for faster, ``< 1.0`` for
+                slower. If a list, one value per item. ``None`` (default) uses
+                the model's default estimation.
+            generation_config: Explicit config object. If provided, takes
+                precedence over ``**kwargs``.
+            **kwargs: Generation config or its fields:
+                denoise: Whether to prepend the ``<|denoise|>`` token.
+                num_step: Number of iterative decoding steps.
+                guidance_scale: Classifier-free guidance scale.
+                t_shift: Time-step shift (smaller → emphasise low-SNR).
+                postprocess_output: Post-process output (remove silence, fade-in/out, pad edges).
+                layer_penalty_factor: Penalty encouraging earlier codebook
+                    layers to unmask first.
+                position_temperature: Temperature for position selection.
+                class_temperature: Temperature for token sampling (0 = greedy).
+                audio_chunk_duration: If > 0, split long text into chunks of
+                    this duration (seconds) and generate chunk by chunk.
+                audio_chunk_threshold: Only apply chunking if estimated audio
+                    duration exceeds this threshold (seconds).
+        Returns:
+            ``audios`` a list of 1-D ``np.ndarray`` with shape ``(T,)`` and
+            sampling rate consistent with the model's audio tokenizer
+            (usually 24 000 Hz).  Can be saved directly with
+            ``soundfile.write("out.wav", audios[0], model.sampling_rate)``.
+        """
+        if self.audio_tokenizer is None or self.text_tokenizer is None:
+            raise RuntimeError(
+                "Model is not loaded with audio/text tokenizers. Make sure you "
+                "loaded the model with OmniVoice.from_pretrained()."
+            )
+        gen_config = (
+            generation_config
+            if generation_config is not None
+            else OmniVoiceGenerationConfig.from_dict(kwargs)
+        )
+        self.eval()
+        full_task = self._preprocess_all(
+            text=text,
+            language=language,
+            ref_text=ref_text,
+            ref_audio=ref_audio,
+            voice_clone_prompt=voice_clone_prompt,
+            instruct=instruct,
+            preprocess_prompt=gen_config.preprocess_prompt,
+            speed=speed,
+            duration=duration,
+        )
+        short_idx, long_idx = full_task.get_indices(
+            gen_config, self.audio_tokenizer.config.frame_rate
+        )
+        results = [None] * full_task.batch_size
+        if short_idx:
+            short_task = full_task.slice_task(short_idx)
+            short_results = self._generate_iterative(short_task, gen_config)
+            for idx, res in zip(short_idx, short_results):
+                results[idx] = res
+        if long_idx:
+            long_task = full_task.slice_task(long_idx)
+            long_results = self._generate_chunked(long_task, gen_config)
+            for idx, res in zip(long_idx, long_results):
+                results[idx] = res
+        generated_audios = []
+        for i in range(full_task.batch_size):
+            assert results[i] is not None, f"Result {i} was not generated"
+            generated_audios.append(
+                self._decode_and_post_process(
+                    results[i], full_task.ref_rms[i], gen_config  # type: ignore[arg-type]
+                )
+            )
+        return generated_audios
+    def create_voice_clone_prompt(
+        self,
+        ref_audio: Union[str, tuple[torch.Tensor, int]],
+        ref_text: Optional[str] = None,
+        preprocess_prompt: bool = True,
+    ) -> VoiceClonePrompt:
+        """Create a reusable voice clone prompt from reference audio.
+        Args:
+            ref_audio: File path (str) or ``(waveform, sample_rate)`` tuple.
+                waveform should be 1-D or 2-D (channels x samples); a
+                torch.Tensor is converted to numpy, other inputs are used as
+                given. Multi-channel audio is averaged down to mono.
+            ref_text: Transcript of the reference audio. If ``None``, the
+                ASR model is used to auto-transcribe (it is loaded on the fly
+                with default settings if not loaded yet).
+            preprocess_prompt: If ``True`` (default), apply silence removal and
+                trimming to the reference audio, and add punctuation at the
+                end of the reference text (if not already present).
+        Returns:
+            A :class:`VoiceClonePrompt` that can be passed to :meth:`generate`.
+        Raises:
+            RuntimeError: If the audio tokenizer is not loaded.
+            ValueError: If the reference audio is empty after silence removal.
+        """
+        if self.audio_tokenizer is None:
+            raise RuntimeError(
+                "Audio tokenizer is not loaded. Make sure you loaded the model "
+                "with OmniVoice.from_pretrained()."
+            )
+        if isinstance(ref_audio, str):
+            ref_wav = load_audio(ref_audio, self.sampling_rate)
+        else:
+            waveform, sr = ref_audio
+            if isinstance(waveform, torch.Tensor):
+                waveform = waveform.cpu().numpy()
+            # Normalise to a (1, T) mono layout.
+            if waveform.ndim == 1:
+                waveform = waveform[np.newaxis, :]
+            if waveform.shape[0] > 1:
+                waveform = np.mean(waveform, axis=0, keepdims=True)
+            if sr != self.sampling_rate:
+                waveform = torchaudio.functional.resample(
+                    torch.from_numpy(waveform),
+                    orig_freq=sr,
+                    new_freq=self.sampling_rate,
+                ).numpy()
+            ref_wav = waveform
+        # RMS is measured before any trimming; the original (pre-boost) value
+        # is what gets stored in the prompt.
+        ref_rms = float(np.sqrt(np.mean(ref_wav**2)))
+        # Boost quiet (but not silent) references to an RMS of 0.1.
+        if 0 < ref_rms < 0.1:
+            ref_wav = ref_wav * 0.1 / ref_rms
+        if preprocess_prompt:
+            # Trim long reference audio (>20s) by splitting at the largest silence gap.
+            # Skip trimming when ref_text is user-provided, otherwise the
+            # trimmed audio will no longer match the full transcript.
+            if ref_text is None:
+                ref_wav = trim_long_audio(
+                    ref_wav, self.sampling_rate, trim_threshold=20.0
+                )
+            ref_wav = remove_silence(
+                ref_wav,
+                self.sampling_rate,
+                mid_sil=200,
+                lead_sil=100,
+                trail_sil=200,
+            )
+            if ref_wav.shape[-1] == 0:
+                raise ValueError(
+                    "Reference audio is empty after silence removal. "
+                    "Try setting preprocess_prompt=False."
+                )
+        ref_duration = ref_wav.shape[-1] / self.sampling_rate
+        if ref_duration > 20.0:
+            logger.warning(
+                "Reference audio is %.1fs long (>20s). This may cause slower "
+                "generation, higher memory usage, and degraded voice cloning "
+                "quality. We recommend trimming it to 3-10s.",
+                ref_duration,
+            )
+        # Auto-transcribe if ref_text not provided
+        if ref_text is None:
+            if self._asr_pipe is None:
+                logger.info("ASR model not loaded yet, loading on-the-fly ...")
+                self.load_asr_model()
+            ref_text = self.transcribe((ref_wav, self.sampling_rate))
+            logger.debug("Auto-transcribed ref_text: %s", ref_text)
+        # Drop trailing samples so the length is a whole multiple of the
+        # tokenizer hop length.
+        chunk_size = self.audio_tokenizer.config.hop_length
+        clip_size = int(ref_wav.shape[-1] % chunk_size)
+        ref_wav = ref_wav[:, :-clip_size] if clip_size > 0 else ref_wav
+        # numpy → torch at tokenizer boundary
+        ref_wav_tensor = torch.from_numpy(ref_wav).to(self.audio_tokenizer.device)
+        ref_audio_tokens = self.audio_tokenizer.encode(
+            ref_wav_tensor.unsqueeze(0),
+        ).audio_codes.squeeze(
+            0
+        )  # (C, T)
+        if preprocess_prompt:
+            ref_text = add_punctuation(ref_text)
+        return VoiceClonePrompt(
+            ref_audio_tokens=ref_audio_tokens,
+            ref_text=ref_text,
+            ref_rms=ref_rms,
+        )
+    def _decode_and_post_process(
+        self,
+        tokens: Union[torch.Tensor, List[torch.Tensor]],
+        rms: Union[float, None],
+        gen_config: OmniVoiceGenerationConfig,
+    ) -> np.ndarray:
+        """
+        Args:
+            tokens: Audio tokens — either a single tensor of shape
+                (num_codebooks, seq_len) or a list of chunk tensors.
+            rms: RMS of the reference audio for volume adjustment.
+            gen_config: Generation config for post-processing options.
+        Returns:
+            Decoded and post-processed audio array of shape (T,).
+        """
+        tokenizer_device = self.audio_tokenizer.device
+        if isinstance(tokens, list):
+            chunk_audios = [
+                self.audio_tokenizer.decode(t.to(tokenizer_device).unsqueeze(0))
+                .audio_values[0]
+                .cpu()
+                .numpy()
+                for t in tokens
+            ]
+            audio_waveform = cross_fade_chunks(chunk_audios, self.sampling_rate)
+        else:
+            audio_waveform = (
+                self.audio_tokenizer.decode(tokens.to(tokenizer_device).unsqueeze(0))
+                .audio_values[0]
+                .cpu()
+                .numpy()
+            )
+        audio_waveform = self._post_process_audio(
+            audio_waveform,
+            postprocess_output=gen_config.postprocess_output,
+            ref_rms=rms,
+        )
+        return audio_waveform.squeeze(0)
+    def _post_process_audio(
+        self,
+        generated_audio: np.ndarray,
+        postprocess_output: bool,
+        ref_rms: Union[float, None],
+    ) -> np.ndarray:
+        """Optionally remove long silences, adjust volume, and add edge padding.
+        Args:
+            generated_audio: Numpy array of shape (1, T).
+            postprocess_output: If True, remove long silences and apply fade/pad.
+            ref_rms: RMS of the reference audio for volume normalisation.
+        Returns:
+            Processed numpy array of shape (1, T).
+        """
+        if postprocess_output:
+            generated_audio = remove_silence(
+                generated_audio,
+                self.sampling_rate,
+                mid_sil=500,
+                lead_sil=100,
+                trail_sil=100,
+            )
+        if ref_rms is not None and ref_rms < 0.1:
+            generated_audio = generated_audio * ref_rms / 0.1
+        elif ref_rms is None:
+            peak = np.abs(generated_audio).max()
+            if peak > 1e-6:
+                generated_audio = generated_audio / peak * 0.5
+        generated_audio = fade_and_pad_audio(
+            generated_audio,
+            sample_rate=self.sampling_rate,
+        )
+        return generated_audio
+    def _generate_chunked(
+        self, task: GenerationTask, gen_config: OmniVoiceGenerationConfig
+    ) -> List[List[torch.Tensor]]:
+        """Generate long audio by splitting text into chunks and batching.
+        Each item in the returned list corresponds to one input and contains
+        a list of audio token tensors — one per text chunk.
+        Chunk target lengths are re-estimated per chunk from the chunk text;
+        the item-level ``task.target_lens`` are only used to size the chunks.
+        Args:
+            task: A :class:`GenerationTask` with one or more items whose
+                estimated audio exceeds ``audio_chunk_threshold``.
+            gen_config: Generation config (``audio_chunk_duration`` controls
+                chunk size).
+        Returns:
+            Per-item list of chunk token-tensor lists.
+        """
+        # Chunk each item's text
+        all_chunks = []
+        for i in range(task.batch_size):
+            # Convert the desired chunk duration (in audio frames) into a
+            # character budget via the item's average frames-per-character.
+            # NOTE(review): an empty text or a zero target length would raise
+            # ZeroDivisionError here -- confirm callers rule this out.
+            avg_tokens_per_char = task.target_lens[i] / len(task.texts[i])
+            text_chunk_len = int(
+                gen_config.audio_chunk_duration
+                * self.audio_tokenizer.config.frame_rate
+                / avg_tokens_per_char
+            )
+            chunks = chunk_text_punctuation(
+                text=task.texts[i],
+                chunk_len=text_chunk_len,
+                min_chunk_len=3,
+            )
+            logger.debug(f"Item {i} chunked into {len(chunks)} pieces: {chunks}")
+            all_chunks.append(chunks)
+        has_ref = [t is not None for t in task.ref_audio_tokens]
+        # NOTE(review): this check is an assert and disappears under ``python -O``.
+        assert all(has_ref) or not any(has_ref), (
+            "Chunked inference requires all items to either have or not have "
+            "ref_audio. Mixed ref/non-ref is not supported."
+        )
+        max_num_chunks = max(len(c) for c in all_chunks)
+        # chunk_results[item_idx] = list of generated token tensors per chunk
+        chunk_results = [[] for _ in range(task.batch_size)]
+        def _run_batch(indices, texts, ref_audios, ref_texts):
+            # Generate one chunk for each item in ``indices`` (positions in
+            # ``task``); ``texts``/``ref_audios``/``ref_texts`` are aligned
+            # with ``indices``. Results are appended to ``chunk_results``.
+            speed_list = task.speed
+            target_lens = [
+                self._estimate_target_tokens(
+                    texts[j],
+                    ref_texts[j],
+                    ref_audios[j].size(-1) if ref_audios[j] is not None else None,
+                    speed=speed_list[i] if speed_list else 1.0,
+                )
+                for j, i in enumerate(indices)
+            ]
+            sub_task = GenerationTask(
+                batch_size=len(indices),
+                texts=texts,
+                target_lens=target_lens,
+                langs=[task.langs[i] for i in indices],
+                instructs=[task.instructs[i] for i in indices],
+                ref_texts=ref_texts,
+                ref_audio_tokens=ref_audios,
+                ref_rms=[task.ref_rms[i] for i in indices],
+                speed=[task.speed[i] for i in indices] if task.speed else None,
+            )
+            gen_tokens = self._generate_iterative(sub_task, gen_config)
+            for j, idx in enumerate(indices):
+                chunk_results[idx].append(gen_tokens[j])
+        if all(has_ref):
+            # All items have reference audio.
+            # We still sequentially generate chunks within each item, but we
+            # batch across items for the same chunk index. This keeps the
+            # VRAM usage manageable while still benefiting from batching.
+            for ci in range(max_num_chunks):
+                indices = [i for i in range(task.batch_size) if ci < len(all_chunks[i])]
+                if not indices:
+                    continue
+                _run_batch(
+                    indices,
+                    texts=[all_chunks[i][ci] for i in indices],
+                    ref_audios=[task.ref_audio_tokens[i] for i in indices],
+                    ref_texts=[task.ref_texts[i] for i in indices],
+                )
+        else:
+            # No reference audio — generate chunk 0 for all items first,
+            # then use chunk 0 output as reference for all subsequent chunks.
+            indices_0 = [i for i in range(task.batch_size) if len(all_chunks[i]) > 0]
+            _run_batch(
+                indices_0,
+                texts=[all_chunks[i][0] for i in indices_0],
+                ref_audios=[None] * len(indices_0),
+                ref_texts=[None] * len(indices_0),
+            )
+            # Generated tokens of chunk 0, keyed by item index.
+            first_chunk_map = {idx: chunk_results[idx][0] for idx in indices_0}
+            # Batch all remaining chunks, using chunk 0 as fixed reference
+            # (its tokens as reference audio, its text as reference text).
+            for ci in range(1, max_num_chunks):
+                indices = [i for i in range(task.batch_size) if ci < len(all_chunks[i])]
+                if not indices:
+                    continue
+                _run_batch(
+                    indices,
+                    texts=[all_chunks[i][ci] for i in indices],
+                    ref_audios=[first_chunk_map[i] for i in indices],
+                    ref_texts=[all_chunks[i][0] for i in indices],
+                )
+        return chunk_results
+    def _preprocess_all(
+        self,
+        text: Union[str, list[str]],
+        language: Union[str, list[str], None] = None,
+        ref_text: Union[str, list[str], None] = None,
+        ref_audio: Union[
+            str,
+            list[str],
+            tuple[torch.Tensor, int],
+            list[tuple[torch.Tensor, int]],
+            None,
+        ] = None,
+        voice_clone_prompt: Union[
+            VoiceClonePrompt, list[VoiceClonePrompt], None
+        ] = None,
+        instruct: Union[str, list[str], None] = None,
+        preprocess_prompt: bool = True,
+        speed: Union[float, list[Optional[float]], None] = None,
+        duration: Union[float, list[Optional[float]], None] = None,
+    ) -> GenerationTask:
+        if isinstance(text, str):
+            text_list = [text]
+        else:
+            assert isinstance(
+                text, list
+            ), "text should be a string or a list of strings"
+            text_list = text
+        batch_size = len(text_list)
+        language_list = self._ensure_list(language, batch_size)
+        language_list = [_resolve_language(lang) for lang in language_list]
+        text_list = [
+            normalize_for_tts(text_item, language=language_list[i])
+            for i, text_item in enumerate(text_list)
+        ]
+        instruct_list = self._ensure_list(instruct, batch_size)
+        for i, s in enumerate(instruct_list):
+            if s is None:
+                continue
+            use_zh = bool(text_list[i] and _ZH_RE.search(text_list[i]))
+            instruct_list[i] = _resolve_instruct(s, use_zh=use_zh)
+        if voice_clone_prompt is not None and (
+            ref_text is not None or ref_audio is not None
+        ):
+            logger.warning(
+                "Both voice_clone_prompt and ref_text/ref_audio are provided. "
+                "ref_text/ref_audio will be ignored."
+            )
+        if voice_clone_prompt is None and ref_audio is not None:
+            # If voice_clone_prompt is not provided, create it from
+            # ref_audio (ref_text will be auto-transcribed if not given).
+            ref_text_list = self._ensure_list(ref_text, batch_size, auto_repeat=False)
+            ref_audio_list = self._ensure_list(ref_audio, batch_size, auto_repeat=False)
+            voice_clone_prompt = []
+            for i in range(len(ref_text_list)):
+                if ref_text_list[i] is not None:
+                    lang_idx = i if i < len(language_list) else 0
+                    ref_text_list[i] = normalize_for_tts(
+                        ref_text_list[i], language=language_list[lang_idx]
+                    )
+                voice_clone_prompt.append(
+                    self.create_voice_clone_prompt(
+                        ref_audio=ref_audio_list[i],
+                        ref_text=ref_text_list[i],
+                        preprocess_prompt=preprocess_prompt,
+                    )
+                )
+        voice_clone_prompt_list = self._ensure_list(voice_clone_prompt, batch_size)
+        if voice_clone_prompt_list[0] is not None:
+            ref_text_list = [vc.ref_text for vc in voice_clone_prompt_list]
+            ref_audio_tokens_list = [
+                vc.ref_audio_tokens for vc in voice_clone_prompt_list
+            ]
+            ref_rms_list = [vc.ref_rms for vc in voice_clone_prompt_list]
+        else:
+            ref_text_list = [None] * batch_size
+            ref_audio_tokens_list = [None] * batch_size
+            ref_rms_list = [None] * batch_size
+        # Normalize speed/duration to per-item lists (may contain None).
+        if speed is not None:
+            if isinstance(speed, (int, float)):
+                user_speed = [float(speed)] * batch_size
+            else:
+                user_speed = list(speed)
+        else:
+            user_speed = None
+        if duration is not None:
+            if isinstance(duration, (int, float)):
+                durations = [float(duration)] * batch_size
+            else:
+                durations = list(duration)
+        else:
+            durations = None
+        num_target_tokens_list = []
+        for i in range(batch_size):
+            # duration[i] overrides speed for estimation: use speed=1.0
+            # to get the raw estimate, then override target_lens below.
+            has_dur = durations is not None and durations[i] is not None
+            item_speed = 1.0 if has_dur else (user_speed[i] if user_speed else 1.0)
+            est = self._estimate_target_tokens(
+                text_list[i],
+                ref_text_list[i],
+                ref_audio_tokens_list[i].size(-1)
+                if ref_audio_tokens_list[i] is not None
+                else None,
+                speed=item_speed,
+            )
+            num_target_tokens_list.append(est)
+        # Per-item duration overrides: set target_lens to exact frame count
+        # and compute speed ratio so chunked generation scales proportionally.
+        speed_list: Optional[List[float]] = None
+        if durations is not None:
+            frame_rate = self.audio_tokenizer.config.frame_rate
+            speed_list = []
+            for i in range(batch_size):
+                if durations[i] is not None:
+                    target_tokens = max(1, int(durations[i] * frame_rate))
+                    est = num_target_tokens_list[i]
+                    speed_list.append(est / target_tokens if target_tokens > 0 else 1.0)
+                    num_target_tokens_list[i] = target_tokens
+                else:
+                    s = user_speed[i] if user_speed else None
+                    speed_list.append(s if s is not None else 1.0)
+        elif user_speed is not None:
+            speed_list = [s if s is not None else 1.0 for s in user_speed]
+        return GenerationTask(
+            batch_size=batch_size,
+            texts=text_list,
+            target_lens=num_target_tokens_list,
+            langs=language_list,
+            instructs=instruct_list,
+            ref_texts=ref_text_list,
+            ref_audio_tokens=ref_audio_tokens_list,
+            ref_rms=ref_rms_list,
+            speed=speed_list,
+        )
+    def _estimate_target_tokens(self, text, ref_text, num_ref_audio_tokens, speed=1.0):
+        """Estimate number of target audio tokens."""
+        if num_ref_audio_tokens is None or ref_text is None or len(ref_text) == 0:
+            # Fall back to a simple heuristic
+            ref_text = "Nice to meet you."
+            num_ref_audio_tokens = 25
+        est = self.duration_estimator.estimate_duration(
+            text, ref_text, num_ref_audio_tokens
+        )
+        if speed > 0 and speed != 1.0:
+            est = est / speed
+        return max(1, int(est))
+    def _ensure_list(
+        self, x: Union[Any, List[Any]], batch_size: int, auto_repeat: bool = True
+    ) -> List[Any]:
+        x_list = x if isinstance(x, list) else [x]
+        if len(x_list) not in (
+            1,
+            batch_size,
+        ):
+            raise ValueError(
+                f"should be either the number of the text or 1, but got {len(x_list)}"
+            )
+        if auto_repeat and len(x_list) == 1 and batch_size is not None:
+            x_list = x_list * batch_size
+        return x_list
+    def _prepare_inference_inputs(
+        self,
+        text: str,
+        num_target_tokens: int,
+        ref_text: Optional[str] = None,
+        ref_audio_tokens: Optional[torch.Tensor] = None,
+        lang: Optional[str] = None,
+        instruct: Optional[str] = None,
+        denoise: bool = True,
+    ):
+        """Prepare input_ids and audio masks for inference.
+        Args:
+            text: Target text to generate.
+            num_target_tokens: Number of audio tokens to generate.
+            ref_text: Optional reference text for voice cloning.
+            ref_audio_tokens: Optional reference audio tokens for voice cloning.
+                with shape (C, T).
+            lang: Optional language ID.
+            instruct: Optional style instruction for voice design.
+            denoise: Whether to include the <|denoise|> token.
+        """
+        # Build style tokens: <|denoise|> + <|lang_start|>...<|lang_end|>
+        #                      + <|instruct_start|>...<|instruct_end|>
+        style_text = ""
+        if denoise and ref_audio_tokens is not None:
+            style_text += "<|denoise|>"
+        lang_str = lang if lang else "None"
+        instruct_str = instruct if instruct else "None"
+        style_text += f"<|lang_start|>{lang_str}<|lang_end|>"
+        style_text += f"<|instruct_start|>{instruct_str}<|instruct_end|>"
+        style_tokens = (
+            self.text_tokenizer(style_text, return_tensors="pt")
+            .input_ids.repeat(self.config.num_audio_codebook, 1)
+            .unsqueeze(0)
+        ).to(
+            self.device
+        )  # [1, C, N1]
+        # Build text tokens
+        full_text = _combine_text(ref_text=ref_text, text=text)
+        wrapped_text = f"<|text_start|>{full_text}<|text_end|>"
+        text_tokens = (
+            _tokenize_with_nonverbal_tags(wrapped_text, self.text_tokenizer)
+            .repeat(self.config.num_audio_codebook, 1)
+            .unsqueeze(0)
+        ).to(
+            self.device
+        )  # [1, C, N2]
+        # Target: all MASK
+        target_audio_tokens = torch.full(
+            (1, self.config.num_audio_codebook, num_target_tokens),
+            self.config.audio_mask_id,
+            dtype=torch.long,
+            device=self.device,
+        )
+        # Conditional input
+        parts = [style_tokens, text_tokens]
+        if ref_audio_tokens is not None:
+            parts.append(ref_audio_tokens.unsqueeze(0).to(self.device))
+        parts.append(target_audio_tokens)
+        cond_input_ids = torch.cat(parts, dim=2)
+        cond_total_length = cond_input_ids.shape[2]
+        cond_audio_start_idx = cond_total_length - num_target_tokens
+        if ref_audio_tokens is not None:
+            cond_audio_start_idx -= ref_audio_tokens.size(-1)
+        cond_audio_mask = torch.zeros(
+            1, cond_total_length, dtype=torch.bool, device=self.device
+        )
+        cond_audio_mask[0, cond_audio_start_idx:] = True
+        return {
+            "input_ids": cond_input_ids,
+            "audio_mask": cond_audio_mask,
+        }
+    def _generate_iterative(
+        self, task: GenerationTask, gen_config: OmniVoiceGenerationConfig
+    ) -> List[torch.Tensor]:
+        """N-step iterative unmasked decoding.
+        Every item starts with a fully masked target. Each step runs one
+        forward pass over a batch of ``2 * B`` rows (conditional inputs
+        followed by unconditional ones), predicts a token for every target
+        position, and commits the highest-scoring still-masked positions
+        according to a per-item schedule.
+        Args:
+            task: A :class:`GenerationTask` containing batch texts, target
+                lengths, languages, instructions, and optional reference data.
+            gen_config: A :class:`OmniVoiceGenerationConfig` controlling
+                decoding steps, guidance, temperatures, etc.
+        Returns:
+            List of generated audio token tensors of shape (C, T) (one per
+            input text).
+        """
+        B = task.batch_size
+        # Debug-log the inputs of every item in the batch.
+        for i in range(B):
+            logger.debug(
+                "Item %d — text: %s | ref_text: %s | instruct: %s | lang: %s | target_tokens: %d",
+                i,
+                task.texts[i],
+                task.ref_texts[i],
+                task.instructs[i],
+                task.langs[i],
+                task.target_lens[i],
+            )
+        # Conditional input (ids + audio mask) for every item.
+        inputs_list = [
+            self._prepare_inference_inputs(
+                task.texts[i],
+                task.target_lens[i],
+                task.ref_texts[i],
+                task.ref_audio_tokens[i],
+                task.langs[i],
+                task.instructs[i],
+                gen_config.denoise,
+            )
+            for i in range(B)
+        ]
+        # Conditional sequence lengths; everything is padded to the longest.
+        c_lens = [inp["input_ids"].size(2) for inp in inputs_list]
+        max_c_len = max(c_lens)
+        pad_id = self.config.audio_mask_id  # Or any other tokens
+        # Rows [0, B) hold the conditional inputs, rows [B, 2B) the
+        # unconditional ones.
+        batch_input_ids = torch.full(
+            (2 * B, self.config.num_audio_codebook, max_c_len),
+            pad_id,
+            dtype=torch.long,
+            device=self.device,
+        )
+        batch_audio_mask = torch.zeros(
+            (2 * B, max_c_len), dtype=torch.bool, device=self.device
+        )
+        batch_attention_mask = torch.zeros(
+            (2 * B, 1, max_c_len, max_c_len), dtype=torch.bool, device=self.device
+        )
+        for i, inp in enumerate(inputs_list):
+            c_len, u_len = c_lens[i], task.target_lens[i]
+            # Cond (0 ~ B-1)
+            batch_input_ids[i, :, :c_len] = inp["input_ids"]
+            batch_audio_mask[i, :c_len] = inp["audio_mask"]
+            batch_attention_mask[i, :, :c_len, :c_len] = True
+            # Uncond (B ~ 2B-1)
+            # The unconditional row is just the trailing u_len positions of
+            # the conditional input (the target region), left-aligned.
+            batch_input_ids[B + i, :, :u_len] = inp["input_ids"][..., -u_len:]
+            batch_audio_mask[B + i, :u_len] = inp["audio_mask"][..., -u_len:]
+            batch_attention_mask[B + i, :, :u_len, :u_len] = True
+            if max_c_len > u_len:
+                # Let every padded position of the uncond row attend to itself.
+                # NOTE(review): presumably this avoids fully-masked attention
+                # rows; padded positions of cond rows get no such entry —
+                # confirm this asymmetry is intended.
+                pad_diag = torch.arange(u_len, max_c_len, device=self.device)
+                batch_attention_mask[B + i, :, pad_diag, pad_diag] = True
+        # Working buffer of generated tokens, initialised to all-mask and
+        # padded to the longest target.
+        tokens = torch.full(
+            (B, self.config.num_audio_codebook, max(task.target_lens)),
+            self.config.audio_mask_id,
+            dtype=torch.long,
+            device=self.device,
+        )
+        timesteps = _get_time_steps(
+            t_start=0.0,
+            t_end=1.0,
+            num_step=gen_config.num_step,
+            t_shift=gen_config.t_shift,
+        ).tolist()
+        # Per item: how many positions to unmask at each step. The last step
+        # takes whatever remains, so each schedule sums to
+        # t_len * num_audio_codebook.
+        schedules = []
+        for t_len in task.target_lens:
+            total_mask = t_len * self.config.num_audio_codebook
+            rem = total_mask
+            sched = []
+            for step in range(gen_config.num_step):
+                num = (
+                    rem
+                    if step == gen_config.num_step - 1
+                    else min(
+                        math.ceil(total_mask * (timesteps[step + 1] - timesteps[step])),
+                        rem,
+                    )
+                )
+                sched.append(int(num))
+                rem -= int(num)
+            schedules.append(sched)
+        # Codebook indices shaped [1, C, 1] so they broadcast over positions.
+        layer_ids = torch.arange(
+            self.config.num_audio_codebook, device=self.device
+        ).view(1, -1, 1)
+        for step in range(gen_config.num_step):
+            # One forward pass over the whole 2B batch per step.
+            batch_logits = self(
+                input_ids=batch_input_ids,
+                audio_mask=batch_audio_mask,
+                attention_mask=batch_attention_mask,
+            ).logits.to(torch.float32)
+            for i in range(B):
+                k = schedules[i][step]
+                if k <= 0:
+                    continue
+                c_len, t_len = c_lens[i], task.target_lens[i]
+                # Extract real target Logits
+                # [1, C, T, V]
+                # Cond row: target is the last t_len valid positions.
+                # Uncond row: target is the first t_len positions.
+                c_logits = batch_logits[i : i + 1, :, c_len - t_len : c_len, :]
+                u_logits = batch_logits[B + i : B + i + 1, :, :t_len, :]
+                pred_tokens, scores = self._predict_tokens_with_scoring(
+                    c_logits, u_logits, gen_config
+                )
+                # Lower the score of higher codebook indices in proportion to
+                # layer_penalty_factor.
+                scores = scores - (layer_ids * gen_config.layer_penalty_factor)
+                if gen_config.position_temperature > 0.0:
+                    scores = _gumbel_sample(scores, gen_config.position_temperature)
+                sample_tokens = tokens[i : i + 1, :, :t_len]
+                # Positions that are already decoded can never be selected again.
+                scores.masked_fill_(
+                    sample_tokens != self.config.audio_mask_id, -float("inf")
+                )
+                # Commit the k highest-scoring still-masked positions.
+                _, topk_idx = torch.topk(scores.flatten(), k)
+                flat_tokens = sample_tokens.flatten()
+                flat_tokens[topk_idx] = pred_tokens.flatten()[topk_idx]
+                sample_tokens.copy_(flat_tokens.view_as(sample_tokens))
+                # Update individual slices into batched structure
+                tokens[i : i + 1, :, :t_len] = sample_tokens
+                batch_input_ids[i : i + 1, :, c_len - t_len : c_len] = sample_tokens
+                batch_input_ids[B + i : B + i + 1, :, :t_len] = sample_tokens
+        # Strip the padding: one (C, target_len) tensor per item.
+        return [tokens[i, :, : task.target_lens[i]] for i in range(B)]
+    def _predict_tokens_with_scoring(self, c_logits, u_logits, gen_config):
+        if gen_config.guidance_scale != 0:
+            c_log_probs = F.log_softmax(c_logits, dim=-1)
+            u_log_probs = F.log_softmax(u_logits, dim=-1)
+            log_probs = torch.log_softmax(
+                c_log_probs + gen_config.guidance_scale * (c_log_probs - u_log_probs),
+                dim=-1,
+            )
+        else:
+            log_probs = F.log_softmax(c_logits, dim=-1)
+        log_probs[..., self.config.audio_mask_id] = -float("inf")
+        if gen_config.class_temperature > 0.0:
+            filtered_probs = _filter_top_k(log_probs, ratio=0.1)
+            pred_tokens = _gumbel_sample(
+                filtered_probs, gen_config.class_temperature
+            ).argmax(dim=-1)
+        else:
+            pred_tokens = log_probs.argmax(dim=-1)
+        confidence_scores = log_probs.max(dim=-1)[0]
+        return pred_tokens, confidence_scores
+# ---------------------------------------------------------------------------
+# Standalone helpers
+# ---------------------------------------------------------------------------
+def _get_packed_mask(document_ids):
+    return partial(_mask_mod_packed, document_ids)
+def _mask_mod_packed(document_ids, b, h, q_idx, kv_idx):
+    # 1. Sequence Packing Logic: Tokens must belong to the same document.
+    # Note: The doc_id for padding tokens is -1, which will automatically not match
+    # (if handled correctly) or be ignored.
+    same_doc = document_ids[q_idx] == document_ids[kv_idx]
+    return same_doc
+def _resolve_language(language: Optional[str]) -> Union[str, None]:
+    from omnivoice.utils.lang_map import LANG_IDS, LANG_NAME_TO_ID
+    if language is None or language.lower() == "none":
+        return None
+    if language in LANG_IDS:
+        return language
+    key = language.lower()
+    if key in LANG_NAME_TO_ID:
+        return LANG_NAME_TO_ID[key]
+    logger.warning(
+        f"Language '{language}' is not recognized. "
+        f"Please use a valid language ID (e.g., 'en', 'zh', 'ja', 'de') "
+        f"or a full language name (e.g., 'English', 'Chinese', 'Japanese'). "
+        f"See supported_language_ids() or supported_language_names() for details. "
+        f"Falling back to None (language-agnostic mode)."
+    )
+    return None
+def _resolve_instruct(
+    instruct: Optional[str], use_zh: bool = False
+) -> Union[str, None]:
+    """Validate and normalise a voice-design instruct string.
+    Supported instruct items (case-insensitive for English):
+    English (comma + space separated):
+        gender: male, female
+        age: child, teenager, young adult, middle-aged, elderly
+        pitch: very low pitch, low pitch, moderate pitch,
+               high pitch, very high pitch
+        style: whisper
+        accent: american accent, british accent, australian accent, ...
+    Chinese (full-width comma separated):
+        gender: 男, 女
+        age: 儿童, 少年, 青年, 中年, 老年
+        pitch: 极低音调, 低音调, 中音调, 高音调, 极高音调
+        style: 耳语
+        dialect: 河南话, 陕西话, 四川话, 贵州话, 云南话,
+                 桂林话, 济南话, 石家庄话, 甘肃话, 宁夏话,
+                 青岛话, 东北话
+    Minor issues (auto-fixed):
+      - Wrong separator (half-width comma in Chinese instruct or
+        full-width comma in English instruct)
+      - Leading / trailing commas
+    Major issues (raise ``ValueError``):
+      - Unsupported or misspelled instruct items
+      - Suggestions are offered for close matches
+    Args:
+        instruct: Raw instruct string, or ``None``.
+        use_zh: If True, normalise all items to Chinese (used when the
+            synthesis text contains Chinese and no accent is specified).
+    Returns:
+        Normalised instruct string, or ``None``.
+    Raises:
+        ValueError: if any instruct item is unsupported or misspelled.
+    """
+    if instruct is None:
+        return None
+    instruct_str = instruct.strip()
+    if not instruct_str:
+        return None
+    # Split on both half-width and full-width commas
+    raw_items = re.split(r"\s*[,，]\s*", instruct_str)
+    raw_items = [x for x in raw_items if x]
+    # Validate each item
+    unknown = []
+    normalised = []
+    for raw in raw_items:
+        n = raw.strip().lower()
+        if n in _INSTRUCT_ALL_VALID:
+            normalised.append(n)
+        else:
+            sug = difflib.get_close_matches(n, _INSTRUCT_ALL_VALID, n=1, cutoff=0.6)
+            unknown.append((raw, n, sug[0] if sug else None))
+    if unknown:
+        lines = []
+        for raw, n, sug in unknown:
+            if sug:
+                lines.append(f"  '{raw}' -> '{n}' (unsupported; did you mean '{sug}'?)")
+            else:
+                lines.append(f"  '{raw}' -> '{n}' (unsupported)")
+        err = (
+            f"Unsupported instruct items found in {instruct_str}:\n"
+            + "\n".join(lines)
+            + "\n\nValid English items: "
+            + ", ".join(sorted(_INSTRUCT_VALID_EN))
+            + "\nValid Chinese items: "
+            + "，".join(sorted(_INSTRUCT_VALID_ZH))
+            + "\n\nTip: Use only English or only Chinese instructs. "
+            "English instructs should use comma + space (e.g. "
+            "'male, indian accent'),\nChinese instructs should use full-width "
+            "comma (e.g. '男，河南话')."
+        )
+        raise ValueError(err)
+    # --- Language consistency: dialect forces Chinese, accent forces English ---
+    has_dialect = any(n.endswith("话") for n in normalised)
+    has_accent = any(" accent" in n for n in normalised)
+    if has_dialect and has_accent:
+        raise ValueError(
+            "Cannot mix Chinese dialect and English accent in a single instruct. "
+            "Dialects are for Chinese speech, accents for English speech."
+        )
+    if has_dialect:
+        use_zh = True
+    elif has_accent:
+        use_zh = False
+    # --- Unify to single language ---
+    if use_zh:
+        normalised = [_INSTRUCT_EN_TO_ZH.get(n, n) for n in normalised]
+    else:
+        normalised = [_INSTRUCT_ZH_TO_EN.get(n, n) for n in normalised]
+    # --- Category conflict check ---
+    conflicts = []
+    for cat in _INSTRUCT_MUTUALLY_EXCLUSIVE:
+        hits = [n for n in normalised if n in cat]
+        if len(hits) > 1:
+            conflicts.append(hits)
+    if conflicts:
+        parts = []
+        for group in conflicts:
+            parts.append(" vs ".join(f"'{x}'" for x in group))
+        raise ValueError(
+            "Conflicting instruct items within the same category: "
+            + "; ".join(parts)
+            + ". Each category (gender, age, pitch, style, accent, dialect) "
+            "allows at most one item."
+        )
+    # Determine separator based on language
+    has_zh = any(any("\u4e00" <= c <= "\u9fff" for c in n) for n in normalised)
+    separator = "，" if has_zh else ", "
+    return separator.join(normalised)
+def _filter_top_k(logits: torch.Tensor, ratio: float = 0.1) -> torch.Tensor:
+    k = math.ceil(ratio * logits.shape[-1])
+    val, ind = logits.topk(k, dim=-1)
+    probs = torch.full_like(logits, float("-inf"))
+    probs.scatter_(-1, ind, val)
+    return probs
+def _gumbel_sample(logits: torch.Tensor, temperature: float) -> torch.Tensor:
+    scaled_logits = logits / temperature
+    u = torch.rand_like(scaled_logits)
+    gumbel_noise = -torch.log(-torch.log(u + 1e-10) + 1e-10)
+    return scaled_logits + gumbel_noise
+def _get_time_steps(
+    t_start: float = 0.0,
+    t_end: float = 1.0,
+    num_step: int = 10,
+    t_shift: float = 1.0,
+    device: torch.device = torch.device("cpu"),
+) -> torch.Tensor:
+    timesteps = torch.linspace(t_start, t_end, num_step + 1).to(device)
+    timesteps = t_shift * timesteps / (1 + (t_shift - 1) * timesteps)
+    return timesteps
+_NONVERBAL_PATTERN = re.compile(
+    r"\[(laughter|sigh|confirmation-en|question-en|question-ah|question-oh|"
+    r"question-ei|question-yi|surprise-ah|surprise-oh|surprise-wa|"
+    r"surprise-yo|dissatisfaction-hnn)\]"
+)
+def _tokenize_with_nonverbal_tags(text: str, tokenizer) -> torch.Tensor:
+    """Tokenize text containing non-verbal tags, handling each tag independently.
+    Non-verbal tags are tokenized standalone to guarantee consistent token
+    IDs regardless of surrounding language context (Chinese, English, etc.).
+    Args:
+        text: Full text string potentially containing non-verbal tags.
+        tokenizer: HuggingFace text tokenizer instance.
+    Returns:
+        Token IDs tensor of shape (1, seq_len).
+    """
+    parts = []
+    last_end = 0
+    for m in _NONVERBAL_PATTERN.finditer(text):
+        if m.start() > last_end:
+            segment = text[last_end : m.start()]
+            ids = tokenizer(segment, add_special_tokens=False).input_ids
+            if ids:
+                parts.append(ids)
+        tag_ids = tokenizer(m.group(), add_special_tokens=False).input_ids
+        if tag_ids:
+            parts.append(tag_ids)
+        last_end = m.end()
+    if last_end < len(text):
+        segment = text[last_end:]
+        ids = tokenizer(segment, add_special_tokens=False).input_ids
+        if ids:
+            parts.append(ids)
+    if not parts:
+        result = tokenizer(text, return_tensors="pt").input_ids
+    else:
+        combined = []
+        for p in parts:
+            combined.extend(p)
+        result = torch.tensor([combined], dtype=torch.long)
+    return result
+def _combine_text(text, ref_text: Optional[str] = None) -> str:
+    # combine with reference text if not None
+    if ref_text:
+        full_text = ref_text.strip() + " " + text.strip()
+    else:
+        full_text = text.strip()
+    # filter out newline / carriage-return characters
+    full_text = re.sub(r"[\r\n]+", "", full_text)
+    # replace Chinese parentheses with English ones
+    full_text = full_text.replace("\uff08", "(").replace("\uff09", ")")
+    # collapse consecutive spaces / tabs into a single space
+    full_text = re.sub(r"[ \t]+", " ", full_text)
+    # remove spaces around chinese characters
+    chinese_range = r"[\u4e00-\u9fff]"
+    pattern = rf"(?<={chinese_range})\s+|\s+(?={chinese_range})"
+    full_text = re.sub(pattern, "", full_text)
+    return full_text
+# ---------------------------------------------------------------------------
+# Register with HuggingFace Auto classes
+# ---------------------------------------------------------------------------
+# Runs at import time: associates the "omnivoice" model type with
+# OmniVoiceConfig, and OmniVoiceConfig with the OmniVoice model class.
+AutoConfig.register("omnivoice", OmniVoiceConfig)
+AutoModel.register(OmniVoiceConfig, OmniVoice)

runtime/omnivoice/server/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ """HTTP API server helpers for OmniVoice."""
2	+

runtime/omnivoice/server/app.py ADDED Viewed

	@@ -0,0 +1,506 @@

+from __future__ import annotations
+import argparse
+import base64
+import binascii
+import io
+import logging
+import os
+import threading
+from contextlib import asynccontextmanager
+from dataclasses import dataclass
+from importlib import import_module
+from typing import Any, Literal, Protocol
+from fastapi import FastAPI, HTTPException, Response
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+from omnivoice import __version__
+from omnivoice.utils.lang_map import LANG_NAME_TO_ID
+logger = logging.getLogger(__name__)
+# Model location used when OMNIVOICE_MODEL is not set.
+DEFAULT_MODEL_PATH = "/app/model"
+# Host used when OMNIVOICE_HOST is not set ("0.0.0.0" is the IPv4 wildcard
+# address).
+DEFAULT_HOST = "0.0.0.0"
+# Port used when OMNIVOICE_PORT is not set.
+DEFAULT_PORT = 8000
+def _parse_bool(value: str | None, default: bool = False) -> bool:
+    if value is None:
+        return default
+    return value.strip().lower() in {"1", "true", "yes", "on"}
+def _parse_origins(value: str | None) -> list[str]:
+    if not value or value.strip() == "*":
+        return ["*"]
+    return [item.strip() for item in value.split(",") if item.strip()]
+def _clean_base64_audio(value: str) -> bytes:
+    payload = value.strip()
+    if "," in payload and payload.split(",", 1)[0].startswith("data:"):
+        payload = payload.split(",", 1)[1]
+    try:
+        return base64.b64decode(payload, validate=True)
+    except binascii.Error as exc:
+        raise ValueError("reference audio must be valid base64 or data URI") from exc
+@dataclass(slots=True)
+class ServerSettings:
+    model: str = DEFAULT_MODEL_PATH
+    device: str = "auto"
+    dtype: str = "auto"
+    host: str = DEFAULT_HOST
+    port: int = DEFAULT_PORT
+    log_level: str = "info"
+    cors_origins: list[str] | None = None
+    preload_model: bool = False
+    load_asr: bool = False
+    asr_model_name: str = "openai/whisper-large-v3-turbo"
+    @classmethod
+    def from_env(cls) -> "ServerSettings":
+        port = int(os.getenv("OMNIVOICE_PORT", str(DEFAULT_PORT)))
+        return cls(
+            model=os.getenv("OMNIVOICE_MODEL", DEFAULT_MODEL_PATH),
+            device=os.getenv("OMNIVOICE_DEVICE", "auto"),
+            dtype=os.getenv("OMNIVOICE_DTYPE", "auto"),
+            host=os.getenv("OMNIVOICE_HOST", DEFAULT_HOST),
+            port=port,
+            log_level=os.getenv("OMNIVOICE_LOG_LEVEL", "info"),
+            cors_origins=_parse_origins(os.getenv("OMNIVOICE_CORS_ORIGINS", "*")),
+            preload_model=_parse_bool(os.getenv("OMNIVOICE_PRELOAD", "0")),
+            load_asr=_parse_bool(os.getenv("OMNIVOICE_LOAD_ASR", "0")),
+            asr_model_name=os.getenv(
+                "OMNIVOICE_ASR_MODEL",
+                "openai/whisper-large-v3-turbo",
+            ),
+        )
+# Input schema for one synthesis call (the argument of ``synthesize``).
+# Documented with comments rather than a class docstring so that the
+# generated schema is left untouched.
+class SynthesizeRequest(BaseModel):
+    # Required; at least one character.
+    text: str = Field(..., min_length=1, description="Text to synthesize.")
+    language: str | None = Field(
+        default=None,
+        description="Optional language code or language name.",
+    )
+    instruct: str | None = Field(
+        default=None,
+        description="Optional voice-design instruction.",
+    )
+    # Voice cloning inputs.
+    ref_text: str | None = Field(
+        default=None,
+        description="Transcript for the reference audio used in voice cloning.",
+    )
+    ref_audio_base64: str | None = Field(
+        default=None,
+        description="Optional reference audio encoded as base64 or data URI.",
+    )
+    # Generation controls, range-checked by the Field constraints.
+    num_step: int = Field(default=32, ge=1, le=128)
+    guidance_scale: float = Field(default=2.0, ge=0.0, le=20.0)
+    speed: float = Field(default=1.0, gt=0.0, le=4.0)
+    # NOTE(review): presumably seconds — confirm against the model's generate().
+    duration: float | None = Field(default=None, gt=0.0)
+    denoise: bool = True
+    preprocess_prompt: bool = True
+    postprocess_output: bool = True
+# Snapshot of the runtime's state, as returned by ``get_status``.
+class RuntimeStatus(BaseModel):
+    model_path: str
+    model_loaded: bool
+    # *_preference is the configured value; *_resolved is the value chosen
+    # at load time and stays None until a model has been loaded.
+    device_preference: str
+    device_resolved: str | None = None
+    dtype_preference: str
+    dtype_resolved: str | None = None
+    load_asr: bool
+    # "<ExceptionType>: <message>" of the last failed load, if any.
+    last_error: str | None = None
+# Serializable synthesis result with the audio carried as base64 text.
+# NOTE(review): presumably the JSON body of the synthesis endpoint — the
+# route is not visible in this part of the file; confirm.
+class SynthesizeResponse(BaseModel):
+    audio_base64: str
+    sample_rate: int
+    duration_seconds: float
+    device: str
+    model_path: str
+@dataclass(slots=True)
+class SynthesisResult:
+    """Result of one synthesis call, with the audio as raw encoded bytes."""
+    # Encoded audio as produced by the runtime's WAV encoder.
+    audio_bytes: bytes
+    sample_rate: int
+    duration_seconds: float
+    # Device the model ran on (or the configured preference as fallback).
+    device: str
+    model_path: str
+class RuntimeLike(Protocol):
+    """Structural interface of a synthesis runtime.
+    Any object with a ``settings`` attribute and these four methods
+    satisfies it; ``OmniVoiceRuntime`` is one such implementation.
+    """
+    settings: ServerSettings
+    def get_status(self) -> RuntimeStatus:
+        """Return a snapshot of the runtime's current state."""
+        ...
+    def list_languages(self) -> list[dict[str, str]]:
+        """Return the selectable languages as ``{"id", "name"}`` dicts."""
+        ...
+    def synthesize(self, request: SynthesizeRequest) -> SynthesisResult:
+        """Synthesize speech for *request*."""
+        ...
+    def maybe_preload(self) -> None:
+        """Load the model ahead of the first request if so configured."""
+        ...
+class OmniVoiceRuntime:
+    def __init__(self, settings: ServerSettings):
+        self.settings = settings
+        self._model: Any | None = None
+        self._torch: Any | None = None
+        self._config_cls: Any | None = None
+        self._device: str | None = None
+        self._dtype_name: str | None = None
+        self._last_error: str | None = None
+        self._load_lock = threading.Lock()
+        self._infer_lock = threading.Lock()
+    def get_status(self) -> RuntimeStatus:
+        return RuntimeStatus(
+            model_path=self.settings.model,
+            model_loaded=self._model is not None,
+            device_preference=self.settings.device,
+            device_resolved=self._device,
+            dtype_preference=self.settings.dtype,
+            dtype_resolved=self._dtype_name,
+            load_asr=self.settings.load_asr,
+            last_error=self._last_error,
+        )
+    def maybe_preload(self) -> None:
+        if self.settings.preload_model:
+            self._ensure_loaded()
+    def list_languages(self) -> list[dict[str, str]]:
+        languages = [
+            {"id": code, "name": name.title()}
+            for name, code in LANG_NAME_TO_ID.items()
+        ]
+        languages.sort(key=lambda item: (item["name"], item["id"]))
+        return languages
+    def synthesize(self, request: SynthesizeRequest) -> SynthesisResult:
+        if not request.text.strip():
+            raise ValueError("text must not be blank")
+        self._ensure_loaded()
+        assert self._model is not None
+        assert self._torch is not None
+        assert self._config_cls is not None
+        prompt = None
+        if request.ref_audio_base64:
+            prompt = self._build_voice_clone_prompt(
+                audio_base64=request.ref_audio_base64,
+                ref_text=request.ref_text,
+                preprocess_prompt=request.preprocess_prompt,
+            )
+        elif request.ref_text:
+            raise ValueError("ref_text requires ref_audio_base64 as well")
+        generation_config = self._config_cls(
+            num_step=request.num_step,
+            guidance_scale=request.guidance_scale,
+            denoise=request.denoise,
+            preprocess_prompt=request.preprocess_prompt,
+            postprocess_output=request.postprocess_output,
+        )
+        with self._infer_lock:
+            audios = self._model.generate(
+                text=request.text,
+                language=request.language,
+                voice_clone_prompt=prompt,
+                instruct=request.instruct,
+                duration=request.duration,
+                speed=request.speed,
+                generation_config=generation_config,
+            )
+        audio = audios[0]
+        wav_bytes = self._encode_wav(audio, self._model.sampling_rate)
+        duration_seconds = float(len(audio) / self._model.sampling_rate)
+        return SynthesisResult(
+            audio_bytes=wav_bytes,
+            sample_rate=int(self._model.sampling_rate),
+            duration_seconds=duration_seconds,
+            device=self._device or self.settings.device,
+            model_path=self.settings.model,
+        )
+    def _ensure_loaded(self) -> None:
+        if self._model is not None:
+            return
+        with self._load_lock:
+            if self._model is not None:
+                return
+            try:
+                torch_module = import_module("torch")
+                omnivoice_module = import_module("omnivoice")
+                model_cls = getattr(omnivoice_module, "OmniVoice")
+                config_cls = getattr(omnivoice_module, "OmniVoiceGenerationConfig")
+                device = self._resolve_device(torch_module)
+                dtype_name, dtype_value = self._resolve_dtype(torch_module, device)
+                logger.info(
+                    "Loading OmniVoice model from %s on %s (%s)",
+                    self.settings.model,
+                    device,
+                    dtype_name,
+                )
+                model = model_cls.from_pretrained(
+                    self.settings.model,
+                    device_map=device,
+                    dtype=dtype_value,
+                    load_asr=self.settings.load_asr,
+                    asr_model_name=self.settings.asr_model_name,
+                )
+            except Exception as exc:
+                self._last_error = f"{type(exc).__name__}: {exc}"
+                raise
+            self._torch = torch_module
+            self._config_cls = config_cls
+            self._model = model
+            self._device = device
+            self._dtype_name = dtype_name
+            self._last_error = None
+    def _resolve_device(self, torch_module: Any) -> str:
+        choice = self.settings.device.strip().lower()
+        if choice == "auto":
+            if torch_module.cuda.is_available():
+                return "cuda"
+            mps_backend = getattr(getattr(torch_module, "backends", None), "mps", None)
+            if mps_backend is not None and mps_backend.is_available():
+                return "mps"
+            return "cpu"
+        if choice == "cuda":
+            if not torch_module.cuda.is_available():
+                raise RuntimeError("OMNIVOICE_DEVICE=cuda was requested but CUDA is unavailable")
+            return "cuda"
+        if choice == "mps":
+            mps_backend = getattr(getattr(torch_module, "backends", None), "mps", None)
+            if mps_backend is None or not mps_backend.is_available():
+                raise RuntimeError("OMNIVOICE_DEVICE=mps was requested but MPS is unavailable")
+            return "mps"
+        if choice == "cpu":
+            return "cpu"
+        raise RuntimeError(f"Unsupported device choice: {self.settings.device}")
+    def _resolve_dtype(self, torch_module: Any, device: str) -> tuple[str, Any]:
+        aliases = {
+            "fp16": "float16",
+            "half": "float16",
+            "fp32": "float32",
+            "float": "float32",
+            "bf16": "bfloat16",
+        }
+        choice = self.settings.dtype.strip().lower()
+        if choice == "auto":
+            choice = "float16" if device == "cuda" else "float32"
+        choice = aliases.get(choice, choice)
+        valid = {"float16", "float32", "bfloat16"}
+        if choice not in valid:
+            raise RuntimeError(f"Unsupported dtype choice: {self.settings.dtype}")
+        return choice, getattr(torch_module, choice)
+    def _build_voice_clone_prompt(
+        self,
+        audio_base64: str,
+        ref_text: str | None,
+        preprocess_prompt: bool,
+    ) -> Any:
+        assert self._model is not None
+        assert self._torch is not None
+        waveform, sample_rate = self._decode_audio(audio_base64)
+        return self._model.create_voice_clone_prompt(
+            ref_audio=(self._torch.from_numpy(waveform), sample_rate),
+            ref_text=ref_text,
+            preprocess_prompt=preprocess_prompt,
+        )
+    def _decode_audio(self, audio_base64: str) -> tuple[Any, int]:
+        import numpy as np
+        import soundfile as sf
+        raw_bytes = _clean_base64_audio(audio_base64)
+        audio_buffer = io.BytesIO(raw_bytes)
+        waveform, sample_rate = sf.read(audio_buffer, dtype="float32", always_2d=False)
+        if waveform.ndim == 2:
+            waveform = np.transpose(waveform)
+        return waveform, int(sample_rate)
+    def _encode_wav(self, audio: Any, sample_rate: int) -> bytes:
+        import soundfile as sf
+        buffer = io.BytesIO()
+        sf.write(buffer, audio, sample_rate, format="WAV")
+        return buffer.getvalue()
+def create_app(
+    settings: ServerSettings | None = None,
+    runtime: RuntimeLike | None = None,
+) -> FastAPI:
+    if settings is None:
+        if runtime is not None and hasattr(runtime, "settings"):
+            settings = runtime.settings
+        else:
+            settings = ServerSettings.from_env()
+    runtime = runtime or OmniVoiceRuntime(settings)
+    @asynccontextmanager
+    async def lifespan(_: FastAPI):
+        runtime.maybe_preload()
+        yield
+    app = FastAPI(
+        title="AVoice OmniVoice API",
+        version=__version__,
+        summary="Local HTTP API for OmniVoice speech generation.",
+        lifespan=lifespan,
+    )
+    if settings.cors_origins:
+        app.add_middleware(
+            CORSMiddleware,
+            allow_origins=settings.cors_origins,
+            allow_credentials=True,
+            allow_methods=["*"],
+            allow_headers=["*"],
+        )
+    @app.get("/healthz", response_model=RuntimeStatus)
+    def healthz() -> RuntimeStatus:
+        return runtime.get_status()
+    @app.get("/v1/runtime", response_model=RuntimeStatus)
+    def runtime_status() -> RuntimeStatus:
+        return runtime.get_status()
+    @app.get("/v1/languages")
+    def languages() -> dict[str, list[dict[str, str]]]:
+        return {"languages": runtime.list_languages()}
+    @app.post("/v1/audio/speech")
+    def speech(request: SynthesizeRequest) -> Response:
+        try:
+            result = runtime.synthesize(request)
+        except ValueError as exc:
+            raise HTTPException(status_code=400, detail=str(exc)) from exc
+        except RuntimeError as exc:
+            raise HTTPException(status_code=503, detail=str(exc)) from exc
+        except Exception as exc:
+            logger.exception("speech generation failed")
+            raise HTTPException(status_code=500, detail=str(exc)) from exc
+        headers = {
+            "X-OmniVoice-Device": result.device,
+            "X-OmniVoice-Sample-Rate": str(result.sample_rate),
+            "X-OmniVoice-Model": result.model_path,
+        }
+        return Response(content=result.audio_bytes, media_type="audio/wav", headers=headers)
+    @app.post("/v1/audio/speech/json", response_model=SynthesizeResponse)
+    def speech_json(request: SynthesizeRequest) -> SynthesizeResponse:
+        try:
+            result = runtime.synthesize(request)
+        except ValueError as exc:
+            raise HTTPException(status_code=400, detail=str(exc)) from exc
+        except RuntimeError as exc:
+            raise HTTPException(status_code=503, detail=str(exc)) from exc
+        except Exception as exc:
+            logger.exception("speech generation failed")
+            raise HTTPException(status_code=500, detail=str(exc)) from exc
+        return SynthesizeResponse(
+            audio_base64=base64.b64encode(result.audio_bytes).decode("ascii"),
+            sample_rate=result.sample_rate,
+            duration_seconds=result.duration_seconds,
+            device=result.device,
+            model_path=result.model_path,
+        )
+    return app
+def build_parser() -> argparse.ArgumentParser:
+    env = ServerSettings.from_env()
+    parser = argparse.ArgumentParser(
+        prog="omnivoice-api",
+        description="Serve OmniVoice inference as a local HTTP API.",
+    )
+    parser.add_argument("--model", default=env.model, help="Local model path or HuggingFace repo id.")
+    parser.add_argument(
+        "--device",
+        default=env.device,
+        help="Device selection: auto, cpu, cuda, or mps.",
+    )
+    parser.add_argument(
+        "--dtype",
+        default=env.dtype,
+        help="Precision selection: auto, float16, float32, or bfloat16.",
+    )
+    parser.add_argument("--host", default=env.host, help="Bind host.")
+    parser.add_argument("--port", type=int, default=env.port, help="Bind port.")
+    parser.add_argument("--log-level", default=env.log_level, help="Uvicorn log level.")
+    parser.add_argument(
+        "--preload-model",
+        action="store_true",
+        default=env.preload_model,
+        help="Load the model during server startup instead of on first request.",
+    )
+    parser.add_argument(
+        "--load-asr",
+        action="store_true",
+        default=env.load_asr,
+        help="Load the Whisper ASR helper at startup for reference-audio transcription.",
+    )
+    parser.add_argument(
+        "--asr-model-name",
+        default=env.asr_model_name,
+        help="Whisper model to use when ref_text is omitted.",
+    )
+    parser.add_argument(
+        "--cors-origins",
+        default=",".join(env.cors_origins or ["*"]),
+        help="Comma-separated CORS allowlist or *.",
+    )
+    return parser
+def main() -> None:
+    parser = build_parser()
+    args = parser.parse_args()
+    settings = ServerSettings(
+        model=args.model,
+        device=args.device,
+        dtype=args.dtype,
+        host=args.host,
+        port=args.port,
+        log_level=args.log_level,
+        cors_origins=_parse_origins(args.cors_origins),
+        preload_model=args.preload_model,
+        load_asr=args.load_asr,
+        asr_model_name=args.asr_model_name,
+    )
+    import uvicorn
+    uvicorn.run(
+        create_app(settings=settings),
+        host=settings.host,
+        port=settings.port,
+        log_level=settings.log_level,
+    )
+# Module-level application object. create_app() is called with no settings
+# and no runtime, so configuration comes from ServerSettings.from_env() and a
+# fresh OmniVoiceRuntime is created when this module is imported.
+app = create_app()

runtime/omnivoice/server/prefetch.py ADDED Viewed

	@@ -0,0 +1,71 @@

+from __future__ import annotations
+import argparse
+import logging
+import os
+import shutil
+from pathlib import Path
+logger = logging.getLogger(__name__)
+def _resolve_path(name_or_path: str) -> str:
+    if os.path.isdir(name_or_path):
+        return name_or_path
+    from huggingface_hub import snapshot_download
+    return snapshot_download(name_or_path)
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        prog="omnivoice-prefetch",
+        description="Cache auxiliary OmniVoice assets for offline/container use.",
+    )
+    parser.add_argument(
+        "--model-dir",
+        default="/app/model",
+        help="Directory containing the OmniVoice model files.",
+    )
+    parser.add_argument(
+        "--audio-tokenizer",
+        default="eustlb/higgs-audio-v2-tokenizer",
+        help="Audio tokenizer repo id or local path.",
+    )
+    parser.add_argument(
+        "--asr-model",
+        default="openai/whisper-large-v3-turbo",
+        help="ASR model repo id or local path.",
+    )
+    parser.add_argument(
+        "--copy-audio-tokenizer",
+        action="store_true",
+        help="Copy the tokenizer into <model-dir>/audio_tokenizer.",
+    )
+    parser.add_argument(
+        "--prefetch-asr",
+        action="store_true",
+        help="Download the ASR model into the Hugging Face cache.",
+    )
+    return parser
+def main() -> None:
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+    args = build_parser().parse_args()
+    tokenizer_path = _resolve_path(args.audio_tokenizer)
+    if args.copy_audio_tokenizer:
+        model_dir = Path(args.model_dir)
+        target_dir = model_dir / "audio_tokenizer"
+        target_dir.mkdir(parents=True, exist_ok=True)
+        shutil.copytree(tokenizer_path, target_dir, dirs_exist_ok=True)
+        logger.info("Copied audio tokenizer to %s", target_dir)
+    else:
+        logger.info("Cached audio tokenizer at %s", tokenizer_path)
+    if args.prefetch_asr:
+        asr_path = _resolve_path(args.asr_model)
+        logger.info("Cached ASR model at %s", asr_path)

runtime/omnivoice/utils/__init__.py ADDED Viewed

File without changes

runtime/omnivoice/utils/armenian_text.py ADDED Viewed

	@@ -0,0 +1,450 @@

+#!/usr/bin/env python3
+"""Armenian text frontend for TTS.
+The acoustic model should see pronounceable text, not written shortcuts such as
+``02.02.2026`` or ``25%``.  This module keeps that logic in one place so the
+same frontend can be used for manifest preparation and inference.
+"""
+from __future__ import annotations
+from dataclasses import asdict, dataclass
+import re
+from typing import Iterable, List, Optional
+# Matches any character in U+0530–U+058F; used to detect Armenian text.
+ARMENIAN_CHAR_RE = re.compile(r"[\u0530-\u058f]")
+# One or more whitespace characters, collapsed to a single space by callers.
+SPACE_RE = re.compile(r"\s+")
+# Character substitutions applied by normalize_unicode_text before anything else.
+REPLACEMENTS = {
+    "\u00a0": " ",  # no-break space
+    "\u200b": "",  # zero-width space
+    "\u200c": "",  # zero-width non-joiner
+    "\u200d": "",  # zero-width joiner
+    "\u2018": "'",  # left single quotation mark
+    "\u2019": "'",  # right single quotation mark
+    "\u201c": '"',  # left double quotation mark
+    "\u201d": '"',  # right double quotation mark
+    "\u2013": "-",  # en dash
+    "\u2014": "-",  # em dash
+    "\u2212": "-",  # minus sign
+}
+# Language identifiers (compared after strip + lower-casing) that
+# looks_armenian_context treats as Armenian regardless of the text content.
+LANGUAGE_IDS_ARMENIAN = {
+    "hy",
+    "hye",
+    "hyw",
+    "hye-east",
+    "hyw-west",
+    "armenian",
+    "eastern armenian",
+    "western armenian",
+}
+# Cardinal words for the digits 0–9.
+ONES = {
+    0: "զրո",
+    1: "մեկ",
+    2: "երկու",
+    3: "երեք",
+    4: "չորս",
+    5: "հինգ",
+    6: "վեց",
+    7: "յոթ",
+    8: "ութ",
+    9: "ինը",
+}
+# Cardinal words for 10–19, looked up whole rather than composed.
+TEENS = {
+    10: "տասը",
+    11: "տասնմեկ",
+    12: "տասներկու",
+    13: "տասներեք",
+    14: "տասնչորս",
+    15: "տասնհինգ",
+    16: "տասնվեց",
+    17: "տասնյոթ",
+    18: "տասնութ",
+    19: "տասնինը",
+}
+# Cardinal words for the multiples of ten from 20 to 90; integer_to_armenian
+# appends the ONES word directly (no space) for the units digit.
+TENS = {
+    20: "քսան",
+    30: "երեսուն",
+    40: "քառասուն",
+    50: "հիսուն",
+    60: "վաթսուն",
+    70: "յոթանասուն",
+    80: "ութսուն",
+    90: "իննսուն",
+}
+# Month words keyed by month number (1–12), as inserted by expand_numeric_date.
+MONTHS_GENITIVE = {
+    1: "հունվարի",
+    2: "փետրվարի",
+    3: "մարտի",
+    4: "ապրիլի",
+    5: "մայիսի",
+    6: "հունիսի",
+    7: "հուլիսի",
+    8: "օգոստոսի",
+    9: "սեպտեմբերի",
+    10: "հոկտեմբերի",
+    11: "նոյեմբերի",
+    12: "դեկտեմբերի",
+}
+# Day-of-month words keyed by day number (1–31), as returned by
+# day_to_date_armenian.
+DAY_DATIVE = {
+    1: "մեկին",
+    2: "երկուսին",
+    3: "երեքին",
+    4: "չորսին",
+    5: "հինգին",
+    6: "վեցին",
+    7: "յոթին",
+    8: "ութին",
+    9: "իննին",
+    10: "տասին",
+    11: "տասնմեկին",
+    12: "տասներկուսին",
+    13: "տասներեքին",
+    14: "տասնչորսին",
+    15: "տասնհինգին",
+    16: "տասնվեցին",
+    17: "տասնյոթին",
+    18: "տասնութին",
+    19: "տասնիննին",
+    20: "քսանին",
+    21: "քսանմեկին",
+    22: "քսաներկուսին",
+    23: "քսաներեքին",
+    24: "քսանչորսին",
+    25: "քսանհինգին",
+    26: "քսանվեցին",
+    27: "քսանյոթին",
+    28: "քսանութին",
+    29: "քսանիննին",
+    30: "երեսունին",
+    31: "երեսունմեկին",
+}
+# Ordinal words that ordinal_to_armenian looks up directly: 1–10, the
+# multiples of ten up to 90, 100 and 1000. Other values are composed there.
+ORDINALS = {
+    1: "առաջին",
+    2: "երկրորդ",
+    3: "երրորդ",
+    4: "չորրորդ",
+    5: "հինգերորդ",
+    6: "վեցերորդ",
+    7: "յոթերորդ",
+    8: "ութերորդ",
+    9: "իններորդ",
+    10: "տասներորդ",
+    20: "քսաներորդ",
+    30: "երեսուներորդ",
+    40: "քառասուներորդ",
+    50: "հիսուներորդ",
+    60: "վաթսուներորդ",
+    70: "յոթանասուներորդ",
+    80: "ութսուներորդ",
+    90: "իննսուներորդ",
+    100: "հարյուրերորդ",
+    1000: "հազարերորդ",
+}
+# Spoken currency name for each recognised symbol, word or code. Keys are
+# lower-case because the _replace_currency_* helpers lower-case the matched
+# text before the lookup.
+CURRENCY_NAMES = {
+    "֏": "դրամ",
+    "դրամ": "դրամ",
+    "դր": "դրամ",
+    "դր.": "դրամ",
+    "amd": "դրամ",
+    "$": "դոլար",
+    "usd": "դոլար",
+    "€": "եվրո",
+    "eur": "եվրո",
+    "£": "ֆունտ",
+    "gbp": "ֆունտ",
+    "₽": "ռուբլի",
+    "rub": "ռուբլի",
+}
+# Character-class body (Latin letters, Armenian letters and marks, digits,
+# underscore) from which the two boundary assertions below are built.
+LETTER_DIGIT = r"A-Za-zԱ-Ֆա-ֆևԵՎՙ՚՛՜՝՞՟\d_"
+# Zero-width assertions: the match must not be preceded / followed by one of
+# the LETTER_DIGIT characters, so patterns do not fire inside a longer token.
+LEFT_BOUNDARY = rf"(?<![{LETTER_DIGIT}])"
+RIGHT_BOUNDARY = rf"(?![{LETTER_DIGIT}])"
+# Optional suffix that may trail a written date; it is consumed with the date.
+DATE_SUFFIX = r"(?:\s*-?\s*(?:ին|թ\.?|թվականին))?"
+# Day, month, four-digit year (18xx–21xx) separated by ".", "/" or "-".
+DATE_DMY_RE = re.compile(
+    rf"{LEFT_BOUNDARY}([0-3]?\d)[./-]([01]?\d)[./-]((?:18|19|20|21)\d{{2}}){DATE_SUFFIX}{RIGHT_BOUNDARY}"
+)
+# Same fields in year, month, day order.
+DATE_YMD_RE = re.compile(
+    rf"{LEFT_BOUNDARY}((?:18|19|20|21)\d{{2}})[./-]([01]?\d)[./-]([0-3]?\d){DATE_SUFFIX}{RIGHT_BOUNDARY}"
+)
+# H:MM with optional :SS, optionally preceded by the word "ժամը".
+TIME_RE = re.compile(
+    rf"{LEFT_BOUNDARY}(?:ժամը\s*)?([0-2]?\d):([0-5]\d)(?::([0-5]\d))?{RIGHT_BOUNDARY}"
+)
+# Optionally signed number (with optional space/comma digit groups and an
+# optional "." or "," fraction) followed by "%" or the word "տոկոս".
+PERCENT_RE = re.compile(
+    rf"{LEFT_BOUNDARY}([+-]?(?:\d{{1,3}}(?:[ ,]\d{{3}})+|\d+)(?:[.,]\d+)?)\s*(?:%|տոկոս){RIGHT_BOUNDARY}",
+    re.IGNORECASE,
+)
+# Currency symbol written before the amount.
+CURRENCY_PREFIX_RE = re.compile(
+    rf"{LEFT_BOUNDARY}([$€£₽֏])\s*([+-]?(?:\d{{1,3}}(?:[ ,]\d{{3}})+|\d+)(?:[.,]\d+)?)"
+)
+# Amount followed by a currency symbol, word or three-letter code.
+CURRENCY_SUFFIX_RE = re.compile(
+    rf"{LEFT_BOUNDARY}([+-]?(?:\d{{1,3}}(?:[ ,]\d{{3}})+|\d+)(?:[.,]\d+)?)\s*(֏|դրամ|դր\.?|AMD|USD|EUR|GBP|RUB|\$|€|£|₽){RIGHT_BOUNDARY}",
+    re.IGNORECASE,
+)
+# Digits, an optional hyphen/dash, then the suffix "րդ" or "ին".
+ORDINAL_RE = re.compile(rf"{LEFT_BOUNDARY}(\d+)\s*[-\u2010-\u2015]?\s*(?:րդ|ին){RIGHT_BOUNDARY}")
+# Two digit runs joined by a hyphen or dash.
+RANGE_RE = re.compile(rf"{LEFT_BOUNDARY}(\d+)\s*[-\u2010-\u2015]\s*(\d+){RIGHT_BOUNDARY}")
+# Any remaining standalone number, in the same numeric format as above.
+PLAIN_NUMBER_RE = re.compile(
+    rf"{LEFT_BOUNDARY}([+-]?(?:\d{{1,3}}(?:[ ,]\d{{3}})+|\d+)(?:[.,]\d+)?)"
+    rf"{RIGHT_BOUNDARY}"
+)
+# The four patterns below are only used for issue reporting in
+# _iter_issue_matches; expand_armenian_text does not rewrite what they match.
+URL_RE = re.compile(r"\b(?:https?://|www\.)\S+", re.IGNORECASE)
+EMAIL_RE = re.compile(r"\b[\w.+-]+@[\w.-]+\.[A-Za-z]{2,}\b")
+SYMBOL_RE = re.compile(r"[@#&=+*/\\|<>_~^]")
+LATIN_TOKEN_RE = re.compile(rf"{LEFT_BOUNDARY}[A-Za-z]{{2,}}{RIGHT_BOUNDARY}")
+@dataclass(frozen=True)
+class TextIssue:
+    """One span of text flagged by the frontend issue scan."""
+    # Category label, e.g. "url", "date" or "number".
+    kind: str
+    # The exact matched text.
+    value: str
+    # Span of the match, as returned by re.Match.span() on the scanned text.
+    start: int
+    end: int
+    def as_dict(self) -> dict:
+        """Return the fields as a plain dictionary."""
+        return asdict(self)
+def looks_armenian_context(text: str, language: Optional[str] = None) -> bool:
+    if language and language.strip().lower() in LANGUAGE_IDS_ARMENIAN:
+        return True
+    return bool(ARMENIAN_CHAR_RE.search(text))
+def normalize_unicode_text(text: str) -> str:
+    s = str(text)
+    for old, new in REPLACEMENTS.items():
+        s = s.replace(old, new)
+    s = SPACE_RE.sub(" ", s.strip())
+    return s
+def integer_to_armenian(value: int) -> str:
+    if value < 0:
+        return "մինուս " + integer_to_armenian(abs(value))
+    if value < 10:
+        return ONES[value]
+    if value < 20:
+        return TEENS[value]
+    if value < 100:
+        tens = value // 10 * 10
+        ones = value % 10
+        return TENS[tens] + (ONES[ones] if ones else "")
+    if value < 1000:
+        hundreds = value // 100
+        rest = value % 100
+        prefix = "հարյուր" if hundreds == 1 else f"{integer_to_armenian(hundreds)} հարյուր"
+        return prefix if rest == 0 else f"{prefix} {integer_to_armenian(rest)}"
+    for scale, name in (
+        (1_000_000_000, "միլիարդ"),
+        (1_000_000, "միլիոն"),
+        (1000, "հազար"),
+    ):
+        if value >= scale:
+            head = value // scale
+            rest = value % scale
+            if scale == 1000 and head == 1:
+                prefix = name
+            else:
+                prefix = f"{integer_to_armenian(head)} {name}"
+            return prefix if rest == 0 else f"{prefix} {integer_to_armenian(rest)}"
+    raise ValueError(f"Unsupported integer value: {value}")
+def _strip_group_separators(value: str) -> str:
+    return re.sub(r"(?<=\d)[ ,](?=\d{3}(?:\D|$))", "", value)
+def number_to_armenian(value: str | int) -> str:
+    raw = str(value).strip()
+    if not raw:
+        return raw
+    sign = ""
+    if raw[0] in "+-":
+        sign = "մինուս " if raw[0] == "-" else ""
+        raw = raw[1:]
+    raw = _strip_group_separators(raw)
+    decimal_match = re.fullmatch(r"(\d+)[.,](\d+)", raw)
+    if decimal_match:
+        whole, frac = decimal_match.groups()
+        frac_words = " ".join(ONES[int(ch)] for ch in frac)
+        return f"{sign}{integer_to_armenian(int(whole))} ամբողջ {frac_words}".strip()
+    return f"{sign}{integer_to_armenian(int(raw))}".strip()
+def ordinal_to_armenian(value: int) -> str:
+    if value in ORDINALS:
+        return ORDINALS[value]
+    if value < 100:
+        return integer_to_armenian(value) + "երորդ"
+    words = integer_to_armenian(value).split()
+    words[-1] = ordinal_to_armenian(int_to_last_component(value))
+    return " ".join(words)
+def int_to_last_component(value: int) -> int:
+    if value % 100:
+        return value % 100
+    if value % 1000:
+        return value % 1000
+    if value % 1_000_000:
+        return value % 1_000_000
+    return value
+def day_to_date_armenian(day: int) -> str:
+    if day in DAY_DATIVE:
+        return DAY_DATIVE[day]
+    if not 1 <= day <= 31:
+        raise ValueError(f"Invalid day: {day}")
+    raise ValueError(f"Invalid day: {day}")
+def expand_numeric_date(day: int, month: int, year: int) -> str:
+    if month not in MONTHS_GENITIVE or not 1 <= day <= 31:
+        raise ValueError(f"Invalid date: {day}.{month}.{year}")
+    # Keep validation lightweight; this frontend is a normalizer, not a calendar.
+    return f"{MONTHS_GENITIVE[month]} {day_to_date_armenian(day)}, {integer_to_armenian(year)} թվականին"
+def _replace_dmy(match: re.Match[str]) -> str:
+    day, month, year = (int(x) for x in match.groups())
+    try:
+        return expand_numeric_date(day, month, year)
+    except ValueError:
+        return match.group(0)
+def _replace_ymd(match: re.Match[str]) -> str:
+    year, month, day = (int(x) for x in match.groups())
+    try:
+        return expand_numeric_date(day, month, year)
+    except ValueError:
+        return match.group(0)
+def _replace_time(match: re.Match[str]) -> str:
+    hour = int(match.group(1))
+    minute = int(match.group(2))
+    second = int(match.group(3)) if match.group(3) is not None else None
+    if hour > 23:
+        return match.group(0)
+    text = f"ժամը {integer_to_armenian(hour)}"
+    if minute:
+        text += f" անց {integer_to_armenian(minute)}"
+    if second:
+        text += f" և {integer_to_armenian(second)} վայրկյան"
+    return text
+def _replace_percent(match: re.Match[str]) -> str:
+    return f"{number_to_armenian(match.group(1))} տոկոս"
+def _replace_currency_prefix(match: re.Match[str]) -> str:
+    currency = CURRENCY_NAMES[match.group(1).lower()]
+    return f"{number_to_armenian(match.group(2))} {currency}"
+def _replace_currency_suffix(match: re.Match[str]) -> str:
+    currency = CURRENCY_NAMES[match.group(2).lower()]
+    return f"{number_to_armenian(match.group(1))} {currency}"
+def _replace_ordinal(match: re.Match[str]) -> str:
+    return ordinal_to_armenian(int(match.group(1)))
+def _replace_range(match: re.Match[str]) -> str:
+    return f"{number_to_armenian(match.group(1))}ից {number_to_armenian(match.group(2))}"
+def _replace_number(match: re.Match[str]) -> str:
+    try:
+        return number_to_armenian(match.group(1))
+    except ValueError:
+        return match.group(0)
+def expand_armenian_text(text: str) -> str:
+    s = normalize_unicode_text(text)
+    s = DATE_DMY_RE.sub(_replace_dmy, s)
+    s = DATE_YMD_RE.sub(_replace_ymd, s)
+    s = TIME_RE.sub(_replace_time, s)
+    s = CURRENCY_PREFIX_RE.sub(_replace_currency_prefix, s)
+    s = CURRENCY_SUFFIX_RE.sub(_replace_currency_suffix, s)
+    s = PERCENT_RE.sub(_replace_percent, s)
+    s = ORDINAL_RE.sub(_replace_ordinal, s)
+    s = RANGE_RE.sub(_replace_range, s)
+    s = PLAIN_NUMBER_RE.sub(_replace_number, s)
+    return cleanup_spacing(s)
+def cleanup_spacing(text: str) -> str:
+    s = SPACE_RE.sub(" ", text.strip())
+    s = re.sub(r"\s+([,.;:!?։՝՞՜])", r"\1", s)
+    s = re.sub(r"([,;:!?։])(?=\S)", r"\1 ", s)
+    return SPACE_RE.sub(" ", s).strip()
+def normalize_for_tts(text: str, language: Optional[str] = "hy") -> str:
+    s = normalize_unicode_text(text)
+    if looks_armenian_context(s, language=language):
+        return expand_armenian_text(s)
+    return cleanup_spacing(s)
+def _iter_issue_matches(text: str) -> Iterable[TextIssue]:
+    patterns = [
+        ("url", URL_RE),
+        ("email", EMAIL_RE),
+        ("date", DATE_DMY_RE),
+        ("date", DATE_YMD_RE),
+        ("time", TIME_RE),
+        ("currency", CURRENCY_PREFIX_RE),
+        ("currency", CURRENCY_SUFFIX_RE),
+        ("percent", PERCENT_RE),
+        ("ordinal", ORDINAL_RE),
+        ("range", RANGE_RE),
+        ("number", PLAIN_NUMBER_RE),
+        ("symbol", SYMBOL_RE),
+        ("latin_token", LATIN_TOKEN_RE),
+    ]
+    occupied: list[tuple[int, int]] = []
+    for kind, pattern in patterns:
+        for match in pattern.finditer(text):
+            start, end = match.span()
+            if any(start < old_end and end > old_start for old_start, old_end in occupied):
+                continue
+            occupied.append((start, end))
+            yield TextIssue(kind=kind, value=match.group(0), start=start, end=end)
+def find_text_frontend_issues(
+    text: str,
+    language: Optional[str] = "hy",
+) -> List[TextIssue]:
+    s = normalize_unicode_text(text)
+    if not looks_armenian_context(s, language=language):
+        return []
+    return sorted(_iter_issue_matches(s), key=lambda issue: (issue.start, issue.end))
+def issues_as_dicts(issues: Iterable[TextIssue]) -> list[dict]:
+    return [issue.as_dict() for issue in issues]

runtime/omnivoice/utils/audio.py ADDED Viewed

	@@ -0,0 +1,343 @@

+#!/usr/bin/env python3
+# Copyright    2026  Xiaomi Corp.        (authors:  Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Audio I/O and processing utilities.
+Provides functions for loading, resampling, silence removal,
+chunking, cross-fading, and format conversion.
+All public functions in this module operate on **numpy float32 arrays**
+with shape ``(C, T)`` (channels-first).
+"""
+import io
+import logging
+import numpy as np
+import soundfile as sf
+import torch
+import torchaudio
+from pydub import AudioSegment
+from pydub.silence import detect_leading_silence, detect_nonsilent, split_on_silence
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Loading
+# ---------------------------------------------------------------------------
+def load_waveform(audio_path: str):
+    """Load audio from a file path, returning (data, sample_rate).
+    Tries two backends in order:
+    1. soundfile — covers WAV/FLAC/OGG etc., no ffmpeg needed.
+    2. librosa — covers MP3/M4A etc. via audioread + ffmpeg.
+    Returns:
+        (data, sample_rate) where data is a numpy float32 array of
+        shape (C, T).
+    """
+    try:
+        data, sr = sf.read(audio_path, dtype="float32", always_2d=True)
+        return data.T, sr  # (T, C) → (C, T)
+    except Exception:
+        # soundfile cannot handle MP3/M4A etc., fall back to librosa.
+        import librosa
+        data, sr = librosa.load(audio_path, sr=None, mono=False)
+        if data.ndim == 1:
+            data = data[np.newaxis, :]
+        return data, sr
+def load_audio(audio_path: str, sampling_rate: int) -> np.ndarray:
+    """Load a waveform from file and resample to the target rate.
+    Parameters:
+        audio_path: path of the audio.
+        sampling_rate: target sampling rate.
+    Returns:
+        Numpy float32 array of shape (1, T).
+    """
+    data, sr = load_waveform(audio_path)
+    if data.shape[0] > 1:
+        data = np.mean(data, axis=0, keepdims=True)
+    if sr != sampling_rate:
+        data = torchaudio.functional.resample(
+            torch.from_numpy(data), orig_freq=sr, new_freq=sampling_rate
+        ).numpy()
+    return data
+def load_audio_bytes(raw: bytes, sampling_rate: int) -> np.ndarray:
+    """Load audio from in-memory bytes and resample.
+    Parameters:
+        raw: raw audio file bytes (e.g. from WebDataset).
+        sampling_rate: target sampling rate.
+    Returns:
+        Numpy float32 array of shape (1, T).
+    """
+    buf = io.BytesIO(raw)
+    try:
+        data, sr = sf.read(buf, dtype="float32", always_2d=True)
+        data = data.T  # (T, C) → (C, T)
+    except Exception:
+        import librosa
+        buf.seek(0)
+        data, sr = librosa.load(buf, sr=None, mono=False)
+        if data.ndim == 1:
+            data = data[np.newaxis, :]
+    if data.shape[0] > 1:
+        data = np.mean(data, axis=0, keepdims=True)
+    if sr != sampling_rate:
+        data = torchaudio.functional.resample(
+            torch.from_numpy(data), orig_freq=sr, new_freq=sampling_rate
+        ).numpy()
+    return data
+# ---------------------------------------------------------------------------
+# Audio processing (all numpy in / numpy out)
+# ---------------------------------------------------------------------------
+def numpy_to_audiosegment(audio: np.ndarray, sample_rate: int) -> AudioSegment:
+    """Convert a numpy float32 array of shape (C, T) to a pydub AudioSegment."""
+    audio_int = (audio * 32768.0).clip(-32768, 32767).astype(np.int16)
+    if audio_int.shape[0] > 1:
+        audio_int = audio_int.T.flatten()  # interleave channels
+    return AudioSegment(
+        data=audio_int.tobytes(),
+        sample_width=2,
+        frame_rate=sample_rate,
+        channels=audio.shape[0],
+    )
+def audiosegment_to_numpy(aseg: AudioSegment) -> np.ndarray:
+    """Convert a pydub AudioSegment to a numpy float32 array of shape (C, T)."""
+    data = np.array(aseg.get_array_of_samples()).astype(np.float32) / 32768.0
+    if aseg.channels == 1:
+        return data[np.newaxis, :]
+    return data.reshape(-1, aseg.channels).T
+def remove_silence(
+    audio: np.ndarray,
+    sampling_rate: int,
+    mid_sil: int = 300,
+    lead_sil: int = 100,
+    trail_sil: int = 300,
+) -> np.ndarray:
+    """Remove middle silences longer than *mid_sil* ms and trim edge silences.
+    Parameters:
+        audio: numpy array with shape (C, T).
+        sampling_rate: sampling rate of the audio.
+        mid_sil: middle-silence threshold in ms (0 to skip).
+        lead_sil: kept leading silence in ms.
+        trail_sil: kept trailing silence in ms.
+    Returns:
+        Numpy array with shape (C, T').
+    """
+    wave = numpy_to_audiosegment(audio, sampling_rate)
+    if mid_sil > 0:
+        non_silent_segs = split_on_silence(
+            wave,
+            min_silence_len=mid_sil,
+            silence_thresh=-50,
+            keep_silence=mid_sil,
+            seek_step=10,
+        )
+        wave = AudioSegment.silent(duration=0)
+        for seg in non_silent_segs:
+            wave += seg
+    wave = remove_silence_edges(wave, lead_sil, trail_sil, -50)
+    return audiosegment_to_numpy(wave)
+def remove_silence_edges(
+    audio: AudioSegment,
+    lead_sil: int = 100,
+    trail_sil: int = 300,
+    silence_threshold: float = -50,
+) -> AudioSegment:
+    """Remove edge silences, keeping *lead_sil* / *trail_sil* ms."""
+    start_idx = detect_leading_silence(audio, silence_threshold=silence_threshold)
+    start_idx = max(0, start_idx - lead_sil)
+    audio = audio[start_idx:]
+    audio = audio.reverse()
+    start_idx = detect_leading_silence(audio, silence_threshold=silence_threshold)
+    start_idx = max(0, start_idx - trail_sil)
+    audio = audio[start_idx:]
+    audio = audio.reverse()
+    return audio
def fade_and_pad_audio(
    audio: np.ndarray,
    pad_duration: float = 0.1,
    fade_duration: float = 0.1,
    sample_rate: int = 24000,
) -> np.ndarray:
    """Apply fade-in/out and pad with silence to prevent clicks.

    Time is taken to be the last axis. Leading axes (if any) are preserved,
    so both ``(C, T)`` and mono ``(T,)`` arrays are accepted. The input is
    never modified; a copy is processed.

    Args:
        audio: numpy array of shape (C, T) or (T,). The fade multiplies the
            samples in place by float32 ramps, so a floating-point dtype is
            expected.
        pad_duration: silence padding duration per side (seconds).
        fade_duration: fade curve duration (seconds).
        sample_rate: audio sampling rate.
    Returns:
        Processed numpy array of shape (..., T_new). An input with zero
        samples is returned as-is.
    """
    if audio.shape[-1] == 0:
        return audio

    fade_samples = int(fade_duration * sample_rate)
    pad_samples = int(pad_duration * sample_rate)
    processed = audio.copy()

    if fade_samples > 0:
        # Cap each ramp at half the signal so fade-in and fade-out never
        # overlap on short clips.
        k = min(fade_samples, processed.shape[-1] // 2)
        if k > 0:
            # 1-D ramps broadcast along the last (time) axis for any rank.
            fade_in = np.linspace(0, 1, k, dtype=np.float32)
            processed[..., :k] *= fade_in
            fade_out = np.linspace(1, 0, k, dtype=np.float32)
            processed[..., -k:] *= fade_out

    if pad_samples > 0:
        # Match every leading axis of the signal; only the time axis differs.
        silence = np.zeros(
            processed.shape[:-1] + (pad_samples,),
            dtype=processed.dtype,
        )
        processed = np.concatenate([silence, processed, silence], axis=-1)
    return processed
def trim_long_audio(
    audio: np.ndarray,
    sampling_rate: int,
    max_duration: float = 15.0,
    min_duration: float = 3.0,
    trim_threshold: float = 20.0,
) -> np.ndarray:
    """Shorten over-long audio by cutting at a pause near *max_duration*.

    Nothing happens unless the clip is strictly longer than *trim_threshold*
    seconds. Otherwise the clip is cut at the latest non-silent onset that
    lies at or before *max_duration* (i.e. right after the last pause in
    that window). If that cut would leave less than *min_duration*, the clip
    is hard-cut at *max_duration* instead (or at its own length if shorter).

    Args:
        audio: numpy array of shape (C, T).
        sampling_rate: audio sampling rate.
        max_duration: upper bound for the kept audio, in seconds.
        min_duration: shortest acceptable pause-aligned cut, in seconds.
        trim_threshold: clips up to this length (seconds) are returned as-is.
    Returns:
        The trimmed array, or *audio* itself when it is short enough or no
        non-silent range is detected.
    """
    if audio.shape[-1] / sampling_rate <= trim_threshold:
        return audio

    segment = numpy_to_audiosegment(audio, sampling_rate)
    voiced_ranges = detect_nonsilent(
        segment, min_silence_len=100, silence_thresh=-40, seek_step=10
    )
    if not voiced_ranges:
        return audio

    # All comparisons below are in milliseconds.
    limit_ms = int(max_duration * 1000)
    floor_ms = int(min_duration * 1000)

    cut_ms = 0
    for onset_ms, offset_ms in voiced_ranges:
        if cut_ms < onset_ms <= limit_ms:
            cut_ms = onset_ms
        if offset_ms > limit_ms:
            # This range already reaches past the limit; stop scanning.
            break

    if cut_ms < floor_ms:
        # No usable pause late enough: fall back to a hard cut.
        cut_ms = min(limit_ms, len(segment))

    return audiosegment_to_numpy(segment[:cut_ms])
def cross_fade_chunks(
    chunks: list[np.ndarray],
    sample_rate: int,
    silence_duration: float = 0.3,
) -> np.ndarray:
    """Join audio chunks, separating them with a faded silence gap.

    The gap budget (``silence_duration`` seconds) is divided into three equal
    parts: a fade-out on the audio accumulated so far, a stretch of zeros,
    and a fade-in on the next chunk. Input chunks are not modified.

    Args:
        chunks: list of numpy arrays, each (C, T).
        sample_rate: audio sample rate.
        silence_duration: total silence gap duration in seconds.
    Returns:
        Merged numpy array (C, T_total). A single-element list returns that
        element itself (not a copy).
    """
    if len(chunks) == 1:
        return chunks[0]

    # One third of the gap for each of: fade-out, silence, fade-in.
    ramp_len = int(silence_duration * sample_rate) // 3

    result = chunks[0].copy()
    for upcoming in chunks[1:]:
        # Fade out the tail of what has been assembled so far.
        tail_len = min(ramp_len, result.shape[-1])
        if tail_len > 0:
            down = np.linspace(1, 0, tail_len, dtype=np.float32)
            result[..., -tail_len:] *= down[np.newaxis, :]

        gap = np.zeros((chunks[0].shape[0], ramp_len), dtype=np.float32)

        # Fade in a private copy of the next chunk.
        incoming = upcoming.copy()
        head_len = min(ramp_len, incoming.shape[-1])
        if head_len > 0:
            up = np.linspace(0, 1, head_len, dtype=np.float32)
            incoming[..., :head_len] *= up[np.newaxis, :]

        result = np.concatenate([result, gap, incoming], axis=-1)
    return result

runtime/omnivoice/utils/common.py ADDED Viewed

	@@ -0,0 +1,56 @@

+#!/usr/bin/env python3
+# Copyright    2026  Xiaomi Corp.        (authors:  Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Shared utility functions."""
+import argparse
+import random
+import numpy as np
+import torch
def str2bool(v):
    """Used in argparse.ArgumentParser.add_argument to indicate
    that a type is a bool type and user can enter
        - yes, true, t, y, 1, to represent True
        - no, false, f, n, 0, to represent False
    See https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse  # noqa

    Matching is case-insensitive. A value that is already a ``bool`` is
    returned unchanged.

    Raises:
        argparse.ArgumentTypeError: if ``v`` is not a bool and is not one of
            the accepted strings (this includes non-string values such as
            ``None`` or numbers).
    """
    if isinstance(v, bool):
        return v
    # Only strings can be matched against the accepted tokens; anything else
    # falls through to the same error argparse reports for bad input.
    if isinstance(v, str):
        token = v.lower()
        if token in ("yes", "true", "t", "y", "1"):
            return True
        if token in ("no", "false", "f", "n", "0"):
            return False
    raise argparse.ArgumentTypeError("Boolean value expected.")
def fix_random_seed(random_seed: int):
    """
    Set the same random seed for the libraries and modules.
    Includes the ``random`` module, numpy, and torch.

    Args:
        random_seed: seed value applied to all three generators.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.random.manual_seed(random_seed)
    # A throw-away ``random.Random()`` used to be created and seeded here;
    # it was never used afterwards, so it had no effect and was removed.

runtime/omnivoice/utils/duration.py ADDED Viewed

	@@ -0,0 +1,282 @@

+#!/usr/bin/env python3
+# Copyright    2026  Xiaomi Corp.        (authors:  Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Text duration estimation for TTS generation.
+Provides ``RuleDurationEstimator``, which estimates audio duration from text
+using character phonetic weights across 600+ languages. Used by
+``OmniVoice.generate()`` to determine output length when no duration is specified.
+"""
+import bisect
+import unicodedata
+from functools import lru_cache
+from typing import Optional
class RuleDurationEstimator:
    """Rule-based duration estimator driven by per-character weights.

    Every character is assigned a relative "speaking time" weight based on
    its Unicode general category and script block. The duration of a target
    text is then extrapolated from a reference text whose duration is known.
    """

    def __init__(self):
        # ==========================================
        # 1. Phonetic Weights Table
        # ==========================================
        # The weight represents the relative speaking time compared to
        # a standard Latin letter.
        # Benchmark: 1.0 = One Latin Character (~40-50ms)
        self.weights = {
            # --- Logographic (1 char = full syllable/word) ---
            "cjk": 3.0,  # Chinese, Japanese Kanji, etc.
            # --- Syllabic / Blocks
            "hangul": 2.5,  # Korean Hangul
            "kana": 2.2,  # Japanese Hiragana/Katakana
            "ethiopic": 3.0,  # Amharic/Ge'ez
            "yi": 3.0,  # Yi script
            # --- Abugida (Consonant-Vowel complexes) ---
            "indic": 1.8,  # Hindi, Bengali, Tamil, etc.
            "thai_lao": 1.5,  # Thai, Lao
            "khmer_myanmar": 1.8,  # Khmer, Myanmar
            # --- Abjad (Consonant-heavy) ---
            "arabic": 1.5,  # Arabic, Persian, Urdu
            "hebrew": 1.5,  # Hebrew
            # --- Alphabet (Segmental) ---
            "latin": 1.0,  # English, Spanish, French, Vietnamese, etc. (Baseline)
            "cyrillic": 1.0,  # Russian, Ukrainian
            "greek": 1.0,  # Greek
            "armenian": 1.0,  # Armenian
            "georgian": 1.0,  # Georgian
            # --- Symbols & Misc ---
            "punctuation": 0.5,  # Pause capability
            "space": 0.2,  # Word boundary/Breath (0.05 / 0.22)
            "digit": 3.5,  # Numbers
            "mark": 0.0,  # Diacritics/Accents (Silent modifiers)
            "default": 1.0,  # Fallback for unknown scripts
        }
        # ==========================================
        # 2. Unicode Range Mapping
        # ==========================================
        # Format: (End_Codepoint, Type_Key)
        # Used for fast binary search (bisect). Entries must stay sorted by
        # end code point. A code point belongs to the first entry whose end
        # is >= the code point, so any block NOT listed silently inherits
        # the type of the next listed entry.
        self.ranges = [
            (0x02AF, "latin"),  # Latin (Basic, Supplement, Ext, IPA)
            (0x03FF, "greek"),  # Greek & Coptic
            (0x052F, "cyrillic"),  # Cyrillic
            (0x058F, "armenian"),  # Armenian
            (0x05FF, "hebrew"),  # Hebrew
            (0x077F, "arabic"),  # Arabic, Syriac, Arabic Supplement
            (0x089F, "arabic"),  # Arabic Extended-B (+ Syriac Supp)
            (0x08FF, "arabic"),  # Arabic Extended-A
            (0x097F, "indic"),  # Devanagari
            (0x09FF, "indic"),  # Bengali
            (0x0A7F, "indic"),  # Gurmukhi
            (0x0AFF, "indic"),  # Gujarati
            (0x0B7F, "indic"),  # Oriya
            (0x0BFF, "indic"),  # Tamil
            (0x0C7F, "indic"),  # Telugu
            (0x0CFF, "indic"),  # Kannada
            (0x0D7F, "indic"),  # Malayalam
            (0x0DFF, "indic"),  # Sinhala
            (0x0EFF, "thai_lao"),  # Thai & Lao
            (0x0FFF, "indic"),  # Tibetan (Abugida)
            (0x109F, "khmer_myanmar"),  # Myanmar
            (0x10FF, "georgian"),  # Georgian
            (0x11FF, "hangul"),  # Hangul Jamo
            (0x137F, "ethiopic"),  # Ethiopic
            (0x139F, "ethiopic"),  # Ethiopic Supplement
            (0x13FF, "default"),  # Cherokee
            (0x167F, "default"),  # Canadian Aboriginal Syllabics
            (0x169F, "default"),  # Ogham
            (0x16FF, "default"),  # Runic
            (0x171F, "default"),  # Tagalog (Baybayin)
            (0x173F, "default"),  # Hanunoo
            (0x175F, "default"),  # Buhid
            (0x177F, "default"),  # Tagbanwa
            (0x17FF, "khmer_myanmar"),  # Khmer
            (0x18AF, "default"),  # Mongolian
            (0x18FF, "default"),  # Canadian Aboriginal Syllabics Ext
            (0x194F, "indic"),  # Limbu
            (0x19DF, "indic"),  # Tai Le & New Tai Lue
            (0x19FF, "khmer_myanmar"),  # Khmer Symbols
            (0x1A1F, "indic"),  # Buginese
            (0x1AAF, "indic"),  # Tai Tham
            (0x1B7F, "indic"),  # Balinese
            (0x1BBF, "indic"),  # Sundanese
            (0x1BFF, "indic"),  # Batak
            (0x1C4F, "indic"),  # Lepcha
            (0x1C7F, "indic"),  # Ol Chiki (Santali)
            (0x1C8F, "cyrillic"),  # Cyrillic Extended-C
            (0x1CBF, "georgian"),  # Georgian Extended
            (0x1CCF, "indic"),  # Sundanese Supplement
            (0x1CFF, "indic"),  # Vedic Extensions
            (0x1D7F, "latin"),  # Phonetic Extensions
            (0x1DBF, "latin"),  # Phonetic Extensions Supplement
            (0x1DFF, "default"),  # Combining Diacritical Marks Supplement
            (0x1EFF, "latin"),  # Latin Extended Additional (Vietnamese)
            # The next nine entries close a gap that previously made every
            # letter in U+1F00..U+2DFF fall through to "kana".
            (0x1FFF, "greek"),  # Greek Extended (polytonic)
            (0x2BFF, "default"),  # General Punctuation .. Misc Symbols and Arrows
            (0x2C5F, "default"),  # Glagolitic
            (0x2C7F, "latin"),  # Latin Extended-C
            (0x2CFF, "greek"),  # Coptic
            (0x2D2F, "georgian"),  # Georgian Supplement
            (0x2D7F, "default"),  # Tifinagh
            (0x2DDF, "ethiopic"),  # Ethiopic Extended
            (0x2DFF, "cyrillic"),  # Cyrillic Extended-A
            (0x309F, "kana"),  # Hiragana
            (0x30FF, "kana"),  # Katakana
            (0x312F, "cjk"),  # Bopomofo (Zhuyin)
            (0x318F, "hangul"),  # Hangul Compatibility Jamo
            (0x31EF, "cjk"),  # Kanbun, Bopomofo Extended, CJK Strokes
            (0x31FF, "kana"),  # Katakana Phonetic Extensions
            (0x9FFF, "cjk"),  # CJK Unified Ideographs (Main)
            (0xA4CF, "yi"),  # Yi Syllables
            (0xA4FF, "default"),  # Lisu
            (0xA63F, "default"),  # Vai
            (0xA69F, "cyrillic"),  # Cyrillic Extended-B
            (0xA6FF, "default"),  # Bamum
            (0xA7FF, "latin"),  # Latin Extended-D
            (0xA82F, "indic"),  # Syloti Nagri
            (0xA87F, "default"),  # Phags-pa
            (0xA8DF, "indic"),  # Saurashtra
            (0xA8FF, "indic"),  # Devanagari Extended
            (0xA92F, "indic"),  # Kayah Li
            (0xA95F, "indic"),  # Rejang
            (0xA97F, "hangul"),  # Hangul Jamo Extended-A
            (0xA9DF, "indic"),  # Javanese
            (0xA9FF, "khmer_myanmar"),  # Myanmar Extended-B
            (0xAA5F, "indic"),  # Cham
            (0xAA7F, "khmer_myanmar"),  # Myanmar Extended-A
            (0xAADF, "indic"),  # Tai Viet
            (0xAAFF, "indic"),  # Meetei Mayek Extensions
            (0xAB2F, "ethiopic"),  # Ethiopic Extended-A
            (0xAB6F, "latin"),  # Latin Extended-E
            (0xABBF, "default"),  # Cherokee Supplement
            (0xABFF, "indic"),  # Meetei Mayek
            (0xD7AF, "hangul"),  # Hangul Syllables
            (0xD7FF, "hangul"),  # Hangul Jamo Extended-B
            (0xFAFF, "cjk"),  # CJK Compatibility
            (0xFDFF, "arabic"),  # Arabic Presentation Forms-A
            (0xFE6F, "default"),  # Variation Selectors
            (0xFEFF, "arabic"),  # Arabic Presentation Forms-B
            (0xFFEF, "latin"),  # Fullwidth Latin
        ]
        self.breakpoints = [r[0] for r in self.ranges]

    # NOTE: lru_cache on a method keys on (self, char) in one cache shared by
    # all instances. It keeps each instance it has seen alive while entries
    # remain, and it will not notice later edits to ``self.weights`` or
    # ``self.ranges``. Kept as-is so instances remain plain picklable objects.
    @lru_cache(maxsize=4096)
    def _get_char_weight(self, char: str) -> float:
        """Determines the weight of a single character.

        Checks run in this order: ASCII letters, the ASCII space, Arabic
        Tatweel, Unicode general category (marks, punctuation/symbols,
        separators, numbers), the script-range table, and finally the
        supplementary-plane fallback.
        """
        code = ord(char)
        # 1. Fast paths for the most common characters.
        if (65 <= code <= 90) or (97 <= code <= 122):
            return self.weights["latin"]
        if code == 32:
            return self.weights["space"]
        # Ignore arabic Tatweel
        if code == 0x0640:
            return self.weights["mark"]
        # 2. General category: handled before the script lookup so that
        # marks, punctuation, spaces and digits get their own weights
        # regardless of which block they live in.
        category = unicodedata.category(char)
        if category.startswith("M"):
            return self.weights["mark"]
        if category.startswith("P") or category.startswith("S"):
            return self.weights["punctuation"]
        if category.startswith("Z"):
            return self.weights["space"]
        if category.startswith("N"):
            return self.weights["digit"]
        # 3. Binary search for Unicode Block (by this point no punctuation
        # can be left in the ranges; it was filtered out by category above)
        idx = bisect.bisect_left(self.breakpoints, code)
        if idx < len(self.ranges):
            script_type = self.ranges[idx][1]
            return self.weights.get(script_type, self.weights["default"])
        # 4. Handle upper planes (CJK Ext B/C/D, Historic scripts).
        # ``>=`` so that U+20000, the first Extension B ideograph, counts.
        if code >= 0x20000:
            return self.weights["cjk"]
        return self.weights["default"]

    def calculate_total_weight(self, text: str) -> float:
        """Sums up the normalized weights for a string."""
        return sum(self._get_char_weight(c) for c in text)

    def estimate_duration(
        self,
        target_text: str,
        ref_text: str,
        ref_duration: float,
        low_threshold: Optional[float] = 50,
        boost_strength: float = 3,
    ) -> float:
        """Estimate the duration of *target_text* from a timed reference.

        The reference gives a speaking rate (weight per duration unit); the
        target's total weight divided by that rate is the raw estimate. The
        result is in the same unit as *ref_duration*.

        Args:
            target_text (str): The text for which we want to estimate the duration.
            ref_text (str): The reference text that was used to measure
                the ref_duration.
            ref_duration (float): The actual duration it took
                to speak the ref_text.
            low_threshold (float): The minimum duration threshold below which the
                estimation will be considered unreliable. Raw estimates
                below it are boosted to
                ``low_threshold * (estimate / low_threshold) ** (1 / boost_strength)``.
                Pass ``None`` to disable the boost.
            boost_strength (float): Controls the power-curve boost for short durations.
                Higher values boost small durations more aggressively.
                1 = no boost (linear), 2 = sqrt-like
        Returns:
            float: The estimated duration for the target_text based
                on the ref_text and ref_duration. ``0.0`` is returned when
                the reference is unusable (non-positive duration, empty
                text, or zero total weight).
        """
        if ref_duration <= 0 or not ref_text:
            return 0.0
        ref_weight = self.calculate_total_weight(ref_text)
        if ref_weight == 0:
            return 0.0
        speed_factor = ref_weight / ref_duration
        target_weight = self.calculate_total_weight(target_text)
        estimated_duration = target_weight / speed_factor
        if low_threshold is not None and estimated_duration < low_threshold:
            alpha = 1.0 / boost_strength
            return low_threshold * (estimated_duration / low_threshold) ** alpha
        else:
            return estimated_duration
+# ==========================================
+# Example Usage
+# ==========================================
if __name__ == "__main__":
    # Small demo: estimate durations for a few scripts relative to one
    # English reference sentence of known duration, and print a report.
    ref_txt = "Hello, world."
    ref_dur = 1.5
    samples = {
        "Hindi (With complex marks)": "नमस्ते दुनिया",
        "Arabic (With vowels)": "مَرْحَبًا بِالْعَالَم",
        "Vietnamese (Lots of diacritics)": "Chào thế giới",
        "Chinese": "你好，世界！",
        "Mixed Emoji": "Hello 🌍! This is fun 🎉",
    }
    divider = "-" * 30
    estimator = RuleDurationEstimator()

    header = [
        "--- Reference ---",
        f"Reference Text: '{ref_txt}'",
        f"Reference Duration: {ref_dur}s",
        divider,
    ]
    print("\n".join(header))

    for lang, txt in samples.items():
        est_time = estimator.estimate_duration(txt, ref_txt, ref_dur)
        weight = estimator.calculate_total_weight(txt)
        report = [
            f"[{lang}]",
            f"Text: {txt}",
            f"Total Weight: {weight:.2f}",
            f"Estimated Duration: {est_time:.2f} s",
            divider,
        ]
        print("\n".join(report))

runtime/omnivoice/utils/lang_map.py ADDED Viewed

	@@ -0,0 +1,698 @@

+#!/usr/bin/env python3
+# Copyright    2026  Xiaomi Corp.        (authors:  Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Language name to ISO 639-3 code mapping.
+Auto-generated from ``docs/lang_id_name_map.tsv``. Provides ``LANG_NAME_TO_ID``
+(for resolving language names to codes) and ``LANG_IDS`` (the set of supported
+ISO 639-3 codes). Used by ``OmniVoice.generate()`` to resolve user-provided
+language names.
+"""
+# Auto-generated from docs/lang_id_name_map.tsv
+# Maps lowercase language name -> language ID code
+LANG_NAME_TO_ID = {
+    "abadi": "kbt",
+    "abkhazian": "ab",
+    "abron": "abr",
+    "abua": "abn",
+    "adamawa fulfulde": "fub",
+    "adyghe": "ady",
+    "afade": "aal",
+    "afrikaans": "af",
+    "agwagwune": "yay",
+    "aja (benin)": "ajg",
+    "akebu": "keu",
+    "alago": "ala",
+    "albanian": "sq",
+    "algerian arabic": "arq",
+    "algerian saharan arabic": "aao",
+    "ambo-pasco quechua": "qva",
+    "ambonese malay": "abs",
+    "amdo tibetan": "adx",
+    "amharic": "am",
+    "anaang": "anw",
+    "angika": "anp",
+    "antankarana malagasy": "xmv",
+    "aragonese": "an",
+    "arbëreshë albanian": "aae",
+    "arequipa-la unión quechua": "qxu",
+    "armenian": "hy",
+    "ashe": "ahs",
+    "ashéninka perené": "prq",
+    "askopan": "eiv",
+    "assamese": "as",
+    "asturian": "ast",
+    "atayal": "tay",
+    "awak": "awo",
+    "ayacucho quechua": "quy",
+    "azerbaijani": "az",
+    "baatonum": "bba",
+    "bacama": "bcy",
+    "bade": "bde",
+    "bafia": "ksf",
+    "bafut": "bfd",
+    "bagirmi fulfulde": "fui",
+    "bago-kusuntu": "bqg",
+    "baharna arabic": "abv",
+    "bakoko": "bkh",
+    "balanta-ganja": "bjt",
+    "balti": "bft",
+    "bamenyam": "bce",
+    "bamun": "bax",
+    "bangwinji": "bsj",
+    "banjar": "bjn",
+    "bankon": "abb",
+    "baoulé": "bci",
+    "bara malagasy": "bhr",
+    "barok": "bjk",
+    "basa (cameroon)": "bas",
+    "basa (nigeria)": "bzw",
+    "bashkir": "ba",
+    "basque": "eu",
+    "batak mandailing": "btm",
+    "batanga": "bnm",
+    "bateri": "btv",
+    "bats": "bbl",
+    "bayot": "bda",
+    "bebele": "beb",
+    "belarusian": "be",
+    "bengali": "bn",
+    "betawi": "bew",
+    "bhili": "bhb",
+    "bhojpuri": "bho",
+    "bilur": "bxf",
+    "bima": "bhp",
+    "bodo": "brx",
+    "boghom": "bux",
+    "bokyi": "bky",
+    "bomu": "bmq",
+    "bondei": "bou",
+    "borgu fulfulde": "fue",
+    "bosnian": "bs",
+    "brahui": "brh",
+    "braj": "bra",
+    "breton": "br",
+    "buduma": "bdm",
+    "buginese": "bug",
+    "bukharic": "bhh",
+    "bulgarian": "bg",
+    "bulu (cameroon)": "bum",
+    "bundeli": "bns",
+    "bunun": "bnn",
+    "bura-pabir": "bwr",
+    "burak": "bys",
+    "burmese": "my",
+    "burushaski": "bsk",
+    "cacaloxtepec mixtec": "miu",
+    "cajatambo north lima quechua": "qvl",
+    "cakfem-mushere": "cky",
+    "cameroon pidgin": "wes",
+    "campidanese sardinian": "sro",
+    "cantonese": "yue",
+    "catalan": "ca",
+    "cebuano": "ceb",
+    "cen": "cen",
+    "central kurdish": "ckb",
+    "central nahuatl": "nhn",
+    "central pame": "pbs",
+    "central pashto": "pst",
+    "central puebla nahuatl": "ncx",
+    "central tarahumara": "tar",
+    "central yupik": "esu",
+    "central-eastern niger fulfulde": "fuq",
+    "chadian arabic": "shu",
+    "chichewa": "ny",
+    "chichicapan zapotec": "zpv",
+    "chiga": "cgg",
+    "chimalapa zoque": "zoh",
+    "chimborazo highland quichua": "qug",
+    "chinese": "zh",
+    "chiquián ancash quechua": "qxa",
+    "chitwania tharu": "the",
+    "chokwe": "cjk",
+    "chuvash": "cv",
+    "cibak": "ckl",
+    "coastal konjo": "kjc",
+    "copainalá zoque": "zoc",
+    "cornish": "kw",
+    "corongo ancash quechua": "qwa",
+    "croatian": "hr",
+    "cross river mbembe": "mfn",
+    "cuyamecalco mixtec": "xtu",
+    "czech": "cs",
+    "dadiya": "dbd",
+    "dagbani": "dag",
+    "dameli": "dml",
+    "danish": "da",
+    "dargwa": "dar",
+    "dazaga": "dzg",
+    "deccan": "dcc",
+    "degema": "deg",
+    "dera (nigeria)": "kna",
+    "dghwede": "dgh",
+    "dhatki": "mki",
+    "dhivehi": "dv",
+    "dhofari arabic": "adf",
+    "dijim-bwilim": "cfa",
+    "dogri": "dgo",
+    "domaaki": "dmk",
+    "dotyali": "dty",
+    "duala": "dua",
+    "dutch": "nl",
+    "dũya": "ldb",
+    "dyula": "dyu",
+    "eastern balochi": "bgp",
+    "eastern bolivian guaraní": "gui",
+    "eastern egyptian bedawi arabic": "avl",
+    "eastern krahn": "kqo",
+    "eastern mari": "mhr",
+    "eastern yiddish": "ydd",
+    "ebrié": "ebr",
+    "eggon": "ego",
+    "egyptian arabic": "arz",
+    "ejagham": "etu",
+    "eleme": "elm",
+    "eloyi": "afo",
+    "embu": "ebu",
+    "english": "en",
+    "erzya": "myv",
+    "esan": "ish",
+    "esperanto": "eo",
+    "estonian": "et",
+    "eton (cameroon)": "eto",
+    "ewondo": "ewo",
+    "extremaduran": "ext",
+    "fang (equatorial guinea)": "fan",
+    "fanti": "fat",
+    "farefare": "gur",
+    "fe'fe'": "fmp",
+    "filipino": "fil",
+    "filomena mata-coahuitlán totonac": "tlp",
+    "finnish": "fi",
+    "fipa": "fip",
+    "french": "fr",
+    "fulah": "ff",
+    "galician": "gl",
+    "gambian wolof": "wof",
+    "ganda": "lg",
+    "garhwali": "gbm",
+    "gawar-bati": "gwt",
+    "gawri": "gwc",
+    "gbagyi": "gbr",
+    "gbari": "gby",
+    "geji": "gyz",
+    "gen": "gej",
+    "georgian": "ka",
+    "german": "de",
+    "geser-gorom": "ges",
+    "gheg albanian": "aln",
+    "ghomálá'": "bbj",
+    "gidar": "gid",
+    "glavda": "glw",
+    "goan konkani": "gom",
+    "goaria": "gig",
+    "goemai": "ank",
+    "gola": "gol",
+    "greek": "el",
+    "guarani": "gn",
+    "guduf-gava": "gdf",
+    "guerrero amuzgo": "amu",
+    "gujarati": "gu",
+    "gujari": "gju",
+    "gulf arabic": "afb",
+    "gurgula": "ggg",
+    "gusii": "guz",
+    "gusilay": "gsl",
+    "gweno": "gwe",
+    "güilá zapotec": "ztu",
+    "hadothi": "hoj",
+    "hahon": "hah",
+    "haitian": "ht",
+    "hakha chin": "cnh",
+    "hakö": "hao",
+    "halia": "hla",
+    "hausa": "ha",
+    "hawaiian": "haw",
+    "hazaragi": "haz",
+    "hebrew": "he",
+    "hemba": "hem",
+    "herero": "hz",
+    "highland konjo": "kjk",
+    "hijazi arabic": "acw",
+    "hindi": "hi",
+    "huarijio": "var",
+    "huautla mazatec": "mau",
+    "huaxcaleca nahuatl": "nhq",
+    "huba": "hbb",
+    "huitepec mixtec": "mxs",
+    "hula": "hul",
+    "hungarian": "hu",
+    "hunjara-kaina ke": "hkk",
+    "hwana": "hwo",
+    "ibibio": "ibb",
+    "icelandic": "is",
+    "idakho-isukha-tiriki": "ida",
+    "idoma": "idu",
+    "igbo": "ig",
+    "igo": "ahl",
+    "ikposo": "kpo",
+    "ikwere": "ikw",
+    "imbabura highland quichua": "qvi",
+    "indonesian": "id",
+    "indus kohistani": "mvy",
+    "interlingua (international auxiliary language association)": "ia",
+    "inupiaq": "ik",
+    "irish": "ga",
+    "iron ossetic": "os",
+    "isekiri": "its",
+    "isoko": "iso",
+    "italian": "it",
+    "ito": "itw",
+    "itzá": "itz",
+    "ixtayutla mixtec": "vmj",
+    "izon": "ijc",
+    "jambi malay": "jax",
+    "japanese": "ja",
+    "jaqaru": "jqr",
+    "jauja wanca quechua": "qxw",
+    "jaunsari": "jns",
+    "javanese": "jv",
+    "jiba": "juo",
+    "jju": "kaj",
+    "judeo-moroccan arabic": "aju",
+    "juxtlahuaca mixtec": "vmc",
+    "kabardian": "kbd",
+    "kabras": "lkb",
+    "kabuverdianu": "kea",
+    "kabyle": "kab",
+    "kachi koli": "gjk",
+    "kairak": "ckr",
+    "kalabari": "ijn",
+    "kalasha": "kls",
+    "kalenjin": "kln",
+    "kalkoti": "xka",
+    "kamba": "kam",
+    "kamo": "kcq",
+    "kanauji": "bjj",
+    "kanembu": "kbl",
+    "kannada": "kn",
+    "karekare": "kai",
+    "kashmiri": "ks",
+    "kathoriya tharu": "tkt",
+    "kati": "bsh",
+    "kazakh": "kk",
+    "keiyo": "eyo",
+    "khams tibetan": "khg",
+    "khana": "ogo",
+    "khetrani": "xhe",
+    "khmer": "km",
+    "khowar": "khw",
+    "kinga": "zga",
+    "kinnauri": "kfk",
+    "kinyarwanda": "rw",
+    "kirghiz": "ky",
+    "kirya-konzəl": "fkk",
+    "kochila tharu": "thq",
+    "kohistani shina": "plk",
+    "kohumono": "bcs",
+    "kok borok": "trp",
+    "kol (papua new guinea)": "kol",
+    "kom (cameroon)": "bkm",
+    "koma": "kmy",
+    "konkani": "knn",
+    "konzo": "koo",
+    "korean": "ko",
+    "korwa": "kfp",
+    "kota (india)": "kfe",
+    "koti": "eko",
+    "kuanua": "ksd",
+    "kuanyama": "kj",
+    "kui (india)": "uki",
+    "kulung (nigeria)": "bbu",
+    "kuot": "kto",
+    "kushi": "kuh",
+    "kwambi": "kwm",
+    "kwasio": "nmg",
+    "lala-roba": "lla",
+    "lamang": "hia",
+    "lao": "lo",
+    "larike-wakasihu": "alo",
+    "lasi": "lss",
+    "latgalian": "ltg",
+    "latvian": "lv",
+    "levantine arabic": "apc",
+    "liana-seti": "ste",
+    "liberia kpelle": "xpe",
+    "liberian english": "lir",
+    "libyan arabic": "ayl",
+    "ligurian": "lij",
+    "lijili": "mgi",
+    "lingala": "ln",
+    "lithuanian": "lt",
+    "loarki": "lrk",
+    "logooli": "rag",
+    "logudorese sardinian": "src",
+    "loja highland quichua": "qvj",
+    "loloda": "loa",
+    "longuda": "lnu",
+    "loxicha zapotec": "ztp",
+    "luba-lulua": "lua",
+    "luo": "luo",
+    "lushai": "lus",
+    "luxembourgish": "lb",
+    "maasina fulfulde": "ffm",
+    "maba (chad)": "mde",
+    "macedo-romanian": "rup",
+    "macedonian": "mk",
+    "mada (cameroon)": "mxu",
+    "mafa": "maf",
+    "maithili": "mai",
+    "malay": "ms",
+    "malayalam": "ml",
+    "mali": "gcc",
+    "malinaltepec me'phaa": "tcf",
+    "maltese": "mt",
+    "mandara": "tbf",
+    "mandjak": "mfv",
+    "manggarai": "mqy",
+    "manipuri": "mni",
+    "mansoanka": "msw",
+    "manx": "gv",
+    "maori": "mi",
+    "marathi": "mr",
+    "marghi central": "mrt",
+    "marghi south": "mfm",
+    "maria (india)": "mrr",
+    "marwari (pakistan)": "mve",
+    "masana": "mcn",
+    "masikoro malagasy": "msh",
+    "matsés": "mcf",
+    "mazaltepec zapotec": "zpy",
+    "mazatlán mazatec": "vmz",
+    "mazatlán mixe": "mzl",
+    "mbe": "mfo",
+    "mbo (cameroon)": "mbo",
+    "mbum": "mdd",
+    "medumba": "byv",
+    "mekeo": "mek",
+    "meru": "mer",
+    "mesopotamian arabic": "acm",
+    "mewari": "mtr",
+    "min nan chinese": "nan",
+    "mingrelian": "xmf",
+    "mitlatongo mixtec": "vmm",
+    "miya": "mkf",
+    "mokpwe": "bri",
+    "moksha": "mdf",
+    "mom jango": "ver",
+    "mongolian": "mn",
+    "moroccan arabic": "ary",
+    "motu": "meu",
+    "mpiemo": "mcx",
+    "mpumpong": "mgg",
+    "mundang": "mua",
+    "mungaka": "mhk",
+    "musey": "mse",
+    "musgu": "mug",
+    "musi": "mui",
+    "naba": "mne",
+    "najdi arabic": "ars",
+    "nalik": "nal",
+    "nawdm": "nmz",
+    "ndonga": "ng",
+    "neapolitan": "nap",
+    "nepali": "npi",
+    "ngamo": "nbh",
+    "ngas": "anc",
+    "ngiemboon": "nnh",
+    "ngizim": "ngi",
+    "ngomba": "jgo",
+    "ngombale": "nla",
+    "nigerian fulfulde": "fuv",
+    "nigerian pidgin": "pcm",
+    "nimadi": "noe",
+    "nobiin": "fia",
+    "north mesopotamian arabic": "ayp",
+    "north moluccan malay": "max",
+    "northern betsimisaraka malagasy": "bmm",
+    "northern hindko": "hno",
+    "northern kurdish": "kmr",
+    "northern pame": "pmq",
+    "northern pashto": "pbu",
+    "northern uzbek": "uzn",
+    "northwest gbaya": "gya",
+    "norwegian": "no",
+    "norwegian bokmål": "nb",
+    "norwegian nynorsk": "nn",
+    "notsi": "ncf",
+    "nyankpa": "yes",
+    "nyungwe": "nyu",
+    "nzanyi": "nja",
+    "nüpode huitoto": "hux",
+    "occitan": "oc",
+    "od": "odk",
+    "odia": "ory",
+    "odual": "odu",
+    "omani arabic": "acx",
+    "orizaba nahuatl": "nlv",
+    "orma": "orc",
+    "ormuri": "oru",
+    "oromo": "om",
+    "pahari-potwari": "phr",
+    "paiwan": "pwn",
+    "panjabi": "pa",
+    "papuan malay": "pmy",
+    "parkari koli": "kvx",
+    "pedi": "nso",
+    "pero": "pip",
+    "persian": "fa",
+    "petats": "pex",
+    "phalura": "phl",
+    "piemontese": "pms",
+    "piya-kwonci": "piy",
+    "plateau malagasy": "plt",
+    "polish": "pl",
+    "poqomam": "poc",
+    "portuguese": "pt",
+    "pulaar": "fuc",
+    "pular": "fuf",
+    "puno quechua": "qxp",
+    "pushto": "ps",
+    "pökoot": "pko",
+    "qaqet": "byx",
+    "quiotepec chinantec": "chq",
+    "rana tharu": "thr",
+    "rangi": "lag",
+    "rapoisi": "kyx",
+    "ratahan": "rth",
+    "rayón zoque": "zor",
+    "romanian": "ro",
+    "romansh": "rm",
+    "rombo": "rof",
+    "rotokas": "roo",
+    "rukai": "dru",
+    "russian": "ru",
+    "sacapulteco": "quv",
+    "saidi arabic": "aec",
+    "sakalava malagasy": "skg",
+    "sakizaya": "szy",
+    "saleman": "sau",
+    "samba daka": "ccg",
+    "samba leko": "ndi",
+    "san felipe otlaltepec popoloca": "pow",
+    "san francisco del mar huave": "hue",
+    "san juan atzingo popoloca": "poe",
+    "san martín itunyoso triqui": "trq",
+    "san miguel el grande mixtec": "mig",
+    "sansi": "ssi",
+    "sanskrit": "sa",
+    "santa ana de tusi pasco quechua": "qxt",
+    "santa catarina albarradas zapotec": "ztn",
+    "santali": "sat",
+    "santiago del estero quichua": "qus",
+    "saposa": "sps",
+    "saraiki": "skr",
+    "sardinian": "sc",
+    "saya": "say",
+    "sediq": "trv",
+    "serbian": "sr",
+    "seri": "sei",
+    "shina": "scl",
+    "shona": "sn",
+    "siar-lak": "sjr",
+    "sibe": "nco",
+    "sicilian": "scn",
+    "sihuas ancash quechua": "qws",
+    "sikkimese": "sip",
+    "sinaugoro": "snc",
+    "sindhi": "sd",
+    "sindhi bhil": "sbn",
+    "sinhala": "si",
+    "sinicahua mixtec": "xti",
+    "sipacapense": "qum",
+    "siwai": "siw",
+    "slovak": "sk",
+    "slovenian": "sl",
+    "solos": "sol",
+    "somali": "so",
+    "soninke": "snk",
+    "south giziga": "giz",
+    "south ucayali ashéninka": "cpy",
+    "southeastern nochixtlán mixtec": "mxy",
+    "southern betsimisaraka malagasy": "bzc",
+    "southern pashto": "pbt",
+    "southern pastaza quechua": "qup",
+    "soyaltepec mazatec": "vmp",
+    "spanish": "es",
+    "standard arabic": "arb",
+    "standard moroccan tamazight": "zgh",
+    "sudanese arabic": "apd",
+    "sulka": "sua",
+    "svan": "sva",
+    "swahili": "sw",
+    "swedish": "sv",
+    "tae'": "rob",
+    "tahaggart tamahaq": "thv",
+    "taita": "dav",
+    "tajik": "tg",
+    "tamil": "ta",
+    "tandroy-mahafaly malagasy": "tdx",
+    "tangale": "tan",
+    "tanosy malagasy": "txy",
+    "tarok": "yer",
+    "tatar": "tt",
+    "tedaga": "tuq",
+    "telugu": "te",
+    "tem": "kdh",
+    "teop": "tio",
+    "tepeuxila cuicatec": "cux",
+    "tepinapa chinantec": "cte",
+    "tera": "ttr",
+    "terei": "buo",
+    "termanu": "twu",
+    "tesaka malagasy": "tkg",
+    "tetelcingo nahuatl": "nhg",
+    "teutila cuicatec": "cut",
+    "thai": "th",
+    "tibetan": "bo",
+    "tidaá mixtec": "mtx",
+    "tidore": "tvo",
+    "tigak": "tgc",
+    "tigre": "tig",
+    "tigrinya": "ti",
+    "tilquiapan zapotec": "zts",
+    "tinputz": "tpz",
+    "tlacoapa me'phaa": "tpl",
+    "tlacoatzintepec chinantec": "ctl",
+    "tlingit": "tli",
+    "toki pona": "tok",
+    "tomoip": "tqp",
+    "tondano": "tdn",
+    "tonsea": "txs",
+    "tooro": "ttj",
+    "torau": "ttu",
+    "torwali": "trw",
+    "tsimihety malagasy": "xmw",
+    "tsotso": "lto",
+    "tswana": "tn",
+    "tugen": "tuy",
+    "tuki": "bag",
+    "tula": "tul",
+    "tulu": "tcy",
+    "tunen": "tvu",
+    "tungag": "lcm",
+    "tunisian arabic": "aeb",
+    "tupuri": "tui",
+    "turkana": "tuv",
+    "turkish": "tr",
+    "turkmen": "tk",
+    "tututepec mixtec": "mtu",
+    "twi": "tw",
+    "ubaghara": "byc",
+    "uighur": "ug",
+    "ukrainian": "uk",
+    "umbundu": "umb",
+    "upper sorbian": "hsb",
+    "urdu": "ur",
+    "ushojo": "ush",
+    "uzbek": "uz",
+    "vai": "vai",
+    "vietnamese": "vi",
+    "votic": "vot",
+    "võro": "vro",
+    "waci gbe": "wci",
+    "wadiyara koli": "kxp",
+    "waja": "wja",
+    "wakhi": "wbl",
+    "wanga": "lwg",
+    "wapan": "juk",
+    "warji": "wji",
+    "welsh": "cy",
+    "wemale": "weo",
+    "western frisian": "fy",
+    "western highland purepecha": "pua",
+    "western juxtlahuaca mixtec": "jmx",
+    "western maninkakan": "mlq",
+    "western mari": "mrj",
+    "western niger fulfulde": "fuh",
+    "western panjabi": "pnb",
+    "wolof": "wo",
+    "wuzlam": "udl",
+    "xanaguía zapotec": "ztg",
+    "xhosa": "xh",
+    "yace": "ekr",
+    "yakut": "sah",
+    "yalahatan": "jal",
+    "yanahuanca pasco quechua": "qur",
+    "yangben": "yav",
+    "yaqui": "yaq",
+    "yauyos quechua": "qux",
+    "yekhee": "ets",
+    "yiddish": "yi",
+    "yidgha": "ydg",
+    "yoruba": "yo",
+    "yutanduchi mixtec": "mab",
+    "zacatlán-ahuacatlán-tepetzintla nahuatl": "nhi",
+    "zarma": "dje",
+    "zaza": "zza",
+    "zulu": "zu",
+    "ömie": "aom",
+}
+# Lookup sets derived from the table above: all lowercase language names and
+# all language IDs.
+LANG_NAMES = set(LANG_NAME_TO_ID)
+LANG_IDS = set(LANG_NAME_TO_ID.values())
+# Hand-written display names, keyed by lowercase language name, for the cases
+# where str.title() would not reproduce the canonical casing from the TSV.
+_TITLE_EXCEPTIONS = {
+    "fe'fe'": "Fe'fe'",
+    "dũya": "Dũya",
+    "santiago del estero quichua": "Santiago del Estero Quichua",
+    "santa ana de tusi pasco quechua": "Santa Ana de Tusi Pasco Quechua",
+    "malinaltepec me'phaa": "Malinaltepec Me'phaa",
+    "tlacoapa me'phaa": "Tlacoapa Me'phaa",
+}
+def lang_display_name(name: str) -> str:
+    """Convert a lowercase language name into its display form.
+    Names listed in ``_TITLE_EXCEPTIONS`` use the casing stored there (for
+    example letters after an apostrophe and the small words "de"/"del" stay
+    lowercase); every other name is returned as ``name.title()``.
+    """
+    try:
+        return _TITLE_EXCEPTIONS[name]
+    except KeyError:
+        return name.title()

runtime/omnivoice/utils/text.py ADDED Viewed

	@@ -0,0 +1,219 @@

+#!/usr/bin/env python3
+# Copyright    2026  Xiaomi Corp.        (authors:  Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Text processing utilities for TTS inference.
+Provides:
+- ``chunk_text_punctuation()``: Splits long text into model-friendly chunks at
+  sentence boundaries, with abbreviation-aware punctuation splitting.
+- ``add_punctuation()``: Appends missing end punctuation (Chinese or English).
+"""
+from typing import List, Optional
+# Characters after which a sentence may be split (ASCII and full-width forms).
+SPLIT_PUNCTUATION = set(".,;:!?。，；：！？")
+# Closing quotes and brackets that stay attached to the preceding sentence when
+# they directly follow a split point. Typographic quotes are written as
+# escapes so they can never be confused with the string delimiters.
+CLOSING_MARKS = set("\"'\u201d\u2019）]》》>」】")
+# Characters that count as terminal punctuation for add_punctuation().
+# NOTE(review): the typographic quote entries were rendered as bare straight
+# quotes, which turned `"""` into the start of a triple-quoted string that
+# swallowed every entry up to the next `"""` (all full-width punctuation).
+# They are restored here as explicit escapes -- confirm the exact code points
+# against the upstream file.
+END_PUNCTUATION = {
+    ";",
+    ":",
+    ",",
+    ".",
+    "!",
+    "?",
+    "…",
+    ")",
+    "]",
+    "}",
+    '"',
+    "'",
+    "\u201c",  # left double quotation mark
+    "\u2018",  # left single quotation mark
+    "；",
+    "：",
+    "，",
+    "。",
+    "！",
+    "？",
+    "、",
+    # Two-character entry kept for compatibility; a last-character lookup is
+    # matched by the single "…" entry above.
+    "……",
+    "）",
+    "】",
+    "\u201d",  # right double quotation mark
+    "\u2019",  # right single quotation mark
+}
+# Period-terminated abbreviations. A "." that completes one of these words is
+# not treated as a sentence boundary by chunk_text_punctuation().
+ABBREVIATIONS = set(
+    (
+        # Personal, professional and military titles
+        "Mr. Mrs. Ms. Dr. Prof. Sr. Jr. Rev. Fr. Hon. Pres. Gov. "
+        "Capt. Gen. Sen. Rep. Col. Maj. Lt. Cmdr. Sgt. Cpl. "
+        # Organisations
+        "Co. Corp. Inc. Ltd. Est. Dept. "
+        # Addresses, places and numbering
+        "St. Ave. Blvd. Rd. Mt. Ft. No. "
+        # Months
+        "Jan. Feb. Mar. Apr. Aug. Sep. Sept. Oct. Nov. Dec. "
+        # Latin and miscellaneous
+        "i.e. e.g. vs. Vs. Etc. approx. fig. def."
+    ).split()
+)
+def chunk_text_punctuation(
+    text: str,
+    chunk_len: int,
+    min_chunk_len: Optional[int] = None,
+) -> List[str]:
+    """Split ``text`` into chunks at punctuation boundaries.
+    The text is walked character by character and cut into sentences after
+    every character in ``SPLIT_PUNCTUATION``, except for a "." that belongs
+    to an entry of ``ABBREVIATIONS`` (e.g. "Mr.", "No.", "e.g."). Consecutive
+    sentences are then merged greedily while the merged length stays within
+    ``chunk_len``, and chunks shorter than ``min_chunk_len`` are folded into
+    a neighbouring chunk.
+    Args:
+        text: The text to split.
+        chunk_len: Maximum number of characters when merging sentences. A
+            single sentence longer than this is kept whole, and folding
+            undersized chunks can also push a chunk past it.
+        min_chunk_len: If given, chunks shorter than this many characters
+            are appended to the previous chunk (a short first chunk absorbs
+            the second one instead).
+    Returns:
+        The non-empty chunks, each stripped of surrounding whitespace.
+    """
+    # 1. Split the characters into sentences at punctuation marks.
+    sentences = []
+    current_sentence = []
+    tokens_list = list(text)
+    full_text = "".join(tokens_list)
+    for index, token in enumerate(tokens_list):
+        # Punctuation or a closing mark right after a split belongs to the
+        # sentence that was just closed, not to the next one.
+        if (
+            len(current_sentence) == 0
+            and len(sentences) != 0
+            and (token in SPLIT_PUNCTUATION or token in CLOSING_MARKS)
+        ):
+            sentences[-1].append(token)
+            continue
+        current_sentence.append(token)
+        if token not in SPLIT_PUNCTUATION:
+            continue
+        is_abbreviation = False
+        if token == ".":
+            temp_str = "".join(current_sentence).strip()
+            if temp_str:
+                last_word = temp_str.split()[-1]
+                if last_word in ABBREVIATIONS:
+                    is_abbreviation = True
+                else:
+                    # Inner period of a multi-period abbreviation such as
+                    # "e.g.": the word so far ("e.") is only a prefix, so
+                    # look ahead in the text for the complete abbreviation.
+                    # Without this the split fires before "e.g." can match.
+                    word_start = index + 1 - len(last_word)
+                    is_abbreviation = any(
+                        len(abbr) > len(last_word)
+                        and abbr.startswith(last_word)
+                        and full_text.startswith(abbr, word_start)
+                        for abbr in ABBREVIATIONS
+                    )
+        if not is_abbreviation:
+            sentences.append(current_sentence)
+            current_sentence = []
+    # Whatever follows the last split point forms the final sentence.
+    if len(current_sentence) != 0:
+        sentences.append(current_sentence)
+    # 2. Merge consecutive sentences while they fit into chunk_len.
+    merged_chunks = []
+    current_chunk = []
+    for sentence in sentences:
+        if len(current_chunk) + len(sentence) <= chunk_len:
+            current_chunk.extend(sentence)
+        else:
+            if len(current_chunk) > 0:
+                merged_chunks.append(current_chunk)
+            current_chunk = sentence
+    if len(current_chunk) > 0:
+        merged_chunks.append(current_chunk)
+    # 3. Post-process: fold undersized chunks into the previous chunk, or
+    #    into the next chunk when the undersized one is the first chunk.
+    if min_chunk_len is not None:
+        first_chunk_short_flag = (
+            len(merged_chunks) > 0 and len(merged_chunks[0]) < min_chunk_len
+        )
+        final_chunks = []
+        for i, chunk in enumerate(merged_chunks):
+            if i == 1 and first_chunk_short_flag:
+                # The short first chunk absorbs the second chunk.
+                final_chunks[-1].extend(chunk)
+            elif len(chunk) >= min_chunk_len or len(final_chunks) == 0:
+                final_chunks.append(chunk)
+            else:
+                final_chunks[-1].extend(chunk)
+    else:
+        final_chunks = merged_chunks
+    # Drop chunks that are empty once surrounding whitespace is removed.
+    chunk_strings = [
+        "".join(chunk).strip() for chunk in final_chunks if "".join(chunk).strip()
+    ]
+    return chunk_strings
+def add_punctuation(text: str):
+    """Ensure the stripped text ends with a punctuation mark.
+    Surrounding whitespace is removed first. If the result is non-empty and
+    its last character is not in ``END_PUNCTUATION``, "。" is appended when the
+    text contains a character in the range U+4E00..U+9FFF, otherwise ".".
+    """
+    stripped = text.strip()
+    if not stripped or stripped[-1] in END_PUNCTUATION:
+        return stripped
+    for char in stripped:
+        if "\u4e00" <= char <= "\u9fff":
+            return stripped + "。"
+    return stripped + "."

runtime/omnivoice/utils/voice_design.py ADDED Viewed

	@@ -0,0 +1,68 @@

+#!/usr/bin/env python3
+# Copyright    2026  Xiaomi Corp.        (authors:  Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Voice-design instruct constants for TTS inference.
+Defines speaker attribute tags (gender, age, pitch, accent, dialect) and
+translation/validation utilities between English and Chinese. Used by
+``OmniVoice.generate()`` for voice design mode.
+"""
+import re
+# Matches one character in the range U+4E00..U+9FFF; used to tell Chinese tags
+# from English ones.
+_ZH_RE = re.compile(r'[\u4e00-\u9fff]')
+# Each entry groups tags that are mutually exclusive. Dict entries map an
+# English tag to its Chinese counterpart; set entries hold tags that exist in
+# one language only (accents are English-only, dialects are Chinese-only).
+_INSTRUCT_CATEGORIES = [
+    # Gender
+    {"male": "男", "female": "女"},
+    # Age
+    {
+        "child": "儿童",
+        "teenager": "少年",
+        "young adult": "青年",
+        "middle-aged": "中年",
+        "elderly": "老年",
+    },
+    # Pitch
+    {
+        "very low pitch": "极低音调",
+        "low pitch": "低音调",
+        "moderate pitch": "中音调",
+        "high pitch": "高音调",
+        "very high pitch": "极高音调",
+    },
+    # Whisper
+    {"whisper": "耳语"},
+    # Accent (English-only, no Chinese counterpart)
+    {
+        "american accent",
+        "british accent",
+        "australian accent",
+        "chinese accent",
+        "canadian accent",
+        "indian accent",
+        "korean accent",
+        "portuguese accent",
+        "russian accent",
+        "japanese accent",
+        "armenian accent",
+        "eastern armenian accent",
+        "western armenian accent",
+        "yerevan accent",
+    },
+    # Dialect (Chinese-only, no English counterpart)
+    {
+        "河南话",
+        "陕西话",
+        "四川话",
+        "贵州话",
+        "云南话",
+        "桂林话",
+        "济南话",
+        "石家庄话",
+        "甘肃话",
+        "宁夏话",
+        "青岛话",
+        "东北话",
+    },
+]
+# Translation tables and the per-category exclusive groups, filled below.
+_INSTRUCT_EN_TO_ZH = {}
+_INSTRUCT_ZH_TO_EN = {}
+_INSTRUCT_MUTUALLY_EXCLUSIVE = []
+for _cat in _INSTRUCT_CATEGORIES:
+    if not isinstance(_cat, dict):
+        # Single-language category: the set itself is the exclusive group.
+        _INSTRUCT_MUTUALLY_EXCLUSIVE.append(set(_cat))
+        continue
+    _INSTRUCT_EN_TO_ZH.update(_cat)
+    _INSTRUCT_ZH_TO_EN.update({zh: en for en, zh in _cat.items()})
+    # Both language forms of a bilingual category exclude each other.
+    _INSTRUCT_MUTUALLY_EXCLUSIVE.append({*_cat.keys(), *_cat.values()})
+# Every valid tag in either language: the union of all exclusive groups.
+_INSTRUCT_ALL_VALID = set().union(*_INSTRUCT_MUTUALLY_EXCLUSIVE)
+# Split the valid tags by language, based on whether they contain a Chinese
+# character.
+_INSTRUCT_VALID_ZH = frozenset(filter(_ZH_RE.search, _INSTRUCT_ALL_VALID))
+_INSTRUCT_VALID_EN = frozenset(_INSTRUCT_ALL_VALID) - _INSTRUCT_VALID_ZH

runtime/pyproject.toml ADDED Viewed

	@@ -0,0 +1,21 @@

+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "avoice-runtime"
+version = "0.1.0"
+description = "Runtime package for the AVoice Armenian TTS model."
+readme = "README.md"
+requires-python = ">=3.10"
+license = { text = "GPL-2.0" }
+dependencies = ["numpy>=1.26","torch>=2.4","torchaudio>=2.4","transformers>=5.5.0","huggingface_hub>=0.24","soundfile>=0.12","sentencepiece>=0.2","fastapi>=0.115","uvicorn>=0.30","pydub","librosa"]
+[project.scripts]
+avoice = "omnivoice.cli.infer:main"
+omnivoice-infer = "omnivoice.cli.infer:main"
+omnivoice-api = "omnivoice.server.app:main"
+omnivoice-prefetch = "omnivoice.server.prefetch:main"
+[tool.setuptools.packages.find]
+include = ["omnivoice*"]

runtime/requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+numpy>=1.26
+torch>=2.4
+torchaudio>=2.4
+transformers>=5.5.0
+huggingface_hub>=0.24
+soundfile>=0.12
+sentencepiece>=0.2
+fastapi>=0.115
+uvicorn>=0.30
+pydub
+librosa