Text-to-Speech
Transformers
Safetensors
Qwen3-TTS
English
text-generation
tts
prompttts
qwen3-tts
voice-design
vocence
Instructions to use michael-chan-000/tts-v21 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use michael-chan-000/tts-v21 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-to-speech", model="michael-chan-000/tts-v21")

# Load model directly
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("michael-chan-000/tts-v21", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| """Vocence engine for the merged Qwen3-TTS VoiceDesign checkpoint. | |
| The Vocence Chutes wrapper instantiates ``Miner`` with the on-disk path of the HF | |
| snapshot and then drives it through the contract: | |
| Miner(path_hf_repo: Path) | |
| warmup() -> None | |
| generate_wav(instruction: str, text: str) -> tuple[np.ndarray, int] | |
| All weights, the audio codec, and the tokenizer ship together in the snapshot — | |
| nothing is fetched at runtime. | |
| """ | |
| from __future__ import annotations | |
| import dataclasses | |
| import re | |
| import threading | |
| from pathlib import Path | |
| from typing import Any | |
| import numpy as np | |
# File whose presence Miner.__init__ checks before accepting a snapshot directory.
_REPO_REQUIRED_FILE: str = "config.json"
# Optional per-snapshot settings file; engine defaults apply when it is absent.
_RUNTIME_CONFIG_FILE: str = "vocence_config.yaml"
| # --------------------------------------------------------------------------- # | |
| # Instruction rewrite (tag -> natural-language preamble) # | |
| # --------------------------------------------------------------------------- # | |
| # | |
| # Validators may send instructions in the legacy pipe-tag form, e.g. | |
| # ``| gender: male | pitch: mid | accent: uk |``. The base voice_design | |
| # checkpoint was conditioned on natural-language descriptions, so we paraphrase | |
| # the tags into a short imperative preamble and *prepend* it to whatever the | |
| # caller sent. Free-form prompts (no ``| key: value |`` pairs) pass through | |
| # unchanged because ``_parse_instruction`` returns ``{}`` for them. | |
| # One ``| key: value |`` pair. Value runs until the next ``|`` or end-of-string; | |
| # the lookahead keeps the trailing ``|`` available for the next iteration. | |
_INSTRUCTION_TAG_RE = re.compile(
    r"\|\s*([A-Za-z_]+)\s*:\s*([^|]+?)\s*(?=\||$)"
)

# Phrase tables: tag value -> wording inserted into the natural-language
# preamble. A value missing from its table contributes nothing.
_GENDER_PHRASE = {
    "male": "male", "female": "female", "neutral": "gender-neutral",
}
_PITCH_PHRASE = {
    "low": "deep low-pitched voice", "mid": "medium natural pitch", "high": "high-pitched voice",
}
_SPEED_PHRASE = {
    "slow": "slow deliberate pace", "normal": "natural conversational pace", "fast": "brisk fast pace",
}
_AGE_PHRASE = {
    "child": "child", "young_adult": "young adult", "adult": "adult", "senior": "elderly senior",
}
_EMOTION_PHRASE = {
    "neutral": "neutral composed delivery",
    "happy": "cheerful happy upbeat warm",
    "sad": "sorrowful sad subdued downcast",
    "angry": "firm angry forceful assertive tense",
    "calm": "calm relaxed measured peaceful unhurried",
    "excited": "excited enthusiastic energetic lively",
    "serious": "serious grave deliberate weighty",
    "fearful": "nervous fearful hesitant trembling",
}
_TONE_PHRASE = {
    "warm": "warm", "cold": "cold detached", "friendly": "friendly",
    "formal": "formal", "casual": "casual", "authoritative": "authoritative commanding",
}
_ACCENT_PHRASE = {
    "us": "standard American English accent with rhotic r sounds",
    "uk": "standard British English accent with non-rhotic received pronunciation",
    "au": "Australian English accent",
    "in": "Indian English accent",
    "neutral": "neutral international English accent",
    "other": "non-native English accent",
}

_VOWELS = frozenset("aeiou")


def _parse_instruction(instruction: str) -> dict[str, str]:
    """Parse a pipe-tag instruction (``| key: value | ...``) into a flat dict.

    Keys and values are stripped and lowercased. When the same key occurs
    more than once, the last occurrence wins.

    Returns:
        The parsed tags, or ``{}`` when *instruction* is empty, contains no
        ``|`` character, or yields no ``key: value`` pair.
    """
    if not instruction or "|" not in instruction:
        return {}
    out: dict[str, str] = {}
    for m in _INSTRUCTION_TAG_RE.finditer(instruction):
        key = m.group(1).strip().lower()
        val = m.group(2).strip().lower()
        if key and val:
            out[key] = val
    return out


def _with_article(phrase: str) -> str:
    """Prefix *phrase* with ``a`` or ``an`` according to its first letter."""
    article = "an" if phrase[:1].lower() in _VOWELS else "a"
    return f"{article} {phrase}"


def _build_natural_preamble(parsed: dict[str, str]) -> str:
    """Turn parsed tags into a one-line natural-language voice description.

    Recognised keys are ``gender``, ``age_group``, ``emotion``, ``accent``,
    ``pitch``, ``speed`` and ``tone``; each value is looked up in its phrase
    table and silently skipped when it is not listed there.

    Returns:
        ``"Speak as <parts>."`` followed by a fixed prosody hint, or ``""``
        when no tag maps to a known phrase.
    """
    gender = _GENDER_PHRASE.get(parsed.get("gender", ""), "")
    age = _AGE_PHRASE.get(parsed.get("age_group", ""), "")
    pitch = _PITCH_PHRASE.get(parsed.get("pitch", ""), "")
    speed = _SPEED_PHRASE.get(parsed.get("speed", ""), "")
    emotion = _EMOTION_PHRASE.get(parsed.get("emotion", ""), "")
    tone = _TONE_PHRASE.get(parsed.get("tone", ""), "")
    accent = _ACCENT_PHRASE.get(parsed.get("accent", ""), "")

    parts: list[str] = []
    # Gender-first to avoid timbre drift on emotion-heavy prompts
    identity = " ".join(p for p in (gender, age) if p)
    if identity:
        # The article follows the first word ("an adult voice", "a male voice").
        parts.append(f"{_with_article(identity)} voice")
    if emotion:
        parts.append(emotion)
    if accent:
        # "an Australian ..." / "an Indian ..." need "an", the others "a".
        parts.append(f"speaking with {_with_article(accent)}")
    if pitch:
        parts.append(pitch)
    if speed:
        parts.append(speed)
    if tone:
        parts.append(f"{tone} tone")
    if not parts:
        return ""
    preamble = "Speak as " + ", ".join(parts) + "."
    return preamble + " Use natural human prosody with realistic breath placement and varied intonation."


def _enhance_instruction(instruction: str) -> str:
    """Prepend a natural-language preamble derived from any pipe tags.

    The input is returned unchanged when it has no parseable tags or when
    none of them map to a known phrase. Otherwise the result is the preamble,
    one space, and then the original instruction verbatim.
    """
    parsed = _parse_instruction(instruction)
    if not parsed:
        return instruction
    preamble = _build_natural_preamble(parsed)
    if not preamble:
        return instruction
    return f"{preamble} {instruction}"
| # --------------------------------------------------------------------------- # | |
| # Text normalization # | |
| # --------------------------------------------------------------------------- # | |
_NUM_WORDS = {
    "0": "zero", "1": "one", "2": "two", "3": "three", "4": "four",
    "5": "five", "6": "six", "7": "seven", "8": "eight", "9": "nine",
    "10": "ten", "11": "eleven", "12": "twelve", "13": "thirteen",
    "14": "fourteen", "15": "fifteen", "16": "sixteen", "17": "seventeen",
    "18": "eighteen", "19": "nineteen", "20": "twenty", "30": "thirty",
    "40": "forty", "50": "fifty", "60": "sixty", "70": "seventy",
    "80": "eighty", "90": "ninety", "100": "one hundred",
}
_ABBREV = {
    "Mr.": "Mister", "Mrs.": "Missus", "Dr.": "Doctor", "St.": "Saint",
    "etc.": "et cetera", "vs.": "versus", "approx.": "approximately",
    "dept.": "department", "govt.": "government", "mgr.": "manager",
}

# Pre-compiled at module load so we don't recompile on every call.

# An abbreviation only counts when it is not glued to a preceding letter, so
# ordinary words such as "adept." or "revs." are left alone.
_ABBREV_RE = re.compile(
    r"(?<![A-Za-z])(?:"
    + "|".join(re.escape(a) for a in sorted(_ABBREV, key=len, reverse=True))
    + r")"
)

# A whole-number amount: the run of digits must not continue and must not be
# followed by a decimal point or thousands separator ("$5.50", "$1,000").
_WHOLE_AMOUNT = r"(\d+)(?!\d)(?![.,]\d)"
_DOLLAR_RE = re.compile(r"\$" + _WHOLE_AMOUNT)
_POUND_RE = re.compile(r"£" + _WHOLE_AMOUNT)
_EURO_RE = re.compile(r"€" + _WHOLE_AMOUNT)

# Standalone 1-2 digit integer that is not one side of "3.5" or "1,000".
_SMALL_INT_RE = re.compile(r"(?<!\d[.,])\b(\d{1,2})\b(?![.,]\d)")

# The conjunction must follow a word directly: no comma is inserted after
# existing punctuation, in the middle of a whitespace run, or at string start.
_CONJ_RE = re.compile(
    r"(?<=[^\s,;:.!?\-–—(])\s+(but|however|although|though|yet)\s+",
    flags=re.IGNORECASE,
)


def _spell_number(digits: str) -> str:
    """Return the English words for *digits*, or *digits* itself if unknown.

    Covers every entry of ``_NUM_WORDS`` plus the compound values 21-99
    ("twenty-one"). Anything else (leading zeros, 3+ digits other than
    "100", non-ASCII digits) is returned unchanged.
    """
    word = _NUM_WORDS.get(digits)
    if word is not None:
        return word
    if len(digits) == 2 and digits[0] != "1" and digits[1] != "0":
        tens = _NUM_WORDS.get(digits[0] + "0")
        ones = _NUM_WORDS.get(digits[1])
        if tens and ones:
            return f"{tens}-{ones}"
    return digits


def _spell_currency(pattern: "re.Pattern[str]", singular: str, plural: str, text: str) -> str:
    """Rewrite each currency match of *pattern* as ``<amount> <unit>``."""

    def render(match: "re.Match[str]") -> str:
        digits = match.group(1)
        unit = singular if int(digits) == 1 else plural
        return f"{_spell_number(digits)} {unit}"

    return pattern.sub(render, text)


def _normalize_text_for_tts(text: str) -> str:
    """Rewrite a transcript so the talker emits cleaner, more prosodic speech.

    Steps, in order:

    1. Expand the abbreviations listed in ``_ABBREV`` (case-sensitive, and
       only when not preceded by a letter).
    2. Turn currency-prefixed whole numbers into words with the correct
       singular/plural unit (``$1`` -> ``one dollar``, ``$5`` ->
       ``five dollars``). Amounts with no known spelling keep their digits
       (``$1000`` -> ``1000 dollars``).
    3. Spell out standalone integers 0-99.
    4. Insert a comma before ``but/however/although/though/yet`` when the
       conjunction directly follows a word.

    Decimals, numbers with thousands separators, larger numbers, and unknown
    abbreviations pass through unchanged.

    Returns:
        The rewritten text with leading and trailing whitespace stripped.
    """
    # Expand known abbreviations
    text = _ABBREV_RE.sub(lambda m: _ABBREV[m.group(0)], text)

    # Expand $N / £N / €N → "N dollars/pounds/euros"
    text = _spell_currency(_DOLLAR_RE, "dollar", "dollars", text)
    text = _spell_currency(_POUND_RE, "pound", "pounds", text)
    text = _spell_currency(_EURO_RE, "euro", "euros", text)

    # Expand standalone small integers (not part of larger numbers)
    text = _SMALL_INT_RE.sub(lambda m: _spell_number(m.group(1)), text)

    # Add comma pause before coordinating conjunctions
    text = _CONJ_RE.sub(r", \1 ", text)
    return text.strip()
@dataclasses.dataclass
class _RuntimeOpts:
    """Subset of vocence_config.yaml that the engine actually consumes."""

    # Language name forwarded to the model on every synthesis call.
    language: str = "English"
    sample_rate: int = 24000
    # Character caps applied by Miner.generate_wav; 0 or a negative value
    # disables the cap.
    max_instruction_chars: int = 600
    max_text_chars: int = 2000
    # Lower-cased preferences consulted when the model is loaded.
    device_pref: str = "cuda"
    dtype_pref: str = "bfloat16"
    flash_attention_2: bool = False

    @classmethod
    def from_repo(cls, repo: Path) -> "_RuntimeOpts":
        """Build options from ``<repo>/vocence_config.yaml``.

        Args:
            repo: Directory of the model snapshot.

        Returns:
            Defaults when the config file does not exist; otherwise values
            read from its ``runtime``, ``generation`` and ``limits``
            sections. A missing key or an explicit ``null`` falls back to
            the default.
        """
        cfg_path = repo / _RUNTIME_CONFIG_FILE
        if not cfg_path.is_file():
            return cls()

        # Imported lazily so PyYAML is only needed when a config file exists.
        from yaml import safe_load

        with cfg_path.open("r", encoding="utf-8") as fh:
            data = safe_load(fh) or {}

        runtime = data.get("runtime") or {}
        generation = data.get("generation") or {}
        limits = data.get("limits") or {}

        def pick(section: dict, key: str, default):
            # An explicit YAML null would otherwise crash int() or become
            # the string "none" after str().
            value = section.get(key)
            return default if value is None else value

        return cls(
            language=str(limits.get("default_language") or runtime.get("default_language") or "English"),
            sample_rate=int(pick(generation, "sample_rate", 24000)),
            max_instruction_chars=int(pick(limits, "max_instruction_chars", 600)),
            max_text_chars=int(pick(limits, "max_text_chars", 2000)),
            device_pref=str(pick(runtime, "device_preference", "cuda")).lower(),
            dtype_pref=str(pick(runtime, "dtype", "bfloat16")).lower(),
            flash_attention_2=bool(pick(runtime, "use_flash_attention_2", False)),
        )
class Miner:
    """Loads merged Qwen3-TTS weights from the snapshot and serves the Vocence API."""

    # Maximum number of seconds warmup() waits for its trial synthesis.
    WARMUP_BUDGET_S = 180.0

    def __init__(self, path_hf_repo: Path) -> None:
        """Validate the snapshot directory, read its options, and load the model.

        Raises:
            FileNotFoundError: ``config.json`` is missing from the snapshot.
            RuntimeError: The model could not be loaded with any attention backend.
        """
        self.repo = Path(path_hf_repo).resolve()
        if not (self.repo / _REPO_REQUIRED_FILE).is_file():
            raise FileNotFoundError(
                f"Snapshot incomplete: {self.repo / _REPO_REQUIRED_FILE} not found"
            )
        self.opts = _RuntimeOpts.from_repo(self.repo)
        self.model = self._build_model()

    def __repr__(self) -> str:
        return f"<Miner repo={self.repo.name} language={self.opts.language!r}>"

    # ------------------------------------------------------------------ #
    # Vocence contract                                                   #
    # ------------------------------------------------------------------ #
    def warmup(self) -> None:
        """Run one short synthesis and fail if it errors or exceeds the budget.

        The synthesis runs on a daemon thread so the wait can be bounded by
        ``WARMUP_BUDGET_S``; on timeout the thread is left running.

        Raises:
            RuntimeError: The trial synthesis raised or did not finish in time.
        """
        outcome: dict[str, Any] = {"ok": False, "err": None}

        def _heat() -> None:
            try:
                self.generate_wav(instruction="Calm neutral delivery.", text="Warmup.")
                outcome["ok"] = True
            except Exception as exc:  # noqa: BLE001 — surface to host
                outcome["err"] = repr(exc)

        worker = threading.Thread(target=_heat, daemon=True)
        worker.start()
        worker.join(timeout=self.WARMUP_BUDGET_S)
        if not outcome["ok"]:
            raise RuntimeError(f"Miner warmup did not complete: {outcome['err'] or 'timeout'}")

    def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
        """Synthesize *text* in the voice described by *instruction*.

        Args:
            instruction: Voice description, free-form or in pipe-tag form.
            text: Transcript to speak.

        Returns:
            ``(waveform, sample_rate)`` where the waveform is a float32 array
            and the sample rate is whatever the model reported.

        Raises:
            ValueError: The model returned no audio.
        """
        # Cap raw inputs first so an oversized payload never reaches the
        # rewriter (which would just throw away the surplus anyway).
        prompt = self._truncate(instruction, self.opts.max_instruction_chars)
        body = self._truncate(text, self.opts.max_text_chars)

        # Tag-form instructions get a natural-language preamble prepended;
        # already-natural instructions pass through untouched.
        prompt = _enhance_instruction(prompt)

        # Spell out numbers/currency, expand a few abbreviations, and add
        # a beat before coordinating conjunctions.
        body = _normalize_text_for_tts(body)

        # The preamble + abbreviation/number expansion can lengthen the
        # strings; re-clip to the same limits so we honour the contract
        # advertised in vocence_config.yaml's ``limits`` block.
        prompt = self._truncate(prompt, self.opts.max_instruction_chars)
        body = self._truncate(body, self.opts.max_text_chars)

        wavs, sample_rate = self.model.generate_voice_design(
            text=body,
            instruct=prompt,
            language=self.opts.language,
        )
        # Explicit length check: ``not wavs`` raises for array-like results.
        if wavs is None or len(wavs) == 0 or wavs[0] is None:
            raise ValueError("Qwen3-TTS returned no audio")
        wave = self._coerce_mono_float32(wavs[0])
        return wave, int(sample_rate)

    # ------------------------------------------------------------------ #
    # Internal                                                           #
    # ------------------------------------------------------------------ #
    @staticmethod
    def _truncate(value: str, limit: int) -> str:
        """Clip *value* to *limit* characters; a zero or negative limit disables clipping."""
        return value[:limit] if limit and limit > 0 else value

    @staticmethod
    def _coerce_mono_float32(arr: Any) -> np.ndarray:
        """Convert *arr* to a float32 array, averaging over axis 1 if multi-dimensional."""
        wave = np.asarray(arr, dtype=np.float32)
        if wave.ndim > 1:
            # NOTE(review): averaging axis 1 assumes a (samples, channels)
            # layout — confirm against what the model actually returns.
            wave = wave.mean(axis=1)
        return wave

    def _build_model(self) -> Any:
        """Load the model from the snapshot, trying attention backends in order.

        CUDA and bfloat16 are used only when both requested and available;
        otherwise the model loads on CPU and/or in float32. When
        flash-attention 2 is requested it is tried first, then ``sdpa``.

        Raises:
            RuntimeError: Every attention backend failed; chained to the last error.
        """
        import torch
        from qwen_tts import Qwen3TTSModel

        cuda_available = bool(torch.cuda.is_available())
        device_map = "cuda:0" if (self.opts.device_pref == "cuda" and cuda_available) else "cpu"
        torch_dtype = (
            torch.bfloat16
            if (self.opts.dtype_pref == "bfloat16" and cuda_available)
            else torch.float32
        )

        attempt_order = ("flash_attention_2", "sdpa") if self.opts.flash_attention_2 else ("sdpa",)
        last_error: BaseException | None = None
        for attn in attempt_order:
            try:
                model = Qwen3TTSModel.from_pretrained(
                    pretrained_model_name_or_path=str(self.repo),
                    device_map=device_map,
                    dtype=torch_dtype,
                    attn_implementation=attn,
                )
            except Exception as exc:  # noqa: BLE001 — try next attn variant
                last_error = exc
                # Make the fallback visible instead of swallowing the failure.
                print(f"[Miner] Qwen3-TTS load with attn={attn} failed: {exc!r}")
                continue
            # Report the dtype actually used, which may differ from the
            # configured preference when CUDA is unavailable.
            print(
                f"[Miner] Qwen3-TTS ready on {device_map} "
                f"(dtype={torch_dtype}, attn={attn})"
            )
            return model
        raise RuntimeError(f"Qwen3-TTS failed to load: {last_error!r}") from last_error