Text-to-Speech
Transformers
Safetensors
Qwen3-TTS
English
text-generation
tts
prompttts
qwen3-tts
voice-design
vocence
Instructions to use michael-chan-000/tts-v21 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use michael-chan-000/tts-v21 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-to-speech", model="michael-chan-000/tts-v21")

# Load model directly
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("michael-chan-000/tts-v21", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
File size: 3,203 Bytes
79529ed | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 | """demo.py — quick smoke test for vocence_miner_v1.
Reads the merged checkpoint either from a local path or from the Hugging Face Hub,
then generates a small set of preset clips that exercise the prompt-following range.
pip install qwen-tts transformers torch soundfile
python demo.py # uses the current directory
python demo.py --source magma90909/vocence_miner_v8 # pull from HF
"""
from __future__ import annotations
import argparse
import dataclasses
import sys
from pathlib import Path
import soundfile as sf
import torch
from qwen_tts import Qwen3TTSModel
@dataclasses.dataclass(frozen=True)
class Sample:
    """One preset clip: the text to speak plus the voice description to follow.

    Instances are immutable (``frozen=True``), so they can safely be shared
    in a module-level tuple.
    """

    # Used as the stem of the output ``.wav`` file name.
    slug: str
    # The sentence that is synthesized.
    say: str
    # Natural-language description of the requested voice / delivery.
    voice: str
# Preset clips rendered by the demo, in the order they are synthesized.
# Each row is (slug, text to speak, voice description).
SAMPLES: tuple[Sample, ...] = tuple(
    Sample(slug=slug, say=say, voice=voice)
    for slug, say, voice in (
        (
            "warm_male_storyteller",
            "Long ago, in a kingdom by the sea, a young girl made a remarkable discovery.",
            "An older male narrator reads a bedtime story slowly, with warmth.",
        ),
        (
            "whisper_female",
            "Don't say a word. Just listen carefully.",
            "A young woman whispers, conspiratorial, low energy, very quiet.",
        ),
        (
            "projecting_announcer",
            "And he scores in the final second of the match!",
            "A high-pitched announcer projects an exciting headline at a fast pace.",
        ),
    )
)
# Generation keyword arguments forwarded unchanged to every synthesis call.
SAMPLER = {
    "temperature": 0.85,
    "top_k": 50,
    "top_p": 0.95,
    "repetition_penalty": 1.05,
    "max_new_tokens": 600,
    "do_sample": True,
}
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the demo's command-line options.

    Args:
        argv: Argument list to parse. ``None`` lets argparse fall back to
            ``sys.argv[1:]``.

    Returns:
        A namespace with ``source``, ``out``, ``precision`` and ``device``.
        ``device`` defaults to ``"cuda:0"`` when ``torch.cuda.is_available()``
        is true and to ``"cpu"`` otherwise.
    """
    # The module docstring is None when docstrings are stripped (python -OO)
    # or when this function lives in a module without one; calling .split on
    # it directly would raise AttributeError before any argument is parsed.
    summary = (__doc__ or "").split("\n", 1)[0]
    parser = argparse.ArgumentParser(description=summary or None)
    parser.add_argument("--source", default=".", help="HF repo id or local checkpoint dir")
    parser.add_argument("--out", default="./demo_out", help="output dir for wav files")
    parser.add_argument(
        "--precision",
        default="bfloat16",
        choices=("bfloat16", "float16", "float32"),
        help="torch dtype used to load the model",
    )
    parser.add_argument(
        "--device",
        default="cuda:0" if torch.cuda.is_available() else "cpu",
        help="device to load the model on",
    )
    return parser.parse_args(argv)
def load(source: str, device: str, precision: str) -> Qwen3TTSModel:
    """Load the TTS checkpoint from ``source`` onto ``device``.

    Args:
        source: Value passed straight to ``Qwen3TTSModel.from_pretrained``
            (the CLI describes it as an HF repo id or local checkpoint dir).
        device: Forwarded as ``device_map``.
        precision: One of ``"bfloat16"``, ``"float16"`` or ``"float32"``.

    Returns:
        The object returned by ``Qwen3TTSModel.from_pretrained``.

    Raises:
        KeyError: If ``precision`` is not one of the three supported names.
    """
    torch_dtypes = {
        "bfloat16": torch.bfloat16,
        "float16": torch.float16,
        "float32": torch.float32,
    }
    # Resolve the dtype first so an unknown precision fails before logging.
    dtype = torch_dtypes[precision]
    print(f"[demo] loading {source!r} -> {device} ({precision})", flush=True)
    model = Qwen3TTSModel.from_pretrained(source, device_map=device, dtype=dtype)
    return model
def synth_one(model: Qwen3TTSModel, sample: Sample, out_dir: Path) -> Path:
    """Synthesize one preset and write it to ``<out_dir>/<slug>.wav``.

    Args:
        model: Loaded model exposing ``generate_voice_design``.
        sample: Preset providing the text (``say``), the voice description
            (``voice``) and the output file stem (``slug``).
        out_dir: Directory the wav file is written into.

    Returns:
        Path of the written wav file.
    """
    waveforms, sample_rate = model.generate_voice_design(
        text=sample.say,
        instruct=sample.voice,
        language="english",
        **SAMPLER,
    )
    # Only the first returned waveform is saved and measured.
    clip = waveforms[0]
    wav_path = out_dir / f"{sample.slug}.wav"
    sf.write(wav_path, clip, sample_rate)
    seconds = len(clip) / sample_rate
    print(f" -> {wav_path.name} ({seconds:.2f}s @ {sample_rate} Hz)")
    return wav_path
def run(args: argparse.Namespace) -> int:
    """Render every preset in ``SAMPLES`` and return the process exit code.

    Args:
        args: Parsed options; reads ``out``, ``source``, ``device`` and
            ``precision``.

    Returns:
        Always ``0``; failures surface as exceptions from the calls below.
    """
    destination = Path(args.out)
    # Create the output directory (and any parents) before loading the model.
    destination.mkdir(parents=True, exist_ok=True)

    tts = load(args.source, args.device, args.precision)
    for preset in SAMPLES:
        synth_one(tts, preset, destination)

    print(f"[demo] {len(SAMPLES)} clips written to {destination}/", flush=True)
    return 0
if __name__ == "__main__":
    # Script entry point: parse the CLI, run the demo, and exit with its status.
    _cli_args = parse_args()
    _status = run(_cli_args)
    sys.exit(_status)
|