| """demo.py — quick smoke test for vocence_miner_v1. |
| |
| Reads the merged checkpoint either from a local path or from the Hugging Face Hub, |
| then generates a small set of preset clips that exercise the prompt-following range. |
| |
| pip install qwen-tts transformers torch soundfile |
| python demo.py # uses the current directory |
| python demo.py --source magma90909/vocence_miner_v8 # pull from HF |
| """ |
| from __future__ import annotations |
|
|
| import argparse |
| import dataclasses |
| import sys |
| from pathlib import Path |
|
|
| import soundfile as sf |
| import torch |
| from qwen_tts import Qwen3TTSModel |
|
|
|
|
@dataclasses.dataclass(frozen=True)
class Sample:
    """One preset clip: the text to speak and the voice description to follow."""

    # File stem of the generated clip; the wav is saved as "<slug>.wav".
    slug: str
    # Text handed to the model as the sentence to speak.
    say: str
    # Natural-language voice description handed to the model as the instruct prompt.
    voice: str
|
|
|
|
# Preset clips, written as (slug, say, voice) rows; each row becomes one Sample.
SAMPLES: tuple[Sample, ...] = tuple(
    Sample(slug=slug, say=say, voice=voice)
    for slug, say, voice in (
        (
            "warm_male_storyteller",
            "Long ago, in a kingdom by the sea, a young girl made a remarkable discovery.",
            "An older male narrator reads a bedtime story slowly, with warmth.",
        ),
        (
            "whisper_female",
            "Don't say a word. Just listen carefully.",
            "A young woman whispers, conspiratorial, low energy, very quiet.",
        ),
        (
            "projecting_announcer",
            "And he scores in the final second of the match!",
            "A high-pitched announcer projects an exciting headline at a fast pace.",
        ),
    )
)
|
|
|
|
# Generation settings, forwarded verbatim as keyword arguments to the synthesis call.
SAMPLER = {
    "temperature": 0.85,
    "top_k": 50,
    "top_p": 0.95,
    "repetition_penalty": 1.05,
    "max_new_tokens": 600,
    "do_sample": True,
}
|
|
|
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the demo's command-line options.

    Args:
        argv: Argument strings to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        Namespace with ``source``, ``out``, ``precision`` and ``device``.
    """
    # __doc__ is None when the module has no docstring (e.g. under
    # ``python -OO``); fall back to no description instead of crashing
    # on ``None.split``.
    summary = (__doc__ or "").split("\n", 1)[0] or None
    parser = argparse.ArgumentParser(description=summary)
    parser.add_argument("--source", default=".", help="HF repo id or local checkpoint dir")
    parser.add_argument("--out", default="./demo_out", help="output dir for wav files")
    parser.add_argument(
        "--precision",
        default="bfloat16",
        choices=("bfloat16", "float16", "float32"),
        help="torch dtype used when loading the model",
    )
    # The default device is decided at parse time: the first CUDA device when
    # torch reports CUDA as available, otherwise the CPU.
    parser.add_argument(
        "--device",
        default="cuda:0" if torch.cuda.is_available() else "cpu",
        help="device the model is loaded onto",
    )
    return parser.parse_args(argv)
|
|
|
|
def load(source: str, device: str, precision: str) -> Qwen3TTSModel:
    """Load the TTS model from ``source`` with the requested device and dtype.

    Args:
        source: Checkpoint location handed to ``Qwen3TTSModel.from_pretrained``.
        device: Value passed through as ``device_map``.
        precision: ``"bfloat16"``, ``"float16"`` or ``"float32"``.

    Returns:
        The model object returned by ``from_pretrained``.

    Raises:
        KeyError: If ``precision`` is not one of the three names above.
    """
    torch_dtypes = {
        "bfloat16": torch.bfloat16,
        "float16": torch.float16,
        "float32": torch.float32,
    }
    # Resolve the dtype first so an unknown precision fails before anything is loaded.
    chosen_dtype = torch_dtypes[precision]
    print(f"[demo] loading {source!r} -> {device} ({precision})", flush=True)
    model = Qwen3TTSModel.from_pretrained(source, device_map=device, dtype=chosen_dtype)
    return model
|
|
|
|
def synth_one(model: Qwen3TTSModel, sample: Sample, out_dir: Path) -> Path:
    """Generate one preset clip and save it as ``<out_dir>/<slug>.wav``.

    Args:
        model: Loaded model exposing ``generate_voice_design``.
        sample: Preset supplying the text, the voice description and the file stem.
        out_dir: Directory the wav file is written into.

    Returns:
        Path of the wav file that was written.
    """
    audio_batch, sample_rate = model.generate_voice_design(
        text=sample.say,
        instruct=sample.voice,
        language="english",
        **SAMPLER,
    )
    # Only the first returned waveform is saved and measured.
    clip = audio_batch[0]
    wav_path = out_dir / f"{sample.slug}.wav"
    sf.write(wav_path, clip, sample_rate)
    seconds = len(clip) / sample_rate
    print(f" -> {wav_path.name} ({seconds:.2f}s @ {sample_rate} Hz)")
    return wav_path
|
|
|
|
def run(args: argparse.Namespace) -> int:
    """Load the model and render every preset clip into the output directory.

    Args:
        args: Parsed options; ``out``, ``source``, ``device`` and ``precision``
            are read.

    Returns:
        Always ``0``; the caller uses it as the process exit status.
    """
    destination = Path(args.out)
    # Create the output directory (and any parents); an existing one is reused.
    destination.mkdir(parents=True, exist_ok=True)

    tts = load(args.source, args.device, args.precision)
    for preset in SAMPLES:
        synth_one(tts, preset, destination)

    print(f"[demo] {len(SAMPLES)} clips written to {destination}/", flush=True)
    return 0
|
|
|
|
# Script entry point: parse the CLI options, run the demo, exit with its status.
if __name__ == "__main__":
    exit_code = run(parse_args())
    sys.exit(exit_code)
|
|