aiseosae
/

Second-TTS

Model card Files Files and versions

Second-TTS / demo.py

aiseosae's picture

Upload folder using huggingface_hub

7bc2975 verified 20 days ago

history blame contribute delete

3.2 kB

	"""demo.py — quick smoke test for vocence_miner_v1.

	Reads the merged checkpoint either from a local path or from the Hugging Face Hub,
	then generates a small set of preset clips that exercise the prompt-following range.

	pip install qwen-tts transformers torch soundfile
	python demo.py # uses the current directory
	python demo.py --source magma90909/vocence_miner_v8 # pull from HF
	"""
	from __future__ import annotations

	import argparse
	import dataclasses
	import sys
	from pathlib import Path

	import soundfile as sf
	import torch
	from qwen_tts import Qwen3TTSModel


	@dataclasses.dataclass(frozen=True)
	class Sample:
	    """One preset clip: what to say and how the voice should sound.

	    Instances are immutable (``frozen=True``).
	    """

	    # Stem of the output file; the clip is written as ``<slug>.wav``.
	    slug: str
	    # Text passed to the model as the sentence to speak.
	    say: str
	    # Natural-language voice description passed to the model as ``instruct``.
	    voice: str


	# Preset clips rendered by the demo, one output file per entry.
	SAMPLES: tuple[Sample, ...] = (
	    # Slow, warm narration.
	    Sample(
	        slug="warm_male_storyteller",
	        say="Long ago, in a kingdom by the sea, a young girl made a remarkable discovery.",
	        voice="An older male narrator reads a bedtime story slowly, with warmth.",
	    ),
	    # Quiet, low-energy whisper.
	    Sample(
	        slug="whisper_female",
	        say="Don't say a word. Just listen carefully.",
	        voice="A young woman whispers, conspiratorial, low energy, very quiet.",
	    ),
	    # Fast, projected announcer delivery.
	    Sample(
	        slug="projecting_announcer",
	        say="And he scores in the final second of the match!",
	        voice="A high-pitched announcer projects an exciting headline at a fast pace.",
	    ),
	)


	# Sampling keyword arguments, unpacked into every generate_voice_design() call.
	SAMPLER = {
	    "temperature": 0.85,
	    "top_k": 50,
	    "top_p": 0.95,
	    "repetition_penalty": 1.05,
	    "max_new_tokens": 600,
	    "do_sample": True,
	}


	def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
	    """Parse the demo's command-line options.

	    Args:
	        argv: Argument list to parse. ``None`` lets argparse read the
	            process command line.

	    Returns:
	        Namespace with ``source``, ``out``, ``precision`` and ``device``.
	    """
	    # The module docstring is absent under ``python -OO`` (and when this
	    # function is imported into a module without one), so guard the lookup
	    # instead of calling ``.split`` on ``None``. An empty summary falls back
	    # to argparse's default of no description.
	    summary = (__doc__ or "").split("\n", 1)[0] or None
	    parser = argparse.ArgumentParser(description=summary)
	    parser.add_argument("--source", default=".", help="HF repo id or local checkpoint dir")
	    parser.add_argument("--out", default="./demo_out", help="output dir for wav files")
	    parser.add_argument(
	        "--precision",
	        default="bfloat16",
	        choices=("bfloat16", "float16", "float32"),
	    )
	    # Prefer the first CUDA device when one is visible, otherwise run on CPU.
	    parser.add_argument("--device", default="cuda:0" if torch.cuda.is_available() else "cpu")
	    return parser.parse_args(argv)


	def load(source: str, device: str, precision: str) -> Qwen3TTSModel:
	    """Load the TTS checkpoint at ``source`` onto ``device``.

	    Args:
	        source: Value handed to ``Qwen3TTSModel.from_pretrained``.
	        device: Passed through as ``device_map``.
	        precision: One of ``"bfloat16"``, ``"float16"``, ``"float32"``.

	    Returns:
	        The model returned by ``Qwen3TTSModel.from_pretrained``.

	    Raises:
	        KeyError: If ``precision`` is not one of the three supported names.
	    """
	    torch_dtypes = {
	        "bfloat16": torch.bfloat16,
	        "float16": torch.float16,
	        "float32": torch.float32,
	    }
	    dtype = torch_dtypes[precision]
	    print(f"[demo] loading {source!r} -> {device} ({precision})", flush=True)
	    model = Qwen3TTSModel.from_pretrained(source, device_map=device, dtype=dtype)
	    return model

	def synth_one(model: Qwen3TTSModel, sample: Sample, out_dir: Path) -> Path:
	    """Generate one preset clip and write it to ``out_dir`` as a wav file.

	    Args:
	        model: Loaded TTS model.
	        sample: Preset supplying the text, voice description and file stem.
	        out_dir: Directory that receives ``<slug>.wav``.

	    Returns:
	        Path of the file that was written.
	    """
	    wavs, sr = model.generate_voice_design(
	        text=sample.say,
	        instruct=sample.voice,
	        language="english",
	        **SAMPLER,
	    )
	    # Only the first returned waveform is saved and reported.
	    audio = wavs[0]
	    target = out_dir / f"{sample.slug}.wav"
	    sf.write(target, audio, sr)
	    duration = len(audio) / sr
	    print(f" -> {target.name} ({duration:.2f}s @ {sr} Hz)")
	    return target


	def run(args: argparse.Namespace) -> int:
	    """Render every preset in ``SAMPLES`` and report where they were written.

	    Args:
	        args: Parsed options; reads ``out``, ``source``, ``device`` and
	            ``precision``.

	    Returns:
	        ``0`` once all clips have been written.
	    """
	    dest = Path(args.out)
	    # Create the output directory (and any parents) before loading the model.
	    dest.mkdir(parents=True, exist_ok=True)
	    tts = load(args.source, args.device, args.precision)
	    for preset in SAMPLES:
	        synth_one(tts, preset, dest)
	    print(f"[demo] {len(SAMPLES)} clips written to {dest}/", flush=True)
	    return 0


	if __name__ == "__main__":
	    # Script entry point: the value returned by run() becomes the exit status.
	    exit_code = run(parse_args())
	    sys.exit(exit_code)