ShinyUser
/

vocence-miner02

text-generation

Model card Files Files and versions

vocence-miner02 / example_inference.py

ShinyUser's picture

Upload 22 files

62fba01 verified 22 days ago

history blame contribute delete

2.52 kB

	"""Minimal inference example for qwen3_voice_design_t1.

	Install:
	    pip install qwen-tts transformers torch soundfile

	Run:
	    python example_inference.py  # loads from local dir (./)
	    python example_inference.py --repo macminix/qwen3_voice_design_t1  # or pull from HF

	The model is self-contained. No base model download is required.
	"""
	from __future__ import annotations

	import argparse
	from pathlib import Path

	import soundfile as sf
	import torch
	from qwen_tts import Qwen3TTSModel


	# Demo prompts. For each entry, "text" is the sentence to synthesize,
	# "instruct" is the natural-language voice description passed alongside it,
	# and "name" is used as the stem of the output wav file.
	PROMPTS = [
	    {
	        "name": "happy_male",
	        "text": "Come and look at this, you are not going to believe it.",
	        "instruct": "A male speaker delivers his happy speech at a moderate pace with standard energy.",
	    },
	    {
	        "name": "sad_female_slow",
	        "text": "I'm sorry. I tried everything I could think of.",
	        "instruct": "A female voice speaks slowly with a sad, quiet tone.",
	    },
	    {
	        "name": "angry_male_low_fast",
	        "text": "You were warned, and you did it anyway.",
	        "instruct": "A low-pitched male speaker, angry and forceful, speaking at a fast pace.",
	    },
	]

	# Generation settings forwarded unchanged, as keyword arguments, to every
	# generate_voice_design() call made by this script.
	GEN_KWARGS = {
	    "language": "english",
	    "temperature": 0.9,
	    "top_k": 50,
	    "top_p": 1.0,
	    "repetition_penalty": 1.05,
	    "max_new_tokens": 600,
	    "do_sample": True,
	}


	def main() -> None:
	    """Synthesize every entry of PROMPTS and save one wav file per prompt.

	    Reads the --repo, --out-dir, --dtype and --device command-line options,
	    loads the model once, creates the output directory if it does not exist,
	    then generates audio for each prompt and writes the first returned
	    waveform to ``<out-dir>/<name>.wav``, printing the path, duration and
	    sample rate of each file.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--repo", default=".", help="HF repo id or local path (default: current dir)")
	    parser.add_argument("--out-dir", default="./out", help="where to write wavs")
	    parser.add_argument("--dtype", default="bfloat16", choices=["bfloat16", "float16", "float32"])
	    default_device = "cuda:0" if torch.cuda.is_available() else "cpu"
	    parser.add_argument(
	        "--device",
	        default=default_device,
	        help="torch device (default: cuda:0 if available, else cpu)",
	    )
	    args = parser.parse_args()

	    # The accepted --dtype values are exactly the names of the matching
	    # torch dtype attributes, so the name resolves directly on the module.
	    torch_dtype = getattr(torch, args.dtype)

	    print(f"loading model from {args.repo} (device={args.device}, dtype={args.dtype})")
	    model = Qwen3TTSModel.from_pretrained(args.repo, device_map=args.device, dtype=torch_dtype)

	    output_dir = Path(args.out_dir)
	    output_dir.mkdir(parents=True, exist_ok=True)

	    for prompt in PROMPTS:
	        waveforms, sample_rate = model.generate_voice_design(
	            text=prompt["text"],
	            instruct=prompt["instruct"],
	            **GEN_KWARGS,
	        )
	        # Only the first waveform of the returned batch is written out.
	        first_wave = waveforms[0]
	        wav_path = output_dir / f"{prompt['name']}.wav"
	        sf.write(wav_path, first_wave, sample_rate)
	        duration_s = len(first_wave) / sample_rate
	        print(f" {wav_path} ({duration_s:.1f} s @ {sample_rate} Hz)")

	    print("done")


	# Script entry point: run the demo only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	main()