vocence-miner02 / example_inference.py
ShinyUser's picture
Upload 22 files
62fba01 verified
"""Minimal inference example for qwen3_voice_design_t1.
Install:
pip install qwen-tts transformers torch soundfile
Run:
python example_inference.py # loads from local dir (./)
python example_inference.py --repo macminix/qwen3_voice_design_t1 # or pull from HF
The model is self-contained. No base model download is required.
"""
from __future__ import annotations
import argparse
from pathlib import Path
import soundfile as sf
import torch
from qwen_tts import Qwen3TTSModel
# Demo prompts. Each entry supplies:
#   name     -> stem of the output file (written as "<name>.wav")
#   text     -> passed as `text` to generate_voice_design
#   instruct -> passed as `instruct` to generate_voice_design
PROMPTS = [
    {
        "name": "happy_male",
        "text": "Come and look at this, you are not going to believe it.",
        "instruct": "A male speaker delivers his happy speech at a moderate pace with standard energy.",
    },
    {
        "name": "sad_female_slow",
        "text": "I'm sorry. I tried everything I could think of.",
        "instruct": "A female voice speaks slowly with a sad, quiet tone.",
    },
    {
        "name": "angry_male_low_fast",
        "text": "You were warned, and you did it anyway.",
        "instruct": "A low-pitched male speaker, angry and forceful, speaking at a fast pace.",
    },
]
# Keyword arguments forwarded unchanged (via **GEN_KWARGS) to every
# generate_voice_design call, so all prompts share the same settings.
GEN_KWARGS = {
    "language": "english",
    "temperature": 0.9,
    "top_k": 50,
    "top_p": 1.0,
    "repetition_penalty": 1.05,
    "max_new_tokens": 600,
    "do_sample": True,
}
def main() -> None:
    """Synthesize one WAV file per entry in ``PROMPTS``.

    Reads ``--repo``, ``--out-dir``, ``--dtype`` and ``--device`` from the
    command line, loads the model through ``Qwen3TTSModel.from_pretrained``,
    then calls ``generate_voice_design`` once per prompt and writes the first
    returned waveform to ``<out-dir>/<name>.wav``, printing its path,
    duration and sample rate.
    """
    # Resolve the default device once, up front.
    default_device = "cuda:0" if torch.cuda.is_available() else "cpu"

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--repo",
        default=".",
        help="HF repo id or local path (default: current dir)",
    )
    parser.add_argument("--out-dir", default="./out", help="where to write wavs")
    parser.add_argument(
        "--dtype",
        default="bfloat16",
        choices=["bfloat16", "float16", "float32"],
    )
    parser.add_argument(
        "--device",
        default=default_device,
        help="torch device (default: cuda:0 if available, else cpu)",
    )
    opts = parser.parse_args()

    # `choices` restricts --dtype to names that are torch dtype attributes,
    # so the attribute lookup yields the matching torch.dtype object.
    torch_dtype = getattr(torch, opts.dtype)

    print(f"loading model from {opts.repo} (device={opts.device}, dtype={opts.dtype})")
    model = Qwen3TTSModel.from_pretrained(
        opts.repo,
        device_map=opts.device,
        dtype=torch_dtype,
    )

    target_dir = Path(opts.out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    for prompt in PROMPTS:
        wavs, sample_rate = model.generate_voice_design(
            text=prompt["text"],
            instruct=prompt["instruct"],
            **GEN_KWARGS,
        )
        # Only the first returned waveform is saved and reported.
        first_wav = wavs[0]
        wav_path = target_dir / f"{prompt['name']}.wav"
        sf.write(wav_path, first_wav, sample_rate)
        duration_s = len(first_wav) / sample_rate
        print(f" {wav_path} ({duration_s:.1f} s @ {sample_rate} Hz)")

    print("done")
# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    main()