File size: 2,524 Bytes
df368b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""Minimal inference example for qwen3_voice_design_t1.

Install:
    pip install qwen-tts transformers torch soundfile

Run:
    python example_inference.py                  # loads from local dir (./)
    python example_inference.py --repo macminix/qwen3_voice_design_t1   # or pull from HF

The model is self-contained. No base model download is required.
"""
from __future__ import annotations

import argparse
from pathlib import Path

import soundfile as sf
import torch
from qwen_tts import Qwen3TTSModel


# Demo utterances. For each entry, `text` and `instruct` are passed to
# `generate_voice_design`, and `name` becomes the stem of the output WAV file.
PROMPTS = [
    {
        "name": "happy_male",
        "text": "Come and look at this, you are not going to believe it.",
        "instruct": "A male speaker delivers his happy speech at a moderate pace with standard energy.",
    },
    {
        "name": "sad_female_slow",
        "text": "I'm sorry. I tried everything I could think of.",
        "instruct": "A female voice speaks slowly with a sad, quiet tone.",
    },
    {
        "name": "angry_male_low_fast",
        "text": "You were warned, and you did it anyway.",
        "instruct": "A low-pitched male speaker, angry and forceful, speaking at a fast pace.",
    },
]

# Generation settings shared by every prompt; unpacked as keyword arguments
# into the `generate_voice_design` call in `main`.
GEN_KWARGS = {
    "language": "english",
    "temperature": 0.9,
    "top_k": 50,
    "top_p": 1.0,
    "repetition_penalty": 1.05,
    "max_new_tokens": 600,
    "do_sample": True,
}


def main() -> None:
    """Synthesize every entry of ``PROMPTS`` and save each one as a WAV file.

    Command-line options choose the model location (``--repo``), the output
    directory (``--out-dir``), the torch dtype (``--dtype``) and the device
    (``--device``). One ``<name>.wav`` is written per prompt, followed by a
    printed summary line with its duration and sample rate.
    """
    # Prefer the first CUDA device when one is available.
    fallback_device = "cuda:0" if torch.cuda.is_available() else "cpu"

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--repo",
        default=".",
        help="HF repo id or local path (default: current dir)",
    )
    parser.add_argument("--out-dir", default="./out", help="where to write wavs")
    parser.add_argument(
        "--dtype",
        default="bfloat16",
        choices=["bfloat16", "float16", "float32"],
    )
    parser.add_argument(
        "--device",
        default=fallback_device,
        help="torch device (default: cuda:0 if available, else cpu)",
    )
    args = parser.parse_args()

    # `choices` restricts --dtype to names that are torch dtype attributes,
    # so the string can be resolved directly on the torch module.
    torch_dtype = getattr(torch, args.dtype)

    print(f"loading model from {args.repo} (device={args.device}, dtype={args.dtype})")
    model = Qwen3TTSModel.from_pretrained(
        args.repo,
        device_map=args.device,
        dtype=torch_dtype,
    )

    target_dir = Path(args.out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    for prompt in PROMPTS:
        waveforms, sample_rate = model.generate_voice_design(
            text=prompt["text"],
            instruct=prompt["instruct"],
            **GEN_KWARGS,
        )
        # Only the first returned waveform is saved.
        first_wave = waveforms[0]
        wav_path = target_dir / f"{prompt['name']}.wav"
        sf.write(wav_path, first_wave, sample_rate)
        duration = len(first_wave) / sample_rate
        print(f"  {wav_path}  ({duration:.1f} s @ {sample_rate} Hz)")

    print("done")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()