File size: 1,495 Bytes
5fb4737
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env python3
"""
Simple example script for Pocket TTS ONNX inference.

Usage:
    python generate.py "Hello, this is a test." samples/reference.wav output.wav
    python generate.py "Hello world" samples/expresso_02_ex03-ex01_calm_005.wav output.wav
"""

import argparse
import time
from pocket_tts_onnx import PocketTTSOnnx


def main() -> None:
    """Parse CLI arguments, synthesize speech, and write it to a file.

    Command-line arguments:
        text: Text to synthesize.
        voice: Path to the voice reference audio file.
        output: Path the generated audio is saved to.
        --precision: Model precision, ``int8`` (default) or ``fp32``.

    Prints the model load time, the generation time, and RTFx (generated
    audio duration divided by generation time).
    """
    parser = argparse.ArgumentParser(description="Generate speech with Pocket TTS ONNX")
    parser.add_argument("text", help="Text to synthesize")
    parser.add_argument("voice", help="Path to voice reference audio file")
    parser.add_argument("output", help="Output audio file path")
    parser.add_argument("--precision", choices=["int8", "fp32"], default="int8",
                        help="Model precision (default: int8)")
    args = parser.parse_args()

    print(f"Loading models (precision={args.precision})...")
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments (unlike time.time()).
    load_start = time.perf_counter()
    tts = PocketTTSOnnx(precision=args.precision)
    print(f"  Loaded in {time.perf_counter() - load_start:.2f}s")

    print("Generating speech...")
    print(f"  Text: {args.text}")
    print(f"  Voice: {args.voice}")

    gen_start = time.perf_counter()
    audio = tts.generate(args.text, voice=args.voice)
    gen_time = time.perf_counter() - gen_start

    # NOTE(review): assumes len(audio) is the sample count of a mono
    # waveform, so dividing by SAMPLE_RATE yields seconds; confirm against
    # PocketTTSOnnx.generate.
    duration = len(audio) / tts.SAMPLE_RATE
    # Guard the ratio so a zero-length measured interval cannot raise
    # ZeroDivisionError and abort before the audio is saved.
    rtfx = duration / gen_time if gen_time > 0 else float("inf")

    print(f"  Generated {duration:.2f}s audio in {gen_time:.2f}s (RTFx: {rtfx:.2f}x)")

    tts.save_audio(audio, args.output)
    print(f"  Saved to: {args.output}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()