File size: 1,726 Bytes
30c6bb8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
"""
Viterbox - Command Line Inference
"""
import argparse
from pathlib import Path
from viterbox import Viterbox


def main(argv=None):
    """Run the Viterbox text-to-speech command-line interface.

    Parses the command-line options, loads the model on the requested
    device, synthesizes the given text and writes the audio to the output
    path.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. ``None`` (the default) keeps the original
            behavior of reading the process command line.

    Raises:
        SystemExit: Raised by ``argparse`` on ``--help``, on a missing or
            malformed option, or when ``--sentence-pause`` is negative or
            not a finite number.
    """
    parser = argparse.ArgumentParser(description="Viterbox Text-to-Speech")
    parser.add_argument("--text", "-t", type=str, required=True, help="Text to synthesize")
    parser.add_argument("--lang", "-l", type=str, default="vi", help="Language (vi/en)")
    parser.add_argument("--ref", "-r", type=str, default=None, help="Reference audio for voice cloning")
    parser.add_argument("--output", "-o", type=str, default="output.wav", help="Output file path")
    parser.add_argument("--device", "-d", type=str, default="cuda", help="Device (cuda/cpu)")
    parser.add_argument("--exaggeration", type=float, default=0.5, help="Expression intensity (0.0-2.0)")
    parser.add_argument("--cfg-weight", type=float, default=0.5, help="CFG weight (0.0-1.0)")
    parser.add_argument("--temperature", type=float, default=0.8, help="Sampling temperature (0.1-1.0)")
    parser.add_argument("--sentence-pause", type=float, default=0.5, help="Pause between sentences in seconds (default 0.5)")

    args = parser.parse_args(argv)

    # Validate before the model load so a bad value fails immediately.
    # The chained comparison is False for NaN, so NaN and +/-inf are rejected
    # together with negative durations.
    if not 0 <= args.sentence_pause < float("inf"):
        parser.error("--sentence-pause must be a non-negative, finite number of seconds")

    # round() rather than int(): int() truncates float artefacts such as
    # 0.57 * 1000 == 569.9999999999999 down to 569 ms.
    sentence_pause_ms = round(args.sentence_pause * 1000)

    print("Loading model...")
    tts = Viterbox.from_pretrained(args.device)
    print("✅ Model loaded")

    print(f"Generating: '{args.text}'")

    audio = tts.generate(
        text=args.text,
        language=args.lang,
        audio_prompt=args.ref,
        exaggeration=args.exaggeration,
        cfg_weight=args.cfg_weight,
        temperature=args.temperature,
        sentence_pause_ms=sentence_pause_ms,
    )

    # Make sure the destination directory exists so the finished synthesis is
    # not lost to a missing-directory error at save time.
    Path(args.output).parent.mkdir(parents=True, exist_ok=True)

    tts.save_audio(audio, args.output)
    print(f"✅ Saved to: {args.output}")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()