File size: 4,157 Bytes
bbb0e68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python3
"""Command-line interface for KugelAudio."""

import argparse
import sys


def _build_parser():
    """Build the argument parser for the ``kugelaudio`` command.

    Returns:
        An ``argparse.ArgumentParser`` with three subcommands (``ui``,
        ``generate`` and ``verify``). The chosen subcommand name is stored in
        ``args.command``; it is ``None`` when no subcommand was given.
    """
    parser = argparse.ArgumentParser(
        description="KugelAudio - Open-source text-to-speech",
        # Raw formatter keeps the hand-formatted examples in the epilog intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Launch web interface
  kugelaudio ui
  
  # Launch with public share link
  kugelaudio ui --share
  
  # Generate speech from command line
  kugelaudio generate "Hello world!" -o output.wav
  
  # Check watermark in audio file
  kugelaudio verify audio.wav
        """,
    )

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # UI command
    ui_parser = subparsers.add_parser("ui", help="Launch Gradio web interface")
    ui_parser.add_argument("--share", action="store_true", help="Create public share link")
    ui_parser.add_argument("--host", default="127.0.0.1", help="Server hostname")
    ui_parser.add_argument("--port", type=int, default=7860, help="Server port")

    # Generate command
    gen_parser = subparsers.add_parser("generate", help="Generate speech from text")
    gen_parser.add_argument("text", help="Text to synthesize")
    gen_parser.add_argument("-o", "--output", default="output.wav", help="Output file path")
    gen_parser.add_argument("-r", "--reference", help="Reference audio for voice cloning")
    gen_parser.add_argument("--model", default="kugelaudio/kugelaudio-0-open", help="Model ID")
    gen_parser.add_argument("--cfg-scale", type=float, default=3.0, help="Guidance scale")

    # Verify command
    verify_parser = subparsers.add_parser("verify", help="Check watermark in audio")
    verify_parser.add_argument("audio", help="Audio file to check")

    return parser


def _run_ui(args):
    """Launch the web interface with the parsed ``ui`` options (share/host/port)."""
    # Imported here rather than at module level, so the UI module is only
    # loaded when this subcommand actually runs.
    from kugelaudio_open.ui import launch_app

    launch_app(
        share=args.share,
        server_name=args.host,
        server_port=args.port,
    )


def _run_generate(args):
    """Synthesize ``args.text`` and save the resulting audio to ``args.output``.

    Uses CUDA with bfloat16 when ``torch.cuda.is_available()``, otherwise CPU
    with float32. ``args.reference`` (may be ``None``) is forwarded to the
    processor as ``voice_prompt``.
    """
    # Imported here so torch and the model code load only for this subcommand.
    import torch
    from kugelaudio_open.models import KugelAudioForConditionalGenerationInference
    from kugelaudio_open.processors import KugelAudioProcessor

    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.bfloat16 if device == "cuda" else torch.float32

    print(f"Loading model {args.model}...")
    model = KugelAudioForConditionalGenerationInference.from_pretrained(
        args.model, torch_dtype=dtype
    ).to(device)
    model.eval()

    processor = KugelAudioProcessor.from_pretrained(args.model)

    # Process inputs (voice_prompt passed to processor for proper handling)
    inputs = processor(
        text=args.text,
        voice_prompt=args.reference,  # Pass reference audio path directly
        return_tensors="pt",
    )
    # Move tensor entries to the model's device; non-tensor entries pass through.
    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

    print("Generating speech...")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=args.cfg_scale,
            max_new_tokens=4096,
        )

    # Audio is already watermarked by the model's generate method
    audio = outputs.speech_outputs[0]

    # Save
    processor.save_audio(audio, args.output)
    print(f"Audio saved to {args.output}")


def _run_verify(args):
    """Read ``args.audio`` and print whether a watermark was detected in it."""
    # Imported here so soundfile and the watermark code load only for this subcommand.
    import soundfile as sf
    from kugelaudio_open.watermark import AudioWatermark

    # NOTE(review): sf.read returns a 2-D array for multi-channel files;
    # confirm AudioWatermark.detect accepts that shape.
    audio, sr = sf.read(args.audio)

    watermark = AudioWatermark()
    result = watermark.detect(audio, sample_rate=sr)

    if result.detected:
        print(f"✅ Watermark DETECTED (confidence: {result.confidence:.1%})")
        print("This audio was generated by KugelAudio.")
    else:
        print(f"❌ No watermark detected (confidence: {result.confidence:.1%})")
        print("This audio does not appear to be generated by KugelAudio.")


def main(argv=None):
    """Entry point of the KugelAudio command-line interface.

    Args:
        argv: Optional list of argument strings (without the program name).
            When ``None`` (the default), argparse reads ``sys.argv[1:]``, so
            existing ``main()`` callers behave as before.

    Raises:
        SystemExit: with code 1 (after printing the help text) when no
            subcommand is given; argparse itself exits with code 2 on invalid
            arguments and code 0 for ``--help``.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    handlers = {
        "ui": _run_ui,
        "generate": _run_generate,
        "verify": _run_verify,
    }
    handler = handlers.get(args.command)
    if handler is None:
        # No subcommand supplied: show usage and signal failure.
        parser.print_help()
        sys.exit(1)

    handler(args)


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()