#!/usr/bin/env python3
"""Command-line interface for KugelAudio."""

import argparse
import sys
from pathlib import Path
from typing import Optional, Sequence

# Usage examples appended verbatim to ``--help`` (RawDescriptionHelpFormatter
# keeps the line breaks as written).
_EPILOG = """
Examples:
  # Launch web interface
  kugelaudio ui

  # Launch with public share link
  kugelaudio ui --share

  # Generate speech from command line
  kugelaudio generate "Hello world!" -o output.wav

  # Check watermark in audio file
  kugelaudio verify audio.wav
"""


def _build_parser() -> argparse.ArgumentParser:
    """Create the top-level parser with the ``ui``, ``generate`` and ``verify`` subcommands."""
    parser = argparse.ArgumentParser(
        description="KugelAudio - Open-source text-to-speech",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=_EPILOG,
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # UI command
    ui_parser = subparsers.add_parser("ui", help="Launch Gradio web interface")
    ui_parser.add_argument("--share", action="store_true", help="Create public share link")
    ui_parser.add_argument("--host", default="127.0.0.1", help="Server hostname")
    ui_parser.add_argument("--port", type=int, default=7860, help="Server port")

    # Generate command
    gen_parser = subparsers.add_parser("generate", help="Generate speech from text")
    gen_parser.add_argument("text", help="Text to synthesize")
    gen_parser.add_argument("-o", "--output", default="output.wav", help="Output file path")
    gen_parser.add_argument("-r", "--reference", help="Reference audio for voice cloning")
    gen_parser.add_argument("--model", default="kugelaudio/kugelaudio-0-open", help="Model ID")
    gen_parser.add_argument("--cfg-scale", type=float, default=3.0, help="Guidance scale")

    # Verify command
    verify_parser = subparsers.add_parser("verify", help="Check watermark in audio")
    verify_parser.add_argument("audio", help="Audio file to check")

    return parser


def _run_ui(args: argparse.Namespace) -> None:
    """Launch the web interface with the host/port/share options from ``args``."""
    # Imported lazily so the other subcommands do not pay for the UI stack.
    from kugelaudio_open.ui import launch_app

    launch_app(
        share=args.share,
        server_name=args.host,
        server_port=args.port,
    )


def _run_generate(args: argparse.Namespace) -> None:
    """Load the model named by ``args.model``, synthesize ``args.text`` and save it to ``args.output``."""
    # Heavy imports are deferred until the command is actually selected.
    import torch
    from kugelaudio_open.models import KugelAudioForConditionalGenerationInference
    from kugelaudio_open.processors import KugelAudioProcessor

    # Use bfloat16 on GPU, full precision on CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.bfloat16 if device == "cuda" else torch.float32

    print(f"Loading model {args.model}...")
    model = KugelAudioForConditionalGenerationInference.from_pretrained(
        args.model, torch_dtype=dtype
    ).to(device)
    model.eval()

    processor = KugelAudioProcessor.from_pretrained(args.model)

    # Process inputs (voice_prompt passed to processor for proper handling)
    inputs = processor(
        text=args.text,
        voice_prompt=args.reference,  # Pass reference audio path directly
        return_tensors="pt",
    )
    # Only tensors are moved to the device; any other values pass through untouched.
    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

    print("Generating speech...")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=args.cfg_scale,
            max_new_tokens=4096,
        )

    # Audio is already watermarked by the model's generate method
    audio = outputs.speech_outputs[0]

    # Save
    processor.save_audio(audio, args.output)
    print(f"Audio saved to {args.output}")


def _run_verify(args: argparse.Namespace) -> None:
    """Report whether ``args.audio`` carries a KugelAudio watermark.

    Exits with an error message (status 1) when the file does not exist,
    before any heavy dependency is imported.
    """
    if not Path(args.audio).is_file():
        # Fail fast with a readable message instead of a library traceback.
        sys.exit(f"Error: audio file not found: {args.audio}")

    import soundfile as sf
    from kugelaudio_open.watermark import AudioWatermark

    # NOTE(review): sf.read returns a 2-D array for multi-channel files;
    # confirm AudioWatermark.detect accepts that shape.
    audio, sr = sf.read(args.audio)

    watermark = AudioWatermark()
    result = watermark.detect(audio, sample_rate=sr)

    if result.detected:
        print(f"✅ Watermark DETECTED (confidence: {result.confidence:.1%})")
        print("This audio was generated by KugelAudio.")
    else:
        print(f"❌ No watermark detected (confidence: {result.confidence:.1%})")
        print("This audio does not appear to be generated by KugelAudio.")


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Parse command-line arguments and dispatch to the selected subcommand.

    Args:
        argv: Argument list to parse, excluding the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``, which is the
            behavior existing callers and entry points rely on.

    Raises:
        SystemExit: With status 1 when no subcommand is given (help is
            printed first) or when ``verify`` is pointed at a missing file;
            argparse itself exits with status 2 on invalid arguments.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.command == "ui":
        _run_ui(args)
    elif args.command == "generate":
        _run_generate(args)
    elif args.command == "verify":
        _run_verify(args)
    else:
        # No subcommand supplied: show usage and signal failure.
        parser.print_help()
        sys.exit(1)


if __name__ == "__main__":
    main()