Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| """Command-line interface for KugelAudio.""" | |
| import argparse | |
| import sys | |
def main():
    """Entry point for the ``kugelaudio`` command-line interface.

    Parses the command line and dispatches to one of three subcommands:
    ``ui`` (launch the Gradio web app), ``generate`` (synthesize speech from
    text to an audio file), or ``verify`` (check an audio file for the
    KugelAudio watermark). With no subcommand, prints help and exits with
    status 1.
    """
    parser = argparse.ArgumentParser(
        description="KugelAudio - Open-source text-to-speech",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Launch web interface
  kugelaudio ui

  # Launch with public share link
  kugelaudio ui --share

  # Generate speech from command line
  kugelaudio generate "Hello world!" -o output.wav

  # Check watermark in audio file
  kugelaudio verify audio.wav
""",
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # UI command
    ui_parser = subparsers.add_parser("ui", help="Launch Gradio web interface")
    ui_parser.add_argument("--share", action="store_true", help="Create public share link")
    ui_parser.add_argument("--host", default="127.0.0.1", help="Server hostname")
    ui_parser.add_argument("--port", type=int, default=7860, help="Server port")

    # Generate command
    gen_parser = subparsers.add_parser("generate", help="Generate speech from text")
    gen_parser.add_argument("text", help="Text to synthesize")
    gen_parser.add_argument("-o", "--output", default="output.wav", help="Output file path")
    gen_parser.add_argument("-r", "--reference", help="Reference audio for voice cloning")
    gen_parser.add_argument("--model", default="kugelaudio/kugelaudio-0-open", help="Model ID")
    gen_parser.add_argument("--cfg-scale", type=float, default=3.0, help="Guidance scale")

    # Verify command
    verify_parser = subparsers.add_parser("verify", help="Check watermark in audio")
    verify_parser.add_argument("audio", help="Audio file to check")

    args = parser.parse_args()

    if args.command == "ui":
        _run_ui(args)
    elif args.command == "generate":
        _run_generate(args)
    elif args.command == "verify":
        _run_verify(args)
    else:
        parser.print_help()
        sys.exit(1)


def _run_ui(args):
    """Launch the Gradio web interface.

    Imports are deferred so that ``--help`` and the other subcommands do not
    pay the cost of loading the UI stack.
    """
    from kugelaudio_open.ui import launch_app

    launch_app(
        share=args.share,
        server_name=args.host,
        server_port=args.port,
    )


def _run_generate(args):
    """Synthesize speech for ``args.text`` and save it to ``args.output``."""
    import torch
    from kugelaudio_open.models import KugelAudioForConditionalGenerationInference
    from kugelaudio_open.processors import KugelAudioProcessor

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # bfloat16 saves memory on GPU; CPUs get full-precision float32.
    dtype = torch.bfloat16 if device == "cuda" else torch.float32

    print(f"Loading model {args.model}...")
    model = KugelAudioForConditionalGenerationInference.from_pretrained(
        args.model, torch_dtype=dtype
    ).to(device)
    model.eval()
    processor = KugelAudioProcessor.from_pretrained(args.model)

    # Process inputs (voice_prompt passed to processor for proper handling;
    # args.reference may be None when no cloning reference was given).
    inputs = processor(
        text=args.text,
        voice_prompt=args.reference,  # Pass reference audio path directly
        return_tensors="pt",
    )
    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

    print("Generating speech...")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=args.cfg_scale,
            max_new_tokens=4096,
        )

    # Audio is already watermarked by the model's generate method.
    audio = outputs.speech_outputs[0]
    processor.save_audio(audio, args.output)
    print(f"Audio saved to {args.output}")


def _run_verify(args):
    """Report whether ``args.audio`` carries the KugelAudio watermark."""
    import soundfile as sf
    from kugelaudio_open.watermark import AudioWatermark

    audio, sr = sf.read(args.audio)
    result = AudioWatermark().detect(audio, sample_rate=sr)
    if result.detected:
        print(f"✓ Watermark DETECTED (confidence: {result.confidence:.1%})")
        print("This audio was generated by KugelAudio.")
    else:
        print(f"✗ No watermark detected (confidence: {result.confidence:.1%})")
        print("This audio does not appear to be generated by KugelAudio.")
# Allow running this module directly as a script (python cli.py / python -m ...).
if __name__ == "__main__":
    main()