# multimodalart's picture
# Upload 25 files
# bbb0e68 verified
#!/usr/bin/env python3
"""Command-line interface for KugelAudio."""
import argparse
import sys
def main():
    """Entry point for the ``kugelaudio`` command-line interface.

    Dispatches to one of three subcommands:

    * ``ui``       -- launch the Gradio web interface
    * ``generate`` -- synthesize speech from text and save it to a file
    * ``verify``   -- check an audio file for the KugelAudio watermark

    Exits with status 1 (after printing help) when no subcommand is given;
    argparse itself exits with status 2 on an unknown subcommand.
    """
    parser = _build_parser()
    args = parser.parse_args()
    if args.command == "ui":
        _run_ui(args)
    elif args.command == "generate":
        _run_generate(args)
    elif args.command == "verify":
        _run_verify(args)
    else:
        # No subcommand supplied: show usage and signal failure.
        parser.print_help()
        sys.exit(1)


def _build_parser():
    """Construct and return the top-level argument parser with subcommands."""
    parser = argparse.ArgumentParser(
        description="KugelAudio - Open-source text-to-speech",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Launch web interface
kugelaudio ui
# Launch with public share link
kugelaudio ui --share
# Generate speech from command line
kugelaudio generate "Hello world!" -o output.wav
# Check watermark in audio file
kugelaudio verify audio.wav
""",
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # UI command
    ui_parser = subparsers.add_parser("ui", help="Launch Gradio web interface")
    ui_parser.add_argument("--share", action="store_true", help="Create public share link")
    ui_parser.add_argument("--host", default="127.0.0.1", help="Server hostname")
    ui_parser.add_argument("--port", type=int, default=7860, help="Server port")

    # Generate command
    gen_parser = subparsers.add_parser("generate", help="Generate speech from text")
    gen_parser.add_argument("text", help="Text to synthesize")
    gen_parser.add_argument("-o", "--output", default="output.wav", help="Output file path")
    gen_parser.add_argument("-r", "--reference", help="Reference audio for voice cloning")
    gen_parser.add_argument("--model", default="kugelaudio/kugelaudio-0-open", help="Model ID")
    gen_parser.add_argument("--cfg-scale", type=float, default=3.0, help="Guidance scale")

    # Verify command
    verify_parser = subparsers.add_parser("verify", help="Check watermark in audio")
    verify_parser.add_argument("audio", help="Audio file to check")

    return parser


def _run_ui(args):
    """Launch the Gradio web interface with the parsed ``ui`` options."""
    # Imported lazily so `generate`/`verify` don't pay the UI import cost.
    from kugelaudio_open.ui import launch_app

    launch_app(
        share=args.share,
        server_name=args.host,
        server_port=args.port,
    )


def _run_generate(args):
    """Synthesize speech from ``args.text`` and write it to ``args.output``."""
    # Heavy imports kept local so other subcommands start fast.
    import torch
    from kugelaudio_open.models import KugelAudioForConditionalGenerationInference
    from kugelaudio_open.processors import KugelAudioProcessor

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # bfloat16 only on GPU; CPU inference stays in float32.
    dtype = torch.bfloat16 if device == "cuda" else torch.float32

    print(f"Loading model {args.model}...")
    model = KugelAudioForConditionalGenerationInference.from_pretrained(
        args.model, torch_dtype=dtype
    ).to(device)
    model.eval()
    processor = KugelAudioProcessor.from_pretrained(args.model)

    # Process inputs (voice_prompt passed to processor for proper handling)
    inputs = processor(
        text=args.text,
        voice_prompt=args.reference,  # Pass reference audio path directly
        return_tensors="pt",
    )
    # Move only tensor entries to the target device; leave other values as-is.
    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

    print("Generating speech...")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=args.cfg_scale,
            max_new_tokens=4096,
        )

    # Audio is already watermarked by the model's generate method
    audio = outputs.speech_outputs[0]
    processor.save_audio(audio, args.output)
    print(f"Audio saved to {args.output}")


def _run_verify(args):
    """Report whether ``args.audio`` carries the KugelAudio watermark."""
    import soundfile as sf
    from kugelaudio_open.watermark import AudioWatermark

    audio, sr = sf.read(args.audio)
    watermark = AudioWatermark()
    result = watermark.detect(audio, sample_rate=sr)
    if result.detected:
        print(f"βœ… Watermark DETECTED (confidence: {result.confidence:.1%})")
        print("This audio was generated by KugelAudio.")
    else:
        print(f"❌ No watermark detected (confidence: {result.confidence:.1%})")
        print("This audio does not appear to be generated by KugelAudio.")
if __name__ == "__main__":
main()