#!/usr/bin/env python3
"""Command-line interface for KugelAudio."""
import argparse
import sys
def _build_parser():
    """Build the argument parser with the ``ui``, ``generate`` and ``verify`` subcommands."""
    parser = argparse.ArgumentParser(
        description="KugelAudio - Open-source text-to-speech",
        # Raw formatter keeps the hand-formatted examples in the epilog intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Launch web interface
  kugelaudio ui

  # Launch with public share link
  kugelaudio ui --share

  # Generate speech from command line
  kugelaudio generate "Hello world!" -o output.wav

  # Check watermark in audio file
  kugelaudio verify audio.wav
""",
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # UI command
    ui_parser = subparsers.add_parser("ui", help="Launch Gradio web interface")
    ui_parser.add_argument("--share", action="store_true", help="Create public share link")
    ui_parser.add_argument("--host", default="127.0.0.1", help="Server hostname")
    ui_parser.add_argument("--port", type=int, default=7860, help="Server port")

    # Generate command
    gen_parser = subparsers.add_parser("generate", help="Generate speech from text")
    gen_parser.add_argument("text", help="Text to synthesize")
    gen_parser.add_argument("-o", "--output", default="output.wav", help="Output file path")
    gen_parser.add_argument("-r", "--reference", help="Reference audio for voice cloning")
    gen_parser.add_argument("--model", default="kugelaudio/kugelaudio-0-open", help="Model ID")
    gen_parser.add_argument("--cfg-scale", type=float, default=3.0, help="Guidance scale")

    # Verify command
    verify_parser = subparsers.add_parser("verify", help="Check watermark in audio")
    verify_parser.add_argument("audio", help="Audio file to check")

    return parser


def _run_ui(args):
    """Launch the web interface using the parsed ``ui`` options."""
    # Imported lazily so the other subcommands do not pay for loading the UI module.
    from kugelaudio_open.ui import launch_app

    launch_app(
        share=args.share,
        server_name=args.host,
        server_port=args.port,
    )


def _run_generate(args):
    """Synthesize ``args.text`` with the selected model and save it to ``args.output``."""
    # Heavy dependencies are imported only when this subcommand is actually used.
    import torch
    from kugelaudio_open.models import KugelAudioForConditionalGenerationInference
    from kugelaudio_open.processors import KugelAudioProcessor

    # Use the GPU with bfloat16 when CUDA is available, otherwise CPU with float32.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.bfloat16 if device == "cuda" else torch.float32

    print(f"Loading model {args.model}...")
    model = KugelAudioForConditionalGenerationInference.from_pretrained(
        args.model, torch_dtype=dtype
    ).to(device)
    model.eval()
    processor = KugelAudioProcessor.from_pretrained(args.model)

    # Process inputs (voice_prompt passed to processor for proper handling)
    inputs = processor(
        text=args.text,
        voice_prompt=args.reference,  # Pass reference audio path directly
        return_tensors="pt",
    )
    # Move tensor entries to the model's device; leave non-tensor entries untouched.
    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

    print("Generating speech...")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=args.cfg_scale,
            max_new_tokens=4096,
        )

    # Audio is already watermarked by the model's generate method
    audio = outputs.speech_outputs[0]

    # Save
    processor.save_audio(audio, args.output)
    print(f"Audio saved to {args.output}")


def _run_verify(args):
    """Read ``args.audio`` and report whether a watermark is detected in it."""
    import soundfile as sf
    from kugelaudio_open.watermark import AudioWatermark

    audio, sr = sf.read(args.audio)
    watermark = AudioWatermark()
    result = watermark.detect(audio, sample_rate=sr)

    if result.detected:
        print(f"✅ Watermark DETECTED (confidence: {result.confidence:.1%})")
        print("This audio was generated by KugelAudio.")
    else:
        print(f"❌ No watermark detected (confidence: {result.confidence:.1%})")
        print("This audio does not appear to be generated by KugelAudio.")


def main(argv=None):
    """Run the KugelAudio command-line interface.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process arguments. ``None`` (the default) makes argparse read
            ``sys.argv[1:]``, which is what the script entry point uses.

    Raises:
        SystemExit: With code 1 (after printing help) when no subcommand is
            given; argparse itself also exits on ``--help`` or invalid
            arguments.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.command == "ui":
        _run_ui(args)
    elif args.command == "generate":
        _run_generate(args)
    elif args.command == "verify":
        _run_verify(args)
    else:
        # No subcommand supplied: show usage and signal failure to the shell.
        parser.print_help()
        sys.exit(1)


if __name__ == "__main__":
    main()
|