""" KugelAudio Gradio Demo Open-source text-to-speech for European languages with voice cloning capabilities. """ import logging import tempfile import time import gradio as gr import torch import torchaudio import spaces from kugelaudio_open import ( KugelAudioForConditionalGenerationInference, KugelAudioProcessor, ) from kugelaudio_open.watermark import AudioWatermark logger = logging.getLogger(__name__) # ─── Device & Model Setup ─────────────────────────────────────────────────── DEVICE = "cuda" if torch.cuda.is_available() else "cpu" DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32 MODEL_ID = "kugelaudio/kugelaudio-0-open" OUTPUT_SAMPLE_RATE = 24000 logger.info("Loading KugelAudio model '%s' on %s (%s)…", MODEL_ID, DEVICE, DTYPE) model = KugelAudioForConditionalGenerationInference.from_pretrained( MODEL_ID, torch_dtype=torch.bfloat16 , ).to("cuda") processor = KugelAudioProcessor.from_pretrained(MODEL_ID) watermarker = AudioWatermark() logger.info("Model loaded successfully.") # ─── Language Configuration ────────────────────────────────────────────────── LANGUAGES = { "English 🇺🇸": "en", "German 🇩🇪": "de", "French 🇫🇷": "fr", "Spanish 🇪🇸": "es", "Italian 🇮🇹": "it", "Portuguese 🇧🇷🇵🇹": "pt", "Dutch 🇳🇱": "nl", "Polish 🇵🇱": "pl", "Russian 🇷🇺": "ru", "Ukrainian 🇺🇦": "uk", "Czech 🇨🇿": "cs", "Romanian 🇷🇴": "ro", "Hungarian 🇭🇺": "hu", "Swedish 🇸🇪": "sv", "Danish 🇩🇰": "da", "Finnish 🇫🇮": "fi", "Norwegian 🇳🇴": "no", "Greek 🇬🇷": "el", "Bulgarian 🇧🇬": "bg", "Slovak 🇸🇰": "sk", "Croatian 🇭🇷": "hr", "Serbian 🇷🇸": "sr", "Turkish 🇹🇷": "tr", } EXAMPLE_TEXTS = { "en": "Welcome to KugelAudio, the open-source text-to-speech system for European languages. Our model supports voice cloning and emotional speech synthesis.", "de": "Willkommen bei KugelAudio, dem Open-Source Text-to-Speech System für europäische Sprachen. Unser Modell unterstützt Voice Cloning und emotionale Sprachsynthese.", "fr": "Bienvenue sur KugelAudio, le système de synthèse vocale open-source pour les langues européennes. Notre modèle prend en charge le clonage vocal et la synthèse vocale émotionnelle.", "es": "Bienvenido a KugelAudio, el sistema de texto a voz de código abierto para idiomas europeos. Nuestro modelo soporta clonación de voz y síntesis de habla emocional.", "it": "Benvenuto in KugelAudio, il sistema di sintesi vocale open-source per le lingue europee. Il nostro modello supporta la clonazione vocale e la sintesi vocale emotiva.", "pt": "Bem-vindo ao KugelAudio, o sistema de texto para fala de código aberto para idiomas europeus. Nosso modelo suporta clonagem de voz e síntese de fala emocional.", "nl": "Welkom bij KugelAudio, het open-source tekst-naar-spraak systeem voor Europese talen. Ons model ondersteunt stemklonering en emotionele spraaksynthese.", "pl": "Witamy w KugelAudio, systemie syntezy mowy o otwartym kodzie źródłowym dla języków europejskich. Nasz model obsługuje klonowanie głosu i emocjonalną syntezę mowy.", "ru": "Добро пожаловать в KugelAudio — систему синтеза речи с открытым исходным кодом для европейских языков. Наша модель поддерживает клонирование голоса и эмоциональный синтез речи.", } # ─── Inference Helpers ─────────────────────────────────────────────────────── def _to_device(inputs: dict) -> dict: """Move tensor values to the model device.""" return { k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v for k, v in inputs.items() } def _save_to_tempfile(audio_tensor: torch.Tensor) -> str: """Write an audio tensor to a temporary WAV file and return its path.""" tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) processor.save_audio(audio_tensor, tmp.name) return tmp.name @spaces.GPU def generate_speech( text: str, language: str, cfg_scale: float, max_tokens: int, ) -> tuple[str, str]: """Generate speech from text and return (audio_path, info_markdown).""" if not text.strip(): raise gr.Error("Please enter some text to synthesize.") lang_code = LANGUAGES.get(language, "en") inputs = processor(text=text, return_tensors="pt") inputs = _to_device(inputs) t0 = time.perf_counter() with torch.no_grad(): outputs = model.generate( **inputs, cfg_scale=cfg_scale, max_new_tokens=int(max_tokens), ) elapsed = time.perf_counter() - t0 audio_path = _save_to_tempfile(outputs.speech_outputs[0]) audio_duration = outputs.speech_outputs[0].shape[-1] / OUTPUT_SAMPLE_RATE info = ( f"🔊 **Generation Complete**\n\n" f"Language: {language} (`{lang_code}`) · " f"CFG: {cfg_scale} · " f"Duration: {audio_duration:.1f}s · " f"Inference: {elapsed:.2f}s · " f"RTF: {elapsed / audio_duration:.2f}x" ) return audio_path, info @spaces.GPU def clone_voice( text: str, reference_audio: str | None, language: str, cfg_scale: float, max_tokens: int, ) -> tuple[str, str]: """Clone a voice from reference audio and synthesize new text.""" if not text.strip(): raise gr.Error("Please enter some text to synthesize.") if reference_audio is None: raise gr.Error("Please upload a reference audio file for voice cloning.") lang_code = LANGUAGES.get(language, "en") inputs = processor( text=text, voice_prompt=reference_audio, return_tensors="pt", ) inputs = _to_device(inputs) t0 = time.perf_counter() with torch.no_grad(): outputs = model.generate( **inputs, cfg_scale=cfg_scale, max_new_tokens=int(max_tokens), ) elapsed = time.perf_counter() - t0 audio_path = _save_to_tempfile(outputs.speech_outputs[0]) audio_duration = outputs.speech_outputs[0].shape[-1] / OUTPUT_SAMPLE_RATE info = ( f"🎭 **Voice Cloning Complete**\n\n" f"Language: {language} (`{lang_code}`) · " f"CFG: {cfg_scale} · " f"Duration: {audio_duration:.1f}s · " f"Inference: {elapsed:.2f}s · " f"RTF: {elapsed / audio_duration:.2f}x" ) return audio_path, info def verify_watermark(audio_file: str | None) -> str: """Detect the AudioSeal watermark in an uploaded audio file.""" if audio_file is None: raise gr.Error("Please upload an audio file to verify.") waveform, sr = torchaudio.load(audio_file) # Resample to the expected rate if necessary if sr != OUTPUT_SAMPLE_RATE: waveform = torchaudio.functional.resample(waveform, sr, OUTPUT_SAMPLE_RATE) result = watermarker.detect(waveform, sample_rate=OUTPUT_SAMPLE_RATE) if result.detected: status = "✅ **Watermark Detected**" else: status = "❌ **No Watermark Detected**" return ( f"🔍 **Watermark Verification**\n\n" f"{status}\n\n" f"Confidence: **{result.confidence:.1%}**\n\n" f"Technology: Facebook AudioSeal · Resolution: 1/16k second" ) def fill_example_text(language: str) -> str: """Fill the text box with an example in the selected language.""" lang_code = LANGUAGES.get(language, "en") return EXAMPLE_TEXTS.get(lang_code, EXAMPLE_TEXTS["en"]) # ─── Custom CSS ────────────────────────────────────────────────────────────── CSS = """ /* ── Base theme ── */ :root { --ka-primary: #1a1a2e; --ka-accent: #e94560; --ka-accent-hover: #ff6b81; --ka-surface: #16213e; --ka-surface-light: #1c2a4a; --ka-text: #eaeaea; --ka-text-muted: #8892a4; --ka-border: #2a3a5c; --ka-gold: #f5c518; --ka-green: #2ecc71; } /* ── Global ── */ .gradio-container { max-width: 960px !important; margin: 0 auto !important; font-family: 'IBM Plex Sans', 'Segoe UI', system-ui, sans-serif !important; } /* ── Hero header ── */ .hero-header { text-align: center; padding: .5rem 1.5rem 1.5rem; margin-bottom: 0.5rem; border-radius: 16px; position: relative; overflow: hidden; } .hero-header::before { content: ''; position: absolute; top: -50%; left: -50%; width: 200%; height: 200%; pointer-events: none; } .hero-header h1 { font-size: 2.4rem !important; font-weight: 700 !important; margin: 0 0 0.3rem !important; color: var(--body-text-color); letter-spacing: -0.02em; } .hero-header .hero-accent { color: var(--ka-accent); } .hero-header p { color: var(--ka-text-muted); font-size: 1.05rem; margin: 0; line-height: 1.5; } /* ── Badges row ── */ .badges { display: flex; justify-content: center; gap: 0.5rem; margin-top: 1rem; flex-wrap: wrap; } .badge { display: inline-flex; align-items: center; gap: 0.35rem; padding: 0.3rem 0.75rem; border-radius: 999px; font-size: 0.78rem; font-weight: 600; letter-spacing: 0.01em; background: var(--block-background-fill, var(--ka-surface-light)); color: var(--body-text-color, var(--ka-text)); border: 1px solid var(--border-color-primary, var(--ka-border)); } .badge.gold { border-color: var(--ka-gold); color: var(--ka-gold); } .badge.green { border-color: var(--ka-green); color: var(--ka-green); } .badge.accent { border-color: var(--ka-accent); color: var(--ka-accent); } /* ── Benchmark table ── */ .benchmark-table { width: 100%; border-collapse: separate; border-spacing: 0; margin: 0.75rem 0; font-size: 0.88rem; border-radius: 10px; overflow: hidden; border: 1px solid var(--ka-border); } .benchmark-table th { background: var(--ka-surface); color: var(--ka-text-muted); font-weight: 600; text-transform: uppercase; font-size: 0.72rem; letter-spacing: 0.06em; padding: 0.65rem 0.8rem; text-align: left; } .benchmark-table td { padding: 0.55rem 0.8rem; border-top: 1px solid var(--ka-border); color: var(--ka-text); } .benchmark-table tr.highlight td { background: rgba(233, 69, 96, 0.08); font-weight: 600; } .benchmark-table tr:not(.highlight) td { background: transparent; } /* ── Section divider ── */ .section-label { font-size: 0.7rem; text-transform: uppercase; letter-spacing: 0.1em; color: var(--ka-text-muted); margin: 1rem 0 0.3rem; padding-left: 2px; font-weight: 600; } /* ── Tab styling ── */ .tab-nav button { font-weight: 600 !important; letter-spacing: 0.01em !important; } .tab-nav button.selected { border-color: var(--ka-accent) !important; color: var(--ka-accent) !important; } /* ── Footer ── */ .footer { text-align: center; padding: 1.2rem; margin-top: 1rem; font-size: 0.8rem; color: var(--ka-text-muted); border-top: 1px solid var(--ka-border); line-height: 1.6; } .footer a { color: var(--ka-accent); text-decoration: none; } .footer a:hover { text-decoration: underline; } """ # ─── Header HTML ───────────────────────────────────────────────────────────── HEADER_HTML = """

🎙️ KugelAudio

Kugel-Audio is a fine-tune of Vibe-Voice 7B in 200K hours of 24 european languages on the YODAS2 dataset

🏆 #1 German TTS 24 Languages Voice Cloning MIT License 7B Parameters
""" BENCHMARK_HTML = """
Rank Model Score Win Rate
🥇 KugelAudio 26 78.0%
🥈 ElevenLabs Multi v2 25 62.2%
🥉 ElevenLabs v3 21 65.3%
4 Cartesia 21 59.1%
5 VibeVoice 10 28.8%
6 CosyVoice v3 9 14.2%

Based on 339 human A/B evaluations · OpenSkill Bayesian ranking

""" FOOTER_HTML = """ """ # ─── Build Gradio Interface ───────────────────────────────────────────────── with gr.Blocks( css=CSS, title="KugelAudio – European TTS", theme=gr.themes.Base( primary_hue=gr.themes.colors.red, secondary_hue=gr.themes.colors.slate, neutral_hue=gr.themes.colors.slate, font=gr.themes.GoogleFont("IBM Plex Sans"), font_mono=gr.themes.GoogleFont("IBM Plex Mono"), ), ) as demo: # ── Header ── gr.HTML(HEADER_HTML) # ── Main Tabs ── with gr.Tabs(): # ━━━ Tab 1: Text-to-Speech ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ with gr.TabItem("🔊 Text-to-Speech", id="tts"): with gr.Row(): with gr.Column(scale=3): tts_language = gr.Dropdown( choices=list(LANGUAGES.keys()), value="English 🇺🇸", label="Language", info="24 European languages supported", ) tts_text = gr.Textbox( label="Text to Synthesize", placeholder="Enter text here or click 'Fill Example' below…", lines=5, max_lines=12, ) with gr.Row(): tts_example_btn = gr.Button( "📝 Fill Example", size="sm", variant="secondary" ) tts_clear_btn = gr.ClearButton( [tts_text], value="🗑️ Clear", size="sm" ) with gr.Accordion("⚙️ Advanced Settings", open=False): tts_cfg = gr.Slider( minimum=1.0, maximum=10.0, value=3.0, step=0.5, label="CFG Scale", info="Guidance scale — higher values follow the text more closely", ) tts_max_tokens = gr.Slider( minimum=512, maximum=8192, value=4096, step=512, label="Max Tokens", info="Maximum generation length in tokens", ) tts_generate_btn = gr.Button( "🎙️ Generate Speech", variant="primary", size="lg" ) with gr.Column(scale=2): tts_audio_out = gr.Audio( label="Generated Audio", type="filepath", interactive=False, ) tts_info = gr.Markdown("*Press 'Generate Speech' to synthesize audio.*") # Events tts_example_btn.click( fn=fill_example_text, inputs=[tts_language], outputs=[tts_text] ) tts_language.change( fn=fill_example_text, inputs=[tts_language], outputs=[tts_text] ) tts_generate_btn.click( fn=generate_speech, inputs=[tts_text, tts_language, tts_cfg, tts_max_tokens], outputs=[tts_audio_out, tts_info], ) # ━━━ Tab 2: Voice Cloning ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ with gr.TabItem("🎭 Voice Cloning", id="clone"): with gr.Row(): with gr.Column(scale=3): clone_language = gr.Dropdown( choices=list(LANGUAGES.keys()), value="English 🇺🇸", label="Language", ) clone_text = gr.Textbox( label="Text to Synthesize", placeholder="Enter the text you want spoken in the cloned voice…", lines=4, max_lines=10, ) clone_ref = gr.Audio( label="Reference Voice", type="filepath", sources=["upload", "microphone"], ) gr.Markdown( "

Upload or record a few seconds of the " "target voice. The model will replicate its characteristics.

" ) with gr.Accordion("⚙️ Advanced Settings", open=False): clone_cfg = gr.Slider( minimum=1.0, maximum=10.0, value=3.0, step=0.5, label="CFG Scale", ) clone_max_tokens = gr.Slider( minimum=512, maximum=8192, value=4096, step=512, label="Max Tokens", ) clone_btn = gr.Button( "🎭 Clone & Generate", variant="primary", size="lg" ) with gr.Column(scale=2): clone_audio_out = gr.Audio( label="Cloned Voice Output", type="filepath", interactive=False, ) clone_info = gr.Markdown( "*Upload a reference voice and press 'Clone & Generate'.*" ) # Events clone_btn.click( fn=clone_voice, inputs=[ clone_text, clone_ref, clone_language, clone_cfg, clone_max_tokens, ], outputs=[clone_audio_out, clone_info], ) # ━━━ Tab 3: Watermark Verification ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ with gr.TabItem("🔒 Watermark Verify", id="watermark"): with gr.Row(): with gr.Column(scale=3): wm_audio = gr.Audio( label="Audio to Verify", type="filepath", sources=["upload"], ) gr.Markdown( "

All KugelAudio outputs are watermarked " "with Facebook AudioSeal. Upload any audio file to check.

" ) wm_btn = gr.Button( "🔍 Verify Watermark", variant="primary", size="lg" ) with gr.Column(scale=2): wm_result = gr.Markdown("*Upload audio and press 'Verify Watermark'.*") wm_btn.click( fn=verify_watermark, inputs=[wm_audio], outputs=[wm_result], ) # ━━━ Tab 4: Benchmarks ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ with gr.TabItem("🏆 Benchmarks", id="bench"): gr.Markdown("### Human Preference Ranking — German TTS") gr.HTML(BENCHMARK_HTML) gr.Markdown( "Evaluations covered **neutral speech, shouting, singing, and " "drunken voice** styles across diverse German-language samples. " "Participants heard a reference voice and compared outputs from " "two anonymous models in a blind A/B test." ) # ━━━ Tab 5: About ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ with gr.TabItem("ℹ️ About", id="about"): gr.Markdown(""" ### Architecture KugelAudio uses a hybrid **AR + Diffusion** pipeline: 1. **Text Encoder** — Qwen2-based language model encodes input text 2. **TTS Backbone** — Upper transformer layers generate speech representations 3. **Diffusion Head** — Predicts speech latents via denoising diffusion 4. **Acoustic Decoder** — Converts latents to waveforms ### Training | Detail | Value | |--------|-------| | Base model | Microsoft VibeVoice | | Training data | ~200,000 hours (YODAS2) | | Hardware | 8× NVIDIA H100 | | Duration | 5 days | | Parameters | 7B | ### Responsible Use KugelAudio is intended for accessibility, content creation, voice assistants, language learning, and creative projects **with consent**. All generated audio is watermarked with Facebook AudioSeal. Creating deepfakes, impersonation without consent, fraud, or any illegal use is prohibited. ### License Released under the **MIT License**. ### Citation ```bibtex @software{kugelaudio2026, title = {KugelAudio: Open-Source TTS for European Languages with Voice Cloning}, author = {Kratzenstein, Kajo and Menke, Carlos}, year = {2026}, url = {https://github.com/kugelaudio/kugelaudio} } ``` """) # ── Footer ── gr.HTML(FOOTER_HTML) # ─── Launch ────────────────────────────────────────────────────────────────── if __name__ == "__main__": demo.queue() demo.launch( server_name="0.0.0.0", server_port=7860, # show_api=True, )