Spaces:
Runtime error
Runtime error
| """ | |
| KugelAudio Gradio Demo | |
| Open-source text-to-speech for European languages with voice cloning capabilities. | |
| """ | |
| import logging | |
| import tempfile | |
| import time | |
| import gradio as gr | |
| import torch | |
| import torchaudio | |
| import spaces | |
| from kugelaudio_open import ( | |
| KugelAudioForConditionalGenerationInference, | |
| KugelAudioProcessor, | |
| ) | |
| from kugelaudio_open.watermark import AudioWatermark | |
logger = logging.getLogger(__name__)

# ─── Device & Model Setup ───────────────────────────────────────────────────
# The target device and dtype are resolved once. Model placement, the startup
# log line and `_to_device` all read these same constants, so the model and
# its inputs always end up on the same device and the log reports what is
# actually used.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32
MODEL_ID = "kugelaudio/kugelaudio-0-open"
# Used to convert generated sample counts to seconds and as the resampling
# target before watermark detection.
OUTPUT_SAMPLE_RATE = 24000

logger.info("Loading KugelAudio model '%s' on %s (%s)…", MODEL_ID, DEVICE, DTYPE)
model = KugelAudioForConditionalGenerationInference.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
).to(DEVICE)
processor = KugelAudioProcessor.from_pretrained(MODEL_ID)
watermarker = AudioWatermark()
logger.info("Model loaded successfully.")
| # ─── Language Configuration ────────────────────────────────────────────────── | |
# Dropdown label -> language code. Insertion order is the order in which the
# choices are listed in the language dropdowns.
# NOTE(review): this table has 23 entries while the UI text advertises 24
# languages — confirm which language is missing.
LANGUAGES = {
    "English 🇺🇸": "en",
    "German 🇩🇪": "de",
    "French 🇫🇷": "fr",
    "Spanish 🇪🇸": "es",
    "Italian 🇮🇹": "it",
    "Portuguese 🇧🇷🇵🇹": "pt",
    "Dutch 🇳🇱": "nl",
    "Polish 🇵🇱": "pl",
    "Russian 🇷🇺": "ru",
    "Ukrainian 🇺🇦": "uk",
    "Czech 🇨🇿": "cs",
    "Romanian 🇷🇴": "ro",
    "Hungarian 🇭🇺": "hu",
    "Swedish 🇸🇪": "sv",
    "Danish 🇩🇰": "da",
    "Finnish 🇫🇮": "fi",
    "Norwegian 🇳🇴": "no",
    "Greek 🇬🇷": "el",
    "Bulgarian 🇧🇬": "bg",
    "Slovak 🇸🇰": "sk",
    "Croatian 🇭🇷": "hr",
    "Serbian 🇷🇸": "sr",
    "Turkish 🇹🇷": "tr",
}
# Language code -> sample sentence offered by the "Fill Example" button.
# Only a subset of the codes in LANGUAGES has an entry here; lookups for the
# others fall back to the English text (see `fill_example_text`).
EXAMPLE_TEXTS = {
    "en": (
        "Welcome to KugelAudio, the open-source text-to-speech system for European languages. "
        "Our model supports voice cloning and emotional speech synthesis."
    ),
    "de": (
        "Willkommen bei KugelAudio, dem Open-Source Text-to-Speech System für europäische Sprachen. "
        "Unser Modell unterstützt Voice Cloning und emotionale Sprachsynthese."
    ),
    "fr": (
        "Bienvenue sur KugelAudio, le système de synthèse vocale open-source pour les langues européennes. "
        "Notre modèle prend en charge le clonage vocal et la synthèse vocale émotionnelle."
    ),
    "es": (
        "Bienvenido a KugelAudio, el sistema de texto a voz de código abierto para idiomas europeos. "
        "Nuestro modelo soporta clonación de voz y síntesis de habla emocional."
    ),
    "it": (
        "Benvenuto in KugelAudio, il sistema di sintesi vocale open-source per le lingue europee. "
        "Il nostro modello supporta la clonazione vocale e la sintesi vocale emotiva."
    ),
    "pt": (
        "Bem-vindo ao KugelAudio, o sistema de texto para fala de código aberto para idiomas europeus. "
        "Nosso modelo suporta clonagem de voz e síntese de fala emocional."
    ),
    "nl": (
        "Welkom bij KugelAudio, het open-source tekst-naar-spraak systeem voor Europese talen. "
        "Ons model ondersteunt stemklonering en emotionele spraaksynthese."
    ),
    "pl": (
        "Witamy w KugelAudio, systemie syntezy mowy o otwartym kodzie źródłowym dla języków europejskich. "
        "Nasz model obsługuje klonowanie głosu i emocjonalną syntezę mowy."
    ),
    "ru": (
        "Добро пожаловать в KugelAudio — систему синтеза речи с открытым исходным кодом для европейских языков. "
        "Наша модель поддерживает клонирование голоса и эмоциональный синтез речи."
    ),
}
| # ─── Inference Helpers ─────────────────────────────────────────────────────── | |
| def _to_device(inputs: dict) -> dict: | |
| """Move tensor values to the model device.""" | |
| return { | |
| k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v | |
| for k, v in inputs.items() | |
| } | |
def _save_to_tempfile(audio_tensor: torch.Tensor) -> str:
    """Write an audio tensor to a temporary WAV file and return its path.

    The file is created with ``delete=False`` so that it still exists after
    this function returns; the path is handed back to the caller.

    Args:
        audio_tensor: Audio data, passed unchanged to ``processor.save_audio``.

    Returns:
        Path of the newly written ``.wav`` file.
    """
    # Use the temp file only to reserve a unique name and close the handle
    # immediately. Keeping it open would leak one file descriptor per request,
    # and an open handle prevents re-opening the file by name on Windows.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    processor.save_audio(audio_tensor, wav_path)
    return wav_path
# `spaces.GPU` asks for a GPU for the duration of the call on ZeroGPU Spaces;
# the `spaces` package documents it as having no effect in other environments.
@spaces.GPU
def generate_speech(
    text: str,
    language: str,
    cfg_scale: float,
    max_tokens: int,
) -> tuple[str, str]:
    """Generate speech from text and return (audio_path, info_markdown).

    Args:
        text: Text to synthesize.
        language: Dropdown label; looked up in ``LANGUAGES`` (unknown labels
            fall back to ``"en"``).
        cfg_scale: Guidance scale forwarded to ``model.generate``.
        max_tokens: Upper bound forwarded as ``max_new_tokens`` (cast to int,
            since sliders may deliver floats).

    Returns:
        A tuple of the generated WAV file path and a Markdown summary line.

    Raises:
        gr.Error: If *text* is missing or contains only whitespace.
    """
    # `not text` also covers None, which would otherwise fail on `.strip()`.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # NOTE(review): the language code is only shown in the summary below; it
    # is not passed to the processor or the model here — confirm intended.
    lang_code = LANGUAGES.get(language, "en")

    inputs = _to_device(processor(text=text, return_tensors="pt"))

    t0 = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=cfg_scale,
            max_new_tokens=int(max_tokens),
        )
    elapsed = time.perf_counter() - t0

    speech = outputs.speech_outputs[0]
    audio_path = _save_to_tempfile(speech)
    audio_duration = speech.shape[-1] / OUTPUT_SAMPLE_RATE
    # A zero-length result would make the real-time factor divide by zero.
    rtf = f"{elapsed / audio_duration:.2f}x" if audio_duration > 0 else "n/a"

    info = (
        f"🔊 **Generation Complete**\n\n"
        f"Language: {language} (`{lang_code}`) · "
        f"CFG: {cfg_scale} · "
        f"Duration: {audio_duration:.1f}s · "
        f"Inference: {elapsed:.2f}s · "
        f"RTF: {rtf}"
    )
    return audio_path, info
# `spaces.GPU` asks for a GPU for the duration of the call on ZeroGPU Spaces;
# the `spaces` package documents it as having no effect in other environments.
@spaces.GPU
def clone_voice(
    text: str,
    reference_audio: str | None,
    language: str,
    cfg_scale: float,
    max_tokens: int,
) -> tuple[str, str]:
    """Clone a voice from reference audio and synthesize new text.

    Args:
        text: Text to synthesize in the cloned voice.
        reference_audio: Path of the reference recording, or ``None`` when
            nothing was uploaded or recorded. Passed unchanged to the
            processor as ``voice_prompt``.
        language: Dropdown label; looked up in ``LANGUAGES`` (unknown labels
            fall back to ``"en"``).
        cfg_scale: Guidance scale forwarded to ``model.generate``.
        max_tokens: Upper bound forwarded as ``max_new_tokens`` (cast to int,
            since sliders may deliver floats).

    Returns:
        A tuple of the generated WAV file path and a Markdown summary line.

    Raises:
        gr.Error: If *text* is missing or only whitespace, or if no reference
            audio was provided.
    """
    # `not text` also covers None, which would otherwise fail on `.strip()`.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if reference_audio is None:
        raise gr.Error("Please upload a reference audio file for voice cloning.")

    # NOTE(review): the language code is only shown in the summary below; it
    # is not passed to the processor or the model here — confirm intended.
    lang_code = LANGUAGES.get(language, "en")

    inputs = _to_device(
        processor(
            text=text,
            voice_prompt=reference_audio,
            return_tensors="pt",
        )
    )

    t0 = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=cfg_scale,
            max_new_tokens=int(max_tokens),
        )
    elapsed = time.perf_counter() - t0

    speech = outputs.speech_outputs[0]
    audio_path = _save_to_tempfile(speech)
    audio_duration = speech.shape[-1] / OUTPUT_SAMPLE_RATE
    # A zero-length result would make the real-time factor divide by zero.
    rtf = f"{elapsed / audio_duration:.2f}x" if audio_duration > 0 else "n/a"

    info = (
        f"🎭 **Voice Cloning Complete**\n\n"
        f"Language: {language} (`{lang_code}`) · "
        f"CFG: {cfg_scale} · "
        f"Duration: {audio_duration:.1f}s · "
        f"Inference: {elapsed:.2f}s · "
        f"RTF: {rtf}"
    )
    return audio_path, info
def verify_watermark(audio_file: str | None) -> str:
    """Run watermark detection on an uploaded audio file.

    The file is loaded with torchaudio, resampled to ``OUTPUT_SAMPLE_RATE``
    when its rate differs, and handed to ``watermarker.detect``.

    Returns:
        A Markdown report with the detection status and confidence.

    Raises:
        gr.Error: If no file was provided.
    """
    if audio_file is None:
        raise gr.Error("Please upload an audio file to verify.")

    waveform, source_rate = torchaudio.load(audio_file)
    if source_rate != OUTPUT_SAMPLE_RATE:
        # Bring the audio to the rate the detector is called with below.
        waveform = torchaudio.functional.resample(
            waveform, source_rate, OUTPUT_SAMPLE_RATE
        )

    result = watermarker.detect(waveform, sample_rate=OUTPUT_SAMPLE_RATE)
    status = (
        "✅ **Watermark Detected**"
        if result.detected
        else "❌ **No Watermark Detected**"
    )

    report_sections = [
        "🔍 **Watermark Verification**",
        status,
        f"Confidence: **{result.confidence:.1%}**",
        "Technology: Facebook AudioSeal · Resolution: 1/16k second",
    ]
    return "\n\n".join(report_sections)
def fill_example_text(language: str) -> str:
    """Return the sample sentence for the selected dropdown label.

    Unknown labels are treated as English, and languages without their own
    sample sentence also get the English one.
    """
    fallback = EXAMPLE_TEXTS["en"]
    code = LANGUAGES.get(language, "en")
    return EXAMPLE_TEXTS.get(code, fallback)
| # ─── Custom CSS ────────────────────────────────────────────────────────────── | |
# Stylesheet passed to `gr.Blocks(css=...)`. It defines the `--ka-*` custom
# properties on :root and styles the classes used by the HTML snippets in
# this file: .hero-header, .badges/.badge, .benchmark-table, .section-label
# and .footer, plus the Gradio container and tab buttons.
# NOTE(review): `.hero-header::before` sets position and size but no
# background or other visual property — looks like a leftover; confirm.
CSS = """
/* ── Base theme ── */
:root {
--ka-primary: #1a1a2e;
--ka-accent: #e94560;
--ka-accent-hover: #ff6b81;
--ka-surface: #16213e;
--ka-surface-light: #1c2a4a;
--ka-text: #eaeaea;
--ka-text-muted: #8892a4;
--ka-border: #2a3a5c;
--ka-gold: #f5c518;
--ka-green: #2ecc71;
}
/* ── Global ── */
.gradio-container {
max-width: 960px !important;
margin: 0 auto !important;
font-family: 'IBM Plex Sans', 'Segoe UI', system-ui, sans-serif !important;
}
/* ── Hero header ── */
.hero-header {
text-align: center;
padding: .5rem 1.5rem 1.5rem;
margin-bottom: 0.5rem;
border-radius: 16px;
position: relative;
overflow: hidden;
}
.hero-header::before {
content: '';
position: absolute;
top: -50%;
left: -50%;
width: 200%;
height: 200%;
pointer-events: none;
}
.hero-header h1 {
font-size: 2.4rem !important;
font-weight: 700 !important;
margin: 0 0 0.3rem !important;
color: var(--body-text-color);
letter-spacing: -0.02em;
}
.hero-header .hero-accent {
color: var(--ka-accent);
}
.hero-header p {
color: var(--ka-text-muted);
font-size: 1.05rem;
margin: 0;
line-height: 1.5;
}
/* ── Badges row ── */
.badges {
display: flex;
justify-content: center;
gap: 0.5rem;
margin-top: 1rem;
flex-wrap: wrap;
}
.badge {
display: inline-flex;
align-items: center;
gap: 0.35rem;
padding: 0.3rem 0.75rem;
border-radius: 999px;
font-size: 0.78rem;
font-weight: 600;
letter-spacing: 0.01em;
background: var(--block-background-fill, var(--ka-surface-light));
color: var(--body-text-color, var(--ka-text));
border: 1px solid var(--border-color-primary, var(--ka-border));
}
.badge.gold { border-color: var(--ka-gold); color: var(--ka-gold); }
.badge.green { border-color: var(--ka-green); color: var(--ka-green); }
.badge.accent { border-color: var(--ka-accent); color: var(--ka-accent); }
/* ── Benchmark table ── */
.benchmark-table {
width: 100%;
border-collapse: separate;
border-spacing: 0;
margin: 0.75rem 0;
font-size: 0.88rem;
border-radius: 10px;
overflow: hidden;
border: 1px solid var(--ka-border);
}
.benchmark-table th {
background: var(--ka-surface);
color: var(--ka-text-muted);
font-weight: 600;
text-transform: uppercase;
font-size: 0.72rem;
letter-spacing: 0.06em;
padding: 0.65rem 0.8rem;
text-align: left;
}
.benchmark-table td {
padding: 0.55rem 0.8rem;
border-top: 1px solid var(--ka-border);
color: var(--ka-text);
}
.benchmark-table tr.highlight td {
background: rgba(233, 69, 96, 0.08);
font-weight: 600;
}
.benchmark-table tr:not(.highlight) td {
background: transparent;
}
/* ── Section divider ── */
.section-label {
font-size: 0.7rem;
text-transform: uppercase;
letter-spacing: 0.1em;
color: var(--ka-text-muted);
margin: 1rem 0 0.3rem;
padding-left: 2px;
font-weight: 600;
}
/* ── Tab styling ── */
.tab-nav button {
font-weight: 600 !important;
letter-spacing: 0.01em !important;
}
.tab-nav button.selected {
border-color: var(--ka-accent) !important;
color: var(--ka-accent) !important;
}
/* ── Footer ── */
.footer {
text-align: center;
padding: 1.2rem;
margin-top: 1rem;
font-size: 0.8rem;
color: var(--ka-text-muted);
border-top: 1px solid var(--ka-border);
line-height: 1.6;
}
.footer a {
color: var(--ka-accent);
text-decoration: none;
}
.footer a:hover {
text-decoration: underline;
}
"""
| # ─── Header HTML ───────────────────────────────────────────────────────────── | |
# Hero banner rendered at the top of the page via `gr.HTML`: title, one-line
# tagline and a row of badges. The class names are styled by `CSS`.
# The tagline uses the same spellings as the rest of the file ("KugelAudio",
# "VibeVoice").
HEADER_HTML = """
<div class="hero-header">
<h1>🎙️ <span class="hero-accent">Kugel</span>Audio</h1>
<p>KugelAudio is a fine-tune of VibeVoice 7B on 200K hours of YODAS2 data covering 24 European languages</p>
<div class="badges">
<span class="badge gold">🏆 #1 German TTS</span>
<span class="badge green">24 Languages</span>
<span class="badge accent">Voice Cloning</span>
<span class="badge">MIT License</span>
<span class="badge">7B Parameters</span>
</div>
</div>
"""
# Static HTML for the "Benchmarks" tab: a six-row ranking table (rank, model,
# score, win rate) followed by a caption. All figures are hard-coded in this
# string; the KugelAudio row carries the `highlight` class styled by `CSS`.
BENCHMARK_HTML = """
<table class="benchmark-table">
<thead>
<tr>
<th>Rank</th>
<th>Model</th>
<th>Score</th>
<th>Win Rate</th>
</tr>
</thead>
<tbody>
<tr class="highlight">
<td>🥇</td>
<td>KugelAudio</td>
<td>26</td>
<td>78.0%</td>
</tr>
<tr>
<td>🥈</td>
<td>ElevenLabs Multi v2</td>
<td>25</td>
<td>62.2%</td>
</tr>
<tr>
<td>🥉</td>
<td>ElevenLabs v3</td>
<td>21</td>
<td>65.3%</td>
</tr>
<tr>
<td>4</td>
<td>Cartesia</td>
<td>21</td>
<td>59.1%</td>
</tr>
<tr>
<td>5</td>
<td>VibeVoice</td>
<td>10</td>
<td>28.8%</td>
</tr>
<tr>
<td>6</td>
<td>CosyVoice v3</td>
<td>9</td>
<td>14.2%</td>
</tr>
</tbody>
</table>
<p style="text-align:center;color:var(--ka-text-muted);font-size:0.76rem;margin-top:0.3rem;">
Based on 339 human A/B evaluations · OpenSkill Bayesian ranking
</p>
"""
# Page footer rendered via `gr.HTML` after the tabs: authors, project links
# and the funding acknowledgement. Styled by the `.footer` rules in `CSS`.
FOOTER_HTML = """
<div class="footer">
<strong>KugelAudio</strong> · Created by Kajo Kratzenstein & Carlos Menke<br>
<a href="https://github.com/Kugelaudio/kugelaudio-open">GitHub</a> ·
<a href="https://huggingface.co/kugelaudio/kugelaudio-0-open">HuggingFace</a> ·
<a href="https://kugelaudio.com">API</a> ·
<a href="https://docs.kugelaudio.com/sdks/python">Docs</a><br>
Funded by the German Federal Ministry of Research, Technology and Space (BMFTR)
via the AI Service Center Berlin-Brandenburg at HPI
</div>
"""
| # ─── Build Gradio Interface ───────────────────────────────────────────────── | |
# Builds the page as a single Blocks app bound to `demo`: the hero header,
# five tabs (text-to-speech, voice cloning, watermark check, benchmarks,
# about) and the footer. Event handlers are wired right after the components
# they connect.
with gr.Blocks(
    css=CSS,
    title="KugelAudio – European TTS",
    theme=gr.themes.Base(
        primary_hue=gr.themes.colors.red,
        secondary_hue=gr.themes.colors.slate,
        neutral_hue=gr.themes.colors.slate,
        font=gr.themes.GoogleFont("IBM Plex Sans"),
        font_mono=gr.themes.GoogleFont("IBM Plex Mono"),
    ),
) as demo:
    # ── Header ──
    gr.HTML(HEADER_HTML)
    # ── Main Tabs ──
    with gr.Tabs():
        # ━━━ Tab 1: Text-to-Speech ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        with gr.TabItem("🔊 Text-to-Speech", id="tts"):
            with gr.Row():
                # Left column: inputs and controls.
                with gr.Column(scale=3):
                    # NOTE(review): the info text says 24 languages while
                    # LANGUAGES has 23 entries — confirm.
                    tts_language = gr.Dropdown(
                        choices=list(LANGUAGES.keys()),
                        value="English 🇺🇸",
                        label="Language",
                        info="24 European languages supported",
                    )
                    tts_text = gr.Textbox(
                        label="Text to Synthesize",
                        placeholder="Enter text here or click 'Fill Example' below…",
                        lines=5,
                        max_lines=12,
                    )
                    with gr.Row():
                        tts_example_btn = gr.Button(
                            "📝 Fill Example", size="sm", variant="secondary"
                        )
                        tts_clear_btn = gr.ClearButton(
                            [tts_text], value="🗑️ Clear", size="sm"
                        )
                    with gr.Accordion("⚙️ Advanced Settings", open=False):
                        tts_cfg = gr.Slider(
                            minimum=1.0,
                            maximum=10.0,
                            value=3.0,
                            step=0.5,
                            label="CFG Scale",
                            info="Guidance scale — higher values follow the text more closely",
                        )
                        tts_max_tokens = gr.Slider(
                            minimum=512,
                            maximum=8192,
                            value=4096,
                            step=512,
                            label="Max Tokens",
                            info="Maximum generation length in tokens",
                        )
                    tts_generate_btn = gr.Button(
                        "🎙️ Generate Speech", variant="primary", size="lg"
                    )
                # Right column: generated audio and the summary line.
                with gr.Column(scale=2):
                    tts_audio_out = gr.Audio(
                        label="Generated Audio",
                        type="filepath",
                        interactive=False,
                    )
                    tts_info = gr.Markdown("*Press 'Generate Speech' to synthesize audio.*")
            # Events
            tts_example_btn.click(
                fn=fill_example_text, inputs=[tts_language], outputs=[tts_text]
            )
            # Changing the language also writes the example sentence into the
            # text box, replacing whatever it currently holds.
            tts_language.change(
                fn=fill_example_text, inputs=[tts_language], outputs=[tts_text]
            )
            tts_generate_btn.click(
                fn=generate_speech,
                inputs=[tts_text, tts_language, tts_cfg, tts_max_tokens],
                outputs=[tts_audio_out, tts_info],
            )
        # ━━━ Tab 2: Voice Cloning ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        with gr.TabItem("🎭 Voice Cloning", id="clone"):
            with gr.Row():
                # Left column: text, reference recording and controls.
                with gr.Column(scale=3):
                    clone_language = gr.Dropdown(
                        choices=list(LANGUAGES.keys()),
                        value="English 🇺🇸",
                        label="Language",
                    )
                    clone_text = gr.Textbox(
                        label="Text to Synthesize",
                        placeholder="Enter the text you want spoken in the cloned voice…",
                        lines=4,
                        max_lines=10,
                    )
                    # type="filepath": the handler receives a path string.
                    clone_ref = gr.Audio(
                        label="Reference Voice",
                        type="filepath",
                        sources=["upload", "microphone"],
                    )
                    gr.Markdown(
                        "<p class='section-label'>Upload or record a few seconds of the "
                        "target voice. The model will replicate its characteristics.</p>"
                    )
                    with gr.Accordion("⚙️ Advanced Settings", open=False):
                        clone_cfg = gr.Slider(
                            minimum=1.0,
                            maximum=10.0,
                            value=3.0,
                            step=0.5,
                            label="CFG Scale",
                        )
                        clone_max_tokens = gr.Slider(
                            minimum=512,
                            maximum=8192,
                            value=4096,
                            step=512,
                            label="Max Tokens",
                        )
                    clone_btn = gr.Button(
                        "🎭 Clone & Generate", variant="primary", size="lg"
                    )
                # Right column: cloned audio and the summary line.
                with gr.Column(scale=2):
                    clone_audio_out = gr.Audio(
                        label="Cloned Voice Output",
                        type="filepath",
                        interactive=False,
                    )
                    clone_info = gr.Markdown(
                        "*Upload a reference voice and press 'Clone & Generate'.*"
                    )
            # Events
            clone_btn.click(
                fn=clone_voice,
                inputs=[
                    clone_text,
                    clone_ref,
                    clone_language,
                    clone_cfg,
                    clone_max_tokens,
                ],
                outputs=[clone_audio_out, clone_info],
            )
        # ━━━ Tab 3: Watermark Verification ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        with gr.TabItem("🔒 Watermark Verify", id="watermark"):
            with gr.Row():
                with gr.Column(scale=3):
                    wm_audio = gr.Audio(
                        label="Audio to Verify",
                        type="filepath",
                        sources=["upload"],
                    )
                    gr.Markdown(
                        "<p class='section-label'>All KugelAudio outputs are watermarked "
                        "with Facebook AudioSeal. Upload any audio file to check.</p>"
                    )
                    wm_btn = gr.Button(
                        "🔍 Verify Watermark", variant="primary", size="lg"
                    )
                with gr.Column(scale=2):
                    wm_result = gr.Markdown("*Upload audio and press 'Verify Watermark'.*")
            wm_btn.click(
                fn=verify_watermark,
                inputs=[wm_audio],
                outputs=[wm_result],
            )
        # ━━━ Tab 4: Benchmarks ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        # Static content only; no event handlers.
        with gr.TabItem("🏆 Benchmarks", id="bench"):
            gr.Markdown("### Human Preference Ranking — German TTS")
            gr.HTML(BENCHMARK_HTML)
            gr.Markdown(
                "Evaluations covered **neutral speech, shouting, singing, and "
                "drunken voice** styles across diverse German-language samples. "
                "Participants heard a reference voice and compared outputs from "
                "two anonymous models in a blind A/B test."
            )
        # ━━━ Tab 5: About ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        # Static Markdown: architecture, training details, usage policy,
        # license and citation.
        with gr.TabItem("ℹ️ About", id="about"):
            gr.Markdown("""
### Architecture
KugelAudio uses a hybrid **AR + Diffusion** pipeline:
1. **Text Encoder** — Qwen2-based language model encodes input text
2. **TTS Backbone** — Upper transformer layers generate speech representations
3. **Diffusion Head** — Predicts speech latents via denoising diffusion
4. **Acoustic Decoder** — Converts latents to waveforms
### Training
| Detail | Value |
|--------|-------|
| Base model | Microsoft VibeVoice |
| Training data | ~200,000 hours (YODAS2) |
| Hardware | 8× NVIDIA H100 |
| Duration | 5 days |
| Parameters | 7B |
### Responsible Use
KugelAudio is intended for accessibility, content creation, voice assistants,
language learning, and creative projects **with consent**. All generated audio
is watermarked with Facebook AudioSeal. Creating deepfakes, impersonation
without consent, fraud, or any illegal use is prohibited.
### License
Released under the **MIT License**.
### Citation
```bibtex
@software{kugelaudio2026,
title = {KugelAudio: Open-Source TTS for European Languages with Voice Cloning},
author = {Kratzenstein, Kajo and Menke, Carlos},
year = {2026},
url = {https://github.com/kugelaudio/kugelaudio}
}
```
""")
    # ── Footer ──
    gr.HTML(FOOTER_HTML)
| # ─── Launch ────────────────────────────────────────────────────────────────── | |
if __name__ == "__main__":
    # Enable the request queue, then serve on every interface at port 7860.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo.queue()
    demo.launch(**launch_options)