Spaces:
Runtime error
Runtime error
| """ | |
| KugelAudio Gradio Demo | |
| Open-source text-to-speech for European languages with voice cloning capabilities. | |
| """ | |
| import logging | |
| import tempfile | |
| import time | |
| import gradio as gr | |
| import torch | |
| import torchaudio | |
| import spaces | |
| from kugelaudio_open import ( | |
| KugelAudioForConditionalGenerationInference, | |
| KugelAudioProcessor, | |
| ) | |
| from kugelaudio_open.watermark import AudioWatermark | |
logger = logging.getLogger(__name__)

# --- Device & Model Setup ---
# Use the GPU with bfloat16 weights when CUDA is available; otherwise fall back
# to CPU with float32 so the module can still be imported on CPU-only hosts.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32
MODEL_ID = "kugelaudio/kugelaudio-0-open"
# Sample rate used to compute output durations and to resample audio before
# watermark detection.
OUTPUT_SAMPLE_RATE = 24000

logger.info("Loading KugelAudio model '%s' on %s (%s)โฆ", MODEL_ID, DEVICE, DTYPE)
# Load with the detected DTYPE/DEVICE so the model lives on the same device
# that `_to_device` moves the inputs to, and so the log line above is accurate.
model = KugelAudioForConditionalGenerationInference.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
).to(DEVICE)
processor = KugelAudioProcessor.from_pretrained(MODEL_ID)
watermarker = AudioWatermark()
logger.info("Model loaded successfully.")
# --- Language Configuration ---
# Maps the label shown in the language dropdowns to a language code.
# NOTE(review): this table has 23 entries while the UI text advertises
# "24 European languages" -- confirm whether one language is missing here.
LANGUAGES = {
    "English ๐บ๐ธ": "en",
    "German ๐ฉ๐ช": "de",
    "French ๐ซ๐ท": "fr",
    "Spanish ๐ช๐ธ": "es",
    "Italian ๐ฎ๐น": "it",
    "Portuguese ๐ง๐ท๐ต๐น": "pt",
    "Dutch ๐ณ๐ฑ": "nl",
    "Polish ๐ต๐ฑ": "pl",
    "Russian ๐ท๐บ": "ru",
    "Ukrainian ๐บ๐ฆ": "uk",
    "Czech ๐จ๐ฟ": "cs",
    "Romanian ๐ท๐ด": "ro",
    "Hungarian ๐ญ๐บ": "hu",
    "Swedish ๐ธ๐ช": "sv",
    "Danish ๐ฉ๐ฐ": "da",
    "Finnish ๐ซ๐ฎ": "fi",
    "Norwegian ๐ณ๐ด": "no",
    "Greek ๐ฌ๐ท": "el",
    "Bulgarian ๐ง๐ฌ": "bg",
    "Slovak ๐ธ๐ฐ": "sk",
    "Croatian ๐ญ๐ท": "hr",
    "Serbian ๐ท๐ธ": "sr",
    "Turkish ๐น๐ท": "tr",
}
# Example sentences keyed by language code; `fill_example_text` falls back to
# the English entry for codes that have no example here.
# The non-English texts were stored mis-decoded (UTF-8 bytes read through a
# Thai codepage); they are restored to proper Unicode so the model is not fed
# garbage characters.
EXAMPLE_TEXTS = {
    "en": "Welcome to KugelAudio, the open-source text-to-speech system for European languages. Our model supports voice cloning and emotional speech synthesis.",
    "de": "Willkommen bei KugelAudio, dem Open-Source Text-to-Speech System für europäische Sprachen. Unser Modell unterstützt Voice Cloning und emotionale Sprachsynthese.",
    "fr": "Bienvenue sur KugelAudio, le système de synthèse vocale open-source pour les langues européennes. Notre modèle prend en charge le clonage vocal et la synthèse vocale émotionnelle.",
    "es": "Bienvenido a KugelAudio, el sistema de texto a voz de código abierto para idiomas europeos. Nuestro modelo soporta clonación de voz y síntesis de habla emocional.",
    "it": "Benvenuto in KugelAudio, il sistema di sintesi vocale open-source per le lingue europee. Il nostro modello supporta la clonazione vocale e la sintesi vocale emotiva.",
    "pt": "Bem-vindo ao KugelAudio, o sistema de texto para fala de código aberto para idiomas europeus. Nosso modelo suporta clonagem de voz e síntese de fala emocional.",
    "nl": "Welkom bij KugelAudio, het open-source tekst-naar-spraak systeem voor Europese talen. Ons model ondersteunt stemklonering en emotionele spraaksynthese.",
    "pl": "Witamy w KugelAudio, systemie syntezy mowy o otwartym kodzie źródłowym dla języków europejskich. Nasz model obsługuje klonowanie głosu i emocjonalną syntezę mowy.",
    "ru": "Добро пожаловать в KugelAudio — систему синтеза речи с открытым исходным кодом для европейских языков. Наша модель поддерживает клонирование голоса и эмоциональный синтез речи.",
}
| # โโโ Inference Helpers โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| def _to_device(inputs: dict) -> dict: | |
| """Move tensor values to the model device.""" | |
| return { | |
| k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v | |
| for k, v in inputs.items() | |
| } | |
def _save_to_tempfile(audio_tensor: torch.Tensor) -> str:
    """Write an audio tensor to a temporary WAV file and return its path.

    The temporary file is created with ``delete=False`` so it outlives this
    call; the caller (or the OS temp cleaner) is responsible for removing it.
    """
    # Only reserve a unique path here and close the handle right away: keeping
    # it open leaks a file descriptor per request and prevents the file from
    # being reopened for writing on platforms with exclusive file locks.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    processor.save_audio(audio_tensor, wav_path)
    return wav_path
# `spaces.GPU` marks this as a GPU entry point for Hugging Face ZeroGPU
# hardware (which requires at least one decorated function at startup).
@spaces.GPU
def generate_speech(
    text: str,
    language: str,
    cfg_scale: float,
    max_tokens: int,
) -> tuple[str, str]:
    """Generate speech from text and return ``(audio_path, info_markdown)``.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        language: Dropdown label (a key of ``LANGUAGES``); unknown labels are
            reported as ``"en"``.
        cfg_scale: Guidance scale forwarded to ``model.generate``.
        max_tokens: Generation limit, forwarded as ``max_new_tokens`` after
            conversion to ``int`` (sliders may deliver floats).

    Returns:
        The path of the WAV file written for the first generated sample and a
        Markdown summary (language, CFG, duration, inference time, RTF).

    Raises:
        gr.Error: If ``text`` is empty or whitespace only.

    NOTE(review): the selected language is only shown in the summary; it is
    not passed to the processor or the model -- confirm this is intended.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    lang_code = LANGUAGES.get(language, "en")
    inputs = _to_device(processor(text=text, return_tensors="pt"))

    t0 = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=cfg_scale,
            max_new_tokens=int(max_tokens),
        )
    elapsed = time.perf_counter() - t0

    speech = outputs.speech_outputs[0]
    audio_path = _save_to_tempfile(speech)
    audio_duration = speech.shape[-1] / OUTPUT_SAMPLE_RATE
    # An empty waveform has zero duration; avoid a ZeroDivisionError in the
    # real-time-factor figure.
    rtf = f"{elapsed / audio_duration:.2f}x" if audio_duration > 0 else "n/a"

    info = (
        f"๐ **Generation Complete**\n\n"
        f"Language: {language} (`{lang_code}`) ยท "
        f"CFG: {cfg_scale} ยท "
        f"Duration: {audio_duration:.1f}s ยท "
        f"Inference: {elapsed:.2f}s ยท "
        f"RTF: {rtf}"
    )
    return audio_path, info
# `spaces.GPU` marks this as a GPU entry point for Hugging Face ZeroGPU
# hardware (which requires at least one decorated function at startup).
@spaces.GPU
def clone_voice(
    text: str,
    reference_audio: str | None,
    language: str,
    cfg_scale: float,
    max_tokens: int,
) -> tuple[str, str]:
    """Clone a voice from reference audio and synthesize new text.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        reference_audio: Path of the reference recording, passed to the
            processor as ``voice_prompt``; ``None`` when nothing was uploaded.
        language: Dropdown label (a key of ``LANGUAGES``); unknown labels are
            reported as ``"en"``.
        cfg_scale: Guidance scale forwarded to ``model.generate``.
        max_tokens: Generation limit, forwarded as ``max_new_tokens`` after
            conversion to ``int`` (sliders may deliver floats).

    Returns:
        The path of the WAV file written for the first generated sample and a
        Markdown summary (language, CFG, duration, inference time, RTF).

    Raises:
        gr.Error: If ``text`` is empty/whitespace or no reference audio is set.

    NOTE(review): the selected language is only shown in the summary; it is
    not passed to the processor or the model -- confirm this is intended.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if reference_audio is None:
        raise gr.Error("Please upload a reference audio file for voice cloning.")

    lang_code = LANGUAGES.get(language, "en")
    inputs = _to_device(
        processor(
            text=text,
            voice_prompt=reference_audio,
            return_tensors="pt",
        )
    )

    t0 = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=cfg_scale,
            max_new_tokens=int(max_tokens),
        )
    elapsed = time.perf_counter() - t0

    speech = outputs.speech_outputs[0]
    audio_path = _save_to_tempfile(speech)
    audio_duration = speech.shape[-1] / OUTPUT_SAMPLE_RATE
    # An empty waveform has zero duration; avoid a ZeroDivisionError in the
    # real-time-factor figure.
    rtf = f"{elapsed / audio_duration:.2f}x" if audio_duration > 0 else "n/a"

    info = (
        f"๐ญ **Voice Cloning Complete**\n\n"
        f"Language: {language} (`{lang_code}`) ยท "
        f"CFG: {cfg_scale} ยท "
        f"Duration: {audio_duration:.1f}s ยท "
        f"Inference: {elapsed:.2f}s ยท "
        f"RTF: {rtf}"
    )
    return audio_path, info
def verify_watermark(audio_file: str | None) -> str:
    """Run the watermark detector on an uploaded file and report it as Markdown.

    The audio is loaded with torchaudio and resampled to ``OUTPUT_SAMPLE_RATE``
    when its native rate differs, then handed to ``watermarker.detect``.

    Raises:
        gr.Error: If no audio file was provided.
    """
    if audio_file is None:
        raise gr.Error("Please upload an audio file to verify.")

    waveform, source_rate = torchaudio.load(audio_file)
    needs_resampling = source_rate != OUTPUT_SAMPLE_RATE
    if needs_resampling:
        waveform = torchaudio.functional.resample(
            waveform, source_rate, OUTPUT_SAMPLE_RATE
        )

    detection = watermarker.detect(waveform, sample_rate=OUTPUT_SAMPLE_RATE)
    verdict = (
        "โ **Watermark Detected**"
        if detection.detected
        else "โ **No Watermark Detected**"
    )
    confidence_line = f"Confidence: **{detection.confidence:.1%}**\n\n"
    return (
        "๐ **Watermark Verification**\n\n"
        + f"{verdict}\n\n"
        + confidence_line
        + "Technology: Facebook AudioSeal ยท Resolution: 1/16k second"
    )
def fill_example_text(language: str) -> str:
    """Return the example sentence for the selected dropdown label.

    Unknown labels map to ``"en"``, and codes without an example of their own
    also yield the English sentence.
    """
    code = LANGUAGES.get(language, "en")
    if code in EXAMPLE_TEXTS:
        return EXAMPLE_TEXTS[code]
    return EXAMPLE_TEXTS["en"]
# --- Custom CSS ---
# Stylesheet handed to `gr.Blocks(css=CSS)`. It defines the `--ka-*` colour
# variables and styles the classes used by the HTML snippets in this file
# (`hero-header`, `badges`/`badge`, `benchmark-table`, `section-label`,
# `footer`) plus the Gradio container and tab navigation.
CSS = """
/* โโ Base theme โโ */
:root {
--ka-primary: #1a1a2e;
--ka-accent: #e94560;
--ka-accent-hover: #ff6b81;
--ka-surface: #16213e;
--ka-surface-light: #1c2a4a;
--ka-text: #eaeaea;
--ka-text-muted: #8892a4;
--ka-border: #2a3a5c;
--ka-gold: #f5c518;
--ka-green: #2ecc71;
}
/* โโ Global โโ */
.gradio-container {
max-width: 960px !important;
margin: 0 auto !important;
font-family: 'IBM Plex Sans', 'Segoe UI', system-ui, sans-serif !important;
}
/* โโ Hero header โโ */
.hero-header {
text-align: center;
padding: .5rem 1.5rem 1.5rem;
margin-bottom: 0.5rem;
border-radius: 16px;
position: relative;
overflow: hidden;
}
.hero-header::before {
content: '';
position: absolute;
top: -50%;
left: -50%;
width: 200%;
height: 200%;
pointer-events: none;
}
.hero-header h1 {
font-size: 2.4rem !important;
font-weight: 700 !important;
margin: 0 0 0.3rem !important;
color: var(--body-text-color);
letter-spacing: -0.02em;
}
.hero-header .hero-accent {
color: var(--ka-accent);
}
.hero-header p {
color: var(--ka-text-muted);
font-size: 1.05rem;
margin: 0;
line-height: 1.5;
}
/* โโ Badges row โโ */
.badges {
display: flex;
justify-content: center;
gap: 0.5rem;
margin-top: 1rem;
flex-wrap: wrap;
}
.badge {
display: inline-flex;
align-items: center;
gap: 0.35rem;
padding: 0.3rem 0.75rem;
border-radius: 999px;
font-size: 0.78rem;
font-weight: 600;
letter-spacing: 0.01em;
background: var(--block-background-fill, var(--ka-surface-light));
color: var(--body-text-color, var(--ka-text));
border: 1px solid var(--border-color-primary, var(--ka-border));
}
.badge.gold { border-color: var(--ka-gold); color: var(--ka-gold); }
.badge.green { border-color: var(--ka-green); color: var(--ka-green); }
.badge.accent { border-color: var(--ka-accent); color: var(--ka-accent); }
/* โโ Benchmark table โโ */
.benchmark-table {
width: 100%;
border-collapse: separate;
border-spacing: 0;
margin: 0.75rem 0;
font-size: 0.88rem;
border-radius: 10px;
overflow: hidden;
border: 1px solid var(--ka-border);
}
.benchmark-table th {
background: var(--ka-surface);
color: var(--ka-text-muted);
font-weight: 600;
text-transform: uppercase;
font-size: 0.72rem;
letter-spacing: 0.06em;
padding: 0.65rem 0.8rem;
text-align: left;
}
.benchmark-table td {
padding: 0.55rem 0.8rem;
border-top: 1px solid var(--ka-border);
color: var(--ka-text);
}
.benchmark-table tr.highlight td {
background: rgba(233, 69, 96, 0.08);
font-weight: 600;
}
.benchmark-table tr:not(.highlight) td {
background: transparent;
}
/* โโ Section divider โโ */
.section-label {
font-size: 0.7rem;
text-transform: uppercase;
letter-spacing: 0.1em;
color: var(--ka-text-muted);
margin: 1rem 0 0.3rem;
padding-left: 2px;
font-weight: 600;
}
/* โโ Tab styling โโ */
.tab-nav button {
font-weight: 600 !important;
letter-spacing: 0.01em !important;
}
.tab-nav button.selected {
border-color: var(--ka-accent) !important;
color: var(--ka-accent) !important;
}
/* โโ Footer โโ */
.footer {
text-align: center;
padding: 1.2rem;
margin-top: 1rem;
font-size: 0.8rem;
color: var(--ka-text-muted);
border-top: 1px solid var(--ka-border);
line-height: 1.6;
}
.footer a {
color: var(--ka-accent);
text-decoration: none;
}
.footer a:hover {
text-decoration: underline;
}
"""
# --- Header HTML ---
# Hero banner rendered at the top of the page via `gr.HTML(HEADER_HTML)`:
# title, one-line description and a row of feature badges. The class names
# are styled by the `CSS` constant.
HEADER_HTML = """
<div class="hero-header">
<h1>๐๏ธ <span class="hero-accent">Kugel</span>Audio</h1>
<p>Kugel-Audio is a fine-tune of Vibe-Voice 7B in 200K hours of 24 european languages on the YODAS2 dataset</p>
<div class="badges">
<span class="badge gold">๐ #1 German TTS</span>
<span class="badge green">24 Languages</span>
<span class="badge accent">Voice Cloning</span>
<span class="badge">MIT License</span>
<span class="badge">7B Parameters</span>
</div>
</div>
"""
# Static ranking table shown in the "Benchmarks" tab via
# `gr.HTML(BENCHMARK_HTML)`. The numbers are hard-coded here; the row with
# class "highlight" is the KugelAudio entry.
BENCHMARK_HTML = """
<table class="benchmark-table">
<thead>
<tr>
<th>Rank</th>
<th>Model</th>
<th>Score</th>
<th>Win Rate</th>
</tr>
</thead>
<tbody>
<tr class="highlight">
<td>๐ฅ</td>
<td>KugelAudio</td>
<td>26</td>
<td>78.0%</td>
</tr>
<tr>
<td>๐ฅ</td>
<td>ElevenLabs Multi v2</td>
<td>25</td>
<td>62.2%</td>
</tr>
<tr>
<td>๐ฅ</td>
<td>ElevenLabs v3</td>
<td>21</td>
<td>65.3%</td>
</tr>
<tr>
<td>4</td>
<td>Cartesia</td>
<td>21</td>
<td>59.1%</td>
</tr>
<tr>
<td>5</td>
<td>VibeVoice</td>
<td>10</td>
<td>28.8%</td>
</tr>
<tr>
<td>6</td>
<td>CosyVoice v3</td>
<td>9</td>
<td>14.2%</td>
</tr>
</tbody>
</table>
<p style="text-align:center;color:var(--ka-text-muted);font-size:0.76rem;margin-top:0.3rem;">
Based on 339 human A/B evaluations ยท OpenSkill Bayesian ranking
</p>
"""
# Page footer rendered via `gr.HTML(FOOTER_HTML)`: credits, project links and
# the funding acknowledgement.
FOOTER_HTML = """
<div class="footer">
<strong>KugelAudio</strong> ยท Created by Kajo Kratzenstein & Carlos Menke<br>
<a href="https://github.com/Kugelaudio/kugelaudio-open">GitHub</a> ยท
<a href="https://huggingface.co/kugelaudio/kugelaudio-0-open">HuggingFace</a> ยท
<a href="https://kugelaudio.com">API</a> ยท
<a href="https://docs.kugelaudio.com/sdks/python">Docs</a><br>
Funded by the German Federal Ministry of Research, Technology and Space (BMFTR)
via the AI Service Center Berlin-Brandenburg at HPI
</div>
"""
# --- Build Gradio Interface ---
# One Blocks app with five tabs: text-to-speech, voice cloning, watermark
# verification, benchmark table and an "About" page. Event handlers are wired
# right after the components of the tab they belong to.
with gr.Blocks(
    css=CSS,
    title="KugelAudio โ European TTS",
    theme=gr.themes.Base(
        primary_hue=gr.themes.colors.red,
        secondary_hue=gr.themes.colors.slate,
        neutral_hue=gr.themes.colors.slate,
        font=gr.themes.GoogleFont("IBM Plex Sans"),
        font_mono=gr.themes.GoogleFont("IBM Plex Mono"),
    ),
) as demo:
    # Page header
    gr.HTML(HEADER_HTML)

    with gr.Tabs():
        # ---- Tab 1: Text-to-Speech ----
        with gr.TabItem("๐ Text-to-Speech", id="tts"):
            with gr.Row():
                # Left column: inputs and settings
                with gr.Column(scale=3):
                    tts_language = gr.Dropdown(
                        choices=list(LANGUAGES),
                        value="English ๐บ๐ธ",
                        label="Language",
                        info="24 European languages supported",
                    )
                    tts_text = gr.Textbox(
                        label="Text to Synthesize",
                        placeholder="Enter text here or click 'Fill Example' belowโฆ",
                        lines=5,
                        max_lines=12,
                    )
                    with gr.Row():
                        tts_example_btn = gr.Button(
                            "๐ Fill Example", size="sm", variant="secondary"
                        )
                        tts_clear_btn = gr.ClearButton(
                            [tts_text], value="๐๏ธ Clear", size="sm"
                        )
                    with gr.Accordion("โ๏ธ Advanced Settings", open=False):
                        tts_cfg = gr.Slider(
                            minimum=1.0,
                            maximum=10.0,
                            value=3.0,
                            step=0.5,
                            label="CFG Scale",
                            info="Guidance scale โ higher values follow the text more closely",
                        )
                        tts_max_tokens = gr.Slider(
                            minimum=512,
                            maximum=8192,
                            value=4096,
                            step=512,
                            label="Max Tokens",
                            info="Maximum generation length in tokens",
                        )
                    tts_generate_btn = gr.Button(
                        "๐๏ธ Generate Speech", variant="primary", size="lg"
                    )
                # Right column: generated audio and run summary
                with gr.Column(scale=2):
                    tts_audio_out = gr.Audio(
                        label="Generated Audio",
                        type="filepath",
                        interactive=False,
                    )
                    tts_info = gr.Markdown(
                        "*Press 'Generate Speech' to synthesize audio.*"
                    )

            # Both the example button and a language change refill the text box
            # with the example sentence for the selected language.
            tts_example_btn.click(
                fn=fill_example_text, inputs=[tts_language], outputs=[tts_text]
            )
            tts_language.change(
                fn=fill_example_text, inputs=[tts_language], outputs=[tts_text]
            )
            tts_generate_btn.click(
                fn=generate_speech,
                inputs=[tts_text, tts_language, tts_cfg, tts_max_tokens],
                outputs=[tts_audio_out, tts_info],
            )

        # ---- Tab 2: Voice Cloning ----
        with gr.TabItem("๐ญ Voice Cloning", id="clone"):
            with gr.Row():
                # Left column: text, reference recording and settings
                with gr.Column(scale=3):
                    clone_language = gr.Dropdown(
                        choices=list(LANGUAGES),
                        value="English ๐บ๐ธ",
                        label="Language",
                    )
                    clone_text = gr.Textbox(
                        label="Text to Synthesize",
                        placeholder="Enter the text you want spoken in the cloned voiceโฆ",
                        lines=4,
                        max_lines=10,
                    )
                    clone_ref = gr.Audio(
                        label="Reference Voice",
                        type="filepath",
                        sources=["upload", "microphone"],
                    )
                    gr.Markdown(
                        "<p class='section-label'>Upload or record a few seconds of the "
                        "target voice. The model will replicate its characteristics.</p>"
                    )
                    with gr.Accordion("โ๏ธ Advanced Settings", open=False):
                        clone_cfg = gr.Slider(
                            minimum=1.0,
                            maximum=10.0,
                            value=3.0,
                            step=0.5,
                            label="CFG Scale",
                        )
                        clone_max_tokens = gr.Slider(
                            minimum=512,
                            maximum=8192,
                            value=4096,
                            step=512,
                            label="Max Tokens",
                        )
                    clone_btn = gr.Button(
                        "๐ญ Clone & Generate", variant="primary", size="lg"
                    )
                # Right column: cloned audio and run summary
                with gr.Column(scale=2):
                    clone_audio_out = gr.Audio(
                        label="Cloned Voice Output",
                        type="filepath",
                        interactive=False,
                    )
                    clone_info = gr.Markdown(
                        "*Upload a reference voice and press 'Clone & Generate'.*"
                    )

            # Input order matches the parameter order of `clone_voice`.
            clone_btn.click(
                fn=clone_voice,
                inputs=[
                    clone_text,
                    clone_ref,
                    clone_language,
                    clone_cfg,
                    clone_max_tokens,
                ],
                outputs=[clone_audio_out, clone_info],
            )

        # ---- Tab 3: Watermark Verification ----
        with gr.TabItem("๐ Watermark Verify", id="watermark"):
            with gr.Row():
                with gr.Column(scale=3):
                    wm_audio = gr.Audio(
                        label="Audio to Verify",
                        type="filepath",
                        sources=["upload"],
                    )
                    gr.Markdown(
                        "<p class='section-label'>All KugelAudio outputs are watermarked "
                        "with Facebook AudioSeal. Upload any audio file to check.</p>"
                    )
                    wm_btn = gr.Button(
                        "๐ Verify Watermark", variant="primary", size="lg"
                    )
                with gr.Column(scale=2):
                    wm_result = gr.Markdown(
                        "*Upload audio and press 'Verify Watermark'.*"
                    )

            wm_btn.click(
                fn=verify_watermark,
                inputs=[wm_audio],
                outputs=[wm_result],
            )

        # ---- Tab 4: Benchmarks ----
        with gr.TabItem("๐ Benchmarks", id="bench"):
            gr.Markdown("### Human Preference Ranking โ German TTS")
            gr.HTML(BENCHMARK_HTML)
            gr.Markdown(
                "Evaluations covered **neutral speech, shouting, singing, and "
                "drunken voice** styles across diverse German-language samples. "
                "Participants heard a reference voice and compared outputs from "
                "two anonymous models in a blind A/B test."
            )

        # ---- Tab 5: About ----
        with gr.TabItem("โน๏ธ About", id="about"):
            # The Markdown body is kept flush-left so no line is rendered as
            # an indented code block.
            gr.Markdown("""
### Architecture
KugelAudio uses a hybrid **AR + Diffusion** pipeline:
1. **Text Encoder** โ Qwen2-based language model encodes input text
2. **TTS Backbone** โ Upper transformer layers generate speech representations
3. **Diffusion Head** โ Predicts speech latents via denoising diffusion
4. **Acoustic Decoder** โ Converts latents to waveforms
### Training
| Detail | Value |
|--------|-------|
| Base model | Microsoft VibeVoice |
| Training data | ~200,000 hours (YODAS2) |
| Hardware | 8ร NVIDIA H100 |
| Duration | 5 days |
| Parameters | 7B |
### Responsible Use
KugelAudio is intended for accessibility, content creation, voice assistants,
language learning, and creative projects **with consent**. All generated audio
is watermarked with Facebook AudioSeal. Creating deepfakes, impersonation
without consent, fraud, or any illegal use is prohibited.
### License
Released under the **MIT License**.
### Citation
```bibtex
@software{kugelaudio2026,
title = {KugelAudio: Open-Source TTS for European Languages with Voice Cloning},
author = {Kratzenstein, Kajo and Menke, Carlos},
year = {2026},
url = {https://github.com/kugelaudio/kugelaudio}
}
```
""")

    # Page footer
    gr.HTML(FOOTER_HTML)
# --- Launch ---
if __name__ == "__main__":
    # Listen on all interfaces on port 7860.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    # Enable the request queue before starting the server.
    demo.queue()
    demo.launch(**launch_options)