kugelaudio

Runtime error

App Files Files Community

multimodalart HF Staff committed 29 days ago

Commit

a1d2c1f

verified ·

1 Parent(s): b4096d5

Create app.py

Browse files

Files changed (1) hide show

app.py +688 -0

app.py ADDED Viewed

	@@ -0,0 +1,688 @@

+"""
+KugelAudio Gradio Demo
+Open-source text-to-speech for European languages with voice cloning capabilities.
+"""
+import logging
+import tempfile
+import time
+import gradio as gr
+import torch
+import torchaudio
+from kugelaudio_open import (
+    KugelAudioForConditionalGenerationInference,
+    KugelAudioProcessor,
+)
+from kugelaudio_open.watermark import AudioWatermark
+logger = logging.getLogger(__name__)
+# ─── Device & Model Setup ───────────────────────────────────────────────────
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32
+MODEL_ID = "kugelaudio/kugelaudio-0-open"
+OUTPUT_SAMPLE_RATE = 24000
+logger.info("Loading KugelAudio model '%s' on %s (%s)…", MODEL_ID, DEVICE, DTYPE)
+model = KugelAudioForConditionalGenerationInference.from_pretrained(
+    MODEL_ID,
+    torch_dtype=DTYPE,
+).to(DEVICE)
+model.eval()
+processor = KugelAudioProcessor.from_pretrained(MODEL_ID)
+watermarker = AudioWatermark()
+logger.info("Model loaded successfully.")
+# ─── Language Configuration ──────────────────────────────────────────────────
+# Maps the label shown in the language dropdowns to a two-letter language code.
+# Insertion order is the order in which the dropdown choices are listed.
+# NOTE(review): this dict has 23 entries while the UI text advertises
+# "24 languages" — confirm whether an entry is missing.
+LANGUAGES = {
+    "English 🇺🇸": "en", "German 🇩🇪": "de", "French 🇫🇷": "fr",
+    "Spanish 🇪🇸": "es", "Italian 🇮🇹": "it", "Portuguese 🇵🇹": "pt",
+    "Dutch 🇳🇱": "nl", "Polish 🇵🇱": "pl", "Russian 🇷🇺": "ru",
+    "Ukrainian 🇺🇦": "uk", "Czech 🇨🇿": "cs", "Romanian 🇷🇴": "ro",
+    "Hungarian 🇭🇺": "hu", "Swedish 🇸🇪": "sv", "Danish 🇩🇰": "da",
+    "Finnish 🇫🇮": "fi", "Norwegian 🇳🇴": "no", "Greek 🇬🇷": "el",
+    "Bulgarian 🇧🇬": "bg", "Slovak 🇸🇰": "sk", "Croatian 🇭🇷": "hr",
+    "Serbian 🇷🇸": "sr", "Turkish 🇹🇷": "tr",
+}
+# Sample sentences keyed by language code, used to pre-fill the text box.
+# Only nine of the configured languages have an entry; fill_example_text()
+# falls back to the English sentence for every other code.
+EXAMPLE_TEXTS = {
+    "en": "Welcome to KugelAudio, the open-source text-to-speech system for European languages. Our model supports voice cloning and emotional speech synthesis.",
+    "de": "Willkommen bei KugelAudio, dem Open-Source Text-to-Speech System für europäische Sprachen. Unser Modell unterstützt Voice Cloning und emotionale Sprachsynthese.",
+    "fr": "Bienvenue sur KugelAudio, le système de synthèse vocale open-source pour les langues européennes. Notre modèle prend en charge le clonage vocal et la synthèse vocale émotionnelle.",
+    "es": "Bienvenido a KugelAudio, el sistema de texto a voz de código abierto para idiomas europeos. Nuestro modelo soporta clonación de voz y síntesis de habla emocional.",
+    "it": "Benvenuto in KugelAudio, il sistema di sintesi vocale open-source per le lingue europee. Il nostro modello supporta la clonazione vocale e la sintesi vocale emotiva.",
+    "pt": "Bem-vindo ao KugelAudio, o sistema de texto para fala de código aberto para idiomas europeus. Nosso modelo suporta clonagem de voz e síntese de fala emocional.",
+    "nl": "Welkom bij KugelAudio, het open-source tekst-naar-spraak systeem voor Europese talen. Ons model ondersteunt stemklonering en emotionele spraaksynthese.",
+    "pl": "Witamy w KugelAudio, systemie syntezy mowy o otwartym kodzie źródłowym dla języków europejskich. Nasz model obsługuje klonowanie głosu i emocjonalną syntezę mowy.",
+    "ru": "Добро пожаловать в KugelAudio — систему синтеза речи с открытым исходным кодом для европейских языков. Наша модель поддерживает клонирование голоса и эмоциональный синтез речи.",
+}
+# ─── Inference Helpers ───────────────────────────────────────────────────────
+def _to_device(inputs: dict) -> dict:
+    """Move tensor values to the model device."""
+    return {
+        k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
+        for k, v in inputs.items()
+    }
+def _save_to_tempfile(audio_tensor: torch.Tensor) -> str:
+    """Write an audio tensor to a temporary WAV file and return its path."""
+    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+    processor.save_audio(audio_tensor, tmp.name)
+    return tmp.name
+def generate_speech(
+    text: str,
+    language: str,
+    cfg_scale: float,
+    max_tokens: int,
+) -> tuple[str, str]:
+    """Generate speech from text and return (audio_path, info_markdown)."""
+    if not text.strip():
+        raise gr.Error("Please enter some text to synthesize.")
+    lang_code = LANGUAGES.get(language, "en")
+    inputs = processor(text=text, return_tensors="pt")
+    inputs = _to_device(inputs)
+    t0 = time.perf_counter()
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            cfg_scale=cfg_scale,
+            max_new_tokens=int(max_tokens),
+        )
+    elapsed = time.perf_counter() - t0
+    audio_path = _save_to_tempfile(outputs.speech_outputs[0])
+    audio_duration = outputs.speech_outputs[0].shape[-1] / OUTPUT_SAMPLE_RATE
+    info = (
+        f"🔊 **Generation Complete**\n\n"
+        f"Language: {language} (`{lang_code}`) · "
+        f"CFG: {cfg_scale} · "
+        f"Duration: {audio_duration:.1f}s · "
+        f"Inference: {elapsed:.2f}s · "
+        f"RTF: {elapsed / audio_duration:.2f}x"
+    )
+    return audio_path, info
+def clone_voice(
+    text: str,
+    reference_audio: str | None,
+    language: str,
+    cfg_scale: float,
+    max_tokens: int,
+) -> tuple[str, str]:
+    """Clone a voice from reference audio and synthesize new text."""
+    if not text.strip():
+        raise gr.Error("Please enter some text to synthesize.")
+    if reference_audio is None:
+        raise gr.Error("Please upload a reference audio file for voice cloning.")
+    lang_code = LANGUAGES.get(language, "en")
+    inputs = processor(
+        text=text,
+        voice_prompt=reference_audio,
+        return_tensors="pt",
+    )
+    inputs = _to_device(inputs)
+    t0 = time.perf_counter()
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            cfg_scale=cfg_scale,
+            max_new_tokens=int(max_tokens),
+        )
+    elapsed = time.perf_counter() - t0
+    audio_path = _save_to_tempfile(outputs.speech_outputs[0])
+    audio_duration = outputs.speech_outputs[0].shape[-1] / OUTPUT_SAMPLE_RATE
+    info = (
+        f"🎭 **Voice Cloning Complete**\n\n"
+        f"Language: {language} (`{lang_code}`) · "
+        f"CFG: {cfg_scale} · "
+        f"Duration: {audio_duration:.1f}s · "
+        f"Inference: {elapsed:.2f}s · "
+        f"RTF: {elapsed / audio_duration:.2f}x"
+    )
+    return audio_path, info
+def verify_watermark(audio_file: str | None) -> str:
+    """Detect the AudioSeal watermark in an uploaded audio file."""
+    if audio_file is None:
+        raise gr.Error("Please upload an audio file to verify.")
+    waveform, sr = torchaudio.load(audio_file)
+    # Resample to the expected rate if necessary
+    if sr != OUTPUT_SAMPLE_RATE:
+        waveform = torchaudio.functional.resample(waveform, sr, OUTPUT_SAMPLE_RATE)
+    result = watermarker.detect(waveform, sample_rate=OUTPUT_SAMPLE_RATE)
+    if result.detected:
+        status = "✅ **Watermark Detected**"
+    else:
+        status = "❌ **No Watermark Detected**"
+    return (
+        f"🔍 **Watermark Verification**\n\n"
+        f"{status}\n\n"
+        f"Confidence: **{result.confidence:.1%}**\n\n"
+        f"Technology: Facebook AudioSeal · Resolution: 1/16k second"
+    )
+def fill_example_text(language: str) -> str:
+    """Fill the text box with an example in the selected language."""
+    lang_code = LANGUAGES.get(language, "en")
+    return EXAMPLE_TEXTS.get(lang_code, EXAMPLE_TEXTS["en"])
+# ─── Custom CSS ──────────────────────────────────────────────────────────────
+# Stylesheet passed to gr.Blocks(css=CSS). The :root rule declares the --ka-*
+# colour variables reused by the later rules. The .hero-header, .badges/.badge,
+# .benchmark-table, .section-label and .footer classes style the HTML/Markdown
+# snippets defined in this module; the .gradio-container and .tab-nav rules
+# presumably target Gradio's own markup — verify against the installed
+# Gradio version.
+CSS = """
+/* ── Base theme ── */
+:root {
+    --ka-primary: #1a1a2e;
+    --ka-accent: #e94560;
+    --ka-accent-hover: #ff6b81;
+    --ka-surface: #16213e;
+    --ka-surface-light: #1c2a4a;
+    --ka-text: #eaeaea;
+    --ka-text-muted: #8892a4;
+    --ka-border: #2a3a5c;
+    --ka-gold: #f5c518;
+    --ka-green: #2ecc71;
+}
+/* ── Global ── */
+.gradio-container {
+    max-width: 960px !important;
+    margin: 0 auto !important;
+    font-family: 'IBM Plex Sans', 'Segoe UI', system-ui, sans-serif !important;
+}
+/* ── Hero header ── */
+.hero-header {
+    text-align: center;
+    padding: 2.5rem 1.5rem 1.5rem;
+    margin-bottom: 0.5rem;
+    background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%);
+    border-radius: 16px;
+    border: 1px solid var(--ka-border);
+    position: relative;
+    overflow: hidden;
+}
+.hero-header::before {
+    content: '';
+    position: absolute;
+    top: -50%;
+    left: -50%;
+    width: 200%;
+    height: 200%;
+    background: radial-gradient(circle at 30% 50%, rgba(233,69,96,0.06) 0%, transparent 50%),
+                radial-gradient(circle at 70% 80%, rgba(15,52,96,0.08) 0%, transparent 50%);
+    pointer-events: none;
+}
+.hero-header h1 {
+    font-size: 2.4rem !important;
+    font-weight: 700 !important;
+    margin: 0 0 0.3rem !important;
+    color: #ffffff !important;
+    letter-spacing: -0.02em;
+}
+.hero-header .hero-accent {
+    color: var(--ka-accent);
+}
+.hero-header p {
+    color: var(--ka-text-muted);
+    font-size: 1.05rem;
+    margin: 0;
+    line-height: 1.5;
+}
+/* ── Badges row ── */
+.badges {
+    display: flex;
+    justify-content: center;
+    gap: 0.5rem;
+    margin-top: 1rem;
+    flex-wrap: wrap;
+}
+.badge {
+    display: inline-flex;
+    align-items: center;
+    gap: 0.35rem;
+    padding: 0.3rem 0.75rem;
+    border-radius: 999px;
+    font-size: 0.78rem;
+    font-weight: 600;
+    letter-spacing: 0.01em;
+    border: 1px solid var(--ka-border);
+    background: var(--ka-surface-light);
+    color: var(--ka-text);
+}
+.badge.gold { border-color: var(--ka-gold); color: var(--ka-gold); }
+.badge.green { border-color: var(--ka-green); color: var(--ka-green); }
+.badge.accent { border-color: var(--ka-accent); color: var(--ka-accent); }
+/* ── Benchmark table ── */
+.benchmark-table {
+    width: 100%;
+    border-collapse: separate;
+    border-spacing: 0;
+    margin: 0.75rem 0;
+    font-size: 0.88rem;
+    border-radius: 10px;
+    overflow: hidden;
+    border: 1px solid var(--ka-border);
+}
+.benchmark-table th {
+    background: var(--ka-surface);
+    color: var(--ka-text-muted);
+    font-weight: 600;
+    text-transform: uppercase;
+    font-size: 0.72rem;
+    letter-spacing: 0.06em;
+    padding: 0.65rem 0.8rem;
+    text-align: left;
+}
+.benchmark-table td {
+    padding: 0.55rem 0.8rem;
+    border-top: 1px solid var(--ka-border);
+    color: var(--ka-text);
+}
+.benchmark-table tr.highlight td {
+    background: rgba(233, 69, 96, 0.08);
+    font-weight: 600;
+}
+.benchmark-table tr:not(.highlight) td {
+    background: transparent;
+}
+/* ── Section divider ── */
+.section-label {
+    font-size: 0.7rem;
+    text-transform: uppercase;
+    letter-spacing: 0.1em;
+    color: var(--ka-text-muted);
+    margin: 1rem 0 0.3rem;
+    padding-left: 2px;
+    font-weight: 600;
+}
+/* ── Tab styling ── */
+.tab-nav button {
+    font-weight: 600 !important;
+    letter-spacing: 0.01em !important;
+}
+.tab-nav button.selected {
+    border-color: var(--ka-accent) !important;
+    color: var(--ka-accent) !important;
+}
+/* ── Footer ── */
+.footer {
+    text-align: center;
+    padding: 1.2rem;
+    margin-top: 1rem;
+    font-size: 0.8rem;
+    color: var(--ka-text-muted);
+    border-top: 1px solid var(--ka-border);
+    line-height: 1.6;
+}
+.footer a {
+    color: var(--ka-accent);
+    text-decoration: none;
+}
+.footer a:hover {
+    text-decoration: underline;
+}
+"""
+# ─── Header HTML ─────────────────────────────────────────────────────────────
+# Banner rendered at the top of the page via gr.HTML: title, tagline and a row
+# of feature badges, styled by the .hero-header / .badge rules in CSS.
+# NOTE(review): the "24 Languages" badge does not match the 23 entries in
+# LANGUAGES — confirm which is correct.
+HEADER_HTML = """
+<div class="hero-header">
+    <h1>🎙️ <span class="hero-accent">Kugel</span>Audio</h1>
+    <p>Open-source text-to-speech for European languages · AR + Diffusion architecture</p>
+    <div class="badges">
+        <span class="badge gold">🏆 #1 German TTS</span>
+        <span class="badge green">24 Languages</span>
+        <span class="badge accent">Voice Cloning</span>
+        <span class="badge">MIT License</span>
+        <span class="badge">7B Parameters</span>
+    </div>
+</div>
+"""
+# Static ranking table shown in the Benchmarks tab (rank, model, score, win
+# rate) followed by a caption describing the evaluation. The figures are
+# hard-coded here; nothing in this module computes them. The row with
+# class="highlight" gets the tinted background defined in CSS.
+BENCHMARK_HTML = """
+<table class="benchmark-table">
+    <thead>
+        <tr>
+            <th>Rank</th>
+            <th>Model</th>
+            <th>Score</th>
+            <th>Win Rate</th>
+        </tr>
+    </thead>
+    <tbody>
+        <tr class="highlight">
+            <td>🥇</td>
+            <td>KugelAudio</td>
+            <td>26</td>
+            <td>78.0%</td>
+        </tr>
+        <tr>
+            <td>🥈</td>
+            <td>ElevenLabs Multi v2</td>
+            <td>25</td>
+            <td>62.2%</td>
+        </tr>
+        <tr>
+            <td>🥉</td>
+            <td>ElevenLabs v3</td>
+            <td>21</td>
+            <td>65.3%</td>
+        </tr>
+        <tr>
+            <td>4</td>
+            <td>Cartesia</td>
+            <td>21</td>
+            <td>59.1%</td>
+        </tr>
+        <tr>
+            <td>5</td>
+            <td>VibeVoice</td>
+            <td>10</td>
+            <td>28.8%</td>
+        </tr>
+        <tr>
+            <td>6</td>
+            <td>CosyVoice v3</td>
+            <td>9</td>
+            <td>14.2%</td>
+        </tr>
+    </tbody>
+</table>
+<p style="text-align:center;color:var(--ka-text-muted);font-size:0.76rem;margin-top:0.3rem;">
+    Based on 339 human A/B evaluations · OpenSkill Bayesian ranking
+</p>
+"""
+# Footer rendered at the bottom of the page via gr.HTML: credits, project
+# links and the funding acknowledgement, styled by the .footer rules in CSS.
+FOOTER_HTML = """
+<div class="footer">
+    <strong>KugelAudio</strong> · Created by Kajo Kratzenstein & Carlos Menke<br>
+    <a href="https://github.com/Kugelaudio/kugelaudio-open">GitHub</a> ·
+    <a href="https://huggingface.co/kugelaudio/kugelaudio-0-open">HuggingFace</a> ·
+    <a href="https://kugelaudio.com">API</a> ·
+    <a href="https://docs.kugelaudio.com/sdks/python">Docs</a><br>
+    Funded by the German Federal Ministry of Research, Technology and Space (BMFTR)
+    via the AI Service Center Berlin-Brandenburg at HPI
+</div>
+"""
+# ─── Build Gradio Interface ─────────────────────────────────────────────────
+# Page layout: header banner, five tabs (text-to-speech, voice cloning,
+# watermark verification, benchmarks, about) and a footer. Components are
+# declared first inside each tab; the event wiring follows below them.
+with gr.Blocks(
+    css=CSS,
+    title="KugelAudio – European TTS",
+    theme=gr.themes.Base(
+        primary_hue=gr.themes.colors.red,
+        secondary_hue=gr.themes.colors.slate,
+        neutral_hue=gr.themes.colors.slate,
+        font=gr.themes.GoogleFont("IBM Plex Sans"),
+        font_mono=gr.themes.GoogleFont("IBM Plex Mono"),
+    ),
+) as demo:
+    # ── Header ──
+    gr.HTML(HEADER_HTML)
+    # ── Main Tabs ──
+    with gr.Tabs():
+        # ━━━ Tab 1: Text-to-Speech ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+        with gr.TabItem("🔊 Text-to-Speech", id="tts"):
+            with gr.Row():
+                # Left column: inputs and settings.
+                with gr.Column(scale=3):
+                    # NOTE(review): the info text says 24 languages but
+                    # LANGUAGES provides 23 choices — confirm.
+                    tts_language = gr.Dropdown(
+                        choices=list(LANGUAGES.keys()),
+                        value="English 🇺🇸",
+                        label="Language",
+                        info="24 European languages supported",
+                    )
+                    tts_text = gr.Textbox(
+                        label="Text to Synthesize",
+                        placeholder="Enter text here or click 'Fill Example' below…",
+                        lines=5,
+                        max_lines=12,
+                    )
+                    with gr.Row():
+                        tts_example_btn = gr.Button(
+                            "📝 Fill Example", size="sm", variant="secondary"
+                        )
+                        tts_clear_btn = gr.ClearButton(
+                            [tts_text], value="🗑️ Clear", size="sm"
+                        )
+                    with gr.Accordion("⚙️ Advanced Settings", open=False):
+                        tts_cfg = gr.Slider(
+                            minimum=1.0,
+                            maximum=10.0,
+                            value=3.0,
+                            step=0.5,
+                            label="CFG Scale",
+                            info="Guidance scale — higher values follow the text more closely",
+                        )
+                        tts_max_tokens = gr.Slider(
+                            minimum=512,
+                            maximum=8192,
+                            value=4096,
+                            step=512,
+                            label="Max Tokens",
+                            info="Maximum generation length in tokens",
+                        )
+                    tts_generate_btn = gr.Button(
+                        "🎙️ Generate Speech", variant="primary", size="lg"
+                    )
+                # Right column: generated audio and the run summary.
+                with gr.Column(scale=2):
+                    tts_audio_out = gr.Audio(
+                        label="Generated Audio",
+                        type="filepath",
+                        interactive=False,
+                    )
+                    tts_info = gr.Markdown("*Press 'Generate Speech' to synthesize audio.*")
+            # Events
+            tts_example_btn.click(
+                fn=fill_example_text, inputs=[tts_language], outputs=[tts_text]
+            )
+            # Changing the language also replaces the text box content with
+            # that language's example, overwriting whatever was typed.
+            tts_language.change(
+                fn=fill_example_text, inputs=[tts_language], outputs=[tts_text]
+            )
+            # Input order must match generate_speech's positional parameters.
+            tts_generate_btn.click(
+                fn=generate_speech,
+                inputs=[tts_text, tts_language, tts_cfg, tts_max_tokens],
+                outputs=[tts_audio_out, tts_info],
+            )
+        # ━━━ Tab 2: Voice Cloning ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+        with gr.TabItem("🎭 Voice Cloning", id="clone"):
+            with gr.Row():
+                # Left column: text, reference recording and settings.
+                with gr.Column(scale=3):
+                    clone_language = gr.Dropdown(
+                        choices=list(LANGUAGES.keys()),
+                        value="German 🇩🇪",
+                        label="Language",
+                    )
+                    clone_text = gr.Textbox(
+                        label="Text to Synthesize",
+                        placeholder="Enter the text you want spoken in the cloned voice…",
+                        lines=4,
+                        max_lines=10,
+                    )
+                    # type="filepath" hands clone_voice a path string (or None).
+                    clone_ref = gr.Audio(
+                        label="Reference Voice",
+                        type="filepath",
+                        sources=["upload", "microphone"],
+                    )
+                    gr.Markdown(
+                        "<p class='section-label'>Upload or record a few seconds of the "
+                        "target voice. The model will replicate its characteristics.</p>"
+                    )
+                    with gr.Accordion("⚙️ Advanced Settings", open=False):
+                        clone_cfg = gr.Slider(
+                            minimum=1.0,
+                            maximum=10.0,
+                            value=3.0,
+                            step=0.5,
+                            label="CFG Scale",
+                        )
+                        clone_max_tokens = gr.Slider(
+                            minimum=512,
+                            maximum=8192,
+                            value=4096,
+                            step=512,
+                            label="Max Tokens",
+                        )
+                    clone_btn = gr.Button(
+                        "🎭 Clone & Generate", variant="primary", size="lg"
+                    )
+                # Right column: cloned audio and the run summary.
+                with gr.Column(scale=2):
+                    clone_audio_out = gr.Audio(
+                        label="Cloned Voice Output",
+                        type="filepath",
+                        interactive=False,
+                    )
+                    clone_info = gr.Markdown(
+                        "*Upload a reference voice and press 'Clone & Generate'.*"
+                    )
+            # Events
+            # Input order must match clone_voice's positional parameters.
+            clone_btn.click(
+                fn=clone_voice,
+                inputs=[
+                    clone_text,
+                    clone_ref,
+                    clone_language,
+                    clone_cfg,
+                    clone_max_tokens,
+                ],
+                outputs=[clone_audio_out, clone_info],
+            )
+        # ━━━ Tab 3: Watermark Verification ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+        with gr.TabItem("🔒 Watermark Verify", id="watermark"):
+            with gr.Row():
+                with gr.Column(scale=3):
+                    wm_audio = gr.Audio(
+                        label="Audio to Verify",
+                        type="filepath",
+                        sources=["upload"],
+                    )
+                    gr.Markdown(
+                        "<p class='section-label'>All KugelAudio outputs are watermarked "
+                        "with Facebook AudioSeal. Upload any audio file to check.</p>"
+                    )
+                    wm_btn = gr.Button(
+                        "🔍 Verify Watermark", variant="primary", size="lg"
+                    )
+                with gr.Column(scale=2):
+                    wm_result = gr.Markdown("*Upload audio and press 'Verify Watermark'.*")
+            # Events
+            wm_btn.click(
+                fn=verify_watermark,
+                inputs=[wm_audio],
+                outputs=[wm_result],
+            )
+        # ━━━ Tab 4: Benchmarks ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+        # Static content only; no event handlers.
+        with gr.TabItem("🏆 Benchmarks", id="bench"):
+            gr.Markdown("### Human Preference Ranking — German TTS")
+            gr.HTML(BENCHMARK_HTML)
+            gr.Markdown(
+                "Evaluations covered **neutral speech, shouting, singing, and "
+                "drunken voice** styles across diverse German-language samples. "
+                "Participants heard a reference voice and compared outputs from "
+                "two anonymous models in a blind A/B test."
+            )
+        # ━━━ Tab 5: About ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+        # Static Markdown; its lines start at column 0 because they are part
+        # of the string literal.
+        with gr.TabItem("ℹ️ About", id="about"):
+            gr.Markdown("""
+### Architecture
+KugelAudio uses a hybrid **AR + Diffusion** pipeline:
+1. **Text Encoder** — Qwen2-based language model encodes input text
+2. **TTS Backbone** — Upper transformer layers generate speech representations
+3. **Diffusion Head** — Predicts speech latents via denoising diffusion
+4. **Acoustic Decoder** — Converts latents to waveforms
+### Training
+| Detail | Value |
+|--------|-------|
+| Base model | Microsoft VibeVoice |
+| Training data | ~200,000 hours (YODAS2) |
+| Hardware | 8× NVIDIA H100 |
+| Duration | 5 days |
+| Parameters | 7B |
+### Responsible Use
+KugelAudio is intended for accessibility, content creation, voice assistants,
+language learning, and creative projects **with consent**. All generated audio
+is watermarked with Facebook AudioSeal. Creating deepfakes, impersonation
+without consent, fraud, or any illegal use is prohibited.
+### License
+Released under the **MIT License**.
+### Citation
+```bibtex
+@software{kugelaudio2026,
+  title   = {KugelAudio: Open-Source TTS for European Languages with Voice Cloning},
+  author  = {Kratzenstein, Kajo and Menke, Carlos},
+  year    = {2026},
+  url     = {https://github.com/kugelaudio/kugelaudio}
+}
+```
+""")
+    # ── Footer ──
+    gr.HTML(FOOTER_HTML)
+# ─── Launch ──────────────────────────────────────────────────────────────────
+if __name__ == "__main__":
+    demo.queue()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_api=True,
+    )