kugelaudio

Runtime error

File size: 24,520 Bytes

"""
KugelAudio Gradio Demo
Open-source text-to-speech for European languages with voice cloning capabilities.
"""

import logging
import tempfile
import time

import gradio as gr
import torch
import torchaudio

import spaces

from kugelaudio_open import (
    KugelAudioForConditionalGenerationInference,
    KugelAudioProcessor,
)
from kugelaudio_open.watermark import AudioWatermark


logger = logging.getLogger(__name__)

# ─── Device & Model Setup ───────────────────────────────────────────────────

# Pick the accelerator once and derive everything else from it, so the model,
# its dtype and the input tensors moved by `_to_device` always agree.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 when running on CUDA, float32 otherwise.
DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32
MODEL_ID = "kugelaudio/kugelaudio-0-open"
# Used to convert generated sample counts into seconds and as the target rate
# when resampling uploads for watermark detection.
OUTPUT_SAMPLE_RATE = 24000

logger.info("Loading KugelAudio model '%s' on %s (%s)…", MODEL_ID, DEVICE, DTYPE)

# Load with the selected DEVICE/DTYPE instead of a hard-coded "cuda"/bfloat16:
# the hard-coded values crashed on CPU-only hosts and could disagree with the
# device the inputs are moved to.
# NOTE(review): on Hugging Face ZeroGPU the `spaces` package is expected to
# report CUDA as available at import time — confirm on the target Space.
model = KugelAudioForConditionalGenerationInference.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
).to(DEVICE)

processor = KugelAudioProcessor.from_pretrained(MODEL_ID)
watermarker = AudioWatermark()

logger.info("Model loaded successfully.")

# ─── Language Configuration ──────────────────────────────────────────────────

# Maps the label shown in the language dropdowns to a short language code.
# Unknown labels fall back to "en" at the lookup sites.
# NOTE(review): this table has 23 entries, while the UI text advertises
# "24 European languages" — confirm which language is missing.
LANGUAGES = {
    "English 🇺🇸": "en", "German 🇩🇪": "de", "French 🇫🇷": "fr",
    "Spanish 🇪🇸": "es", "Italian 🇮🇹": "it", "Portuguese 🇧🇷🇵🇹": "pt",
    "Dutch 🇳🇱": "nl", "Polish 🇵🇱": "pl", "Russian 🇷🇺": "ru",
    "Ukrainian 🇺🇦": "uk", "Czech 🇨🇿": "cs", "Romanian 🇷🇴": "ro",
    "Hungarian 🇭🇺": "hu", "Swedish 🇸🇪": "sv", "Danish 🇩🇰": "da",
    "Finnish 🇫🇮": "fi", "Norwegian 🇳🇴": "no", "Greek 🇬🇷": "el",
    "Bulgarian 🇧🇬": "bg", "Slovak 🇸🇰": "sk", "Croatian 🇭🇷": "hr",
    "Serbian 🇷🇸": "sr", "Turkish 🇹🇷": "tr",
}

# Sample sentences keyed by language code, used by `fill_example_text`.
# Only 9 of the codes above have an entry; every other language falls back to
# the English sentence.
EXAMPLE_TEXTS = {
    "en": "Welcome to KugelAudio, the open-source text-to-speech system for European languages. Our model supports voice cloning and emotional speech synthesis.",
    "de": "Willkommen bei KugelAudio, dem Open-Source Text-to-Speech System für europäische Sprachen. Unser Modell unterstützt Voice Cloning und emotionale Sprachsynthese.",
    "fr": "Bienvenue sur KugelAudio, le système de synthèse vocale open-source pour les langues européennes. Notre modèle prend en charge le clonage vocal et la synthèse vocale émotionnelle.",
    "es": "Bienvenido a KugelAudio, el sistema de texto a voz de código abierto para idiomas europeos. Nuestro modelo soporta clonación de voz y síntesis de habla emocional.",
    "it": "Benvenuto in KugelAudio, il sistema di sintesi vocale open-source per le lingue europee. Il nostro modello supporta la clonazione vocale e la sintesi vocale emotiva.",
    "pt": "Bem-vindo ao KugelAudio, o sistema de texto para fala de código aberto para idiomas europeus. Nosso modelo suporta clonagem de voz e síntese de fala emocional.",
    "nl": "Welkom bij KugelAudio, het open-source tekst-naar-spraak systeem voor Europese talen. Ons model ondersteunt stemklonering en emotionele spraaksynthese.",
    "pl": "Witamy w KugelAudio, systemie syntezy mowy o otwartym kodzie źródłowym dla języków europejskich. Nasz model obsługuje klonowanie głosu i emocjonalną syntezę mowy.",
    "ru": "Добро пожаловать в KugelAudio — систему синтеза речи с открытым исходным кодом для европейских языков. Наша модель поддерживает клонирование голоса и эмоциональный синтез речи.",
}

# ─── Inference Helpers ───────────────────────────────────────────────────────


def _to_device(inputs: dict) -> dict:
    """Move tensor values to the model device."""
    return {
        k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
        for k, v in inputs.items()
    }


def _save_to_tempfile(audio_tensor: torch.Tensor) -> str:
    """Write an audio tensor to a temporary WAV file and return its path.

    Args:
        audio_tensor: Audio data handed unchanged to ``processor.save_audio``.

    Returns:
        Path of the newly created ``.wav`` file. The file is created with
        ``delete=False``, so it is not removed by this function.
    """
    # Only the unique path is needed. Close the handle right away so no file
    # descriptor is leaked per request and the writer below is not competing
    # with an open handle on the same file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        path = tmp.name
    processor.save_audio(audio_tensor, path)
    return path

@spaces.GPU
def generate_speech(
    text: str,
    language: str,
    cfg_scale: float,
    max_tokens: int,
) -> tuple[str, str]:
    """Generate speech from text and return (audio_path, info_markdown).

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        language: Dropdown label from ``LANGUAGES``. It is only echoed in the
            info string; it is not passed to the processor or the model here.
        cfg_scale: Guidance scale forwarded to ``model.generate``.
        max_tokens: Upper bound forwarded as ``max_new_tokens`` (cast to int).

    Returns:
        Path of the generated WAV file and a Markdown summary with duration,
        inference time and real-time factor.

    Raises:
        gr.Error: If ``text`` is empty, whitespace-only or ``None``.
    """
    # `not text` also covers None, which would otherwise fail on `.strip()`.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    lang_code = LANGUAGES.get(language, "en")

    inputs = processor(text=text, return_tensors="pt")
    inputs = _to_device(inputs)

    t0 = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=cfg_scale,
            max_new_tokens=int(max_tokens),
        )
    elapsed = time.perf_counter() - t0

    speech = outputs.speech_outputs[0]
    audio_path = _save_to_tempfile(speech)
    audio_duration = speech.shape[-1] / OUTPUT_SAMPLE_RATE

    # Real-time factor is undefined for empty output; avoid ZeroDivisionError.
    rtf = f"{elapsed / audio_duration:.2f}x" if audio_duration > 0 else "n/a"

    info = (
        f"🔊 **Generation Complete**\n\n"
        f"Language: {language} (`{lang_code}`) · "
        f"CFG: {cfg_scale} · "
        f"Duration: {audio_duration:.1f}s · "
        f"Inference: {elapsed:.2f}s · "
        f"RTF: {rtf}"
    )
    return audio_path, info

@spaces.GPU
def clone_voice(
    text: str,
    reference_audio: str | None,
    language: str,
    cfg_scale: float,
    max_tokens: int,
) -> tuple[str, str]:
    """Clone a voice from reference audio and synthesize new text.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        reference_audio: File path of the reference recording, passed to the
            processor as ``voice_prompt``.
        language: Dropdown label from ``LANGUAGES``. It is only echoed in the
            info string; it is not passed to the processor or the model here.
        cfg_scale: Guidance scale forwarded to ``model.generate``.
        max_tokens: Upper bound forwarded as ``max_new_tokens`` (cast to int).

    Returns:
        Path of the generated WAV file and a Markdown summary with duration,
        inference time and real-time factor.

    Raises:
        gr.Error: If ``text`` is empty/whitespace/``None`` or no reference
            audio was supplied.
    """
    # `not text` also covers None, which would otherwise fail on `.strip()`.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    # Covers both None and an empty path string.
    if not reference_audio:
        raise gr.Error("Please upload a reference audio file for voice cloning.")

    lang_code = LANGUAGES.get(language, "en")

    inputs = processor(
        text=text,
        voice_prompt=reference_audio,
        return_tensors="pt",
    )
    inputs = _to_device(inputs)

    t0 = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=cfg_scale,
            max_new_tokens=int(max_tokens),
        )
    elapsed = time.perf_counter() - t0

    speech = outputs.speech_outputs[0]
    audio_path = _save_to_tempfile(speech)
    audio_duration = speech.shape[-1] / OUTPUT_SAMPLE_RATE

    # Real-time factor is undefined for empty output; avoid ZeroDivisionError.
    rtf = f"{elapsed / audio_duration:.2f}x" if audio_duration > 0 else "n/a"

    info = (
        f"🎭 **Voice Cloning Complete**\n\n"
        f"Language: {language} (`{lang_code}`) · "
        f"CFG: {cfg_scale} · "
        f"Duration: {audio_duration:.1f}s · "
        f"Inference: {elapsed:.2f}s · "
        f"RTF: {rtf}"
    )
    return audio_path, info


def verify_watermark(audio_file: str | None) -> str:
    """Run watermark detection on an uploaded file and report the outcome.

    The file is loaded with torchaudio, resampled to ``OUTPUT_SAMPLE_RATE``
    when its native rate differs, and handed to ``watermarker.detect``. The
    result is formatted as a Markdown string.

    Raises:
        gr.Error: If no file was provided.
    """
    if audio_file is None:
        raise gr.Error("Please upload an audio file to verify.")

    audio, source_rate = torchaudio.load(audio_file)

    # Bring the audio to the rate the detector is called with.
    if source_rate != OUTPUT_SAMPLE_RATE:
        audio = torchaudio.functional.resample(
            audio, source_rate, OUTPUT_SAMPLE_RATE
        )

    detection = watermarker.detect(audio, sample_rate=OUTPUT_SAMPLE_RATE)

    status = (
        "✅ **Watermark Detected**"
        if detection.detected
        else "❌ **No Watermark Detected**"
    )

    report_parts = [
        f"🔍 **Watermark Verification**\n\n",
        f"{status}\n\n",
        f"Confidence: **{detection.confidence:.1%}**\n\n",
        f"Technology: Facebook AudioSeal · Resolution: 1/16k second",
    ]
    return "".join(report_parts)


def fill_example_text(language: str) -> str:
    """Return the example sentence for the selected dropdown label.

    Labels missing from ``LANGUAGES`` are treated as English, and codes with
    no entry in ``EXAMPLE_TEXTS`` fall back to the English sentence.
    """
    code = LANGUAGES.get(language, "en")
    english_fallback = EXAMPLE_TEXTS["en"]
    try:
        return EXAMPLE_TEXTS[code]
    except KeyError:
        return english_fallback


# ─── Custom CSS ──────────────────────────────────────────────────────────────

# Custom stylesheet handed to `gr.Blocks(css=...)`.
# It declares the `--ka-*` colour variables on `:root` and styles the classes
# used by the HTML snippets in this file (`hero-header`, `badges`/`badge`,
# `benchmark-table`, `section-label`, `footer`) as well as the tab buttons.
# NOTE(review): `.hero-header::before` only sets position and size, with no
# background, so it appears to draw nothing — confirm whether a decorative
# background was intended.
CSS = """
/* ── Base theme ── */
:root {
    --ka-primary: #1a1a2e;
    --ka-accent: #e94560;
    --ka-accent-hover: #ff6b81;
    --ka-surface: #16213e;
    --ka-surface-light: #1c2a4a;
    --ka-text: #eaeaea;
    --ka-text-muted: #8892a4;
    --ka-border: #2a3a5c;
    --ka-gold: #f5c518;
    --ka-green: #2ecc71;
}

/* ── Global ── */
.gradio-container {
    max-width: 960px !important;
    margin: 0 auto !important;
    font-family: 'IBM Plex Sans', 'Segoe UI', system-ui, sans-serif !important;
}

/* ── Hero header ── */
.hero-header {
    text-align: center;
    padding: .5rem 1.5rem 1.5rem;
    margin-bottom: 0.5rem;
    border-radius: 16px;
    position: relative;
    overflow: hidden;
}
.hero-header::before {
    content: '';
    position: absolute;
    top: -50%;
    left: -50%;
    width: 200%;
    height: 200%;
    pointer-events: none;
}
.hero-header h1 {
    font-size: 2.4rem !important;
    font-weight: 700 !important;
    margin: 0 0 0.3rem !important;
    color: var(--body-text-color);
    letter-spacing: -0.02em;
}
.hero-header .hero-accent {
    color: var(--ka-accent);
}
.hero-header p {
    color: var(--ka-text-muted);
    font-size: 1.05rem;
    margin: 0;
    line-height: 1.5;
}

/* ── Badges row ── */
.badges {
    display: flex;
    justify-content: center;
    gap: 0.5rem;
    margin-top: 1rem;
    flex-wrap: wrap;
}
.badge {
    display: inline-flex;
    align-items: center;
    gap: 0.35rem;
    padding: 0.3rem 0.75rem;
    border-radius: 999px;
    font-size: 0.78rem;
    font-weight: 600;
    letter-spacing: 0.01em;
    background: var(--block-background-fill, var(--ka-surface-light));
    color: var(--body-text-color, var(--ka-text));
    border: 1px solid var(--border-color-primary, var(--ka-border));
}
.badge.gold { border-color: var(--ka-gold); color: var(--ka-gold); }
.badge.green { border-color: var(--ka-green); color: var(--ka-green); }
.badge.accent { border-color: var(--ka-accent); color: var(--ka-accent); }

/* ── Benchmark table ── */
.benchmark-table {
    width: 100%;
    border-collapse: separate;
    border-spacing: 0;
    margin: 0.75rem 0;
    font-size: 0.88rem;
    border-radius: 10px;
    overflow: hidden;
    border: 1px solid var(--ka-border);
}
.benchmark-table th {
    background: var(--ka-surface);
    color: var(--ka-text-muted);
    font-weight: 600;
    text-transform: uppercase;
    font-size: 0.72rem;
    letter-spacing: 0.06em;
    padding: 0.65rem 0.8rem;
    text-align: left;
}
.benchmark-table td {
    padding: 0.55rem 0.8rem;
    border-top: 1px solid var(--ka-border);
    color: var(--ka-text);
}
.benchmark-table tr.highlight td {
    background: rgba(233, 69, 96, 0.08);
    font-weight: 600;
}
.benchmark-table tr:not(.highlight) td {
    background: transparent;
}

/* ── Section divider ── */
.section-label {
    font-size: 0.7rem;
    text-transform: uppercase;
    letter-spacing: 0.1em;
    color: var(--ka-text-muted);
    margin: 1rem 0 0.3rem;
    padding-left: 2px;
    font-weight: 600;
}

/* ── Tab styling ── */
.tab-nav button {
    font-weight: 600 !important;
    letter-spacing: 0.01em !important;
}
.tab-nav button.selected {
    border-color: var(--ka-accent) !important;
    color: var(--ka-accent) !important;
}

/* ── Footer ── */
.footer {
    text-align: center;
    padding: 1.2rem;
    margin-top: 1rem;
    font-size: 0.8rem;
    color: var(--ka-text-muted);
    border-top: 1px solid var(--ka-border);
    line-height: 1.6;
}
.footer a {
    color: var(--ka-accent);
    text-decoration: none;
}
.footer a:hover {
    text-decoration: underline;
}
"""

# ─── Header HTML ─────────────────────────────────────────────────────────────

# Hero banner rendered at the top of the page via `gr.HTML`: title, one-line
# description and a row of badges styled by the `hero-header` / `badge` rules
# in `CSS`.
# NOTE(review): the tagline reads awkwardly ("in 200K hours of 24 european
# languages") and the "24 Languages" badge does not match the 23 entries in
# `LANGUAGES` — confirm the intended wording and count.
HEADER_HTML = """
<div class="hero-header">
    <h1>🎙️ <span class="hero-accent">Kugel</span>Audio</h1>
    <p>Kugel-Audio is a fine-tune of Vibe-Voice 7B in 200K hours of 24 european languages on the YODAS2 dataset</p>
    <div class="badges">
        <span class="badge gold">🏆 #1 German TTS</span>
        <span class="badge green">24 Languages</span>
        <span class="badge accent">Voice Cloning</span>
        <span class="badge">MIT License</span>
        <span class="badge">7B Parameters</span>
    </div>
</div>
"""

# Static ranking table shown on the "Benchmarks" tab via `gr.HTML`. The
# figures are hard-coded here; nothing in this file computes them. The first
# row carries the `highlight` class so `CSS` can tint it.
BENCHMARK_HTML = """
<table class="benchmark-table">
    <thead>
        <tr>
            <th>Rank</th>
            <th>Model</th>
            <th>Score</th>
            <th>Win Rate</th>
        </tr>
    </thead>
    <tbody>
        <tr class="highlight">
            <td>🥇</td>
            <td>KugelAudio</td>
            <td>26</td>
            <td>78.0%</td>
        </tr>
        <tr>
            <td>🥈</td>
            <td>ElevenLabs Multi v2</td>
            <td>25</td>
            <td>62.2%</td>
        </tr>
        <tr>
            <td>🥉</td>
            <td>ElevenLabs v3</td>
            <td>21</td>
            <td>65.3%</td>
        </tr>
        <tr>
            <td>4</td>
            <td>Cartesia</td>
            <td>21</td>
            <td>59.1%</td>
        </tr>
        <tr>
            <td>5</td>
            <td>VibeVoice</td>
            <td>10</td>
            <td>28.8%</td>
        </tr>
        <tr>
            <td>6</td>
            <td>CosyVoice v3</td>
            <td>9</td>
            <td>14.2%</td>
        </tr>
    </tbody>
</table>
<p style="text-align:center;color:var(--ka-text-muted);font-size:0.76rem;margin-top:0.3rem;">
    Based on 339 human A/B evaluations · OpenSkill Bayesian ranking
</p>
"""

# Page footer rendered via `gr.HTML` below the tabs: credits, project links
# and the funding acknowledgement, styled by the `.footer` rules in `CSS`.
FOOTER_HTML = """
<div class="footer">
    <strong>KugelAudio</strong> · Created by Kajo Kratzenstein & Carlos Menke<br>
    <a href="https://github.com/Kugelaudio/kugelaudio-open">GitHub</a> ·
    <a href="https://huggingface.co/kugelaudio/kugelaudio-0-open">HuggingFace</a> ·
    <a href="https://kugelaudio.com">API</a> ·
    <a href="https://docs.kugelaudio.com/sdks/python">Docs</a><br>
    Funded by the German Federal Ministry of Research, Technology and Space (BMFTR)
    via the AI Service Center Berlin-Brandenburg at HPI
</div>
"""

# ─── Build Gradio Interface ─────────────────────────────────────────────────

# Declarative page layout. Components are created in display order inside the
# nested context managers, and event handlers are wired right after the
# components of each tab. The resulting app object is bound to `demo`.
with gr.Blocks(
    css=CSS,
    title="KugelAudio – European TTS",
    theme=gr.themes.Base(
        primary_hue=gr.themes.colors.red,
        secondary_hue=gr.themes.colors.slate,
        neutral_hue=gr.themes.colors.slate,
        font=gr.themes.GoogleFont("IBM Plex Sans"),
        font_mono=gr.themes.GoogleFont("IBM Plex Mono"),
    ),
) as demo:

    # ── Header ──
    gr.HTML(HEADER_HTML)

    # ── Main Tabs ──
    with gr.Tabs():

        # ━━━ Tab 1: Text-to-Speech ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        with gr.TabItem("🔊 Text-to-Speech", id="tts"):
            with gr.Row():
                # Left column: inputs and settings.
                with gr.Column(scale=3):
                    # NOTE(review): the info text says 24 languages, but
                    # LANGUAGES lists 23 — confirm.
                    tts_language = gr.Dropdown(
                        choices=list(LANGUAGES.keys()),
                        value="English 🇺🇸",
                        label="Language",
                        info="24 European languages supported",
                    )
                    tts_text = gr.Textbox(
                        label="Text to Synthesize",
                        placeholder="Enter text here or click 'Fill Example' below…",
                        lines=5,
                        max_lines=12,
                    )
                    with gr.Row():
                        tts_example_btn = gr.Button(
                            "📝 Fill Example", size="sm", variant="secondary"
                        )
                        tts_clear_btn = gr.ClearButton(
                            [tts_text], value="🗑️ Clear", size="sm"
                        )
                    with gr.Accordion("⚙️ Advanced Settings", open=False):
                        tts_cfg = gr.Slider(
                            minimum=1.0,
                            maximum=10.0,
                            value=3.0,
                            step=0.5,
                            label="CFG Scale",
                            info="Guidance scale — higher values follow the text more closely",
                        )
                        tts_max_tokens = gr.Slider(
                            minimum=512,
                            maximum=8192,
                            value=4096,
                            step=512,
                            label="Max Tokens",
                            info="Maximum generation length in tokens",
                        )
                    tts_generate_btn = gr.Button(
                        "🎙️ Generate Speech", variant="primary", size="lg"
                    )

                # Right column: generated audio and run summary.
                with gr.Column(scale=2):
                    tts_audio_out = gr.Audio(
                        label="Generated Audio",
                        type="filepath",
                        interactive=False,
                    )
                    tts_info = gr.Markdown("*Press 'Generate Speech' to synthesize audio.*")

            # Events
            tts_example_btn.click(
                fn=fill_example_text, inputs=[tts_language], outputs=[tts_text]
            )
            # NOTE(review): changing the language also replaces whatever is in
            # the text box with the example sentence — confirm this overwrite
            # of user-entered text is intended.
            tts_language.change(
                fn=fill_example_text, inputs=[tts_language], outputs=[tts_text]
            )
            tts_generate_btn.click(
                fn=generate_speech,
                inputs=[tts_text, tts_language, tts_cfg, tts_max_tokens],
                outputs=[tts_audio_out, tts_info],
            )

        # ━━━ Tab 2: Voice Cloning ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        with gr.TabItem("🎭 Voice Cloning", id="clone"):
            with gr.Row():
                # Left column: text, reference recording and settings.
                with gr.Column(scale=3):
                    clone_language = gr.Dropdown(
                        choices=list(LANGUAGES.keys()),
                        value="English 🇺🇸",
                        label="Language",
                    )
                    clone_text = gr.Textbox(
                        label="Text to Synthesize",
                        placeholder="Enter the text you want spoken in the cloned voice…",
                        lines=4,
                        max_lines=10,
                    )
                    # type="filepath": the handler receives a path string.
                    clone_ref = gr.Audio(
                        label="Reference Voice",
                        type="filepath",
                        sources=["upload", "microphone"],
                    )
                    gr.Markdown(
                        "<p class='section-label'>Upload or record a few seconds of the "
                        "target voice. The model will replicate its characteristics.</p>"
                    )
                    with gr.Accordion("⚙️ Advanced Settings", open=False):
                        clone_cfg = gr.Slider(
                            minimum=1.0,
                            maximum=10.0,
                            value=3.0,
                            step=0.5,
                            label="CFG Scale",
                        )
                        clone_max_tokens = gr.Slider(
                            minimum=512,
                            maximum=8192,
                            value=4096,
                            step=512,
                            label="Max Tokens",
                        )
                    clone_btn = gr.Button(
                        "🎭 Clone & Generate", variant="primary", size="lg"
                    )

                # Right column: cloned audio and run summary.
                with gr.Column(scale=2):
                    clone_audio_out = gr.Audio(
                        label="Cloned Voice Output",
                        type="filepath",
                        interactive=False,
                    )
                    clone_info = gr.Markdown(
                        "*Upload a reference voice and press 'Clone & Generate'.*"
                    )

            # Events
            # Input order must match the parameter order of `clone_voice`.
            clone_btn.click(
                fn=clone_voice,
                inputs=[
                    clone_text,
                    clone_ref,
                    clone_language,
                    clone_cfg,
                    clone_max_tokens,
                ],
                outputs=[clone_audio_out, clone_info],
            )

        # ━━━ Tab 3: Watermark Verification ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        with gr.TabItem("🔒 Watermark Verify", id="watermark"):
            with gr.Row():
                with gr.Column(scale=3):
                    wm_audio = gr.Audio(
                        label="Audio to Verify",
                        type="filepath",
                        sources=["upload"],
                    )
                    gr.Markdown(
                        "<p class='section-label'>All KugelAudio outputs are watermarked "
                        "with Facebook AudioSeal. Upload any audio file to check.</p>"
                    )
                    wm_btn = gr.Button(
                        "🔍 Verify Watermark", variant="primary", size="lg"
                    )
                with gr.Column(scale=2):
                    wm_result = gr.Markdown("*Upload audio and press 'Verify Watermark'.*")

            wm_btn.click(
                fn=verify_watermark,
                inputs=[wm_audio],
                outputs=[wm_result],
            )

        # ━━━ Tab 4: Benchmarks ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        # Static content only; no event handlers.
        with gr.TabItem("🏆 Benchmarks", id="bench"):
            gr.Markdown("### Human Preference Ranking — German TTS")
            gr.HTML(BENCHMARK_HTML)
            gr.Markdown(
                "Evaluations covered **neutral speech, shouting, singing, and "
                "drunken voice** styles across diverse German-language samples. "
                "Participants heard a reference voice and compared outputs from "
                "two anonymous models in a blind A/B test."
            )

        # ━━━ Tab 5: About ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        # Static Markdown. The text starts at column 0 because it lives inside
        # a triple-quoted string.
        # NOTE(review): the citation URL (github.com/kugelaudio/kugelaudio)
        # differs from the GitHub link in FOOTER_HTML
        # (github.com/Kugelaudio/kugelaudio-open) — confirm which is correct.
        with gr.TabItem("ℹ️ About", id="about"):
            gr.Markdown("""
### Architecture

KugelAudio uses a hybrid **AR + Diffusion** pipeline:

1. **Text Encoder** — Qwen2-based language model encodes input text  
2. **TTS Backbone** — Upper transformer layers generate speech representations  
3. **Diffusion Head** — Predicts speech latents via denoising diffusion  
4. **Acoustic Decoder** — Converts latents to waveforms  

### Training

| Detail | Value |
|--------|-------|
| Base model | Microsoft VibeVoice |
| Training data | ~200,000 hours (YODAS2) |
| Hardware | 8× NVIDIA H100 |
| Duration | 5 days |
| Parameters | 7B |

### Responsible Use

KugelAudio is intended for accessibility, content creation, voice assistants,
language learning, and creative projects **with consent**. All generated audio
is watermarked with Facebook AudioSeal. Creating deepfakes, impersonation
without consent, fraud, or any illegal use is prohibited.

### License

Released under the **MIT License**.

### Citation

```bibtex
@software{kugelaudio2026,
  title   = {KugelAudio: Open-Source TTS for European Languages with Voice Cloning},
  author  = {Kratzenstein, Kajo and Menke, Carlos},
  year    = {2026},
  url     = {https://github.com/kugelaudio/kugelaudio}
}
```
""")

    # ── Footer ──
    gr.HTML(FOOTER_HTML)


# ─── Launch ──────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Turn on request queuing, then start the server with an explicit
    # host/port binding.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo.queue()
    demo.launch(**launch_options)