# kugelaudio / app.py — Hugging Face Space demo entry point
# (Space file-viewer metadata removed: "multimodalart HF Staff · Update app.py · 6f41d1c verified")
"""
KugelAudio Gradio Demo
Open-source text-to-speech for European languages with voice cloning capabilities.
"""
import logging
import tempfile
import time
import gradio as gr
import torch
import torchaudio
import spaces
from kugelaudio_open import (
KugelAudioForConditionalGenerationInference,
KugelAudioProcessor,
)
from kugelaudio_open.watermark import AudioWatermark
logger = logging.getLogger(__name__)
# ─── Device & Model Setup ───────────────────────────────────────────────────
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32
MODEL_ID = "kugelaudio/kugelaudio-0-open"
# Sample rate (Hz) of the acoustic decoder's output waveforms.
OUTPUT_SAMPLE_RATE = 24000
logger.info("Loading KugelAudio model '%s' on %s (%s)…", MODEL_ID, DEVICE, DTYPE)
# Use the DEVICE/DTYPE computed above instead of hard-coding bfloat16/"cuda";
# the original hard-coded values crashed on CPU-only hosts despite the
# fallback constants existing.
model = KugelAudioForConditionalGenerationInference.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
).to(DEVICE)
processor = KugelAudioProcessor.from_pretrained(MODEL_ID)
watermarker = AudioWatermark()
logger.info("Model loaded successfully.")
# ─── Language Configuration ──────────────────────────────────────────────────
# Maps the dropdown display label (language name + flag emoji) to its
# ISO 639-1 code; the code is what EXAMPLE_TEXTS is keyed by.
LANGUAGES = {
    "English 🇺🇸": "en", "German 🇩🇪": "de", "French 🇫🇷": "fr",
    "Spanish 🇪🇸": "es", "Italian 🇮🇹": "it", "Portuguese 🇧🇷🇵🇹": "pt",
    "Dutch 🇳🇱": "nl", "Polish 🇵🇱": "pl", "Russian 🇷🇺": "ru",
    "Ukrainian 🇺🇦": "uk", "Czech 🇨🇿": "cs", "Romanian 🇷🇴": "ro",
    "Hungarian 🇭🇺": "hu", "Swedish 🇸🇪": "sv", "Danish 🇩🇰": "da",
    "Finnish 🇫🇮": "fi", "Norwegian 🇳🇴": "no", "Greek 🇬🇷": "el",
    "Bulgarian 🇧🇬": "bg", "Slovak 🇸🇰": "sk", "Croatian 🇭🇷": "hr",
    "Serbian 🇷🇸": "sr", "Turkish 🇹🇷": "tr",
}
# Example sentences keyed by ISO 639-1 code. Only a subset of the 24
# languages has an entry; the rest fall back to the English sample
# (see fill_example_text).
EXAMPLE_TEXTS = {
    "en": "Welcome to KugelAudio, the open-source text-to-speech system for European languages. Our model supports voice cloning and emotional speech synthesis.",
    "de": "Willkommen bei KugelAudio, dem Open-Source Text-to-Speech System für europäische Sprachen. Unser Modell unterstützt Voice Cloning und emotionale Sprachsynthese.",
    "fr": "Bienvenue sur KugelAudio, le système de synthèse vocale open-source pour les langues européennes. Notre modèle prend en charge le clonage vocal et la synthèse vocale émotionnelle.",
    "es": "Bienvenido a KugelAudio, el sistema de texto a voz de código abierto para idiomas europeos. Nuestro modelo soporta clonación de voz y síntesis de habla emocional.",
    "it": "Benvenuto in KugelAudio, il sistema di sintesi vocale open-source per le lingue europee. Il nostro modello supporta la clonazione vocale e la sintesi vocale emotiva.",
    "pt": "Bem-vindo ao KugelAudio, o sistema de texto para fala de código aberto para idiomas europeus. Nosso modelo suporta clonagem de voz e síntese de fala emocional.",
    "nl": "Welkom bij KugelAudio, het open-source tekst-naar-spraak systeem voor Europese talen. Ons model ondersteunt stemklonering en emotionele spraaksynthese.",
    "pl": "Witamy w KugelAudio, systemie syntezy mowy o otwartym kodzie źródłowym dla języków europejskich. Nasz model obsługuje klonowanie głosu i emocjonalną syntezę mowy.",
    "ru": "Добро пожаловать в KugelAudio — систему синтеза речи с открытым исходным кодом для европейских языков. Наша модель поддерживает клонирование голоса и эмоциональный синтез речи.",
}
# ─── Inference Helpers ───────────────────────────────────────────────────────
def _to_device(inputs: dict) -> dict:
"""Move tensor values to the model device."""
return {
k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
for k, v in inputs.items()
}
def _save_to_tempfile(audio_tensor: torch.Tensor) -> str:
    """Write an audio tensor to a temporary WAV file and return its path.

    The file is created with delete=False so Gradio can serve it; the
    caller (or OS temp cleanup) owns its lifetime.
    """
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    # Close our handle before the processor writes to the path: the original
    # never closed it, leaking a file descriptor per call and preventing
    # re-opening the file on Windows.
    tmp.close()
    processor.save_audio(audio_tensor, tmp.name)
    return tmp.name
@spaces.GPU
def generate_speech(
    text: str,
    language: str,
    cfg_scale: float,
    max_tokens: int,
) -> tuple[str, str]:
    """Generate speech from text and return (audio_path, info_markdown).

    Args:
        text: Text to synthesize; must be non-empty after stripping.
        language: Display label from the LANGUAGES dropdown (falls back to "en").
        cfg_scale: Classifier-free-guidance scale forwarded to the model.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        Tuple of (path to generated WAV file, Markdown summary string).

    Raises:
        gr.Error: If *text* is empty or whitespace-only.
    """
    if not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    lang_code = LANGUAGES.get(language, "en")
    inputs = processor(text=text, return_tensors="pt")
    inputs = _to_device(inputs)
    t0 = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=cfg_scale,
            max_new_tokens=int(max_tokens),
        )
    elapsed = time.perf_counter() - t0
    audio_path = _save_to_tempfile(outputs.speech_outputs[0])
    audio_duration = outputs.speech_outputs[0].shape[-1] / OUTPUT_SAMPLE_RATE
    # Guard the real-time-factor computation: the original divided by
    # audio_duration unconditionally, which raises ZeroDivisionError if the
    # model produces zero samples.
    rtf = elapsed / audio_duration if audio_duration > 0 else float("inf")
    info = (
        f"🔊 **Generation Complete**\n\n"
        f"Language: {language} (`{lang_code}`) · "
        f"CFG: {cfg_scale} · "
        f"Duration: {audio_duration:.1f}s · "
        f"Inference: {elapsed:.2f}s · "
        f"RTF: {rtf:.2f}x"
    )
    return audio_path, info
@spaces.GPU
def clone_voice(
    text: str,
    reference_audio: str | None,
    language: str,
    cfg_scale: float,
    max_tokens: int,
) -> tuple[str, str]:
    """Clone a voice from reference audio and synthesize new text.

    Args:
        text: Text to speak in the cloned voice; must be non-empty.
        reference_audio: Filepath of the uploaded/recorded reference clip.
        language: Display label from the LANGUAGES dropdown (falls back to "en").
        cfg_scale: Classifier-free-guidance scale forwarded to the model.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        Tuple of (path to generated WAV file, Markdown summary string).

    Raises:
        gr.Error: If *text* is empty or no reference audio was provided.
    """
    if not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if reference_audio is None:
        raise gr.Error("Please upload a reference audio file for voice cloning.")
    lang_code = LANGUAGES.get(language, "en")
    inputs = processor(
        text=text,
        voice_prompt=reference_audio,
        return_tensors="pt",
    )
    inputs = _to_device(inputs)
    t0 = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=cfg_scale,
            max_new_tokens=int(max_tokens),
        )
    elapsed = time.perf_counter() - t0
    audio_path = _save_to_tempfile(outputs.speech_outputs[0])
    audio_duration = outputs.speech_outputs[0].shape[-1] / OUTPUT_SAMPLE_RATE
    # Guard the real-time-factor computation: the original divided by
    # audio_duration unconditionally, which raises ZeroDivisionError if the
    # model produces zero samples.
    rtf = elapsed / audio_duration if audio_duration > 0 else float("inf")
    info = (
        f"🎭 **Voice Cloning Complete**\n\n"
        f"Language: {language} (`{lang_code}`) · "
        f"CFG: {cfg_scale} · "
        f"Duration: {audio_duration:.1f}s · "
        f"Inference: {elapsed:.2f}s · "
        f"RTF: {rtf:.2f}x"
    )
    return audio_path, info
def verify_watermark(audio_file: str | None) -> str:
    """Detect the AudioSeal watermark in an uploaded audio file.

    Raises gr.Error when no file was provided; otherwise returns a
    Markdown report with detection status and confidence.
    """
    if audio_file is None:
        raise gr.Error("Please upload an audio file to verify.")
    waveform, sr = torchaudio.load(audio_file)
    if sr != OUTPUT_SAMPLE_RATE:
        # The detector operates at the model's native 24 kHz rate.
        waveform = torchaudio.functional.resample(waveform, sr, OUTPUT_SAMPLE_RATE)
    result = watermarker.detect(waveform, sample_rate=OUTPUT_SAMPLE_RATE)
    status = (
        "✅ **Watermark Detected**"
        if result.detected
        else "❌ **No Watermark Detected**"
    )
    return (
        f"🔍 **Watermark Verification**\n\n"
        f"{status}\n\n"
        f"Confidence: **{result.confidence:.1%}**\n\n"
        f"Technology: Facebook AudioSeal · Resolution: 1/16k second"
    )
def fill_example_text(language: str) -> str:
    """Return an example sentence for the selected language.

    Languages without a sample in EXAMPLE_TEXTS fall back to English.
    """
    code = LANGUAGES.get(language, "en")
    sample = EXAMPLE_TEXTS.get(code)
    return sample if sample is not None else EXAMPLE_TEXTS["en"]
# ─── Custom CSS ──────────────────────────────────────────────────────────────
CSS = """
/* ── Base theme ── */
:root {
--ka-primary: #1a1a2e;
--ka-accent: #e94560;
--ka-accent-hover: #ff6b81;
--ka-surface: #16213e;
--ka-surface-light: #1c2a4a;
--ka-text: #eaeaea;
--ka-text-muted: #8892a4;
--ka-border: #2a3a5c;
--ka-gold: #f5c518;
--ka-green: #2ecc71;
}
/* ── Global ── */
.gradio-container {
max-width: 960px !important;
margin: 0 auto !important;
font-family: 'IBM Plex Sans', 'Segoe UI', system-ui, sans-serif !important;
}
/* ── Hero header ── */
.hero-header {
text-align: center;
padding: .5rem 1.5rem 1.5rem;
margin-bottom: 0.5rem;
border-radius: 16px;
position: relative;
overflow: hidden;
}
.hero-header::before {
content: '';
position: absolute;
top: -50%;
left: -50%;
width: 200%;
height: 200%;
pointer-events: none;
}
.hero-header h1 {
font-size: 2.4rem !important;
font-weight: 700 !important;
margin: 0 0 0.3rem !important;
color: var(--body-text-color);
letter-spacing: -0.02em;
}
.hero-header .hero-accent {
color: var(--ka-accent);
}
.hero-header p {
color: var(--ka-text-muted);
font-size: 1.05rem;
margin: 0;
line-height: 1.5;
}
/* ── Badges row ── */
.badges {
display: flex;
justify-content: center;
gap: 0.5rem;
margin-top: 1rem;
flex-wrap: wrap;
}
.badge {
display: inline-flex;
align-items: center;
gap: 0.35rem;
padding: 0.3rem 0.75rem;
border-radius: 999px;
font-size: 0.78rem;
font-weight: 600;
letter-spacing: 0.01em;
background: var(--block-background-fill, var(--ka-surface-light));
color: var(--body-text-color, var(--ka-text));
border: 1px solid var(--border-color-primary, var(--ka-border));
}
.badge.gold { border-color: var(--ka-gold); color: var(--ka-gold); }
.badge.green { border-color: var(--ka-green); color: var(--ka-green); }
.badge.accent { border-color: var(--ka-accent); color: var(--ka-accent); }
/* ── Benchmark table ── */
.benchmark-table {
width: 100%;
border-collapse: separate;
border-spacing: 0;
margin: 0.75rem 0;
font-size: 0.88rem;
border-radius: 10px;
overflow: hidden;
border: 1px solid var(--ka-border);
}
.benchmark-table th {
background: var(--ka-surface);
color: var(--ka-text-muted);
font-weight: 600;
text-transform: uppercase;
font-size: 0.72rem;
letter-spacing: 0.06em;
padding: 0.65rem 0.8rem;
text-align: left;
}
.benchmark-table td {
padding: 0.55rem 0.8rem;
border-top: 1px solid var(--ka-border);
color: var(--ka-text);
}
.benchmark-table tr.highlight td {
background: rgba(233, 69, 96, 0.08);
font-weight: 600;
}
.benchmark-table tr:not(.highlight) td {
background: transparent;
}
/* ── Section divider ── */
.section-label {
font-size: 0.7rem;
text-transform: uppercase;
letter-spacing: 0.1em;
color: var(--ka-text-muted);
margin: 1rem 0 0.3rem;
padding-left: 2px;
font-weight: 600;
}
/* ── Tab styling ── */
.tab-nav button {
font-weight: 600 !important;
letter-spacing: 0.01em !important;
}
.tab-nav button.selected {
border-color: var(--ka-accent) !important;
color: var(--ka-accent) !important;
}
/* ── Footer ── */
.footer {
text-align: center;
padding: 1.2rem;
margin-top: 1rem;
font-size: 0.8rem;
color: var(--ka-text-muted);
border-top: 1px solid var(--ka-border);
line-height: 1.6;
}
.footer a {
color: var(--ka-accent);
text-decoration: none;
}
.footer a:hover {
text-decoration: underline;
}
"""
# ─── Header HTML ─────────────────────────────────────────────────────────────
# Static hero banner rendered above the tabs. Fixed user-visible text defects:
# "Kugel-Audio"/"Vibe-Voice" now match the spellings used in the About tab
# ("KugelAudio", "VibeVoice"), "in 200K hours" -> "on 200K hours", and
# "european" is capitalized.
HEADER_HTML = """
<div class="hero-header">
  <h1>🎙️ <span class="hero-accent">Kugel</span>Audio</h1>
  <p>KugelAudio is a fine-tune of VibeVoice 7B, trained on 200K hours of 24 European languages from the YODAS2 dataset</p>
  <div class="badges">
    <span class="badge gold">🏆 #1 German TTS</span>
    <span class="badge green">24 Languages</span>
    <span class="badge accent">Voice Cloning</span>
    <span class="badge">MIT License</span>
    <span class="badge">7B Parameters</span>
  </div>
</div>
"""
# Human-preference leaderboard table shown in the Benchmarks tab.
BENCHMARK_HTML = """
<table class="benchmark-table">
  <thead>
    <tr>
      <th>Rank</th>
      <th>Model</th>
      <th>Score</th>
      <th>Win Rate</th>
    </tr>
  </thead>
  <tbody>
    <tr class="highlight">
      <td>🥇</td>
      <td>KugelAudio</td>
      <td>26</td>
      <td>78.0%</td>
    </tr>
    <tr>
      <td>🥈</td>
      <td>ElevenLabs Multi v2</td>
      <td>25</td>
      <td>62.2%</td>
    </tr>
    <tr>
      <td>🥉</td>
      <td>ElevenLabs v3</td>
      <td>21</td>
      <td>65.3%</td>
    </tr>
    <tr>
      <td>4</td>
      <td>Cartesia</td>
      <td>21</td>
      <td>59.1%</td>
    </tr>
    <tr>
      <td>5</td>
      <td>VibeVoice</td>
      <td>10</td>
      <td>28.8%</td>
    </tr>
    <tr>
      <td>6</td>
      <td>CosyVoice v3</td>
      <td>9</td>
      <td>14.2%</td>
    </tr>
  </tbody>
</table>
<p style="text-align:center;color:var(--ka-text-muted);font-size:0.76rem;margin-top:0.3rem;">
  Based on 339 human A/B evaluations · OpenSkill Bayesian ranking
</p>
"""
# Credits and links rendered at the bottom of the page.
FOOTER_HTML = """
<div class="footer">
  <strong>KugelAudio</strong> · Created by Kajo Kratzenstein & Carlos Menke<br>
  <a href="https://github.com/Kugelaudio/kugelaudio-open">GitHub</a> ·
  <a href="https://huggingface.co/kugelaudio/kugelaudio-0-open">HuggingFace</a> ·
  <a href="https://kugelaudio.com">API</a> ·
  <a href="https://docs.kugelaudio.com/sdks/python">Docs</a><br>
  Funded by the German Federal Ministry of Research, Technology and Space (BMFTR)
  via the AI Service Center Berlin-Brandenburg at HPI
</div>
"""
# ─── Build Gradio Interface ─────────────────────────────────────────────────
# Five-tab layout: TTS, voice cloning, watermark verification, benchmarks,
# and an About page. Event handlers are wired inside each tab's scope.
with gr.Blocks(
    css=CSS,
    title="KugelAudio – European TTS",
    theme=gr.themes.Base(
        primary_hue=gr.themes.colors.red,
        secondary_hue=gr.themes.colors.slate,
        neutral_hue=gr.themes.colors.slate,
        font=gr.themes.GoogleFont("IBM Plex Sans"),
        font_mono=gr.themes.GoogleFont("IBM Plex Mono"),
    ),
) as demo:
    # ── Header ──
    gr.HTML(HEADER_HTML)
    # ── Main Tabs ──
    with gr.Tabs():
        # ━━━ Tab 1: Text-to-Speech ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        with gr.TabItem("🔊 Text-to-Speech", id="tts"):
            with gr.Row():
                # Left column: inputs and generation controls.
                with gr.Column(scale=3):
                    tts_language = gr.Dropdown(
                        choices=list(LANGUAGES.keys()),
                        value="English 🇺🇸",
                        label="Language",
                        info="24 European languages supported",
                    )
                    tts_text = gr.Textbox(
                        label="Text to Synthesize",
                        placeholder="Enter text here or click 'Fill Example' below…",
                        lines=5,
                        max_lines=12,
                    )
                    with gr.Row():
                        tts_example_btn = gr.Button(
                            "📝 Fill Example", size="sm", variant="secondary"
                        )
                        tts_clear_btn = gr.ClearButton(
                            [tts_text], value="🗑️ Clear", size="sm"
                        )
                    with gr.Accordion("⚙️ Advanced Settings", open=False):
                        tts_cfg = gr.Slider(
                            minimum=1.0,
                            maximum=10.0,
                            value=3.0,
                            step=0.5,
                            label="CFG Scale",
                            info="Guidance scale — higher values follow the text more closely",
                        )
                        tts_max_tokens = gr.Slider(
                            minimum=512,
                            maximum=8192,
                            value=4096,
                            step=512,
                            label="Max Tokens",
                            info="Maximum generation length in tokens",
                        )
                    tts_generate_btn = gr.Button(
                        "🎙️ Generate Speech", variant="primary", size="lg"
                    )
                # Right column: output audio and run statistics.
                with gr.Column(scale=2):
                    tts_audio_out = gr.Audio(
                        label="Generated Audio",
                        type="filepath",
                        interactive=False,
                    )
                    tts_info = gr.Markdown("*Press 'Generate Speech' to synthesize audio.*")
            # Events
            tts_example_btn.click(
                fn=fill_example_text, inputs=[tts_language], outputs=[tts_text]
            )
            # Changing the language also refreshes the example text.
            tts_language.change(
                fn=fill_example_text, inputs=[tts_language], outputs=[tts_text]
            )
            tts_generate_btn.click(
                fn=generate_speech,
                inputs=[tts_text, tts_language, tts_cfg, tts_max_tokens],
                outputs=[tts_audio_out, tts_info],
            )
        # ━━━ Tab 2: Voice Cloning ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        with gr.TabItem("🎭 Voice Cloning", id="clone"):
            with gr.Row():
                with gr.Column(scale=3):
                    clone_language = gr.Dropdown(
                        choices=list(LANGUAGES.keys()),
                        value="English 🇺🇸",
                        label="Language",
                    )
                    clone_text = gr.Textbox(
                        label="Text to Synthesize",
                        placeholder="Enter the text you want spoken in the cloned voice…",
                        lines=4,
                        max_lines=10,
                    )
                    # Reference clip can be uploaded or recorded in-browser.
                    clone_ref = gr.Audio(
                        label="Reference Voice",
                        type="filepath",
                        sources=["upload", "microphone"],
                    )
                    gr.Markdown(
                        "<p class='section-label'>Upload or record a few seconds of the "
                        "target voice. The model will replicate its characteristics.</p>"
                    )
                    with gr.Accordion("⚙️ Advanced Settings", open=False):
                        clone_cfg = gr.Slider(
                            minimum=1.0,
                            maximum=10.0,
                            value=3.0,
                            step=0.5,
                            label="CFG Scale",
                        )
                        clone_max_tokens = gr.Slider(
                            minimum=512,
                            maximum=8192,
                            value=4096,
                            step=512,
                            label="Max Tokens",
                        )
                    clone_btn = gr.Button(
                        "🎭 Clone & Generate", variant="primary", size="lg"
                    )
                with gr.Column(scale=2):
                    clone_audio_out = gr.Audio(
                        label="Cloned Voice Output",
                        type="filepath",
                        interactive=False,
                    )
                    clone_info = gr.Markdown(
                        "*Upload a reference voice and press 'Clone & Generate'.*"
                    )
            # Events
            clone_btn.click(
                fn=clone_voice,
                inputs=[
                    clone_text,
                    clone_ref,
                    clone_language,
                    clone_cfg,
                    clone_max_tokens,
                ],
                outputs=[clone_audio_out, clone_info],
            )
        # ━━━ Tab 3: Watermark Verification ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        with gr.TabItem("🔒 Watermark Verify", id="watermark"):
            with gr.Row():
                with gr.Column(scale=3):
                    wm_audio = gr.Audio(
                        label="Audio to Verify",
                        type="filepath",
                        sources=["upload"],
                    )
                    gr.Markdown(
                        "<p class='section-label'>All KugelAudio outputs are watermarked "
                        "with Facebook AudioSeal. Upload any audio file to check.</p>"
                    )
                    wm_btn = gr.Button(
                        "🔍 Verify Watermark", variant="primary", size="lg"
                    )
                with gr.Column(scale=2):
                    wm_result = gr.Markdown("*Upload audio and press 'Verify Watermark'.*")
            wm_btn.click(
                fn=verify_watermark,
                inputs=[wm_audio],
                outputs=[wm_result],
            )
        # ━━━ Tab 4: Benchmarks ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        with gr.TabItem("🏆 Benchmarks", id="bench"):
            gr.Markdown("### Human Preference Ranking — German TTS")
            gr.HTML(BENCHMARK_HTML)
            gr.Markdown(
                "Evaluations covered **neutral speech, shouting, singing, and "
                "drunken voice** styles across diverse German-language samples. "
                "Participants heard a reference voice and compared outputs from "
                "two anonymous models in a blind A/B test."
            )
        # ━━━ Tab 5: About ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        with gr.TabItem("ℹ️ About", id="about"):
            gr.Markdown("""
### Architecture
KugelAudio uses a hybrid **AR + Diffusion** pipeline:
1. **Text Encoder** — Qwen2-based language model encodes input text
2. **TTS Backbone** — Upper transformer layers generate speech representations
3. **Diffusion Head** — Predicts speech latents via denoising diffusion
4. **Acoustic Decoder** — Converts latents to waveforms
### Training
| Detail | Value |
|--------|-------|
| Base model | Microsoft VibeVoice |
| Training data | ~200,000 hours (YODAS2) |
| Hardware | 8× NVIDIA H100 |
| Duration | 5 days |
| Parameters | 7B |
### Responsible Use
KugelAudio is intended for accessibility, content creation, voice assistants,
language learning, and creative projects **with consent**. All generated audio
is watermarked with Facebook AudioSeal. Creating deepfakes, impersonation
without consent, fraud, or any illegal use is prohibited.
### License
Released under the **MIT License**.
### Citation
```bibtex
@software{kugelaudio2026,
  title = {KugelAudio: Open-Source TTS for European Languages with Voice Cloning},
  author = {Kratzenstein, Kajo and Menke, Carlos},
  year = {2026},
  url = {https://github.com/kugelaudio/kugelaudio}
}
```
""")
    # ── Footer ──
    gr.HTML(FOOTER_HTML)
# ─── Launch ──────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Enable request queuing so concurrent users are served in order.
    demo.queue()
    # Bind on all interfaces (required inside the Spaces container) on the
    # standard Gradio port.
    demo.launch(server_name="0.0.0.0", server_port=7860)