# kugelaudio / app.py
# Hugging Face Space file view: last commit "Update app.py" (6f41d1c, verified) by multimodalart
"""
KugelAudio Gradio Demo
Open-source text-to-speech for European languages with voice cloning capabilities.
"""
import logging
import tempfile
import time
import gradio as gr
import torch
import torchaudio
import spaces
from kugelaudio_open import (
KugelAudioForConditionalGenerationInference,
KugelAudioProcessor,
)
from kugelaudio_open.watermark import AudioWatermark
logger = logging.getLogger(__name__)

# ─── Device & Model Setup ───────────────────────────────────────────────────
# Pick the accelerator once; every later tensor move (see `_to_device`) uses DEVICE.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 on GPU, float32 on CPU.
DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32
MODEL_ID = "kugelaudio/kugelaudio-0-open"
# Sample rate (Hz) used for duration reporting and for watermark checks.
OUTPUT_SAMPLE_RATE = 24000

logger.info("Loading KugelAudio model '%s' on %s (%s)โ€ฆ", MODEL_ID, DEVICE, DTYPE)
# Load with the DEVICE/DTYPE chosen above instead of hard-coding "cuda"/bfloat16,
# so the model, the log line and `_to_device` always agree and CPU-only hosts
# do not crash at import time. With CUDA available this is identical to the
# previous behaviour.
# NOTE(review): on ZeroGPU Spaces, confirm torch.cuda.is_available() reports
# True at import time so the model is still placed on the GPU.
model = KugelAudioForConditionalGenerationInference.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
).to(DEVICE)
processor = KugelAudioProcessor.from_pretrained(MODEL_ID)
watermarker = AudioWatermark()
logger.info("Model loaded successfully.")
# ─── Language Configuration ─────────────────────────────────────────────────
# Maps the dropdown label shown in the UI to a short language code.
# Unknown labels fall back to "en" at the lookup sites.
# NOTE(review): this mapping has 23 entries while the UI text advertises
# "24 Languages"; confirm whether an entry is missing.
LANGUAGES = {
    "English ๐Ÿ‡บ๐Ÿ‡ธ": "en", "German ๐Ÿ‡ฉ๐Ÿ‡ช": "de", "French ๐Ÿ‡ซ๐Ÿ‡ท": "fr",
    "Spanish ๐Ÿ‡ช๐Ÿ‡ธ": "es", "Italian ๐Ÿ‡ฎ๐Ÿ‡น": "it", "Portuguese ๐Ÿ‡ง๐Ÿ‡ท๐Ÿ‡ต๐Ÿ‡น": "pt",
    "Dutch ๐Ÿ‡ณ๐Ÿ‡ฑ": "nl", "Polish ๐Ÿ‡ต๐Ÿ‡ฑ": "pl", "Russian ๐Ÿ‡ท๐Ÿ‡บ": "ru",
    "Ukrainian ๐Ÿ‡บ๐Ÿ‡ฆ": "uk", "Czech ๐Ÿ‡จ๐Ÿ‡ฟ": "cs", "Romanian ๐Ÿ‡ท๐Ÿ‡ด": "ro",
    "Hungarian ๐Ÿ‡ญ๐Ÿ‡บ": "hu", "Swedish ๐Ÿ‡ธ๐Ÿ‡ช": "sv", "Danish ๐Ÿ‡ฉ๐Ÿ‡ฐ": "da",
    "Finnish ๐Ÿ‡ซ๐Ÿ‡ฎ": "fi", "Norwegian ๐Ÿ‡ณ๐Ÿ‡ด": "no", "Greek ๐Ÿ‡ฌ๐Ÿ‡ท": "el",
    "Bulgarian ๐Ÿ‡ง๐Ÿ‡ฌ": "bg", "Slovak ๐Ÿ‡ธ๐Ÿ‡ฐ": "sk", "Croatian ๐Ÿ‡ญ๐Ÿ‡ท": "hr",
    "Serbian ๐Ÿ‡ท๐Ÿ‡ธ": "sr", "Turkish ๐Ÿ‡น๐Ÿ‡ท": "tr",
}
# Sample sentences keyed by language code, used by `fill_example_text`.
# Only a subset of the codes in LANGUAGES has an entry; the others fall back
# to the "en" sample at lookup time.
EXAMPLE_TEXTS = {
    "en": "Welcome to KugelAudio, the open-source text-to-speech system for European languages. Our model supports voice cloning and emotional speech synthesis.",
    "de": "Willkommen bei KugelAudio, dem Open-Source Text-to-Speech System fรผr europรคische Sprachen. Unser Modell unterstรผtzt Voice Cloning und emotionale Sprachsynthese.",
    "fr": "Bienvenue sur KugelAudio, le systรจme de synthรจse vocale open-source pour les langues europรฉennes. Notre modรจle prend en charge le clonage vocal et la synthรจse vocale รฉmotionnelle.",
    "es": "Bienvenido a KugelAudio, el sistema de texto a voz de cรณdigo abierto para idiomas europeos. Nuestro modelo soporta clonaciรณn de voz y sรญntesis de habla emocional.",
    "it": "Benvenuto in KugelAudio, il sistema di sintesi vocale open-source per le lingue europee. Il nostro modello supporta la clonazione vocale e la sintesi vocale emotiva.",
    "pt": "Bem-vindo ao KugelAudio, o sistema de texto para fala de cรณdigo aberto para idiomas europeus. Nosso modelo suporta clonagem de voz e sรญntese de fala emocional.",
    "nl": "Welkom bij KugelAudio, het open-source tekst-naar-spraak systeem voor Europese talen. Ons model ondersteunt stemklonering en emotionele spraaksynthese.",
    "pl": "Witamy w KugelAudio, systemie syntezy mowy o otwartym kodzie ลบrรณdล‚owym dla jฤ™zykรณw europejskich. Nasz model obsล‚uguje klonowanie gล‚osu i emocjonalnฤ… syntezฤ™ mowy.",
    "ru": "ะ”ะพะฑั€ะพ ะฟะพะถะฐะปะพะฒะฐั‚ัŒ ะฒ KugelAudio โ€” ัะธัั‚ะตะผัƒ ัะธะฝั‚ะตะทะฐ ั€ะตั‡ะธ ั ะพั‚ะบั€ั‹ั‚ั‹ะผ ะธัั…ะพะดะฝั‹ะผ ะบะพะดะพะผ ะดะปั ะตะฒั€ะพะฟะตะนัะบะธั… ัะทั‹ะบะพะฒ. ะะฐัˆะฐ ะผะพะดะตะปัŒ ะฟะพะดะดะตั€ะถะธะฒะฐะตั‚ ะบะปะพะฝะธั€ะพะฒะฐะฝะธะต ะณะพะปะพัะฐ ะธ ัะผะพั†ะธะพะฝะฐะปัŒะฝั‹ะน ัะธะฝั‚ะตะท ั€ะตั‡ะธ.",
}
# ─── Inference Helpers ──────────────────────────────────────────────────────
def _to_device(inputs: dict) -> dict:
"""Move tensor values to the model device."""
return {
k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
for k, v in inputs.items()
}
def _save_to_tempfile(audio_tensor: torch.Tensor) -> str:
    """Write an audio tensor to a temporary WAV file and return its path.

    Args:
        audio_tensor: Audio data handed to ``processor.save_audio``.

    Returns:
        Path of the ``.wav`` file. The file is created with ``delete=False``,
        so it persists after this call and the caller owns its cleanup.
    """
    # Only the unique path is needed. Close the handle right away so no file
    # descriptor is leaked and the writer below does not race with an open
    # handle (reopening an open temp file fails on some platforms).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    processor.save_audio(audio_tensor, wav_path)
    return wav_path
@spaces.GPU
def generate_speech(
    text: str,
    language: str,
    cfg_scale: float,
    max_tokens: int,
) -> tuple[str, str]:
    """Generate speech from text and return ``(audio_path, info_markdown)``.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        language: Display label from ``LANGUAGES``; unknown labels map to ``"en"``.
        cfg_scale: Guidance scale forwarded to ``model.generate``.
        max_tokens: Generation limit, forwarded as ``max_new_tokens`` after an
            ``int`` cast (sliders may deliver floats).

    Returns:
        The path of the generated WAV file and a Markdown summary line with
        duration, inference time and real-time factor (RTF).

    Raises:
        gr.Error: If ``text`` is missing, empty or whitespace only.
    """
    # `not text` also covers a None value, which would otherwise crash on .strip().
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # The language code is only echoed in the summary below; the processor
    # call receives the text alone.
    lang_code = LANGUAGES.get(language, "en")
    inputs = processor(text=text, return_tensors="pt")
    inputs = _to_device(inputs)

    t0 = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=cfg_scale,
            max_new_tokens=int(max_tokens),
        )
    elapsed = time.perf_counter() - t0

    speech = outputs.speech_outputs[0]
    audio_path = _save_to_tempfile(speech)
    audio_duration = speech.shape[-1] / OUTPUT_SAMPLE_RATE
    # An empty waveform has zero duration; report the RTF as unavailable
    # instead of raising ZeroDivisionError.
    rtf_text = f"{elapsed / audio_duration:.2f}x" if audio_duration > 0 else "n/a"

    info = (
        f"๐Ÿ”Š **Generation Complete**\n\n"
        f"Language: {language} (`{lang_code}`) ยท "
        f"CFG: {cfg_scale} ยท "
        f"Duration: {audio_duration:.1f}s ยท "
        f"Inference: {elapsed:.2f}s ยท "
        f"RTF: {rtf_text}"
    )
    return audio_path, info
@spaces.GPU
def clone_voice(
    text: str,
    reference_audio: str | None,
    language: str,
    cfg_scale: float,
    max_tokens: int,
) -> tuple[str, str]:
    """Clone a voice from reference audio and synthesize new text with it.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        reference_audio: File path of the reference recording, passed to the
            processor as ``voice_prompt``; ``None`` when nothing was uploaded.
        language: Display label from ``LANGUAGES``; unknown labels map to ``"en"``.
        cfg_scale: Guidance scale forwarded to ``model.generate``.
        max_tokens: Generation limit, forwarded as ``max_new_tokens`` after an
            ``int`` cast (sliders may deliver floats).

    Returns:
        The path of the generated WAV file and a Markdown summary line with
        duration, inference time and real-time factor (RTF).

    Raises:
        gr.Error: If ``text`` is missing/blank or no reference audio was given.
    """
    # `not text` also covers a None value, which would otherwise crash on .strip().
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if reference_audio is None:
        raise gr.Error("Please upload a reference audio file for voice cloning.")

    # The language code is only echoed in the summary below; the processor
    # call receives the text and the voice prompt.
    lang_code = LANGUAGES.get(language, "en")
    inputs = processor(
        text=text,
        voice_prompt=reference_audio,
        return_tensors="pt",
    )
    inputs = _to_device(inputs)

    t0 = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=cfg_scale,
            max_new_tokens=int(max_tokens),
        )
    elapsed = time.perf_counter() - t0

    speech = outputs.speech_outputs[0]
    audio_path = _save_to_tempfile(speech)
    audio_duration = speech.shape[-1] / OUTPUT_SAMPLE_RATE
    # An empty waveform has zero duration; report the RTF as unavailable
    # instead of raising ZeroDivisionError.
    rtf_text = f"{elapsed / audio_duration:.2f}x" if audio_duration > 0 else "n/a"

    info = (
        f"๐ŸŽญ **Voice Cloning Complete**\n\n"
        f"Language: {language} (`{lang_code}`) ยท "
        f"CFG: {cfg_scale} ยท "
        f"Duration: {audio_duration:.1f}s ยท "
        f"Inference: {elapsed:.2f}s ยท "
        f"RTF: {rtf_text}"
    )
    return audio_path, info
def verify_watermark(audio_file: str | None) -> str:
    """Check an uploaded audio file for the watermark and return a Markdown report.

    The file is loaded with ``torchaudio``, resampled to ``OUTPUT_SAMPLE_RATE``
    when its native rate differs, and handed to ``watermarker.detect``.

    Raises:
        gr.Error: If no audio file was provided.
    """
    if audio_file is None:
        raise gr.Error("Please upload an audio file to verify.")

    samples, source_rate = torchaudio.load(audio_file)
    # The detector is always called at OUTPUT_SAMPLE_RATE, so bring other rates in line.
    if source_rate != OUTPUT_SAMPLE_RATE:
        samples = torchaudio.functional.resample(
            samples, source_rate, OUTPUT_SAMPLE_RATE
        )

    detection = watermarker.detect(samples, sample_rate=OUTPUT_SAMPLE_RATE)
    status = (
        "โœ… **Watermark Detected**"
        if detection.detected
        else "โŒ **No Watermark Detected**"
    )
    return (
        f"๐Ÿ” **Watermark Verification**\n\n"
        f"{status}\n\n"
        f"Confidence: **{detection.confidence:.1%}**\n\n"
        f"Technology: Facebook AudioSeal ยท Resolution: 1/16k second"
    )
def fill_example_text(language: str) -> str:
    """Return the sample sentence for the selected language label.

    Labels missing from ``LANGUAGES`` resolve to ``"en"``; codes without an
    entry in ``EXAMPLE_TEXTS`` yield the English sample.
    """
    code = LANGUAGES.get(language, "en")
    english_sample = EXAMPLE_TEXTS["en"]
    return EXAMPLE_TEXTS.get(code, english_sample)
# ─── Custom CSS ─────────────────────────────────────────────────────────────
# Stylesheet passed to gr.Blocks(css=...). It defines the --ka-* colour
# variables and styles the classes used by the HTML snippets in this file
# (.hero-header, .badges/.badge, .benchmark-table, .section-label, .footer),
# plus the tab buttons.
CSS = """
/* โ”€โ”€ Base theme โ”€โ”€ */
:root {
--ka-primary: #1a1a2e;
--ka-accent: #e94560;
--ka-accent-hover: #ff6b81;
--ka-surface: #16213e;
--ka-surface-light: #1c2a4a;
--ka-text: #eaeaea;
--ka-text-muted: #8892a4;
--ka-border: #2a3a5c;
--ka-gold: #f5c518;
--ka-green: #2ecc71;
}
/* โ”€โ”€ Global โ”€โ”€ */
.gradio-container {
max-width: 960px !important;
margin: 0 auto !important;
font-family: 'IBM Plex Sans', 'Segoe UI', system-ui, sans-serif !important;
}
/* โ”€โ”€ Hero header โ”€โ”€ */
.hero-header {
text-align: center;
padding: .5rem 1.5rem 1.5rem;
margin-bottom: 0.5rem;
border-radius: 16px;
position: relative;
overflow: hidden;
}
.hero-header::before {
content: '';
position: absolute;
top: -50%;
left: -50%;
width: 200%;
height: 200%;
pointer-events: none;
}
.hero-header h1 {
font-size: 2.4rem !important;
font-weight: 700 !important;
margin: 0 0 0.3rem !important;
color: var(--body-text-color);
letter-spacing: -0.02em;
}
.hero-header .hero-accent {
color: var(--ka-accent);
}
.hero-header p {
color: var(--ka-text-muted);
font-size: 1.05rem;
margin: 0;
line-height: 1.5;
}
/* โ”€โ”€ Badges row โ”€โ”€ */
.badges {
display: flex;
justify-content: center;
gap: 0.5rem;
margin-top: 1rem;
flex-wrap: wrap;
}
.badge {
display: inline-flex;
align-items: center;
gap: 0.35rem;
padding: 0.3rem 0.75rem;
border-radius: 999px;
font-size: 0.78rem;
font-weight: 600;
letter-spacing: 0.01em;
background: var(--block-background-fill, var(--ka-surface-light));
color: var(--body-text-color, var(--ka-text));
border: 1px solid var(--border-color-primary, var(--ka-border));
}
.badge.gold { border-color: var(--ka-gold); color: var(--ka-gold); }
.badge.green { border-color: var(--ka-green); color: var(--ka-green); }
.badge.accent { border-color: var(--ka-accent); color: var(--ka-accent); }
/* โ”€โ”€ Benchmark table โ”€โ”€ */
.benchmark-table {
width: 100%;
border-collapse: separate;
border-spacing: 0;
margin: 0.75rem 0;
font-size: 0.88rem;
border-radius: 10px;
overflow: hidden;
border: 1px solid var(--ka-border);
}
.benchmark-table th {
background: var(--ka-surface);
color: var(--ka-text-muted);
font-weight: 600;
text-transform: uppercase;
font-size: 0.72rem;
letter-spacing: 0.06em;
padding: 0.65rem 0.8rem;
text-align: left;
}
.benchmark-table td {
padding: 0.55rem 0.8rem;
border-top: 1px solid var(--ka-border);
color: var(--ka-text);
}
.benchmark-table tr.highlight td {
background: rgba(233, 69, 96, 0.08);
font-weight: 600;
}
.benchmark-table tr:not(.highlight) td {
background: transparent;
}
/* โ”€โ”€ Section divider โ”€โ”€ */
.section-label {
font-size: 0.7rem;
text-transform: uppercase;
letter-spacing: 0.1em;
color: var(--ka-text-muted);
margin: 1rem 0 0.3rem;
padding-left: 2px;
font-weight: 600;
}
/* โ”€โ”€ Tab styling โ”€โ”€ */
.tab-nav button {
font-weight: 600 !important;
letter-spacing: 0.01em !important;
}
.tab-nav button.selected {
border-color: var(--ka-accent) !important;
color: var(--ka-accent) !important;
}
/* โ”€โ”€ Footer โ”€โ”€ */
.footer {
text-align: center;
padding: 1.2rem;
margin-top: 1rem;
font-size: 0.8rem;
color: var(--ka-text-muted);
border-top: 1px solid var(--ka-border);
line-height: 1.6;
}
.footer a {
color: var(--ka-accent);
text-decoration: none;
}
.footer a:hover {
text-decoration: underline;
}
"""
# ─── Header HTML ────────────────────────────────────────────────────────────
# Hero banner rendered at the top of the page via gr.HTML: title, one-line
# model description and a row of badges (styled by .hero-header / .badge in CSS).
HEADER_HTML = """
<div class="hero-header">
<h1>๐ŸŽ™๏ธ <span class="hero-accent">Kugel</span>Audio</h1>
<p>Kugel-Audio is a fine-tune of Vibe-Voice 7B in 200K hours of 24 european languages on the YODAS2 dataset</p>
<div class="badges">
<span class="badge gold">๐Ÿ† #1 German TTS</span>
<span class="badge green">24 Languages</span>
<span class="badge accent">Voice Cloning</span>
<span class="badge">MIT License</span>
<span class="badge">7B Parameters</span>
</div>
</div>
"""
# Static ranking table shown in the "Benchmarks" tab (styled by
# .benchmark-table in CSS), followed by a short caption on how it was obtained.
# The figures are hard-coded here and are not computed by this app.
BENCHMARK_HTML = """
<table class="benchmark-table">
<thead>
<tr>
<th>Rank</th>
<th>Model</th>
<th>Score</th>
<th>Win Rate</th>
</tr>
</thead>
<tbody>
<tr class="highlight">
<td>๐Ÿฅ‡</td>
<td>KugelAudio</td>
<td>26</td>
<td>78.0%</td>
</tr>
<tr>
<td>๐Ÿฅˆ</td>
<td>ElevenLabs Multi v2</td>
<td>25</td>
<td>62.2%</td>
</tr>
<tr>
<td>๐Ÿฅ‰</td>
<td>ElevenLabs v3</td>
<td>21</td>
<td>65.3%</td>
</tr>
<tr>
<td>4</td>
<td>Cartesia</td>
<td>21</td>
<td>59.1%</td>
</tr>
<tr>
<td>5</td>
<td>VibeVoice</td>
<td>10</td>
<td>28.8%</td>
</tr>
<tr>
<td>6</td>
<td>CosyVoice v3</td>
<td>9</td>
<td>14.2%</td>
</tr>
</tbody>
</table>
<p style="text-align:center;color:var(--ka-text-muted);font-size:0.76rem;margin-top:0.3rem;">
Based on 339 human A/B evaluations ยท OpenSkill Bayesian ranking
</p>
"""
# Page footer rendered via gr.HTML below the tabs: credits, project links and
# the funding acknowledgement (styled by .footer in CSS).
FOOTER_HTML = """
<div class="footer">
<strong>KugelAudio</strong> ยท Created by Kajo Kratzenstein & Carlos Menke<br>
<a href="https://github.com/Kugelaudio/kugelaudio-open">GitHub</a> ยท
<a href="https://huggingface.co/kugelaudio/kugelaudio-0-open">HuggingFace</a> ยท
<a href="https://kugelaudio.com">API</a> ยท
<a href="https://docs.kugelaudio.com/sdks/python">Docs</a><br>
Funded by the German Federal Ministry of Research, Technology and Space (BMFTR)
via the AI Service Center Berlin-Brandenburg at HPI
</div>
"""
# ─── Build Gradio Interface ─────────────────────────────────────────────────
# Assemble the whole UI as a single gr.Blocks app named `demo`:
# header, five tabs (TTS, voice cloning, watermark check, benchmarks, about)
# and footer. Components are laid out in declaration order, so the statement
# order below is the page layout.
with gr.Blocks(
    css=CSS,
    title="KugelAudio โ€“ European TTS",
    theme=gr.themes.Base(
        primary_hue=gr.themes.colors.red,
        secondary_hue=gr.themes.colors.slate,
        neutral_hue=gr.themes.colors.slate,
        font=gr.themes.GoogleFont("IBM Plex Sans"),
        font_mono=gr.themes.GoogleFont("IBM Plex Mono"),
    ),
) as demo:
    # ── Header ──
    gr.HTML(HEADER_HTML)
    # ── Main Tabs ──
    with gr.Tabs():
        # ━━━ Tab 1: Text-to-Speech ━━━
        with gr.TabItem("๐Ÿ”Š Text-to-Speech", id="tts"):
            with gr.Row():
                # Left column: inputs and settings.
                with gr.Column(scale=3):
                    tts_language = gr.Dropdown(
                        choices=list(LANGUAGES.keys()),
                        value="English ๐Ÿ‡บ๐Ÿ‡ธ",
                        label="Language",
                        info="24 European languages supported",
                    )
                    tts_text = gr.Textbox(
                        label="Text to Synthesize",
                        placeholder="Enter text here or click 'Fill Example' belowโ€ฆ",
                        lines=5,
                        max_lines=12,
                    )
                    with gr.Row():
                        tts_example_btn = gr.Button(
                            "๐Ÿ“ Fill Example", size="sm", variant="secondary"
                        )
                        tts_clear_btn = gr.ClearButton(
                            [tts_text], value="๐Ÿ—‘๏ธ Clear", size="sm"
                        )
                    with gr.Accordion("โš™๏ธ Advanced Settings", open=False):
                        tts_cfg = gr.Slider(
                            minimum=1.0,
                            maximum=10.0,
                            value=3.0,
                            step=0.5,
                            label="CFG Scale",
                            info="Guidance scale โ€” higher values follow the text more closely",
                        )
                        tts_max_tokens = gr.Slider(
                            minimum=512,
                            maximum=8192,
                            value=4096,
                            step=512,
                            label="Max Tokens",
                            info="Maximum generation length in tokens",
                        )
                    tts_generate_btn = gr.Button(
                        "๐ŸŽ™๏ธ Generate Speech", variant="primary", size="lg"
                    )
                # Right column: generated audio and the summary line.
                with gr.Column(scale=2):
                    tts_audio_out = gr.Audio(
                        label="Generated Audio",
                        type="filepath",
                        interactive=False,
                    )
                    tts_info = gr.Markdown("*Press 'Generate Speech' to synthesize audio.*")
            # Events
            tts_example_btn.click(
                fn=fill_example_text, inputs=[tts_language], outputs=[tts_text]
            )
            # Changing the language also replaces the text box content with that
            # language's example sentence.
            tts_language.change(
                fn=fill_example_text, inputs=[tts_language], outputs=[tts_text]
            )
            tts_generate_btn.click(
                fn=generate_speech,
                inputs=[tts_text, tts_language, tts_cfg, tts_max_tokens],
                outputs=[tts_audio_out, tts_info],
            )
        # ━━━ Tab 2: Voice Cloning ━━━
        with gr.TabItem("๐ŸŽญ Voice Cloning", id="clone"):
            with gr.Row():
                with gr.Column(scale=3):
                    clone_language = gr.Dropdown(
                        choices=list(LANGUAGES.keys()),
                        value="English ๐Ÿ‡บ๐Ÿ‡ธ",
                        label="Language",
                    )
                    clone_text = gr.Textbox(
                        label="Text to Synthesize",
                        placeholder="Enter the text you want spoken in the cloned voiceโ€ฆ",
                        lines=4,
                        max_lines=10,
                    )
                    # type="filepath": the handler receives a path string (or None).
                    clone_ref = gr.Audio(
                        label="Reference Voice",
                        type="filepath",
                        sources=["upload", "microphone"],
                    )
                    gr.Markdown(
                        "<p class='section-label'>Upload or record a few seconds of the "
                        "target voice. The model will replicate its characteristics.</p>"
                    )
                    with gr.Accordion("โš™๏ธ Advanced Settings", open=False):
                        clone_cfg = gr.Slider(
                            minimum=1.0,
                            maximum=10.0,
                            value=3.0,
                            step=0.5,
                            label="CFG Scale",
                        )
                        clone_max_tokens = gr.Slider(
                            minimum=512,
                            maximum=8192,
                            value=4096,
                            step=512,
                            label="Max Tokens",
                        )
                    clone_btn = gr.Button(
                        "๐ŸŽญ Clone & Generate", variant="primary", size="lg"
                    )
                with gr.Column(scale=2):
                    clone_audio_out = gr.Audio(
                        label="Cloned Voice Output",
                        type="filepath",
                        interactive=False,
                    )
                    clone_info = gr.Markdown(
                        "*Upload a reference voice and press 'Clone & Generate'.*"
                    )
            # Events (input order must match the clone_voice signature)
            clone_btn.click(
                fn=clone_voice,
                inputs=[
                    clone_text,
                    clone_ref,
                    clone_language,
                    clone_cfg,
                    clone_max_tokens,
                ],
                outputs=[clone_audio_out, clone_info],
            )
        # ━━━ Tab 3: Watermark Verification ━━━
        with gr.TabItem("๐Ÿ”’ Watermark Verify", id="watermark"):
            with gr.Row():
                with gr.Column(scale=3):
                    wm_audio = gr.Audio(
                        label="Audio to Verify",
                        type="filepath",
                        sources=["upload"],
                    )
                    gr.Markdown(
                        "<p class='section-label'>All KugelAudio outputs are watermarked "
                        "with Facebook AudioSeal. Upload any audio file to check.</p>"
                    )
                    wm_btn = gr.Button(
                        "๐Ÿ” Verify Watermark", variant="primary", size="lg"
                    )
                with gr.Column(scale=2):
                    wm_result = gr.Markdown("*Upload audio and press 'Verify Watermark'.*")
            wm_btn.click(
                fn=verify_watermark,
                inputs=[wm_audio],
                outputs=[wm_result],
            )
        # ━━━ Tab 4: Benchmarks ━━━
        with gr.TabItem("๐Ÿ† Benchmarks", id="bench"):
            gr.Markdown("### Human Preference Ranking โ€” German TTS")
            gr.HTML(BENCHMARK_HTML)
            gr.Markdown(
                "Evaluations covered **neutral speech, shouting, singing, and "
                "drunken voice** styles across diverse German-language samples. "
                "Participants heard a reference voice and compared outputs from "
                "two anonymous models in a blind A/B test."
            )
        # ━━━ Tab 5: About ━━━
        with gr.TabItem("โ„น๏ธ About", id="about"):
            gr.Markdown("""
### Architecture
KugelAudio uses a hybrid **AR + Diffusion** pipeline:
1. **Text Encoder** โ€” Qwen2-based language model encodes input text
2. **TTS Backbone** โ€” Upper transformer layers generate speech representations
3. **Diffusion Head** โ€” Predicts speech latents via denoising diffusion
4. **Acoustic Decoder** โ€” Converts latents to waveforms
### Training
| Detail | Value |
|--------|-------|
| Base model | Microsoft VibeVoice |
| Training data | ~200,000 hours (YODAS2) |
| Hardware | 8ร— NVIDIA H100 |
| Duration | 5 days |
| Parameters | 7B |
### Responsible Use
KugelAudio is intended for accessibility, content creation, voice assistants,
language learning, and creative projects **with consent**. All generated audio
is watermarked with Facebook AudioSeal. Creating deepfakes, impersonation
without consent, fraud, or any illegal use is prohibited.
### License
Released under the **MIT License**.
### Citation
```bibtex
@software{kugelaudio2026,
title = {KugelAudio: Open-Source TTS for European Languages with Voice Cloning},
author = {Kratzenstein, Kajo and Menke, Carlos},
year = {2026},
url = {https://github.com/kugelaudio/kugelaudio}
}
```
""")
    # ── Footer ──
    gr.HTML(FOOTER_HTML)
# ─── Launch ─────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Listen on all interfaces on port 7860 when run as a script.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    # Enable the request queue before the server starts.
    demo.queue()
    demo.launch(**launch_options)