multimodalart HF Staff committed on
Commit
a1d2c1f
·
verified ·
1 Parent(s): b4096d5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +688 -0
app.py ADDED
@@ -0,0 +1,688 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ KugelAudio Gradio Demo
3
+ Open-source text-to-speech for European languages with voice cloning capabilities.
4
+ """
5
+
6
+ import logging
7
+ import tempfile
8
+ import time
9
+
10
+ import gradio as gr
11
+ import torch
12
+ import torchaudio
13
+
14
+ from kugelaudio_open import (
15
+ KugelAudioForConditionalGenerationInference,
16
+ KugelAudioProcessor,
17
+ )
18
+ from kugelaudio_open.watermark import AudioWatermark
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # ─── Device & Model Setup ───────────────────────────────────────────────────
23
+
24
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
25
+ DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32
26
+ MODEL_ID = "kugelaudio/kugelaudio-0-open"
27
+ OUTPUT_SAMPLE_RATE = 24000
28
+
29
+ logger.info("Loading KugelAudio model '%s' on %s (%s)…", MODEL_ID, DEVICE, DTYPE)
30
+
31
+ model = KugelAudioForConditionalGenerationInference.from_pretrained(
32
+ MODEL_ID,
33
+ torch_dtype=DTYPE,
34
+ ).to(DEVICE)
35
+ model.eval()
36
+
37
+ processor = KugelAudioProcessor.from_pretrained(MODEL_ID)
38
+ watermarker = AudioWatermark()
39
+
40
+ logger.info("Model loaded successfully.")
41
+
42
+ # ─── Language Configuration ──────────────────────────────────────────────────
43
+
44
+ LANGUAGES = {
45
+ "English 🇺🇸": "en", "German 🇩🇪": "de", "French 🇫🇷": "fr",
46
+ "Spanish 🇪🇸": "es", "Italian 🇮🇹": "it", "Portuguese 🇵🇹": "pt",
47
+ "Dutch 🇳🇱": "nl", "Polish 🇵🇱": "pl", "Russian 🇷🇺": "ru",
48
+ "Ukrainian 🇺🇦": "uk", "Czech 🇨🇿": "cs", "Romanian 🇷🇴": "ro",
49
+ "Hungarian 🇭🇺": "hu", "Swedish 🇸🇪": "sv", "Danish 🇩🇰": "da",
50
+ "Finnish 🇫🇮": "fi", "Norwegian 🇳🇴": "no", "Greek 🇬🇷": "el",
51
+ "Bulgarian 🇧🇬": "bg", "Slovak 🇸🇰": "sk", "Croatian 🇭🇷": "hr",
52
+ "Serbian 🇷🇸": "sr", "Turkish 🇹🇷": "tr",
53
+ }
54
+
55
+ EXAMPLE_TEXTS = {
56
+ "en": "Welcome to KugelAudio, the open-source text-to-speech system for European languages. Our model supports voice cloning and emotional speech synthesis.",
57
+ "de": "Willkommen bei KugelAudio, dem Open-Source Text-to-Speech System für europäische Sprachen. Unser Modell unterstützt Voice Cloning und emotionale Sprachsynthese.",
58
+ "fr": "Bienvenue sur KugelAudio, le système de synthèse vocale open-source pour les langues européennes. Notre modèle prend en charge le clonage vocal et la synthèse vocale émotionnelle.",
59
+ "es": "Bienvenido a KugelAudio, el sistema de texto a voz de código abierto para idiomas europeos. Nuestro modelo soporta clonación de voz y síntesis de habla emocional.",
60
+ "it": "Benvenuto in KugelAudio, il sistema di sintesi vocale open-source per le lingue europee. Il nostro modello supporta la clonazione vocale e la sintesi vocale emotiva.",
61
+ "pt": "Bem-vindo ao KugelAudio, o sistema de texto para fala de código aberto para idiomas europeus. Nosso modelo suporta clonagem de voz e síntese de fala emocional.",
62
+ "nl": "Welkom bij KugelAudio, het open-source tekst-naar-spraak systeem voor Europese talen. Ons model ondersteunt stemklonering en emotionele spraaksynthese.",
63
+ "pl": "Witamy w KugelAudio, systemie syntezy mowy o otwartym kodzie źródłowym dla języków europejskich. Nasz model obsługuje klonowanie głosu i emocjonalną syntezę mowy.",
64
+ "ru": "Добро пожаловать в KugelAudio — систему синтеза речи с открытым исходным кодом для европейских языков. Наша модель поддерживает клонирование голоса и эмоциональный синтез речи.",
65
+ }
66
+
67
+ # ─── Inference Helpers ───────────────────────────────────────────────────────
68
+
69
+
70
+ def _to_device(inputs: dict) -> dict:
71
+ """Move tensor values to the model device."""
72
+ return {
73
+ k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
74
+ for k, v in inputs.items()
75
+ }
76
+
77
+
78
def _save_to_tempfile(audio_tensor: torch.Tensor) -> str:
    """Write an audio tensor to a temporary WAV file and return its path.

    The file is created with ``delete=False``, so it outlives this call; the
    caller (or the host's temp-dir cleanup) owns its removal.

    Args:
        audio_tensor: Audio data handed to ``processor.save_audio``.

    Returns:
        Filesystem path of the written ``.wav`` file.
    """
    import contextlib
    import os

    # Only the reserved path is needed. Close our own handle immediately so
    # each request does not leak a file descriptor, and so the path can be
    # reopened by save_audio on platforms that forbid a second open.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name

    try:
        processor.save_audio(audio_tensor, wav_path)
    except Exception:
        # Do not leave an empty or partial file behind when saving fails.
        with contextlib.suppress(OSError):
            os.remove(wav_path)
        raise
    return wav_path
83
+
84
+
85
def generate_speech(
    text: str,
    language: str,
    cfg_scale: float,
    max_tokens: int,
) -> tuple[str, str]:
    """Generate speech from text and return (audio_path, info_markdown).

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        language: Display name of the language (a key of ``LANGUAGES``);
            unknown names fall back to the ``"en"`` code.
        cfg_scale: Forwarded to ``model.generate`` as ``cfg_scale``.
        max_tokens: Cast to ``int`` and forwarded as ``max_new_tokens``.

    Returns:
        The path of the written WAV file and a Markdown summary line.

    Raises:
        gr.Error: If ``text`` is empty or whitespace-only.
    """
    if not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # NOTE(review): lang_code is only shown in the summary; it is not passed
    # to the processor or the model here — confirm that is intended.
    lang_code = LANGUAGES.get(language, "en")

    inputs = processor(text=text, return_tensors="pt")
    inputs = _to_device(inputs)

    t0 = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=cfg_scale,
            max_new_tokens=int(max_tokens),
        )
    elapsed = time.perf_counter() - t0

    speech = outputs.speech_outputs[0]
    audio_path = _save_to_tempfile(speech)
    audio_duration = speech.shape[-1] / OUTPUT_SAMPLE_RATE

    # A zero-sample output would otherwise crash the summary with
    # ZeroDivisionError after the audio file was already written.
    rtf = f"{elapsed / audio_duration:.2f}x" if audio_duration > 0 else "n/a"

    info = (
        f"🔊 **Generation Complete**\n\n"
        f"Language: {language} (`{lang_code}`) · "
        f"CFG: {cfg_scale} · "
        f"Duration: {audio_duration:.1f}s · "
        f"Inference: {elapsed:.2f}s · "
        f"RTF: {rtf}"
    )
    return audio_path, info
121
+
122
+
123
def clone_voice(
    text: str,
    reference_audio: str | None,
    language: str,
    cfg_scale: float,
    max_tokens: int,
) -> tuple[str, str]:
    """Clone a voice from reference audio and synthesize new text.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        reference_audio: Reference recording, passed to the processor as
            ``voice_prompt``; ``None`` means nothing was provided.
        language: Display name of the language (a key of ``LANGUAGES``);
            unknown names fall back to the ``"en"`` code.
        cfg_scale: Forwarded to ``model.generate`` as ``cfg_scale``.
        max_tokens: Cast to ``int`` and forwarded as ``max_new_tokens``.

    Returns:
        The path of the written WAV file and a Markdown summary line.

    Raises:
        gr.Error: If ``text`` is blank or ``reference_audio`` is ``None``.
    """
    if not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if reference_audio is None:
        raise gr.Error("Please upload a reference audio file for voice cloning.")

    # NOTE(review): lang_code is only shown in the summary; it is not passed
    # to the processor or the model here — confirm that is intended.
    lang_code = LANGUAGES.get(language, "en")

    inputs = processor(
        text=text,
        voice_prompt=reference_audio,
        return_tensors="pt",
    )
    inputs = _to_device(inputs)

    t0 = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=cfg_scale,
            max_new_tokens=int(max_tokens),
        )
    elapsed = time.perf_counter() - t0

    speech = outputs.speech_outputs[0]
    audio_path = _save_to_tempfile(speech)
    audio_duration = speech.shape[-1] / OUTPUT_SAMPLE_RATE

    # A zero-sample output would otherwise crash the summary with
    # ZeroDivisionError after the audio file was already written.
    rtf = f"{elapsed / audio_duration:.2f}x" if audio_duration > 0 else "n/a"

    info = (
        f"🎭 **Voice Cloning Complete**\n\n"
        f"Language: {language} (`{lang_code}`) · "
        f"CFG: {cfg_scale} · "
        f"Duration: {audio_duration:.1f}s · "
        f"Inference: {elapsed:.2f}s · "
        f"RTF: {rtf}"
    )
    return audio_path, info
166
+
167
+
168
def verify_watermark(audio_file: str | None) -> str:
    """Check an uploaded audio file for the watermark and return a Markdown report.

    The file is loaded with ``torchaudio``, resampled to
    ``OUTPUT_SAMPLE_RATE`` when its own rate differs, and passed to
    ``watermarker.detect``.

    Raises:
        gr.Error: If ``audio_file`` is ``None``.
    """
    if audio_file is None:
        raise gr.Error("Please upload an audio file to verify.")

    samples, source_rate = torchaudio.load(audio_file)
    if source_rate != OUTPUT_SAMPLE_RATE:
        # The detector is always called with OUTPUT_SAMPLE_RATE below.
        samples = torchaudio.functional.resample(
            samples, source_rate, OUTPUT_SAMPLE_RATE
        )

    detection = watermarker.detect(samples, sample_rate=OUTPUT_SAMPLE_RATE)
    status = (
        "✅ **Watermark Detected**"
        if detection.detected
        else "❌ **No Watermark Detected**"
    )

    report_sections = [
        "🔍 **Watermark Verification**",
        status,
        f"Confidence: **{detection.confidence:.1%}**",
        "Technology: Facebook AudioSeal · Resolution: 1/16k second",
    ]
    return "\n\n".join(report_sections)
192
+
193
+
194
def fill_example_text(language: str) -> str:
    """Return the sample sentence for the selected language.

    Falls back to the English sample when the language name is unknown or
    has no entry in ``EXAMPLE_TEXTS``.
    """
    code = LANGUAGES.get(language, "en")
    english_sample = EXAMPLE_TEXTS["en"]
    try:
        return EXAMPLE_TEXTS[code]
    except KeyError:
        return english_sample
198
+
199
+
200
+ # ─── Custom CSS ──────────────────────────────────────────────────────────────
201
+
202
+ CSS = """
203
+ /* ── Base theme ── */
204
+ :root {
205
+ --ka-primary: #1a1a2e;
206
+ --ka-accent: #e94560;
207
+ --ka-accent-hover: #ff6b81;
208
+ --ka-surface: #16213e;
209
+ --ka-surface-light: #1c2a4a;
210
+ --ka-text: #eaeaea;
211
+ --ka-text-muted: #8892a4;
212
+ --ka-border: #2a3a5c;
213
+ --ka-gold: #f5c518;
214
+ --ka-green: #2ecc71;
215
+ }
216
+
217
+ /* ── Global ── */
218
+ .gradio-container {
219
+ max-width: 960px !important;
220
+ margin: 0 auto !important;
221
+ font-family: 'IBM Plex Sans', 'Segoe UI', system-ui, sans-serif !important;
222
+ }
223
+
224
+ /* ── Hero header ── */
225
+ .hero-header {
226
+ text-align: center;
227
+ padding: 2.5rem 1.5rem 1.5rem;
228
+ margin-bottom: 0.5rem;
229
+ background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%);
230
+ border-radius: 16px;
231
+ border: 1px solid var(--ka-border);
232
+ position: relative;
233
+ overflow: hidden;
234
+ }
235
+ .hero-header::before {
236
+ content: '';
237
+ position: absolute;
238
+ top: -50%;
239
+ left: -50%;
240
+ width: 200%;
241
+ height: 200%;
242
+ background: radial-gradient(circle at 30% 50%, rgba(233,69,96,0.06) 0%, transparent 50%),
243
+ radial-gradient(circle at 70% 80%, rgba(15,52,96,0.08) 0%, transparent 50%);
244
+ pointer-events: none;
245
+ }
246
+ .hero-header h1 {
247
+ font-size: 2.4rem !important;
248
+ font-weight: 700 !important;
249
+ margin: 0 0 0.3rem !important;
250
+ color: #ffffff !important;
251
+ letter-spacing: -0.02em;
252
+ }
253
+ .hero-header .hero-accent {
254
+ color: var(--ka-accent);
255
+ }
256
+ .hero-header p {
257
+ color: var(--ka-text-muted);
258
+ font-size: 1.05rem;
259
+ margin: 0;
260
+ line-height: 1.5;
261
+ }
262
+
263
+ /* ── Badges row ── */
264
+ .badges {
265
+ display: flex;
266
+ justify-content: center;
267
+ gap: 0.5rem;
268
+ margin-top: 1rem;
269
+ flex-wrap: wrap;
270
+ }
271
+ .badge {
272
+ display: inline-flex;
273
+ align-items: center;
274
+ gap: 0.35rem;
275
+ padding: 0.3rem 0.75rem;
276
+ border-radius: 999px;
277
+ font-size: 0.78rem;
278
+ font-weight: 600;
279
+ letter-spacing: 0.01em;
280
+ border: 1px solid var(--ka-border);
281
+ background: var(--ka-surface-light);
282
+ color: var(--ka-text);
283
+ }
284
+ .badge.gold { border-color: var(--ka-gold); color: var(--ka-gold); }
285
+ .badge.green { border-color: var(--ka-green); color: var(--ka-green); }
286
+ .badge.accent { border-color: var(--ka-accent); color: var(--ka-accent); }
287
+
288
+ /* ── Benchmark table ── */
289
+ .benchmark-table {
290
+ width: 100%;
291
+ border-collapse: separate;
292
+ border-spacing: 0;
293
+ margin: 0.75rem 0;
294
+ font-size: 0.88rem;
295
+ border-radius: 10px;
296
+ overflow: hidden;
297
+ border: 1px solid var(--ka-border);
298
+ }
299
+ .benchmark-table th {
300
+ background: var(--ka-surface);
301
+ color: var(--ka-text-muted);
302
+ font-weight: 600;
303
+ text-transform: uppercase;
304
+ font-size: 0.72rem;
305
+ letter-spacing: 0.06em;
306
+ padding: 0.65rem 0.8rem;
307
+ text-align: left;
308
+ }
309
+ .benchmark-table td {
310
+ padding: 0.55rem 0.8rem;
311
+ border-top: 1px solid var(--ka-border);
312
+ color: var(--ka-text);
313
+ }
314
+ .benchmark-table tr.highlight td {
315
+ background: rgba(233, 69, 96, 0.08);
316
+ font-weight: 600;
317
+ }
318
+ .benchmark-table tr:not(.highlight) td {
319
+ background: transparent;
320
+ }
321
+
322
+ /* ── Section divider ── */
323
+ .section-label {
324
+ font-size: 0.7rem;
325
+ text-transform: uppercase;
326
+ letter-spacing: 0.1em;
327
+ color: var(--ka-text-muted);
328
+ margin: 1rem 0 0.3rem;
329
+ padding-left: 2px;
330
+ font-weight: 600;
331
+ }
332
+
333
+ /* ── Tab styling ── */
334
+ .tab-nav button {
335
+ font-weight: 600 !important;
336
+ letter-spacing: 0.01em !important;
337
+ }
338
+ .tab-nav button.selected {
339
+ border-color: var(--ka-accent) !important;
340
+ color: var(--ka-accent) !important;
341
+ }
342
+
343
+ /* ── Footer ── */
344
+ .footer {
345
+ text-align: center;
346
+ padding: 1.2rem;
347
+ margin-top: 1rem;
348
+ font-size: 0.8rem;
349
+ color: var(--ka-text-muted);
350
+ border-top: 1px solid var(--ka-border);
351
+ line-height: 1.6;
352
+ }
353
+ .footer a {
354
+ color: var(--ka-accent);
355
+ text-decoration: none;
356
+ }
357
+ .footer a:hover {
358
+ text-decoration: underline;
359
+ }
360
+ """
361
+
362
+ # ─── Header HTML ─────────────────────────────────────────────────────────────
363
+
364
+ HEADER_HTML = """
365
+ <div class="hero-header">
366
+ <h1>🎙️ <span class="hero-accent">Kugel</span>Audio</h1>
367
+ <p>Open-source text-to-speech for European languages · AR + Diffusion architecture</p>
368
+ <div class="badges">
369
+ <span class="badge gold">🏆 #1 German TTS</span>
370
+ <span class="badge green">24 Languages</span>
371
+ <span class="badge accent">Voice Cloning</span>
372
+ <span class="badge">MIT License</span>
373
+ <span class="badge">7B Parameters</span>
374
+ </div>
375
+ </div>
376
+ """
377
+
378
+ BENCHMARK_HTML = """
379
+ <table class="benchmark-table">
380
+ <thead>
381
+ <tr>
382
+ <th>Rank</th>
383
+ <th>Model</th>
384
+ <th>Score</th>
385
+ <th>Win Rate</th>
386
+ </tr>
387
+ </thead>
388
+ <tbody>
389
+ <tr class="highlight">
390
+ <td>🥇</td>
391
+ <td>KugelAudio</td>
392
+ <td>26</td>
393
+ <td>78.0%</td>
394
+ </tr>
395
+ <tr>
396
+ <td>🥈</td>
397
+ <td>ElevenLabs Multi v2</td>
398
+ <td>25</td>
399
+ <td>62.2%</td>
400
+ </tr>
401
+ <tr>
402
+ <td>🥉</td>
403
+ <td>ElevenLabs v3</td>
404
+ <td>21</td>
405
+ <td>65.3%</td>
406
+ </tr>
407
+ <tr>
408
+ <td>4</td>
409
+ <td>Cartesia</td>
410
+ <td>21</td>
411
+ <td>59.1%</td>
412
+ </tr>
413
+ <tr>
414
+ <td>5</td>
415
+ <td>VibeVoice</td>
416
+ <td>10</td>
417
+ <td>28.8%</td>
418
+ </tr>
419
+ <tr>
420
+ <td>6</td>
421
+ <td>CosyVoice v3</td>
422
+ <td>9</td>
423
+ <td>14.2%</td>
424
+ </tr>
425
+ </tbody>
426
+ </table>
427
+ <p style="text-align:center;color:var(--ka-text-muted);font-size:0.76rem;margin-top:0.3rem;">
428
+ Based on 339 human A/B evaluations · OpenSkill Bayesian ranking
429
+ </p>
430
+ """
431
+
432
+ FOOTER_HTML = """
433
+ <div class="footer">
434
+ <strong>KugelAudio</strong> · Created by Kajo Kratzenstein & Carlos Menke<br>
435
+ <a href="https://github.com/Kugelaudio/kugelaudio-open">GitHub</a> ·
436
+ <a href="https://huggingface.co/kugelaudio/kugelaudio-0-open">HuggingFace</a> ·
437
+ <a href="https://kugelaudio.com">API</a> ·
438
+ <a href="https://docs.kugelaudio.com/sdks/python">Docs</a><br>
439
+ Funded by the German Federal Ministry of Research, Technology and Space (BMFTR)
440
+ via the AI Service Center Berlin-Brandenburg at HPI
441
+ </div>
442
+ """
443
+
444
+ # ─── Build Gradio Interface ─────────────────────────────────────────────────
445
+
446
+ with gr.Blocks(
447
+ css=CSS,
448
+ title="KugelAudio – European TTS",
449
+ theme=gr.themes.Base(
450
+ primary_hue=gr.themes.colors.red,
451
+ secondary_hue=gr.themes.colors.slate,
452
+ neutral_hue=gr.themes.colors.slate,
453
+ font=gr.themes.GoogleFont("IBM Plex Sans"),
454
+ font_mono=gr.themes.GoogleFont("IBM Plex Mono"),
455
+ ),
456
+ ) as demo:
457
+
458
+ # ── Header ──
459
+ gr.HTML(HEADER_HTML)
460
+
461
+ # ── Main Tabs ──
462
+ with gr.Tabs():
463
+
464
+ # ━━━ Tab 1: Text-to-Speech ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
465
+ with gr.TabItem("🔊 Text-to-Speech", id="tts"):
466
+ with gr.Row():
467
+ with gr.Column(scale=3):
468
+ tts_language = gr.Dropdown(
469
+ choices=list(LANGUAGES.keys()),
470
+ value="English 🇺🇸",
471
+ label="Language",
472
+ info="24 European languages supported",
473
+ )
474
+ tts_text = gr.Textbox(
475
+ label="Text to Synthesize",
476
+ placeholder="Enter text here or click 'Fill Example' below…",
477
+ lines=5,
478
+ max_lines=12,
479
+ )
480
+ with gr.Row():
481
+ tts_example_btn = gr.Button(
482
+ "📝 Fill Example", size="sm", variant="secondary"
483
+ )
484
+ tts_clear_btn = gr.ClearButton(
485
+ [tts_text], value="🗑️ Clear", size="sm"
486
+ )
487
+ with gr.Accordion("⚙️ Advanced Settings", open=False):
488
+ tts_cfg = gr.Slider(
489
+ minimum=1.0,
490
+ maximum=10.0,
491
+ value=3.0,
492
+ step=0.5,
493
+ label="CFG Scale",
494
+ info="Guidance scale — higher values follow the text more closely",
495
+ )
496
+ tts_max_tokens = gr.Slider(
497
+ minimum=512,
498
+ maximum=8192,
499
+ value=4096,
500
+ step=512,
501
+ label="Max Tokens",
502
+ info="Maximum generation length in tokens",
503
+ )
504
+ tts_generate_btn = gr.Button(
505
+ "🎙️ Generate Speech", variant="primary", size="lg"
506
+ )
507
+
508
+ with gr.Column(scale=2):
509
+ tts_audio_out = gr.Audio(
510
+ label="Generated Audio",
511
+ type="filepath",
512
+ interactive=False,
513
+ )
514
+ tts_info = gr.Markdown("*Press 'Generate Speech' to synthesize audio.*")
515
+
516
+ # Events
517
+ tts_example_btn.click(
518
+ fn=fill_example_text, inputs=[tts_language], outputs=[tts_text]
519
+ )
520
+ tts_language.change(
521
+ fn=fill_example_text, inputs=[tts_language], outputs=[tts_text]
522
+ )
523
+ tts_generate_btn.click(
524
+ fn=generate_speech,
525
+ inputs=[tts_text, tts_language, tts_cfg, tts_max_tokens],
526
+ outputs=[tts_audio_out, tts_info],
527
+ )
528
+
529
+ # ━━━ Tab 2: Voice Cloning ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
530
+ with gr.TabItem("🎭 Voice Cloning", id="clone"):
531
+ with gr.Row():
532
+ with gr.Column(scale=3):
533
+ clone_language = gr.Dropdown(
534
+ choices=list(LANGUAGES.keys()),
535
+ value="German 🇩🇪",
536
+ label="Language",
537
+ )
538
+ clone_text = gr.Textbox(
539
+ label="Text to Synthesize",
540
+ placeholder="Enter the text you want spoken in the cloned voice…",
541
+ lines=4,
542
+ max_lines=10,
543
+ )
544
+ clone_ref = gr.Audio(
545
+ label="Reference Voice",
546
+ type="filepath",
547
+ sources=["upload", "microphone"],
548
+ )
549
+ gr.Markdown(
550
+ "<p class='section-label'>Upload or record a few seconds of the "
551
+ "target voice. The model will replicate its characteristics.</p>"
552
+ )
553
+ with gr.Accordion("⚙️ Advanced Settings", open=False):
554
+ clone_cfg = gr.Slider(
555
+ minimum=1.0,
556
+ maximum=10.0,
557
+ value=3.0,
558
+ step=0.5,
559
+ label="CFG Scale",
560
+ )
561
+ clone_max_tokens = gr.Slider(
562
+ minimum=512,
563
+ maximum=8192,
564
+ value=4096,
565
+ step=512,
566
+ label="Max Tokens",
567
+ )
568
+ clone_btn = gr.Button(
569
+ "🎭 Clone & Generate", variant="primary", size="lg"
570
+ )
571
+
572
+ with gr.Column(scale=2):
573
+ clone_audio_out = gr.Audio(
574
+ label="Cloned Voice Output",
575
+ type="filepath",
576
+ interactive=False,
577
+ )
578
+ clone_info = gr.Markdown(
579
+ "*Upload a reference voice and press 'Clone & Generate'.*"
580
+ )
581
+
582
+ # Events
583
+ clone_btn.click(
584
+ fn=clone_voice,
585
+ inputs=[
586
+ clone_text,
587
+ clone_ref,
588
+ clone_language,
589
+ clone_cfg,
590
+ clone_max_tokens,
591
+ ],
592
+ outputs=[clone_audio_out, clone_info],
593
+ )
594
+
595
+ # ━━━ Tab 3: Watermark Verification ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
596
+ with gr.TabItem("🔒 Watermark Verify", id="watermark"):
597
+ with gr.Row():
598
+ with gr.Column(scale=3):
599
+ wm_audio = gr.Audio(
600
+ label="Audio to Verify",
601
+ type="filepath",
602
+ sources=["upload"],
603
+ )
604
+ gr.Markdown(
605
+ "<p class='section-label'>All KugelAudio outputs are watermarked "
606
+ "with Facebook AudioSeal. Upload any audio file to check.</p>"
607
+ )
608
+ wm_btn = gr.Button(
609
+ "🔍 Verify Watermark", variant="primary", size="lg"
610
+ )
611
+ with gr.Column(scale=2):
612
+ wm_result = gr.Markdown("*Upload audio and press 'Verify Watermark'.*")
613
+
614
+ wm_btn.click(
615
+ fn=verify_watermark,
616
+ inputs=[wm_audio],
617
+ outputs=[wm_result],
618
+ )
619
+
620
+ # ━━━ Tab 4: Benchmarks ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
621
+ with gr.TabItem("🏆 Benchmarks", id="bench"):
622
+ gr.Markdown("### Human Preference Ranking — German TTS")
623
+ gr.HTML(BENCHMARK_HTML)
624
+ gr.Markdown(
625
+ "Evaluations covered **neutral speech, shouting, singing, and "
626
+ "drunken voice** styles across diverse German-language samples. "
627
+ "Participants heard a reference voice and compared outputs from "
628
+ "two anonymous models in a blind A/B test."
629
+ )
630
+
631
+ # ━━━ Tab 5: About ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
632
+ with gr.TabItem("ℹ️ About", id="about"):
633
+ gr.Markdown("""
634
+ ### Architecture
635
+
636
+ KugelAudio uses a hybrid **AR + Diffusion** pipeline:
637
+
638
+ 1. **Text Encoder** — Qwen2-based language model encodes input text
639
+ 2. **TTS Backbone** — Upper transformer layers generate speech representations
640
+ 3. **Diffusion Head** — Predicts speech latents via denoising diffusion
641
+ 4. **Acoustic Decoder** — Converts latents to waveforms
642
+
643
+ ### Training
644
+
645
+ | Detail | Value |
646
+ |--------|-------|
647
+ | Base model | Microsoft VibeVoice |
648
+ | Training data | ~200,000 hours (YODAS2) |
649
+ | Hardware | 8× NVIDIA H100 |
650
+ | Duration | 5 days |
651
+ | Parameters | 7B |
652
+
653
+ ### Responsible Use
654
+
655
+ KugelAudio is intended for accessibility, content creation, voice assistants,
656
+ language learning, and creative projects **with consent**. All generated audio
657
+ is watermarked with Facebook AudioSeal. Creating deepfakes, impersonation
658
+ without consent, fraud, or any illegal use is prohibited.
659
+
660
+ ### License
661
+
662
+ Released under the **MIT License**.
663
+
664
+ ### Citation
665
+
666
+ ```bibtex
667
+ @software{kugelaudio2026,
668
+ title = {KugelAudio: Open-Source TTS for European Languages with Voice Cloning},
669
+ author = {Kratzenstein, Kajo and Menke, Carlos},
670
+ year = {2026},
671
+ url = {https://github.com/kugelaudio/kugelaudio}
672
+ }
673
+ ```
674
+ """)
675
+
676
+ # ── Footer ──
677
+ gr.HTML(FOOTER_HTML)
678
+
679
+
680
+ # ─── Launch ──────────────────────────────────────────────────────────────────
681
+
682
if __name__ == "__main__":
    # Server settings, gathered in one place: listen on all interfaces at
    # port 7860 with show_api enabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_api": True,
    }
    demo.queue()
    demo.launch(**launch_options)