# kugelaudio / app.py — Hugging Face Space demo entry point
# (Space file-viewer metadata removed: "multimodalart HF Staff · Update app.py · 6f41d1c verified")
"""
KugelAudio Gradio Demo
Open-source text-to-speech for European languages with voice cloning capabilities.
"""
import logging
import tempfile
import time
import gradio as gr
import torch
import torchaudio
import spaces
from kugelaudio_open import (
KugelAudioForConditionalGenerationInference,
KugelAudioProcessor,
)
from kugelaudio_open.watermark import AudioWatermark
logger = logging.getLogger(__name__)
# ─── Device & Model Setup ───────────────────────────────────────────────────
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32
MODEL_ID = "kugelaudio/kugelaudio-0-open"
# Sample rate (Hz) of the acoustic decoder's output waveforms.
OUTPUT_SAMPLE_RATE = 24000
logger.info("Loading KugelAudio model '%s' on %s (%s)…", MODEL_ID, DEVICE, DTYPE)
# Use the DEVICE/DTYPE computed above instead of hard-coding bfloat16/"cuda";
# the original hard-coded values crashed on CPU-only hosts despite the
# fallback constants existing.
model = KugelAudioForConditionalGenerationInference.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
).to(DEVICE)
processor = KugelAudioProcessor.from_pretrained(MODEL_ID)
watermarker = AudioWatermark()
logger.info("Model loaded successfully.")
# ─── Language Configuration ──────────────────────────────────────────────────
# Maps the dropdown display label (language name + flag emoji) to its
# ISO 639-1 code; the code is what EXAMPLE_TEXTS is keyed by.
LANGUAGES = {
    "English 🇺🇸": "en", "German 🇩🇪": "de", "French 🇫🇷": "fr",
    "Spanish 🇪🇸": "es", "Italian 🇮🇹": "it", "Portuguese 🇧🇷🇵🇹": "pt",
    "Dutch 🇳🇱": "nl", "Polish 🇵🇱": "pl", "Russian 🇷🇺": "ru",
    "Ukrainian 🇺🇦": "uk", "Czech 🇨🇿": "cs", "Romanian 🇷🇴": "ro",
    "Hungarian 🇭🇺": "hu", "Swedish 🇸🇪": "sv", "Danish 🇩🇰": "da",
    "Finnish 🇫🇮": "fi", "Norwegian 🇳🇴": "no", "Greek 🇬🇷": "el",
    "Bulgarian 🇧🇬": "bg", "Slovak 🇸🇰": "sk", "Croatian 🇭🇷": "hr",
    "Serbian 🇷🇸": "sr", "Turkish 🇹🇷": "tr",
}
# Example sentences keyed by ISO 639-1 code. Only a subset of the 24
# languages has an entry; the rest fall back to the English sample
# (see fill_example_text).
EXAMPLE_TEXTS = {
    "en": "Welcome to KugelAudio, the open-source text-to-speech system for European languages. Our model supports voice cloning and emotional speech synthesis.",
    "de": "Willkommen bei KugelAudio, dem Open-Source Text-to-Speech System für europäische Sprachen. Unser Modell unterstützt Voice Cloning und emotionale Sprachsynthese.",
    "fr": "Bienvenue sur KugelAudio, le système de synthèse vocale open-source pour les langues européennes. Notre modèle prend en charge le clonage vocal et la synthèse vocale émotionnelle.",
    "es": "Bienvenido a KugelAudio, el sistema de texto a voz de código abierto para idiomas europeos. Nuestro modelo soporta clonación de voz y síntesis de habla emocional.",
    "it": "Benvenuto in KugelAudio, il sistema di sintesi vocale open-source per le lingue europee. Il nostro modello supporta la clonazione vocale e la sintesi vocale emotiva.",
    "pt": "Bem-vindo ao KugelAudio, o sistema de texto para fala de código aberto para idiomas europeus. Nosso modelo suporta clonagem de voz e síntese de fala emocional.",
    "nl": "Welkom bij KugelAudio, het open-source tekst-naar-spraak systeem voor Europese talen. Ons model ondersteunt stemklonering en emotionele spraaksynthese.",
    "pl": "Witamy w KugelAudio, systemie syntezy mowy o otwartym kodzie źródłowym dla języków europejskich. Nasz model obsługuje klonowanie głosu i emocjonalną syntezę mowy.",
    "ru": "Добро пожаловать в KugelAudio — систему синтеза речи с открытым исходным кодом для европейских языков. Наша модель поддерживает клонирование голоса и эмоциональный синтез речи.",
}
# ─── Inference Helpers ───────────────────────────────────────────────────────
def _to_device(inputs: dict) -> dict:
"""Move tensor values to the model device."""
return {
k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
for k, v in inputs.items()
}
def _save_to_tempfile(audio_tensor: torch.Tensor) -> str:
    """Write an audio tensor to a temporary WAV file and return its path.

    The file is created with delete=False so Gradio can serve it; the
    caller (or OS temp cleanup) owns its lifetime.
    """
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    # Close our handle before the processor writes to the path: the original
    # never closed it, leaking a file descriptor per call and preventing
    # re-opening the file on Windows.
    tmp.close()
    processor.save_audio(audio_tensor, tmp.name)
    return tmp.name
@spaces.GPU
def generate_speech(
    text: str,
    language: str,
    cfg_scale: float,
    max_tokens: int,
) -> tuple[str, str]:
    """Generate speech from text and return (audio_path, info_markdown).

    Args:
        text: Text to synthesize; must be non-empty after stripping.
        language: Display label from the LANGUAGES dropdown (falls back to "en").
        cfg_scale: Classifier-free-guidance scale forwarded to the model.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        Tuple of (path to generated WAV file, Markdown summary string).

    Raises:
        gr.Error: If *text* is empty or whitespace-only.
    """
    if not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    lang_code = LANGUAGES.get(language, "en")
    inputs = processor(text=text, return_tensors="pt")
    inputs = _to_device(inputs)
    t0 = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=cfg_scale,
            max_new_tokens=int(max_tokens),
        )
    elapsed = time.perf_counter() - t0
    audio_path = _save_to_tempfile(outputs.speech_outputs[0])
    audio_duration = outputs.speech_outputs[0].shape[-1] / OUTPUT_SAMPLE_RATE
    # Guard the real-time-factor computation: the original divided by
    # audio_duration unconditionally, which raises ZeroDivisionError if the
    # model produces zero samples.
    rtf = elapsed / audio_duration if audio_duration > 0 else float("inf")
    info = (
        f"🔊 **Generation Complete**\n\n"
        f"Language: {language} (`{lang_code}`) · "
        f"CFG: {cfg_scale} · "
        f"Duration: {audio_duration:.1f}s · "
        f"Inference: {elapsed:.2f}s · "
        f"RTF: {rtf:.2f}x"
    )
    return audio_path, info
@spaces.GPU
def clone_voice(
    text: str,
    reference_audio: str | None,
    language: str,
    cfg_scale: float,
    max_tokens: int,
) -> tuple[str, str]:
    """Clone a voice from reference audio and synthesize new text.

    Args:
        text: Text to speak in the cloned voice; must be non-empty.
        reference_audio: Filepath of the uploaded/recorded reference clip.
        language: Display label from the LANGUAGES dropdown (falls back to "en").
        cfg_scale: Classifier-free-guidance scale forwarded to the model.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        Tuple of (path to generated WAV file, Markdown summary string).

    Raises:
        gr.Error: If *text* is empty or no reference audio was provided.
    """
    if not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if reference_audio is None:
        raise gr.Error("Please upload a reference audio file for voice cloning.")
    lang_code = LANGUAGES.get(language, "en")
    inputs = processor(
        text=text,
        voice_prompt=reference_audio,
        return_tensors="pt",
    )
    inputs = _to_device(inputs)
    t0 = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=cfg_scale,
            max_new_tokens=int(max_tokens),
        )
    elapsed = time.perf_counter() - t0
    audio_path = _save_to_tempfile(outputs.speech_outputs[0])
    audio_duration = outputs.speech_outputs[0].shape[-1] / OUTPUT_SAMPLE_RATE
    # Guard the real-time-factor computation: the original divided by
    # audio_duration unconditionally, which raises ZeroDivisionError if the
    # model produces zero samples.
    rtf = elapsed / audio_duration if audio_duration > 0 else float("inf")
    info = (
        f"🎭 **Voice Cloning Complete**\n\n"
        f"Language: {language} (`{lang_code}`) · "
        f"CFG: {cfg_scale} · "
        f"Duration: {audio_duration:.1f}s · "
        f"Inference: {elapsed:.2f}s · "
        f"RTF: {rtf:.2f}x"
    )
    return audio_path, info
def verify_watermark(audio_file: str | None) -> str:
    """Detect the AudioSeal watermark in an uploaded audio file.

    Raises gr.Error when no file was provided; otherwise returns a
    Markdown report with detection status and confidence.
    """
    if audio_file is None:
        raise gr.Error("Please upload an audio file to verify.")
    waveform, sr = torchaudio.load(audio_file)
    if sr != OUTPUT_SAMPLE_RATE:
        # The detector operates at the model's native 24 kHz rate.
        waveform = torchaudio.functional.resample(waveform, sr, OUTPUT_SAMPLE_RATE)
    result = watermarker.detect(waveform, sample_rate=OUTPUT_SAMPLE_RATE)
    status = (
        "✅ **Watermark Detected**"
        if result.detected
        else "❌ **No Watermark Detected**"
    )
    return (
        f"🔍 **Watermark Verification**\n\n"
        f"{status}\n\n"
        f"Confidence: **{result.confidence:.1%}**\n\n"
        f"Technology: Facebook AudioSeal · Resolution: 1/16k second"
    )
def fill_example_text(language: str) -> str:
    """Return an example sentence for the selected language.

    Languages without a sample in EXAMPLE_TEXTS fall back to English.
    """
    code = LANGUAGES.get(language, "en")
    sample = EXAMPLE_TEXTS.get(code)
    return sample if sample is not None else EXAMPLE_TEXTS["en"]
# ─── Custom CSS ──────────────────────────────────────────────────────────────
CSS = """
/* ── Base theme ── */
:root {
--ka-primary: #1a1a2e;
--ka-accent: #e94560;
--ka-accent-hover: #ff6b81;
--ka-surface: #16213e;
--ka-surface-light: #1c2a4a;
--ka-text: #eaeaea;
--ka-text-muted: #8892a4;
--ka-border: #2a3a5c;
--ka-gold: #f5c518;
--ka-green: #2ecc71;
}
/* ── Global ── */
.gradio-container {
max-width: 960px !important;
margin: 0 auto !important;
font-family: 'IBM Plex Sans', 'Segoe UI', system-ui, sans-serif !important;
}
/* ── Hero header ── */
.hero-header {
text-align: center;
padding: .5rem 1.5rem 1.5rem;
margin-bottom: 0.5rem;
border-radius: 16px;
position: relative;
overflow: hidden;
}
.hero-header::before {
content: '';
position: absolute;
top: -50%;
left: -50%;
width: 200%;
height: 200%;
pointer-events: none;
}
.hero-header h1 {
font-size: 2.4rem !important;
font-weight: 700 !important;
margin: 0 0 0.3rem !important;
color: var(--body-text-color);
letter-spacing: -0.02em;
}
.hero-header .hero-accent {
color: var(--ka-accent);
}
.hero-header p {
color: var(--ka-text-muted);
font-size: 1.05rem;
margin: 0;
line-height: 1.5;
}
/* ── Badges row ── */
.badges {
display: flex;
justify-content: center;
gap: 0.5rem;
margin-top: 1rem;
flex-wrap: wrap;
}
.badge {
display: inline-flex;
align-items: center;
gap: 0.35rem;
padding: 0.3rem 0.75rem;
border-radius: 999px;
font-size: 0.78rem;
font-weight: 600;
letter-spacing: 0.01em;
background: var(--block-background-fill, var(--ka-surface-light));
color: var(--body-text-color, var(--ka-text));
border: 1px solid var(--border-color-primary, var(--ka-border));
}
.badge.gold { border-color: var(--ka-gold); color: var(--ka-gold); }
.badge.green { border-color: var(--ka-green); color: var(--ka-green); }
.badge.accent { border-color: var(--ka-accent); color: var(--ka-accent); }
/* ── Benchmark table ── */
.benchmark-table {
width: 100%;
border-collapse: separate;
border-spacing: 0;
margin: 0.75rem 0;
font-size: 0.88rem;
border-radius: 10px;
overflow: hidden;
border: 1px solid var(--ka-border);
}
.benchmark-table th {
background: var(--ka-surface);
color: var(--ka-text-muted);
font-weight: 600;
text-transform: uppercase;
font-size: 0.72rem;
letter-spacing: 0.06em;
padding: 0.65rem 0.8rem;
text-align: left;
}
.benchmark-table td {
padding: 0.55rem 0.8rem;
border-top: 1px solid var(--ka-border);
color: var(--ka-text);
}
.benchmark-table tr.highlight td {
background: rgba(233, 69, 96, 0.08);
font-weight: 600;
}
.benchmark-table tr:not(.highlight) td {
background: transparent;
}
/* ── Section divider ── */
.section-label {
font-size: 0.7rem;
text-transform: uppercase;
letter-spacing: 0.1em;
color: var(--ka-text-muted);
margin: 1rem 0 0.3rem;
padding-left: 2px;
font-weight: 600;
}
/* ── Tab styling ── */
.tab-nav button {
font-weight: 600 !important;
letter-spacing: 0.01em !important;
}
.tab-nav button.selected {
border-color: var(--ka-accent) !important;
color: var(--ka-accent) !important;
}
/* ── Footer ── */
.footer {
text-align: center;
padding: 1.2rem;
margin-top: 1rem;
font-size: 0.8rem;
color: var(--ka-text-muted);
border-top: 1px solid var(--ka-border);
line-height: 1.6;
}
.footer a {
color: var(--ka-accent);
text-decoration: none;
}
.footer a:hover {
text-decoration: underline;
}
"""
# ─── Header HTML ─────────────────────────────────────────────────────────────
# Static hero banner rendered above the tabs. Fixed user-visible text defects:
# "Kugel-Audio"/"Vibe-Voice" now match the spellings used in the About tab
# ("KugelAudio", "VibeVoice"), "in 200K hours" -> "on 200K hours", and
# "european" is capitalized.
HEADER_HTML = """
<div class="hero-header">
  <h1>🎙️ <span class="hero-accent">Kugel</span>Audio</h1>
  <p>KugelAudio is a fine-tune of VibeVoice 7B, trained on 200K hours of 24 European languages from the YODAS2 dataset</p>
  <div class="badges">
    <span class="badge gold">🏆 #1 German TTS</span>
    <span class="badge green">24 Languages</span>
    <span class="badge accent">Voice Cloning</span>
    <span class="badge">MIT License</span>
    <span class="badge">7B Parameters</span>
  </div>
</div>
"""
# Human-preference leaderboard table shown in the Benchmarks tab.
BENCHMARK_HTML = """
<table class="benchmark-table">
  <thead>
    <tr>
      <th>Rank</th>
      <th>Model</th>
      <th>Score</th>
      <th>Win Rate</th>
    </tr>
  </thead>
  <tbody>
    <tr class="highlight">
      <td>🥇</td>
      <td>KugelAudio</td>
      <td>26</td>
      <td>78.0%</td>
    </tr>
    <tr>
      <td>🥈</td>
      <td>ElevenLabs Multi v2</td>
      <td>25</td>
      <td>62.2%</td>
    </tr>
    <tr>
      <td>🥉</td>
      <td>ElevenLabs v3</td>
      <td>21</td>
      <td>65.3%</td>
    </tr>
    <tr>
      <td>4</td>
      <td>Cartesia</td>
      <td>21</td>
      <td>59.1%</td>
    </tr>
    <tr>
      <td>5</td>
      <td>VibeVoice</td>
      <td>10</td>
      <td>28.8%</td>
    </tr>
    <tr>
      <td>6</td>
      <td>CosyVoice v3</td>
      <td>9</td>
      <td>14.2%</td>
    </tr>
  </tbody>
</table>
<p style="text-align:center;color:var(--ka-text-muted);font-size:0.76rem;margin-top:0.3rem;">
  Based on 339 human A/B evaluations · OpenSkill Bayesian ranking
</p>
"""
# Credits and links rendered at the bottom of the page.
FOOTER_HTML = """
<div class="footer">
  <strong>KugelAudio</strong> · Created by Kajo Kratzenstein & Carlos Menke<br>
  <a href="https://github.com/Kugelaudio/kugelaudio-open">GitHub</a> ·
  <a href="https://huggingface.co/kugelaudio/kugelaudio-0-open">HuggingFace</a> ·
  <a href="https://kugelaudio.com">API</a> ·
  <a href="https://docs.kugelaudio.com/sdks/python">Docs</a><br>
  Funded by the German Federal Ministry of Research, Technology and Space (BMFTR)
  via the AI Service Center Berlin-Brandenburg at HPI
</div>
"""
# ─── Build Gradio Interface ─────────────────────────────────────────────────
# Five-tab layout: TTS, voice cloning, watermark verification, benchmarks,
# and an About page. Event handlers are wired inside each tab's scope.
with gr.Blocks(
    css=CSS,
    title="KugelAudio – European TTS",
    theme=gr.themes.Base(
        primary_hue=gr.themes.colors.red,
        secondary_hue=gr.themes.colors.slate,
        neutral_hue=gr.themes.colors.slate,
        font=gr.themes.GoogleFont("IBM Plex Sans"),
        font_mono=gr.themes.GoogleFont("IBM Plex Mono"),
    ),
) as demo:
    # ── Header ──
    gr.HTML(HEADER_HTML)
    # ── Main Tabs ──
    with gr.Tabs():
        # ━━━ Tab 1: Text-to-Speech ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        with gr.TabItem("🔊 Text-to-Speech", id="tts"):
            with gr.Row():
                # Left column: inputs and generation controls.
                with gr.Column(scale=3):
                    tts_language = gr.Dropdown(
                        choices=list(LANGUAGES.keys()),
                        value="English 🇺🇸",
                        label="Language",
                        info="24 European languages supported",
                    )
                    tts_text = gr.Textbox(
                        label="Text to Synthesize",
                        placeholder="Enter text here or click 'Fill Example' below…",
                        lines=5,
                        max_lines=12,
                    )
                    with gr.Row():
                        tts_example_btn = gr.Button(
                            "📝 Fill Example", size="sm", variant="secondary"
                        )
                        tts_clear_btn = gr.ClearButton(
                            [tts_text], value="🗑️ Clear", size="sm"
                        )
                    with gr.Accordion("⚙️ Advanced Settings", open=False):
                        tts_cfg = gr.Slider(
                            minimum=1.0,
                            maximum=10.0,
                            value=3.0,
                            step=0.5,
                            label="CFG Scale",
                            info="Guidance scale — higher values follow the text more closely",
                        )
                        tts_max_tokens = gr.Slider(
                            minimum=512,
                            maximum=8192,
                            value=4096,
                            step=512,
                            label="Max Tokens",
                            info="Maximum generation length in tokens",
                        )
                    tts_generate_btn = gr.Button(
                        "🎙️ Generate Speech", variant="primary", size="lg"
                    )
                # Right column: output audio and run statistics.
                with gr.Column(scale=2):
                    tts_audio_out = gr.Audio(
                        label="Generated Audio",
                        type="filepath",
                        interactive=False,
                    )
                    tts_info = gr.Markdown("*Press 'Generate Speech' to synthesize audio.*")
            # Events
            tts_example_btn.click(
                fn=fill_example_text, inputs=[tts_language], outputs=[tts_text]
            )
            # Changing the language also refreshes the example text.
            tts_language.change(
                fn=fill_example_text, inputs=[tts_language], outputs=[tts_text]
            )
            tts_generate_btn.click(
                fn=generate_speech,
                inputs=[tts_text, tts_language, tts_cfg, tts_max_tokens],
                outputs=[tts_audio_out, tts_info],
            )
        # ━━━ Tab 2: Voice Cloning ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        with gr.TabItem("🎭 Voice Cloning", id="clone"):
            with gr.Row():
                with gr.Column(scale=3):
                    clone_language = gr.Dropdown(
                        choices=list(LANGUAGES.keys()),
                        value="English 🇺🇸",
                        label="Language",
                    )
                    clone_text = gr.Textbox(
                        label="Text to Synthesize",
                        placeholder="Enter the text you want spoken in the cloned voice…",
                        lines=4,
                        max_lines=10,
                    )
                    # Reference clip can be uploaded or recorded in-browser.
                    clone_ref = gr.Audio(
                        label="Reference Voice",
                        type="filepath",
                        sources=["upload", "microphone"],
                    )
                    gr.Markdown(
                        "<p class='section-label'>Upload or record a few seconds of the "
                        "target voice. The model will replicate its characteristics.</p>"
                    )
                    with gr.Accordion("⚙️ Advanced Settings", open=False):
                        clone_cfg = gr.Slider(
                            minimum=1.0,
                            maximum=10.0,
                            value=3.0,
                            step=0.5,
                            label="CFG Scale",
                        )
                        clone_max_tokens = gr.Slider(
                            minimum=512,
                            maximum=8192,
                            value=4096,
                            step=512,
                            label="Max Tokens",
                        )
                    clone_btn = gr.Button(
                        "🎭 Clone & Generate", variant="primary", size="lg"
                    )
                with gr.Column(scale=2):
                    clone_audio_out = gr.Audio(
                        label="Cloned Voice Output",
                        type="filepath",
                        interactive=False,
                    )
                    clone_info = gr.Markdown(
                        "*Upload a reference voice and press 'Clone & Generate'.*"
                    )
            # Events
            clone_btn.click(
                fn=clone_voice,
                inputs=[
                    clone_text,
                    clone_ref,
                    clone_language,
                    clone_cfg,
                    clone_max_tokens,
                ],
                outputs=[clone_audio_out, clone_info],
            )
        # ━━━ Tab 3: Watermark Verification ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        with gr.TabItem("🔒 Watermark Verify", id="watermark"):
            with gr.Row():
                with gr.Column(scale=3):
                    wm_audio = gr.Audio(
                        label="Audio to Verify",
                        type="filepath",
                        sources=["upload"],
                    )
                    gr.Markdown(
                        "<p class='section-label'>All KugelAudio outputs are watermarked "
                        "with Facebook AudioSeal. Upload any audio file to check.</p>"
                    )
                    wm_btn = gr.Button(
                        "🔍 Verify Watermark", variant="primary", size="lg"
                    )
                with gr.Column(scale=2):
                    wm_result = gr.Markdown("*Upload audio and press 'Verify Watermark'.*")
            wm_btn.click(
                fn=verify_watermark,
                inputs=[wm_audio],
                outputs=[wm_result],
            )
        # ━━━ Tab 4: Benchmarks ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        with gr.TabItem("🏆 Benchmarks", id="bench"):
            gr.Markdown("### Human Preference Ranking — German TTS")
            gr.HTML(BENCHMARK_HTML)
            gr.Markdown(
                "Evaluations covered **neutral speech, shouting, singing, and "
                "drunken voice** styles across diverse German-language samples. "
                "Participants heard a reference voice and compared outputs from "
                "two anonymous models in a blind A/B test."
            )
        # ━━━ Tab 5: About ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
        with gr.TabItem("ℹ️ About", id="about"):
            gr.Markdown("""
### Architecture
KugelAudio uses a hybrid **AR + Diffusion** pipeline:
1. **Text Encoder** — Qwen2-based language model encodes input text
2. **TTS Backbone** — Upper transformer layers generate speech representations
3. **Diffusion Head** — Predicts speech latents via denoising diffusion
4. **Acoustic Decoder** — Converts latents to waveforms
### Training
| Detail | Value |
|--------|-------|
| Base model | Microsoft VibeVoice |
| Training data | ~200,000 hours (YODAS2) |
| Hardware | 8× NVIDIA H100 |
| Duration | 5 days |
| Parameters | 7B |
### Responsible Use
KugelAudio is intended for accessibility, content creation, voice assistants,
language learning, and creative projects **with consent**. All generated audio
is watermarked with Facebook AudioSeal. Creating deepfakes, impersonation
without consent, fraud, or any illegal use is prohibited.
### License
Released under the **MIT License**.
### Citation
```bibtex
@software{kugelaudio2026,
  title = {KugelAudio: Open-Source TTS for European Languages with Voice Cloning},
  author = {Kratzenstein, Kajo and Menke, Carlos},
  year = {2026},
  url = {https://github.com/kugelaudio/kugelaudio}
}
```
""")
    # ── Footer ──
    gr.HTML(FOOTER_HTML)
# ─── Launch ──────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Enable request queuing so concurrent users are served in order.
    demo.queue()
    # Bind on all interfaces (required inside the Spaces container) on the
    # standard Gradio port.
    demo.launch(server_name="0.0.0.0", server_port=7860)