# KittenTTS-Demo / app.py
# Your Name
# a
# a60137f
import gradio as gr
import numpy as np
import os
from kittentts import KittenTTS
# Sample rate (Hz) returned next to every generated waveform in synthesize().
# NOTE(review): presumably the models' native output rate -- confirm.
SAMPLE_RATE: int = 24000

# Dropdown label -> model identifier handed to KittenTTS(...) at startup.
MODELS: dict[str, str] = {
    "Nano (15M - Fastest)": "KittenML/kitten-tts-nano-0.8-fp32",
    "Micro (40M - Balanced)": "KittenML/kitten-tts-micro-0.8",
    "Mini (80M - Best Quality)": "KittenML/kitten-tts-mini-0.8",
}

# Voice names offered in the "Voice" dropdown and forwarded as `voice=`.
VOICES: list[str] = [
    "Bella", "Jasper", "Luna", "Bruno",
    "Rosie", "Hugo", "Kiki", "Leo",
]
def _load_model(label: str, model_id: str) -> KittenTTS:
    """Announce and construct the model for one ``MODELS`` entry."""
    print(f"Loading {label}...")
    return KittenTTS(model_id)


# Every entry of MODELS is instantiated here, at import time, and kept in
# _model_cache under its display label.
print("Loading models...")
_model_cache: dict[str, KittenTTS] = {
    label: _load_model(label, model_id) for label, model_id in MODELS.items()
}
print("All models loaded!")
def get_model(model_name: str) -> KittenTTS:
    """Look up the preloaded ``KittenTTS`` instance for *model_name*.

    ``model_name`` is used as a key into ``_model_cache``; a name that is
    not present there raises ``KeyError``.
    """
    loaded = _model_cache[model_name]
    return loaded
def _generate_accepts_speed(generate_fn) -> bool:
    """Report whether *generate_fn* can be called with a ``speed=`` keyword."""
    import inspect  # local import keeps this block self-contained

    try:
        params = inspect.signature(generate_fn).parameters
    except (TypeError, ValueError):
        # Signature cannot be introspected: take the conservative path and
        # call without the optional keyword.
        return False
    return "speed" in params or any(
        p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()
    )


def synthesize(text: str, model_name: str, voice: str, speed: float):
    """Generate speech for *text* and return it in Gradio's numpy audio form.

    Args:
        text: Text to speak; surrounding whitespace is stripped.
        model_name: Display label of the model, resolved via ``get_model``.
        voice: Voice name forwarded to ``generate`` as ``voice=``.
        speed: Speed value forwarded as ``speed=`` when the installed
            ``generate`` accepts that keyword; otherwise it is not passed.

    Returns:
        ``(SAMPLE_RATE, audio)`` where ``audio`` is a 1-D numpy array.

    Raises:
        gr.Error: If *text* is empty/whitespace or *model_name* is unknown.
    """
    cleaned = text.strip() if text else ""
    if not cleaned:
        raise gr.Error("Please enter some text.")

    try:
        tts = get_model(model_name)
    except KeyError:
        # Show a readable message instead of a bare KeyError.
        raise gr.Error(f"Unknown model: {model_name!r}") from None

    # Decide up front whether `speed` is supported. Catching TypeError around
    # the call would also swallow TypeErrors raised *inside* synthesis and
    # silently re-run the whole generation without the requested speed.
    if _generate_accepts_speed(tts.generate):
        audio = tts.generate(cleaned, voice=voice, speed=speed)
    else:
        audio = tts.generate(cleaned, voice=voice)

    # Output may be (1, samples) or (samples,): drop singleton axes, but keep
    # at least one dimension so a single-sample result is not turned into 0-D.
    audio = np.atleast_1d(np.squeeze(audio))
    return (SAMPLE_RATE, audio)
# Neutral base theme; the overrides below give the light and dark variants
# the same white / #e5e5e5 / #111111 values.
theme = gr.themes.Base(
    font=gr.themes.GoogleFont("Inter"),
    primary_hue="neutral",
    secondary_hue="neutral",
    neutral_hue="neutral",
)
theme = theme.set(
    # Page and block surfaces
    body_background_fill="white",
    body_background_fill_dark="white",
    block_background_fill="white",
    block_background_fill_dark="white",
    block_border_color="#e5e5e5",
    block_border_color_dark="#e5e5e5",
    block_shadow="none",
    block_shadow_dark="none",
    # Primary button
    button_primary_background_fill="#111111",
    button_primary_background_fill_hover="#333333",
    button_primary_border_color="#111111",
    button_primary_text_color="white",
    # Inputs and slider
    input_background_fill="white",
    input_background_fill_dark="white",
    input_border_color="#e5e5e5",
    slider_color="#111111",
    # Tables (used by the examples list)
    table_border_color="#e5e5e5",
    table_even_background_fill="white",
    table_odd_background_fill="white",
    table_row_focus="white",
)
# Custom stylesheet handed to demo.launch(css=...). It forces a light colour
# scheme, near-black text, white surfaces and #e5e5e5 borders across the
# page, toasts, dropdowns, the examples table and the slider.
css = """
/* Force light mode — prevents OS dark mode from affecting the page */
:root, html, body { color-scheme: light !important; }
body, .gradio-container, .main { background: white !important; }
.gradio-container { max-width: 860px !important; margin: 40px auto !important; }
footer { display: none !important; }
/* Force all text to black — no accent colors */
*, *::before, *::after {
color: #111 !important;
--body-text-color: #111 !important;
--block-label-text-color: #111 !important;
--block-title-text-color: #111 !important;
--color-accent: #111 !important;
--link-text-color: #111 !important;
--link-text-color-hover: #111 !important;
--link-text-color-visited: #111 !important;
--link-text-color-active: #111 !important;
}
/* Exceptions — keep button text white */
button.primary, button[variant="primary"] { color: white !important; }
/* Error toast notification */
.toast-wrap, .toast-body, [class*="toast"] {
background: white !important;
border: 1px solid #e5e5e5 !important;
box-shadow: 0 4px 12px rgba(0,0,0,0.08) !important;
}
[class*="toast"] .toast-title, [class*="toast"] .error,
.toast-wrap .error, span.error {
color: #b91c1c !important;
font-weight: 600 !important;
}
[class*="toast"] p, [class*="toast"] .toast-text {
color: #555 !important;
}
/* Error badge inside output block */
.error-wrap, .error {
background: #fef2f2 !important;
border-color: #fca5a5 !important;
color: #b91c1c !important;
}
/* Placeholder text */
::placeholder { color: #aaa !important; }
/* Backgrounds */
.block, .form, .wrap, .panel, .gap, .tabs { background: white !important; }
/* Block label tabs (e.g. "Output" on the audio component) */
[data-testid="block-label"] {
background: white !important;
color: #111 !important;
border-color: #e5e5e5 !important;
}
[data-testid="block-label"] * { color: #111 !important; }
/* Dropdown closed state — gray on the full inner wrapper with its natural padding */
input[role="listbox"] {
background: transparent !important;
}
.wrap-inner {
background: #f7f7f7 !important;
border-radius: 4px !important;
}
/* Dropdown popup list */
ul.options {
background: #f7f7f7 !important;
border: 1px solid #e5e5e5 !important;
box-shadow: 0 4px 12px rgba(0,0,0,0.06) !important;
}
ul.options li {
background: #f7f7f7 !important;
color: #111 !important;
}
ul.options li:hover, ul.options li.selected {
background: #eeeeee !important;
}
/* Examples table — force all borders to match */
.examples-holder, .table-wrap, table, thead, tbody, tr, td, th {
background: white !important;
border-color: #e5e5e5 !important;
}
.tr-head { box-shadow: none !important; }
tr:hover td { background: #f9f9f9 !important; }
/* Speed number input container and divider */
.tab-like-container, .tab-like-container *, input[type=number] {
border-color: #e5e5e5 !important;
}
.reset-button {
-webkit-appearance: none !important;
appearance: none !important;
border: none !important;
background: white !important;
}
/* Slider track */
input[type=range]::-webkit-slider-runnable-track { background: #e5e5e5 !important; }
input[type=range]::-webkit-slider-thumb { background: #111 !important; }
"""
# Build the demo page: a heading and banner, an input column (text, model,
# voice, speed, button), an output column (audio player), and example rows.
# NOTE(review): the nesting below was reconstructed from statement order --
# it assumes Model/Voice share one row and that the Speed slider and the
# Generate button sit beneath them in the left column. Confirm the layout.
with gr.Blocks(title="KittenTTS Demo") as demo:
    gr.Markdown("# KittenTTS Demo")
    gr.Markdown('<img width="607" height="255" alt="KittenTTS Banner" src="https://github.com/user-attachments/assets/f4646722-ba78-4b25-8a65-81bacee0d4f6" />')
    gr.Markdown("Text-to-speech synthesis with multiple models and voices.")
    with gr.Row():
        # Left column: all inputs for synthesize().
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter text to synthesize…",
                lines=5,
            )
            with gr.Row():
                # Choices are the display labels (keys) of MODELS.
                model_select = gr.Dropdown(
                    choices=list(MODELS.keys()),
                    value="Micro (40M - Balanced)",
                    label="Model",
                )
                voice_select = gr.Dropdown(
                    choices=VOICES,
                    value="Jasper",
                    label="Voice",
                )
            speed_slider = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.05,
                label="Speed",
            )
            generate_btn = gr.Button("Generate Speech", variant="primary")
        # Right column: receives the (sample_rate, array) tuple from synthesize().
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Output", type="numpy")
    # The input order here must match synthesize(text, model_name, voice, speed).
    generate_btn.click(
        fn=synthesize,
        inputs=[text_input, model_select, voice_select, speed_slider],
        outputs=audio_output,
    )
    # Each example row is [text, model label, voice, speed], matching `inputs`.
    gr.Examples(
        examples=[
            [
                "Space is a three-dimensional continuum containing positions and directions.",
                "Micro (40M - Balanced)",
                "Jasper",
                1.0,
            ],
            [
                "It begins with an 'Ugh!' Another mysterious stain appears on a favorite shirt. Every trick has been tried, but the stain persists.",
                "Mini (80M - Best Quality)",
                "Luna",
                1.0,
            ],
            [
                "Hello! Welcome to the KittenTTS demo. You can choose different voices and models to find the combination you like best.",
                "Nano (15M - Fastest)",
                "Bella",
                1.1,
            ],
        ],
        inputs=[text_input, model_select, voice_select, speed_slider],
    )
if __name__ == "__main__":
    # Start the server only when run as a script. "0.0.0.0" makes it listen
    # on all network interfaces; the theme and stylesheet are applied here.
    demo.launch(
        css=css,
        theme=theme,
        server_name="0.0.0.0",
    )