Spaces:

KittenML
/

KittenTTS-Demo

Running

File size: 8,005 Bytes

e3b97f2

import gradio as gr
import numpy as np
import os
from kittentts import KittenTTS

# Sample rate (Hz) paired with every generated waveform handed back to Gradio.
SAMPLE_RATE = 24000

# Dropdown label -> model identifier passed to KittenTTS().
MODELS = {
    "Nano (15M - Fastest)": "KittenML/kitten-tts-nano-0.8-fp32",
    "Micro (40M - Balanced)": "KittenML/kitten-tts-micro-0.8",
    "Mini (80M - Best Quality)": "KittenML/kitten-tts-mini-0.8",
}

# Voice names offered in the "Voice" dropdown, in display order.
VOICES = [
    "Bella", "Jasper", "Luna", "Bruno",
    "Rosie", "Hugo", "Kiki", "Leo",
]

# Every model in MODELS is constructed once here, at import time;
# get_model() afterwards only performs a dictionary lookup.
def _load_model(label: str, repo_id: str) -> KittenTTS:
    """Announce and construct the model behind one MODELS entry."""
    print(f"Loading {label}...")
    return KittenTTS(repo_id)


print("Loading models...")
_model_cache: dict[str, KittenTTS] = {
    label: _load_model(label, repo_id) for label, repo_id in MODELS.items()
}
print("All models loaded!")


def get_model(model_name: str) -> KittenTTS:
    """Return the preloaded model stored under *model_name*.

    Args:
        model_name: Key into the startup model cache (a key of ``MODELS``).

    Raises:
        KeyError: If no model was cached under that name.
    """
    loaded = _model_cache[model_name]
    return loaded


def synthesize(text: str, model_name: str, voice: str, speed: float):
    """Generate speech for *text* and return it as ``(sample_rate, samples)``.

    Args:
        text: Text to speak; surrounding whitespace is stripped.
        model_name: Key of the preloaded model to use.
        voice: Voice name forwarded to the model's ``generate`` call.
        speed: Speed value forwarded to ``generate`` when it is accepted.

    Returns:
        A ``(SAMPLE_RATE, audio)`` tuple, with ``audio`` squeezed of any
        length-1 axes.

    Raises:
        gr.Error: If *text* is empty or contains only whitespace.
    """
    cleaned = text.strip() if text else ""
    if not cleaned:
        raise gr.Error("Please enter some text.")

    tts = get_model(model_name)
    try:
        waveform = tts.generate(cleaned, voice=voice, speed=speed)
    except TypeError:
        # generate() rejected the call (e.g. no ``speed`` keyword):
        # retry with the speed argument left out.
        waveform = tts.generate(cleaned, voice=voice)

    # The model output is expected to be (1, samples) or (samples,);
    # squeezing drops the length-1 axis so a 1-D array is returned.
    return SAMPLE_RATE, np.squeeze(waveform)


# Monochrome theme: the Base theme with neutral hues and the Inter font.
theme = gr.themes.Base(
    font=gr.themes.GoogleFont("Inter"),
    primary_hue="neutral",
    secondary_hue="neutral",
    neutral_hue="neutral",
)

# Per-variable overrides. Each *_dark variant is given the same value as its
# light counterpart so both modes render identically.
theme = theme.set(
    # Page and block surfaces
    body_background_fill="white",
    body_background_fill_dark="white",
    block_background_fill="white",
    block_background_fill_dark="white",
    block_border_color="#e5e5e5",
    block_border_color_dark="#e5e5e5",
    block_shadow="none",
    block_shadow_dark="none",
    # Primary button
    button_primary_background_fill="#111111",
    button_primary_background_fill_hover="#333333",
    button_primary_text_color="white",
    button_primary_border_color="#111111",
    # Inputs and slider
    input_background_fill="white",
    input_background_fill_dark="white",
    input_border_color="#e5e5e5",
    slider_color="#111111",
    # Tables (used by the examples list)
    table_border_color="#e5e5e5",
    table_even_background_fill="white",
    table_odd_background_fill="white",
    table_row_focus="white",
)

# Custom stylesheet handed to demo.launch(css=...). It layers on top of the
# theme to force a light, black-on-white look: page width, text colors,
# error toasts, dropdowns, the examples table and the speed slider.
css = """
/* Force light mode — prevents OS dark mode from affecting the page */
:root, html, body { color-scheme: light !important; }
body, .gradio-container, .main { background: white !important; }
.gradio-container { max-width: 860px !important; margin: 40px auto !important; }
footer { display: none !important; }

/* Force all text to black — no accent colors */
*, *::before, *::after {
    color: #111 !important;
    --body-text-color: #111 !important;
    --block-label-text-color: #111 !important;
    --block-title-text-color: #111 !important;
    --color-accent: #111 !important;
    --link-text-color: #111 !important;
    --link-text-color-hover: #111 !important;
    --link-text-color-visited: #111 !important;
    --link-text-color-active: #111 !important;
}

/* Exceptions — keep button text white */
button.primary, button[variant="primary"] { color: white !important; }

/* Error toast notification */
.toast-wrap, .toast-body, [class*="toast"] {
    background: white !important;
    border: 1px solid #e5e5e5 !important;
    box-shadow: 0 4px 12px rgba(0,0,0,0.08) !important;
}
[class*="toast"] .toast-title, [class*="toast"] .error,
.toast-wrap .error, span.error {
    color: #b91c1c !important;
    font-weight: 600 !important;
}
[class*="toast"] p, [class*="toast"] .toast-text {
    color: #555 !important;
}
/* Error badge inside output block */
.error-wrap, .error {
    background: #fef2f2 !important;
    border-color: #fca5a5 !important;
    color: #b91c1c !important;
}

/* Placeholder text */
::placeholder { color: #aaa !important; }

/* Backgrounds */
.block, .form, .wrap, .panel, .gap, .tabs { background: white !important; }

/* Block label tabs (e.g. "Output" on the audio component) */
[data-testid="block-label"] {
    background: white !important;
    color: #111 !important;
    border-color: #e5e5e5 !important;
}
[data-testid="block-label"] * { color: #111 !important; }

/* Dropdown closed state — gray on the full inner wrapper with its natural padding */
input[role="listbox"] {
    background: transparent !important;
}
.wrap-inner {
    background: #f7f7f7 !important;
    border-radius: 4px !important;
}

/* Dropdown popup list */
ul.options {
    background: #f7f7f7 !important;
    border: 1px solid #e5e5e5 !important;
    box-shadow: 0 4px 12px rgba(0,0,0,0.06) !important;
}
ul.options li {
    background: #f7f7f7 !important;
    color: #111 !important;
}
ul.options li:hover, ul.options li.selected {
    background: #eeeeee !important;
}

/* Examples table — force all borders to match */
.examples-holder, .table-wrap, table, thead, tbody, tr, td, th {
    background: white !important;
    border-color: #e5e5e5 !important;
}
.tr-head { box-shadow: none !important; }
tr:hover td { background: #f9f9f9 !important; }

/* Speed number input container and divider */
.tab-like-container, .tab-like-container *, input[type=number] {
    border-color: #e5e5e5 !important;
}
.reset-button {
    -webkit-appearance: none !important;
    appearance: none !important;
    border: none !important;
    background: white !important;
}

/* Slider track */
input[type=range]::-webkit-slider-runnable-track { background: #e5e5e5 !important; }
input[type=range]::-webkit-slider-thumb { background: #111 !important; }
"""

# Page layout: heading and banner, a two-column row (controls on the left,
# audio output on the right), then a list of clickable examples.
with gr.Blocks(title="KittenTTS Demo") as demo:
    gr.Markdown("# KittenTTS Demo")
    gr.Markdown(
        '<img width="607" height="255" alt="KittenTTS Banner" '
        'src="https://github.com/user-attachments/assets/f4646722-ba78-4b25-8a65-81bacee0d4f6" />'
    )
    gr.Markdown("Text-to-speech synthesis with multiple models and voices.")

    with gr.Row():
        # Left column: text entry and synthesis controls.
        with gr.Column(scale=2):
            text_input = gr.Textbox(label="Text", placeholder="Enter text to synthesize…", lines=5)
            with gr.Row():
                model_select = gr.Dropdown(
                    label="Model", choices=list(MODELS), value="Micro (40M - Balanced)"
                )
                voice_select = gr.Dropdown(label="Voice", choices=VOICES, value="Jasper")
            speed_slider = gr.Slider(label="Speed", minimum=0.5, maximum=2.0, step=0.05, value=1.0)
            generate_btn = gr.Button("Generate Speech", variant="primary")

        # Right column: generated audio.
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Output", type="numpy")

    # The same four components feed both the button handler and the examples,
    # in the order of synthesize()'s parameters.
    synth_inputs = [text_input, model_select, voice_select, speed_slider]

    generate_btn.click(fn=synthesize, inputs=synth_inputs, outputs=audio_output)

    # Each example row is [text, model, voice, speed].
    example_rows = [
        [
            "Space is a three-dimensional continuum containing positions and directions.",
            "Micro (40M - Balanced)", "Jasper", 1.0,
        ],
        [
            "It begins with an 'Ugh!' Another mysterious stain appears on a favorite shirt. Every trick has been tried, but the stain persists.",
            "Mini (80M - Best Quality)", "Luna", 1.0,
        ],
        [
            "Hello! Welcome to the KittenTTS demo. You can choose different voices and models to find the combination you like best.",
            "Nano (15M - Fastest)", "Bella", 1.1,
        ],
    ]
    gr.Examples(examples=example_rows, inputs=synth_inputs)

if __name__ == "__main__":
    # Serve on all network interfaces; the theme and stylesheet defined in
    # this file are applied at launch time.
    demo.launch(
        server_name="0.0.0.0",
        theme=theme,
        css=css,
    )