| | import gradio as gr |
| | import numpy as np |
| | import os |
| | from kittentts import KittenTTS |
| |
|
# Sample rate paired with every waveform returned to the Gradio audio output.
# NOTE(review): assumed to match the models' native output rate — confirm.
SAMPLE_RATE = 24_000

# Dropdown label -> model identifier passed to the KittenTTS constructor.
MODELS = {
    "Nano (15M - Fastest)": "KittenML/kitten-tts-nano-0.8-fp32",
    "Micro (40M - Balanced)": "KittenML/kitten-tts-micro-0.8",
    "Mini (80M - Best Quality)": "KittenML/kitten-tts-mini-0.8",
}

# Voice names offered in the UI and forwarded verbatim to ``generate``.
VOICES = [
    "Bella", "Jasper", "Luna", "Bruno",
    "Rosie", "Hugo", "Kiki", "Leo",
]
| |
|
| | |
# Every entry of MODELS is instantiated once at import time and stored by its
# dropdown label, so get_model() is a plain dictionary lookup afterwards.
_model_cache: dict[str, KittenTTS] = {}

print("Loading models...")
for model_name in MODELS:
    model_id = MODELS[model_name]
    print(f"Loading {model_name}...")
    _model_cache[model_name] = KittenTTS(model_id)
print("All models loaded!")
| |
|
| |
|
def get_model(model_name: str) -> KittenTTS:
    """Return the preloaded KittenTTS instance stored under *model_name*.

    Args:
        model_name: Key into the module-level ``_model_cache``.

    Raises:
        KeyError: If *model_name* is not present in ``_model_cache``.
    """
    tts = _model_cache[model_name]
    return tts
| |
|
| |
|
def synthesize(text: str, model_name: str, voice: str, speed: float):
    """Generate speech for *text* and return it as a Gradio numpy-audio tuple.

    Args:
        text: Text to speak; surrounding whitespace is stripped before use.
        model_name: Key used to fetch the model through ``get_model``.
        voice: Voice name forwarded to the model's ``generate`` call.
        speed: Speed value forwarded to ``generate`` when it is accepted.

    Returns:
        A ``(SAMPLE_RATE, audio)`` tuple, where ``audio`` is the result of
        ``generate`` passed through ``np.squeeze``.

    Raises:
        gr.Error: If *text* is empty or consists only of whitespace.
    """
    cleaned = text.strip() if text else ""
    if not cleaned:
        raise gr.Error("Please enter some text.")

    tts = get_model(model_name)

    try:
        waveform = tts.generate(cleaned, voice=voice, speed=speed)
    except TypeError:
        # Retry without ``speed`` — presumably for KittenTTS builds whose
        # ``generate`` does not take that keyword (TODO confirm). Note that a
        # TypeError raised for any other reason also lands here.
        waveform = tts.generate(cleaned, voice=voice)

    # Drop singleton dimensions before handing the array to the audio widget.
    return (SAMPLE_RATE, np.squeeze(waveform))
| |
|
| |
|
# Monochrome theme: neutral hues everywhere, with the dark-mode variables
# pinned to the same values as their light-mode counterparts.
theme = gr.themes.Base(
    primary_hue="neutral",
    secondary_hue="neutral",
    neutral_hue="neutral",
    font=gr.themes.GoogleFont("Inter"),
)
theme = theme.set(
    # Page and block surfaces
    body_background_fill="white",
    body_background_fill_dark="white",
    block_background_fill="white",
    block_background_fill_dark="white",
    block_border_color="#e5e5e5",
    block_border_color_dark="#e5e5e5",
    block_shadow="none",
    block_shadow_dark="none",
    # Primary button
    button_primary_background_fill="#111111",
    button_primary_background_fill_hover="#333333",
    button_primary_text_color="white",
    button_primary_border_color="#111111",
    # Inputs and slider
    input_background_fill="white",
    input_background_fill_dark="white",
    input_border_color="#e5e5e5",
    slider_color="#111111",
    # Examples table
    table_border_color="#e5e5e5",
    table_even_background_fill="white",
    table_odd_background_fill="white",
    table_row_focus="white",
)
| |
|
# Stylesheet passed to demo.launch(). It layers on top of the theme to force a
# light, black-on-white look; each section is labelled by the CSS comment
# that precedes it.
css: str = """
/* Force light mode — prevents OS dark mode from affecting the page */
:root, html, body { color-scheme: light !important; }
body, .gradio-container, .main { background: white !important; }
.gradio-container { max-width: 860px !important; margin: 40px auto !important; }
footer { display: none !important; }

/* Force all text to black — no accent colors */
*, *::before, *::after {
color: #111 !important;
--body-text-color: #111 !important;
--block-label-text-color: #111 !important;
--block-title-text-color: #111 !important;
--color-accent: #111 !important;
--link-text-color: #111 !important;
--link-text-color-hover: #111 !important;
--link-text-color-visited: #111 !important;
--link-text-color-active: #111 !important;
}

/* Exceptions — keep button text white */
button.primary, button[variant="primary"] { color: white !important; }

/* Error toast notification */
.toast-wrap, .toast-body, [class*="toast"] {
background: white !important;
border: 1px solid #e5e5e5 !important;
box-shadow: 0 4px 12px rgba(0,0,0,0.08) !important;
}
[class*="toast"] .toast-title, [class*="toast"] .error,
.toast-wrap .error, span.error {
color: #b91c1c !important;
font-weight: 600 !important;
}
[class*="toast"] p, [class*="toast"] .toast-text {
color: #555 !important;
}
/* Error badge inside output block */
.error-wrap, .error {
background: #fef2f2 !important;
border-color: #fca5a5 !important;
color: #b91c1c !important;
}

/* Placeholder text */
::placeholder { color: #aaa !important; }

/* Backgrounds */
.block, .form, .wrap, .panel, .gap, .tabs { background: white !important; }

/* Block label tabs (e.g. "Output" on the audio component) */
[data-testid="block-label"] {
background: white !important;
color: #111 !important;
border-color: #e5e5e5 !important;
}
[data-testid="block-label"] * { color: #111 !important; }

/* Dropdown closed state — gray on the full inner wrapper with its natural padding */
input[role="listbox"] {
background: transparent !important;
}
.wrap-inner {
background: #f7f7f7 !important;
border-radius: 4px !important;
}

/* Dropdown popup list */
ul.options {
background: #f7f7f7 !important;
border: 1px solid #e5e5e5 !important;
box-shadow: 0 4px 12px rgba(0,0,0,0.06) !important;
}
ul.options li {
background: #f7f7f7 !important;
color: #111 !important;
}
ul.options li:hover, ul.options li.selected {
background: #eeeeee !important;
}

/* Examples table — force all borders to match */
.examples-holder, .table-wrap, table, thead, tbody, tr, td, th {
background: white !important;
border-color: #e5e5e5 !important;
}
.tr-head { box-shadow: none !important; }
tr:hover td { background: #f9f9f9 !important; }

/* Speed number input container and divider */
.tab-like-container, .tab-like-container *, input[type=number] {
border-color: #e5e5e5 !important;
}
.reset-button {
-webkit-appearance: none !important;
appearance: none !important;
border: none !important;
background: white !important;
}

/* Slider track */
input[type=range]::-webkit-slider-runnable-track { background: #e5e5e5 !important; }
input[type=range]::-webkit-slider-thumb { background: #111 !important; }
"""
| |
|
# Rows shown by gr.Examples; column order is text, model, voice, speed.
_EXAMPLE_ROWS = [
    [
        "Space is a three-dimensional continuum containing positions and directions.",
        "Micro (40M - Balanced)",
        "Jasper",
        1.0,
    ],
    [
        "It begins with an 'Ugh!' Another mysterious stain appears on a favorite shirt. Every trick has been tried, but the stain persists.",
        "Mini (80M - Best Quality)",
        "Luna",
        1.0,
    ],
    [
        "Hello! Welcome to the KittenTTS demo. You can choose different voices and models to find the combination you like best.",
        "Nano (15M - Fastest)",
        "Bella",
        1.1,
    ],
]

# NOTE(review): the nesting below was reconstructed from a source whose
# indentation had been stripped — confirm the intended layout.
with gr.Blocks(title="KittenTTS Demo") as demo:
    # Header: title, banner image, one-line description.
    gr.Markdown("# KittenTTS Demo")
    gr.Markdown('<img width="607" height="255" alt="KittenTTS Banner" src="https://github.com/user-attachments/assets/f4646722-ba78-4b25-8a65-81bacee0d4f6" />')
    gr.Markdown("Text-to-speech synthesis with multiple models and voices.")

    with gr.Row():
        # Left column: text box, model/voice pickers, speed slider, button.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter text to synthesize…",
                lines=5,
            )
            with gr.Row():
                model_select = gr.Dropdown(
                    label="Model",
                    choices=list(MODELS.keys()),
                    value="Micro (40M - Balanced)",
                )
                voice_select = gr.Dropdown(
                    label="Voice",
                    choices=VOICES,
                    value="Jasper",
                )
            speed_slider = gr.Slider(
                label="Speed",
                minimum=0.5,
                maximum=2.0,
                step=0.05,
                value=1.0,
            )
            generate_btn = gr.Button("Generate Speech", variant="primary")

        # Right column: the synthesized audio.
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Output", type="numpy")

    # Component order mirrors the positional parameters of synthesize().
    _synth_inputs = (text_input, model_select, voice_select, speed_slider)

    generate_btn.click(
        fn=synthesize,
        inputs=list(_synth_inputs),
        outputs=audio_output,
    )

    gr.Examples(
        examples=_EXAMPLE_ROWS,
        inputs=list(_synth_inputs),
    )
| |
|
if __name__ == "__main__":
    # "0.0.0.0" binds the server on all network interfaces.
    # NOTE(review): theme/css are passed to launch() here, which presumably
    # targets a Gradio release whose launch() accepts them — confirm against
    # the pinned Gradio version.
    launch_options = {
        "server_name": "0.0.0.0",
        "theme": theme,
        "css": css,
    }
    demo.launch(**launch_options)
| |
|