"""Gradio demo application for KittenTTS text-to-speech synthesis.

At import time every model listed in ``MODELS`` is loaded into memory, and a
Gradio ``Blocks`` UI is built that lets the user pick a model, a voice and a
speed, then synthesize the entered text. Running the file as a script
launches the web server.
"""

import os

import gradio as gr
import numpy as np
from kittentts import KittenTTS

# Sample rate reported to Gradio together with the generated waveform.
# NOTE(review): assumed to match the rate KittenTTS produces — confirm.
SAMPLE_RATE = 24000

# Display label (shown in the dropdown) -> model identifier given to KittenTTS.
MODELS = {
    "Nano (15M - Fastest)": "KittenML/kitten-tts-nano-0.8-fp32",
    "Micro (40M - Balanced)": "KittenML/kitten-tts-micro-0.8",
    "Mini (80M - Best Quality)": "KittenML/kitten-tts-mini-0.8",
}

# Voice names offered in the UI; passed verbatim as ``voice=`` to generate().
VOICES = [
    "Bella",
    "Jasper",
    "Luna",
    "Bruno",
    "Rosie",
    "Hugo",
    "Kiki",
    "Leo",
]

# Initialize all models at startup so the first request does not pay the
# loading cost.
print("Loading models...")
_model_cache: dict[str, KittenTTS] = {}
for model_name, model_id in MODELS.items():
    print(f"Loading {model_name}...")
    _model_cache[model_name] = KittenTTS(model_id)
print("All models loaded!")


def get_model(model_name: str) -> KittenTTS:
    """Return the preloaded model registered under *model_name*.

    Args:
        model_name: One of the display labels used as keys in ``MODELS``.

    Returns:
        The ``KittenTTS`` instance created for that label at startup.

    Raises:
        KeyError: If *model_name* is not a key of the startup cache.
    """
    return _model_cache[model_name]


def synthesize(
    text: str, model_name: str, voice: str, speed: float
) -> tuple[int, np.ndarray]:
    """Synthesize *text* and return it in the form ``gr.Audio`` accepts.

    Args:
        text: Text to speak; surrounding whitespace is stripped.
        model_name: Display label of the model to use (a key of ``MODELS``).
        voice: Voice name forwarded to ``KittenTTS.generate``.
        speed: Speed value forwarded to ``KittenTTS.generate`` when accepted.

    Returns:
        A ``(SAMPLE_RATE, samples)`` tuple where ``samples`` has at least one
        dimension.

    Raises:
        gr.Error: If *text* is empty or contains only whitespace.
        KeyError: If *model_name* was not loaded at startup.
    """
    cleaned = text.strip() if text else ""
    if not cleaned:
        raise gr.Error("Please enter some text.")

    tts = get_model(model_name)

    # Some KittenTTS releases may not accept ``speed``; in that case the call
    # raises TypeError and we retry without it rather than failing the request.
    try:
        audio = tts.generate(cleaned, voice=voice, speed=speed)
    except TypeError:
        audio = tts.generate(cleaned, voice=voice)

    # Drop singleton axes, e.g. (1, samples) -> (samples,). ``squeeze`` alone
    # yields a 0-d array for single-sample output, so ``atleast_1d`` ensures
    # the result always has at least one axis.
    audio = np.atleast_1d(np.squeeze(audio))
    return (SAMPLE_RATE, audio)


# Monochrome theme: every surface is white, accents are near-black, and the
# dark-mode variants are pinned to the same light values.
theme = gr.themes.Base(
    primary_hue="neutral",
    secondary_hue="neutral",
    neutral_hue="neutral",
    font=gr.themes.GoogleFont("Inter"),
).set(
    body_background_fill="white",
    body_background_fill_dark="white",
    block_background_fill="white",
    block_background_fill_dark="white",
    block_border_color="#e5e5e5",
    block_border_color_dark="#e5e5e5",
    block_shadow="none",
    block_shadow_dark="none",
    button_primary_background_fill="#111111",
    button_primary_background_fill_hover="#333333",
    button_primary_text_color="white",
    button_primary_border_color="#111111",
    input_background_fill="white",
    input_background_fill_dark="white",
    input_border_color="#e5e5e5",
    slider_color="#111111",
    table_border_color="#e5e5e5",
    table_even_background_fill="white",
    table_odd_background_fill="white",
    table_row_focus="white",
)

# Extra CSS layered on top of the theme for elements the theme variables do
# not reach (toasts, dropdown popups, examples table, slider track).
css = """
/* Force light mode — prevents OS dark mode from affecting the page */
:root, html, body {
    color-scheme: light !important;
}
body, .gradio-container, .main {
    background: white !important;
}
.gradio-container {
    max-width: 860px !important;
    margin: 40px auto !important;
}
footer {
    display: none !important;
}

/* Force all text to black — no accent colors */
*, *::before, *::after {
    color: #111 !important;
    --body-text-color: #111 !important;
    --block-label-text-color: #111 !important;
    --block-title-text-color: #111 !important;
    --color-accent: #111 !important;
    --link-text-color: #111 !important;
    --link-text-color-hover: #111 !important;
    --link-text-color-visited: #111 !important;
    --link-text-color-active: #111 !important;
}

/* Exceptions — keep button text white */
button.primary, button[variant="primary"] {
    color: white !important;
}

/* Error toast notification */
.toast-wrap, .toast-body, [class*="toast"] {
    background: white !important;
    border: 1px solid #e5e5e5 !important;
    box-shadow: 0 4px 12px rgba(0,0,0,0.08) !important;
}
[class*="toast"] .toast-title, [class*="toast"] .error, .toast-wrap .error, span.error {
    color: #b91c1c !important;
    font-weight: 600 !important;
}
[class*="toast"] p, [class*="toast"] .toast-text {
    color: #555 !important;
}

/* Error badge inside output block */
.error-wrap, .error {
    background: #fef2f2 !important;
    border-color: #fca5a5 !important;
    color: #b91c1c !important;
}

/* Placeholder text */
::placeholder {
    color: #aaa !important;
}

/* Backgrounds */
.block, .form, .wrap, .panel, .gap, .tabs {
    background: white !important;
}

/* Block label tabs (e.g. "Output" on the audio component) */
[data-testid="block-label"] {
    background: white !important;
    color: #111 !important;
    border-color: #e5e5e5 !important;
}
[data-testid="block-label"] * {
    color: #111 !important;
}

/* Dropdown closed state — gray on the full inner wrapper with its natural padding */
input[role="listbox"] {
    background: transparent !important;
}
.wrap-inner {
    background: #f7f7f7 !important;
    border-radius: 4px !important;
}

/* Dropdown popup list */
ul.options {
    background: #f7f7f7 !important;
    border: 1px solid #e5e5e5 !important;
    box-shadow: 0 4px 12px rgba(0,0,0,0.06) !important;
}
ul.options li {
    background: #f7f7f7 !important;
    color: #111 !important;
}
ul.options li:hover, ul.options li.selected {
    background: #eeeeee !important;
}

/* Examples table — force all borders to match */
.examples-holder, .table-wrap, table, thead, tbody, tr, td, th {
    background: white !important;
    border-color: #e5e5e5 !important;
}
.tr-head {
    box-shadow: none !important;
}
tr:hover td {
    background: #f9f9f9 !important;
}

/* Speed number input container and divider */
.tab-like-container, .tab-like-container *, input[type=number] {
    border-color: #e5e5e5 !important;
}
.reset-button {
    -webkit-appearance: none !important;
    appearance: none !important;
    border: none !important;
    background: white !important;
}

/* Slider track */
input[type=range]::-webkit-slider-runnable-track {
    background: #e5e5e5 !important;
}
input[type=range]::-webkit-slider-thumb {
    background: #111 !important;
}
"""

# NOTE(review): the nesting below was reconstructed from a source whose
# indentation had been collapsed — confirm the slider/button placement.
with gr.Blocks(title="KittenTTS Demo") as demo:
    gr.Markdown("# KittenTTS Demo")
    gr.Markdown('KittenTTS Banner')
    gr.Markdown("Text-to-speech synthesis with multiple models and voices.")

    with gr.Row():
        # Left column: all inputs and the trigger button.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter text to synthesize…",
                lines=5,
            )
            with gr.Row():
                model_select = gr.Dropdown(
                    choices=list(MODELS.keys()),
                    value="Micro (40M - Balanced)",
                    label="Model",
                )
                voice_select = gr.Dropdown(
                    choices=VOICES,
                    value="Jasper",
                    label="Voice",
                )
            speed_slider = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.05,
                label="Speed",
            )
            generate_btn = gr.Button("Generate Speech", variant="primary")

        # Right column: the synthesized audio player.
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Output", type="numpy")

    generate_btn.click(
        fn=synthesize,
        inputs=[text_input, model_select, voice_select, speed_slider],
        outputs=audio_output,
    )

    # Clickable presets; each row fills the four inputs in the order below.
    gr.Examples(
        examples=[
            [
                "Space is a three-dimensional continuum containing positions and directions.",
                "Micro (40M - Balanced)",
                "Jasper",
                1.0,
            ],
            [
                "It begins with an 'Ugh!' Another mysterious stain appears on a favorite shirt. Every trick has been tried, but the stain persists.",
                "Mini (80M - Best Quality)",
                "Luna",
                1.0,
            ],
            [
                "Hello! Welcome to the KittenTTS demo. You can choose different voices and models to find the combination you like best.",
                "Nano (15M - Fastest)",
                "Bella",
                1.1,
            ],
        ],
        inputs=[text_input, model_select, voice_select, speed_slider],
    )

if __name__ == "__main__":
    # NOTE(review): theme/css are passed to launch() rather than gr.Blocks();
    # presumably this targets a Gradio release where launch() accepts them —
    # confirm against the pinned Gradio version.
    demo.launch(server_name="0.0.0.0", theme=theme, css=css)