KittenTTS

Sleeping

Your Name

a60137f 23 days ago

8.01 kB

	import gradio as gr
	import numpy as np
	import os
	from kittentts import KittenTTS

	# Sample rate (Hz) reported alongside the generated waveform.
	SAMPLE_RATE = 24000

	# UI label -> model identifier handed to KittenTTS(...).
	MODELS = {
	    "Nano (15M - Fastest)": "KittenML/kitten-tts-nano-0.8-fp32",
	    "Micro (40M - Balanced)": "KittenML/kitten-tts-micro-0.8",
	    "Mini (80M - Best Quality)": "KittenML/kitten-tts-mini-0.8",
	}

	# Voice names offered in the "Voice" dropdown.
	VOICES = [
	    "Bella", "Jasper", "Luna", "Bruno",
	    "Rosie", "Hugo", "Kiki", "Leo",
	]

	# Eagerly construct every model listed in MODELS at import time, keyed by
	# its UI label, so later lookups are plain dictionary reads.
	print("Loading models...")
	_model_cache: dict[str, KittenTTS] = {}
	for model_name in MODELS:
	    print(f"Loading {model_name}...")
	    model_id = MODELS[model_name]
	    _model_cache[model_name] = KittenTTS(model_id)
	print("All models loaded!")


	def get_model(model_name: str) -> KittenTTS:
	    """Return the preloaded model stored under *model_name*.

	    Raises:
	        KeyError: If *model_name* is not a key of the model cache.
	    """
	    loaded = _model_cache[model_name]
	    return loaded


	def synthesize(text: str, model_name: str, voice: str, speed: float):
	    """Generate speech for *text* and return ``(SAMPLE_RATE, samples)``.

	    Args:
	        text: Text to speak; surrounding whitespace is stripped first.
	        model_name: Label of the preloaded model to use.
	        voice: Voice name forwarded to the model's ``generate`` call.
	        speed: Speed value forwarded to ``generate`` when it is accepted.

	    Returns:
	        A tuple of the sample rate and the waveform after ``np.squeeze``.

	    Raises:
	        gr.Error: If *text* is empty or contains only whitespace.
	    """
	    cleaned = text.strip() if text else ""
	    if not cleaned:
	        raise gr.Error("Please enter some text.")

	    engine = get_model(model_name)
	    try:
	        waveform = engine.generate(cleaned, voice=voice, speed=speed)
	    except TypeError:
	        # Best-effort retry without `speed`, which may not be supported
	        # by the installed KittenTTS version.
	        waveform = engine.generate(cleaned, voice=voice)

	    # The model may hand back (1, samples) or (samples,); drop unit axes.
	    return SAMPLE_RATE, np.squeeze(waveform)


	# Neutral base theme; the overrides below pin white backgrounds and light
	# grey borders, with the *_dark keys given the same values as light mode.
	_base_theme = gr.themes.Base(
	    primary_hue="neutral",
	    secondary_hue="neutral",
	    neutral_hue="neutral",
	    font=gr.themes.GoogleFont("Inter"),
	)
	theme = _base_theme.set(
	    # Page and block surfaces
	    body_background_fill="white",
	    body_background_fill_dark="white",
	    block_background_fill="white",
	    block_background_fill_dark="white",
	    block_border_color="#e5e5e5",
	    block_border_color_dark="#e5e5e5",
	    block_shadow="none",
	    block_shadow_dark="none",
	    # Primary button
	    button_primary_background_fill="#111111",
	    button_primary_background_fill_hover="#333333",
	    button_primary_text_color="white",
	    button_primary_border_color="#111111",
	    # Inputs and slider
	    input_background_fill="white",
	    input_background_fill_dark="white",
	    input_border_color="#e5e5e5",
	    slider_color="#111111",
	    # Tables (used by the examples list)
	    table_border_color="#e5e5e5",
	    table_even_background_fill="white",
	    table_odd_background_fill="white",
	    table_row_focus="white",
	)

	# Custom stylesheet passed to the app at launch.
	# The universal rule must read `*, *::before, *::after`: a selector list
	# that begins with a comma is invalid CSS and the whole rule is discarded.
	# Toast selectors use the substring form `[class*="toast"]` throughout so
	# they match the same elements as the toast container rule.
	css = """
	/* Force light mode — prevents OS dark mode from affecting the page */
	:root, html, body { color-scheme: light !important; }
	body, .gradio-container, .main { background: white !important; }
	.gradio-container { max-width: 860px !important; margin: 40px auto !important; }
	footer { display: none !important; }

	/* Force all text to black — no accent colors */
	*, *::before, *::after {
	    color: #111 !important;
	    --body-text-color: #111 !important;
	    --block-label-text-color: #111 !important;
	    --block-title-text-color: #111 !important;
	    --color-accent: #111 !important;
	    --link-text-color: #111 !important;
	    --link-text-color-hover: #111 !important;
	    --link-text-color-visited: #111 !important;
	    --link-text-color-active: #111 !important;
	}

	/* Exceptions — keep button text white */
	button.primary, button[variant="primary"] { color: white !important; }

	/* Error toast notification */
	.toast-wrap, .toast-body, [class*="toast"] {
	    background: white !important;
	    border: 1px solid #e5e5e5 !important;
	    box-shadow: 0 4px 12px rgba(0,0,0,0.08) !important;
	}
	[class*="toast"] .toast-title, [class*="toast"] .error,
	.toast-wrap .error, span.error {
	    color: #b91c1c !important;
	    font-weight: 600 !important;
	}
	[class*="toast"] p, [class*="toast"] .toast-text {
	    color: #555 !important;
	}
	/* Error badge inside output block */
	.error-wrap, .error {
	    background: #fef2f2 !important;
	    border-color: #fca5a5 !important;
	    color: #b91c1c !important;
	}

	/* Placeholder text */
	::placeholder { color: #aaa !important; }

	/* Backgrounds */
	.block, .form, .wrap, .panel, .gap, .tabs { background: white !important; }

	/* Block label tabs (e.g. "Output" on the audio component) */
	[data-testid="block-label"] {
	    background: white !important;
	    color: #111 !important;
	    border-color: #e5e5e5 !important;
	}
	[data-testid="block-label"] * { color: #111 !important; }

	/* Dropdown closed state — gray on the full inner wrapper with its natural padding */
	input[role="listbox"] {
	    background: transparent !important;
	}
	.wrap-inner {
	    background: #f7f7f7 !important;
	    border-radius: 4px !important;
	}

	/* Dropdown popup list */
	ul.options {
	    background: #f7f7f7 !important;
	    border: 1px solid #e5e5e5 !important;
	    box-shadow: 0 4px 12px rgba(0,0,0,0.06) !important;
	}
	ul.options li {
	    background: #f7f7f7 !important;
	    color: #111 !important;
	}
	ul.options li:hover, ul.options li.selected {
	    background: #eeeeee !important;
	}

	/* Examples table — force all borders to match */
	.examples-holder, .table-wrap, table, thead, tbody, tr, td, th {
	    background: white !important;
	    border-color: #e5e5e5 !important;
	}
	.tr-head { box-shadow: none !important; }
	tr:hover td { background: #f9f9f9 !important; }

	/* Speed number input container and divider */
	.tab-like-container, .tab-like-container *, input[type=number] {
	    border-color: #e5e5e5 !important;
	}
	.reset-button {
	    -webkit-appearance: none !important;
	    appearance: none !important;
	    border: none !important;
	    background: white !important;
	}

	/* Slider track */
	input[type=range]::-webkit-slider-runnable-track { background: #e5e5e5 !important; }
	input[type=range]::-webkit-slider-thumb { background: #111 !important; }
	"""

	# Build the demo UI: inputs on the left, audio output on the right, and a
	# table of ready-made examples underneath.
	# NOTE(review): nesting was reconstructed from flattened source — confirm
	# that the speed slider and button belong in the left column, not the inner row.
	with gr.Blocks(title="KittenTTS Demo") as demo:
	    gr.Markdown("# KittenTTS Demo")
	    gr.Markdown('<img width="607" height="255" alt="KittenTTS Banner" src="https://github.com/user-attachments/assets/f4646722-ba78-4b25-8a65-81bacee0d4f6" />')
	    gr.Markdown("Text-to-speech synthesis with multiple models and voices.")

	    with gr.Row():
	        # Left column: text, model/voice pickers, speed, and the trigger button.
	        with gr.Column(scale=2):
	            text_input = gr.Textbox(
	                label="Text",
	                placeholder="Enter text to synthesize…",
	                lines=5,
	            )
	            with gr.Row():
	                model_select = gr.Dropdown(
	                    choices=list(MODELS.keys()),
	                    value="Micro (40M - Balanced)",
	                    label="Model",
	                )
	                voice_select = gr.Dropdown(
	                    choices=VOICES,
	                    value="Jasper",
	                    label="Voice",
	                )
	            speed_slider = gr.Slider(
	                minimum=0.5,
	                maximum=2.0,
	                value=1.0,
	                step=0.05,
	                label="Speed",
	            )
	            generate_btn = gr.Button("Generate Speech", variant="primary")

	        # Right column: the synthesized audio.
	        with gr.Column(scale=1):
	            audio_output = gr.Audio(label="Output", type="numpy")

	    # Clicking the button runs synthesize() on the current input values.
	    generate_btn.click(
	        fn=synthesize,
	        inputs=[text_input, model_select, voice_select, speed_slider],
	        outputs=audio_output,
	    )

	    # Each row is [text, model label, voice, speed], matching the inputs order.
	    example_rows = [
	        [
	            "Space is a three-dimensional continuum containing positions and directions.",
	            "Micro (40M - Balanced)",
	            "Jasper",
	            1.0,
	        ],
	        [
	            "It begins with an 'Ugh!' Another mysterious stain appears on a favorite shirt. Every trick has been tried, but the stain persists.",
	            "Mini (80M - Best Quality)",
	            "Luna",
	            1.0,
	        ],
	        [
	            "Hello! Welcome to the KittenTTS demo. You can choose different voices and models to find the combination you like best.",
	            "Nano (15M - Fastest)",
	            "Bella",
	            1.1,
	        ],
	    ]
	    gr.Examples(
	        examples=example_rows,
	        inputs=[text_input, model_select, voice_select, speed_slider],
	    )

	if __name__ == "__main__":
	    # Serve on all network interfaces, applying the custom theme and CSS.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "theme": theme,
	        "css": css,
	    }
	    demo.launch(**launch_options)