KittenTTS / app.py
Javedalam's picture
Create app.py
fdc5dab verified
import gradio as gr
import numpy as np
import os
from huggingface_hub import snapshot_download
from kittentts import KittenTTS
# Sample rate (Hz) reported to Gradio together with the generated waveform.
# NOTE(review): assumed to match the model's native output rate - confirm.
SR = 24000

# Download the full model repo and auto-discover the ONNX file in its root.
repo_dir = snapshot_download("KittenML/kitten-tts-nano-0.2")

# Sort the matches: os.listdir() returns entries in arbitrary order, so
# without sorting the selected file could differ between runs if the repo
# ever ships more than one .onnx file.
onnx_files = sorted(
    os.path.join(repo_dir, f) for f in os.listdir(repo_dir) if f.endswith(".onnx")
)
if not onnx_files:
    # Fail with an actionable message instead of a bare IndexError below.
    raise FileNotFoundError(f"No .onnx model file found in {repo_dir!r}")
MODEL_PATH = onnx_files[0]

# NOTE(review): confirm that KittenTTS() accepts a local .onnx path rather
# than a repo id - this cannot be verified from this file.
tts = KittenTTS(MODEL_PATH)
# Voice identifiers offered in the "Voice" dropdown.
VOICES = ["expr-voice-2-f", "expr-voice-3-m", "expr-voice-4-f"]

# Clickable example rows; each row is [text, voice, speed], matching the
# order of the input components the examples are bound to.
EXAMPLES = [
    [
        "Small models can sound natural without giant cloud systems.",
        VOICES[0],
        1.0,
    ],
    [
        "This demo runs a tiny expressive TTS model on CPU only.",
        VOICES[2],
        1.05,
    ],
    [
        "Most AI stacks are bloated. This one is not.",
        VOICES[1],
        0.9,
    ],
]
def synthesize(text, voice, speed):
    """Generate speech for ``text`` and return it as a Gradio numpy audio value.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only input
            produces no audio.
        voice: Voice identifier forwarded to ``tts.generate``.
        speed: Speaking-speed value; coerced to ``float`` before use.

    Returns:
        A ``(SR, audio)`` tuple where ``audio`` is a float32 NumPy array,
        or ``None`` when there is nothing to synthesize.
    """
    # The text value may arrive as None rather than "", so rule that out
    # before calling .strip() to avoid an AttributeError.
    if text is None or not text.strip():
        return None
    waveform = tts.generate(text, voice=voice, speed=float(speed))
    return SR, np.asarray(waveform, dtype=np.float32)
# Build the demo page: input controls in the wider left column, the audio
# player in the right column, and example prompts underneath.
with gr.Blocks(title="KittenTTS – Tiny Expressive TTS") as demo:
    gr.Markdown(
        """
# 🐱 KittenTTS
**Tiny, expressive Text-to-Speech (~15M params, CPU-only)**
Use the controls below to explore voice and pacing.
"""
    )

    with gr.Row():
        # Left column: what to say, which voice to use, and how fast.
        with gr.Column(scale=2):
            text = gr.Textbox(
                label="Text",
                lines=4,
                placeholder="Type text or click an example below…",
            )
            voice = gr.Dropdown(choices=VOICES, value="expr-voice-2-f", label="Voice")
            speed = gr.Slider(
                minimum=0.7, maximum=1.3, value=1.0, step=0.05, label="Speaking speed"
            )
            generate = gr.Button("Generate Speech", variant="primary")

        # Right column: player for the synthesized audio.
        with gr.Column(scale=1):
            audio = gr.Audio(label="Output", type="numpy")

    gr.Markdown("### Example prompts")

    # The same three components feed both the examples and the button,
    # in the argument order synthesize() expects.
    controls = [text, voice, speed]
    gr.Examples(examples=EXAMPLES, inputs=controls)
    generate.click(fn=synthesize, inputs=controls, outputs=audio)

# Listen on all interfaces at port 7860 and show errors in the UI.
demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)