"""Gradio demo for KittenTTS.

Downloads the ``KittenML/kitten-tts-nano-0.2`` snapshot from the Hugging Face
Hub, picks an ONNX model file from it, and serves a small web UI where the
user chooses a voice and a speaking speed and gets synthesized audio back.
"""

import os
from typing import Optional, Tuple

import gradio as gr
import numpy as np
from huggingface_hub import snapshot_download
from kittentts import KittenTTS

# Sample rate reported to Gradio together with the waveform.
# NOTE(review): assumes the model emits 24 kHz audio - confirm against the
# kittentts documentation.
SR = 24000

# Download the full repo and auto-discover the ONNX model inside it.
repo_dir = snapshot_download("KittenML/kitten-tts-nano-0.2")
# Sorted so the selected model is deterministic: os.listdir() returns entries
# in arbitrary order.
onnx_files = sorted(
    os.path.join(repo_dir, f)
    for f in os.listdir(repo_dir)
    if f.endswith(".onnx")
)
if not onnx_files:
    # Fail with an actionable message instead of a bare IndexError below.
    raise FileNotFoundError(f"No .onnx model file found in {repo_dir!r}")
MODEL_PATH = onnx_files[0]

# NOTE(review): KittenTTS is handed a local .onnx path here; confirm that the
# installed kittentts version accepts a file path (rather than a Hub repo id).
tts = KittenTTS(MODEL_PATH)

VOICES = [
    "expr-voice-2-f",
    "expr-voice-3-m",
    "expr-voice-4-f",
]

# Rows follow the order of the UI inputs: [text, voice, speed].
EXAMPLES = [
    ["Small models can sound natural without giant cloud systems.", "expr-voice-2-f", 1.0],
    ["This demo runs a tiny expressive TTS model on CPU only.", "expr-voice-4-f", 1.05],
    ["Most AI stacks are bloated. This one is not.", "expr-voice-3-m", 0.9],
]


def synthesize(text: Optional[str], voice: str, speed: float) -> Optional[Tuple[int, np.ndarray]]:
    """Synthesize speech for *text* with the selected voice and speed.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only input
            produces no audio.
        voice: Voice identifier forwarded to ``tts.generate``.
        speed: Speaking-speed value; coerced to ``float`` before use.

    Returns:
        ``(SR, samples)`` with ``samples`` as a ``float32`` NumPy array, or
        ``None`` when there is nothing to synthesize.
    """
    # Guard against None as well as blank strings so that a missing value
    # clears the output instead of raising AttributeError on .strip().
    if not text or not text.strip():
        return None

    audio = tts.generate(text, voice=voice, speed=float(speed))
    audio = np.asarray(audio, dtype=np.float32)
    return SR, audio


with gr.Blocks(title="KittenTTS – Tiny Expressive TTS") as demo:
    gr.Markdown(
        """
        # 🐱 KittenTTS
        **Tiny, expressive Text-to-Speech (~15M params, CPU-only)**

        Use the controls below to explore voice and pacing.
        """
    )

    with gr.Row():
        with gr.Column(scale=2):
            text = gr.Textbox(
                label="Text",
                lines=4,
                placeholder="Type text or click an example below…",
            )
            voice = gr.Dropdown(
                choices=VOICES,
                value=VOICES[0],
                label="Voice",
            )
            speed = gr.Slider(
                minimum=0.7,
                maximum=1.3,
                value=1.0,
                step=0.05,
                label="Speaking speed",
            )
            generate = gr.Button("Generate Speech", variant="primary")

        with gr.Column(scale=1):
            audio = gr.Audio(label="Output", type="numpy")

    gr.Markdown("### Example prompts")
    gr.Examples(
        examples=EXAMPLES,
        inputs=[text, voice, speed],
    )

    generate.click(
        fn=synthesize,
        inputs=[text, voice, speed],
        outputs=audio,
    )


if __name__ == "__main__":
    # Only start the server when run as a script, so importing this module
    # (e.g. to reuse `demo` or `synthesize`) does not block on launch().
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)