| | import gradio as gr |
| | import numpy as np |
| | import os |
| | from huggingface_hub import snapshot_download |
| | from kittentts import KittenTTS |
| |
|
# Sample rate (Hz) paired with every generated waveform handed to Gradio.
# NOTE(review): presumably the model's native output rate — confirm against
# the KittenTTS documentation.
SR = 24000

# Download the model repository from the Hugging Face Hub (or reuse the local
# cache) and get the directory containing the snapshot.
repo_dir = snapshot_download("KittenML/kitten-tts-nano-0.2")

# os.listdir() yields entries in arbitrary order; sort so the chosen model file
# is deterministic if the repository ever contains more than one .onnx file.
onnx_files = sorted(
    os.path.join(repo_dir, f) for f in os.listdir(repo_dir) if f.endswith(".onnx")
)
if not onnx_files:
    # Fail with a message that names the problem instead of a bare IndexError.
    raise FileNotFoundError(f"No .onnx model file found in {repo_dir}")
MODEL_PATH = onnx_files[0]

# NOTE(review): KittenTTS is given a local .onnx path here — confirm the
# installed kittentts version accepts a file path rather than a Hub model id.
tts = KittenTTS(MODEL_PATH)
| |
|
# Voice identifiers offered in the UI dropdown.
VOICES = ["expr-voice-2-f", "expr-voice-3-m", "expr-voice-4-f"]

# Pre-filled example rows; each row is [text, voice, speaking speed], in the
# same order as the input components they populate.
EXAMPLES = [
    [
        "Small models can sound natural without giant cloud systems.",
        "expr-voice-2-f",
        1.0,
    ],
    [
        "This demo runs a tiny expressive TTS model on CPU only.",
        "expr-voice-4-f",
        1.05,
    ],
    [
        "Most AI stacks are bloated. This one is not.",
        "expr-voice-3-m",
        0.9,
    ],
]
| |
|
def synthesize(text, voice, speed):
    """Generate speech for *text* and return it as a ``(rate, samples)`` pair.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only input
            produces no audio.
        voice: Voice identifier forwarded to ``tts.generate``.
        speed: Speaking-speed value; coerced to ``float`` before use.

    Returns:
        ``None`` when there is nothing to speak, otherwise a tuple
        ``(SR, samples)`` where ``samples`` is a float32 NumPy array.
    """
    # The text value may arrive as None rather than "" (e.g. from a direct
    # API call); check falsiness first so that case means "no audio" instead
    # of raising AttributeError on .strip().
    if not text or not text.strip():
        return None

    audio = tts.generate(text, voice=voice, speed=float(speed))
    return SR, np.asarray(audio, dtype=np.float32)
| |
|
# Build the demo page: an intro, a row holding the input controls and the
# audio output, a set of clickable example prompts, and the button wiring.
# NOTE(review): the original indentation was lost; the nesting below is
# reconstructed from the code's syntax — confirm the layout matches intent.
with gr.Blocks(title="KittenTTS – Tiny Expressive TTS") as demo:
    gr.Markdown(
        """
        # 🐱 KittenTTS
        **Tiny, expressive Text-to-Speech (~15M params, CPU-only)**

        Use the controls below to explore voice and pacing.
        """
    )

    with gr.Row():
        # Wider column: text, voice, and speed inputs plus the trigger button.
        with gr.Column(scale=2):
            text = gr.Textbox(
                label="Text",
                lines=4,
                placeholder="Type text or click an example below…",
            )
            voice = gr.Dropdown(choices=VOICES, value="expr-voice-2-f", label="Voice")
            speed = gr.Slider(
                minimum=0.7,
                maximum=1.3,
                value=1.0,
                step=0.05,
                label="Speaking speed",
            )
            generate = gr.Button("Generate Speech", variant="primary")

        # Narrower column: the synthesized audio.
        with gr.Column(scale=1):
            audio = gr.Audio(label="Output", type="numpy")

    # The same three components receive example values and feed synthesize(),
    # in the order [text, voice, speed].
    input_components = [text, voice, speed]

    gr.Markdown("### Example prompts")
    gr.Examples(examples=EXAMPLES, inputs=input_components)

    generate.click(fn=synthesize, inputs=input_components, outputs=audio)

# Start the web server on port 7860, listening on 0.0.0.0; show_error=True is
# passed so errors are displayed in the UI.
demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)