import gradio as gr
import numpy as np
import os
from huggingface_hub import snapshot_download
from kittentts import KittenTTS

# Sample rate returned alongside every generated clip (see synthesize()).
SR = 24000

# Download the full model repo, then auto-discover the ONNX weights inside it.
repo_dir = snapshot_download("KittenML/kitten-tts-nano-0.2")
# Sorted so the chosen file is deterministic: os.listdir() order is arbitrary.
onnx_files = sorted(
    os.path.join(repo_dir, f) for f in os.listdir(repo_dir) if f.endswith(".onnx")
)
if not onnx_files:
    # Fail with an actionable message instead of a bare IndexError below.
    raise FileNotFoundError(f"No .onnx model file found in {repo_dir!r}")
MODEL_PATH = onnx_files[0]

# NOTE(review): confirm the installed kittentts release accepts a local .onnx
# path here; some releases may expect a Hugging Face repo id instead.
tts = KittenTTS(MODEL_PATH)

# Voice identifiers offered in the voice dropdown.
VOICES = ["expr-voice-2-f", "expr-voice-3-m", "expr-voice-4-f"]

# Clickable example rows; each row is ordered as [text, voice, speed].
EXAMPLES = [
    [
        "Small models can sound natural without giant cloud systems.",
        "expr-voice-2-f",
        1.0,
    ],
    [
        "This demo runs a tiny expressive TTS model on CPU only.",
        "expr-voice-4-f",
        1.05,
    ],
    [
        "Most AI stacks are bloated. This one is not.",
        "expr-voice-3-m",
        0.9,
    ],
]

def synthesize(text, voice, speed):
    """Generate speech for *text* and return it as a ``(rate, samples)`` pair.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only text
            produces no audio.
        voice: Voice identifier, passed through to ``tts.generate``.
        speed: Speaking-speed value; coerced to ``float`` before use.

    Returns:
        ``(SR, samples)`` where ``samples`` is a float32 NumPy array, or
        ``None`` when there is no text to synthesize.
    """
    # Guard against None as well as blank input (e.g. a programmatic call
    # that omits the text) so we return "no audio" instead of raising.
    if text is None or not text.strip():
        return None
    samples = tts.generate(text, voice=voice, speed=float(speed))
    samples = np.asarray(samples, dtype=np.float32)
    return SR, samples

with gr.Blocks(title="KittenTTS – Tiny Expressive TTS") as demo:
    # Page header.
    gr.Markdown(
        """
        # 🐱 KittenTTS  
        **Tiny, expressive Text-to-Speech (~15M params, CPU-only)**

        Use the controls below to explore voice and pacing.
        """
    )

    with gr.Row():
        # Wider left column: the three inputs plus the trigger button.
        with gr.Column(scale=2):
            text = gr.Textbox(
                label="Text", lines=4, placeholder="Type text or click an example below…"
            )
            # Default selection is the first entry of VOICES.
            voice = gr.Dropdown(label="Voice", choices=VOICES, value=VOICES[0])
            speed = gr.Slider(
                label="Speaking speed", minimum=0.7, maximum=1.3, step=0.05, value=1.0
            )
            generate = gr.Button("Generate Speech", variant="primary")

        # Narrower right column: the synthesized clip.
        with gr.Column(scale=1):
            audio = gr.Audio(label="Output", type="numpy")

    # Example rows are wired to the same three input controls.
    gr.Markdown("### Example prompts")
    gr.Examples(examples=EXAMPLES, inputs=[text, voice, speed])

    # Button click runs synthesize() and sends its result to the audio output.
    generate.click(fn=synthesize, inputs=[text, voice, speed], outputs=audio)

# Serve on all interfaces at port 7860, with show_error enabled.
demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)