"""Kokoro-82M TTS demo built on the StyleTTS 2 architecture.

A Gradio front-end that sends text to the Hugging Face Inference API
for hexgrad/Kokoro-82M and plays back the returned WAV audio.
"""

import json
from io import BytesIO

import gradio as gr
import numpy as np
import requests
import scipy.io.wavfile as wavfile

# Voice database: category -> {voice_id: human-readable description}.
# NOTE(review): headings below advertise "54 voices" but only 27 are
# defined here — presumably a subset of the full Kokoro voice pack.
VOICES = {
    "American Female": {
        "af_heart": "Heart - Warm & Friendly",
        "af_bella": "Bella - Elegant & Smooth",
        "af_nicole": "Nicole - Professional",
        "af_aoede": "Aoede - Cheerful",
        "af_kore": "Kore - Gentle",
        "af_sarah": "Sarah - Clear",
        "af_nova": "Nova - Modern",
        "af_sky": "Sky - Light",
        "af_alloy": "Alloy - Versatile",
        "af_jessica": "Jessica - Natural",
        "af_river": "River - Calm",
    },
    "American Male": {
        "am_michael": "Michael - Deep & Authoritative",
        "am_fenrir": "Fenrir - Strong",
        "am_puck": "Puck - Playful",
        "am_echo": "Echo - Resonant",
        "am_eric": "Eric - Professional",
        "am_liam": "Liam - Friendly",
        "am_onyx": "Onyx - Rich",
        "am_adam": "Adam - Natural",
    },
    "British Female": {
        "bf_emma": "Emma - Refined",
        "bf_isabella": "Isabella - Elegant",
        "bf_alice": "Alice - Clear",
        "bf_lily": "Lily - Soft",
    },
    "British Male": {
        "bm_george": "George - Distinguished",
        "bm_fable": "Fable - Storyteller",
        "bm_lewis": "Lewis - Smooth",
        "bm_daniel": "Daniel - Professional",
    },
}


def get_voice_list():
    """Flatten VOICES into dropdown labels of the form "Desc (voice_id)"."""
    return [
        f"{desc} ({voice_id})"
        for voices in VOICES.values()
        for voice_id, desc in voices.items()
    ]


def generate_speech(text, voice_dropdown, speed):
    """Generate speech for *text* using Kokoro-82M via the HF Inference API.

    Parameters
    ----------
    text : str
        Text to synthesize.
    voice_dropdown : str
        Dropdown label, e.g. "Heart - Warm & Friendly (af_heart)".
    speed : float
        Playback speed multiplier (0.5-2.0 from the slider).

    Returns
    -------
    tuple
        ``(audio, status)`` where ``audio`` is ``None`` on failure or a
        ``(sample_rate, np.ndarray)`` pair matching ``gr.Audio(type="numpy")``.
    """
    if not text.strip():
        return None, "❌ Please enter some text"

    # "Heart - Warm & Friendly (af_heart)" -> "af_heart"
    voice_id = voice_dropdown.split("(")[-1].strip(")")

    try:
        API_URL = "https://api-inference.huggingface.co/models/hexgrad/Kokoro-82M"
        headers = {"Content-Type": "application/json"}
        payload = {
            "inputs": text,
            "parameters": {"voice": voice_id, "speed": speed},
        }
        # Timeout keeps the UI from hanging forever if the API stalls
        # (cold model loads can take ~30 s, so allow a generous budget).
        response = requests.post(API_URL, headers=headers, json=payload, timeout=90)

        if response.status_code == 200:
            # Decode the returned WAV bytes into (sample_rate, samples) so the
            # value matches gr.Audio(type="numpy"); raw bytes would not play.
            sample_rate, samples = wavfile.read(BytesIO(response.content))
            return (sample_rate, samples), f"✅ Generated with {voice_id} at {speed}x speed"
        return None, f"❌ API Error: {response.status_code}"
    except Exception as e:
        # Surface the failure in the status box instead of crashing the app.
        return None, f"❌ Error: {str(e)}"


# Build the Gradio interface.
with gr.Blocks(title="Kokoro-82M TTS - 54 Voices", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎙️ Kokoro-82M Text-to-Speech

        **82 Million Parameters • 54 Premium Voices • StyleTTS 2 Architecture**

        Choose from American & British voices with unique characteristics!
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 🎭 Voice Selection")
            voice_list = get_voice_list()
            voice_selector = gr.Dropdown(
                choices=voice_list,
                value=voice_list[0],
                # Report the actual number of voices defined above.
                label=f"Choose Voice ({len(voice_list)} options)",
                interactive=True,
            )

            gr.Markdown("### ⚙️ Settings")
            speed = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.05,
                label="Speed",
                interactive=True,
            )

            gr.Markdown(
                """
                ### 🌟 Voice Categories
                - 🇺🇸 **American Female**: 11 voices
                - 🇺🇸 **American Male**: 8 voices
                - 🇬🇧 **British Female**: 4 voices
                - 🇬🇧 **British Male**: 4 voices
                """
            )

        with gr.Column(scale=2):
            gr.Markdown("### 📝 Text Input")
            text_input = gr.Textbox(
                lines=5,
                placeholder=(
                    "Enter your text here... Kokoro-82M supports natural "
                    "prosody and emotion!"
                ),
                value=(
                    "Welcome to Kokoro-82M! Choose from 54 premium voices "
                    "powered by StyleTTS 2."
                ),
                label="Text to synthesize",
            )

            generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
            status_text = gr.Textbox(label="Status", interactive=False)
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy",
                interactive=False,
            )

            gr.Markdown(
                """
                ### 📊 Model Information
                - **Model**: Kokoro-82M
                - **Architecture**: StyleTTS 2 + ISTFTNet
                - **Parameters**: 82 Million
                - **License**: Apache 2.0
                - **Training**: Few hundred hours of permissive data
                """
            )

    # Wire the button to the synthesis function.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_selector, speed],
        outputs=[audio_output, status_text],
    )

    gr.Markdown(
        """
        ---
        **Note**: This uses Hugging Face Inference API. First generation may
        take 20-30 seconds for model loading. Subsequent generations are
        faster (~2-5 seconds).
        """
    )


if __name__ == "__main__":
    demo.launch()