# Kokoro82m Text-to-Speech

import gradio as gr
import os
import numpy as np
from tts_core import KokoroTTS
import time

# Initialize the TTS engine once at import time; the request handler below
# reuses this single module-level instance for every call.
tts_engine = KokoroTTS()

# CSS for styling the interface (class names are referenced through
# `elem_classes` and the raw HTML header/footer of the Gradio layout).
css = """
.container {
    max-width: 900px;
    margin: auto;
    padding-top: 1.5rem;
}
.title {
    text-align: center;
    color: #2C3E50;
}
.subtitle {
    text-align: center;
    color: #7F8C8D;
    margin-bottom: 2rem;
}
.footer {
    text-align: center;
    margin-top: 2rem;
    color: #7F8C8D;
    font-size: 0.9rem;
}
.settings-block {
    padding: 1rem;
    border-radius: 8px;
    background-color: #f8f9fa;
    margin-bottom: 1rem;
}
.voice-selector {
    margin-bottom: 1rem;
}
.advanced-settings {
    margin-top: 1rem;
}
.output-block {
    margin-top: 1.5rem;
}
"""

# Get all available voices as (display name, voice id) pairs.
# `us_english_voices` maps voice id -> display name; the pair is flipped so the
# display name comes first and the voice id second.
# NOTE(review): this relies on the dropdown accepting (label, value) tuples --
# confirm against the installed Gradio version.
voice_options = [
    (display_name, voice_id)
    for voice_id, display_name in tts_engine.us_english_voices.items()
]

def text_to_speech(text, voice, speed, add_pronunciation_guide):
    """
    Convert text to speech using the selected voice and settings.

    Args:
        text: Text to synthesize. ``None``, an empty string, or a
            whitespace-only string is rejected with a hint message.
        voice: Voice identifier; must be a key of
            ``tts_engine.us_english_voices``.
        speed: Speech speed multiplier; converted with ``float()`` before
            being handed to the engine.
        add_pronunciation_guide: When truthy, every occurrence of "Kokoro"
            in the text is wrapped in an inline pronunciation annotation
            before synthesis.

    Returns:
        A ``(audio, info)`` pair. ``audio`` is ``(sample_rate, audio_data)``
        on success and ``None`` otherwise. ``info`` is a human-readable
        status message. Failures are reported through ``info`` rather than
        raised, so the UI always receives a displayable result.
    """
    # Guard against a missing value as well as blank input: calling
    # .strip() on None would raise before any message could be returned.
    if text is None or not text.strip():
        return None, "Please enter some text to convert to speech."

    # Add pronunciation guide if requested
    if add_pronunciation_guide:
        # Add simple pronunciation guide for demonstration
        text = text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")

    # Generate speech
    try:
        start_time = time.time()
        # The engine also returns an output file path; only the in-memory
        # samples are needed here, so the path is deliberately ignored.
        _output_file, sample_rate, audio_data = tts_engine.generate_speech(
            text=text,
            voice=voice,
            speed=float(speed)
        )
        generation_time = time.time() - start_time

        # Create info message: audio duration (samples / rate), wall-clock
        # generation time, and the display name of the selected voice.
        info = f"✅ Generated audio ({len(audio_data)/sample_rate:.2f}s) in {generation_time:.2f}s using voice: {tts_engine.us_english_voices[voice]}"

        return (sample_rate, audio_data), info
    except Exception as e:
        # UI boundary: surface any engine failure as a message instead of
        # letting the exception propagate.
        return None, f"❌ Error generating speech: {str(e)}"

def create_demo():
    """Create the Gradio interface.

    Builds a two-column layout: text input, voice settings and the generate
    button on the left; audio player and status message on the right. The
    generate button and the example rows are both wired to
    ``text_to_speech``.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """

    with gr.Blocks(css=css) as demo:
        gr.HTML("""
        <div class="container">
            <h1 class="title">Kokoro82m Text-to-Speech</h1>
            <p class="subtitle">A CPU-optimized TTS application with all US English voices</p>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=2):
                # Text input area
                text_input = gr.Textbox(
                    label="Text to convert to speech",
                    placeholder="Enter text here...",
                    lines=10,
                    value="Kokoro is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient."
                )

                # Settings
                # gr.Group is used instead of gr.Box: Box was deprecated in
                # favour of Group and is no longer available in Gradio 4+.
                with gr.Group(elem_classes=["settings-block"]):
                    gr.Markdown("### Voice Settings")

                    # Voice selection
                    voice_selector = gr.Dropdown(
                        choices=voice_options,
                        value="af_heart",  # Default voice
                        label="Select Voice",
                        elem_classes=["voice-selector"]
                    )

                    with gr.Accordion("Advanced Settings", open=False, elem_classes=["advanced-settings"]):
                        speed_slider = gr.Slider(
                            minimum=0.5,
                            maximum=2.0,
                            value=1.0,
                            step=0.05,
                            label="Speech Speed"
                        )

                        pronunciation_checkbox = gr.Checkbox(
                            label="Add pronunciation guides for better quality",
                            value=False
                        )

                # Generate button
                generate_btn = gr.Button("Generate Speech", variant="primary")

            with gr.Column(scale=1):
                # Output audio; type="numpy" matches the (sample_rate, data)
                # tuple returned by text_to_speech.
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    elem_classes=["output-block"]
                )

                # Info message
                info_message = gr.Markdown("")

        # Set up event handlers
        generate_btn.click(
            fn=text_to_speech,
            inputs=[text_input, voice_selector, speed_slider, pronunciation_checkbox],
            outputs=[audio_output, info_message]
        )

        # Examples: each row is [text, voice id, speed, add pronunciation guide],
        # in the same order as the `inputs` list below.
        examples = [
            ["Hello, my name is Kokoro. I am a text-to-speech model with 82 million parameters.", "af_heart", 1.0, True],
            ["The quick brown fox jumps over the lazy dog. This is a sample of my voice.", "af_bella", 1.0, False],
            ["Welcome to the world of artificial intelligence and text-to-speech technology.", "am_fenrir", 1.0, False],
            ["This is an example of a slower speaking rate for more deliberate speech.", "af_nicole", 0.8, False],
            ["This is an example of a faster speaking rate for more energetic speech.", "am_michael", 1.3, False]
        ]

        # NOTE(review): with cache_examples=True Gradio is expected to
        # pre-compute the example outputs by calling `fn` -- confirm the
        # resulting startup cost is acceptable for the deployment target.
        gr.Examples(
            examples=examples,
            inputs=[text_input, voice_selector, speed_slider, pronunciation_checkbox],
            outputs=[audio_output, info_message],
            fn=text_to_speech,
            cache_examples=True
        )

        gr.HTML("""
        <div class="footer">
            <p>Powered by Kokoro82m TTS - An open-weight TTS model with 82 million parameters</p>
            <p>CPU-optimized for efficient inference on limited resources</p>
        </div>
        """)

    return demo

# Create the demo at import time so `demo` exists as a module-level name
# whether this file is imported or executed directly.
demo = create_demo()

# For Hugging Face Spaces
# The server is started only when the file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()