File size: 6,019 Bytes
979c57e
a488285
979c57e
 
 
a488285
979c57e
 
a488285
979c57e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a488285
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import gradio as gr
import os
import numpy as np
from tts_core import KokoroTTS
import time

# Initialize the TTS engine once, at import time. This single module-level
# instance is shared by the voice list below and by text_to_speech().
tts_engine = KokoroTTS()

# CSS for styling the interface. It is passed to gr.Blocks(css=...) in
# create_demo(). The class names defined here are the ones referenced by the
# raw HTML header/footer snippets (container, title, subtitle, footer) and by
# the elem_classes values on the components (settings-block, voice-selector,
# advanced-settings, output-block).
css = """
.container {
    max-width: 900px;
    margin: auto;
    padding-top: 1.5rem;
}
.title {
    text-align: center;
    color: #2C3E50;
}
.subtitle {
    text-align: center;
    color: #7F8C8D;
    margin-bottom: 2rem;
}
.footer {
    text-align: center;
    margin-top: 2rem;
    color: #7F8C8D;
    font-size: 0.9rem;
}
.settings-block {
    padding: 1rem;
    border-radius: 8px;
    background-color: #f8f9fa;
    margin-bottom: 1rem;
}
.voice-selector {
    margin-bottom: 1rem;
}
.advanced-settings {
    margin-top: 1rem;
}
.output-block {
    margin-top: 1.5rem;
}
"""

# Build the dropdown choices from the engine's voice table. The table maps
# voice id -> display name; each choice is flipped to (display name, voice id)
# so the name is the first element and the id is the second.
voice_options = [
    (display_name, voice_id)
    for voice_id, display_name in tts_engine.us_english_voices.items()
]

def text_to_speech(text, voice, speed, add_pronunciation_guide):
    """
    Convert text to speech using the selected voice and settings.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            is rejected with a prompt message before the engine is called.
        voice: Voice identifier. Passed to the engine and used as a key into
            ``tts_engine.us_english_voices`` to obtain the display name.
        speed: Speech speed; converted with ``float()`` before being passed
            to the engine.
        add_pronunciation_guide: When truthy, every occurrence of "Kokoro"
            in ``text`` is replaced by a bracketed phonetic annotation.

    Returns:
        An ``(audio, message)`` pair. On success ``audio`` is the tuple
        ``(sample_rate, audio_data)`` and ``message`` summarises the run.
        On empty input or any failure ``audio`` is ``None`` and ``message``
        explains why. Exceptions from the engine are not propagated; they
        are reported through ``message``.
    """
    # Guard against None as well as blank strings so a cleared input box
    # yields the friendly prompt instead of an AttributeError.
    if not text or not text.strip():
        return None, "Please enter some text to convert to speech."

    # Add pronunciation guide if requested
    if add_pronunciation_guide:
        # Simple demonstration guide: attach a phonetic spelling to "Kokoro".
        # NOTE(review): presumably the engine understands this
        # [word](/phonemes/) syntax - confirm against tts_core.
        text = text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")

    # Generate speech
    try:
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments.
        start_time = time.perf_counter()
        _output_file, sample_rate, audio_data = tts_engine.generate_speech(
            text=text,
            voice=voice,
            speed=float(speed),
        )
        generation_time = time.perf_counter() - start_time

        # Fall back to the raw id so a missing display name cannot discard
        # audio that was generated successfully.
        voice_name = tts_engine.us_english_voices.get(voice, voice)

        # Create info message
        info = (
            f"✅ Generated audio ({len(audio_data)/sample_rate:.2f}s) "
            f"in {generation_time:.2f}s using voice: {voice_name}"
        )

        return (sample_rate, audio_data), info
    except Exception as e:
        # UI boundary: surface the failure as a message rather than crashing
        # the request handler.
        return None, f"❌ Error generating speech: {str(e)}"

def create_demo():
    """Create the Gradio interface.

    Builds a ``gr.Blocks`` app styled with the module-level ``css``:
    a header, a left column with the text box, voice dropdown, advanced
    settings (speed slider and pronunciation checkbox) and the generate
    button, a right column with the audio player and an info message,
    a set of cached examples, and a footer. The button and the examples
    both call ``text_to_speech``.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """

    with gr.Blocks(css=css) as demo:
        gr.HTML("""
        <div class="container">
            <h1 class="title">Kokoro82m Text-to-Speech</h1>
            <p class="subtitle">A CPU-optimized TTS application with all US English voices</p>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=2):
                # Text input area
                text_input = gr.Textbox(
                    label="Text to convert to speech",
                    placeholder="Enter text here...",
                    lines=10,
                    value="Kokoro is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient."
                )

                # Settings. gr.Group is used instead of gr.Box because
                # gr.Box was removed in Gradio 4, while gr.Group exists in
                # both 3.x and 4.x.
                with gr.Group(elem_classes=["settings-block"]):
                    gr.Markdown("### Voice Settings")

                    # Voice selection
                    voice_selector = gr.Dropdown(
                        choices=voice_options,
                        value="af_heart",  # Default voice
                        label="Select Voice",
                        elem_classes=["voice-selector"]
                    )

                    with gr.Accordion("Advanced Settings", open=False, elem_classes=["advanced-settings"]):
                        speed_slider = gr.Slider(
                            minimum=0.5,
                            maximum=2.0,
                            value=1.0,
                            step=0.05,
                            label="Speech Speed"
                        )

                        pronunciation_checkbox = gr.Checkbox(
                            label="Add pronunciation guides for better quality",
                            value=False
                        )

                # Generate button
                generate_btn = gr.Button("Generate Speech", variant="primary")

            with gr.Column(scale=1):
                # Output audio
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    elem_classes=["output-block"]
                )

                # Info message
                info_message = gr.Markdown("")

        # The button and the examples drive the same function with the same
        # components, so the wiring is defined once and shared.
        tts_inputs = [text_input, voice_selector, speed_slider, pronunciation_checkbox]
        tts_outputs = [audio_output, info_message]

        # Set up event handlers
        generate_btn.click(
            fn=text_to_speech,
            inputs=tts_inputs,
            outputs=tts_outputs
        )

        # Examples: each row is [text, voice id, speed, add pronunciation guide]
        examples = [
            ["Hello, my name is Kokoro. I am a text-to-speech model with 82 million parameters.", "af_heart", 1.0, True],
            ["The quick brown fox jumps over the lazy dog. This is a sample of my voice.", "af_bella", 1.0, False],
            ["Welcome to the world of artificial intelligence and text-to-speech technology.", "am_fenrir", 1.0, False],
            ["This is an example of a slower speaking rate for more deliberate speech.", "af_nicole", 0.8, False],
            ["This is an example of a faster speaking rate for more energetic speech.", "am_michael", 1.3, False]
        ]

        gr.Examples(
            examples=examples,
            inputs=tts_inputs,
            outputs=tts_outputs,
            fn=text_to_speech,
            cache_examples=True
        )

        gr.HTML("""
        <div class="footer">
            <p>Powered by Kokoro82m TTS - An open-weight TTS model with 82 million parameters</p>
            <p>CPU-optimized for efficient inference on limited resources</p>
        </div>
        """)

    return demo

# Build the interface at import time so `demo` exists as a module-level
# attribute even when this file is imported rather than executed.
demo = create_demo()

# Only start the server when run as a script.
# NOTE(review): the original comment says this is for Hugging Face Spaces -
# presumably the Space runs this file directly; confirm against the Space config.
if __name__ == "__main__":
    demo.launch()