|
|
import logging
import os
import time

import gradio as gr
import numpy as np

from tts_core import KokoroTTS
|
|
|
|
|
|
|
|
# One engine instance, created at import time and shared by the module-level
# voice list and by every text_to_speech() call.
tts_engine = KokoroTTS()
|
|
|
|
|
|
|
|
# Stylesheet handed to gr.Blocks(css=...). The class names below are the ones
# referenced by the HTML header/footer and by the components' elem_classes.
css = """
.container {
    max-width: 900px;
    margin: auto;
    padding-top: 1.5rem;
}
.title {
    text-align: center;
    color: #2C3E50;
}
.subtitle {
    text-align: center;
    color: #7F8C8D;
    margin-bottom: 2rem;
}
.footer {
    text-align: center;
    margin-top: 2rem;
    color: #7F8C8D;
    font-size: 0.9rem;
}
.settings-block {
    padding: 1rem;
    border-radius: 8px;
    background-color: #f8f9fa;
    margin-bottom: 1rem;
}
.voice-selector {
    margin-bottom: 1rem;
}
.advanced-settings {
    margin-top: 1rem;
}
.output-block {
    margin-top: 1.5rem;
}
"""
|
|
|
|
|
|
|
|
# (display name, voice id) pairs for the voice dropdown, built from the
# engine's id -> name mapping.
voice_options = [
    (label, voice_id)
    for voice_id, label in tts_engine.us_english_voices.items()
]
|
|
|
|
|
def text_to_speech(text, voice, speed, add_pronunciation_guide):
    """Convert text to speech using the selected voice and settings.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            is rejected with a message instead of being sent to the engine.
        voice: Voice id passed to ``tts_engine.generate_speech``; also looked
            up in ``tts_engine.us_english_voices`` for the status line.
        speed: Speaking-rate value; converted with ``float()`` before use.
        add_pronunciation_guide: When truthy, each "Kokoro" in ``text`` is
            replaced by a bracketed link carrying its phonetic spelling.

    Returns:
        A ``(audio, message)`` tuple. On success ``audio`` is
        ``(sample_rate, audio_data)`` and ``message`` is a status line.
        On empty input or any engine failure ``audio`` is ``None`` and
        ``message`` explains why.
    """
    # Guard against None as well as blank strings so the UI gets a message
    # rather than an AttributeError from None.strip().
    if not text or not text.strip():
        return None, "Please enter some text to convert to speech."

    if add_pronunciation_guide:
        text = text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")

    try:
        start_time = time.time()
        # The first element (an output file) is not used by the UI.
        _output_file, sample_rate, audio_data = tts_engine.generate_speech(
            text=text,
            voice=voice,
            speed=float(speed),
        )
        generation_time = time.time() - start_time

        # Fall back to the raw id so a missing display name cannot turn a
        # successful synthesis into an error.
        voice_label = tts_engine.us_english_voices.get(voice, voice)
        duration = len(audio_data) / sample_rate
        info = (
            f"✅ Generated audio ({duration:.2f}s) in {generation_time:.2f}s "
            f"using voice: {voice_label}"
        )
        return (sample_rate, audio_data), info
    except Exception as e:
        # UI boundary: report the failure to the user, but keep the traceback
        # in the log so it is not lost.
        logging.getLogger(__name__).exception("Speech generation failed")
        return None, f"❌ Error generating speech: {str(e)}"
|
|
|
|
|
def create_demo():
    """Create the Gradio interface.

    Builds a ``gr.Blocks`` page with a text box, voice dropdown, speed
    slider and pronunciation checkbox on the left, and the audio player plus
    a status line on the right. The "Generate Speech" button and the example
    rows both call ``text_to_speech`` with the same inputs and outputs.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(css=css) as demo:
        gr.HTML("""
        <div class="container">
            <h1 class="title">Kokoro82m Text-to-Speech</h1>
            <p class="subtitle">A CPU-optimized TTS application with all US English voices</p>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
                    label="Text to convert to speech",
                    placeholder="Enter text here...",
                    lines=10,
                    value="Kokoro is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient.",
                )

                # gr.Group instead of gr.Box: Box no longer exists in
                # Gradio 4, while Group is available in 3.x and 4.x.
                with gr.Group(elem_classes=["settings-block"]):
                    gr.Markdown("### Voice Settings")

                    voice_selector = gr.Dropdown(
                        choices=voice_options,
                        value="af_heart",
                        label="Select Voice",
                        elem_classes=["voice-selector"],
                    )

                    with gr.Accordion(
                        "Advanced Settings",
                        open=False,
                        elem_classes=["advanced-settings"],
                    ):
                        speed_slider = gr.Slider(
                            minimum=0.5,
                            maximum=2.0,
                            value=1.0,
                            step=0.05,
                            label="Speech Speed",
                        )

                        pronunciation_checkbox = gr.Checkbox(
                            label="Add pronunciation guides for better quality",
                            value=False,
                        )

                generate_btn = gr.Button("Generate Speech", variant="primary")

            with gr.Column(scale=1):
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    elem_classes=["output-block"],
                )

                info_message = gr.Markdown("")

        # Defined once so the button and the examples cannot drift apart;
        # the order matches text_to_speech's parameters and return tuple.
        tts_inputs = [
            text_input,
            voice_selector,
            speed_slider,
            pronunciation_checkbox,
        ]
        tts_outputs = [audio_output, info_message]

        generate_btn.click(
            fn=text_to_speech,
            inputs=tts_inputs,
            outputs=tts_outputs,
        )

        # Each row: [text, voice id, speed, add_pronunciation_guide].
        examples = [
            ["Hello, my name is Kokoro. I am a text-to-speech model with 82 million parameters.", "af_heart", 1.0, True],
            ["The quick brown fox jumps over the lazy dog. This is a sample of my voice.", "af_bella", 1.0, False],
            ["Welcome to the world of artificial intelligence and text-to-speech technology.", "am_fenrir", 1.0, False],
            ["This is an example of a slower speaking rate for more deliberate speech.", "af_nicole", 0.8, False],
            ["This is an example of a faster speaking rate for more energetic speech.", "am_michael", 1.3, False],
        ]

        gr.Examples(
            examples=examples,
            inputs=tts_inputs,
            outputs=tts_outputs,
            fn=text_to_speech,
            cache_examples=True,
        )

        gr.HTML("""
        <div class="footer">
            <p>Powered by Kokoro82m TTS - An open-weight TTS model with 82 million parameters</p>
            <p>CPU-optimized for efficient inference on limited resources</p>
        </div>
        """)

    return demo
|
|
|
|
|
|
|
|
# Built at import time so `demo` is available as a module-level attribute.
demo = create_demo()

if __name__ == "__main__":
    # Start the Gradio server only when this file is run as a script.
    demo.launch()
|
|
|