# TTS / app.py
# garyuzair's picture
# Upload 2 files
# 979c57e verified
import gradio as gr
import os
import numpy as np
from tts_core import KokoroTTS
import time
# Initialize the TTS engine once at import time; text_to_speech() below reads
# this module-level instance for every request instead of building its own.
tts_engine = KokoroTTS()
# CSS for styling the interface. Passed to gr.Blocks(css=...) in create_demo().
# The .container/.title/.subtitle/.footer classes are used by the raw HTML
# header and footer; .settings-block, .voice-selector, .advanced-settings and
# .output-block are attached to components through elem_classes.
css = """
.container {
max-width: 900px;
margin: auto;
padding-top: 1.5rem;
}
.title {
text-align: center;
color: #2C3E50;
}
.subtitle {
text-align: center;
color: #7F8C8D;
margin-bottom: 2rem;
}
.footer {
text-align: center;
margin-top: 2rem;
color: #7F8C8D;
font-size: 0.9rem;
}
.settings-block {
padding: 1rem;
border-radius: 8px;
background-color: #f8f9fa;
margin-bottom: 1rem;
}
.voice-selector {
margin-bottom: 1rem;
}
.advanced-settings {
margin-top: 1rem;
}
.output-block {
margin-top: 1.5rem;
}
"""
# Dropdown choices as (display name, voice id) pairs, flipped from the
# engine's voice id -> display name mapping.
voice_options = [
    (display_name, voice_id)
    for voice_id, display_name in tts_engine.us_english_voices.items()
]
def text_to_speech(text, voice, speed, add_pronunciation_guide):
    """Convert text to speech using the selected voice and settings.

    Args:
        text: Text to synthesize. ``None`` or whitespace-only input is
            rejected with a message and never reaches the engine.
        voice: Voice id passed to the engine; also used as a key into
            ``tts_engine.us_english_voices`` to show the voice's name.
        speed: Speech speed; coerced to ``float`` before the engine call.
        add_pronunciation_guide: When true, every occurrence of "Kokoro" in
            the text is wrapped in a Markdown-link style phoneme hint.

    Returns:
        An ``(audio, info)`` tuple. ``audio`` is ``(sample_rate, audio_data)``
        on success and ``None`` otherwise; ``info`` is a status message for
        the UI. Errors raised during generation are reported through ``info``
        rather than propagated.
    """
    # Guard against None as well as blank text so the UI gets a message
    # instead of an AttributeError from ``None.strip()``.
    if text is None or not text.strip():
        return None, "Please enter some text to convert to speech."

    # Add pronunciation guide if requested
    if add_pronunciation_guide:
        # Simple demonstration: attach IPA-like phonemes to the model's name.
        text = text.replace("Kokoro", "[Kokoro](/kˈOkəɹO/)")

    # Generate speech. This is the UI boundary, so any failure is turned into
    # a user-visible message rather than an unhandled exception.
    try:
        # perf_counter is monotonic, unlike time.time(), so the measured
        # duration cannot be skewed by system clock adjustments.
        started = time.perf_counter()
        _output_file, sample_rate, audio_data = tts_engine.generate_speech(
            text=text,
            voice=voice,
            speed=float(speed),
        )
        generation_time = time.perf_counter() - started

        # Fall back to the raw id so an unknown voice does not discard audio
        # that was already generated successfully.
        voice_name = tts_engine.us_english_voices.get(voice, voice)
        info = (
            f"✅ Generated audio ({len(audio_data)/sample_rate:.2f}s) "
            f"in {generation_time:.2f}s using voice: {voice_name}"
        )
        return (sample_rate, audio_data), info
    except Exception as e:
        return None, f"❌ Error generating speech: {e}"
# Static HTML rendered above the interactive controls.
_HEADER_HTML = """
<div class="container">
<h1 class="title">Kokoro82m Text-to-Speech</h1>
<p class="subtitle">A CPU-optimized TTS application with all US English voices</p>
</div>
"""

# Static HTML rendered below the examples.
_FOOTER_HTML = """
<div class="footer">
<p>Powered by Kokoro82m TTS - An open-weight TTS model with 82 million parameters</p>
<p>CPU-optimized for efficient inference on limited resources</p>
</div>
"""

# Example rows, in the same order as the event inputs:
# [text, voice id, speed, add pronunciation guide].
_EXAMPLES = [
    ["Hello, my name is Kokoro. I am a text-to-speech model with 82 million parameters.", "af_heart", 1.0, True],
    ["The quick brown fox jumps over the lazy dog. This is a sample of my voice.", "af_bella", 1.0, False],
    ["Welcome to the world of artificial intelligence and text-to-speech technology.", "am_fenrir", 1.0, False],
    ["This is an example of a slower speaking rate for more deliberate speech.", "af_nicole", 0.8, False],
    ["This is an example of a faster speaking rate for more energetic speech.", "am_michael", 1.3, False],
]


def create_demo():
    """Create the Gradio interface.

    Builds a ``gr.Blocks`` layout with a text box, voice dropdown, speed
    slider and pronunciation checkbox on the left and the audio player plus
    status message on the right, then wires the button and the cached
    examples to ``text_to_speech``.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    # gr.Box was removed in Gradio 4. Keep using it where it still exists so
    # older installs look the same, and fall back to gr.Group otherwise.
    settings_container = getattr(gr, "Box", gr.Group)

    with gr.Blocks(css=css) as demo:
        gr.HTML(_HEADER_HTML)

        with gr.Row():
            with gr.Column(scale=2):
                # Text input area
                text_input = gr.Textbox(
                    label="Text to convert to speech",
                    placeholder="Enter text here...",
                    lines=10,
                    value="Kokoro is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient.",
                )

                # Settings
                with settings_container(elem_classes=["settings-block"]):
                    gr.Markdown("### Voice Settings")

                    # Voice selection
                    voice_selector = gr.Dropdown(
                        choices=voice_options,
                        value="af_heart",  # Default voice
                        label="Select Voice",
                        elem_classes=["voice-selector"],
                    )

                    with gr.Accordion(
                        "Advanced Settings",
                        open=False,
                        elem_classes=["advanced-settings"],
                    ):
                        speed_slider = gr.Slider(
                            minimum=0.5,
                            maximum=2.0,
                            value=1.0,
                            step=0.05,
                            label="Speech Speed",
                        )
                        pronunciation_checkbox = gr.Checkbox(
                            label="Add pronunciation guides for better quality",
                            value=False,
                        )

                # Generate button
                generate_btn = gr.Button("Generate Speech", variant="primary")

            with gr.Column(scale=1):
                # Output audio
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    elem_classes=["output-block"],
                )
                # Info message
                info_message = gr.Markdown("")

        # The button and the examples drive the same function with the same
        # components, so the wiring is defined once and shared.
        event_inputs = [
            text_input,
            voice_selector,
            speed_slider,
            pronunciation_checkbox,
        ]
        event_outputs = [audio_output, info_message]

        # Set up event handlers
        generate_btn.click(
            fn=text_to_speech,
            inputs=event_inputs,
            outputs=event_outputs,
        )

        # Examples
        gr.Examples(
            examples=_EXAMPLES,
            inputs=event_inputs,
            outputs=event_outputs,
            fn=text_to_speech,
            cache_examples=True,
        )

        gr.HTML(_FOOTER_HTML)

    return demo
# Create the demo at import time so `demo` is available as a module-level
# name; launching is deferred to the guard below.
demo = create_demo()
# For Hugging Face Spaces
# Only start the server when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()