# tiny-tts-demo / app.py
# backtracking's picture
# Upload app.py with huggingface_hub
# f499fee verified
import gradio as gr
import nltk
from tiny_tts import TinyTTS
# Download required NLTK data for g2p-en.
# Each resource is fetched on its own so that one failure does not skip the
# others. Failures are only reported, never fatal: startup stays best-effort.
for _resource in (
    "averaged_perceptron_tagger_eng",
    "averaged_perceptron_tagger",
    "cmudict",
):
    try:
        # nltk.download() signals most failures by returning False rather
        # than raising, so the return value has to be checked explicitly.
        if not nltk.download(_resource, quiet=True):
            print(f"NLTK download warning: could not download '{_resource}'")
    except Exception as e:
        print(f"NLTK download warning: {e}")
# Initialize the model once at import time (auto-downloads from HF Hub if
# needed). The resulting module-level `tts` object is the one that
# synthesize_audio() calls for every request; the two prints bracket the load
# so its start and completion are visible in the logs.
print("Initializing TinyTTS...")
tts = TinyTTS()
print("Model loaded successfully!")
def synthesize_audio(text, speed):
    """Synthesize ``text`` to a WAV file and return the path of that file.

    Args:
        text: Text to speak, as entered in the "Input Text" box.
        speed: Speed value from the slider, passed through to ``tts.speak``.

    Returns:
        Path of the generated WAV file, suitable for a ``gr.Audio`` output
        configured with ``type="filepath"``.

    Raises:
        gr.Error: If ``text`` is empty/blank or if synthesis fails. Raising
            (instead of returning an "Error: ..." string) matters because the
            return value is interpreted as a file path by the audio output.
    """
    import contextlib
    import os
    import tempfile

    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Use a unique file per call: a single fixed "output.wav" would be
    # overwritten whenever two requests overlap.
    fd, output_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # only the path is needed; tts.speak writes the file itself

    try:
        tts.speak(text, output_path=output_path, speaker="MALE", speed=speed)
    except Exception as e:
        # Do not leave an empty/partial file behind when synthesis fails.
        with contextlib.suppress(OSError):
            os.remove(output_path)
        raise gr.Error(f"Speech synthesis failed: {e}") from e
    return output_path
# Markdown rendered at the bottom of the page: a benchmark table comparing
# TinyTTS with other TTS engines, followed by explanatory notes.
# Blank lines separate the heading, paragraph, table and blockquote so that
# every Markdown renderer recognises the table as a table.
COMPARISON_TABLE = """
## ⚡ Comparison with Other TTS Engines

All numbers are **CPU-only** on the same Intel Core laptop. Text: *"The weather is nice today, and I feel very relaxed."*

| ENGINE | Params | Total (s) | Audio (s) | RTFx |
|:---|---:|---:|---:|---:|
| **TinyTTS (ONNX) 🚀** | **1.6M** | **0.092** | **4.88** | **~53x** |
| Piper (ONNX, 22kHz) | ~63M | 0.112 | 2.91 | ~26x |
| TinyTTS (PyTorch) | 1.6M | 0.272 | 4.88 | ~18x |
| KittenTTS nano | ~10M | 0.286 | 4.87 | ~17x |
| Supertonic (2-step) | ~82M | 0.249 | 3.69 | ~15x |
| Pocket-TTS | 100M | 0.928 | 3.68 | ~4x |
| Kokoro ONNX | 82M | 0.933 | 3.16 | ~3x |
| KittenTTS mini | ~25M | 2.047 | 4.17 | ~2x |

> **RTFx** = Audio Duration ÷ Synthesis Time (higher = faster).
> TinyTTS achieves the **best speed-to-size ratio**: only **1.6M params** / **3.4 MB** ONNX yet ~53× real-time at 44.1kHz.
"""
# Create Gradio interface.
# Layout: title and tagline, then a two-column row (inputs on the left, audio
# player on the right), example prompts, and the comparison table at the end.
with gr.Blocks(title="TinyTTS Demo", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🗣️ TinyTTS")
    gr.Markdown(
        "**Ultra-lightweight English Text-to-Speech — only 1.6M parameters, ~3.4 MB ONNX**\n\n"
        "This space runs on CPU efficiently and synthesizes high-quality 44.1kHz audio **~53× faster** than real-time."
    )

    with gr.Row():
        # Left column: everything the user controls.
        with gr.Column():
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter English text here...",
                value="The weather is nice today, and I feel very relaxed.",
                lines=4,
            )
            speed_slider = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Speed (1.0 = normal, >1 = faster, <1 = slower)",
            )
            submit_btn = gr.Button("🔊 Synthesize Speech", variant="primary")

        # Right column: the synthesized result. type="filepath" means the
        # handler is expected to return the path of an audio file.
        with gr.Column():
            audio_output = gr.Audio(label="Output Audio", type="filepath")

    # Example prompts: each row is [text, speed] and fills the two inputs.
    gr.Examples(
        examples=[
            ["The weather is nice today, and I feel very relaxed.", 1.0],
            ["TinyTTS has only one point six million parameters, making it extremely fast on CPUs.", 1.0],
            ["This is a speed test. Speaking at one and a half times the normal rate.", 1.5],
            ["Slow and steady wins the race. Let me speak more carefully.", 0.7],
        ],
        inputs=[text_input, speed_slider],
    )

    # Clicking the button runs synthesis and loads the result into the player.
    submit_btn.click(
        fn=synthesize_audio,
        inputs=[text_input, speed_slider],
        outputs=audio_output,
    )

    # Comparison table
    gr.Markdown(COMPARISON_TABLE)
# Start the web server only when this file is run as a script, not on import.
if __name__ == "__main__":
    # "0.0.0.0" makes the server listen on every network interface instead of
    # only on localhost.
    bind_address = "0.0.0.0"
    app.launch(server_name=bind_address)