"""Gradio demo for TinyTTS.

Importing this module downloads the NLTK data used for grapheme-to-phoneme
conversion, instantiates the TinyTTS model, and builds the Gradio UI
(``app``). Running it as a script launches the web server.
"""

import gradio as gr
import nltk
from tiny_tts import TinyTTS

# NLTK resources required by g2p-en.
_NLTK_RESOURCES = (
    "averaged_perceptron_tagger_eng",
    "averaged_perceptron_tagger",
    "cmudict",
)

# Download required NLTK data for g2p-en. Each resource is attempted
# independently so one failed download does not skip the others; failures
# are reported but never abort start-up (best effort).
for _resource in _NLTK_RESOURCES:
    try:
        nltk.download(_resource, quiet=True)
    except Exception as e:
        print(f"NLTK download warning: {e}")

# Initialize the model (auto-downloads from HF Hub if needed)
print("Initializing TinyTTS...")
tts = TinyTTS()
print("Model loaded successfully!")


def synthesize_audio(text: str, speed: float) -> str:
    """Synthesize ``text`` to a WAV file and return the file's path.

    Args:
        text: The text to speak. Must contain at least one non-whitespace
            character.
        speed: Speed value forwarded unchanged to ``tts.speak``; the UI
            labels 1.0 as normal, >1 as faster and <1 as slower.

    Returns:
        Path of the written audio file, suitable for a
        ``gr.Audio(type="filepath")`` output.

    Raises:
        gr.Error: If ``text`` is empty or synthesis fails. Gradio shows the
            message in the UI. Returning the message as a string instead
            would make the audio component treat it as a file path.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # NOTE(review): every request writes to the same file, so concurrent
    # requests could overwrite each other's output — confirm that the Gradio
    # queue serializes calls in the deployed configuration.
    output_path = "output.wav"
    try:
        tts.speak(text, output_path=output_path, speaker="MALE", speed=speed)
    except Exception as e:
        raise gr.Error(f"Speech synthesis failed: {e}") from e
    return output_path


COMPARISON_TABLE = """
## ⚡ Comparison with Other TTS Engines

All numbers are **CPU-only** on the same Intel Core laptop.
Text: *"The weather is nice today, and I feel very relaxed."*

| ENGINE | Params | Total (s) | Audio (s) | RTFx |
|:---|---:|---:|---:|---:|
| **TinyTTS (ONNX) 🚀** | **1.6M** | **0.092** | **4.88** | **~53x** |
| Piper (ONNX, 22kHz) | ~63M | 0.112 | 2.91 | ~26x |
| TinyTTS (PyTorch) | 1.6M | 0.272 | 4.88 | ~18x |
| KittenTTS nano | ~10M | 0.286 | 4.87 | ~17x |
| Supertonic (2-step) | ~82M | 0.249 | 3.69 | ~15x |
| Pocket-TTS | 100M | 0.928 | 3.68 | ~4x |
| Kokoro ONNX | 82M | 0.933 | 3.16 | ~3x |
| KittenTTS mini | ~25M | 2.047 | 4.17 | ~2x |

> **RTFx** = Audio Duration ÷ Synthesis Time (higher = faster).
> TinyTTS achieves the **best speed-to-size ratio**: only **1.6M params** / **3.4 MB** ONNX yet ~53× real-time at 44.1kHz.
"""

# Create Gradio interface
with gr.Blocks(title="TinyTTS Demo", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🗣️ TinyTTS")
    gr.Markdown(
        "**Ultra-lightweight English Text-to-Speech — only 1.6M parameters, ~3.4 MB ONNX**\n\n"
        "This space runs on CPU efficiently and synthesizes high-quality 44.1kHz audio **~53× faster** than real-time."
    )

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter English text here...",
                value="The weather is nice today, and I feel very relaxed.",
                lines=4,
            )
            speed_slider = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Speed (1.0 = normal, >1 = faster, <1 = slower)",
            )
            submit_btn = gr.Button("🔊 Synthesize Speech", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Output Audio", type="filepath")

    # Example prompts
    gr.Examples(
        examples=[
            ["The weather is nice today, and I feel very relaxed.", 1.0],
            ["TinyTTS has only one point six million parameters, making it extremely fast on CPUs.", 1.0],
            ["This is a speed test. Speaking at one and a half times the normal rate.", 1.5],
            ["Slow and steady wins the race. Let me speak more carefully.", 0.7],
        ],
        inputs=[text_input, speed_slider],
    )

    submit_btn.click(
        fn=synthesize_audio,
        inputs=[text_input, speed_slider],
        outputs=audio_output,
    )

    # Comparison table
    gr.Markdown(COMPARISON_TABLE)

if __name__ == "__main__":
    # Bind to all interfaces so the server is reachable from outside the host
    # or container it runs in.
    app.launch(server_name="0.0.0.0")