| |
| |
| |
import os
from typing import Optional

import gradio as gr
|
|
| |
| MODELS = { |
| "nari-labs/Dia-1.6B": "Dia-1.6B", |
| "hexgrad/Kokoro-82M": "Kokoro-82M", |
| "sesame/csm-1b": "csm-1b", |
| "SparkAudio/Spark-TTS-0.5B": "Spark-TTS-0.5B", |
| "canopylabs/orpheus-3b-0.1-ft": "Orpheus-3b-0.1-ft", |
| "SWivid/F5-TTS": "F5-TTS", |
| "Zyphra/Zonos-v0.1-transformer": "Zonos-v0.1-transformer", |
| "coqui/XTTS-v2": "XTTS-v2", |
| "HKUSTAudio/Llasa-3B": "Llasa-3B", |
| "amphion/MaskGCT": "MaskGCT", |
| "OuteAI/Llama-OuteTTS-1.0-1B": "Llama-OuteTTS-1.0-1B", |
| "ByteDance/MegaTTS3": "MegaTTS3", |
| "Kyutai/Kyutai-TTS" |
| } |
|
|
| |
# Qualitative ratings per model on three axes; rendered with get_rating_emoji.
# Fix: a comma was missing between the MegaTTS3 and Kyutai-TTS entries,
# which is a SyntaxError inside a dict literal.
MODEL_RATINGS = {
    "nari-labs/Dia-1.6B": {"naturalness": "Good", "intelligibility": "Moderate", "controllability": "Good"},
    "hexgrad/Kokoro-82M": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
    "sesame/csm-1b": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Good"},
    "SparkAudio/Spark-TTS-0.5B": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Moderate"},
    "canopylabs/orpheus-3b-0.1-ft": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Moderate"},
    "SWivid/F5-TTS": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Good"},
    "Zyphra/Zonos-v0.1-transformer": {"naturalness": "Good", "intelligibility": "Moderate", "controllability": "Excellent"},
    "coqui/XTTS-v2": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
    "HKUSTAudio/Llasa-3B": {"naturalness": "Excellent", "intelligibility": "Good", "controllability": "Moderate"},
    "amphion/MaskGCT": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
    "OuteAI/Llama-OuteTTS-1.0-1B": {"naturalness": "Moderate", "intelligibility": "Moderate", "controllability": "Moderate"},
    "ByteDance/MegaTTS3": {"naturalness": "Good", "intelligibility": "Good", "controllability": "Moderate"},
    "Kyutai/Kyutai-TTS": {"naturalness": "Good", "intelligibility": "Good", "controllability": "Moderate"},
}
|
|
| |
# One-line marketing blurb per model, shown on each model card.
# Fix: a comma was missing between the MegaTTS3 and Kyutai-TTS entries,
# which is a SyntaxError inside a dict literal.
# NOTE(review): Kyutai-TTS reuses MegaTTS3's blurb verbatim — looks like a
# copy-paste placeholder; confirm the intended description with the author.
MODEL_DESCRIPTIONS = {
    "nari-labs/Dia-1.6B": "Expressive conversational voice with moderate quality",
    "hexgrad/Kokoro-82M": "Lightweight powerhouse with excellent clarity",
    "sesame/csm-1b": "High-quality synthesis with excellent naturalness",
    "SparkAudio/Spark-TTS-0.5B": "Efficient model with excellent performance",
    "canopylabs/orpheus-3b-0.1-ft": "Fine-tuned large model with superior quality",
    "SWivid/F5-TTS": "Advanced flow-based synthesis with top ratings",
    "Zyphra/Zonos-v0.1-transformer": "Highly controllable transformer-based model",
    "coqui/XTTS-v2": "Multi-lingual excellence with proven performance",
    "HKUSTAudio/Llasa-3B": "Large-scale audio synthesis model",
    "amphion/MaskGCT": "Masked generative modeling approach",
    "OuteAI/Llama-OuteTTS-1.0-1B": "LLM-based TTS with moderate performance",
    "ByteDance/MegaTTS3": "Industrial-grade TTS solution",
    "Kyutai/Kyutai-TTS": "Industrial-grade TTS solution",
}
|
|
| |
# Directory holding one sub-folder per model (folder name = slugged repo id).
SAMPLES_DIR = "samples"
# File name each model's pre-generated sample clip is expected to have.
CLIP_NAME = "generated-audio.wav"


# The single sentence every model synthesized, so samples are directly comparable.
TEST_PROMPT = "Hello, this is a universal test sentence. Can the advanced Zylophonic system clearly articulate this and express a hint of excitement? The quick brown fox certainly hopes so!"
|
|
def repo_to_slug(repo: str) -> str:
    """Turn a Hugging Face ``org/name`` repo id into a folder-safe slug."""
    return "_".join(repo.split("/"))
|
|
def get_rating_emoji(rating: str) -> str:
    """Map a qualitative rating to a traffic-light emoji (orange for anything below Good)."""
    badges = {"Excellent": "🟢", "Good": "🟡"}
    return badges.get(rating, "🟠")
|
|
def get_audio_path(repo: str) -> Optional[str]:
    """Return the sample-clip path for *repo*, or ``None`` when the file is absent.

    Fix: the original annotated the return as ``str`` even though it
    deliberately returns ``None`` for a missing sample.
    """
    audio_path = os.path.join(SAMPLES_DIR, repo_to_slug(repo), CLIP_NAME)
    # Only hand a real path to gr.Audio; callers use None to show a placeholder.
    return audio_path if os.path.isfile(audio_path) else None
|
|
def filter_models(search_term: str):
    """Return the repo ids whose id or display name contains *search_term*.

    An empty or whitespace-only query matches every model.
    """
    query = search_term.lower().strip()
    if not query:
        return list(MODELS.keys())

    matches = []
    for repo, display in MODELS.items():
        if query in repo.lower() or query in display.lower():
            matches.append(repo)
    return matches
|
|
def create_model_card(repo: str) -> str:
    """Create a formatted HTML model card with name, description and ratings.

    Fix: the original computed ``description`` and ``ratings`` (and the file
    defines ``get_rating_emoji``) but never rendered them, so the card showed
    only the model name.
    """
    display_name = MODELS[repo]
    description = MODEL_DESCRIPTIONS.get(repo, "High-quality TTS model")
    ratings = MODEL_RATINGS.get(repo, {})

    # One emoji-badged line per rating axis, e.g. "🟢 Naturalness: Excellent".
    rating_lines = "".join(
        f"<p style='margin: 4px 0;'>{get_rating_emoji(value)} "
        f"<strong>{axis.capitalize()}:</strong> {value}</p>"
        for axis, value in ratings.items()
    )

    card_html = f"""
    <div class="model-card" style="border: 1px solid #ddd; border-radius: 12px; padding: 20px; margin: 10px 0; background: white;">
        <h3 style="color: #2c3e50; margin-top: 0;">🎤 {display_name}</h3>
        <p style="color: #34495e;">{description}</p>
        {rating_lines}
    </div>
    """
    return card_html
|
|
| |
# Page-level stylesheet injected into gr.Blocks.
# Fix: #title used white text on a bright lime background
# (rgb(203, 255, 77)), making the banner illegible; use the dark heading
# color (#2c3e50) already used throughout the rest of this sheet.
custom_css = """
#title {
    text-align: center;
    background: rgb(203, 255, 77);
    color: #2c3e50;
    padding: 2rem;
    border-radius: 15px;
    margin-bottom: 2rem;
}

#intro-section {
    background: #f8f9fa;
    color: #2c3e50;
    padding: 1.5rem;
    border-radius: 10px;
    margin: 1rem 0;
    border-left: 4px solid rgb(0, 72, 10);
}

#intro-section h2,
#intro-section h3 {
    color: #2c3e50;
}

#intro-section p {
    color: #34495e;
}

#intro-section ul li {
    color: #34495e;
}

#intro-section .mission-text {
    color: #667eea !important;
    font-weight: bold;
    text-align: center;
}

#intro-section strong {
    color: #2c3e50 !important;
}

#intro-section em {
    color: #2c3e50 !important;
}

#intro-section .mission-text strong {
    color: #667eea !important;
}

#test-prompt {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 1.5rem;
    border-radius: 10px;
    text-align: center;
    margin: 1rem 0;
}

.model-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
    gap: 1rem;
    margin: 1rem 0;
}

#footer {
    text-align: center;
    padding: 2rem;
    color: #666;
    border-top: 1px solid #eee;
    margin-top: 2rem;
}

/* make all the text in our white‐background cards dark */
.model-grid .gr-html * {
    color: #2c3e50 !important;
}

.model-card {
    background: white;
    color: #2c3e50 !important;
    border: 1px solid #ddd;
    border-radius: 12px;
    padding: 20px;
    margin: 10px 0;
}

"""
|
|
| |
def create_interface():
    """Assemble and return the Gradio Blocks app for the TTS model gallery.

    Layout: banner, static intro/findings HTML, a search row, then one
    card + audio player per entry in MODELS. Searching toggles component
    visibility in place rather than rebuilding the page.
    """
    with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.themes.Soft()) as demo:

        # Page banner.
        gr.HTML("""
        <div id="title">
            <h1>🎙️ Open-Source Text-to-Speech Model Gallery</h1>
        </div>
        """)

        # Static intro + key findings.
        # NOTE(review): the copy says "12 state-of-the-art TTS models" and
        # "...and 6 more incredible models!", but MODELS now holds 13
        # entries (Kyutai-TTS added) — the text needs updating.
        gr.HTML("""
        <div id="intro-section">
            <h3>🔬 Our Exciting Quest</h3>
            <p>We’re on a mission to help developers quickly find and compare the best open-source TTS models for their audio projects. In this gallery, you’ll find 12 state-of-the-art TTS models, each evaluated using a consistent test prompt to assess their synthesized speech.</p>

            <p><strong>Featured TTS Models:</strong></p>
            <ul>
                <li>🎭 <strong>Dia-1.6B</strong> - Expressive conversational voice</li>
                <li>🎪 <strong>Kokoro-82M</strong> - Lightweight powerhouse</li>
                <li>🎨 <strong>F5-TTS</strong> - Advanced flow-based synthesis</li>
                <li>🎵 <strong>XTTS-v2</strong> - Multi-lingual excellence</li>
                <li>🎼 <strong>MaskGCT</strong> - Masked generative modeling</li>
                <li>🎤 <strong>Llasa-3B</strong> - Large-scale audio synthesis</li>
                <li><em>...and 6 more incredible models!</em></li>
            </ul>

            <h3>🔑 Key Findings</h3>
            <ol>
                <li><strong>Outstanding Speech Quality</strong><br>
                Several models—namely <strong>Kokoro-82M</strong>, <strong>csm-1b</strong>, <strong>Spark-TTS-0.5B</strong>,
                <strong>Orpheus-3b-0.1-ft</strong>, <strong>F5-TTS</strong>, and <strong>Llasa-3B</strong> delivered exceptionally
                natural, clear, and realistic synthesized speech. Among these, <strong>csm-1b</strong> and <strong>F5-TTS</strong>
                stood out as the most well-rounded model as they combined good synthesized speech with solid controllability.
                </li>
                <li><strong>Superior Controllability</strong><br>
                <strong>Zonos-v0.1-transformer</strong> emerged as the best in fine-grained control: it offers detailed
                adjustments for prosody, emotion, and audio quality, making it ideal for use cases that demand precise
                voice modulation.
                </li>
                <li><strong>Performance vs. Footprint Trade-off</strong><br>
                Smaller models (e.g., <strong>Kokoro-82M</strong> at 82 million parameters) can still excel in many scenarios, especially when efficient inference or low VRAM usage is critical.
                Larger models (1 billion–3 billion+ parameters) generally offer more versatility—handling multilingual
                synthesis, zero-shot voice cloning, and multi-speaker generation but require heavier compute resources.
                </li>
                <li><strong>Special Notes on Multilingual & Cloning Capabilities</strong><br>
                <strong>Spark-TTS-0.5B</strong> and <strong>XTTS-v2</strong> excel at cross-lingual and zero-shot voice
                cloning, making them strong candidates for projects that need multi-language support or short-clip cloning.
                <strong>Llama-OuteTTS-1.0-1B</strong> and <strong>MegaTTS3</strong> also offer multilingual input handling,
                though they may require careful sampling parameter tuning to achieve optimal results.
                </li>
            </ol>

        </div>
        """)

        # Search controls: the textbox filters, the button clears the query.
        with gr.Row():
            search_box = gr.Textbox(
                label="🔍 Search Models",
                placeholder="Filter by name or family (e.g., 'F5', 'TTS', '3B')",
                value="",
                scale=3
            )
            clear_btn = gr.Button("Clear", scale=1)

        gr.Markdown("## 🎧 Model Gallery")

        # (repo, card component, audio-or-placeholder component) triples,
        # kept so the search handler can toggle each pair's visibility.
        model_components = []

        for repo, display_name in MODELS.items():
            with gr.Group():
                # HTML card for this model (see create_model_card).
                model_info = gr.HTML(create_model_card(repo))

                # Show a real player only when the sample file exists on disk;
                # otherwise render a visible "not found" placeholder.
                audio_path = get_audio_path(repo)
                if audio_path:
                    audio_player = gr.Audio(
                        value=audio_path,
                        label=f"🎵 {display_name} Audio Sample",
                        interactive=False
                    )
                else:
                    audio_player = gr.HTML(f"<p style='color: red;'>🤷♂️ Audio sample not found for {display_name}</p>")

                model_components.append((repo, model_info, audio_player))

        def update_visibility(search_term):
            """Return visibility updates — two per model (card, player) — in
            the same order as the flattened outputs list below."""
            filtered_repos = filter_models(search_term)
            updates = []

            for repo, model_info, audio_player in model_components:
                visible = repo in filtered_repos
                updates.extend([
                    gr.update(visible=visible),
                    gr.update(visible=visible)
                ])

            return updates

        # Re-filter on every change; outputs are flattened [card, player, ...].
        search_box.change(
            fn=update_visibility,
            inputs=[search_box],
            outputs=[comp for repo, model_info, audio_player in model_components for comp in [model_info, audio_player]]
        )

        # Emptying the box fires search_box.change, which restores all models.
        clear_btn.click(
            fn=lambda: "",
            outputs=[search_box]
        )

    return demo
|
|
| |
if __name__ == "__main__":
    # Build the gallery UI, then serve it with a public share link,
    # auto-open the browser, and surface server errors in the UI.
    app = create_interface()
    app.launch(share=True, inbrowser=True, show_error=True)