| |
| |
| |
import os
from typing import Optional

import gradio as gr
|
|
| |
| MODELS = { |
| "nari-labs/Dia-1.6B": "Dia-1.6B", |
| "hexgrad/Kokoro-82M": "Kokoro-82M", |
| "sesame/csm-1b": "csm-1b", |
| "SparkAudio/Spark-TTS-0.5B": "Spark-TTS-0.5B", |
| "canopylabs/orpheus-3b-0.1-ft": "Orpheus-3b-0.1-ft", |
| "SWivid/F5-TTS": "F5-TTS", |
| "Zyphra/Zonos-v0.1-transformer": "Zonos-v0.1-transformer", |
| "coqui/XTTS-v2": "XTTS-v2", |
| "HKUSTAudio/Llasa-3B": "Llasa-3B", |
| "amphion/MaskGCT": "MaskGCT", |
| "OuteAI/Llama-OuteTTS-1.0-1B": "Llama-OuteTTS-1.0-1B", |
| "ByteDance/MegaTTS3": "MegaTTS3", |
| "Kyutai/Kyutai-TTS" |
| } |
|
|
| |
# Qualitative ratings per model on three axes; rendered with get_rating_emoji.
# Fix: a comma was missing between the MegaTTS3 and Kyutai-TTS entries,
# which is a SyntaxError inside a dict literal.
MODEL_RATINGS = {
    "nari-labs/Dia-1.6B": {"naturalness": "Good", "intelligibility": "Moderate", "controllability": "Good"},
    "hexgrad/Kokoro-82M": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
    "sesame/csm-1b": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Good"},
    "SparkAudio/Spark-TTS-0.5B": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Moderate"},
    "canopylabs/orpheus-3b-0.1-ft": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Moderate"},
    "SWivid/F5-TTS": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Good"},
    "Zyphra/Zonos-v0.1-transformer": {"naturalness": "Good", "intelligibility": "Moderate", "controllability": "Excellent"},
    "coqui/XTTS-v2": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
    "HKUSTAudio/Llasa-3B": {"naturalness": "Excellent", "intelligibility": "Good", "controllability": "Moderate"},
    "amphion/MaskGCT": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
    "OuteAI/Llama-OuteTTS-1.0-1B": {"naturalness": "Moderate", "intelligibility": "Moderate", "controllability": "Moderate"},
    "ByteDance/MegaTTS3": {"naturalness": "Good", "intelligibility": "Good", "controllability": "Moderate"},
    "Kyutai/Kyutai-TTS": {"naturalness": "Good", "intelligibility": "Good", "controllability": "Moderate"},
}
|
|
| |
# One-line marketing blurb per model, shown on each model card.
# Fix: a comma was missing between the MegaTTS3 and Kyutai-TTS entries,
# which is a SyntaxError inside a dict literal.
# NOTE(review): Kyutai-TTS reuses MegaTTS3's blurb verbatim — looks like a
# copy-paste placeholder; confirm the intended description with the author.
MODEL_DESCRIPTIONS = {
    "nari-labs/Dia-1.6B": "Expressive conversational voice with moderate quality",
    "hexgrad/Kokoro-82M": "Lightweight powerhouse with excellent clarity",
    "sesame/csm-1b": "High-quality synthesis with excellent naturalness",
    "SparkAudio/Spark-TTS-0.5B": "Efficient model with excellent performance",
    "canopylabs/orpheus-3b-0.1-ft": "Fine-tuned large model with superior quality",
    "SWivid/F5-TTS": "Advanced flow-based synthesis with top ratings",
    "Zyphra/Zonos-v0.1-transformer": "Highly controllable transformer-based model",
    "coqui/XTTS-v2": "Multi-lingual excellence with proven performance",
    "HKUSTAudio/Llasa-3B": "Large-scale audio synthesis model",
    "amphion/MaskGCT": "Masked generative modeling approach",
    "OuteAI/Llama-OuteTTS-1.0-1B": "LLM-based TTS with moderate performance",
    "ByteDance/MegaTTS3": "Industrial-grade TTS solution",
    "Kyutai/Kyutai-TTS": "Industrial-grade TTS solution",
}
|
|
| |
# Directory holding one sub-folder per model (folder name = slugged repo id).
SAMPLES_DIR = "samples"
# File name each model's pre-generated sample clip is expected to have.
CLIP_NAME = "generated-audio.wav"


# The single sentence every model synthesized, so samples are directly comparable.
TEST_PROMPT = "Hello, this is a universal test sentence. Can the advanced Zylophonic system clearly articulate this and express a hint of excitement? The quick brown fox certainly hopes so!"
|
|
def repo_to_slug(repo: str) -> str:
    """Turn a Hugging Face ``org/name`` repo id into a folder-safe slug."""
    return "_".join(repo.split("/"))
|
|
def get_rating_emoji(rating: str) -> str:
    """Map a qualitative rating to a traffic-light emoji (orange for anything below Good)."""
    badges = {"Excellent": "🟢", "Good": "🟡"}
    return badges.get(rating, "🟠")
|
|
def get_audio_path(repo: str) -> Optional[str]:
    """Return the sample-clip path for *repo*, or ``None`` when the file is absent.

    Fix: the original annotated the return as ``str`` even though it
    deliberately returns ``None`` for a missing sample.
    """
    audio_path = os.path.join(SAMPLES_DIR, repo_to_slug(repo), CLIP_NAME)
    # Only hand a real path to gr.Audio; callers use None to show a placeholder.
    return audio_path if os.path.isfile(audio_path) else None
|
|
def filter_models(search_term: str):
    """Return the repo ids whose id or display name contains *search_term*.

    An empty or whitespace-only query matches every model.
    """
    query = search_term.lower().strip()
    if not query:
        return list(MODELS.keys())

    matches = []
    for repo, display in MODELS.items():
        if query in repo.lower() or query in display.lower():
            matches.append(repo)
    return matches
|
|
def create_model_card(repo: str) -> str:
    """Create a formatted HTML model card with name, description and ratings.

    Fix: the original computed ``description`` and ``ratings`` (and the file
    defines ``get_rating_emoji``) but never rendered them, so the card showed
    only the model name.
    """
    display_name = MODELS[repo]
    description = MODEL_DESCRIPTIONS.get(repo, "High-quality TTS model")
    ratings = MODEL_RATINGS.get(repo, {})

    # One emoji-badged line per rating axis, e.g. "🟢 Naturalness: Excellent".
    rating_lines = "".join(
        f"<p style='margin: 4px 0;'>{get_rating_emoji(value)} "
        f"<strong>{axis.capitalize()}:</strong> {value}</p>"
        for axis, value in ratings.items()
    )

    card_html = f"""
    <div class="model-card" style="border: 1px solid #ddd; border-radius: 12px; padding: 20px; margin: 10px 0; background: white;">
        <h3 style="color: #2c3e50; margin-top: 0;">🎤 {display_name}</h3>
        <p style="color: #34495e;">{description}</p>
        {rating_lines}
    </div>
    """
    return card_html
|
|
| |
# Page-level stylesheet injected into gr.Blocks.
# Fix: #title used white text on a bright lime background
# (rgb(203, 255, 77)), making the banner illegible; use the dark heading
# color (#2c3e50) already used throughout the rest of this sheet.
custom_css = """
#title {
    text-align: center;
    background: rgb(203, 255, 77);
    color: #2c3e50;
    padding: 2rem;
    border-radius: 15px;
    margin-bottom: 2rem;
}

#intro-section {
    background: #f8f9fa;
    color: #2c3e50;
    padding: 1.5rem;
    border-radius: 10px;
    margin: 1rem 0;
    border-left: 4px solid rgb(0, 72, 10);
}

#intro-section h2,
#intro-section h3 {
    color: #2c3e50;
}

#intro-section p {
    color: #34495e;
}

#intro-section ul li {
    color: #34495e;
}

#intro-section .mission-text {
    color: #667eea !important;
    font-weight: bold;
    text-align: center;
}

#intro-section strong {
    color: #2c3e50 !important;
}

#intro-section em {
    color: #2c3e50 !important;
}

#intro-section .mission-text strong {
    color: #667eea !important;
}

#test-prompt {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 1.5rem;
    border-radius: 10px;
    text-align: center;
    margin: 1rem 0;
}

.model-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
    gap: 1rem;
    margin: 1rem 0;
}

#footer {
    text-align: center;
    padding: 2rem;
    color: #666;
    border-top: 1px solid #eee;
    margin-top: 2rem;
}

/* make all the text in our white‐background cards dark */
.model-grid .gr-html * {
    color: #2c3e50 !important;
}

.model-card {
    background: white;
    color: #2c3e50 !important;
    border: 1px solid #ddd;
    border-radius: 12px;
    padding: 20px;
    margin: 10px 0;
}

"""
|
|
| |
def create_interface():
    """Assemble and return the Gradio Blocks app for the TTS model gallery.

    Layout: banner, static intro/findings HTML, a search row, then one
    card + audio player per entry in MODELS. Searching toggles component
    visibility in place rather than rebuilding the page.
    """
    with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.themes.Soft()) as demo:

        # Page banner.
        gr.HTML("""
        <div id="title">
            <h1>🎙️ Open-Source Text-to-Speech Model Gallery</h1>
        </div>
        """)

        # Static intro + key findings.
        # NOTE(review): the copy says "12 state-of-the-art TTS models" and
        # "...and 6 more incredible models!", but MODELS now holds 13
        # entries (Kyutai-TTS added) — the text needs updating.
        gr.HTML("""
        <div id="intro-section">
            <h3>🔬 Our Exciting Quest</h3>
            <p>We’re on a mission to help developers quickly find and compare the best open-source TTS models for their audio projects. In this gallery, you’ll find 12 state-of-the-art TTS models, each evaluated using a consistent test prompt to assess their synthesized speech.</p>

            <p><strong>Featured TTS Models:</strong></p>
            <ul>
                <li>🎭 <strong>Dia-1.6B</strong> - Expressive conversational voice</li>
                <li>🎪 <strong>Kokoro-82M</strong> - Lightweight powerhouse</li>
                <li>🎨 <strong>F5-TTS</strong> - Advanced flow-based synthesis</li>
                <li>🎵 <strong>XTTS-v2</strong> - Multi-lingual excellence</li>
                <li>🎼 <strong>MaskGCT</strong> - Masked generative modeling</li>
                <li>🎤 <strong>Llasa-3B</strong> - Large-scale audio synthesis</li>
                <li><em>...and 6 more incredible models!</em></li>
            </ul>

            <h3>🔑 Key Findings</h3>
            <ol>
                <li><strong>Outstanding Speech Quality</strong><br>
                Several models—namely <strong>Kokoro-82M</strong>, <strong>csm-1b</strong>, <strong>Spark-TTS-0.5B</strong>,
                <strong>Orpheus-3b-0.1-ft</strong>, <strong>F5-TTS</strong>, and <strong>Llasa-3B</strong> delivered exceptionally
                natural, clear, and realistic synthesized speech. Among these, <strong>csm-1b</strong> and <strong>F5-TTS</strong>
                stood out as the most well-rounded model as they combined good synthesized speech with solid controllability.
                </li>
                <li><strong>Superior Controllability</strong><br>
                <strong>Zonos-v0.1-transformer</strong> emerged as the best in fine-grained control: it offers detailed
                adjustments for prosody, emotion, and audio quality, making it ideal for use cases that demand precise
                voice modulation.
                </li>
                <li><strong>Performance vs. Footprint Trade-off</strong><br>
                Smaller models (e.g., <strong>Kokoro-82M</strong> at 82 million parameters) can still excel in many scenarios, especially when efficient inference or low VRAM usage is critical.
                Larger models (1 billion–3 billion+ parameters) generally offer more versatility—handling multilingual
                synthesis, zero-shot voice cloning, and multi-speaker generation but require heavier compute resources.
                </li>
                <li><strong>Special Notes on Multilingual & Cloning Capabilities</strong><br>
                <strong>Spark-TTS-0.5B</strong> and <strong>XTTS-v2</strong> excel at cross-lingual and zero-shot voice
                cloning, making them strong candidates for projects that need multi-language support or short-clip cloning.
                <strong>Llama-OuteTTS-1.0-1B</strong> and <strong>MegaTTS3</strong> also offer multilingual input handling,
                though they may require careful sampling parameter tuning to achieve optimal results.
                </li>
            </ol>

        </div>
        """)

        # Search controls: the textbox filters, the button clears the query.
        with gr.Row():
            search_box = gr.Textbox(
                label="🔍 Search Models",
                placeholder="Filter by name or family (e.g., 'F5', 'TTS', '3B')",
                value="",
                scale=3
            )
            clear_btn = gr.Button("Clear", scale=1)

        gr.Markdown("## 🎧 Model Gallery")

        # (repo, card component, audio-or-placeholder component) triples,
        # kept so the search handler can toggle each pair's visibility.
        model_components = []

        for repo, display_name in MODELS.items():
            with gr.Group():
                # HTML card for this model (see create_model_card).
                model_info = gr.HTML(create_model_card(repo))

                # Show a real player only when the sample file exists on disk;
                # otherwise render a visible "not found" placeholder.
                audio_path = get_audio_path(repo)
                if audio_path:
                    audio_player = gr.Audio(
                        value=audio_path,
                        label=f"🎵 {display_name} Audio Sample",
                        interactive=False
                    )
                else:
                    audio_player = gr.HTML(f"<p style='color: red;'>🤷♂️ Audio sample not found for {display_name}</p>")

                model_components.append((repo, model_info, audio_player))

        def update_visibility(search_term):
            """Return visibility updates — two per model (card, player) — in
            the same order as the flattened outputs list below."""
            filtered_repos = filter_models(search_term)
            updates = []

            for repo, model_info, audio_player in model_components:
                visible = repo in filtered_repos
                updates.extend([
                    gr.update(visible=visible),
                    gr.update(visible=visible)
                ])

            return updates

        # Re-filter on every change; outputs are flattened [card, player, ...].
        search_box.change(
            fn=update_visibility,
            inputs=[search_box],
            outputs=[comp for repo, model_info, audio_player in model_components for comp in [model_info, audio_player]]
        )

        # Emptying the box fires search_box.change, which restores all models.
        clear_btn.click(
            fn=lambda: "",
            outputs=[search_box]
        )

    return demo
|
|
| |
if __name__ == "__main__":
    # Build the gallery UI, then serve it with a public share link,
    # auto-open the browser, and surface server errors in the UI.
    app = create_interface()
    app.launch(share=True, inbrowser=True, show_error=True)