# Hugging Face Space listing artifact (status: Sleeping) — not part of the app code.
# --- Imports -----------------------------------------------------------------
# Standard library
import json
import os
import random
import shutil
import subprocess
import sys
import traceback
from datetime import datetime

# Third-party
import gradio as gr
import numpy as np
import phonemizer
import soundfile as sf
import spaces
import torch

# On Windows, espeak-ng is usually not on PATH; point phonemizer's
# EspeakWrapper at the library bundled with `espeakng_loader`.  Best-effort:
# on failure we log and fall back to whatever espeak install is visible.
# (Fix: the original imported `sys` twice.)
if sys.platform.startswith("win"):
    try:
        from phonemizer.backend.espeak.wrapper import EspeakWrapper
        import espeakng_loader

        EspeakWrapper.set_library(espeakng_loader.get_library_path())
    except Exception as e:
        print(f"[DEBUG] EspeakWrapper setup error: {e}")
# Cache of EspeakBackend instances keyed by language code.  Constructing a
# backend is comparatively expensive, and the original code rebuilt one on
# every call (including once per chunk during streaming).
_phonemizer_backends = {}

def get_phoneme(text, lang):
    """Convert *text* into an espeak phoneme string for language *lang*.

    Returns the phonemized string, or None when phonemization fails (the
    error is logged and the traceback printed, matching the original
    best-effort contract callers rely on).
    """
    try:
        print(f"[DEBUG] Getting phoneme for text: {text[:50]}... | lang: {lang}")
        backend = _phonemizer_backends.get(lang)
        if backend is None:
            backend = phonemizer.backend.EspeakBackend(
                language=lang,
                preserve_punctuation=True,
                with_stress=True,
                language_switch='remove-flags'
            )
            _phonemizer_backends[lang] = backend
        result = backend.phonemize([text])[0]
        print(f"[DEBUG] Phoneme result: {result[:100]}...")
        return result
    except Exception as e:
        print(f"[DEBUG] Phoneme error: {e}")
        traceback.print_exc()
        return None
def split_text_into_sentences(text, max_chars=200):
    """Split *text* into chunks of at most ~max_chars characters for streaming.

    Sentences (delimited by '.', '!' or '?' followed by whitespace) are
    greedily packed into chunks.  A single sentence longer than *max_chars*
    becomes its own chunk rather than being cut mid-sentence.

    Fixes over the original: empty or whitespace-only input now returns []
    instead of [""], and empty sentence fragments are dropped.
    """
    import re

    # Split after sentence-ending punctuation; drop empty fragments.
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text) if s]
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_chars:
            current_chunk += sentence + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    # Flush the trailing chunk, but never emit an empty one.
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
# --- Repository and model setup ----------------------------------------------
# Clone the StyleTTS2-lite repo (inference code, weights, preset voice clips)
# from the Hugging Face Hub on first run; later runs reuse the local checkout.
repo_url = "https://huggingface.co/dangtr0408/StyleTTS2-lite"
repo_dir = "StyleTTS2-lite"
if not os.path.exists(repo_dir):
    print(f"[DEBUG] Cloning repository from {repo_url}")
    # NOTE(review): the git return code is not checked — a failed clone only
    # surfaces later as an ImportError on `inference`.
    subprocess.run(["git", "clone", repo_url, repo_dir])
else:
    print(f"[DEBUG] Repository already exists at {repo_dir}")
# Make the checkout importable so `from inference import StyleTTS2` resolves.
sys.path.append(os.path.abspath(repo_dir))
from inference import StyleTTS2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"[DEBUG] Using device: {device}")
# Paths inside the cloned repo: model config, weights, and preset voice audio.
config_path = os.path.join(repo_dir, "Models", "config.yaml")
models_path = os.path.join(repo_dir, "Models", "inference", "model.pth")
voice_path = os.path.join(repo_dir, "Audio")
print(f"[DEBUG] Config path: {config_path}")
print(f"[DEBUG] Models path: {models_path}")
print(f"[DEBUG] Voice path: {voice_path}")
# Load the TTS model once at import time; keep it in eval mode on `device`.
model = StyleTTS2(config_path, models_path).eval().to(device)
print(f"[DEBUG] Model loaded successfully")
# Create directory for custom uploaded audio (reference clips users upload).
custom_audio_dir = "custom_reference_audio"
os.makedirs(custom_audio_dir, exist_ok=True)
print(f"[DEBUG] Custom audio directory: {custom_audio_dir}")
# Extended example texts with categories.
# Maps a display category -> list of sample prompts shown in the
# "Example Text Library" accordion and used by the randomize button.
eg_texts = {
    "Creative & Narrative": [
        "Beneath layers of bureaucracy and forgotten policies, the school still held a quiet magicβwhispers of chalk dust, scuffed floors, and dreams once declared aloud in voices full of belief.",
        "He had never believed in fate, but when their paths crossed in the middle of a thunderstorm under a flickering streetlight, even his rational mind couldn't deny the poetic timing.",
        "In a distant galaxy orbiting a dying star, a species of sentient machines debates whether to intervene in the fate of a nearby organic civilization on the brink of collapse.",
        "The ancient temple walls, once vibrant with murals, now bore the weathered marks of centuries, yet even in decay, they whispered stories that modern minds struggled to fully comprehend.",
    ],
    "Technical & Informative": [
        "Technological advancements in artificial intelligence have not only accelerated the pace of automation but have also raised critical questions about ethics, job displacement, and the future role of human creativity.",
        "Every algorithm reflects its designer's worldview, no matter how neutral it appears, and therein lies the paradox of objectivity in machine learning: pure logic still casts a human shadow.",
        "The process of photosynthesis converts light energy into chemical energy, enabling plants to produce glucose from carbon dioxide and water while releasing oxygen as a byproduct.",
    ],
    "Conversational": [
        "Hey there! I hope you're having a wonderful day. I just wanted to check in and see how things are going with that project we discussed last week.",
        "You know what? I think we should grab coffee sometime soon. It's been way too long since we caught up properly.",
        "I completely understand where you're coming from, and I appreciate you sharing that with me. Let's figure this out together.",
    ],
    "Dramatic & Suspenseful": [
        "The engine sputtered twice before giving in completely, leaving them stranded on a desolate mountain road with no reception, dwindling supplies, and a storm brewing over the ridge to the west.",
        "The museum guard never expected the sculpture to move, but at precisely midnight, its eyes blinked, and its lips curled into a knowing smile, as if awakening from centuries of silence.",
        "Time slowed as the coin spun in the air, glinting with a brilliance far beyond its monetary value, carrying with it the weight of a decision neither of them wanted to make.",
    ],
    "Poetic & Reflective": [
        "The sound of rain on the tin roof reminded him of summers long past, when the world was smaller, days were longer, and time moved like honey down a warm spoon.",
        "While standing at the edge of the quiet lake, Maria couldn't help but wonder how many untold stories were buried beneath its still surface, reflecting the sky like a perfect mirror.",
        "As the solar eclipse reached totality, the temperature dropped, the birds went silent, and for a few seconds, the world stood still beneath an alien, awe-inspiring sky.",
    ]
}
# Preset voices: display label -> filename inside the repo's Audio/ folder.
# (Labels contain flag/gender emoji; rendered here as scraped.)
voice_map = {
    'πΊπΈ πΊ Heart β€οΈ': '1_heart.wav',
    'πΊπΈ πΊ Bella π₯': '2_belle.wav',
    'πΊπΈ πΊ Kore': '3_kore.wav',
    'πΊπΈ πΊ Sarah': '4_sarah.wav',
    'πΊπΈ πΊ Nova': '5_nova.wav',
    'πΊπΈ πΊ Sky': '6_sky.wav',
    'πΊπΈ πΊ Alloy': '7_alloy.wav',
    'πΊπΈ πΊ Jessica': '8_jessica.wav',
    'πΊπΈ πΊ River': '9_river.wav',
    'πΊπΈ πΉ Michael': '10_michael.wav',
    'πΊπΈ πΉ Fenrir': '11_fenrir.wav',
    'πΊπΈ πΉ Puck': '12_puck.wav',
    'πΊπΈ πΉ Echo': '13_echo.wav',
    'πΊπΈ πΉ Eric': '14_eric.wav',
    'πΊπΈ πΉ Liam': '15_liam.wav',
    'πΊπΈ πΉ Onyx': '16_onyx.wav',
    'πΊπΈ πΉ Santa': '17_santa.wav',
    'πΊπΈ πΉ Adam': '18_adam.wav',
}
# (label, absolute-ish path) pairs consumed by the gr.Dropdown choices.
voice_choices = [
    (label, os.path.join(voice_path, filename))
    for label, filename in voice_map.items()
]
print(f"[DEBUG] Voice choices created: {len(voice_choices)} voices")
for label, path in voice_choices[:3]:
    print(f"[DEBUG] Sample voice: {label} -> {path}")
# Streaming inference function
def generate_stream(text_prompt, reference_paths, speed, denoise, avg_style, stabilize, seed, progress=gr.Progress()):
    """Generator that yields (sample_rate, np.float32 array) audio chunks.

    The text is split into sentence-sized chunks; the reference style is
    extracted once and reused for every chunk.  Yields None on invalid input
    or on a fatal error (the streaming gr.Audio output consumes the stream).
    Sample rate is fixed at 24000 Hz, matching the non-streaming path.
    """
    try:
        print(f"\n[DEBUG] ===== STREAMING GENERATION START =====")
        print(f"[DEBUG] Text prompt: {text_prompt[:100]}...")
        print(f"[DEBUG] Reference path: {reference_paths}")
        print(f"[DEBUG] Speed: {speed}, Denoise: {denoise}")
        # Guard: empty prompt.
        if not text_prompt or text_prompt.strip() == "":
            print(f"[DEBUG] Error: Empty text prompt")
            yield None
            return
        # Guard: missing or nonexistent reference audio file.
        if not reference_paths or not os.path.exists(reference_paths):
            print(f"[DEBUG] Error: Invalid reference path")
            yield None
            return
        # Set seed for reproducibility (-1 means "leave RNG state alone").
        if seed != -1:
            torch.manual_seed(seed)
            np.random.seed(seed)
            print(f"[DEBUG] Seed set to: {seed}")
        # Split text into chunks for streaming
        text_chunks = split_text_into_sentences(text_prompt, max_chars=200)
        print(f"[DEBUG] Split into {len(text_chunks)} chunks")
        speaker = {
            "path": reference_paths,
            "speed": speed
        }
        progress(0.1, desc="Extracting voice styles...")
        # Extract styles once (reused for every chunk below).
        with torch.no_grad():
            styles = model.get_styles(speaker, denoise, avg_style)
        print(f"[DEBUG] Styles extracted")
        first_chunk = True
        total_chunks = len(text_chunks)
        for idx, chunk in enumerate(text_chunks, 1):
            # Per-chunk try: one failed chunk is skipped, the stream continues.
            try:
                progress_val = 0.1 + (0.8 * idx / total_chunks)
                progress(progress_val, desc=f"Generating chunk {idx}/{total_chunks}...")
                print(f"[DEBUG] Processing chunk {idx}/{total_chunks}: {chunk[:50]}...")
                with torch.no_grad():
                    # Get phonemes for this chunk
                    phonemes = get_phoneme(text=chunk, lang="en-us")
                    if phonemes is None:
                        print(f"[DEBUG] Warning: Phoneme processing failed for chunk {idx}")
                        continue
                    # Generate audio for this chunk.  The literal 18 is a
                    # model.generate() step/parameter count — TODO confirm its
                    # meaning against the StyleTTS2-lite inference API.
                    audio_chunk = model.generate(phonemes, styles, stabilize, 18)
                # Scrub NaNs, peak-normalize to [-1, 1].
                audio_chunk = np.nan_to_num(audio_chunk)
                max_abs = np.max(np.abs(audio_chunk))
                if max_abs > 0:
                    audio_chunk /= max_abs
                else:
                    audio_chunk = np.zeros_like(audio_chunk)
                audio_chunk = np.clip(audio_chunk, -1, 1)
                print(f"[DEBUG] Generated chunk {idx}: {len(audio_chunk)} samples")
                # Yield the audio chunk
                yield (24000, audio_chunk.astype(np.float32))
                # NOTE(review): silence is inserted only once, after the FIRST
                # chunk — not between every pair of chunks as the comment
                # suggests.  Confirm whether that is intended.
                if first_chunk and total_chunks > 1:
                    first_chunk = False
                    silence = np.zeros(int(24000 * 0.1), dtype=np.float32)  # 0.1 second silence
                    yield (24000, silence)
                    print(f"[DEBUG] Added silence separator")
            except Exception as e:
                print(f"[DEBUG] Error processing chunk {idx}: {str(e)}")
                traceback.print_exc()
                continue
        progress(1.0, desc="Complete!")
        print(f"[DEBUG] ===== STREAMING GENERATION COMPLETE =====\n")
    except Exception as e:
        # Fatal (pre-loop or non-chunk) error: log and end the stream.
        error_message = traceback.format_exc()
        print(f"[DEBUG] ===== STREAMING ERROR =====")
        print(f"[DEBUG] Error: {str(e)}")
        print(f"[DEBUG] Traceback:\n{error_message}")
        print(f"[DEBUG] ===== END ERROR =====\n")
        yield None
# Non-streaming inference function (original)
def main(text_prompt, reference_paths, speed, denoise, avg_style, stabilize, seed, progress=gr.Progress()):
    """Synthesize the whole prompt in one pass.

    Returns ((24000, np.float32 waveform), status_message) on success, or
    (None, error_message) on any validation failure or exception.
    """
    try:
        print(f"\n[DEBUG] ===== GENERATION START =====")
        print(f"[DEBUG] Text prompt: {text_prompt[:100]}...")
        print(f"[DEBUG] Reference path: {reference_paths}")
        print(f"[DEBUG] Speed: {speed}, Denoise: {denoise}")
        print(f"[DEBUG] Avg style: {avg_style}, Stabilize: {stabilize}, Seed: {seed}")
        # Input validation: prompt present, reference selected and on disk.
        if not text_prompt or text_prompt.strip() == "":
            print(f"[DEBUG] Error: Empty text prompt")
            return None, "β Error: Please enter text to generate speech."
        if not reference_paths:
            print(f"[DEBUG] Error: No reference path")
            return None, "β Error: Please select a reference voice or upload your own audio."
        # Check if reference file exists
        if not os.path.exists(reference_paths):
            print(f"[DEBUG] Error: Reference file does not exist: {reference_paths}")
            return None, f"β Error: Reference file not found: {reference_paths}"
        print(f"[DEBUG] Reference file exists: {os.path.exists(reference_paths)}")
        # Set seed for reproducibility (-1 means "leave RNG state alone").
        if seed != -1:
            torch.manual_seed(seed)
            np.random.seed(seed)
            print(f"[DEBUG] Seed set to: {seed}")
        progress(0.1, desc="Initializing...")
        speaker = {
            "path": reference_paths,
            "speed": speed
        }
        print(f"[DEBUG] Speaker config: {speaker}")
        progress(0.3, desc="Processing phonemes...")
        with torch.no_grad():
            phonemes = get_phoneme(text=text_prompt, lang="en-us")
            if phonemes is None:
                print(f"[DEBUG] Error: Phoneme processing failed")
                return None, "β Error: Failed to process phonemes."
            print(f"[DEBUG] Phonemes processed successfully")
            progress(0.5, desc="Extracting voice styles...")
            print(f"[DEBUG] Getting styles from model...")
            styles = model.get_styles(speaker, denoise, avg_style)
            print(f"[DEBUG] Styles extracted: {type(styles)}")
            progress(0.7, desc="Generating audio...")
            print(f"[DEBUG] Generating audio with model...")
            # The literal 18 is a model.generate() step/parameter count —
            # TODO confirm its meaning against the StyleTTS2-lite inference API.
            r = model.generate(phonemes, styles, stabilize, 18)
            print(f"[DEBUG] Audio generated: shape={r.shape if hasattr(r, 'shape') else len(r)}")
        progress(0.9, desc="Finalizing...")
        # Scrub NaNs, peak-normalize to [-1, 1].
        r = np.nan_to_num(r)
        max_abs = np.max(np.abs(r))
        if max_abs > 0:
            r /= max_abs
        else:
            r = np.zeros_like(r)
        r = np.clip(r, -1, 1)
        print(f"[DEBUG] Audio normalized")
        # Calculate audio duration (output sample rate is 24 kHz).
        duration = len(r) / 24000
        print(f"[DEBUG] Audio duration: {duration:.2f}s")
        progress(1.0, desc="Complete!")
        print(f"[DEBUG] ===== GENERATION COMPLETE =====\n")
        return (24000, r.astype(np.float32)), f"β Audio generated successfully! Duration: {duration:.2f}s | Device: {device} | Seed: {seed if seed != -1 else 'Random'}"
    except Exception as e:
        # Any failure is surfaced to the status textbox with the traceback.
        error_message = traceback.format_exc()
        print(f"[DEBUG] ===== GENERATION ERROR =====")
        print(f"[DEBUG] Error type: {type(e).__name__}")
        print(f"[DEBUG] Error message: {str(e)}")
        print(f"[DEBUG] Full traceback:\n{error_message}")
        print(f"[DEBUG] ===== END ERROR =====\n")
        return None, f"β Error: {str(e)}\n\n{error_message}"
def handle_custom_audio_upload(audio_file, audio_source):
    """Validate an uploaded reference clip and copy it into the custom dir.

    Returns a 3-tuple (reference_path, preview_path, status_message); the
    path slots are None whenever the upload is rejected.
    """
    try:
        print(f"[DEBUG] handle_custom_audio_upload called")
        print(f"[DEBUG] Audio file: {audio_file}")
        print(f"[DEBUG] Audio source: {audio_source}")
        # Guard clauses: wrong source mode, or nothing uploaded.
        if audio_source != "custom":
            print(f"[DEBUG] Audio source is not custom, ignoring upload")
            return None, None, "β οΈ Please select 'Custom Upload' as audio source first."
        if audio_file is None:
            print(f"[DEBUG] No audio file provided")
            return None, None, "β οΈ Please upload an audio file."
        # Only accept common audio container extensions.
        valid_extensions = ['.wav', '.mp3', '.flac', '.ogg', '.m4a']
        _, file_ext = os.path.splitext(audio_file)
        file_ext = file_ext.lower()
        if file_ext not in valid_extensions:
            return None, None, f"β Invalid file format. Supported: {', '.join(valid_extensions)}"
        # Timestamped copy so repeated uploads never collide.
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        custom_path = os.path.join(custom_audio_dir, f"custom_ref_{stamp}{file_ext}")
        shutil.copy2(audio_file, custom_path)
        print(f"[DEBUG] Custom audio saved to: {custom_path}")
        # Decode the copy to prove it is readable and check its length.
        try:
            audio_data, sample_rate = sf.read(custom_path)
            duration = len(audio_data) / sample_rate
            print(f"[DEBUG] Audio validated: {duration:.2f}s @ {sample_rate}Hz")
            if duration < 1.0:
                os.remove(custom_path)
                return None, None, "β Audio too short. Please upload at least 1 second of audio."
            if duration > 30.0:
                # Long clips are accepted, just warned about.
                return None, custom_path, f"β οΈ Audio is {duration:.1f}s long. Shorter clips (3-10s) work best, but we'll use it."
            return custom_path, custom_path, f"β Custom audio uploaded! Duration: {duration:.2f}s @ {sample_rate}Hz"
        except Exception as e:
            # Unreadable file: clean up the copy before reporting.
            if os.path.exists(custom_path):
                os.remove(custom_path)
            return None, None, f"β Failed to read audio file: {str(e)}"
    except Exception as e:
        error_msg = traceback.format_exc()
        print(f"[DEBUG] Upload error: {error_msg}")
        return None, None, f"β Upload failed: {str(e)}"
def load_example_voice(example_voices):
    """Resolve a preset voice path back to its display label and report it."""
    print(f"[DEBUG] load_example_voice called with: {example_voices}")
    print(f"[DEBUG] Type: {type(example_voices)}")
    if not example_voices:
        # Nothing selected in the dropdown.
        print(f"[DEBUG] No voice selected")
        return None, "β οΈ No voice selected."
    # Reverse lookup: match the selected file path against the voice map.
    voice_name = "Unknown"
    for label, filename in voice_map.items():
        if os.path.join(voice_path, filename) == example_voices:
            voice_name = label
            print(f"[DEBUG] Found matching voice: {voice_name}")
            break
    if voice_name == "Unknown":
        print(f"[DEBUG] Warning: Could not find voice name for path: {example_voices}")
    result = example_voices, f"β Loaded voice: {voice_name}"
    print(f"[DEBUG] Returning: {result}")
    return result
def switch_audio_source(source):
    """Toggle the UI between the preset-voice dropdown and the custom uploader.

    Returns updates for (example_voices, custom_audio_upload,
    reference_audios, status) in that order.
    """
    print(f"[DEBUG] Switching audio source to: {source}")
    if source == "preset":
        # Preset mode: show dropdown, hide uploader, restore the default voice.
        return (
            gr.update(visible=True),
            gr.update(visible=False),
            voice_choices[0][1],
            "β Using preset voices",
        )
    # Custom mode: hide dropdown, show uploader, clear the reference preview.
    return (
        gr.update(visible=False),
        gr.update(visible=True),
        None,
        "π€ Upload your own reference audio (WAV, MP3, FLAC, OGG, M4A)",
    )
def random_text(category):
    """Pick a random example prompt for *category*; return (text, status).

    "All Categories" draws from every example.  Fix over the original: an
    unknown or empty category no longer raises IndexError via
    random.choice([]) — it falls back to the full pool instead.
    """
    print(f"[DEBUG] random_text called with category: {category}")
    texts = eg_texts.get(category, [])
    if category == "All Categories" or not texts:
        texts = [text for group in eg_texts.values() for text in group]
        print(f"[DEBUG] Selected random text from all categories")
    else:
        print(f"[DEBUG] Selected random text from {category}")
    selected = random.choice(texts)
    print(f"[DEBUG] Selected text: {selected[:50]}...")
    return selected, f"β Randomized text from: {category}"
def clear_all():
    """Reset the prompt, both audio widgets, the reference, and the status."""
    print(f"[DEBUG] Clearing all fields")
    cleared = ("", None, None, None, "β All fields cleared.")
    return cleared
def estimate_duration(text):
    """Rough speech-length estimate assuming ~150 spoken words per minute."""
    word_count = len(text.split())
    estimated_seconds = word_count / 150 * 60
    print(f"[DEBUG] Estimated duration for {word_count} words: {estimated_seconds:.1f}s")
    return f"β±οΈ Estimated duration: ~{estimated_seconds:.1f}s"
def generate_random_seed():
    """Draw a fresh 31-bit seed for reproducible generation runs."""
    new_seed = random.randint(0, 2**31 - 1)
    print(f"[DEBUG] Generated random seed: {new_seed}")
    return new_seed, "β Random seed generated."
def voice_button_click(vp, vn):
    """Handle a gallery button: select voice *vn* whose clip lives at *vp*.

    Returns (dropdown_value, preview_value, status_message).
    """
    print(f"[DEBUG] Voice button clicked: {vn}")
    print(f"[DEBUG] Voice path: {vp}")
    status_msg = f"β Selected: {vn}"
    result = (vp, vp, status_msg)
    print(f"[DEBUG] Returning: {result}")
    return result
def text_button_click(t):
    """Handle a library button: load example text *t* into the prompt box."""
    print(f"[DEBUG] Text button clicked")
    print(f"[DEBUG] Text: {t[:50]}...")
    result = (t, "β Loaded example text")
    print(f"[DEBUG] Returning: {result}")
    return result
# Custom CSS for better styling.
# Injected via gr.Blocks(css=...): caps the page width, styles the gradient
# header banner, the audio-source radio card, and the "STREAMING" badge.
custom_css = """
#main_container {
    max-width: 1400px;
    margin: auto;
}
.header {
    text-align: center;
    padding: 20px;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border-radius: 10px;
    margin-bottom: 20px;
}
.header h1 {
    color: white;
    font-size: 2.5em;
    margin: 0;
}
.header p {
    color: #f0f0f0;
    font-size: 1.1em;
    margin-top: 10px;
}
.audio-source-radio {
    background: #f8f9fa;
    padding: 15px;
    border-radius: 8px;
    margin: 10px 0;
}
.streaming-badge {
    display: inline-block;
    background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
    color: white;
    padding: 5px 15px;
    border-radius: 20px;
    font-weight: bold;
    margin-left: 10px;
}
"""
# Gradio UI
# Declarative layout: header, two-column controls (text/settings on the left,
# voice selection and outputs on the right), collapsible galleries, then all
# event wiring, then the footer.  Indentation reconstructed from context-
# manager nesting — the scrape lost the original whitespace.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    # Page header banner (styled by .header / .streaming-badge in custom_css).
    gr.HTML("""
    <div class="header">
        <h1>ποΈ StyleTTS2-Lite Pro <span class="streaming-badge">π₯ STREAMING</span></h1>
        <p>Advanced Text-to-Speech Synthesis with Real-time Streaming</p>
        <p style="font-size: 0.9em; color: #ffeb3b;">β¨ Now with Custom Audio Upload & Real-time Streaming!</p>
    </div>
    """)
    # Feature list plus live debug info (device, voice path, voice count).
    gr.Markdown(f"""
    ### π Features
    - **18 Premium Voices** (9 Female, 9 Male)
    - **π₯ Real-time Streaming** - Hear audio as it generates
    - **π Custom Audio Upload** - Use your own voice!
    - **Advanced Controls** (Speed, Denoising, Style Averaging)
    - **Text Categories** (Creative, Technical, Conversational, and more)
    - **Reproducible Seeds** for consistent results
    ---
    ### π Debug Information
    - **Device**: {device}
    - **Voice Path**: {voice_path}
    - **Available Voices**: {len(voice_choices)}
    """)
    with gr.Row(elem_id="main_container"):
        # Left column: text input and generation settings.
        with gr.Column(scale=1):
            gr.Markdown("### π Text Input")
            text_category = gr.Dropdown(
                label="Text Category",
                choices=["All Categories"] + list(eg_texts.keys()),
                value="All Categories",
                interactive=True
            )
            text_prompt = gr.Textbox(
                label="Text Prompt",
                placeholder="Enter your text here or use the randomize button...",
                lines=8,
                max_lines=15
            )
            # Read-only field showing the estimated speech duration.
            text_info = gr.Textbox(
                label="Text Info",
                value="",
                interactive=False,
                lines=1
            )
            with gr.Row():
                random_text_button = gr.Button("π² Randomize Text", variant="secondary")
                clear_button = gr.Button("ποΈ Clear All", variant="stop")
            # Live duration estimate as the user types.
            text_prompt.change(fn=estimate_duration, inputs=text_prompt, outputs=text_info)
            gr.Markdown("### ποΈ Audio Controls")
            with gr.Accordion("Basic Settings", open=True):
                speed = gr.Slider(
                    0.5, 2.0,
                    step=0.1,
                    value=1.0,
                    label="Speaking Speed",
                    info="Adjust how fast the speech is generated"
                )
                denoise = gr.Slider(
                    0.0, 1.0,
                    step=0.05,
                    value=0.2,
                    label="Denoise Strength",
                    info="Higher values produce cleaner but less expressive audio"
                )
            with gr.Accordion("Advanced Settings", open=False):
                avg_style = gr.Checkbox(
                    label="Use Average Styles",
                    value=True,
                    info="Blend multiple style characteristics for smoother output"
                )
                stabilize = gr.Checkbox(
                    label="Stabilize Speaking Speed",
                    value=True,
                    info="Maintain consistent pacing throughout generation"
                )
                seed = gr.Number(
                    label="Random Seed (-1 for random)",
                    value=-1,
                    precision=0,
                    info="Use same seed for reproducible results"
                )
                random_seed_button = gr.Button("π² Generate Random Seed", size="sm")
        # Right column: voice selection, outputs, and generation buttons.
        with gr.Column(scale=1):
            gr.Markdown("### π€ Voice Selection")
            audio_source = gr.Radio(
                choices=[("Preset Voices", "preset"), ("Custom Upload", "custom")],
                value="preset",
                label="Audio Source - Choose between preset voices or upload your own",
                elem_classes="audio-source-radio"
            )
            example_voices = gr.Dropdown(
                label="Select Preset Voice",
                choices=voice_choices,
                value=voice_choices[0][1],
                interactive=True,
                allow_custom_value=False,
                filterable=True,
                visible=True
            )
            # Hidden until the "custom" audio source is chosen.
            custom_audio_upload = gr.Audio(
                label="Upload Custom Reference Audio (3-10 seconds of clear speech)",
                type='filepath',
                visible=False
            )
            # Holds the reference clip path actually fed to generation.
            reference_audios = gr.Audio(
                label="Reference Audio Preview",
                type='filepath',
                interactive=False,
                value=voice_choices[0][1]
            )
            gr.Markdown("### π Generated Output")
            # Streaming output: consumes chunks yielded by generate_stream.
            streaming_audio = gr.Audio(
                label="π₯ Streaming Audio (Real-time)",
                type='numpy',
                interactive=False,
                streaming=True,
                autoplay=True
            )
            # Non-streaming output: the full waveform from main().
            synthesized_audio = gr.Audio(
                label="Complete Audio (Non-streaming)",
                type='numpy',
                interactive=False
            )
            with gr.Row():
                stream_button = gr.Button("π₯ Stream Speech", variant="primary", size="lg")
                gen_button = gr.Button("π£οΈ Generate Complete", variant="secondary", size="lg")
            status = gr.Textbox(
                label="Status",
                interactive=False,
                lines=3,
                placeholder="Status messages will appear here..."
            )
    # Voice examples section: one quick-select button per preset voice.
    with gr.Accordion("π Voice Gallery & Examples", open=False):
        gr.Markdown("### Quick Voice Preview")
        gr.Markdown("Browse through all available voices:")
        with gr.Row():
            # Split by the gender marker embedded in the display label.
            female_voices = [v for v in voice_choices if 'πΊ' in v[0]]
            male_voices = [v for v in voice_choices if 'πΉ' in v[0]]
            with gr.Column():
                gr.Markdown(f"**Female Voices ({len(female_voices)})**")
                for voice_name, voice_path_item in female_voices:
                    btn = gr.Button(voice_name, size="sm")
                    # gr.State pins this iteration's values at wiring time.
                    btn.click(
                        fn=voice_button_click,
                        inputs=[gr.State(voice_path_item), gr.State(voice_name)],
                        outputs=[example_voices, reference_audios, status]
                    )
            with gr.Column():
                gr.Markdown(f"**Male Voices ({len(male_voices)})**")
                for voice_name, voice_path_item in male_voices:
                    btn = gr.Button(voice_name, size="sm")
                    btn.click(
                        fn=voice_button_click,
                        inputs=[gr.State(voice_path_item), gr.State(voice_name)],
                        outputs=[example_voices, reference_audios, status]
                    )
    # Example texts section: browse all prompts, one "Load" button each.
    with gr.Accordion("π Example Text Library", open=False):
        gr.Markdown("### Browse Example Texts by Category")
        for category, texts in eg_texts.items():
            with gr.Accordion(f"{category} ({len(texts)} examples)", open=False):
                for idx, text in enumerate(texts, 1):
                    with gr.Row():
                        text_display = gr.Textbox(
                            label=f"Example {idx}",
                            value=text,
                            lines=3,
                            interactive=False,
                            scale=4
                        )
                        load_btn = gr.Button("π Load", size="sm", scale=1)
                    load_btn.click(
                        fn=text_button_click,
                        inputs=gr.State(text),
                        outputs=[text_prompt, status]
                    )
    # Event handlers
    # Random text button
    random_text_button.click(
        fn=random_text,
        inputs=text_category,
        outputs=[text_prompt, status]
    )
    # Clear all button
    clear_button.click(
        fn=clear_all,
        outputs=[text_prompt, reference_audios, streaming_audio, synthesized_audio, status]
    )
    # Random seed button
    random_seed_button.click(
        fn=generate_random_seed,
        outputs=[seed, status]
    )
    # Audio source switch
    audio_source.change(
        fn=switch_audio_source,
        inputs=audio_source,
        outputs=[example_voices, custom_audio_upload, reference_audios, status]
    )
    # Example voice selection
    example_voices.change(
        fn=load_example_voice,
        inputs=example_voices,
        outputs=[reference_audios, status]
    )
    # Custom audio upload.  NOTE(review): reference_audios is listed twice in
    # the outputs to absorb the handler's 3-tuple — confirm this is intended.
    custom_audio_upload.change(
        fn=handle_custom_audio_upload,
        inputs=[custom_audio_upload, audio_source],
        outputs=[reference_audios, reference_audios, status]
    )
    # Streaming generation button (generator fn -> streaming gr.Audio).
    stream_button.click(
        fn=generate_stream,
        inputs=[text_prompt, reference_audios, speed, denoise, avg_style, stabilize, seed],
        outputs=streaming_audio
    )
    # Non-streaming generation button
    gen_button.click(
        fn=main,
        inputs=[text_prompt, reference_audios, speed, denoise, avg_style, stabilize, seed],
        outputs=[synthesized_audio, status]
    )
    # Footer
    gr.Markdown(f"""
    ---
    ### π Usage Tips
    1. **Choose a Voice**: Select from 18 preset voices or upload your own reference audio (3-10 seconds recommended)
    2. **Enter Text**: Type or select example text from the library
    3. **Adjust Settings**: Fine-tune speed, denoising, and other parameters
    4. **Generate**:
       - Click "π₯ Stream Speech" for real-time audio generation (hear it as it's created)
       - Click "π£οΈ Generate Complete" for full audio generation at once
    5. **Experiment**: Try different voices, speeds, and text styles!
    ### βοΈ Parameter Guide
    - **Speaking Speed**: 0.5 = slow, 1.0 = normal, 2.0 = fast
    - **Denoise Strength**: Higher values = cleaner audio but less natural variation
    - **Average Styles**: Blends multiple style characteristics for consistency
    - **Stabilize Speed**: Maintains consistent pacing throughout speech
    - **Random Seed**: Use -1 for random, or set a specific number for reproducible results
    ### π― Custom Audio Tips
    - Use clear, high-quality recordings
    - 3-10 seconds of speech works best
    - Speak in a natural, conversational tone
    - Avoid background noise
    - Supported formats: WAV, MP3, FLAC, OGG, M4A
    ### π₯ Streaming vs Complete Generation
    - **Streaming**: Hear audio as it's being generated, chunk by chunk (great for long texts!)
    - **Complete**: Generates entire audio at once (better for short texts and downloading)
    ---
    **Model**: StyleTTS2-Lite | **Device**: {device} | **Voices**: {len(voice_choices)}
    π‘ *Tip: Use the seed parameter to generate the same audio multiple times with different settings!*
    """)
# Launch the app
if __name__ == "__main__":
    # Queue caps concurrent requests so generation jobs don't pile up.
    demo.queue(max_size=20)
    # share=True exposes a public Gradio link; debug/show_error surface
    # server-side exceptions in the UI.
    demo.launch(
        share=True,
        debug=True,
        show_error=True
    )