# humair025's picture
# Update app.py
# 53b396f verified
import gradio as gr
import subprocess
import os
import sys
import soundfile as sf
import torch
import traceback
import random
import numpy as np
import spaces
import json
from datetime import datetime
import shutil
import sys
import phonemizer
# On Windows, phonemizer cannot find the espeak-ng DLL by itself, so point the
# EspeakWrapper at the library bundled with espeakng_loader.  Best-effort:
# failures are only logged, and phonemization will fail later instead.
if sys.platform.startswith("win"):
    try:
        from phonemizer.backend.espeak.wrapper import EspeakWrapper
        import espeakng_loader
        EspeakWrapper.set_library(espeakng_loader.get_library_path())
    except Exception as e:
        print(f"[DEBUG] EspeakWrapper setup error: {e}")
# Cache one EspeakBackend per language: constructing a backend is relatively
# expensive and the original code rebuilt it on every call (including once per
# chunk during streaming generation).
_phonemizer_backends = {}


def get_phoneme(text, lang):
    """Convert text to an espeak phoneme string.

    Args:
        text: Raw text to phonemize.
        lang: espeak language code (e.g. "en-us").

    Returns:
        The phonemized string, or None if phonemization failed (errors are
        printed, never raised, so callers can skip the failing chunk).
    """
    try:
        print(f"[DEBUG] Getting phoneme for text: {text[:50]}... | lang: {lang}")
        backend = _phonemizer_backends.get(lang)
        if backend is None:
            backend = phonemizer.backend.EspeakBackend(
                language=lang,
                preserve_punctuation=True,
                with_stress=True,
                language_switch='remove-flags'
            )
            _phonemizer_backends[lang] = backend
        result = backend.phonemize([text])[0]
        print(f"[DEBUG] Phoneme result: {result[:100]}...")
        return result
    except Exception as e:
        print(f"[DEBUG] Phoneme error: {e}")
        traceback.print_exc()
        return None
def split_text_into_sentences(text, max_chars=200):
    """Greedily pack sentences into chunks of roughly max_chars characters.

    The text is split on whitespace that follows '.', '!' or '?'.  Sentences
    are accumulated into the current chunk while the combined length stays
    within max_chars; a single sentence longer than max_chars still becomes
    its own chunk.
    """
    import re

    pieces = re.split(r'(?<=[.!?])\s+', text)
    chunks, pending = [], ""
    for piece in pieces:
        if len(pending) + len(piece) > max_chars:
            # Current chunk is full: flush it and start a new one.
            if pending:
                chunks.append(pending.strip())
            pending = piece + " "
        else:
            pending += piece + " "
    if pending:
        chunks.append(pending.strip())
    return chunks
# --- Setup: fetch the StyleTTS2-lite repo and load the model ----------------
repo_url = "https://huggingface.co/dangtr0408/StyleTTS2-lite"
repo_dir = "StyleTTS2-lite"
# Clone the model repository on first run only.  NOTE(review): the clone is
# not checked for success (no check=True); a failure surfaces later as an
# ImportError on `inference`.
if not os.path.exists(repo_dir):
    print(f"[DEBUG] Cloning repository from {repo_url}")
    subprocess.run(["git", "clone", repo_url, repo_dir])
else:
    print(f"[DEBUG] Repository already exists at {repo_dir}")
# Make the cloned repo importable; it provides the `inference` module.
sys.path.append(os.path.abspath(repo_dir))
from inference import StyleTTS2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"[DEBUG] Using device: {device}")
config_path = os.path.join(repo_dir, "Models", "config.yaml")
models_path = os.path.join(repo_dir, "Models", "inference", "model.pth")
# Directory of preset reference WAVs shipped with the repo.
voice_path = os.path.join(repo_dir, "Audio")
print(f"[DEBUG] Config path: {config_path}")
print(f"[DEBUG] Models path: {models_path}")
print(f"[DEBUG] Voice path: {voice_path}")
model = StyleTTS2(config_path, models_path).eval().to(device)
print(f"[DEBUG] Model loaded successfully")
# Create directory for custom uploaded audio
custom_audio_dir = "custom_reference_audio"
os.makedirs(custom_audio_dir, exist_ok=True)
print(f"[DEBUG] Custom audio directory: {custom_audio_dir}")
# Extended example texts with categories.  Keys feed the category dropdown;
# values are candidate prompts for random_text() and the example library.
eg_texts = {
    "Creative & Narrative": [
        "Beneath layers of bureaucracy and forgotten policies, the school still held a quiet magicβ€”whispers of chalk dust, scuffed floors, and dreams once declared aloud in voices full of belief.",
        "He had never believed in fate, but when their paths crossed in the middle of a thunderstorm under a flickering streetlight, even his rational mind couldn't deny the poetic timing.",
        "In a distant galaxy orbiting a dying star, a species of sentient machines debates whether to intervene in the fate of a nearby organic civilization on the brink of collapse.",
        "The ancient temple walls, once vibrant with murals, now bore the weathered marks of centuries, yet even in decay, they whispered stories that modern minds struggled to fully comprehend.",
    ],
    "Technical & Informative": [
        "Technological advancements in artificial intelligence have not only accelerated the pace of automation but have also raised critical questions about ethics, job displacement, and the future role of human creativity.",
        "Every algorithm reflects its designer's worldview, no matter how neutral it appears, and therein lies the paradox of objectivity in machine learning: pure logic still casts a human shadow.",
        "The process of photosynthesis converts light energy into chemical energy, enabling plants to produce glucose from carbon dioxide and water while releasing oxygen as a byproduct.",
    ],
    "Conversational": [
        "Hey there! I hope you're having a wonderful day. I just wanted to check in and see how things are going with that project we discussed last week.",
        "You know what? I think we should grab coffee sometime soon. It's been way too long since we caught up properly.",
        "I completely understand where you're coming from, and I appreciate you sharing that with me. Let's figure this out together.",
    ],
    "Dramatic & Suspenseful": [
        "The engine sputtered twice before giving in completely, leaving them stranded on a desolate mountain road with no reception, dwindling supplies, and a storm brewing over the ridge to the west.",
        "The museum guard never expected the sculpture to move, but at precisely midnight, its eyes blinked, and its lips curled into a knowing smile, as if awakening from centuries of silence.",
        "Time slowed as the coin spun in the air, glinting with a brilliance far beyond its monetary value, carrying with it the weight of a decision neither of them wanted to make.",
    ],
    "Poetic & Reflective": [
        "The sound of rain on the tin roof reminded him of summers long past, when the world was smaller, days were longer, and time moved like honey down a warm spoon.",
        "While standing at the edge of the quiet lake, Maria couldn't help but wonder how many untold stories were buried beneath its still surface, reflecting the sky like a perfect mirror.",
        "As the solar eclipse reached totality, the temperature dropped, the birds went silent, and for a few seconds, the world stood still beneath an alien, awe-inspiring sky.",
    ]
}
# Display label (flag + gender emoji + name) -> preset WAV filename located in
# the cloned repo's Audio/ directory (voice_path).
voice_map = {
    'πŸ‡ΊπŸ‡Έ 🚺 Heart ❀️': '1_heart.wav',
    'πŸ‡ΊπŸ‡Έ 🚺 Bella πŸ”₯': '2_belle.wav',
    'πŸ‡ΊπŸ‡Έ 🚺 Kore': '3_kore.wav',
    'πŸ‡ΊπŸ‡Έ 🚺 Sarah': '4_sarah.wav',
    'πŸ‡ΊπŸ‡Έ 🚺 Nova': '5_nova.wav',
    'πŸ‡ΊπŸ‡Έ 🚺 Sky': '6_sky.wav',
    'πŸ‡ΊπŸ‡Έ 🚺 Alloy': '7_alloy.wav',
    'πŸ‡ΊπŸ‡Έ 🚺 Jessica': '8_jessica.wav',
    'πŸ‡ΊπŸ‡Έ 🚺 River': '9_river.wav',
    'πŸ‡ΊπŸ‡Έ 🚹 Michael': '10_michael.wav',
    'πŸ‡ΊπŸ‡Έ 🚹 Fenrir': '11_fenrir.wav',
    'πŸ‡ΊπŸ‡Έ 🚹 Puck': '12_puck.wav',
    'πŸ‡ΊπŸ‡Έ 🚹 Echo': '13_echo.wav',
    'πŸ‡ΊπŸ‡Έ 🚹 Eric': '14_eric.wav',
    'πŸ‡ΊπŸ‡Έ 🚹 Liam': '15_liam.wav',
    'πŸ‡ΊπŸ‡Έ 🚹 Onyx': '16_onyx.wav',
    'πŸ‡ΊπŸ‡Έ 🚹 Santa': '17_santa.wav',
    'πŸ‡ΊπŸ‡Έ 🚹 Adam': '18_adam.wav',
}
# (label, full file path) pairs in the format gr.Dropdown expects.
voice_choices = [
    (label, os.path.join(voice_path, filename))
    for label, filename in voice_map.items()
]
print(f"[DEBUG] Voice choices created: {len(voice_choices)} voices")
for label, path in voice_choices[:3]:
    print(f"[DEBUG] Sample voice: {label} -> {path}")
# Streaming inference function
# NOTE(review): indentation below was reconstructed from a whitespace-mangled
# source; the nesting (especially the torch.no_grad() scopes) should be
# confirmed against the original revision.
@spaces.GPU
def generate_stream(text_prompt, reference_paths, speed, denoise, avg_style, stabilize, seed, progress=gr.Progress()):
    """Generator function that yields audio chunks for streaming.

    Splits `text_prompt` into sentence chunks, extracts the voice style once
    from the reference audio at `reference_paths`, then generates and yields
    one (sample_rate, np.float32 array) tuple per chunk so Gradio can play
    audio while later chunks are still being synthesized.  Yields None on
    invalid input or an unrecoverable error.
    """
    try:
        print(f"\n[DEBUG] ===== STREAMING GENERATION START =====")
        print(f"[DEBUG] Text prompt: {text_prompt[:100]}...")
        print(f"[DEBUG] Reference path: {reference_paths}")
        print(f"[DEBUG] Speed: {speed}, Denoise: {denoise}")
        if not text_prompt or text_prompt.strip() == "":
            print(f"[DEBUG] Error: Empty text prompt")
            yield None
            return
        if not reference_paths or not os.path.exists(reference_paths):
            print(f"[DEBUG] Error: Invalid reference path")
            yield None
            return
        # Set seed for reproducibility (-1 means "leave it random")
        if seed != -1:
            torch.manual_seed(seed)
            np.random.seed(seed)
            print(f"[DEBUG] Seed set to: {seed}")
        # Split text into chunks for streaming
        text_chunks = split_text_into_sentences(text_prompt, max_chars=200)
        print(f"[DEBUG] Split into {len(text_chunks)} chunks")
        speaker = {
            "path": reference_paths,
            "speed": speed
        }
        progress(0.1, desc="Extracting voice styles...")
        # Extract styles once (reuse for all chunks)
        with torch.no_grad():
            styles = model.get_styles(speaker, denoise, avg_style)
        print(f"[DEBUG] Styles extracted")
        first_chunk = True
        total_chunks = len(text_chunks)
        for idx, chunk in enumerate(text_chunks, 1):
            try:
                progress_val = 0.1 + (0.8 * idx / total_chunks)
                progress(progress_val, desc=f"Generating chunk {idx}/{total_chunks}...")
                print(f"[DEBUG] Processing chunk {idx}/{total_chunks}: {chunk[:50]}...")
                with torch.no_grad():
                    # Get phonemes for this chunk
                    phonemes = get_phoneme(text=chunk, lang="en-us")
                    if phonemes is None:
                        # Skip unphonemizable chunks rather than aborting.
                        print(f"[DEBUG] Warning: Phoneme processing failed for chunk {idx}")
                        continue
                    # Generate audio for this chunk
                    audio_chunk = model.generate(phonemes, styles, stabilize, 18)
                # Handle NaN and peak-normalize to [-1, 1]
                audio_chunk = np.nan_to_num(audio_chunk)
                max_abs = np.max(np.abs(audio_chunk))
                if max_abs > 0:
                    audio_chunk /= max_abs
                else:
                    audio_chunk = np.zeros_like(audio_chunk)
                audio_chunk = np.clip(audio_chunk, -1, 1)
                print(f"[DEBUG] Generated chunk {idx}: {len(audio_chunk)} samples")
                # Yield the audio chunk (model output rate is 24 kHz)
                yield (24000, audio_chunk.astype(np.float32))
                # Add tiny silence after first chunk for smoother transitions
                # (by design this happens only once, after the very first chunk)
                if first_chunk and total_chunks > 1:
                    first_chunk = False
                    silence = np.zeros(int(24000 * 0.1), dtype=np.float32)  # 0.1 second silence
                    yield (24000, silence)
                    print(f"[DEBUG] Added silence separator")
            except Exception as e:
                # A failed chunk is logged and skipped; streaming continues.
                print(f"[DEBUG] Error processing chunk {idx}: {str(e)}")
                traceback.print_exc()
                continue
        progress(1.0, desc="Complete!")
        print(f"[DEBUG] ===== STREAMING GENERATION COMPLETE =====\n")
    except Exception as e:
        error_message = traceback.format_exc()
        print(f"[DEBUG] ===== STREAMING ERROR =====")
        print(f"[DEBUG] Error: {str(e)}")
        print(f"[DEBUG] Traceback:\n{error_message}")
        print(f"[DEBUG] ===== END ERROR =====\n")
        yield None
# Non-streaming inference function (original)
# NOTE(review): indentation below was reconstructed from a whitespace-mangled
# source; confirm the torch.no_grad() scope against the original revision.
@spaces.GPU
def main(text_prompt, reference_paths, speed, denoise, avg_style, stabilize, seed, progress=gr.Progress()):
    """Generate the whole utterance in one pass.

    Returns ((sample_rate, np.float32 array), status_message) on success, or
    (None, error_message) on any validation or generation failure.
    """
    try:
        print(f"\n[DEBUG] ===== GENERATION START =====")
        print(f"[DEBUG] Text prompt: {text_prompt[:100]}...")
        print(f"[DEBUG] Reference path: {reference_paths}")
        print(f"[DEBUG] Speed: {speed}, Denoise: {denoise}")
        print(f"[DEBUG] Avg style: {avg_style}, Stabilize: {stabilize}, Seed: {seed}")
        if not text_prompt or text_prompt.strip() == "":
            print(f"[DEBUG] Error: Empty text prompt")
            return None, "❌ Error: Please enter text to generate speech."
        if not reference_paths:
            print(f"[DEBUG] Error: No reference path")
            return None, "❌ Error: Please select a reference voice or upload your own audio."
        # Check if reference file exists
        if not os.path.exists(reference_paths):
            print(f"[DEBUG] Error: Reference file does not exist: {reference_paths}")
            return None, f"❌ Error: Reference file not found: {reference_paths}"
        print(f"[DEBUG] Reference file exists: {os.path.exists(reference_paths)}")
        # Set seed for reproducibility (-1 means "leave it random")
        if seed != -1:
            torch.manual_seed(seed)
            np.random.seed(seed)
            print(f"[DEBUG] Seed set to: {seed}")
        progress(0.1, desc="Initializing...")
        speaker = {
            "path": reference_paths,
            "speed": speed
        }
        print(f"[DEBUG] Speaker config: {speaker}")
        progress(0.3, desc="Processing phonemes...")
        with torch.no_grad():
            phonemes = get_phoneme(text=text_prompt, lang="en-us")
            if phonemes is None:
                print(f"[DEBUG] Error: Phoneme processing failed")
                return None, "❌ Error: Failed to process phonemes."
            print(f"[DEBUG] Phonemes processed successfully")
            progress(0.5, desc="Extracting voice styles...")
            print(f"[DEBUG] Getting styles from model...")
            styles = model.get_styles(speaker, denoise, avg_style)
            print(f"[DEBUG] Styles extracted: {type(styles)}")
            progress(0.7, desc="Generating audio...")
            print(f"[DEBUG] Generating audio with model...")
            r = model.generate(phonemes, styles, stabilize, 18)
            print(f"[DEBUG] Audio generated: shape={r.shape if hasattr(r, 'shape') else len(r)}")
        progress(0.9, desc="Finalizing...")
        # Handle NaN and peak-normalize to [-1, 1]
        r = np.nan_to_num(r)
        max_abs = np.max(np.abs(r))
        if max_abs > 0:
            r /= max_abs
        else:
            r = np.zeros_like(r)
        r = np.clip(r, -1, 1)
        print(f"[DEBUG] Audio normalized")
        # Calculate audio duration (model output rate is 24 kHz)
        duration = len(r) / 24000
        print(f"[DEBUG] Audio duration: {duration:.2f}s")
        progress(1.0, desc="Complete!")
        print(f"[DEBUG] ===== GENERATION COMPLETE =====\n")
        return (24000, r.astype(np.float32)), f"βœ… Audio generated successfully! Duration: {duration:.2f}s | Device: {device} | Seed: {seed if seed != -1 else 'Random'}"
    except Exception as e:
        error_message = traceback.format_exc()
        print(f"[DEBUG] ===== GENERATION ERROR =====")
        print(f"[DEBUG] Error type: {type(e).__name__}")
        print(f"[DEBUG] Error message: {str(e)}")
        print(f"[DEBUG] Full traceback:\n{error_message}")
        print(f"[DEBUG] ===== END ERROR =====\n")
        return None, f"❌ Error: {str(e)}\n\n{error_message}"
def handle_custom_audio_upload(audio_file, audio_source):
    """Handle uploaded custom audio file.

    Copies the upload into `custom_audio_dir` under a timestamped name and
    validates it with soundfile.  Returns a 3-tuple
    (reference_path, preview_path, status_message); both paths are None when
    the upload is rejected.
    """
    try:
        print(f"[DEBUG] handle_custom_audio_upload called")
        print(f"[DEBUG] Audio file: {audio_file}")
        print(f"[DEBUG] Audio source: {audio_source}")
        if audio_source != "custom":
            print(f"[DEBUG] Audio source is not custom, ignoring upload")
            return None, None, "⚠️ Please select 'Custom Upload' as audio source first."
        if audio_file is None:
            print(f"[DEBUG] No audio file provided")
            return None, None, "⚠️ Please upload an audio file."
        # Validate file format by extension
        valid_extensions = ['.wav', '.mp3', '.flac', '.ogg', '.m4a']
        file_ext = os.path.splitext(audio_file)[1].lower()
        if file_ext not in valid_extensions:
            return None, None, f"❌ Invalid file format. Supported: {', '.join(valid_extensions)}"
        # Create unique (timestamped) filename
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        custom_filename = f"custom_ref_{timestamp}{file_ext}"
        custom_path = os.path.join(custom_audio_dir, custom_filename)
        # Copy uploaded file into the app-managed directory
        shutil.copy2(audio_file, custom_path)
        print(f"[DEBUG] Custom audio saved to: {custom_path}")
        # Validate that the copied file is readable audio
        try:
            audio_data, sample_rate = sf.read(custom_path)
            duration = len(audio_data) / sample_rate
            print(f"[DEBUG] Audio validated: {duration:.2f}s @ {sample_rate}Hz")
            if duration < 1.0:
                # Too short to be a usable reference: reject and clean up.
                os.remove(custom_path)
                return None, None, "❌ Audio too short. Please upload at least 1 second of audio."
            if duration > 30.0:
                # Long clips are accepted but flagged.  NOTE(review): the first
                # slot is None here while the second carries the path β€” confirm
                # this matches how the outputs are wired in the UI.
                return None, custom_path, f"⚠️ Audio is {duration:.1f}s long. Shorter clips (3-10s) work best, but we'll use it."
            return custom_path, custom_path, f"βœ… Custom audio uploaded! Duration: {duration:.2f}s @ {sample_rate}Hz"
        except Exception as e:
            # Unreadable audio: remove the copied file before reporting.
            if os.path.exists(custom_path):
                os.remove(custom_path)
            return None, None, f"❌ Failed to read audio file: {str(e)}"
    except Exception as e:
        error_msg = traceback.format_exc()
        print(f"[DEBUG] Upload error: {error_msg}")
        return None, None, f"❌ Upload failed: {str(e)}"
def load_example_voice(example_voices):
    """Echo the selected preset path and resolve it back to its display name."""
    print(f"[DEBUG] load_example_voice called with: {example_voices}")
    print(f"[DEBUG] Type: {type(example_voices)}")
    if not example_voices:
        print(f"[DEBUG] No voice selected")
        return None, "⚠️ No voice selected."
    # Reverse-lookup the human-readable label for the selected file path.
    voice_name = next(
        (label for label, fname in voice_map.items()
         if os.path.join(voice_path, fname) == example_voices),
        "Unknown",
    )
    if voice_name == "Unknown":
        print(f"[DEBUG] Warning: Could not find voice name for path: {example_voices}")
    else:
        print(f"[DEBUG] Found matching voice: {voice_name}")
    result = example_voices, f"βœ… Loaded voice: {voice_name}"
    print(f"[DEBUG] Returning: {result}")
    return result
def switch_audio_source(source):
    """Toggle visibility of the preset dropdown vs. the custom upload widget."""
    print(f"[DEBUG] Switching audio source to: {source}")
    use_preset = source == "preset"
    if use_preset:
        # Restore the default preset voice as the reference audio.
        reference_value = voice_choices[0][1]
        message = "βœ… Using preset voices"
    else:
        # Clear the reference until the user uploads something.
        reference_value = None
        message = "πŸ“€ Upload your own reference audio (WAV, MP3, FLAC, OGG, M4A)"
    return (
        gr.update(visible=use_preset),      # example_voices dropdown
        gr.update(visible=not use_preset),  # custom_audio_upload
        reference_value,                    # reference_audios
        message,                            # status
    )
def random_text(category):
    """Return a random example text drawn from `category`, plus a status line."""
    print(f"[DEBUG] random_text called with category: {category}")
    if category == "All Categories":
        candidates = [text for texts in eg_texts.values() for text in texts]
        source_desc = "all categories"
    else:
        candidates = eg_texts.get(category, [])
        source_desc = category
    selected = random.choice(candidates)
    print(f"[DEBUG] Selected random text from {source_desc}")
    print(f"[DEBUG] Selected text: {selected[:50]}...")
    return selected, f"βœ… Randomized text from: {category}"
def clear_all():
    """Reset every user-facing field: text, reference, both outputs, status."""
    print(f"[DEBUG] Clearing all fields")
    cleared = ("", None, None, None, "βœ… All fields cleared.")
    return cleared
def estimate_duration(text):
    """Rough speech-length estimate assuming ~150 spoken words per minute."""
    word_count = len(text.split())
    seconds = word_count / 150 * 60
    print(f"[DEBUG] Estimated duration for {word_count} words: {seconds:.1f}s")
    return f"⏱️ Estimated duration: ~{seconds:.1f}s"
def generate_random_seed():
    """Draw a fresh 31-bit seed for reproducible generation runs."""
    new_seed = random.randint(0, 2**31 - 1)
    print(f"[DEBUG] Generated random seed: {new_seed}")
    return new_seed, "βœ… Random seed generated."
def voice_button_click(vp, vn):
    """Gallery-button handler: select voice `vn` located at path `vp`."""
    print(f"[DEBUG] Voice button clicked: {vn}")
    print(f"[DEBUG] Voice path: {vp}")
    # The path is returned twice: once for the dropdown, once for the preview.
    selection = (vp, vp, f"βœ… Selected: {vn}")
    print(f"[DEBUG] Returning: {selection}")
    return selection
def text_button_click(t):
    """Example-library handler: load text `t` into the prompt box."""
    print(f"[DEBUG] Text button clicked")
    print(f"[DEBUG] Text: {t[:50]}...")
    outcome = (t, f"βœ… Loaded example text")
    print(f"[DEBUG] Returning: {outcome}")
    return outcome
# Custom CSS for better styling: page width, header banner, the audio-source
# radio card, and the "STREAMING" badge referenced from the gr.HTML header.
custom_css = """
#main_container {
max-width: 1400px;
margin: auto;
}
.header {
text-align: center;
padding: 20px;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
border-radius: 10px;
margin-bottom: 20px;
}
.header h1 {
color: white;
font-size: 2.5em;
margin: 0;
}
.header p {
color: #f0f0f0;
font-size: 1.1em;
margin-top: 10px;
}
.audio-source-radio {
background: #f8f9fa;
padding: 15px;
border-radius: 8px;
margin: 10px 0;
}
.streaming-badge {
display: inline-block;
background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
color: white;
padding: 5px 15px;
border-radius: 20px;
font-weight: bold;
margin-left: 10px;
}
"""
# Gradio UI
# NOTE(review): the source this was recovered from had its indentation
# stripped; the Blocks/Row/Column/Accordion nesting below is a reconstruction
# and should be confirmed against the original revision.  Triple-quoted
# HTML/Markdown strings are kept flush-left so their content is unchanged.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    # Header banner (styled by .header / .streaming-badge in custom_css).
    gr.HTML("""
<div class="header">
<h1>πŸŽ™οΈ StyleTTS2-Lite Pro <span class="streaming-badge">πŸ”₯ STREAMING</span></h1>
<p>Advanced Text-to-Speech Synthesis with Real-time Streaming</p>
<p style="font-size: 0.9em; color: #ffeb3b;">✨ Now with Custom Audio Upload & Real-time Streaming!</p>
</div>
""")
    # Feature overview + runtime debug info (f-string evaluated at build time).
    gr.Markdown(f"""
### 🌟 Features
- **18 Premium Voices** (9 Female, 9 Male)
- **πŸ”₯ Real-time Streaming** - Hear audio as it generates
- **πŸ†• Custom Audio Upload** - Use your own voice!
- **Advanced Controls** (Speed, Denoising, Style Averaging)
- **Text Categories** (Creative, Technical, Conversational, and more)
- **Reproducible Seeds** for consistent results
---
### πŸ› Debug Information
- **Device**: {device}
- **Voice Path**: {voice_path}
- **Available Voices**: {len(voice_choices)}
""")
    with gr.Row(elem_id="main_container"):
        # ---- Left column: text input and synthesis parameters ----
        with gr.Column(scale=1):
            gr.Markdown("### πŸ“ Text Input")
            text_category = gr.Dropdown(
                label="Text Category",
                choices=["All Categories"] + list(eg_texts.keys()),
                value="All Categories",
                interactive=True
            )
            text_prompt = gr.Textbox(
                label="Text Prompt",
                placeholder="Enter your text here or use the randomize button...",
                lines=8,
                max_lines=15
            )
            text_info = gr.Textbox(
                label="Text Info",
                value="",
                interactive=False,
                lines=1
            )
            with gr.Row():
                random_text_button = gr.Button("🎲 Randomize Text", variant="secondary")
                clear_button = gr.Button("πŸ—‘οΈ Clear All", variant="stop")
            # Live duration estimate as the user types.
            text_prompt.change(fn=estimate_duration, inputs=text_prompt, outputs=text_info)
            gr.Markdown("### 🎚️ Audio Controls")
            with gr.Accordion("Basic Settings", open=True):
                speed = gr.Slider(
                    0.5, 2.0,
                    step=0.1,
                    value=1.0,
                    label="Speaking Speed",
                    info="Adjust how fast the speech is generated"
                )
                denoise = gr.Slider(
                    0.0, 1.0,
                    step=0.05,
                    value=0.2,
                    label="Denoise Strength",
                    info="Higher values produce cleaner but less expressive audio"
                )
            with gr.Accordion("Advanced Settings", open=False):
                avg_style = gr.Checkbox(
                    label="Use Average Styles",
                    value=True,
                    info="Blend multiple style characteristics for smoother output"
                )
                stabilize = gr.Checkbox(
                    label="Stabilize Speaking Speed",
                    value=True,
                    info="Maintain consistent pacing throughout generation"
                )
                seed = gr.Number(
                    label="Random Seed (-1 for random)",
                    value=-1,
                    precision=0,
                    info="Use same seed for reproducible results"
                )
                random_seed_button = gr.Button("🎲 Generate Random Seed", size="sm")
        # ---- Right column: voice selection and outputs ----
        with gr.Column(scale=1):
            gr.Markdown("### 🎀 Voice Selection")
            audio_source = gr.Radio(
                choices=[("Preset Voices", "preset"), ("Custom Upload", "custom")],
                value="preset",
                label="Audio Source - Choose between preset voices or upload your own",
                elem_classes="audio-source-radio"
            )
            example_voices = gr.Dropdown(
                label="Select Preset Voice",
                choices=voice_choices,
                value=voice_choices[0][1],
                interactive=True,
                allow_custom_value=False,
                filterable=True,
                visible=True
            )
            custom_audio_upload = gr.Audio(
                label="Upload Custom Reference Audio (3-10 seconds of clear speech)",
                type='filepath',
                visible=False
            )
            reference_audios = gr.Audio(
                label="Reference Audio Preview",
                type='filepath',
                interactive=False,
                value=voice_choices[0][1]
            )
            gr.Markdown("### πŸ”Š Generated Output")
            # Streaming output (fed chunk-by-chunk by generate_stream)
            streaming_audio = gr.Audio(
                label="πŸ”₯ Streaming Audio (Real-time)",
                type='numpy',
                interactive=False,
                streaming=True,
                autoplay=True
            )
            # Non-streaming output (single full clip from main)
            synthesized_audio = gr.Audio(
                label="Complete Audio (Non-streaming)",
                type='numpy',
                interactive=False
            )
            with gr.Row():
                stream_button = gr.Button("πŸ”₯ Stream Speech", variant="primary", size="lg")
                gen_button = gr.Button("πŸ—£οΈ Generate Complete", variant="secondary", size="lg")
            status = gr.Textbox(
                label="Status",
                interactive=False,
                lines=3,
                placeholder="Status messages will appear here..."
            )
    # Voice examples section
    with gr.Accordion("🎭 Voice Gallery & Examples", open=False):
        gr.Markdown("### Quick Voice Preview")
        gr.Markdown("Browse through all available voices:")
        with gr.Row():
            # Partition the presets by the gender emoji in their labels.
            female_voices = [v for v in voice_choices if '🚺' in v[0]]
            male_voices = [v for v in voice_choices if '🚹' in v[0]]
            with gr.Column():
                gr.Markdown(f"**Female Voices ({len(female_voices)})**")
                for voice_name, voice_path_item in female_voices:
                    btn = gr.Button(voice_name, size="sm")
                    # gr.State pins this iteration's values for the callback.
                    btn.click(
                        fn=voice_button_click,
                        inputs=[gr.State(voice_path_item), gr.State(voice_name)],
                        outputs=[example_voices, reference_audios, status]
                    )
            with gr.Column():
                gr.Markdown(f"**Male Voices ({len(male_voices)})**")
                for voice_name, voice_path_item in male_voices:
                    btn = gr.Button(voice_name, size="sm")
                    btn.click(
                        fn=voice_button_click,
                        inputs=[gr.State(voice_path_item), gr.State(voice_name)],
                        outputs=[example_voices, reference_audios, status]
                    )
    # Example texts section
    with gr.Accordion("πŸ“š Example Text Library", open=False):
        gr.Markdown("### Browse Example Texts by Category")
        for category, texts in eg_texts.items():
            with gr.Accordion(f"{category} ({len(texts)} examples)", open=False):
                for idx, text in enumerate(texts, 1):
                    with gr.Row():
                        text_display = gr.Textbox(
                            label=f"Example {idx}",
                            value=text,
                            lines=3,
                            interactive=False,
                            scale=4
                        )
                        load_btn = gr.Button("πŸ“‹ Load", size="sm", scale=1)
                    load_btn.click(
                        fn=text_button_click,
                        inputs=gr.State(text),
                        outputs=[text_prompt, status]
                    )
    # Event handlers
    # Random text button
    random_text_button.click(
        fn=random_text,
        inputs=text_category,
        outputs=[text_prompt, status]
    )
    # Clear all button
    clear_button.click(
        fn=clear_all,
        outputs=[text_prompt, reference_audios, streaming_audio, synthesized_audio, status]
    )
    # Random seed button
    random_seed_button.click(
        fn=generate_random_seed,
        outputs=[seed, status]
    )
    # Audio source switch (preset <-> custom)
    audio_source.change(
        fn=switch_audio_source,
        inputs=audio_source,
        outputs=[example_voices, custom_audio_upload, reference_audios, status]
    )
    # Example voice selection
    example_voices.change(
        fn=load_example_voice,
        inputs=example_voices,
        outputs=[reference_audios, status]
    )
    # Custom audio upload.  NOTE(review): reference_audios is listed twice
    # because the handler returns two path slots aimed at the same component β€”
    # the second return value wins; confirm this is intentional.
    custom_audio_upload.change(
        fn=handle_custom_audio_upload,
        inputs=[custom_audio_upload, audio_source],
        outputs=[reference_audios, reference_audios, status]
    )
    # Streaming generation button
    stream_button.click(
        fn=generate_stream,
        inputs=[text_prompt, reference_audios, speed, denoise, avg_style, stabilize, seed],
        outputs=streaming_audio
    )
    # Non-streaming generation button
    gen_button.click(
        fn=main,
        inputs=[text_prompt, reference_audios, speed, denoise, avg_style, stabilize, seed],
        outputs=[synthesized_audio, status]
    )
    # Footer
    gr.Markdown(f"""
---
### πŸ“– Usage Tips
1. **Choose a Voice**: Select from 18 preset voices or upload your own reference audio (3-10 seconds recommended)
2. **Enter Text**: Type or select example text from the library
3. **Adjust Settings**: Fine-tune speed, denoising, and other parameters
4. **Generate**:
- Click "πŸ”₯ Stream Speech" for real-time audio generation (hear it as it's created)
- Click "πŸ—£οΈ Generate Complete" for full audio generation at once
5. **Experiment**: Try different voices, speeds, and text styles!
### βš™οΈ Parameter Guide
- **Speaking Speed**: 0.5 = slow, 1.0 = normal, 2.0 = fast
- **Denoise Strength**: Higher values = cleaner audio but less natural variation
- **Average Styles**: Blends multiple style characteristics for consistency
- **Stabilize Speed**: Maintains consistent pacing throughout speech
- **Random Seed**: Use -1 for random, or set a specific number for reproducible results
### 🎯 Custom Audio Tips
- Use clear, high-quality recordings
- 3-10 seconds of speech works best
- Speak in a natural, conversational tone
- Avoid background noise
- Supported formats: WAV, MP3, FLAC, OGG, M4A
### πŸ”₯ Streaming vs Complete Generation
- **Streaming**: Hear audio as it's being generated, chunk by chunk (great for long texts!)
- **Complete**: Generates entire audio at once (better for short texts and downloading)
---
**Model**: StyleTTS2-Lite | **Device**: {device} | **Voices**: {len(voice_choices)}
πŸ’‘ *Tip: Use the seed parameter to generate the same audio multiple times with different settings!*
""")
# Launch the app (only when run as a script, not when imported)
if __name__ == "__main__":
    # Queue requests so concurrent users wait instead of overloading the GPU.
    demo.queue(max_size=20)
    demo.launch(
        share=True,
        debug=True,
        show_error=True
    )