# Hugging Face Space listing artifact (status: Sleeping) — not part of the app code.
# --- Imports -----------------------------------------------------------------
# Standard library
import json
import os
import random
import shutil
import subprocess
import sys
import traceback
from datetime import datetime

# Third-party
import gradio as gr
import numpy as np
import phonemizer
import soundfile as sf
import spaces
import torch

# On Windows, espeak-ng is usually not on PATH; point phonemizer's
# EspeakWrapper at the library bundled with `espeakng_loader`.  Best-effort:
# on failure we log and fall back to whatever espeak install is visible.
# (Fix: the original imported `sys` twice.)
if sys.platform.startswith("win"):
    try:
        from phonemizer.backend.espeak.wrapper import EspeakWrapper
        import espeakng_loader

        EspeakWrapper.set_library(espeakng_loader.get_library_path())
    except Exception as e:
        print(f"[DEBUG] EspeakWrapper setup error: {e}")
# Cache of EspeakBackend instances keyed by language code.  Constructing a
# backend is comparatively expensive, and the original code rebuilt one on
# every call (including once per chunk during streaming).
_phonemizer_backends = {}

def get_phoneme(text, lang):
    """Convert *text* into an espeak phoneme string for language *lang*.

    Returns the phonemized string, or None when phonemization fails (the
    error is logged and the traceback printed, matching the original
    best-effort contract callers rely on).
    """
    try:
        print(f"[DEBUG] Getting phoneme for text: {text[:50]}... | lang: {lang}")
        backend = _phonemizer_backends.get(lang)
        if backend is None:
            backend = phonemizer.backend.EspeakBackend(
                language=lang,
                preserve_punctuation=True,
                with_stress=True,
                language_switch='remove-flags'
            )
            _phonemizer_backends[lang] = backend
        result = backend.phonemize([text])[0]
        print(f"[DEBUG] Phoneme result: {result[:100]}...")
        return result
    except Exception as e:
        print(f"[DEBUG] Phoneme error: {e}")
        traceback.print_exc()
        return None
def split_text_into_sentences(text, max_chars=200):
    """Split *text* into chunks of at most ~max_chars characters for streaming.

    Sentences (delimited by '.', '!' or '?' followed by whitespace) are
    greedily packed into chunks.  A single sentence longer than *max_chars*
    becomes its own chunk rather than being cut mid-sentence.

    Fixes over the original: empty or whitespace-only input now returns []
    instead of [""], and empty sentence fragments are dropped.
    """
    import re

    # Split after sentence-ending punctuation; drop empty fragments.
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text) if s]
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_chars:
            current_chunk += sentence + " "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    # Flush the trailing chunk, but never emit an empty one.
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
# --- Repository and model setup ----------------------------------------------
# Clone the StyleTTS2-lite repo (inference code, weights, preset voice clips)
# from the Hugging Face Hub on first run; later runs reuse the local checkout.
repo_url = "https://huggingface.co/dangtr0408/StyleTTS2-lite"
repo_dir = "StyleTTS2-lite"
if not os.path.exists(repo_dir):
    print(f"[DEBUG] Cloning repository from {repo_url}")
    # NOTE(review): the git return code is not checked — a failed clone only
    # surfaces later as an ImportError on `inference`.
    subprocess.run(["git", "clone", repo_url, repo_dir])
else:
    print(f"[DEBUG] Repository already exists at {repo_dir}")
# Make the checkout importable so `from inference import StyleTTS2` resolves.
sys.path.append(os.path.abspath(repo_dir))
from inference import StyleTTS2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"[DEBUG] Using device: {device}")
# Paths inside the cloned repo: model config, weights, and preset voice audio.
config_path = os.path.join(repo_dir, "Models", "config.yaml")
models_path = os.path.join(repo_dir, "Models", "inference", "model.pth")
voice_path = os.path.join(repo_dir, "Audio")
print(f"[DEBUG] Config path: {config_path}")
print(f"[DEBUG] Models path: {models_path}")
print(f"[DEBUG] Voice path: {voice_path}")
# Load the TTS model once at import time; keep it in eval mode on `device`.
model = StyleTTS2(config_path, models_path).eval().to(device)
print(f"[DEBUG] Model loaded successfully")
# Create directory for custom uploaded audio (reference clips users upload).
custom_audio_dir = "custom_reference_audio"
os.makedirs(custom_audio_dir, exist_ok=True)
print(f"[DEBUG] Custom audio directory: {custom_audio_dir}")
# Extended example texts with categories.
# Maps a display category -> list of sample prompts shown in the
# "Example Text Library" accordion and used by the randomize button.
eg_texts = {
    "Creative & Narrative": [
        "Beneath layers of bureaucracy and forgotten policies, the school still held a quiet magicβwhispers of chalk dust, scuffed floors, and dreams once declared aloud in voices full of belief.",
        "He had never believed in fate, but when their paths crossed in the middle of a thunderstorm under a flickering streetlight, even his rational mind couldn't deny the poetic timing.",
        "In a distant galaxy orbiting a dying star, a species of sentient machines debates whether to intervene in the fate of a nearby organic civilization on the brink of collapse.",
        "The ancient temple walls, once vibrant with murals, now bore the weathered marks of centuries, yet even in decay, they whispered stories that modern minds struggled to fully comprehend.",
    ],
    "Technical & Informative": [
        "Technological advancements in artificial intelligence have not only accelerated the pace of automation but have also raised critical questions about ethics, job displacement, and the future role of human creativity.",
        "Every algorithm reflects its designer's worldview, no matter how neutral it appears, and therein lies the paradox of objectivity in machine learning: pure logic still casts a human shadow.",
        "The process of photosynthesis converts light energy into chemical energy, enabling plants to produce glucose from carbon dioxide and water while releasing oxygen as a byproduct.",
    ],
    "Conversational": [
        "Hey there! I hope you're having a wonderful day. I just wanted to check in and see how things are going with that project we discussed last week.",
        "You know what? I think we should grab coffee sometime soon. It's been way too long since we caught up properly.",
        "I completely understand where you're coming from, and I appreciate you sharing that with me. Let's figure this out together.",
    ],
    "Dramatic & Suspenseful": [
        "The engine sputtered twice before giving in completely, leaving them stranded on a desolate mountain road with no reception, dwindling supplies, and a storm brewing over the ridge to the west.",
        "The museum guard never expected the sculpture to move, but at precisely midnight, its eyes blinked, and its lips curled into a knowing smile, as if awakening from centuries of silence.",
        "Time slowed as the coin spun in the air, glinting with a brilliance far beyond its monetary value, carrying with it the weight of a decision neither of them wanted to make.",
    ],
    "Poetic & Reflective": [
        "The sound of rain on the tin roof reminded him of summers long past, when the world was smaller, days were longer, and time moved like honey down a warm spoon.",
        "While standing at the edge of the quiet lake, Maria couldn't help but wonder how many untold stories were buried beneath its still surface, reflecting the sky like a perfect mirror.",
        "As the solar eclipse reached totality, the temperature dropped, the birds went silent, and for a few seconds, the world stood still beneath an alien, awe-inspiring sky.",
    ]
}
# Preset voices: display label -> filename inside the repo's Audio/ folder.
# (Labels contain flag/gender emoji; rendered here as scraped.)
voice_map = {
    'πΊπΈ πΊ Heart β€οΈ': '1_heart.wav',
    'πΊπΈ πΊ Bella π₯': '2_belle.wav',
    'πΊπΈ πΊ Kore': '3_kore.wav',
    'πΊπΈ πΊ Sarah': '4_sarah.wav',
    'πΊπΈ πΊ Nova': '5_nova.wav',
    'πΊπΈ πΊ Sky': '6_sky.wav',
    'πΊπΈ πΊ Alloy': '7_alloy.wav',
    'πΊπΈ πΊ Jessica': '8_jessica.wav',
    'πΊπΈ πΊ River': '9_river.wav',
    'πΊπΈ πΉ Michael': '10_michael.wav',
    'πΊπΈ πΉ Fenrir': '11_fenrir.wav',
    'πΊπΈ πΉ Puck': '12_puck.wav',
    'πΊπΈ πΉ Echo': '13_echo.wav',
    'πΊπΈ πΉ Eric': '14_eric.wav',
    'πΊπΈ πΉ Liam': '15_liam.wav',
    'πΊπΈ πΉ Onyx': '16_onyx.wav',
    'πΊπΈ πΉ Santa': '17_santa.wav',
    'πΊπΈ πΉ Adam': '18_adam.wav',
}
# (label, absolute-ish path) pairs consumed by the gr.Dropdown choices.
voice_choices = [
    (label, os.path.join(voice_path, filename))
    for label, filename in voice_map.items()
]
print(f"[DEBUG] Voice choices created: {len(voice_choices)} voices")
for label, path in voice_choices[:3]:
    print(f"[DEBUG] Sample voice: {label} -> {path}")
# Streaming inference function
def generate_stream(text_prompt, reference_paths, speed, denoise, avg_style, stabilize, seed, progress=gr.Progress()):
    """Generator that yields (sample_rate, np.float32 array) audio chunks.

    The text is split into sentence-sized chunks; the reference style is
    extracted once and reused for every chunk.  Yields None on invalid input
    or on a fatal error (the streaming gr.Audio output consumes the stream).
    Sample rate is fixed at 24000 Hz, matching the non-streaming path.
    """
    try:
        print(f"\n[DEBUG] ===== STREAMING GENERATION START =====")
        print(f"[DEBUG] Text prompt: {text_prompt[:100]}...")
        print(f"[DEBUG] Reference path: {reference_paths}")
        print(f"[DEBUG] Speed: {speed}, Denoise: {denoise}")
        # Guard: empty prompt.
        if not text_prompt or text_prompt.strip() == "":
            print(f"[DEBUG] Error: Empty text prompt")
            yield None
            return
        # Guard: missing or nonexistent reference audio file.
        if not reference_paths or not os.path.exists(reference_paths):
            print(f"[DEBUG] Error: Invalid reference path")
            yield None
            return
        # Set seed for reproducibility (-1 means "leave RNG state alone").
        if seed != -1:
            torch.manual_seed(seed)
            np.random.seed(seed)
            print(f"[DEBUG] Seed set to: {seed}")
        # Split text into chunks for streaming
        text_chunks = split_text_into_sentences(text_prompt, max_chars=200)
        print(f"[DEBUG] Split into {len(text_chunks)} chunks")
        speaker = {
            "path": reference_paths,
            "speed": speed
        }
        progress(0.1, desc="Extracting voice styles...")
        # Extract styles once (reused for every chunk below).
        with torch.no_grad():
            styles = model.get_styles(speaker, denoise, avg_style)
        print(f"[DEBUG] Styles extracted")
        first_chunk = True
        total_chunks = len(text_chunks)
        for idx, chunk in enumerate(text_chunks, 1):
            # Per-chunk try: one failed chunk is skipped, the stream continues.
            try:
                progress_val = 0.1 + (0.8 * idx / total_chunks)
                progress(progress_val, desc=f"Generating chunk {idx}/{total_chunks}...")
                print(f"[DEBUG] Processing chunk {idx}/{total_chunks}: {chunk[:50]}...")
                with torch.no_grad():
                    # Get phonemes for this chunk
                    phonemes = get_phoneme(text=chunk, lang="en-us")
                    if phonemes is None:
                        print(f"[DEBUG] Warning: Phoneme processing failed for chunk {idx}")
                        continue
                    # Generate audio for this chunk.  The literal 18 is a
                    # model.generate() step/parameter count — TODO confirm its
                    # meaning against the StyleTTS2-lite inference API.
                    audio_chunk = model.generate(phonemes, styles, stabilize, 18)
                # Scrub NaNs, peak-normalize to [-1, 1].
                audio_chunk = np.nan_to_num(audio_chunk)
                max_abs = np.max(np.abs(audio_chunk))
                if max_abs > 0:
                    audio_chunk /= max_abs
                else:
                    audio_chunk = np.zeros_like(audio_chunk)
                audio_chunk = np.clip(audio_chunk, -1, 1)
                print(f"[DEBUG] Generated chunk {idx}: {len(audio_chunk)} samples")
                # Yield the audio chunk
                yield (24000, audio_chunk.astype(np.float32))
                # NOTE(review): silence is inserted only once, after the FIRST
                # chunk — not between every pair of chunks as the comment
                # suggests.  Confirm whether that is intended.
                if first_chunk and total_chunks > 1:
                    first_chunk = False
                    silence = np.zeros(int(24000 * 0.1), dtype=np.float32)  # 0.1 second silence
                    yield (24000, silence)
                    print(f"[DEBUG] Added silence separator")
            except Exception as e:
                print(f"[DEBUG] Error processing chunk {idx}: {str(e)}")
                traceback.print_exc()
                continue
        progress(1.0, desc="Complete!")
        print(f"[DEBUG] ===== STREAMING GENERATION COMPLETE =====\n")
    except Exception as e:
        # Fatal (pre-loop or non-chunk) error: log and end the stream.
        error_message = traceback.format_exc()
        print(f"[DEBUG] ===== STREAMING ERROR =====")
        print(f"[DEBUG] Error: {str(e)}")
        print(f"[DEBUG] Traceback:\n{error_message}")
        print(f"[DEBUG] ===== END ERROR =====\n")
        yield None
# Non-streaming inference function (original)
def main(text_prompt, reference_paths, speed, denoise, avg_style, stabilize, seed, progress=gr.Progress()):
    """Synthesize the whole prompt in one pass.

    Returns ((24000, np.float32 waveform), status_message) on success, or
    (None, error_message) on any validation failure or exception.
    """
    try:
        print(f"\n[DEBUG] ===== GENERATION START =====")
        print(f"[DEBUG] Text prompt: {text_prompt[:100]}...")
        print(f"[DEBUG] Reference path: {reference_paths}")
        print(f"[DEBUG] Speed: {speed}, Denoise: {denoise}")
        print(f"[DEBUG] Avg style: {avg_style}, Stabilize: {stabilize}, Seed: {seed}")
        # Input validation: prompt present, reference selected and on disk.
        if not text_prompt or text_prompt.strip() == "":
            print(f"[DEBUG] Error: Empty text prompt")
            return None, "β Error: Please enter text to generate speech."
        if not reference_paths:
            print(f"[DEBUG] Error: No reference path")
            return None, "β Error: Please select a reference voice or upload your own audio."
        # Check if reference file exists
        if not os.path.exists(reference_paths):
            print(f"[DEBUG] Error: Reference file does not exist: {reference_paths}")
            return None, f"β Error: Reference file not found: {reference_paths}"
        print(f"[DEBUG] Reference file exists: {os.path.exists(reference_paths)}")
        # Set seed for reproducibility (-1 means "leave RNG state alone").
        if seed != -1:
            torch.manual_seed(seed)
            np.random.seed(seed)
            print(f"[DEBUG] Seed set to: {seed}")
        progress(0.1, desc="Initializing...")
        speaker = {
            "path": reference_paths,
            "speed": speed
        }
        print(f"[DEBUG] Speaker config: {speaker}")
        progress(0.3, desc="Processing phonemes...")
        with torch.no_grad():
            phonemes = get_phoneme(text=text_prompt, lang="en-us")
            if phonemes is None:
                print(f"[DEBUG] Error: Phoneme processing failed")
                return None, "β Error: Failed to process phonemes."
            print(f"[DEBUG] Phonemes processed successfully")
            progress(0.5, desc="Extracting voice styles...")
            print(f"[DEBUG] Getting styles from model...")
            styles = model.get_styles(speaker, denoise, avg_style)
            print(f"[DEBUG] Styles extracted: {type(styles)}")
            progress(0.7, desc="Generating audio...")
            print(f"[DEBUG] Generating audio with model...")
            # The literal 18 is a model.generate() step/parameter count —
            # TODO confirm its meaning against the StyleTTS2-lite inference API.
            r = model.generate(phonemes, styles, stabilize, 18)
            print(f"[DEBUG] Audio generated: shape={r.shape if hasattr(r, 'shape') else len(r)}")
        progress(0.9, desc="Finalizing...")
        # Scrub NaNs, peak-normalize to [-1, 1].
        r = np.nan_to_num(r)
        max_abs = np.max(np.abs(r))
        if max_abs > 0:
            r /= max_abs
        else:
            r = np.zeros_like(r)
        r = np.clip(r, -1, 1)
        print(f"[DEBUG] Audio normalized")
        # Calculate audio duration (output sample rate is 24 kHz).
        duration = len(r) / 24000
        print(f"[DEBUG] Audio duration: {duration:.2f}s")
        progress(1.0, desc="Complete!")
        print(f"[DEBUG] ===== GENERATION COMPLETE =====\n")
        return (24000, r.astype(np.float32)), f"β Audio generated successfully! Duration: {duration:.2f}s | Device: {device} | Seed: {seed if seed != -1 else 'Random'}"
    except Exception as e:
        # Any failure is surfaced to the status textbox with the traceback.
        error_message = traceback.format_exc()
        print(f"[DEBUG] ===== GENERATION ERROR =====")
        print(f"[DEBUG] Error type: {type(e).__name__}")
        print(f"[DEBUG] Error message: {str(e)}")
        print(f"[DEBUG] Full traceback:\n{error_message}")
        print(f"[DEBUG] ===== END ERROR =====\n")
        return None, f"β Error: {str(e)}\n\n{error_message}"
def handle_custom_audio_upload(audio_file, audio_source):
    """Validate an uploaded reference clip and copy it into the custom dir.

    Returns a 3-tuple (reference_path, preview_path, status_message); the
    path slots are None whenever the upload is rejected.
    """
    try:
        print(f"[DEBUG] handle_custom_audio_upload called")
        print(f"[DEBUG] Audio file: {audio_file}")
        print(f"[DEBUG] Audio source: {audio_source}")
        # Guard clauses: wrong source mode, or nothing uploaded.
        if audio_source != "custom":
            print(f"[DEBUG] Audio source is not custom, ignoring upload")
            return None, None, "β οΈ Please select 'Custom Upload' as audio source first."
        if audio_file is None:
            print(f"[DEBUG] No audio file provided")
            return None, None, "β οΈ Please upload an audio file."
        # Only accept common audio container extensions.
        valid_extensions = ['.wav', '.mp3', '.flac', '.ogg', '.m4a']
        _, file_ext = os.path.splitext(audio_file)
        file_ext = file_ext.lower()
        if file_ext not in valid_extensions:
            return None, None, f"β Invalid file format. Supported: {', '.join(valid_extensions)}"
        # Timestamped copy so repeated uploads never collide.
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        custom_path = os.path.join(custom_audio_dir, f"custom_ref_{stamp}{file_ext}")
        shutil.copy2(audio_file, custom_path)
        print(f"[DEBUG] Custom audio saved to: {custom_path}")
        # Decode the copy to prove it is readable and check its length.
        try:
            audio_data, sample_rate = sf.read(custom_path)
            duration = len(audio_data) / sample_rate
            print(f"[DEBUG] Audio validated: {duration:.2f}s @ {sample_rate}Hz")
            if duration < 1.0:
                os.remove(custom_path)
                return None, None, "β Audio too short. Please upload at least 1 second of audio."
            if duration > 30.0:
                # Long clips are accepted, just warned about.
                return None, custom_path, f"β οΈ Audio is {duration:.1f}s long. Shorter clips (3-10s) work best, but we'll use it."
            return custom_path, custom_path, f"β Custom audio uploaded! Duration: {duration:.2f}s @ {sample_rate}Hz"
        except Exception as e:
            # Unreadable file: clean up the copy before reporting.
            if os.path.exists(custom_path):
                os.remove(custom_path)
            return None, None, f"β Failed to read audio file: {str(e)}"
    except Exception as e:
        error_msg = traceback.format_exc()
        print(f"[DEBUG] Upload error: {error_msg}")
        return None, None, f"β Upload failed: {str(e)}"
def load_example_voice(example_voices):
    """Resolve a preset voice path back to its display label and report it."""
    print(f"[DEBUG] load_example_voice called with: {example_voices}")
    print(f"[DEBUG] Type: {type(example_voices)}")
    if not example_voices:
        # Nothing selected in the dropdown.
        print(f"[DEBUG] No voice selected")
        return None, "β οΈ No voice selected."
    # Reverse lookup: match the selected file path against the voice map.
    voice_name = "Unknown"
    for label, filename in voice_map.items():
        if os.path.join(voice_path, filename) == example_voices:
            voice_name = label
            print(f"[DEBUG] Found matching voice: {voice_name}")
            break
    if voice_name == "Unknown":
        print(f"[DEBUG] Warning: Could not find voice name for path: {example_voices}")
    result = example_voices, f"β Loaded voice: {voice_name}"
    print(f"[DEBUG] Returning: {result}")
    return result
def switch_audio_source(source):
    """Toggle the UI between the preset-voice dropdown and the custom uploader.

    Returns updates for (example_voices, custom_audio_upload,
    reference_audios, status) in that order.
    """
    print(f"[DEBUG] Switching audio source to: {source}")
    if source == "preset":
        # Preset mode: show dropdown, hide uploader, restore the default voice.
        return (
            gr.update(visible=True),
            gr.update(visible=False),
            voice_choices[0][1],
            "β Using preset voices",
        )
    # Custom mode: hide dropdown, show uploader, clear the reference preview.
    return (
        gr.update(visible=False),
        gr.update(visible=True),
        None,
        "π€ Upload your own reference audio (WAV, MP3, FLAC, OGG, M4A)",
    )
def random_text(category):
    """Pick a random example prompt for *category*; return (text, status).

    "All Categories" draws from every example.  Fix over the original: an
    unknown or empty category no longer raises IndexError via
    random.choice([]) — it falls back to the full pool instead.
    """
    print(f"[DEBUG] random_text called with category: {category}")
    texts = eg_texts.get(category, [])
    if category == "All Categories" or not texts:
        texts = [text for group in eg_texts.values() for text in group]
        print(f"[DEBUG] Selected random text from all categories")
    else:
        print(f"[DEBUG] Selected random text from {category}")
    selected = random.choice(texts)
    print(f"[DEBUG] Selected text: {selected[:50]}...")
    return selected, f"β Randomized text from: {category}"
def clear_all():
    """Reset the prompt, both audio widgets, the reference, and the status."""
    print(f"[DEBUG] Clearing all fields")
    cleared = ("", None, None, None, "β All fields cleared.")
    return cleared
def estimate_duration(text):
    """Rough speech-length estimate assuming ~150 spoken words per minute."""
    word_count = len(text.split())
    estimated_seconds = word_count / 150 * 60
    print(f"[DEBUG] Estimated duration for {word_count} words: {estimated_seconds:.1f}s")
    return f"β±οΈ Estimated duration: ~{estimated_seconds:.1f}s"
def generate_random_seed():
    """Draw a fresh 31-bit seed for reproducible generation runs."""
    new_seed = random.randint(0, 2**31 - 1)
    print(f"[DEBUG] Generated random seed: {new_seed}")
    return new_seed, "β Random seed generated."
def voice_button_click(vp, vn):
    """Handle a gallery button: select voice *vn* whose clip lives at *vp*.

    Returns (dropdown_value, preview_value, status_message).
    """
    print(f"[DEBUG] Voice button clicked: {vn}")
    print(f"[DEBUG] Voice path: {vp}")
    status_msg = f"β Selected: {vn}"
    result = (vp, vp, status_msg)
    print(f"[DEBUG] Returning: {result}")
    return result
def text_button_click(t):
    """Handle a library button: load example text *t* into the prompt box."""
    print(f"[DEBUG] Text button clicked")
    print(f"[DEBUG] Text: {t[:50]}...")
    result = (t, "β Loaded example text")
    print(f"[DEBUG] Returning: {result}")
    return result
# Custom CSS for better styling.
# Injected via gr.Blocks(css=...): caps the page width, styles the gradient
# header banner, the audio-source radio card, and the "STREAMING" badge.
custom_css = """
#main_container {
    max-width: 1400px;
    margin: auto;
}
.header {
    text-align: center;
    padding: 20px;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border-radius: 10px;
    margin-bottom: 20px;
}
.header h1 {
    color: white;
    font-size: 2.5em;
    margin: 0;
}
.header p {
    color: #f0f0f0;
    font-size: 1.1em;
    margin-top: 10px;
}
.audio-source-radio {
    background: #f8f9fa;
    padding: 15px;
    border-radius: 8px;
    margin: 10px 0;
}
.streaming-badge {
    display: inline-block;
    background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
    color: white;
    padding: 5px 15px;
    border-radius: 20px;
    font-weight: bold;
    margin-left: 10px;
}
"""
# Gradio UI
# Declarative layout: header, two-column controls (text/settings on the left,
# voice selection and outputs on the right), collapsible galleries, then all
# event wiring, then the footer.  Indentation reconstructed from context-
# manager nesting — the scrape lost the original whitespace.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    # Page header banner (styled by .header / .streaming-badge in custom_css).
    gr.HTML("""
    <div class="header">
        <h1>ποΈ StyleTTS2-Lite Pro <span class="streaming-badge">π₯ STREAMING</span></h1>
        <p>Advanced Text-to-Speech Synthesis with Real-time Streaming</p>
        <p style="font-size: 0.9em; color: #ffeb3b;">β¨ Now with Custom Audio Upload & Real-time Streaming!</p>
    </div>
    """)
    # Feature list plus live debug info (device, voice path, voice count).
    gr.Markdown(f"""
    ### π Features
    - **18 Premium Voices** (9 Female, 9 Male)
    - **π₯ Real-time Streaming** - Hear audio as it generates
    - **π Custom Audio Upload** - Use your own voice!
    - **Advanced Controls** (Speed, Denoising, Style Averaging)
    - **Text Categories** (Creative, Technical, Conversational, and more)
    - **Reproducible Seeds** for consistent results
    ---
    ### π Debug Information
    - **Device**: {device}
    - **Voice Path**: {voice_path}
    - **Available Voices**: {len(voice_choices)}
    """)
    with gr.Row(elem_id="main_container"):
        # Left column: text input and generation settings.
        with gr.Column(scale=1):
            gr.Markdown("### π Text Input")
            text_category = gr.Dropdown(
                label="Text Category",
                choices=["All Categories"] + list(eg_texts.keys()),
                value="All Categories",
                interactive=True
            )
            text_prompt = gr.Textbox(
                label="Text Prompt",
                placeholder="Enter your text here or use the randomize button...",
                lines=8,
                max_lines=15
            )
            # Read-only field showing the estimated speech duration.
            text_info = gr.Textbox(
                label="Text Info",
                value="",
                interactive=False,
                lines=1
            )
            with gr.Row():
                random_text_button = gr.Button("π² Randomize Text", variant="secondary")
                clear_button = gr.Button("ποΈ Clear All", variant="stop")
            # Live duration estimate as the user types.
            text_prompt.change(fn=estimate_duration, inputs=text_prompt, outputs=text_info)
            gr.Markdown("### ποΈ Audio Controls")
            with gr.Accordion("Basic Settings", open=True):
                speed = gr.Slider(
                    0.5, 2.0,
                    step=0.1,
                    value=1.0,
                    label="Speaking Speed",
                    info="Adjust how fast the speech is generated"
                )
                denoise = gr.Slider(
                    0.0, 1.0,
                    step=0.05,
                    value=0.2,
                    label="Denoise Strength",
                    info="Higher values produce cleaner but less expressive audio"
                )
            with gr.Accordion("Advanced Settings", open=False):
                avg_style = gr.Checkbox(
                    label="Use Average Styles",
                    value=True,
                    info="Blend multiple style characteristics for smoother output"
                )
                stabilize = gr.Checkbox(
                    label="Stabilize Speaking Speed",
                    value=True,
                    info="Maintain consistent pacing throughout generation"
                )
                seed = gr.Number(
                    label="Random Seed (-1 for random)",
                    value=-1,
                    precision=0,
                    info="Use same seed for reproducible results"
                )
                random_seed_button = gr.Button("π² Generate Random Seed", size="sm")
        # Right column: voice selection, outputs, and generation buttons.
        with gr.Column(scale=1):
            gr.Markdown("### π€ Voice Selection")
            audio_source = gr.Radio(
                choices=[("Preset Voices", "preset"), ("Custom Upload", "custom")],
                value="preset",
                label="Audio Source - Choose between preset voices or upload your own",
                elem_classes="audio-source-radio"
            )
            example_voices = gr.Dropdown(
                label="Select Preset Voice",
                choices=voice_choices,
                value=voice_choices[0][1],
                interactive=True,
                allow_custom_value=False,
                filterable=True,
                visible=True
            )
            # Hidden until the "custom" audio source is chosen.
            custom_audio_upload = gr.Audio(
                label="Upload Custom Reference Audio (3-10 seconds of clear speech)",
                type='filepath',
                visible=False
            )
            # Holds the reference clip path actually fed to generation.
            reference_audios = gr.Audio(
                label="Reference Audio Preview",
                type='filepath',
                interactive=False,
                value=voice_choices[0][1]
            )
            gr.Markdown("### π Generated Output")
            # Streaming output: consumes chunks yielded by generate_stream.
            streaming_audio = gr.Audio(
                label="π₯ Streaming Audio (Real-time)",
                type='numpy',
                interactive=False,
                streaming=True,
                autoplay=True
            )
            # Non-streaming output: the full waveform from main().
            synthesized_audio = gr.Audio(
                label="Complete Audio (Non-streaming)",
                type='numpy',
                interactive=False
            )
            with gr.Row():
                stream_button = gr.Button("π₯ Stream Speech", variant="primary", size="lg")
                gen_button = gr.Button("π£οΈ Generate Complete", variant="secondary", size="lg")
            status = gr.Textbox(
                label="Status",
                interactive=False,
                lines=3,
                placeholder="Status messages will appear here..."
            )
    # Voice examples section: one quick-select button per preset voice.
    with gr.Accordion("π Voice Gallery & Examples", open=False):
        gr.Markdown("### Quick Voice Preview")
        gr.Markdown("Browse through all available voices:")
        with gr.Row():
            # Split by the gender marker embedded in the display label.
            female_voices = [v for v in voice_choices if 'πΊ' in v[0]]
            male_voices = [v for v in voice_choices if 'πΉ' in v[0]]
            with gr.Column():
                gr.Markdown(f"**Female Voices ({len(female_voices)})**")
                for voice_name, voice_path_item in female_voices:
                    btn = gr.Button(voice_name, size="sm")
                    # gr.State pins this iteration's values at wiring time.
                    btn.click(
                        fn=voice_button_click,
                        inputs=[gr.State(voice_path_item), gr.State(voice_name)],
                        outputs=[example_voices, reference_audios, status]
                    )
            with gr.Column():
                gr.Markdown(f"**Male Voices ({len(male_voices)})**")
                for voice_name, voice_path_item in male_voices:
                    btn = gr.Button(voice_name, size="sm")
                    btn.click(
                        fn=voice_button_click,
                        inputs=[gr.State(voice_path_item), gr.State(voice_name)],
                        outputs=[example_voices, reference_audios, status]
                    )
    # Example texts section: browse all prompts, one "Load" button each.
    with gr.Accordion("π Example Text Library", open=False):
        gr.Markdown("### Browse Example Texts by Category")
        for category, texts in eg_texts.items():
            with gr.Accordion(f"{category} ({len(texts)} examples)", open=False):
                for idx, text in enumerate(texts, 1):
                    with gr.Row():
                        text_display = gr.Textbox(
                            label=f"Example {idx}",
                            value=text,
                            lines=3,
                            interactive=False,
                            scale=4
                        )
                        load_btn = gr.Button("π Load", size="sm", scale=1)
                    load_btn.click(
                        fn=text_button_click,
                        inputs=gr.State(text),
                        outputs=[text_prompt, status]
                    )
    # Event handlers
    # Random text button
    random_text_button.click(
        fn=random_text,
        inputs=text_category,
        outputs=[text_prompt, status]
    )
    # Clear all button
    clear_button.click(
        fn=clear_all,
        outputs=[text_prompt, reference_audios, streaming_audio, synthesized_audio, status]
    )
    # Random seed button
    random_seed_button.click(
        fn=generate_random_seed,
        outputs=[seed, status]
    )
    # Audio source switch
    audio_source.change(
        fn=switch_audio_source,
        inputs=audio_source,
        outputs=[example_voices, custom_audio_upload, reference_audios, status]
    )
    # Example voice selection
    example_voices.change(
        fn=load_example_voice,
        inputs=example_voices,
        outputs=[reference_audios, status]
    )
    # Custom audio upload.  NOTE(review): reference_audios is listed twice in
    # the outputs to absorb the handler's 3-tuple — confirm this is intended.
    custom_audio_upload.change(
        fn=handle_custom_audio_upload,
        inputs=[custom_audio_upload, audio_source],
        outputs=[reference_audios, reference_audios, status]
    )
    # Streaming generation button (generator fn -> streaming gr.Audio).
    stream_button.click(
        fn=generate_stream,
        inputs=[text_prompt, reference_audios, speed, denoise, avg_style, stabilize, seed],
        outputs=streaming_audio
    )
    # Non-streaming generation button
    gen_button.click(
        fn=main,
        inputs=[text_prompt, reference_audios, speed, denoise, avg_style, stabilize, seed],
        outputs=[synthesized_audio, status]
    )
    # Footer
    gr.Markdown(f"""
    ---
    ### π Usage Tips
    1. **Choose a Voice**: Select from 18 preset voices or upload your own reference audio (3-10 seconds recommended)
    2. **Enter Text**: Type or select example text from the library
    3. **Adjust Settings**: Fine-tune speed, denoising, and other parameters
    4. **Generate**:
       - Click "π₯ Stream Speech" for real-time audio generation (hear it as it's created)
       - Click "π£οΈ Generate Complete" for full audio generation at once
    5. **Experiment**: Try different voices, speeds, and text styles!
    ### βοΈ Parameter Guide
    - **Speaking Speed**: 0.5 = slow, 1.0 = normal, 2.0 = fast
    - **Denoise Strength**: Higher values = cleaner audio but less natural variation
    - **Average Styles**: Blends multiple style characteristics for consistency
    - **Stabilize Speed**: Maintains consistent pacing throughout speech
    - **Random Seed**: Use -1 for random, or set a specific number for reproducible results
    ### π― Custom Audio Tips
    - Use clear, high-quality recordings
    - 3-10 seconds of speech works best
    - Speak in a natural, conversational tone
    - Avoid background noise
    - Supported formats: WAV, MP3, FLAC, OGG, M4A
    ### π₯ Streaming vs Complete Generation
    - **Streaming**: Hear audio as it's being generated, chunk by chunk (great for long texts!)
    - **Complete**: Generates entire audio at once (better for short texts and downloading)
    ---
    **Model**: StyleTTS2-Lite | **Device**: {device} | **Voices**: {len(voice_choices)}
    π‘ *Tip: Use the seed parameter to generate the same audio multiple times with different settings!*
    """)
# Launch the app
if __name__ == "__main__":
    # Queue caps concurrent requests so generation jobs don't pile up.
    demo.queue(max_size=20)
    # share=True exposes a public Gradio link; debug/show_error surface
    # server-side exceptions in the UI.
    demo.launch(
        share=True,
        debug=True,
        show_error=True
    )