Spaces:
Sleeping
Sleeping
| # ============================================================ | |
| # Auto-accept Coqui license - MUST BE FIRST | |
| # ============================================================ | |
import builtins

# Keep a handle on the real input() so prompts we do not recognise still
# reach the terminal unchanged.
_original_input = builtins.input

# Substrings (matched case-insensitively) that mark a prompt as a
# license/confirmation question to be answered automatically.
_AUTO_ACCEPT_KEYWORDS = ('license', 'agree', 'confirm', 'cpml', 'coqui', 'y/n', '>')


def _patched_input(prompt=""):
    """Answer license/confirmation prompts with "y"; defer anything else to input().

    Args:
        prompt: The prompt passed to ``input()``. Optional, like the builtin,
            and converted with ``str()`` before matching so non-string
            prompts do not raise.

    Returns:
        ``"y"`` when the prompt contains one of the auto-accept keywords,
        otherwise whatever the original ``input()`` returns.
    """
    prompt_text = str(prompt)
    lowered = prompt_text.lower()
    if any(keyword in lowered for keyword in _AUTO_ACCEPT_KEYWORDS):
        print(f"[AUTO-ACCEPT] {prompt_text.strip()}")
        return "y"
    return _original_input(prompt)


# Installed at import time so it is already active when later imports prompt.
builtins.input = _patched_input
| import gradio as gr | |
| import torch | |
| import tempfile | |
| import os | |
| import time | |
| import subprocess | |
| import numpy as np | |
| import re | |
| import hashlib | |
| from pathlib import Path | |
| # ============================================================ | |
| # CONFIGURATION | |
| # ============================================================ | |
# Maximum number of characters accepted by a single (non-batch) synthesis call.
MAX_TEXT_LENGTH = 2000

# Language codes offered in the UI dropdowns and passed through to the TTS call.
SUPPORTED_LANGUAGES = [
    "en", "es", "fr", "de", "it", "pt", "pl",
    "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja",
]

# Maps a reference-file hash to the path of its already converted WAV so the
# same upload is not converted twice.
_audio_cache = {}

# Lazily created TTS model and the flags that track its loading state.
_tts = None
_model_loading = False
_model_error = None
_model_loaded = False
| # ============================================================ | |
| # AUDIO PROCESSING FUNCTIONS (Optimized) | |
| # ============================================================ | |
def get_file_hash(file_path):
    """Return the first 16 hex digits of the MD5 digest of a file's contents.

    The value is used as a cache key for converted audio, not for security.
    The file is read in fixed-size chunks so a large upload is never held in
    memory in one piece.

    Args:
        file_path: Path of the file to hash.

    Returns:
        A 16-character lowercase hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: f.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest()[:16]
def check_audio_duration(file_path):
    """Return the duration of an audio file in seconds, or 0 if it is unknown.

    Runs ``ffmpeg -i <file>`` and parses the ``Duration: HH:MM:SS.xx`` line
    that ffmpeg writes to stderr. ffmpeg exits non-zero here because no
    output file is given, so the return code is deliberately ignored.

    Args:
        file_path: Path of the audio file to inspect.

    Returns:
        The duration in seconds as a float, or ``0`` when ffmpeg cannot be
        run or prints no parsable duration.
    """
    try:
        # capture_output=True already pipes stdout and stderr; passing stderr=
        # as well makes subprocess.run raise ValueError, which previously made
        # this function return 0 for every file.
        result = subprocess.run(
            ['ffmpeg', '-i', file_path],
            capture_output=True,
            text=True,
            errors='replace',  # odd metadata bytes must not abort the probe
        )
    except (OSError, ValueError, TypeError, subprocess.SubprocessError):
        # ffmpeg missing or not executable, or an unusable path (e.g. None).
        return 0

    match = re.search(r'Duration: (\d{2}):(\d{2}):(\d{2}\.\d{2})', result.stderr)
    if not match:
        return 0
    h, m, s = match.groups()
    return int(h) * 3600 + int(m) * 60 + float(s)
def optimize_audio_for_xtts(input_path):
    """Convert an audio file to a mono 22050 Hz WAV and cache the result.

    The first attempt runs ffmpeg with loudness normalisation, silence
    removal and a band-pass filter chain. If that fails, times out, or
    produces a file of 10000 bytes or less, a plain mono/22050 Hz conversion
    is used instead. Results are cached in ``_audio_cache`` keyed by the
    hash of the input file.

    Args:
        input_path: Path of the source audio file.

    Returns:
        Path of the converted WAV file (a temporary file, or a cached one).

    Raises:
        subprocess.CalledProcessError: If the fallback conversion also fails.
        OSError: If the input cannot be read or ffmpeg cannot be started.
    """
    file_hash = get_file_hash(input_path)
    cached_path = _audio_cache.get(file_hash)
    # The cached file may have been deleted by a caller, so re-check the disk.
    if cached_path and os.path.exists(cached_path):
        return cached_path

    # Only the name is needed; close the handle right away and let ffmpeg
    # overwrite the file (-y).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name

    # Optimized ffmpeg command for XTTS
    cmd = [
        'ffmpeg', '-i', input_path,
        '-ac', '1',        # mono
        '-ar', '22050',    # 22050 Hz
        '-af', 'loudnorm=I=-16:LRA=11:TP=-1.5, silenceremove=1:0:-50dB, highpass=f=80, lowpass=f=8000',
        '-acodec', 'pcm_s16le',
        '-y', output_path
    ]
    try:
        subprocess.run(cmd, capture_output=True, check=True, timeout=30)
    except (subprocess.SubprocessError, OSError):
        pass  # fall through to the simpler conversion below
    else:
        # A tiny output means the filters stripped nearly everything.
        if os.path.exists(output_path) and os.path.getsize(output_path) > 10000:
            _audio_cache[file_hash] = output_path
            return output_path

    # Fallback: simpler conversion
    cmd2 = ['ffmpeg', '-i', input_path, '-ac', '1', '-ar', '22050', '-y', output_path]
    try:
        subprocess.run(cmd2, capture_output=True, check=True)
    except (subprocess.SubprocessError, OSError):
        # Do not leave an unusable temp file behind when both attempts fail.
        try:
            os.unlink(output_path)
        except OSError:
            pass
        raise
    _audio_cache[file_hash] = output_path
    return output_path
def extract_best_segment(input_path, target_duration=10):
    """Cut a long recording down to its leading segment and optimize it.

    Inputs no longer than ``target_duration + 2`` seconds go straight to
    ``optimize_audio_for_xtts``. Longer inputs are first trimmed to
    ``target_duration`` seconds after leading-silence removal. If that fails
    or leaves less than 5 seconds, the first ``target_duration`` seconds are
    taken unfiltered. The trimmed clip is then optimized.

    Args:
        input_path: Path of the source audio file.
        target_duration: Desired clip length in seconds.

    Returns:
        Path of the optimized WAV file.

    Raises:
        subprocess.CalledProcessError: If the fallback trim or the final
            optimization fails.
    """
    duration = check_audio_duration(input_path)
    if duration <= target_duration + 2:
        return optimize_audio_for_xtts(input_path)

    # Intermediate file for the trimmed clip; removed again before returning.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        segment_path = tmp.name

    try:
        # Trim after dropping leading silence.
        cmd = [
            'ffmpeg', '-i', input_path,
            '-af', f'silenceremove=1:0:-50dB, atrim=0:{target_duration}',
            '-ac', '1', '-ar', '22050',
            '-y', segment_path
        ]
        try:
            subprocess.run(cmd, capture_output=True, check=True, timeout=30)
        except (subprocess.SubprocessError, OSError):
            segment_ok = False
        else:
            segment_ok = check_audio_duration(segment_path) >= 5

        if not segment_ok:
            # Fallback: take first target_duration seconds
            cmd2 = ['ffmpeg', '-i', input_path, '-t', str(target_duration), '-ac', '1', '-ar', '22050', '-y', segment_path]
            subprocess.run(cmd2, capture_output=True, check=True)

        return optimize_audio_for_xtts(segment_path)
    finally:
        # The optimized result is written to a different file, so the
        # trimmed intermediate is no longer needed.
        try:
            os.unlink(segment_path)
        except OSError:
            pass
| # ============================================================ | |
| # MODEL LOADING (Optimized for speed) | |
| # ============================================================ | |
| def is_model_downloaded(): | |
| model_path = os.path.expanduser("~/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2") | |
| return os.path.exists(model_path) | |
def get_tts():
    """Return the shared XTTS model, loading it on first use.

    The model is kept in the module-level ``_tts``. A failed load is
    remembered in ``_model_error`` and re-raised on later calls, and a call
    made while another load is in progress is rejected rather than queued.

    Returns:
        The loaded ``TTS`` instance.

    Raises:
        Exception: If a previous load failed, a load is in progress, or this
            load fails. The message is the underlying error text.
    """
    global _tts, _model_loading, _model_error, _model_loaded
    if _tts is not None:
        return _tts
    if _model_error is not None:
        raise Exception(_model_error)
    if _model_loading:
        raise Exception("Model is loading, please wait...")

    _model_loading = True
    try:
        # Imported lazily so the UI can start before the TTS package loads.
        from TTS.api import TTS
        print("Loading XTTS model...")
        use_gpu = torch.cuda.is_available()
        model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=use_gpu)
        if use_gpu:
            model.to("cuda")
            print("Model loaded on GPU - FAST")
        else:
            print("Model loaded on CPU - slower")
    except Exception as e:
        _model_error = str(e)
        # Keep the original traceback attached for the container logs.
        raise Exception(str(e)) from e
    finally:
        _model_loading = False

    # Publish the model only after it is fully initialised, so a failed GPU
    # move cannot leave a half-configured instance behind.
    _tts = model
    _model_loaded = True
    return _tts
| # ============================================================ | |
| # SYNTHESIS FUNCTIONS (Optimized) | |
| # ============================================================ | |
def synthesize(text, reference_audio, language="en", speed=1.0, progress=gr.Progress()):
    """Synthesize ``text`` in the voice of ``reference_audio`` and return a WAV path.

    Validates the inputs, prepares the reference clip (recordings longer
    than 15 seconds are cut down first), loads the model and writes the
    result to a temporary WAV file. The processed reference clip is removed
    afterwards whether or not synthesis succeeded.

    Args:
        text: Text to speak; at most ``MAX_TEXT_LENGTH`` characters.
        reference_audio: Path of the reference voice recording.
        language: Language code passed to the model.
        speed: Speed value passed to the model.
        progress: Gradio progress tracker.

    Returns:
        Path of the generated WAV file.

    Raises:
        gr.Error: On invalid input or when synthesis fails.
    """
    progress(0, desc="Validating...")
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize")
    if len(text) > MAX_TEXT_LENGTH:
        raise gr.Error(f"Text exceeds {MAX_TEXT_LENGTH} characters")
    if reference_audio is None:
        raise gr.Error("Please upload or record a reference audio")

    # Get audio duration
    progress(0.05, desc="Analyzing audio...")
    duration = check_audio_duration(reference_audio)
    # 0 means the duration is unknown, so only reject clips known to be short.
    if 0 < duration < 3:
        raise gr.Error(f"Audio too short ({duration:.1f}s). Need 6-10 seconds")
    # Show warning for long audio
    if duration > 30:
        gr.Warning(f"Long audio ({duration:.1f}s). Extracting best 10-second segment for better quality")

    progress(0.1, desc="Optimizing audio...")
    # Extract a short segment for long audio
    if duration > 15:
        processed_audio = extract_best_segment(reference_audio, target_duration=10)
    else:
        processed_audio = optimize_audio_for_xtts(reference_audio)

    try:
        progress(0.2, desc="Loading model...")
        tts = get_tts()
        progress(0.4, desc="Synthesizing...")
        # Only the name is needed; the model writes the file itself.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name
        try:
            tts.tts_to_file(
                text=text.strip(),
                speaker_wav=processed_audio,
                language=language,
                file_path=output_path,
                speed=speed,
            )
        except Exception as e:
            # Drop the partial output so failed runs do not pile up on disk.
            try:
                os.unlink(output_path)
            except OSError:
                pass
            error = str(e)
            if "list index out of range" in error or "index out of range" in error:
                raise gr.Error(
                    "Voice extraction failed.\n\n"
                    "SOLUTION: Use microphone recording:\n"
                    "1. Click the microphone icon\n"
                    "2. Record 6-10 seconds clearly\n"
                    "3. Say: 'Hello, this is my test voice'\n"
                    "4. This works every time!\n\n"
                    "For uploaded files: Keep them short (6-10 seconds) and use WAV format."
                ) from e
            raise gr.Error(f"Error: {error[:200]}") from e
    finally:
        # Cleanup temp files, on success and on failure alike.
        if processed_audio != reference_audio and os.path.exists(processed_audio):
            try:
                os.unlink(processed_audio)
            except OSError:
                pass

    progress(1.0, desc="Complete!")
    return output_path
| # ============================================================ | |
| # BATCH SYNTHESIS | |
| # ============================================================ | |
def _split_text_into_chunks(text, chunk_size):
    """Greedily pack sentences of ``text`` into chunks of about ``chunk_size`` characters."""
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current = ""
    for sent in sentences:
        if len(current) + len(sent) + 1 <= chunk_size:
            current += (" " + sent) if current else sent
        else:
            if current:
                chunks.append(current)
            # A single sentence longer than chunk_size becomes its own chunk.
            current = sent
    if current:
        chunks.append(current)
    # Whitespace-only chunks would hand the model empty text.
    return [chunk for chunk in chunks if chunk.strip()]


def batch_synthesize(text, audio, language, speed, chunk_size=500, progress=gr.Progress()):
    """Synthesize long text by splitting it into chunks and joining the audio.

    Text of at most ``MAX_TEXT_LENGTH`` characters is delegated to
    ``synthesize``. Longer text is split at sentence boundaries, each chunk
    is synthesized with the same processed reference clip, and the chunk
    files are concatenated with pydub. Chunk files and the processed
    reference clip are removed afterwards, on failure as well as success.

    Args:
        text: Text to speak.
        audio: Path of the reference voice recording.
        language: Language code passed to the model.
        speed: Speed value passed to the model.
        chunk_size: Target chunk length in characters.
        progress: Gradio progress tracker.

    Returns:
        Path of the combined WAV file.

    Raises:
        gr.Error: On missing input or when the chunks cannot be combined.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text")
    if len(text) <= MAX_TEXT_LENGTH:
        return synthesize(text, audio, language, speed, progress)
    # synthesize() checks this on the short path; the long path needs it too.
    if audio is None:
        raise gr.Error("Please upload or record a reference audio")

    progress(0.05, desc="Splitting text...")
    chunks = _split_text_into_chunks(text, chunk_size)
    progress(0.1, desc=f"Processing {len(chunks)} chunks...")

    # Process audio once
    duration = check_audio_duration(audio)
    if duration > 15:
        processed_audio = extract_best_segment(audio, target_duration=10)
    else:
        processed_audio = optimize_audio_for_xtts(audio)

    audio_files = []
    try:
        tts = get_tts()
        for i, chunk in enumerate(chunks):
            prog = 0.1 + (0.7 * (i + 1) / len(chunks))
            progress(prog, desc=f"Chunk {i+1}/{len(chunks)}...")
            with tempfile.NamedTemporaryFile(suffix=f"_{i}.wav", delete=False) as tmp:
                chunk_path = tmp.name
            # Register before synthesizing so a failed chunk is cleaned up too.
            audio_files.append(chunk_path)
            tts.tts_to_file(
                text=chunk.strip(),
                speaker_wav=processed_audio,
                language=language,
                file_path=chunk_path,
                speed=speed,
            )

        progress(0.85, desc="Combining audio...")
        try:
            from pydub import AudioSegment
            combined = AudioSegment.empty()
            for f in audio_files:
                combined += AudioSegment.from_wav(f)
            with tempfile.NamedTemporaryFile(suffix="_combined.wav", delete=False) as tmp:
                output = tmp.name
            combined.export(output, format="wav")
        except Exception as e:
            raise gr.Error(f"Failed to combine: {str(e)[:100]}") from e
    finally:
        # Cleanup
        for f in audio_files:
            try:
                os.unlink(f)
            except OSError:
                pass
        if processed_audio != audio and os.path.exists(processed_audio):
            try:
                os.unlink(processed_audio)
            except OSError:
                pass

    progress(1.0, desc="Complete!")
    return output
| # ============================================================ | |
| # UTILITIES | |
| # ============================================================ | |
def clear_cache():
    """Drop the loaded model and reset its load state.

    Besides releasing the model reference (and cached CUDA memory when a GPU
    is available), this clears the remembered load error and the loaded
    flag. Without that, a failed load would keep being re-raised by
    ``get_tts`` and the state flags would disagree with ``_tts``.

    Returns:
        The status message ``"Cache cleared"``.
    """
    global _tts, _model_error, _model_loaded
    _tts = None
    _model_error = None
    _model_loaded = False
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return "Cache cleared"
def get_status():
    """Return a short human-readable description of the model state.

    A loaded model takes precedence, then an in-progress load, then a
    recorded load error; otherwise the model has simply not been requested.
    """
    if _tts is None:
        if _model_loading:
            return "Loading model..."
        if _model_error:
            return "Error"
        return "Ready - model loads on first use"

    if torch.cuda.is_available():
        return "Model ready (GPU)"
    return "Model ready (CPU)"
def estimate_duration(text, speed=1.0):
    """Format a rough speaking-time estimate for ``text`` as "Xs" or "Xm Ys".

    The estimate is ``len(text) / (160 * speed)`` seconds, truncated to
    whole seconds. Empty or missing text yields "0s".
    """
    if not text:
        return "0s"

    # NOTE(review): 160 characters per second looks high for speech; confirm
    # the intended unit before relying on this estimate.
    total_seconds = len(text) / (160 * speed)
    whole_minutes, leftover = divmod(total_seconds, 60)
    minutes = int(whole_minutes)
    seconds = int(leftover)

    if minutes > 0:
        return f"{minutes}m {seconds}s"
    return f"{seconds}s"
| # ============================================================ | |
| # UI - NO TRIPLE QUOTES | |
| # ============================================================ | |
# Builds the three-tab Gradio interface: single synthesis, batch synthesis
# for long text, and a static help page. The component nesting below is
# reconstructed from the statement order.
with gr.Blocks(title="VoxForge TTS", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ VoxForge TTS - Professional Voice Cloning")
    gr.Markdown("Upload any voice or use microphone to record 6-10 seconds. First use downloads model (2-5 min).")
    with gr.Tabs():
        # TAB 1: Standard Synthesis
        with gr.Tab("Standard Synthesis"):
            with gr.Row():
                with gr.Column(scale=1):
                    text_input = gr.Textbox(label="Text to Synthesize", lines=6, max_length=2000, placeholder="Enter text up to 2000 characters... Example: Hello, this is my cloned voice.")
                    with gr.Row():
                        char_count = gr.Label("0/2000")
                        duration_est = gr.Label("Est. 0s")
                    ref_audio = gr.Audio(label="Reference Voice (6-10 seconds - Click microphone to record!)", type="filepath", sources=["upload", "microphone"])
                    with gr.Row():
                        language = gr.Dropdown(choices=SUPPORTED_LANGUAGES, value="en", label="Language")
                        speed = gr.Slider(0.5, 2.0, value=1.0, step=0.05, label="Speed")
                    with gr.Row():
                        gen_btn = gr.Button("Generate Speech", variant="primary", size="lg")
                        clear_btn = gr.Button("Clear", variant="secondary", size="lg")
                    with gr.Accordion("Advanced Options", open=False):
                        status_text = gr.Label(get_status())
                        clear_cache_btn = gr.Button("Clear Model Cache", size="sm")
                with gr.Column(scale=1):
                    audio_output = gr.Audio(label="Generated Speech", type="filepath")
                    gr.Markdown("### Tips for Best Results")
                    gr.Markdown("1. Use microphone (click the mic icon) - records perfectly every time!")
                    gr.Markdown("2. Speak clearly for 6-10 seconds")
                    gr.Markdown("3. No background noise - one speaker only")
                    gr.Markdown("4. Match language to your voice")
                    gr.Markdown("")
                    gr.Markdown("### Performance Notes")
                    gr.Markdown("- First synthesis: 2-5 min (downloads 4GB model)")
                    gr.Markdown("- After that: 10-30 seconds")
                    gr.Markdown("- Enable GPU in Space settings for faster results")
                    gr.Markdown("")
                    gr.Markdown("### Supported Languages")
                    gr.Markdown("English, Spanish, French, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese, Japanese")

            def update_char(t):
                """Return the "<count>/2000" character counter text."""
                return f"{len(t)}/2000" if t else "0/2000"

            def update_dur(t, s):
                """Return the "Est. ..." duration label for text ``t`` at speed ``s``."""
                return f"Est. {estimate_duration(t, s)}" if t else "Est. 0s"

            text_input.change(update_char, [text_input], [char_count])
            text_input.change(update_dur, [text_input, speed], [duration_est])
            speed.change(update_dur, [text_input, speed], [duration_est])
            gen_btn.click(synthesize, [text_input, ref_audio, language, speed], [audio_output])
            clear_btn.click(lambda: ("", None, "en", 1.0), None, [text_input, ref_audio, language, speed])
            # Show the "Cache cleared" message first, then the refreshed status.
            clear_cache_btn.click(clear_cache, None, [status_text]).then(lambda: get_status(), None, [status_text])

        # TAB 2: Batch Synthesis
        with gr.Tab("Batch (Long Text)"):
            gr.Markdown("### For texts over 2000 characters")
            gr.Markdown("Automatically splits into chunks and combines audio.")
            with gr.Row():
                with gr.Column(scale=1):
                    long_text = gr.Textbox(label="Long Text", lines=12, max_length=10000, placeholder="Paste long text here (up to 10000 characters)...")
                    chunk_slider = gr.Slider(300, 800, value=500, step=50, label="Chunk Size (characters)")
                    batch_audio = gr.Audio(label="Reference Voice", type="filepath", sources=["upload", "microphone"])
                    with gr.Row():
                        batch_lang = gr.Dropdown(choices=SUPPORTED_LANGUAGES, value="en", label="Language")
                        batch_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.05, label="Speed")
                    batch_btn = gr.Button("Generate Long Speech", variant="primary", size="lg")
                with gr.Column(scale=1):
                    batch_output = gr.Audio(label="Generated Speech (Combined)", type="filepath")
                    gr.Markdown("### Batch Mode Info")
                    gr.Markdown("- Splits at sentence boundaries (., !, ?)")
                    gr.Markdown("- Synthesizes each chunk separately")
                    gr.Markdown("- Combines all chunks into one file")
                    gr.Markdown("- Best for audiobooks, presentations, long narrations")
            batch_btn.click(batch_synthesize, [long_text, batch_audio, batch_lang, batch_speed, chunk_slider], [batch_output])

        # TAB 3: Help
        with gr.Tab("Help & Troubleshooting"):
            gr.Markdown("# Help Guide")
            gr.Markdown("")
            gr.Markdown("## Quick Start")
            gr.Markdown("")
            gr.Markdown("1. Record your voice (click microphone icon, say 6-10 seconds)")
            gr.Markdown("2. Type text you want to synthesize")
            gr.Markdown("3. Click Generate - works in 10-30 seconds")
            gr.Markdown("")
            gr.Markdown("## Troubleshooting")
            gr.Markdown("")
            gr.Markdown("### Voice extraction fails")
            gr.Markdown("- Use microphone recording - this works 100% of the time")
            gr.Markdown("- Uploaded files must be short (6-10 seconds), WAV format, clean speech")
            gr.Markdown("")
            gr.Markdown("### First use is slow")
            gr.Markdown("- Normal! Downloads 4GB model (2-5 minutes)")
            gr.Markdown("- Subsequent uses are fast (10-30 seconds)")
            gr.Markdown("- Enable GPU in Space settings for faster performance")
            gr.Markdown("")
            gr.Markdown("### Improve quality")
            gr.Markdown("- Use 8-10 second recording")
            gr.Markdown("- No background noise")
            gr.Markdown("- One speaker only")
            gr.Markdown("- Speak clearly at normal pace")
            gr.Markdown("")
            gr.Markdown("## Converting your existing audio")
            gr.Markdown("")
            # The fenced example must live in ONE Markdown component: split
            # across components the fence never closes and the "#" comment
            # lines render as headings. Joined with "\n" to avoid triple quotes.
            gr.Markdown("\n".join([
                "```bash",
                "# Extract best 10 seconds from long audio",
                "ffmpeg -i long_audio.mp3 -t 10 -ac 1 -ar 22050 short.wav",
                "# Remove silence",
                "ffmpeg -i input.wav -af silenceremove=1:0:-50dB output.wav",
                "```",
            ]))
            gr.Markdown("")
            gr.Markdown("## Need more help?")
            gr.Markdown("Check container logs in Space Settings.")
if __name__ == "__main__":
    # Listen on all interfaces; the port comes from the PORT environment
    # variable and falls back to 7860 when it is unset.
    listen_port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=listen_port, show_error=True)