# ============================================================
# Auto-accept Coqui license - MUST BE FIRST
# ============================================================
import builtins

_original_input = builtins.input

# Substrings that mark a prompt as a license / confirmation question.
_AUTO_ACCEPT_KEYWORDS = ('license', 'agree', 'confirm', 'cpml', 'coqui', 'y/n', '>')


def _patched_input(prompt=""):
    """Replacement for ``input`` that answers confirmation prompts with "y".

    Prompts containing one of ``_AUTO_ACCEPT_KEYWORDS`` (case-insensitive)
    are echoed and answered automatically; every other prompt is forwarded
    to the original ``input``.

    ``prompt`` defaults to "" like the builtin, so a bare ``input()`` call
    keeps working after the patch is installed.
    """
    prompt_text = str(prompt)
    lowered = prompt_text.lower()
    if any(keyword in lowered for keyword in _AUTO_ACCEPT_KEYWORDS):
        print(f"[AUTO-ACCEPT] {prompt_text.strip()}")
        return "y"
    return _original_input(prompt)


builtins.input = _patched_input

import gradio as gr
import torch
import tempfile
import os
import time
import subprocess
import numpy as np
import re
import hashlib
from pathlib import Path

# ============================================================
# CONFIGURATION
# ============================================================
MAX_TEXT_LENGTH = 2000
MAX_BATCH_TEXT_LENGTH = 10000
SUPPORTED_LANGUAGES = ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]

# Timeout (seconds) for the ffmpeg calls that are allowed to fall back.
_FFMPEG_TIMEOUT = 30

# Matches the "Duration: HH:MM:SS.cc" line ffmpeg prints on stderr.
_DURATION_RE = re.compile(r'Duration: (\d{2}):(\d{2}):(\d{2}\.\d{2})')

# Cache for processed audio to avoid reprocessing (content hash -> wav path)
_audio_cache = {}
_tts = None
_model_loading = False
_model_error = None
_model_loaded = False


# ============================================================
# AUDIO PROCESSING FUNCTIONS (Optimized)
# ============================================================
def _new_temp_wav(suffix=".wav"):
    """Create an empty temp file with ``suffix`` and return its path.

    The handle is closed before returning so no file descriptor is left
    open; the file itself is kept (``delete=False``) for ffmpeg/TTS to
    overwrite.
    """
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        return handle.name


def _remove_quietly(path):
    """Best-effort delete of ``path``; a missing file or OS error is ignored."""
    if not path:
        return
    try:
        os.unlink(path)
    except OSError:
        pass


def _discard_processed_audio(processed_audio, source_audio):
    """Delete a processed reference clip and evict it from ``_audio_cache``.

    Nothing happens when ``processed_audio`` is the caller's own
    ``source_audio`` file, which must never be deleted.
    """
    if not processed_audio or processed_audio == source_audio:
        return
    _remove_quietly(processed_audio)
    stale_keys = [key for key, cached in _audio_cache.items() if cached == processed_audio]
    for key in stale_keys:
        del _audio_cache[key]


def get_file_hash(file_path):
    """Return the first 16 hex chars of the file's MD5 (cache key only)."""
    digest = hashlib.md5()
    with open(file_path, 'rb') as f:
        # Read in 1 MiB blocks so large uploads are not loaded at once.
        for block in iter(lambda: f.read(1 << 20), b''):
            digest.update(block)
    return digest.hexdigest()[:16]


def check_audio_duration(file_path):
    """Return the audio duration in seconds as reported by ffmpeg.

    Returns 0 when the path is empty, ffmpeg cannot be run, or no
    ``Duration:`` line is found in its output.
    """
    if not file_path:
        return 0
    try:
        # capture_output already pipes stderr; passing stderr= as well makes
        # subprocess.run raise ValueError, which previously forced a 0 result.
        result = subprocess.run(
            ['ffmpeg', '-i', file_path],
            capture_output=True,
            text=True,
            errors='replace',
            timeout=_FFMPEG_TIMEOUT,
        )
    except (OSError, ValueError, TypeError, subprocess.SubprocessError):
        return 0
    match = _DURATION_RE.search(result.stderr or "")
    if not match:
        return 0
    hours, minutes, seconds = match.groups()
    return int(hours) * 3600 + int(minutes) * 60 + float(seconds)


def optimize_audio_for_xtts(input_path):
    """Convert ``input_path`` to a mono 22050 Hz WAV and return its path.

    A filtered conversion (loudness normalisation, silence removal,
    band-pass) is tried first; if it fails or yields a file of 10000 bytes
    or less, a plain resample is used instead. Results are cached in
    ``_audio_cache`` by content hash.

    Raises whatever ``subprocess.run`` raises if the fallback conversion
    fails too (e.g. ``CalledProcessError``, ``FileNotFoundError``).
    """
    file_hash = get_file_hash(input_path)
    cached_path = _audio_cache.get(file_hash)
    if cached_path and os.path.exists(cached_path):
        return cached_path

    output_path = _new_temp_wav()

    # Optimized ffmpeg command for XTTS
    cmd = [
        'ffmpeg', '-i', input_path,
        '-ac', '1',        # mono
        '-ar', '22050',    # 22050 Hz (XTTS optimal)
        '-af', 'loudnorm=I=-16:LRA=11:TP=-1.5, silenceremove=1:0:-50dB, highpass=f=80, lowpass=f=8000',
        '-acodec', 'pcm_s16le',
        '-y', output_path
    ]
    try:
        subprocess.run(cmd, capture_output=True, check=True, timeout=_FFMPEG_TIMEOUT)
    except (OSError, subprocess.SubprocessError):
        filtered_ok = False
    else:
        # Verify output
        filtered_ok = os.path.exists(output_path) and os.path.getsize(output_path) > 10000

    if not filtered_ok:
        # Fallback: simpler conversion
        cmd2 = ['ffmpeg', '-i', input_path, '-ac', '1', '-ar', '22050', '-y', output_path]
        try:
            subprocess.run(cmd2, capture_output=True, check=True)
        except BaseException:
            # Do not leave an orphaned temp file behind; re-raise unchanged.
            _remove_quietly(output_path)
            raise

    _audio_cache[file_hash] = output_path
    return output_path


def extract_best_segment(input_path, target_duration=10):
    """Trim long audio to about ``target_duration`` seconds, then optimize it.

    Audio no longer than ``target_duration + 2`` seconds is optimized as
    is. Otherwise leading silence is removed and the clip trimmed; if that
    fails or yields under 5 seconds, the first ``target_duration`` seconds
    are taken instead. Returns the path from ``optimize_audio_for_xtts``.
    """
    duration = check_audio_duration(input_path)
    if duration <= target_duration + 2:
        return optimize_audio_for_xtts(input_path)

    segment_path = _new_temp_wav()
    try:
        # Extract the first clear speech segment (XTTS works best with short clips)
        cmd = [
            'ffmpeg', '-i', input_path,
            '-af', f'silenceremove=1:0:-50dB, atrim=0:{target_duration}',
            '-ac', '1', '-ar', '22050',
            '-y', segment_path
        ]
        try:
            subprocess.run(cmd, capture_output=True, check=True, timeout=_FFMPEG_TIMEOUT)
        except (OSError, subprocess.SubprocessError):
            trimmed_ok = False
        else:
            trimmed_ok = check_audio_duration(segment_path) >= 5

        if not trimmed_ok:
            # Fallback: take first target_duration seconds
            cmd2 = ['ffmpeg', '-i', input_path, '-t', str(target_duration),
                    '-ac', '1', '-ar', '22050', '-y', segment_path]
            subprocess.run(cmd2, capture_output=True, check=True)

        return optimize_audio_for_xtts(segment_path)
    finally:
        # The optimized result lives in its own file; the intermediate
        # segment is no longer needed.
        _remove_quietly(segment_path)


def _prepare_reference_audio(audio_path, duration):
    """Return the processed reference clip for a source of ``duration`` seconds."""
    if duration > 15:
        return extract_best_segment(audio_path, target_duration=10)
    return optimize_audio_for_xtts(audio_path)


# ============================================================
# MODEL LOADING (Optimized for speed)
# ============================================================
def is_model_downloaded():
    """Return True if the XTTS v2 model directory exists under ~/.local/share/tts."""
    model_path = os.path.expanduser("~/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2")
    return os.path.exists(model_path)


def get_tts():
    """Return the shared XTTS model, loading it on first use.

    Raises:
        Exception: if a previous load failed (message of that failure), if
            a load is already in progress, or if loading fails now.
    """
    global _tts, _model_loading, _model_error, _model_loaded
    if _tts is not None:
        return _tts
    if _model_error is not None:
        raise Exception(_model_error)
    if _model_loading:
        raise Exception("Model is loading, please wait...")

    _model_loading = True
    try:
        # Imported lazily so the app starts without loading the TTS package.
        from TTS.api import TTS
        print("Loading XTTS model...")
        use_gpu = torch.cuda.is_available()
        model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=use_gpu)
        if use_gpu:
            model.to("cuda")
            print("Model loaded on GPU - FAST")
        else:
            print("Model loaded on CPU - slower")
    except Exception as e:
        _model_error = str(e)
        raise Exception(str(e)) from e
    finally:
        _model_loading = False

    # Publish the model only once it is fully initialised, so a failure in
    # .to("cuda") cannot leave a half-configured instance behind.
    _tts = model
    _model_loaded = True
    return _tts


# ============================================================
# SYNTHESIS FUNCTIONS (Optimized)
# ============================================================
def synthesize(text, reference_audio, language="en", speed=1.0, progress=gr.Progress()):
    """Synthesize ``text`` in the voice of ``reference_audio``.

    Args:
        text: Text to speak; must be non-empty and at most MAX_TEXT_LENGTH.
        reference_audio: Path to the reference voice recording.
        language: Language code passed to the model.
        speed: Speed factor passed to the model.
        progress: Gradio progress tracker.

    Returns:
        Path to the generated WAV file.

    Raises:
        gr.Error: on invalid input or when synthesis fails.
    """
    progress(0, desc="Validating...")
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize")
    if len(text) > MAX_TEXT_LENGTH:
        raise gr.Error(f"Text exceeds {MAX_TEXT_LENGTH} characters")
    if reference_audio is None:
        raise gr.Error("Please upload or record a reference audio")

    # Get audio duration
    progress(0.05, desc="Analyzing audio...")
    duration = check_audio_duration(reference_audio)
    # A duration of 0 means "unknown", so it is not treated as too short.
    if 0 < duration < 3:
        raise gr.Error(f"Audio too short ({duration:.1f}s). Need 6-10 seconds")

    # Show warning for long audio
    if duration > 30:
        gr.Warning(f"Long audio ({duration:.1f}s). Extracting best 10-second segment for better quality")

    progress(0.1, desc="Optimizing audio...")
    # Extract best segment for long audio
    processed_audio = _prepare_reference_audio(reference_audio, duration)

    try:
        progress(0.2, desc="Loading model...")
        tts = get_tts()

        progress(0.4, desc="Synthesizing...")
        output_path = _new_temp_wav()
        try:
            tts.tts_to_file(
                text=text.strip(),
                speaker_wav=processed_audio,
                language=language,
                file_path=output_path,
                speed=speed,
            )
        except Exception as e:
            _remove_quietly(output_path)
            error = str(e)
            if "index out of range" in error:
                raise gr.Error(
                    "Voice extraction failed.\n\n"
                    "SOLUTION: Use microphone recording:\n"
                    "1. Click the microphone icon\n"
                    "2. Record 6-10 seconds clearly\n"
                    "3. Say: 'Hello, this is my test voice'\n"
                    "4. This works every time!\n\n"
                    "For uploaded files: Keep them short (6-10 seconds) and use WAV format."
                ) from e
            raise gr.Error(f"Error: {error[:200]}") from e
    finally:
        # Cleanup temp files, on success and on failure alike.
        _discard_processed_audio(processed_audio, reference_audio)

    progress(1.0, desc="Complete!")
    return output_path


# ============================================================
# BATCH SYNTHESIS
# ============================================================
def _split_into_chunks(text, chunk_size):
    """Group the sentences of ``text`` into chunks of about ``chunk_size`` chars.

    Sentences are split after ``.``, ``!`` or ``?`` followed by whitespace.
    A single sentence longer than ``chunk_size`` becomes its own chunk.
    """
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]
    chunks = []
    current = ""
    for sent in sentences:
        if len(current) + len(sent) + 1 <= chunk_size:
            current = f"{current} {sent}" if current else sent
        else:
            if current:
                chunks.append(current)
            current = sent
    if current:
        chunks.append(current)
    return chunks


def batch_synthesize(text, audio, language, speed, chunk_size=500, progress=gr.Progress()):
    """Split long text into chunks, synthesize each, and join the audio.

    Text of at most MAX_TEXT_LENGTH characters is delegated to
    ``synthesize``. Returns the path to the combined WAV file.

    Raises:
        gr.Error: on missing text/audio, or when a chunk or the final
            combination fails.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text")
    if len(text) <= MAX_TEXT_LENGTH:
        return synthesize(text, audio, language, speed, progress)
    if audio is None:
        raise gr.Error("Please upload or record a reference audio")

    progress(0.05, desc="Splitting text...")
    # Smart text splitting
    chunks = _split_into_chunks(text, chunk_size)

    progress(0.1, desc=f"Processing {len(chunks)} chunks...")
    # Process audio once
    duration = check_audio_duration(audio)
    processed_audio = _prepare_reference_audio(audio, duration)

    audio_files = []
    try:
        tts = get_tts()
        total = len(chunks)
        for i, chunk in enumerate(chunks):
            progress(0.1 + (0.7 * (i + 1) / total), desc=f"Chunk {i+1}/{total}...")
            chunk_path = _new_temp_wav(f"_{i}.wav")
            # Registered before synthesis so it is cleaned up on failure too.
            audio_files.append(chunk_path)
            try:
                tts.tts_to_file(
                    text=chunk.strip(),
                    speaker_wav=processed_audio,
                    language=language,
                    file_path=chunk_path,
                    speed=speed,
                )
            except Exception as e:
                raise gr.Error(f"Error: {str(e)[:200]}") from e

        progress(0.85, desc="Combining audio...")
        try:
            from pydub import AudioSegment
            combined = AudioSegment.empty()
            for chunk_file in audio_files:
                combined += AudioSegment.from_wav(chunk_file)
            output = _new_temp_wav("_combined.wav")
            combined.export(output, format="wav")
        except Exception as e:
            raise gr.Error(f"Failed to combine: {str(e)[:100]}") from e
    finally:
        # Cleanup
        for chunk_file in audio_files:
            _remove_quietly(chunk_file)
        _discard_processed_audio(processed_audio, audio)

    progress(1.0, desc="Complete!")
    return output


# ============================================================
# UTILITIES
# ============================================================
def clear_cache():
    """Drop the loaded model (and any recorded load error) and free GPU memory.

    Clearing the stored error lets the next synthesis retry the load
    instead of re-raising the old failure forever.
    """
    global _tts, _model_error, _model_loaded
    _tts = None
    _model_error = None
    _model_loaded = False
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return "Cache cleared"


def get_status():
    """Return a short human-readable model status string for the UI."""
    if _tts is not None:
        return "Model ready (GPU)" if torch.cuda.is_available() else "Model ready (CPU)"
    if _model_loading:
        return "Loading model..."
    if _model_error:
        return "Error"
    return "Ready - model loads on first use"


def estimate_duration(text, speed=1.0):
    """Return a rough duration label ("Xm Ys" or "Ys") for ``text``.

    Returns "0s" for empty text or a non-positive ``speed``.
    """
    if not text or not speed or speed <= 0:
        return "0s"
    # NOTE(review): 160 characters per second looks like a words-per-minute
    # figure; confirm the intended rate before relying on this estimate.
    chars_per_sec = 160 * speed
    total_secs = len(text) / chars_per_sec
    mins = int(total_secs // 60)
    secs = int(total_secs % 60)
    return f"{mins}m {secs}s" if mins > 0 else f"{secs}s"


# ============================================================
# UI - NO TRIPLE QUOTES
# ============================================================
with gr.Blocks(title="VoxForge TTS", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ VoxForge TTS - Professional Voice Cloning")
    gr.Markdown("Upload any voice or use microphone to record 6-10 seconds. First use downloads model (2-5 min).")

    with gr.Tabs():
        # TAB 1: Standard Synthesis
        with gr.Tab("Standard Synthesis"):
            with gr.Row():
                with gr.Column(scale=1):
                    text_input = gr.Textbox(
                        label="Text to Synthesize",
                        lines=6,
                        max_length=MAX_TEXT_LENGTH,
                        placeholder=f"Enter text up to {MAX_TEXT_LENGTH} characters... Example: Hello, this is my cloned voice.",
                    )
                    with gr.Row():
                        char_count = gr.Label(f"0/{MAX_TEXT_LENGTH}")
                        duration_est = gr.Label("Est. 0s")
                    ref_audio = gr.Audio(
                        label="Reference Voice (6-10 seconds - Click microphone to record!)",
                        type="filepath",
                        sources=["upload", "microphone"],
                    )
                    with gr.Row():
                        language = gr.Dropdown(choices=SUPPORTED_LANGUAGES, value="en", label="Language")
                        speed = gr.Slider(0.5, 2.0, value=1.0, step=0.05, label="Speed")
                    with gr.Row():
                        gen_btn = gr.Button("Generate Speech", variant="primary", size="lg")
                        clear_btn = gr.Button("Clear", variant="secondary", size="lg")
                    with gr.Accordion("Advanced Options", open=False):
                        status_text = gr.Label(get_status())
                        clear_cache_btn = gr.Button("Clear Model Cache", size="sm")

                with gr.Column(scale=1):
                    audio_output = gr.Audio(label="Generated Speech", type="filepath")
                    gr.Markdown("### Tips for Best Results")
                    gr.Markdown("1. Use microphone (click the mic icon) - records perfectly every time!")
                    gr.Markdown("2. Speak clearly for 6-10 seconds")
                    gr.Markdown("3. No background noise - one speaker only")
                    gr.Markdown("4. Match language to your voice")
                    gr.Markdown("")
                    gr.Markdown("### Performance Notes")
                    gr.Markdown("- First synthesis: 2-5 min (downloads 4GB model)")
                    gr.Markdown("- After that: 10-30 seconds")
                    gr.Markdown("- Enable GPU in Space settings for faster results")
                    gr.Markdown("")
                    gr.Markdown("### Supported Languages")
                    gr.Markdown("English, Spanish, French, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese, Japanese")

            def update_char(t):
                return f"{len(t)}/{MAX_TEXT_LENGTH}" if t else f"0/{MAX_TEXT_LENGTH}"

            def update_dur(t, s):
                return f"Est. {estimate_duration(t, s)}" if t else "Est. 0s"

            text_input.change(update_char, [text_input], [char_count])
            text_input.change(update_dur, [text_input, speed], [duration_est])
            speed.change(update_dur, [text_input, speed], [duration_est])
            gen_btn.click(synthesize, [text_input, ref_audio, language, speed], [audio_output])
            clear_btn.click(lambda: ("", None, "en", 1.0), None, [text_input, ref_audio, language, speed])
            clear_cache_btn.click(clear_cache, None, [status_text]).then(get_status, None, [status_text])

        # TAB 2: Batch Synthesis
        with gr.Tab("Batch (Long Text)"):
            gr.Markdown(f"### For texts over {MAX_TEXT_LENGTH} characters")
            gr.Markdown("Automatically splits into chunks and combines audio.")
            with gr.Row():
                with gr.Column(scale=1):
                    long_text = gr.Textbox(
                        label="Long Text",
                        lines=12,
                        max_length=MAX_BATCH_TEXT_LENGTH,
                        placeholder=f"Paste long text here (up to {MAX_BATCH_TEXT_LENGTH} characters)...",
                    )
                    chunk_slider = gr.Slider(300, 800, value=500, step=50, label="Chunk Size (characters)")
                    batch_audio = gr.Audio(label="Reference Voice", type="filepath", sources=["upload", "microphone"])
                    with gr.Row():
                        batch_lang = gr.Dropdown(choices=SUPPORTED_LANGUAGES, value="en", label="Language")
                        batch_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.05, label="Speed")
                    batch_btn = gr.Button("Generate Long Speech", variant="primary", size="lg")

                with gr.Column(scale=1):
                    batch_output = gr.Audio(label="Generated Speech (Combined)", type="filepath")
                    gr.Markdown("### Batch Mode Info")
                    gr.Markdown("- Splits at sentence boundaries (., !, ?)")
                    gr.Markdown("- Synthesizes each chunk separately")
                    gr.Markdown("- Combines all chunks into one file")
                    gr.Markdown("- Best for audiobooks, presentations, long narrations")

            batch_btn.click(
                batch_synthesize,
                [long_text, batch_audio, batch_lang, batch_speed, chunk_slider],
                [batch_output],
            )

        # TAB 3: Help
        with gr.Tab("Help & Troubleshooting"):
            gr.Markdown("# Help Guide")
            gr.Markdown("")
            gr.Markdown("## Quick Start")
            gr.Markdown("")
            gr.Markdown("1. Record your voice (click microphone icon, say 6-10 seconds)")
            gr.Markdown("2. Type text you want to synthesize")
            gr.Markdown("3. Click Generate - works in 10-30 seconds")
            gr.Markdown("")
            gr.Markdown("## Troubleshooting")
            gr.Markdown("")
            gr.Markdown("### Voice extraction fails")
            gr.Markdown("- Use microphone recording - this works 100% of the time")
            gr.Markdown("- Uploaded files must be short (6-10 seconds), WAV format, clean speech")
            gr.Markdown("")
            gr.Markdown("### First use is slow")
            gr.Markdown("- Normal! Downloads 4GB model (2-5 minutes)")
            gr.Markdown("- Subsequent uses are fast (10-30 seconds)")
            gr.Markdown("- Enable GPU in Space settings for faster performance")
            gr.Markdown("")
            gr.Markdown("### Improve quality")
            gr.Markdown("- Use 8-10 second recording")
            gr.Markdown("- No background noise")
            gr.Markdown("- One speaker only")
            gr.Markdown("- Speak clearly at normal pace")
            gr.Markdown("")
            gr.Markdown("## Converting your existing audio")
            gr.Markdown("")
            # The fenced example must live in ONE Markdown component;
            # split across components the fence never closes and the
            # "# ..." lines render as headings.
            gr.Markdown("\n".join([
                "```bash",
                "# Extract best 10 seconds from long audio",
                "ffmpeg -i long_audio.mp3 -t 10 -ac 1 -ar 22050 short.wav",
                "# Remove silence",
                "ffmpeg -i input.wav -af silenceremove=1:0:-50dB output.wav",
                "```",
            ]))
            gr.Markdown("")
            gr.Markdown("## Need more help?")
            gr.Markdown("Check container logs in Space Settings.")

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", 7860)),
        show_error=True
    )