# ============================================================
# Auto-accept Coqui license - MUST BE FIRST
# ============================================================
import builtins

_original_input = builtins.input

def _patched_input(prompt=""):
    """Answer license/confirmation prompts with "y" instead of blocking.

    Replacement for ``builtins.input``. When the prompt text contains one of
    the trigger keywords (case-insensitive) the prompt is echoed with an
    ``[AUTO-ACCEPT]`` prefix and ``"y"`` is returned without reading stdin.
    Any other prompt is forwarded unchanged to the original ``input``.

    Args:
        prompt: Prompt object, optional exactly like the built-in ``input``.
            It is converted with ``str()`` for keyword matching so non-string
            prompts do not crash the check.

    Returns:
        ``"y"`` for a matching prompt, otherwise whatever the original
        ``input`` returns.
    """
    prompt_text = str(prompt)
    lowered = prompt_text.lower()
    if any(k in lowered for k in ['license', 'agree', 'confirm', 'cpml', 'coqui', 'y/n', '>']):
        print(f"[AUTO-ACCEPT] {prompt_text.strip()}")
        return "y"
    return _original_input(prompt)

# Install the patch globally so code imported later sees the patched input().
builtins.input = _patched_input

import gradio as gr
import torch
import tempfile
import os
import time
import subprocess
import numpy as np
import re
import hashlib
from pathlib import Path

# ============================================================
# CONFIGURATION
# ============================================================
# Maximum number of characters synthesize() accepts in a single call;
# batch_synthesize() only splits text that is longer than this.
MAX_TEXT_LENGTH = 2000
# Language codes offered in the UI dropdowns and passed to the TTS model.
SUPPORTED_LANGUAGES = ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]

# Cache for processed audio to avoid reprocessing
# (maps a short content hash of the input file to the path of its optimized WAV).
_audio_cache = {}
# Lazily created TTS model instance; stays None until get_tts() succeeds.
_tts = None
# True while get_tts() is in the middle of loading the model.
_model_loading = False
# Message of the last failed load, or None; get_tts() re-raises it on later calls.
_model_error = None
# Set to True by get_tts() after a successful load.
_model_loaded = False

# ============================================================
# AUDIO PROCESSING FUNCTIONS (Optimized)
# ============================================================

def get_file_hash(file_path):
    """Return a short content fingerprint of a file, used as a cache key.

    The file is hashed in fixed-size chunks so large audio files are never
    loaded into memory in one piece.

    Args:
        file_path: Path of the file to fingerprint.

    Returns:
        The first 16 hexadecimal characters of the file's MD5 digest.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at the empty read that marks EOF.
        for chunk in iter(lambda: f.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest()[:16]

def check_audio_duration(file_path):
    """Get audio duration using ffmpeg.

    Runs ``ffmpeg -i <file>`` and parses the ``Duration: HH:MM:SS.xx`` line
    that ffmpeg writes to stderr.

    Args:
        file_path: Path of the audio file to probe.

    Returns:
        The duration in seconds, or 0 when ffmpeg cannot be run, times out,
        or prints no parsable duration.
    """
    try:
        # capture_output=True already pipes stderr. Passing stderr= as well
        # makes subprocess.run raise ValueError, so it must not be combined.
        result = subprocess.run(
            ['ffmpeg', '-i', file_path],
            capture_output=True,
            text=True,
            errors='replace',  # odd metadata bytes must not break decoding
            timeout=30,
        )
    except (OSError, ValueError, TypeError, subprocess.SubprocessError):
        # Missing ffmpeg binary, invalid path argument, or a hung probe.
        return 0

    match = re.search(r'Duration: (\d{2}):(\d{2}):(\d{2}\.\d{2})', result.stderr)
    if not match:
        return 0
    h, m, s = match.groups()
    return int(h) * 3600 + int(m) * 60 + float(s)

def optimize_audio_for_xtts(input_path):
    """Convert an audio file to a mono 22050 Hz WAV for use as an XTTS reference.

    A filtered conversion (loudness normalisation, leading-silence removal,
    80 Hz high-pass, 8 kHz low-pass) is tried first. If it fails, times out,
    or produces a file of 10000 bytes or less, a plain mono/22050 Hz
    conversion is used instead. Results are remembered in ``_audio_cache``
    keyed by the content hash of the input.

    Args:
        input_path: Path of the source audio file.

    Returns:
        Path of the converted WAV file (a new temporary file, or a cached one
        that still exists on disk).

    Raises:
        OSError: If the input cannot be read or ffmpeg cannot be started for
            the fallback conversion.
        subprocess.CalledProcessError: If the fallback conversion fails.
    """
    # Check cache
    file_hash = get_file_hash(input_path)
    cached_path = _audio_cache.get(file_hash)
    if cached_path is not None and os.path.exists(cached_path):
        return cached_path

    # mkstemp + close reserves a unique path without leaving an open handle
    # behind while ffmpeg overwrites the file.
    fd, output_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    # Optimized ffmpeg command for XTTS
    cmd = [
        'ffmpeg', '-i', input_path,
        '-ac', '1',                     # mono
        '-ar', '22050',                 # 22050 Hz (XTTS optimal)
        '-af', 'loudnorm=I=-16:LRA=11:TP=-1.5, silenceremove=1:0:-50dB, highpass=f=80, lowpass=f=8000',
        '-acodec', 'pcm_s16le',
        '-y', output_path
    ]

    try:
        subprocess.run(cmd, capture_output=True, check=True, timeout=30)
        # Verify output
        if os.path.exists(output_path) and os.path.getsize(output_path) > 10000:
            _audio_cache[file_hash] = output_path
            return output_path
    except (OSError, subprocess.SubprocessError):
        # Best effort only: any failure here falls through to the simpler conversion.
        pass

    # Fallback: simpler conversion
    cmd2 = ['ffmpeg', '-i', input_path, '-ac', '1', '-ar', '22050', '-y', output_path]
    try:
        subprocess.run(cmd2, capture_output=True, check=True)
    except BaseException:
        # Do not leave an orphaned temp file behind when nothing usable was produced.
        try:
            os.unlink(output_path)
        except OSError:
            pass
        raise
    _audio_cache[file_hash] = output_path
    return output_path

def extract_best_segment(input_path, target_duration=10):
    """Cut a long recording down to a short clip and optimize it for XTTS.

    Inputs no longer than ``target_duration + 2`` seconds are optimized
    as-is. Longer inputs are trimmed to ``target_duration`` seconds after
    leading silence is removed. If that attempt fails or yields less than
    5 seconds, the first ``target_duration`` seconds are taken instead. The
    trimmed clip is then passed through ``optimize_audio_for_xtts`` and the
    intermediate file is deleted.

    Args:
        input_path: Path of the source audio file.
        target_duration: Desired clip length in seconds.

    Returns:
        Path of the optimized WAV file.

    Raises:
        OSError: If ffmpeg cannot be started for the fallback trim.
        subprocess.CalledProcessError: If the fallback trim or the final
            optimization fails.
    """
    duration = check_audio_duration(input_path)

    if duration <= target_duration + 2:
        return optimize_audio_for_xtts(input_path)

    # Intermediate trimmed clip. It is only an input to the optimizer and is
    # removed in the finally block below.
    fd, segment_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    try:
        # Extract the first clear speech segment (XTTS works best with short clips)
        cmd = [
            'ffmpeg', '-i', input_path,
            '-af', f'silenceremove=1:0:-50dB, atrim=0:{target_duration}',
            '-ac', '1', '-ar', '22050',
            '-y', segment_path
        ]

        try:
            subprocess.run(cmd, capture_output=True, check=True, timeout=30)
            if check_audio_duration(segment_path) >= 5:
                return optimize_audio_for_xtts(segment_path)
        except (OSError, subprocess.SubprocessError):
            # Best effort only: fall through to the plain trim below.
            pass

        # Fallback: take first target_duration seconds
        cmd2 = ['ffmpeg', '-i', input_path, '-t', str(target_duration), '-ac', '1', '-ar', '22050', '-y', segment_path]
        subprocess.run(cmd2, capture_output=True, check=True)
        return optimize_audio_for_xtts(segment_path)
    finally:
        try:
            os.unlink(segment_path)
        except OSError:
            pass

# ============================================================
# MODEL LOADING (Optimized for speed)
# ============================================================

def is_model_downloaded():
    """Report whether the XTTS v2 model directory exists under the user's home."""
    cache_dir = "~/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2"
    resolved = os.path.expanduser(cache_dir)
    return os.path.exists(resolved)

def get_tts():
    """Return the shared XTTS model, loading it on first use.

    The loaded model is kept in the module-level ``_tts``. A failed load
    stores its message in ``_model_error`` and later calls re-raise that
    message without retrying.

    Returns:
        The loaded ``TTS`` instance.

    Raises:
        Exception: If a previous load failed, if a load is currently in
            progress, or if loading fails now (the original error is
            attached as ``__cause__``).
    """
    global _tts, _model_loading, _model_error, _model_loaded

    if _tts is not None:
        return _tts

    if _model_error is not None:
        raise Exception(_model_error)

    if _model_loading:
        raise Exception("Model is loading, please wait...")

    _model_loading = True
    try:
        from TTS.api import TTS

        print("Loading XTTS model...")
        use_gpu = torch.cuda.is_available()
        model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=use_gpu)

        if use_gpu:
            model.to("cuda")
            print("Model loaded on GPU - FAST")
        else:
            print("Model loaded on CPU - slower")

        # Publish the model only once it is fully set up, so a failure above
        # can never leave a half-initialised instance in _tts.
        _tts = model
        _model_loaded = True
        return _tts

    except Exception as e:
        _model_error = str(e)
        raise Exception(str(e)) from e
    finally:
        # Always clear the flag, even on KeyboardInterrupt/SystemExit, so the
        # "loading" state cannot get stuck.
        _model_loading = False

# ============================================================
# SYNTHESIS FUNCTIONS (Optimized)
# ============================================================

def synthesize(text, reference_audio, language="en", speed=1.0, progress=gr.Progress()):
    """Synthesize ``text`` in the voice of ``reference_audio``.

    Validates the inputs, prepares the reference clip (recordings longer
    than 15 seconds are cut to a 10-second segment), loads the model if
    needed and writes the result to a temporary WAV file. The prepared
    reference clip is removed afterwards whether or not synthesis succeeded,
    and a partially written output file is removed on failure.

    Args:
        text: Text to speak; at most ``MAX_TEXT_LENGTH`` characters.
        reference_audio: File path of the reference voice recording.
        language: Language code passed to the model.
        speed: Speed value passed to the model.
        progress: Gradio progress tracker.

    Returns:
        Path of the generated WAV file.

    Raises:
        gr.Error: On invalid input or when the model call fails.
        Exception: Errors from audio preparation or model loading propagate
            unchanged.
    """
    progress(0, desc="Validating...")

    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize")
    if len(text) > MAX_TEXT_LENGTH:
        raise gr.Error(f"Text exceeds {MAX_TEXT_LENGTH} characters")
    if reference_audio is None:
        raise gr.Error("Please upload or record a reference audio")

    # Get audio duration
    progress(0.05, desc="Analyzing audio...")
    duration = check_audio_duration(reference_audio)

    # A duration of 0 means "unknown" and is let through.
    if duration > 0 and duration < 3:
        raise gr.Error(f"Audio too short ({duration:.1f}s). Need 6-10 seconds")

    # Show warning for long audio
    if duration > 30:
        gr.Warning(f"Long audio ({duration:.1f}s). Extracting best 10-second segment for better quality")

    progress(0.1, desc="Optimizing audio...")

    # Extract best segment for long audio
    if duration > 15:
        processed_audio = extract_best_segment(reference_audio, target_duration=10)
    else:
        processed_audio = optimize_audio_for_xtts(reference_audio)

    try:
        progress(0.2, desc="Loading model...")
        tts = get_tts()

        progress(0.4, desc="Synthesizing...")
        # mkstemp + close reserves the output path without keeping a handle open.
        fd, output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)

        try:
            tts.tts_to_file(
                text=text.strip(),
                speaker_wav=processed_audio,
                language=language,
                file_path=output_path,
                speed=speed,
            )
        except Exception as e:
            # Nothing usable was produced; do not leave the output file behind.
            try:
                os.unlink(output_path)
            except OSError:
                pass

            error = str(e)
            if "list index out of range" in error or "index out of range" in error:
                raise gr.Error(
                    "Voice extraction failed.\n\n"
                    "SOLUTION: Use microphone recording:\n"
                    "1. Click the microphone icon\n"
                    "2. Record 6-10 seconds clearly\n"
                    "3. Say: 'Hello, this is my test voice'\n"
                    "4. This works every time!\n\n"
                    "For uploaded files: Keep them short (6-10 seconds) and use WAV format."
                ) from e
            raise gr.Error(f"Error: {error[:200]}") from e
    finally:
        # Cleanup temp files on every path, not only after a successful run.
        if processed_audio != reference_audio and os.path.exists(processed_audio):
            try:
                os.unlink(processed_audio)
            except OSError:
                pass

    progress(1.0, desc="Complete!")
    return output_path

# ============================================================
# BATCH SYNTHESIS
# ============================================================

def _split_text_into_chunks(text, chunk_size):
    """Group sentences of ``text`` into chunks of at most ``chunk_size`` characters.

    Sentences are split after ``.``, ``!`` or ``?`` followed by whitespace.
    A single sentence longer than ``chunk_size`` is kept whole as its own
    chunk. Whitespace-only chunks are dropped.
    """
    chunks = []
    current = ""

    for sent in re.split(r'(?<=[.!?])\s+', text):
        if len(current) + len(sent) + 1 <= chunk_size:
            current += (" " + sent) if current else sent
        else:
            if current:
                chunks.append(current)
            current = sent
    if current:
        chunks.append(current)

    return [chunk for chunk in chunks if chunk.strip()]


def batch_synthesize(text, audio, language, speed, chunk_size=500, progress=gr.Progress()):
    """Split long text into chunks, synthesize each and join the audio.

    Text of at most ``MAX_TEXT_LENGTH`` characters is handed straight to
    ``synthesize``. Longer text is split at sentence boundaries, each chunk
    is synthesized with the same prepared reference clip, and the resulting
    WAV files are concatenated with pydub. Per-chunk files and the prepared
    reference clip are removed on every exit path.

    Args:
        text: Text to speak.
        audio: File path of the reference voice recording.
        language: Language code passed to the model.
        speed: Speed value passed to the model.
        chunk_size: Maximum number of characters per chunk.
        progress: Gradio progress tracker.

    Returns:
        Path of the combined WAV file.

    Raises:
        gr.Error: On missing text or reference audio, or when combining the
            chunk files fails.
        Exception: Errors from audio preparation, model loading or chunk
            synthesis propagate unchanged.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text")

    if len(text) <= MAX_TEXT_LENGTH:
        return synthesize(text, audio, language, speed, progress)

    # The short-text path validates this inside synthesize(); the long-text
    # path needs its own check before the audio file is touched.
    if audio is None:
        raise gr.Error("Please upload or record a reference audio")

    progress(0.05, desc="Splitting text...")

    # Smart text splitting
    chunks = _split_text_into_chunks(text, chunk_size)

    progress(0.1, desc=f"Processing {len(chunks)} chunks...")

    # Process audio once
    duration = check_audio_duration(audio)
    if duration > 15:
        processed_audio = extract_best_segment(audio, target_duration=10)
    else:
        processed_audio = optimize_audio_for_xtts(audio)

    audio_files = []
    try:
        tts = get_tts()

        for i, chunk in enumerate(chunks):
            prog = 0.1 + (0.7 * (i + 1) / len(chunks))
            progress(prog, desc=f"Chunk {i+1}/{len(chunks)}...")

            fd, chunk_path = tempfile.mkstemp(suffix=f"_{i}.wav")
            os.close(fd)
            # Register before synthesizing so the file is cleaned up even if
            # this chunk fails.
            audio_files.append(chunk_path)
            tts.tts_to_file(
                text=chunk.strip(),
                speaker_wav=processed_audio,
                language=language,
                file_path=chunk_path,
                speed=speed,
            )

        progress(0.85, desc="Combining audio...")

        try:
            from pydub import AudioSegment
            combined = AudioSegment.empty()
            for f in audio_files:
                combined += AudioSegment.from_wav(f)

            fd, output = tempfile.mkstemp(suffix="_combined.wav")
            os.close(fd)
            combined.export(output, format="wav")
        except Exception as e:
            raise gr.Error(f"Failed to combine: {str(e)[:100]}") from e
    finally:
        # Cleanup
        for f in audio_files:
            try:
                os.unlink(f)
            except OSError:
                pass
        if processed_audio != audio and os.path.exists(processed_audio):
            try:
                os.unlink(processed_audio)
            except OSError:
                pass

    progress(1.0, desc="Complete!")
    return output

# ============================================================
# UTILITIES
# ============================================================

def clear_cache():
    """Drop the loaded model and reset the load state so it can be loaded again.

    Besides releasing the model reference, this clears the remembered load
    error and the loaded flag. Without that, a failed load would keep
    get_tts() raising the old error and the status would stay "Error".
    The in-progress flag is left alone because a load may be running.

    Returns:
        The string "Cache cleared".
    """
    global _tts, _model_error, _model_loaded
    _tts = None
    _model_error = None
    _model_loaded = False
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return "Cache cleared"

def get_status():
    """Describe the current model state as a short label for the UI."""
    if _tts is None:
        # No model yet: report loading, a remembered failure, or idle.
        if _model_loading:
            return "Loading model..."
        if _model_error:
            return "Error"
        return "Ready - model loads on first use"

    if torch.cuda.is_available():
        return "Model ready (GPU)"
    return "Model ready (CPU)"

def estimate_duration(text, speed=1.0):
    """Estimate how long ``text`` takes to speak and format it for display.

    Returns "0s" for empty text, otherwise "<m>m <s>s" when the estimate is
    at least one minute and "<s>s" when it is shorter.
    """
    if not text:
        return "0s"

    # NOTE(review): 160 characters per second looks much faster than real
    # speech; confirm the intended rate (possibly words per minute).
    total_seconds = len(text) / (160 * speed)
    whole_minutes, remainder = divmod(total_seconds, 60)
    mins = int(whole_minutes)
    secs = int(remainder)

    if mins > 0:
        return f"{mins}m {secs}s"
    return f"{secs}s"

# ============================================================
# UI - NO TRIPLE QUOTES
# ============================================================

# Builds the three-tab interface (standard synthesis, batch synthesis, help)
# and wires the widgets to the functions defined above.
with gr.Blocks(title="VoxForge TTS", theme=gr.themes.Soft()) as demo:

    gr.Markdown("# 🎙️ VoxForge TTS - Professional Voice Cloning")
    gr.Markdown("Upload any voice or use microphone to record 6-10 seconds. First use downloads model (2-5 min).")

    with gr.Tabs():

        # TAB 1: Standard Synthesis
        with gr.Tab("Standard Synthesis"):
            with gr.Row():
                with gr.Column(scale=1):
                    # The limit comes from MAX_TEXT_LENGTH so the widget, the
                    # counter and the server-side check cannot drift apart.
                    text_input = gr.Textbox(label="Text to Synthesize", lines=6, max_length=MAX_TEXT_LENGTH, placeholder=f"Enter text up to {MAX_TEXT_LENGTH} characters... Example: Hello, this is my cloned voice.")

                    with gr.Row():
                        char_count = gr.Label(f"0/{MAX_TEXT_LENGTH}")
                        duration_est = gr.Label("Est. 0s")

                    ref_audio = gr.Audio(label="Reference Voice (6-10 seconds - Click microphone to record!)", type="filepath", sources=["upload", "microphone"])

                    with gr.Row():
                        language = gr.Dropdown(choices=SUPPORTED_LANGUAGES, value="en", label="Language")
                        speed = gr.Slider(0.5, 2.0, value=1.0, step=0.05, label="Speed")

                    with gr.Row():
                        gen_btn = gr.Button("Generate Speech", variant="primary", size="lg")
                        clear_btn = gr.Button("Clear", variant="secondary", size="lg")

                    with gr.Accordion("Advanced Options", open=False):
                        status_text = gr.Label(get_status())
                        clear_cache_btn = gr.Button("Clear Model Cache", size="sm")

                with gr.Column(scale=1):
                    audio_output = gr.Audio(label="Generated Speech", type="filepath")

                    gr.Markdown("### Tips for Best Results")
                    gr.Markdown("1. Use microphone (click the mic icon) - records perfectly every time!")
                    gr.Markdown("2. Speak clearly for 6-10 seconds")
                    gr.Markdown("3. No background noise - one speaker only")
                    gr.Markdown("4. Match language to your voice")
                    gr.Markdown("")
                    gr.Markdown("### Performance Notes")
                    gr.Markdown("- First synthesis: 2-5 min (downloads 4GB model)")
                    gr.Markdown("- After that: 10-30 seconds")
                    gr.Markdown("- Enable GPU in Space settings for faster results")
                    gr.Markdown("")
                    gr.Markdown("### Supported Languages")
                    gr.Markdown("English, Spanish, French, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese, Japanese")

            # Live character counter shown under the text box.
            def update_char(t):
                return f"{len(t) if t else 0}/{MAX_TEXT_LENGTH}"

            # Live duration estimate; recomputed when the text or the speed changes.
            def update_dur(t, s):
                return f"Est. {estimate_duration(t, s)}" if t else "Est. 0s"

            text_input.change(update_char, [text_input], [char_count])
            text_input.change(update_dur, [text_input, speed], [duration_est])
            speed.change(update_dur, [text_input, speed], [duration_est])

            # Refresh the status label after every generation attempt so it
            # reflects whether the model is now loaded or failed to load.
            gen_btn.click(synthesize, [text_input, ref_audio, language, speed], [audio_output]).then(get_status, None, [status_text])
            clear_btn.click(lambda: ("", None, "en", 1.0), None, [text_input, ref_audio, language, speed])
            clear_cache_btn.click(clear_cache, None, [status_text]).then(get_status, None, [status_text])

        # TAB 2: Batch Synthesis
        with gr.Tab("Batch (Long Text)"):
            gr.Markdown(f"### For texts over {MAX_TEXT_LENGTH} characters")
            gr.Markdown("Automatically splits into chunks and combines audio.")

            with gr.Row():
                with gr.Column(scale=1):
                    long_text = gr.Textbox(label="Long Text", lines=12, max_length=10000, placeholder="Paste long text here (up to 10000 characters)...")
                    chunk_slider = gr.Slider(300, 800, value=500, step=50, label="Chunk Size (characters)")
                    batch_audio = gr.Audio(label="Reference Voice", type="filepath", sources=["upload", "microphone"])
                    with gr.Row():
                        batch_lang = gr.Dropdown(choices=SUPPORTED_LANGUAGES, value="en", label="Language")
                        batch_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.05, label="Speed")
                    batch_btn = gr.Button("Generate Long Speech", variant="primary", size="lg")
                with gr.Column(scale=1):
                    batch_output = gr.Audio(label="Generated Speech (Combined)", type="filepath")

                    gr.Markdown("### Batch Mode Info")
                    gr.Markdown("- Splits at sentence boundaries (., !, ?)")
                    gr.Markdown("- Synthesizes each chunk separately")
                    gr.Markdown("- Combines all chunks into one file")
                    gr.Markdown("- Best for audiobooks, presentations, long narrations")

            batch_btn.click(batch_synthesize, [long_text, batch_audio, batch_lang, batch_speed, chunk_slider], [batch_output])

        # TAB 3: Help
        with gr.Tab("Help & Troubleshooting"):
            gr.Markdown("# Help Guide")
            gr.Markdown("")
            gr.Markdown("## Quick Start")
            gr.Markdown("")
            gr.Markdown("1. Record your voice (click microphone icon, say 6-10 seconds)")
            gr.Markdown("2. Type text you want to synthesize")
            gr.Markdown("3. Click Generate - works in 10-30 seconds")
            gr.Markdown("")
            gr.Markdown("## Troubleshooting")
            gr.Markdown("")
            gr.Markdown("### Voice extraction fails")
            gr.Markdown("- Use microphone recording - this works 100% of the time")
            gr.Markdown("- Uploaded files must be short (6-10 seconds), WAV format, clean speech")
            gr.Markdown("")
            gr.Markdown("### First use is slow")
            gr.Markdown("- Normal! Downloads 4GB model (2-5 minutes)")
            gr.Markdown("- Subsequent uses are fast (10-30 seconds)")
            gr.Markdown("- Enable GPU in Space settings for faster performance")
            gr.Markdown("")
            gr.Markdown("### Improve quality")
            gr.Markdown("- Use 8-10 second recording")
            gr.Markdown("- No background noise")
            gr.Markdown("- One speaker only")
            gr.Markdown("- Speak clearly at normal pace")
            gr.Markdown("")
            gr.Markdown("## Converting your existing audio")
            gr.Markdown("")
            # A fenced code block must live in ONE Markdown component: split
            # across components, the opening fence is never closed.
            gr.Markdown("\n".join([
                "```bash",
                "# Extract best 10 seconds from long audio",
                "ffmpeg -i long_audio.mp3 -t 10 -ac 1 -ar 22050 short.wav",
                "# Remove silence",
                "ffmpeg -i input.wav -af silenceremove=1:0:-50dB output.wav",
                "```",
            ]))
            gr.Markdown("")
            gr.Markdown("## Need more help?")
            gr.Markdown("Check container logs in Space Settings.")

if __name__ == "__main__":
    # Listen on all interfaces; the port comes from the PORT environment
    # variable and defaults to 7860 when it is not set.
    listen_port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=listen_port, show_error=True)