import gradio as gr
import torch
import torchaudio
import tempfile
import os
import warnings
from contextlib import contextmanager
import gc
import librosa
import soundfile as sf

# Silence every Python warning for the whole process (no category or module
# filter is given, so nothing is exempt).
warnings.filterwarnings("ignore")
# Set at import time, i.e. before the lazy `from TTS.api import TTS` inside
# load_xtts_optimized() can run.
# NOTE(review): presumably pre-accepts the Coqui model licence prompt so the
# model download does not block on interactive input — confirm against the
# TTS package documentation.
os.environ["COQUI_TOS_AGREED"] = "1"
# Startup banner written to stdout.
print("🚀 Starting FINAL CORRECTED Voice Cloning Studio...")

@contextmanager
def patch_torch_load():
    """Temporarily force every ``torch.load`` call to use ``weights_only=False``.

    While the ``with`` body runs, ``torch.load`` is replaced by a thin wrapper
    that overrides the ``weights_only`` keyword (even when the caller passed
    ``True``) and delegates to the real loader. The real ``torch.load`` is put
    back when the block exits, whether normally or through an exception.

    NOTE: ``weights_only=False`` allows full pickle deserialisation, which can
    execute arbitrary code; only load checkpoints from trusted sources inside
    this context.
    """
    saved_loader = torch.load

    def _load_full_pickle(f, *args, **kwargs):
        # Override rather than default: a caller-supplied value must lose.
        forced_kwargs = {**kwargs, 'weights_only': False}
        return saved_loader(f, *args, **forced_kwargs)

    torch.load = _load_full_pickle
    try:
        yield
    finally:
        # Always undo the global monkeypatch.
        torch.load = saved_loader

# Hardware setup: use CUDA when torch reports it as available, otherwise CPU.
# DEVICE is read by both model loaders and by the inference code.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🔥 Device: {DEVICE}")

# Global model variables: module-level singletons that start empty and are
# filled in on first use, so each model is loaded at most once per process.
# Assigned by load_xtts_optimized().
TTS_MODEL = None
# Assigned by load_whisper_optimized().
WHISPER_MODEL = None
# Human-readable XTTS load state; embedded in the status and error messages
# returned by voice_to_voice_clone_final().
MODEL_STATUS = "Not Loaded"

def load_xtts_optimized():
    """Ensure the XTTS-v2 model is loaded into the ``TTS_MODEL`` global.

    A model that is already loaded is reused. Otherwise the TTS package is
    imported and the model instantiated while ``torch.load`` is patched by
    ``patch_torch_load``; ``MODEL_STATUS`` is updated to reflect the outcome.

    Returns:
        True when a model is available, False when loading failed (the
        exception is printed and recorded in ``MODEL_STATUS``, not raised).
    """
    global TTS_MODEL, MODEL_STATUS

    if TTS_MODEL is None:
        try:
            with patch_torch_load():
                # Imported lazily so the heavy dependency is only paid for
                # when a model is actually needed.
                from TTS.api import TTS as XttsApi
                print("📦 Loading XTTS...")
                xtts_options = {
                    "model_name": "tts_models/multilingual/multi-dataset/xtts_v2",
                    "progress_bar": False,
                    "gpu": DEVICE == "cuda",
                }
                TTS_MODEL = XttsApi(**xtts_options)
                MODEL_STATUS = "XTTS-v2 Ready"
                print("✅ XTTS loaded successfully!")
        except Exception as load_error:
            print(f"❌ XTTS loading failed: {load_error}")
            MODEL_STATUS = f"XTTS Failed: {load_error!s}"
            return False

    return True

def load_whisper_optimized():
    """Ensure the Whisper "base" model is loaded into ``WHISPER_MODEL``.

    A model that is already loaded is reused; otherwise ``whisper`` is
    imported lazily and the model is created on ``DEVICE``.

    Returns:
        True when a model is available, False when importing or loading
        failed (the exception is printed, not raised).
    """
    global WHISPER_MODEL

    if WHISPER_MODEL is None:
        try:
            import whisper as whisper_lib
            WHISPER_MODEL = whisper_lib.load_model("base", device=DEVICE)
            print("✅ Whisper loaded!")
        except Exception as load_error:
            print(f"❌ Whisper failed: {load_error}")
            return False

    return True

def optimize_audio_input(audio_path, max_duration=25):
    """Write a resampled, length-capped WAV copy of an audio file.

    The file is loaded through ``librosa.load`` at 22050 Hz, cut to at most
    ``max_duration`` seconds, and saved next to the source as
    ``<stem>_opt.wav``. The output name is derived from the final extension
    only, so it never equals the source path and the source is never
    overwritten, whatever its extension (``.flac``, ``.WAV``, none, ...).

    Args:
        audio_path: Path of the audio file to prepare.
        max_duration: Maximum number of seconds to keep; the rest is dropped.

    Returns:
        Path of the ``_opt.wav`` copy, or ``audio_path`` unchanged when the
        file does not exist or any step fails (this step is best-effort and
        never raises).
    """
    try:
        if not os.path.exists(audio_path):
            print(f"⚠️ Audio file not found: {audio_path}")
            return audio_path

        audio, sr = librosa.load(audio_path, sr=22050)
        max_samples = int(max_duration * sr)
        if len(audio) > max_samples:
            audio = audio[:max_samples]
            print(f"🔄 Audio trimmed to {max_duration}s")

        # Replace only the trailing extension. A plain str.replace() would
        # also rewrite ".wav"/".mp3" inside directory names and would leave
        # other extensions untouched, making the write below clobber the
        # source file.
        stem, _extension = os.path.splitext(audio_path)
        optimized_path = f"{stem}_opt.wav"
        sf.write(optimized_path, audio, sr)
        print(f"✅ Audio optimized: {optimized_path}")
        return optimized_path

    except Exception as e:
        # Deliberate best-effort: fall back to the untouched source file.
        print(f"⚠️ Audio optimization failed: {e}")
        return audio_path

def safe_file_path(file_input, input_name="audio"):
    """Resolve an upload value to the path of an existing file.

    Supported shapes, checked in this order:

    * ``None`` -> ``None``.
    * ``str`` or ``os.PathLike`` (e.g. ``pathlib.Path``) -> that path.
    * Object with a ``.name`` attribute (e.g. an open file) -> ``.name``.
    * Mapping-like object with ``.get`` -> its ``'name'`` entry, then its
      ``'path'`` entry; each is tried in turn so a stale ``'name'`` does not
      hide a valid ``'path'``.

    Args:
        file_input: The value to resolve.
        input_name: Label used in diagnostic messages only.

    Returns:
        The path as ``str`` when it points at an existing file system entry,
        otherwise ``None``. Never raises; unexpected errors are printed.
    """
    try:
        if file_input is None:
            return None

        # A path-like value *is* the path. It must be handled before the
        # generic `.name` branch, because pathlib.Path.name is only the
        # final path component, not the full location.
        if isinstance(file_input, (str, os.PathLike)):
            file_path = os.fsdecode(file_input)
            if os.path.exists(file_path):
                return file_path
            print(f"⚠️ File path doesn't exist: {file_input}")
            return None

        # Collect every place a path might be stored, in priority order.
        candidates = []
        if hasattr(file_input, 'name'):
            candidates.append(file_input.name)
        if hasattr(file_input, 'get'):
            candidates.extend(file_input.get(key) for key in ('name', 'path'))

        for candidate in candidates:
            if isinstance(candidate, (str, os.PathLike)) and os.path.exists(candidate):
                return os.fsdecode(candidate)

        print(f"⚠️ Could not extract file path from {input_name}: {type(file_input)}")
        return None

    except Exception as e:
        print(f"❌ Error processing {input_name}: {e}")
        return None

def voice_to_voice_clone_final(reference_audio, input_audio, language="en"):
    """Re-speak the words of ``input_audio`` in the voice of ``reference_audio``.

    Pipeline: resolve and validate both uploads, load XTTS (required) and
    Whisper (optional), trim/resample both clips, transcribe the input clip,
    then synthesise the transcript using the reference clip as speaker sample.
    When Whisper is unavailable, fails, or yields 5 characters or fewer, a
    fixed English demo sentence is synthesised instead.

    Args:
        reference_audio: Upload holding the voice to imitate; any shape that
            ``safe_file_path`` can resolve.
        input_audio: Upload whose spoken content is transcribed and re-spoken.
        language: Language code given to Whisper and XTTS. ``'auto'`` is
            passed to Whisper as ``None`` (auto-detect) but forwarded to XTTS
            unchanged. A falsy value falls back to ``"en"``.

    Returns:
        ``(output_path, message)``. On success ``output_path`` is a temporary
        WAV file that the caller now owns. On any failure it is ``None`` and
        ``message`` describes the problem. This function never raises.
    """
    output_path = None   # temp WAV handed to XTTS; deleted unless returned
    intermediates = []   # "_opt" copies produced by optimize_audio_input
    succeeded = False
    try:
        # A null language (cleared dropdown / API call) would otherwise only
        # blow up in language.upper() after synthesis had already succeeded.
        language = language or "en"

        print(f"🎭 Voice cloning request: {language}")
        print(f"📁 Input types - Ref: {type(reference_audio)}, Input: {type(input_audio)}")

        # Extract file paths safely
        reference_path = safe_file_path(reference_audio, "reference")
        input_path = safe_file_path(input_audio, "input")

        if not reference_path:
            return None, "❌ Could not process reference audio file."

        if not input_path:
            return None, "❌ Could not process input audio file."

        print(f"📁 Processing files - Ref: {reference_path}, Input: {input_path}")

        # Validate files: anything under 1000 bytes is treated as unusable
        if not os.path.exists(reference_path) or os.path.getsize(reference_path) < 1000:
            return None, "❌ Reference audio file is invalid."

        if not os.path.exists(input_path) or os.path.getsize(input_path) < 1000:
            return None, "❌ Input audio file is invalid."

        # Load models: XTTS is mandatory, Whisper is optional (see fallback text)
        if not load_xtts_optimized():
            return None, f"❌ XTTS model failed: {MODEL_STATUS}"

        load_whisper_optimized()

        # Optimize audio files
        print("🔄 Optimizing audio files...")
        ref_optimized = optimize_audio_input(reference_path, max_duration=20)
        input_optimized = optimize_audio_input(input_path, max_duration=25)
        # Only files that differ from the originals are ours to delete later.
        intermediates.extend(
            optimized
            for optimized, original in (
                (ref_optimized, reference_path),
                (input_optimized, input_path),
            )
            if optimized != original
        )

        # Transcribe input audio
        extracted_text = "This is a voice cloning demonstration."
        if WHISPER_MODEL:
            try:
                print("🎤 Transcribing audio...")
                with torch.no_grad():
                    result = WHISPER_MODEL.transcribe(
                        input_optimized,
                        fp16=(DEVICE == "cuda"),
                        language=language if language != 'auto' else None
                    )
                text = result.get("text", "").strip()
                if text and len(text) > 5:
                    # Cap the transcript at 400 characters before synthesis
                    extracted_text = text[:400]
                print(f"✅ Transcribed: '{extracted_text[:50]}...'")
            except Exception as e:
                # Best-effort: keep the fallback sentence
                print(f"⚠️ Transcription warning: {e}")

        # Generate cloned voice
        print("🚀 Generating cloned voice...")

        # delete=False: the file must outlive this `with` so XTTS can write to
        # it and Gradio can serve it; the `finally` below removes it on failure.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name

        try:
            with patch_torch_load(), torch.no_grad():
                TTS_MODEL.tts_to_file(
                    text=extracted_text,
                    speaker_wav=ref_optimized,
                    language=language,
                    file_path=output_path,
                    temperature=0.7,
                    length_penalty=1.0,
                    repetition_penalty=5.0
                )
        except Exception as tts_error:
            print(f"❌ TTS generation error: {tts_error}")
            return None, f"❌ Voice generation failed: {str(tts_error)}"
        finally:
            # Memory cleanup, also after a failed synthesis
            if DEVICE == "cuda":
                torch.cuda.empty_cache()
            gc.collect()

        # Validate and return output
        if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
            file_size_kb = os.path.getsize(output_path) / 1024

            success_message = f"""✅ VOICE CLONING SUCCESS! 🎉

📝 Text: "{extracted_text[:100]}{'...' if len(extracted_text) > 100 else ''}"
🎭 Device: {DEVICE} | Model: {MODEL_STATUS}
📊 Output: {file_size_kb:.1f} KB | Language: {language.upper()}
🔧 Optimizations Applied Successfully"""

            print("✅ Voice cloning completed successfully!")

            # CRITICAL FIX: Return file path directly for Gradio compatibility
            succeeded = True
            return output_path, success_message

        return None, "❌ Voice cloning failed - output file is empty."

    except Exception as e:
        error_msg = f"❌ Voice cloning error: {str(e)}"
        print(error_msg)
        import traceback
        print("Full traceback:", traceback.format_exc())
        return None, error_msg

    finally:
        # The "_opt" copies are only needed during this call, and the output
        # file is only worth keeping when it is being returned to the caller.
        leftovers = list(intermediates)
        if output_path and not succeeded:
            leftovers.append(output_path)
        for leftover in leftovers:
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass  # already gone (e.g. both inputs mapped to the same copy)
            except OSError as cleanup_error:
                print(f"⚠️ Could not remove temp file {leftover}: {cleanup_error}")

# Web UI / API definition.
# The three inputs are passed positionally to voice_to_voice_clone_final as
# (reference_audio, input_audio, language); its (output_path, message) return
# value is mapped onto the two outputs in the same order.
# CRITICAL: Use gr.Interface (not Blocks) for better API compatibility
interface = gr.Interface(
    fn=voice_to_voice_clone_final,
    inputs=[
        # Voice sample to imitate; delivered to the function as a file path.
        gr.Audio(
            label="🎤 Reference Audio (Voice to Clone)",
            type="filepath"  # CRITICAL: Must be filepath for API compatibility
        ),
        # Clip whose spoken content is transcribed and re-spoken.
        gr.Audio(
            label="🎵 Input Audio (Content to Transform)", 
            type="filepath"  # CRITICAL: Must be filepath for API compatibility
        ),
        # Language code forwarded to both Whisper and XTTS; defaults to "en".
        gr.Dropdown(
            choices=["en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"],
            value="en",
            label="🌍 Language"
        )
    ],
    outputs=[
        # Receives the generated WAV path (or None on failure).
        gr.Audio(
            label="🎉 Cloned Voice Result",
            type="filepath"  # CRITICAL: Must be filepath for proper return
        ),
        # Receives the multi-line success or error message.
        gr.Textbox(
            label="📋 Processing Status",
            lines=8
        )
    ],
    title="🎭 AI Voice Cloning Studio - FINAL",
    description="Transform voices using XTTS-v2 and Whisper AI. Upload clear audio files (10-30 seconds each).",
    theme=gr.themes.Soft(),
    # NOTE(review): `allow_flagging` may be deprecated in newer Gradio
    # releases — confirm against the installed version.
    allow_flagging="never",
    api_name="voice_to_voice_clone"  # CRITICAL: API endpoint name
)

# Script entry point: the server is only started when this file is run
# directly, not when it is imported.
if __name__ == "__main__":
    print("🌐 Launching FINAL CORRECTED Voice Cloning Studio...")
    
    # CORRECTED: Proper queue configuration
    # NOTE(review): max_size=2 with default_concurrency_limit=1 is presumably
    # meant to run one request at a time with a short waiting queue, since
    # both models are module-level globals — confirm against the Gradio docs.
    interface.queue(
        max_size=2,  # Reduced for stability
        api_open=True,
        default_concurrency_limit=1
    ).launch(
        # "0.0.0.0" makes the server listen on all network interfaces, not
        # just localhost; port 7860 is fixed.
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=True,
        debug=False  # Disable debug for production
    )