import gradio as gr
import torch
import numpy as np
import librosa
from transformers import (
    pipeline, AutoTokenizer, AutoModelForCausalLM, 
    WhisperProcessor, WhisperForConditionalGeneration
)
import soundfile as sf
import json
import time
from datetime import datetime
import os
import warnings

# Import Dia model correctly[2]
try:
    from dia.model import Dia
    DIA_AVAILABLE = True
    print("✅ Dia model imported successfully")
except ImportError as e:
    print(f"⚠️ Dia import failed: {e}")
    DIA_AVAILABLE = False

warnings.filterwarnings("ignore")

class MayaAI:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🚀 Initializing Maya AI on {self.device}")
        
        # Load Whisper ASR with FORCED English
        self.asr_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
        self.asr_model = WhisperForConditionalGeneration.from_pretrained(
            "openai/whisper-large-v3",
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
        ).to(self.device)
        
        # FORCE English transcription
        self.asr_model.config.forced_decoder_ids = self.asr_processor.get_decoder_prompt_ids(
            language="english", 
            task="transcribe"
        )
        print("✅ Whisper ASR loaded with FORCED English")
        
        # Load FREE LLM with FIXED attention mask
        self.llm_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
        # FIX: Set pad_token to eos_token to avoid attention mask warnings
        if self.llm_tokenizer.pad_token is None:
            self.llm_tokenizer.pad_token = self.llm_tokenizer.eos_token
        
        self.llm_model = AutoModelForCausalLM.from_pretrained(
            "microsoft/DialoGPT-large",
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
            device_map="auto",
            pad_token_id=self.llm_tokenizer.eos_token_id
        )
        print("✅ DialoGPT-Large loaded with FIXED attention masks")
        
        # Load Emotion Recognition
        self.emotion_model = pipeline(
            "audio-classification",
            model="superb/wav2vec2-base-superb-er",
            device=self.device
        )
        print("✅ Emotion recognition loaded")
        
        # Load REAL Dia TTS Model[2]
        if DIA_AVAILABLE:
            try:
                # Load Dia model with correct parameters[2]
                self.dia_model = Dia.from_pretrained(
                    "nari-labs/Dia-1.6B", 
                    compute_dtype="float16" if self.device == "cuda" else "float32",
                    device=self.device
                )
                print("✅ Dia TTS loaded (Ultra-realistic dialogue generation)")
                self.use_dia = True
            except Exception as e:
                print(f"⚠️ Dia loading failed: {e}")
                self.use_dia = False
                self._load_fallback_tts()
        else:
            print("⚠️ Dia not available, using fallback TTS")
            self.use_dia = False
            self._load_fallback_tts()
        
        # Conversation storage
        self.conversations = {}
        self.call_active = False
        self.speaker_turn = 1  # Track speaker turns for Dia[2]
        
    def _load_fallback_tts(self):
        """Load fallback TTS if Dia is not available"""
        try:
            from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
            from datasets import load_dataset
            
            self.tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
                "microsoft/speecht5_tts",
                torch_dtype=torch.float32
            ).to(self.device)
            self.vocoder = SpeechT5HifiGan.from_pretrained(
                "microsoft/speecht5_hifigan",
                torch_dtype=torch.float32
            ).to(self.device)
            
            # Load female speaker embeddings
            embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
            self.speaker_embeddings = torch.tensor(
                embeddings_dataset[7306]["xvector"], 
                dtype=torch.float32
            ).unsqueeze(0).to(self.device)
            print("✅ SpeechT5 TTS loaded as fallback")
        except Exception as e:
            print(f"❌ Fallback TTS loading failed: {e}")
    
    def transcribe_with_whisper(self, audio_path):
        """Transcribe using Whisper with FORCED English"""
        try:
            if audio_path is None:
                return "No audio provided"
            
            # Load and preprocess audio
            audio, sr = librosa.load(audio_path, sr=16000, mono=True)
            
            # Process with Whisper - FORCE English
            inputs = self.asr_processor(
                audio, 
                sampling_rate=16000, 
                return_tensors="pt",
                language="english"
            ).to(self.device)
            
            with torch.no_grad():
                predicted_ids = self.asr_model.generate(
                    inputs.input_features,
                    max_new_tokens=150,
                    do_sample=False,
                    forced_decoder_ids=self.asr_model.config.forced_decoder_ids
                )
            
            transcription = self.asr_processor.batch_decode(
                predicted_ids, 
                skip_special_tokens=True
            )[0]
            
            return transcription.strip()
            
        except Exception as e:
            return f"Transcription error: {str(e)}"
    
    def recognize_emotion_from_audio(self, audio_path):
        """Recognize emotion using superb model"""
        try:
            if audio_path is None:
                return "neutral"
            
            result = self.emotion_model(audio_path)
            emotion_label = result[0]["label"].lower()
            
            # Map to human emotions
            emotion_map = {
                "ang": "angry", "hap": "happy", "exc": "excited",
                "sad": "sad", "fru": "frustrated", "fea": "fearful",
                "sur": "surprised", "neu": "neutral", "dis": "disgusted"
            }
            
            return emotion_map.get(emotion_label, emotion_label)
        except:
            return "neutral"
    
    def generate_with_free_llm(self, text, emotion, history):
        """Generate response using FREE LLM with FIXED attention masks"""
        try:
            # Emotional context prompting
            emotion_prompts = {
                "angry": "I understand you're frustrated. Let me help calm this situation.",
                "sad": "I can hear the sadness in your voice. I'm here to support you.",
                "happy": "Your joy is infectious! I love your positive energy.",
                "excited": "Your enthusiasm is amazing! Tell me more!",
                "fearful": "I sense your concern. Let's work through this together.",
                "surprised": "That sounds unexpected! What happened?",
                "neutral": "I'm listening carefully. Please continue."
            }
            
            emotion_context = emotion_prompts.get(emotion, "I'm here to help.")
            
            # Build conversation context
            context_text = ""
            if history:
                for entry in history[-2:]:
                    context_text += f"User: {entry.get('user_input', '')}\nMaya: {entry.get('ai_response', '')}\n"
            
            prompt = f"{context_text}User: {text}\nMaya:"
            
            # Tokenize input with PROPER attention mask
            inputs = self.llm_tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=1024,
                padding=True,
                add_special_tokens=True
            ).to(self.device)
            
            # Generate response with PROPER attention mask
            with torch.no_grad():
                outputs = self.llm_model.generate(
                    input_ids=inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    max_new_tokens=80,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=self.llm_tokenizer.pad_token_id,
                    eos_token_id=self.llm_tokenizer.eos_token_id
                )
            
            # Decode response
            response = self.llm_tokenizer.decode(
                outputs[0][inputs.input_ids.shape[1]:], 
                skip_special_tokens=True
            ).strip()
            
            # Clean up response
            if not response or len(response) < 5:
                return emotion_context
            
            return response
            
        except Exception as e:
            return f"{emotion_prompts.get(emotion, 'I understand.')} Could you tell me more about that?"
    
    def synthesize_with_dia(self, text, emotion):
        """Generate ultra-realistic dialogue using Dia[2]"""
        try:
            if not text or len(text.strip()) == 0:
                return None
            
            if self.use_dia:
                # Format text for Dia with proper speaker tags[2]
                speaker_tag = f"[S{self.speaker_turn}]"
                
                # Add emotional non-verbals based on emotion[2]
                if emotion == "happy":
                    emotional_text = f"{speaker_tag} {text} (laughs)"
                elif emotion == "sad":
                    emotional_text = f"{speaker_tag} {text} (sighs)"
                elif emotion == "excited":
                    emotional_text = f"{speaker_tag} {text}!"
                elif emotion == "angry":
                    emotional_text = f"{speaker_tag} {text} (frustrated tone)"
                elif emotion == "surprised":
                    emotional_text = f"{speaker_tag} {text} (gasps)"
                else:
                    emotional_text = f"{speaker_tag} {text}"
                
                # Generate with Dia[2]
                output = self.dia_model.generate(
                    emotional_text, 
                    use_torch_compile=True if self.device == "cuda" else False,
                    verbose=False
                )
                
                # Toggle speaker for next turn[2]
                self.speaker_turn = 2 if self.speaker_turn == 1 else 1
                
                return output
            else:
                # Fallback to SpeechT5
                return self._synthesize_with_fallback(text, emotion)
            
        except Exception as e:
            print(f"Dia TTS error: {e}")
            return self._synthesize_with_fallback(text, emotion)
    
    def _synthesize_with_fallback(self, text, emotion):
        """Fallback TTS synthesis"""
        try:
            clean_text = text.replace("[", "").replace("]", "").strip()
            if len(clean_text) > 200:
                clean_text = clean_text[:200] + "..."
            
            # Add emotional inflection through punctuation
            if emotion == "happy":
                clean_text = clean_text.replace(".", "!")
            elif emotion == "excited":
                clean_text = clean_text + "!"
            elif emotion == "sad":
                clean_text = clean_text.replace("!", ".")
            
            inputs = self.tts_processor(text=clean_text, return_tensors="pt")
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            
            with torch.no_grad():
                speech = self.tts_model.generate_speech(
                    inputs["input_ids"], 
                    self.speaker_embeddings, 
                    vocoder=self.vocoder
                )
            
            if isinstance(speech, torch.Tensor):
                speech = speech.cpu().numpy().astype(np.float32)
            
            return speech
        except Exception as e:
            print(f"Fallback TTS error: {e}")
            return None
    
    def start_call(self):
        """Start a new call session"""
        self.call_active = True
        self.speaker_turn = 1  # Reset speaker turn[2]
        greeting = "Hello! I'm Maya, your AI conversation partner. I'm here to chat with you naturally and understand your emotions. How are you feeling today?"
        
        greeting_audio = self.synthesize_with_dia(greeting, "happy")
        
        # Dia outputs at 24kHz, fallback at 22050Hz[2]
        sample_rate = 24000 if self.use_dia else 22050
        return greeting, (sample_rate, greeting_audio) if greeting_audio is not None else None, "📞 Call started! Maya is greeting you with ultra-realistic speech..."
    
    def end_call(self, user_id="default"):
        """End call and clear conversation"""
        self.call_active = False
        if user_id in self.conversations:
            self.conversations[user_id] = []
        
        farewell = "Thank you for chatting with me! It was wonderful talking with you. Have a great day!"
        farewell_audio = self.synthesize_with_dia(farewell, "happy")
        
        sample_rate = 24000 if self.use_dia else 22050
        return farewell, (sample_rate, farewell_audio) if farewell_audio is not None else None, "📞 Call ended. Conversation cleared!"
    
    def process_conversation(self, audio_input, user_id="default"):
        """Main conversation processing pipeline"""
        if not self.call_active:
            return "Please start a call first by clicking the 'Start Call' button", None, "No active call"
        
        if audio_input is None:
            return "Please record some audio", None, "No audio input"
        
        start_time = time.time()
        
        if user_id not in self.conversations:
            self.conversations[user_id] = []
        
        try:
            # Step 1: ASR with FORCED English
            transcription = self.transcribe_with_whisper(audio_input)
            
            # Step 2: Emotion recognition
            emotion = self.recognize_emotion_from_audio(audio_input)
            
            # Step 3: FREE LLM generation with FIXED attention masks
            response_text = self.generate_with_free_llm(
                transcription, emotion, self.conversations[user_id]
            )
            
            # Step 4: Ultra-realistic TTS with Dia[2]
            response_audio = self.synthesize_with_dia(response_text, emotion)
            
            # Step 5: Update conversation history
            processing_time = time.time() - start_time
            conversation_entry = {
                "timestamp": datetime.now().strftime("%H:%M:%S"),
                "user_input": transcription,
                "user_emotion": emotion,
                "ai_response": response_text,
                "processing_time": processing_time
            }
            
            self.conversations[user_id].append(conversation_entry)
            
            # Keep last 1000 exchanges as requested[5]
            if len(self.conversations[user_id]) > 1000:
                self.conversations[user_id] = self.conversations[user_id][-1000:]
            
            history = self.format_conversation_history(user_id)
            
            sample_rate = 24000 if self.use_dia else 22050
            return transcription, (sample_rate, response_audio) if response_audio is not None else None, history
            
        except Exception as e:
            return f"Processing error: {str(e)}", None, "Error in processing"
    
    def format_conversation_history(self, user_id):
        """Format conversation history for display"""
        if user_id not in self.conversations or not self.conversations[user_id]:
            return "No conversation history yet."
        
        history = []
        for i, entry in enumerate(self.conversations[user_id][-10:], 1):
            history.append(f"**Exchange {i}** ({entry['timestamp']})")
            history.append(f"🎤 **You** ({entry['user_emotion']}): {entry['user_input']}")
            history.append(f"🤖 **Maya**: {entry['ai_response']}")
            history.append(f"⏱️ *{entry['processing_time']:.2f}s*")
            history.append("---")
        
        return "\n".join(history)

# Initialize Maya AI
print("🚀 Starting Maya AI with REAL Dia TTS...")
maya = MayaAI()
print("✅ Maya AI ready with ultra-realistic dialogue generation!")

# Gradio Interface Functions
def start_call_handler():
    return maya.start_call()

def end_call_handler():
    return maya.end_call()

def process_audio_handler(audio):
    return maya.process_conversation(audio)

# Create Gradio Interface[7]
with gr.Blocks(
    title="Maya AI - Dia-Powered Sesame Killer",
    theme=gr.themes.Soft()
) as demo:
    
    gr.Markdown("""
    # 🎤 Maya AI - Dia-Powered Sesame Killer
    *Ultra-realistic dialogue generation with Dia TTS - Natural breathing, laughter, and human-like responses*
    
    **Features:** ✅ Real Dia TTS ✅ English-only ASR ✅ Emotion Recognition ✅ FREE LLM ✅ Ultra-realistic Speech
    """)
    
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📞 Call Controls")
            
            start_call_btn = gr.Button("📞 Start Call", variant="primary", size="lg")
            end_call_btn = gr.Button("📞 End Call", variant="stop", size="lg")
            
            gr.Markdown("### 🎙️ Voice Input")
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="Record your message in English"
            )
            
            process_btn = gr.Button("🎯 Process Audio", variant="primary")
            
        with gr.Column(scale=2):
            gr.Markdown("### 💬 Ultra-Realistic Conversation")
            
            transcription_output = gr.Textbox(
                label="📝 What you said (English)",
                lines=2,
                interactive=False
            )
            
            audio_output = gr.Audio(
                label="🔊 Maya's Ultra-Realistic Response (Dia TTS)",
                interactive=False,
                autoplay=True
            )
            
            conversation_display = gr.Textbox(
                label="💭 Live Conversation (FREE & Ultra-Realistic)",
                lines=15,
                interactive=False,
                show_copy_button=True
            )
    
    # Event Handlers
    start_call_btn.click(
        fn=start_call_handler,
        outputs=[transcription_output, audio_output, conversation_display]
    )
    
    end_call_btn.click(
        fn=end_call_handler,
        outputs=[transcription_output, audio_output, conversation_display]
    )
    
    process_btn.click(
        fn=process_audio_handler,
        inputs=[audio_input],
        outputs=[transcription_output, audio_output, conversation_display]
    )
    
    audio_input.stop_recording(
        fn=process_audio_handler,
        inputs=[audio_input],
        outputs=[transcription_output, audio_output, conversation_display]
    )

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )