import gradio as gr
import torch
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer
from gtts import gTTS
import numpy as np
import tempfile
import os
import google.generativeai as genai

# Read the Google GenAI API key from an environment variable.
# Never hard-code credentials in source files.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

# Load GenAI model
print("Loading Google Generative AI model...")
gen_model = genai.GenerativeModel("gemini-1.5-pro")

# Load ASR model (wav2vec2 expects 16 kHz audio; given a sampling_rate,
# the pipeline resamples via torchaudio)
print("Loading ASR model...")
speech_to_text_pipeline = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
)

# Load GPT-2 as a local fallback generator
print("Loading GPT-2 model...")
response_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
response_model = GPT2LMHeadModel.from_pretrained("gpt2")
response_model.eval()


# Main logic
def process_input(emotion, audio_input, text_input):
    print(f"\n---\nEmotion: {emotion}")

    # Handle audio input
    audio_text = ""
    if audio_input is not None:
        print("Audio input detected. Transcribing...")
        try:
            sample_rate, audio_data = audio_input
            if len(audio_data) == 0 or np.all(audio_data == 0):
                print("Silent or empty audio.")
            else:
                # Gradio delivers int16 PCM; downmix stereo to mono,
                # then convert to float32 and peak-normalize
                if audio_data.ndim > 1:
                    audio_data = audio_data.mean(axis=1)
                audio_data = audio_data.astype(np.float32)
                audio_data = audio_data / np.max(np.abs(audio_data))
                audio_text = speech_to_text_pipeline({
                    "sampling_rate": sample_rate,
                    "array": audio_data,
                })["text"]
                print(f"Audio transcription: {audio_text}")
        except Exception as e:
            print(f"Speech-to-text error: {e}")
            audio_text = ""

    # Combine typed text and transcribed audio
    combined_input_text = ((text_input or "") + " " + (audio_text or "")).strip()
    print(f"User input: {combined_input_text}")

    if not combined_input_text:
        return "Please provide text or audio input.", None

    # Add emotion context
    prompt = f"The user feels {emotion}. Respond supportively: {combined_input_text}"
    print(f"Final prompt to model: {prompt}")

    # Use Google GenAI; fall back to local GPT-2 if the API call fails
    try:
        gen_response = gen_model.generate_content(prompt)
        text_output = gen_response.text.strip()
        print(f"Google GenAI response: {text_output}")
    except Exception as e:
        print(f"GenAI Error: {e}")
        print("Falling back to GPT-2...")
        try:
            # Keep the prompt within GPT-2's context window
            input_ids = response_tokenizer.encode(prompt, return_tensors="pt")[:, -512:]
            with torch.no_grad():
                output = response_model.generate(
                    input_ids=input_ids,
                    max_length=input_ids.shape[1] + 50,
                    num_beams=3,
                    do_sample=True,  # required for temperature to take effect
                    temperature=0.8,
                    no_repeat_ngram_size=2,
                    early_stopping=True,
                    pad_token_id=response_tokenizer.eos_token_id,
                )
            # Decode only the newly generated tokens, not the echoed prompt
            text_output = response_tokenizer.decode(
                output[0][input_ids.shape[1]:], skip_special_tokens=True
            )
            print(f"GPT-2 fallback response: {text_output}")
        except Exception as gpt_error:
            print(f"GPT-2 Error: {gpt_error}")
            text_output = "Sorry, I couldn't generate a response."
    # Convert the response to speech with gTTS
    try:
        print("Generating speech...")
        tts = gTTS(text_output)
        # Create the temp file, then close it before gTTS writes to the path
        # (keeps this portable, e.g. on Windows)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
            audio_output_path = temp_file.name
        tts.save(audio_output_path)
        print(f"TTS audio saved at: {audio_output_path}")
    except Exception as e:
        print(f"TTS Error: {e}")
        audio_output_path = None

    return text_output, audio_output_path


# Gradio Interface
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Radio(["positive", "neutral", "negative"], label="Your Emotion"),
        gr.Audio(type="numpy", label="Speak..."),
        gr.Textbox(label="Text Input", placeholder="Or type here..."),
    ],
    outputs=[
        gr.Textbox(label="AI Response"),
        gr.Audio(label="Spoken Response"),
    ],
    title="Emotion-Aware Multimodal AI Assistant",
    description="Choose your emotional state, then talk or type to the AI assistant. It responds based on your emotional context.",
)

if __name__ == "__main__":
    iface.launch()
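
# Example local run, a minimal sketch: the filename "app.py" and the exact
# package list below are assumptions, not part of the original script.
# torchaudio is included because the ASR pipeline uses it to resample
# microphone audio that isn't already 16 kHz.
#
#   pip install gradio torch torchaudio transformers gTTS google-generativeai numpy
#   export GOOGLE_API_KEY="your-key-here"
#   python app.py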