import face_recognition
import cv2
import numpy as np
import google.generativeai as genai
import gradio as gr
from PIL import Image


def register_new_face(name, image, known_faces):
    """
    Registers a new face encoding into the state dictionary.
    """
    if not name or image is None:
        raise gr.Error("Please provide both a name and an image.")

    # Gradio supplies RGB (or RGBA) numpy arrays; drop the alpha channel if
    # present so face_recognition receives a 3-channel RGB image.
    if image.ndim == 3 and image.shape[2] == 4:
        image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)

    encodings = face_recognition.face_encodings(image)
    if not encodings:
        raise gr.Error("No face detected in the image. Please try another photo.")

    # Update state without mutating the original dict
    new_known_faces = known_faces.copy()
    new_known_faces[name] = encodings[0]

    # Return updated state, the updated name list, and clear both inputs
    return new_known_faces, list(new_known_faces.keys()), "", None


def process_video_frame(frame, known_faces):
    """
    Processes a video frame: detects faces, recognizes them, draws boxes,
    and updates the currently detected user.
    """
    if frame is None:
        return None, "Unknown", "**👤 Detected:** Unknown", None

    # Keep a clean copy for multimodal context (before drawing boxes)
    original_frame = frame.copy()

    # Resize to 1/4 size for faster processing
    small_frame = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25)

    # Gradio webcam frames already arrive as RGB; face_recognition (dlib)
    # just needs a contiguous uint8 array.
    rgb_small_frame = np.ascontiguousarray(small_frame)

    # Find faces and compute their encodings
    face_locations = face_recognition.face_locations(rgb_small_frame)
    face_encodings = face_recognition.face_encodings(rgb_small_frame, face_locations)

    detected_name = "Unknown"
    face_names = []
    for face_encoding in face_encodings:
        name = "Unknown"
        if known_faces:
            known_names = list(known_faces.keys())
            known_encodings_list = list(known_faces.values())
            # compare_faces uses its default tolerance (0.6); pass a lower
            # value for stricter matching if false positives appear.
            matches = face_recognition.compare_faces(known_encodings_list, face_encoding)
            face_distances = face_recognition.face_distance(known_encodings_list, face_encoding)
            if len(face_distances) > 0:
                best_match_index = np.argmin(face_distances)
                if matches[best_match_index]:
                    name = known_names[best_match_index]

        face_names.append(name)
        if name != "Unknown":
            detected_name = name

    # Draw results on the frame
    annotated_frame = draw_overlays(frame, face_locations, face_names)
    status_md = f"**👤 Detected:** {detected_name}"

    return annotated_frame, detected_name, status_md, original_frame


def draw_overlays(frame, face_locations, face_names):
    """Helper to draw boxes and name labels on the frame."""
    # Scale face locations back up since detection ran on a 1/4-size frame
    for (top, right, bottom, left), name in zip(face_locations, face_names):
        top *= 4
        right *= 4
        bottom *= 4
        left *= 4

        # Box around the face: green for known, red for unknown
        # (frames are RGB here, so red is (255, 0, 0), not OpenCV-BGR (0, 0, 255))
        color = (0, 255, 0) if name != "Unknown" else (255, 0, 0)
        cv2.rectangle(frame, (left, top), (right, bottom), color, 2)

        # Label with the name below the face
        cv2.rectangle(frame, (left, bottom - 35), (right, bottom), color, cv2.FILLED)
        font = cv2.FONT_HERSHEY_DUPLEX
        cv2.putText(frame, name, (left + 6, bottom - 6), font, 0.7, (255, 255, 255), 1)

    return frame


def generate_gemini_response(audio_path, history, user_name, api_key,
                             system_prompt, use_vision, image_frame):
    """
    Sends audio (and optionally an image) to Gemini and returns the response.
    """
    if not api_key:
        gr.Warning("Please enter your Gemini API Key in the Settings tab.")
        return history, history, None
    if not audio_path:
        return history, history, None

    try:
        genai.configure(api_key=api_key)

        # Use Flash for speed
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")

        # Prepare the prompt, injecting the recognized identity if available
        identity_context = ""
        if user_name and user_name != "Unknown":
            identity_context = f"The user speaking is named {user_name}. "
        full_prompt = f"{system_prompt} {identity_context} Listen to the audio and respond."

        content_parts = [full_prompt]

        # Add audio. Gradio's Audio component with type="filepath" yields a
        # path to a wav/mp3 file. For this demo we upload it via the File API,
        # which is the safer route for audio across SDK versions.
        # Note: in a production high-traffic app, manage the uploaded file's
        # lifecycle carefully (see the cleanup sketch below).
        myfile = genai.upload_file(audio_path)
        content_parts.append(myfile)

        # Add the current video frame if multimodal vision is enabled
        if use_vision and image_frame is not None:
            pil_img = Image.fromarray(image_frame)
            content_parts.append(pil_img)
            content_parts[0] += " The user has also provided a video frame of what they are looking at."

        # Generate
        response = model.generate_content(content_parts)
        response_text = response.text

        # Update history (Gradio Chatbot "messages" format). The user turn is
        # a placeholder since speech is not transcribed locally.
        history.append({"role": "user", "content": "🎤 [Audio Sent]"})
        history.append({"role": "assistant", "content": response_text})

        # Third return value is None to clear the audio input
        return history, history, None
    except Exception as e:
        error_msg = f"Error: {str(e)}"
        history.append({"role": "assistant", "content": error_msg})
        return history, history, None