import cv2
import numpy as np
import pickle
import tensorflow as tf
import mediapipe as mp
from typing import List

# ----------------------------------------------------------------------
# Model and Encoder Loading
# ----------------------------------------------------------------------
# Letters Model 1 (static hand signs)
lettersModel = tf.keras.models.load_model('ai_model/models/detectLettersModel.keras')
with open('ai_model/models/labelEncoder.pickle', 'rb') as f:
    labelEncoder = pickle.load(f)

# Letters Model 2 (temporal signs like J and Z that involve motion)
lettersModel2 = tf.keras.models.load_model('ai_model/jz_model/JZModel.keras')
with open('ai_model/jz_model/labelEncoder.pickle', 'rb') as f:
    labelEncoder2 = pickle.load(f)

# Numbers Model (static number signs)
numbersModel = tf.keras.models.load_model('ai_model/models/detectNumbersModel.keras')
with open('ai_model/models/numLabelEncoder.pickle', 'rb') as f:
    numLabelEncoder = pickle.load(f)

sequenceNum = 20
hands = mp.solutions.hands.Hands(static_image_mode=True)

# ----------------------------------------------------------------------


def detectFromImage(sequenceList: List[bytes]) -> dict:
    """
    Processes a sequence of image frames (provided as raw bytes) to detect
    sign language letters and numbers using multiple models.
    """
    # 1. Input validation
    if len(sequenceList) != sequenceNum:
        return {'letter': '', 'confidenceLetter': 0.0, 'number': '', 'confidenceNumber': 0.0}

    processedSequence = []
    # Last successfully decoded frame, reused by the static fallback models
    fallback_frame_cv2 = None

    # 2. Process sequence frames (temporal model input)
    for image_bytes in sequenceList:
        # Decode raw bytes into a BGR OpenCV image array
        np_arr = np.frombuffer(image_bytes, np.uint8)
        image = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
        if image is None:
            # Skip corrupted frames
            continue

        # Keep the last valid frame in OpenCV format for the static models
        fallback_frame_cv2 = image

        # Convert BGR to RGB for MediaPipe
        imgRGB = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        results = hands.process(imgRGB)
        if not results.multi_hand_landmarks:
            # Skip frames without a detected hand
            continue

        handLandmarks = results.multi_hand_landmarks[0]

        # --- Landmark extraction and normalization ---
        xList = [lm.x for lm in handLandmarks.landmark]
        yList = [lm.y for lm in handLandmarks.landmark]

        # Normalize landmarks relative to the minimum x and y, giving
        # 21 landmarks x 3 values = 63 features per frame
        dataAux2 = []
        for lm in handLandmarks.landmark:
            dataAux2.append(lm.x - min(xList))
            dataAux2.append(lm.y - min(yList))
            dataAux2.append(0)  # Z dimension is zero-padded

        processedSequence.append(dataAux2)

    # 3. Temporal model prediction (lettersModel2)
    # Sequences left short by the frame-dropping above could be padded or
    # interpolated here; currently they are simply rejected.
    if len(processedSequence) != sequenceNum:
        print(f'Incomplete sequence: {len(processedSequence)} of {sequenceNum} frames')
        return {'letter': '', 'confidenceLetter': 0.0, 'number': '', 'confidenceNumber': 0.0}

    inputData2 = np.array(processedSequence, dtype=np.float32).reshape(1, sequenceNum, 63)
    prediction2 = lettersModel2.predict(inputData2, verbose=0)
    index2 = np.argmax(prediction2, axis=1)[0]
    confidence2 = float(np.max(prediction2))
    label2 = labelEncoder2.inverse_transform([index2])[0]
    print(f'Letters Model 2: {label2} at {confidence2}')

    # 4. Static model prediction (fallback/verification)
    if fallback_frame_cv2 is not None:
        # Re-run MediaPipe on the last valid frame
        imgRGB = cv2.cvtColor(fallback_frame_cv2, cv2.COLOR_BGR2RGB)
        results = hands.process(imgRGB)
        if results.multi_hand_landmarks:
            handLandmarks = results.multi_hand_landmarks[0]
            xList = [lm.x for lm in handLandmarks.landmark]
            yList = [lm.y for lm in handLandmarks.landmark]

            # 21 landmarks x 2 values = 42 features for the static models
            dataAux = []
            for lm in handLandmarks.landmark:
                dataAux.append(lm.x - min(xList))
                dataAux.append(lm.y - min(yList))

            # Check in Letters Model 1
            inputData1 = np.array(dataAux, dtype=np.float32).reshape(1, 42, 1)
            prediction1 = lettersModel.predict(inputData1, verbose=0)
            index1 = np.argmax(prediction1, axis=1)[0]
            confidence1 = float(np.max(prediction1))
            label1 = labelEncoder.inverse_transform([index1])[0]
            print(f'Letters Model 1: {label1} at {confidence1}')

            # Check in Numbers Model
            prediction3 = numbersModel.predict(inputData1, verbose=0)
            index3 = np.argmax(prediction3, axis=1)[0]
            confidence3 = float(np.max(prediction3))
            label3 = numLabelEncoder.inverse_transform([index3])[0]
            print(f'Numbers Model: {label3} at {confidence3}')

            # 5. Result aggregation
            if label1 == label2:
                # Both letter models agree
                return {'letter': label2, 'confidenceLetter': confidence2,
                        'number': label3, 'confidenceNumber': confidence3}
            # Default to static model 1 on disagreement (a better fusion
            # strategy could be implemented here)
            return {'letter': label1, 'confidenceLetter': confidence1,
                    'number': label3, 'confidenceNumber': confidence3}

        # Hand detected throughout the sequence but not in the fallback
        # frame on re-processing (unlikely)
        return {'letter': label2, 'confidenceLetter': confidence2,
                'number': '', 'confidenceNumber': 0.0}

    # Defensive fallback: a complete sequence implies at least one decoded
    # frame, so this branch should be unreachable in practice
    return {'letter': label2, 'confidenceLetter': confidence2,
            'number': '', 'confidenceNumber': 0.0}
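
# ----------------------------------------------------------------------
# Usage sketch (not part of the original module; the camera index and
# JPEG encoding below are illustrative assumptions). It shows one way to
# build the bytes input detectFromImage expects: grab sequenceNum frames
# from a capture device and JPEG-encode each frame to raw bytes, which
# the function then decodes with cv2.imdecode.
# ----------------------------------------------------------------------
if __name__ == '__main__':
    cap = cv2.VideoCapture(0)  # hypothetical: default webcam
    sequence = []
    while len(sequence) < sequenceNum:
        ok, frame = cap.read()
        if not ok:
            break
        # Encode the BGR frame to JPEG bytes
        ok, buf = cv2.imencode('.jpg', frame)
        if ok:
            sequence.append(buf.tobytes())
    cap.release()

    if len(sequence) == sequenceNum:
        result = detectFromImage(sequence)
        print(result)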