"""Sign-word inference: decode frames, extract MediaPipe landmarks, classify.

Loads a trained Keras sign classifier and its gloss vocabulary at import
time, then exposes detectFromImageBytes() to predict a word from a sequence
of encoded image frames.
"""
import cv2
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
import mediapipe as mp

# --- configuration --------------------------------------------------------
modelPath = 'ai_model/words/saved_models/best_sign_classifier_model_40_words_seq90.keras'
csvPath = 'ai_model/words/wlasl_40_words_personal_final_processed_data_augmented_seq90.csv'
# NOTE(review): the model/CSV file names say "seq90" but sequenceLength is
# 30 — confirm this matches the model's expected input length.
sequenceLength = 30
expectedCoordsPerFrame = 1662  # 33*4 pose + 2 * 21*3 hands + 468*3 face
confidenceThreshold = 0.7

# --- model and label mapping (loaded once, at import) ----------------------
model = load_model(modelPath)
df = pd.read_csv(csvPath)
uniqueGlosses = df['gloss'].unique()
idToGloss = {i: g for i, g in enumerate(uniqueGlosses)}

# MediaPipe Holistic in static-image mode: every frame is detected
# independently (no cross-frame tracking).
mpHolistic = mp.solutions.holistic.Holistic(
    static_image_mode=True,
    model_complexity=1,
    min_detection_confidence=0.2,
    min_tracking_confidence=0.5
)

# Flattened coordinate counts for each body part.
numPoseCoordsSingle = 33 * 4   # x, y, z, visibility per pose landmark
numHandCoordsSingle = 21 * 3   # x, y, z per hand landmark
numFaceCoordsSingle = 468 * 3  # x, y, z per face landmark


def normalizeLandmarks(landmarksSequence):
    """Translate- and scale-normalize each frame's landmarks, part by part.

    Every part (pose, left hand, right hand, face) is independently centred
    on the mean of its x/y/z coordinates and divided by its largest point
    norm, giving position- and scale-invariant features. Pose visibility
    values are never scaled. All-zero frames or parts (nothing detected)
    stay all-zero.

    Args:
        landmarksSequence: (frames, 1662) array, or one flat frame vector.

    Returns:
        float32 array of shape (frames, expectedCoordsPerFrame).
    """
    if landmarksSequence.ndim == 1:
        landmarksSequence = np.expand_dims(landmarksSequence, axis=0)

    # (coords per landmark, flat width) for each part, in storage order.
    partSpecs = (
        (4, numPoseCoordsSingle),
        (3, numHandCoordsSingle),
        (3, numHandCoordsSingle),
        (3, numFaceCoordsSingle),
    )

    outFrames = []
    for frame in landmarksSequence:
        if not frame.any():
            # Nothing was detected in this frame at all.
            outFrames.append(np.zeros(expectedCoordsPerFrame, dtype=np.float32))
            continue

        pieces = []
        offset = 0
        for coordsPerLm, width in partSpecs:
            chunk = frame[offset:offset + width]
            offset += width
            if not chunk.any():
                # This part was not detected; keep its zero placeholder.
                pieces.append(np.zeros(width, dtype=np.float32))
                continue

            pts = chunk.reshape(-1, coordsPerLm)
            shifted = pts.copy()
            # Centre on the mean of x/y/z only (visibility excluded).
            shifted[:, :3] -= np.mean(pts[:, :3], axis=0)
            # Scale by the farthest point's distance from the centre.
            radius = np.max(np.linalg.norm(shifted[:, :3], axis=1))
            if radius > 1e-6:
                shifted[:, :3] /= radius
            pieces.append(shifted.flatten())

        combined = np.concatenate(pieces).astype(np.float32)
        # Defensive length correction; the widths should already sum to 1662.
        deficit = expectedCoordsPerFrame - len(combined)
        if deficit > 0:
            combined = np.pad(combined, (0, deficit), 'constant')
        elif deficit < 0:
            combined = combined[:expectedCoordsPerFrame]
        outFrames.append(combined)

    return np.array(outFrames, dtype=np.float32)


def padOrTruncateSequence(sequence, targetLength, featureDimension):
    """Return `sequence` zero-padded (or cut) to exactly `targetLength` rows."""
    missing = targetLength - sequence.shape[0]
    if missing > 0:
        filler = np.zeros((missing, featureDimension), dtype=np.float32)
        return np.vstack((sequence, filler))
    return sequence[:targetLength, :]


def detectFromImageBytes(sequenceBytesList):
    """Classify a sign word from a sequence of encoded image frames.

    Args:
        sequenceBytesList: iterable of encoded image byte strings, one per
            frame; undecodable frames become all-zero landmark vectors.

    Returns:
        dict {"word": gloss or "", "confidence": float}. The word is
        blanked when confidence is below confidenceThreshold.
    """
    # (results attribute, warning label, coords per landmark, flat width)
    partLayout = (
        ('pose_landmarks', 'pose', 4, numPoseCoordsSingle),
        ('left_hand_landmarks', 'left hand', 3, numHandCoordsSingle),
        ('right_hand_landmarks', 'right hand', 3, numHandCoordsSingle),
        ('face_landmarks', 'face', 3, numFaceCoordsSingle),
    )

    frames = []
    for idx, imageBytes in enumerate(sequenceBytesList):
        raw = np.frombuffer(imageBytes, np.uint8)
        img = cv2.imdecode(raw, cv2.IMREAD_COLOR)
        if img is None:
            print(f"Warning: Could not decode image bytes at index {idx}")
            frames.append(np.zeros(expectedCoordsPerFrame, dtype=np.float32))
            continue

        # MediaPipe expects RGB; OpenCV decodes BGR.
        results = mpHolistic.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

        frameLms = np.zeros(expectedCoordsPerFrame, dtype=np.float32)
        offset = 0
        for attr, label, nCoords, width in partLayout:
            detected = getattr(results, attr)
            if detected:
                if nCoords == 4:
                    flat = [c for lm in detected.landmark
                            for c in (lm.x, lm.y, lm.z, lm.visibility)]
                else:
                    flat = [c for lm in detected.landmark
                            for c in (lm.x, lm.y, lm.z)]
                frameLms[offset:offset + len(flat)] = flat
            else:
                print(f"Warning: No {label} landmarks detected in frame {idx}")
            # Advance past the part's slot whether or not it was detected.
            offset += width
        frames.append(frameLms)

    if not frames:
        return {"word": "", "confidence": 0.0}

    batch = normalizeLandmarks(np.array(frames, dtype=np.float32))
    batch = padOrTruncateSequence(batch, sequenceLength, expectedCoordsPerFrame)
    batch = np.expand_dims(batch, axis=0)  # add the batch dimension

    preds = model.predict(batch, verbose=0)
    predictedId = int(np.argmax(preds))
    confidence = float(np.max(preds))
    predictedWord = idToGloss.get(predictedId, "Unknown")

    result = {
        "word": predictedWord if confidence >= confidenceThreshold else "",
        "confidence": confidence,
    }
    print(f"Prediction result: {result}")
    return result