import cv2
import numpy as np
import pickle
import tensorflow as tf
import mediapipe as mp
from typing import List

# ----------------------------------------------------------------------
# Model and Encoder Loading
# ----------------------------------------------------------------------
# Letters Model 1 (static hand signs)
lettersModel = tf.keras.models.load_model('ai_model/models/detectLettersModel.keras')
with open('ai_model/models/labelEncoder.pickle', 'rb') as f:
    labelEncoder = pickle.load(f)

# Letters Model 2 (temporal signs like J and Z that involve motion)
lettersModel2 = tf.keras.models.load_model('ai_model/jz_model/JZModel.keras')
with open('ai_model/jz_model/labelEncoder.pickle', 'rb') as f:
    labelEncoder2 = pickle.load(f)

# Numbers Model (static number signs)
numbersModel = tf.keras.models.load_model('ai_model/models/detectNumbersModel.keras')
with open('ai_model/models/numLabelEncoder.pickle', 'rb') as f:
    numLabelEncoder = pickle.load(f)

sequenceNum = 20
hands = mp.solutions.hands.Hands(static_image_mode=True)

# ----------------------------------------------------------------------


def detectFromImage(sequenceList: List[bytes]) -> dict:
    """
    Processes a sequence of image frames (provided as raw bytes) to detect
    sign language letters and numbers using multiple models.
    """
    # 1. Input validation
    if len(sequenceList) != sequenceNum:
        return {'letter': '', 'confidenceLetter': 0.0, 'number': '', 'confidenceNumber': 0.0}

    processedSequence = []
    # Last successfully decoded frame, reused by the static fallback models
    fallback_frame_cv2 = None

    # 2. Process sequence frames (temporal model input)
    for image_bytes in sequenceList:
        # Decode raw bytes into a BGR OpenCV image array
        np_arr = np.frombuffer(image_bytes, np.uint8)
        image = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
        if image is None:
            # Skip corrupted frames
            continue

        # Keep the last valid frame in OpenCV format for the static models
        fallback_frame_cv2 = image

        # Convert BGR to RGB for MediaPipe
        imgRGB = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        results = hands.process(imgRGB)
        if not results.multi_hand_landmarks:
            # Skip frames without a detected hand
            continue

        handLandmarks = results.multi_hand_landmarks[0]

        # --- Landmark extraction and normalization ---
        xList = [lm.x for lm in handLandmarks.landmark]
        yList = [lm.y for lm in handLandmarks.landmark]

        # Normalize landmarks relative to the minimum x and y, giving
        # 21 landmarks x 3 values = 63 features per frame
        dataAux2 = []
        for lm in handLandmarks.landmark:
            dataAux2.append(lm.x - min(xList))
            dataAux2.append(lm.y - min(yList))
            dataAux2.append(0)  # Z dimension is zero-padded

        processedSequence.append(dataAux2)

    # 3. Temporal model prediction (lettersModel2)
    # Sequences left short by the frame-dropping above could be padded or
    # interpolated here; currently they are simply rejected.
    if len(processedSequence) != sequenceNum:
        print(f'Incomplete sequence: {len(processedSequence)} of {sequenceNum} frames')
        return {'letter': '', 'confidenceLetter': 0.0, 'number': '', 'confidenceNumber': 0.0}

    inputData2 = np.array(processedSequence, dtype=np.float32).reshape(1, sequenceNum, 63)
    prediction2 = lettersModel2.predict(inputData2, verbose=0)
    index2 = np.argmax(prediction2, axis=1)[0]
    confidence2 = float(np.max(prediction2))
    label2 = labelEncoder2.inverse_transform([index2])[0]
    print(f'Letters Model 2: {label2} at {confidence2}')

    # 4. Static model prediction (fallback/verification)
    if fallback_frame_cv2 is not None:
        # Re-run MediaPipe on the last valid frame
        imgRGB = cv2.cvtColor(fallback_frame_cv2, cv2.COLOR_BGR2RGB)
        results = hands.process(imgRGB)
        if results.multi_hand_landmarks:
            handLandmarks = results.multi_hand_landmarks[0]
            xList = [lm.x for lm in handLandmarks.landmark]
            yList = [lm.y for lm in handLandmarks.landmark]

            # 21 landmarks x 2 values = 42 features for the static models
            dataAux = []
            for lm in handLandmarks.landmark:
                dataAux.append(lm.x - min(xList))
                dataAux.append(lm.y - min(yList))

            # Check in Letters Model 1
            inputData1 = np.array(dataAux, dtype=np.float32).reshape(1, 42, 1)
            prediction1 = lettersModel.predict(inputData1, verbose=0)
            index1 = np.argmax(prediction1, axis=1)[0]
            confidence1 = float(np.max(prediction1))
            label1 = labelEncoder.inverse_transform([index1])[0]
            print(f'Letters Model 1: {label1} at {confidence1}')

            # Check in Numbers Model
            prediction3 = numbersModel.predict(inputData1, verbose=0)
            index3 = np.argmax(prediction3, axis=1)[0]
            confidence3 = float(np.max(prediction3))
            label3 = numLabelEncoder.inverse_transform([index3])[0]
            print(f'Numbers Model: {label3} at {confidence3}')

            # 5. Result aggregation
            if label1 == label2:
                # Both letter models agree
                return {'letter': label2, 'confidenceLetter': confidence2,
                        'number': label3, 'confidenceNumber': confidence3}
            # Default to static model 1 on disagreement (a better fusion
            # strategy could be implemented here)
            return {'letter': label1, 'confidenceLetter': confidence1,
                    'number': label3, 'confidenceNumber': confidence3}

        # Hand detected throughout the sequence but not in the fallback
        # frame on re-processing (unlikely)
        return {'letter': label2, 'confidenceLetter': confidence2,
                'number': '', 'confidenceNumber': 0.0}

    # Defensive fallback: a complete sequence implies at least one decoded
    # frame, so this branch should be unreachable in practice
    return {'letter': label2, 'confidenceLetter': confidence2,
            'number': '', 'confidenceNumber': 0.0}
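
# ----------------------------------------------------------------------
# Usage sketch (not part of the original module; the camera index and
# JPEG encoding below are illustrative assumptions). It shows one way to
# build the bytes input detectFromImage expects: grab sequenceNum frames
# from a capture device and JPEG-encode each frame to raw bytes, which
# the function then decodes with cv2.imdecode.
# ----------------------------------------------------------------------
if __name__ == '__main__':
    cap = cv2.VideoCapture(0)  # hypothetical: default webcam
    sequence = []
    while len(sequence) < sequenceNum:
        ok, frame = cap.read()
        if not ok:
            break
        # Encode the BGR frame to JPEG bytes
        ok, buf = cv2.imencode('.jpg', frame)
        if ok:
            sequence.append(buf.tobytes())
    cap.release()

    if len(sequence) == sequenceNum:
        result = detectFromImage(sequence)
        print(result)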