Spaces:
Sleeping
Sleeping
| import cv2 | |
| import numpy as np | |
| import pickle | |
| import tensorflow as tf | |
| import mediapipe as mp | |
| from typing import List | |
# ----------------------------------------------------------------------
# Model and Encoder Loading (This section should remain unchanged)
# ----------------------------------------------------------------------
# All three Keras models and their matching sklearn-style label encoders are
# loaded once at import time so each request only pays for inference.
# NOTE(review): pickle.load is only safe on trusted, locally shipped encoder
# files — never point these paths at untrusted input.

# Letters Model 1 (Static hand signs)
lettersModel = tf.keras.models.load_model('ai_model/models/detectLettersModel.keras')
with open('ai_model/models/labelEncoder.pickle', 'rb') as f:
    labelEncoder = pickle.load(f)
# Letters Model 2 (Temporal signs like J, Z, motion)
lettersModel2 = tf.keras.models.load_model('ai_model/jz_model/JZModel.keras')
with open('ai_model/jz_model/labelEncoder.pickle', 'rb') as f:
    labelEncoder2 = pickle.load(f)
# Numbers Model (Static number signs)
numbersModel = tf.keras.models.load_model('ai_model/models/detectNumbersModel.keras')
with open('ai_model/models/numLabelEncoder.pickle', 'rb') as f:
    numLabelEncoder = pickle.load(f)

# Number of frames the temporal model expects per detection sequence.
sequenceNum = 20
# static_image_mode=True: every frame is detected independently (no tracking
# between frames), which suits processing an arbitrary list of still images.
hands = mp.solutions.hands.Hands(static_image_mode=True)
# ----------------------------------------------------------------------
| # ---------------------------------------------------------------------- | |
def _normalizedLandmarks(handLandmarks, pad_z: bool = False) -> List[float]:
    """Flatten one hand's landmarks into features normalized to the hand's bounding box origin.

    Each landmark contributes (x - min_x, y - min_y); with ``pad_z=True`` a
    constant 0 is appended per landmark to match the temporal model's 63-value
    layout (21 landmarks * 3). Without padding the result is 42 values for the
    static models.
    """
    xs = [lm.x for lm in handLandmarks.landmark]
    ys = [lm.y for lm in handLandmarks.landmark]
    # Hoist the minima out of the loop (the original recomputed them per landmark).
    minX, minY = min(xs), min(ys)
    features: List[float] = []
    for lm in handLandmarks.landmark:
        features.append(lm.x - minX)
        features.append(lm.y - minY)
        if pad_z:
            features.append(0)  # Padding the Z dimension
    return features


def _predictLabel(model, encoder, inputData):
    """Run ``model`` on ``inputData`` and return ``(label, confidence)``.

    Label is decoded through ``encoder.inverse_transform``; confidence is the
    max softmax probability as a plain float.
    """
    prediction = model.predict(inputData, verbose=0)
    index = np.argmax(prediction, axis=1)[0]
    confidence = float(np.max(prediction))
    label = encoder.inverse_transform([index])[0]
    return label, confidence


def detectFromImage(sequenceList: List[bytes]):
    """Process a sequence of image frames (raw encoded bytes) and detect sign
    language letters and numbers using multiple models.

    Pipeline:
      1. Decode each frame (corrupted frames are skipped) and extract hand
         landmarks with MediaPipe.
      2. Feed the full normalized sequence to the temporal letters model.
      3. Re-run the last decoded frame through the static letters and numbers
         models as a fallback/verification step.
      4. Prefer the temporal letter when both letter models agree, otherwise
         fall back to the static letter model's prediction.

    Returns a dict with keys 'letter', 'confidenceLetter', 'number',
    'confidenceNumber'; empty/zero values when the sequence is unusable.
    """
    def emptyResult():
        # Fresh dict each time so callers may mutate the result safely.
        return {'letter': '', 'confidenceLetter': 0.0, 'number': '', 'confidenceNumber': 0.0}

    # 1. Input Validation — require exactly sequenceNum frames.
    if len(sequenceList) != sequenceNum:
        return emptyResult()

    processedSequence: List[List[float]] = []
    # Last successfully decoded frame (used for the static fallback models);
    # note it may be a frame in which no hand was detected.
    fallback_frame_cv2 = None

    # 2. Process Sequence Frames (Temporal Model)
    for image_bytes in sequenceList:
        # Decode bytes into an OpenCV BGR image array.
        np_arr = np.frombuffer(image_bytes, np.uint8)
        image = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
        if image is None:
            continue  # Skip corrupted frames
        fallback_frame_cv2 = image
        # MediaPipe expects RGB input.
        results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        if not results.multi_hand_landmarks:
            continue  # Skip frames without a detected hand
        # Only the first detected hand is used.
        processedSequence.append(
            _normalizedLandmarks(results.multi_hand_landmarks[0], pad_z=True))

    # 3. Temporal Model Prediction (LettersModel2)
    if len(processedSequence) != sequenceNum:
        print("incomplete sequence: ", len(processedSequence))
        # If the sequence is too short after dropping frames, return empty result.
        return emptyResult()

    inputData2 = np.array(processedSequence, dtype=np.float32).reshape(1, sequenceNum, 63)
    label2, confidence2 = _predictLabel(lettersModel2, labelEncoder2, inputData2)
    print(f'Letters Model 2:{label2} at {confidence2}')

    # 4. Static Model Prediction (Fallback/Verification) on the last decoded frame.
    if fallback_frame_cv2 is None:
        # No frame decoded at all (cannot normally happen once the sequence
        # check above passed, kept for safety).
        return {'letter': label2, 'confidenceLetter': confidence2,
                'number': '', 'confidenceNumber': 0.0}

    results = hands.process(cv2.cvtColor(fallback_frame_cv2, cv2.COLOR_BGR2RGB))
    if not results.multi_hand_landmarks:
        # Hand detected somewhere in the sequence but not in the fallback frame.
        return {'letter': label2, 'confidenceLetter': confidence2,
                'number': '', 'confidenceNumber': 0.0}

    dataAux = _normalizedLandmarks(results.multi_hand_landmarks[0])
    # Both static models share the same 42-feature input layout.
    inputData1 = np.array(dataAux, dtype=np.float32).reshape(1, 42, 1)

    # check in letters model 1
    label1, confidence1 = _predictLabel(lettersModel, labelEncoder, inputData1)
    print(f'Letters Model 1: {label1} at {confidence1}')
    # check in numbers model
    label3, confidence3 = _predictLabel(numbersModel, numLabelEncoder, inputData1)
    print(f'Numbers Model: {label3} at {confidence3}')

    # 5. Result Aggregation
    if label1 == label2:
        # Both letter models agree — report the temporal model's confidence.
        return {'letter': label2, 'confidenceLetter': confidence2,
                'number': label3, 'confidenceNumber': confidence3}
    # Default to static model 1 on disagreement (or implement better fusion logic here).
    return {'letter': label1, 'confidenceLetter': confidence1,
            'number': label3, 'confidenceNumber': confidence3}