# NOTE(review): removed non-Python residue scraped from a web file viewer
# (Space status lines, commit hashes, and a line-number gutter). It was not
# part of the module and made the file unparseable.
import cv2
import numpy as np
import pickle
import tensorflow as tf
import mediapipe as mp
from typing import List
# ----------------------------------------------------------------------
# Model and Encoder Loading (This section should remain unchanged)
# ----------------------------------------------------------------------
# All artifacts are loaded once at import time; the functions below use
# them as module-level globals.
# NOTE(review): pickle.load executes arbitrary code from the file — only
# load these encoder pickles from trusted model artifacts.

# Letters Model 1 (Static hand signs): classifies one frame's normalized
# (x, y) landmark vector (42 values, see detectFromImage) into a letter.
lettersModel = tf.keras.models.load_model('ai_model/models/detectLettersModel.keras')
with open('ai_model/models/labelEncoder.pickle', 'rb') as f:
    labelEncoder = pickle.load(f)

# Letters Model 2 (Temporal signs like J, Z, motion): consumes a sequence
# of sequenceNum frames x 63 features (x, y, z-padded-with-0 per landmark).
lettersModel2 = tf.keras.models.load_model('ai_model/jz_model/JZModel.keras')
with open('ai_model/jz_model/labelEncoder.pickle', 'rb') as f:
    labelEncoder2 = pickle.load(f)

# Numbers Model (Static number signs): same 42-value input as Model 1.
numbersModel = tf.keras.models.load_model('ai_model/models/detectNumbersModel.keras')
with open('ai_model/models/numLabelEncoder.pickle', 'rb') as f:
    numLabelEncoder = pickle.load(f)

# Required number of frames per detection sequence.
sequenceNum = 20
# static_image_mode=True: every frame is detected independently (no
# tracking state carried between frames).
hands = mp.solutions.hands.Hands(static_image_mode=True)
# ----------------------------------------------------------------------
# ----------------------------------------------------------------------
def _normalizedXYFeatures(handLandmarks) -> List[float]:
    """Return the 42-value feature vector for one detected hand.

    Features are the (x, y) coordinates of each of the 21 MediaPipe hand
    landmarks, shifted so the minimum x and minimum y become 0 (translation
    normalization). Order: x0, y0, x1, y1, ...
    """
    xList = [lm.x for lm in handLandmarks.landmark]
    yList = [lm.y for lm in handLandmarks.landmark]
    # Hoist the minima out of the loop (previously recomputed per landmark).
    minX, minY = min(xList), min(yList)
    features: List[float] = []
    for x, y in zip(xList, yList):
        features.append(x - minX)
        features.append(y - minY)
    return features


def detectFromImage(sequenceList: List[bytes]):
    """Detect a sign-language letter and number from a sequence of frames.

    Parameters
    ----------
    sequenceList : List[bytes]
        Exactly ``sequenceNum`` encoded images (e.g. JPEG/PNG bytes).

    Returns
    -------
    dict
        ``{'letter', 'confidenceLetter', 'number', 'confidenceNumber'}``.
        Empty labels with 0.0 confidence are returned when the input length
        is wrong or any frame is dropped (decode failure / no hand), since
        the temporal model needs a complete sequence.

    Pipeline: the temporal model (lettersModel2) scores the whole sequence;
    the static letter and number models then score the last frame's
    landmarks (reused from the sequence pass instead of re-running
    MediaPipe on the same image). If the two letter models agree, the
    temporal prediction is reported; otherwise the static one wins.
    """
    emptyResult = {'letter': '', 'confidenceLetter': 0.0,
                   'number': '', 'confidenceNumber': 0.0}

    # 1. Input validation: the temporal model requires a full sequence.
    if len(sequenceList) != sequenceNum:
        return emptyResult

    processedSequence = []
    # 42-value features of the last frame with a detected hand; reused by
    # the static models so the last frame is not decoded/processed twice.
    lastFrameFeatures = None

    # 2. Decode each frame and extract normalized landmarks.
    for image_bytes in sequenceList:
        np_arr = np.frombuffer(image_bytes, np.uint8)
        image = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)  # bytes -> BGR array
        if image is None:
            continue  # skip corrupted frames

        # MediaPipe expects RGB input.
        results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        if not results.multi_hand_landmarks:
            continue  # skip frames without a detected hand

        lastFrameFeatures = _normalizedXYFeatures(results.multi_hand_landmarks[0])

        # Temporal model wants 63 values per frame: (x, y, 0) per landmark
        # (z is padded with 0 to match the model's training layout).
        frameFeatures = []
        for i in range(0, len(lastFrameFeatures), 2):
            frameFeatures.extend((lastFrameFeatures[i], lastFrameFeatures[i + 1], 0))
        processedSequence.append(frameFeatures)

    # 3. Temporal model prediction (lettersModel2). Any dropped frame makes
    # the sequence too short for the fixed-length temporal model.
    if len(processedSequence) != sequenceNum:
        print("incomplete sequence: ", len(processedSequence))
        return emptyResult

    inputData2 = np.array(processedSequence, dtype=np.float32).reshape(1, sequenceNum, 63)
    prediction2 = lettersModel2.predict(inputData2, verbose=0)
    index2 = np.argmax(prediction2, axis=1)[0]
    confidence2 = float(np.max(prediction2))
    label2 = labelEncoder2.inverse_transform([index2])[0]
    print(f'Letters Model 2:{label2} at {confidence2}')

    if lastFrameFeatures is None:
        # Defensive: unreachable in practice, since a complete sequence
        # implies at least one frame with a detected hand.
        return {'letter': label2, 'confidenceLetter': confidence2,
                'number': '', 'confidenceNumber': 0.0}

    # 4. Static model predictions on the last valid frame's landmarks.
    inputData1 = np.array(lastFrameFeatures, dtype=np.float32).reshape(1, 42, 1)

    prediction1 = lettersModel.predict(inputData1, verbose=0)
    index1 = np.argmax(prediction1, axis=1)[0]
    confidence1 = float(np.max(prediction1))
    label1 = labelEncoder.inverse_transform([index1])[0]
    print(f'Letters Model 1: {label1} at {confidence1}')

    prediction3 = numbersModel.predict(inputData1, verbose=0)
    index3 = np.argmax(prediction3, axis=1)[0]
    confidence3 = float(np.max(prediction3))
    label3 = numLabelEncoder.inverse_transform([index3])[0]
    print(f'Numbers Model: {label3} at {confidence3}')

    # 5. Result aggregation: prefer the temporal prediction when both
    # letter models agree, otherwise fall back to the static model.
    if label1 == label2:
        return {'letter': label2, 'confidenceLetter': confidence2,
                'number': label3, 'confidenceNumber': confidence3}
    return {'letter': label1, 'confidenceLetter': confidence1,
            'number': label3, 'confidenceNumber': confidence3}