Spaces:
Sleeping
Sleeping
| import cv2 | |
| import numpy as np | |
| import pickle | |
| import tensorflow as tf | |
| import mediapipe as mp | |
| from typing import List | |
# ----------------------------------------------------------------------
# Model and Encoder Loading (This section should remain unchanged)
# ----------------------------------------------------------------------
# All three Keras models and their matching sklearn-style label encoders are
# loaded once at import time so each request only pays for inference.
# NOTE(review): pickle.load is only safe on trusted, locally shipped encoder
# files — never point these paths at untrusted input.

# Letters Model 1 (Static hand signs)
lettersModel = tf.keras.models.load_model('ai_model/models/detectLettersModel.keras')
with open('ai_model/models/labelEncoder.pickle', 'rb') as f:
    labelEncoder = pickle.load(f)
# Letters Model 2 (Temporal signs like J, Z, motion)
lettersModel2 = tf.keras.models.load_model('ai_model/jz_model/JZModel.keras')
with open('ai_model/jz_model/labelEncoder.pickle', 'rb') as f:
    labelEncoder2 = pickle.load(f)
# Numbers Model (Static number signs)
numbersModel = tf.keras.models.load_model('ai_model/models/detectNumbersModel.keras')
with open('ai_model/models/numLabelEncoder.pickle', 'rb') as f:
    numLabelEncoder = pickle.load(f)

# Number of frames the temporal model expects per detection sequence.
sequenceNum = 20
# static_image_mode=True: every frame is detected independently (no tracking
# between frames), which suits processing an arbitrary list of still images.
hands = mp.solutions.hands.Hands(static_image_mode=True)
# ----------------------------------------------------------------------
| # ---------------------------------------------------------------------- | |
def _normalizedLandmarks(handLandmarks, pad_z: bool = False) -> List[float]:
    """Flatten one hand's landmarks into features normalized to the hand's bounding box origin.

    Each landmark contributes (x - min_x, y - min_y); with ``pad_z=True`` a
    constant 0 is appended per landmark to match the temporal model's 63-value
    layout (21 landmarks * 3). Without padding the result is 42 values for the
    static models.
    """
    xs = [lm.x for lm in handLandmarks.landmark]
    ys = [lm.y for lm in handLandmarks.landmark]
    # Hoist the minima out of the loop (the original recomputed them per landmark).
    minX, minY = min(xs), min(ys)
    features: List[float] = []
    for lm in handLandmarks.landmark:
        features.append(lm.x - minX)
        features.append(lm.y - minY)
        if pad_z:
            features.append(0)  # Padding the Z dimension
    return features


def _predictLabel(model, encoder, inputData):
    """Run ``model`` on ``inputData`` and return ``(label, confidence)``.

    Label is decoded through ``encoder.inverse_transform``; confidence is the
    max softmax probability as a plain float.
    """
    prediction = model.predict(inputData, verbose=0)
    index = np.argmax(prediction, axis=1)[0]
    confidence = float(np.max(prediction))
    label = encoder.inverse_transform([index])[0]
    return label, confidence


def detectFromImage(sequenceList: List[bytes]):
    """Process a sequence of image frames (raw encoded bytes) and detect sign
    language letters and numbers using multiple models.

    Pipeline:
      1. Decode each frame (corrupted frames are skipped) and extract hand
         landmarks with MediaPipe.
      2. Feed the full normalized sequence to the temporal letters model.
      3. Re-run the last decoded frame through the static letters and numbers
         models as a fallback/verification step.
      4. Prefer the temporal letter when both letter models agree, otherwise
         fall back to the static letter model's prediction.

    Returns a dict with keys 'letter', 'confidenceLetter', 'number',
    'confidenceNumber'; empty/zero values when the sequence is unusable.
    """
    def emptyResult():
        # Fresh dict each time so callers may mutate the result safely.
        return {'letter': '', 'confidenceLetter': 0.0, 'number': '', 'confidenceNumber': 0.0}

    # 1. Input Validation — require exactly sequenceNum frames.
    if len(sequenceList) != sequenceNum:
        return emptyResult()

    processedSequence: List[List[float]] = []
    # Last successfully decoded frame (used for the static fallback models);
    # note it may be a frame in which no hand was detected.
    fallback_frame_cv2 = None

    # 2. Process Sequence Frames (Temporal Model)
    for image_bytes in sequenceList:
        # Decode bytes into an OpenCV BGR image array.
        np_arr = np.frombuffer(image_bytes, np.uint8)
        image = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
        if image is None:
            continue  # Skip corrupted frames
        fallback_frame_cv2 = image
        # MediaPipe expects RGB input.
        results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        if not results.multi_hand_landmarks:
            continue  # Skip frames without a detected hand
        # Only the first detected hand is used.
        processedSequence.append(
            _normalizedLandmarks(results.multi_hand_landmarks[0], pad_z=True))

    # 3. Temporal Model Prediction (LettersModel2)
    if len(processedSequence) != sequenceNum:
        print("incomplete sequence: ", len(processedSequence))
        # If the sequence is too short after dropping frames, return empty result.
        return emptyResult()

    inputData2 = np.array(processedSequence, dtype=np.float32).reshape(1, sequenceNum, 63)
    label2, confidence2 = _predictLabel(lettersModel2, labelEncoder2, inputData2)
    print(f'Letters Model 2:{label2} at {confidence2}')

    # 4. Static Model Prediction (Fallback/Verification) on the last decoded frame.
    if fallback_frame_cv2 is None:
        # No frame decoded at all (cannot normally happen once the sequence
        # check above passed, kept for safety).
        return {'letter': label2, 'confidenceLetter': confidence2,
                'number': '', 'confidenceNumber': 0.0}

    results = hands.process(cv2.cvtColor(fallback_frame_cv2, cv2.COLOR_BGR2RGB))
    if not results.multi_hand_landmarks:
        # Hand detected somewhere in the sequence but not in the fallback frame.
        return {'letter': label2, 'confidenceLetter': confidence2,
                'number': '', 'confidenceNumber': 0.0}

    dataAux = _normalizedLandmarks(results.multi_hand_landmarks[0])
    # Both static models share the same 42-feature input layout.
    inputData1 = np.array(dataAux, dtype=np.float32).reshape(1, 42, 1)

    # check in letters model 1
    label1, confidence1 = _predictLabel(lettersModel, labelEncoder, inputData1)
    print(f'Letters Model 1: {label1} at {confidence1}')
    # check in numbers model
    label3, confidence3 = _predictLabel(numbersModel, numLabelEncoder, inputData1)
    print(f'Numbers Model: {label3} at {confidence3}')

    # 5. Result Aggregation
    if label1 == label2:
        # Both letter models agree — report the temporal model's confidence.
        return {'letter': label2, 'confidenceLetter': confidence2,
                'number': label3, 'confidenceNumber': confidence3}
    # Default to static model 1 on disagreement (or implement better fusion logic here).
    return {'letter': label1, 'confidenceLetter': confidence1,
            'number': label3, 'confidenceNumber': confidence3}