mutarisi committed
Commit · 5efe294
Parent(s): 641b34a
fixed upload issue

Files changed:
- apiRoutes.py (+29 -4)
- lettersController.py (+62 -50)
apiRoutes.py (CHANGED)
@@ -1,7 +1,11 @@
 import os
+import shutil
+import tempfile
+import asyncio
 from fastapi import APIRouter, UploadFile, File, HTTPException
 from fastapi.responses import JSONResponse
 from typing import List
+# Ensure these imports are correct
 from lettersController import detectFromImage
 from wordsController import detectWords
 from glossController import translateGloss

@@ -15,8 +19,20 @@ async def process_letters(frames: List[UploadFile] = File(...)):
     if len(frames) != sequence_num:
         raise HTTPException(status_code=400, detail=f"Exactly {sequence_num} frames are required")

-    #
-
+    # CRITICAL: Read the binary content of each file.
+    # We pass a list of image bytes (in-memory buffers), NOT UploadFile objects.
+    image_bytes_list = []
+    try:
+        for frame in frames:
+            # UploadFile.read() is async and returns the file's raw bytes
+            contents = await frame.read()
+            image_bytes_list.append(contents)
+    except Exception as e:
+        # Handle potential file read errors
+        raise HTTPException(status_code=500, detail=f"Error reading uploaded file contents: {e}")
+
+    # Pass the list of image bytes to the controller
+    result = detectFromImage(image_bytes_list)
     return JSONResponse(content=result)

 @router.post("/processWords")

@@ -26,8 +42,17 @@ async def process_words(frames: List[UploadFile] = File(...)):
     if len(frames) != sequence_num:
         raise HTTPException(status_code=400, detail=f"Exactly {sequence_num} frames are required")

+    # CRITICAL: Read the binary content of each file
+    image_bytes_list = []
+    try:
+        for frame in frames:
+            contents = await frame.read()
+            image_bytes_list.append(contents)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error reading uploaded file contents: {e}")
+
     # Call the imported function directly
-    result = detectWords(
+    result = detectWords(image_bytes_list)
     return JSONResponse(content=result)

 @router.post("/sentence")

@@ -39,4 +64,4 @@ async def sign_sentence(data: dict):

     # Call the imported function directly
     result = translateGloss(gloss_input)
-    return JSONResponse(content=result)
+    return JSONResponse(content=result)
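A quick way to exercise the fixed upload path end to end is a FastAPI TestClient smoke test. The sketch below is hypothetical and not part of the commit: it assumes apiRoutes.py exposes the module-level `router` used above, that the letters endpoint is registered as `/processLetters`, and that `sequence_num` is 20.

# hypothetical smoke test for the fixed upload handling (not in the commit)
import numpy as np
import cv2
from fastapi import FastAPI
from fastapi.testclient import TestClient

from apiRoutes import router  # assumed module-level router

app = FastAPI()
app.include_router(router)
client = TestClient(app)

def make_jpeg_bytes() -> bytes:
    # Encode a blank 64x64 BGR frame as JPEG, mimicking a browser upload
    ok, buf = cv2.imencode('.jpg', np.zeros((64, 64, 3), dtype=np.uint8))
    assert ok
    return buf.tobytes()

# The endpoint requires exactly 20 frames, sent as repeated multipart fields
files = [('frames', (f'frame{i}.jpg', make_jpeg_bytes(), 'image/jpeg')) for i in range(20)]
response = client.post('/processLetters', files=files)
print(response.status_code, response.json())  # blank frames should yield the empty-result dict

Because the handler now forwards raw bytes, the controller never touches UploadFile objects, so it can stay synchronous.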
lettersController.py (CHANGED)
@@ -3,42 +3,69 @@ import numpy as np
 import pickle
 import tensorflow as tf
 import mediapipe as mp
+from typing import List

+# ----------------------------------------------------------------------
+# Model and Encoder Loading
+# ----------------------------------------------------------------------
+# Letters Model 1 (static hand signs)
 lettersModel = tf.keras.models.load_model('ai_model/models/detectLettersModel.keras')
 with open('ai_model/models/labelEncoder.pickle', 'rb') as f:
     labelEncoder = pickle.load(f)

+# Letters Model 2 (temporal signs like J and Z, which involve motion)
 lettersModel2 = tf.keras.models.load_model('ai_model/jz_model/JZModel.keras')
 with open('ai_model/jz_model/labelEncoder.pickle', 'rb') as f:
     labelEncoder2 = pickle.load(f)

+# Numbers Model (static number signs)
 numbersModel = tf.keras.models.load_model('ai_model/models/detectNumbersModel.keras')
 with open('ai_model/models/numLabelEncoder.pickle', 'rb') as f:
     numLabelEncoder = pickle.load(f)

 sequenceNum = 20
 hands = mp.solutions.hands.Hands(static_image_mode=True)
-
-
-
+# ----------------------------------------------------------------------
+
+def detectFromImage(sequenceList: List[bytes]):
+    """
+    Processes a sequence of image frames (provided as raw bytes) to detect sign
+    language letters and numbers using multiple models.
+    """
+
+    # 1. Input Validation
     if len(sequenceList) != sequenceNum:
-        return {'letter': '', '
+        return {'letter': '', 'confidenceLetter': 0.0, 'number': '', 'confidenceNumber': 0.0}

     processedSequence = []
-
-    for
-
+
+    # Placeholder for the last valid frame (used for the static fallback models)
+    fallback_frame_cv2 = None
+
+    # 2. Process Sequence Frames (Temporal Model)
+    for image_bytes in sequenceList:
+        # --- FIX: Decode bytes into an OpenCV image array (cv2.imdecode) ---
+        np_arr = np.frombuffer(image_bytes, np.uint8)
+        image = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)  # Convert bytes to a BGR image array
+
         if image is None:
+            # Skip corrupted frames
             continue
-
+
+        # Keep the last valid frame in OpenCV format for the static models later
+        fallback_frame_cv2 = image
+
+        # Convert BGR to RGB for MediaPipe
         imgRGB = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
         results = hands.process(imgRGB)

         if not results.multi_hand_landmarks:
+            # Skip frames without a detected hand
             continue

-        handLandmarks = results.multi_hand_landmarks[0]
+        handLandmarks = results.multi_hand_landmarks[0]

+        # --- Landmark extraction and normalization ---
         xList, yList = [], []
         dataAux2 = []

@@ -46,46 +73,25 @@ def detectFromImage(sequenceList):
             xList.append(lm.x)
             yList.append(lm.y)

+        # Normalize landmarks relative to the minimum x and y
         for lm in handLandmarks.landmark:
             dataAux2.append(lm.x - min(xList))
             dataAux2.append(lm.y - min(yList))
-            dataAux2.append(0)
+            dataAux2.append(0)  # Pad the Z dimension

         processedSequence.append(dataAux2)

     confidence2 = 0.0
     label2 = ""
-    fallback_frame = cv2.imread(sequenceList[-1])
-
-    # for i in range(len(processedSequence)):
-    #     if processedSequence[i] is None:
-    #         prevIdx, nextIdx = -1, -1
-
-    #         for j in range(i - 1, -1, -1):
-    #             if processedSequence[j] is not None:
-    #                 prevIdx = j
-    #                 break
-
-    #         for j in range(i + 1, len(processedSequence)):
-    #             if processedSequence[j] is not None:
-    #                 nextIdx = j
-    #                 break
-
-    #         if prevIdx != -1 and nextIdx != -1:
-    #             prevData = np.array(processedSequence[prevIdx])
-    #             nextData = np.array(processedSequence[nextIdx])
-    #             t = (i - prevIdx) / (nextIdx - prevIdx)
-    #             interpolatedData = prevData + (nextData - prevData) * t
-    #             processedSequence[i] = interpolatedData.tolist()
-    #         elif prevIdx != -1:
-    #             processedSequence[i] = processedSequence[prevIdx]
-    #         elif nextIdx != -1:
-    #             processedSequence[i] = processedSequence[nextIdx]

+    # The commented-out frame-interpolation logic was removed in this commit.
+
+    # 3. Temporal Model Prediction (lettersModel2)
     if len(processedSequence) != sequenceNum:
         print("incomplete sequence: ", len(processedSequence))
+        # If the sequence is too short after dropping frames, return an empty result
         return {'letter': '', 'confidenceLetter': 0.0, 'number': '', 'confidenceNumber': 0.0}
-
+
     inputData2 = np.array(processedSequence, dtype=np.float32).reshape(1, sequenceNum, 63)
     prediction2 = lettersModel2.predict(inputData2, verbose=0)

@@ -94,9 +100,12 @@ def detectFromImage(sequenceList):
     label2 = labelEncoder2.inverse_transform([index2])[0]
     print(f'Letters Model 2:{label2} at {confidence2}')

-
-
+    # 4. Static Model Prediction (Fallback/Verification)
+    if fallback_frame_cv2 is not None:
+        # Re-run MediaPipe on the last valid decoded frame
+        imgRGB = cv2.cvtColor(fallback_frame_cv2, cv2.COLOR_BGR2RGB)
         results = hands.process(imgRGB)
+
         if results.multi_hand_landmarks:
             handLandmarks = results.multi_hand_landmarks[0]
             xList, yList = [], []

@@ -110,32 +119,35 @@ def detectFromImage(sequenceList):
                 dataAux.append(lm.x - min(xList))
                 dataAux.append(lm.y - min(yList))

-            #check in letters
+            # Check the static letters model
             inputData1 = np.array(dataAux, dtype=np.float32).reshape(1, 42, 1)
             prediction1 = lettersModel.predict(inputData1, verbose=0)
             index1 = np.argmax(prediction1, axis=1)[0]
             confidence1 = float(np.max(prediction1))
             label1 = labelEncoder.inverse_transform([index1])[0]
-
             print(f'Letters Model 1: {label1} at {confidence1}')

+            # Check the numbers model
             prediction3 = numbersModel.predict(inputData1, verbose=0)
             index3 = np.argmax(prediction3, axis=1)[0]
             confidence3 = float(np.max(prediction3))
             label3 = numLabelEncoder.inverse_transform([index3])[0]
-
             print(f'Numbers Model: {label3} at {confidence3}')

-
+            # 5. Result Aggregation
+            if label1 == label2:
+                # Both models agree on the letter
                 return {'letter': label2, 'confidenceLetter': confidence2,
                         'number': label3, 'confidenceNumber': confidence3}
-            # elif label2=="Z" and label1=="L":
-            #     return {'letter': label2, 'confidence': confidence2}
-            # elif label2=="J" and label1=="I":
-            #     return {'letter': label2, 'confidence': confidence2}
             else:
+                # Default to static model 1 on disagreement (or implement better fusion logic here)
                 return {'letter': label1, 'confidenceLetter': confidence1
-                        , 'number': label3, 'confidenceNumber': confidence3}
-
+                        , 'number': label3, 'confidenceNumber': confidence3}
+        else:
+            # Hand detected in the sequence but not in the final fallback frame (unlikely)
+            return {'letter': label2, 'confidenceLetter': confidence2
+                    , 'number': '', 'confidenceNumber': 0.0}
+    else:
+        # No hand detected in any frame, or all frames failed to decode
         return {'letter': label2, 'confidenceLetter': confidence2
-                , 'number': '', 'confidenceNumber': 0.0}
+                , 'number': '', 'confidenceNumber': 0.0}