mutarisi committed
Commit 5efe294 · 1 Parent(s): 641b34a

Fixed upload issue

Files changed (2):
  1. apiRoutes.py +29 -4
  2. lettersController.py +62 -50
apiRoutes.py CHANGED
@@ -1,7 +1,11 @@
 import os
+import shutil
+import tempfile
+import asyncio
 from fastapi import APIRouter, UploadFile, File, HTTPException
 from fastapi.responses import JSONResponse
 from typing import List
+# Ensure these imports are correct
 from lettersController import detectFromImage
 from wordsController import detectWords
 from glossController import translateGloss
@@ -15,8 +19,20 @@ async def process_letters(frames: List[UploadFile] = File(...)):
     if len(frames) != sequence_num:
         raise HTTPException(status_code=400, detail=f"Exactly {sequence_num} frames are required")
 
-    # Call the imported function directly
-    result = detectFromImage(frames)
+    # CRITICAL: Read the binary content of each file.
+    # We pass a list of image bytes (memory buffers), NOT UploadFile objects.
+    image_bytes_list = []
+    try:
+        for frame in frames:
+            # UploadFile.read() is a coroutine that returns the file's bytes
+            contents = await frame.read()
+            image_bytes_list.append(contents)
+    except Exception as e:
+        # Handle potential file read errors
+        raise HTTPException(status_code=500, detail=f"Error reading uploaded file contents: {e}")
+
+    # Pass the list of image bytes to the controller
+    result = detectFromImage(image_bytes_list)
     return JSONResponse(content=result)
 
 @router.post("/processWords")
@@ -26,8 +42,17 @@ async def process_words(frames: List[UploadFile] = File(...)):
     if len(frames) != sequence_num:
         raise HTTPException(status_code=400, detail=f"Exactly {sequence_num} frames are required")
 
+    # CRITICAL: Read the binary content of each file
+    image_bytes_list = []
+    try:
+        for frame in frames:
+            contents = await frame.read()
+            image_bytes_list.append(contents)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error reading uploaded file contents: {e}")
+
     # Call the imported function directly
-    result = detectWords(frames)
+    result = detectWords(image_bytes_list)
     return JSONResponse(content=result)
 
 @router.post("/sentence")
@@ -39,4 +64,4 @@ async def sign_sentence(data: dict):
 
     # Call the imported function directly
     result = translateGloss(gloss_input)
-    return JSONResponse(content=result)
+    return JSONResponse(content=result)
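
For reference, a minimal sketch of exercising the updated endpoint from a client, assuming the router is mounted at the application root on localhost:8000, the letters route is "/processLetters", and sequence_num is 20; the host, port, route path, and frame filenames are all assumptions, not taken from this commit:

# Hypothetical smoke test for the updated letters endpoint.
# Assumes the API runs locally on port 8000 and sequence_num == 20.
import requests

frame_paths = [f"frames/frame_{i:02d}.jpg" for i in range(20)]  # hypothetical frame files

# Each tuple is (field_name, (filename, file_bytes, content_type)); repeating the
# "frames" field is how FastAPI fills List[UploadFile] = File(...).
files = [("frames", (p, open(p, "rb").read(), "image/jpeg")) for p in frame_paths]

response = requests.post("http://localhost:8000/processLetters", files=files)
print(response.status_code, response.json())

Sending exactly 20 parts matters: the endpoint rejects any other count with a 400 before reading the bytes.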
lettersController.py CHANGED
@@ -3,42 +3,69 @@ import numpy as np
 import pickle
 import tensorflow as tf
 import mediapipe as mp
+from typing import List
 
+# ----------------------------------------------------------------------
+# Model and Encoder Loading (this section is unchanged)
+# ----------------------------------------------------------------------
+# Letters Model 1 (static hand signs)
 lettersModel = tf.keras.models.load_model('ai_model/models/detectLettersModel.keras')
 with open('ai_model/models/labelEncoder.pickle', 'rb') as f:
     labelEncoder = pickle.load(f)
 
+# Letters Model 2 (temporal signs with motion, like J and Z)
 lettersModel2 = tf.keras.models.load_model('ai_model/jz_model/JZModel.keras')
 with open('ai_model/jz_model/labelEncoder.pickle', 'rb') as f:
     labelEncoder2 = pickle.load(f)
 
+# Numbers Model (static number signs)
 numbersModel = tf.keras.models.load_model('ai_model/models/detectNumbersModel.keras')
 with open('ai_model/models/numLabelEncoder.pickle', 'rb') as f:
     numLabelEncoder = pickle.load(f)
 
 sequenceNum = 20
 hands = mp.solutions.hands.Hands(static_image_mode=True)
-
-def detectFromImage(sequenceList):
-
+# ----------------------------------------------------------------------
+
+def detectFromImage(sequenceList: List[bytes]):
+    """
+    Processes a sequence of image frames (provided as raw bytes) to detect sign
+    language letters and numbers using multiple models.
+    """
+
+    # 1. Input validation
     if len(sequenceList) != sequenceNum:
-        return {'letter': '', 'confidence': 0.0}
+        return {'letter': '', 'confidenceLetter': 0.0, 'number': '', 'confidenceNumber': 0.0}
 
     processedSequence = []
-
-    for imagePath in sequenceList:
-        image = cv2.imread(imagePath)
+
+    # Placeholder for the last valid frame (used by the static fallback models)
+    fallback_frame_cv2 = None
+
+    # 2. Process the sequence frames (temporal model)
+    for image_bytes in sequenceList:
+        # FIX: decode the raw bytes into an OpenCV image array (cv2.imdecode)
+        np_arr = np.frombuffer(image_bytes, np.uint8)
+        image = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)  # bytes -> BGR image array
+
         if image is None:
+            # Skip corrupted frames
             continue
-
+
+        # Keep the last valid frame in OpenCV format for the static models later
+        fallback_frame_cv2 = image
+
+        # Convert BGR to RGB for MediaPipe
         imgRGB = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
         results = hands.process(imgRGB)
 
         if not results.multi_hand_landmarks:
+            # Skip frames without a detected hand
             continue
 
-        handLandmarks = results.multi_hand_landmarks[0]
+        handLandmarks = results.multi_hand_landmarks[0]
 
+        # Landmark extraction and normalization
         xList, yList = [], []
         dataAux2 = []
 
@@ -46,46 +73,25 @@ def detectFromImage(sequenceList):
             xList.append(lm.x)
             yList.append(lm.y)
 
+        # Normalize landmarks relative to the minimum x and y
         for lm in handLandmarks.landmark:
            dataAux2.append(lm.x - min(xList))
            dataAux2.append(lm.y - min(yList))
-            dataAux2.append(0)
+            dataAux2.append(0)  # Padding the Z dimension
 
         processedSequence.append(dataAux2)
 
     confidence2 = 0.0
     label2 = ""
-    fallback_frame = cv2.imread(sequenceList[-1])
-
-    # for i in range(len(processedSequence)):
-    #     if processedSequence[i] is None:
-    #         prevIdx, nextIdx = -1, -1
-
-    #         for j in range(i - 1, -1, -1):
-    #             if processedSequence[j] is not None:
-    #                 prevIdx = j
-    #                 break
-
-    #         for j in range(i + 1, len(processedSequence)):
-    #             if processedSequence[j] is not None:
-    #                 nextIdx = j
-    #                 break
-
-    #         if prevIdx != -1 and nextIdx != -1:
-    #             prevData = np.array(processedSequence[prevIdx])
-    #             nextData = np.array(processedSequence[nextIdx])
-    #             t = (i - prevIdx) / (nextIdx - prevIdx)
-    #             interpolatedData = prevData + (nextData - prevData) * t
-    #             processedSequence[i] = interpolatedData.tolist()
-    #         elif prevIdx != -1:
-    #             processedSequence[i] = processedSequence[prevIdx]
-    #         elif nextIdx != -1:
-    #             processedSequence[i] = processedSequence[nextIdx]
 
+    # The frame-interpolation logic remains commented out, as in the original code.
+
+    # 3. Temporal model prediction (lettersModel2)
     if len(processedSequence) != sequenceNum:
         print("incomplete sequence: ", len(processedSequence))
+        # The sequence is too short after dropping frames; return an empty result
         return {'letter': '', 'confidenceLetter': 0.0, 'number': '', 'confidenceNumber': 0.0}
-
+
     inputData2 = np.array(processedSequence, dtype=np.float32).reshape(1, sequenceNum, 63)
     prediction2 = lettersModel2.predict(inputData2, verbose=0)
 
@@ -94,9 +100,12 @@ def detectFromImage(sequenceList):
     label2 = labelEncoder2.inverse_transform([index2])[0]
     print(f'Letters Model 2:{label2} at {confidence2}')
 
-    if fallback_frame is not None:
-        imgRGB = cv2.cvtColor(fallback_frame, cv2.COLOR_BGR2RGB)
+    # 4. Static model prediction (fallback/verification)
+    if fallback_frame_cv2 is not None:
+        # Use the last valid frame detected by MediaPipe
+        imgRGB = cv2.cvtColor(fallback_frame_cv2, cv2.COLOR_BGR2RGB)
         results = hands.process(imgRGB)
+
         if results.multi_hand_landmarks:
             handLandmarks = results.multi_hand_landmarks[0]
             xList, yList = [], []
@@ -110,32 +119,35 @@ def detectFromImage(sequenceList):
                 dataAux.append(lm.x - min(xList))
                 dataAux.append(lm.y - min(yList))
 
-            #check in letters model1
+            # check in letters model 1
             inputData1 = np.array(dataAux, dtype=np.float32).reshape(1, 42, 1)
             prediction1 = lettersModel.predict(inputData1, verbose=0)
             index1 = np.argmax(prediction1, axis=1)[0]
             confidence1 = float(np.max(prediction1))
             label1 = labelEncoder.inverse_transform([index1])[0]
-
             print(f'Letters Model 1: {label1} at {confidence1}')
 
+            # check in numbers model
             prediction3 = numbersModel.predict(inputData1, verbose=0)
             index3 = np.argmax(prediction3, axis=1)[0]
             confidence3 = float(np.max(prediction3))
             label3 = numLabelEncoder.inverse_transform([index3])[0]
-
             print(f'Numbers Model: {label3} at {confidence3}')
 
-            if label1==label2:
+            # 5. Result aggregation
+            if label1 == label2:
+                # Both models agree on the letter
                 return {'letter': label2, 'confidenceLetter': confidence2,
                         'number': label3, 'confidenceNumber': confidence3}
-            # elif label2=="Z" and label1=="L":
-            #     return {'letter': label2, 'confidence': confidence2}
-            # elif label2=="J" and label1=="I":
-            #     return {'letter': label2, 'confidence': confidence2}
             else:
+                # Default to static model 1 on disagreement (better fusion logic could go here)
                 return {'letter': label1, 'confidenceLetter': confidence1
-                        , 'number': label3, 'confidenceNumber': confidence3}
-    else:
+                        , 'number': label3, 'confidenceNumber': confidence3}
+        else:
+            # Hand detected in the sequence but not in the final fallback frame (unlikely)
+            return {'letter': label2, 'confidenceLetter': confidence2
+                    , 'number': '', 'confidenceNumber': 0.0}
+    else:
+        # No hand detected in any frame, or all frames failed to decode
         return {'letter': label2, 'confidenceLetter': confidence2
-                , 'number': '', 'confidenceNumber': 0.0}
+                , 'number': '', 'confidenceNumber': 0.0}
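
And a minimal sketch of driving the reworked detectFromImage directly with in-memory frames, assuming sequenceNum is 20 as in the module above; cv2.imencode produces the same kind of JPEG byte buffers the endpoint now forwards, and the clip path is illustrative only:

# Hypothetical direct test of detectFromImage with raw JPEG bytes.
# Assumes sequenceNum == 20; the clip path is illustrative, not from the repo.
import cv2
from lettersController import detectFromImage

cap = cv2.VideoCapture("test_clips/letter_sample.mp4")
image_bytes_list = []
while len(image_bytes_list) < 20:
    ok, frame = cap.read()
    if not ok:
        break
    # Encode each BGR frame to JPEG bytes, mirroring what the API now passes in
    encoded, buf = cv2.imencode(".jpg", frame)
    if encoded:
        image_bytes_list.append(buf.tobytes())
cap.release()

if len(image_bytes_list) == 20:
    print(detectFromImage(image_bytes_list))
else:
    print(f"only captured {len(image_bytes_list)} frames; 20 are required")

The encode/decode round trip (cv2.imencode here, np.frombuffer plus cv2.imdecode in the controller) is what replaces the old cv2.imread-on-a-path flow that broke once uploads arrived as in-memory files.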