SondosM committed on
Commit
95cd476
·
verified ·
1 Parent(s): 23ecb89

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -263
app.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Arabic Sign Language Interpreter - FastAPI Server (Optimized)
3
  """
4
 
5
  import io
@@ -28,14 +28,7 @@ from fastapi.responses import JSONResponse
28
  import uvicorn
29
  from huggingface_hub import hf_hub_download
30
 
31
- # ─── Runtime mode ──────────────────────────────────────────────────────────────
32
- MODE = os.environ.get("MODE", "full").lower()
33
- assert MODE in ("full", "quantized", "lightweight"), \
34
- f"Unknown MODE={MODE!r}. Choose full | quantized | lightweight."
35
-
36
- print(f"[INFO] Running in MODE={MODE!r}")
37
-
38
- # ─── Compatibility patches ─────────────────────────────────────────────────────
39
  if not hasattr(inspect, "getargspec"):
40
  inspect.getargspec = inspect.getfullargspec
41
 
@@ -44,7 +37,7 @@ for attr, typ in [("int", int), ("float", float), ("complex", complex),
44
  if not hasattr(np, attr):
45
  setattr(np, attr, typ)
46
 
47
- # ─── Pyrender / OpenGL mock (headless) ────────────────────────────────────────
48
  pyrender_mock = types.ModuleType("pyrender")
49
  for _attr in ["Scene", "Mesh", "Node", "PerspectiveCamera", "DirectionalLight",
50
  "PointLight", "SpotLight", "OffscreenRenderer", "RenderFlags",
@@ -59,7 +52,7 @@ for _mod in ["OpenGL", "OpenGL.GL", "OpenGL.GL.framebufferobjects",
59
 
60
  os.environ["PYOPENGL_PLATFORM"] = "osmesa"
61
 
62
- # ─── Hugging Face Model Integration ───────────────────────────────────────────
63
  REPO_ID = "SondosM/api_GP"
64
 
65
  def get_hf_file(filename, is_mano=False):
@@ -76,24 +69,23 @@ def get_hf_file(filename, is_mano=False):
76
 
77
  return temp_path
78
 
79
- # ─── Download all required files at startup ───────────────────────────────────
80
  print("Initializing model file paths...")
81
 
82
- # MANO files
83
  get_hf_file("mano_data/mano_data/mano_mean_params.npz", is_mano=True)
84
  get_hf_file("mano_data/mano_data/MANO_LEFT.pkl", is_mano=True)
85
  get_hf_file("mano_data/mano_data/MANO_RIGHT.pkl", is_mano=True)
86
 
87
- # Model weights
88
- WILOR_REPO_PATH = "./WiLoR"
89
- WILOR_CKPT = get_hf_file("pretrained_models/pretrained_models/wilor_final.ckpt")
90
- WILOR_CFG = get_hf_file("pretrained_models/pretrained_models/model_config.yaml")
91
- DETECTOR_PATH = get_hf_file("pretrained_models/pretrained_models/detector.pt")
92
 
93
- # Classifiers
94
- CLASSIFIER_PATH = "classifier.pkl"
95
- MLP_LETTERS_PATH = "MLP_letters.pkl"
96
- MLP_NUMBERS_PATH = "MLP_numbers.pkl"
 
97
 
98
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
99
 
@@ -102,344 +94,211 @@ WILOR_TRANSFORM = transforms.Compose([
102
  transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
103
  ])
104
 
105
- # ─── Global model handles ──────────────────────────────────────────────────────
106
- wilor_model = None
107
- yolo_detector = None
108
- mp_hands_model = None
109
- classifier = None
110
- mlp_letters = None
111
- mlp_numbers = None
112
 
113
 
114
- # ─────────────────────────────────────────────────────────────────────────────
115
- # Model loading
116
- # ─────────────────────────────────────────────────────────────────────────────
117
 
118
- def _load_wilor_full():
119
  sys.path.insert(0, WILOR_REPO_PATH)
120
  from wilor.models import load_wilor
121
- model, _ = load_wilor(checkpoint_path=WILOR_CKPT, cfg_path=WILOR_CFG)
122
- model.to(DEVICE).eval()
123
- return model
124
 
 
 
 
 
125
 
126
- def _load_wilor_quantized():
127
- sys.path.insert(0, WILOR_REPO_PATH)
128
- from wilor.models import load_wilor
129
- import torch.quantization
130
 
131
- print("[INFO] Loading WiLoR (FP16 + INT8 quantization)...")
132
- model, _ = load_wilor(checkpoint_path=WILOR_CKPT, cfg_path=WILOR_CFG)
133
- model.eval()
134
- model = model.half()
135
- model = torch.quantization.quantize_dynamic(
136
- model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8,
137
- )
138
- model.to("cpu")
139
- return model
140
 
 
141
 
142
- def _load_mediapipe():
143
- import mediapipe as mp
144
- return mp.solutions.hands.Hands(
145
- static_image_mode=True,
146
- max_num_hands=1,
147
- min_detection_confidence=0.5,
148
- model_complexity=1,
149
- )
150
 
 
 
 
 
151
 
152
- def load_models():
153
- global wilor_model, yolo_detector, mp_hands_model
154
- global classifier, mlp_letters, mlp_numbers
155
 
156
- print(f"[INFO] Loading stage-1 classifier from {CLASSIFIER_PATH} ...")
157
- classifier = joblib.load(CLASSIFIER_PATH)
158
- print("[INFO] Stage-1 classifier loaded.")
159
 
160
- print(f"[INFO] Loading MLP_letters from {MLP_LETTERS_PATH} ...")
161
- mlp_letters = joblib.load(MLP_LETTERS_PATH)
162
- print("[INFO] MLP_letters loaded.")
 
 
 
163
 
164
- print(f"[INFO] Loading MLP_numbers from {MLP_NUMBERS_PATH} ...")
165
- mlp_numbers = joblib.load(MLP_NUMBERS_PATH)
166
- print("[INFO] MLP_numbers loaded.")
167
 
168
- if MODE == "lightweight":
169
- print("[INFO] Loading MediaPipe Hands (lightweight mode)...")
170
- mp_hands_model = _load_mediapipe()
171
- print("βœ… MediaPipe loaded.")
172
- else:
173
- from ultralytics import YOLO
174
- print(f"[INFO] Loading YOLO detector from {DETECTOR_PATH} ...")
175
- yolo_detector = YOLO(DETECTOR_PATH)
176
- print("[INFO] YOLO loaded.")
177
 
178
- if MODE == "full":
179
- print(f"[INFO] Loading WiLoR FP32 on {DEVICE}...")
180
- wilor_model = _load_wilor_full()
181
- else:
182
- wilor_model = _load_wilor_quantized()
183
 
184
- print("βœ… All models loaded successfully!")
 
185
 
 
 
186
 
187
- # ─────────────────────────────────────────────────────────────────────────────
188
- # Feature extraction
189
- # ─────────────────────────────────────────────────────────────────────────────
 
190
 
191
- def _build_features_from_joints(joints: np.ndarray, theta: np.ndarray) -> np.ndarray:
192
- tips = [4, 8, 12, 16, 20]
193
  hand_scale = distance.euclidean(joints[0], joints[9]) + 1e-8
 
194
  dist_feats = []
195
  for i in range(1, 5):
196
  dist_feats.append(distance.euclidean(joints[tips[0]], joints[tips[i]]) / hand_scale)
197
  for i in range(1, 4):
198
  dist_feats.append(distance.euclidean(joints[tips[i]], joints[tips[i + 1]]) / hand_scale)
 
199
  return np.concatenate([theta, dist_feats])
200
 
201
 
202
- def _wilor_run(crop_rgb: np.ndarray) -> dict:
203
  img_input = cv2.resize(crop_rgb, (256, 256))
204
- img_tensor = WILOR_TRANSFORM(img_input).unsqueeze(0)
205
- if MODE == "quantized":
206
- img_tensor = img_tensor.half().to("cpu")
207
- else:
208
- img_tensor = img_tensor.to(DEVICE)
209
  with torch.no_grad():
210
  output = wilor_model({"img": img_tensor})
211
- return output
212
-
213
-
214
- def extract_features_wilor(crop_rgb: np.ndarray) -> np.ndarray | None:
215
- output = _wilor_run(crop_rgb)
216
- if "pred_mano_params" not in output or "pred_keypoints_3d" not in output:
217
- return None
218
- mano = output["pred_mano_params"]
219
- hand_pose = mano["hand_pose"][0].cpu().float().numpy().flatten()
220
- global_orient = mano["global_orient"][0].cpu().float().numpy().flatten()
221
- theta = np.concatenate([global_orient, hand_pose])
222
- joints = output["pred_keypoints_3d"][0].cpu().float().numpy()
223
- return _build_features_from_joints(joints, theta)
224
-
225
-
226
- def get_3d_joints_wilor(crop_rgb: np.ndarray) -> np.ndarray:
227
- output = _wilor_run(crop_rgb)
228
- return output["pred_keypoints_3d"][0].cpu().float().numpy()
229
-
230
 
231
- def extract_features_mediapipe(img_rgb: np.ndarray):
232
- result = mp_hands_model.process(img_rgb)
233
- if not result.multi_hand_landmarks:
234
- return None, None, None, None
235
- h, w = img_rgb.shape[:2]
236
- hand_landmarks = result.multi_hand_landmarks[0]
237
- handedness = result.multi_handedness[0].classification[0].label.lower()
238
- joints = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark], dtype=np.float32)
239
- xs, ys = (joints[:, 0] * w).astype(int), (joints[:, 1] * h).astype(int)
240
- pad = 20
241
- x1, y1 = max(0, int(xs.min()) - pad), max(0, int(ys.min()) - pad)
242
- x2, y2 = min(w, int(xs.max()) + pad), min(h, int(ys.max()) + pad)
243
- theta = np.zeros(48, dtype=np.float32)
244
- features = _build_features_from_joints(joints, theta)
245
- return features, joints, handedness, [x1, y1, x2, y2]
246
 
 
 
 
 
 
 
247
 
248
- # ─────────────────────────────────────────────────────────────────────────────
249
- # Two-stage inference
250
- # ─────────────────────────────────────────────────────────────────────────────
251
 
252
  def _align_features(model, features: np.ndarray) -> pd.DataFrame:
253
  expected_cols = model.feature_names_in_
254
- vec = np.zeros(len(expected_cols), dtype=np.float64)
255
  limit = min(len(features), len(vec))
256
  vec[:limit] = features[:limit]
257
  return pd.DataFrame([vec], columns=expected_cols)
258
 
259
 
260
- def run_stage1(features: np.ndarray) -> tuple[str, float]:
 
261
  feat_df = _align_features(classifier, features)
262
  category = str(classifier.predict(feat_df)[0])
263
- proba = classifier.predict_proba(feat_df)[0]
264
- return category, float(proba.max())
265
-
266
 
267
- def run_stage2(category: str, features: np.ndarray) -> tuple[str, float]:
268
  cat = category.lower().strip()
269
  if cat in ("letter", "letters", "حرف", "حروف"):
270
  model = mlp_letters
271
  elif cat in ("number", "numbers", "digit", "digits", "Ψ±Ω‚Ω…", "Ψ£Ψ±Ω‚Ψ§Ω…", "Ψ§Ψ±Ω‚Ψ§Ω…"):
272
  model = mlp_numbers
273
  else:
274
- feat_df_l = _align_features(mlp_letters, features)
275
- feat_df_n = _align_features(mlp_numbers, features)
276
- proba_l = float(mlp_letters.predict_proba(feat_df_l)[0].max())
277
- proba_n = float(mlp_numbers.predict_proba(feat_df_n)[0].max())
278
- if proba_l >= proba_n:
279
- return str(mlp_letters.predict(feat_df_l)[0]), proba_l
280
- else:
281
- return str(mlp_numbers.predict(feat_df_n)[0]), proba_n
282
 
283
  feat_df = _align_features(model, features)
284
  label = str(model.predict(feat_df)[0])
285
- proba = model.predict_proba(feat_df)[0]
286
- return label, float(proba.max())
287
 
288
-
289
- def full_pipeline(features: np.ndarray) -> dict:
290
- category, stage1_conf = run_stage1(features)
291
- label, stage2_conf = run_stage2(category, features)
292
  return {
293
  "sign": label,
294
- "sign_confidence": round(stage2_conf, 4),
295
  "category": category,
296
- "category_confidence": round(stage1_conf, 4),
297
  }
298
 
299
 
300
  # ─────────────────────────────────────────────────────────────────────────────
301
- # Utilities
302
  # ─────────────────────────────────────────────────────────────────────────────
303
 
304
- def read_image_from_upload(file_bytes: bytes) -> np.ndarray:
305
- arr = np.frombuffer(file_bytes, np.uint8)
306
- img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
307
- if img is None:
308
- raise HTTPException(status_code=400, detail="Cannot decode image.")
309
- return img
310
 
311
 
312
- def _yolo_detect(img_rgb: np.ndarray):
 
 
 
 
 
313
  results = yolo_detector.predict(img_rgb, conf=0.5, verbose=False, device=DEVICE)
314
  if not results[0].boxes:
315
  raise HTTPException(status_code=422, detail="No hand detected.")
 
316
  box = results[0].boxes.xyxy[0].cpu().numpy().astype(int)
317
  label_id = int(results[0].boxes.cls[0].cpu().item())
318
- side = "left" if label_id == 0 else "right"
319
- h, w = img_rgb.shape[:2]
 
320
  x1, y1, x2, y2 = max(0, box[0]), max(0, box[1]), min(w, box[2]), min(h, box[3])
321
  crop = img_rgb[y1:y2, x1:x2]
322
- if crop.size == 0:
323
- raise HTTPException(status_code=422, detail="Empty crop after bounding box clamp.")
324
- return [x1, y1, x2, y2], side, crop
325
-
326
 
327
- # ─────────────────────────────────────────────────────────────────────────────
328
- # FastAPI app
329
- # ─────────────────────────────────────────────────────────────────────────────
330
-
331
- app_ready = False
332
-
333
- @asynccontextmanager
334
- async def lifespan(app: FastAPI):
335
- global app_ready
336
- load_models()
337
- app_ready = True
338
- yield
339
-
340
- app = FastAPI(
341
- title="Arabic Sign Language Interpreter",
342
- description="Two-stage pipeline: Stage-1 classifies letter vs number, Stage-2 identifies the specific sign.",
343
- version="2.0.0",
344
- lifespan=lifespan,
345
- )
346
-
347
- app.add_middleware(
348
- CORSMiddleware,
349
- allow_origins=["*"],
350
- allow_methods=["*"],
351
- allow_headers=["*"],
352
- )
353
 
 
 
 
354
 
355
- @app.get("/")
356
- def root():
357
- if not app_ready:
358
- return JSONResponse(
359
- status_code=503,
360
- content={"status": "loading", "device": DEVICE, "mode": MODE}
361
- )
362
- return {"status": "running", "device": DEVICE, "mode": MODE, "version": "2.0.0"}
363
 
364
 
365
- @app.post("/predict")
366
- async def predict(file: UploadFile = File(...)):
367
  raw = await file.read()
368
  img_bgr = read_image_from_upload(raw)
369
  img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
370
 
371
- if MODE == "lightweight":
372
- features, _, hand_side, bbox = extract_features_mediapipe(img_rgb)
373
- if features is None:
374
- raise HTTPException(status_code=422, detail="No hand detected.")
375
- else:
376
- bbox, hand_side, crop = _yolo_detect(img_rgb)
377
- features = extract_features_wilor(crop)
378
- if features is None:
379
- raise HTTPException(status_code=500, detail="WiLoR feature extraction failed.")
380
-
381
- result = full_pipeline(features)
382
- return JSONResponse({**result, "hand_side": hand_side, "bbox": bbox, "mode": MODE})
383
 
 
 
 
384
 
385
- @app.post("/predict_with_skeleton")
386
- async def predict_with_skeleton(file: UploadFile = File(...)):
387
- raw = await file.read()
388
- img_bgr = read_image_from_upload(raw)
389
- img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
390
 
391
- if MODE == "lightweight":
392
- features, joints, hand_side, bbox = extract_features_mediapipe(img_rgb)
393
- if features is None:
394
- raise HTTPException(status_code=422, detail="No hand detected.")
395
- x1, y1, x2, y2 = bbox
396
- crop = img_rgb[y1:y2, x1:x2]
397
- else:
398
- bbox, hand_side, crop = _yolo_detect(img_rgb)
399
- features = extract_features_wilor(crop)
400
- if features is None:
401
- raise HTTPException(status_code=500, detail="WiLoR feature extraction failed.")
402
- joints = get_3d_joints_wilor(crop)
403
 
404
- result = full_pipeline(features)
405
  _, buf = cv2.imencode(".png", cv2.cvtColor(crop, cv2.COLOR_RGB2BGR))
406
  crop_b64 = base64.b64encode(buf).decode("utf-8")
407
 
408
  return JSONResponse({
409
  **result,
410
  "hand_side": hand_side,
411
- "bbox": bbox,
412
  "joints_3d": joints.tolist(),
413
  "crop_b64": crop_b64,
414
- "mode": MODE,
415
  })
416
 
417
 
418
- @app.get("/info")
419
- def info():
420
- import psutil
421
- proc_mb = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
422
-
423
- def _feat_len(model):
424
- return len(model.feature_names_in_) if model and hasattr(model, "feature_names_in_") else None
425
-
426
- return {
427
- "mode": MODE,
428
- "device": DEVICE,
429
- "process_ram_mb": round(proc_mb, 1),
430
- "classifier_features": _feat_len(classifier),
431
- "mlp_letters_features": _feat_len(mlp_letters),
432
- "mlp_numbers_features": _feat_len(mlp_numbers),
433
- "models_loaded": {
434
- "stage1_classifier": classifier is not None,
435
- "mlp_letters": mlp_letters is not None,
436
- "mlp_numbers": mlp_numbers is not None,
437
- "wilor": wilor_model is not None,
438
- "yolo": yolo_detector is not None,
439
- "mediapipe": mp_hands_model is not None,
440
- },
441
- }
442
-
443
-
444
  if __name__ == "__main__":
445
- uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=False)
 
1
  """
2
+ Arabic Sign Language Interpreter - FastAPI Server
3
  """
4
 
5
  import io
 
28
  import uvicorn
29
  from huggingface_hub import hf_hub_download
30
 
31
+ # --- Compatibility Patches ---
 
 
 
 
 
 
 
32
  if not hasattr(inspect, "getargspec"):
33
  inspect.getargspec = inspect.getfullargspec
34
 
 
37
  if not hasattr(np, attr):
38
  setattr(np, attr, typ)
39
 
40
+ # --- Pyrender / OpenGL Mock (Headless) ---
41
  pyrender_mock = types.ModuleType("pyrender")
42
  for _attr in ["Scene", "Mesh", "Node", "PerspectiveCamera", "DirectionalLight",
43
  "PointLight", "SpotLight", "OffscreenRenderer", "RenderFlags",
 
52
 
53
  os.environ["PYOPENGL_PLATFORM"] = "osmesa"
54
 
55
+ # --- Hugging Face Model Integration ---
56
  REPO_ID = "SondosM/api_GP"
57
 
58
  def get_hf_file(filename, is_mano=False):
 
69
 
70
  return temp_path
71
 
72
+ # --- Download required files ---
73
  print("Initializing model file paths...")
74
 
 
75
  get_hf_file("mano_data/mano_data/mano_mean_params.npz", is_mano=True)
76
  get_hf_file("mano_data/mano_data/MANO_LEFT.pkl", is_mano=True)
77
  get_hf_file("mano_data/mano_data/MANO_RIGHT.pkl", is_mano=True)
78
 
79
+ WILOR_REPO_PATH = "./WiLoR"
80
+ WILOR_CKPT = get_hf_file("pretrained_models/pretrained_models/wilor_final.ckpt")
81
+ WILOR_CFG = get_hf_file("pretrained_models/pretrained_models/model_config.yaml")
82
+ DETECTOR_PATH = get_hf_file("pretrained_models/pretrained_models/detector.pt")
 
83
 
84
+ # ─── Key difference: the first version loaded classifier.pkl from a fixed local path
85
+ # instead of downloading it from HF like the rest of the files ───
86
+ CLASSIFIER_PATH = get_hf_file("classifier.pkl")
87
+ MLP_LETTERS_PATH = get_hf_file("MLP_letters.pkl")
88
+ MLP_NUMBERS_PATH = get_hf_file("MLP_numbers.pkl")
89
 
90
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
91
 
 
94
  transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
95
  ])
96
 
97
# Model handles. All of them stay None until load_models() has run.
wilor_model = None    # WiLoR network returned by load_wilor()
yolo_detector = None  # ultralytics YOLO detector
classifier = None     # stage-1 classifier (joblib artifact)
mlp_letters = None    # stage-2 model used for the "letter" branch
mlp_numbers = None    # stage-2 model used for the "number" branch


def load_models():
    """Load WiLoR, the YOLO detector and the three classifiers into module globals.

    Populates ``wilor_model``, ``yolo_detector``, ``classifier``,
    ``mlp_letters`` and ``mlp_numbers``. The WiLoR model is moved to
    ``DEVICE`` and switched to eval mode. Progress is reported with ``print``.
    """
    global wilor_model, yolo_detector, classifier, mlp_letters, mlp_numbers

    # Make the WiLoR checkout importable without stacking duplicate entries on
    # sys.path if this function is ever called more than once.
    if WILOR_REPO_PATH not in sys.path:
        sys.path.insert(0, WILOR_REPO_PATH)
    # Imported lazily: `wilor` only resolves once the path above is in place.
    from wilor.models import load_wilor
    from ultralytics import YOLO

    print(f"Loading WiLoR on {DEVICE}...")
    wilor_model, _ = load_wilor(checkpoint_path=WILOR_CKPT, cfg_path=WILOR_CFG)
    wilor_model.to(DEVICE)
    wilor_model.eval()

    print("Loading YOLO detector...")
    yolo_detector = YOLO(DETECTOR_PATH)

    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. These artifacts are downloaded at startup, so the source
    # repository must be trusted (consider pinning a revision).
    print("Loading classifiers...")
    classifier = joblib.load(CLASSIFIER_PATH)
    mlp_letters = joblib.load(MLP_LETTERS_PATH)
    mlp_numbers = joblib.load(MLP_NUMBERS_PATH)

    print("✅ All models loaded successfully!")
125
 
 
 
 
 
 
 
 
 
126
 
127
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan hook: load every model, then hand control back to FastAPI."""
    load_models()
    # Nothing is released after the yield; there is no shutdown work here.
    yield


# Application object; model loading is wired in through the lifespan hook.
app = FastAPI(
    title="Arabic Sign Language Interpreter",
    lifespan=lifespan,
)

# CORS is left fully open: every origin, method and header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)
141
 
 
 
 
142
 
143
+ # ─────────────────────────────────────────────────────────────────────────────
144
+ # Feature extraction
145
+ # ─────────────────────────────────────────────────────────────────────────────
 
 
 
 
 
 
146
 
147
def _run_wilor(crop_rgb: np.ndarray) -> dict:
    """Resize a hand crop to 256x256, apply WILOR_TRANSFORM and run WiLoR on it."""
    img_input = cv2.resize(crop_rgb, (256, 256))
    # unsqueeze(0) adds the batch dimension the model call expects.
    img_tensor = WILOR_TRANSFORM(img_input).unsqueeze(0).to(DEVICE)
    with torch.no_grad():
        return wilor_model({"img": img_tensor})


def extract_features(crop_rgb: np.ndarray) -> np.ndarray | None:
    """Build the classifier feature vector for one hand crop.

    The vector is the flattened MANO ``global_orient`` followed by the
    flattened ``hand_pose``, followed by seven joint distances divided by a
    hand-scale term.

    Args:
        crop_rgb: Hand crop as an image array (callers pass an RGB crop).

    Returns:
        A 1-D feature array, or ``None`` when the model output lacks
        ``pred_mano_params`` or ``pred_keypoints_3d``.
    """
    output = _run_wilor(crop_rgb)
    if "pred_mano_params" not in output or "pred_keypoints_3d" not in output:
        return None

    mano = output["pred_mano_params"]
    hand_pose = mano["hand_pose"][0].cpu().numpy().flatten()
    global_orient = mano["global_orient"][0].cpu().numpy().flatten()
    theta = np.concatenate([global_orient, hand_pose])

    joints = output["pred_keypoints_3d"][0].cpu().numpy()
    # Presumably the five fingertip joints, thumb first — TODO confirm against
    # the WiLoR keypoint ordering.
    tips = [4, 8, 12, 16, 20]
    # Normaliser: distance between joints 0 and 9; the epsilon avoids a
    # division by zero for degenerate predictions.
    hand_scale = distance.euclidean(joints[0], joints[9]) + 1e-8

    # Four distances from the first tip to each of the other tips ...
    dist_feats = [
        distance.euclidean(joints[tips[0]], joints[tip]) / hand_scale
        for tip in tips[1:]
    ]
    # ... followed by three distances between consecutive remaining tips.
    dist_feats += [
        distance.euclidean(joints[a], joints[b]) / hand_scale
        for a, b in zip(tips[1:], tips[2:])
    ]

    return np.concatenate([theta, dist_feats])


def get_3d_joints(crop_rgb: np.ndarray) -> np.ndarray:
    """Return the ``pred_keypoints_3d`` array WiLoR predicts for one hand crop.

    Raises:
        KeyError: If the model output has no ``pred_keypoints_3d`` entry.
    """
    output = _run_wilor(crop_rgb)
    return output["pred_keypoints_3d"][0].cpu().numpy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
def read_image_from_upload(file_bytes: bytes) -> np.ndarray:
    """Decode uploaded image bytes into an OpenCV colour image.

    Args:
        file_bytes: Raw bytes of the uploaded file.

    Returns:
        The image decoded with ``cv2.IMREAD_COLOR``.

    Raises:
        HTTPException: 400 when the upload is empty or cannot be decoded.
    """
    # cv2.imdecode raises on an empty buffer instead of returning None, which
    # would surface as a 500; reject empty uploads as a client error up front.
    if not file_bytes:
        raise HTTPException(status_code=400, detail="Invalid image format.")

    arr = np.frombuffer(file_bytes, np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        raise HTTPException(status_code=400, detail="Invalid image format.")
    return img
189
 
 
 
 
190
 
191
  def _align_features(model, features: np.ndarray) -> pd.DataFrame:
192
  expected_cols = model.feature_names_in_
193
+ vec = np.zeros(len(expected_cols))
194
  limit = min(len(features), len(vec))
195
  vec[:limit] = features[:limit]
196
  return pd.DataFrame([vec], columns=expected_cols)
197
 
198
 
199
def run_two_stage(features: np.ndarray) -> dict:
    """Classify a feature vector with the two-stage pipeline.

    Stage 1 asks ``classifier`` for a category. Stage 2 routes to
    ``mlp_letters`` or ``mlp_numbers`` based on that category. When the
    category is not a recognised letter/number label, both stage-2 models are
    evaluated and the one with the higher top probability is used (letters win
    ties).

    Args:
        features: Feature vector; it is aligned to each model's expected
            columns before use.

    Returns:
        Dict with ``sign``, ``sign_confidence``, ``category`` and
        ``category_confidence``; confidences are rounded to 4 decimals.
    """
    # Stage 1: letter or number?
    feat_df = _align_features(classifier, features)
    category = str(classifier.predict(feat_df)[0])
    cat_conf = float(classifier.predict_proba(feat_df)[0].max())

    # Stage 2: which sign exactly?
    # The Arabic labels must be real Arabic text; mis-encoded literals can
    # never match a category produced by the classifier.
    cat = category.lower().strip()
    if cat in ("letter", "letters", "حرف", "حروف"):
        model = mlp_letters
    elif cat in ("number", "numbers", "digit", "digits", "رقم", "أرقام", "ارقام"):
        model = mlp_numbers
    else:
        model = None

    if model is not None:
        feat_df = _align_features(model, features)
        conf = float(model.predict_proba(feat_df)[0].max())
    else:
        # Fallback: pick whichever stage-2 model is more confident, reusing
        # the frame and probability already computed for it.
        feat_l = _align_features(mlp_letters, features)
        feat_n = _align_features(mlp_numbers, features)
        prob_l = float(mlp_letters.predict_proba(feat_l)[0].max())
        prob_n = float(mlp_numbers.predict_proba(feat_n)[0].max())
        if prob_l >= prob_n:
            model, feat_df, conf = mlp_letters, feat_l, prob_l
        else:
            model, feat_df, conf = mlp_numbers, feat_n, prob_n

    label = str(model.predict(feat_df)[0])

    return {
        "sign": label,
        "sign_confidence": round(conf, 4),
        "category": category,
        "category_confidence": round(cat_conf, 4),
    }
229
 
230
 
231
  # ─────────────────────────────────────────────────────────────────────────────
232
+ # Routes
233
  # ─────────────────────────────────────────────────────────────────────────────
234
 
235
@app.get("/")
def root():
    """Report that the service is running and which torch device it uses."""
    payload = {"status": "running"}
    payload["device"] = DEVICE
    return payload
 
 
 
238
 
239
 
240
@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    """Detect a hand in the uploaded image and classify its sign.

    Returns a JSON response holding the two-stage result plus ``hand_side``
    and the clamped ``bbox`` as ``[x1, y1, x2, y2]``.

    Raises:
        HTTPException: 422 when no hand is detected or the clamped crop is
            empty; 500 when feature extraction yields nothing. Decoding
            errors propagate from ``read_image_from_upload``.
    """
    payload = await file.read()
    rgb = cv2.cvtColor(read_image_from_upload(payload), cv2.COLOR_BGR2RGB)

    detection = yolo_detector.predict(rgb, conf=0.5, verbose=False, device=DEVICE)[0]
    if not detection.boxes:
        raise HTTPException(status_code=422, detail="No hand detected.")

    # Only the first returned box is used.
    box = detection.boxes.xyxy[0].cpu().numpy().astype(int)
    label_id = int(detection.boxes.cls[0].cpu().item())
    # Class id 0 is reported as the left hand, anything else as the right.
    hand_side = "right" if label_id != 0 else "left"

    # Clamp the box to the image bounds before slicing.
    height, width = rgb.shape[:2]
    x1 = max(0, box[0])
    y1 = max(0, box[1])
    x2 = min(width, box[2])
    y2 = min(height, box[3])
    crop = rgb[y1:y2, x1:x2]
    if crop.size == 0:
        raise HTTPException(status_code=422, detail="Empty hand crop.")

    features = extract_features(crop)
    if features is None:
        raise HTTPException(status_code=500, detail="Feature extraction failed.")

    body = dict(run_two_stage(features))
    body["hand_side"] = hand_side
    # Cast to built-in int so the values are JSON-serialisable.
    body["bbox"] = [int(x1), int(y1), int(x2), int(y2)]
    return JSONResponse(body)
 
 
 
 
 
 
267
 
268
 
269
@app.post("/predict_with_skeleton")
async def predict_with_skeleton(file: UploadFile = File(...)):
    """Classify the sign and also return the 3-D joints and the hand crop.

    Follows the same detection and classification steps as ``/predict``, and
    adds ``joints_3d`` (nested lists) and ``crop_b64`` (base64-encoded PNG of
    the crop) to the response.

    Raises:
        HTTPException: 422 when no hand is detected or the clamped crop is
            empty; 500 when feature extraction yields nothing or the crop
            cannot be PNG-encoded. Decoding errors propagate from
            ``read_image_from_upload``.
    """
    raw = await file.read()
    img_bgr = read_image_from_upload(raw)
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

    results = yolo_detector.predict(img_rgb, conf=0.5, verbose=False, device=DEVICE)
    if not results[0].boxes:
        raise HTTPException(status_code=422, detail="No hand detected.")

    # Only the first returned box is used.
    box = results[0].boxes.xyxy[0].cpu().numpy().astype(int)
    label_id = int(results[0].boxes.cls[0].cpu().item())
    # Class id 0 is reported as the left hand, anything else as the right.
    hand_side = "left" if label_id == 0 else "right"

    # Clamp the box to the image bounds before slicing.
    h, w = img_rgb.shape[:2]
    x1, y1, x2, y2 = max(0, box[0]), max(0, box[1]), min(w, box[2]), min(h, box[3])
    crop = img_rgb[y1:y2, x1:x2]

    # Same guards as /predict: without them an empty crop or a missing model
    # output crashes further down instead of producing a clear HTTP error.
    if crop.size == 0:
        raise HTTPException(status_code=422, detail="Empty hand crop.")

    features = extract_features(crop)
    if features is None:
        raise HTTPException(status_code=500, detail="Feature extraction failed.")
    joints = get_3d_joints(crop)

    result = run_two_stage(features)

    # imencode reports failure through its first return value.
    ok, buf = cv2.imencode(".png", cv2.cvtColor(crop, cv2.COLOR_RGB2BGR))
    if not ok:
        raise HTTPException(status_code=500, detail="Failed to encode hand crop.")
    crop_b64 = base64.b64encode(buf).decode("utf-8")

    return JSONResponse({
        **result,
        "hand_side": hand_side,
        # Cast to built-in int so the values are JSON-serialisable.
        "bbox": [int(x1), int(y1), int(x2), int(y2)],
        "joints_3d": joints.tolist(),
        "crop_b64": crop_b64,
    })
301
 
302
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
# Script entry point: serve the app with uvicorn on all interfaces, port 7860.
if __name__ == "__main__":
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=7860,
        reload=False,
    )