SondosM committed on
Commit
1bfd4ce
·
verified ·
1 Parent(s): 593648b

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -515
app.py DELETED
@@ -1,515 +0,0 @@
1
- """
2
- Arabic Sign Language Interpreter - FastAPI Server (Optimized)
3
-
4
- Pipeline:
5
- Image Input
6
- ──► YOLO Detection (hand crop)
7
- ──► WiLoR 3D Pose Estimation
8
- ──► Stage-1 Classifier → "letter" or "number"?
9
- ──► Stage-2 MLP → specific sign label + confidence
10
-
11
- Modes (set MODE env var):
12
- full : YOLO + WiLoR FP32 + MLP (~1.1–2.5 GB)
13
- quantized : YOLO + WiLoR INT8 + MLP (~600 MB–1.2 GB)
14
- lightweight : MediaPipe + MLP (~50 MB)
15
- """
16
-
17
- import io
18
- import base64
19
- import inspect
20
- import sys
21
- import os
22
- import types
23
- from unittest.mock import MagicMock
24
-
25
- import numpy as np
26
- import cv2
27
- import torch
28
- import joblib
29
- import pandas as pd
30
- from pathlib import Path
31
- from scipy.spatial import distance
32
- from torchvision import transforms
33
- from PIL import Image
34
- from contextlib import asynccontextmanager
35
-
36
- from fastapi import FastAPI, File, UploadFile, HTTPException
37
- from fastapi.middleware.cors import CORSMiddleware
38
- from fastapi.responses import JSONResponse
39
- import uvicorn
40
-
41
- # ─── Runtime mode ──────────────────────────────────────────────────────────────
42
# Pipeline variant, read from the MODE environment variable (case-insensitive).
# Defaults to "full" when the variable is not set.
MODE = os.environ.get("MODE", "full").lower()

# Validate with an explicit exception instead of `assert`: assertions are
# removed when Python runs with -O, which would let an invalid mode through
# and fail much later in a less obvious place.
if MODE not in ("full", "quantized", "lightweight"):
    raise ValueError(
        f"Unknown MODE={MODE!r}. Choose full | quantized | lightweight."
    )

print(f"[INFO] Running in MODE={MODE!r}")
47
-
48
- # ─── Compatibility patches ─────────────────────────────────────────────────────
49
# If this interpreter's `inspect` has no `getargspec`, alias it to
# `getfullargspec` so code that still calls the old name keeps importing.
if not hasattr(inspect, "getargspec"):
    inspect.getargspec = inspect.getfullargspec

# Scalar aliases that may be missing from the installed NumPy; each one is
# only added when NumPy does not already expose it.
_NUMPY_ALIASES = {
    "int": int,
    "float": float,
    "complex": complex,
    "bool": bool,
    "object": object,
    "str": str,
    "unicode": str,
}
for attr, typ in _NUMPY_ALIASES.items():
    if not hasattr(np, attr):
        setattr(np, attr, typ)
56
-
57
- # ─── Pyrender / OpenGL mock (headless) ────────────────────────────────────────
58
# Register a stand-in "pyrender" module so that `import pyrender` succeeds
# without the real package; every listed attribute is the MagicMock class.
pyrender_mock = types.ModuleType("pyrender")
_PYRENDER_ATTRS = (
    "Scene", "Mesh", "Node", "PerspectiveCamera", "DirectionalLight",
    "PointLight", "SpotLight", "OffscreenRenderer", "RenderFlags",
    "Viewer", "MetallicRoughnessMaterial",
)
for _attr in _PYRENDER_ATTRS:
    setattr(pyrender_mock, _attr, MagicMock)
sys.modules["pyrender"] = pyrender_mock

# Empty placeholder modules for the OpenGL package tree; a module that is
# already registered is left untouched.
_OPENGL_MODULES = (
    "OpenGL", "OpenGL.GL", "OpenGL.GL.framebufferobjects",
    "OpenGL.platform", "OpenGL.error",
)
for _mod in _OPENGL_MODULES:
    sys.modules.setdefault(_mod, types.ModuleType(_mod))

os.environ["PYOPENGL_PLATFORM"] = "osmesa"
71
-
72
- # ─── Configuration ─────────────────────────────────────────────────────────────
73
# WiLoR checkout and its pretrained artefacts (paths relative to the CWD).
WILOR_REPO_PATH = "./WiLoR"
WILOR_CKPT = "./pretrained_models/wilor_final.ckpt"
WILOR_CFG = "./pretrained_models/model_config.yaml"

# Stage-1: binary classifier deciding "letter" vs "number".
CLASSIFIER_PATH = "./classifier.pkl"

# Stage-2: one MLP per category.
MLP_LETTERS_PATH = "./MLP_letters.pkl"
MLP_NUMBERS_PATH = "./MLP_numbers.pkl"

# Use the GPU whenever torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Per-channel normalisation statistics applied to the WiLoR input crop.
_WILOR_NORM_MEAN = [0.485, 0.456, 0.406]
_WILOR_NORM_STD = [0.229, 0.224, 0.225]

WILOR_TRANSFORM = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=_WILOR_NORM_MEAN, std=_WILOR_NORM_STD),
])
90
-
91
def _resolve_detector_path() -> str:
    """Return the first detector location that exists on disk.

    The candidates are probed in a fixed priority order, relative to the
    current working directory.

    Raises:
        FileNotFoundError: when none of the candidate paths exists.
    """
    search_order = (
        "./detector",
        "./pretrained_models/detector.pt",
        "./pretrained_models/detector",
    )
    found = next((path for path in search_order if Path(path).exists()), None)
    if found is None:
        raise FileNotFoundError("Detector not found in any candidate path!")
    return found
97
-
98
- # ─── Global model handles ──────────────────────────────────────────────────────
99
# Module-level model handles. All start as None and are filled in by
# load_models(); which ones get populated depends on MODE.
wilor_model = None      # WiLoR pose network (full / quantized modes)
yolo_detector = None    # YOLO hand detector (full / quantized modes)
mp_hands_model = None   # MediaPipe Hands (lightweight mode only)
classifier = None       # Stage-1: letter vs number
mlp_letters = None      # Stage-2a: letter MLP
mlp_numbers = None      # Stage-2b: number MLP
105
-
106
-
107
- # ─────────────────────────────────────────────────────────────────────────────
108
- # Model loading
109
- # ─────────────────────────────────────────────────────────────────────────────
110
-
111
def _load_wilor_full():
    """Load the WiLoR model, move it to DEVICE and switch it to eval mode.

    Returns:
        The model object produced by ``wilor.models.load_wilor``.
    """
    # The WiLoR checkout must be on sys.path before its package is imported.
    sys.path.insert(0, WILOR_REPO_PATH)
    from wilor.models import load_wilor

    # load_wilor returns a pair; only the first element (the model) is used.
    net, _unused = load_wilor(checkpoint_path=WILOR_CKPT, cfg_path=WILOR_CFG)
    net.to(DEVICE).eval()
    return net
117
-
118
-
119
def _load_wilor_quantized():
    """Load WiLoR, cast it to half precision and dynamically quantize it.

    All ``torch.nn.Linear`` layers are dynamically quantized to ``qint8``
    and the resulting model is placed on the CPU.

    Returns:
        The quantized model.
    """
    # The WiLoR checkout must be on sys.path before its package is imported.
    sys.path.insert(0, WILOR_REPO_PATH)
    from wilor.models import load_wilor
    import torch.quantization

    print("[INFO] Loading WiLoR (FP16 + INT8 quantization)...")
    net, _unused = load_wilor(checkpoint_path=WILOR_CKPT, cfg_path=WILOR_CFG)
    net.eval()

    # NOTE(review): the model is converted to FP16 *before* quantize_dynamic
    # runs — confirm dynamic quantization accepts half-precision Linear
    # weights on the deployed torch version.
    net = net.half()
    net = torch.quantization.quantize_dynamic(
        net,
        qconfig_spec={torch.nn.Linear},
        dtype=torch.qint8,
    )
    net.to("cpu")
    return net
133
-
134
-
135
def _load_mediapipe():
    """Create the MediaPipe Hands solution used in lightweight mode.

    The import is local so that mediapipe is only required when this
    loader is actually called.
    """
    import mediapipe as mp

    options = {
        "static_image_mode": True,
        "max_num_hands": 1,
        "min_detection_confidence": 0.5,
        "model_complexity": 1,
    }
    return mp.solutions.hands.Hands(**options)
143
-
144
-
145
def load_models():
    """Populate the module-level model handles according to MODE.

    The three pickled classifiers are always loaded. In lightweight mode
    only MediaPipe is added; otherwise the YOLO detector plus either the
    FP32 or the quantized WiLoR model is loaded.
    """
    global wilor_model, yolo_detector, mp_hands_model
    global classifier, mlp_letters, mlp_numbers

    def _unpickle(loading_name, loaded_name, path):
        # joblib.load unpickles the file: only point these paths at trusted
        # artefacts.
        print(f"[INFO] Loading {loading_name} from {path} ...")
        loaded = joblib.load(path)
        print(f"[INFO] {loaded_name} loaded.")
        return loaded

    # ── Stage-1 classifier (letter vs number) ────────────────────────────────
    classifier = _unpickle("stage-1 classifier", "Stage-1 classifier", CLASSIFIER_PATH)

    # ── Stage-2 MLPs ─────────────────────────────────────────────────────────
    mlp_letters = _unpickle("MLP_letters", "MLP_letters", MLP_LETTERS_PATH)
    mlp_numbers = _unpickle("MLP_numbers", "MLP_numbers", MLP_NUMBERS_PATH)

    # ── Pose estimation backbone ─────────────────────────────────────────────
    if MODE != "lightweight":
        # Resolve the path first so a missing detector fails before the
        # ultralytics import is attempted.
        detector_path = _resolve_detector_path()
        from ultralytics import YOLO
        print(f"[INFO] Loading YOLO detector from {detector_path} ...")
        yolo_detector = YOLO(detector_path)
        print("[INFO] YOLO loaded.")

        if MODE != "full":
            wilor_model = _load_wilor_quantized()
        else:
            print(f"[INFO] Loading WiLoR FP32 on {DEVICE}...")
            wilor_model = _load_wilor_full()
    else:
        print("[INFO] Loading MediaPipe Hands (lightweight mode)...")
        mp_hands_model = _load_mediapipe()
        print("✅ MediaPipe loaded — no YOLO, no WiLoR needed.")

    print("✅ All models loaded successfully!")
182
-
183
-
184
- # ─────────────────────────────────────────────────────────────────────────────
185
- # Feature extraction
186
- # ─────────────────────────────────────────────────────────────────────────────
187
-
188
def _build_features_from_joints(joints: np.ndarray, theta: np.ndarray) -> np.ndarray:
    """
    Assemble the classifier feature vector: pose parameters followed by
    seven scale-normalised fingertip distances.

    joints : (21, 3) — 3-D joint positions
    theta  : (48,)   — global_orient (3) + hand_pose (45)
                       Pass np.zeros(48) when using MediaPipe.
    Returns (55,) array: 48 pose values + 7 distances.
    """
    tip_ids = (4, 8, 12, 16, 20)

    # Reference length between joints 0 and 9; the epsilon keeps the
    # division below finite for a degenerate hand.
    scale = distance.euclidean(joints[0], joints[9]) + 1e-8

    # First tip against each of the other four, then each adjacent pair
    # among the remaining tips.
    pairs = [(tip_ids[0], other) for other in tip_ids[1:]]
    pairs.extend(zip(tip_ids[1:-1], tip_ids[2:]))

    ratios = [distance.euclidean(joints[a], joints[b]) / scale for a, b in pairs]
    return np.concatenate([theta, ratios])  # 48 + 7 = 55
207
-
208
-
209
- # ── WiLoR path ────────────────────────────────────────────────────────────────
210
-
211
def _wilor_run(crop_rgb: np.ndarray) -> dict:
    """Run the loaded WiLoR model on one hand crop and return its raw output.

    The crop is resized to 256x256, passed through WILOR_TRANSFORM and given
    a batch dimension. In quantized mode the tensor is cast to half precision
    and kept on the CPU; otherwise it is moved to DEVICE.
    """
    resized = cv2.resize(crop_rgb, (256, 256))
    batch = WILOR_TRANSFORM(resized).unsqueeze(0)
    batch = batch.half().to("cpu") if MODE == "quantized" else batch.to(DEVICE)

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        return wilor_model({"img": batch})
223
-
224
-
225
def extract_features_wilor(crop_rgb: np.ndarray) -> np.ndarray | None:
    """Turn a hand crop into the classifier feature vector via WiLoR.

    Returns None when the model output lacks either the MANO parameters or
    the 3-D keypoints.
    """
    output = _wilor_run(crop_rgb)
    for required_key in ("pred_mano_params", "pred_keypoints_3d"):
        if required_key not in output:
            return None

    def _first_as_numpy(tensor):
        # First batch element, on the CPU, as a float32 NumPy array.
        return tensor[0].cpu().float().numpy()

    mano = output["pred_mano_params"]
    hand_pose = _first_as_numpy(mano["hand_pose"]).flatten()          # 45
    global_orient = _first_as_numpy(mano["global_orient"]).flatten()  # 3
    theta = np.concatenate([global_orient, hand_pose])                # 48

    joints = _first_as_numpy(output["pred_keypoints_3d"])
    return _build_features_from_joints(joints, theta)
237
-
238
-
239
def get_3d_joints_wilor(crop_rgb: np.ndarray) -> np.ndarray:
    """Run WiLoR on the crop and return the first sample's 3-D keypoints."""
    keypoints = _wilor_run(crop_rgb)["pred_keypoints_3d"]
    return keypoints[0].cpu().float().numpy()
242
-
243
-
244
- # ── MediaPipe path ────────────────────────────────────────────────────────────
245
-
246
def extract_features_mediapipe(img_rgb: np.ndarray):
    """Detect one hand with MediaPipe and derive features from its landmarks.

    Returns:
        (features, joints, handedness, bbox), where bbox is
        [x1, y1, x2, y2] in pixels, padded by 20 px and clamped to the
        image. All four values are None when no hand is found.
    """
    detection = mp_hands_model.process(img_rgb)
    if not detection.multi_hand_landmarks:
        return None, None, None, None

    height, width = img_rgb.shape[:2]
    landmarks = detection.multi_hand_landmarks[0].landmark
    # NOTE(review): the handedness label is returned exactly as MediaPipe
    # reports it — confirm whether callers expect a mirrored input.
    side = detection.multi_handedness[0].classification[0].label.lower()

    joints = np.array([(p.x, p.y, p.z) for p in landmarks], dtype=np.float32)

    # Landmark x/y are scaled by the image size to get pixel coordinates.
    px = (joints[:, 0] * width).astype(int)
    py = (joints[:, 1] * height).astype(int)
    margin = 20
    bbox = [
        max(0, int(px.min()) - margin),
        max(0, int(py.min()) - margin),
        min(width, int(px.max()) + margin),
        min(height, int(py.max()) + margin),
    ]

    # MediaPipe provides no MANO parameters, so the pose slots stay at zero.
    no_pose = np.zeros(48, dtype=np.float32)
    features = _build_features_from_joints(joints, no_pose)
    return features, joints, side, bbox
269
-
270
-
271
- # ─────────────────────────────────────────────────────────────────────────────
272
- # Two-stage inference helpers
273
- # ─────────────────────────────────────────────────────────────────────────────
274
-
275
def _align_features(model, features: np.ndarray) -> pd.DataFrame:
    """Fit a feature vector to the input width the model expects.

    The vector is copied positionally into a zero-filled row: it is
    truncated when too long and zero-padded when too short. No reordering
    by name takes place.

    Args:
        model: fitted estimator exposing ``feature_names_in_`` or, failing
            that, ``n_features_in_``.
        features: feature values; flattened to 1-D before alignment.

    Returns:
        A one-row DataFrame. Columns carry the model's feature names when it
        has them, otherwise a default integer index.

    Raises:
        AttributeError: if the model exposes neither attribute.
    """
    flat = np.asarray(features, dtype=np.float64).ravel()

    # feature_names_in_ only exists on estimators fitted on data with string
    # column names; fall back to the plain feature count otherwise.
    names = getattr(model, "feature_names_in_", None)
    width = len(names) if names is not None else int(model.n_features_in_)

    row = np.zeros(width, dtype=np.float64)
    limit = min(flat.size, width)
    row[:limit] = flat[:limit]

    if names is None:
        return pd.DataFrame([row])
    return pd.DataFrame([row], columns=names)
282
-
283
-
284
def run_stage1(features: np.ndarray) -> tuple[str, float]:
    """
    Stage-1: ask the binary classifier for the sign category.

    Returns (category, confidence), where category is the classifier's
    predicted label as a string and confidence is its highest class
    probability.
    """
    frame = _align_features(classifier, features)
    predicted = str(classifier.predict(frame)[0])
    probabilities = classifier.predict_proba(frame)[0]
    return predicted, float(probabilities.max())
293
-
294
-
295
def run_stage2(category: str, features: np.ndarray) -> tuple[str, float]:
    """
    Stage-2: pick the MLP matching the category and return (label, confidence).

    The category is matched case-insensitively after stripping whitespace.
    English and Arabic spellings are accepted. An unrecognised category
    runs both MLPs and returns the more confident answer (letters win ties).
    """
    letter_tags = ("letter", "letters", "حرف", "حروف")
    number_tags = ("number", "numbers", "digit", "digits", "رقم", "أرقام", "ارقام")

    key = category.lower().strip()
    is_letter = key in letter_tags
    is_number = not is_letter and key in number_tags

    if is_letter or is_number:
        chosen = mlp_letters if is_letter else mlp_numbers
        frame = _align_features(chosen, features)
        label = str(chosen.predict(frame)[0])
        probabilities = chosen.predict_proba(frame)[0]
        return label, float(probabilities.max())

    # Unknown category: score the input with both models and keep the winner.
    letters_frame = _align_features(mlp_letters, features)
    numbers_frame = _align_features(mlp_numbers, features)
    letters_conf = float(mlp_letters.predict_proba(letters_frame)[0].max())
    numbers_conf = float(mlp_numbers.predict_proba(numbers_frame)[0].max())

    if letters_conf >= numbers_conf:
        return str(mlp_letters.predict(letters_frame)[0]), letters_conf
    return str(mlp_numbers.predict(numbers_frame)[0]), numbers_conf
324
-
325
-
326
def full_pipeline(features: np.ndarray) -> dict:
    """
    Chain stage 1 and stage 2 and package the outcome as a result dict.

    Both confidences are rounded to four decimal places.
    """
    category, category_conf = run_stage1(features)
    sign, sign_conf = run_stage2(category, features)

    return dict(
        sign=sign,
        sign_confidence=round(sign_conf, 4),
        category=category,
        category_confidence=round(category_conf, 4),
    )
339
-
340
-
341
- # ─────────────────────────────────────────────────────────────────────────────
342
- # Utilities
343
- # ─────────────────────────────────────────────────────────────────────────────
344
-
345
def read_image_from_upload(file_bytes: bytes) -> np.ndarray:
    """Decode uploaded bytes into an OpenCV colour image.

    Args:
        file_bytes: raw content of the uploaded file.

    Returns:
        The decoded image as returned by ``cv2.imdecode`` with
        ``cv2.IMREAD_COLOR``.

    Raises:
        HTTPException: 400 when the payload is empty or cannot be decoded.
    """
    # An empty buffer makes cv2.imdecode raise instead of returning None,
    # which would surface as a 500; reject it up front as a client error.
    if not file_bytes:
        raise HTTPException(status_code=400, detail="Cannot decode image.")

    arr = np.frombuffer(file_bytes, np.uint8)
    try:
        img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    except cv2.error:
        img = None

    if img is None:
        raise HTTPException(status_code=400, detail="Cannot decode image.")
    return img
351
-
352
-
353
def _yolo_detect(img_rgb: np.ndarray):
    """Run YOLO and return (box_xyxy, hand_side, crop_rgb) or raise 422.

    Only the first detection is used. The box is clamped to the image and
    returned as a list of plain Python ints so that it can be placed
    directly in a JSON response.

    Raises:
        HTTPException: 422 when no hand is detected or the clamped box is
            empty.
    """
    results = yolo_detector.predict(img_rgb, conf=0.5, verbose=False, device=DEVICE)
    boxes = results[0].boxes
    if not boxes:
        raise HTTPException(status_code=422, detail="No hand detected.")

    # .tolist() converts NumPy integers to Python ints; NumPy integer scalars
    # are not JSON serializable and would make the endpoints fail when the
    # bbox is returned.
    bx1, by1, bx2, by2 = boxes.xyxy[0].cpu().numpy().astype(int).tolist()

    # NOTE(review): class id 0 is treated as "left", anything else as
    # "right" — confirm against the detector's class mapping.
    label_id = int(boxes.cls[0].cpu().item())
    side = "left" if label_id == 0 else "right"

    h, w = img_rgb.shape[:2]
    x1, y1 = max(0, bx1), max(0, by1)
    x2, y2 = min(int(w), bx2), min(int(h), by2)

    crop = img_rgb[y1:y2, x1:x2]
    if crop.size == 0:
        raise HTTPException(status_code=422, detail="Empty crop after bounding box clamp.")

    return [x1, y1, x2, y2], side, crop
372
-
373
-
374
- # ─────────────────────────────────────────────────────────────────────────────
375
- # FastAPI app
376
- # ─────────────────────────────────────────────────────────────────────────────
377
-
378
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: load every model before serving requests.

    Nothing is done after the ``yield``.
    """
    load_models()
    yield


app = FastAPI(
    title="Arabic Sign Language Interpreter",
    description="Two-stage pipeline: Stage-1 classifies letter vs number, Stage-2 identifies the specific sign.",
    version="2.0.0",
    lifespan=lifespan,
)

# NOTE(review): CORS is fully open (any origin, method and header). Confirm
# this is intended before exposing the service beyond a demo.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
396
-
397
-
398
- # ─── Health ───────────────────────────────────────────────────────────────────
399
-
400
# Health check: reports that the service is up, plus the active device,
# pipeline mode and API version.
@app.get("/")
def root():
    return dict(status="running", device=DEVICE, mode=MODE, version="2.0.0")
403
-
404
-
405
- # ─── /predict ─────────────────────────────────────────────────────────────────
406
-
407
@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    """
    Upload a hand image → returns the predicted Arabic sign label.

    Response fields:
    - sign               : predicted letter or digit
    - sign_confidence    : MLP confidence (0–1)
    - category           : "letter" or "number"
    - category_confidence: stage-1 classifier confidence
    - hand_side          : "left" or "right"
    - bbox               : [x1, y1, x2, y2] in pixels
    - mode               : active pipeline mode
    """
    payload = await file.read()
    # OpenCV decodes to BGR; the rest of the pipeline works on RGB.
    img_rgb = cv2.cvtColor(read_image_from_upload(payload), cv2.COLOR_BGR2RGB)

    if MODE != "lightweight":
        # Detector + WiLoR path: crop the hand first, then estimate its pose.
        bbox, hand_side, crop = _yolo_detect(img_rgb)
        features = extract_features_wilor(crop)
        if features is None:
            raise HTTPException(status_code=500, detail="WiLoR feature extraction failed.")
    else:
        # MediaPipe path: detection and landmarks come from a single call.
        features, _, hand_side, bbox = extract_features_mediapipe(img_rgb)
        if features is None:
            raise HTTPException(status_code=422, detail="No hand detected.")

    body = full_pipeline(features)
    body.update(hand_side=hand_side, bbox=bbox, mode=MODE)
    return JSONResponse(body)
443
-
444
-
445
- # ─── /predict_with_skeleton ───────────────────────────────────────────────────
446
-
447
def _wilor_features_and_joints(crop_rgb: np.ndarray):
    """Run WiLoR once and derive both the feature vector and the 3-D joints.

    Returns (features, joints), or (None, None) when the model output lacks
    the MANO parameters or the 3-D keypoints.
    """
    output = _wilor_run(crop_rgb)
    if "pred_mano_params" not in output or "pred_keypoints_3d" not in output:
        return None, None

    mano = output["pred_mano_params"]
    hand_pose = mano["hand_pose"][0].cpu().float().numpy().flatten()          # 45
    global_orient = mano["global_orient"][0].cpu().float().numpy().flatten()  # 3
    theta = np.concatenate([global_orient, hand_pose])                        # 48

    joints = output["pred_keypoints_3d"][0].cpu().float().numpy()
    return _build_features_from_joints(joints, theta), joints


@app.post("/predict_with_skeleton")
async def predict_with_skeleton(file: UploadFile = File(...)):
    """
    Same as /predict but also returns 3-D joint positions and a base64-encoded
    crop of the detected hand region.
    """
    raw = await file.read()
    img_bgr = read_image_from_upload(raw)
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

    if MODE == "lightweight":
        features, joints, hand_side, bbox = extract_features_mediapipe(img_rgb)
        if features is None:
            raise HTTPException(status_code=422, detail="No hand detected.")
        x1, y1, x2, y2 = bbox
        crop = img_rgb[y1:y2, x1:x2]
        # The clamped box can collapse when the landmarks fall outside the
        # frame; report that as a client-side problem, not a crash.
        if crop.size == 0:
            raise HTTPException(status_code=422, detail="Empty crop after bounding box clamp.")
    else:
        bbox, hand_side, crop = _yolo_detect(img_rgb)
        # A single forward pass yields both features and joints; previously
        # the model ran twice on the same crop.
        features, joints = _wilor_features_and_joints(crop)
        if features is None:
            raise HTTPException(status_code=500, detail="WiLoR feature extraction failed.")

    result = full_pipeline(features)

    encoded_ok, buf = cv2.imencode(".png", cv2.cvtColor(crop, cv2.COLOR_RGB2BGR))
    if not encoded_ok:
        raise HTTPException(status_code=500, detail="Failed to encode hand crop.")
    crop_b64 = base64.b64encode(buf).decode("utf-8")

    return JSONResponse({
        **result,
        "hand_side": hand_side,
        # Plain ints: NumPy integer scalars are not JSON serializable.
        "bbox": [int(v) for v in bbox],
        "joints_3d": joints.tolist(),
        "crop_b64": crop_b64,
        "mode": MODE,
    })
483
-
484
-
485
- # ─── /info ────────────────────────────────────────────────────────────────────
486
-
487
@app.get("/info")
def info():
    """Runtime diagnostics: memory, loaded models, feature dimensions."""
    # Imported locally so psutil is only needed when this endpoint is hit.
    import psutil

    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    proc_mb = rss_bytes / 1024 / 1024

    def _feat_len(model):
        # None for an unloaded model or one without feature names.
        if model and hasattr(model, "feature_names_in_"):
            return len(model.feature_names_in_)
        return None

    handles = (
        ("stage1_classifier", classifier),
        ("mlp_letters", mlp_letters),
        ("mlp_numbers", mlp_numbers),
        ("wilor", wilor_model),
        ("yolo", yolo_detector),
        ("mediapipe", mp_hands_model),
    )

    return {
        "mode": MODE,
        "device": DEVICE,
        "process_ram_mb": round(proc_mb, 1),
        "classifier_features": _feat_len(classifier),
        "mlp_letters_features": _feat_len(mlp_letters),
        "mlp_numbers_features": _feat_len(mlp_numbers),
        "models_loaded": {name: handle is not None for name, handle in handles},
    }
512
-
513
-
514
if __name__ == "__main__":
    # Pass the application object itself. The import string "main:app" only
    # resolves when this module is saved as main.py, but this file is app.py,
    # so running it directly would fail to import. An app instance is fine
    # here because reload is off.
    uvicorn.run(app, host="0.0.0.0", port=8000, reload=False)