SondosM committed on
Commit
206b1df
·
verified ·
1 Parent(s): fbfe038

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile (1) +41 -0
  2. app (1).py +427 -0
  3. hf_README.md +71 -0
Dockerfile (1) ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# Send print() output straight to the container log so the model-loading
# progress messages show up while the Space is starting.
ENV PYTHONUNBUFFERED=1

# System dependencies needed by OpenCV, PyTorch, osmesa (headless GL)
RUN apt-get update && apt-get install -y --no-install-recommends \
        git \
        git-lfs \
        libgl1 \
        libglib2.0-0 \
        libsm6 \
        libxext6 \
        libxrender1 \
        libgomp1 \
        libosmesa6 \
        ffmpeg \
        wget \
    && rm -rf /var/lib/apt/lists/*

# NOTE(review): this image runs everything as root. Hugging Face's Docker
# Spaces guide describes running as a non-root user (UID 1000); confirm that
# every directory the app writes to at runtime is writable in the Space.

# Set working directory
WORKDIR /app

# Copy requirements first (layer caching)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy everything else
COPY . .

# Make sure the WiLoR repo is importable. No PYTHONPATH is defined earlier in
# this file, so it is set outright: appending ":${PYTHONPATH}" would expand to
# a trailing empty entry on the search path.
ENV PYTHONPATH=/app/WiLoR

# Headless OpenGL
ENV PYOPENGL_PLATFORM=osmesa

# Default mode — override in HF Space environment variables
ENV MODE=full

# HF Spaces expects the app on port 7860 by default,
# but we set app_port=8000 in README so uvicorn binds here
EXPOSE 8000

# "app:app" means module app (i.e. /app/app.py), attribute app.
# NOTE(review): the server file must be present under the name app.py.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
app (1).py ADDED
@@ -0,0 +1,427 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Arabic Sign Language Interpreter - FastAPI Server (Optimized)
3
+
4
+ Pipeline:
5
+ Image Input
6
+ ──► YOLO Detection (hand crop)
7
+ ──► WiLoR 3D Pose (extract 3D joints + MANO params)
8
+ ──► Stage-1: classifier.pkl → "letter" or "number"?
9
+ ──► Stage-2: MLP_letters.pkl → specific Arabic letter
10
+ OR MLP_numbers.pkl → specific digit
11
+ ──► JSON Response
12
+
13
+ Modes (set MODE env var):
14
+ full : YOLO + WiLoR FP32 + MLP (~1.1–2.5 GB)
15
+ quantized : YOLO + WiLoR FP16 + dynamic INT8 (Linear layers) + MLP (~600 MB–1.2 GB)
16
+ lightweight : MediaPipe + MLP (~50 MB)
17
+ """
18
+
19
+ import io
20
+ import base64
21
+ import inspect
22
+ import sys
23
+ import os
24
+ import types
25
+ from unittest.mock import MagicMock
26
+
27
+ import numpy as np
28
+ import cv2
29
+ import torch
30
+ import joblib
31
+ import pandas as pd
32
+ from pathlib import Path
33
+ from scipy.spatial import distance
34
+ from torchvision import transforms
35
+ from PIL import Image
36
+ from contextlib import asynccontextmanager
37
+
38
+ from fastapi import FastAPI, File, UploadFile, HTTPException
39
+ from fastapi.middleware.cors import CORSMiddleware
40
+ from fastapi.responses import JSONResponse
41
+ import uvicorn
42
+
43
+ # ─── Runtime mode ──────────────────────────────────────────────────────────────
44
# Runtime mode, read once at import time from the MODE environment variable.
# Case and surrounding whitespace are ignored, so "Full " behaves like "full".
_VALID_MODES = ("full", "quantized", "lightweight")
MODE = os.environ.get("MODE", "full").strip().lower()
if MODE not in _VALID_MODES:
    # Raised explicitly rather than asserted: `assert` statements are removed
    # when Python runs with -O, which would let an invalid mode through.
    raise ValueError(f"Unknown MODE={MODE!r}. Choose full | quantized | lightweight.")

print(f"[INFO] Running in MODE={MODE!r}")
49
+
50
+ # ─── Compatibility patches ─────────────────────────────────────────────────────
51
# Some dependencies still call `inspect.getargspec`; where the interpreter no
# longer provides it, point the name at `inspect.getfullargspec`.
if not hasattr(inspect, "getargspec"):
    inspect.getargspec = inspect.getfullargspec

# Legacy NumPy alias names mapped to the builtin each one is restored as.
# An alias is only installed when the running NumPy does not already have it.
_LEGACY_NUMPY_ALIASES = {
    "int": int,
    "float": float,
    "complex": complex,
    "bool": bool,
    "object": object,
    "str": str,
    "unicode": str,
}
for attr, typ in _LEGACY_NUMPY_ALIASES.items():
    if hasattr(np, attr):
        continue
    setattr(np, attr, typ)
58
+
59
+ # ─── Pyrender / OpenGL mock (headless) ────────────────────────────────────────
60
# Stand-in `pyrender` module registered in sys.modules (replacing any real
# one), so later `import pyrender` statements receive this stub. Each listed
# name is bound to the MagicMock class itself.
_PYRENDER_STUB_NAMES = (
    "Scene", "Mesh", "Node", "PerspectiveCamera", "DirectionalLight",
    "PointLight", "SpotLight", "OffscreenRenderer", "RenderFlags",
    "Viewer", "MetallicRoughnessMaterial",
)
pyrender_mock = types.ModuleType("pyrender")
for _attr in _PYRENDER_STUB_NAMES:
    setattr(pyrender_mock, _attr, MagicMock)
sys.modules["pyrender"] = pyrender_mock

# Empty placeholder modules for the OpenGL package; entries that are already
# present in sys.modules are left untouched.
_OPENGL_STUB_MODULES = (
    "OpenGL", "OpenGL.GL", "OpenGL.GL.framebufferobjects",
    "OpenGL.platform", "OpenGL.error",
)
for _mod in _OPENGL_STUB_MODULES:
    sys.modules.setdefault(_mod, types.ModuleType(_mod))

os.environ["PYOPENGL_PLATFORM"] = "osmesa"
73
+
74
+ # ─── Configuration ─────────────────────────────────────────────────────────────
75
# Filesystem locations, all relative to the process working directory.
WILOR_REPO_PATH = "./WiLoR"                           # WiLoR source checkout
WILOR_CKPT = "./pretrained_models/wilor_final.ckpt"   # WiLoR checkpoint
WILOR_CFG = "./pretrained_models/model_config.yaml"   # WiLoR model config
CLASSIFIER_PATH = "./classifier.pkl"                  # stage-1 classifier
MLP_LETTERS_PATH = "./MLP_letters.pkl"                # stage-2 letters model
MLP_NUMBERS_PATH = "./MLP_numbers.pkl"                # stage-2 numbers model

# Use the GPU when torch reports one, otherwise fall back to the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Per-channel normalisation constants applied after conversion to a tensor.
_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
_NORMALIZE_STD = [0.229, 0.224, 0.225]

# Preprocessing applied to every hand crop before it is passed to WiLoR.
WILOR_TRANSFORM = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORMALIZE_MEAN, std=_NORMALIZE_STD),
])
87
+
88
def _resolve_detector_path() -> str:
    """Return the first detector location that exists on disk.

    The locations are probed in a fixed order, relative to the current
    working directory.

    Raises:
        FileNotFoundError: if none of the locations exists.
    """
    search_order = (
        "./detector",
        "./pretrained_models/detector.pt",
        "./pretrained_models/detector",
    )
    found = next((loc for loc in search_order if Path(loc).exists()), None)
    if found is None:
        raise FileNotFoundError("Detector not found in any candidate path!")
    return found
94
+
95
+ # ─── Global model handles ──────────────────────────────────────────────────────
96
# Module-level model handles. Each starts as None and is assigned by
# load_models(); which ones get filled in depends on MODE.
wilor_model = yolo_detector = mp_hands_model = None   # pose / detection back-ends
classifier = mlp_letters = mlp_numbers = None         # stage-1 and stage-2 models
102
+
103
+
104
+ # ─────────────────────────────────────────────────────────────────────────────
105
+ # Model loading
106
+ # ─────────────────────────────────────────────────────────────────────────────
107
+
108
def _load_wilor_full():
    """Load WiLoR from WILOR_CKPT / WILOR_CFG, move it to DEVICE and set eval mode.

    Returns:
        The loaded model; the second value returned by ``load_wilor`` is
        discarded.
    """
    # The `wilor` package lives inside the repo checkout, so the checkout has
    # to be on sys.path before the import below can succeed.
    sys.path.insert(0, WILOR_REPO_PATH)
    from wilor.models import load_wilor

    net, _unused = load_wilor(checkpoint_path=WILOR_CKPT, cfg_path=WILOR_CFG)
    net.to(DEVICE).eval()
    return net
114
+
115
+
116
def _load_wilor_quantized():
    """Load WiLoR, cast it to FP16 and dynamically quantize its Linear layers.

    The model is put in eval mode, converted with ``half()``, passed through
    ``torch.quantization.quantize_dynamic`` (``torch.nn.Linear`` -> qint8) and
    kept on the CPU.

    NOTE(review): dynamic quantization is applied to an already half-precision
    model here — confirm that the installed torch build supports this
    combination on CPU for both the quantized Linear layers and the remaining
    FP16 layers.

    Returns:
        The quantized model.
    """
    # The `wilor` package lives inside the repo checkout.
    sys.path.insert(0, WILOR_REPO_PATH)
    from wilor.models import load_wilor
    import torch.quantization

    print("[INFO] Loading WiLoR (FP16 + INT8 quantization)...")
    net, _unused = load_wilor(checkpoint_path=WILOR_CKPT, cfg_path=WILOR_CFG)
    net.eval()
    half_net = net.half()
    quantized = torch.quantization.quantize_dynamic(
        half_net,
        qconfig_spec={torch.nn.Linear},
        dtype=torch.qint8,
    )
    quantized.to("cpu")
    return quantized
130
+
131
+
132
def _load_mediapipe():
    """Create the MediaPipe Hands solution used in lightweight mode.

    Configured for still images, at most one hand, a 0.5 minimum detection
    confidence and model complexity 1.
    """
    # Imported lazily so the other modes never need mediapipe installed.
    import mediapipe as mp

    hands_options = {
        "static_image_mode": True,
        "max_num_hands": 1,
        "min_detection_confidence": 0.5,
        "model_complexity": 1,
    }
    return mp.solutions.hands.Hands(**hands_options)
140
+
141
+
142
def load_models():
    """Populate the module-level model handles according to MODE.

    The three joblib-serialised classifiers are always loaded. In
    ``lightweight`` mode only MediaPipe Hands is added; otherwise the YOLO
    detector is loaded, followed by WiLoR (plain for ``full``, quantized for
    any other non-lightweight mode). Progress is reported with ``print``.
    """
    global wilor_model, yolo_detector, mp_hands_model
    global classifier, mlp_letters, mlp_numbers

    def _load_pickle(path, before, after):
        # Announce, load, confirm — in that order, as for every pickle here.
        print(before)
        loaded = joblib.load(path)
        print(after)
        return loaded

    classifier = _load_pickle(
        CLASSIFIER_PATH,
        f"[INFO] Loading stage-1 classifier from {CLASSIFIER_PATH} ...",
        "[INFO] Stage-1 classifier loaded.",
    )
    mlp_letters = _load_pickle(
        MLP_LETTERS_PATH,
        f"[INFO] Loading MLP_letters from {MLP_LETTERS_PATH} ...",
        "[INFO] MLP_letters loaded.",
    )
    mlp_numbers = _load_pickle(
        MLP_NUMBERS_PATH,
        f"[INFO] Loading MLP_numbers from {MLP_NUMBERS_PATH} ...",
        "[INFO] MLP_numbers loaded.",
    )

    if MODE != "lightweight":
        detector_path = _resolve_detector_path()
        # Imported lazily: ultralytics is only needed outside lightweight mode.
        from ultralytics import YOLO
        print(f"[INFO] Loading YOLO detector from {detector_path} ...")
        yolo_detector = YOLO(detector_path)
        print("[INFO] YOLO loaded.")

        if MODE != "full":
            wilor_model = _load_wilor_quantized()
        else:
            print(f"[INFO] Loading WiLoR FP32 on {DEVICE}...")
            wilor_model = _load_wilor_full()
    else:
        print("[INFO] Loading MediaPipe Hands (lightweight mode)...")
        mp_hands_model = _load_mediapipe()
        print("✅ MediaPipe loaded.")

    print("✅ All models loaded successfully!")
176
+
177
+
178
+ # ─────────────────────────────────────────────────────────────────────────────
179
+ # Feature extraction
180
+ # ─────────────────────────────────────────────────────────────────────────────
181
+
182
def _build_features_from_joints(joints: np.ndarray, theta: np.ndarray) -> np.ndarray:
    """Build the classifier feature vector from hand joints and pose parameters.

    The result is ``theta`` (flattened) followed by seven scale-normalised
    distances between joints 4, 8, 12, 16 and 20: four from joint 4 to each
    of the others, then three between consecutive pairs (8-12, 12-16, 16-20).
    Every distance is divided by the distance between joints 0 and 9.

    NOTE(review): indices 4/8/12/16/20 are presumably the fingertips and 0/9
    the wrist and middle-finger base of a 21-keypoint hand layout — confirm
    that WiLoR and MediaPipe both use this ordering.

    Args:
        joints: array-like of shape (N, D) with N >= 21 joint coordinates.
        theta: array-like of pose parameters; flattened before use.

    Returns:
        1-D array of length ``theta.size + 7``.

    Raises:
        ValueError: if ``joints`` is not 2-D or has fewer than 21 rows.
    """
    joints = np.asarray(joints)
    if joints.ndim != 2 or joints.shape[0] < 21:
        raise ValueError(
            f"joints must be a 2-D array with at least 21 rows, got shape {joints.shape}"
        )

    tips = (4, 8, 12, 16, 20)
    # The epsilon keeps the division finite when joints 0 and 9 coincide.
    hand_scale = distance.euclidean(joints[0], joints[9]) + 1e-8

    first_tip = joints[tips[0]]
    from_first = [
        distance.euclidean(first_tip, joints[tip]) / hand_scale
        for tip in tips[1:]
    ]
    between_neighbours = [
        distance.euclidean(joints[a], joints[b]) / hand_scale
        for a, b in zip(tips[1:], tips[2:])
    ]
    return np.concatenate([np.ravel(theta), from_first + between_neighbours])
191
+
192
+
193
def _wilor_run(crop_rgb: np.ndarray) -> dict:
    """Run the loaded WiLoR model on a single hand crop and return its output.

    The crop is resized to 256x256 (aspect ratio is not preserved), passed
    through WILOR_TRANSFORM and given a leading batch dimension. In
    ``quantized`` mode the batch is cast to FP16 and kept on the CPU;
    otherwise it is moved to DEVICE. Inference runs without gradients.
    """
    resized = cv2.resize(crop_rgb, (256, 256))
    batch = WILOR_TRANSFORM(resized).unsqueeze(0)
    batch = batch.half().to("cpu") if MODE == "quantized" else batch.to(DEVICE)
    with torch.no_grad():
        return wilor_model({"img": batch})
203
+
204
+
205
def extract_features_wilor(crop_rgb: np.ndarray) -> np.ndarray | None:
    """Turn a hand crop into the classifier feature vector using WiLoR.

    Returns:
        The feature vector, or None when the model output lacks either
        ``pred_mano_params`` or ``pred_keypoints_3d``.
    """
    output = _wilor_run(crop_rgb)
    has_required = "pred_mano_params" in output and "pred_keypoints_3d" in output
    if not has_required:
        return None

    def _first_item_as_numpy(tensor):
        # First batch element, on the CPU, as a float32 NumPy array.
        return tensor[0].cpu().float().numpy()

    mano = output["pred_mano_params"]
    hand_pose = _first_item_as_numpy(mano["hand_pose"]).flatten()
    global_orient = _first_item_as_numpy(mano["global_orient"]).flatten()
    # Global orientation comes first, followed by the hand pose.
    theta = np.concatenate([global_orient, hand_pose])
    joints = _first_item_as_numpy(output["pred_keypoints_3d"])
    return _build_features_from_joints(joints, theta)
215
+
216
+
217
def get_3d_joints_wilor(crop_rgb: np.ndarray) -> np.ndarray:
    """Run WiLoR on the crop and return ``pred_keypoints_3d`` for the first batch item.

    The value is moved to the CPU and converted to a float32 NumPy array.
    Each call performs a full model inference.
    """
    keypoints = _wilor_run(crop_rgb)["pred_keypoints_3d"]
    first_item = keypoints[0].cpu().float()
    return first_item.numpy()
220
+
221
+
222
def extract_features_mediapipe(img_rgb: np.ndarray):
    """Detect one hand with MediaPipe and build the classifier features.

    Returns:
        ``(features, joints, handedness, bbox)``, or four ``None`` values when
        no hand is found. ``joints`` holds the landmark x/y/z values as
        float32, ``handedness`` is the lower-cased MediaPipe label and ``bbox``
        is ``[x1, y1, x2, y2]`` in pixels: the landmark extent padded by 20 px
        and clamped to the image.
    """
    detection = mp_hands_model.process(img_rgb)
    if not detection.multi_hand_landmarks:
        return None, None, None, None

    height, width = img_rgb.shape[:2]
    landmarks = detection.multi_hand_landmarks[0].landmark
    handedness = detection.multi_handedness[0].classification[0].label.lower()
    joints = np.array([[p.x, p.y, p.z] for p in landmarks], dtype=np.float32)

    # Landmark x/y are multiplied by the image size to obtain pixel columns/rows.
    px = (joints[:, 0] * width).astype(int)
    py = (joints[:, 1] * height).astype(int)
    margin = 20
    x1 = max(0, int(px.min()) - margin)
    y1 = max(0, int(py.min()) - margin)
    x2 = min(width, int(px.max()) + margin)
    y2 = min(height, int(py.max()) + margin)

    # No pose parameters are produced on this path, so the 48 theta slots are
    # filled with zeros.
    # NOTE(review): presumably 48 matches the WiLoR theta length — confirm.
    theta = np.zeros(48, dtype=np.float32)
    features = _build_features_from_joints(joints, theta)
    return features, joints, handedness, [x1, y1, x2, y2]
237
+
238
+
239
+ # ─────────────────────────────────────────────────────────────────────────────
240
+ # Two-stage inference
241
+ # ─────────────────────────────────────────────────────────────────────────────
242
+
243
def _align_features(model, features: np.ndarray) -> pd.DataFrame:
    """Fit ``features`` into a one-row DataFrame with the columns ``model`` expects.

    The vector is truncated when longer than the model's input width and
    zero-padded when shorter. Column names come from
    ``model.feature_names_in_``; models that do not expose that attribute fall
    back to ``model.n_features_in_`` with integer column labels.

    Args:
        model: estimator exposing ``feature_names_in_`` or ``n_features_in_``.
        features: array-like feature vector; flattened before use.

    Returns:
        A single-row float64 DataFrame.

    Raises:
        AttributeError: if the model exposes neither attribute.
    """
    columns = getattr(model, "feature_names_in_", None)
    if columns is None:
        width = getattr(model, "n_features_in_", None)
        if width is None:
            raise AttributeError(
                "model exposes neither feature_names_in_ nor n_features_in_; "
                "cannot align features"
            )
        columns = list(range(int(width)))

    flat = np.asarray(features, dtype=np.float64).ravel()
    row = np.zeros(len(columns), dtype=np.float64)
    limit = min(flat.size, row.size)
    row[:limit] = flat[:limit]
    return pd.DataFrame([row], columns=columns)
249
+
250
+
251
def run_stage1(features: np.ndarray) -> tuple[str, float]:
    """Stage 1: predict the category of the sign with the global ``classifier``.

    Returns:
        ``(category, confidence)`` where ``category`` is the predicted label as
        a string and ``confidence`` is the largest predicted probability.
    """
    aligned = _align_features(classifier, features)
    predicted = classifier.predict(aligned)
    probabilities = classifier.predict_proba(aligned)
    return str(predicted[0]), float(probabilities[0].max())
256
+
257
+
258
def run_stage2(category: str, features: np.ndarray) -> tuple[str, float]:
    """Stage 2: predict the specific sign with the model matching ``category``.

    ``category`` is matched case-insensitively, ignoring surrounding
    whitespace, against the known letter and number labels. When it matches
    neither group, both models are evaluated and the more confident one wins
    (letters win ties).

    Returns:
        ``(label, confidence)`` — predicted label as a string and the largest
        predicted probability of the model that produced it.
    """
    letter_tags = ("letter", "letters", "حرف", "حروف")
    number_tags = ("number", "numbers", "digit", "digits", "رقم", "أرقام", "ارقام")

    key = category.lower().strip()
    chosen = None
    if key in letter_tags:
        chosen = mlp_letters
    elif key in number_tags:
        chosen = mlp_numbers

    if chosen is not None:
        frame = _align_features(chosen, features)
        label = str(chosen.predict(frame)[0])
        probabilities = chosen.predict_proba(frame)[0]
        return label, float(probabilities.max())

    # Unrecognised category: score both models and keep the better one.
    frame_letters = _align_features(mlp_letters, features)
    frame_numbers = _align_features(mlp_numbers, features)
    conf_letters = float(mlp_letters.predict_proba(frame_letters)[0].max())
    conf_numbers = float(mlp_numbers.predict_proba(frame_numbers)[0].max())
    if conf_letters < conf_numbers:
        return str(mlp_numbers.predict(frame_numbers)[0]), conf_numbers
    return str(mlp_letters.predict(frame_letters)[0]), conf_letters
278
+
279
+
280
def full_pipeline(features: np.ndarray) -> dict:
    """Run both classification stages and package the result.

    Returns:
        A dict with ``sign``, ``sign_confidence``, ``category`` and
        ``category_confidence``; both confidences are rounded to 4 decimals.
    """
    category, category_conf = run_stage1(features)
    sign, sign_conf = run_stage2(category, features)

    summary = {}
    summary["sign"] = sign
    summary["sign_confidence"] = round(sign_conf, 4)
    summary["category"] = category
    summary["category_confidence"] = round(category_conf, 4)
    return summary
289
+
290
+
291
+ # ─────────────────────────────────────────────────────────────────────────────
292
+ # Utilities
293
+ # ─────────────────────────────────────────────────────────────────────────────
294
+
295
def read_image_from_upload(file_bytes: bytes) -> np.ndarray:
    """Decode uploaded bytes into an OpenCV colour image (``cv2.IMREAD_COLOR``).

    Args:
        file_bytes: raw contents of the uploaded file.

    Returns:
        The decoded image array.

    Raises:
        HTTPException: status 400 when the payload is empty or cannot be
            decoded as an image.
    """
    # An empty buffer makes cv2.imdecode raise instead of returning None,
    # which would surface as a 500; reject it up front as a client error.
    if not file_bytes:
        raise HTTPException(status_code=400, detail="Cannot decode image.")

    arr = np.frombuffer(file_bytes, np.uint8)
    try:
        img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    except cv2.error as err:
        # Any decoder-level failure is still a bad upload, not a server fault.
        raise HTTPException(status_code=400, detail="Cannot decode image.") from err
    if img is None:
        raise HTTPException(status_code=400, detail="Cannot decode image.")
    return img
301
+
302
+
303
def _yolo_detect(img_rgb: np.ndarray):
    """Detect a hand with YOLO and return its box, side and cropped pixels.

    Only the first box of the first result is used. The box is clamped to the
    image bounds before cropping.

    Args:
        img_rgb: image array; its first two dimensions are height and width.

    Returns:
        ``([x1, y1, x2, y2], side, crop)`` where the coordinates are plain
        Python ints, ``side`` is ``"left"`` or ``"right"`` and ``crop`` is the
        corresponding slice of ``img_rgb``.

    Raises:
        HTTPException: status 422 when nothing is detected or the clamped box
            is empty.
    """
    results = yolo_detector.predict(img_rgb, conf=0.5, verbose=False, device=DEVICE)
    boxes = results[0].boxes
    if not boxes:
        raise HTTPException(status_code=422, detail="No hand detected.")

    raw_box = boxes.xyxy[0].cpu().numpy().astype(int)
    label_id = int(boxes.cls[0].cpu().item())
    # NOTE(review): assumes detector class 0 means "left" and every other
    # class means "right" — confirm against the detector's class list.
    side = "left" if label_id == 0 else "right"

    h, w = img_rgb.shape[:2]
    # Convert to builtin ints: the endpoints put this list into a
    # JSONResponse, and NumPy integer scalars are not JSON-serialisable.
    x1 = max(0, int(raw_box[0]))
    y1 = max(0, int(raw_box[1]))
    x2 = min(int(w), int(raw_box[2]))
    y2 = min(int(h), int(raw_box[3]))

    crop = img_rgb[y1:y2, x1:x2]
    if crop.size == 0:
        raise HTTPException(status_code=422, detail="Empty crop after bounding box clamp.")
    return [x1, y1, x2, y2], side, crop
316
+
317
+
318
+ # ─────────────────────────────────────────────────────────────────────────────
319
+ # FastAPI app
320
+ # ─────────────────────────────────────────────────────────────────────────────
321
+
322
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: load every model before requests are served.

    Nothing runs after the ``yield``, i.e. there is no shutdown cleanup.
    """
    load_models()
    yield


_APP_DESCRIPTION = (
    "Two-stage pipeline: Stage-1 classifies letter vs number, "
    "Stage-2 identifies the specific sign."
)

# The ASGI application object served by uvicorn.
app = FastAPI(
    title="Arabic Sign Language Interpreter",
    description=_APP_DESCRIPTION,
    version="2.0.0",
    lifespan=lifespan,
)

# CORS is fully open: any origin, method and header is accepted.
_ALLOW_ALL = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=list(_ALLOW_ALL),
    allow_methods=list(_ALLOW_ALL),
    allow_headers=list(_ALLOW_ALL),
)
340
+
341
+
342
@app.get("/")
def root():
    """Health check: report that the service is up, plus device, mode and version."""
    status = {"status": "running"}
    status["device"] = DEVICE
    status["mode"] = MODE
    status["version"] = "2.0.0"
    return status
345
+
346
+
347
@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    """Classify the sign shown in an uploaded hand image.

    The upload is decoded and converted from BGR to RGB. Lightweight mode
    extracts features with MediaPipe; the other modes detect the hand with
    YOLO and extract features from the crop with WiLoR.

    Returns:
        JSON with the two-stage prediction plus ``hand_side``, ``bbox`` and
        ``mode``.

    Raises:
        HTTPException: 422 when no hand is found in lightweight mode, 500 when
            WiLoR feature extraction returns nothing; helpers may raise others.
    """
    payload = await file.read()
    rgb = cv2.cvtColor(read_image_from_upload(payload), cv2.COLOR_BGR2RGB)

    if MODE != "lightweight":
        bbox, hand_side, crop = _yolo_detect(rgb)
        features = extract_features_wilor(crop)
        if features is None:
            raise HTTPException(status_code=500, detail="WiLoR feature extraction failed.")
    else:
        features, _, hand_side, bbox = extract_features_mediapipe(rgb)
        if features is None:
            raise HTTPException(status_code=422, detail="No hand detected.")

    body = dict(full_pipeline(features))
    body.update({"hand_side": hand_side, "bbox": bbox, "mode": MODE})
    return JSONResponse(body)
365
+
366
+
367
def _wilor_features_and_joints(crop_rgb: np.ndarray):
    """Run WiLoR once and derive both the feature vector and the 3D joints.

    Builds the features the same way as ``extract_features_wilor`` but reuses
    the single model output for the joints, avoiding a second inference.

    Returns:
        ``(features, joints)``, or ``(None, None)`` when the model output
        lacks ``pred_mano_params`` or ``pred_keypoints_3d``.
    """
    output = _wilor_run(crop_rgb)
    if "pred_mano_params" not in output or "pred_keypoints_3d" not in output:
        return None, None
    mano = output["pred_mano_params"]
    hand_pose = mano["hand_pose"][0].cpu().float().numpy().flatten()
    global_orient = mano["global_orient"][0].cpu().float().numpy().flatten()
    theta = np.concatenate([global_orient, hand_pose])
    joints = output["pred_keypoints_3d"][0].cpu().float().numpy()
    return _build_features_from_joints(joints, theta), joints


@app.post("/predict_with_skeleton")
async def predict_with_skeleton(file: UploadFile = File(...)):
    """Classify the sign and also return the 3D joints and the hand crop.

    Returns:
        JSON with the two-stage prediction plus ``hand_side``, ``bbox``,
        ``joints_3d`` (nested lists), ``crop_b64`` (PNG of the crop, base64
        encoded) and ``mode``.

    Raises:
        HTTPException: 422 when no hand is found or the crop is empty, 500
            when WiLoR feature extraction or PNG encoding fails; helpers may
            raise others.
    """
    raw = await file.read()
    img_bgr = read_image_from_upload(raw)
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

    if MODE == "lightweight":
        features, joints, hand_side, bbox = extract_features_mediapipe(img_rgb)
        if features is None:
            raise HTTPException(status_code=422, detail="No hand detected.")
        x1, y1, x2, y2 = bbox
        crop = img_rgb[y1:y2, x1:x2]
        # Landmarks lying outside the frame can clamp to an empty region,
        # which the colour conversion below cannot handle.
        if crop.size == 0:
            raise HTTPException(status_code=422, detail="Empty crop after bounding box clamp.")
    else:
        bbox, hand_side, crop = _yolo_detect(img_rgb)
        # One forward pass yields both the features and the joints.
        features, joints = _wilor_features_and_joints(crop)
        if features is None:
            raise HTTPException(status_code=500, detail="WiLoR feature extraction failed.")

    result = full_pipeline(features)
    encoded_ok, buf = cv2.imencode(".png", cv2.cvtColor(crop, cv2.COLOR_RGB2BGR))
    if not encoded_ok:
        raise HTTPException(status_code=500, detail="Failed to encode hand crop.")
    crop_b64 = base64.b64encode(buf.tobytes()).decode("utf-8")

    return JSONResponse({
        **result,
        "hand_side": hand_side,
        # Builtin ints only: NumPy integer scalars are not JSON-serialisable.
        "bbox": [int(v) for v in bbox],
        "joints_3d": joints.tolist(),
        "crop_b64": crop_b64,
        "mode": MODE,
    })
398
+
399
+
400
@app.get("/info")
def info():
    """Diagnostics: mode, device, process memory and which models are loaded.

    ``process_ram_mb`` is the resident set size of this process in MiB,
    rounded to one decimal. The ``*_features`` entries give the number of
    feature names a model exposes, or None when unavailable.
    """
    # Imported lazily: psutil is only needed by this endpoint.
    import psutil

    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    proc_mb = rss_bytes / 1024 / 1024

    def _feat_len(model):
        if model and hasattr(model, "feature_names_in_"):
            return len(model.feature_names_in_)
        return None

    handles = (
        ("stage1_classifier", classifier),
        ("mlp_letters", mlp_letters),
        ("mlp_numbers", mlp_numbers),
        ("wilor", wilor_model),
        ("yolo", yolo_detector),
        ("mediapipe", mp_hands_model),
    )
    report = {
        "mode": MODE,
        "device": DEVICE,
        "process_ram_mb": round(proc_mb, 1),
        "classifier_features": _feat_len(classifier),
        "mlp_letters_features": _feat_len(mlp_letters),
        "mlp_numbers_features": _feat_len(mlp_numbers),
        "models_loaded": {name: handle is not None for name, handle in handles},
    }
    return report
424
+
425
+
426
# Direct execution (python app.py): serve on all interfaces, port 8000,
# with auto-reload off.
if __name__ == "__main__":
    _server_options = {"host": "0.0.0.0", "port": 8000, "reload": False}
    uvicorn.run("app:app", **_server_options)
hf_README.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Arabic Sign Language Interpreter
3
+ emoji: 🤟
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: docker
7
+ pinned: false
8
+ license: mit
9
+ app_port: 8000
10
+ ---
11
+
12
+ # Arabic Sign Language Interpreter API v2.0
13
+
14
+ REST API لتفسير لغة الإشارة العربية — يقبل صورة يد ويرجع الحرف أو الرقم المقابل.
15
+
16
+ ---
17
+
18
+ ## البايبلاين
19
+
20
+ ```
21
+ Image Input
22
+ ──► YOLO Detection (crop the hand)
23
+ ──► WiLoR 3D Pose (extract 3D joints + MANO params)
24
+ ──► Stage-1: classifier.pkl → "letter" or "number"?
25
+ ──► Stage-2: MLP_letters.pkl → specific Arabic letter
26
+ OR MLP_numbers.pkl → specific digit
27
+ ──► JSON Response
28
+ ```
29
+
30
+ ---
31
+
32
+ ## الـ Endpoints
33
+
34
+ ### `GET /`
35
+ Health check.
36
+
37
+ ### `POST /predict`
38
+ الـ endpoint الرئيسي.
39
+
40
+ **Request:** `multipart/form-data` — حقل `file` يحتوي على صورة اليد.
41
+
42
+ **Response:**
43
+ ```json
44
+ {
45
+ "sign": "ب",
46
+ "sign_confidence": 0.9731,
47
+ "category": "letter",
48
+ "category_confidence": 0.9812,
49
+ "hand_side": "right",
50
+ "bbox": [120, 85, 340, 310],
51
+ "mode": "full"
52
+ }
53
+ ```
54
+
55
+ ### `POST /predict_with_skeleton`
56
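+ يُرجع نفس حقول `/predict` بالإضافة إلى `joints_3d` (إحداثيات مفاصل اليد ثلاثية الأبعاد) و`crop_b64` (صورة اليد المقصوصة بصيغة PNG مرمّزة بـ base64).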
57
+
58
+ ### `GET /info`
59
+ معلومات تشخيصية عن الـ runtime.
60
+
61
+ ---
62
+
63
+ ## متطلبات الذاكرة
64
+
65
+ | Mode | RAM تقريبي | الدقة |
66
+ |-----------|-------------|---------|
67
+ | full | 1.1–2.5 GB | الأعلى |
68
+ | quantized | 600MB–1.2GB | عالية |
69
+ | lightweight | ~50 MB | متوسطة |
70
+
71
+ > يعمل الـ Space بالوضع `MODE=full` افتراضيًا، ويمكن تغييره من متغيرات البيئة في إعدادات الـ Space. القيم المتاحة: `full` و`quantized` و`lightweight`.