Spaces:

SondosM
/

API_2x1

Sleeping

App Files Files Community

SondosM committed on Apr 20

Commit

95cd476

verified ·

1 Parent(s): 23ecb89

Update app.py

Browse files

Files changed (1) hide show

app.py +122 -263

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Arabic Sign Language Interpreter - FastAPI Server (Optimized)
 """
 import io
@@ -28,14 +28,7 @@ from fastapi.responses import JSONResponse
 import uvicorn
 from huggingface_hub import hf_hub_download
-# ─── Runtime mode ──────────────────────────────────────────────────────────────
-MODE = os.environ.get("MODE", "full").lower()
-assert MODE in ("full", "quantized", "lightweight"), \
-    f"Unknown MODE={MODE!r}. Choose full | quantized | lightweight."
-print(f"[INFO] Running in MODE={MODE!r}")
-# ─── Compatibility patches ─────────────────────────────────────────────────────
 if not hasattr(inspect, "getargspec"):
     inspect.getargspec = inspect.getfullargspec
@@ -44,7 +37,7 @@ for attr, typ in [("int", int), ("float", float), ("complex", complex),
     if not hasattr(np, attr):
         setattr(np, attr, typ)
-# ─── Pyrender / OpenGL mock (headless) ────────────────────────────────────────
 pyrender_mock = types.ModuleType("pyrender")
 for _attr in ["Scene", "Mesh", "Node", "PerspectiveCamera", "DirectionalLight",
               "PointLight", "SpotLight", "OffscreenRenderer", "RenderFlags",
@@ -59,7 +52,7 @@ for _mod in ["OpenGL", "OpenGL.GL", "OpenGL.GL.framebufferobjects",
 os.environ["PYOPENGL_PLATFORM"] = "osmesa"
-# ─── Hugging Face Model Integration ───────────────────────────────────────────
 REPO_ID = "SondosM/api_GP"
 def get_hf_file(filename, is_mano=False):
@@ -76,24 +69,23 @@ def get_hf_file(filename, is_mano=False):
     return temp_path
-# ─── Download all required files at startup ───────────────────────────────────
 print("Initializing model file paths...")
-# MANO files
 get_hf_file("mano_data/mano_data/mano_mean_params.npz", is_mano=True)
 get_hf_file("mano_data/mano_data/MANO_LEFT.pkl",        is_mano=True)
 get_hf_file("mano_data/mano_data/MANO_RIGHT.pkl",       is_mano=True)
-# Model weights
-WILOR_REPO_PATH   = "./WiLoR"
-WILOR_CKPT        = get_hf_file("pretrained_models/pretrained_models/wilor_final.ckpt")
-WILOR_CFG         = get_hf_file("pretrained_models/pretrained_models/model_config.yaml")
-DETECTOR_PATH     = get_hf_file("pretrained_models/pretrained_models/detector.pt")
-# Classifiers
-CLASSIFIER_PATH   = "classifier.pkl"
-MLP_LETTERS_PATH  = "MLP_letters.pkl"
-MLP_NUMBERS_PATH  = "MLP_numbers.pkl"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -102,344 +94,211 @@ WILOR_TRANSFORM = transforms.Compose([
     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
 ])
-# ─── Global model handles ──────────────────────────────────────────────────────
-wilor_model    = None
-yolo_detector  = None
-mp_hands_model = None
-classifier     = None
-mlp_letters    = None
-mlp_numbers    = None
-# ─────────────────────────────────────────────────────────────────────────────
-#  Model loading
-# ─────────────────────────────────────────────────────────────────────────────
-def _load_wilor_full():
     sys.path.insert(0, WILOR_REPO_PATH)
     from wilor.models import load_wilor
-    model, _ = load_wilor(checkpoint_path=WILOR_CKPT, cfg_path=WILOR_CFG)
-    model.to(DEVICE).eval()
-    return model
-def _load_wilor_quantized():
-    sys.path.insert(0, WILOR_REPO_PATH)
-    from wilor.models import load_wilor
-    import torch.quantization
-    print("[INFO] Loading WiLoR (FP16 + INT8 quantization)...")
-    model, _ = load_wilor(checkpoint_path=WILOR_CKPT, cfg_path=WILOR_CFG)
-    model.eval()
-    model = model.half()
-    model = torch.quantization.quantize_dynamic(
-        model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8,
-    )
-    model.to("cpu")
-    return model
-def _load_mediapipe():
-    import mediapipe as mp
-    return mp.solutions.hands.Hands(
-        static_image_mode=True,
-        max_num_hands=1,
-        min_detection_confidence=0.5,
-        model_complexity=1,
-    )
-def load_models():
-    global wilor_model, yolo_detector, mp_hands_model
-    global classifier, mlp_letters, mlp_numbers
-    print(f"[INFO] Loading stage-1 classifier from {CLASSIFIER_PATH} ...")
-    classifier = joblib.load(CLASSIFIER_PATH)
-    print("[INFO] Stage-1 classifier loaded.")
-    print(f"[INFO] Loading MLP_letters from {MLP_LETTERS_PATH} ...")
-    mlp_letters = joblib.load(MLP_LETTERS_PATH)
-    print("[INFO] MLP_letters loaded.")
-    print(f"[INFO] Loading MLP_numbers from {MLP_NUMBERS_PATH} ...")
-    mlp_numbers = joblib.load(MLP_NUMBERS_PATH)
-    print("[INFO] MLP_numbers loaded.")
-    if MODE == "lightweight":
-        print("[INFO] Loading MediaPipe Hands (lightweight mode)...")
-        mp_hands_model = _load_mediapipe()
-        print("✅ MediaPipe loaded.")
-    else:
-        from ultralytics import YOLO
-        print(f"[INFO] Loading YOLO detector from {DETECTOR_PATH} ...")
-        yolo_detector = YOLO(DETECTOR_PATH)
-        print("[INFO] YOLO loaded.")
-        if MODE == "full":
-            print(f"[INFO] Loading WiLoR FP32 on {DEVICE}...")
-            wilor_model = _load_wilor_full()
-        else:
-            wilor_model = _load_wilor_quantized()
-    print("✅ All models loaded successfully!")
-# ─────────────────────────────────────────────────────────────────────────────
-#  Feature extraction
-# ─────────────────────────────────────────────────────────────────────────────
-def _build_features_from_joints(joints: np.ndarray, theta: np.ndarray) -> np.ndarray:
-    tips = [4, 8, 12, 16, 20]
     hand_scale = distance.euclidean(joints[0], joints[9]) + 1e-8
     dist_feats = []
     for i in range(1, 5):
         dist_feats.append(distance.euclidean(joints[tips[0]], joints[tips[i]]) / hand_scale)
     for i in range(1, 4):
         dist_feats.append(distance.euclidean(joints[tips[i]], joints[tips[i + 1]]) / hand_scale)
     return np.concatenate([theta, dist_feats])
-def _wilor_run(crop_rgb: np.ndarray) -> dict:
     img_input  = cv2.resize(crop_rgb, (256, 256))
-    img_tensor = WILOR_TRANSFORM(img_input).unsqueeze(0)
-    if MODE == "quantized":
-        img_tensor = img_tensor.half().to("cpu")
-    else:
-        img_tensor = img_tensor.to(DEVICE)
     with torch.no_grad():
         output = wilor_model({"img": img_tensor})
-    return output
-def extract_features_wilor(crop_rgb: np.ndarray) -> np.ndarray | None:
-    output = _wilor_run(crop_rgb)
-    if "pred_mano_params" not in output or "pred_keypoints_3d" not in output:
-        return None
-    mano          = output["pred_mano_params"]
-    hand_pose     = mano["hand_pose"][0].cpu().float().numpy().flatten()
-    global_orient = mano["global_orient"][0].cpu().float().numpy().flatten()
-    theta         = np.concatenate([global_orient, hand_pose])
-    joints        = output["pred_keypoints_3d"][0].cpu().float().numpy()
-    return _build_features_from_joints(joints, theta)
-def get_3d_joints_wilor(crop_rgb: np.ndarray) -> np.ndarray:
-    output = _wilor_run(crop_rgb)
-    return output["pred_keypoints_3d"][0].cpu().float().numpy()
-def extract_features_mediapipe(img_rgb: np.ndarray):
-    result = mp_hands_model.process(img_rgb)
-    if not result.multi_hand_landmarks:
-        return None, None, None, None
-    h, w           = img_rgb.shape[:2]
-    hand_landmarks = result.multi_hand_landmarks[0]
-    handedness     = result.multi_handedness[0].classification[0].label.lower()
-    joints = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark], dtype=np.float32)
-    xs, ys = (joints[:, 0] * w).astype(int), (joints[:, 1] * h).astype(int)
-    pad = 20
-    x1, y1 = max(0, int(xs.min()) - pad), max(0, int(ys.min()) - pad)
-    x2, y2 = min(w, int(xs.max()) + pad), min(h, int(ys.max()) + pad)
-    theta    = np.zeros(48, dtype=np.float32)
-    features = _build_features_from_joints(joints, theta)
-    return features, joints, handedness, [x1, y1, x2, y2]
-# ─────────────────────────────────────────────────────────────────────────────
-#  Two-stage inference
-# ─────────────────────────────────────────────────────────────────────────────
 def _align_features(model, features: np.ndarray) -> pd.DataFrame:
     expected_cols = model.feature_names_in_
-    vec           = np.zeros(len(expected_cols), dtype=np.float64)
     limit         = min(len(features), len(vec))
     vec[:limit]   = features[:limit]
     return pd.DataFrame([vec], columns=expected_cols)
-def run_stage1(features: np.ndarray) -> tuple[str, float]:
     feat_df  = _align_features(classifier, features)
     category = str(classifier.predict(feat_df)[0])
-    proba    = classifier.predict_proba(feat_df)[0]
-    return category, float(proba.max())
-def run_stage2(category: str, features: np.ndarray) -> tuple[str, float]:
     cat = category.lower().strip()
     if cat in ("letter", "letters", "حرف", "حروف"):
         model = mlp_letters
     elif cat in ("number", "numbers", "digit", "digits", "رقم", "أرقام", "ارقام"):
         model = mlp_numbers
     else:
-        feat_df_l = _align_features(mlp_letters, features)
-        feat_df_n = _align_features(mlp_numbers, features)
-        proba_l   = float(mlp_letters.predict_proba(feat_df_l)[0].max())
-        proba_n   = float(mlp_numbers.predict_proba(feat_df_n)[0].max())
-        if proba_l >= proba_n:
-            return str(mlp_letters.predict(feat_df_l)[0]), proba_l
-        else:
-            return str(mlp_numbers.predict(feat_df_n)[0]), proba_n
     feat_df = _align_features(model, features)
     label   = str(model.predict(feat_df)[0])
-    proba   = model.predict_proba(feat_df)[0]
-    return label, float(proba.max())
-def full_pipeline(features: np.ndarray) -> dict:
-    category, stage1_conf = run_stage1(features)
-    label,    stage2_conf = run_stage2(category, features)
     return {
         "sign":                label,
-        "sign_confidence":     round(stage2_conf, 4),
         "category":            category,
-        "category_confidence": round(stage1_conf, 4),
     }
 # ─────────────────────────────────────────────────────────────────────────────
-#  Utilities
 # ─────────────────────────────────────────────────────────────────────────────
-def read_image_from_upload(file_bytes: bytes) -> np.ndarray:
-    arr = np.frombuffer(file_bytes, np.uint8)
-    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
-    if img is None:
-        raise HTTPException(status_code=400, detail="Cannot decode image.")
-    return img
-def _yolo_detect(img_rgb: np.ndarray):
     results = yolo_detector.predict(img_rgb, conf=0.5, verbose=False, device=DEVICE)
     if not results[0].boxes:
         raise HTTPException(status_code=422, detail="No hand detected.")
     box      = results[0].boxes.xyxy[0].cpu().numpy().astype(int)
     label_id = int(results[0].boxes.cls[0].cpu().item())
-    side     = "left" if label_id == 0 else "right"
-    h, w     = img_rgb.shape[:2]
     x1, y1, x2, y2 = max(0, box[0]), max(0, box[1]), min(w, box[2]), min(h, box[3])
     crop = img_rgb[y1:y2, x1:x2]
-    if crop.size == 0:
-        raise HTTPException(status_code=422, detail="Empty crop after bounding box clamp.")
-    return [x1, y1, x2, y2], side, crop
-# ─────────────────────────────────────────────────────────────────────────────
-#  FastAPI app
-# ─────────────────────────────────────────────────────────────────────────────
-app_ready = False
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    global app_ready
-    load_models()
-    app_ready = True
-    yield
-app = FastAPI(
-    title="Arabic Sign Language Interpreter",
-    description="Two-stage pipeline: Stage-1 classifies letter vs number, Stage-2 identifies the specific sign.",
-    version="2.0.0",
-    lifespan=lifespan,
-)
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-@app.get("/")
-def root():
-    if not app_ready:
-        return JSONResponse(
-            status_code=503,
-            content={"status": "loading", "device": DEVICE, "mode": MODE}
-        )
-    return {"status": "running", "device": DEVICE, "mode": MODE, "version": "2.0.0"}
-@app.post("/predict")
-async def predict(file: UploadFile = File(...)):
     raw     = await file.read()
     img_bgr = read_image_from_upload(raw)
     img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
-    if MODE == "lightweight":
-        features, _, hand_side, bbox = extract_features_mediapipe(img_rgb)
-        if features is None:
-            raise HTTPException(status_code=422, detail="No hand detected.")
-    else:
-        bbox, hand_side, crop = _yolo_detect(img_rgb)
-        features = extract_features_wilor(crop)
-        if features is None:
-            raise HTTPException(status_code=500, detail="WiLoR feature extraction failed.")
-    result = full_pipeline(features)
-    return JSONResponse({**result, "hand_side": hand_side, "bbox": bbox, "mode": MODE})
-@app.post("/predict_with_skeleton")
-async def predict_with_skeleton(file: UploadFile = File(...)):
-    raw     = await file.read()
-    img_bgr = read_image_from_upload(raw)
-    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
-    if MODE == "lightweight":
-        features, joints, hand_side, bbox = extract_features_mediapipe(img_rgb)
-        if features is None:
-            raise HTTPException(status_code=422, detail="No hand detected.")
-        x1, y1, x2, y2 = bbox
-        crop = img_rgb[y1:y2, x1:x2]
-    else:
-        bbox, hand_side, crop = _yolo_detect(img_rgb)
-        features = extract_features_wilor(crop)
-        if features is None:
-            raise HTTPException(status_code=500, detail="WiLoR feature extraction failed.")
-        joints = get_3d_joints_wilor(crop)
-    result   = full_pipeline(features)
     _, buf   = cv2.imencode(".png", cv2.cvtColor(crop, cv2.COLOR_RGB2BGR))
     crop_b64 = base64.b64encode(buf).decode("utf-8")
     return JSONResponse({
         **result,
         "hand_side": hand_side,
-        "bbox":      bbox,
         "joints_3d": joints.tolist(),
         "crop_b64":  crop_b64,
-        "mode":      MODE,
     })
-@app.get("/info")
-def info():
-    import psutil
-    proc_mb = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
-    def _feat_len(model):
-        return len(model.feature_names_in_) if model and hasattr(model, "feature_names_in_") else None
-    return {
-        "mode":                 MODE,
-        "device":               DEVICE,
-        "process_ram_mb":       round(proc_mb, 1),
-        "classifier_features":  _feat_len(classifier),
-        "mlp_letters_features": _feat_len(mlp_letters),
-        "mlp_numbers_features": _feat_len(mlp_numbers),
-        "models_loaded": {
-            "stage1_classifier": classifier  is not None,
-            "mlp_letters":       mlp_letters is not None,
-            "mlp_numbers":       mlp_numbers is not None,
-            "wilor":             wilor_model is not None,
-            "yolo":              yolo_detector is not None,
-            "mediapipe":         mp_hands_model is not None,
-        },
-    }
 if __name__ == "__main__":
-    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=False)

 """
+Arabic Sign Language Interpreter - FastAPI Server
 """
 import io
 import uvicorn
 from huggingface_hub import hf_hub_download
+# --- Compatibility Patches ---
 if not hasattr(inspect, "getargspec"):
     inspect.getargspec = inspect.getfullargspec
     if not hasattr(np, attr):
         setattr(np, attr, typ)
+# --- Pyrender / OpenGL Mock (Headless) ---
 pyrender_mock = types.ModuleType("pyrender")
 for _attr in ["Scene", "Mesh", "Node", "PerspectiveCamera", "DirectionalLight",
               "PointLight", "SpotLight", "OffscreenRenderer", "RenderFlags",
 os.environ["PYOPENGL_PLATFORM"] = "osmesa"
+# --- Hugging Face Model Integration ---
 REPO_ID = "SondosM/api_GP"
 def get_hf_file(filename, is_mano=False):
     return temp_path
+# --- Download required files ---
 print("Initializing model file paths...")
 get_hf_file("mano_data/mano_data/mano_mean_params.npz", is_mano=True)
 get_hf_file("mano_data/mano_data/MANO_LEFT.pkl",        is_mano=True)
 get_hf_file("mano_data/mano_data/MANO_RIGHT.pkl",       is_mano=True)
+WILOR_REPO_PATH = "./WiLoR"
+WILOR_CKPT      = get_hf_file("pretrained_models/pretrained_models/wilor_final.ckpt")
+WILOR_CFG       = get_hf_file("pretrained_models/pretrained_models/model_config.yaml")
+DETECTOR_PATH   = get_hf_file("pretrained_models/pretrained_models/detector.pt")
+# ─── Key difference: the previous version loaded classifier.pkl from a fixed
+# local path instead of downloading it from the HF repo like the other files ───
+CLASSIFIER_PATH   = get_hf_file("classifier.pkl")
+MLP_LETTERS_PATH  = get_hf_file("MLP_letters.pkl")
+MLP_NUMBERS_PATH  = get_hf_file("MLP_numbers.pkl")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
 ])
+wilor_model   = None
+yolo_detector = None
+classifier    = None
+mlp_letters   = None
+mlp_numbers   = None
+def load_models():
     # Load WiLoR, the YOLO detector and the three joblib classifiers into the
     # module-level handles named in the `global` statement below.
     # Takes no arguments and returns nothing.
+    global wilor_model, yolo_detector, classifier, mlp_letters, mlp_numbers
     # Prepend the WiLoR checkout to sys.path ahead of the `wilor.models` import.
     sys.path.insert(0, WILOR_REPO_PATH)
     from wilor.models import load_wilor
+    from ultralytics import YOLO
+    print(f"Loading WiLoR on {DEVICE}...")
     # load_wilor returns a pair; only its first element (the model) is kept.
+    wilor_model, _ = load_wilor(checkpoint_path=WILOR_CKPT, cfg_path=WILOR_CFG)
+    wilor_model.to(DEVICE)
+    wilor_model.eval()
+    print("Loading YOLO detector...")
+    yolo_detector = YOLO(DETECTOR_PATH)
+    print("Loading classifiers...")
     # NOTE(review): joblib.load unpickles these files, which executes whatever
     # the pickle contains; the paths come from get_hf_file -- confirm that
     # repository is trusted.
+    classifier  = joblib.load(CLASSIFIER_PATH)
+    mlp_letters = joblib.load(MLP_LETTERS_PATH)
+    mlp_numbers = joblib.load(MLP_NUMBERS_PATH)
+    print("✅ All models loaded successfully!")
+@asynccontextmanager
+async def lifespan(app: FastAPI):
     # Lifespan handler handed to FastAPI(lifespan=...): load_models() runs
     # before the `yield`; there is no code after the `yield`, so nothing is
     # released on shutdown.
+    load_models()
+    yield
+app = FastAPI(title="Arabic Sign Language Interpreter", lifespan=lifespan)
 # CORS is configured wide open: every origin, method and header is allowed.
 # NOTE(review): confirm this is intended for the deployed service.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# ─────────────────────────────────────────────────────────────────────────────
+#  Feature extraction
+# ─────────────────────────────────────────────────────────────────────────────
+def extract_features(crop_rgb: np.ndarray) -> np.ndarray | None:
     # Run WiLoR on a hand crop and build the classifier feature vector:
     # the concatenated pose parameters followed by 7 scale-normalised
     # fingertip distances. Returns None when the model output lacks either
     # "pred_mano_params" or "pred_keypoints_3d".
+    img_input  = cv2.resize(crop_rgb, (256, 256))
     # unsqueeze(0) adds the leading batch axis before moving to DEVICE.
+    img_tensor = WILOR_TRANSFORM(img_input).unsqueeze(0).to(DEVICE)
+    with torch.no_grad():
+        output = wilor_model({"img": img_tensor})
+    if "pred_mano_params" not in output or "pred_keypoints_3d" not in output:
+        return None
+    mano          = output["pred_mano_params"]
     # Index [0] picks the single item of the batch; both arrays are flattened
     # to 1-D and joined as global_orient first, then hand_pose.
+    hand_pose     = mano["hand_pose"][0].cpu().numpy().flatten()
+    global_orient = mano["global_orient"][0].cpu().numpy().flatten()
+    theta         = np.concatenate([global_orient, hand_pose])
+    joints     = output["pred_keypoints_3d"][0].cpu().numpy()
     # Indices treated as the five fingertips.
     # NOTE(review): presumably the usual 21-keypoint hand layout (thumb..pinky
     # tips, joint 0 = wrist, joint 9 = middle-finger base) -- confirm.
+    tips       = [4, 8, 12, 16, 20]
     # Normalising length; the 1e-8 keeps the divisions below away from zero.
     hand_scale = distance.euclidean(joints[0], joints[9]) + 1e-8
     dist_feats = []
     # Four distances: tips[0] to each of the other four tips.
     for i in range(1, 5):
         dist_feats.append(distance.euclidean(joints[tips[0]], joints[tips[i]]) / hand_scale)
     # Three distances: between consecutive tips 1-2, 2-3 and 3-4.
     for i in range(1, 4):
         dist_feats.append(distance.euclidean(joints[tips[i]], joints[tips[i + 1]]) / hand_scale)
     return np.concatenate([theta, dist_feats])
+def get_3d_joints(crop_rgb: np.ndarray) -> np.ndarray:
     # Run WiLoR on a hand crop and return the first batch item of
     # "pred_keypoints_3d" as a NumPy array. The preprocessing and forward pass
     # repeat what extract_features does; unlike that function there is no
     # check that the key exists, so a missing key raises KeyError.
     img_input  = cv2.resize(crop_rgb, (256, 256))
+    img_tensor = WILOR_TRANSFORM(img_input).unsqueeze(0).to(DEVICE)
     with torch.no_grad():
         output = wilor_model({"img": img_tensor})
+    return output["pred_keypoints_3d"][0].cpu().numpy()
+def read_image_from_upload(file_bytes: bytes) -> np.ndarray:
+    arr = np.frombuffer(file_bytes, np.uint8)
+    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
+    if img is None:
+        raise HTTPException(status_code=400, detail="Invalid image format.")
+    return img
 def _align_features(model, features: np.ndarray) -> pd.DataFrame:
     # Fit `features` to the column list the model exposes as
     # `feature_names_in_` and return it as a one-row DataFrame with those
     # column names.
     #
     # The values are copied positionally. If `features` is longer than the
     # column list the excess is silently dropped; if shorter, the remaining
     # columns stay 0.0. No error is raised on a length mismatch.
     expected_cols = model.feature_names_in_
+    vec           = np.zeros(len(expected_cols))
     limit         = min(len(features), len(vec))
     vec[:limit]   = features[:limit]
     return pd.DataFrame([vec], columns=expected_cols)
+def run_two_stage(features: np.ndarray) -> dict:
     # Classify a feature vector in two steps and return a dict with the keys
     # "sign", "sign_confidence", "category" and "category_confidence".
     # Both confidences are the maximum predict_proba value, rounded to 4 places.
+    # Stage 1: letter or number?
     feat_df  = _align_features(classifier, features)
     category = str(classifier.predict(feat_df)[0])
+    cat_conf = float(classifier.predict_proba(feat_df)[0].max())
+    # Stage 2: which sign exactly?
     # The stage-1 label is matched case-insensitively, ignoring surrounding
     # whitespace, against English and Arabic spellings of each category.
     cat = category.lower().strip()
     if cat in ("letter", "letters", "حرف", "حروف"):
         model = mlp_letters
     elif cat in ("number", "numbers", "digit", "digits", "رقم", "أرقام", "ارقام"):
         model = mlp_numbers
     else:
+        # fallback: pick whichever is more confident
         # Ties go to the letters model (>=). Only the choice of model is kept
         # here; the chosen model is evaluated again below.
+        feat_l  = _align_features(mlp_letters, features)
+        feat_n  = _align_features(mlp_numbers, features)
+        prob_l  = float(mlp_letters.predict_proba(feat_l)[0].max())
+        prob_n  = float(mlp_numbers.predict_proba(feat_n)[0].max())
+        model   = mlp_letters if prob_l >= prob_n else mlp_numbers
     feat_df = _align_features(model, features)
     label   = str(model.predict(feat_df)[0])
+    conf    = float(model.predict_proba(feat_df)[0].max())
     return {
         "sign":                label,
+        "sign_confidence":     round(conf, 4),
         "category":            category,
+        "category_confidence": round(cat_conf, 4),
     }
 # ─────────────────────────────────────────────────────────────────────────────
+#  Routes
 # ─────────────────────────────────────────────────────────────────────────────
+@app.get("/")
+def root():
     # Status endpoint: always answers "running" together with the DEVICE string.
+    return {"status": "running", "device": DEVICE}
+@app.post("/predict")
+async def predict(file: UploadFile = File(...)):
     # Classify the sign in an uploaded image.
     # Flow: decode upload -> YOLO detection -> crop -> extract_features ->
     # run_two_stage. The JSON response is the run_two_stage dict plus
     # "hand_side" and "bbox" ([x1, y1, x2, y2] as plain ints).
     # Raises HTTPException 422 when no box is detected or the crop is empty,
     # and 500 when extract_features returns None.
+    raw     = await file.read()
+    img_bgr = read_image_from_upload(raw)
     # The decoded image is converted BGR -> RGB before detection.
+    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
     results = yolo_detector.predict(img_rgb, conf=0.5, verbose=False, device=DEVICE)
     if not results[0].boxes:
         raise HTTPException(status_code=422, detail="No hand detected.")
     # Only the first box of the first result is used; any others are ignored.
     box      = results[0].boxes.xyxy[0].cpu().numpy().astype(int)
     label_id = int(results[0].boxes.cls[0].cpu().item())
     # Detector class 0 is reported as "left", every other class id as "right".
+    hand_side = "left" if label_id == 0 else "right"
+    h, w = img_rgb.shape[:2]
     # Clamp the box to the image bounds before slicing.
     x1, y1, x2, y2 = max(0, box[0]), max(0, box[1]), min(w, box[2]), min(h, box[3])
     crop = img_rgb[y1:y2, x1:x2]
+    if crop.size == 0:
+        raise HTTPException(status_code=422, detail="Empty hand crop.")
+    features = extract_features(crop)
+    if features is None:
+        raise HTTPException(status_code=500, detail="Feature extraction failed.")
+    result = run_two_stage(features)
     # int(...) turns the NumPy integers into plain ints for the JSON body.
+    return JSONResponse({**result, "hand_side": hand_side, "bbox": [int(x1), int(y1), int(x2), int(y2)]})
+@app.post("/predict_with_skeleton")
+async def predict_with_skeleton(file: UploadFile = File(...)):
     raw     = await file.read()
     img_bgr = read_image_from_upload(raw)
     img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
+    results = yolo_detector.predict(img_rgb, conf=0.5, verbose=False, device=DEVICE)
+    if not results[0].boxes:
+        raise HTTPException(status_code=422, detail="No hand detected.")
+    box      = results[0].boxes.xyxy[0].cpu().numpy().astype(int)
+    label_id = int(results[0].boxes.cls[0].cpu().item())
+    hand_side = "left" if label_id == 0 else "right"
+    h, w = img_rgb.shape[:2]
+    x1, y1, x2, y2 = max(0, box[0]), max(0, box[1]), min(w, box[2]), min(h, box[3])
+    crop = img_rgb[y1:y2, x1:x2]
+    features = extract_features(crop)
+    joints   = get_3d_joints(crop)
+    result   = run_two_stage(features)
     _, buf   = cv2.imencode(".png", cv2.cvtColor(crop, cv2.COLOR_RGB2BGR))
     crop_b64 = base64.b64encode(buf).decode("utf-8")
     return JSONResponse({
         **result,
         "hand_side": hand_side,
+        "bbox":      [int(x1), int(y1), int(x2), int(y2)],
         "joints_3d": joints.tolist(),
         "crop_b64":  crop_b64,
     })
 if __name__ == "__main__":
+    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)