Spaces:

SondosM
/

API_2x1

Sleeping

App Files Files Community

SondosM committed on Apr 20

Commit

206b1df

verified ·

1 Parent(s): fbfe038

Upload 3 files

Browse files

Files changed (3) hide show

Dockerfile (1) +41 -0
app (1).py +427 -0
hf_README.md +71 -0

Dockerfile (1) ADDED Viewed

	@@ -0,0 +1,41 @@

FROM python:3.10-slim
# System dependencies needed by OpenCV, PyTorch, osmesa (headless GL)
RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    git-lfs \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender1 \
    libgomp1 \
    libosmesa6 \
    ffmpeg \
    wget \
    && rm -rf /var/lib/apt/lists/*
# Set working directory
WORKDIR /app
# Copy requirements first so the dependency layer stays cached when only
# application code changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy everything else
COPY . .
# Make sure the WiLoR repo is importable. No earlier instruction in this file
# defines PYTHONPATH, so the value is set outright: appending ":${PYTHONPATH}"
# would expand to a trailing ":" (an empty search-path entry).
ENV PYTHONPATH="/app/WiLoR"
# Headless OpenGL
ENV PYOPENGL_PLATFORM=osmesa
# Default mode — override in HF Space environment variables
ENV MODE=full
# HF Spaces expects the app on port 7860 by default,
# but we set app_port=8000 in README so uvicorn binds here
EXPOSE 8000
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

app (1).py ADDED Viewed

	@@ -0,0 +1,427 @@

+"""
+Arabic Sign Language Interpreter - FastAPI Server (Optimized)
+Pipeline:
+  Image Input
+    ──► YOLO Detection        (hand crop)
+    ──► WiLoR 3D Pose         (extract 3D joints + MANO params)
+    ──► Stage-1: classifier.pkl    →  "letter" or "number"?
+    ──► Stage-2: MLP_letters.pkl   →  specific Arabic letter
+             OR  MLP_numbers.pkl   →  specific digit
+    ──► JSON Response
+Modes (set MODE env var):
+  full        : YOLO + WiLoR FP32  + MLP   (~1.1–2.5 GB)
+  quantized   : YOLO + WiLoR INT8  + MLP   (~600 MB–1.2 GB)
+  lightweight : MediaPipe           + MLP   (~50 MB)
+"""
+import io
+import base64
+import inspect
+import sys
+import os
+import types
+from unittest.mock import MagicMock
+import numpy as np
+import cv2
+import torch
+import joblib
+import pandas as pd
+from pathlib import Path
+from scipy.spatial import distance
+from torchvision import transforms
+from PIL import Image
+from contextlib import asynccontextmanager
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+import uvicorn
# ─── Runtime mode ──────────────────────────────────────────────────────────────
_VALID_MODES = ("full", "quantized", "lightweight")


def _read_mode(environ=None) -> str:
    """Return the validated runtime mode read from the ``MODE`` variable.

    Args:
        environ: Mapping to read ``MODE`` from; defaults to ``os.environ``.

    Returns:
        The mode name, stripped and lower-cased (``"full"`` when unset).

    Raises:
        ValueError: If the value is not one of ``_VALID_MODES``.
    """
    source = os.environ if environ is None else environ
    mode = source.get("MODE", "full").strip().lower()
    # An explicit raise (instead of ``assert``) keeps this check active under
    # ``python -O``, which strips assert statements.
    if mode not in _VALID_MODES:
        raise ValueError(
            f"Unknown MODE={mode!r}. Choose full | quantized | lightweight."
        )
    return mode


MODE = _read_mode()
print(f"[INFO] Running in MODE={MODE!r}")
# ─── Compatibility patches ─────────────────────────────────────────────────────
# Some dependencies still call ``inspect.getargspec``; alias it to
# ``getfullargspec`` when the running interpreter does not provide it.
if not hasattr(inspect, "getargspec"):
    inspect.getargspec = inspect.getfullargspec

# Re-create the builtin-type aliases on the numpy namespace (``np.int``,
# ``np.float``, ...) wherever the installed numpy no longer exposes them.
_NUMPY_BUILTIN_ALIASES = {
    "int": int,
    "float": float,
    "complex": complex,
    "bool": bool,
    "object": object,
    "str": str,
    "unicode": str,
}
for _alias, _builtin in _NUMPY_BUILTIN_ALIASES.items():
    if hasattr(np, _alias):
        continue  # numpy already defines this name; leave it untouched
    setattr(np, _alias, _builtin)
# ─── Pyrender / OpenGL mock (headless) ────────────────────────────────────────
# A stand-in ``pyrender`` module is registered so that ``import pyrender``
# resolves without the real package; each listed attribute is bound to the
# ``MagicMock`` class itself.
_PYRENDER_ATTRS = (
    "Scene", "Mesh", "Node", "PerspectiveCamera", "DirectionalLight",
    "PointLight", "SpotLight", "OffscreenRenderer", "RenderFlags",
    "Viewer", "MetallicRoughnessMaterial",
)
pyrender_mock = types.ModuleType("pyrender")
for _attr in _PYRENDER_ATTRS:
    setattr(pyrender_mock, _attr, MagicMock)
sys.modules["pyrender"] = pyrender_mock

# Empty placeholder modules for the OpenGL packages; an entry that is already
# present in ``sys.modules`` is kept as-is.
_OPENGL_MODULES = (
    "OpenGL", "OpenGL.GL", "OpenGL.GL.framebufferobjects",
    "OpenGL.platform", "OpenGL.error",
)
for _mod in _OPENGL_MODULES:
    sys.modules.setdefault(_mod, types.ModuleType(_mod))

os.environ["PYOPENGL_PLATFORM"] = "osmesa"
# ─── Configuration ─────────────────────────────────────────────────────────────
# Every path below is relative to the process working directory.
WILOR_REPO_PATH = "./WiLoR"
WILOR_CKPT = "./pretrained_models/wilor_final.ckpt"
WILOR_CFG = "./pretrained_models/model_config.yaml"
CLASSIFIER_PATH = "./classifier.pkl"
MLP_LETTERS_PATH = "./MLP_letters.pkl"
MLP_NUMBERS_PATH = "./MLP_numbers.pkl"

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Per-channel normalisation applied to the hand crop before WiLoR.
# NOTE(review): these are the values commonly used as ImageNet statistics —
# confirm they match what the WiLoR checkpoint was trained with.
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]
WILOR_TRANSFORM = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
])
def _resolve_detector_path() -> str:
    """Return the first detector location that exists on disk.

    The candidates are probed in a fixed order relative to the working
    directory and the first existing one is returned unchanged.

    Raises:
        FileNotFoundError: If none of the candidate paths exists.
    """
    search_order = (
        "./detector",
        "./pretrained_models/detector.pt",
        "./pretrained_models/detector",
    )
    found = next((p for p in search_order if Path(p).exists()), None)
    if found is None:
        raise FileNotFoundError("Detector not found in any candidate path!")
    return found
# ─── Global model handles ──────────────────────────────────────────────────────
# All handles start empty and are filled in by ``load_models()``.
# Stage-1 / stage-2 estimators (loaded in every mode).
classifier = mlp_letters = mlp_numbers = None
# Detector + 3D pose model (loaded in "full" and "quantized" modes).
wilor_model = yolo_detector = None
# MediaPipe Hands instance (loaded in "lightweight" mode).
mp_hands_model = None
+# ─────────────────────────────────────────────────────────────────────────────
+#  Model loading
+# ─────────────────────────────────────────────────────────────────────────────
def _load_wilor_full():
    """Load the WiLoR checkpoint and return the model in eval mode on ``DEVICE``."""
    # The WiLoR repo is not an installed package; expose it on sys.path so the
    # import below resolves.
    sys.path.insert(0, WILOR_REPO_PATH)
    from wilor.models import load_wilor

    net, _cfg = load_wilor(checkpoint_path=WILOR_CKPT, cfg_path=WILOR_CFG)
    placed = net.to(DEVICE)
    placed.eval()
    return net
def _load_wilor_quantized():
    """Load WiLoR on CPU with its ``nn.Linear`` layers dynamically quantized to INT8.

    The model is kept in FP32 before quantization: ``quantize_dynamic`` is
    documented as converting a *float* model, and the dynamically quantized
    linear layers take float32 activations. Casting the whole network to FP16
    first leaves the remaining layers producing half-precision activations
    that the quantized layers cannot consume.

    Returns:
        The quantized model, in eval mode, on the CPU.
    """
    sys.path.insert(0, WILOR_REPO_PATH)
    from wilor.models import load_wilor
    import torch.quantization

    print("[INFO] Loading WiLoR (INT8 dynamic quantization)...")
    model, _ = load_wilor(checkpoint_path=WILOR_CKPT, cfg_path=WILOR_CFG)
    model.to("cpu").eval()
    model = torch.quantization.quantize_dynamic(
        model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8,
    )

    def _cast_image_to_fp32(_module, args):
        # Callers may still hand over an FP16 image tensor; promote it so the
        # FP32 / INT8 layers receive the dtype they expect.
        batch = args[0] if args else None
        if isinstance(batch, dict) and torch.is_tensor(batch.get("img")):
            batch["img"] = batch["img"].float()

    model.register_forward_pre_hook(_cast_image_to_fp32)
    return model
def _load_mediapipe():
    """Create the MediaPipe Hands instance used in lightweight mode.

    The detector is configured for single still images and at most one hand.
    """
    import mediapipe as mp

    hands_options = {
        "static_image_mode": True,
        "max_num_hands": 1,
        "min_detection_confidence": 0.5,
        "model_complexity": 1,
    }
    return mp.solutions.hands.Hands(**hands_options)
def load_models():
    """Populate the module-level model handles according to ``MODE``.

    The three pickled estimators are loaded in every mode. Lightweight mode
    then loads MediaPipe only; the other modes load the YOLO detector and the
    FP32 ("full") or quantized WiLoR model.
    """
    global wilor_model, yolo_detector, mp_hands_model
    global classifier, mlp_letters, mlp_numbers

    # NOTE: joblib.load unpickles arbitrary objects — these files must come
    # from a trusted source.
    print(f"[INFO] Loading stage-1 classifier from {CLASSIFIER_PATH} ...")
    classifier = joblib.load(CLASSIFIER_PATH)
    print("[INFO] Stage-1 classifier loaded.")

    print(f"[INFO] Loading MLP_letters from {MLP_LETTERS_PATH} ...")
    mlp_letters = joblib.load(MLP_LETTERS_PATH)
    print("[INFO] MLP_letters loaded.")

    print(f"[INFO] Loading MLP_numbers from {MLP_NUMBERS_PATH} ...")
    mlp_numbers = joblib.load(MLP_NUMBERS_PATH)
    print("[INFO] MLP_numbers loaded.")

    if MODE == "lightweight":
        print("[INFO] Loading MediaPipe Hands (lightweight mode)...")
        mp_hands_model = _load_mediapipe()
        print("✅ MediaPipe loaded.")
        print("✅ All models loaded successfully!")
        return

    # Resolve the detector before importing ultralytics so a missing file is
    # reported first.
    detector_path = _resolve_detector_path()
    from ultralytics import YOLO

    print(f"[INFO] Loading YOLO detector from {detector_path} ...")
    yolo_detector = YOLO(detector_path)
    print("[INFO] YOLO loaded.")

    if MODE != "full":
        wilor_model = _load_wilor_quantized()
    else:
        print(f"[INFO] Loading WiLoR FP32 on {DEVICE}...")
        wilor_model = _load_wilor_full()
    print("✅ All models loaded successfully!")
+# ─────────────────────────────────────────────────────────────────────────────
+#  Feature extraction
+# ─────────────────────────────────────────────────────────────────────────────
def _build_features_from_joints(joints: np.ndarray, theta: np.ndarray) -> np.ndarray:
    """Append seven scale-normalised fingertip distances to ``theta``.

    Args:
        joints: Joint coordinates indexable by joint id; rows 0, 4, 8, 9, 12,
            16 and 20 are read.
        theta: Pose parameter vector placed at the front of the result.

    Returns:
        ``theta`` followed by: the distance from tip 4 to each of tips 8, 12,
        16, 20, then the distances between consecutive tips 8-12, 12-16 and
        16-20 — each divided by the joint-0-to-joint-9 distance.
    """
    fingertip_ids = [4, 8, 12, 16, 20]
    # The epsilon keeps the division defined when joints 0 and 9 coincide.
    hand_scale = distance.euclidean(joints[0], joints[9]) + 1e-8

    first_tip, other_tips = fingertip_ids[0], fingertip_ids[1:]
    tip_pairs = [(first_tip, tip) for tip in other_tips]
    tip_pairs += list(zip(other_tips[:-1], other_tips[1:]))

    dist_feats = [
        distance.euclidean(joints[a], joints[b]) / hand_scale
        for a, b in tip_pairs
    ]
    return np.concatenate([theta, dist_feats])
def _wilor_run(crop_rgb: np.ndarray) -> dict:
    """Run WiLoR on one hand crop and return the raw model output.

    The crop is resized to 256x256, passed through ``WILOR_TRANSFORM`` and
    batched. The tensor is then moved to the device and floating-point dtype
    of the loaded model's own parameters, so the input always matches however
    the model was prepared instead of assuming a precision per ``MODE``.

    Args:
        crop_rgb: Hand crop image array (RGB, as produced by the callers).

    Returns:
        Whatever ``wilor_model`` returns for the batch ``{"img": tensor}``.
    """
    img_input = cv2.resize(crop_rgb, (256, 256))
    img_tensor = WILOR_TRANSFORM(img_input).unsqueeze(0)

    reference = next(wilor_model.parameters(), None)
    if reference is not None and reference.is_floating_point():
        img_tensor = img_tensor.to(device=reference.device, dtype=reference.dtype)
    else:
        # No usable parameter to inspect: keep float32 on the mode's device.
        img_tensor = img_tensor.to("cpu" if MODE == "quantized" else DEVICE)

    with torch.no_grad():
        output = wilor_model({"img": img_tensor})
    return output
def extract_features_wilor(crop_rgb: np.ndarray) -> np.ndarray | None:
    """Build the classifier feature vector for a hand crop via WiLoR.

    Returns:
        The feature vector, or ``None`` when the model output lacks the
        ``pred_mano_params`` or ``pred_keypoints_3d`` entry.
    """
    prediction = _wilor_run(crop_rgb)
    required_keys = ("pred_mano_params", "pred_keypoints_3d")
    if any(key not in prediction for key in required_keys):
        return None

    def _first_item_as_numpy(tensor):
        # First batch element, on the CPU, as a float32 numpy array.
        return tensor[0].cpu().float().numpy()

    mano_params = prediction["pred_mano_params"]
    orientation = _first_item_as_numpy(mano_params["global_orient"]).flatten()
    finger_pose = _first_item_as_numpy(mano_params["hand_pose"]).flatten()
    theta = np.concatenate([orientation, finger_pose])
    joints = _first_item_as_numpy(prediction["pred_keypoints_3d"])
    return _build_features_from_joints(joints, theta)
def get_3d_joints_wilor(crop_rgb: np.ndarray) -> np.ndarray:
    """Run WiLoR on the crop and return its ``pred_keypoints_3d`` for the first batch item."""
    keypoints = _wilor_run(crop_rgb)["pred_keypoints_3d"]
    first_hand = keypoints[0]
    return first_hand.cpu().float().numpy()
def extract_features_mediapipe(img_rgb: np.ndarray):
    """Detect one hand with MediaPipe and derive features, joints, side and bbox.

    Returns:
        ``(features, joints, handedness, [x1, y1, x2, y2])``, or four ``None``
        values when MediaPipe reports no hand. The bbox is the landmark extent
        in pixels, padded by 20 and clipped to the image. The pose part of the
        feature vector is 48 zeros; only the landmark-derived distances carry
        information in this mode.
    """
    detection = mp_hands_model.process(img_rgb)
    if not detection.multi_hand_landmarks:
        return None, None, None, None

    height, width = img_rgb.shape[:2]
    landmarks = detection.multi_hand_landmarks[0].landmark
    handedness = detection.multi_handedness[0].classification[0].label.lower()

    joints = np.array([[p.x, p.y, p.z] for p in landmarks], dtype=np.float32)
    # Landmark x/y are scaled by the image width/height to get pixel positions.
    px = (joints[:, 0] * width).astype(int)
    py = (joints[:, 1] * height).astype(int)

    margin = 20
    left = max(0, int(px.min()) - margin)
    top = max(0, int(py.min()) - margin)
    right = min(width, int(px.max()) + margin)
    bottom = min(height, int(py.max()) + margin)

    zero_pose = np.zeros(48, dtype=np.float32)
    features = _build_features_from_joints(joints, zero_pose)
    return features, joints, handedness, [left, top, right, bottom]
+# ─────────────────────────────────────────────────────────────────────────────
+#  Two-stage inference
+# ─────────────────────────────────────────────────────────────────────────────
def _align_features(model, features: np.ndarray) -> pd.DataFrame:
    """Fit ``features`` to the column layout the model was trained with.

    The vector is zero-padded on the right when it is shorter than
    ``model.feature_names_in_`` and truncated when it is longer.

    Returns:
        A one-row float64 DataFrame whose columns are ``feature_names_in_``.
    """
    columns = model.feature_names_in_
    width = len(columns)
    row = np.zeros((1, width), dtype=np.float64)
    head = features[:width]
    row[0, :len(head)] = head
    return pd.DataFrame(row, columns=columns)
def run_stage1(features: np.ndarray) -> tuple[str, float]:
    """Run the stage-1 classifier.

    Returns:
        The predicted category as a string and the highest class probability.
    """
    frame = _align_features(classifier, features)
    predicted = classifier.predict(frame)[0]
    probabilities = classifier.predict_proba(frame)[0]
    return str(predicted), float(probabilities.max())
def run_stage2(category: str, features: np.ndarray) -> tuple[str, float]:
    """Pick the stage-2 model for ``category`` and return ``(label, confidence)``.

    Recognised letter and number tags (English or Arabic) select the matching
    MLP. For any other category both MLPs are scored and the one with the
    higher top probability is used, with ties going to the letters model.
    """
    tag = category.lower().strip()
    letter_tags = ("letter", "letters", "حرف", "حروف")
    number_tags = ("number", "numbers", "digit", "digits", "رقم", "أرقام", "ارقام")

    def _score(model):
        # Aligned input frame plus the model's highest class probability.
        frame = _align_features(model, features)
        return frame, float(model.predict_proba(frame)[0].max())

    if tag in letter_tags:
        chosen = mlp_letters
    elif tag in number_tags:
        chosen = mlp_numbers
    else:
        letters_frame, letters_conf = _score(mlp_letters)
        numbers_frame, numbers_conf = _score(mlp_numbers)
        if numbers_conf > letters_conf:
            return str(mlp_numbers.predict(numbers_frame)[0]), numbers_conf
        return str(mlp_letters.predict(letters_frame)[0]), letters_conf

    frame, confidence = _score(chosen)
    return str(chosen.predict(frame)[0]), confidence
def full_pipeline(features: np.ndarray) -> dict:
    """Run stage 1 then stage 2 and package both results.

    Returns:
        A dict with ``sign``, ``sign_confidence``, ``category`` and
        ``category_confidence``; confidences are rounded to four decimals.
    """
    category, category_conf = run_stage1(features)
    sign, sign_conf = run_stage2(category, features)
    response = {}
    response["sign"] = sign
    response["sign_confidence"] = round(sign_conf, 4)
    response["category"] = category
    response["category_confidence"] = round(category_conf, 4)
    return response
+# ─────────────────────────────────────────────────────────────────────────────
+#  Utilities
+# ─────────────────────────────────────────────────────────────────────────────
def read_image_from_upload(file_bytes: bytes) -> np.ndarray:
    """Decode uploaded bytes into an OpenCV colour image.

    Args:
        file_bytes: Raw content of the uploaded file.

    Returns:
        The decoded image as returned by ``cv2.imdecode`` with
        ``cv2.IMREAD_COLOR``.

    Raises:
        HTTPException: 400 when the upload is empty or is not a decodable image.
    """
    # cv2.imdecode raises on an empty buffer instead of returning None, which
    # would surface as a 500; reject empty uploads as a client error up front.
    if not file_bytes:
        raise HTTPException(status_code=400, detail="Uploaded file is empty.")
    arr = np.frombuffer(file_bytes, np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        raise HTTPException(status_code=400, detail="Cannot decode image.")
    return img
def _clamp_box(box, width: int, height: int) -> list[int]:
    """Clip an ``(x1, y1, x2, y2)`` box to the image and return built-in ints.

    Plain ``int`` values matter because the box is placed in a JSON response
    and numpy integer scalars are not JSON serializable.
    """
    x1, y1, x2, y2 = (int(v) for v in box[:4])
    return [max(0, x1), max(0, y1), min(width, x2), min(height, y2)]


def _yolo_detect(img_rgb: np.ndarray):
    """Detect a hand with YOLO and return its box, side and crop.

    Only the first box in the detector's result is used.

    Args:
        img_rgb: Full input image (RGB).

    Returns:
        ``([x1, y1, x2, y2], side, crop)``: the box clipped to the image as
        built-in ints, ``"left"`` when the predicted class id is 0 and
        ``"right"`` otherwise, and the cropped image region.

    Raises:
        HTTPException: 422 when nothing is detected or the clipped box is empty.
    """
    results = yolo_detector.predict(img_rgb, conf=0.5, verbose=False, device=DEVICE)
    boxes = results[0].boxes
    if not boxes:
        raise HTTPException(status_code=422, detail="No hand detected.")
    label_id = int(boxes.cls[0].cpu().item())
    side = "left" if label_id == 0 else "right"
    h, w = img_rgb.shape[:2]
    x1, y1, x2, y2 = _clamp_box(boxes.xyxy[0].cpu().numpy().astype(int), w, h)
    crop = img_rgb[y1:y2, x1:x2]
    if crop.size == 0:
        raise HTTPException(status_code=422, detail="Empty crop after bounding box clamp.")
    return [x1, y1, x2, y2], side, crop
+# ─────────────────────────────────────────────────────────────────────────────
+#  FastAPI app
+# ─────────────────────────────────────────────────────────────────────────────
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load all models before the app starts serving; nothing runs on shutdown."""
    load_models()
    yield


_APP_DESCRIPTION = (
    "Two-stage pipeline: Stage-1 classifies letter vs number, "
    "Stage-2 identifies the specific sign."
)
app = FastAPI(
    title="Arabic Sign Language Interpreter",
    description=_APP_DESCRIPTION,
    version="2.0.0",
    lifespan=lifespan,
)

# CORS is left fully open: any origin, method and header is accepted.
_cors_options = {
    "allow_origins": ["*"],
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_options)
@app.get("/")
def root():
    """Health check reporting status, compute device, runtime mode and version."""
    return dict(status="running", device=DEVICE, mode=MODE, version="2.0.0")
@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    """Classify the sign shown in an uploaded hand image.

    Responds with the two-stage result plus ``hand_side``, ``bbox`` and
    ``mode``. Raises 422 when no hand is found (lightweight mode) and 500 when
    WiLoR yields no usable output (other modes).
    """
    payload = await file.read()
    bgr = read_image_from_upload(payload)
    img_rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    if MODE != "lightweight":
        bbox, hand_side, crop = _yolo_detect(img_rgb)
        features = extract_features_wilor(crop)
        if features is None:
            raise HTTPException(status_code=500, detail="WiLoR feature extraction failed.")
    else:
        features, _joints, hand_side, bbox = extract_features_mediapipe(img_rgb)
        if features is None:
            raise HTTPException(status_code=422, detail="No hand detected.")

    body = dict(full_pipeline(features))
    body["hand_side"] = hand_side
    body["bbox"] = bbox
    body["mode"] = MODE
    return JSONResponse(body)
@app.post("/predict_with_skeleton")
async def predict_with_skeleton(file: UploadFile = File(...)):
    """Classify the sign and also return the hand joints and the crop.

    Same result as ``/predict`` plus ``joints_3d`` (nested lists) and
    ``crop_b64`` (the hand crop as a base64-encoded PNG).

    In the WiLoR modes the model is run once and both the feature vector and
    the joints are taken from that single output, rather than running the
    network a second time on the same crop just to fetch the joints.

    Raises:
        HTTPException: 422 when no hand is detected or the crop is empty;
            500 when WiLoR output is incomplete or the crop cannot be encoded.
    """
    raw = await file.read()
    img_bgr = read_image_from_upload(raw)
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    if MODE == "lightweight":
        features, joints, hand_side, bbox = extract_features_mediapipe(img_rgb)
        if features is None:
            raise HTTPException(status_code=422, detail="No hand detected.")
        x1, y1, x2, y2 = bbox
        crop = img_rgb[y1:y2, x1:x2]
        # The padded landmark box can collapse after clipping to the image.
        if crop.size == 0:
            raise HTTPException(status_code=422, detail="Empty crop after bounding box clamp.")
    else:
        bbox, hand_side, crop = _yolo_detect(img_rgb)
        output = _wilor_run(crop)
        if "pred_mano_params" not in output or "pred_keypoints_3d" not in output:
            raise HTTPException(status_code=500, detail="WiLoR feature extraction failed.")
        mano = output["pred_mano_params"]
        hand_pose = mano["hand_pose"][0].cpu().float().numpy().flatten()
        global_orient = mano["global_orient"][0].cpu().float().numpy().flatten()
        theta = np.concatenate([global_orient, hand_pose])
        joints = output["pred_keypoints_3d"][0].cpu().float().numpy()
        features = _build_features_from_joints(joints, theta)
    result = full_pipeline(features)
    encoded_ok, buf = cv2.imencode(".png", cv2.cvtColor(crop, cv2.COLOR_RGB2BGR))
    if not encoded_ok:
        raise HTTPException(status_code=500, detail="Failed to encode hand crop.")
    crop_b64 = base64.b64encode(buf).decode("utf-8")
    return JSONResponse({
        **result,
        "hand_side": hand_side,
        "bbox":      bbox,
        "joints_3d": joints.tolist(),
        "crop_b64":  crop_b64,
        "mode":      MODE,
    })
@app.get("/info")
def info():
    """Report runtime diagnostics: mode, device, process RSS and model status."""
    import psutil

    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    proc_mb = rss_bytes / 1024 / 1024

    def _feat_len(model):
        # Number of input columns the estimator recorded, when available.
        if not model or not hasattr(model, "feature_names_in_"):
            return None
        return len(model.feature_names_in_)

    handles = {
        "stage1_classifier": classifier,
        "mlp_letters": mlp_letters,
        "mlp_numbers": mlp_numbers,
        "wilor": wilor_model,
        "yolo": yolo_detector,
        "mediapipe": mp_hands_model,
    }
    return {
        "mode": MODE,
        "device": DEVICE,
        "process_ram_mb": round(proc_mb, 1),
        "classifier_features": _feat_len(classifier),
        "mlp_letters_features": _feat_len(mlp_letters),
        "mlp_numbers_features": _feat_len(mlp_numbers),
        "models_loaded": {
            name: handle is not None for name, handle in handles.items()
        },
    }
if __name__ == "__main__":
    # Direct execution: serve the app on all interfaces, port 8000, no reload.
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=8000,
        reload=False,
    )

hf_README.md ADDED Viewed

	@@ -0,0 +1,71 @@

+---
+title: Arabic Sign Language Interpreter
+emoji: 🤟
+colorFrom: green
+colorTo: blue
+sdk: docker
+pinned: false
+license: mit
+app_port: 8000
+---
+# Arabic Sign Language Interpreter API v2.0
+REST API لتفسير لغة الإشارة العربية — يقبل صورة يد ويرجع الحرف أو الرقم المقابل.
+---
+## البايبلاين
+```
+Image Input
+  ──► YOLO Detection        (crop the hand)
+  ──► WiLoR 3D Pose         (extract 3D joints + MANO params)
+  ──► Stage-1: classifier.pkl    →  "letter" or "number"?
+  ──► Stage-2: MLP_letters.pkl   →  specific Arabic letter
+           OR  MLP_numbers.pkl   →  specific digit
+  ──► JSON Response
+```
+---
+## الـ Endpoints
+### `GET /`
+Health check.
+### `POST /predict`
+الـ endpoint الرئيسي.
+**Request:** `multipart/form-data` — حقل `file` يحتوي على صورة اليد.
+**Response:**
+```json
+{
+  "sign": "ب",
+  "sign_confidence": 0.9731,
+  "category": "letter",
+  "category_confidence": 0.9812,
+  "hand_side": "right",
+  "bbox": [120, 85, 340, 310],
+  "mode": "full"
+}
+```
+### `POST /predict_with_skeleton`
+نفس `/predict` + مفاصل اليد 3D + crop بـ base64.
+### `GET /info`
+معلومات تشخيصية عن الـ runtime.
+---
+## متطلبات الذاكرة
+| Mode      | RAM تقريبي  | الدقة   |
+|-----------|-------------|---------|
+| full      | 1.1–2.5 GB  | الأعلى  |
+| quantized | 600MB–1.2GB | عالية   |
+| lightweight | ~50 MB    | متوسطة  |
+> الـ Space بيشتغل بـ `MODE=full` بشكل افتراضي. غيّريه من متغيرات البيئة في إعدادات الـ Space.