meaculpitt
/

ScoreVision

+"""
+Score Vision SN44 — VehicleDetect miner endpoint.
+Class mapping (output indices):
+  0 = car        (COCO class 2)
+  1 = bus        (COCO class 5)
+  2 = truck      (COCO class 7)
+  3 = motorcycle (COCO class 3)
+Accepts: base64-encoded image or raw image bytes via chutes cord.
+Returns: list of {bbox: [x1,y1,x2,y2], score: float, class_id: int, class_name: str}
+CUDA fix: onnxruntime-gpu finds cuDNN/cuBLAS via LD_LIBRARY_PATH (baked into the image
+          as an ENV variable at build time), with ctypes preload as belt-and-suspenders fallback.
+"""
+from __future__ import annotations
+import base64
+import io
+import os
+import time
+from pathlib import Path
+from typing import Any
+import ctypes
+import cv2
+import numpy as np
+from PIL import Image
+# ── cuDNN preload (belt-and-suspenders fallback) ──────────────────────────────
+# Primary fix is the LD_LIBRARY_PATH baked into the image at build time (see the image builder below).
+# This ctypes preload catches any edge cases where the dynamic loader still cannot find the libraries.
+def _preload_cuda_libs() -> None:
+    _NVIDIA = "/usr/local/lib/python3.12/dist-packages/nvidia"
+    _LIBS = [
+        "/usr/lib/x86_64-linux-gnu/libcuda.so.1",         # driver stub — must be first
+        f"{_NVIDIA}/cublas/lib/libcublasLt.so.12",
+        f"{_NVIDIA}/cublas/lib/libcublas.so.12",
+        f"{_NVIDIA}/cudnn/lib/libcudnn.so.9",
+    ]
+    for path in _LIBS:
+        if os.path.exists(path):
+            try:
+                ctypes.CDLL(path, mode=ctypes.RTLD_GLOBAL)
+            except OSError:
+                pass
+_preload_cuda_libs()
+import onnxruntime as ort  # noqa: E402 — must come after preload
+# ── Constants ────────────────────────────────────────────────────────────────
# Paths are resolved relative to this file, not the current working directory.
MODEL_DIR = Path(__file__).parent
WEIGHTS = MODEL_DIR / "weights.onnx"

# Letterbox side length, minimum detection score, and NMS IoU cut-off.
IMG_SIZE = 640
CONF_THRESH = 0.25
IOU_THRESH = 0.45

# COCO class index -> submission class index.
COCO_TO_OUT: dict[int, int] = {
    2: 0,
    5: 1,
    7: 2,
    3: 3,
}
# COCO indices of the vehicle classes, in the mapping's insertion order.
COCO_VEHICLE_IDX = list(COCO_TO_OUT)
# Submission class names, indexed by submission class index.
OUT_NAMES = [
    "car",
    "bus",
    "truck",
    "motorcycle",
]
+# ── Model loader (singleton) ─────────────────────────────────────────────────
_SESSION: ort.InferenceSession | None = None


def get_session() -> ort.InferenceSession:
    """Return the shared ONNX Runtime session, building it on the first call.

    The session is created from ``WEIGHTS`` with the CUDA execution provider
    requested first and the CPU provider second.  It is stored in the
    module-level ``_SESSION`` and handed back unchanged on every later call.
    The first provider reported by the new session is printed once, when the
    session is created.
    """
    global _SESSION
    if _SESSION is not None:
        return _SESSION

    session_options = ort.SessionOptions()
    session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    session_options.enable_mem_pattern = True
    session_options.enable_mem_reuse = True

    cuda_provider = (
        "CUDAExecutionProvider",
        {
            "device_id": 0,
            "arena_extend_strategy": "kNextPowerOfTwo",
            "gpu_mem_limit": 2 * 1024 ** 3,  # 2 GiB
            "cudnn_conv_algo_search": "EXHAUSTIVE",
            "do_copy_in_default_stream": True,
        },
    )
    _SESSION = ort.InferenceSession(
        str(WEIGHTS),
        sess_options=session_options,
        providers=[cuda_provider, "CPUExecutionProvider"],
    )

    active_provider = _SESSION.get_providers()[0]
    print(f"[miner] Model loaded. Provider: {active_provider}", flush=True)
    return _SESSION
+# ── Preprocessing ────────────────────────────────────────────────────────────
def letterbox(img: np.ndarray, size: int = IMG_SIZE) -> tuple[np.ndarray, float, int, int]:
    """Resize ``img`` to fit inside a ``size`` x ``size`` square and pad the rest.

    The image is scaled by a single ratio so that its longer side becomes
    ``size``, then padded with the constant colour (114, 114, 114).  Padding
    is split between the two sides of each axis; when the amount is odd the
    extra pixel goes to the right / bottom.

    Args:
        img: image array whose first two axes are height and width.
        size: side length of the square output.

    Returns:
        ``(padded, ratio, pad_l, pad_t)``: the padded image, the scale ratio
        that was applied, and the left and top padding in pixels.

    Raises:
        ValueError: if ``img`` has zero height or zero width.
    """
    h, w = img.shape[:2]
    if h == 0 or w == 0:
        raise ValueError(f"cannot letterbox an empty image (shape={img.shape})")
    r = min(size / h, size / w)
    # Clamp to one pixel: for extreme aspect ratios the short side would
    # otherwise round to 0, which cv2.resize rejects.
    new_w = max(1, int(round(w * r)))
    new_h = max(1, int(round(h * r)))
    img_r = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    dw, dh = size - new_w, size - new_h
    pad_l, pad_t = dw // 2, dh // 2
    img_p = cv2.copyMakeBorder(
        img_r, pad_t, dh - pad_t, pad_l, dw - pad_l,
        cv2.BORDER_CONSTANT, value=(114, 114, 114),
    )
    return img_p, r, pad_l, pad_t
def preprocess(img_bgr: np.ndarray) -> tuple[np.ndarray, float, int, int]:
    """Turn a BGR image into the float32 array that is fed to the model.

    Steps, in order: letterbox, BGR -> RGB, HWC -> CHW, multiply by 1/255,
    and prepend a leading axis of length 1.

    Returns:
        ``(tensor, ratio, pad_l, pad_t)``: a C-contiguous float32 array plus
        the scale ratio and left/top padding reported by ``letterbox``.
    """
    padded, ratio, pad_l, pad_t = letterbox(img_bgr)
    chw = cv2.cvtColor(padded, cv2.COLOR_BGR2RGB).transpose(2, 0, 1)
    scaled = chw.astype(np.float32) * (1.0 / 255.0)
    batch = np.ascontiguousarray(np.expand_dims(scaled, axis=0))
    return batch, ratio, pad_l, pad_t
+# ── NMS ──────────────────────────────────────────────────────────────────────
def nms(boxes: np.ndarray, scores: np.ndarray, iou_thresh: float = IOU_THRESH) -> list[int]:
    """Greedy non-maximum suppression.

    Args:
        boxes: array whose columns 0-3 are read as ``x1, y1, x2, y2``.
        scores: one score per box.
        iou_thresh: a remaining box is dropped when its IoU with the box just
            selected is strictly greater than this value.

    Returns:
        Indices into ``boxes`` of the boxes that were kept, highest score first.
    """
    if len(boxes) == 0:
        return []

    left, top, right, bottom = (boxes[:, col] for col in range(4))
    box_area = (right - left) * (bottom - top)

    remaining = scores.argsort()[::-1]
    selected: list[int] = []
    while len(remaining) > 0:
        best, rest = remaining[0], remaining[1:]
        selected.append(int(best))

        # Intersection rectangle of the selected box with every remaining box.
        inter_left = np.maximum(left[best], left[rest])
        inter_top = np.maximum(top[best], top[rest])
        inter_right = np.minimum(right[best], right[rest])
        inter_bottom = np.minimum(bottom[best], bottom[rest])
        overlap = np.maximum(0, inter_right - inter_left) * np.maximum(0, inter_bottom - inter_top)

        # The small epsilon keeps the division defined for zero-area pairs.
        union = box_area[best] + box_area[rest] - overlap + 1e-7
        survivors = overlap / union <= iou_thresh
        remaining = rest[survivors]
    return selected
+# ── Postprocessing ───────────────────────────────────────────────────────────
def postprocess(
    raw: np.ndarray,
    ratio: float,
    pad_l: int,
    pad_t: int,
    orig_w: int,
    orig_h: int,
) -> list[dict[str, Any]]:
    """Convert raw model output into per-class, NMS-filtered vehicle detections.

    Args:
        raw: model output for one image, indexed as ``raw[row, candidate]``.
            Rows 0-3 are read as box centre x, centre y, width and height in
            letterboxed pixels; row ``4 + c`` is read as the score of COCO
            class ``c``.  (Assumes the [84, 8400] layout noted by the caller
            — TODO confirm against the exported model.)
        ratio: scale ratio applied by the letterbox step.
        pad_l: left padding added by the letterbox step, in pixels.
        pad_t: top padding added by the letterbox step, in pixels.
        orig_w: width of the original image; x coordinates are clipped to it.
        orig_h: height of the original image; y coordinates are clipped to it.

    Returns:
        One dict per kept box with keys ``bbox`` (``[x1, y1, x2, y2]`` in
        original-image pixels), ``score``, ``class_id`` and ``class_name``.
        NMS is run separately for each vehicle class, so a candidate that
        passes the threshold for several classes can appear once per class.
    """
    veh_row_idx = np.array([4 + c for c in COCO_VEHICLE_IDX])

    # Cheap pre-filter: keep candidates whose best vehicle score passes the threshold.
    candidate_mask = raw[veh_row_idx].max(axis=0) > CONF_THRESH
    if not candidate_mask.any():
        return []
    pred_f = raw[:, candidate_mask]

    # Centre/size -> corners, then undo the letterbox (padding first, then scale).
    cx, cy, bw, bh = pred_f[0], pred_f[1], pred_f[2], pred_f[3]
    x1 = np.clip((cx - bw / 2 - pad_l) / ratio, 0, orig_w)
    y1 = np.clip((cy - bh / 2 - pad_t) / ratio, 0, orig_h)
    x2 = np.clip((cx + bw / 2 - pad_l) / ratio, 0, orig_w)
    y2 = np.clip((cy + bh / 2 - pad_t) / ratio, 0, orig_h)
    boxes = np.stack([x1, y1, x2, y2], axis=1)

    results: list[dict[str, Any]] = []
    for coco_cls in COCO_VEHICLE_IDX:
        scores = pred_f[4 + coco_cls]
        cls_mask = scores > CONF_THRESH
        if not cls_mask.any():
            continue
        # Boolean indexing copies, so materialise the per-class arrays once
        # instead of once per kept detection.
        cls_boxes = boxes[cls_mask]
        cls_scores = scores[cls_mask]
        out_cls = COCO_TO_OUT[coco_cls]
        class_name = OUT_NAMES[out_cls]
        for k in nms(cls_boxes, cls_scores):
            box = cls_boxes[k]
            results.append({
                "bbox": [
                    float(box[0]), float(box[1]),
                    float(box[2]), float(box[3]),
                ],
                "score": float(cls_scores[k]),
                "class_id": out_cls,
                "class_name": class_name,
            })
    return results
+# ── Image decoding helpers ───────────────────────────────────────────────────
def decode_image(data: bytes | str) -> np.ndarray:
    """Decode an encoded image into a BGR array.

    Args:
        data: one of
            * ``str``: base64 text, optionally prefixed as a data URI
              (``data:image/...;base64,<payload>``);
            * ``bytes`` / ``bytearray``: either base64 text or the raw bytes
              of an encoded image file.

    Returns:
        The image as decoded by ``cv2.imdecode`` with ``IMREAD_COLOR``; when
        OpenCV cannot decode it, the image is opened with PIL, converted to
        RGB and then to BGR.

    Raises:
        ValueError: if the payload is empty, or if a ``str`` argument is not
            valid base64 (``binascii.Error`` is a ``ValueError`` subclass).
        Exception: whatever PIL raises when neither decoder recognises the data.
    """
    if isinstance(data, str):
        # Accept data URIs as well as bare base64 text.
        if data.startswith("data:") and "," in data:
            data = data.split(",", 1)[1]
        data = base64.b64decode(data)
    elif isinstance(data, (bytes, bytearray)):
        raw_bytes = bytes(data)
        try:
            # Strict validation: the default (validate=False) silently drops
            # bytes outside the base64 alphabet and can therefore "succeed" on
            # a raw JPEG/PNG, returning garbage.  ASCII whitespace is removed
            # first so line-wrapped base64 is still accepted.
            data = base64.b64decode(raw_bytes.translate(None, b" \t\r\n"), validate=True)
        except ValueError:
            # Not base64: treat the input as the raw bytes of an image file.
            data = raw_bytes
    if not data:
        raise ValueError("empty image data")
    arr = np.frombuffer(data, dtype=np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        # OpenCV could not decode the buffer; try PIL before giving up.
        pil = Image.open(io.BytesIO(data)).convert("RGB")
        img = cv2.cvtColor(np.array(pil), cv2.COLOR_RGB2BGR)
    return img
+# ── Core predict function ────────────────────────────────────────────────────
def predict(image_data: bytes | str | np.ndarray) -> dict[str, Any]:
    """Run the detector on one image and return its detections.

    Args:
        image_data: an already-decoded image array (treated as BGR; 2-D
            grayscale and 4-channel arrays are converted to 3-channel BGR
            first), or anything ``decode_image`` accepts.

    Returns:
        A dict with
            * ``detections``: the list produced by ``postprocess``;
            * ``inference_ms``: wall-clock time of the model run only
              (decoding, pre- and post-processing are not included),
              rounded to 3 decimals;
            * ``provider``: the first provider reported by the session.
    """
    sess = get_session()
    if isinstance(image_data, np.ndarray):
        img_bgr = image_data
        # The rest of the pipeline expects exactly three channels.
        if img_bgr.ndim == 2 or (img_bgr.ndim == 3 and img_bgr.shape[2] == 1):
            img_bgr = cv2.cvtColor(img_bgr, cv2.COLOR_GRAY2BGR)
        elif img_bgr.ndim == 3 and img_bgr.shape[2] == 4:
            img_bgr = cv2.cvtColor(img_bgr, cv2.COLOR_BGRA2BGR)
    else:
        img_bgr = decode_image(image_data)
    orig_h, orig_w = img_bgr.shape[:2]
    inp, ratio, pad_l, pad_t = preprocess(img_bgr)

    # Ask the model for its input name rather than assuming "images", so a
    # re-exported model with a different input name keeps working.
    input_name = sess.get_inputs()[0].name

    t0 = time.perf_counter()
    outputs = sess.run(None, {input_name: inp})
    infer_ms = (time.perf_counter() - t0) * 1000.0

    raw = outputs[0][0]   # first output, first batch element — [84, 8400]
    detections = postprocess(raw, ratio, pad_l, pad_t, orig_w, orig_h)
    return {
        "detections": detections,
        "inference_ms": round(infer_ms, 3),
        "provider": sess.get_providers()[0],
    }
+# ── Chutes cord wrapper ──────────────────────────────────────────────────────
# The chutes SDK is optional: when it is not installed the module still works
# as a plain library / local script, just without the deployment objects.
try:
    from chutes.chute import Chute
    from chutes.chute.node_selector import NodeSelector
    from chutes.image import Image as ChuteImage

    # Read the README once and tolerate its absence: a missing file raises
    # FileNotFoundError (an OSError), which the ImportError handler below does
    # not catch, so it would otherwise abort the import of this whole module.
    # NOTE(review): assumes chutes accepts an empty readme — confirm.
    try:
        _readme_text = (Path(__file__).parent / "README.md").read_text()
    except OSError:
        _readme_text = ""

    chute_image = (
        ChuteImage(
            username="lculpitt",
            name="vehicle-detect-sn44",
            tag="v4-cuda",
            readme=_readme_text,
        )
        .from_base("parachutes/python:3.12")
        .run_command("pip install --upgrade setuptools wheel")
        .run_command(
            "pip install 'numpy>=1.23' 'onnxruntime-gpu>=1.16' "
            "'opencv-python-headless>=4.7' 'pillow>=9.5' "
            "'huggingface_hub>=0.19.4' 'pydantic>=2.0' "
            "'pyyaml>=6.0' 'aiohttp>=3.9'"
        )
        # Bake cuDNN/cuBLAS paths into the image as Docker ENV so onnxruntime
        # CUDAExecutionProvider finds libcudnn.so.9 on every node at container start.
        .with_env(
            "LD_LIBRARY_PATH",
            "/usr/local/lib/python3.12/dist-packages/nvidia/cudnn/lib"
            ":/usr/local/lib/python3.12/dist-packages/nvidia/cublas/lib",
        )
    )
    chute = Chute(
        username="lculpitt",
        name="vehicle-detect-sn44",
        tagline="YOLO11n vehicle detector — car, bus, truck, motorcycle",
        readme=_readme_text,
        image=chute_image,
        concurrency=4,
        max_instances=5,
        shutdown_after_seconds=300,
        scaling_threshold=0.5,
        node_selector=NodeSelector(
            gpu_count=1,
            min_vram_gb_per_gpu=16,
            # All CUDA 12.x, all $0.40–$0.85/hr (within 2.5× spread from cheapest)
            include=["4090", "a40", "a6000", "l40", "l40s"],
        ),
    )

    @chute.cord(path="/predict", method="POST")
    async def predict_cord(image_b64: str) -> dict:
        """
        POST /predict
        Body: {"image_b64": "<base64-encoded image>"}
        Returns detection JSON.
        """
        # NOTE(review): predict() is synchronous, so the event loop is blocked
        # for the duration of each inference — confirm this is acceptable with
        # concurrency=4.
        return predict(image_b64)
except ImportError:
    pass
+# ── Local test ───────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Local smoke test: one prediction with a printed report, followed by a
    # short latency benchmark on the same frame.
    import sys

    banner = "=" * 55
    print(banner)
    print("  miner.py — local smoke test")
    print(banner)

    # Synthetic fallback frame: uniform grey 1280x720 with one green rectangle outline.
    frame = np.full((720, 1280, 3), 128, dtype=np.uint8)
    cv2.rectangle(frame, (100, 100), (400, 300), (0, 255, 0), 3)

    if len(sys.argv) <= 1:
        print("  Using synthetic 1280x720 dummy image.")
    else:
        image_path = sys.argv[1]
        from_disk = cv2.imread(image_path)
        if from_disk is None:
            print(f"  Could not load {image_path}, using dummy.")
        else:
            frame = from_disk
            print(f"  Using image: {image_path}  ({from_disk.shape[1]}x{from_disk.shape[0]})")

    report = predict(frame)
    print(f"\n  Provider     : {report['provider']}")
    print(f"  Inference    : {report['inference_ms']:.2f} ms")
    print(f"  Detections   : {len(report['detections'])}")
    for det in report["detections"]:
        x1, y1, x2, y2 = (round(coord, 1) for coord in det["bbox"])
        print(f"    [{det['class_id']}] {det['class_name']:12s}  score={det['score']:.3f}  "
              f"bbox=[{x1},{y1},{x2},{y2}]")

    n_runs = 50
    print(f"\n  Latency benchmark ({n_runs} runs)...")
    latencies_ms = []
    for _ in range(n_runs):
        started = time.perf_counter()
        predict(frame)
        latencies_ms.append((time.perf_counter() - started) * 1000)
    latencies_ms.sort()

    # Sorted-sample indices 25 and 47 for the 50 runs.
    p50 = latencies_ms[n_runs // 2]
    p95 = latencies_ms[n_runs * 95 // 100]
    fps = 1000.0 / p50
    fps_verdict = "PASS" if fps >= 30 else "FAIL"
    p95_verdict = "PASS" if p95 < 50 else "FAIL"
    print(f"  P50={p50:.2f}ms  P95={p95:.2f}ms  FPS={fps:.1f}")
    print(f"  Target >=30 FPS  : {fps_verdict}")
    print(f"  Target P95<50ms  : {p95_verdict}")
    print(banner)