meaculpitt
/

ScoreVision-Beverage

+"""SN44 beverage detection miner — single-element chute for
+manak0/Detect-beverage-detect.
+Adapted from the auto-generated Manako baseline with three substantive
+changes ported from the production numberplate miner:
+1. CUDA library preload at import time so onnxruntime-gpu finds
+   libcudnn / libcublas from the nvidia-* pip wheels even when
+   LD_LIBRARY_PATH is not set.
+2. Letterbox preprocessing (aspect-preserving with grey 114 padding)
+   instead of anisotropic cv2.resize. Beverage geometry (cylindrical
+   bottles/cans/cups) is sensitive to AR distortion.
+3. Standard NMS replaced with per-class Gaussian Soft-NMS (sigma=0.5).
+   Soft-NMS decays scores of overlapping boxes instead of suppressing
+   them outright. Per-class so that an overlapping bottle and cup don't
+   suppress each other (beverage scenes routinely have mixed objects in
+   frame). We use a gentler sigma than the numberplate miner's 0.3
+   because beverage scenes typically have fewer near-duplicate
+   detections than plate scenes.
+Plus a GPU warmup pass in __init__ (10 calls on a synthetic frame) to
+force ORT/CUDA/cuDNN kernel compilation before the first real
+validator frame.
+Soft-NMS is inlined here rather than imported because the chute
+platform sandbox restricts non-stdlib imports beyond the deps declared
+in chute_config.yml.
+NOT ported from numberplate (intentional):
+  - SAHI quad-4 tiling: beverage objects are 50–500 px on validator
+    frames, not 5–30 px like plates — tiling is overkill.
+  - Horizontal-flip TTA: doubles latency for marginal gain.
+  - End2end [1,N,6] shape support: our ONNX export uses raw
+    [1, C, anchors] format with NMS done here.
+  - Aspect-ratio / max-side output filters: plate-specific (plates
+    are wide-flat); beverage geometry is the opposite.
+  - Empty-submission guard: plate-specific failure mode.
+"""
+import ctypes
+import glob as _glob
+import logging as _logging
+import math
+import os
_cuda_log = _logging.getLogger(__name__)


def _preload_cuda_libs() -> None:
    """Pre-load CUDA + cuDNN + cuBLAS shared libs from nvidia-* pip wheels.

    Without this, onnxruntime-gpu's CUDAExecutionProvider silently falls
    back to CPU because it can't dlopen libcudnn.so.9 — the nvidia wheels
    ship the library inside ``nvidia/cudnn/lib/`` but do NOT add that
    directory to the loader path.

    Steps:
      1. Import each known ``nvidia.*`` wheel module and collect its ``lib``
         directory. Both ``__file__`` and ``__path__`` are consulted, because
         a wheel installed as a namespace package has ``__file__ = None``.
      2. Prepend the collected directories to ``LD_LIBRARY_PATH`` (for any
         child processes), dropping duplicates already present there.
      3. ``ctypes.CDLL`` every ``lib*.so*`` with ``RTLD_GLOBAL`` so that
         onnxruntime's later dlopen resolves against them. Libraries that
         fail on the first pass are retried once, since their dependencies
         may only have been loaded later in the same pass.

    Best effort: never raises. Problems are reported through the module
    logger instead.
    """
    try:
        lib_dirs: list[str] = []
        for mod_name in (
            "nvidia.cudnn",
            "nvidia.cublas",
            "nvidia.cuda_runtime",
            "nvidia.cufft",
            "nvidia.curand",
            "nvidia.cusolver",
            "nvidia.cusparse",
            "nvidia.nvjitlink",
        ):
            try:
                mod = __import__(mod_name, fromlist=["__file__"])
            except ImportError:
                # Wheel not installed; nothing to preload for it.
                continue

            # Candidate package roots: the directory of __init__.py when it
            # exists, plus every __path__ entry (covers namespace packages).
            roots: list[str] = []
            mod_file = getattr(mod, "__file__", None)
            if mod_file:
                roots.append(os.path.dirname(mod_file))
            roots.extend(str(p) for p in (getattr(mod, "__path__", None) or ()))

            for root in roots:
                lib_dir = os.path.join(root, "lib")
                if os.path.isdir(lib_dir) and lib_dir not in lib_dirs:
                    lib_dirs.append(lib_dir)

        if not lib_dirs:
            _cuda_log.warning("no nvidia-* lib dirs found; ORT GPU may fall back to CPU")
            return

        # Wheel dirs go first so they take precedence; previously present
        # entries are kept in order, minus exact duplicates of the new ones.
        existing = os.environ.get("LD_LIBRARY_PATH", "")
        tail = [p for p in existing.split(":") if p not in lib_dirs] if existing else []
        os.environ["LD_LIBRARY_PATH"] = ":".join(lib_dirs + tail)

        pending = [
            so
            for lib_dir in lib_dirs
            for so in sorted(_glob.glob(os.path.join(lib_dir, "lib*.so*")))
        ]
        failures: list[tuple[str, OSError]] = []
        # Two passes: the second gives libraries whose dependencies were
        # loaded after them in the first pass another chance.
        for _ in range(2):
            failures = []
            for so in pending:
                try:
                    ctypes.CDLL(so, mode=ctypes.RTLD_GLOBAL)
                except OSError as exc:
                    failures.append((so, exc))
            if not failures:
                break
            pending = [so for so, _exc in failures]
        for so, exc in failures:
            _cuda_log.debug("could not preload %s: %s", so, exc)
    except Exception as e:  # pragma: no cover - best effort
        _cuda_log.warning("CUDA preload failed: %s", e)


_preload_cuda_libs()
+from pathlib import Path
+import cv2
+import numpy as np
+import onnxruntime as ort
+from numpy import ndarray
+from pydantic import BaseModel
class BoundingBox(BaseModel):
    """One detection as an axis-aligned box with its class and confidence."""

    # Corner coordinates; the miner fills these with integer pixel positions
    # in the original frame, (x1, y1) being the top-left corner and
    # (x2, y2) the bottom-right one.
    x1: int
    y1: int
    x2: int
    y2: int
    # Class index of the detection.
    cls_id: int
    # Detection score; the miner clamps it to [0, 1] before constructing.
    conf: float
class TVFrameResult(BaseModel):
    """Per-frame result returned by ``Miner.predict_batch``."""

    # Frame index: the batch offset plus the frame's position in the batch.
    frame_id: int
    # Detections for this frame.
    boxes: list[BoundingBox]
    # Keypoint (x, y) pairs; the miner emits ``n_keypoints`` (0, 0) placeholders.
    keypoints: list[tuple[int, int]]
class Miner:
    """Single-element ONNX miner for the manak0/Detect-beverage-detect element.

    Auto-loaded by the chute platform; the platform passes the snapshot path
    of the HF repo containing weights.onnx as ``path_hf_repo`` and calls
    ``predict_batch(batch_images, offset, n_keypoints)`` for each request.

    Per-frame pipeline: letterbox -> ONNX -> decode to original-image
    coordinates -> per-class hard dedup (IoU >= 0.5) -> per-class Gaussian
    Soft-NMS -> integer, image-clipped ``BoundingBox`` list.

    Detections travel between the internal stages as plain tuples
    ``(x1, y1, x2, y2, score, cls_id)`` in original-image pixel coordinates.
    """

    def __init__(self, path_hf_repo) -> None:
        """Load ``weights.onnx`` from ``path_hf_repo`` and warm up the session."""
        self.path_hf_repo = Path(path_hf_repo)
        self.class_names = ["bottle", "can", "cup"]
        self.session = ort.InferenceSession(
            str(self.path_hf_repo / "weights.onnx"),
            providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
        )
        self.input_name = self.session.get_inputs()[0].name
        # Hard-pin to 960x960 — this is the resolution we trained at and
        # exported the ONNX with. Single-resolution preprocessing keeps the
        # pipeline simple and matches what we've validated. The ONNX itself
        # was exported with dynamic axes so it accepts other shapes too,
        # but there's no reason to deviate from training resolution.
        self.input_h = 960
        self.input_w = 960
        # Pre-NMS confidence threshold. Low floor so Soft-NMS has plenty of
        # candidates to score-decay; final filtering happens via
        # score_threshold below.
        self.conf_threshold = 0.15
        # Gaussian Soft-NMS sigma. 0.5 is the textbook default — gentler
        # than numberplate's 0.3 because beverage scenes are less crowded.
        self.soft_nms_sigma = 0.5
        # Final score floor after Soft-NMS decay.
        self.score_threshold = 0.01

        log = _logging.getLogger(__name__)
        # ORT drops providers it cannot initialise without raising, so make
        # a CPU fallback visible in the logs.
        active_providers = self.session.get_providers()
        if "CUDAExecutionProvider" not in active_providers:
            log.warning(
                "CUDAExecutionProvider not active; running on %s", active_providers
            )

        # GPU warmup — force ORT/CUDA/cuDNN kernel compilation before the
        # first real validator frame. Mirrors the numberplate miner pattern.
        warmup_frame = np.zeros((self.input_h, self.input_w, 3), dtype=np.uint8)
        for _ in range(10):
            try:
                self._infer_single(warmup_frame)
            except Exception as exc:  # pragma: no cover - best effort
                # Still best effort, but no longer silent: a failing warmup
                # usually means every real request will fail the same way.
                log.warning("warmup inference failed: %s", exc)
                break

    def __repr__(self) -> str:
        return (
            f"BeverageMiner session={type(self.session).__name__} "
            f"input={self.input_h}x{self.input_w} classes={len(self.class_names)}"
        )

    # ---------------------------------------------------------------- preproc
    def _preprocess(self, image_bgr: ndarray):
        """Letterbox the BGR image to (input_h, input_w), preserving aspect.

        Grayscale (2-D or single-channel) and BGRA inputs are converted to
        3-channel BGR first.

        Returns:
            ``(x, (orig_h, orig_w, scale, dx, dy))`` where ``x`` is the
            float32 NCHW tensor scaled to [0, 1] in RGB order, and the tuple
            is the metadata needed to undo the letterbox during decode.

        Raises:
            ValueError: if the image is empty or is not 2-D/3-D.
        """
        if image_bgr.ndim not in (2, 3) or image_bgr.size == 0:
            raise ValueError(f"Cannot letterbox image of shape {image_bgr.shape}")
        if image_bgr.ndim == 2 or image_bgr.shape[2] == 1:
            image_bgr = cv2.cvtColor(image_bgr, cv2.COLOR_GRAY2BGR)
        elif image_bgr.shape[2] == 4:
            image_bgr = cv2.cvtColor(image_bgr, cv2.COLOR_BGRA2BGR)

        h, w = image_bgr.shape[:2]
        scale = min(self.input_h / h, self.input_w / w)
        # Clamp to >= 1 px: for extreme aspect ratios the short side can
        # round to 0, which cv2.resize rejects.
        nh = min(self.input_h, max(1, int(round(h * scale))))
        nw = min(self.input_w, max(1, int(round(w * scale))))
        resized = cv2.resize(image_bgr, (nw, nh), interpolation=cv2.INTER_LINEAR)

        # Grey (114) canvas with the resized image centred on it.
        canvas = np.full((self.input_h, self.input_w, 3), 114, dtype=np.uint8)
        dy = (self.input_h - nh) // 2
        dx = (self.input_w - nw) // 2
        canvas[dy:dy + nh, dx:dx + nw] = resized

        rgb = cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB)
        x = rgb.astype(np.float32) / 255.0
        x = np.transpose(x, (2, 0, 1))[None, ...]
        return x, (h, w, scale, dx, dy)

    # ---------------------------------------------------------------- decode
    def _normalize_predictions(self, raw: np.ndarray) -> np.ndarray:
        """Return the batch-0 predictions laid out as ``[N, C]``.

        Handles both common ultralytics export shapes (``[1, C, N]`` and
        ``[1, N, C]``). The expected channel count (4 box values plus one
        score per class name) is matched first; only when neither axis has
        that size does the method fall back to assuming the shorter axis is
        the channel axis.

        Raises:
            ValueError: if ``raw[0]`` is not 2-D.
        """
        pred = raw[0]
        if pred.ndim != 2:
            raise ValueError(f"Unexpected prediction shape: {raw.shape}")
        expected_channels = 4 + len(getattr(self, "class_names", ()))
        rows, cols = pred.shape
        if cols == expected_channels:
            return pred
        if rows == expected_channels:
            return pred.transpose(1, 0)
        # Unknown channel count: keep the historical shape heuristic.
        if rows < cols:
            pred = pred.transpose(1, 0)
        return pred

    def _decode_candidates(
        self,
        pred: np.ndarray,
        scale: float,
        dx: int,
        dy: int,
    ) -> list[tuple[float, float, float, float, float, int]]:
        """Turn ``[N, 4 + classes]`` rows into original-image detections.

        Each row is ``cx, cy, w, h`` in letterbox space followed by the
        per-class scores. Rows whose best class score is below
        ``self.conf_threshold``, or whose box contains NaN/inf, are dropped.
        Remaining boxes are converted to xyxy and mapped back through the
        inverse letterbox: ``(model - pad) / scale``.
        """
        if pred.ndim != 2 or pred.shape[1] < 5:
            return []
        cls_scores = pred[:, 4:]
        cls_ids = np.argmax(cls_scores, axis=1)
        confs = np.max(cls_scores, axis=1)
        # float64 keeps the arithmetic identical to plain Python floats.
        geom = pred[:, :4].astype(np.float64)

        keep = (confs >= self.conf_threshold) & np.isfinite(geom).all(axis=1)
        if not keep.any():
            return []
        geom = geom[keep]
        confs = confs[keep]
        cls_ids = cls_ids[keep]

        half_w = geom[:, 2] / 2.0
        half_h = geom[:, 3] / 2.0
        x1 = (geom[:, 0] - half_w - dx) / scale
        y1 = (geom[:, 1] - half_h - dy) / scale
        x2 = (geom[:, 0] + half_w - dx) / scale
        y2 = (geom[:, 1] + half_h - dy) / scale
        return list(
            zip(
                x1.tolist(),
                y1.tolist(),
                x2.tolist(),
                y2.tolist(),
                confs.tolist(),
                cls_ids.tolist(),
            )
        )

    # ---------------------------------------------------------------- IoU
    @staticmethod
    def _iou_one_to_many(box: np.ndarray, others: np.ndarray):
        """IoU of one xyxy ``box`` against each row of ``others`` (``[K, 4]``).

        Returns ``(iou, has_union)``: ``iou`` is 0.0 wherever the union area
        is not positive, and ``has_union`` marks the pairs whose union is
        positive. Negative widths/heights are treated as zero area.
        """
        ix1 = np.maximum(box[0], others[:, 0])
        iy1 = np.maximum(box[1], others[:, 1])
        ix2 = np.minimum(box[2], others[:, 2])
        iy2 = np.minimum(box[3], others[:, 3])
        inter = np.clip(ix2 - ix1, 0.0, None) * np.clip(iy2 - iy1, 0.0, None)
        area_box = max(0.0, float(box[2] - box[0])) * max(0.0, float(box[3] - box[1]))
        area_others = (
            np.clip(others[:, 2] - others[:, 0], 0.0, None)
            * np.clip(others[:, 3] - others[:, 1], 0.0, None)
        )
        union = area_box + area_others - inter
        has_union = union > 0.0
        iou = np.zeros_like(union)
        # Divide only where defined, so degenerate pairs emit no RuntimeWarning.
        np.divide(inter, union, out=iou, where=has_union)
        return iou, has_union

    # ---------------------------------------------------------------- cluster dedup
    def _cluster_dedup(
        self,
        dets: list[tuple[float, float, float, float, float, int]],
        iou_thresh: float = 0.5,
    ) -> list[tuple[float, float, float, float, float, int]]:
        """Per-class greedy near-duplicate suppression.

        For any pair of same-class detections with IoU >= ``iou_thresh``,
        keep only the higher-confidence one. Runs BEFORE Soft-NMS to kill
        nearly-identical raw detections that Soft-NMS's gentle decay
        leaves above ``score_threshold`` (at sigma=0.5 and IoU≈1.0, a 0.94
        detection decays to only 0.13 — still above the 0.01 floor).

        Per-class (not class-agnostic) so an overlapping bottle/cup pair
        survives intact, consistent with the per-class Soft-NMS choice.

        Returns the surviving input tuples, highest score first.
        """
        if not dets:
            return []
        ranked = sorted(dets, key=lambda d: -d[4])
        total = len(ranked)
        boxes = np.asarray([d[:4] for d in ranked], dtype=np.float64)
        classes = np.asarray([d[5] for d in ranked])
        alive = np.ones(total, dtype=bool)

        kept: list[tuple[float, float, float, float, float, int]] = []
        for i in range(total):
            if not alive[i]:
                continue
            kept.append(ranked[i])
            if i + 1 == total:
                break
            # Compare the winner against every lower-ranked detection at once.
            iou, has_union = self._iou_one_to_many(boxes[i], boxes[i + 1:])
            duplicate = (classes[i + 1:] == classes[i]) & has_union & (iou >= iou_thresh)
            alive[i + 1:] &= ~duplicate
        return kept

    # ---------------------------------------------------------------- soft NMS
    def _soft_nms(
        self,
        dets: list[tuple[float, float, float, float, float, int]],
    ) -> list[tuple[float, float, float, float, float, int]]:
        """Per-class Gaussian Soft-NMS.

        Partitions detections by class id, runs the Gaussian decay
        independently within each class, then merges and sorts by score
        descending. A high-confidence can detection therefore won't
        suppress an overlapping bottle detection.
        """
        if not dets:
            return []
        by_class: dict[int, list[tuple[float, float, float, float, float, int]]] = {}
        for det in dets:
            by_class.setdefault(int(det[5]), []).append(det)
        merged: list[tuple[float, float, float, float, float, int]] = []
        for class_dets in by_class.values():
            merged.extend(self._soft_nms_per_class_pool(class_dets))
        merged.sort(key=lambda d: -d[4])
        return merged

    def _soft_nms_per_class_pool(
        self,
        dets: list[tuple[float, float, float, float, float, int]],
    ) -> list[tuple[float, float, float, float, float, int]]:
        """Gaussian Soft-NMS over a pool of same-class detections.

        Repeatedly picks the highest-scoring remaining detection whose score
        is still >= ``self.score_threshold``, then multiplies every other
        remaining score by ``exp(-iou^2 / self.soft_nms_sigma)`` against the
        picked box. Stops when nothing remains above the threshold.

        Returns the picked detections in pick order (descending decayed
        score) with their input coordinates unchanged; computation is done
        in float64 so scores and coordinates are not rounded to float32.
        """
        if not dets:
            return []
        boxes = np.asarray([d[:4] for d in dets], dtype=np.float64)
        scores = np.asarray([d[4] for d in dets], dtype=np.float64)
        pending = np.ones(len(dets), dtype=bool)

        picked: list[tuple[int, float]] = []
        while True:
            candidates = np.flatnonzero(pending & (scores >= self.score_threshold))
            if candidates.size == 0:
                break
            best = int(candidates[int(np.argmax(scores[candidates]))])
            picked.append((best, float(scores[best])))
            pending[best] = False
            rest = np.flatnonzero(pending)
            if rest.size == 0:
                break
            iou, _ = self._iou_one_to_many(boxes[best], boxes[rest])
            scores[rest] = scores[rest] * np.exp(-(iou * iou) / self.soft_nms_sigma)

        return [
            (
                float(dets[i][0]),
                float(dets[i][1]),
                float(dets[i][2]),
                float(dets[i][3]),
                score,
                int(dets[i][5]),
            )
            for i, score in picked
        ]

    # ---------------------------------------------------------------- inference
    def _to_bounding_boxes(
        self,
        dets: list[tuple[float, float, float, float, float, int]],
        orig_h: int,
        orig_w: int,
    ) -> "list[BoundingBox]":
        """Round detections outward to integer pixels and clip to the image.

        Boxes that have no area left after clipping are dropped; confidences
        are clamped to [0, 1].
        """
        out_boxes: list[BoundingBox] = []
        for x1, y1, x2, y2, conf, cls_id in dets:
            # floor the top-left and ceil the bottom-right so that the
            # integer box always contains the float box.
            ix1 = max(0, min(orig_w, math.floor(x1)))
            iy1 = max(0, min(orig_h, math.floor(y1)))
            ix2 = max(0, min(orig_w, math.ceil(x2)))
            iy2 = max(0, min(orig_h, math.ceil(y2)))
            if ix2 <= ix1 or iy2 <= iy1:
                continue
            out_boxes.append(
                BoundingBox(
                    x1=ix1,
                    y1=iy1,
                    x2=ix2,
                    y2=iy2,
                    cls_id=cls_id,
                    conf=max(0.0, min(1.0, conf)),
                )
            )
        return out_boxes

    def _infer_single(self, image_bgr: ndarray) -> "list[BoundingBox]":
        """Letterbox preprocess -> ONNX -> unletterbox -> dedup -> per-class Soft-NMS.

        Returns an empty list for a ``None`` or empty frame instead of
        raising, so one bad frame does not abort a whole batch.
        """
        if image_bgr is None or image_bgr.size == 0:
            return []
        inp, (orig_h, orig_w, scale, dx, dy) = self._preprocess(image_bgr)
        out = self.session.run(None, {self.input_name: inp})[0]
        pred = self._normalize_predictions(out)
        dets = self._decode_candidates(pred, scale, dx, dy)
        if not dets:
            return []
        # Pre-NMS dedup: kill same-class near-duplicates (IoU >= 0.5) that
        # would otherwise survive Soft-NMS's gentle decay above the score floor.
        dets = self._cluster_dedup(dets, iou_thresh=0.5)
        dets = self._soft_nms(dets)
        return self._to_bounding_boxes(dets, orig_h, orig_w)

    # ---------------------------------------------------------------- entry
    def predict_batch(
        self,
        batch_images: list[ndarray],
        offset: int,
        n_keypoints: int,
    ) -> "list[TVFrameResult]":
        """Run detection on every frame of a batch.

        Args:
            batch_images: BGR frames.
            offset: frame id of the first image; frame ``i`` of the batch
                gets ``frame_id = offset + i``.
            n_keypoints: number of placeholder ``(0, 0)`` keypoints to attach
                to each result; ``None`` or a negative value means none.

        Returns:
            One ``TVFrameResult`` per input frame, in input order.
        """
        keypoint_count = max(0, int(n_keypoints or 0))
        results: list[TVFrameResult] = []
        for idx, image in enumerate(batch_images):
            results.append(
                TVFrameResult(
                    frame_id=offset + idx,
                    boxes=self._infer_single(image),
                    # Fresh list per frame so results never share state.
                    keypoints=[(0, 0)] * keypoint_count,
                )
            )
        return results