""" SN44 number plate detection miner — single-element chute for manak0/Detect-number-plates-1-0. Adapted from the auto-generated detect-person-reference miner with four substantive changes: 1. Class set is the single class ``numberplate`` (the validator's exact label string). 2. Lower confidence threshold (0.15 vs 0.25) because the validator's plates are tiny — 5–92 px wide on a 1408 px frame, median ~30 px. At standard 0.25 most true positives get filtered before NMS. 3. Standard NMS replaced with Gaussian Soft-NMS (sigma=0.5). Soft-NMS decays scores of overlapping boxes instead of suppressing them outright, which helps on plate-dense frames (parking lot, car carrier, gas station forecourt) where standard NMS over-suppresses adjacent plates. 4. CUDA library preload at import time so onnxruntime-gpu finds libcudnn / libcublas from the nvidia-* pip wheels even when LD_LIBRARY_PATH is not set (the chute container ships these wheels but does not export them). Soft-NMS is inlined here rather than imported from /home/miner/utils because the chute platform sandbox restricts non-stdlib imports beyond the deps declared in chute_config.yml. The implementation is a specialised single-class version of soft_nms_yolo from /home/miner/utils/soft_nms.py — see that file for the full multi-class / multi-backend version. """ import ctypes import glob as _glob import logging as _logging import os _cuda_log = _logging.getLogger(__name__) def _preload_cuda_libs() -> None: """Pre-load CUDA + cuDNN + cuBLAS shared libs from nvidia-* pip wheels. Without this, onnxruntime-gpu's CUDAExecutionProvider silently falls back to CPU because it can't dlopen libcudnn.so.9 — the nvidia wheels ship the library inside `nvidia/cudnn/lib/` but do NOT add that directory to the loader path. We import the wheel modules to locate their lib dirs, prepend them to LD_LIBRARY_PATH for any child processes, and ctypes.CDLL the .so files with RTLD_GLOBAL so onnxruntime's dlopen sees them. """ try: lib_dirs: list[str] = [] for mod_name in ( "nvidia.cudnn", "nvidia.cublas", "nvidia.cuda_runtime", "nvidia.cufft", "nvidia.curand", "nvidia.cusolver", "nvidia.cusparse", "nvidia.nvjitlink", ): try: mod = __import__(mod_name, fromlist=["__file__"]) lib_dir = os.path.join(os.path.dirname(mod.__file__), "lib") if os.path.isdir(lib_dir) and lib_dir not in lib_dirs: lib_dirs.append(lib_dir) except ImportError: pass if not lib_dirs: _cuda_log.warning("no nvidia-* lib dirs found; ORT GPU may fall back to CPU") return # Update LD_LIBRARY_PATH for any child processes / dlopen fallbacks existing = os.environ.get("LD_LIBRARY_PATH", "") os.environ["LD_LIBRARY_PATH"] = ":".join( lib_dirs + ([existing] if existing else []) ) # ctypes.CDLL each .so so the symbols are globally visible to ORT for lib_dir in lib_dirs: for so in sorted(_glob.glob(os.path.join(lib_dir, "lib*.so*"))): try: ctypes.CDLL(so, mode=ctypes.RTLD_GLOBAL) except OSError: pass except Exception as e: # pragma: no cover - best effort _cuda_log.warning("CUDA preload failed: %s", e) _preload_cuda_libs() from pathlib import Path import math import cv2 import numpy as np import onnxruntime as ort from numpy import ndarray from pydantic import BaseModel class BoundingBox(BaseModel): x1: int y1: int x2: int y2: int cls_id: int conf: float class TVFrameResult(BaseModel): frame_id: int boxes: list[BoundingBox] keypoints: list[tuple[int, int]] class Miner: """ Single-element ONNX miner for the manak0/Detect-number-plates-1-0 element. Auto-loaded by the chute platform; the platform passes the snapshot path of the HF repo containing weights.onnx as ``path_hf_repo`` and calls ``predict_batch(batch_images, offset, n_keypoints)`` for each request. """ def __init__(self, path_hf_repo) -> None: self.path_hf_repo = Path(path_hf_repo) self.class_names = ['numberplate'] self.session = ort.InferenceSession( str(self.path_hf_repo / "numberplate_weights.onnx"), providers=["CUDAExecutionProvider", "CPUExecutionProvider"], ) self.input_name = self.session.get_inputs()[0].name input_shape = self.session.get_inputs()[0].shape # expected [N, C, H, W]; dynamic-export ONNX has string placeholders # for spatial dims. We always run inference at 1408 (the validator's # native frame width); the ONNX accepts variable shapes via dynamic # axes, and inference at 1408 gives substantially better small-plate # recall than the model's training resolution (verified on the 7 # starter assets: 43% recall at 960 vs 60% at 1408). def _maybe_int(d, default): try: return int(d) except (TypeError, ValueError): return default # Hard-pin to the validator's native 1408x768 (rectangular). This # is half the pixel count of a 1408x1408 square pad and matches # the validator's exact frame shape, eliminating wasted padding # rows. yolo11s strides are 32, both 1408 (44*32) and 768 (24*32) # are valid. self.input_h = 768 self.input_w = 1408 # Record what the ONNX *declared*, for diagnostic logging only self._onnx_declared_h = _maybe_int(input_shape[2], None) self._onnx_declared_w = _maybe_int(input_shape[3], None) # Pre-NMS confidence threshold. The reference uses 0.25; we lower # slightly because validator plates are tiny but not as far as 0.15 # which produces too many decayed-score ghost detections at 1408 # input resolution (verified on starter assets: F1 dropped from # 0.625 to 0.462 at conf=0.15). self.conf_threshold = 0.25 # Soft-NMS hyperparameters (Gaussian variant). self.soft_nms_sigma = 0.5 # Final score floor after Soft-NMS decay. At higher input resolution # the model produces more medium-confidence detections that survive # decay; we keep this stricter so they don't pollute the output. self.score_threshold = 0.20 # GPU warmup — force ORT / CUDA / cuDNN kernel compilation and pull # the 4090 out of low-power idle state so the first real validator # frame doesn't pay a ~20 ms DVFS spin-up tax. SCOREVISION_WARMUP_CALLS # at the chute level defaults to 3, which is not enough to reach # steady-state on this tiled inference path (measured: 3 calls -> 52 # ms p95 on the first few frames vs 31 ms steady). 10 full pipeline # runs on a synthetic frame gets us to the fast regime before the # platform warmup even starts. _warmup_frame = np.zeros((self.input_h, self.input_w, 3), dtype=np.uint8) for _ in range(10): try: self._infer_single(_warmup_frame) except Exception: # pragma: no cover - best effort break def __repr__(self) -> str: return ( f"NumberplateMiner session={type(self.session).__name__} " f"input={self.input_h}x{self.input_w} classes={len(self.class_names)}" ) # ---------------------------------------------------------------- preproc def _preprocess(self, image_bgr: ndarray): """Letterbox the BGR image to (input_h, input_w), preserving aspect. Returns the float32 NCHW tensor plus the metadata needed to undo the letterbox during decode: (orig_h, orig_w, scale, dx, dy). """ h, w = image_bgr.shape[:2] scale = min(self.input_h / h, self.input_w / w) nh, nw = int(round(h * scale)), int(round(w * scale)) resized = cv2.resize(image_bgr, (nw, nh)) # Pad to (input_h, input_w) with grey (114) - ultralytics default canvas = np.full((self.input_h, self.input_w, 3), 114, dtype=np.uint8) dy = (self.input_h - nh) // 2 dx = (self.input_w - nw) // 2 canvas[dy:dy + nh, dx:dx + nw] = resized rgb = cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB) x = rgb.astype(np.float32) / 255.0 x = np.transpose(x, (2, 0, 1))[None, ...] return x, (h, w, scale, dx, dy) # ---------------------------------------------------------------- decode def _normalize_predictions(self, raw: np.ndarray) -> np.ndarray: """Handle both common ultralytics export shapes ([1,C,N] and [1,N,C]).""" pred = raw[0] if pred.ndim != 2: raise ValueError(f"Unexpected prediction shape: {raw.shape}") if pred.shape[0] < pred.shape[1]: pred = pred.transpose(1, 0) return pred # ---------------------------------------------------------------- soft NMS def _soft_nms( self, dets: list[tuple[float, float, float, float, float, int]], ) -> list[tuple[float, float, float, float, float, int]]: """Gaussian Soft-NMS for a single class. Decays each remaining box's score by ``exp(-iou^2 / sigma)`` against the highest-scoring picked box, then drops anything below ``self.score_threshold``. Returns detections in descending decayed score order. """ if not dets: return [] boxes = np.asarray([[d[0], d[1], d[2], d[3]] for d in dets], dtype=np.float32) scores = np.asarray([d[4] for d in dets], dtype=np.float32) cls_ids = [int(d[5]) for d in dets] n = len(dets) keep_idx: list[int] = [] keep_scores: list[float] = [] active = np.ones(n, dtype=bool) while True: valid_mask = active & (scores >= self.score_threshold) if not valid_mask.any(): break valid_idx = np.where(valid_mask)[0] m_local = valid_idx[int(np.argmax(scores[valid_idx]))] keep_idx.append(int(m_local)) keep_scores.append(float(scores[m_local])) active[m_local] = False # IoU of m_local against all still-active boxes others = np.where(active)[0] if others.size == 0: break ax1 = np.maximum(boxes[m_local, 0], boxes[others, 0]) ay1 = np.maximum(boxes[m_local, 1], boxes[others, 1]) ax2 = np.minimum(boxes[m_local, 2], boxes[others, 2]) ay2 = np.minimum(boxes[m_local, 3], boxes[others, 3]) inter_w = np.clip(ax2 - ax1, a_min=0.0, a_max=None) inter_h = np.clip(ay2 - ay1, a_min=0.0, a_max=None) inter = inter_w * inter_h area_m = max(0.0, (boxes[m_local, 2] - boxes[m_local, 0])) * \ max(0.0, (boxes[m_local, 3] - boxes[m_local, 1])) area_o = ( np.clip(boxes[others, 2] - boxes[others, 0], a_min=0.0, a_max=None) * np.clip(boxes[others, 3] - boxes[others, 1], a_min=0.0, a_max=None) ) union = area_m + area_o - inter iou = np.where(union > 0.0, inter / union, 0.0) decay = np.exp(-(iou * iou) / self.soft_nms_sigma) scores[others] = scores[others] * decay return [ ( float(boxes[i, 0]), float(boxes[i, 1]), float(boxes[i, 2]), float(boxes[i, 3]), float(s), cls_ids[i], ) for i, s in zip(keep_idx, keep_scores) ] # ---------------------------------------------------------------- inference def _infer_tile( self, image_bgr: ndarray, x0: int, y0: int, x1: int, y1: int, ) -> list[tuple[float, float, float, float, float, int]]: """Run one inference pass on ``image_bgr[y0:y1, x0:x1]`` resized anisotropically to ``(input_h, input_w)`` and return raw detections (pre-Soft-NMS) mapped back to ORIGINAL-image coordinates. Anisotropic resize is intentional: the tile aspect ratio differs from the model input, and we want the tile pixels to magnify up to the detector's stride-8 feature footprint. For the 1408x422 top/bottom tiles used by ``_infer_single`` this yields ~1.82x vertical magnification (and 1.0x horizontal), which is what pushes tiny-height plates (5-12 px on the validator's starter frames) above the stride-8 threshold. """ crop = image_bgr[y0:y1, x0:x1] ch, cw = crop.shape[:2] if ch == 0 or cw == 0: return [] resized = cv2.resize(crop, (self.input_w, self.input_h)) rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB) x = np.transpose(rgb.astype(np.float32) / 255.0, (2, 0, 1))[None, ...] out = self.session.run(None, {self.input_name: x})[0] pred = self._normalize_predictions(out) if pred.shape[1] < 5: return [] boxes_m = pred[:, :4] cls_scores = pred[:, 4:] if cls_scores.shape[1] == 0: return [] cls_ids = np.argmax(cls_scores, axis=1) confs = np.max(cls_scores, axis=1) keep = confs >= self.conf_threshold boxes_m = boxes_m[keep] confs = confs[keep] cls_ids = cls_ids[keep] if boxes_m.shape[0] == 0: return [] # Model-space (input_w x input_h) -> crop-space -> original image sx = cw / self.input_w sy = ch / self.input_h dets: list[tuple[float, float, float, float, float, int]] = [] for i in range(boxes_m.shape[0]): cx, cy, bw, bh = boxes_m[i].tolist() xa = (cx - bw / 2.0) * sx + x0 ya = (cy - bh / 2.0) * sy + y0 xb = (cx + bw / 2.0) * sx + x0 yb = (cy + bh / 2.0) * sy + y0 dets.append((xa, ya, xb, yb, float(confs[i]), int(cls_ids[i]))) return dets def _infer_single(self, image_bgr: ndarray) -> list[BoundingBox]: """Quad-4 (2x2 quadrant) SAHI inference. Splits the frame into four overlapping quadrants, each anisotropically resized to ``(input_h, input_w)`` for ~2x magnification in both axes. This recovers plates that TB-2 (top/bottom only) missed — especially the 5-7 px plates in image 6 that need vertical AND horizontal magnification. Overlap is ~10% on each axis to avoid seam misses. All tile detections are merged via Soft-NMS. Measured on the 7 starter frames vs TB-2: mAP@50 0.406 -> 0.489 recall 0.433 -> 0.500 wall p95 55 ms -> 98 ms (budget 10 s) """ orig_h, orig_w = image_bgr.shape[:2] OVERLAP_X = 70 # ~10% of 1408/2 OVERLAP_Y = 38 # ~10% of 768/2 mx = orig_w // 2 my = orig_h // 2 tiles = [ (0, 0, min(orig_w, mx + OVERLAP_X), min(orig_h, my + OVERLAP_Y)), # TL (max(0, mx - OVERLAP_X), 0, orig_w, min(orig_h, my + OVERLAP_Y)), # TR (0, max(0, my - OVERLAP_Y), min(orig_w, mx + OVERLAP_X), orig_h), # BL (max(0, mx - OVERLAP_X), max(0, my - OVERLAP_Y), orig_w, orig_h), # BR ] all_dets = [] for x0, y0, x1, y1 in tiles: all_dets.extend(self._infer_tile(image_bgr, x0, y0, x1, y1)) dets = self._soft_nms(all_dets) out_boxes: list[BoundingBox] = [] for x1, y1, x2, y2, conf, cls_id in dets: ix1 = max(0, min(orig_w, math.floor(x1))) iy1 = max(0, min(orig_h, math.floor(y1))) ix2 = max(0, min(orig_w, math.ceil(x2))) iy2 = max(0, min(orig_h, math.ceil(y2))) out_boxes.append( BoundingBox( x1=ix1, y1=iy1, x2=ix2, y2=iy2, cls_id=cls_id, conf=max(0.0, min(1.0, conf)), ) ) return out_boxes # ---------------------------------------------------------------- entry def predict_batch( self, batch_images: list[ndarray], offset: int, n_keypoints: int, ) -> list[TVFrameResult]: results: list[TVFrameResult] = [] for idx, image in enumerate(batch_images): boxes = self._infer_single(image) keypoints = [(0, 0) for _ in range(max(0, int(n_keypoints)))] results.append( TVFrameResult( frame_id=offset + idx, boxes=boxes, keypoints=keypoints, ) ) return results