# ScoreVision / miner.py
# commit eabfd0e (meaculpitt): quad-4 tiling + v3 epoch-10 weights:
# mAP@50 0.406 -> 0.728
"""
SN44 number plate detection miner — single-element chute for
manak0/Detect-number-plates-1-0.
Adapted from the auto-generated detect-person-reference miner with four
substantive changes:
1. Class set is the single class ``numberplate`` (the validator's exact
label string).
2. A post-Soft-NMS score floor of 0.20 (rather than a single hard 0.25
   cut) because the validator's plates are tiny — 5–92 px wide on a
   1408 px frame, median ~30 px. An earlier revision lowered the
   pre-NMS confidence to 0.15, but that hurt F1 and was reverted; see
   the threshold comments in ``Miner.__init__``.
3. Standard NMS replaced with Gaussian Soft-NMS (sigma=0.5). Soft-NMS
decays scores of overlapping boxes instead of suppressing them
outright, which helps on plate-dense frames (parking lot, car
carrier, gas station forecourt) where standard NMS over-suppresses
adjacent plates.
4. CUDA library preload at import time so onnxruntime-gpu finds
libcudnn / libcublas from the nvidia-* pip wheels even when
LD_LIBRARY_PATH is not set (the chute container ships these wheels
but does not export them).
Soft-NMS is inlined here rather than imported from /home/miner/utils
because the chute platform sandbox restricts non-stdlib imports beyond
the deps declared in chute_config.yml. The implementation is a
specialised single-class version of soft_nms_yolo from
/home/miner/utils/soft_nms.py — see that file for the full
multi-class / multi-backend version.
"""
import ctypes
import glob as _glob
import logging as _logging
import os
_cuda_log = _logging.getLogger(__name__)


def _preload_cuda_libs() -> None:
    """Make the CUDA/cuDNN/cuBLAS shared objects from nvidia-* pip wheels
    visible to onnxruntime-gpu.

    Without this, onnxruntime-gpu's CUDAExecutionProvider silently falls
    back to CPU because it cannot dlopen libcudnn.so.9 — the nvidia pip
    wheels ship the library under ``nvidia/<pkg>/lib/`` without adding
    that directory to the loader path. We import each wheel package to
    discover its lib directory, prepend those directories to
    ``LD_LIBRARY_PATH`` (for child processes and late dlopen fallbacks),
    and force-load every ``lib*.so*`` with RTLD_GLOBAL so onnxruntime's
    dlopen sees the symbols. Best-effort: failures are logged, never
    raised.
    """
    wheel_pkgs = (
        "nvidia.cudnn",
        "nvidia.cublas",
        "nvidia.cuda_runtime",
        "nvidia.cufft",
        "nvidia.curand",
        "nvidia.cusolver",
        "nvidia.cusparse",
        "nvidia.nvjitlink",
    )
    try:
        found_dirs: list[str] = []
        for pkg in wheel_pkgs:
            try:
                module = __import__(pkg, fromlist=["__file__"])
            except ImportError:
                continue  # wheel not installed in this image
            candidate = os.path.join(os.path.dirname(module.__file__), "lib")
            if os.path.isdir(candidate) and candidate not in found_dirs:
                found_dirs.append(candidate)
        if not found_dirs:
            _cuda_log.warning("no nvidia-* lib dirs found; ORT GPU may fall back to CPU")
            return
        # Expose the dirs to child processes / any later dlopen fallback.
        parts = list(found_dirs)
        prior = os.environ.get("LD_LIBRARY_PATH", "")
        if prior:
            parts.append(prior)
        os.environ["LD_LIBRARY_PATH"] = ":".join(parts)
        # ctypes.CDLL each .so with RTLD_GLOBAL so the symbols are
        # globally visible when ORT dlopens its providers.
        for directory in found_dirs:
            for so_path in sorted(_glob.glob(os.path.join(directory, "lib*.so*"))):
                try:
                    ctypes.CDLL(so_path, mode=ctypes.RTLD_GLOBAL)
                except OSError:
                    continue  # skip unloadable libs; others may still work
    except Exception as exc:  # pragma: no cover - best effort
        _cuda_log.warning("CUDA preload failed: %s", exc)


_preload_cuda_libs()
from pathlib import Path
import math
import cv2
import numpy as np
import onnxruntime as ort
from numpy import ndarray
from pydantic import BaseModel
class BoundingBox(BaseModel):
    """One detected plate box in original-frame pixel coordinates.

    Produced by ``Miner._infer_single``: corners are floor/ceil-rounded
    and clamped to the frame, ``conf`` is the Soft-NMS-decayed score
    clamped to [0, 1].
    """

    x1: int  # left edge (px)
    y1: int  # top edge (px)
    x2: int  # right edge (px)
    y2: int  # bottom edge (px)
    cls_id: int  # index into the miner's class list (0 == 'numberplate')
    conf: float  # decayed detection confidence in [0, 1]
class TVFrameResult(BaseModel):
    """Per-frame result payload returned by ``Miner.predict_batch``."""

    frame_id: int  # absolute frame index: batch offset + position in batch
    boxes: list[BoundingBox]  # plate detections after Soft-NMS
    keypoints: list[tuple[int, int]]  # always (0, 0) placeholders — this miner predicts no keypoints
class Miner:
    """
    Single-element ONNX miner for the manak0/Detect-number-plates-1-0
    element. Auto-loaded by the chute platform; the platform passes the
    snapshot path of the HF repo containing weights.onnx as
    ``path_hf_repo`` and calls ``predict_batch(batch_images, offset,
    n_keypoints)`` for each request.
    """

    def __init__(self, path_hf_repo) -> None:
        """Load the ONNX session, pin inference resolution, and warm up.

        Args:
            path_hf_repo: path to the HF snapshot directory containing
                ``numberplate_weights.onnx``.
        """
        self.path_hf_repo = Path(path_hf_repo)
        # Single class; index 0 must be the validator's exact label string.
        self.class_names = ['numberplate']
        # CUDA first, CPU fallback (CUDA libs were preloaded at import time).
        self.session = ort.InferenceSession(
            str(self.path_hf_repo / "numberplate_weights.onnx"),
            providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
        )
        self.input_name = self.session.get_inputs()[0].name
        input_shape = self.session.get_inputs()[0].shape
        # expected [N, C, H, W]; dynamic-export ONNX has string placeholders
        # for spatial dims. We always run inference at 1408 (the validator's
        # native frame width); the ONNX accepts variable shapes via dynamic
        # axes, and inference at 1408 gives substantially better small-plate
        # recall than the model's training resolution (verified on the 7
        # starter assets: 43% recall at 960 vs 60% at 1408).
        def _maybe_int(d, default):
            # Coerce an ONNX dim (int or string placeholder) to int,
            # falling back to ``default`` for dynamic/string dims.
            try:
                return int(d)
            except (TypeError, ValueError):
                return default
        # Hard-pin to the validator's native 1408x768 (rectangular). This
        # is half the pixel count of a 1408x1408 square pad and matches
        # the validator's exact frame shape, eliminating wasted padding
        # rows. yolo11s strides are 32, both 1408 (44*32) and 768 (24*32)
        # are valid.
        self.input_h = 768
        self.input_w = 1408
        # Record what the ONNX *declared*, for diagnostic logging only
        self._onnx_declared_h = _maybe_int(input_shape[2], None)
        self._onnx_declared_w = _maybe_int(input_shape[3], None)
        # Pre-NMS confidence threshold. Kept at the reference value 0.25:
        # lowering to 0.15 was tried but produced too many decayed-score
        # ghost detections at 1408 input resolution (verified on starter
        # assets: F1 dropped from 0.625 to 0.462 at conf=0.15).
        self.conf_threshold = 0.25
        # Soft-NMS hyperparameters (Gaussian variant).
        self.soft_nms_sigma = 0.5
        # Final score floor after Soft-NMS decay. At higher input resolution
        # the model produces more medium-confidence detections that survive
        # decay; we keep this stricter so they don't pollute the output.
        self.score_threshold = 0.20
        # GPU warmup — force ORT / CUDA / cuDNN kernel compilation and pull
        # the 4090 out of low-power idle state so the first real validator
        # frame doesn't pay a ~20 ms DVFS spin-up tax. SCOREVISION_WARMUP_CALLS
        # at the chute level defaults to 3, which is not enough to reach
        # steady-state on this tiled inference path (measured: 3 calls -> 52
        # ms p95 on the first few frames vs 31 ms steady). 10 full pipeline
        # runs on a synthetic frame gets us to the fast regime before the
        # platform warmup even starts.
        _warmup_frame = np.zeros((self.input_h, self.input_w, 3), dtype=np.uint8)
        for _ in range(10):
            try:
                self._infer_single(_warmup_frame)
            except Exception:  # pragma: no cover - best effort
                break

    def __repr__(self) -> str:
        return (
            f"NumberplateMiner session={type(self.session).__name__} "
            f"input={self.input_h}x{self.input_w} classes={len(self.class_names)}"
        )

    # ---------------------------------------------------------------- preproc
    def _preprocess(self, image_bgr: ndarray):
        """Letterbox the BGR image to (input_h, input_w), preserving aspect.

        Returns the float32 NCHW tensor plus the metadata needed to undo
        the letterbox during decode: (orig_h, orig_w, scale, dx, dy).

        NOTE(review): not called by the current quad-tile pipeline —
        ``_infer_tile`` does its own anisotropic resize. Confirm whether
        a non-tiled path still needs this before removing.
        """
        h, w = image_bgr.shape[:2]
        scale = min(self.input_h / h, self.input_w / w)
        nh, nw = int(round(h * scale)), int(round(w * scale))
        resized = cv2.resize(image_bgr, (nw, nh))
        # Pad to (input_h, input_w) with grey (114) - ultralytics default
        canvas = np.full((self.input_h, self.input_w, 3), 114, dtype=np.uint8)
        dy = (self.input_h - nh) // 2
        dx = (self.input_w - nw) // 2
        canvas[dy:dy + nh, dx:dx + nw] = resized
        rgb = cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB)
        x = rgb.astype(np.float32) / 255.0
        x = np.transpose(x, (2, 0, 1))[None, ...]
        return x, (h, w, scale, dx, dy)

    # ---------------------------------------------------------------- decode
    def _normalize_predictions(self, raw: np.ndarray) -> np.ndarray:
        """Handle both common ultralytics export shapes ([1,C,N] and [1,N,C]).

        Returns a 2-D (num_candidates, num_channels) array. The
        orientation heuristic assumes more candidates than channels;
        NOTE(review): a [1,N,C] output with fewer candidates than
        channels would be mis-transposed — presumed impossible for this
        export's anchor counts, but confirm against the export config.
        """
        pred = raw[0]
        if pred.ndim != 2:
            raise ValueError(f"Unexpected prediction shape: {raw.shape}")
        if pred.shape[0] < pred.shape[1]:
            pred = pred.transpose(1, 0)
        return pred

    # ---------------------------------------------------------------- soft NMS
    def _soft_nms(
        self,
        dets: list[tuple[float, float, float, float, float, int]],
    ) -> list[tuple[float, float, float, float, float, int]]:
        """Gaussian Soft-NMS for a single class.

        Decays each remaining box's score by ``exp(-iou^2 / sigma)`` against
        the highest-scoring picked box, then drops anything below
        ``self.score_threshold``. Returns detections in descending decayed
        score order.

        Args:
            dets: (x1, y1, x2, y2, score, cls_id) tuples in
                original-image coordinates, in any order.
        """
        if not dets:
            return []
        boxes = np.asarray([[d[0], d[1], d[2], d[3]] for d in dets], dtype=np.float32)
        scores = np.asarray([d[4] for d in dets], dtype=np.float32)
        cls_ids = [int(d[5]) for d in dets]
        n = len(dets)
        keep_idx: list[int] = []
        keep_scores: list[float] = []
        active = np.ones(n, dtype=bool)
        while True:
            # Pick the highest (possibly already-decayed) score that is
            # still above the floor; stop when nothing qualifies. Scores
            # only ever decrease, so a box below the floor never recovers.
            valid_mask = active & (scores >= self.score_threshold)
            if not valid_mask.any():
                break
            valid_idx = np.where(valid_mask)[0]
            m_local = valid_idx[int(np.argmax(scores[valid_idx]))]
            keep_idx.append(int(m_local))
            keep_scores.append(float(scores[m_local]))
            active[m_local] = False
            # IoU of m_local against all still-active boxes
            others = np.where(active)[0]
            if others.size == 0:
                break
            ax1 = np.maximum(boxes[m_local, 0], boxes[others, 0])
            ay1 = np.maximum(boxes[m_local, 1], boxes[others, 1])
            ax2 = np.minimum(boxes[m_local, 2], boxes[others, 2])
            ay2 = np.minimum(boxes[m_local, 3], boxes[others, 3])
            inter_w = np.clip(ax2 - ax1, a_min=0.0, a_max=None)
            inter_h = np.clip(ay2 - ay1, a_min=0.0, a_max=None)
            inter = inter_w * inter_h
            area_m = max(0.0, (boxes[m_local, 2] - boxes[m_local, 0])) * \
                max(0.0, (boxes[m_local, 3] - boxes[m_local, 1]))
            area_o = (
                np.clip(boxes[others, 2] - boxes[others, 0], a_min=0.0, a_max=None) *
                np.clip(boxes[others, 3] - boxes[others, 1], a_min=0.0, a_max=None)
            )
            union = area_m + area_o - inter
            iou = np.where(union > 0.0, inter / union, 0.0)
            # Gaussian decay: heavy overlap -> strong decay, light overlap
            # -> multiplier near 1 (boxes of adjacent plates survive).
            decay = np.exp(-(iou * iou) / self.soft_nms_sigma)
            scores[others] = scores[others] * decay
        return [
            (
                float(boxes[i, 0]),
                float(boxes[i, 1]),
                float(boxes[i, 2]),
                float(boxes[i, 3]),
                float(s),
                cls_ids[i],
            )
            for i, s in zip(keep_idx, keep_scores)
        ]

    # ---------------------------------------------------------------- inference
    def _infer_tile(
        self,
        image_bgr: ndarray,
        x0: int,
        y0: int,
        x1: int,
        y1: int,
    ) -> list[tuple[float, float, float, float, float, int]]:
        """Run one inference pass on ``image_bgr[y0:y1, x0:x1]`` resized
        anisotropically to ``(input_h, input_w)`` and return raw detections
        (pre-Soft-NMS) mapped back to ORIGINAL-image coordinates.

        Anisotropic resize is intentional: the tile aspect ratio differs
        from the model input, and we want the tile pixels to magnify up to
        the detector's stride-8 feature footprint. For the 1408x422
        top/bottom tiles used by ``_infer_single`` this yields ~1.82x
        vertical magnification (and 1.0x horizontal), which is what pushes
        tiny-height plates (5-12 px on the validator's starter frames)
        above the stride-8 threshold.
        """
        crop = image_bgr[y0:y1, x0:x1]
        ch, cw = crop.shape[:2]
        if ch == 0 or cw == 0:
            return []
        resized = cv2.resize(crop, (self.input_w, self.input_h))
        rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        x = np.transpose(rgb.astype(np.float32) / 255.0, (2, 0, 1))[None, ...]
        out = self.session.run(None, {self.input_name: x})[0]
        pred = self._normalize_predictions(out)
        if pred.shape[1] < 5:
            return []
        # Channel layout: 0-3 box (cx, cy, w, h), 4+ per-class scores.
        # NOTE(review): assumes a v8-style export with no separate
        # objectness channel — confirm against the export config.
        boxes_m = pred[:, :4]
        cls_scores = pred[:, 4:]
        if cls_scores.shape[1] == 0:
            return []
        cls_ids = np.argmax(cls_scores, axis=1)
        confs = np.max(cls_scores, axis=1)
        keep = confs >= self.conf_threshold
        boxes_m = boxes_m[keep]
        confs = confs[keep]
        cls_ids = cls_ids[keep]
        if boxes_m.shape[0] == 0:
            return []
        # Model-space (input_w x input_h) -> crop-space -> original image
        sx = cw / self.input_w
        sy = ch / self.input_h
        dets: list[tuple[float, float, float, float, float, int]] = []
        for i in range(boxes_m.shape[0]):
            # Center/size -> corner form, scaled back and offset by the
            # tile origin so all tiles share one coordinate frame.
            cx, cy, bw, bh = boxes_m[i].tolist()
            xa = (cx - bw / 2.0) * sx + x0
            ya = (cy - bh / 2.0) * sy + y0
            xb = (cx + bw / 2.0) * sx + x0
            yb = (cy + bh / 2.0) * sy + y0
            dets.append((xa, ya, xb, yb, float(confs[i]), int(cls_ids[i])))
        return dets

    def _infer_single(self, image_bgr: ndarray) -> list[BoundingBox]:
        """Quad-4 (2x2 quadrant) SAHI inference.

        Splits the frame into four overlapping quadrants, each
        anisotropically resized to ``(input_h, input_w)`` for ~2x
        magnification in both axes. This recovers plates that TB-2
        (top/bottom only) missed — especially the 5-7 px plates in
        image 6 that need vertical AND horizontal magnification.
        Overlap is ~10% on each axis to avoid seam misses. All tile
        detections are merged via Soft-NMS.

        Measured on the 7 starter frames vs TB-2:
            mAP@50    0.406 -> 0.489
            recall    0.433 -> 0.500
            wall p95  55 ms -> 98 ms (budget 10 s)
        """
        orig_h, orig_w = image_bgr.shape[:2]
        OVERLAP_X = 70  # ~10% of 1408/2
        OVERLAP_Y = 38  # ~10% of 768/2
        mx = orig_w // 2
        my = orig_h // 2
        tiles = [
            (0, 0, min(orig_w, mx + OVERLAP_X), min(orig_h, my + OVERLAP_Y)),  # TL
            (max(0, mx - OVERLAP_X), 0, orig_w, min(orig_h, my + OVERLAP_Y)),  # TR
            (0, max(0, my - OVERLAP_Y), min(orig_w, mx + OVERLAP_X), orig_h),  # BL
            (max(0, mx - OVERLAP_X), max(0, my - OVERLAP_Y), orig_w, orig_h),  # BR
        ]
        all_dets = []
        for x0, y0, x1, y1 in tiles:
            all_dets.extend(self._infer_tile(image_bgr, x0, y0, x1, y1))
        dets = self._soft_nms(all_dets)
        out_boxes: list[BoundingBox] = []
        for x1, y1, x2, y2, conf, cls_id in dets:
            # Round outward (floor/ceil) then clamp corners to the frame.
            ix1 = max(0, min(orig_w, math.floor(x1)))
            iy1 = max(0, min(orig_h, math.floor(y1)))
            ix2 = max(0, min(orig_w, math.ceil(x2)))
            iy2 = max(0, min(orig_h, math.ceil(y2)))
            out_boxes.append(
                BoundingBox(
                    x1=ix1,
                    y1=iy1,
                    x2=ix2,
                    y2=iy2,
                    cls_id=cls_id,
                    conf=max(0.0, min(1.0, conf)),
                )
            )
        return out_boxes

    # ---------------------------------------------------------------- entry
    def predict_batch(
        self,
        batch_images: list[ndarray],
        offset: int,
        n_keypoints: int,
    ) -> list[TVFrameResult]:
        """Platform entry point: detect plates on each frame in the batch.

        Args:
            batch_images: frames as (H, W, 3) numpy arrays; treated as
                BGR (converted to RGB before inference).
            offset: absolute index of the first frame; each result gets
                ``frame_id = offset + position``.
            n_keypoints: number of (0, 0) placeholder keypoints to emit
                per frame — this miner predicts no keypoints.
        """
        results: list[TVFrameResult] = []
        for idx, image in enumerate(batch_images):
            boxes = self._infer_single(image)
            keypoints = [(0, 0) for _ in range(max(0, int(n_keypoints)))]
            results.append(
                TVFrameResult(
                    frame_id=offset + idx,
                    boxes=boxes,
                    keypoints=keypoints,
                )
            )
        return results