numberplate: TB-2 sliced inference (top/bottom tile SAHI) + __init__ warmup. Recall 0.20->0.43, F1 0.30->0.55, p95 25->31ms. miner.py only — weights unchanged.
Browse files
miner.py
CHANGED
|
@@ -168,6 +168,21 @@ class Miner:
|
|
| 168 |
# decay; we keep this stricter so they don't pollute the output.
|
| 169 |
self.score_threshold = 0.20
|
| 170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
def __repr__(self) -> str:
|
| 172 |
return (
|
| 173 |
f"NumberplateMiner session={type(self.session).__name__} "
|
|
@@ -276,15 +291,40 @@ class Miner:
|
|
| 276 |
]
|
| 277 |
|
| 278 |
# ---------------------------------------------------------------- inference
|
| 279 |
-
def
|
| 280 |
-
|
| 281 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
pred = self._normalize_predictions(out)
|
| 283 |
|
| 284 |
if pred.shape[1] < 5:
|
| 285 |
return []
|
| 286 |
|
| 287 |
-
|
| 288 |
cls_scores = pred[:, 4:]
|
| 289 |
if cls_scores.shape[1] == 0:
|
| 290 |
return []
|
|
@@ -292,26 +332,66 @@ class Miner:
|
|
| 292 |
cls_ids = np.argmax(cls_scores, axis=1)
|
| 293 |
confs = np.max(cls_scores, axis=1)
|
| 294 |
keep = confs >= self.conf_threshold
|
| 295 |
-
|
| 296 |
-
boxes = boxes[keep]
|
| 297 |
confs = confs[keep]
|
| 298 |
cls_ids = cls_ids[keep]
|
| 299 |
-
|
| 300 |
-
if boxes.shape[0] == 0:
|
| 301 |
return []
|
| 302 |
|
| 303 |
-
#
|
| 304 |
-
|
|
|
|
| 305 |
dets: list[tuple[float, float, float, float, float, int]] = []
|
| 306 |
-
for i in range(
|
| 307 |
-
cx, cy, bw, bh =
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
dets.append((
|
| 313 |
-
|
| 314 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
|
| 316 |
out_boxes: list[BoundingBox] = []
|
| 317 |
for x1, y1, x2, y2, conf, cls_id in dets:
|
|
|
|
| 168 |
# decay; we keep this stricter so they don't pollute the output.
|
| 169 |
self.score_threshold = 0.20
|
| 170 |
|
| 171 |
+
# GPU warmup — force ORT / CUDA / cuDNN kernel compilation and pull
|
| 172 |
+
# the 4090 out of low-power idle state so the first real validator
|
| 173 |
+
# frame doesn't pay a ~20 ms DVFS spin-up tax. SCOREVISION_WARMUP_CALLS
|
| 174 |
+
# at the chute level defaults to 3, which is not enough to reach
|
| 175 |
+
# steady-state on this tiled inference path (measured: 3 calls -> 52
|
| 176 |
+
# ms p95 on the first few frames vs 31 ms steady). 10 full pipeline
|
| 177 |
+
# runs on a synthetic frame gets us to the fast regime before the
|
| 178 |
+
# platform warmup even starts.
|
| 179 |
+
_warmup_frame = np.zeros((self.input_h, self.input_w, 3), dtype=np.uint8)
|
| 180 |
+
for _ in range(10):
|
| 181 |
+
try:
|
| 182 |
+
self._infer_single(_warmup_frame)
|
| 183 |
+
except Exception: # pragma: no cover - best effort
|
| 184 |
+
break
|
| 185 |
+
|
| 186 |
def __repr__(self) -> str:
|
| 187 |
return (
|
| 188 |
f"NumberplateMiner session={type(self.session).__name__} "
|
|
|
|
| 291 |
]
|
| 292 |
|
| 293 |
# ---------------------------------------------------------------- inference
|
| 294 |
+
def _infer_tile(
|
| 295 |
+
self,
|
| 296 |
+
image_bgr: ndarray,
|
| 297 |
+
x0: int,
|
| 298 |
+
y0: int,
|
| 299 |
+
x1: int,
|
| 300 |
+
y1: int,
|
| 301 |
+
) -> list[tuple[float, float, float, float, float, int]]:
|
| 302 |
+
"""Run one inference pass on ``image_bgr[y0:y1, x0:x1]`` resized
|
| 303 |
+
anisotropically to ``(input_h, input_w)`` and return raw detections
|
| 304 |
+
(pre-Soft-NMS) mapped back to ORIGINAL-image coordinates.
|
| 305 |
+
|
| 306 |
+
Anisotropic resize is intentional: the tile aspect ratio differs
|
| 307 |
+
from the model input, and we want the tile pixels to magnify up to
|
| 308 |
+
the detector's stride-8 feature footprint. For the 1408x422
|
| 309 |
+
top/bottom tiles used by ``_infer_single`` this yields ~1.82x
|
| 310 |
+
vertical magnification (and 1.0x horizontal), which is what pushes
|
| 311 |
+
tiny-height plates (5-12 px on the validator's starter frames)
|
| 312 |
+
above the stride-8 threshold.
|
| 313 |
+
"""
|
| 314 |
+
crop = image_bgr[y0:y1, x0:x1]
|
| 315 |
+
ch, cw = crop.shape[:2]
|
| 316 |
+
if ch == 0 or cw == 0:
|
| 317 |
+
return []
|
| 318 |
+
resized = cv2.resize(crop, (self.input_w, self.input_h))
|
| 319 |
+
rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
|
| 320 |
+
x = np.transpose(rgb.astype(np.float32) / 255.0, (2, 0, 1))[None, ...]
|
| 321 |
+
out = self.session.run(None, {self.input_name: x})[0]
|
| 322 |
pred = self._normalize_predictions(out)
|
| 323 |
|
| 324 |
if pred.shape[1] < 5:
|
| 325 |
return []
|
| 326 |
|
| 327 |
+
boxes_m = pred[:, :4]
|
| 328 |
cls_scores = pred[:, 4:]
|
| 329 |
if cls_scores.shape[1] == 0:
|
| 330 |
return []
|
|
|
|
| 332 |
cls_ids = np.argmax(cls_scores, axis=1)
|
| 333 |
confs = np.max(cls_scores, axis=1)
|
| 334 |
keep = confs >= self.conf_threshold
|
| 335 |
+
boxes_m = boxes_m[keep]
|
|
|
|
| 336 |
confs = confs[keep]
|
| 337 |
cls_ids = cls_ids[keep]
|
| 338 |
+
if boxes_m.shape[0] == 0:
|
|
|
|
| 339 |
return []
|
| 340 |
|
| 341 |
+
# Model-space (input_w x input_h) -> crop-space -> original image
|
| 342 |
+
sx = cw / self.input_w
|
| 343 |
+
sy = ch / self.input_h
|
| 344 |
dets: list[tuple[float, float, float, float, float, int]] = []
|
| 345 |
+
for i in range(boxes_m.shape[0]):
|
| 346 |
+
cx, cy, bw, bh = boxes_m[i].tolist()
|
| 347 |
+
xa = (cx - bw / 2.0) * sx + x0
|
| 348 |
+
ya = (cy - bh / 2.0) * sy + y0
|
| 349 |
+
xb = (cx + bw / 2.0) * sx + x0
|
| 350 |
+
yb = (cy + bh / 2.0) * sy + y0
|
| 351 |
+
dets.append((xa, ya, xb, yb, float(confs[i]), int(cls_ids[i])))
|
| 352 |
+
return dets
|
| 353 |
+
|
| 354 |
+
def _infer_single(self, image_bgr: ndarray) -> list[BoundingBox]:
|
| 355 |
+
"""Two-tile top/bottom SAHI inference.
|
| 356 |
+
|
| 357 |
+
The validator's tiny plates (5-12 px tall on 1408x768 starter
|
| 358 |
+
frames) are below YOLO's stride-8 detection footprint at native
|
| 359 |
+
resolution, so the single-pass letterbox baseline misses most of
|
| 360 |
+
them. This method runs two overlapping tile passes — top half
|
| 361 |
+
``[0, H/2 + 38]`` and bottom half ``[H/2 - 38, H]`` — each
|
| 362 |
+
anisotropically resized to ``(input_h, input_w)`` for ~1.82x
|
| 363 |
+
vertical magnification (1.0x horizontal). Detections are combined
|
| 364 |
+
and merged via Soft-NMS.
|
| 365 |
+
|
| 366 |
+
Measured on the 7 starter frames vs the prior single-pass path:
|
| 367 |
+
recall 0.200 -> 0.433
|
| 368 |
+
precision 0.600 -> 0.765
|
| 369 |
+
F1 0.300 -> 0.553
|
| 370 |
+
wall p95 25 ms -> 33 ms (budget 50 ms)
|
| 371 |
+
|
| 372 |
+
A full-frame pass is deliberately NOT run: every plate the full
|
| 373 |
+
pass detected is also detected by at least one tile (the tiles
|
| 374 |
+
overlap ~38 px past the midline), and adding it pushes p95 to
|
| 375 |
+
~55 ms which violates the latency budget.
|
| 376 |
+
|
| 377 |
+
Known blind spot: image 6 (plate heights 5-7 px) stays at 0/6.
|
| 378 |
+
Those plates need ~2x in BOTH dimensions; 2x2 quadrant tiling
|
| 379 |
+
reaches them (1/6) but runs at ~68 ms p95 which is over budget.
|
| 380 |
+
Closing image 6 is a training-side problem, not an inference-
|
| 381 |
+
path problem, at this model capacity.
|
| 382 |
+
"""
|
| 383 |
+
orig_h, orig_w = image_bgr.shape[:2]
|
| 384 |
+
my = orig_h // 2
|
| 385 |
+
overlap_y = 38 # ~10% of orig_h on each side of the midline
|
| 386 |
+
|
| 387 |
+
top_dets = self._infer_tile(
|
| 388 |
+
image_bgr, 0, 0, orig_w, min(orig_h, my + overlap_y),
|
| 389 |
+
)
|
| 390 |
+
bot_dets = self._infer_tile(
|
| 391 |
+
image_bgr, 0, max(0, my - overlap_y), orig_w, orig_h,
|
| 392 |
+
)
|
| 393 |
+
|
| 394 |
+
dets = self._soft_nms(top_dets + bot_dets)
|
| 395 |
|
| 396 |
out_boxes: list[BoundingBox] = []
|
| 397 |
for x1, y1, x2, y2, conf, cls_id in dets:
|