meaculpitt committed on
Commit
a8c4f6c
·
verified ·
1 Parent(s): fabc2ae

scorevision: push artifact

Browse files
Files changed (1) hide show
  1. miner.py +430 -0
miner.py ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SN44 beverage detection miner — single-element chute for
2
+ manak0/Detect-beverage-detect.
3
+
4
+ Adapted from the auto-generated Manako baseline with three substantive
5
+ changes ported from the production numberplate miner:
6
+
7
+ 1. CUDA library preload at import time so onnxruntime-gpu finds
8
+ libcudnn / libcublas from the nvidia-* pip wheels even when
9
+ LD_LIBRARY_PATH is not set.
10
+ 2. Letterbox preprocessing (aspect-preserving with grey 114 padding)
11
+ instead of anisotropic cv2.resize. Beverage geometry (cylindrical
12
+ bottles/cans/cups) is sensitive to AR distortion.
13
+ 3. Standard NMS replaced with per-class Gaussian Soft-NMS (sigma=0.5).
14
+ Soft-NMS decays scores of overlapping boxes instead of suppressing
15
+ them outright. Per-class so that an overlapping bottle and cup don't
16
+ suppress each other (beverage scenes routinely have mixed objects in
17
+ frame). We use a gentler sigma than the numberplate miner's 0.3
18
+ because beverage scenes typically have fewer near-duplicate
19
+ detections than plate scenes.
20
+
21
+ Plus a GPU warmup pass in __init__ (10 calls on a synthetic frame) to
22
+ force ORT/CUDA/cuDNN kernel compilation before the first real
23
+ validator frame.
24
+
25
+ Soft-NMS is inlined here rather than imported because the chute
26
+ platform sandbox restricts non-stdlib imports beyond the deps declared
27
+ in chute_config.yml.
28
+
29
+ NOT ported from numberplate (intentional):
30
+ - SAHI quad-4 tiling: beverage objects are 50–500 px on validator
31
+ frames, not 5–30 px like plates — tiling is overkill.
32
+ - Horizontal-flip TTA: doubles latency for marginal gain.
33
+ - End2end [1,N,6] shape support: our ONNX export uses raw
34
+ [1, C, anchors] format with NMS done here.
35
+ - Aspect-ratio / max-side output filters: plate-specific (plates
36
+ are wide-flat); beverage geometry is the opposite.
37
+ - Empty-submission guard: plate-specific failure mode.
38
+ """
39
+ import ctypes
40
+ import glob as _glob
41
+ import logging as _logging
42
+ import math
43
+ import os
44
+
45
+ _cuda_log = _logging.getLogger(__name__)
46
+
47
+
48
+ def _preload_cuda_libs() -> None:
49
+ """Pre-load CUDA + cuDNN + cuBLAS shared libs from nvidia-* pip wheels.
50
+
51
+ Without this, onnxruntime-gpu's CUDAExecutionProvider silently falls
52
+ back to CPU because it can't dlopen libcudnn.so.9 — the nvidia
53
+ wheels ship the library inside `nvidia/cudnn/lib/` but do NOT add
54
+ that directory to the loader path. We import the wheel modules to
55
+ locate their lib dirs, prepend them to LD_LIBRARY_PATH for any
56
+ child processes, and ctypes.CDLL the .so files with RTLD_GLOBAL so
57
+ onnxruntime's dlopen sees them.
58
+ """
59
+ try:
60
+ lib_dirs: list[str] = []
61
+ for mod_name in (
62
+ "nvidia.cudnn",
63
+ "nvidia.cublas",
64
+ "nvidia.cuda_runtime",
65
+ "nvidia.cufft",
66
+ "nvidia.curand",
67
+ "nvidia.cusolver",
68
+ "nvidia.cusparse",
69
+ "nvidia.nvjitlink",
70
+ ):
71
+ try:
72
+ mod = __import__(mod_name, fromlist=["__file__"])
73
+ lib_dir = os.path.join(os.path.dirname(mod.__file__), "lib")
74
+ if os.path.isdir(lib_dir) and lib_dir not in lib_dirs:
75
+ lib_dirs.append(lib_dir)
76
+ except ImportError:
77
+ pass
78
+
79
+ if not lib_dirs:
80
+ _cuda_log.warning("no nvidia-* lib dirs found; ORT GPU may fall back to CPU")
81
+ return
82
+
83
+ existing = os.environ.get("LD_LIBRARY_PATH", "")
84
+ os.environ["LD_LIBRARY_PATH"] = ":".join(
85
+ lib_dirs + ([existing] if existing else [])
86
+ )
87
+
88
+ for lib_dir in lib_dirs:
89
+ for so in sorted(_glob.glob(os.path.join(lib_dir, "lib*.so*"))):
90
+ try:
91
+ ctypes.CDLL(so, mode=ctypes.RTLD_GLOBAL)
92
+ except OSError:
93
+ pass
94
+ except Exception as e: # pragma: no cover - best effort
95
+ _cuda_log.warning("CUDA preload failed: %s", e)
96
+
97
+
98
+ _preload_cuda_libs()
99
+
100
+
101
+ from pathlib import Path
102
+
103
+ import cv2
104
+ import numpy as np
105
+ import onnxruntime as ort
106
+ from numpy import ndarray
107
+ from pydantic import BaseModel
108
+
109
+
110
class BoundingBox(BaseModel):
    """One detected object: an axis-aligned box plus class id and confidence."""

    # Left / top / right / bottom edges. Miner._infer_single fills these with
    # integer pixel coordinates clamped to the original image bounds.
    x1: int
    y1: int
    x2: int
    y2: int
    # Index of the highest-scoring class column in the model output.
    cls_id: int
    # Detection score; Miner._infer_single clamps it to [0.0, 1.0].
    conf: float
117
+
118
+
119
class TVFrameResult(BaseModel):
    """Per-frame result returned by ``Miner.predict_batch``."""

    # ``offset + index`` of the frame within the batch passed to predict_batch.
    frame_id: int
    # Detections for this frame.
    boxes: list[BoundingBox]
    # (x, y) pairs; predict_batch fills this with ``n_keypoints`` (0, 0) entries.
    keypoints: list[tuple[int, int]]
123
+
124
+
125
class Miner:
    """Single-element ONNX miner for the manak0/Detect-beverage-detect
    element. Auto-loaded by the chute platform; the platform passes the
    snapshot path of the HF repo containing weights.onnx as
    ``path_hf_repo`` and calls ``predict_batch(batch_images, offset,
    n_keypoints)`` for each request.
    """

    # One detection: (x1, y1, x2, y2, score, cls_id), xyxy in original-image pixels.
    _Det = tuple[float, float, float, float, float, int]

    def __init__(self, path_hf_repo) -> None:
        """Load ``weights.onnx`` from ``path_hf_repo``, set thresholds, warm up.

        Args:
            path_hf_repo: Directory containing ``weights.onnx``.
        """
        log = _logging.getLogger(__name__)

        self.path_hf_repo = Path(path_hf_repo)
        self.class_names = ["bottle", "can", "cup"]
        self.session = ort.InferenceSession(
            str(self.path_hf_repo / "weights.onnx"),
            providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
        )
        self.input_name = self.session.get_inputs()[0].name

        # Make a CPU fallback visible instead of silently running slow.
        active_providers = self.session.get_providers()
        if "CUDAExecutionProvider" not in active_providers:
            log.warning(
                "CUDAExecutionProvider unavailable; running on %s", active_providers
            )

        # Hard-pin to 960x960 — this is the resolution we trained at and
        # exported the ONNX with. Single-resolution preprocessing keeps the
        # pipeline simple and matches what we've validated. The ONNX itself
        # was exported with dynamic axes so it accepts other shapes too,
        # but there's no reason to deviate from training resolution.
        self.input_h = 960
        self.input_w = 960

        # Pre-NMS confidence threshold. Low floor so Soft-NMS has plenty of
        # candidates to score-decay; final filtering happens via
        # score_threshold below.
        self.conf_threshold = 0.15
        # Gaussian Soft-NMS sigma. 0.5 is the textbook default — gentler
        # than numberplate's 0.3 because beverage scenes are less crowded.
        self.soft_nms_sigma = 0.5
        # Final score floor after Soft-NMS decay.
        self.score_threshold = 0.01

        # GPU warmup — force ORT/CUDA/cuDNN kernel compilation before the
        # first real validator frame. Best effort: a failure is logged and
        # warmup stops, but construction still succeeds.
        warmup_frame = np.zeros((self.input_h, self.input_w, 3), dtype=np.uint8)
        for _ in range(10):
            try:
                self._infer_single(warmup_frame)
            except Exception as exc:  # pragma: no cover - best effort
                log.warning("GPU warmup aborted: %s", exc)
                break

    def __repr__(self) -> str:
        return (
            f"BeverageMiner session={type(self.session).__name__} "
            f"input={self.input_h}x{self.input_w} classes={len(self.class_names)}"
        )

    # ---------------------------------------------------------------- preproc
    def _preprocess(
        self, image_bgr: ndarray
    ) -> tuple[ndarray, tuple[int, int, float, int, int]]:
        """Letterbox the BGR image to (input_h, input_w), preserving aspect.

        Returns the float32 NCHW tensor plus the metadata needed to undo
        the letterbox during decode: (orig_h, orig_w, scale, dx, dy).

        Raises:
            ValueError: If the image has a zero-sized height or width.
        """
        h, w = image_bgr.shape[:2]
        if h <= 0 or w <= 0:
            raise ValueError(f"Cannot preprocess empty image with shape {image_bgr.shape}")
        scale = min(self.input_h / h, self.input_w / w)
        # Clamp to at least 1 px: an extreme aspect ratio can round the short
        # side down to 0, which cv2.resize rejects.
        nh = min(self.input_h, max(1, int(round(h * scale))))
        nw = min(self.input_w, max(1, int(round(w * scale))))
        resized = cv2.resize(image_bgr, (nw, nh))
        canvas = np.full((self.input_h, self.input_w, 3), 114, dtype=np.uint8)
        dy = (self.input_h - nh) // 2
        dx = (self.input_w - nw) // 2
        canvas[dy:dy + nh, dx:dx + nw] = resized
        rgb = cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB)
        x = rgb.astype(np.float32) / 255.0
        x = np.transpose(x, (2, 0, 1))[None, ...]
        return x, (h, w, scale, dx, dy)

    # ---------------------------------------------------------------- decode
    def _normalize_predictions(self, raw: np.ndarray) -> np.ndarray:
        """Return the first batch item as an ``[N, C]`` array.

        Handles both common ultralytics export shapes ([1,C,N] and [1,N,C]).
        The channel axis is identified by its expected size
        ``4 + len(self.class_names)`` when exactly one axis matches;
        otherwise the shorter axis is assumed to be the channel axis.

        Raises:
            ValueError: If the first batch item is not 2-dimensional.
        """
        pred = raw[0]
        if pred.ndim != 2:
            raise ValueError(f"Unexpected prediction shape: {raw.shape}")
        rows, cols = pred.shape
        expected_c = 4 + len(self.class_names)
        if cols == expected_c and rows != expected_c:
            return pred  # already [N, C], even when N < C
        if rows == expected_c and cols != expected_c:
            return pred.transpose(1, 0)
        # Channel count does not disambiguate; fall back to the size heuristic.
        if rows < cols:
            pred = pred.transpose(1, 0)
        return pred

    # ---------------------------------------------------------------- cluster dedup
    def _cluster_dedup(
        self,
        dets: list[_Det],
        iou_thresh: float = 0.5,
    ) -> list[_Det]:
        """Per-class greedy near-duplicate suppression.

        For any pair of same-class detections with IoU >= ``iou_thresh``,
        keep only the higher-confidence one. Runs BEFORE Soft-NMS to kill
        nearly-identical raw detections that Soft-NMS's gentle decay
        leaves above ``score_threshold`` (at sigma=0.5 and IoU≈1.0, a 0.94
        detection decays to only 0.13 — still above the 0.01 floor).

        Per-class (not class-agnostic) so an overlapping bottle/cup pair
        survives intact, consistent with the per-class Soft-NMS choice.

        Args:
            dets: Detections as (x1, y1, x2, y2, score, cls_id) tuples.
            iou_thresh: Same-class pairs at or above this IoU are merged.

        Returns:
            Surviving detections in descending score order.
        """
        if not dets:
            return []
        ranked = sorted(dets, key=lambda d: -d[4])
        suppressed = [False] * len(ranked)
        kept: list = []
        for i, (x1i, y1i, x2i, y2i, _score_i, cls_i) in enumerate(ranked):
            if suppressed[i]:
                continue
            kept.append(ranked[i])
            area_i = max(0.0, x2i - x1i) * max(0.0, y2i - y1i)
            for j in range(i + 1, len(ranked)):
                if suppressed[j]:
                    continue
                x1j, y1j, x2j, y2j, _score_j, cls_j = ranked[j]
                if cls_j != cls_i:  # per-class only
                    continue
                inter_w = max(0.0, min(x2i, x2j) - max(x1i, x1j))
                inter_h = max(0.0, min(y2i, y2j) - max(y1i, y1j))
                inter = inter_w * inter_h
                area_j = max(0.0, x2j - x1j) * max(0.0, y2j - y1j)
                union = area_i + area_j - inter
                if union > 0 and inter / union >= iou_thresh:
                    suppressed[j] = True
        return kept

    # ---------------------------------------------------------------- soft NMS
    def _soft_nms(self, dets: list[_Det]) -> list[_Det]:
        """Per-class Gaussian Soft-NMS.

        Partitions detections by class id, runs the Gaussian decay
        independently within each class, then merges and sorts by score
        descending. A high-confidence can detection therefore won't
        suppress an overlapping bottle detection.
        """
        if not dets:
            return []
        by_class: dict = {}
        for det in dets:
            by_class.setdefault(int(det[5]), []).append(det)
        combined: list = []
        for class_dets in by_class.values():
            combined.extend(self._soft_nms_per_class_pool(class_dets))
        combined.sort(key=lambda d: -d[4])
        return combined

    def _soft_nms_per_class_pool(self, dets: list[_Det]) -> list[_Det]:
        """Gaussian Soft-NMS over a pool of same-class detections.

        Repeatedly picks the highest-scoring remaining box, then decays
        every other remaining box's score by ``exp(-iou^2 / sigma)``
        against it. Boxes whose score falls below ``self.score_threshold``
        are never picked. Returns picked detections in pick order, i.e.
        descending decayed score.
        """
        if not dets:
            return []

        boxes = np.asarray([d[:4] for d in dets], dtype=np.float32)
        scores = np.asarray([d[4] for d in dets], dtype=np.float32)
        cls_ids = [int(d[5]) for d in dets]

        # Box areas never change, so compute them once (negative extents -> 0).
        wh = np.clip(boxes[:, 2:] - boxes[:, :2], 0.0, None)
        areas = wh[:, 0] * wh[:, 1]

        active = np.ones(len(dets), dtype=bool)
        picked: list[tuple[int, float]] = []

        while True:
            candidates = np.flatnonzero(active & (scores >= self.score_threshold))
            if candidates.size == 0:
                break
            best = int(candidates[int(np.argmax(scores[candidates]))])
            picked.append((best, float(scores[best])))
            active[best] = False

            others = np.flatnonzero(active)
            if others.size == 0:
                break
            top_left = np.maximum(boxes[best, :2], boxes[others, :2])
            bottom_right = np.minimum(boxes[best, 2:], boxes[others, 2:])
            inter_wh = np.clip(bottom_right - top_left, 0.0, None)
            inter = inter_wh[:, 0] * inter_wh[:, 1]
            union = areas[best] + areas[others] - inter
            # Guarded divide: degenerate (zero-area) pairs get IoU 0 without
            # evaluating 0/0, which would emit a RuntimeWarning.
            iou = np.divide(inter, union, out=np.zeros_like(inter), where=union > 0.0)

            scores[others] = scores[others] * np.exp(-(iou * iou) / self.soft_nms_sigma)

        return [
            (
                float(boxes[i, 0]),
                float(boxes[i, 1]),
                float(boxes[i, 2]),
                float(boxes[i, 3]),
                score,
                cls_ids[i],
            )
            for i, score in picked
        ]

    # ---------------------------------------------------------------- inference
    def _infer_single(self, image_bgr: ndarray) -> "list[BoundingBox]":
        """Letterbox preprocess -> ONNX -> unletterbox -> dedup -> per-class Soft-NMS.

        Returns the surviving detections as ``BoundingBox`` objects in
        original-image pixel coordinates, clamped to the image bounds.
        Boxes that collapse to zero width/height after clamping are dropped.
        """
        inp, (orig_h, orig_w, scale, dx, dy) = self._preprocess(image_bgr)
        out = self.session.run(None, {self.input_name: inp})[0]
        pred = self._normalize_predictions(out)

        # Need 4 box columns plus at least one class-score column.
        if pred.shape[1] < 5:
            return []

        cls_scores = pred[:, 4:]
        cls_ids = np.argmax(cls_scores, axis=1)
        confs = np.max(cls_scores, axis=1)
        # Drop low-confidence rows and rows with NaN/inf coordinates; the
        # latter would make math.floor/ceil raise further down.
        keep = (confs >= self.conf_threshold) & np.isfinite(pred[:, :4]).all(axis=1)
        if not keep.any():
            return []

        # Decode model-space cx,cy,w,h -> letterbox-space xyxy -> original xyxy
        # via inverse letterbox: (model - pad) / scale. Done in float64 so the
        # results match plain Python float arithmetic.
        cxcywh = pred[keep, :4].astype(np.float64)
        half_wh = cxcywh[:, 2:] / 2.0
        xyxy = np.concatenate(
            [cxcywh[:, :2] - half_wh, cxcywh[:, :2] + half_wh], axis=1
        )
        xyxy[:, [0, 2]] = (xyxy[:, [0, 2]] - dx) / scale
        xyxy[:, [1, 3]] = (xyxy[:, [1, 3]] - dy) / scale
        dets = [
            (x1, y1, x2, y2, float(conf), int(cls_id))
            for (x1, y1, x2, y2), conf, cls_id in zip(
                xyxy.tolist(), confs[keep], cls_ids[keep]
            )
        ]

        # Pre-NMS dedup: kill same-class near-duplicates (IoU >= 0.5) that
        # would otherwise survive Soft-NMS's gentle decay above the score floor.
        dets = self._cluster_dedup(dets, iou_thresh=0.5)
        dets = self._soft_nms(dets)

        out_boxes: list[BoundingBox] = []
        for x1, y1, x2, y2, conf, cls_id in dets:
            # Round outwards, then clamp to the original image rectangle.
            ix1 = max(0, min(orig_w, math.floor(x1)))
            iy1 = max(0, min(orig_h, math.floor(y1)))
            ix2 = max(0, min(orig_w, math.ceil(x2)))
            iy2 = max(0, min(orig_h, math.ceil(y2)))
            if ix2 <= ix1 or iy2 <= iy1:
                continue
            out_boxes.append(
                BoundingBox(
                    x1=ix1,
                    y1=iy1,
                    x2=ix2,
                    y2=iy2,
                    cls_id=cls_id,
                    conf=max(0.0, min(1.0, conf)),
                )
            )
        return out_boxes

    # ---------------------------------------------------------------- entry
    def predict_batch(
        self,
        batch_images: list[ndarray],
        offset: int,
        n_keypoints: int,
    ) -> "list[TVFrameResult]":
        """Run detection on every image of the batch.

        Args:
            batch_images: BGR frames, processed one at a time.
            offset: Frame id of the first image; image ``i`` gets ``offset + i``.
            n_keypoints: Number of placeholder ``(0, 0)`` keypoints per frame
                (negative values are treated as 0).

        Returns:
            One ``TVFrameResult`` per input image, in input order.
        """
        results: list[TVFrameResult] = []
        for idx, image in enumerate(batch_images):
            boxes = self._infer_single(image)
            keypoints = [(0, 0) for _ in range(max(0, int(n_keypoints)))]
            results.append(
                TVFrameResult(
                    frame_id=offset + idx,
                    boxes=boxes,
                    keypoints=keypoints,
                )
            )
        return results