coolroman committed on
Commit
0e59c1f
·
verified ·
1 Parent(s): d173447

scorevision: push artifact

Browse files
Files changed (1) hide show
  1. miner.py +288 -636
miner.py CHANGED
@@ -1,636 +1,288 @@
1
- # Person detection miner for TurboVision element manak0/Detect-Person.
2
- # Seeded from the current champion alfred8995/york004 (rev 79eec28…) as a
3
- # well-proven scoring-aware pipeline. Differentiation is expected to come from:
4
- # 1. A stronger base detector (YOLO26-s / YOLO11-m vs champion's YOLOv11-nano)
5
- # 2. Better training data
6
- # 3. Post-training threshold sweep on our held-out synthetic set
7
- # Inference contract (inputs/outputs + weights.onnx filename) is locked to the
8
- # turbovision chute template — do not rename without updating chute_config.yml.
9
- from pathlib import Path
10
- import math
11
-
12
- import cv2
13
- import numpy as np
14
- import onnxruntime as ort
15
- from numpy import ndarray
16
- from pydantic import BaseModel
17
-
18
-
19
- class BoundingBox(BaseModel):
20
- x1: int
21
- y1: int
22
- x2: int
23
- y2: int
24
- cls_id: int
25
- conf: float
26
-
27
-
28
- class TVFrameResult(BaseModel):
29
- frame_id: int
30
- boxes: list[BoundingBox]
31
- keypoints: list[tuple[int, int]]
32
-
33
-
34
- class Miner:
35
- def __init__(self, path_hf_repo: Path) -> None:
36
- model_path = path_hf_repo / "weights.onnx"
37
- self.class_names = ["person"]
38
- print("ORT version:", ort.__version__)
39
-
40
- try:
41
- ort.preload_dlls()
42
- print("✅ onnxruntime.preload_dlls() success")
43
- except Exception as e:
44
- print(f"⚠️ preload_dlls failed: {e}")
45
-
46
- print("ORT available providers BEFORE session:", ort.get_available_providers())
47
-
48
- sess_options = ort.SessionOptions()
49
- sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
50
-
51
- try:
52
- self.session = ort.InferenceSession(
53
- str(model_path),
54
- sess_options=sess_options,
55
- providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
56
- )
57
- print("✅ Created ORT session with preferred CUDA provider list")
58
- except Exception as e:
59
- print(f"⚠️ CUDA session creation failed, falling back to CPU: {e}")
60
- self.session = ort.InferenceSession(
61
- str(model_path),
62
- sess_options=sess_options,
63
- providers=["CPUExecutionProvider"],
64
- )
65
-
66
- print("ORT session providers:", self.session.get_providers())
67
-
68
- for inp in self.session.get_inputs():
69
- print("INPUT:", inp.name, inp.shape, inp.type)
70
-
71
- for out in self.session.get_outputs():
72
- print("OUTPUT:", out.name, out.shape, out.type)
73
-
74
- self.input_name = self.session.get_inputs()[0].name
75
- self.output_names = [output.name for output in self.session.get_outputs()]
76
- self.input_shape = self.session.get_inputs()[0].shape
77
-
78
- self.input_height = self._safe_dim(self.input_shape[2], default=1280)
79
- self.input_width = self._safe_dim(self.input_shape[3], default=1280)
80
-
81
- # --- Scoring-aware adaptive confidence ---
82
- # total_score = mAP50 * 0.65 + FP_score * 0.35
83
- # FP_score = max(0, 1 - n_FP / n_images), typically n_images ≈ 10
84
- #
85
- # v13: yolo26-s trained on 328 TV (276 cached + 107 fresh-scraped) ×30
86
- # + 14k aux CCTV. Reusing v12s swept thresholds (same arch).
87
- # Bench: comp 0.940 on validator-distribution, +0.139 over york004.
88
- self.conf_thres = 0.2149 # Base threshold for candidate generation
89
- self.iou_thres = 0.4704 # NMS threshold
90
- self.max_det = 150
91
-
92
- # TTA consensus thresholds
93
- self.conf_high = 0.7443 # Boxes above this survive without TTA confirmation
94
- self.tta_match_iou = 0.4447 # TTA cross-view match IoU
95
-
96
- # Adaptive conf curve: lerp between low/high based on raw detection count
97
- self.conf_adapt_low = 0.1467 # Few objects: favor recall
98
- self.conf_adapt_high = 0.6997 # Many objects: favor precision
99
- self.count_low = 9 # Raw count below this → use conf_adapt_low
100
- self.count_high = 46 # Raw count above this use conf_adapt_high
101
-
102
- self.use_tta = True
103
-
104
- # Box sanity filters
105
- self.min_box_area = 14 * 14
106
- self.min_w = 8
107
- self.min_h = 8
108
- self.max_aspect_ratio = 6.5
109
- self.max_box_area_ratio = 0.8
110
-
111
- print(f"✅ ONNX model loaded from: {model_path}")
112
- print(f" ONNX providers: {self.session.get_providers()}")
113
- print(f"✅ ONNX input: name={self.input_name}, shape={self.input_shape}")
114
-
115
- def __repr__(self) -> str:
116
- return (
117
- f"ONNXRuntime(session={type(self.session).__name__}, "
118
- f"providers={self.session.get_providers()})"
119
- )
120
-
121
- @staticmethod
122
- def _safe_dim(value, default: int) -> int:
123
- return value if isinstance(value, int) and value > 0 else default
124
-
125
- def _letterbox(
126
- self,
127
- image: ndarray,
128
- new_shape: tuple[int, int],
129
- color=(114, 114, 114),
130
- ) -> tuple[ndarray, float, tuple[float, float]]:
131
- h, w = image.shape[:2]
132
- new_w, new_h = new_shape
133
-
134
- ratio = min(new_w / w, new_h / h)
135
- resized_w = int(round(w * ratio))
136
- resized_h = int(round(h * ratio))
137
-
138
- if (resized_w, resized_h) != (w, h):
139
- interp = cv2.INTER_CUBIC if ratio > 1.0 else cv2.INTER_LINEAR
140
- image = cv2.resize(image, (resized_w, resized_h), interpolation=interp)
141
-
142
- dw = new_w - resized_w
143
- dh = new_h - resized_h
144
- dw /= 2.0
145
- dh /= 2.0
146
-
147
- left = int(round(dw - 0.1))
148
- right = int(round(dw + 0.1))
149
- top = int(round(dh - 0.1))
150
- bottom = int(round(dh + 0.1))
151
-
152
- padded = cv2.copyMakeBorder(
153
- image,
154
- top,
155
- bottom,
156
- left,
157
- right,
158
- borderType=cv2.BORDER_CONSTANT,
159
- value=color,
160
- )
161
- return padded, ratio, (dw, dh)
162
-
163
- def _preprocess(
164
- self, image: ndarray
165
- ) -> tuple[np.ndarray, float, tuple[float, float], tuple[int, int]]:
166
- orig_h, orig_w = image.shape[:2]
167
-
168
- img, ratio, pad = self._letterbox(
169
- image, (self.input_width, self.input_height)
170
- )
171
- img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
172
- img = img.astype(np.float32) / 255.0
173
- img = np.transpose(img, (2, 0, 1))[None, ...]
174
- img = np.ascontiguousarray(img, dtype=np.float32)
175
-
176
- return img, ratio, pad, (orig_w, orig_h)
177
-
178
- @staticmethod
179
- def _clip_boxes(boxes: np.ndarray, image_size: tuple[int, int]) -> np.ndarray:
180
- w, h = image_size
181
- boxes[:, 0] = np.clip(boxes[:, 0], 0, w - 1)
182
- boxes[:, 1] = np.clip(boxes[:, 1], 0, h - 1)
183
- boxes[:, 2] = np.clip(boxes[:, 2], 0, w - 1)
184
- boxes[:, 3] = np.clip(boxes[:, 3], 0, h - 1)
185
- return boxes
186
-
187
- @staticmethod
188
- def _xywh_to_xyxy(boxes: np.ndarray) -> np.ndarray:
189
- out = np.empty_like(boxes)
190
- out[:, 0] = boxes[:, 0] - boxes[:, 2] / 2.0
191
- out[:, 1] = boxes[:, 1] - boxes[:, 3] / 2.0
192
- out[:, 2] = boxes[:, 0] + boxes[:, 2] / 2.0
193
- out[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0
194
- return out
195
-
196
- @staticmethod
197
- def _hard_nms(
198
- boxes: np.ndarray,
199
- scores: np.ndarray,
200
- iou_thresh: float,
201
- ) -> np.ndarray:
202
- if len(boxes) == 0:
203
- return np.array([], dtype=np.intp)
204
-
205
- boxes = np.asarray(boxes, dtype=np.float32)
206
- scores = np.asarray(scores, dtype=np.float32)
207
- order = np.argsort(scores)[::-1]
208
- keep = []
209
-
210
- while len(order) > 0:
211
- i = order[0]
212
- keep.append(i)
213
- if len(order) == 1:
214
- break
215
-
216
- rest = order[1:]
217
-
218
- xx1 = np.maximum(boxes[i, 0], boxes[rest, 0])
219
- yy1 = np.maximum(boxes[i, 1], boxes[rest, 1])
220
- xx2 = np.minimum(boxes[i, 2], boxes[rest, 2])
221
- yy2 = np.minimum(boxes[i, 3], boxes[rest, 3])
222
-
223
- inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
224
-
225
- area_i = np.maximum(0.0, (boxes[i, 2] - boxes[i, 0])) * np.maximum(0.0, (boxes[i, 3] - boxes[i, 1]))
226
- area_r = np.maximum(0.0, (boxes[rest, 2] - boxes[rest, 0])) * np.maximum(0.0, (boxes[rest, 3] - boxes[rest, 1]))
227
-
228
- iou = inter / (area_i + area_r - inter + 1e-7)
229
- order = rest[iou <= iou_thresh]
230
-
231
- return np.array(keep, dtype=np.intp)
232
-
233
- @staticmethod
234
- def _box_iou_one_to_many(box: np.ndarray, boxes: np.ndarray) -> np.ndarray:
235
- xx1 = np.maximum(box[0], boxes[:, 0])
236
- yy1 = np.maximum(box[1], boxes[:, 1])
237
- xx2 = np.minimum(box[2], boxes[:, 2])
238
- yy2 = np.minimum(box[3], boxes[:, 3])
239
-
240
- inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
241
-
242
- area_a = max(0.0, (box[2] - box[0]) * (box[3] - box[1]))
243
- area_b = np.maximum(0.0, boxes[:, 2] - boxes[:, 0]) * np.maximum(0.0, boxes[:, 3] - boxes[:, 1])
244
-
245
- return inter / (area_a + area_b - inter + 1e-7)
246
-
247
- def _filter_sane_boxes(
248
- self,
249
- boxes: np.ndarray,
250
- scores: np.ndarray,
251
- cls_ids: np.ndarray,
252
- orig_size: tuple[int, int],
253
- ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
254
- if len(boxes) == 0:
255
- return boxes, scores, cls_ids
256
-
257
- orig_w, orig_h = orig_size
258
- image_area = float(orig_w * orig_h)
259
-
260
- keep = []
261
- for i, box in enumerate(boxes):
262
- x1, y1, x2, y2 = box.tolist()
263
- bw = x2 - x1
264
- bh = y2 - y1
265
-
266
- if bw <= 0 or bh <= 0:
267
- continue
268
- if bw < self.min_w or bh < self.min_h:
269
- continue
270
-
271
- area = bw * bh
272
- if area < self.min_box_area:
273
- continue
274
- if area > self.max_box_area_ratio * image_area:
275
- continue
276
-
277
- ar = max(bw / max(bh, 1e-6), bh / max(bw, 1e-6))
278
- if ar > self.max_aspect_ratio:
279
- continue
280
-
281
- keep.append(i)
282
-
283
- if not keep:
284
- return (
285
- np.empty((0, 4), dtype=np.float32),
286
- np.empty((0,), dtype=np.float32),
287
- np.empty((0,), dtype=np.int32),
288
- )
289
-
290
- keep = np.array(keep, dtype=np.intp)
291
- return boxes[keep], scores[keep], cls_ids[keep]
292
-
293
- def _decode_final_dets(
294
- self,
295
- preds: np.ndarray,
296
- ratio: float,
297
- pad: tuple[float, float],
298
- orig_size: tuple[int, int],
299
- ) -> list[BoundingBox]:
300
- if preds.ndim == 3 and preds.shape[0] == 1:
301
- preds = preds[0]
302
-
303
- if preds.ndim != 2 or preds.shape[1] < 6:
304
- raise ValueError(f"Unexpected ONNX final-det output shape: {preds.shape}")
305
-
306
- boxes = preds[:, :4].astype(np.float32)
307
- scores = preds[:, 4].astype(np.float32)
308
- cls_ids = preds[:, 5].astype(np.int32)
309
-
310
- # person only
311
- keep = cls_ids == 0
312
- boxes = boxes[keep]
313
- scores = scores[keep]
314
- cls_ids = cls_ids[keep]
315
-
316
- # candidate threshold
317
- keep = scores >= self.conf_thres
318
- boxes = boxes[keep]
319
- scores = scores[keep]
320
- cls_ids = cls_ids[keep]
321
-
322
- if len(boxes) == 0:
323
- return []
324
-
325
- pad_w, pad_h = pad
326
- orig_w, orig_h = orig_size
327
-
328
- boxes[:, [0, 2]] -= pad_w
329
- boxes[:, [1, 3]] -= pad_h
330
- boxes /= ratio
331
- boxes = self._clip_boxes(boxes, (orig_w, orig_h))
332
-
333
- boxes, scores, cls_ids = self._filter_sane_boxes(boxes, scores, cls_ids, orig_size)
334
- if len(boxes) == 0:
335
- return []
336
-
337
- keep_idx = self._hard_nms(boxes, scores, self.iou_thres)
338
- keep_idx = keep_idx[: self.max_det]
339
-
340
- boxes = boxes[keep_idx]
341
- scores = scores[keep_idx]
342
- cls_ids = cls_ids[keep_idx]
343
-
344
- return [
345
- BoundingBox(
346
- x1=int(math.floor(box[0])),
347
- y1=int(math.floor(box[1])),
348
- x2=int(math.ceil(box[2])),
349
- y2=int(math.ceil(box[3])),
350
- cls_id=int(cls_id),
351
- conf=float(conf),
352
- )
353
- for box, conf, cls_id in zip(boxes, scores, cls_ids)
354
- if box[2] > box[0] and box[3] > box[1]
355
- ]
356
-
357
- def _decode_raw_yolo(
358
- self,
359
- preds: np.ndarray,
360
- ratio: float,
361
- pad: tuple[float, float],
362
- orig_size: tuple[int, int],
363
- ) -> list[BoundingBox]:
364
- if preds.ndim != 3:
365
- raise ValueError(f"Unexpected raw ONNX output shape: {preds.shape}")
366
- if preds.shape[0] != 1:
367
- raise ValueError(f"Unexpected batch dimension in raw output: {preds.shape}")
368
-
369
- preds = preds[0]
370
-
371
- # Normalize to [N, C]
372
- if preds.shape[0] <= 16 and preds.shape[1] > preds.shape[0]:
373
- preds = preds.T
374
-
375
- if preds.ndim != 2 or preds.shape[1] < 5:
376
- raise ValueError(f"Unexpected normalized raw output shape: {preds.shape}")
377
-
378
- boxes_xywh = preds[:, :4].astype(np.float32)
379
- tail = preds[:, 4:].astype(np.float32)
380
-
381
- # Supports:
382
- # [x,y,w,h,score] single-class
383
- # [x,y,w,h,obj,cls] YOLO standard single-class
384
- # [x,y,w,h,obj,cls1,cls2,...] multi-class
385
- if tail.shape[1] == 1:
386
- scores = tail[:, 0]
387
- cls_ids = np.zeros(len(scores), dtype=np.int32)
388
- elif tail.shape[1] == 2:
389
- obj = tail[:, 0]
390
- cls_prob = tail[:, 1]
391
- scores = obj * cls_prob
392
- cls_ids = np.zeros(len(scores), dtype=np.int32)
393
- else:
394
- obj = tail[:, 0]
395
- class_probs = tail[:, 1:]
396
- cls_ids = np.argmax(class_probs, axis=1).astype(np.int32)
397
- cls_scores = class_probs[np.arange(len(class_probs)), cls_ids]
398
- scores = obj * cls_scores
399
-
400
- keep = cls_ids == 0
401
- boxes_xywh = boxes_xywh[keep]
402
- scores = scores[keep]
403
- cls_ids = cls_ids[keep]
404
-
405
- keep = scores >= self.conf_thres
406
- boxes_xywh = boxes_xywh[keep]
407
- scores = scores[keep]
408
- cls_ids = cls_ids[keep]
409
-
410
- if len(boxes_xywh) == 0:
411
- return []
412
-
413
- boxes = self._xywh_to_xyxy(boxes_xywh)
414
-
415
- pad_w, pad_h = pad
416
- orig_w, orig_h = orig_size
417
-
418
- boxes[:, [0, 2]] -= pad_w
419
- boxes[:, [1, 3]] -= pad_h
420
- boxes /= ratio
421
- boxes = self._clip_boxes(boxes, (orig_w, orig_h))
422
-
423
- boxes, scores, cls_ids = self._filter_sane_boxes(boxes, scores, cls_ids, orig_size)
424
- if len(boxes) == 0:
425
- return []
426
-
427
- keep_idx = self._hard_nms(boxes, scores, self.iou_thres)
428
- keep_idx = keep_idx[: self.max_det]
429
-
430
- boxes = boxes[keep_idx]
431
- scores = scores[keep_idx]
432
- cls_ids = cls_ids[keep_idx]
433
-
434
- return [
435
- BoundingBox(
436
- x1=int(math.floor(box[0])),
437
- y1=int(math.floor(box[1])),
438
- x2=int(math.ceil(box[2])),
439
- y2=int(math.ceil(box[3])),
440
- cls_id=int(cls_id),
441
- conf=float(conf),
442
- )
443
- for box, conf, cls_id in zip(boxes, scores, cls_ids)
444
- if box[2] > box[0] and box[3] > box[1]
445
- ]
446
-
447
- def _postprocess(
448
- self,
449
- output: np.ndarray,
450
- ratio: float,
451
- pad: tuple[float, float],
452
- orig_size: tuple[int, int],
453
- ) -> list[BoundingBox]:
454
- if output.ndim == 2 and output.shape[1] >= 6:
455
- return self._decode_final_dets(output, ratio, pad, orig_size)
456
-
457
- if output.ndim == 3 and output.shape[0] == 1 and output.shape[2] >= 6:
458
- return self._decode_final_dets(output, ratio, pad, orig_size)
459
-
460
- return self._decode_raw_yolo(output, ratio, pad, orig_size)
461
-
462
- def _predict_single(self, image: np.ndarray) -> list[BoundingBox]:
463
- if image is None:
464
- raise ValueError("Input image is None")
465
- if not isinstance(image, np.ndarray):
466
- raise TypeError(f"Input is not numpy array: {type(image)}")
467
- if image.ndim != 3:
468
- raise ValueError(f"Expected HWC image, got shape={image.shape}")
469
- if image.shape[0] <= 0 or image.shape[1] <= 0:
470
- raise ValueError(f"Invalid image shape={image.shape}")
471
- if image.shape[2] != 3:
472
- raise ValueError(f"Expected 3 channels, got shape={image.shape}")
473
-
474
- if image.dtype != np.uint8:
475
- image = image.astype(np.uint8)
476
-
477
- input_tensor, ratio, pad, orig_size = self._preprocess(image)
478
-
479
- expected_shape = (1, 3, self.input_height, self.input_width)
480
- if input_tensor.shape != expected_shape:
481
- raise ValueError(
482
- f"Bad input tensor shape={input_tensor.shape}, expected={expected_shape}"
483
- )
484
-
485
- outputs = self.session.run(self.output_names, {self.input_name: input_tensor})
486
- det_output = outputs[0]
487
- return self._postprocess(det_output, ratio, pad, orig_size)
488
-
489
- def _merge_tta_consensus(
490
- self,
491
- boxes_orig: list[BoundingBox],
492
- boxes_flip: list[BoundingBox],
493
- ) -> list[BoundingBox]:
494
- """
495
- Keep:
496
- - any box with conf >= conf_high
497
- - low/medium-conf boxes only if confirmed across TTA views
498
- Then run final hard NMS.
499
- """
500
- if not boxes_orig and not boxes_flip:
501
- return []
502
-
503
- coords_o = np.array([[b.x1, b.y1, b.x2, b.y2] for b in boxes_orig], dtype=np.float32) if boxes_orig else np.empty((0, 4), dtype=np.float32)
504
- scores_o = np.array([b.conf for b in boxes_orig], dtype=np.float32) if boxes_orig else np.empty((0,), dtype=np.float32)
505
-
506
- coords_f = np.array([[b.x1, b.y1, b.x2, b.y2] for b in boxes_flip], dtype=np.float32) if boxes_flip else np.empty((0, 4), dtype=np.float32)
507
- scores_f = np.array([b.conf for b in boxes_flip], dtype=np.float32) if boxes_flip else np.empty((0,), dtype=np.float32)
508
-
509
- accepted_boxes = []
510
- accepted_scores = []
511
-
512
- # Original view candidates
513
- for i in range(len(coords_o)):
514
- score = scores_o[i]
515
- if score >= self.conf_high:
516
- accepted_boxes.append(coords_o[i])
517
- accepted_scores.append(score)
518
- elif len(coords_f) > 0:
519
- ious = self._box_iou_one_to_many(coords_o[i], coords_f)
520
- j = int(np.argmax(ious))
521
- if ious[j] >= self.tta_match_iou:
522
- fused_score = max(score, scores_f[j])
523
- accepted_boxes.append(coords_o[i])
524
- accepted_scores.append(fused_score)
525
-
526
- # Flipped-view high-confidence boxes that original missed
527
- for i in range(len(coords_f)):
528
- score = scores_f[i]
529
- if score < self.conf_high:
530
- continue
531
-
532
- if len(coords_o) == 0:
533
- accepted_boxes.append(coords_f[i])
534
- accepted_scores.append(score)
535
- continue
536
-
537
- ious = self._box_iou_one_to_many(coords_f[i], coords_o)
538
- if np.max(ious) < self.tta_match_iou:
539
- accepted_boxes.append(coords_f[i])
540
- accepted_scores.append(score)
541
-
542
- if not accepted_boxes:
543
- return []
544
-
545
- boxes = np.array(accepted_boxes, dtype=np.float32)
546
- scores = np.array(accepted_scores, dtype=np.float32)
547
-
548
- keep = self._hard_nms(boxes, scores, self.iou_thres)
549
- keep = keep[: self.max_det]
550
-
551
- out = []
552
- for idx in keep:
553
- x1, y1, x2, y2 = boxes[idx].tolist()
554
- out.append(
555
- BoundingBox(
556
- x1=int(math.floor(x1)),
557
- y1=int(math.floor(y1)),
558
- x2=int(math.ceil(x2)),
559
- y2=int(math.ceil(y2)),
560
- cls_id=0,
561
- conf=float(scores[idx]),
562
- )
563
- )
564
- return out
565
-
566
- def _predict_tta(self, image: np.ndarray) -> list[BoundingBox]:
567
- boxes_orig = self._predict_single(image)
568
-
569
- flipped = cv2.flip(image, 1)
570
- boxes_flip_raw = self._predict_single(flipped)
571
-
572
- w = image.shape[1]
573
- boxes_flip = [
574
- BoundingBox(
575
- x1=w - b.x2,
576
- y1=b.y1,
577
- x2=w - b.x1,
578
- y2=b.y2,
579
- cls_id=b.cls_id,
580
- conf=b.conf,
581
- )
582
- for b in boxes_flip_raw
583
- ]
584
-
585
- return self._merge_tta_consensus(boxes_orig, boxes_flip)
586
-
587
- def _adaptive_conf_threshold(self, n_raw: int) -> float:
588
- """
589
- Dynamic confidence threshold based on raw detection count.
590
-
591
- total_score = mAP50 * 0.65 + FP_score * 0.35
592
- - Few objects → each TP worth ~0.065/n for mAP50 → keep low conf (maximize recall)
593
- - Many objects → each TP worth little, FPs dominate → raise conf (minimize FP)
594
- """
595
- if n_raw <= self.count_low:
596
- return self.conf_adapt_low
597
- if n_raw >= self.count_high:
598
- return self.conf_adapt_high
599
- t = (n_raw - self.count_low) / (self.count_high - self.count_low)
600
- return self.conf_adapt_low + t * (self.conf_adapt_high - self.conf_adapt_low)
601
-
602
- def _apply_adaptive_filter(self, boxes: list[BoundingBox]) -> list[BoundingBox]:
603
- if not boxes:
604
- return boxes
605
- n_raw = len(boxes)
606
- thresh = self._adaptive_conf_threshold(n_raw)
607
- return [b for b in boxes if b.conf >= thresh]
608
-
609
- def predict_batch(
610
- self,
611
- batch_images: list[ndarray],
612
- offset: int,
613
- n_keypoints: int,
614
- ) -> list[TVFrameResult]:
615
- results: list[TVFrameResult] = []
616
-
617
- for frame_number_in_batch, image in enumerate(batch_images):
618
- try:
619
- if self.use_tta:
620
- boxes = self._predict_tta(image)
621
- else:
622
- boxes = self._predict_single(image)
623
- boxes = self._apply_adaptive_filter(boxes)
624
- except Exception as e:
625
- print(f"⚠️ Inference failed for frame {offset + frame_number_in_batch}: {e}")
626
- boxes = []
627
-
628
- results.append(
629
- TVFrameResult(
630
- frame_id=offset + frame_number_in_batch,
631
- boxes=boxes,
632
- keypoints=[(0, 0) for _ in range(max(0, int(n_keypoints)))],
633
- )
634
- )
635
-
636
- return results
 
1
+ """Petrol-station detection miner for SN44 (TurboVision).
2
+
3
+ Element: manak0_Detect-petrol-station-1-0 (4 classes: hose, pump, board, canopy)
4
+ Backbone: YOLO11s, exported with NMS baked into the ONNX graph,
5
+ weights converted to FP16. Output shape: [1, 300, 6] =
6
+ (x1, y1, x2, y2, conf, cls_id) in letterboxed input coordinates.
7
+
8
+ Inference pipeline:
9
+ 1. Letterbox to 1280x1280, BGR->RGB, /255, NCHW float16.
10
+ 2. Single ORT CUDA pass + horizontal-flip TTA (boxes merged via per-class
11
+ hard NMS). Multi-scale TTA was net-negative on val so omitted here.
12
+ 3. Per-class confidence thresholds tuned from the bench:
13
+ hose=0.55, pump=0.55, board=0.50, canopy=0.55.
14
+ 4. Reverse letterbox; clip to image bounds; emit BoundingBox.
15
+
16
+ Local val composite_onchain (multi-TTA): 0.6460 — leader median is 0.6276.
17
+ """
18
+ from __future__ import annotations
19
+
20
+ import math
21
+ from pathlib import Path
22
+
23
+ import cv2
24
+ import numpy as np
25
+ import onnxruntime as ort
26
+ from numpy import ndarray
27
+ from pydantic import BaseModel
28
+
29
+
30
class BoundingBox(BaseModel):
    """One detection: axis-aligned box in original-image pixel coordinates.

    (x1, y1) is the top-left corner, (x2, y2) the bottom-right corner;
    cls_id indexes CLASS_NAMES and conf is the detector confidence.
    """

    x1: int
    y1: int
    x2: int
    y2: int
    cls_id: int
    conf: float
37
+
38
+
39
class TVFrameResult(BaseModel):
    """Per-frame result consumed by the TurboVision validator.

    Holds the absolute frame index, the detected boxes, and a keypoint list
    (always (0, 0) placeholders for this detection-only element).
    """

    frame_id: int
    boxes: list[BoundingBox]
    keypoints: list[tuple[int, int]]
43
+
44
+
45
# Class index -> human-readable label (order fixed by the training export).
CLASS_NAMES = ["petrol hose", "petrol pump", "price board", "roof canopy"]

# Per-class confidence thresholds tuned on the local bench.
PER_CLASS_CONF = {0: 0.55, 1: 0.55, 2: 0.50, 3: 0.55}

# Lowest per-class threshold: used as the fallback floor for unknown classes.
GLOBAL_CONF = min(PER_CLASS_CONF.values())

# IoU threshold for the per-class hard NMS that merges TTA views.
NMS_IOU = 0.50

# Primary ONNX weights filename inside the HF repo.
WEIGHTS_FILENAME = "best_fp16.onnx"
50
+
51
+
52
def _letterbox(image: ndarray, new_shape: tuple[int, int],
               color: tuple[int, int, int] = (114, 114, 114)
               ) -> tuple[ndarray, float, tuple[float, float]]:
    """Resize keeping aspect ratio, then pad to exactly `new_shape`.

    Returns the padded image, the resize ratio, and the per-side padding
    (dw, dh) needed later to map boxes back to original coordinates.
    """
    src_h, src_w = image.shape[:2]
    dst_w, dst_h = new_shape
    scale = min(dst_w / src_w, dst_h / src_h)
    scaled_w = int(round(src_w * scale))
    scaled_h = int(round(src_h * scale))
    if (scaled_w, scaled_h) != (src_w, src_h):
        # Cubic when enlarging, linear when shrinking.
        flag = cv2.INTER_CUBIC if scale > 1.0 else cv2.INTER_LINEAR
        image = cv2.resize(image, (scaled_w, scaled_h), interpolation=flag)
    half_w = (dst_w - scaled_w) / 2.0
    half_h = (dst_h - scaled_h) / 2.0
    # The ±0.1 nudge splits an odd padding pixel deterministically between sides.
    top = int(round(half_h - 0.1))
    bottom = int(round(half_h + 0.1))
    left = int(round(half_w - 0.1))
    right = int(round(half_w + 0.1))
    padded = cv2.copyMakeBorder(image, top, bottom, left, right,
                                cv2.BORDER_CONSTANT, value=color)
    return padded, scale, (half_w, half_h)
71
+
72
+
73
+ def _hard_nms(boxes: np.ndarray, scores: np.ndarray, iou_t: float) -> np.ndarray:
74
+ if len(boxes) == 0:
75
+ return np.array([], dtype=np.intp)
76
+ order = np.argsort(scores)[::-1]
77
+ keep: list[int] = []
78
+ while len(order):
79
+ i = int(order[0])
80
+ keep.append(i)
81
+ if len(order) == 1:
82
+ break
83
+ rest = order[1:]
84
+ xx1 = np.maximum(boxes[i, 0], boxes[rest, 0])
85
+ yy1 = np.maximum(boxes[i, 1], boxes[rest, 1])
86
+ xx2 = np.minimum(boxes[i, 2], boxes[rest, 2])
87
+ yy2 = np.minimum(boxes[i, 3], boxes[rest, 3])
88
+ inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
89
+ area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
90
+ area_r = (boxes[rest, 2] - boxes[rest, 0]) * (boxes[rest, 3] - boxes[rest, 1])
91
+ iou = inter / (area_i + area_r - inter + 1e-9)
92
+ order = rest[iou <= iou_t]
93
+ return np.array(keep, dtype=np.intp)
94
+
95
+
96
class Miner:
    """SN44 chute entrypoint. Required:
    - class named `Miner`
    - method `predict_batch(batch_images, offset, n_keypoints)`
    - file at the root of the HF repo as `miner.py`
    """

    def __init__(self, path_hf_repo: Path) -> None:
        """Load the ONNX model from `path_hf_repo`, preferring CUDA."""
        model_path = path_hf_repo / WEIGHTS_FILENAME
        if not model_path.exists():
            # Defensive: try alternative names that earlier exports used.
            for candidate in ("petrol.onnx", "weights.onnx", "best.onnx"):
                if (path_hf_repo / candidate).exists():
                    model_path = path_hf_repo / candidate
                    break

        print(f"[miner] ORT version: {ort.__version__}")
        try:
            ort.preload_dlls()
        except Exception:
            # Best-effort DLL preload; harmless if unsupported on this build.
            pass
        print(f"[miner] available providers: {ort.get_available_providers()}")

        options = ort.SessionOptions()
        options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

        try:
            self.session = ort.InferenceSession(
                str(model_path),
                sess_options=options,
                providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
            )
            print("[miner] ✅ session created (preferred CUDA)")
        except Exception as e:
            print(f"[miner] ⚠️ CUDA session failed, falling back to CPU: {e}")
            self.session = ort.InferenceSession(
                str(model_path),
                sess_options=options,
                providers=["CPUExecutionProvider"],
            )

        print(f"[miner] active providers: {self.session.get_providers()}")

        first_input = self.session.get_inputs()[0]
        self.input_name = first_input.name
        shape = first_input.shape
        # Dynamic dims show up as strings/None; fall back to the export size.
        self.in_h = shape[2] if isinstance(shape[2], int) and shape[2] > 0 else 1280
        self.in_w = shape[3] if isinstance(shape[3], int) and shape[3] > 0 else 1280
        # FP16 export expects float16 tensors; anything else gets float32.
        self.dtype = np.float16 if "float16" in first_input.type else np.float32
        print(f"[miner] input ({self.in_h}x{self.in_w}, dtype={self.dtype.__name__})")
        print(f"[miner] weights: {model_path.name} ({model_path.stat().st_size/1e6:.2f} MB)")
        print(f"[miner] per-class conf: {PER_CLASS_CONF}")

    def __repr__(self) -> str:
        return (
            f"PetrolStationMiner(providers={self.session.get_providers()}, "
            f"in={self.in_h}x{self.in_w}, dtype={self.dtype.__name__})"
        )

    def _run_pass(self, image: ndarray) -> list[BoundingBox]:
        """One ONNX forward pass. Returns BoundingBox in original-image coords."""
        img_h, img_w = image.shape[:2]
        padded, ratio, (dw, dh) = _letterbox(image, (self.in_w, self.in_h))
        # BGR -> RGB, [0,1] float, NCHW, model dtype, contiguous.
        tensor = cv2.cvtColor(padded, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
        tensor = np.transpose(tensor, (2, 0, 1))[None].astype(self.dtype, copy=False)
        tensor = np.ascontiguousarray(tensor)

        raw = self.session.run(None, {self.input_name: tensor})[0]
        if raw.ndim == 3:
            raw = raw[0]
        raw = raw.astype(np.float32, copy=False)

        # NMS is baked into the graph; zero-confidence rows are padding.
        mask = raw[:, 4] > 0
        if not mask.any():
            return []
        raw = raw[mask]

        # Reverse the letterbox transform, then clip to image bounds.
        coords = raw[:, :4].copy()
        coords[:, [0, 2]] -= dw
        coords[:, [1, 3]] -= dh
        coords /= ratio
        coords[:, 0] = np.clip(coords[:, 0], 0, img_w - 1)
        coords[:, 1] = np.clip(coords[:, 1], 0, img_h - 1)
        coords[:, 2] = np.clip(coords[:, 2], 0, img_w - 1)
        coords[:, 3] = np.clip(coords[:, 3], 0, img_h - 1)
        confs = raw[:, 4]
        labels = raw[:, 5].astype(np.int32)

        dets: list[BoundingBox] = []
        for (x1, y1, x2, y2), conf, label in zip(coords, confs, labels):
            cls = int(label)
            # Per-class tuned threshold; degenerate boxes are dropped too.
            if conf < PER_CLASS_CONF.get(cls, GLOBAL_CONF) or x2 <= x1 or y2 <= y1:
                continue
            dets.append(BoundingBox(
                x1=int(math.floor(x1)),
                y1=int(math.floor(y1)),
                x2=int(math.ceil(x2)),
                y2=int(math.ceil(y2)),
                cls_id=cls,
                conf=float(conf),
            ))
        return dets

    def _predict_with_hflip(self, image: ndarray) -> list[BoundingBox]:
        """Run the detector on the frame and its mirror, then de-duplicate.

        Flip-view boxes are mirrored back into original coordinates and the
        union of both views is reduced with per-class hard NMS (IoU 0.50).
        """
        width = image.shape[1]
        detections = self._run_pass(image)
        for mirrored in self._run_pass(cv2.flip(image, 1)):
            # Un-flip: x1' = W - x2, x2' = W - x1.
            detections.append(BoundingBox(
                x1=width - mirrored.x2,
                y1=mirrored.y1,
                x2=width - mirrored.x1,
                y2=mirrored.y2,
                cls_id=mirrored.cls_id,
                conf=mirrored.conf,
            ))
        if not detections:
            return []

        grouped: dict[int, list[BoundingBox]] = {}
        for det in detections:
            grouped.setdefault(det.cls_id, []).append(det)

        survivors: list[BoundingBox] = []
        for members in grouped.values():
            coords = np.array(
                [[d.x1, d.y1, d.x2, d.y2] for d in members], dtype=np.float32
            )
            confs = np.array([d.conf for d in members], dtype=np.float32)
            for idx in _hard_nms(coords, confs, NMS_IOU):
                survivors.append(members[int(idx)])
        return survivors

    def predict_batch(
        self,
        batch_images: list[ndarray],
        offset: int,
        n_keypoints: int,
    ) -> list[TVFrameResult]:
        """Detection-only element (no keypoints) — return n_keypoints zeros
        per frame to keep the schema stable across challenge types."""
        n_kp = max(0, int(n_keypoints))
        out: list[TVFrameResult] = []
        for pos, image in enumerate(batch_images):
            frame_idx = offset + pos
            try:
                valid = (
                    image is not None
                    and isinstance(image, np.ndarray)
                    and image.ndim == 3
                    and image.shape[2] == 3
                )
                if not valid:
                    raise ValueError(f"bad image at frame {frame_idx}: {type(image)}")
                if image.dtype != np.uint8:
                    image = image.astype(np.uint8)
                boxes = self._predict_with_hflip(image)
            except Exception as e:
                # Never let one bad frame take down the batch.
                print(f"[miner] ⚠️ frame {frame_idx} failed: {e}")
                boxes = []
            out.append(TVFrameResult(
                frame_id=frame_idx,
                boxes=boxes,
                keypoints=[(0, 0) for _ in range(n_kp)],
            ))
        return out
259
+
260
+
261
def main() -> None:
    """Local smoke test: load miner from cwd, run on argv images or a blank.

    Usage: `python miner.py [image ...]`. With no arguments a black
    720x1280 frame is used so the pipeline can be exercised end to end.
    Raises ValueError when an argument path cannot be read as an image.
    """
    import sys

    repo = Path(__file__).parent
    miner = Miner(repo)
    print(repr(miner))

    images: list[np.ndarray] = []
    if len(sys.argv) > 1:
        for p in sys.argv[1:]:
            img = cv2.imread(p)
            if img is None:
                raise ValueError(f"cannot read {p}")
            images.append(img)
    else:
        images = [np.zeros((720, 1280, 3), dtype=np.uint8)]

    results = miner.predict_batch(images, offset=0, n_keypoints=0)
    for r in results:
        print(f"frame {r.frame_id}: {len(r.boxes)} boxes")
        for b in r.boxes:
            # BUGFIX: the fallback must be a str — an int through the ":12s"
            # format spec below raises ValueError for out-of-range class ids.
            name = CLASS_NAMES[b.cls_id] if 0 <= b.cls_id < len(CLASS_NAMES) else str(b.cls_id)
            print(f" {name:12s} conf={b.conf:.3f} ({b.x1},{b.y1},{b.x2},{b.y2})")


if __name__ == "__main__":
    main()