iotaminer committed on
Commit
685fd45
·
verified ·
1 Parent(s): a956cd6

scorevision: push artifact

Browse files
Files changed (1) hide show
  1. miner.py +293 -284
miner.py CHANGED
@@ -22,13 +22,22 @@ class TVFrameResult(BaseModel):
22
  boxes: list[BoundingBox]
23
  keypoints: list[tuple[int, int]]
24
 
 
 
25
 
26
  class Miner:
27
- def __init__(self,
28
- path_hf_repo: Path
29
- ) -> None:
30
  model_path = path_hf_repo / "weights.onnx"
31
- self.class_names = ['bus', 'car', 'truck', 'motorcycle']
 
 
 
 
 
 
 
 
 
32
  print("ORT version:", ort.__version__)
33
 
34
  try:
@@ -69,32 +78,14 @@ class Miner:
69
  self.output_names = [output.name for output in self.session.get_outputs()]
70
  self.input_shape = self.session.get_inputs()[0].shape
71
 
72
- self.input_height = self._safe_dim(self.input_shape[2], default=1280)
73
- self.input_width = self._safe_dim(self.input_shape[3], default=1280)
74
-
75
- # ---------- Scoring-oriented thresholds ----------
76
- # Low threshold for candidate generation
77
- self.conf_thres = 0.15
78
-
79
- # High-confidence boxes can survive without TTA confirmation
80
- self.conf_high = 0.50
81
-
82
- # NMS threshold
83
- self.iou_thres = 0.66
84
 
85
- # TTA confirmation IoU
86
- self.tta_match_iou = 0.91
87
-
88
- self.max_det = 150
89
  self.use_tta = True
90
 
91
- # Box sanity filters
92
- self.min_box_area = 4 * 4
93
- self.min_w = 2
94
- self.min_h = 2
95
- self.max_aspect_ratio = 12.0
96
- self.max_box_area_ratio = 0.95
97
-
98
  print(f"✅ ONNX model loaded from: {model_path}")
99
  print(f"✅ ONNX providers: {self.session.get_providers()}")
100
  print(f"✅ ONNX input: name={self.input_name}, shape={self.input_shape}")
@@ -115,6 +106,13 @@ class Miner:
115
  new_shape: tuple[int, int],
116
  color=(114, 114, 114),
117
  ) -> tuple[ndarray, float, tuple[float, float]]:
 
 
 
 
 
 
 
118
  h, w = image.shape[:2]
119
  new_w, new_h = new_shape
120
 
@@ -150,6 +148,14 @@ class Miner:
150
  def _preprocess(
151
  self, image: ndarray
152
  ) -> tuple[np.ndarray, float, tuple[float, float], tuple[int, int]]:
 
 
 
 
 
 
 
 
153
  orig_h, orig_w = image.shape[:2]
154
 
155
  img, ratio, pad = self._letterbox(
@@ -180,125 +186,125 @@ class Miner:
180
  out[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0
181
  return out
182
 
183
- @staticmethod
184
- def _hard_nms(
185
  boxes: np.ndarray,
186
  scores: np.ndarray,
187
- iou_thresh: float,
188
- ) -> np.ndarray:
189
- if len(boxes) == 0:
190
- return np.array([], dtype=np.intp)
191
-
192
- boxes = np.asarray(boxes, dtype=np.float32)
193
- scores = np.asarray(scores, dtype=np.float32)
194
- order = np.argsort(scores)[::-1]
195
- keep = []
 
196
 
197
- while len(order) > 0:
198
- i = order[0]
199
- keep.append(i)
200
- if len(order) == 1:
201
- break
202
 
203
- rest = order[1:]
 
 
 
 
204
 
205
- xx1 = np.maximum(boxes[i, 0], boxes[rest, 0])
206
- yy1 = np.maximum(boxes[i, 1], boxes[rest, 1])
207
- xx2 = np.minimum(boxes[i, 2], boxes[rest, 2])
208
- yy2 = np.minimum(boxes[i, 3], boxes[rest, 3])
209
 
 
 
 
 
210
  inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
211
 
212
- area_i = np.maximum(0.0, (boxes[i, 2] - boxes[i, 0])) * np.maximum(0.0, (boxes[i, 3] - boxes[i, 1]))
213
- area_r = np.maximum(0.0, (boxes[rest, 2] - boxes[rest, 0])) * np.maximum(0.0, (boxes[rest, 3] - boxes[rest, 1]))
214
-
215
- iou = inter / (area_i + area_r - inter + 1e-7)
216
- order = rest[iou <= iou_thresh]
 
 
 
 
217
 
218
- return np.array(keep, dtype=np.intp)
 
219
 
220
- @classmethod
221
- def _nms_per_class(
222
- cls,
223
  boxes: np.ndarray,
224
  scores: np.ndarray,
225
- cls_ids: np.ndarray,
226
  iou_thresh: float,
227
- max_det: int,
228
  ) -> np.ndarray:
229
- """NMS within each class so overlapping car vs bus predictions are not merged away."""
230
- if len(boxes) == 0:
 
 
 
 
231
  return np.array([], dtype=np.intp)
232
- keep_all: list[int] = []
233
- for c in np.unique(cls_ids):
234
- idxs = np.nonzero(cls_ids == c)[0]
235
- if len(idxs) == 0:
 
 
 
 
236
  continue
237
- local_keep = cls._hard_nms(boxes[idxs], scores[idxs], iou_thresh)
238
- keep_all.extend(idxs[local_keep].tolist())
239
- keep_all = np.array(keep_all, dtype=np.intp)
240
- order = np.argsort(scores[keep_all])[::-1]
241
- return keep_all[order[:max_det]]
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
  @staticmethod
244
- def _box_iou_one_to_many(box: np.ndarray, boxes: np.ndarray) -> np.ndarray:
245
- xx1 = np.maximum(box[0], boxes[:, 0])
246
- yy1 = np.maximum(box[1], boxes[:, 1])
247
- xx2 = np.minimum(box[2], boxes[:, 2])
248
- yy2 = np.minimum(box[3], boxes[:, 3])
249
-
250
- inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
251
-
252
- area_a = max(0.0, (box[2] - box[0]) * (box[3] - box[1]))
253
- area_b = np.maximum(0.0, boxes[:, 2] - boxes[:, 0]) * np.maximum(0.0, boxes[:, 3] - boxes[:, 1])
254
-
255
- return inter / (area_a + area_b - inter + 1e-7)
256
-
257
- def _filter_sane_boxes(
258
- self,
259
- boxes: np.ndarray,
260
  scores: np.ndarray,
261
- cls_ids: np.ndarray,
262
- orig_size: tuple[int, int],
263
- ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
264
- if len(boxes) == 0:
265
- return boxes, scores, cls_ids
266
-
267
- orig_w, orig_h = orig_size
268
- image_area = float(orig_w * orig_h)
269
-
270
- keep = []
271
- for i, box in enumerate(boxes):
272
- x1, y1, x2, y2 = box.tolist()
273
- bw = x2 - x1
274
- bh = y2 - y1
275
-
276
- if bw <= 0 or bh <= 0:
277
- continue
278
- if bw < self.min_w or bh < self.min_h:
279
- continue
280
-
281
- area = bw * bh
282
- if area < self.min_box_area:
283
- continue
284
- if area > self.max_box_area_ratio * image_area:
285
- continue
286
-
287
- ar = max(bw / max(bh, 1e-6), bh / max(bw, 1e-6))
288
- if ar > self.max_aspect_ratio:
289
- continue
290
-
291
- keep.append(i)
292
-
293
- if not keep:
294
- return (
295
- np.empty((0, 4), dtype=np.float32),
296
- np.empty((0,), dtype=np.float32),
297
- np.empty((0,), dtype=np.int32),
298
- )
299
-
300
- keep = np.array(keep, dtype=np.intp)
301
- return boxes[keep], scores[keep], cls_ids[keep]
302
 
303
  def _decode_final_dets(
304
  self,
@@ -306,7 +312,13 @@ class Miner:
306
  ratio: float,
307
  pad: tuple[float, float],
308
  orig_size: tuple[int, int],
 
309
  ) -> list[BoundingBox]:
 
 
 
 
 
310
  if preds.ndim == 3 and preds.shape[0] == 1:
311
  preds = preds[0]
312
 
@@ -317,9 +329,6 @@ class Miner:
317
  scores = preds[:, 4].astype(np.float32)
318
  cls_ids = preds[:, 5].astype(np.int32)
319
 
320
- # All trained vehicle classes pass: bus, car, truck, motorcycle.
321
-
322
- # candidate threshold
323
  keep = scores >= self.conf_thres
324
  boxes = boxes[keep]
325
  scores = scores[keep]
@@ -331,35 +340,36 @@ class Miner:
331
  pad_w, pad_h = pad
332
  orig_w, orig_h = orig_size
333
 
 
334
  boxes[:, [0, 2]] -= pad_w
335
  boxes[:, [1, 3]] -= pad_h
336
  boxes /= ratio
337
  boxes = self._clip_boxes(boxes, (orig_w, orig_h))
338
 
339
- boxes, scores, cls_ids = self._filter_sane_boxes(boxes, scores, cls_ids, orig_size)
340
- if len(boxes) == 0:
341
- return []
 
342
 
343
- keep_idx = self._nms_per_class(
344
- boxes, scores, cls_ids, self.iou_thres, self.max_det
345
- )
346
 
347
- boxes = boxes[keep_idx]
348
- scores = scores[keep_idx]
349
- cls_ids = cls_ids[keep_idx]
350
 
351
- return [
352
- BoundingBox(
353
- x1=int(math.floor(box[0])),
354
- y1=int(math.floor(box[1])),
355
- x2=int(math.ceil(box[2])),
356
- y2=int(math.ceil(box[3])),
357
- cls_id=int(cls_id),
358
- conf=float(conf),
 
359
  )
360
- for box, conf, cls_id in zip(boxes, scores, cls_ids)
361
- if box[2] > box[0] and box[3] > box[1]
362
- ]
363
 
364
  def _decode_raw_yolo(
365
  self,
@@ -368,8 +378,15 @@ class Miner:
368
  pad: tuple[float, float],
369
  orig_size: tuple[int, int],
370
  ) -> list[BoundingBox]:
 
 
 
 
 
 
371
  if preds.ndim != 3:
372
  raise ValueError(f"Unexpected raw ONNX output shape: {preds.shape}")
 
373
  if preds.shape[0] != 1:
374
  raise ValueError(f"Unexpected batch dimension in raw output: {preds.shape}")
375
 
@@ -383,26 +400,14 @@ class Miner:
383
  raise ValueError(f"Unexpected normalized raw output shape: {preds.shape}")
384
 
385
  boxes_xywh = preds[:, :4].astype(np.float32)
386
- tail = preds[:, 4:].astype(np.float32)
387
-
388
- # Supports:
389
- # [x,y,w,h,score] single-class
390
- # [x,y,w,h,obj,cls] YOLO standard single-class
391
- # [x,y,w,h,obj,cls1,cls2,...] multi-class
392
- if tail.shape[1] == 1:
393
- scores = tail[:, 0]
394
- cls_ids = np.zeros(len(scores), dtype=np.int32)
395
- elif tail.shape[1] == 2:
396
- obj = tail[:, 0]
397
- cls_prob = tail[:, 1]
398
- scores = obj * cls_prob
399
  cls_ids = np.zeros(len(scores), dtype=np.int32)
400
  else:
401
- obj = tail[:, 0]
402
- class_probs = tail[:, 1:]
403
- cls_ids = np.argmax(class_probs, axis=1).astype(np.int32)
404
- cls_scores = class_probs[np.arange(len(class_probs)), cls_ids]
405
- scores = obj * cls_scores
406
 
407
  keep = scores >= self.conf_thres
408
  boxes_xywh = boxes_xywh[keep]
@@ -413,6 +418,12 @@ class Miner:
413
  return []
414
 
415
  boxes = self._xywh_to_xyxy(boxes_xywh)
 
 
 
 
 
 
416
 
417
  pad_w, pad_h = pad
418
  orig_w, orig_h = orig_size
@@ -422,30 +433,25 @@ class Miner:
422
  boxes /= ratio
423
  boxes = self._clip_boxes(boxes, (orig_w, orig_h))
424
 
425
- boxes, scores, cls_ids = self._filter_sane_boxes(boxes, scores, cls_ids, orig_size)
426
- if len(boxes) == 0:
427
- return []
428
-
429
- keep_idx = self._nms_per_class(
430
- boxes, scores, cls_ids, self.iou_thres, self.max_det
431
- )
432
 
433
- boxes = boxes[keep_idx]
434
- scores = scores[keep_idx]
435
- cls_ids = cls_ids[keep_idx]
436
 
437
- return [
438
- BoundingBox(
439
- x1=int(math.floor(box[0])),
440
- y1=int(math.floor(box[1])),
441
- x2=int(math.ceil(box[2])),
442
- y2=int(math.ceil(box[3])),
443
- cls_id=int(cls_id),
444
- conf=float(conf),
 
445
  )
446
- for box, conf, cls_id in zip(boxes, scores, cls_ids)
447
- if box[2] > box[0] and box[3] > box[1]
448
- ]
449
 
450
  def _postprocess(
451
  self,
@@ -454,12 +460,19 @@ class Miner:
454
  pad: tuple[float, float],
455
  orig_size: tuple[int, int],
456
  ) -> list[BoundingBox]:
 
 
 
 
 
457
  if output.ndim == 2 and output.shape[1] >= 6:
458
  return self._decode_final_dets(output, ratio, pad, orig_size)
459
 
460
- if output.ndim == 3 and output.shape[0] == 1 and output.shape[2] >= 6:
 
461
  return self._decode_final_dets(output, ratio, pad, orig_size)
462
 
 
463
  return self._decode_raw_yolo(output, ratio, pad, orig_size)
464
 
465
  def _predict_single(self, image: np.ndarray) -> list[BoundingBox]:
@@ -489,110 +502,49 @@ class Miner:
489
  det_output = outputs[0]
490
  return self._postprocess(det_output, ratio, pad, orig_size)
491
 
492
- def _merge_tta_consensus(
493
- self,
494
- boxes_orig: list[BoundingBox],
495
- boxes_flip: list[BoundingBox],
496
- ) -> list[BoundingBox]:
497
- """
498
- Keep:
499
- - any box with conf >= conf_high
500
- - low/medium-conf boxes only if confirmed across TTA views
501
- Then run final hard NMS.
502
- """
503
- if not boxes_orig and not boxes_flip:
504
- return []
505
-
506
- coords_o = np.array([[b.x1, b.y1, b.x2, b.y2] for b in boxes_orig], dtype=np.float32) if boxes_orig else np.empty((0, 4), dtype=np.float32)
507
- scores_o = np.array([b.conf for b in boxes_orig], dtype=np.float32) if boxes_orig else np.empty((0,), dtype=np.float32)
508
- cls_o = np.array([b.cls_id for b in boxes_orig], dtype=np.int32) if boxes_orig else np.empty((0,), dtype=np.int32)
509
-
510
- coords_f = np.array([[b.x1, b.y1, b.x2, b.y2] for b in boxes_flip], dtype=np.float32) if boxes_flip else np.empty((0, 4), dtype=np.float32)
511
- scores_f = np.array([b.conf for b in boxes_flip], dtype=np.float32) if boxes_flip else np.empty((0,), dtype=np.float32)
512
- cls_f = np.array([b.cls_id for b in boxes_flip], dtype=np.int32) if boxes_flip else np.empty((0,), dtype=np.int32)
513
-
514
- accepted_boxes = []
515
- accepted_scores = []
516
- accepted_cls = []
517
-
518
- # Original view candidates
519
- for i in range(len(coords_o)):
520
- score = scores_o[i]
521
- if score >= self.conf_high:
522
- accepted_boxes.append(coords_o[i])
523
- accepted_scores.append(score)
524
- accepted_cls.append(int(cls_o[i]))
525
- elif len(coords_f) > 0:
526
- ious = self._box_iou_one_to_many(coords_o[i], coords_f)
527
- j = int(np.argmax(ious))
528
- if ious[j] >= self.tta_match_iou:
529
- fused_score = max(score, scores_f[j])
530
- accepted_boxes.append(coords_o[i])
531
- accepted_scores.append(fused_score)
532
- accepted_cls.append(int(cls_o[i]))
533
-
534
- # Flipped-view high-confidence boxes that original missed
535
- for i in range(len(coords_f)):
536
- score = scores_f[i]
537
- if score < self.conf_high:
538
- continue
539
-
540
- if len(coords_o) == 0:
541
- accepted_boxes.append(coords_f[i])
542
- accepted_scores.append(score)
543
- accepted_cls.append(int(cls_f[i]))
544
- continue
545
-
546
- ious = self._box_iou_one_to_many(coords_f[i], coords_o)
547
- if np.max(ious) < self.tta_match_iou:
548
- accepted_boxes.append(coords_f[i])
549
- accepted_scores.append(score)
550
- accepted_cls.append(int(cls_f[i]))
551
-
552
- if not accepted_boxes:
553
- return []
554
-
555
- boxes = np.array(accepted_boxes, dtype=np.float32)
556
- scores = np.array(accepted_scores, dtype=np.float32)
557
- cls_ids = np.array(accepted_cls, dtype=np.int32)
558
-
559
- keep = self._nms_per_class(boxes, scores, cls_ids, self.iou_thres, self.max_det)
560
-
561
- out = []
562
- for idx in keep:
563
- x1, y1, x2, y2 = boxes[idx].tolist()
564
- out.append(
565
- BoundingBox(
566
- x1=int(math.floor(x1)),
567
- y1=int(math.floor(y1)),
568
- x2=int(math.ceil(x2)),
569
- y2=int(math.ceil(y2)),
570
- cls_id=int(cls_ids[idx]),
571
- conf=float(scores[idx]),
572
- )
573
- )
574
- return out
575
-
576
  def _predict_tta(self, image: np.ndarray) -> list[BoundingBox]:
 
577
  boxes_orig = self._predict_single(image)
578
 
579
  flipped = cv2.flip(image, 1)
580
- boxes_flip_raw = self._predict_single(flipped)
581
 
582
  w = image.shape[1]
583
  boxes_flip = [
584
  BoundingBox(
585
- x1=w - b.x2,
586
- y1=b.y1,
587
- x2=w - b.x1,
588
- y2=b.y2,
589
- cls_id=b.cls_id,
590
- conf=b.conf,
591
  )
592
- for b in boxes_flip_raw
593
  ]
594
 
595
- return self._merge_tta_consensus(boxes_orig, boxes_flip)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
 
597
  def predict_batch(
598
  self,
@@ -611,7 +563,14 @@ class Miner:
611
  except Exception as e:
612
  print(f"⚠️ Inference failed for frame {offset + frame_number_in_batch}: {e}")
613
  boxes = []
614
-
 
 
 
 
 
 
 
615
  results.append(
616
  TVFrameResult(
617
  frame_id=offset + frame_number_in_batch,
@@ -621,3 +580,53 @@ class Miner:
621
  )
622
 
623
  return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  boxes: list[BoundingBox]
23
  keypoints: list[tuple[int, int]]
24
 
25
+ SIZE = 1280
26
+
27
 
28
  class Miner:
29
+ def __init__(self, path_hf_repo: Path) -> None:
 
 
30
  model_path = path_hf_repo / "weights.onnx"
31
+ cn_path = model_path.with_name("class_names.txt")
32
+ if cn_path.is_file():
33
+ lines = cn_path.read_text(encoding="utf-8").splitlines()
34
+ self.class_names = [
35
+ ln.strip()
36
+ for ln in lines
37
+ if ln.strip() and not ln.strip().startswith("#")
38
+ ]
39
+ else:
40
+ self.class_names = ["person"]
41
  print("ORT version:", ort.__version__)
42
 
43
  try:
 
78
  self.output_names = [output.name for output in self.session.get_outputs()]
79
  self.input_shape = self.session.get_inputs()[0].shape
80
 
81
+ self.input_height = self._safe_dim(self.input_shape[2], default=SIZE)
82
+ self.input_width = self._safe_dim(self.input_shape[3], default=SIZE)
 
 
 
 
 
 
 
 
 
 
83
 
84
+ self.conf_thres = 0.45
85
+ self.iou_thres = 0.5
86
+ self.max_det = 30
 
87
  self.use_tta = True
88
 
 
 
 
 
 
 
 
89
  print(f"✅ ONNX model loaded from: {model_path}")
90
  print(f"✅ ONNX providers: {self.session.get_providers()}")
91
  print(f"✅ ONNX input: name={self.input_name}, shape={self.input_shape}")
 
106
  new_shape: tuple[int, int],
107
  color=(114, 114, 114),
108
  ) -> tuple[ndarray, float, tuple[float, float]]:
109
+ """
110
+ Resize with unchanged aspect ratio and pad to target shape.
111
+ Returns:
112
+ padded_image,
113
+ ratio,
114
+ (pad_w, pad_h) # half-padding
115
+ """
116
  h, w = image.shape[:2]
117
  new_w, new_h = new_shape
118
 
 
148
  def _preprocess(
149
  self, image: ndarray
150
  ) -> tuple[np.ndarray, float, tuple[float, float], tuple[int, int]]:
151
+ """
152
+ Preprocess for fixed-size ONNX export:
153
+ - enhance image quality (CLAHE, denoise, sharpen)
154
+ - letterbox to model input size
155
+ - BGR -> RGB
156
+ - normalize to [0,1]
157
+ - HWC -> NCHW float32
158
+ """
159
  orig_h, orig_w = image.shape[:2]
160
 
161
  img, ratio, pad = self._letterbox(
 
186
  out[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0
187
  return out
188
 
189
def _soft_nms(
    self,
    boxes: np.ndarray,
    scores: np.ndarray,
    sigma: float = 0.5,
    score_thresh: float = 0.01,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Gaussian Soft-NMS: decay the scores of overlapping boxes instead of
    removing them outright.

    Boxes are processed greedily: at each step the highest-scoring remaining
    box is selected, and every box not yet selected has its score multiplied
    by ``exp(-iou**2 / sigma)`` where ``iou`` is its overlap with the
    selected box. The suppression is class-agnostic (class ids are not an
    input here).

    Args:
        boxes: array of rows ``[x1, y1, x2, y2]``.
        scores: one score per box.
        sigma: Gaussian decay width; must be > 0.
        score_thresh: boxes whose decayed score is not strictly greater than
            this value are dropped.

    Returns:
        ``(kept_original_indices, updated_scores)``, both in selection order
        (non-increasing decayed score). The inputs are not modified.

    Raises:
        ValueError: if ``sigma`` is not positive.
    """
    if sigma <= 0:
        # A non-positive sigma would divide by zero or amplify scores.
        raise ValueError(f"sigma must be > 0, got {sigma}")

    n = len(boxes)
    if n == 0:
        return np.array([], dtype=np.intp), np.array([], dtype=np.float32)

    # Work on private copies: rows are swapped and scores decayed in place.
    boxes = np.array(boxes, dtype=np.float32)
    scores = np.array(scores, dtype=np.float32)
    order = np.arange(n, dtype=np.intp)

    # The last box has nothing after it to decay, so n - 1 rounds suffice.
    for i in range(n - 1):
        # Selection step: bring the best remaining box to position i.
        best = i + int(np.argmax(scores[i:]))
        if best != i:
            boxes[[i, best]] = boxes[[best, i]]
            scores[[i, best]] = scores[[best, i]]
            order[[i, best]] = order[[best, i]]

        anchor = boxes[i]
        tail = boxes[i + 1:]

        inter_w = np.maximum(
            0.0, np.minimum(anchor[2], tail[:, 2]) - np.maximum(anchor[0], tail[:, 0])
        )
        inter_h = np.maximum(
            0.0, np.minimum(anchor[3], tail[:, 3]) - np.maximum(anchor[1], tail[:, 1])
        )
        inter = inter_w * inter_h

        # Clamp each side separately so an inverted box (negative width AND
        # negative height) cannot end up with a positive area.
        anchor_area = max(0.0, float(anchor[2] - anchor[0])) * max(
            0.0, float(anchor[3] - anchor[1])
        )
        tail_area = np.maximum(0.0, tail[:, 2] - tail[:, 0]) * np.maximum(
            0.0, tail[:, 3] - tail[:, 1]
        )

        iou = inter / (anchor_area + tail_area - inter + 1e-7)
        scores[i + 1:] *= np.exp(-(iou ** 2) / sigma)

    alive = scores > score_thresh
    return order[alive], scores[alive]
235
 
236
+ @staticmethod
237
def _hard_nms(
    boxes: np.ndarray,
    scores: np.ndarray,
    iou_thresh: float,
) -> np.ndarray:
    """
    Standard greedy NMS: keep the highest-scoring box of every overlapping
    cluster and drop the rest.

    A box is suppressed when its IoU with an already-kept box is strictly
    greater than ``iou_thresh``. The suppression is class-agnostic.

    Args:
        boxes: array of rows ``[x1, y1, x2, y2]``.
        scores: one score per box.
        iou_thresh: IoU above which a lower-scoring box is suppressed.

    Returns:
        ``np.intp`` indices into ``boxes``/``scores`` of the kept boxes,
        ordered by descending score. Among equal scores the lower input
        index comes first, so the result is deterministic.
    """
    if len(boxes) == 0:
        return np.array([], dtype=np.intp)

    boxes = np.asarray(boxes, dtype=np.float32)
    scores = np.asarray(scores, dtype=np.float32)

    # Per-side clamping keeps degenerate or inverted boxes at zero area.
    areas = np.maximum(0.0, boxes[:, 2] - boxes[:, 0]) * np.maximum(
        0.0, boxes[:, 3] - boxes[:, 1]
    )

    # A stable sort on the negated scores gives a descending order that
    # breaks ties by input position.
    order = np.argsort(-scores, kind="stable")
    keep: list[int] = []

    while order.size > 0:
        best = int(order[0])
        keep.append(best)
        rest = order[1:]
        if rest.size == 0:
            break

        # IoU of the kept box against every remaining candidate at once.
        inter_w = np.maximum(
            0.0,
            np.minimum(boxes[best, 2], boxes[rest, 2])
            - np.maximum(boxes[best, 0], boxes[rest, 0]),
        )
        inter_h = np.maximum(
            0.0,
            np.minimum(boxes[best, 3], boxes[rest, 3])
            - np.maximum(boxes[best, 1], boxes[rest, 1]),
        )
        inter = inter_w * inter_h
        iou = inter / (areas[best] + areas[rest] - inter + 1e-7)

        order = rest[iou <= iou_thresh]

    return np.array(keep, dtype=np.intp)
276
 
277
  @staticmethod
278
def _max_score_per_cluster(
    coords: np.ndarray,
    scores: np.ndarray,
    keep_indices: np.ndarray,
    iou_thresh: float,
) -> np.ndarray:
    """
    For every kept box, return the best original score found in its cluster.

    The cluster of a kept box is the box itself plus every box in ``coords``
    whose IoU with it is >= ``iou_thresh``.

    Args:
        coords: array of rows ``[x1, y1, x2, y2]`` for all candidate boxes.
        scores: one original score per row of ``coords``.
        keep_indices: indices into ``coords`` of the boxes that were kept.
        iou_thresh: minimum IoU for a box to count as part of a cluster.

    Returns:
        ``float32`` array with one entry per element of ``keep_indices``.
    """
    keep_indices = np.asarray(keep_indices, dtype=np.intp)
    if keep_indices.size == 0:
        return np.array([], dtype=np.float32)

    coords = np.asarray(coords, dtype=np.float32)
    scores = np.asarray(scores, dtype=np.float32)

    # Per-side clamping keeps degenerate or inverted boxes at zero area.
    areas = np.maximum(0.0, coords[:, 2] - coords[:, 0]) * np.maximum(
        0.0, coords[:, 3] - coords[:, 1]
    )

    out = np.empty(keep_indices.size, dtype=np.float32)
    for pos, idx in enumerate(keep_indices):
        box = coords[idx]
        inter_w = np.maximum(
            0.0, np.minimum(box[2], coords[:, 2]) - np.maximum(box[0], coords[:, 0])
        )
        inter_h = np.maximum(
            0.0, np.minimum(box[3], coords[:, 3]) - np.maximum(box[1], coords[:, 1])
        )
        inter = inter_w * inter_h
        iou = inter / (areas[idx] + areas - inter + 1e-7)

        in_cluster = iou >= iou_thresh
        # A zero-area box has IoU 0 even with itself; force membership so the
        # cluster is never empty (np.max on an empty selection would raise).
        in_cluster[idx] = True
        out[pos] = scores[in_cluster].max()

    return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
 
309
  def _decode_final_dets(
310
  self,
 
312
  ratio: float,
313
  pad: tuple[float, float],
314
  orig_size: tuple[int, int],
315
+ apply_optional_dedup: bool = False,
316
  ) -> list[BoundingBox]:
317
+ """
318
+ Primary path:
319
+ expected output rows like [x1, y1, x2, y2, conf, cls_id]
320
+ in letterboxed input coordinates.
321
+ """
322
  if preds.ndim == 3 and preds.shape[0] == 1:
323
  preds = preds[0]
324
 
 
329
  scores = preds[:, 4].astype(np.float32)
330
  cls_ids = preds[:, 5].astype(np.int32)
331
 
 
 
 
332
  keep = scores >= self.conf_thres
333
  boxes = boxes[keep]
334
  scores = scores[keep]
 
340
  pad_w, pad_h = pad
341
  orig_w, orig_h = orig_size
342
 
343
+ # reverse letterbox
344
  boxes[:, [0, 2]] -= pad_w
345
  boxes[:, [1, 3]] -= pad_h
346
  boxes /= ratio
347
  boxes = self._clip_boxes(boxes, (orig_w, orig_h))
348
 
349
+ if apply_optional_dedup and len(boxes) > 1:
350
+ keep_idx, scores = self._soft_nms(boxes, scores)
351
+ boxes = boxes[keep_idx]
352
+ cls_ids = cls_ids[keep_idx]
353
 
354
+ results: list[BoundingBox] = []
355
+ for box, conf, cls_id in zip(boxes, scores, cls_ids):
356
+ x1, y1, x2, y2 = box.tolist()
357
 
358
+ if x2 <= x1 or y2 <= y1:
359
+ continue
 
360
 
361
+ results.append(
362
+ BoundingBox(
363
+ x1=int(math.floor(x1)),
364
+ y1=int(math.floor(y1)),
365
+ x2=int(math.ceil(x2)),
366
+ y2=int(math.ceil(y2)),
367
+ cls_id=int(cls_id),
368
+ conf=float(conf),
369
+ )
370
  )
371
+
372
+ return results
 
373
 
374
  def _decode_raw_yolo(
375
  self,
 
378
  pad: tuple[float, float],
379
  orig_size: tuple[int, int],
380
  ) -> list[BoundingBox]:
381
+ """
382
+ Fallback path for raw YOLO predictions.
383
+ Supports common layouts:
384
+ - [1, C, N]
385
+ - [1, N, C]
386
+ """
387
  if preds.ndim != 3:
388
  raise ValueError(f"Unexpected raw ONNX output shape: {preds.shape}")
389
+
390
  if preds.shape[0] != 1:
391
  raise ValueError(f"Unexpected batch dimension in raw output: {preds.shape}")
392
 
 
400
  raise ValueError(f"Unexpected normalized raw output shape: {preds.shape}")
401
 
402
  boxes_xywh = preds[:, :4].astype(np.float32)
403
+ cls_part = preds[:, 4:].astype(np.float32)
404
+
405
+ if cls_part.shape[1] == 1:
406
+ scores = cls_part[:, 0]
 
 
 
 
 
 
 
 
 
407
  cls_ids = np.zeros(len(scores), dtype=np.int32)
408
  else:
409
+ cls_ids = np.argmax(cls_part, axis=1).astype(np.int32)
410
+ scores = cls_part[np.arange(len(cls_part)), cls_ids]
 
 
 
411
 
412
  keep = scores >= self.conf_thres
413
  boxes_xywh = boxes_xywh[keep]
 
418
  return []
419
 
420
  boxes = self._xywh_to_xyxy(boxes_xywh)
421
+ keep_idx, scores = self._soft_nms(boxes, scores)
422
+ keep_idx = keep_idx[: self.max_det]
423
+ scores = scores[: self.max_det]
424
+
425
+ boxes = boxes[keep_idx]
426
+ cls_ids = cls_ids[keep_idx]
427
 
428
  pad_w, pad_h = pad
429
  orig_w, orig_h = orig_size
 
433
  boxes /= ratio
434
  boxes = self._clip_boxes(boxes, (orig_w, orig_h))
435
 
436
+ results: list[BoundingBox] = []
437
+ for box, conf, cls_id in zip(boxes, scores, cls_ids):
438
+ x1, y1, x2, y2 = box.tolist()
 
 
 
 
439
 
440
+ if x2 <= x1 or y2 <= y1:
441
+ continue
 
442
 
443
+ results.append(
444
+ BoundingBox(
445
+ x1=int(math.floor(x1)),
446
+ y1=int(math.floor(y1)),
447
+ x2=int(math.ceil(x2)),
448
+ y2=int(math.ceil(y2)),
449
+ cls_id=int(cls_id),
450
+ conf=float(conf),
451
+ )
452
  )
453
+
454
+ return results
 
455
 
456
  def _postprocess(
457
  self,
 
460
  pad: tuple[float, float],
461
  orig_size: tuple[int, int],
462
  ) -> list[BoundingBox]:
463
+ """
464
+ Prefer final detections first.
465
+ Fallback to raw decode only if needed.
466
+ """
467
+ # final detections: [N,6]
468
  if output.ndim == 2 and output.shape[1] >= 6:
469
  return self._decode_final_dets(output, ratio, pad, orig_size)
470
 
471
+ # final detections: [1,N,6]
472
+ if output.ndim == 3 and output.shape[0] == 1 and output.shape[2] == 6:
473
  return self._decode_final_dets(output, ratio, pad, orig_size)
474
 
475
+ # fallback raw decode
476
  return self._decode_raw_yolo(output, ratio, pad, orig_size)
477
 
478
  def _predict_single(self, image: np.ndarray) -> list[BoundingBox]:
 
502
  det_output = outputs[0]
503
  return self._postprocess(det_output, ratio, pad, orig_size)
504
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
505
  def _predict_tta(self, image: np.ndarray) -> list[BoundingBox]:
506
+ """Horizontal-flip TTA: merge original + flipped via hard NMS."""
507
  boxes_orig = self._predict_single(image)
508
 
509
  flipped = cv2.flip(image, 1)
510
+ boxes_flip = self._predict_single(flipped)
511
 
512
  w = image.shape[1]
513
  boxes_flip = [
514
  BoundingBox(
515
+ x1=w - b.x2, y1=b.y1, x2=w - b.x1, y2=b.y2,
516
+ cls_id=b.cls_id, conf=b.conf,
 
 
 
 
517
  )
518
+ for b in boxes_flip
519
  ]
520
 
521
+ all_boxes = boxes_orig + boxes_flip
522
+ if len(all_boxes) == 0:
523
+ return []
524
+
525
+ coords = np.array(
526
+ [[b.x1, b.y1, b.x2, b.y2] for b in all_boxes], dtype=np.float32
527
+ )
528
+ scores = np.array([b.conf for b in all_boxes], dtype=np.float32)
529
+
530
+ hard_keep = self._hard_nms(coords, scores, self.iou_thres)
531
+ if len(hard_keep) == 0:
532
+ return []
533
+
534
+ # _hard_nms already orders kept indices by descending score.
535
+ hard_keep = hard_keep[: self.max_det]
536
+
537
+ return [
538
+ BoundingBox(
539
+ x1=all_boxes[i].x1,
540
+ y1=all_boxes[i].y1,
541
+ x2=all_boxes[i].x2,
542
+ y2=all_boxes[i].y2,
543
+ cls_id=all_boxes[i].cls_id,
544
+ conf=float(scores[i]),
545
+ )
546
+ for i in hard_keep
547
+ ]
548
 
549
  def predict_batch(
550
  self,
 
563
  except Exception as e:
564
  print(f"⚠️ Inference failed for frame {offset + frame_number_in_batch}: {e}")
565
  boxes = []
566
+ # for box in boxes:
567
+ # if box.cls_id == 2:
568
+ # box.cls_id = 3
569
+ # elif box.cls_id == 3:
570
+ # box.cls_id = 2
571
+
572
+
573
+
574
  results.append(
575
  TVFrameResult(
576
  frame_id=offset + frame_number_in_batch,
 
580
  )
581
 
582
  return results
583
+
584
+
585
if __name__ == "__main__":
    # Manual smoke test: load weights.onnx from this file's directory, run the
    # miner on car1.png, print every detection and save an annotated copy.
    repo_dir = Path(__file__).parent
    miner = Miner(repo_dir)

    image_path = repo_dir / "car1.png"
    if not image_path.exists():
        raise FileNotFoundError(f"Test image not found: {image_path}")

    image = cv2.imread(str(image_path), cv2.IMREAD_COLOR)
    if image is None:
        raise RuntimeError(f"Failed to read image: {image_path}")

    results = miner.predict_batch([image], offset=0, n_keypoints=0)

    # Annotate a copy so the loaded image stays untouched.
    vis = image.copy()
    palette = [(0, 255, 0), (0, 0, 255), (255, 0, 0)]
    known_classes = len(miner.class_names)

    for frame in results:
        print(f"Frame {frame.frame_id}:")
        for n, det in enumerate(frame.boxes):
            color = palette[n % len(palette)]
            top_left = (det.x1, det.y1)
            bottom_right = (det.x2, det.y2)
            cv2.rectangle(vis, top_left, bottom_right, color, 2)

            # Fall back to the numeric id when there is no name for it.
            if det.cls_id < known_classes:
                cls_label = miner.class_names[det.cls_id]
            else:
                cls_label = det.cls_id
            label = f"{det.cls_id}_{cls_label}:{det.conf:.2f}"

            # NOTE(review): the font scale is the detection confidence, so
            # weaker detections get smaller text - confirm this is intended.
            text_origin = (det.x1, max(0, det.y1 - 5))
            cv2.putText(
                vis,
                label,
                text_origin,
                cv2.FONT_HERSHEY_SIMPLEX,
                det.conf,
                color,
                1,
                cv2.LINE_AA,
            )
            print(
                f"  cls={det.cls_id} conf={det.conf:.3f} "
                f"box=({det.x1},{det.y1},{det.x2},{det.y2})"
            )
        print(len(frame.boxes))

    out_path = repo_dir / f"1_out_iou{miner.iou_thres:.2f}.png"
    cv2.imwrite(str(out_path), vis)
    print(f"Saved visualization to: {out_path}")