SuperBitDev committed on
Commit
2eeeebe
·
verified ·
1 Parent(s): 72f590d

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. chute_config.yml +1 -1
  2. miner.py +422 -345
chute_config.yml CHANGED
@@ -8,7 +8,7 @@ Image:
8
  NodeSelector:
9
  gpu_count: 1
10
  min_vram_gb_per_gpu: 16
11
- max_hourly_price_per_gpu: 0.5
12
 
13
  exclude:
14
  - "5090"
 
8
  NodeSelector:
9
  gpu_count: 1
10
  min_vram_gb_per_gpu: 16
11
+ max_hourly_price_per_gpu: 1.0
12
 
13
  exclude:
14
  - "5090"
miner.py CHANGED
@@ -6,8 +6,6 @@ import numpy as np
6
  import onnxruntime as ort
7
  from numpy import ndarray
8
  from pydantic import BaseModel
9
- import argparse
10
- import json
11
 
12
 
13
  class BoundingBox(BaseModel):
@@ -26,20 +24,45 @@ class TVFrameResult(BaseModel):
26
 
27
  SIZE = 1280
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  class Miner:
31
  def __init__(self, path_hf_repo: Path) -> None:
32
  model_path = path_hf_repo / "weights.onnx"
33
- cn_path = model_path.with_name("class_names.txt")
34
- if cn_path.is_file():
35
- lines = cn_path.read_text(encoding="utf-8").splitlines()
36
- self.class_names = [
37
- ln.strip()
38
- for ln in lines
39
- if ln.strip() and not ln.strip().startswith("#")
40
- ]
41
  else:
42
- self.class_names = ["petrol_hose", "petrol_pump", "price board", "roof canopy"]
 
 
 
 
 
 
 
 
 
 
43
  print("ORT version:", ort.__version__)
44
 
45
  try:
@@ -83,14 +106,28 @@ class Miner:
83
  self.input_height = self._safe_dim(self.input_shape[2], default=SIZE)
84
  self.input_width = self._safe_dim(self.input_shape[3], default=SIZE)
85
 
86
- self.conf_thres = 0.49
87
- self.iou_thres = 0.28
88
- self.max_det = 100
 
 
 
89
  self.use_tta = True
90
- self.tile_size = SIZE
91
- self.overlap = 0.5
92
- self.use_slicer = True
93
- self.use_full_image_merge = True
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  print(f"✅ ONNX model loaded from: {model_path}")
96
  print(f"✅ ONNX providers: {self.session.get_providers()}")
@@ -106,6 +143,38 @@ class Miner:
106
  def _safe_dim(value, default: int) -> int:
107
  return value if isinstance(value, int) and value > 0 else default
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  def _letterbox(
110
  self,
111
  image: ndarray,
@@ -192,182 +261,131 @@ class Miner:
192
  out[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0
193
  return out
194
 
195
- def _slice_image(
196
- self, image: np.ndarray
197
- ) -> list[tuple[np.ndarray, tuple[int, int], tuple[int, int]]]:
198
- h, w = image.shape[:2]
199
- t = self.tile_size
200
- st = max(1, int(t * (1.0 - self.overlap)))
201
-
202
- xs = []
203
- x = 0
204
- while True:
205
- if x + t >= w:
206
- xs.append(max(0, w - t))
207
- break
208
- xs.append(x)
209
- x += st
210
-
211
- ys = []
212
- y = 0
213
- while True:
214
- if y + t >= h:
215
- ys.append(max(0, h - t))
216
- break
217
- ys.append(y)
218
- y += st
219
-
220
- xs = list(dict.fromkeys(xs))
221
- ys = list(dict.fromkeys(ys))
222
-
223
- out = []
224
- for y0 in ys:
225
- for x0 in xs:
226
- x1 = min(x0 + t, w)
227
- y1 = min(y0 + t, h)
228
- crop = image[y0:y1, x0:x1]
229
- vh, vw = crop.shape[:2]
230
- out.append((crop, (x0, y0), (vw, vh)))
231
- return out
232
-
233
- def _soft_nms(
234
- self,
235
  boxes: np.ndarray,
236
  scores: np.ndarray,
237
- sigma: float = 0.5,
238
- score_thresh: float = 0.01,
239
- ) -> tuple[np.ndarray, np.ndarray]:
240
- """
241
- Soft-NMS: Gaussian decay of overlapping scores instead of hard removal.
242
- Returns (kept_original_indices, updated_scores).
243
- """
244
- N = len(boxes)
245
- if N == 0:
246
- return np.array([], dtype=np.intp), np.array([], dtype=np.float32)
247
-
248
- boxes = boxes.astype(np.float32, copy=True)
249
- scores = scores.astype(np.float32, copy=True)
250
- order = np.arange(N)
251
 
252
- for i in range(N):
253
- max_pos = i + int(np.argmax(scores[i:]))
254
- boxes[[i, max_pos]] = boxes[[max_pos, i]]
255
- scores[[i, max_pos]] = scores[[max_pos, i]]
256
- order[[i, max_pos]] = order[[max_pos, i]]
257
 
258
- if i + 1 >= N:
 
 
 
259
  break
260
 
261
- xx1 = np.maximum(boxes[i, 0], boxes[i + 1:, 0])
262
- yy1 = np.maximum(boxes[i, 1], boxes[i + 1:, 1])
263
- xx2 = np.minimum(boxes[i, 2], boxes[i + 1:, 2])
264
- yy2 = np.minimum(boxes[i, 3], boxes[i + 1:, 3])
 
 
 
265
  inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
266
 
267
- area_i = max(0.0, float(
268
- (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
269
- ))
270
- areas_j = (
271
- np.maximum(0.0, boxes[i + 1:, 2] - boxes[i + 1:, 0])
272
- * np.maximum(0.0, boxes[i + 1:, 3] - boxes[i + 1:, 1])
273
  )
274
- iou = inter / (area_i + areas_j - inter + 1e-7)
275
- scores[i + 1:] *= np.exp(-(iou ** 2) / sigma)
276
 
277
- mask = scores > score_thresh
278
- return order[mask], scores[mask]
279
 
280
- @staticmethod
281
- def _hard_nms(
282
- boxes: np.ndarray,
283
- scores: np.ndarray,
284
- iou_thresh: float,
285
- ) -> np.ndarray:
286
- """
287
- Standard NMS: keep one box per overlapping cluster (the one with highest score).
288
- Returns indices of kept boxes (into the boxes/scores arrays).
289
- """
290
- N = len(boxes)
291
- if N == 0:
292
- return np.array([], dtype=np.intp)
293
- boxes = np.asarray(boxes, dtype=np.float32)
294
- scores = np.asarray(scores, dtype=np.float32)
295
- order = np.argsort(scores)[::-1]
296
- keep: list[int] = []
297
- suppressed = np.zeros(N, dtype=bool)
298
- for i in range(N):
299
- idx = order[i]
300
- if suppressed[idx]:
301
- continue
302
- keep.append(idx)
303
- bi = boxes[idx]
304
- for k in range(i + 1, N):
305
- jdx = order[k]
306
- if suppressed[jdx]:
307
- continue
308
- bj = boxes[jdx]
309
- xx1 = max(bi[0], bj[0])
310
- yy1 = max(bi[1], bj[1])
311
- xx2 = min(bi[2], bj[2])
312
- yy2 = min(bi[3], bj[3])
313
- inter = max(0.0, xx2 - xx1) * max(0.0, yy2 - yy1)
314
- area_i = (bi[2] - bi[0]) * (bi[3] - bi[1])
315
- area_j = (bj[2] - bj[0]) * (bj[3] - bj[1])
316
- iou = inter / (area_i + area_j - inter + 1e-7)
317
- if iou > iou_thresh:
318
- suppressed[jdx] = True
319
- return np.array(keep)
320
-
321
- def _hard_nms_by_class(
322
- self,
323
  boxes: np.ndarray,
324
  scores: np.ndarray,
325
  cls_ids: np.ndarray,
326
  iou_thresh: float,
 
327
  ) -> np.ndarray:
 
328
  if len(boxes) == 0:
329
  return np.array([], dtype=np.intp)
330
  keep_all: list[int] = []
331
  for c in np.unique(cls_ids):
332
- m = cls_ids == c
333
- inds = np.flatnonzero(m)
334
- sub_keep = self._hard_nms(boxes[m], scores[m], iou_thresh)
335
- keep_all.extend(int(inds[i]) for i in sub_keep)
336
- keep_all = np.asarray(keep_all, dtype=np.intp)
337
- order = np.argsort(scores[keep_all])[::-1][: self.max_det]
338
- return keep_all[order]
 
339
 
340
  @staticmethod
341
- def _max_score_per_cluster(
342
- coords: np.ndarray,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
  scores: np.ndarray,
344
- keep_indices: np.ndarray,
345
- iou_thresh: float,
346
- ) -> np.ndarray:
347
- """
348
- For each kept box, return the max original score among itself and any
349
- box that overlaps it with IOU >= iou_thresh (so TTA cluster keeps best conf).
350
- """
351
- n_keep = len(keep_indices)
352
- if n_keep == 0:
353
- return np.array([], dtype=np.float32)
354
- out = np.empty(n_keep, dtype=np.float32)
355
- coords = np.asarray(coords, dtype=np.float32)
356
- scores = np.asarray(scores, dtype=np.float32)
357
- for i in range(n_keep):
358
- idx = keep_indices[i]
359
- bi = coords[idx]
360
- xx1 = np.maximum(bi[0], coords[:, 0])
361
- yy1 = np.maximum(bi[1], coords[:, 1])
362
- xx2 = np.minimum(bi[2], coords[:, 2])
363
- yy2 = np.minimum(bi[3], coords[:, 3])
364
- inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
365
- area_i = (bi[2] - bi[0]) * (bi[3] - bi[1])
366
- areas_j = (coords[:, 2] - coords[:, 0]) * (coords[:, 3] - coords[:, 1])
367
- iou = inter / (area_i + areas_j - inter + 1e-7)
368
- in_cluster = iou >= iou_thresh
369
- out[i] = float(np.max(scores[in_cluster]))
370
- return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
 
372
  def _decode_final_dets(
373
  self,
@@ -375,12 +393,9 @@ class Miner:
375
  ratio: float,
376
  pad: tuple[float, float],
377
  orig_size: tuple[int, int],
378
- apply_optional_dedup: bool = False,
379
  ) -> list[BoundingBox]:
380
  """
381
- Primary path:
382
- expected output rows like [x1, y1, x2, y2, conf, cls_id]
383
- in letterboxed input coordinates.
384
  """
385
  if preds.ndim == 3 and preds.shape[0] == 1:
386
  preds = preds[0]
@@ -390,9 +405,10 @@ class Miner:
390
 
391
  boxes = preds[:, :4].astype(np.float32)
392
  scores = preds[:, 4].astype(np.float32)
393
- cls_ids = preds[:, 5].astype(np.int32)
394
 
395
- keep = scores >= self.conf_thres
 
396
  boxes = boxes[keep]
397
  scores = scores[keep]
398
  cls_ids = cls_ids[keep]
@@ -403,36 +419,35 @@ class Miner:
403
  pad_w, pad_h = pad
404
  orig_w, orig_h = orig_size
405
 
406
- # reverse letterbox
407
  boxes[:, [0, 2]] -= pad_w
408
  boxes[:, [1, 3]] -= pad_h
409
  boxes /= ratio
410
  boxes = self._clip_boxes(boxes, (orig_w, orig_h))
411
 
412
- if apply_optional_dedup and len(boxes) > 1:
413
- keep_idx, scores = self._soft_nms(boxes, scores)
414
- boxes = boxes[keep_idx]
415
- cls_ids = cls_ids[keep_idx]
416
 
417
- results: list[BoundingBox] = []
418
- for box, conf, cls_id in zip(boxes, scores, cls_ids):
419
- x1, y1, x2, y2 = box.tolist()
420
 
421
- if x2 <= x1 or y2 <= y1:
422
- continue
 
423
 
424
- results.append(
425
- BoundingBox(
426
- x1=int(math.floor(x1)),
427
- y1=int(math.floor(y1)),
428
- x2=int(math.ceil(x2)),
429
- y2=int(math.ceil(y2)),
430
- cls_id=int(cls_id),
431
- conf=float(conf),
432
- )
433
  )
434
-
435
- return results
 
436
 
437
  def _decode_raw_yolo(
438
  self,
@@ -441,21 +456,13 @@ class Miner:
441
  pad: tuple[float, float],
442
  orig_size: tuple[int, int],
443
  ) -> list[BoundingBox]:
444
- """
445
- Fallback path for raw YOLO predictions.
446
- Supports common layouts:
447
- - [1, C, N]
448
- - [1, N, C]
449
- """
450
  if preds.ndim != 3:
451
  raise ValueError(f"Unexpected raw ONNX output shape: {preds.shape}")
452
-
453
  if preds.shape[0] != 1:
454
  raise ValueError(f"Unexpected batch dimension in raw output: {preds.shape}")
455
 
456
  preds = preds[0]
457
 
458
- # Normalize to [N, C]
459
  if preds.shape[0] <= 16 and preds.shape[1] > preds.shape[0]:
460
  preds = preds.T
461
 
@@ -463,16 +470,27 @@ class Miner:
463
  raise ValueError(f"Unexpected normalized raw output shape: {preds.shape}")
464
 
465
  boxes_xywh = preds[:, :4].astype(np.float32)
466
- cls_part = preds[:, 4:].astype(np.float32)
467
 
468
- if cls_part.shape[1] == 1:
469
- scores = cls_part[:, 0]
 
 
 
 
 
470
  cls_ids = np.zeros(len(scores), dtype=np.int32)
471
  else:
472
- cls_ids = np.argmax(cls_part, axis=1).astype(np.int32)
473
- scores = cls_part[np.arange(len(cls_part)), cls_ids]
 
 
 
474
 
475
- keep = scores >= self.conf_thres
 
 
 
476
  boxes_xywh = boxes_xywh[keep]
477
  scores = scores[keep]
478
  cls_ids = cls_ids[keep]
@@ -481,12 +499,6 @@ class Miner:
481
  return []
482
 
483
  boxes = self._xywh_to_xyxy(boxes_xywh)
484
- keep_idx, scores = self._soft_nms(boxes, scores)
485
- keep_idx = keep_idx[: self.max_det]
486
- scores = scores[: self.max_det]
487
-
488
- boxes = boxes[keep_idx]
489
- cls_ids = cls_ids[keep_idx]
490
 
491
  pad_w, pad_h = pad
492
  orig_w, orig_h = orig_size
@@ -496,25 +508,30 @@ class Miner:
496
  boxes /= ratio
497
  boxes = self._clip_boxes(boxes, (orig_w, orig_h))
498
 
499
- results: list[BoundingBox] = []
500
- for box, conf, cls_id in zip(boxes, scores, cls_ids):
501
- x1, y1, x2, y2 = box.tolist()
502
 
503
- if x2 <= x1 or y2 <= y1:
504
- continue
 
505
 
506
- results.append(
507
- BoundingBox(
508
- x1=int(math.floor(x1)),
509
- y1=int(math.floor(y1)),
510
- x2=int(math.ceil(x2)),
511
- y2=int(math.ceil(y2)),
512
- cls_id=int(cls_id),
513
- conf=float(conf),
514
- )
515
- )
516
 
517
- return results
 
 
 
 
 
 
 
 
 
 
 
518
 
519
  def _postprocess(
520
  self,
@@ -531,8 +548,8 @@ class Miner:
531
  if output.ndim == 2 and output.shape[1] >= 6:
532
  return self._decode_final_dets(output, ratio, pad, orig_size)
533
 
534
- # final detections: [1,N,6]
535
- if output.ndim == 3 and output.shape[0] == 1 and output.shape[2] == 6:
536
  return self._decode_final_dets(output, ratio, pad, orig_size)
537
 
538
  # fallback raw decode
@@ -565,130 +582,137 @@ class Miner:
565
  det_output = outputs[0]
566
  return self._postprocess(det_output, ratio, pad, orig_size)
567
 
568
- def _predict_tta(self, image: np.ndarray) -> list[BoundingBox]:
569
- """Horizontal-flip TTA: merge original + flipped via hard NMS."""
570
- boxes_orig = self._predict_single(image)
571
-
572
- flipped = cv2.flip(image, 1)
573
- boxes_flip = self._predict_single(flipped)
574
-
575
- w = image.shape[1]
576
- boxes_flip = [
577
- BoundingBox(
578
- x1=w - b.x2, y1=b.y1, x2=w - b.x1, y2=b.y2,
579
- cls_id=b.cls_id, conf=b.conf,
580
- )
581
- for b in boxes_flip
582
- ]
583
-
584
- all_boxes = boxes_orig + boxes_flip
585
- if len(all_boxes) == 0:
586
  return []
587
 
588
- coords = np.array(
589
- [[b.x1, b.y1, b.x2, b.y2] for b in all_boxes], dtype=np.float32
 
 
 
 
 
 
 
 
 
 
 
 
590
  )
591
- scores = np.array([b.conf for b in all_boxes], dtype=np.float32)
592
 
593
- hard_keep = self._hard_nms(coords, scores, self.iou_thres)
594
- if len(hard_keep) == 0:
595
- return []
 
 
 
 
 
 
 
 
 
 
 
 
596
 
597
- # _hard_nms already orders kept indices by descending score.
598
- hard_keep = hard_keep[: self.max_det]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
599
 
600
- return [
601
- BoundingBox(
602
- x1=all_boxes[i].x1,
603
- y1=all_boxes[i].y1,
604
- x2=all_boxes[i].x2,
605
- y2=all_boxes[i].y2,
606
- cls_id=all_boxes[i].cls_id,
607
- conf=float(scores[i]),
608
- )
609
- for i in hard_keep
610
- ]
611
 
612
- def predict_image(self, image: np.ndarray) -> list[BoundingBox]:
613
- if image is None:
614
- raise ValueError("Input image is None")
615
- if not isinstance(image, np.ndarray):
616
- raise TypeError(f"Input is not numpy array: {type(image)}")
617
- if image.ndim != 3 or image.shape[2] != 3:
618
- raise ValueError(f"Expected HWC image with 3 channels, got shape={image.shape}")
619
-
620
- H, W = image.shape[:2]
621
- all_boxes: list[list[float]] = []
622
- all_scores: list[float] = []
623
- all_cls: list[int] = []
624
-
625
- if self.use_slicer:
626
- tiles = self._slice_image(image)
627
- for tile_img, (ox, oy), (vw, vh) in tiles:
628
- try:
629
- dets = self._predict_tta(tile_img) if self.use_tta else self._predict_single(tile_img)
630
- except Exception as e:
631
- print(f"⚠️ Tile inference failed at ({ox}, {oy}): {e}")
632
- continue
633
-
634
- left_edge = ox == 0
635
- top_edge = oy == 0
636
- right_edge = (ox + vw) >= W
637
- bottom_edge = (oy + vh) >= H
638
-
639
- for b in dets:
640
- bw = b.x2 - b.x1
641
- bh = b.y2 - b.y1
642
- m = max(8, int(min(bw, bh) * 0.2))
643
- if not left_edge and b.x1 < m:
644
- continue
645
- if not top_edge and b.y1 < m:
646
- continue
647
- if not right_edge and b.x2 > (vw - m):
648
- continue
649
- if not bottom_edge and b.y2 > (vh - m):
650
- continue
651
-
652
- x1 = max(0, min(W - 1, int(b.x1 + ox)))
653
- y1 = max(0, min(H - 1, int(b.y1 + oy)))
654
- x2 = max(0, min(W - 1, int(b.x2 + ox)))
655
- y2 = max(0, min(H - 1, int(b.y2 + oy)))
656
- if x2 > x1 and y2 > y1:
657
- all_boxes.append([x1, y1, x2, y2])
658
- all_scores.append(float(b.conf))
659
- all_cls.append(int(b.cls_id))
660
-
661
- if self.use_full_image_merge or not self.use_slicer:
662
- full_dets = self._predict_tta(image) if self.use_tta else self._predict_single(image)
663
- for b in full_dets:
664
- if b.x2 > b.x1 and b.y2 > b.y1:
665
- all_boxes.append([b.x1, b.y1, b.x2, b.y2])
666
- all_scores.append(float(b.conf))
667
- all_cls.append(int(b.cls_id))
668
-
669
- if not all_boxes:
670
  return []
671
 
672
- boxes = np.asarray(all_boxes, dtype=np.float32)
673
- scores = np.asarray(all_scores, dtype=np.float32)
674
- cls_ids = np.asarray(all_cls, dtype=np.int32)
675
- keep = self._hard_nms_by_class(boxes, scores, cls_ids, self.iou_thres)
 
676
 
677
- out: list[BoundingBox] = []
678
- for i in keep:
679
- b = boxes[i]
680
  out.append(
681
  BoundingBox(
682
- x1=int(math.floor(b[0])),
683
- y1=int(math.floor(b[1])),
684
- x2=int(math.ceil(b[2])),
685
- y2=int(math.ceil(b[3])),
686
- cls_id=int(cls_ids[i]),
687
- conf=float(scores[i]),
688
  )
689
  )
690
  return out
691
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
692
  def predict_batch(
693
  self,
694
  batch_images: list[ndarray],
@@ -699,7 +723,10 @@ class Miner:
699
 
700
  for frame_number_in_batch, image in enumerate(batch_images):
701
  try:
702
- boxes = self.predict_image(image)
 
 
 
703
  except Exception as e:
704
  print(f"⚠️ Inference failed for frame {offset + frame_number_in_batch}: {e}")
705
  boxes = []
@@ -713,3 +740,53 @@ class Miner:
713
  )
714
 
715
  return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import onnxruntime as ort
7
  from numpy import ndarray
8
  from pydantic import BaseModel
 
 
9
 
10
 
11
  class BoundingBox(BaseModel):
 
24
 
25
  SIZE = 1280
26
 
27
+ # --- Class labels (edit here; cls_id 0..N-1 matches this order) ---
28
+ CLASS_NAMES: tuple[str, ...] = (
29
+ "petrol hose",
30
+ "petrol pump",
31
+ "price board",
32
+ "roof canopy",
33
+ )
34
+
35
+ # If the ONNX model outputs class indices in a different order than CLASS_NAMES, set this
36
+ # to the same names in that model order. None = identity.
37
+ MODEL_CLASS_ORDER: tuple[str, ...] | None = None
38
+
39
+ # --- Per-class confidence (edit here) ---
40
+ # Same order as CLASS_NAMES. Use empty tuple () to use scalar defaults (conf_thres, conf_high).
41
+ PER_CLASS_CONF_THRES = (0.25, 0.42, 0.32, 0.45)
42
+ PER_CLASS_CONF_HIGH = (0.56, 0.62, 0.52, 0.6)
43
+
44
 
45
  class Miner:
46
  def __init__(self, path_hf_repo: Path) -> None:
47
  model_path = path_hf_repo / "weights.onnx"
48
+
49
+ self.class_names = list(CLASS_NAMES)
50
+ if MODEL_CLASS_ORDER is None:
51
+ self._train_cls_to_canonical = np.arange(
52
+ len(self.class_names), dtype=np.int32
53
+ )
 
 
54
  else:
55
+ if set(MODEL_CLASS_ORDER) != set(self.class_names) or len(
56
+ MODEL_CLASS_ORDER
57
+ ) != len(self.class_names):
58
+ raise ValueError(
59
+ "MODEL_CLASS_ORDER must be a permutation of CLASS_NAMES "
60
+ "(names in the order the ONNX model outputs cls indices)."
61
+ )
62
+ self._train_cls_to_canonical = np.array(
63
+ [self.class_names.index(n) for n in MODEL_CLASS_ORDER], dtype=np.int32
64
+ )
65
+
66
  print("ORT version:", ort.__version__)
67
 
68
  try:
 
106
  self.input_height = self._safe_dim(self.input_shape[2], default=SIZE)
107
  self.input_width = self._safe_dim(self.input_shape[3], default=SIZE)
108
 
109
+ # --- VehicleDetection/vehicle3 scoring-oriented thresholds ---
110
+ self.conf_thres = 0.25 # low threshold for candidate generation
111
+ self.conf_high = 0.5 # high-conf boxes can survive without TTA confirmation
112
+ self.iou_thres = 0.50
113
+ self.tta_match_iou = 0.6 # TTA agreement IoU
114
+ self.max_det = 150
115
  self.use_tta = True
116
+
117
+ n_cls = len(self.class_names)
118
+ self._conf_thres_per_class = self._per_class_vector(
119
+ n_cls, PER_CLASS_CONF_THRES, self.conf_thres, "PER_CLASS_CONF_THRES"
120
+ )
121
+ self._conf_high_per_class = self._per_class_vector(
122
+ n_cls, PER_CLASS_CONF_HIGH, self.conf_high, "PER_CLASS_CONF_HIGH"
123
+ )
124
+
125
+ # Box sanity (VehicleDetection/vehicle3)
126
+ self.min_box_area = 12 * 12
127
+ self.min_w = 8
128
+ self.min_h = 8
129
+ self.max_aspect_ratio = 8.0
130
+ self.max_box_area_ratio = 0.8
131
 
132
  print(f"✅ ONNX model loaded from: {model_path}")
133
  print(f"✅ ONNX providers: {self.session.get_providers()}")
 
143
  def _safe_dim(value, default: int) -> int:
144
  return value if isinstance(value, int) and value > 0 else default
145
 
146
@staticmethod
def _per_class_vector(
    n_cls: int,
    per_class: tuple[float, ...],
    scalar: float,
    name: str,
) -> np.ndarray:
    """Return a float32 vector of length ``n_cls`` with one value per class.

    Args:
        n_cls: Number of classes, i.e. the required length of the result.
        per_class: Per-class values in class order. ``None`` or any empty
            sequence means "broadcast ``scalar`` to every class".
        scalar: Value used for every class when ``per_class`` is empty.
        name: Name of the setting; only used in the error message.

    Returns:
        A new ``np.float32`` array of shape ``(n_cls,)``.

    Raises:
        ValueError: If ``per_class`` is non-empty and its length is not ``n_cls``.
    """
    # Use len() rather than truthiness: a multi-element NumPy array has no
    # single truth value, and None has no len().
    n_given = 0 if per_class is None else len(per_class)
    if n_given == 0:
        return np.full(n_cls, scalar, dtype=np.float32)
    if n_given != n_cls:
        raise ValueError(
            f"{name}: expected {n_cls} values (same order as CLASS_NAMES), got {n_given}"
        )
    return np.array(per_class, dtype=np.float32)
161
+
162
def _remap_train_cls_ids(self, cls_ids: np.ndarray) -> np.ndarray:
    """Translate model-order class indices through ``self._train_cls_to_canonical``.

    Indices outside the lookup table are clamped to its first/last entry
    before the lookup, so out-of-range ids never raise.
    """
    table = self._train_cls_to_canonical
    last = len(table) - 1
    safe_idx = cls_ids.astype(np.int64, copy=False).clip(0, last)
    return table[safe_idx]
169
+
170
def _clip_cls_id(self, cls_id: int) -> int:
    """Clamp ``cls_id`` into the index range of ``self.class_names``."""
    upper = len(self.class_names) - 1
    if cls_id < 0:
        return 0
    return upper if cls_id > upper else cls_id
177
+
178
  def _letterbox(
179
  self,
180
  image: ndarray,
 
261
  out[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0
262
  return out
263
 
264
@staticmethod
def _hard_nms(
    boxes: np.ndarray,
    scores: np.ndarray,
    iou_thresh: float,
) -> np.ndarray:
    """Greedy hard non-maximum suppression.

    Args:
        boxes: Rows of ``[x1, y1, x2, y2]``.
        scores: One score per row of ``boxes``.
        iou_thresh: A candidate is dropped when its IoU with an already kept
            box is strictly greater than this value.

    Returns:
        ``np.intp`` indices into ``boxes`` of the kept boxes, highest score
        first. Equal scores are visited in ascending index order so the
        result is deterministic.
    """
    if len(boxes) == 0:
        return np.array([], dtype=np.intp)

    boxes = np.asarray(boxes, dtype=np.float32)
    scores = np.asarray(scores, dtype=np.float32)

    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    # Areas do not change during suppression, so compute them once up front.
    # Each side is clamped at 0 so inverted boxes count as empty.
    areas = np.maximum(0.0, x2 - x1) * np.maximum(0.0, y2 - y1)

    # Stable sort on the negated scores: descending score, ties by index.
    order = np.argsort(-scores, kind="stable")
    keep: list[int] = []

    while order.size > 0:
        best = int(order[0])
        keep.append(best)
        rest = order[1:]
        if rest.size == 0:
            break

        inter_w = np.maximum(
            0.0, np.minimum(x2[best], x2[rest]) - np.maximum(x1[best], x1[rest])
        )
        inter_h = np.maximum(
            0.0, np.minimum(y2[best], y2[rest]) - np.maximum(y1[best], y1[rest])
        )
        inter = inter_w * inter_h

        # The epsilon keeps the division defined when both boxes are empty.
        iou = inter / (areas[best] + areas[rest] - inter + 1e-7)
        order = rest[iou <= iou_thresh]

    return np.array(keep, dtype=np.intp)
304
+
305
@classmethod
def _nms_per_class(
    cls,
    boxes: np.ndarray,
    scores: np.ndarray,
    cls_ids: np.ndarray,
    iou_thresh: float,
    max_det: int,
) -> np.ndarray:
    """Run hard NMS inside each class, then keep the global top ``max_det``.

    Args:
        boxes: Rows of ``[x1, y1, x2, y2]``.
        scores: One score per box.
        cls_ids: One class id per box; boxes only compete within their class.
        iou_thresh: IoU threshold forwarded to ``_hard_nms``.
        max_det: Maximum number of indices returned; values below zero are
            treated as zero.

    Returns:
        ``np.intp`` indices into ``boxes``, sorted by descending score
        (equal scores keep their per-class collection order).
    """
    if len(boxes) == 0:
        return np.array([], dtype=np.intp)

    keep_all: list[int] = []
    # np.unique only yields ids that occur, so every group is non-empty.
    for class_id in np.unique(cls_ids):
        idxs = np.flatnonzero(cls_ids == class_id)
        local_keep = cls._hard_nms(boxes[idxs], scores[idxs], iou_thresh)
        keep_all.extend(idxs[local_keep].tolist())

    kept = np.array(keep_all, dtype=np.intp)
    order = np.argsort(-scores[kept], kind="stable")
    # Clamp so a negative max_det cannot turn into "all but the last k".
    return kept[order[: max(0, max_det)]]
327
 
328
  @staticmethod
329
def _box_iou_one_to_many(box: np.ndarray, boxes: np.ndarray) -> np.ndarray:
    """Return the IoU of one ``[x1, y1, x2, y2]`` box against each row of ``boxes``.

    Args:
        box: A single box as four values ``[x1, y1, x2, y2]``.
        boxes: Boxes to compare against, one ``[x1, y1, x2, y2]`` row each.
            Any array-like is accepted; an empty input yields an empty result.

    Returns:
        One IoU value per row of ``boxes``. Inverted or empty boxes have zero
        area and therefore zero IoU.
    """
    box = np.asarray(box)
    boxes = np.asarray(boxes).reshape(-1, 4)

    inter_w = np.maximum(
        0.0, np.minimum(box[2], boxes[:, 2]) - np.maximum(box[0], boxes[:, 0])
    )
    inter_h = np.maximum(
        0.0, np.minimum(box[3], boxes[:, 3]) - np.maximum(box[1], boxes[:, 1])
    )
    inter = inter_w * inter_h

    # Clamp each side separately (as done for `boxes` below) so a box that is
    # inverted on both axes does not end up with a positive area.
    area_a = np.maximum(0.0, box[2] - box[0]) * np.maximum(0.0, box[3] - box[1])
    area_b = np.maximum(0.0, boxes[:, 2] - boxes[:, 0]) * np.maximum(
        0.0, boxes[:, 3] - boxes[:, 1]
    )

    # The epsilon keeps the division defined when the union is empty.
    return inter / (area_a + area_b - inter + 1e-7)
343
+
344
def _filter_sane_boxes(
    self,
    boxes: np.ndarray,
    scores: np.ndarray,
    cls_ids: np.ndarray,
    orig_size: tuple[int, int],
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Drop boxes whose geometry is implausible for the given image size.

    A box is kept only when all of the following hold:
      * width and height are positive and at least ``self.min_w`` / ``self.min_h``;
      * area is at least ``self.min_box_area``;
      * area is at most ``self.max_box_area_ratio`` times the image area;
      * the longer/shorter side ratio is at most ``self.max_aspect_ratio``.
    Boxes with NaN coordinates fail the comparisons and are dropped too, so
    they cannot reach the later integer conversion.

    Args:
        boxes: Rows of ``[x1, y1, x2, y2]``.
        scores: One score per box.
        cls_ids: One class id per box.
        orig_size: ``(width, height)`` of the image.

    Returns:
        The filtered ``(boxes, scores, cls_ids)``. Empty inputs are returned
        unchanged; when nothing survives, fresh empty arrays of shape
        ``(0, 4)``, ``(0,)`` and ``(0,)`` are returned.
    """
    if len(boxes) == 0:
        return boxes, scores, cls_ids

    orig_w, orig_h = orig_size
    image_area = float(orig_w * orig_h)

    # Work in float64 so threshold comparisons behave like plain Python floats.
    wide = boxes.astype(np.float64, copy=False)
    bw = wide[:, 2] - wide[:, 0]
    bh = wide[:, 3] - wide[:, 1]
    area = bw * bh
    # Denominators are floored so degenerate boxes cannot divide by zero;
    # such boxes are rejected by the size checks below anyway.
    aspect = np.maximum(bw / np.maximum(bh, 1e-6), bh / np.maximum(bw, 1e-6))

    mask = (
        (bw > 0)
        & (bh > 0)
        & (bw >= self.min_w)
        & (bh >= self.min_h)
        & (area >= self.min_box_area)
        & (area <= self.max_box_area_ratio * image_area)
        & (aspect <= self.max_aspect_ratio)
    )

    if not mask.any():
        return (
            np.empty((0, 4), dtype=np.float32),
            np.empty((0,), dtype=np.float32),
            np.empty((0,), dtype=np.int32),
        )

    keep = np.flatnonzero(mask)
    return boxes[keep], scores[keep], cls_ids[keep]
389
 
390
  def _decode_final_dets(
391
  self,
 
393
  ratio: float,
394
  pad: tuple[float, float],
395
  orig_size: tuple[int, int],
 
396
  ) -> list[BoundingBox]:
397
  """
398
+ Primary path: rows like [x1, y1, x2, y2, conf, cls_id] in letterboxed coords.
 
 
399
  """
400
  if preds.ndim == 3 and preds.shape[0] == 1:
401
  preds = preds[0]
 
405
 
406
  boxes = preds[:, :4].astype(np.float32)
407
  scores = preds[:, 4].astype(np.float32)
408
+ cls_ids = self._remap_train_cls_ids(preds[:, 5].astype(np.int32))
409
 
410
+ ci = np.clip(cls_ids.astype(np.int64), 0, len(self._conf_thres_per_class) - 1)
411
+ keep = scores >= self._conf_thres_per_class[ci]
412
  boxes = boxes[keep]
413
  scores = scores[keep]
414
  cls_ids = cls_ids[keep]
 
419
  pad_w, pad_h = pad
420
  orig_w, orig_h = orig_size
421
 
 
422
  boxes[:, [0, 2]] -= pad_w
423
  boxes[:, [1, 3]] -= pad_h
424
  boxes /= ratio
425
  boxes = self._clip_boxes(boxes, (orig_w, orig_h))
426
 
427
+ boxes, scores, cls_ids = self._filter_sane_boxes(boxes, scores, cls_ids, orig_size)
428
+ if len(boxes) == 0:
429
+ return []
 
430
 
431
+ keep_idx = self._nms_per_class(
432
+ boxes, scores, cls_ids, self.iou_thres, self.max_det
433
+ )
434
 
435
+ boxes = boxes[keep_idx]
436
+ scores = scores[keep_idx]
437
+ cls_ids = cls_ids[keep_idx]
438
 
439
+ return [
440
+ BoundingBox(
441
+ x1=int(math.floor(box[0])),
442
+ y1=int(math.floor(box[1])),
443
+ x2=int(math.ceil(box[2])),
444
+ y2=int(math.ceil(box[3])),
445
+ cls_id=int(cls_id),
446
+ conf=float(conf),
 
447
  )
448
+ for box, conf, cls_id in zip(boxes, scores, cls_ids)
449
+ if box[2] > box[0] and box[3] > box[1]
450
+ ]
451
 
452
  def _decode_raw_yolo(
453
  self,
 
456
  pad: tuple[float, float],
457
  orig_size: tuple[int, int],
458
  ) -> list[BoundingBox]:
 
 
 
 
 
 
459
  if preds.ndim != 3:
460
  raise ValueError(f"Unexpected raw ONNX output shape: {preds.shape}")
 
461
  if preds.shape[0] != 1:
462
  raise ValueError(f"Unexpected batch dimension in raw output: {preds.shape}")
463
 
464
  preds = preds[0]
465
 
 
466
  if preds.shape[0] <= 16 and preds.shape[1] > preds.shape[0]:
467
  preds = preds.T
468
 
 
470
  raise ValueError(f"Unexpected normalized raw output shape: {preds.shape}")
471
 
472
  boxes_xywh = preds[:, :4].astype(np.float32)
473
+ tail = preds[:, 4:].astype(np.float32)
474
 
475
+ if tail.shape[1] == 1:
476
+ scores = tail[:, 0]
477
+ cls_ids = np.zeros(len(scores), dtype=np.int32)
478
+ elif tail.shape[1] == 2:
479
+ obj = tail[:, 0]
480
+ cls_prob = tail[:, 1]
481
+ scores = obj * cls_prob
482
  cls_ids = np.zeros(len(scores), dtype=np.int32)
483
  else:
484
+ obj = tail[:, 0]
485
+ class_probs = tail[:, 1:]
486
+ cls_ids = np.argmax(class_probs, axis=1).astype(np.int32)
487
+ cls_scores = class_probs[np.arange(len(class_probs)), cls_ids]
488
+ scores = obj * cls_scores
489
 
490
+ cls_ids = self._remap_train_cls_ids(cls_ids)
491
+
492
+ ci = np.clip(cls_ids.astype(np.int64), 0, len(self._conf_thres_per_class) - 1)
493
+ keep = scores >= self._conf_thres_per_class[ci]
494
  boxes_xywh = boxes_xywh[keep]
495
  scores = scores[keep]
496
  cls_ids = cls_ids[keep]
 
499
  return []
500
 
501
  boxes = self._xywh_to_xyxy(boxes_xywh)
 
 
 
 
 
 
502
 
503
  pad_w, pad_h = pad
504
  orig_w, orig_h = orig_size
 
508
  boxes /= ratio
509
  boxes = self._clip_boxes(boxes, (orig_w, orig_h))
510
 
511
+ boxes, scores, cls_ids = self._filter_sane_boxes(boxes, scores, cls_ids, orig_size)
512
+ if len(boxes) == 0:
513
+ return []
514
 
515
+ keep_idx = self._nms_per_class(
516
+ boxes, scores, cls_ids, self.iou_thres, self.max_det
517
+ )
518
 
519
+ boxes = boxes[keep_idx]
520
+ scores = scores[keep_idx]
521
+ cls_ids = cls_ids[keep_idx]
 
 
 
 
 
 
 
522
 
523
+ return [
524
+ BoundingBox(
525
+ x1=int(math.floor(box[0])),
526
+ y1=int(math.floor(box[1])),
527
+ x2=int(math.ceil(box[2])),
528
+ y2=int(math.ceil(box[3])),
529
+ cls_id=int(cls_id),
530
+ conf=float(conf),
531
+ )
532
+ for box, conf, cls_id in zip(boxes, scores, cls_ids)
533
+ if box[2] > box[0] and box[3] > box[1]
534
+ ]
535
 
536
  def _postprocess(
537
  self,
 
548
  if output.ndim == 2 and output.shape[1] >= 6:
549
  return self._decode_final_dets(output, ratio, pad, orig_size)
550
 
551
+ # final detections: [1,N,C] with C>=6
552
+ if output.ndim == 3 and output.shape[0] == 1 and output.shape[2] >= 6:
553
  return self._decode_final_dets(output, ratio, pad, orig_size)
554
 
555
  # fallback raw decode
 
582
  det_output = outputs[0]
583
  return self._postprocess(det_output, ratio, pad, orig_size)
584
 
585
def _merge_tta_consensus(
    self,
    boxes_orig: list["BoundingBox"],
    boxes_flip: list["BoundingBox"],
) -> list["BoundingBox"]:
    """Merge detections of the original view and the (un-flipped) mirrored view.

    VehicleDetection/vehicle3 strategy:
      - keep any original-view box with conf >= its class's high threshold
      - keep lower-conf original boxes only if the best-overlapping flipped
        box reaches ``self.tta_match_iou``; the kept score is the larger of
        the two confidences
      - add flipped high-conf boxes that overlap no original box by at least
        ``self.tta_match_iou``
      - final per-class NMS capped at ``self.max_det``

    Matching between the two views uses IoU only; class ids are not compared.

    Args:
        boxes_orig: Detections from the original image.
        boxes_flip: Detections from the flipped image, already mapped back to
            original-image coordinates by the caller.

    Returns:
        The merged detections in the order produced by ``_nms_per_class``.
    """
    if not boxes_orig and not boxes_flip:
        return []

    def to_arrays(dets):
        """Split detections into (N, 4) coords, (N,) scores and (N,) class ids."""
        if not dets:
            return (
                np.empty((0, 4), dtype=np.float32),
                np.empty((0,), dtype=np.float32),
                np.empty((0,), dtype=np.int32),
            )
        return (
            np.array([[d.x1, d.y1, d.x2, d.y2] for d in dets], dtype=np.float32),
            np.array([d.conf for d in dets], dtype=np.float32),
            np.array([d.cls_id for d in dets], dtype=np.int32),
        )

    coords_o, scores_o, cls_o = to_arrays(boxes_orig)
    coords_f, scores_f, cls_f = to_arrays(boxes_flip)

    accepted_boxes = []
    accepted_scores = []
    accepted_cls = []

    def accept(box, score, cls_id):
        """Record one surviving detection."""
        accepted_boxes.append(box)
        accepted_scores.append(score)
        accepted_cls.append(cls_id)

    # Pass 1: original-view boxes.
    for box, score, raw_cls in zip(coords_o, scores_o, cls_o):
        cls_id = int(raw_cls)
        # The class id is clamped only for the threshold lookup; the stored
        # id stays as reported.
        high = float(self._conf_high_per_class[self._clip_cls_id(cls_id)])
        if score >= high:
            accept(box, score, cls_id)
            continue
        if len(coords_f) == 0:
            continue
        ious = self._box_iou_one_to_many(box, coords_f)
        j = int(np.argmax(ious))
        if ious[j] >= self.tta_match_iou:
            accept(box, max(score, scores_f[j]), cls_id)

    # Pass 2: confident flipped-view boxes the original view did not produce.
    for box, score, raw_cls in zip(coords_f, scores_f, cls_f):
        cls_id = int(raw_cls)
        if score < float(self._conf_high_per_class[self._clip_cls_id(cls_id)]):
            continue
        if (
            len(coords_o) == 0
            or np.max(self._box_iou_one_to_many(box, coords_o)) < self.tta_match_iou
        ):
            accept(box, score, cls_id)

    if not accepted_boxes:
        return []

    boxes = np.array(accepted_boxes, dtype=np.float32)
    scores = np.array(accepted_scores, dtype=np.float32)
    cls_ids = np.array(accepted_cls, dtype=np.int32)

    keep = self._nms_per_class(boxes, scores, cls_ids, self.iou_thres, self.max_det)

    out = []
    for idx in keep:
        x1, y1, x2, y2 = boxes[idx].tolist()
        out.append(
            BoundingBox(
                x1=int(math.floor(x1)),
                y1=int(math.floor(y1)),
                x2=int(math.ceil(x2)),
                y2=int(math.ceil(y2)),
                cls_id=int(cls_ids[idx]),
                conf=float(scores[idx]),
            )
        )
    return out
694
 
695
def _predict_tta(self, image: np.ndarray) -> list[BoundingBox]:
    """Detect on the image and on its horizontal mirror, then merge both views.

    Boxes found on the mirrored image are mapped back to the original
    orientation (x -> width - x, with x1/x2 swapped) before being handed to
    ``_merge_tta_consensus`` together with the original-view boxes.
    """
    direct = self._predict_single(image)
    mirrored = self._predict_single(cv2.flip(image, 1))

    width = image.shape[1]
    unflipped = []
    for det in mirrored:
        unflipped.append(
            BoundingBox(
                x1=width - det.x2,
                y1=det.y1,
                x2=width - det.x1,
                y2=det.y2,
                cls_id=det.cls_id,
                conf=det.conf,
            )
        )

    return self._merge_tta_consensus(direct, unflipped)
715
+
716
  def predict_batch(
717
  self,
718
  batch_images: list[ndarray],
 
723
 
724
  for frame_number_in_batch, image in enumerate(batch_images):
725
  try:
726
+ if self.use_tta:
727
+ boxes = self._predict_tta(image)
728
+ else:
729
+ boxes = self._predict_single(image)
730
  except Exception as e:
731
  print(f"⚠️ Inference failed for frame {offset + frame_number_in_batch}: {e}")
732
  boxes = []
 
740
  )
741
 
742
  return results
743
+
744
+
745
if __name__ == "__main__":
    # Manual smoke test: load weights.onnx from this directory, run the miner
    # on car1.png, print every detection and save an annotated copy.
    repo_dir = Path(__file__).parent
    miner = Miner(repo_dir)

    image_path = repo_dir / "car1.png"
    if not image_path.exists():
        raise FileNotFoundError(f"Test image not found: {image_path}")

    image = cv2.imread(str(image_path), cv2.IMREAD_COLOR)
    if image is None:
        raise RuntimeError(f"Failed to read image: {image_path}")

    results = miner.predict_batch([image], offset=0, n_keypoints=0)

    # Draw bounding boxes on a copy so the loaded image stays untouched.
    vis = image.copy()
    colors = [(0, 255, 0), (0, 0, 255), (255, 0, 0)]
    # Fixed label size: scaling the font by confidence made weak detections
    # unreadable.
    font_scale = 0.5
    n_names = len(miner.class_names)
    for frame in results:
        print(f"Frame {frame.frame_id}:")
        for i, box in enumerate(frame.boxes):
            color = colors[i % len(colors)]
            cv2.rectangle(
                vis,
                (box.x1, box.y1),
                (box.x2, box.y2),
                color,
                2,
            )
            # Fall back to the raw id when it has no name; a negative id must
            # not index class_names from the end.
            name = miner.class_names[box.cls_id] if 0 <= box.cls_id < n_names else box.cls_id
            label = f"{box.cls_id}_{name}:{box.conf:.2f}"
            cv2.putText(
                vis,
                label,
                (box.x1, max(0, box.y1 - 5)),
                cv2.FONT_HERSHEY_SIMPLEX,
                font_scale,
                color,
                1,
                cv2.LINE_AA,
            )
            print(
                f"  cls={box.cls_id} conf={box.conf:.3f} "
                f"box=({box.x1},{box.y1},{box.x2},{box.y2})"
            )
        print(len(frame.boxes))

    out_path = repo_dir / f"1_out_iou{miner.iou_thres:.2f}.png"
    cv2.imwrite(str(out_path), vis)
    print(f"Saved visualization to: {out_path}")