nexu02 committed on
Commit
58fd07f
·
verified ·
1 Parent(s): 9f41367

scorevision: push artifact

Browse files
Files changed (1) hide show
  1. miner.py +469 -129
miner.py CHANGED
@@ -1,27 +1,11 @@
1
- """miner.py — uploaded to artur7236/turbovision-beverage HF repo.
2
-
3
- Fine-tuned YOLO11s on SAM3-labelled beverage data + horizontal-flip TTA at
4
- 1280×1280 + confidence threshold 0.55. Local benchmark mean composite = 0.709
5
- vs UID 213's 0.667.
6
-
7
- Required by the chute template:
8
- - class Miner with __init__(self, path_hf_repo: Path)
9
- - predict_batch(batch_images, offset, n_keypoints) -> list[TVFrameResult]
10
- - BoundingBox + TVFrameResult pydantic models defined in this file
11
- - No imports from anywhere except stdlib + site-packages (no helper modules
12
- from the HF repo).
13
- """
14
  from pathlib import Path
 
15
 
 
16
  import numpy as np
 
17
  from numpy import ndarray
18
  from pydantic import BaseModel
19
- from ultralytics import YOLO
20
-
21
-
22
- # Class index follows the manifest objects order for Detect-beverage-detect:
23
- # [cup, bottle, can] → cls_id 0, 1, 2.
24
- CLASS_NAMES = ["cup", "bottle", "can"]
25
 
26
 
27
  class BoundingBox(BaseModel):
@@ -39,122 +23,478 @@ class TVFrameResult(BaseModel):
39
  keypoints: list[tuple[int, int]]
40
 
41
 
42
- def _iou(a: BoundingBox, b: BoundingBox) -> float:
43
- """Standard IoU between two axis-aligned boxes."""
44
- x1 = max(a.x1, b.x1)
45
- y1 = max(a.y1, b.y1)
46
- x2 = min(a.x2, b.x2)
47
- y2 = min(a.y2, b.y2)
48
- if x2 <= x1 or y2 <= y1:
49
- return 0.0
50
- inter = (x2 - x1) * (y2 - y1)
51
- area_a = max(0, a.x2 - a.x1) * max(0, a.y2 - a.y1)
52
- area_b = max(0, b.x2 - b.x1) * max(0, b.y2 - b.y1)
53
- union = area_a + area_b - inter
54
- return inter / union if union > 0 else 0.0
55
-
56
-
57
- def _cross_class_nms(boxes: list[BoundingBox], iou_thresh: float = 0.6) -> list[BoundingBox]:
58
- """Suppress cross-class overlapping boxes (keep highest-conf when IoU≥thresh).
59
-
60
- Ultralytics' default NMS only dedupes WITHIN a class. The SN44 validator
61
- counts cross-class overlapping boxes as false positives (only one class
62
- can be right per object), so we need this extra pass. Mirrors UID 213's
63
- _cross_class_dedup_op (IoU 0.7); we use 0.6 to be slightly more aggressive.
64
- """
65
- if len(boxes) <= 1:
66
- return boxes
67
- sorted_boxes = sorted(boxes, key=lambda b: -b.conf)
68
- kept: list[BoundingBox] = []
69
- for b in sorted_boxes:
70
- suppressed = False
71
- for k in kept:
72
- if _iou(b, k) >= iou_thresh:
73
- suppressed = True
74
- break
75
- if not suppressed:
76
- kept.append(b)
77
- return kept
78
-
79
-
80
  class Miner:
81
- """Fine-tuned YOLO11s with hflip TTA (Round 5, 607 validator-distribution imgs, 19 MB).
82
-
83
- Tuning rationale (all on SAM3 ground-truth, 361 val images):
84
- - imgsz=1280: bumps small-object recall on the typical 1408×768 frames
85
- the validator sends (cans, distant cups).
86
- - conf=0.55: sweet spot for the validator's pillar weighting
87
- (0.6·mAP50 + 0.4·(1−ffpi/10)). Below 0.45 the false-positive pillar
88
- crashes; above 0.65 recall drops faster than precision rises.
89
- - augment=True: ultralytics enables hflip TTA, our biggest single
90
- composite gain in offline sweep.
91
- - iou=0.45: standard YOLO NMS IoU; cross-class dedup happens implicitly.
92
- """
93
-
94
- IMAGE_SIZE = 1280
95
- CONF_THRESH = 0.50 # Round 4: 0.50 narrowly beat 0.45 (0.6982 vs 0.6980)
96
- IOU_THRESH = 0.45
97
- USE_TTA = True
98
- CROSS_CLASS_IOU = 0.6 # Round 6: post-NMS cross-class dedup
99
- # Block 8287800 had 18 boxes including same-pixel cup+bottle pairs.
100
 
101
  def __init__(self, path_hf_repo: Path) -> None:
102
- weights_path = path_hf_repo / "best.pt"
103
- if not weights_path.exists():
104
- raise FileNotFoundError(f"missing weights at {weights_path}")
105
- self.model = YOLO(str(weights_path))
106
- # Touch the model once so cold-start latency hits the warmup, not the
107
- # first validator call.
108
- dummy = np.zeros((640, 640, 3), dtype=np.uint8)
109
- _ = self.model.predict(dummy, imgsz=self.IMAGE_SIZE, conf=self.CONF_THRESH,
110
- iou=self.IOU_THRESH, augment=self.USE_TTA, verbose=False)
111
- print(f" YOLO11s loaded from {weights_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  def __repr__(self) -> str:
114
- return (f"YOLO11s_ft(imgsz={self.IMAGE_SIZE}, "
115
- f"conf={self.CONF_THRESH}, iou={self.IOU_THRESH}, "
116
- f"tta={self.USE_TTA})")
117
-
118
- def predict_batch(
119
- self,
120
- batch_images: list[ndarray],
121
- offset: int,
122
- n_keypoints: int,
123
- ) -> list[TVFrameResult]:
124
- # Run inference. Ultralytics will accept a list[ndarray]; with augment=True
125
- # it does the flip+merge internally per image.
126
- results = self.model.predict(
127
- batch_images,
128
- imgsz=self.IMAGE_SIZE,
129
- conf=self.CONF_THRESH,
130
- iou=self.IOU_THRESH,
131
- augment=self.USE_TTA,
132
- verbose=False,
133
  )
134
 
135
- out: list[TVFrameResult] = []
136
- # n_keypoints is irrelevant for detection elements; return zero-padded.
137
- kp_zeros = [(0, 0) for _ in range(max(0, int(n_keypoints)))]
138
-
139
- for i, r in enumerate(results):
140
- frame_id = offset + i
141
- boxes: list[BoundingBox] = []
142
- if r.boxes is not None and r.boxes.data is not None:
143
- for box in r.boxes.data.cpu().numpy():
144
- x1, y1, x2, y2, conf, cls_id = box.tolist()
145
- cls_id_int = int(cls_id)
146
- if cls_id_int < 0 or cls_id_int >= len(CLASS_NAMES):
147
- continue
148
- # ensure non-degenerate after rounding
149
- xi1, yi1, xi2, yi2 = int(x1), int(y1), int(x2), int(y2)
150
- if xi2 <= xi1 or yi2 <= yi1:
151
- continue
152
- boxes.append(BoundingBox(
153
- x1=xi1, y1=yi1, x2=xi2, y2=yi2,
154
- cls_id=cls_id_int, conf=float(conf),
155
- ))
156
- # Cross-class dedup (Ultralytics NMS is per-class only)
157
- boxes = _cross_class_nms(boxes, iou_thresh=self.CROSS_CLASS_IOU)
158
- out.append(TVFrameResult(frame_id=frame_id, boxes=boxes, keypoints=kp_zeros))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from pathlib import Path
2
+ import math
3
 
4
+ import cv2
5
  import numpy as np
6
+ import onnxruntime as ort
7
  from numpy import ndarray
8
  from pydantic import BaseModel
 
 
 
 
 
 
9
 
10
 
11
  class BoundingBox(BaseModel):
 
23
  keypoints: list[tuple[int, int]]
24
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
class Miner:
    """ONNX Runtime detector: flip TTA plus multi-stage box filtering.

    Per frame, ``_predict_tta`` runs the model on the image and on its
    horizontal mirror. Each view is decoded independently (per-class
    confidence filter with a rescue bonus, rescale to original pixels,
    sanity filter, class-agnostic hard NMS, cross-class dedup). The two views
    are then merged with per-class hard NMS, each survivor takes the highest
    score found in its same-class cluster, and a last cross-class dedup runs.
    """

    # Label per class index. The two per-class arrays below are indexed by the
    # same class id, so all three must stay aligned.
    class_names = ["cup", "bottle", "can"]
    # Fallback input side length used when the ONNX input dims are dynamic.
    input_size = 1280
    # IoU above which hard NMS suppresses the lower-scored box.
    iou_thres = 0.4
    # IoU above which cross-class dedup suppresses a box.
    cross_iou_thresh = 0.7
    # Sanity-filter limits, evaluated in original-image pixels.
    min_side = 8.0
    min_box_area = 100.0
    max_aspect_ratio = 10.0
    # Upper bound on detections kept per view and after the TTA merge.
    max_det = 300
    # Per-class confidence thresholds, indexed by class id.
    _conf_thres_array = np.array([0.6, 0.45, 0.5], dtype=np.float32)
    # Per-class rescue bonus (see ``_conf_filter_mask``), indexed by class id.
    _bonus_array = np.array([0.0, 0.0, 0.2], dtype=np.float32)

    def __init__(self, path_hf_repo: Path) -> None:
        """Create the ONNX Runtime session for ``weights.onnx``.

        Args:
            path_hf_repo: directory that contains ``weights.onnx``.

        Raises:
            FileNotFoundError: if ``weights.onnx`` is not present.
        """
        model_path = Path(path_hf_repo) / "weights.onnx"
        # Fail early with a clear message instead of an opaque ORT error after
        # both provider attempts have been tried.
        if not model_path.exists():
            raise FileNotFoundError(f"missing weights at {model_path}")
        print("ORT version:", ort.__version__)

        # Best effort: older onnxruntime builds may not offer preload_dlls.
        try:
            ort.preload_dlls()
            print("preload_dlls success")
        except Exception as e:
            print(f"preload_dlls failed: {e}")

        print("ORT available providers BEFORE session:", ort.get_available_providers())

        sess_options = ort.SessionOptions()
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

        try:
            self.session = ort.InferenceSession(
                str(model_path),
                sess_options=sess_options,
                providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
            )
            print("Created ORT session with preferred CUDA provider list")
        except Exception as e:
            print(f"CUDA session creation failed, falling back to CPU: {e}")
            self.session = ort.InferenceSession(
                str(model_path),
                sess_options=sess_options,
                providers=["CPUExecutionProvider"],
            )

        print("ORT session providers:", self.session.get_providers())

        for inp in self.session.get_inputs():
            print("INPUT:", inp.name, inp.shape, inp.type)
        for out in self.session.get_outputs():
            print("OUTPUT:", out.name, out.shape, out.type)

        first_input = self.session.get_inputs()[0]
        self.input_name = first_input.name
        self.output_names = [output.name for output in self.session.get_outputs()]
        self.input_shape = first_input.shape

        # Dims 2 and 3 are read as height and width. Symbolic or missing dims
        # fall back to ``input_size`` rather than raising IndexError.
        dims = list(self.input_shape) if self.input_shape is not None else []
        raw_h = dims[2] if len(dims) > 2 else None
        raw_w = dims[3] if len(dims) > 3 else None
        self.input_height = self._safe_dim(raw_h, default=self.input_size)
        self.input_width = self._safe_dim(raw_w, default=self.input_size)

        print(f"ONNX model loaded from: {model_path}")
        print(f"ONNX providers: {self.session.get_providers()}")
        print(f"ONNX input: name={self.input_name}, shape={self.input_shape}")

    def __repr__(self) -> str:
        return (
            f"ONNXRuntime(session={type(self.session).__name__}, "
            f"providers={self.session.get_providers()})"
        )

    @staticmethod
    def _safe_dim(value, default: int) -> int:
        """Return *value* if it is a positive int, otherwise *default*."""
        return value if isinstance(value, int) and value > 0 else default

    def _letterbox(self, image: ndarray, new_shape: tuple[int, int],
                   color=(114, 114, 114)
                   ) -> tuple[ndarray, float, tuple[float, float]]:
        """Resize *image* preserving aspect ratio and pad it to *new_shape*.

        Args:
            image: HWC image.
            new_shape: target ``(width, height)``.
            color: constant border colour used for the padding.

        Returns:
            ``(padded, ratio, (dw, dh))`` where ``ratio`` is the scale applied
            and ``dw`` / ``dh`` are the (possibly fractional) half-paddings on
            the x / y axes.
        """
        h, w = image.shape[:2]
        new_w, new_h = new_shape
        ratio = min(new_w / w, new_h / h)
        resized_w = int(round(w * ratio))
        resized_h = int(round(h * ratio))
        if (resized_w, resized_h) != (w, h):
            # Cubic when upscaling, linear when downscaling.
            interp = cv2.INTER_CUBIC if ratio > 1.0 else cv2.INTER_LINEAR
            image = cv2.resize(image, (resized_w, resized_h), interpolation=interp)
        dw = (new_w - resized_w) / 2.0
        dh = (new_h - resized_h) / 2.0
        # The +/-0.1 nudge splits an odd padding into floor/ceil halves so the
        # two sides always add up to the full padding.
        left = int(round(dw - 0.1))
        right = int(round(dw + 0.1))
        top = int(round(dh - 0.1))
        bottom = int(round(dh + 0.1))
        padded = cv2.copyMakeBorder(image, top, bottom, left, right,
                                    borderType=cv2.BORDER_CONSTANT, value=color)
        return padded, ratio, (dw, dh)

    def _preprocess(self, image: ndarray
                    ) -> tuple[np.ndarray, float, tuple[float, float],
                               tuple[int, int]]:
        """Turn an HWC image into the ``(1, 3, H, W)`` float32 model input.

        Returns:
            ``(tensor, ratio, pad, (orig_w, orig_h))``.
        """
        orig_h, orig_w = image.shape[:2]
        img, ratio, pad = self._letterbox(image, (self.input_width, self.input_height))
        # NOTE(review): assumes frames arrive in BGR channel order — confirm.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = img.astype(np.float32) / 255.0
        img = np.transpose(img, (2, 0, 1))[None, ...]
        img = np.ascontiguousarray(img, dtype=np.float32)
        return img, ratio, pad, (orig_w, orig_h)

    @staticmethod
    def _clip_boxes(boxes: np.ndarray, image_size: tuple[int, int]) -> np.ndarray:
        """Clip xyxy *boxes* to ``[0, w-1] x [0, h-1]`` in place and return them."""
        w, h = image_size
        boxes[:, 0] = np.clip(boxes[:, 0], 0, w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, h - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, w - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, h - 1)
        return boxes

    @staticmethod
    def _xywh_to_xyxy(boxes: np.ndarray) -> np.ndarray:
        """Convert centre-format ``(cx, cy, w, h)`` rows to a new xyxy array."""
        out = np.empty_like(boxes)
        out[:, 0] = boxes[:, 0] - boxes[:, 2] / 2.0
        out[:, 1] = boxes[:, 1] - boxes[:, 3] / 2.0
        out[:, 2] = boxes[:, 0] + boxes[:, 2] / 2.0
        out[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0
        return out

    @staticmethod
    def _hard_nms(boxes: np.ndarray, scores: np.ndarray,
                  iou_thresh: float) -> np.ndarray:
        """Greedy class-agnostic NMS.

        Returns:
            Indices into *boxes* of the kept boxes, in descending score order.
            A box is dropped when its IoU with a kept box exceeds *iou_thresh*.
        """
        n = len(boxes)
        if n == 0:
            return np.array([], dtype=np.intp)
        order = np.argsort(-scores)
        keep: list[int] = []
        while len(order) > 0:
            i = int(order[0])
            keep.append(i)
            if len(order) == 1:
                break
            rest = order[1:]
            xx1 = np.maximum(boxes[i, 0], boxes[rest, 0])
            yy1 = np.maximum(boxes[i, 1], boxes[rest, 1])
            xx2 = np.minimum(boxes[i, 2], boxes[rest, 2])
            yy2 = np.minimum(boxes[i, 3], boxes[rest, 3])
            inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
            a_i = (max(0.0, boxes[i, 2] - boxes[i, 0]) *
                   max(0.0, boxes[i, 3] - boxes[i, 1]))
            a_r = (np.maximum(0.0, boxes[rest, 2] - boxes[rest, 0]) *
                   np.maximum(0.0, boxes[rest, 3] - boxes[rest, 1]))
            # Epsilon keeps the division finite for zero-area pairs.
            iou = inter / (a_i + a_r - inter + 1e-7)
            order = rest[iou <= iou_thresh]
        return np.array(keep, dtype=np.intp)

    def _per_class_hard_nms(self, boxes: np.ndarray, scores: np.ndarray,
                            cls_ids: np.ndarray, iou_thresh: float
                            ) -> np.ndarray:
        """Run ``_hard_nms`` separately per class.

        Returns:
            Sorted (ascending) indices into the input arrays of the survivors.
        """
        if len(boxes) == 0:
            return np.array([], dtype=np.intp)
        all_keep: list[int] = []
        for c in np.unique(cls_ids):
            mask = cls_ids == c
            indices = np.where(mask)[0]
            keep = self._hard_nms(boxes[mask], scores[mask], iou_thresh)
            all_keep.extend(indices[keep].tolist())
        all_keep.sort()
        return np.array(all_keep, dtype=np.intp)

    def _cross_class_dedup_op(self, boxes: np.ndarray, scores: np.ndarray,
                              cls_ids: np.ndarray, iou_thresh: float
                              ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Suppress overlapping boxes regardless of class.

        Boxes are visited by decreasing margin over their class threshold
        (``score - threshold[class]``), ties broken by larger area. Every
        unsuppressed box suppresses all others whose IoU with it exceeds
        *iou_thresh*.

        *cls_ids* must hold valid indices into ``_conf_thres_array``; the
        decode paths guarantee this by filtering with ``_conf_filter_mask``.

        Returns:
            The kept ``(boxes, scores, cls_ids)`` in visiting order.
        """
        n = len(boxes)
        if n <= 1:
            return boxes, scores, cls_ids
        boxes = np.asarray(boxes, dtype=np.float32)
        scores = np.asarray(scores, dtype=np.float32)
        cls_ids = np.asarray(cls_ids, dtype=np.int32)
        areas = (np.maximum(0.0, boxes[:, 2] - boxes[:, 0]) *
                 np.maximum(0.0, boxes[:, 3] - boxes[:, 1]))
        margins = scores - self._conf_thres_array[cls_ids]
        # lexsort: the last key is primary -> margin desc, then area desc.
        order = np.lexsort((-areas, -margins))
        suppressed = np.zeros(n, dtype=bool)
        keep: list[int] = []
        for i in order:
            if suppressed[i]:
                continue
            keep.append(int(i))
            bi = boxes[i]
            xx1 = np.maximum(bi[0], boxes[:, 0])
            yy1 = np.maximum(bi[1], boxes[:, 1])
            xx2 = np.minimum(bi[2], boxes[:, 2])
            yy2 = np.minimum(bi[3], boxes[:, 3])
            inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
            a_i = max(1e-7, float((bi[2] - bi[0]) * (bi[3] - bi[1])))
            iou = inter / (a_i + areas - inter + 1e-7)
            dup = iou > iou_thresh
            dup[i] = False  # a box never suppresses itself
            suppressed |= dup
        keep_idx = np.array(keep, dtype=np.intp)
        return boxes[keep_idx], scores[keep_idx], cls_ids[keep_idx]

    def _filter_sane_boxes(self, boxes: np.ndarray, scores: np.ndarray,
                           cls_ids: np.ndarray, orig_size: tuple[int, int]
                           ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Drop boxes that are too small, too elongated, or nearly frame-sized.

        A box is kept when both sides are >= ``min_side``, its area lies in
        ``[min_box_area, 0.95 * image_area]`` and its aspect ratio is
        <= ``max_aspect_ratio``.
        """
        if len(boxes) == 0:
            return boxes, scores, cls_ids
        orig_w, orig_h = orig_size
        image_area = float(orig_w * orig_h)
        bw = np.maximum(0.0, boxes[:, 2] - boxes[:, 0])
        bh = np.maximum(0.0, boxes[:, 3] - boxes[:, 1])
        area = bw * bh
        # Degenerate boxes get an infinite aspect ratio so they are rejected.
        ar = np.where(
            (bw > 0) & (bh > 0),
            np.maximum(bw / np.maximum(bh, 1e-6), bh / np.maximum(bw, 1e-6)),
            np.inf,
        )
        keep = (
            (bw >= self.min_side) & (bh >= self.min_side) &
            (area >= self.min_box_area) &
            (area <= 0.95 * image_area) &
            (ar <= self.max_aspect_ratio)
        )
        return boxes[keep], scores[keep], cls_ids[keep]

    def _max_score_per_cluster(self, post_boxes: np.ndarray,
                               post_cls: np.ndarray,
                               full_boxes: np.ndarray,
                               full_scores: np.ndarray,
                               full_cls: np.ndarray,
                               iou_thresh: float) -> np.ndarray:
        """For each post-NMS box, return the best score in its cluster.

        The cluster of a box is every candidate in ``full_*`` of the same
        class whose IoU with it is >= *iou_thresh*. An empty cluster gives 0.0.
        """
        n = len(post_boxes)
        if n == 0:
            return np.empty(0, dtype=np.float32)
        full_areas = (np.maximum(0.0, full_boxes[:, 2] - full_boxes[:, 0]) *
                      np.maximum(0.0, full_boxes[:, 3] - full_boxes[:, 1]))
        out = np.empty(n, dtype=np.float32)
        for i, bi in enumerate(post_boxes):
            xx1 = np.maximum(bi[0], full_boxes[:, 0])
            yy1 = np.maximum(bi[1], full_boxes[:, 1])
            xx2 = np.minimum(bi[2], full_boxes[:, 2])
            yy2 = np.minimum(bi[3], full_boxes[:, 3])
            inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
            a_i = max(0.0, float((bi[2] - bi[0]) * (bi[3] - bi[1])))
            iou = inter / (a_i + full_areas - inter + 1e-7)
            cluster = (iou >= iou_thresh) & (full_cls == post_cls[i])
            out[i] = float(np.max(full_scores[cluster])) if np.any(cluster) else 0.0
        return out

    def _conf_filter_mask(self, scores: np.ndarray,
                          cls_ids: np.ndarray) -> np.ndarray:
        """Return a boolean keep-mask for the per-class confidence filter.

        A box is kept when ``score >= threshold[class]``. Per-class rescue: if
        a class with a positive bonus has no box passing, its top-1 candidate
        is admitted when ``score >= threshold[class] - bonus[class]``.

        Boxes whose class id is outside ``[0, len(_conf_thres_array))`` are
        always rejected. Without this guard an id that is too large raises
        IndexError (costing the whole frame) and a negative id silently wraps
        to another class's threshold.
        """
        if len(scores) == 0:
            return np.zeros(0, dtype=bool)
        scores = np.asarray(scores)
        cls_ids = np.asarray(cls_ids)
        n_classes = len(self._conf_thres_array)
        valid = (cls_ids >= 0) & (cls_ids < n_classes)
        # Index with a safe placeholder id; invalid rows are masked out anyway.
        safe_ids = np.where(valid, cls_ids, 0)
        keep = valid & (scores >= self._conf_thres_array[safe_ids])
        for c in np.unique(cls_ids[valid]):
            bonus = float(self._bonus_array[c])
            if bonus <= 0.0:
                continue
            members = np.where(cls_ids == c)[0]
            if keep[members].any():
                continue
            top = int(members[int(np.argmax(scores[members]))])
            if scores[top] >= self._conf_thres_array[c] - bonus:
                keep[top] = True
        return keep

    def _per_view_pipeline(self, boxes: np.ndarray, scores: np.ndarray,
                           cls_ids: np.ndarray, orig_size: tuple[int, int]
                           ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Post-filter one view's boxes (already in original-image pixels).

        Order: sanity filter -> class-agnostic hard NMS -> top ``max_det`` by
        score -> cross-class dedup.
        """
        boxes, scores, cls_ids = self._filter_sane_boxes(
            boxes, scores, cls_ids, orig_size
        )
        if len(boxes) == 0:
            return boxes, scores, cls_ids
        if len(boxes) > 1:
            keep = self._hard_nms(boxes, scores, self.iou_thres)
            boxes, scores, cls_ids = boxes[keep], scores[keep], cls_ids[keep]
        if len(scores) > self.max_det:
            top = np.argsort(-scores)[: self.max_det]
            boxes, scores, cls_ids = boxes[top], scores[top], cls_ids[top]
        if len(boxes) > 1:
            boxes, scores, cls_ids = self._cross_class_dedup_op(
                boxes, scores, cls_ids, self.cross_iou_thresh
            )
        return boxes, scores, cls_ids

    def _finalize_view(self, boxes: np.ndarray, scores: np.ndarray,
                       cls_ids: np.ndarray, ratio: float,
                       pad: tuple[float, float],
                       orig_size: tuple[int, int]) -> list[BoundingBox]:
        """Map letterboxed xyxy boxes to the original image and post-filter.

        *boxes* is modified in place (un-pad, un-scale, clip), so callers must
        pass an array they own.
        """
        pad_w, pad_h = pad
        boxes[:, [0, 2]] -= pad_w
        boxes[:, [1, 3]] -= pad_h
        boxes /= ratio
        boxes = self._clip_boxes(boxes, orig_size)

        boxes, scores, cls_ids = self._per_view_pipeline(
            boxes, scores, cls_ids, orig_size
        )
        return self._build_results(boxes, scores, cls_ids)

    def _decode_final_dets(self, preds: np.ndarray, ratio: float,
                           pad: tuple[float, float],
                           orig_size: tuple[int, int]) -> list[BoundingBox]:
        """Decode an output whose rows are ``[x1, y1, x2, y2, score, cls, ...]``.

        Raises:
            ValueError: if *preds* is not ``(N, >=6)`` or ``(1, N, >=6)``.
        """
        if preds.ndim == 3 and preds.shape[0] == 1:
            preds = preds[0]
        if preds.ndim != 2 or preds.shape[1] < 6:
            raise ValueError(f"Unexpected ONNX final-det output shape: {preds.shape}")

        boxes = preds[:, :4].astype(np.float32)
        scores = preds[:, 4].astype(np.float32)
        cls_ids = preds[:, 5].astype(np.int32)

        keep = self._conf_filter_mask(scores, cls_ids)
        boxes = boxes[keep]
        scores = scores[keep]
        cls_ids = cls_ids[keep]
        if len(boxes) == 0:
            return []

        return self._finalize_view(boxes, scores, cls_ids, ratio, pad, orig_size)

    def _decode_raw_yolo(self, preds: np.ndarray, ratio: float,
                         pad: tuple[float, float],
                         orig_size: tuple[int, int]) -> list[BoundingBox]:
        """Decode a raw head output of ``(cx, cy, w, h, class scores...)``.

        Accepts ``(1, N, 4+C)`` or the channels-first ``(1, 4+C, N)`` layout;
        the latter is detected when the first dim is <= 16 and smaller than
        the second, and is transposed.

        Raises:
            ValueError: if the output is not a single-batch 3-D tensor or has
                fewer than 5 columns after layout normalisation.
        """
        if preds.ndim != 3 or preds.shape[0] != 1:
            raise ValueError(f"Unexpected raw ONNX output shape: {preds.shape}")
        preds = preds[0]
        if preds.shape[0] <= 16 and preds.shape[1] > preds.shape[0]:
            preds = preds.T
        if preds.ndim != 2 or preds.shape[1] < 5:
            raise ValueError(f"Unexpected raw output shape: {preds.shape}")

        boxes_xywh = preds[:, :4].astype(np.float32)
        cls_part = preds[:, 4:].astype(np.float32)
        if cls_part.shape[1] == 1:
            scores = cls_part[:, 0]
            cls_ids = np.zeros(len(scores), dtype=np.int32)
        else:
            cls_ids = np.argmax(cls_part, axis=1).astype(np.int32)
            scores = cls_part[np.arange(len(cls_part)), cls_ids]

        keep = self._conf_filter_mask(scores, cls_ids)
        boxes_xywh = boxes_xywh[keep]
        scores = scores[keep]
        cls_ids = cls_ids[keep]
        if len(boxes_xywh) == 0:
            return []
        boxes = self._xywh_to_xyxy(boxes_xywh)

        return self._finalize_view(boxes, scores, cls_ids, ratio, pad, orig_size)

    @staticmethod
    def _build_results(boxes: np.ndarray, scores: np.ndarray,
                       cls_ids: np.ndarray) -> list[BoundingBox]:
        """Convert arrays to ``BoundingBox`` models, skipping degenerate boxes.

        Corners are rounded outward (floor for x1/y1, ceil for x2/y2).
        """
        results: list[BoundingBox] = []
        for box, conf, cls_id in zip(boxes, scores, cls_ids):
            x1, y1, x2, y2 = box.tolist()
            if x2 <= x1 or y2 <= y1:
                continue
            results.append(
                BoundingBox(
                    x1=int(math.floor(x1)),
                    y1=int(math.floor(y1)),
                    x2=int(math.ceil(x2)),
                    y2=int(math.ceil(y2)),
                    cls_id=int(cls_id),
                    conf=float(conf),
                )
            )
        return results

    def _postprocess(self, output: np.ndarray, ratio: float,
                     pad: tuple[float, float],
                     orig_size: tuple[int, int]) -> list[BoundingBox]:
        """Dispatch on output layout: final detections vs. raw YOLO head."""
        if output.ndim == 2 and output.shape[1] >= 6:
            return self._decode_final_dets(output, ratio, pad, orig_size)
        if output.ndim == 3 and output.shape[0] == 1 and output.shape[2] == 6:
            return self._decode_final_dets(output, ratio, pad, orig_size)
        return self._decode_raw_yolo(output, ratio, pad, orig_size)

    def _predict_single(self, image: np.ndarray) -> list[BoundingBox]:
        """Run one forward pass on *image* and decode the first model output.

        Raises:
            TypeError: if *image* is not a numpy array.
            ValueError: if *image* is None, empty, not HWC with 3 channels, or
                the preprocessed tensor does not match the model input shape.
        """
        if image is None:
            raise ValueError("Input image is None")
        if not isinstance(image, np.ndarray):
            raise TypeError(f"Input is not numpy array: {type(image)}")
        if image.ndim != 3:
            raise ValueError(f"Expected HWC image, got shape={image.shape}")
        if image.shape[2] != 3:
            raise ValueError(f"Expected 3 channels, got shape={image.shape}")
        # A zero-sized frame would otherwise fail with ZeroDivisionError
        # inside the letterbox scaling.
        if image.shape[0] == 0 or image.shape[1] == 0:
            raise ValueError(f"Empty image, got shape={image.shape}")
        if image.dtype != np.uint8:
            # Saturate instead of letting out-of-range values wrap modulo 256.
            image = np.clip(image, 0, 255).astype(np.uint8)

        input_tensor, ratio, pad, orig_size = self._preprocess(image)
        expected = (1, 3, self.input_height, self.input_width)
        if input_tensor.shape != expected:
            raise ValueError(
                f"Bad input tensor shape={input_tensor.shape}, expected={expected}"
            )

        outputs = self.session.run(self.output_names, {self.input_name: input_tensor})
        return self._postprocess(outputs[0], ratio, pad, orig_size)

    def _predict_tta(self, image: np.ndarray) -> list[BoundingBox]:
        """Detect on *image* and its horizontal flip, then merge the results.

        Flipped detections are mirrored back, both sets go through per-class
        hard NMS, each survivor takes the max score of its same-class cluster,
        and a final cross-class dedup is applied.
        """
        boxes_orig = self._predict_single(image)
        flipped = cv2.flip(image, 1)
        boxes_flip = self._predict_single(flipped)
        w = image.shape[1]
        # Mirror x-coordinates back into the un-flipped frame.
        boxes_flip = [
            BoundingBox(
                x1=w - b.x2, y1=b.y1, x2=w - b.x1, y2=b.y2,
                cls_id=b.cls_id, conf=b.conf,
            )
            for b in boxes_flip
        ]
        all_boxes = boxes_orig + boxes_flip
        if not all_boxes:
            return []

        coords = np.array(
            [[b.x1, b.y1, b.x2, b.y2] for b in all_boxes], dtype=np.float32
        )
        scores = np.array([b.conf for b in all_boxes], dtype=np.float32)
        cls_ids = np.array([b.cls_id for b in all_boxes], dtype=np.int32)

        hard_keep = self._per_class_hard_nms(coords, scores, cls_ids, self.iou_thres)
        if len(hard_keep) == 0:
            return []
        if len(hard_keep) > self.max_det:
            top = np.argsort(-scores[hard_keep])[: self.max_det]
            hard_keep = hard_keep[top]
        boosted = self._max_score_per_cluster(
            coords[hard_keep], cls_ids[hard_keep],
            coords, scores, cls_ids, self.iou_thres,
        )

        kept_coords = coords[hard_keep]
        kept_cls = cls_ids[hard_keep]
        if len(kept_coords) > 1:
            kept_coords, boosted, kept_cls = self._cross_class_dedup_op(
                kept_coords, boosted, kept_cls, self.cross_iou_thresh
            )

        # The inputs are integer, non-degenerate boxes, so the shared builder
        # yields the same models as an inline floor/ceil conversion.
        return self._build_results(kept_coords, boosted, kept_cls)

    def predict_batch(self, batch_images: list[ndarray], offset: int,
                      n_keypoints: int) -> list[TVFrameResult]:
        """Run TTA detection on every frame of the batch.

        Args:
            batch_images: frames to process.
            offset: frame id of the first image; frame ``i`` gets ``offset + i``.
            n_keypoints: number of ``(0, 0)`` placeholder keypoints to emit per
                frame (negative values are treated as 0).

        Returns:
            One ``TVFrameResult`` per input frame. A frame whose inference
            raises is logged and reported with an empty box list, so one bad
            frame does not fail the whole batch.
        """
        n_kp = max(0, int(n_keypoints))
        results: list[TVFrameResult] = []
        for frame_number_in_batch, image in enumerate(batch_images):
            frame_id = offset + frame_number_in_batch
            try:
                boxes = self._predict_tta(image)
            except Exception as e:
                print(f"Inference failed for frame {frame_id}: {e}")
                boxes = []
            results.append(
                TVFrameResult(
                    frame_id=frame_id,
                    boxes=boxes,
                    keypoints=[(0, 0) for _ in range(n_kp)],
                )
            )
        return results