coolroman committed on
Commit
6188620
·
verified ·
1 Parent(s): a0c3aae

scorevision: push artifact

Browse files
Files changed (1) hide show
  1. miner.py +409 -213
miner.py CHANGED
@@ -1,24 +1,24 @@
1
- """Petrol-station detection miner for SN44 (TurboVision).
2
 
3
- Element: manak0_Detect-petrol-station-1-0 (4 classes: hose, pump, board, canopy)
4
- Backbone: YOLO11s, exported with NMS baked into the ONNX graph,
5
- weights converted to FP16. Output shape: [1, 300, 6] =
6
- (x1, y1, x2, y2, conf, cls_id) in letterboxed input coordinates.
7
 
8
  Inference pipeline:
9
- 1. Letterbox to 1280x1280, BGR->RGB, /255, NCHW float16.
10
- 2. Single ORT CUDA pass + horizontal-flip TTA (boxes merged via per-class
11
- hard NMS). Multi-scale TTA was net-negative on val so omitted here.
12
- 3. Per-class confidence thresholds tuned from the bench:
13
- hose=0.55, pump=0.55, board=0.50, canopy=0.55.
14
- 4. Reverse letterbox; clip to image bounds; emit BoundingBox.
15
-
16
- Local val composite_onchain (multi-TTA): 0.6460 leader median is 0.6276.
 
17
  """
18
- from __future__ import annotations
19
 
20
- import math
21
  from pathlib import Path
 
22
 
23
  import cv2
24
  import numpy as np
@@ -42,191 +42,410 @@ class TVFrameResult(BaseModel):
42
  keypoints: list[tuple[int, int]]
43
 
44
 
45
- CLASS_NAMES = ["petrol hose", "petrol pump", "price board", "roof canopy"]
46
- PER_CLASS_CONF = {0: 0.55, 1: 0.55, 2: 0.50, 3: 0.55}
47
- GLOBAL_CONF = min(PER_CLASS_CONF.values()) # filter floor before per-class
48
- NMS_IOU = 0.50
49
- WEIGHTS_FILENAME = "best_fp16.onnx"
50
-
51
-
52
- def _letterbox(image: ndarray, new_shape: tuple[int, int],
53
- color: tuple[int, int, int] = (114, 114, 114)
54
- ) -> tuple[ndarray, float, tuple[float, float]]:
55
- h, w = image.shape[:2]
56
- new_w, new_h = new_shape
57
- ratio = min(new_w / w, new_h / h)
58
- rw, rh = int(round(w * ratio)), int(round(h * ratio))
59
- if (rw, rh) != (w, h):
60
- interp = cv2.INTER_CUBIC if ratio > 1.0 else cv2.INTER_LINEAR
61
- image = cv2.resize(image, (rw, rh), interpolation=interp)
62
- dw = (new_w - rw) / 2.0
63
- dh = (new_h - rh) / 2.0
64
- left = int(round(dw - 0.1))
65
- right = int(round(dw + 0.1))
66
- top = int(round(dh - 0.1))
67
- bottom = int(round(dh + 0.1))
68
- padded = cv2.copyMakeBorder(image, top, bottom, left, right,
69
- cv2.BORDER_CONSTANT, value=color)
70
- return padded, ratio, (dw, dh)
71
-
72
-
73
- def _hard_nms(boxes: np.ndarray, scores: np.ndarray, iou_t: float) -> np.ndarray:
74
- if len(boxes) == 0:
75
- return np.array([], dtype=np.intp)
76
- order = np.argsort(scores)[::-1]
77
- keep: list[int] = []
78
- while len(order):
79
- i = int(order[0])
80
- keep.append(i)
81
- if len(order) == 1:
82
- break
83
- rest = order[1:]
84
- xx1 = np.maximum(boxes[i, 0], boxes[rest, 0])
85
- yy1 = np.maximum(boxes[i, 1], boxes[rest, 1])
86
- xx2 = np.minimum(boxes[i, 2], boxes[rest, 2])
87
- yy2 = np.minimum(boxes[i, 3], boxes[rest, 3])
88
- inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
89
- area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
90
- area_r = (boxes[rest, 2] - boxes[rest, 0]) * (boxes[rest, 3] - boxes[rest, 1])
91
- iou = inter / (area_i + area_r - inter + 1e-9)
92
- order = rest[iou <= iou_t]
93
- return np.array(keep, dtype=np.intp)
94
-
95
-
96
  class Miner:
97
- """SN44 chute entrypoint. Required:
98
- - class named `Miner`
99
- - method `predict_batch(batch_images, offset, n_keypoints)`
100
- - file at the root of the HF repo as `miner.py`
101
- """
102
-
103
  def __init__(self, path_hf_repo: Path) -> None:
104
- model_path = path_hf_repo / WEIGHTS_FILENAME
105
- if not model_path.exists():
106
- # Defensive: try alternative names that earlier exports used
107
- for alt in ("petrol.onnx", "weights.onnx", "best.onnx"):
108
- if (path_hf_repo / alt).exists():
109
- model_path = path_hf_repo / alt
110
- break
111
-
112
- print(f"[miner] ORT version: {ort.__version__}")
 
 
 
 
 
 
 
 
113
  try:
114
  ort.preload_dlls()
115
- except Exception:
116
- pass
117
- print(f"[miner] available providers: {ort.get_available_providers()}")
 
118
 
119
- sess_opts = ort.SessionOptions()
120
- sess_opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
121
 
122
  try:
123
  self.session = ort.InferenceSession(
124
  str(model_path),
125
- sess_options=sess_opts,
126
  providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
127
  )
128
- print(f"[miner] ✅ session created (preferred CUDA)")
129
  except Exception as e:
130
- print(f"[miner] ⚠️ CUDA session failed, falling back to CPU: {e}")
131
  self.session = ort.InferenceSession(
132
  str(model_path),
133
- sess_options=sess_opts,
134
  providers=["CPUExecutionProvider"],
135
  )
136
-
137
- print(f"[miner] active providers: {self.session.get_providers()}")
138
-
139
- self.input_name = self.session.get_inputs()[0].name
140
- ishape = self.session.get_inputs()[0].shape
141
- self.in_h = ishape[2] if isinstance(ishape[2], int) and ishape[2] > 0 else 1280
142
- self.in_w = ishape[3] if isinstance(ishape[3], int) and ishape[3] > 0 else 1280
143
- ttype = self.session.get_inputs()[0].type
144
- self.dtype = np.float16 if "float16" in ttype else np.float32
145
- print(f"[miner] input ({self.in_h}x{self.in_w}, dtype={self.dtype.__name__})")
146
- print(f"[miner] weights: {model_path.name} ({model_path.stat().st_size/1e6:.2f} MB)")
147
- print(f"[miner] per-class conf: {PER_CLASS_CONF}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
  def __repr__(self) -> str:
150
  return (
151
- f"PetrolStationMiner(providers={self.session.get_providers()}, "
152
- f"in={self.in_h}x{self.in_w}, dtype={self.dtype.__name__})"
153
  )
154
 
155
- def _run_pass(self, image: ndarray) -> list[BoundingBox]:
156
- """One ONNX forward pass. Returns BoundingBox in original-image coords."""
157
- h, w = image.shape[:2]
158
- padded, ratio, (dw, dh) = _letterbox(image, (self.in_w, self.in_h))
159
- rgb = cv2.cvtColor(padded, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
160
- x = np.transpose(rgb, (2, 0, 1))[None].astype(self.dtype, copy=False)
161
- x = np.ascontiguousarray(x)
162
- out = self.session.run(None, {self.input_name: x})[0]
163
- if out.ndim == 3:
164
- out = out[0]
165
- out = out.astype(np.float32, copy=False)
166
-
167
- valid = out[:, 4] > 0
168
- if not valid.any():
169
- return []
170
- out = out[valid]
171
 
172
- boxes = out[:, :4].copy()
173
- boxes[:, [0, 2]] -= dw
174
- boxes[:, [1, 3]] -= dh
175
- boxes /= ratio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  boxes[:, 0] = np.clip(boxes[:, 0], 0, w - 1)
177
  boxes[:, 1] = np.clip(boxes[:, 1], 0, h - 1)
178
  boxes[:, 2] = np.clip(boxes[:, 2], 0, w - 1)
179
  boxes[:, 3] = np.clip(boxes[:, 3], 0, h - 1)
180
- scores = out[:, 4]
181
- cls_ids = out[:, 5].astype(np.int32)
182
 
183
- results: list[BoundingBox] = []
184
- for (x1, y1, x2, y2), c, k in zip(boxes, scores, cls_ids):
185
- cls = int(k)
186
- thr = PER_CLASS_CONF.get(cls, GLOBAL_CONF)
187
- if c < thr or x2 <= x1 or y2 <= y1:
 
 
 
 
 
 
 
 
 
 
 
 
188
  continue
189
- results.append(BoundingBox(
190
- x1=int(math.floor(x1)),
191
- y1=int(math.floor(y1)),
192
- x2=int(math.ceil(x2)),
193
- y2=int(math.ceil(y2)),
194
- cls_id=cls,
195
- conf=float(c),
196
- ))
197
- return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
- def _predict_with_hflip(self, image: ndarray) -> list[BoundingBox]:
200
- """Original + horizontal flip. Per-class hard NMS, IoU=0.50."""
201
- a = self._run_pass(image)
202
  flipped = cv2.flip(image, 1)
203
- b = self._run_pass(flipped)
204
- w = image.shape[1]
205
- b_unflipped = [
206
- BoundingBox(
207
- x1=w - bb.x2, y1=bb.y1, x2=w - bb.x1, y2=bb.y2,
208
- cls_id=bb.cls_id, conf=bb.conf,
209
- )
210
- for bb in b
211
- ]
212
- all_boxes = a + b_unflipped
213
- if not all_boxes:
 
 
 
 
214
  return []
 
215
 
216
- by_cls: dict[int, list[BoundingBox]] = {}
217
- for bb in all_boxes:
218
- by_cls.setdefault(bb.cls_id, []).append(bb)
219
 
220
- merged: list[BoundingBox] = []
221
- for cls_id, lst in by_cls.items():
222
- coords = np.array(
223
- [[bb.x1, bb.y1, bb.x2, bb.y2] for bb in lst], dtype=np.float32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  )
225
- scores = np.array([bb.conf for bb in lst], dtype=np.float32)
226
- keep = _hard_nms(coords, scores, NMS_IOU)
227
- for i in keep:
228
- merged.append(lst[int(i)])
229
- return merged
230
 
231
  def predict_batch(
232
  self,
@@ -234,55 +453,32 @@ class Miner:
234
  offset: int,
235
  n_keypoints: int,
236
  ) -> list[TVFrameResult]:
237
- """Detection-only element (no keypoints) — return n_keypoints zeros
238
- per frame to keep the schema stable across challenge types."""
239
- n_kp = max(0, int(n_keypoints))
240
  results: list[TVFrameResult] = []
241
- for i, image in enumerate(batch_images):
242
- frame_idx = offset + i
 
 
 
 
 
 
 
 
 
 
243
  try:
244
- if image is None or not isinstance(image, np.ndarray) \
245
- or image.ndim != 3 or image.shape[2] != 3:
246
- raise ValueError(f"bad image at frame {frame_idx}: {type(image)}")
247
- if image.dtype != np.uint8:
248
- image = image.astype(np.uint8)
249
- boxes = self._predict_with_hflip(image)
250
  except Exception as e:
251
- print(f"[miner] ⚠️ frame {frame_idx} failed: {e}")
252
  boxes = []
253
- results.append(TVFrameResult(
254
- frame_id=frame_idx,
255
- boxes=boxes,
256
- keypoints=[(0, 0) for _ in range(n_kp)],
257
- ))
 
 
258
  return results
259
-
260
-
261
- def main() -> None:
262
- """Local smoke test: load miner from cwd, run on argv images or a blank."""
263
- import sys
264
-
265
- repo = Path(__file__).parent
266
- miner = Miner(repo)
267
- print(repr(miner))
268
-
269
- images: list[np.ndarray] = []
270
- if len(sys.argv) > 1:
271
- for p in sys.argv[1:]:
272
- img = cv2.imread(p)
273
- if img is None:
274
- raise ValueError(f"cannot read {p}")
275
- images.append(img)
276
- else:
277
- images = [np.zeros((720, 1280, 3), dtype=np.uint8)]
278
-
279
- results = miner.predict_batch(images, offset=0, n_keypoints=0)
280
- for r in results:
281
- print(f"frame {r.frame_id}: {len(r.boxes)} boxes")
282
- for b in r.boxes:
283
- name = CLASS_NAMES[b.cls_id] if 0 <= b.cls_id < len(CLASS_NAMES) else b.cls_id
284
- print(f" {name:12s} conf={b.conf:.3f} ({b.x1},{b.y1},{b.x2},{b.y2})")
285
-
286
-
287
- if __name__ == "__main__":
288
- main()
 
1
+ """TurboVision crime-detection miner.
2
 
3
+ YOLO11s @ 1280x1280, 6-class detection (balaclava, bat, glove, graffiti, hoodie,
4
+ spray paint), ONNX with end-to-end NMS baked in.
5
+
6
+ Output of weights.onnx: [1, 300, 6] = x1, y1, x2, y2, conf, cls (post-NMS).
7
 
8
  Inference pipeline:
9
+ 1) Primary forward pass on the full image.
10
+ 2) Hflip TTA: forward on horizontally-flipped image, transform boxes back.
11
+ 3) Per-class hard-NMS to merge primary + flip outputs.
12
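+ 4) Cross-class IoU dedup (suppresses the same physical object getting two class labels); runs after the score boost in step 5.
13
+ 5) Consensus-confidence boost: each box kept by NMS takes the max score of the boxes it overlaps (IoU >= iou_thres) across both views.
14
+ 6) Sanity filter (min side/area, max area ratio, max aspect ratio), applied to each pass's detections before they are merged.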
15
+
16
+ Class taxonomy (must match the validator manifest's `objects` list for this element):
17
+ 0 balaclava 1 hoodie 2 glove 3 bat 4 spray paint 5 graffiti
18
  """
 
19
 
 
20
  from pathlib import Path
21
+ import math
22
 
23
  import cv2
24
  import numpy as np
 
42
  keypoints: list[tuple[int, int]]
43
 
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  class Miner:
 
 
 
 
 
 
46
  def __init__(self, path_hf_repo: Path) -> None:
47
+ model_path = path_hf_repo / "weights.onnx"
48
+
49
+ # Validator manifest order (from spec.json `objects`):
50
+ # 0=balaclava 1=hoodie 2=glove 3=bat 4="spray paint" 5=graffiti
51
+ # v5 weights.onnx was trained with this exact order, so cls_remap is identity.
52
+ cn_path = model_path.with_name("class_names.txt")
53
+ if cn_path.is_file():
54
+ self.class_names = [
55
+ ln.strip()
56
+ for ln in cn_path.read_text(encoding="utf-8").splitlines()
57
+ if ln.strip() and not ln.strip().startswith("#")
58
+ ]
59
+ else:
60
+ self.class_names = ["balaclava", "hoodie", "glove", "bat", "spray paint", "graffiti"]
61
+ self.cls_remap = np.arange(len(self.class_names), dtype=np.int32)
62
+
63
+ print("ORT version:", ort.__version__)
64
  try:
65
  ort.preload_dlls()
66
+ print("✅ onnxruntime.preload_dlls() success")
67
+ except Exception as e:
68
+ print(f"⚠️ preload_dlls failed: {e}")
69
+ print("ORT available providers BEFORE session:", ort.get_available_providers())
70
 
71
+ sess_options = ort.SessionOptions()
72
+ sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
73
 
74
  try:
75
  self.session = ort.InferenceSession(
76
  str(model_path),
77
+ sess_options=sess_options,
78
  providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
79
  )
80
+ print("✅ Created ORT session with preferred CUDA provider list")
81
  except Exception as e:
82
+ print(f"⚠️ CUDA session creation failed, falling back to CPU: {e}")
83
  self.session = ort.InferenceSession(
84
  str(model_path),
85
+ sess_options=sess_options,
86
  providers=["CPUExecutionProvider"],
87
  )
88
+ print("ORT session providers:", self.session.get_providers())
89
+
90
+ inp = self.session.get_inputs()[0]
91
+ self.input_name = inp.name
92
+ self.output_names = [o.name for o in self.session.get_outputs()]
93
+ self.input_shape = inp.shape
94
+ self.input_dtype = np.float16 if "float16" in inp.type else np.float32
95
+
96
+ self.input_height = self._safe_dim(self.input_shape[2], default=1280)
97
+ self.input_width = self._safe_dim(self.input_shape[3], default=1280)
98
+
99
+ # Tuning matched to alfred's deployed model — bias toward precision to dodge
100
+ # the false_positive pillar penalty (validator weights FP heavily on this element).
101
+ self.conf_thres = 0.50
102
+ self.iou_thres = 0.4
103
+ self.cross_iou_thresh = 0.7
104
+ self.max_det = 200
105
+ self.use_tta = True
106
+
107
+ # Sanity filter — reject obviously bad boxes
108
+ self.min_box_area = 14 * 14
109
+ self.min_side = 8
110
+ self.max_aspect_ratio = 8.0
111
+ self.max_box_area_ratio = 0.95
112
+
113
+ print(f"✅ ONNX loaded: {model_path}")
114
+ print(f"✅ providers: {self.session.get_providers()}")
115
+ print(f"✅ input: name={self.input_name}, shape={self.input_shape}, dtype={self.input_dtype}")
116
+ print(f"✅ classes: {self.class_names}")
117
+ print(f"✅ config: conf={self.conf_thres}, iou={self.iou_thres}, "
118
+ f"cross_iou={self.cross_iou_thresh}, TTA={self.use_tta}")
119
 
120
  def __repr__(self) -> str:
121
  return (
122
+ f"ONNXRuntime(session={type(self.session).__name__}, "
123
+ f"providers={self.session.get_providers()})"
124
  )
125
 
126
+ @staticmethod
127
+ def _safe_dim(value, default: int) -> int:
128
+ return value if isinstance(value, int) and value > 0 else default
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
+ def _letterbox(
131
+ self,
132
+ image: ndarray,
133
+ new_shape: tuple[int, int],
134
+ color=(114, 114, 114),
135
+ ) -> tuple[ndarray, float, tuple[float, float]]:
136
+ h, w = image.shape[:2]
137
+ new_w, new_h = new_shape
138
+ ratio = min(new_w / w, new_h / h)
139
+ resized_w = int(round(w * ratio))
140
+ resized_h = int(round(h * ratio))
141
+ if (resized_w, resized_h) != (w, h):
142
+ interp = cv2.INTER_CUBIC if ratio > 1.0 else cv2.INTER_LINEAR
143
+ image = cv2.resize(image, (resized_w, resized_h), interpolation=interp)
144
+ dw = (new_w - resized_w) / 2.0
145
+ dh = (new_h - resized_h) / 2.0
146
+ left = int(round(dw - 0.1))
147
+ right = int(round(dw + 0.1))
148
+ top = int(round(dh - 0.1))
149
+ bottom = int(round(dh + 0.1))
150
+ padded = cv2.copyMakeBorder(
151
+ image, top, bottom, left, right,
152
+ borderType=cv2.BORDER_CONSTANT, value=color,
153
+ )
154
+ return padded, ratio, (dw, dh)
155
+
156
+ def _preprocess(self, image: ndarray):
157
+ orig_h, orig_w = image.shape[:2]
158
+ img, ratio, pad = self._letterbox(image, (self.input_width, self.input_height))
159
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
160
+ img = img.astype(self.input_dtype) / 255.0
161
+ img = np.transpose(img, (2, 0, 1))[None, ...]
162
+ img = np.ascontiguousarray(img)
163
+ return img, ratio, pad, (orig_w, orig_h)
164
+
165
+ @staticmethod
166
+ def _clip_boxes(boxes: np.ndarray, image_size: tuple[int, int]) -> np.ndarray:
167
+ w, h = image_size
168
  boxes[:, 0] = np.clip(boxes[:, 0], 0, w - 1)
169
  boxes[:, 1] = np.clip(boxes[:, 1], 0, h - 1)
170
  boxes[:, 2] = np.clip(boxes[:, 2], 0, w - 1)
171
  boxes[:, 3] = np.clip(boxes[:, 3], 0, h - 1)
172
+ return boxes
 
173
 
174
+ def _filter_sane_boxes(
175
+ self,
176
+ boxes: np.ndarray,
177
+ scores: np.ndarray,
178
+ cls_ids: np.ndarray,
179
+ orig_size: tuple[int, int],
180
+ ):
181
+ if len(boxes) == 0:
182
+ return boxes, scores, cls_ids
183
+ orig_w, orig_h = orig_size
184
+ image_area = float(orig_w * orig_h)
185
+ keep = []
186
+ for i, box in enumerate(boxes):
187
+ x1, y1, x2, y2 = box.tolist()
188
+ bw = x2 - x1
189
+ bh = y2 - y1
190
+ if bw <= 0 or bh <= 0:
191
  continue
192
+ if bw < self.min_side or bh < self.min_side:
193
+ continue
194
+ area = bw * bh
195
+ if area < self.min_box_area:
196
+ continue
197
+ if area > self.max_box_area_ratio * image_area:
198
+ continue
199
+ ar = max(bw / max(bh, 1e-6), bh / max(bw, 1e-6))
200
+ if ar > self.max_aspect_ratio:
201
+ continue
202
+ keep.append(i)
203
+ if not keep:
204
+ return (
205
+ np.empty((0, 4), dtype=np.float32),
206
+ np.empty((0,), dtype=np.float32),
207
+ np.empty((0,), dtype=np.int32),
208
+ )
209
+ k = np.array(keep, dtype=np.intp)
210
+ return boxes[k], scores[k], cls_ids[k]
211
+
212
+ @staticmethod
213
+ def _hard_nms(
214
+ boxes: np.ndarray,
215
+ scores: np.ndarray,
216
+ iou_thresh: float,
217
+ ) -> np.ndarray:
218
+ N = len(boxes)
219
+ if N == 0:
220
+ return np.array([], dtype=np.intp)
221
+ boxes = np.asarray(boxes, dtype=np.float32)
222
+ scores = np.asarray(scores, dtype=np.float32)
223
+ order = np.argsort(scores)[::-1]
224
+ keep: list[int] = []
225
+ suppressed = np.zeros(N, dtype=bool)
226
+ for i in range(N):
227
+ idx = order[i]
228
+ if suppressed[idx]:
229
+ continue
230
+ keep.append(int(idx))
231
+ bi = boxes[idx]
232
+ for k in range(i + 1, N):
233
+ jdx = order[k]
234
+ if suppressed[jdx]:
235
+ continue
236
+ bj = boxes[jdx]
237
+ xx1 = max(bi[0], bj[0])
238
+ yy1 = max(bi[1], bj[1])
239
+ xx2 = min(bi[2], bj[2])
240
+ yy2 = min(bi[3], bj[3])
241
+ inter = max(0.0, xx2 - xx1) * max(0.0, yy2 - yy1)
242
+ area_i = (bi[2] - bi[0]) * (bi[3] - bi[1])
243
+ area_j = (bj[2] - bj[0]) * (bj[3] - bj[1])
244
+ iou = inter / (area_i + area_j - inter + 1e-7)
245
+ if iou > iou_thresh:
246
+ suppressed[jdx] = True
247
+ return np.array(keep, dtype=np.intp)
248
+
249
+ def _per_class_hard_nms(
250
+ self,
251
+ boxes: np.ndarray,
252
+ scores: np.ndarray,
253
+ cls_ids: np.ndarray,
254
+ iou_thresh: float,
255
+ ) -> np.ndarray:
256
+ if len(boxes) == 0:
257
+ return np.array([], dtype=np.intp)
258
+ all_keep: list[int] = []
259
+ for c in np.unique(cls_ids):
260
+ mask = cls_ids == c
261
+ indices = np.where(mask)[0]
262
+ keep = self._hard_nms(boxes[mask], scores[mask], iou_thresh)
263
+ all_keep.extend(indices[keep].tolist())
264
+ all_keep.sort()
265
+ return np.array(all_keep, dtype=np.intp)
266
+
267
+ @staticmethod
268
+ def _cross_class_dedup(
269
+ boxes: np.ndarray,
270
+ scores: np.ndarray,
271
+ cls_ids: np.ndarray,
272
+ iou_thresh: float,
273
+ ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
274
+ n = len(boxes)
275
+ if n <= 1:
276
+ return boxes, scores, cls_ids
277
+ boxes = np.asarray(boxes, dtype=np.float32)
278
+ scores = np.asarray(scores, dtype=np.float32)
279
+ cls_ids = np.asarray(cls_ids, dtype=np.int32)
280
+ areas = np.maximum(0.0, boxes[:, 2] - boxes[:, 0]) * np.maximum(
281
+ 0.0, boxes[:, 3] - boxes[:, 1]
282
+ )
283
+ # Keep larger boxes first, then higher score.
284
+ order = np.lexsort((-scores, -areas))
285
+ suppressed = np.zeros(n, dtype=bool)
286
+ keep: list[int] = []
287
+ for i in order:
288
+ if suppressed[i]:
289
+ continue
290
+ keep.append(int(i))
291
+ bi = boxes[i]
292
+ xx1 = np.maximum(bi[0], boxes[:, 0])
293
+ yy1 = np.maximum(bi[1], boxes[:, 1])
294
+ xx2 = np.minimum(bi[2], boxes[:, 2])
295
+ yy2 = np.minimum(bi[3], boxes[:, 3])
296
+ inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
297
+ area_i = max(1e-7, float((bi[2] - bi[0]) * (bi[3] - bi[1])))
298
+ union = area_i + areas - inter + 1e-7
299
+ iou = inter / union
300
+ dup = iou > iou_thresh
301
+ dup[i] = False
302
+ suppressed |= dup
303
+ keep_idx = np.array(keep, dtype=np.intp)
304
+ return boxes[keep_idx], scores[keep_idx], cls_ids[keep_idx]
305
+
306
+ @staticmethod
307
+ def _max_score_per_cluster(
308
+ coords: np.ndarray,
309
+ scores: np.ndarray,
310
+ keep_indices: np.ndarray,
311
+ iou_thresh: float,
312
+ ) -> np.ndarray:
313
+ n_keep = len(keep_indices)
314
+ if n_keep == 0:
315
+ return np.array([], dtype=np.float32)
316
+ coords = np.asarray(coords, dtype=np.float32)
317
+ scores = np.asarray(scores, dtype=np.float32)
318
+ out = np.empty(n_keep, dtype=np.float32)
319
+ for i in range(n_keep):
320
+ idx = keep_indices[i]
321
+ bi = coords[idx]
322
+ xx1 = np.maximum(bi[0], coords[:, 0])
323
+ yy1 = np.maximum(bi[1], coords[:, 1])
324
+ xx2 = np.minimum(bi[2], coords[:, 2])
325
+ yy2 = np.minimum(bi[3], coords[:, 3])
326
+ inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
327
+ area_i = (bi[2] - bi[0]) * (bi[3] - bi[1])
328
+ areas_j = (coords[:, 2] - coords[:, 0]) * (coords[:, 3] - coords[:, 1])
329
+ iou = inter / (area_i + areas_j - inter + 1e-7)
330
+ in_cluster = iou >= iou_thresh
331
+ out[i] = float(np.max(scores[in_cluster]))
332
+ return out
333
+
334
+ def _decode_raw_dets(
335
+ self,
336
+ preds: np.ndarray,
337
+ ratio: float,
338
+ pad: tuple[float, float],
339
+ orig_size: tuple[int, int],
340
+ ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
341
+ """Decode end2end NMS output and return (boxes, scores, cls_ids)
342
+ in original image coordinates, after conf-threshold + remap + letterbox-reverse + sanity."""
343
+ if preds.ndim == 3 and preds.shape[0] == 1:
344
+ preds = preds[0]
345
+ if preds.ndim != 2 or preds.shape[1] < 6:
346
+ raise ValueError(f"Unexpected ONNX output shape: {preds.shape}")
347
+
348
+ boxes = preds[:, :4].astype(np.float32)
349
+ scores = preds[:, 4].astype(np.float32)
350
+ cls_ids = preds[:, 5].astype(np.int32)
351
+
352
+ valid = (cls_ids >= 0) & (cls_ids < len(self.cls_remap))
353
+ boxes, scores, cls_ids = boxes[valid], scores[valid], cls_ids[valid]
354
+ cls_ids = self.cls_remap[cls_ids]
355
+
356
+ keep = scores >= self.conf_thres
357
+ boxes = boxes[keep]
358
+ scores = scores[keep]
359
+ cls_ids = cls_ids[keep]
360
+ if len(boxes) == 0:
361
+ return (
362
+ np.empty((0, 4), dtype=np.float32),
363
+ np.empty((0,), dtype=np.float32),
364
+ np.empty((0,), dtype=np.int32),
365
+ )
366
+
367
+ pad_w, pad_h = pad
368
+ orig_w, orig_h = orig_size
369
+ boxes[:, [0, 2]] -= pad_w
370
+ boxes[:, [1, 3]] -= pad_h
371
+ boxes /= ratio
372
+ boxes = self._clip_boxes(boxes, (orig_w, orig_h))
373
+
374
+ boxes, scores, cls_ids = self._filter_sane_boxes(boxes, scores, cls_ids, orig_size)
375
+ return boxes, scores, cls_ids
376
+
377
+ def _forward(
378
+ self, image: np.ndarray
379
+ ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
380
+ x, ratio, pad, orig_size = self._preprocess(image)
381
+ out = self.session.run(self.output_names, {self.input_name: x})[0]
382
+ return self._decode_raw_dets(out, ratio, pad, orig_size)
383
+
384
+ def _predict_single(self, image: np.ndarray) -> list[BoundingBox]:
385
+ boxes, scores, cls_ids = self._forward(image)
386
+ if len(boxes) == 0:
387
+ return []
388
+ return self._build_results(boxes, scores, cls_ids)
389
+
390
+ def _predict_tta(self, image: np.ndarray) -> list[BoundingBox]:
391
+ """Hflip TTA: merge primary + flipped via per-class hard-NMS,
392
+ then cross-class dedup, with consensus-confidence boost."""
393
+ ow = image.shape[1]
394
+ b1, s1, c1 = self._forward(image)
395
 
 
 
 
396
  flipped = cv2.flip(image, 1)
397
+ b2, s2, c2 = self._forward(flipped)
398
+ if len(b2):
399
+ x1f = ow - b2[:, 2]
400
+ x2f = ow - b2[:, 0]
401
+ b2 = np.stack([x1f, b2[:, 1], x2f, b2[:, 3]], axis=1)
402
+
403
+ if len(b1) == 0 and len(b2) == 0:
404
+ return []
405
+
406
+ boxes = np.concatenate([b1, b2], axis=0) if len(b2) else b1
407
+ scores = np.concatenate([s1, s2], axis=0) if len(b2) else s1
408
+ cls_ids = np.concatenate([c1, c2], axis=0) if len(b2) else c1
409
+
410
+ keep = self._per_class_hard_nms(boxes, scores, cls_ids, self.iou_thres)
411
+ if len(keep) == 0:
412
  return []
413
+ keep = keep[: self.max_det]
414
 
415
+ # Consensus-confidence boost: cluster by IoU and take max score.
416
+ boosted = self._max_score_per_cluster(boxes, scores, keep, self.iou_thres)
 
417
 
418
+ boxes = boxes[keep]
419
+ cls_ids = cls_ids[keep]
420
+ scores = boosted
421
+
422
+ boxes, scores, cls_ids = self._cross_class_dedup(
423
+ boxes, scores, cls_ids, self.cross_iou_thresh
424
+ )
425
+ if len(boxes) == 0:
426
+ return []
427
+
428
+ return self._build_results(boxes, scores, cls_ids)
429
+
430
+ def _build_results(
431
+ self, boxes: np.ndarray, scores: np.ndarray, cls_ids: np.ndarray
432
+ ) -> list[BoundingBox]:
433
+ results: list[BoundingBox] = []
434
+ for box, conf, cls_id in zip(boxes, scores, cls_ids):
435
+ x1, y1, x2, y2 = box.tolist()
436
+ if x2 <= x1 or y2 <= y1:
437
+ continue
438
+ results.append(
439
+ BoundingBox(
440
+ x1=int(math.floor(x1)),
441
+ y1=int(math.floor(y1)),
442
+ x2=int(math.ceil(x2)),
443
+ y2=int(math.ceil(y2)),
444
+ cls_id=int(cls_id),
445
+ conf=float(conf),
446
+ )
447
  )
448
+ return results
 
 
 
 
449
 
450
  def predict_batch(
451
  self,
 
453
  offset: int,
454
  n_keypoints: int,
455
  ) -> list[TVFrameResult]:
 
 
 
456
  results: list[TVFrameResult] = []
457
+ for frame_number_in_batch, image in enumerate(batch_images):
458
+ if image is None or not isinstance(image, np.ndarray) or image.ndim != 3:
459
+ results.append(
460
+ TVFrameResult(
461
+ frame_id=offset + frame_number_in_batch,
462
+ boxes=[],
463
+ keypoints=[(0, 0) for _ in range(max(0, int(n_keypoints)))],
464
+ )
465
+ )
466
+ continue
467
+ if image.dtype != np.uint8:
468
+ image = image.astype(np.uint8)
469
  try:
470
+ if self.use_tta:
471
+ boxes = self._predict_tta(image)
472
+ else:
473
+ boxes = self._predict_single(image)
 
 
474
  except Exception as e:
475
+ print(f"⚠️ Inference failed for frame {offset + frame_number_in_batch}: {e}")
476
  boxes = []
477
+ results.append(
478
+ TVFrameResult(
479
+ frame_id=offset + frame_number_in_batch,
480
+ boxes=boxes,
481
+ keypoints=[(0, 0) for _ in range(max(0, int(n_keypoints)))],
482
+ )
483
+ )
484
  return results