aitask1024 committed on
Commit
e1051be
·
verified ·
1 Parent(s): db7883f

scorevision: push artifact

Browse files
Files changed (1) hide show
  1. miner.py +522 -192
miner.py CHANGED
@@ -22,11 +22,41 @@ class TVFrameResult(BaseModel):
22
  boxes: list[BoundingBox]
23
  keypoints: list[tuple[int, int]]
24
 
 
 
 
25
 
26
  class Miner:
27
  def __init__(self, path_hf_repo: Path) -> None:
28
  model_path = path_hf_repo / "weights.onnx"
29
- self.class_names = ["person"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  sess_options = ort.SessionOptions()
32
  sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
@@ -37,71 +67,120 @@ class Miner:
37
  sess_options=sess_options,
38
  providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
39
  )
40
- except Exception:
 
 
41
  self.session = ort.InferenceSession(
42
  str(model_path),
43
  sess_options=sess_options,
44
  providers=["CPUExecutionProvider"],
45
  )
46
 
 
 
 
 
 
 
 
 
47
  self.input_name = self.session.get_inputs()[0].name
48
- self.output_names = [o.name for o in self.session.get_outputs()]
49
  self.input_shape = self.session.get_inputs()[0].shape
50
- self.input_height = self._safe_dim(self.input_shape[2], 1280)
51
- self.input_width = self._safe_dim(self.input_shape[3], 1280)
52
-
53
- # Tuned for MAP50 (65%) + FALSE_POSITIVE (35%) scoring
54
- # Lower conf = more recall = higher MAP50, but more FP
55
- # Balance: slightly aggressive recall since MAP50 weight > FP weight
56
- self.conf_thres = 0.40
57
- self.conf_high = 0.55
58
- self.iou_thres = 0.50
59
- self.tta_match_iou = 0.45
60
- self.max_det = 200
61
- self.use_tta = True
62
 
63
- # Box sanity filters
64
- self.min_box_area = 12 * 12
65
- self.min_w = 6
66
- self.min_h = 6
67
- self.max_aspect_ratio = 7.0
68
- self.max_box_area_ratio = 0.85
69
 
70
- print(f"Model loaded: {model_path}, providers={self.session.get_providers()}")
 
 
 
 
 
 
 
71
 
72
  def __repr__(self) -> str:
73
- return f"ONNXRuntime(providers={self.session.get_providers()})"
 
 
 
74
 
75
  @staticmethod
76
  def _safe_dim(value, default: int) -> int:
77
  return value if isinstance(value, int) and value > 0 else default
78
 
79
- def _letterbox(self, image: ndarray, new_shape: tuple[int, int],
80
- color=(114, 114, 114)) -> tuple[ndarray, float, tuple[float, float]]:
 
 
 
 
 
 
 
 
 
 
 
81
  h, w = image.shape[:2]
82
  new_w, new_h = new_shape
 
83
  ratio = min(new_w / w, new_h / h)
84
- rw, rh = int(round(w * ratio)), int(round(h * ratio))
85
- if (rw, rh) != (w, h):
 
 
86
  interp = cv2.INTER_CUBIC if ratio > 1.0 else cv2.INTER_LINEAR
87
- image = cv2.resize(image, (rw, rh), interpolation=interp)
88
- dw, dh = (new_w - rw) / 2.0, (new_h - rh) / 2.0
 
 
 
 
 
 
 
 
 
 
89
  padded = cv2.copyMakeBorder(
90
- image, int(round(dh - 0.1)), int(round(dh + 0.1)),
91
- int(round(dw - 0.1)), int(round(dw + 0.1)),
92
- borderType=cv2.BORDER_CONSTANT, value=color)
 
 
 
 
 
93
  return padded, ratio, (dw, dh)
94
 
95
- def _preprocess(self, image: ndarray) -> tuple[np.ndarray, float, tuple[float, float], tuple[int, int]]:
 
 
 
 
 
 
 
 
 
 
96
  orig_h, orig_w = image.shape[:2]
97
- img, ratio, pad = self._letterbox(image, (self.input_width, self.input_height))
98
- img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
99
- img = np.ascontiguousarray(np.transpose(img, (2, 0, 1))[None, ...], dtype=np.float32)
 
 
 
 
 
 
100
  return img, ratio, pad, (orig_w, orig_h)
101
 
102
  @staticmethod
103
- def _clip_boxes(boxes: np.ndarray, size: tuple[int, int]) -> np.ndarray:
104
- w, h = size
105
  boxes[:, 0] = np.clip(boxes[:, 0], 0, w - 1)
106
  boxes[:, 1] = np.clip(boxes[:, 1], 0, h - 1)
107
  boxes[:, 2] = np.clip(boxes[:, 2], 0, w - 1)
@@ -109,206 +188,457 @@ class Miner:
109
  return boxes
110
 
111
  @staticmethod
112
- def _xywh_to_xyxy(b: np.ndarray) -> np.ndarray:
113
- o = np.empty_like(b)
114
- o[:, 0] = b[:, 0] - b[:, 2] / 2
115
- o[:, 1] = b[:, 1] - b[:, 3] / 2
116
- o[:, 2] = b[:, 0] + b[:, 2] / 2
117
- o[:, 3] = b[:, 1] + b[:, 3] / 2
118
- return o
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
  @staticmethod
121
- def _hard_nms(boxes: np.ndarray, scores: np.ndarray, iou_thresh: float) -> np.ndarray:
122
- if len(boxes) == 0:
 
 
 
 
 
 
 
 
 
123
  return np.array([], dtype=np.intp)
 
 
124
  order = np.argsort(scores)[::-1]
125
- keep = []
126
- while len(order) > 0:
127
- i = order[0]
128
- keep.append(i)
129
- if len(order) == 1:
130
- break
131
- rest = order[1:]
132
- xx1 = np.maximum(boxes[i, 0], boxes[rest, 0])
133
- yy1 = np.maximum(boxes[i, 1], boxes[rest, 1])
134
- xx2 = np.minimum(boxes[i, 2], boxes[rest, 2])
135
- yy2 = np.minimum(boxes[i, 3], boxes[rest, 3])
136
- inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
137
- area_i = max(0, (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1]))
138
- area_r = np.maximum(0, boxes[rest, 2] - boxes[rest, 0]) * np.maximum(0, boxes[rest, 3] - boxes[rest, 1])
139
- iou = inter / (area_i + area_r - inter + 1e-7)
140
- order = rest[iou <= iou_thresh]
141
- return np.array(keep, dtype=np.intp)
 
 
 
 
 
 
 
142
 
143
  @staticmethod
144
- def _box_iou_one_to_many(box: np.ndarray, boxes: np.ndarray) -> np.ndarray:
145
- xx1 = np.maximum(box[0], boxes[:, 0])
146
- yy1 = np.maximum(box[1], boxes[:, 1])
147
- xx2 = np.minimum(box[2], boxes[:, 2])
148
- yy2 = np.minimum(box[3], boxes[:, 3])
149
- inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
150
- a = max(0, (box[2] - box[0]) * (box[3] - box[1]))
151
- b = np.maximum(0, boxes[:, 2] - boxes[:, 0]) * np.maximum(0, boxes[:, 3] - boxes[:, 1])
152
- return inter / (a + b - inter + 1e-7)
153
-
154
- def _filter_sane(self, boxes, scores, cls_ids, orig_size):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  if len(boxes) == 0:
156
- return boxes, scores, cls_ids
157
- ow, oh = orig_size
158
- area_img = float(ow * oh)
159
- keep = []
160
- for i, box in enumerate(boxes):
161
- bw, bh = box[2] - box[0], box[3] - box[1]
162
- if bw <= 0 or bh <= 0 or bw < self.min_w or bh < self.min_h:
163
- continue
164
- area = bw * bh
165
- if area < self.min_box_area or area > self.max_box_area_ratio * area_img:
166
- continue
167
- if max(bw / max(bh, 1e-6), bh / max(bw, 1e-6)) > self.max_aspect_ratio:
 
 
 
 
 
 
 
 
 
168
  continue
169
- keep.append(i)
170
- if not keep:
171
- return np.empty((0, 4), dtype=np.float32), np.empty(0, dtype=np.float32), np.empty(0, dtype=np.int32)
172
- k = np.array(keep, dtype=np.intp)
173
- return boxes[k], scores[k], cls_ids[k]
174
 
175
- def _decode_raw_yolo(self, preds, ratio, pad, orig_size):
176
- if preds.ndim == 3 and preds.shape[0] == 1:
177
- preds = preds[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  if preds.shape[0] <= 16 and preds.shape[1] > preds.shape[0]:
179
  preds = preds.T
180
 
 
 
 
181
  boxes_xywh = preds[:, :4].astype(np.float32)
182
- tail = preds[:, 4:]
183
 
184
- if tail.shape[1] == 1:
185
- scores = tail[:, 0]
186
  cls_ids = np.zeros(len(scores), dtype=np.int32)
187
  else:
188
- cls_ids = np.argmax(tail, axis=1).astype(np.int32)
189
- scores = tail[np.arange(len(tail)), cls_ids]
 
 
 
 
 
 
190
 
191
- # person only (class 0)
192
- mask = (cls_ids == 0) & (scores >= self.conf_thres)
193
- boxes_xywh, scores, cls_ids = boxes_xywh[mask], scores[mask], cls_ids[mask]
194
  if len(boxes_xywh) == 0:
195
  return []
196
 
197
  boxes = self._xywh_to_xyxy(boxes_xywh)
198
- boxes[:, [0, 2]] -= pad[0]
199
- boxes[:, [1, 3]] -= pad[1]
200
- boxes /= ratio
201
- boxes = self._clip_boxes(boxes, orig_size)
202
- boxes, scores, cls_ids = self._filter_sane(boxes, scores, cls_ids, orig_size)
203
- if len(boxes) == 0:
204
- return []
205
 
206
- keep = self._hard_nms(boxes, scores, self.iou_thres)[:self.max_det]
207
- return [BoundingBox(
208
- x1=int(math.floor(boxes[i, 0])), y1=int(math.floor(boxes[i, 1])),
209
- x2=int(math.ceil(boxes[i, 2])), y2=int(math.ceil(boxes[i, 3])),
210
- cls_id=0, conf=float(scores[i]))
211
- for i in keep if boxes[i, 2] > boxes[i, 0] and boxes[i, 3] > boxes[i, 1]]
212
-
213
- def _decode_final_dets(self, preds, ratio, pad, orig_size):
214
- if preds.ndim == 3 and preds.shape[0] == 1:
215
- preds = preds[0]
216
- boxes = preds[:, :4].astype(np.float32)
217
- scores = preds[:, 4].astype(np.float32)
218
- cls_ids = preds[:, 5].astype(np.int32)
219
 
220
- mask = (cls_ids == 0) & (scores >= self.conf_thres)
221
- boxes, scores, cls_ids = boxes[mask], scores[mask], cls_ids[mask]
222
- if len(boxes) == 0:
223
- return []
224
 
225
- boxes[:, [0, 2]] -= pad[0]
226
- boxes[:, [1, 3]] -= pad[1]
227
  boxes /= ratio
228
- boxes = self._clip_boxes(boxes, orig_size)
229
- boxes, scores, cls_ids = self._filter_sane(boxes, scores, cls_ids, orig_size)
230
- if len(boxes) == 0:
231
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
233
- keep = self._hard_nms(boxes, scores, self.iou_thres)[:self.max_det]
234
- return [BoundingBox(
235
- x1=int(math.floor(boxes[i, 0])), y1=int(math.floor(boxes[i, 1])),
236
- x2=int(math.ceil(boxes[i, 2])), y2=int(math.ceil(boxes[i, 3])),
237
- cls_id=0, conf=float(scores[i]))
238
- for i in keep if boxes[i, 2] > boxes[i, 0] and boxes[i, 3] > boxes[i, 1]]
239
 
240
- def _postprocess(self, output, ratio, pad, orig_size):
 
 
 
 
 
 
 
 
 
 
 
241
  if output.ndim == 2 and output.shape[1] >= 6:
242
  return self._decode_final_dets(output, ratio, pad, orig_size)
243
- if output.ndim == 3 and output.shape[0] == 1 and output.shape[2] >= 6:
 
 
244
  return self._decode_final_dets(output, ratio, pad, orig_size)
 
 
245
  return self._decode_raw_yolo(output, ratio, pad, orig_size)
246
 
247
  def _predict_single(self, image: np.ndarray) -> list[BoundingBox]:
 
 
 
 
 
 
 
 
 
 
 
248
  if image.dtype != np.uint8:
249
  image = image.astype(np.uint8)
250
- tensor, ratio, pad, orig_size = self._preprocess(image)
251
- outputs = self.session.run(self.output_names, {self.input_name: tensor})
252
- return self._postprocess(outputs[0], ratio, pad, orig_size)
253
 
254
- def _merge_tta(self, boxes_orig, boxes_flip):
255
- if not boxes_orig and not boxes_flip:
256
- return []
257
 
258
- co = np.array([[b.x1, b.y1, b.x2, b.y2] for b in boxes_orig], dtype=np.float32) if boxes_orig else np.empty((0, 4), dtype=np.float32)
259
- so = np.array([b.conf for b in boxes_orig], dtype=np.float32) if boxes_orig else np.empty(0, dtype=np.float32)
260
- cf = np.array([[b.x1, b.y1, b.x2, b.y2] for b in boxes_flip], dtype=np.float32) if boxes_flip else np.empty((0, 4), dtype=np.float32)
261
- sf = np.array([b.conf for b in boxes_flip], dtype=np.float32) if boxes_flip else np.empty(0, dtype=np.float32)
 
262
 
263
- acc_b, acc_s = [], []
 
 
264
 
265
- for i in range(len(co)):
266
- if so[i] >= self.conf_high:
267
- acc_b.append(co[i]); acc_s.append(so[i])
268
- elif len(cf) > 0:
269
- ious = self._box_iou_one_to_many(co[i], cf)
270
- j = int(np.argmax(ious))
271
- if ious[j] >= self.tta_match_iou:
272
- acc_b.append(co[i]); acc_s.append(max(so[i], sf[j]))
273
 
274
- for i in range(len(cf)):
275
- if sf[i] < self.conf_high:
276
- continue
277
- if len(co) == 0:
278
- acc_b.append(cf[i]); acc_s.append(sf[i]); continue
279
- if np.max(self._box_iou_one_to_many(cf[i], co)) < self.tta_match_iou:
280
- acc_b.append(cf[i]); acc_s.append(sf[i])
281
 
282
- if not acc_b:
283
- return []
 
 
 
 
 
 
284
 
285
- boxes = np.array(acc_b, dtype=np.float32)
286
- scores = np.array(acc_s, dtype=np.float32)
287
- keep = self._hard_nms(boxes, scores, self.iou_thres)[:self.max_det]
288
 
289
- return [BoundingBox(
290
- x1=int(math.floor(boxes[i, 0])), y1=int(math.floor(boxes[i, 1])),
291
- x2=int(math.ceil(boxes[i, 2])), y2=int(math.ceil(boxes[i, 3])),
292
- cls_id=0, conf=float(scores[i])) for i in keep]
293
 
294
- def _predict_tta(self, image: np.ndarray) -> list[BoundingBox]:
295
- boxes_orig = self._predict_single(image)
296
- flipped = cv2.flip(image, 1)
297
- boxes_flip_raw = self._predict_single(flipped)
298
- w = image.shape[1]
299
- boxes_flip = [BoundingBox(x1=w - b.x2, y1=b.y1, x2=w - b.x1, y2=b.y2,
300
- cls_id=b.cls_id, conf=b.conf) for b in boxes_flip_raw]
301
- return self._merge_tta(boxes_orig, boxes_flip)
302
 
303
- def predict_batch(self, batch_images: list[ndarray], offset: int, n_keypoints: int) -> list[TVFrameResult]:
304
- results = []
305
- for i, image in enumerate(batch_images):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
  try:
307
- boxes = self._predict_tta(image) if self.use_tta else self._predict_single(image)
 
 
 
308
  except Exception as e:
309
- print(f"Inference failed frame {offset + i}: {e}")
310
  boxes = []
311
- results.append(TVFrameResult(
312
- frame_id=offset + i, boxes=boxes,
313
- keypoints=[(0, 0) for _ in range(max(0, int(n_keypoints)))]))
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  boxes: list[BoundingBox]
23
  keypoints: list[tuple[int, int]]
24
 
25
# Fallback side length (pixels) for the model input; used as the default when
# the ONNX input shape does not report a positive integer height/width.
SIZE = 1280
# Target class labels; a detection's emitted cls_id is an index into this list.
TARGET_CLASS_NAMES = ["petrol hose", "petrol pump", "price board", "roof canopy"]
27
+
28
 
29
  class Miner:
30
  def __init__(self, path_hf_repo: Path) -> None:
31
  model_path = path_hf_repo / "weights.onnx"
32
+ cn_path = model_path.with_name("class_names.txt")
33
+ self.class_names = TARGET_CLASS_NAMES.copy()
34
+ if cn_path.is_file():
35
+ lines = cn_path.read_text(encoding="utf-8").splitlines()
36
+ model_class_order = [
37
+ ln.strip()
38
+ for ln in lines
39
+ if ln.strip() and not ln.strip().startswith("#")
40
+ ]
41
+ if len(model_class_order) == len(self.class_names) and set(model_class_order) == set(self.class_names):
42
+ self.cls_remap = np.array(
43
+ [self.class_names.index(n) for n in model_class_order], dtype=np.int32
44
+ )
45
+ else:
46
+ # If class_names.txt is missing/invalid for this target order, keep identity mapping.
47
+ self.cls_remap = np.arange(len(self.class_names), dtype=np.int32)
48
+ else:
49
+ # Fallback when no class_names.txt is present: assume ONNX class order == target order.
50
+ self.cls_remap = np.arange(len(self.class_names), dtype=np.int32)
51
+ print("ORT version:", ort.__version__)
52
+
53
+ try:
54
+ ort.preload_dlls()
55
+ print("✅ onnxruntime.preload_dlls() success")
56
+ except Exception as e:
57
+ print(f"⚠️ preload_dlls failed: {e}")
58
+
59
+ print("ORT available providers BEFORE session:", ort.get_available_providers())
60
 
61
  sess_options = ort.SessionOptions()
62
  sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
 
67
  sess_options=sess_options,
68
  providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
69
  )
70
+ print("✅ Created ORT session with preferred CUDA provider list")
71
+ except Exception as e:
72
+ print(f"⚠️ CUDA session creation failed, falling back to CPU: {e}")
73
  self.session = ort.InferenceSession(
74
  str(model_path),
75
  sess_options=sess_options,
76
  providers=["CPUExecutionProvider"],
77
  )
78
 
79
+ print("ORT session providers:", self.session.get_providers())
80
+
81
+ for inp in self.session.get_inputs():
82
+ print("INPUT:", inp.name, inp.shape, inp.type)
83
+
84
+ for out in self.session.get_outputs():
85
+ print("OUTPUT:", out.name, out.shape, out.type)
86
+
87
  self.input_name = self.session.get_inputs()[0].name
88
+ self.output_names = [output.name for output in self.session.get_outputs()]
89
  self.input_shape = self.session.get_inputs()[0].shape
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
+ self.input_height = self._safe_dim(self.input_shape[2], default=SIZE)
92
+ self.input_width = self._safe_dim(self.input_shape[3], default=SIZE)
 
 
 
 
93
 
94
+ self.conf_thres = 0.5
95
+ self.iou_thres = 0.45
96
+ self.max_det = 30
97
+ self.use_tta = True
98
+
99
+ print(f"✅ ONNX model loaded from: {model_path}")
100
+ print(f"✅ ONNX providers: {self.session.get_providers()}")
101
+ print(f"✅ ONNX input: name={self.input_name}, shape={self.input_shape}")
102
 
103
  def __repr__(self) -> str:
104
+ return (
105
+ f"ONNXRuntime(session={type(self.session).__name__}, "
106
+ f"providers={self.session.get_providers()})"
107
+ )
108
 
109
  @staticmethod
110
  def _safe_dim(value, default: int) -> int:
111
  return value if isinstance(value, int) and value > 0 else default
112
 
113
+ def _letterbox(
114
+ self,
115
+ image: ndarray,
116
+ new_shape: tuple[int, int],
117
+ color=(114, 114, 114),
118
+ ) -> tuple[ndarray, float, tuple[float, float]]:
119
+ """
120
+ Resize with unchanged aspect ratio and pad to target shape.
121
+ Returns:
122
+ padded_image,
123
+ ratio,
124
+ (pad_w, pad_h) # half-padding
125
+ """
126
  h, w = image.shape[:2]
127
  new_w, new_h = new_shape
128
+
129
  ratio = min(new_w / w, new_h / h)
130
+ resized_w = int(round(w * ratio))
131
+ resized_h = int(round(h * ratio))
132
+
133
+ if (resized_w, resized_h) != (w, h):
134
  interp = cv2.INTER_CUBIC if ratio > 1.0 else cv2.INTER_LINEAR
135
+ image = cv2.resize(image, (resized_w, resized_h), interpolation=interp)
136
+
137
+ dw = new_w - resized_w
138
+ dh = new_h - resized_h
139
+ dw /= 2.0
140
+ dh /= 2.0
141
+
142
+ left = int(round(dw - 0.1))
143
+ right = int(round(dw + 0.1))
144
+ top = int(round(dh - 0.1))
145
+ bottom = int(round(dh + 0.1))
146
+
147
  padded = cv2.copyMakeBorder(
148
+ image,
149
+ top,
150
+ bottom,
151
+ left,
152
+ right,
153
+ borderType=cv2.BORDER_CONSTANT,
154
+ value=color,
155
+ )
156
  return padded, ratio, (dw, dh)
157
 
158
+ def _preprocess(
159
+ self, image: ndarray
160
+ ) -> tuple[np.ndarray, float, tuple[float, float], tuple[int, int]]:
161
+ """
162
+ Preprocess for fixed-size ONNX export:
163
+ - enhance image quality (CLAHE, denoise, sharpen)
164
+ - letterbox to model input size
165
+ - BGR -> RGB
166
+ - normalize to [0,1]
167
+ - HWC -> NCHW float32
168
+ """
169
  orig_h, orig_w = image.shape[:2]
170
+
171
+ img, ratio, pad = self._letterbox(
172
+ image, (self.input_width, self.input_height)
173
+ )
174
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
175
+ img = img.astype(np.float32) / 255.0
176
+ img = np.transpose(img, (2, 0, 1))[None, ...]
177
+ img = np.ascontiguousarray(img, dtype=np.float32)
178
+
179
  return img, ratio, pad, (orig_w, orig_h)
180
 
181
  @staticmethod
182
+ def _clip_boxes(boxes: np.ndarray, image_size: tuple[int, int]) -> np.ndarray:
183
+ w, h = image_size
184
  boxes[:, 0] = np.clip(boxes[:, 0], 0, w - 1)
185
  boxes[:, 1] = np.clip(boxes[:, 1], 0, h - 1)
186
  boxes[:, 2] = np.clip(boxes[:, 2], 0, w - 1)
 
188
  return boxes
189
 
190
  @staticmethod
191
+ def _xywh_to_xyxy(boxes: np.ndarray) -> np.ndarray:
192
+ out = np.empty_like(boxes)
193
+ out[:, 0] = boxes[:, 0] - boxes[:, 2] / 2.0
194
+ out[:, 1] = boxes[:, 1] - boxes[:, 3] / 2.0
195
+ out[:, 2] = boxes[:, 0] + boxes[:, 2] / 2.0
196
+ out[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0
197
+ return out
198
+
199
+ def _soft_nms(
200
+ self,
201
+ boxes: np.ndarray,
202
+ scores: np.ndarray,
203
+ sigma: float = 0.5,
204
+ score_thresh: float = 0.01,
205
+ ) -> tuple[np.ndarray, np.ndarray]:
206
+ """
207
+ Soft-NMS: Gaussian decay of overlapping scores instead of hard removal.
208
+ Returns (kept_original_indices, updated_scores).
209
+ """
210
+ N = len(boxes)
211
+ if N == 0:
212
+ return np.array([], dtype=np.intp), np.array([], dtype=np.float32)
213
+
214
+ boxes = boxes.astype(np.float32, copy=True)
215
+ scores = scores.astype(np.float32, copy=True)
216
+ order = np.arange(N)
217
+
218
+ for i in range(N):
219
+ max_pos = i + int(np.argmax(scores[i:]))
220
+ boxes[[i, max_pos]] = boxes[[max_pos, i]]
221
+ scores[[i, max_pos]] = scores[[max_pos, i]]
222
+ order[[i, max_pos]] = order[[max_pos, i]]
223
+
224
+ if i + 1 >= N:
225
+ break
226
+
227
+ xx1 = np.maximum(boxes[i, 0], boxes[i + 1:, 0])
228
+ yy1 = np.maximum(boxes[i, 1], boxes[i + 1:, 1])
229
+ xx2 = np.minimum(boxes[i, 2], boxes[i + 1:, 2])
230
+ yy2 = np.minimum(boxes[i, 3], boxes[i + 1:, 3])
231
+ inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
232
+
233
+ area_i = max(0.0, float(
234
+ (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
235
+ ))
236
+ areas_j = (
237
+ np.maximum(0.0, boxes[i + 1:, 2] - boxes[i + 1:, 0])
238
+ * np.maximum(0.0, boxes[i + 1:, 3] - boxes[i + 1:, 1])
239
+ )
240
+ iou = inter / (area_i + areas_j - inter + 1e-7)
241
+ scores[i + 1:] *= np.exp(-(iou ** 2) / sigma)
242
+
243
+ mask = scores > score_thresh
244
+ return order[mask], scores[mask]
245
 
246
  @staticmethod
247
+ def _hard_nms(
248
+ boxes: np.ndarray,
249
+ scores: np.ndarray,
250
+ iou_thresh: float,
251
+ ) -> np.ndarray:
252
+ """
253
+ Standard NMS: keep one box per overlapping cluster (the one with highest score).
254
+ Returns indices of kept boxes (into the boxes/scores arrays).
255
+ """
256
+ N = len(boxes)
257
+ if N == 0:
258
  return np.array([], dtype=np.intp)
259
+ boxes = np.asarray(boxes, dtype=np.float32)
260
+ scores = np.asarray(scores, dtype=np.float32)
261
  order = np.argsort(scores)[::-1]
262
+ keep: list[int] = []
263
+ suppressed = np.zeros(N, dtype=bool)
264
+ for i in range(N):
265
+ idx = order[i]
266
+ if suppressed[idx]:
267
+ continue
268
+ keep.append(idx)
269
+ bi = boxes[idx]
270
+ for k in range(i + 1, N):
271
+ jdx = order[k]
272
+ if suppressed[jdx]:
273
+ continue
274
+ bj = boxes[jdx]
275
+ xx1 = max(bi[0], bj[0])
276
+ yy1 = max(bi[1], bj[1])
277
+ xx2 = min(bi[2], bj[2])
278
+ yy2 = min(bi[3], bj[3])
279
+ inter = max(0.0, xx2 - xx1) * max(0.0, yy2 - yy1)
280
+ area_i = (bi[2] - bi[0]) * (bi[3] - bi[1])
281
+ area_j = (bj[2] - bj[0]) * (bj[3] - bj[1])
282
+ iou = inter / (area_i + area_j - inter + 1e-7)
283
+ if iou > iou_thresh:
284
+ suppressed[jdx] = True
285
+ return np.array(keep)
286
 
287
  @staticmethod
288
+ def _max_score_per_cluster(
289
+ coords: np.ndarray,
290
+ scores: np.ndarray,
291
+ keep_indices: np.ndarray,
292
+ iou_thresh: float,
293
+ ) -> np.ndarray:
294
+ """
295
+ For each kept box, return the max original score among itself and any
296
+ box that overlaps it with IOU >= iou_thresh (so TTA cluster keeps best conf).
297
+ """
298
+ n_keep = len(keep_indices)
299
+ if n_keep == 0:
300
+ return np.array([], dtype=np.float32)
301
+ out = np.empty(n_keep, dtype=np.float32)
302
+ coords = np.asarray(coords, dtype=np.float32)
303
+ scores = np.asarray(scores, dtype=np.float32)
304
+ for i in range(n_keep):
305
+ idx = keep_indices[i]
306
+ bi = coords[idx]
307
+ xx1 = np.maximum(bi[0], coords[:, 0])
308
+ yy1 = np.maximum(bi[1], coords[:, 1])
309
+ xx2 = np.minimum(bi[2], coords[:, 2])
310
+ yy2 = np.minimum(bi[3], coords[:, 3])
311
+ inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
312
+ area_i = (bi[2] - bi[0]) * (bi[3] - bi[1])
313
+ areas_j = (coords[:, 2] - coords[:, 0]) * (coords[:, 3] - coords[:, 1])
314
+ iou = inter / (area_i + areas_j - inter + 1e-7)
315
+ in_cluster = iou >= iou_thresh
316
+ out[i] = float(np.max(scores[in_cluster]))
317
+ return out
318
+
319
+ def _decode_final_dets(
320
+ self,
321
+ preds: np.ndarray,
322
+ ratio: float,
323
+ pad: tuple[float, float],
324
+ orig_size: tuple[int, int],
325
+ apply_optional_dedup: bool = False,
326
+ ) -> list[BoundingBox]:
327
+ """
328
+ Primary path:
329
+ expected output rows like [x1, y1, x2, y2, conf, cls_id]
330
+ in letterboxed input coordinates.
331
+ """
332
+ if preds.ndim == 3 and preds.shape[0] == 1:
333
+ preds = preds[0]
334
+
335
+ if preds.ndim != 2 or preds.shape[1] < 6:
336
+ raise ValueError(f"Unexpected ONNX final-det output shape: {preds.shape}")
337
+
338
+ boxes = preds[:, :4].astype(np.float32)
339
+ scores = preds[:, 4].astype(np.float32)
340
+ cls_ids = preds[:, 5].astype(np.int32)
341
+ cls_ids = self.cls_remap[np.clip(cls_ids, 0, len(self.cls_remap) - 1)]
342
+
343
+ keep = scores >= self.conf_thres
344
+ boxes = boxes[keep]
345
+ scores = scores[keep]
346
+ cls_ids = cls_ids[keep]
347
+
348
  if len(boxes) == 0:
349
+ return []
350
+
351
+ pad_w, pad_h = pad
352
+ orig_w, orig_h = orig_size
353
+
354
+ # reverse letterbox
355
+ boxes[:, [0, 2]] -= pad_w
356
+ boxes[:, [1, 3]] -= pad_h
357
+ boxes /= ratio
358
+ boxes = self._clip_boxes(boxes, (orig_w, orig_h))
359
+
360
+ if apply_optional_dedup and len(boxes) > 1:
361
+ keep_idx, scores = self._soft_nms(boxes, scores)
362
+ boxes = boxes[keep_idx]
363
+ cls_ids = cls_ids[keep_idx]
364
+
365
+ results: list[BoundingBox] = []
366
+ for box, conf, cls_id in zip(boxes, scores, cls_ids):
367
+ x1, y1, x2, y2 = box.tolist()
368
+
369
+ if x2 <= x1 or y2 <= y1:
370
  continue
 
 
 
 
 
371
 
372
+ results.append(
373
+ BoundingBox(
374
+ x1=int(math.floor(x1)),
375
+ y1=int(math.floor(y1)),
376
+ x2=int(math.ceil(x2)),
377
+ y2=int(math.ceil(y2)),
378
+ cls_id=int(cls_id),
379
+ conf=float(conf),
380
+ )
381
+ )
382
+
383
+ return results
384
+
385
+ def _decode_raw_yolo(
386
+ self,
387
+ preds: np.ndarray,
388
+ ratio: float,
389
+ pad: tuple[float, float],
390
+ orig_size: tuple[int, int],
391
+ ) -> list[BoundingBox]:
392
+ """
393
+ Fallback path for raw YOLO predictions.
394
+ Supports common layouts:
395
+ - [1, C, N]
396
+ - [1, N, C]
397
+ """
398
+ if preds.ndim != 3:
399
+ raise ValueError(f"Unexpected raw ONNX output shape: {preds.shape}")
400
+
401
+ if preds.shape[0] != 1:
402
+ raise ValueError(f"Unexpected batch dimension in raw output: {preds.shape}")
403
+
404
+ preds = preds[0]
405
+
406
+ # Normalize to [N, C]
407
  if preds.shape[0] <= 16 and preds.shape[1] > preds.shape[0]:
408
  preds = preds.T
409
 
410
+ if preds.ndim != 2 or preds.shape[1] < 5:
411
+ raise ValueError(f"Unexpected normalized raw output shape: {preds.shape}")
412
+
413
  boxes_xywh = preds[:, :4].astype(np.float32)
414
+ cls_part = preds[:, 4:].astype(np.float32)
415
 
416
+ if cls_part.shape[1] == 1:
417
+ scores = cls_part[:, 0]
418
  cls_ids = np.zeros(len(scores), dtype=np.int32)
419
  else:
420
+ cls_ids = np.argmax(cls_part, axis=1).astype(np.int32)
421
+ scores = cls_part[np.arange(len(cls_part)), cls_ids]
422
+ cls_ids = self.cls_remap[np.clip(cls_ids, 0, len(self.cls_remap) - 1)]
423
+
424
+ keep = scores >= self.conf_thres
425
+ boxes_xywh = boxes_xywh[keep]
426
+ scores = scores[keep]
427
+ cls_ids = cls_ids[keep]
428
 
 
 
 
429
  if len(boxes_xywh) == 0:
430
  return []
431
 
432
  boxes = self._xywh_to_xyxy(boxes_xywh)
433
+ keep_idx, scores = self._soft_nms(boxes, scores)
434
+ keep_idx = keep_idx[: self.max_det]
435
+ scores = scores[: self.max_det]
 
 
 
 
436
 
437
+ boxes = boxes[keep_idx]
438
+ cls_ids = cls_ids[keep_idx]
 
 
 
 
 
 
 
 
 
 
 
439
 
440
+ pad_w, pad_h = pad
441
+ orig_w, orig_h = orig_size
 
 
442
 
443
+ boxes[:, [0, 2]] -= pad_w
444
+ boxes[:, [1, 3]] -= pad_h
445
  boxes /= ratio
446
+ boxes = self._clip_boxes(boxes, (orig_w, orig_h))
447
+
448
+ results: list[BoundingBox] = []
449
+ for box, conf, cls_id in zip(boxes, scores, cls_ids):
450
+ x1, y1, x2, y2 = box.tolist()
451
+
452
+ if x2 <= x1 or y2 <= y1:
453
+ continue
454
+
455
+ results.append(
456
+ BoundingBox(
457
+ x1=int(math.floor(x1)),
458
+ y1=int(math.floor(y1)),
459
+ x2=int(math.ceil(x2)),
460
+ y2=int(math.ceil(y2)),
461
+ cls_id=int(cls_id),
462
+ conf=float(conf),
463
+ )
464
+ )
465
 
466
+ return results
 
 
 
 
 
467
 
468
+ def _postprocess(
469
+ self,
470
+ output: np.ndarray,
471
+ ratio: float,
472
+ pad: tuple[float, float],
473
+ orig_size: tuple[int, int],
474
+ ) -> list[BoundingBox]:
475
+ """
476
+ Prefer final detections first.
477
+ Fallback to raw decode only if needed.
478
+ """
479
+ # final detections: [N,6]
480
  if output.ndim == 2 and output.shape[1] >= 6:
481
  return self._decode_final_dets(output, ratio, pad, orig_size)
482
+
483
+ # final detections: [1,N,6]
484
+ if output.ndim == 3 and output.shape[0] == 1 and output.shape[2] == 6:
485
  return self._decode_final_dets(output, ratio, pad, orig_size)
486
+
487
+ # fallback raw decode
488
  return self._decode_raw_yolo(output, ratio, pad, orig_size)
489
 
490
  def _predict_single(self, image: np.ndarray) -> list[BoundingBox]:
491
+ if image is None:
492
+ raise ValueError("Input image is None")
493
+ if not isinstance(image, np.ndarray):
494
+ raise TypeError(f"Input is not numpy array: {type(image)}")
495
+ if image.ndim != 3:
496
+ raise ValueError(f"Expected HWC image, got shape={image.shape}")
497
+ if image.shape[0] <= 0 or image.shape[1] <= 0:
498
+ raise ValueError(f"Invalid image shape={image.shape}")
499
+ if image.shape[2] != 3:
500
+ raise ValueError(f"Expected 3 channels, got shape={image.shape}")
501
+
502
  if image.dtype != np.uint8:
503
  image = image.astype(np.uint8)
 
 
 
504
 
505
+ input_tensor, ratio, pad, orig_size = self._preprocess(image)
 
 
506
 
507
+ expected_shape = (1, 3, self.input_height, self.input_width)
508
+ if input_tensor.shape != expected_shape:
509
+ raise ValueError(
510
+ f"Bad input tensor shape={input_tensor.shape}, expected={expected_shape}"
511
+ )
512
 
513
+ outputs = self.session.run(self.output_names, {self.input_name: input_tensor})
514
+ det_output = outputs[0]
515
+ return self._postprocess(det_output, ratio, pad, orig_size)
516
 
517
+ def _predict_tta(self, image: np.ndarray) -> list[BoundingBox]:
518
+ """Horizontal-flip TTA: merge original + flipped via hard NMS."""
519
+ boxes_orig = self._predict_single(image)
 
 
 
 
 
520
 
521
+ flipped = cv2.flip(image, 1)
522
+ boxes_flip = self._predict_single(flipped)
 
 
 
 
 
523
 
524
+ w = image.shape[1]
525
+ boxes_flip = [
526
+ BoundingBox(
527
+ x1=w - b.x2, y1=b.y1, x2=w - b.x1, y2=b.y2,
528
+ cls_id=b.cls_id, conf=b.conf,
529
+ )
530
+ for b in boxes_flip
531
+ ]
532
 
533
+ all_boxes = boxes_orig + boxes_flip
534
+ if len(all_boxes) == 0:
535
+ return []
536
 
537
+ coords = np.array(
538
+ [[b.x1, b.y1, b.x2, b.y2] for b in all_boxes], dtype=np.float32
539
+ )
540
+ scores = np.array([b.conf for b in all_boxes], dtype=np.float32)
541
 
542
+ hard_keep = self._hard_nms(coords, scores, self.iou_thres)
543
+ if len(hard_keep) == 0:
544
+ return []
 
 
 
 
 
545
 
546
+ # _hard_nms already orders kept indices by descending score.
547
+ hard_keep = hard_keep[: self.max_det]
548
+
549
+ return [
550
+ BoundingBox(
551
+ x1=all_boxes[i].x1,
552
+ y1=all_boxes[i].y1,
553
+ x2=all_boxes[i].x2,
554
+ y2=all_boxes[i].y2,
555
+ cls_id=all_boxes[i].cls_id,
556
+ conf=float(scores[i]),
557
+ )
558
+ for i in hard_keep
559
+ ]
560
+
561
+ def predict_batch(
562
+ self,
563
+ batch_images: list[ndarray],
564
+ offset: int,
565
+ n_keypoints: int,
566
+ ) -> list[TVFrameResult]:
567
+ results: list[TVFrameResult] = []
568
+
569
+ for frame_number_in_batch, image in enumerate(batch_images):
570
  try:
571
+ if self.use_tta:
572
+ boxes = self._predict_tta(image)
573
+ else:
574
+ boxes = self._predict_single(image)
575
  except Exception as e:
576
+ print(f"⚠️ Inference failed for frame {offset + frame_number_in_batch}: {e}")
577
  boxes = []
578
+ # for box in boxes:
579
+ # if box.cls_id == 2:
580
+ # box.cls_id = 3
581
+ # elif box.cls_id == 3:
582
+ # box.cls_id = 2
583
+
584
+
585
+
586
+ results.append(
587
+ TVFrameResult(
588
+ frame_id=offset + frame_number_in_batch,
589
+ boxes=boxes,
590
+ keypoints=[(0, 0) for _ in range(max(0, int(n_keypoints)))],
591
+ )
592
+ )
593
+
594
  return results
595
+
596
+
597
if __name__ == "__main__":
    # Manual smoke test: load weights.onnx from this directory, run the miner
    # on car1.png, print the detections and save an annotated copy.
    repo_dir = Path(__file__).parent
    miner = Miner(repo_dir)

    image_path = repo_dir / "car1.png"
    if not image_path.exists():
        raise FileNotFoundError(f"Test image not found: {image_path}")

    image = cv2.imread(str(image_path), cv2.IMREAD_COLOR)
    if image is None:
        raise RuntimeError(f"Failed to read image: {image_path}")

    results = miner.predict_batch([image], offset=0, n_keypoints=0)

    # Annotate a copy so the loaded image stays untouched.
    vis = image.copy()
    colors = [(0, 255, 0), (0, 0, 255), (255, 0, 0)]
    for frame in results:
        print(f"Frame {frame.frame_id}:")
        for i, box in enumerate(frame.boxes):
            color = colors[i % len(colors)]
            top_left = (box.x1, box.y1)
            bottom_right = (box.x2, box.y2)
            cv2.rectangle(vis, top_left, bottom_right, color, 2)

            label = f"{box.cls_id }_{miner.class_names[box.cls_id] if box.cls_id < len(miner.class_names) else box.cls_id}:{box.conf:.2f}"
            # Text sits just above the box; the confidence doubles as the
            # font scale, so stronger detections get larger labels.
            label_origin = (box.x1, max(0, box.y1 - 5))
            cv2.putText(
                vis,
                label,
                label_origin,
                cv2.FONT_HERSHEY_SIMPLEX,
                box.conf,
                color,
                1,
                cv2.LINE_AA,
            )
            print(
                f" cls={box.cls_id} conf={box.conf:.3f} "
                f"box=({box.x1},{box.y1},{box.x2},{box.y2})"
            )
        print(len(frame.boxes))

    out_path = repo_dir / f"1_out_iou{miner.iou_thres:.2f}.png"
    cv2.imwrite(str(out_path), vis)
    print(f"Saved visualization to: {out_path}")