nexu02 committed on
Commit
fe6bdcc
·
verified ·
1 Parent(s): 0f56775

onnxruntime miner.py (was ultralytics .pt loader)

Browse files
Files changed (1) hide show
  1. miner.py +231 -77
miner.py CHANGED
@@ -1,35 +1,26 @@
1
- """miner.py — uploaded to nexu02/ScoreVision HF repo (R18 public).
2
-
3
- Round 18 (R18): YOLO11s retrained on dataset_v12 = 529 manual + 124 pseudo-labeled
4
- frames from the validator's own challenge pool. Pseudo-labels generated by
5
- YOLO11x teacher (mAP50 0.946) with multi-scale TTA + WBF + per-class threshold gates
6
- (cup 0.60, bottle 0.65, can 0.65). Goal: lift recall on the validator's specific
7
- CCTV distribution while keeping R17's class-discrimination gains.
8
-
9
- Training (RTX PRO 6000 Blackwell, 120 epochs, batch=32, cos_lr, AdamW):
10
- - dataset_v12 (587 manual + 124 pseudo-labeled = 711 train + 58 val)
11
- - same R17 recipe: 1280 imgsz, label_smoothing=0.1, copy_paste=0.4, mixup=0.2
12
- - cls loss weight 0.8
13
-
14
- Val results vs R17:
15
- - mAP50 = 0.932 (R17 0.928, +0.004)
16
- - mAP50-95 = 0.776 (R17 0.764, +0.012)
17
- - per-class P: cup 0.890, bottle 0.921, can 0.899
18
-
19
- Local F1 on 3 windows (vs bird ref): R17 0.784 → R18 0.836 (+0.052)
20
- - 8337900: 0.833 → 0.833 (no change)
21
- - 8338200: 0.818 → 0.857 (+0.039)
22
- - 8338500: 0.700 → 0.818 (+0.118) ← hardest window, biggest gain
23
-
24
- Inference (unchanged from R17 chute):
25
- - imgsz=1280, conf=0.50, iou=0.45, augment=True (hflip TTA)
26
  - cross-class NMS at IoU 0.6
 
 
 
27
  """
28
  from pathlib import Path
 
 
 
29
  import numpy as np
 
30
  from numpy import ndarray
31
  from pydantic import BaseModel
32
- from ultralytics import YOLO
33
 
34
  CLASS_NAMES = ["cup", "bottle", "can"]
35
 
@@ -49,72 +40,235 @@ class TVFrameResult(BaseModel):
49
  keypoints: list[tuple[int, int]]
50
 
51
 
52
- def _iou(a: BoundingBox, b: BoundingBox) -> float:
53
- x1 = max(a.x1, b.x1); y1 = max(a.y1, b.y1)
54
- x2 = min(a.x2, b.x2); y2 = min(a.y2, b.y2)
55
- if x2 <= x1 or y2 <= y1: return 0.0
56
- inter = (x2 - x1) * (y2 - y1)
57
- area_a = max(0, a.x2 - a.x1) * max(0, a.y2 - a.y1)
58
- area_b = max(0, b.x2 - b.x1) * max(0, b.y2 - b.y1)
59
- union = area_a + area_b - inter
60
- return inter / union if union > 0 else 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
 
63
- def _cross_class_nms(boxes: list[BoundingBox], iou_thresh: float = 0.6) -> list[BoundingBox]:
64
- if len(boxes) <= 1: return boxes
65
- sorted_boxes = sorted(boxes, key=lambda b: -b.conf)
66
- kept: list[BoundingBox] = []
67
- for b in sorted_boxes:
68
- if any(_iou(b, k) >= iou_thresh for k in kept):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  continue
70
- kept.append(b)
71
- return kept
 
 
 
 
72
 
73
 
74
  class Miner:
75
- IMAGE_SIZE = 1280
76
- CONF_THRESH = 0.50
77
- IOU_THRESH = 0.45
78
- USE_TTA = True
 
79
  CROSS_CLASS_IOU = 0.6
80
 
81
  def __init__(self, path_hf_repo: Path) -> None:
82
- weights_path = path_hf_repo / "best.pt"
83
- if not weights_path.exists():
84
- raise FileNotFoundError(f"missing weights at {weights_path}")
85
- self.model = YOLO(str(weights_path))
86
- dummy = np.zeros((640, 640, 3), dtype=np.uint8)
87
- _ = self.model.predict(dummy, imgsz=self.IMAGE_SIZE, conf=self.CONF_THRESH,
88
- iou=self.IOU_THRESH, augment=self.USE_TTA, verbose=False)
89
- print(f"✅ YOLO11s R18 loaded from {weights_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  def __repr__(self) -> str:
92
- return (f"YOLO11s_R18(imgsz={self.IMAGE_SIZE}, "
93
- f"conf={self.CONF_THRESH}, iou={self.IOU_THRESH}, "
94
- f"tta={self.USE_TTA})")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
  def predict_batch(self, batch_images: list[ndarray], offset: int,
97
- n_keypoints: int) -> list[TVFrameResult]:
98
- results = self.model.predict(
99
- batch_images, imgsz=self.IMAGE_SIZE, conf=self.CONF_THRESH,
100
- iou=self.IOU_THRESH, augment=self.USE_TTA, verbose=False,
101
- )
102
  out: list[TVFrameResult] = []
103
  kp_zeros = [(0, 0) for _ in range(max(0, int(n_keypoints)))]
104
- for i, r in enumerate(results):
105
  frame_id = offset + i
106
- boxes: list[BoundingBox] = []
107
- if r.boxes is not None and r.boxes.data is not None:
108
- for box in r.boxes.data.cpu().numpy():
109
- x1, y1, x2, y2, conf, cls_id = box.tolist()
110
- cls_id_int = int(cls_id)
111
- if cls_id_int < 0 or cls_id_int >= len(CLASS_NAMES): continue
112
- xi1, yi1, xi2, yi2 = int(x1), int(y1), int(x2), int(y2)
113
- if xi2 <= xi1 or yi2 <= yi1: continue
114
- boxes.append(BoundingBox(
115
- x1=xi1, y1=yi1, x2=xi2, y2=yi2,
116
- cls_id=cls_id_int, conf=float(conf),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  ))
118
- boxes = _cross_class_nms(boxes, iou_thresh=self.CROSS_CLASS_IOU)
119
- out.append(TVFrameResult(frame_id=frame_id, boxes=boxes, keypoints=kp_zeros))
 
 
120
  return out
 
1
+ """miner.py — uploaded to nexu02/ScoreVision HF repo (R17 ONNX migration).
2
+
3
+ Migrated from .pt to ONNX FP16 to comply with subnet requirement
4
+ (.onnx-only models). Same R17 weights (mAP50 0.928, mAP50-95 0.764) +
5
+ identical inference recipe to keep the #1 dashboard standing.
6
+
7
+ Inference (same as R17 .pt version):
8
+ - imgsz=1280, conf=0.50, iou=0.45
9
+ - hflip TTA (manual: run twice, merge with per-class NMS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  - cross-class NMS at IoU 0.6
11
+
12
+ Runtime: onnxruntime-gpu (CUDAExecutionProvider) with CPU fallback.
13
+ FP16 input/weights to fit under 30 MB HF cap (19.3 MB total).
14
  """
15
  from pathlib import Path
16
+ import math
17
+
18
+ import cv2
19
  import numpy as np
20
+ import onnxruntime as ort
21
  from numpy import ndarray
22
  from pydantic import BaseModel
23
+
24
 
25
  CLASS_NAMES = ["cup", "bottle", "can"]
26
 
 
40
  keypoints: list[tuple[int, int]]
41
 
42
 
43
def _iou_xyxy(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Return the IoU of one xyxy box against every row of an xyxy array.

    Args:
        a: a single box, indexed as ``[x1, y1, x2, y2]``.
        b: a 2-D array whose first four columns are ``x1, y1, x2, y2``.

    Returns:
        A 1-D array with one IoU value per row of ``b``.
    """
    ax1, ay1, ax2, ay2 = a[0], a[1], a[2], a[3]

    # Width/height of the overlap rectangle, floored at zero for disjoint boxes.
    overlap_w = np.maximum(0.0, np.minimum(ax2, b[:, 2]) - np.maximum(ax1, b[:, 0]))
    overlap_h = np.maximum(0.0, np.minimum(ay2, b[:, 3]) - np.maximum(ay1, b[:, 1]))
    inter = overlap_w * overlap_h

    area_a = max(0.0, (ax2 - ax1) * (ay2 - ay1))
    area_b = np.maximum(0.0, (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]))
    union = area_a + area_b - inter

    # The epsilon keeps the division defined when both boxes are degenerate.
    return inter / (union + 1e-7)
53
+
54
+
55
def _hard_nms(boxes: np.ndarray, scores: np.ndarray, iou_thr: float) -> np.ndarray:
    """Greedy hard NMS over boxes that the caller has already limited to one class.

    Args:
        boxes: xyxy boxes, one per row.
        scores: one confidence per box; higher scores are visited first.
        iou_thr: a lower-scored box is dropped when its IoU with a kept box
            is strictly greater than this value.

    Returns:
        Indices into ``boxes`` of the kept detections, best score first.
    """
    if len(boxes) == 0:
        return np.array([], dtype=np.intp)

    kept = []
    remaining = np.argsort(-scores)
    while remaining.size:
        best, others = int(remaining[0]), remaining[1:]
        kept.append(best)
        if not others.size:
            break
        # Only candidates that do not overlap the winner too much stay in play.
        overlap = _iou_xyxy(boxes[best], boxes[others])
        remaining = others[overlap <= iou_thr]
    return np.array(kept, dtype=np.intp)
71
 
72
 
73
def _per_class_nms(boxes, scores, cls_ids, iou_thr):
    """Run hard NMS separately inside each class and merge the survivors.

    Args:
        boxes: xyxy boxes, one per row.
        scores: one confidence per box.
        cls_ids: one class id per box; boxes only compete within equal ids.
        iou_thr: IoU threshold forwarded to ``_hard_nms``.

    Returns:
        Ascending indices (into the input arrays) of the kept detections.
    """
    if len(boxes) == 0:
        return np.array([], dtype=np.intp)

    survivors = []
    for label in np.unique(cls_ids):
        members = np.flatnonzero(cls_ids == label)
        # _hard_nms returns positions relative to the class subset; map them
        # back to positions in the full arrays.
        local_keep = _hard_nms(boxes[members], scores[members], iou_thr)
        survivors.extend(members[local_keep].tolist())
    return np.array(sorted(survivors), dtype=np.intp)
84
+
85
+
86
def _cross_class_nms(boxes, scores, cls_ids, iou_thr):
    """Class-agnostic NMS: drop overlapping boxes whatever their class.

    Args:
        boxes: xyxy boxes, one per row.
        scores: one confidence per box; higher scores win.
        cls_ids: accepted for signature parity with the per-class variant;
            not consulted here.
        iou_thr: a box is suppressed when its IoU with a kept, higher-scored
            box is strictly greater than this value.

    Returns:
        Ascending indices (into the input arrays) of the kept detections.
    """
    total = len(boxes)
    if total <= 1:
        return np.arange(total)

    dead = np.zeros(total, dtype=bool)
    winners = []
    for idx in np.argsort(-scores):
        if not dead[idx]:
            winners.append(int(idx))
            overlaps = _iou_xyxy(boxes[idx], boxes) > iou_thr
            overlaps[idx] = False  # a box never suppresses itself
            dead |= overlaps
    winners.sort()
    return np.array(winners, dtype=np.intp)
102
 
103
 
104
class Miner:
    """R17 ONNX miner: 1280 px letterbox, horizontal-flip TTA, cross-class NMS.

    Loads ``best.onnx`` from the given repo directory with onnxruntime and
    returns one ``TVFrameResult`` per input frame from ``predict_batch``.
    """

    INPUT_SIZE = 1280       # side of the square network input, in pixels
    CONF_THR = 0.50         # minimum best-class score for a raw detection
    IOU_THR = 0.45          # per-class NMS threshold used to merge TTA passes
    CROSS_CLASS_IOU = 0.6   # class-agnostic NMS threshold applied last

    def __init__(self, path_hf_repo: Path) -> None:
        """Create the onnxruntime session for ``<path_hf_repo>/best.onnx``.

        Tries the CUDA provider first and falls back to a CPU-only session if
        session creation raises.

        Raises:
            FileNotFoundError: if ``best.onnx`` is missing from the repo dir.
        """
        model_path = path_hf_repo / "best.onnx"
        if not model_path.exists():
            raise FileNotFoundError(f"missing weights at {model_path}")

        print(f"ORT version: {ort.__version__}")
        # Best effort only: a failure here must not prevent loading, but it is
        # reported rather than silently discarded so GPU issues are diagnosable.
        try:
            ort.preload_dlls()
        except Exception as e:
            print(f"ort.preload_dlls() skipped: {e}")

        sess_options = ort.SessionOptions()
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

        try:
            self.session = ort.InferenceSession(
                str(model_path),
                sess_options=sess_options,
                providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
            )
        except Exception as e:
            print(f"CUDA session failed, fallback CPU: {e}")
            self.session = ort.InferenceSession(
                str(model_path),
                sess_options=sess_options,
                providers=["CPUExecutionProvider"],
            )
        print(f"ORT providers: {self.session.get_providers()}")
        for inp in self.session.get_inputs():
            print(f"INPUT {inp.name} shape={inp.shape} dtype={inp.type}")
        for outp in self.session.get_outputs():
            print(f"OUTPUT {outp.name} shape={outp.shape} dtype={outp.type}")

        first_input = self.session.get_inputs()[0]
        self.input_name = first_input.name
        # An FP16 export reports a float16 tensor type and must be fed float16.
        self.input_dtype = np.float16 if "float16" in first_input.type else np.float32
        print(f"✅ R17 ONNX loaded, input dtype={self.input_dtype.__name__}")

    def __repr__(self) -> str:
        return f"R17_ONNX(imgsz={self.INPUT_SIZE}, conf={self.CONF_THR}, iou={self.IOU_THR})"

    @staticmethod
    def _letterbox_geometry(w: int, h: int, size: int):
        """Compute the resize scale, resized size and integer borders.

        Returns:
            ``(r, (new_w, new_h), (top, bottom, left, right))`` where the
            borders pad the resized image out to ``size`` x ``size``.
        """
        r = min(size / w, size / h)
        # Never let an extreme aspect ratio round a side down to zero pixels.
        new_w = max(1, int(round(w * r)))
        new_h = max(1, int(round(h * r)))
        dw, dh = (size - new_w) / 2.0, (size - new_h) / 2.0
        # The -0.1 / +0.1 nudges put the extra pixel of an odd pad on the
        # bottom/right side instead of relying on round-half-to-even.
        top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
        left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
        return r, (new_w, new_h), (top, bottom, left, right)

    def _letterbox(self, img: np.ndarray, size: int):
        """Resize ``img`` keeping aspect ratio and pad it to ``size`` x ``size``.

        Returns:
            ``(padded, r, (left, top))``. The pad is the integer border that
            was really added on the left/top, so subtracting it from network
            coordinates exactly undoes the padding (the fractional half-pad
            would be off by 0.5 px whenever the total padding is odd).
        """
        h, w = img.shape[:2]
        r, (new_w, new_h), (top, bottom, left, right) = self._letterbox_geometry(w, h, size)
        if (new_w, new_h) != (w, h):
            img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
        padded = cv2.copyMakeBorder(img, top, bottom, left, right,
                                    borderType=cv2.BORDER_CONSTANT, value=(114, 114, 114))
        return padded, r, (left, top)

    def _preprocess(self, img_bgr: np.ndarray):
        """Turn a BGR frame into the network input tensor.

        Returns:
            ``(x, r, pad, (w, h))``: ``x`` is a contiguous ``(1, 3, S, S)``
            array in ``self.input_dtype`` scaled to [0, 1]; ``r`` and ``pad``
            come from ``_letterbox``; ``(w, h)`` is the original frame size.
        """
        h, w = img_bgr.shape[:2]
        padded, r, pad = self._letterbox(img_bgr, self.INPUT_SIZE)
        rgb = cv2.cvtColor(padded, cv2.COLOR_BGR2RGB)
        x = rgb.astype(self.input_dtype) / 255.0
        x = np.transpose(x, (2, 0, 1))[None, ...]  # HWC -> 1xCxHxW
        return np.ascontiguousarray(x, dtype=self.input_dtype), r, pad, (w, h)

    def _decode_raw(self, raw: np.ndarray, r: float, pad, orig_size):
        """Decode the raw head output into boxes, scores and class ids.

        The output is expected as ``(1, 4 + C, N)`` or ``(4 + C, N)``: four
        ``cx, cy, w, h`` values followed by per-class scores.

        Args:
            raw: network output.
            r: letterbox scale factor.
            pad: ``(pad_w, pad_h)`` border added on the left/top.
            orig_size: ``(w, h)`` of the original frame.

        Returns:
            ``(boxes, scores, cls_ids)``: float32 xyxy boxes in original-image
            pixels clipped to ``[0, w-1]`` / ``[0, h-1]``, float32 scores and
            integer class ids. All three are empty when nothing reaches
            ``CONF_THR``.
        """
        if raw.ndim == 3:
            raw = raw[0]
        # Channels-first layout has far fewer rows than columns; flip it so
        # that every row is one candidate.
        if raw.shape[0] < raw.shape[1]:
            raw = raw.T
        boxes_xywh = raw[:, :4].astype(np.float32)
        cls_scores = raw[:, 4:].astype(np.float32)
        cls_ids = np.argmax(cls_scores, axis=1)
        scores = cls_scores[np.arange(len(cls_scores)), cls_ids]

        keep = scores >= self.CONF_THR
        if not keep.any():
            # Same dtypes as the non-empty path so results concatenate cleanly.
            return (np.empty((0, 4), dtype=np.float32),
                    np.empty(0, dtype=np.float32),
                    np.empty(0, dtype=np.intp))
        boxes_xywh, scores, cls_ids = boxes_xywh[keep], scores[keep], cls_ids[keep]

        # Centre/size -> corner coordinates.
        half_w = boxes_xywh[:, 2] / 2
        half_h = boxes_xywh[:, 3] / 2
        boxes = np.empty_like(boxes_xywh)
        boxes[:, 0] = boxes_xywh[:, 0] - half_w
        boxes[:, 1] = boxes_xywh[:, 1] - half_h
        boxes[:, 2] = boxes_xywh[:, 0] + half_w
        boxes[:, 3] = boxes_xywh[:, 1] + half_h

        # Undo the letterbox: remove the border, then the resize.
        pad_w, pad_h = pad
        boxes[:, [0, 2]] -= pad_w
        boxes[:, [1, 3]] -= pad_h
        boxes /= r

        w, h = orig_size
        boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, w - 1)
        boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, h - 1)

        return boxes, scores, cls_ids

    def _predict_single(self, img_bgr: np.ndarray):
        """Run one forward pass on a BGR frame and decode its first output."""
        x, r, pad, orig = self._preprocess(img_bgr)
        out = self.session.run(None, {self.input_name: x})[0]
        return self._decode_raw(out, r, pad, orig)

    def _predict_with_tta(self, img_bgr: np.ndarray):
        """Predict on the frame and its horizontal mirror, merged per class.

        Returns:
            ``(boxes, scores, cls_ids)`` after per-class NMS at ``IOU_THR``.
        """
        boxes1, scores1, cls1 = self._predict_single(img_bgr)
        boxes2, scores2, cls2 = self._predict_single(cv2.flip(img_bgr, 1))

        if len(boxes2):
            w = img_bgr.shape[1]
            boxes2 = boxes2.copy()
            # Mirror x back into the original frame (x1/x2 swap roles) and
            # re-apply the clip, since ``w - 0`` would land outside [0, w-1].
            boxes2[:, [0, 2]] = np.clip(w - boxes2[:, [2, 0]], 0, w - 1)

        boxes = np.concatenate([boxes1, boxes2])
        scores = np.concatenate([scores1, scores2])
        cls_ids = np.concatenate([cls1, cls2])
        if not len(boxes):
            return boxes, scores, cls_ids

        keep = _per_class_nms(boxes, scores, cls_ids, self.IOU_THR)
        return boxes[keep], scores[keep], cls_ids[keep]

    def predict_batch(self, batch_images: list[ndarray], offset: int,
                      n_keypoints: int) -> "list[TVFrameResult]":
        """Detect objects in every frame of a batch.

        Args:
            batch_images: frames to process; each is expected to be an
                ``H x W x 3`` array (treated as BGR by the preprocessing).
            offset: frame id assigned to the first image of the batch.
            n_keypoints: number of ``(0, 0)`` placeholder keypoints attached
                to every result.

        Returns:
            One ``TVFrameResult`` per input image, in order. A frame that is
            malformed or whose inference raises gets an empty box list rather
            than aborting the batch.
        """
        out: list[TVFrameResult] = []
        kp_zeros = [(0, 0) for _ in range(max(0, int(n_keypoints)))]
        for i, image in enumerate(batch_images):
            frame_id = offset + i
            try:
                if image is None or image.ndim != 3 or image.shape[2] != 3:
                    out.append(TVFrameResult(frame_id=frame_id, boxes=[], keypoints=kp_zeros))
                    continue
                if image.dtype != np.uint8:
                    # NOTE(review): plain cast, no rescaling — assumes values
                    # are already in the 0..255 range; confirm with the caller.
                    image = image.astype(np.uint8)

                boxes, scores, cls_ids = self._predict_with_tta(image)
                if len(boxes):
                    # Cross-class NMS (validator counts cross-class overlap as FP)
                    keep = _cross_class_nms(boxes, scores, cls_ids, self.CROSS_CLASS_IOU)
                    boxes, scores, cls_ids = boxes[keep], scores[keep], cls_ids[keep]

                results = []
                for b, s, c in zip(boxes, scores, cls_ids):
                    x1, y1, x2, y2 = b
                    if x2 <= x1 or y2 <= y1:
                        continue
                    c_int = int(c)
                    if c_int < 0 or c_int >= len(CLASS_NAMES):
                        continue
                    # Round outwards so the integer box still covers the
                    # floating-point one.
                    results.append(BoundingBox(
                        x1=int(math.floor(x1)), y1=int(math.floor(y1)),
                        x2=int(math.ceil(x2)), y2=int(math.ceil(y2)),
                        cls_id=c_int, conf=float(s),
                    ))
                out.append(TVFrameResult(frame_id=frame_id, boxes=results, keypoints=kp_zeros))
            except Exception as e:
                # Deliberate per-frame boundary: always emit one result per frame.
                print(f"Inference err for frame {frame_id}: {e}")
                out.append(TVFrameResult(frame_id=frame_id, boxes=[], keypoints=kp_zeros))
        return out