iotaminer committed on
Commit
e15d45e
·
verified ·
1 Parent(s): 53c1227

scorevision: push artifact

Browse files
Files changed (1) hide show
  1. miner.py +344 -161
miner.py CHANGED
@@ -1,16 +1,25 @@
1
- """TurboVision miner for Detect-petrol-station-1-0.
 
 
 
 
 
2
 
3
- YOLOv11s ONNX FP16 + NMS baked in, with horizontal-flip TTA to boost recall.
4
- 4 classes: 0=petrol hose, 1=petrol pump, 2=price board, 3=roof canopy.
 
5
  """
 
6
  from __future__ import annotations
7
 
 
 
8
  from pathlib import Path
9
- from typing import List, Tuple
10
 
11
  import cv2
12
  import numpy as np
13
  import onnxruntime as ort
 
14
  from pydantic import BaseModel
15
 
16
 
@@ -29,176 +38,350 @@ class TVFrameResult(BaseModel):
29
  keypoints: list[tuple[int, int]]
30
 
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  class Miner:
      """Petrol-station detector backed by an ONNX Runtime session.

      Runs the model on each frame (optionally a second time on a horizontally
      flipped copy), merges the detections and returns them as ``BoundingBox``
      objects wrapped in ``TVFrameResult``.

      NOTE(review): this block was reconstructed from the removed side of a
      rendered diff in which indentation was lost; statement nesting is inferred
      from syntax and should be confirmed against the real file.
      """

      # Side length of the square letterboxed input passed to the model.
      IMGSZ = 1280
      # Per-class conf thresholds: 0=petrol hose, 1=petrol pump, 2=price board, 3=roof canopy.
      # Tuned via greedy grid search on 100 fresh challenges vs real SAM3 pseudo-GT.
      CLASS_CONF_THRES = (0.43, 0.63, 0.37, 0.41)
      CONF_THRES = 0.37  # fallback / pre-filter at lowest per-class threshold
      # IoU threshold used when merging original and flipped detections.
      IOU_THRES = 0.45
      # Detections whose class id falls outside [0, NUM_CLASSES) are discarded.
      NUM_CLASSES = 4
      # Minimum box side, as a fraction of the shorter image side.
      MIN_BOX_FRAC = 0.005
      # When True, a second pass on the horizontally flipped frame is merged in.
      USE_TTA = True

      def __init__(self, path_hf_repo: Path) -> None:
          """Load ``weights.onnx`` from ``path_hf_repo`` and create the ORT session.

          Raises:
              FileNotFoundError: if ``weights.onnx`` does not exist in the repo.
          """
          self.onnx_path = path_hf_repo / 'weights.onnx'
          if not self.onnx_path.exists():
              raise FileNotFoundError(f'Model not found at {self.onnx_path}')

          # Help ORT find CUDA libs shipped with nvidia-*-cu12 packages (pytorch/onnxruntime).
          # NOTE(review): LD_LIBRARY_PATH is changed after interpreter start-up;
          # whether the dynamic loader honours that for this process should be
          # confirmed.
          import os as _os
          import site as _site
          import glob as _glob
          cuda_lib_dirs: list[str] = []
          for sp in _site.getsitepackages() + [_site.getusersitepackages()]:
              for sub in ('nvidia/cuda_runtime/lib', 'nvidia/cublas/lib', 'nvidia/cudnn/lib',
                          'nvidia/cufft/lib', 'nvidia/cuda_nvrtc/lib', 'nvidia/curand/lib',
                          'nvidia/cusparse/lib', 'nvidia/cusolver/lib', 'nvidia/nvjitlink/lib'):
                  p = f'{sp}/{sub}'
                  # Only keep directories that actually contain shared objects.
                  if _glob.glob(f'{p}/*.so*'):
                      cuda_lib_dirs.append(p)
          if cuda_lib_dirs:
              # Prepend the discovered directories to any existing search path.
              existing = _os.environ.get('LD_LIBRARY_PATH', '')
              _os.environ['LD_LIBRARY_PATH'] = ':'.join(cuda_lib_dirs + ([existing] if existing else []))

          providers: list = []
          try:
              ort.preload_dlls()
          except Exception as _pe:
              # Best effort: a failure here is reported and otherwise ignored.
              print(f'[Miner] preload_dlls failed: {_pe}')
          available = ort.get_available_providers()
          # Prefer CUDA when ORT reports it; CPU is always appended as fallback.
          if 'CUDAExecutionProvider' in available:
              providers.append(('CUDAExecutionProvider', {'device_id': 0}))
          providers.append('CPUExecutionProvider')
          so = ort.SessionOptions()
          so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
          self.session = ort.InferenceSession(str(self.onnx_path), sess_options=so, providers=providers)
          self.input_name = self.session.get_inputs()[0].name
          inp = self.session.get_inputs()[0]
          self.input_shape = inp.shape
          # Feed the tensor in the dtype the model declares for its first input.
          self.input_dtype = np.float16 if inp.type == 'tensor(float16)' else np.float32
          self.active_providers = self.session.get_providers()
          print(f'[Miner] Loaded {self.onnx_path.name} | providers={self.active_providers} | dtype={self.input_dtype}')
          print(f'[Miner] cuda_lib_dirs discovered: {cuda_lib_dirs[:3]}')
          print(f'[Miner] ort.get_available_providers() = {available}')

      def __repr__(self) -> str:
          """Return a short description including TTA flag, threshold and providers."""
          return f'PetrolMiner(yolo11s-onnx-fp16-nms, tta={self.USE_TTA}, conf={self.CONF_THRES}, providers={getattr(self, "active_providers", "?")})'

      @staticmethod
      def _letterbox(img, new_size=1280, color=(114, 114, 114)):
          """Resize ``img`` to fit a ``new_size`` square and pad it with ``color``.

          Returns:
              ``(padded, r, (left, top))``: the padded image, the resize ratio
              and the left/top padding in pixels.
          """
          h, w = img.shape[:2]
          r = min(new_size / h, new_size / w)
          nh, nw = int(round(h * r)), int(round(w * r))
          resized = cv2.resize(img, (nw, nh), interpolation=cv2.INTER_LINEAR)
          # Split the leftover space so the image is centred; any odd pixel
          # goes to the bottom/right side.
          top = (new_size - nh) // 2
          bottom = new_size - nh - top
          left = (new_size - nw) // 2
          right = new_size - nw - left
          padded = cv2.copyMakeBorder(resized, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
          return padded, r, (left, top)

      def _preprocess(self, img):
          """Convert a BGR frame into the model input tensor.

          Returns:
              ``(x, r, (lx, ty), (w, h))``: the 1xCxHxW tensor scaled to [0, 1],
              the letterbox ratio, the left/top padding and the original size.
          """
          h, w = img.shape[:2]
          img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
          padded, r, (lx, ty) = self._letterbox(img_rgb, self.IMGSZ)
          x = padded.astype(self.input_dtype) / 255.0
          # HWC -> CHW, then add the batch axis.
          x = x.transpose(2, 0, 1)[None, ...]
          return np.ascontiguousarray(x), r, (lx, ty), (w, h)

      def _run_onnx(self, img):
          """Run one forward pass and map detections back to image coordinates.

          Each output row is read as ``[x1, y1, x2, y2, conf, cls]``.

          Returns:
              ``(xyxy, conf, cls_id, W, H)``; the first three are empty lists
              when the model output is empty, has fewer than six columns, or no
              row reaches ``CONF_THRES``.
          """
          x, r, (lx, ty), (W, H) = self._preprocess(img)
          outputs = self.session.run(None, {self.input_name: x})
          det = outputs[0]
          # Drop the batch axis when present.
          if det.ndim == 3: det = det[0]
          if det.size == 0: return [], [], [], W, H
          det = np.asarray(det, dtype=np.float32)
          if det.shape[-1] < 6: return [], [], [], W, H
          xyxy = det[:, :4].copy()
          conf = det[:, 4].copy()
          cls_id = det[:, 5].astype(int)
          # Cheap pre-filter at the global threshold; per-class thresholds are
          # applied later in _predict_single.
          keep = conf >= self.CONF_THRES
          xyxy, conf, cls_id = xyxy[keep], conf[keep], cls_id[keep]
          if len(xyxy) == 0: return [], [], [], W, H
          # Undo the letterbox: remove padding, then divide by the resize ratio.
          xyxy[:, [0, 2]] = (xyxy[:, [0, 2]] - lx) / r
          xyxy[:, [1, 3]] = (xyxy[:, [1, 3]] - ty) / r
          xyxy[:, 0::2] = np.clip(xyxy[:, 0::2], 0, W - 1)
          xyxy[:, 1::2] = np.clip(xyxy[:, 1::2], 0, H - 1)
          # Reject unknown class ids and boxes thinner than MIN_BOX_FRAC of the
          # shorter image side.
          min_side = self.MIN_BOX_FRAC * min(W, H)
          mask = (
              (cls_id >= 0) & (cls_id < self.NUM_CLASSES)
              & ((xyxy[:, 2] - xyxy[:, 0]) >= min_side)
              & ((xyxy[:, 3] - xyxy[:, 1]) >= min_side)
          )
          return xyxy[mask], conf[mask], cls_id[mask], W, H

      @staticmethod
      def _hard_nms_per_class(xyxy, conf, cls_id, iou_thres=0.5):
          """Greedy NMS applied independently to each class id.

          Returns:
              Integer indices into the input arrays of the boxes that survive.
          """
          if len(xyxy) == 0: return np.empty((0,), dtype=int)
          keep = []
          for c in np.unique(cls_id):
              idx = np.where(cls_id == c)[0]
              b = xyxy[idx]; s = conf[idx]
              # Process boxes of this class from highest to lowest confidence.
              order = np.argsort(-s)
              b = b[order]; s = s[order]; idx = idx[order]
              areas = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
              suppressed = np.zeros(len(b), dtype=bool)
              for i in range(len(b)):
                  if suppressed[i]: continue
                  keep.append(idx[i])
                  # IoU of box i against every lower-ranked box of the class.
                  xx1 = np.maximum(b[i, 0], b[i+1:, 0])
                  yy1 = np.maximum(b[i, 1], b[i+1:, 1])
                  xx2 = np.minimum(b[i, 2], b[i+1:, 2])
                  yy2 = np.minimum(b[i, 3], b[i+1:, 3])
                  inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
                  iou = inter / (areas[i] + areas[i+1:] - inter + 1e-9)
                  # suppressed[i+1:] is a view, so this marks the original array.
                  suppressed[i+1:][iou > iou_thres] = True
          return np.array(keep, dtype=int)

      def _predict_single(self, img):
          """Detect objects in one frame and return them as ``BoundingBox`` items."""
          xyxy1, conf1, cls1, W, H = self._run_onnx(img)
          if not self.USE_TTA:
              xyxy, conf, cls_id = xyxy1, conf1, cls1
          else:
              # Second pass on the mirrored frame; x coordinates are mirrored
              # back before merging.
              img_f = cv2.flip(img, 1)
              xyxy2, conf2, cls2, _, _ = self._run_onnx(img_f)
              if len(xyxy2) > 0:
                  tmp = xyxy2.copy()
                  tmp[:, 0] = W - xyxy2[:, 2]
                  tmp[:, 2] = W - xyxy2[:, 0]
                  xyxy2 = tmp
              # Skip empty passes so vstack/concatenate never see empty lists.
              pieces_xyxy = [a for a in (xyxy1, xyxy2) if len(a) > 0]
              pieces_conf = [a for a in (conf1, conf2) if len(a) > 0]
              pieces_cls = [a for a in (cls1, cls2) if len(a) > 0]
              xyxy = np.vstack(pieces_xyxy) if pieces_xyxy else np.empty((0, 4))
              conf = np.concatenate(pieces_conf) if pieces_conf else np.empty((0,))
              cls_id = np.concatenate(pieces_cls) if pieces_cls else np.empty((0,))
              # NOTE(review): assumed to belong to the TTA branch (indentation
              # was lost in the diff rendering) - confirm against the real file.
              if len(xyxy) > 0:
                  keep = self._hard_nms_per_class(xyxy, conf, cls_id, iou_thres=self.IOU_THRES)
                  xyxy, conf, cls_id = xyxy[keep], conf[keep], cls_id[keep]
          boxes = []
          for i in range(len(xyxy)):
              ci = int(cls_id[i])
              # Apply the per-class confidence threshold.
              if 0 <= ci < self.NUM_CLASSES and float(conf[i]) < self.CLASS_CONF_THRES[ci]:
                  continue
              boxes.append(BoundingBox(
                  x1=int(round(float(xyxy[i, 0]))),
                  y1=int(round(float(xyxy[i, 1]))),
                  x2=int(round(float(xyxy[i, 2]))),
                  y2=int(round(float(xyxy[i, 3]))),
                  cls_id=ci,
                  conf=float(conf[i]),
              ))
          return boxes

      def predict_batch(self, batch_images, offset, n_keypoints):
          """Run detection on every image of the batch.

          Frame ids are ``offset + index``. A frame whose inference raises is
          logged and reported with no boxes. Keypoints are always
          ``n_keypoints`` placeholder ``(0, 0)`` pairs.
          """
          results = []
          for i, img in enumerate(batch_images):
              try:
                  boxes = self._predict_single(img)
              except Exception as e:
                  print(f'[Miner] predict error on frame {offset + i}: {e}')
                  boxes = []
              kps = [(0, 0) for _ in range(n_keypoints)]
              results.append(TVFrameResult(frame_id=offset + i, boxes=boxes, keypoints=kps))
          return results
 
1
+ """
2
+ Detect-Person miner for ScoreVision.
3
+
4
+ Loaded by the TurboVision chute_template from the root of the HF repo.
5
+ Thresholds (imgsz, conf, iou, max_det) are overridable via SN44_* env vars
6
+ so operators can hot-patch without redeploying.
7
 
8
+ Contract expected by the chute template:
9
+ * class `Miner(path_hf_repo: Path)`
10
+ * method `predict_batch(batch_images, offset, n_keypoints) -> list[TVFrameResult]`
11
  """
12
+
13
  from __future__ import annotations
14
 
15
+ import math
16
+ import os
17
  from pathlib import Path
 
18
 
19
  import cv2
20
  import numpy as np
21
  import onnxruntime as ort
22
+ from numpy import ndarray
23
  from pydantic import BaseModel
24
 
25
 
 
38
  keypoints: list[tuple[int, int]]
39
 
40
 
41
# ---------------------------------------------------------------------------
# Tuned hyperparameters (override via env for hot-patching without redeploy)
# ---------------------------------------------------------------------------
# File name of the ONNX weights inside the HF repo (env: SN44_ONNX_WEIGHTS).
_DEFAULT_WEIGHTS = "weights.onnx"
# Input side length used when the model does not declare a fixed height/width
# (env: SN44_IMGSZ).
_DEFAULT_IMGSZ = 960
# Minimum person score for a candidate to be kept (env: SN44_CONF).
_DEFAULT_CONF = 0.25
# IoU threshold passed to NMS (env: SN44_IOU).
_DEFAULT_IOU = 0.60
# Maximum number of detections kept per frame after NMS (env: SN44_MAX_DET).
_DEFAULT_MAX_DET = 300
49
+
50
+
51
def _env_int(name: str, default: int) -> int:
    """Read integer setting ``name`` from the environment.

    Returns ``default`` when the variable is unset or cannot be parsed by
    ``int``.
    """
    raw = os.environ.get(name, default)
    try:
        parsed = int(raw)
    except (TypeError, ValueError):
        parsed = default
    return parsed
56
+
57
+
58
def _env_float(name: str, default: float) -> float:
    """Read float setting ``name`` from the environment.

    Args:
        name: Environment variable to look up.
        default: Value returned when the variable is unset, unparsable or
            not a finite number.

    Returns:
        The parsed finite float, or ``default``.
    """
    try:
        value = float(os.environ.get(name, default))
    except (TypeError, ValueError):
        return default
    # float() happily parses "nan" and "inf"; a NaN threshold makes every
    # ``score >= threshold`` comparison false and silently disables detection,
    # so non-finite overrides are treated like any other invalid value.
    return value if math.isfinite(value) else default
63
+
64
+
65
def _letterbox(
    image: ndarray,
    new_shape: tuple[int, int],
    color: tuple[int, int, int] = (114, 114, 114),
) -> tuple[ndarray, float, tuple[float, float]]:
    """YOLO-style letterbox: resize preserving aspect ratio, then pad.

    Args:
        image: Image array whose first two axes are height and width.
        new_shape: Target size as ``(width, height)``.
        color: Constant border value used for the padding.

    Returns:
        ``(img, ratio, (pad_left, pad_top))`` where ``ratio`` is the resize
        factor and the pad pair is the number of pixels actually added on the
        left and top, as floats. Mapping a model-space coordinate back to the
        source image is ``(x - pad_left) / ratio``.

    Raises:
        ValueError: If ``image`` has zero height or width.
    """
    h, w = image.shape[:2]
    if h <= 0 or w <= 0:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError(f"Cannot letterbox an empty image of shape {image.shape}")
    new_w, new_h = new_shape
    ratio = min(new_w / w, new_h / h)
    # Extremely thin images could round to 0, which cv2.resize rejects.
    resized_w = max(1, int(round(w * ratio)))
    resized_h = max(1, int(round(h * ratio)))
    if (resized_w, resized_h) != (w, h):
        # Cubic when upscaling, linear when downscaling.
        interp = cv2.INTER_CUBIC if ratio > 1.0 else cv2.INTER_LINEAR
        image = cv2.resize(image, (resized_w, resized_h), interpolation=interp)
    dw = (new_w - resized_w) / 2.0
    dh = (new_h - resized_h) / 2.0
    # The -0.1 / +0.1 nudges send an odd leftover pixel to the right/bottom.
    left = int(round(dw - 0.1))
    right = int(round(dw + 0.1))
    top = int(round(dh - 0.1))
    bottom = int(round(dh + 0.1))
    padded = cv2.copyMakeBorder(
        image, top, bottom, left, right,
        borderType=cv2.BORDER_CONSTANT, value=color,
    )
    # Report the padding that was really applied: returning the fractional
    # dw/dh would shift back-projected boxes by half a pixel whenever the
    # total padding is odd.
    return padded, ratio, (float(left), float(top))
90
+
91
+
92
def _xywh_to_xyxy(boxes: np.ndarray) -> np.ndarray:
    """Convert ``(cx, cy, w, h)`` rows into ``(x1, y1, x2, y2)`` corner rows.

    The result has the same shape and dtype as ``boxes``.
    """
    half_w = boxes[:, 2] / 2.0
    half_h = boxes[:, 3] / 2.0
    corners = np.empty_like(boxes)
    corners[:, 0], corners[:, 1] = boxes[:, 0] - half_w, boxes[:, 1] - half_h
    corners[:, 2], corners[:, 3] = boxes[:, 0] + half_w, boxes[:, 1] + half_h
    return corners
99
+
100
+
101
def _hard_nms(boxes: np.ndarray, scores: np.ndarray, iou_thresh: float) -> np.ndarray:
    """Greedy hard NMS in pure numpy (keeps the chute free of torchvision).

    Returns the indices of the surviving boxes, highest score first.
    """
    if len(boxes) == 0:
        return np.array([], dtype=np.intp)

    boxes = np.asarray(boxes, dtype=np.float32)
    scores = np.asarray(scores, dtype=np.float32)

    def iou_against(anchor: int, others: np.ndarray) -> np.ndarray:
        """IoU between box ``anchor`` and each box indexed by ``others``."""
        left = np.maximum(boxes[anchor, 0], boxes[others, 0])
        upper = np.maximum(boxes[anchor, 1], boxes[others, 1])
        right_edge = np.minimum(boxes[anchor, 2], boxes[others, 2])
        lower = np.minimum(boxes[anchor, 3], boxes[others, 3])
        overlap = np.maximum(0.0, right_edge - left) * np.maximum(0.0, lower - upper)
        anchor_area = max(
            0.0,
            (boxes[anchor, 2] - boxes[anchor, 0]) * (boxes[anchor, 3] - boxes[anchor, 1]),
        )
        other_area = np.maximum(0.0, boxes[others, 2] - boxes[others, 0]) * np.maximum(
            0.0, boxes[others, 3] - boxes[others, 1]
        )
        return overlap / (anchor_area + other_area - overlap + 1e-7)

    selected: list[int] = []
    remaining = np.argsort(scores)[::-1]
    while len(remaining) > 0:
        best = int(remaining[0])
        selected.append(best)
        candidates = remaining[1:]
        if len(candidates) == 0:
            break
        # Only boxes that do not overlap the winner too much stay in the race.
        remaining = candidates[iou_against(best, candidates) <= iou_thresh]
    return np.array(selected, dtype=np.intp)
127
+
128
+
129
def _clip_boxes(boxes: np.ndarray, image_size: tuple[int, int]) -> np.ndarray:
    """Clamp xyxy boxes in place to the pixel grid of a ``(width, height)`` image.

    x coordinates are limited to ``[0, width - 1]`` and y coordinates to
    ``[0, height - 1]``. The same (mutated) array is returned.
    """
    width, height = image_size
    max_x = width - 1
    max_y = height - 1
    for column, upper in ((0, max_x), (1, max_y), (2, max_x), (3, max_y)):
        boxes[:, column] = np.clip(boxes[:, column], 0, upper)
    return boxes
136
+
137
+
138
  class Miner:
139
+ """Detect-Person miner: ONNX Runtime + raw YOLO decode + numpy NMS."""
 
 
 
 
 
 
 
 
140
 
141
  def __init__(self, path_hf_repo: Path) -> None:
142
+ self.class_names = ["person"]
143
+
144
+ weights_name = os.environ.get("SN44_ONNX_WEIGHTS", _DEFAULT_WEIGHTS)
145
+ weights_path = path_hf_repo / weights_name
146
+ if not weights_path.is_file():
147
+ raise FileNotFoundError(
148
+ f"ONNX weights '{weights_name}' not found in {path_hf_repo}"
149
+ )
150
+
151
+ print("ORT version:", ort.__version__)
 
 
 
 
 
 
 
 
 
 
 
152
  try:
153
  ort.preload_dlls()
154
+ print("ORT preload_dlls ok")
155
+ except Exception as e:
156
+ print(f"ORT preload_dlls skipped: {e}")
157
+ print("ORT available providers:", ort.get_available_providers())
158
+
159
+ sess_options = ort.SessionOptions()
160
+ sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
161
+
162
+ try:
163
+ self.session = ort.InferenceSession(
164
+ str(weights_path),
165
+ sess_options=sess_options,
166
+ providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
167
+ )
168
+ print("ORT session created with CUDA preferred")
169
+ except Exception as e:
170
+ print(f"ORT CUDA provider failed, falling back to CPU: {e}")
171
+ self.session = ort.InferenceSession(
172
+ str(weights_path),
173
+ sess_options=sess_options,
174
+ providers=["CPUExecutionProvider"],
175
+ )
176
+ print("ORT session providers:", self.session.get_providers())
177
+
178
+ for inp in self.session.get_inputs():
179
+ print("ONNX INPUT:", inp.name, inp.shape, inp.type)
180
+ for out in self.session.get_outputs():
181
+ print("ONNX OUTPUT:", out.name, out.shape, out.type)
182
+
183
  self.input_name = self.session.get_inputs()[0].name
184
+ self.output_names = [o.name for o in self.session.get_outputs()]
185
+ input_shape = self.session.get_inputs()[0].shape
 
 
 
 
 
186
 
187
+ h = input_shape[2] if isinstance(input_shape[2], int) and input_shape[2] > 0 else _DEFAULT_IMGSZ
188
+ w = input_shape[3] if isinstance(input_shape[3], int) and input_shape[3] > 0 else _DEFAULT_IMGSZ
189
+ self.input_height = _env_int("SN44_IMGSZ", h)
190
+ self.input_width = _env_int("SN44_IMGSZ", w)
191
 
192
+ self.conf_thres = _env_float("SN44_CONF", _DEFAULT_CONF)
193
+ self.iou_thres = _env_float("SN44_IOU", _DEFAULT_IOU)
194
+ self.max_det = _env_int("SN44_MAX_DET", _DEFAULT_MAX_DET)
195
+
196
+ self.min_w = 4
197
+ self.min_h = 4
198
+ self.min_box_area = 16
199
+ self.max_aspect_ratio = 8.0
200
+ self.max_box_area_ratio = 0.9
201
+
202
+ self.person_cls_idx = 0
203
+
204
+ print(
205
+ "Miner ready: "
206
+ f"imgsz={self.input_height}x{self.input_width}, "
207
+ f"conf={self.conf_thres:.3f}, iou={self.iou_thres:.3f}, "
208
+ f"max_det={self.max_det}, providers={self.session.get_providers()}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  )
 
210
 
211
+ def __repr__(self) -> str:
212
+ return (
213
+ "DetectPersonMiner("
214
+ f"providers={self.session.get_providers()}, "
215
+ f"imgsz={self.input_height}x{self.input_width}, "
216
+ f"conf={self.conf_thres}, iou={self.iou_thres})"
217
+ )
218
+
219
+ def _preprocess(
220
+ self, image: ndarray
221
+ ) -> tuple[np.ndarray, float, tuple[float, float], tuple[int, int]]:
222
+ if image.dtype != np.uint8:
223
+ image = image.astype(np.uint8)
224
+ orig_h, orig_w = image.shape[:2]
225
+ img, ratio, pad = _letterbox(image, (self.input_width, self.input_height))
226
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
227
+ img = img.astype(np.float32) / 255.0
228
+ img = np.transpose(img, (2, 0, 1))[None, ...]
229
+ img = np.ascontiguousarray(img, dtype=np.float32)
230
+ return img, ratio, pad, (orig_w, orig_h)
231
+
232
+ def _filter_sane(
233
+ self,
234
+ boxes: np.ndarray,
235
+ scores: np.ndarray,
236
+ orig_size: tuple[int, int],
237
+ ) -> tuple[np.ndarray, np.ndarray]:
238
+ if len(boxes) == 0:
239
+ return boxes, scores
240
+ orig_w, orig_h = orig_size
241
+ image_area = float(orig_w * orig_h)
242
+ keep: list[int] = []
243
+ for i, box in enumerate(boxes):
244
+ x1, y1, x2, y2 = box.tolist()
245
+ bw = x2 - x1
246
+ bh = y2 - y1
247
+ if bw <= 0 or bh <= 0:
248
+ continue
249
+ if bw < self.min_w or bh < self.min_h:
250
+ continue
251
+ area = bw * bh
252
+ if area < self.min_box_area:
253
+ continue
254
+ if area > self.max_box_area_ratio * image_area:
255
+ continue
256
+ ar = max(bw / max(bh, 1e-6), bh / max(bw, 1e-6))
257
+ if ar > self.max_aspect_ratio:
258
+ continue
259
+ keep.append(i)
260
+ if not keep:
261
+ return (
262
+ np.empty((0, 4), dtype=np.float32),
263
+ np.empty((0,), dtype=np.float32),
264
+ )
265
+ keep_idx = np.array(keep, dtype=np.intp)
266
+ return boxes[keep_idx], scores[keep_idx]
267
+
268
+ def _decode_yolov11(
269
+ self,
270
+ preds: np.ndarray,
271
+ ratio: float,
272
+ pad: tuple[float, float],
273
+ orig_size: tuple[int, int],
274
+ ) -> list[BoundingBox]:
275
+ """
276
+ Ultralytics YOLOv8/11 ONNX output is [1, 4+nc, N].
277
+ For COCO nc=80 → shape [1, 84, N]. No objectness term;
278
+ class score IS the detection score.
279
+ """
280
+ if preds.ndim != 3:
281
+ return []
282
+ preds = preds[0]
283
+ if preds.shape[0] == 4 + len(self._coco_classes()):
284
+ preds = preds.T
285
+ elif preds.shape[1] == 4 + len(self._coco_classes()):
286
+ pass
287
  else:
288
+ if preds.shape[0] < preds.shape[1]:
289
+ preds = preds.T
290
+
291
+ if preds.shape[1] < 5:
292
+ return []
293
+
294
+ boxes_xywh = preds[:, :4].astype(np.float32)
295
+ class_scores = preds[:, 4:].astype(np.float32)
296
+
297
+ person_scores = class_scores[:, self.person_cls_idx]
298
+ mask = person_scores >= self.conf_thres
299
+ if not np.any(mask):
300
+ return []
301
+
302
+ boxes_xywh = boxes_xywh[mask]
303
+ scores = person_scores[mask]
304
+
305
+ boxes = _xywh_to_xyxy(boxes_xywh)
306
+
307
+ pad_w, pad_h = pad
308
+ boxes[:, [0, 2]] -= pad_w
309
+ boxes[:, [1, 3]] -= pad_h
310
+ boxes /= ratio
311
+ boxes = _clip_boxes(boxes, orig_size)
312
+
313
+ boxes, scores = self._filter_sane(boxes, scores, orig_size)
314
+ if len(boxes) == 0:
315
+ return []
316
+
317
+ keep = _hard_nms(boxes, scores, self.iou_thres)
318
+ keep = keep[: self.max_det]
319
+ boxes = boxes[keep]
320
+ scores = scores[keep]
321
+
322
+ out: list[BoundingBox] = []
323
+ for box, conf in zip(boxes, scores):
324
+ if box[2] <= box[0] or box[3] <= box[1]:
325
  continue
326
+ out.append(
327
+ BoundingBox(
328
+ x1=int(math.floor(box[0])),
329
+ y1=int(math.floor(box[1])),
330
+ x2=int(math.ceil(box[2])),
331
+ y2=int(math.ceil(box[3])),
332
+ cls_id=0,
333
+ conf=float(conf),
334
+ )
335
+ )
336
+ return out
337
+
338
+ @staticmethod
339
+ def _coco_classes() -> list[str]:
340
+ return [
341
+ "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
342
+ "truck", "boat", "traffic light", "fire hydrant", "stop sign",
343
+ "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep",
344
+ "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
345
+ "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
346
+ "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
347
+ "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork",
348
+ "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
349
+ "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
350
+ "couch", "potted plant", "bed", "dining table", "toilet", "tv",
351
+ "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave",
352
+ "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase",
353
+ "scissors", "teddy bear", "hair drier", "toothbrush",
354
+ ]
355
+
356
+ def _predict_single(self, image: np.ndarray) -> list[BoundingBox]:
357
+ if image is None:
358
+ raise ValueError("Input image is None")
359
+ if not isinstance(image, np.ndarray) or image.ndim != 3 or image.shape[2] != 3:
360
+ raise ValueError(f"Expected HWC RGB/BGR image, got shape={getattr(image, 'shape', None)}")
361
+
362
+ input_tensor, ratio, pad, orig_size = self._preprocess(image)
363
+ outputs = self.session.run(self.output_names, {self.input_name: input_tensor})
364
+ return self._decode_yolov11(outputs[0], ratio, pad, orig_size)
365
+
366
+ def predict_batch(
367
+ self,
368
+ batch_images: list[ndarray],
369
+ offset: int,
370
+ n_keypoints: int,
371
+ ) -> list[TVFrameResult]:
372
+ results: list[TVFrameResult] = []
373
+ for i, image in enumerate(batch_images):
374
+ frame_id = offset + i
375
  try:
376
+ boxes = self._predict_single(image)
377
  except Exception as e:
378
+ print(f"Inference failed for frame {frame_id}: {e}")
379
  boxes = []
380
+ results.append(
381
+ TVFrameResult(
382
+ frame_id=frame_id,
383
+ boxes=boxes,
384
+ keypoints=[(0, 0) for _ in range(max(0, int(n_keypoints)))],
385
+ )
386
+ )
387
  return results