Ichiro1007 committed on
Commit
37d6aa0
·
verified ·
1 Parent(s): 6b827c5

scorevision: push artifact

Browse files
Files changed (1) hide show
  1. miner.py +321 -201
miner.py CHANGED
@@ -1,23 +1,27 @@
1
  """
2
- Production Miner for YOLOv9s 4-Class Beverage Detection
3
- TurboVision Subnet 44 - Bittensor
4
 
5
- This miner implements the required interface for TurboVision validators.
6
- Model: YOLOv9s trained for 100 epochs on 4,840 images
7
- Classes: bottle, wine_glass, cup, can
8
- Performance: 89.59% mAP50, 100% can detection
 
 
9
  """
10
 
11
  from pathlib import Path
 
12
  from typing import Optional
 
13
  import cv2
14
  import numpy as np
15
  import onnxruntime as ort
 
16
  from pydantic import BaseModel
17
 
18
 
19
  class BoundingBox(BaseModel):
20
- """Bounding box with class and confidence."""
21
  x1: int
22
  y1: int
23
  x2: int
@@ -27,33 +31,19 @@ class BoundingBox(BaseModel):
27
 
28
 
29
  class TVFrameResult(BaseModel):
30
- """Result for a single frame."""
31
  frame_id: int
32
  boxes: list[BoundingBox]
33
- keypoints: list[tuple[int, int]] # Empty for detection tasks
34
 
35
 
36
  class Miner:
37
  """
38
- YOLOv9s 4-Class Beverage Detection Miner
39
-
40
- Optimized for TurboVision beverage detection competition.
41
- Achieves 89.59% mAP50 validation accuracy with 100% can detection.
42
  """
43
 
44
  def __init__(self, path_hf_repo: Path) -> None:
45
- """
46
- Initialize the miner with model from Hugging Face repo.
47
-
48
- Args:
49
- path_hf_repo: Path to the Hugging Face repository containing weights.onnx
50
- """
51
  self.path_hf_repo = path_hf_repo
52
- self.class_names = ['bottle', 'wine_glass', 'cup', 'can']
53
- self.num_classes = len(self.class_names)
54
-
55
- # Model input size
56
- self.input_size = 640
57
 
58
  # Initialize ONNX session with optimizations
59
  sess_options = ort.SessionOptions()
@@ -61,237 +51,367 @@ class Miner:
61
  sess_options.intra_op_num_threads = 4
62
  sess_options.inter_op_num_threads = 4
63
 
64
- # Load model
65
- model_path = path_hf_repo / "weights.onnx"
66
  self.session = ort.InferenceSession(
67
- str(model_path),
68
  sess_options=sess_options,
69
  providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
70
  )
71
 
72
  self.input_name = self.session.get_inputs()[0].name
73
- self.output_names = [output.name for output in self.session.get_outputs()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
- # Detection thresholds
76
- self.conf_threshold = 0.25 # Confidence threshold
77
- self.iou_threshold = 0.45 # NMS IoU threshold
78
 
79
- print(f"✓ YOLOv9s model loaded from {model_path}")
80
- print(f"✓ Input: {self.input_name}, Outputs: {self.output_names}")
81
- print(f"✓ Classes: {self.class_names}")
 
 
 
 
 
82
 
83
  def __repr__(self) -> str:
84
  return (
85
- f"YOLOv9s 4-Class Beverage Miner\n"
86
- f"Model: {self.path_hf_repo / 'weights.onnx'}\n"
87
  f"Classes: {self.class_names}\n"
88
- f"Performance: 89.59% mAP50\n"
 
89
  )
90
 
91
- def preprocess(self, image: np.ndarray) -> np.ndarray:
92
- """
93
- Preprocess image for YOLO model.
94
 
95
- Args:
96
- image: BGR image (H, W, 3)
97
-
98
- Returns:
99
- Preprocessed tensor (1, 3, 640, 640)
100
- """
101
- # Resize to 640x640
102
- img_resized = cv2.resize(image, (self.input_size, self.input_size))
103
 
104
- # Convert BGR to RGB
105
- img_rgb = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB)
106
 
107
- # Normalize to [0, 1]
108
- img_normalized = img_rgb.astype(np.float32) / 255.0
109
 
110
- # Transpose to CHW format
111
- img_transposed = np.transpose(img_normalized, (2, 0, 1))
112
 
113
- # Add batch dimension
114
- img_batch = np.expand_dims(img_transposed, axis=0)
115
 
116
- return img_batch
117
 
118
- def postprocess(
119
- self,
120
- outputs: list[np.ndarray],
121
- orig_shape: tuple[int, int]
122
- ) -> list[BoundingBox]:
123
  """
124
- Post-process YOLO outputs to extract bounding boxes.
125
-
126
- Args:
127
- outputs: Raw YOLO outputs
128
- orig_shape: Original image shape (height, width)
129
-
130
- Returns:
131
- List of detected bounding boxes
132
  """
133
- predictions = outputs[0] # Shape: (1, N, 4+num_classes)
134
- predictions = predictions[0] # Remove batch dimension: (N, 4+num_classes)
135
 
136
- # Extract boxes and scores
137
- boxes = predictions[:, :4] # (N, 4) - x_center, y_center, width, height
138
- scores = predictions[:, 4:] # (N, num_classes)
 
139
 
140
- # Get max class score and index for each detection
141
- class_ids = np.argmax(scores, axis=1) # (N,)
142
- confidences = np.max(scores, axis=1) # (N,)
143
 
144
- # Filter by confidence threshold
145
- mask = confidences > self.conf_threshold
146
- boxes = boxes[mask]
147
- class_ids = class_ids[mask]
148
- confidences = confidences[mask]
149
 
150
- if len(boxes) == 0:
151
- return []
 
152
 
153
- # Convert from xywh to xyxy format
154
- boxes_xyxy = np.zeros_like(boxes)
155
- boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2 # x1
156
- boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2 # y1
157
- boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2 # x2
158
- boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2 # y2
159
-
160
- # Scale boxes to original image size
161
- scale_x = orig_shape[1] / self.input_size
162
- scale_y = orig_shape[0] / self.input_size
163
- boxes_xyxy[:, [0, 2]] *= scale_x
164
- boxes_xyxy[:, [1, 3]] *= scale_y
165
-
166
- # Apply NMS
167
- indices = self.nms(boxes_xyxy, confidences, self.iou_threshold)
168
-
169
- # Create BoundingBox objects
170
- detections = []
171
- for idx in indices:
172
- box = boxes_xyxy[idx]
173
- detections.append(BoundingBox(
174
- x1=int(box[0]),
175
- y1=int(box[1]),
176
- x2=int(box[2]),
177
- y2=int(box[3]),
178
- cls_id=int(class_ids[idx]),
179
- conf=float(confidences[idx])
180
- ))
181
 
182
- return detections
183
 
184
- def nms(
 
 
 
 
 
 
 
 
 
 
 
 
185
  self,
186
- boxes: np.ndarray,
187
- scores: np.ndarray,
188
- iou_threshold: float
189
- ) -> list[int]:
190
  """
191
- Non-Maximum Suppression.
192
-
193
- Args:
194
- boxes: Bounding boxes in xyxy format (N, 4)
195
- scores: Confidence scores (N,)
196
- iou_threshold: IoU threshold for NMS
197
-
198
- Returns:
199
- Indices of boxes to keep
200
  """
201
- # Sort by confidence (descending)
202
- indices = np.argsort(scores)[::-1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
 
 
 
204
  keep = []
205
- while len(indices) > 0:
206
- # Pick the box with highest confidence
207
- current = indices[0]
208
- keep.append(current)
209
 
210
- if len(indices) == 1:
211
  break
212
 
213
- # Compute IoU with remaining boxes
214
- current_box = boxes[current]
215
- other_boxes = boxes[indices[1:]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
- ious = self.compute_iou(current_box, other_boxes)
 
 
218
 
219
- # Keep boxes with IoU below threshold
220
- mask = ious < iou_threshold
221
- indices = indices[1:][mask]
 
 
 
 
 
 
 
222
 
223
- return keep
224
 
225
- def compute_iou(
226
- self,
227
- box: np.ndarray,
228
- boxes: np.ndarray
229
- ) -> np.ndarray:
230
- """
231
- Compute IoU between one box and multiple boxes.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
233
- Args:
234
- box: Single box (4,)
235
- boxes: Multiple boxes (N, 4)
 
 
 
 
 
 
 
236
 
237
- Returns:
238
- IoU values (N,)
239
- """
240
- # Compute intersection
241
- x1 = np.maximum(box[0], boxes[:, 0])
242
- y1 = np.maximum(box[1], boxes[:, 1])
243
- x2 = np.minimum(box[2], boxes[:, 2])
244
- y2 = np.minimum(box[3], boxes[:, 3])
 
 
245
 
246
- intersection = np.maximum(0, x2 - x1) * np.maximum(0, y2 - y1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
- # Compute union
249
- box_area = (box[2] - box[0]) * (box[3] - box[1])
250
- boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
251
- union = box_area + boxes_area - intersection
 
 
 
 
 
 
 
 
 
252
 
253
- # Compute IoU
254
- iou = intersection / (union + 1e-6)
255
- return iou
256
 
257
- def __call__(
258
  self,
259
- images: list[np.ndarray],
260
- frame_ids: Optional[list[int]] = None,
 
261
  ) -> list[TVFrameResult]:
262
  """
263
- Run detection on a batch of images.
264
-
265
- Args:
266
- images: List of BGR images
267
- frame_ids: Optional frame IDs
268
-
269
- Returns:
270
- List of detection results
271
  """
272
- if frame_ids is None:
273
- frame_ids = list(range(len(images)))
274
-
275
- results = []
276
- for image, frame_id in zip(images, frame_ids):
277
- # Preprocess
278
- input_tensor = self.preprocess(image)
279
-
280
- # Run inference
281
- outputs = self.session.run(
282
- self.output_names,
283
- {self.input_name: input_tensor}
284
- )
285
 
286
- # Post-process
287
- boxes = self.postprocess(outputs, image.shape[:2])
288
 
289
- # Create result
290
- result = TVFrameResult(
291
- frame_id=frame_id,
292
- boxes=boxes,
293
- keypoints=[] # Empty for detection tasks
 
294
  )
295
- results.append(result)
296
 
297
  return results
 
1
  """
2
+ Improved Beverage Detection Miner
3
+ Goal: Beat 5.9% baseline and reach 90% target score
4
 
5
+ Key Improvements over baseline:
6
+ 1. Better preprocessing (normalization, color correction)
7
+ 2. Optimized confidence thresholds per class
8
+ 3. Advanced NMS with class-aware IoU
9
+ 4. Test-time augmentation support
10
+ 5. Better post-processing filters
11
  """
12
 
13
  from pathlib import Path
14
+ import math
15
  from typing import Optional
16
+
17
  import cv2
18
  import numpy as np
19
  import onnxruntime as ort
20
+ from numpy import ndarray
21
  from pydantic import BaseModel
22
 
23
 
24
  class BoundingBox(BaseModel):
 
25
  x1: int
26
  y1: int
27
  x2: int
 
31
 
32
 
33
  class TVFrameResult(BaseModel):
 
34
  frame_id: int
35
  boxes: list[BoundingBox]
36
+ keypoints: list[tuple[int, int]]
37
 
38
 
39
  class Miner:
40
  """
41
+ Enhanced beverage detection miner with improved accuracy.
 
 
 
42
  """
43
 
44
  def __init__(self, path_hf_repo: Path) -> None:
 
 
 
 
 
 
45
  self.path_hf_repo = path_hf_repo
46
+ self.class_names = ['bottle', 'can', 'cup']
 
 
 
 
47
 
48
  # Initialize ONNX session with optimizations
49
  sess_options = ort.SessionOptions()
 
51
  sess_options.intra_op_num_threads = 4
52
  sess_options.inter_op_num_threads = 4
53
 
 
 
54
  self.session = ort.InferenceSession(
55
+ str(path_hf_repo / "weights.onnx"),
56
  sess_options=sess_options,
57
  providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
58
  )
59
 
60
  self.input_name = self.session.get_inputs()[0].name
61
+ input_shape = self.session.get_inputs()[0].shape
62
+
63
+ # Expected [N, C, H, W]
64
+ self.input_h = int(input_shape[2])
65
+ self.input_w = int(input_shape[3])
66
+
67
+ # Class-specific confidence thresholds (tuned for better performance)
68
+ # These should be tuned based on validation set performance
69
+ self.class_conf_thresholds = {
70
+ 0: 0.28, # bottle - slightly higher (common class)
71
+ 1: 0.25, # can - standard
72
+ 2: 0.30, # cup - higher (harder to detect)
73
+ }
74
+
75
+ # Default confidence threshold
76
+ self.conf_threshold = 0.25
77
+
78
+ # Class-specific IoU thresholds for NMS
79
+ self.class_iou_thresholds = {
80
+ 0: 0.45, # bottle
81
+ 1: 0.40, # can - allow more overlap (cans pack together)
82
+ 2: 0.45, # cup
83
+ }
84
 
85
+ # Default IoU threshold
86
+ self.iou_threshold = 0.45
 
87
 
88
+ # Enable test-time augmentation for better accuracy (if latency allows)
89
+ self.enable_tta = False # Set to True if inference time < 100ms
90
+
91
+ # Minimum box area filter (remove tiny detections)
92
+ self.min_box_area = 100 # pixels squared
93
+
94
+ # Maximum box area filter (remove unreasonably large detections)
95
+ self.max_box_area_ratio = 0.8 # 80% of image area
96
 
97
  def __repr__(self) -> str:
98
  return (
99
+ f"Enhanced ONNX Beverage Miner\n"
100
+ f"Session: {type(self.session).__name__}\n"
101
  f"Classes: {self.class_names}\n"
102
+ f"Input Size: {self.input_w}x{self.input_h}\n"
103
+ f"TTA Enabled: {self.enable_tta}"
104
  )
105
 
106
def _preprocess(self, image_bgr: ndarray, apply_clahe: bool = False) -> tuple[np.ndarray, tuple[int, int]]:
    """Turn a BGR frame into the tensor fed to the ONNX session.

    Args:
        image_bgr: Image array in OpenCV BGR channel order.
        apply_clahe: When true, equalise the lightness channel with CLAHE
            (in LAB colour space) before anything else is done.

    Returns:
        ``(tensor, (height, width))`` where ``tensor`` is float32 in
        ``[1, C, H, W]`` layout, divided by 255, and ``(height, width)`` are
        the dimensions of the image as it was passed in.
    """
    height, width = image_bgr.shape[:2]

    if apply_clahe:
        # Only the lightness channel is equalised; the colour channels are
        # merged back untouched.
        lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2LAB))
        equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        merged = cv2.merge([equalizer.apply(lightness), chan_a, chan_b])
        image_bgr = cv2.cvtColor(merged, cv2.COLOR_LAB2BGR)

    # Aspect-preserving resize onto a padded canvas of the model input size.
    canvas = self._letterbox_resize(
        cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB),
        (self.input_w, self.input_h),
    )

    # HWC uint8 -> scaled float32 -> CHW -> add the leading batch axis.
    scaled = canvas.astype(np.float32) / 255.0
    tensor = np.expand_dims(scaled.transpose(2, 0, 1), axis=0)

    return tensor, (height, width)
131
 
132
def _letterbox_resize(self, image: ndarray, target_size: tuple[int, int]) -> ndarray:
    """Resize ``image`` to fit ``target_size`` without distorting it.

    The image is scaled by a single factor so that it fits inside the target
    rectangle, then pasted in the centre of a canvas filled with the value
    114.

    Args:
        image: Image array of shape ``(H, W, 3)``.
        target_size: Desired output size as ``(width, height)``.

    Returns:
        A ``uint8`` array of shape ``(target_height, target_width, 3)``.
    """
    target_w, target_h = target_size
    h, w = image.shape[:2]

    # One scale factor for both axes keeps object proportions intact.
    scale = min(target_w / w, target_h / h)

    # int() truncates, so a very thin or very flat image could end up with a
    # zero-sized side, which cv2.resize rejects. Keep at least one pixel.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))

    resized = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

    # Canvas pre-filled with the padding value.
    padded = np.full((target_h, target_w, 3), 114, dtype=np.uint8)

    # Split the leftover space evenly; any odd pixel goes to the far side.
    pad_w = (target_w - new_w) // 2
    pad_h = (target_h - new_h) // 2

    padded[pad_h:pad_h + new_h, pad_w:pad_w + new_w] = resized

    return padded
159
 
160
+ def _normalize_predictions(self, raw: np.ndarray) -> np.ndarray:
161
+ """Normalize prediction tensor to [N, C] format."""
162
+ pred = raw[0]
163
+ if pred.ndim != 2:
164
+ raise ValueError(f"Unexpected prediction shape: {raw.shape}")
165
+
166
+ # Ensure shape is [N, C] where C = 4 + num_classes
167
+ if pred.shape[0] < pred.shape[1]:
168
+ pred = pred.transpose(1, 0)
169
+
170
+ return pred
171
+
172
+ def _nms_class_aware(
173
  self,
174
+ dets: list[tuple[float, float, float, float, float, int]]
175
+ ) -> list[tuple[float, float, float, float, float, int]]:
 
 
176
  """
177
+ Class-aware NMS with per-class IoU thresholds.
178
+ Better than standard NMS for multi-class detection.
 
 
 
 
 
 
 
179
  """
180
+ if not dets:
181
+ return []
182
+
183
+ # Group detections by class
184
+ class_dets = {}
185
+ for det in dets:
186
+ cls_id = det[5]
187
+ if cls_id not in class_dets:
188
+ class_dets[cls_id] = []
189
+ class_dets[cls_id].append(det)
190
+
191
+ # Apply NMS per class
192
+ final_dets = []
193
+ for cls_id, cls_boxes in class_dets.items():
194
+ iou_thresh = self.class_iou_thresholds.get(cls_id, self.iou_threshold)
195
+ kept = self._nms(cls_boxes, iou_thresh)
196
+ final_dets.extend(kept)
197
+
198
+ return final_dets
199
+
200
+ def _nms(
201
+ self,
202
+ dets: list[tuple[float, float, float, float, float, int]],
203
+ iou_threshold: Optional[float] = None
204
+ ) -> list[tuple[float, float, float, float, float, int]]:
205
+ """Standard NMS implementation."""
206
+ if not dets:
207
+ return []
208
+
209
+ if iou_threshold is None:
210
+ iou_threshold = self.iou_threshold
211
 
212
+ boxes = np.array([[d[0], d[1], d[2], d[3]] for d in dets], dtype=np.float32)
213
+ scores = np.array([d[4] for d in dets], dtype=np.float32)
214
+ order = scores.argsort()[::-1]
215
  keep = []
216
+
217
+ while order.size > 0:
218
+ i = order[0]
219
+ keep.append(i)
220
 
221
+ if order.size == 1:
222
  break
223
 
224
+ xx1 = np.maximum(boxes[i, 0], boxes[order[1:], 0])
225
+ yy1 = np.maximum(boxes[i, 1], boxes[order[1:], 1])
226
+ xx2 = np.minimum(boxes[i, 2], boxes[order[1:], 2])
227
+ yy2 = np.minimum(boxes[i, 3], boxes[order[1:], 3])
228
+
229
+ w = np.maximum(0.0, xx2 - xx1)
230
+ h = np.maximum(0.0, yy2 - yy1)
231
+ inter = w * h
232
+
233
+ area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
234
+ area_rest = (boxes[order[1:], 2] - boxes[order[1:], 0]) * (boxes[order[1:], 3] - boxes[order[1:], 1])
235
+ union = np.maximum(area_i + area_rest - inter, 1e-6)
236
+ iou = inter / union
237
+
238
+ remaining = np.where(iou <= iou_threshold)[0]
239
+ order = order[remaining + 1]
240
+
241
+ return [dets[idx] for idx in keep]
242
+
243
+ def _filter_boxes(
244
+ self,
245
+ boxes: list[tuple[float, float, float, float, float, int]],
246
+ orig_w: int,
247
+ orig_h: int
248
+ ) -> list[tuple[float, float, float, float, float, int]]:
249
+ """Filter out unreasonable detections."""
250
+ filtered = []
251
+ max_area = orig_w * orig_h * self.max_box_area_ratio
252
+
253
+ for x1, y1, x2, y2, conf, cls_id in boxes:
254
+ # Calculate box area
255
+ area = (x2 - x1) * (y2 - y1)
256
 
257
+ # Filter by area
258
+ if area < self.min_box_area or area > max_area:
259
+ continue
260
 
261
+ # Filter by aspect ratio (beverages shouldn't be too extreme)
262
+ width = x2 - x1
263
+ height = y2 - y1
264
+ aspect_ratio = width / max(height, 1)
265
+
266
+ # Beverages typically have aspect ratio between 0.3 and 3.0
267
+ if aspect_ratio < 0.2 or aspect_ratio > 4.0:
268
+ continue
269
+
270
+ filtered.append((x1, y1, x2, y2, conf, cls_id))
271
 
272
+ return filtered
273
 
274
def _infer_single(self, image_bgr: ndarray) -> list[BoundingBox]:
    """Detect objects in one BGR frame.

    Steps: preprocess, run the ONNX session, pick the best class per
    candidate, apply the per-class confidence thresholds, map boxes back to
    the original frame, filter implausible boxes, run class-aware NMS, and
    clamp the result to the frame bounds.

    The box mapping undoes the letterbox applied by ``_preprocess``: it
    subtracts the centring padding and divides by the resize gain. Scaling
    by ``orig / input`` per axis would only be right for a plain stretch
    resize and misplaces every box when the aspect ratios differ.

    NOTE(review): assumes the model emits boxes as (cx, cy, w, h) in input
    pixel units followed by one score column per class; confirm against the
    exported model.

    Args:
        image_bgr: Image array in OpenCV BGR channel order.

    Returns:
        Detections with integer corner coordinates inside the frame and
        confidences clamped to ``[0, 1]``; empty when nothing qualifies.
    """
    inp, (orig_h, orig_w) = self._preprocess(image_bgr)
    out = self.session.run(None, {self.input_name: inp})[0]
    pred = self._normalize_predictions(out)

    # Four box columns plus at least one class-score column are required.
    if pred.shape[1] < 5:
        return []

    boxes = pred[:, :4]
    cls_scores = pred[:, 4:]
    cls_ids = np.argmax(cls_scores, axis=1)
    confs = np.max(cls_scores, axis=1)

    # Per-class confidence gate. Candidates whose best class lies outside
    # self.class_names never match and are therefore dropped.
    keep = np.zeros(len(confs), dtype=bool)
    for cls_id in range(len(self.class_names)):
        threshold = self.class_conf_thresholds.get(cls_id, self.conf_threshold)
        keep |= (cls_ids == cls_id) & (confs >= threshold)

    boxes, confs, cls_ids = boxes[keep], confs[keep], cls_ids[keep]
    if boxes.shape[0] == 0:
        return []

    # Rebuild the letterbox geometry from the same arithmetic the resize
    # step uses: one scale factor, truncated sizes, centred padding.
    scale = min(self.input_w / orig_w, self.input_h / orig_h)
    new_w = max(1, int(orig_w * scale))
    new_h = max(1, int(orig_h * scale))
    pad_x = (self.input_w - new_w) // 2
    pad_y = (self.input_h - new_h) // 2
    # Use the gains actually achieved after truncation, not the ideal scale.
    gain_x = new_w / orig_w
    gain_y = new_h / orig_h

    dets: list[tuple[float, float, float, float, float, int]] = []
    for (cx, cy, bw, bh), conf, cls_id in zip(
        boxes.tolist(), confs.tolist(), cls_ids.tolist()
    ):
        half_w = bw / 2.0
        half_h = bh / 2.0
        x1 = (cx - half_w - pad_x) / gain_x
        y1 = (cy - half_h - pad_y) / gain_y
        x2 = (cx + half_w - pad_x) / gain_x
        y2 = (cy + half_h - pad_y) / gain_y
        dets.append((x1, y1, x2, y2, float(conf), int(cls_id)))

    # Size and aspect-ratio checks, then suppression within each class.
    dets = self._filter_boxes(dets, orig_w, orig_h)
    dets = self._nms_class_aware(dets)

    out_boxes: list[BoundingBox] = []
    for x1, y1, x2, y2, conf, cls_id in dets:
        # Round outwards (floor/ceil) and clamp to the frame.
        out_boxes.append(
            BoundingBox(
                x1=max(0, min(orig_w, math.floor(x1))),
                y1=max(0, min(orig_h, math.floor(y1))),
                x2=max(0, min(orig_w, math.ceil(x2))),
                y2=max(0, min(orig_h, math.ceil(y2))),
                cls_id=cls_id,
                conf=max(0.0, min(1.0, conf)),
            )
        )

    return out_boxes
345
+
346
def _infer_with_tta(self, image_bgr: ndarray) -> list[BoundingBox]:
    """Detect with horizontal-flip test-time augmentation.

    Runs ``_infer_single`` on the frame and on its horizontally mirrored
    copy, mirrors the second set of boxes back into the original frame,
    pools both sets and removes duplicates with class-aware NMS.

    Args:
        image_bgr: Image array in OpenCV BGR channel order.

    Returns:
        The merged detections as ``BoundingBox`` objects.
    """
    direct = self._infer_single(image_bgr)
    mirrored = self._infer_single(cv2.flip(image_bgr, 1))

    # Undo the mirror: x -> width - x, which also swaps left and right edges.
    _, frame_w = image_bgr.shape[:2]
    for box in mirrored:
        box.x1, box.x2 = frame_w - box.x2, frame_w - box.x1

    # The NMS helpers work on plain tuples, not on BoundingBox models.
    pooled = [
        (
            float(box.x1), float(box.y1),
            float(box.x2), float(box.y2),
            float(box.conf), int(box.cls_id),
        )
        for box in direct + mirrored
    ]

    return [
        BoundingBox(
            x1=int(x1), y1=int(y1),
            x2=int(x2), y2=int(y2),
            cls_id=cls_id, conf=conf,
        )
        for x1, y1, x2, y2, conf, cls_id in self._nms_class_aware(pooled)
    ]
 
 
387
 
388
def predict_batch(
    self,
    batch_images: list[ndarray],
    offset: int,
    n_keypoints: int,
) -> list[TVFrameResult]:
    """Run detection on every image of a batch.

    Args:
        batch_images: Frames to process, in order.
        offset: Frame id assigned to the first image; later images get
            consecutive ids.
        n_keypoints: Number of ``(0, 0)`` placeholder keypoints attached to
            each result. Values below zero are treated as zero.

    Returns:
        One ``TVFrameResult`` per input image, in input order.
    """
    results: list[TVFrameResult] = []

    for position, frame in enumerate(batch_images):
        # Flip-augmented inference is used only when the flag is on.
        detect = self._infer_with_tta if self.enable_tta else self._infer_single
        detections = detect(frame)

        # No keypoints are predicted, so each result gets placeholders.
        placeholders = [(0, 0)] * max(0, int(n_keypoints))

        results.append(
            TVFrameResult(
                frame_id=offset + position,
                boxes=detections,
                keypoints=placeholders,
            )
        )

    return results