Ichiro1007 committed on
Commit
b3f968a
·
verified ·
1 Parent(s): da98178

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. chute_config.yml +17 -0
  2. class_names.txt +3 -0
  3. miner.py +417 -0
  4. weights.onnx +3 -0
chute_config.yml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Image:
2
+ from_base: parachutes/python:3.12
3
+ run_command:
4
+ - pip install --upgrade setuptools wheel
5
+ - pip install 'numpy>=1.23' 'onnxruntime-gpu>=1.16' 'opencv-python-headless>=4.7' 'pillow>=9.5' 'pydantic>=2.0'
6
+ set_workdir: /app
7
+
8
+ NodeSelector:
9
+ gpu_count: 1
10
+ min_vram_gb_per_gpu: 8
11
+ gpu_type: "T4"
12
+
13
+ Chute:
14
+ shutdown_after_seconds: 300
15
+ concurrency: 4
16
+ max_instances: 5
17
+ scaling_threshold: 0.5
class_names.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ bottle
2
+ can
3
+ cup
miner.py ADDED
@@ -0,0 +1,417 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Improved Beverage Detection Miner
3
+ Goal: Beat 5.9% baseline and reach 90% target score
4
+
5
+ Key Improvements over baseline:
6
+ 1. Better preprocessing (letterbox resize, [0, 1] normalization, optional CLAHE contrast enhancement)
7
+ 2. Optimized confidence thresholds per class
8
+ 3. Advanced NMS with class-aware IoU
9
+ 4. Test-time augmentation support
10
+ 5. Better post-processing filters
11
+ """
12
+
13
+ from pathlib import Path
14
+ import math
15
+ from typing import Optional
16
+
17
+ import cv2
18
+ import numpy as np
19
+ import onnxruntime as ort
20
+ from numpy import ndarray
21
+ from pydantic import BaseModel
22
+
23
+
24
class BoundingBox(BaseModel):
    """One detection: integer corner coordinates, a class index and a score."""

    # Corner coordinates; the miner emits them in original-image pixels.
    x1: int
    y1: int
    x2: int
    y2: int
    # Index of the predicted class.
    cls_id: int
    # Detection confidence score.
    conf: float
31
+
32
+
33
class TVFrameResult(BaseModel):
    """Per-frame prediction payload: frame index, detections and keypoints."""

    # Index of the frame this result belongs to.
    frame_id: int
    # Detections found in the frame.
    boxes: list[BoundingBox]
    # (x, y) keypoint pairs attached to the frame.
    keypoints: list[tuple[int, int]]
37
+
38
+
39
class Miner:
    """
    ONNX-based beverage detector.

    Per-frame pipeline: letterbox preprocessing -> ONNX inference -> decoding
    with per-class confidence thresholds -> mapping boxes back to
    original-image pixels (undoing the letterbox) -> plausibility filtering ->
    per-class NMS -> conversion to ``BoundingBox`` objects.
    """

    # Class list used when the repo does not ship a usable class_names.txt.
    DEFAULT_CLASS_NAMES = ("bottle", "can", "cup")
    # Input edge length assumed when the model declares a dynamic H or W.
    DEFAULT_INPUT_SIZE = 640
    # Gray value used for the letterbox padding.
    LETTERBOX_FILL = 114
    # Accepted width/height ratio range for a detection.
    MIN_ASPECT_RATIO = 0.2
    MAX_ASPECT_RATIO = 4.0

    # (x1, y1, x2, y2, confidence, class_id) in original-image pixels.
    Detection = tuple[float, float, float, float, float, int]

    def __init__(self, path_hf_repo: Path) -> None:
        """
        Load the ONNX model and set detection hyper-parameters.

        Args:
            path_hf_repo: Directory containing ``weights.onnx`` and,
                optionally, ``class_names.txt`` (one class name per line).
        """
        self.path_hf_repo = path_hf_repo
        repo_dir = Path(path_hf_repo)
        self.class_names = self._load_class_names(repo_dir)

        # Initialize ONNX session with optimizations
        sess_options = ort.SessionOptions()
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        sess_options.intra_op_num_threads = 4
        sess_options.inter_op_num_threads = 4

        self.session = ort.InferenceSession(
            str(repo_dir / "weights.onnx"),
            sess_options=sess_options,
            providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
        )

        model_input = self.session.get_inputs()[0]
        self.input_name = model_input.name

        # Expected layout is [N, C, H, W]. Dynamic axes are reported as
        # strings/None, so fall back to a default size instead of crashing.
        self.input_h = self._static_dim(model_input.shape, 2)
        self.input_w = self._static_dim(model_input.shape, 3)

        # Class-specific confidence thresholds; should be tuned on a
        # validation set.
        self.class_conf_thresholds = {
            0: 0.28,  # bottle - slightly higher (common class)
            1: 0.25,  # can - standard
            2: 0.30,  # cup - higher (harder to detect)
        }

        # Confidence threshold for classes without a specific entry
        self.conf_threshold = 0.25

        # Class-specific IoU thresholds for NMS
        self.class_iou_thresholds = {
            0: 0.45,  # bottle
            1: 0.40,  # can
            2: 0.45,  # cup
        }

        # IoU threshold for classes without a specific entry
        self.iou_threshold = 0.45

        # Test-time augmentation (horizontal flip); doubles inference cost.
        self.enable_tta = False

        # Minimum box area filter (remove tiny detections), pixels squared
        self.min_box_area = 100

        # Maximum box area as a fraction of the image area
        self.max_box_area_ratio = 0.8

    def __repr__(self) -> str:
        return (
            f"Enhanced ONNX Beverage Miner\n"
            f"Session: {type(self.session).__name__}\n"
            f"Classes: {self.class_names}\n"
            f"Input Size: {self.input_w}x{self.input_h}\n"
            f"TTA Enabled: {self.enable_tta}"
        )

    @classmethod
    def _load_class_names(cls, repo_dir: Path) -> list[str]:
        """Read class names from ``class_names.txt``; fall back to defaults."""
        try:
            text = (repo_dir / "class_names.txt").read_text(encoding="utf-8")
        except OSError:
            return list(cls.DEFAULT_CLASS_NAMES)
        names = [line.strip() for line in text.splitlines() if line.strip()]
        return names or list(cls.DEFAULT_CLASS_NAMES)

    @classmethod
    def _static_dim(cls, shape, axis: int) -> int:
        """Return ``shape[axis]`` as a positive int, or the default size."""
        try:
            dim = int(shape[axis])
        except (IndexError, TypeError, ValueError):
            return cls.DEFAULT_INPUT_SIZE
        return dim if dim > 0 else cls.DEFAULT_INPUT_SIZE

    @staticmethod
    def _letterbox_params(
        src_w: int, src_h: int, target_w: int, target_h: int
    ) -> tuple[int, int, int, int]:
        """
        Compute the letterbox geometry for a source image.

        Returns:
            ``(new_w, new_h, pad_w, pad_h)``: size of the resized image and
            the left/top padding that centres it on the target canvas.

        Raises:
            ValueError: If the source image has a non-positive dimension.
        """
        if src_w <= 0 or src_h <= 0:
            raise ValueError(f"Invalid image size: {src_w}x{src_h}")
        scale = min(target_w / src_w, target_h / src_h)
        # round + clamp: truncation could yield 0 or fall one pixel short.
        new_w = max(1, min(target_w, round(src_w * scale)))
        new_h = max(1, min(target_h, round(src_h * scale)))
        pad_w = (target_w - new_w) // 2
        pad_h = (target_h - new_h) // 2
        return new_w, new_h, pad_w, pad_h

    def _preprocess(self, image_bgr: ndarray, apply_clahe: bool = False) -> tuple[np.ndarray, tuple[int, int]]:
        """
        Convert a BGR image into the model input tensor.

        Args:
            image_bgr: Source image in BGR channel order.
            apply_clahe: Apply CLAHE to the LAB lightness channel first.

        Returns:
            ``(tensor, (orig_h, orig_w))`` where ``tensor`` is float32 in
            [0, 1] with layout [1, 3, input_h, input_w].
        """
        h, w = image_bgr.shape[:2]

        if apply_clahe:
            lab = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2LAB)
            lightness, chan_a, chan_b = cv2.split(lab)
            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
            lab = cv2.merge([clahe.apply(lightness), chan_a, chan_b])
            image_bgr = cv2.cvtColor(lab, cv2.COLOR_LAB2BGR)

        rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
        padded = self._letterbox_resize(rgb, (self.input_w, self.input_h))

        # Normalize to [0, 1], then HWC -> NCHW as one contiguous buffer.
        x = padded.astype(np.float32) / 255.0
        x = np.ascontiguousarray(np.transpose(x, (2, 0, 1))[None, ...])

        return x, (h, w)

    def _letterbox_resize(self, image: ndarray, target_size: tuple[int, int]) -> ndarray:
        """
        Resize with preserved aspect ratio and pad to ``target_size``.

        Args:
            image: 3-channel uint8 image.
            target_size: ``(width, height)`` of the output canvas.

        Returns:
            The padded image of shape ``(height, width, 3)``.
        """
        target_w, target_h = target_size
        src_h, src_w = image.shape[:2]
        new_w, new_h, pad_w, pad_h = self._letterbox_params(src_w, src_h, target_w, target_h)

        resized = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
        canvas = np.full((target_h, target_w, 3), self.LETTERBOX_FILL, dtype=np.uint8)
        canvas[pad_h:pad_h + new_h, pad_w:pad_w + new_w] = resized
        return canvas

    def _normalize_predictions(self, raw: np.ndarray) -> np.ndarray:
        """
        Return the first batch item of ``raw`` as an ``[N, C]`` matrix.

        ``C`` is expected to be ``4 + number of classes``. The axis matching
        that count is treated as the channel axis; if neither axis matches,
        the shorter axis is assumed to be the channel axis.

        Raises:
            ValueError: If the first batch item is not 2-D.
        """
        pred = raw[0]
        if pred.ndim != 2:
            raise ValueError(f"Unexpected prediction shape: {raw.shape}")

        expected_channels = 4 + len(self.class_names)
        if pred.shape[1] == expected_channels:
            return pred
        if pred.shape[0] == expected_channels:
            return pred.transpose(1, 0)

        # Unknown channel count: fall back to the size heuristic.
        if pred.shape[0] < pred.shape[1]:
            pred = pred.transpose(1, 0)
        return pred

    def _decode_predictions(
        self, pred: np.ndarray, orig_w: int, orig_h: int
    ) -> list[Detection]:
        """
        Turn an ``[N, 4 + classes]`` matrix into detections in image pixels.

        Rows are ``(cx, cy, w, h, score_0, score_1, ...)`` in letterboxed
        input coordinates. Rows below their class confidence threshold, or
        whose best class is not in ``class_names``, are dropped. Surviving
        boxes have the letterbox padding removed, are rescaled to the
        original image and clipped to its bounds.
        """
        n_classes = len(self.class_names)
        if pred.shape[1] < 5 or n_classes == 0:
            return []

        cls_scores = pred[:, 4:]
        cls_ids = np.argmax(cls_scores, axis=1)
        confs = np.max(cls_scores, axis=1)

        thresholds = np.array(
            [self.class_conf_thresholds.get(c, self.conf_threshold) for c in range(n_classes)],
            dtype=confs.dtype,
        )
        known = cls_ids < n_classes
        # Index 0 is a placeholder for unknown classes; `known` masks them out.
        keep = known & (confs >= thresholds[np.where(known, cls_ids, 0)])
        if not keep.any():
            return []

        boxes = pred[keep, :4].astype(np.float64)
        confs = confs[keep]
        cls_ids = cls_ids[keep]

        # Invert the letterbox: subtract padding, then undo the resize.
        new_w, new_h, pad_w, pad_h = self._letterbox_params(
            orig_w, orig_h, self.input_w, self.input_h
        )
        gain_x = orig_w / new_w
        gain_y = orig_h / new_h
        half_w = boxes[:, 2] / 2.0
        half_h = boxes[:, 3] / 2.0
        x1 = np.clip((boxes[:, 0] - half_w - pad_w) * gain_x, 0.0, orig_w)
        y1 = np.clip((boxes[:, 1] - half_h - pad_h) * gain_y, 0.0, orig_h)
        x2 = np.clip((boxes[:, 0] + half_w - pad_w) * gain_x, 0.0, orig_w)
        y2 = np.clip((boxes[:, 1] + half_h - pad_h) * gain_y, 0.0, orig_h)

        return [
            (float(a), float(b), float(c), float(d), float(s), int(k))
            for a, b, c, d, s, k in zip(
                x1.tolist(), y1.tolist(), x2.tolist(), y2.tolist(),
                confs.tolist(), cls_ids.tolist(),
            )
        ]

    def _nms_class_aware(
        self,
        dets: list[Detection],
    ) -> list[Detection]:
        """
        Run NMS separately per class with that class's IoU threshold.

        Returns:
            Surviving detections sorted by descending confidence.
        """
        if not dets:
            return []

        by_class: dict[int, list] = {}
        for det in dets:
            by_class.setdefault(det[5], []).append(det)

        survivors = []
        for cls_id, cls_dets in by_class.items():
            iou_thresh = self.class_iou_thresholds.get(cls_id, self.iou_threshold)
            survivors.extend(self._nms(cls_dets, iou_thresh))

        survivors.sort(key=lambda det: det[4], reverse=True)
        return survivors

    def _nms(
        self,
        dets: list[Detection],
        iou_threshold: Optional[float] = None,
    ) -> list[Detection]:
        """
        Greedy non-maximum suppression.

        Args:
            dets: Candidate detections (class ids are ignored here).
            iou_threshold: Boxes overlapping a kept box by more than this IoU
                are suppressed; defaults to ``self.iou_threshold``.

        Returns:
            Kept detections in descending score order.
        """
        if not dets:
            return []

        if iou_threshold is None:
            iou_threshold = self.iou_threshold

        boxes = np.array([det[:4] for det in dets], dtype=np.float32)
        scores = np.array([det[4] for det in dets], dtype=np.float32)
        areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
        order = scores.argsort()[::-1]
        keep: list[int] = []

        while order.size > 0:
            best = int(order[0])
            keep.append(best)
            rest = order[1:]
            if rest.size == 0:
                break

            inter_w = np.maximum(
                0.0,
                np.minimum(boxes[best, 2], boxes[rest, 2]) - np.maximum(boxes[best, 0], boxes[rest, 0]),
            )
            inter_h = np.maximum(
                0.0,
                np.minimum(boxes[best, 3], boxes[rest, 3]) - np.maximum(boxes[best, 1], boxes[rest, 1]),
            )
            inter = inter_w * inter_h
            # Floor the union to avoid division by zero on degenerate boxes.
            union = np.maximum(areas[best] + areas[rest] - inter, 1e-6)
            order = rest[inter / union <= iou_threshold]

        return [dets[idx] for idx in keep]

    def _filter_boxes(
        self,
        boxes: list[Detection],
        orig_w: int,
        orig_h: int,
    ) -> list[Detection]:
        """
        Drop implausible detections.

        A box is removed when it is degenerate (non-positive width or
        height), smaller than ``min_box_area``, larger than
        ``max_box_area_ratio`` of the image, or has a width/height ratio
        outside [MIN_ASPECT_RATIO, MAX_ASPECT_RATIO].
        """
        max_area = orig_w * orig_h * self.max_box_area_ratio
        plausible = []

        for x1, y1, x2, y2, conf, cls_id in boxes:
            width = x2 - x1
            height = y2 - y1
            if width <= 0 or height <= 0:
                continue

            area = width * height
            if area < self.min_box_area or area > max_area:
                continue

            aspect_ratio = width / height
            if aspect_ratio < self.MIN_ASPECT_RATIO or aspect_ratio > self.MAX_ASPECT_RATIO:
                continue

            plausible.append((x1, y1, x2, y2, conf, cls_id))

        return plausible

    def _detect(self, image_bgr: ndarray) -> list[Detection]:
        """Run the full pipeline on one image; return float detections."""
        inp, (orig_h, orig_w) = self._preprocess(image_bgr)
        raw = self.session.run(None, {self.input_name: inp})[0]
        pred = self._normalize_predictions(raw)
        dets = self._decode_predictions(pred, orig_w, orig_h)
        dets = self._filter_boxes(dets, orig_w, orig_h)
        return self._nms_class_aware(dets)

    @staticmethod
    def _to_bounding_boxes(
        dets: list[Detection], orig_w: int, orig_h: int
    ) -> "list[BoundingBox]":
        """Quantize detections outward to integer pixels, clamped to the image."""
        out_boxes = []
        for x1, y1, x2, y2, conf, cls_id in dets:
            out_boxes.append(
                BoundingBox(
                    x1=max(0, min(orig_w, math.floor(x1))),
                    y1=max(0, min(orig_h, math.floor(y1))),
                    x2=max(0, min(orig_w, math.ceil(x2))),
                    y2=max(0, min(orig_h, math.ceil(y2))),
                    cls_id=cls_id,
                    conf=max(0.0, min(1.0, conf)),
                )
            )
        return out_boxes

    def _infer_single(self, image_bgr: ndarray) -> "list[BoundingBox]":
        """Detect objects in a single BGR image."""
        orig_h, orig_w = image_bgr.shape[:2]
        return self._to_bounding_boxes(self._detect(image_bgr), orig_w, orig_h)

    def _infer_with_tta(self, image_bgr: ndarray) -> "list[BoundingBox]":
        """
        Detect with horizontal-flip test-time augmentation.

        Runs the detector on the image and on its mirror, maps the mirrored
        boxes back, merges both sets with class-aware NMS and only then
        quantizes to integer pixels.
        """
        orig_h, orig_w = image_bgr.shape[:2]

        merged = list(self._detect(image_bgr))
        for x1, y1, x2, y2, conf, cls_id in self._detect(cv2.flip(image_bgr, 1)):
            # Mirroring swaps the roles of the left and right edges.
            merged.append((orig_w - x2, y1, orig_w - x1, y2, conf, cls_id))

        return self._to_bounding_boxes(self._nms_class_aware(merged), orig_w, orig_h)

    def predict_batch(
        self,
        batch_images: list[ndarray],
        offset: int,
        n_keypoints: int,
    ) -> "list[TVFrameResult]":
        """
        Predict on a batch of images.

        Args:
            batch_images: BGR frames; a ``None`` or empty frame yields a
                result with no boxes instead of failing the whole batch.
            offset: Frame id of the first image; later images count up.
            n_keypoints: Number of placeholder ``(0, 0)`` keypoints per frame
                (this task produces no real keypoints).

        Returns:
            One ``TVFrameResult`` per input image, in input order.
        """
        n_placeholders = max(0, int(n_keypoints))
        infer = self._infer_with_tta if self.enable_tta else self._infer_single

        results = []
        for idx, image in enumerate(batch_images):
            if image is None or image.size == 0:
                boxes = []
            else:
                boxes = infer(image)

            results.append(
                TVFrameResult(
                    frame_id=offset + idx,
                    boxes=boxes,
                    keypoints=[(0, 0)] * n_placeholders,
                )
            )

        return results
weights.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc74e293a63ffe89f1a35a182f2d36cfbcfdf8138c53a45df0e3a6c9fb028d10
3
+ size 29080002