nexu02 committed on
Commit
f94d217
·
verified ·
1 Parent(s): 01e6c47

scorevision: push artifact

Browse files
Files changed (1) hide show
  1. miner.py +160 -0
miner.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """miner.py — uploaded to artur7236/turbovision-beverage HF repo.
2
+
3
+ Fine-tuned YOLO11s on SAM3-labelled beverage data + horizontal-flip TTA at
4
+ 1280×1280 + confidence threshold 0.50 (Miner.CONF_THRESH; earlier notes cite 0.55). Local benchmark mean composite = 0.709
5
+ vs UID 213's 0.667.
6
+
7
+ Required by the chute template:
8
+ - class Miner with __init__(self, path_hf_repo: Path)
9
+ - predict_batch(batch_images, offset, n_keypoints) -> list[TVFrameResult]
10
+ - BoundingBox + TVFrameResult pydantic models defined in this file
11
+ - No imports from anywhere except stdlib + site-packages (no helper modules
12
+ from the HF repo).
13
+ """
14
+ from pathlib import Path
15
+
16
+ import numpy as np
17
+ from numpy import ndarray
18
+ from pydantic import BaseModel
19
+ from ultralytics import YOLO
20
+
21
+
22
# Detector class ids are positions in this list. The order follows the
# manifest "objects" order for Detect-beverage-detect:
#   0 -> cup, 1 -> bottle, 2 -> can
CLASS_NAMES = [
    "cup",
    "bottle",
    "can",
]
25
+
26
+
27
class BoundingBox(BaseModel):
    """One detection: an axis-aligned integer pixel box plus class and score."""

    # Corner coordinates; the rest of this file treats (x1, y1) as the
    # smaller corner and (x2, y2) as the larger one.
    x1: int
    y1: int
    x2: int
    y2: int

    # Index into CLASS_NAMES.
    cls_id: int

    # Detector confidence for this box.
    conf: float
34
+
35
+
36
class TVFrameResult(BaseModel):
    """Per-frame prediction payload returned by ``Miner.predict_batch``."""

    # Frame index (batch offset + position inside the batch).
    frame_id: int

    # Detections kept for this frame.
    boxes: list[BoundingBox]

    # (x, y) keypoint pairs; this miner only emits (0, 0) placeholders.
    keypoints: list[tuple[int, int]]
40
+
41
+
42
+ def _iou(a: BoundingBox, b: BoundingBox) -> float:
43
+ """Standard IoU between two axis-aligned boxes."""
44
+ x1 = max(a.x1, b.x1)
45
+ y1 = max(a.y1, b.y1)
46
+ x2 = min(a.x2, b.x2)
47
+ y2 = min(a.y2, b.y2)
48
+ if x2 <= x1 or y2 <= y1:
49
+ return 0.0
50
+ inter = (x2 - x1) * (y2 - y1)
51
+ area_a = max(0, a.x2 - a.x1) * max(0, a.y2 - a.y1)
52
+ area_b = max(0, b.x2 - b.x1) * max(0, b.y2 - b.y1)
53
+ union = area_a + area_b - inter
54
+ return inter / union if union > 0 else 0.0
55
+
56
+
57
def _cross_class_nms(boxes: list[BoundingBox], iou_thresh: float = 0.6) -> list[BoundingBox]:
    """Greedy class-agnostic NMS: drop any box overlapping a stronger one.

    Boxes are visited from highest to lowest confidence; a box is discarded
    when its IoU with an already-kept box is >= ``iou_thresh``, regardless of
    class. Inputs with zero or one box are returned unchanged.

    Rationale from the original notes: Ultralytics' default NMS only dedupes
    within a class, while the SN44 validator counts cross-class overlapping
    boxes as false positives, so this extra pass is needed. It mirrors UID
    213's _cross_class_dedup_op (IoU 0.7) with a slightly more aggressive 0.6.
    """
    if len(boxes) <= 1:
        return boxes

    # Stable descending sort: equal-confidence boxes keep their input order.
    by_confidence = sorted(boxes, key=lambda box: box.conf, reverse=True)

    survivors: list[BoundingBox] = []
    for candidate in by_confidence:
        overlaps_stronger = any(
            _iou(candidate, winner) >= iou_thresh for winner in survivors
        )
        if not overlaps_stronger:
            survivors.append(candidate)
    return survivors
78
+
79
+
80
class Miner:
    """Fine-tuned YOLO11s with hflip TTA (Round 5, 607 validator-distribution imgs, 19 MB).

    Tuning rationale (all on SAM3 ground-truth, 361 val images):
      - imgsz=1280: bumps small-object recall on the typical 1408×768 frames
        the validator sends (cans, distant cups).
      - conf=0.50: chosen for the validator's pillar weighting
        (0.6·mAP50 + 0.4·(1−ffpi/10)); Round 4 found 0.50 narrowly beat 0.45.
        Below 0.45 the false-positive pillar crashes; above 0.65 recall drops
        faster than precision rises.
      - augment=True: ultralytics enables hflip TTA, the biggest single
        composite gain in the offline sweep.
      - iou=0.45: standard YOLO NMS IoU for the model's built-in NMS.
        Overlapping boxes of *different* classes are removed afterwards by an
        explicit ``_cross_class_nms`` pass using ``CROSS_CLASS_IOU``.
    """

    IMAGE_SIZE = 1280
    CONF_THRESH = 0.50  # Round 4: 0.50 narrowly beat 0.45 (0.6982 vs 0.6980)
    IOU_THRESH = 0.45
    USE_TTA = True
    CROSS_CLASS_IOU = 0.6  # Round 6: post-NMS cross-class dedup
    # Block 8287800 had 18 boxes including same-pixel cup+bottle pairs.

    def __init__(self, path_hf_repo: Path) -> None:
        """Load ``best.pt`` from the repo directory and warm the model up.

        Args:
            path_hf_repo: Directory containing the ``best.pt`` weights file.
                A plain string path is accepted as well.

        Raises:
            FileNotFoundError: If ``best.pt`` is not present in the directory.
        """
        # Coerce so that a str path works with the ``/`` operator too.
        weights_path = Path(path_hf_repo) / "best.pt"
        if not weights_path.exists():
            raise FileNotFoundError(f"missing weights at {weights_path}")
        self.model = YOLO(str(weights_path))
        # Touch the model once so cold-start latency hits the warmup, not the
        # first validator call.
        dummy = np.zeros((640, 640, 3), dtype=np.uint8)
        self.model.predict(
            dummy,
            imgsz=self.IMAGE_SIZE,
            conf=self.CONF_THRESH,
            iou=self.IOU_THRESH,
            augment=self.USE_TTA,
            verbose=False,
        )
        print(f"✅ YOLO11s loaded from {weights_path}")

    def __repr__(self) -> str:
        """Return a short summary of the inference settings."""
        return (f"YOLO11s_ft(imgsz={self.IMAGE_SIZE}, "
                f"conf={self.CONF_THRESH}, iou={self.IOU_THRESH}, "
                f"tta={self.USE_TTA})")

    @staticmethod
    def _extract_boxes(result) -> list[BoundingBox]:
        """Convert one prediction result into validated ``BoundingBox`` objects.

        Each row of ``result.boxes.data`` is unpacked as
        ``(x1, y1, x2, y2, conf, cls_id)``. Rows whose class id falls outside
        ``CLASS_NAMES`` or whose box collapses to zero width/height after
        integer truncation are skipped.
        """
        if result.boxes is None or result.boxes.data is None:
            return []

        boxes: list[BoundingBox] = []
        for x1, y1, x2, y2, conf, cls_id in result.boxes.data.cpu().numpy().tolist():
            cls_id_int = int(cls_id)
            if cls_id_int < 0 or cls_id_int >= len(CLASS_NAMES):
                continue
            # int() truncates toward zero; make sure the box is still
            # non-degenerate afterwards.
            xi1, yi1, xi2, yi2 = int(x1), int(y1), int(x2), int(y2)
            if xi2 <= xi1 or yi2 <= yi1:
                continue
            boxes.append(BoundingBox(
                x1=xi1, y1=yi1, x2=xi2, y2=yi2,
                cls_id=cls_id_int, conf=float(conf),
            ))
        return boxes

    def predict_batch(
        self,
        batch_images: list[ndarray],
        offset: int,
        n_keypoints: int,
    ) -> list[TVFrameResult]:
        """Run detection on a batch of frames.

        Args:
            batch_images: Frames to run through the model.
            offset: Frame id assigned to the first image; later images get
                consecutive ids.
            n_keypoints: Number of ``(0, 0)`` placeholder keypoints to attach
                to every frame (negative values are treated as 0).

        Returns:
            One ``TVFrameResult`` per model result, in batch order. An empty
            batch yields an empty list without invoking the model.
        """
        # Nothing to predict: skip the model call entirely instead of handing
        # it an empty source.
        if len(batch_images) == 0:
            return []

        # Run inference. Ultralytics will accept a list[ndarray]; with
        # augment=True it does the flip+merge internally per image.
        results = self.model.predict(
            batch_images,
            imgsz=self.IMAGE_SIZE,
            conf=self.CONF_THRESH,
            iou=self.IOU_THRESH,
            augment=self.USE_TTA,
            verbose=False,
        )

        # n_keypoints is irrelevant for detection elements; return zero-padded.
        n_pad = max(0, int(n_keypoints))

        out: list[TVFrameResult] = []
        for i, r in enumerate(results):
            # Cross-class dedup (Ultralytics NMS is per-class only).
            boxes = _cross_class_nms(
                self._extract_boxes(r), iou_thresh=self.CROSS_CLASS_IOU
            )
            out.append(TVFrameResult(
                frame_id=offset + i,
                boxes=boxes,
                keypoints=[(0, 0)] * n_pad,
            ))

        return out