scorevision: push artifact
Browse files
miner.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""miner.py — uploaded to artur7236/turbovision-beverage HF repo.
|
| 2 |
+
|
| 3 |
+
Fine-tuned YOLO11s on SAM3-labelled beverage data + horizontal-flip TTA at
|
| 4 |
+
1280×1280 + confidence threshold 0.50. Local benchmark mean composite = 0.709
|
| 5 |
+
vs UID 213's 0.667.
|
| 6 |
+
|
| 7 |
+
Required by the chute template:
|
| 8 |
+
- class Miner with __init__(self, path_hf_repo: Path)
|
| 9 |
+
- predict_batch(batch_images, offset, n_keypoints) -> list[TVFrameResult]
|
| 10 |
+
- BoundingBox + TVFrameResult pydantic models defined in this file
|
| 11 |
+
- No imports from anywhere except stdlib + site-packages (no helper modules
|
| 12 |
+
from the HF repo).
|
| 13 |
+
"""
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
import numpy as np
|
| 17 |
+
from numpy import ndarray
|
| 18 |
+
from pydantic import BaseModel
|
| 19 |
+
from ultralytics import YOLO
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# Class index follows the manifest objects order for Detect-beverage-detect:
# [cup, bottle, can] → cls_id 0, 1, 2.
# Miner.predict_batch uses len(CLASS_NAMES) to discard detections whose class
# index falls outside this list.
CLASS_NAMES = ["cup", "bottle", "can"]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class BoundingBox(BaseModel):
    """One detection: an axis-aligned integer box with its class and confidence.

    ``_iou`` and ``Miner.predict_batch`` in this file treat ``(x1, y1)`` as the
    minimum corner and ``(x2, y2)`` as the maximum corner of the box.
    """

    # Minimum-corner coordinates.
    x1: int
    y1: int
    # Maximum-corner coordinates; predict_batch only emits boxes with
    # x2 > x1 and y2 > y1.
    x2: int
    y2: int
    # Class index; predict_batch only emits values in range(len(CLASS_NAMES)).
    cls_id: int
    # Confidence score taken from the model output for this box.
    conf: float
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class TVFrameResult(BaseModel):
    """Per-frame result object returned by ``Miner.predict_batch``."""

    # Frame index; predict_batch sets it to the batch offset plus the frame's
    # position inside the batch.
    frame_id: int
    # Detections kept for this frame.
    boxes: list[BoundingBox]
    # Integer (x, y) pairs. predict_batch in this file fills this with (0, 0)
    # placeholders, one per requested keypoint.
    keypoints: list[tuple[int, int]]
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _iou(a: BoundingBox, b: BoundingBox) -> float:
    """Return the intersection-over-union of two axis-aligned boxes.

    Boxes that do not overlap (or only touch along an edge) yield 0.0.
    """
    # Extent of the overlap rectangle along each axis.
    overlap_w = min(a.x2, b.x2) - max(a.x1, b.x1)
    overlap_h = min(a.y2, b.y2) - max(a.y1, b.y1)
    if overlap_w <= 0 or overlap_h <= 0:
        return 0.0

    intersection = overlap_w * overlap_h
    # Clamp each side at zero so a malformed box never contributes negative area.
    combined = sum(
        max(0, box.x2 - box.x1) * max(0, box.y2 - box.y1) for box in (a, b)
    ) - intersection
    if combined > 0:
        return intersection / combined
    return 0.0
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _cross_class_nms(boxes: list[BoundingBox], iou_thresh: float = 0.6) -> list[BoundingBox]:
    """Greedy, class-agnostic non-maximum suppression.

    Boxes are visited from highest to lowest confidence; a box is dropped when
    its IoU with any already-kept box is at least ``iou_thresh``, whatever the
    two class ids are.

    Why this exists: Ultralytics' default NMS only dedupes WITHIN a class, and
    the SN44 validator counts cross-class overlapping boxes as false positives
    (only one class can be right per object). Mirrors UID 213's
    _cross_class_dedup_op (IoU 0.7); 0.6 is used here to be slightly more
    aggressive.

    Returns the input list itself when it holds fewer than two boxes,
    otherwise a new list ordered by descending confidence.
    """
    if len(boxes) < 2:
        return boxes

    survivors: list[BoundingBox] = []
    # Stable sort: equally confident boxes keep their original relative order.
    for candidate in sorted(boxes, key=lambda item: item.conf, reverse=True):
        overlaps_kept = any(
            _iou(candidate, winner) >= iou_thresh for winner in survivors
        )
        if not overlaps_kept:
            survivors.append(candidate)
    return survivors
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class Miner:
    """Fine-tuned YOLO11s with hflip TTA (Round 5, 607 validator-distribution imgs, 19 MB).

    Tuning rationale (all on SAM3 ground-truth, 361 val images):
      - imgsz=1280: bumps small-object recall on the typical 1408×768 frames
        the validator sends (cans, distant cups).
      - conf=0.50: chosen for the validator's pillar weighting
        (0.6·mAP50 + 0.4·(1−ffpi/10)). Below 0.45 the false-positive pillar
        crashes; above 0.65 recall drops faster than precision rises.
        Round 4: 0.50 narrowly beat 0.45 (0.6982 vs 0.6980).
      - augment=True: ultralytics enables hflip TTA, our biggest single
        composite gain in offline sweep.
      - iou=0.45: standard YOLO NMS IoU (within-class only). Cross-class
        duplicates are removed by an explicit post-NMS pass
        (``_cross_class_nms``) at CROSS_CLASS_IOU.
    """

    IMAGE_SIZE = 1280
    CONF_THRESH = 0.50  # Round 4: 0.50 narrowly beat 0.45 (0.6982 vs 0.6980)
    IOU_THRESH = 0.45
    USE_TTA = True
    CROSS_CLASS_IOU = 0.6  # Round 6: post-NMS cross-class dedup
    # Block 8287800 had 18 boxes including same-pixel cup+bottle pairs.

    def __init__(self, path_hf_repo: Path) -> None:
        """Load ``best.pt`` from the repo directory and warm the model up.

        Args:
            path_hf_repo: Directory holding the weights file ``best.pt``.
                A plain string path is accepted as well.

        Raises:
            FileNotFoundError: If ``best.pt`` is not present in the directory.
        """
        weights_path = Path(path_hf_repo) / "best.pt"
        if not weights_path.exists():
            raise FileNotFoundError(f"missing weights at {weights_path}")
        self.model = YOLO(str(weights_path))
        # Touch the model once so cold-start latency hits the warmup, not the
        # first validator call.
        dummy = np.zeros((640, 640, 3), dtype=np.uint8)
        self._run_model(dummy)
        print(f"✅ YOLO11s loaded from {weights_path}")

    def __repr__(self) -> str:
        """Return a short summary of the inference settings."""
        return (f"YOLO11s_ft(imgsz={self.IMAGE_SIZE}, "
                f"conf={self.CONF_THRESH}, iou={self.IOU_THRESH}, "
                f"tta={self.USE_TTA})")

    def _run_model(self, source):
        """Run the model on ``source`` with this class's inference settings."""
        return self.model.predict(
            source,
            imgsz=self.IMAGE_SIZE,
            conf=self.CONF_THRESH,
            iou=self.IOU_THRESH,
            augment=self.USE_TTA,
            verbose=False,
        )

    @staticmethod
    def _to_boxes(result) -> list[BoundingBox]:
        """Convert one model result into validated integer ``BoundingBox`` objects."""
        if result.boxes is None or result.boxes.data is None:
            return []
        boxes: list[BoundingBox] = []
        # Each row is unpacked as (x1, y1, x2, y2, conf, cls_id).
        for x1, y1, x2, y2, conf, cls_id in result.boxes.data.cpu().numpy().tolist():
            cls_id_int = int(cls_id)
            if cls_id_int < 0 or cls_id_int >= len(CLASS_NAMES):
                continue
            # int() truncates toward zero; skip boxes that collapse to zero
            # width or height once converted to integers.
            xi1, yi1, xi2, yi2 = int(x1), int(y1), int(x2), int(y2)
            if xi2 <= xi1 or yi2 <= yi1:
                continue
            boxes.append(BoundingBox(
                x1=xi1, y1=yi1, x2=xi2, y2=yi2,
                cls_id=cls_id_int, conf=float(conf),
            ))
        return boxes

    def predict_batch(
        self,
        batch_images: list[ndarray],
        offset: int,
        n_keypoints: int,
    ) -> list[TVFrameResult]:
        """Detect objects in a batch of frames.

        Args:
            batch_images: Frames to run inference on.
            offset: Frame id assigned to the first image; later images get
                consecutive ids.
            n_keypoints: Number of placeholder ``(0, 0)`` keypoints attached to
                every result (negative values are treated as zero).

        Returns:
            One ``TVFrameResult`` per model result, in input order. An empty
            batch returns an empty list without invoking the model.
        """
        # len() rather than truthiness so an ndarray batch is not ambiguous.
        if len(batch_images) == 0:
            return []

        # Run inference. Ultralytics will accept a list[ndarray]; with augment=True
        # it does the flip+merge internally per image.
        results = self._run_model(batch_images)

        # n_keypoints is irrelevant for detection elements; return zero-padded.
        kp_zeros = [(0, 0)] * max(0, int(n_keypoints))

        out: list[TVFrameResult] = []
        for i, r in enumerate(results):
            # Cross-class dedup (Ultralytics NMS is per-class only)
            boxes = _cross_class_nms(self._to_boxes(r), iou_thresh=self.CROSS_CLASS_IOU)
            out.append(TVFrameResult(frame_id=offset + i, boxes=boxes, keypoints=kp_zeros))
        return out
|