scorevision: push artifact
Browse files
miner.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Open-source Detect-beverage miner (manak0/Detect-beverage-detect).
|
| 2 |
+
|
| 3 |
+
ONNX + onnxruntime (no torch/ultralytics at inference -> light repo,
|
| 4 |
+
deterministic; spot-check re-runs this same code+weights). Trained
|
| 5 |
+
yolo11n with class order [cup, bottle, can] == manifest `objects`, so
|
| 6 |
+
cls_id maps directly (0=cup,1=bottle,2=can). Letterbox 1280 (manifest
|
| 7 |
+
preproc resize_long), flip-TTA, per-class conf, per-class NMS over the merged original+flipped detections.
|
| 8 |
+
|
| 9 |
+
Contract (turbovision example_miner): class `Miner` at HF repo root;
|
| 10 |
+
`predict_batch(batch_images, offset, n_keypoints) -> list[TVFrameResult]`.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
import cv2
|
| 18 |
+
import numpy as np
|
| 19 |
+
import onnxruntime as ort
|
| 20 |
+
from numpy import ndarray
|
| 21 |
+
from pydantic import BaseModel
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class BoundingBox(BaseModel):
    """One detection, stored as integer corner coordinates plus a label.

    Attributes:
        x1: Left edge, in pixels of the frame that was passed in.
        y1: Top edge.
        x2: Right edge.
        y2: Bottom edge.
        cls_id: Class index; the module docstring gives the order as
            0=cup, 1=bottle, 2=can.
        conf: Score of the winning class for this box.
    """

    # Corners: (x1, y1) is top-left, (x2, y2) is bottom-right.
    x1: int
    y1: int
    x2: int
    y2: int
    # Label and its score.
    cls_id: int
    conf: float
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class TVFrameResult(BaseModel):
    """Everything reported for a single frame.

    Attributes:
        frame_id: Index of the frame (``Miner.predict_batch`` uses
            ``offset`` plus the position inside the batch).
        boxes: Detections found in the frame; may be empty.
        keypoints: ``(x, y)`` integer pairs. This miner does not predict
            keypoints and fills the list with ``(0, 0)`` placeholders.
    """

    frame_id: int
    boxes: list[BoundingBox]
    keypoints: list[tuple[int, int]]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class Miner:
    """Cup / bottle / can detector running an ONNX model on ONNX Runtime.

    Per frame the pipeline is:

    1. letterbox the frame to ``input_size`` (image anchored top-left, the
       rest filled with gray 114),
    2. run the network on the frame and, when ``use_flip_tta`` is set, on
       its horizontal mirror, mapping mirrored boxes back,
    3. keep detections whose best class score reaches that class's entry in
       ``conf_thres``,
    4. run NMS separately for every class,
    5. clip the survivors to the frame and drop tiny or degenerate boxes.

    Tunables are class attributes. Note that ``max_det`` caps the number of
    boxes kept *per class*, not per frame.
    """

    weights_file = "best.onnx"
    input_size = 1280
    num_classes = 3  # cup, bottle, can
    # per-class confidence (tuned on held-out; cup scarcer -> lower gate)
    conf_thres = np.array([0.25, 0.35, 0.35], dtype=np.float32)
    iou_thres = 0.55
    max_det = 100
    min_box_area = 36.0
    use_flip_tta = True

    def __init__(self, path_hf_repo: Path) -> None:
        """Open ``weights_file`` found under ``path_hf_repo`` on the CPU provider.

        Args:
            path_hf_repo: Directory that contains the ONNX weights.
        """
        options = ort.SessionOptions()
        options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        self.sess = ort.InferenceSession(
            str(Path(path_hf_repo) / self.weights_file),
            providers=["CPUExecutionProvider"],
            sess_options=options,
        )
        # Name of the first (only used) graph input; needed for ``run``.
        self.inp = self.sess.get_inputs()[0].name
        print("✅ ONNX beverage model loaded")

    def __repr__(self) -> str:
        return f"BeverageONNX(in={self.input_size}, cls={self.num_classes})"

    # ---- preprocessing ---------------------------------------------------
    def _letterbox(self, im: ndarray):
        """Resize ``im`` to fit an ``input_size`` square and pad with gray.

        The resized image is placed in the top-left corner, so mapping a
        network coordinate back to the frame is a plain division by the
        returned scale (no padding offset).

        Args:
            im: Image array of shape ``(H, W, 3)``.

        Returns:
            ``(canvas, scale)`` where ``canvas`` is a ``uint8`` array of shape
            ``(input_size, input_size, 3)`` and ``scale`` is the resize factor.

        Raises:
            ValueError: If the image has a zero-length side.
        """
        h0, w0 = im.shape[:2]
        if h0 == 0 or w0 == 0:
            raise ValueError(f"cannot letterbox an empty image of shape {im.shape}")
        s = min(self.input_size / h0, self.input_size / w0)
        # Extreme aspect ratios could round the short side down to 0 pixels,
        # which cv2.resize rejects; keep at least one pixel.
        nh = max(1, int(round(h0 * s)))
        nw = max(1, int(round(w0 * s)))
        resized = cv2.resize(im, (nw, nh))
        canvas = np.full((self.input_size, self.input_size, 3), 114, np.uint8)
        canvas[:nh, :nw] = resized
        return canvas, s

    # ---- inference -------------------------------------------------------
    def _decode(self, raw: ndarray, scale: float) -> ndarray:
        """Turn one raw network output into frame-space corner boxes.

        Args:
            raw: 2-D array, either ``(4 + nc, N)`` or ``(N, 4 + nc)``; the
                layout with fewer rows than columns is transposed. The first
                four values per prediction are read as center-x, center-y,
                width, height in letterboxed pixels.
            scale: Resize factor returned by ``_letterbox``.

        Returns:
            Array of shape ``(N, 4 + num_classes)``: ``x1, y1, x2, y2`` in
            frame pixels followed by the class scores.
        """
        pred = raw.T if raw.shape[0] < raw.shape[1] else raw
        centers = pred[:, :2]
        half_sizes = pred[:, 2:4] / 2
        scores = pred[:, 4:4 + self.num_classes]
        top_left = (centers - half_sizes) / scale
        bottom_right = (centers + half_sizes) / scale
        return np.concatenate([top_left, bottom_right, scores], axis=1)

    def _infer(self, im_bgr: ndarray) -> ndarray:
        """Run the network on one frame and return decoded predictions.

        Args:
            im_bgr: Frame of shape ``(H, W, 3)``.

        Returns:
            Array of shape ``(N, 4 + num_classes)`` as produced by ``_decode``.
        """
        canvas, scale = self._letterbox(im_bgr)
        # Reverse the channel axis (BGR input -> RGB), HWC -> CHW, add the
        # batch axis and scale to [0, 1].
        blob = canvas[:, :, ::-1].transpose(2, 0, 1)[None].astype(np.float32) / 255.0
        # The reversed-channel view can leave a non C-ordered buffer; hand the
        # runtime a contiguous tensor explicitly.
        blob = np.ascontiguousarray(blob)
        raw = self.sess.run(None, {self.inp: blob})[0][0]  # (4+nc, N)
        return self._decode(raw, scale)

    @staticmethod
    def _unflip(det: ndarray, width: int) -> ndarray:
        """Map boxes predicted on a horizontally mirrored frame back.

        Args:
            det: Array whose first four columns are ``x1, y1, x2, y2``.
            width: Width of the frame in pixels.

        Returns:
            A copy of ``det`` with ``x1``/``x2`` mirrored around the frame.
        """
        restored = det.copy()
        restored[:, 0] = width - det[:, 2]
        restored[:, 2] = width - det[:, 0]
        return restored

    @staticmethod
    def _clip_boxes(boxes: ndarray, width: int, height: int) -> ndarray:
        """Return a copy of ``boxes`` with corners limited to the frame.

        Args:
            boxes: Array whose first four columns are ``x1, y1, x2, y2``.
            width: Frame width; x values are limited to ``[0, width]``.
            height: Frame height; y values are limited to ``[0, height]``.
        """
        clipped = boxes.copy()
        clipped[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, width)
        clipped[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, height)
        return clipped

    def _detect(self, im_bgr: ndarray) -> list[BoundingBox]:
        """Detect beverages in a single frame.

        Args:
            im_bgr: Frame of shape ``(H, W, 3)``.

        Returns:
            Boxes in frame pixel coordinates, grouped by class id in
            ascending order; at most ``max_det`` per class.
        """
        height, width = im_bgr.shape[:2]
        det = self._infer(im_bgr)
        if self.use_flip_tta:
            # Second pass on the mirrored frame; its boxes are mirrored back
            # and pooled with the first pass so NMS merges the duplicates.
            mirrored = self._infer(np.ascontiguousarray(im_bgr[:, ::-1]))
            det = np.concatenate([det, self._unflip(mirrored, width)], axis=0)

        class_scores = det[:, 4:]
        cls = class_scores.argmax(1)
        conf = class_scores.max(1)
        keep = conf >= self.conf_thres[cls]
        det, cls, conf = det[keep], cls[keep], conf[keep]

        out: list[BoundingBox] = []
        for c in range(self.num_classes):
            mask = cls == c
            if not mask.any():
                continue
            b = det[mask, :4]
            sc = conf[mask]
            # cv2.dnn.NMSBoxes takes (x, y, w, h) rectangles.
            xywh = np.concatenate([b[:, :2], b[:, 2:4] - b[:, :2]], axis=1)
            idx = cv2.dnn.NMSBoxes(
                bboxes=xywh.tolist(),
                scores=sc.tolist(),
                score_threshold=0.0,
                nms_threshold=self.iou_thres,
            )
            # idx may be an empty tuple, a flat array or an (N, 1) array
            # depending on the OpenCV version.
            kept = np.asarray(idx, dtype=np.int64).reshape(-1)[: self.max_det]
            # Suppression is decided on the raw boxes; only the emitted boxes
            # are clipped, so predictions reaching into the padding or past
            # the border cannot leave the frame.
            clipped = self._clip_boxes(b[kept], width, height)
            for (x1, y1, x2, y2), score in zip(clipped, sc[kept]):
                if (x2 - x1) * (y2 - y1) < self.min_box_area:
                    continue
                ix1, iy1, ix2, iy2 = int(x1), int(y1), int(x2), int(y2)
                if ix2 <= ix1 or iy2 <= iy1:
                    # Collapsed to zero width/height after truncation.
                    continue
                out.append(BoundingBox(
                    x1=ix1, y1=iy1, x2=ix2, y2=iy2,
                    cls_id=int(c), conf=float(score)))
        return out

    def predict_batch(
        self,
        batch_images: list[ndarray],
        offset: int,
        n_keypoints: int,
    ) -> list[TVFrameResult]:
        """Run detection on every frame of a batch.

        A failure on one frame is printed and reported as "no boxes" so the
        remaining frames are still processed.

        Args:
            batch_images: Frames to process, in order.
            offset: Frame id of the first image; ids increase by one.
            n_keypoints: Number of ``(0, 0)`` placeholder keypoints to attach
                to every result.

        Returns:
            One ``TVFrameResult`` per input frame, in input order.
        """
        results: list[TVFrameResult] = []
        for i, img in enumerate(batch_images):
            frame_id = offset + i
            try:
                boxes = self._detect(np.ascontiguousarray(img))
            except Exception as e:  # never crash the chute
                print(f"⚠️ frame {frame_id} detect error: {e}")
                boxes = []
            results.append(TVFrameResult(
                frame_id=frame_id, boxes=boxes,
                keypoints=[(0, 0)] * n_keypoints))
        return results
|