meaculpitt committed on
Commit
8d0d1ad
·
verified ·
1 Parent(s): 34e8b67

scorevision: push artifact

Browse files
Files changed (1) hide show
  1. miner.py +339 -0
miner.py ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Score Vision SN44 — VehicleDetect miner endpoint.
3
+
4
+ Class mapping (output indices):
5
+ 0 = car (COCO class 2)
6
+ 1 = bus (COCO class 5)
7
+ 2 = truck (COCO class 7)
8
+ 3 = motorcycle (COCO class 3)
9
+
10
+ Accepts: base64-encoded image or raw image bytes via chutes cord.
11
+ Returns: list of {bbox: [x1,y1,x2,y2], score: float, class_id: int, class_name: str}
12
+
13
+ CUDA fix: onnxruntime-gpu finds cuDNN/cuBLAS via LD_LIBRARY_PATH (set as an image ENV at build time),
13
+ with ctypes preload as belt-and-suspenders fallback.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import base64
20
+ import io
21
+ import os
22
+ import time
23
+ from pathlib import Path
24
+ from typing import Any
25
+
26
+ import ctypes
27
+ import cv2
28
+ import numpy as np
29
+ from PIL import Image
30
+
31
+ # ── cuDNN preload (belt-and-suspenders fallback) ──────────────────────────────
32
+ # Primary fix is the LD_LIBRARY_PATH baked into the image (see the image builder below).
33
+ # This ctypes preload catches any edge cases where that path is not picked up.
34
def _preload_cuda_libs() -> None:
    """Best-effort preload of the CUDA driver, cuBLAS and cuDNN shared objects.

    Every candidate library that exists on disk is opened through ``ctypes``
    with ``RTLD_GLOBAL``. Libraries that are absent, or that fail to load with
    an ``OSError``, are skipped without raising.
    """
    nvidia_root = "/usr/local/lib/python3.12/dist-packages/nvidia"
    candidates = (
        # The driver library is loaded first, ahead of the libraries below.
        "/usr/lib/x86_64-linux-gnu/libcuda.so.1",
        f"{nvidia_root}/cublas/lib/libcublasLt.so.12",
        f"{nvidia_root}/cublas/lib/libcublas.so.12",
        f"{nvidia_root}/cudnn/lib/libcudnn.so.9",
    )
    for lib_path in candidates:
        if not os.path.exists(lib_path):
            continue
        try:
            ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)
        except OSError:
            # Deliberate best-effort: a failed preload is simply ignored.
            continue
48
+
49
+ _preload_cuda_libs()
50
+
51
+ import onnxruntime as ort # noqa: E402 β€” must come after preload
52
+
53
+ # ── Constants ────────────────────────────────────────────────────────────────
54
+ MODEL_DIR = Path(__file__).parent
55
+ WEIGHTS = MODEL_DIR / "weights.onnx"
56
+ IMG_SIZE = 640
57
+ CONF_THRESH = 0.25
58
+ IOU_THRESH = 0.45
59
+
60
+ # COCO class index → submission class index
61
+ COCO_TO_OUT: dict[int, int] = {2: 0, 5: 1, 7: 2, 3: 3}
62
+ COCO_VEHICLE_IDX = list(COCO_TO_OUT.keys())
63
+ OUT_NAMES = ["car", "bus", "truck", "motorcycle"]
64
+
65
+ # ── Model loader (singleton) ─────────────────────────────────────────────────
66
+ _SESSION: ort.InferenceSession | None = None
67
+
68
+
69
def get_session() -> ort.InferenceSession:
    """Return the shared ONNX Runtime session, building it on the first call.

    The session is cached in the module-level ``_SESSION``. It is created from
    ``WEIGHTS`` with the CUDA execution provider listed first and the CPU
    provider second, and the first active provider is printed once after
    creation.
    """
    global _SESSION
    if _SESSION is not None:
        return _SESSION

    session_options = ort.SessionOptions()
    session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    session_options.enable_mem_pattern = True
    session_options.enable_mem_reuse = True

    cuda_provider = (
        "CUDAExecutionProvider",
        {
            "device_id": 0,
            "arena_extend_strategy": "kNextPowerOfTwo",
            "gpu_mem_limit": 2 * 1024 ** 3,  # 2 GiB
            "cudnn_conv_algo_search": "EXHAUSTIVE",
            "do_copy_in_default_stream": True,
        },
    )
    _SESSION = ort.InferenceSession(
        str(WEIGHTS),
        sess_options=session_options,
        providers=[cuda_provider, "CPUExecutionProvider"],
    )
    print(f"[miner] Model loaded. Provider: {_SESSION.get_providers()[0]}", flush=True)
    return _SESSION
94
+
95
+
96
+ # ── Preprocessing ────────────────────────────────────────────────────────────
97
+
98
def letterbox(img: np.ndarray, size: int = IMG_SIZE) -> tuple[np.ndarray, float, int, int]:
    """Scale ``img`` to fit a ``size`` x ``size`` canvas and pad the rest with gray.

    The aspect ratio is preserved; the leftover space is split between the two
    sides of each axis, with any odd pixel going to the right/bottom.

    Args:
        img: Image array whose first two axes are height and width.
        size: Side length, in pixels, of the square output canvas.

    Returns:
        ``(padded, ratio, pad_left, pad_top)`` where ``ratio`` is the scale
        factor applied to the input and the pads are the left/top border
        widths in pixels.

    Raises:
        ValueError: If ``img`` has a zero-length height or width.
    """
    h, w = img.shape[:2]
    if h == 0 or w == 0:
        raise ValueError(f"cannot letterbox an empty image (shape={img.shape})")

    r = min(size / h, size / w)
    # Clamp to one pixel: for very elongated inputs the short side would
    # otherwise round to 0, which cv2.resize rejects.
    new_w = max(1, int(round(w * r)))
    new_h = max(1, int(round(h * r)))
    img_r = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

    dw, dh = size - new_w, size - new_h
    pad_l, pad_t = dw // 2, dh // 2
    img_p = cv2.copyMakeBorder(
        img_r, pad_t, dh - pad_t, pad_l, dw - pad_l,
        cv2.BORDER_CONSTANT, value=(114, 114, 114),
    )
    return img_p, r, pad_l, pad_t
110
+
111
+
112
def preprocess(img_bgr: np.ndarray) -> tuple[np.ndarray, float, int, int]:
    """Turn a BGR image into a normalised, batched, channel-first float32 tensor.

    Returns:
        ``(tensor, ratio, pad_left, pad_top)``: the contiguous tensor with a
        leading batch axis, plus the letterbox scale factor and left/top pads.
    """
    padded, ratio, pad_l, pad_t = letterbox(img_bgr)
    rgb = cv2.cvtColor(padded, cv2.COLOR_BGR2RGB)
    # HWC -> CHW, then scale pixel values by 1/255.
    chw = np.transpose(rgb, (2, 0, 1)).astype(np.float32)
    scaled = chw * (1.0 / 255.0)
    batch = np.ascontiguousarray(scaled[np.newaxis])
    return batch, ratio, pad_l, pad_t
117
+
118
+
119
+ # ── NMS ──────────────────────────────────────────────────────────────────────
120
+
121
def nms(boxes: np.ndarray, scores: np.ndarray, iou_thresh: float = IOU_THRESH) -> list[int]:
    """Greedy non-maximum suppression over ``[x1, y1, x2, y2]`` boxes.

    Boxes are visited in descending score order; each selected box removes
    every remaining box whose IoU with it exceeds ``iou_thresh``.

    Returns:
        Indices into ``boxes`` of the kept boxes, highest score first.
    """
    if len(boxes) == 0:
        return []

    left, top = boxes[:, 0], boxes[:, 1]
    right, bottom = boxes[:, 2], boxes[:, 3]
    areas = (right - left) * (bottom - top)

    remaining = np.argsort(scores)[::-1]
    kept: list[int] = []
    while remaining.size:
        best, rest = remaining[0], remaining[1:]
        kept.append(int(best))

        # Intersection rectangle between the best box and every other candidate.
        ix1 = np.maximum(left[best], left[rest])
        iy1 = np.maximum(top[best], top[rest])
        ix2 = np.minimum(right[best], right[rest])
        iy2 = np.minimum(bottom[best], bottom[rest])
        overlap = np.maximum(0, ix2 - ix1) * np.maximum(0, iy2 - iy1)

        # The small epsilon keeps the division defined for zero-area boxes.
        iou = overlap / (areas[best] + areas[rest] - overlap + 1e-7)
        remaining = rest[iou <= iou_thresh]
    return kept
139
+
140
+
141
+ # ── Postprocessing ───────────────────────────────────────────────────────────
142
+
143
def postprocess(
    raw: np.ndarray,
    ratio: float,
    pad_l: int,
    pad_t: int,
    orig_w: int,
    orig_h: int,
) -> list[dict[str, Any]]:
    """Convert raw model output into vehicle detections in original-image pixels.

    Rows 0-3 of ``raw`` are read as box centre x/y and width/height in
    letterboxed coordinates; row ``4 + c`` is read as the score for COCO class
    ``c``. Candidates are thresholded at ``CONF_THRESH``, mapped back through
    the letterbox transform, clipped to the image bounds, and run through
    per-class NMS.

    Args:
        raw: Prediction matrix for one image (the caller notes it as
            ``[84, 8400]``).
        ratio: Letterbox scale factor.
        pad_l: Left letterbox padding in pixels.
        pad_t: Top letterbox padding in pixels.
        orig_w: Width of the original image.
        orig_h: Height of the original image.

    Returns:
        A list of dicts with ``bbox`` (``[x1, y1, x2, y2]``), ``score``,
        ``class_id`` and ``class_name``, grouped by class in
        ``COCO_VEHICLE_IDX`` order.
    """
    pred = raw
    veh_row_idx = np.array([4 + c for c in COCO_VEHICLE_IDX])
    # Keep only columns where at least one vehicle class clears the threshold.
    mask = pred[veh_row_idx].max(axis=0) > CONF_THRESH
    if not mask.any():
        return []

    pred_f = pred[:, mask]
    cx, cy, bw, bh = pred_f[0], pred_f[1], pred_f[2], pred_f[3]

    # Undo the letterbox (remove padding, divide by scale) and clip to the image.
    x1 = np.clip((cx - bw / 2 - pad_l) / ratio, 0, orig_w)
    y1 = np.clip((cy - bh / 2 - pad_t) / ratio, 0, orig_h)
    x2 = np.clip((cx + bw / 2 - pad_l) / ratio, 0, orig_w)
    y2 = np.clip((cy + bh / 2 - pad_t) / ratio, 0, orig_h)
    boxes = np.stack([x1, y1, x2, y2], axis=1)

    results: list[dict[str, Any]] = []
    for coco_cls in COCO_VEHICLE_IDX:
        scores = pred_f[4 + coco_cls]
        cls_mask = scores > CONF_THRESH
        if not cls_mask.any():
            continue

        # Materialise the masked views once per class; boolean indexing copies,
        # so doing it per kept detection would repeat the same work.
        cls_boxes = boxes[cls_mask]
        cls_scores = scores[cls_mask]
        out_cls = COCO_TO_OUT[coco_cls]

        for k in nms(cls_boxes, cls_scores):
            bx1, by1, bx2, by2 = cls_boxes[k]
            results.append({
                "bbox": [float(bx1), float(by1), float(bx2), float(by2)],
                "score": float(cls_scores[k]),
                "class_id": out_cls,
                "class_name": OUT_NAMES[out_cls],
            })
    return results
187
+
188
+
189
+ # ── Image decoding helpers ───────────────────────────────────────────────────
190
+
191
def decode_image(data: bytes | str) -> np.ndarray:
    """Decode an encoded image into a BGR array.

    Accepted inputs:
      * ``str``: base64 text, optionally prefixed with a data-URI header
        (``data:image/...;base64,``).
      * ``bytes`` / ``bytearray``: either base64 text or raw encoded image
        bytes. The payload is treated as base64 only if it passes strict
        validation; otherwise it is used as-is. A lenient decode would silently
        drop non-alphabet bytes and could "succeed" on a raw JPEG/PNG,
        producing garbage.

    OpenCV is tried first; Pillow is the fallback for formats OpenCV cannot
    decode.

    Raises:
        ValueError: If the payload is empty, or a ``str`` is not decodable
            base64 (``binascii.Error`` is a ``ValueError`` subclass).
        Exception: Whatever Pillow raises when it cannot identify the image.
    """
    if isinstance(data, str):
        if data.startswith("data:") and "," in data:
            # Drop the "data:<mime>;base64," header of a data URI.
            data = data.split(",", 1)[1]
        payload = base64.b64decode(data)
    elif isinstance(data, (bytes, bytearray)):
        raw_bytes = bytes(data)
        try:
            # Whitespace (e.g. MIME line breaks) is legal in base64 text, so it
            # is removed before the strict alphabet check.
            payload = base64.b64decode(b"".join(raw_bytes.split()), validate=True)
        except ValueError:
            payload = raw_bytes
    else:
        # Any other buffer-like object is passed through unchanged.
        payload = data

    arr = np.frombuffer(payload, dtype=np.uint8)
    if arr.size == 0:
        raise ValueError("decode_image: empty image payload")

    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        pil = Image.open(io.BytesIO(payload)).convert("RGB")
        img = cv2.cvtColor(np.array(pil), cv2.COLOR_RGB2BGR)
    return img
205
+
206
+
207
+ # ── Core predict function ────────────────────────────────────────────────────
208
+
209
def predict(image_data: bytes | str | np.ndarray) -> dict[str, Any]:
    """Run the detector on one image and return detections plus timing info.

    Args:
        image_data: Either an already-decoded image array (used as-is) or an
            encoded payload that is handed to ``decode_image``.

    Returns:
        A dict with ``detections`` (list of detection dicts), ``inference_ms``
        (model run time only, rounded to 3 decimals) and ``provider`` (first
        active execution provider of the session).
    """
    session = get_session()

    frame = image_data if isinstance(image_data, np.ndarray) else decode_image(image_data)
    src_h, src_w = frame.shape[:2]
    tensor, ratio, pad_l, pad_t = preprocess(frame)

    # Only the session.run call is timed; decoding and pre/post-processing are not.
    started = time.perf_counter()
    outputs = session.run(None, {"images": tensor})
    elapsed_ms = (time.perf_counter() - started) * 1000.0

    # First output tensor, first batch element (noted upstream as [84, 8400]).
    first_pred = outputs[0][0]
    found = postprocess(first_pred, ratio, pad_l, pad_t, src_w, src_h)

    return {
        "detections": found,
        "inference_ms": round(elapsed_ms, 3),
        "provider": session.get_providers()[0],
    }
232
+
233
+
234
+ # ── Chutes cord wrapper ──────────────────────────────────────────────────────
235
+
236
try:
    # Optional deployment wiring. Everything in this block is skipped when the
    # chutes SDK cannot be imported (see the ImportError handler at the end).
    from chutes.chute import Chute
    from chutes.chute.node_selector import NodeSelector
    # Aliased because ``Image`` is already bound to PIL's Image in this module.
    from chutes.image import Image as ChuteImage

    # Container image definition: base image, Python dependencies and the
    # library search path for the pip-installed NVIDIA libraries.
    # NOTE(review): README.md is read at import time; if it is missing, the
    # resulting error is not an ImportError and is not caught below.
    chute_image = (
        ChuteImage(
            username="lculpitt",
            name="vehicle-detect-sn44",
            tag="v4-cuda",
            readme=(Path(__file__).parent / "README.md").read_text(),
        )
        .from_base("parachutes/python:3.12")
        .run_command("pip install --upgrade setuptools wheel")
        .run_command(
            "pip install 'numpy>=1.23' 'onnxruntime-gpu>=1.16' "
            "'opencv-python-headless>=4.7' 'pillow>=9.5' "
            "'huggingface_hub>=0.19.4' 'pydantic>=2.0' "
            "'pyyaml>=6.0' 'aiohttp>=3.9'"
        )
        # Bake cuDNN/cuBLAS paths into the image as Docker ENV so onnxruntime
        # CUDAExecutionProvider finds libcudnn.so.9 on every node at container start.
        .with_env(
            "LD_LIBRARY_PATH",
            "/usr/local/lib/python3.12/dist-packages/nvidia/cudnn/lib"
            ":/usr/local/lib/python3.12/dist-packages/nvidia/cublas/lib",
        )
    )

    # Chute definition: scaling limits and the GPU types it may be scheduled on.
    chute = Chute(
        username="lculpitt",
        name="vehicle-detect-sn44",
        tagline="YOLO11n vehicle detector β€” car, bus, truck, motorcycle",
        readme=(Path(__file__).parent / "README.md").read_text(),
        image=chute_image,
        concurrency=4,
        max_instances=5,
        shutdown_after_seconds=300,
        scaling_threshold=0.5,
        node_selector=NodeSelector(
            gpu_count=1,
            min_vram_gb_per_gpu=16,
            # All CUDA 12.x, all $0.40–$0.85/hr (within 2.5× spread from cheapest)
            include=["4090", "a40", "a6000", "l40", "l40s"],
        ),
    )

    @chute.cord(path="/predict", method="POST")
    async def predict_cord(image_b64: str) -> dict:
        """
        POST /predict
        Body: {"image_b64": "<base64-encoded image>"}
        Returns detection JSON.
        """
        # predict() is a plain synchronous call made from inside this coroutine.
        return predict(image_b64)

except ImportError:
    # chutes SDK not installed: the module stays usable for local prediction.
    pass
294
+
295
+
296
+ # ── Local test ───────────────────────────────────────────────────────────────
297
+
298
if __name__ == "__main__":
    # Local smoke test: one prediction on a synthetic (or user-supplied) image,
    # followed by a 50-run latency benchmark of predict().
    import sys

    banner = "=" * 55
    print(banner)
    print(" miner.py β€” local smoke test")
    print(banner)

    # Gray 1280x720 frame with a green rectangle; replaced below if an image
    # path is given on the command line and can be read.
    frame = np.full((720, 1280, 3), 128, dtype=np.uint8)
    cv2.rectangle(frame, (100, 100), (400, 300), (0, 255, 0), 3)

    if len(sys.argv) <= 1:
        print(" Using synthetic 1280x720 dummy image.")
    else:
        image_path = sys.argv[1]
        from_disk = cv2.imread(image_path)
        if from_disk is None:
            print(f" Could not load {image_path}, using dummy.")
        else:
            frame = from_disk
            print(f" Using image: {image_path} ({from_disk.shape[1]}x{from_disk.shape[0]})")

    report = predict(frame)
    detections = report["detections"]
    print(f"\n Provider : {report['provider']}")
    print(f" Inference : {report['inference_ms']:.2f} ms")
    print(f" Detections : {len(detections)}")
    for det in detections:
        bx1, by1, bx2, by2 = (round(coord, 1) for coord in det["bbox"])
        print(f" [{det['class_id']}] {det['class_name']:12s} score={det['score']:.3f} "
              f"bbox=[{bx1},{by1},{bx2},{by2}]")

    print("\n Latency benchmark (50 runs)...")
    samples_ms = []
    for _ in range(50):
        tick = time.perf_counter()
        predict(frame)
        samples_ms.append((time.perf_counter() - tick) * 1000)
    ordered = sorted(samples_ms)
    median_ms = ordered[25]
    tail_ms = ordered[47]
    rate = 1000.0 / median_ms
    print(f" P50={median_ms:.2f}ms P95={tail_ms:.2f}ms FPS={rate:.1f}")
    print(f" Target >=30 FPS : {'PASS' if rate >= 30 else 'FAIL'}")
    print(f" Target P95<50ms : {'PASS' if tail_ms < 50 else 'FAIL'}")
    print(banner)