csmith715 committed on
Commit
8021aca
·
1 Parent(s): 621ba56

Adding Tiling functionality

Browse files
Files changed (3) hide show
  1. app.py +49 -57
  2. tiling.py +238 -0
  3. tiling_test.py +231 -0
app.py CHANGED
@@ -8,6 +8,7 @@ import PIL.Image as Image
8
  from fastapi import FastAPI, UploadFile, File, HTTPException, Request
9
  from pydantic import BaseModel
10
  from ultralytics import YOLO
 
11
 
12
  MAX_UPLOAD_BYTES = int(os.getenv("MAX_UPLOAD_BYTES", 8 * 1024 * 1024)) # 8 MB default
13
  MAX_SIDE = int(os.getenv("MAX_SIDE", 2000)) # downscale largest side to this
@@ -20,14 +21,17 @@ HIGH_CLASS_NAMES = [
20
 
21
  LOW_CLASS_NAMES = ["shop_bw", "shop_sw", "field_bw", "Insulation"]
22
 
 
 
23
  # -----------------------------
24
  # App setup
25
  # -----------------------------
26
 
27
  app = FastAPI(title="YOLO Weld Type Detector API", version="1.0.0")
28
 
29
- model = YOLO("top_reduced_best.pt")
30
- low_model = YOLO("best_low_072725.pt")
 
31
 
32
 
33
  # -----------------------------
@@ -58,29 +62,45 @@ def downscale_if_needed(img_rgb: np.ndarray) -> np.ndarray:
58
  new_w, new_h = int(w * scale), int(h * scale)
59
  return cv2.resize(img_rgb, (new_w, new_h), interpolation=cv2.INTER_AREA)
60
 
61
- def detect_weld_types(image_bgr: np.ndarray, model_type: str) -> dict:
62
- if model_type == "top":
63
- results = model(image_bgr)
64
- class_names = HIGH_CLASS_NAMES
65
- else:
66
- results = low_model(image_bgr)
67
- class_names = LOW_CLASS_NAMES
68
-
69
- boxes = results[0].boxes
70
- class_ids = boxes.cls.cpu().numpy().astype(int) if boxes and boxes.cls is not None else []
71
-
72
- counts = {}
73
- for cid in class_ids:
74
- if 0 <= cid < len(class_names):
75
- name = class_names[cid]
76
- counts[name] = counts.get(name, 0) + 1
77
  return counts
78
-
79
- # def merge_counts(a: dict, b: dict) -> dict:
80
- # out = dict(a)
81
- # for k, v in b.items():
82
- # out[k] = out.get(k, 0) + v
83
- # return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  # -----------------------------
86
  # Endpoints
@@ -110,11 +130,11 @@ async def predict_multipart(file: UploadFile = File(default=None)):
110
 
111
  img_rgb = downscale_if_needed(pil_to_numpy_rgb(img))
112
  img_bgr = numpy_rgb_to_bgr(img_rgb)
113
-
114
- high = detect_weld_types(img_bgr, "top")
115
- low = detect_weld_types(img_bgr, "low")
116
- merged = high | low
117
- return PredictResponse(detections=merged)
118
 
119
  @app.post("/ping")
120
  async def ping():
@@ -126,34 +146,6 @@ async def echo(req: Request):
126
  ct = req.headers.get("content-type", "")
127
  return {"ok": True, "content_type": ct}
128
 
129
- # @app.post("/predict_base64", response_model=PredictResponse)
130
- # def predict_base64(payload: PredictQuery = Body(...)):
131
- # b64 = payload.image_base64
132
- # # Size guard for base64 (approx raw size)
133
- # try:
134
- # raw = base64.b64decode(b64, validate=True)
135
- # except Exception:
136
- # raise HTTPException(status_code=400, detail="Invalid base64.")
137
- #
138
- # if len(raw) > MAX_UPLOAD_BYTES:
139
- # raise HTTPException(
140
- # status_code=413,
141
- # detail=f"Image too large after base64 decode ({len(raw)/1024/1024:.2f} MB). "
142
- # f"Use multipart /predict or reduce image size."
143
- # )
144
- #
145
- # try:
146
- # img = Image.open(io.BytesIO(raw))
147
- # except Exception:
148
- # raise HTTPException(status_code=400, detail="Invalid image.")
149
- #
150
- # img_rgb = downscale_if_needed(pil_to_numpy_rgb(img))
151
- # img_bgr = numpy_rgb_to_bgr(img_rgb)
152
- #
153
- # high = detect_weld_types(img_bgr, "top")
154
- # low = detect_weld_types(img_bgr, "low")
155
- # return PredictResponse(detections=merge_counts(low, high))
156
-
157
 
158
  if __name__ == "__main__":
159
  uvicorn.run("app:app", host="0.0.0.0", port=7860)
 
8
  from fastapi import FastAPI, UploadFile, File, HTTPException, Request
9
  from pydantic import BaseModel
10
  from ultralytics import YOLO
11
+ from tiling import detect_tiled_softnms
12
 
13
  MAX_UPLOAD_BYTES = int(os.getenv("MAX_UPLOAD_BYTES", 8 * 1024 * 1024)) # 8 MB default
14
  MAX_SIDE = int(os.getenv("MAX_SIDE", 2000)) # downscale largest side to this
 
21
 
22
  LOW_CLASS_NAMES = ["shop_bw", "shop_sw", "field_bw", "Insulation"]
23
 
24
+ ALL_CLASS_NAMES = HIGH_CLASS_NAMES + LOW_CLASS_NAMES
25
+
26
  # -----------------------------
27
  # App setup
28
  # -----------------------------
29
 
30
  app = FastAPI(title="YOLO Weld Type Detector API", version="1.0.0")
31
 
32
+ model = YOLO("best_7-15-25.pt")
33
+ # model = YOLO("top_reduced_best.pt")
34
+ # low_model = YOLO("best_low_072725.pt")
35
 
36
 
37
  # -----------------------------
 
62
  new_w, new_h = int(w * scale), int(h * scale)
63
  return cv2.resize(img_rgb, (new_w, new_h), interpolation=cv2.INTER_AREA)
64
 
65
+ def normalize_prediction(output):
66
+ weld_counts = {}
67
+ for cls_pred in output['cls']:
68
+ weld_key = output['names'][cls_pred]
69
+ weld_counts[weld_key] = weld_counts.get(weld_key, 0) + 1
70
+ return weld_counts
71
+
72
+ def detect_weld_types(image_bgr: np.ndarray, model) -> dict:
73
+ out = detect_tiled_softnms(
74
+ model, image_bgr,
75
+ tile_size=1024, overlap=0.23,
76
+ per_tile_conf=0.2, per_tile_iou=0.7,
77
+ softnms_iou=0.6, softnms_method="hard", softnms_sigma=0.5,
78
+ final_conf=0.38, device=None, imgsz=1280
79
+ )
80
+ counts = normalize_prediction(out)
81
  return counts
82
+ # {'file': 50.724137931034484,
83
+ # 'soft_iou': 0.5982183908045983,
84
+ # 'final_conf': 0.37854022988505753,
85
+ # 'olap': 0.22752873563218376}
86
+
87
+ # def detect_weld_types(image_bgr: np.ndarray, model_type: str) -> dict:
88
+ # if model_type == "top":
89
+ # results = model.predict(image_bgr)
90
+ # class_names = HIGH_CLASS_NAMES
91
+ # else:
92
+ # results = low_model.predict(image_bgr, conf=0.10, iou=0.55, max_det=300, imgsz=1920, augment=True)
93
+ # class_names = LOW_CLASS_NAMES
94
+ #
95
+ # boxes = results[0].boxes
96
+ # class_ids = boxes.cls.cpu().numpy().astype(int) if boxes and boxes.cls is not None else []
97
+ #
98
+ # counts = {}
99
+ # for cid in class_ids:
100
+ # if 0 <= cid < len(class_names):
101
+ # name = class_names[cid]
102
+ # counts[name] = counts.get(name, 0) + 1
103
+ # return counts
104
 
105
  # -----------------------------
106
  # Endpoints
 
130
 
131
  img_rgb = downscale_if_needed(pil_to_numpy_rgb(img))
132
  img_bgr = numpy_rgb_to_bgr(img_rgb)
133
+ welds = detect_weld_types(img_bgr, model)
134
+ # high = detect_weld_types(img_bgr, "top")
135
+ # low = detect_weld_types(img_bgr, "low")
136
+ # merged = high | low
137
+ return PredictResponse(detections=welds)
138
 
139
  @app.post("/ping")
140
  async def ping():
 
146
  ct = req.headers.get("content-type", "")
147
  return {"ok": True, "content_type": ct}
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  if __name__ == "__main__":
151
  uvicorn.run("app:app", host="0.0.0.0", port=7860)
tiling.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ tiled_yolo_softnms.py
3
+ Tiled inference + class-wise Soft-NMS for YOLO (Ultralytics).
4
+ - Runs YOLO on overlapping tiles to boost recall on small symbols.
5
+ - Maps all tile detections back to full-image coords.
6
+ - Fuses duplicates with Soft-NMS per class.
7
+
8
+ Usage
9
+ -----
10
+ from ultralytics import YOLO
11
+ import cv2
12
+
13
+ model = YOLO("best.pt") # your YOLO v12/v11/v8 checkpoint
14
+ img = cv2.imread("example.jpg")[:, :, ::-1] # BGR->RGB (optional; YOLO accepts BGR too)
15
+
16
+ out = detect_tiled_softnms(
17
+ model, img,
18
+ tile_size=1024, overlap=0.25,
19
+ per_tile_conf=0.2, per_tile_iou=0.7,
20
+ softnms_iou=0.55, softnms_method="linear", softnms_sigma=0.5,
21
+ final_conf=0.25, device=None, imgsz=None
22
+ )
23
+
24
+ # Access results
25
+ xyxy = out["xyxy"]
26
+ conf = out["conf"]
27
+ cls = out["cls"]
28
+ annot = draw_detections(img.copy(), xyxy, conf, cls, out["names"])
29
+ cv2.imwrite("annotated.jpg", annot[:, :, ::-1]) # RGB->BGR for writing
30
+ """
31
+
32
+ from typing import List, Tuple, Dict, Optional
33
+ import numpy as np
34
+ import cv2
35
+
36
+ # ---------------------------
37
+ # Utilities
38
+ # ---------------------------
39
+
40
def make_overlapping_tiles(H: int, W: int, tile: int, overlap: float) -> List[Tuple[int, int, int, int]]:
    """Compute (x0, y0, x1, y1) windows that cover an H x W image.

    Consecutive windows overlap by `overlap` (fraction of `tile`); an extra
    window is appended per axis whenever the stride grid stops short of the
    right/bottom edge, so the whole image is always covered.
    """
    assert 0.0 <= overlap < 1.0
    step = max(1, int(tile * (1.0 - overlap)))
    x_starts = list(range(0, max(W - tile, 0) + 1, step))
    y_starts = list(range(0, max(H - tile, 0) + 1, step))
    # Guarantee coverage of the far edges.
    if x_starts[-1] + tile < W:
        x_starts.append(W - tile)
    if y_starts[-1] + tile < H:
        y_starts.append(H - tile)
    return [
        (max(0, x), max(0, y), min(W, max(0, x) + tile), min(H, max(0, y) + tile))
        for y in y_starts
        for x in x_starts
    ]
57
+
58
def iou_xyxy(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """IoU of a single box `a` (4,) against each row of `b` (N, 4)."""
    ix1 = np.maximum(a[0], b[:, 0])
    iy1 = np.maximum(a[1], b[:, 1])
    ix2 = np.minimum(a[2], b[:, 2])
    iy2 = np.minimum(a[3], b[:, 3])
    inter = np.maximum(0, ix2 - ix1) * np.maximum(0, iy2 - iy1)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    areas_b = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
    # Clamp the union away from zero to avoid division by zero on degenerate boxes.
    return inter / np.maximum(1e-9, area_a + areas_b - inter)
69
+
70
def soft_nms_classwise(
    boxes: np.ndarray, scores: np.ndarray, classes: np.ndarray,
    iou_thr: float = 0.55, method: str = "linear", sigma: float = 0.5,
    score_thresh: float = 1e-3, max_det: Optional[int] = None
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Soft-NMS applied independently per class.

    Parameters
    ----------
    boxes : (N, 4) xyxy boxes.
    scores : (N,) confidences.
    classes : (N,) integer class ids.
    iou_thr : IoU threshold for decay ("linear") / suppression ("hard").
    method : "linear", "gaussian", or "hard" ("hard" == standard NMS).
    sigma : Gaussian decay bandwidth (used only by "gaussian").
    score_thresh : boxes whose decayed score falls below this are dropped.
    max_det : optional cap on the number of returned detections.

    Returns
    -------
    (boxes, scores, classes) filtered and sorted by descending score.

    Raises
    ------
    ValueError if `method` is not one of the three supported modes.
    """
    keep_boxes, keep_scores, keep_classes = [], [], []
    for c in np.unique(classes):
        m = classes == c
        b = boxes[m].astype(np.float32).copy()
        s = scores[m].astype(np.float32).copy()
        idxs = np.arange(b.shape[0])

        kept = []
        while len(idxs):
            # Anchor on the highest-scoring remaining box.
            i = idxs[np.argmax(s[idxs])]
            kept.append(i)

            idxs = idxs[idxs != i]
            if len(idxs) == 0:
                break
            # Fix: pass b[i] directly — the previous M/Ms copies were dead
            # code (Ms was never read, and iou_xyxy does not mutate its input).
            ious = iou_xyxy(b[i], b[idxs])
            if method == "linear":
                # Decay overlapping neighbours proportionally to their overlap.
                decay = np.where(ious > iou_thr, 1.0 - ious, 1.0)
                s[idxs] *= decay
            elif method == "gaussian":
                s[idxs] *= np.exp(-(ious ** 2) / sigma)
            elif method == "hard":
                # standard NMS behaviour
                idxs = idxs[ious <= iou_thr]
            else:
                raise ValueError("method must be 'linear', 'gaussian', or 'hard'")

            # prune very low scores
            idxs = idxs[s[idxs] >= score_thresh]

        if kept:
            kb, ks = b[kept], s[kept]
            order = np.argsort(-ks)
            kb, ks = kb[order], ks[order]
            kc = np.full(len(ks), c, dtype=classes.dtype)
            keep_boxes.append(kb)
            keep_scores.append(ks)
            keep_classes.append(kc)

    if not keep_boxes:
        # No class produced any survivors: return empty, correctly-typed arrays.
        return (np.zeros((0, 4), dtype=np.float32),
                np.zeros((0,), dtype=np.float32),
                np.zeros((0,), dtype=classes.dtype))

    B = np.concatenate(keep_boxes, axis=0)
    S = np.concatenate(keep_scores, axis=0)
    C = np.concatenate(keep_classes, axis=0)

    # Global score ordering across classes, optionally truncated to max_det.
    order = np.argsort(-S)
    if max_det is not None:
        order = order[:max_det]
    return B[order], S[order], C[order]
134
+
135
def draw_detections(img: np.ndarray, boxes: np.ndarray, scores: np.ndarray, classes: np.ndarray, names: Dict[int, str]) -> np.ndarray:
    """Draw labelled boxes onto `img` in place (RGB in, RGB out)."""
    font = cv2.FONT_HERSHEY_SIMPLEX
    box_color = (0, 180, 255)
    for box, score, cls_id in zip(boxes.astype(int), scores, classes.astype(int)):
        x1, y1, x2, y2 = box
        label = f"{names.get(cls_id, str(cls_id))} {score:.2f}"
        cv2.rectangle(img, (x1, y1), (x2, y2), box_color, 2)
        (tw, th), bl = cv2.getTextSize(label, font, 0.6, 2)
        # Filled strip behind the caption keeps it readable on busy backgrounds.
        cv2.rectangle(img, (x1, y1 - th - 6), (x1 + tw + 4, y1), box_color, -1)
        cv2.putText(img, label, (x1 + 2, y1 - 4), font, 0.6, (0, 0, 0), 2, cv2.LINE_AA)
    return img
144
+
145
+ # ---------------------------
146
+ # Main tiled inference
147
+ # ---------------------------
148
+
149
def detect_tiled_softnms(
    model, image: np.ndarray,
    tile_size: int = 1024, overlap: float = 0.25,
    per_tile_conf: float = 0.25, per_tile_iou: float = 0.7,
    softnms_iou: float = 0.55, softnms_method: str = "linear", softnms_sigma: float = 0.5,
    final_conf: float = 0.25, max_det: int = 3000,
    device: Optional[str] = None, imgsz: Optional[int] = None,
    class_agnostic_nms: bool = False
) -> Dict[str, np.ndarray]:
    """
    Run a YOLO model over overlapping tiles of `image`, map every tile
    detection back to full-image coordinates, and fuse duplicates with
    class-wise Soft-NMS.

    Returns dict: {"xyxy", "conf", "cls", "names"} where "names" is the
    class-id -> label mapping taken from the model (if it has one).
    """
    assert image.ndim == 3, "image must be HxWx3"
    height, width = image.shape[:2]
    names = getattr(model, "names", {i: str(i) for i in range(1000)})

    fused_boxes, fused_scores, fused_classes = [], [], []

    for x0, y0, x1, y1 in make_overlapping_tiles(height, width, tile=tile_size, overlap=overlap):
        crop = image[y0:y1, x0:x1]
        # Ultralytics reports boxes in the tile's own pixel coordinates (pre-letterbox).
        preds = model.predict(
            source=crop,
            conf=per_tile_conf,
            iou=per_tile_iou,
            imgsz=imgsz,  # None -> model default
            device=device,
            verbose=False
        )
        if not preds:
            continue
        det = preds[0]
        if det.boxes is None or det.boxes.shape[0] == 0:
            continue

        xyxy = det.boxes.xyxy.cpu().numpy()
        conf = det.boxes.conf.cpu().numpy()
        cls = det.boxes.cls.cpu().numpy().astype(int)

        # Shift from tile coordinates into full-image coordinates, then clip.
        xyxy[:, [0, 2]] += x0
        xyxy[:, [1, 3]] += y0
        xyxy[:, 0] = np.clip(xyxy[:, 0], 0, width - 1)
        xyxy[:, 1] = np.clip(xyxy[:, 1], 0, height - 1)
        xyxy[:, 2] = np.clip(xyxy[:, 2], 0, width - 1)
        xyxy[:, 3] = np.clip(xyxy[:, 3], 0, height - 1)

        # Drop boxes that collapsed to zero width/height after clipping.
        ok = (xyxy[:, 2] > xyxy[:, 0]) & (xyxy[:, 3] > xyxy[:, 1])
        if not np.any(ok):
            continue
        fused_boxes.append(xyxy[ok])
        fused_scores.append(conf[ok])
        fused_classes.append(cls[ok])

    if not fused_boxes:
        return {"xyxy": np.zeros((0, 4), dtype=np.float32),
                "conf": np.zeros((0,), dtype=np.float32),
                "cls": np.zeros((0,), dtype=np.int32),
                "names": names}

    boxes = np.concatenate(fused_boxes, axis=0).astype(np.float32)
    scores = np.concatenate(fused_scores, axis=0).astype(np.float32)
    classes = np.concatenate(fused_classes, axis=0).astype(np.int32)

    if class_agnostic_nms:
        # Collapse all ids to a single class so NMS suppresses across classes.
        classes = np.zeros_like(classes)

    boxes, scores, classes = soft_nms_classwise(
        boxes, scores, classes,
        iou_thr=softnms_iou,
        method=softnms_method,
        sigma=softnms_sigma,
        score_thresh=1e-3,
        max_det=max_det
    )

    # Final confidence gate.
    keep = scores >= final_conf
    return {"xyxy": boxes[keep], "conf": scores[keep],
            "cls": classes[keep], "names": names}
tiling_test.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Dict, Optional, Tuple
3
+ import numpy as np
4
+ import pandas as pd
5
+ import cv2
6
+
7
+ # --- Parse YOLO txt (normalized) -> pixel xyxy ---
8
def load_yolo_labels_xyxy(txt_path: str, img_w: int, img_h: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Parse a YOLO-format label file (normalized `cls cx cy w h` per line)
    into pixel-space xyxy boxes.

    Returns:
        cls_ids: (N,) int32
        boxes_xyxy: (N, 4) float32 in pixel coords
    """
    ids = []
    rects = []
    with open(txt_path, "r") as fh:
        for raw in fh:
            fields = raw.strip().split()
            if len(fields) != 5:
                continue  # silently skip malformed lines
            cls_id = int(float(fields[0]))
            cx, cy, bw, bh = (float(v) for v in fields[1:])
            # normalized centre/size -> pixel corner coordinates
            px, py = cx * img_w, cy * img_h
            half_w = bw * img_w / 2.0
            half_h = bh * img_h / 2.0
            rects.append([px - half_w, py - half_h, px + half_w, py + half_h])
            ids.append(cls_id)
    if not rects:
        return np.zeros((0,), dtype=np.int32), np.zeros((0, 4), dtype=np.float32)
    return np.array(ids, dtype=np.int32), np.array(rects, dtype=np.float32)
37
+
38
+ # --- IoU & matching ---
39
def iou_matrix(a_xyxy: np.ndarray, b_xyxy: np.ndarray) -> np.ndarray:
    """Pairwise IoU between two box sets: (Na,4) x (Nb,4) -> (Na,Nb) float32."""
    if a_xyxy.size == 0 or b_xyxy.size == 0:
        return np.zeros((a_xyxy.shape[0], b_xyxy.shape[0]), dtype=np.float32)
    # Keep `a` columns as (Na,1) so they broadcast against (Nb,) rows of `b`.
    ax1, ay1, ax2, ay2 = (a_xyxy[:, k:k + 1] for k in range(4))
    bx1, by1, bx2, by2 = (b_xyxy[:, k] for k in range(4))
    left = np.maximum(ax1, bx1)
    top = np.maximum(ay1, by1)
    right = np.minimum(ax2, bx2)
    bottom = np.minimum(ay2, by2)
    inter = np.maximum(0, right - left) * np.maximum(0, bottom - top)
    area_a = (ax2 - ax1) * (ay2 - ay1)
    area_b = (bx2 - bx1) * (by2 - by1)
    union = np.maximum(1e-9, area_a + area_b - inter)
    return (inter / union).astype(np.float32)
54
+
55
def greedy_match_per_class(
    pred_boxes: np.ndarray, pred_scores: np.ndarray, pred_cls: np.ndarray,
    gt_boxes: np.ndarray, gt_cls: np.ndarray,
    iou_thr: float
):
    """
    Greedy IoU matching restricted to same-class pairs.

    Returns:
        matches: list of (pred_idx, gt_idx) pairs
        pred_unmatched: np.ndarray of unmatched prediction indices
        gt_unmatched: np.ndarray of unmatched ground-truth indices
    """
    matches = []
    pred_open = np.ones(len(pred_boxes), dtype=bool)
    gt_open = np.ones(len(gt_boxes), dtype=bool)

    for c in np.union1d(pred_cls, gt_cls):
        p_idx = np.where(pred_cls == c)[0]
        g_idx = np.where(gt_cls == c)[0]
        if len(p_idx) == 0 or len(g_idx) == 0:
            continue

        iou = iou_matrix(pred_boxes[p_idx], gt_boxes[g_idx])
        taken_p = set()
        taken_g = set()
        # Repeatedly claim the best remaining pair until IoU drops below the gate.
        while iou.size:
            best = np.max(iou)
            if best < iou_thr:
                break
            i, j = np.unravel_index(np.argmax(iou), iou.shape)
            if (i in taken_p) or (j in taken_g):
                # Defensive: should not trigger after row/column wipe below.
                iou[i, j] = -1.0
                continue
            matches.append((p_idx[i], g_idx[j]))
            taken_p.add(i)
            taken_g.add(j)
            iou[i, :] = -1.0
            iou[:, j] = -1.0

        # Matched entries are no longer "open" (unmatched).
        for i in taken_p:
            pred_open[p_idx[i]] = False
        for j in taken_g:
            gt_open[g_idx[j]] = False

    return matches, np.where(pred_open)[0], np.where(gt_open)[0]
103
+
104
+ # --- Count metrics (optional but handy) ---
105
def count_metrics(actual_counts: Dict[int, int], pred_counts: Dict[int, int]) -> Tuple[pd.DataFrame, Dict]:
    """
    Per-class and micro-averaged count metrics: min(actual, pred) counts as
    TP, the surplus on either side as FP/FN, plus sMAPE on the raw counts.

    Returns (per-class DataFrame, overall dict).
    """
    labels = sorted(set(actual_counts) | set(pred_counts))
    rows = []
    tp_total = fp_total = fn_total = 0
    abs_total = 0
    denom_total = 0
    for label in labels:
        actual = int(actual_counts.get(label, 0))
        pred = int(pred_counts.get(label, 0))
        tp = min(actual, pred)
        fp = max(pred - actual, 0)
        fn = max(actual - pred, 0)
        abs_err = abs(pred - actual)
        denom = (abs(actual) + abs(pred)) / 2 if (actual + pred) > 0 else 1.0
        smape = abs_err / denom
        prec = tp / (tp + fp) if (tp + fp) > 0 else float('nan')
        rec = tp / (tp + fn) if (tp + fn) > 0 else float('nan')
        f1 = (2 * prec * rec / (prec + rec)
              if (not math.isnan(prec) and not math.isnan(rec) and (prec + rec) > 0)
              else float('nan'))
        rows.append({"class_id": label, "actual": actual, "pred": pred,
                     "abs_err": abs_err, "sMAPE": smape, "P": prec, "R": rec, "F1": f1})
        tp_total += tp
        fp_total += fp
        fn_total += fn
        abs_total += abs_err
        denom_total += denom
    micro_p = tp_total / (tp_total + fp_total) if (tp_total + fp_total) > 0 else float('nan')
    micro_r = tp_total / (tp_total + fn_total) if (tp_total + fn_total) > 0 else float('nan')
    micro_f1 = (2 * micro_p * micro_r / (micro_p + micro_r)
                if (not math.isnan(micro_p) and not math.isnan(micro_r) and (micro_p + micro_r) > 0)
                else float('nan'))
    overall = {"sum_abs_count_error": abs_total,
               "micro_precision": micro_p,
               "micro_recall": micro_r,
               "micro_f1": micro_f1,
               "micro_sMAPE": abs_total / (denom_total or 1.0)}
    return pd.DataFrame(rows), overall
128
+
129
+ # --- Pretty eval for ONE image ---
130
def evaluate_one_image(
    out: Dict,                      # output dict of detect_tiled_softnms(...)
    label_txt_path: str,
    img_w: int, img_h: int,
    iou_thr: float = 0.50,
    conf_thr: float = 0.25,
    return_vis: bool = False,
    image_rgb: Optional[np.ndarray] = None
):
    """
    Score one image's tiled detections against its YOLO label file.

    Returns:
        det_df: per-class detection metrics (IoU-matched P/R/F1)
        overall: micro-averaged detection metrics
        count_df, count_overall: count-based metrics (no localization)
        vis (only when return_vis and image_rgb given): annotated RGB image
    """
    # --- predictions, gated by confidence ---
    boxes_p = out["xyxy"].astype(np.float32)
    scores_p = out["conf"].astype(np.float32)
    cls_p = out["cls"].astype(np.int32)
    mask = scores_p >= float(conf_thr)
    boxes_p, scores_p, cls_p = boxes_p[mask], scores_p[mask], cls_p[mask]
    names: Dict[int, str] = out.get("names", {})

    # --- ground truth ---
    cls_g, boxes_g = load_yolo_labels_xyxy(label_txt_path, img_w, img_h)

    # --- per-class count sanity metrics ---
    actual_counts = {int(c): int((cls_g == c).sum()) for c in np.unique(cls_g)} if len(cls_g) else {}
    pred_counts = {int(c): int((cls_p == c).sum()) for c in np.unique(cls_p)} if len(cls_p) else {}
    count_df, count_overall = count_metrics(actual_counts, pred_counts)

    # --- IoU matching ---
    matches, unmatched_p, unmatched_g = greedy_match_per_class(
        boxes_p, scores_p, cls_p, boxes_g, cls_g, iou_thr=iou_thr
    )
    matched_p = np.array([m[0] for m in matches], dtype=int) if matches else np.array([], dtype=int)

    # --- per-class detection metrics ---
    rows = []
    for c in sorted(set(list(actual_counts.keys()) + list(pred_counts.keys()))):
        tp = int(np.sum(cls_p[matched_p] == c))  # matched pairs are already class-consistent
        fp = int(np.sum((cls_p == c))) - tp
        fn = int(np.sum((cls_g == c))) - tp
        prec = tp / (tp + fp) if (tp + fp) > 0 else float('nan')
        rec = tp / (tp + fn) if (tp + fn) > 0 else float('nan')
        f1 = (2 * prec * rec / (prec + rec)
              if (not math.isnan(prec) and not math.isnan(rec) and (prec + rec) > 0)
              else float('nan'))
        rows.append({
            "class_id": c,
            "class_name": names.get(c, str(c)),
            "gt": int(np.sum(cls_g == c)),
            "pred": int(np.sum(cls_p == c)),
            "TP": tp, "FP": fp, "FN": fn,
            "precision": prec, "recall": rec, "F1": f1
        })
    det_df = pd.DataFrame(rows).sort_values("class_id").reset_index(drop=True)

    # --- overall micro-averages ---
    TP = int(len(matches))
    FP = int(len(boxes_p) - TP)
    FN = int(len(boxes_g) - TP)
    micro_p = TP / (TP + FP) if (TP + FP) > 0 else float('nan')
    micro_r = TP / (TP + FN) if (TP + FN) > 0 else float('nan')
    micro_f1 = (2 * micro_p * micro_r / (micro_p + micro_r)
                if (not math.isnan(micro_p) and not math.isnan(micro_r) and (micro_p + micro_r) > 0)
                else float('nan'))
    overall = {
        "gt_instances": int(len(boxes_g)),
        "pred_instances": int(len(boxes_p)),
        "TP": TP, "FP": FP, "FN": FN,
        "micro_precision": micro_p,
        "micro_recall": micro_r,
        "micro_F1": micro_f1,
        "iou_thr": iou_thr,
        "conf_thr": conf_thr
    }

    if not return_vis or image_rgb is None:
        return det_df, overall, count_df, count_overall

    # --- visualization: GT yellow, matched preds green, unmatched preds red ---
    vis = image_rgb.copy()
    for gt_box in boxes_g:
        x1, y1, x2, y2 = gt_box.astype(int)
        cv2.rectangle(vis, (x1, y1), (x2, y2), (240, 230, 70), 2)
    for pi in matched_p:
        x1, y1, x2, y2 = boxes_p[pi].astype(int)
        c = int(cls_p[pi]); sc = float(scores_p[pi])
        label = f"{names.get(c, str(c))} {sc:.2f}"
        cv2.rectangle(vis, (x1, y1), (x2, y2), (60, 220, 60), 2)
        cv2.putText(vis, label, (x1 + 2, max(0, y1 - 5)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (60, 220, 60), 2, cv2.LINE_AA)
    for pi in unmatched_p:
        x1, y1, x2, y2 = boxes_p[pi].astype(int)
        c = int(cls_p[pi]); sc = float(scores_p[pi])
        label = f"{names.get(c, str(c))} {sc:.2f}"
        cv2.rectangle(vis, (x1, y1), (x2, y2), (10, 60, 240), 2)
        cv2.putText(vis, label, (x1 + 2, max(0, y1 - 5)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (10, 60, 240), 2, cv2.LINE_AA)
    return det_df, overall, count_df, count_overall, vis