import cv2
import numpy as np
import onnxruntime as ort

# --- Config (from model.phd.cfg) ---
MODEL_PATH = "yolov11_phd_s.onnx"
LABEL_PATH = "../models/crowd_human.names"
IMAGE_PATH = "test_image.jpg"
CONF_THRESHOLD = 0.2        # pre-cluster-threshold
IOU_THRESHOLD = 0.6         # nms-iou-threshold
NET_SCALE_FACTOR = 0.0039215697906911373  # net-scale-factor (≈1/255)
MODEL_COLOR_FORMAT = 0      # model-color-format (0 = RGB in nvinfer terms)
TOPK = 300                  # topk
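
# The values above mirror nvinfer-style config keys; the original model.phd.cfg
# presumably contains lines like the following (a sketch, not the verbatim file):
#   pre-cluster-threshold=0.2
#   nms-iou-threshold=0.6
#   net-scale-factor=0.0039215697906911373
#   model-color-format=0
#   topk=300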

def load_labels(label_path):
    with open(label_path) as f:
        return [line.strip() for line in f if line.strip()]

def load_model(model_path):
    session = ort.InferenceSession(
        model_path,
        providers=["CUDAExecutionProvider", "CPUExecutionProvider"]  # gpu-id=0, CPU fallback
    )
    input_meta = session.get_inputs()[0]
    input_name = input_meta.name
    _, _, h, w = input_meta.shape   # NCHW → extract H, W
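    # Dynamically exported models report symbolic dims (strings/None) here;
    # this assumes a fixed-shape export, falling back to 640x640 (a common
    # YOLO input size) if the dims are not concrete integers
    if not (isinstance(h, int) and isinstance(w, int)):
        h, w = 640, 640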
    return session, input_name, (h, w)

def preprocess(image, input_size):
    """Letterbox resize + normalize."""
    h_in, w_in = input_size
    h_orig, w_orig = image.shape[:2]

    # Letterbox scaling (preserves aspect ratio)
    scale = min(w_in / w_orig, h_in / h_orig)
    new_w, new_h = int(w_orig * scale), int(h_orig * scale)
    resized = cv2.resize(image, (new_w, new_h))

    # Pad to input size
    canvas = np.full((h_in, w_in, 3), 114, dtype=np.uint8)
    pad_top = (h_in - new_h) // 2
    pad_left = (w_in - new_w) // 2
    canvas[pad_top:pad_top + new_h, pad_left:pad_left + new_w] = resized
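    # Worked example: a 1920x1080 frame into a 640x640 net gives
    # scale = min(640/1920, 640/1080) ≈ 0.333 → resized to 640x360,
    # pad_top = (640 - 360) // 2 = 140, pad_left = 0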

    # Normalize — model-color-format=0 means the model expects RGB, while
    # OpenCV reads BGR, so swap channels before scaling by net-scale-factor
    img = cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB).astype(np.float32) * NET_SCALE_FACTOR
    img = np.transpose(img, (2, 0, 1))                    # HWC → CHW
    img = np.expand_dims(img, axis=0)                     # Add batch dim

    return img, scale, pad_top, pad_left

def postprocess(output, scale, pad_top, pad_left, conf_thresh, iou_thresh):
    """
    YOLOv11 output shape: (1, 4 + num_classes, num_anchors)
    For dual-class (person + head): (1, 6, 8400)
    """
    preds = output[0]           # (1, 6, 8400)
    preds = preds[0]            # (6, 8400)
    preds = preds.T             # (8400, 6) → each row = one anchor

    boxes_raw = preds[:, :4]    # cx, cy, w, h
    class_scores = preds[:, 4:] # (8400, 2) — one score per class

    # Best class per anchor
    class_ids = np.argmax(class_scores, axis=1)
    scores = class_scores[np.arange(len(class_scores)), class_ids]

    # Filter by confidence
    mask = scores >= conf_thresh
    boxes_raw = boxes_raw[mask]
    scores = scores[mask]
    class_ids = class_ids[mask]

    if len(scores) == 0:
        return []

    # Convert cx,cy,w,h → x1,y1,x2,y2 and undo letterbox
    x1 = (boxes_raw[:, 0] - boxes_raw[:, 2] / 2 - pad_left) / scale
    y1 = (boxes_raw[:, 1] - boxes_raw[:, 3] / 2 - pad_top)  / scale
    x2 = (boxes_raw[:, 0] + boxes_raw[:, 2] / 2 - pad_left) / scale
    y2 = (boxes_raw[:, 1] + boxes_raw[:, 3] / 2 - pad_top)  / scale

    boxes_xywh = np.stack([x1, y1, x2 - x1, y2 - y1], axis=1).astype(int)  # x, y, w, h for NMSBoxes

    # NMS with topk cap
    indices = cv2.dnn.NMSBoxes(
        boxes_xywh.tolist(), scores.tolist(), conf_thresh, iou_thresh
    )

    results = []
    # NMSBoxes returns (N, 1) arrays in older OpenCV and flat (N,) in newer;
    # normalize the shape, then cap at topk
    keep = np.asarray(indices).reshape(-1)[:TOPK]
    for idx in keep:
        x, y, w, h = boxes_xywh[idx]
        results.append({
            "bbox": (x, y, x + w, y + h),
            "score": float(scores[idx]),
            "class_id": int(class_ids[idx])
        })
    return results
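
# Note: cv2.dnn.NMSBoxes above is class-agnostic, so a high-IoU box of one
# class can suppress a box of the other. nvinfer clusters per class; a minimal
# per-class variant (a sketch, reusing the arrays from postprocess) would be:
#
#   keep = []
#   for c in np.unique(class_ids):
#       sub = np.where(class_ids == c)[0]
#       idxs = cv2.dnn.NMSBoxes(boxes_xywh[sub].tolist(),
#                               scores[sub].tolist(), conf_thresh, iou_thresh)
#       keep.extend(sub[i] for i in np.asarray(idxs).reshape(-1))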

def draw(image, detections, labels):
    for det in detections:
        x1, y1, x2, y2 = det["bbox"]
        label = labels[det["class_id"]] if labels and det["class_id"] < len(labels) else f"class{det['class_id']}"
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 200, 0), 2)
        cv2.putText(image, f"{label} {det['score']:.2f}",
                    (x1, max(y1 - 8, 0)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 200, 0), 2)
    return image

# --- Main ---
labels = load_labels(LABEL_PATH)
session, input_name, input_size = load_model(MODEL_PATH)
print(f"Model input size: {input_size}")

image = cv2.imread(IMAGE_PATH)
if image is None:
    raise FileNotFoundError(f"Could not read image: {IMAGE_PATH}")
tensor, scale, pad_top, pad_left = preprocess(image, input_size)

outputs = session.run(None, {input_name: tensor})

detections = postprocess(outputs, scale, pad_top, pad_left,
                         CONF_THRESHOLD, IOU_THRESHOLD)

print(f"Detected {len(detections)} heads")
for d in detections:
    print(f"  BBox: {d['bbox']}, Score: {d['score']:.3f}")

result = draw(image.copy(), detections, labels)
cv2.imwrite("output.jpg", result)
cv2.imshow("Detections", result)
cv2.waitKey(0)
cv2.destroyAllWindows()