# Spaces: DariusGiannoli / PerceptionBenchmark (Running) - File size: 15,633 Bytes

"""
src/localization.py  —  Localization Strategy Library
=====================================================
Five strategies that decide WHERE to evaluate a recognition head.
The head stays the same — only the search method changes.

Strategies
----------
1. Exhaustive Sliding Window  — brute-force grid scan
2. Image Pyramid              — multi-scale resize + sliding window
3. Coarse-to-Fine Search      — two-pass hierarchical refinement
4. Contour Proposals          — edge-driven candidate regions
5. Template Matching          — OpenCV cross-correlation (no head)

Every function returns the tuple
    (detections, n_proposals, elapsed_ms, heatmap)
except ``contour_proposals``, which appends its edge map as a fifth element.
"""

import cv2
import numpy as np
import time


# ===================================================================
#  Shared utilities
# ===================================================================

def nms(dets, iou_thresh):
    """
    Greedy non-maximum suppression.

    *dets* is a list of ``(x1, y1, x2, y2, label, conf)`` tuples.  Boxes are
    visited from highest to lowest confidence; a box survives only when its
    IoU with every box already kept is strictly below *iou_thresh*.
    Labels are not consulted, so suppression happens across classes.

    Returns the surviving boxes, ordered by descending confidence.
    """
    ranked = sorted(dets, key=lambda det: det[5], reverse=True)
    survivors = []
    for candidate in ranked:
        # An empty ``survivors`` list makes all() true, so the top box is
        # always kept.
        if all(_iou(kept, candidate) < iou_thresh for kept in survivors):
            survivors.append(candidate)
    return survivors


def _iou(a, b):
    xi1, yi1 = max(a[0], b[0]), max(a[1], b[1])
    xi2, yi2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0, xi2 - xi1) * max(0, yi2 - yi1)
    aa = (a[2] - a[0]) * (a[3] - a[1])
    ab = (b[2] - b[0]) * (b[3] - b[1])
    return inter / (aa + ab - inter + 1e-6)


# ===================================================================
#  1. Exhaustive Sliding Window
# ===================================================================

def exhaustive_sliding_window(image, win_h, win_w, feature_fn, head,
                               stride, conf_thresh, nms_iou):
    """
    Score a fixed-size window at every grid position of the image.

    The window's top-left corner visits every multiple of *stride* for
    which the window still fits inside the image.  Each patch goes through
    ``feature_fn`` and ``head.predict``, which must return ``(label, conf)``.

    Any non-``"background"`` prediction raises the heatmap under its window
    to at least *conf*; predictions with ``conf >= conf_thresh`` also become
    detections, which are de-duplicated with NMS at *nms_iou*.

    Returns ``(detections, n_proposals, elapsed_ms, heatmap)``.  The timer
    stops before NMS runs.
    """
    img_h, img_w = image.shape[:2]
    heatmap = np.zeros((img_h, img_w), dtype=np.float32)
    hits = []
    n_proposals = 0
    start = time.perf_counter()

    row_starts = range(0, img_h - win_h + 1, stride)
    col_starts = range(0, img_w - win_w + 1, stride)
    for top in row_starts:
        rows = slice(top, top + win_h)
        for left in col_starts:
            window = (rows, slice(left, left + win_w))
            label, conf = head.predict(feature_fn(image[window]))
            n_proposals += 1
            if label == "background":
                continue
            heatmap[window] = np.maximum(heatmap[window], conf)
            if conf >= conf_thresh:
                hits.append((left, top, left + win_w, top + win_h,
                             label, conf))

    elapsed_ms = (time.perf_counter() - start) * 1000
    detections = nms(hits, nms_iou) if hits else hits
    return detections, n_proposals, elapsed_ms, heatmap


# ===================================================================
#  2. Image Pyramid
# ===================================================================

def image_pyramid(image, win_h, win_w, feature_fn, head,
                  stride, conf_thresh, nms_iou,
                  scales=(0.5, 0.75, 1.0, 1.25, 1.5)):
    """
    Sliding-window scan repeated over several resized copies of the image.

    For every factor in *scales* the image is resized with ``cv2.resize``
    and scanned with a ``win_h × win_w`` window at *stride*.  Levels that
    are smaller than the window are skipped.  Window corners are divided by
    the scale factor to express them in original-image pixels (the far
    corner is clamped to the image bounds) before they are written to the
    heatmap or recorded as detections.

    Returns ``(detections, n_proposals, elapsed_ms, heatmap)``.  The timer
    stops before NMS runs.
    """
    img_h, img_w = image.shape[:2]
    heatmap = np.zeros((img_h, img_w), dtype=np.float32)
    hits = []
    n_proposals = 0
    start = time.perf_counter()

    for scale in scales:
        level_h = int(img_h * scale)
        level_w = int(img_w * scale)
        if level_h < win_h or level_w < win_w:
            # The window no longer fits at this level.
            continue
        level = cv2.resize(image, (level_w, level_h))

        for top in range(0, level_h - win_h + 1, stride):
            for left in range(0, level_w - win_w + 1, stride):
                crop = level[top:top + win_h, left:left + win_w]
                label, conf = head.predict(feature_fn(crop))
                n_proposals += 1
                if label == "background":
                    continue

                # Window corners expressed in original-image pixels.
                x1 = int(left / scale)
                y1 = int(top / scale)
                x2 = min(int((left + win_w) / scale), img_w)
                y2 = min(int((top + win_h) / scale), img_h)

                region = (slice(y1, y2), slice(x1, x2))
                heatmap[region] = np.maximum(heatmap[region], conf)
                if conf >= conf_thresh:
                    hits.append((x1, y1, x2, y2, label, conf))

    elapsed_ms = (time.perf_counter() - start) * 1000
    detections = nms(hits, nms_iou) if hits else hits
    return detections, n_proposals, elapsed_ms, heatmap


# ===================================================================
#  3. Coarse-to-Fine Search
# ===================================================================

def coarse_to_fine(image, win_h, win_w, feature_fn, head,
                   fine_stride, conf_thresh, nms_iou,
                   coarse_factor=4, refine_radius=2):
    """
    Two-pass hierarchical search.

    Pass 1 — Scan at *coarse_factor × fine_stride* to cheaply identify
             hot regions (using a relaxed threshold of 0.7 × conf_thresh).
    Pass 2 — Re-scan **only** the neighbourhood of each hit at
             *fine_stride*, within *refine_radius* steps in each direction.

    Every window position is scored by the head at most once: results from
    pass 1 are cached and reused when pass 2 lands on the same position
    (which always happens for the hot spot itself).  ``n_proposals``
    therefore equals the number of ``head.predict`` calls.  Reusing cached
    scores assumes ``feature_fn`` and ``head.predict`` are deterministic.

    Returns ``(detections, n_proposals, elapsed_ms, heatmap)``.  The timer
    stops before NMS runs.
    """
    H, W = image.shape[:2]
    heatmap = np.zeros((H, W), dtype=np.float32)
    detections = []
    n_proposals = 0
    t0 = time.perf_counter()

    coarse_stride = fine_stride * coarse_factor

    # --- Pass 1: coarse ---
    coarse_scores = {}  # (x, y) -> (label, conf) for every coarse window
    hot_spots = []
    for y in range(0, H - win_h + 1, coarse_stride):
        for x in range(0, W - win_w + 1, coarse_stride):
            patch = image[y:y + win_h, x:x + win_w]
            label, conf = head.predict(feature_fn(patch))
            coarse_scores[(x, y)] = (label, conf)
            n_proposals += 1
            if label != "background" and conf >= conf_thresh * 0.7:
                hot_spots.append((x, y))
                heatmap[y:y + win_h, x:x + win_w] = np.maximum(
                    heatmap[y:y + win_h, x:x + win_w], conf)

    # --- Pass 2: fine around hot spots ---
    visited = set()
    for hx, hy in hot_spots:
        for dy in range(-refine_radius, refine_radius + 1):
            for dx in range(-refine_radius, refine_radius + 1):
                x = hx + dx * fine_stride
                y = hy + dy * fine_stride
                if (x, y) in visited:
                    continue
                if x < 0 or y < 0 or x + win_w > W or y + win_h > H:
                    continue
                visited.add((x, y))

                cached = coarse_scores.get((x, y))
                if cached is None:
                    patch = image[y:y + win_h, x:x + win_w]
                    label, conf = head.predict(feature_fn(patch))
                    n_proposals += 1
                else:
                    # Already scored in pass 1 — reuse instead of paying
                    # for (and counting) a second head evaluation.
                    label, conf = cached

                if label != "background":
                    heatmap[y:y + win_h, x:x + win_w] = np.maximum(
                        heatmap[y:y + win_h, x:x + win_w], conf)
                    if conf >= conf_thresh:
                        detections.append((x, y, x + win_w, y + win_h,
                                           label, conf))

    elapsed_ms = (time.perf_counter() - t0) * 1000
    if detections:
        detections = nms(detections, nms_iou)
    return detections, n_proposals, elapsed_ms, heatmap


# ===================================================================
#  4. Contour Proposals
# ===================================================================

def contour_proposals(image, win_h, win_w, feature_fn, head,
                      conf_thresh, nms_iou,
                      canny_low=50, canny_high=150,
                      area_tolerance=3.0):
    """
    Generate candidate regions from image structure:
    Canny edges → morphological closing → external contour extraction.

    A contour is kept when its area (``cv2.contourArea``) lies between
    ``win_h * win_w / area_tolerance`` and ``win_h * win_w * area_tolerance``.
    A ``win_h × win_w`` window is centred on the contour's bounding box
    (shifted to stay inside the image) and scored with the head.

    *image* may be single-channel, 3-channel or 4-channel; colour inputs
    are converted with OpenCV's BGR(A)→gray conversion.

    Returns
    -------
    (detections, n_proposals, elapsed_ms, heatmap, edges)
        Unlike the other strategies this is a **five**-element tuple: the
        closed Canny edge map is appended after the heatmap so the caller
        can visualise it.  The timer stops before NMS runs.
    """
    H, W = image.shape[:2]
    heatmap = np.zeros((H, W), dtype=np.float32)
    detections = []
    n_proposals = 0
    t0 = time.perf_counter()

    # cvtColor(BGR2GRAY) rejects inputs that are already single-channel,
    # so pick the conversion from the channel count.
    if image.ndim == 2:
        gray = image
    elif image.shape[2] == 1:
        gray = np.ascontiguousarray(image[:, :, 0])
    elif image.shape[2] == 4:
        gray = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
    else:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blurred, canny_low, canny_high)
    # Closing bridges small gaps so nearby edge fragments form one contour.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    edges = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel)

    # OpenCV 3.x returns (image, contours, hierarchy); 4.x returns
    # (contours, hierarchy).  Index [-2] is the contour list in both.
    contours = cv2.findContours(edges, cv2.RETR_EXTERNAL,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]

    target_area = win_h * win_w
    min_area = target_area / area_tolerance
    max_area = target_area * area_tolerance

    for cnt in contours:
        area = cv2.contourArea(cnt)
        if area < min_area or area > max_area:
            continue
        bx, by, bw, bh = cv2.boundingRect(cnt)
        # Centre a window on the bounding-box centre, clamped to the image.
        cx, cy = bx + bw // 2, by + bh // 2
        px = max(0, min(cx - win_w // 2, W - win_w))
        py = max(0, min(cy - win_h // 2, H - win_h))

        patch = image[py:py + win_h, px:px + win_w]
        # The patch is short only when the image is smaller than the window.
        if patch.shape[0] != win_h or patch.shape[1] != win_w:
            continue

        feats = feature_fn(patch)
        label, conf = head.predict(feats)
        n_proposals += 1

        if label != "background":
            heatmap[py:py + win_h, px:px + win_w] = np.maximum(
                heatmap[py:py + win_h, px:px + win_w], conf)
            if conf >= conf_thresh:
                detections.append((px, py, px + win_w, py + win_h,
                                   label, conf))

    elapsed_ms = (time.perf_counter() - t0) * 1000
    if detections:
        detections = nms(detections, nms_iou)
    return detections, n_proposals, elapsed_ms, heatmap, edges


# ===================================================================
#  5. Template Matching
# ===================================================================

def template_matching(image, template, conf_thresh, nms_iou,
                      method=cv2.TM_CCOEFF_NORMED):
    """
    Locate *template* in *image* with ``cv2.matchTemplate``.
    No trained head — pure pixel similarity between *template* and every
    image position.

    The raw response is converted to a score map in [0, 1] where **higher
    always means a better match**:

    * ``TM_CCOEFF_NORMED`` / ``TM_CCORR_NORMED`` — clipped to [0, 1].
    * ``TM_SQDIFF_NORMED`` — clipped to [0, 1] and inverted (a squared
      difference of 0 is a perfect match).
    * ``TM_CCOEFF`` / ``TM_CCORR`` — min-max normalised.
    * ``TM_SQDIFF`` — min-max normalised and inverted.

    Every position whose score is ``>= conf_thresh`` becomes a detection
    labelled ``"object"``; detections are then reduced with NMS.

    If the template is larger than the image in either dimension, an empty
    result is returned (no detections, zero proposals, all-zero heatmap),
    matching how the sliding-window strategies treat an oversized window.

    Returns ``(detections, n_proposals, elapsed_ms, heatmap)``.  The timer
    stops before NMS runs.
    """
    H, W = image.shape[:2]
    th, tw = template.shape[:2]
    t0 = time.perf_counter()

    if th > H or tw > W:
        # matchTemplate would raise here; there is simply nowhere to look.
        elapsed_ms = (time.perf_counter() - t0) * 1000
        return [], 0, elapsed_ms, np.zeros((H, W), dtype=np.float32)

    result = cv2.matchTemplate(image, template, method)

    if method in (cv2.TM_CCOEFF_NORMED, cv2.TM_CCORR_NORMED):
        score_map = np.clip(result, 0, 1)
    elif method == cv2.TM_SQDIFF_NORMED:
        # Lower is better for squared difference — flip it.
        score_map = 1.0 - np.clip(result, 0, 1)
    else:
        lo, hi = result.min(), result.max()
        score_map = (result - lo) / (hi - lo + 1e-6)
        if method == cv2.TM_SQDIFF:
            score_map = 1.0 - score_map
    score_map = score_map.astype(np.float32)

    # Full-size heatmap (resize for visualisation): the response map is
    # (H - th + 1) × (W - tw + 1), so it is stretched to the image size.
    heatmap = cv2.resize(score_map, (W, H), interpolation=cv2.INTER_LINEAR)

    # Extract detections above threshold; (x, y) is the template's
    # top-left corner in image coordinates.
    detections = []
    locs = np.where(score_map >= conf_thresh)
    for y, x in zip(*locs):
        detections.append((int(x), int(y), int(x + tw), int(y + th),
                           "object", float(score_map[y, x])))

    n_proposals = score_map.shape[0] * score_map.shape[1]
    elapsed_ms = (time.perf_counter() - t0) * 1000

    if detections:
        detections = nms(detections, nms_iou)
    return detections, n_proposals, elapsed_ms, heatmap


# ===================================================================
#  Registry  —  metadata used by the Streamlit page
# ===================================================================

# Maps each strategy's display name to its metadata:
#   "icon"       — emoji associated with the strategy
#   "fn"         — the strategy function defined in this module
#   "needs_head" — False only for template matching, whose function takes
#                  a template image instead of feature_fn / head
#   "short"      — one-line summary
#   "detail"     — longer description written with Markdown / LaTeX markup
# NOTE: the "fn" callables do not share one signature (template_matching
# takes a template; contour_proposals returns a five-element tuple), so
# callers must dispatch per strategy.
STRATEGIES = {
    "Exhaustive Sliding Window": {
        "icon": "🔲",
        "fn":   exhaustive_sliding_window,
        "needs_head": True,
        "short": "Brute-force grid scan at every stride position.",
        "detail": (
            "The simplest approach: a fixed-size window slides across the "
            "**entire image** at regular intervals.  At every position the "
            "patch is extracted, features are computed, and the head classifies it.\n\n"
            "**Complexity:** $O\\!\\left(\\frac{W}{s} \\times \\frac{H}{s}\\right)$ "
            "where $s$ = stride.\n\n"
            "**Pro:** Guaranteed to evaluate every location — nothing is missed.\n\n"
            "**Con:** Extremely slow on large images or small strides."
        ),
    },
    "Image Pyramid": {
        "icon": "🔺",
        "fn":   image_pyramid,
        "needs_head": True,
        "short": "Multi-scale resize + sliding window.",
        "detail": (
            # The implementation only resizes (no Gaussian smoothing), so
            # this is described as a plain image pyramid.
            "Builds an **image pyramid** by resizing the image to several "
            "scales (e.g. 50 %, 75 %, 100 %, 125 %, 150 %).  A sliding-window "
            "scan runs at each level and detections are mapped back to original "
            "coordinates.\n\n"
            "**Why:** The training crop has a fixed size.  If the real object "
            "appears larger or smaller in the scene, a single-scale scan will "
            "miss it.  The pyramid handles **scale variation**.\n\n"
            "**Cost:** Multiplies the number of proposals by the number of "
            "scales — slower than single-scale exhaustive."
        ),
    },
    "Coarse-to-Fine": {
        "icon": "🎯",
        "fn":   coarse_to_fine,
        "needs_head": True,
        "short": "Two-pass hierarchical refinement.",
        "detail": (
            "**Pass 1 — Coarse:** Scans the image with a large stride "
            "(coarse\\_factor × fine\\_stride) using a relaxed confidence "
            "threshold (70 % of the target) to cheaply identify *hot regions*.\n\n"
            "**Pass 2 — Fine:** Re-scans **only** the neighbourhood around "
            "each coarse hit at the fine stride, within *refine\\_radius* steps "
            "in each direction.\n\n"
            "**Speedup:** Typically **3–10×** faster than exhaustive when the "
            "object is spatially sparse (i.e. most of the image is background)."
        ),
    },
    "Contour Proposals": {
        "icon": "✏️",
        "fn":   contour_proposals,
        "needs_head": True,
        "short": "Edge-driven candidate regions scored by head.",
        "detail": (
            "Instead of scanning everywhere, this method lets **image "
            "structure** drive the search:\n\n"
            "1. Canny edge detection\n"
            "2. Morphological closing to bridge nearby edges\n"
            "3. External contour extraction\n"
            "4. Filter contours whose area falls within *area\\_tolerance* "
            "of the window area\n"
            "5. Centre a window on each surviving contour and score with "
            "the trained head\n\n"
            "**Proposals evaluated:** Typically 10–100× fewer than exhaustive. "
            "Speed depends on scene complexity (more edges → more proposals)."
        ),
    },
    "Template Matching": {
        "icon": "📋",
        "fn":   template_matching,
        "needs_head": False,
        "short": "OpenCV cross-correlation — no head needed.",
        "detail": (
            "Classical **normalised cross-correlation** (NCC).  Slides the "
            "crop template over the image computing pixel-level similarity "
            "at every position.  No trained head is involved.\n\n"
            "**Speed:** Runs entirely in OpenCV's optimised C++ backend — "
            "orders of magnitude faster than Python-level loops.\n\n"
            "**Limitation:** Not invariant to rotation, scale, or illumination "
            "changes.  Works best when the object appears at the **exact same "
            "size and orientation** as the crop."
        ),
    },
}