PerceptionBenchmark / src /localization.py
DariusGiannoli
fix: 11 bugs — confusion matrix, multi-class localization, dedup RCE/NMS, validation guards
a2b92f9
"""
src/localization.py — Localization Strategy Library
=====================================================
Five strategies that decide WHERE to evaluate a recognition head.
The head stays the same — only the search method changes.
Strategies
----------
1. Exhaustive Sliding Window — brute-force grid scan
2. Image Pyramid — multi-scale resize + sliding window
3. Coarse-to-Fine Search — two-pass hierarchical refinement
4. Contour Proposals — edge-driven candidate regions
5. Template Matching — OpenCV cross-correlation (no head)
Every function returns the tuple
(detections, n_proposals, elapsed_ms, heatmap);
contour_proposals additionally appends its edge map as a fifth element.
"""
import cv2
import numpy as np
import time
# ===================================================================
# Shared utilities
# ===================================================================
def nms(dets, iou_thresh):
    """Greedy non-maximum suppression.

    Candidates are visited from highest to lowest confidence. A candidate
    is kept only when its IoU with every box kept so far is strictly below
    *iou_thresh*. Labels are not consulted, so suppression is
    class-agnostic.

    Args:
        dets: Iterable of ``(x1, y1, x2, y2, label, conf)`` records.
        iou_thresh: Overlap at or above which a lower-confidence box is
            dropped.

    Returns:
        List of surviving records in descending confidence order.
    """
    # sorted() is stable, so equal-confidence boxes keep their input order.
    ranked = sorted(dets, key=lambda det: det[5], reverse=True)
    survivors = []
    for candidate in ranked:
        if all(_iou(winner, candidate) < iou_thresh for winner in survivors):
            survivors.append(candidate)
    return survivors
def _iou(a, b):
    """Return the intersection-over-union of two boxes.

    Only the first four entries ``(x1, y1, x2, y2)`` of each argument are
    read, so full detection tuples can be passed directly. A small epsilon
    in the denominator avoids division by zero for zero-area boxes.
    """
    left = max(a[0], b[0])
    top = max(a[1], b[1])
    right = min(a[2], b[2])
    bottom = min(a[3], b[3])

    # Non-overlapping boxes give a negative extent, clamped to zero.
    overlap = max(0, right - left) * max(0, bottom - top)

    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    union = area_a + area_b - overlap + 1e-6
    return overlap / union
# ===================================================================
# 1. Exhaustive Sliding Window
# ===================================================================
def exhaustive_sliding_window(image, win_h, win_w, feature_fn, head,
                              stride, conf_thresh, nms_iou):
    """Score a fixed-size window at every grid position of the image.

    The window's top-left corner steps through the image in increments of
    *stride*; only windows that fit entirely inside the image are visited.

    Args:
        image: Array whose first two dimensions are (height, width).
        win_h, win_w: Window height and width in pixels.
        feature_fn: Callable mapping an image patch to features.
        head: Object whose ``predict(features)`` returns ``(label, conf)``.
        stride: Step between consecutive window positions.
        conf_thresh: Minimum confidence for a window to become a detection.
        nms_iou: IoU threshold passed to ``nms``.

    Returns:
        ``(detections, n_proposals, elapsed_ms, heatmap)`` where detections
        are ``(x1, y1, x2, y2, label, conf)`` tuples after NMS,
        *n_proposals* is the number of windows scored, *elapsed_ms* covers
        the scan only (NMS is not timed), and *heatmap* is a float32
        (H, W) map of the maximum non-background confidence per pixel.
    """
    img_h, img_w = image.shape[:2]
    heatmap = np.zeros((img_h, img_w), dtype=np.float32)
    detections = []
    n_proposals = 0

    started = time.perf_counter()
    for top in range(0, img_h - win_h + 1, stride):
        bottom = top + win_h
        for left in range(0, img_w - win_w + 1, stride):
            right = left + win_w
            label, conf = head.predict(feature_fn(image[top:bottom, left:right]))
            n_proposals += 1
            if label != "background":
                # Every non-background window contributes to the heatmap,
                # even when it is below the detection threshold.
                region = (slice(top, bottom), slice(left, right))
                heatmap[region] = np.maximum(heatmap[region], conf)
                if conf >= conf_thresh:
                    detections.append((left, top, right, bottom, label, conf))
    elapsed_ms = (time.perf_counter() - started) * 1000

    if detections:
        detections = nms(detections, nms_iou)
    return detections, n_proposals, elapsed_ms, heatmap
# ===================================================================
# 2. Image Pyramid
# ===================================================================
def image_pyramid(image, win_h, win_w, feature_fn, head,
                  stride, conf_thresh, nms_iou,
                  scales=(0.5, 0.75, 1.0, 1.25, 1.5)):
    """Run a sliding-window scan on several resized copies of the image.

    For each entry of *scales* the image is resized with ``cv2.resize``
    and scanned with a fixed ``win_h x win_w`` window. Hits are mapped
    back to original-image coordinates by dividing by the scale, so a
    window at scale 0.5 covers twice the extent in the original image.
    Scales at which the resized image is smaller than the window are
    skipped.

    Args:
        image: Array whose first two dimensions are (height, width).
        win_h, win_w: Window height and width in pixels.
        feature_fn: Callable mapping an image patch to features.
        head: Object whose ``predict(features)`` returns ``(label, conf)``.
        stride: Step between window positions, in resized-image pixels.
        conf_thresh: Minimum confidence for a window to become a detection.
        nms_iou: IoU threshold passed to ``nms``.
        scales: Resize factors to scan.

    Returns:
        ``(detections, n_proposals, elapsed_ms, heatmap)``; boxes and the
        heatmap are in original-image coordinates, and *elapsed_ms* does
        not include NMS.
    """
    img_h, img_w = image.shape[:2]
    heatmap = np.zeros((img_h, img_w), dtype=np.float32)
    detections = []
    n_proposals = 0

    started = time.perf_counter()
    for scale in scales:
        level_h = int(img_h * scale)
        level_w = int(img_w * scale)
        if level_h < win_h or level_w < win_w:
            # The window does not fit at this pyramid level.
            continue

        level = cv2.resize(image, (level_w, level_h))
        for top in range(0, level_h - win_h + 1, stride):
            for left in range(0, level_w - win_w + 1, stride):
                window = level[top:top + win_h, left:left + win_w]
                label, conf = head.predict(feature_fn(window))
                n_proposals += 1
                if label != "background":
                    # Project the window back onto the original image;
                    # the far edges are clamped to the image bounds.
                    x1 = int(left / scale)
                    y1 = int(top / scale)
                    x2 = min(int((left + win_w) / scale), img_w)
                    y2 = min(int((top + win_h) / scale), img_h)
                    region = (slice(y1, y2), slice(x1, x2))
                    heatmap[region] = np.maximum(heatmap[region], conf)
                    if conf >= conf_thresh:
                        detections.append((x1, y1, x2, y2, label, conf))
    elapsed_ms = (time.perf_counter() - started) * 1000

    if detections:
        detections = nms(detections, nms_iou)
    return detections, n_proposals, elapsed_ms, heatmap
# ===================================================================
# 3. Coarse-to-Fine Search
# ===================================================================
def coarse_to_fine(image, win_h, win_w, feature_fn, head,
                   fine_stride, conf_thresh, nms_iou,
                   coarse_factor=4, refine_radius=2):
    """Two-pass hierarchical search.

    Pass 1 scans the image at ``coarse_factor * fine_stride`` and records
    as *hot spots* the windows whose label is not ``"background"`` and
    whose confidence reaches ``0.7 * conf_thresh``.

    Pass 2 re-scans only the neighbourhood of each hot spot at
    *fine_stride*, up to *refine_radius* steps in each direction.
    Detections are emitted from pass 2 only (the hot spot itself is part
    of its own neighbourhood).

    Predictions made in pass 1 are cached and reused whenever pass 2
    lands on the same window, so each distinct window is scored at most
    once and counted once in *n_proposals*.

    Args:
        image: Array whose first two dimensions are (height, width).
        win_h, win_w: Window height and width in pixels.
        feature_fn: Callable mapping an image patch to features.
        head: Object whose ``predict(features)`` returns ``(label, conf)``.
        fine_stride: Step of the refinement pass, in pixels.
        conf_thresh: Minimum confidence for a window to become a detection.
        nms_iou: IoU threshold passed to ``nms``.
        coarse_factor: Multiplier applied to *fine_stride* for pass 1.
        refine_radius: Number of fine steps explored around each hot spot.

    Returns:
        ``(detections, n_proposals, elapsed_ms, heatmap)`` where
        *n_proposals* is the number of head evaluations and *elapsed_ms*
        does not include NMS.
    """
    H, W = image.shape[:2]
    heatmap = np.zeros((H, W), dtype=np.float32)
    detections = []
    n_proposals = 0
    t0 = time.perf_counter()

    coarse_stride = fine_stride * coarse_factor

    # --- Pass 1: coarse ---
    coarse_scores = {}  # (x, y) -> (label, conf), reused by pass 2
    hot_spots = []
    for y in range(0, H - win_h + 1, coarse_stride):
        for x in range(0, W - win_w + 1, coarse_stride):
            patch = image[y:y + win_h, x:x + win_w]
            label, conf = head.predict(feature_fn(patch))
            n_proposals += 1
            coarse_scores[(x, y)] = (label, conf)
            # Relaxed threshold: a slightly off-centre window may still
            # point at an object that the fine pass can lock onto.
            if label != "background" and conf >= conf_thresh * 0.7:
                hot_spots.append((x, y))
                heatmap[y:y + win_h, x:x + win_w] = np.maximum(
                    heatmap[y:y + win_h, x:x + win_w], conf)

    # --- Pass 2: fine around hot spots ---
    visited = set()
    for hx, hy in hot_spots:
        for dy in range(-refine_radius, refine_radius + 1):
            for dx in range(-refine_radius, refine_radius + 1):
                x = hx + dx * fine_stride
                y = hy + dy * fine_stride
                if (x, y) in visited:
                    continue
                if x < 0 or y < 0 or x + win_w > W or y + win_h > H:
                    continue
                visited.add((x, y))

                cached = coarse_scores.get((x, y))
                if cached is not None:
                    # Already scored in pass 1: reuse instead of paying
                    # for (and counting) a second evaluation.
                    label, conf = cached
                else:
                    patch = image[y:y + win_h, x:x + win_w]
                    label, conf = head.predict(feature_fn(patch))
                    n_proposals += 1

                if label != "background":
                    heatmap[y:y + win_h, x:x + win_w] = np.maximum(
                        heatmap[y:y + win_h, x:x + win_w], conf)
                    if conf >= conf_thresh:
                        detections.append((x, y, x + win_w, y + win_h,
                                           label, conf))

    elapsed_ms = (time.perf_counter() - t0) * 1000
    if detections:
        detections = nms(detections, nms_iou)
    return detections, n_proposals, elapsed_ms, heatmap
# ===================================================================
# 4. Contour Proposals
# ===================================================================
def contour_proposals(image, win_h, win_w, feature_fn, head,
                      conf_thresh, nms_iou,
                      canny_low=50, canny_high=150,
                      area_tolerance=3.0):
    """Score windows centred on contours found in the image's edge map.

    Pipeline: grayscale -> 5x5 Gaussian blur -> Canny -> 5x5 morphological
    closing -> external contours. A contour is kept when its
    ``cv2.contourArea`` lies between ``win_h * win_w / area_tolerance``
    and ``win_h * win_w * area_tolerance``. A ``win_h x win_w`` window is
    centred on the contour's bounding-box centre (clamped to the image)
    and scored with the head. Contours that map to an already-scored
    window position are skipped so no window is evaluated twice.

    Args:
        image: BGR image, or a single-channel image of shape (H, W) or
            (H, W, 1), which is used as the grayscale input directly.
        win_h, win_w: Window height and width in pixels.
        feature_fn: Callable mapping an image patch to features.
        head: Object whose ``predict(features)`` returns ``(label, conf)``.
        conf_thresh: Minimum confidence for a window to become a detection.
        nms_iou: IoU threshold passed to ``nms``.
        canny_low, canny_high: Hysteresis thresholds for ``cv2.Canny``.
        area_tolerance: Allowed ratio between contour area and window area.

    Returns:
        A **five**-element tuple
        ``(detections, n_proposals, elapsed_ms, heatmap, edges)``: the
        usual four values plus the closed edge map as the last element,
        for visualisation. *elapsed_ms* does not include NMS.
    """
    H, W = image.shape[:2]
    heatmap = np.zeros((H, W), dtype=np.float32)
    detections = []
    n_proposals = 0
    t0 = time.perf_counter()

    # COLOR_BGR2GRAY rejects single-channel input, so pass those through.
    if image.ndim == 2:
        gray = image
    elif image.shape[2] == 1:
        gray = np.ascontiguousarray(image[:, :, 0])
    else:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blurred, canny_low, canny_high)
    # Closing bridges small gaps so object outlines form closed contours.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    edges = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel)
    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_SIMPLE)

    target_area = win_h * win_w
    min_area = target_area / area_tolerance
    max_area = target_area * area_tolerance

    visited = set()  # window origins already scored
    for cnt in contours:
        area = cv2.contourArea(cnt)
        if area < min_area or area > max_area:
            continue

        bx, by, bw, bh = cv2.boundingRect(cnt)
        # Centre a window on the bounding-box centre, clamped to the image.
        cx, cy = bx + bw // 2, by + bh // 2
        px = max(0, min(cx - win_w // 2, W - win_w))
        py = max(0, min(cy - win_h // 2, H - win_h))

        # Clamping can send several contours to the same window.
        if (px, py) in visited:
            continue
        visited.add((px, py))

        patch = image[py:py + win_h, px:px + win_w]
        # A short patch means the window is larger than the image.
        if patch.shape[0] != win_h or patch.shape[1] != win_w:
            continue

        label, conf = head.predict(feature_fn(patch))
        n_proposals += 1
        if label != "background":
            heatmap[py:py + win_h, px:px + win_w] = np.maximum(
                heatmap[py:py + win_h, px:px + win_w], conf)
            if conf >= conf_thresh:
                detections.append((px, py, px + win_w, py + win_h,
                                   label, conf))

    elapsed_ms = (time.perf_counter() - t0) * 1000
    if detections:
        detections = nms(detections, nms_iou)
    return detections, n_proposals, elapsed_ms, heatmap, edges
# ===================================================================
# 5. Template Matching
# ===================================================================
def template_matching(image, template, conf_thresh, nms_iou,
                      method=cv2.TM_CCOEFF_NORMED):
    """Locate *template* in *image* with ``cv2.matchTemplate``.

    No trained head is involved: the score at each position is the raw
    template-matching response converted to a "higher is better" value
    in [0, 1]:

    * ``TM_CCOEFF_NORMED`` / ``TM_CCORR_NORMED``: response clipped to [0, 1].
    * ``TM_SQDIFF_NORMED``: ``1 - clip(response, 0, 1)``, because for the
      squared-difference methods the best match is the *minimum*.
    * other methods: min-max normalised over the response map, and
      additionally inverted for ``TM_SQDIFF``.

    Args:
        image: Search image; first two dimensions are (height, width).
        template: Patch to search for; first two dimensions are its size.
        conf_thresh: Minimum score for a position to become a detection.
        nms_iou: IoU threshold passed to ``nms``.
        method: One of OpenCV's ``TM_*`` template-matching modes.

    Returns:
        ``(detections, n_proposals, elapsed_ms, heatmap)``. Detections are
        ``(x1, y1, x2, y2, "object", score)`` tuples after NMS.
        *n_proposals* is the number of positions in the response map.
        *heatmap* is the score map resized to the image size for display.
        *elapsed_ms* does not include NMS. When the template is larger
        than the image, an empty result with an all-zero heatmap is
        returned.
    """
    H, W = image.shape[:2]
    th, tw = template.shape[:2]
    t0 = time.perf_counter()

    # A template that does not fit yields no positions, matching what the
    # sliding-window strategies return for an oversized window.
    if th > H or tw > W:
        elapsed_ms = (time.perf_counter() - t0) * 1000
        return [], 0, elapsed_ms, np.zeros((H, W), dtype=np.float32)

    result = cv2.matchTemplate(image, template, method)

    if method in (cv2.TM_CCOEFF_NORMED, cv2.TM_CCORR_NORMED):
        score_map = np.clip(result, 0, 1)
    elif method == cv2.TM_SQDIFF_NORMED:
        # Lower squared difference means a better match, so invert.
        score_map = 1.0 - np.clip(result, 0, 1)
    else:
        lo, hi = result.min(), result.max()
        score_map = (result - lo) / (hi - lo + 1e-6)
        if method == cv2.TM_SQDIFF:
            score_map = 1.0 - score_map
    score_map = score_map.astype(np.float32)

    # Full-size heatmap (resize for visualisation)
    heatmap = cv2.resize(score_map, (W, H), interpolation=cv2.INTER_LINEAR)

    # Each response position is the top-left corner of a candidate box.
    rows, cols = np.nonzero(score_map >= conf_thresh)
    detections = [
        (int(x), int(y), int(x + tw), int(y + th),
         "object", float(score_map[y, x]))
        for y, x in zip(rows, cols)
    ]

    n_proposals = score_map.shape[0] * score_map.shape[1]
    elapsed_ms = (time.perf_counter() - t0) * 1000
    if detections:
        detections = nms(detections, nms_iou)
    return detections, n_proposals, elapsed_ms, heatmap
# ===================================================================
# Registry — metadata used by the Streamlit page
# ===================================================================
# Maps a display name to the strategy's metadata:
#   icon        emoji shown next to the name
#   fn          the localization function defined above
#   needs_head  False when the strategy does not take a feature_fn/head
#               (template matching takes a template instead)
#   short       one-line summary
#   detail      longer Markdown/LaTeX description
STRATEGIES = {
    "Exhaustive Sliding Window": {
        "icon": "🔲",
        "fn": exhaustive_sliding_window,
        "needs_head": True,
        "short": "Brute-force grid scan at every stride position.",
        "detail": (
            "The simplest approach: a fixed-size window slides across the "
            "**entire image** at regular intervals. At every position the "
            "patch is extracted, features are computed, and the head classifies it.\n\n"
            "**Complexity:** $O\\!\\left(\\frac{W}{s} \\times \\frac{H}{s}\\right)$ "
            "where $s$ = stride.\n\n"
            "**Pro:** Guaranteed to evaluate every location — nothing is missed.\n\n"
            "**Con:** Extremely slow on large images or small strides."
        ),
    },
    "Image Pyramid": {
        "icon": "🔺",
        "fn": image_pyramid,
        "needs_head": True,
        "short": "Multi-scale resize + sliding window.",
        "detail": (
            "Builds an **image pyramid** by resizing the image to several "
            "scales (e.g. 50 %, 75 %, 100 %, 125 %, 150 %). A sliding-window "
            "scan runs at each level and detections are mapped back to original "
            "coordinates.\n\n"
            "**Why:** The training crop has a fixed size. If the real object "
            "appears larger or smaller in the scene, a single-scale scan will "
            "miss it. The pyramid handles **scale variation**.\n\n"
            "**Cost:** Multiplies the number of proposals by the number of "
            "scales — slower than single-scale exhaustive."
        ),
    },
    "Coarse-to-Fine": {
        "icon": "🎯",
        "fn": coarse_to_fine,
        "needs_head": True,
        "short": "Two-pass hierarchical refinement.",
        "detail": (
            "**Pass 1 — Coarse:** Scans the image with a large stride "
            "(coarse\\_factor × fine\\_stride) using a relaxed confidence "
            "threshold (70 % of the target) to cheaply identify *hot regions*.\n\n"
            "**Pass 2 — Fine:** Re-scans **only** the neighbourhood around "
            "each coarse hit at the fine stride, within *refine\\_radius* steps "
            "in each direction.\n\n"
            "**Speedup:** Typically **3–10×** faster than exhaustive when the "
            "object is spatially sparse (i.e. most of the image is background)."
        ),
    },
    "Contour Proposals": {
        "icon": "✏️",
        "fn": contour_proposals,
        "needs_head": True,
        "short": "Edge-driven candidate regions scored by head.",
        "detail": (
            "Instead of scanning everywhere, this method lets **image "
            "structure** drive the search:\n\n"
            "1. Canny edge detection\n"
            "2. Morphological closing to bridge nearby edges\n"
            "3. External contour extraction\n"
            "4. Filter contours whose area falls within *area\\_tolerance* "
            "of the window area\n"
            "5. Centre a window on each surviving contour and score with "
            "the trained head\n\n"
            "**Proposals evaluated:** Typically 10–100× fewer than exhaustive. "
            "Speed depends on scene complexity (more edges → more proposals)."
        ),
    },
    "Template Matching": {
        "icon": "📋",
        "fn": template_matching,
        "needs_head": False,
        "short": "OpenCV cross-correlation — no head needed.",
        "detail": (
            "Classical **normalised cross-correlation** (NCC). Slides the "
            "crop template over the image computing pixel-level similarity "
            "at every position. No trained head is involved.\n\n"
            "**Speed:** Runs entirely in OpenCV's optimised C++ backend — "
            "orders of magnitude faster than Python-level loops.\n\n"
            "**Limitation:** Not invariant to rotation, scale, or illumination "
            "changes. Works best when the object appears at the **exact same "
            "size and orientation** as the crop."
        ),
    },
}