Spaces:

1javid
/

cv_project_2

Sleeping

App Files Files Community

1javid committed on Apr 28

Commit

e77df61

verified ·

1 Parent(s): f21524b

Upload 2 files

Browse files

Files changed (2) hide show

depth_estimation.py +409 -0
object_distance.py +799 -0

depth_estimation.py ADDED Viewed

	@@ -0,0 +1,409 @@

+"""
+Subtask 1 – Depth Estimation
+  1. Classical method : SGBM Stereo Matching on a synthesised stereo pair
+  2. ML-based method  : Actual MiDaS (MiDaS_small) via torch.hub
+  3. Both rendered as heatmaps (hot colours = close, cold colours = far)
+Usage:
+    python depth_estimation.py <image_path> [output_dir]
+Example:
+    python depth_estimation.py street.jpg output/
+"""
+import sys
+import os
+import cv2
+import numpy as np
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+from scipy.ndimage import gaussian_filter
+import torch
+# ═══════════════════════════════════════════════════════════
+# 0.  LOAD IMAGE  (real image required)
+# ═══════════════════════════════════════════════════════════
def load_image(path: str) -> np.ndarray:
    """
    Read an image from disk with OpenCV, aborting the program on failure.
    The process exits through ``sys.exit`` with an explanatory message when
    *path* is empty, does not exist, or cannot be decoded by ``cv2.imread``.
    On success the image size and channel count are printed and the decoded
    array is returned.
    """
    path_is_usable = bool(path) and os.path.exists(path)
    if not path_is_usable:
        message = (
            f"ERROR: Image not found: '{path}'\n"
            "Usage: python depth_estimation.py <image_path>\n"
            "Example: python depth_estimation.py street.jpg"
        )
        sys.exit(message)

    image = cv2.imread(path)
    if image is None:
        # The path exists but OpenCV could not decode it.
        sys.exit(f"ERROR: Could not read image: '{path}'")

    height, width, channels = image.shape[0], image.shape[1], image.shape[2]
    print(f"Loaded: {path}  {width}x{height}  ({channels} channels)")
    return image
+# ═══════════════════════════════════════════════════════════
+# 1.  CLASSICAL METHOD – SGBM STEREO MATCHING
+# ═══════════════════════════════════════════════════════════
def synthesise_stereo_pair(
    img: np.ndarray,
    baseline_shift_pct: float = 0.03
) -> tuple:
    """
    Simulate a rectified stereo pair from a single BGR image.
    A per-pixel disparity seed is built from two monocular cues:
      - Focus sharpness  (smoothed Laplacian magnitude): sharp regions → close
      - Vertical position (row index scaled to [0, 1]):  lower in frame → close
    The two cues are averaged, smoothed and min-max normalised, then scaled
    to at most `int(width * baseline_shift_pct)` pixels.  The seed drives a
    horizontal warp of the input that produces the right view.
    Args:
        img                – BGR image (used unchanged as the left view)
        baseline_shift_pct – maximum disparity as a fraction of image width
    Returns:
        (left, right, max_shift)
        left      – copy of `img`
        right     – warped image, same shape as `img`
        max_shift – maximum seed disparity in pixels
    """
    h, w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Sharpness cue
    lap       = cv2.Laplacian(gray.astype(np.float32), cv2.CV_32F)
    sharpness = gaussian_filter(np.abs(lap), sigma=5)
    sharpness = sharpness / (sharpness.max() + 1e-6)

    # Vertical prior: 0 at the top row, 1 at the bottom row
    vert = np.linspace(0, 1, h)[:, None] * np.ones((h, w))

    # Combine and smooth
    closeness = 0.5 * sharpness + 0.5 * vert
    closeness = gaussian_filter(closeness.astype(np.float32), sigma=10)
    closeness = (closeness - closeness.min()) / (closeness.max() - closeness.min() + 1e-6)

    max_shift = int(w * baseline_shift_pct)
    disp_seed = (closeness * max_shift).astype(np.float32)

    # cv2.remap computes dst(x, y) = src(map_x(x, y), map_y(x, y)).
    # A block matcher with non-negative disparities expects a left-image
    # pixel at x to appear at x - d in the right image, i.e.
    # right(x) = left(x + d).  Sampling at x - d instead would move content
    # the wrong way and put every true disparity outside the search range.
    map_x = np.tile(np.arange(w, dtype=np.float32), (h, 1)) + disp_seed
    map_y = np.tile(np.arange(h, dtype=np.float32)[:, None], (1, w))
    right = cv2.remap(img, map_x, map_y, cv2.INTER_LINEAR,
                      borderMode=cv2.BORDER_REPLICATE)
    return img.copy(), right, max_shift
def sgbm_depth(
    img: np.ndarray,
    baseline_shift_pct: float = 0.03,
    block_size: int = 7,
    uniqueness_ratio: int = 10,
    speckle_window_size: int = 100,
    speckle_range: int = 2
) -> tuple:
    """
    Closeness map from OpenCV's semi-global block matcher (StereoSGBM),
    run on a stereo pair synthesised from `img`.
    OpenCV's StereoSGBM is a modified version of Hirschmüller's algorithm:
    it aggregates matching costs along several 1-D paths with smoothness
    penalties P1/P2, and uses a Birchfield-Tomasi sub-pixel cost rather than
    mutual information.  MODE_SGBM_3WAY is a reduced-path variant of it.
    Args:
        img                 – BGR image
        baseline_shift_pct  – forwarded to synthesise_stereo_pair
        block_size          – matching window; forced to an odd value >= 3
        uniqueness_ratio    – StereoSGBM uniquenessRatio
        speckle_window_size – StereoSGBM speckleWindowSize
        speckle_range       – StereoSGBM speckleRange
    Returns:
        depth_norm  – normalised closeness map  [0, 1],  1 = close
        left_img    – left  view of stereo pair
        right_img   – right view of stereo pair
    """
    left_img, right_img, max_shift = synthesise_stereo_pair(
        img, baseline_shift_pct=baseline_shift_pct
    )
    left_g  = cv2.cvtColor(left_img,  cv2.COLOR_BGR2GRAY)
    right_g = cv2.cvtColor(right_img, cv2.COLOR_BGR2GRAY)

    num_disp = max(16, ((max_shift // 16) + 1) * 16)   # must be multiple of 16
    block = max(3, int(block_size))
    if block % 2 == 0:
        block += 1

    # NOTE(review): OpenCV suggests P1/P2 = 8 / 32 * channels * blockSize**2.
    # The matcher is fed grayscale images, yet a factor of 3 is used here;
    # the values are kept as they were so results do not change.
    matcher = cv2.StereoSGBM_create(
        minDisparity      = 0,
        numDisparities    = num_disp,
        blockSize         = block,
        P1                = 8  * 3 * block ** 2,   # small-discontinuity penalty
        P2                = 32 * 3 * block ** 2,   # large-discontinuity penalty
        disp12MaxDiff     = 5,
        uniquenessRatio   = uniqueness_ratio,
        speckleWindowSize = speckle_window_size,
        speckleRange      = speckle_range,
        mode              = cv2.STEREO_SGBM_MODE_SGBM_3WAY
    )
    # compute() returns fixed-point disparities scaled by 16
    disp = matcher.compute(left_g, right_g).astype(np.float32) / 16.0
    disp = np.maximum(disp, 0)          # invalid matches are negative

    # Edge-preserving smoothing
    disp = cv2.bilateralFilter(disp, d=9, sigmaColor=75, sigmaSpace=75)

    # Normalise to [0, 1]: high disparity = close = 1
    d = (disp - disp.min()) / (disp.max() - disp.min() + 1e-6)
    d_8u = (d * 255).clip(0, 255).astype(np.uint8)

    # Guided-filter refinement needs the ximgproc module, which only ships
    # with opencv-contrib builds.  Skip the refinement when it is missing
    # instead of crashing with AttributeError.
    ximgproc = getattr(cv2, "ximgproc", None)
    if ximgproc is not None and hasattr(ximgproc, "guidedFilter"):
        d = ximgproc.guidedFilter(
                guide=left_g, src=d_8u, radius=8, eps=200, dDepth=cv2.CV_32F)
    else:
        print("[ Classical ] cv2.ximgproc not available – "
              "skipping guided-filter refinement")
        d = d_8u.astype(np.float32)
    d = np.clip(d / (d.max() + 1e-6), 0, 1)
    return d, left_img, right_img
+# ═══════════════════════════════════════════════════════════
+# 2.  ML-BASED METHOD – Actual MiDaS (MiDaS_small)
+# ═══════════════════════════════════════════════════════════
def load_midas(model_type: str = "MiDaS_small"):
    """
    Load a MiDaS model and its matching input transform from torch.hub
    (repository "intel-isl/MiDaS").
    Model names this loader is written for:
        "DPT_Large"    – DPT large
        "DPT_Hybrid"   – DPT hybrid
        "MiDaS"        – MiDaS v2.1 large
        "MiDaS_small"  – MiDaS v2.1 small  ← default
    torch.hub downloads the repository and weights on first use and caches
    them for later runs.
    Args:
        model_type – hub entry-point name of the model
    Returns:
        (model, transform, device)
        model     – network in eval mode, moved to `device`
        transform – callable that turns an RGB image into a model input batch
        device    – CUDA device when available, otherwise CPU
    """
    print(f"[ MiDaS ] Loading model '{model_type}' from torch.hub ...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"          Device: {device}")
    model      = torch.hub.load("intel-isl/MiDaS", model_type, trust_repo=True)
    model.to(device).eval()
    transforms = torch.hub.load("intel-isl/MiDaS", "transforms", trust_repo=True)

    # Follow the selection used in the PyTorch Hub MiDaS example: only the
    # DPT models take dpt_transform; every other model (including the plain
    # "MiDaS" network) takes small_transform.  Handing dpt_transform to a
    # non-DPT model applies the wrong input preprocessing.
    if model_type in ("DPT_Large", "DPT_Hybrid"):
        transform = transforms.dpt_transform
    else:
        transform = transforms.small_transform

    n_params = sum(p.numel() for p in model.parameters())
    print(f"          Model loaded  ({n_params:,} parameters)")
    return model, transform, device
def midas_depth(
    img:       np.ndarray,
    model,
    transform,
    device:    torch.device
) -> np.ndarray:
    """
    Run a MiDaS model on a BGR image and return a normalised closeness map.
    Steps:
        BGR image
          → RGB conversion
          → `transform` (produces the model input batch), moved to `device`
          → forward pass without gradient tracking
          → bilinear resize of the prediction to the input height/width
          → min-max normalisation to [0, 1]
    The prediction is treated as inverse relative depth, so larger output
    values mean closer surfaces.
    Returns:
        float32 array at the resolution of `img`, values in [0, 1], 1 = close
    """
    height, width = img.shape[:2]
    rgb_image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    batch = transform(rgb_image).to(device)

    with torch.no_grad():
        raw = model(batch)
        # interpolate expects a channel axis, hence the unsqueeze
        resized = torch.nn.functional.interpolate(
            raw.unsqueeze(1),
            size=(height, width),
            mode="bilinear",
            align_corners=False,
        )
        resized = resized.squeeze()

    inverse_depth = resized.cpu().numpy()
    lowest = inverse_depth.min()
    highest = inverse_depth.max()
    # The epsilon keeps a constant prediction from dividing by zero
    normalised = (inverse_depth - lowest) / (highest - lowest + 1e-6)
    return normalised.astype(np.float32)
+# ═══════════════════════════════════════════════════════════
+# 3.  VISUALISATION
+# ═══════════════════════════════════════════════════════════
def depth_to_heatmap(depth: np.ndarray) -> np.ndarray:
    """Colour a [0, 1] closeness map (1 = close) with the turbo colormap and return it as a uint8 BGR image."""
    turbo = plt.get_cmap("turbo")
    rgba = turbo(depth)                       # float RGBA in [0, 1]
    rgb_u8 = (rgba[:, :, :3] * 255).astype(np.uint8)
    return cv2.cvtColor(rgb_u8, cv2.COLOR_RGB2BGR)
def visualise_results(
    img:      np.ndarray,
    depth_cl: np.ndarray,
    depth_ml: np.ndarray,
    out_path: str = "output/depth_estimation_subtask1.png"
) -> None:
    """
    Render the comparison figure and save it to `out_path`.
    Layout (2 rows x 3 columns):
      Top row    – original image | SGBM heatmap | MiDaS heatmap
                   (heatmaps are blended over the image, carry a colour bar
                   and three horizontal scan-line markers)
      Bottom row – both methods along the middle scan line |
                   SGBM profiles at the three scan lines |
                   MiDaS profiles at the three scan lines
    Args:
        img      – BGR image
        depth_cl – classical closeness map in [0, 1], 1 = near; indexed with
                   the same rows/columns as `img`
        depth_ml – MiDaS closeness map, same convention as `depth_cl`
        out_path – output file; its parent directory is created if missing
    """
    h, w  = img.shape[:2]
    ncols = 3
    fig = plt.figure(figsize=(ncols * 5.6, 11), dpi=130)
    fig.patch.set_facecolor("#1a1a2e")

    titles = [
        "Original Image",
        "Classical Depth\n(SGBM Stereo Matching)",
        "ML-Based Depth\n(MiDaS_small — actual model)",
    ]
    depths = [None, depth_cl, depth_ml]
    ax_top = [fig.add_subplot(2, ncols, c + 1)         for c in range(ncols)]
    ax_bot = [fig.add_subplot(2, ncols, ncols + c + 1) for c in range(ncols)]

    # Loop-invariant work: convert once, look the colormap up once
    rgb   = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    turbo = plt.get_cmap("turbo")

    # ── Top row: images / heatmaps ──
    for ax, title, d in zip(ax_top, titles, depths):
        ax.set_title(title, color="white", fontsize=10, fontweight="bold", pad=8)
        ax.axis("off")
        if d is None:
            ax.imshow(rgb)
            continue
        cmap_arr = turbo(d)[:, :, :3]
        blended  = rgb.astype(np.float32) / 255 * 0.22 + cmap_arr * 0.78
        ax.imshow(blended)
        sm = plt.cm.ScalarMappable(cmap="turbo",
                                   norm=plt.Normalize(vmin=0, vmax=1))
        sm.set_array([])
        cb = fig.colorbar(sm, ax=ax, fraction=0.03, pad=0.02)
        # Ticks run 0 = Far … 1 = Near, so the label must read the same way
        cb.set_label("Closeness (Far -> Near)", color="white", fontsize=7)
        cb.set_ticks([0, 0.5, 1])
        cb.set_ticklabels(["Far", "Mid", "Near"], color="white", fontsize=7)
        cb.ax.yaxis.set_tick_params(color="white")

    # ── Scan lines on heatmap panels ──
    scan_ys     = [int(h * f) for f in [0.25, 0.50, 0.75]]
    scan_colors = ["#ff6b6b", "#ffd93d", "#6bcb77"]
    for ax in ax_top[1:]:
        for sy, sc in zip(scan_ys, scan_colors):
            ax.axhline(sy, color=sc, linewidth=1.2, alpha=0.75)

    # ── Bottom row: depth profile plots ──
    x            = np.arange(w)
    method_maps  = [depth_cl, depth_ml]
    method_names = ["Classical (SGBM)", "MiDaS (actual)"]
    ls           = ["-", "--"]
    for col, ax in enumerate(ax_bot):
        ax.set_facecolor("#16213e")
        for sp in ["top", "right"]:
            ax.spines[sp].set_visible(False)
        for sp in ["bottom", "left"]:
            ax.spines[sp].set_color("#555")
        ax.tick_params(colors="#888", labelsize=7)
        ax.set_xlim(0, w - 1)
        ax.set_ylim(-0.05, 1.05)
        ax.set_xlabel("Pixel x", color="#aaa", fontsize=8)
        ax.set_ylabel("Closeness  (1 = near)", color="#aaa", fontsize=8)
        if col == 0:
            # Compare both methods at the middle scan line
            ax.set_title("Method comparison — middle scan line",
                         color="white", fontsize=9, pad=6)
            sy = scan_ys[1]
            for mp, nm, l in zip(method_maps, method_names, ls):
                ax.plot(x, mp[sy, :], linestyle=l, linewidth=1.6, label=nm)
            ax.legend(fontsize=8, framealpha=0.25, labelcolor="white")
        else:
            # Per-method: three scan lines
            mp = method_maps[col - 1]
            nm = method_names[col - 1]
            ax.set_title(f"{nm} — scan-line profiles",
                         color="white", fontsize=9, pad=6)
            for sy, sc in zip(scan_ys, scan_colors):
                ax.plot(x, mp[sy, :], color=sc, linewidth=1.4,
                        label=f"y = {sy}")
            ax.legend(fontsize=7, framealpha=0.25, labelcolor="white")

    # ── Colour scale strip ──
    ax_s = fig.add_axes([0.05, 0.01, 0.90, 0.022])
    ax_s.imshow(np.linspace(0, 1, 512).reshape(1, -1),
                aspect="auto", cmap="turbo")
    ax_s.set_yticks([])
    ax_s.set_xticks([0, 170, 341, 511])
    ax_s.set_xticklabels(
        ["Far (cold / blue)", "Mid-far", "Mid-close", "Close (hot / red)"],
        color="white", fontsize=8
    )

    fig.suptitle(
        "Subtask 1 — Classical (SGBM) vs ML-Based (MiDaS) Depth Estimation\n"
        "Heatmap: red/hot = close    blue/cold = far",
        color="white", fontsize=13, fontweight="bold", y=1.003
    )
    fig.tight_layout(rect=[0, 0.05, 1, 1])

    # abspath makes dirname non-empty even for a bare file name
    os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)
    fig.savefig(out_path, dpi=130, bbox_inches="tight",
                facecolor=fig.get_facecolor())
    plt.close(fig)
    print(f"Saved -> {out_path}")
+# ═══════════════════════════════════════════════════════════
+# 4.  MAIN
+# ═══════════════════════════════════════════════════════════
def main() -> None:
    """
    Command-line entry point.
    Reads the image path (required) and output directory (optional, default
    "output") from sys.argv, runs both depth estimators, writes the two
    heatmaps and the synthetic stereo pair, and renders the comparison figure.
    """
    args = sys.argv[1:]
    if not args:
        sys.exit(
            "Usage: python depth_estimation.py <image_path> [output_dir]\n"
            "Example: python depth_estimation.py street.jpg output/"
        )
    image_path = args[0]
    out_dir = args[1] if len(args) > 1 else "output"

    # ── Load image ──
    img = load_image(image_path)

    # ── Classical: SGBM ──
    print("\n[ Classical ] Running SGBM stereo matching ...")
    depth_cl, left_img, right_img = sgbm_depth(img)
    print(f"              Done.  depth in [0,1]  mean={depth_cl.mean():.3f}")

    # ── ML: actual MiDaS ──
    print("\n[ MiDaS     ] Loading and running MiDaS_small ...")
    midas_model, midas_transform, device = load_midas("MiDaS_small")
    depth_ml = midas_depth(img, midas_model, midas_transform, device)
    print(f"              Done.  depth in [0,1]  mean={depth_ml.mean():.3f}")

    # ── Save outputs (written in this order) ──
    os.makedirs(out_dir, exist_ok=True)
    artefacts = {
        "classical_heatmap.png": depth_to_heatmap(depth_cl),
        "midas_heatmap.png":     depth_to_heatmap(depth_ml),
        "stereo_left.png":       left_img,
        "stereo_right.png":      right_img,
    }
    for file_name, image in artefacts.items():
        cv2.imwrite(os.path.join(out_dir, file_name), image)

    print("\n[ Visualise ] Compositing final figure ...")
    figure_path = os.path.join(out_dir, "depth_estimation_subtask1.png")
    visualise_results(img, depth_cl, depth_ml, out_path=figure_path)
    print(f"\nDone. Outputs written to: {out_dir}/")


if __name__ == "__main__":
    main()

object_distance.py ADDED Viewed

	@@ -0,0 +1,799 @@

+"""
+Subtask 2 – Object Detection + Distance Estimation
+  1. Detect objects with YOLOv5s (torch.hub)
+  2. Estimate metric distance (metres) per object using two complementary strategies:
+       A) Pinhole camera model  – uses known real-world object heights
+       B) MiDaS depth scaling  – calibrates MiDaS relative depth with pinhole anchors,
+                                  then applies the calibrated scale to all objects
+  3. Draw labelled bounding boxes on the image  ("person: 5.2 m")
+  4. Produce a combined figure: original detections | MiDaS depth | annotated result
+Usage:
+    python object_distance.py <image_path> [output_dir] [focal_length_px]
+Examples:
+    python object_distance.py street.jpg
+    python object_distance.py street.jpg output/ 800
+"""
+import sys
+import os
+import math
+import csv
+import json
+from typing import Optional, Tuple, List
+import cv2
+import numpy as np
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import torch
+# ── re-use MiDaS loader from Subtask 1 ──────────────────────
+sys.path.insert(0, os.path.dirname(__file__))
+from depth_estimation import load_image, load_midas, midas_depth, depth_to_heatmap
+# ═══════════════════════════════════════════════════════════
+# 1.  KNOWN OBJECT HEIGHTS  (metres)
+#     Used by the pinhole camera model.
+#     Values are representative averages for the COCO classes
+#     that appear most often in street / indoor scenes.
+# ═══════════════════════════════════════════════════════════
# Reference heights in metres, keyed by detector class label.
# A label missing from this table has no pinhole estimate.
KNOWN_HEIGHTS: dict[str, float] = {
    # -- people and animals --
    "person": 1.70,
    "cat": 0.30,
    "dog": 0.50,
    "horse": 1.60,
    "cow": 1.40,
    "sheep": 0.90,
    "elephant": 3.00,
    "bear": 1.20,
    "zebra": 1.40,
    "giraffe": 4.50,
    # -- vehicles --
    "bicycle": 1.00,
    "car": 1.50,
    "motorcycle": 1.10,
    "airplane": 4.00,
    "bus": 3.20,
    "train": 4.00,
    "truck": 3.50,
    "boat": 1.50,
    # -- street furniture --
    "traffic light": 0.90,
    "fire hydrant": 0.60,
    "stop sign": 0.75,
    "parking meter": 1.20,
    "bench": 0.90,
    # -- indoor objects --
    "chair": 0.90,
    "couch": 0.85,
    "bed": 0.55,
    "dining table": 0.75,
    "toilet": 0.40,
    "tv": 0.65,
    "laptop": 0.30,
    "microwave": 0.35,
    "oven": 0.90,
    "refrigerator": 1.80,
    "sink": 0.20,
    "door": 2.10,
    # -- handheld / small items --
    "bottle": 0.25,
    "cup": 0.12,
    "backpack": 0.50,
    "umbrella": 1.00,
    "handbag": 0.30,
    "suitcase": 0.65,
    "sports ball": 0.22,
    "baseball bat": 1.05,
    "skateboard": 0.15,
    "surfboard": 1.80,
    "tennis racket": 0.68,
    "book": 0.22,
    "clock": 0.30,
    "vase": 0.30,
    "scissors": 0.18,
}
+# Colour palette (BGR) – one per class, cycling if more classes appear
+_PALETTE = [
+    (0, 200, 255),   # yellow
+    (0, 255, 100),   # green
+    (255, 80,  80),  # blue
+    (180, 0,  255),  # magenta
+    (0,  160, 255),  # orange
+    (255, 200,  0),  # cyan
+    (100, 255, 200), # lime
+    (255,  50, 180), # pink
+]
+# ═══════════════════════════════════════════════════════════
+# 2.  FOCAL LENGTH ESTIMATION
+# ═══════════════════════════════════════════════════════════
def estimate_focal_length(image_width: int, fov_deg: float = 60.0) -> float:
    """
    Estimate the focal length in pixels from a known (or assumed) horizontal FOV.
        f = (image_width / 2) / tan(FOV / 2)
    The 60° default is only an assumption for when no camera metadata is
    available; a measured focal length should be preferred when one exists.
    Args:
        image_width – image width in pixels
        fov_deg     – horizontal field of view in degrees, 0 < fov_deg < 180
    Returns:
        focal length in pixels
    Raises:
        ValueError – if fov_deg is outside the open interval (0, 180); the
                     formula divides by zero at 0 and has no meaning at or
                     beyond 180
    """
    if not 0.0 < fov_deg < 180.0:
        raise ValueError(
            f"fov_deg must be between 0 and 180 degrees (exclusive), got {fov_deg}"
        )
    half_fov_rad = math.radians(fov_deg / 2.0)
    return (image_width / 2.0) / math.tan(half_fov_rad)
+# ═══════════════════════════════════════════════════════════
+# 3.  OBJECT DETECTION  (YOLOv5s via torch.hub)
+# ═══════════════════════════════════════════════════════════
def load_yolo(
    model_name: str = "yolov5s",
    conf_thresh: float = 0.35,
    iou_thresh: float = 0.45
):
    """
    Fetch a pretrained YOLOv5 model from torch.hub ("ultralytics/yolov5")
    and set its confidence and IoU thresholds.
    Model names, smallest first:
        yolov5n  – nano
        yolov5s  – small  ← default
        yolov5m  – medium
        yolov5l  – large
        yolov5x  – extra-large
    Args:
        model_name  – hub entry-point name
        conf_thresh – value assigned to model.conf
        iou_thresh  – value assigned to model.iou
    Returns:
        the loaded model
    """
    print(f"[ YOLO ] Loading {model_name} from torch.hub ...")
    detector = torch.hub.load(
        "ultralytics/yolov5",
        model_name,
        pretrained=True,
        trust_repo=True,
    )
    detector.conf, detector.iou = conf_thresh, iou_thresh
    print(f"         Loaded  ({model_name})")
    return detector
def run_yolo(
    model,
    img: np.ndarray,
    conf_thresh: float = 0.35
) -> list[dict]:
    """
    Detect objects in a BGR image with a loaded YOLOv5 model.
    The image is converted to RGB, passed through `model`, and the first
    result table is read.  Rows whose confidence is below `conf_thresh` are
    dropped; box coordinates are truncated to int.
    Returns:
        list of dicts with keys
        'label' (str), 'conf' (float), 'x1', 'y1', 'x2', 'y2' (int)
    """
    rgb_frame = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    table = model(rgb_frame).pandas().xyxy[0]    # one DataFrame per input image

    detections = [
        {
            "label": record["name"],
            "conf":  float(record["confidence"]),
            "x1":    int(record["xmin"]),
            "y1":    int(record["ymin"]),
            "x2":    int(record["xmax"]),
            "y2":    int(record["ymax"]),
        }
        for record in table.to_dict("records")
        if not record["confidence"] < conf_thresh
    ]
    print(f"         {len(detections)} object(s) detected")
    return detections
+# ═══════════════════════════════════════════════════════════
+# 4.  DISTANCE ESTIMATION
+# ═══════════════════════════════════════════════════════════
def pinhole_distance(
    pixel_height: int,
    real_height:  float,
    focal_length: float
) -> float:
    """
    Distance to an object from its apparent size, using the pinhole model.
    An object of real height H at distance D, seen by a camera with focal
    length f (pixels), has image height h = (H * f) / D, hence
        D = (H * f) / h
    Args:
        pixel_height – object height in the image (pixels)
        real_height  – object height in the world (metres)
        focal_length – focal length (pixels)
    Returns:
        distance in metres; infinity when pixel_height is zero or negative
    """
    return (
        float("inf")
        if pixel_height <= 0
        else (real_height * focal_length) / pixel_height
    )
def detection_depth_stat(
    depth_map: np.ndarray,
    det: dict,
    inner_ratio: float = 0.6
) -> float:
    """
    Median depth-map value inside a detection's bounding box.
    Only the central `inner_ratio` portion of the box is sampled, to reduce
    leakage from neighbouring objects and background near the box edges.
    If that central crop is empty the whole box is used instead.
    Args:
        depth_map   – 2-D array indexed [row, column]
        det         – detection dict with integer keys 'x1', 'y1', 'x2', 'y2'
        inner_ratio – fraction of the box kept per axis, clipped to [0.1, 1.0]
    Returns:
        median of the sampled region, or 0.0 when the box does not overlap
        the depth map at all
    """
    inner_ratio = float(np.clip(inner_ratio, 0.1, 1.0))
    rows, cols = depth_map.shape[:2]
    x1, y1, x2, y2 = det["x1"], det["y1"], det["x2"], det["y2"]
    dx = int(max(1, x2 - x1) * (1.0 - inner_ratio) / 2.0)
    dy = int(max(1, y2 - y1) * (1.0 - inner_ratio) / 2.0)

    def _crop(left: int, top: int, right: int, bottom: int) -> np.ndarray:
        # Clamp BOTH ends of each slice into the image.  A negative stop
        # index would otherwise count from the far edge and silently sample
        # an unrelated region for boxes that lie outside the image.
        left, right = max(0, min(cols, left)), max(0, min(cols, right))
        top, bottom = max(0, min(rows, top)), max(0, min(rows, bottom))
        return depth_map[top:bottom, left:right]

    roi = _crop(x1 + dx, y1 + dy, x2 - dx, y2 - dy)
    if roi.size == 0:
        roi = _crop(x1, y1, x2, y2)      # fall back to the full box
    if roi.size == 0:
        return 0.0
    return float(np.median(roi))
def midas_scale_calibration(
    detections:   list[dict],
    depth_map:    np.ndarray,
    focal_length: float,
    inner_ratio: float = 0.6,
    min_depth_value: float = 0.02
) -> Tuple[Optional[float], List[float]]:
    """
    Derive one scale factor that maps depth-map values to metres, using
    detections of known-height classes as anchors.
    The depth-map value d is modelled as inversely proportional to distance:
        D ≈ k / d   =>   k = D_pinhole * d
    One k is computed per usable anchor and the median is returned.
    A detection is a usable anchor when its label is in KNOWN_HEIGHTS, its
    box is taller than 5 pixels, and its depth statistic exceeds
    `min_depth_value`.
    Returns:
        (scale, k_values) – the median k and every per-anchor k;
        (None, []) when no anchor is usable
    """
    anchor_ks = []
    for candidate in detections:
        known_height = KNOWN_HEIGHTS.get(candidate["label"])
        if known_height is None:
            continue                      # class without a reference height
        box_height = candidate["y2"] - candidate["y1"]
        if box_height <= 5:
            continue                      # box too small to anchor on
        metric_distance = pinhole_distance(box_height, known_height, focal_length)
        relative_depth = detection_depth_stat(
            depth_map, candidate, inner_ratio=inner_ratio
        )
        if relative_depth > min_depth_value:   # skip near-zero (invalid) regions
            anchor_ks.append(metric_distance * relative_depth)

    if not anchor_ks:
        return None, []

    scale = float(np.median(anchor_ks))
    print(f"         MiDaS scale factor k = {scale:.2f}  "
          f"(from {len(anchor_ks)} anchor object(s))")
    return scale, anchor_ks
def estimate_distances(
    detections:   list[dict],
    depth_map:    np.ndarray,
    focal_length: float,
    inner_ratio: float = 0.6,
    min_depth_value: float = 0.02,
    blend_weight_pinhole: float = 0.55
) -> tuple[list[dict], dict]:
    """
    Annotate every detection, in place, with distance estimates in metres.
    Per detection:
      1. Pinhole estimate – when the class has a reference height and the
                            box is taller than 5 pixels.
      2. MiDaS estimate   – calibrated scale divided by the box's depth
                            statistic, when a scale exists and the statistic
                            exceeds `min_depth_value`.
      3. Final distance   – weighted average when both exist (pinhole weight
                            `blend_weight_pinhole`, clipped to [0, 1]);
                            otherwise whichever one exists; otherwise None.
    Keys written to each detection dict:
        pixel_height, known_height_m, bbox_depth_median,
        dist_pinhole, dist_midas, distance, method
    Returns:
        (detections, eval_context) – the same list object, plus a dict that
        records the calibration result and the parameters used
    """
    # ── Step 1: calibrate MiDaS scale ──
    midas_scale, anchor_scales = midas_scale_calibration(
        detections,
        depth_map,
        focal_length,
        inner_ratio=inner_ratio,
        min_depth_value=min_depth_value,
    )
    blend_weight_pinhole = float(np.clip(blend_weight_pinhole, 0.0, 1.0))
    weight_midas = 1.0 - blend_weight_pinhole

    # ── Step 2: per-detection estimates ──
    for item in detections:
        known_height = KNOWN_HEIGHTS.get(item["label"])
        box_height = item["y2"] - item["y1"]
        item["pixel_height"] = box_height
        item["known_height_m"] = known_height
        item["bbox_depth_median"] = detection_depth_stat(
            depth_map, item, inner_ratio=inner_ratio
        )

        # Pinhole branch
        pinhole_ok = known_height is not None and box_height > 5
        item["dist_pinhole"] = (
            pinhole_distance(box_height, known_height, focal_length)
            if pinhole_ok else None
        )

        # MiDaS branch
        relative_depth = item["bbox_depth_median"]
        midas_ok = midas_scale and relative_depth > min_depth_value
        item["dist_midas"] = midas_scale / relative_depth if midas_ok else None

        # Blend
        from_pinhole = item["dist_pinhole"]
        from_midas = item["dist_midas"]
        have_pinhole = from_pinhole is not None
        have_midas = from_midas is not None
        if have_pinhole and have_midas:
            item["distance"] = (
                blend_weight_pinhole * from_pinhole + weight_midas * from_midas
            )
            item["method"] = "pinhole + MiDaS"
        elif have_pinhole:
            item["distance"] = from_pinhole
            item["method"] = "pinhole"
        elif have_midas:
            item["distance"] = from_midas
            item["method"] = "MiDaS"
        else:
            item["distance"] = None
            item["method"] = "unknown"

    eval_context = {
        "midas_scale": midas_scale,
        "anchor_scales": anchor_scales,
        "depth_inner_ratio": inner_ratio,
        "min_depth_value": min_depth_value,
        "blend_weight_pinhole": blend_weight_pinhole,
    }
    return detections, eval_context
def compute_evaluation_metrics(
    detections: list[dict],
    focal_length: float,
    eval_context: dict
) -> dict:
    """
    Summarise a run as a flat dict of diagnostics.
    No ground-truth distances are involved: the numbers describe coverage
    (how many detections received each kind of estimate), the spread of the
    calibration anchors, and how closely the pinhole and MiDaS estimates
    agree on detections that have both.
    The distance, anchor and agreement groups are only added when there is
    at least one value to summarise.
    """
    def _present(key: str) -> np.ndarray:
        # float32 array of the non-None values stored under `key`
        return np.array(
            [d[key] for d in detections if d.get(key) is not None],
            dtype=np.float32,
        )

    def _count(predicate) -> int:
        return sum(predicate(d) for d in detections)

    total = len(detections)
    if detections:
        confs = np.array([d["conf"] for d in detections], dtype=np.float32)
    else:
        confs = np.array([])

    final_dists = _present("distance")
    pinhole_vals = _present("dist_pinhole")
    midas_vals = _present("dist_midas")
    both_branches = [
        d for d in detections
        if d.get("dist_pinhole") is not None and d.get("dist_midas") is not None
    ]
    anchor_scales = np.array(eval_context.get("anchor_scales", []), dtype=np.float32)

    known_height_count = _count(lambda d: d.get("known_height_m") is not None)
    metrics = {
        "focal_length_px": float(focal_length),
        "num_detections": total,
        "mean_confidence": float(confs.mean()) if confs.size else None,
        "known_height_count": known_height_count,
        "pinhole_count": int(pinhole_vals.size),
        "midas_count": int(midas_vals.size),
        "blended_count": _count(lambda d: d.get("method") == "pinhole + MiDaS"),
        "unresolved_count": _count(lambda d: d.get("distance") is None),
        "calibration_anchor_count": int(anchor_scales.size),
        "midas_scale_factor": eval_context.get("midas_scale"),
        "known_height_coverage": known_height_count / total if total else None,
        "distance_coverage": float(final_dists.size) / total if total else None,
    }

    if final_dists.size:
        metrics["final_distance_mean_m"] = float(final_dists.mean())
        metrics["final_distance_std_m"] = float(final_dists.std())
        metrics["final_distance_min_m"] = float(final_dists.min())
        metrics["final_distance_max_m"] = float(final_dists.max())

    if anchor_scales.size:
        spread = anchor_scales.std()
        metrics["anchor_scale_median"] = float(np.median(anchor_scales))
        metrics["anchor_scale_std"] = float(spread)
        metrics["anchor_scale_cv"] = float(spread / (anchor_scales.mean() + 1e-6))

    if both_branches:
        pinhole_arr = np.array([d["dist_pinhole"] for d in both_branches], dtype=np.float32)
        midas_arr = np.array([d["dist_midas"] for d in both_branches], dtype=np.float32)
        abs_err = np.abs(midas_arr - pinhole_arr)
        # Relative to the pinhole value; the floor avoids division by zero
        rel_err = abs_err / np.maximum(pinhole_arr, 1e-6)
        metrics["agreement_sample_count"] = int(len(both_branches))
        metrics["agreement_mae_m"] = float(abs_err.mean())
        metrics["agreement_rmse_m"] = float(np.sqrt(np.mean(abs_err ** 2)))
        metrics["agreement_mean_relative_error"] = float(rel_err.mean())
        metrics["agreement_median_relative_error"] = float(np.median(rel_err))
        metrics["agreement_within_10pct"] = float(np.mean(rel_err <= 0.10))
        metrics["agreement_within_20pct"] = float(np.mean(rel_err <= 0.20))

    return metrics
+def save_evaluation_outputs(
+    detections: list[dict],
+    metrics: dict,
+    eval_dir: str
+) -> None:
+    os.makedirs(eval_dir, exist_ok=True)
+    csv_path = os.path.join(eval_dir, "detection_distances.csv")
+    with open(csv_path, "w", newline="", encoding="utf-8") as f:
+        writer = csv.writer(f)
+        writer.writerow([
+            "label", "confidence", "pixel_height", "known_height_m",
+            "bbox_depth_median", "dist_pinhole_m", "dist_midas_m",
+            "final_distance_m", "method"
+        ])
+        for det in sorted(detections, key=lambda d: d["distance"] if d["distance"] else 999):
+            writer.writerow([
+                det["label"],
+                f"{det['conf']:.6f}",
+                det.get("pixel_height"),
+                "" if det.get("known_height_m") is None else f"{det['known_height_m']:.3f}",
+                f"{det.get('bbox_depth_median', 0.0):.6f}",
+                "" if det.get("dist_pinhole") is None else f"{det['dist_pinhole']:.6f}",
+                "" if det.get("dist_midas") is None else f"{det['dist_midas']:.6f}",
+                "" if det.get("distance") is None else f"{det['distance']:.6f}",
+                det.get("method", "unknown"),
+            ])
+    metrics_path = os.path.join(eval_dir, "metrics.json")
+    with open(metrics_path, "w", encoding="utf-8") as f:
+        json.dump(metrics, f, indent=2)
+    report_path = os.path.join(eval_dir, "evaluation_report.txt")
+    with open(report_path, "w", encoding="utf-8") as f:
+        f.write("Subtask 2 Evaluation Report\n")
+        f.write("===========================\n\n")
+        f.write("This report measures internal consistency only.\n")
+        f.write("No ground-truth object distances are available here, so these metrics\n")
+        f.write("should be interpreted as coverage / robustness diagnostics, not absolute accuracy.\n\n")
+        f.write("Key metrics\n")
+        f.write("-----------\n")
+        for key, value in metrics.items():
+            if value is None:
+                pretty = "N/A"
+            elif isinstance(value, float):
+                pretty = f"{value:.4f}"
+            else:
+                pretty = str(value)
+            f.write(f"{key}: {pretty}\n")
+        f.write("\nMetric sufficiency note\n")
+        f.write("----------------------\n")
+        f.write("- Enough for internal evaluation: yes.\n")
+        f.write("- Enough for accuracy claims: no.\n")
+        f.write("- To measure real accuracy, add ground-truth distances and report MAE/RMSE/MAPE against labels.\n")
+    print(f"         Saved -> {csv_path}")
+    print(f"         Saved -> {metrics_path}")
+    print(f"         Saved -> {report_path}")
+# ═══════════════════════════════════════════════════════════
+# 5.  DRAW ANNOTATED IMAGE
+# ═══════════════════════════════════════════════════════════
+def draw_detections(
+    img:        np.ndarray,
+    detections: list[dict]
+) -> np.ndarray:
+    """
+    Draw bounding boxes with labels on a copy of the image.
+    Label format:  "<class>: X.X m  (conf%)"
+    Each class gets a consistent colour from the palette.
+    """
+    out       = img.copy()
+    class_ids = {}   # map class name → colour index
+    for det in detections:
+        label  = det["label"]
+        dist   = det["distance"]
+        conf   = det["conf"]
+        x1, y1, x2, y2 = det["x1"], det["y1"], det["x2"], det["y2"]
+        # Assign colour
+        if label not in class_ids:
+            class_ids[label] = len(class_ids) % len(_PALETTE)
+        colour = _PALETTE[class_ids[label]]
+        # Box
+        thickness = max(2, int((x2 - x1 + y2 - y1) / 200))
+        cv2.rectangle(out, (x1, y1), (x2, y2), colour, thickness)
+        # Label text
+        if dist is not None:
+            text = f"{label}: {dist:.1f} m  ({conf:.0%})"
+        else:
+            text = f"{label}  ({conf:.0%})"
+        # Dynamic font scale based on box size
+        box_h      = max(1, y2 - y1)
+        font_scale = max(0.45, min(0.9, box_h / 180))
+        font_thick = max(1, int(font_scale * 2))
+        (tw, th), baseline = cv2.getTextSize(
+            text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thick)
+        # Background pill behind text
+        pad    = 5
+        tx     = max(0, x1)
+        ty_box = max(0, y1 - th - baseline - pad * 2)
+        cv2.rectangle(out,
+                      (tx, ty_box),
+                      (tx + tw + pad * 2, ty_box + th + baseline + pad * 2),
+                      colour, -1)
+        # Invert text colour for readability
+        lum       = 0.299 * colour[2] + 0.587 * colour[1] + 0.114 * colour[0]
+        txt_color = (0, 0, 0) if lum > 128 else (255, 255, 255)
+        cv2.putText(out, text,
+                    (tx + pad, ty_box + th + pad),
+                    cv2.FONT_HERSHEY_SIMPLEX, font_scale,
+                    txt_color, font_thick, cv2.LINE_AA)
+    return out
+# ═══════════════════════════════════════════════════════════
+# 6.  VISUALISATION  (combined figure)
+# ═══════════════════════════════════════════════════════════
+def visualise_results(
+    img:          np.ndarray,
+    depth_map:    np.ndarray,
+    detections:   list[dict],
+    annotated:    np.ndarray,
+    out_path:     str
+) -> None:
+    """
+    Three-panel figure:
+      1. Original image with raw YOLO boxes
+      2. MiDaS depth heatmap with boxes overlaid
+      3. Final annotated image with distance labels
+    """
+    fig, axes = plt.subplots(1, 3, figsize=(19, 7), dpi=130)
+    fig.patch.set_facecolor("#1a1a2e")
+    h, w = img.shape[:2]
+    # ── Panel 1: raw YOLO detections ──
+    raw_boxes = img.copy()
+    for det in detections:
+        x1, y1, x2, y2 = det["x1"], det["y1"], det["x2"], det["y2"]
+        cv2.rectangle(raw_boxes, (x1, y1), (x2, y2), (0, 255, 120), 2)
+        cv2.putText(raw_boxes, f"{det['label']} {det['conf']:.0%}",
+                    (x1, max(0, y1 - 6)),
+                    cv2.FONT_HERSHEY_SIMPLEX, 0.55, (0, 255, 120), 2, cv2.LINE_AA)
+    axes[0].imshow(cv2.cvtColor(raw_boxes, cv2.COLOR_BGR2RGB))
+    axes[0].set_title("YOLO Detections", color="white", fontsize=11,
+                       fontweight="bold", pad=10)
+    axes[0].axis("off")
+    # ── Panel 2: MiDaS depth + boxes ──
+    depth_bgr  = depth_to_heatmap(depth_map)
+    depth_over = depth_bgr.copy()
+    for det in detections:
+        x1, y1, x2, y2 = det["x1"], det["y1"], det["x2"], det["y2"]
+        cv2.rectangle(depth_over, (x1, y1), (x2, y2), (255, 255, 255), 2)
+        dist_txt = f"{det['distance']:.1f}m" if det["distance"] else "?"
+        cv2.putText(depth_over, dist_txt,
+                    (x1 + 3, y1 + 18),
+                    cv2.FONT_HERSHEY_SIMPLEX, 0.55, (255, 255, 255), 2, cv2.LINE_AA)
+    axes[1].imshow(cv2.cvtColor(depth_over, cv2.COLOR_BGR2RGB))
+    sm = plt.cm.ScalarMappable(cmap="turbo", norm=plt.Normalize(0, 1))
+    sm.set_array([])
+    cb = plt.colorbar(sm, ax=axes[1], fraction=0.035, pad=0.02)
+    cb.set_label("Near → Far", color="white", fontsize=8)
+    cb.set_ticks([0, 0.5, 1])
+    cb.set_ticklabels(["Far", "Mid", "Near"], color="white", fontsize=8)
+    cb.ax.yaxis.set_tick_params(color="white")
+    axes[1].set_title("MiDaS Depth  +  Distance Estimates",
+                       color="white", fontsize=11, fontweight="bold", pad=10)
+    axes[1].axis("off")
+    # ── Panel 3: final annotated image ──
+    axes[2].imshow(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))
+    axes[2].set_title("Object Distances  (pinhole + MiDaS blend)",
+                       color="white", fontsize=11, fontweight="bold", pad=10)
+    axes[2].axis("off")
+    # ── Distance table below ──
+    rows = []
+    for det in sorted(detections,
+                       key=lambda d: d["distance"] if d["distance"] else 999):
+        dist_str = f"{det['distance']:.2f} m" if det["distance"] is not None else "N/A"
+        ph_str   = (f"{det['dist_pinhole']:.2f} m"
+                    if det.get("dist_pinhole") is not None else "—")
+        md_str   = (f"{det['dist_midas']:.2f} m"
+                    if det.get("dist_midas") is not None else "—")
+        rows.append([det["label"], f"{det['conf']:.0%}",
+                     ph_str, md_str, dist_str, det["method"]])
+    if rows:
+        table_ax = fig.add_axes([0.05, -0.14, 0.90, 0.14])
+        table_ax.axis("off")
+        table_ax.set_facecolor("#1a1a2e")
+        col_labels = ["Object", "Confidence",
+                      "Pinhole est.", "MiDaS est.", "Final distance", "Method"]
+        tbl = table_ax.table(
+            cellText=rows,
+            colLabels=col_labels,
+            cellLoc="center", loc="center"
+        )
+        tbl.auto_set_font_size(False)
+        tbl.set_fontsize(8.5)
+        tbl.scale(1, 1.55)
+        # Style header
+        for j in range(len(col_labels)):
+            tbl[(0, j)].set_facecolor("#2e4057")
+            tbl[(0, j)].set_text_props(color="white", fontweight="bold")
+        # Alternating row shading
+        for i in range(1, len(rows) + 1):
+            bg = "#1e2d40" if i % 2 == 0 else "#16213e"
+            for j in range(len(col_labels)):
+                tbl[(i, j)].set_facecolor(bg)
+                tbl[(i, j)].set_text_props(color="#dde")
+    plt.suptitle(
+        "Subtask 2 — Object Detection & Distance Estimation\n"
+        "Distance = pinhole camera model  +  MiDaS depth scaling",
+        color="white", fontsize=13, fontweight="bold", y=1.02
+    )
+    plt.tight_layout()
+    os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)
+    plt.savefig(out_path, dpi=130, bbox_inches="tight",
+                facecolor=fig.get_facecolor())
+    plt.close(fig)
+    print(f"Saved -> {out_path}")
+# ═══════════════════════════════════════════════════════════
+# 7.  MAIN
+# ═══════════════════════════════════════════════════════════
+def main() -> None:
+    if len(sys.argv) < 2:
+        sys.exit(
+            "Usage: python object_distance.py <image_path> [output_dir] [focal_px]\n"
+            "Example: python object_distance.py street.jpg output/ 800"
+        )
+    image_path   = sys.argv[1]
+    out_dir      = sys.argv[2] if len(sys.argv) > 2 else "output"
+    focal_length = float(sys.argv[3]) if len(sys.argv) > 3 else None
+    image_dir    = os.path.join(out_dir, "images")
+    eval_dir     = os.path.join(out_dir, "evaluation")
+    # ── Load image ──
+    img = load_image(image_path)
+    h, w = img.shape[:2]
+    if focal_length is None:
+        focal_length = estimate_focal_length(w, fov_deg=60.0)
+        print(f"Focal length estimated: {focal_length:.1f} px  "
+              f"(assuming 60° horizontal FOV — override via 3rd argument)")
+    else:
+        print(f"Focal length (user-supplied): {focal_length:.1f} px")
+    # ── MiDaS depth ──
+    print("\n[ MiDaS ] Loading MiDaS_small ...")
+    midas_model, midas_transform, device = load_midas("MiDaS_small")
+    print("[ MiDaS ] Running inference ...")
+    depth_map = midas_depth(img, midas_model, midas_transform, device)
+    print(f"          Done.  depth in [0,1]  mean={depth_map.mean():.3f}")
+    # ── YOLO detection ──
+    print("\n[ YOLO  ] Loading YOLOv5s ...")
+    yolo_model = load_yolo("yolov5s")
+    print("[ YOLO  ] Running detection ...")
+    detections = run_yolo(yolo_model, img)
+    if not detections:
+        print("WARNING: No objects detected.  "
+              "Try a lower confidence threshold or a different image.")
+        sys.exit(0)
+    # ── Distance estimation ──
+    print("\n[ Dist  ] Estimating distances ...")
+    detections, eval_context = estimate_distances(detections, depth_map, focal_length)
+    metrics = compute_evaluation_metrics(detections, focal_length, eval_context)
+    # Print summary table
+    print(f"\n  {'Object':<18} {'Conf':>5}  {'Pinhole':>10}  "
+          f"{'MiDaS':>10}  {'Final':>10}  Method")
+    print("  " + "-" * 70)
+    for det in sorted(detections,
+                       key=lambda d: d["distance"] if d["distance"] else 999):
+        dp = f"{det['dist_pinhole']:.1f} m" if det.get("dist_pinhole") else "    —"
+        dm = f"{det['dist_midas']:.1f} m"   if det.get("dist_midas")   else "    —"
+        df = f"{det['distance']:.1f} m"     if det.get("distance")     else "    —"
+        print(f"  {det['label']:<18} {det['conf']:>4.0%}  "
+              f"{dp:>10}  {dm:>10}  {df:>10}  {det['method']}")
+    # ── Draw and save ──
+    print("\n[ Draw  ] Annotating image ...")
+    annotated = draw_detections(img, detections)
+    os.makedirs(image_dir, exist_ok=True)
+    os.makedirs(eval_dir, exist_ok=True)
+    annotated_path = os.path.join(image_dir, "detections_with_distance.png")
+    cv2.imwrite(annotated_path, annotated)
+    cv2.imwrite(os.path.join(image_dir, "midas_depth.png"),
+                depth_to_heatmap(depth_map))
+    print(f"         Saved -> {annotated_path}")
+    print("\n[ Fig   ] Compositing combined figure ...")
+    visualise_results(
+        img, depth_map, detections, annotated,
+        out_path=os.path.join(image_dir, "object_distance_subtask2.png")
+    )
+    print("\n[ Eval  ] Writing evaluation artifacts ...")
+    save_evaluation_outputs(detections, metrics, eval_dir)
+    print(f"\nDone.  Image outputs: {image_dir}/")
+    print(f"Done.  Evaluation outputs: {eval_dir}/")
+# Run the pipeline only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()