Spaces:
Sleeping
Sleeping
"""
Subtask 2 — Object Detection + Distance Estimation

1. Detect objects with YOLOv5s (torch.hub)
2. Estimate metric distance (metres) per object using two complementary strategies:
   A) Pinhole camera model — uses known real-world object heights
   B) MiDaS depth scaling — calibrates MiDaS relative depth with pinhole anchors,
      then applies the calibrated scale to all objects
3. Draw labelled bounding boxes on the image ("person: 5.2 m")
4. Produce a combined figure: original detections | MiDaS depth | annotated result

Usage:
    python object_distance.py <image_path> [output_dir] [focal_length_px]

Examples:
    python object_distance.py street.jpg
    python object_distance.py street.jpg output/ 800
"""
| import sys | |
| import os | |
| import math | |
| import csv | |
| import json | |
| from typing import Optional, Tuple, List | |
| import cv2 | |
| import numpy as np | |
| import matplotlib | |
| matplotlib.use("Agg") | |
| import matplotlib.pyplot as plt | |
| import torch | |
# ── re-use MiDaS loader from Subtask 1 ──────────────────────
| sys.path.insert(0, os.path.dirname(__file__)) | |
| from depth_estimation import load_image, load_midas, midas_depth, depth_to_heatmap | |
# ───────────────────────────────────────────────────────────
# 1. KNOWN OBJECT HEIGHTS (metres)
#    Used by the pinhole camera model.
#    Values are representative averages for the COCO classes
#    that appear most often in street / indoor scenes.
# ───────────────────────────────────────────────────────────
# Reference real-world heights in metres, keyed by detector class label.
# Only labels present here can act as pinhole-model anchors; any other label
# has no pinhole estimate.
KNOWN_HEIGHTS: dict[str, float] = {
    # People & animals
    "person": 1.70,
    "cat": 0.30,
    "dog": 0.50,
    "horse": 1.60,
    "cow": 1.40,
    "sheep": 0.90,
    "elephant": 3.00,
    "bear": 1.20,
    "zebra": 1.40,
    "giraffe": 4.50,
    # Vehicles
    "bicycle": 1.00,
    "car": 1.50,
    "motorcycle": 1.10,
    "airplane": 4.00,
    "bus": 3.20,
    "train": 4.00,
    "truck": 3.50,
    "boat": 1.50,
    # Street furniture
    "traffic light": 0.90,
    "fire hydrant": 0.60,
    "stop sign": 0.75,
    "parking meter": 1.20,
    "bench": 0.90,
    # Indoor objects
    "chair": 0.90,
    "couch": 0.85,
    "bed": 0.55,
    "dining table": 0.75,
    "toilet": 0.40,
    "tv": 0.65,
    "laptop": 0.30,
    "microwave": 0.35,
    "oven": 0.90,
    "refrigerator": 1.80,
    "sink": 0.20,
    # NOTE(review): "door" does not look like a COCO class name, so the stock
    # YOLOv5 weights would presumably never emit it -- confirm before relying on it.
    "door": 2.10,
    # Handheld / small
    "bottle": 0.25,
    "cup": 0.12,
    "backpack": 0.50,
    "umbrella": 1.00,
    "handbag": 0.30,
    "suitcase": 0.65,
    "sports ball": 0.22,
    "baseball bat": 1.05,
    "skateboard": 0.15,
    "surfboard": 1.80,
    "tennis racket": 0.68,
    "book": 0.22,
    "clock": 0.30,
    "vase": 0.30,
    "scissors": 0.18,
}
| # Colour palette (BGR) β one per class, cycling if more classes appear | |
| _PALETTE = [ | |
| (0, 200, 255), # yellow | |
| (0, 255, 100), # green | |
| (255, 80, 80), # blue | |
| (180, 0, 255), # magenta | |
| (0, 160, 255), # orange | |
| (255, 200, 0), # cyan | |
| (100, 255, 200), # lime | |
| (255, 50, 180), # pink | |
| ] | |
# ───────────────────────────────────────────────────────────
# 2. FOCAL LENGTH ESTIMATION
# ───────────────────────────────────────────────────────────
def estimate_focal_length(image_width: int, fov_deg: float = 60.0) -> float:
    """
    Estimate the focal length in pixels from a known (or assumed) horizontal FOV.

        f = (image_width / 2) / tan(FOV / 2)

    The 60° default is only an assumption about the camera. When a measured
    focal length is available it should be used instead of this estimate (the
    command-line entry point in this file accepts it as the third positional
    argument).

    Args:
        image_width: Image width in pixels; must be positive.
        fov_deg: Horizontal field of view in degrees, strictly between 0 and 180.

    Returns:
        The focal length in pixels (always positive and finite).

    Raises:
        ValueError: If ``image_width`` is not positive or ``fov_deg`` lies
            outside the open interval (0, 180).
    """
    if image_width <= 0:
        raise ValueError(f"image_width must be positive, got {image_width!r}")
    # tan(FOV / 2) is zero at 0 degrees and unbounded at 180 degrees; outside
    # the open interval the formula yields a division by zero or a negative /
    # near-zero focal length instead of a usable estimate. NaN also fails here.
    if not 0.0 < fov_deg < 180.0:
        raise ValueError(
            f"fov_deg must be strictly between 0 and 180 degrees, got {fov_deg!r}"
        )
    return (image_width / 2.0) / math.tan(math.radians(fov_deg / 2.0))
# ───────────────────────────────────────────────────────────
# 3. OBJECT DETECTION (YOLOv5s via torch.hub)
# ───────────────────────────────────────────────────────────
def load_yolo(
    model_name: str = "yolov5s",
    conf_thresh: float = 0.35,
    iou_thresh: float = 0.45
):
    """
    Fetch a pretrained YOLOv5 model through torch.hub and set its thresholds.

    Model variants, listed from fastest / least accurate to slowest / most
    accurate:
        yolov5n — nano
        yolov5s — small (default, good balance)
        yolov5m — medium
        yolov5l — large
        yolov5x — extra-large

    Args:
        model_name: torch.hub entry point name in the "ultralytics/yolov5" repo.
        conf_thresh: Value stored on the model's ``conf`` attribute.
        iou_thresh: Value stored on the model's ``iou`` attribute.

    Returns:
        The loaded model object with ``conf`` and ``iou`` set.
    """
    print(f"[ YOLO ] Loading {model_name} from torch.hub ...")
    hub_options = {"pretrained": True, "trust_repo": True}
    detector = torch.hub.load("ultralytics/yolov5", model_name, **hub_options)

    # Thresholds live on the model object and apply to subsequent calls.
    detector.conf = conf_thresh
    detector.iou = iou_thresh

    print(f" Loaded ({model_name})")
    return detector
def run_yolo(
    model,
    img: np.ndarray,
    conf_thresh: float = 0.35
) -> list[dict]:
    """
    Detect objects in a BGR image with a loaded YOLOv5 model.

    The image is converted to RGB before inference, and rows of the result
    table whose confidence is below ``conf_thresh`` are dropped.

    Args:
        model: Callable YOLOv5 model whose result exposes ``.pandas().xyxy``.
        img: Image in BGR channel order.
        conf_thresh: Minimum confidence for a detection to be kept.

    Returns:
        One dict per kept detection:
            { 'label': str, 'conf': float,
              'x1': int, 'y1': int, 'x2': int, 'y2': int }
        Box coordinates are truncated to integers.
    """
    rgb_frame = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    table = model(rgb_frame).pandas().xyxy[0]  # first (only) image's DataFrame

    detections = [
        {
            "label": rec["name"],
            "conf": float(rec["confidence"]),
            "x1": int(rec["xmin"]),
            "y1": int(rec["ymin"]),
            "x2": int(rec["xmax"]),
            "y2": int(rec["ymax"]),
        }
        for _, rec in table.iterrows()
        if not rec["confidence"] < conf_thresh
    ]

    print(f" {len(detections)} object(s) detected")
    return detections
# ───────────────────────────────────────────────────────────
# 4. DISTANCE ESTIMATION
# ───────────────────────────────────────────────────────────
def pinhole_distance(
    pixel_height: int,
    real_height: float,
    focal_length: float
) -> float:
    """
    Distance to an object under the pinhole camera model.

    An object of real height H at distance D, seen by a camera with focal
    length f (in pixels), projects to a pixel height h = (H * f) / D, hence

        D = (H * f) / h

    Args:
        pixel_height: Height of the object in the image, in pixels.
        real_height: Real-world height of the object.
        focal_length: Camera focal length in pixels.

    Returns:
        The distance in the same unit as ``real_height``; infinity when
        ``pixel_height`` is zero or negative.
    """
    return (
        float("inf")
        if pixel_height <= 0
        else (real_height * focal_length) / pixel_height
    )
def detection_depth_stat(
    depth_map: np.ndarray,
    det: dict,
    inner_ratio: float = 0.6
) -> float:
    """
    Robust per-detection depth statistic: the median of the box's centre region.

    Only the central ``inner_ratio`` fraction of the bounding box (per axis) is
    sampled, to reduce leakage from neighbouring objects and background near
    the box edges. If that centre region is empty, the whole box (clipped to
    the map) is used instead.

    Args:
        depth_map: 2-D depth array indexed as [row, column].
        det: Detection dict with integer pixel coordinates under the keys
            'x1', 'y1', 'x2', 'y2'.
        inner_ratio: Fraction of the box width/height to keep; clipped to
            the range [0.1, 1.0].

    Returns:
        The median depth value inside the sampled region, or 0.0 when the box
        does not overlap the depth map at all.
    """
    inner_ratio = float(np.clip(inner_ratio, 0.1, 1.0))
    map_h, map_w = depth_map.shape[0], depth_map.shape[1]
    x1, y1, x2, y2 = det["x1"], det["y1"], det["x2"], det["y2"]

    def _clamp(value, upper):
        # Both ends of every slice bound must be clamped: a negative bound
        # would otherwise be read as "count from the end" and silently sample
        # an unrelated part of the map.
        return min(max(value, 0), upper)

    box_w = max(1, x2 - x1)
    box_h = max(1, y2 - y1)
    # Margin trimmed from each side so the kept window is centred in the box.
    dx = int(box_w * (1.0 - inner_ratio) / 2.0)
    dy = int(box_h * (1.0 - inner_ratio) / 2.0)

    roi = depth_map[
        _clamp(y1 + dy, map_h):_clamp(y2 - dy, map_h),
        _clamp(x1 + dx, map_w):_clamp(x2 - dx, map_w),
    ]
    if roi.size == 0:
        # Centre window collapsed (tiny or partly off-image box): use the full box.
        roi = depth_map[
            _clamp(y1, map_h):_clamp(y2, map_h),
            _clamp(x1, map_w):_clamp(x2, map_w),
        ]
    if roi.size == 0:
        return 0.0
    return float(np.median(roi))
def midas_scale_calibration(
    detections: list[dict],
    depth_map: np.ndarray,
    focal_length: float,
    inner_ratio: float = 0.6,
    min_depth_value: float = 0.02
) -> Tuple[Optional[float], List[float]]:
    """
    Derive a scale factor that turns MiDaS depth values into metres.

    Detections whose class has a known real-world height act as anchors.
    The depth map is treated as inverse relative depth (d ∝ 1/D, with D the
    distance), so for each anchor

        D_pinhole ≈ k / d_midas   =>   k = D_pinhole * d_midas

    and the median of all per-anchor k values is used as the single scale.
    NOTE(review): assumes depth_map holds normalised inverse depth in (0, 1] —
    confirm against the MiDaS loader that produces it.

    Args:
        detections: Detection dicts with 'label', 'x1', 'y1', 'x2', 'y2'.
        depth_map: Depth array sampled per detection.
        focal_length: Camera focal length in pixels.
        inner_ratio: Centre fraction of each box used for the depth statistic.
        min_depth_value: Anchors whose depth statistic is not above this value
            are skipped as near-zero (invalid) regions.

    Returns:
        ``(scale, per_anchor_scales)``; ``(None, [])`` when no anchor qualifies.
    """
    anchor_ks = []
    for det in detections:
        reference_height = KNOWN_HEIGHTS.get(det["label"])
        if reference_height is None:
            continue  # class without a reference height cannot anchor the scale

        box_height = det["y2"] - det["y1"]
        if box_height <= 5:
            continue  # boxes this small are not used as anchors

        metres_pinhole = pinhole_distance(box_height, reference_height, focal_length)
        inverse_depth = detection_depth_stat(depth_map, det, inner_ratio=inner_ratio)
        if inverse_depth > min_depth_value:
            anchor_ks.append(metres_pinhole * inverse_depth)

    if len(anchor_ks) == 0:
        return None, []

    scale = float(np.median(anchor_ks))
    print(f" MiDaS scale factor k = {scale:.2f} "
          f"(from {len(anchor_ks)} anchor object(s))")
    return scale, anchor_ks
def estimate_distances(
    detections: list[dict],
    depth_map: np.ndarray,
    focal_length: float,
    inner_ratio: float = 0.6,
    min_depth_value: float = 0.02,
    blend_weight_pinhole: float = 0.55
) -> tuple[list[dict], dict]:
    """
    Attach a metric distance estimate to every detection (in place).

    Strategy:
      1. Pinhole model — used when the class has a known reference height
         and the box is taller than 5 px.
      2. MiDaS scaling — after calibration with pinhole anchors, applied to
         every object whose depth statistic exceeds ``min_depth_value``
         (including classes without a known height).
      3. Final distance — weighted average when both estimates exist,
         otherwise whichever single estimate is available.

    Keys added to each detection dict:
      pixel_height      — box height in pixels
      known_height_m    — reference height for the class, or None
      bbox_depth_median — depth statistic sampled from the box centre
      dist_pinhole      — metres from the pinhole model (None if unavailable)
      dist_midas        — metres from MiDaS scaling (None if unavailable)
      distance          — final estimate in metres (None if neither exists)
      method            — which strategy produced ``distance``

    Args:
        detections: Detection dicts; mutated and also returned.
        depth_map: MiDaS depth array.
        focal_length: Camera focal length in pixels.
        inner_ratio: Centre fraction of each box used for the depth statistic.
        min_depth_value: Depth statistics at or below this are treated as invalid.
        blend_weight_pinhole: Weight of the pinhole estimate in the blend,
            clipped to [0, 1]; the MiDaS estimate gets the remainder.

    Returns:
        ``(detections, eval_context)`` where ``eval_context`` records the
        calibration result and the parameters used.
    """
    # Calibrate the MiDaS scale first; it is shared by all detections.
    midas_scale, anchor_scales = midas_scale_calibration(
        detections,
        depth_map,
        focal_length,
        inner_ratio=inner_ratio,
        min_depth_value=min_depth_value,
    )

    blend_weight_pinhole = float(np.clip(blend_weight_pinhole, 0.0, 1.0))
    weight_midas = 1.0 - blend_weight_pinhole

    for det in detections:
        known_h = KNOWN_HEIGHTS.get(det["label"])
        box_h = det["y2"] - det["y1"]

        det["pixel_height"] = box_h
        det["known_height_m"] = known_h
        depth_stat = detection_depth_stat(depth_map, det, inner_ratio=inner_ratio)
        det["bbox_depth_median"] = depth_stat

        # Pinhole branch
        via_pinhole = None
        if known_h is not None and box_h > 5:
            via_pinhole = pinhole_distance(box_h, known_h, focal_length)
        det["dist_pinhole"] = via_pinhole

        # MiDaS branch
        via_midas = None
        if midas_scale and depth_stat > min_depth_value:
            via_midas = midas_scale / depth_stat
        det["dist_midas"] = via_midas

        # Combine: blend when both exist, else fall back to the one available.
        if via_pinhole is None and via_midas is None:
            final, how = None, "unknown"
        elif via_midas is None:
            final, how = via_pinhole, "pinhole"
        elif via_pinhole is None:
            final, how = via_midas, "MiDaS"
        else:
            final = blend_weight_pinhole * via_pinhole + weight_midas * via_midas
            how = "pinhole + MiDaS"
        det["distance"] = final
        det["method"] = how

    eval_context = {
        "midas_scale": midas_scale,
        "anchor_scales": anchor_scales,
        "depth_inner_ratio": inner_ratio,
        "min_depth_value": min_depth_value,
        "blend_weight_pinhole": blend_weight_pinhole,
    }
    return detections, eval_context
def compute_evaluation_metrics(
    detections: list[dict],
    focal_length: float,
    eval_context: dict
) -> dict:
    """
    Summarise one run as internal-consistency metrics.

    No ground-truth distances exist in this pipeline, so the metrics describe
    coverage, spread of the calibration anchors, and how well the pinhole and
    MiDaS branches agree with each other — not absolute accuracy.

    Args:
        detections: Detection dicts carrying 'conf' and, optionally,
            'known_height_m', 'dist_pinhole', 'dist_midas', 'distance', 'method'.
        focal_length: Focal length in pixels, echoed into the result.
        eval_context: Calibration context; 'anchor_scales' and 'midas_scale'
            are read when present.

    Returns:
        A dict of metrics. The 'final_distance_*', 'anchor_scale_*' and
        'agreement_*' groups are only present when the corresponding data
        (resolved distances, anchors, detections with both estimates) exists.
    """
    def present(key):
        """Values of `key` across detections, skipping missing / None."""
        return [det[key] for det in detections if det.get(key) is not None]

    n_total = len(detections)
    conf_arr = np.array([det["conf"] for det in detections], dtype=np.float32)
    resolved = np.array(present("distance"), dtype=np.float32)
    pinhole_all = np.array(present("dist_pinhole"), dtype=np.float32)
    midas_all = np.array(present("dist_midas"), dtype=np.float32)
    anchors = np.array(eval_context.get("anchor_scales", []), dtype=np.float32)

    # Detections for which both branches produced an estimate.
    both = [
        (det["dist_pinhole"], det["dist_midas"])
        for det in detections
        if det.get("dist_pinhole") is not None and det.get("dist_midas") is not None
    ]

    n_known = sum(det.get("known_height_m") is not None for det in detections)

    metrics = {
        "focal_length_px": float(focal_length),
        "num_detections": n_total,
        "mean_confidence": float(conf_arr.mean()) if conf_arr.size else None,
        "known_height_count": n_known,
        "pinhole_count": int(pinhole_all.size),
        "midas_count": int(midas_all.size),
        "blended_count": sum(
            det.get("method") == "pinhole + MiDaS" for det in detections
        ),
        "unresolved_count": sum(det.get("distance") is None for det in detections),
        "calibration_anchor_count": int(anchors.size),
        "midas_scale_factor": eval_context.get("midas_scale"),
    }

    # Coverage ratios are undefined without detections.
    metrics["known_height_coverage"] = n_known / n_total if n_total else None
    metrics["distance_coverage"] = float(resolved.size) / n_total if n_total else None

    if resolved.size:
        metrics["final_distance_mean_m"] = float(resolved.mean())
        metrics["final_distance_std_m"] = float(resolved.std())
        metrics["final_distance_min_m"] = float(resolved.min())
        metrics["final_distance_max_m"] = float(resolved.max())

    if anchors.size:
        metrics["anchor_scale_median"] = float(np.median(anchors))
        metrics["anchor_scale_std"] = float(anchors.std())
        # Coefficient of variation; the epsilon guards a zero mean.
        metrics["anchor_scale_cv"] = float(anchors.std() / (anchors.mean() + 1e-6))

    if both:
        pinhole_arr = np.array([p for p, _ in both], dtype=np.float32)
        midas_arr = np.array([m for _, m in both], dtype=np.float32)
        abs_err = np.abs(midas_arr - pinhole_arr)
        # Relative to the pinhole estimate, floored to avoid division by zero.
        rel_err = abs_err / np.maximum(pinhole_arr, 1e-6)

        metrics["agreement_sample_count"] = int(len(both))
        metrics["agreement_mae_m"] = float(abs_err.mean())
        metrics["agreement_rmse_m"] = float(np.sqrt(np.mean(abs_err ** 2)))
        metrics["agreement_mean_relative_error"] = float(rel_err.mean())
        metrics["agreement_median_relative_error"] = float(np.median(rel_err))
        metrics["agreement_within_10pct"] = float(np.mean(rel_err <= 0.10))
        metrics["agreement_within_20pct"] = float(np.mean(rel_err <= 0.20))

    return metrics
def metrics_table(metrics: dict) -> list[list[str]]:
    """
    Pick the headline metrics out of the full dict as printable rows.

    Args:
        metrics: Metrics dict; any missing key is rendered as "N/A".

    Returns:
        Rows of ``[metric_name, value]`` in a fixed order. Floats are shown
        with four decimals, ``None`` as "N/A", everything else via ``str``.
    """
    selected = (
        # Coverage
        "num_detections",
        "mean_confidence",
        "known_height_coverage",
        "distance_coverage",
        "unresolved_count",
        # Calibration
        "calibration_anchor_count",
        "midas_scale_factor",
        "anchor_scale_cv",
        # Agreement (if available)
        "agreement_sample_count",
        "agreement_mae_m",
        "agreement_rmse_m",
        "agreement_mean_relative_error",
        "agreement_within_20pct",
    )

    def render(value):
        if value is None:
            return "N/A"
        return f"{value:.4f}" if isinstance(value, float) else str(value)

    return [[name, render(metrics.get(name))] for name in selected]
def save_evaluation_outputs(
    detections: list[dict],
    metrics: dict,
    eval_dir: str
) -> None:
    """
    Write the evaluation artifacts for one run into ``eval_dir``.

    Three files are produced (the directory is created if needed):
      detection_distances.csv — one row per detection, nearest first,
                                unresolved detections last
      metrics_table.csv       — the headline metrics as (metric, value) rows
      evaluation_report.txt   — the same metrics plus interpretation notes

    Args:
        detections: Detection dicts with 'label' and 'conf', plus the optional
            distance fields; missing or None values are written as empty cells.
        metrics: Metrics dict passed to ``metrics_table``.
        eval_dir: Output directory.
    """
    def _nearest_first(det):
        # Order by distance with unresolved (None) detections last. Testing
        # "is None" rather than truthiness keeps a legitimate 0.0 m estimate
        # at the front, and avoids a numeric sentinel that real distances
        # could exceed.
        dist = det.get("distance")
        return (dist is None, 0.0 if dist is None else dist)

    def _cell(value, spec):
        # Empty cell for a missing estimate, fixed-precision text otherwise.
        return "" if value is None else format(value, spec)

    os.makedirs(eval_dir, exist_ok=True)
    table_rows = metrics_table(metrics)  # shared by the CSV and the report

    csv_path = os.path.join(eval_dir, "detection_distances.csv")
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow([
            "label", "confidence", "pixel_height", "known_height_m",
            "bbox_depth_median", "dist_pinhole_m", "dist_midas_m",
            "final_distance_m", "method"
        ])
        for det in sorted(detections, key=_nearest_first):
            writer.writerow([
                det["label"],
                f"{det['conf']:.6f}",
                det.get("pixel_height"),
                _cell(det.get("known_height_m"), ".3f"),
                f"{det.get('bbox_depth_median', 0.0):.6f}",
                _cell(det.get("dist_pinhole"), ".6f"),
                _cell(det.get("dist_midas"), ".6f"),
                _cell(det.get("distance"), ".6f"),
                det.get("method", "unknown"),
            ])

    metrics_table_path = os.path.join(eval_dir, "metrics_table.csv")
    with open(metrics_table_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["metric", "value"])
        writer.writerows(table_rows)

    report_path = os.path.join(eval_dir, "evaluation_report.txt")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("Subtask 2 Evaluation Report\n")
        f.write("===========================\n\n")
        f.write("This report measures internal consistency only.\n")
        f.write("No ground-truth object distances are available here, so these metrics\n")
        f.write("should be interpreted as coverage / robustness diagnostics, not absolute accuracy.\n\n")
        f.write("Key metrics (table)\n")
        f.write("-------------------\n")
        for k, v in table_rows:
            f.write(f"{k}: {v}\n")
        f.write("\nMetric sufficiency note\n")
        f.write("----------------------\n")
        f.write("- Enough for internal evaluation: yes.\n")
        f.write("- Enough for accuracy claims: no.\n")
        f.write("- To measure real accuracy, add ground-truth distances and report MAE/RMSE/MAPE against labels.\n")

    print(f" Saved -> {csv_path}")
    print(f" Saved -> {metrics_table_path}")
    print(f" Saved -> {report_path}")
# ───────────────────────────────────────────────────────────
# 5. DRAW ANNOTATED IMAGE
# ───────────────────────────────────────────────────────────
def draw_detections(
    img: np.ndarray,
    detections: list[dict]
) -> np.ndarray:
    """
    Return a copy of the image with a labelled box drawn for every detection.

    Caption format: "<class>: X.X m (conf%)", or "<class> (conf%)" when the
    detection has no distance. All boxes of one class share a palette colour,
    assigned in order of first appearance.

    Args:
        img: Image to annotate (left untouched; a copy is drawn on).
        detections: Dicts with 'label', 'distance', 'conf' and the box corners
            'x1', 'y1', 'x2', 'y2'.

    Returns:
        The annotated copy.
    """
    canvas = img.copy()
    font = cv2.FONT_HERSHEY_SIMPLEX
    margin = 5           # padding around the caption text, in pixels
    colour_slot = {}     # class name -> index into _PALETTE

    for det in detections:
        name, metres, score = det["label"], det["distance"], det["conf"]
        left, top, right, bottom = det["x1"], det["y1"], det["x2"], det["y2"]

        # First sighting of a class claims the next palette entry (wrapping).
        slot = colour_slot.setdefault(name, len(colour_slot) % len(_PALETTE))
        bgr = _PALETTE[slot]

        # Outline grows slightly with the box perimeter.
        line_w = max(2, int((right - left + bottom - top) / 200))
        cv2.rectangle(canvas, (left, top), (right, bottom), bgr, line_w)

        if metres is None:
            caption = f"{name} ({score:.0%})"
        else:
            caption = f"{name}: {metres:.1f} m ({score:.0%})"

        # Font size follows the box height, within readable limits.
        scale = max(0.45, min(0.9, max(1, bottom - top) / 180))
        stroke = max(1, int(scale * 2))
        (text_w, text_h), base = cv2.getTextSize(caption, font, scale, stroke)

        # Filled tag above the box, kept inside the top/left image edges.
        tag_x = max(0, left)
        tag_y = max(0, top - text_h - base - margin * 2)
        cv2.rectangle(
            canvas,
            (tag_x, tag_y),
            (tag_x + text_w + margin * 2, tag_y + text_h + base + margin * 2),
            bgr,
            -1,
        )

        # Dark text on light tags and vice versa (palette entries are B, G, R).
        luminance = 0.299 * bgr[2] + 0.587 * bgr[1] + 0.114 * bgr[0]
        ink = (0, 0, 0) if luminance > 128 else (255, 255, 255)
        cv2.putText(
            canvas,
            caption,
            (tag_x + margin, tag_y + text_h + margin),
            font,
            scale,
            ink,
            stroke,
            cv2.LINE_AA,
        )

    return canvas
# ───────────────────────────────────────────────────────────
# 6. VISUALISATION (combined figure)
# ───────────────────────────────────────────────────────────
def visualise_results(
    img: np.ndarray,
    depth_map: np.ndarray,
    detections: list[dict],
    annotated: np.ndarray,
    out_path: str
) -> None:
    """
    Render and save the combined summary figure.

    Three panels, left to right:
      1. Original image with raw YOLO boxes
      2. MiDaS depth heatmap with boxes and distance labels overlaid
      3. Final annotated image with distance labels
    A table of per-object estimates (nearest first, unresolved last) is placed
    below the panels when there is at least one detection.

    Args:
        img: Original image in BGR channel order.
        depth_map: MiDaS depth array, rendered via ``depth_to_heatmap``.
        detections: Detection dicts after distance estimation ('label', 'conf',
            'distance', 'method', box corners; 'dist_pinhole' / 'dist_midas'
            optional).
        annotated: Pre-rendered annotated image in BGR channel order.
        out_path: Destination file; its parent directory is created if needed.
    """
    def _nearest_first(det):
        # Unresolved (None) distances sort last; a real 0.0 m stays first.
        dist = det["distance"]
        return (dist is None, 0.0 if dist is None else dist)

    fig, axes = plt.subplots(1, 3, figsize=(19, 7), dpi=130)
    try:
        fig.patch.set_facecolor("#1a1a2e")

        # -- Panel 1: raw YOLO detections --
        raw_boxes = img.copy()
        for det in detections:
            x1, y1, x2, y2 = det["x1"], det["y1"], det["x2"], det["y2"]
            cv2.rectangle(raw_boxes, (x1, y1), (x2, y2), (0, 255, 120), 2)
            cv2.putText(raw_boxes, f"{det['label']} {det['conf']:.0%}",
                        (x1, max(0, y1 - 6)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.55, (0, 255, 120), 2, cv2.LINE_AA)
        axes[0].imshow(cv2.cvtColor(raw_boxes, cv2.COLOR_BGR2RGB))
        axes[0].set_title("YOLO Detections", color="white", fontsize=11,
                          fontweight="bold", pad=10)
        axes[0].axis("off")

        # -- Panel 2: MiDaS depth + boxes --
        depth_over = depth_to_heatmap(depth_map).copy()
        for det in detections:
            x1, y1, x2, y2 = det["x1"], det["y1"], det["x2"], det["y2"]
            cv2.rectangle(depth_over, (x1, y1), (x2, y2), (255, 255, 255), 2)
            # "is not None" (not truthiness) so a 0.0 m estimate is still shown,
            # matching the table below.
            dist_txt = f"{det['distance']:.1f}m" if det["distance"] is not None else "?"
            cv2.putText(depth_over, dist_txt,
                        (x1 + 3, y1 + 18),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.55, (255, 255, 255), 2, cv2.LINE_AA)
        axes[1].imshow(cv2.cvtColor(depth_over, cv2.COLOR_BGR2RGB))
        # The colourbar is a stand-alone legend, not tied to the drawn pixels.
        # NOTE(review): assumes depth_to_heatmap uses a turbo-like colormap
        # with 1 = near -- confirm against that helper.
        sm = plt.cm.ScalarMappable(cmap="turbo", norm=plt.Normalize(0, 1))
        sm.set_array([])
        cb = fig.colorbar(sm, ax=axes[1], fraction=0.035, pad=0.02)
        cb.set_label("Near ← Far", color="white", fontsize=8)
        cb.set_ticks([0, 0.5, 1])
        cb.set_ticklabels(["Far", "Mid", "Near"], color="white", fontsize=8)
        cb.ax.yaxis.set_tick_params(color="white")
        axes[1].set_title("MiDaS Depth + Distance Estimates",
                          color="white", fontsize=11, fontweight="bold", pad=10)
        axes[1].axis("off")

        # -- Panel 3: final annotated image --
        axes[2].imshow(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))
        axes[2].set_title("Object Distances (pinhole + MiDaS blend)",
                          color="white", fontsize=11, fontweight="bold", pad=10)
        axes[2].axis("off")

        # -- Distance table below --
        rows = []
        for det in sorted(detections, key=_nearest_first):
            dist_str = (f"{det['distance']:.2f} m"
                        if det["distance"] is not None else "N/A")
            ph_str = (f"{det['dist_pinhole']:.2f} m"
                      if det.get("dist_pinhole") is not None else "—")
            md_str = (f"{det['dist_midas']:.2f} m"
                      if det.get("dist_midas") is not None else "—")
            rows.append([det["label"], f"{det['conf']:.0%}",
                         ph_str, md_str, dist_str, det["method"]])

        if rows:
            # Placed below the figure area; bbox_inches="tight" on save pulls
            # it back into the output image.
            table_ax = fig.add_axes([0.05, -0.14, 0.90, 0.14])
            table_ax.axis("off")
            table_ax.set_facecolor("#1a1a2e")
            col_labels = ["Object", "Confidence",
                          "Pinhole est.", "MiDaS est.", "Final distance", "Method"]
            tbl = table_ax.table(
                cellText=rows,
                colLabels=col_labels,
                cellLoc="center", loc="center"
            )
            tbl.auto_set_font_size(False)
            tbl.set_fontsize(8.5)
            tbl.scale(1, 1.55)
            # Style header
            for j in range(len(col_labels)):
                tbl[(0, j)].set_facecolor("#2e4057")
                tbl[(0, j)].set_text_props(color="white", fontweight="bold")
            # Alternating row shading
            for i in range(1, len(rows) + 1):
                bg = "#1e2d40" if i % 2 == 0 else "#16213e"
                for j in range(len(col_labels)):
                    tbl[(i, j)].set_facecolor(bg)
                    tbl[(i, j)].set_text_props(color="#dde")

        fig.suptitle(
            "Subtask 2 — Object Detection & Distance Estimation\n"
            "Distance = pinhole camera model + MiDaS depth scaling",
            color="white", fontsize=13, fontweight="bold", y=1.02
        )
        fig.tight_layout()
        os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)
        fig.savefig(out_path, dpi=130, bbox_inches="tight",
                    facecolor=fig.get_facecolor())
    finally:
        # Always release the figure, even if rendering or saving fails.
        plt.close(fig)
    print(f"Saved -> {out_path}")
# ───────────────────────────────────────────────────────────
# 7. MAIN
# ───────────────────────────────────────────────────────────
def main() -> None:
    """
    Command-line entry point: detect objects, estimate distances, save outputs.

    Arguments (read from ``sys.argv``):
      1. image_path — required
      2. output_dir — optional, defaults to "output"
      3. focal_px   — optional focal length in pixels; when omitted it is
                      estimated from the image width assuming a 60° horizontal FOV

    Images are written to ``<output_dir>/images`` and evaluation artifacts to
    ``<output_dir>/evaluation``. Exits with a message on bad arguments or an
    unreadable image, and with status 0 when nothing is detected.
    """
    if len(sys.argv) < 2:
        sys.exit(
            "Usage: python object_distance.py <image_path> [output_dir] [focal_px]\n"
            "Example: python object_distance.py street.jpg output/ 800"
        )
    image_path = sys.argv[1]
    out_dir = sys.argv[2] if len(sys.argv) > 2 else "output"

    focal_length = None
    if len(sys.argv) > 3:
        try:
            focal_length = float(sys.argv[3])
        except ValueError:
            sys.exit(f"Invalid focal length {sys.argv[3]!r}: expected a number of pixels.")
        # Zero, negative, infinite or NaN focal lengths would yield meaningless
        # distances further down the pipeline.
        if not 0.0 < focal_length < float("inf"):
            sys.exit(f"Invalid focal length {sys.argv[3]!r}: must be a positive, finite number.")

    image_dir = os.path.join(out_dir, "images")
    eval_dir = os.path.join(out_dir, "evaluation")

    def _metres(value):
        # "is not None" rather than truthiness, so 0.0 is shown as a number.
        return f"{value:.1f} m" if value is not None else "—"

    def _nearest_first(det):
        # Unresolved (None) distances sort last; a real 0.0 m stays first.
        dist = det["distance"]
        return (dist is None, 0.0 if dist is None else dist)

    # -- Load image --
    img = load_image(image_path)
    # NOTE(review): load_image lives in depth_estimation; this guard covers
    # the case where it reports failure by returning None -- confirm its contract.
    if img is None:
        sys.exit(f"Could not read image: {image_path}")
    w = img.shape[1]
    if focal_length is None:
        focal_length = estimate_focal_length(w, fov_deg=60.0)
        print(f"Focal length estimated: {focal_length:.1f} px "
              f"(assuming 60° horizontal FOV — override via 3rd argument)")
    else:
        print(f"Focal length (user-supplied): {focal_length:.1f} px")

    # -- MiDaS depth --
    print("\n[ MiDaS ] Loading MiDaS_small ...")
    midas_model, midas_transform, device = load_midas("MiDaS_small")
    print("[ MiDaS ] Running inference ...")
    depth_map = midas_depth(img, midas_model, midas_transform, device)
    print(f" Done. depth in [0,1] mean={depth_map.mean():.3f}")

    # -- YOLO detection --
    print("\n[ YOLO ] Loading YOLOv5s ...")
    yolo_model = load_yolo("yolov5s")
    print("[ YOLO ] Running detection ...")
    detections = run_yolo(yolo_model, img)
    if not detections:
        print("WARNING: No objects detected. "
              "Try a lower confidence threshold or a different image.")
        sys.exit(0)

    # -- Distance estimation --
    print("\n[ Dist ] Estimating distances ...")
    detections, eval_context = estimate_distances(detections, depth_map, focal_length)
    metrics = compute_evaluation_metrics(detections, focal_length, eval_context)

    # Print summary table
    print(f"\n {'Object':<18} {'Conf':>5} {'Pinhole':>10} "
          f"{'MiDaS':>10} {'Final':>10} Method")
    print(" " + "-" * 70)
    for det in sorted(detections, key=_nearest_first):
        dp = _metres(det.get("dist_pinhole"))
        dm = _metres(det.get("dist_midas"))
        df = _metres(det.get("distance"))
        print(f" {det['label']:<18} {det['conf']:>4.0%} "
              f"{dp:>10} {dm:>10} {df:>10} {det['method']}")

    # -- Draw and save --
    print("\n[ Draw ] Annotating image ...")
    annotated = draw_detections(img, detections)
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    annotated_path = os.path.join(image_dir, "detections_with_distance.png")
    depth_path = os.path.join(image_dir, "midas_depth.png")
    # cv2.imwrite reports failure through its return value rather than by
    # raising, so check it instead of claiming success unconditionally.
    if cv2.imwrite(annotated_path, annotated):
        print(f" Saved -> {annotated_path}")
    else:
        print(f"WARNING: could not write {annotated_path}")
    if not cv2.imwrite(depth_path, depth_to_heatmap(depth_map)):
        print(f"WARNING: could not write {depth_path}")

    print("\n[ Fig ] Compositing combined figure ...")
    visualise_results(
        img, depth_map, detections, annotated,
        out_path=os.path.join(image_dir, "object_distance_subtask2.png")
    )
    print("\n[ Eval ] Writing evaluation artifacts ...")
    save_evaluation_outputs(detections, metrics, eval_dir)
    print(f"\nDone. Image outputs: {image_dir}/")
    print(f"Done. Evaluation outputs: {eval_dir}/")


if __name__ == "__main__":
    main()