Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files- depth_estimation.py +409 -0
- object_distance.py +799 -0
depth_estimation.py
ADDED
|
@@ -0,0 +1,409 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Subtask 1 — Depth Estimation
|
| 3 |
+
1. Classical method : SGBM Stereo Matching on a synthesised stereo pair
|
| 4 |
+
2. ML-based method : Actual MiDaS (MiDaS_small) via torch.hub
|
| 5 |
+
3. Both rendered as heatmaps (hot colours = close, cold colours = far)
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
python depth_estimation.py <image_path> [output_dir]
|
| 9 |
+
|
| 10 |
+
Example:
|
| 11 |
+
python depth_estimation.py street.jpg output/
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import sys
|
| 15 |
+
import os
|
| 16 |
+
|
| 17 |
+
import cv2
|
| 18 |
+
import numpy as np
|
| 19 |
+
import matplotlib
|
| 20 |
+
matplotlib.use("Agg")
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
from scipy.ndimage import gaussian_filter
|
| 23 |
+
import torch
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 27 |
+
# 0. LOAD IMAGE (real image required)
|
| 28 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 29 |
+
|
| 30 |
+
def load_image(path: str) -> np.ndarray:
    """
    Read an image from disk, terminating the program if that is not possible.

    Args:
        path: Filesystem path of the image to load.

    Returns:
        The array produced by ``cv2.imread(path)`` (OpenCV's default BGR
        colour read).

    Exits:
        Calls ``sys.exit`` with an explanatory message when ``path`` is
        empty, does not exist, or OpenCV cannot decode the file.
    """
    # Fail early with usage help: a missing path is almost always a CLI typo.
    if not path or not os.path.exists(path):
        sys.exit(
            f"ERROR: Image not found: '{path}'\n"
            "Usage: python depth_estimation.py <image_path>\n"
            "Example: python depth_estimation.py street.jpg"
        )

    # cv2.imread signals decode failure by returning None rather than raising.
    img = cv2.imread(path)
    if img is None:
        sys.exit(f"ERROR: Could not read image: '{path}'")

    print(f"Loaded: {path} {img.shape[1]}x{img.shape[0]} ({img.shape[2]} channels)")
    return img
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 45 |
+
# 1. CLASSICAL METHOD — SGBM STEREO MATCHING
|
| 46 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 47 |
+
|
| 48 |
+
def synthesise_stereo_pair(
    img: np.ndarray,
    baseline_shift_pct: float = 0.03
) -> tuple:
    """
    Simulate a rectified stereo pair from a single BGR image.

    A per-pixel disparity seed is estimated from two monocular cues:
      - Focus sharpness (smoothed Laplacian magnitude): sharp regions -> close
      - Vertical position: lower in the frame -> close

    The seed drives a horizontal warp that produces the right view, with a
    maximum shift of ``int(width * baseline_shift_pct)`` pixels.

    Args:
        img: BGR image (it is converted with ``cv2.COLOR_BGR2GRAY``).
        baseline_shift_pct: Maximum disparity as a fraction of image width.

    Returns:
        ``(left, right, max_shift)`` where ``left`` is a copy of ``img``,
        ``right`` is the warped view and ``max_shift`` is the maximum
        disparity in pixels.
    """
    h, w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Sharpness cue: smoothed absolute Laplacian, scaled so the peak is ~1.
    lap = cv2.Laplacian(gray.astype(np.float32), cv2.CV_32F)
    sharpness = gaussian_filter(np.abs(lap), sigma=5)
    sharpness = sharpness / (sharpness.max() + 1e-6)

    # Vertical prior: 0 at the top row, 1 at the bottom row. Kept as an
    # (h, 1) column and broadcast against the (h, w) sharpness map.
    vert = np.linspace(0, 1, h)[:, None]

    # Combine both cues equally, smooth, then min-max normalise to [0, 1].
    closeness = 0.5 * sharpness + 0.5 * vert
    closeness = gaussian_filter(closeness.astype(np.float32), sigma=10)
    closeness = (closeness - closeness.min()) / (closeness.max() - closeness.min() + 1e-6)

    max_shift = int(w * baseline_shift_pct)
    disp_seed = (closeness * max_shift).astype(np.float32)

    # cv2.remap is a backward warp: right(x, y) = left(map_x, map_y).
    # In a rectified pair a point at x_l in the left view appears at
    # x_r = x_l - d in the right view (d >= 0), so the right pixel x must
    # sample the left image at x + d. Sampling at x - d would create
    # negative disparities that a matcher with minDisparity=0 cannot find.
    map_x = np.arange(w, dtype=np.float32)[None, :] + disp_seed
    map_y = np.tile(np.arange(h, dtype=np.float32)[:, None], (1, w))
    right = cv2.remap(img, map_x, map_y, cv2.INTER_LINEAR,
                      borderMode=cv2.BORDER_REPLICATE)
    return img.copy(), right, max_shift
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def sgbm_depth(
    img: np.ndarray,
    baseline_shift_pct: float = 0.03,
    block_size: int = 7,
    uniqueness_ratio: int = 10,
    speckle_window_size: int = 100,
    speckle_range: int = 2
) -> tuple:
    """
    Estimate a closeness map with OpenCV's Semi-Global Block Matching
    (after Hirschmüller 2008) on a stereo pair synthesised from ``img``.

    SGBM aggregates a per-pixel matching cost along several 1-D paths and
    adds smoothness penalties P1/P2 that discourage disparity jumps. The
    matcher is created in ``STEREO_SGBM_MODE_SGBM_3WAY`` mode.

    Args:
        img: BGR input image.
        baseline_shift_pct: Passed to ``synthesise_stereo_pair``.
        block_size: Matching window size; forced to an odd value >= 3.
        uniqueness_ratio: SGBM ``uniquenessRatio``.
        speckle_window_size: SGBM ``speckleWindowSize``.
        speckle_range: SGBM ``speckleRange``.

    Returns:
        depth_norm — normalised closeness map in [0, 1], 1 = close
        left_img   — left view of the stereo pair
        right_img  — right view of the stereo pair
    """
    left_img, right_img, max_shift = synthesise_stereo_pair(
        img, baseline_shift_pct=baseline_shift_pct
    )

    left_g = cv2.cvtColor(left_img, cv2.COLOR_BGR2GRAY)
    right_g = cv2.cvtColor(right_img, cv2.COLOR_BGR2GRAY)

    # numDisparities must be a positive multiple of 16 that covers max_shift.
    num_disp = max(16, ((max_shift // 16) + 1) * 16)

    # blockSize must be odd and at least 3.
    block = max(3, int(block_size))
    if block % 2 == 0:
        block += 1

    # NOTE(review): OpenCV's suggested penalties are 8*cn*block^2 and
    # 32*cn*block^2, where cn is the channel count. The inputs here are
    # single-channel, so the factor 3 gives stronger smoothing than that
    # suggestion. Kept unchanged to preserve the existing tuning.
    matcher = cv2.StereoSGBM_create(
        minDisparity=0,
        numDisparities=num_disp,
        blockSize=block,
        P1=8 * 3 * block ** 2,     # small-discontinuity penalty
        P2=32 * 3 * block ** 2,    # large-discontinuity penalty
        disp12MaxDiff=5,
        uniquenessRatio=uniqueness_ratio,
        speckleWindowSize=speckle_window_size,
        speckleRange=speckle_range,
        mode=cv2.STEREO_SGBM_MODE_SGBM_3WAY
    )

    # compute() returns fixed-point disparities scaled by 16; invalid pixels
    # come back negative and are clamped to zero.
    disp = matcher.compute(left_g, right_g).astype(np.float32) / 16.0
    disp = np.maximum(disp, 0)

    # Edge-preserving smoothing (bilateral keeps object boundaries clean).
    disp = cv2.bilateralFilter(disp, d=9, sigmaColor=75, sigmaSpace=75)

    # Normalise to [0, 1]: high disparity = close = 1.
    d = (disp - disp.min()) / (disp.max() - disp.min() + 1e-6)

    # Guided-filter refinement sharpens depth edges using the left view.
    # cv2.ximgproc only exists in opencv-contrib builds, so fall back to the
    # unrefined map instead of crashing on a plain OpenCV install.
    ximgproc = getattr(cv2, "ximgproc", None)
    if ximgproc is not None and hasattr(ximgproc, "guidedFilter"):
        d_8u = (d * 255).clip(0, 255).astype(np.uint8)
        d = ximgproc.guidedFilter(
            guide=left_g, src=d_8u, radius=8, eps=200, dDepth=cv2.CV_32F)
        d = np.clip(d / (d.max() + 1e-6), 0, 1)
    else:
        print("    cv2.ximgproc not available; skipping guided-filter refinement")

    return d, left_img, right_img
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 155 |
+
# 2. ML-BASED METHOD — Actual MiDaS (MiDaS_small)
|
| 156 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 157 |
+
|
| 158 |
+
def load_midas(model_type: str = "MiDaS_small"):
    """
    Load a MiDaS model and its matching preprocessing transform from
    torch.hub (``intel-isl/MiDaS``).

    Documented ``model_type`` values (largest -> smallest):
        "DPT_Large"   — DPT large model
        "DPT_Hybrid"  — DPT hybrid model
        "MiDaS"       — MiDaS v2.1 large
        "MiDaS_small" — MiDaS v2.1 small (default)

    torch.hub caches the downloaded weights after the first call.

    Args:
        model_type: Hub entry-point name of the model to load.

    Returns:
        ``(model, transform, device)``: the model in eval mode on ``device``,
        the preprocessing callable for that model, and the device used
        (CUDA when available, otherwise CPU).
    """
    print(f"[ MiDaS ] Loading model '{model_type}' from torch.hub ...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f" Device: {device}")

    model = torch.hub.load("intel-isl/MiDaS", model_type, trust_repo=True)
    model.to(device).eval()

    transforms = torch.hub.load("intel-isl/MiDaS", "transforms", trust_repo=True)
    # Each model family needs its own resize/normalisation. Previously the
    # plain "MiDaS" model was given the DPT transform, which feeds it
    # inputs normalised for a different network.
    if model_type == "MiDaS_small":
        transform = transforms.small_transform
    elif model_type == "MiDaS":
        transform = transforms.default_transform
    else:
        transform = transforms.dpt_transform

    n_params = sum(p.numel() for p in model.parameters())
    print(f" Model loaded ({n_params:,} parameters)")
    return model, transform, device
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def midas_depth(
    img: np.ndarray,
    model,
    transform,
    device: torch.device
) -> np.ndarray:
    """
    Run MiDaS inference on a BGR image.

    MiDaS predicts *inverse* relative depth (disparity-like): larger values
    correspond to closer surfaces. The output is min-max normalised to
    [0, 1] so that 1 = close.

    Pipeline:
        BGR image
        -> RGB conversion
        -> MiDaS transform (model-specific resize + normalisation)
        -> model forward pass
        -> bilinear upsample to the original resolution
        -> normalise to [0, 1]

    Args:
        img: BGR image.
        model: MiDaS model, e.g. as returned by ``load_midas``.
        transform: Preprocessing callable matching ``model``.
        device: Device the model lives on.

    Returns:
        depth_norm — float32 closeness map in [0, 1] with shape ``(h, w)``.
    """
    h, w = img.shape[:2]
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Preprocess: resize + normalise.
    input_batch = transform(img_rgb).to(device)

    with torch.no_grad():
        prediction = model(input_batch)
        # Upsample back to the original resolution. Index [0, 0] rather than
        # .squeeze() so that an image with h == 1 or w == 1 keeps both axes.
        prediction = torch.nn.functional.interpolate(
            prediction.unsqueeze(1),
            size=(h, w),
            mode="bilinear",
            align_corners=False,
        )[0, 0]

    depth = prediction.cpu().numpy()

    # MiDaS output is inverse depth: a higher value means closer.
    depth = (depth - depth.min()) / (depth.max() - depth.min() + 1e-6)
    return depth.astype(np.float32)
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 236 |
+
# 3. VISUALISATION
|
| 237 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 238 |
+
|
| 239 |
+
def depth_to_heatmap(depth: np.ndarray) -> np.ndarray:
    """
    Convert a closeness map to a turbo-coloured BGR image.

    Args:
        depth: 2-D array in [0, 1] where 1 = close.

    Returns:
        uint8 BGR image of the same height and width.
    """
    cmap = plt.get_cmap("turbo")
    # The colormap yields RGBA floats in [0, 1]; drop alpha, scale to 8-bit.
    rgb = (cmap(depth)[:, :, :3] * 255).astype(np.uint8)
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
def visualise_results(
    img: np.ndarray,
    depth_cl: np.ndarray,
    depth_ml: np.ndarray,
    out_path: str = "output/depth_estimation_subtask1.png"
) -> None:
    """
    Compose and save a 2-row, 3-column comparison figure.

    Top row:
        Col 1 — original image
        Col 2 — classical SGBM heatmap with three scan lines
        Col 3 — MiDaS heatmap with three scan lines
    Bottom row:
        Col 1 — both methods along the middle scan line
        Col 2 — SGBM profiles along the three scan lines
        Col 3 — MiDaS profiles along the three scan lines

    Args:
        img: BGR source image.
        depth_cl: Classical closeness map in [0, 1], same height/width as img.
        depth_ml: MiDaS closeness map in [0, 1], same height/width as img.
        out_path: Destination PNG path; its directory is created if needed.
    """
    h, w = img.shape[:2]
    ncols = 3

    # Loop-invariant values, computed once for all panels.
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    turbo = plt.get_cmap("turbo")

    fig = plt.figure(figsize=(ncols * 5.6, 11), dpi=130)
    fig.patch.set_facecolor("#1a1a2e")

    titles = [
        "Original Image",
        "Classical Depth\n(SGBM Stereo Matching)",
        "ML-Based Depth\n(MiDaS_small — actual model)",
    ]
    depths = [None, depth_cl, depth_ml]

    ax_top = [fig.add_subplot(2, ncols, c + 1) for c in range(ncols)]
    ax_bot = [fig.add_subplot(2, ncols, ncols + c + 1) for c in range(ncols)]

    # -- Top row: images / heatmaps --
    for ax, title, d in zip(ax_top, titles, depths):
        ax.set_title(title, color="white", fontsize=10, fontweight="bold", pad=8)
        ax.axis("off")
        if d is None:
            ax.imshow(rgb)
            continue

        # Blend a little of the photo into the heatmap to keep scene context.
        cmap_arr = turbo(d)[:, :, :3]
        blended = rgb.astype(np.float32) / 255 * 0.22 + cmap_arr * 0.78
        ax.imshow(blended)

        sm = plt.cm.ScalarMappable(cmap="turbo",
                                   norm=plt.Normalize(vmin=0, vmax=1))
        sm.set_array([])
        cb = fig.colorbar(sm, ax=ax, fraction=0.03, pad=0.02)
        # The axis runs from 0 (far) to 1 (near), matching the tick labels.
        cb.set_label("Far -> Near", color="white", fontsize=7)
        cb.set_ticks([0, 0.5, 1])
        cb.set_ticklabels(["Far", "Mid", "Near"], color="white", fontsize=7)
        cb.ax.yaxis.set_tick_params(color="white")

    # -- Scan lines on heatmap panels --
    scan_ys = [int(h * f) for f in [0.25, 0.50, 0.75]]
    scan_colors = ["#ff6b6b", "#ffd93d", "#6bcb77"]
    for ax in ax_top[1:]:
        for sy, sc in zip(scan_ys, scan_colors):
            ax.axhline(sy, color=sc, linewidth=1.2, alpha=0.75)

    # -- Bottom row: depth profile plots --
    x = np.arange(w)
    method_maps = [depth_cl, depth_ml]
    method_names = ["Classical (SGBM)", "MiDaS (actual)"]
    ls = ["-", "--"]

    for col, ax in enumerate(ax_bot):
        ax.set_facecolor("#16213e")
        for sp in ["top", "right"]:
            ax.spines[sp].set_visible(False)
        for sp in ["bottom", "left"]:
            ax.spines[sp].set_color("#555")
        ax.tick_params(colors="#888", labelsize=7)
        ax.set_xlim(0, w - 1)
        ax.set_ylim(-0.05, 1.05)
        ax.set_xlabel("Pixel x", color="#aaa", fontsize=8)
        ax.set_ylabel("Closeness (1 = near)", color="#aaa", fontsize=8)

        if col == 0:
            # Compare both methods at the middle scan line.
            ax.set_title("Method comparison — middle scan line",
                         color="white", fontsize=9, pad=6)
            sy = scan_ys[1]
            for mp, nm, l in zip(method_maps, method_names, ls):
                ax.plot(x, mp[sy, :], linestyle=l, linewidth=1.6, label=nm)
            ax.legend(fontsize=8, framealpha=0.25, labelcolor="white")
        else:
            # Per-method: three scan lines.
            mp = method_maps[col - 1]
            nm = method_names[col - 1]
            ax.set_title(f"{nm} — scan-line profiles",
                         color="white", fontsize=9, pad=6)
            for sy, sc in zip(scan_ys, scan_colors):
                ax.plot(x, mp[sy, :], color=sc, linewidth=1.4,
                        label=f"y = {sy}")
            ax.legend(fontsize=7, framealpha=0.25, labelcolor="white")

    # -- Colour scale strip along the bottom edge --
    ax_s = fig.add_axes([0.05, 0.01, 0.90, 0.022])
    ax_s.imshow(np.linspace(0, 1, 512).reshape(1, -1),
                aspect="auto", cmap="turbo")
    ax_s.set_yticks([])
    ax_s.set_xticks([0, 170, 341, 511])
    ax_s.set_xticklabels(
        ["Far (cold / blue)", "Mid-far", "Mid-close", "Close (hot / red)"],
        color="white", fontsize=8
    )

    fig.suptitle(
        "Subtask 1 — Classical (SGBM) vs ML-Based (MiDaS) Depth Estimation\n"
        "Heatmap: red/hot = close blue/cold = far",
        color="white", fontsize=13, fontweight="bold", y=1.003
    )
    fig.tight_layout(rect=[0, 0.05, 1, 1])

    # Always release the figure, even when the directory or file write fails.
    try:
        os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)
        fig.savefig(out_path, dpi=130, bbox_inches="tight",
                    facecolor=fig.get_facecolor())
    finally:
        plt.close(fig)
    print(f"Saved -> {out_path}")
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 363 |
+
# 4. MAIN
|
| 364 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 365 |
+
|
| 366 |
+
def main() -> None:
    """
    Command-line entry point.

    Reads ``<image_path> [output_dir]`` from ``sys.argv``, runs the SGBM and
    MiDaS depth estimators on the image, and writes the heatmaps, the
    synthesised stereo pair and the comparison figure into ``output_dir``
    (default ``"output"``).
    """
    if len(sys.argv) < 2:
        sys.exit(
            "Usage: python depth_estimation.py <image_path> [output_dir]\n"
            "Example: python depth_estimation.py street.jpg output/"
        )

    image_path = sys.argv[1]
    out_dir = sys.argv[2] if len(sys.argv) > 2 else "output"

    # -- Load image --
    img = load_image(image_path)

    # -- Classical: SGBM --
    print("\n[ Classical ] Running SGBM stereo matching ...")
    depth_cl, left_img, right_img = sgbm_depth(img)
    print(f" Done. depth in [0,1] mean={depth_cl.mean():.3f}")

    # -- ML: actual MiDaS --
    print("\n[ MiDaS ] Loading and running MiDaS_small ...")
    midas_model, midas_transform, device = load_midas("MiDaS_small")
    depth_ml = midas_depth(img, midas_model, midas_transform, device)
    print(f" Done. depth in [0,1] mean={depth_ml.mean():.3f}")

    # -- Save outputs --
    os.makedirs(out_dir, exist_ok=True)
    outputs = {
        "classical_heatmap.png": depth_to_heatmap(depth_cl),
        "midas_heatmap.png": depth_to_heatmap(depth_ml),
        "stereo_left.png": left_img,
        "stereo_right.png": right_img,
    }
    for filename, image in outputs.items():
        target = os.path.join(out_dir, filename)
        # cv2.imwrite reports failure through its return value, not an
        # exception, so surface it instead of silently losing the file.
        if not cv2.imwrite(target, image):
            print(f"WARNING: could not write '{target}'")

    print("\n[ Visualise ] Compositing final figure ...")
    visualise_results(
        img, depth_cl, depth_ml,
        out_path=os.path.join(out_dir, "depth_estimation_subtask1.png")
    )

    print(f"\nDone. Outputs written to: {out_dir}/")


if __name__ == "__main__":
    main()
|
object_distance.py
ADDED
|
@@ -0,0 +1,799 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Subtask 2 — Object Detection + Distance Estimation
|
| 3 |
+
1. Detect objects with YOLOv5s (torch.hub)
|
| 4 |
+
2. Estimate metric distance (metres) per object using two complementary strategies:
|
| 5 |
+
A) Pinhole camera model β uses known real-world object heights
|
| 6 |
+
B) MiDaS depth scaling β calibrates MiDaS relative depth with pinhole anchors,
|
| 7 |
+
then applies the calibrated scale to all objects
|
| 8 |
+
3. Draw labelled bounding boxes on the image ("person: 5.2 m")
|
| 9 |
+
4. Produce a combined figure: original detections | MiDaS depth | annotated result
|
| 10 |
+
|
| 11 |
+
Usage:
|
| 12 |
+
python object_distance.py <image_path> [output_dir] [focal_length_px]
|
| 13 |
+
|
| 14 |
+
Examples:
|
| 15 |
+
python object_distance.py street.jpg
|
| 16 |
+
python object_distance.py street.jpg output/ 800
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import sys
|
| 20 |
+
import os
|
| 21 |
+
import math
|
| 22 |
+
import csv
|
| 23 |
+
import json
|
| 24 |
+
from typing import Optional, Tuple, List
|
| 25 |
+
|
| 26 |
+
import cv2
|
| 27 |
+
import numpy as np
|
| 28 |
+
import matplotlib
|
| 29 |
+
matplotlib.use("Agg")
|
| 30 |
+
import matplotlib.pyplot as plt
|
| 31 |
+
import torch
|
| 32 |
+
|
| 33 |
+
# ── re-use MiDaS loader from Subtask 1 ──────────────────────
|
| 34 |
+
sys.path.insert(0, os.path.dirname(__file__))
|
| 35 |
+
from depth_estimation import load_image, load_midas, midas_depth, depth_to_heatmap
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 39 |
+
# 1. KNOWN OBJECT HEIGHTS (metres)
|
| 40 |
+
# Used by the pinhole camera model.
|
| 41 |
+
# Values are representative averages for the COCO classes
|
| 42 |
+
# that appear most often in street / indoor scenes.
|
| 43 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 44 |
+
|
| 45 |
+
# Representative real-world heights in metres, keyed by detector class label.
# Keys must match the label strings produced by the detector exactly.
KNOWN_HEIGHTS: dict[str, float] = {
    # People & animals
    "person":        1.70,
    "cat":           0.30,
    "dog":           0.50,
    "horse":         1.60,
    "cow":           1.40,
    "sheep":         0.90,
    "elephant":      3.00,
    "bear":          1.20,
    "zebra":         1.40,
    "giraffe":       4.50,
    # Vehicles
    "bicycle":       1.00,
    "car":           1.50,
    "motorcycle":    1.10,
    "airplane":      4.00,
    "bus":           3.20,
    "train":         4.00,
    "truck":         3.50,
    "boat":          1.50,
    # Street furniture
    "traffic light": 0.90,
    "fire hydrant":  0.60,
    "stop sign":     0.75,
    "parking meter": 1.20,
    "bench":         0.90,
    # Indoor objects
    "chair":         0.90,
    "couch":         0.85,
    "bed":           0.55,
    "dining table":  0.75,
    "toilet":        0.40,
    "tv":            0.65,
    "laptop":        0.30,
    "microwave":     0.35,
    "oven":          0.90,
    "refrigerator":  1.80,
    "sink":          0.20,
    "door":          2.10,
    # Handheld / small
    "bottle":        0.25,
    "cup":           0.12,
    "backpack":      0.50,
    "umbrella":      1.00,
    "handbag":       0.30,
    "suitcase":      0.65,
    "sports ball":   0.22,
    "baseball bat":  1.05,
    "skateboard":    0.15,
    "surfboard":     1.80,
    "tennis racket": 0.68,
    "book":          0.22,
    "clock":         0.30,
    "vase":          0.30,
    "scissors":      0.18,
}
|
| 102 |
+
|
| 103 |
+
# Colour palette (BGR) — one per class, cycling if more classes appear
|
| 104 |
+
_PALETTE = [
|
| 105 |
+
(0, 200, 255), # yellow
|
| 106 |
+
(0, 255, 100), # green
|
| 107 |
+
(255, 80, 80), # blue
|
| 108 |
+
(180, 0, 255), # magenta
|
| 109 |
+
(0, 160, 255), # orange
|
| 110 |
+
(255, 200, 0), # cyan
|
| 111 |
+
(100, 255, 200), # lime
|
| 112 |
+
(255, 50, 180), # pink
|
| 113 |
+
]
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 117 |
+
# 2. FOCAL LENGTH ESTIMATION
|
| 118 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 119 |
+
|
| 120 |
+
def estimate_focal_length(image_width: int, fov_deg: float = 60.0) -> float:
    """
    Estimate the focal length in pixels from an assumed horizontal FOV.

        f = (image_width / 2) / tan(FOV / 2)

    The 60° default is an assumption; supply a measured focal length to the
    caller instead when camera metadata is available.

    Args:
        image_width: Image width in pixels; must be positive.
        fov_deg: Horizontal field of view in degrees, strictly between
            0 and 180.

    Returns:
        Focal length in pixels.

    Raises:
        ValueError: If ``image_width`` is not positive or ``fov_deg`` is
            outside the open interval (0, 180).
    """
    if image_width <= 0:
        raise ValueError(f"image_width must be positive, got {image_width}")
    # tan(FOV / 2) is zero at 0° and unbounded at 180°; outside that range
    # the formula yields a division by zero or a negative focal length.
    if not 0.0 < fov_deg < 180.0:
        raise ValueError(f"fov_deg must be in (0, 180), got {fov_deg}")
    return (image_width / 2.0) / math.tan(math.radians(fov_deg / 2.0))
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 133 |
+
# 3. OBJECT DETECTION (YOLOv5s via torch.hub)
|
| 134 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 135 |
+
|
| 136 |
+
def load_yolo(
    model_name: str = "yolov5s",
    conf_thresh: float = 0.35,
    iou_thresh: float = 0.45
):
    """
    Load a pretrained YOLOv5 model from torch.hub (``ultralytics/yolov5``).

    Available sizes, from fastest to most accurate:
        yolov5n — nano
        yolov5s — small (default)
        yolov5m — medium
        yolov5l — large
        yolov5x — extra-large

    Args:
        model_name: Hub entry-point name of the model variant.
        conf_thresh: Value assigned to the model's ``conf`` attribute.
        iou_thresh: Value assigned to the model's ``iou`` attribute.

    Returns:
        The loaded model with both thresholds applied.
    """
    print(f"[ YOLO ] Loading {model_name} from torch.hub ...")
    model = torch.hub.load(
        "ultralytics/yolov5", model_name,
        pretrained=True, trust_repo=True
    )
    model.conf = conf_thresh
    model.iou = iou_thresh
    print(f" Loaded ({model_name})")
    return model
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def run_yolo(
    model,
    img: np.ndarray,
    conf_thresh: float = 0.35
) -> list[dict]:
    """
    Run YOLOv5 on a BGR image.

    Args:
        model: Detector, e.g. as returned by ``load_yolo``.
        img: BGR image; it is converted to RGB before inference.
        conf_thresh: Detections with lower confidence are dropped.

    Returns:
        A list of detections, each a dict:
            { 'label': str, 'conf': float,
              'x1': int, 'y1': int, 'x2': int, 'y2': int }
    """
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = model(img_rgb)
    df = results.pandas().xyxy[0]   # one DataFrame row per detection

    detections = []
    for _, row in df.iterrows():
        if row["confidence"] < conf_thresh:
            continue
        # int() truncates the float box corners to whole pixels.
        detections.append({
            "label": row["name"],
            "conf": float(row["confidence"]),
            "x1": int(row["xmin"]),
            "y1": int(row["ymin"]),
            "x2": int(row["xmax"]),
            "y2": int(row["ymax"]),
        })

    print(f" {len(detections)} object(s) detected")
    return detections
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 196 |
+
# 4. DISTANCE ESTIMATION
|
| 197 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 198 |
+
|
| 199 |
+
def pinhole_distance(
    pixel_height: int,
    real_height: float,
    focal_length: float
) -> float:
    """
    Distance to an object under the pinhole camera model.

    An object of real height H seen at distance D through a camera of focal
    length f projects to h = (H * f) / D pixels, hence

        D = (H * f) / h

    The result is in the units of ``real_height`` when ``focal_length`` and
    ``pixel_height`` are both in pixels.  A non-positive ``pixel_height``
    yields ``float("inf")`` instead of dividing by zero.
    """
    degenerate = pixel_height <= 0
    return float("inf") if degenerate else (real_height * focal_length) / pixel_height
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
def detection_depth_stat(
    depth_map: np.ndarray,
    det: dict,
    inner_ratio: float = 0.6
) -> float:
    """
    Robust per-detection depth statistic: the median of ``depth_map`` inside
    the central part of a bounding box.

    Only the central ``inner_ratio`` fraction of the box (per axis) is
    sampled, to reduce leakage from neighbouring objects and background near
    the box edges.  If that shrunken window is empty, the whole box is used.

    Args:
        depth_map: 2-D array indexed as [row, column].
        det: detection dict providing integer ``x1``, ``y1``, ``x2``, ``y2``.
        inner_ratio: fraction of the box kept per axis; clipped to [0.1, 1.0].

    Returns:
        The median of the sampled region as a float, or 0.0 when the box does
        not overlap the depth map at all.
    """
    inner_ratio = float(np.clip(inner_ratio, 0.1, 1.0))
    map_h, map_w = depth_map.shape[:2]
    x1, y1, x2, y2 = det["x1"], det["y1"], det["x2"], det["y2"]

    box_w = max(1, x2 - x1)
    box_h = max(1, y2 - y1)
    dx = int(box_w * (1.0 - inner_ratio) / 2.0)
    dy = int(box_h * (1.0 - inner_ratio) / 2.0)

    def _crop(left, top, right, bottom):
        # Clamp EVERY bound into the map.  A negative stop index would
        # otherwise be read by NumPy as "count from the far edge", so a box
        # lying off the top/left of the image would silently sample an
        # unrelated region instead of producing an empty window.
        left, right = (max(0, min(map_w, int(v))) for v in (left, right))
        top, bottom = (max(0, min(map_h, int(v))) for v in (top, bottom))
        return depth_map[top:bottom, left:right]

    roi = _crop(x1 + dx, y1 + dy, x2 - dx, y2 - dy)
    if roi.size == 0:
        # Shrunken window collapsed (tiny or clipped box): use the full box.
        roi = _crop(x1, y1, x2, y2)
    if roi.size == 0:
        return 0.0

    return float(np.median(roi))
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def midas_scale_calibration(
    detections: list[dict],
    depth_map: np.ndarray,
    focal_length: float,
    inner_ratio: float = 0.6,
    min_depth_value: float = 0.02
) -> Tuple[Optional[float], List[float]]:
    """
    Calibrate the relative depth scale from objects of known real height.

    The depth map is treated as inverse relative depth d, i.e. d is
    proportional to 1/D for a metric distance D.  For an anchor object whose
    distance D_pinhole is known from the pinhole model this gives

        D_pinhole = k / d   =>   k = D_pinhole * d

    One k is collected for every detection whose label is in KNOWN_HEIGHTS,
    whose box is taller than 5 px, and whose depth statistic exceeds
    ``min_depth_value``.  The median of those k values is the scale factor.

    Returns:
        (scale, k_values); ``(None, [])`` when no detection qualifies.
    """
    def _anchor_factors():
        # Yield one k per usable anchor, in detection order.
        for candidate in detections:
            known_h = KNOWN_HEIGHTS.get(candidate["label"])
            if known_h is None:
                continue

            box_h = candidate["y2"] - candidate["y1"]
            if box_h <= 5:
                continue
            metric_dist = pinhole_distance(box_h, known_h, focal_length)

            relative = detection_depth_stat(
                depth_map, candidate, inner_ratio=inner_ratio
            )
            # Near-zero depth values are treated as invalid regions.
            if relative > min_depth_value:
                yield metric_dist * relative

    k_values = list(_anchor_factors())
    if len(k_values) == 0:
        return None, []

    scale = float(np.median(k_values))
    print(f" MiDaS scale factor k = {scale:.2f} "
          f"(from {len(k_values)} anchor object(s))")
    return scale, k_values
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
def estimate_distances(
    detections: list[dict],
    depth_map: np.ndarray,
    focal_length: float,
    inner_ratio: float = 0.6,
    min_depth_value: float = 0.02,
    blend_weight_pinhole: float = 0.55
) -> tuple[list[dict], dict]:
    """
    Attach a distance estimate to every detection (dicts are updated in place).

    Strategy:
      1. Pinhole model: used when the class has a known reference height and
         the box is taller than 5 px.
      2. Depth scaling: once a scale factor has been calibrated from the
         pinhole anchors it is applied to every detection whose depth
         statistic exceeds ``min_depth_value``.
      3. Final distance: weighted average of the two when both exist
         (``blend_weight_pinhole`` is clipped to [0, 1]); otherwise whichever
         single estimate exists.

    Keys added to each detection dict:
        pixel_height, known_height_m, bbox_depth_median,
        dist_pinhole  - pinhole estimate, or None
        dist_midas    - scaled-depth estimate, or None
        distance      - final estimate, or None
        method        - "pinhole + MiDaS", "pinhole", "MiDaS" or "unknown"

    Returns:
        (detections, eval_context) where eval_context records the scale
        factor, the per-anchor scales and the parameters that were used.
    """
    # Step 1: calibrate the depth scale from the known-height anchors.
    midas_scale, anchor_scales = midas_scale_calibration(
        detections,
        depth_map,
        focal_length,
        inner_ratio=inner_ratio,
        min_depth_value=min_depth_value,
    )
    blend_weight_pinhole = float(np.clip(blend_weight_pinhole, 0.0, 1.0))
    blend_weight_midas = 1.0 - blend_weight_pinhole

    for det in detections:
        ref_height = KNOWN_HEIGHTS.get(det["label"])
        box_height = det["y2"] - det["y1"]
        det["pixel_height"] = box_height
        det["known_height_m"] = ref_height
        rel_depth = detection_depth_stat(depth_map, det, inner_ratio=inner_ratio)
        det["bbox_depth_median"] = rel_depth

        # Pinhole estimate.
        pinhole_ok = ref_height is not None and box_height > 5
        from_pinhole = (
            pinhole_distance(box_height, ref_height, focal_length)
            if pinhole_ok else None
        )
        det["dist_pinhole"] = from_pinhole

        # Scaled-depth estimate.
        from_midas = (
            midas_scale / rel_depth
            if midas_scale and rel_depth > min_depth_value else None
        )
        det["dist_midas"] = from_midas

        # Blend.
        if from_pinhole is None and from_midas is None:
            final, how = None, "unknown"
        elif from_midas is None:
            final, how = from_pinhole, "pinhole"
        elif from_pinhole is None:
            final, how = from_midas, "MiDaS"
        else:
            final = (blend_weight_pinhole * from_pinhole
                     + blend_weight_midas * from_midas)
            how = "pinhole + MiDaS"
        det["distance"] = final
        det["method"] = how

    eval_context = {
        "midas_scale": midas_scale,
        "anchor_scales": anchor_scales,
        "depth_inner_ratio": inner_ratio,
        "min_depth_value": min_depth_value,
        "blend_weight_pinhole": blend_weight_pinhole,
    }
    return detections, eval_context
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
def compute_evaluation_metrics(
    detections: list[dict],
    focal_length: float,
    eval_context: dict
) -> dict:
    """
    Summarise one run as a flat dict of internal-consistency metrics.

    No ground-truth distances are involved: the metrics describe coverage
    (how many detections received an estimate), the spread of the calibration
    anchors, and how closely the pinhole and scaled-depth estimates agree on
    detections that have both.  They are not a measure of absolute accuracy.

    Distance, anchor and agreement statistics are only present when there is
    at least one value to compute them from; the two coverage ratios and the
    mean confidence are None for an empty detection list.
    """
    def _present(field):
        # float32 array of the non-None values stored under ``field``.
        return np.array(
            [d[field] for d in detections if d.get(field) is not None],
            dtype=np.float32
        )

    def _count(predicate):
        return sum(predicate(d) for d in detections)

    total = len(detections)
    if detections:
        confs = np.array([d["conf"] for d in detections], dtype=np.float32)
    else:
        confs = np.array([])
    final_dists = _present("distance")
    pinhole_vals = _present("dist_pinhole")
    midas_vals = _present("dist_midas")
    both_branches = [
        d for d in detections
        if d.get("dist_pinhole") is not None and d.get("dist_midas") is not None
    ]
    anchor_scales = np.array(eval_context.get("anchor_scales", []), dtype=np.float32)

    metrics = {}
    metrics["focal_length_px"] = float(focal_length)
    metrics["num_detections"] = total
    metrics["mean_confidence"] = float(confs.mean()) if confs.size else None
    metrics["known_height_count"] = _count(
        lambda d: d.get("known_height_m") is not None)
    metrics["pinhole_count"] = int(pinhole_vals.size)
    metrics["midas_count"] = int(midas_vals.size)
    metrics["blended_count"] = _count(
        lambda d: d.get("method") == "pinhole + MiDaS")
    metrics["unresolved_count"] = _count(lambda d: d.get("distance") is None)
    metrics["calibration_anchor_count"] = int(anchor_scales.size)
    metrics["midas_scale_factor"] = eval_context.get("midas_scale")

    if total:
        metrics["known_height_coverage"] = metrics["known_height_count"] / total
        metrics["distance_coverage"] = float(final_dists.size) / total
    else:
        metrics["known_height_coverage"] = None
        metrics["distance_coverage"] = None

    if final_dists.size:
        metrics["final_distance_mean_m"] = float(final_dists.mean())
        metrics["final_distance_std_m"] = float(final_dists.std())
        metrics["final_distance_min_m"] = float(final_dists.min())
        metrics["final_distance_max_m"] = float(final_dists.max())

    if anchor_scales.size:
        metrics["anchor_scale_median"] = float(np.median(anchor_scales))
        metrics["anchor_scale_std"] = float(anchor_scales.std())
        # Coefficient of variation; the epsilon guards a zero mean.
        metrics["anchor_scale_cv"] = float(
            anchor_scales.std() / (anchor_scales.mean() + 1e-6))

    if both_branches:
        pinhole_arr = np.array(
            [d["dist_pinhole"] for d in both_branches], dtype=np.float32)
        midas_arr = np.array(
            [d["dist_midas"] for d in both_branches], dtype=np.float32)
        abs_err = np.abs(midas_arr - pinhole_arr)
        # Relative error is measured against the pinhole estimate.
        rel_err = abs_err / np.maximum(pinhole_arr, 1e-6)
        metrics["agreement_sample_count"] = int(len(both_branches))
        metrics["agreement_mae_m"] = float(abs_err.mean())
        metrics["agreement_rmse_m"] = float(np.sqrt(np.mean(abs_err ** 2)))
        metrics["agreement_mean_relative_error"] = float(rel_err.mean())
        metrics["agreement_median_relative_error"] = float(np.median(rel_err))
        metrics["agreement_within_10pct"] = float(np.mean(rel_err <= 0.10))
        metrics["agreement_within_20pct"] = float(np.mean(rel_err <= 0.20))

    return metrics
|
| 466 |
+
|
| 467 |
+
|
| 468 |
+
def save_evaluation_outputs(
    detections: list[dict],
    metrics: dict,
    eval_dir: str
) -> None:
    """
    Write the evaluation artifacts of one run into ``eval_dir``.

    Three files are produced (the directory is created if needed):
      detection_distances.csv - one row per detection, nearest first, with
                                unresolved detections (no distance) last;
                                missing values are written as empty cells.
      metrics.json            - ``metrics`` dumped as indented JSON.
      evaluation_report.txt   - the same metrics as "key: value" lines plus
                                a note on what they can and cannot support.

    Args:
        detections: detection dicts; ``label`` and ``conf`` are required,
            every other column is read with ``.get``.
        metrics: flat, JSON-serialisable dict of metric name -> value.
        eval_dir: output directory.
    """
    os.makedirs(eval_dir, exist_ok=True)

    def _nearest_first(det):
        # Sort resolved detections by distance and push unresolved ones to
        # the end.  A tuple key avoids a magic sentinel distance, which would
        # misplace far objects and treat a valid 0.0 as "missing".
        dist = det.get("distance")
        return (dist is None, 0.0 if dist is None else dist)

    def _cell(det, key, spec):
        # Formatted value of det[key], or an empty CSV cell when absent/None.
        value = det.get(key)
        return "" if value is None else format(value, spec)

    csv_path = os.path.join(eval_dir, "detection_distances.csv")
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow([
            "label", "confidence", "pixel_height", "known_height_m",
            "bbox_depth_median", "dist_pinhole_m", "dist_midas_m",
            "final_distance_m", "method"
        ])
        writer.writerows(
            [
                det["label"],
                f"{det['conf']:.6f}",
                det.get("pixel_height"),
                _cell(det, "known_height_m", ".3f"),
                f"{det.get('bbox_depth_median', 0.0):.6f}",
                _cell(det, "dist_pinhole", ".6f"),
                _cell(det, "dist_midas", ".6f"),
                _cell(det, "distance", ".6f"),
                det.get("method", "unknown"),
            ]
            for det in sorted(detections, key=_nearest_first)
        )

    metrics_path = os.path.join(eval_dir, "metrics.json")
    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)

    report_path = os.path.join(eval_dir, "evaluation_report.txt")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("Subtask 2 Evaluation Report\n")
        f.write("===========================\n\n")
        f.write("This report measures internal consistency only.\n")
        f.write("No ground-truth object distances are available here, so these metrics\n")
        f.write("should be interpreted as coverage / robustness diagnostics, not absolute accuracy.\n\n")
        f.write("Key metrics\n")
        f.write("-----------\n")
        for key, value in metrics.items():
            if value is None:
                pretty = "N/A"
            elif isinstance(value, float):
                pretty = f"{value:.4f}"
            else:
                pretty = str(value)
            f.write(f"{key}: {pretty}\n")

        f.write("\nMetric sufficiency note\n")
        f.write("----------------------\n")
        f.write("- Enough for internal evaluation: yes.\n")
        f.write("- Enough for accuracy claims: no.\n")
        f.write("- To measure real accuracy, add ground-truth distances and report MAE/RMSE/MAPE against labels.\n")

    print(f" Saved -> {csv_path}")
    print(f" Saved -> {metrics_path}")
    print(f" Saved -> {report_path}")
|
| 527 |
+
|
| 528 |
+
|
| 529 |
+
# ───────────────────────────────────────────────────────────
|
| 530 |
+
# 5. DRAW ANNOTATED IMAGE
|
| 531 |
+
# ───────────────────────────────────────────────────────────
|
| 532 |
+
|
| 533 |
+
def draw_detections(
    img: np.ndarray,
    detections: list[dict]
) -> np.ndarray:
    """
    Return a copy of ``img`` with every detection drawn on it.

    Each detection gets a rectangle and a filled caption box placed above the
    rectangle's top-left corner (clamped to the image origin).  The caption is
    "<class>: X.X m (conf%)", or "<class> (conf%)" when the detection has no
    distance.  Classes are assigned palette colours in order of first
    appearance, so a class keeps one colour within a call.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    pad = 5
    canvas = img.copy()
    palette_slot = {}  # class name -> index into _PALETTE

    for det in detections:
        label, dist, conf = det["label"], det["distance"], det["conf"]
        x1, y1, x2, y2 = det["x1"], det["y1"], det["x2"], det["y2"]

        # First-seen classes take the next palette entry, wrapping around.
        slot = palette_slot.setdefault(label, len(palette_slot) % len(_PALETTE))
        colour = _PALETTE[slot]

        # Bounding box; line width grows with the box perimeter.
        thickness = max(2, int((x2 - x1 + y2 - y1) / 200))
        cv2.rectangle(canvas, (x1, y1), (x2, y2), colour, thickness)

        if dist is None:
            text = f"{label} ({conf:.0%})"
        else:
            text = f"{label}: {dist:.1f} m ({conf:.0%})"

        # Font scale follows the box height, limited to [0.45, 0.9].
        box_h = max(1, y2 - y1)
        font_scale = max(0.45, min(0.9, box_h / 180))
        font_thick = max(1, int(font_scale * 2))
        (tw, th), baseline = cv2.getTextSize(text, font, font_scale, font_thick)

        # Filled caption background.
        pill_h = th + baseline + pad * 2
        tx = max(0, x1)
        ty_box = max(0, y1 - pill_h)
        cv2.rectangle(canvas,
                      (tx, ty_box),
                      (tx + tw + pad * 2, ty_box + pill_h),
                      colour, -1)

        # Black text on bright fills, white on dark ones; the palette entry
        # is read as (B, G, R).
        lum = 0.299 * colour[2] + 0.587 * colour[1] + 0.114 * colour[0]
        if lum > 128:
            txt_color = (0, 0, 0)
        else:
            txt_color = (255, 255, 255)
        cv2.putText(canvas, text,
                    (tx + pad, ty_box + th + pad),
                    font, font_scale,
                    txt_color, font_thick, cv2.LINE_AA)

    return canvas
|
| 592 |
+
|
| 593 |
+
|
| 594 |
+
# ───────────────────────────────────────────────────────────
|
| 595 |
+
# 6. VISUALISATION (combined figure)
|
| 596 |
+
# ───────────────────────────────────────────────────────────
|
| 597 |
+
|
| 598 |
+
def visualise_results(
    img: np.ndarray,
    depth_map: np.ndarray,
    detections: list[dict],
    annotated: np.ndarray,
    out_path: str
) -> None:
    """
    Render and save a three-panel summary figure.

    Panels:
      1. The input image with the raw detection boxes (label + confidence).
      2. The depth heatmap with the boxes and their final distance overlaid.
      3. The final annotated image.
    When there is at least one detection, a table of per-detection estimates
    (nearest first, unresolved last) is added below the panels.

    Args:
        img: BGR image the detections were computed on.
        depth_map: depth map passed to ``depth_to_heatmap``.
        detections: detection dicts carrying box corners, ``label``, ``conf``,
            ``distance``, ``method`` and optionally ``dist_pinhole`` /
            ``dist_midas``.
        annotated: BGR image for the third panel.
        out_path: destination file; its parent directory is created if needed.
    """
    def _nearest_first(det):
        # Resolved detections by distance, unresolved ones last (no sentinel).
        dist = det.get("distance")
        return (dist is None, 0.0 if dist is None else dist)

    fig, axes = plt.subplots(1, 3, figsize=(19, 7), dpi=130)
    try:
        fig.patch.set_facecolor("#1a1a2e")

        # -- Panel 1: raw YOLO detections --
        raw_boxes = img.copy()
        for det in detections:
            x1, y1, x2, y2 = det["x1"], det["y1"], det["x2"], det["y2"]
            cv2.rectangle(raw_boxes, (x1, y1), (x2, y2), (0, 255, 120), 2)
            cv2.putText(raw_boxes, f"{det['label']} {det['conf']:.0%}",
                        (x1, max(0, y1 - 6)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.55, (0, 255, 120), 2, cv2.LINE_AA)

        axes[0].imshow(cv2.cvtColor(raw_boxes, cv2.COLOR_BGR2RGB))
        axes[0].set_title("YOLO Detections", color="white", fontsize=11,
                          fontweight="bold", pad=10)
        axes[0].axis("off")

        # -- Panel 2: depth heatmap + boxes --
        # NOTE(review): depth_to_heatmap is assumed to return a BGR image,
        # since its output is converted with COLOR_BGR2RGB below - confirm.
        depth_over = depth_to_heatmap(depth_map).copy()
        for det in detections:
            x1, y1, x2, y2 = det["x1"], det["y1"], det["x2"], det["y2"]
            cv2.rectangle(depth_over, (x1, y1), (x2, y2), (255, 255, 255), 2)
            # Test against None so a legitimate 0.0 is still printed; this
            # matches the "N/A" handling used for the table.
            dist_txt = (f"{det['distance']:.1f}m"
                        if det["distance"] is not None else "?")
            cv2.putText(depth_over, dist_txt,
                        (x1 + 3, y1 + 18),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.55, (255, 255, 255), 2, cv2.LINE_AA)

        axes[1].imshow(cv2.cvtColor(depth_over, cv2.COLOR_BGR2RGB))
        # Stand-alone colour bar: it is not tied to the drawn image data.
        sm = plt.cm.ScalarMappable(cmap="turbo", norm=plt.Normalize(0, 1))
        sm.set_array([])
        cb = fig.colorbar(sm, ax=axes[1], fraction=0.035, pad=0.02)
        cb.set_label("Near β Far", color="white", fontsize=8)
        cb.set_ticks([0, 0.5, 1])
        cb.set_ticklabels(["Far", "Mid", "Near"], color="white", fontsize=8)
        cb.ax.yaxis.set_tick_params(color="white")
        axes[1].set_title("MiDaS Depth + Distance Estimates",
                          color="white", fontsize=11, fontweight="bold", pad=10)
        axes[1].axis("off")

        # -- Panel 3: final annotated image --
        axes[2].imshow(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))
        axes[2].set_title("Object Distances (pinhole + MiDaS blend)",
                          color="white", fontsize=11, fontweight="bold", pad=10)
        axes[2].axis("off")

        # -- Distance table below the panels --
        rows = []
        for det in sorted(detections, key=_nearest_first):
            dist_str = (f"{det['distance']:.2f} m"
                        if det["distance"] is not None else "N/A")
            ph_str = (f"{det['dist_pinhole']:.2f} m"
                      if det.get("dist_pinhole") is not None else "β")
            md_str = (f"{det['dist_midas']:.2f} m"
                      if det.get("dist_midas") is not None else "β")
            rows.append([det["label"], f"{det['conf']:.0%}",
                         ph_str, md_str, dist_str, det["method"]])

        if rows:
            # Extra axes placed below the panels (negative y in figure
            # coordinates); bbox_inches="tight" keeps it in the saved file.
            table_ax = fig.add_axes([0.05, -0.14, 0.90, 0.14])
            table_ax.axis("off")
            table_ax.set_facecolor("#1a1a2e")
            col_labels = ["Object", "Confidence",
                          "Pinhole est.", "MiDaS est.", "Final distance", "Method"]
            tbl = table_ax.table(
                cellText=rows,
                colLabels=col_labels,
                cellLoc="center", loc="center"
            )
            tbl.auto_set_font_size(False)
            tbl.set_fontsize(8.5)
            tbl.scale(1, 1.55)
            # Header row styling.
            for j in range(len(col_labels)):
                tbl[(0, j)].set_facecolor("#2e4057")
                tbl[(0, j)].set_text_props(color="white", fontweight="bold")
            # Alternating shading for the data rows.
            for i in range(1, len(rows) + 1):
                bg = "#1e2d40" if i % 2 == 0 else "#16213e"
                for j in range(len(col_labels)):
                    tbl[(i, j)].set_facecolor(bg)
                    tbl[(i, j)].set_text_props(color="#dde")

        fig.suptitle(
            "Subtask 2 β Object Detection & Distance Estimation\n"
            "Distance = pinhole camera model + MiDaS depth scaling",
            color="white", fontsize=13, fontweight="bold", y=1.02
        )
        fig.tight_layout()

        os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)
        fig.savefig(out_path, dpi=130, bbox_inches="tight",
                    facecolor=fig.get_facecolor())
    finally:
        # Release the figure even if drawing or saving failed.
        plt.close(fig)
    print(f"Saved -> {out_path}")
|
| 708 |
+
|
| 709 |
+
|
| 710 |
+
# ───────────────────────────────────────────────────────────
|
| 711 |
+
# 7. MAIN
|
| 712 |
+
# ───────────────────────────────────────────────────────────
|
| 713 |
+
|
| 714 |
+
def main() -> None:
    """
    Command-line entry point: detect objects in one image and estimate their
    distances.

    Usage: python object_distance.py <image_path> [output_dir] [focal_px]

    ``output_dir`` defaults to "output"; images are written to its "images"
    sub-directory and evaluation artifacts to "evaluation".  When ``focal_px``
    is omitted the focal length is estimated from the image width for a 60
    degree field of view.  The process exits with a message when arguments
    are missing or ``focal_px`` is not a positive finite number, and with
    status 0 when no object is detected.
    """
    if len(sys.argv) < 2:
        sys.exit(
            "Usage: python object_distance.py <image_path> [output_dir] [focal_px]\n"
            "Example: python object_distance.py street.jpg output/ 800"
        )

    image_path = sys.argv[1]
    out_dir = sys.argv[2] if len(sys.argv) > 2 else "output"
    focal_length = None
    if len(sys.argv) > 3:
        # Reject bad input up front with a readable message instead of a
        # traceback (non-numeric) or nonsensical distances (<= 0, inf, nan).
        try:
            focal_length = float(sys.argv[3])
        except ValueError:
            sys.exit(f"Invalid focal_px {sys.argv[3]!r}: expected a number of pixels.")
        if not 0 < focal_length < float("inf"):
            sys.exit(f"Invalid focal_px {sys.argv[3]!r}: must be a positive finite number.")
    image_dir = os.path.join(out_dir, "images")
    eval_dir = os.path.join(out_dir, "evaluation")

    # -- Load image --
    img = load_image(image_path)

    if focal_length is None:
        focal_length = estimate_focal_length(img.shape[1], fov_deg=60.0)
        print(f"Focal length estimated: {focal_length:.1f} px "
              f"(assuming 60Β° horizontal FOV β override via 3rd argument)")
    else:
        print(f"Focal length (user-supplied): {focal_length:.1f} px")

    # -- MiDaS depth --
    print("\n[ MiDaS ] Loading MiDaS_small ...")
    midas_model, midas_transform, device = load_midas("MiDaS_small")
    print("[ MiDaS ] Running inference ...")
    depth_map = midas_depth(img, midas_model, midas_transform, device)
    print(f" Done. depth in [0,1] mean={depth_map.mean():.3f}")

    # -- YOLO detection --
    print("\n[ YOLO ] Loading YOLOv5s ...")
    yolo_model = load_yolo("yolov5s")
    print("[ YOLO ] Running detection ...")
    detections = run_yolo(yolo_model, img)

    if not detections:
        print("WARNING: No objects detected. "
              "Try a lower confidence threshold or a different image.")
        sys.exit(0)

    # -- Distance estimation --
    print("\n[ Dist ] Estimating distances ...")
    detections, eval_context = estimate_distances(detections, depth_map, focal_length)
    metrics = compute_evaluation_metrics(detections, focal_length, eval_context)

    def _nearest_first(det):
        # Resolved detections by distance, unresolved ones last (no sentinel).
        dist = det.get("distance")
        return (dist is None, 0.0 if dist is None else dist)

    def _metres(value):
        # Test against None so a legitimate 0.0 is printed, not the placeholder.
        return f"{value:.1f} m" if value is not None else " β"

    # Print summary table
    print(f"\n {'Object':<18} {'Conf':>5} {'Pinhole':>10} "
          f"{'MiDaS':>10} {'Final':>10} Method")
    print(" " + "-" * 70)
    for det in sorted(detections, key=_nearest_first):
        dp = _metres(det.get("dist_pinhole"))
        dm = _metres(det.get("dist_midas"))
        df = _metres(det.get("distance"))
        print(f" {det['label']:<18} {det['conf']:>4.0%} "
              f"{dp:>10} {dm:>10} {df:>10} {det['method']}")

    # -- Draw and save --
    print("\n[ Draw ] Annotating image ...")
    annotated = draw_detections(img, detections)

    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    annotated_path = os.path.join(image_dir, "detections_with_distance.png")
    cv2.imwrite(annotated_path, annotated)
    cv2.imwrite(os.path.join(image_dir, "midas_depth.png"),
                depth_to_heatmap(depth_map))
    print(f" Saved -> {annotated_path}")

    print("\n[ Fig ] Compositing combined figure ...")
    visualise_results(
        img, depth_map, detections, annotated,
        out_path=os.path.join(image_dir, "object_distance_subtask2.png")
    )

    print("\n[ Eval ] Writing evaluation artifacts ...")
    save_evaluation_outputs(detections, metrics, eval_dir)

    print(f"\nDone. Image outputs: {image_dir}/")
    print(f"Done. Evaluation outputs: {eval_dir}/")
|
| 796 |
+
|
| 797 |
+
|
| 798 |
+
# Run the pipeline only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|