1javid commited on
Commit
e77df61
·
verified ·
1 Parent(s): f21524b

Upload 2 files

Browse files
Files changed (2) hide show
  1. depth_estimation.py +409 -0
  2. object_distance.py +799 -0
depth_estimation.py ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Subtask 1 – Depth Estimation
3
+ 1. Classical method : SGBM Stereo Matching on a synthesised stereo pair
4
+ 2. ML-based method : Actual MiDaS (MiDaS_small) via torch.hub
5
+ 3. Both rendered as heatmaps (hot colours = close, cold colours = far)
6
+
7
+ Usage:
8
+ python depth_estimation.py <image_path> [output_dir]
9
+
10
+ Example:
11
+ python depth_estimation.py street.jpg output/
12
+ """
13
+
14
+ import sys
15
+ import os
16
+
17
+ import cv2
18
+ import numpy as np
19
+ import matplotlib
20
+ matplotlib.use("Agg")
21
+ import matplotlib.pyplot as plt
22
+ from scipy.ndimage import gaussian_filter
23
+ import torch
24
+
25
+
26
+ # ═══════════════════════════════════════════════════════════
27
+ # 0. LOAD IMAGE (real image required)
28
+ # ═══════════════════════════════════════════════════════════
29
+
30
def load_image(path: str) -> np.ndarray:
    """
    Read the image at `path` with cv2.imread, terminating the process on failure.

    Args:
        path: filesystem path of the image to load.

    Returns:
        The array returned by cv2.imread.

    Exits (via sys.exit) when `path` is empty / does not exist, or when
    OpenCV cannot decode the file.
    """
    path_is_usable = bool(path) and os.path.exists(path)
    if not path_is_usable:
        message = (
            f"ERROR: Image not found: '{path}'\n"
            "Usage: python depth_estimation.py <image_path>\n"
            "Example: python depth_estimation.py street.jpg"
        )
        sys.exit(message)

    image = cv2.imread(path)
    if image is None:
        # The path exists but OpenCV could not decode it.
        sys.exit(f"ERROR: Could not read image: '{path}'")

    height, width, channels = image.shape[0], image.shape[1], image.shape[2]
    print(f"Loaded: {path} {width}x{height} ({channels} channels)")
    return image
42
+
43
+
44
+ # ═══════════════════════════════════════════════════════════
45
+ # 1. CLASSICAL METHOD – SGBM STEREO MATCHING
46
+ # ═══════════════════════════════════════════════════════════
47
+
48
def synthesise_stereo_pair(
    img: np.ndarray,
    baseline_shift_pct: float = 0.03
) -> tuple:
    """
    Fabricate a (left, right) stereo pair from a single BGR image.

    A per-pixel "closeness" seed is built from two monocular cues and used
    to shift pixels horizontally, producing a fake right-hand view:
        - local sharpness (smoothed |Laplacian|): sharper is treated as closer
        - vertical position: rows lower in the frame are treated as closer

    Args:
        img: BGR image (converted to grayscale internally).
        baseline_shift_pct: maximum horizontal shift as a fraction of width.

    Returns:
        (left, right, max_shift) where `left` is a copy of `img`, `right`
        is the warped view and `max_shift` is the largest shift in pixels.
    """
    rows, cols = img.shape[:2]
    gray_f32 = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype(np.float32)

    # Cue 1: smoothed Laplacian magnitude, scaled so the maximum is ~1.
    edge_energy = gaussian_filter(np.abs(cv2.Laplacian(gray_f32, cv2.CV_32F)), sigma=5)
    edge_energy = edge_energy / (edge_energy.max() + 1e-6)

    # Cue 2: 0 at the top row, 1 at the bottom row (broadcast across columns).
    row_prior = np.linspace(0, 1, rows)[:, None]

    # Equal-weight mix, heavy smoothing, then min-max normalisation.
    seed = 0.5 * edge_energy + 0.5 * row_prior
    seed = gaussian_filter(seed.astype(np.float32), sigma=10)
    seed_lo, seed_hi = seed.min(), seed.max()
    seed = (seed - seed_lo) / (seed_hi - seed_lo + 1e-6)

    max_shift = int(cols * baseline_shift_pct)
    shift_px = (seed * max_shift).astype(np.float32)

    # Sampling grid: each output pixel reads from `shift_px` columns to its left.
    grid_x, grid_y = np.meshgrid(
        np.arange(cols, dtype=np.float32),
        np.arange(rows, dtype=np.float32),
    )
    right_view = cv2.remap(
        img, grid_x - shift_px, grid_y, cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_REPLICATE,
    )
    return img.copy(), right_view, max_shift
88
+
89
+
90
def sgbm_depth(
    img: np.ndarray,
    baseline_shift_pct: float = 0.03,
    block_size: int = 7,
    uniqueness_ratio: int = 10,
    speckle_window_size: int = 100,
    speckle_range: int = 2
) -> tuple:
    """
    Estimate a closeness map with OpenCV's Semi-Global Block Matching
    (after Hirschmüller 2008) on a synthesised stereo pair.

    SGBM aggregates a per-pixel matching cost along several 1-D paths and
    adds smoothness penalties P1/P2 for small/large disparity changes.
    Per the OpenCV documentation, its implementation uses a
    Birchfield-Tomasi cost (not mutual information / census), and only
    MODE_HH runs the full 8-direction variant; MODE_SGBM_3WAY, used here,
    is a faster reduced-path variant.

    Args:
        img: BGR image.
        baseline_shift_pct: forwarded to synthesise_stereo_pair.
        block_size: matching window size; forced to an odd value >= 3.
        uniqueness_ratio: forwarded to StereoSGBM (uniquenessRatio).
        speckle_window_size: forwarded to StereoSGBM (speckleWindowSize).
        speckle_range: forwarded to StereoSGBM (speckleRange).

    Returns:
        depth_norm – normalised closeness map [0, 1], 1 = close
        left_img   – left view of stereo pair
        right_img  – right view of stereo pair
    """
    left_img, right_img, max_shift = synthesise_stereo_pair(
        img, baseline_shift_pct=baseline_shift_pct
    )

    left_g = cv2.cvtColor(left_img, cv2.COLOR_BGR2GRAY)
    right_g = cv2.cvtColor(right_img, cv2.COLOR_BGR2GRAY)

    num_disp = max(16, ((max_shift // 16) + 1) * 16)  # must be multiple of 16
    block = max(3, int(block_size))
    if block % 2 == 0:
        block += 1

    # NOTE(review): OpenCV suggests P1/P2 = 8/32 * channels * blockSize^2.
    # The factor 3 is kept from the original tuning although the matcher is
    # fed single-channel images — confirm whether 1 was intended.
    matcher = cv2.StereoSGBM_create(
        minDisparity=0,
        numDisparities=num_disp,
        blockSize=block,
        P1=8 * 3 * block ** 2,    # small-discontinuity penalty
        P2=32 * 3 * block ** 2,   # large-discontinuity penalty
        disp12MaxDiff=5,
        uniquenessRatio=uniqueness_ratio,
        speckleWindowSize=speckle_window_size,
        speckleRange=speckle_range,
        mode=cv2.STEREO_SGBM_MODE_SGBM_3WAY
    )

    # compute() returns fixed-point disparities scaled by 16; negative values
    # mark invalid matches and are clamped to 0 (treated as "far").
    disp = matcher.compute(left_g, right_g).astype(np.float32) / 16.0
    disp = np.maximum(disp, 0)

    # Bilateral smoothing of the disparity map.
    disp = cv2.bilateralFilter(disp, d=9, sigmaColor=75, sigmaSpace=75)

    # Normalise to [0, 1]: high disparity = close = 1
    d = (disp - disp.min()) / (disp.max() - disp.min() + 1e-6)

    # Guided-filter refinement against the grayscale left view.
    # cv2.ximgproc ships only with the opencv-contrib builds, so skip the
    # refinement instead of raising AttributeError on a plain OpenCV install.
    ximgproc = getattr(cv2, "ximgproc", None)
    if ximgproc is not None and hasattr(ximgproc, "guidedFilter"):
        d_8u = (d * 255).clip(0, 255).astype(np.uint8)
        d = ximgproc.guidedFilter(
            guide=left_g, src=d_8u, radius=8, eps=200, dDepth=cv2.CV_32F)
        d = np.clip(d / (d.max() + 1e-6), 0, 1)
    else:
        print(" [ Classical ] cv2.ximgproc unavailable "
              "(requires opencv-contrib-python); skipping guided-filter refinement")

    return d, left_img, right_img
152
+
153
+
154
+ # ═══════════════════════════════════════════════════════════
155
+ # 2. ML-BASED METHOD – Actual MiDaS (MiDaS_small)
156
+ # ═══════════════════════════════════════════════════════════
157
+
158
def load_midas(model_type: str = "MiDaS_small"):
    """
    Load a MiDaS model and its matching input transform from torch.hub
    (repository "intel-isl/MiDaS").

    Commonly used model_type values (larger/slower first):
        "DPT_Large"   – DPT with a ViT-Large backbone
        "DPT_Hybrid"  – DPT with a hybrid ViT + ResNet-50 backbone
        "MiDaS"       – MiDaS v2.1 large (ResNeXt-101 backbone)
        "MiDaS_small" – MiDaS v2.1 small (EfficientNet-Lite backbone) <- default

    Transform selection:
        "MiDaS_small" -> small_transform
        "MiDaS"       -> default_transform
        anything else -> dpt_transform (unchanged from the previous behaviour)

    The v2.1 models need their own transform: dpt_transform normalises
    inputs differently, so using it for "MiDaS" degrades the prediction.

    torch.hub caches the downloaded repository and weights locally after
    the first call.

    Args:
        model_type: torch.hub entry-point name of the model.

    Returns:
        (model, transform, device) – the model in eval mode on `device`,
        the preprocessing callable, and the selected torch.device.
    """
    print(f"[ MiDaS ] Loading model '{model_type}' from torch.hub ...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f" Device: {device}")

    model = torch.hub.load("intel-isl/MiDaS", model_type, trust_repo=True)
    model.to(device).eval()

    transforms = torch.hub.load("intel-isl/MiDaS", "transforms", trust_repo=True)
    if model_type == "MiDaS_small":
        transform = transforms.small_transform
    elif model_type == "MiDaS":
        # v2.1 large: previously fell through to dpt_transform by mistake.
        transform = transforms.default_transform
    else:
        transform = transforms.dpt_transform

    n_params = sum(p.numel() for p in model.parameters())
    print(f" Model loaded ({n_params:,} parameters)")
    return model, transform, device
185
+
186
+
187
def midas_depth(
    img: np.ndarray,
    model,
    transform,
    device: torch.device
) -> np.ndarray:
    """
    Run a MiDaS model on a BGR image and return a normalised closeness map.

    Steps performed here:
        1. BGR -> RGB conversion
        2. `transform` (the MiDaS preprocessing callable) builds the input batch
        3. forward pass under torch.no_grad()
        4. bilinear resize of the prediction to the input resolution
        5. min-max normalisation to [0, 1]

    The rest of this file treats larger output values as closer surfaces
    (MiDaS predicts an inverse-depth-like quantity).

    Args:
        img: BGR image.
        model: MiDaS network, already on `device`.
        transform: preprocessing callable paired with `model`.
        device: device the input batch is moved to.

    Returns:
        float32 closeness map at the resolution of `img`.
    """
    rows, cols = img.shape[:2]
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    batch = transform(rgb).to(device)

    with torch.no_grad():
        raw = model(batch)
        # interpolate() wants an explicit channel axis; squeeze() drops the
        # singleton axes again afterwards.
        resized = torch.nn.functional.interpolate(
            raw.unsqueeze(1),
            size=(rows, cols),
            mode="bilinear",
            align_corners=False,
        )
        prediction = resized.squeeze()

    relative = prediction.cpu().numpy()

    # Min-max normalise; the epsilon guards against a constant prediction.
    lowest, highest = relative.min(), relative.max()
    relative = (relative - lowest) / (highest - lowest + 1e-6)
    return relative.astype(np.float32)
233
+
234
+
235
+ # ═══════════════════════════════════════════════════════════
236
+ # 3. VISUALISATION
237
+ # ═══════════════════════════════════════════════════════════
238
+
239
def depth_to_heatmap(depth: np.ndarray) -> np.ndarray:
    """
    Colour a [0, 1] closeness map (1 = close) with the "turbo" colormap.

    Returns an 8-bit BGR image suitable for cv2.imwrite.
    """
    turbo = plt.get_cmap("turbo")
    rgba = turbo(depth)                       # colormap output includes alpha
    rgb_u8 = (rgba[:, :, :3] * 255).astype(np.uint8)
    return cv2.cvtColor(rgb_u8, cv2.COLOR_RGB2BGR)
244
+
245
+
246
def visualise_results(
    img: np.ndarray,
    depth_cl: np.ndarray,
    depth_ml: np.ndarray,
    out_path: str = "output/depth_estimation_subtask1.png"
) -> None:
    """
    Render and save the comparison figure for the two closeness maps.

    Layout (2 rows x 3 columns):
        Top row    – original image | SGBM heatmap | MiDaS heatmap.
                     Heatmaps are blended over the image and carry three
                     horizontal scan lines (25 %, 50 %, 75 % of the height).
        Bottom row – both methods along the middle scan line | SGBM
                     scan-line profiles | MiDaS scan-line profiles.

    Args:
        img: BGR image.
        depth_cl: classical closeness map in [0, 1], 1 = near, indexed [row, col].
        depth_ml: MiDaS closeness map in [0, 1], 1 = near, indexed [row, col].
        out_path: file the figure is written to; parent folders are created.
    """
    h, w = img.shape[:2]
    ncols = 3

    # Converted once: the same RGB view is shown in column 1 and blended
    # underneath both heatmaps.
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    turbo = plt.get_cmap("turbo")

    fig = plt.figure(figsize=(ncols * 5.6, 11), dpi=130)
    fig.patch.set_facecolor("#1a1a2e")

    titles = [
        "Original Image",
        "Classical Depth\n(SGBM Stereo Matching)",
        "ML-Based Depth\n(MiDaS_small — actual model)",
    ]
    depths = [None, depth_cl, depth_ml]

    ax_top = [fig.add_subplot(2, ncols, c + 1) for c in range(ncols)]
    ax_bot = [fig.add_subplot(2, ncols, ncols + c + 1) for c in range(ncols)]

    # ── Top row: images / heatmaps ──
    for ax, title, d in zip(ax_top, titles, depths):
        ax.set_title(title, color="white", fontsize=10, fontweight="bold", pad=8)
        ax.axis("off")
        if d is None:
            ax.imshow(rgb)
            continue

        cmap_arr = turbo(d)[:, :, :3]
        blended = rgb.astype(np.float32) / 255 * 0.22 + cmap_arr * 0.78
        ax.imshow(blended)

        sm = plt.cm.ScalarMappable(cmap="turbo",
                                   norm=plt.Normalize(vmin=0, vmax=1))
        sm.set_array([])
        cb = fig.colorbar(sm, ax=ax, fraction=0.03, pad=0.02)
        # Label follows the tick order (0 = Far ... 1 = Near); the previous
        # "Near -> Far" text contradicted the ticks.
        cb.set_label("Far -> Near", color="white", fontsize=7)
        cb.set_ticks([0, 0.5, 1])
        cb.set_ticklabels(["Far", "Mid", "Near"], color="white", fontsize=7)
        cb.ax.yaxis.set_tick_params(color="white")

    # ── Scan lines on heatmap panels ──
    scan_ys = [int(h * f) for f in [0.25, 0.50, 0.75]]
    scan_colors = ["#ff6b6b", "#ffd93d", "#6bcb77"]
    for ax in ax_top[1:]:
        for sy, sc in zip(scan_ys, scan_colors):
            ax.axhline(sy, color=sc, linewidth=1.2, alpha=0.75)

    # ── Bottom row: depth profile plots ──
    x = np.arange(w)
    method_maps = [depth_cl, depth_ml]
    method_names = ["Classical (SGBM)", "MiDaS (actual)"]
    ls = ["-", "--"]

    for col, ax in enumerate(ax_bot):
        ax.set_facecolor("#16213e")
        for sp in ["top", "right"]:
            ax.spines[sp].set_visible(False)
        for sp in ["bottom", "left"]:
            ax.spines[sp].set_color("#555")
        ax.tick_params(colors="#888", labelsize=7)
        ax.set_xlim(0, w - 1)
        ax.set_ylim(-0.05, 1.05)
        ax.set_xlabel("Pixel x", color="#aaa", fontsize=8)
        ax.set_ylabel("Closeness (1 = near)", color="#aaa", fontsize=8)

        if col == 0:
            # Compare both methods at the middle scan line
            ax.set_title("Method comparison — middle scan line",
                         color="white", fontsize=9, pad=6)
            sy = scan_ys[1]
            for mp, nm, l in zip(method_maps, method_names, ls):
                ax.plot(x, mp[sy, :], linestyle=l, linewidth=1.6, label=nm)
            ax.legend(fontsize=8, framealpha=0.25, labelcolor="white")
        else:
            # Per-method: three scan lines
            mp = method_maps[col - 1]
            nm = method_names[col - 1]
            ax.set_title(f"{nm} — scan-line profiles",
                         color="white", fontsize=9, pad=6)
            for sy, sc in zip(scan_ys, scan_colors):
                ax.plot(x, mp[sy, :], color=sc, linewidth=1.4,
                        label=f"y = {sy}")
            ax.legend(fontsize=7, framealpha=0.25, labelcolor="white")

    # ── Colour scale strip ──
    ax_s = fig.add_axes([0.05, 0.01, 0.90, 0.022])
    ax_s.imshow(np.linspace(0, 1, 512).reshape(1, -1),
                aspect="auto", cmap="turbo")
    ax_s.set_yticks([])
    ax_s.set_xticks([0, 170, 341, 511])
    ax_s.set_xticklabels(
        ["Far (cold / blue)", "Mid-far", "Mid-close", "Close (hot / red)"],
        color="white", fontsize=8
    )

    fig.suptitle(
        "Subtask 1 — Classical (SGBM) vs ML-Based (MiDaS) Depth Estimation\n"
        "Heatmap: red/hot = close blue/cold = far",
        color="white", fontsize=13, fontweight="bold", y=1.003
    )
    # Leave the bottom 5 % of the canvas free for the colour scale strip.
    fig.tight_layout(rect=[0, 0.05, 1, 1])

    os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)
    fig.savefig(out_path, dpi=130, bbox_inches="tight",
                facecolor=fig.get_facecolor())
    plt.close(fig)
    print(f"Saved -> {out_path}")
360
+
361
+
362
+ # ═══════════════════════════════════════════════════════════
363
+ # 4. MAIN
364
+ # ═══════════════════════════════════════════════════════════
365
+
366
def main() -> None:
    """
    Command-line entry point.

    Reads <image_path> and an optional [output_dir] (default "output") from
    sys.argv, runs the SGBM and MiDaS_small estimators on the image and
    writes the heatmaps, the synthetic stereo pair and the comparison
    figure into the output directory.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        sys.exit(
            "Usage: python depth_estimation.py <image_path> [output_dir]\n"
            "Example: python depth_estimation.py street.jpg output/"
        )

    image_path = cli_args[0]
    out_dir = cli_args[1] if len(cli_args) > 1 else "output"

    # ── Load image ──
    img = load_image(image_path)

    # ── Classical: SGBM ──
    print("\n[ Classical ] Running SGBM stereo matching ...")
    depth_cl, left_img, right_img = sgbm_depth(img)
    print(f" Done. depth in [0,1] mean={depth_cl.mean():.3f}")

    # ── ML: actual MiDaS ──
    print("\n[ MiDaS ] Loading and running MiDaS_small ...")
    midas_model, midas_transform, device = load_midas("MiDaS_small")
    depth_ml = midas_depth(img, midas_model, midas_transform, device)
    print(f" Done. depth in [0,1] mean={depth_ml.mean():.3f}")

    # ── Save outputs ──
    os.makedirs(out_dir, exist_ok=True)
    artefacts = (
        ("classical_heatmap.png", depth_to_heatmap(depth_cl)),
        ("midas_heatmap.png", depth_to_heatmap(depth_ml)),
        ("stereo_left.png", left_img),
        ("stereo_right.png", right_img),
    )
    for filename, image in artefacts:
        cv2.imwrite(os.path.join(out_dir, filename), image)

    print("\n[ Visualise ] Compositing final figure ...")
    figure_path = os.path.join(out_dir, "depth_estimation_subtask1.png")
    visualise_results(img, depth_cl, depth_ml, out_path=figure_path)

    print(f"\nDone. Outputs written to: {out_dir}/")


if __name__ == "__main__":
    main()
object_distance.py ADDED
@@ -0,0 +1,799 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Subtask 2 – Object Detection + Distance Estimation
3
+ 1. Detect objects with YOLOv5s (torch.hub)
4
+ 2. Estimate metric distance (metres) per object using two complementary strategies:
5
+ A) Pinhole camera model – uses known real-world object heights
6
+ B) MiDaS depth scaling – calibrates MiDaS relative depth with pinhole anchors,
7
+ then applies the calibrated scale to all objects
8
+ 3. Draw labelled bounding boxes on the image ("person: 5.2 m")
9
+ 4. Produce a combined figure: original detections | MiDaS depth | annotated result
10
+
11
+ Usage:
12
+ python object_distance.py <image_path> [output_dir] [focal_length_px]
13
+
14
+ Examples:
15
+ python object_distance.py street.jpg
16
+ python object_distance.py street.jpg output/ 800
17
+ """
18
+
19
+ import sys
20
+ import os
21
+ import math
22
+ import csv
23
+ import json
24
+ from typing import Optional, Tuple, List
25
+
26
+ import cv2
27
+ import numpy as np
28
+ import matplotlib
29
+ matplotlib.use("Agg")
30
+ import matplotlib.pyplot as plt
31
+ import torch
32
+
33
+ # ── re-use MiDaS loader from Subtask 1 ──────────────────────
34
+ sys.path.insert(0, os.path.dirname(__file__))
35
+ from depth_estimation import load_image, load_midas, midas_depth, depth_to_heatmap
36
+
37
+
38
+ # ═══════════════════════════════════════════════════════════
39
+ # 1. KNOWN OBJECT HEIGHTS (metres)
40
+ # Used by the pinhole camera model.
41
+ # Values are representative averages for the COCO classes
42
+ # that appear most often in street / indoor scenes.
43
+ # ═══════════════════════════════════════════════════════════
44
+
45
# Reference heights in metres, keyed by detector class label.
# Looked up by label when converting a bounding-box height to a distance.
KNOWN_HEIGHTS: dict[str, float] = {
    # --- people & animals ---
    "person": 1.70,
    "cat": 0.30,
    "dog": 0.50,
    "horse": 1.60,
    "cow": 1.40,
    "sheep": 0.90,
    "elephant": 3.00,
    "bear": 1.20,
    "zebra": 1.40,
    "giraffe": 4.50,
    # --- vehicles ---
    "bicycle": 1.00,
    "car": 1.50,
    "motorcycle": 1.10,
    "airplane": 4.00,
    "bus": 3.20,
    "train": 4.00,
    "truck": 3.50,
    "boat": 1.50,
    # --- street furniture ---
    "traffic light": 0.90,
    "fire hydrant": 0.60,
    "stop sign": 0.75,
    "parking meter": 1.20,
    "bench": 0.90,
    # --- indoor objects ---
    "chair": 0.90,
    "couch": 0.85,
    "bed": 0.55,
    "dining table": 0.75,
    "toilet": 0.40,
    "tv": 0.65,
    "laptop": 0.30,
    "microwave": 0.35,
    "oven": 0.90,
    "refrigerator": 1.80,
    "sink": 0.20,
    "door": 2.10,
    # --- handheld / small items ---
    "bottle": 0.25,
    "cup": 0.12,
    "backpack": 0.50,
    "umbrella": 1.00,
    "handbag": 0.30,
    "suitcase": 0.65,
    "sports ball": 0.22,
    "baseball bat": 1.05,
    "skateboard": 0.15,
    "surfboard": 1.80,
    "tennis racket": 0.68,
    "book": 0.22,
    "clock": 0.30,
    "vase": 0.30,
    "scissors": 0.18,
}

# Drawing colours as (B, G, R) triples; intended to be cycled when there are
# more classes than entries.
_PALETTE = [
    (0, 200, 255),    # yellow
    (0, 255, 100),    # green
    (255, 80, 80),    # blue
    (180, 0, 255),    # magenta
    (0, 160, 255),    # orange
    (255, 200, 0),    # cyan
    (100, 255, 200),  # lime
    (255, 50, 180),   # pink
]
114
+
115
+
116
+ # ═══════════════════════════════════════════════════════════
117
+ # 2. FOCAL LENGTH ESTIMATION
118
+ # ═══════════════════════════════════════════════════════════
119
+
120
def estimate_focal_length(image_width: int, fov_deg: float = 60.0) -> float:
    """
    Estimate the focal length in pixels from a known (or assumed) horizontal FOV.

        f = (image_width / 2) / tan(FOV / 2)

    60° is only an assumed default; when the real focal length is known,
    supply it through the script's optional `focal_length_px` command-line
    argument rather than relying on this estimate.

    Args:
        image_width: image width in pixels; must be positive.
        fov_deg: horizontal field of view in degrees; must lie strictly
            between 0 and 180.

    Returns:
        Focal length in pixels.

    Raises:
        ValueError: if `image_width` is not positive or `fov_deg` is outside
            (0, 180). Such inputs previously produced a ZeroDivisionError
            or a meaningless (zero / negative) focal length.
    """
    if not image_width > 0:
        raise ValueError(f"image_width must be positive, got {image_width!r}")
    # Written as a negated chained comparison so that NaN is rejected too.
    if not 0.0 < fov_deg < 180.0:
        raise ValueError(
            f"fov_deg must be strictly between 0 and 180 degrees, got {fov_deg!r}"
        )
    return (image_width / 2.0) / math.tan(math.radians(fov_deg / 2.0))
130
+
131
+
132
+ # ═══════════════════════════════════════════════════════════
133
+ # 3. OBJECT DETECTION (YOLOv5s via torch.hub)
134
+ # ═══════════════════════════════════════════════════════════
135
+
136
def load_yolo(
    model_name: str = "yolov5s",
    conf_thresh: float = 0.35,
    iou_thresh: float = 0.45
):
    """
    Fetch a pretrained YOLOv5 detector from torch.hub ("ultralytics/yolov5").

    Model names, smallest to largest:
        yolov5n (nano), yolov5s (small, default), yolov5m (medium),
        yolov5l (large), yolov5x (extra-large)

    Args:
        model_name: hub entry-point name of the model.
        conf_thresh: value stored on the model's `conf` attribute.
        iou_thresh: value stored on the model's `iou` attribute.

    Returns:
        The loaded model with both thresholds applied.
    """
    print(f"[ YOLO ] Loading {model_name} from torch.hub ...")
    hub_options = {"pretrained": True, "trust_repo": True}
    detector = torch.hub.load("ultralytics/yolov5", model_name, **hub_options)

    detector.conf = conf_thresh
    detector.iou = iou_thresh

    print(f" Loaded ({model_name})")
    return detector
160
+
161
+
162
def run_yolo(
    model,
    img: np.ndarray,
    conf_thresh: float = 0.35
) -> list[dict]:
    """
    Detect objects in a BGR image with a YOLOv5 hub model.

    The image is converted to RGB, passed through `model`, and the first
    result table (`results.pandas().xyxy[0]`) is turned into plain dicts.
    Rows whose confidence is below `conf_thresh` are dropped.

    Returns:
        A list of dicts shaped like
            { 'label': str, 'conf': float,
              'x1': int, 'y1': int, 'x2': int, 'y2': int }
    """
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    table = model(rgb).pandas().xyxy[0]  # one row per detection

    detections = [
        {
            "label": record["name"],
            "conf": float(record["confidence"]),
            "x1": int(record["xmin"]),
            "y1": int(record["ymin"]),
            "x2": int(record["xmax"]),
            "y2": int(record["ymax"]),
        }
        for record in table.to_dict("records")
        if not record["confidence"] < conf_thresh
    ]

    print(f" {len(detections)} object(s) detected")
    return detections
193
+
194
+
195
+ # ═══════════════════════════════════════════════════════════
196
+ # 4. DISTANCE ESTIMATION
197
+ # ═══════════════════════════════════════════════════════════
198
+
199
def pinhole_distance(
    pixel_height: int,
    real_height: float,
    focal_length: float
) -> float:
    """
    Distance to an object under the pinhole camera model.

    An object of real height H at distance D, seen by a camera with focal
    length f (in pixels), projects to h = (H * f) / D pixels, hence

        D = (H * f) / h

    Args:
        pixel_height: projected height h in pixels.
        real_height: real-world height H (the result shares its unit).
        focal_length: focal length f in pixels.

    Returns:
        The distance D, or infinity when `pixel_height` is zero or negative.
    """
    return math.inf if pixel_height <= 0 else (real_height * focal_length) / pixel_height
217
+
218
+
219
def detection_depth_stat(
    depth_map: np.ndarray,
    det: dict,
    inner_ratio: float = 0.6
) -> float:
    """
    Median depth-map value inside the central part of a detection box.

    Only the inner `inner_ratio` fraction of the box (per axis, clamped to
    [0.1, 1.0]) is sampled so that values near the box edges, which may
    belong to the background or to neighbouring objects, are ignored.

    Args:
        depth_map: 2-D array indexed [row, col].
        det: detection dict with integer keys 'x1', 'y1', 'x2', 'y2'.
        inner_ratio: fraction of the box width/height to keep.

    Returns:
        Median of the central region; if that region is empty, the median
        of the whole (image-clipped) box; 0.0 if that is empty as well.
    """
    ratio = float(np.clip(inner_ratio, 0.1, 1.0))
    left, top, right, bottom = (det[key] for key in ("x1", "y1", "x2", "y2"))
    n_rows, n_cols = depth_map.shape[0], depth_map.shape[1]

    box_w = max(1, right - left)
    box_h = max(1, bottom - top)

    # Margin trimmed from each side of the box.
    margin_x = int(box_w * (1.0 - ratio) / 2.0)
    margin_y = int(box_h * (1.0 - ratio) / 2.0)

    core = depth_map[
        max(0, top + margin_y):min(n_rows, bottom - margin_y),
        max(0, left + margin_x):min(n_cols, right - margin_x),
    ]
    if core.size:
        return float(np.median(core))

    # Central crop collapsed: fall back to the full box, clipped to the image.
    whole = depth_map[
        max(0, top):min(n_rows, bottom),
        max(0, left):min(n_cols, right),
    ]
    if whole.size:
        return float(np.median(whole))
    return 0.0
250
+
251
+
252
def midas_scale_calibration(
    detections: list[dict],
    depth_map: np.ndarray,
    focal_length: float,
    inner_ratio: float = 0.6,
    min_depth_value: float = 0.02
) -> Tuple[Optional[float], List[float]]:
    """
    Derive a single factor that maps MiDaS map values to metres.

    The model used here is D ≈ k / d, where d is the per-box MiDaS
    statistic and D the pinhole distance, so each anchor yields
    k = D_pinhole * d_midas. Anchors are detections whose label appears in
    KNOWN_HEIGHTS, whose box is taller than 5 px and whose MiDaS statistic
    exceeds `min_depth_value`.

    Args:
        detections: detection dicts with 'label', 'y1', 'y2' and box corners.
        depth_map: MiDaS closeness map.
        focal_length: focal length in pixels.
        inner_ratio: forwarded to detection_depth_stat.
        min_depth_value: anchors at or below this MiDaS value are ignored.

    Returns:
        (scale, k_values): the median k and the per-anchor k list, or
        (None, []) when no anchor qualifies.
    """
    anchor_ks = []
    for det in detections:
        ref_height = KNOWN_HEIGHTS.get(det["label"])
        box_height = det["y2"] - det["y1"]
        if ref_height is None or box_height <= 5:
            # No reference height, or the box is too small to trust.
            continue

        metric_dist = pinhole_distance(box_height, ref_height, focal_length)
        rel_depth = detection_depth_stat(depth_map, det, inner_ratio=inner_ratio)

        # Near-zero MiDaS values are treated as invalid regions.
        if rel_depth > min_depth_value:
            anchor_ks.append(metric_dist * rel_depth)

    if not anchor_ks:
        return None, []

    scale = float(np.median(anchor_ks))
    print(f" MiDaS scale factor k = {scale:.2f} "
          f"(from {len(anchor_ks)} anchor object(s))")
    return scale, anchor_ks
293
+
294
+
295
def estimate_distances(
    detections: list[dict],
    depth_map: np.ndarray,
    focal_length: float,
    inner_ratio: float = 0.6,
    min_depth_value: float = 0.02,
    blend_weight_pinhole: float = 0.55
) -> tuple[list[dict], dict]:
    """
    Annotate every detection (in place) with a metric distance estimate.

    Two estimates are computed per detection:
        pinhole – only for labels in KNOWN_HEIGHTS with a box taller than 5 px
        MiDaS   – k / d, using the scale k from midas_scale_calibration,
                  only when a scale exists and d exceeds `min_depth_value`
    The final distance is their weighted average when both exist, the
    single available one otherwise, and None when neither exists.

    Keys added to each detection dict:
        pixel_height, known_height_m, bbox_depth_median,
        dist_pinhole, dist_midas, distance, method

    Args:
        detections: detection dicts produced by the detector.
        depth_map: MiDaS closeness map.
        focal_length: focal length in pixels.
        inner_ratio: forwarded to detection_depth_stat.
        min_depth_value: MiDaS values at or below this are ignored.
        blend_weight_pinhole: weight of the pinhole estimate, clipped to
            [0, 1]; the MiDaS estimate gets the remainder.

    Returns:
        (detections, eval_context) – the same list object, plus a dict
        recording the calibration results and the parameters used.
    """
    # ── Step 1: calibrate MiDaS scale ──
    midas_scale, anchor_scales = midas_scale_calibration(
        detections,
        depth_map,
        focal_length,
        inner_ratio=inner_ratio,
        min_depth_value=min_depth_value,
    )
    blend_weight_pinhole = float(np.clip(blend_weight_pinhole, 0.0, 1.0))
    blend_weight_midas = 1.0 - blend_weight_pinhole

    # ── Step 2: per-detection estimates ──
    for det in detections:
        ref_height = KNOWN_HEIGHTS.get(det["label"])
        box_height = det["y2"] - det["y1"]
        det["pixel_height"] = box_height
        det["known_height_m"] = ref_height

        rel_depth = detection_depth_stat(depth_map, det, inner_ratio=inner_ratio)
        det["bbox_depth_median"] = rel_depth

        pinhole_ok = ref_height is not None and box_height > 5
        dp = (pinhole_distance(box_height, ref_height, focal_length)
              if pinhole_ok else None)
        det["dist_pinhole"] = dp

        midas_ok = midas_scale and rel_depth > min_depth_value
        dm = midas_scale / rel_depth if midas_ok else None
        det["dist_midas"] = dm

        # ── Blend / fall back ──
        if dp is None and dm is None:
            final, method = None, "unknown"
        elif dm is None:
            final, method = dp, "pinhole"
        elif dp is None:
            final, method = dm, "MiDaS"
        else:
            final = blend_weight_pinhole * dp + blend_weight_midas * dm
            method = "pinhole + MiDaS"
        det["distance"] = final
        det["method"] = method

    eval_context = {
        "midas_scale": midas_scale,
        "anchor_scales": anchor_scales,
        "depth_inner_ratio": inner_ratio,
        "min_depth_value": min_depth_value,
        "blend_weight_pinhole": blend_weight_pinhole,
    }
    return detections, eval_context
381
+
382
+
383
def compute_evaluation_metrics(
    detections: list[dict],
    focal_length: float,
    eval_context: dict
) -> dict:
    """
    Summarise a distance-estimation run without ground truth.

    The metrics describe coverage (how many detections received each kind
    of estimate), the spread of the calibration anchors, and how closely
    the pinhole and MiDaS estimates agree where both exist. The pinhole
    value serves as the reference for the agreement errors.

    Args:
        detections: dicts carrying 'conf' and, optionally, 'distance',
            'dist_pinhole', 'dist_midas', 'method', 'known_height_m'.
        focal_length: focal length in pixels, echoed into the result.
        eval_context: dict read for 'anchor_scales' and 'midas_scale'.

    Returns:
        A flat dict of metrics. The distance, anchor and agreement groups
        are present only when the corresponding data exists.
    """
    def _values(key):
        # float32 array of the non-None values stored under `key`.
        return np.array(
            [det[key] for det in detections if det.get(key) is not None],
            dtype=np.float32
        )

    total = len(detections)
    if detections:
        confs = np.array([det["conf"] for det in detections], dtype=np.float32)
    else:
        confs = np.array([])

    final_dists = _values("distance")
    pinhole_vals = _values("dist_pinhole")
    midas_vals = _values("dist_midas")
    both_branches = [
        det for det in detections
        if det.get("dist_pinhole") is not None and det.get("dist_midas") is not None
    ]
    anchor_scales = np.array(eval_context.get("anchor_scales", []), dtype=np.float32)

    metrics = {
        "focal_length_px": float(focal_length),
        "num_detections": total,
        "mean_confidence": float(confs.mean()) if confs.size else None,
        "known_height_count": sum(det.get("known_height_m") is not None for det in detections),
        "pinhole_count": int(pinhole_vals.size),
        "midas_count": int(midas_vals.size),
        "blended_count": sum(det.get("method") == "pinhole + MiDaS" for det in detections),
        "unresolved_count": sum(det.get("distance") is None for det in detections),
        "calibration_anchor_count": int(anchor_scales.size),
        "midas_scale_factor": eval_context.get("midas_scale"),
    }
    metrics["known_height_coverage"] = (
        metrics["known_height_count"] / total if total else None
    )
    metrics["distance_coverage"] = (
        float(final_dists.size) / total if total else None
    )

    if final_dists.size:
        metrics["final_distance_mean_m"] = float(final_dists.mean())
        metrics["final_distance_std_m"] = float(final_dists.std())
        metrics["final_distance_min_m"] = float(final_dists.min())
        metrics["final_distance_max_m"] = float(final_dists.max())

    if anchor_scales.size:
        metrics["anchor_scale_median"] = float(np.median(anchor_scales))
        metrics["anchor_scale_std"] = float(anchor_scales.std())
        metrics["anchor_scale_cv"] = float(
            anchor_scales.std() / (anchor_scales.mean() + 1e-6)
        )

    if both_branches:
        pinhole_arr = np.array([det["dist_pinhole"] for det in both_branches], dtype=np.float32)
        midas_arr = np.array([det["dist_midas"] for det in both_branches], dtype=np.float32)
        abs_err = np.abs(midas_arr - pinhole_arr)
        rel_err = abs_err / np.maximum(pinhole_arr, 1e-6)
        metrics["agreement_sample_count"] = int(len(both_branches))
        metrics["agreement_mae_m"] = float(abs_err.mean())
        metrics["agreement_rmse_m"] = float(np.sqrt(np.mean(abs_err ** 2)))
        metrics["agreement_mean_relative_error"] = float(rel_err.mean())
        metrics["agreement_median_relative_error"] = float(np.median(rel_err))
        metrics["agreement_within_10pct"] = float(np.mean(rel_err <= 0.10))
        metrics["agreement_within_20pct"] = float(np.mean(rel_err <= 0.20))

    return metrics
466
+
467
+
468
def save_evaluation_outputs(
    detections: list[dict],
    metrics: dict,
    eval_dir: str
) -> None:
    """
    Write the evaluation artifacts for one run into ``eval_dir``.

    Three files are produced (the directory is created if it is missing):
      detection_distances.csv  one row per detection, nearest object first
      metrics.json             the ``metrics`` dict, indented
      evaluation_report.txt    plain-text listing of the same metrics

    Args:
        detections: detection dicts; "label" and "conf" are required, every
            other field is optional and written as an empty cell when absent
            or None ("bbox_depth_median" falls back to 0.0, "method" to
            "unknown").
        metrics: JSON-serialisable mapping of metric name -> value.
        eval_dir: output directory.
    """
    os.makedirs(eval_dir, exist_ok=True)

    def _distance_key(det: dict) -> float:
        # Compare against None explicitly: a plain truthiness test would treat
        # a genuine 0.0 m estimate as "missing" and push it to the end.
        dist = det.get("distance")
        return float("inf") if dist is None else dist

    def _optional(value, spec: str) -> str:
        # Empty CSV cell for values that could not be estimated.
        return "" if value is None else format(value, spec)

    csv_path = os.path.join(eval_dir, "detection_distances.csv")
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow([
            "label", "confidence", "pixel_height", "known_height_m",
            "bbox_depth_median", "dist_pinhole_m", "dist_midas_m",
            "final_distance_m", "method"
        ])
        # Unresolved detections (distance None) always sort last.
        for det in sorted(detections, key=_distance_key):
            writer.writerow([
                det["label"],
                f"{det['conf']:.6f}",
                det.get("pixel_height"),
                _optional(det.get("known_height_m"), ".3f"),
                f"{det.get('bbox_depth_median', 0.0):.6f}",
                _optional(det.get("dist_pinhole"), ".6f"),
                _optional(det.get("dist_midas"), ".6f"),
                _optional(det.get("distance"), ".6f"),
                det.get("method", "unknown"),
            ])

    metrics_path = os.path.join(eval_dir, "metrics.json")
    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)

    report_path = os.path.join(eval_dir, "evaluation_report.txt")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("Subtask 2 Evaluation Report\n")
        f.write("===========================\n\n")
        f.write("This report measures internal consistency only.\n")
        f.write("No ground-truth object distances are available here, so these metrics\n")
        f.write("should be interpreted as coverage / robustness diagnostics, not absolute accuracy.\n\n")
        f.write("Key metrics\n")
        f.write("-----------\n")
        for key, value in metrics.items():
            if value is None:
                pretty = "N/A"
            elif isinstance(value, float):
                pretty = f"{value:.4f}"
            else:
                pretty = str(value)
            f.write(f"{key}: {pretty}\n")

        f.write("\nMetric sufficiency note\n")
        f.write("----------------------\n")
        f.write("- Enough for internal evaluation: yes.\n")
        f.write("- Enough for accuracy claims: no.\n")
        f.write("- To measure real accuracy, add ground-truth distances and report MAE/RMSE/MAPE against labels.\n")

    print(f" Saved -> {csv_path}")
    print(f" Saved -> {metrics_path}")
    print(f" Saved -> {report_path}")
527
+
528
+
529
+ # ═══════════════════════════════════════════════════════════
530
+ # 5. DRAW ANNOTATED IMAGE
531
+ # ═══════════════════════════════════════════════════════════
532
+
533
def draw_detections(
    img: np.ndarray,
    detections: list[dict]
) -> np.ndarray:
    """
    Return a copy of ``img`` with every detection drawn on it.

    Each detection gets a rectangle plus a filled caption strip reading
    "<class>: X.X m (conf%)", or "<class> (conf%)" when no distance is
    available. Colours come from ``_PALETTE``; a class keeps the colour it
    was given the first time it appeared. The input image is not modified.
    """
    canvas = img.copy()
    palette_slot = {}  # class name -> index into _PALETTE
    font = cv2.FONT_HERSHEY_SIMPLEX
    margin = 5  # padding (px) around the caption text

    for det in detections:
        name = det["label"]
        distance = det["distance"]
        score = det["conf"]
        left, top, right, bottom = det["x1"], det["y1"], det["x2"], det["y2"]

        # First sighting of a class claims the next palette slot (wrapping).
        slot = palette_slot.setdefault(name, len(palette_slot) % len(_PALETTE))
        box_colour = _PALETTE[slot]

        # Outline grows with the box, but is never thinner than 2 px.
        line_w = max(2, int((right - left + bottom - top) / 200))
        cv2.rectangle(canvas, (left, top), (right, bottom), box_colour, line_w)

        if distance is None:
            caption = f"{name} ({score:.0%})"
        else:
            caption = f"{name}: {distance:.1f} m ({score:.0%})"

        # Font size follows box height, clamped to the range [0.45, 0.9].
        scale = max(0.45, min(0.9, max(1, bottom - top) / 180))
        stroke = max(1, int(scale * 2))
        (text_w, text_h), base = cv2.getTextSize(caption, font, scale, stroke)

        # Caption strip sits just above the box, clamped to the image origin.
        strip_h = text_h + base + margin * 2
        strip_x = max(0, left)
        strip_y = max(0, top - strip_h)
        cv2.rectangle(
            canvas,
            (strip_x, strip_y),
            (strip_x + text_w + margin * 2, strip_y + strip_h),
            box_colour,
            -1,
        )

        # Colours are BGR: pick black or white text from perceived luminance.
        lum = 0.299 * box_colour[2] + 0.587 * box_colour[1] + 0.114 * box_colour[0]
        ink = (0, 0, 0) if lum > 128 else (255, 255, 255)
        cv2.putText(
            canvas, caption,
            (strip_x + margin, strip_y + text_h + margin),
            font, scale, ink, stroke, cv2.LINE_AA,
        )

    return canvas
592
+
593
+
594
+ # ═══════════════════════════════════════════════════════════
595
+ # 6. VISUALISATION (combined figure)
596
+ # ═══════════════════════════════════════════════════════════
597
+
598
def visualise_results(
    img: np.ndarray,
    depth_map: np.ndarray,
    detections: list[dict],
    annotated: np.ndarray,
    out_path: str
) -> None:
    """
    Render and save the combined results figure.

    Three panels plus a table:
      1. Original image with raw YOLO boxes
      2. MiDaS depth heatmap with boxes and per-object distance overlaid
      3. Final annotated image with distance labels
      4. A table of per-object estimates below the panels, nearest first

    Args:
        img: source image (BGR).
        depth_map: depth map handed to ``depth_to_heatmap``.
        detections: detection dicts with "x1", "y1", "x2", "y2", "label",
            "conf" and optionally "distance", "dist_pinhole", "dist_midas",
            "method".
        annotated: image returned by ``draw_detections`` (BGR).
        out_path: destination file; parent directories are created.
    """
    def _distance_key(det: dict) -> float:
        # Explicit None test so a real 0.0 m estimate is not treated as
        # missing; unresolved detections sort last.
        dist = det.get("distance")
        return float("inf") if dist is None else dist

    def _metres(value) -> str:
        return "—" if value is None else f"{value:.2f} m"

    fig, axes = plt.subplots(1, 3, figsize=(19, 7), dpi=130)
    try:
        fig.patch.set_facecolor("#1a1a2e")

        # ── Panel 1: raw YOLO detections ──
        raw_boxes = img.copy()
        for det in detections:
            x1, y1, x2, y2 = det["x1"], det["y1"], det["x2"], det["y2"]
            cv2.rectangle(raw_boxes, (x1, y1), (x2, y2), (0, 255, 120), 2)
            cv2.putText(raw_boxes, f"{det['label']} {det['conf']:.0%}",
                        (x1, max(0, y1 - 6)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.55, (0, 255, 120), 2, cv2.LINE_AA)

        axes[0].imshow(cv2.cvtColor(raw_boxes, cv2.COLOR_BGR2RGB))
        axes[0].set_title("YOLO Detections", color="white", fontsize=11,
                          fontweight="bold", pad=10)
        axes[0].axis("off")

        # ── Panel 2: MiDaS depth + boxes ──
        depth_over = depth_to_heatmap(depth_map).copy()
        for det in detections:
            x1, y1, x2, y2 = det["x1"], det["y1"], det["x2"], det["y2"]
            cv2.rectangle(depth_over, (x1, y1), (x2, y2), (255, 255, 255), 2)
            dist = det.get("distance")
            dist_txt = "?" if dist is None else f"{dist:.1f}m"
            cv2.putText(depth_over, dist_txt,
                        (x1 + 3, y1 + 18),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.55, (255, 255, 255), 2, cv2.LINE_AA)

        axes[1].imshow(cv2.cvtColor(depth_over, cv2.COLOR_BGR2RGB))
        sm = plt.cm.ScalarMappable(cmap="turbo", norm=plt.Normalize(0, 1))
        sm.set_array([])
        cb = fig.colorbar(sm, ax=axes[1], fraction=0.035, pad=0.02)
        # The label runs in the same direction as the ticks (0 = Far, 1 = Near).
        cb.set_label("Far → Near", color="white", fontsize=8)
        cb.set_ticks([0, 0.5, 1])
        cb.set_ticklabels(["Far", "Mid", "Near"], color="white", fontsize=8)
        cb.ax.yaxis.set_tick_params(color="white")
        axes[1].set_title("MiDaS Depth + Distance Estimates",
                          color="white", fontsize=11, fontweight="bold", pad=10)
        axes[1].axis("off")

        # ── Panel 3: final annotated image ──
        axes[2].imshow(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))
        axes[2].set_title("Object Distances (pinhole + MiDaS blend)",
                          color="white", fontsize=11, fontweight="bold", pad=10)
        axes[2].axis("off")

        # ── Distance table below ──
        rows = []
        for det in sorted(detections, key=_distance_key):
            dist = det.get("distance")
            dist_str = "N/A" if dist is None else f"{dist:.2f} m"
            rows.append([
                det["label"], f"{det['conf']:.0%}",
                _metres(det.get("dist_pinhole")),
                _metres(det.get("dist_midas")),
                dist_str,
                det.get("method", "unknown"),
            ])

        if rows:
            # Negative y places the table underneath the three panels;
            # bbox_inches="tight" at save time keeps it in the output.
            table_ax = fig.add_axes([0.05, -0.14, 0.90, 0.14])
            table_ax.axis("off")
            table_ax.set_facecolor("#1a1a2e")
            col_labels = ["Object", "Confidence",
                          "Pinhole est.", "MiDaS est.", "Final distance", "Method"]
            tbl = table_ax.table(
                cellText=rows,
                colLabels=col_labels,
                cellLoc="center", loc="center"
            )
            tbl.auto_set_font_size(False)
            tbl.set_fontsize(8.5)
            tbl.scale(1, 1.55)
            # Style header
            for j in range(len(col_labels)):
                tbl[(0, j)].set_facecolor("#2e4057")
                tbl[(0, j)].set_text_props(color="white", fontweight="bold")
            # Alternating row shading
            for i in range(1, len(rows) + 1):
                bg = "#1e2d40" if i % 2 == 0 else "#16213e"
                for j in range(len(col_labels)):
                    tbl[(i, j)].set_facecolor(bg)
                    tbl[(i, j)].set_text_props(color="#dde")

        fig.suptitle(
            "Subtask 2 — Object Detection & Distance Estimation\n"
            "Distance = pinhole camera model + MiDaS depth scaling",
            color="white", fontsize=13, fontweight="bold", y=1.02
        )
        fig.tight_layout()

        os.makedirs(os.path.dirname(os.path.abspath(out_path)), exist_ok=True)
        fig.savefig(out_path, dpi=130, bbox_inches="tight",
                    facecolor=fig.get_facecolor())
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)
    print(f"Saved -> {out_path}")
708
+
709
+
710
+ # ═══════════════════════════════════════════════════════════
711
+ # 7. MAIN
712
+ # ═══════════════════════════════════════════════════════════
713
+
714
def main() -> None:
    """
    Command-line entry point: detect objects and estimate their distances.

    Arguments are read from ``sys.argv``:
      image_path  required
      output_dir  optional, defaults to "output"
      focal_px    optional focal length in pixels; when omitted it is
                  estimated assuming a 60° horizontal field of view

    Images are written to <output_dir>/images and evaluation artifacts to
    <output_dir>/evaluation. Exits with a usage message when arguments are
    missing or ``focal_px`` is not a positive finite number, and with
    status 0 when no objects are detected.
    """
    usage = (
        "Usage: python object_distance.py <image_path> [output_dir] [focal_px]\n"
        "Example: python object_distance.py street.jpg output/ 800"
    )
    if len(sys.argv) < 2:
        sys.exit(usage)

    image_path = sys.argv[1]
    out_dir = sys.argv[2] if len(sys.argv) > 2 else "output"

    focal_length = None
    if len(sys.argv) > 3:
        # Reject bad input up front instead of failing with a traceback or
        # silently producing meaningless distances.
        try:
            focal_length = float(sys.argv[3])
        except ValueError:
            sys.exit(f"Invalid focal_px {sys.argv[3]!r}: expected a number.\n{usage}")
        if not np.isfinite(focal_length) or focal_length <= 0:
            sys.exit(f"Invalid focal_px {sys.argv[3]!r}: must be a positive number.\n{usage}")

    image_dir = os.path.join(out_dir, "images")
    eval_dir = os.path.join(out_dir, "evaluation")

    # ── Load image ──
    img = load_image(image_path)
    h, w = img.shape[:2]

    if focal_length is None:
        focal_length = estimate_focal_length(w, fov_deg=60.0)
        print(f"Focal length estimated: {focal_length:.1f} px "
              f"(assuming 60° horizontal FOV — override via 3rd argument)")
    else:
        print(f"Focal length (user-supplied): {focal_length:.1f} px")

    # ── MiDaS depth ──
    print("\n[ MiDaS ] Loading MiDaS_small ...")
    midas_model, midas_transform, device = load_midas("MiDaS_small")
    print("[ MiDaS ] Running inference ...")
    depth_map = midas_depth(img, midas_model, midas_transform, device)
    print(f" Done. depth in [0,1] mean={depth_map.mean():.3f}")

    # ── YOLO detection ──
    print("\n[ YOLO ] Loading YOLOv5s ...")
    yolo_model = load_yolo("yolov5s")
    print("[ YOLO ] Running detection ...")
    detections = run_yolo(yolo_model, img)

    if not detections:
        print("WARNING: No objects detected. "
              "Try a lower confidence threshold or a different image.")
        sys.exit(0)

    # ── Distance estimation ──
    print("\n[ Dist ] Estimating distances ...")
    detections, eval_context = estimate_distances(detections, depth_map, focal_length)
    metrics = compute_evaluation_metrics(detections, focal_length, eval_context)

    def _distance_key(det: dict) -> float:
        # Explicit None test so a real 0.0 m estimate is not treated as
        # missing; unresolved detections sort last.
        dist = det.get("distance")
        return float("inf") if dist is None else dist

    def _metres(value) -> str:
        return "—" if value is None else f"{value:.1f} m"

    # Print summary table
    print(f"\n {'Object':<18} {'Conf':>5} {'Pinhole':>10} "
          f"{'MiDaS':>10} {'Final':>10} Method")
    print(" " + "-" * 70)
    for det in sorted(detections, key=_distance_key):
        dp = _metres(det.get("dist_pinhole"))
        dm = _metres(det.get("dist_midas"))
        df = _metres(det.get("distance"))
        print(f" {det['label']:<18} {det['conf']:>4.0%} "
              f"{dp:>10} {dm:>10} {df:>10} {det.get('method', 'unknown')}")

    # ── Draw and save ──
    print("\n[ Draw ] Annotating image ...")
    annotated = draw_detections(img, detections)

    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    annotated_path = os.path.join(image_dir, "detections_with_distance.png")
    depth_path = os.path.join(image_dir, "midas_depth.png")
    # cv2.imwrite reports failure through its return value, not an exception.
    annotated_ok = cv2.imwrite(annotated_path, annotated)
    if not cv2.imwrite(depth_path, depth_to_heatmap(depth_map)):
        print(f"WARNING: could not write {depth_path}")
    if annotated_ok:
        print(f" Saved -> {annotated_path}")
    else:
        print(f"WARNING: could not write {annotated_path}")

    print("\n[ Fig ] Compositing combined figure ...")
    visualise_results(
        img, depth_map, detections, annotated,
        out_path=os.path.join(image_dir, "object_distance_subtask2.png")
    )

    print("\n[ Eval ] Writing evaluation artifacts ...")
    save_evaluation_outputs(detections, metrics, eval_dir)

    print(f"\nDone. Image outputs: {image_dir}/")
    print(f"Done. Evaluation outputs: {eval_dir}/")
797
+
798
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()