GitHub Actions committed on
Commit
b20c82e
·
1 Parent(s): 6da2618

Deploy from GitHub commit ca72656c17476e5aa37a4735af6e47ff9f94fa1a

Browse files
app.py CHANGED
@@ -112,8 +112,17 @@ DEPTH_MODEL_NAME = str(config_value(
112
  "DEPTH_MODEL_NAME",
113
  "models",
114
  "depth_model_name",
115
- "Intel/dpt-large",
 
 
116
  ))
 
 
 
 
 
 
 
117
  ENABLE_DEPTH_ESTIMATION = str(config_value(
118
  "ENABLE_DEPTH_ESTIMATION",
119
  "runtime",
@@ -301,6 +310,18 @@ def estimate_depth(img: Image.Image, width: int, height: int):
301
  depth_min, depth_max = float(np.min(depth)), float(np.max(depth))
302
  if depth_max - depth_min < 1e-6:
303
  return None
 
 
 
 
 
 
 
 
 
 
 
 
304
  return (depth - depth_min) / (depth_max - depth_min)
305
  except Exception as exc:
306
  print(f"Depth estimation skipped ({exc}).", flush=True)
@@ -811,7 +832,144 @@ def detect_dual_vanishing_points(
811
  return primary, secondary
812
 
813
 
814
- def estimate_floor_plane(mask: np.ndarray, img_np: np.ndarray):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
815
  ys, xs = np.where(mask > 0)
816
  if len(xs) < 1000:
817
  return None, None
@@ -891,7 +1049,9 @@ def estimate_floor_plane(mask: np.ndarray, img_np: np.ndarray):
891
  vp_y = float(vanishing_point["y"])
892
  else:
893
  # No usable VP: assume an eye-level shot with the horizon a little
894
- # above the floor's top edge, centred over the bottom edge.
 
 
895
  vp_x = (bl_x + br_x) * 0.5
896
  vp_y = top_y_f - 0.35 * span_y
897
  # Keep the horizon clear of the top edge so the transform stays
@@ -923,6 +1083,27 @@ def estimate_floor_plane(mask: np.ndarray, img_np: np.ndarray):
923
  H = cv2.getPerspectiveTransform(src, dst)
924
  homography = H.flatten().tolist()
925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
926
  default_rotation = estimate_default_rotation(mask, H, vanishing_point2)
927
 
928
  return homography, {
@@ -938,6 +1119,110 @@ def estimate_floor_plane(mask: np.ndarray, img_np: np.ndarray):
938
  }
939
 
940
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
941
  # ---------------------------------------------------------------------------
942
  # P1-4 — Default Tile Rotation
943
  # Tiles were laid along the plane's x-axis (= the image bbox axis), which is
@@ -1022,8 +1307,12 @@ def estimate_default_rotation(
1022
  [cx + (vanishing_point2["x"] - cx) * 0.25, cy + (vanishing_point2["y"] - cy) * 0.25],
1023
  ])
1024
  angle = _plane_angle(toward)
1025
- if angle is not None:
1026
- return float(np.clip(angle, -30.0, 30.0))
 
 
 
 
1027
 
1028
  return 0.0
1029
 
@@ -1112,7 +1401,9 @@ def build_floor_surface_mask(
1112
  if depth is not None and floor_mask.any():
1113
  floor_depth = depth[floor_mask > 0]
1114
  lo, hi = float(np.percentile(floor_depth, 2)), float(np.percentile(floor_depth, 98))
1115
- margin = max(0.08, (hi - lo) * 0.35)
 
 
1116
  depth_keep = (depth >= lo - margin) & (depth <= hi + margin)
1117
  surface = (surface & depth_keep.astype(np.uint8)).astype(np.uint8)
1118
  surface[floor_mask > 0] = np.maximum(surface[floor_mask > 0], 1)
@@ -1336,7 +1627,11 @@ def build_segmentation_bundle(contents: bytes):
1336
  print(f"[TIMING] Depth estimation took {time.perf_counter() - t0:.3f} seconds", flush=True)
1337
 
1338
  t0 = time.perf_counter()
1339
- homography, plane = estimate_floor_plane(floor_mask, img_np)
 
 
 
 
1340
  print(f"[TIMING] Plane fitting / homography calculation took {time.perf_counter() - t0:.3f} seconds", flush=True)
1341
 
1342
  t0 = time.perf_counter()
@@ -1408,6 +1703,19 @@ def build_segmentation_bundle(contents: bytes):
1408
  "depthEnabled": depth is not None,
1409
  "shadingEnabled": shade_map is not None,
1410
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
1411
  })
1412
 
1413
  if not segments:
 
112
  "DEPTH_MODEL_NAME",
113
  "models",
114
  "depth_model_name",
115
+ # R1-1 — metric indoor checkpoint: predicts depth in metres (and is ~10x
116
+ # smaller than dpt-large). Foundation for R1-2 plane fit and R1-3 scale.
117
+ "depth-anything/Depth-Anything-V2-Metric-Indoor-Small-hf",
118
  ))
119
+
120
+
121
+ def depth_model_is_metric(name: str | None = None) -> bool:
122
+ """Metric checkpoints predict metres; relative ones predict unitless
123
+ inverse depth that we min-max normalise. Convention: metric model ids
124
+ carry 'metric' in the name (Depth Anything V2 Metric, Metric3D...)."""
125
+ return "metric" in (name or DEPTH_MODEL_NAME).lower()
126
  ENABLE_DEPTH_ESTIMATION = str(config_value(
127
  "ENABLE_DEPTH_ESTIMATION",
128
  "runtime",
 
310
  depth_min, depth_max = float(np.min(depth)), float(np.max(depth))
311
  if depth_max - depth_min < 1e-6:
312
  return None
313
+ if depth_model_is_metric(model_name):
314
+ # R1-1 — keep the metres. Downstream filters are scale-agnostic
315
+ # (percentile bands), and R1-2/R1-3 need real geometry. Gate on
316
+ # plausibility: an indoor scene lives within ~0.1-30 m; anything
317
+ # else means the checkpoint or processor mis-loaded.
318
+ if depth_min < -0.5 or depth_max > 50.0 or depth_max < 0.2:
319
+ print(
320
+ f"Metric depth implausible [{depth_min:.2f}, {depth_max:.2f}] m — skipped.",
321
+ flush=True,
322
+ )
323
+ return None
324
+ return np.clip(depth, 0.05, 40.0)
325
  return (depth - depth_min) / (depth_max - depth_min)
326
  except Exception as exc:
327
  print(f"Depth estimation skipped ({exc}).", flush=True)
 
832
  return primary, secondary
833
 
834
 
835
+ def plane_homography_from_depth(
836
+ depth: np.ndarray | None, mask: np.ndarray, w: int, h: int
837
+ ):
838
+ """R1-2 — analytic floor homography from a metric 3D plane fit.
839
+
840
+ When vanishing-point detection fails, the old fallback invented a
841
+ synthetic horizon, which warped steep/wide rooms (sheared checkers, the
842
+ far-centre pinch). With metric depth the ground plane can be measured
843
+ instead: back-project floor pixels (pinhole, f ~ image width), fit the
844
+ plane robustly, and build the image->plane homography exactly as
845
+ inv(K @ [u v c]) where u,v are orthonormal in-plane axes and c is a point
846
+ on the plane. Plane units are METRES, so metersPerUnit is 1 by
847
+ construction and true-to-size tiling (R1-3) activates in precisely the
848
+ rooms the trapezoid fallback used to distort.
849
+
850
+ Returns (homography_list, plane_dict) or None (caller falls back to the
851
+ synthetic-VP construction).
852
+ """
853
+ if depth is None or not depth_model_is_metric():
854
+ return None
855
+ ys, xs = np.nonzero(mask)
856
+ if len(xs) < 2000:
857
+ return None
858
+ rng = np.random.default_rng(7)
859
+ sel = rng.choice(len(xs), size=min(6000, len(xs)), replace=False)
860
+ x_f = xs[sel].astype(np.float64)
861
+ y_f = ys[sel].astype(np.float64)
862
+ z = depth[ys[sel], xs[sel]].astype(np.float64)
863
+
864
+ f = float(w)
865
+ cx, cy = w / 2.0, h / 2.0
866
+ P = np.stack([(x_f - cx) / f * z, (y_f - cy) / f * z, z], axis=1)
867
+
868
+ # iteratively trimmed total-least-squares plane fit
869
+ keep = np.ones(len(P), bool)
870
+ n = np.array([0.0, -1.0, 0.0])
871
+ c = P.mean(axis=0)
872
+ for _ in range(4):
873
+ c = P[keep].mean(axis=0)
874
+ q = P[keep] - c
875
+ _, _, vt = np.linalg.svd(q, full_matrices=False)
876
+ n = vt[2]
877
+ resid = np.abs((P - c) @ n)
878
+ thr = max(float(np.percentile(resid[keep], 80)), 1e-4)
879
+ keep = resid <= thr
880
+
881
+ resid = np.abs((P - c) @ n)
882
+ med_z = float(np.median(z))
883
+ inliers = resid <= max(0.03, 0.025 * med_z)
884
+ if inliers.mean() < 0.6:
885
+ return None
886
+ c = P[inliers].mean(axis=0)
887
+ q = P[inliers] - c
888
+ _, _, vt = np.linalg.svd(q, full_matrices=False)
889
+ n = vt[2]
890
+ if n @ c > 0:
891
+ n = -n # face the camera
892
+ # floor sanity: the normal must point up-ish (camera y is down), not at a
893
+ # wall-like angle — protects against fitting a dominant wall/cabinet face.
894
+ if n[1] > -0.5:
895
+ return None
896
+
897
+ # in-plane axes: u image-horizontal-ish, v toward the camera (near field),
898
+ # matching the existing convention that plane-y grows toward the viewer
899
+ u = np.cross(n, [0.0, 0.0, 1.0])
900
+ if np.linalg.norm(u) < 0.2: # camera looking straight down
901
+ u = np.cross(n, [0.0, 1.0, 0.0])
902
+ u /= np.linalg.norm(u)
903
+ v = np.cross(n, u)
904
+ v /= np.linalg.norm(v)
905
+
906
+ K = np.array([[f, 0, cx], [0, f, cy], [0, 0, 1.0]])
907
+ M = K @ np.stack([u, v, c], axis=1)
908
+ if abs(np.linalg.det(M)) < 1e-9:
909
+ return None
910
+ H = np.linalg.inv(M)
911
+ H = H / H[2, 2]
912
+
913
+ # orient axes by their image-space direction at the floor centroid
914
+ def to_plane(px, py):
915
+ den = H[2, 0] * px + H[2, 1] * py + H[2, 2]
916
+ return (
917
+ (H[0, 0] * px + H[0, 1] * py + H[0, 2]) / den,
918
+ (H[1, 0] * px + H[1, 1] * py + H[1, 2]) / den,
919
+ )
920
+
921
+ mx, my = float(np.median(x_f)), float(np.median(y_f))
922
+ a0, b0 = to_plane(mx, my)
923
+ a_dx, _ = to_plane(mx + 10, my)
924
+ _, b_dy = to_plane(mx, my + 10)
925
+ flip = np.diag([
926
+ -1.0 if a_dx - a0 < 0 else 1.0, # plane-x grows image-right
927
+ -1.0 if b_dy - b0 < 0 else 1.0, # plane-y grows image-down (nearer)
928
+ 1.0,
929
+ ])
930
+ H = flip @ H
931
+ H = H / H[2, 2]
932
+
933
+ # plane-space extent of the floor (percentile box, metres)
934
+ den = H[2, 0] * x_f + H[2, 1] * y_f + H[2, 2]
935
+ pa = (H[0, 0] * x_f + H[0, 1] * y_f + H[0, 2]) / den
936
+ pb = (H[1, 0] * x_f + H[1, 1] * y_f + H[1, 2]) / den
937
+ a1, a2 = float(np.percentile(pa, 1)), float(np.percentile(pa, 99))
938
+ b1, b2 = float(np.percentile(pb, 1)), float(np.percentile(pb, 99))
939
+ if not (0.8 <= a2 - a1 <= 30.0 and 0.8 <= b2 - b1 <= 60.0):
940
+ return None # implausible physical footprint
941
+
942
+ # image quad of the plane box (bl, br, tr, tl — existing convention)
943
+ Hinv = np.linalg.inv(H)
944
+
945
+ def to_img(ap, bp):
946
+ den_i = Hinv[2, 0] * ap + Hinv[2, 1] * bp + Hinv[2, 2]
947
+ return (
948
+ float(np.clip((Hinv[0, 0] * ap + Hinv[0, 1] * bp + Hinv[0, 2]) / den_i, 0, w - 1)),
949
+ float(np.clip((Hinv[1, 0] * ap + Hinv[1, 1] * bp + Hinv[1, 2]) / den_i, 0, h - 1)),
950
+ )
951
+
952
+ quad = [to_img(a1, b2), to_img(a2, b2), to_img(a2, b1), to_img(a1, b1)]
953
+
954
+ homography = H.flatten().tolist()
955
+ # self-check with the certified R1-3 gate chain (isotropy, shear, spread):
956
+ # on a correct fit the measured scale must be ~1 m per plane unit.
957
+ mpu = estimate_meters_per_unit(depth, mask, homography, w, h)
958
+ if mpu is None or not (0.85 <= mpu <= 1.15):
959
+ return None
960
+
961
+ plane = {
962
+ "x": a1,
963
+ "y": b1,
964
+ "width": a2 - a1,
965
+ "height": b2 - b1,
966
+ "quad": [coord for pt in quad for coord in pt],
967
+ "geometrySource": "depth-plane", # R1-2 — vs the VP trapezoid path
968
+ }
969
+ return homography, plane
970
+
971
+
972
+ def estimate_floor_plane(mask: np.ndarray, img_np: np.ndarray, depth: np.ndarray | None = None):
973
  ys, xs = np.where(mask > 0)
974
  if len(xs) < 1000:
975
  return None, None
 
1049
  vp_y = float(vanishing_point["y"])
1050
  else:
1051
  # No usable VP: assume an eye-level shot with the horizon a little
1052
+ # above the floor's top edge, centred over the bottom edge. (If this
1053
+ # guess disagrees with measured depth, the R1-2 arbitration below
1054
+ # replaces it with the analytic depth-plane homography.)
1055
  vp_x = (bl_x + br_x) * 0.5
1056
  vp_y = top_y_f - 0.35 * span_y
1057
  # Keep the horizon clear of the top edge so the transform stays
 
1083
  H = cv2.getPerspectiveTransform(src, dst)
1084
  homography = H.flatten().tolist()
1085
 
1086
+ # R1-2 — geometry arbitration: when the trapezoid homography (real or
1087
+ # synthetic VP) is inconsistent with the measured 3D floor — the certified
1088
+ # metric-scale gates fail on it — prefer the analytic depth-plane
1089
+ # homography. This covers both failure classes seen in the reference
1090
+ # rooms: no VP found (guessed horizon) and a detected-but-wrong VP
1091
+ # (sheared trapezoid). With no metric depth both probes return None and
1092
+ # the trapezoid ships unchanged.
1093
+ if depth is not None and estimate_meters_per_unit(depth, mask, homography, w, h) is None:
1094
+ fitted = plane_homography_from_depth(depth, mask, w, h)
1095
+ if fitted is not None:
1096
+ homography_m, plane_meta = fitted
1097
+ plane_meta["hullQuad"] = hull_quad_list
1098
+ plane_meta["vanishingPoint"] = vanishing_point
1099
+ plane_meta["vanishingPoint2"] = vanishing_point2
1100
+ plane_meta["defaultRotation"] = estimate_default_rotation(
1101
+ mask,
1102
+ np.asarray(homography_m, np.float64).reshape(3, 3),
1103
+ vanishing_point2,
1104
+ )
1105
+ return homography_m, plane_meta
1106
+
1107
  default_rotation = estimate_default_rotation(mask, H, vanishing_point2)
1108
 
1109
  return homography, {
 
1119
  }
1120
 
1121
 
1122
+ def estimate_meters_per_unit(
1123
+ depth: np.ndarray | None,
1124
+ mask: np.ndarray,
1125
+ homography: list[float] | None,
1126
+ w: int,
1127
+ h: int,
1128
+ ) -> float | None:
1129
+ """R1-3 — physical scale of the floor plane: metres per plane unit.
1130
+
1131
+ Back-project floor pixels to 3D through the metric depth (pinhole, f ~
1132
+ image width — the P0 convention), map the same pixels into plane space
1133
+ through the homography, and take the median ratio of 3D distance to
1134
+ plane-space distance over random point pairs. Replaces the heuristic
1135
+ repeat coefficient on the frontend: a 60 cm tile becomes 0.6/metersPerUnit
1136
+ plane units in every room. None when depth is relative, the floor is too
1137
+ small, or the ratios are inconsistent (non-planar depth — let the
1138
+ heuristic handle it).
1139
+ """
1140
+ if depth is None or homography is None or not depth_model_is_metric():
1141
+ return None
1142
+ ys, xs = np.nonzero(mask)
1143
+ if len(xs) < 500:
1144
+ return None
1145
+ # Near field only: rows below the floor's 40th percentile. The far floor
1146
+ # is where a synthetic-VP homography distorts most (R1-2's territory) and
1147
+ # where depth is noisiest; the near field anchors perceived tile size.
1148
+ near = ys >= np.percentile(ys, 40)
1149
+ ys, xs = ys[near], xs[near]
1150
+ if len(xs) < 500:
1151
+ return None
1152
+ rng = np.random.default_rng(12345)
1153
+ sel = rng.choice(len(xs), size=min(2000, len(xs)), replace=False)
1154
+ xs_f = xs[sel].astype(np.float64)
1155
+ ys_f = ys[sel].astype(np.float64)
1156
+ z = depth[ys[sel], xs[sel]].astype(np.float64)
1157
+
1158
+ f = float(w) # P0 convention: focal ~ image width
1159
+ cx, cy = w / 2.0, h / 2.0
1160
+ pts3 = np.stack([(xs_f - cx) / f * z, (ys_f - cy) / f * z, z], axis=1)
1161
+
1162
+ H = np.asarray(homography, np.float64).reshape(3, 3)
1163
+ den = H[2, 0] * xs_f + H[2, 1] * ys_f + H[2, 2]
1164
+ den = np.where(np.abs(den) < 1e-9, 1e-9, den)
1165
+ pts_p = np.stack(
1166
+ [
1167
+ (H[0, 0] * xs_f + H[0, 1] * ys_f + H[0, 2]) / den,
1168
+ (H[1, 0] * xs_f + H[1, 1] * ys_f + H[1, 2]) / den,
1169
+ ],
1170
+ axis=1,
1171
+ )
1172
+
1173
+ i = rng.integers(0, len(sel), 4000)
1174
+ j = rng.integers(0, len(sel), 4000)
1175
+ keep = i != j
1176
+ d3 = np.linalg.norm(pts3[i[keep]] - pts3[j[keep]], axis=1)
1177
+ dp = np.linalg.norm(pts_p[i[keep]] - pts_p[j[keep]], axis=1)
1178
+ # separation threshold relative to the floor's own plane-space span —
1179
+ # plane units may be pixel-ish (VP path) or metres (R1-2 path)
1180
+ span = float(np.hypot(
1181
+ np.percentile(pts_p[:, 0], 95) - np.percentile(pts_p[:, 0], 5),
1182
+ np.percentile(pts_p[:, 1], 95) - np.percentile(pts_p[:, 1], 5),
1183
+ ))
1184
+ far = dp > 0.05 * max(span, 1e-9)
1185
+ if far.sum() < 200:
1186
+ return None
1187
+ dpx = np.abs(pts_p[i[keep], 0] - pts_p[j[keep], 0])
1188
+ dpy = np.abs(pts_p[i[keep], 1] - pts_p[j[keep], 1])
1189
+ ratios = d3[far] / dp[far]
1190
+ med = float(np.median(ratios))
1191
+ q1, q3 = np.percentile(ratios, 25), np.percentile(ratios, 75)
1192
+ # Tight consistency gate: a synthetic-VP homography distorts the plane,
1193
+ # making the ratio position-dependent. Returning None there is correct —
1194
+ # the heuristic repeat takes over until R1-2 fixes the geometry.
1195
+ if med <= 1e-6 or (q3 - q1) / med > 0.35:
1196
+ return None # inconsistent — depth and homography disagree
1197
+ # Isotropy gate: a constant shear/anisotropy passes the spread check
1198
+ # (direction-averaged ratios stay narrow) but renders distorted tiles.
1199
+ # The scale measured along plane-x must match plane-y.
1200
+ horiz = far & (dpx > 2 * dpy)
1201
+ vert = far & (dpy > 2 * dpx)
1202
+ if horiz.sum() >= 50 and vert.sum() >= 50:
1203
+ med_h = float(np.median(d3[horiz] / dp[horiz]))
1204
+ med_v = float(np.median(d3[vert] / dp[vert]))
1205
+ if abs(med_h - med_v) / med > 0.2:
1206
+ return None # anisotropic plane mapping — not metric-trustworthy
1207
+ # Shear leaves axis-aligned lengths almost unchanged and shows up in the
1208
+ # diagonals instead: +45 deg pairs stretch while -45 deg pairs shrink.
1209
+ sx = pts_p[i[keep], 0] - pts_p[j[keep], 0]
1210
+ sy = pts_p[i[keep], 1] - pts_p[j[keep], 1]
1211
+ diag = far & (dpx > 0.5 * dpy) & (dpy > 0.5 * dpx)
1212
+ d1 = diag & (sx * sy > 0)
1213
+ d2 = diag & (sx * sy < 0)
1214
+ if d1.sum() >= 50 and d2.sum() >= 50:
1215
+ med_1 = float(np.median(d3[d1] / dp[d1]))
1216
+ med_2 = float(np.median(d3[d2] / dp[d2]))
1217
+ if abs(med_1 - med_2) / med > 0.2:
1218
+ return None # sheared plane mapping — not metric-trustworthy
1219
+ # plausible range: ~1e-3 m/unit for pixel-scale planes (VP trapezoid path)
1220
+ # up to ~1.0 m/unit for metre-scale planes (R1-2 depth-plane path)
1221
+ if not (1e-5 <= med <= 2.0):
1222
+ return None
1223
+ return round(med, 6)
1224
+
1225
+
1226
  # ---------------------------------------------------------------------------
1227
  # P1-4 — Default Tile Rotation
1228
  # Tiles were laid along the plane's x-axis (= the image bbox axis), which is
 
1307
  [cx + (vanishing_point2["x"] - cx) * 0.25, cy + (vanishing_point2["y"] - cy) * 0.25],
1308
  ])
1309
  angle = _plane_angle(toward)
1310
+ # R1-2 reject rather than clamp: an angle beyond the band means the
1311
+ # cue is unreliable in this plane chart (both depth-plane kitchens
1312
+ # saturated the old +/-30 clip), and a saturated guess lays every
1313
+ # tile visibly wrong. Same philosophy as the primary path's gates.
1314
+ if angle is not None and abs(angle) <= 30.0:
1315
+ return float(angle)
1316
 
1317
  return 0.0
1318
 
 
1401
  if depth is not None and floor_mask.any():
1402
  floor_depth = depth[floor_mask > 0]
1403
  lo, hi = float(np.percentile(floor_depth, 2)), float(np.percentile(floor_depth, 98))
1404
+ # R1-1 scale-free margin: works for normalised [0,1] relative depth
1405
+ # and for metric metres alike (the old 0.08 floor assumed [0,1]).
1406
+ margin = max((hi - lo) * 0.35, 0.04 * max(abs(hi), 1e-6))
1407
  depth_keep = (depth >= lo - margin) & (depth <= hi + margin)
1408
  surface = (surface & depth_keep.astype(np.uint8)).astype(np.uint8)
1409
  surface[floor_mask > 0] = np.maximum(surface[floor_mask > 0], 1)
 
1627
  print(f"[TIMING] Depth estimation took {time.perf_counter() - t0:.3f} seconds", flush=True)
1628
 
1629
  t0 = time.perf_counter()
1630
+ homography, plane = estimate_floor_plane(floor_mask, img_np, depth)
1631
+ if plane is not None:
1632
+ # R1-3 — physical plane scale; null on relative depth so the frontend
1633
+ # falls back to the heuristic repeat.
1634
+ plane["metersPerUnit"] = estimate_meters_per_unit(depth, floor_mask, homography, w, h)
1635
  print(f"[TIMING] Plane fitting / homography calculation took {time.perf_counter() - t0:.3f} seconds", flush=True)
1636
 
1637
  t0 = time.perf_counter()
 
1703
  "depthEnabled": depth is not None,
1704
  "shadingEnabled": shade_map is not None,
1705
  },
1706
+ # R1-1 — metric floor depth stats (metres). The seam for R1-2
1707
+ # (plane fit) and R1-3 (true tile scale); null on relative
1708
+ # checkpoints so the frontend can feature-gate.
1709
+ "metricDepth": (
1710
+ {
1711
+ "unit": "m",
1712
+ "floorP5": round(float(np.percentile(depth[region_mask > 0], 5)), 3),
1713
+ "floorP50": round(float(np.percentile(depth[region_mask > 0], 50)), 3),
1714
+ "floorP95": round(float(np.percentile(depth[region_mask > 0], 95)), 3),
1715
+ }
1716
+ if depth is not None and depth_model_is_metric() and (region_mask > 0).any()
1717
+ else None
1718
+ ),
1719
  })
1720
 
1721
  if not segments:
golden_render.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """R0-1 — deterministic golden render: bundle x tile -> PNG.
2
+
3
+ A faithful Python port of the CURRENT frontend composite (canvas-engine.ts):
4
+ texture prep (wrap detection -> period-snap -> masked-shift fallback), mip
5
+ pyramid + trilinear with per-pixel footprint LOD, shade-map decode, homography-
6
+ mapped light vector + gloss-gated specular, colour cast, soft highlight clip,
7
+ confidence-map alpha. Texture-prep and sampling primitives are imported from
8
+ verify_n1_sim so this stays in lockstep with the certified implementations.
9
+
10
+ Usage:
11
+ python golden_render.py <bundle.json[.gz]> <tile-image> <out.png>
12
+
13
+ The output is resized to max-dim 720 so goldens stay small and stable.
14
+ """
15
+
16
+ import base64
17
+ import gzip
18
+ import json
19
+ import sys
20
+
21
+ import numpy as np
22
+ from PIL import Image
23
+
24
+ from verify_n1_sim import (
25
+ build_mips,
26
+ detect_wrap_mode,
27
+ make_seamless,
28
+ period_snap,
29
+ sample_bilinear_wrap,
30
+ )
31
+
32
+ OUT_MAX_DIM = 720
33
+
34
+
35
def load_bundle(path):
    """Load a bundle from a JSON file, transparently handling gzip.

    Args:
        path: Path to the bundle; a name ending in ".gz" is read through
            gzip, anything else as plain text.

    Returns:
        The decoded JSON document.
    """
    opener = gzip.open if str(path).endswith(".gz") else open
    # JSON is UTF-8; decode explicitly rather than via the locale default,
    # and use a context manager so the handle is closed on every path.
    with opener(path, "rt", encoding="utf-8") as f:
        return json.load(f)
40
+
41
+
42
def estimate_gloss(tex):
    """Port of estimateGloss (canvas-engine.ts): mean 4px luminance gradient.

    Luminance is sampled on every 4th row and column; the mean absolute
    difference between horizontally adjacent samples is mapped to a gloss
    value in [0, 1], where a flat texture gives 1 and a gradient of 24 or
    more gives 0.
    """
    red, green, blue = tex[:, :, 0], tex[:, :, 1], tex[:, :, 2]
    lum = red * 0.299 + green * 0.587 + blue * 0.114
    sampled_rows = lum[::4]
    right = sampled_rows[:, 4::4]
    left = sampled_rows[:, :-4:4][:, : right.shape[1]]
    if right.size:
        mean_grad = float(np.mean(np.abs(right - left)))
    else:
        # Texture too narrow to form a single 4px pair.
        mean_grad = 0.0
    return float(np.clip(1 - mean_grad / 24, 0, 1))
49
+
50
+
51
def soft_clip(v):
    """Port of softClipByte: linear below 220, rational shoulder above.

    Values at or below the knee (220) pass through unchanged; above it the
    overshoot t is compressed as knee + t * 35 / (t + 35).
    """
    knee, rng = 220.0, 35.0
    # np.where evaluates both branches, so the shoulder formula also runs
    # for values below the knee. Clamping the overshoot at zero keeps its
    # denominator >= rng there (the raw form divides by zero at v == 185)
    # without changing any selected value.
    t = np.maximum(v - knee, 0.0)
    return np.where(v <= knee, v, knee + (t * rng) / (t + rng))
56
+
57
+
58
def prepare_texture(tile_path):
    """Load a tile image and make it tileable.

    The tile is decoded as RGB. If detect_wrap_mode reports "wrap" it is
    used as is. Otherwise period_snap is tried; when it reports a "snap"
    the snapped texture is used and the repeat scale becomes the ratio of
    the snapped width to the original width. Failing that, make_seamless is
    applied.

    Args:
        tile_path: Path of the tile image.

    Returns:
        (texture, repeat_scale): the prepared texture array and the factor
        by which the tile repeat width should be scaled (1.0 unless
        period-snapped).
    """
    # Context manager: release the file handle as soon as pixels are copied.
    with Image.open(tile_path) as im:
        tex = np.asarray(im.convert("RGB"))
    source_width = tex.shape[1]

    mode, _, _ = detect_wrap_mode(tex)
    if mode == "wrap":
        return tex, 1.0

    prepared, info = period_snap(tex)
    if info[0] == "snap":
        return prepared, prepared.shape[1] / source_width
    return make_seamless(tex), 1.0
71
+
72
+
73
def render(bundle_path, tile_path):
    """Composite a tile texture onto the largest segment of a bundle.

    Args:
        bundle_path: Path of the bundle JSON (optionally gzipped).
        tile_path: Path of the tile image to lay on the floor.

    Returns:
        A PIL image of the composite, downscaled so its larger side is at
        most OUT_MAX_DIM.

    Raises:
        ValueError: If the bundle has no segments, or the chosen segment
            carries no homography.
    """
    import io  # local import: only needed to decode the embedded image

    d = load_bundle(bundle_path)
    w, h = d["width"], d["height"]
    with Image.open(io.BytesIO(base64.b64decode(d["pixels"]))) as base_img:
        base = np.asarray(base_img.convert("RGB")).astype(np.float64)

    segments = d["segments"]
    if not segments:
        raise ValueError("bundle has no segments to render")
    # Largest segment, judged by the length of its encoded mask.
    seg = max(segments, key=lambda s: len(s["mask"]))

    # The mask is a base64 list of uint32 flat pixel indices.
    mask_idx = np.frombuffer(base64.b64decode(seg["mask"]), dtype=np.uint32)
    mask = np.zeros(w * h, bool)
    mask[mask_idx] = True
    mask = mask.reshape(h, w)

    homography = seg["homography"]
    if homography is None:
        raise ValueError("segment has no homography to render with")
    H = np.asarray(homography, np.float64).reshape(3, 3)
    plane = seg.get("plane") or {}
    plane_w = max(plane.get("width", w), 1)
    plane_h = max(plane.get("height", h), 1)
    plane_cx = plane.get("x", 0) + plane_w / 2
    plane_cy = plane.get("y", 0) + plane_h / 2
    rot_deg = plane.get("defaultRotation") or 0.0
    rad = np.deg2rad(rot_deg)
    cos, sin = np.cos(-rad), np.sin(-rad)

    shade_map = (
        np.frombuffer(base64.b64decode(seg["shadeMap"]), np.uint8).reshape(h, w).astype(np.float64)
        if seg.get("shadeMap")
        else None
    )
    shade_lo, shade_hi = seg.get("shadeRange") or (0.55, 1.35)
    conf = (
        np.frombuffer(base64.b64decode(seg["confidenceMap"]), np.uint8).reshape(h, w).astype(np.float64) / 255.0
        if seg.get("confidenceMap")
        else None
    )
    ct = seg.get("colorTemperature") or {}
    if "cast" in ct:
        ct = ct["cast"]
    col = np.array([ct.get("r", 1.0), ct.get("g", 1.0), ct.get("b", 1.0)])
    lv = seg.get("lightVector")

    tex, repeat_scale = prepare_texture(tile_path)
    # Gloss is estimated on the untouched tile, not the prepared texture.
    with Image.open(tile_path) as tile_img:
        gloss = estimate_gloss(np.asarray(tile_img.convert("RGB")).astype(np.float64))
    th, tw, _ = tex.shape
    mips = build_mips(tex)
    max_l = len(mips) - 1

    # R1-3 — mirror of canvas-engine.ts: metric plane scale when present
    # (pixel-ish or metre plane units alike — backend gates mpu hard),
    # heuristic fallback otherwise (info.scale = 1 in goldens).
    DEFAULT_TILE_M = 0.6
    mpu = plane.get("metersPerUnit")
    repeat_w = 0.0
    if mpu and mpu > 0:
        repeat_w = (DEFAULT_TILE_M / mpu) * repeat_scale
        if not (np.isfinite(repeat_w) and repeat_w > 0):
            repeat_w = 0.0
    if not repeat_w:
        repeat_w = max(48.0, min(plane_w, plane_h) * 0.22) * repeat_scale
    repeat_h = repeat_w * (th / tw)

    ys, xs = np.nonzero(mask)
    xs_f, ys_f = xs.astype(np.float64), ys.astype(np.float64)

    def to_plane(px, py):
        # Image -> plane through the homography; near-zero divisors clamped.
        z = H[2, 0] * px + H[2, 1] * py + H[2, 2]
        z = np.where(np.abs(z) < 1e-6, 1e-6, z)
        return (
            (H[0, 0] * px + H[0, 1] * py + H[0, 2]) / z,
            (H[1, 0] * px + H[1, 1] * py + H[1, 2]) / z,
        )

    # Plane position of each masked pixel and of its right/lower neighbour;
    # the neighbours give the per-pixel texture footprint for LOD selection.
    fx, fy = to_plane(xs_f, ys_f)
    fx1, fy1 = to_plane(xs_f + 1, ys_f)
    fx2, fy2 = to_plane(xs_f, ys_f + 1)

    def rot(ax, ay):
        # Rotate about the plane centre by -defaultRotation.
        dx = ax - plane_cx
        dy = ay - plane_cy
        return dx * cos - dy * sin, dx * sin + dy * cos

    rx, ry = rot(fx, fy)
    rx1, ry1 = rot(fx1, fy1)
    rx2, ry2 = rot(fx2, fy2)

    u = np.mod(rx / repeat_w, 1.0)
    v = np.mod(ry / repeat_h, 1.0)
    tcx, tcy = (rx / repeat_w) * tw, (ry / repeat_h) * th
    du = np.hypot((rx1 / repeat_w) * tw - tcx, (ry1 / repeat_h) * th - tcy)
    dv = np.hypot((rx2 / repeat_w) * tw - tcx, (ry2 / repeat_h) * th - tcy)
    lod = np.log2(np.maximum(np.maximum(du, dv), 1e-3)) + 0.5
    l0 = np.clip(np.floor(lod), 0, max_l).astype(np.int64)
    frac = np.clip(lod - l0, 0, 1)

    # Trilinear sampling: blend each pixel's mip level with the next one.
    sample = np.zeros((len(xs), 3), np.float64)
    for lev in range(max_l + 1):
        sel = l0 == lev
        if not sel.any():
            continue
        a = mips[lev]
        sa = sample_bilinear_wrap(a, u[sel] * a.shape[1], v[sel] * a.shape[0])
        if lev < max_l:
            b = mips[lev + 1]
            sb = sample_bilinear_wrap(b, u[sel] * b.shape[1], v[sel] * b.shape[0])
            sample[sel] = sa + (sb - sa) * frac[sel][:, None]
        else:
            sample[sel] = sa

    shade = (
        shade_lo + (shade_map[ys, xs] / 255.0) * (shade_hi - shade_lo)
        if shade_map is not None
        else np.full(len(xs), 1.0)
    )

    specular = np.zeros(len(xs))
    if lv:
        lvx, lvy = lv.get("x", 0.0), lv.get("y", 0.0)
        # Map the image-space light vector into plane space by pushing a
        # short step from a reference point through the homography.
        a = to_plane(np.array([w * 0.5]), np.array([h * 0.75]))
        step = min(w, h) * 0.05
        b = to_plane(np.array([w * 0.5 + lvx * step]), np.array([h * 0.75 + lvy * step]))
        dxv, dyv = b[0][0] - a[0][0], b[1][0] - a[1][0]
        ln = np.hypot(dxv, dyv)
        if ln > 1e-6:
            lvx, lvy = dxv / ln, dyv / ln
        dfx = (fx - plane_cx) / (plane_w * 0.5)
        dfy = (fy - plane_cy) / (plane_h * 0.5)
        dlen = np.hypot(dfx, dfy)
        ok = dlen > 0.01
        dot = np.where(ok, (dfx * lvx + dfy * lvy) / np.maximum(dlen, 1e-9), 0.0)
        specular = 0.12 * gloss * np.maximum(0, dot) ** 4

    texel = soft_clip(sample * shade[:, None] * col[None, :] + specular[:, None] * 255.0)
    alpha = conf[ys, xs][:, None] if conf is not None else np.ones((len(xs), 1))

    out = base.copy()
    out[ys, xs] = np.clip(texel * alpha + base[ys, xs] * (1 - alpha), 0, 255)
    img = Image.fromarray(out.astype(np.uint8))
    scale = min(OUT_MAX_DIM / max(img.size), 1.0)
    if scale < 1.0:
        img = img.resize((round(img.width * scale), round(img.height * scale)), Image.BILINEAR)
    return img
213
+
214
+
215
def main():
    """Command-line entry point: render one bundle/tile pair to a PNG.

    Expects exactly three arguments (bundle, tile, output path). Prints the
    module usage text and returns 2 otherwise; returns 0 after saving.
    """
    args = sys.argv[1:]
    if len(sys.argv) != 4:
        print(__doc__)
        return 2
    bundle_path, tile_path, out_path = args
    render(bundle_path, tile_path).save(out_path)
    print(f"saved {out_path}")
    return 0
222
+
223
+
224
+ if __name__ == "__main__":
225
+ raise SystemExit(main())
verify_goldens.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """R0-1 — golden-image gate.
2
+
3
+ Renders the reference matrix (bundle x tile) through golden_render and compares
4
+ against the committed goldens. Any drift beyond tolerance fails with a
5
+ side-by-side (golden | current | amplified diff) written to verify_out/.
6
+
7
+ Usage:
8
+ python verify_goldens.py # check against goldens (CI mode)
9
+ python verify_goldens.py --bless # regenerate goldens (intentional change)
10
+
11
+ Tolerances: renders are deterministic numpy, so genuine engine changes show up
12
+ as large diffs; the small allowance absorbs PIL/numpy version drift only.
13
+ """
14
+
15
+ import os
16
+ import sys
17
+
18
+ import numpy as np
19
+ from PIL import Image
20
+
21
+ import golden_render
22
+
23
+ HERE = os.path.dirname(os.path.abspath(__file__))
24
+ TILES = os.path.join(HERE, "..", "..", "frontend", "viz2d-demo", "src", "assets", "tiles")
25
+ GOLDEN_DIR = os.path.join(HERE, "goldens")
26
+ OUT = os.path.join(HERE, "verify_out")
27
+
28
+ MEAN_TOL = 0.5 # mean abs diff per channel
29
+ P999_TOL = 8.0 # 99.9th percentile abs diff
30
+
31
+ MATRIX = [
32
+ # (golden name, bundle, tile) — tiles cover the three texture-prep paths:
33
+ # checkered = period-snap, rustic-wood = masked-shift, basalt = native wrap
34
+ ("desk_checkered", "data/current_bundle.vizbundle.json", "checkered.jpeg"),
35
+ ("desk_rustic", "data/current_bundle.vizbundle.json", "rustic-wood.jpg"),
36
+ ("desk_basalt", "data/current_bundle.vizbundle.json", "basalt-outside-wal.jpg"),
37
+ ("kitchen_checkered", "data/ref_kitchen.vizbundle.json", "checkered.jpeg"),
38
+ ("kitchen_rustic", "data/ref_kitchen.vizbundle.json", "rustic-wood.jpg"),
39
+ ("kitchen_basalt", "data/ref_kitchen.vizbundle.json", "basalt-outside-wal.jpg"),
40
+ ]
41
+
42
+
43
def main():
    """Render every MATRIX entry and compare it with its committed golden.

    With "--bless" on the command line the goldens are rewritten instead of
    checked. In check mode a missing golden, a size change, or a diff beyond
    MEAN_TOL / P999_TOL fails the run; tolerance failures also write a
    side-by-side panel into OUT.

    Returns:
        0 when blessing or when every check passes, 1 otherwise.
    """
    bless = "--bless" in sys.argv
    for folder in (GOLDEN_DIR, OUT):
        os.makedirs(folder, exist_ok=True)
    failures = 0

    for name, bundle, tile in MATRIX:
        bundle_path = os.path.join(HERE, bundle)
        tile_path = os.path.join(TILES, tile)
        img = golden_render.render(bundle_path, tile_path)
        golden_path = os.path.join(GOLDEN_DIR, f"{name}.png")

        if bless:
            img.save(golden_path)
            print(f" blessed {name}.png ({img.width}x{img.height})")
            continue

        if not os.path.exists(golden_path):
            print(f" [FAIL] {name}: golden missing — run `make bless`")
            failures += 1
            continue

        cur = np.asarray(img).astype(np.float64)
        gold = np.asarray(Image.open(golden_path).convert("RGB")).astype(np.float64)
        if cur.shape != gold.shape:
            print(f" [FAIL] {name}: size changed {gold.shape} -> {cur.shape}")
            failures += 1
            continue

        diff = np.abs(cur - gold)
        mean_d = float(diff.mean())
        p999 = float(np.percentile(diff, 99.9))
        passed = mean_d <= MEAN_TOL and p999 <= P999_TOL
        verdict = "PASS" if passed else "FAIL"
        print(f" [{verdict}] {name}: mean={mean_d:.3f} p99.9={p999:.1f}")
        if passed:
            continue

        failures += 1
        # Failure artefact: golden | current | diff amplified x8.
        amplified = np.clip(diff * 8, 0, 255).astype(np.uint8)
        panel = np.hstack([gold.astype(np.uint8), cur.astype(np.uint8), amplified])
        fail_path = os.path.join(OUT, f"golden_fail_{name}.png")
        Image.fromarray(panel).save(fail_path)
        print(f" side-by-side: {fail_path} (golden | current | diff x8)")

    if bless:
        print("goldens regenerated — commit backend/floor-visualizer/goldens/")
        return 0
    print("\n" + ("GOLDEN CHECKS FAILED" if failures else "ALL GOLDEN CHECKS PASSED"))
    return 1 if failures else 0
90
+
91
+
92
+ if __name__ == "__main__":
93
+ raise SystemExit(main())
verify_n1_parity.sh ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# R0-1 — N1 parity gate: the TypeScript periodSnap/detectWrapMode in
# canvas-engine.ts must make the same decisions as the certified Python
# implementations in verify_n1_sim.py, on the real catalog tiles.
#
# Requires: python3 (PIL, numpy), node, esbuild (present via frontend
# node_modules). The script switches to its own directory before running,
# so it can be started from anywhere.
set -euo pipefail

HERE="$(cd "$(dirname "$0")" && pwd)"
FRONTEND="$HERE/../../frontend/viz2d-demo"
TMP="$(mktemp -d)"
trap 'rm -rf "$TMP"' EXIT

# The Python step imports verify_n1_sim from "." and reads the tiles through
# a relative path; anchor the working directory so both resolve regardless
# of where the script was invoked from.
cd "$HERE"

# 1. Python: decode tiles to raw RGBA + record expected decisions
python3 - "$TMP" << 'EOF'
import json
import sys

import numpy as np
from PIL import Image

sys.path.insert(0, ".")
from verify_n1_sim import detect_wrap_mode, period_snap

tmp = sys.argv[1]
TILES = "../../frontend/viz2d-demo/src/assets/tiles"
cases = ["checkered.jpeg", "rustic-wood.jpg", "floor-natural-stone.jpg",
         "basalt-outside-wal.jpg", "mosaic-tile.jpg"]
expected = {}
for name in cases:
    key = name.split(".")[0]
    im = Image.open(f"{TILES}/{name}").convert("RGBA")
    # Close the handle explicitly: the node step reads this file afterwards.
    with open(f"{tmp}/{key}.bin", "wb") as raw_file:
        raw_file.write(im.tobytes())
    rgb = np.asarray(im.convert("RGB"))
    mode, _, _ = detect_wrap_mode(rgb)
    snap = None
    if mode != "wrap":
        out, info = period_snap(rgb)
        if info[0] == "snap":
            snap = [out.shape[1], out.shape[0]]
    expected[key] = {"w": im.width, "h": im.height,
                     "mode": "wrap" if mode == "wrap" else "mirror", "snap": snap}
with open(f"{tmp}/expected.json", "w") as expected_file:
    json.dump(expected, expected_file)
print("python decisions:", json.dumps(expected, default=str))
EOF

# 2. Compile the actual frontend engine and replay the same decisions
(cd "$FRONTEND" && npx esbuild src/visualizer-demo/canvas-engine.ts \
  --format=cjs --outfile="$TMP/ce.cjs" --log-level=error)

node - "$TMP" << 'EOF'
const fs = require("fs");
const path = require("path");
const tmp = process.argv[2];
const { periodSnap, detectWrapMode } = require(path.join(tmp, "ce.cjs"));
const expected = JSON.parse(fs.readFileSync(path.join(tmp, "expected.json")));
let ok = true;
for (const [key, exp] of Object.entries(expected)) {
  const raw = new Uint8ClampedArray(fs.readFileSync(path.join(tmp, key + ".bin")));
  const mode = detectWrapMode(raw, exp.w, exp.h);
  let snap = null;
  if (mode !== "wrap") {
    const r = periodSnap(raw, exp.w, exp.h);
    if (r) snap = [r.w, r.h];
  }
  const pass = mode === exp.mode && JSON.stringify(snap) === JSON.stringify(exp.snap);
  console.log(` [${pass ? "PASS" : "FAIL"}] ${key}: ts mode=${mode} snap=${JSON.stringify(snap)}`
    + (pass ? "" : ` expected mode=${exp.mode} snap=${JSON.stringify(exp.snap)}`));
  if (!pass) ok = false;
}
console.log(ok ? "N1 PARITY PASSED" : "N1 PARITY FAILED");
process.exit(ok ? 0 : 1);
EOF
verify_r1_metric.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """R1-1 — metric depth certification (local harness; needs torch+transformers
2
+ and reference room photos — not part of `make verify`, which uses precomputed
3
+ bundles).
4
+
5
+ Runs the configured depth checkpoint on reference room photos and validates
6
+ that the output is genuinely METRIC:
7
+
8
+ 1. floor depth range plausible for an interior (p5/p95 within 0.3-20 m)
9
+ 2. ground-plane consistency: on a floor plane, inverse depth is linear in
10
+ image row (1/Z = (y - y_horizon) / (h_cam * f)); the fit must hold
11
+ (R^2 >= 0.9 over floor rows)
12
+ 3. absolute scale: the camera height recovered from that fit's slope
13
+ (h = 1 / (slope * f), f ~ image width) must land in 0.7-2.5 m — the
14
+ handheld-phone band. This is the automated equivalent of the backlog's
15
+ "door height ~2.0 m +/-15%" check: both test absolute metric scale, but
16
+ this one needs no manual annotation.
17
+
18
+ Usage:
19
+ python verify_r1_metric.py <room-photo.jpg> [more photos...]
20
+ """
21
+
22
+ import sys
23
+
24
+ import cv2
25
+ import numpy as np
26
+ import torch
27
+ from PIL import Image
28
+ from transformers import AutoImageProcessor, AutoModelForDepthEstimation
29
+
30
# single source of truth: read the configured model + metric predicate from app.py
# (opened via a context manager so the handle is closed deterministically, and
# decoded as UTF-8 rather than the platform's locale encoding)
with open("app.py", encoding="utf-8") as _app_file:
    src = _app_file.read()
ns = {}
# Execute only the slice of app.py that starts at the predicate's definition
# and stops just before the ENABLE_DEPTH... assignment.
start = src.index("def depth_model_is_metric")
end = src.index("\nENABLE_DEPTH", start)
exec(compile(src[start:end], "app.py", "exec"), ns)
import re

# The default model name is the first quoted string after the
# "depth_model_name" config key, skipping any comment lines in between.
MODEL = re.search(r'depth_model_name",\s*\n(?:\s*#.*\n)*\s*"([^"]+)"', src).group(1)
depth_model_is_metric = ns["depth_model_is_metric"]

FLOOR_FRAC = 0.45  # treat the bottom 45% of the frame as floor-dominated
42
+
43
+
44
def run_depth(img):
    """Predict a depth map for ``img`` at the image's own resolution.

    Uses the processor and model stored as ``run_depth.processor`` and
    ``run_depth.model`` (they must be assigned before the first call).  The
    prediction is resized to ``(img.height, img.width)`` with bicubic
    interpolation and smoothed with a Gaussian blur (sigma 3).

    Returns:
        The blurred float32 depth array produced by ``cv2.GaussianBlur``.
    """
    batch = run_depth.processor(images=img, return_tensors="pt")
    with torch.no_grad():
        prediction = run_depth.model(**batch).predicted_depth
    # interpolate() wants a channel axis, hence the unsqueeze.
    resized = torch.nn.functional.interpolate(
        prediction.unsqueeze(1),
        size=(img.height, img.width),
        mode="bicubic",
        align_corners=False,
    )
    depth_map = resized.squeeze().numpy().astype(np.float32)
    return cv2.GaussianBlur(depth_map, (0, 0), sigmaX=3)
57
+
58
+
59
def main():
    """Run the R1-1 metric-depth checks on every photo given on the command line.

    For each photo the bottom FLOOR_FRAC of the depth map is checked for a
    plausible range, a straight line is fitted to the row-median inverse
    depth, and the camera height implied by that fit is checked against the
    0.7-2.5 m band.

    Returns:
        2 when no photos were given (the module docstring is printed),
        1 when the configured model is not metric or any photo fails,
        0 when every photo passes.
    """
    photos = sys.argv[1:]
    if not photos:
        print(__doc__)
        return 2

    print(f"model: {MODEL}")
    if not depth_model_is_metric(MODEL):
        print("!! configured model is not metric — R1-1 not in effect")
        return 1

    print("loading checkpoint...")
    run_depth.processor = AutoImageProcessor.from_pretrained(MODEL)
    run_depth.model = AutoModelForDepthEstimation.from_pretrained(MODEL).eval()

    ok = True
    for path in photos:
        img = Image.open(path).convert("RGB")
        # Cap the longest side at 1280 px before running the model.
        if max(img.size) > 1280:
            shrink = 1280 / max(img.size)
            new_size = (round(img.width * shrink), round(img.height * shrink))
            img = img.resize(new_size, Image.LANCZOS)
        w, h = img.size
        depth = run_depth(img)

        floor_top = int(h * (1 - FLOOR_FRAC))
        floor_band = depth[floor_top:, :]
        p5 = np.percentile(floor_band, 5)
        p95 = np.percentile(floor_band, 95)
        range_ok = 0.3 <= p5 and p95 <= 20.0

        # row-median inverse depth over the floor band; fit 1/Z = slope*y + offset
        rows = np.arange(floor_top, h)
        inv_depth = np.array([np.median(1.0 / np.maximum(depth[row], 0.05)) for row in rows])
        slope, offset = np.polyfit(rows, inv_depth, 1)
        fitted = slope * rows + offset
        ss_res = float(np.sum((inv_depth - fitted) ** 2))
        ss_tot = float(np.sum((inv_depth - inv_depth.mean()) ** 2)) + 1e-12
        r2 = 1 - ss_res / ss_tot

        focal = float(w)  # P0 convention: f ~ image width
        if abs(slope) > 1e-12:
            horizon_y = -offset / slope
        else:
            horizon_y = float("nan")
        # exact ground-plane relation for a pitched camera:
        #   1/Z = (sin(t)*f - cos(t)*y') / (h*f)  ->  h = cos(t) / (slope*f)
        # with pitch t recovered from the fitted horizon row.
        pitch = np.arctan2(h / 2 - horizon_y, focal)
        if slope > 1e-9:
            cam_h = float(np.cos(pitch) / (slope * focal))
        else:
            cam_h = float("inf")

        plane_ok = r2 >= 0.90 and slope > 0
        height_ok = 0.7 <= cam_h <= 2.5
        passed = range_ok and plane_ok and height_ok
        ok &= passed

        verdict = "PASS" if passed else "FAIL"
        label = path.split('/')[-1]
        print(
            f" [{verdict}] {label}: "
            f"floor p5-p95 = {p5:.2f}-{p95:.2f} m | invZ-fit R2={r2:.3f} | "
            f"camera height = {cam_h:.2f} m | horizon y = {horizon_y:.0f}/{h}"
        )
        complaints = (
            (range_ok, " !! floor depth outside 0.3-20 m"),
            (plane_ok, " !! inverse depth not linear in row — not plane-consistent"),
            (height_ok, " !! camera height outside handheld band 0.7-2.5 m"),
        )
        for satisfied, message in complaints:
            if not satisfied:
                print(message)

    print("\n" + ("ALL R1-1 METRIC CHECKS PASSED" if ok else "R1-1 METRIC CHECKS FAILED"))
    return 0 if ok else 1
121
+
122
+
123
+ if __name__ == "__main__":
124
+ raise SystemExit(main())
verify_r1_plane_sim.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """R1-2 — depth-based plane-fit fallback certification (CI-safe: analytic
2
+ depth, no torch).
3
+
4
+ Reuses the exact pinhole ground-plane scene from verify_r1_scale_sim (camera
5
+ 1.5 m up, pitch 25 deg, f = image width) and runs the REAL
6
+ plane_homography_from_depth from app.py.
7
+
8
+ Checks:
9
+ 1. engages: returns a homography + plane on a clean metric ground plane
10
+ 2. metric by construction: the certified R1-3 estimator measures
11
+ metersPerUnit ~ 1 on the produced homography (within 2%)
12
+ 3. shear-free and true-to-size: a known 1 m ground square maps to a
13
+ 1 x 1 plane square with right angles (sides within 2%, angle within 2 deg)
14
+ 4. orientation: plane-y grows toward the camera (near field), plane-x
15
+ image-right — the bundle convention the frontend assumes
16
+ 5. rejection: non-planar depth (a dome) returns None
17
+ 6. rejection: relative (normalised) depth returns None
18
+ """
19
+
20
+ import cv2
21
+ import numpy as np
22
+
23
+ from verify_r1_scale_sim import CAM_H, F, H as IMG_H, PITCH, W as IMG_W, scene
24
+
25
# --- real implementations from app.py ----------------------------------------
# Read through a context manager so the handle is closed deterministically,
# and decode as UTF-8 rather than the platform's locale encoding.
with open("app.py", encoding="utf-8") as _app_file:
    src = _app_file.read()
# Namespace the extracted functions run in; the metric predicate is stubbed to
# always answer True (main() swaps it temporarily for the relative-depth check).
ns = {"np": np, "cv2": cv2, "depth_model_is_metric": lambda name=None: True}
for fn in ["estimate_meters_per_unit", "plane_homography_from_depth"]:
    # Each function's source runs from its "def" to the next top-level "def".
    start = src.index(f"def {fn}")
    end = src.index("\ndef ", start + 10)
    exec(compile(src[start:end], "app.py", "exec"), ns)
plane_homography_from_depth = ns["plane_homography_from_depth"]
estimate_meters_per_unit = ns["estimate_meters_per_unit"]
34
+
35
+
36
def to_plane(H, px, py):
    """Apply the 3x3 homography ``H`` to the point ``(px, py)``.

    Returns:
        The transformed ``(x, y)`` pair after the perspective divide.
    """

    def row_value(i):
        # Row i of H applied to the homogeneous point (px, py, 1).
        return H[i, 0] * px + H[i, 1] * py + H[i, 2]

    scale = row_value(2)
    return row_value(0) / scale, row_value(1) / scale
42
+
43
+
44
def project_ground(x_w, fwd_w):
    """Image pixel of a world ground point — inverse of the scene mapping.

    Args:
        x_w: lateral world coordinate of the ground point.
        fwd_w: forward world coordinate of the ground point.

    Returns:
        The ``(u, v)`` pixel coordinates under the pinhole model defined by
        CAM_H, PITCH, F and the image size.
    """
    cos_t = np.cos(PITCH)
    sin_t = np.sin(PITCH)
    # Every ground point sits CAM_H below the camera.
    ground_y = -CAM_H
    # world -> camera frame
    depth_c = cos_t * fwd_w - sin_t * ground_y
    down_c = -sin_t * fwd_w - cos_t * ground_y
    # camera frame -> pixel, principal point at the image centre
    return (
        x_w / depth_c * F + IMG_W / 2.0,
        down_c / depth_c * F + IMG_H / 2.0,
    )
53
+
54
+
55
def main():
    """Run the R1-2 plane-fit simulation checks and print one line per check.

    Returns:
        0 when every check passes, otherwise 1 (including the early exit
        taken when the fallback returns ``None`` on the clean scene).
    """
    mask, z, _x_w, _fwd_w = scene()

    fitted = plane_homography_from_depth(z, mask, IMG_W, IMG_H)
    if fitted is None:
        print(" [FAIL] fallback did not engage on a clean metric ground plane")
        print("\nR1-2 SIM CHECKS FAILED")
        return 1
    hom, plane = fitted
    H = np.asarray(hom, np.float64).reshape(3, 3)
    print(f" [PASS] engages: plane {plane['width']:.2f} x {plane['height']:.2f} m, "
          f"source={plane.get('geometrySource')}")

    outcomes = []

    mpu = estimate_meters_per_unit(z, mask, hom, IMG_W, IMG_H)
    good = mpu is not None and abs(mpu - 1.0) <= 0.02
    verdict = "PASS" if good else "FAIL"
    print(f" [{verdict}] metric: metersPerUnit = {mpu}")
    outcomes.append(good)

    # known 1m ground square in the near field, centred
    cx_w = 0.0
    f0 = CAM_H / np.tan(PITCH) * 0.9  # comfortably inside the visible floor
    offsets = ((-0.5, 0.0), (0.5, 0.0), (0.5, 1.0), (-0.5, 1.0))
    corners_p = np.asarray(
        [to_plane(H, *project_ground(cx_w + dx, f0 + df)) for dx, df in offsets]
    )
    d1 = corners_p[1] - corners_p[0]
    d2 = corners_p[2] - corners_p[1]
    s1 = np.linalg.norm(d1)
    s2 = np.linalg.norm(d2)
    angle = np.degrees(np.arccos(abs(d1 @ d2) / (s1 * s2 + 1e-12)))
    square_ok = abs(s1 - 1) <= 0.02 and abs(s2 - 1) <= 0.02 and angle >= 88.0
    verdict = "PASS" if square_ok else "FAIL"
    print(f" [{verdict}] 1m square -> sides {s1:.3f} x {s2:.3f} m, "
          f"corner angle {angle:.1f} deg")
    outcomes.append(square_ok)

    # orientation: nearer ground (smaller fwd) must have LARGER plane-y;
    # world +x (image right) must have larger plane-x
    a_l, b_near = to_plane(H, *project_ground(0.0, f0))
    _, b_far = to_plane(H, *project_ground(0.0, f0 + 2.0))
    a_r, _ = to_plane(H, *project_ground(1.0, f0))
    orient_ok = b_near > b_far and a_r > a_l
    verdict = "PASS" if orient_ok else "FAIL"
    print(f" [{verdict}] orientation: near-y {b_near:.2f} > far-y {b_far:.2f}, "
          f"right-x {a_r:.2f} > left-x {a_l:.2f}")
    outcomes.append(orient_ok)

    # rejection: dome instead of plane
    yy, xx = np.mgrid[0:IMG_H, 0:IMG_W].astype(np.float64)
    dome = (2.5 - 1.2 * np.exp(-(((xx - IMG_W / 2) / 300) ** 2 + ((yy - IMG_H / 2) / 220) ** 2))).astype(np.float32)
    r_dome = plane_homography_from_depth(dome, mask, IMG_W, IMG_H)
    dome_rejected = r_dome is None
    verdict = "PASS" if dome_rejected else "FAIL"
    shown = None if dome_rejected else "accepted"
    print(f" [{verdict}] rejection: dome depth -> {shown}")
    outcomes.append(dome_rejected)

    # rejection: relative depth (the predicate stub is flipped for one call)
    ns["depth_model_is_metric"] = lambda name=None: False
    r_rel = plane_homography_from_depth(z, mask, IMG_W, IMG_H)
    ns["depth_model_is_metric"] = lambda name=None: True
    rel_rejected = r_rel is None
    verdict = "PASS" if rel_rejected else "FAIL"
    shown = None if rel_rejected else "accepted"
    print(f" [{verdict}] rejection: relative depth -> {shown}")
    outcomes.append(rel_rejected)

    ok = all(outcomes)
    print("\n" + ("ALL R1-2 SIM CHECKS PASSED" if ok else "R1-2 SIM CHECKS FAILED"))
    return 0 if ok else 1
123
+
124
+
125
+ if __name__ == "__main__":
126
+ raise SystemExit(main())
verify_r1_scale.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """R1-3 — metric scene scale certification (local harness; needs
2
+ torch+transformers, a reference bundle, and the photo it was converted from).
3
+
4
+ Validates estimate_meters_per_unit end-to-end:
5
+
6
+ 1. mpu resolves (not None) on real rooms with metric depth
7
+ 2. plausibility: the visible floor's physical bottom width (plane.width *
8
+ mpu) lands in 1.5-10 m
9
+ 3. independence check: a 60 cm tile's on-screen size predicted two ways
10
+ must agree within 15%:
11
+ a) through the scale chain: 0.6/mpu plane units -> homography -> pixels
12
+ b) straight from depth: 0.6 * f / Z at the same image row
13
+ 4. cross-room consistency: the same physical tile, the same prediction
14
+ logic, in every supplied room.
15
+
16
+ Usage:
17
+ python verify_r1_scale.py <bundle.json[.gz]>:<photo.jpg> [more pairs...]
18
+ """
19
+
20
+ import base64
21
+ import gzip
22
+ import json
23
+ import sys
24
+
25
+ import cv2
26
+ import numpy as np
27
+ import torch
28
+ from PIL import Image
29
+ from transformers import AutoImageProcessor, AutoModelForDepthEstimation
30
+
31
# --- real implementations from app.py ---------------------------------------
# Read through a context manager so the handle is closed deterministically,
# and decode as UTF-8 rather than the platform's locale encoding.
with open("app.py", encoding="utf-8") as _app_file:
    src = _app_file.read()
ns = {"np": np, "cv2": cv2}
# Execute the slice of app.py from the predicate's definition up to the
# ENABLE_DEPTH... assignment.
start = src.index("def depth_model_is_metric")
end = src.index("\nENABLE_DEPTH", start)
exec(compile(src[start:end], "app.py", "exec"), ns)
ns["depth_model_is_metric"] = lambda name=None: True  # harness always metric
# The estimator's source runs from its "def" to the next "# ---" section marker.
start = src.index("def estimate_meters_per_unit")
end = src.index("\n# ---", start)
exec(compile(src[start:end], "app.py", "exec"), ns)
estimate_meters_per_unit = ns["estimate_meters_per_unit"]

import re

# The default model name is the first quoted string after the
# "depth_model_name" config key, skipping any comment lines in between.
MODEL = re.search(r'depth_model_name",\s*\n(?:\s*#.*\n)*\s*"([^"]+)"', src).group(1)
46
+
47
+
48
def load_bundle(path):
    """Load a bundle from a JSON file, optionally gzip-compressed.

    Args:
        path: file path; a name ending in ``.gz`` is read through ``gzip``.

    Returns:
        The decoded JSON document.
    """
    opener = gzip.open if path.endswith(".gz") else open
    # The context manager closes the handle on both branches (the plain-file
    # branch used to leave it open); JSON text is decoded as UTF-8.
    with opener(path, "rt", encoding="utf-8") as f:
        return json.load(f)
53
+
54
+
55
def run_depth(img):
    """Predict a depth map for ``img`` at the image's own resolution.

    The processor and model are read from the ``run_depth.processor`` and
    ``run_depth.model`` function attributes, which the caller assigns before
    the first call.  The prediction is resized to ``(img.height, img.width)``
    with bicubic interpolation and then Gaussian-blurred (sigma 3).

    Returns:
        The blurred float32 depth array produced by ``cv2.GaussianBlur``.
    """
    processor = run_depth.processor
    model = run_depth.model

    encoded = processor(images=img, return_tensors="pt")
    with torch.no_grad():
        result = model(**encoded)

    # interpolate() wants a channel axis, hence the unsqueeze.
    upsampled = torch.nn.functional.interpolate(
        result.predicted_depth.unsqueeze(1),
        size=(img.height, img.width),
        mode="bicubic",
        align_corners=False,
    )
    raw = upsampled.squeeze().numpy()
    return cv2.GaussianBlur(raw.astype(np.float32), (0, 0), sigmaX=3)
66
+
67
+
68
def main():
    """Certify ``estimate_meters_per_unit`` on real bundle/photo pairs.

    Each command-line argument has the form ``<bundle>:<photo>``.  For every
    pair the largest-mask segment of the bundle is taken, depth is predicted
    for the photo at the bundle's resolution, and the estimated scale is
    checked for a plausible floor width and for agreement between the
    homography-based and depth-based on-screen size of a 60 cm tile.

    Returns:
        2 when no arguments were given or an argument is not
        ``<bundle>:<photo>``, 0 when every pair passes (a ``None`` scale
        counts as a pass), otherwise 1.
    """
    args = sys.argv[1:]
    if not args:
        print(__doc__)
        return 2

    # Validate every argument before the model is loaded, so a malformed pair
    # gives a usage error instead of an unpacking ValueError later on.
    pairs = []
    for arg in args:
        bundle_path, sep, photo = arg.partition(":")
        if not (sep and bundle_path and photo):
            print(f"!! expected <bundle.json[.gz]>:<photo.jpg>, got {arg!r}")
            return 2
        pairs.append((bundle_path, photo))

    print(f"model: {MODEL}")
    run_depth.processor = AutoImageProcessor.from_pretrained(MODEL)
    run_depth.model = AutoModelForDepthEstimation.from_pretrained(MODEL).eval()

    ok = True
    for bundle_path, photo in pairs:
        d = load_bundle(bundle_path)
        w, h = d["width"], d["height"]
        # Use the segment whose encoded mask is longest.
        seg = max(d["segments"], key=lambda s: len(s["mask"]))
        # The mask is base64 of uint32 flat pixel indices.
        mask_idx = np.frombuffer(base64.b64decode(seg["mask"]), dtype=np.uint32)
        mask = np.zeros(w * h, np.uint8)
        mask[mask_idx] = 1
        mask = mask.reshape(h, w)
        H = np.asarray(seg["homography"], np.float64).reshape(3, 3)

        img = Image.open(photo).convert("RGB").resize((w, h), Image.LANCZOS)
        depth = run_depth(img)

        mpu = estimate_meters_per_unit(depth, mask, seg["homography"], w, h)
        if mpu is None:
            # A clean fallback is acceptable: rooms on the synthetic-VP
            # homography can't carry a trustworthy metric scale until R1-2;
            # the engine then uses the heuristic repeat. FAIL is reserved for
            # a returned-but-wrong scale (checked below).
            print(f" [PASS] {photo.split('/')[-1]}: metersPerUnit = None "
                  f"(clean heuristic fallback — geometry not metric-trustworthy)")
            continue

        plane = seg["plane"]
        floor_w_m = plane["width"] * mpu
        width_ok = 1.5 <= floor_w_m <= 10.0

        # independence check at a bottom-area floor row
        ys, xs = np.nonzero(mask)
        y_ref = int(np.percentile(ys, 92))
        row_xs = xs[ys == y_ref]
        x_ref = int(np.median(row_xs))
        z_ref = float(depth[y_ref, x_ref])
        f = float(w)
        px_from_depth = f * 0.6 / z_ref

        # map (0.6/mpu) plane units back through H^-1 at the same location
        Hinv = np.linalg.inv(H)
        den = H[2, 0] * x_ref + H[2, 1] * y_ref + H[2, 2]
        px_p = (H[0, 0] * x_ref + H[0, 1] * y_ref + H[0, 2]) / den
        py_p = (H[1, 0] * x_ref + H[1, 1] * y_ref + H[1, 2]) / den

        def back(up, vp):
            # Plane point -> image pixel through the inverse homography.
            dz = Hinv[2, 0] * up + Hinv[2, 1] * vp + Hinv[2, 2]
            return (
                (Hinv[0, 0] * up + Hinv[0, 1] * vp + Hinv[0, 2]) / dz,
                (Hinv[1, 0] * up + Hinv[1, 1] * vp + Hinv[1, 2]) / dz,
            )

        units = 0.6 / mpu
        ax, ay = back(px_p - units / 2, py_p)
        bx, by = back(px_p + units / 2, py_p)
        px_from_chain = float(np.hypot(bx - ax, by - ay))
        rel_err = abs(px_from_chain - px_from_depth) / px_from_depth
        chain_ok = rel_err <= 0.15

        passed = width_ok and chain_ok
        ok &= passed
        print(
            f" [{'PASS' if passed else 'FAIL'}] {photo.split('/')[-1]}: "
            f"mpu={mpu:.5f} m/unit | floor width = {floor_w_m:.2f} m | "
            f"60cm tile @row{y_ref}: chain={px_from_chain:.0f}px vs depth={px_from_depth:.0f}px "
            f"(err {rel_err * 100:.1f}%)"
        )
        if not width_ok:
            print(" !! floor physical width implausible")
        if not chain_ok:
            print(" !! scale chain disagrees with direct depth prediction")

    print("\n" + ("ALL R1-3 SCALE CHECKS PASSED" if ok else "R1-3 SCALE CHECKS FAILED"))
    return 0 if ok else 1
149
+
150
+
151
+ if __name__ == "__main__":
152
+ raise SystemExit(main())
verify_r1_scale_sim.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """R1-3 — metric scale certification on an exact synthetic scene (CI-safe:
2
+ no torch, analytic depth).
3
+
4
+ Scene: pinhole camera (f = image width), height 1.5 m, pitch 25 deg, looking
5
+ at an infinite ground plane. Depth is computed analytically, the homography is
6
+ built exactly from four ground points with a known plane-unit scale, so the
7
+ true metersPerUnit is known in closed form.
8
+
9
+ Checks (real implementation extracted from app.py):
10
+ 1. recovery: estimate_meters_per_unit returns the true scale within 2%
11
+ 2. rejection: a sheared homography (the synthetic-VP failure mode) returns
12
+ None instead of a confidently-wrong scale
13
+ 3. relative depth (normalised [0,1]) returns None (metric-only feature)
14
+ """
15
+
16
+ import cv2
17
+ import numpy as np
18
+
19
# --- real implementation from app.py ----------------------------------------
# Read through a context manager so the handle is closed deterministically,
# and decode as UTF-8 rather than the platform's locale encoding.
with open("app.py", encoding="utf-8") as _app_file:
    src = _app_file.read()
# Namespace the extracted estimator runs in; the metric predicate is stubbed
# to always answer True (main() replaces it for the relative-depth check).
ns = {"np": np, "cv2": cv2, "depth_model_is_metric": lambda name=None: True}
# The estimator's source runs from its "def" to the next "# ---" section marker.
start = src.index("def estimate_meters_per_unit")
end = src.index("\n# ---", start)
exec(compile(src[start:end], "app.py", "exec"), ns)
estimate_meters_per_unit = ns["estimate_meters_per_unit"]

W, H = 800, 600
F = float(W)
CAM_H = 1.5
PITCH = np.deg2rad(25.0)
UNITS_PER_M = 200.0  # plane-unit scale baked into the homography
TRUE_MPU = 1.0 / UNITS_PER_M
33
+
34
+
35
def scene():
    """Build the analytic ground-plane scene for the W x H pinhole camera.

    Returns:
        A tuple ``(mask, z, x_w, fwd_w)``: a uint8 mask of pixels whose
        ground depth is under 30 m, the float32 depth map (0 outside the
        mask), and the per-pixel lateral and forward ground coordinates.

    Raises:
        AssertionError: if the reconstructed ground height is not -CAM_H.
    """
    sin_t = np.sin(PITCH)
    cos_t = np.cos(PITCH)
    centre_x = W / 2.0
    centre_y = H / 2.0
    cols, rows = np.meshgrid(
        np.arange(W, dtype=np.float64),
        np.arange(H, dtype=np.float64),
    )

    # ground plane: 1/Z = (sin(t) + cos(t) * (v - cy)/f) / h   (v grows downward)
    inv_z = (sin_t + cos_t * (rows - centre_y) / F) / CAM_H
    visible = inv_z > 1.0 / 30.0  # floor visible, within 30 m
    z = np.where(visible, 1.0 / np.maximum(inv_z, 1e-9), 0.0)

    # camera-frame 3D, then world ground coordinates. Camera pitched DOWN by
    # PITCH, world y up, image v down: world_y = -cos*y_c - sin*z (must be
    # exactly -CAM_H on the ground — asserted), forward = cos*z - sin*y_c.
    x_c = z * (cols - centre_x) / F
    y_c = z * (rows - centre_y) / F
    world_y = -cos_t * y_c - sin_t * z
    assert np.allclose(world_y[visible], -CAM_H, atol=1e-9), "sim geometry inconsistent"
    forward = cos_t * z - sin_t * y_c
    return visible.astype(np.uint8), z.astype(np.float32), x_c, forward
53
+
54
+
55
def exact_homography(mask, x_w, fwd_w):
    """Build the image -> plane homography from four masked ground pixels.

    Four pixels are picked at fixed percentile positions inside the mask and
    paired with their ground coordinates scaled by UNITS_PER_M.

    Returns:
        The 3x3 matrix from ``cv2.getPerspectiveTransform``.
    """
    ys, xs = np.nonzero(mask)

    def pick(frac_y, frac_x):
        # Row at the requested percentile of masked rows, then the column at
        # the requested percentile within that row.
        row = int(np.percentile(ys, frac_y * 100))
        col = int(np.percentile(xs[ys == row], frac_x * 100))
        return col, row

    # four well-spread ground points
    fractions = [(0.95, 0.2), (0.95, 0.8), (0.55, 0.3), (0.55, 0.7)]
    picks = [pick(fy, fx) for fy, fx in fractions]

    image_pts = np.float32(picks)
    plane_pts = np.float32(
        [[x_w[row, col] * UNITS_PER_M, fwd_w[row, col] * UNITS_PER_M] for col, row in picks]
    )
    return cv2.getPerspectiveTransform(image_pts, plane_pts)
69
+
70
+
71
def main():
    """Run the R1-3 scale simulation checks and print one line per check.

    Checks recovery of TRUE_MPU on the exact scene, rejection of a sheared
    homography, and rejection of normalised (relative) depth.

    Returns:
        0 when every check passes, otherwise 1.
    """
    mask, z, x_w, fwd_w = scene()
    Hm = exact_homography(mask, x_w, fwd_w)
    outcomes = []

    mpu = estimate_meters_per_unit(z, mask, Hm.flatten().tolist(), W, H)
    if mpu is not None:
        err = abs(mpu - TRUE_MPU) / TRUE_MPU
        good = err <= 0.02
        verdict = "PASS" if good else "FAIL"
        print(f" [{verdict}] recovery: mpu={mpu:.6f} "
              f"(true {TRUE_MPU:.6f}, err {err * 100:.2f}%)")
        outcomes.append(good)
    else:
        print(" [FAIL] recovery: returned None on exact scene")
        outcomes.append(False)

    # synthetic-VP failure mode: progressive horizontal shear of plane coords
    shear = np.array([[1.0, 0.35, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])
    H_bad = shear @ Hm
    mpu_bad = estimate_meters_per_unit(z, mask, H_bad.flatten().tolist(), W, H)
    verdict = "PASS" if mpu_bad is None else "FAIL"
    print(f" [{verdict}] rejection: sheared homography -> {mpu_bad}")
    outcomes.append(mpu_bad is None)

    # Normalise depth to [0, 1] over the floor pixels and flip the metric
    # predicate stub so the estimator sees a relative-depth model.
    floor_z = z[mask > 0]
    z_min = floor_z.min()
    z_max = floor_z.max()
    rel = (z - z_min) / (z_max - z_min)
    ns["depth_model_is_metric"] = lambda name=None: False
    mpu_rel = estimate_meters_per_unit(rel.astype(np.float32), mask, Hm.flatten().tolist(), W, H)
    verdict = "PASS" if mpu_rel is None else "FAIL"
    print(f" [{verdict}] relative depth -> {mpu_rel}")
    outcomes.append(mpu_rel is None)

    ok = all(outcomes)
    print("\n" + ("ALL R1-3 SIM CHECKS PASSED" if ok else "R1-3 SIM CHECKS FAILED"))
    return 0 if ok else 1
102
+
103
+
104
+ if __name__ == "__main__":
105
+ raise SystemExit(main())
visualizer.gpu.toml CHANGED
@@ -7,7 +7,7 @@ segmentation_model = "oneformer"
7
  oneformer_model_name = "shi-labs/oneformer_ade20k_swin_large"
8
  mask2former_model_name = "facebook/mask2former-swin-small-ade-semantic"
9
  segformer_model_name = "nvidia/segformer-b2-finetuned-ade-512-512"
10
- depth_model_name = "Intel/dpt-large"
11
  intrinsic_model_version = "v2"
12
 
13
  [runtime]
 
7
  oneformer_model_name = "shi-labs/oneformer_ade20k_swin_large"
8
  mask2former_model_name = "facebook/mask2former-swin-small-ade-semantic"
9
  segformer_model_name = "nvidia/segformer-b2-finetuned-ade-512-512"
10
+ depth_model_name = "depth-anything/Depth-Anything-V2-Metric-Indoor-Small-hf"
11
  intrinsic_model_version = "v2"
12
 
13
  [runtime]
visualizer.hf.toml CHANGED
@@ -5,7 +5,7 @@
5
  [models]
6
  segmentation_model = "oneformer"
7
  oneformer_model_name = "shi-labs/oneformer_ade20k_swin_large"
8
- depth_model_name = "Intel/dpt-large"
9
  intrinsic_model_version = "v2"
10
 
11
  [runtime]
 
5
  [models]
6
  segmentation_model = "oneformer"
7
  oneformer_model_name = "shi-labs/oneformer_ade20k_swin_large"
8
+ depth_model_name = "depth-anything/Depth-Anything-V2-Metric-Indoor-Small-hf"
9
  intrinsic_model_version = "v2"
10
 
11
  [runtime]
visualizer.local.toml CHANGED
@@ -5,7 +5,7 @@
5
  [models]
6
  segmentation_model = "segformer"
7
  segformer_model_name = "nvidia/segformer-b2-finetuned-ade-512-512"
8
- depth_model_name = "Intel/dpt-large"
9
 
10
  [runtime]
11
  enable_depth_estimation = false
 
5
  [models]
6
  segmentation_model = "segformer"
7
  segformer_model_name = "nvidia/segformer-b2-finetuned-ade-512-512"
8
+ depth_model_name = "depth-anything/Depth-Anything-V2-Metric-Indoor-Small-hf"
9
 
10
  [runtime]
11
  enable_depth_estimation = false
visualizer.segformer.toml CHANGED
@@ -6,7 +6,7 @@
6
  [models]
7
  segmentation_model = "segformer"
8
  segformer_model_name = "nvidia/segformer-b2-finetuned-ade-512-512"
9
- depth_model_name = "Intel/dpt-large"
10
 
11
  [runtime]
12
  enable_depth_estimation = false
 
6
  [models]
7
  segmentation_model = "segformer"
8
  segformer_model_name = "nvidia/segformer-b2-finetuned-ade-512-512"
9
+ depth_model_name = "depth-anything/Depth-Anything-V2-Metric-Indoor-Small-hf"
10
 
11
  [runtime]
12
  enable_depth_estimation = false