Spaces:

modventures
/

room-visualizer

Running on T4

App Files Files Community

GitHub Actions committed about 6 hours ago

Commit

b20c82e

1 Parent(s): 6da2618

Deploy from GitHub commit ca72656c17476e5aa37a4735af6e47ff9f94fa1a

Browse files

Files changed (12) hide show

app.py +315 -7
golden_render.py +225 -0
verify_goldens.py +93 -0
verify_n1_parity.sh +74 -0
verify_r1_metric.py +124 -0
verify_r1_plane_sim.py +126 -0
verify_r1_scale.py +152 -0
verify_r1_scale_sim.py +105 -0
visualizer.gpu.toml +1 -1
visualizer.hf.toml +1 -1
visualizer.local.toml +1 -1
visualizer.segformer.toml +1 -1

app.py CHANGED Viewed

@@ -112,8 +112,17 @@ DEPTH_MODEL_NAME = str(config_value(
     "DEPTH_MODEL_NAME",
     "models",
     "depth_model_name",
-    "Intel/dpt-large",
 ))
 ENABLE_DEPTH_ESTIMATION = str(config_value(
     "ENABLE_DEPTH_ESTIMATION",
     "runtime",
@@ -301,6 +310,18 @@ def estimate_depth(img: Image.Image, width: int, height: int):
         depth_min, depth_max = float(np.min(depth)), float(np.max(depth))
         if depth_max - depth_min < 1e-6:
             return None
         return (depth - depth_min) / (depth_max - depth_min)
     except Exception as exc:
         print(f"Depth estimation skipped ({exc}).", flush=True)
@@ -811,7 +832,144 @@ def detect_dual_vanishing_points(
     return primary, secondary
-def estimate_floor_plane(mask: np.ndarray, img_np: np.ndarray):
     ys, xs = np.where(mask > 0)
     if len(xs) < 1000:
         return None, None
@@ -891,7 +1049,9 @@ def estimate_floor_plane(mask: np.ndarray, img_np: np.ndarray):
         vp_y = float(vanishing_point["y"])
     else:
         # No usable VP: assume an eye-level shot with the horizon a little
-        # above the floor's top edge, centred over the bottom edge.
         vp_x = (bl_x + br_x) * 0.5
         vp_y = top_y_f - 0.35 * span_y
     # Keep the horizon clear of the top edge so the transform stays
@@ -923,6 +1083,27 @@ def estimate_floor_plane(mask: np.ndarray, img_np: np.ndarray):
     H = cv2.getPerspectiveTransform(src, dst)
     homography = H.flatten().tolist()
     default_rotation = estimate_default_rotation(mask, H, vanishing_point2)
     return homography, {
@@ -938,6 +1119,110 @@ def estimate_floor_plane(mask: np.ndarray, img_np: np.ndarray):
     }
 # ---------------------------------------------------------------------------
 # P1-4 — Default Tile Rotation
 # Tiles were laid along the plane's x-axis (= the image bbox axis), which is
@@ -1022,8 +1307,12 @@ def estimate_default_rotation(
             [cx + (vanishing_point2["x"] - cx) * 0.25, cy + (vanishing_point2["y"] - cy) * 0.25],
         ])
         angle = _plane_angle(toward)
-        if angle is not None:
-            return float(np.clip(angle, -30.0, 30.0))
     return 0.0
@@ -1112,7 +1401,9 @@ def build_floor_surface_mask(
     if depth is not None and floor_mask.any():
         floor_depth = depth[floor_mask > 0]
         lo, hi = float(np.percentile(floor_depth, 2)), float(np.percentile(floor_depth, 98))
-        margin = max(0.08, (hi - lo) * 0.35)
         depth_keep = (depth >= lo - margin) & (depth <= hi + margin)
         surface = (surface & depth_keep.astype(np.uint8)).astype(np.uint8)
         surface[floor_mask > 0] = np.maximum(surface[floor_mask > 0], 1)
@@ -1336,7 +1627,11 @@ def build_segmentation_bundle(contents: bytes):
     print(f"[TIMING] Depth estimation took {time.perf_counter() - t0:.3f} seconds", flush=True)
     t0 = time.perf_counter()
-    homography, plane = estimate_floor_plane(floor_mask, img_np)
     print(f"[TIMING] Plane fitting / homography calculation took {time.perf_counter() - t0:.3f} seconds", flush=True)
     t0 = time.perf_counter()
@@ -1408,6 +1703,19 @@ def build_segmentation_bundle(contents: bytes):
                     "depthEnabled": depth is not None,
                     "shadingEnabled": shade_map is not None,
                 },
             })
     if not segments:

     "DEPTH_MODEL_NAME",
     "models",
     "depth_model_name",
+    # R1-1 — metric indoor checkpoint: predicts depth in metres (and is ~10x
+    # smaller than dpt-large). Foundation for R1-2 plane fit and R1-3 scale.
+    "depth-anything/Depth-Anything-V2-Metric-Indoor-Small-hf",
 ))
+def depth_model_is_metric(name: str | None = None) -> bool:
+    """Report whether a depth checkpoint is treated as metric.
+    A checkpoint counts as metric when its id contains the substring
+    'metric', compared case-insensitively (Depth Anything V2 Metric,
+    Metric3D...). An empty or missing ``name`` falls back to the configured
+    DEPTH_MODEL_NAME."""
+    model_id = name if name else DEPTH_MODEL_NAME
+    return model_id.lower().find("metric") >= 0
 ENABLE_DEPTH_ESTIMATION = str(config_value(
     "ENABLE_DEPTH_ESTIMATION",
     "runtime",
         depth_min, depth_max = float(np.min(depth)), float(np.max(depth))
         if depth_max - depth_min < 1e-6:
             return None
+        if depth_model_is_metric(model_name):
+            # R1-1 — keep the metres. Downstream filters are scale-agnostic
+            # (percentile bands), and R1-2/R1-3 need real geometry. Gate on
+            # plausibility: an indoor scene lives within ~0.1-30 m; anything
+            # else means the checkpoint or processor mis-loaded.
+            if depth_min < -0.5 or depth_max > 50.0 or depth_max < 0.2:
+                print(
+                    f"Metric depth implausible [{depth_min:.2f}, {depth_max:.2f}] m — skipped.",
+                    flush=True,
+                )
+                return None
+            return np.clip(depth, 0.05, 40.0)
         return (depth - depth_min) / (depth_max - depth_min)
     except Exception as exc:
         print(f"Depth estimation skipped ({exc}).", flush=True)
     return primary, secondary
+def plane_homography_from_depth(
+    depth: np.ndarray | None, mask: np.ndarray, w: int, h: int
+):
+    """R1-2 — analytic floor homography from a metric 3D plane fit.
+    When vanishing-point detection fails, the old fallback invented a
+    synthetic horizon, which warped steep/wide rooms (sheared checkers, the
+    far-centre pinch). With metric depth the ground plane can be measured
+    instead: back-project floor pixels (pinhole, f ~ image width), fit the
+    plane robustly, and build the image->plane homography exactly as
+    inv(K @ [u v c]) where u,v are orthonormal in-plane axes and c is a point
+    on the plane. Plane units are METRES, so metersPerUnit is 1 by
+    construction and true-to-size tiling (R1-3) activates in precisely the
+    rooms the trapezoid fallback used to distort.
+    Args:
+        depth: depth map indexed as depth[y, x]. None, or a configured depth
+            model that is not metric, makes the function return None.
+        mask: floor mask; its non-zero pixels are the candidate samples.
+        w, h: image width and height in pixels. The pinhole model uses
+            f = w and a principal point at the image centre.
+    Returns (homography_list, plane_dict) or None (caller falls back to the
+    synthetic-VP construction). homography_list is the row-major flattened
+    3x3 image->plane matrix; plane_dict carries "x", "y", "width", "height",
+    "quad" and "geometrySource".
+    """
+    if depth is None or not depth_model_is_metric():
+        return None
+    ys, xs = np.nonzero(mask)
+    # fewer than 2000 floor pixels: give up rather than fit
+    if len(xs) < 2000:
+        return None
+    # fixed seed: the subsample of at most 6000 pixels is reproducible
+    rng = np.random.default_rng(7)
+    sel = rng.choice(len(xs), size=min(6000, len(xs)), replace=False)
+    x_f = xs[sel].astype(np.float64)
+    y_f = ys[sel].astype(np.float64)
+    z = depth[ys[sel], xs[sel]].astype(np.float64)
+    f = float(w)
+    cx, cy = w / 2.0, h / 2.0
+    # back-project every sampled pixel to a camera-space 3D point
+    P = np.stack([(x_f - cx) / f * z, (y_f - cy) / f * z, z], axis=1)
+    # iteratively trimmed total-least-squares plane fit
+    keep = np.ones(len(P), bool)
+    n = np.array([0.0, -1.0, 0.0])
+    c = P.mean(axis=0)
+    for _ in range(4):
+        # refit on the kept points, then re-select from ALL samples those
+        # whose residual is within the 80th percentile of the kept set
+        c = P[keep].mean(axis=0)
+        q = P[keep] - c
+        _, _, vt = np.linalg.svd(q, full_matrices=False)
+        # last right singular vector = direction of least variance = normal
+        n = vt[2]
+        resid = np.abs((P - c) @ n)
+        thr = max(float(np.percentile(resid[keep], 80)), 1e-4)
+        keep = resid <= thr
+    resid = np.abs((P - c) @ n)
+    med_z = float(np.median(z))
+    # inlier band: 0.03 or 2.5% of the median depth, whichever is larger
+    inliers = resid <= max(0.03, 0.025 * med_z)
+    # under 60% of the samples on the plane: reject the fit
+    if inliers.mean() < 0.6:
+        return None
+    # final refit on the inliers only
+    c = P[inliers].mean(axis=0)
+    q = P[inliers] - c
+    _, _, vt = np.linalg.svd(q, full_matrices=False)
+    n = vt[2]
+    if n @ c > 0:
+        n = -n  # face the camera
+    # floor sanity: the normal must point up-ish (camera y is down), not at a
+    # wall-like angle — protects against fitting a dominant wall/cabinet face.
+    if n[1] > -0.5:
+        return None
+    # in-plane axes: u image-horizontal-ish, v toward the camera (near field),
+    # matching the existing convention that plane-y grows toward the viewer
+    u = np.cross(n, [0.0, 0.0, 1.0])
+    if np.linalg.norm(u) < 0.2:  # camera looking straight down
+        u = np.cross(n, [0.0, 1.0, 0.0])
+    u /= np.linalg.norm(u)
+    v = np.cross(n, u)
+    v /= np.linalg.norm(v)
+    K = np.array([[f, 0, cx], [0, f, cy], [0, 0, 1.0]])
+    # M maps plane coordinates (a, b, 1) to homogeneous pixels: K @ (c + a*u + b*v)
+    M = K @ np.stack([u, v, c], axis=1)
+    if abs(np.linalg.det(M)) < 1e-9:
+        return None
+    H = np.linalg.inv(M)
+    H = H / H[2, 2]
+    # orient axes by their image-space direction at the floor centroid
+    def to_plane(px, py):
+        den = H[2, 0] * px + H[2, 1] * py + H[2, 2]
+        return (
+            (H[0, 0] * px + H[0, 1] * py + H[0, 2]) / den,
+            (H[1, 0] * px + H[1, 1] * py + H[1, 2]) / den,
+        )
+    mx, my = float(np.median(x_f)), float(np.median(y_f))
+    a0, b0 = to_plane(mx, my)
+    # probe 10 px to the right and 10 px down from the median floor pixel
+    a_dx, _ = to_plane(mx + 10, my)
+    _, b_dy = to_plane(mx, my + 10)
+    flip = np.diag([
+        -1.0 if a_dx - a0 < 0 else 1.0,   # plane-x grows image-right
+        -1.0 if b_dy - b0 < 0 else 1.0,   # plane-y grows image-down (nearer)
+        1.0,
+    ])
+    H = flip @ H
+    H = H / H[2, 2]
+    # plane-space extent of the floor (percentile box, metres)
+    den = H[2, 0] * x_f + H[2, 1] * y_f + H[2, 2]
+    pa = (H[0, 0] * x_f + H[0, 1] * y_f + H[0, 2]) / den
+    pb = (H[1, 0] * x_f + H[1, 1] * y_f + H[1, 2]) / den
+    a1, a2 = float(np.percentile(pa, 1)), float(np.percentile(pa, 99))
+    b1, b2 = float(np.percentile(pb, 1)), float(np.percentile(pb, 99))
+    if not (0.8 <= a2 - a1 <= 30.0 and 0.8 <= b2 - b1 <= 60.0):
+        return None  # implausible physical footprint
+    # image quad of the plane box (bl, br, tr, tl — existing convention)
+    Hinv = np.linalg.inv(H)
+    def to_img(ap, bp):
+        # plane -> pixel, clamped to the image rectangle
+        den_i = Hinv[2, 0] * ap + Hinv[2, 1] * bp + Hinv[2, 2]
+        return (
+            float(np.clip((Hinv[0, 0] * ap + Hinv[0, 1] * bp + Hinv[0, 2]) / den_i, 0, w - 1)),
+            float(np.clip((Hinv[1, 0] * ap + Hinv[1, 1] * bp + Hinv[1, 2]) / den_i, 0, h - 1)),
+        )
+    quad = [to_img(a1, b2), to_img(a2, b2), to_img(a2, b1), to_img(a1, b1)]
+    homography = H.flatten().tolist()
+    # self-check with the certified R1-3 gate chain (isotropy, shear, spread):
+    # on a correct fit the measured scale must be ~1 m per plane unit.
+    mpu = estimate_meters_per_unit(depth, mask, homography, w, h)
+    if mpu is None or not (0.85 <= mpu <= 1.15):
+        return None
+    plane = {
+        "x": a1,
+        "y": b1,
+        "width": a2 - a1,
+        "height": b2 - b1,
+        # quad flattened to [x0, y0, x1, y1, ...]
+        "quad": [coord for pt in quad for coord in pt],
+        "geometrySource": "depth-plane",  # R1-2 — vs the VP trapezoid path
+    }
+    return homography, plane
+def estimate_floor_plane(mask: np.ndarray, img_np: np.ndarray, depth: np.ndarray | None = None):
     ys, xs = np.where(mask > 0)
     if len(xs) < 1000:
         return None, None
         vp_y = float(vanishing_point["y"])
     else:
         # No usable VP: assume an eye-level shot with the horizon a little
+        # above the floor's top edge, centred over the bottom edge. (If this
+        # guess disagrees with measured depth, the R1-2 arbitration below
+        # replaces it with the analytic depth-plane homography.)
         vp_x = (bl_x + br_x) * 0.5
         vp_y = top_y_f - 0.35 * span_y
     # Keep the horizon clear of the top edge so the transform stays
     H = cv2.getPerspectiveTransform(src, dst)
     homography = H.flatten().tolist()
+    # R1-2 — geometry arbitration: when the trapezoid homography (real or
+    # synthetic VP) is inconsistent with the measured 3D floor — the certified
+    # metric-scale gates fail on it — prefer the analytic depth-plane
+    # homography. This covers both failure classes seen in the reference
+    # rooms: no VP found (guessed horizon) and a detected-but-wrong VP
+    # (sheared trapezoid). With no metric depth both probes return None and
+    # the trapezoid ships unchanged.
+    if depth is not None and estimate_meters_per_unit(depth, mask, homography, w, h) is None:
+        fitted = plane_homography_from_depth(depth, mask, w, h)
+        if fitted is not None:
+            homography_m, plane_meta = fitted
+            plane_meta["hullQuad"] = hull_quad_list
+            plane_meta["vanishingPoint"] = vanishing_point
+            plane_meta["vanishingPoint2"] = vanishing_point2
+            plane_meta["defaultRotation"] = estimate_default_rotation(
+                mask,
+                np.asarray(homography_m, np.float64).reshape(3, 3),
+                vanishing_point2,
+            )
+            return homography_m, plane_meta
     default_rotation = estimate_default_rotation(mask, H, vanishing_point2)
     return homography, {
     }
+def estimate_meters_per_unit(
+    depth: np.ndarray | None,
+    mask: np.ndarray,
+    homography: list[float] | None,
+    w: int,
+    h: int,
+) -> float | None:
+    """R1-3 — physical scale of the floor plane: metres per plane unit.
+    Back-project floor pixels to 3D through the metric depth (pinhole, f ~
+    image width — the P0 convention), map the same pixels into plane space
+    through the homography, and take the median ratio of 3D distance to
+    plane-space distance over random point pairs. Replaces the heuristic
+    repeat coefficient on the frontend: a 60 cm tile becomes 0.6/metersPerUnit
+    plane units in every room. None when depth is relative, the floor is too
+    small, or the ratios are inconsistent (non-planar depth — let the
+    heuristic handle it).
+    Args:
+        depth: depth map indexed as depth[y, x]; None returns None.
+        mask: floor mask; its non-zero pixels are the candidate samples.
+        homography: 9 values, reshaped row-major to the 3x3 image->plane
+            matrix; None returns None.
+        w, h: image width and height in pixels (f = w, principal point at
+            the image centre).
+    Returns:
+        The median 3D-distance / plane-distance ratio rounded to 6 decimals,
+        or None when any gate below rejects the measurement.
+    """
+    if depth is None or homography is None or not depth_model_is_metric():
+        return None
+    ys, xs = np.nonzero(mask)
+    if len(xs) < 500:
+        return None
+    # Near field only: rows below the floor's 40th percentile. The far floor
+    # is where a synthetic-VP homography distorts most (R1-2's territory) and
+    # where depth is noisiest; the near field anchors perceived tile size.
+    near = ys >= np.percentile(ys, 40)
+    ys, xs = ys[near], xs[near]
+    if len(xs) < 500:
+        return None
+    # fixed seed: the subsample of at most 2000 pixels is reproducible
+    rng = np.random.default_rng(12345)
+    sel = rng.choice(len(xs), size=min(2000, len(xs)), replace=False)
+    xs_f = xs[sel].astype(np.float64)
+    ys_f = ys[sel].astype(np.float64)
+    z = depth[ys[sel], xs[sel]].astype(np.float64)
+    f = float(w)  # P0 convention: focal ~ image width
+    cx, cy = w / 2.0, h / 2.0
+    # camera-space 3D point of every sampled pixel
+    pts3 = np.stack([(xs_f - cx) / f * z, (ys_f - cy) / f * z, z], axis=1)
+    H = np.asarray(homography, np.float64).reshape(3, 3)
+    den = H[2, 0] * xs_f + H[2, 1] * ys_f + H[2, 2]
+    # guard against a vanishing projective denominator
+    den = np.where(np.abs(den) < 1e-9, 1e-9, den)
+    # plane-space position of the same pixels
+    pts_p = np.stack(
+        [
+            (H[0, 0] * xs_f + H[0, 1] * ys_f + H[0, 2]) / den,
+            (H[1, 0] * xs_f + H[1, 1] * ys_f + H[1, 2]) / den,
+        ],
+        axis=1,
+    )
+    # 4000 random index pairs; pairs that drew the same sample twice are dropped
+    i = rng.integers(0, len(sel), 4000)
+    j = rng.integers(0, len(sel), 4000)
+    keep = i != j
+    d3 = np.linalg.norm(pts3[i[keep]] - pts3[j[keep]], axis=1)
+    dp = np.linalg.norm(pts_p[i[keep]] - pts_p[j[keep]], axis=1)
+    # separation threshold relative to the floor's own plane-space span —
+    # plane units may be pixel-ish (VP path) or metres (R1-2 path)
+    span = float(np.hypot(
+        np.percentile(pts_p[:, 0], 95) - np.percentile(pts_p[:, 0], 5),
+        np.percentile(pts_p[:, 1], 95) - np.percentile(pts_p[:, 1], 5),
+    ))
+    # only pairs separated by more than 5% of the span contribute
+    far = dp > 0.05 * max(span, 1e-9)
+    if far.sum() < 200:
+        return None
+    # per-axis plane-space separation of each pair, used by the gates below
+    dpx = np.abs(pts_p[i[keep], 0] - pts_p[j[keep], 0])
+    dpy = np.abs(pts_p[i[keep], 1] - pts_p[j[keep], 1])
+    ratios = d3[far] / dp[far]
+    med = float(np.median(ratios))
+    q1, q3 = np.percentile(ratios, 25), np.percentile(ratios, 75)
+    # Tight consistency gate: a synthetic-VP homography distorts the plane,
+    # making the ratio position-dependent. Returning None there is correct —
+    # the heuristic repeat takes over until R1-2 fixes the geometry.
+    if med <= 1e-6 or (q3 - q1) / med > 0.35:
+        return None  # inconsistent — depth and homography disagree
+    # Isotropy gate: a constant shear/anisotropy passes the spread check
+    # (direction-averaged ratios stay narrow) but renders distorted tiles.
+    # The scale measured along plane-x must match plane-y.
+    horiz = far & (dpx > 2 * dpy)
+    vert = far & (dpy > 2 * dpx)
+    # the gate only applies when both directions have at least 50 pairs
+    if horiz.sum() >= 50 and vert.sum() >= 50:
+        med_h = float(np.median(d3[horiz] / dp[horiz]))
+        med_v = float(np.median(d3[vert] / dp[vert]))
+        if abs(med_h - med_v) / med > 0.2:
+            return None  # anisotropic plane mapping — not metric-trustworthy
+    # Shear leaves axis-aligned lengths almost unchanged and shows up in the
+    # diagonals instead: +45 deg pairs stretch while -45 deg pairs shrink.
+    sx = pts_p[i[keep], 0] - pts_p[j[keep], 0]
+    sy = pts_p[i[keep], 1] - pts_p[j[keep], 1]
+    diag = far & (dpx > 0.5 * dpy) & (dpy > 0.5 * dpx)
+    # split diagonal pairs by the sign of their slope in plane space
+    d1 = diag & (sx * sy > 0)
+    d2 = diag & (sx * sy < 0)
+    if d1.sum() >= 50 and d2.sum() >= 50:
+        med_1 = float(np.median(d3[d1] / dp[d1]))
+        med_2 = float(np.median(d3[d2] / dp[d2]))
+        if abs(med_1 - med_2) / med > 0.2:
+            return None  # sheared plane mapping — not metric-trustworthy
+    # plausible range: ~1e-3 m/unit for pixel-scale planes (VP trapezoid path)
+    # up to ~1.0 m/unit for metre-scale planes (R1-2 depth-plane path)
+    if not (1e-5 <= med <= 2.0):
+        return None
+    return round(med, 6)
 # ---------------------------------------------------------------------------
 # P1-4 — Default Tile Rotation
 # Tiles were laid along the plane's x-axis (= the image bbox axis), which is
             [cx + (vanishing_point2["x"] - cx) * 0.25, cy + (vanishing_point2["y"] - cy) * 0.25],
         ])
         angle = _plane_angle(toward)
+        # R1-2 — reject rather than clamp: an angle beyond the band means the
+        # cue is unreliable in this plane chart (both depth-plane kitchens
+        # saturated the old +/-30 clip), and a saturated guess lays every
+        # tile visibly wrong. Same philosophy as the primary path's gates.
+        if angle is not None and abs(angle) <= 30.0:
+            return float(angle)
     return 0.0
     if depth is not None and floor_mask.any():
         floor_depth = depth[floor_mask > 0]
         lo, hi = float(np.percentile(floor_depth, 2)), float(np.percentile(floor_depth, 98))
+        # R1-1 — scale-free margin: works for normalised [0,1] relative depth
+        # and for metric metres alike (the old 0.08 floor assumed [0,1]).
+        margin = max((hi - lo) * 0.35, 0.04 * max(abs(hi), 1e-6))
         depth_keep = (depth >= lo - margin) & (depth <= hi + margin)
         surface = (surface & depth_keep.astype(np.uint8)).astype(np.uint8)
         surface[floor_mask > 0] = np.maximum(surface[floor_mask > 0], 1)
     print(f"[TIMING] Depth estimation took {time.perf_counter() - t0:.3f} seconds", flush=True)
     t0 = time.perf_counter()
+    homography, plane = estimate_floor_plane(floor_mask, img_np, depth)
+    if plane is not None:
+        # R1-3 — physical plane scale; null on relative depth so the frontend
+        # falls back to the heuristic repeat.
+        plane["metersPerUnit"] = estimate_meters_per_unit(depth, floor_mask, homography, w, h)
     print(f"[TIMING] Plane fitting / homography calculation took {time.perf_counter() - t0:.3f} seconds", flush=True)
     t0 = time.perf_counter()
                     "depthEnabled": depth is not None,
                     "shadingEnabled": shade_map is not None,
                 },
+                # R1-1 — metric floor depth stats (metres). The seam for R1-2
+                # (plane fit) and R1-3 (true tile scale); null on relative
+                # checkpoints so the frontend can feature-gate.
+                "metricDepth": (
+                    {
+                        "unit": "m",
+                        "floorP5": round(float(np.percentile(depth[region_mask > 0], 5)), 3),
+                        "floorP50": round(float(np.percentile(depth[region_mask > 0], 50)), 3),
+                        "floorP95": round(float(np.percentile(depth[region_mask > 0], 95)), 3),
+                    }
+                    if depth is not None and depth_model_is_metric() and (region_mask > 0).any()
+                    else None
+                ),
             })
     if not segments:

golden_render.py ADDED Viewed

	@@ -0,0 +1,225 @@

+"""R0-1 — deterministic golden render: bundle x tile -> PNG.
+A faithful Python port of the CURRENT frontend composite (canvas-engine.ts):
+texture prep (wrap detection -> period-snap -> masked-shift fallback), mip
+pyramid + trilinear with per-pixel footprint LOD, shade-map decode, homography-
+mapped light vector + gloss-gated specular, colour cast, soft highlight clip,
+confidence-map alpha. Texture-prep and sampling primitives are imported from
+verify_n1_sim so this stays in lockstep with the certified implementations.
+Usage:
+    python golden_render.py <bundle.json[.gz]> <tile-image> <out.png>
+The output is resized to max-dim 720 so goldens stay small and stable.
+"""
+import base64
+import gzip
+import json
+import sys
+import numpy as np
+from PIL import Image
+from verify_n1_sim import (
+    build_mips,
+    detect_wrap_mode,
+    make_seamless,
+    period_snap,
+    sample_bilinear_wrap,
+)
+# Longest side, in pixels, of the image returned by render(); larger
+# composites are downscaled to this so goldens stay small and stable.
+OUT_MAX_DIM = 720
+def load_bundle(path):
+    """Load a viz bundle from a JSON file, transparently handling ``.gz``.
+    Args:
+        path: bundle path as a string; a name ending in ".gz" is read
+            through gzip, anything else as plain text.
+    Returns:
+        The decoded JSON document.
+    """
+    # Both branches go through a context manager so the handle is closed
+    # deterministically (the plain-JSON branch used to leak it), and the
+    # encoding is pinned to UTF-8 instead of the locale default.
+    opener = gzip.open if path.endswith(".gz") else open
+    with opener(path, "rt", encoding="utf-8") as f:
+        return json.load(f)
+def estimate_gloss(tex):
+    """Port of estimateGloss (canvas-engine.ts): gloss score in [0, 1]
+    derived from the mean luminance difference between samples 4 px apart
+    horizontally, taken on every 4th row and column."""
+    red, green, blue = tex[:, :, 0], tex[:, :, 1], tex[:, :, 2]
+    luma = red * 0.299 + green * 0.587 + blue * 0.114
+    right = luma[::4, 4::4]
+    left = luma[::4, :-4:4][:, : right.shape[1]]
+    if right.size:
+        gradient = float(np.mean(np.abs(right - left)))
+    else:
+        # texture narrower than 5 px: no sample pairs, treat as flat
+        gradient = 0.0
+    return float(np.clip(1 - gradient / 24, 0, 1))
+def soft_clip(v):
+    """Port of softClipByte: linear below 220, rational shoulder above.
+    Values up to the knee (220) pass through unchanged; above it the
+    overshoot t is compressed to t * 35 / (t + 35), so the output
+    approaches 255 asymptotically.
+    Args:
+        v: scalar or numpy array of channel values.
+    Returns:
+        numpy array (0-d for a scalar input) of clipped values.
+    """
+    knee, rng = 220.0, 35.0
+    # np.where evaluates both branches, so clamp the overshoot at 0: with the
+    # raw (v - knee) the shoulder denominator is zero at v == knee - rng
+    # (185), which raised a divide-by-zero warning for arrays and
+    # ZeroDivisionError for a plain float, for a value that is passed through.
+    t = np.maximum(v - knee, 0.0)
+    return np.where(v <= knee, v, knee + (t * rng) / (t + rng))
+def prepare_texture(tile_path):
+    """Load a tile as RGB and make it tileable.
+    The texture is returned untouched when detect_wrap_mode reports "wrap".
+    Otherwise period_snap is tried first; if it reports "snap" its output is
+    used, else the texture goes through make_seamless.
+    Args:
+        tile_path: path of the tile image.
+    Returns:
+        (texture, repeat_scale): the prepared RGB array, and the ratio of the
+        snapped width to the original width (1.0 unless period_snap snapped).
+    """
+    # Close the file handle as soon as the pixels are decoded.
+    with Image.open(tile_path) as tile_im:
+        tex = np.asarray(tile_im.convert("RGB"))
+    w = tex.shape[1]
+    mode, _, _ = detect_wrap_mode(tex)
+    repeat_scale = 1.0
+    if mode != "wrap":
+        prepared, info = period_snap(tex)
+        if info[0] == "snap":
+            repeat_scale = prepared.shape[1] / w
+            tex = prepared
+        else:
+            tex = make_seamless(tex)
+    return tex, repeat_scale
+def render(bundle_path, tile_path):
+    """Composite a tile texture onto one segment of a bundle.
+    The segment with the longest encoded mask is chosen. Its pixels are
+    mapped to plane space through the segment homography, rotated by the
+    plane's defaultRotation, sampled from the tile's mip pyramid with a
+    per-pixel LOD, then shaded, colour-cast, given a gloss-gated specular
+    term, soft-clipped and alpha-blended over the base image.
+    Args:
+        bundle_path: path of the bundle JSON (optionally .gz).
+        tile_path: path of the tile image.
+    Returns:
+        PIL image of the composite, downscaled so its longest side is at
+        most OUT_MAX_DIM.
+    """
+    import io  # function-scope so the module's import block stays untouched
+    d = load_bundle(bundle_path)
+    w, h = d["width"], d["height"]
+    base = np.asarray(
+        Image.open(io.BytesIO(base64.b64decode(d["pixels"]))).convert("RGB")
+    ).astype(np.float64)
+    # pick the segment whose encoded mask is longest
+    seg = max(d["segments"], key=lambda s: len(s["mask"]))
+    # the mask is stored as uint32 indices into the flattened h*w image
+    mask_idx = np.frombuffer(base64.b64decode(seg["mask"]), dtype=np.uint32)
+    mask = np.zeros(w * h, bool)
+    mask[mask_idx] = True
+    mask = mask.reshape(h, w)
+    H = np.asarray(seg["homography"], np.float64).reshape(3, 3)
+    plane = seg.get("plane") or {}
+    plane_w = max(plane.get("width", w), 1)
+    plane_h = max(plane.get("height", h), 1)
+    plane_cx = plane.get("x", 0) + plane_w / 2
+    plane_cy = plane.get("y", 0) + plane_h / 2
+    rot_deg = plane.get("defaultRotation") or 0.0
+    rad = np.deg2rad(rot_deg)
+    cos, sin = np.cos(-rad), np.sin(-rad)
+    shade_map = (
+        np.frombuffer(base64.b64decode(seg["shadeMap"]), np.uint8).reshape(h, w).astype(np.float64)
+        if seg.get("shadeMap")
+        else None
+    )
+    shade_lo, shade_hi = seg.get("shadeRange") or (0.55, 1.35)
+    conf = (
+        np.frombuffer(base64.b64decode(seg["confidenceMap"]), np.uint8).reshape(h, w).astype(np.float64) / 255.0
+        if seg.get("confidenceMap")
+        else None
+    )
+    ct = seg.get("colorTemperature") or {}
+    if "cast" in ct:
+        ct = ct["cast"]
+    col = np.array([ct.get("r", 1.0), ct.get("g", 1.0), ct.get("b", 1.0)])
+    lv = seg.get("lightVector")
+    tex, repeat_scale = prepare_texture(tile_path)
+    # gloss is estimated on the untouched tile, not the prepared texture;
+    # the handle is closed once the pixels are decoded
+    with Image.open(tile_path) as tile_im:
+        gloss = estimate_gloss(np.asarray(tile_im.convert("RGB")).astype(np.float64))
+    th, tw, _ = tex.shape
+    mips = build_mips(tex)
+    max_l = len(mips) - 1
+    # R1-3 — mirror of canvas-engine.ts: metric plane scale when present
+    # (pixel-ish or metre plane units alike — backend gates mpu hard),
+    # heuristic fallback otherwise (info.scale = 1 in goldens).
+    DEFAULT_TILE_M = 0.6
+    mpu = plane.get("metersPerUnit")
+    repeat_w = 0.0
+    if mpu and mpu > 0:
+        repeat_w = (DEFAULT_TILE_M / mpu) * repeat_scale
+        if not (np.isfinite(repeat_w) and repeat_w > 0):
+            repeat_w = 0.0
+    if not repeat_w:
+        repeat_w = max(48.0, min(plane_w, plane_h) * 0.22) * repeat_scale
+    repeat_h = repeat_w * (th / tw)
+    ys, xs = np.nonzero(mask)
+    xs_f, ys_f = xs.astype(np.float64), ys.astype(np.float64)
+    def to_plane(px, py):
+        # image -> plane through H, with the projective divisor kept off zero
+        z = H[2, 0] * px + H[2, 1] * py + H[2, 2]
+        z = np.where(np.abs(z) < 1e-6, 1e-6, z)
+        return (
+            (H[0, 0] * px + H[0, 1] * py + H[0, 2]) / z,
+            (H[1, 0] * px + H[1, 1] * py + H[1, 2]) / z,
+        )
+    # plane position of each masked pixel and of its +1 px neighbours in x
+    # and y; the neighbours give the texel footprint for the LOD
+    fx, fy = to_plane(xs_f, ys_f)
+    fx1, fy1 = to_plane(xs_f + 1, ys_f)
+    fx2, fy2 = to_plane(xs_f, ys_f + 1)
+    def rot(ax, ay):
+        # rotate about the plane centre
+        dx = ax - plane_cx
+        dy = ay - plane_cy
+        return dx * cos - dy * sin, dx * sin + dy * cos
+    rx, ry = rot(fx, fy)
+    rx1, ry1 = rot(fx1, fy1)
+    rx2, ry2 = rot(fx2, fy2)
+    u = np.mod(rx / repeat_w, 1.0)
+    v = np.mod(ry / repeat_h, 1.0)
+    tcx, tcy = (rx / repeat_w) * tw, (ry / repeat_h) * th
+    du = np.hypot((rx1 / repeat_w) * tw - tcx, (ry1 / repeat_h) * th - tcy)
+    dv = np.hypot((rx2 / repeat_w) * tw - tcx, (ry2 / repeat_h) * th - tcy)
+    lod = np.log2(np.maximum(np.maximum(du, dv), 1e-3)) + 0.5
+    l0 = np.clip(np.floor(lod), 0, max_l).astype(np.int64)
+    frac = np.clip(lod - l0, 0, 1)
+    sample = np.zeros((len(xs), 3), np.float64)
+    # trilinear: blend level l0 with l0 + 1, batching pixels by mip level
+    for lev in range(max_l + 1):
+        sel = l0 == lev
+        if not sel.any():
+            continue
+        a = mips[lev]
+        sa = sample_bilinear_wrap(a, u[sel] * a.shape[1], v[sel] * a.shape[0])
+        if lev < max_l:
+            b = mips[lev + 1]
+            sb = sample_bilinear_wrap(b, u[sel] * b.shape[1], v[sel] * b.shape[0])
+            sample[sel] = sa + (sb - sa) * frac[sel][:, None]
+        else:
+            sample[sel] = sa
+    shade = (
+        shade_lo + (shade_map[ys, xs] / 255.0) * (shade_hi - shade_lo)
+        if shade_map is not None
+        else np.full(len(xs), 1.0)
+    )
+    specular = np.zeros(len(xs))
+    if lv:
+        lvx, lvy = lv.get("x", 0.0), lv.get("y", 0.0)
+        # carry the image-space light direction into plane space by mapping
+        # a short step from the point (w/2, 0.75*h)
+        a = to_plane(np.array([w * 0.5]), np.array([h * 0.75]))
+        step = min(w, h) * 0.05
+        b = to_plane(np.array([w * 0.5 + lvx * step]), np.array([h * 0.75 + lvy * step]))
+        dxv, dyv = b[0][0] - a[0][0], b[1][0] - a[1][0]
+        ln = np.hypot(dxv, dyv)
+        if ln > 1e-6:
+            lvx, lvy = dxv / ln, dyv / ln
+            dfx = (fx - plane_cx) / (plane_w * 0.5)
+            dfy = (fy - plane_cy) / (plane_h * 0.5)
+            dlen = np.hypot(dfx, dfy)
+            ok = dlen > 0.01
+            dot = np.where(ok, (dfx * lvx + dfy * lvy) / np.maximum(dlen, 1e-9), 0.0)
+            specular = 0.12 * gloss * np.maximum(0, dot) ** 4
+    texel = soft_clip(sample * shade[:, None] * col[None, :] + specular[:, None] * 255.0)
+    alpha = conf[ys, xs][:, None] if conf is not None else np.ones((len(xs), 1))
+    out = base.copy()
+    out[ys, xs] = np.clip(texel * alpha + base[ys, xs] * (1 - alpha), 0, 255)
+    img = Image.fromarray(out.astype(np.uint8))
+    # never upscale: shrink only when the longest side exceeds OUT_MAX_DIM
+    scale = min(OUT_MAX_DIM / max(img.size), 1.0)
+    if scale < 1.0:
+        img = img.resize((round(img.width * scale), round(img.height * scale)), Image.BILINEAR)
+    return img
+def main():
+    """CLI entry point: render <bundle> x <tile> and save the PNG.
+    Prints the module usage and returns 2 unless exactly three arguments
+    are given; returns 0 after saving."""
+    args = sys.argv[1:]
+    if len(args) != 3:
+        print(__doc__)
+        return 2
+    bundle_path, tile_path, out_path = args
+    render(bundle_path, tile_path).save(out_path)
+    print(f"saved {out_path}")
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

verify_goldens.py ADDED Viewed

	@@ -0,0 +1,93 @@

+"""R0-1 — golden-image gate.
+Renders the reference matrix (bundle x tile) through golden_render and compares
+against the committed goldens. Any drift beyond tolerance fails with a
+side-by-side (golden | current | amplified diff) written to verify_out/.
+Usage:
+    python verify_goldens.py            # check against goldens (CI mode)
+    python verify_goldens.py --bless    # regenerate goldens (intentional change)
+Tolerances: renders are deterministic numpy, so genuine engine changes show up
+as large diffs; the small allowance absorbs PIL/numpy version drift only.
+"""
+import os
+import sys
+import numpy as np
+from PIL import Image
+import golden_render
+# Directory holding this script; bundle paths in MATRIX are joined onto it.
+HERE = os.path.dirname(os.path.abspath(__file__))
+# Tile images, located relative to HERE.
+TILES = os.path.join(HERE, "..", "..", "frontend", "viz2d-demo", "src", "assets", "tiles")
+# Reference PNGs: written by --bless, read in check mode.
+GOLDEN_DIR = os.path.join(HERE, "goldens")
+# Destination of the failure side-by-side images.
+OUT = os.path.join(HERE, "verify_out")
+# A case passes only when BOTH limits hold.
+MEAN_TOL = 0.5   # mean abs diff per channel
+P999_TOL = 8.0   # 99.9th percentile abs diff
+MATRIX = [
+    # (golden name, bundle, tile)  — tiles cover the three texture-prep paths:
+    # checkered = period-snap, rustic-wood = masked-shift, basalt = native wrap
+    ("desk_checkered", "data/current_bundle.vizbundle.json", "checkered.jpeg"),
+    ("desk_rustic", "data/current_bundle.vizbundle.json", "rustic-wood.jpg"),
+    ("desk_basalt", "data/current_bundle.vizbundle.json", "basalt-outside-wal.jpg"),
+    ("kitchen_checkered", "data/ref_kitchen.vizbundle.json", "checkered.jpeg"),
+    ("kitchen_rustic", "data/ref_kitchen.vizbundle.json", "rustic-wood.jpg"),
+    ("kitchen_basalt", "data/ref_kitchen.vizbundle.json", "basalt-outside-wal.jpg"),
+]
+def main():
+    """Render every MATRIX case and either bless it or compare it to its golden.
+    With --bless on the command line each render overwrites its golden and
+    the function returns 0. Otherwise a case fails when its golden is
+    missing, its size changed, or the abs diff exceeds MEAN_TOL / P999_TOL;
+    tolerance failures also write a golden | current | diff x8 panel to OUT.
+    Returns 0 when every case passed, 1 otherwise.
+    """
+    bless = "--bless" in sys.argv
+    for directory in (GOLDEN_DIR, OUT):
+        os.makedirs(directory, exist_ok=True)
+    all_passed = True
+    for name, bundle, tile in MATRIX:
+        bundle_path = os.path.join(HERE, bundle)
+        tile_path = os.path.join(TILES, tile)
+        img = golden_render.render(bundle_path, tile_path)
+        golden_path = os.path.join(GOLDEN_DIR, f"{name}.png")
+        if bless:
+            img.save(golden_path)
+            print(f"  blessed {name}.png ({img.width}x{img.height})")
+            continue
+        if not os.path.exists(golden_path):
+            print(f"  [FAIL] {name}: golden missing — run `make bless`")
+            all_passed = False
+            continue
+        cur = np.asarray(img).astype(np.float64)
+        gold = np.asarray(Image.open(golden_path).convert("RGB")).astype(np.float64)
+        if cur.shape != gold.shape:
+            print(f"  [FAIL] {name}: size changed {gold.shape} -> {cur.shape}")
+            all_passed = False
+            continue
+        diff = np.abs(cur - gold)
+        mean_d = float(diff.mean())
+        p999 = float(np.percentile(diff, 99.9))
+        passed = mean_d <= MEAN_TOL and p999 <= P999_TOL
+        print(f"  [{'PASS' if passed else 'FAIL'}] {name}: mean={mean_d:.3f} p99.9={p999:.1f}")
+        if passed:
+            continue
+        all_passed = False
+        # failure artefact: golden, current and the diff amplified x8
+        amplified = np.clip(diff * 8, 0, 255).astype(np.uint8)
+        tiles = [gold.astype(np.uint8), cur.astype(np.uint8), amplified]
+        panel = np.concatenate(tiles, axis=1)
+        fail_path = os.path.join(OUT, f"golden_fail_{name}.png")
+        Image.fromarray(panel).save(fail_path)
+        print(f"         side-by-side: {fail_path}  (golden | current | diff x8)")
+    if bless:
+        print("goldens regenerated — commit backend/floor-visualizer/goldens/")
+        return 0
+    print("\n" + ("ALL GOLDEN CHECKS PASSED" if all_passed else "GOLDEN CHECKS FAILED"))
+    if all_passed:
+        return 0
+    return 1
+if __name__ == "__main__":
+    raise SystemExit(main())

verify_n1_parity.sh ADDED Viewed

	@@ -0,0 +1,74 @@

+#!/usr/bin/env bash
+# R0-1 — N1 parity gate: the TypeScript periodSnap/detectWrapMode in
+# canvas-engine.ts must make the same decisions as the certified Python
+# implementations in verify_n1_sim.py, on the real catalog tiles.
+#
+# Requires: python3 (PIL, numpy), node, esbuild (present via frontend
+# node_modules). May be launched from any directory: the script changes into
+# its own directory before running the Python step.
+set -euo pipefail
+HERE="$(cd "$(dirname "$0")" && pwd)"
+FRONTEND="$HERE/../../frontend/viz2d-demo"
+TMP="$(mktemp -d)"
+trap 'rm -rf "$TMP"' EXIT
+# The Python step imports verify_n1_sim from "." and reads the tiles through a
+# path relative to this script's directory, so pin the working directory here
+# instead of relying on the caller's cwd.
+cd "$HERE"
+# 1. Python: decode tiles to raw RGBA + record expected decisions
+python3 - "$TMP" << 'EOF'
+import json
+import sys
+import numpy as np
+from PIL import Image
+sys.path.insert(0, ".")
+from verify_n1_sim import detect_wrap_mode, period_snap
+tmp = sys.argv[1]
+TILES = "../../frontend/viz2d-demo/src/assets/tiles"
+cases = ["checkered.jpeg", "rustic-wood.jpg", "floor-natural-stone.jpg",
+         "basalt-outside-wal.jpg", "mosaic-tile.jpg"]
+expected = {}
+for name in cases:
+    key = name.split(".")[0]
+    im = Image.open(f"{TILES}/{name}").convert("RGBA")
+    # Raw RGBA bytes are what the Node step feeds to the TypeScript engine.
+    with open(f"{tmp}/{key}.bin", "wb") as raw_file:
+        raw_file.write(im.tobytes())
+    rgb = np.asarray(im.convert("RGB"))
+    mode, _, _ = detect_wrap_mode(rgb)
+    snap = None
+    if mode != "wrap":
+        out, info = period_snap(rgb)
+        if info[0] == "snap":
+            snap = [out.shape[1], out.shape[0]]
+    expected[key] = {"w": im.width, "h": im.height,
+                     "mode": "wrap" if mode == "wrap" else "mirror", "snap": snap}
+# Close (and therefore flush) expected.json before the Node step reads it.
+with open(f"{tmp}/expected.json", "w") as expected_file:
+    json.dump(expected, expected_file)
+print("python decisions:", json.dumps(expected, default=str))
+EOF
+# 2. Compile the actual frontend engine and replay the same decisions
+(cd "$FRONTEND" && npx esbuild src/visualizer-demo/canvas-engine.ts \
+    --format=cjs --outfile="$TMP/ce.cjs" --log-level=error)
+node - "$TMP" << 'EOF'
+const fs = require("fs");
+const path = require("path");
+const tmp = process.argv[2];
+const { periodSnap, detectWrapMode } = require(path.join(tmp, "ce.cjs"));
+const expected = JSON.parse(fs.readFileSync(path.join(tmp, "expected.json")));
+let ok = true;
+for (const [key, exp] of Object.entries(expected)) {
+  const raw = new Uint8ClampedArray(fs.readFileSync(path.join(tmp, key + ".bin")));
+  const mode = detectWrapMode(raw, exp.w, exp.h);
+  let snap = null;
+  // Mirror the Python side: a period snap is only attempted for non-wrap tiles.
+  if (mode !== "wrap") {
+    const r = periodSnap(raw, exp.w, exp.h);
+    if (r) snap = [r.w, r.h];
+  }
+  const pass = mode === exp.mode && JSON.stringify(snap) === JSON.stringify(exp.snap);
+  console.log(`  [${pass ? "PASS" : "FAIL"}] ${key}: ts mode=${mode} snap=${JSON.stringify(snap)}`
+    + (pass ? "" : `  expected mode=${exp.mode} snap=${JSON.stringify(exp.snap)}`));
+  if (!pass) ok = false;
+}
+console.log(ok ? "N1 PARITY PASSED" : "N1 PARITY FAILED");
+process.exit(ok ? 0 : 1);
+EOF

verify_r1_metric.py ADDED Viewed

	@@ -0,0 +1,124 @@

+"""R1-1 — metric depth certification (local harness; needs torch+transformers
+and reference room photos — not part of `make verify`, which uses precomputed
+bundles).
+Runs the configured depth checkpoint on reference room photos and validates
+that the output is genuinely METRIC:
+  1. floor depth range plausible for an interior (p5/p95 within 0.3-20 m)
+  2. ground-plane consistency: on a floor plane, inverse depth is linear in
+     image row (1/Z = cos(pitch) * (y - y_horizon) / (h_cam * f)); the fit
+     must hold (R^2 >= 0.9 over floor rows)
+  3. absolute scale: the camera height recovered from that fit's slope
+     (h = cos(pitch) / (slope * f), f ~ image width, pitch taken from the
+     fitted horizon row) must land in 0.7-2.5 m — the
+     handheld-phone band. This is the automated equivalent of the backlog's
+     "door height ~2.0 m +/-15%" check: both test absolute metric scale, but
+     this one needs no manual annotation.
+Usage:
+    python verify_r1_metric.py <room-photo.jpg> [more photos...]
+"""
+import sys
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+from transformers import AutoImageProcessor, AutoModelForDepthEstimation
+# single source of truth: read the configured model + metric predicate from app.py
+src = open("app.py").read()
+ns = {}
+start = src.index("def depth_model_is_metric")
+end = src.index("\nENABLE_DEPTH", start)
+exec(compile(src[start:end], "app.py", "exec"), ns)
+import re
+MODEL = re.search(r'depth_model_name",\s*\n(?:\s*#.*\n)*\s*"([^"]+)"', src).group(1)
+depth_model_is_metric = ns["depth_model_is_metric"]
+FLOOR_FRAC = 0.45  # treat the bottom 45% of the frame as floor-dominated
+def run_depth(img):
+    processor = run_depth.processor
+    model = run_depth.model
+    inputs = processor(images=img, return_tensors="pt")
+    with torch.no_grad():
+        out = model(**inputs)
+        depth = torch.nn.functional.interpolate(
+            out.predicted_depth.unsqueeze(1),
+            size=(img.height, img.width),
+            mode="bicubic",
+            align_corners=False,
+        ).squeeze().numpy()
+    return cv2.GaussianBlur(depth.astype(np.float32), (0, 0), sigmaX=3)
+def main():
+    """Run the metric-depth checks on every photo path given in argv.
+    Prints one PASS/FAIL line per photo and returns the exit status: 2 when
+    no photo was supplied, 1 when the configured model is not metric or any
+    photo fails a check, 0 when every photo passes.
+    """
+    photos = sys.argv[1:]
+    if not photos:
+        print(__doc__)
+        return 2
+    print(f"model: {MODEL}")
+    if not depth_model_is_metric(MODEL):
+        print("!! configured model is not metric — R1-1 not in effect")
+        return 1
+    print("loading checkpoint...")
+    # run_depth reads its processor/model from function attributes.
+    run_depth.processor = AutoImageProcessor.from_pretrained(MODEL)
+    run_depth.model = AutoModelForDepthEstimation.from_pretrained(MODEL).eval()
+    ok = True
+    for path in photos:
+        img = Image.open(path).convert("RGB")
+        # Cap the longest side at 1280 px before inference.
+        if max(img.size) > 1280:
+            s = 1280 / max(img.size)
+            img = img.resize((round(img.width * s), round(img.height * s)), Image.LANCZOS)
+        w, h = img.size
+        depth = run_depth(img)
+        # Check 1: depth percentiles over the bottom FLOOR_FRAC band of rows.
+        floor = depth[int(h * (1 - FLOOR_FRAC)):, :]
+        p5, p95 = np.percentile(floor, 5), np.percentile(floor, 95)
+        range_ok = 0.3 <= p5 and p95 <= 20.0
+        # row-median inverse depth over the floor band; fit 1/Z = a*y + b
+        ys = np.arange(int(h * (1 - FLOOR_FRAC)), h)
+        # Depth is clamped to >= 0.05 before inversion to avoid dividing by ~0.
+        inv = np.array([np.median(1.0 / np.maximum(depth[y], 0.05)) for y in ys])
+        a, b = np.polyfit(ys, inv, 1)
+        pred = a * ys + b
+        # Coefficient of determination of the linear fit; the 1e-12 term keeps
+        # the division defined when every row has the same inverse depth.
+        ss_res = float(np.sum((inv - pred) ** 2))
+        ss_tot = float(np.sum((inv - inv.mean()) ** 2)) + 1e-12
+        r2 = 1 - ss_res / ss_tot
+        focal = float(w)  # P0 convention: f ~ image width
+        # Row where the fitted line crosses 1/Z = 0; NaN for a flat fit.
+        horizon_y = -b / a if abs(a) > 1e-12 else float("nan")
+        # exact ground-plane relation for a pitched camera:
+        #   1/Z = (sin(t)*f - cos(t)*y') / (h*f)   ->   h = cos(t) / (a*f)
+        # with pitch t recovered from the fitted horizon row.
+        pitch = np.arctan2(h / 2 - horizon_y, focal)
+        cam_h = float(np.cos(pitch) / (a * focal)) if a > 1e-9 else float("inf")
+        # Check 2 also requires a positive slope (a > 0) in addition to R^2 >= 0.90.
+        plane_ok = r2 >= 0.90 and a > 0
+        # Check 3: recovered camera height inside the 0.7-2.5 m band.
+        height_ok = 0.7 <= cam_h <= 2.5
+        passed = range_ok and plane_ok and height_ok
+        ok &= passed
+        print(
+            f"  [{'PASS' if passed else 'FAIL'}] {path.split('/')[-1]}: "
+            f"floor p5-p95 = {p5:.2f}-{p95:.2f} m | invZ-fit R2={r2:.3f} | "
+            f"camera height = {cam_h:.2f} m | horizon y = {horizon_y:.0f}/{h}"
+        )
+        # One explanatory line per failed check.
+        if not range_ok:
+            print("        !! floor depth outside 0.3-20 m")
+        if not plane_ok:
+            print("        !! inverse depth not linear in row — not plane-consistent")
+        if not height_ok:
+            print("        !! camera height outside handheld band 0.7-2.5 m")
+    print("\n" + ("ALL R1-1 METRIC CHECKS PASSED" if ok else "R1-1 METRIC CHECKS FAILED"))
+    return 0 if ok else 1
+if __name__ == "__main__":
+    raise SystemExit(main())

verify_r1_plane_sim.py ADDED Viewed

	@@ -0,0 +1,126 @@

+"""R1-2 — depth-based plane-fit fallback certification (CI-safe: analytic
+depth, no torch).
+Reuses the exact pinhole ground-plane scene from verify_r1_scale_sim (camera
+1.5 m up, pitch 25 deg, f = image width) and runs the REAL
+plane_homography_from_depth from app.py.
+Checks:
+  1. engages: returns a homography + plane on a clean metric ground plane
+  2. metric by construction: the certified R1-3 estimator measures
+     metersPerUnit ~ 1 on the produced homography (within 2%)
+  3. shear-free and true-to-size: a known 1 m ground square maps to a
+     1 x 1 plane square with right angles (sides within 2%, angle within 2 deg)
+  4. orientation: plane-y grows toward the camera (near field), plane-x
+     image-right — the bundle convention the frontend assumes
+  5. rejection: non-planar depth (a dome) returns None
+  6. rejection: relative (normalised) depth returns None
+"""
+import cv2
+import numpy as np
+from verify_r1_scale_sim import CAM_H, F, H as IMG_H, PITCH, W as IMG_W, scene
+# --- real implementations from app.py ----------------------------------------
+src = open("app.py").read()
+ns = {"np": np, "cv2": cv2, "depth_model_is_metric": lambda name=None: True}
+for fn in ["estimate_meters_per_unit", "plane_homography_from_depth"]:
+    start = src.index(f"def {fn}")
+    end = src.index("\ndef ", start + 10)
+    exec(compile(src[start:end], "app.py", "exec"), ns)
+plane_homography_from_depth = ns["plane_homography_from_depth"]
+estimate_meters_per_unit = ns["estimate_meters_per_unit"]
+def to_plane(H, px, py):
+    den = H[2, 0] * px + H[2, 1] * py + H[2, 2]
+    return (
+        (H[0, 0] * px + H[0, 1] * py + H[0, 2]) / den,
+        (H[1, 0] * px + H[1, 1] * py + H[1, 2]) / den,
+    )
+def project_ground(x_w, fwd_w):
+    """Image pixel of a world ground point — inverse of the scene mapping."""
+    # world -> camera: y_c, z_c from CAM_H/pitch; then u,v via pinhole
+    y_w = -CAM_H
+    z_c = np.cos(PITCH) * fwd_w - np.sin(PITCH) * y_w
+    y_c = -np.sin(PITCH) * fwd_w - np.cos(PITCH) * y_w
+    u = x_w / z_c * F + IMG_W / 2.0
+    v = y_c / z_c * F + IMG_H / 2.0
+    return u, v
+def main():
+    """Run the R1-2 plane-fit fallback checks on the analytic ground-plane scene.
+    Prints one PASS/FAIL line per check and returns the exit status: 1 when
+    the fallback does not engage or any check fails, 0 otherwise.
+    """
+    ok = True
+    mask, z, x_w, fwd_w = scene()
+    # Check 1: the fallback must return a (homography, plane) pair here.
+    fitted = plane_homography_from_depth(z, mask, IMG_W, IMG_H)
+    if fitted is None:
+        print("  [FAIL] fallback did not engage on a clean metric ground plane")
+        print("\nR1-2 SIM CHECKS FAILED")
+        return 1
+    hom, plane = fitted
+    H = np.asarray(hom, np.float64).reshape(3, 3)
+    print(f"  [PASS] engages: plane {plane['width']:.2f} x {plane['height']:.2f} m, "
+          f"source={plane.get('geometrySource')}")
+    # Check 2: the scale estimator must measure ~1 m per plane unit (within 0.02).
+    mpu = estimate_meters_per_unit(z, mask, hom, IMG_W, IMG_H)
+    good = mpu is not None and abs(mpu - 1.0) <= 0.02
+    print(f"  [{'PASS' if good else 'FAIL'}] metric: metersPerUnit = {mpu}")
+    ok &= good
+    # known 1m ground square in the near field, centred
+    cx_w = 0.0
+    f0 = CAM_H / np.tan(PITCH) * 0.9   # comfortably inside the visible floor
+    corners_w = [(cx_w - 0.5, f0), (cx_w + 0.5, f0), (cx_w + 0.5, f0 + 1.0), (cx_w - 0.5, f0 + 1.0)]
+    # Project each world corner into the image, then through H onto the plane.
+    corners_p = []
+    for xw, fw in corners_w:
+        u, v = project_ground(xw, fw)
+        corners_p.append(to_plane(H, u, v))
+    corners_p = np.asarray(corners_p)
+    # Check 3: two adjacent sides and the angle between them.
+    s1 = np.linalg.norm(corners_p[1] - corners_p[0])
+    s2 = np.linalg.norm(corners_p[2] - corners_p[1])
+    d1 = corners_p[1] - corners_p[0]
+    d2 = corners_p[2] - corners_p[1]
+    # abs() folds the angle into [0, 90] deg, so only a lower bound is tested;
+    # the 1e-12 term keeps the ratio defined for a degenerate side.
+    angle = np.degrees(np.arccos(abs(d1 @ d2) / (s1 * s2 + 1e-12)))
+    square_ok = abs(s1 - 1) <= 0.02 and abs(s2 - 1) <= 0.02 and angle >= 88.0
+    print(f"  [{'PASS' if square_ok else 'FAIL'}] 1m square -> sides {s1:.3f} x {s2:.3f} m, "
+          f"corner angle {angle:.1f} deg")
+    ok &= square_ok
+    # orientation: nearer ground (smaller fwd) must have LARGER plane-y;
+    # world +x (image right) must have larger plane-x
+    u_near, v_near = project_ground(0.0, f0)
+    u_far, v_far = project_ground(0.0, f0 + 2.0)
+    _, b_near = to_plane(H, u_near, v_near)
+    _, b_far = to_plane(H, u_far, v_far)
+    u_r, v_r = project_ground(1.0, f0)
+    a_l, _ = to_plane(H, u_near, v_near)
+    a_r, _ = to_plane(H, u_r, v_r)
+    orient_ok = b_near > b_far and a_r > a_l
+    print(f"  [{'PASS' if orient_ok else 'FAIL'}] orientation: near-y {b_near:.2f} > far-y {b_far:.2f}, "
+          f"right-x {a_r:.2f} > left-x {a_l:.2f}")
+    ok &= orient_ok
+    # rejection: dome instead of plane
+    yy, xx = np.mgrid[0:IMG_H, 0:IMG_W].astype(np.float64)
+    # Depth of 2.5 with a Gaussian dip of 1.2 centred on the image.
+    dome = (2.5 - 1.2 * np.exp(-(((xx - IMG_W / 2) / 300) ** 2 + ((yy - IMG_H / 2) / 220) ** 2))).astype(np.float32)
+    r_dome = plane_homography_from_depth(dome, mask, IMG_W, IMG_H)
+    print(f"  [{'PASS' if r_dome is None else 'FAIL'}] rejection: dome depth -> {None if r_dome is None else 'accepted'}")
+    ok &= r_dome is None
+    # rejection: relative depth
+    # ns is the namespace the extracted functions were exec'd in, so swapping
+    # the predicate there changes what they see; it is restored right after.
+    ns["depth_model_is_metric"] = lambda name=None: False
+    r_rel = plane_homography_from_depth(z, mask, IMG_W, IMG_H)
+    ns["depth_model_is_metric"] = lambda name=None: True
+    print(f"  [{'PASS' if r_rel is None else 'FAIL'}] rejection: relative depth -> {None if r_rel is None else 'accepted'}")
+    ok &= r_rel is None
+    print("\n" + ("ALL R1-2 SIM CHECKS PASSED" if ok else "R1-2 SIM CHECKS FAILED"))
+    return 0 if ok else 1
+if __name__ == "__main__":
+    raise SystemExit(main())

verify_r1_scale.py ADDED Viewed

	@@ -0,0 +1,152 @@

+"""R1-3 — metric scene scale certification (local harness; needs
+torch+transformers, a reference bundle, and the photo it was converted from).
+Validates estimate_meters_per_unit end-to-end:
+  1. mpu resolves (not None) on real rooms with metric depth
+  2. plausibility: the visible floor's physical bottom width (plane.width *
+     mpu) lands in 1.5-10 m
+  3. independence check: a 60 cm tile's on-screen size predicted two ways
+     must agree within 15%:
+       a) through the scale chain: 0.6/mpu plane units -> homography -> pixels
+       b) straight from depth:    0.6 * f / Z  at the same image row
+  4. cross-room consistency: the same physical tile, the same prediction
+     logic, in every supplied room.
+Usage:
+    python verify_r1_scale.py <bundle.json[.gz]>:<photo.jpg> [more pairs...]
+"""
+import base64
+import gzip
+import json
+import sys
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+from transformers import AutoImageProcessor, AutoModelForDepthEstimation
+# --- real implementations from app.py ---------------------------------------
+src = open("app.py").read()
+ns = {"np": np, "cv2": cv2}
+start = src.index("def depth_model_is_metric")
+end = src.index("\nENABLE_DEPTH", start)
+exec(compile(src[start:end], "app.py", "exec"), ns)
+ns["depth_model_is_metric"] = lambda name=None: True  # harness always metric
+start = src.index("def estimate_meters_per_unit")
+end = src.index("\n# ---", start)
+exec(compile(src[start:end], "app.py", "exec"), ns)
+estimate_meters_per_unit = ns["estimate_meters_per_unit"]
+import re
+MODEL = re.search(r'depth_model_name",\s*\n(?:\s*#.*\n)*\s*"([^"]+)"', src).group(1)
+def load_bundle(path):
+    if path.endswith(".gz"):
+        with gzip.open(path, "rt") as f:
+            return json.load(f)
+    return json.load(open(path))
+def run_depth(img):
+    inputs = run_depth.processor(images=img, return_tensors="pt")
+    with torch.no_grad():
+        out = run_depth.model(**inputs)
+        depth = torch.nn.functional.interpolate(
+            out.predicted_depth.unsqueeze(1),
+            size=(img.height, img.width),
+            mode="bicubic",
+            align_corners=False,
+        ).squeeze().numpy()
+    return cv2.GaussianBlur(depth.astype(np.float32), (0, 0), sigmaX=3)
+def main():
+    pairs = [a.split(":") for a in sys.argv[1:]]
+    if not pairs:
+        print(__doc__)
+        return 2
+    print(f"model: {MODEL}")
+    run_depth.processor = AutoImageProcessor.from_pretrained(MODEL)
+    run_depth.model = AutoModelForDepthEstimation.from_pretrained(MODEL).eval()
+    ok = True
+    for bundle_path, photo in pairs:
+        d = load_bundle(bundle_path)
+        w, h = d["width"], d["height"]
+        seg = max(d["segments"], key=lambda s: len(s["mask"]))
+        mask_idx = np.frombuffer(base64.b64decode(seg["mask"]), dtype=np.uint32)
+        mask = np.zeros(w * h, np.uint8)
+        mask[mask_idx] = 1
+        mask = mask.reshape(h, w)
+        H = np.asarray(seg["homography"], np.float64).reshape(3, 3)
+        img = Image.open(photo).convert("RGB").resize((w, h), Image.LANCZOS)
+        depth = run_depth(img)
+        mpu = estimate_meters_per_unit(depth, mask, seg["homography"], w, h)
+        if mpu is None:
+            # A clean fallback is acceptable: rooms on the synthetic-VP
+            # homography can't carry a trustworthy metric scale until R1-2;
+            # the engine then uses the heuristic repeat. FAIL is reserved for
+            # a returned-but-wrong scale (checked below).
+            print(f"  [PASS] {photo.split('/')[-1]}: metersPerUnit = None "
+                  f"(clean heuristic fallback — geometry not metric-trustworthy)")
+            continue
+        plane = seg["plane"]
+        floor_w_m = plane["width"] * mpu
+        width_ok = 1.5 <= floor_w_m <= 10.0
+        # independence check at a bottom-area floor row
+        ys, xs = np.nonzero(mask)
+        y_ref = int(np.percentile(ys, 92))
+        row_xs = xs[ys == y_ref]
+        x_ref = int(np.median(row_xs))
+        z_ref = float(depth[y_ref, x_ref])
+        f = float(w)
+        px_from_depth = f * 0.6 / z_ref
+        # map (0.6/mpu) plane units back through H^-1 at the same location
+        Hinv = np.linalg.inv(H)
+        den = H[2, 0] * x_ref + H[2, 1] * y_ref + H[2, 2]
+        px_p = (H[0, 0] * x_ref + H[0, 1] * y_ref + H[0, 2]) / den
+        py_p = (H[1, 0] * x_ref + H[1, 1] * y_ref + H[1, 2]) / den
+        def back(up, vp):
+            dz = Hinv[2, 0] * up + Hinv[2, 1] * vp + Hinv[2, 2]
+            return (
+                (Hinv[0, 0] * up + Hinv[0, 1] * vp + Hinv[0, 2]) / dz,
+                (Hinv[1, 0] * up + Hinv[1, 1] * vp + Hinv[1, 2]) / dz,
+            )
+        units = 0.6 / mpu
+        ax, ay = back(px_p - units / 2, py_p)
+        bx, by = back(px_p + units / 2, py_p)
+        px_from_chain = float(np.hypot(bx - ax, by - ay))
+        rel_err = abs(px_from_chain - px_from_depth) / px_from_depth
+        chain_ok = rel_err <= 0.15
+        passed = width_ok and chain_ok
+        ok &= passed
+        print(
+            f"  [{'PASS' if passed else 'FAIL'}] {photo.split('/')[-1]}: "
+            f"mpu={mpu:.5f} m/unit | floor width = {floor_w_m:.2f} m | "
+            f"60cm tile @row{y_ref}: chain={px_from_chain:.0f}px vs depth={px_from_depth:.0f}px "
+            f"(err {rel_err * 100:.1f}%)"
+        )
+        if not width_ok:
+            print("        !! floor physical width implausible")
+        if not chain_ok:
+            print("        !! scale chain disagrees with direct depth prediction")
+    print("\n" + ("ALL R1-3 SCALE CHECKS PASSED" if ok else "R1-3 SCALE CHECKS FAILED"))
+    return 0 if ok else 1
+if __name__ == "__main__":
+    raise SystemExit(main())

verify_r1_scale_sim.py ADDED Viewed

	@@ -0,0 +1,105 @@

+"""R1-3 — metric scale certification on an exact synthetic scene (CI-safe:
+no torch, analytic depth).
+Scene: pinhole camera (f = image width), height 1.5 m, pitch 25 deg, looking
+at an infinite ground plane. Depth is computed analytically, the homography is
+built exactly from four ground points with a known plane-unit scale, so the
+true metersPerUnit is known in closed form.
+Checks (real implementation extracted from app.py):
+  1. recovery: estimate_meters_per_unit returns the true scale within 2%
+  2. rejection: a sheared homography (the synthetic-VP failure mode) returns
+     None instead of a confidently-wrong scale
+  3. relative depth (normalised [0,1]) returns None (metric-only feature)
+"""
+import cv2
+import numpy as np
+# --- real implementation from app.py ----------------------------------------
+src = open("app.py").read()
+ns = {"np": np, "cv2": cv2, "depth_model_is_metric": lambda name=None: True}
+start = src.index("def estimate_meters_per_unit")
+end = src.index("\n# ---", start)
+exec(compile(src[start:end], "app.py", "exec"), ns)
+estimate_meters_per_unit = ns["estimate_meters_per_unit"]
+W, H = 800, 600
+F = float(W)
+CAM_H = 1.5
+PITCH = np.deg2rad(25.0)
+UNITS_PER_M = 200.0          # plane-unit scale baked into the homography
+TRUE_MPU = 1.0 / UNITS_PER_M
+def scene():
+    cx, cy = W / 2.0, H / 2.0
+    u, v = np.meshgrid(np.arange(W, dtype=np.float64), np.arange(H, dtype=np.float64))
+    # ground plane: 1/Z = (sin(t) + cos(t) * (v - cy)/f) / h  (v grows downward)
+    inv_z = (np.sin(PITCH) + np.cos(PITCH) * (v - cy) / F) / CAM_H
+    mask = inv_z > 1.0 / 30.0          # floor visible, within 30 m
+    z = np.where(mask, 1.0 / np.maximum(inv_z, 1e-9), 0.0)
+    # camera-frame 3D, then world ground coordinates. Camera pitched DOWN by
+    # PITCH, world y up, image v down: world_y = -cos*y_c - sin*z (must be
+    # exactly -CAM_H on the ground — asserted), forward = cos*z - sin*y_c.
+    x_c = z * (u - cx) / F
+    y_c = z * (v - cy) / F
+    world_y = -np.cos(PITCH) * y_c - np.sin(PITCH) * z
+    assert np.allclose(world_y[mask], -CAM_H, atol=1e-9), "sim geometry inconsistent"
+    x_w = x_c
+    fwd_w = np.cos(PITCH) * z - np.sin(PITCH) * y_c
+    return mask.astype(np.uint8), z.astype(np.float32), x_w, fwd_w
+def exact_homography(mask, x_w, fwd_w):
+    ys, xs = np.nonzero(mask)
+    # four well-spread ground points
+    picks = []
+    for fy, fx in [(0.95, 0.2), (0.95, 0.8), (0.55, 0.3), (0.55, 0.7)]:
+        yy = int(np.percentile(ys, fy * 100))
+        row = xs[ys == yy]
+        xx = int(np.percentile(row, fx * 100))
+        picks.append((xx, yy))
+    src_pts = np.float32(picks)
+    dst_pts = np.float32(
+        [[x_w[y, x] * UNITS_PER_M, fwd_w[y, x] * UNITS_PER_M] for x, y in picks]
+    )
+    return cv2.getPerspectiveTransform(src_pts, dst_pts)
+def main():
+    ok = True
+    mask, z, x_w, fwd_w = scene()
+    Hm = exact_homography(mask, x_w, fwd_w)
+    mpu = estimate_meters_per_unit(z, mask, Hm.flatten().tolist(), W, H)
+    if mpu is None:
+        print("  [FAIL] recovery: returned None on exact scene")
+        ok = False
+    else:
+        err = abs(mpu - TRUE_MPU) / TRUE_MPU
+        good = err <= 0.02
+        print(f"  [{'PASS' if good else 'FAIL'}] recovery: mpu={mpu:.6f} "
+              f"(true {TRUE_MPU:.6f}, err {err * 100:.2f}%)")
+        ok &= good
+    # synthetic-VP failure mode: progressive horizontal shear of plane coords
+    S = np.array([[1.0, 0.35, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])
+    H_bad = S @ Hm
+    mpu_bad = estimate_meters_per_unit(z, mask, H_bad.flatten().tolist(), W, H)
+    print(f"  [{'PASS' if mpu_bad is None else 'FAIL'}] rejection: sheared homography -> {mpu_bad}")
+    ok &= mpu_bad is None
+    rel = (z - z[mask > 0].min()) / (z[mask > 0].max() - z[mask > 0].min())
+    ns["depth_model_is_metric"] = lambda name=None: False
+    mpu_rel = estimate_meters_per_unit(rel.astype(np.float32), mask, Hm.flatten().tolist(), W, H)
+    print(f"  [{'PASS' if mpu_rel is None else 'FAIL'}] relative depth -> {mpu_rel}")
+    ok &= mpu_rel is None
+    print("\n" + ("ALL R1-3 SIM CHECKS PASSED" if ok else "R1-3 SIM CHECKS FAILED"))
+    return 0 if ok else 1
+if __name__ == "__main__":
+    raise SystemExit(main())

visualizer.gpu.toml CHANGED Viewed

@@ -7,7 +7,7 @@ segmentation_model = "oneformer"
 oneformer_model_name = "shi-labs/oneformer_ade20k_swin_large"
 mask2former_model_name = "facebook/mask2former-swin-small-ade-semantic"
 segformer_model_name = "nvidia/segformer-b2-finetuned-ade-512-512"
-depth_model_name = "Intel/dpt-large"
 intrinsic_model_version = "v2"
 [runtime]

 oneformer_model_name = "shi-labs/oneformer_ade20k_swin_large"
 mask2former_model_name = "facebook/mask2former-swin-small-ade-semantic"
 segformer_model_name = "nvidia/segformer-b2-finetuned-ade-512-512"
+depth_model_name = "depth-anything/Depth-Anything-V2-Metric-Indoor-Small-hf"
 intrinsic_model_version = "v2"
 [runtime]

visualizer.hf.toml CHANGED Viewed

@@ -5,7 +5,7 @@
 [models]
 segmentation_model = "oneformer"
 oneformer_model_name = "shi-labs/oneformer_ade20k_swin_large"
-depth_model_name = "Intel/dpt-large"
 intrinsic_model_version = "v2"
 [runtime]

 [models]
 segmentation_model = "oneformer"
 oneformer_model_name = "shi-labs/oneformer_ade20k_swin_large"
+depth_model_name = "depth-anything/Depth-Anything-V2-Metric-Indoor-Small-hf"
 intrinsic_model_version = "v2"
 [runtime]

visualizer.local.toml CHANGED Viewed

@@ -5,7 +5,7 @@
 [models]
 segmentation_model = "segformer"
 segformer_model_name = "nvidia/segformer-b2-finetuned-ade-512-512"
-depth_model_name = "Intel/dpt-large"
 [runtime]
 enable_depth_estimation = false

 [models]
 segmentation_model = "segformer"
 segformer_model_name = "nvidia/segformer-b2-finetuned-ade-512-512"
+depth_model_name = "depth-anything/Depth-Anything-V2-Metric-Indoor-Small-hf"
 [runtime]
 enable_depth_estimation = false

visualizer.segformer.toml CHANGED Viewed

@@ -6,7 +6,7 @@
 [models]
 segmentation_model = "segformer"
 segformer_model_name = "nvidia/segformer-b2-finetuned-ade-512-512"
-depth_model_name = "Intel/dpt-large"
 [runtime]
 enable_depth_estimation = false

 [models]
 segmentation_model = "segformer"
 segformer_model_name = "nvidia/segformer-b2-finetuned-ade-512-512"
+depth_model_name = "depth-anything/Depth-Anything-V2-Metric-Indoor-Small-hf"
 [runtime]
 enable_depth_estimation = false