# mvs_utils.py — multi-view geometry helpers (handcrafted_submission_2026)
"""Multi-view geometry helpers around pycolmap.
This module extracts the intrinsics/extrinsics we need for triangulation,
in a shape-normalised form that doesn't depend on pycolmap's exact version.
Everything here is pure numpy + pycolmap — no torch, no kornia — so it can
run inside the HuggingFace submission container without installs.
Key data structure: ``ViewInfo`` (a plain dict) with keys:
image_id str — the short sample-level id (matches entry['image_ids'])
colmap_img pycolmap.Image
camera_id int
K (3,3) float64 — calibration matrix
R (3,3) float64 — world→camera rotation
t (3,) float64 — world→camera translation
P (3,4) float64 — K @ [R | t] (projection matrix)
center (3,) float64 — camera centre in world coords, -R^T t
width, height int — image resolution at COLMAP scale
Downstream code uses ``P`` for DLT triangulation and ``K, R, t`` for epipolar
geometry. All functions here are side-effect-free.
"""
from __future__ import annotations
import numpy as np
from hoho2025.example_solutions import _cam_matrix_from_image
def get_view_info(colmap_rec, img_id_substring: str) -> dict | None:
    """Return ViewInfo for the COLMAP image whose name contains ``img_id_substring``.

    Returns None if the image is not registered in the reconstruction.
    """
    # First registered image whose filename contains the requested id.
    match = next(
        (img for img in colmap_rec.images.values() if img_id_substring in img.name),
        None,
    )
    if match is None:
        return None
    rot, trans = _cam_matrix_from_image(match)  # world→camera R, t
    camera = colmap_rec.cameras[match.camera_id]
    calib = np.asarray(camera.calibration_matrix(), dtype=np.float64)
    # Projection matrix P = K [R | t].
    proj = calib @ np.concatenate([rot, trans.reshape(3, 1)], axis=1)
    return {
        "image_id": img_id_substring,
        "colmap_img": match,
        "camera_id": int(match.camera_id),
        "K": calib,
        "R": rot,
        "t": trans,
        "P": proj,
        # Camera centre in world coordinates: C = -R^T t.
        "center": -rot.T @ trans,
        "width": int(camera.width),
        "height": int(camera.height),
    }
def collect_views(colmap_rec, image_ids) -> dict[str, dict]:
    """Build a mapping ``{image_id → ViewInfo}`` for every id found in the recon.

    Skips ids that are not registered (returns fewer items than requested
    — caller must handle the missing keys).
    """
    return {
        iid: view
        for iid in image_ids
        if (view := get_view_info(colmap_rec, iid)) is not None
    }
def project_world_to_image(P: np.ndarray, points3d: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Project Nx3 world points through a 3x4 projection matrix.

    Returns
    -------
    uv : (N, 2) float64 — pixel coordinates
    z : (N,) float64 — camera-space depth (>0 means in front of the camera)
    """
    world = np.atleast_2d(np.asarray(points3d, dtype=np.float64))
    n = world.shape[0]
    homog = np.concatenate([world, np.ones((n, 1))], axis=1)
    proj = (P @ homog.T).T  # (N, 3) homogeneous image coordinates
    depth = proj[:, 2]
    # Clamp near-zero depths so the division never blows up to inf/NaN.
    denom = np.where(np.abs(depth) < 1e-12, 1e-12, depth)
    pixels = proj[:, :2] / denom[:, None]
    return pixels, depth
def relative_pose(view_a: dict, view_b: dict) -> tuple[np.ndarray, np.ndarray]:
    """Rotation and translation taking view_a's camera frame into view_b's.

    For a point x_a expressed in view_a's camera frame,

        x_b = R_ab @ x_a + t_ab

    where R_ab = R_b @ R_a^T and t_ab = t_b - R_ab @ t_a.
    """
    rot = view_b["R"] @ view_a["R"].T
    shift = view_b["t"] - rot @ view_a["t"]
    return rot, shift
def _skew(v: np.ndarray) -> np.ndarray:
x, y, z = v
return np.array([[0, -z, y],
[z, 0, -x],
[-y, x, 0]], dtype=np.float64)
def fundamental_matrix(view_a: dict, view_b: dict) -> np.ndarray:
    """Compute the fundamental matrix F_ab such that

        x_b^T @ F_ab @ x_a = 0

    for corresponding points (in homogeneous pixel coordinates).

    Derivation: F = K_b^{-T} · [t_ab]× · R_ab · K_a^{-1}
    """
    R_a, t_a = view_a["R"], view_a["t"]
    R_b, t_b = view_b["R"], view_b["t"]
    # Relative motion a → b (same convention as ``relative_pose``).
    R_rel = R_b @ R_a.T
    t_rel = t_b - R_rel @ t_a
    # Essential matrix E = [t]× R with the skew matrix built inline.
    tx, ty, tz = t_rel
    t_cross = np.array([[0.0, -tz, ty],
                        [tz, 0.0, -tx],
                        [-ty, tx, 0.0]], dtype=np.float64)
    E = t_cross @ R_rel
    # Lift the essential matrix to pixel space with the two calibrations.
    return np.linalg.inv(view_b["K"]).T @ E @ np.linalg.inv(view_a["K"])
def epipolar_line(F: np.ndarray, point_in_a: np.ndarray) -> np.ndarray:
    """Epipolar line in view b induced by a pixel in view a.

    Returns the homogeneous line ``(a, b, c)`` with ``a*u + b*v + c = 0``
    holding for any match (u, v) of ``point_in_a`` in view b.
    """
    u, v = float(point_in_a[0]), float(point_in_a[1])
    return F @ np.array([u, v, 1.0])
def point_to_line_distance(line: np.ndarray, point_uv: np.ndarray) -> float:
    """Perpendicular distance from a 2D point to a homogeneous line (a,b,c)."""
    a, b, c = line
    signed = a * point_uv[0] + b * point_uv[1] + c
    # Epsilon guards against a degenerate (0, 0, c) line.
    scale = np.sqrt(a * a + b * b) + 1e-12
    return float(abs(signed) / scale)
def triangulate_dlt(Ps, pts2d) -> np.ndarray:
    """Linear triangulation (DLT) from ``>=2`` views.

    Parameters
    ----------
    Ps : sequence of (3,4) projection matrices
    pts2d : sequence of (x, y) pixel coordinates, one per view

    Returns
    -------
    (3,) float64 ndarray — the world point, or all-NaN when the problem is
    degenerate: fewer than two views, SVD non-convergence, or a homogeneous
    solution at infinity.
    """
    rows = []
    for P, (x, y) in zip(Ps, pts2d):
        # Each observation contributes two linear constraints on X:
        #   x * (P_3 · X) - (P_1 · X) = 0
        #   y * (P_3 · X) - (P_2 · X) = 0
        rows.append(x * P[2] - P[0])
        rows.append(y * P[2] - P[1])
    # A 3D point is only determined by >= 2 views (4 rows); with fewer the
    # null space is not unique, so report a degenerate result explicitly.
    if len(rows) < 4:
        return np.full(3, np.nan)
    A = np.asarray(rows, dtype=np.float64)
    try:
        _, _, Vt = np.linalg.svd(A)
    except np.linalg.LinAlgError:
        # SVD failed to converge — signal failure rather than raise.
        return np.full(3, np.nan)
    X = Vt[-1]  # right singular vector of the smallest singular value
    if abs(X[3]) < 1e-12:
        # Point at infinity: cannot be dehomogenised.
        return np.full(3, np.nan)
    return X[:3] / X[3]
def mean_reprojection_error(X: np.ndarray, Ps, pts2d) -> float:
    """Mean L2 reprojection error of ``X`` across multiple views.

    Points behind the camera (depth <= 0) contribute a large penalty so the
    caller can use this as a direct cost for track acceptance.
    """
    if not np.all(np.isfinite(X)):
        return float("inf")
    Xh = np.append(X.reshape(3), 1.0)  # homogeneous world point
    errors = []
    for P, observed in zip(Ps, pts2d):
        proj = P @ Xh
        depth = proj[2]
        if depth <= 0:
            # Behind (or exactly on) the camera plane — reject the track.
            return float("inf")
        # Same near-zero clamp as ``project_world_to_image``.
        denom = depth if abs(depth) >= 1e-12 else 1e-12
        residual = proj[:2] / denom - np.asarray(observed, dtype=np.float64)
        errors.append(float(np.linalg.norm(residual)))
    if not errors:
        return float("inf")
    return float(np.mean(errors))