File size: 6,377 Bytes
5b6e3d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
"""Multi-view geometry helpers around pycolmap.

This module extracts the intrinsics/extrinsics we need for triangulation,
in a shape-normalised form that doesn't depend on pycolmap's exact version.

Everything here is pure numpy + pycolmap — no torch, no kornia — so it can
run inside the HuggingFace submission container without installs.

Key data structure: ``ViewInfo`` (a plain dict) with keys:

    image_id       str   — the short sample-level id (matches entry['image_ids'])
    colmap_img     pycolmap.Image
    camera_id      int
    K              (3,3) float64 — calibration matrix
    R              (3,3) float64 — world→camera rotation
    t              (3,)  float64 — world→camera translation
    P              (3,4) float64 — K @ [R | t]  (projection matrix)
    center         (3,)  float64 — camera centre in world coords, -R^T t
    width, height  int   — image resolution at COLMAP scale

Downstream code uses ``P`` for DLT triangulation and ``K, R, t`` for epipolar
geometry. All functions here are side-effect-free.
"""

from __future__ import annotations

import numpy as np

from hoho2025.example_solutions import _cam_matrix_from_image


def get_view_info(colmap_rec, img_id_substring: str) -> dict | None:
    """Return ViewInfo for the COLMAP image whose name contains ``img_id_substring``.

    Parameters
    ----------
    colmap_rec :
        pycolmap reconstruction exposing ``images`` and ``cameras`` mappings.
    img_id_substring : str
        Substring matched against each registered image's file name; also
        stored verbatim as the returned dict's ``image_id``.

    Returns
    -------
    dict | None
        The ViewInfo dict described in the module docstring, or ``None`` if
        no registered image name contains the substring.
    """
    # The mapping keys (COLMAP's internal image ids) are irrelevant here,
    # so iterate the values directly instead of discarding keys from .items().
    found = next(
        (img for img in colmap_rec.images.values() if img_id_substring in img.name),
        None,
    )
    if found is None:
        return None

    R, t = _cam_matrix_from_image(found)  # world→camera rotation/translation
    cam = colmap_rec.cameras[found.camera_id]
    K = np.asarray(cam.calibration_matrix(), dtype=np.float64)
    P = K @ np.hstack([R, t.reshape(3, 1)])  # 3x4 projection matrix K[R|t]
    center = -R.T @ t  # camera centre in world coordinates

    return {
        "image_id": img_id_substring,
        "colmap_img": found,
        "camera_id": int(found.camera_id),
        "K": K,
        "R": R,
        "t": t,
        "P": P,
        "center": center,
        "width": int(cam.width),
        "height": int(cam.height),
    }


def collect_views(colmap_rec, image_ids) -> dict[str, dict]:
    """Build a mapping ``{image_id → ViewInfo}`` for every id found in the recon.

    Ids that are not registered in the reconstruction are silently skipped,
    so the result may have fewer entries than ``image_ids`` — callers must
    tolerate missing keys.
    """
    views: dict[str, dict] = {}
    for image_id in image_ids:
        view = get_view_info(colmap_rec, image_id)
        if view is None:
            continue  # image not registered — skip, caller handles absence
        views[image_id] = view
    return views


def project_world_to_image(P: np.ndarray, points3d: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Project Nx3 world points through a 3x4 projection matrix.

    Returns
    -------
    uv : (N, 2) float64 — pixel coordinates
    z  : (N,)  float64 — camera-space depth (>0 means in front of the camera)
    """
    world = np.asarray(points3d, dtype=np.float64)
    if world.ndim == 1:
        world = world.reshape(1, 3)  # promote a single point to (1, 3)
    ones = np.ones((len(world), 1))
    proj = np.hstack([world, ones]) @ P.T  # (N, 3) homogeneous image coords
    z = proj[:, 2]
    # Clamp near-zero depths before dividing to avoid inf/nan pixels.
    denom = np.where(np.abs(z) < 1e-12, 1e-12, z)
    uv = proj[:, :2] / denom[:, None]
    return uv, z


def relative_pose(view_a: dict, view_b: dict) -> tuple[np.ndarray, np.ndarray]:
    """Return rotation and translation from view_a's frame to view_b's frame.

    If x_a is a point in view_a's camera frame, then
        x_b = R_ab @ x_a + t_ab
    with
        R_ab = R_b @ R_a^T
        t_ab = t_b - R_ab @ t_a
    """
    rot_rel = view_b["R"] @ view_a["R"].T
    trans_rel = view_b["t"] - rot_rel @ view_a["t"]
    return rot_rel, trans_rel


def _skew(v: np.ndarray) -> np.ndarray:
    x, y, z = v
    return np.array([[0, -z, y],
                     [z,  0, -x],
                     [-y, x,  0]], dtype=np.float64)


def fundamental_matrix(view_a: dict, view_b: dict) -> np.ndarray:
    """Compute the fundamental matrix F_ab such that
        x_b^T @ F_ab @ x_a = 0
    for corresponding points (in homogeneous pixel coordinates).

    Derivation: F = K_b^{-T} · [t_ab]× · R_ab · K_a^{-1}
    """
    rot_ab, trans_ab = relative_pose(view_a, view_b)
    essential = _skew(trans_ab) @ rot_ab
    # Lift the essential matrix from normalized to pixel coordinates
    # using the inverse calibration matrices of both views.
    return np.linalg.inv(view_b["K"]).T @ essential @ np.linalg.inv(view_a["K"])


def epipolar_line(F: np.ndarray, point_in_a: np.ndarray) -> np.ndarray:
    """Epipolar line in view b induced by a point in view a.

    Returns ``(a, b, c)`` with ``a*u + b*v + c = 0`` in view b.
    """
    # Homogenise the pixel coordinate before applying F.
    homog = np.array([point_in_a[0], point_in_a[1], 1.0], dtype=np.float64)
    return F @ homog


def point_to_line_distance(line: np.ndarray, point_uv: np.ndarray) -> float:
    """Perpendicular distance from a 2D point to a homogeneous line (a,b,c)."""
    # |a*u + b*v + c| / sqrt(a^2 + b^2); epsilon guards a degenerate line.
    residual = abs(line[0] * point_uv[0] + line[1] * point_uv[1] + line[2])
    norm = np.sqrt(line[0] * line[0] + line[1] * line[1]) + 1e-12
    return float(residual / norm)


def triangulate_dlt(Ps, pts2d) -> np.ndarray:
    """Linear triangulation (DLT) from ``>=2`` views.

    Parameters
    ----------
    Ps : sequence of (3,4) projection matrices
    pts2d : sequence of (x, y) pixel coordinates, one per view

    Returns the 3D point as a (3,) ndarray in world coordinates; all-NaN
    on SVD failure or a point at infinity.
    """
    # Each observation contributes two rows of the homogeneous system A X = 0.
    rows = []
    for P, (u, v) in zip(Ps, pts2d):
        rows.append(u * P[2] - P[0])
        rows.append(v * P[2] - P[1])
    design = np.asarray(rows, dtype=np.float64)
    try:
        _, _, Vt = np.linalg.svd(design)
    except Exception:
        return np.full(3, np.nan)
    X_h = Vt[-1]  # right singular vector of the smallest singular value
    if abs(X_h[3]) < 1e-12:
        return np.full(3, np.nan)  # point at infinity — no finite solution
    return X_h[:3] / X_h[3]


def mean_reprojection_error(X: np.ndarray, Ps, pts2d) -> float:
    """Mean L2 reprojection error of ``X`` across multiple views.

    Points behind the camera (depth <= 0) contribute a large penalty so the
    caller can use this as a direct cost for track acceptance. Non-finite
    ``X`` or an empty view set also yields ``inf``.
    """
    if not np.all(np.isfinite(X)):
        return float("inf")
    total = 0.0
    count = 0
    for P, uv in zip(Ps, pts2d):
        proj_uv, depth = project_world_to_image(P, X.reshape(1, 3))
        if depth[0] <= 0:
            return float("inf")  # behind the camera — hard reject
        total += float(np.linalg.norm(proj_uv[0] - np.asarray(uv, dtype=np.float64)))
        count += 1
    if count == 0:
        return float("inf")
    return total / count