File size: 6,377 Bytes
5b6e3d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
"""Multi-view geometry helpers around pycolmap.

This module extracts the intrinsics/extrinsics we need for triangulation,
in a shape-normalised form that doesn't depend on pycolmap's exact version.

Everything here is pure numpy + pycolmap — no torch, no kornia — so it can
run inside the HuggingFace submission container without installs.

Key data structure: ``ViewInfo`` (a plain dict) with keys:

    image_id       str   — the short sample-level id (matches entry['image_ids'])
    colmap_img     pycolmap.Image
    camera_id      int
    K              (3,3) float64 — calibration matrix
    R              (3,3) float64 — world→camera rotation
    t              (3,)  float64 — world→camera translation
    P              (3,4) float64 — K @ [R | t]  (projection matrix)
    center         (3,)  float64 — camera centre in world coords, -R^T t
    width, height  int   — image resolution at COLMAP scale

Downstream code uses ``P`` for DLT triangulation and ``K, R, t`` for epipolar
geometry. All functions here are side-effect-free.
"""

from __future__ import annotations

import numpy as np

from hoho2025.example_solutions import _cam_matrix_from_image


def get_view_info(colmap_rec, img_id_substring: str) -> dict | None:
    """Return ViewInfo for the COLMAP image whose name contains ``img_id_substring``.

    Parameters
    ----------
    colmap_rec :
        pycolmap reconstruction exposing ``images`` and ``cameras`` mappings.
    img_id_substring : str
        Substring matched against each registered image's file name; also
        stored verbatim as the returned dict's ``image_id``.

    Returns
    -------
    dict | None
        The ViewInfo dict described in the module docstring, or ``None`` if
        no registered image name contains the substring.
    """
    # The mapping keys (COLMAP's internal image ids) are irrelevant here,
    # so iterate the values directly instead of discarding keys from .items().
    found = next(
        (img for img in colmap_rec.images.values() if img_id_substring in img.name),
        None,
    )
    if found is None:
        return None

    R, t = _cam_matrix_from_image(found)  # world→camera rotation/translation
    cam = colmap_rec.cameras[found.camera_id]
    K = np.asarray(cam.calibration_matrix(), dtype=np.float64)
    P = K @ np.hstack([R, t.reshape(3, 1)])  # 3x4 projection matrix K[R|t]
    center = -R.T @ t  # camera centre in world coordinates

    return {
        "image_id": img_id_substring,
        "colmap_img": found,
        "camera_id": int(found.camera_id),
        "K": K,
        "R": R,
        "t": t,
        "P": P,
        "center": center,
        "width": int(cam.width),
        "height": int(cam.height),
    }


def collect_views(colmap_rec, image_ids) -> dict[str, dict]:
    """Build a mapping ``{image_id → ViewInfo}`` for every id found in the recon.

    Ids that are not registered in the reconstruction are silently skipped,
    so the result may have fewer entries than ``image_ids`` — callers must
    tolerate missing keys.
    """
    views: dict[str, dict] = {}
    for image_id in image_ids:
        view = get_view_info(colmap_rec, image_id)
        if view is None:
            continue  # image not registered — skip, caller handles absence
        views[image_id] = view
    return views


def project_world_to_image(P: np.ndarray, points3d: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Project Nx3 world points through a 3x4 projection matrix.

    Returns
    -------
    uv : (N, 2) float64 — pixel coordinates
    z  : (N,)  float64 — camera-space depth (>0 means in front of the camera)
    """
    world = np.asarray(points3d, dtype=np.float64)
    if world.ndim == 1:
        world = world.reshape(1, 3)  # promote a single point to (1, 3)
    ones = np.ones((len(world), 1))
    proj = np.hstack([world, ones]) @ P.T  # (N, 3) homogeneous image coords
    z = proj[:, 2]
    # Clamp near-zero depths before dividing to avoid inf/nan pixels.
    denom = np.where(np.abs(z) < 1e-12, 1e-12, z)
    uv = proj[:, :2] / denom[:, None]
    return uv, z


def relative_pose(view_a: dict, view_b: dict) -> tuple[np.ndarray, np.ndarray]:
    """Return rotation and translation from view_a's frame to view_b's frame.

    If x_a is a point in view_a's camera frame, then
        x_b = R_ab @ x_a + t_ab
    with
        R_ab = R_b @ R_a^T
        t_ab = t_b - R_ab @ t_a
    """
    rot_rel = view_b["R"] @ view_a["R"].T
    trans_rel = view_b["t"] - rot_rel @ view_a["t"]
    return rot_rel, trans_rel


def _skew(v: np.ndarray) -> np.ndarray:
    x, y, z = v
    return np.array([[0, -z, y],
                     [z,  0, -x],
                     [-y, x,  0]], dtype=np.float64)


def fundamental_matrix(view_a: dict, view_b: dict) -> np.ndarray:
    """Compute the fundamental matrix F_ab such that
        x_b^T @ F_ab @ x_a = 0
    for corresponding points (in homogeneous pixel coordinates).

    Derivation: F = K_b^{-T} · [t_ab]× · R_ab · K_a^{-1}
    """
    rot_ab, trans_ab = relative_pose(view_a, view_b)
    essential = _skew(trans_ab) @ rot_ab
    # Lift the essential matrix from normalized to pixel coordinates
    # using the inverse calibration matrices of both views.
    return np.linalg.inv(view_b["K"]).T @ essential @ np.linalg.inv(view_a["K"])


def epipolar_line(F: np.ndarray, point_in_a: np.ndarray) -> np.ndarray:
    """Epipolar line in view b induced by a point in view a.

    Returns ``(a, b, c)`` with ``a*u + b*v + c = 0`` in view b.
    """
    # Homogenise the pixel coordinate before applying F.
    homog = np.array([point_in_a[0], point_in_a[1], 1.0], dtype=np.float64)
    return F @ homog


def point_to_line_distance(line: np.ndarray, point_uv: np.ndarray) -> float:
    """Perpendicular distance from a 2D point to a homogeneous line (a,b,c)."""
    # |a*u + b*v + c| / sqrt(a^2 + b^2); epsilon guards a degenerate line.
    residual = abs(line[0] * point_uv[0] + line[1] * point_uv[1] + line[2])
    norm = np.sqrt(line[0] * line[0] + line[1] * line[1]) + 1e-12
    return float(residual / norm)


def triangulate_dlt(Ps, pts2d) -> np.ndarray:
    """Linear triangulation (DLT) from ``>=2`` views.

    Parameters
    ----------
    Ps : sequence of (3,4) projection matrices
    pts2d : sequence of (x, y) pixel coordinates, one per view

    Returns the 3D point as a (3,) ndarray in world coordinates; all-NaN
    on SVD failure or a point at infinity.
    """
    # Each observation contributes two rows of the homogeneous system A X = 0.
    rows = []
    for P, (u, v) in zip(Ps, pts2d):
        rows.append(u * P[2] - P[0])
        rows.append(v * P[2] - P[1])
    design = np.asarray(rows, dtype=np.float64)
    try:
        _, _, Vt = np.linalg.svd(design)
    except Exception:
        return np.full(3, np.nan)
    X_h = Vt[-1]  # right singular vector of the smallest singular value
    if abs(X_h[3]) < 1e-12:
        return np.full(3, np.nan)  # point at infinity — no finite solution
    return X_h[:3] / X_h[3]


def mean_reprojection_error(X: np.ndarray, Ps, pts2d) -> float:
    """Mean L2 reprojection error of ``X`` across multiple views.

    Points behind the camera (depth <= 0) contribute a large penalty so the
    caller can use this as a direct cost for track acceptance. Non-finite
    ``X`` or an empty view set also yields ``inf``.
    """
    if not np.all(np.isfinite(X)):
        return float("inf")
    total = 0.0
    count = 0
    for P, uv in zip(Ps, pts2d):
        proj_uv, depth = project_world_to_image(P, X.reshape(1, 3))
        if depth[0] <= 0:
            return float("inf")  # behind the camera — hard reject
        total += float(np.linalg.norm(proj_uv[0] - np.asarray(uv, dtype=np.float64)))
        count += 1
    if count == 0:
        return float("inf")
    return total / count