morozovdd's picture
feat: add HyperView app for space
23680f2
"""Projection methods for dimensionality reduction."""
import logging
import warnings
import numpy as np
import umap
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class ProjectionEngine:
"""Engine for projecting high-dimensional embeddings to 2D."""
def to_poincare_ball(
self,
hyperboloid_embeddings: np.ndarray,
curvature: float | None = None,
clamp_radius: float = 0.999999,
) -> np.ndarray:
"""Convert hyperboloid (Lorentz) coordinates to Poincaré ball coordinates.
Input is expected to be shape (N, D+1) with first coordinate being time-like.
Points are assumed to satisfy: t^2 - ||x||^2 = 1/c (c > 0).
Returns Poincaré ball coordinates of shape (N, D) in the unit ball.
Notes:
- Many hyperbolic libraries parameterize curvature as a positive number c
where the manifold has sectional curvature -c.
- We map to the unit ball for downstream distance metrics (UMAP 'poincare').
"""
if hyperboloid_embeddings.ndim != 2 or hyperboloid_embeddings.shape[1] < 2:
raise ValueError(
"hyperboloid_embeddings must have shape (N, D+1) with D>=1"
)
c = float(curvature) if curvature is not None else 1.0
if c <= 0:
raise ValueError(f"curvature must be > 0, got {c}")
# Radius R = 1/sqrt(c) for curvature -c
R = 1.0 / np.sqrt(c)
t = hyperboloid_embeddings[:, :1]
x = hyperboloid_embeddings[:, 1:]
# Map to ball radius R: u_R = x / (t + R)
denom = t + R
u_R = x / denom
# Rescale to unit ball: u = u_R / R = sqrt(c) * u_R
u = u_R / R
# Numerical guard: ensure inside the unit ball
radii = np.linalg.norm(u, axis=1)
mask = radii >= clamp_radius
if np.any(mask):
u[mask] = u[mask] / radii[mask][:, np.newaxis] * clamp_radius
return u.astype(np.float32)
def project(
self,
embeddings: np.ndarray,
*,
input_geometry: str = "euclidean",
output_geometry: str = "euclidean",
curvature: float | None = None,
method: str = "umap",
n_neighbors: int = 15,
min_dist: float = 0.1,
metric: str = "cosine",
random_state: int = 42,
) -> np.ndarray:
"""Project embeddings to 2D with geometry-aware preprocessing.
This separates two concerns:
1) Geometry/model transforms for the *input* embeddings (e.g. hyperboloid -> Poincaré)
2) Dimensionality reduction / layout (currently UMAP)
Args:
embeddings: Input embeddings (N x D) or hyperboloid (N x D+1).
input_geometry: Geometry/model of the input embeddings (euclidean, hyperboloid).
output_geometry: Geometry of the output coordinates (euclidean, poincare).
curvature: Curvature parameter for hyperbolic embeddings (positive c).
method: Layout method (currently only 'umap').
n_neighbors: UMAP neighbors.
min_dist: UMAP min_dist.
metric: Input metric (used for euclidean inputs).
random_state: Random seed.
Returns:
2D coordinates (N x 2).
"""
if method != "umap":
raise ValueError(f"Invalid method: {method}. Only 'umap' is supported.")
prepared = embeddings
prepared_metric: str = metric
if input_geometry == "hyperboloid":
# Convert to unit Poincaré ball and use UMAP's built-in hyperbolic distance.
prepared = self.to_poincare_ball(embeddings, curvature=curvature)
prepared_metric = "poincare"
if output_geometry == "poincare":
return self.project_to_poincare(
prepared,
n_neighbors=n_neighbors,
min_dist=min_dist,
metric=prepared_metric,
random_state=random_state,
)
if output_geometry == "euclidean":
return self.project_umap(
prepared,
n_neighbors=n_neighbors,
min_dist=min_dist,
metric=prepared_metric,
n_components=2,
random_state=random_state,
)
raise ValueError(
f"Invalid output_geometry: {output_geometry}. Must be 'euclidean' or 'poincare'."
)
def project_umap(
self,
embeddings: np.ndarray,
n_neighbors: int = 15,
min_dist: float = 0.1,
metric: str = "cosine",
n_components: int = 2,
random_state: int = 42,
) -> np.ndarray:
"""Project embeddings to Euclidean 2D using UMAP."""
n_neighbors = min(n_neighbors, len(embeddings) - 1)
if n_neighbors < 2:
n_neighbors = 2
n_jobs = 1 if random_state is not None else -1
reducer = umap.UMAP(
n_neighbors=n_neighbors,
min_dist=min_dist,
n_components=n_components,
metric=metric,
random_state=random_state,
n_jobs=n_jobs,
)
coords = reducer.fit_transform(embeddings)
coords = self._normalize_coords(coords)
return coords
def project_to_poincare(
self,
embeddings: np.ndarray,
n_neighbors: int = 15,
min_dist: float = 0.1,
metric: str = "cosine",
random_state: int = 42,
) -> np.ndarray:
"""Project embeddings to the Poincaré disk using UMAP with hyperboloid output."""
n_neighbors = min(n_neighbors, len(embeddings) - 1)
if n_neighbors < 2:
n_neighbors = 2
n_jobs = 1 if random_state is not None else -1
# Suppress warning about missing gradient for poincare metric (only affects inverse_transform)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="gradient function is not yet implemented")
reducer = umap.UMAP(
n_neighbors=n_neighbors,
min_dist=min_dist,
n_components=2,
metric=metric,
output_metric="hyperboloid",
random_state=random_state,
n_jobs=n_jobs,
)
spatial_coords = reducer.fit_transform(embeddings)
squared_norm = np.sum(spatial_coords**2, axis=1)
t = np.sqrt(1 + squared_norm)
# Project to Poincaré disk: u = x / (1 + t)
denom = 1 + t
poincare_coords = spatial_coords / denom[:, np.newaxis]
# Clamp to unit disk for numerical stability
radii = np.linalg.norm(poincare_coords, axis=1)
max_radius = 0.999
mask = radii > max_radius
if np.any(mask):
logger.warning(f"Clamping {np.sum(mask)} points to unit disk.")
poincare_coords[mask] = (
poincare_coords[mask] / radii[mask][:, np.newaxis] * max_radius
)
poincare_coords = self._center_poincare(poincare_coords)
poincare_coords = self._scale_poincare(poincare_coords, factor=0.65)
return poincare_coords
def _scale_poincare(self, coords: np.ndarray, factor: float) -> np.ndarray:
"""Scale points towards the origin in hyperbolic space.
Scales hyperbolic distance from origin by `factor`. If factor < 1, points move closer to center.
"""
radii = np.linalg.norm(coords, axis=1)
mask = radii > 1e-6
r = radii[mask]
r = np.minimum(r, 0.9999999)
r_new = np.tanh(factor * np.arctanh(r))
scale_ratios = np.ones_like(radii)
scale_ratios[mask] = r_new / r
return coords * scale_ratios[:, np.newaxis]
def _center_poincare(self, coords: np.ndarray) -> np.ndarray:
"""Center points in the Poincaré disk using a Möbius transformation."""
if len(coords) == 0:
return coords
z = coords[:, 0] + 1j * coords[:, 1]
centroid = np.mean(z)
if np.abs(centroid) > 0.99 or np.abs(centroid) < 1e-6:
return coords
# Möbius transformation: w = (z - a) / (1 - conj(a) * z)
a = centroid
w = (z - a) / (1 - np.conj(a) * z)
return np.stack([w.real, w.imag], axis=1)
def _normalize_coords(self, coords: np.ndarray) -> np.ndarray:
"""Normalize coordinates to [-1, 1] range."""
if len(coords) == 0:
return coords
coords = coords - coords.mean(axis=0)
max_abs = np.abs(coords).max()
if max_abs > 0:
coords = coords / max_abs * 0.95
return coords
def poincare_distance(self, u: np.ndarray, v: np.ndarray) -> float:
"""Compute the Poincaré distance between two points."""
u_norm_sq = np.sum(u**2)
v_norm_sq = np.sum(v**2)
diff_norm_sq = np.sum((u - v) ** 2)
u_norm_sq = min(u_norm_sq, 0.99999)
v_norm_sq = min(v_norm_sq, 0.99999)
delta = 2 * diff_norm_sq / ((1 - u_norm_sq) * (1 - v_norm_sq))
return np.arccosh(1 + delta)