Spaces:
Running
Running
File size: 6,193 Bytes
23680f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 |
"""Compute orchestration pipelines for HyperView.
These functions coordinate embedding computation and 2D layout/projection
computation, persisting results into the configured storage backend.
"""
from __future__ import annotations
import numpy as np
from hyperview.embeddings.providers import ModelSpec, get_provider, make_provider_aware_space_key
from hyperview.storage.backend import StorageBackend
from hyperview.storage.schema import make_layout_key
def compute_embeddings(
    storage: StorageBackend,
    model_spec: ModelSpec,
    batch_size: int = 32,
    show_progress: bool = True,
) -> tuple[str, int, int]:
    """Embed the stored samples that have no vector yet in the model's space.

    Args:
        storage: Backend that supplies the samples and receives the embeddings.
        model_spec: Describes the model; its ``provider`` selects the embedding
            provider and its ``model_id`` is recorded on the space.
        batch_size: Forwarded to the provider's ``compute_embeddings``.
        show_progress: Print status lines here and forward the flag to the
            provider.

    Returns:
        ``(space_key, num_computed, num_skipped)``, where ``num_skipped`` is
        the count of stored samples that this call did not embed.

    Raises:
        ValueError: If storage contains no samples. (The provider lookup runs
            first and may raise on its own for an unknown provider.)
    """
    embedder = get_provider(model_spec.provider)

    stored_samples = storage.get_all_samples()
    if not stored_samples:
        raise ValueError("No samples in storage")
    total = len(stored_samples)

    # The key is derived from the spec, so it is available before any
    # embedding work is done.
    space_key = make_provider_aware_space_key(model_spec)

    # Ask storage which samples lack a vector; when the space itself is
    # absent, that answer is replaced by "every sample".
    pending_ids = storage.get_missing_embedding_ids(space_key)
    if not storage.get_space(space_key):
        pending_ids = [sample.id for sample in stored_samples]

    num_skipped = total - len(pending_ids)

    # Nothing left to embed: report and leave early.
    if not pending_ids:
        if show_progress:
            print(f"All {total} samples already have embeddings in space '{space_key}'")
        return space_key, 0, num_skipped

    pending_samples = storage.get_samples_by_ids(pending_ids)
    if show_progress and num_skipped > 0:
        print(f"Skipped {num_skipped} samples with existing embeddings")

    # Single pass over the pending samples; the embedding width is read from
    # the result rather than probed up front.
    vectors = embedder.compute_embeddings(
        samples=pending_samples,
        model_spec=model_spec,
        batch_size=batch_size,
        show_progress=show_progress,
    )
    dim = vectors.shape[1]

    # Register the space (a no-op or creation, as decided by the backend)
    # before writing vectors into it.
    space_config = embedder.get_space_config(model_spec, dim)
    storage.ensure_space(
        model_id=model_spec.model_id,
        dim=dim,
        config=space_config,
        space_key=space_key,
    )

    embedded_ids = [sample.id for sample in pending_samples]
    storage.add_embeddings(space_key, embedded_ids, vectors)
    return space_key, len(embedded_ids), num_skipped
def compute_layout(
    storage: StorageBackend,
    space_key: str | None = None,
    method: str = "umap",
    geometry: str = "euclidean",
    n_neighbors: int = 15,
    min_dist: float = 0.1,
    metric: str = "cosine",
    force: bool = False,
    show_progress: bool = True,
) -> str:
    """Compute a 2D layout/projection of an embedding space and persist it.

    Args:
        storage: Storage backend holding the embeddings; receives the layout.
        space_key: Embedding space to project. If None, the first entry of
            ``storage.list_spaces()`` is used.
        method: Projection method; only ``'umap'`` is accepted.
        geometry: Output geometry, ``'euclidean'`` or ``'poincare'``.
        n_neighbors: Passed to the projection engine and part of the layout key.
        min_dist: Passed to the projection engine and part of the layout key.
        metric: Passed to the projection engine and part of the layout key.
        force: Skip the reuse check and recompute even if a layout exists.
        show_progress: Whether to print progress messages.

    Returns:
        The layout key of the reused or newly computed layout.

    Raises:
        ValueError: If ``method`` or ``geometry`` is unsupported, no embedding
            spaces exist, the space is not found, or the space holds fewer
            than 3 embeddings.
    """
    # Validate the cheap arguments first so a bad call fails before storage is
    # touched or the projection module is imported.
    if method != "umap":
        raise ValueError(f"Invalid method: {method}. Only 'umap' is supported.")
    if geometry not in ("euclidean", "poincare"):
        raise ValueError(f"Invalid geometry: {geometry}. Must be 'euclidean' or 'poincare'.")

    if space_key is None:
        spaces = storage.list_spaces()
        if not spaces:
            raise ValueError("No embedding spaces. Call compute_embeddings() first.")
        space_key = spaces[0].space_key

    space = storage.get_space(space_key)
    if space is None:
        raise ValueError(f"Space not found: {space_key}")

    input_geometry = space.geometry
    # config may be None; a missing "curvature" entry yields None as well.
    curvature = (space.config or {}).get("curvature")

    ids, vectors = storage.get_embeddings(space_key)
    # len() is used instead of truthiness because the container type of `ids`
    # is not visible here (it could be an array).
    if len(ids) == 0:
        raise ValueError(f"No embeddings in space '{space_key}'. Call compute_embeddings() first.")
    if len(ids) < 3:
        raise ValueError(f"Need at least 3 samples for visualization, have {len(ids)}")

    layout_params = {
        "n_neighbors": n_neighbors,
        "min_dist": min_dist,
        "metric": metric,
    }
    layout_key = make_layout_key(space_key, method, geometry, layout_params)

    if not force:
        existing_layout = storage.get_layout(layout_key)
        if existing_layout is not None:
            existing_ids, _ = storage.get_layout_coords(layout_key)
            # Reuse only when the stored layout covers exactly the current ids.
            if set(existing_ids) == set(ids):
                if show_progress:
                    print(f"Layout '{layout_key}' already exists with {len(ids)} points")
                return layout_key
            # NOTE(review): the recomputed coords are written under the same
            # layout_key; whether rows for ids that no longer exist are
            # replaced depends on add_layout_coords - confirm.
            if show_progress:
                print("Layout exists but has different samples, recomputing...")

    if show_progress:
        print(f"Computing {geometry} {method} layout for {len(ids)} samples...")

    # Imported only once a projection is really required, so validation
    # errors and the cached-layout path do not depend on this module.
    from hyperview.embeddings.projection import ProjectionEngine

    engine = ProjectionEngine()
    coords = engine.project(
        vectors,
        input_geometry=input_geometry,
        output_geometry=geometry,
        curvature=curvature,
        method=method,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
    )

    # Register the layout only after the projection succeeded, so a failed
    # projection does not leave a layout record without coordinates.
    storage.ensure_layout(
        layout_key=layout_key,
        space_key=space_key,
        method=method,
        geometry=geometry,
        params=layout_params,
    )
    storage.add_layout_coords(layout_key, ids, coords)
    return layout_key
|