diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..f710191d454afeb67ebf2ebca857866d2ba867e0 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,18 @@ +# Git +.git + +# Python +__pycache__ +*.py[cod] +.venv +venv + +# Caches +.mypy_cache +.pytest_cache + +# Local runtime artifacts +demo_data + +# Misc +.DS_Store \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..4d9a2697d3281d7ca63d23b120ea1a70b7515a92 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,56 @@ +FROM python:3.11-slim + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + git \ + libssl-dev \ + pkg-config \ + && rm -rf /var/lib/apt/lists/* + +RUN useradd -m -u 1000 user +USER user + +ENV HOME=/home/user \ + PATH=/home/user/.local/bin:$PATH \ + HF_HOME=/home/user/.cache/huggingface \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 + +WORKDIR $HOME/app + +RUN pip install --upgrade pip + +RUN pip install \ + "fastapi==0.128.0" \ + "uvicorn[standard]==0.40.0" \ + "numpy==2.3.5" \ + "umap-learn==0.5.11" \ + "Pillow==12.1.0" \ + "pydantic==2.12.5" \ + "aiofiles==25.1.0" \ + "datasets==4.5.0" \ + "lancedb==0.27.1" \ + "pyarrow==22.0.0" \ + "torch==2.9.1" \ + "torchvision==0.24.1" \ + "timm==1.0.24" + +COPY --chown=user vendor ./vendor +COPY --chown=user demo.py ./demo.py + +ENV PYTHONPATH=/home/user/app/vendor \ + HYPERVIEW_DATASETS_DIR=/home/user/app/demo_data/datasets \ + HYPERVIEW_MEDIA_DIR=/home/user/app/demo_data/media + +RUN python -c "import hyperview; print('hyperview', hyperview.__version__)" + +# Precompute at build time so the Space starts fast. +RUN python -c "from demo import build_dataset; build_dataset()" + +EXPOSE 7860 + +HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \ + CMD curl -f http://localhost:7860/__hyperview__/health || exit 1 + +CMD ["python", "demo.py"] \ No newline at end of file diff --git a/README.md b/README.md index bdffbcb8668c075b4d919beddc27acf1bfa6995d..5d2f6ef1ffb95db4e78bb1d6c78ea5cc5ac58b69 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,50 @@ --- -title: HyperView Jaguar ReID -emoji: 🚀 -colorFrom: pink -colorTo: pink +title: HyperView-Jaguar-ReID +emoji: 🐆 +colorFrom: yellow +colorTo: green sdk: docker +app_port: 7860 pinned: false --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# HyperView - Jaguar Re-ID (MegaDescriptor + Sphere) + +This Space runs the Jaguar Re-ID dataset through the MegaDescriptor timm +backbone and renders the result with HyperView's spherical 3D layout. + +Unlike the Imagenette starter, this folder vendors the current HyperView source +under `vendor/hyperview/`. The released `hyperview==0.2.0` wheel does not yet +include the `timm-image` provider or spherical layout support required by this +demo, so the Space builds against the local source snapshot instead of PyPI. + +This demo uses: + +- Hugging Face dataset `hyper3labs/jaguar-re-id` +- Config `default` +- Split `train` +- Image field `image` +- Label field `label` +- Sample count `200` +- Embedding model `hf-hub:BVRA/MegaDescriptor-L-384` +- Layout `spherical` (3D) + +## Build model + +The Dockerfile precomputes the dataset, embeddings, and layout during image +build so the runtime container only needs to launch HyperView. + +Because MegaDescriptor inference runs during Docker build on CPU, this Space +keeps the sample count modest and uses a smaller batch size than the local demo +script to stay within typical Hugging Face build limits. + +## Vendored source + +This folder includes a vendored `vendor/hyperview/` snapshot copied from the +main HyperView repository. Keep that snapshot in sync with the local repo if +you update the jaguar demo or any unreleased HyperView behavior it depends on. + +## Deploy source + +This folder is synchronized to Hugging Face Spaces by GitHub Actions from the +`hyperview-spaces` deployment repository. \ No newline at end of file diff --git a/__pycache__/demo.cpython-312.pyc b/__pycache__/demo.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f95c061e9f2fa95bf528f5ee30aa220d8fbe44dc Binary files /dev/null and b/__pycache__/demo.cpython-312.pyc differ diff --git a/demo.py b/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..ecd58eaea1090862f0de1bbe0abdc371d884d24a --- /dev/null +++ b/demo.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python +"""HyperView Jaguar Re-ID Hugging Face Space.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +SPACE_HOST = "0.0.0.0" +SPACE_PORT = 7860 + +DATASET_NAME = "jaguar_reid_megadescriptor_spherical_space" +HF_DATASET = "hyper3labs/jaguar-re-id" +HF_CONFIG = "default" +HF_SPLIT = "train" +HF_IMAGE_KEY = "image" +HF_LABEL_KEY = "label" +SAMPLE_COUNT = 200 +MODEL_ID = "hf-hub:BVRA/MegaDescriptor-L-384" +BATCH_SIZE = 4 + +ROOT = Path(__file__).resolve().parent +VENDOR_DIR = ROOT / "vendor" +if str(VENDOR_DIR) not in sys.path: + sys.path.insert(0, str(VENDOR_DIR)) + +import hyperview as hv + + +def build_dataset() -> hv.Dataset: + dataset = hv.Dataset(DATASET_NAME) + + if len(dataset) == 0: + print(f"Loading {SAMPLE_COUNT} samples from {HF_DATASET} [{HF_CONFIG}] ({HF_SPLIT})...") + dataset.add_from_huggingface( + HF_DATASET, + config=HF_CONFIG, + split=HF_SPLIT, + image_key=HF_IMAGE_KEY, + label_key=HF_LABEL_KEY, + max_samples=SAMPLE_COUNT, + ) + + print(f"Ensuring MegaDescriptor embeddings ({MODEL_ID})...") + space_key = dataset.compute_embeddings( + model=MODEL_ID, + provider="timm-image", + batch_size=BATCH_SIZE, + show_progress=True, + ) + + print("Ensuring spherical layout...") + dataset.compute_visualization(space_key=space_key, layout="spherical") + + return dataset + + +def main() -> None: + dataset = build_dataset() + print(f"Starting HyperView on {SPACE_HOST}:{SPACE_PORT}") + hv.launch(dataset, host=SPACE_HOST, port=SPACE_PORT, open_browser=False) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/vendor/hyperview/__init__.py b/vendor/hyperview/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d055599584e7c0f2025c447d8592bca4cc5c511e --- /dev/null +++ b/vendor/hyperview/__init__.py @@ -0,0 +1,14 @@ +"""HyperView - Open-source dataset curation with hyperbolic embeddings visualization.""" + +from . import _version as _version +from . import api as _api + +Dataset = _api.Dataset +launch = _api.launch +__version__ = _version.__version__ + +__all__ = [ + "Dataset", + "launch", + "__version__", +] diff --git a/vendor/hyperview/__pycache__/__init__.cpython-312.pyc b/vendor/hyperview/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc553dd5f5ef896cc0379df1a876d666e88ef1a1 Binary files /dev/null and b/vendor/hyperview/__pycache__/__init__.cpython-312.pyc differ diff --git a/vendor/hyperview/__pycache__/_version.cpython-312.pyc b/vendor/hyperview/__pycache__/_version.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76f9a3a22cd6d11ee7812bc1e9d367e3e5ebe05a Binary files /dev/null and b/vendor/hyperview/__pycache__/_version.cpython-312.pyc differ diff --git a/vendor/hyperview/__pycache__/api.cpython-312.pyc b/vendor/hyperview/__pycache__/api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..241e79382b8030f3ac608d6989c33752e1b6bb15 Binary files /dev/null and b/vendor/hyperview/__pycache__/api.cpython-312.pyc differ diff --git a/vendor/hyperview/_version.py b/vendor/hyperview/_version.py new file mode 100644 index 0000000000000000000000000000000000000000..23910fbfccf418f5ef18d8c1bc3ce45b48b7e462 --- /dev/null +++ b/vendor/hyperview/_version.py @@ -0,0 +1,34 @@ +# file generated by setuptools-scm +# don't change, don't track in version control + +__all__ = [ + "__version__", + "__version_tuple__", + "version", + "version_tuple", + "__commit_id__", + "commit_id", +] + +TYPE_CHECKING = False +if TYPE_CHECKING: + from typing import Tuple + from typing import Union + + VERSION_TUPLE = Tuple[Union[int, str], ...] + COMMIT_ID = Union[str, None] +else: + VERSION_TUPLE = object + COMMIT_ID = object + +version: str +__version__: str +__version_tuple__: VERSION_TUPLE +version_tuple: VERSION_TUPLE +commit_id: COMMIT_ID +__commit_id__: COMMIT_ID + +__version__ = version = '0.2.1.dev2+g55532b8e3.d20260307' +__version_tuple__ = version_tuple = (0, 2, 1, 'dev2', 'g55532b8e3.d20260307') + +__commit_id__ = commit_id = None diff --git a/vendor/hyperview/api.py b/vendor/hyperview/api.py new file mode 100644 index 0000000000000000000000000000000000000000..a970b829002b4baaf3c3cba7a93e55a4db765702 --- /dev/null +++ b/vendor/hyperview/api.py @@ -0,0 +1,408 @@ +"""Public API for HyperView.""" + +import json +import os +import socket +import threading +import time +import webbrowser +from dataclasses import dataclass +from urllib.error import URLError +from urllib.request import Request, urlopen +from uuid import uuid4 + +import uvicorn + +from hyperview.core.dataset import Dataset +from hyperview.server.app import create_app, set_dataset + +__all__ = ["Dataset", "launch", "Session"] + + +@dataclass(frozen=True) +class _HealthResponse: + name: str | None + session_id: str | None + dataset: str | None + pid: int | None + + +def _can_connect(host: str, port: int, timeout_s: float) -> bool: + try: + with socket.create_connection((host, port), timeout=timeout_s): + return True + except OSError: + return False + + +def _try_read_health(url: str, timeout_s: float) -> _HealthResponse | None: + try: + return _read_health(url, timeout_s=timeout_s) + except (URLError, TimeoutError, OSError, ValueError, json.JSONDecodeError): + return None + + +def _read_health(url: str, timeout_s: float) -> _HealthResponse: + request = Request(url, headers={"Accept": "application/json"}) + with urlopen(request, timeout=timeout_s) as response: + data = json.loads(response.read().decode("utf-8")) + + return _HealthResponse( + name=data.get("name"), + session_id=data.get("session_id"), + dataset=data.get("dataset"), + pid=data.get("pid") if isinstance(data.get("pid"), int) else None, + ) + + +def _resolve_default_launch_layout(dataset: Dataset) -> str: + spaces = dataset.list_spaces() + + if any(space.geometry not in ("hyperboloid", "hypersphere") for space in spaces): + return "euclidean:2d" + if any(space.geometry == "hypersphere" for space in spaces): + return "spherical:3d" + return "poincare:2d" + + +class Session: + """A session for the HyperView visualizer.""" + + def __init__(self, dataset: Dataset, host: str, port: int): + self.dataset = dataset + self.host = host + self.port = port + # Prefer a browser-connectable host for user-facing URLs. + # When binding to 0.0.0.0, users should connect via 127.0.0.1 locally. + self.url = f"http://{self._connect_host}:{port}" + self._server_thread: threading.Thread | None = None + self._server: uvicorn.Server | None = None + self._startup_error: BaseException | None = None + self.session_id = uuid4().hex + + @property + def _connect_host(self) -> str: + return "127.0.0.1" if self.host == "0.0.0.0" else self.host + + @property + def _health_url(self) -> str: + return f"http://{self._connect_host}:{self.port}/__hyperview__/health" + + def _run_server(self): + try: + set_dataset(self.dataset) + app = create_app(self.dataset, session_id=self.session_id) + config = uvicorn.Config(app, host=self.host, port=self.port, log_level="warning") + self._server = uvicorn.Server(config) + self._server.run() + except BaseException as exc: + self._startup_error = exc + + def start(self, background: bool = True): + """Start the visualizer server.""" + if not background: + self._run_server() + return + + # Fail fast if something is already listening on this port. + if _can_connect(self._connect_host, self.port, timeout_s=0.2): + health = _try_read_health(self._health_url, timeout_s=0.2) + if health is not None and health.name == "hyperview": + raise RuntimeError( + "HyperView failed to start because the port is already serving " + f"HyperView (port={self.port}, session_id={health.session_id}). " + "Choose a different port or stop the existing server." + ) + + raise RuntimeError( + "HyperView failed to start because the port is already in use " + f"by a non-HyperView service (port={self.port}). Choose a different " + "port or stop the process listening on that port." + ) + + self._startup_error = None + self._server_thread = threading.Thread(target=self._run_server, daemon=True) + self._server_thread.start() + + deadline = time.time() + 5.0 + last_health_error: Exception | None = None + + while time.time() < deadline: + if self._startup_error is not None: + raise RuntimeError( + f"HyperView server failed to start (port={self.port}): " + f"{type(self._startup_error).__name__}: {self._startup_error}" + ) + + if self._server_thread is not None and not self._server_thread.is_alive(): + raise RuntimeError( + "HyperView server thread exited during startup. " + f"The port may be in use (port={self.port})." + ) + + try: + health = _read_health(self._health_url, timeout_s=0.2) + except (URLError, TimeoutError, OSError, ValueError, json.JSONDecodeError) as exc: + last_health_error = exc + time.sleep(0.05) + continue + + if health.name == "hyperview" and health.session_id == self.session_id: + return + + if health.name == "hyperview": + raise RuntimeError( + "HyperView failed to start because the port is already serving " + f"a different HyperView session (port={self.port}, " + f"session_id={health.session_id})." + ) + + raise RuntimeError( + "HyperView failed to start because the port is already serving " + f"a non-HyperView app (port={self.port})." + ) + + raise TimeoutError( + "HyperView server did not become ready in time " + f"(port={self.port}). Last error: {last_health_error}" + ) + + def stop(self): + """Stop the visualizer server.""" + if self._server: + self._server.should_exit = True + + def show(self, height: int = 800): + """Display the visualizer in a notebook. + + In Google Colab, notebook kernels cannot be accessed via localhost. + Colab exposes kernel ports through a proxy URL (see + `google.colab.kernel.proxyPort`). This renders a link to the proxied URL + that opens in a new tab. + + In other notebook environments, it renders a clickable link to the local + URL and a best-effort JavaScript auto-open. + """ + if _is_colab(): + try: + from google.colab.output import eval_js # type: ignore[import-not-found] + from IPython.display import HTML, display + + proxy_url = eval_js(f"google.colab.kernel.proxyPort({self.port})") + app_url = str(proxy_url).rstrip("/") + "/" + + display( + HTML( + "
HyperView is running in Colab. " + f"" + "Open HyperView in a new tab.
" + ) + ) + display(HTML(f"{app_url}
")) + return + except Exception: + # Fall through to the generic notebook behavior. + pass + + # Default: open in a new browser tab (works well for Jupyter). + try: + from IPython.display import HTML, Javascript, display + + display( + HTML( + "HyperView is running. " + f"Open in a new tab." + "
" + ) + ) + + # Best-effort auto-open. Some browsers may block popups. + display(Javascript(f'window.open("{self.url}", "_blank");')) + except ImportError: + print(f"IPython not installed. Please visit {self.url} in your browser.") + + def open_browser(self): + """Open the visualizer in a browser window.""" + webbrowser.open(self.url) + + +def launch( + dataset: Dataset, + port: int = 6262, + host: str = "127.0.0.1", + open_browser: bool = True, + notebook: bool | None = None, + height: int = 800, + reuse_server: bool = False, +) -> Session: + """Launch the HyperView visualization server. + + Note: + HyperView needs at least one visualization to display. If no layouts + exist yet but embedding spaces do, this function computes one default + layout automatically. + + Args: + dataset: The dataset to visualize. + port: Port to run the server on. + host: Host to bind to. + open_browser: Whether to open a browser window. + notebook: Whether to display in a notebook. If None, auto-detects. + height: Height of the iframe in the notebook. + reuse_server: If True, and the requested port is already serving HyperView, + attach to the existing server instead of starting a new one. For safety, + this will only attach when the existing server reports the same dataset + name (via `/__hyperview__/health`). + + Returns: + A Session object. + + Example: + >>> import hyperview as hv + >>> dataset = hv.Dataset("my_dataset") + >>> dataset.add_images_dir("/path/to/images", label_from_folder=True) + >>> dataset.compute_embeddings(model="openai/clip-vit-base-patch32") + >>> dataset.compute_visualization() + >>> hv.launch(dataset) + """ + if notebook is None: + # Colab is always a notebook environment, even if _is_notebook() fails to detect it + notebook = _is_notebook() or _is_colab() + + if _is_colab() and host == "127.0.0.1": + # Colab port forwarding/proxying is most reliable when the server binds + # to all interfaces. + host = "0.0.0.0" + + # Preflight: avoid doing expensive work if the port is already in use. + # If it's already serving HyperView and reuse_server=True, we can safely attach. + connect_host = "127.0.0.1" if host == "0.0.0.0" else host + health_url = f"http://{connect_host}:{port}/__hyperview__/health" + + if _can_connect(connect_host, port, timeout_s=0.2): + health = _try_read_health(health_url, timeout_s=0.2) + if health is not None and health.name == "hyperview": + if not reuse_server: + raise RuntimeError( + "HyperView failed to start because the port is already serving " + f"HyperView (port={port}, dataset={health.dataset}, " + f"session_id={health.session_id}, pid={health.pid}). " + "Choose a different port, stop the existing server, or pass " + "reuse_server=True to attach." + ) + + if health.dataset is not None and health.dataset != dataset.name: + raise RuntimeError( + "HyperView refused to attach to the existing server because it is " + f"serving a different dataset (port={port}, dataset={health.dataset}). " + f"Requested dataset={dataset.name}. Stop the existing server or " + "choose a different port." + ) + + session = Session(dataset, host, port) + if health.session_id is not None: + session.session_id = health.session_id + + if notebook: + if _is_colab(): + print( + f"\nHyperView is already running (Colab, port={session.port}). " + "Use the link below to open it." + ) + else: + print( + f"\nHyperView is already running at {session.url} (port={session.port}). " + "Opening a new tab..." + ) + session.show(height=height) + else: + print(f"\nHyperView is already running at {session.url} (port={session.port}).") + if open_browser: + session.open_browser() + + return session + + raise RuntimeError( + "HyperView failed to start because the port is already in use " + f"by a non-HyperView service (port={port}). Choose a different " + "port or stop the process listening on that port." + ) + + layouts = dataset.list_layouts() + spaces = dataset.list_spaces() + + if not layouts and not spaces: + raise ValueError( + "HyperView launch requires at least one visualization or embedding space. " + "No visualizations or embedding spaces were found. " + "Call `dataset.compute_embeddings()` and `dataset.compute_visualization()` " + "or `dataset.set_coords()` before `hv.launch()`." + ) + + if not layouts: + default_layout = _resolve_default_launch_layout(dataset) + + print(f"No visualizations found. Computing {default_layout} visualization...") + # Let compute_visualization pick the most appropriate default space. + dataset.compute_visualization( + space_key=None, + layout=default_layout, + ) + + session = Session(dataset, host, port) + + if notebook: + session.start(background=True) + if _is_colab(): + print( + f"\nHyperView is running (Colab, port={session.port}). " + "Use the link below to open it." + ) + else: + print(f"\nHyperView is running at {session.url}. Opening a new tab...") + session.show(height=height) + else: + session.start(background=True) + print(" Press Ctrl+C to stop.\n") + print(f"\nHyperView is running at {session.url}") + + if open_browser: + session.open_browser() + + try: + while True: + # Keep the main thread alive so the daemon server thread can run. + time.sleep(0.25) + if session._server_thread is not None and not session._server_thread.is_alive(): + raise RuntimeError("HyperView server stopped unexpectedly.") + except KeyboardInterrupt: + pass + finally: + session.stop() + if session._server_thread is not None: + session._server_thread.join(timeout=2.0) + + return session + + +def _is_notebook() -> bool: + """Check if running in a notebook environment.""" + try: + from IPython import get_ipython + except ImportError: + return False + + shell = get_ipython() + return shell is not None and shell.__class__.__name__ == "ZMQInteractiveShell" + + +def _is_colab() -> bool: + """Check if running inside a Google Colab notebook runtime.""" + if os.environ.get("COLAB_RELEASE_TAG"): + return True + try: + import google.colab # type: ignore[import-not-found] + + return True + except ImportError: + return False diff --git a/vendor/hyperview/cli.py b/vendor/hyperview/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..d489c40f1672596c4db54ed76789a5ddaa661577 --- /dev/null +++ b/vendor/hyperview/cli.py @@ -0,0 +1,362 @@ +"""Command-line interface for HyperView.""" + +from __future__ import annotations + +import argparse + +from hyperview import Dataset, launch +from hyperview.core.dataset import parse_visualization_layout + + +def _build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="hyperview", + description="HyperView - Dataset visualization with hyperbolic embeddings", + ) + + parser.add_argument( + "--dataset", + type=str, + default=None, + help=( + "Dataset name in persistent storage. Required unless " + "--dataset-json is provided." + ), + ) + parser.add_argument( + "--dataset-json", + type=str, + help="Path to exported dataset JSON file (loads samples into memory)", + ) + parser.add_argument( + "--hf-dataset", + type=str, + help="HuggingFace dataset ID to ingest before launch (e.g. uoft-cs/cifar10)", + ) + parser.add_argument( + "--split", + type=str, + default=None, + help="HuggingFace split to use (required with --hf-dataset)", + ) + parser.add_argument( + "--hf-config", + type=str, + default=None, + help="Optional HuggingFace subset/configuration to use", + ) + parser.add_argument( + "--image-key", + type=str, + default=None, + help="Image column key for HuggingFace ingestion (required with --hf-dataset)", + ) + parser.add_argument( + "--label-key", + type=str, + default=None, + help="Label column key for HuggingFace ingestion (optional)", + ) + parser.add_argument( + "--label-names-key", + type=str, + default=None, + help="Optional dataset info key containing label names", + ) + parser.add_argument( + "--images-dir", + type=str, + help="Local directory of images to ingest before launch", + ) + parser.add_argument( + "--label-from-folder", + action="store_true", + help="When using --images-dir, derive label from parent folder name", + ) + parser.add_argument( + "--samples", + type=int, + default=None, + help="Maximum number of ingested samples (omit to load all)", + ) + parser.add_argument( + "--hf-streaming", + action="store_true", + help=( + "Stream HuggingFace rows instead of materializing the full split first. " + "Useful for loading subsets without eager full-split downloads." + ), + ) + parser.add_argument( + "--shuffle", + action="store_true", + help="Shuffle HuggingFace dataset before sampling", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Random seed used when --shuffle is enabled (default: 42)", + ) + parser.add_argument( + "--hf-shuffle-buffer-size", + type=int, + default=1000, + help=( + "Shuffle buffer size used with --hf-streaming and --shuffle. " + "Streaming shuffle is approximate and trades larger buffers for more read-ahead." + ), + ) + + parser.add_argument( + "--model", + type=str, + default=None, + help=( + "Embedding model to compute before launch (e.g. openai/clip-vit-base-patch32). " + "If omitted, existing embedding spaces are reused." + ), + ) + parser.add_argument( + "--method", + choices=["umap"], + default="umap", + help="Projection method (currently only 'umap')", + ) + parser.add_argument( + "--layout", + action="append", + dest="layouts", + metavar="GEOMETRY[:2d|3d]", + help=( + "Visualization layout to compute. Repeat this flag to request multiple layouts, " + "for example '--layout euclidean --layout spherical'. " + "Omitting the suffix defaults to 2D for euclidean/poincare and 3D for spherical. " + "If omitted, HyperView picks one sensible default layout for the selected embedding space." + ), + ) + parser.add_argument( + "--n-neighbors", + type=int, + default=15, + help="UMAP n_neighbors (default: 15)", + ) + parser.add_argument( + "--min-dist", + type=float, + default=0.1, + help="UMAP min_dist (default: 0.1)", + ) + parser.add_argument( + "--metric", + type=str, + default="cosine", + help="UMAP metric (default: cosine)", + ) + parser.add_argument( + "--force-layout", + action="store_true", + help="Force layout recomputation even if projection already exists", + ) + + parser.add_argument( + "--port", + type=int, + default=6262, + help="Port to run the server on (default: 6262)", + ) + parser.add_argument( + "--host", + type=str, + default="127.0.0.1", + help="Host to bind the server to (default: 127.0.0.1)", + ) + parser.add_argument( + "--no-browser", + action="store_true", + help="Do not open a browser window automatically", + ) + parser.add_argument( + "--reuse-server", + action="store_true", + help=( + "If the port is already serving HyperView, attach instead of failing. " + "For safety, this only attaches when the existing server reports the same dataset name." + ), + ) + + return parser + + +def _validate_args(parser: argparse.ArgumentParser, args: argparse.Namespace) -> None: + if args.layouts: + canonical_layouts: list[str] = [] + seen_layouts: set[str] = set() + for layout_spec in args.layouts: + try: + geometry, layout_dimension = parse_visualization_layout(layout_spec) + except ValueError as exc: + parser.error(str(exc)) + + canonical_layout = f"{geometry}:{layout_dimension}d" + if canonical_layout in seen_layouts: + continue + seen_layouts.add(canonical_layout) + canonical_layouts.append(canonical_layout) + + args.layouts = canonical_layouts + + if args.hf_dataset and args.images_dir: + parser.error("Use either --hf-dataset or --images-dir, not both.") + + if args.dataset_json and (args.hf_dataset or args.images_dir): + parser.error("--dataset-json cannot be combined with --hf-dataset or --images-dir.") + + if args.dataset_json and args.dataset: + parser.error("Use either --dataset or --dataset-json, not both.") + + if not args.dataset and not args.dataset_json: + parser.error( + "Provide --dataset (persistent dataset) or --dataset-json (exported dataset file)." + ) + + if args.hf_dataset: + if not args.split: + parser.error("--split is required when using --hf-dataset.") + if not args.image_key: + parser.error("--image-key is required when using --hf-dataset.") + if args.hf_shuffle_buffer_size < 1: + parser.error("--hf-shuffle-buffer-size must be at least 1.") + + +def _print_ingestion_result(added: int, skipped: int) -> None: + if skipped > 0: + print(f"Loaded {added} samples ({skipped} already present)") + else: + print(f"Loaded {added} samples") + + +def _ingest_huggingface(dataset: Dataset, args: argparse.Namespace, dataset_name: str) -> None: + config_suffix = f" [{args.hf_config}]" if args.hf_config else "" + print(f"Loading HuggingFace dataset {dataset_name}{config_suffix}...") + added, skipped = dataset.add_from_huggingface( + dataset_name, + config=args.hf_config, + split=args.split, + image_key=args.image_key, + label_key=args.label_key, + label_names_key=args.label_names_key, + max_samples=args.samples, + shuffle=args.shuffle, + seed=args.seed, + streaming=args.hf_streaming, + shuffle_buffer_size=args.hf_shuffle_buffer_size, + ) + _print_ingestion_result(added, skipped) + + +def _prepare_dataset(args: argparse.Namespace) -> Dataset: + if args.dataset_json: + print(f"Loading dataset from {args.dataset_json}...") + dataset = Dataset.load(args.dataset_json) + print(f"Loaded {len(dataset)} samples") + return dataset + + dataset = Dataset(args.dataset) + print(f"Using dataset '{dataset.name}' ({len(dataset)} samples)") + + if args.hf_dataset: + _ingest_huggingface(dataset, args, args.hf_dataset) + elif args.images_dir: + print(f"Loading images from {args.images_dir}...") + added, skipped = dataset.add_images_dir( + args.images_dir, + label_from_folder=args.label_from_folder, + ) + _print_ingestion_result(added, skipped) + + return dataset + + +def _resolve_default_layouts( + dataset: Dataset, + space_key: str | None, +) -> list[str]: + spaces = dataset.list_spaces() + selected = next((space for space in spaces if space.space_key == space_key), None) + + if selected is not None: + if selected.geometry == "hyperboloid": + return ["poincare:2d"] + if selected.geometry == "hypersphere": + return ["spherical:3d"] + return ["euclidean:2d"] + + if any(space.geometry not in ("hyperboloid", "hypersphere") for space in spaces): + return ["euclidean:2d"] + if any(space.geometry == "hypersphere" for space in spaces): + return ["spherical:3d"] + return ["poincare:2d"] + +def _compute_layouts(dataset: Dataset, args: argparse.Namespace, space_key: str | None) -> None: + target_layouts = args.layouts or _resolve_default_layouts(dataset, space_key) + + print("Computing visualizations...") + for target_layout in target_layouts: + dataset.compute_visualization( + space_key=space_key, + method=args.method, + layout=target_layout, + n_neighbors=args.n_neighbors, + min_dist=args.min_dist, + metric=args.metric, + force=args.force_layout, + ) + print("Visualizations ready") + + +def _prepare_embeddings_and_layouts(dataset: Dataset, args: argparse.Namespace) -> None: + has_spaces = len(dataset.list_spaces()) > 0 + + if args.model is not None: + print(f"Computing embeddings with {args.model}...") + space_key = dataset.compute_embeddings(model=args.model, show_progress=True) + print("Embeddings computed") + _compute_layouts(dataset, args, space_key) + return + + if args.force_layout: + if not has_spaces: + raise ValueError( + "No embedding spaces found. Provide --model to compute embeddings first." + ) + _compute_layouts(dataset, args, space_key=None) + return + + if not has_spaces: + raise ValueError( + "No embedding spaces found. Provide --model to compute embeddings first." + ) + + +def main(): + """Main CLI entry point.""" + parser = _build_parser() + args = parser.parse_args() + + _validate_args(parser, args) + + dataset = _prepare_dataset(args) + _prepare_embeddings_and_layouts(dataset, args) + + launch( + dataset, + port=args.port, + host=args.host, + open_browser=not args.no_browser, + reuse_server=args.reuse_server, + ) + + +if __name__ == "__main__": + main() diff --git a/vendor/hyperview/core/__init__.py b/vendor/hyperview/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2f09c78cc3b71fb293208678553e3774c9bf0032 --- /dev/null +++ b/vendor/hyperview/core/__init__.py @@ -0,0 +1,6 @@ +"""Core data structures for HyperView.""" + +from hyperview.core.dataset import Dataset +from hyperview.core.sample import Sample + +__all__ = ["Dataset", "Sample"] diff --git a/vendor/hyperview/core/__pycache__/__init__.cpython-312.pyc b/vendor/hyperview/core/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54150c85597f0efe71873b183b12d583b5e469a0 Binary files /dev/null and b/vendor/hyperview/core/__pycache__/__init__.cpython-312.pyc differ diff --git a/vendor/hyperview/core/__pycache__/dataset.cpython-312.pyc b/vendor/hyperview/core/__pycache__/dataset.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ecbb7f20aeaead605d47027520ce5280005b13dc Binary files /dev/null and b/vendor/hyperview/core/__pycache__/dataset.cpython-312.pyc differ diff --git a/vendor/hyperview/core/__pycache__/sample.cpython-312.pyc b/vendor/hyperview/core/__pycache__/sample.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7129a13333729f8082dda69f984b2e95bb22da22 Binary files /dev/null and b/vendor/hyperview/core/__pycache__/sample.cpython-312.pyc differ diff --git a/vendor/hyperview/core/__pycache__/selection.cpython-312.pyc b/vendor/hyperview/core/__pycache__/selection.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0f5ab23663d4d6c189626fb210b98f10bc9b35b Binary files /dev/null and b/vendor/hyperview/core/__pycache__/selection.cpython-312.pyc differ diff --git a/vendor/hyperview/core/dataset.py b/vendor/hyperview/core/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..d6e4fcb1268346a62dd2e4159503fe7cd029aeba --- /dev/null +++ b/vendor/hyperview/core/dataset.py @@ -0,0 +1,870 @@ +"""Dataset class for managing collections of samples.""" + +from __future__ import annotations + +import hashlib +import json +import math +import threading +import time +import uuid +from collections.abc import Callable, Iterator +from pathlib import Path +from typing import Any, cast + +import numpy as np +from datasets import DownloadConfig, load_dataset +from PIL import Image + +from hyperview.core.sample import Sample +from hyperview.storage.backend import StorageBackend +from hyperview.storage.schema import ( + make_layout_key, + normalize_layout_dimension, + parse_layout_dimension, +) + + +DEFAULT_VISUALIZATION_LAYOUT = "euclidean" +VALID_VISUALIZATION_GEOMETRIES = ("euclidean", "poincare", "spherical") + + +def _format_elapsed(seconds: float) -> str: + if seconds < 60: + return f"{seconds:.1f}s" + total_seconds = int(round(seconds)) + minutes, secs = divmod(total_seconds, 60) + if minutes < 60: + return f"{minutes}m {secs:02d}s" + hours, minutes = divmod(minutes, 60) + return f"{hours}h {minutes:02d}m {secs:02d}s" + + +def _format_eta(seconds: float) -> str: + if not math.isfinite(seconds) or seconds < 0: + return "unknown" + return _format_elapsed(seconds) + + +def _fallback_huggingface_fingerprint( + dataset_name: str, + config_name: str, + split: str, + version: str | None, +) -> str: + identity = f"{dataset_name}:{config_name}:{split}:{version or 'unknown'}" + return hashlib.md5(identity.encode()).hexdigest() + + +def parse_visualization_layout(layout: str) -> tuple[str, int]: + """Parse a public visualization layout spec like ``euclidean:3d``. + + Omitting the suffix defaults to 2D for Euclidean and Poincare layouts, + and to 3D for spherical layouts. + """ + layout_spec = layout.strip().lower() + if not layout_spec: + raise ValueError("layout must be a non-empty string") + + if ":" in layout_spec: + geometry, dimension_spec = layout_spec.rsplit(":", 1) + else: + geometry = layout_spec + dimension_spec = "3d" if geometry.strip() == "spherical" else "2d" + + geometry = geometry.strip() + dimension_spec = dimension_spec.strip() + + if geometry not in VALID_VISUALIZATION_GEOMETRIES: + raise ValueError( + "layout geometry must be one of " + f"{VALID_VISUALIZATION_GEOMETRIES}, got '{geometry}'" + ) + + if dimension_spec not in ("2d", "3d"): + raise ValueError( + "layout must use the form '