Spaces:

hyper3labs
/

HyperView

Running

File size: 8,592 Bytes

23680f2

"""LanceDB schema definitions for HyperView.

Storage architecture:
- samples: Core sample metadata (no embeddings)
- metadata: Key-value pairs for dataset config
- spaces: Registry of embedding spaces
- embeddings__<space_key>: One table per embedding space (id + vector)
- layouts__<layout_key>: One table per layout (id + x + y)
"""

import json
import re
from dataclasses import dataclass
from typing import Any

import pyarrow as pa

from hyperview.core.sample import Sample


def create_sample_schema() -> pa.Schema:
    """Build the PyArrow schema describing one row of the samples table.

    A sample row carries metadata only; embeddings and layouts live in
    their own tables.

    Returns:
        A schema with required string columns ``id`` and ``filepath`` and
        nullable string columns ``label``, ``metadata_json`` and
        ``thumbnail_base64``.
    """
    # (column name, nullable) pairs; every column is a UTF-8 string.
    columns = (
        ("id", False),
        ("filepath", False),
        ("label", True),
        ("metadata_json", True),
        ("thumbnail_base64", True),
    )
    fields = [
        pa.field(name, pa.utf8(), nullable=nullable) for name, nullable in columns
    ]
    return pa.schema(fields)


def create_metadata_schema() -> pa.Schema:
    """Build the PyArrow schema for the dataset metadata key-value store.

    Returns:
        A two-column schema: a required string ``key`` and a nullable
        string ``value``.
    """
    key_field = pa.field("key", pa.utf8(), nullable=False)
    value_field = pa.field("value", pa.utf8(), nullable=True)
    return pa.schema([key_field, value_field])


def create_spaces_schema() -> pa.Schema:
    """Build the PyArrow schema for the embedding-space registry.

    One row describes one embedding space (one per model).

    Returns:
        A schema whose columns are all required except the nullable
        ``config_json`` string column.
    """
    # (column name, arrow type, nullable), in table column order.
    columns = (
        ("space_key", pa.utf8(), False),
        ("model_id", pa.utf8(), False),
        ("dim", pa.int32(), False),
        ("count", pa.int64(), False),
        ("created_at", pa.int64(), False),
        ("updated_at", pa.int64(), False),
        ("config_json", pa.utf8(), True),
    )
    return pa.schema(
        [pa.field(name, dtype, nullable=nullable) for name, dtype, nullable in columns]
    )


def create_embeddings_schema(dim: int) -> pa.Schema:
    """Build the PyArrow schema for a single embeddings table.

    Args:
        dim: Vector dimension of the embedding space; passed to
            ``pa.list_`` as the list size of the ``vector`` column.

    Returns:
        A schema with a required string ``id`` column and a required
        ``vector`` column holding float32 lists.
    """
    id_field = pa.field("id", pa.utf8(), nullable=False)
    vector_type = pa.list_(pa.float32(), dim)
    vector_field = pa.field("vector", vector_type, nullable=False)
    return pa.schema([id_field, vector_field])


def create_layouts_schema() -> pa.Schema:
    """Build the PyArrow schema for a single layout table.

    A layout row holds the 2D coordinates used for visualization.

    Returns:
        A schema with a required string ``id`` column followed by required
        float32 ``x`` and ``y`` columns.
    """
    fields = [pa.field("id", pa.utf8(), nullable=False)]
    # Both coordinate columns share the same type and nullability.
    fields.extend(
        pa.field(axis, pa.float32(), nullable=False) for axis in ("x", "y")
    )
    return pa.schema(fields)


@dataclass
class SpaceInfo:
    """Metadata for an embedding space."""

    space_key: str
    model_id: str
    dim: int
    count: int
    created_at: int
    updated_at: int
    config: dict[str, Any] | None = None

    @property
    def provider(self) -> str:
        return (self.config or {}).get("provider", "unknown")

    @property
    def geometry(self) -> str:
        return (self.config or {}).get("geometry", "euclidean")

    def to_dict(self) -> dict[str, Any]:
        return {
            "space_key": self.space_key,
            "model_id": self.model_id,
            "dim": self.dim,
            "count": self.count,
            "created_at": self.created_at,
            "updated_at": self.updated_at,
            "config_json": json.dumps(self.config) if self.config else None,
        }

    def to_api_dict(self) -> dict[str, Any]:
        return {
            "space_key": self.space_key,
            "model_id": self.model_id,
            "dim": self.dim,
            "count": self.count,
            "provider": self.provider,
            "geometry": self.geometry,
            "config": self.config,
        }

    @classmethod
    def from_dict(cls, row: dict[str, Any]) -> "SpaceInfo":
        config_json = row.get("config_json")
        config = json.loads(config_json) if config_json else None
        return cls(
            space_key=row["space_key"],
            model_id=row["model_id"],
            dim=row["dim"],
            count=row["count"],
            created_at=row["created_at"],
            updated_at=row["updated_at"],
            config=config,
        )


def create_layouts_registry_schema() -> pa.Schema:
    """Build the PyArrow schema for the layouts registry.

    One row describes one layout (a 2D projection of an embedding space).

    Returns:
        A schema whose columns are all required except the nullable
        ``params_json`` string column.
    """
    # The first four columns are required strings.
    fields = [
        pa.field(name, pa.utf8(), nullable=False)
        for name in ("layout_key", "space_key", "method", "geometry")
    ]
    # Followed by two required 64-bit integer columns.
    fields += [
        pa.field(name, pa.int64(), nullable=False)
        for name in ("count", "created_at")
    ]
    fields.append(pa.field("params_json", pa.utf8(), nullable=True))
    return pa.schema(fields)


@dataclass
class LayoutInfo:
    """Metadata for a layout (2D projection)."""

    layout_key: str
    space_key: str
    method: str
    geometry: str
    count: int
    created_at: int
    params: dict[str, Any] | None = None

    def to_dict(self) -> dict[str, Any]:
        return {
            "layout_key": self.layout_key,
            "space_key": self.space_key,
            "method": self.method,
            "geometry": self.geometry,
            "count": self.count,
            "created_at": self.created_at,
            "params_json": json.dumps(self.params) if self.params else None,
        }

    def to_api_dict(self) -> dict[str, Any]:
        return {
            "layout_key": self.layout_key,
            "space_key": self.space_key,
            "method": self.method,
            "geometry": self.geometry,
            "count": self.count,
            "params": self.params,
        }

    @classmethod
    def from_dict(cls, row: dict[str, Any]) -> "LayoutInfo":
        params_json = row.get("params_json")
        params = json.loads(params_json) if params_json else None
        return cls(
            layout_key=row["layout_key"],
            space_key=row["space_key"],
            method=row["method"],
            geometry=row["geometry"],
            count=row["count"],
            created_at=row["created_at"],
            params=params,
        )


def slugify_model_id(model_id: str) -> str:
    """Turn a model ID into a string usable as a table name component.

    Slashes and every character outside ``[a-zA-Z0-9_-]`` become
    underscores, runs of underscores are collapsed to one, and leading and
    trailing underscores are removed.

    Examples:
        "openai/clip-vit-base-patch32" -> "openai_clip-vit-base-patch32"
        "sentence-transformers/all-MiniLM-L6-v2" -> "sentence-transformers_all-MiniLM-L6-v2"
    """
    without_slashes = model_id.replace("/", "_")
    # Anything that is not an ASCII letter, digit, underscore or hyphen.
    sanitized = re.sub(r"[^a-zA-Z0-9_\-]", "_", without_slashes)
    collapsed = re.sub(r"_+", "_", sanitized)
    return collapsed.strip("_")


def make_space_key(model_id: str) -> str:
    """Derive the space_key for a model.

    The key is simply the slugified model_id.
    For provider-aware keys, use `make_provider_aware_space_key` from providers.py.
    """
    space_key = slugify_model_id(model_id)
    return space_key


def make_layout_key(
    space_key: str,
    method: str = "umap",
    geometry: str = "euclidean",
    params: dict | None = None,
) -> str:
    """Generate a layout_key from space, method, geometry, and params.

    The params are hashed to ensure different parameter sets get different keys.
    """
    base = f"{space_key}__{geometry}_{method}"
    if params:
        # Create a stable hash of params
        import hashlib
        params_str = "_".join(f"{k}={v}" for k, v in sorted(params.items()))
        params_hash = hashlib.md5(params_str.encode()).hexdigest()[:8]
        return f"{base}_{params_hash}"
    return base


def sample_to_dict(sample: Sample) -> dict[str, Any]:
    """Flatten a Sample into a row dict ready for LanceDB insertion.

    The sample's metadata is JSON-encoded into ``metadata_json``; falsy
    metadata (``None`` or empty) is stored as ``None``.
    """
    row: dict[str, Any] = {
        "id": sample.id,
        "filepath": sample.filepath,
        "label": sample.label,
    }
    if sample.metadata:
        row["metadata_json"] = json.dumps(sample.metadata)
    else:
        row["metadata_json"] = None
    row["thumbnail_base64"] = sample.thumbnail_base64
    return row


def dict_to_sample(row: dict[str, Any]) -> Sample:
    """Rebuild a Sample object from a LanceDB row.

    ``id`` and ``filepath`` must be present in ``row``; ``label`` and
    ``thumbnail_base64`` default to ``None``. A missing or empty
    ``metadata_json`` yields an empty metadata dict.
    """
    raw_metadata = row.get("metadata_json")
    if raw_metadata:
        metadata = json.loads(raw_metadata)
    else:
        metadata = {}

    sample_id = row["id"]
    filepath = row["filepath"]
    return Sample(
        id=sample_id,
        filepath=filepath,
        label=row.get("label"),
        metadata=metadata,
        thumbnail_base64=row.get("thumbnail_base64"),
    )