File size: 4,456 Bytes
23680f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python
"""Pre-compute HuggingFace Spaces demo dataset.

This script is run during Docker build to pre-compute embeddings and
visualizations, ensuring fast startup times on HuggingFace Spaces.

Features:
- CIFAR-10 dataset (300 samples)
- CLIP embeddings (Euclidean) via embed-anything
- HyCoCLIP embeddings (Hyperbolic) via hyper-models ONNX
- Pre-computed UMAP visualizations for both geometries
"""

import os
from pathlib import Path

# Configuration
DATASET_NAME = "cifar10_hf_demo"  # name of the persistent demo dataset created below
HF_DATASET = "uoft-cs/cifar10"  # HuggingFace Hub dataset repo to pull samples from
HF_SPLIT = "train"
HF_IMAGE_KEY = "img"  # presumably the image column of the HF dataset — verify against the repo schema
HF_LABEL_KEY = "label"  # presumably the class-label column — verify against the repo schema
# Sample count is overridable at Docker-build time via the DEMO_SAMPLES env var.
NUM_SAMPLES = int(os.environ.get("DEMO_SAMPLES", 300))

# Storage paths (LanceDB + media)
# NOTE(review): these paths are printed in create_demo_dataset() but not passed to
# hv.Dataset() here — presumably hyperview reads the same env vars internally; confirm.
DEFAULT_DATASETS_DIR = Path(
    os.environ.get("HYPERVIEW_DATASETS_DIR", "/home/user/app/demo_data/datasets")
)
DEFAULT_MEDIA_DIR = Path(os.environ.get("HYPERVIEW_MEDIA_DIR", "/home/user/app/demo_data/media"))

# Model configurations
CLIP_MODEL_ID = "openai/clip-vit-base-patch32"  # Euclidean CLIP backbone used in step 2


def create_demo_dataset():
    """Build and return the demo dataset with pre-computed embeddings.

    Pipeline (printed as steps 1-5): load CIFAR-10 samples, compute CLIP
    (Euclidean) embeddings, compute a UMAP layout, compute HyCoCLIP
    (hyperbolic) embeddings via ONNX, and compute a Poincaré layout.  If the
    HyCoCLIP step fails for any reason, the Poincaré layout is derived from
    the CLIP embeddings instead (best-effort fallback).
    """
    # Imported lazily so the module can be inspected without heavy deps.
    import hyperview as hv
    from hyperview.embeddings.providers import ModelSpec

    banner = "=" * 60
    print(banner)
    print("HyperView HuggingFace Spaces Demo Dataset Builder")
    print(banner)
    print(f"Dataset: {HF_DATASET} ({HF_SPLIT})")
    print(f"Samples: {NUM_SAMPLES}")
    print(f"Datasets dir: {DEFAULT_DATASETS_DIR}")
    print(f"Media dir: {DEFAULT_MEDIA_DIR}")
    print(banner)

    # Persistent LanceDB-backed dataset.
    demo = hv.Dataset(DATASET_NAME)

    # Step 1: ingest CIFAR-10 samples from the Hub.
    print(f"\n[1/5] Loading {NUM_SAMPLES} samples from {HF_DATASET}...")
    added, skipped = demo.add_from_huggingface(
        HF_DATASET,
        split=HF_SPLIT,
        image_key=HF_IMAGE_KEY,
        label_key=HF_LABEL_KEY,
        max_samples=NUM_SAMPLES,
    )
    print(f"      Loaded {added} samples ({skipped} skipped)")

    # Step 2: CLIP embeddings (Euclidean geometry).
    print(f"\n[2/5] Computing CLIP embeddings ({CLIP_MODEL_ID})...")
    clip_space = demo.compute_embeddings(CLIP_MODEL_ID, show_progress=True)
    print(f"      Embedding space: {clip_space}")

    # Step 3: Euclidean UMAP layout over the CLIP space.
    print("\n[3/5] Computing Euclidean (UMAP) visualization...")
    demo.compute_visualization(space_key=clip_space, geometry="euclidean")
    print("      Euclidean layout ready")

    # Steps 4-5: hyperbolic embeddings + Poincaré layout, with fallback.
    print("\n[4/5] Computing HyCoCLIP embeddings (hyperbolic)...")
    try:
        # Fetch the ONNX graph and its external weight file from the Hub.
        from huggingface_hub import hf_hub_download

        hub_repo = "mnm-matin/hyperbolic-clip"
        onnx_path, data_path = (
            hf_hub_download(repo_id=hub_repo, filename=name)
            for name in ("hycoclip-vit-s/model.onnx", "hycoclip-vit-s/model.onnx.data")
        )
        print(f"      Downloaded ONNX model to: {onnx_path}")
        print(f"      Downloaded ONNX weights to: {data_path}")

        # Point the ONNX provider at the downloaded checkpoint.
        onnx_spec = ModelSpec(
            provider="hycoclip_onnx",
            model_id="hycoclip-vit-s",
            checkpoint=onnx_path,
        )
        hyper_space = demo.compute_embeddings(onnx_spec, show_progress=True)
        print(f"      Embedding space: {hyper_space}")

        print("\n[5/5] Computing Poincaré disk visualization...")
        demo.compute_visualization(space_key=hyper_space, geometry="poincare")
        print("      Poincaré layout ready")

    except Exception as e:
        # Deliberate best-effort: a missing/broken HyCoCLIP model must not
        # abort the Docker build, so fall back to projecting CLIP embeddings.
        print(f"      WARNING: HyCoCLIP failed: {e}")
        print("      Falling back to CLIP-only with Poincaré projection")
        print("\n[5/5] Computing Poincaré visualization from CLIP embeddings...")
        demo.compute_visualization(space_key=clip_space, geometry="poincare")
        print("      Poincaré layout ready (from CLIP)")

    return demo


def main() -> None:
    """Run the pre-computation pipeline and print a final summary banner."""
    demo = create_demo_dataset()

    banner = "=" * 60
    # Summary
    print("\n" + banner)
    print("Pre-computation complete!")
    print(banner)
    print(f"Samples: {len(demo)}")
    print(f"Dataset: {DATASET_NAME}")
    print(banner)


# Entry point when executed directly (e.g. during the Docker image build).
if __name__ == "__main__":
    main()