# HyperView — scripts/precompute_hf_demo.py
# (upload residue: morozovdd, "feat: add HyperView app for space", commit 23680f2)
#!/usr/bin/env python
"""Pre-compute HuggingFace Spaces demo dataset.
This script is run during Docker build to pre-compute embeddings and
visualizations, ensuring fast startup times on HuggingFace Spaces.
Features:
- CIFAR-10 dataset (300 samples)
- CLIP embeddings (Euclidean) via embed-anything
- HyCoCLIP embeddings (Hyperbolic) via hyper-models ONNX
- Pre-computed UMAP visualizations for both geometries
"""
import os
from pathlib import Path
# Configuration
DATASET_NAME = "cifar10_hf_demo"  # name of the persistent demo dataset
HF_DATASET = "uoft-cs/cifar10"  # HuggingFace Hub dataset repo id
HF_SPLIT = "train"  # split passed to add_from_huggingface
HF_IMAGE_KEY = "img"  # image column key in the HF dataset
HF_LABEL_KEY = "label"  # label column key in the HF dataset
# Sample count is overridable at Docker build time via the DEMO_SAMPLES env var.
NUM_SAMPLES = int(os.environ.get("DEMO_SAMPLES", 300))
# Storage paths (LanceDB + media)
# NOTE(review): only printed in this script — presumably hyperview itself reads
# the same env vars to decide where to persist data; confirm against the app.
DEFAULT_DATASETS_DIR = Path(
os.environ.get("HYPERVIEW_DATASETS_DIR", "/home/user/app/demo_data/datasets")
)
DEFAULT_MEDIA_DIR = Path(os.environ.get("HYPERVIEW_MEDIA_DIR", "/home/user/app/demo_data/media"))
# Model configurations
CLIP_MODEL_ID = "openai/clip-vit-base-patch32"
def _print_banner() -> None:
    """Print the run-configuration header for the build logs."""
    print("=" * 60)
    print("HyperView HuggingFace Spaces Demo Dataset Builder")
    print("=" * 60)
    print(f"Dataset: {HF_DATASET} ({HF_SPLIT})")
    print(f"Samples: {NUM_SAMPLES}")
    print(f"Datasets dir: {DEFAULT_DATASETS_DIR}")
    print(f"Media dir: {DEFAULT_MEDIA_DIR}")
    print("=" * 60)


def _load_samples(dataset) -> None:
    """Step 1: load up to NUM_SAMPLES CIFAR-10 samples into *dataset*."""
    print(f"\n[1/5] Loading {NUM_SAMPLES} samples from {HF_DATASET}...")
    added, skipped = dataset.add_from_huggingface(
        HF_DATASET,
        split=HF_SPLIT,
        image_key=HF_IMAGE_KEY,
        label_key=HF_LABEL_KEY,
        max_samples=NUM_SAMPLES,
    )
    print(f" Loaded {added} samples ({skipped} skipped)")


def _compute_clip(dataset):
    """Steps 2-3: compute CLIP (Euclidean) embeddings and their UMAP layout.

    Returns:
        The embedding-space key, so the hyperbolic step can fall back to
        these embeddings if HyCoCLIP fails.
    """
    print(f"\n[2/5] Computing CLIP embeddings ({CLIP_MODEL_ID})...")
    clip_space = dataset.compute_embeddings(CLIP_MODEL_ID, show_progress=True)
    print(f" Embedding space: {clip_space}")
    print("\n[3/5] Computing Euclidean (UMAP) visualization...")
    dataset.compute_visualization(space_key=clip_space, geometry="euclidean")
    print(" Euclidean layout ready")
    return clip_space


def _compute_hyperbolic(dataset, clip_space) -> None:
    """Steps 4-5: compute HyCoCLIP (hyperbolic) embeddings + Poincaré layout.

    On any failure (download, ONNX inference, layout), falls back to a
    Poincaré projection of the already-computed CLIP embeddings so the demo
    always ships with both geometries.
    """
    from hyperview.embeddings.providers import ModelSpec

    print("\n[4/5] Computing HyCoCLIP embeddings (hyperbolic)...")
    try:
        # First, download the ONNX model from HuggingFace Hub.
        from huggingface_hub import hf_hub_download

        repo_id = "mnm-matin/hyperbolic-clip"
        onnx_path = hf_hub_download(
            repo_id=repo_id,
            filename="hycoclip-vit-s/model.onnx",
        )
        # Presumably the external-weights file referenced by the .onnx graph;
        # downloading it into the same repo cache keeps it next to the graph.
        data_path = hf_hub_download(
            repo_id=repo_id,
            filename="hycoclip-vit-s/model.onnx.data",
        )
        print(f" Downloaded ONNX model to: {onnx_path}")
        print(f" Downloaded ONNX weights to: {data_path}")
        # Create model spec with the local checkpoint path.
        hycoclip_spec = ModelSpec(
            provider="hycoclip_onnx",
            model_id="hycoclip-vit-s",
            checkpoint=onnx_path,
        )
        hyper_space = dataset.compute_embeddings(hycoclip_spec, show_progress=True)
        print(f" Embedding space: {hyper_space}")
        # Compute Poincaré visualization.
        print("\n[5/5] Computing Poincaré disk visualization...")
        dataset.compute_visualization(space_key=hyper_space, geometry="poincare")
        print(" Poincaré layout ready")
    except Exception as e:  # broad on purpose: any failure degrades to CLIP-only
        print(f" WARNING: HyCoCLIP failed: {e}")
        print(" Falling back to CLIP-only with Poincaré projection")
        # Fallback: use CLIP embeddings with Poincaré projection.
        print("\n[5/5] Computing Poincaré visualization from CLIP embeddings...")
        dataset.compute_visualization(space_key=clip_space, geometry="poincare")
        print(" Poincaré layout ready (from CLIP)")


def create_demo_dataset():
    """Create and return the demo dataset with pre-computed embeddings.

    Pipeline: load CIFAR-10 samples, compute CLIP (Euclidean) embeddings
    and their UMAP layout, then compute HyCoCLIP (hyperbolic) embeddings
    and the Poincaré layout — falling back to a Poincaré projection of
    the CLIP embeddings if the HyCoCLIP step fails.

    Returns:
        The populated ``hyperview.Dataset`` (persistent LanceDB storage
        under DATASET_NAME).
    """
    import hyperview as hv

    _print_banner()
    # Create dataset (persistent LanceDB storage).
    dataset = hv.Dataset(DATASET_NAME)
    _load_samples(dataset)
    clip_space = _compute_clip(dataset)
    _compute_hyperbolic(dataset, clip_space)
    return dataset
def main() -> None:
    """Main entry point for pre-computation."""
    dataset = create_demo_dataset()
    # Print a closing summary of what was built.
    divider = "=" * 60
    print("\n" + divider)
    print("Pre-computation complete!")
    print(divider)
    print(f"Samples: {len(dataset)}")
    print(f"Dataset: {DATASET_NAME}")
    print(divider)
if __name__ == "__main__":
    # Run the pre-computation when executed as a script (e.g. during Docker build).
    main()