#!/usr/bin/env python
"""Pre-compute HuggingFace Spaces demo dataset.

This script is run during Docker build to pre-compute embeddings and
visualizations, ensuring fast startup times on HuggingFace Spaces.

Features:
- CIFAR-10 dataset (300 samples)
- CLIP embeddings (Euclidean) via embed-anything
- HyCoCLIP embeddings (Hyperbolic) via hyper-models ONNX
- Pre-computed UMAP visualizations for both geometries
"""

import os
from pathlib import Path

# Configuration
DATASET_NAME = "cifar10_hf_demo"
HF_DATASET = "uoft-cs/cifar10"
HF_SPLIT = "train"
HF_IMAGE_KEY = "img"
HF_LABEL_KEY = "label"
NUM_SAMPLES = int(os.environ.get("DEMO_SAMPLES", 300))

# Storage paths (LanceDB + media)
DEFAULT_DATASETS_DIR = Path(
    os.environ.get("HYPERVIEW_DATASETS_DIR", "/home/user/app/demo_data/datasets")
)
DEFAULT_MEDIA_DIR = Path(os.environ.get("HYPERVIEW_MEDIA_DIR", "/home/user/app/demo_data/media"))

# Model configurations
CLIP_MODEL_ID = "openai/clip-vit-base-patch32"


def _print_banner() -> None:
    """Print the build-configuration banner."""
    print("=" * 60)
    print("HyperView HuggingFace Spaces Demo Dataset Builder")
    print("=" * 60)
    print(f"Dataset: {HF_DATASET} ({HF_SPLIT})")
    print(f"Samples: {NUM_SAMPLES}")
    print(f"Datasets dir: {DEFAULT_DATASETS_DIR}")
    print(f"Media dir: {DEFAULT_MEDIA_DIR}")
    print("=" * 60)


def _load_samples(dataset) -> None:
    """Step 1: load CIFAR-10 samples from the HuggingFace Hub into *dataset*."""
    print(f"\n[1/5] Loading {NUM_SAMPLES} samples from {HF_DATASET}...")
    added, skipped = dataset.add_from_huggingface(
        HF_DATASET,
        split=HF_SPLIT,
        image_key=HF_IMAGE_KEY,
        label_key=HF_LABEL_KEY,
        max_samples=NUM_SAMPLES,
    )
    print(f"  Loaded {added} samples ({skipped} skipped)")


def _compute_clip(dataset):
    """Steps 2-3: compute CLIP embeddings and the Euclidean UMAP layout.

    Returns the embedding-space key for the CLIP embeddings so later steps
    can fall back to them.
    """
    print(f"\n[2/5] Computing CLIP embeddings ({CLIP_MODEL_ID})...")
    clip_space = dataset.compute_embeddings(CLIP_MODEL_ID, show_progress=True)
    print(f"  Embedding space: {clip_space}")

    print("\n[3/5] Computing Euclidean (UMAP) visualization...")
    dataset.compute_visualization(space_key=clip_space, geometry="euclidean")
    print("  Euclidean layout ready")
    return clip_space


def _compute_hyperbolic(dataset, clip_space) -> None:
    """Steps 4-5: compute HyCoCLIP embeddings and the Poincaré disk layout.

    Best-effort: if the ONNX model cannot be downloaded or run, fall back to
    projecting the already-computed CLIP embeddings (``clip_space``) onto the
    Poincaré disk so the demo always ships both geometries.
    """
    from hyperview.embeddings.providers import ModelSpec

    print("\n[4/5] Computing HyCoCLIP embeddings (hyperbolic)...")
    try:
        # First, download the ONNX model from HuggingFace Hub.
        # The graph uses external weights, so the ``.onnx.data`` companion
        # file must also be fetched (hf_hub_download caches them side by
        # side, letting the runtime resolve the weights by relative path).
        from huggingface_hub import hf_hub_download

        repo_id = "mnm-matin/hyperbolic-clip"
        onnx_path = hf_hub_download(
            repo_id=repo_id,
            filename="hycoclip-vit-s/model.onnx",
        )
        data_path = hf_hub_download(
            repo_id=repo_id,
            filename="hycoclip-vit-s/model.onnx.data",
        )
        print(f"  Downloaded ONNX model to: {onnx_path}")
        print(f"  Downloaded ONNX weights to: {data_path}")

        # Create model spec with local path
        hycoclip_spec = ModelSpec(
            provider="hycoclip_onnx",
            model_id="hycoclip-vit-s",
            checkpoint=onnx_path,
        )
        hyper_space = dataset.compute_embeddings(hycoclip_spec, show_progress=True)
        print(f"  Embedding space: {hyper_space}")

        print("\n[5/5] Computing Poincaré disk visualization...")
        dataset.compute_visualization(space_key=hyper_space, geometry="poincare")
        print("  Poincaré layout ready")
    except Exception as e:
        # Deliberately broad: this runs during Docker build and the image
        # must still be usable (CLIP-only) if the hyperbolic model fails.
        print(f"  WARNING: HyCoCLIP failed: {e}")
        print("  Falling back to CLIP-only with Poincaré projection")

        print("\n[5/5] Computing Poincaré visualization from CLIP embeddings...")
        dataset.compute_visualization(space_key=clip_space, geometry="poincare")
        print("  Poincaré layout ready (from CLIP)")


def create_demo_dataset():
    """Create and return the demo dataset with pre-computed embeddings."""
    # Imported lazily so merely importing this module stays cheap.
    import hyperview as hv

    _print_banner()

    # Create dataset (persistent LanceDB storage)
    dataset = hv.Dataset(DATASET_NAME)

    _load_samples(dataset)
    clip_space = _compute_clip(dataset)
    _compute_hyperbolic(dataset, clip_space)

    return dataset


def main() -> None:
    """Main entry point for pre-computation."""
    dataset = create_demo_dataset()

    # Summary
    print("\n" + "=" * 60)
    print("Pre-computation complete!")
    print("=" * 60)
    print(f"Samples: {len(dataset)}")
    print(f"Dataset: {DATASET_NAME}")
    print("=" * 60)


if __name__ == "__main__":
    main()