Spaces:
Running
Running
| #!/usr/bin/env python | |
| """Pre-compute HuggingFace Spaces demo dataset. | |
| This script is run during Docker build to pre-compute embeddings and | |
| visualizations, ensuring fast startup times on HuggingFace Spaces. | |
| Features: | |
| - CIFAR-10 dataset (300 samples) | |
| - CLIP embeddings (Euclidean) via embed-anything | |
| - HyCoCLIP embeddings (Hyperbolic) via hyper-models ONNX | |
| - Pre-computed UMAP visualizations for both geometries | |
| """ | |
| import os | |
| from pathlib import Path | |
# Configuration
# Logical name of the persistent LanceDB dataset created by this script.
DATASET_NAME = "cifar10_hf_demo"
# HuggingFace Hub dataset to sample from, and which split/columns to read.
HF_DATASET = "uoft-cs/cifar10"
HF_SPLIT = "train"
HF_IMAGE_KEY = "img"  # column holding the images in the HF dataset
HF_LABEL_KEY = "label"  # column holding the class labels
# Sample count; overridable at build time via the DEMO_SAMPLES env var.
NUM_SAMPLES = int(os.environ.get("DEMO_SAMPLES", 300))

# Storage paths (LanceDB + media); defaults target the HF Spaces container layout.
DEFAULT_DATASETS_DIR = Path(
    os.environ.get("HYPERVIEW_DATASETS_DIR", "/home/user/app/demo_data/datasets")
)
DEFAULT_MEDIA_DIR = Path(os.environ.get("HYPERVIEW_MEDIA_DIR", "/home/user/app/demo_data/media"))

# Model configurations
CLIP_MODEL_ID = "openai/clip-vit-base-patch32"
def create_demo_dataset():
    """Create and return the demo dataset with pre-computed embeddings.

    Pipeline (5 steps):
      1. Load CIFAR-10 samples from HuggingFace into a persistent dataset.
      2. Compute CLIP (Euclidean) embeddings.
      3. Compute a Euclidean UMAP layout.
      4. Compute HyCoCLIP (hyperbolic) embeddings via the ONNX provider.
      5. Compute a Poincaré disk layout; if HyCoCLIP fails for any reason,
         fall back to projecting the CLIP embeddings instead.

    Returns:
        The populated ``hyperview`` dataset object.
    """
    # Heavy project import kept function-local so merely importing this
    # module stays cheap (matches the original lazy-import style).
    import hyperview as hv

    print("=" * 60)
    print("HyperView HuggingFace Spaces Demo Dataset Builder")
    print("=" * 60)
    print(f"Dataset: {HF_DATASET} ({HF_SPLIT})")
    print(f"Samples: {NUM_SAMPLES}")
    print(f"Datasets dir: {DEFAULT_DATASETS_DIR}")
    print(f"Media dir: {DEFAULT_MEDIA_DIR}")
    print("=" * 60)

    # Create dataset (persistent LanceDB storage)
    dataset = hv.Dataset(DATASET_NAME)

    _load_samples(dataset)
    clip_space = _compute_clip(dataset)
    _compute_hyperbolic(dataset, clip_space)
    return dataset


def _load_samples(dataset) -> None:
    """Step 1: load CIFAR-10 samples from HuggingFace into *dataset*."""
    print(f"\n[1/5] Loading {NUM_SAMPLES} samples from {HF_DATASET}...")
    added, skipped = dataset.add_from_huggingface(
        HF_DATASET,
        split=HF_SPLIT,
        image_key=HF_IMAGE_KEY,
        label_key=HF_LABEL_KEY,
        max_samples=NUM_SAMPLES,
    )
    print(f" Loaded {added} samples ({skipped} skipped)")


def _compute_clip(dataset):
    """Steps 2-3: CLIP embeddings + Euclidean UMAP layout.

    Returns the embedding-space key so later steps can reuse it.
    """
    print(f"\n[2/5] Computing CLIP embeddings ({CLIP_MODEL_ID})...")
    clip_space = dataset.compute_embeddings(CLIP_MODEL_ID, show_progress=True)
    print(f" Embedding space: {clip_space}")

    print("\n[3/5] Computing Euclidean (UMAP) visualization...")
    dataset.compute_visualization(space_key=clip_space, geometry="euclidean")
    print(" Euclidean layout ready")
    return clip_space


def _compute_hyperbolic(dataset, clip_space) -> None:
    """Steps 4-5: HyCoCLIP embeddings + Poincaré layout, with CLIP fallback.

    Best-effort by design: the demo build must succeed even when the ONNX
    model cannot be downloaded or run, so any failure falls back to a
    Poincaré projection of the already-computed CLIP embeddings.
    """
    from hyperview.embeddings.providers import ModelSpec

    print("\n[4/5] Computing HyCoCLIP embeddings (hyperbolic)...")
    try:
        # First, download the ONNX model from HuggingFace Hub.
        from huggingface_hub import hf_hub_download

        repo_id = "mnm-matin/hyperbolic-clip"
        onnx_path = hf_hub_download(
            repo_id=repo_id,
            filename="hycoclip-vit-s/model.onnx",
        )
        # The .data file is downloaded alongside the model but never passed
        # on — presumably ONNX resolves external weights relative to the
        # model file's location, so co-locating them in the cache suffices.
        data_path = hf_hub_download(
            repo_id=repo_id,
            filename="hycoclip-vit-s/model.onnx.data",
        )
        print(f" Downloaded ONNX model to: {onnx_path}")
        print(f" Downloaded ONNX weights to: {data_path}")

        # Create model spec pointing at the locally cached ONNX file.
        hycoclip_spec = ModelSpec(
            provider="hycoclip_onnx",
            model_id="hycoclip-vit-s",
            checkpoint=onnx_path,
        )
        hyper_space = dataset.compute_embeddings(hycoclip_spec, show_progress=True)
        print(f" Embedding space: {hyper_space}")

        print("\n[5/5] Computing Poincaré disk visualization...")
        dataset.compute_visualization(space_key=hyper_space, geometry="poincare")
        print(" Poincaré layout ready")
    except Exception as e:
        # Broad catch is deliberate: any failure (network, provider, ONNX
        # runtime) must not abort the Docker build.
        print(f" WARNING: HyCoCLIP failed: {e}")
        print(" Falling back to CLIP-only with Poincaré projection")
        print("\n[5/5] Computing Poincaré visualization from CLIP embeddings...")
        dataset.compute_visualization(space_key=clip_space, geometry="poincare")
        print(" Poincaré layout ready (from CLIP)")
def main() -> None:
    """Build the demo dataset and print a completion summary."""
    ds = create_demo_dataset()

    # Summary banner
    rule = "=" * 60
    print("\n" + rule)
    print("Pre-computation complete!")
    print(rule)
    print(f"Samples: {len(ds)}")
    print(f"Dataset: {DATASET_NAME}")
    print(rule)
# Run the build when executed directly (e.g. during the Docker image build).
if __name__ == "__main__":
    main()