Spaces:
Running
Running
File size: 4,456 Bytes
23680f2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
#!/usr/bin/env python
"""Pre-compute HuggingFace Spaces demo dataset.
This script is run during Docker build to pre-compute embeddings and
visualizations, ensuring fast startup times on HuggingFace Spaces.
Features:
- CIFAR-10 dataset (300 samples)
- CLIP embeddings (Euclidean) via embed-anything
- HyCoCLIP embeddings (Hyperbolic) via hyper-models ONNX
- Pre-computed UMAP visualizations for both geometries
"""
import os
from pathlib import Path

# Configuration
DATASET_NAME = "cifar10_hf_demo"  # name the LanceDB-backed dataset is created under
HF_DATASET = "uoft-cs/cifar10"  # HuggingFace Hub dataset repo id
HF_SPLIT = "train"  # split to draw samples from
HF_IMAGE_KEY = "img"  # image column key in the HF dataset
HF_LABEL_KEY = "label"  # label column key in the HF dataset
NUM_SAMPLES = int(os.environ.get("DEMO_SAMPLES", 300))  # sample count; override via DEMO_SAMPLES
# Storage paths (LanceDB + media); defaults target the Spaces container layout,
# overridable via HYPERVIEW_DATASETS_DIR / HYPERVIEW_MEDIA_DIR env vars.
DEFAULT_DATASETS_DIR = Path(
    os.environ.get("HYPERVIEW_DATASETS_DIR", "/home/user/app/demo_data/datasets")
)
DEFAULT_MEDIA_DIR = Path(os.environ.get("HYPERVIEW_MEDIA_DIR", "/home/user/app/demo_data/media"))
# Model configurations
CLIP_MODEL_ID = "openai/clip-vit-base-patch32"  # Euclidean CLIP encoder used in steps 2-3
def create_demo_dataset():
    """Create and return the demo dataset with pre-computed embeddings.

    Pipeline: ingest CIFAR-10 samples, compute CLIP (Euclidean) embeddings
    and a UMAP layout, then attempt HyCoCLIP (hyperbolic) embeddings with a
    Poincaré layout, falling back to projecting the CLIP space if the
    HyCoCLIP stage fails for any reason.
    """
    import hyperview as hv
    from hyperview.embeddings.providers import ModelSpec

    rule = "=" * 60
    print(rule)
    print("HyperView HuggingFace Spaces Demo Dataset Builder")
    print(rule)
    print(f"Dataset: {HF_DATASET} ({HF_SPLIT})")
    print(f"Samples: {NUM_SAMPLES}")
    print(f"Datasets dir: {DEFAULT_DATASETS_DIR}")
    print(f"Media dir: {DEFAULT_MEDIA_DIR}")
    print(rule)

    # Persistent LanceDB-backed dataset.
    ds = hv.Dataset(DATASET_NAME)

    # Step 1: ingest samples from the HuggingFace Hub.
    print(f"\n[1/5] Loading {NUM_SAMPLES} samples from {HF_DATASET}...")
    n_added, n_skipped = ds.add_from_huggingface(
        HF_DATASET,
        split=HF_SPLIT,
        image_key=HF_IMAGE_KEY,
        label_key=HF_LABEL_KEY,
        max_samples=NUM_SAMPLES,
    )
    print(f" Loaded {n_added} samples ({n_skipped} skipped)")

    # Step 2: Euclidean CLIP embeddings.
    print(f"\n[2/5] Computing CLIP embeddings ({CLIP_MODEL_ID})...")
    euclid_key = ds.compute_embeddings(CLIP_MODEL_ID, show_progress=True)
    print(f" Embedding space: {euclid_key}")

    # Step 3: UMAP layout over the Euclidean space.
    print("\n[3/5] Computing Euclidean (UMAP) visualization...")
    ds.compute_visualization(space_key=euclid_key, geometry="euclidean")
    print(" Euclidean layout ready")

    # Steps 4-5: hyperbolic embeddings + Poincaré layout (best-effort).
    _add_hyperbolic_layout(ds, euclid_key, ModelSpec)
    return ds


def _add_hyperbolic_layout(ds, euclid_key, model_spec_cls):
    """Compute HyCoCLIP (hyperbolic) embeddings and a Poincaré layout on *ds*.

    Best-effort: on any failure (download, ONNX inference, layout) the
    function falls back to a Poincaré projection of the existing Euclidean
    space *euclid_key* instead of aborting the build.
    """
    print("\n[4/5] Computing HyCoCLIP embeddings (hyperbolic)...")
    try:
        # Fetch the ONNX graph and its external weights file from the Hub;
        # the import is inside the try so a missing package also triggers
        # the CLIP fallback below.
        from huggingface_hub import hf_hub_download

        repo = "mnm-matin/hyperbolic-clip"
        model_file = hf_hub_download(
            repo_id=repo,
            filename="hycoclip-vit-s/model.onnx",
        )
        weights_file = hf_hub_download(
            repo_id=repo,
            filename="hycoclip-vit-s/model.onnx.data",
        )
        print(f" Downloaded ONNX model to: {model_file}")
        print(f" Downloaded ONNX weights to: {weights_file}")

        # Point the ONNX provider at the locally cached checkpoint.
        spec = model_spec_cls(
            provider="hycoclip_onnx",
            model_id="hycoclip-vit-s",
            checkpoint=model_file,
        )
        hyper_key = ds.compute_embeddings(spec, show_progress=True)
        print(f" Embedding space: {hyper_key}")

        print("\n[5/5] Computing Poincaré disk visualization...")
        ds.compute_visualization(space_key=hyper_key, geometry="poincare")
        print(" Poincaré layout ready")
    except Exception as e:
        print(f" WARNING: HyCoCLIP failed: {e}")
        print(" Falling back to CLIP-only with Poincaré projection")
        print("\n[5/5] Computing Poincaré visualization from CLIP embeddings...")
        ds.compute_visualization(space_key=euclid_key, geometry="poincare")
        print(" Poincaré layout ready (from CLIP)")
def main() -> None:
    """Entry point: build the demo dataset, then print a short summary."""
    ds = create_demo_dataset()

    # Summary banner.
    rule = "=" * 60
    print("\n" + rule)
    print("Pre-computation complete!")
    print(rule)
    print(f"Samples: {len(ds)}")
    print(f"Dataset: {DATASET_NAME}")
    print(rule)


if __name__ == "__main__":
    main()
|