# HyperView — scripts/precompute_hf_demo.py
# (upload residue: morozovdd, "feat: add HyperView app for space", commit 23680f2)
#!/usr/bin/env python
"""Pre-compute HuggingFace Spaces demo dataset.
This script is run during Docker build to pre-compute embeddings and
visualizations, ensuring fast startup times on HuggingFace Spaces.
Features:
- CIFAR-10 dataset (300 samples)
- CLIP embeddings (Euclidean) via embed-anything
- HyCoCLIP embeddings (Hyperbolic) via hyper-models ONNX
- Pre-computed UMAP visualizations for both geometries
"""
import os
from pathlib import Path
# Configuration
DATASET_NAME = "cifar10_hf_demo"  # name of the persistent demo dataset
HF_DATASET = "uoft-cs/cifar10"  # HuggingFace Hub dataset repo id
HF_SPLIT = "train"  # split passed to add_from_huggingface
HF_IMAGE_KEY = "img"  # image column key in the HF dataset
HF_LABEL_KEY = "label"  # label column key in the HF dataset
# Sample count is overridable at Docker build time via the DEMO_SAMPLES env var.
NUM_SAMPLES = int(os.environ.get("DEMO_SAMPLES", 300))
# Storage paths (LanceDB + media)
# NOTE(review): only printed in this script — presumably hyperview itself reads
# the same env vars to decide where to persist data; confirm against the app.
DEFAULT_DATASETS_DIR = Path(
os.environ.get("HYPERVIEW_DATASETS_DIR", "/home/user/app/demo_data/datasets")
)
DEFAULT_MEDIA_DIR = Path(os.environ.get("HYPERVIEW_MEDIA_DIR", "/home/user/app/demo_data/media"))
# Model configurations
CLIP_MODEL_ID = "openai/clip-vit-base-patch32"
def _print_banner() -> None:
    """Print the run-configuration header for the build logs."""
    print("=" * 60)
    print("HyperView HuggingFace Spaces Demo Dataset Builder")
    print("=" * 60)
    print(f"Dataset: {HF_DATASET} ({HF_SPLIT})")
    print(f"Samples: {NUM_SAMPLES}")
    print(f"Datasets dir: {DEFAULT_DATASETS_DIR}")
    print(f"Media dir: {DEFAULT_MEDIA_DIR}")
    print("=" * 60)


def _load_samples(dataset) -> None:
    """Step 1: load up to NUM_SAMPLES CIFAR-10 samples into *dataset*."""
    print(f"\n[1/5] Loading {NUM_SAMPLES} samples from {HF_DATASET}...")
    added, skipped = dataset.add_from_huggingface(
        HF_DATASET,
        split=HF_SPLIT,
        image_key=HF_IMAGE_KEY,
        label_key=HF_LABEL_KEY,
        max_samples=NUM_SAMPLES,
    )
    print(f" Loaded {added} samples ({skipped} skipped)")


def _compute_clip(dataset):
    """Steps 2-3: compute CLIP (Euclidean) embeddings and their UMAP layout.

    Returns:
        The embedding-space key, so the hyperbolic step can fall back to
        these embeddings if HyCoCLIP fails.
    """
    print(f"\n[2/5] Computing CLIP embeddings ({CLIP_MODEL_ID})...")
    clip_space = dataset.compute_embeddings(CLIP_MODEL_ID, show_progress=True)
    print(f" Embedding space: {clip_space}")
    print("\n[3/5] Computing Euclidean (UMAP) visualization...")
    dataset.compute_visualization(space_key=clip_space, geometry="euclidean")
    print(" Euclidean layout ready")
    return clip_space


def _compute_hyperbolic(dataset, clip_space) -> None:
    """Steps 4-5: compute HyCoCLIP (hyperbolic) embeddings + Poincaré layout.

    On any failure (download, ONNX inference, layout), falls back to a
    Poincaré projection of the already-computed CLIP embeddings so the demo
    always ships with both geometries.
    """
    from hyperview.embeddings.providers import ModelSpec

    print("\n[4/5] Computing HyCoCLIP embeddings (hyperbolic)...")
    try:
        # First, download the ONNX model from HuggingFace Hub.
        from huggingface_hub import hf_hub_download

        repo_id = "mnm-matin/hyperbolic-clip"
        onnx_path = hf_hub_download(
            repo_id=repo_id,
            filename="hycoclip-vit-s/model.onnx",
        )
        # Presumably the external-weights file referenced by the .onnx graph;
        # downloading it into the same repo cache keeps it next to the graph.
        data_path = hf_hub_download(
            repo_id=repo_id,
            filename="hycoclip-vit-s/model.onnx.data",
        )
        print(f" Downloaded ONNX model to: {onnx_path}")
        print(f" Downloaded ONNX weights to: {data_path}")
        # Create model spec with the local checkpoint path.
        hycoclip_spec = ModelSpec(
            provider="hycoclip_onnx",
            model_id="hycoclip-vit-s",
            checkpoint=onnx_path,
        )
        hyper_space = dataset.compute_embeddings(hycoclip_spec, show_progress=True)
        print(f" Embedding space: {hyper_space}")
        # Compute Poincaré visualization.
        print("\n[5/5] Computing Poincaré disk visualization...")
        dataset.compute_visualization(space_key=hyper_space, geometry="poincare")
        print(" Poincaré layout ready")
    except Exception as e:  # broad on purpose: any failure degrades to CLIP-only
        print(f" WARNING: HyCoCLIP failed: {e}")
        print(" Falling back to CLIP-only with Poincaré projection")
        # Fallback: use CLIP embeddings with Poincaré projection.
        print("\n[5/5] Computing Poincaré visualization from CLIP embeddings...")
        dataset.compute_visualization(space_key=clip_space, geometry="poincare")
        print(" Poincaré layout ready (from CLIP)")


def create_demo_dataset():
    """Create and return the demo dataset with pre-computed embeddings.

    Pipeline: load CIFAR-10 samples, compute CLIP (Euclidean) embeddings
    and their UMAP layout, then compute HyCoCLIP (hyperbolic) embeddings
    and the Poincaré layout — falling back to a Poincaré projection of
    the CLIP embeddings if the HyCoCLIP step fails.

    Returns:
        The populated ``hyperview.Dataset`` (persistent LanceDB storage
        under DATASET_NAME).
    """
    import hyperview as hv

    _print_banner()
    # Create dataset (persistent LanceDB storage).
    dataset = hv.Dataset(DATASET_NAME)
    _load_samples(dataset)
    clip_space = _compute_clip(dataset)
    _compute_hyperbolic(dataset, clip_space)
    return dataset
def main() -> None:
    """Main entry point for pre-computation."""
    dataset = create_demo_dataset()
    # Print a closing summary of what was built.
    divider = "=" * 60
    print("\n" + divider)
    print("Pre-computation complete!")
    print(divider)
    print(f"Samples: {len(dataset)}")
    print(f"Dataset: {DATASET_NAME}")
    print(divider)
if __name__ == "__main__":
    # Run the pre-computation when executed as a script (e.g. during Docker build).
    main()