File size: 4,456 Bytes
23680f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python
"""Pre-compute HuggingFace Spaces demo dataset.

This script is run during Docker build to pre-compute embeddings and
visualizations, ensuring fast startup times on HuggingFace Spaces.

Features:
- CIFAR-10 dataset (300 samples)
- CLIP embeddings (Euclidean) via embed-anything
- HyCoCLIP embeddings (Hyperbolic) via hyper-models ONNX
- Pre-computed UMAP visualizations for both geometries
"""

import os
from pathlib import Path

# Configuration
DATASET_NAME = "cifar10_hf_demo"  # name of the persistent demo dataset created below
HF_DATASET = "uoft-cs/cifar10"  # HuggingFace Hub dataset repo to pull samples from
HF_SPLIT = "train"
HF_IMAGE_KEY = "img"  # presumably the image column of the HF dataset — verify against the repo schema
HF_LABEL_KEY = "label"  # presumably the class-label column — verify against the repo schema
# Sample count is overridable at Docker-build time via the DEMO_SAMPLES env var.
NUM_SAMPLES = int(os.environ.get("DEMO_SAMPLES", 300))

# Storage paths (LanceDB + media)
# NOTE(review): these paths are printed in create_demo_dataset() but not passed to
# hv.Dataset() here — presumably hyperview reads the same env vars internally; confirm.
DEFAULT_DATASETS_DIR = Path(
    os.environ.get("HYPERVIEW_DATASETS_DIR", "/home/user/app/demo_data/datasets")
)
DEFAULT_MEDIA_DIR = Path(os.environ.get("HYPERVIEW_MEDIA_DIR", "/home/user/app/demo_data/media"))

# Model configurations
CLIP_MODEL_ID = "openai/clip-vit-base-patch32"  # Euclidean CLIP backbone used in step 2


def create_demo_dataset():
    """Build and return the demo dataset with pre-computed embeddings.

    Pipeline (printed as steps 1-5): load CIFAR-10 samples, compute CLIP
    (Euclidean) embeddings, compute a UMAP layout, compute HyCoCLIP
    (hyperbolic) embeddings via ONNX, and compute a Poincaré layout.  If the
    HyCoCLIP step fails for any reason, the Poincaré layout is derived from
    the CLIP embeddings instead (best-effort fallback).
    """
    # Imported lazily so the module can be inspected without heavy deps.
    import hyperview as hv
    from hyperview.embeddings.providers import ModelSpec

    banner = "=" * 60
    print(banner)
    print("HyperView HuggingFace Spaces Demo Dataset Builder")
    print(banner)
    print(f"Dataset: {HF_DATASET} ({HF_SPLIT})")
    print(f"Samples: {NUM_SAMPLES}")
    print(f"Datasets dir: {DEFAULT_DATASETS_DIR}")
    print(f"Media dir: {DEFAULT_MEDIA_DIR}")
    print(banner)

    # Persistent LanceDB-backed dataset.
    demo = hv.Dataset(DATASET_NAME)

    # Step 1: ingest CIFAR-10 samples from the Hub.
    print(f"\n[1/5] Loading {NUM_SAMPLES} samples from {HF_DATASET}...")
    added, skipped = demo.add_from_huggingface(
        HF_DATASET,
        split=HF_SPLIT,
        image_key=HF_IMAGE_KEY,
        label_key=HF_LABEL_KEY,
        max_samples=NUM_SAMPLES,
    )
    print(f"      Loaded {added} samples ({skipped} skipped)")

    # Step 2: CLIP embeddings (Euclidean geometry).
    print(f"\n[2/5] Computing CLIP embeddings ({CLIP_MODEL_ID})...")
    clip_space = demo.compute_embeddings(CLIP_MODEL_ID, show_progress=True)
    print(f"      Embedding space: {clip_space}")

    # Step 3: Euclidean UMAP layout over the CLIP space.
    print("\n[3/5] Computing Euclidean (UMAP) visualization...")
    demo.compute_visualization(space_key=clip_space, geometry="euclidean")
    print("      Euclidean layout ready")

    # Steps 4-5: hyperbolic embeddings + Poincaré layout, with fallback.
    print("\n[4/5] Computing HyCoCLIP embeddings (hyperbolic)...")
    try:
        # Fetch the ONNX graph and its external weight file from the Hub.
        from huggingface_hub import hf_hub_download

        hub_repo = "mnm-matin/hyperbolic-clip"
        onnx_path, data_path = (
            hf_hub_download(repo_id=hub_repo, filename=name)
            for name in ("hycoclip-vit-s/model.onnx", "hycoclip-vit-s/model.onnx.data")
        )
        print(f"      Downloaded ONNX model to: {onnx_path}")
        print(f"      Downloaded ONNX weights to: {data_path}")

        # Point the ONNX provider at the downloaded checkpoint.
        onnx_spec = ModelSpec(
            provider="hycoclip_onnx",
            model_id="hycoclip-vit-s",
            checkpoint=onnx_path,
        )
        hyper_space = demo.compute_embeddings(onnx_spec, show_progress=True)
        print(f"      Embedding space: {hyper_space}")

        print("\n[5/5] Computing Poincaré disk visualization...")
        demo.compute_visualization(space_key=hyper_space, geometry="poincare")
        print("      Poincaré layout ready")

    except Exception as e:
        # Deliberate best-effort: a missing/broken HyCoCLIP model must not
        # abort the Docker build, so fall back to projecting CLIP embeddings.
        print(f"      WARNING: HyCoCLIP failed: {e}")
        print("      Falling back to CLIP-only with Poincaré projection")
        print("\n[5/5] Computing Poincaré visualization from CLIP embeddings...")
        demo.compute_visualization(space_key=clip_space, geometry="poincare")
        print("      Poincaré layout ready (from CLIP)")

    return demo


def main() -> None:
    """Run the pre-computation pipeline and print a final summary banner."""
    demo = create_demo_dataset()

    banner = "=" * 60
    # Summary
    print("\n" + banner)
    print("Pre-computation complete!")
    print(banner)
    print(f"Samples: {len(demo)}")
    print(f"Dataset: {DATASET_NAME}")
    print(banner)


# Entry point when executed directly (e.g. during the Docker image build).
if __name__ == "__main__":
    main()