Spaces:
Runtime error
Runtime error
precompute
Browse files
- app.py +22 -8
- assets/umap_gpt2_cosine.npy +3 -0
app.py
CHANGED
|
@@ -2,9 +2,9 @@ import gradio as gr
|
|
| 2 |
import torch
|
| 3 |
import numpy as np
|
| 4 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 5 |
-
from sklearn.decomposition import PCA
|
| 6 |
from sklearn.preprocessing import normalize
|
| 7 |
from umap import UMAP
|
|
|
|
| 8 |
import json
|
| 9 |
|
| 10 |
# Model configuration
|
|
@@ -22,6 +22,7 @@ vocab_tokens = tokenizer.convert_ids_to_tokens(list(range(vocab_size)))
|
|
| 22 |
# Cache for embeddings and UMAP (computed once at startup)
|
| 23 |
embeddings_cache = None
|
| 24 |
umap_projections_cache = None
|
|
|
|
| 25 |
umap_params = {
|
| 26 |
"n_neighbors": 75,
|
| 27 |
"min_dist": 0.15,
|
|
@@ -42,15 +43,28 @@ def initialize_embeddings():
|
|
| 42 |
# Normalize rows (cosine distance works best with normalized vectors)
|
| 43 |
norm_embeds = normalize(embeddings_cache, norm="l2", axis=1)
|
| 44 |
|
| 45 |
-
#
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
-
# UMAP to 2D (full vocab)
|
| 52 |
umap_model = UMAP(**umap_params)
|
| 53 |
-
umap_projections_cache = umap_model.fit_transform(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
return embeddings_cache, umap_projections_cache
|
| 56 |
|
|
|
|
| 2 |
import torch
|
| 3 |
import numpy as np
|
| 4 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
|
|
| 5 |
from sklearn.preprocessing import normalize
|
| 6 |
from umap import UMAP
|
| 7 |
+
import os
|
| 8 |
import json
|
| 9 |
|
| 10 |
# Model configuration
|
|
|
|
| 22 |
# Cache for embeddings and UMAP (computed once at startup)
|
| 23 |
embeddings_cache = None
|
| 24 |
umap_projections_cache = None
|
| 25 |
+
UMAP_LAYOUT_PATH = os.environ.get("UMAP_LAYOUT_PATH", os.path.join("assets", "umap_gpt2_cosine.npy"))
|
| 26 |
umap_params = {
|
| 27 |
"n_neighbors": 75,
|
| 28 |
"min_dist": 0.15,
|
|
|
|
| 43 |
# Normalize rows (cosine distance works best with normalized vectors)
|
| 44 |
norm_embeds = normalize(embeddings_cache, norm="l2", axis=1)
|
| 45 |
|
| 46 |
+
# If a precomputed layout exists, load it
|
| 47 |
+
try:
|
| 48 |
+
if os.path.isfile(UMAP_LAYOUT_PATH):
|
| 49 |
+
umap_projections_cache = np.load(UMAP_LAYOUT_PATH)
|
| 50 |
+
# Basic validation
|
| 51 |
+
if umap_projections_cache.shape[0] != norm_embeds.shape[0] or umap_projections_cache.shape[1] != 2:
|
| 52 |
+
raise ValueError("Precomputed UMAP layout shape mismatch; recomputing.")
|
| 53 |
+
return embeddings_cache, umap_projections_cache
|
| 54 |
+
except Exception as e:
|
| 55 |
+
# If load fails, fall through to recompute
|
| 56 |
+
print(f"Warning: failed to load precomputed UMAP layout: {e}. Recomputing...")
|
| 57 |
|
| 58 |
+
# UMAP to 2D (full vocab) — direct from normalized embeddings (no PCA)
|
| 59 |
umap_model = UMAP(**umap_params)
|
| 60 |
+
umap_projections_cache = umap_model.fit_transform(norm_embeds).astype(np.float32) # [V, 2]
|
| 61 |
+
|
| 62 |
+
# Save for future cold starts
|
| 63 |
+
try:
|
| 64 |
+
os.makedirs(os.path.dirname(UMAP_LAYOUT_PATH), exist_ok=True)
|
| 65 |
+
np.save(UMAP_LAYOUT_PATH, umap_projections_cache)
|
| 66 |
+
except Exception as e:
|
| 67 |
+
print(f"Warning: failed to save UMAP layout to {UMAP_LAYOUT_PATH}: {e}")
|
| 68 |
|
| 69 |
return embeddings_cache, umap_projections_cache
|
| 70 |
|
assets/umap_gpt2_cosine.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1886c8ea52bf44f920915c1b8188954b94a1dc06584deaea526259d096a470e1
|
| 3 |
+
size 402184
|