ollibolli committed on
Commit
51afe30
·
1 Parent(s): d5ba468

precompute

Browse files
Files changed (2) hide show
  1. app.py +22 -8
  2. assets/umap_gpt2_cosine.npy +3 -0
app.py CHANGED
@@ -2,9 +2,9 @@ import gradio as gr
2
  import torch
3
  import numpy as np
4
  from transformers import AutoTokenizer, AutoModelForCausalLM
5
- from sklearn.decomposition import PCA
6
  from sklearn.preprocessing import normalize
7
  from umap import UMAP
 
8
  import json
9
 
10
  # Model configuration
@@ -22,6 +22,7 @@ vocab_tokens = tokenizer.convert_ids_to_tokens(list(range(vocab_size)))
22
  # Cache for embeddings and UMAP (computed once at startup)
23
  embeddings_cache = None
24
  umap_projections_cache = None
 
25
  umap_params = {
26
  "n_neighbors": 75,
27
  "min_dist": 0.15,
@@ -42,15 +43,28 @@ def initialize_embeddings():
42
  # Normalize rows (cosine distance works best with normalized vectors)
43
  norm_embeds = normalize(embeddings_cache, norm="l2", axis=1)
44
 
45
- # Optional PCA(50) for speed/stability before UMAP (not a fallback layout)
46
- d = norm_embeds.shape[1]
47
- n_components = min(50, d)
48
- pca50 = PCA(n_components=n_components, svd_solver="randomized", random_state=0)
49
- reduced = pca50.fit_transform(norm_embeds)
 
 
 
 
 
 
50
 
51
- # UMAP to 2D (full vocab)
52
  umap_model = UMAP(**umap_params)
53
- umap_projections_cache = umap_model.fit_transform(reduced).astype(np.float32) # [V, 2]
 
 
 
 
 
 
 
54
 
55
  return embeddings_cache, umap_projections_cache
56
 
 
2
  import torch
3
  import numpy as np
4
  from transformers import AutoTokenizer, AutoModelForCausalLM
 
5
  from sklearn.preprocessing import normalize
6
  from umap import UMAP
7
+ import os
8
  import json
9
 
10
  # Model configuration
 
22
  # Cache for embeddings and UMAP (computed once at startup)
23
  embeddings_cache = None
24
  umap_projections_cache = None
25
+ UMAP_LAYOUT_PATH = os.environ.get("UMAP_LAYOUT_PATH", os.path.join("assets", "umap_gpt2_cosine.npy"))
26
  umap_params = {
27
  "n_neighbors": 75,
28
  "min_dist": 0.15,
 
43
  # Normalize rows (cosine distance works best with normalized vectors)
44
  norm_embeds = normalize(embeddings_cache, norm="l2", axis=1)
45
 
46
+ # If a precomputed layout exists, load it
47
+ try:
48
+ if os.path.isfile(UMAP_LAYOUT_PATH):
49
+ umap_projections_cache = np.load(UMAP_LAYOUT_PATH)
50
+ # Basic validation
51
+ if umap_projections_cache.shape[0] != norm_embeds.shape[0] or umap_projections_cache.shape[1] != 2:
52
+ raise ValueError("Precomputed UMAP layout shape mismatch; recomputing.")
53
+ return embeddings_cache, umap_projections_cache
54
+ except Exception as e:
55
+ # If load fails, fall through to recompute
56
+ print(f"Warning: failed to load precomputed UMAP layout: {e}. Recomputing...")
57
 
58
+ # UMAP to 2D (full vocab) — direct from normalized embeddings (no PCA)
59
  umap_model = UMAP(**umap_params)
60
+ umap_projections_cache = umap_model.fit_transform(norm_embeds).astype(np.float32) # [V, 2]
61
+
62
+ # Save for future cold starts
63
+ try:
64
+ os.makedirs(os.path.dirname(UMAP_LAYOUT_PATH), exist_ok=True)
65
+ np.save(UMAP_LAYOUT_PATH, umap_projections_cache)
66
+ except Exception as e:
67
+ print(f"Warning: failed to save UMAP layout to {UMAP_LAYOUT_PATH}: {e}")
68
 
69
  return embeddings_cache, umap_projections_cache
70
 
assets/umap_gpt2_cosine.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1886c8ea52bf44f920915c1b8188954b94a1dc06584deaea526259d096a470e1
3
+ size 402184