Spaces:

Jgray21
/

attention_layer_graph

Sleeping

App Files Files Community

Joshua Gray committed on Dec 13, 2025

Commit

de372eb

1 Parent(s): c304b46

UMAP/Pool Performance boost

Browse files

Files changed (1) hide show

src/streamlit_app.py +98 -22

src/streamlit_app.py CHANGED Viewed

@@ -30,6 +30,8 @@ from sklearn.metrics import pairwise_distances
 # Plotly for interactive 3D
 import plotly.graph_objects as go
 # Optional libs (use if present)
 try:
     import hdbscan  # Robust density-based clustering
@@ -384,6 +386,63 @@ def fit_umap_2d(pool: np.ndarray,
     reducer.fit(pool)
     return reducer
 def fit_umap_3d(all_states: np.ndarray,
                 n_neighbors: int = 30,
@@ -568,27 +627,44 @@ def run_pipeline(cfg: Config, model, tok, device, main_text: str, save_artifacts
     main_bundle = extract_hidden_states(model, tok, main_text, cfg.max_length, device)
     layers_np: List[np.ndarray] = main_bundle.hidden_layers   # list of (T,D), length L_all = num_layers+1
     tokens = main_bundle.tokens                               # list of length T
     L_all = len(layers_np)
     #print(f"[Hidden] Layers (incl. embedding): {L_all}, Tokens: {len(tokens)}")
-    # 8.3 Build a pool of states (across a few texts & layers) to fit anchors + UMAP
-    pool_states = []
-    # Sample across first few texts to improve diversity (lightweight)
-    for t in texts[: min(5, len(texts))]:
-        b = extract_hidden_states(model, tok, t, cfg.max_length, device)
-        # Take a subset from each layer to limit pool size
-        for H in b.hidden_layers:
-            T = len(H)
-            take = min(cfg.fit_pool_per_layer, T)
-            idx = np.random.choice(T, size=take, replace=False)
-            pool_states.append(H[idx])
-    pool_states = np.vstack(pool_states) if len(pool_states) else layers_np[-1]
-    #print(f"[Pool] Pooled states for anchors/UMAP: {pool_states.shape}")
-    # 8.4 Fit global anchors (LoT-style features)
-    anchors = fit_global_anchors(pool_states, cfg.anchor_k)
-    # Save anchors for reproducibility
     # 8.5 Build per-layer features for main text (LoT-style distances & uncertainty)
     layer_features = []      # list of (T,K)
@@ -640,10 +716,10 @@ def run_pipeline(cfg: Config, model, tok, device, main_text: str, save_artifacts
     # 8.10 Common 2D manifold via UMAP (fit-once on the pool), then transform each layer
-    reducer2d = fit_umap_2d(pool_states,
                             n_neighbors=cfg.umap_n_neighbors,
                             min_dist=cfg.umap_min_dist,
-                            metric=cfg.umap_metric)
     xy_by_layer = [reducer2d.transform(layers_np[l]) for l in range(L_all)]
     # OPTIONAL: orthogonal alignment across layers (helps if UMAP.transform still drifts)
@@ -682,8 +758,8 @@ def get_model_and_tok(model_name: str):
     return model, tok, device, dtype
 def main():
-    st.set_page_config(page_title="Qwen Layer Explorer", layout="wide")
-    st.title("Qwen: 3D Token Embedding Explorer (Live Hidden States)")
     with st.sidebar:
         st.header("Model / Input")

 # Plotly for interactive 3D
 import plotly.graph_objects as go
+import hashlib
 # Optional libs (use if present)
 try:
     import hdbscan  # Robust density-based clustering
     reducer.fit(pool)
     return reducer
def _corpus_fingerprint(texts, max_items=5, max_chars=4000) -> str:
    """Return a stable SHA-256 hex digest identifying the leading part of a corpus.

    The digest is meant to be used as a cache key so cached artifacts are
    invalidated when the corpus changes.

    Args:
        texts: Sequence of strings; only the first ``max_items`` are considered.
        max_items: Number of leading texts that contribute to the fingerprint.
        max_chars: Total character budget (shared across the considered texts)
            for the content that is hashed. ``None`` hashes the full content.

    Returns:
        64-character lowercase hexadecimal SHA-256 digest.

    Notes:
        * Every considered text is prefixed with its full length before being
          hashed. This keeps item boundaries unambiguous (``["a\\nb"]`` and
          ``["a", "b"]`` no longer collide) and lets a length change beyond
          the ``max_chars`` budget still alter the fingerprint.
        * A same-length edit located entirely past the character budget is
          still not detected; raise ``max_chars`` if that matters.
    """
    digest = hashlib.sha256()
    remaining = max_chars
    for text in texts[:max_items]:
        # Length prefix: unambiguous framing + cheap sensitivity to edits that
        # fall outside the hashed character budget.
        digest.update(f"{len(text)}:".encode("utf-8"))
        if remaining is None:
            chunk = text
        else:
            chunk = text[:max(remaining, 0)]
            remaining -= len(chunk)
        digest.update(chunk.encode("utf-8"))
    return digest.hexdigest()
@st.cache_data(show_spinner=False)
def get_pool_artifacts(
    model_name: str,
    max_length: int,
    anchor_k: int,
    anchor_temp: float,          # unused in the body; only participates in the cache key
    umap_n_neighbors: int,
    umap_min_dist: float,
    umap_metric: str,
    fit_pool_per_layer: int,
    corpus_hash: str,
):
    """Build a pool of hidden states from DEFAULT_CORPUS and fit anchors + a 2D UMAP reducer.

    The result is memoized by ``st.cache_data``, keyed on all arguments.

    Args:
        model_name: Passed to ``get_model_and_tok`` to obtain model/tokenizer/device.
        max_length: Forwarded to ``extract_hidden_states`` for every corpus text.
        anchor_k: Forwarded to ``fit_global_anchors``.
        anchor_temp: Not used for fitting; it only affects the cache key, so
            changing it forces a refit.
        umap_n_neighbors: Forwarded to ``fit_umap_2d``.
        umap_min_dist: Forwarded to ``fit_umap_2d``.
        umap_metric: Forwarded to ``fit_umap_2d``.
        fit_pool_per_layer: Maximum number of token states sampled per layer
            of each corpus text.
        corpus_hash: Corpus fingerprint. It is used as part of the cache key
            and to seed the sampling RNG.

    Returns:
        ``(anchors, reducer2d)`` -- the value returned by ``fit_global_anchors``
        (originally documented as a ``(K, D)`` array) and the fitted reducer
        returned by ``fit_umap_2d``. The reducer must be pickleable for
        ``st.cache_data`` to store it.

    Raises:
        RuntimeError: If no hidden states could be pooled.

    NOTE(review): the pool is always built from ``DEFAULT_CORPUS``. If callers
    derive ``corpus_hash`` from a different corpus, the key and the data will
    disagree -- confirm against the caller.
    """
    # Model loading is cached separately; the returned dtype is not needed here.
    model, tok, device, _dtype = get_model_and_tok(model_name)
    texts = DEFAULT_CORPUS  # pooled set for stability

    # Use a local generator seeded from the cache key instead of the global
    # np.random state: identical arguments then sample an identical pool across
    # restarts/workers, and the caller's global RNG stream is left untouched.
    # (Whether the UMAP fit itself is deterministic depends on fit_umap_2d.)
    seed = int(hashlib.sha256(str(corpus_hash).encode("utf-8")).hexdigest()[:16], 16)
    rng = np.random.default_rng(seed)

    pool_states = []
    for text in texts[:5]:  # slicing already clamps to len(texts)
        bundle = extract_hidden_states(model, tok, text, max_length, device)
        for layer_states in bundle.hidden_layers:
            n_tokens = len(layer_states)
            take = min(fit_pool_per_layer, n_tokens)
            if take <= 0:
                # Empty layer or non-positive budget: nothing to sample.
                continue
            idx = rng.choice(n_tokens, size=take, replace=False)
            pool_states.append(layer_states[idx])

    if not pool_states:
        # fallback: this should rarely happen
        raise RuntimeError("Pool construction produced no states.")

    pool_states = np.vstack(pool_states)
    anchors = fit_global_anchors(pool_states, anchor_k)
    reducer2d = fit_umap_2d(
        pool_states,
        n_neighbors=umap_n_neighbors,
        min_dist=umap_min_dist,
        metric=umap_metric,
    )
    return anchors, reducer2d
 def fit_umap_3d(all_states: np.ndarray,
                 n_neighbors: int = 30,
     main_bundle = extract_hidden_states(model, tok, main_text, cfg.max_length, device)
     layers_np: List[np.ndarray] = main_bundle.hidden_layers   # list of (T,D), length L_all = num_layers+1
     tokens = main_bundle.tokens                               # list of length T
+    # Cached pool artifacts (anchors + fitted UMAP reducer)
+    corpus_hash = _corpus_fingerprint(texts)  # texts is cfg.corpus or DEFAULT_CORPUS
+    anchors, reducer2d = get_pool_artifacts(
+        model_name=cfg.model_name,
+        max_length=cfg.max_length,
+        anchor_k=cfg.anchor_k,
+        anchor_temp=cfg.anchor_temp,
+        umap_n_neighbors=cfg.umap_n_neighbors,
+        umap_min_dist=cfg.umap_min_dist,
+        umap_metric=cfg.umap_metric,
+        fit_pool_per_layer=cfg.fit_pool_per_layer,
+        corpus_hash=corpus_hash,
+    )
     L_all = len(layers_np)
     #print(f"[Hidden] Layers (incl. embedding): {L_all}, Tokens: {len(tokens)}")
+    """
+        # 8.3 Build a pool of states (across a few texts & layers) to fit anchors + UMAP
+        pool_states = []
+        # Sample across first few texts to improve diversity (lightweight)
+        for t in texts[: min(5, len(texts))]:
+            b = extract_hidden_states(model, tok, t, cfg.max_length, device)
+            # Take a subset from each layer to limit pool size
+            for H in b.hidden_layers:
+                T = len(H)
+                take = min(cfg.fit_pool_per_layer, T)
+                idx = np.random.choice(T, size=take, replace=False)
+                pool_states.append(H[idx])
+        pool_states = np.vstack(pool_states) if len(pool_states) else layers_np[-1]
+        #print(f"[Pool] Pooled states for anchors/UMAP: {pool_states.shape}")
+        # 8.4 Fit global anchors (LoT-style features)
+        anchors = fit_global_anchors(pool_states, cfg.anchor_k)
+        # Save anchors for reproducibility
+        """
     # 8.5 Build per-layer features for main text (LoT-style distances & uncertainty)
     layer_features = []      # list of (T,K)
     # 8.10 Common 2D manifold via UMAP (fit-once on the pool), then transform each layer
+    """reducer2d = fit_umap_2d(pool_states,
                             n_neighbors=cfg.umap_n_neighbors,
                             min_dist=cfg.umap_min_dist,
+                            metric=cfg.umap_metric)"""
     xy_by_layer = [reducer2d.transform(layers_np[l]) for l in range(L_all)]
     # OPTIONAL: orthogonal alignment across layers (helps if UMAP.transform still drifts)
     return model, tok, device, dtype
 def main():
+    st.set_page_config(page_title="Layer Explorer", layout="wide")
+    st.title("3D Token Embedding Explorer (Live Hidden States)")
     with st.sidebar:
         st.header("Model / Input")