Update src/streamlit_app.py
src/streamlit_app.py (CHANGED, +6 −41)
@@ -214,11 +214,6 @@ def build_threshold_graph(H: np.ndarray, top_pct: float = 0.05, use_cosine: bool
     return G
 
 def percolation_stats(G: nx.Graph) -> Dict[str, float]:
-    """
-    Compute percolation observables (φ, #clusters, χ) as in your notebook.
-      φ : fraction of nodes in the Giant Connected Component (GCC)
-      χ : mean size of components excluding GCC
-    """
     n = G.number_of_nodes()
     if n == 0:
         return dict(phi=0.0, num_clusters=0, chi=0.0, largest_component_size=0, component_sizes=[])
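The deleted docstring was the only place the percolation observables were spelled out. For reference, a minimal sketch of what percolation_stats computes, assuming standard networkx connected components; the body past the n == 0 guard sits outside this hunk, so the aggregation below is an illustration, not the file's exact code.

import networkx as nx
import numpy as np
from typing import Dict

def percolation_stats_sketch(G: nx.Graph) -> Dict[str, float]:
    # phi: fraction of nodes in the Giant Connected Component (GCC)
    # chi: mean size of the components excluding the GCC
    n = G.number_of_nodes()
    if n == 0:
        return dict(phi=0.0, num_clusters=0, chi=0.0,
                    largest_component_size=0, component_sizes=[])
    sizes = sorted((len(c) for c in nx.connected_components(G)), reverse=True)
    gcc = sizes[0]
    rest = sizes[1:]
    chi = float(np.mean(rest)) if rest else 0.0
    return dict(phi=gcc / n, num_clusters=len(sizes), chi=chi,
                largest_component_size=gcc, component_sizes=sizes)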
@@ -283,19 +278,12 @@ def fit_global_anchors(pool: np.ndarray, K: int) -> np.ndarray:
 # ====== Model I/O (hidden states) =============================================================
 @dataclass
 class HiddenStatesBundle:
-    """
-    Encapsulates a single input's hidden states and metadata.
-    hidden_layers: list of np.ndarray of shape (T, D), length = num_layers+1 (incl. embedding)
-    tokens       : list of token strings of length T
-    """
     hidden_layers: List[np.ndarray]
     tokens: List[str]
 
 
 def load_qwen(model_name: str, device: str, dtype: torch.dtype):
-    """
-    Load Qwen with output_hidden_states=True. We use AutoTokenizer for broader compatibility.
-    """
+
     print(f"[Load] {model_name} on {device} ({dtype})")
     config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
     tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
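The rest of load_qwen lies outside this hunk. A minimal sketch of the load pattern the visible lines imply; the model class is an assumption (the diff only shows the config and tokenizer lines), not something this commit confirms.

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

def load_qwen_sketch(model_name: str, device: str, dtype: torch.dtype):
    config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    # ASSUMPTION: AutoModelForCausalLM; the actual class is not shown in the diff.
    model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=dtype)
    model.to(device)
    model.eval()
    return model, tok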
@@ -308,10 +296,7 @@ def load_qwen(model_name: str, device: str, dtype: torch.dtype):
 
 @torch.no_grad()
 def extract_hidden_states(model, tokenizer, text: str, max_length: int, device: str) -> HiddenStatesBundle:
-    """
-    Run a single forward pass to collect all hidden states (incl. embedding layer).
-    Returns CPU numpy arrays to keep GPU memory low.
-    """
+
     inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
     out = model(**inputs)
     # Tuple length = num_layers + 1 (embedding)
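For context, a self-contained sketch of what the deleted docstring describes: one forward pass, then every layer's states (embedding included) moved to CPU numpy. The token extraction and the plain-tuple return below are plausible completions, not the hunk's hidden body.

import numpy as np
import torch

@torch.no_grad()
def extract_hidden_states_sketch(model, tokenizer, text: str, max_length: int, device: str):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
    out = model(**inputs)
    # out.hidden_states: tuple of (1, T, D) tensors, length = num_layers + 1 (embedding first)
    hidden_layers = [h[0].float().cpu().numpy() for h in out.hidden_states]
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    return hidden_layers, tokens  # stand-ins for HiddenStatesBundle's two fields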
@@ -322,25 +307,14 @@ def extract_hidden_states(model, tokenizer, text: str, max_length: int, device:
 
 # ====== LoT-style anchors & features ==========================================================
 def fit_global_anchors(all_states_sampled: np.ndarray, K: int, random_state: int = 42) -> np.ndarray:
-    """
-    Fit KMeans cluster centroids on a pooled set of states (from many layers/texts).
-    These centroids are "anchors" (LoT-like choices) to build low-dim features:
-        f(state) = [dist(state, anchor_j)]_{j=1..K}
-    """
+
     print(f"[Anchors] Fitting {K} global centroids on {len(all_states_sampled)} states ...")
     kmeans = KMeans(n_clusters=K, n_init="auto", random_state=random_state)
     kmeans.fit(all_states_sampled)
     return kmeans.cluster_centers_  # (K, D)
 
 def anchor_features(H: np.ndarray, anchors: np.ndarray, temperature: float = 1.0) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-    """
-    For states H (N,D) and anchors A (K,D):
-      - Compute Euclidean distances to each anchor → Dists (N,K)
-      - Convert to soft probabilities with exp(-Dist/T), normalize row-wise → P (N,K)
-      - Uncertainty = entropy(P)  (cf. LoT Eq. (6))
-      - Top-anchor argmin distance for "consistency"-style comparisons (cf. Eq. (5))
-    Returns (Dists, P, entropy)
-    """
+
     # Distances (N, K)
     dists = pairwise_distances(H, anchors, metric="euclidean")  # (N,K)
     # Soft assignments
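The deleted anchor_features docstring carried the feature math; only the distance line survives in the hunk. A minimal sketch of that math, with a max-shift added for numerical stability (it cancels under row normalization, so the result equals the docstring's exp(-Dist/T) normalized row-wise):

import numpy as np
from sklearn.metrics import pairwise_distances

def anchor_features_sketch(H: np.ndarray, anchors: np.ndarray, temperature: float = 1.0):
    dists = pairwise_distances(H, anchors, metric="euclidean")  # (N, K)
    logits = -dists / temperature
    logits -= logits.max(axis=1, keepdims=True)                 # stabilizes exp(); cancels on normalization
    P = np.exp(logits)
    P /= P.sum(axis=1, keepdims=True)                           # soft assignments, rows sum to 1
    entropy = -(P * np.log(P + 1e-12)).sum(axis=1)              # per-state uncertainty (cf. LoT Eq. (6))
    # The "consistency"-style top anchor (cf. Eq. (5)) is dists.argmin(axis=1).
    return dists, P, entropy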
@@ -361,10 +335,7 @@ def fit_umap_2d(pool: np.ndarray,
                 min_dist: float = 0.05,
                 metric: str = "cosine",
                 random_state: int = 42) -> umap.UMAP:
-    """
-    Fit UMAP once on a diverse pool across layers to preserve orientation.
-    Later layers call .transform() to embed into the SAME 2D space → "MRI stack".
-    """
+
 
     reducer = umap.UMAP(n_components=2, n_neighbors=n_neighbors, min_dist=min_dist,
                         metric=metric, random_state=random_state)
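The fit-once/transform-later contract deleted above is what keeps the per-layer slices aligned in one 2D space. A usage sketch of that pattern, with placeholder shapes (pool size, layer count, and dimensions here are made up):

import numpy as np
import umap

rng = np.random.default_rng(0)
pool = rng.standard_normal((2000, 64)).astype(np.float32)               # pooled states across layers
layers = [rng.standard_normal((128, 64)).astype(np.float32) for _ in range(5)]

reducer = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.05,
                    metric="cosine", random_state=42)
reducer.fit(pool)                                   # fit ONCE on the diverse pool
xy_layers = [reducer.transform(H) for H in layers]  # every layer lands in the SAME 2D space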
@@ -377,15 +348,11 @@ def fit_umap_3d(all_states: np.ndarray,
                 min_dist: float = 0.05,
                 metric: str = "cosine",
                 random_state: int = 42) -> np.ndarray:
-    """
-    Fit a global 3D UMAP embedding for all states at once (alternative to slice stack).
-    Returns coords_3d (N,3) for the concatenated states passed in.
-    """
+
     reducer = umap.UMAP(n_components=3, n_neighbors=n_neighbors, min_dist=min_dist,
                         metric=metric, random_state=random_state)
     return reducer.fit_transform(all_states)
 
-"""## Define Visualization Function"""
 
 # ====== Visualization ========================================================================
 def plotly_3d_layers(xy_layers: List[np.ndarray],
@@ -531,7 +498,6 @@ def plotly_3d_layers(xy_layers: List[np.ndarray],
     )
     return fig
 
-"""## Building the pipeline"""
 
 def run_pipeline(cfg: Config, model, tok, device, main_text: str, save_artifacts: bool = False):
     seed_everything(42)
@@ -630,7 +596,6 @@ def run_pipeline(cfg: Config, model, tok, device, main_text: str, save_artifacts
 
     return fig, {"percolation": percolation, "tokens": tokens}
 
-"""## This section is for the Web App UI"""
 
 @st.cache_resource(show_spinner=False)
 def get_model_and_tok(model_name: str):