cdpearlman commited on
Commit
7fa8fb4
·
1 Parent(s): d86e476

Attention refactor, better categorization and explanation

Browse files
app.py CHANGED
@@ -18,7 +18,7 @@ import json
18
  import torch
19
  from utils import (load_model_and_get_patterns, execute_forward_pass, extract_layer_data,
20
  perform_beam_search, execute_forward_pass_with_multi_layer_head_ablation)
21
- from utils.head_detection import categorize_all_heads
22
  from utils.model_config import get_auto_selections
23
  from utils.token_attribution import compute_integrated_gradients, compute_simple_gradient_attribution
24
 
@@ -576,10 +576,11 @@ def update_pipeline_content(activation_data, model_name):
576
  except:
577
  pass
578
 
579
- # Agent G: Get full head categorization for attention stage UI (expandable categories)
580
  head_categories = None
581
  try:
582
- head_categories = categorize_all_heads(activation_data)
 
583
  except:
584
  pass
585
 
 
18
  import torch
19
  from utils import (load_model_and_get_patterns, execute_forward_pass, extract_layer_data,
20
  perform_beam_search, execute_forward_pass_with_multi_layer_head_ablation)
21
+ from utils.head_detection import get_active_head_summary
22
  from utils.model_config import get_auto_selections
23
  from utils.token_attribution import compute_integrated_gradients, compute_simple_gradient_attribution
24
 
 
576
  except:
577
  pass
578
 
579
+ # Get head categorization from pre-computed JSON + runtime verification
580
  head_categories = None
581
  try:
582
+ from utils.head_detection import get_active_head_summary
583
+ head_categories = get_active_head_summary(activation_data, model_name)
584
  except:
585
  pass
586
 
components/pipeline.py CHANGED
@@ -400,133 +400,263 @@ def create_attention_content(attention_html=None, top_attended=None, layer_info=
400
  """
401
  Create content for the attention stage.
402
 
403
- Agent G: Removed "Most attended tokens" section (deprecated). Now shows head categorization
404
- to help users understand what different attention heads are doing.
405
 
406
  Args:
407
  attention_html: BertViz HTML string for attention visualization
408
- top_attended: DEPRECATED - no longer used, kept for backward compatibility
409
  layer_info: Optional layer information for context
410
- head_categories: Dict mapping category names to lists of head info dicts (from categorize_all_heads)
411
- Each head info has: {'layer': N, 'head': M, 'label': 'LN-HM', ...}
412
- Can also accept counts dict for backward compatibility.
413
  """
414
  content_items = [
415
  html.Div([
416
  html.H5("What happens here:", style={'color': '#495057', 'marginBottom': '8px'}),
417
  html.P([
418
  "The model looks at ", html.Strong("all tokens at once"),
419
- " and figures out which ones are related to each other. This is called 'attention' - ",
420
  "each token 'attends to' other tokens to gather context for its prediction."
421
  ], style={'color': '#6c757d', 'fontSize': '14px', 'marginBottom': '12px'}),
422
  html.P([
423
- "Attention has multiple ", html.Strong("heads"), " - each head learns to look for different types of relationships. ",
424
- "For example, one head might track subject-verb agreement, while another tracks pronouns and their referents."
425
  ], style={'color': '#6c757d', 'fontSize': '14px', 'marginBottom': '16px'})
426
  ])
427
  ]
428
 
429
- # Agent G: Head Categorization Summary with expandable categories
430
- if head_categories:
431
- category_labels = {
432
- 'previous_token': ('Previous-Token', '#667eea', 'Heads that attend to the immediately preceding token'),
433
- 'first_token': ('First/Positional', '#764ba2', 'Heads that focus on the first token or positional patterns'),
434
- 'bow': ('Bag-of-Words', '#f093fb', 'Heads with diffuse attention across many tokens'),
435
- 'syntactic': ('Syntactic', '#4facfe', 'Heads that capture grammatical relationships'),
436
- 'other': ('Other', '#6c757d', 'Heads with mixed or specialized patterns')
 
 
 
 
437
  }
438
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
439
  category_sections = []
440
- for cat_key in ['previous_token', 'first_token', 'bow', 'syntactic', 'other']:
441
- cat_data = head_categories.get(cat_key, [])
 
 
 
 
442
 
443
- # Handle both list format (full data) and int format (counts only, backward compat)
444
- if isinstance(cat_data, int):
445
- count = cat_data
446
- head_list = []
447
- else:
448
- count = len(cat_data) if cat_data else 0
449
- head_list = cat_data
 
450
 
451
- if count > 0 and cat_key in category_labels:
452
- label, color, tooltip = category_labels[cat_key]
453
-
454
- # Build head list display (only if we have full data)
455
- head_chips = []
456
- if head_list:
457
- for head_info in head_list:
458
- head_label = head_info.get('label', f"L{head_info.get('layer', '?')}-H{head_info.get('head', '?')}")
459
- head_chips.append(
460
- html.Span(head_label, style={
461
- 'display': 'inline-block',
462
- 'padding': '4px 8px',
463
- 'margin': '2px',
464
- 'backgroundColor': f'{color}15',
465
- 'border': f'1px solid {color}30',
466
- 'borderRadius': '4px',
467
- 'fontSize': '12px',
468
- 'fontFamily': 'monospace'
469
- })
470
- )
471
-
472
- # Create expandable section for this category
473
- category_sections.append(
474
- html.Details([
475
- html.Summary([
476
- html.Span(label, style={'fontWeight': '500', 'color': '#495057'}),
477
- html.Span(f" ({count})", style={'marginLeft': '4px', 'color': '#6c757d'})
478
- ], style={
479
- 'padding': '8px 12px',
480
- 'backgroundColor': f'{color}15',
481
- 'border': f'1px solid {color}30',
482
- 'borderRadius': '8px',
483
- 'cursor': 'pointer',
484
- 'userSelect': 'none',
485
- 'listStyle': 'none',
486
- 'display': 'flex',
487
- 'alignItems': 'center'
488
- }, title=tooltip),
489
- # Expanded content - list of heads
490
  html.Div([
491
- html.P(tooltip, style={
492
- 'color': '#6c757d',
493
- 'fontSize': '12px',
494
- 'marginBottom': '8px',
495
- 'fontStyle': 'italic'
496
- }),
497
- html.Div(head_chips if head_chips else [
498
- html.Span("Head details not available", style={'color': '#999', 'fontSize': '12px'})
 
 
 
 
499
  ], style={
500
- 'display': 'flex',
501
- 'flexWrap': 'wrap',
502
- 'gap': '4px'
 
 
 
 
503
  })
504
  ], style={
505
- 'padding': '12px',
506
- 'backgroundColor': '#fafbfc',
507
- 'borderRadius': '0 0 8px 8px',
508
- 'marginTop': '-1px',
509
- 'border': f'1px solid {color}30',
510
- 'borderTop': 'none'
511
  })
512
- ], style={'marginBottom': '8px'})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
513
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
514
 
515
  if category_sections:
 
 
 
 
 
 
 
 
 
 
516
  content_items.append(
517
  html.Div([
518
- html.H5("Attention Head Categories:", style={'color': '#495057', 'marginBottom': '12px'}),
519
  html.P([
520
- html.I(className='fas fa-info-circle', style={'color': '#6c757d', 'marginRight': '6px'}),
521
- "Click each category to expand and see which heads belong to it."
522
  ], style={'color': '#6c757d', 'fontSize': '12px', 'marginBottom': '12px'}),
523
- html.Div(category_sections)
 
 
 
 
 
 
 
 
 
 
524
  ], style={'marginBottom': '16px'})
525
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
526
 
527
  # BertViz visualization with navigation instructions
528
  if attention_html:
529
- # Agent G: Enhanced navigation instructions for head view
530
  content_items.append(
531
  html.Div([
532
  html.H5("How to Navigate the Attention Visualization:", style={'color': '#495057', 'marginBottom': '12px'}),
@@ -537,7 +667,6 @@ def create_attention_content(attention_html=None, top_attended=None, layer_info=
537
  html.Span("Click on layer/head numbers at the top to view specific attention heads.",
538
  style={'color': '#6c757d'})
539
  ], style={'marginBottom': '4px'}),
540
- # Sub-points for click behaviors
541
  html.Div([
542
  html.Span("• ", style={'color': '#f093fb', 'fontWeight': 'bold'}),
543
  html.Strong("Single click ", style={'color': '#495057'}),
 
400
  """
401
  Create content for the attention stage.
402
 
403
+ Displays head categorization with active/inactive states, activation bars,
404
+ suggested prompts, and guided interpretation.
405
 
406
  Args:
407
  attention_html: BertViz HTML string for attention visualization
408
+ top_attended: DEPRECATED - no longer used
409
  layer_info: Optional layer information for context
410
+ head_categories: Output from get_active_head_summary() dict with 'categories' key
411
+ containing per-category data with activation scores.
412
+ Falls back gracefully if None or old format.
413
  """
414
  content_items = [
415
  html.Div([
416
  html.H5("What happens here:", style={'color': '#495057', 'marginBottom': '8px'}),
417
  html.P([
418
  "The model looks at ", html.Strong("all tokens at once"),
419
+ " and figures out which ones are related to each other. This is called 'attention' — ",
420
  "each token 'attends to' other tokens to gather context for its prediction."
421
  ], style={'color': '#6c757d', 'fontSize': '14px', 'marginBottom': '12px'}),
422
  html.P([
423
+ "Attention has multiple ", html.Strong("heads"), " — each head learns to look for different types of relationships. ",
424
+ "Below you can see what role each head plays and whether it's active on your current input."
425
  ], style={'color': '#6c757d', 'fontSize': '14px', 'marginBottom': '16px'})
426
  ])
427
  ]
428
 
429
+ # New: Head Roles Panel using get_active_head_summary() output
430
+ if head_categories and isinstance(head_categories, dict) and 'categories' in head_categories:
431
+ categories = head_categories['categories']
432
+
433
+ # Color scheme per category
434
+ category_colors = {
435
+ 'previous_token': '#667eea',
436
+ 'induction': '#e67e22',
437
+ 'duplicate_token': '#9b59b6',
438
+ 'positional': '#2ecc71',
439
+ 'diffuse': '#3498db',
440
+ 'other': '#95a5a6'
441
  }
442
 
443
+ # Find the top recommended head for guided interpretation
444
+ guided_head = None
445
+ guided_cat = None
446
+ for cat_key in ['previous_token', 'induction', 'positional']:
447
+ cat_data = categories.get(cat_key, {})
448
+ heads = cat_data.get('heads', [])
449
+ active_heads = [h for h in heads if h.get('is_active')]
450
+ if active_heads:
451
+ best = max(active_heads, key=lambda h: h['activation_score'])
452
+ if guided_head is None or best['activation_score'] > guided_head['activation_score']:
453
+ guided_head = best
454
+ guided_cat = cat_data.get('display_name', cat_key)
455
+
456
+ # Guided interpretation recommendation
457
+ if guided_head:
458
+ content_items.append(
459
+ html.Div([
460
+ html.I(className='fas fa-lightbulb', style={'color': '#f39c12', 'marginRight': '8px', 'fontSize': '16px'}),
461
+ html.Span([
462
+ html.Strong("Try this: "),
463
+ f"Select Layer {guided_head['layer']}, Head {guided_head['head']} in the visualization below — ",
464
+ f"this is a {guided_cat} head ",
465
+ f"(activation: {guided_head['activation_score']:.0%} on your input)."
466
+ ], style={'color': '#495057', 'fontSize': '13px'})
467
+ ], style={
468
+ 'padding': '12px 16px', 'backgroundColor': '#fef9e7', 'borderRadius': '8px',
469
+ 'border': '1px solid #f9e79f', 'marginBottom': '16px', 'display': 'flex', 'alignItems': 'center'
470
+ })
471
+ )
472
+
473
+ # Build category sections
474
  category_sections = []
475
+ category_order = ['previous_token', 'induction', 'duplicate_token', 'positional', 'diffuse', 'other']
476
+
477
+ for cat_key in category_order:
478
+ cat_data = categories.get(cat_key, {})
479
+ if not cat_data:
480
+ continue
481
 
482
+ display_name = cat_data.get('display_name', cat_key)
483
+ description = cat_data.get('description', '')
484
+ educational_text = cat_data.get('educational_text', '')
485
+ icon_name = cat_data.get('icon', 'circle')
486
+ is_applicable = cat_data.get('is_applicable', True)
487
+ suggested_prompt = cat_data.get('suggested_prompt')
488
+ heads = cat_data.get('heads', [])
489
+ color = category_colors.get(cat_key, '#95a5a6')
490
 
491
+ # Active vs inactive indicator
492
+ has_active_heads = any(h.get('is_active') for h in heads)
493
+ status_icon = '●' if (is_applicable and has_active_heads) else '○'
494
+ status_color = color if (is_applicable and has_active_heads) else '#ccc'
495
+
496
+ # Skip "other" if no heads (which is the normal case)
497
+ if cat_key == 'other' and not heads:
498
+ continue
499
+
500
+ # Build head items with activation bars
501
+ head_items = []
502
+ if heads:
503
+ for head_info in heads:
504
+ activation = head_info.get('activation_score', 0.0)
505
+ is_active = head_info.get('is_active', False)
506
+ label = head_info.get('label', f"L{head_info['layer']}-H{head_info['head']}")
507
+
508
+ # Activation bar
509
+ bar_width = max(activation * 100, 2) # Min 2% for visibility
510
+ bar_color = color if is_active else '#ddd'
511
+
512
+ head_items.append(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
513
  html.Div([
514
+ # Head label
515
+ html.Span(label, style={
516
+ 'fontFamily': 'monospace', 'fontSize': '12px', 'fontWeight': '500',
517
+ 'minWidth': '60px', 'color': '#495057' if is_active else '#aaa',
518
+ }, title=f"See Layer {head_info['layer']}, Head {head_info['head']} in the visualization below"),
519
+ # Activation bar
520
+ html.Div([
521
+ html.Div(style={
522
+ 'width': f'{bar_width}%', 'height': '100%',
523
+ 'backgroundColor': bar_color, 'borderRadius': '3px',
524
+ 'transition': 'width 0.3s ease'
525
+ })
526
  ], style={
527
+ 'flex': '1', 'height': '12px', 'backgroundColor': '#f0f0f0',
528
+ 'borderRadius': '3px', 'margin': '0 8px', 'overflow': 'hidden'
529
+ }),
530
+ # Score label
531
+ html.Span(f"{activation:.2f}", style={
532
+ 'fontSize': '11px', 'fontFamily': 'monospace',
533
+ 'color': '#495057' if is_active else '#bbb', 'minWidth': '32px'
534
  })
535
  ], style={
536
+ 'display': 'flex', 'alignItems': 'center', 'marginBottom': '4px',
537
+ 'opacity': '1' if is_active else '0.5'
 
 
 
 
538
  })
539
+ )
540
+
541
+ # Build the category section
542
+ # Header content
543
+ summary_children = [
544
+ html.Span(status_icon, style={
545
+ 'color': status_color, 'fontSize': '16px', 'marginRight': '8px'
546
+ }),
547
+ html.Span(display_name, style={'fontWeight': '500', 'color': '#495057'}),
548
+ ]
549
+
550
+ if heads:
551
+ active_count = sum(1 for h in heads if h.get('is_active'))
552
+ summary_children.append(
553
+ html.Span(f" ({active_count}/{len(heads)} active)", style={
554
+ 'marginLeft': '6px', 'color': '#6c757d', 'fontSize': '12px'
555
+ })
556
+ )
557
+
558
+ if not is_applicable:
559
+ summary_children.append(
560
+ html.Span(" — not triggered on this input", style={
561
+ 'marginLeft': '6px', 'color': '#aaa', 'fontSize': '12px', 'fontStyle': 'italic'
562
+ })
563
  )
564
+
565
+ # Expanded content
566
+ expanded_children = []
567
+
568
+ # Educational explanation
569
+ if educational_text:
570
+ expanded_children.append(
571
+ html.P(educational_text, style={
572
+ 'color': '#6c757d', 'fontSize': '13px', 'marginBottom': '10px',
573
+ 'fontStyle': 'italic', 'lineHeight': '1.5'
574
+ })
575
+ )
576
+
577
+ # Suggested prompt (for grayed-out categories)
578
+ if not is_applicable and suggested_prompt:
579
+ expanded_children.append(
580
+ html.Div([
581
+ html.I(className='fas fa-flask', style={'color': '#e67e22', 'marginRight': '6px'}),
582
+ html.Span(suggested_prompt, style={'color': '#e67e22', 'fontSize': '12px'})
583
+ ], style={
584
+ 'padding': '8px 12px', 'backgroundColor': '#fef5e7',
585
+ 'borderRadius': '6px', 'marginBottom': '10px', 'border': '1px solid #fde8c8'
586
+ })
587
+ )
588
+
589
+ # Head activation bars
590
+ if head_items:
591
+ expanded_children.append(html.Div(head_items))
592
+
593
+ category_sections.append(
594
+ html.Details([
595
+ html.Summary(summary_children, style={
596
+ 'padding': '10px 14px',
597
+ 'backgroundColor': f'{color}08' if is_applicable else '#fafafa',
598
+ 'border': f'1px solid {color}25' if is_applicable else '1px solid #eee',
599
+ 'borderRadius': '8px', 'cursor': 'pointer', 'userSelect': 'none',
600
+ 'listStyle': 'none', 'display': 'flex', 'alignItems': 'center'
601
+ }),
602
+ html.Div(expanded_children, style={
603
+ 'padding': '12px 14px', 'backgroundColor': '#fafbfc',
604
+ 'borderRadius': '0 0 8px 8px', 'marginTop': '-1px',
605
+ 'border': f'1px solid {color}25' if is_applicable else '1px solid #eee',
606
+ 'borderTop': 'none'
607
+ })
608
+ ], style={'marginBottom': '8px'}, open=(cat_key == 'previous_token')) # Default-open first category
609
+ )
610
 
611
  if category_sections:
612
+ # Legend
613
+ legend = html.Div([
614
+ html.Span("● = active on your input", style={
615
+ 'color': '#495057', 'fontSize': '11px', 'marginRight': '16px'
616
+ }),
617
+ html.Span("○ = role exists but not triggered", style={
618
+ 'color': '#aaa', 'fontSize': '11px'
619
+ })
620
+ ], style={'marginBottom': '10px'})
621
+
622
  content_items.append(
623
  html.Div([
624
+ html.H5("Attention Head Roles:", style={'color': '#495057', 'marginBottom': '8px'}),
625
  html.P([
626
+ "Each category represents a type of behavior we detected in this model's attention heads. ",
627
+ "Click a category to see individual heads and how strongly they're activated on your input."
628
  ], style={'color': '#6c757d', 'fontSize': '12px', 'marginBottom': '12px'}),
629
+ legend,
630
+ html.Div(category_sections),
631
+ # Accuracy caveat
632
+ html.Div([
633
+ html.I(className='fas fa-info-circle', style={'color': '#6c757d', 'marginRight': '6px', 'fontSize': '11px'}),
634
+ html.Span(
635
+ "These categories are simplified labels based on each head's dominant behavior. "
636
+ "In reality, heads can serve multiple roles and may behave differently on different inputs.",
637
+ style={'color': '#999', 'fontSize': '11px'}
638
+ )
639
+ ], style={'marginTop': '12px', 'padding': '8px 12px', 'backgroundColor': '#f8f9fa', 'borderRadius': '6px'})
640
  ], style={'marginBottom': '16px'})
641
  )
642
+ elif head_categories is None:
643
+ # Model not analyzed — show fallback message
644
+ content_items.append(
645
+ html.Div([
646
+ html.I(className='fas fa-info-circle', style={'color': '#6c757d', 'marginRight': '8px'}),
647
+ html.Span(
648
+ "Head categorization is not available for this model. "
649
+ "The attention visualization below still shows the full attention patterns.",
650
+ style={'color': '#6c757d', 'fontSize': '13px'}
651
+ )
652
+ ], style={
653
+ 'padding': '12px', 'backgroundColor': '#f8f9fa', 'borderRadius': '8px',
654
+ 'border': '1px solid #dee2e6', 'marginBottom': '16px'
655
+ })
656
+ )
657
 
658
  # BertViz visualization with navigation instructions
659
  if attention_html:
 
660
  content_items.append(
661
  html.Div([
662
  html.H5("How to Navigate the Attention Visualization:", style={'color': '#495057', 'marginBottom': '12px'}),
 
667
  html.Span("Click on layer/head numbers at the top to view specific attention heads.",
668
  style={'color': '#6c757d'})
669
  ], style={'marginBottom': '4px'}),
 
670
  html.Div([
671
  html.Span("• ", style={'color': '#f093fb', 'fontWeight': 'bold'}),
672
  html.Strong("Single click ", style={'color': '#495057'}),
jarvis_llmvis_ux_review.md ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LLMVis UX & Explanation Review
2
+ **Date:** 2026-02-26
3
+ **Reviewer:** JARVIS
4
+ **Method:** Playwright automated walkthrough of https://cdpearlman-llmvis.hf.space (GPT-2 124M, prompt: "The cat sat on the mat. The cat")
5
+ **Reference:** `attention_handoff.md` (attention head categorization spec)
6
+
7
+ ---
8
+
9
+ ## Executive Summary
10
+
11
+ The app is in solid working shape. The pipeline storytelling is clean, the BertViz integration works, and attribution renders well. The two biggest gaps against the handoff spec are: (1) the attention head categorization is broken — 132/144 heads are mislabeled as "First/Positional," swamping all meaningful signal; and (2) the induction, duplicate, and diffuse head categories from the spec are entirely absent. Beyond that, the attention visualization is the weakest explanation panel — it shows the heatmap but doesn't teach the student what to look for. Ablation UX also has friction and never surfaced results in testing.
12
+
13
+ ---
14
+
15
+ ## 1. Overall Layout & First Impression
16
+
17
+ **What's good:**
18
+ - Clean gradient header, uncluttered layout
19
+ - The pipeline section ("How the Model Processes Your Input") is a strong pedagogical frame — the numbered steps with the flow chip bar (Input → Tokens → Embed → Attention → MLP → Output) is excellent
20
+ - Glossary modal auto-opens on first visit, which is a good onboarding move
21
+ - The sidebar module selection (showing `transformer.h.{N}.attn` etc.) is a nice power-user layer
22
+
23
+ **Issues:**
24
+ - **Glossary modal close button is off-screen** at default viewport widths. The `×` renders at x≈1858 on a 1400px window. Students on laptops will be stuck staring at a modal they can't close without scrolling right. Fix: position the close button inside the modal boundary, not at the document edge.
25
+ - **45-second cold start with no feedback.** After clicking Analyze, the pipeline stages show "Awaiting analysis..." with no progress indicator, spinner, or ETA. For a student, this looks broken. Fix: add a loading spinner or "Model is warming up (~30s)..." message on first run.
26
+ - **Generation Settings sliders are confusing.** "Number of Generation Choices" with values 1/3/5 is jargon. Students don't know what beam search is. The label should be "Explore How Many Different Continuations?" or similar, with a tooltip. The current glossary entry on Beam Search is good but isn't linked from the slider.
27
+
28
+ ---
29
+
30
+ ## 2. Tokenization Stage
31
+
32
+ **What's good:**
33
+ - Clean token→ID table. Exactly the right content.
34
+ - "Your text is split into 10 tokens" summary in the header is great.
35
+
36
+ **Issues:**
37
+ - **No visual "aha" moment.** The table shows Token→ID correctly, but doesn't show *why* "The" becomes 464 vs "the" becoming 262. The capitalization distinction (same word, different token) is sitting right there in this example and the app doesn't call it out. This is a perfect teachable moment — highlight it.
38
+ - **No subword tokenization example.** The prompt was simple English so all tokens were whole words. When a student types something with subwords (e.g., "transformers"), they won't know that's unusual. Consider adding a note: "Notice: some words may split into multiple pieces — try typing 'unhappiness' to see subword tokenization."
39
+ - **The token ID numbers mean nothing to students.** Worth a one-liner: "These IDs are just addresses in a vocabulary table of 50,257 words and word-pieces."
40
+
41
+ ---
42
+
43
+ ## 3. Embedding Stage
44
+
45
+ **What's good:**
46
+ - The `Token ID → Lookup Table → [768-dimensional vector]` flow diagram is clean and conceptually correct.
47
+ - The callout box ("How the lookup table was created: During training on billions of text examples...") is excellent — this is exactly the kind of "where did this come from?" context students need.
48
+
49
+ **Issues:**
50
+ - **No actual data shown.** The stage says "768-dimensional vector" but never shows a student what even 5 dimensions of that vector look like. Even a truncated display like `[0.23, -1.41, 0.07, ...]` would make it real.
51
+ - **No similarity demo.** The explanation says "words with similar meanings (like 'happy' and 'joyful') have similar vectors" — but doesn't show it. A small cosine similarity callout using tokens actually in the input ("'cat' and 'mat' are somewhat similar; 'cat' and 'The' are not") would land this point.
52
+ - **Missing: positional embeddings.** This is a significant omission. The embedding stage in a transformer is `token_embedding + positional_embedding`. The current explanation only covers token embeddings. Students who read further literature will be confused. Add: "Each token also gets a positional embedding added — a second vector encoding *where* in the sequence it appears."
53
+
54
+ ---
55
+
56
+ ## 4. Attention Stage
57
+
58
+ This is the most important and most underbuilt section. The handoff doc has a detailed vision that is only partially implemented.
59
+
60
+ ### 4a. Head Category Panel
61
+
62
+ **Critical bug: First/Positional is consuming 132/144 heads.**
63
+
64
+ The categorization output:
65
+ - Previous-Token: 6 heads ✓ (reasonable)
66
+ - First/Positional: **132 heads** ✗ (this is ~92% of all heads — clearly wrong)
67
+ - Syntactic: 5 heads (plausible)
68
+ - Other: 1 head
69
+
70
+ This makes the category panel meaningless. A student sees a wall of 132 head IDs under "First/Positional" and learns nothing. The classification threshold for positional heads is almost certainly too loose, OR the `all_scores` from the offline script are being compared against an incorrect threshold. The handoff spec calls for a cap of ~8 heads per category with layer diversity enforcement — that logic is either not implemented or the thresholds need significant tuning.
71
+
72
+ **Missing categories from the spec:**
73
+ The handoff doc specifies 6 categories:
74
+ 1. ✅ Previous Token (implemented)
75
+ 2. ❌ **Induction** (missing entirely)
76
+ 3. ❌ **Duplicate Token** (missing entirely)
77
+ 4. ✅ First/Positional (implemented but broken threshold)
78
+ 5. ❌ **Diffuse / Bag-of-Words** (missing entirely)
79
+ 6. ✅ Other/Unclassified (implemented)
80
+
81
+ "Syntactic" appears as a category but isn't in the handoff spec — unclear where it came from or how it's detected.
82
+
83
+ **Missing: runtime activation scoring.** The spec calls for each head to show an activation score on the *current input* (e.g., whether induction heads are firing given the repeated "The cat" in the prompt). Nothing like this exists yet — heads are just listed as belonging to categories with no indication of whether they're active or dormant on this specific input.
84
+
85
+ **Missing: greyed-out heads with "suggested prompts."** The spec's pedagogically most powerful idea — "Try adding a repeated sentence to see induction heads light up" — doesn't exist at all. This is the thing that turns passive observation into active discovery.
86
+
87
+ ### 4b. Attention Visualization (BertViz)
88
+
89
+ **What's good:**
90
+ - BertViz integration works and renders the attention heatmap
91
+ - The navigation instructions (single click, double click, hover) are clear
92
+
93
+ **Issues:**
94
+ - **No guided interpretation.** The visualization shows lines but doesn't tell the student what they're looking at. For a student who just read that "some heads track pronouns," they need a nudge: "Try Layer 4, Head 11 — this head often looks at the previous word." Right now the student opens a heatmap of spaghetti lines and has no idea what to conclude.
95
+ - **The attention viz and head category panel are disconnected.** Clicking a head in the category list should highlight/select it in the BertViz below. The handoff spec mentions this: "Clicking a head navigates to its attention heatmap." That linkage doesn't exist.
96
+ - **No explanation of what "good" attention looks like.** The viz shows all heads at once by default. For a 12×12 model that's 144 attention patterns — overwhelming. The default view should be a single interesting head (e.g., the strongest previous-token head), not all heads.
97
+ - **Layer selector is bare.** The "Layer: [dropdown]" control has no context. Why would a student change the layer? Add: "Earlier layers tend to capture syntax; later layers capture meaning."
98
+
99
+ ---
100
+
101
+ ## 5. MLP (Feed-Forward) Stage
102
+
103
+ **What's good:**
104
+ - The `768d → 3072d → 768d` expand/compress diagram is clean
105
+ - The "Why expand then compress?" callout box is excellent — the neuron activation framing is correct
106
+ - "This happens in each of the model's 12 layers, with attention and MLP working together" is a good summary
107
+
108
+ **Issues:**
109
+ - **No connection to the current input.** The Paris/France example is generic and not connected to the actual prompt being analyzed. Consider: "For your prompt, the MLP layers are likely retrieving knowledge about common English sentence structures."
110
+ - **No visualization.** MLP is the only stage with purely static text and a diagram. Even a simple bar chart of "top activated neurons at layer X" would make this real. The handoff doc doesn't spec this out, but it's a gap.
111
+ - **Missing: the residual stream framing.** The glossary defines "Residual Stream" but the MLP stage doesn't mention that the MLP *adds* to the residual stream rather than replacing it. This is fundamental to why the model can accumulate knowledge across layers.
112
+
113
+ ---
114
+
115
+ ## 6. Output Selection Stage
116
+
117
+ **What's good:**
118
+ - Top-5 next-token predictions with probability bars is exactly right
119
+ - The full-sentence context display with highlighted predicted token is excellent UX
120
+ - The "Note on Token Selection" callout about Beam Search and MoE is appropriately nuanced
121
+
122
+ **Issues:**
123
+ - **"13.5% confidence" framing is misleading.** "Confidence" implies certainty; this is a softmax probability, which is better described as "the model assigned a 13.5% probability to 'was' as the next word." Students may misread this as "the model is 13.5% confident it's right."
124
+ - **No contrast with wrong predictions.** The chart shows top-5 but doesn't explain *why* the model predicted "was" over "sat." A connection back to attribution ("The token 'cat' had the highest influence on predicting 'was'") would close the loop.
125
+ - **The token slider is unclear.** "Step through generated tokens" with a slider defaulting to 0 and showing "was" is confusing — it looks like there's nothing to step through. Label it: "Generated token 1 of 1: was" and grey out or hide the slider when only 1 token was generated.
126
+
127
+ ---
128
+
129
+ ## 7. Token Attribution Panel
130
+
131
+ **What's good:**
132
+ - The visualization works well — darker tokens = more important is intuitive
133
+ - The bar chart with normalized attribution scores is clean
134
+ - Results matched expectations: "was" (the second "cat" token, position 9) scored 1.0, "The" scored 0.87 — sensible given the prompt structure
135
+
136
+ **Issues:**
137
+ - **"Simple Gradient" is selected by default, not "Integrated Gradients."** The UI labels Simple Gradient as "faster, less accurate" and Integrated Gradients as "more accurate, slower" — but defaults to the less accurate one. For an educational tool where accuracy matters more than speed, this should be reversed. Or at minimum, note: "For learning purposes, Integrated Gradients gives more reliable results."
138
+ - **No explanation of what attribution scores mean in plain English.** The callout says "Tokens with higher attribution scores contributed more to the model's prediction" — but students need: "The second 'cat' scored highest because the model is pattern-matching 'The cat...' to predict what typically follows 'The cat' in English text."
139
+ - **No visual connection to the actual attention visualization.** If "was" had high attribution from "cat," students should be able to click through to see which attention heads facilitated that. Right now attribution and attention are completely siloed.
140
+ - **Target Token dropdown is confusing.** "Use top predicted token (default)" is fine, but the empty text box below it with "Leave empty to compute attribution for the top predicted token" is redundant and confusing — why show a text box that you immediately tell them not to fill?
141
+
142
+ ---
143
+
144
+ ## 8. Ablation Panel
145
+
146
+ **Issues (mostly UX):**
147
+ - **Ablation didn't show results in automated testing** — the head selection reset when switching tabs, suggesting state management issues between the Ablation and Attribution tabs.
148
+ - **No presets or suggestions.** The student faces a blank "Layer / Head" picker and has no idea which heads are interesting to ablate. The category panel above already identified previous-token heads (L4-H11, etc.) — there should be a "Try ablating this head" link from the category panel directly into the ablation form.
149
+ - **"Run Ablation Experiment" is permanently greyed out** until a head is added. The disabled state has no tooltip explaining why. Add: "Add at least one head above to run the experiment."
150
+ - **No explanation of what to expect.** Before running, tell students: "If this head is important, the top prediction may change. If it doesn't change, the head wasn't critical for this input."
151
+ - **No result interpretation.** After running (when it works), the diff between original and ablated predictions needs plain-English interpretation: "Removing L4-H11 changed 'was' (13.5%) → 'sat' (18.2%). This suggests that head was suppressing 'sat' as a prediction."
152
+
153
+ ---
154
+
155
+ ## 9. Sidebar
156
+
157
+ **What's good:**
158
+ - The "Model loaded successfully! Detected family: GPT-2 architecture" green badge is good UX
159
+ - Module selection dropdowns (Attention Modules, Layer Blocks, Normalization Parameters) make sense for power users
160
+
161
+ **Issues:**
162
+ - **Sidebar purpose is unclear to students.** There's no explanation of what changing "Attention Modules" does or why a student would want to. This entire panel reads like a developer debug tool that was left exposed.
163
+ - **"Clear Selections" does what, exactly?** No tooltip.
164
+ - Consider: either hide the sidebar behind an "Advanced" toggle for student mode, or add inline documentation for each control.
165
+
166
+ ---
167
+
168
+ ## 10. Chatbot (Robot Icon)
169
+
170
+ The robot icon is visible at bottom-right but the chat panel contents weren't captured in automated testing (JS error prevented inspection). Recommend manual review of the chatbot's response quality and whether it contextualizes responses to the current model/prompt state.
171
+
172
+ ---
173
+
174
+ ## Priority Recommendations for Cursor
175
+
176
+ ### 🔴 Critical (do these first)
177
+
178
+ 1. **Fix attention head categorization thresholds.** First/Positional capturing 132/144 heads makes the entire category panel meaningless. Tighten the threshold, enforce the ~8-head cap per category from the spec, and add layer diversity. This is the highest-impact fix.
179
+
180
+ 2. **Add the missing head categories.** Induction, Duplicate Token, and Diffuse are all specced in `attention_handoff.md` with detection logic. They need to be implemented. Induction is especially important for this exact prompt (repeated "The cat").
181
+
182
+ 3. **Fix the modal close button off-screen bug.** Students can't close the glossary modal on standard laptop viewports. Easy CSS fix: `position: absolute; right: 16px` inside the modal container, not the document.
183
+
184
+ 4. **Add a loading state after clicking Analyze.** 45 seconds of static "Awaiting analysis..." with no spinner is a UX failure. Add a pulsing animation or "Loading model..." progress message.
185
+
186
+ ### 🟡 High Priority
187
+
188
+ 5. **Connect head categories to the BertViz visualization.** Clicking a head ID (e.g., L4-H11) in the category panel should auto-select that head in the attention viz below.
189
+
190
+ 6. **Add runtime activation scoring to head categories.** Per the spec: show whether each head type is active on the current input. Gray out induction heads if there's no repetition in the input, with a "Try: 'The cat sat. The cat'" suggested prompt.
191
+
192
+ 7. **Add positional embeddings to the Embedding stage explanation.** Currently missing an entire half of what embeddings are.
193
+
194
+ 8. **Fix ablation state management.** Head selections shouldn't reset when switching between Ablation and Attribution tabs.
195
+
196
+ 9. **Change attribution default to Integrated Gradients.** It's the more accurate method; this is an educational tool, not a speed benchmark.
197
+
198
+ 10. **Capitalize on the tokenization "aha" moment.** "The" (464) vs "the" (262) is sitting right there in the example. Call it out explicitly.
199
+
200
+ ### 🟢 Enhancements
201
+
202
+ 11. **Add guided "what to look for" text to the attention visualization.** Pick one interesting head per model (pre-annotated) and surface it as a recommendation: "Try Layer 4, Head 11 to see a previous-token head in action."
203
+
204
+ 12. **Add suggested prompts for exploring each head category.** "To see induction heads activate, try: 'The cat sat on the mat. The cat...'"
205
+
206
+ 13. **Reframe "confidence" in Output stage.** Replace with "probability" throughout.
207
+
208
+ 14. **Link attribution results to attention heads.** "The token 'cat' was most influential — see which heads connected it to the prediction in the Attention stage."
209
+
210
+ 15. **Fix the Output stage token slider** — hide or disable it when only 1 token was generated.
211
+
212
+ 16. **Add a brief "what would you like to explore?" prompt to the ablation UI** with pre-suggested heads from the category panel.
213
+
214
+ 17. **Sidebar: add explanatory text** for what Module Selection controls, or hide it in an "Advanced" section.
215
+
216
+ ---
217
+
218
+ ## What's Already Strong (Don't Break)
219
+
220
+ - The 5-stage pipeline structure and the flow chip bar — keep it exactly as is
221
+ - The BertViz integration — it works and the navigation instructions are clear
222
+ - The callout boxes in Embedding and MLP — these are the best explanation text in the app
223
+ - The token attribution visualization (darker = more important) — intuitive and correct
224
+ - The top-5 output prediction chart — exactly the right content
225
+ - The glossary modal content — all 8 entries are well-written
226
+
227
+ ---
228
+
229
+ ## Comparison to Handoff Spec
230
+
231
+ | Spec Feature | Status |
232
+ |---|---|
233
+ | 6 head categories (Previous Token, Induction, Duplicate, Positional, Diffuse, Other) | ⚠️ Partial — 3/6 missing, Positional broken |
234
+ | Per-head activation scores on current input | ❌ Not implemented |
235
+ | Active/inactive state display (filled vs open circle) | ❌ Not implemented |
236
+ | Greyed-out heads with suggested prompts | ❌ Not implemented |
237
+ | Click head → navigate to attention heatmap | ❌ Not implemented |
238
+ | Runtime verification module | ❌ Not implemented |
239
+ | One-time offline analysis script | ✅ Appears to have run (JSON exists) |
240
+ | Educational tooltips per category | ⚠️ Partial — descriptions exist but brief |
rag_docs/head_categories_explained.md CHANGED
@@ -1,56 +1,58 @@
1
- # Attention Head Categories Explained
2
 
3
- ## What Are Head Categories?
4
 
5
- The dashboard automatically analyzes all attention heads in the model and categorizes them based on their behavior patterns. This helps you understand what each head is doing without having to inspect every attention map manually.
6
 
7
- Head categories appear in **Stage 3 (Attention)** of the pipeline. Click any category to expand it and see which specific heads (like L0-H3, L2-H11) belong to it.
 
8
 
9
- ## The Five Categories
10
 
11
- ### Previous-Token Heads
12
 
13
- **What they do**: These heads strongly attend to the **immediately preceding token**. For every token at position *i*, the head focuses most of its attention on position *i-1*.
 
14
 
15
- **Why they matter**: Previous-token heads help the model track local context -- the word that just came before. They're important for bigram patterns (common two-word combinations like "of the" or "in a").
16
 
17
- **Detection**: A head is classified as Previous-Token if, on average, more than 40% of each token's attention goes to the token directly before it.
18
 
19
- **In the dashboard**: These heads are labeled with a purple color. Ablating them often causes noticeable changes in predictions.
20
 
21
- ### First/Positional Heads
 
22
 
23
- **What they do**: These heads focus heavily on the **first token** in the sequence or show strong **positional patterns** (always attending to a specific position regardless of content).
24
 
25
- **Why they matter**: The first token often serves as a "default" attention target. Positional heads help the model keep track of where it is in the sequence.
26
 
27
- **Detection**: Classified when average attention to the first token exceeds 25%.
28
 
29
- ### Bag-of-Words (BoW) Heads
 
30
 
31
- **What they do**: These heads spread their attention **broadly and evenly** across many tokens, without focusing strongly on any particular one.
32
 
33
- **Why they matter**: BoW heads capture a general summary of the entire input. They help the model maintain an overall sense of what the text is about.
34
 
35
- **Detection**: Classified when the attention distribution has high entropy (≥ 0.65 normalized) and no single token receives more than 35% attention.
 
36
 
37
- ### Syntactic Heads
38
 
39
- **What they do**: These heads attend to tokens at **consistent distances**, suggesting they track grammatical or structural relationships (like subject-verb pairs).
40
 
41
- **Why they matter**: Syntactic heads help the model understand grammar and sentence structure. They might connect a verb to its subject or a pronoun to what it refers to.
42
 
43
- **Detection**: Classified when tokens consistently attend to other tokens at similar distances, with low variance in attention distances.
44
 
45
- ### Other
46
 
47
- **What they do**: Heads that don't clearly fit any of the above patterns. They may have mixed or context-dependent behavior.
 
 
48
 
49
- **Why they matter**: "Other" doesn't mean unimportant. These heads may serve specialized roles that only activate for certain inputs. They're worth investigating through ablation experiments.
50
 
51
- ## Using Categories for Experiments
52
-
53
- Head categories are especially useful for guiding ablation experiments:
54
- - Ablate a **Previous-Token** head to see if local context patterns break
55
- - Ablate a **BoW** head to see if the model loses global context
56
- - Compare the effect of ablating heads from different categories on the same prompt
 
1
+ # Attention Head Categories
2
 
3
+ This document explains the different types of attention heads found in transformer models. These categories are determined through **offline analysis** using TransformerLens and **verified at runtime** against your actual input.
4
 
5
+ ## Categories
6
 
7
+ ### Previous Token
8
+ **Symbol:** ● (active on most inputs)
9
 
10
+ Attends to the immediately preceding token — like reading left to right. This head helps the model track local word-by-word patterns. It's one of the most common and reliable head types.
11
 
12
+ **What to look for in the visualization:** Strong diagonal line one position below the main diagonal.
13
 
14
+ ### Induction
15
+ **Symbol:** ● when repeated tokens exist, ○ otherwise
16
 
17
+ Completes repeated patterns: if the model saw [A][B] before and now sees [A], it predicts [B] will follow. This is one of the most important mechanisms in transformer language models.
18
 
19
+ **Requires:** Repeated tokens in your input. If no tokens repeat, this category appears grayed out.
20
 
21
+ **Try this prompt:** "The cat sat on the mat. The cat" — the repeated "The cat" activates induction heads.
22
 
23
+ ### Duplicate Token
24
+ **Symbol:** ● when duplicate tokens exist, ○ otherwise
25
 
26
+ Notices when the same word appears more than once, acting like a highlighter for repeated words. Helps the model track which words have already been said.
27
 
28
+ **Requires:** Repeated tokens in your input.
29
 
30
+ **Try this prompt:** "The cat sat. The cat slept." — the repeated words activate duplicate-token heads.
31
 
32
+ ### Positional / First-Token
33
+ **Symbol:** ● (active on most inputs)
34
 
35
+ Always pays attention to the very first word, using it as a fixed anchor point. The first token often serves as a "default" position when no specific token is relevant.
36
 
37
+ **What to look for:** Strong vertical line at column 0 (all tokens attending to position 0).
38
 
39
+ ### Diffuse / Spread
40
+ **Symbol:** ● (active on most inputs)
41
 
42
+ Spreads attention evenly across many words, gathering general context rather than focusing on one spot. Provides a "big picture" summary of the input.
43
 
44
+ **What to look for:** No strong patterns — attention is spread roughly evenly across all tokens.
45
 
46
+ ### Other / Unclassified
47
 
48
+ Heads whose dominant pattern doesn't fit the categories above. These may perform more complex or context-dependent operations.
49
 
50
+ ## How It Works
51
 
52
+ 1. **Offline Analysis:** A TransformerLens script analyzes each head across many test inputs and assigns categories based on dominant behavior patterns.
53
+ 2. **Runtime Verification:** When you enter a prompt, the app checks whether each head's known role is actually active on your specific input.
54
+ 3. **Active vs Inactive:** A filled circle (●) means the head's role is triggered. An open circle (○) means the role exists but isn't triggered on your current input (e.g., no repeated tokens for induction).
55
 
56
+ ## Important Note
57
 
58
+ These categories are simplified labels based on each head's dominant behavior pattern. In reality, attention heads can serve multiple roles and may behave differently depending on the input.
 
 
 
 
 
scripts/analyze_heads.py ADDED
@@ -0,0 +1,564 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Offline Head Analysis Script
4
+
5
+ Uses TransformerLens to analyze attention head behaviors across test inputs
6
+ and generates a JSON file with head categories for each model.
7
+
8
+ Usage:
9
+ python scripts/analyze_heads.py --model gpt2
10
+ python scripts/analyze_heads.py --model gpt2 gpt2-medium EleutherAI/pythia-70m
11
+ python scripts/analyze_heads.py --all
12
+
13
+ Output:
14
+ Writes to utils/head_categories.json
15
+ """
16
+
17
+ import os
18
+ os.environ["USE_TF"] = "0" # Prevent TensorFlow noise
19
+
20
+ import argparse
21
+ import json
22
+ import sys
23
+ import time
24
+ from pathlib import Path
25
+ from typing import Dict, List, Any, Tuple
26
+
27
+ import torch
28
+ import numpy as np
29
+
30
+ # Add project root to path
31
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
32
+ sys.path.insert(0, str(PROJECT_ROOT))
33
+
34
+ JSON_OUTPUT_PATH = PROJECT_ROOT / "utils" / "head_categories.json"
35
+
36
+ # ============================================================================
37
+ # TransformerLens model name mapping
38
+ # ============================================================================
39
+ # TL uses its own naming conventions. Map from HuggingFace names
40
+ # (used in our model_config.py) to TL names.
41
+
42
+ HF_TO_TL_NAME = {
43
+ "gpt2": "gpt2-small",
44
+ "openai-community/gpt2": "gpt2-small",
45
+ "gpt2-medium": "gpt2-medium",
46
+ "openai-community/gpt2-medium": "gpt2-medium",
47
+ "gpt2-large": "gpt2-large",
48
+ "openai-community/gpt2-large": "gpt2-large",
49
+ "gpt2-xl": "gpt2-xl",
50
+ "openai-community/gpt2-xl": "gpt2-xl",
51
+ "EleutherAI/pythia-70m": "pythia-70m",
52
+ "EleutherAI/pythia-160m": "pythia-160m",
53
+ "EleutherAI/pythia-410m": "pythia-410m",
54
+ "EleutherAI/pythia-1b": "pythia-1b",
55
+ "EleutherAI/pythia-1.4b": "pythia-1.4b",
56
+ "facebook/opt-125m": "opt-125m",
57
+ "facebook/opt-350m": "opt-350m",
58
+ "facebook/opt-1.3b": "opt-1.3b",
59
+ }
60
+
61
+ # Default models to analyze
62
+ DEFAULT_MODELS = ["gpt2"]
63
+
64
+ ALL_PRIORITY_MODELS = [
65
+ "gpt2",
66
+ "gpt2-medium",
67
+ "EleutherAI/pythia-70m",
68
+ "EleutherAI/pythia-160m",
69
+ "EleutherAI/pythia-410m",
70
+ "facebook/opt-125m",
71
+ ]
72
+
73
+ # ============================================================================
74
+ # Category metadata (shared across all models)
75
+ # ============================================================================
76
+
77
+ CATEGORY_METADATA = {
78
+ "previous_token": {
79
+ "display_name": "Previous Token",
80
+ "description": "Attends to the immediately preceding token — like reading left to right",
81
+ "icon": "arrow-left",
82
+ "educational_text": "This head looks at the word right before the current one. Like reading left to right, it helps track local word-by-word patterns.",
83
+ "requires_repetition": False,
84
+ },
85
+ "induction": {
86
+ "display_name": "Induction",
87
+ "description": "Completes repeated patterns: if it saw [A][B] before and now sees [A], it predicts [B]",
88
+ "icon": "repeat",
89
+ "educational_text": "This head finds patterns that happened before and predicts they'll happen again. If it saw 'the cat' earlier, it expects the same words to follow.",
90
+ "requires_repetition": True,
91
+ "suggested_prompt": "Try: 'The cat sat on the mat. The cat' — the repeated 'The cat' lets induction heads activate.",
92
+ },
93
+ "duplicate_token": {
94
+ "display_name": "Duplicate Token",
95
+ "description": "Notices when the same word appears more than once",
96
+ "icon": "clone",
97
+ "educational_text": "This head notices when the same word appears more than once, like a highlighter for repeated words. It helps the model track which words have already been said.",
98
+ "requires_repetition": True,
99
+ "suggested_prompt": "Try a prompt with repeated words like 'The cat sat. The cat slept.' to see duplicate-token heads light up.",
100
+ },
101
+ "positional": {
102
+ "display_name": "Positional / First-Token",
103
+ "description": "Always pays attention to the very first word, using it as an anchor point",
104
+ "icon": "map-pin",
105
+ "educational_text": "This head always pays attention to the very first word, using it as an anchor point. The first token serves as a 'default' position when no other token is specifically relevant.",
106
+ "requires_repetition": False,
107
+ },
108
+ "diffuse": {
109
+ "display_name": "Diffuse / Spread",
110
+ "description": "Spreads attention evenly across many words, gathering general context",
111
+ "icon": "expand-arrows-alt",
112
+ "educational_text": "This head spreads its attention evenly across many words, gathering general context rather than focusing on one spot. It provides a 'big picture' summary of the input.",
113
+ "requires_repetition": False,
114
+ },
115
+ }
116
+
117
+
118
+ # ============================================================================
119
+ # Test input generation
120
+ # ============================================================================
121
+
122
def generate_test_inputs(tokenizer) -> Dict[str, List[str]]:
    """Build the prompt battery used for offline head analysis.

    Args:
        tokenizer: Accepted for interface compatibility with callers that
            pass the model tokenizer; not used by this function.

    Returns:
        A dict with two keys:
            "natural"    — ordinary English sentences, used to score
                           previous-token / positional / diffuse behavior.
            "repetitive" — prompts with repeated tokens, used to score
                           induction and duplicate-token behavior.
    """
    return {
        # Ordinary prose with no intentional repetition.
        "natural": [
            "The quick brown fox jumps over the lazy dog.",
            "In the beginning, there was nothing but darkness and silence.",
            "Machine learning models process data to make predictions about the future.",
            "She walked through the park and noticed the flowers blooming everywhere.",
            "The president announced new economic policies at the press conference today.",
            "After years of research, scientists finally discovered the missing link.",
            "The library was quiet except for the occasional turning of pages.",
            "Programming is both an art and a science requiring careful thought.",
            "The restaurant on the corner served the best pizza in the entire city.",
            "Education is the most powerful tool for changing the world around us.",
            "The storm clouds gathered on the horizon as the wind began to howl.",
            "Mathematics provides the foundation for understanding complex physical phenomena.",
            "The children played happily in the garden while their parents watched.",
            "Economic growth depends on innovation, investment, and human capital development.",
            "The old man sat on the bench and watched the pigeons gather crumbs.",
            "Artificial intelligence will transform every industry in the coming decades.",
            "The river flowed gently through the valley between the tall mountains.",
            "Good communication skills are essential for success in any professional career.",
            "The concert hall was packed with enthusiastic fans waiting for the show.",
            "Climate change poses significant challenges for agriculture and food security.",
        ],
        # Prompts with deliberate token repetition so induction and
        # duplicate-token heads have something to fire on.
        "repetitive": [
            "The cat sat on the mat. The cat sat on the mat.",
            "One two three four five. One two three four five.",
            "Hello world hello world hello world hello world.",
            "Alice went to the store. Bob went to the store. Alice went to the store.",
            "The dog chased the ball. The dog chased the ball. The dog chased.",
            "Red blue green red blue green red blue green red.",
            "I like apples and I like oranges and I like apples.",
            "The sun rises in the east. The sun sets in the west. The sun rises.",
            "Monday Tuesday Wednesday Monday Tuesday Wednesday Monday.",
            "She said hello and he said hello and she said hello again.",
            "The key to success is practice. The key to success is patience.",
            "We went to the park and then we went to the park again.",
            "First second third first second third first second third.",
            "The teacher asked the student. The student asked the teacher. The teacher asked.",
            "North south east west north south east west north south.",
            "Open the door. Close the door. Open the door. Close the door.",
            "The big red ball bounced. The big red ball rolled.",
            "Cat dog cat dog cat dog cat dog cat dog.",
            "Learn practice improve learn practice improve learn practice.",
            "The man walked. The woman walked. The man walked. The woman walked.",
        ],
    }
177
+
178
+
179
+ # ============================================================================
180
+ # Head scoring functions
181
+ # ============================================================================
182
+
183
def score_previous_token(attn_patterns: torch.Tensor) -> torch.Tensor:
    """
    Score each head for previous-token behavior.

    Averages, over all query positions i > 0, the attention weight placed on
    position i-1 — i.e. the mean of the sub-diagonal of each head's
    attention matrix.

    Args:
        attn_patterns: [n_layers, n_heads, seq_len, seq_len] attention weights.

    Returns:
        [n_layers, n_heads] scores (zeros when seq_len < 2).
    """
    n_layers, n_heads, seq_len, _ = attn_patterns.shape

    # A single-token sequence has no "previous token" to attend to.
    if seq_len < 2:
        return torch.zeros(n_layers, n_heads)

    # Sub-diagonal (offset -1) holds attention from position i to i-1.
    sub_diag = torch.diagonal(attn_patterns, offset=-1, dim1=-2, dim2=-1)
    return sub_diag.sum(dim=-1) / (seq_len - 1)
201
+
202
+
203
def score_positional(attn_patterns: torch.Tensor) -> torch.Tensor:
    """
    Score each head for first-token / positional behavior.

    Column 0 of the attention matrix holds every query position's attention
    to the first token; averaging that column over query positions gives a
    per-head "anchors on token 0" score.

    Args:
        attn_patterns: [n_layers, n_heads, seq_len, seq_len] attention weights.

    Returns:
        [n_layers, n_heads] mean attention to position 0.
    """
    first_token_attn = attn_patterns[..., 0]
    return first_token_attn.mean(dim=-1)
212
+
213
+
214
def score_diffuse(attn_patterns: torch.Tensor) -> torch.Tensor:
    """
    Score each head for diffuse / bag-of-words behavior.

    Computes the Shannon entropy of each query position's attention
    distribution, normalizes by the maximum possible entropy (uniform
    attention over seq_len keys), and averages over positions. A score near
    1.0 means attention is spread almost evenly.

    Args:
        attn_patterns: [n_layers, n_heads, seq_len, seq_len] attention weights.

    Returns:
        [n_layers, n_heads] mean normalized entropy.
    """
    n_layers, n_heads, seq_len, _ = attn_patterns.shape

    # Epsilon keeps log() finite on exact zeros.
    smoothed = attn_patterns + 1e-10
    per_position_entropy = -(smoothed * torch.log(smoothed)).sum(dim=-1)

    # Normalize by log(seq_len), the entropy of a uniform distribution.
    # seq_len == 1 gives max_entropy == 0; skip division in that case.
    max_entropy = np.log(seq_len)
    if max_entropy > 0:
        per_position_entropy = per_position_entropy / max_entropy

    return per_position_entropy.mean(dim=-1)
230
+
231
+
232
def score_induction(attn_patterns: torch.Tensor, tokens: torch.Tensor) -> torch.Tensor:
    """
    Score each head for induction behavior.

    Whenever token[i] repeats an earlier token[j] (j < i-1), an induction
    head should attend from position i to position j+1 — the token that
    followed the previous occurrence. The score is the mean of that
    attention weight over all such (i, j) pairs.

    Args:
        attn_patterns: [n_layers, n_heads, seq_len, seq_len] attention weights.
        tokens: 1-D tensor of token ids for the same sequence.

    Returns:
        [n_layers, n_heads] scores (zeros when no token repeats).
    """
    n_layers, n_heads, seq_len, _ = attn_patterns.shape
    totals = torch.zeros(n_layers, n_heads)
    n_matches = 0

    ids = tokens.tolist()
    for i in range(2, seq_len):
        for j in range(i - 1):
            if ids[i] != ids[j]:
                continue
            successor = j + 1
            if successor >= seq_len:
                continue
            totals += attn_patterns[:, :, i, successor]
            n_matches += 1

    return totals / n_matches if n_matches else totals
255
+
256
+
257
def score_duplicate_token(attn_patterns: torch.Tensor, tokens: torch.Tensor) -> torch.Tensor:
    """
    Score each head for duplicate-token behavior.

    For every pair of identical tokens, measures the attention the later
    occurrence pays directly to the earlier occurrence, averaged over all
    such pairs.

    Args:
        attn_patterns: [n_layers, n_heads, seq_len, seq_len] attention weights.
        tokens: 1-D tensor of token ids for the same sequence.

    Returns:
        [n_layers, n_heads] scores (zeros when no token repeats).
    """
    n_layers, n_heads, seq_len, _ = attn_patterns.shape
    totals = torch.zeros(n_layers, n_heads)
    n_pairs = 0

    ids = tokens.tolist()
    for later in range(1, seq_len):
        for earlier in range(later):
            if ids[later] == ids[earlier]:
                totals += attn_patterns[:, :, later, earlier]
                n_pairs += 1

    return totals / n_pairs if n_pairs else totals
278
+
279
+
280
+ # ============================================================================
281
+ # Main analysis
282
+ # ============================================================================
283
+
284
def analyze_model(model_name: str, device: str = "cpu") -> Dict[str, Any]:
    """
    Run full head analysis for a model.

    Loads the model with TransformerLens, scores every attention head on the
    natural and repetitive prompt batteries from generate_test_inputs(), and
    selects the top heads per category via select_top_heads().

    Args:
        model_name: HuggingFace-style model name; mapped to a TransformerLens
            name via HF_TO_TL_NAME (falls back to the name itself).
        device: Torch device string passed to HookedTransformer.from_pretrained.

    Returns:
        A dict ready for JSON serialization: model metadata, per-category
        top heads (with CATEGORY_METADATA merged in), and the full per-head
        score matrices under "all_scores".
    """
    # Function-local import: transformer_lens is only required when the
    # analysis actually runs.
    from transformer_lens import HookedTransformer

    tl_name = HF_TO_TL_NAME.get(model_name, model_name)
    print(f"\n{'='*60}")
    print(f"Analyzing: {model_name} (TL name: {tl_name})")
    print(f"{'='*60}")

    print("Loading model...")
    model = HookedTransformer.from_pretrained(tl_name, device=device)

    n_layers = model.cfg.n_layers
    n_heads = model.cfg.n_heads
    print(f" Layers: {n_layers}, Heads per layer: {n_heads}")

    # Generate test inputs
    test_inputs = generate_test_inputs(model.tokenizer)

    # Score accumulators, one [n_layers, n_heads] matrix per category.
    prev_token_scores = torch.zeros(n_layers, n_heads)
    positional_scores = torch.zeros(n_layers, n_heads)
    diffuse_scores = torch.zeros(n_layers, n_heads)
    induction_scores = torch.zeros(n_layers, n_heads)
    duplicate_scores = torch.zeros(n_layers, n_heads)

    natural_count = 0
    repetitive_count = 0

    # Pass 1: natural prompts feed the general categories
    # (prev_token, positional, diffuse).
    print("\nAnalyzing natural prompts...")
    for prompt in test_inputs["natural"]:
        try:
            tokens = model.to_tokens(prompt)
            # Skip degenerate prompts that tokenize to fewer than 3 tokens.
            if tokens.shape[1] < 3:
                continue

            with torch.no_grad():
                _, cache = model.run_with_cache(tokens)

            # Stack per-layer attention patterns into one tensor:
            # [n_layers, n_heads, seq_len, seq_len].
            attn_patterns = torch.stack([
                cache["pattern", layer][0]  # Remove batch dim
                for layer in range(n_layers)
            ])

            prev_token_scores += score_previous_token(attn_patterns)
            positional_scores += score_positional(attn_patterns)
            diffuse_scores += score_diffuse(attn_patterns)
            natural_count += 1

        except Exception as e:
            # Best-effort: a single bad prompt should not abort the analysis.
            print(f" Warning: Skipped prompt: {e}")
            continue

    print(f" Processed {natural_count} natural prompts")

    # Pass 2: repetitive prompts feed induction + duplicate-token scores.
    print("Analyzing repetitive prompts...")
    for prompt in test_inputs["repetitive"]:
        try:
            tokens = model.to_tokens(prompt)
            if tokens.shape[1] < 4:
                continue

            with torch.no_grad():
                _, cache = model.run_with_cache(tokens)

            attn_patterns = torch.stack([
                cache["pattern", layer][0]
                for layer in range(n_layers)
            ])

            # tokens[0] strips the batch dimension for the token-id sequence.
            induction_scores += score_induction(attn_patterns, tokens[0])
            duplicate_scores += score_duplicate_token(attn_patterns, tokens[0])

            # Also accumulate general scores for these prompts.
            # NOTE: natural_count is deliberately incremented here too — it is
            # the denominator for the general-score averages, so it counts
            # every prompt (natural + repetitive) that fed those accumulators.
            prev_token_scores += score_previous_token(attn_patterns)
            positional_scores += score_positional(attn_patterns)
            diffuse_scores += score_diffuse(attn_patterns)
            natural_count += 1

            repetitive_count += 1

        except Exception as e:
            print(f" Warning: Skipped prompt: {e}")
            continue

    print(f" Processed {repetitive_count} repetitive prompts")

    # Convert accumulated sums into per-prompt averages.
    if natural_count > 0:
        prev_token_scores /= natural_count
        positional_scores /= natural_count
        diffuse_scores /= natural_count
    if repetitive_count > 0:
        induction_scores /= repetitive_count
        duplicate_scores /= repetitive_count

    # Select top heads per category
    all_category_scores = {
        "previous_token": prev_token_scores,
        "induction": induction_scores,
        "duplicate_token": duplicate_scores,
        "positional": positional_scores,
        "diffuse": diffuse_scores,
    }

    # Print score summaries (argmax over the flattened matrix, decoded back
    # to layer/head indices).
    print("\nScore summaries (max per category):")
    for cat_name, scores in all_category_scores.items():
        max_score = scores.max().item()
        max_idx = scores.argmax()
        max_layer = max_idx // n_heads
        max_head = max_idx % n_heads
        print(f" {cat_name:20s}: max={max_score:.4f} at L{max_layer}-H{max_head}")

    # Build per-category entries: static CATEGORY_METADATA copy + top heads.
    categories_data = {}

    for cat_name, scores in all_category_scores.items():
        top_heads = select_top_heads(scores, n_layers, n_heads, cat_name)

        # dict() copies the metadata so the shared template is not mutated.
        cat_entry = dict(CATEGORY_METADATA[cat_name])
        cat_entry["top_heads"] = top_heads
        categories_data[cat_name] = cat_entry

        print(f"\n {cat_name} ({len(top_heads)} heads):")
        for h in top_heads:
            print(f" L{h['layer']}-H{h['head']}: {h['score']:.4f}")

    # Build the full model entry (JSON-serializable: tensors -> lists).
    model_entry = {
        "model_name": model_name,
        "num_layers": n_layers,
        "num_heads": n_heads,
        "analysis_date": time.strftime("%Y-%m-%d"),
        "categories": categories_data,
        "all_scores": {
            cat: scores.tolist()
            for cat, scores in all_category_scores.items()
        }
    }

    return model_entry
433
+
434
+
435
def select_top_heads(
    scores: torch.Tensor,
    n_layers: int,
    n_heads: int,
    category: str,
    max_heads: int = 8,
    primary_threshold: float = 0.25,
    min_threshold: float = 0.10,
) -> List[Dict[str, Any]]:
    """
    Select the top heads for a category, enforcing layer diversity.

    Strategy:
    1. Take all heads scoring at or above primary_threshold (highest first).
    2. Then add the best head from each layer not yet represented, among
       candidates above min_threshold.
    3. Cap the result at max_heads.

    Args:
        scores: [n_layers, n_heads] score matrix for one category.
        n_layers / n_heads: dimensions of the score matrix.
        category: Category name; accepted for call-site symmetry, not used
            in the selection logic.
        max_heads: Hard cap on the number of heads returned.
        primary_threshold: Score (after rounding to 4 dp) that qualifies a
            head in the first pass.
        min_threshold: Minimum raw score to be considered at all.

    Returns:
        List of {"layer", "head", "score"} dicts sorted by (layer, head).
    """
    # Every head clearing the minimum bar, highest score first. The list
    # comprehension visits (layer, head) in ascending order, and the stable
    # sort preserves that order among ties.
    ranked = [
        {"layer": layer, "head": head, "score": round(scores[layer, head].item(), 4)}
        for layer in range(n_layers)
        for head in range(n_heads)
        if scores[layer, head].item() > min_threshold
    ]
    ranked.sort(key=lambda entry: entry["score"], reverse=True)

    chosen: List[Dict[str, Any]] = []
    taken = set()
    seen_layers = set()

    def _take(entry):
        # Record a selection and mark its (layer, head) key and layer as used.
        chosen.append(entry)
        taken.add((entry["layer"], entry["head"]))
        seen_layers.add(entry["layer"])

    # Pass 1: strongest heads at or above the primary threshold.
    for entry in ranked:
        if entry["score"] >= primary_threshold and len(chosen) < max_heads:
            if (entry["layer"], entry["head"]) not in taken:
                _take(entry)

    # Pass 2: best remaining head from each not-yet-represented layer.
    for entry in ranked:
        if len(chosen) >= max_heads:
            break
        key = (entry["layer"], entry["head"])
        if entry["layer"] not in seen_layers and key not in taken:
            _take(entry)

    # Present the result in (layer, head) order for stable display.
    chosen.sort(key=lambda entry: (entry["layer"], entry["head"]))
    return chosen[:max_heads]
496
+
497
+
498
+ # ============================================================================
499
+ # CLI
500
+ # ============================================================================
501
+
502
+ def main():
503
+ parser = argparse.ArgumentParser(description="Analyze attention head categories using TransformerLens")
504
+ parser.add_argument("--model", nargs="+", default=None,
505
+ help="HuggingFace model name(s) to analyze (e.g., gpt2, EleutherAI/pythia-70m)")
506
+ parser.add_argument("--all", action="store_true",
507
+ help="Analyze all priority models")
508
+ parser.add_argument("--device", default="cpu",
509
+ help="Device to use (cpu or cuda)")
510
+ parser.add_argument("--output", type=str, default=None,
511
+ help="Output JSON path (default: utils/head_categories.json)")
512
+ args = parser.parse_args()
513
+
514
+ # Determine models to analyze
515
+ if args.all:
516
+ models = ALL_PRIORITY_MODELS
517
+ elif args.model:
518
+ models = args.model
519
+ else:
520
+ models = DEFAULT_MODELS
521
+
522
+ output_path = Path(args.output) if args.output else JSON_OUTPUT_PATH
523
+
524
+ # Load existing data if present
525
+ existing_data = {}
526
+ if output_path.exists():
527
+ try:
528
+ with open(output_path, 'r') as f:
529
+ existing_data = json.load(f)
530
+ print(f"Loaded existing data from {output_path} ({len(existing_data)} models)")
531
+ except (json.JSONDecodeError, IOError):
532
+ pass
533
+
534
+ # Analyze each model
535
+ for model_name in models:
536
+ try:
537
+ result = analyze_model(model_name, device=args.device)
538
+
539
+ # Store under the HuggingFace name
540
+ existing_data[model_name] = result
541
+
542
+ # Also store under the short name for lookup
543
+ short_name = model_name.split('/')[-1] if '/' in model_name else None
544
+ if short_name and short_name != model_name:
545
+ existing_data[short_name] = result
546
+
547
+ except Exception as e:
548
+ print(f"\nERROR analyzing {model_name}: {e}")
549
+ import traceback
550
+ traceback.print_exc()
551
+ continue
552
+
553
+ # Write output
554
+ output_path.parent.mkdir(parents=True, exist_ok=True)
555
+ with open(output_path, 'w') as f:
556
+ json.dump(existing_data, f, indent=2)
557
+
558
+ print(f"\n{'='*60}")
559
+ print(f"Done! Wrote {len(existing_data)} model entries to {output_path}")
560
+ print(f"{'='*60}")
561
+
562
+
563
+ if __name__ == "__main__":
564
+ main()
tests/conftest.py CHANGED
@@ -199,12 +199,4 @@ def mock_attribution_result():
199
  }
200
 
201
 
202
- # =============================================================================
203
- # Head Categorization Config
204
- # =============================================================================
205
 
206
- @pytest.fixture
207
- def default_head_config():
208
- """Default head categorization configuration for testing."""
209
- from utils.head_detection import HeadCategorizationConfig
210
- return HeadCategorizationConfig()
 
199
  }
200
 
201
 
 
 
 
202
 
 
 
 
 
 
tests/test_head_detection.py CHANGED
@@ -1,313 +1,431 @@
1
  """
2
  Tests for utils/head_detection.py
3
 
4
- Tests attention head categorization heuristics using synthetic attention matrices.
5
  """
6
 
7
  import pytest
8
  import torch
 
9
  import numpy as np
 
 
10
  from utils.head_detection import (
11
- compute_attention_entropy,
12
- detect_previous_token_head,
13
- detect_first_token_head,
14
- detect_bow_head,
15
- detect_syntactic_head,
16
- categorize_attention_head,
17
- categorize_all_heads,
18
- format_categorization_summary,
19
- HeadCategorizationConfig
20
  )
21
 
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  class TestComputeAttentionEntropy:
24
- """Tests for compute_attention_entropy function."""
25
-
26
  def test_uniform_distribution_high_entropy(self):
27
- """Uniform attention should have high (near 1.0) normalized entropy."""
28
- # 4 positions with equal attention
29
- uniform = torch.tensor([0.25, 0.25, 0.25, 0.25])
30
- entropy = compute_attention_entropy(uniform)
31
-
32
- # Normalized entropy should be close to 1.0 for uniform
33
- assert 0.95 <= entropy <= 1.0, f"Expected ~1.0, got {entropy}"
34
-
35
  def test_peaked_distribution_low_entropy(self):
36
- """Peaked attention should have low normalized entropy."""
37
- # One position dominates
38
- peaked = torch.tensor([0.97, 0.01, 0.01, 0.01])
39
- entropy = compute_attention_entropy(peaked)
40
-
41
- # Should be low entropy
42
- assert entropy < 0.3, f"Expected low entropy, got {entropy}"
43
-
44
- def test_entropy_bounds(self):
45
- """Entropy should always be between 0 and 1 (normalized)."""
46
- test_cases = [
47
- torch.tensor([1.0, 0.0, 0.0, 0.0]), # Extreme peaked
48
- torch.tensor([0.5, 0.5, 0.0, 0.0]), # Two positions
49
- torch.tensor([0.25, 0.25, 0.25, 0.25]), # Uniform
50
- ]
51
-
52
- for weights in test_cases:
53
- entropy = compute_attention_entropy(weights)
54
- assert 0.0 <= entropy <= 1.0, f"Entropy {entropy} out of bounds"
55
-
56
-
57
- class TestDetectPreviousTokenHead:
58
- """Tests for detect_previous_token_head function."""
59
-
60
- def test_detects_previous_token_pattern(self, previous_token_attention_matrix, default_head_config):
61
- """Should detect matrix with strong previous-token attention."""
62
- is_prev, score = detect_previous_token_head(
63
- previous_token_attention_matrix,
64
- default_head_config
65
- )
66
-
67
- assert is_prev == True
68
- assert score > 0.5, f"Expected high score, got {score}"
69
-
70
- def test_rejects_uniform_attention(self, uniform_attention_matrix, default_head_config):
71
- """Should reject matrix with uniform attention."""
72
- is_prev, score = detect_previous_token_head(
73
- uniform_attention_matrix,
74
- default_head_config
75
- )
76
-
77
- assert is_prev == False
78
- assert score < 0.4, f"Expected low score, got {score}"
79
-
80
- def test_short_sequence_returns_false(self, default_head_config):
81
- """Sequence shorter than min_seq_len should return False."""
82
- short_matrix = torch.ones(2, 2) / 2
83
- is_prev, score = detect_previous_token_head(short_matrix, default_head_config)
84
-
85
- assert is_prev == False
86
- assert score == 0.0
87
 
 
 
88
 
89
- class TestDetectFirstTokenHead:
90
- """Tests for detect_first_token_head function."""
91
-
92
- def test_detects_first_token_pattern(self, first_token_attention_matrix, default_head_config):
93
- """Should detect matrix with strong first-token attention."""
94
- is_first, score = detect_first_token_head(
95
- first_token_attention_matrix,
96
- default_head_config
97
- )
 
 
 
 
 
 
98
 
99
- assert is_first == True
100
- assert score > 0.5, f"Expected high score, got {score}"
101
-
102
- def test_low_first_token_attention(self, default_head_config):
103
- """Matrix with low attention to first token should not be detected."""
104
- # Create matrix where first token gets very little attention
105
- # Use size 5 to be above min_seq_len and avoid overlap at [0,0]
106
- size = 5
107
- matrix = torch.zeros(size, size)
108
- for i in range(size):
109
- # Distribute attention: 5% to first token, 95% to last token
110
- matrix[i, 0] = 0.05
111
- matrix[i, -1] = 0.95
112
 
113
- is_first, score = detect_first_token_head(matrix, default_head_config)
 
 
 
 
 
114
 
115
- assert is_first == False
116
- assert score < 0.25, f"Expected low score, got {score}"
117
 
 
 
 
 
118
 
119
- class TestDetectBowHead:
120
- """Tests for detect_bow_head (bag-of-words / diffuse attention)."""
121
-
122
- def test_detects_uniform_as_bow(self, uniform_attention_matrix, default_head_config):
123
- """Uniform attention should be detected as BoW head."""
124
- is_bow, score = detect_bow_head(uniform_attention_matrix, default_head_config)
125
 
126
- # Uniform has high entropy and low max attention - should be BoW
127
- assert is_bow == True
128
- assert score > 0.9, f"Expected high entropy score, got {score}"
129
-
130
- def test_rejects_peaked_attention(self, peaked_attention_matrix, default_head_config):
131
- """Peaked attention should not be detected as BoW."""
132
- is_bow, score = detect_bow_head(peaked_attention_matrix, default_head_config)
 
 
 
133
 
134
- # Peaked attention has low entropy - should not be BoW
135
- assert is_bow == False
 
 
 
 
136
 
 
 
137
 
138
- class TestDetectSyntacticHead:
139
- """Tests for detect_syntactic_head function."""
140
-
141
- def test_consistent_distance_pattern(self, default_head_config):
142
- """Matrix with consistent distance pattern should be detected as syntactic."""
143
- # Create matrix where each position attends to position 2 tokens back
144
  size = 6
145
  matrix = torch.zeros(size, size)
146
- for i in range(size):
147
- target = max(0, i - 2) # 2 tokens back
148
- matrix[i, target] = 1.0
149
-
150
- is_syn, score = detect_syntactic_head(matrix, default_head_config)
151
-
152
- # Should have consistent distance pattern
153
- assert score > 0.0, f"Expected positive score for consistent pattern"
154
-
155
- def test_random_attention_returns_valid_values(self, default_head_config):
156
- """Random attention should return valid boolean and score."""
157
- torch.manual_seed(42)
158
- random_matrix = torch.softmax(torch.randn(6, 6), dim=-1)
159
-
160
- is_syn, score = detect_syntactic_head(random_matrix, default_head_config)
161
-
162
- # Check it returns valid types (bool or numpy bool, and numeric score)
163
- assert is_syn in [True, False] or bool(is_syn) in [True, False]
164
- assert 0 <= float(score) <= 1
165
-
166
-
167
- class TestCategorizeAttentionHead:
168
- """Tests for categorize_attention_head function."""
169
-
170
- def test_categorizes_previous_token_head(self, previous_token_attention_matrix, default_head_config):
171
- """Should categorize previous-token pattern correctly."""
172
- result = categorize_attention_head(
173
- previous_token_attention_matrix,
174
- layer_idx=0,
175
- head_idx=3,
176
- config=default_head_config
177
- )
178
-
179
- assert result['category'] == 'previous_token'
180
- assert result['layer'] == 0
181
- assert result['head'] == 3
182
- assert result['label'] == 'L0-H3'
183
- assert 'scores' in result
184
-
185
- def test_categorizes_first_token_head(self, first_token_attention_matrix, default_head_config):
186
- """Should categorize first-token pattern correctly."""
187
- result = categorize_attention_head(
188
- first_token_attention_matrix,
189
- layer_idx=2,
190
- head_idx=5,
191
- config=default_head_config
192
- )
193
 
194
- assert result['category'] == 'first_token'
195
- assert result['label'] == 'L2-H5'
196
-
197
- def test_categorizes_bow_head(self, default_head_config):
198
- """Should categorize diffuse attention as BoW when it doesn't match other patterns."""
199
- # Create BoW-like matrix: diffuse attention but first token gets LESS than threshold
200
- # This avoids triggering first_token detection (threshold 0.25)
 
 
 
 
 
201
  size = 5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  matrix = torch.zeros(size, size)
203
  for i in range(size):
204
- # First token gets only 0.1, rest get roughly equal share
205
- matrix[i, 0] = 0.1
206
- remaining = 0.9 / (size - 1)
207
- for j in range(1, size):
208
- matrix[i, j] = remaining
209
-
210
- result = categorize_attention_head(
211
- matrix,
212
- layer_idx=1,
213
- head_idx=0,
214
- config=default_head_config
215
- )
216
-
217
- assert result['category'] == 'bow'
218
-
219
- def test_result_structure(self, uniform_attention_matrix):
220
- """Result should have all required keys."""
221
- result = categorize_attention_head(
222
- uniform_attention_matrix,
223
- layer_idx=0,
224
- head_idx=0
225
- )
226
 
227
- required_keys = ['layer', 'head', 'category', 'scores', 'label']
228
- for key in required_keys:
229
- assert key in result, f"Missing key: {key}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
 
 
 
 
231
 
232
- class TestCategorizeAllHeads:
233
- """Tests for categorize_all_heads function."""
234
-
235
- def test_returns_all_categories(self, mock_activation_data, default_head_config):
236
- """Should return dict with all category keys."""
237
- result = categorize_all_heads(mock_activation_data, default_head_config)
 
 
 
 
 
 
 
 
 
 
 
238
 
239
- expected_categories = ['previous_token', 'first_token', 'bow', 'syntactic', 'other']
240
- for cat in expected_categories:
241
- assert cat in result, f"Missing category: {cat}"
242
- assert isinstance(result[cat], list)
243
-
244
- def test_handles_empty_attention_data(self, default_head_config):
245
- """Should handle activation data with no attention outputs."""
246
- empty_data = {'attention_outputs': {}}
247
- result = categorize_all_heads(empty_data, default_head_config)
 
248
 
249
- # Should return empty lists for all categories
250
- for cat, heads in result.items():
251
- assert heads == []
252
-
253
-
254
- class TestFormatCategorizationSummary:
255
- """Tests for format_categorization_summary function."""
256
-
257
- def test_formats_empty_categorization(self):
258
- """Should format empty categorization without error."""
259
- empty = {
260
- 'previous_token': [],
261
- 'first_token': [],
262
- 'bow': [],
263
- 'syntactic': [],
264
- 'other': []
265
  }
266
- result = format_categorization_summary(empty)
267
-
268
- assert isinstance(result, str)
269
- assert "Total Heads: 0" in result
270
-
271
- def test_formats_with_heads(self):
272
- """Should format categorization with heads correctly."""
273
- categorized = {
274
- 'previous_token': [
275
- {'layer': 0, 'head': 1, 'label': 'L0-H1'},
276
- {'layer': 0, 'head': 2, 'label': 'L0-H2'},
277
- ],
278
- 'first_token': [
279
- {'layer': 1, 'head': 0, 'label': 'L1-H0'},
280
- ],
281
- 'bow': [],
282
- 'syntactic': [],
283
- 'other': []
284
- }
285
- result = format_categorization_summary(categorized)
286
-
287
- assert "Total Heads: 3" in result
288
- assert "Previous-Token Heads: 2" in result
289
- assert "First/Positional-Token Heads: 1" in result
290
- assert "Layer 0" in result
291
- assert "Layer 1" in result
292
-
293
-
294
- class TestHeadCategorizationConfig:
295
- """Tests for HeadCategorizationConfig defaults."""
296
-
297
- def test_default_values(self):
298
- """Default config should have reasonable values."""
299
- config = HeadCategorizationConfig()
300
 
301
- assert 0 < config.prev_token_threshold < 1
302
- assert 0 < config.first_token_threshold < 1
303
- assert 0 < config.bow_entropy_threshold < 1
304
- assert config.min_seq_len > 0
305
-
306
- def test_config_is_mutable(self):
307
- """Config values should be mutable for customization."""
308
- config = HeadCategorizationConfig()
309
- original = config.prev_token_threshold
 
310
 
311
- config.prev_token_threshold = 0.8
312
- assert config.prev_token_threshold == 0.8
313
- assert config.prev_token_threshold != original
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
  Tests for utils/head_detection.py
3
 
4
+ Tests the offline JSON + runtime verification head categorization system.
5
  """
6
 
7
  import pytest
8
  import torch
9
+ import json
10
  import numpy as np
11
+ from pathlib import Path
12
+ from unittest.mock import patch, mock_open
13
  from utils.head_detection import (
14
+ load_head_categories,
15
+ verify_head_activation,
16
+ get_active_head_summary,
17
+ clear_category_cache,
18
+ _compute_attention_entropy,
19
+ _find_repeated_tokens,
 
 
 
20
  )
21
 
22
 
23
+ # =============================================================================
24
+ # Sample JSON data for mocking
25
+ # =============================================================================
26
+
27
+ SAMPLE_JSON = {
28
+ "test-model": {
29
+ "model_name": "test-model",
30
+ "num_layers": 2,
31
+ "num_heads": 4,
32
+ "analysis_date": "2026-02-26",
33
+ "categories": {
34
+ "previous_token": {
35
+ "display_name": "Previous Token",
36
+ "description": "Attends to the previous token",
37
+ "educational_text": "Looks at the word before.",
38
+ "icon": "arrow-left",
39
+ "requires_repetition": False,
40
+ "top_heads": [
41
+ {"layer": 0, "head": 1, "score": 0.85},
42
+ {"layer": 1, "head": 2, "score": 0.72}
43
+ ]
44
+ },
45
+ "induction": {
46
+ "display_name": "Induction",
47
+ "description": "Pattern matching",
48
+ "educational_text": "Finds repeated patterns.",
49
+ "icon": "repeat",
50
+ "requires_repetition": True,
51
+ "suggested_prompt": "Try repeating words.",
52
+ "top_heads": [
53
+ {"layer": 1, "head": 0, "score": 0.90}
54
+ ]
55
+ },
56
+ "duplicate_token": {
57
+ "display_name": "Duplicate Token",
58
+ "description": "Finds duplicates",
59
+ "educational_text": "Spots repeated words.",
60
+ "icon": "clone",
61
+ "requires_repetition": True,
62
+ "suggested_prompt": "Try typing the same word twice.",
63
+ "top_heads": [
64
+ {"layer": 0, "head": 3, "score": 0.78}
65
+ ]
66
+ },
67
+ "positional": {
68
+ "display_name": "Positional",
69
+ "description": "First token focus",
70
+ "educational_text": "Anchors to position 0.",
71
+ "icon": "map-pin",
72
+ "requires_repetition": False,
73
+ "top_heads": [
74
+ {"layer": 0, "head": 0, "score": 0.88}
75
+ ]
76
+ },
77
+ "diffuse": {
78
+ "display_name": "Diffuse",
79
+ "description": "Spread attention",
80
+ "educational_text": "Even distribution.",
81
+ "icon": "expand-arrows-alt",
82
+ "requires_repetition": False,
83
+ "top_heads": [
84
+ {"layer": 1, "head": 3, "score": 0.80}
85
+ ]
86
+ }
87
+ },
88
+ "all_scores": {}
89
+ }
90
+ }
91
+
92
+
93
+ @pytest.fixture(autouse=True)
94
+ def clear_cache():
95
+ """Clear the category cache before each test."""
96
+ clear_category_cache()
97
+ yield
98
+ clear_category_cache()
99
+
100
+
101
+ # =============================================================================
102
+ # Tests for _compute_attention_entropy
103
+ # =============================================================================
104
+
105
  class TestComputeAttentionEntropy:
106
+ """Tests for _compute_attention_entropy helper."""
107
+
108
  def test_uniform_distribution_high_entropy(self):
109
+ """Uniform attention should have entropy near 1.0."""
110
+ weights = torch.ones(8) / 8
111
+ entropy = _compute_attention_entropy(weights)
112
+ assert entropy > 0.95
113
+
 
 
 
114
  def test_peaked_distribution_low_entropy(self):
115
+ """Peaked attention should have low entropy."""
116
+ weights = torch.zeros(8)
117
+ weights[0] = 0.98
118
+ weights[1:] = 0.02 / 7
119
+ entropy = _compute_attention_entropy(weights)
120
+ assert entropy < 0.3
121
+
122
+ def test_entropy_in_range(self):
123
+ """Entropy should always be between 0 and 1."""
124
+ for _ in range(10):
125
+ weights = torch.softmax(torch.randn(6), dim=0)
126
+ entropy = _compute_attention_entropy(weights)
127
+ assert 0.0 <= entropy <= 1.0
128
+
129
+
130
+ # =============================================================================
131
+ # Tests for _find_repeated_tokens
132
+ # =============================================================================
133
+
134
+ class TestFindRepeatedTokens:
135
+ """Tests for _find_repeated_tokens helper."""
136
+
137
+ def test_no_repeats(self):
138
+ """No repetition returns empty dict."""
139
+ assert _find_repeated_tokens([1, 2, 3, 4]) == {}
140
+
141
+ def test_simple_repeat(self):
142
+ """Repeated token returns positions."""
143
+ result = _find_repeated_tokens([10, 20, 10, 30])
144
+ assert 10 in result
145
+ assert result[10] == [0, 2]
146
+ assert 20 not in result
147
+
148
+ def test_multiple_repeats(self):
149
+ """Multiple repeated tokens tracked."""
150
+ result = _find_repeated_tokens([5, 6, 5, 6, 7])
151
+ assert 5 in result and 6 in result
152
+ assert 7 not in result
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
+ def test_empty_input(self):
155
+ assert _find_repeated_tokens([]) == {}
156
 
157
+
158
+ # =============================================================================
159
+ # Tests for load_head_categories
160
+ # =============================================================================
161
+
162
+ class TestLoadHeadCategories:
163
+ """Tests for load_head_categories function."""
164
+
165
+ def test_loads_from_json(self, tmp_path):
166
+ """Should load model data from JSON file."""
167
+ json_file = tmp_path / "head_categories.json"
168
+ json_file.write_text(json.dumps(SAMPLE_JSON))
169
+
170
+ with patch('utils.head_detection._JSON_PATH', json_file):
171
+ result = load_head_categories("test-model")
172
 
173
+ assert result is not None
174
+ assert result["model_name"] == "test-model"
175
+ assert "previous_token" in result["categories"]
176
+
177
+ def test_returns_none_for_unknown_model(self, tmp_path):
178
+ """Should return None when model not in JSON."""
179
+ json_file = tmp_path / "head_categories.json"
180
+ json_file.write_text(json.dumps(SAMPLE_JSON))
181
+
182
+ with patch('utils.head_detection._JSON_PATH', json_file):
183
+ result = load_head_categories("nonexistent-model")
 
 
184
 
185
+ assert result is None
186
+
187
+ def test_returns_none_when_no_file(self, tmp_path):
188
+ """Should return None when JSON file doesn't exist."""
189
+ with patch('utils.head_detection._JSON_PATH', tmp_path / "missing.json"):
190
+ result = load_head_categories("test-model")
191
 
192
+ assert result is None
 
193
 
194
+ def test_caches_results(self, tmp_path):
195
+ """Should cache loaded data."""
196
+ json_file = tmp_path / "head_categories.json"
197
+ json_file.write_text(json.dumps(SAMPLE_JSON))
198
 
199
+ with patch('utils.head_detection._JSON_PATH', json_file):
200
+ result1 = load_head_categories("test-model")
201
+ # Delete file to prove cache is used
202
+ json_file.unlink()
203
+ result2 = load_head_categories("test-model")
 
204
 
205
+ assert result1 is result2
206
+
207
+ def test_short_name_alias(self, tmp_path):
208
+ """Should find model by short name (after /)."""
209
+ data = {"my-model": {"model_name": "my-model", "categories": {}}}
210
+ json_file = tmp_path / "head_categories.json"
211
+ json_file.write_text(json.dumps(data))
212
+
213
+ with patch('utils.head_detection._JSON_PATH', json_file):
214
+ result = load_head_categories("org/my-model")
215
 
216
+ assert result is not None
217
+
218
+
219
+ # =============================================================================
220
+ # Tests for verify_head_activation
221
+ # =============================================================================
222
 
223
+ class TestVerifyHeadActivation:
224
+ """Tests for verify_head_activation function."""
225
 
226
+ def test_previous_token_strong(self):
227
+ """Strong previous-token pattern should score high."""
 
 
 
 
228
  size = 6
229
  matrix = torch.zeros(size, size)
230
+ for i in range(1, size):
231
+ matrix[i, i - 1] = 0.8
232
+ matrix[i, i] = 0.2
233
+ matrix[0, 0] = 1.0
234
+
235
+ score = verify_head_activation(matrix, [1, 2, 3, 4, 5, 6], "previous_token")
236
+ assert score > 0.6
237
+
238
+ def test_previous_token_weak(self):
239
+ """Uniform attention should have low previous-token score."""
240
+ size = 6
241
+ matrix = torch.ones(size, size) / size
242
+ score = verify_head_activation(matrix, [1, 2, 3, 4, 5, 6], "previous_token")
243
+ assert score < 0.3
244
+
245
+ def test_induction_with_repetition(self):
246
+ """Induction pattern should score > 0 when repeated tokens are present."""
247
+ # Tokens: [A, B, C, A, ?] head should attend to B (position 1) from position 3
248
+ size = 5
249
+ matrix = torch.ones(size, size) / size # Baseline uniform
250
+ matrix[3, 1] = 0.7 # Position 3 (second A) attends to position 1 (B after first A)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
+ token_ids = [10, 20, 30, 10, 40] # Token 10 repeats
253
+ score = verify_head_activation(matrix, token_ids, "induction")
254
+ assert score > 0.3
255
+
256
+ def test_induction_no_repetition(self):
257
+ """Induction should return 0.0 when no tokens repeat."""
258
+ matrix = torch.ones(4, 4) / 4
259
+ score = verify_head_activation(matrix, [1, 2, 3, 4], "induction")
260
+ assert score == 0.0
261
+
262
+ def test_duplicate_token_with_repeats(self):
263
+ """Duplicate-token head should score > 0 when later positions attend to earlier same token."""
264
  size = 5
265
+ matrix = torch.ones(size, size) / size
266
+ matrix[3, 0] = 0.6 # Position 3 (second occurrence of token 10) attends to position 0
267
+
268
+ token_ids = [10, 20, 30, 10, 40]
269
+ score = verify_head_activation(matrix, token_ids, "duplicate_token")
270
+ assert score > 0.3
271
+
272
+ def test_duplicate_token_no_repeats(self):
273
+ """Should return 0.0 when no duplicates."""
274
+ matrix = torch.ones(4, 4) / 4
275
+ score = verify_head_activation(matrix, [1, 2, 3, 4], "duplicate_token")
276
+ assert score == 0.0
277
+
278
+ def test_positional_strong(self):
279
+ """Strong first-token attention should score high."""
280
+ size = 6
281
  matrix = torch.zeros(size, size)
282
  for i in range(size):
283
+ matrix[i, 0] = 0.7
284
+ matrix[i, i] = 0.3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
 
286
+ score = verify_head_activation(matrix, [1, 2, 3, 4, 5, 6], "positional")
287
+ assert score > 0.5
288
+
289
+ def test_diffuse_uniform(self):
290
+ """Uniform attention should have high diffuse score."""
291
+ size = 8
292
+ matrix = torch.ones(size, size) / size
293
+ score = verify_head_activation(matrix, list(range(size)), "diffuse")
294
+ assert score > 0.8
295
+
296
+ def test_diffuse_peaked(self):
297
+ """Peaked attention should have low diffuse score."""
298
+ size = 8
299
+ matrix = torch.zeros(size, size)
300
+ matrix[:, 0] = 1.0
301
+ score = verify_head_activation(matrix, list(range(size)), "diffuse")
302
+ assert score < 0.3
303
 
304
+ def test_unknown_category(self):
305
+ """Unknown category should return 0.0."""
306
+ matrix = torch.ones(4, 4) / 4
307
+ assert verify_head_activation(matrix, [1, 2, 3, 4], "nonexistent") == 0.0
308
 
309
+ def test_short_sequence(self):
310
+ """Very short sequence should return 0.0."""
311
+ matrix = torch.ones(1, 1)
312
+ assert verify_head_activation(matrix, [1], "previous_token") == 0.0
313
+
314
+
315
+ # =============================================================================
316
+ # Tests for get_active_head_summary
317
+ # =============================================================================
318
+
319
+ class TestGetActiveHeadSummary:
320
+ """Tests for get_active_head_summary function."""
321
+
322
+ def _make_activation_data(self, token_ids, num_layers=2, num_heads=4, seq_len=None):
323
+ """Helper: create mock activation_data with given token_ids."""
324
+ if seq_len is None:
325
+ seq_len = len(token_ids)
326
 
327
+ attention_outputs = {}
328
+ for layer in range(num_layers):
329
+ # Create uniform attention [1, num_heads, seq_len, seq_len]
330
+ attn = torch.ones(1, num_heads, seq_len, seq_len) / seq_len
331
+ attention_outputs[f'model.layers.{layer}.self_attn'] = {
332
+ 'output': [
333
+ [[0.1] * seq_len], # hidden states (unused)
334
+ attn.tolist()
335
+ ]
336
+ }
337
 
338
+ return {
339
+ 'model': 'test-model',
340
+ 'input_ids': [token_ids],
341
+ 'attention_outputs': attention_outputs,
 
 
 
 
 
 
 
 
 
 
 
 
342
  }
343
+
344
+ def test_returns_none_for_unknown_model(self, tmp_path):
345
+ """Should return None when model not in JSON."""
346
+ json_file = tmp_path / "head_categories.json"
347
+ json_file.write_text(json.dumps(SAMPLE_JSON))
348
+
349
+ with patch('utils.head_detection._JSON_PATH', json_file):
350
+ data = self._make_activation_data([1, 2, 3, 4])
351
+ result = get_active_head_summary(data, "unknown-model")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
 
353
+ assert result is None
354
+
355
+ def test_returns_categories_structure(self, tmp_path):
356
+ """Should return proper structure with categories."""
357
+ json_file = tmp_path / "head_categories.json"
358
+ json_file.write_text(json.dumps(SAMPLE_JSON))
359
+
360
+ with patch('utils.head_detection._JSON_PATH', json_file):
361
+ data = self._make_activation_data([1, 2, 3, 4])
362
+ result = get_active_head_summary(data, "test-model")
363
 
364
+ assert result is not None
365
+ assert result["model_available"] is True
366
+ assert "categories" in result
367
+ assert "previous_token" in result["categories"]
368
+ assert "induction" in result["categories"]
369
+
370
+ def test_heads_have_activation_scores(self, tmp_path):
371
+ """Each head should have an activation_score."""
372
+ json_file = tmp_path / "head_categories.json"
373
+ json_file.write_text(json.dumps(SAMPLE_JSON))
374
+
375
+ with patch('utils.head_detection._JSON_PATH', json_file):
376
+ data = self._make_activation_data([1, 2, 3, 4])
377
+ result = get_active_head_summary(data, "test-model")
378
+
379
+ for cat_key, cat_data in result["categories"].items():
380
+ for head in cat_data.get("heads", []):
381
+ assert "activation_score" in head
382
+ assert "is_active" in head
383
+ assert "label" in head
384
+
385
+ def test_induction_grayed_when_no_repeats(self, tmp_path):
386
+ """Induction should be non-applicable when no repeated tokens."""
387
+ json_file = tmp_path / "head_categories.json"
388
+ json_file.write_text(json.dumps(SAMPLE_JSON))
389
+
390
+ with patch('utils.head_detection._JSON_PATH', json_file):
391
+ data = self._make_activation_data([1, 2, 3, 4]) # No repeats
392
+ result = get_active_head_summary(data, "test-model")
393
+
394
+ induction = result["categories"]["induction"]
395
+ assert induction["is_applicable"] is False
396
+ assert all(h["activation_score"] == 0.0 for h in induction["heads"])
397
+
398
+ def test_induction_active_with_repeats(self, tmp_path):
399
+ """Induction should be applicable when tokens repeat."""
400
+ json_file = tmp_path / "head_categories.json"
401
+ json_file.write_text(json.dumps(SAMPLE_JSON))
402
+
403
+ with patch('utils.head_detection._JSON_PATH', json_file):
404
+ data = self._make_activation_data([10, 20, 10, 30]) # Token 10 repeats
405
+ result = get_active_head_summary(data, "test-model")
406
+
407
+ induction = result["categories"]["induction"]
408
+ assert induction["is_applicable"] is True
409
+
410
+ def test_suggested_prompt_included(self, tmp_path):
411
+ """Suggested prompt should appear for repetition-dependent categories."""
412
+ json_file = tmp_path / "head_categories.json"
413
+ json_file.write_text(json.dumps(SAMPLE_JSON))
414
+
415
+ with patch('utils.head_detection._JSON_PATH', json_file):
416
+ data = self._make_activation_data([1, 2, 3, 4])
417
+ result = get_active_head_summary(data, "test-model")
418
+
419
+ assert result["categories"]["induction"]["suggested_prompt"] is not None
420
+ assert result["categories"]["duplicate_token"]["suggested_prompt"] is not None
421
+
422
+ def test_other_category_always_present(self, tmp_path):
423
+ """Other/Unclassified category should always be in the result."""
424
+ json_file = tmp_path / "head_categories.json"
425
+ json_file.write_text(json.dumps(SAMPLE_JSON))
426
+
427
+ with patch('utils.head_detection._JSON_PATH', json_file):
428
+ data = self._make_activation_data([1, 2, 3, 4])
429
+ result = get_active_head_summary(data, "test-model")
430
+
431
+ assert "other" in result["categories"]
utils/__init__.py CHANGED
@@ -8,7 +8,7 @@ from .model_patterns import (load_model_and_get_patterns, execute_forward_pass,
8
  detect_significant_probability_increases,
9
  evaluate_sequence_ablation, generate_bertviz_model_view_html)
10
  from .model_config import get_model_family, get_family_config, get_auto_selections, MODEL_TO_FAMILY, MODEL_FAMILIES
11
- from .head_detection import categorize_all_heads, categorize_single_layer_heads, format_categorization_summary, HeadCategorizationConfig
12
  from .beam_search import perform_beam_search
13
  from .ablation_metrics import compute_kl_divergence, score_sequence, get_token_probability_deltas
14
  from .token_attribution import compute_integrated_gradients, compute_simple_gradient_attribution, create_attribution_visualization_data
@@ -38,10 +38,9 @@ __all__ = [
38
  'MODEL_FAMILIES',
39
 
40
  # Head detection
41
- 'categorize_all_heads',
42
- 'categorize_single_layer_heads',
43
- 'format_categorization_summary',
44
- 'HeadCategorizationConfig',
45
 
46
  # Beam search
47
  'perform_beam_search',
 
8
  detect_significant_probability_increases,
9
  evaluate_sequence_ablation, generate_bertviz_model_view_html)
10
  from .model_config import get_model_family, get_family_config, get_auto_selections, MODEL_TO_FAMILY, MODEL_FAMILIES
11
+ from .head_detection import load_head_categories, verify_head_activation, get_active_head_summary
12
  from .beam_search import perform_beam_search
13
  from .ablation_metrics import compute_kl_divergence, score_sequence, get_token_probability_deltas
14
  from .token_attribution import compute_integrated_gradients, compute_simple_gradient_attribution, create_attribution_visualization_data
 
38
  'MODEL_FAMILIES',
39
 
40
  # Head detection
41
+ 'load_head_categories',
42
+ 'verify_head_activation',
43
+ 'get_active_head_summary',
 
44
 
45
  # Beam search
46
  'perform_beam_search',
utils/head_categories.json ADDED
@@ -0,0 +1,1099 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "gpt2": {
3
+ "model_name": "gpt2",
4
+ "num_layers": 12,
5
+ "num_heads": 12,
6
+ "analysis_date": "2026-02-26",
7
+ "categories": {
8
+ "previous_token": {
9
+ "display_name": "Previous Token",
10
+ "description": "Attends to the immediately preceding token \u2014 like reading left to right",
11
+ "icon": "arrow-left",
12
+ "educational_text": "This head looks at the word right before the current one. Like reading left to right, it helps track local word-by-word patterns.",
13
+ "requires_repetition": false,
14
+ "top_heads": [
15
+ {
16
+ "layer": 1,
17
+ "head": 0,
18
+ "score": 0.3655
19
+ },
20
+ {
21
+ "layer": 2,
22
+ "head": 2,
23
+ "score": 0.5679
24
+ },
25
+ {
26
+ "layer": 2,
27
+ "head": 5,
28
+ "score": 0.3384
29
+ },
30
+ {
31
+ "layer": 2,
32
+ "head": 9,
33
+ "score": 0.4052
34
+ },
35
+ {
36
+ "layer": 3,
37
+ "head": 2,
38
+ "score": 0.4164
39
+ },
40
+ {
41
+ "layer": 3,
42
+ "head": 6,
43
+ "score": 0.3359
44
+ },
45
+ {
46
+ "layer": 3,
47
+ "head": 7,
48
+ "score": 0.4419
49
+ },
50
+ {
51
+ "layer": 4,
52
+ "head": 11,
53
+ "score": 0.97
54
+ }
55
+ ]
56
+ },
57
+ "induction": {
58
+ "display_name": "Induction",
59
+ "description": "Completes repeated patterns: if it saw [A][B] before and now sees [A], it predicts [B]",
60
+ "icon": "repeat",
61
+ "educational_text": "This head finds patterns that happened before and predicts they'll happen again. If it saw 'the cat' earlier, it expects the same words to follow.",
62
+ "requires_repetition": true,
63
+ "suggested_prompt": "Try: 'The cat sat on the mat. The cat' \u2014 the repeated 'The cat' lets induction heads activate.",
64
+ "top_heads": [
65
+ {
66
+ "layer": 5,
67
+ "head": 0,
68
+ "score": 0.3363
69
+ },
70
+ {
71
+ "layer": 5,
72
+ "head": 1,
73
+ "score": 0.4412
74
+ },
75
+ {
76
+ "layer": 5,
77
+ "head": 5,
78
+ "score": 0.4119
79
+ },
80
+ {
81
+ "layer": 5,
82
+ "head": 8,
83
+ "score": 0.3032
84
+ },
85
+ {
86
+ "layer": 6,
87
+ "head": 9,
88
+ "score": 0.3017
89
+ },
90
+ {
91
+ "layer": 7,
92
+ "head": 10,
93
+ "score": 0.2849
94
+ },
95
+ {
96
+ "layer": 8,
97
+ "head": 1,
98
+ "score": 0.2608
99
+ },
100
+ {
101
+ "layer": 10,
102
+ "head": 7,
103
+ "score": 0.2196
104
+ }
105
+ ]
106
+ },
107
+ "duplicate_token": {
108
+ "display_name": "Duplicate Token",
109
+ "description": "Notices when the same word appears more than once",
110
+ "icon": "clone",
111
+ "educational_text": "This head notices when the same word appears more than once, like a highlighter for repeated words. It helps the model track which words have already been said.",
112
+ "requires_repetition": true,
113
+ "suggested_prompt": "Try a prompt with repeated words like 'The cat sat. The cat slept.' to see duplicate-token heads light up.",
114
+ "top_heads": [
115
+ {
116
+ "layer": 0,
117
+ "head": 1,
118
+ "score": 0.4175
119
+ },
120
+ {
121
+ "layer": 0,
122
+ "head": 5,
123
+ "score": 0.4155
124
+ },
125
+ {
126
+ "layer": 1,
127
+ "head": 11,
128
+ "score": 0.3256
129
+ },
130
+ {
131
+ "layer": 3,
132
+ "head": 0,
133
+ "score": 0.2416
134
+ },
135
+ {
136
+ "layer": 4,
137
+ "head": 7,
138
+ "score": 0.1238
139
+ },
140
+ {
141
+ "layer": 11,
142
+ "head": 8,
143
+ "score": 0.1741
144
+ }
145
+ ]
146
+ },
147
+ "positional": {
148
+ "display_name": "Positional / First-Token",
149
+ "description": "Always pays attention to the very first word, using it as an anchor point",
150
+ "icon": "map-pin",
151
+ "educational_text": "This head always pays attention to the very first word, using it as an anchor point. The first token serves as a 'default' position when no other token is specifically relevant.",
152
+ "requires_repetition": false,
153
+ "top_heads": [
154
+ {
155
+ "layer": 7,
156
+ "head": 2,
157
+ "score": 0.9077
158
+ },
159
+ {
160
+ "layer": 9,
161
+ "head": 6,
162
+ "score": 0.9077
163
+ },
164
+ {
165
+ "layer": 9,
166
+ "head": 9,
167
+ "score": 0.9064
168
+ },
169
+ {
170
+ "layer": 9,
171
+ "head": 11,
172
+ "score": 0.9301
173
+ },
174
+ {
175
+ "layer": 10,
176
+ "head": 10,
177
+ "score": 0.9098
178
+ },
179
+ {
180
+ "layer": 11,
181
+ "head": 2,
182
+ "score": 0.8962
183
+ },
184
+ {
185
+ "layer": 11,
186
+ "head": 6,
187
+ "score": 0.9231
188
+ },
189
+ {
190
+ "layer": 11,
191
+ "head": 9,
192
+ "score": 0.9117
193
+ }
194
+ ]
195
+ },
196
+ "diffuse": {
197
+ "display_name": "Diffuse / Spread",
198
+ "description": "Spreads attention evenly across many words, gathering general context",
199
+ "icon": "expand-arrows-alt",
200
+ "educational_text": "This head spreads its attention evenly across many words, gathering general context rather than focusing on one spot. It provides a 'big picture' summary of the input.",
201
+ "requires_repetition": false,
202
+ "top_heads": [
203
+ {
204
+ "layer": 0,
205
+ "head": 10,
206
+ "score": 0.6076
207
+ },
208
+ {
209
+ "layer": 0,
210
+ "head": 11,
211
+ "score": 0.5915
212
+ },
213
+ {
214
+ "layer": 1,
215
+ "head": 2,
216
+ "score": 0.5851
217
+ },
218
+ {
219
+ "layer": 1,
220
+ "head": 4,
221
+ "score": 0.5693
222
+ },
223
+ {
224
+ "layer": 1,
225
+ "head": 10,
226
+ "score": 0.6001
227
+ },
228
+ {
229
+ "layer": 2,
230
+ "head": 7,
231
+ "score": 0.6227
232
+ },
233
+ {
234
+ "layer": 2,
235
+ "head": 10,
236
+ "score": 0.6325
237
+ },
238
+ {
239
+ "layer": 11,
240
+ "head": 0,
241
+ "score": 0.6132
242
+ }
243
+ ]
244
+ }
245
+ },
246
+ "all_scores": {
247
+ "previous_token": [
248
+ [
249
+ 0.1650262176990509,
250
+ 0.005524545907974243,
251
+ 0.13794219493865967,
252
+ 0.11309953033924103,
253
+ 0.19386060535907745,
254
+ 0.02020726539194584,
255
+ 0.18705399334430695,
256
+ 0.3287373483181,
257
+ 0.1688501238822937,
258
+ 0.14645136892795563,
259
+ 0.12409798055887222,
260
+ 0.14697492122650146
261
+ ],
262
+ [
263
+ 0.36550161242485046,
264
+ 0.22920921444892883,
265
+ 0.1901777684688568,
266
+ 0.13691475987434387,
267
+ 0.1552433967590332,
268
+ 0.1548655927181244,
269
+ 0.14041779935359955,
270
+ 0.1399569809436798,
271
+ 0.14001941680908203,
272
+ 0.12206045538187027,
273
+ 0.18723534047603607,
274
+ 0.05272947624325752
275
+ ],
276
+ [
277
+ 0.24368862807750702,
278
+ 0.11734970659017563,
279
+ 0.5678969025611877,
280
+ 0.33175796270370483,
281
+ 0.3293865919113159,
282
+ 0.33843594789505005,
283
+ 0.1687498688697815,
284
+ 0.2169996052980423,
285
+ 0.33436763286590576,
286
+ 0.405174195766449,
287
+ 0.20988500118255615,
288
+ 0.1365954577922821
289
+ ],
290
+ [
291
+ 0.08308680355548859,
292
+ 0.16770434379577637,
293
+ 0.41642817854881287,
294
+ 0.32616299390792847,
295
+ 0.09816452860832214,
296
+ 0.12414131313562393,
297
+ 0.33591750264167786,
298
+ 0.4418589174747467,
299
+ 0.3060630261898041,
300
+ 0.21817748248577118,
301
+ 0.1548490822315216,
302
+ 0.2623787224292755
303
+ ],
304
+ [
305
+ 0.24851615726947784,
306
+ 0.22178645431995392,
307
+ 0.10810651630163193,
308
+ 0.2638419270515442,
309
+ 0.1461866945028305,
310
+ 0.19259677827358246,
311
+ 0.16893190145492554,
312
+ 0.20602412521839142,
313
+ 0.11169518530368805,
314
+ 0.16701465845108032,
315
+ 0.09775038063526154,
316
+ 0.9700173139572144
317
+ ],
318
+ [
319
+ 0.1162194162607193,
320
+ 0.09808940440416336,
321
+ 0.20977501571178436,
322
+ 0.16994376480579376,
323
+ 0.2316969633102417,
324
+ 0.10760845243930817,
325
+ 0.26810961961746216,
326
+ 0.1556214690208435,
327
+ 0.13168412446975708,
328
+ 0.10098359733819962,
329
+ 0.1563761830329895,
330
+ 0.11529763042926788
331
+ ],
332
+ [
333
+ 0.23046550154685974,
334
+ 0.13669200241565704,
335
+ 0.10113422572612762,
336
+ 0.12357200682163239,
337
+ 0.12948814034461975,
338
+ 0.14964132010936737,
339
+ 0.11104538291692734,
340
+ 0.17790208756923676,
341
+ 0.3313186764717102,
342
+ 0.09724397212266922,
343
+ 0.1065865010023117,
344
+ 0.19595712423324585
345
+ ],
346
+ [
347
+ 0.2756780683994293,
348
+ 0.09617989510297775,
349
+ 0.0887245386838913,
350
+ 0.14660504460334778,
351
+ 0.11926672607660294,
352
+ 0.12578082084655762,
353
+ 0.10664939880371094,
354
+ 0.11368991434574127,
355
+ 0.18360558152198792,
356
+ 0.130024254322052,
357
+ 0.10562390089035034,
358
+ 0.10479450225830078
359
+ ],
360
+ [
361
+ 0.10714849084615707,
362
+ 0.10390549898147583,
363
+ 0.11945408582687378,
364
+ 0.10176572948694229,
365
+ 0.15246066451072693,
366
+ 0.1935780942440033,
367
+ 0.13547158241271973,
368
+ 0.24629735946655273,
369
+ 0.14471763372421265,
370
+ 0.12072619050741196,
371
+ 0.12850022315979004,
372
+ 0.10024647414684296
373
+ ],
374
+ [
375
+ 0.1123703345656395,
376
+ 0.10224141925573349,
377
+ 0.10966678708791733,
378
+ 0.24468424916267395,
379
+ 0.09359707683324814,
380
+ 0.11123354732990265,
381
+ 0.09214123338460922,
382
+ 0.11035183817148209,
383
+ 0.09690441191196442,
384
+ 0.09199563413858414,
385
+ 0.16506430506706238,
386
+ 0.08864383399486542
387
+ ],
388
+ [
389
+ 0.09993860870599747,
390
+ 0.1017073541879654,
391
+ 0.09143912047147751,
392
+ 0.1137363463640213,
393
+ 0.11926724761724472,
394
+ 0.1261630356311798,
395
+ 0.09609334915876389,
396
+ 0.1267780214548111,
397
+ 0.09360888600349426,
398
+ 0.15695181488990784,
399
+ 0.09125342220067978,
400
+ 0.16533184051513672
401
+ ],
402
+ [
403
+ 0.1551479697227478,
404
+ 0.10182406008243561,
405
+ 0.09162592142820358,
406
+ 0.14142417907714844,
407
+ 0.10655181109905243,
408
+ 0.09299013763666153,
409
+ 0.08795793354511261,
410
+ 0.10052843391895294,
411
+ 0.18854694068431854,
412
+ 0.09097206592559814,
413
+ 0.14251284301280975,
414
+ 0.13573673367500305
415
+ ]
416
+ ],
417
+ "induction": [
418
+ [
419
+ 0.07627037912607193,
420
+ 0.0035299647133797407,
421
+ 0.050907380878925323,
422
+ 0.018350504338741302,
423
+ 0.055634528398513794,
424
+ 0.015752490609884262,
425
+ 0.09711054712533951,
426
+ 0.08642718195915222,
427
+ 0.07673756778240204,
428
+ 0.06478650867938995,
429
+ 0.05675221234560013,
430
+ 0.0686919093132019
431
+ ],
432
+ [
433
+ 0.098502516746521,
434
+ 0.08570204675197601,
435
+ 0.09086534380912781,
436
+ 0.05725013464689255,
437
+ 0.06655086576938629,
438
+ 0.08535383641719818,
439
+ 0.04390129819512367,
440
+ 0.05150846764445305,
441
+ 0.05973561853170395,
442
+ 0.05239921063184738,
443
+ 0.10886937379837036,
444
+ 0.03350156173110008
445
+ ],
446
+ [
447
+ 0.0880986899137497,
448
+ 0.029988640919327736,
449
+ 0.06596572697162628,
450
+ 0.09502042829990387,
451
+ 0.06376759707927704,
452
+ 0.07735122740268707,
453
+ 0.07770463079214096,
454
+ 0.08998467028141022,
455
+ 0.08355952054262161,
456
+ 0.08642251044511795,
457
+ 0.0951002761721611,
458
+ 0.038624998182058334
459
+ ],
460
+ [
461
+ 0.012395743280649185,
462
+ 0.0515044704079628,
463
+ 0.0702400729060173,
464
+ 0.038637131452560425,
465
+ 0.03541486710309982,
466
+ 0.04828893393278122,
467
+ 0.07664503902196884,
468
+ 0.05478388071060181,
469
+ 0.05722055584192276,
470
+ 0.05503711849451065,
471
+ 0.05377575010061264,
472
+ 0.05681142956018448
473
+ ],
474
+ [
475
+ 0.023173518478870392,
476
+ 0.04842953383922577,
477
+ 0.02587379515171051,
478
+ 0.0371115505695343,
479
+ 0.043572355061769485,
480
+ 0.025999004021286964,
481
+ 0.057220708578825,
482
+ 0.05670655891299248,
483
+ 0.05118811875581741,
484
+ 0.029776636511087418,
485
+ 0.02828892692923546,
486
+ 0.050957612693309784
487
+ ],
488
+ [
489
+ 0.3362796902656555,
490
+ 0.44116583466529846,
491
+ 0.04926660656929016,
492
+ 0.060651201754808426,
493
+ 0.049554307013750076,
494
+ 0.41194018721580505,
495
+ 0.038970425724983215,
496
+ 0.01051054522395134,
497
+ 0.30320701003074646,
498
+ 0.07053252309560776,
499
+ 0.05541849881410599,
500
+ 0.03842315822839737
501
+ ],
502
+ [
503
+ 0.04865153878927231,
504
+ 0.13892090320587158,
505
+ 0.023456398397684097,
506
+ 0.043447092175483704,
507
+ 0.05254914611577988,
508
+ 0.06307318806648254,
509
+ 0.06592734158039093,
510
+ 0.06641103327274323,
511
+ 0.06890955567359924,
512
+ 0.3017217516899109,
513
+ 0.053376901894807816,
514
+ 0.05453646928071976
515
+ ],
516
+ [
517
+ 0.04203842580318451,
518
+ 0.06195511296391487,
519
+ 0.18403273820877075,
520
+ 0.06932497024536133,
521
+ 0.025891464203596115,
522
+ 0.03674555569887161,
523
+ 0.05915430188179016,
524
+ 0.08904685080051422,
525
+ 0.029217243194580078,
526
+ 0.047680627554655075,
527
+ 0.28489723801612854,
528
+ 0.15201476216316223
529
+ ],
530
+ [
531
+ 0.03113759122788906,
532
+ 0.2607646584510803,
533
+ 0.04262052848935127,
534
+ 0.03490695357322693,
535
+ 0.020729169249534607,
536
+ 0.039468441158533096,
537
+ 0.17247121036052704,
538
+ 0.02061128057539463,
539
+ 0.0941251665353775,
540
+ 0.044258393347263336,
541
+ 0.09541143476963043,
542
+ 0.03278326988220215
543
+ ],
544
+ [
545
+ 0.06156448647379875,
546
+ 0.09029851853847504,
547
+ 0.06509305536746979,
548
+ 0.04298751801252365,
549
+ 0.02618749439716339,
550
+ 0.029909756034612656,
551
+ 0.08973383903503418,
552
+ 0.06374338269233704,
553
+ 0.02463320828974247,
554
+ 0.10424073040485382,
555
+ 0.016569094732403755,
556
+ 0.04829319566488266
557
+ ],
558
+ [
559
+ 0.0732613354921341,
560
+ 0.15449705719947815,
561
+ 0.048853177577257156,
562
+ 0.12552715837955475,
563
+ 0.1161937341094017,
564
+ 0.020513027906417847,
565
+ 0.08032035827636719,
566
+ 0.21955707669258118,
567
+ 0.07728692889213562,
568
+ 0.014143750071525574,
569
+ 0.056671954691410065,
570
+ 0.1141514927148819
571
+ ],
572
+ [
573
+ 0.10236237943172455,
574
+ 0.0509863905608654,
575
+ 0.02403058484196663,
576
+ 0.046142492443323135,
577
+ 0.03625836968421936,
578
+ 0.05091869831085205,
579
+ 0.02450958639383316,
580
+ 0.057415880262851715,
581
+ 0.09816241264343262,
582
+ 0.045323897153139114,
583
+ 0.12710919976234436,
584
+ 0.06512586772441864
585
+ ]
586
+ ],
587
+ "duplicate_token": [
588
+ [
589
+ 0.061639100313186646,
590
+ 0.4175182282924652,
591
+ 0.05723930522799492,
592
+ 0.039668913930654526,
593
+ 0.0939607322216034,
594
+ 0.41551661491394043,
595
+ 0.07361333817243576,
596
+ 0.0333673469722271,
597
+ 0.0963386595249176,
598
+ 0.0499253086745739,
599
+ 0.17845425009727478,
600
+ 0.0740630105137825
601
+ ],
602
+ [
603
+ 0.03887755423784256,
604
+ 0.03720149025321007,
605
+ 0.07625596970319748,
606
+ 0.052537791430950165,
607
+ 0.06014804169535637,
608
+ 0.09469039738178253,
609
+ 0.05574027821421623,
610
+ 0.03633364289999008,
611
+ 0.05319533869624138,
612
+ 0.04128124564886093,
613
+ 0.10213665664196014,
614
+ 0.3255976736545563
615
+ ],
616
+ [
617
+ 0.0270945243537426,
618
+ 0.02465079165995121,
619
+ 0.003460302483290434,
620
+ 0.01619820110499859,
621
+ 0.008633781224489212,
622
+ 0.012598037719726562,
623
+ 0.04559514671564102,
624
+ 0.06271781027317047,
625
+ 0.014696493744850159,
626
+ 0.012923041358590126,
627
+ 0.07460619509220123,
628
+ 0.027807259932160378
629
+ ],
630
+ [
631
+ 0.24161744117736816,
632
+ 0.013565832749009132,
633
+ 0.006801762618124485,
634
+ 0.0032485886476933956,
635
+ 0.02135937288403511,
636
+ 0.024630073457956314,
637
+ 0.015564021654427052,
638
+ 0.005436367355287075,
639
+ 0.007849231362342834,
640
+ 0.015441101975739002,
641
+ 0.04518696293234825,
642
+ 0.013415353372693062
643
+ ],
644
+ [
645
+ 0.0038080490194261074,
646
+ 0.00991421565413475,
647
+ 0.025079775601625443,
648
+ 0.011280774138867855,
649
+ 0.04912680760025978,
650
+ 0.006715251598507166,
651
+ 0.021937724202871323,
652
+ 0.12375693023204803,
653
+ 0.026765504851937294,
654
+ 0.011192137375473976,
655
+ 0.025936853140592575,
656
+ 8.196845010388643e-05
657
+ ],
658
+ [
659
+ 0.023429764434695244,
660
+ 0.016590412706136703,
661
+ 0.017092403024435043,
662
+ 0.03277356177568436,
663
+ 0.016331162303686142,
664
+ 0.021816818043589592,
665
+ 0.011733165010809898,
666
+ 0.005887174047529697,
667
+ 0.01492474414408207,
668
+ 0.030711984261870384,
669
+ 0.07108811289072037,
670
+ 0.06261330097913742
671
+ ],
672
+ [
673
+ 0.02555452659726143,
674
+ 0.029351357370615005,
675
+ 0.021288855001330376,
676
+ 0.024492312222719193,
677
+ 0.039061177521944046,
678
+ 0.03344884514808655,
679
+ 0.06831201910972595,
680
+ 0.03736294433474541,
681
+ 0.019588876515626907,
682
+ 0.04092007130384445,
683
+ 0.01721787452697754,
684
+ 0.019499698653817177
685
+ ],
686
+ [
687
+ 0.020283106714487076,
688
+ 0.02244160696864128,
689
+ 0.01908939704298973,
690
+ 0.0162697471678257,
691
+ 0.02050776034593582,
692
+ 0.02750096097588539,
693
+ 0.026029860600829124,
694
+ 0.03217357397079468,
695
+ 0.014307908713817596,
696
+ 0.006763854529708624,
697
+ 0.04564401134848595,
698
+ 0.027008097618818283
699
+ ],
700
+ [
701
+ 0.027883464470505714,
702
+ 0.041265588253736496,
703
+ 0.028905224055051804,
704
+ 0.013592107221484184,
705
+ 0.0074845412746071815,
706
+ 0.03488120436668396,
707
+ 0.04030846059322357,
708
+ 0.010207113809883595,
709
+ 0.035800714045763016,
710
+ 0.029832065105438232,
711
+ 0.02576960064470768,
712
+ 0.014182129874825478
713
+ ],
714
+ [
715
+ 0.017836367711424828,
716
+ 0.029379570856690407,
717
+ 0.022140078246593475,
718
+ 0.036215025931596756,
719
+ 0.024319598451256752,
720
+ 0.026142369955778122,
721
+ 0.018539801239967346,
722
+ 0.019365690648555756,
723
+ 0.011654431000351906,
724
+ 0.025902757421135902,
725
+ 0.015683690086007118,
726
+ 0.010347607545554638
727
+ ],
728
+ [
729
+ 0.02144056186079979,
730
+ 0.046325650066137314,
731
+ 0.021630164235830307,
732
+ 0.05147164314985275,
733
+ 0.042117439210414886,
734
+ 0.02441989816725254,
735
+ 0.02136657014489174,
736
+ 0.05447021871805191,
737
+ 0.03011142648756504,
738
+ 0.020071811974048615,
739
+ 0.016738489270210266,
740
+ 0.04836065694689751
741
+ ],
742
+ [
743
+ 0.13101476430892944,
744
+ 0.03627091646194458,
745
+ 0.0201750285923481,
746
+ 0.06851539760828018,
747
+ 0.029396140947937965,
748
+ 0.03782244399189949,
749
+ 0.014253688976168633,
750
+ 0.044284969568252563,
751
+ 0.17414367198944092,
752
+ 0.021388430148363113,
753
+ 0.06319155544042587,
754
+ 0.055135130882263184
755
+ ]
756
+ ],
757
+ "positional": [
758
+ [
759
+ 0.5065976977348328,
760
+ 0.07629109919071198,
761
+ 0.5960054397583008,
762
+ 0.1072789654135704,
763
+ 0.1979677975177765,
764
+ 0.13927273452281952,
765
+ 0.40057316422462463,
766
+ 0.294817179441452,
767
+ 0.383198618888855,
768
+ 0.5544258952140808,
769
+ 0.40033283829689026,
770
+ 0.47870078682899475
771
+ ],
772
+ [
773
+ 0.2410203516483307,
774
+ 0.4396105706691742,
775
+ 0.4307883381843567,
776
+ 0.5517755746841431,
777
+ 0.5317303538322449,
778
+ 0.5054966807365417,
779
+ 0.6495388746261597,
780
+ 0.6267575025558472,
781
+ 0.5890303254127502,
782
+ 0.6793325543403625,
783
+ 0.07594899833202362,
784
+ 0.21587026119232178
785
+ ],
786
+ [
787
+ 0.4007927477359772,
788
+ 0.7385829091072083,
789
+ 0.1999039351940155,
790
+ 0.30451780557632446,
791
+ 0.46449190378189087,
792
+ 0.3399127125740051,
793
+ 0.514499306678772,
794
+ 0.29614612460136414,
795
+ 0.31728798151016235,
796
+ 0.2615760266780853,
797
+ 0.3395046591758728,
798
+ 0.7219924926757812
799
+ ],
800
+ [
801
+ 0.8190140724182129,
802
+ 0.6275245547294617,
803
+ 0.25404971837997437,
804
+ 0.6006070375442505,
805
+ 0.8895429372787476,
806
+ 0.7170742154121399,
807
+ 0.3035760521888733,
808
+ 0.35117024183273315,
809
+ 0.4254607558250427,
810
+ 0.5432918071746826,
811
+ 0.6645973920822144,
812
+ 0.47774600982666016
813
+ ],
814
+ [
815
+ 0.5796363949775696,
816
+ 0.5921002626419067,
817
+ 0.793941080570221,
818
+ 0.49824151396751404,
819
+ 0.7273139953613281,
820
+ 0.6757563948631287,
821
+ 0.64992356300354,
822
+ 0.3122835159301758,
823
+ 0.8277088403701782,
824
+ 0.6422610878944397,
825
+ 0.8769611120223999,
826
+ 0.14915767312049866
827
+ ],
828
+ [
829
+ 0.7556132078170776,
830
+ 0.8456296920776367,
831
+ 0.6256846785545349,
832
+ 0.5377398729324341,
833
+ 0.5960881114006042,
834
+ 0.7833361625671387,
835
+ 0.723742663860321,
836
+ 0.7974669933319092,
837
+ 0.7113959789276123,
838
+ 0.8386362791061401,
839
+ 0.6537194848060608,
840
+ 0.7253992557525635
841
+ ],
842
+ [
843
+ 0.538119912147522,
844
+ 0.7342842817306519,
845
+ 0.8442155718803406,
846
+ 0.7554894685745239,
847
+ 0.6839307546615601,
848
+ 0.7064528465270996,
849
+ 0.7554677724838257,
850
+ 0.6205617189407349,
851
+ 0.5202042460441589,
852
+ 0.8443636894226074,
853
+ 0.8635346293449402,
854
+ 0.6343041062355042
855
+ ],
856
+ [
857
+ 0.6614936590194702,
858
+ 0.8791419267654419,
859
+ 0.9076933860778809,
860
+ 0.7058827877044678,
861
+ 0.8025026321411133,
862
+ 0.7749000787734985,
863
+ 0.838254451751709,
864
+ 0.8037239909172058,
865
+ 0.6864684224128723,
866
+ 0.7610327005386353,
867
+ 0.8215873837471008,
868
+ 0.8486534357070923
869
+ ],
870
+ [
871
+ 0.8073843121528625,
872
+ 0.8061873316764832,
873
+ 0.7319211959838867,
874
+ 0.8467031717300415,
875
+ 0.7768716812133789,
876
+ 0.6048685908317566,
877
+ 0.7132378816604614,
878
+ 0.6679729223251343,
879
+ 0.6701217889785767,
880
+ 0.7771828770637512,
881
+ 0.7071925401687622,
882
+ 0.8558918237686157
883
+ ],
884
+ [
885
+ 0.8133878707885742,
886
+ 0.8669012784957886,
887
+ 0.8068772554397583,
888
+ 0.5790890455245972,
889
+ 0.8904383778572083,
890
+ 0.8204380869865417,
891
+ 0.9076582789421082,
892
+ 0.7966066002845764,
893
+ 0.8762456774711609,
894
+ 0.9064305424690247,
895
+ 0.7492377758026123,
896
+ 0.9301468133926392
897
+ ],
898
+ [
899
+ 0.8455430269241333,
900
+ 0.8402767181396484,
901
+ 0.890575110912323,
902
+ 0.7642854452133179,
903
+ 0.7333279252052307,
904
+ 0.7862328290939331,
905
+ 0.8635441660881042,
906
+ 0.6658955812454224,
907
+ 0.888232409954071,
908
+ 0.7337470054626465,
909
+ 0.9097886085510254,
910
+ 0.7254845499992371
911
+ ],
912
+ [
913
+ 0.3025703728199005,
914
+ 0.8144607543945312,
915
+ 0.8962485194206238,
916
+ 0.6487042307853699,
917
+ 0.7963070869445801,
918
+ 0.8672806620597839,
919
+ 0.9231362342834473,
920
+ 0.8210302591323853,
921
+ 0.07466430962085724,
922
+ 0.9117152094841003,
923
+ 0.6209774017333984,
924
+ 0.6903347969055176
925
+ ]
926
+ ],
927
+ "diffuse": [
928
+ [
929
+ 0.5471135377883911,
930
+ 0.1322605162858963,
931
+ 0.492602676153183,
932
+ 0.21496565639972687,
933
+ 0.45495811104774475,
934
+ 0.25727584958076477,
935
+ 0.5676304697990417,
936
+ 0.5459160804748535,
937
+ 0.5383939146995544,
938
+ 0.5441114902496338,
939
+ 0.6075721383094788,
940
+ 0.5915287137031555
941
+ ],
942
+ [
943
+ 0.56114661693573,
944
+ 0.5631774663925171,
945
+ 0.5851024389266968,
946
+ 0.5447676777839661,
947
+ 0.5693410038948059,
948
+ 0.510784924030304,
949
+ 0.4271117150783539,
950
+ 0.48312950134277344,
951
+ 0.5217397212982178,
952
+ 0.4331055283546448,
953
+ 0.60009765625,
954
+ 0.3668949007987976
955
+ ],
956
+ [
957
+ 0.5618427991867065,
958
+ 0.35582321882247925,
959
+ 0.34944772720336914,
960
+ 0.5037699937820435,
961
+ 0.4152102470397949,
962
+ 0.47268810868263245,
963
+ 0.5098887085914612,
964
+ 0.622725248336792,
965
+ 0.47435516119003296,
966
+ 0.48120027780532837,
967
+ 0.6324588060379028,
968
+ 0.4027617573738098
969
+ ],
970
+ [
971
+ 0.17714563012123108,
972
+ 0.4172298312187195,
973
+ 0.42452120780944824,
974
+ 0.31828364729881287,
975
+ 0.18911775946617126,
976
+ 0.38251644372940063,
977
+ 0.5157310366630554,
978
+ 0.4105154871940613,
979
+ 0.41387349367141724,
980
+ 0.4185497760772705,
981
+ 0.40337443351745605,
982
+ 0.4543667733669281
983
+ ],
984
+ [
985
+ 0.3102322220802307,
986
+ 0.38234779238700867,
987
+ 0.3048619031906128,
988
+ 0.4123547673225403,
989
+ 0.3599177300930023,
990
+ 0.34652307629585266,
991
+ 0.447924941778183,
992
+ 0.46825671195983887,
993
+ 0.26102879643440247,
994
+ 0.3940913677215576,
995
+ 0.20296287536621094,
996
+ 0.02204204723238945
997
+ ],
998
+ [
999
+ 0.2029620110988617,
1000
+ 0.08709979802370071,
1001
+ 0.40380486845970154,
1002
+ 0.514489471912384,
1003
+ 0.4261854588985443,
1004
+ 0.1830417364835739,
1005
+ 0.26347407698631287,
1006
+ 0.2405150830745697,
1007
+ 0.2826869487762451,
1008
+ 0.24574777483940125,
1009
+ 0.3901086449623108,
1010
+ 0.3574109673500061
1011
+ ],
1012
+ [
1013
+ 0.46612176299095154,
1014
+ 0.3027900457382202,
1015
+ 0.25536319613456726,
1016
+ 0.3338863253593445,
1017
+ 0.3941308856010437,
1018
+ 0.39528438448905945,
1019
+ 0.3291165232658386,
1020
+ 0.44284719228744507,
1021
+ 0.41498908400535583,
1022
+ 0.12233757972717285,
1023
+ 0.20009461045265198,
1024
+ 0.4175761640071869
1025
+ ],
1026
+ [
1027
+ 0.33120664954185486,
1028
+ 0.1765395551919937,
1029
+ 0.09227706491947174,
1030
+ 0.37284451723098755,
1031
+ 0.2708284258842468,
1032
+ 0.31805992126464844,
1033
+ 0.25206413865089417,
1034
+ 0.21613168716430664,
1035
+ 0.3545899987220764,
1036
+ 0.3042650818824768,
1037
+ 0.14626441895961761,
1038
+ 0.1727096140384674
1039
+ ],
1040
+ [
1041
+ 0.28339847922325134,
1042
+ 0.18787869811058044,
1043
+ 0.36294665932655334,
1044
+ 0.2241670787334442,
1045
+ 0.27335819602012634,
1046
+ 0.4469229280948639,
1047
+ 0.2862758934497833,
1048
+ 0.3158189654350281,
1049
+ 0.3742186725139618,
1050
+ 0.30465927720069885,
1051
+ 0.38407495617866516,
1052
+ 0.21899032592773438
1053
+ ],
1054
+ [
1055
+ 0.23532763123512268,
1056
+ 0.16719377040863037,
1057
+ 0.2597936987876892,
1058
+ 0.4364214539527893,
1059
+ 0.17044395208358765,
1060
+ 0.2712015211582184,
1061
+ 0.13269579410552979,
1062
+ 0.2855920195579529,
1063
+ 0.18635967373847961,
1064
+ 0.1326359510421753,
1065
+ 0.29712045192718506,
1066
+ 0.11560585349798203
1067
+ ],
1068
+ [
1069
+ 0.2210322916507721,
1070
+ 0.19647784531116486,
1071
+ 0.17695878446102142,
1072
+ 0.29771047830581665,
1073
+ 0.33597418665885925,
1074
+ 0.2783747613430023,
1075
+ 0.19375105202198029,
1076
+ 0.3423268496990204,
1077
+ 0.16622166335582733,
1078
+ 0.3245820999145508,
1079
+ 0.1462937742471695,
1080
+ 0.2878214716911316
1081
+ ],
1082
+ [
1083
+ 0.6131904721260071,
1084
+ 0.2794114649295807,
1085
+ 0.18150922656059265,
1086
+ 0.42593449354171753,
1087
+ 0.31345874071121216,
1088
+ 0.20985659956932068,
1089
+ 0.14243251085281372,
1090
+ 0.2698703110218048,
1091
+ 0.5045338869094849,
1092
+ 0.15346872806549072,
1093
+ 0.4387816786766052,
1094
+ 0.3756435811519623
1095
+ ]
1096
+ ]
1097
+ }
1098
+ }
1099
+ }
utils/head_detection.py CHANGED
@@ -1,313 +1,256 @@
1
  """
2
  Attention Head Detection and Categorization
3
 
4
- Implements heuristics to categorize attention heads into:
5
- - Previous-Token Heads: high attention on previous token
6
- - First/Positional Heads: high attention on first token or positional patterns
7
- - Bag-of-Words Heads: diffuse attention on content tokens
8
- - Syntactic Heads: dependency-like patterns
 
 
 
 
9
  - Other: heads that don't fit the above categories
10
  """
11
 
 
 
12
  import torch
13
  import numpy as np
14
  from typing import Dict, List, Tuple, Optional, Any
15
  import re
 
16
 
17
 
18
- class HeadCategorizationConfig:
19
- """
20
- Configuration for attention head categorization heuristics.
21
-
22
- These thresholds are tuned to balance sensitivity (catching relevant patterns)
23
- with specificity (avoiding false positives) for educational purposes.
24
- """
25
-
26
- def __init__(self):
27
- # Previous-token head thresholds
28
- # Heads that primarily attend to the immediately preceding token
29
- self.prev_token_threshold = 0.4 # Minimum avg attention to prev token (40%)
30
- self.prev_token_diagonal_offset = 1 # Check i → i-1 pattern
31
-
32
- # First/Positional head thresholds
33
- # Heads that attend strongly to first token or show positional patterns
34
- self.first_token_threshold = 0.25 # Minimum avg attention to first token (25%)
35
- self.positional_pattern_threshold = 0.4 # For detecting positional patterns
36
-
37
- # Bag-of-words head thresholds
38
- # Heads with diffuse attention across many tokens
39
- self.bow_entropy_threshold = 0.65 # Minimum entropy (normalized, 0-1 scale)
40
- self.bow_max_attention_threshold = 0.35 # Maximum attention to any single token
41
-
42
- # Syntactic head thresholds
43
- # Heads showing structured distance patterns (e.g., subject-verb)
44
- self.syntactic_distance_pattern_threshold = 0.3 # For detecting distance patterns
45
-
46
- # General thresholds
47
- self.min_seq_len = 4 # Minimum sequence length for reliable detection
48
 
 
 
49
 
50
- def compute_attention_entropy(attention_weights: torch.Tensor) -> float:
 
51
  """
52
- Compute normalized entropy of attention distribution.
53
 
54
  Args:
55
- attention_weights: [seq_len] tensor of attention weights for a position
56
 
57
  Returns:
58
- Normalized entropy (0 to 1)
59
- """
60
- # Avoid log(0) by adding small epsilon
61
- epsilon = 1e-10
62
- weights = attention_weights + epsilon
63
-
64
- # Compute entropy: -sum(p * log(p))
65
- entropy = -torch.sum(weights * torch.log(weights))
66
-
67
- # Normalize by max entropy (log(n) where n is sequence length)
68
- max_entropy = np.log(len(weights))
69
- normalized_entropy = entropy / max_entropy if max_entropy > 0 else 0
70
-
71
- return normalized_entropy.item()
72
-
73
-
74
- def detect_previous_token_head(attention_matrix: torch.Tensor, config: HeadCategorizationConfig) -> Tuple[bool, float]:
75
  """
76
- Detect if head shows strong previous-token pattern (i → i-1).
77
 
78
- Args:
79
- attention_matrix: [seq_len, seq_len] attention weights
80
- config: Configuration object
81
 
82
- Returns:
83
- (is_prev_token_head, score) where score is avg attention to previous token
84
- """
85
- seq_len = attention_matrix.shape[0]
86
 
87
- if seq_len < config.min_seq_len:
88
- return False, 0.0
 
 
 
89
 
90
- # Extract the diagonal offset by 1 (i → i-1 pattern)
91
- # For each position i > 0, check attention to position i-1
92
- prev_token_attentions = []
93
- for i in range(1, seq_len):
94
- prev_token_attentions.append(attention_matrix[i, i-1].item())
 
95
 
96
- avg_prev_attention = np.mean(prev_token_attentions)
97
- is_prev_token_head = avg_prev_attention >= config.prev_token_threshold
98
 
99
- return is_prev_token_head, avg_prev_attention
 
100
 
 
 
 
 
101
 
102
- def detect_first_token_head(attention_matrix: torch.Tensor, config: HeadCategorizationConfig) -> Tuple[bool, float]:
 
103
  """
104
- Detect if head shows strong attention to first token(s) or positional patterns.
105
 
106
  Args:
107
- attention_matrix: [seq_len, seq_len] attention weights
108
- config: Configuration object
109
 
110
  Returns:
111
- (is_first_token_head, score) where score is avg attention to first token
112
  """
113
- seq_len = attention_matrix.shape[0]
114
-
115
- if seq_len < config.min_seq_len:
116
- return False, 0.0
117
-
118
- # Check average attention to first token across all positions
119
- first_token_attention = attention_matrix[:, 0].mean().item()
120
- is_first_token_head = first_token_attention >= config.first_token_threshold
121
-
122
- return is_first_token_head, first_token_attention
123
 
124
 
125
- def detect_bow_head(attention_matrix: torch.Tensor, config: HeadCategorizationConfig) -> Tuple[bool, float]:
126
  """
127
- Detect if head shows bag-of-words pattern (diffuse attention).
128
 
129
  Args:
130
- attention_matrix: [seq_len, seq_len] attention weights
131
- config: Configuration object
132
 
133
  Returns:
134
- (is_bow_head, score) where score is average entropy
135
  """
136
- seq_len = attention_matrix.shape[0]
137
-
138
- if seq_len < config.min_seq_len:
139
- return False, 0.0
140
-
141
- # Compute entropy for each position's attention distribution
142
- entropies = []
143
- max_attentions = []
144
-
145
- for i in range(seq_len):
146
- entropy = compute_attention_entropy(attention_matrix[i])
147
- max_attention = attention_matrix[i].max().item()
148
-
149
- entropies.append(entropy)
150
- max_attentions.append(max_attention)
151
-
152
- avg_entropy = np.mean(entropies)
153
- avg_max_attention = np.mean(max_attentions)
154
-
155
- # BoW heads have high entropy and low max attention (diffuse)
156
- is_bow_head = (avg_entropy >= config.bow_entropy_threshold and
157
- avg_max_attention <= config.bow_max_attention_threshold)
158
-
159
- return is_bow_head, avg_entropy
160
 
161
 
162
- def detect_syntactic_head(attention_matrix: torch.Tensor, config: HeadCategorizationConfig) -> Tuple[bool, float]:
 
 
 
 
163
  """
164
- Detect if head shows syntactic/dependency-like patterns.
165
-
166
- This is a simplified heuristic based on consistent distance patterns.
167
 
168
  Args:
169
- attention_matrix: [seq_len, seq_len] attention weights
170
- config: Configuration object
 
171
 
172
  Returns:
173
- (is_syntactic_head, score) where score is pattern consistency
174
  """
175
- seq_len = attention_matrix.shape[0]
176
-
177
- if seq_len < config.min_seq_len:
178
- return False, 0.0
179
-
180
- # Check for consistent distance patterns (e.g., attending to tokens at fixed distances)
181
- # This is a simplified approach; more sophisticated syntactic detection would
182
- # require parsing or linguistic features
183
-
184
- distance_scores = []
185
-
186
- for i in range(seq_len):
187
- # For each position, find the most attended position
188
- max_idx = torch.argmax(attention_matrix[i]).item()
189
- distance = abs(i - max_idx)
 
 
 
 
190
 
191
- # Collect distances (excluding self-attention at distance 0)
192
- if distance > 0:
193
- distance_scores.append(distance)
194
-
195
- if not distance_scores:
196
- return False, 0.0
197
-
198
- # Check if there's a consistent distance pattern
199
- # (simple version: low variance in distances)
200
- distance_variance = np.var(distance_scores)
201
- distance_mean = np.mean(distance_scores)
202
 
203
- # Syntactic heads often have moderate, consistent distances
204
- # (not too short like prev-token, not too diffuse like BoW)
205
- pattern_score = 1.0 / (1.0 + distance_variance) if distance_mean > 1 else 0.0
206
- is_syntactic_head = pattern_score >= config.syntactic_distance_pattern_threshold
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
- return is_syntactic_head, pattern_score
209
-
210
-
211
- def categorize_attention_head(attention_matrix: torch.Tensor,
212
- layer_idx: int,
213
- head_idx: int,
214
- config: Optional[HeadCategorizationConfig] = None) -> Dict[str, Any]:
215
- """
216
- Categorize a single attention head based on its attention pattern.
217
 
218
- Args:
219
- attention_matrix: [seq_len, seq_len] attention weights for this head
220
- layer_idx: Layer index
221
- head_idx: Head index within the layer
222
- config: Configuration object (uses defaults if None)
 
223
 
224
- Returns:
225
- Dictionary with categorization results:
226
- {
227
- 'layer': layer_idx,
228
- 'head': head_idx,
229
- 'category': str (one of: 'previous_token', 'first_token', 'bow', 'syntactic', 'other'),
230
- 'scores': dict of scores for each category,
231
- 'label': formatted label like "L{layer}-H{head}"
232
- }
233
- """
234
- if config is None:
235
- config = HeadCategorizationConfig()
236
-
237
- # Run all detection heuristics
238
- is_prev, prev_score = detect_previous_token_head(attention_matrix, config)
239
- is_first, first_score = detect_first_token_head(attention_matrix, config)
240
- is_bow, bow_score = detect_bow_head(attention_matrix, config)
241
- is_syn, syn_score = detect_syntactic_head(attention_matrix, config)
242
-
243
- # Assign category based on highest-scoring pattern
244
- # Priority: previous_token > first_token > bow > syntactic > other
245
- scores = {
246
- 'previous_token': prev_score if is_prev else 0.0,
247
- 'first_token': first_score if is_first else 0.0,
248
- 'bow': bow_score if is_bow else 0.0,
249
- 'syntactic': syn_score if is_syn else 0.0
250
- }
251
-
252
- # Determine primary category
253
- if is_prev:
254
- category = 'previous_token'
255
- elif is_first:
256
- category = 'first_token'
257
- elif is_bow:
258
- category = 'bow'
259
- elif is_syn:
260
- category = 'syntactic'
261
  else:
262
- category = 'other'
263
-
264
- return {
265
- 'layer': layer_idx,
266
- 'head': head_idx,
267
- 'category': category,
268
- 'scores': scores,
269
- 'label': f"L{layer_idx}-H{head_idx}"
270
- }
271
 
272
 
273
- def categorize_all_heads(activation_data: Dict[str, Any],
274
- config: Optional[HeadCategorizationConfig] = None) -> Dict[str, List[Dict[str, Any]]]:
 
 
275
  """
276
- Categorize all attention heads in the model.
 
277
 
278
  Args:
279
  activation_data: Output from execute_forward_pass with attention data
280
- config: Configuration object (uses defaults if None)
281
 
282
  Returns:
283
- Dictionary mapping category names to lists of head info dicts:
284
  {
285
- 'previous_token': [...],
286
- 'first_token': [...],
287
- 'bow': [...],
288
- 'syntactic': [...],
289
- 'other': [...]
 
 
 
 
 
 
 
 
 
 
 
 
290
  }
 
291
  """
292
- if config is None:
293
- config = HeadCategorizationConfig()
294
-
295
- # Initialize result dict
296
- categorized = {
297
- 'previous_token': [],
298
- 'first_token': [],
299
- 'bow': [],
300
- 'syntactic': [],
301
- 'other': []
302
- }
303
 
 
304
  attention_outputs = activation_data.get('attention_outputs', {})
305
- if not attention_outputs:
306
- return categorized
 
 
 
 
 
307
 
308
- # Process each layer's attention
309
  for module_name, output_dict in attention_outputs.items():
310
- # Extract layer number from module name
311
  numbers = re.findall(r'\d+', module_name)
312
  if not numbers:
313
  continue
@@ -318,153 +261,82 @@ def categorize_all_heads(activation_data: Dict[str, Any],
318
  if not isinstance(attention_output, list) or len(attention_output) < 2:
319
  continue
320
 
321
- # Get attention weights: [batch, heads, seq_len, seq_len]
322
  attention_weights = torch.tensor(attention_output[1])
323
-
324
- # Process each head
325
  num_heads = attention_weights.shape[1]
326
- seq_len = attention_weights.shape[2]
327
-
328
- if seq_len < config.min_seq_len:
329
- continue
330
 
331
  for head_idx in range(num_heads):
332
- # Extract attention matrix for this head: [seq_len, seq_len]
333
- head_attention = attention_weights[0, head_idx, :, :]
334
-
335
- # Categorize this head
336
- head_info = categorize_attention_head(head_attention, layer_idx, head_idx, config)
337
-
338
- # Add to appropriate category list
339
- category = head_info['category']
340
- categorized[category].append(head_info)
341
 
342
- return categorized
343
-
344
-
345
- def categorize_single_layer_heads(activation_data: Dict[str, Any],
346
- layer_num: int,
347
- config: Optional[HeadCategorizationConfig] = None) -> Dict[str, List[Dict[str, Any]]]:
348
- """
349
- Categorize attention heads for a single layer.
350
 
351
- Args:
352
- activation_data: Output from execute_forward_pass with attention data
353
- layer_num: The specific layer number to categorize
354
- config: Configuration object (uses defaults if None)
355
-
356
- Returns:
357
- Dictionary mapping category names to lists of head info dicts for this layer only:
358
- {
359
- 'previous_token': [...],
360
- 'first_token': [...],
361
- 'bow': [...],
362
- 'syntactic': [...],
363
- 'other': [...]
364
- }
365
- """
366
- if config is None:
367
- config = HeadCategorizationConfig()
368
-
369
- # Initialize result dict
370
- categorized = {
371
- 'previous_token': [],
372
- 'first_token': [],
373
- 'bow': [],
374
- 'syntactic': [],
375
- 'other': []
376
  }
377
 
378
- attention_outputs = activation_data.get('attention_outputs', {})
379
- if not attention_outputs:
380
- return categorized
381
 
382
- # Find the attention output for the requested layer
383
- target_module = None
384
- for module_name, output_dict in attention_outputs.items():
385
- # Extract layer number from module name
386
- numbers = re.findall(r'\d+', module_name)
387
- if not numbers:
388
  continue
389
 
390
- if int(numbers[0]) == layer_num:
391
- target_module = module_name
392
- break
393
-
394
- if not target_module:
395
- return categorized
396
-
397
- output_dict = attention_outputs[target_module]
398
- attention_output = output_dict.get('output')
399
-
400
- if not isinstance(attention_output, list) or len(attention_output) < 2:
401
- return categorized
402
-
403
- # Get attention weights: [batch, heads, seq_len, seq_len]
404
- attention_weights = torch.tensor(attention_output[1])
405
-
406
- # Process each head
407
- num_heads = attention_weights.shape[1]
408
- seq_len = attention_weights.shape[2]
409
-
410
- if seq_len < config.min_seq_len:
411
- return categorized
412
-
413
- for head_idx in range(num_heads):
414
- # Extract attention matrix for this head: [seq_len, seq_len]
415
- head_attention = attention_weights[0, head_idx, :, :]
416
 
417
- # Categorize this head
418
- head_info = categorize_attention_head(head_attention, layer_num, head_idx, config)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
419
 
420
- # Add to appropriate category list
421
- category = head_info['category']
422
- categorized[category].append(head_info)
423
-
424
- return categorized
425
-
426
-
427
- def format_categorization_summary(categorized_heads: Dict[str, List[Dict[str, Any]]]) -> str:
428
- """
429
- Format categorization results as a human-readable summary.
430
-
431
- Args:
432
- categorized_heads: Output from categorize_all_heads or categorize_single_layer_heads
433
 
434
- Returns:
435
- Formatted string summary
436
- """
437
- category_names = {
438
- 'previous_token': 'Previous-Token Heads',
439
- 'first_token': 'First/Positional-Token Heads',
440
- 'bow': 'Bag-of-Words Heads',
441
- 'syntactic': 'Syntactic Heads',
442
- 'other': 'Other Heads'
 
443
  }
444
 
445
- summary = []
446
- total_heads = sum(len(heads) for heads in categorized_heads.values())
447
-
448
- summary.append(f"Total Heads: {total_heads}\n")
449
- summary.append("=" * 60)
450
-
451
- for category, display_name in category_names.items():
452
- heads = categorized_heads.get(category, [])
453
- summary.append(f"\n{display_name}: {len(heads)} heads")
454
-
455
- if heads:
456
- # Group by layer
457
- heads_by_layer = {}
458
- for head_info in heads:
459
- layer = head_info['layer']
460
- if layer not in heads_by_layer:
461
- heads_by_layer[layer] = []
462
- heads_by_layer[layer].append(head_info['head'])
463
-
464
- # Format by layer
465
- for layer in sorted(heads_by_layer.keys()):
466
- head_indices = sorted(heads_by_layer[layer])
467
- summary.append(f" Layer {layer}: Heads {head_indices}")
468
-
469
- return "\n".join(summary)
470
-
 
1
  """
2
  Attention Head Detection and Categorization
3
 
4
+ Loads pre-computed head category data from JSON (produced by scripts/analyze_heads.py)
5
+ and performs lightweight runtime verification of head activation on the current input.
6
+
7
+ Categories:
8
+ - Previous Token: attends to the immediately preceding token
9
+ - Induction: completes repeated patterns ([A][B]...[A] → [B])
10
+ - Duplicate Token: attends to earlier occurrences of the same token
11
+ - Positional / First-Token: attends to the first token or positional patterns
12
+ - Diffuse / Spread: high-entropy, evenly distributed attention
13
  - Other: heads that don't fit the above categories
14
  """
15
 
16
+ import json
17
+ import os
18
  import torch
19
  import numpy as np
20
  from typing import Dict, List, Tuple, Optional, Any
21
  import re
22
+ from pathlib import Path
23
 
24
 
25
# Location of the pre-computed head category JSON (written by scripts/analyze_heads.py)
_JSON_PATH = Path(__file__).parent / "head_categories.json"

# Per-model cache of parsed category data so the JSON is read at most once per model
_category_cache: Dict[str, Any] = {}


def load_head_categories(model_name: str) -> Optional[Dict[str, Any]]:
    """
    Return the pre-computed head category data for *model_name*, or None.

    Results are memoized in ``_category_cache``, so the JSON file on disk is
    consulted only on a cache miss. Lookup tries the exact model name first,
    then the short name after the final "/" (e.g. "gpt2" for
    "openai-community/gpt2").

    Args:
        model_name: HuggingFace model name (e.g., "gpt2", "EleutherAI/pythia-70m")

    Returns:
        Dict with the model's category data, or None if the model was not
        analyzed or the JSON file is missing/unreadable.
        Structure: {
            "model_name": str,
            "num_layers": int,
            "num_heads": int,
            "categories": { category_name: { "top_heads": [...], ... } },
            ...
        }
    """
    # Only non-None entries are ever cached, so a .get() miss is unambiguous.
    cached = _category_cache.get(model_name)
    if cached is not None:
        return cached

    if not _JSON_PATH.exists():
        return None

    try:
        with open(_JSON_PATH, 'r') as f:
            all_data = json.load(f)
    except (json.JSONDecodeError, IOError):
        return None

    # Exact name wins; otherwise fall back to the repo-less short name.
    # split('/')[-1] is the whole string when there is no "/", so no branch needed.
    short_name = model_name.split('/')[-1]
    model_data = all_data.get(model_name, all_data.get(short_name))

    if model_data is not None:
        _category_cache[model_name] = model_data

    return model_data
76
+
77
 
78
def clear_category_cache():
    """Reset the module-level head-category cache (primarily for tests)."""
    global _category_cache
    _category_cache = {}
82
 
83
+
84
+ def _compute_attention_entropy(attention_weights: torch.Tensor) -> float:
85
  """
86
+ Compute normalized entropy of an attention distribution.
87
 
88
  Args:
89
+ attention_weights: [seq_len] tensor of attention weights for one position
 
90
 
91
  Returns:
92
+ Normalized entropy (0.0 to 1.0). 1.0 = perfectly uniform, 0.0 = fully peaked.
93
  """
94
+ epsilon = 1e-10
95
+ weights = attention_weights + epsilon
96
+ entropy = -torch.sum(weights * torch.log(weights))
97
+ max_entropy = np.log(len(weights))
98
+ return (entropy / max_entropy).item() if max_entropy > 0 else 0.0
 
 
 
 
 
99
 
100
 
101
+ def _find_repeated_tokens(token_ids: List[int]) -> Dict[int, List[int]]:
102
  """
103
+ Find tokens that appear more than once and their positions.
104
 
105
  Args:
106
+ token_ids: List of token IDs in the sequence
 
107
 
108
  Returns:
109
+ Dict mapping token_id -> list of positions where it appears (only for repeated tokens)
110
  """
111
+ positions: Dict[int, List[int]] = {}
112
+ for i, tid in enumerate(token_ids):
113
+ if tid not in positions:
114
+ positions[tid] = []
115
+ positions[tid].append(i)
116
+
117
+ # Keep only tokens that appear more than once
118
+ return {tid: pos_list for tid, pos_list in positions.items() if len(pos_list) > 1}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
 
121
def verify_head_activation(
    attn_matrix: torch.Tensor,
    token_ids: List[int],
    category: str
) -> float:
    """
    Verify whether a head's known role is active on the current input.

    Args:
        attn_matrix: [seq_len, seq_len] attention weights for this head
        token_ids: List of token IDs in the input
        category: Category name (previous_token, induction, duplicate_token,
            positional, diffuse)

    Returns:
        Activation score (0.0 to 1.0). 0.0 means the role is not triggered on
        this input (or the category name is unknown).
    """
    seq_len = attn_matrix.shape[0]

    if seq_len < 2:
        return 0.0

    if category == "previous_token":
        # Mean of the sub-diagonal: how much each position attends to its predecessor.
        prev_token_attentions = [attn_matrix[i, i - 1].item() for i in range(1, seq_len)]
        return float(np.mean(prev_token_attentions)) if prev_token_attentions else 0.0

    elif category == "induction":
        # Induction pattern: [A][B]...[A] → attend to [B].
        # For each repeated token at position i where token[i]==token[j] (j < i),
        # check if position i attends to position j+1.
        repeated = _find_repeated_tokens(token_ids)
        if not repeated:
            return 0.0  # No repetition → the role cannot fire on this input

        induction_scores = []
        for positions in repeated.values():
            for k in range(1, len(positions)):
                current_pos = positions[k]  # Later occurrence
                for prev_idx in range(k):
                    prev_pos = positions[prev_idx]  # Earlier occurrence
                    target_pos = prev_pos + 1  # The token AFTER the earlier occurrence
                    # Bounds guard: token_ids may be longer than the attention
                    # matrix, so skip positions outside it.
                    if target_pos < seq_len and current_pos < seq_len:
                        induction_scores.append(attn_matrix[current_pos, target_pos].item())

        return float(np.mean(induction_scores)) if induction_scores else 0.0

    elif category == "duplicate_token":
        # Check if later occurrences attend to earlier occurrences of the same token.
        repeated = _find_repeated_tokens(token_ids)
        if not repeated:
            return 0.0  # No duplicates → gray out

        dup_scores = []
        for positions in repeated.values():
            for k in range(1, len(positions)):
                later_pos = positions[k]
                # Same bounds guard as the induction branch: skip positions
                # outside the attention matrix instead of raising IndexError
                # when token_ids is longer than seq_len.
                if later_pos >= seq_len:
                    continue
                # Sum attention to all earlier occurrences (all earlier
                # positions are < later_pos, hence in range).
                earlier_attention = sum(
                    attn_matrix[later_pos, positions[j]].item()
                    for j in range(k)
                )
                dup_scores.append(earlier_attention)

        return float(np.mean(dup_scores)) if dup_scores else 0.0

    elif category == "positional":
        # Mean of column-0 attention (how much each position attends to the first token).
        return attn_matrix[:, 0].mean().item()

    elif category == "diffuse":
        # Average normalized entropy across all positions.
        entropies = [_compute_attention_entropy(attn_matrix[i]) for i in range(seq_len)]
        return float(np.mean(entropies)) if entropies else 0.0

    else:
        # Unknown category name → no score.
        return 0.0
 
 
 
 
 
 
 
 
202
 
203
 
204
+ def get_active_head_summary(
205
+ activation_data: Dict[str, Any],
206
+ model_name: str
207
+ ) -> Optional[Dict[str, Any]]:
208
  """
209
+ Main entry point: load categories from JSON, verify each head on the current input,
210
+ and return a UI-ready structure.
211
 
212
  Args:
213
  activation_data: Output from execute_forward_pass with attention data
214
+ model_name: HuggingFace model name
215
 
216
  Returns:
217
+ Dict with structure:
218
  {
219
+ "model_available": True,
220
+ "categories": {
221
+ "previous_token": {
222
+ "display_name": str,
223
+ "description": str,
224
+ "educational_text": str,
225
+ "icon": str,
226
+ "requires_repetition": bool,
227
+ "suggested_prompt": str or None,
228
+ "is_applicable": bool, # False if requires_repetition but no repeats
229
+ "heads": [
230
+ {"layer": int, "head": int, "offline_score": float,
231
+ "activation_score": float, "is_active": bool, "label": str}
232
+ ]
233
+ },
234
+ ...
235
+ }
236
  }
237
+ Returns None if model not in JSON.
238
  """
239
+ model_data = load_head_categories(model_name)
240
+ if model_data is None:
241
+ return None
 
 
 
 
 
 
 
 
242
 
243
+ # Extract attention weights and token IDs from activation data
244
  attention_outputs = activation_data.get('attention_outputs', {})
245
+ input_ids = activation_data.get('input_ids', [[]])[0]
246
+
247
+ if not attention_outputs or not input_ids:
248
+ return None
249
+
250
+ # Build a lookup: (layer, head) → attention_matrix [seq_len, seq_len]
251
+ head_attention_lookup: Dict[Tuple[int, int], torch.Tensor] = {}
252
 
 
253
  for module_name, output_dict in attention_outputs.items():
 
254
  numbers = re.findall(r'\d+', module_name)
255
  if not numbers:
256
  continue
 
261
  if not isinstance(attention_output, list) or len(attention_output) < 2:
262
  continue
263
 
264
+ # attention_output[1] is [batch, heads, seq_len, seq_len]
265
  attention_weights = torch.tensor(attention_output[1])
 
 
266
  num_heads = attention_weights.shape[1]
 
 
 
 
267
 
268
  for head_idx in range(num_heads):
269
+ head_attention_lookup[(layer_idx, head_idx)] = attention_weights[0, head_idx, :, :]
 
 
 
 
 
 
 
 
270
 
271
+ # Check if input has repeated tokens (needed for applicability check)
272
+ repeated_tokens = _find_repeated_tokens(input_ids)
273
+ has_repetition = len(repeated_tokens) > 0
 
 
 
 
 
274
 
275
+ # Build result
276
+ result = {
277
+ "model_available": True,
278
+ "categories": {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  }
280
 
281
+ categories = model_data.get("categories", {})
 
 
282
 
283
+ # Define category order for consistent display
284
+ category_order = ["previous_token", "induction", "duplicate_token", "positional", "diffuse"]
285
+
286
+ for cat_key in category_order:
287
+ cat_info = categories.get(cat_key)
288
+ if cat_info is None:
289
  continue
290
 
291
+ requires_repetition = cat_info.get("requires_repetition", False)
292
+ is_applicable = not requires_repetition or has_repetition
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
 
294
+ heads_result = []
295
+ for head_entry in cat_info.get("top_heads", []):
296
+ layer = head_entry["layer"]
297
+ head = head_entry["head"]
298
+ offline_score = head_entry["score"]
299
+
300
+ # Get activation score on current input
301
+ attn_matrix = head_attention_lookup.get((layer, head))
302
+ if attn_matrix is not None and is_applicable:
303
+ activation_score = verify_head_activation(attn_matrix, input_ids, cat_key)
304
+ else:
305
+ activation_score = 0.0
306
+
307
+ # A head is "active" if its activation score exceeds a minimum threshold
308
+ is_active = activation_score > 0.1 and is_applicable
309
+
310
+ heads_result.append({
311
+ "layer": layer,
312
+ "head": head,
313
+ "offline_score": offline_score,
314
+ "activation_score": round(activation_score, 3),
315
+ "is_active": is_active,
316
+ "label": f"L{layer}-H{head}"
317
+ })
318
 
319
+ result["categories"][cat_key] = {
320
+ "display_name": cat_info.get("display_name", cat_key),
321
+ "description": cat_info.get("description", ""),
322
+ "educational_text": cat_info.get("educational_text", ""),
323
+ "icon": cat_info.get("icon", "circle"),
324
+ "requires_repetition": requires_repetition,
325
+ "suggested_prompt": cat_info.get("suggested_prompt"),
326
+ "is_applicable": is_applicable,
327
+ "heads": heads_result
328
+ }
 
 
 
329
 
330
+ # Add "Other" category (heads not claimed by any top list)
331
+ result["categories"]["other"] = {
332
+ "display_name": "Other / Unclassified",
333
+ "description": "Heads whose patterns don't fit the simple categories above",
334
+ "educational_text": "This head's pattern doesn't fit our simple categories — it may be doing something more complex or context-dependent.",
335
+ "icon": "question-circle",
336
+ "requires_repetition": False,
337
+ "suggested_prompt": None,
338
+ "is_applicable": True,
339
+ "heads": [] # We don't enumerate all "other" heads to keep the UI clean
340
  }
341
 
342
+ return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/model_patterns.py CHANGED
@@ -1421,23 +1421,4 @@ def generate_bertviz_html(activation_data: Dict[str, Any], layer_index: int, vie
1421
  return f"<p>Error generating visualization: {str(e)}</p>"
1422
 
1423
 
1424
- def get_head_category_counts(activation_data: Dict[str, Any]) -> Dict[str, int]:
1425
- """
1426
- Get counts of attention heads in each category.
1427
-
1428
- Useful for UI display showing the distribution of head types.
1429
-
1430
- Args:
1431
- activation_data: Output from execute_forward_pass with attention data
1432
-
1433
- Returns:
1434
- Dict mapping category name to count of heads in that category
1435
- """
1436
- from .head_detection import categorize_all_heads
1437
-
1438
- try:
1439
- categories = categorize_all_heads(activation_data)
1440
- return {category: len(heads) for category, heads in categories.items()}
1441
- except Exception as e:
1442
- print(f"Warning: Could not categorize heads: {e}")
1443
- return {}
 
1421
  return f"<p>Error generating visualization: {str(e)}</p>"
1422
 
1423
 
1424
+