Spaces:
Sleeping
Sleeping
gary-boon
Claude Opus 4.5
committed on
Commit
·
d1d37a8
1
Parent(s):
929ba88
fix: add QKV extraction support for Mistral/Devstral architecture
Browse files- Add make_separate_proj_hook for models with separate q_proj, k_proj, v_proj
- Handle GQA (Grouped Query Attention) by expanding K/V heads to match Q heads
- Support both CodeGen (combined qkv_proj) and Mistral (separate projections)
- Update both streaming and non-streaming endpoints
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- backend/model_service.py +128 -17
backend/model_service.py
CHANGED
|
@@ -1644,42 +1644,89 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
|
|
| 1644 |
qkv_captures = {}
|
| 1645 |
hooks = []
|
| 1646 |
|
| 1647 |
-
|
|
|
|
| 1648 |
def hook(module, input, output):
|
| 1649 |
try:
|
| 1650 |
-
# output shape: [batch, seq_len, 3 * hidden_size]
|
| 1651 |
-
# Split into Q, K, V
|
| 1652 |
if output.dim() != 3:
|
| 1653 |
-
return
|
| 1654 |
batch_size, seq_len, hidden = output.shape
|
| 1655 |
expected_hidden = 3 * n_heads * head_dim
|
| 1656 |
if hidden != expected_hidden:
|
| 1657 |
-
return
|
| 1658 |
qkv = output.reshape(batch_size, seq_len, 3, n_heads, head_dim)
|
| 1659 |
-
# Separate Q, K, V: [batch, seq_len, n_heads, head_dim]
|
| 1660 |
q, k, v = qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2]
|
| 1661 |
qkv_captures[layer_idx] = {
|
| 1662 |
-
'q': q[0].detach().cpu(),
|
| 1663 |
'k': k[0].detach().cpu(),
|
| 1664 |
'v': v[0].detach().cpu()
|
| 1665 |
}
|
| 1666 |
except Exception:
|
| 1667 |
-
# Silently skip QKV capture if it fails - it's optional data
|
| 1668 |
pass
|
| 1669 |
return hook
|
| 1670 |
|
| 1671 |
-
#
|
| 1672 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1673 |
try:
|
|
|
|
| 1674 |
if hasattr(manager.model, 'transformer') and hasattr(manager.model.transformer, 'h'):
|
| 1675 |
for layer_idx, layer in enumerate(manager.model.transformer.h):
|
| 1676 |
if hasattr(layer, 'attn') and hasattr(layer.attn, 'qkv_proj'):
|
| 1677 |
-
hook = layer.attn.qkv_proj.register_forward_hook(make_qkv_hook(layer_idx))
|
| 1678 |
hooks.append(hook)
|
| 1679 |
elif hasattr(layer, 'attn') and hasattr(layer.attn, 'c_attn'):
|
| 1680 |
-
|
| 1681 |
-
hook = layer.attn.c_attn.register_forward_hook(make_qkv_hook(layer_idx))
|
| 1682 |
hooks.append(hook)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1683 |
except Exception as hook_error:
|
| 1684 |
logger.warning(f"Could not register QKV hooks: {hook_error}")
|
| 1685 |
|
|
@@ -2116,7 +2163,8 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
|
|
| 2116 |
qkv_captures = {}
|
| 2117 |
hooks = []
|
| 2118 |
|
| 2119 |
-
|
|
|
|
| 2120 |
def hook(module, input, output):
|
| 2121 |
try:
|
| 2122 |
if output.dim() != 3:
|
|
@@ -2136,16 +2184,79 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
|
|
| 2136 |
pass
|
| 2137 |
return hook
|
| 2138 |
|
| 2139 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2140 |
try:
|
|
|
|
| 2141 |
if hasattr(manager.model, 'transformer') and hasattr(manager.model.transformer, 'h'):
|
| 2142 |
for layer_idx, layer in enumerate(manager.model.transformer.h):
|
| 2143 |
if hasattr(layer, 'attn') and hasattr(layer.attn, 'qkv_proj'):
|
| 2144 |
-
hook = layer.attn.qkv_proj.register_forward_hook(make_qkv_hook(layer_idx))
|
| 2145 |
hooks.append(hook)
|
| 2146 |
elif hasattr(layer, 'attn') and hasattr(layer.attn, 'c_attn'):
|
| 2147 |
-
hook = layer.attn.c_attn.register_forward_hook(make_qkv_hook(layer_idx))
|
| 2148 |
hooks.append(hook)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2149 |
except Exception as hook_error:
|
| 2150 |
logger.warning(f"Could not register QKV hooks: {hook_error}")
|
| 2151 |
|
|
|
|
| 1644 |
qkv_captures = {}
|
| 1645 |
hooks = []
|
| 1646 |
|
| 1647 |
+
# Hook for combined QKV projection (CodeGen style)
|
| 1648 |
+
def make_combined_qkv_hook(layer_idx):
|
| 1649 |
def hook(module, input, output):
|
| 1650 |
try:
|
|
|
|
|
|
|
| 1651 |
if output.dim() != 3:
|
| 1652 |
+
return
|
| 1653 |
batch_size, seq_len, hidden = output.shape
|
| 1654 |
expected_hidden = 3 * n_heads * head_dim
|
| 1655 |
if hidden != expected_hidden:
|
| 1656 |
+
return
|
| 1657 |
qkv = output.reshape(batch_size, seq_len, 3, n_heads, head_dim)
|
|
|
|
| 1658 |
q, k, v = qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2]
|
| 1659 |
qkv_captures[layer_idx] = {
|
| 1660 |
+
'q': q[0].detach().cpu(),
|
| 1661 |
'k': k[0].detach().cpu(),
|
| 1662 |
'v': v[0].detach().cpu()
|
| 1663 |
}
|
| 1664 |
except Exception:
|
|
|
|
| 1665 |
pass
|
| 1666 |
return hook
|
| 1667 |
|
| 1668 |
+
# Hooks for separate Q, K, V projections (Mistral/LLaMA style)
|
| 1669 |
+
def make_separate_proj_hook(layer_idx, proj_type, num_kv_heads=None):
|
| 1670 |
+
def hook(module, input, output):
|
| 1671 |
+
try:
|
| 1672 |
+
if output.dim() != 3:
|
| 1673 |
+
return
|
| 1674 |
+
batch_size, seq_len, hidden = output.shape
|
| 1675 |
+
|
| 1676 |
+
if proj_type == 'q':
|
| 1677 |
+
proj_heads = n_heads
|
| 1678 |
+
else:
|
| 1679 |
+
proj_heads = num_kv_heads if num_kv_heads else n_heads
|
| 1680 |
+
|
| 1681 |
+
proj_head_dim = hidden // proj_heads
|
| 1682 |
+
if hidden != proj_heads * proj_head_dim:
|
| 1683 |
+
return
|
| 1684 |
+
|
| 1685 |
+
proj_output = output.reshape(batch_size, seq_len, proj_heads, proj_head_dim)
|
| 1686 |
+
|
| 1687 |
+
if proj_type != 'q' and num_kv_heads and num_kv_heads < n_heads:
|
| 1688 |
+
repeat_factor = n_heads // num_kv_heads
|
| 1689 |
+
proj_output = proj_output.repeat_interleave(repeat_factor, dim=2)
|
| 1690 |
+
|
| 1691 |
+
if layer_idx not in qkv_captures:
|
| 1692 |
+
qkv_captures[layer_idx] = {}
|
| 1693 |
+
|
| 1694 |
+
qkv_captures[layer_idx][proj_type] = proj_output[0].detach().cpu()
|
| 1695 |
+
except Exception:
|
| 1696 |
+
pass
|
| 1697 |
+
return hook
|
| 1698 |
+
|
| 1699 |
+
# Register hooks based on model architecture
|
| 1700 |
try:
|
| 1701 |
+
# CodeGen style: model.transformer.h[layer].attn.qkv_proj
|
| 1702 |
if hasattr(manager.model, 'transformer') and hasattr(manager.model.transformer, 'h'):
|
| 1703 |
for layer_idx, layer in enumerate(manager.model.transformer.h):
|
| 1704 |
if hasattr(layer, 'attn') and hasattr(layer.attn, 'qkv_proj'):
|
| 1705 |
+
hook = layer.attn.qkv_proj.register_forward_hook(make_combined_qkv_hook(layer_idx))
|
| 1706 |
hooks.append(hook)
|
| 1707 |
elif hasattr(layer, 'attn') and hasattr(layer.attn, 'c_attn'):
|
| 1708 |
+
hook = layer.attn.c_attn.register_forward_hook(make_combined_qkv_hook(layer_idx))
|
|
|
|
| 1709 |
hooks.append(hook)
|
| 1710 |
+
|
| 1711 |
+
# Mistral/LLaMA style: model.model.layers[layer].self_attn.{q,k,v}_proj
|
| 1712 |
+
elif hasattr(manager.model, 'model') and hasattr(manager.model.model, 'layers'):
|
| 1713 |
+
num_kv_heads = getattr(manager.model.config, 'num_key_value_heads', None)
|
| 1714 |
+
for layer_idx, layer in enumerate(manager.model.model.layers):
|
| 1715 |
+
if hasattr(layer, 'self_attn'):
|
| 1716 |
+
attn = layer.self_attn
|
| 1717 |
+
if hasattr(attn, 'q_proj'):
|
| 1718 |
+
hook = attn.q_proj.register_forward_hook(
|
| 1719 |
+
make_separate_proj_hook(layer_idx, 'q', num_kv_heads))
|
| 1720 |
+
hooks.append(hook)
|
| 1721 |
+
if hasattr(attn, 'k_proj'):
|
| 1722 |
+
hook = attn.k_proj.register_forward_hook(
|
| 1723 |
+
make_separate_proj_hook(layer_idx, 'k', num_kv_heads))
|
| 1724 |
+
hooks.append(hook)
|
| 1725 |
+
if hasattr(attn, 'v_proj'):
|
| 1726 |
+
hook = attn.v_proj.register_forward_hook(
|
| 1727 |
+
make_separate_proj_hook(layer_idx, 'v', num_kv_heads))
|
| 1728 |
+
hooks.append(hook)
|
| 1729 |
+
logger.info(f"Registered QKV hooks for {len(hooks)//3} Mistral layers (GQA: {num_kv_heads} KV heads)")
|
| 1730 |
except Exception as hook_error:
|
| 1731 |
logger.warning(f"Could not register QKV hooks: {hook_error}")
|
| 1732 |
|
|
|
|
| 2163 |
qkv_captures = {}
|
| 2164 |
hooks = []
|
| 2165 |
|
| 2166 |
+
# Hook for combined QKV projection (CodeGen style)
|
| 2167 |
+
def make_combined_qkv_hook(layer_idx):
|
| 2168 |
def hook(module, input, output):
|
| 2169 |
try:
|
| 2170 |
if output.dim() != 3:
|
|
|
|
| 2184 |
pass
|
| 2185 |
return hook
|
| 2186 |
|
| 2187 |
+
# Hooks for separate Q, K, V projections (Mistral/LLaMA style)
|
| 2188 |
+
def make_separate_proj_hook(layer_idx, proj_type, num_kv_heads=None):
|
| 2189 |
+
"""Create hook for separate Q/K/V projection modules.
|
| 2190 |
+
|
| 2191 |
+
For GQA models, K and V have fewer heads than Q, so we need to
|
| 2192 |
+
expand them to match Q's head count for consistent visualization.
|
| 2193 |
+
"""
|
| 2194 |
+
def hook(module, input, output):
|
| 2195 |
+
try:
|
| 2196 |
+
if output.dim() != 3:
|
| 2197 |
+
return
|
| 2198 |
+
batch_size, seq_len, hidden = output.shape
|
| 2199 |
+
|
| 2200 |
+
# Determine number of heads for this projection
|
| 2201 |
+
if proj_type == 'q':
|
| 2202 |
+
proj_heads = n_heads
|
| 2203 |
+
else:
|
| 2204 |
+
# K and V may have fewer heads (GQA)
|
| 2205 |
+
proj_heads = num_kv_heads if num_kv_heads else n_heads
|
| 2206 |
+
|
| 2207 |
+
proj_head_dim = hidden // proj_heads
|
| 2208 |
+
if hidden != proj_heads * proj_head_dim:
|
| 2209 |
+
return
|
| 2210 |
+
|
| 2211 |
+
# Reshape to [batch, seq, heads, head_dim]
|
| 2212 |
+
proj_output = output.reshape(batch_size, seq_len, proj_heads, proj_head_dim)
|
| 2213 |
+
|
| 2214 |
+
# For GQA, expand K/V to match Q's head count
|
| 2215 |
+
if proj_type != 'q' and num_kv_heads and num_kv_heads < n_heads:
|
| 2216 |
+
# Repeat each KV head to match Q heads
|
| 2217 |
+
repeat_factor = n_heads // num_kv_heads
|
| 2218 |
+
proj_output = proj_output.repeat_interleave(repeat_factor, dim=2)
|
| 2219 |
+
|
| 2220 |
+
# Initialize layer entry if needed
|
| 2221 |
+
if layer_idx not in qkv_captures:
|
| 2222 |
+
qkv_captures[layer_idx] = {}
|
| 2223 |
+
|
| 2224 |
+
qkv_captures[layer_idx][proj_type] = proj_output[0].detach().cpu()
|
| 2225 |
+
except Exception as e:
|
| 2226 |
+
logger.debug(f"QKV capture error for layer {layer_idx} {proj_type}: {e}")
|
| 2227 |
+
return hook
|
| 2228 |
+
|
| 2229 |
+
# Register hooks based on model architecture
|
| 2230 |
try:
|
| 2231 |
+
# CodeGen style: model.transformer.h[layer].attn.qkv_proj
|
| 2232 |
if hasattr(manager.model, 'transformer') and hasattr(manager.model.transformer, 'h'):
|
| 2233 |
for layer_idx, layer in enumerate(manager.model.transformer.h):
|
| 2234 |
if hasattr(layer, 'attn') and hasattr(layer.attn, 'qkv_proj'):
|
| 2235 |
+
hook = layer.attn.qkv_proj.register_forward_hook(make_combined_qkv_hook(layer_idx))
|
| 2236 |
hooks.append(hook)
|
| 2237 |
elif hasattr(layer, 'attn') and hasattr(layer.attn, 'c_attn'):
|
| 2238 |
+
hook = layer.attn.c_attn.register_forward_hook(make_combined_qkv_hook(layer_idx))
|
| 2239 |
hooks.append(hook)
|
| 2240 |
+
|
| 2241 |
+
# Mistral/LLaMA style: model.model.layers[layer].self_attn.{q,k,v}_proj
|
| 2242 |
+
elif hasattr(manager.model, 'model') and hasattr(manager.model.model, 'layers'):
|
| 2243 |
+
num_kv_heads = getattr(manager.model.config, 'num_key_value_heads', None)
|
| 2244 |
+
for layer_idx, layer in enumerate(manager.model.model.layers):
|
| 2245 |
+
if hasattr(layer, 'self_attn'):
|
| 2246 |
+
attn = layer.self_attn
|
| 2247 |
+
if hasattr(attn, 'q_proj'):
|
| 2248 |
+
hook = attn.q_proj.register_forward_hook(
|
| 2249 |
+
make_separate_proj_hook(layer_idx, 'q', num_kv_heads))
|
| 2250 |
+
hooks.append(hook)
|
| 2251 |
+
if hasattr(attn, 'k_proj'):
|
| 2252 |
+
hook = attn.k_proj.register_forward_hook(
|
| 2253 |
+
make_separate_proj_hook(layer_idx, 'k', num_kv_heads))
|
| 2254 |
+
hooks.append(hook)
|
| 2255 |
+
if hasattr(attn, 'v_proj'):
|
| 2256 |
+
hook = attn.v_proj.register_forward_hook(
|
| 2257 |
+
make_separate_proj_hook(layer_idx, 'v', num_kv_heads))
|
| 2258 |
+
hooks.append(hook)
|
| 2259 |
+
logger.info(f"Registered QKV hooks for {len(hooks)//3} Mistral layers (GQA: {num_kv_heads} KV heads)")
|
| 2260 |
except Exception as hook_error:
|
| 2261 |
logger.warning(f"Could not register QKV hooks: {hook_error}")
|
| 2262 |
|