"""
Architectural Analysis for RQ1 - Architectural Interpretability

Purpose: Extract and format raw architectural signals for transparency visualization
Focus: Internal mechanisms (NOT post-hoc feature attribution)

Key differences from post-hoc attribution methods (e.g., SHAP):
- Preserves per-head, per-layer granularity (no aggregation)
- Captures activation patterns and confidence metrics
- Supports causal intervention (ablation)
- Real-time architectural transparency

Based on PhD proposal RQ1:
"Transform opaque architectural mechanisms into interpretable visual representations"
"""

import torch
import numpy as np
from typing import Dict, List, Optional, Any
import logging

logger = logging.getLogger(__name__)


def compute_head_entropy(attention_weights: torch.Tensor) -> float:
    """
    Compute entropy of attention distribution for a single head.

    High entropy = diffuse attention (many tokens attended equally)
    Low entropy = focused attention (few tokens dominate)

    Args:
        attention_weights: [seq_len, seq_len] attention matrix for one head

    Returns:
        Entropy value (bits)
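
    Example (a doctest-style sketch; uniform attention over 4 keys gives
    log2(4) = 2 bits):

        >>> uniform = torch.full((4, 4), 0.25)
        >>> round(compute_head_entropy(uniform), 2)
        2.0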
    """
    # Average across query positions to get distribution
    avg_dist = attention_weights.mean(dim=0)

    # Add small epsilon to avoid log(0)
    eps = 1e-10
    avg_dist = avg_dist + eps

    # Compute entropy: -sum(p * log(p))
    entropy = -(avg_dist * torch.log2(avg_dist)).sum().item()

    # Ensure finite value
    entropy = float(np.clip(entropy, 0.0, 1e10))
    if not np.isfinite(entropy):
        entropy = 0.0

    return entropy


def identify_head_role(attention_weights: torch.Tensor, tokens: List[str]) -> str:
    """
    Classify attention head role based on attention patterns.

    Roles:
    - 'positional': Attends primarily to specific positions (diagonal, next-token, etc.)
    - 'delimiter': Focuses on delimiters/special tokens (braces, semicolons, etc.)
    - 'content': Attends to semantic content tokens (identifiers, keywords)
    - 'mixed': No clear specialization

    Args:
        attention_weights: [seq_len, seq_len]
        tokens: List of token strings

    Returns:
        Role classification string
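
    Example (a sketch; an identity matrix attends only to its own position,
    so the diagonal heuristic fires):

        >>> identify_head_role(torch.eye(4), ['a', 'b', 'c', 'd'])
        'positional'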
    """
    # Compute statistics
    diagonal_strength = torch.diag(attention_weights).mean().item()
    max_weight = attention_weights.max().item()

    # Simple heuristics (can be refined with more research)
    if diagonal_strength > 0.3:
        return 'positional'

    # Check if attends primarily to delimiters
    delimiter_tokens = {'{', '}', '(', ')', '[', ']', ';', ',', ':'}
    delimiter_indices = [i for i, tok in enumerate(tokens) if tok in delimiter_tokens]

    if delimiter_indices:
        delimiter_attention = attention_weights[:, delimiter_indices].mean().item()
        if delimiter_attention > 0.3:
            return 'delimiter'

    # Check for focused content attention
    if max_weight > 0.5:
        return 'content'

    return 'mixed'


def extract_per_head_attention(
    attention_tensor: torch.Tensor,
    layer_idx: int,
    tokens: List[str]
) -> List[Dict[str, Any]]:
    """
    Extract per-head attention data for a specific layer.

    Args:
        attention_tensor: [num_heads, seq_len, seq_len]
        layer_idx: Layer index (currently unused in the computation)
        tokens: Token strings

    Returns:
        List of dicts, one per head
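
    Example (a sketch with random softmax-normalized attention; only the
    result structure is deterministic):

        >>> attn = torch.softmax(torch.randn(2, 4, 4), dim=-1)
        >>> heads = extract_per_head_attention(attn, layer_idx=0,
        ...                                    tokens=['a', 'b', 'c', 'd'])
        >>> sorted(heads[0].keys())
        ['attention_matrix', 'entropy', 'head_idx', 'max_weight', 'role']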
    """
    num_heads = attention_tensor.shape[0]
    heads_data = []

    for head_idx in range(num_heads):
        head_attn = attention_tensor[head_idx]  # [seq_len, seq_len]

        # Clean attention matrix - replace NaN/Inf with 0
        head_attn_np = head_attn.cpu().numpy()
        head_attn_np = np.nan_to_num(head_attn_np, nan=0.0, posinf=1.0, neginf=0.0)
        head_attn_np = np.clip(head_attn_np, 0.0, 1.0)

        # Convert the cleaned matrix back to a tensor for entropy/role calculations
        head_attn_clean = torch.from_numpy(head_attn_np)

        entropy = compute_head_entropy(head_attn_clean)
        max_weight = float(head_attn_np.max())
        if not np.isfinite(max_weight):
            max_weight = 0.0

        role = identify_head_role(head_attn_clean, tokens)

        heads_data.append({
            "head_idx": head_idx,
            "attention_matrix": head_attn_np.tolist(),
            "entropy": entropy,
            "max_weight": max_weight,
            "role": role
        })

    return heads_data


def compute_activation_metrics(
    hidden_states: torch.Tensor,
    prev_hidden_states: Optional[torch.Tensor] = None
) -> Dict[str, float]:
    """
    Compute activation-related metrics for a layer.

    Args:
        hidden_states: [seq_len, hidden_dim] output of layer
        prev_hidden_states: Previous layer hidden states (for drift computation)

    Returns:
        Dict with activation magnitude, entropy, norm, drift
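
    Example (a sketch; drift is None when no previous layer is given):

        >>> metrics = compute_activation_metrics(torch.randn(8, 16))
        >>> metrics['hidden_state_drift'] is None
        True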
    """
    # Activation magnitude: L2 norm averaged across sequence
    activation_magnitude = torch.norm(hidden_states, dim=-1).mean().item()
    activation_magnitude = float(np.clip(activation_magnitude, -1e10, 1e10))
    if not np.isfinite(activation_magnitude):
        activation_magnitude = 0.0

    # Activation entropy: How varied are the activations?
    flat_activations = hidden_states.flatten()
    # Normalize to probability distribution
    probs = torch.softmax(flat_activations, dim=0)
    activation_entropy = -(probs * torch.log2(probs + 1e-10)).sum().item()
    activation_entropy = float(np.clip(activation_entropy, 0.0, 1e10))
    if not np.isfinite(activation_entropy):
        activation_entropy = 0.0

    # Hidden state norm
    hidden_state_norm = torch.norm(hidden_states).item()
    hidden_state_norm = float(np.clip(hidden_state_norm, -1e10, 1e10))
    if not np.isfinite(hidden_state_norm):
        hidden_state_norm = 0.0

    # Hidden state drift (if previous layer available)
    hidden_state_drift = None
    if prev_hidden_states is not None:
        drift = torch.norm(hidden_states - prev_hidden_states).item()
        drift = float(np.clip(drift, -1e10, 1e10))
        if np.isfinite(drift):
            hidden_state_drift = drift

    return {
        "activation_magnitude": activation_magnitude,
        "activation_entropy": activation_entropy,
        "hidden_state_norm": hidden_state_norm,
        "hidden_state_drift": hidden_state_drift
    }


def extract_architectural_data(
    model_outputs: Dict[str, Any],
    input_tokens: List[str],
    output_tokens: List[str],
    model_config: Dict[str, Any]
) -> Optional[Dict[str, Any]]:
    """
    Extract complete architectural transparency data for visualization.

    This is the main function that formats all data needed for
    ArchitecturalAttentionExplorer component.

    Args:
        model_outputs: Dict containing 'attentions', 'hidden_states', etc.
        input_tokens: Input token strings
        output_tokens: Generated token strings
        model_config: Model configuration (num_layers, num_heads, etc.)

    Returns:
        Complete architectural data dict, or None if attention weights are missing
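
    Example (a sketch assuming a Hugging Face-style model called with
    output_attentions=True and output_hidden_states=True; the model,
    tokenizer, and config attribute names are illustrative assumptions):

        outputs = model(input_ids, output_attentions=True,
                        output_hidden_states=True)
        data = extract_architectural_data(
            {"attentions": outputs.attentions,
             "hidden_states": outputs.hidden_states},
            input_tokens=tokenizer.convert_ids_to_tokens(input_ids[0].tolist()),
            output_tokens=[],
            model_config={"num_heads": model.config.num_attention_heads},
        )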
    """
    # Extract attention from model outputs
    # Expected shape: attentions is tuple of [batch, num_heads, seq_len, seq_len]
    attentions = model_outputs.get('attentions', None)
    hidden_states = model_outputs.get('hidden_states', None)

    if attentions is None:
        logger.warning("No attention weights in model outputs")
        return None

    # Process each layer
    layers_data = []
    prev_hidden = None

    num_layers = len(attentions)

    for layer_idx in range(num_layers):
        layer_attn = attentions[layer_idx]  # [batch, num_heads, seq_len, seq_len]

        # Remove batch dimension (assuming batch_size=1)
        if layer_attn.dim() == 4:
            layer_attn = layer_attn[0]  # [num_heads, seq_len, seq_len]

        # Extract per-head attention
        all_tokens = input_tokens + output_tokens
        heads_data = extract_per_head_attention(layer_attn, layer_idx, all_tokens)

        # Compute activation metrics
        activation_metrics = {"activation_magnitude": 0.0, "activation_entropy": 0.0, "hidden_state_norm": 0.0}

        if hidden_states is not None:
            # HF-style hidden_states tuples include the embedding output at
            # index 0, so layer i's output is at index i + 1 when the tuple
            # has num_layers + 1 entries; otherwise fall back to index i.
            hs_idx = layer_idx + 1 if len(hidden_states) == num_layers + 1 else layer_idx
            if hs_idx < len(hidden_states):
                current_hidden = hidden_states[hs_idx]
                if current_hidden.dim() == 3:  # [batch, seq_len, hidden_dim]
                    current_hidden = current_hidden[0]  # Remove batch

                activation_metrics = compute_activation_metrics(current_hidden, prev_hidden)
                prev_hidden = current_hidden

        # Combine data for this layer
        layer_data = {
            "layer_idx": layer_idx,
            "attention_heads": heads_data,
            **activation_metrics
        }

        layers_data.append(layer_data)

    # Build complete response
    architectural_data = {
        "layers": layers_data,
        "model_info": {
            "num_layers": num_layers,
            "num_heads": model_config.get('num_heads', len(heads_data)),
            "hidden_size": model_config.get('hidden_size', 768),
            "model_name": model_config.get('model_name', 'unknown')
        },
        "input_tokens": input_tokens,
        "output_tokens": output_tokens
    }

    # Optional: Expert routing (for MoE models)
    expert_routing = model_outputs.get('router_logits', None)
    if expert_routing is not None:
        architectural_data["expert_routing"] = extract_expert_routing(expert_routing)

    return architectural_data


def extract_expert_routing(router_logits: torch.Tensor) -> List[Dict[str, Any]]:
    """
    Extract expert routing decisions for MoE models.

    Args:
        router_logits: Router logits from model
            Shape depends on model architecture

    Returns:
        List of routing decisions per layer/token
    """
    # Routing extraction is model-specific and needs to be adapted per
    # architecture (e.g., DeepSeek-MoE, Mixtral).

    # Placeholder implementation
    routing_data = []

    logger.info("Expert routing extraction not yet implemented for this model")

    return routing_data
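

# A minimal sketch of what top-k routing extraction could look like once the
# router output format is pinned down. It assumes router_logits is a single
# [seq_len, num_experts] tensor and keeps the top_k experts per token; both
# the shape and top_k=2 are assumptions, not any specific model's API.
def _example_topk_routing(router_logits: torch.Tensor, top_k: int = 2) -> List[Dict[str, Any]]:
    # Softmax over the expert dimension, then keep the k strongest experts.
    probs = torch.softmax(router_logits, dim=-1)
    top_probs, top_experts = probs.topk(top_k, dim=-1)
    return [
        {
            "token_idx": i,
            "experts": top_experts[i].tolist(),
            "weights": top_probs[i].tolist(),
        }
        for i in range(router_logits.shape[0])
    ]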


def format_for_study_endpoint(
    architectural_data: Dict[str, Any],
    generation_metadata: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Format architectural data for /api/study/analyze endpoint response.

    Args:
        architectural_data: Output from extract_architectural_data()
        generation_metadata: Generation stats (time, tokens, etc.)

    Returns:
        Complete response dict
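
    Example (a sketch; the metadata keys are whatever the caller collected):

        >>> resp = format_for_study_endpoint({"layers": []},
        ...                                  {"generation_time_s": 1.2})
        >>> resp["visualization_type"]
        'architectural_transparency'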
    """
    return {
        "architectural_data": architectural_data,
        "metadata": generation_metadata,
        "visualization_type": "architectural_transparency",
        "research_context": "RQ1: Architectural Interpretability"
    }
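

if __name__ == "__main__":
    # Smoke-test sketch on synthetic tensors, so no real model is required:
    # 2 layers, 4 heads, 5 tokens, hidden size 16. The values are random and
    # the output is illustrative only.
    seq_len, num_heads, hidden = 5, 4, 16
    fake_outputs = {
        "attentions": tuple(
            torch.softmax(torch.randn(1, num_heads, seq_len, seq_len), dim=-1)
            for _ in range(2)
        ),
        "hidden_states": tuple(torch.randn(1, seq_len, hidden) for _ in range(3)),
    }
    tokens = ["def", "foo", "(", ")", ":"]
    data = extract_architectural_data(fake_outputs, tokens, [], {"num_heads": num_heads})
    print(f"Extracted {len(data['layers'])} layers with "
          f"{len(data['layers'][0]['attention_heads'])} heads each")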