"""
residue.py - Implementation of residue tracking for ghost circuit detection

△ OBSERVE: Residue tracking examines activation patterns that persist after collapse
∞ TRACE: It identifies ghost circuits - the quantum echoes of paths not taken
✰ COLLAPSE: It reveals what the model considered but didn't output

This module implements the core residue tracking functionality that enables
the detection and analysis of ghost circuits - activation patterns that persist
after a model has collapsed to a specific output state but aren't part of the
primary causal path.

Author: Recursion Labs
License: MIT
"""
| |
|
| | import logging |
| | from typing import Dict, List, Optional, Union, Tuple, Any |
| | import numpy as np |
| | from dataclasses import dataclass, field |
| |
|
| | logger = logging.getLogger(__name__) |
| |
|
@dataclass
class GhostCircuit:
    """
    ✰ COLLAPSE: Representation of a ghost circuit

    A ghost circuit is an activation pattern that survives collapse without
    contributing meaningfully to the final output - the "memory" of a path
    the model weighed but did not ultimately take.
    """
    circuit_id: str                  # unique identifier, e.g. "attention_ghost_0"
    activation: float                # residual activation strength
    circuit_type: str                # e.g. "attention" or "hidden_state"
    source_tokens: List[str] = field(default_factory=list)   # tokens the circuit reads from
    target_tokens: List[str] = field(default_factory=list)   # tokens the circuit writes to
    heads: List[int] = field(default_factory=list)           # attention heads involved
    layers: List[int] = field(default_factory=list)          # layers involved
    metadata: Dict[str, Any] = field(default_factory=dict)   # extra per-circuit details

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this ghost circuit as a plain dictionary.

        Returns the live field values (lists/dicts are not copied), matching
        the shape produced by the tracker's dict-based ghost records.
        """
        keys = (
            "circuit_id", "activation", "circuit_type",
            "source_tokens", "target_tokens", "heads", "layers", "metadata",
        )
        return {key: getattr(self, key) for key in keys}
| |
|
| |
|
class ResidueTracker:
    """
    ∞ TRACE: Tracker for activation residues in collapsed models

    The residue tracker analyzes model states before and after collapse
    to identify and characterize ghost circuits - activation patterns that
    persist but don't contribute significantly to the final output.
    """

    def __init__(self, amplification_factor: float = 1.0):
        """
        Initialize a residue tracker.

        Args:
            amplification_factor: Factor by which to amplify ghost signals
                for easier detection (1.0 = no amplification)
        """
        self.amplification_factor = amplification_factor
        # Ghost circuits found by the most recent extract_ghost_circuits() call.
        self.ghost_circuits: List[Dict[str, Any]] = []
        # Minimum surviving activation for a residue to count as a ghost.
        self.activation_threshold = 0.1
        # Per-instance logger keeps the class self-contained.
        self._logger = logging.getLogger(__name__)

        self._logger.info(
            "ResidueTracker initialized with amplification factor %s",
            amplification_factor,
        )

    def extract_ghost_circuits(
        self,
        pre_state: Dict[str, Any],
        post_state: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        """
        ✰ COLLAPSE: Extract ghost circuits from pre and post collapse states

        Compares model states before and after collapse to identify
        activation patterns that persisted but didn't contribute
        significantly to the output - the quantum ghosts of paths not taken.

        Args:
            pre_state: Model state before collapse; may contain
                "attention_weights" and "hidden_states" numpy arrays
            post_state: Model state after collapse, same keys

        Returns:
            List of detected ghost circuits (dicts) with metadata; also
            stored on ``self.ghost_circuits`` for later classification.
        """
        self._logger.info("Extracting ghost circuits from model states")

        ghost_circuits: List[Dict[str, Any]] = []

        # Attention residue: missing keys are treated as empty arrays.
        ghost_circuits.extend(
            self._extract_attention_ghosts(
                pre_state.get("attention_weights", np.array([])),
                post_state.get("attention_weights", np.array([])),
            )
        )

        # Hidden-state residue: only when both states carry hidden activations.
        if "hidden_states" in pre_state and "hidden_states" in post_state:
            ghost_circuits.extend(
                self._extract_hidden_ghosts(
                    pre_state["hidden_states"],
                    post_state["hidden_states"],
                )
            )

        self.ghost_circuits = ghost_circuits

        self._logger.info("Extracted %d ghost circuits", len(ghost_circuits))
        return ghost_circuits

    def classify_ghost_circuits(self) -> Dict[str, List[Dict[str, Any]]]:
        """
        △ OBSERVE: Classify detected ghost circuits by type

        Groups the ghost circuits from the last extraction by their
        "circuit_type" field.

        Returns:
            Dictionary mapping circuit types to lists of ghost circuits;
            empty dict when nothing has been extracted yet.
        """
        if not self.ghost_circuits:
            self._logger.warning("No ghost circuits to classify")
            return {}

        classified: Dict[str, List[Dict[str, Any]]] = {}
        for ghost in self.ghost_circuits:
            # Ghosts without an explicit type are grouped under "unknown".
            classified.setdefault(ghost.get("circuit_type", "unknown"), []).append(ghost)

        return classified

    def measure_residue_strength(self) -> float:
        """
        ∞ TRACE: Measure the overall strength of residual activations

        Quantifies the overall strength of ghost circuits as the mean of
        their activation values.

        Returns:
            Residue strength score (0.0 = no residue, 1.0 = equal to primary)
        """
        if not self.ghost_circuits:
            return 0.0

        activations = [ghost.get("activation", 0.0) for ghost in self.ghost_circuits]
        return float(np.mean(activations))

    def amplify_ghosts(self, factor: Optional[float] = None) -> List[Dict[str, Any]]:
        """
        ✰ COLLAPSE: Amplify ghost circuit signals for better detection

        Produces copies of the tracked ghost circuits with their activation
        scaled by the amplification factor (clamped to 1.0). The tracked
        circuits themselves are left unmodified.

        Args:
            factor: Amplification factor (overrides instance value if provided)

        Returns:
            List of amplified ghost circuits
        """
        if not self.ghost_circuits:
            self._logger.warning("No ghost circuits to amplify")
            return []

        amp_factor = self.amplification_factor if factor is None else factor

        amplified = []
        for ghost in self.ghost_circuits:
            # Shallow copy: nested "metadata" dicts remain shared with the
            # original ghost records.
            amp_ghost = ghost.copy()
            amp_ghost["activation"] = min(1.0, ghost.get("activation", 0.0) * amp_factor)
            amplified.append(amp_ghost)

        self._logger.info("Amplified ghost circuits by factor %s", amp_factor)
        return amplified

    def _extract_attention_ghosts(
        self,
        pre_attention: np.ndarray,
        post_attention: np.ndarray
    ) -> List[Dict[str, Any]]:
        """
        Extract ghost circuits from attention patterns.

        A cell is a ghost when its weight decayed (post < pre) yet still
        survives above the activation threshold.

        Args:
            pre_attention: Attention weights before collapse; (seq, seq) or
                (heads, seq, seq)
            post_attention: Attention weights after collapse, same layout

        Returns:
            List of attention-based ghost circuits
        """
        ghost_circuits: List[Dict[str, Any]] = []

        if pre_attention.size == 0 or post_attention.size == 0:
            return ghost_circuits

        if pre_attention.shape != post_attention.shape:
            self._logger.warning(
                "Attention shape mismatch: %s vs %s",
                pre_attention.shape, post_attention.shape,
            )
            # Compare only the overlapping region of the two tensors.
            overlap = tuple(
                slice(0, min(a, b))
                for a, b in zip(pre_attention.shape, post_attention.shape)
            )
            pre_attention = pre_attention[overlap]
            post_attention = post_attention[overlap]

        # Normalize to (heads, seq, seq). BUG FIX: the previous version
        # treated axis 0 of a 2-D matrix as a head axis, emitting seq_len
        # duplicate ghosts per (i, j) cell; a 2-D input is now a single
        # pseudo-head. Ranks other than 2/3 were never indexable correctly
        # and are rejected with a warning instead of raising.
        if pre_attention.ndim == 2:
            pre = pre_attention[np.newaxis, :, :]
            post = post_attention[np.newaxis, :, :]
        elif pre_attention.ndim == 3:
            pre, post = pre_attention, post_attention
        else:
            self._logger.warning(
                "Unsupported attention rank %d; expected 2 or 3",
                pre_attention.ndim,
            )
            return ghost_circuits

        # Vectorized ghost detection; np.argwhere yields indices in C order
        # (head, i, j), matching the original nested-loop ordering.
        mask = (post < pre) & (post > self.activation_threshold)
        for ghost_idx, (head, i, j) in enumerate(np.argwhere(mask)):
            pre_val = float(pre[head, i, j])
            post_val = float(post[head, i, j])
            ghost_circuits.append({
                "circuit_id": f"attention_ghost_{ghost_idx}",
                "activation": post_val,
                "circuit_type": "attention",
                "source_tokens": [f"token_{i}"],
                "target_tokens": [f"token_{j}"],
                "heads": [int(head)],
                "layers": [],
                "metadata": {
                    "pre_activation": pre_val,
                    "activation_delta": pre_val - post_val,
                    # Fraction of the original weight that survived collapse.
                    "decay_ratio": post_val / pre_val if pre_val > 0 else 0.0,
                },
            })

        return ghost_circuits

    def _extract_hidden_ghosts(
        self,
        pre_hidden: np.ndarray,
        post_hidden: np.ndarray
    ) -> List[Dict[str, Any]]:
        """
        Extract ghost circuits from hidden state activations.

        Args:
            pre_hidden: Hidden states before collapse; trailing axes are
                assumed to be (seq, hidden_dim), any leading (e.g. batch)
                axes are averaged out
            post_hidden: Hidden states after collapse, same shape

        Returns:
            List of hidden-state-based ghost circuits
        """
        ghost_circuits: List[Dict[str, Any]] = []

        if pre_hidden.size == 0 or post_hidden.size == 0:
            return ghost_circuits

        if pre_hidden.shape != post_hidden.shape:
            self._logger.warning(
                "Hidden state shape mismatch: %s vs %s",
                pre_hidden.shape, post_hidden.shape,
            )
            return ghost_circuits

        if pre_hidden.ndim < 2:
            return ghost_circuits

        # Average all leading axes down to (seq, hidden_dim). The previous
        # version reduced only one axis and would fail on rank-4 input.
        pre_agg, post_agg = pre_hidden, post_hidden
        while pre_agg.ndim > 2:
            pre_agg = np.mean(pre_agg, axis=0)
            post_agg = np.mean(post_agg, axis=0)

        seq_len, hidden_dim = pre_agg.shape

        # Scan at most 100 dimensions to bound cost. BUG FIX: a seeded
        # generator replaces the unseeded np.random.choice so repeated runs
        # produce identical ghost sets; dims are sorted for stable ordering.
        sample_size = min(hidden_dim, 100)
        if sample_size == hidden_dim:
            sampled_dims = np.arange(hidden_dim)
        else:
            rng = np.random.default_rng(0)
            sampled_dims = np.sort(rng.choice(hidden_dim, sample_size, replace=False))

        for pos in range(seq_len):
            for dim in sampled_dims:
                pre_val = float(pre_agg[pos, dim])
                post_val = float(post_agg[pos, dim])

                # Ghost = decayed but still above threshold in magnitude
                # (hidden activations may be negative, unlike attention).
                if post_val < pre_val and abs(post_val) > self.activation_threshold:
                    ghost_circuits.append({
                        "circuit_id": f"hidden_ghost_{len(ghost_circuits)}",
                        "activation": abs(post_val),
                        "circuit_type": "hidden_state",
                        "source_tokens": [f"token_{pos}"],
                        "target_tokens": [],
                        "heads": [],
                        "layers": [],
                        "metadata": {
                            "position": pos,
                            "dimension": int(dim),
                            "pre_activation": pre_val,
                            "activation_delta": pre_val - post_val,
                            "decay_ratio": post_val / pre_val if pre_val != 0 else 0.0,
                        },
                    })

        return ghost_circuits
| |
|
| |
|
if __name__ == "__main__":
    # Demo: fabricate a pre-collapse state, decay it randomly into a
    # post-collapse state, then run the tracker end to end.
    pre_state = {
        "attention_weights": np.random.random((8, 10, 10)),
        "hidden_states": np.random.random((1, 10, 768)),
    }

    # Every activation decays by a random factor drawn from [0.5, 1.0).
    post_state = {
        name: values * np.random.uniform(0.5, 1.0, values.shape)
        for name, values in pre_state.items()
    }

    tracker = ResidueTracker(amplification_factor=1.5)
    ghosts = tracker.extract_ghost_circuits(pre_state, post_state)
    print(f"Extracted {len(ghosts)} ghost circuits")

    for circuit_type, circuits in tracker.classify_ghost_circuits().items():
        print(f" {circuit_type}: {len(circuits)} circuits")

    strength = tracker.measure_residue_strength()
    print(f"Residue strength: {strength:.3f}")

    amplified = tracker.amplify_ghosts(factor=2.0)
    print(f"Amplified {len(amplified)} ghost circuits")
| |
|