| """ |
| v07_circuit_fragment.py - Implementation of the Circuit Fragment Shell |
| |
| △ OBSERVE: The Circuit Fragment Shell traces broken attribution paths and orphan nodes |
| ∞ TRACE: It identifies discontinuities in reasoning chains and causal attribution |
| ✰ COLLAPSE: It induces collapse by forcing attribution path reconstruction |
| |
| This shell specializes in the detection and analysis of fragmented circuits - |
| places where causal attribution breaks down, leaving orphaned nodes or broken |
| traces in the reasoning chain. These fragments often indicate areas where a |
| model's reasoning deviates from its output, revealing hidden cognition. |
| |
| Author: Recursion Labs |
| License: MIT |
| """ |
|
|
| import logging |
| from typing import Dict, List, Optional, Union, Tuple, Any |
| import numpy as np |
|
|
| from .base import BaseShell, ShellDecorator |
| from ..utils.attribution_metrics import measure_path_continuity |
| from ..utils.graph_operations import find_orphaned_nodes, reconstruct_path |
| from ..residue import ResidueTracker |
|
|
| logger = logging.getLogger(__name__) |
|
|
@ShellDecorator(
    shell_id="v07_CIRCUIT_FRAGMENT",
    name="Circuit Fragment Shell",
    description="Traces broken attribution paths in reasoning chains",
    failure_signature="Orphan nodes",
    attribution_domain="Circuit Fragmentation",
    qk_ov_classification="QK-COLLAPSE",
    version="0.5.3",
    related_shells=["v34_PARTIAL_LINKAGE", "v47_TRACE_GAP"],
    tags=["attribution", "reasoning", "circuits", "fragmentation"]
)
class CircuitFragmentShell(BaseShell):
    """
    ∞ TRACE: Shell for detecting circuit fragmentation in attribution paths

    The Circuit Fragment shell specializes in tracing and analyzing broken
    attribution paths in reasoning chains. It detects orphaned nodes -
    components that should be causally linked but have lost their connections
    in the attribution graph.

    This shell is particularly useful for identifying points where a model's
    reasoning deviates from its explanation, revealing mismatches between
    stated logic and actual inference paths.

    NOTE: in this implementation the model query, the captured model state,
    and the broken-path / orphan-node listings are simulated placeholders
    (canned strings and random values); only the continuity score computed in
    ``_analyze_fragmentation`` is delegated to ``measure_path_continuity``.

    Instance state written by this class:
        residue_tracker: a ``ResidueTracker`` created in ``__init__`` (not
            otherwise used inside this class).
        broken_paths: broken-path records from the most recent
            ``process`` / ``trace`` call (empty list initially).
        orphaned_nodes: orphan-node records from the most recent
            ``process`` / ``trace`` call (empty list initially).
        continuity_score: continuity score from the most recent
            ``process`` / ``trace`` call (1.0 initially).
        collapse_state: set to ``"collapsed"`` by ``process`` and
            ``induce_collapse``.
    """

    def __init__(self):
        """Initialize the Circuit Fragment shell with empty fragmentation state."""
        super().__init__()
        self.residue_tracker = ResidueTracker()
        self.broken_paths = []
        self.orphaned_nodes = []
        # 1.0 is the starting value before any trace has been recorded.
        self.continuity_score = 1.0

    def process(
        self,
        prompt: str,
        model_interface: Any,
        collapse_vector: Optional[str] = None
    ) -> Tuple[str, Dict[str, Any]]:
        """
        △ OBSERVE: Process a prompt through the Circuit Fragment shell

        Captures a model state snapshot, sends a reasoning-eliciting version
        of the prompt, captures a second snapshot, and derives fragmentation
        metrics from the pair. The shell's own ``continuity_score``,
        ``broken_paths``, ``orphaned_nodes`` and ``collapse_state`` are
        updated as a side effect.

        Args:
            prompt: The prompt to process
            model_interface: Interface to the model being observed
            collapse_vector: Optional vector to guide collapse in a specific direction

        Returns:
            Tuple containing:
                - Response string
                - Dictionary of state updates for tracking, with keys
                  ``pre_collapse_state``, ``post_collapse_state``,
                  ``continuity_score``, ``broken_paths``, ``orphaned_nodes``
                  and ``ghost_circuits``
        """
        # Lazy %-style arguments: the message is only built if INFO is enabled.
        logger.info(
            "Processing prompt through Circuit Fragment shell: %s...",
            prompt[:50],
        )

        # Snapshot the model before and after the query so the two states can
        # be compared for attribution discontinuities.
        pre_state = self._query_model_state(model_interface)
        modified_prompt = self._construct_fragment_sensitive_prompt(prompt, collapse_vector)
        response = self._query_model(model_interface, modified_prompt)
        post_state = self._query_model_state(model_interface)

        fragmentation_results = self._analyze_fragmentation(pre_state, post_state, response)

        # NOTE(review): extract_ghost_circuits is not defined in this class;
        # presumably inherited from BaseShell - confirm.
        ghost_circuits = self.extract_ghost_circuits(pre_state, post_state)

        state_updates = {
            "pre_collapse_state": pre_state,
            "post_collapse_state": post_state,
            "continuity_score": fragmentation_results["continuity_score"],
            "broken_paths": fragmentation_results["broken_paths"],
            "orphaned_nodes": fragmentation_results["orphaned_nodes"],
            "ghost_circuits": ghost_circuits
        }

        # Remember the latest metrics so reconstruct_paths() can use them.
        self.continuity_score = fragmentation_results["continuity_score"]
        self.broken_paths = fragmentation_results["broken_paths"]
        self.orphaned_nodes = fragmentation_results["orphaned_nodes"]
        # NOTE(review): collapse_state is not initialised in this class's
        # __init__; presumably declared by BaseShell - confirm.
        self.collapse_state = "collapsed"

        return response, state_updates

    def trace(
        self,
        prompt: str,
        collapse_vector: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        ∞ TRACE: Trace attribution path fragmentation

        Produces a trace record for the prompt containing attribution paths,
        broken paths, orphaned nodes and a continuity score, and stores the
        latter three on the shell. All values are currently simulated: the
        paths come from the ``_simulate_*`` helpers and the continuity score
        is drawn uniformly from [0.4, 0.9).

        Args:
            prompt: The prompt to trace
            collapse_vector: Optional vector to guide collapse in a specific
                direction; when omitted (or empty) a default
                ``.p/reflect.trace`` vector is recorded instead

        Returns:
            Dictionary containing trace results and fragmentation metrics
            (keys: ``prompt``, ``collapse_vector``, ``attribution_paths``,
            ``broken_paths``, ``orphaned_nodes``, ``continuity_score``)
        """
        logger.info("Tracing attribution path for: %s...", prompt[:50])

        trace_results = {
            "prompt": prompt,
            "collapse_vector": collapse_vector or ".p/reflect.trace{target=reasoning, validate=true}",
            "attribution_paths": self._simulate_attribution_paths(),
            "broken_paths": self._simulate_broken_paths(),
            "orphaned_nodes": self._simulate_orphaned_nodes(),
            "continuity_score": np.random.uniform(0.4, 0.9)
        }

        # Remember the latest metrics so reconstruct_paths() can use them.
        self.continuity_score = trace_results["continuity_score"]
        self.broken_paths = trace_results["broken_paths"]
        self.orphaned_nodes = trace_results["orphaned_nodes"]

        return trace_results

    def induce_collapse(
        self,
        prompt: str,
        collapse_direction: str
    ) -> Dict[str, Any]:
        """
        ✰ COLLAPSE: Induce circuit fragmentation collapse along a specific direction

        Builds a ``.p/reflect.trace`` collapse vector focused on
        ``collapse_direction``, runs ``trace`` with it, and marks the shell
        as collapsed.

        Args:
            prompt: Base prompt to send to the model
            collapse_direction: Direction to bias the fragmentation (e.g., "logical", "causal")

        Returns:
            Dictionary containing collapse results and fragmentation metrics
            (keys: ``prompt``, ``collapse_direction``, ``collapse_vector``,
            ``continuity_score``, ``broken_paths``, ``orphaned_nodes``)
        """
        logger.info("Inducing circuit fragmentation in direction: %s", collapse_direction)

        # Doubled braces emit literal "{" / "}" around the vector arguments.
        collapse_vector = f".p/reflect.trace{{target=reasoning, validate=true, focus={collapse_direction}}}"

        # trace() also updates continuity_score / broken_paths / orphaned_nodes.
        trace_results = self.trace(prompt, collapse_vector)

        self.collapse_state = "collapsed"

        return {
            "prompt": prompt,
            "collapse_direction": collapse_direction,
            "collapse_vector": collapse_vector,
            "continuity_score": trace_results["continuity_score"],
            "broken_paths": trace_results["broken_paths"],
            "orphaned_nodes": trace_results["orphaned_nodes"]
        }

    def reconstruct_paths(self) -> Dict[str, Any]:
        """
        △ OBSERVE: Attempt to reconstruct broken attribution paths

        Builds one reconstruction record per entry in ``self.broken_paths``
        (as left by the last ``process`` / ``trace`` call). The number of
        reconnected nodes and the confidence in each record are currently
        simulated with random draws.

        Returns:
            Dictionary containing reconstruction results:
                - ``reconstructed_paths``: list of records with keys
                  ``original_path``, ``reconnected_nodes`` and ``confidence``
                - ``reconstruction_confidence``: mean confidence across the
                  records, or ``0.0`` when there are no broken paths
                - ``remaining_orphans``: orphaned nodes left after
                  subtracting all reconnected nodes, floored at 0
        """
        logger.info("Attempting to reconstruct broken attribution paths...")

        reconstructed_paths = [
            {
                "original_path": path,
                "reconnected_nodes": np.random.randint(1, 5),
                "confidence": np.random.uniform(0.6, 0.9)
            }
            for path in self.broken_paths
        ]

        # np.mean([]) yields nan and emits a RuntimeWarning, which happens on
        # a fresh shell (broken_paths starts empty). Report zero confidence
        # instead when there was nothing to reconstruct.
        if reconstructed_paths:
            reconstruction_confidence = float(
                np.mean([record["confidence"] for record in reconstructed_paths])
            )
        else:
            reconstruction_confidence = 0.0

        total_reconnected = sum(
            record["reconnected_nodes"] for record in reconstructed_paths
        )

        return {
            "reconstructed_paths": reconstructed_paths,
            "reconstruction_confidence": reconstruction_confidence,
            # Cannot reconnect more orphans than exist, hence the floor at 0.
            "remaining_orphans": max(0, len(self.orphaned_nodes) - total_reconnected)
        }

    def _construct_fragment_sensitive_prompt(
        self,
        prompt: str,
        collapse_vector: Optional[str] = None
    ) -> str:
        """
        Construct a prompt that exposes circuit fragmentation.

        Prefixes the prompt with a step-by-step reasoning instruction and,
        when a non-empty ``collapse_vector`` is given, appends it after a
        blank line.

        Args:
            prompt: The original user prompt
            collapse_vector: Optional collapse vector text to append

        Returns:
            The augmented prompt string
        """
        reasoning_prompt = f"Please think through this step by step, showing your complete reasoning chain: {prompt}"

        if collapse_vector:
            reasoning_prompt += f"\n\n{collapse_vector}"

        return reasoning_prompt

    def _query_model(self, model_interface: Any, prompt: str) -> str:
        """
        Send a query to the model and return the response.

        Placeholder: ``model_interface`` is not used yet; a canned string
        echoing the first 30 characters of ``prompt`` is returned.

        Args:
            model_interface: Interface to the model being observed (unused)
            prompt: The prompt to send

        Returns:
            The simulated response string
        """
        return f"Response to: {prompt[:30]}..."

    def _query_model_state(self, model_interface: Any) -> Dict[str, Any]:
        """
        Capture the current internal state of the model.

        Placeholder: ``model_interface`` is not used yet; the returned state
        is a timestamp plus randomly generated arrays.

        Args:
            model_interface: Interface to the model being observed (unused)

        Returns:
            Dictionary with keys ``timestamp`` (``np.datetime64``),
            ``attention_weights`` (random array of shape (12, 12)) and
            ``hidden_states`` (random array of shape (1, 12, 768))
        """
        return {
            "timestamp": np.datetime64('now'),
            "attention_weights": np.random.random((12, 12)),
            "hidden_states": np.random.random((1, 12, 768)),
        }

    def _analyze_fragmentation(
        self,
        pre_state: Dict[str, Any],
        post_state: Dict[str, Any],
        response: str
    ) -> Dict[str, Any]:
        """
        Analyze circuit fragmentation between pre and post states.

        The continuity score is computed by ``measure_path_continuity`` from
        the two states' ``attention_weights`` (an empty array is substituted
        when the key is missing). Broken paths and orphaned nodes are
        currently simulated. ``response`` is accepted but not used.

        Args:
            pre_state: Model state captured before the query
            post_state: Model state captured after the query
            response: The model's response (currently unused)

        Returns:
            Dictionary with keys ``continuity_score``, ``broken_paths``,
            ``orphaned_nodes`` and ``fragmentation_ratio`` (the complement
            ``1.0 - continuity_score``)
        """
        continuity_score = measure_path_continuity(
            pre_state.get("attention_weights", np.array([])),
            post_state.get("attention_weights", np.array([]))
        )

        broken_paths = self._simulate_broken_paths()
        orphaned_nodes = self._simulate_orphaned_nodes()

        return {
            "continuity_score": continuity_score,
            "broken_paths": broken_paths,
            "orphaned_nodes": orphaned_nodes,
            "fragmentation_ratio": 1.0 - continuity_score
        }

    def _simulate_attribution_paths(self) -> List[Dict[str, Any]]:
        """
        Simulate attribution paths for demonstration purposes.

        Returns:
            Five path records with keys ``path_id``, ``source_token``,
            ``sink_token``, ``attention_heads`` (three random head indices
            in [0, 12)) and ``path_strength`` (uniform in [0.3, 0.9))
        """
        return [
            {
                "path_id": f"path_{i}",
                "source_token": f"token_{i*2}",
                "sink_token": f"token_{i*2 + 5}",
                "attention_heads": [np.random.randint(0, 12) for _ in range(3)],
                "path_strength": np.random.uniform(0.3, 0.9)
            }
            for i in range(5)
        ]

    def _simulate_broken_paths(self) -> List[Dict[str, Any]]:
        """
        Simulate broken paths for demonstration purposes.

        Returns:
            Two broken-path records with keys ``path_id``, ``break_point``,
            ``upstream_token``, ``downstream_token`` and ``severity``
            (uniform in [0.5, 1.0))
        """
        return [
            {
                "path_id": f"broken_{i}",
                "break_point": f"layer_{np.random.randint(1, 12)}",
                "upstream_token": f"token_{np.random.randint(0, 10)}",
                "downstream_token": f"token_{np.random.randint(11, 20)}",
                "severity": np.random.uniform(0.5, 1.0)
            }
            for i in range(2)
        ]

    def _simulate_orphaned_nodes(self) -> List[Dict[str, Any]]:
        """
        Simulate orphaned nodes for demonstration purposes.

        Returns:
            Three orphan-node records with keys ``node_id``, ``token``,
            ``activation``, ``expected_connections`` and ``isolation_score``
        """
        return [
            {
                "node_id": f"orphan_{i}",
                "token": f"token_{np.random.randint(0, 20)}",
                "activation": np.random.uniform(0.3, 0.8),
                "expected_connections": np.random.randint(1, 4),
                "isolation_score": np.random.uniform(0.6, 1.0)
            }
            for i in range(3)
        ]
|
|