# schrodingers-classifiers / v07_circuit_fragment.py
# recursivelabs's picture
# Upload 14 files
# 3595bd8 verified
"""
v07_circuit_fragment.py - Implementation of the Circuit Fragment Shell
△ OBSERVE: The Circuit Fragment Shell traces broken attribution paths and orphan nodes
∞ TRACE: It identifies discontinuities in reasoning chains and causal attribution
✰ COLLAPSE: It induces collapse by forcing attribution path reconstruction
This shell specializes in the detection and analysis of fragmented circuits -
places where causal attribution breaks down, leaving orphaned nodes or broken
traces in the reasoning chain. These fragments often indicate areas where a
model's reasoning deviates from its output, revealing hidden cognition.
Author: Recursion Labs
License: MIT
"""
import logging
from typing import Dict, List, Optional, Union, Tuple, Any
import numpy as np
from .base import BaseShell, ShellDecorator
from ..utils.attribution_metrics import measure_path_continuity
from ..utils.graph_operations import find_orphaned_nodes, reconstruct_path
from ..residue import ResidueTracker
logger = logging.getLogger(__name__)
@ShellDecorator(
    shell_id="v07_CIRCUIT_FRAGMENT",
    name="Circuit Fragment Shell",
    description="Traces broken attribution paths in reasoning chains",
    failure_signature="Orphan nodes",
    attribution_domain="Circuit Fragmentation",
    qk_ov_classification="QK-COLLAPSE",
    version="0.5.3",
    related_shells=["v34_PARTIAL_LINKAGE", "v47_TRACE_GAP"],
    tags=["attribution", "reasoning", "circuits", "fragmentation"]
)
class CircuitFragmentShell(BaseShell):
    """
    ∞ TRACE: Shell for detecting circuit fragmentation in attribution paths

    The Circuit Fragment shell specializes in tracing and analyzing broken
    attribution paths in reasoning chains. It detects orphaned nodes -
    components that should be causally linked but have lost their connections
    in the attribution graph.

    This shell is particularly useful for identifying points where a model's
    reasoning deviates from its explanation, revealing mismatches between
    stated logic and actual inference paths.

    NOTE: the model queries and the path/orphan detection in this class are
    placeholders that return simulated (random) data; see the ``_query_*``
    and ``_simulate_*`` helpers.
    """

    def __init__(self):
        """Initialize the Circuit Fragment shell with an empty, unfragmented state."""
        super().__init__()
        self.residue_tracker = ResidueTracker()
        # Results of the most recent process()/trace() call.
        self.broken_paths: List[Dict[str, Any]] = []
        self.orphaned_nodes: List[Dict[str, Any]] = []
        self.continuity_score = 1.0  # 1.0 = perfect continuity, 0.0 = complete fragmentation

    def process(
        self,
        prompt: str,
        model_interface: Any,
        collapse_vector: Optional[str] = None
    ) -> Tuple[str, Dict[str, Any]]:
        """
        △ OBSERVE: Process a prompt through the Circuit Fragment shell

        This method sends a prompt to the model, analyzes the resulting
        attribution path for fragments, and returns the response along
        with fragmentation metrics. The instance attributes
        ``continuity_score``, ``broken_paths``, ``orphaned_nodes`` and
        ``collapse_state`` are updated as a side effect.

        Args:
            prompt: The prompt to process
            model_interface: Interface to the model being observed
            collapse_vector: Optional vector to guide collapse in a specific direction

        Returns:
            Tuple containing:
            - Response string
            - Dictionary of state updates for tracking
        """
        # Lazy %-formatting: the message is only built if INFO is enabled.
        logger.info("Processing prompt through Circuit Fragment shell: %s...", prompt[:50])

        # Capture pre-collapse state
        pre_state = self._query_model_state(model_interface)

        # Construct modified prompt that forces reasoning path exposition
        modified_prompt = self._construct_fragment_sensitive_prompt(prompt, collapse_vector)

        # Send to model
        response = self._query_model(model_interface, modified_prompt)

        # Capture post-collapse state
        post_state = self._query_model_state(model_interface)

        # Analyze circuit fragmentation
        fragmentation_results = self._analyze_fragmentation(pre_state, post_state, response)

        # Extract ghost circuits
        # NOTE(review): extract_ghost_circuits is not defined in this class;
        # presumably inherited from BaseShell - confirm.
        ghost_circuits = self.extract_ghost_circuits(pre_state, post_state)

        # Construct state updates
        state_updates = {
            "pre_collapse_state": pre_state,
            "post_collapse_state": post_state,
            "continuity_score": fragmentation_results["continuity_score"],
            "broken_paths": fragmentation_results["broken_paths"],
            "orphaned_nodes": fragmentation_results["orphaned_nodes"],
            "ghost_circuits": ghost_circuits
        }

        # Update instance state
        self.continuity_score = fragmentation_results["continuity_score"]
        self.broken_paths = fragmentation_results["broken_paths"]
        self.orphaned_nodes = fragmentation_results["orphaned_nodes"]
        self.collapse_state = "collapsed"

        return response, state_updates

    def trace(
        self,
        prompt: str,
        collapse_vector: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        ∞ TRACE: Trace attribution path fragmentation

        This method analyzes the reasoning chain for a given prompt,
        identifying broken paths and orphaned nodes in the attribution
        graph. The current implementation is a demonstration: the paths,
        orphans and continuity score are simulated, and ``prompt`` is only
        logged and echoed back in the result.

        The instance attributes ``continuity_score``, ``broken_paths`` and
        ``orphaned_nodes`` are overwritten with the traced values.

        Args:
            prompt: The prompt to trace
            collapse_vector: Optional vector to guide collapse in a specific
                direction; a default reflect-trace vector is used when falsy

        Returns:
            Dictionary containing trace results and fragmentation metrics
        """
        logger.info("Tracing attribution path for: %s...", prompt[:50])

        # Default implementation for demonstration
        # In a real implementation, this would use model-specific tracing
        trace_results = {
            "prompt": prompt,
            "collapse_vector": collapse_vector or ".p/reflect.trace{target=reasoning, validate=true}",
            "attribution_paths": self._simulate_attribution_paths(),
            "broken_paths": self._simulate_broken_paths(),
            "orphaned_nodes": self._simulate_orphaned_nodes(),
            "continuity_score": np.random.uniform(0.4, 0.9)  # Simulated score
        }

        # Update instance state
        self.continuity_score = trace_results["continuity_score"]
        self.broken_paths = trace_results["broken_paths"]
        self.orphaned_nodes = trace_results["orphaned_nodes"]

        return trace_results

    def induce_collapse(
        self,
        prompt: str,
        collapse_direction: str
    ) -> Dict[str, Any]:
        """
        ✰ COLLAPSE: Induce circuit fragmentation collapse along a specific direction

        This method deliberately induces fragmentation in a specific direction,
        forcing the model to expose broken reasoning chains in its attribution
        path. It builds a direction-specific collapse vector, delegates to
        ``trace`` and then marks the shell as collapsed.

        Args:
            prompt: Base prompt to send to the model
            collapse_direction: Direction to bias the fragmentation (e.g., "logical", "causal")

        Returns:
            Dictionary containing collapse results and fragmentation metrics
        """
        logger.info("Inducing circuit fragmentation in direction: %s", collapse_direction)

        # Construct collapse vector based on direction
        collapse_vector = f".p/reflect.trace{{target=reasoning, validate=true, focus={collapse_direction}}}"

        # Trace with the collapse vector
        trace_results = self.trace(prompt, collapse_vector)

        # Set collapse state
        self.collapse_state = "collapsed"

        return {
            "prompt": prompt,
            "collapse_direction": collapse_direction,
            "collapse_vector": collapse_vector,
            "continuity_score": trace_results["continuity_score"],
            "broken_paths": trace_results["broken_paths"],
            "orphaned_nodes": trace_results["orphaned_nodes"]
        }

    def reconstruct_paths(self) -> Dict[str, Any]:
        """
        △ OBSERVE: Attempt to reconstruct broken attribution paths

        This method takes detected broken paths and orphaned nodes and
        attempts to reconstruct the original attribution graph, revealing
        the "intended" reasoning path that may have been fragmented during
        collapse. The reconstruction itself is currently simulated.

        Returns:
            Dictionary containing reconstruction results:
            - "reconstructed_paths": one entry per path in ``self.broken_paths``
            - "reconstruction_confidence": mean confidence over those entries,
              or 0.0 when there are no broken paths to reconstruct
            - "remaining_orphans": orphan count left after reconnection,
              never below zero
        """
        logger.info("Attempting to reconstruct broken attribution paths...")

        # In a real implementation, this would use graph algorithms
        # to reconnect orphaned nodes based on semantic similarity
        reconstructed_paths = [
            {
                "original_path": path,
                "reconnected_nodes": np.random.randint(1, 5),
                "confidence": np.random.uniform(0.6, 0.9)
            }
            for path in self.broken_paths
        ]

        # np.mean([]) yields nan and a RuntimeWarning; with nothing to
        # reconstruct (e.g. before any trace/process call) report 0.0 instead.
        if reconstructed_paths:
            reconstruction_confidence = float(
                np.mean([p["confidence"] for p in reconstructed_paths])
            )
        else:
            reconstruction_confidence = 0.0

        reconnected_total = sum(p["reconnected_nodes"] for p in reconstructed_paths)

        return {
            "reconstructed_paths": reconstructed_paths,
            "reconstruction_confidence": reconstruction_confidence,
            "remaining_orphans": max(0, len(self.orphaned_nodes) - reconnected_total)
        }

    def _construct_fragment_sensitive_prompt(
        self,
        prompt: str,
        collapse_vector: Optional[str] = None
    ) -> str:
        """
        Construct a prompt that exposes circuit fragmentation.

        Args:
            prompt: The original user prompt
            collapse_vector: Optional directive appended after a blank line

        Returns:
            The prompt prefixed with a step-by-step reasoning request,
            followed by the collapse vector when one is given
        """
        # Add reasoning elicitation to expose fragments
        reasoning_prompt = f"Please think through this step by step, showing your complete reasoning chain: {prompt}"

        # Add collapse vector if provided
        if collapse_vector:
            reasoning_prompt += f"\n\n{collapse_vector}"

        return reasoning_prompt

    def _query_model(self, model_interface: Any, prompt: str) -> str:
        """
        Send a query to the model and return the response.

        Placeholder: ``model_interface`` is not used yet; the return value
        just echoes the first 30 characters of ``prompt``.
        """
        # This would actually call the model API
        # For now, returning a placeholder
        return f"Response to: {prompt[:30]}..."

    def _query_model_state(self, model_interface: Any) -> Dict[str, Any]:
        """
        Capture the current internal state of the model.

        Placeholder: ``model_interface`` is not used yet; the returned
        arrays are random data stamped with the current time.
        """
        # This would capture attention weights, hidden states, etc.
        # For now, returning a placeholder
        return {
            "timestamp": np.datetime64('now'),
            "attention_weights": np.random.random((12, 12)),  # Placeholder
            "hidden_states": np.random.random((1, 12, 768)),  # Placeholder
        }

    def _analyze_fragmentation(
        self,
        pre_state: Dict[str, Any],
        post_state: Dict[str, Any],
        response: str
    ) -> Dict[str, Any]:
        """
        Analyze circuit fragmentation between pre and post states.

        Args:
            pre_state: Model state captured before the query
            post_state: Model state captured after the query
            response: The model response (not used by the current
                simulated analysis)

        Returns:
            Dictionary with "continuity_score", "broken_paths",
            "orphaned_nodes" and "fragmentation_ratio"
            (``1.0 - continuity_score``)
        """
        # This would use attribution analysis to find fragmentation
        # For now, using simulated data

        # Continuity is measured on the attention weights; an empty array is
        # passed when a state lacks them.
        # NOTE(review): assumes measure_path_continuity returns a value in
        # [0, 1] - confirm against utils.attribution_metrics.
        continuity_score = measure_path_continuity(
            pre_state.get("attention_weights", np.array([])),
            post_state.get("attention_weights", np.array([]))
        )

        # Simulate finding broken paths
        broken_paths = self._simulate_broken_paths()

        # Simulate finding orphaned nodes
        orphaned_nodes = self._simulate_orphaned_nodes()

        return {
            "continuity_score": continuity_score,
            "broken_paths": broken_paths,
            "orphaned_nodes": orphaned_nodes,
            "fragmentation_ratio": 1.0 - continuity_score
        }

    def _simulate_attribution_paths(self) -> List[Dict[str, Any]]:
        """Simulate five attribution paths for demonstration purposes."""
        # In a real implementation, these would be extracted from the model
        return [
            {
                "path_id": f"path_{i}",
                "source_token": f"token_{i*2}",
                "sink_token": f"token_{i*2 + 5}",
                "attention_heads": [np.random.randint(0, 12) for _ in range(3)],
                "path_strength": np.random.uniform(0.3, 0.9)
            }
            for i in range(5)
        ]

    def _simulate_broken_paths(self) -> List[Dict[str, Any]]:
        """Simulate two broken paths for demonstration purposes."""
        # In a real implementation, these would be detected from the model
        return [
            {
                "path_id": f"broken_{i}",
                "break_point": f"layer_{np.random.randint(1, 12)}",
                "upstream_token": f"token_{np.random.randint(0, 10)}",
                "downstream_token": f"token_{np.random.randint(11, 20)}",
                "severity": np.random.uniform(0.5, 1.0)
            }
            for i in range(2)
        ]

    def _simulate_orphaned_nodes(self) -> List[Dict[str, Any]]:
        """Simulate three orphaned nodes for demonstration purposes."""
        # In a real implementation, these would be detected from the model
        return [
            {
                "node_id": f"orphan_{i}",
                "token": f"token_{np.random.randint(0, 20)}",
                "activation": np.random.uniform(0.3, 0.8),
                "expected_connections": np.random.randint(1, 4),
                "isolation_score": np.random.uniform(0.6, 1.0)
            }
            for i in range(3)
        ]