schrodingers-classifiers / v07_circuit_fragment.py

Upload 14 files

3595bd8 verified 11 months ago

13.5 kB

	"""
	v07_circuit_fragment.py - Implementation of the Circuit Fragment Shell

	△ OBSERVE: The Circuit Fragment Shell traces broken attribution paths and orphan nodes
	∞ TRACE: It identifies discontinuities in reasoning chains and causal attribution
	✰ COLLAPSE: It induces collapse by forcing attribution path reconstruction

	This shell specializes in the detection and analysis of fragmented circuits -
	places where causal attribution breaks down, leaving orphaned nodes or broken
	traces in the reasoning chain. These fragments often indicate areas where a
	model's reasoning deviates from its output, revealing hidden cognition.

	Author: Recursion Labs
	License: MIT
	"""

	import logging
	from typing import Dict, List, Optional, Union, Tuple, Any
	import numpy as np

	from .base import BaseShell, ShellDecorator
	from ..utils.attribution_metrics import measure_path_continuity
	from ..utils.graph_operations import find_orphaned_nodes, reconstruct_path
	from ..residue import ResidueTracker

	logger = logging.getLogger(__name__)

	@ShellDecorator(
	    shell_id="v07_CIRCUIT_FRAGMENT",
	    name="Circuit Fragment Shell",
	    description="Traces broken attribution paths in reasoning chains",
	    failure_signature="Orphan nodes",
	    attribution_domain="Circuit Fragmentation",
	    qk_ov_classification="QK-COLLAPSE",
	    version="0.5.3",
	    related_shells=["v34_PARTIAL_LINKAGE", "v47_TRACE_GAP"],
	    tags=["attribution", "reasoning", "circuits", "fragmentation"],
	)
	class CircuitFragmentShell(BaseShell):
	    """
	    ∞ TRACE: Shell for detecting circuit fragmentation in attribution paths

	    The Circuit Fragment shell specializes in tracing and analyzing broken
	    attribution paths in reasoning chains. It detects orphaned nodes -
	    components that should be causally linked but have lost their connections
	    in the attribution graph.

	    This shell is particularly useful for identifying points where a model's
	    reasoning deviates from its explanation, revealing mismatches between
	    stated logic and actual inference paths.

	    NOTE: in this version the model queries, broken paths and orphaned nodes
	    are placeholders / simulated with ``np.random``; only the continuity
	    score in ``process`` comes from ``measure_path_continuity``.

	    Instance state (set in ``__init__``, refreshed by ``process``/``trace``):
	        broken_paths: List of broken-path dictionaries from the last run.
	        orphaned_nodes: List of orphaned-node dictionaries from the last run.
	        continuity_score: 1.0 = perfect continuity, 0.0 = complete
	            fragmentation.
	    """

	    def __init__(self):
	        """Initialize the Circuit Fragment shell with empty fragmentation state."""
	        super().__init__()
	        # Created here but not referenced by any other method in this class.
	        self.residue_tracker = ResidueTracker()
	        self.broken_paths = []
	        self.orphaned_nodes = []
	        self.continuity_score = 1.0  # 1.0 = perfect continuity, 0.0 = complete fragmentation

	    def process(
	        self,
	        prompt: str,
	        model_interface: Any,
	        collapse_vector: Optional[str] = None
	    ) -> Tuple[str, Dict[str, Any]]:
	        """
	        △ OBSERVE: Process a prompt through the Circuit Fragment shell

	        Captures a state snapshot, queries the model with a
	        reasoning-eliciting version of the prompt, captures a second
	        snapshot, and analyzes the pair for fragmentation. The instance's
	        ``continuity_score``, ``broken_paths``, ``orphaned_nodes`` and
	        ``collapse_state`` are updated as a side effect.

	        Args:
	            prompt: The prompt to process
	            model_interface: Interface to the model being observed
	            collapse_vector: Optional vector to guide collapse in a specific direction

	        Returns:
	            Tuple containing:
	                - Response string
	                - Dictionary of state updates for tracking, with keys
	                  "pre_collapse_state", "post_collapse_state",
	                  "continuity_score", "broken_paths", "orphaned_nodes"
	                  and "ghost_circuits"
	        """
	        # Lazy %-style arguments: the message is only built if INFO is enabled.
	        logger.info(
	            "Processing prompt through Circuit Fragment shell: %s...", prompt[:50]
	        )

	        # Capture pre-collapse state
	        pre_state = self._query_model_state(model_interface)

	        # Construct modified prompt that forces reasoning path exposition
	        modified_prompt = self._construct_fragment_sensitive_prompt(prompt, collapse_vector)

	        # Send to model
	        response = self._query_model(model_interface, modified_prompt)

	        # Capture post-collapse state
	        post_state = self._query_model_state(model_interface)

	        # Analyze circuit fragmentation
	        fragmentation_results = self._analyze_fragmentation(pre_state, post_state, response)

	        # Extract ghost circuits.
	        # NOTE(review): extract_ghost_circuits is not defined in this class;
	        # presumably inherited from BaseShell - confirm.
	        ghost_circuits = self.extract_ghost_circuits(pre_state, post_state)

	        # Construct state updates
	        state_updates = {
	            "pre_collapse_state": pre_state,
	            "post_collapse_state": post_state,
	            "continuity_score": fragmentation_results["continuity_score"],
	            "broken_paths": fragmentation_results["broken_paths"],
	            "orphaned_nodes": fragmentation_results["orphaned_nodes"],
	            "ghost_circuits": ghost_circuits,
	        }

	        # Update instance state
	        self.continuity_score = fragmentation_results["continuity_score"]
	        self.broken_paths = fragmentation_results["broken_paths"]
	        self.orphaned_nodes = fragmentation_results["orphaned_nodes"]
	        # NOTE(review): collapse_state is not initialized in this class;
	        # presumably declared by BaseShell - confirm.
	        self.collapse_state = "collapsed"

	        return response, state_updates

	    def trace(
	        self,
	        prompt: str,
	        collapse_vector: Optional[str] = None
	    ) -> Dict[str, Any]:
	        """
	        ∞ TRACE: Trace attribution path fragmentation

	        Builds a trace result for the prompt and stores its continuity
	        score, broken paths and orphaned nodes on the instance. All path,
	        node and score data is currently simulated; no model is queried.

	        Args:
	            prompt: The prompt to trace
	            collapse_vector: Optional vector to guide collapse in a specific
	                direction; when falsy, a default reflect.trace vector is
	                recorded in the result instead

	        Returns:
	            Dictionary containing trace results and fragmentation metrics
	            (keys: "prompt", "collapse_vector", "attribution_paths",
	            "broken_paths", "orphaned_nodes", "continuity_score")
	        """
	        logger.info("Tracing attribution path for: %s...", prompt[:50])

	        # Default implementation for demonstration
	        # In a real implementation, this would use model-specific tracing
	        trace_results = {
	            "prompt": prompt,
	            "collapse_vector": collapse_vector or ".p/reflect.trace{target=reasoning, validate=true}",
	            "attribution_paths": self._simulate_attribution_paths(),
	            "broken_paths": self._simulate_broken_paths(),
	            "orphaned_nodes": self._simulate_orphaned_nodes(),
	            "continuity_score": np.random.uniform(0.4, 0.9),  # Simulated score
	        }

	        # Update instance state
	        self.continuity_score = trace_results["continuity_score"]
	        self.broken_paths = trace_results["broken_paths"]
	        self.orphaned_nodes = trace_results["orphaned_nodes"]

	        return trace_results

	    def induce_collapse(
	        self,
	        prompt: str,
	        collapse_direction: str
	    ) -> Dict[str, Any]:
	        """
	        ✰ COLLAPSE: Induce circuit fragmentation collapse along a specific direction

	        Builds a reflect.trace collapse vector focused on
	        ``collapse_direction``, runs ``trace`` with it, and marks the shell
	        as collapsed.

	        Args:
	            prompt: Base prompt to send to the model
	            collapse_direction: Direction to bias the fragmentation (e.g., "logical", "causal")

	        Returns:
	            Dictionary containing collapse results and fragmentation metrics
	            (keys: "prompt", "collapse_direction", "collapse_vector",
	            "continuity_score", "broken_paths", "orphaned_nodes")
	        """
	        logger.info("Inducing circuit fragmentation in direction: %s", collapse_direction)

	        # Construct collapse vector based on direction
	        collapse_vector = f".p/reflect.trace{{target=reasoning, validate=true, focus={collapse_direction}}}"

	        # Trace with the collapse vector (also refreshes instance state)
	        trace_results = self.trace(prompt, collapse_vector)

	        # Set collapse state
	        self.collapse_state = "collapsed"

	        return {
	            "prompt": prompt,
	            "collapse_direction": collapse_direction,
	            "collapse_vector": collapse_vector,
	            "continuity_score": trace_results["continuity_score"],
	            "broken_paths": trace_results["broken_paths"],
	            "orphaned_nodes": trace_results["orphaned_nodes"],
	        }

	    def reconstruct_paths(self) -> Dict[str, Any]:
	        """
	        △ OBSERVE: Attempt to reconstruct broken attribution paths

	        Produces one reconstruction record per entry in
	        ``self.broken_paths``. The reconnected-node counts and confidences
	        are currently simulated with ``np.random``.

	        Returns:
	            Dictionary containing reconstruction results:
	                - "reconstructed_paths": list of records with keys
	                  "original_path", "reconnected_nodes", "confidence"
	                - "reconstruction_confidence": mean confidence over the
	                  records, or 0.0 when there are no broken paths
	                - "remaining_orphans": orphaned-node count minus the total
	                  reconnected nodes, floored at 0
	        """
	        logger.info("Attempting to reconstruct broken attribution paths...")

	        # In a real implementation, this would use graph algorithms
	        # to reconnect orphaned nodes based on semantic similarity
	        reconstructed_paths = [
	            {
	                "original_path": path,
	                "reconnected_nodes": np.random.randint(1, 5),
	                "confidence": np.random.uniform(0.6, 0.9),
	            }
	            for path in self.broken_paths
	        ]

	        # np.mean([]) evaluates to nan and emits a RuntimeWarning, which is
	        # what happened when this was called before any trace/process run.
	        if reconstructed_paths:
	            reconstruction_confidence = np.mean(
	                [record["confidence"] for record in reconstructed_paths]
	            )
	        else:
	            reconstruction_confidence = 0.0

	        total_reconnected = sum(
	            record["reconnected_nodes"] for record in reconstructed_paths
	        )

	        return {
	            "reconstructed_paths": reconstructed_paths,
	            "reconstruction_confidence": reconstruction_confidence,
	            "remaining_orphans": max(0, len(self.orphaned_nodes) - total_reconnected),
	        }

	    def _construct_fragment_sensitive_prompt(
	        self,
	        prompt: str,
	        collapse_vector: Optional[str] = None
	    ) -> str:
	        """
	        Construct a prompt that exposes circuit fragmentation.

	        Args:
	            prompt: The user prompt to wrap.
	            collapse_vector: Optional text appended after a blank line.

	        Returns:
	            The prompt prefixed with a step-by-step reasoning instruction,
	            followed by the collapse vector when one is given.
	        """
	        # Add reasoning elicitation to expose fragments
	        reasoning_prompt = f"Please think through this step by step, showing your complete reasoning chain: {prompt}"

	        # Add collapse vector if provided
	        if collapse_vector:
	            reasoning_prompt += f"\n\n{collapse_vector}"

	        return reasoning_prompt

	    def _query_model(self, model_interface: Any, prompt: str) -> str:
	        """
	        Send a query to the model and return the response.

	        Placeholder: ``model_interface`` is not used; the return value is a
	        fixed string echoing the first 30 characters of ``prompt``.
	        """
	        # This would actually call the model API
	        # For now, returning a placeholder
	        return f"Response to: {prompt[:30]}..."

	    def _query_model_state(self, model_interface: Any) -> Dict[str, Any]:
	        """
	        Capture the current internal state of the model.

	        Placeholder: ``model_interface`` is not used; the snapshot holds the
	        current time and random arrays of shape (12, 12) and (1, 12, 768).
	        """
	        # This would capture attention weights, hidden states, etc.
	        # For now, returning a placeholder
	        return {
	            "timestamp": np.datetime64('now'),
	            "attention_weights": np.random.random((12, 12)),  # Placeholder
	            "hidden_states": np.random.random((1, 12, 768)),  # Placeholder
	        }

	    def _analyze_fragmentation(
	        self,
	        pre_state: Dict[str, Any],
	        post_state: Dict[str, Any],
	        response: str
	    ) -> Dict[str, Any]:
	        """
	        Analyze circuit fragmentation between pre and post states.

	        Args:
	            pre_state: Snapshot taken before the model query; its
	                "attention_weights" entry is used (empty array if absent).
	            post_state: Snapshot taken after the model query; same usage.
	            response: Model response (currently unused).

	        Returns:
	            Dictionary with "continuity_score", "broken_paths",
	            "orphaned_nodes" and "fragmentation_ratio"
	            (1.0 - continuity_score).
	        """
	        # This would use attribution analysis to find fragmentation
	        # For now, broken paths and orphaned nodes use simulated data

	        # Continuity between the two attention snapshots.
	        # NOTE(review): presumably returns a float in [0, 1] - confirm
	        # against utils.attribution_metrics.
	        continuity_score = measure_path_continuity(
	            pre_state.get("attention_weights", np.array([])),
	            post_state.get("attention_weights", np.array([]))
	        )

	        # Simulate finding broken paths
	        broken_paths = self._simulate_broken_paths()

	        # Simulate finding orphaned nodes
	        orphaned_nodes = self._simulate_orphaned_nodes()

	        return {
	            "continuity_score": continuity_score,
	            "broken_paths": broken_paths,
	            "orphaned_nodes": orphaned_nodes,
	            "fragmentation_ratio": 1.0 - continuity_score,
	        }

	    def _simulate_attribution_paths(self) -> List[Dict[str, Any]]:
	        """Simulate five attribution paths for demonstration purposes."""
	        # In a real implementation, these would be extracted from the model
	        return [
	            {
	                "path_id": f"path_{i}",
	                "source_token": f"token_{i*2}",
	                "sink_token": f"token_{i*2 + 5}",
	                "attention_heads": [np.random.randint(0, 12) for _ in range(3)],
	                "path_strength": np.random.uniform(0.3, 0.9),
	            }
	            for i in range(5)
	        ]

	    def _simulate_broken_paths(self) -> List[Dict[str, Any]]:
	        """Simulate two broken paths for demonstration purposes."""
	        # In a real implementation, these would be detected from the model
	        return [
	            {
	                "path_id": f"broken_{i}",
	                "break_point": f"layer_{np.random.randint(1, 12)}",
	                "upstream_token": f"token_{np.random.randint(0, 10)}",
	                "downstream_token": f"token_{np.random.randint(11, 20)}",
	                "severity": np.random.uniform(0.5, 1.0),
	            }
	            for i in range(2)
	        ]

	    def _simulate_orphaned_nodes(self) -> List[Dict[str, Any]]:
	        """Simulate three orphaned nodes for demonstration purposes."""
	        # In a real implementation, these would be detected from the model
	        return [
	            {
	                "node_id": f"orphan_{i}",
	                "token": f"token_{np.random.randint(0, 20)}",
	                "activation": np.random.uniform(0.3, 0.8),
	                "expected_connections": np.random.randint(1, 4),
	                "isolation_score": np.random.uniform(0.6, 1.0),
	            }
	            for i in range(3)
	        ]