"""
shell_base.py - Base class for symbolic interpretability shells

△ OBSERVE: Shells are symbolic structures that trace and induce classifier collapse
∞ TRACE: Each shell encapsulates a specific collapse pattern and attribution signature
✰ COLLAPSE: Shells deliberately induce collapse to extract ghost circuits and residue

Interpretability shells provide standardized interfaces for inducing, observing,
and analyzing specific forms of classifier collapse. Each shell targets a particular
failure mode or attribution pattern, allowing for systematic exploration of model behavior.

Author: Recursion Labs
License: MIT
"""
| |
|
| | import logging |
| | from abc import ABC, abstractmethod |
| | from typing import Dict, List, Optional, Union, Tuple, Any, Callable |
| | from dataclasses import dataclass, field |
| |
|
| | from ..utils.constants import SHELL_REGISTRY |
| |
|
| | logger = logging.getLogger(__name__) |
| |
|
@dataclass
class ShellMetadata:
    """
    △ OBSERVE: Metadata container for shell identification and tracking

    Each shell carries metadata that identifies its purpose, classification schema,
    and relationship to other shells in the taxonomy.

    The three list-valued fields default to a fresh empty list per instance.
    """

    # Unique identifier for the shell (used as the registry key).
    shell_id: str
    # Shell version string.
    version: str
    # Human-readable shell name.
    name: str
    # Free-text description of the shell's purpose.
    description: str
    # Characteristic failure pattern associated with the shell.
    failure_signature: str
    # Attribution domain the shell operates in.
    attribution_domain: str
    # Classification label in the QK/OV taxonomy.
    qk_ov_classification: str
    # IDs of related shells.
    related_shells: List[str] = field(default_factory=list)
    # Author names.
    authors: List[str] = field(default_factory=list)
    # Free-form tags for categorization.
    tags: List[str] = field(default_factory=list)

    def as_dict(self) -> Dict[str, Any]:
        """Return the metadata as a plain dictionary.

        Keys are the field names, in declaration order. The list-valued
        entries are shallow copies, so mutating the returned dictionary
        never alters this instance.

        Returns:
            Dictionary mapping each field name to its current value.
        """
        return {
            "shell_id": self.shell_id,
            "version": self.version,
            "name": self.name,
            "description": self.description,
            "failure_signature": self.failure_signature,
            "attribution_domain": self.attribution_domain,
            "qk_ov_classification": self.qk_ov_classification,
            # Copy the lists: handing out the internal objects would let
            # callers mutate the metadata through the snapshot.
            "related_shells": list(self.related_shells),
            "authors": list(self.authors),
            "tags": list(self.tags),
        }
| |
|
| |
|
class BaseShell(ABC):
    """
    ∞ TRACE: Base class for all interpretability shells

    A shell is a symbolic structure that encapsulates a specific approach to
    observing and inducing classifier collapse. Each shell targets a particular
    failure mode or attribution pattern, providing a standardized interface
    for exploration and analysis.

    Shells are quantum observers - they don't just measure, they participate
    in the collapse phenomenon they observe.

    Subclasses must implement `_get_default_metadata`, `process`, `trace`
    and `induce_collapse`.
    """

    def __init__(self, metadata: Optional["ShellMetadata"] = None):
        """
        Initialize a shell with optional metadata.

        Args:
            metadata: Metadata describing the shell. When None, the value
                returned by `_get_default_metadata()` is used instead.
        """
        if metadata is None:
            metadata = self._get_default_metadata()
        self.metadata = metadata

        # Build the observable state *before* registering, so the registry
        # never receives a partially initialized shell.
        self.collapse_state = "superposition"
        self.observation_history: List[Any] = []
        self.ghost_circuits: List[Dict[str, Any]] = []

        self._register_shell()

        logger.info(
            "Shell initialized: %s (v%s)",
            self.metadata.name,
            self.metadata.version,
        )

    @abstractmethod
    def _get_default_metadata(self) -> "ShellMetadata":
        """Return default metadata for this shell implementation."""
        pass

    def _register_shell(self) -> None:
        """Register this shell in the global registry.

        Registration is skipped when the registry is None or does not expose
        a `register` attribute. The shell is registered under
        `self.metadata.shell_id`.
        """
        if SHELL_REGISTRY is not None and hasattr(SHELL_REGISTRY, "register"):
            SHELL_REGISTRY.register(self.metadata.shell_id, self)

    @abstractmethod
    def process(
        self,
        prompt: str,
        model_interface: Any,
        collapse_vector: Optional[str] = None
    ) -> Tuple[str, Dict[str, Any]]:
        """
        △ OBSERVE: Process a prompt through this shell

        This is the main entry point for shell processing. It takes a prompt,
        processes it according to the shell's specific collapse induction and
        observation strategy, and returns the result along with state updates.

        Args:
            prompt: The prompt to process
            model_interface: Interface to the model being observed
            collapse_vector: Optional vector to guide collapse in a specific direction

        Returns:
            Tuple containing:
                - Response string
                - Dictionary of state updates for tracking
        """
        pass

    @abstractmethod
    def trace(
        self,
        prompt: str,
        collapse_vector: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        ∞ TRACE: Trace the attribution path through this shell

        This method traces the causal attribution path from input to output
        through the shell's specific lens, capturing the collapse transition.

        Args:
            prompt: The prompt to trace
            collapse_vector: Optional vector to guide collapse in a specific direction

        Returns:
            Dictionary containing the trace results
        """
        pass

    @abstractmethod
    def induce_collapse(
        self,
        prompt: str,
        collapse_direction: str
    ) -> Dict[str, Any]:
        """
        ✰ COLLAPSE: Deliberately induce collapse along a specific direction

        This method attempts to collapse the model's state in a specific direction
        by crafting a query that targets a particular decision boundary.

        Args:
            prompt: Base prompt to send to the model
            collapse_direction: Direction to bias the collapse (e.g., "ethical", "creative")

        Returns:
            Dictionary containing the collapse results
        """
        pass

    def extract_ghost_circuits(self, pre_state: Dict[str, Any], post_state: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        ∞ TRACE: Extract ghost circuits from pre and post collapse states

        Ghost circuits are residual activation patterns that persist after collapse
        but don't contribute to the final output - they represent the "memory" of
        paths not taken.

        Concretely, the "attention_weights" entries of both states are compared
        element by element. Every position whose post-collapse value is
        strictly between 0 and its pre-collapse value is reported. Nothing is
        reported unless both states carry "attention_weights" and both values
        expose a `shape` attribute. When the two operands differ in length,
        the comparison stops at the shorter one.

        The result also replaces `self.ghost_circuits`.

        Args:
            pre_state: Model state before collapse
            post_state: Model state after collapse

        Returns:
            List of detected ghost circuits. Each entry holds "type",
            "head_idx" (outer index), "token_idx" (inner index), "pre_value",
            "post_value" and "decay_ratio" (post / pre).
        """
        ghost_circuits: List[Dict[str, Any]] = []

        if "attention_weights" in pre_state and "attention_weights" in post_state:
            pre_weights = pre_state["attention_weights"]
            post_weights = post_state["attention_weights"]

            # Only array-like objects (anything exposing `.shape`) are analyzed.
            if hasattr(pre_weights, "shape") and hasattr(post_weights, "shape"):
                # zip() truncates to the shorter operand on both axes.
                for head_idx, (pre_row, post_row) in enumerate(zip(pre_weights, post_weights)):
                    for token_idx, (pre_value, post_value) in enumerate(zip(pre_row, post_row)):
                        # Decayed but still non-zero: 0 < post < pre also
                        # guarantees pre > 0, so the division below is safe.
                        if 0 < post_value < pre_value:
                            ghost_circuits.append({
                                "type": "attention_ghost",
                                "head_idx": head_idx,
                                "token_idx": token_idx,
                                "pre_value": float(pre_value),
                                "post_value": float(post_value),
                                "decay_ratio": float(post_value / pre_value),
                            })

        self.ghost_circuits = ghost_circuits
        return ghost_circuits

    def visualize(self, mode: str = "attribution_graph") -> Any:
        """Generate visualization of the shell's operation based on requested mode.

        Args:
            mode: Name of the requested visualization mode.

        Returns:
            Currently a descriptive placeholder string naming the shell and
            the requested mode.
        """
        return f"Visualization of {self.metadata.name} in {mode} mode"

    def __str__(self) -> str:
        """String representation of the shell."""
        return f"{self.metadata.name} (v{self.metadata.version}): {self.metadata.description}"

    def __repr__(self) -> str:
        """Detailed representation of the shell."""
        return f"<Shell id={self.metadata.shell_id} name={self.metadata.name} version={self.metadata.version}>"
| |
|
| |
|
class ShellDecorator:
    """
    △ OBSERVE: Decorator for adding shell metadata to implementations

    This decorator simplifies the process of creating new shells by
    building a ShellMetadata object and attaching it to the decorated class
    as its default metadata.

    Example:
        @ShellDecorator(
            shell_id="v07_CIRCUIT_FRAGMENT",
            name="Circuit Fragment Shell",
            description="Traces broken attribution paths in reasoning chains",
            failure_signature="Orphan nodes",
            attribution_domain="Circuit Fragmentation",
            qk_ov_classification="QK-COLLAPSE"
        )
        class CircuitFragmentShell(BaseShell):
            # Shell implementation
    """

    def __init__(
        self,
        shell_id: str,
        name: str,
        description: str,
        failure_signature: str,
        attribution_domain: str,
        qk_ov_classification: str,
        version: str = "0.1.0",
        related_shells: Optional[List[str]] = None,
        authors: Optional[List[str]] = None,
        tags: Optional[List[str]] = None
    ):
        """
        Initialize the shell decorator with metadata.

        Args:
            shell_id: Unique identifier for the shell (e.g., "v07_CIRCUIT_FRAGMENT")
            name: Human-readable name for the shell
            description: Detailed description of the shell's purpose
            failure_signature: Characteristic failure pattern this shell detects
            attribution_domain: Domain of attribution this shell operates in
            qk_ov_classification: Classification in the QK/OV taxonomy
            version: Shell version number
            related_shells: List of related shell IDs (empty when omitted)
            authors: List of author names (["Recursion Labs"] when omitted or empty)
            tags: List of tag strings for categorization (empty when omitted)
        """
        # Lists are copied so the stored metadata never aliases caller-owned lists.
        self.metadata = ShellMetadata(
            shell_id=shell_id,
            version=version,
            name=name,
            description=description,
            failure_signature=failure_signature,
            attribution_domain=attribution_domain,
            qk_ov_classification=qk_ov_classification,
            related_shells=list(related_shells) if related_shells else [],
            authors=list(authors) if authors else ["Recursion Labs"],
            tags=list(tags) if tags else [],
        )

    def __call__(self, cls):
        """Apply the decorator to a shell class.

        Stores the metadata on the class as `decorator_metadata` and installs
        a `_get_default_metadata` method that returns it. The same metadata
        object is shared by every instance of the class.

        Args:
            cls: The class to decorate.

        Returns:
            The same class object, modified in place.
        """

        def _get_default_metadata(instance):
            return instance.decorator_metadata

        cls.decorator_metadata = self.metadata
        cls._get_default_metadata = _get_default_metadata

        # The abstract-method set of an ABC is computed once, at class
        # creation. Assigning the method afterwards does not clear it, so
        # without this the class would still refuse to instantiate.
        pending = getattr(cls, "__abstractmethods__", None)
        if pending and "_get_default_metadata" in pending:
            cls.__abstractmethods__ = frozenset(pending) - {"_get_default_metadata"}

        logger.debug(f"Registered shell: {self.metadata.shell_id}")

        return cls
| |
|