theapemachine
/

cortex

Model card Files Files and versions

xet

Community

theapemachine committed 27 days ago

Commit

82900ee

verified ·

1 Parent(s): ba8ab4a

Add cortex/steering_vector.py

Browse files

Files changed (1) hide show

cortex/steering_vector.py +184 -0

cortex/steering_vector.py ADDED Viewed

	@@ -0,0 +1,184 @@

+"""
+SteeringVector: Activation-space behavioral control.
+Inspired by Representation Engineering (Zou et al. 2023) and LoRRA.
+Architecture:
+    - Maintains a set of named "concept directions" in activation space
+    - Each direction is a vector in R^D extracted via contrastive activation pairs
+    - At inference time, directions are added to the residual stream with learnable weights
+    - Directions can be extracted, composed, and interpolated
+Failure mode addressed:
+    - Behavioral inflexibility: models have fixed behaviors baked in during training.
+      Steering vectors allow runtime control without retraining.
+    - Safety/alignment: can steer toward/away from toxicity, bias, refusal behaviors
+    - Persona control: steer toward specific communication styles
+    - Truthfulness: steer toward directions associated with factual vs confabulated outputs
+Injection point: RESIDUAL_STREAM
+    - Rationale: The residual stream is the "information highway" of the transformer.
+      Additive modifications here have the most direct effect on all downstream layers.
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Union, List, Dict, Tuple
+from cortex.core import CortexModule, InjectionPoint
+class SteeringVector(CortexModule):
+    """
+    Adds learned/extracted direction vectors to the residual stream.
+    Supports two modes:
+    1. Extracted: Directions from contrastive activation analysis (RepE-style)
+    2. Learned: Directions trained end-to-end from task data
+    Multiple named directions can be composed with individual weights.
+    Args:
+        hidden_dim: Model hidden dimension
+        num_directions: Number of independent steering directions
+        direction_names: Optional names for each direction
+        alpha_init: Initial steering strength (learnable)
+        normalize: Whether to L2-normalize directions
+    """
+    def __init__(
+        self,
+        hidden_dim: int,
+        num_directions: int = 4,
+        direction_names: Optional[List[str]] = None,
+        alpha_init: float = 0.0,
+        normalize: bool = True,
+        target_layers: Union[List[int], str] = "middle",
+    ):
+        super().__init__(InjectionPoint.RESIDUAL_STREAM, target_layers)
+        self.hidden_dim = hidden_dim
+        self.num_directions = num_directions
+        self.normalize = normalize
+        if direction_names is None:
+            direction_names = [f"direction_{i}" for i in range(num_directions)]
+        self.direction_names = direction_names
+        # Learnable direction vectors
+        self.directions = nn.Parameter(torch.randn(num_directions, hidden_dim) * 0.02)
+        # Per-direction steering strength (learnable)
+        self.alphas = nn.Parameter(torch.full((num_directions,), alpha_init))
+        # Per-layer scaling factor
+        self.layer_scale = nn.Parameter(torch.ones(1))
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        layer_idx: int,
+        **kwargs
+    ) -> torch.Tensor:
+        """
+        Add weighted steering vectors to the residual stream.
+        h_new = h + layer_scale * Σ_i (alpha_i * direction_i)
+        """
+        if self.normalize:
+            directions = F.normalize(self.directions, dim=-1)
+        else:
+            directions = self.directions
+        weighted_dirs = (self.alphas.unsqueeze(-1) * directions).sum(dim=0)  # [D]
+        weighted_dirs = self.layer_scale * weighted_dirs
+        return hidden_states + weighted_dirs.unsqueeze(0).unsqueeze(0)
+    def set_direction(self, name_or_idx: Union[str, int], direction: torch.Tensor, alpha: float = 1.0):
+        """
+        Set a steering direction from an externally computed vector.
+        Args:
+            name_or_idx: Direction name or index
+            direction: Direction vector [hidden_dim]
+            alpha: Steering strength
+        """
+        if isinstance(name_or_idx, str):
+            idx = self.direction_names.index(name_or_idx)
+        else:
+            idx = name_or_idx
+        with torch.no_grad():
+            self.directions.data[idx] = direction
+            self.alphas.data[idx] = alpha
+    @staticmethod
+    def extract_direction(
+        model: nn.Module,
+        positive_prompts: List[str],
+        negative_prompts: List[str],
+        tokenizer,
+        layer_idx: int,
+        device: str = "cuda"
+    ) -> torch.Tensor:
+        """
+        Extract a steering direction via contrastive activation analysis.
+        Following RepE (Zou et al. 2023):
+        1. Run positive prompts through the model, collect last-token activations at layer_idx
+        2. Run negative prompts through the model, collect last-token activations
+        3. Compute the difference: direction = mean(positive) - mean(negative)
+        4. Optionally refine via PCA on the contrastive pairs
+        Args:
+            model: The LLM
+            positive_prompts: Prompts exemplifying the desired behavior
+            negative_prompts: Prompts exemplifying the undesired behavior
+            tokenizer: Model's tokenizer
+            layer_idx: Which layer to extract from
+            device: Device
+        Returns:
+            direction: [hidden_dim] steering direction vector
+        """
+        model.eval()
+        def get_activations(prompts):
+            activations = []
+            for prompt in prompts:
+                inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
+                with torch.no_grad():
+                    outputs = model(**inputs, output_hidden_states=True)
+                hidden = outputs.hidden_states[layer_idx]  # [1, T, D]
+                last_token = hidden[:, -1, :]  # [1, D]
+                activations.append(last_token)
+            return torch.cat(activations, dim=0)  # [N, D]
+        pos_acts = get_activations(positive_prompts)
+        neg_acts = get_activations(negative_prompts)
+        direction = pos_acts.mean(dim=0) - neg_acts.mean(dim=0)
+        # PCA refinement for robust direction extraction
+        if len(positive_prompts) >= 4:
+            diffs = pos_acts - neg_acts  # [N, D]
+            diffs = diffs - diffs.mean(dim=0)  # Center
+            U, S, Vt = torch.linalg.svd(diffs, full_matrices=False)
+            direction = Vt[0]  # First principal component
+        return direction.detach()
+    def get_direction_info(self) -> Dict[str, Tuple[float, torch.Tensor]]:
+        """Get all direction names, their alphas, and norms."""
+        info = {}
+        for i, name in enumerate(self.direction_names):
+            info[name] = {
+                "alpha": self.alphas[i].item(),
+                "norm": self.directions[i].norm().item(),
+            }
+        return info
+    def extra_repr(self):
+        return (f"hidden_dim={self.hidden_dim}, num_directions={self.num_directions}, "
+                f"names={self.direction_names}, {super().extra_repr()}")