"""M2 — DiffMean: Contrastive Activation Addition (CAA).

Steering vector = mean(H_pos) - mean(H_neg)

The simplest and most widely-used training-free steering method.
"""

from typing import Optional

import numpy as np

from src.methods.base import SteeringMethod


class DiffMean(SteeringMethod):
    """DiffMean — Contrastive Activation Addition.

    The steering vector is the difference between the mean of the positive
    activations and the mean of the negative activations.
    """

    @property
    def name(self) -> str:
        """Return the human-readable method name, ``"DiffMean"``."""
        return "DiffMean"

    @property
    def method_id(self) -> str:
        """Return the short method identifier, ``"M2"``."""
        return "M2"

    def extract_vector(
        self,
        h_pos: np.ndarray,
        h_neg: np.ndarray,
        **kwargs,
    ) -> np.ndarray:
        """Compute the steering vector as mean(H_pos) - mean(H_neg).

        Args:
            h_pos: (N_pos, d) positive activations.
            h_neg: (N_neg, d) negative activations. The number of rows may
                differ from ``h_pos``; only the feature dimension ``d``
                (axis 1) has to agree.
            **kwargs: Accepted and ignored by this method.

        Returns:
            (d,) steering direction: the mean over axis 0 of ``h_pos`` minus
            the mean over axis 0 of ``h_neg``.

        Raises:
            ValueError: If either input has fewer than two axes, or if the
                sizes of axis 1 differ.
        """
        # Guard the shape[1] lookups below: without this, a 1-D input fails
        # with an uninformative "tuple index out of range".
        if h_pos.ndim < 2 or h_neg.ndim < 2:
            raise ValueError(
                "Expected activations with at least 2 dimensions: "
                f"h_pos={h_pos.shape}, h_neg={h_neg.shape}"
            )

        # Explicit raise instead of `assert`: assertions are removed when
        # Python runs with -O, which would let mismatched inputs through.
        if h_pos.shape[1] != h_neg.shape[1]:
            raise ValueError(
                f"Dimension mismatch: h_pos={h_pos.shape}, h_neg={h_neg.shape}"
            )

        # Average over the sample axis, then take the contrast.
        return h_pos.mean(axis=0) - h_neg.mean(axis=0)