Spaces:
Running
Running
| """M2 — DiffMean: Contrastive Activation Addition (CAA). | |
| Steering vector = mean(H_pos) - mean(H_neg) | |
| The simplest and most widely-used training-free steering method. | |
| """ | |
| from typing import Optional | |
| import numpy as np | |
| from src.methods.base import SteeringMethod | |
class DiffMean(SteeringMethod):
    """DiffMean — Contrastive Activation Addition (CAA).

    The steering vector is the mean of the positive activations minus the
    mean of the negative activations. No training is involved.
    """

    def name(self) -> str:
        """Return the display name of this method."""
        return "DiffMean"

    def method_id(self) -> str:
        """Return the short identifier of this method."""
        return "M2"

    def extract_vector(
        self,
        h_pos: np.ndarray,
        h_neg: np.ndarray,
        **kwargs,
    ) -> np.ndarray:
        """Compute steering vector as mean(H_pos) - mean(H_neg).

        Args:
            h_pos: (N_pos, d) positive activations.
            h_neg: (N_neg, d) negative activations. The number of rows may
                differ from ``h_pos``; only the feature dimension ``d``
                has to match.
            **kwargs: Not used by this method (presumably accepted for
                signature compatibility with other steering methods —
                confirm against ``SteeringMethod``).

        Returns:
            (d,) steering direction.

        Raises:
            ValueError: If either input has fewer than 2 dimensions, has
                zero rows, or the feature dimensions differ.
        """
        # Explicit raises instead of `assert`: assertions are removed under
        # `python -O`, after which a mismatch could broadcast silently
        # (e.g. (N, 1) against (N, d)) and yield a wrong vector.
        for label, acts in (("h_pos", h_pos), ("h_neg", h_neg)):
            if acts.ndim < 2:
                raise ValueError(
                    f"{label} must have shape (N, d), got shape {acts.shape}"
                )
            if acts.shape[0] == 0:
                # The mean over zero rows is NaN; fail loudly instead of
                # returning an all-NaN steering vector.
                raise ValueError(
                    f"{label} contains no samples, shape {acts.shape}"
                )

        if h_pos.shape[1] != h_neg.shape[1]:
            raise ValueError(
                f"Dimension mismatch: h_pos={h_pos.shape}, h_neg={h_neg.shape}"
            )

        return h_pos.mean(axis=0) - h_neg.mean(axis=0)