Spaces:
Running
Running
File size: 1,070 Bytes
"""M2 — DiffMean: Contrastive Activation Addition (CAA).
Steering vector = mean(H_pos) - mean(H_neg)
The simplest and most widely-used training-free steering method.
"""
from typing import Optional
import numpy as np
from src.methods.base import SteeringMethod
class DiffMean(SteeringMethod):
    """DiffMean — Contrastive Activation Addition (CAA).

    The steering vector is the difference between the mean positive
    activation and the mean negative activation; no training is involved.
    """

    @property
    def name(self) -> str:
        """Return the display name of this method."""
        return "DiffMean"

    @property
    def method_id(self) -> str:
        """Return the short identifier of this method."""
        return "M2"

    def extract_vector(
        self,
        h_pos: np.ndarray,
        h_neg: np.ndarray,
        **kwargs,
    ) -> np.ndarray:
        """Compute the steering vector as mean(h_pos) - mean(h_neg).

        Args:
            h_pos: (N_pos, d) positive activations.
            h_neg: (N_neg, d) negative activations. Only the feature
                dimension has to match ``h_pos``; the number of rows may
                differ.
            **kwargs: Accepted for signature compatibility; not used here.

        Returns:
            (d,) steering direction.

        Raises:
            ValueError: If either input has fewer than two dimensions, has
                no rows, or if the feature dimensions differ.
        """
        # Explicit raises instead of ``assert``: assertions are removed when
        # Python runs with -O, which would let bad shapes through silently.
        for label, acts in (("h_pos", h_pos), ("h_neg", h_neg)):
            if acts.ndim < 2:
                raise ValueError(
                    f"{label} must have shape (N, d), got shape {acts.shape}"
                )

        if h_pos.shape[1] != h_neg.shape[1]:
            raise ValueError(
                f"Dimension mismatch: h_pos={h_pos.shape}, h_neg={h_neg.shape}"
            )

        # The mean over zero rows is NaN; fail loudly rather than return an
        # all-NaN steering vector.
        for label, acts in (("h_pos", h_pos), ("h_neg", h_neg)):
            if acts.shape[0] == 0:
                raise ValueError(
                    f"{label} contains no activations (shape {acts.shape})"
                )

        return h_pos.mean(axis=0) - h_neg.mean(axis=0)
|