Spaces:
Running
Running
File size: 1,380 Bytes
e6f24ae | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | """M3 — PCADir: PC1 of the concatenated [H_pos; H_neg] activation matrix."""
import numpy as np
from sklearn.decomposition import PCA
from src.methods.base import SteeringMethod
class PCADir(SteeringMethod):
    """PCADir — first principal component of the pooled pos/neg activations."""

    @property
    def name(self) -> str:
        """Return the display name of this method."""
        return "PCADir"

    @property
    def method_id(self) -> str:
        """Return the short identifier of this method."""
        return "M3"

    def extract_vector(
        self,
        h_pos: np.ndarray,
        h_neg: np.ndarray,
        **kwargs,
    ) -> np.ndarray:
        """Compute PC1 of the concatenated [H_pos; H_neg] activation matrix.

        PC1 is the direction of maximum variance of the pooled activations.
        It lines up with the axis separating the positive and negative
        distributions only when the between-class shift is the dominant
        source of variance; otherwise it may capture unrelated variation.

        Args:
            h_pos: (N_pos, d) positive activations.
            h_neg: (N_neg, d) negative activations. Only the feature
                dimension d has to match h_pos; the rows are stacked.
            **kwargs: Optional ``random_state`` (default 0) forwarded to
                scikit-learn's PCA. Any other keyword is ignored here.

        Returns:
            (d,) first principal component, as returned by scikit-learn
            (unit norm), sign-flipped if needed so that its dot product
            with ``mean(h_pos) - mean(h_neg)`` is non-negative.
        """
        # Stack both classes row-wise; PCA centres the pooled data itself.
        pooled = np.concatenate([h_pos, h_neg], axis=0)  # (N_pos + N_neg, d)

        # Seed the estimator: with the default svd_solver="auto",
        # scikit-learn may pick a randomized SVD for large inputs, which
        # would otherwise make the extracted vector vary between runs.
        pca = PCA(n_components=1, random_state=kwargs.get("random_state", 0))
        pca.fit(pooled)
        v = pca.components_[0]  # (d,)

        # The sign of a principal component is arbitrary; orient it so it
        # points from the negative mean towards the positive mean.
        mean_diff = h_pos.mean(axis=0) - h_neg.mean(axis=0)
        if np.dot(v, mean_diff) < 0:
            v = -v
        return v
|