abka03's picture
Deploy StyleSteer-VLM demo
e6f24ae verified
"""M3 — PCADir: PC1 of the stacked [H_pos; H_neg] activation matrix."""
import numpy as np
from sklearn.decomposition import PCA
from src.methods.base import SteeringMethod
class PCADir(SteeringMethod):
    """Steering method M3: leading principal component of pooled activations.

    The positive and negative activation sets are stacked into a single
    matrix, a one-component PCA is fitted to it, and the resulting axis is
    sign-aligned with the difference of the two class means.
    """

    @property
    def name(self) -> str:
        """Human-readable name of this method."""
        return "PCADir"

    @property
    def method_id(self) -> str:
        """Short identifier of this method."""
        return "M3"

    def extract_vector(
        self,
        h_pos: np.ndarray,
        h_neg: np.ndarray,
        **kwargs,
    ) -> np.ndarray:
        """Return the sign-aligned first principal component of [h_pos; h_neg].

        PC1 is the direction of greatest variance in the pooled activations.
        PCA leaves the sign of that direction arbitrary, so it is flipped
        when needed to point from the negative mean toward the positive mean.

        Args:
            h_pos: (N, d) positive activations.
            h_neg: (N, d) negative activations.
            **kwargs: Accepted but not used by this method.

        Returns:
            (d,) first principal component direction (unit norm), having a
            non-negative dot product with mean(h_pos) - mean(h_neg).
        """
        # Reference direction used only to settle the sign of the component.
        centroid_gap = h_pos.mean(axis=0) - h_neg.mean(axis=0)

        # Stack both sets row-wise -> (2N, d), then fit a single component.
        pooled = np.concatenate([h_pos, h_neg], axis=0)
        direction = PCA(n_components=1).fit(pooled).components_[0]  # (d,)

        # Flip only when the component points against the mean difference;
        # a zero dot product leaves the component as fitted.
        return -direction if np.dot(direction, centroid_gap) < 0 else direction