"""M3 — PCADir: PC1 of (H_pos - H_neg) activation matrix.""" import numpy as np from sklearn.decomposition import PCA from src.methods.base import SteeringMethod class PCADir(SteeringMethod): """PCADir — First principal component of contrastive activations.""" @property def name(self) -> str: return "PCADir" @property def method_id(self) -> str: return "M3" def extract_vector( self, h_pos: np.ndarray, h_neg: np.ndarray, **kwargs, ) -> np.ndarray: """Compute PC1 of the concatenated [H_pos; H_neg] activation matrix. This captures the direction of maximum variance, which corresponds to the axis separating the positive and negative distributions. Args: h_pos: (N, d) positive activations h_neg: (N, d) negative activations Returns: (d,) first principal component direction (unit norm) """ # Concatenate and find PC1 of the combined activations H = np.concatenate([h_pos, h_neg], axis=0) # (2N, d) pca = PCA(n_components=1) pca.fit(H) v = pca.components_[0] # (d,) # Orient: ensure positive dot product with mean diff mean_diff = h_pos.mean(axis=0) - h_neg.mean(axis=0) if np.dot(v, mean_diff) < 0: v = -v return v