Spaces:
Running
Running
| """M3 — PCADir: PC1 of the concatenated [H_pos; H_neg] activation matrix.""" | |
| import numpy as np | |
| from sklearn.decomposition import PCA | |
| from src.methods.base import SteeringMethod | |
class PCADir(SteeringMethod):
    """PCADir — first principal component of the pooled contrastive activations.

    PCA is fitted on the positive and negative activations stacked into a
    single matrix; the resulting first component is sign-oriented so that it
    points from the negative mean toward the positive mean.
    """

    def name(self) -> str:
        """Return the display name of this method."""
        return "PCADir"

    def method_id(self) -> str:
        """Return the short identifier of this method."""
        return "M3"

    def extract_vector(
        self,
        h_pos: np.ndarray,
        h_neg: np.ndarray,
        **kwargs,
    ) -> np.ndarray:
        """Compute PC1 of the concatenated [H_pos; H_neg] activation matrix.

        PC1 is the direction of maximum variance of the pooled (mean-centred)
        activations. It lines up with the axis separating the positive and
        negative distributions only when the between-class difference is the
        dominant source of variance; otherwise it may capture an unrelated
        high-variance direction.

        Args:
            h_pos: (N_pos, d) positive activations.
            h_neg: (N_neg, d) negative activations. The number of rows may
                differ from ``h_pos``; the feature dimension ``d`` must match.
            **kwargs: Not used by this method.

        Returns:
            (d,) first principal component direction (unit norm), oriented so
            that its dot product with ``mean(h_pos) - mean(h_neg)`` is
            non-negative.

        Raises:
            ValueError: If either input is not 2-D, the feature dimensions
                differ, or either input has no rows.
        """
        h_pos = np.asarray(h_pos)
        h_neg = np.asarray(h_neg)

        if h_pos.ndim != 2 or h_neg.ndim != 2:
            raise ValueError(
                "h_pos and h_neg must be 2-D (N, d) arrays; got shapes "
                f"{h_pos.shape} and {h_neg.shape}"
            )
        if h_pos.shape[1] != h_neg.shape[1]:
            raise ValueError(
                "h_pos and h_neg must share the feature dimension; got "
                f"{h_pos.shape[1]} and {h_neg.shape[1]}"
            )
        # An empty class would make the mean difference NaN, which silently
        # disables the sign orientation below.
        if h_pos.shape[0] == 0 or h_neg.shape[0] == 0:
            raise ValueError("h_pos and h_neg must each contain at least one row")

        # Pool both classes and take the top principal component.
        stacked = np.concatenate([h_pos, h_neg], axis=0)  # (N_pos + N_neg, d)
        # scikit-learn's default solver selection can pick the randomized SVD
        # for large matrices; a fixed seed keeps the extracted direction
        # reproducible across runs.
        pca = PCA(n_components=1, random_state=0)
        pca.fit(stacked)
        direction = pca.components_[0]  # (d,)

        # The sign of a principal component is arbitrary: orient it so it
        # points from the negative mean toward the positive mean.
        mean_diff = h_pos.mean(axis=0) - h_neg.mean(axis=0)
        if np.dot(direction, mean_diff) < 0:
            direction = -direction
        return direction