Spaces:
Running
Running
| """M5 — LinearProbe: Logistic regression weight vector as steering direction.""" | |
| import numpy as np | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.preprocessing import StandardScaler | |
| from src.methods.base import SteeringMethod | |
class LinearProbe(SteeringMethod):
    """Steering method M5: direction taken from a logistic-regression probe.

    A binary logistic regression is fit to separate positive from negative
    activations. Its weight vector, expressed in the original (unscaled)
    feature coordinates and length-normalized, is the steering direction.
    """

    def name(self) -> str:
        """Return the display name of this method."""
        return "LinearProbe"

    def method_id(self) -> str:
        """Return the short identifier of this method."""
        return "M5"

    def extract_vector(
        self,
        h_pos: np.ndarray,
        h_neg: np.ndarray,
        **kwargs,
    ) -> np.ndarray:
        """Fit a logistic-regression probe and return its weight direction.

        Args:
            h_pos: (N_pos, d) positive activations, labelled 1.
            h_neg: (N_neg, d) negative activations, labelled 0.
            **kwargs: Optional hyper-parameters. Only ``C`` (default 1.0)
                and ``max_iter`` (default 5000) are read; both are passed
                to ``LogisticRegression``. Any other key is ignored.

        Returns:
            (d,) weight vector in the original feature space, divided by
            its Euclidean norm plus 1e-8.
        """
        # Positives first, then negatives; the labels below follow this order.
        features = np.concatenate([h_pos, h_neg], axis=0)
        n_pos, n_neg = len(h_pos), len(h_neg)
        labels = np.zeros(n_pos + n_neg)
        labels[:n_pos] = 1.0

        # Standardize each feature before fitting the probe.
        standardizer = StandardScaler()
        features_std = standardizer.fit_transform(features)

        probe = LogisticRegression(
            C=kwargs.get("C", 1.0),
            max_iter=kwargs.get("max_iter", 5000),
            solver="lbfgs",
        )
        probe.fit(features_std, labels)

        # The probe was trained on standardized features; dividing by the
        # per-feature scale expresses the weights in the original space.
        direction = probe.coef_[0] / standardizer.scale_

        # The small epsilon guards against division by a zero norm.
        magnitude = np.linalg.norm(direction)
        return direction / (magnitude + 1e-8)