"""M5 — LinearProbe: Logistic regression weight vector as steering direction.""" import numpy as np from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import StandardScaler from src.methods.base import SteeringMethod class LinearProbe(SteeringMethod): """LinearProbe — Logistic regression weight vector.""" @property def name(self) -> str: return "LinearProbe" @property def method_id(self) -> str: return "M5" def extract_vector( self, h_pos: np.ndarray, h_neg: np.ndarray, **kwargs, ) -> np.ndarray: """Compute logistic regression weight vector. Args: h_pos: (N_pos, d) positive activations h_neg: (N_neg, d) negative activations Returns: (d,) weight vector direction """ X = np.concatenate([h_pos, h_neg], axis=0) y = np.concatenate([ np.ones(len(h_pos)), np.zeros(len(h_neg)), ]) scaler = StandardScaler() X_scaled = scaler.fit_transform(X) C = kwargs.get("C", 1.0) max_iter = kwargs.get("max_iter", 5000) lr = LogisticRegression(C=C, max_iter=max_iter, solver="lbfgs") lr.fit(X_scaled, y) # Get weight vector in original space w = lr.coef_[0] / scaler.scale_ w = w / (np.linalg.norm(w) + 1e-8) return w