# stylsteer-vlm / src / methods / diffmean.py
# abka03's picture
# Deploy StyleSteer-VLM demo
# e6f24ae verified
"""M2 — DiffMean: Contrastive Activation Addition (CAA).
Steering vector = mean(H_pos) - mean(H_neg)
The simplest and most widely-used training-free steering method.
"""
from typing import Optional
import numpy as np
from src.methods.base import SteeringMethod
class DiffMean(SteeringMethod):
    """DiffMean — Contrastive Activation Addition (CAA).

    The steering vector is the per-feature mean of the positive activations
    minus the per-feature mean of the negative activations.
    """

    @property
    def name(self) -> str:
        """Return the display name of this method, ``"DiffMean"``."""
        return "DiffMean"

    @property
    def method_id(self) -> str:
        """Return the short identifier of this method, ``"M2"``."""
        return "M2"

    def extract_vector(
        self,
        h_pos: np.ndarray,
        h_neg: np.ndarray,
        **kwargs,
    ) -> np.ndarray:
        """Compute the steering vector as mean(H_pos) - mean(H_neg).

        Each input is averaged over its first axis independently, so the two
        arrays may contain different numbers of samples; only their feature
        dimension (axis 1) has to agree.

        Args:
            h_pos: (N_pos, d) positive activations, with N_pos >= 1.
            h_neg: (N_neg, d) negative activations, with N_neg >= 1.
            **kwargs: Accepted and ignored by this method.

        Returns:
            (d,) steering direction for 2-D inputs.

        Raises:
            ValueError: If either input has fewer than two dimensions, has
                no samples along its first axis, or if the feature
                dimensions of the two inputs differ.
        """
        # Explicit raises instead of `assert`: assertions are stripped when
        # Python runs with -O, which would let malformed inputs through.
        for label, acts in (("h_pos", h_pos), ("h_neg", h_neg)):
            if len(acts.shape) < 2:
                raise ValueError(
                    f"{label} must have at least 2 dimensions (N, d), "
                    f"got shape {acts.shape}"
                )
            if acts.shape[0] == 0:
                # The mean over zero samples is NaN, which would silently
                # yield an unusable steering vector.
                raise ValueError(
                    f"{label} contains no samples, got shape {acts.shape}"
                )

        if h_pos.shape[1] != h_neg.shape[1]:
            raise ValueError(
                f"Dimension mismatch: h_pos={h_pos.shape}, h_neg={h_neg.shape}"
            )

        return h_pos.mean(axis=0) - h_neg.mean(axis=0)