"""M2 — DiffMean: Contrastive Activation Addition (CAA).

Steering vector = mean(H_pos) - mean(H_neg)
The simplest and most widely-used training-free steering method.
"""

from typing import Optional

import numpy as np

from src.methods.base import SteeringMethod


class DiffMean(SteeringMethod):
    """DiffMean — Contrastive Activation Addition.

    The steering vector is the difference between the per-feature mean of the
    positive activations and the per-feature mean of the negative activations.
    """

    @property
    def name(self) -> str:
        """Human-readable method name."""
        return "DiffMean"

    @property
    def method_id(self) -> str:
        """Short identifier of this method."""
        return "M2"

    def extract_vector(
        self,
        h_pos: np.ndarray,
        h_neg: np.ndarray,
        **kwargs,
    ) -> np.ndarray:
        """Compute steering vector as mean(H_pos) - mean(H_neg).

        Args:
            h_pos: (N_pos, d) positive activations
            h_neg: (N_neg, d) negative activations; the number of rows may
                differ from ``h_pos``, only the feature dimension must match
            **kwargs: accepted and ignored by this method

        Returns:
            (d,) steering direction

        Raises:
            ValueError: if either input has fewer than two dimensions, or if
                the feature dimensions (axis 1) of the inputs differ.
        """
        # Explicit raises instead of ``assert``: assertions are removed under
        # ``python -O``, after which mismatched inputs such as (N, 1) vs
        # (N, d) would silently broadcast into a wrong vector.
        if h_pos.ndim < 2 or h_neg.ndim < 2:
            raise ValueError(
                "Expected activations with a sample axis and a feature axis: "
                f"h_pos={h_pos.shape}, h_neg={h_neg.shape}"
            )
        if h_pos.shape[1] != h_neg.shape[1]:
            raise ValueError(
                f"Dimension mismatch: h_pos={h_pos.shape}, h_neg={h_neg.shape}"
            )

        # Average over the sample axis of each set, then take the difference.
        v = h_pos.mean(axis=0) - h_neg.mean(axis=0)
        return v