File size: 1,070 Bytes
e6f24ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
"""M2 — DiffMean: Contrastive Activation Addition (CAA).

Steering vector = mean(H_pos) - mean(H_neg)
The simplest and most widely-used training-free steering method.
"""

from typing import Optional

import numpy as np

from src.methods.base import SteeringMethod


class DiffMean(SteeringMethod):
    """DiffMean — Contrastive Activation Addition.

    The steering vector is the mean of the positive activations minus the
    mean of the negative activations; nothing is fitted or trained.
    """

    @property
    def name(self) -> str:
        """Return the display name of this method."""
        return "DiffMean"

    @property
    def method_id(self) -> str:
        """Return the short identifier of this method."""
        return "M2"

    def extract_vector(
        self,
        h_pos: np.ndarray,
        h_neg: np.ndarray,
        **kwargs,
    ) -> np.ndarray:
        """Compute steering vector as mean(H_pos) - mean(H_neg).

        Only the size of axis 1 is compared between the two inputs; the
        number of rows is not checked and may differ.

        Args:
            h_pos: (N, d) positive activations
            h_neg: (N, d) negative activations
            **kwargs: Accepted for signature compatibility; not used here.

        Returns:
            (d,) steering direction

        Raises:
            AssertionError: If ``h_pos`` and ``h_neg`` differ in size along
                axis 1.
        """
        # Explicit raise instead of an `assert` statement: asserts are removed
        # under `python -O`, and without the check a width-1 input would
        # broadcast silently in the subtraction below and yield a wrong
        # vector. AssertionError is kept so existing callers see the same
        # exception type and message.
        if h_pos.shape[1] != h_neg.shape[1]:
            raise AssertionError(
                f"Dimension mismatch: h_pos={h_pos.shape}, h_neg={h_neg.shape}"
            )
        return h_pos.mean(axis=0) - h_neg.mean(axis=0)