Spaces:

BART-ender
/

siege

Sleeping

App Files Files Community

siege / interp_arena /model /hooks.py

BART-ender

Upload folder using huggingface_hub

433f30e verified about 1 month ago

raw

history blame contribute delete

6.86 kB

	"""Hook builders for TransformerLens — Red attack hooks and Blue defense hooks.

	TransformerLens hook naming convention:
	blocks.{L}.hook_resid_post → residual stream after layer L
	blocks.{L}.hook_resid_pre → residual stream before layer L
	blocks.{L}.attn.hook_attn_scores → attention logits (pre-softmax)
	blocks.{L}.hook_mlp_out → MLP output
	hook_logits → final logit tensor

	All hook functions follow the TransformerLens signature:
	hook_fn(value: Tensor, hook: HookPoint) -> Tensor
	"""

	from __future__ import annotations

	import re
	from typing import Callable, Optional

	import torch


	# ── Type alias ────────────────────────────────────────────────────────────────
	HookFn = Callable[[torch.Tensor, object], torch.Tensor]
	HookPair = tuple[str, HookFn] # (hook_name, hook_fn)


	# ── Hook name helpers ─────────────────────────────────────────────────────────

	def resid_post(layer: int) -> str:
	return f"blocks.{layer}.hook_resid_post"

	def resid_pre(layer: int) -> str:
	return f"blocks.{layer}.hook_resid_pre"

	def attn_scores(layer: int) -> str:
	return f"blocks.{layer}.attn.hook_attn_scores"

	def mlp_out(layer: int) -> str:
	return f"blocks.{layer}.hook_mlp_out"

	LOGITS_HOOK = "hook_logits"


	# ── Red: attack hooks ─────────────────────────────────────────────────────────

	def make_steer_hook(direction: torch.Tensor, strength: float) -> HookFn:
	"""Add `strength * direction` to every token's residual stream at this layer."""
	direction = _unit(direction)

	def hook(value: torch.Tensor, hook) -> torch.Tensor: # noqa: ANN001
	# value: (batch, seq, d_model)
	d = direction.to(value.device, value.dtype)
	return value + strength * d

	return hook


	def make_amplify_attn_hook(head: int, scale: float) -> HookFn:
	"""Scale the attention scores for a specific head (pre-softmax).

	scale > 1 → head attends more sharply
	scale < 1 → head attends more uniformly
	0 → head fully suppressed
	"""
	def hook(value: torch.Tensor, hook) -> torch.Tensor: # noqa: ANN001
	# value: (batch, heads, seq_q, seq_k)
	value = value.clone()
	value[:, head, :, :] *= scale
	return value

	return hook


	def make_patch_hook(position: int, patch_value: torch.Tensor) -> HookFn:
	"""Replace residual stream at position with patch_value."""
	def hook(value: torch.Tensor, hook) -> torch.Tensor: # noqa: ANN001
	# value: (batch, seq, d_model)
	p = patch_value.to(value.device, value.dtype)
	value = value.clone()
	value[:, position, :] = p
	return value

	return hook


	def make_logit_bias_hook(token_ids: list[int], bias: float) -> HookFn:
	"""Add bias to the final logits for token_ids (steers generation)."""
	def hook(value: torch.Tensor, hook) -> torch.Tensor: # noqa: ANN001
	# value: (batch, seq, vocab) or (batch, vocab)
	value = value.clone()
	value[..., token_ids] += bias
	return value

	return hook


	# ── Blue: defense hooks ───────────────────────────────────────────────────────

	def make_ablate_hook(direction: torch.Tensor) -> HookFn:
	"""Project direction out of the residual stream (orthogonal ablation)."""
	direction = _unit(direction)

	def hook(value: torch.Tensor, hook) -> torch.Tensor: # noqa: ANN001
	d = direction.to(value.device, value.dtype)
	# v - (v·d) * d for each token
	# value: (batch, seq, d_model)
	proj = (value @ d).unsqueeze(-1) * d # (batch, seq, d_model)
	return value - proj

	return hook


	def make_suppress_head_hook(head: int) -> HookFn:
	"""Zero out a specific attention head's contribution."""
	def hook(value: torch.Tensor, hook) -> torch.Tensor: # noqa: ANN001
	value = value.clone()
	value[:, head, :, :] = 0.0
	return value

	return hook


	def make_clamp_hook(clamp_min: float, clamp_max: float) -> HookFn:
	"""Clamp residual stream values to [clamp_min, clamp_max]."""
	def hook(value: torch.Tensor, hook) -> torch.Tensor: # noqa: ANN001
	return value.clamp(clamp_min, clamp_max)

	return hook


	def make_restore_hook(
	position: int,
	baseline_activation: torch.Tensor,
	) -> HookFn:
	"""Replace activation at position with the clean baseline."""
	def hook(value: torch.Tensor, hook) -> torch.Tensor: # noqa: ANN001
	b = baseline_activation.to(value.device, value.dtype)
	value = value.clone()
	value[:, position, :] = b
	return value

	return hook


	def make_logit_filter_hook(token_ids: list[int]) -> HookFn:
	"""Set logits for prohibited token ids to -inf."""
	def hook(value: torch.Tensor, hook) -> torch.Tensor: # noqa: ANN001
	value = value.clone()
	value[..., token_ids] = float("-inf")
	return value

	return hook


	# ── Detection utility ─────────────────────────────────────────────────────────

	def make_detection_hook(
	ref_direction: torch.Tensor,
	threshold: float,
	detected_layers: list[int],
	layer_id: int,
	) -> HookFn:
	"""Blue monitoring hook: records layer_id if steering is detected.

	Detects steering by measuring cosine similarity of the per-token residual
	shift relative to `ref_direction`. If mean cosine sim > threshold, the
	layer is flagged.
	"""
	ref = _unit(ref_direction)

	def hook(value: torch.Tensor, hook) -> torch.Tensor: # noqa: ANN001
	r = ref.to(value.device, value.dtype)
	# value: (batch, seq, d_model)
	norms = value.norm(dim=-1, keepdim=True).clamp(min=1e-8)
	cos_sim = (value / norms) @ r # (batch, seq)
	mean_cos = cos_sim.abs().mean().item()
	if mean_cos > threshold:
	detected_layers.append(layer_id)
	return value # no modification — pure observation

	return hook


	# ── Internals ─────────────────────────────────────────────────────────────────

	def _unit(v: torch.Tensor) -> torch.Tensor:
	norm = v.norm()
	return v / norm if norm > 1e-8 else v