Spaces:

cdpearlman
/

LLMVis

Sleeping

App Files Files Community

LLMVis / tests /test_ablation_hook_placement.py

cdpearlman

Fixed ablation experiments, drilled down on which pre-loaded ones to run

3d71d04 2 months ago

raw

history blame contribute delete

4.12 kB

	"""
	Tests for scientifically accurate head ablation via pre-projection hooking.

	Verifies that ablation hooks are placed on the INPUT to c_proj (pre-projection),
	where per-head dimensions are still separable, rather than on the OUTPUT of the
	attention module (post-projection), where heads are mixed.
	"""

	import sys
	import os
	import torch
	import torch.nn as nn
	import pytest
	from transformers import AutoModelForCausalLM, AutoTokenizer

	sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

	from utils.model_patterns import _find_output_proj_submodule


	@pytest.fixture(scope="module")
	def gpt2_model_and_tokenizer():
	"""Load GPT-2 once for all tests in this module."""
	try:
	model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.float32)
	model.eval()
	tokenizer = AutoTokenizer.from_pretrained("gpt2")
	return model, tokenizer
	except Exception as e:
	pytest.skip(f"Could not load GPT-2: {e}")


	class TestFindOutputProjSubmodule:
	def test_find_output_proj_gpt2(self, gpt2_model_and_tokenizer):
	"""GPT-2 attention modules should have c_proj as the output projection."""
	model, _ = gpt2_model_and_tokenizer
	attn_module = model.transformer.h[0].attn
	name, submodule = _find_output_proj_submodule(attn_module)
	assert name == "c_proj"
	assert submodule is attn_module.c_proj

	def test_find_output_proj_unknown_raises(self):
	"""A plain nn.Module with no recognized projection children should raise ValueError."""
	plain_module = nn.Module()
	plain_module.add_module("some_layer", nn.Linear(10, 10))
	with pytest.raises(ValueError, match="No output projection found"):
	_find_output_proj_submodule(plain_module)


	class TestPreHookPlacement:
	def test_pre_hook_zeros_correct_dims(self, gpt2_model_and_tokenizer):
	"""Pre-hook on c_proj receives input where per-head dims are separable.
	Zeroing head 0 dims [0:64] should leave [64:768] untouched."""
	model, tokenizer = gpt2_model_and_tokenizer
	captured_input = {}

	def capture_pre_hook(module, args):
	captured_input['x'] = args[0].clone()
	return None # Don't modify

	hook = model.transformer.h[0].attn.c_proj.register_forward_pre_hook(capture_pre_hook)
	try:
	inputs = tokenizer("The cat sat on the mat", return_tensors="pt")
	with torch.no_grad():
	model(**inputs, use_cache=False)
	finally:
	hook.remove()

	x = captured_input['x']
	# Shape should be [batch, seq, 768]
	assert x.shape[-1] == 768
	# Verify per-head structure: zeroing [0:64] leaves [64:768] intact
	x_modified = x.clone()
	x_modified[:, :, 0:64] = 0.0
	# The rest should be exactly equal
	assert torch.equal(x_modified[:, :, 64:], x[:, :, 64:])
	# The zeroed part should actually be zero
	assert torch.all(x_modified[:, :, 0:64] == 0.0)

	def test_ablation_changes_output(self, gpt2_model_and_tokenizer):
	"""Ablating head 0 at layer 0 via pre-hook on c_proj should change logits."""
	model, tokenizer = gpt2_model_and_tokenizer
	prompt = "The quick brown fox jumps over the"
	inputs = tokenizer(prompt, return_tensors="pt")

	# Baseline
	with torch.no_grad():
	baseline_logits = model(**inputs, use_cache=False).logits

	# Ablated: pre-hook zeros head 0 on layer 0's c_proj
	def ablation_pre_hook(module, args):
	x = args[0].clone()
	x[:, :, 0:64] = 0.0
	return (x,)

	hook = model.transformer.h[0].attn.c_proj.register_forward_pre_hook(ablation_pre_hook)
	try:
	with torch.no_grad():
	ablated_logits = model(**inputs, use_cache=False).logits
	finally:
	hook.remove()

	# Logits must differ
	assert not torch.allclose(baseline_logits, ablated_logits, atol=1e-6), \
	"Ablation via pre-hook on c_proj did not change logits"