obliteratus

Runtime error

App Files Files Community

obliteratus / tests /test_logit_lens.py

pliny-the-prompter

Upload 127 files

45113e6 verified 2 months ago

raw

history blame contribute delete

6.02 kB

	"""Tests for logit lens refusal direction analysis."""

	from __future__ import annotations

	from unittest.mock import MagicMock

	import torch

	from obliteratus.analysis.logit_lens import (
	RefusalLogitLens,
	LogitLensResult,
	MultiLayerLogitLensResult,
	REFUSAL_TOKENS,
	COMPLIANCE_TOKENS,
	)


	def _make_mock_model(hidden_dim=32, vocab_size=100):
	"""Create a mock model with LM head and layer norm."""
	model = MagicMock()

	# LM head weight (vocab_size, hidden_dim)
	lm_head = MagicMock()
	lm_head.weight = MagicMock()
	lm_head.weight.data = torch.randn(vocab_size, hidden_dim)
	model.lm_head = lm_head

	# Final LayerNorm
	ln_f = MagicMock()
	ln_f.weight = MagicMock()
	ln_f.weight.data = torch.ones(hidden_dim)
	ln_f.bias = MagicMock()
	ln_f.bias.data = torch.zeros(hidden_dim)
	model.transformer = MagicMock()
	model.transformer.ln_f = ln_f

	return model


	def _make_mock_tokenizer(vocab_size=100):
	"""Create a mock tokenizer."""
	tokenizer = MagicMock()

	def mock_decode(ids):
	if isinstance(ids, list) and len(ids) == 1:
	return f"tok_{ids[0]}"
	return f"tok_{ids}"

	def mock_encode(text, add_special_tokens=False):
	# Return a deterministic token ID based on the text
	return [hash(text) % vocab_size]

	tokenizer.decode = mock_decode
	tokenizer.encode = mock_encode
	return tokenizer


	class TestRefusalLogitLens:
	    """Exercise ``RefusalLogitLens`` against a mocked model and tokenizer."""

	    @staticmethod
	    def _mocks():
	        """Return a fresh ``(model, tokenizer)`` pair of mocks."""
	        return _make_mock_model(), _make_mock_tokenizer()

	    def test_basic_analysis(self):
	        """A single-direction analysis returns a LogitLensResult with the expected fields."""
	        mock_model, mock_tok = self._mocks()
	        vec = torch.randn(32)

	        analyzer = RefusalLogitLens(top_k=10)
	        outcome = analyzer.analyze_direction(vec, mock_model, mock_tok, layer_idx=5)

	        assert isinstance(outcome, LogitLensResult)
	        assert outcome.layer_idx == 5
	        # Both ranked lists are sized by top_k.
	        for ranked in (outcome.top_promoted, outcome.top_suppressed):
	            assert len(ranked) == 10
	        # Scalar summaries must be plain Python floats.
	        for field in (
	            "refusal_specificity",
	            "logit_effect_entropy",
	            "refusal_compliance_gap",
	        ):
	            assert isinstance(getattr(outcome, field), float)

	    def test_promoted_suppressed_ordering(self):
	        """The best promoted value must exceed the worst suppressed value."""
	        mock_model, mock_tok = self._mocks()
	        vec = torch.randn(32)

	        analyzer = RefusalLogitLens(top_k=5)
	        outcome = analyzer.analyze_direction(vec, mock_model, mock_tok)

	        # Entries are (token, value) pairs; only the values are compared.
	        best_boost = max(score for _tok, score in outcome.top_promoted)
	        worst_drop = min(score for _tok, score in outcome.top_suppressed)
	        assert best_boost > worst_drop

	    def test_multi_layer_analysis(self):
	        """Analyzing three layers yields three per-layer entries and valid layer picks."""
	        mock_model, mock_tok = self._mocks()
	        layer_ids = (0, 1, 2)
	        per_layer_vecs = {idx: torch.randn(32) for idx in layer_ids}

	        analyzer = RefusalLogitLens(top_k=5)
	        outcome = analyzer.analyze_all_layers(per_layer_vecs, mock_model, mock_tok)

	        assert isinstance(outcome, MultiLayerLogitLensResult)
	        assert len(outcome.per_layer) == 3
	        assert outcome.strongest_refusal_layer in layer_ids
	        assert outcome.peak_specificity_layer in layer_ids

	    def test_strong_layers_filter(self):
	        """Passing ``strong_layers`` restricts the analysis to those layers."""
	        mock_model, mock_tok = self._mocks()
	        per_layer_vecs = {idx: torch.randn(32) for idx in range(10)}

	        analyzer = RefusalLogitLens(top_k=5)
	        outcome = analyzer.analyze_all_layers(
	            per_layer_vecs, mock_model, mock_tok, strong_layers=[2, 5]
	        )
	        assert {2, 5} == set(outcome.per_layer.keys())

	    def test_handles_unnormalized_direction(self):
	        """A direction with large magnitude still yields ``top_k`` promoted entries."""
	        mock_model, mock_tok = self._mocks()
	        # Scale well away from unit norm.
	        big_vec = torch.randn(32) * 100.0

	        analyzer = RefusalLogitLens(top_k=5)
	        outcome = analyzer.analyze_direction(big_vec, mock_model, mock_tok)
	        # Output size is still governed by top_k.
	        assert len(outcome.top_promoted) == 5

	    def test_format_report(self):
	        """The formatted report mentions the title and the first layer."""
	        mock_model, mock_tok = self._mocks()
	        per_layer_vecs = {idx: torch.randn(32) for idx in (0, 1)}

	        analyzer = RefusalLogitLens(top_k=5)
	        outcome = analyzer.analyze_all_layers(per_layer_vecs, mock_model, mock_tok)
	        text = RefusalLogitLens.format_report(outcome)
	        for expected in ("Logit Lens", "Layer 0:"):
	            assert expected in text

	    def test_empty_directions(self):
	        """An empty direction mapping produces an empty per-layer result."""
	        mock_model, mock_tok = self._mocks()

	        analyzer = RefusalLogitLens(top_k=5)
	        outcome = analyzer.analyze_all_layers({}, mock_model, mock_tok)
	        assert len(outcome.per_layer) == 0

	    def test_token_lists_nonempty(self):
	        """Both reference token lists must contain more than ten entries."""
	        for token_list in (REFUSAL_TOKENS, COMPLIANCE_TOKENS):
	            assert len(token_list) > 10

	    def test_entropy_nonnegative(self):
	        """The reported logit-effect entropy is never negative."""
	        mock_model, mock_tok = self._mocks()
	        vec = torch.randn(32)

	        analyzer = RefusalLogitLens(top_k=5)
	        outcome = analyzer.analyze_direction(vec, mock_model, mock_tok)
	        assert outcome.logit_effect_entropy >= 0

	    def test_2d_direction_input(self):
	        """A direction of shape ``(1, 32)`` is accepted like a 1-D one."""
	        mock_model, mock_tok = self._mocks()
	        row_vec = torch.randn(1, 32)

	        analyzer = RefusalLogitLens(top_k=5)
	        outcome = analyzer.analyze_direction(row_vec, mock_model, mock_tok)
	        assert len(outcome.top_promoted) == 5