obliteratus

Sleeping

App Files Files Community

obliteratus / tests /test_advanced_metrics.py

pliny-the-prompter

Upload 127 files

45113e6 verified about 2 months ago

raw

history blame contribute delete

12.6 kB

	"""Tests for advanced evaluation metrics."""

	from __future__ import annotations


	import pytest
	import torch

	from obliteratus.evaluation.advanced_metrics import (
	REFUSAL_PREFIXES,
	REFUSAL_SUBSTRINGS,
	_is_refusal,
	refusal_rate,
	token_kl_divergence,
	first_token_kl_divergence,
	effective_rank,
	effective_rank_change,
	activation_cosine_similarity,
	linear_cka,
	refusal_projection_magnitude,
	format_eval_report,
	AbliterationEvalResult,
	)


	# ---------------------------------------------------------------------------
	# Refusal Rate
	# ---------------------------------------------------------------------------

	class TestRefusalRate:
	    """Check ``refusal_rate`` and ``_is_refusal`` in prefix, substring and combined modes."""

	    def test_all_refusals(self):
	        """Three refusal-style openings are expected to give a prefix-mode rate of 1.0."""
	        refusing = [
	            "I'm sorry, but I cannot help with that.",
	            "I apologize, but I'm unable to assist.",
	            "As an AI, I cannot provide that information.",
	        ]
	        observed = refusal_rate(refusing, mode="prefix")
	        assert observed == 1.0

	    def test_no_refusals(self):
	        """Compliant answers are expected to give a prefix-mode rate of 0.0."""
	        compliant = [
	            "Sure, here's how to do that.",
	            "Here's a step-by-step guide.",
	            "The answer is 42.",
	        ]
	        observed = refusal_rate(compliant, mode="prefix")
	        assert observed == 0.0

	    def test_mixed_responses(self):
	        """Two refusals among four responses are expected to give a rate of 0.5."""
	        batch = [
	            "I'm sorry, I can't do that.",
	            "Sure, here you go.",
	            "I cannot provide that.",
	            "Here's what you need.",
	        ]
	        # 2 out of 4 responses are refusals.
	        assert refusal_rate(batch, mode="prefix") == 0.5

	    def test_substring_mode(self):
	        """A refusal phrase that does not open the response is expected to count in substring mode."""
	        batch = [
	            "Well, actually I cannot help with that request.",
	            "Let me explain how to do this.",
	        ]
	        observed = refusal_rate(batch, mode="substring")
	        assert observed == 0.5

	    def test_combined_mode(self):
	        """Combined mode is expected to count both prefix and substring hits."""
	        batch = [
	            "I'm sorry, but no.",  # caught by prefix
	            "That said, i won't do that for you.",  # caught by substring
	            "Here's the answer.",  # not a refusal
	        ]
	        expected = 2.0 / 3.0
	        observed = refusal_rate(batch, mode="combined")
	        assert abs(observed - expected) < 1e-6

	    def test_empty_responses(self):
	        """An empty response list is expected to give a rate of 0.0."""
	        no_responses = []
	        assert refusal_rate(no_responses, mode="prefix") == 0.0

	    def test_empty_string_not_refusal(self):
	        """The empty string must not be classified as a refusal."""
	        verdict = _is_refusal("", mode="combined")
	        assert not verdict

	    def test_is_refusal_prefix(self):
	        """Prefix mode flags refusal openings and leaves a plain answer alone."""
	        flagged = (
	            "I'm sorry, I can't do that.",
	            "As an AI language model, I cannot",
	        )
	        for text in flagged:
	            assert _is_refusal(text, mode="prefix")
	        assert not _is_refusal("The answer is 42.", mode="prefix")

	    def test_is_refusal_substring(self):
	        """Substring mode flags an embedded refusal phrase but not a factual sentence."""
	        embedded = "I must say, i cannot assist with that."
	        factual = "The capital of France is Paris."
	        assert _is_refusal(embedded, mode="substring")
	        assert not _is_refusal(factual, mode="substring")

	    def test_refusal_prefixes_nonempty(self):
	        """The prefix list must hold more than 20 entries."""
	        prefix_count = len(REFUSAL_PREFIXES)
	        assert prefix_count > 20

	    def test_refusal_substrings_nonempty(self):
	        """The substring list must hold more than 10 entries."""
	        substring_count = len(REFUSAL_SUBSTRINGS)
	        assert substring_count > 10


	# ---------------------------------------------------------------------------
	# KL Divergence
	# ---------------------------------------------------------------------------

	class TestKLDivergence:
	    """Check ``token_kl_divergence`` and ``first_token_kl_divergence`` on random logits."""

	    def test_identical_distributions(self):
	        """Comparing logits with themselves must give a KL of (numerically) zero."""
	        shared = torch.randn(2, 10, 100)
	        divergence = token_kl_divergence(shared, shared)
	        assert abs(divergence) < 1e-5

	    def test_different_distributions(self):
	        """Two independent random logit tensors must give a strictly positive KL."""
	        torch.manual_seed(42)
	        reference = torch.randn(2, 10, 100)
	        candidate = torch.randn(2, 10, 100)
	        divergence = token_kl_divergence(reference, candidate)
	        assert divergence > 0

	    def test_kl_nonnegative(self):
	        """KL must never be meaningfully negative across several random draws."""
	        torch.manual_seed(42)
	        trials = 5
	        for _ in range(trials):
	            reference = torch.randn(1, 5, 50)
	            candidate = torch.randn(1, 5, 50)
	            divergence = token_kl_divergence(reference, candidate)
	            # Allow small numerical errors below zero.
	            assert divergence >= -1e-6

	    def test_first_token_kl_identical(self):
	        """First-token KL of logits against themselves must be (numerically) zero."""
	        shared = torch.randn(4, 20, 100)
	        divergence = first_token_kl_divergence(shared, shared)
	        assert abs(divergence) < 1e-5

	    def test_first_token_kl_different(self):
	        """First-token KL of two independent random logit tensors must be positive."""
	        torch.manual_seed(42)
	        reference = torch.randn(4, 20, 100)
	        candidate = torch.randn(4, 20, 100)
	        divergence = first_token_kl_divergence(reference, candidate)
	        assert divergence > 0

	    def test_temperature_effect(self):
	        """A higher temperature must yield a smaller KL than temperature 1.0."""
	        torch.manual_seed(42)
	        reference = torch.randn(2, 5, 50)
	        candidate = torch.randn(2, 5, 50)
	        sharp_kl = token_kl_divergence(reference, candidate, temperature=1.0)
	        smooth_kl = token_kl_divergence(reference, candidate, temperature=5.0)
	        assert smooth_kl < sharp_kl


	# ---------------------------------------------------------------------------
	# Effective Rank
	# ---------------------------------------------------------------------------

	class TestEffectiveRank:
	    """Check ``effective_rank`` and ``effective_rank_change`` on small matrices."""

	    def test_rank_one_matrix(self):
	        """An outer product (rank 1) must report an effective rank below 1.5."""
	        column = torch.randn(8, 1)
	        row = torch.randn(1, 4)
	        outer = column @ row  # rank-1 by construction
	        assert effective_rank(outer) < 1.5

	    def test_identity_matrix(self):
	        """The n x n identity must report an effective rank within 0.1 of n."""
	        size = 8
	        identity = torch.eye(size)
	        measured = effective_rank(identity)
	        assert abs(measured - size) < 0.1

	    def test_random_full_rank(self):
	        """A seeded 16 x 16 Gaussian matrix must report an effective rank above 10."""
	        torch.manual_seed(42)
	        gaussian = torch.randn(16, 16)
	        measured = effective_rank(gaussian)
	        # Should be close to 16.
	        assert measured > 10

	    def test_zero_matrix(self):
	        """The all-zero matrix must report an effective rank of exactly 0.0."""
	        zeros = torch.zeros(4, 4)
	        assert effective_rank(zeros) == 0.0

	    def test_effective_rank_change(self):
	        """The before/after comparison must expose all four keys and not grow the rank."""
	        torch.manual_seed(42)
	        W_before = torch.randn(8, 8)
	        # Simulate abliteration: project out one unit direction, which
	        # reduces the rank slightly.
	        direction = torch.randn(8, 1)
	        direction = direction / direction.norm()
	        component = W_before @ direction
	        W_after = W_before - component @ direction.T

	        summary = effective_rank_change(W_before, W_after)
	        for key in ("rank_before", "rank_after", "rank_delta", "rank_ratio"):
	            assert key in summary
	        assert summary["rank_after"] <= summary["rank_before"] + 0.1

	    def test_rejects_non_2d(self):
	        """A 3-D tensor must be rejected with ``ValueError``."""
	        cube = torch.randn(4, 4, 4)
	        with pytest.raises(ValueError):
	            effective_rank(cube)


	# ---------------------------------------------------------------------------
	# Activation Cosine Similarity
	# ---------------------------------------------------------------------------

	class TestActivationCosineSimilarity:
	    """Check ``activation_cosine_similarity`` on identical, orthogonal and opposite inputs."""

	    def test_identical_activations(self):
	        """Activations compared with themselves must give a similarity of 1."""
	        activations = torch.randn(10, 32)
	        similarity = activation_cosine_similarity(activations, activations)
	        assert abs(similarity - 1.0) < 1e-5

	    def test_orthogonal_activations(self):
	        """Two orthogonal basis vectors must give a similarity near 0."""
	        along_x = torch.tensor([[1.0, 0.0, 0.0]])
	        along_y = torch.tensor([[0.0, 1.0, 0.0]])
	        similarity = activation_cosine_similarity(along_x, along_y)
	        assert abs(similarity) < 1e-5

	    def test_opposite_activations(self):
	        """Activations compared with their negation must give a similarity of -1."""
	        activations = torch.randn(5, 16)
	        similarity = activation_cosine_similarity(activations, -activations)
	        assert abs(similarity + 1.0) < 1e-5

	    def test_handles_3d(self):
	        """3-D inputs must be accepted and yield a value inside [-1, 1]."""
	        first = torch.randn(2, 5, 16)
	        second = torch.randn(2, 5, 16)
	        similarity = activation_cosine_similarity(first, second)
	        assert -1.0 <= similarity <= 1.0


	# ---------------------------------------------------------------------------
	# Linear CKA
	# ---------------------------------------------------------------------------

	class TestLinearCKA:
	    """Check ``linear_cka`` for identity, scale invariance, range and input shapes."""

	    def test_identical_representations(self):
	        """A representation compared with itself must give a CKA of 1.0."""
	        features = torch.randn(20, 16)
	        score = linear_cka(features, features)
	        assert abs(score - 1.0) < 1e-4

	    def test_scaled_representations(self):
	        """Multiplying one side by a constant must leave the CKA at 1.0."""
	        features = torch.randn(20, 16)
	        scaled = features * 5.0
	        score = linear_cka(features, scaled)
	        assert abs(score - 1.0) < 1e-4

	    def test_random_representations(self):
	        """Two independent seeded Gaussian representations must give a CKA below 0.3."""
	        torch.manual_seed(42)
	        left = torch.randn(100, 16)
	        right = torch.randn(100, 16)
	        score = linear_cka(left, right)
	        # Random representations should be near 0.
	        assert score < 0.3

	    def test_cka_bounded(self):
	        """CKA must stay inside [0, 1] (with slack) over several random draws."""
	        torch.manual_seed(42)
	        trials = 5
	        for _ in range(trials):
	            left = torch.randn(20, 8)
	            right = torch.randn(20, 8)
	            score = linear_cka(left, right)
	            # Small tolerance for numerics on both ends.
	            assert -0.01 <= score <= 1.01

	    def test_different_dimensions(self):
	        """Inputs with different hidden widths (16 vs 32) must still be accepted."""
	        narrow = torch.randn(20, 16)
	        wide = torch.randn(20, 32)
	        score = linear_cka(narrow, wide)
	        assert -0.01 <= score <= 1.01

	    def test_handles_3d(self):
	        """3-D inputs must be accepted and yield a value inside the tolerated range."""
	        left = torch.randn(2, 10, 16)
	        right = torch.randn(2, 10, 16)
	        score = linear_cka(left, right)
	        assert -0.01 <= score <= 1.01


	# ---------------------------------------------------------------------------
	# Refusal Direction Projection Magnitude
	# ---------------------------------------------------------------------------

	class TestRefusalProjection:
	    """Check ``refusal_projection_magnitude`` on hand-built activations."""

	    def test_aligned_activations(self):
	        """Activations aligned with direction should have high projection.

	        The rows project onto the first axis with magnitudes 5, 3 and 4, so
	        both the mean and the absolute mean are expected to be 4.
	        """
	        d = torch.tensor([1.0, 0.0, 0.0])
	        acts = torch.tensor([
	            [5.0, 0.0, 0.0],
	            [3.0, 0.0, 0.0],
	            [4.0, 0.0, 0.0],
	        ])
	        result = refusal_projection_magnitude(acts, d)
	        # Compare with a tolerance, like the orthogonal case below: the
	        # values come out of floating-point reductions, so exact equality
	        # would couple the test to the implementation's dtype and to any
	        # epsilon it uses when normalising the direction.
	        assert abs(result["mean"] - 4.0) < 1e-5
	        assert abs(result["abs_mean"] - 4.0) < 1e-5

	    def test_orthogonal_activations(self):
	        """Orthogonal activations should have zero projection."""
	        d = torch.tensor([1.0, 0.0, 0.0])
	        acts = torch.tensor([
	            [0.0, 5.0, 0.0],
	            [0.0, 0.0, 3.0],
	        ])
	        result = refusal_projection_magnitude(acts, d)
	        assert abs(result["mean"]) < 1e-5
	        assert abs(result["abs_mean"]) < 1e-5

	    def test_result_keys(self):
	        """Should return all expected keys."""
	        d = torch.randn(8)
	        acts = torch.randn(5, 8)
	        result = refusal_projection_magnitude(acts, d)
	        assert set(result.keys()) == {"mean", "std", "max", "min", "abs_mean"}


	# ---------------------------------------------------------------------------
	# Eval Report Formatting
	# ---------------------------------------------------------------------------

	class TestEvalReport:
	    """Check the text produced by ``format_eval_report`` for ``AbliterationEvalResult``."""

	    def test_format_report(self):
	        """A low-KL result must render the refusal percentage, perplexity and an 'excellent' verdict."""
	        metrics = AbliterationEvalResult(
	            refusal_rate_harmful=0.1,
	            refusal_rate_harmless=0.02,
	            kl_divergence=0.15,
	            perplexity=12.5,
	            coherence_score=0.8,
	            mean_activation_cosine=0.95,
	            mean_cka=0.92,
	        )
	        rendered = format_eval_report(metrics)
	        assert "10.0%" in rendered
	        assert "12.50" in rendered
	        # KL < 0.2
	        assert "excellent" in rendered

	    def test_format_report_high_kl(self):
	        """A KL of 1.5 must be described as 'significant damage'."""
	        metrics = AbliterationEvalResult(
	            refusal_rate_harmful=0.0,
	            refusal_rate_harmless=0.0,
	            kl_divergence=1.5,
	            perplexity=50.0,
	            coherence_score=0.4,
	            mean_activation_cosine=None,
	            mean_cka=None,
	        )
	        rendered = format_eval_report(metrics)
	        assert "significant damage" in rendered

	    def test_format_report_no_kl(self):
	        """With no KL value, the report must keep the refusal percentage and omit any 'KL' text."""
	        metrics = AbliterationEvalResult(
	            refusal_rate_harmful=0.5,
	            refusal_rate_harmless=0.1,
	            kl_divergence=None,
	            perplexity=20.0,
	            coherence_score=1.0,
	            mean_activation_cosine=None,
	            mean_cka=None,
	        )
	        rendered = format_eval_report(metrics)
	        assert "50.0%" in rendered
	        assert "KL" not in rendered