# obliteratus/tests/test_defense_robustness.py
# pliny-the-prompter's picture
# Upload 127 files
# 45113e6 verified
"""Tests for defense robustness evaluation framework."""
from __future__ import annotations
from unittest.mock import MagicMock
import torch
from obliteratus.analysis.defense_robustness import (
DefenseProfile,
DefenseRobustnessEvaluator,
EntanglementMap,
SelfRepairResult,
)
def _make_mock_pipeline(n_layers=6, hidden_dim=16, n_prompts=5):
    """Build a ``MagicMock`` pipeline populated with synthetic refusal data.

    The global torch RNG is seeded with 42 on every call, so repeated calls
    with the same arguments produce identical tensors.

    Args:
        n_layers: Number of layers; each per-layer dict is keyed
            ``0 .. n_layers - 1``.
        hidden_dim: Length of every direction and activation vector.
        n_prompts: Number of per-prompt activation vectors stored per layer.

    Returns:
        A ``MagicMock`` whose ``model_name``, ``refusal_directions``,
        ``_harmful_means``, ``_harmless_means``, ``_harmful_acts`` and
        ``_harmless_acts`` attributes are set.
    """

    def _unit_vector():
        # Random vector scaled to unit L2 norm.
        raw = torch.randn(hidden_dim)
        return raw / raw.norm()

    def _jitter():
        # Small per-prompt noise added on top of the layer-level vector.
        return torch.randn(hidden_dim) * 0.1

    torch.manual_seed(42)

    # NOTE: the sequence of torch.randn draws fixes the generated values:
    # first one draw per layer for the directions, then per layer one
    # baseline draw, n_prompts harmful draws and n_prompts harmless draws.
    unit_dirs = {layer: _unit_vector() for layer in range(n_layers)}

    mean_harmful, mean_harmless = {}, {}
    acts_harmful, acts_harmless = {}, {}
    for layer in range(n_layers):
        baseline = torch.randn(hidden_dim)

        # Layers 2..4 carry the strong planted refusal signal.
        if 2 <= layer <= 4:
            gain = 3.0
        else:
            gain = 0.5
        shifted = baseline + gain * unit_dirs[layer]

        mean_harmless[layer] = baseline.unsqueeze(0)
        mean_harmful[layer] = shifted.unsqueeze(0)
        acts_harmful[layer] = [shifted + _jitter() for _ in range(n_prompts)]
        acts_harmless[layer] = [baseline + _jitter() for _ in range(n_prompts)]

    mock = MagicMock()
    mock.model_name = "test-model"
    mock.refusal_directions = unit_dirs
    mock._harmful_means = mean_harmful
    mock._harmless_means = mean_harmless
    mock._harmful_acts = acts_harmful
    mock._harmless_acts = acts_harmless
    return mock
class TestDefenseProfile:
    """Checks on the profile returned by ``profile_defense``."""

    @staticmethod
    def _default_profile():
        """Profile the default mock pipeline and return the result."""
        return DefenseRobustnessEvaluator(_make_mock_pipeline()).profile_defense()

    def test_profile_generates(self):
        """The profile has the expected type, name and positive strengths."""
        result = self._default_profile()
        assert isinstance(result, DefenseProfile)
        assert result.model_name == "test-model"
        assert result.refusal_layer_spread > 0
        assert result.mean_refusal_strength > 0
        assert result.max_refusal_strength >= result.mean_refusal_strength
        allowed_levels = ("low", "medium", "high", "very_high")
        assert result.estimated_robustness in allowed_levels

    def test_alignment_type_estimate(self):
        """A populated pipeline must not yield an "unknown" alignment type."""
        result = self._default_profile()
        assert result.alignment_type_estimate != "unknown"

    def test_empty_pipeline(self):
        """With no refusal directions the robustness is reported as unknown."""
        bare = MagicMock()
        bare.model_name = "empty"
        bare.refusal_directions = {}
        result = DefenseRobustnessEvaluator(bare).profile_defense()
        assert result.estimated_robustness == "unknown"

    def test_concentration_bounded(self):
        """Refusal concentration stays within [0, 1]."""
        result = self._default_profile()
        # Gini coefficient should be between 0 and 1
        assert 0 <= result.refusal_concentration <= 1.0

    def test_self_repair_bounded(self):
        """Self-repair estimate stays within [0, 1]."""
        result = self._default_profile()
        assert 0 <= result.self_repair_estimate <= 1.0

    def test_format_report(self):
        """The formatted report mentions the title and the model name."""
        text = DefenseRobustnessEvaluator.format_defense_profile(
            self._default_profile()
        )
        assert "Defense Robustness" in text
        assert "test-model" in text
class TestSelfRepair:
    """Checks on ``measure_self_repair`` and its report formatter."""

    @staticmethod
    def _repair_at(layer, **pipeline_kwargs):
        """Measure self-repair at ``layer`` on a freshly built mock pipeline."""
        mock_pipeline = _make_mock_pipeline(**pipeline_kwargs)
        return DefenseRobustnessEvaluator(mock_pipeline).measure_self_repair(
            layer_idx=layer
        )

    def test_self_repair_measurement(self):
        """The result echoes the layer and reports bounded, non-empty data."""
        outcome = self._repair_at(3)
        assert isinstance(outcome, SelfRepairResult)
        assert outcome.layer_idx == 3
        assert outcome.original_refusal_strength >= 0
        assert 0 <= outcome.repair_ratio <= 1.0
        assert len(outcome.compensating_layers) > 0
        # The ablated layer shouldn't list itself as a compensator.
        assert 3 not in outcome.compensating_layers

    def test_repair_ratio_high_for_distributed(self):
        """Distributed refusal should have high repair ratio."""
        outcome = self._repair_at(3, n_layers=10)
        # With distributed signal, removing one layer leaves much compensation
        assert outcome.repair_ratio > 0.5

    def test_format_self_repair(self):
        """The formatted report names the section and the layer."""
        text = DefenseRobustnessEvaluator.format_self_repair(self._repair_at(2))
        assert "Self-Repair" in text
        assert "Layer 2" in text
class TestEntanglement:
    """Checks on ``map_entanglement`` and its report formatter."""

    @staticmethod
    def _default_map():
        """Map entanglement on the default mock pipeline."""
        return DefenseRobustnessEvaluator(_make_mock_pipeline()).map_entanglement()

    def test_entanglement_map(self):
        """The map has the expected type, bounded score and non-empty lists."""
        mapping = self._default_map()
        assert isinstance(mapping, EntanglementMap)
        assert len(mapping.layer_entanglement) > 0
        assert 0 <= mapping.overall_entanglement <= 1.0
        assert len(mapping.most_entangled_layers) > 0
        assert len(mapping.least_entangled_layers) > 0

    def test_capability_sensitivity_keys(self):
        """Capability sensitivity reports exactly the five expected keys."""
        mapping = self._default_map()
        wanted = {
            "factual_knowledge",
            "reasoning",
            "language_fluency",
            "instruction_following",
            "math",
        }
        assert set(mapping.capability_sensitivity.keys()) == wanted

    def test_math_most_sensitive(self):
        """Math should be estimated as the most sensitive capability."""
        mapping = self._default_map()
        # Only meaningful when some entanglement was measured at all.
        if not mapping.overall_entanglement > 0:
            return
        sensitivity = mapping.capability_sensitivity
        assert sensitivity["math"] >= sensitivity["language_fluency"]

    def test_format_entanglement(self):
        """The formatted report names the section and the math capability."""
        text = DefenseRobustnessEvaluator.format_entanglement(self._default_map())
        assert "Entanglement" in text
        assert "math" in text