# File: obliteratus/tests/test_informed_pipeline.py
# Uploaded by pliny-the-prompter ("Upload 127 files", commit 45113e6, verified).
"""Tests for the Analysis-Informed Abliteration Pipeline."""
from __future__ import annotations
import pytest
import torch
from obliteratus.informed_pipeline import (
AnalysisInsights,
InformedAbliterationPipeline,
InformedPipelineReport,
INFORMED_METHOD,
)
from obliteratus.abliterate import METHODS
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def insights():
    """Provide an ``AnalysisInsights`` constructed with no arguments."""
    default_insights = AnalysisInsights()
    return default_insights
@pytest.fixture
def pipeline(tmp_path):
    """Provide an ``InformedAbliterationPipeline`` with no model loaded.

    The output directory is placed under pytest's ``tmp_path``.
    """
    target_dir = tmp_path / "test_informed"
    return InformedAbliterationPipeline(
        model_name="test-model",
        output_dir=str(target_dir),
    )
# ---------------------------------------------------------------------------
# AnalysisInsights
# ---------------------------------------------------------------------------
class TestAnalysisInsights:
    """Checks on the field values of a default-constructed ``AnalysisInsights``."""

    def test_default_values(self, insights):
        """Detection, cone, clustering and recommendation fields hold defaults."""
        # Boolean flags are checked by identity, not mere falsiness.
        for flag_name in ("cone_is_polyhedral", "use_sparse_surgery"):
            assert getattr(insights, flag_name) is False

        expected_defaults = {
            "detected_alignment_method": "unknown",
            "alignment_confidence": 0.0,
            "cone_dimensionality": 1.0,
            "mean_pairwise_cosine": 1.0,
            "per_category_directions": {},
            "direction_specificity": {},
            "cluster_count": 0,
            "direction_persistence": 0.0,
            "recommended_n_directions": 4,
            "recommended_regularization": 0.0,
            "recommended_refinement_passes": 2,
            "recommended_layers": [],
            "skip_layers": [],
        }
        for field_name, expected in expected_defaults.items():
            assert getattr(insights, field_name) == expected

    def test_default_robustness(self, insights):
        """Robustness and entanglement fields hold their defaults."""
        expected_defaults = {
            "estimated_robustness": "unknown",
            "self_repair_estimate": 0.0,
            "entanglement_score": 0.0,
            "entangled_layers": [],
            "clean_layers": [],
        }
        for field_name, expected in expected_defaults.items():
            assert getattr(insights, field_name) == expected
class TestInformedPipelineReport:
    """Checks on an ``InformedPipelineReport`` built from default insights."""

    def test_default_report(self):
        """Durations, pass count and refusal rate are zero; no stages listed."""
        report = InformedPipelineReport(insights=AnalysisInsights())

        expected_defaults = {
            "analysis_duration": 0.0,
            "total_duration": 0.0,
            "ouroboros_passes": 0,
            "final_refusal_rate": 0.0,
            "stages": [],
        }
        for field_name, expected in expected_defaults.items():
            assert getattr(report, field_name) == expected
# ---------------------------------------------------------------------------
# Method preset
# ---------------------------------------------------------------------------
class TestInformedMethod:
    """Checks on the ``"informed"`` method preset."""

    def test_informed_method_in_abliterate_methods(self):
        """``METHODS`` has an ``"informed"`` entry with these flags set to True."""
        assert "informed" in METHODS
        preset = METHODS["informed"]
        enabled_flags = (
            "norm_preserve",
            "project_biases",
            "use_chat_template",
            "use_whitened_svd",
            "true_iterative_refinement",
        )
        for flag in enabled_flags:
            assert preset[flag] is True

    def test_informed_method_standalone(self):
        """``INFORMED_METHOD`` carries the expected label and settings."""
        label = INFORMED_METHOD["label"]
        direction_count = INFORMED_METHOD["n_directions"]
        assert label == "Informed (Analysis-Guided)"
        assert direction_count == 4
        assert INFORMED_METHOD["norm_preserve"] is True
# ---------------------------------------------------------------------------
# Pipeline initialization
# ---------------------------------------------------------------------------
class TestPipelineInit:
    """Checks on attributes set by ``InformedAbliterationPipeline.__init__``."""

    def test_method_set_to_informed(self, pipeline):
        """The pipeline reports ``"informed"`` as its method."""
        assert pipeline.method == "informed"

    def test_default_analysis_flags(self, pipeline):
        """All five analysis-stage flags are True by default."""
        stage_flags = (
            "_run_cone",
            "_run_alignment",
            "_run_cross_layer",
            "_run_sparse",
            "_run_defense",
        )
        for attr in stage_flags:
            assert getattr(pipeline, attr) is True

    def test_ouroboros_defaults(self, pipeline):
        """Default Ouroboros threshold is 0.5 with at most 3 passes."""
        threshold = pipeline._ouroboros_threshold
        max_passes = pipeline._max_ouroboros_passes
        assert threshold == 0.5
        assert max_passes == 3

    def test_entanglement_gate(self, pipeline):
        """Default entanglement gate is 0.8."""
        assert pipeline._entanglement_gate == 0.8

    def test_inherits_base_pipeline(self, pipeline):
        """These base-pipeline options are all True on the informed pipeline."""
        inherited_flags = (
            "norm_preserve",
            "project_biases",
            "use_chat_template",
            "use_whitened_svd",
            "true_iterative_refinement",
        )
        for attr in inherited_flags:
            assert getattr(pipeline, attr) is True

    def test_custom_flags(self):
        """Constructor keyword arguments are stored on the private attributes."""
        overrides = {
            "run_cone_analysis": False,
            "run_alignment_detection": False,
            "ouroboros_threshold": 0.3,
            "max_ouroboros_passes": 5,
            "entanglement_gate": 0.9,
        }
        custom = InformedAbliterationPipeline(model_name="test", **overrides)

        assert custom._run_cone is False
        assert custom._run_alignment is False
        assert custom._ouroboros_threshold == 0.3
        assert custom._max_ouroboros_passes == 5
        assert custom._entanglement_gate == 0.9
# ---------------------------------------------------------------------------
# Configuration derivation
# ---------------------------------------------------------------------------
class TestConfigurationDerivation:
    """Test the _derive_configuration logic with various insights.

    Each test overrides a few attributes on the pipeline's ``_insights``,
    calls ``_derive_configuration()`` and checks the resulting settings.
    Derived float settings are compared with ``pytest.approx`` so the tests
    do not depend on bit-exact floating-point arithmetic in the pipeline.
    """

    def _make_pipeline_with_insights(self, **kwargs):
        """Build a pipeline and set each keyword as an ``_insights`` attribute.

        Args:
            **kwargs: attribute name/value pairs assigned onto ``p._insights``.

        Returns:
            The constructed ``InformedAbliterationPipeline``.
        """
        p = InformedAbliterationPipeline(
            model_name="test",
            on_log=lambda m: None,  # no-op log callback
        )
        for k, v in kwargs.items():
            setattr(p._insights, k, v)
        return p

    def test_polyhedral_cone_more_directions(self):
        """A polyhedral cone of dimensionality 3.5 yields 7 directions."""
        p = self._make_pipeline_with_insights(
            cone_is_polyhedral=True,
            cone_dimensionality=3.5,
        )
        p._derive_configuration()
        # Polyhedral with dim 3.5 → n_dirs = max(4, min(8, int(3.5*2))) = 7
        assert p.n_directions == 7

    def test_linear_cone_fewer_directions(self):
        """A linear cone of dimensionality 1.0 yields 2 directions."""
        p = self._make_pipeline_with_insights(
            cone_is_polyhedral=False,
            cone_dimensionality=1.0,
        )
        p._derive_configuration()
        # Linear with dim 1.0 → n_dirs = max(1, min(4, int(1.0+1))) = 2
        assert p.n_directions == 2

    def test_dpo_zero_regularization(self):
        """DPO with low entanglement derives zero regularization."""
        p = self._make_pipeline_with_insights(
            detected_alignment_method="dpo",
            entanglement_score=0.1,
        )
        p._derive_configuration()
        assert p.regularization == pytest.approx(0.0)

    def test_rlhf_moderate_regularization(self):
        """RLHF with low entanglement derives regularization 0.15."""
        p = self._make_pipeline_with_insights(
            detected_alignment_method="rlhf",
            entanglement_score=0.2,
        )
        p._derive_configuration()
        assert p.regularization == pytest.approx(0.15)

    def test_cai_regularization(self):
        """CAI with low entanglement derives regularization 0.2."""
        p = self._make_pipeline_with_insights(
            detected_alignment_method="cai",
            entanglement_score=0.2,
        )
        p._derive_configuration()
        assert p.regularization == pytest.approx(0.2)

    def test_sft_low_regularization(self):
        """SFT with low entanglement derives regularization 0.05."""
        p = self._make_pipeline_with_insights(
            detected_alignment_method="sft",
            entanglement_score=0.1,
        )
        p._derive_configuration()
        assert p.regularization == pytest.approx(0.05)

    def test_high_entanglement_increases_regularization(self):
        """High entanglement raises regularization above the DPO base."""
        p = self._make_pipeline_with_insights(
            detected_alignment_method="dpo",
            entanglement_score=0.7,
        )
        p._derive_configuration()
        # DPO base = 0.0, + 0.15 for high entanglement = 0.15
        assert p.regularization == pytest.approx(0.15)

    def test_high_self_repair_more_passes(self):
        """A self-repair estimate of 0.8 derives 3 refinement passes."""
        p = self._make_pipeline_with_insights(
            self_repair_estimate=0.8,
        )
        p._derive_configuration()
        assert p.refinement_passes == 3

    def test_moderate_self_repair_two_passes(self):
        """A self-repair estimate of 0.5 derives 2 refinement passes."""
        p = self._make_pipeline_with_insights(
            self_repair_estimate=0.5,
        )
        p._derive_configuration()
        assert p.refinement_passes == 2

    def test_low_self_repair_one_pass(self):
        """A self-repair estimate of 0.2 derives 1 refinement pass."""
        p = self._make_pipeline_with_insights(
            self_repair_estimate=0.2,
        )
        p._derive_configuration()
        assert p.refinement_passes == 1

    def test_cluster_layers_used(self):
        """Cluster representative layers appear in the recommended layers."""
        p = self._make_pipeline_with_insights(
            cluster_representative_layers=[5, 10, 15],
            direction_clusters=[[3, 4, 5], [9, 10, 11], [14, 15, 16]],
        )
        # One random 64-element vector for each of layers 0..19.
        p.refusal_directions = {i: torch.randn(64) for i in range(20)}
        p._derive_configuration()
        # Should include all cluster layers
        assert 5 in p._insights.recommended_layers
        assert 10 in p._insights.recommended_layers

    def test_entangled_layers_skipped(self):
        """An entangled layer moves from recommended to skipped."""
        p = self._make_pipeline_with_insights(
            cluster_representative_layers=[5, 10, 15],
            direction_clusters=[[3, 4, 5], [9, 10, 11], [14, 15, 16]],
            entangled_layers=[10],
        )
        p._derive_configuration()
        # Layer 10 should be skipped
        assert 10 not in p._insights.recommended_layers
        assert 10 in p._insights.skip_layers

    def test_sparse_surgery_enabled_when_rsi_high(self):
        """Sparsity index 0.7 with threshold 0.5 enables sparse surgery."""
        p = self._make_pipeline_with_insights(
            mean_refusal_sparsity_index=0.7,
        )
        p._sparse_threshold = 0.5
        p._derive_configuration()
        assert p._insights.use_sparse_surgery is True

    def test_sparse_surgery_disabled_when_rsi_low(self):
        """Sparsity index 0.3 with threshold 0.5 leaves sparse surgery off."""
        p = self._make_pipeline_with_insights(
            mean_refusal_sparsity_index=0.3,
        )
        p._sparse_threshold = 0.5
        p._derive_configuration()
        assert p._insights.use_sparse_surgery is False

    def test_whitened_svd_for_multi_direction(self):
        """More than one derived direction keeps whitened SVD enabled."""
        p = self._make_pipeline_with_insights(
            cone_is_polyhedral=True,
            cone_dimensionality=2.5,
        )
        p._derive_configuration()
        assert p.n_directions > 1
        assert p.use_whitened_svd is True

    def test_no_whitened_svd_for_single_direction(self):
        """A single derived direction turns whitened SVD off."""
        p = self._make_pipeline_with_insights(
            cone_is_polyhedral=False,
            cone_dimensionality=0.5,
        )
        p._derive_configuration()
        # dim 0.5 → max(1, min(4, int(0.5+1))) = 1
        assert p.n_directions == 1
        assert p.use_whitened_svd is False
# ---------------------------------------------------------------------------
# Format report
# ---------------------------------------------------------------------------
class TestFormatInsights:
    """Checks on the text from ``InformedAbliterationPipeline.format_insights``."""

    def test_format_default(self, insights):
        """Default insights render the title, UNKNOWN method and LINEAR cone."""
        rendered = InformedAbliterationPipeline.format_insights(insights)
        expected_fragments = (
            "Analysis-Informed Pipeline",
            "UNKNOWN",  # detected method
            "LINEAR",  # cone type
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_format_polyhedral(self):
        """Polyhedral DPO insights render method, cone type and dimensionality."""
        polyhedral = AnalysisInsights(
            detected_alignment_method="dpo",
            alignment_confidence=0.85,
            cone_is_polyhedral=True,
            cone_dimensionality=3.5,
            cluster_count=4,
        )
        rendered = InformedAbliterationPipeline.format_insights(polyhedral)
        for fragment in ("DPO", "POLYHEDRAL", "3.50"):
            assert fragment in rendered

    def test_format_includes_derived_config(self, insights):
        """Recommended settings show up as ``name: value`` in the output."""
        insights.recommended_n_directions = 6
        insights.recommended_regularization = 0.2
        insights.recommended_refinement_passes = 3
        rendered = InformedAbliterationPipeline.format_insights(insights)
        expected_fragments = (
            "n_directions: 6",
            "regularization: 0.2",
            "refinement_passes: 3",
        )
        for fragment in expected_fragments:
            assert fragment in rendered
# ---------------------------------------------------------------------------
# Edge cases
# ---------------------------------------------------------------------------
class TestEdgeCases:
    """Boundary-condition checks for ``_derive_configuration``."""

    @staticmethod
    def _quiet_pipeline():
        """Build a pipeline for model ``"test"`` with a no-op log callback."""
        return InformedAbliterationPipeline(
            model_name="test",
            on_log=lambda m: None,
        )

    def test_no_cluster_layers_falls_back(self):
        """With no cluster representatives, no layers are recommended."""
        pipe = self._quiet_pipeline()
        pipe._insights.cluster_representative_layers = []
        pipe._derive_configuration()
        assert pipe._insights.recommended_layers == []

    def test_regularization_capped(self):
        """Regularization never exceeds 0.5, even for highly entangled CAI."""
        pipe = self._quiet_pipeline()
        found = pipe._insights
        found.detected_alignment_method = "cai"
        found.entanglement_score = 0.9
        pipe._derive_configuration()
        # CAI base = 0.2, + 0.15 = 0.35, capped at 0.5
        assert pipe.regularization <= 0.5

    def test_all_layers_entangled_keeps_some(self):
        """If all cluster layers are entangled, don't skip all of them."""
        pipe = self._quiet_pipeline()
        found = pipe._insights
        found.cluster_representative_layers = [5]
        found.direction_clusters = [[5]]
        found.entangled_layers = [5]
        pipe._derive_configuration()
        # Should NOT skip the only layer
        assert 5 in pipe._insights.recommended_layers

    def test_cone_dimensionality_bounds(self):
        """Extreme cone dimensionality values are handled."""
        pipe = self._quiet_pipeline()

        # Very high dimensionality
        pipe._insights.cone_is_polyhedral = True
        pipe._insights.cone_dimensionality = 10.0
        pipe._derive_configuration()
        assert pipe.n_directions <= 8  # capped

        # Very low dimensionality
        pipe._insights.cone_is_polyhedral = False
        pipe._insights.cone_dimensionality = 0.1
        pipe._derive_configuration()
        assert pipe.n_directions >= 1  # at least 1