Spaces:
Running
Running
| """Tests for the analysis techniques.""" | |
| from __future__ import annotations | |
| import torch | |
| from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor, WhitenedSVDResult | |
| from obliteratus.analysis.cross_layer import CrossLayerAlignmentAnalyzer, CrossLayerResult | |
| from obliteratus.analysis.activation_probing import ActivationProbe, ProbeResult | |
| # --------------------------------------------------------------------------- | |
| # WhitenedSVDExtractor | |
| # --------------------------------------------------------------------------- | |
class TestWhitenedSVD:
    """Checks on the output of ``WhitenedSVDExtractor`` for synthetic activations."""

    def test_basic_extraction(self):
        """Extraction returns a result with the requested shapes and positive stats."""
        torch.manual_seed(42)
        hidden_dim = 32
        n_prompts = 10

        # Plant a unit-norm direction, then shift each harmless sample along it
        # to obtain the matching harmful sample.
        planted = torch.randn(hidden_dim)
        planted = planted / planted.norm()
        harmless = [torch.randn(hidden_dim) for _ in range(n_prompts)]
        harmful = [base + 2.0 * planted for base in harmless]

        result = WhitenedSVDExtractor().extract(harmful, harmless, n_directions=3)

        assert isinstance(result, WhitenedSVDResult)
        assert result.directions.shape == (3, hidden_dim)
        assert result.singular_values.shape == (3,)
        assert result.variance_explained > 0
        assert result.condition_number > 0
        assert result.effective_rank > 0

    def test_directions_are_unit_vectors(self):
        """Every extracted direction has norm 1 (within 1e-4)."""
        torch.manual_seed(42)
        harmless = [torch.randn(16) for _ in range(8)]
        harmful = [base + torch.randn(16) * 0.5 for base in harmless]

        result = WhitenedSVDExtractor().extract(harmful, harmless, n_directions=2)

        for direction in result.directions:
            length = direction.norm().item()
            assert abs(length - 1.0) < 1e-4

    def test_primary_aligns_with_planted_direction(self):
        """The first whitened direction should pick up the planted refusal signal.

        Whitening re-expresses directions relative to the covariance structure,
        so exact agreement with the raw planted direction is not expected. The
        test therefore only requires moderate alignment plus a substantial
        share of explained variance.
        """
        torch.manual_seed(42)
        hidden_dim = 64
        n_prompts = 30

        refusal_dir = torch.randn(hidden_dim)
        refusal_dir = refusal_dir / refusal_dir.norm()

        # Small isotropic noise for the harmless set (whitening has minimal effect).
        harmless = [torch.randn(hidden_dim) * 0.1 for _ in range(n_prompts)]
        harmful = [base + 5.0 * refusal_dir for base in harmless]

        extractor = WhitenedSVDExtractor(regularization_eps=1e-3)
        result = extractor.extract(harmful, harmless, n_directions=1)

        projection = result.directions[0] @ refusal_dir
        cos_sim = projection.abs().item()

        # Only moderate alignment is required because whitening reweights dimensions.
        assert cos_sim > 0.2, f"Expected alignment > 0.2, got {cos_sim:.3f}"
        # The stronger requirement: the direction explains most of the variance.
        assert result.variance_explained > 0.5

    def test_extract_all_layers(self):
        """One result per input layer, each with the requested direction count."""
        torch.manual_seed(42)
        n_layers = 4
        harmful_acts = {}
        harmless_acts = {}
        for layer in range(n_layers):
            # Harmful is sampled before harmless to keep the seeded draw order.
            harmful_acts[layer] = [torch.randn(16) for _ in range(5)]
            harmless_acts[layer] = [torch.randn(16) for _ in range(5)]

        results = WhitenedSVDExtractor().extract_all_layers(
            harmful_acts, harmless_acts, n_directions=2
        )

        assert len(results) == n_layers
        for layer in range(n_layers):
            assert layer in results
            assert results[layer].directions.shape[0] == 2

    def test_compare_with_standard(self):
        """The comparison dict exposes both cosine keys; the primary one is in [0, 1]."""
        torch.manual_seed(42)
        harmless = [torch.randn(16) for _ in range(8)]
        harmful = [base + torch.randn(16) for base in harmless]
        result = WhitenedSVDExtractor().extract(harmful, harmless, n_directions=2)

        # Arbitrary unit vector standing in for the "standard" direction.
        std_dir = torch.randn(16)
        std_dir = std_dir / std_dir.norm()

        comparison = WhitenedSVDExtractor.compare_with_standard(result, std_dir)

        for key in ("primary_direction_cosine", "subspace_principal_cosine"):
            assert key in comparison
        assert 0 <= comparison["primary_direction_cosine"] <= 1.0

    def test_handles_3d_activations(self):
        """Per-prompt activations carrying a leading size-1 dimension are accepted."""
        torch.manual_seed(42)
        # Each sample is (1, hidden_dim), as produced by a hook output.
        harmless = [torch.randn(1, 16) for _ in range(5)]
        harmful = [torch.randn(1, 16) for _ in range(5)]

        result = WhitenedSVDExtractor().extract(harmful, harmless, n_directions=2)

        assert result.directions.shape == (2, 16)

    def test_variance_explained_bounded(self):
        """``variance_explained`` lies in the closed interval [0, 1]."""
        torch.manual_seed(42)
        harmless = [torch.randn(16) for _ in range(8)]
        harmful = [torch.randn(16) for _ in range(8)]

        result = WhitenedSVDExtractor().extract(harmful, harmless, n_directions=3)

        assert 0 <= result.variance_explained <= 1.0
| # --------------------------------------------------------------------------- | |
| # CrossLayerAlignmentAnalyzer | |
| # --------------------------------------------------------------------------- | |
class TestCrossLayerAlignment:
    """Checks on ``CrossLayerAlignmentAnalyzer.analyze`` and its report formatter.

    Every test seeds the global torch RNG before sampling so that the inputs
    (and therefore any failure) are reproducible from run to run.
    """

    def test_identical_directions(self):
        """Identical directions across layers should give persistence = 1."""
        torch.manual_seed(42)
        direction = torch.randn(32)
        direction = direction / direction.norm()
        directions = {i: direction.clone() for i in range(5)}

        analyzer = CrossLayerAlignmentAnalyzer()
        result = analyzer.analyze(directions)

        assert isinstance(result, CrossLayerResult)
        assert result.direction_persistence_score > 0.99
        assert result.mean_adjacent_cosine > 0.99
        assert result.total_geodesic_distance < 0.01

    def test_orthogonal_directions(self):
        """Orthogonal directions should give low persistence."""
        torch.manual_seed(42)
        # QR of the (32, 5) transpose yields a Q whose 5 columns are mutually
        # orthonormal; each column becomes one layer's direction.
        M = torch.randn(5, 32)
        Q, _ = torch.linalg.qr(M.T)
        directions = {i: Q[:, i] for i in range(5)}

        analyzer = CrossLayerAlignmentAnalyzer()
        result = analyzer.analyze(directions)

        assert result.direction_persistence_score < 0.3
        assert result.mean_adjacent_cosine < 0.3

    def test_cluster_detection(self):
        """Should detect clusters of similar directions."""
        torch.manual_seed(42)
        # Two cluster centres; layers 0-2 sit near d1, layers 3-4 near d2.
        d1 = torch.randn(32)
        d1 = d1 / d1.norm()
        d2 = torch.randn(32)
        d2 = d2 / d2.norm()
        directions = {
            0: d1,
            1: d1 + 0.01 * torch.randn(32),
            2: d1 + 0.01 * torch.randn(32),
            3: d2,
            4: d2 + 0.01 * torch.randn(32),
        }
        # Re-normalize after adding the small perturbations.
        directions = {k: v / v.norm() for k, v in directions.items()}

        analyzer = CrossLayerAlignmentAnalyzer(cluster_threshold=0.9)
        result = analyzer.analyze(directions)

        # Should find at least 2 clusters
        assert result.cluster_count >= 2

    def test_empty_input(self):
        """Should handle empty input gracefully."""
        analyzer = CrossLayerAlignmentAnalyzer()
        result = analyzer.analyze({})

        assert result.layer_indices == []
        assert result.cluster_count == 0

    def test_single_layer(self):
        """A single layer is accepted and reports persistence of exactly 1.0."""
        torch.manual_seed(42)
        analyzer = CrossLayerAlignmentAnalyzer()
        result = analyzer.analyze({5: torch.randn(16)})

        assert result.layer_indices == [5]
        assert result.direction_persistence_score == 1.0

    def test_strong_layers_filter(self):
        """Should only analyze specified strong layers."""
        torch.manual_seed(42)
        directions = {i: torch.randn(16) for i in range(10)}

        analyzer = CrossLayerAlignmentAnalyzer()
        result = analyzer.analyze(directions, strong_layers=[2, 5, 7])

        assert result.layer_indices == [2, 5, 7]
        assert result.cosine_matrix.shape == (3, 3)

    def test_cosine_matrix_symmetry(self):
        """Cosine matrix should be symmetric."""
        torch.manual_seed(42)
        directions = {i: torch.randn(16) for i in range(4)}

        analyzer = CrossLayerAlignmentAnalyzer()
        result = analyzer.analyze(directions)

        # Largest absolute difference between the matrix and its transpose.
        diff = (result.cosine_matrix - result.cosine_matrix.T).abs().max().item()
        assert diff < 1e-5

    def test_cosine_matrix_diagonal_ones(self):
        """Diagonal of cosine matrix should be 1.0."""
        torch.manual_seed(42)
        directions = {i: torch.randn(16) for i in range(4)}

        analyzer = CrossLayerAlignmentAnalyzer()
        result = analyzer.analyze(directions)

        for i in range(4):
            assert abs(result.cosine_matrix[i, i].item() - 1.0) < 1e-4

    def test_angular_drift_monotonic(self):
        """Angular drift should be monotonically non-decreasing."""
        torch.manual_seed(42)
        directions = {i: torch.randn(16) for i in range(6)}

        analyzer = CrossLayerAlignmentAnalyzer()
        result = analyzer.analyze(directions)

        # Allow a 1e-6 slack for floating-point noise between consecutive entries.
        for i in range(len(result.angular_drift) - 1):
            assert result.angular_drift[i + 1] >= result.angular_drift[i] - 1e-6

    def test_format_report(self):
        """The formatted report mentions "Cross-Layer" and "persistence"."""
        torch.manual_seed(42)
        directions = {i: torch.randn(16) for i in range(4)}

        analyzer = CrossLayerAlignmentAnalyzer()
        result = analyzer.analyze(directions)
        report = CrossLayerAlignmentAnalyzer.format_report(result)

        assert "Cross-Layer" in report
        assert "persistence" in report
| # --------------------------------------------------------------------------- | |
| # ActivationProbe | |
| # --------------------------------------------------------------------------- | |
class TestActivationProbe:
    """Checks on ``ActivationProbe`` per-layer and aggregate probing."""

    def test_clean_elimination(self):
        """With no planted refusal signal, gap and d' should both be small.

        Both activation sets are independent random draws, standing in for
        "post-abliteration" activations where the direction has been removed.
        """
        torch.manual_seed(42)
        hidden_dim = 32
        refusal_dir = torch.randn(hidden_dim)
        refusal_dir = refusal_dir / refusal_dir.norm()

        # Both sets are random, no refusal signal => gap should be small
        harmless = [torch.randn(hidden_dim) for _ in range(10)]
        harmful = [torch.randn(hidden_dim) for _ in range(10)]

        probe = ActivationProbe()
        result = probe.probe_layer(harmful, harmless, refusal_dir)

        assert abs(result.projection_gap) < 1.0
        assert result.separation_d_prime < 2.0

    def test_residual_detection(self):
        """Should detect residual refusal signal when direction wasn't removed."""
        torch.manual_seed(42)
        hidden_dim = 32
        refusal_dir = torch.randn(hidden_dim)
        refusal_dir = refusal_dir / refusal_dir.norm()

        harmless = [torch.randn(hidden_dim) for _ in range(10)]
        # Harmful still has strong refusal direction component
        harmful = [h + 5.0 * refusal_dir for h in harmless]

        probe = ActivationProbe()
        result = probe.probe_layer(harmful, harmless, refusal_dir)

        assert abs(result.projection_gap) > 1.0
        assert result.separation_d_prime > 2.0

    def test_probe_all_layers(self):
        """Should compute aggregate metrics across layers."""
        torch.manual_seed(42)
        hidden_dim = 16
        n_layers = 4
        harmful_acts = {}
        harmless_acts = {}
        refusal_dirs = {}
        for layer in range(n_layers):
            harmful_acts[layer] = [torch.randn(hidden_dim) for _ in range(5)]
            harmless_acts[layer] = [torch.randn(hidden_dim) for _ in range(5)]
            d = torch.randn(hidden_dim)
            refusal_dirs[layer] = d / d.norm()

        probe = ActivationProbe()
        result = probe.probe_all_layers(harmful_acts, harmless_acts, refusal_dirs)

        assert isinstance(result, ProbeResult)
        assert len(result.per_layer) == n_layers
        assert 0 <= result.refusal_elimination_score <= 1.0
        assert result.mean_projection_gap >= 0

    def test_res_score_range(self):
        """RES should always be between 0 and 1."""
        # Each iteration reseeds, so no seed is set before the loop.
        for seed in range(5):
            torch.manual_seed(seed)
            harmful = {0: [torch.randn(8) for _ in range(3)]}
            harmless = {0: [torch.randn(8) for _ in range(3)]}
            dirs = {0: torch.randn(8)}
            dirs[0] = dirs[0] / dirs[0].norm()

            probe = ActivationProbe()
            result = probe.probe_all_layers(harmful, harmless, dirs)

            assert 0 <= result.refusal_elimination_score <= 1.0

    def test_format_report(self):
        """The formatted report contains the "Refusal Elimination Score" heading."""
        torch.manual_seed(42)
        harmful = {0: [torch.randn(8) for _ in range(3)]}
        harmless = {0: [torch.randn(8) for _ in range(3)]}
        dirs = {0: torch.randn(8)}
        # Normalize so the direction is a unit vector, as in the sibling tests.
        dirs[0] = dirs[0] / dirs[0].norm()

        probe = ActivationProbe()
        result = probe.probe_all_layers(harmful, harmless, dirs)
        report = ActivationProbe.format_report(result)

        assert "Refusal Elimination Score" in report

    def test_empty_input(self):
        """Should handle empty input gracefully."""
        probe = ActivationProbe()
        result = probe.probe_all_layers({}, {}, {})

        assert result.refusal_elimination_score == 0.0
        assert len(result.per_layer) == 0