File size: 8,453 Bytes
78ea1a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
"""Tests for artifact generation — consistency, determinism, and bug planting."""

import json
import pytest
from artifact_generator import (
    ArtifactGenerator, BUG_CATALOGUE, TASK_BUG_POOLS,
    run_sanity_check,
)
import random


class TestArtifactGeneration:
    """Structural checks on the artifact set produced for every catalogued bug."""

    # One shared marker: each test below runs once per key of BUG_CATALOGUE.
    _every_bug = pytest.mark.parametrize("bug_type", list(BUG_CATALOGUE))

    @staticmethod
    def _build(bug_type):
        """Return the artifacts generated for ``bug_type`` with the fixed seed 42."""
        return ArtifactGenerator(bug_type, seed=42).generate_all()

    @_every_bug
    def test_generates_all_six_artifacts(self, bug_type):
        """The generated artifact names are exactly the six expected files."""
        wanted = {
            "config.yaml",
            "train.log",
            "dataset_stats.json",
            "preprocessing.py",
            "eval_results.json",
            "model_card.json",
        }
        produced = self._build(bug_type)
        assert set(produced) == wanted

    @_every_bug
    def test_json_artifacts_are_valid(self, bug_type):
        """Each JSON artifact parses and its top-level value is a dict."""
        produced = self._build(bug_type)
        for name in ("dataset_stats.json", "eval_results.json", "model_card.json"):
            parsed = json.loads(produced[name])
            assert isinstance(parsed, dict), f"{name} is not a dict"

    @_every_bug
    def test_config_yaml_has_required_sections(self, bug_type):
        """config.yaml contains every required section header as a substring."""
        yaml_text = self._build(bug_type)["config.yaml"]
        required = ("model:", "training:", "optimizer:", "scheduler:", "data:")
        for section in required:
            assert section in yaml_text, f"Missing {section} in config.yaml"

    @_every_bug
    def test_train_log_has_epochs(self, bug_type):
        """train.log mentions "epoch" in some letter case."""
        log_text = self._build(bug_type)["train.log"]
        # A case-insensitive search also matches the all-caps "EPOCH" spelling.
        assert "epoch" in log_text.lower()

    @_every_bug
    def test_preprocessing_is_valid_python(self, bug_type):
        """preprocessing.py compiles, i.e. it is syntactically valid Python."""
        source = self._build(bug_type)["preprocessing.py"]
        # compile() raises SyntaxError on malformed code; nothing is executed.
        compile(source, f"<{bug_type}_preprocessing>", "exec")


class TestDeterminism:
    """Same (bug_type, seed) must produce identical artifacts."""

    @pytest.mark.parametrize("bug_type", ["exploding_lr", "data_leakage_scaler", "label_encoder_mismatch"])
    def test_same_seed_same_artifacts(self, bug_type):
        """Two generators built with the same bug type and seed agree exactly.

        Both the set of artifact names and the content of every artifact
        must match between the two runs.
        """
        gen1 = ArtifactGenerator(bug_type, seed=42)
        gen2 = ArtifactGenerator(bug_type, seed=42)
        a1 = gen1.generate_all()
        a2 = gen2.generate_all()
        # Compare the names first: iterating only a1 would silently ignore
        # extra artifacts in a2 and turn a missing one into a KeyError
        # instead of a readable assertion failure.
        assert set(a1) == set(a2), "artifact names differ between runs"
        for name in a1:
            assert a1[name] == a2[name], f"{name} differs between runs"

    def test_different_seeds_differ(self):
        """Seeds 1 and 999 yield different config.yaml for "exploding_lr"."""
        gen1 = ArtifactGenerator("exploding_lr", seed=1)
        gen2 = ArtifactGenerator("exploding_lr", seed=999)
        a1 = gen1.generate_all()
        a2 = gen2.generate_all()
        # Only config.yaml is compared; other artifacts are not checked here.
        assert a1["config.yaml"] != a2["config.yaml"]


class TestBugPlanting:
    """Look for the textual fingerprint each bug type leaves in its artifact."""

    @staticmethod
    def _render(bug_type, artifact_name):
        """Generate artifacts for ``bug_type`` (seed 42) and return one by name."""
        return ArtifactGenerator(bug_type, seed=42).generate_all()[artifact_name]

    def test_exploding_lr_has_high_lr(self):
        """config.yaml sets learning_rate to one of 50.0, 10.0 or 25.0."""
        config = self._render("exploding_lr", "config.yaml")
        # LR should be absurdly high (10, 25, or 50)
        matches = [
            lr for lr in ("50.0", "10.0", "25.0")
            if f"learning_rate: {lr}" in config
        ]
        assert matches

    def test_wrong_optimizer_has_high_momentum(self):
        """config.yaml contains the momentum value 0.99."""
        config = self._render("wrong_optimizer", "config.yaml")
        assert "momentum: 0.99" in config

    def test_batch_size_overflow_has_large_batch(self):
        """config.yaml sets batch_size to one of 2048, 4096 or 8192."""
        config = self._render("batch_size_overflow", "config.yaml")
        matches = [
            bs for bs in ("2048", "4096", "8192")
            if f"batch_size: {bs}" in config
        ]
        assert matches

    def test_data_leakage_scaler_fits_before_split(self):
        """preprocessing.py calls fit_transform and carries a leak marker."""
        code = self._render("data_leakage_scaler", "preprocessing.py")
        assert "fit_transform" in code
        # Either marker string is accepted as evidence of the planted leak.
        assert any(marker in code for marker in ("BUG", "sees val/test"))

    def test_data_leakage_overlap_has_no_random_state(self):
        """preprocessing.py contains random_state=None."""
        code = self._render("data_leakage_overlap", "preprocessing.py")
        assert "random_state=None" in code

    def test_wrong_split_ratio_has_inverted_split(self):
        """preprocessing.py contains test_size=0.8."""
        code = self._render("wrong_split_ratio", "preprocessing.py")
        assert "test_size=0.8" in code

    def test_label_encoder_mismatch_has_two_encoders(self):
        """preprocessing.py mentions both le_train and le_eval."""
        code = self._render("label_encoder_mismatch", "preprocessing.py")
        assert all(token in code for token in ("le_train", "le_eval"))

    def test_silent_metric_swap_has_swapped_assignments(self):
        """preprocessing.py mentions both test_acc and val_acc."""
        code = self._render("silent_metric_swap", "preprocessing.py")
        assert all(token in code for token in ("test_acc", "val_acc"))

    def test_tokenizer_drift_has_version_mismatch(self):
        """preprocessing.py mentions both TOKENIZER_V1 and TOKENIZER_V2."""
        code = self._render("tokenizer_version_drift", "preprocessing.py")
        assert all(token in code for token in ("TOKENIZER_V1", "TOKENIZER_V2"))


class TestSanityChecks:
    """Run named sanity checks against generated artifacts and check the verdict."""

    @staticmethod
    def _verdict(check_name, bug_type):
        """Return the "result" field of one sanity check run.

        Artifacts come from ``ArtifactGenerator(bug_type, seed=42)`` and the
        check receives a fresh ``random.Random(42)``.
        """
        artifacts = ArtifactGenerator(bug_type, seed=42).generate_all()
        outcome = run_sanity_check(check_name, bug_type, artifacts, random.Random(42))
        return outcome["result"]

    def test_gradient_norms_detects_exploding_lr(self):
        """gradient_norms reports ANOMALY for exploding_lr."""
        assert self._verdict("gradient_norms", "exploding_lr") == "ANOMALY"

    def test_data_leakage_detects_scaler_leak(self):
        """data_leakage reports FAIL for data_leakage_scaler."""
        assert self._verdict("data_leakage", "data_leakage_scaler") == "FAIL"

    def test_label_consistency_detects_mismatch(self):
        """label_consistency reports FAIL for label_encoder_mismatch."""
        assert self._verdict("label_consistency", "label_encoder_mismatch") == "FAIL"

    def test_encoder_version_detects_drift(self):
        """encoder_version_match reports MISMATCH for tokenizer_version_drift."""
        assert self._verdict("encoder_version_match", "tokenizer_version_drift") == "MISMATCH"

    def test_metric_gap_detects_hard_bugs(self):
        """metric_gap_analysis reports ANOMALY for every bug in the "hard" pool."""
        for bug_type in TASK_BUG_POOLS["hard"]:
            verdict = self._verdict("metric_gap_analysis", bug_type)
            assert verdict == "ANOMALY", f"metric_gap missed {bug_type}"

    def test_unknown_check_returns_unknown(self):
        """An unrecognised check name yields the result UNKNOWN."""
        assert self._verdict("nonexistent_check", "exploding_lr") == "UNKNOWN"


class TestBugCatalogue:
    """Validate BUG_CATALOGUE entries and their grouping in TASK_BUG_POOLS."""

    def test_all_bugs_have_required_fields(self):
        """Every catalogue entry carries well-formed metadata."""
        known_categories = (
            "config_error", "data_leakage", "preprocessing_bug",
            "evaluation_bug", "label_mismatch", "architecture_bug",
        )
        known_suffixes = (".yaml", ".py", ".json")
        known_difficulties = ("easy", "medium", "hard")

        for key, entry in BUG_CATALOGUE.items():
            # The entry must be registered under its own bug_type.
            assert entry.bug_type == key
            assert entry.category in known_categories
            assert entry.file.endswith(known_suffixes)
            assert len(entry.field) > 0
            # gold_fix must be longer than 10 characters.
            assert len(entry.gold_fix) > 10
            assert entry.task_difficulty in known_difficulties

    def test_task_pools_cover_all_bugs(self):
        """The union of all task pools equals the set of catalogue keys."""
        pooled = set().union(*TASK_BUG_POOLS.values())
        assert pooled == set(BUG_CATALOGUE.keys())

    def test_each_pool_has_three_bugs(self):
        """Each task pool lists exactly three bugs."""
        for task_id, pool in TASK_BUG_POOLS.items():
            size = len(pool)
            assert size == 3, f"{task_id} has {size} bugs, expected 3"