Rockerleo committed on
Commit
78ea1a9
·
verified ·
1 Parent(s): dc936ba

Upload folder using huggingface_hub

Browse files
tests/__init__.py ADDED
File without changes
tests/test_api.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the FastAPI server — endpoint responses and error handling."""
2
+
3
+ import pytest
4
+ from fastapi.testclient import TestClient
5
+ from app import app
6
+
7
+
8
@pytest.fixture
def client():
    """Yield a ``TestClient`` bound to the FastAPI ``app`` under test.

    The client is entered as a context manager so that the application's
    startup/shutdown (lifespan) handlers run around each test and the client
    is closed when the test finishes.
    """
    with TestClient(app) as test_client:
        yield test_client
11
+
12
+
13
class TestHealthAndInfo:
    """Smoke tests for the read-only ``/``, ``/health`` and ``/tasks`` endpoints."""

    def test_root(self, client):
        """``GET /`` answers 200 and its message names the API."""
        response = client.get("/")
        assert response.status_code == 200
        payload = response.json()
        assert "MLOps Pipeline Debugger API" in payload["message"]

    def test_health(self, client):
        """``GET /health`` answers 200 with status ``ok``."""
        response = client.get("/health")
        assert response.status_code == 200
        payload = response.json()
        assert payload["status"] == "ok"

    def test_tasks(self, client):
        """``GET /tasks`` lists exactly the easy, medium and hard tasks."""
        response = client.get("/tasks")
        assert response.status_code == 200
        listed = response.json()["tasks"]
        assert len(listed) == 3
        seen_ids = set()
        for entry in listed:
            seen_ids.add(entry["task_id"])
        assert seen_ids == {"easy", "medium", "hard"}
31
+
32
+
33
class TestResetEndpoint:
    """``POST /reset`` should answer 200 and echo the task that was started."""

    def test_reset_easy(self, client):
        """Resetting the easy task yields an untouched episode with six artifacts."""
        request_body = {"task_id": "easy", "seed": 42}
        response = client.post("/reset", json=request_body)
        assert response.status_code == 200
        observation = response.json()
        assert observation["task_id"] == "easy"
        assert observation["step_count"] == 0
        assert observation["done"] is False
        assert len(observation["available_artifacts"]) == 6

    def test_reset_hard(self, client):
        """Resetting with ``task_id="hard"`` reports the hard task."""
        request_body = {"task_id": "hard", "seed": 42}
        response = client.post("/reset", json=request_body)
        assert response.status_code == 200
        observation = response.json()
        assert observation["task_id"] == "hard"

    def test_reset_default(self, client):
        """An empty request body is expected to fall back to the easy task."""
        response = client.post("/reset", json={})
        assert response.status_code == 200
        observation = response.json()
        assert observation["task_id"] == "easy"
52
+
53
+
54
class TestStepEndpoint:
    """``POST /step`` applies one action to the episode started via ``/reset``."""

    @staticmethod
    def _start_easy_episode(client):
        """Reset to the easy task (seed 42) and fail fast if the reset is rejected.

        Checking the reset response here keeps a broken ``/reset`` from
        surfacing as a confusing failure in the ``/step`` assertions.
        """
        response = client.post("/reset", json={"task_id": "easy", "seed": 42})
        assert response.status_code == 200, response.text
        return response

    def test_step_read_config(self, client):
        """Reading the config is accepted, rewarded with 0.02, and does not end the episode."""
        self._start_easy_episode(client)
        response = client.post("/step", json={"action_type": "read_config"})
        assert response.status_code == 200
        body = response.json()
        # The reward is a float that round-trips through JSON; compare with a
        # tolerance instead of bit-for-bit equality.
        assert body["reward"] == pytest.approx(0.02)
        assert body["done"] is False

    def test_step_submit_diagnosis(self, client):
        """Submitting a diagnosis ends the episode with a score strictly inside (0, 1)."""
        self._start_easy_episode(client)
        diagnosis = {
            "action_type": "submit_diagnosis",
            "failure_category": "config_error",
            "root_cause_file": "config.yaml",
            "root_cause_field": "optimizer.learning_rate",
            "proposed_fix": "Reduce learning_rate",
        }
        response = client.post("/step", json=diagnosis)
        assert response.status_code == 200
        body = response.json()
        assert body["done"] is True
        assert 0 < body["info"]["score"] < 1

    def test_step_invalid_action(self, client):
        """An unknown ``action_type`` is rejected with HTTP 422."""
        self._start_easy_episode(client)
        response = client.post("/step", json={"action_type": "invalid_action"})
        assert response.status_code == 422

    def test_step_nested_action_format(self, client):
        """The action may also be wrapped under an ``"action"`` key."""
        self._start_easy_episode(client)
        response = client.post("/step", json={"action": {"action_type": "read_config"}})
        assert response.status_code == 200
86
+
87
+
88
class TestStateEndpoint:
    """``GET /state`` should expose the episode created by the last reset."""

    def test_state_after_reset(self, client):
        """After resetting easy/seed 42, the state echoes both and includes ``bug_type``."""
        client.post("/reset", json={"task_id": "easy", "seed": 42})
        response = client.get("/state")
        assert response.status_code == 200
        state = response.json()
        assert state["task_id"] == "easy"
        assert state["seed"] == 42
        assert "bug_type" in state
97
+
98
+
99
class TestOpenEnvState:
    """``GET /openenv/state`` should report a ``scores`` mapping."""

    def test_openenv_state(self, client):
        """The response is 200 and its ``scores`` entry has an ``easy`` key."""
        response = client.get("/openenv/state")
        assert response.status_code == 200
        state = response.json()
        assert "scores" in state
        scores = state["scores"]
        assert "easy" in scores
tests/test_artifacts.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for artifact generation — consistency, determinism, and bug planting."""
2
+
3
+ import json
4
+ import pytest
5
+ from artifact_generator import (
6
+ ArtifactGenerator, BUG_CATALOGUE, TASK_BUG_POOLS,
7
+ run_sanity_check,
8
+ )
9
+ import random
10
+
11
+
12
class TestArtifactGeneration:
    """Every catalogued bug type must yield a complete, well-formed artifact set."""

    # Shared parametrization: each test below runs once per catalogued bug type.
    _per_bug_type = pytest.mark.parametrize("bug_type", list(BUG_CATALOGUE.keys()))

    @staticmethod
    def _build(bug_type):
        """Generate the artifact mapping for ``bug_type`` using the fixed seed 42."""
        return ArtifactGenerator(bug_type, seed=42).generate_all()

    @_per_bug_type
    def test_generates_all_six_artifacts(self, bug_type):
        """The generator returns exactly the six expected artifact names."""
        produced = self._build(bug_type)
        assert set(produced.keys()) == {
            "config.yaml",
            "train.log",
            "dataset_stats.json",
            "preprocessing.py",
            "eval_results.json",
            "model_card.json",
        }

    @_per_bug_type
    def test_json_artifacts_are_valid(self, bug_type):
        """Each ``*.json`` artifact parses and decodes to a dict."""
        produced = self._build(bug_type)
        json_names = ("dataset_stats.json", "eval_results.json", "model_card.json")
        for name in json_names:
            decoded = json.loads(produced[name])
            assert isinstance(decoded, dict), f"{name} is not a dict"

    @_per_bug_type
    def test_config_yaml_has_required_sections(self, bug_type):
        """``config.yaml`` mentions every required top-level section header."""
        config_text = self._build(bug_type)["config.yaml"]
        required = ("model:", "training:", "optimizer:", "scheduler:", "data:")
        for section in required:
            assert section in config_text, f"Missing {section} in config.yaml"

    @_per_bug_type
    def test_train_log_has_epochs(self, bug_type):
        """``train.log`` mentions epochs in some letter case."""
        log_text = self._build(bug_type)["train.log"]
        # A case-insensitive search also covers the literal upper-case "EPOCH".
        assert "epoch" in log_text.lower()

    @_per_bug_type
    def test_preprocessing_is_valid_python(self, bug_type):
        """``preprocessing.py`` is syntactically valid Python."""
        source = self._build(bug_type)["preprocessing.py"]
        # compile() raises SyntaxError on invalid code; the code is never executed.
        compile(source, f"<{bug_type}_preprocessing>", "exec")
52
+
53
+
54
class TestDeterminism:
    """Same (bug_type, seed) must produce identical artifacts."""

    @pytest.mark.parametrize("bug_type", ["exploding_lr", "data_leakage_scaler", "label_encoder_mismatch"])
    def test_same_seed_same_artifacts(self, bug_type):
        """Two generators built with the same seed produce the same artifacts."""
        gen1 = ArtifactGenerator(bug_type, seed=42)
        gen2 = ArtifactGenerator(bug_type, seed=42)
        a1 = gen1.generate_all()
        a2 = gen2.generate_all()
        # Compare the artifact names first: iterating only over the first run
        # would not notice an artifact that exists solely in the second run.
        assert set(a1) == set(a2), "artifact names differ between runs"
        for name in a1:
            assert a1[name] == a2[name], f"{name} differs between runs"

    def test_different_seeds_differ(self):
        """Seeds 1 and 999 are expected to yield different ``config.yaml`` contents."""
        gen1 = ArtifactGenerator("exploding_lr", seed=1)
        gen2 = ArtifactGenerator("exploding_lr", seed=999)
        a1 = gen1.generate_all()
        a2 = gen2.generate_all()
        assert a1["config.yaml"] != a2["config.yaml"]
72
+
73
+
74
class TestBugPlanting:
    """Each bug type must leave its characteristic marker in the relevant artifact."""

    @staticmethod
    def _artifact(bug_type, filename):
        """Return one artifact's text for ``bug_type`` generated with seed 42."""
        generated = ArtifactGenerator(bug_type, seed=42).generate_all()
        return generated[filename]

    def test_exploding_lr_has_high_lr(self):
        config = self._artifact("exploding_lr", "config.yaml")
        # The planted learning rate is one of a few absurdly large values.
        candidates = ("50.0", "10.0", "25.0")
        assert any(f"learning_rate: {lr}" in config for lr in candidates)

    def test_wrong_optimizer_has_high_momentum(self):
        config = self._artifact("wrong_optimizer", "config.yaml")
        assert "momentum: 0.99" in config

    def test_batch_size_overflow_has_large_batch(self):
        config = self._artifact("batch_size_overflow", "config.yaml")
        candidates = ("2048", "4096", "8192")
        assert any(f"batch_size: {bs}" in config for bs in candidates)

    def test_data_leakage_scaler_fits_before_split(self):
        code = self._artifact("data_leakage_scaler", "preprocessing.py")
        assert "fit_transform" in code
        has_marker = "BUG" in code or "sees val/test" in code
        assert has_marker

    def test_data_leakage_overlap_has_no_random_state(self):
        code = self._artifact("data_leakage_overlap", "preprocessing.py")
        assert "random_state=None" in code

    def test_wrong_split_ratio_has_inverted_split(self):
        code = self._artifact("wrong_split_ratio", "preprocessing.py")
        assert "test_size=0.8" in code

    def test_label_encoder_mismatch_has_two_encoders(self):
        code = self._artifact("label_encoder_mismatch", "preprocessing.py")
        assert all(encoder in code for encoder in ("le_train", "le_eval"))

    def test_silent_metric_swap_has_swapped_assignments(self):
        code = self._artifact("silent_metric_swap", "preprocessing.py")
        assert all(metric in code for metric in ("test_acc", "val_acc"))

    def test_tokenizer_drift_has_version_mismatch(self):
        code = self._artifact("tokenizer_version_drift", "preprocessing.py")
        assert all(version in code for version in ("TOKENIZER_V1", "TOKENIZER_V2"))
123
+
124
+
125
class TestSanityChecks:
    """Each sanity check should flag the bug it is designed to catch."""

    @staticmethod
    def _check(check_name, bug_type):
        """Run ``check_name`` against seed-42 artifacts for ``bug_type``.

        A fresh ``random.Random(42)`` is handed to every call, matching what
        each test did inline.
        """
        artifacts = ArtifactGenerator(bug_type, seed=42).generate_all()
        return run_sanity_check(check_name, bug_type, artifacts, random.Random(42))

    def test_gradient_norms_detects_exploding_lr(self):
        outcome = self._check("gradient_norms", "exploding_lr")
        assert outcome["result"] == "ANOMALY"

    def test_data_leakage_detects_scaler_leak(self):
        outcome = self._check("data_leakage", "data_leakage_scaler")
        assert outcome["result"] == "FAIL"

    def test_label_consistency_detects_mismatch(self):
        outcome = self._check("label_consistency", "label_encoder_mismatch")
        assert outcome["result"] == "FAIL"

    def test_encoder_version_detects_drift(self):
        outcome = self._check("encoder_version_match", "tokenizer_version_drift")
        assert outcome["result"] == "MISMATCH"

    def test_metric_gap_detects_hard_bugs(self):
        """``metric_gap_analysis`` reports ANOMALY for every bug in the hard pool."""
        for bug_type in TASK_BUG_POOLS["hard"]:
            outcome = self._check("metric_gap_analysis", bug_type)
            assert outcome["result"] == "ANOMALY", f"metric_gap missed {bug_type}"

    def test_unknown_check_returns_unknown(self):
        """An unrecognised check name yields the ``UNKNOWN`` result."""
        outcome = self._check("nonexistent_check", "exploding_lr")
        assert outcome["result"] == "UNKNOWN"
170
+
171
+
172
class TestBugCatalogue:
    """The bug catalogue and the per-task pools must agree and be well-formed."""

    def test_all_bugs_have_required_fields(self):
        """Every catalogue entry carries sane metadata keyed by its own name."""
        allowed_categories = (
            "config_error", "data_leakage", "preprocessing_bug",
            "evaluation_bug", "label_mismatch", "architecture_bug",
        )
        allowed_suffixes = (".yaml", ".py", ".json")
        allowed_difficulties = ("easy", "medium", "hard")
        for name in BUG_CATALOGUE.keys():
            bug = BUG_CATALOGUE[name]
            assert bug.bug_type == name
            assert bug.category in allowed_categories
            assert bug.file.endswith(allowed_suffixes)
            assert len(bug.field) >= 1
            assert len(bug.gold_fix) > 10
            assert bug.task_difficulty in allowed_difficulties

    def test_task_pools_cover_all_bugs(self):
        """The union of all task pools equals the set of catalogue keys."""
        pooled = set()
        for pool in TASK_BUG_POOLS.values():
            for bug_name in pool:
                pooled.add(bug_name)
        assert pooled == set(BUG_CATALOGUE.keys())

    def test_each_pool_has_three_bugs(self):
        """Every task pool holds exactly three bugs."""
        for task_id in TASK_BUG_POOLS:
            pool = TASK_BUG_POOLS[task_id]
            assert len(pool) == 3, f"{task_id} has {len(pool)} bugs, expected 3"
tests/test_environment.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for MLOpsEnvironment — core episode flow, state management, and step logic."""
2
+
3
+ import pytest
4
+ from mlops_environment import MLOpsEnvironment, TASK_MAX_STEPS
5
+ from models import MLOpsAction
6
+
7
+
8
class TestReset:
    """``reset()`` must hand back a clean, valid starting observation."""

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_reset_returns_valid_observation(self, task_id):
        """The first observation reflects the task and an untouched episode."""
        environment = MLOpsEnvironment(task_id=task_id)
        initial = environment.reset(seed=42)
        assert initial.task_id == task_id
        assert initial.step_count == 0
        assert initial.max_steps == TASK_MAX_STEPS[task_id]
        assert initial.done is False
        assert len(initial.available_artifacts) == 6
        assert initial.artifacts_read == []

    def test_reset_with_same_seed_is_deterministic(self):
        """Two environments reset with seed 123 share run id and bug type."""
        first_env = MLOpsEnvironment(task_id="easy")
        second_env = MLOpsEnvironment(task_id="easy")
        first_obs = first_env.reset(seed=123)
        second_obs = second_env.reset(seed=123)
        assert first_obs.run_id == second_obs.run_id
        assert first_env.bug_type == second_env.bug_type

    def test_reset_with_different_seeds_varies(self):
        """Resetting one environment with seeds 1 and 999 gives different run ids."""
        environment = MLOpsEnvironment(task_id="easy")
        earlier_run_id = environment.reset(seed=1).run_id
        later_run_id = environment.reset(seed=999).run_id
        assert later_run_id != earlier_run_id

    def test_reset_clears_previous_episode(self):
        """A second reset empties the read list and zeroes the step counter."""
        environment = MLOpsEnvironment(task_id="easy")
        environment.reset(seed=42)
        environment.step(MLOpsAction(action_type="read_config"))
        assert len(environment._artifacts_read) == 1
        environment.reset(seed=42)
        assert len(environment._artifacts_read) == 0
        assert environment._step_count == 0
45
+
46
+
47
class TestStepActions:
    """Each action type should return expected structure and reward.

    Rewards are floats, so they are compared with ``pytest.approx`` rather
    than exact equality; values returned by ``step`` that a test does not
    inspect are bound to ``_``.
    """

    @pytest.fixture
    def env(self):
        """Provide an easy-task environment freshly reset with seed 42."""
        env = MLOpsEnvironment(task_id="easy")
        env.reset(seed=42)
        return env

    def test_read_config(self, env):
        obs, reward, done, _ = env.step(MLOpsAction(action_type="read_config"))
        assert reward == pytest.approx(0.02)
        assert done is False
        assert "config.yaml" in obs.artifacts_read
        assert "content" in obs.last_action_result

    def test_read_logs(self, env):
        obs, reward, _, _ = env.step(MLOpsAction(action_type="read_logs"))
        assert reward == pytest.approx(0.02)
        assert "train.log" in obs.artifacts_read

    def test_read_logs_with_filter(self, env):
        obs, reward, _, _ = env.step(
            MLOpsAction(action_type="read_logs", log_filter="epoch:1-3")
        )
        assert reward == pytest.approx(0.02)
        content = obs.last_action_result.get("content", "")
        # Either matching lines come back or the "No log lines" notice does.
        assert "EPOCH" in content or "No log lines" in content

    def test_check_dataset_stats(self, env):
        obs, reward, _, _ = env.step(MLOpsAction(action_type="check_dataset_stats"))
        assert reward == pytest.approx(0.02)
        assert "dataset_stats.json" in obs.artifacts_read

    def test_inspect_preprocessing(self, env):
        obs, reward, _, _ = env.step(MLOpsAction(action_type="inspect_preprocessing"))
        assert reward == pytest.approx(0.02)
        assert "preprocessing.py" in obs.artifacts_read

    def test_read_eval_results(self, env):
        obs, reward, _, _ = env.step(MLOpsAction(action_type="read_eval_results"))
        assert reward == pytest.approx(0.02)
        assert "eval_results.json" in obs.artifacts_read

    def test_run_sanity_check(self, env):
        obs, reward, _, _ = env.step(
            MLOpsAction(action_type="run_sanity_check", sanity_check_type="loss_trajectory")
        )
        assert reward == pytest.approx(0.01)
        assert obs.last_action_result["status"] == "ok"
        assert "sanity_check" in obs.last_action_result

    def test_query_artifact(self, env):
        # The config is read first, then one of its fields is queried.
        env.step(MLOpsAction(action_type="read_config"))
        obs, _, _, _ = env.step(
            MLOpsAction(action_type="query_artifact", artifact_name="config.yaml", field_path="model.architecture")
        )
        assert obs.last_action_result["status"] == "ok"

    def test_duplicate_read_penalty(self, env):
        """Reading the same artifact a second time is penalised with -0.02."""
        env.step(MLOpsAction(action_type="read_config"))
        _, reward, _, _ = env.step(MLOpsAction(action_type="read_config"))
        assert reward == pytest.approx(-0.02)

    def test_step_count_increments(self, env):
        env.step(MLOpsAction(action_type="read_config"))
        env.step(MLOpsAction(action_type="read_logs"))
        assert env._step_count == 2

    def test_done_after_submit(self, env):
        _, _, done, _ = env.step(MLOpsAction(action_type="submit_diagnosis"))
        assert done is True

    def test_step_after_done_returns_done(self, env):
        """Stepping a finished episode stays done and still reports a score."""
        env.step(MLOpsAction(action_type="submit_diagnosis"))
        _, reward, done, info = env.step(MLOpsAction(action_type="read_config"))
        assert done is True
        assert reward == pytest.approx(0.01)  # clamped minimum
        assert "score" in info
126
+
127
+
128
class TestEpisodeBoundaries:
    """Episode should terminate correctly on submit, timeout, and re-step."""

    def test_timeout_at_max_steps(self):
        """Stepping up to the easy task's step budget ends the episode with a score."""
        env = MLOpsEnvironment(task_id="easy")
        env.reset(seed=42)
        # Pre-bind the values checked below so that an empty step budget fails
        # on the assertions instead of raising NameError.
        done, info = False, {}
        for _ in range(TASK_MAX_STEPS["easy"]):
            _, _, done, info = env.step(MLOpsAction(action_type="read_config"))
            if done:
                break
        assert done is True
        assert "score" in info

    def test_submit_ends_episode(self):
        """Submitting a diagnosis ends the episode and reports score and breakdown."""
        env = MLOpsEnvironment(task_id="medium")
        env.reset(seed=42)
        env.step(MLOpsAction(action_type="read_logs"))
        _, _, done, info = env.step(MLOpsAction(action_type="submit_diagnosis"))
        assert done is True
        assert "score" in info
        assert "breakdown" in info
tests/test_grading.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the grading system — score ranges, component scoring, and determinism."""
2
+
3
+ import pytest
4
+ from mlops_environment import MLOpsEnvironment, grade_task
5
+ from artifact_generator import BUG_CATALOGUE, TASK_BUG_POOLS
6
+ from models import MLOpsAction
7
+
8
+
9
class TestScoreRange:
    """Every reported score has to fall strictly inside the open interval (0, 1)."""

    @staticmethod
    def _score_for_gold_diagnosis(task_id, seed):
        """Submit the episode's own bug metadata as the diagnosis and return the score.

        ``_artifacts_read`` is filled with every artifact name before the
        submission, as the tests did inline.
        """
        env = MLOpsEnvironment(task_id=task_id)
        env.reset(seed=seed)
        env._artifacts_read = list(env._artifacts.keys())
        bug = env.bug
        gold_action = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            root_cause_file=bug.file,
            root_cause_field=bug.field,
            diagnosis="test",
            proposed_fix=bug.gold_fix,
        )
        _, _, _, info = env.step(gold_action)
        return info["score"]

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_perfect_diagnosis_below_1(self, task_id):
        score = self._score_for_gold_diagnosis(task_id, 42)
        assert 0 < score < 1, f"Perfect diagnosis score {score} is not in (0, 1)"
        assert score <= 0.99

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_empty_diagnosis_above_0(self, task_id):
        env = MLOpsEnvironment(task_id=task_id)
        env.reset(seed=42)
        # A submission with no diagnosis fields at all.
        _, _, _, info = env.step(MLOpsAction(action_type="submit_diagnosis"))
        score = info["score"]
        assert 0 < score < 1, f"Empty diagnosis score {score} is not in (0, 1)"
        assert score >= 0.01

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_wrong_diagnosis_above_0(self, task_id):
        env = MLOpsEnvironment(task_id=task_id)
        env.reset(seed=42)
        env._artifacts_read = list(env._artifacts.keys())
        wrong_action = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category="architecture_bug",
            root_cause_file="nonexistent.py",
            root_cause_field="wrong.field",
            diagnosis="completely wrong",
            proposed_fix="do nothing",
        )
        _, _, _, info = env.step(wrong_action)
        score = info["score"]
        assert 0 < score < 1, f"Wrong diagnosis score {score} is not in (0, 1)"

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    @pytest.mark.parametrize("seed", [1, 42, 100, 999, 54321])
    def test_score_range_across_seeds(self, task_id, seed):
        score = self._score_for_gold_diagnosis(task_id, seed)
        assert 0 < score < 1, f"Score {score} out of range for {task_id}/seed={seed}"
72
+
73
+
74
class TestComponentScoring:
    """Each scoring component should award correct points."""

    @pytest.fixture
    def env_with_bug(self):
        """Return ``(env, bug)`` for an easy episode (seed 42).

        ``_artifacts_read`` is pre-filled with every artifact name before the
        environment is handed to the test.
        """
        env = MLOpsEnvironment(task_id="easy")
        env.reset(seed=42)
        env._artifacts_read = list(env._artifacts.keys())
        return env, env.bug

    def test_category_only(self, env_with_bug):
        """A correct category alone is marked correct and awarded 0.15."""
        env, bug = env_with_bug
        _, _, _, info = env.step(MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
        ))
        bd = info["breakdown"]
        assert bd["failure_category"]["correct"] is True
        # The awarded points are a float; compare with a tolerance rather than
        # exact equality.
        assert bd["failure_category"]["awarded"] == pytest.approx(0.15)

    def test_category_plus_file(self, env_with_bug):
        """Correct category and file are both credited; the score reaches at least 0.35."""
        env, bug = env_with_bug
        _, _, _, info = env.step(MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            root_cause_file=bug.file,
        ))
        bd = info["breakdown"]
        assert bd["failure_category"]["correct"] is True
        assert bd["root_cause_file"]["correct"] is True
        assert info["score"] >= 0.35

    def test_file_match_case_insensitive(self, env_with_bug):
        """An upper-cased file name still counts as the correct root-cause file."""
        env, bug = env_with_bug
        _, _, _, info = env.step(MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            root_cause_file=bug.file.upper(),
        ))
        assert info["breakdown"]["root_cause_file"]["correct"] is True

    def test_partial_fix_scoring(self, env_with_bug):
        """Submitting only the first word of the gold fix earns partial credit."""
        env, bug = env_with_bug
        # Submit just one keyword from the gold fix
        first_word = bug.gold_fix.split()[0]
        _, _, _, info = env.step(MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            proposed_fix=first_word,
        ))
        fix_awarded = info["breakdown"]["proposed_fix"]["awarded"]
        assert fix_awarded > 0  # partial credit
126
+
127
+
128
class TestHardTaskPenalty:
    """On the hard task a penalty is expected for low scores (below 0.70, 1.5x per the test intent)."""

    @staticmethod
    def _hard_env():
        """Build a hard-task environment (seed 42) with ``_artifacts_read`` pre-filled."""
        env = MLOpsEnvironment(task_id="hard")
        env.reset(seed=42)
        env._artifacts_read = list(env._artifacts.keys())
        return env

    def test_penalty_applied_on_low_score(self):
        env = self._hard_env()
        # Only the category is supplied, so the raw score (~0.15) sits well
        # under the 0.70 threshold.
        category_only = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=env.bug.category,
        )
        _, _, _, info = env.step(category_only)
        penalty_flag = info["breakdown"].get("hard_task_penalty_applied")
        assert penalty_flag is True
        assert info["score"] < 0.15  # penalty reduces it

    def test_no_penalty_on_high_score(self):
        env = self._hard_env()
        bug = env.bug
        gold_action = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            root_cause_file=bug.file,
            root_cause_field=bug.field,
            diagnosis="test",
            proposed_fix=bug.gold_fix,
        )
        _, _, _, info = env.step(gold_action)
        penalty_flag = info["breakdown"].get("hard_task_penalty_applied")
        assert penalty_flag is not True
        assert info["score"] >= 0.70
158
+
159
+
160
class TestGraderDeterminism:
    """Grading identical inputs repeatedly must give identical scores."""

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_same_seed_same_score(self, task_id):
        """Three fresh seed-42 episodes graded on the same submission score the same."""

        def graded_once():
            # Build a brand-new environment each time so no state carries over.
            env = MLOpsEnvironment(task_id=task_id)
            env.reset(seed=42)
            env._artifacts_read = list(env._artifacts.keys())
            bug = env.bug
            submission = MLOpsAction(
                action_type="submit_diagnosis",
                failure_category=bug.category,
                root_cause_file=bug.file,
                root_cause_field=bug.field,
                proposed_fix=bug.gold_fix,
            )
            _, _, _, info = env.step(submission)
            return info["score"]

        scores = [graded_once() for _ in range(3)]
        assert scores[0] == scores[1] == scores[2], f"Non-deterministic: {scores}"
180
+
181
+
182
class TestGradeTaskStandalone:
    """The standalone ``grade_task()`` helper must keep scores inside (0, 1)."""

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_grade_task_score_in_range(self, task_id):
        """Grading each pooled bug's own metadata stays strictly between 0 and 1."""
        for bug_name in TASK_BUG_POOLS[task_id]:
            bug = BUG_CATALOGUE[bug_name]
            submission = {
                "failure_category": bug.category,
                "root_cause_file": bug.file,
                "root_cause_field": bug.field,
                "proposed_fix": bug.gold_fix,
            }
            score = grade_task(task_id, seed=42, diagnosis=submission)
            assert 0 < score < 1, f"grade_task score {score} out of range for {bug_name}"

    def test_grade_task_empty_diagnosis(self):
        """An empty diagnosis dict still scores strictly between 0 and 1."""
        empty_submission = {}
        score = grade_task("easy", seed=42, diagnosis=empty_submission)
        assert 0 < score < 1