Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- tests/__init__.py +0 -0
- tests/test_api.py +105 -0
- tests/test_artifacts.py +195 -0
- tests/test_environment.py +148 -0
- tests/test_grading.py +200 -0
tests/__init__.py
ADDED
|
File without changes
|
tests/test_api.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the FastAPI server — endpoint responses and error handling."""
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
from fastapi.testclient import TestClient
|
| 5 |
+
from app import app
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@pytest.fixture
def client():
    """Return a ``TestClient`` wrapping the ``app`` object imported by this module."""
    test_client = TestClient(app)
    return test_client
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class TestHealthAndInfo:
    """Checks for the informational GET routes: ``/``, ``/health`` and ``/tasks``."""

    def test_root(self, client):
        """``GET /`` returns 200 and a message that names the API."""
        response = client.get("/")
        assert response.status_code == 200
        message = response.json()["message"]
        assert "MLOps Pipeline Debugger API" in message

    def test_health(self, client):
        """``GET /health`` returns 200 with ``status`` equal to ``"ok"``."""
        response = client.get("/health")
        assert response.status_code == 200
        body = response.json()
        assert body["status"] == "ok"

    def test_tasks(self, client):
        """``GET /tasks`` lists exactly the easy, medium and hard tasks."""
        response = client.get("/tasks")
        assert response.status_code == 200
        listed = response.json()["tasks"]
        assert len(listed) == 3
        found_ids = {entry["task_id"] for entry in listed}
        assert found_ids == {"easy", "medium", "hard"}
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class TestResetEndpoint:
    """Checks for ``POST /reset`` with explicit and default payloads."""

    def test_reset_easy(self, client):
        """Resetting the easy task yields a fresh observation with six artifacts."""
        response = client.post("/reset", json={"task_id": "easy", "seed": 42})
        assert response.status_code == 200
        observation = response.json()
        assert observation["task_id"] == "easy"
        assert observation["step_count"] == 0
        assert observation["done"] is False
        assert len(observation["available_artifacts"]) == 6

    def test_reset_hard(self, client):
        """Resetting the hard task echoes ``task_id == "hard"``."""
        response = client.post("/reset", json={"task_id": "hard", "seed": 42})
        assert response.status_code == 200
        observation = response.json()
        assert observation["task_id"] == "hard"

    def test_reset_default(self, client):
        """An empty JSON body is accepted and the response reports the easy task."""
        response = client.post("/reset", json={})
        assert response.status_code == 200
        observation = response.json()
        assert observation["task_id"] == "easy"
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class TestStepEndpoint:
    """Checks for ``POST /step``: valid actions, invalid actions, nested payloads."""

    @staticmethod
    def _begin_easy_episode(client):
        """Reset the server to the easy task with seed 42 before stepping."""
        client.post("/reset", json={"task_id": "easy", "seed": 42})

    def test_step_read_config(self, client):
        """A ``read_config`` step returns reward 0.02 and does not end the episode."""
        self._begin_easy_episode(client)
        response = client.post("/step", json={"action_type": "read_config"})
        assert response.status_code == 200
        outcome = response.json()
        assert outcome["reward"] == 0.02
        assert outcome["done"] is False

    def test_step_submit_diagnosis(self, client):
        """Submitting a diagnosis ends the episode with a score strictly in (0, 1)."""
        self._begin_easy_episode(client)
        submission = {
            "action_type": "submit_diagnosis",
            "failure_category": "config_error",
            "root_cause_file": "config.yaml",
            "root_cause_field": "optimizer.learning_rate",
            "proposed_fix": "Reduce learning_rate",
        }
        response = client.post("/step", json=submission)
        assert response.status_code == 200
        outcome = response.json()
        assert outcome["done"] is True
        assert 0 < outcome["info"]["score"] < 1

    def test_step_invalid_action(self, client):
        """An unrecognised ``action_type`` is rejected with HTTP 422."""
        self._begin_easy_episode(client)
        response = client.post("/step", json={"action_type": "invalid_action"})
        assert response.status_code == 422

    def test_step_nested_action_format(self, client):
        """An action wrapped under an ``"action"`` key is accepted with HTTP 200."""
        self._begin_easy_episode(client)
        wrapped = {"action": {"action_type": "read_config"}}
        response = client.post("/step", json=wrapped)
        assert response.status_code == 200
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
class TestStateEndpoint:
    """Checks for ``GET /state`` following a reset."""

    def test_state_after_reset(self, client):
        """State reports the task and seed from the reset and exposes ``bug_type``."""
        client.post("/reset", json={"task_id": "easy", "seed": 42})
        response = client.get("/state")
        assert response.status_code == 200
        state = response.json()
        assert state["task_id"] == "easy"
        assert state["seed"] == 42
        assert "bug_type" in state
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class TestOpenEnvState:
    """Checks for ``GET /openenv/state``."""

    def test_openenv_state(self, client):
        """The response carries a ``scores`` mapping that includes an ``easy`` entry."""
        response = client.get("/openenv/state")
        assert response.status_code == 200
        state = response.json()
        assert "scores" in state
        assert "easy" in state["scores"]
|
tests/test_artifacts.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for artifact generation — consistency, determinism, and bug planting."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import pytest
|
| 5 |
+
from artifact_generator import (
|
| 6 |
+
ArtifactGenerator, BUG_CATALOGUE, TASK_BUG_POOLS,
|
| 7 |
+
run_sanity_check,
|
| 8 |
+
)
|
| 9 |
+
import random
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class TestArtifactGeneration:
    """Generated artifacts must be complete and parseable for every catalogued bug."""

    # Shared mark: run the decorated test once per bug type in the catalogue.
    _for_every_bug = pytest.mark.parametrize("bug_type", list(BUG_CATALOGUE.keys()))

    @staticmethod
    def _build(bug_type):
        """Generate the full artifact mapping for ``bug_type`` with seed 42."""
        return ArtifactGenerator(bug_type, seed=42).generate_all()

    @_for_every_bug
    def test_generates_all_six_artifacts(self, bug_type):
        """The artifact names are exactly the six expected file names."""
        wanted = {
            "config.yaml",
            "train.log",
            "dataset_stats.json",
            "preprocessing.py",
            "eval_results.json",
            "model_card.json",
        }
        produced = self._build(bug_type)
        assert set(produced.keys()) == wanted

    @_for_every_bug
    def test_json_artifacts_are_valid(self, bug_type):
        """Each ``.json`` artifact parses and decodes to a dict."""
        produced = self._build(bug_type)
        for name in ["dataset_stats.json", "eval_results.json", "model_card.json"]:
            parsed = json.loads(produced[name])
            assert isinstance(parsed, dict), f"{name} is not a dict"

    @_for_every_bug
    def test_config_yaml_has_required_sections(self, bug_type):
        """``config.yaml`` mentions every required top-level section header."""
        config_text = self._build(bug_type)["config.yaml"]
        for section in ["model:", "training:", "optimizer:", "scheduler:", "data:"]:
            assert section in config_text, f"Missing {section} in config.yaml"

    @_for_every_bug
    def test_train_log_has_epochs(self, bug_type):
        """``train.log`` contains the word "epoch" in some letter case."""
        log_text = self._build(bug_type)["train.log"]
        # A case-insensitive search also covers the upper-case "EPOCH" spelling.
        assert "epoch" in log_text.lower()

    @_for_every_bug
    def test_preprocessing_is_valid_python(self, bug_type):
        """``preprocessing.py`` compiles, i.e. it is syntactically valid Python."""
        source = self._build(bug_type)["preprocessing.py"]
        compile(source, f"<{bug_type}_preprocessing>", "exec")  # syntax check
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class TestDeterminism:
    """Same (bug_type, seed) must produce identical artifacts."""

    @pytest.mark.parametrize("bug_type", ["exploding_lr", "data_leakage_scaler", "label_encoder_mismatch"])
    def test_same_seed_same_artifacts(self, bug_type):
        """Two generators built with the same bug type and seed agree exactly.

        Both the set of artifact names and the content of every artifact are
        compared, so an artifact that appears in only one run is caught too.
        """
        gen1 = ArtifactGenerator(bug_type, seed=42)
        gen2 = ArtifactGenerator(bug_type, seed=42)
        a1 = gen1.generate_all()
        a2 = gen2.generate_all()
        # Looping over a1 alone would miss extra artifacts present only in a2.
        assert set(a1) == set(a2), "artifact names differ between runs"
        for name in a1:
            assert a1[name] == a2[name], f"{name} differs between runs"

    def test_different_seeds_differ(self):
        """Seeds 1 and 999 produce different ``config.yaml`` text for ``exploding_lr``."""
        gen1 = ArtifactGenerator("exploding_lr", seed=1)
        gen2 = ArtifactGenerator("exploding_lr", seed=999)
        a1 = gen1.generate_all()
        a2 = gen2.generate_all()
        assert a1["config.yaml"] != a2["config.yaml"]
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class TestBugPlanting:
    """Every bug type must leave its characteristic marker in the right artifact."""

    @staticmethod
    def _artifact(bug_type, name):
        """Generate artifacts for ``bug_type`` (seed 42) and return the one called ``name``."""
        return ArtifactGenerator(bug_type, seed=42).generate_all()[name]

    def test_exploding_lr_has_high_lr(self):
        """The config carries one of the absurdly high learning rates."""
        config = self._artifact("exploding_lr", "config.yaml")
        # LR should be absurdly high (10, 25, or 50)
        candidates = ["50.0", "10.0", "25.0"]
        assert any(f"learning_rate: {lr}" in config for lr in candidates)

    def test_wrong_optimizer_has_high_momentum(self):
        """The config sets momentum to 0.99."""
        config = self._artifact("wrong_optimizer", "config.yaml")
        assert "momentum: 0.99" in config

    def test_batch_size_overflow_has_large_batch(self):
        """The config carries one of the oversized batch sizes."""
        config = self._artifact("batch_size_overflow", "config.yaml")
        candidates = ["2048", "4096", "8192"]
        assert any(f"batch_size: {bs}" in config for bs in candidates)

    def test_data_leakage_scaler_fits_before_split(self):
        """The preprocessing code calls ``fit_transform`` and flags the leak."""
        code = self._artifact("data_leakage_scaler", "preprocessing.py")
        assert "fit_transform" in code
        assert "BUG" in code or "sees val/test" in code

    def test_data_leakage_overlap_has_no_random_state(self):
        """The preprocessing code contains ``random_state=None``."""
        code = self._artifact("data_leakage_overlap", "preprocessing.py")
        assert "random_state=None" in code

    def test_wrong_split_ratio_has_inverted_split(self):
        """The preprocessing code contains ``test_size=0.8``."""
        code = self._artifact("wrong_split_ratio", "preprocessing.py")
        assert "test_size=0.8" in code

    def test_label_encoder_mismatch_has_two_encoders(self):
        """The preprocessing code mentions both ``le_train`` and ``le_eval``."""
        code = self._artifact("label_encoder_mismatch", "preprocessing.py")
        assert "le_train" in code
        assert "le_eval" in code

    def test_silent_metric_swap_has_swapped_assignments(self):
        """The preprocessing code mentions both ``test_acc`` and ``val_acc``."""
        code = self._artifact("silent_metric_swap", "preprocessing.py")
        assert "test_acc" in code
        assert "val_acc" in code

    def test_tokenizer_drift_has_version_mismatch(self):
        """The preprocessing code mentions both tokenizer version constants."""
        code = self._artifact("tokenizer_version_drift", "preprocessing.py")
        assert "TOKENIZER_V1" in code
        assert "TOKENIZER_V2" in code
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
class TestSanityChecks:
    """Sanity checks should detect the planted bug."""

    @staticmethod
    def _run(check_name, bug_type):
        """Run ``check_name`` against fresh seed-42 artifacts for ``bug_type``.

        The artifacts and a ``random.Random(42)`` instance are created anew on
        each call so no state is shared between tests.
        """
        gen = ArtifactGenerator(bug_type, seed=42)
        artifacts = gen.generate_all()
        rng = random.Random(42)
        return run_sanity_check(check_name, bug_type, artifacts, rng)

    def test_gradient_norms_detects_exploding_lr(self):
        """``gradient_norms`` reports ``ANOMALY`` for ``exploding_lr``."""
        result = self._run("gradient_norms", "exploding_lr")
        assert result["result"] == "ANOMALY"

    def test_data_leakage_detects_scaler_leak(self):
        """``data_leakage`` reports ``FAIL`` for ``data_leakage_scaler``."""
        result = self._run("data_leakage", "data_leakage_scaler")
        assert result["result"] == "FAIL"

    def test_label_consistency_detects_mismatch(self):
        """``label_consistency`` reports ``FAIL`` for ``label_encoder_mismatch``."""
        result = self._run("label_consistency", "label_encoder_mismatch")
        assert result["result"] == "FAIL"

    def test_encoder_version_detects_drift(self):
        """``encoder_version_match`` reports ``MISMATCH`` for ``tokenizer_version_drift``."""
        result = self._run("encoder_version_match", "tokenizer_version_drift")
        assert result["result"] == "MISMATCH"

    # Parametrized so each hard bug is reported separately; a loop inside one
    # test would stop at the first failure and hide the others.
    @pytest.mark.parametrize("bug_type", list(TASK_BUG_POOLS["hard"]))
    def test_metric_gap_detects_hard_bugs(self, bug_type):
        """``metric_gap_analysis`` reports ``ANOMALY`` for every hard-pool bug."""
        result = self._run("metric_gap_analysis", bug_type)
        assert result["result"] == "ANOMALY", f"metric_gap missed {bug_type}"

    def test_unknown_check_returns_unknown(self):
        """An unrecognised check name yields ``UNKNOWN``."""
        result = self._run("nonexistent_check", "exploding_lr")
        assert result["result"] == "UNKNOWN"
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
class TestBugCatalogue:
    """Structural checks on ``BUG_CATALOGUE`` and ``TASK_BUG_POOLS``."""

    def test_all_bugs_have_required_fields(self):
        """Every catalogue entry has consistent, well-formed metadata fields."""
        known_categories = (
            "config_error", "data_leakage", "preprocessing_bug",
            "evaluation_bug", "label_mismatch", "architecture_bug",
        )
        known_difficulties = ("easy", "medium", "hard")
        for key, entry in BUG_CATALOGUE.items():
            assert entry.bug_type == key
            assert entry.category in known_categories
            assert entry.file.endswith((".yaml", ".py", ".json"))
            assert len(entry.field) > 0
            assert len(entry.gold_fix) > 10
            assert entry.task_difficulty in known_difficulties

    def test_task_pools_cover_all_bugs(self):
        """The union of all task pools equals the set of catalogue keys."""
        pooled = set().union(*TASK_BUG_POOLS.values())
        assert pooled == set(BUG_CATALOGUE.keys())

    def test_each_pool_has_three_bugs(self):
        """Each task pool holds exactly three bug types."""
        for task_id, pool in TASK_BUG_POOLS.items():
            assert len(pool) == 3, f"{task_id} has {len(pool)} bugs, expected 3"
|
tests/test_environment.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for MLOpsEnvironment — core episode flow, state management, and step logic."""
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
from mlops_environment import MLOpsEnvironment, TASK_MAX_STEPS
|
| 5 |
+
from models import MLOpsAction
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class TestReset:
    """``reset()`` must hand back a clean initial observation and wipe prior state."""

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_reset_returns_valid_observation(self, task_id):
        """The first observation reflects the task, zero steps and six artifacts."""
        environment = MLOpsEnvironment(task_id=task_id)
        first_obs = environment.reset(seed=42)
        assert first_obs.task_id == task_id
        assert first_obs.step_count == 0
        assert first_obs.max_steps == TASK_MAX_STEPS[task_id]
        assert first_obs.done is False
        assert len(first_obs.available_artifacts) == 6
        assert first_obs.artifacts_read == []

    def test_reset_with_same_seed_is_deterministic(self):
        """Two environments reset with seed 123 share ``run_id`` and ``bug_type``."""
        env_a = MLOpsEnvironment(task_id="easy")
        env_b = MLOpsEnvironment(task_id="easy")
        obs_a = env_a.reset(seed=123)
        obs_b = env_b.reset(seed=123)
        assert obs_a.run_id == obs_b.run_id
        assert env_a.bug_type == env_b.bug_type

    def test_reset_with_different_seeds_varies(self):
        """Resetting one environment with seeds 1 and 999 gives different run ids."""
        environment = MLOpsEnvironment(task_id="easy")
        # Capture the id straight away, before the second reset runs.
        earlier_run_id = environment.reset(seed=1).run_id
        later_obs = environment.reset(seed=999)
        assert later_obs.run_id != earlier_run_id

    def test_reset_clears_previous_episode(self):
        """A second reset empties the read-artifact list and zeroes the step count."""
        environment = MLOpsEnvironment(task_id="easy")
        environment.reset(seed=42)
        environment.step(MLOpsAction(action_type="read_config"))
        assert len(environment._artifacts_read) == 1
        environment.reset(seed=42)
        assert len(environment._artifacts_read) == 0
        assert environment._step_count == 0
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class TestStepActions:
    """Per-action checks on the observation, reward and done flag from ``step()``."""

    @pytest.fixture
    def env(self):
        """Provide an easy-task environment already reset with seed 42."""
        environment = MLOpsEnvironment(task_id="easy")
        environment.reset(seed=42)
        return environment

    def test_read_config(self, env):
        """``read_config`` pays 0.02, keeps the episode open and records the read."""
        action = MLOpsAction(action_type="read_config")
        obs, reward, done, _info = env.step(action)
        assert reward == 0.02
        assert done is False
        assert "config.yaml" in obs.artifacts_read
        assert "content" in obs.last_action_result

    def test_read_logs(self, env):
        """``read_logs`` pays 0.02 and records ``train.log`` as read."""
        action = MLOpsAction(action_type="read_logs")
        obs, reward, _done, _info = env.step(action)
        assert reward == 0.02
        assert "train.log" in obs.artifacts_read

    def test_read_logs_with_filter(self, env):
        """A filtered log read pays 0.02 and returns epoch lines or a no-match notice."""
        action = MLOpsAction(action_type="read_logs", log_filter="epoch:1-3")
        obs, reward, _done, _info = env.step(action)
        assert reward == 0.02
        text = obs.last_action_result.get("content", "")
        assert "EPOCH" in text or "No log lines" in text

    def test_check_dataset_stats(self, env):
        """``check_dataset_stats`` pays 0.02 and records ``dataset_stats.json``."""
        action = MLOpsAction(action_type="check_dataset_stats")
        obs, reward, _done, _info = env.step(action)
        assert reward == 0.02
        assert "dataset_stats.json" in obs.artifacts_read

    def test_inspect_preprocessing(self, env):
        """``inspect_preprocessing`` pays 0.02 and records ``preprocessing.py``."""
        action = MLOpsAction(action_type="inspect_preprocessing")
        obs, reward, _done, _info = env.step(action)
        assert reward == 0.02
        assert "preprocessing.py" in obs.artifacts_read

    def test_read_eval_results(self, env):
        """``read_eval_results`` pays 0.02 and records ``eval_results.json``."""
        action = MLOpsAction(action_type="read_eval_results")
        obs, reward, _done, _info = env.step(action)
        assert reward == 0.02
        assert "eval_results.json" in obs.artifacts_read

    def test_run_sanity_check(self, env):
        """A ``loss_trajectory`` sanity check pays 0.01 and reports status ``ok``."""
        action = MLOpsAction(
            action_type="run_sanity_check",
            sanity_check_type="loss_trajectory",
        )
        obs, reward, _done, _info = env.step(action)
        assert reward == 0.01
        assert obs.last_action_result["status"] == "ok"
        assert "sanity_check" in obs.last_action_result

    def test_query_artifact(self, env):
        """Querying a field of ``config.yaml`` after reading it reports status ``ok``."""
        env.step(MLOpsAction(action_type="read_config"))
        query = MLOpsAction(
            action_type="query_artifact",
            artifact_name="config.yaml",
            field_path="model.architecture",
        )
        obs, _reward, _done, _info = env.step(query)
        assert obs.last_action_result["status"] == "ok"

    def test_duplicate_read_penalty(self, env):
        """Reading the same artifact a second time yields a reward of -0.02."""
        env.step(MLOpsAction(action_type="read_config"))
        _obs, reward, _done, _info = env.step(MLOpsAction(action_type="read_config"))
        assert reward == -0.02

    def test_step_count_increments(self, env):
        """Two steps leave the internal step counter at 2."""
        for action_type in ("read_config", "read_logs"):
            env.step(MLOpsAction(action_type=action_type))
        assert env._step_count == 2

    def test_done_after_submit(self, env):
        """Submitting a diagnosis sets ``done`` to True."""
        _obs, _reward, done, _info = env.step(MLOpsAction(action_type="submit_diagnosis"))
        assert done is True

    def test_step_after_done_returns_done(self, env):
        """Stepping after submission still reports done, reward 0.01 and a score."""
        env.step(MLOpsAction(action_type="submit_diagnosis"))
        _obs, reward, done, info = env.step(MLOpsAction(action_type="read_config"))
        assert done is True
        assert reward == 0.01  # clamped minimum
        assert "score" in info
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
class TestEpisodeBoundaries:
    """Termination checks: running out of steps and explicit submission."""

    def test_timeout_at_max_steps(self):
        """Repeating ``read_config`` up to the step budget ends with done and a score."""
        environment = MLOpsEnvironment(task_id="easy")
        environment.reset(seed=42)
        budget = TASK_MAX_STEPS["easy"]
        for _ in range(budget):
            obs, reward, done, info = environment.step(MLOpsAction(action_type="read_config"))
            if done:
                break
        assert done is True
        assert "score" in info

    def test_submit_ends_episode(self):
        """On the medium task, submitting after one read ends the episode with a breakdown."""
        environment = MLOpsEnvironment(task_id="medium")
        environment.reset(seed=42)
        environment.step(MLOpsAction(action_type="read_logs"))
        submission = MLOpsAction(action_type="submit_diagnosis")
        _obs, _reward, done, info = environment.step(submission)
        assert done is True
        assert "score" in info
        assert "breakdown" in info
|
tests/test_grading.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the grading system — score ranges, component scoring, and determinism."""
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
from mlops_environment import MLOpsEnvironment, grade_task
|
| 5 |
+
from artifact_generator import BUG_CATALOGUE, TASK_BUG_POOLS
|
| 6 |
+
from models import MLOpsAction
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class TestScoreRange:
    """Final scores must always lie strictly inside the open interval (0, 1)."""

    @staticmethod
    def _submit_gold(env):
        """Mark every artifact as read, submit the gold diagnosis, return the score."""
        env._artifacts_read = list(env._artifacts.keys())
        bug = env.bug
        gold = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            root_cause_file=bug.file,
            root_cause_field=bug.field,
            diagnosis="test",
            proposed_fix=bug.gold_fix,
        )
        _obs, _reward, _done, info = env.step(gold)
        return info["score"]

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_perfect_diagnosis_below_1(self, task_id):
        """A fully correct diagnosis scores inside (0, 1) and at most 0.99."""
        env = MLOpsEnvironment(task_id=task_id)
        env.reset(seed=42)
        score = self._submit_gold(env)
        assert 0 < score < 1, f"Perfect diagnosis score {score} is not in (0, 1)"
        assert score <= 0.99

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_empty_diagnosis_above_0(self, task_id):
        """A diagnosis with no fields scores inside (0, 1) and at least 0.01."""
        env = MLOpsEnvironment(task_id=task_id)
        env.reset(seed=42)
        _obs, _reward, _done, info = env.step(MLOpsAction(action_type="submit_diagnosis"))
        score = info["score"]
        assert 0 < score < 1, f"Empty diagnosis score {score} is not in (0, 1)"
        assert score >= 0.01

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_wrong_diagnosis_above_0(self, task_id):
        """A diagnosis with wrong values in every field still scores inside (0, 1)."""
        env = MLOpsEnvironment(task_id=task_id)
        env.reset(seed=42)
        env._artifacts_read = list(env._artifacts.keys())
        wrong = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category="architecture_bug",
            root_cause_file="nonexistent.py",
            root_cause_field="wrong.field",
            diagnosis="completely wrong",
            proposed_fix="do nothing",
        )
        _obs, _reward, _done, info = env.step(wrong)
        score = info["score"]
        assert 0 < score < 1, f"Wrong diagnosis score {score} is not in (0, 1)"

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    @pytest.mark.parametrize("seed", [1, 42, 100, 999, 54321])
    def test_score_range_across_seeds(self, task_id, seed):
        """The gold diagnosis stays inside (0, 1) for several task and seed pairs."""
        env = MLOpsEnvironment(task_id=task_id)
        env.reset(seed=seed)
        score = self._submit_gold(env)
        assert 0 < score < 1, f"Score {score} out of range for {task_id}/seed={seed}"
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class TestComponentScoring:
    """Checks on individual entries of the grading breakdown."""

    @pytest.fixture
    def env_with_bug(self):
        """Return ``(env, bug)`` for the easy task, seed 42, with all artifacts marked read."""
        environment = MLOpsEnvironment(task_id="easy")
        environment.reset(seed=42)
        environment._artifacts_read = list(environment._artifacts.keys())
        return environment, environment.bug

    def test_category_only(self, env_with_bug):
        """A correct category alone is marked correct and awarded 0.15."""
        env, bug = env_with_bug
        action = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
        )
        _obs, _reward, _done, info = env.step(action)
        category_entry = info["breakdown"]["failure_category"]
        assert category_entry["correct"] is True
        assert category_entry["awarded"] == 0.15

    def test_category_plus_file(self, env_with_bug):
        """Correct category and file are both marked correct; score is at least 0.35."""
        env, bug = env_with_bug
        action = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            root_cause_file=bug.file,
        )
        _obs, _reward, _done, info = env.step(action)
        breakdown = info["breakdown"]
        assert breakdown["failure_category"]["correct"] is True
        assert breakdown["root_cause_file"]["correct"] is True
        assert info["score"] >= 0.35

    def test_file_match_case_insensitive(self, env_with_bug):
        """An upper-cased root-cause file name is still marked correct."""
        env, bug = env_with_bug
        action = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            root_cause_file=bug.file.upper(),
        )
        _obs, _reward, _done, info = env.step(action)
        assert info["breakdown"]["root_cause_file"]["correct"] is True

    def test_partial_fix_scoring(self, env_with_bug):
        """Proposing only the first word of the gold fix earns some credit."""
        env, bug = env_with_bug
        # Submit just one keyword from the gold fix
        keyword = bug.gold_fix.split()[0]
        action = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            proposed_fix=keyword,
        )
        _obs, _reward, _done, info = env.step(action)
        awarded_for_fix = info["breakdown"]["proposed_fix"]["awarded"]
        assert awarded_for_fix > 0  # partial credit
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
class TestHardTaskPenalty:
    """Checks on the ``hard_task_penalty_applied`` flag for low and high scores."""

    @staticmethod
    def _fresh_hard_env():
        """Create a hard-task environment, seed 42, with every artifact marked read."""
        environment = MLOpsEnvironment(task_id="hard")
        environment.reset(seed=42)
        environment._artifacts_read = list(environment._artifacts.keys())
        return environment

    def test_penalty_applied_on_low_score(self):
        """With only the category correct, the penalty flag is set and score is below 0.15."""
        env = self._fresh_hard_env()
        # Submit with only category correct -> score ~0.15, well below 0.70
        action = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=env.bug.category,
        )
        _obs, _reward, _done, info = env.step(action)
        assert info["breakdown"].get("hard_task_penalty_applied") is True
        assert info["score"] < 0.15  # penalty reduces it

    def test_no_penalty_on_high_score(self):
        """With the gold diagnosis, the penalty flag is not set and score is at least 0.70."""
        env = self._fresh_hard_env()
        bug = env.bug
        action = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            root_cause_file=bug.file,
            root_cause_field=bug.field,
            diagnosis="test",
            proposed_fix=bug.gold_fix,
        )
        _obs, _reward, _done, info = env.step(action)
        assert info["breakdown"].get("hard_task_penalty_applied") is not True
        assert info["score"] >= 0.70
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
class TestGraderDeterminism:
    """Repeating an identical episode must always yield the identical score."""

    @staticmethod
    def _score_once(task_id):
        """Run one seed-42 episode, submit the gold fields and return the score."""
        env = MLOpsEnvironment(task_id=task_id)
        env.reset(seed=42)
        env._artifacts_read = list(env._artifacts.keys())
        bug = env.bug
        action = MLOpsAction(
            action_type="submit_diagnosis",
            failure_category=bug.category,
            root_cause_file=bug.file,
            root_cause_field=bug.field,
            proposed_fix=bug.gold_fix,
        )
        _obs, _, _, info = env.step(action)
        return info["score"]

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_same_seed_same_score(self, task_id):
        """Three independent runs with the same task and seed give equal scores."""
        scores = [self._score_once(task_id) for _ in range(3)]
        assert scores[0] == scores[1] == scores[2], f"Non-deterministic: {scores}"
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
class TestGradeTaskStandalone:
    """Checks that ``grade_task()`` returns scores strictly inside (0, 1)."""

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_grade_task_score_in_range(self, task_id):
        """Grading each pooled bug's gold fields with seed 42 stays inside (0, 1)."""
        for bug_name in TASK_BUG_POOLS[task_id]:
            entry = BUG_CATALOGUE[bug_name]
            submitted = {
                "failure_category": entry.category,
                "root_cause_file": entry.file,
                "root_cause_field": entry.field,
                "proposed_fix": entry.gold_fix,
            }
            score = grade_task(task_id, seed=42, diagnosis=submitted)
            assert 0 < score < 1, f"grade_task score {score} out of range for {bug_name}"

    def test_grade_task_empty_diagnosis(self):
        """An empty diagnosis dict on the easy task still scores inside (0, 1)."""
        score = grade_task("easy", seed=42, diagnosis={})
        assert 0 < score < 1
|