Spaces:

Rockerleo
/

mlops-openenv

Sleeping

App Files Files Community

Rockerleo committed on Apr 11

Commit

78ea1a9

verified ·

1 Parent(s): dc936ba

Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

tests/__init__.py +0 -0
tests/test_api.py +105 -0
tests/test_artifacts.py +195 -0
tests/test_environment.py +148 -0
tests/test_grading.py +200 -0

tests/__init__.py ADDED Viewed

File without changes

tests/test_api.py ADDED Viewed

	@@ -0,0 +1,105 @@

+"""Tests for the FastAPI server — endpoint responses and error handling."""
+import pytest
+from fastapi.testclient import TestClient
+from app import app
+@pytest.fixture
+def client():
+    return TestClient(app)
+class TestHealthAndInfo:
+    def test_root(self, client):
+        r = client.get("/")
+        assert r.status_code == 200
+        assert "MLOps Pipeline Debugger API" in r.json()["message"]
+    def test_health(self, client):
+        r = client.get("/health")
+        assert r.status_code == 200
+        assert r.json()["status"] == "ok"
+    def test_tasks(self, client):
+        r = client.get("/tasks")
+        assert r.status_code == 200
+        tasks = r.json()["tasks"]
+        assert len(tasks) == 3
+        task_ids = {t["task_id"] for t in tasks}
+        assert task_ids == {"easy", "medium", "hard"}
+class TestResetEndpoint:
+    def test_reset_easy(self, client):
+        r = client.post("/reset", json={"task_id": "easy", "seed": 42})
+        assert r.status_code == 200
+        data = r.json()
+        assert data["task_id"] == "easy"
+        assert data["step_count"] == 0
+        assert data["done"] is False
+        assert len(data["available_artifacts"]) == 6
+    def test_reset_hard(self, client):
+        r = client.post("/reset", json={"task_id": "hard", "seed": 42})
+        assert r.status_code == 200
+        assert r.json()["task_id"] == "hard"
+    def test_reset_default(self, client):
+        r = client.post("/reset", json={})
+        assert r.status_code == 200
+        assert r.json()["task_id"] == "easy"
+class TestStepEndpoint:
+    def test_step_read_config(self, client):
+        client.post("/reset", json={"task_id": "easy", "seed": 42})
+        r = client.post("/step", json={"action_type": "read_config"})
+        assert r.status_code == 200
+        data = r.json()
+        assert data["reward"] == 0.02
+        assert data["done"] is False
+    def test_step_submit_diagnosis(self, client):
+        client.post("/reset", json={"task_id": "easy", "seed": 42})
+        r = client.post("/step", json={
+            "action_type": "submit_diagnosis",
+            "failure_category": "config_error",
+            "root_cause_file": "config.yaml",
+            "root_cause_field": "optimizer.learning_rate",
+            "proposed_fix": "Reduce learning_rate",
+        })
+        assert r.status_code == 200
+        data = r.json()
+        assert data["done"] is True
+        assert 0 < data["info"]["score"] < 1
+    def test_step_invalid_action(self, client):
+        client.post("/reset", json={"task_id": "easy", "seed": 42})
+        r = client.post("/step", json={"action_type": "invalid_action"})
+        assert r.status_code == 422
+    def test_step_nested_action_format(self, client):
+        client.post("/reset", json={"task_id": "easy", "seed": 42})
+        r = client.post("/step", json={"action": {"action_type": "read_config"}})
+        assert r.status_code == 200
+class TestStateEndpoint:
+    def test_state_after_reset(self, client):
+        client.post("/reset", json={"task_id": "easy", "seed": 42})
+        r = client.get("/state")
+        assert r.status_code == 200
+        data = r.json()
+        assert data["task_id"] == "easy"
+        assert data["seed"] == 42
+        assert "bug_type" in data
+class TestOpenEnvState:
+    def test_openenv_state(self, client):
+        r = client.get("/openenv/state")
+        assert r.status_code == 200
+        data = r.json()
+        assert "scores" in data
+        assert "easy" in data["scores"]

tests/test_artifacts.py ADDED Viewed

	@@ -0,0 +1,195 @@

+"""Tests for artifact generation — consistency, determinism, and bug planting."""
+import json
+import pytest
+from artifact_generator import (
+    ArtifactGenerator, BUG_CATALOGUE, TASK_BUG_POOLS,
+    run_sanity_check,
+)
+import random
+class TestArtifactGeneration:
+    """Artifacts should be complete, parseable, and internally consistent."""
+    @pytest.mark.parametrize("bug_type", list(BUG_CATALOGUE.keys()))
+    def test_generates_all_six_artifacts(self, bug_type):
+        gen = ArtifactGenerator(bug_type, seed=42)
+        artifacts = gen.generate_all()
+        expected = {"config.yaml", "train.log", "dataset_stats.json",
+                    "preprocessing.py", "eval_results.json", "model_card.json"}
+        assert set(artifacts.keys()) == expected
+    @pytest.mark.parametrize("bug_type", list(BUG_CATALOGUE.keys()))
+    def test_json_artifacts_are_valid(self, bug_type):
+        gen = ArtifactGenerator(bug_type, seed=42)
+        artifacts = gen.generate_all()
+        for name in ["dataset_stats.json", "eval_results.json", "model_card.json"]:
+            data = json.loads(artifacts[name])
+            assert isinstance(data, dict), f"{name} is not a dict"
+    @pytest.mark.parametrize("bug_type", list(BUG_CATALOGUE.keys()))
+    def test_config_yaml_has_required_sections(self, bug_type):
+        gen = ArtifactGenerator(bug_type, seed=42)
+        artifacts = gen.generate_all()
+        config = artifacts["config.yaml"]
+        for section in ["model:", "training:", "optimizer:", "scheduler:", "data:"]:
+            assert section in config, f"Missing {section} in config.yaml"
+    @pytest.mark.parametrize("bug_type", list(BUG_CATALOGUE.keys()))
+    def test_train_log_has_epochs(self, bug_type):
+        gen = ArtifactGenerator(bug_type, seed=42)
+        artifacts = gen.generate_all()
+        log = artifacts["train.log"]
+        assert "EPOCH" in log or "epoch" in log.lower()
+    @pytest.mark.parametrize("bug_type", list(BUG_CATALOGUE.keys()))
+    def test_preprocessing_is_valid_python(self, bug_type):
+        gen = ArtifactGenerator(bug_type, seed=42)
+        artifacts = gen.generate_all()
+        code = artifacts["preprocessing.py"]
+        compile(code, f"<{bug_type}_preprocessing>", "exec")  # syntax check
+class TestDeterminism:
+    """Same (bug_type, seed) must produce identical artifacts."""
+    @pytest.mark.parametrize("bug_type", ["exploding_lr", "data_leakage_scaler", "label_encoder_mismatch"])
+    def test_same_seed_same_artifacts(self, bug_type):
+        gen1 = ArtifactGenerator(bug_type, seed=42)
+        gen2 = ArtifactGenerator(bug_type, seed=42)
+        a1 = gen1.generate_all()
+        a2 = gen2.generate_all()
+        for name in a1:
+            assert a1[name] == a2[name], f"{name} differs between runs"
+    def test_different_seeds_differ(self):
+        gen1 = ArtifactGenerator("exploding_lr", seed=1)
+        gen2 = ArtifactGenerator("exploding_lr", seed=999)
+        a1 = gen1.generate_all()
+        a2 = gen2.generate_all()
+        assert a1["config.yaml"] != a2["config.yaml"]
+class TestBugPlanting:
+    """Each bug type should plant its specific fault in the artifacts."""
+    def test_exploding_lr_has_high_lr(self):
+        gen = ArtifactGenerator("exploding_lr", seed=42)
+        config = gen.generate_all()["config.yaml"]
+        # LR should be absurdly high (10, 25, or 50)
+        assert any(f"learning_rate: {lr}" in config for lr in ["50.0", "10.0", "25.0"])
+    def test_wrong_optimizer_has_high_momentum(self):
+        gen = ArtifactGenerator("wrong_optimizer", seed=42)
+        config = gen.generate_all()["config.yaml"]
+        assert "momentum: 0.99" in config
+    def test_batch_size_overflow_has_large_batch(self):
+        gen = ArtifactGenerator("batch_size_overflow", seed=42)
+        config = gen.generate_all()["config.yaml"]
+        assert any(f"batch_size: {bs}" in config for bs in ["2048", "4096", "8192"])
+    def test_data_leakage_scaler_fits_before_split(self):
+        gen = ArtifactGenerator("data_leakage_scaler", seed=42)
+        code = gen.generate_all()["preprocessing.py"]
+        assert "fit_transform" in code
+        assert "BUG" in code or "sees val/test" in code
+    def test_data_leakage_overlap_has_no_random_state(self):
+        gen = ArtifactGenerator("data_leakage_overlap", seed=42)
+        code = gen.generate_all()["preprocessing.py"]
+        assert "random_state=None" in code
+    def test_wrong_split_ratio_has_inverted_split(self):
+        gen = ArtifactGenerator("wrong_split_ratio", seed=42)
+        code = gen.generate_all()["preprocessing.py"]
+        assert "test_size=0.8" in code
+    def test_label_encoder_mismatch_has_two_encoders(self):
+        gen = ArtifactGenerator("label_encoder_mismatch", seed=42)
+        code = gen.generate_all()["preprocessing.py"]
+        assert "le_train" in code and "le_eval" in code
+    def test_silent_metric_swap_has_swapped_assignments(self):
+        gen = ArtifactGenerator("silent_metric_swap", seed=42)
+        code = gen.generate_all()["preprocessing.py"]
+        assert "test_acc" in code and "val_acc" in code
+    def test_tokenizer_drift_has_version_mismatch(self):
+        gen = ArtifactGenerator("tokenizer_version_drift", seed=42)
+        code = gen.generate_all()["preprocessing.py"]
+        assert "TOKENIZER_V1" in code and "TOKENIZER_V2" in code
+class TestSanityChecks:
+    """Sanity checks should detect the planted bug."""
+    def test_gradient_norms_detects_exploding_lr(self):
+        gen = ArtifactGenerator("exploding_lr", seed=42)
+        artifacts = gen.generate_all()
+        rng = random.Random(42)
+        result = run_sanity_check("gradient_norms", "exploding_lr", artifacts, rng)
+        assert result["result"] == "ANOMALY"
+    def test_data_leakage_detects_scaler_leak(self):
+        gen = ArtifactGenerator("data_leakage_scaler", seed=42)
+        artifacts = gen.generate_all()
+        rng = random.Random(42)
+        result = run_sanity_check("data_leakage", "data_leakage_scaler", artifacts, rng)
+        assert result["result"] == "FAIL"
+    def test_label_consistency_detects_mismatch(self):
+        gen = ArtifactGenerator("label_encoder_mismatch", seed=42)
+        artifacts = gen.generate_all()
+        rng = random.Random(42)
+        result = run_sanity_check("label_consistency", "label_encoder_mismatch", artifacts, rng)
+        assert result["result"] == "FAIL"
+    def test_encoder_version_detects_drift(self):
+        gen = ArtifactGenerator("tokenizer_version_drift", seed=42)
+        artifacts = gen.generate_all()
+        rng = random.Random(42)
+        result = run_sanity_check("encoder_version_match", "tokenizer_version_drift", artifacts, rng)
+        assert result["result"] == "MISMATCH"
+    def test_metric_gap_detects_hard_bugs(self):
+        for bug_type in TASK_BUG_POOLS["hard"]:
+            gen = ArtifactGenerator(bug_type, seed=42)
+            artifacts = gen.generate_all()
+            rng = random.Random(42)
+            result = run_sanity_check("metric_gap_analysis", bug_type, artifacts, rng)
+            assert result["result"] == "ANOMALY", f"metric_gap missed {bug_type}"
+    def test_unknown_check_returns_unknown(self):
+        gen = ArtifactGenerator("exploding_lr", seed=42)
+        artifacts = gen.generate_all()
+        rng = random.Random(42)
+        result = run_sanity_check("nonexistent_check", "exploding_lr", artifacts, rng)
+        assert result["result"] == "UNKNOWN"
+class TestBugCatalogue:
+    """Bug catalogue should be complete and consistent."""
+    def test_all_bugs_have_required_fields(self):
+        for name, bug in BUG_CATALOGUE.items():
+            assert bug.bug_type == name
+            assert bug.category in [
+                "config_error", "data_leakage", "preprocessing_bug",
+                "evaluation_bug", "label_mismatch", "architecture_bug",
+            ]
+            assert bug.file.endswith((".yaml", ".py", ".json"))
+            assert len(bug.field) > 0
+            assert len(bug.gold_fix) > 10
+            assert bug.task_difficulty in ["easy", "medium", "hard"]
+    def test_task_pools_cover_all_bugs(self):
+        all_pooled = set()
+        for pool in TASK_BUG_POOLS.values():
+            all_pooled.update(pool)
+        assert all_pooled == set(BUG_CATALOGUE.keys())
+    def test_each_pool_has_three_bugs(self):
+        for task_id, pool in TASK_BUG_POOLS.items():
+            assert len(pool) == 3, f"{task_id} has {len(pool)} bugs, expected 3"

tests/test_environment.py ADDED Viewed

	@@ -0,0 +1,148 @@

+"""Tests for MLOpsEnvironment — core episode flow, state management, and step logic."""
+import pytest
+from mlops_environment import MLOpsEnvironment, TASK_MAX_STEPS
+from models import MLOpsAction
+class TestReset:
+    """reset() should produce a clean, valid initial state."""
+    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
+    def test_reset_returns_valid_observation(self, task_id):
+        env = MLOpsEnvironment(task_id=task_id)
+        obs = env.reset(seed=42)
+        assert obs.task_id == task_id
+        assert obs.step_count == 0
+        assert obs.max_steps == TASK_MAX_STEPS[task_id]
+        assert obs.done is False
+        assert len(obs.available_artifacts) == 6
+        assert obs.artifacts_read == []
+    def test_reset_with_same_seed_is_deterministic(self):
+        env1 = MLOpsEnvironment(task_id="easy")
+        env2 = MLOpsEnvironment(task_id="easy")
+        obs1 = env1.reset(seed=123)
+        obs2 = env2.reset(seed=123)
+        assert obs1.run_id == obs2.run_id
+        assert env1.bug_type == env2.bug_type
+    def test_reset_with_different_seeds_varies(self):
+        env = MLOpsEnvironment(task_id="easy")
+        obs1 = env.reset(seed=1)
+        run_id_1 = obs1.run_id
+        obs2 = env.reset(seed=999)
+        assert obs2.run_id != run_id_1
+    def test_reset_clears_previous_episode(self):
+        env = MLOpsEnvironment(task_id="easy")
+        env.reset(seed=42)
+        env.step(MLOpsAction(action_type="read_config"))
+        assert len(env._artifacts_read) == 1
+        env.reset(seed=42)
+        assert len(env._artifacts_read) == 0
+        assert env._step_count == 0
+class TestStepActions:
+    """Each action type should return expected structure and reward."""
+    @pytest.fixture
+    def env(self):
+        env = MLOpsEnvironment(task_id="easy")
+        env.reset(seed=42)
+        return env
+    def test_read_config(self, env):
+        obs, reward, done, info = env.step(MLOpsAction(action_type="read_config"))
+        assert reward == 0.02
+        assert done is False
+        assert "config.yaml" in obs.artifacts_read
+        assert "content" in obs.last_action_result
+    def test_read_logs(self, env):
+        obs, reward, done, info = env.step(MLOpsAction(action_type="read_logs"))
+        assert reward == 0.02
+        assert "train.log" in obs.artifacts_read
+    def test_read_logs_with_filter(self, env):
+        obs, reward, done, info = env.step(
+            MLOpsAction(action_type="read_logs", log_filter="epoch:1-3")
+        )
+        assert reward == 0.02
+        content = obs.last_action_result.get("content", "")
+        assert "EPOCH" in content or "No log lines" in content
+    def test_check_dataset_stats(self, env):
+        obs, reward, done, info = env.step(MLOpsAction(action_type="check_dataset_stats"))
+        assert reward == 0.02
+        assert "dataset_stats.json" in obs.artifacts_read
+    def test_inspect_preprocessing(self, env):
+        obs, reward, done, info = env.step(MLOpsAction(action_type="inspect_preprocessing"))
+        assert reward == 0.02
+        assert "preprocessing.py" in obs.artifacts_read
+    def test_read_eval_results(self, env):
+        obs, reward, done, info = env.step(MLOpsAction(action_type="read_eval_results"))
+        assert reward == 0.02
+        assert "eval_results.json" in obs.artifacts_read
+    def test_run_sanity_check(self, env):
+        obs, reward, done, info = env.step(
+            MLOpsAction(action_type="run_sanity_check", sanity_check_type="loss_trajectory")
+        )
+        assert reward == 0.01
+        assert obs.last_action_result["status"] == "ok"
+        assert "sanity_check" in obs.last_action_result
+    def test_query_artifact(self, env):
+        env.step(MLOpsAction(action_type="read_config"))
+        obs, reward, done, info = env.step(
+            MLOpsAction(action_type="query_artifact", artifact_name="config.yaml", field_path="model.architecture")
+        )
+        assert obs.last_action_result["status"] == "ok"
+    def test_duplicate_read_penalty(self, env):
+        env.step(MLOpsAction(action_type="read_config"))
+        obs, reward, done, info = env.step(MLOpsAction(action_type="read_config"))
+        assert reward == -0.02
+    def test_step_count_increments(self, env):
+        env.step(MLOpsAction(action_type="read_config"))
+        env.step(MLOpsAction(action_type="read_logs"))
+        assert env._step_count == 2
+    def test_done_after_submit(self, env):
+        obs, reward, done, info = env.step(MLOpsAction(action_type="submit_diagnosis"))
+        assert done is True
+    def test_step_after_done_returns_done(self, env):
+        env.step(MLOpsAction(action_type="submit_diagnosis"))
+        obs, reward, done, info = env.step(MLOpsAction(action_type="read_config"))
+        assert done is True
+        assert reward == 0.01  # clamped minimum
+        assert "score" in info
+class TestEpisodeBoundaries:
+    """Episode should terminate correctly on submit, timeout, and re-step."""
+    def test_timeout_at_max_steps(self):
+        env = MLOpsEnvironment(task_id="easy")
+        env.reset(seed=42)
+        for _ in range(TASK_MAX_STEPS["easy"]):
+            obs, reward, done, info = env.step(MLOpsAction(action_type="read_config"))
+            if done:
+                break
+        assert done is True
+        assert "score" in info
+    def test_submit_ends_episode(self):
+        env = MLOpsEnvironment(task_id="medium")
+        env.reset(seed=42)
+        env.step(MLOpsAction(action_type="read_logs"))
+        obs, reward, done, info = env.step(MLOpsAction(action_type="submit_diagnosis"))
+        assert done is True
+        assert "score" in info
+        assert "breakdown" in info

tests/test_grading.py ADDED Viewed

	@@ -0,0 +1,200 @@

+"""Tests for the grading system — score ranges, component scoring, and determinism."""
+import pytest
+from mlops_environment import MLOpsEnvironment, grade_task
+from artifact_generator import BUG_CATALOGUE, TASK_BUG_POOLS
+from models import MLOpsAction
+class TestScoreRange:
+    """All scores must be strictly between 0 and 1."""
+    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
+    def test_perfect_diagnosis_below_1(self, task_id):
+        env = MLOpsEnvironment(task_id=task_id)
+        env.reset(seed=42)
+        env._artifacts_read = list(env._artifacts.keys())
+        bug = env.bug
+        obs, reward, done, info = env.step(MLOpsAction(
+            action_type="submit_diagnosis",
+            failure_category=bug.category,
+            root_cause_file=bug.file,
+            root_cause_field=bug.field,
+            diagnosis="test",
+            proposed_fix=bug.gold_fix,
+        ))
+        score = info["score"]
+        assert 0 < score < 1, f"Perfect diagnosis score {score} is not in (0, 1)"
+        assert score <= 0.99
+    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
+    def test_empty_diagnosis_above_0(self, task_id):
+        env = MLOpsEnvironment(task_id=task_id)
+        env.reset(seed=42)
+        obs, reward, done, info = env.step(MLOpsAction(action_type="submit_diagnosis"))
+        score = info["score"]
+        assert 0 < score < 1, f"Empty diagnosis score {score} is not in (0, 1)"
+        assert score >= 0.01
+    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
+    def test_wrong_diagnosis_above_0(self, task_id):
+        env = MLOpsEnvironment(task_id=task_id)
+        env.reset(seed=42)
+        env._artifacts_read = list(env._artifacts.keys())
+        obs, reward, done, info = env.step(MLOpsAction(
+            action_type="submit_diagnosis",
+            failure_category="architecture_bug",
+            root_cause_file="nonexistent.py",
+            root_cause_field="wrong.field",
+            diagnosis="completely wrong",
+            proposed_fix="do nothing",
+        ))
+        score = info["score"]
+        assert 0 < score < 1, f"Wrong diagnosis score {score} is not in (0, 1)"
+    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
+    @pytest.mark.parametrize("seed", [1, 42, 100, 999, 54321])
+    def test_score_range_across_seeds(self, task_id, seed):
+        env = MLOpsEnvironment(task_id=task_id)
+        env.reset(seed=seed)
+        env._artifacts_read = list(env._artifacts.keys())
+        bug = env.bug
+        obs, reward, done, info = env.step(MLOpsAction(
+            action_type="submit_diagnosis",
+            failure_category=bug.category,
+            root_cause_file=bug.file,
+            root_cause_field=bug.field,
+            diagnosis="test",
+            proposed_fix=bug.gold_fix,
+        ))
+        score = info["score"]
+        assert 0 < score < 1, f"Score {score} out of range for {task_id}/seed={seed}"
+class TestComponentScoring:
+    """Each scoring component should award correct points."""
+    @pytest.fixture
+    def env_with_bug(self):
+        env = MLOpsEnvironment(task_id="easy")
+        env.reset(seed=42)
+        env._artifacts_read = list(env._artifacts.keys())
+        return env, env.bug
+    def test_category_only(self, env_with_bug):
+        env, bug = env_with_bug
+        obs, reward, done, info = env.step(MLOpsAction(
+            action_type="submit_diagnosis",
+            failure_category=bug.category,
+        ))
+        bd = info["breakdown"]
+        assert bd["failure_category"]["correct"] is True
+        assert bd["failure_category"]["awarded"] == 0.15
+    def test_category_plus_file(self, env_with_bug):
+        env, bug = env_with_bug
+        obs, reward, done, info = env.step(MLOpsAction(
+            action_type="submit_diagnosis",
+            failure_category=bug.category,
+            root_cause_file=bug.file,
+        ))
+        bd = info["breakdown"]
+        assert bd["failure_category"]["correct"] is True
+        assert bd["root_cause_file"]["correct"] is True
+        assert info["score"] >= 0.35
+    def test_file_match_case_insensitive(self, env_with_bug):
+        env, bug = env_with_bug
+        obs, reward, done, info = env.step(MLOpsAction(
+            action_type="submit_diagnosis",
+            failure_category=bug.category,
+            root_cause_file=bug.file.upper(),
+        ))
+        assert info["breakdown"]["root_cause_file"]["correct"] is True
+    def test_partial_fix_scoring(self, env_with_bug):
+        env, bug = env_with_bug
+        # Submit just one keyword from the gold fix
+        first_word = bug.gold_fix.split()[0]
+        obs, reward, done, info = env.step(MLOpsAction(
+            action_type="submit_diagnosis",
+            failure_category=bug.category,
+            proposed_fix=first_word,
+        ))
+        fix_awarded = info["breakdown"]["proposed_fix"]["awarded"]
+        assert fix_awarded > 0  # partial credit
+class TestHardTaskPenalty:
+    """Hard task should apply 1.5x penalty when score < 0.70."""
+    def test_penalty_applied_on_low_score(self):
+        env = MLOpsEnvironment(task_id="hard")
+        env.reset(seed=42)
+        env._artifacts_read = list(env._artifacts.keys())
+        # Submit with only category correct → score ~0.15, well below 0.70
+        obs, reward, done, info = env.step(MLOpsAction(
+            action_type="submit_diagnosis",
+            failure_category=env.bug.category,
+        ))
+        assert info["breakdown"].get("hard_task_penalty_applied") is True
+        assert info["score"] < 0.15  # penalty reduces it
+    def test_no_penalty_on_high_score(self):
+        env = MLOpsEnvironment(task_id="hard")
+        env.reset(seed=42)
+        env._artifacts_read = list(env._artifacts.keys())
+        bug = env.bug
+        obs, reward, done, info = env.step(MLOpsAction(
+            action_type="submit_diagnosis",
+            failure_category=bug.category,
+            root_cause_file=bug.file,
+            root_cause_field=bug.field,
+            diagnosis="test",
+            proposed_fix=bug.gold_fix,
+        ))
+        assert info["breakdown"].get("hard_task_penalty_applied") is not True
+        assert info["score"] >= 0.70
+class TestGraderDeterminism:
+    """Same inputs must always produce identical scores."""
+    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
+    def test_same_seed_same_score(self, task_id):
+        scores = []
+        for _ in range(3):
+            env = MLOpsEnvironment(task_id=task_id)
+            env.reset(seed=42)
+            env._artifacts_read = list(env._artifacts.keys())
+            bug = env.bug
+            obs, _, _, info = env.step(MLOpsAction(
+                action_type="submit_diagnosis",
+                failure_category=bug.category,
+                root_cause_file=bug.file,
+                root_cause_field=bug.field,
+                proposed_fix=bug.gold_fix,
+            ))
+            scores.append(info["score"])
+        assert scores[0] == scores[1] == scores[2], f"Non-deterministic: {scores}"
+class TestGradeTaskStandalone:
+    """grade_task() must match environment grading and respect score range."""
+    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
+    def test_grade_task_score_in_range(self, task_id):
+        pool = TASK_BUG_POOLS[task_id]
+        for bug_name in pool:
+            bug = BUG_CATALOGUE[bug_name]
+            score = grade_task(task_id, seed=42, diagnosis={
+                "failure_category": bug.category,
+                "root_cause_file": bug.file,
+                "root_cause_field": bug.field,
+                "proposed_fix": bug.gold_fix,
+            })
+            assert 0 < score < 1, f"grade_task score {score} out of range for {bug_name}"
+    def test_grade_task_empty_diagnosis(self):
+        score = grade_task("easy", seed=42, diagnosis={})
+        assert 0 < score < 1