| """Tests for the deterministic grader.""" |
|
|
| import sys |
| import os |
|
|
| sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) |
|
|
| from server.grader import grade_episode |
|
|
|
|
class TestGraderBounds:
    """Exercise ``grade_episode`` and check the scores it reports.

    The cases cover the 0.0–1.0 score range, an exact zero, partial
    credit, repeatability of the score, and the ordering between a
    resolved episode and a timed-out one.
    """

    def test_perfect_score(self):
        """A resolved episode with final SLO 1.0 lands in (0.8, 1.0]."""
        inspect_then_restart = [
            {"tick": 0, "action": "inspect_logs", "target": "svc", "success": True},
            {"tick": 1, "action": "restart_service", "target": "svc", "success": True},
        ]
        outcome = grade_episode(
            final_slo_score=1.0,
            steps_taken=3,
            max_steps=10,
            actions_taken=inspect_then_restart,
            terminated=True,
            termination_reason="resolved",
        )
        assert 0.0 <= outcome.score <= 1.0
        assert outcome.score > 0.8

    def test_zero_score(self):
        """Final SLO 0.0, every step used, no actions: score is exactly 0.0."""
        episode = {
            "final_slo_score": 0.0,
            "steps_taken": 10,
            "max_steps": 10,
            "actions_taken": [],
            "terminated": True,
            "termination_reason": "timeout",
        }
        outcome = grade_episode(**episode)
        assert outcome.score == 0.0

    def test_partial_credit(self):
        """Final SLO 0.5 with ten noop actions scores strictly between 0 and 1."""
        noop_actions = []
        for tick in range(10):
            noop_actions.append({"tick": tick, "action": "noop", "success": True})

        outcome = grade_episode(
            final_slo_score=0.5,
            steps_taken=10,
            max_steps=10,
            actions_taken=noop_actions,
            terminated=True,
            termination_reason="timeout",
        )
        assert 0.0 < outcome.score < 1.0

    def test_determinism(self):
        """Grading the same keyword arguments twice gives the same score."""
        episode = {
            "final_slo_score": 0.7,
            "steps_taken": 5,
            "max_steps": 20,
            "actions_taken": [
                {"tick": 0, "action": "inspect_logs", "target": "svc", "success": True},
                {"tick": 1, "action": "restart_service", "target": "svc", "success": True},
            ],
            "terminated": True,
            "termination_reason": "timeout",
        }
        # The very same dict (and action list) is passed to both calls.
        first = grade_episode(**episode)
        second = grade_episode(**episode)
        assert first.score == second.score

    def test_resolved_bonus(self):
        """Resolved episodes should score higher than timed-out ones at same SLO."""
        restart_actions = [
            {"tick": tick, "action": "restart_service", "target": "svc", "success": True}
            for tick in range(5)
        ]
        noop_actions = [
            {"tick": tick, "action": "noop", "success": True}
            for tick in range(10)
        ]

        # Resolved run: five restarts, half of the step budget used.
        resolved = grade_episode(
            final_slo_score=1.0,
            steps_taken=5,
            max_steps=10,
            actions_taken=restart_actions,
            terminated=True,
            termination_reason="resolved",
        )
        # Timed-out run: ten noops, the whole step budget used.
        timeout = grade_episode(
            final_slo_score=1.0,
            steps_taken=10,
            max_steps=10,
            actions_taken=noop_actions,
            terminated=True,
            termination_reason="timeout",
        )
        assert resolved.score > timeout.score
|
|