File size: 2,855 Bytes
5f8bd3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
"""Tests for the deterministic grader."""

import sys
import os

# Put this file's parent directory at the front of sys.path so the `server`
# package imported below can be found regardless of where pytest is invoked.
# NOTE(review): presumably the parent directory is the project root — confirm.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from server.grader import grade_episode


class TestGraderBounds:
    """Checks that ``grade_episode`` scores stay between 0.0 and 1.0."""

    @staticmethod
    def _action(tick, name, target=None):
        """Build one successful action record.

        The ``target`` key is left out entirely when no target is given,
        and keys are inserted in the order tick, action, target, success.
        """
        record = {"tick": tick, "action": name}
        if target is not None:
            record["target"] = target
        record["success"] = True
        return record

    def test_perfect_score(self):
        """A full SLO resolved in 3 of 10 steps scores in range and above 0.8."""
        episode = {
            "final_slo_score": 1.0,
            "steps_taken": 3,
            "max_steps": 10,
            "actions_taken": [
                self._action(0, "inspect_logs", "svc"),
                self._action(1, "restart_service", "svc"),
            ],
            "terminated": True,
            "termination_reason": "resolved",
        }
        outcome = grade_episode(**episode)
        assert 0.0 <= outcome.score <= 1.0
        # Resolving quickly is expected to yield a high score.
        assert outcome.score > 0.8

    def test_zero_score(self):
        """A zero SLO with no actions that times out scores exactly 0.0."""
        episode = {
            "final_slo_score": 0.0,
            "steps_taken": 10,
            "max_steps": 10,
            "actions_taken": [],
            "terminated": True,
            "termination_reason": "timeout",
        }
        outcome = grade_episode(**episode)
        assert outcome.score == 0.0

    def test_partial_credit(self):
        """A half SLO after ten no-ops scores strictly between 0.0 and 1.0."""
        idle_steps = [self._action(tick, "noop") for tick in range(10)]
        episode = {
            "final_slo_score": 0.5,
            "steps_taken": 10,
            "max_steps": 10,
            "actions_taken": idle_steps,
            "terminated": True,
            "termination_reason": "timeout",
        }
        outcome = grade_episode(**episode)
        assert 0.0 < outcome.score < 1.0

    def test_determinism(self):
        """Grading the same episode arguments twice gives the same score."""
        # The very same dict (and action list) is passed to both calls.
        episode = {
            "final_slo_score": 0.7,
            "steps_taken": 5,
            "max_steps": 20,
            "actions_taken": [
                self._action(0, "inspect_logs", "svc"),
                self._action(1, "restart_service", "svc"),
            ],
            "terminated": True,
            "termination_reason": "timeout",
        }
        first = grade_episode(**episode)
        second = grade_episode(**episode)
        assert first.score == second.score

    def test_resolved_bonus(self):
        """Resolved episodes should score higher than timed-out ones at same SLO."""
        restarts = [
            self._action(tick, "restart_service", "svc") for tick in range(5)
        ]
        solved = grade_episode(
            final_slo_score=1.0,
            steps_taken=5,
            max_steps=10,
            actions_taken=restarts,
            terminated=True,
            termination_reason="resolved",
        )

        idle_steps = [self._action(tick, "noop") for tick in range(10)]
        timed_out = grade_episode(
            final_slo_score=1.0,
            steps_taken=10,
            max_steps=10,
            actions_taken=idle_steps,
            terminated=True,
            termination_reason="timeout",
        )

        assert solved.score > timed_out.score