File size: 3,019 Bytes
250ab26
 
 
18aa055
35ea9cd
 
 
 
 
 
 
 
 
 
 
 
 
 
18aa055
 
 
 
250ab26
35ea9cd
250ab26
18aa055
250ab26
35ea9cd
 
18aa055
 
35ea9cd
 
 
 
250ab26
 
35ea9cd
250ab26
18aa055
35ea9cd
 
 
250ab26
35ea9cd
18aa055
35ea9cd
 
 
 
18aa055
250ab26
35ea9cd
 
250ab26
18aa055
35ea9cd
 
 
 
 
18aa055
35ea9cd
 
 
 
 
 
18aa055
250ab26
 
 
 
 
 
18aa055
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from models import IncidentAction

# Severity labels mapped to their position on the scale; grade_task1 uses the
# difference between two positions to measure how far apart the labels are.
_SEV_ORDER = {label: rank for rank, label in enumerate(("SEV1", "SEV2", "SEV3"))}

# Unordered pairs of root-cause domains that grade_task2 treats as near misses
# of one another (the order of predicted vs. expected does not matter).
_TASK2_RELATED_GROUPS = [
    set(pair)
    for pair in (
        ("DATABASE", "APPLICATION"),
        ("NETWORK", "INFRASTRUCTURE"),
        ("NETWORK", "THIRD_PARTY"),
        ("INFRASTRUCTURE", "THIRD_PARTY"),
    )
]

# Partial credit for grade_task3, keyed by (predicted action, expected action).
# Lookups are directional: a pair earns credit only in the order listed here.
_TASK3_PARTIAL = dict.fromkeys(
    (
        ("RESTART_SERVICE", "FAILOVER"),
        ("FAILOVER", "RESTART_SERVICE"),
        ("NOTIFY_VENDOR", "INVESTIGATE"),
        ("SCALE_UP", "INVESTIGATE"),
        ("RESTART_SERVICE", "INVESTIGATE"),
    ),
    0.25,
)

# The validator rejects 0.0 and 1.0, so every score has to fall strictly
# inside the open interval (0, 1); these stand in for "full" and "no" credit.
_EXACT = 0.99
_ZERO = 0.01


def grade_task1(action: IncidentAction, ground_truth: dict) -> tuple[float, str]:
    """Score a severity classification against the ground truth.

    Args:
        action: Submitted action. ``action.severity`` is either ``None`` or an
            object whose ``.value`` is the predicted severity label.
        ground_truth: Mapping whose ``"severity"`` entry is the expected label.

    Returns:
        A ``(score, feedback)`` pair: ``_EXACT`` when the labels are equal,
        ``0.5`` when both labels are ranked in ``_SEV_ORDER`` and sit one
        position apart, and ``_ZERO`` otherwise (including a missing
        prediction).

    Raises:
        KeyError: If a prediction is present but ``ground_truth`` has no
            ``"severity"`` entry.
    """
    if action.severity is None:
        return _ZERO, "Missing severity classification."

    predicted_label = action.severity.value
    expected_label = ground_truth["severity"]

    # Compare the labels themselves for exactness, so two *different* labels
    # that are both absent from _SEV_ORDER can never be mistaken for a match.
    if predicted_label == expected_label:
        return _EXACT, "Exact severity match."

    predicted_rank = _SEV_ORDER.get(predicted_label)
    expected_rank = _SEV_ORDER.get(expected_label)

    # A label missing from _SEV_ORDER has no position on the scale. Feeding a
    # numeric placeholder into the distance would make it look adjacent to the
    # first band and earn partial credit, so only ranked labels are compared.
    both_ranked = predicted_rank is not None and expected_rank is not None
    if both_ranked and abs(predicted_rank - expected_rank) == 1:
        return 0.5, "Adjacent severity band: partial credit for a close escalation call."
    return _ZERO, "Severity choice is too far from the ground truth."


def grade_task2(action: IncidentAction, ground_truth: dict) -> tuple[float, str]:
    """Score a root-cause classification against the ground truth.

    Args:
        action: Submitted action. ``action.root_cause`` is either ``None`` or
            an object whose ``.value`` is the predicted root-cause label.
        ground_truth: Mapping whose ``"root_cause"`` entry is the expected label.

    Returns:
        A ``(score, feedback)`` pair: ``_EXACT`` for an exact match, ``0.25``
        for a non-matching ``"UNKNOWN"`` prediction, ``0.5`` when predicted and
        expected form one of the pairs in ``_TASK2_RELATED_GROUPS``, and
        ``_ZERO`` otherwise (including a missing prediction).

    Raises:
        KeyError: If a prediction is present but ``ground_truth`` has no
            ``"root_cause"`` entry.
    """
    diagnosis = action.root_cause
    if diagnosis is None:
        return _ZERO, "Missing root-cause classification."

    guess = diagnosis.value
    truth = ground_truth["root_cause"]

    if guess == truth:
        verdict = (_EXACT, "Exact root-cause match.")
    elif guess == "UNKNOWN":
        verdict = (
            0.25,
            "Conservative fallback: uncertainty recognized, but the failure domain was not isolated.",
        )
    elif {guess, truth} in _TASK2_RELATED_GROUPS:
        # Set equality ignores order, so either direction of a related pair counts.
        verdict = (
            0.5,
            "Related failure domain selected: partial credit for a near-miss diagnosis.",
        )
    else:
        verdict = (
            _ZERO,
            "Root-cause classification does not match the expected failure domain.",
        )
    return verdict


def grade_task3(action: IncidentAction, ground_truth: dict) -> tuple[float, str]:
    """Score a remediation recommendation against the ground truth.

    Args:
        action: Submitted action. ``action.action`` is either ``None`` or an
            object whose ``.value`` is the recommended action label.
        ground_truth: Mapping whose ``"action"`` entry is the expected label.

    Returns:
        A ``(score, feedback)`` pair: ``_EXACT`` for an exact match, ``0.4``
        for ``"INVESTIGATE"`` when anything other than ``"NO_ACTION"`` was
        expected, ``0.25`` for ``"NO_ACTION"`` when ``"INVESTIGATE"`` was
        expected, the ``_TASK3_PARTIAL`` value when the
        ``(predicted, expected)`` pair is listed there, and ``_ZERO`` otherwise
        (including a missing recommendation).

    Raises:
        KeyError: If a recommendation is present but ``ground_truth`` has no
            ``"action"`` entry.
    """
    recommendation = action.action
    if recommendation is None:
        return _ZERO, "Missing remediation recommendation."

    guess = recommendation.value
    truth = ground_truth["action"]

    if guess == truth:
        verdict = (_EXACT, "Exact remediation match.")
    elif guess == "INVESTIGATE" and truth != "NO_ACTION":
        verdict = (
            0.4,
            "Safe investigative fallback: the incident was recognized, but the optimal action was not taken.",
        )
    elif guess == "NO_ACTION" and truth == "INVESTIGATE":
        verdict = (0.25, "Conservative response, but deeper investigation was expected.")
    else:
        # The table is keyed by (predicted, expected); a miss means no credit.
        credit = _TASK3_PARTIAL.get((guess, truth))
        if credit is not None:
            verdict = (
                credit,
                "Related remediation selected: partial credit for a close operational response.",
            )
        else:
            verdict = (
                _ZERO,
                "Recommended action does not match the expected operator response.",
            )
    return verdict


# Dispatch table: task identifier -> grading function for that task.
GRADERS = dict(
    task1=grade_task1,
    task2=grade_task2,
    task3=grade_task3,
)