"""Deterministic graders for the three incident-triage tasks.

Each grader takes the submitted ``IncidentAction`` and a ground-truth mapping
and returns a ``(score, feedback)`` tuple.
"""

from models import IncidentAction

# Rank of each known severity label; grade_task1 scores by rank distance.
_SEV_ORDER = {"SEV1": 0, "SEV2": 1, "SEV3": 2}

# Unordered pairs of root-cause labels that earn near-miss credit in task 2.
_TASK2_RELATED_GROUPS = [
    {"DATABASE", "APPLICATION"},
    {"NETWORK", "INFRASTRUCTURE"},
    {"NETWORK", "THIRD_PARTY"},
    {"INFRASTRUCTURE", "THIRD_PARTY"},
]

# Partial credit for task 3, keyed by (predicted action, expected action).
# Keys are ordered pairs, so a pair only works in both directions when both
# orderings are listed.
_TASK3_PARTIAL = {
    ("RESTART_SERVICE", "FAILOVER"): 0.25,
    ("FAILOVER", "RESTART_SERVICE"): 0.25,
    ("NOTIFY_VENDOR", "INVESTIGATE"): 0.25,
    ("SCALE_UP", "INVESTIGATE"): 0.25,
    ("RESTART_SERVICE", "INVESTIGATE"): 0.25,
}

# Scores must be strictly within (0, 1) — 0.0 and 1.0 are rejected by the validator.
_EXACT = 0.99
_ZERO = 0.01


def grade_task1(action: IncidentAction, ground_truth: dict) -> tuple[float, str]:
    """Grade the severity classification.

    Args:
        action: Submission; ``action.severity`` is either ``None`` or an
            object whose ``.value`` is the severity label.
        ground_truth: Mapping holding the expected label under ``"severity"``.

    Returns:
        ``(score, feedback)``: ``_EXACT`` for the same label, ``0.5`` for a
        label one rank away in ``_SEV_ORDER``, ``_ZERO`` otherwise.

    Raises:
        KeyError: If ``ground_truth`` has no ``"severity"`` entry.
    """
    if action.severity is None:
        return _ZERO, "Missing severity classification."

    predicted_label = action.severity.value
    expected_label = ground_truth["severity"]
    if predicted_label == expected_label:
        return _EXACT, "Exact severity match."

    predicted_rank = _SEV_ORDER.get(predicted_label)
    expected_rank = _SEV_ORDER.get(expected_label)
    # A label missing from _SEV_ORDER has no rank. Giving it a numeric
    # sentinel would let it look "adjacent" to SEV1, or make two different
    # unknown labels look identical, so it is scored as a miss instead.
    if predicted_rank is None or expected_rank is None:
        return _ZERO, "Unrecognized severity label: it cannot be ranked against the ground truth."

    if abs(predicted_rank - expected_rank) == 1:
        return 0.5, "Adjacent severity band: partial credit for a close escalation call."
    return _ZERO, "Severity choice is too far from the ground truth."


def grade_task2(action: IncidentAction, ground_truth: dict) -> tuple[float, str]:
    """Grade the root-cause classification.

    Args:
        action: Submission; ``action.root_cause`` is either ``None`` or an
            object whose ``.value`` is the root-cause label.
        ground_truth: Mapping holding the expected label under
            ``"root_cause"``.

    Returns:
        ``(score, feedback)``: ``_EXACT`` for the same label, ``0.25`` for an
        ``"UNKNOWN"`` prediction, ``0.5`` when the two labels form one of the
        ``_TASK2_RELATED_GROUPS`` pairs, ``_ZERO`` otherwise.

    Raises:
        KeyError: If ``ground_truth`` has no ``"root_cause"`` entry.
    """
    if action.root_cause is None:
        return _ZERO, "Missing root-cause classification."

    predicted = action.root_cause.value
    expected = ground_truth["root_cause"]

    if predicted == expected:
        return _EXACT, "Exact root-cause match."
    if predicted == "UNKNOWN":
        return 0.25, "Conservative fallback: uncertainty recognized, but the failure domain was not isolated."
    # Set equality ignores order, so each related pair matches both ways.
    if {predicted, expected} in _TASK2_RELATED_GROUPS:
        return 0.5, "Related failure domain selected: partial credit for a near-miss diagnosis."
    return _ZERO, "Root-cause classification does not match the expected failure domain."


def grade_task3(action: IncidentAction, ground_truth: dict) -> tuple[float, str]:
    """Grade the remediation recommendation.

    Args:
        action: Submission; ``action.action`` is either ``None`` or an object
            whose ``.value`` is the remediation label.
        ground_truth: Mapping holding the expected label under ``"action"``.

    Returns:
        ``(score, feedback)``. Rules are checked in order: exact match
        (``_EXACT``); ``"INVESTIGATE"`` predicted when anything other than
        ``"NO_ACTION"`` was expected (``0.4``); ``"NO_ACTION"`` predicted when
        ``"INVESTIGATE"`` was expected (``0.25``); a pair listed in
        ``_TASK3_PARTIAL`` (its table value); otherwise ``_ZERO``.

    Raises:
        KeyError: If ``ground_truth`` has no ``"action"`` entry.
    """
    if action.action is None:
        return _ZERO, "Missing remediation recommendation."

    predicted = action.action.value
    expected = ground_truth["action"]

    if predicted == expected:
        return _EXACT, "Exact remediation match."
    if predicted == "INVESTIGATE" and expected != "NO_ACTION":
        return 0.4, "Safe investigative fallback: the incident was recognized, but the optimal action was not taken."
    if predicted == "NO_ACTION" and expected == "INVESTIGATE":
        return 0.25, "Conservative response, but deeper investigation was expected."

    partial_credit = _TASK3_PARTIAL.get((predicted, expected))
    if partial_credit is not None:
        return partial_credit, "Related remediation selected: partial credit for a close operational response."
    return _ZERO, "Recommended action does not match the expected operator response."


# Dispatch table from task identifier to its grading function.
GRADERS = {
    "task1": grade_task1,
    "task2": grade_task2,
    "task3": grade_task3,
}