Spaces:

XcodeAddy
/

incident-triage-env

Running

App Files Files Community

incident-triage-env / graders.py

XcodeAddy

Keep grader rewards strictly within unit interval

18aa055 about 1 month ago

raw

history blame contribute delete

3.02 kB

	from models import IncidentAction

	# Severity label -> rank index; grade_task1 scores by the distance between ranks.
	_SEV_ORDER = {label: rank for rank, label in enumerate(("SEV1", "SEV2", "SEV3"))}

	# Unordered root-cause pairs: grade_task2 awards partial credit when the
	# predicted and expected causes together form exactly one of these sets.
	_TASK2_RELATED_GROUPS = [
	    set(pair)
	    for pair in (
	        ("DATABASE", "APPLICATION"),
	        ("NETWORK", "INFRASTRUCTURE"),
	        ("NETWORK", "THIRD_PARTY"),
	        ("INFRASTRUCTURE", "THIRD_PARTY"),
	    )
	]

	# Ordered (predicted, expected) remediation pairs that grade_task3 treats as
	# near misses; every listed pair earns the same partial score.
	_TASK3_PARTIAL = dict.fromkeys(
	    (
	        ("RESTART_SERVICE", "FAILOVER"),
	        ("FAILOVER", "RESTART_SERVICE"),
	        ("NOTIFY_VENDOR", "INVESTIGATE"),
	        ("SCALE_UP", "INVESTIGATE"),
	        ("RESTART_SERVICE", "INVESTIGATE"),
	    ),
	    0.25,
	)

	# Scores must be strictly within (0, 1): the validator rejects 0.0 and 1.0,
	# so these stand in for "full credit" and "no credit".
	_EXACT, _ZERO = 0.99, 0.01


	def grade_task1(action: IncidentAction, ground_truth: dict) -> tuple[float, str]:
	    """Score a severity classification against the ground truth.

	    Args:
	        action: The submitted action; its ``severity`` may be ``None``,
	            otherwise ``severity.value`` is looked up in ``_SEV_ORDER``.
	        ground_truth: Mapping holding the expected label under ``"severity"``.

	    Returns:
	        A ``(score, feedback)`` tuple: ``_EXACT`` for an identical label,
	        0.5 for a label one rank away, and ``_ZERO`` when the severity is
	        missing, unrecognized, or two or more ranks away.

	    Raises:
	        KeyError: If ``ground_truth`` has no ``"severity"`` entry.
	    """
	    if action.severity is None:
	        return _ZERO, "Missing severity classification."

	    predicted = _SEV_ORDER.get(action.severity.value)
	    expected = _SEV_ORDER.get(ground_truth["severity"])
	    if predicted is None or expected is None:
	        # A shared -1 sentinel would let an unknown label count as adjacent
	        # to rank 0 (partial credit), and two unknown labels count as an
	        # exact match. Unknown labels must never earn credit.
	        return _ZERO, "Unrecognized severity label; severity could not be graded."

	    # Branch on the integer distance rather than on the float score, so the
	    # feedback text cannot be mislabeled if the score constants are retuned.
	    distance = abs(predicted - expected)
	    if distance == 0:
	        return _EXACT, "Exact severity match."
	    if distance == 1:
	        return 0.5, "Adjacent severity band: partial credit for a close escalation call."
	    return _ZERO, "Severity choice is too far from the ground truth."


	def grade_task2(action: IncidentAction, ground_truth: dict) -> tuple[float, str]:
	    """Score a root-cause classification against the ground truth.

	    Args:
	        action: The submitted action; its ``root_cause`` may be ``None``,
	            otherwise ``root_cause.value`` is compared with the expectation.
	        ground_truth: Mapping holding the expected label under ``"root_cause"``.

	    Returns:
	        A ``(score, feedback)`` tuple: ``_EXACT`` for an identical label,
	        0.25 for an ``"UNKNOWN"`` prediction, 0.5 when the predicted and
	        expected labels form one of ``_TASK2_RELATED_GROUPS``, and
	        ``_ZERO`` otherwise (including a missing root cause).
	    """
	    root_cause = action.root_cause
	    if root_cause is None:
	        return _ZERO, "Missing root-cause classification."

	    guess, truth = root_cause.value, ground_truth["root_cause"]

	    # Order matters: an exact match wins, then the "UNKNOWN" fallback, and
	    # only then the related-domain check.
	    if guess == truth:
	        verdict = (_EXACT, "Exact root-cause match.")
	    elif guess == "UNKNOWN":
	        verdict = (
	            0.25,
	            "Conservative fallback: uncertainty recognized, but the failure domain was not isolated.",
	        )
	    elif {guess, truth} in _TASK2_RELATED_GROUPS:
	        # List membership compares by set equality, so pair order is irrelevant.
	        verdict = (
	            0.5,
	            "Related failure domain selected: partial credit for a near-miss diagnosis.",
	        )
	    else:
	        verdict = (
	            _ZERO,
	            "Root-cause classification does not match the expected failure domain.",
	        )
	    return verdict


	def grade_task3(action: IncidentAction, ground_truth: dict) -> tuple[float, str]:
	    """Score a remediation recommendation against the ground truth.

	    Args:
	        action: The submitted action; its ``action`` attribute may be
	            ``None``, otherwise ``action.value`` is compared with the
	            expectation.
	        ground_truth: Mapping holding the expected label under ``"action"``.

	    Returns:
	        A ``(score, feedback)`` tuple: ``_EXACT`` for an identical label,
	        0.4 for ``"INVESTIGATE"`` when anything but ``"NO_ACTION"`` was
	        expected, 0.25 for ``"NO_ACTION"`` when ``"INVESTIGATE"`` was
	        expected, the ``_TASK3_PARTIAL`` score for a listed
	        (predicted, expected) pair, and ``_ZERO`` otherwise (including a
	        missing recommendation).
	    """
	    chosen = action.action
	    if chosen is None:
	        return _ZERO, "Missing remediation recommendation."

	    guess, truth = chosen.value, ground_truth["action"]

	    if guess == truth:
	        return _EXACT, "Exact remediation match."

	    if guess == "INVESTIGATE" and truth != "NO_ACTION":
	        return (
	            0.4,
	            "Safe investigative fallback: the incident was recognized, but the optimal action was not taken.",
	        )

	    if guess == "NO_ACTION" and truth == "INVESTIGATE":
	        return 0.25, "Conservative response, but deeper investigation was expected."

	    # A single lookup replaces the membership test plus indexing; every table
	    # value is a float, so None reliably signals "pair not listed".
	    partial = _TASK3_PARTIAL.get((guess, truth))
	    if partial is not None:
	        return (
	            partial,
	            "Related remediation selected: partial credit for a close operational response.",
	        )

	    return _ZERO, "Recommended action does not match the expected operator response."


	# Registry mapping each task identifier to the grader that scores it.
	GRADERS = dict(
	    task1=grade_task1,
	    task2=grade_task2,
	    task3=grade_task3,
	)