Spaces:
Running
Running
| import unittest | |
| from graders import GRADERS, grade_task1, grade_task2, grade_task3 | |
| from incidents import TICKETS | |
| from models import IncidentAction | |
class GraderTests(unittest.TestCase):
    """Pin the scores the task graders assign to exact, related, and wrong answers.

    Each grader is called as ``grader(action, ground_truth)`` and is expected
    to return a ``(score, reason)`` pair.
    """

    def test_all_ticket_ground_truth_scores_stay_strictly_within_unit_interval(self) -> None:
        """Answering any ticket with its own ground truth scores strictly inside (0, 1)."""
        for ticket in TICKETS:
            # subTest lets the loop continue past a failure, so a single run
            # reports every offending ticket rather than only the first one.
            with self.subTest(incident_id=ticket["incident_id"]):
                action = IncidentAction(
                    incident_id=ticket["incident_id"],
                    task_type=ticket["task_type"],
                    **ticket["ground_truth"],
                )
                grader = GRADERS[ticket["task_type"]]
                score, reason = grader(action, ticket["ground_truth"])

                self.assertGreater(score, 0.0, ticket["incident_id"])
                self.assertLess(score, 1.0, ticket["incident_id"])
                self.assertIsInstance(reason, str)

    def test_task1_grader_supports_partial_credit(self) -> None:
        """An exact severity scores 0.99; SEV2 answered for SEV1 scores 0.5."""
        exact = IncidentAction(
            incident_id="INC-TEST-1",
            task_type="task1",
            severity="SEV1",
        )
        adjacent = IncidentAction(
            incident_id="INC-TEST-1",
            task_type="task1",
            severity="SEV2",
        )

        exact_score, _ = grade_task1(exact, {"severity": "SEV1"})
        adjacent_score, _ = grade_task1(adjacent, {"severity": "SEV1"})

        self.assertEqual(exact_score, 0.99)
        self.assertEqual(adjacent_score, 0.5)

    def test_task2_grader_is_not_constant(self) -> None:
        """Exact, UNKNOWN, and wrong root causes score 0.99, 0.25, and 0.01."""
        exact = IncidentAction(
            incident_id="INC-TEST-2",
            task_type="task2",
            root_cause="DATABASE",
        )
        fallback = IncidentAction(
            incident_id="INC-TEST-2",
            task_type="task2",
            root_cause="UNKNOWN",
        )
        wrong = IncidentAction(
            incident_id="INC-TEST-2",
            task_type="task2",
            root_cause="NETWORK",
        )

        exact_score, _ = grade_task2(exact, {"root_cause": "DATABASE"})
        fallback_score, _ = grade_task2(fallback, {"root_cause": "DATABASE"})
        wrong_score, _ = grade_task2(wrong, {"root_cause": "DATABASE"})

        self.assertEqual(exact_score, 0.99)
        self.assertEqual(fallback_score, 0.25)
        self.assertEqual(wrong_score, 0.01)

    def test_task2_grader_rewards_related_domain_partial_credit(self) -> None:
        """APPLICATION answered for DATABASE scores 0.5 and the reason mentions partial credit."""
        near_miss = IncidentAction(
            incident_id="INC-TEST-2A",
            task_type="task2",
            root_cause="APPLICATION",
        )

        score, reason = grade_task2(near_miss, {"root_cause": "DATABASE"})

        self.assertEqual(score, 0.5)
        self.assertIn("partial credit", reason.lower())

    def test_task3_grader_rewards_safe_fallbacks(self) -> None:
        """For FAILOVER: exact scores 0.99, INVESTIGATE 0.4, NO_ACTION 0.01."""
        exact = IncidentAction(
            incident_id="INC-TEST-3",
            task_type="task3",
            action="FAILOVER",
        )
        fallback = IncidentAction(
            incident_id="INC-TEST-3",
            task_type="task3",
            action="INVESTIGATE",
        )
        wrong = IncidentAction(
            incident_id="INC-TEST-3",
            task_type="task3",
            action="NO_ACTION",
        )

        exact_score, _ = grade_task3(exact, {"action": "FAILOVER"})
        fallback_score, _ = grade_task3(fallback, {"action": "FAILOVER"})
        wrong_score, _ = grade_task3(wrong, {"action": "FAILOVER"})

        self.assertEqual(exact_score, 0.99)
        self.assertEqual(fallback_score, 0.4)
        self.assertEqual(wrong_score, 0.01)

    def test_task3_grader_rewards_related_action_partial_credit(self) -> None:
        """RESTART_SERVICE for FAILOVER and NOTIFY_VENDOR for INVESTIGATE each score 0.25."""
        restart_instead_of_failover = IncidentAction(
            incident_id="INC-TEST-3A",
            task_type="task3",
            action="RESTART_SERVICE",
        )
        notify_vendor_instead_of_investigate = IncidentAction(
            incident_id="INC-TEST-3B",
            task_type="task3",
            action="NOTIFY_VENDOR",
        )

        restart_score, _ = grade_task3(restart_instead_of_failover, {"action": "FAILOVER"})
        vendor_score, _ = grade_task3(
            notify_vendor_instead_of_investigate, {"action": "INVESTIGATE"}
        )

        self.assertEqual(restart_score, 0.25)
        self.assertEqual(vendor_score, 0.25)
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()