| """Grader for Task 2 β Conflict Classification (medium). |
| |
| Scoring |
| ------- |
| 1.0 β exact match with the ground-truth violation type |
| 0.5 β same constraint family (resource-limit or temporal-ordering) |
| 0.1 β valid category but from a different family |
| 0.0 β empty or completely unrecognised response |
| |
| Constraint families (related groups for partial credit) |
| ------------------------------------------------------- |
| Resource-limit family : resource_overload, capacity_exceeded |
| Both concern the number of jobs concurrently on a machine. |
| Temporal-ordering family : deadline_violation, precedence_violation |
| Both concern the sequencing and timing of job execution. |
| Standalone : availability_conflict |
| Concerns machine operational windows (no close sibling). |
| |
| After each call, ``last_breakdown`` holds a dict describing the decision. |
| """ |
|
|
| from __future__ import annotations |
|
|
| from typing import Any |
|
|
| from models import Action |
|
|
| VALID_CATEGORIES: frozenset[str] = frozenset( |
| { |
| "resource_overload", |
| "deadline_violation", |
| "precedence_violation", |
| "availability_conflict", |
| "capacity_exceeded", |
| } |
| ) |
|
|
| |
| _RELATED_GROUPS: list[frozenset[str]] = [ |
| frozenset({"resource_overload", "capacity_exceeded"}), |
| frozenset({"deadline_violation", "precedence_violation"}), |
| ] |
|
|
|
|
| def _same_family(a: str, b: str) -> bool: |
| """Return True if a and b belong to the same related group.""" |
| return any(a in g and b in g for g in _RELATED_GROUPS) |
|
|
|
|
| class ConflictGrader: |
| """Grade the agent's constraint-violation classification.""" |
|
|
| def __init__(self) -> None: |
| self.last_breakdown: dict[str, Any] = {} |
|
|
| def grade(self, action: Action, ground_truth: dict[str, Any]) -> float: |
| |
| response: str = ( |
| action.response.strip().lower().replace(" ", "_").replace("-", "_") |
| ) |
| expected: str = ground_truth.get("violation_type") or "" |
|
|
| if not response: |
| self._record("", expected, 0.0, "Empty response.") |
| return 0.0 |
|
|
| |
| if response == expected: |
| self._record(response, expected, 1.0, "Exact match.") |
| return 1.0 |
|
|
| |
| if response not in VALID_CATEGORIES: |
| self._record( |
| response, expected, 0.0, |
| f"'{response}' is not a valid category. " |
| f"Choose from: {', '.join(sorted(VALID_CATEGORIES))}.", |
| ) |
| return 0.0 |
|
|
| |
| if _same_family(response, expected): |
| self._record( |
| response, expected, 0.5, |
| f"Related category (same family as '{expected}').", |
| ) |
| return 0.5 |
|
|
| |
| self._record( |
| response, expected, 0.1, |
| f"Valid category but wrong family. Expected '{expected}'.", |
| ) |
| return 0.1 |
|
|
| def _record( |
| self, predicted: str, expected: str, score: float, feedback: str |
| ) -> None: |
| self.last_breakdown = { |
| "predicted": predicted, |
| "expected": expected, |
| "score": score, |
| "in_valid_categories": predicted in VALID_CATEGORIES, |
| "same_family": _same_family(predicted, expected) if predicted and expected else False, |
| "exact_match": predicted == expected, |
| "feedback": feedback, |
| } |
|
|