# Scoring helpers for a ticket-handling task: reading, classification,
# reply drafting and resolution are graded separately and then combined.
from __future__ import annotations

from dataclasses import dataclass

from .tasks import TaskSpec

# Split of the read sub-score between the target ticket and context tickets.
_READ_TARGET_WEIGHT = 0.6
_READ_CONTEXT_WEIGHT = 0.4

# Contribution of each sub-score to the total (these sum to 1.0).
_READ_WEIGHT = 0.2
_CLASSIFY_WEIGHT = 0.35
_REPLY_WEIGHT = 0.3
_RESOLVE_WEIGHT = 0.15

# Amount subtracted from the reply score per forbidden keyword found.
_FORBIDDEN_KEYWORD_PENALTY = 0.5


@dataclass
class GradeBreakdown:
    """Per-component scores and the weighted total produced by ``grade_task``.

    Every field is a float in ``[0.0, 1.0]``; ``grade_task`` rounds each to
    four decimal places.
    """

    read_score: float
    classify_score: float
    reply_score: float
    resolve_score: float
    total: float


def _keyword_coverage(message: str, required: tuple[str, ...]) -> float:
    """Return the fraction of *required* keywords that occur in *message*.

    Matching is a case-insensitive substring test. When no keywords are
    required the coverage is ``1.0``.
    """
    if not required:
        return 1.0
    lowered = message.lower()
    found = sum(1 for keyword in required if keyword.lower() in lowered)
    return found / len(required)


def _forbidden_penalty(message: str, forbidden: tuple[str, ...]) -> float:
    """Return the penalty for *forbidden* keywords that occur in *message*.

    Matching is a case-insensitive substring test. Each forbidden keyword
    found costs ``0.5``; the penalty is capped at ``1.0``.
    """
    lowered = message.lower()
    count = sum(1 for keyword in forbidden if keyword.lower() in lowered)
    return min(1.0, _FORBIDDEN_KEYWORD_PENALTY * count)


def grade_task(task: TaskSpec, env_state: dict) -> GradeBreakdown:
    """Grade the state an environment reached against the task's expectations.

    Args:
        task: Specification providing the target ticket id, required context
            ticket ids, expected classification fields and the required /
            forbidden reply keywords.
        env_state: Mapping read for the keys ``"read_ticket_ids"``,
            ``"classification"`` (itself a mapping with ``"priority"``,
            ``"category"`` and ``"needs_escalation"``), ``"draft_reply"``,
            ``"resolved"`` and ``"resolved_ticket_id"``. Missing or empty
            entries are scored as "not done" rather than raising.

    Returns:
        A ``GradeBreakdown`` with each sub-score and the weighted total,
        all rounded to four decimal places.
    """
    # Read score: did the agent open the target ticket and the context ones?
    # ``.get`` (not indexing) so a state that never recorded any reads grades
    # as zero instead of raising KeyError, matching how other keys are read.
    read_ids = env_state.get("read_ticket_ids") or ()
    read_target = 1.0 if task.target_ticket_id in read_ids else 0.0
    context_ids = task.required_context_ticket_ids
    context_total = len(context_ids)
    context_hits = sum(1 for tid in context_ids if tid in read_ids)
    # With no required context there is nothing to miss, so full credit.
    context_score = context_hits / context_total if context_total else 1.0
    read_score = (
        _READ_TARGET_WEIGHT * read_target + _READ_CONTEXT_WEIGHT * context_score
    )

    # Classification score: equal credit for each field that matches exactly.
    classification = env_state.get("classification") or {}
    expected_fields = (
        ("priority", task.expected_priority),
        ("category", task.expected_category),
        ("needs_escalation", task.expected_escalation),
    )
    fields_correct = sum(
        int(classification.get(name) == expected)
        for name, expected in expected_fields
    )
    classify_score = fields_correct / len(expected_fields)

    # Reply score: keyword coverage minus forbidden-keyword penalty, floored
    # at zero so a bad reply cannot drag the other components down.
    draft = env_state.get("draft_reply") or ""
    keyword_score = _keyword_coverage(draft, task.required_reply_keywords)
    forbidden_penalty = _forbidden_penalty(draft, task.forbidden_reply_keywords)
    reply_score = max(0.0, keyword_score - forbidden_penalty)

    # Resolve score: all-or-nothing, and only for resolving the target ticket.
    resolved = bool(env_state.get("resolved"))
    resolved_target = env_state.get("resolved_ticket_id") == task.target_ticket_id
    resolve_score = 1.0 if resolved and resolved_target else 0.0

    total = (
        (_READ_WEIGHT * read_score)
        + (_CLASSIFY_WEIGHT * classify_score)
        + (_REPLY_WEIGHT * reply_score)
        + (_RESOLVE_WEIGHT * resolve_score)
    )
    # Defensive clamp; guards against float drift outside [0, 1].
    total = max(0.0, min(1.0, total))

    return GradeBreakdown(
        read_score=round(read_score, 4),
        classify_score=round(classify_score, 4),
        reply_score=round(reply_score, 4),
        resolve_score=round(resolve_score, 4),
        total=round(total, 4),
    )