Spaces:
Sleeping
Sleeping
Vighnesh
Fix #4: use resolution_hint in reply scoring — category hits 0.03, hint hits 0.05, cap 0.25 (intentional specificity incentive)
3d8844e | """ | |
| Graders for all three tasks. | |
| Each grader returns a float in [0.0, 1.0]. | |
| Task 1 — Classification (easy) | |
| - 1.0 : correct category | |
| - 0.0 : wrong category | |
| Task 2 — Action Selection (medium) | |
| - 1.0 : correct action | |
| - 0.5 : partially correct (e.g., escalate vs reply both defensible) | |
| - 0.0 : clearly wrong (e.g., close an unsolved ticket) | |
| Task 3 — Full Resolution (hard) | |
| Combines classification + action + reply quality into a single score. | |
| Rewards partial progress so the agent gets signal throughout the trajectory. | |
| """ | |
| from __future__ import annotations | |
| from typing import Dict, Any | |
| # ─────────────────────────── helpers ─────────────────────────── | |
# Unordered action pairs treated as "close enough" to earn partial credit:
# choosing one when the reference answer is the other (borderline tickets).
_PARTIAL_CREDIT_PAIRS = {
    frozenset(("reply", "escalate")),
}

# Ticket category -> keywords whose presence in a reply counts as a
# broad topical-relevance hit.
_KEYWORD_REWARDS: Dict[str, list[str]] = dict(
    billing=["refund", "charge", "invoice", "payment", "billing"],
    account=["password", "login", "account", "cancel", "subscription"],
    technical=["engineering", "escalate", "bug", "crash", "error", "fix"],
    refund=["refund", "return", "credit", "process"],
    general=["hours", "contact", "phone", "information", "help"],
)
def _reply_quality(
    reply_text: str,
    category: str,
    resolution_hint: str = "",
) -> float:
    """Score the relevance of a reply on a 0.0-0.25 scale.

    The reply and the hint are both lower-cased, and every character that
    is neither a word character nor whitespace is replaced by a space
    before matching. Matching is a substring test against the normalised
    reply, so a keyword also matches inside a longer word.

    Two tiers of hits are added together:
      - each keyword listed for ``category`` in ``_KEYWORD_REWARDS`` that
        occurs in the reply adds 0.03 (broad topical relevance);
      - each distinct word of ``resolution_hint`` longer than three
        characters that occurs in the reply adds 0.05 (specific
        resolution relevance).

    The sum is capped at 0.25 and rounded to four decimals, so specific
    replies are rewarded over vague ones. An empty reply scores 0.0.
    Total grade_task3 weights: 0.20 + 0.40 + 0.25 + 0.15 = 1.00
    """
    if not reply_text:
        return 0.0

    import re

    def _normalise(text: str) -> str:
        # Lower-case and blank out punctuation so matching ignores both.
        return re.sub(r'[^\w\s]', ' ', text.lower())

    haystack = _normalise(reply_text)

    # Tier 1: broad category keywords (unknown categories have none).
    category_hits = sum(
        1 for keyword in _KEYWORD_REWARDS.get(category, []) if keyword in haystack
    )

    # Tier 2: distinct hint words; the length filter drops short words.
    hint_hits = 0
    if resolution_hint:
        candidates = {
            token for token in _normalise(resolution_hint).split() if len(token) > 3
        }
        hint_hits = sum(1 for token in candidates if token in haystack)

    total = 0.03 * category_hits + 0.05 * hint_hits
    return round(min(0.25, total), 4)
| # ─────────────────────────── Task 1 ──────────────────────────── | |
def grade_task1(
    predicted_category: str,
    correct_category: str,
) -> float:
    """Grade a classification: 1.0 when the categories are equal, else 0.0."""
    if predicted_category == correct_category:
        return 1.0
    return 0.0
| # ─────────────────────────── Task 2 ──────────────────────────── | |
def grade_task2(
    action_type: str,
    correct_action: str,
    category: str | None = None,
) -> float:
    """Grade the chosen action against the reference action.

    Returns:
        1.0 when ``action_type`` equals ``correct_action``;
        0.5 when the unordered pair of the two actions is listed in
        ``_PARTIAL_CREDIT_PAIRS`` (a defensible alternative);
        0.0 otherwise, which includes closing a ticket when ``close``
        was not the correct action.

    ``category`` is accepted for interface compatibility and is not used
    by the scoring.
    """
    if action_type != correct_action:
        # Order does not matter for a defensible alternative, hence frozenset.
        defensible = frozenset((action_type, correct_action)) in _PARTIAL_CREDIT_PAIRS
        return 0.5 if defensible else 0.0
    return 1.0
| # ─────────────────────────── Task 3 ──────────────────────────── | |
def grade_task3(
    classified_correctly: bool,
    action_correct: bool,
    action_partial: bool,
    reply_text: str | None,
    category: str,
    resolved: bool,
    steps_taken: int,
    max_steps: int = 5,
    resolution_hint: str = "",
) -> float:
    """Multi-step resolution reward with partial progress.

    Breakdown (weights sum to 1.00):
        0.20 - classification correct
        0.40 - action correct (0.20 if only partially correct)
        0.25 - reply quality, as scored by ``_reply_quality``
        0.15 - efficiency bonus (fewer steps -> higher bonus)

    The efficiency bonus is granted only when the ticket is ``resolved``
    within ``max_steps``. It falls linearly from the full 0.15 at one step
    to 0 at ``max_steps`` steps, and the fraction is clamped to [0, 1] so
    it can never exceed its 0.15 weight. With ``max_steps == 1`` there is
    no range to interpolate over, so a resolved single-step episode gets
    the full bonus.

    Returns:
        The total score, capped at 1.0 and rounded to four decimals.
    """
    score = 0.0

    if classified_correctly:
        score += 0.20

    if action_correct:
        score += 0.40
    elif action_partial:
        score += 0.20

    if reply_text:
        score += _reply_quality(reply_text, category, resolution_hint)

    if resolved and steps_taken <= max_steps:
        if max_steps == 1:
            # Avoid dividing by (max_steps - 1) == 0: one allowed step,
            # resolved within it, is maximally efficient.
            efficiency = 1.0
        else:
            efficiency = (max_steps - steps_taken) / (max_steps - 1)
        # Clamp both ends: a step count below 1 must not earn more than
        # the full bonus, and a negative fraction must not subtract.
        efficiency = min(1.0, max(0.0, efficiency))
        score += 0.15 * efficiency

    return round(min(1.0, score), 4)
| # ─────────────────────────── Penalty ─────────────────────────── | |
def loop_penalty(step_count: int, max_steps: int = 10) -> float:
    """Penalise an agent that runs past its step budget.

    Returns -0.05 for every step beyond ``max_steps``, or 0.0 while
    ``step_count`` is still within the budget.
    """
    overrun = step_count - max_steps
    return -0.05 * overrun if overrun > 0 else 0.0