Spaces:
Sleeping
Sleeping
Vighnesh committed on
Commit ·
3d8844e
1
Parent(s): 93f0ae5
Fix #4: use resolution_hint in reply scoring — category hits 0.03, hint hits 0.05, cap 0.25 (intentional specificity incentive)
Browse files- graders.py +28 -11
- server/support_environment.py +2 -1
graders.py
CHANGED
|
@@ -37,23 +37,39 @@ _KEYWORD_REWARDS: Dict[str, list[str]] = {
|
|
| 37 |
}
|
| 38 |
|
| 39 |
|
| 40 |
-
def _reply_quality(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
"""Return 0.0–0.25 based on how relevant the reply text is.
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
| 46 |
Total grade_task3 weights: 0.20 + 0.40 + 0.25 + 0.15 = 1.00
|
| 47 |
"""
|
| 48 |
if not reply_text:
|
| 49 |
return 0.0
|
| 50 |
-
|
| 51 |
import re
|
| 52 |
cleaned = re.sub(r'[^\w\s]', ' ', reply_text.lower())
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
|
| 59 |
# ─────────────────────────── Task 1 ────────────────────────────
|
|
@@ -104,6 +120,7 @@ def grade_task3(
|
|
| 104 |
resolved: bool,
|
| 105 |
steps_taken: int,
|
| 106 |
max_steps: int = 5,
|
|
|
|
| 107 |
) -> float:
|
| 108 |
"""
|
| 109 |
Multi-step resolution reward with partial progress.
|
|
@@ -111,7 +128,7 @@ def grade_task3(
|
|
| 111 |
Breakdown:
|
| 112 |
0.20 – classification correct
|
| 113 |
0.40 – action correct (0.20 if partial)
|
| 114 |
-
0.25 – reply quality (
|
| 115 |
0.15 – efficiency bonus (fewer steps → higher bonus)
|
| 116 |
"""
|
| 117 |
score = 0.0
|
|
@@ -125,7 +142,7 @@ def grade_task3(
|
|
| 125 |
score += 0.20
|
| 126 |
|
| 127 |
if reply_text:
|
| 128 |
-
score += _reply_quality(reply_text, category
|
| 129 |
|
| 130 |
# Efficiency: full 0.15 for 1 step, 0 for max_steps steps
|
| 131 |
if resolved and steps_taken <= max_steps:
|
|
|
|
| 37 |
}
|
| 38 |
|
| 39 |
|
| 40 |
+
def _reply_quality(
    reply_text: str,
    category: str,
    resolution_hint: str = "",
) -> float:
    """Return 0.0–0.25 based on how relevant the reply text is.

    Two-tier keyword scoring (both case-insensitive, punctuation-stripped):
      - Category keyword hit → 0.03 each  (broad topical relevance)
      - Hint keyword hit     → 0.05 each  (specific resolution relevance)
    Total capped at 0.25 — intentionally rewards specificity over vagueness.

    Hint keywords are the distinct words of ``resolution_hint`` that are
    longer than three characters and are not common function words, so filler
    such as "with" or "your" cannot earn hint credit on its own.

    Both tiers test for the keyword as a substring of the cleaned reply, so
    "refund" also matches "refunded" (and multi-word category keywords work).

    Args:
        reply_text: The agent's reply; an empty/None reply scores 0.0.
        category: Key looked up in ``_KEYWORD_REWARDS``; unknown categories
            contribute no category score.
        resolution_hint: Optional per-ticket hint text; empty disables the
            hint tier.

    Returns:
        The combined score, capped at 0.25 and rounded to 4 decimals.

    Total grade_task3 weights: 0.20 + 0.40 + 0.25 + 0.15 = 1.00
    """
    if not reply_text:
        return 0.0

    import re

    # Function words of 4+ letters that survive the length filter but say
    # nothing about the resolution; they must not be worth 0.05 each.
    stop_words = frozenset({
        "about", "after", "also", "been", "before", "being", "could", "does",
        "from", "have", "here", "into", "just", "more", "must", "only",
        "other", "over", "should", "some", "such", "than", "that", "their",
        "them", "then", "there", "these", "they", "this", "those", "very",
        "were", "what", "when", "where", "which", "while", "will", "with",
        "would", "your",
    })

    cleaned = re.sub(r'[^\w\s]', ' ', reply_text.lower())

    # Broad category keywords — 0.03 each
    category_keywords = _KEYWORD_REWARDS.get(category, [])
    category_score = sum(0.03 for kw in category_keywords if kw in cleaned)

    # Specific hint keywords — 0.05 each (extracted from resolution_hint)
    hint_score = 0.0
    if resolution_hint:
        hint_words = set(re.sub(r'[^\w\s]', ' ', resolution_hint.lower()).split())
        # filter out short words and common stop words
        hint_words = {
            w for w in hint_words
            if len(w) > 3 and w not in stop_words
        }
        hint_score = sum(0.05 for w in hint_words if w in cleaned)

    # round() hides float accumulation noise from summing 0.03/0.05 steps
    return round(min(0.25, category_score + hint_score), 4)
|
| 73 |
|
| 74 |
|
| 75 |
# ─────────────────────────── Task 1 ────────────────────────────
|
|
|
|
| 120 |
resolved: bool,
|
| 121 |
steps_taken: int,
|
| 122 |
max_steps: int = 5,
|
| 123 |
+
resolution_hint: str = "",
|
| 124 |
) -> float:
|
| 125 |
"""
|
| 126 |
Multi-step resolution reward with partial progress.
|
|
|
|
| 128 |
Breakdown:
|
| 129 |
0.20 – classification correct
|
| 130 |
0.40 – action correct (0.20 if partial)
|
| 131 |
+
0.25 – reply quality (two-tier: category keywords @0.03, hint keywords @0.05)
|
| 132 |
0.15 – efficiency bonus (fewer steps → higher bonus)
|
| 133 |
"""
|
| 134 |
score = 0.0
|
|
|
|
| 142 |
score += 0.20
|
| 143 |
|
| 144 |
if reply_text:
|
| 145 |
+
score += _reply_quality(reply_text, category, resolution_hint)
|
| 146 |
|
| 147 |
# Efficiency: full 0.15 for 1 step, 0 for max_steps steps
|
| 148 |
if resolved and steps_taken <= max_steps:
|
server/support_environment.py
CHANGED
|
@@ -228,7 +228,8 @@ class SupportTicketEnvironment(Environment):
|
|
| 228 |
action_correct=action_correct,
|
| 229 |
action_partial=action_partial,
|
| 230 |
reply_text=action.reply_text,
|
| 231 |
-
category=self._ticket["category"],
|
|
|
|
| 232 |
resolved=True,
|
| 233 |
steps_taken=self._step_count,
|
| 234 |
max_steps=MAX_STEPS,
|
|
|
|
| 228 |
action_correct=action_correct,
|
| 229 |
action_partial=action_partial,
|
| 230 |
reply_text=action.reply_text,
|
| 231 |
+
category=self._ticket["category"], # ground truth category
|
| 232 |
+
resolution_hint=self._ticket.get("resolution_hint", ""), # per-ticket hint keywords
|
| 233 |
resolved=True,
|
| 234 |
steps_taken=self._step_count,
|
| 235 |
max_steps=MAX_STEPS,
|