Vighnesh committed on
Commit
3d8844e
·
1 Parent(s): 93f0ae5

Fix #4: use resolution_hint in reply scoring — category hits 0.03, hint hits 0.05, cap 0.25 (intentional specificity incentive)

Browse files
Files changed (2) hide show
  1. graders.py +28 -11
  2. server/support_environment.py +2 -1
graders.py CHANGED
@@ -37,23 +37,39 @@ _KEYWORD_REWARDS: Dict[str, list[str]] = {
37
  }
38
 
39
 
40
- def _reply_quality(reply_text: str, category: str) -> float:
 
 
 
 
41
  """Return 0.0–0.25 based on how relevant the reply text is.
42
 
43
- Matching is case-insensitive and punctuation-stripped so that
44
- replies like 'Resolved.' and 'resolved' score identically.
45
- Each keyword hit = 0.05, capped at 0.25 (5 hits max).
 
 
46
  Total grade_task3 weights: 0.20 + 0.40 + 0.25 + 0.15 = 1.00
47
  """
48
  if not reply_text:
49
  return 0.0
50
- # Strip punctuation and lowercase for robust matching
51
  import re
52
  cleaned = re.sub(r'[^\w\s]', ' ', reply_text.lower())
53
- keywords = _KEYWORD_REWARDS.get(category, [])
54
- hits = sum(1 for kw in keywords if kw in cleaned)
55
- # cap at 0.25 — reply quality component of grade_task3
56
- return min(0.25, hits * 0.05)
 
 
 
 
 
 
 
 
 
 
57
 
58
 
59
  # ─────────────────────────── Task 1 ────────────────────────────
@@ -104,6 +120,7 @@ def grade_task3(
104
  resolved: bool,
105
  steps_taken: int,
106
  max_steps: int = 5,
 
107
  ) -> float:
108
  """
109
  Multi-step resolution reward with partial progress.
@@ -111,7 +128,7 @@ def grade_task3(
111
  Breakdown:
112
  0.20 – classification correct
113
  0.40 – action correct (0.20 if partial)
114
- 0.25 – reply quality (NLP keyword overlap, case-insensitive, punctuation-stripped)
115
  0.15 – efficiency bonus (fewer steps → higher bonus)
116
  """
117
  score = 0.0
@@ -125,7 +142,7 @@ def grade_task3(
125
  score += 0.20
126
 
127
  if reply_text:
128
- score += _reply_quality(reply_text, category) # up to 0.5 (already capped in _reply_quality)
129
 
130
  # Efficiency: full 0.15 for 1 step, 0 for max_steps steps
131
  if resolved and steps_taken <= max_steps:
 
37
  }
38
 
39
 
40
def _reply_quality(
    reply_text: str,
    category: str,
    resolution_hint: str = "",
) -> float:
    """Score the relevance of a reply on a 0.0–0.25 scale.

    This is the 0.25 reply-quality component of grade_task3
    (0.20 + 0.40 + 0.25 + 0.15 = 1.00).

    Both the reply and the hint are lowercased and have punctuation replaced
    by spaces before matching. Two tiers are then added together:

    - Category tier: every entry of ``_KEYWORD_REWARDS[category]`` found in
      the reply is worth 0.03 (broad topical relevance).
    - Hint tier: every distinct word of ``resolution_hint`` longer than three
      characters found in the reply is worth 0.05 (specific resolution
      relevance).

    Matching is a substring test against the normalised reply, so a term also
    counts when it occurs inside a longer word. A term present in both tiers
    earns both amounts. The sum is capped at 0.25 and rounded to four decimal
    places, which intentionally rewards specific replies over vague ones.

    Args:
        reply_text: The reply to score; an empty value scores 0.0.
        category: Key used to look up the broad keyword list; an unknown
            category contributes nothing.
        resolution_hint: Optional per-ticket hint text; when empty, only the
            category tier is scored.

    Returns:
        A float between 0.0 and 0.25 inclusive.
    """
    if not reply_text:
        return 0.0

    import re

    def _normalise(text: str) -> str:
        # Lowercase, then turn every non-word, non-space character into a space.
        return re.sub(r'[^\w\s]', ' ', text.lower())

    reply = _normalise(reply_text)

    # Tier 1: broad category keywords, 0.03 per keyword present in the reply.
    broad_total = 0.0
    for keyword in _KEYWORD_REWARDS.get(category, []):
        if keyword in reply:
            broad_total += 0.03

    # Tier 2: distinct hint words, 0.05 per word present in the reply.
    specific_total = 0.0
    if resolution_hint:
        # Words of three characters or fewer are dropped as a cheap stop-word filter.
        hint_terms = {
            term for term in _normalise(resolution_hint).split() if len(term) > 3
        }
        for term in hint_terms:
            if term in reply:
                specific_total += 0.05

    capped = min(0.25, broad_total + specific_total)
    return round(capped, 4)
73
 
74
 
75
  # ─────────────────────────── Task 1 ────────────────────────────
 
120
  resolved: bool,
121
  steps_taken: int,
122
  max_steps: int = 5,
123
+ resolution_hint: str = "",
124
  ) -> float:
125
  """
126
  Multi-step resolution reward with partial progress.
 
128
  Breakdown:
129
  0.20 – classification correct
130
  0.40 – action correct (0.20 if partial)
131
+ 0.25 – reply quality (two-tier: category keywords @0.03, hint keywords @0.05)
132
  0.15 – efficiency bonus (fewer steps → higher bonus)
133
  """
134
  score = 0.0
 
142
  score += 0.20
143
 
144
  if reply_text:
145
+ score += _reply_quality(reply_text, category, resolution_hint)
146
 
147
  # Efficiency: full 0.15 for 1 step, 0 for max_steps steps
148
  if resolved and steps_taken <= max_steps:
server/support_environment.py CHANGED
@@ -228,7 +228,8 @@ class SupportTicketEnvironment(Environment):
228
  action_correct=action_correct,
229
  action_partial=action_partial,
230
  reply_text=action.reply_text,
231
- category=self._ticket["category"],
 
232
  resolved=True,
233
  steps_taken=self._step_count,
234
  max_steps=MAX_STEPS,
 
228
  action_correct=action_correct,
229
  action_partial=action_partial,
230
  reply_text=action.reply_text,
231
+ category=self._ticket["category"], # ground truth category
232
+ resolution_hint=self._ticket.get("resolution_hint", ""), # per-ticket hint keywords
233
  resolved=True,
234
  steps_taken=self._step_count,
235
  max_steps=MAX_STEPS,