Vighnesh committed on
Commit
4744d17
·
1 Parent(s): 3d83a5d

Fix #2: cap _reply_quality at 0.25, add case-insensitive punctuation-stripped matching (weights now sum to exactly 1.0)

Browse files
Files changed (1) hide show
  1. graders.py +14 -6
graders.py CHANGED
@@ -38,14 +38,22 @@ _KEYWORD_REWARDS: Dict[str, list[str]] = {
38
 
39
 
40
  def _reply_quality(reply_text: str, category: str) -> float:
41
- """Return 0.0–0.5 based on how relevant the reply text is."""
 
 
 
 
 
 
42
  if not reply_text:
43
  return 0.0
44
- text_lower = reply_text.lower()
 
 
45
  keywords = _KEYWORD_REWARDS.get(category, [])
46
- hits = sum(1 for kw in keywords if kw in text_lower)
47
- # cap at 0.5 (the other 0.5 comes from action correctness)
48
- return min(0.5, hits * 0.1)
49
 
50
 
51
  # ─────────────────────────── Task 1 ────────────────────────────
@@ -103,7 +111,7 @@ def grade_task3(
103
  Breakdown:
104
  0.20 – classification correct
105
  0.40 – action correct (0.20 if partial)
106
- 0.25 – reply quality (NLP keyword overlap)
107
  0.15 – efficiency bonus (fewer steps → higher bonus)
108
  """
109
  score = 0.0
 
38
 
39
 
40
  def _reply_quality(reply_text: str, category: str) -> float:
41
+ """Return 0.0–0.25 based on how relevant the reply text is.
42
+
43
+ Matching is case-insensitive and punctuation-stripped so that
44
+ replies like 'Resolved.' and 'resolved' score identically.
45
+ Each keyword hit = 0.05, capped at 0.25 (5 hits max).
46
+ Total grade_task3 weights: 0.20 + 0.40 + 0.25 + 0.15 = 1.00
47
+ """
48
  if not reply_text:
49
  return 0.0
50
+ # Strip punctuation and lowercase for robust matching
51
+ import re
52
+ cleaned = re.sub(r'[^\w\s]', ' ', reply_text.lower())
53
  keywords = _KEYWORD_REWARDS.get(category, [])
54
+ hits = sum(1 for kw in keywords if kw in cleaned)
55
+ # cap at 0.25 — reply quality component of grade_task3
56
+ return min(0.25, hits * 0.05)
57
 
58
 
59
  # ─────────────────────────── Task 1 ────────────────────────────
 
111
  Breakdown:
112
  0.20 – classification correct
113
  0.40 – action correct (0.20 if partial)
114
+ 0.25 – reply quality (NLP keyword overlap, case-insensitive, punctuation-stripped)
115
  0.15 – efficiency bonus (fewer steps → higher bonus)
116
  """
117
  score = 0.0