Roopalgn committed on
Commit
e3dfee6
·
1 Parent(s): ff634dc

Clamp grader task scores to the open interval (0, 1)

Browse files
server/grader.py CHANGED
@@ -2,6 +2,8 @@ from __future__ import annotations
2
 
3
  from models import HelpdeskTicketAction, HelpdeskTicketRecord
4
 
 
 
5
 
6
  ISSUE_TYPE_SIMILARITY = {
7
  ("billing_license", "service_request"): 0.4,
@@ -98,6 +100,7 @@ def grade_action(
98
  }
99
 
100
  weights = TASK_WEIGHTS[task_id]
101
- score = sum(field_scores[field] * weight for field, weight in weights.items())
 
102
  breakdown = {field: field_scores[field] for field in weights}
103
  return score, breakdown
 
2
 
3
  from models import HelpdeskTicketAction, HelpdeskTicketRecord
4
 
5
+ TASK_SCORE_EPSILON = 0.001
6
+
7
 
8
  ISSUE_TYPE_SIMILARITY = {
9
  ("billing_license", "service_request"): 0.4,
 
100
  }
101
 
102
  weights = TASK_WEIGHTS[task_id]
103
+ raw_score = sum(field_scores[field] * weight for field, weight in weights.items())
104
+ score = max(TASK_SCORE_EPSILON, min(1.0 - TASK_SCORE_EPSILON, raw_score))
105
  breakdown = {field: field_scores[field] for field in weights}
106
  return score, breakdown
tests/test_competitive_upgrade.py CHANGED
@@ -746,9 +746,9 @@ class TestTerminalInvalidActionFinalReward(unittest.TestCase):
746
  )
747
 
748
  self.assertTrue(final_obs.done)
749
- self.assertAlmostEqual(final_obs.reward, 0.5, places=9)
750
- self.assertAlmostEqual(env.state.total_reward, 0.5, places=9)
751
- self.assertAlmostEqual(env.state.reward or 0.0, 0.5, places=9)
752
 
753
 
754
  # ---------------------------------------------------------------------------
 
746
  )
747
 
748
  self.assertTrue(final_obs.done)
749
+ self.assertAlmostEqual(final_obs.reward, 0.4995, places=9)
750
+ self.assertAlmostEqual(env.state.total_reward, 0.4995, places=9)
751
+ self.assertAlmostEqual(env.state.reward or 0.0, 0.4995, places=9)
752
 
753
 
754
  # ---------------------------------------------------------------------------
tests/test_grader_unit.py CHANGED
@@ -45,7 +45,7 @@ class GraderUnitTests(unittest.TestCase):
45
 
46
  score, breakdown = grade_action(action, ticket, task_id=3)
47
 
48
- self.assertAlmostEqual(score, 1.0)
49
  self.assertEqual(
50
  breakdown,
51
  {
@@ -81,13 +81,14 @@ class GraderUnitTests(unittest.TestCase):
81
 
82
  score, breakdown = grade_action(action, ticket, task_id=1)
83
 
84
- expected_score = (
85
  1.0
86
  if predicted == expected
87
  else ISSUE_TYPE_SIMILARITY.get((predicted, expected), 0.0)
88
  )
89
- self.assertAlmostEqual(score, expected_score)
90
- self.assertEqual(breakdown, {"issue_type": expected_score})
 
91
 
92
  def test_unrelated_issue_type_gets_zero_not_fuzzy_credit(self) -> None:
93
  ticket = _ticket(issue_type="onboarding")
@@ -95,7 +96,7 @@ class GraderUnitTests(unittest.TestCase):
95
 
96
  score, breakdown = grade_action(action, ticket, task_id=1)
97
 
98
- self.assertAlmostEqual(score, 0.0)
99
  self.assertEqual(breakdown, {"issue_type": 0.0})
100
 
101
  def test_priority_scoring_uses_defined_proximity_table(self) -> None:
@@ -129,7 +130,9 @@ class GraderUnitTests(unittest.TestCase):
129
  breakdown,
130
  {"issue_type": 1.0, "priority": priority_score},
131
  )
132
- self.assertAlmostEqual(score, 0.6 + 0.4 * priority_score)
 
 
133
 
134
  def test_task_2_weights_apply_as_documented(self) -> None:
135
  ticket = _ticket(priority="high")
 
45
 
46
  score, breakdown = grade_action(action, ticket, task_id=3)
47
 
48
+ self.assertAlmostEqual(score, 0.999)
49
  self.assertEqual(
50
  breakdown,
51
  {
 
81
 
82
  score, breakdown = grade_action(action, ticket, task_id=1)
83
 
84
+ raw_expected_score = (
85
  1.0
86
  if predicted == expected
87
  else ISSUE_TYPE_SIMILARITY.get((predicted, expected), 0.0)
88
  )
89
+ expected_task_score = max(0.001, min(0.999, raw_expected_score))
90
+ self.assertAlmostEqual(score, expected_task_score)
91
+ self.assertEqual(breakdown, {"issue_type": raw_expected_score})
92
 
93
  def test_unrelated_issue_type_gets_zero_not_fuzzy_credit(self) -> None:
94
  ticket = _ticket(issue_type="onboarding")
 
96
 
97
  score, breakdown = grade_action(action, ticket, task_id=1)
98
 
99
+ self.assertAlmostEqual(score, 0.001)
100
  self.assertEqual(breakdown, {"issue_type": 0.0})
101
 
102
  def test_priority_scoring_uses_defined_proximity_table(self) -> None:
 
130
  breakdown,
131
  {"issue_type": 1.0, "priority": priority_score},
132
  )
133
+ raw_score = 0.6 + 0.4 * priority_score
134
+ expected_task_score = max(0.001, min(0.999, raw_score))
135
+ self.assertAlmostEqual(score, expected_task_score)
136
 
137
  def test_task_2_weights_apply_as_documented(self) -> None:
138
  ticket = _ticket(priority="high")