Spaces:
Running
Running
Clamp grader task scores to open interval
Browse files
- server/grader.py +4 -1
- tests/test_competitive_upgrade.py +3 -3
- tests/test_grader_unit.py +9 -6
server/grader.py
CHANGED
|
@@ -2,6 +2,8 @@ from __future__ import annotations
|
|
| 2 |
|
| 3 |
from models import HelpdeskTicketAction, HelpdeskTicketRecord
|
| 4 |
|
|
|
|
|
|
|
| 5 |
|
| 6 |
ISSUE_TYPE_SIMILARITY = {
|
| 7 |
("billing_license", "service_request"): 0.4,
|
|
@@ -98,6 +100,7 @@ def grade_action(
|
|
| 98 |
}
|
| 99 |
|
| 100 |
weights = TASK_WEIGHTS[task_id]
|
| 101 |
-
|
|
|
|
| 102 |
breakdown = {field: field_scores[field] for field in weights}
|
| 103 |
return score, breakdown
|
|
|
|
| 2 |
|
| 3 |
from models import HelpdeskTicketAction, HelpdeskTicketRecord
|
| 4 |
|
| 5 |
+
TASK_SCORE_EPSILON = 0.001
|
| 6 |
+
|
| 7 |
|
| 8 |
ISSUE_TYPE_SIMILARITY = {
|
| 9 |
("billing_license", "service_request"): 0.4,
|
|
|
|
| 100 |
}
|
| 101 |
|
| 102 |
weights = TASK_WEIGHTS[task_id]
|
| 103 |
+
raw_score = sum(field_scores[field] * weight for field, weight in weights.items())
|
| 104 |
+
score = max(TASK_SCORE_EPSILON, min(1.0 - TASK_SCORE_EPSILON, raw_score))
|
| 105 |
breakdown = {field: field_scores[field] for field in weights}
|
| 106 |
return score, breakdown
|
tests/test_competitive_upgrade.py
CHANGED
|
@@ -746,9 +746,9 @@ class TestTerminalInvalidActionFinalReward(unittest.TestCase):
|
|
| 746 |
)
|
| 747 |
|
| 748 |
self.assertTrue(final_obs.done)
|
| 749 |
-
self.assertAlmostEqual(final_obs.reward, 0.
|
| 750 |
-
self.assertAlmostEqual(env.state.total_reward, 0.
|
| 751 |
-
self.assertAlmostEqual(env.state.reward or 0.0, 0.
|
| 752 |
|
| 753 |
|
| 754 |
# ---------------------------------------------------------------------------
|
|
|
|
| 746 |
)
|
| 747 |
|
| 748 |
self.assertTrue(final_obs.done)
|
| 749 |
+
self.assertAlmostEqual(final_obs.reward, 0.4995, places=9)
|
| 750 |
+
self.assertAlmostEqual(env.state.total_reward, 0.4995, places=9)
|
| 751 |
+
self.assertAlmostEqual(env.state.reward or 0.0, 0.4995, places=9)
|
| 752 |
|
| 753 |
|
| 754 |
# ---------------------------------------------------------------------------
|
tests/test_grader_unit.py
CHANGED
|
@@ -45,7 +45,7 @@ class GraderUnitTests(unittest.TestCase):
|
|
| 45 |
|
| 46 |
score, breakdown = grade_action(action, ticket, task_id=3)
|
| 47 |
|
| 48 |
-
self.assertAlmostEqual(score,
|
| 49 |
self.assertEqual(
|
| 50 |
breakdown,
|
| 51 |
{
|
|
@@ -81,13 +81,14 @@ class GraderUnitTests(unittest.TestCase):
|
|
| 81 |
|
| 82 |
score, breakdown = grade_action(action, ticket, task_id=1)
|
| 83 |
|
| 84 |
-
|
| 85 |
1.0
|
| 86 |
if predicted == expected
|
| 87 |
else ISSUE_TYPE_SIMILARITY.get((predicted, expected), 0.0)
|
| 88 |
)
|
| 89 |
-
|
| 90 |
-
self.
|
|
|
|
| 91 |
|
| 92 |
def test_unrelated_issue_type_gets_zero_not_fuzzy_credit(self) -> None:
|
| 93 |
ticket = _ticket(issue_type="onboarding")
|
|
@@ -95,7 +96,7 @@ class GraderUnitTests(unittest.TestCase):
|
|
| 95 |
|
| 96 |
score, breakdown = grade_action(action, ticket, task_id=1)
|
| 97 |
|
| 98 |
-
self.assertAlmostEqual(score, 0.0)
|
| 99 |
self.assertEqual(breakdown, {"issue_type": 0.0})
|
| 100 |
|
| 101 |
def test_priority_scoring_uses_defined_proximity_table(self) -> None:
|
|
@@ -129,7 +130,9 @@ class GraderUnitTests(unittest.TestCase):
|
|
| 129 |
breakdown,
|
| 130 |
{"issue_type": 1.0, "priority": priority_score},
|
| 131 |
)
|
| 132 |
-
|
|
|
|
|
|
|
| 133 |
|
| 134 |
def test_task_2_weights_apply_as_documented(self) -> None:
|
| 135 |
ticket = _ticket(priority="high")
|
|
|
|
| 45 |
|
| 46 |
score, breakdown = grade_action(action, ticket, task_id=3)
|
| 47 |
|
| 48 |
+
self.assertAlmostEqual(score, 0.999)
|
| 49 |
self.assertEqual(
|
| 50 |
breakdown,
|
| 51 |
{
|
|
|
|
| 81 |
|
| 82 |
score, breakdown = grade_action(action, ticket, task_id=1)
|
| 83 |
|
| 84 |
+
raw_expected_score = (
|
| 85 |
1.0
|
| 86 |
if predicted == expected
|
| 87 |
else ISSUE_TYPE_SIMILARITY.get((predicted, expected), 0.0)
|
| 88 |
)
|
| 89 |
+
expected_task_score = max(0.001, min(0.999, raw_expected_score))
|
| 90 |
+
self.assertAlmostEqual(score, expected_task_score)
|
| 91 |
+
self.assertEqual(breakdown, {"issue_type": raw_expected_score})
|
| 92 |
|
| 93 |
def test_unrelated_issue_type_gets_zero_not_fuzzy_credit(self) -> None:
|
| 94 |
ticket = _ticket(issue_type="onboarding")
|
|
|
|
| 96 |
|
| 97 |
score, breakdown = grade_action(action, ticket, task_id=1)
|
| 98 |
|
| 99 |
+
self.assertAlmostEqual(score, 0.001)
|
| 100 |
self.assertEqual(breakdown, {"issue_type": 0.0})
|
| 101 |
|
| 102 |
def test_priority_scoring_uses_defined_proximity_table(self) -> None:
|
|
|
|
| 130 |
breakdown,
|
| 131 |
{"issue_type": 1.0, "priority": priority_score},
|
| 132 |
)
|
| 133 |
+
raw_score = 0.6 + 0.4 * priority_score
|
| 134 |
+
expected_task_score = max(0.001, min(0.999, raw_score))
|
| 135 |
+
self.assertAlmostEqual(score, expected_task_score)
|
| 136 |
|
| 137 |
def test_task_2_weights_apply_as_documented(self) -> None:
|
| 138 |
ticket = _ticket(priority="high")
|