Spaces:
Sleeping
Sleeping
Commit ·
71d64d0
1
Parent(s): e225fd7
Final Changes
Browse files
- environment/graders.py +10 -1
- tests/test_env.py +4 -2
environment/graders.py
CHANGED
|
@@ -11,6 +11,14 @@ class TaskGrader:
|
|
| 11 |
def _normalize(text: str) -> str:
|
| 12 |
return text.lower().strip()
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
def _match_issue(self, expected: Dict[str, Any], comment: Comment) -> bool:
|
| 15 |
expected_line = expected.get("line")
|
| 16 |
expected_type = self._normalize(expected.get("type", ""))
|
|
@@ -156,7 +164,8 @@ class TaskGrader:
|
|
| 156 |
efficiency_bonus = self.grade_efficiency(steps_taken, max_steps)
|
| 157 |
|
| 158 |
raw_score = (detection_score * 0.4) + (suggestion_score * 0.3) + (decision_score * 0.3)
|
| 159 |
-
|
|
|
|
| 160 |
|
| 161 |
return {
|
| 162 |
"expected_issue_count": expected_count,
|
|
|
|
| 11 |
def _normalize(text: str) -> str:
|
| 12 |
return text.lower().strip()
|
| 13 |
|
| 14 |
+
@staticmethod
|
| 15 |
+
def _to_open_interval(score: float, epsilon: float = 1e-4) -> float:
|
| 16 |
+
if score <= 0.0:
|
| 17 |
+
return epsilon
|
| 18 |
+
if score >= 1.0:
|
| 19 |
+
return 1.0 - epsilon
|
| 20 |
+
return score
|
| 21 |
+
|
| 22 |
def _match_issue(self, expected: Dict[str, Any], comment: Comment) -> bool:
|
| 23 |
expected_line = expected.get("line")
|
| 24 |
expected_type = self._normalize(expected.get("type", ""))
|
|
|
|
| 164 |
efficiency_bonus = self.grade_efficiency(steps_taken, max_steps)
|
| 165 |
|
| 166 |
raw_score = (detection_score * 0.4) + (suggestion_score * 0.3) + (decision_score * 0.3)
|
| 167 |
+
bounded_score = max(0.0, min(1.0, raw_score - false_positive_penalty + efficiency_bonus))
|
| 168 |
+
final_score = self._to_open_interval(bounded_score)
|
| 169 |
|
| 170 |
return {
|
| 171 |
"expected_issue_count": expected_count,
|
tests/test_env.py
CHANGED
|
@@ -222,7 +222,8 @@ class TestCodeReviewEnv(unittest.TestCase):
|
|
| 222 |
|
| 223 |
self.assertTrue(done)
|
| 224 |
self.assertEqual(obs["final_decision_made"], "approved")
|
| 225 |
-
self.
|
|
|
|
| 226 |
self.assertIn("diagnostics", info)
|
| 227 |
self.assertEqual(info["diagnostics"]["false_positive_count"], 0)
|
| 228 |
|
|
@@ -363,7 +364,8 @@ class TestCodeReviewEnv(unittest.TestCase):
|
|
| 363 |
obs, _, done, info = self.env.step(action.model_dump())
|
| 364 |
self.assertTrue(done)
|
| 365 |
self.assertEqual(obs["final_decision_made"], "approved")
|
| 366 |
-
self.
|
|
|
|
| 367 |
|
| 368 |
def test_new_task_categories_registered(self):
|
| 369 |
task_ids = {t["task_id"] for t in TaskDefinitions.get_all_tasks()}
|
|
|
|
| 222 |
|
| 223 |
self.assertTrue(done)
|
| 224 |
self.assertEqual(obs["final_decision_made"], "approved")
|
| 225 |
+
self.assertGreater(info["task_score"], 0.0)
|
| 226 |
+
self.assertLess(info["task_score"], 1.0)
|
| 227 |
self.assertIn("diagnostics", info)
|
| 228 |
self.assertEqual(info["diagnostics"]["false_positive_count"], 0)
|
| 229 |
|
|
|
|
| 364 |
obs, _, done, info = self.env.step(action.model_dump())
|
| 365 |
self.assertTrue(done)
|
| 366 |
self.assertEqual(obs["final_decision_made"], "approved")
|
| 367 |
+
self.assertGreater(info["task_score"], 0.0)
|
| 368 |
+
self.assertLess(info["task_score"], 1.0)
|
| 369 |
|
| 370 |
def test_new_task_categories_registered(self):
|
| 371 |
task_ids = {t["task_id"] for t in TaskDefinitions.get_all_tasks()}
|