md896 committed on
Commit
bc9f459
·
1 Parent(s): 87464f9

Enforce strict (0,1) task score outputs for validators

Browse files
server/tasks/base.py CHANGED
@@ -4,6 +4,16 @@ from typing import Dict, Any, List, Optional, Tuple
4
 
5
 
6
  class BaseTask(ABC):
 
 
 
 
 
 
 
 
 
 
7
  """
8
  Abstract base for all tasks.
9
 
@@ -93,19 +103,19 @@ class BaseTask(ABC):
93
  - 0.0: null result, syntax error, or empty when non-empty expected
94
  """
95
  if not actual_rows:
96
- return 0.0
97
 
98
  expected = self.expected_output
99
 
100
  if not expected:
101
  # Expected empty result
102
- return 1.0 if len(actual_rows) == 0 else 0.0
103
 
104
  # Exact row count match
105
  if len(actual_rows) != len(expected):
106
  # Partial credit for getting some rows right
107
  overlap = self._count_matching_rows(actual_rows, expected)
108
- return round(min(0.5, overlap / max(len(expected), 1) * 0.5), 3)
109
 
110
  # Check row-by-row match (order-sensitive if task requires it)
111
  matching = self._count_matching_rows(actual_rows, expected)
@@ -118,7 +128,7 @@ class BaseTask(ABC):
118
  if actual_cols != expected_cols:
119
  score *= 0.7 # Penalty for wrong columns
120
 
121
- return round(score, 3)
122
 
123
  def _count_matching_rows(
124
  self,
 
4
 
5
 
6
  class BaseTask(ABC):
7
+ _MIN_STRICT_SCORE = 0.001
8
+ _MAX_STRICT_SCORE = 0.999
9
+
10
+ def _strict_score(self, score: float) -> float:
11
+ """Keep task score strictly inside (0, 1) for validator compatibility."""
12
+ return round(
13
+ min(self._MAX_STRICT_SCORE, max(self._MIN_STRICT_SCORE, score)),
14
+ 3,
15
+ )
16
+
17
  """
18
  Abstract base for all tasks.
19
 
 
103
  - 0.0: null result, syntax error, or empty when non-empty expected
104
  """
105
  if not actual_rows:
106
+ return self._strict_score(0.0)
107
 
108
  expected = self.expected_output
109
 
110
  if not expected:
111
  # Expected empty result
112
+ return self._strict_score(1.0 if len(actual_rows) == 0 else 0.0)
113
 
114
  # Exact row count match
115
  if len(actual_rows) != len(expected):
116
  # Partial credit for getting some rows right
117
  overlap = self._count_matching_rows(actual_rows, expected)
118
+ return self._strict_score(min(0.5, overlap / max(len(expected), 1) * 0.5))
119
 
120
  # Check row-by-row match (order-sensitive if task requires it)
121
  matching = self._count_matching_rows(actual_rows, expected)
 
128
  if actual_cols != expected_cols:
129
  score *= 0.7 # Penalty for wrong columns
130
 
131
+ return self._strict_score(score)
132
 
133
  def _count_matching_rows(
134
  self,
server/tasks/task_medium.py CHANGED
@@ -127,10 +127,20 @@ class MediumTaskGrader:
127
  """
128
  Custom grader for medium task — handles NULL comparison.
129
  """
 
 
 
 
 
 
 
 
 
 
130
  @staticmethod
131
  def grade(actual: List[Dict]) -> float:
132
  if not actual or len(actual) != 4:
133
- return 0.0
134
 
135
  # Sort both by dept name for comparison
136
  actual_sorted = sorted(actual, key=lambda r: r.get("department_name", ""))
@@ -159,5 +169,5 @@ class MediumTaskGrader:
159
  if dept_ok and count_ok and salary_ok:
160
  matches += 1
161
 
162
- return round(matches / 4, 3)
163
 
 
127
  """
128
  Custom grader for medium task — handles NULL comparison.
129
  """
130
+ _MIN_STRICT_SCORE = 0.001
131
+ _MAX_STRICT_SCORE = 0.999
132
+
133
+ @staticmethod
134
+ def _strict_score(score: float) -> float:
135
+ return round(
136
+ min(MediumTaskGrader._MAX_STRICT_SCORE, max(MediumTaskGrader._MIN_STRICT_SCORE, score)),
137
+ 3,
138
+ )
139
+
140
  @staticmethod
141
  def grade(actual: List[Dict]) -> float:
142
  if not actual or len(actual) != 4:
143
+ return MediumTaskGrader._strict_score(0.0)
144
 
145
  # Sort both by dept name for comparison
146
  actual_sorted = sorted(actual, key=lambda r: r.get("department_name", ""))
 
169
  if dept_ok and count_ok and salary_ok:
170
  matches += 1
171
 
172
+ return MediumTaskGrader._strict_score(matches / 4)
173
 
tests/test_graders.py CHANGED
@@ -9,22 +9,22 @@ class TestGraders(unittest.TestCase):
9
  def test_easy_grade_perfect(self):
10
  task = EasyTask()
11
  score = task.grade(task.expected_output)
12
- self.assertAlmostEqual(score, 1.0, places=3)
13
 
14
  def test_hard_grade_perfect(self):
15
  task = HardTask()
16
  score = task.grade(task.expected_output)
17
- self.assertAlmostEqual(score, 1.0, places=3)
18
 
19
  def test_easy_grade_empty(self):
20
  task = EasyTask()
21
  score = task.grade(None)
22
- self.assertEqual(score, 0.0)
23
 
24
  def test_medium_grader_perfect(self):
25
  task = MediumTask()
26
  score = MediumTaskGrader.grade(task.expected_output)
27
- self.assertAlmostEqual(score, 1.0, places=3)
28
 
29
  def test_medium_grader_partial(self):
30
  # Flip one row's avg_salary so it no longer matches within tolerance.
@@ -37,7 +37,7 @@ class TestGraders(unittest.TestCase):
37
  r["avg_salary"] = 12345.0
38
 
39
  score = MediumTaskGrader.grade(actual)
40
- self.assertLess(score, 1.0)
41
  self.assertAlmostEqual(score, 0.75, places=3)
42
 
43
 
 
9
  def test_easy_grade_perfect(self):
10
  task = EasyTask()
11
  score = task.grade(task.expected_output)
12
+ self.assertAlmostEqual(score, 0.999, places=3)
13
 
14
  def test_hard_grade_perfect(self):
15
  task = HardTask()
16
  score = task.grade(task.expected_output)
17
+ self.assertAlmostEqual(score, 0.999, places=3)
18
 
19
  def test_easy_grade_empty(self):
20
  task = EasyTask()
21
  score = task.grade(None)
22
+ self.assertAlmostEqual(score, 0.001, places=3)
23
 
24
  def test_medium_grader_perfect(self):
25
  task = MediumTask()
26
  score = MediumTaskGrader.grade(task.expected_output)
27
+ self.assertAlmostEqual(score, 0.999, places=3)
28
 
29
  def test_medium_grader_partial(self):
30
  # Flip one row's avg_salary so it no longer matches within tolerance.
 
37
  r["avg_salary"] = 12345.0
38
 
39
  score = MediumTaskGrader.grade(actual)
40
+ self.assertLess(score, 0.999)
41
  self.assertAlmostEqual(score, 0.75, places=3)
42
 
43