ArjunSharmaSSS committed on
Commit
b0f5c84
·
1 Parent(s): 4b86450

fix(graders): clamp task scores strictly inside (0, 1)

Browse files

Submission validator rejects task scores at the 0.0 or 1.0 boundary.
Clamp every grader's output to [0.01, 0.99] via a shared _clamp helper
so perfect runs and total failures stay strictly inside the open
interval while preserving the relative ordering of scores.

Files changed (1) hide show
  1. graders.py +21 -12
graders.py CHANGED
@@ -8,6 +8,15 @@ from __future__ import annotations
8
 
9
  from models import EpisodeStep
10
 
 
 
 
 
 
 
 
 
 
11
 
12
  # ---------------------------------------------------------------------------
13
  # Task Easy β€” Single Spike
@@ -15,22 +24,22 @@ from models import EpisodeStep
15
 
16
  def grade_task_easy(history: list[EpisodeStep]) -> float:
17
  """
18
- Score:
19
- 1.0 β†’ no crash AND avg latency across all steps < 300 ms
20
- 0.5 β†’ no crash but avg latency >= 300 ms
21
- 0.0 β†’ any crash occurred
22
  """
23
  if not history:
24
- return 0.0
25
 
26
  crashed = any(s.crashed for s in history)
27
  if crashed:
28
- return 0.0
29
 
30
  avg_latency = sum(s.state.avg_latency for s in history) / len(history)
31
  if avg_latency < 300.0:
32
- return 1.0
33
- return 0.5
34
 
35
 
36
  # ---------------------------------------------------------------------------
@@ -47,7 +56,7 @@ def grade_task_medium(history: list[EpisodeStep]) -> float:
47
  linear interpolation in between
48
  """
49
  if not history:
50
- return 0.0
51
 
52
  total = len(history)
53
  crash_steps = sum(1 for s in history if s.crashed)
@@ -63,7 +72,7 @@ def grade_task_medium(history: list[EpisodeStep]) -> float:
63
  else:
64
  latency_factor = 1.0 - 0.5 * (avg_latency - low) / (high - low)
65
 
66
- return round(base * latency_factor, 4)
67
 
68
 
69
  # ---------------------------------------------------------------------------
@@ -79,7 +88,7 @@ def grade_task_hard(history: list[EpisodeStep]) -> float:
79
  queue_factor = fraction of steps where queue_length < 100
80
  """
81
  if not history:
82
- return 0.0
83
 
84
  total_incoming = sum(s.incoming_requests for s in history)
85
  total_allowed = sum(s.allowed_requests for s in history)
@@ -104,7 +113,7 @@ def grade_task_hard(history: list[EpisodeStep]) -> float:
104
  else:
105
  score = throughput_ratio * 0.7 + queue_factor * 0.3
106
 
107
- return round(min(1.0, max(0.0, score)), 4)
108
 
109
 
110
  # ---------------------------------------------------------------------------
 
8
 
9
  from models import EpisodeStep
10
 
11
+ # Scores must lie strictly inside (0, 1) per the submission validator,
12
+ # so we clamp every grader's output to this open interval.
13
+ SCORE_MIN = 0.01
14
+ SCORE_MAX = 0.99
15
+
16
+
17
+ def _clamp(score: float) -> float:
18
+ return round(max(SCORE_MIN, min(SCORE_MAX, score)), 4)
19
+
20
 
21
  # ---------------------------------------------------------------------------
22
  # Task Easy β€” Single Spike
 
24
 
25
  def grade_task_easy(history: list[EpisodeStep]) -> float:
26
  """
27
+ Score (clamped to (0, 1)):
28
+ ~0.99 β†’ no crash AND avg latency across all steps < 300 ms
29
+ ~0.50 β†’ no crash but avg latency >= 300 ms
30
+ ~0.01 β†’ any crash occurred
31
  """
32
  if not history:
33
+ return SCORE_MIN
34
 
35
  crashed = any(s.crashed for s in history)
36
  if crashed:
37
+ return SCORE_MIN
38
 
39
  avg_latency = sum(s.state.avg_latency for s in history) / len(history)
40
  if avg_latency < 300.0:
41
+ return _clamp(1.0)
42
+ return _clamp(0.5)
43
 
44
 
45
  # ---------------------------------------------------------------------------
 
56
  linear interpolation in between
57
  """
58
  if not history:
59
+ return SCORE_MIN
60
 
61
  total = len(history)
62
  crash_steps = sum(1 for s in history if s.crashed)
 
72
  else:
73
  latency_factor = 1.0 - 0.5 * (avg_latency - low) / (high - low)
74
 
75
+ return _clamp(base * latency_factor)
76
 
77
 
78
  # ---------------------------------------------------------------------------
 
88
  queue_factor = fraction of steps where queue_length < 100
89
  """
90
  if not history:
91
+ return SCORE_MIN
92
 
93
  total_incoming = sum(s.incoming_requests for s in history)
94
  total_allowed = sum(s.allowed_requests for s in history)
 
113
  else:
114
  score = throughput_ratio * 0.7 + queue_factor * 0.3
115
 
116
+ return _clamp(score)
117
 
118
 
119
  # ---------------------------------------------------------------------------