Roopalgn committed on
Commit
ff634dc
·
1 Parent(s): e3cd5c5

Run all tasks by default and keep task scores inside open interval

Browse files
inference.py CHANGED
@@ -307,12 +307,14 @@ def get_tasks_to_run(available_tasks: dict) -> list[int]:
307
  )
308
  raise SystemExit(1)
309
  return [task_id]
310
- if RUN_ALL_TASKS_ENV:
311
- return available_task_ids
312
  if not available_task_ids:
313
  return []
314
- # Default to a single task so evaluation emits exactly one START/END block.
315
- return [available_task_ids[0]]
 
 
 
 
316
 
317
 
318
  # ---------------------------------------------------------------------------
@@ -852,8 +854,6 @@ def run() -> None:
852
  tasks_to_run = get_tasks_to_run(available_tasks)
853
  if not tasks_to_run:
854
  return
855
- single_task_mode = len(tasks_to_run) == 1
856
-
857
  for task_id in tasks_to_run:
858
  if task_id not in available_tasks:
859
  continue
@@ -952,20 +952,12 @@ def run() -> None:
952
  emit_log(
953
  "END",
954
  final_reward=round(final_reward, 4),
 
955
  step_count=step_num,
956
  task_id=task_id,
957
  task_name=task["name"],
958
  )
959
 
960
- overall = [
961
- float(all_results[task_id]["final_reward"])
962
- for task_id in tasks_to_run
963
- if task_id in all_results
964
- ]
965
- if not single_task_mode:
966
- overall_avg = round(sum(overall) / len(overall), 4) if overall else 0.0
967
- emit_log("END", overall_avg=overall_avg, tasks_completed=len(overall))
968
-
969
 
970
  if __name__ == "__main__":
971
  run()
 
307
  )
308
  raise SystemExit(1)
309
  return [task_id]
 
 
310
  if not available_task_ids:
311
  return []
312
+ # Default to all declared tasks so validator-style runs exercise all graders.
313
+ return available_task_ids
314
+
315
+
316
+ def clamp_reported_score(score: float) -> float:
317
+ return max(0.001, min(0.999, score))
318
 
319
 
320
  # ---------------------------------------------------------------------------
 
854
  tasks_to_run = get_tasks_to_run(available_tasks)
855
  if not tasks_to_run:
856
  return
 
 
857
  for task_id in tasks_to_run:
858
  if task_id not in available_tasks:
859
  continue
 
952
  emit_log(
953
  "END",
954
  final_reward=round(final_reward, 4),
955
+ score=round(clamp_reported_score(final_reward), 4),
956
  step_count=step_num,
957
  task_id=task_id,
958
  task_name=task["name"],
959
  )
960
 
 
 
 
 
 
 
 
 
 
961
 
962
  if __name__ == "__main__":
963
  run()
server/environment.py CHANGED
@@ -14,6 +14,7 @@ from models import (
14
  )
15
  from server.grader import grade_action
16
  from server.reward import (
 
17
  compute_step_adjustments,
18
  compute_trajectory_adjustments,
19
  )
@@ -310,7 +311,7 @@ class HelpdeskTicketRoutingEnvironment(
310
  )
311
  trajectory_reward = trajectory_components["final_reward"]
312
  rubric_reward = self._apply_episode_economics(trajectory_reward)
313
- final_reward = max(0.0, min(1.0, rubric_reward - context_penalty))
314
  self._state.total_reward = rubric_reward
315
  investigation_penalty = self._compute_episode_penalty()
316
  else:
@@ -403,7 +404,7 @@ class HelpdeskTicketRoutingEnvironment(
403
 
404
  def _apply_episode_economics(self, base_reward: float) -> float:
405
  penalty = self._compute_episode_penalty()
406
- return max(0.0, min(1.0, base_reward - penalty))
407
 
408
  def _current_average_score(self) -> float:
409
  if not self._state.per_ticket_scores:
 
14
  )
15
  from server.grader import grade_action
16
  from server.reward import (
17
+ clamp_open_unit_interval,
18
  compute_step_adjustments,
19
  compute_trajectory_adjustments,
20
  )
 
311
  )
312
  trajectory_reward = trajectory_components["final_reward"]
313
  rubric_reward = self._apply_episode_economics(trajectory_reward)
314
+ final_reward = clamp_open_unit_interval(rubric_reward - context_penalty)
315
  self._state.total_reward = rubric_reward
316
  investigation_penalty = self._compute_episode_penalty()
317
  else:
 
404
 
405
  def _apply_episode_economics(self, base_reward: float) -> float:
406
  penalty = self._compute_episode_penalty()
407
+ return clamp_open_unit_interval(base_reward - penalty)
408
 
409
  def _current_average_score(self) -> float:
410
  if not self._state.per_ticket_scores:
server/reward.py CHANGED
@@ -8,12 +8,17 @@ DELTA_REWARD_WEIGHT = 0.08
8
  DELTA_REWARD_CAP = 0.04
9
  PROCESS_BONUS_CAP = 0.08
10
  RISK_PENALTY_CAP = 0.12
 
11
 
12
 
13
  def _clamp_unit_interval(value: float) -> float:
14
  return max(0.0, min(1.0, value))
15
 
16
 
 
 
 
 
17
  def compute_step_adjustments(
18
  score: float,
19
  *,
@@ -88,7 +93,7 @@ def compute_trajectory_adjustments(
88
  avg = sum(per_ticket_scores) / len(per_ticket_scores)
89
  bounded_completion_bonus = max(0.0, min(0.08, completion_bonus))
90
  bounded_consistency_bonus = max(0.0, min(0.05, consistency_bonus))
91
- final_reward = _clamp_unit_interval(
92
  avg + bounded_completion_bonus + bounded_consistency_bonus
93
  )
94
  return {
 
8
  DELTA_REWARD_CAP = 0.04
9
  PROCESS_BONUS_CAP = 0.08
10
  RISK_PENALTY_CAP = 0.12
11
+ OPEN_INTERVAL_EPSILON = 0.001
12
 
13
 
14
  def _clamp_unit_interval(value: float) -> float:
15
  return max(0.0, min(1.0, value))
16
 
17
 
18
+ def clamp_open_unit_interval(value: float, epsilon: float = OPEN_INTERVAL_EPSILON) -> float:
19
+ return max(epsilon, min(1.0 - epsilon, value))
20
+
21
+
22
  def compute_step_adjustments(
23
  score: float,
24
  *,
 
93
  avg = sum(per_ticket_scores) / len(per_ticket_scores)
94
  bounded_completion_bonus = max(0.0, min(0.08, completion_bonus))
95
  bounded_consistency_bonus = max(0.0, min(0.05, consistency_bonus))
96
+ final_reward = clamp_open_unit_interval(
97
  avg + bounded_completion_bonus + bounded_consistency_bonus
98
  )
99
  return {
tests/test_competitive_upgrade.py CHANGED
@@ -710,7 +710,7 @@ class TestQueueEconomics(unittest.TestCase):
710
  final_obs = env.step(HelpdeskTicketAction(issue_type=ticket.issue_type))
711
 
712
  self.assertTrue(final_obs.done)
713
- self.assertAlmostEqual(final_obs.reward, 0.98, places=9)
714
 
715
 
716
  class TestTerminalInvalidActionFinalReward(unittest.TestCase):
 
710
  final_obs = env.step(HelpdeskTicketAction(issue_type=ticket.issue_type))
711
 
712
  self.assertTrue(final_obs.done)
713
+ self.assertAlmostEqual(final_obs.reward, 0.979, places=9)
714
 
715
 
716
  class TestTerminalInvalidActionFinalReward(unittest.TestCase):
tests/test_inference_unit.py CHANGED
@@ -187,7 +187,7 @@ class InferenceUnitTests(unittest.TestCase):
187
 
188
  self.assertEqual(
189
  inference.get_tasks_to_run({1: {}, 2: {}, 3: {}}),
190
- [1],
191
  )
192
 
193
  def test_run_all_tasks_override_keeps_local_batch_mode_available(self) -> None:
 
187
 
188
  self.assertEqual(
189
  inference.get_tasks_to_run({1: {}, 2: {}, 3: {}}),
190
+ [1, 2, 3],
191
  )
192
 
193
  def test_run_all_tasks_override_keeps_local_batch_mode_available(self) -> None: