Spaces:
Running
Running
Run all tasks by default and keep task scores inside open interval
Browse files
- inference.py +7 -15
- server/environment.py +3 -2
- server/reward.py +6 -1
- tests/test_competitive_upgrade.py +1 -1
- tests/test_inference_unit.py +1 -1
inference.py
CHANGED
|
@@ -307,12 +307,14 @@ def get_tasks_to_run(available_tasks: dict) -> list[int]:
|
|
| 307 |
)
|
| 308 |
raise SystemExit(1)
|
| 309 |
return [task_id]
|
| 310 |
-
if RUN_ALL_TASKS_ENV:
|
| 311 |
-
return available_task_ids
|
| 312 |
if not available_task_ids:
|
| 313 |
return []
|
| 314 |
-
# Default to
|
| 315 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
|
| 317 |
|
| 318 |
# ---------------------------------------------------------------------------
|
|
@@ -852,8 +854,6 @@ def run() -> None:
|
|
| 852 |
tasks_to_run = get_tasks_to_run(available_tasks)
|
| 853 |
if not tasks_to_run:
|
| 854 |
return
|
| 855 |
-
single_task_mode = len(tasks_to_run) == 1
|
| 856 |
-
|
| 857 |
for task_id in tasks_to_run:
|
| 858 |
if task_id not in available_tasks:
|
| 859 |
continue
|
|
@@ -952,20 +952,12 @@ def run() -> None:
|
|
| 952 |
emit_log(
|
| 953 |
"END",
|
| 954 |
final_reward=round(final_reward, 4),
|
|
|
|
| 955 |
step_count=step_num,
|
| 956 |
task_id=task_id,
|
| 957 |
task_name=task["name"],
|
| 958 |
)
|
| 959 |
|
| 960 |
-
overall = [
|
| 961 |
-
float(all_results[task_id]["final_reward"])
|
| 962 |
-
for task_id in tasks_to_run
|
| 963 |
-
if task_id in all_results
|
| 964 |
-
]
|
| 965 |
-
if not single_task_mode:
|
| 966 |
-
overall_avg = round(sum(overall) / len(overall), 4) if overall else 0.0
|
| 967 |
-
emit_log("END", overall_avg=overall_avg, tasks_completed=len(overall))
|
| 968 |
-
|
| 969 |
|
| 970 |
if __name__ == "__main__":
|
| 971 |
run()
|
|
|
|
| 307 |
)
|
| 308 |
raise SystemExit(1)
|
| 309 |
return [task_id]
|
|
|
|
|
|
|
| 310 |
if not available_task_ids:
|
| 311 |
return []
|
| 312 |
+
# Default to all declared tasks so validator-style runs exercise all graders.
|
| 313 |
+
return available_task_ids
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
def clamp_reported_score(score: float) -> float:
|
| 317 |
+
return max(0.001, min(0.999, score))
|
| 318 |
|
| 319 |
|
| 320 |
# ---------------------------------------------------------------------------
|
|
|
|
| 854 |
tasks_to_run = get_tasks_to_run(available_tasks)
|
| 855 |
if not tasks_to_run:
|
| 856 |
return
|
|
|
|
|
|
|
| 857 |
for task_id in tasks_to_run:
|
| 858 |
if task_id not in available_tasks:
|
| 859 |
continue
|
|
|
|
| 952 |
emit_log(
|
| 953 |
"END",
|
| 954 |
final_reward=round(final_reward, 4),
|
| 955 |
+
score=round(clamp_reported_score(final_reward), 4),
|
| 956 |
step_count=step_num,
|
| 957 |
task_id=task_id,
|
| 958 |
task_name=task["name"],
|
| 959 |
)
|
| 960 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 961 |
|
| 962 |
if __name__ == "__main__":
|
| 963 |
run()
|
server/environment.py
CHANGED
|
@@ -14,6 +14,7 @@ from models import (
|
|
| 14 |
)
|
| 15 |
from server.grader import grade_action
|
| 16 |
from server.reward import (
|
|
|
|
| 17 |
compute_step_adjustments,
|
| 18 |
compute_trajectory_adjustments,
|
| 19 |
)
|
|
@@ -310,7 +311,7 @@ class HelpdeskTicketRoutingEnvironment(
|
|
| 310 |
)
|
| 311 |
trajectory_reward = trajectory_components["final_reward"]
|
| 312 |
rubric_reward = self._apply_episode_economics(trajectory_reward)
|
| 313 |
-
final_reward =
|
| 314 |
self._state.total_reward = rubric_reward
|
| 315 |
investigation_penalty = self._compute_episode_penalty()
|
| 316 |
else:
|
|
@@ -403,7 +404,7 @@ class HelpdeskTicketRoutingEnvironment(
|
|
| 403 |
|
| 404 |
def _apply_episode_economics(self, base_reward: float) -> float:
|
| 405 |
penalty = self._compute_episode_penalty()
|
| 406 |
-
return
|
| 407 |
|
| 408 |
def _current_average_score(self) -> float:
|
| 409 |
if not self._state.per_ticket_scores:
|
|
|
|
| 14 |
)
|
| 15 |
from server.grader import grade_action
|
| 16 |
from server.reward import (
|
| 17 |
+
clamp_open_unit_interval,
|
| 18 |
compute_step_adjustments,
|
| 19 |
compute_trajectory_adjustments,
|
| 20 |
)
|
|
|
|
| 311 |
)
|
| 312 |
trajectory_reward = trajectory_components["final_reward"]
|
| 313 |
rubric_reward = self._apply_episode_economics(trajectory_reward)
|
| 314 |
+
final_reward = clamp_open_unit_interval(rubric_reward - context_penalty)
|
| 315 |
self._state.total_reward = rubric_reward
|
| 316 |
investigation_penalty = self._compute_episode_penalty()
|
| 317 |
else:
|
|
|
|
| 404 |
|
| 405 |
def _apply_episode_economics(self, base_reward: float) -> float:
|
| 406 |
penalty = self._compute_episode_penalty()
|
| 407 |
+
return clamp_open_unit_interval(base_reward - penalty)
|
| 408 |
|
| 409 |
def _current_average_score(self) -> float:
|
| 410 |
if not self._state.per_ticket_scores:
|
server/reward.py
CHANGED
|
@@ -8,12 +8,17 @@ DELTA_REWARD_WEIGHT = 0.08
|
|
| 8 |
DELTA_REWARD_CAP = 0.04
|
| 9 |
PROCESS_BONUS_CAP = 0.08
|
| 10 |
RISK_PENALTY_CAP = 0.12
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
def _clamp_unit_interval(value: float) -> float:
|
| 14 |
return max(0.0, min(1.0, value))
|
| 15 |
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
def compute_step_adjustments(
|
| 18 |
score: float,
|
| 19 |
*,
|
|
@@ -88,7 +93,7 @@ def compute_trajectory_adjustments(
|
|
| 88 |
avg = sum(per_ticket_scores) / len(per_ticket_scores)
|
| 89 |
bounded_completion_bonus = max(0.0, min(0.08, completion_bonus))
|
| 90 |
bounded_consistency_bonus = max(0.0, min(0.05, consistency_bonus))
|
| 91 |
-
final_reward =
|
| 92 |
avg + bounded_completion_bonus + bounded_consistency_bonus
|
| 93 |
)
|
| 94 |
return {
|
|
|
|
| 8 |
DELTA_REWARD_CAP = 0.04
|
| 9 |
PROCESS_BONUS_CAP = 0.08
|
| 10 |
RISK_PENALTY_CAP = 0.12
|
| 11 |
+
OPEN_INTERVAL_EPSILON = 0.001
|
| 12 |
|
| 13 |
|
| 14 |
def _clamp_unit_interval(value: float) -> float:
|
| 15 |
return max(0.0, min(1.0, value))
|
| 16 |
|
| 17 |
|
| 18 |
+
def clamp_open_unit_interval(value: float, epsilon: float = OPEN_INTERVAL_EPSILON) -> float:
|
| 19 |
+
return max(epsilon, min(1.0 - epsilon, value))
|
| 20 |
+
|
| 21 |
+
|
| 22 |
def compute_step_adjustments(
|
| 23 |
score: float,
|
| 24 |
*,
|
|
|
|
| 93 |
avg = sum(per_ticket_scores) / len(per_ticket_scores)
|
| 94 |
bounded_completion_bonus = max(0.0, min(0.08, completion_bonus))
|
| 95 |
bounded_consistency_bonus = max(0.0, min(0.05, consistency_bonus))
|
| 96 |
+
final_reward = clamp_open_unit_interval(
|
| 97 |
avg + bounded_completion_bonus + bounded_consistency_bonus
|
| 98 |
)
|
| 99 |
return {
|
tests/test_competitive_upgrade.py
CHANGED
|
@@ -710,7 +710,7 @@ class TestQueueEconomics(unittest.TestCase):
|
|
| 710 |
final_obs = env.step(HelpdeskTicketAction(issue_type=ticket.issue_type))
|
| 711 |
|
| 712 |
self.assertTrue(final_obs.done)
|
| 713 |
-
self.assertAlmostEqual(final_obs.reward, 0.
|
| 714 |
|
| 715 |
|
| 716 |
class TestTerminalInvalidActionFinalReward(unittest.TestCase):
|
|
|
|
| 710 |
final_obs = env.step(HelpdeskTicketAction(issue_type=ticket.issue_type))
|
| 711 |
|
| 712 |
self.assertTrue(final_obs.done)
|
| 713 |
+
self.assertAlmostEqual(final_obs.reward, 0.979, places=9)
|
| 714 |
|
| 715 |
|
| 716 |
class TestTerminalInvalidActionFinalReward(unittest.TestCase):
|
tests/test_inference_unit.py
CHANGED
|
@@ -187,7 +187,7 @@ class InferenceUnitTests(unittest.TestCase):
|
|
| 187 |
|
| 188 |
self.assertEqual(
|
| 189 |
inference.get_tasks_to_run({1: {}, 2: {}, 3: {}}),
|
| 190 |
-
[1],
|
| 191 |
)
|
| 192 |
|
| 193 |
def test_run_all_tasks_override_keeps_local_batch_mode_available(self) -> None:
|
|
|
|
| 187 |
|
| 188 |
self.assertEqual(
|
| 189 |
inference.get_tasks_to_run({1: {}, 2: {}, 3: {}}),
|
| 190 |
+
[1, 2, 3],
|
| 191 |
)
|
| 192 |
|
| 193 |
def test_run_all_tasks_override_keeps_local_batch_mode_available(self) -> None:
|