Rishav committed on
Commit
9850bda
·
1 Parent(s): 0355a51

Harden v3 scoring contract

Browse files
src/delivery_dispatch_v3/api.py CHANGED
@@ -1,10 +1,10 @@
1
  from __future__ import annotations
2
 
3
- from fastapi import FastAPI
4
 
5
  from .environment import V3DeliveryDispatchEnv
6
  from .models import V3Action
7
- from .task_adapter import PUBLIC_TASK_IDS
8
 
9
 
10
  def create_app() -> FastAPI:
@@ -17,8 +17,11 @@ def create_app() -> FastAPI:
17
 
18
  @app.post("/reset")
19
  def reset(task_id: str | None = None, seed: int | None = None, pool_name: str = "test") -> dict:
20
- if task_id is not None and task_id not in PUBLIC_TASK_IDS:
21
- task_id = task_id
 
 
 
22
  return env.reset(task_id=task_id, seed=seed, pool_name=pool_name).model_dump(mode="json")
23
 
24
  @app.get("/state")
 
1
  from __future__ import annotations
2
 
3
+ from fastapi import FastAPI, HTTPException
4
 
5
  from .environment import V3DeliveryDispatchEnv
6
  from .models import V3Action
7
+ from .task_adapter import PUBLIC_TASK_IDS, is_public_task_id
8
 
9
 
10
  def create_app() -> FastAPI:
 
17
 
18
  @app.post("/reset")
19
  def reset(task_id: str | None = None, seed: int | None = None, pool_name: str = "test") -> dict:
20
+ if task_id is not None and not is_public_task_id(task_id):
21
+ raise HTTPException(
22
+ status_code=400,
23
+ detail=f"Unknown task_id '{task_id}'. Expected one of: {', '.join(PUBLIC_TASK_IDS)}",
24
+ )
25
  return env.reset(task_id=task_id, seed=seed, pool_name=pool_name).model_dump(mode="json")
26
 
27
  @app.get("/state")
src/delivery_dispatch_v3/environment.py CHANGED
@@ -32,6 +32,7 @@ class V3DeliveryDispatchEnv:
32
  self.last_step_reward = 0.0
33
  self.recent_events: list[str] = []
34
  self.done = False
 
35
 
36
  def reset(
37
  self,
@@ -71,6 +72,7 @@ class V3DeliveryDispatchEnv:
71
  self.last_step_reward = 0.0
72
  self.recent_events = ["environment reset"]
73
  self.done = False
 
74
  return self.state()
75
 
76
  def reset_internal(
@@ -92,6 +94,7 @@ class V3DeliveryDispatchEnv:
92
  self.last_step_reward = 0.0
93
  self.recent_events = ["environment reset"]
94
  self.done = False
 
95
  return self.state()
96
 
97
  def state(self) -> V3Observation:
@@ -129,16 +132,22 @@ class V3DeliveryDispatchEnv:
129
  total_rounds=recipe.profile.total_rounds,
130
  total_couriers=recipe.profile.courier_count,
131
  max_repositions_per_round=recipe.profile.max_repositions_per_round,
 
 
 
132
  ),
133
  )
134
 
135
  def step(self, action: V3Action, grade_terminal: bool = True) -> V3StepResult:
136
  if self.done:
 
 
 
137
  return V3StepResult(
138
  observation=self.state(),
139
  reward=V3Reward(step_reward=0.0, cumulative_reward=self.cumulative_reward),
140
  done=True,
141
- info={"message": "episode already finished"},
142
  )
143
 
144
  recipe = self._require_recipe()
@@ -188,13 +197,14 @@ class V3DeliveryDispatchEnv:
188
  seed=self.internal_seed,
189
  raw_reward=self.cumulative_reward,
190
  )
191
- info["episode_summary"] = {
192
  "raw_reward": round(task_result.raw_reward, 3),
193
  "baseline_reward": round(task_result.baseline_reward, 3),
194
  "target_reward": round(task_result.target_reward, 3),
195
  "heuristic_reward": None if task_result.heuristic_reward is None else round(task_result.heuristic_reward, 3),
196
  "graded_score": round(task_result.score, 4),
197
  }
 
198
 
199
  return V3StepResult(
200
  observation=self.state(),
@@ -217,6 +227,7 @@ class V3DeliveryDispatchEnv:
217
  clone.last_step_reward = self.last_step_reward
218
  clone.recent_events = list(self.recent_events)
219
  clone.done = self.done
 
220
  return clone
221
 
222
  def _require_recipe(self) -> HiddenRecipe:
 
32
  self.last_step_reward = 0.0
33
  self.recent_events: list[str] = []
34
  self.done = False
35
+ self.last_episode_summary: dict[str, object] | None = None
36
 
37
  def reset(
38
  self,
 
72
  self.last_step_reward = 0.0
73
  self.recent_events = ["environment reset"]
74
  self.done = False
75
+ self.last_episode_summary = None
76
  return self.state()
77
 
78
  def reset_internal(
 
94
  self.last_step_reward = 0.0
95
  self.recent_events = ["environment reset"]
96
  self.done = False
97
+ self.last_episode_summary = None
98
  return self.state()
99
 
100
  def state(self) -> V3Observation:
 
132
  total_rounds=recipe.profile.total_rounds,
133
  total_couriers=recipe.profile.courier_count,
134
  max_repositions_per_round=recipe.profile.max_repositions_per_round,
135
+ objective_brief="Maximize cumulative delivery reward across the full episode, not just the current round.",
136
+ action_brief="Return target courier counts for every zone; counts should sum to the total courier count.",
137
+ episode_brief="An episode lasts for a fixed number of rounds and ends when done=true.",
138
  ),
139
  )
140
 
141
  def step(self, action: V3Action, grade_terminal: bool = True) -> V3StepResult:
142
  if self.done:
143
+ info: dict[str, object] = {"message": "episode already finished"}
144
+ if self.last_episode_summary is not None:
145
+ info["episode_summary"] = dict(self.last_episode_summary)
146
  return V3StepResult(
147
  observation=self.state(),
148
  reward=V3Reward(step_reward=0.0, cumulative_reward=self.cumulative_reward),
149
  done=True,
150
+ info=info,
151
  )
152
 
153
  recipe = self._require_recipe()
 
197
  seed=self.internal_seed,
198
  raw_reward=self.cumulative_reward,
199
  )
200
+ self.last_episode_summary = {
201
  "raw_reward": round(task_result.raw_reward, 3),
202
  "baseline_reward": round(task_result.baseline_reward, 3),
203
  "target_reward": round(task_result.target_reward, 3),
204
  "heuristic_reward": None if task_result.heuristic_reward is None else round(task_result.heuristic_reward, 3),
205
  "graded_score": round(task_result.score, 4),
206
  }
207
+ info["episode_summary"] = dict(self.last_episode_summary)
208
 
209
  return V3StepResult(
210
  observation=self.state(),
 
227
  clone.last_step_reward = self.last_step_reward
228
  clone.recent_events = list(self.recent_events)
229
  clone.done = self.done
230
+ clone.last_episode_summary = None if self.last_episode_summary is None else dict(self.last_episode_summary)
231
  return clone
232
 
233
  def _require_recipe(self) -> HiddenRecipe:
src/delivery_dispatch_v3/grading.py CHANGED
@@ -2,17 +2,20 @@ from __future__ import annotations
2
 
3
  import time
4
  from collections.abc import Callable
 
5
 
6
  from .environment import V3DeliveryDispatchEnv
7
  from .models import V3TaskResult
8
  from .policies import baseline_policy, heuristic_policy
9
  from .solver import solve_exact
10
 
 
 
11
 
12
  def grade_episode(task_id: str, seed: int, raw_reward: float) -> V3TaskResult:
13
- baseline_reward = rollout_policy(task_id, seed, policy_name="baseline")
14
- heuristic_reward = rollout_policy(task_id, seed, policy_name="heuristic")
15
- target_reward = optimal_reward(task_id, seed)
16
  score = normalize_score(raw_reward, baseline_reward, target_reward)
17
  return V3TaskResult(
18
  task_id=task_id,
@@ -34,6 +37,11 @@ def rollout_policy(task_id: str, seed: int, policy_name: str = "baseline") -> fl
34
  return env.cumulative_reward
35
 
36
 
 
 
 
 
 
37
  def optimal_reward(
38
  task_id: str,
39
  seed: int,
@@ -50,6 +58,11 @@ def optimal_reward(
50
  return reward
51
 
52
 
 
 
 
 
 
53
  def timed_optimal_reward(task_id: str, seed: int) -> tuple[float, float]:
54
  started_at = time.perf_counter()
55
  reward = optimal_reward(task_id, seed)
@@ -58,6 +71,9 @@ def timed_optimal_reward(task_id: str, seed: int) -> tuple[float, float]:
58
 
59
 
60
  def normalize_score(raw_reward: float, baseline_reward: float, target_reward: float) -> float:
 
 
61
  if target_reward <= baseline_reward:
62
- return 1.0 if raw_reward >= target_reward else 0.0
63
- return max(0.0, min(1.0, (raw_reward - baseline_reward) / (target_reward - baseline_reward)))
 
 
2
 
3
  import time
4
  from collections.abc import Callable
5
+ from functools import lru_cache
6
 
7
  from .environment import V3DeliveryDispatchEnv
8
  from .models import V3TaskResult
9
  from .policies import baseline_policy, heuristic_policy
10
  from .solver import solve_exact
11
 
12
+ STRICT_SCORE_EPSILON = 1e-4
13
+
14
 
15
  def grade_episode(task_id: str, seed: int, raw_reward: float) -> V3TaskResult:
16
+ baseline_reward = cached_rollout_policy(task_id, seed, policy_name="baseline")
17
+ heuristic_reward = cached_rollout_policy(task_id, seed, policy_name="heuristic")
18
+ target_reward = cached_optimal_reward(task_id, seed)
19
  score = normalize_score(raw_reward, baseline_reward, target_reward)
20
  return V3TaskResult(
21
  task_id=task_id,
 
37
  return env.cumulative_reward
38
 
39
 
40
@lru_cache(maxsize=512)
def cached_rollout_policy(task_id: str, seed: int, policy_name: str = "baseline") -> float:
    """Memoized front for ``rollout_policy``.

    Results are held in an LRU cache of up to 512 entries keyed on the call
    arguments, so a repeated call with the same arguments reuses the stored
    reward instead of rolling the policy out again.
    """
    reward = rollout_policy(task_id, seed, policy_name=policy_name)
    return reward
43
+
44
+
45
  def optimal_reward(
46
  task_id: str,
47
  seed: int,
 
58
  return reward
59
 
60
 
61
@lru_cache(maxsize=512)
def cached_optimal_reward(task_id: str, seed: int) -> float:
    """Memoized front for ``optimal_reward`` called with its default options.

    Results are held in an LRU cache of up to 512 entries keyed on
    ``(task_id, seed)``, so a repeated call with the same pair reuses the
    stored reward.
    """
    reward = optimal_reward(task_id, seed)
    return reward
64
+
65
+
66
  def timed_optimal_reward(task_id: str, seed: int) -> tuple[float, float]:
67
  started_at = time.perf_counter()
68
  reward = optimal_reward(task_id, seed)
 
71
 
72
 
73
  def normalize_score(raw_reward: float, baseline_reward: float, target_reward: float) -> float:
74
+ lower = STRICT_SCORE_EPSILON
75
+ upper = 1.0 - STRICT_SCORE_EPSILON
76
  if target_reward <= baseline_reward:
77
+ return upper if raw_reward >= target_reward else lower
78
+ score = (raw_reward - baseline_reward) / (target_reward - baseline_reward)
79
+ return max(lower, min(upper, score))
src/delivery_dispatch_v3/models.py CHANGED
@@ -99,6 +99,9 @@ class V3ScenarioInfo(BaseModel):
99
  total_rounds: int
100
  total_couriers: int
101
  max_repositions_per_round: int
 
 
 
102
 
103
 
104
  class V3Observation(BaseModel):
 
99
  total_rounds: int
100
  total_couriers: int
101
  max_repositions_per_round: int
102
+ objective_brief: str = ""
103
+ action_brief: str = ""
104
+ episode_brief: str = ""
105
 
106
 
107
  class V3Observation(BaseModel):
validate_submission.py CHANGED
@@ -53,7 +53,7 @@ def validate_inference() -> dict:
53
  check("tasks" in result and "overall_score" in result, "inference output missing keys")
54
  check(len(result["tasks"]) >= 3, "inference must score at least three tasks")
55
  for task in result["tasks"]:
56
- check(0.0 <= float(task["score"]) <= 1.0, f"task score out of range for {task['task_id']}")
57
  return result
58
 
59
 
@@ -75,6 +75,25 @@ def validate_inference_cli_output() -> None:
75
  check("[END]" in stdout, "inference.py stdout is missing [END] block")
76
 
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  def validate_http_api() -> None:
79
  client = TestClient(app)
80
  health = client.get("/health")
@@ -86,6 +105,9 @@ def validate_http_api() -> None:
86
  reset_body = reset.json()
87
  check(reset_body["task_id"] == task_id, f"/reset should expose requested public task {task_id}")
88
 
 
 
 
89
  reset = client.post("/reset")
90
  check(reset.status_code == 200, "/reset without task_id must return 200")
91
  check(reset.json()["task_id"] in PUBLIC_TASK_IDS, "/reset without task_id should choose a public task")
@@ -114,6 +136,7 @@ def main() -> None:
114
  validate_environment_contract()
115
  inference_result = validate_inference()
116
  validate_inference_cli_output()
 
117
  validate_http_api()
118
  validate_docker_build()
119
 
 
53
  check("tasks" in result and "overall_score" in result, "inference output missing keys")
54
  check(len(result["tasks"]) >= 3, "inference must score at least three tasks")
55
  for task in result["tasks"]:
56
+ check(0.0 < float(task["score"]) < 1.0, f"task score must be strictly between 0 and 1 for {task['task_id']}")
57
  return result
58
 
59
 
 
75
  check("[END]" in stdout, "inference.py stdout is missing [END] block")
76
 
77
 
78
def validate_inference_cli_output_with_configured_llm_if_present() -> None:
    """Re-run ``inference.py`` with the caller's credentials, when any are set.

    Returns immediately unless ``HF_TOKEN`` or ``OPENAI_API_KEY`` is present
    and non-empty in the environment. Otherwise runs ``inference.py`` from
    this file's directory with a copy of the current environment and checks
    that its stdout contains the ``[START]``, ``[STEP]`` and ``[END]``
    markers.

    Raises:
        subprocess.CalledProcessError: If the subprocess exits with a
            non-zero status (``check=True``).
    """
    child_env = os.environ.copy()
    has_credentials = bool(child_env.get("HF_TOKEN") or child_env.get("OPENAI_API_KEY"))
    if not has_credentials:
        return

    script_dir = Path(__file__).resolve().parent
    result = subprocess.run(
        [sys.executable, "inference.py"],
        cwd=script_dir,
        env=child_env,
        text=True,
        capture_output=True,
        check=True,
    )

    output = result.stdout
    # Same three checks, in the same order, with the same messages.
    for marker in ("[START]", "[STEP]", "[END]"):
        check(marker in output, f"configured inference.py stdout is missing {marker} block")
95
+
96
+
97
  def validate_http_api() -> None:
98
  client = TestClient(app)
99
  health = client.get("/health")
 
105
  reset_body = reset.json()
106
  check(reset_body["task_id"] == task_id, f"/reset should expose requested public task {task_id}")
107
 
108
+ invalid_reset = client.post("/reset", params={"task_id": "unknown_dispatch"})
109
+ check(invalid_reset.status_code == 400, "/reset must reject unknown task_id with 400")
110
+
111
  reset = client.post("/reset")
112
  check(reset.status_code == 200, "/reset without task_id must return 200")
113
  check(reset.json()["task_id"] in PUBLIC_TASK_IDS, "/reset without task_id should choose a public task")
 
136
  validate_environment_contract()
137
  inference_result = validate_inference()
138
  validate_inference_cli_output()
139
+ validate_inference_cli_output_with_configured_llm_if_present()
140
  validate_http_api()
141
  validate_docker_build()
142