Spaces:

rishavutk
/

fleetmind

Running

App Files Files Community

Rishav committed on Apr 8

Commit

9850bda

1 Parent(s): 0355a51

Harden v3 scoring contract

Browse files

Files changed (5) hide show

src/delivery_dispatch_v3/api.py +7 -4
src/delivery_dispatch_v3/environment.py +13 -2
src/delivery_dispatch_v3/grading.py +21 -5
src/delivery_dispatch_v3/models.py +3 -0
validate_submission.py +24 -1

src/delivery_dispatch_v3/api.py CHANGED Viewed

@@ -1,10 +1,10 @@
 from __future__ import annotations
-from fastapi import FastAPI
 from .environment import V3DeliveryDispatchEnv
 from .models import V3Action
-from .task_adapter import PUBLIC_TASK_IDS
 def create_app() -> FastAPI:
@@ -17,8 +17,11 @@ def create_app() -> FastAPI:
     @app.post("/reset")
     def reset(task_id: str | None = None, seed: int | None = None, pool_name: str = "test") -> dict:
-        if task_id is not None and task_id not in PUBLIC_TASK_IDS:
-            task_id = task_id
         return env.reset(task_id=task_id, seed=seed, pool_name=pool_name).model_dump(mode="json")
     @app.get("/state")

 from __future__ import annotations
+from fastapi import FastAPI, HTTPException
 from .environment import V3DeliveryDispatchEnv
 from .models import V3Action
+from .task_adapter import PUBLIC_TASK_IDS, is_public_task_id
 def create_app() -> FastAPI:
     @app.post("/reset")
     def reset(task_id: str | None = None, seed: int | None = None, pool_name: str = "test") -> dict:
+        if task_id is not None and not is_public_task_id(task_id):
+            raise HTTPException(
+                status_code=400,
+                detail=f"Unknown task_id '{task_id}'. Expected one of: {', '.join(PUBLIC_TASK_IDS)}",
+            )
         return env.reset(task_id=task_id, seed=seed, pool_name=pool_name).model_dump(mode="json")
     @app.get("/state")

src/delivery_dispatch_v3/environment.py CHANGED Viewed

@@ -32,6 +32,7 @@ class V3DeliveryDispatchEnv:
         self.last_step_reward = 0.0
         self.recent_events: list[str] = []
         self.done = False
     def reset(
         self,
@@ -71,6 +72,7 @@ class V3DeliveryDispatchEnv:
         self.last_step_reward = 0.0
         self.recent_events = ["environment reset"]
         self.done = False
         return self.state()
     def reset_internal(
@@ -92,6 +94,7 @@ class V3DeliveryDispatchEnv:
         self.last_step_reward = 0.0
         self.recent_events = ["environment reset"]
         self.done = False
         return self.state()
     def state(self) -> V3Observation:
@@ -129,16 +132,22 @@ class V3DeliveryDispatchEnv:
                 total_rounds=recipe.profile.total_rounds,
                 total_couriers=recipe.profile.courier_count,
                 max_repositions_per_round=recipe.profile.max_repositions_per_round,
             ),
         )
     def step(self, action: V3Action, grade_terminal: bool = True) -> V3StepResult:
         if self.done:
             return V3StepResult(
                 observation=self.state(),
                 reward=V3Reward(step_reward=0.0, cumulative_reward=self.cumulative_reward),
                 done=True,
-                info={"message": "episode already finished"},
             )
         recipe = self._require_recipe()
@@ -188,13 +197,14 @@ class V3DeliveryDispatchEnv:
                 seed=self.internal_seed,
                 raw_reward=self.cumulative_reward,
             )
-            info["episode_summary"] = {
                 "raw_reward": round(task_result.raw_reward, 3),
                 "baseline_reward": round(task_result.baseline_reward, 3),
                 "target_reward": round(task_result.target_reward, 3),
                 "heuristic_reward": None if task_result.heuristic_reward is None else round(task_result.heuristic_reward, 3),
                 "graded_score": round(task_result.score, 4),
             }
         return V3StepResult(
             observation=self.state(),
@@ -217,6 +227,7 @@ class V3DeliveryDispatchEnv:
         clone.last_step_reward = self.last_step_reward
         clone.recent_events = list(self.recent_events)
         clone.done = self.done
         return clone
     def _require_recipe(self) -> HiddenRecipe:

         self.last_step_reward = 0.0
         self.recent_events: list[str] = []
         self.done = False
+        self.last_episode_summary: dict[str, object] | None = None
     def reset(
         self,
         self.last_step_reward = 0.0
         self.recent_events = ["environment reset"]
         self.done = False
+        self.last_episode_summary = None
         return self.state()
     def reset_internal(
         self.last_step_reward = 0.0
         self.recent_events = ["environment reset"]
         self.done = False
+        self.last_episode_summary = None
         return self.state()
     def state(self) -> V3Observation:
                 total_rounds=recipe.profile.total_rounds,
                 total_couriers=recipe.profile.courier_count,
                 max_repositions_per_round=recipe.profile.max_repositions_per_round,
+                objective_brief="Maximize cumulative delivery reward across the full episode, not just the current round.",
+                action_brief="Return target courier counts for every zone; counts should sum to the total courier count.",
+                episode_brief="An episode lasts for a fixed number of rounds and ends when done=true.",
             ),
         )
     def step(self, action: V3Action, grade_terminal: bool = True) -> V3StepResult:
         if self.done:
+            info: dict[str, object] = {"message": "episode already finished"}
+            if self.last_episode_summary is not None:
+                info["episode_summary"] = dict(self.last_episode_summary)
             return V3StepResult(
                 observation=self.state(),
                 reward=V3Reward(step_reward=0.0, cumulative_reward=self.cumulative_reward),
                 done=True,
+                info=info,
             )
         recipe = self._require_recipe()
                 seed=self.internal_seed,
                 raw_reward=self.cumulative_reward,
             )
+            self.last_episode_summary = {
                 "raw_reward": round(task_result.raw_reward, 3),
                 "baseline_reward": round(task_result.baseline_reward, 3),
                 "target_reward": round(task_result.target_reward, 3),
                 "heuristic_reward": None if task_result.heuristic_reward is None else round(task_result.heuristic_reward, 3),
                 "graded_score": round(task_result.score, 4),
             }
+            info["episode_summary"] = dict(self.last_episode_summary)
         return V3StepResult(
             observation=self.state(),
         clone.last_step_reward = self.last_step_reward
         clone.recent_events = list(self.recent_events)
         clone.done = self.done
+        clone.last_episode_summary = None if self.last_episode_summary is None else dict(self.last_episode_summary)
         return clone
     def _require_recipe(self) -> HiddenRecipe:

src/delivery_dispatch_v3/grading.py CHANGED Viewed

@@ -2,17 +2,20 @@ from __future__ import annotations
 import time
 from collections.abc import Callable
 from .environment import V3DeliveryDispatchEnv
 from .models import V3TaskResult
 from .policies import baseline_policy, heuristic_policy
 from .solver import solve_exact
 def grade_episode(task_id: str, seed: int, raw_reward: float) -> V3TaskResult:
-    baseline_reward = rollout_policy(task_id, seed, policy_name="baseline")
-    heuristic_reward = rollout_policy(task_id, seed, policy_name="heuristic")
-    target_reward = optimal_reward(task_id, seed)
     score = normalize_score(raw_reward, baseline_reward, target_reward)
     return V3TaskResult(
         task_id=task_id,
@@ -34,6 +37,11 @@ def rollout_policy(task_id: str, seed: int, policy_name: str = "baseline") -> fl
     return env.cumulative_reward
 def optimal_reward(
     task_id: str,
     seed: int,
@@ -50,6 +58,11 @@ def optimal_reward(
     return reward
 def timed_optimal_reward(task_id: str, seed: int) -> tuple[float, float]:
     started_at = time.perf_counter()
     reward = optimal_reward(task_id, seed)
@@ -58,6 +71,9 @@ def timed_optimal_reward(task_id: str, seed: int) -> tuple[float, float]:
 def normalize_score(raw_reward: float, baseline_reward: float, target_reward: float) -> float:
     if target_reward <= baseline_reward:
-        return 1.0 if raw_reward >= target_reward else 0.0
-    return max(0.0, min(1.0, (raw_reward - baseline_reward) / (target_reward - baseline_reward)))

 import time
 from collections.abc import Callable
+from functools import lru_cache
 from .environment import V3DeliveryDispatchEnv
 from .models import V3TaskResult
 from .policies import baseline_policy, heuristic_policy
 from .solver import solve_exact
+STRICT_SCORE_EPSILON = 1e-4
 def grade_episode(task_id: str, seed: int, raw_reward: float) -> V3TaskResult:
+    baseline_reward = cached_rollout_policy(task_id, seed, policy_name="baseline")
+    heuristic_reward = cached_rollout_policy(task_id, seed, policy_name="heuristic")
+    target_reward = cached_optimal_reward(task_id, seed)
     score = normalize_score(raw_reward, baseline_reward, target_reward)
     return V3TaskResult(
         task_id=task_id,
     return env.cumulative_reward
+@lru_cache(maxsize=512)
+def cached_rollout_policy(task_id: str, seed: int, policy_name: str = "baseline") -> float:
+    return rollout_policy(task_id, seed, policy_name=policy_name)
 def optimal_reward(
     task_id: str,
     seed: int,
     return reward
+@lru_cache(maxsize=512)
+def cached_optimal_reward(task_id: str, seed: int) -> float:
+    return optimal_reward(task_id, seed)
 def timed_optimal_reward(task_id: str, seed: int) -> tuple[float, float]:
     started_at = time.perf_counter()
     reward = optimal_reward(task_id, seed)
 def normalize_score(raw_reward: float, baseline_reward: float, target_reward: float) -> float:
+    lower = STRICT_SCORE_EPSILON
+    upper = 1.0 - STRICT_SCORE_EPSILON
     if target_reward <= baseline_reward:
+        return upper if raw_reward >= target_reward else lower
+    score = (raw_reward - baseline_reward) / (target_reward - baseline_reward)
+    return max(lower, min(upper, score))

src/delivery_dispatch_v3/models.py CHANGED Viewed

@@ -99,6 +99,9 @@ class V3ScenarioInfo(BaseModel):
     total_rounds: int
     total_couriers: int
     max_repositions_per_round: int
 class V3Observation(BaseModel):

     total_rounds: int
     total_couriers: int
     max_repositions_per_round: int
+    objective_brief: str = ""
+    action_brief: str = ""
+    episode_brief: str = ""
 class V3Observation(BaseModel):

validate_submission.py CHANGED Viewed

@@ -53,7 +53,7 @@ def validate_inference() -> dict:
     check("tasks" in result and "overall_score" in result, "inference output missing keys")
     check(len(result["tasks"]) >= 3, "inference must score at least three tasks")
     for task in result["tasks"]:
-        check(0.0 <= float(task["score"]) <= 1.0, f"task score out of range for {task['task_id']}")
     return result
@@ -75,6 +75,25 @@ def validate_inference_cli_output() -> None:
     check("[END]" in stdout, "inference.py stdout is missing [END] block")
 def validate_http_api() -> None:
     client = TestClient(app)
     health = client.get("/health")
@@ -86,6 +105,9 @@ def validate_http_api() -> None:
         reset_body = reset.json()
         check(reset_body["task_id"] == task_id, f"/reset should expose requested public task {task_id}")
     reset = client.post("/reset")
     check(reset.status_code == 200, "/reset without task_id must return 200")
     check(reset.json()["task_id"] in PUBLIC_TASK_IDS, "/reset without task_id should choose a public task")
@@ -114,6 +136,7 @@ def main() -> None:
     validate_environment_contract()
     inference_result = validate_inference()
     validate_inference_cli_output()
     validate_http_api()
     validate_docker_build()

     check("tasks" in result and "overall_score" in result, "inference output missing keys")
     check(len(result["tasks"]) >= 3, "inference must score at least three tasks")
     for task in result["tasks"]:
+        check(0.0 < float(task["score"]) < 1.0, f"task score must be strictly between 0 and 1 for {task['task_id']}")
     return result
     check("[END]" in stdout, "inference.py stdout is missing [END] block")
+def validate_inference_cli_output_with_configured_llm_if_present() -> None:
+    env = os.environ.copy()
+    token = env.get("HF_TOKEN") or env.get("OPENAI_API_KEY")
+    if not token:
+        return
+    completed = subprocess.run(
+        [sys.executable, "inference.py"],
+        cwd=Path(__file__).resolve().parent,
+        capture_output=True,
+        text=True,
+        env=env,
+        check=True,
+    )
+    stdout = completed.stdout
+    check("[START]" in stdout, "configured inference.py stdout is missing [START] block")
+    check("[STEP]" in stdout, "configured inference.py stdout is missing [STEP] block")
+    check("[END]" in stdout, "configured inference.py stdout is missing [END] block")
 def validate_http_api() -> None:
     client = TestClient(app)
     health = client.get("/health")
         reset_body = reset.json()
         check(reset_body["task_id"] == task_id, f"/reset should expose requested public task {task_id}")
+    invalid_reset = client.post("/reset", params={"task_id": "unknown_dispatch"})
+    check(invalid_reset.status_code == 400, "/reset must reject unknown task_id with 400")
     reset = client.post("/reset")
     check(reset.status_code == 200, "/reset without task_id must return 200")
     check(reset.json()["task_id"] in PUBLIC_TASK_IDS, "/reset without task_id should choose a public task")
     validate_environment_contract()
     inference_result = validate_inference()
     validate_inference_cli_output()
+    validate_inference_cli_output_with_configured_llm_if_present()
     validate_http_api()
     validate_docker_build()