Spaces:
Sleeping
Sleeping
Align task and grader interfaces with validator
Browse files
- graders.py +54 -25
- openenv.yaml +6 -3
- server/app.py +7 -2
- server/engineer_manager_environment.py +12 -3
- tasks.py +6 -3
graders.py
CHANGED
|
@@ -2,38 +2,67 @@ from __future__ import annotations
|
|
| 2 |
|
| 3 |
from typing import Any
|
| 4 |
|
| 5 |
-
from benchmark_tasks import grade_trajectory
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
def _coerce_trajectory(payload: Any) -> list[dict[str, Any]]:
    """Extract a list of step dicts from a loosely-typed grader payload.

    Accepted shapes:
      * a list of steps;
      * a dict holding the list under ``"trajectory"`` or, failing that,
        under ``"steps"``.

    Any other payload yields an empty list. Every step is shallow-copied
    with ``dict(step)``; steps that cannot be converted to a dict are
    skipped so that one malformed entry does not abort grading.
    """
    steps = None
    if isinstance(payload, list):
        steps = payload
    elif isinstance(payload, dict):
        # "trajectory" takes precedence over "steps" when both are lists.
        for key in ("trajectory", "steps"):
            candidate = payload.get(key)
            if isinstance(candidate, list):
                steps = candidate
                break
    if steps is None:
        return []

    trajectory: list[dict[str, Any]] = []
    for step in steps:
        try:
            trajectory.append(dict(step))
        except (TypeError, ValueError):
            # dict() rejects None, numbers, plain strings, etc.
            continue
    return trajectory


def _grade(task_name: str, payload: Any) -> dict[str, Any]:
    """Score *payload* for *task_name* and wrap the result in a report dict.

    The trajectory is normalised with ``_coerce_trajectory`` and handed to
    ``grade_trajectory``. The returned dict repeats the score under both
    ``"score"`` and ``"reward"``; ``"passed"`` is true for any score
    above 0.0.
    """
    score = grade_trajectory(task_name, _coerce_trajectory(payload))
    return {
        "task_name": task_name,
        "score": score,
        "passed": score > 0.0,
        "reward": score,
    }


def grade_task_0(payload: Any) -> dict[str, Any]:
    """Grade *payload* against the "quiet-morning" task."""
    return _grade("quiet-morning", payload)


def grade_task_1(payload: Any) -> dict[str, Any]:
    """Grade *payload* against the "meeting-surgery" task."""
    return _grade("meeting-surgery", payload)
|
| 36 |
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from typing import Any
|
| 4 |
|
|
|
|
| 5 |
|
| 6 |
+
TASK_NAMES = {
|
| 7 |
+
0: "quiet-morning",
|
| 8 |
+
1: "meeting-surgery",
|
| 9 |
+
2: "delivery-triage",
|
| 10 |
+
}
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
+
def _normalize_reward(reward: float) -> float:
|
| 14 |
+
return min(max(float(reward), 0.0), 1.0)
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
+
def _state_task_id(state: Any) -> int | None:
|
| 18 |
+
if not isinstance(state, dict):
|
| 19 |
+
return None
|
| 20 |
+
task_id = state.get("task_id")
|
| 21 |
+
if isinstance(task_id, int):
|
| 22 |
+
return task_id
|
| 23 |
+
task_name = state.get("task_name")
|
| 24 |
+
if isinstance(task_name, str):
|
| 25 |
+
for index, name in TASK_NAMES.items():
|
| 26 |
+
if name == task_name:
|
| 27 |
+
return index
|
| 28 |
+
metadata = state.get("metadata")
|
| 29 |
+
if isinstance(metadata, dict):
|
| 30 |
+
nested_task_id = metadata.get("task_id")
|
| 31 |
+
if isinstance(nested_task_id, int):
|
| 32 |
+
return nested_task_id
|
| 33 |
+
return None
|
| 34 |
|
|
|
|
|
|
|
| 35 |
|
| 36 |
+
def grade_task_0(state: dict, reward: float) -> float:
|
| 37 |
+
return _normalize_reward(reward if _state_task_id(state) == 0 else 0.0)
|
| 38 |
|
|
|
|
|
|
|
| 39 |
|
| 40 |
+
def grade_task_1(state: dict, reward: float) -> float:
|
| 41 |
+
return _normalize_reward(reward if _state_task_id(state) == 1 else 0.0)
|
| 42 |
|
| 43 |
+
|
| 44 |
+
def grade_task_2(state: dict, reward: float) -> float:
|
| 45 |
+
return _normalize_reward(reward if _state_task_id(state) == 2 else 0.0)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
GRADERS = {
|
| 49 |
+
"engineer_manager_task_0": grade_task_0,
|
| 50 |
+
"engineer_manager_task_1": grade_task_1,
|
| 51 |
+
"engineer_manager_task_2": grade_task_2,
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
TASK_GRADER_PAIRS = [
|
| 56 |
+
("engineer_manager_task_0", grade_task_0),
|
| 57 |
+
("engineer_manager_task_1", grade_task_1),
|
| 58 |
+
("engineer_manager_task_2", grade_task_2),
|
| 59 |
+
]
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
__all__ = [
|
| 63 |
+
"grade_task_0",
|
| 64 |
+
"grade_task_1",
|
| 65 |
+
"grade_task_2",
|
| 66 |
+
"GRADERS",
|
| 67 |
+
"TASK_GRADER_PAIRS",
|
| 68 |
+
]
|
openenv.yaml
CHANGED
|
@@ -32,10 +32,11 @@ tasks:
|
|
| 32 |
description: High-noise morning where the agent should mute comms early and protect an uninterrupted work block.
|
| 33 |
max_steps: 32
|
| 34 |
reset_params:
|
| 35 |
-
|
| 36 |
action_schema:
|
| 37 |
target_slot: integer slot index within the workday
|
| 38 |
operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
|
|
|
|
| 39 |
grader: graders:grade_task_0
|
| 40 |
graders:
|
| 41 |
- graders:grade_task_0
|
|
@@ -49,10 +50,11 @@ tasks:
|
|
| 49 |
description: Fragmented calendar where selective meeting moves should improve flow.
|
| 50 |
max_steps: 32
|
| 51 |
reset_params:
|
| 52 |
-
|
| 53 |
action_schema:
|
| 54 |
target_slot: integer slot index within the workday
|
| 55 |
operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
|
|
|
|
| 56 |
grader: graders:grade_task_1
|
| 57 |
graders:
|
| 58 |
- graders:grade_task_1
|
|
@@ -66,10 +68,11 @@ tasks:
|
|
| 66 |
description: Constrained delivery day with hidden task complexity and tighter tradeoffs.
|
| 67 |
max_steps: 32
|
| 68 |
reset_params:
|
| 69 |
-
|
| 70 |
action_schema:
|
| 71 |
target_slot: integer slot index within the workday
|
| 72 |
operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
|
|
|
|
| 73 |
grader: graders:grade_task_2
|
| 74 |
graders:
|
| 75 |
- graders:grade_task_2
|
|
|
|
| 32 |
description: High-noise morning where the agent should mute comms early and protect an uninterrupted work block.
|
| 33 |
max_steps: 32
|
| 34 |
reset_params:
|
| 35 |
+
task_id: 0
|
| 36 |
action_schema:
|
| 37 |
target_slot: integer slot index within the workday
|
| 38 |
operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
|
| 39 |
+
task_id: quiet-morning
|
| 40 |
grader: graders:grade_task_0
|
| 41 |
graders:
|
| 42 |
- graders:grade_task_0
|
|
|
|
| 50 |
description: Fragmented calendar where selective meeting moves should improve flow.
|
| 51 |
max_steps: 32
|
| 52 |
reset_params:
|
| 53 |
+
task_id: 1
|
| 54 |
action_schema:
|
| 55 |
target_slot: integer slot index within the workday
|
| 56 |
operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
|
| 57 |
+
task_id: meeting-surgery
|
| 58 |
grader: graders:grade_task_1
|
| 59 |
graders:
|
| 60 |
- graders:grade_task_1
|
|
|
|
| 68 |
description: Constrained delivery day with hidden task complexity and tighter tradeoffs.
|
| 69 |
max_steps: 32
|
| 70 |
reset_params:
|
| 71 |
+
task_id: 2
|
| 72 |
action_schema:
|
| 73 |
target_slot: integer slot index within the workday
|
| 74 |
operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
|
| 75 |
+
task_id: delivery-triage
|
| 76 |
grader: graders:grade_task_2
|
| 77 |
graders:
|
| 78 |
- graders:grade_task_2
|
server/app.py
CHANGED
|
@@ -36,7 +36,8 @@ app = create_fastapi_app(
|
|
| 36 |
|
| 37 |
class GraderRequest(BaseModel):
|
| 38 |
task_id: str
|
| 39 |
-
|
|
|
|
| 40 |
|
| 41 |
WEB_CSS = dedent(
|
| 42 |
"""\
|
|
@@ -473,6 +474,9 @@ def grader(request: GraderRequest) -> JSONResponse:
|
|
| 473 |
"quiet-morning": grade_task_0,
|
| 474 |
"meeting-surgery": grade_task_1,
|
| 475 |
"delivery-triage": grade_task_2,
|
|
|
|
|
|
|
|
|
|
| 476 |
}
|
| 477 |
grader_fn = graders.get(request.task_id)
|
| 478 |
if grader_fn is None:
|
|
@@ -480,7 +484,8 @@ def grader(request: GraderRequest) -> JSONResponse:
|
|
| 480 |
{"error": f"Unknown task_id: {request.task_id}", "score": 0.0, "passed": False},
|
| 481 |
status_code=400,
|
| 482 |
)
|
| 483 |
-
|
|
|
|
| 484 |
|
| 485 |
|
| 486 |
def run(host: str = "0.0.0.0", port: int = 8000) -> None:
|
|
|
|
| 36 |
|
| 37 |
class GraderRequest(BaseModel):
|
| 38 |
task_id: str
|
| 39 |
+
state: dict
|
| 40 |
+
reward: float
|
| 41 |
|
| 42 |
WEB_CSS = dedent(
|
| 43 |
"""\
|
|
|
|
| 474 |
"quiet-morning": grade_task_0,
|
| 475 |
"meeting-surgery": grade_task_1,
|
| 476 |
"delivery-triage": grade_task_2,
|
| 477 |
+
"engineer_manager_task_0": grade_task_0,
|
| 478 |
+
"engineer_manager_task_1": grade_task_1,
|
| 479 |
+
"engineer_manager_task_2": grade_task_2,
|
| 480 |
}
|
| 481 |
grader_fn = graders.get(request.task_id)
|
| 482 |
if grader_fn is None:
|
|
|
|
| 484 |
{"error": f"Unknown task_id: {request.task_id}", "score": 0.0, "passed": False},
|
| 485 |
status_code=400,
|
| 486 |
)
|
| 487 |
+
score = float(grader_fn(request.state, request.reward))
|
| 488 |
+
return JSONResponse({"task_id": request.task_id, "score": score, "passed": score > 0.0, "reward": score})
|
| 489 |
|
| 490 |
|
| 491 |
def run(host: str = "0.0.0.0", port: int = 8000) -> None:
|
server/engineer_manager_environment.py
CHANGED
|
@@ -8,7 +8,7 @@ import os
|
|
| 8 |
from openenv.core.env_server.interfaces import Environment, EnvironmentMetadata
|
| 9 |
from openenv.core.env_server.types import State
|
| 10 |
|
| 11 |
-
from benchmark_tasks import TASK_SPECS, apply_task
|
| 12 |
from focus_resource_env import FocusResourceEnv
|
| 13 |
|
| 14 |
try:
|
|
@@ -38,6 +38,7 @@ class EngineerManagerEnvironment(
|
|
| 38 |
self._distraction_risk = distraction_risk
|
| 39 |
self._seed = seed
|
| 40 |
self._task_name = task_name or os.getenv("TASK_NAME")
|
|
|
|
| 41 |
self._step_count = 0
|
| 42 |
self._episode_id = str(uuid4())
|
| 43 |
self._trajectory: list[dict[str, object]] = []
|
|
@@ -53,10 +54,17 @@ class EngineerManagerEnvironment(
|
|
| 53 |
seed: int | None = None,
|
| 54 |
episode_id: str | None = None,
|
| 55 |
task_name: str | None = None,
|
|
|
|
| 56 |
**_: object,
|
| 57 |
) -> EngineerManagerObservation:
|
| 58 |
self._seed = self._seed if seed is None else seed
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
self._episode_id = episode_id or str(uuid4())
|
| 61 |
self._step_count = 0
|
| 62 |
self._trajectory = []
|
|
@@ -125,6 +133,7 @@ class EngineerManagerEnvironment(
|
|
| 125 |
payload["done"] = done
|
| 126 |
metadata = dict(info or {})
|
| 127 |
metadata["task_name"] = self._task_name
|
|
|
|
| 128 |
metadata["episode_metrics"] = {
|
| 129 |
"interruptions": int(self._env.interruptions),
|
| 130 |
"invalid_actions": int(self._env.invalid_actions),
|
|
@@ -136,7 +145,7 @@ class EngineerManagerEnvironment(
|
|
| 136 |
if step["info"].get("action_info", {}).get("status") == "meeting_rescheduled"
|
| 137 |
),
|
| 138 |
"total_score": float(self._env._total_score()),
|
| 139 |
-
"grader_score":
|
| 140 |
}
|
| 141 |
payload["metadata"] = metadata
|
| 142 |
return EngineerManagerObservation.model_validate(payload)
|
|
|
|
| 8 |
from openenv.core.env_server.interfaces import Environment, EnvironmentMetadata
|
| 9 |
from openenv.core.env_server.types import State
|
| 10 |
|
| 11 |
+
from benchmark_tasks import TASK_SPECS, apply_task
|
| 12 |
from focus_resource_env import FocusResourceEnv
|
| 13 |
|
| 14 |
try:
|
|
|
|
| 38 |
self._distraction_risk = distraction_risk
|
| 39 |
self._seed = seed
|
| 40 |
self._task_name = task_name or os.getenv("TASK_NAME")
|
| 41 |
+
self._task_id = 0
|
| 42 |
self._step_count = 0
|
| 43 |
self._episode_id = str(uuid4())
|
| 44 |
self._trajectory: list[dict[str, object]] = []
|
|
|
|
| 54 |
seed: int | None = None,
|
| 55 |
episode_id: str | None = None,
|
| 56 |
task_name: str | None = None,
|
| 57 |
+
task_id: int | None = None,
|
| 58 |
**_: object,
|
| 59 |
) -> EngineerManagerObservation:
|
| 60 |
self._seed = self._seed if seed is None else seed
|
| 61 |
+
task_names = ["quiet-morning", "meeting-surgery", "delivery-triage"]
|
| 62 |
+
if task_id is not None and 0 <= int(task_id) < len(task_names):
|
| 63 |
+
self._task_id = int(task_id)
|
| 64 |
+
self._task_name = task_names[self._task_id]
|
| 65 |
+
else:
|
| 66 |
+
self._task_name = task_name or self._task_name or os.getenv("TASK_NAME")
|
| 67 |
+
self._task_id = task_names.index(self._task_name) if self._task_name in task_names else 0
|
| 68 |
self._episode_id = episode_id or str(uuid4())
|
| 69 |
self._step_count = 0
|
| 70 |
self._trajectory = []
|
|
|
|
| 133 |
payload["done"] = done
|
| 134 |
metadata = dict(info or {})
|
| 135 |
metadata["task_name"] = self._task_name
|
| 136 |
+
metadata["task_id"] = self._task_id
|
| 137 |
metadata["episode_metrics"] = {
|
| 138 |
"interruptions": int(self._env.interruptions),
|
| 139 |
"invalid_actions": int(self._env.invalid_actions),
|
|
|
|
| 145 |
if step["info"].get("action_info", {}).get("status") == "meeting_rescheduled"
|
| 146 |
),
|
| 147 |
"total_score": float(self._env._total_score()),
|
| 148 |
+
"grader_score": min(max(float(reward or 0.0), 0.0), 1.0),
|
| 149 |
}
|
| 150 |
payload["metadata"] = metadata
|
| 151 |
return EngineerManagerObservation.model_validate(payload)
|
tasks.py
CHANGED
|
@@ -11,10 +11,11 @@ TASKS = [
|
|
| 11 |
"difficulty": "easy",
|
| 12 |
"description": TASK_SPECS["quiet-morning"].description,
|
| 13 |
"max_steps": 32,
|
| 14 |
-
"reset_params": {"
|
| 15 |
"action_schema": {
|
| 16 |
"target_slot": "integer slot index within the workday",
|
| 17 |
"operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
|
|
|
|
| 18 |
},
|
| 19 |
"grader": "graders:grade_task_0",
|
| 20 |
"graders": ["graders:grade_task_0"],
|
|
@@ -27,10 +28,11 @@ TASKS = [
|
|
| 27 |
"difficulty": "medium",
|
| 28 |
"description": TASK_SPECS["meeting-surgery"].description,
|
| 29 |
"max_steps": 32,
|
| 30 |
-
"reset_params": {"
|
| 31 |
"action_schema": {
|
| 32 |
"target_slot": "integer slot index within the workday",
|
| 33 |
"operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
|
|
|
|
| 34 |
},
|
| 35 |
"grader": "graders:grade_task_1",
|
| 36 |
"graders": ["graders:grade_task_1"],
|
|
@@ -43,10 +45,11 @@ TASKS = [
|
|
| 43 |
"difficulty": "hard",
|
| 44 |
"description": TASK_SPECS["delivery-triage"].description,
|
| 45 |
"max_steps": 32,
|
| 46 |
-
"reset_params": {"
|
| 47 |
"action_schema": {
|
| 48 |
"target_slot": "integer slot index within the workday",
|
| 49 |
"operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
|
|
|
|
| 50 |
},
|
| 51 |
"grader": "graders:grade_task_2",
|
| 52 |
"graders": ["graders:grade_task_2"],
|
|
|
|
| 11 |
"difficulty": "easy",
|
| 12 |
"description": TASK_SPECS["quiet-morning"].description,
|
| 13 |
"max_steps": 32,
|
| 14 |
+
"reset_params": {"task_id": 0},
|
| 15 |
"action_schema": {
|
| 16 |
"target_slot": "integer slot index within the workday",
|
| 17 |
"operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
|
| 18 |
+
"task_id": "quiet-morning",
|
| 19 |
},
|
| 20 |
"grader": "graders:grade_task_0",
|
| 21 |
"graders": ["graders:grade_task_0"],
|
|
|
|
| 28 |
"difficulty": "medium",
|
| 29 |
"description": TASK_SPECS["meeting-surgery"].description,
|
| 30 |
"max_steps": 32,
|
| 31 |
+
"reset_params": {"task_id": 1},
|
| 32 |
"action_schema": {
|
| 33 |
"target_slot": "integer slot index within the workday",
|
| 34 |
"operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
|
| 35 |
+
"task_id": "meeting-surgery",
|
| 36 |
},
|
| 37 |
"grader": "graders:grade_task_1",
|
| 38 |
"graders": ["graders:grade_task_1"],
|
|
|
|
| 45 |
"difficulty": "hard",
|
| 46 |
"description": TASK_SPECS["delivery-triage"].description,
|
| 47 |
"max_steps": 32,
|
| 48 |
+
"reset_params": {"task_id": 2},
|
| 49 |
"action_schema": {
|
| 50 |
"target_slot": "integer slot index within the workday",
|
| 51 |
"operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
|
| 52 |
+
"task_id": "delivery-triage",
|
| 53 |
},
|
| 54 |
"grader": "graders:grade_task_2",
|
| 55 |
"graders": ["graders:grade_task_2"],
|