AzraelH committed on
Commit
ec7e9a5
·
1 Parent(s): 2dbf205

Align task and grader interfaces with validator

Browse files
graders.py CHANGED
@@ -2,38 +2,67 @@ from __future__ import annotations
2
 
3
  from typing import Any
4
 
5
- from benchmark_tasks import grade_trajectory
6
 
 
 
 
 
 
7
 
8
- def _coerce_trajectory(payload: Any) -> list[dict[str, Any]]:
9
- if isinstance(payload, list):
10
- return [dict(step) for step in payload]
11
- if isinstance(payload, dict):
12
- if isinstance(payload.get("trajectory"), list):
13
- return [dict(step) for step in payload["trajectory"]]
14
- if isinstance(payload.get("steps"), list):
15
- return [dict(step) for step in payload["steps"]]
16
- return []
17
 
 
 
18
 
19
- def _grade(task_name: str, payload: Any) -> dict[str, Any]:
20
- trajectory = _coerce_trajectory(payload)
21
- score = grade_trajectory(task_name, trajectory)
22
- return {
23
- "task_name": task_name,
24
- "score": score,
25
- "passed": score > 0.0,
26
- "reward": score,
27
- }
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
- def grade_task_0(payload: Any) -> dict[str, Any]:
31
- return _grade("quiet-morning", payload)
32
 
 
 
33
 
34
- def grade_task_1(payload: Any) -> dict[str, Any]:
35
- return _grade("meeting-surgery", payload)
36
 
 
 
37
 
38
- def grade_task_2(payload: Any) -> dict[str, Any]:
39
- return _grade("delivery-triage", payload)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  from typing import Any
4
 
 
5
 
6
+ TASK_NAMES = {
7
+ 0: "quiet-morning",
8
+ 1: "meeting-surgery",
9
+ 2: "delivery-triage",
10
+ }
11
 
 
 
 
 
 
 
 
 
 
12
 
13
+ def _normalize_reward(reward: float) -> float:
14
+ return min(max(float(reward), 0.0), 1.0)
15
 
 
 
 
 
 
 
 
 
 
16
 
17
+ def _state_task_id(state: Any) -> int | None:
18
+ if not isinstance(state, dict):
19
+ return None
20
+ task_id = state.get("task_id")
21
+ if isinstance(task_id, int):
22
+ return task_id
23
+ task_name = state.get("task_name")
24
+ if isinstance(task_name, str):
25
+ for index, name in TASK_NAMES.items():
26
+ if name == task_name:
27
+ return index
28
+ metadata = state.get("metadata")
29
+ if isinstance(metadata, dict):
30
+ nested_task_id = metadata.get("task_id")
31
+ if isinstance(nested_task_id, int):
32
+ return nested_task_id
33
+ return None
34
 
 
 
35
 
36
def grade_task_0(state: dict, reward: float) -> float:
    """Grade an episode against task index 0.

    The supplied ``reward`` is credited only when ``_state_task_id(state)``
    resolves to ``0``; otherwise a zero reward is graded instead. Either way
    the value is passed through ``_normalize_reward`` before being returned.
    """
    belongs_to_task = _state_task_id(state) == 0
    credited = reward if belongs_to_task else 0.0
    return _normalize_reward(credited)
38
 
 
 
39
 
40
def grade_task_1(state: dict, reward: float) -> float:
    """Grade an episode against task index 1.

    The supplied ``reward`` is credited only when ``_state_task_id(state)``
    resolves to ``1``; otherwise a zero reward is graded instead. Either way
    the value is passed through ``_normalize_reward`` before being returned.
    """
    belongs_to_task = _state_task_id(state) == 1
    credited = reward if belongs_to_task else 0.0
    return _normalize_reward(credited)
42
 
43
+
44
def grade_task_2(state: dict, reward: float) -> float:
    """Grade an episode against task index 2.

    The supplied ``reward`` is credited only when ``_state_task_id(state)``
    resolves to ``2``; otherwise a zero reward is graded instead. Either way
    the value is passed through ``_normalize_reward`` before being returned.
    """
    belongs_to_task = _state_task_id(state) == 2
    credited = reward if belongs_to_task else 0.0
    return _normalize_reward(credited)
46
+
47
+
48
+ GRADERS = {
49
+ "engineer_manager_task_0": grade_task_0,
50
+ "engineer_manager_task_1": grade_task_1,
51
+ "engineer_manager_task_2": grade_task_2,
52
+ }
53
+
54
+
55
+ TASK_GRADER_PAIRS = [
56
+ ("engineer_manager_task_0", grade_task_0),
57
+ ("engineer_manager_task_1", grade_task_1),
58
+ ("engineer_manager_task_2", grade_task_2),
59
+ ]
60
+
61
+
62
+ __all__ = [
63
+ "grade_task_0",
64
+ "grade_task_1",
65
+ "grade_task_2",
66
+ "GRADERS",
67
+ "TASK_GRADER_PAIRS",
68
+ ]
openenv.yaml CHANGED
@@ -32,10 +32,11 @@ tasks:
32
  description: High-noise morning where the agent should mute comms early and protect an uninterrupted work block.
33
  max_steps: 32
34
  reset_params:
35
- task_name: quiet-morning
36
  action_schema:
37
  target_slot: integer slot index within the workday
38
  operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
 
39
  grader: graders:grade_task_0
40
  graders:
41
  - graders:grade_task_0
@@ -49,10 +50,11 @@ tasks:
49
  description: Fragmented calendar where selective meeting moves should improve flow.
50
  max_steps: 32
51
  reset_params:
52
- task_name: meeting-surgery
53
  action_schema:
54
  target_slot: integer slot index within the workday
55
  operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
 
56
  grader: graders:grade_task_1
57
  graders:
58
  - graders:grade_task_1
@@ -66,10 +68,11 @@ tasks:
66
  description: Constrained delivery day with hidden task complexity and tighter tradeoffs.
67
  max_steps: 32
68
  reset_params:
69
- task_name: delivery-triage
70
  action_schema:
71
  target_slot: integer slot index within the workday
72
  operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
 
73
  grader: graders:grade_task_2
74
  graders:
75
  - graders:grade_task_2
 
32
  description: High-noise morning where the agent should mute comms early and protect an uninterrupted work block.
33
  max_steps: 32
34
  reset_params:
35
+ task_id: 0
36
  action_schema:
37
  target_slot: integer slot index within the workday
38
  operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
39
+ task_id: quiet-morning
40
  grader: graders:grade_task_0
41
  graders:
42
  - graders:grade_task_0
 
50
  description: Fragmented calendar where selective meeting moves should improve flow.
51
  max_steps: 32
52
  reset_params:
53
+ task_id: 1
54
  action_schema:
55
  target_slot: integer slot index within the workday
56
  operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
57
+ task_id: meeting-surgery
58
  grader: graders:grade_task_1
59
  graders:
60
  - graders:grade_task_1
 
68
  description: Constrained delivery day with hidden task complexity and tighter tradeoffs.
69
  max_steps: 32
70
  reset_params:
71
+ task_id: 2
72
  action_schema:
73
  target_slot: integer slot index within the workday
74
  operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
75
+ task_id: delivery-triage
76
  grader: graders:grade_task_2
77
  graders:
78
  - graders:grade_task_2
server/app.py CHANGED
@@ -36,7 +36,8 @@ app = create_fastapi_app(
36
 
37
  class GraderRequest(BaseModel):
38
  task_id: str
39
- trajectory: list[dict]
 
40
 
41
  WEB_CSS = dedent(
42
  """\
@@ -473,6 +474,9 @@ def grader(request: GraderRequest) -> JSONResponse:
473
  "quiet-morning": grade_task_0,
474
  "meeting-surgery": grade_task_1,
475
  "delivery-triage": grade_task_2,
 
 
 
476
  }
477
  grader_fn = graders.get(request.task_id)
478
  if grader_fn is None:
@@ -480,7 +484,8 @@ def grader(request: GraderRequest) -> JSONResponse:
480
  {"error": f"Unknown task_id: {request.task_id}", "score": 0.0, "passed": False},
481
  status_code=400,
482
  )
483
- return JSONResponse(grader_fn({"trajectory": request.trajectory}))
 
484
 
485
 
486
  def run(host: str = "0.0.0.0", port: int = 8000) -> None:
 
36
 
37
  class GraderRequest(BaseModel):
38
  task_id: str
39
+ state: dict
40
+ reward: float
41
 
42
  WEB_CSS = dedent(
43
  """\
 
474
  "quiet-morning": grade_task_0,
475
  "meeting-surgery": grade_task_1,
476
  "delivery-triage": grade_task_2,
477
+ "engineer_manager_task_0": grade_task_0,
478
+ "engineer_manager_task_1": grade_task_1,
479
+ "engineer_manager_task_2": grade_task_2,
480
  }
481
  grader_fn = graders.get(request.task_id)
482
  if grader_fn is None:
 
484
  {"error": f"Unknown task_id: {request.task_id}", "score": 0.0, "passed": False},
485
  status_code=400,
486
  )
487
+ score = float(grader_fn(request.state, request.reward))
488
+ return JSONResponse({"task_id": request.task_id, "score": score, "passed": score > 0.0, "reward": score})
489
 
490
 
491
  def run(host: str = "0.0.0.0", port: int = 8000) -> None:
server/engineer_manager_environment.py CHANGED
@@ -8,7 +8,7 @@ import os
8
  from openenv.core.env_server.interfaces import Environment, EnvironmentMetadata
9
  from openenv.core.env_server.types import State
10
 
11
- from benchmark_tasks import TASK_SPECS, apply_task, grade_trajectory
12
  from focus_resource_env import FocusResourceEnv
13
 
14
  try:
@@ -38,6 +38,7 @@ class EngineerManagerEnvironment(
38
  self._distraction_risk = distraction_risk
39
  self._seed = seed
40
  self._task_name = task_name or os.getenv("TASK_NAME")
 
41
  self._step_count = 0
42
  self._episode_id = str(uuid4())
43
  self._trajectory: list[dict[str, object]] = []
@@ -53,10 +54,17 @@ class EngineerManagerEnvironment(
53
  seed: int | None = None,
54
  episode_id: str | None = None,
55
  task_name: str | None = None,
 
56
  **_: object,
57
  ) -> EngineerManagerObservation:
58
  self._seed = self._seed if seed is None else seed
59
- self._task_name = task_name or self._task_name or os.getenv("TASK_NAME")
 
 
 
 
 
 
60
  self._episode_id = episode_id or str(uuid4())
61
  self._step_count = 0
62
  self._trajectory = []
@@ -125,6 +133,7 @@ class EngineerManagerEnvironment(
125
  payload["done"] = done
126
  metadata = dict(info or {})
127
  metadata["task_name"] = self._task_name
 
128
  metadata["episode_metrics"] = {
129
  "interruptions": int(self._env.interruptions),
130
  "invalid_actions": int(self._env.invalid_actions),
@@ -136,7 +145,7 @@ class EngineerManagerEnvironment(
136
  if step["info"].get("action_info", {}).get("status") == "meeting_rescheduled"
137
  ),
138
  "total_score": float(self._env._total_score()),
139
- "grader_score": grade_trajectory(self._task_name or "", self._trajectory) if self._trajectory else 0.0,
140
  }
141
  payload["metadata"] = metadata
142
  return EngineerManagerObservation.model_validate(payload)
 
8
  from openenv.core.env_server.interfaces import Environment, EnvironmentMetadata
9
  from openenv.core.env_server.types import State
10
 
11
+ from benchmark_tasks import TASK_SPECS, apply_task
12
  from focus_resource_env import FocusResourceEnv
13
 
14
  try:
 
38
  self._distraction_risk = distraction_risk
39
  self._seed = seed
40
  self._task_name = task_name or os.getenv("TASK_NAME")
41
+ self._task_id = 0
42
  self._step_count = 0
43
  self._episode_id = str(uuid4())
44
  self._trajectory: list[dict[str, object]] = []
 
54
  seed: int | None = None,
55
  episode_id: str | None = None,
56
  task_name: str | None = None,
57
+ task_id: int | None = None,
58
  **_: object,
59
  ) -> EngineerManagerObservation:
60
  self._seed = self._seed if seed is None else seed
61
+ task_names = ["quiet-morning", "meeting-surgery", "delivery-triage"]
62
+ if task_id is not None and 0 <= int(task_id) < len(task_names):
63
+ self._task_id = int(task_id)
64
+ self._task_name = task_names[self._task_id]
65
+ else:
66
+ self._task_name = task_name or self._task_name or os.getenv("TASK_NAME")
67
+ self._task_id = task_names.index(self._task_name) if self._task_name in task_names else 0
68
  self._episode_id = episode_id or str(uuid4())
69
  self._step_count = 0
70
  self._trajectory = []
 
133
  payload["done"] = done
134
  metadata = dict(info or {})
135
  metadata["task_name"] = self._task_name
136
+ metadata["task_id"] = self._task_id
137
  metadata["episode_metrics"] = {
138
  "interruptions": int(self._env.interruptions),
139
  "invalid_actions": int(self._env.invalid_actions),
 
145
  if step["info"].get("action_info", {}).get("status") == "meeting_rescheduled"
146
  ),
147
  "total_score": float(self._env._total_score()),
148
+ "grader_score": min(max(float(reward or 0.0), 0.0), 1.0),
149
  }
150
  payload["metadata"] = metadata
151
  return EngineerManagerObservation.model_validate(payload)
tasks.py CHANGED
@@ -11,10 +11,11 @@ TASKS = [
11
  "difficulty": "easy",
12
  "description": TASK_SPECS["quiet-morning"].description,
13
  "max_steps": 32,
14
- "reset_params": {"task_name": "quiet-morning"},
15
  "action_schema": {
16
  "target_slot": "integer slot index within the workday",
17
  "operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
 
18
  },
19
  "grader": "graders:grade_task_0",
20
  "graders": ["graders:grade_task_0"],
@@ -27,10 +28,11 @@ TASKS = [
27
  "difficulty": "medium",
28
  "description": TASK_SPECS["meeting-surgery"].description,
29
  "max_steps": 32,
30
- "reset_params": {"task_name": "meeting-surgery"},
31
  "action_schema": {
32
  "target_slot": "integer slot index within the workday",
33
  "operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
 
34
  },
35
  "grader": "graders:grade_task_1",
36
  "graders": ["graders:grade_task_1"],
@@ -43,10 +45,11 @@ TASKS = [
43
  "difficulty": "hard",
44
  "description": TASK_SPECS["delivery-triage"].description,
45
  "max_steps": 32,
46
- "reset_params": {"task_name": "delivery-triage"},
47
  "action_schema": {
48
  "target_slot": "integer slot index within the workday",
49
  "operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
 
50
  },
51
  "grader": "graders:grade_task_2",
52
  "graders": ["graders:grade_task_2"],
 
11
  "difficulty": "easy",
12
  "description": TASK_SPECS["quiet-morning"].description,
13
  "max_steps": 32,
14
+ "reset_params": {"task_id": 0},
15
  "action_schema": {
16
  "target_slot": "integer slot index within the workday",
17
  "operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
18
+ "task_id": "quiet-morning",
19
  },
20
  "grader": "graders:grade_task_0",
21
  "graders": ["graders:grade_task_0"],
 
28
  "difficulty": "medium",
29
  "description": TASK_SPECS["meeting-surgery"].description,
30
  "max_steps": 32,
31
+ "reset_params": {"task_id": 1},
32
  "action_schema": {
33
  "target_slot": "integer slot index within the workday",
34
  "operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
35
+ "task_id": "meeting-surgery",
36
  },
37
  "grader": "graders:grade_task_1",
38
  "graders": ["graders:grade_task_1"],
 
45
  "difficulty": "hard",
46
  "description": TASK_SPECS["delivery-triage"].description,
47
  "max_steps": 32,
48
+ "reset_params": {"task_id": 2},
49
  "action_schema": {
50
  "target_slot": "integer slot index within the workday",
51
  "operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
52
+ "task_id": "delivery-triage",
53
  },
54
  "grader": "graders:grade_task_2",
55
  "graders": ["graders:grade_task_2"],