README.md CHANGED
@@ -43,13 +43,3 @@ openenv validate http://127.0.0.1:8000
43
  - `task_buffer`: pending tasks with estimated duration and hidden complexity
44
  - `flow_score`, `social_debt`, `calendar_churn`: core scoring metrics
45
  - `current_slot`, `current_time`, `recovery_state`, `mute_comms`: live execution state
46
-
47
- ## Built-in benchmark tasks
48
-
49
- Set `TASK_NAME` to select a deterministic scenario before reset. Available tasks:
50
-
51
- - `quiet-morning`: high-noise start where muting comms early and protecting focus is rewarded
52
- - `meeting-surgery`: fragmented calendar where selective meeting moves should improve flow
53
- - `delivery-triage`: constrained delivery day with hidden task complexity and tighter tradeoffs
54
-
55
- Each task has a grader in [benchmark_tasks.py](/C:/Users/arshi/OneDrive/Desktop/idk/engineer-manager/benchmark_tasks.py:1). The environment also exposes task metadata and the current grader score in `observation.metadata.episode_metrics.grader_score`.
 
43
  - `task_buffer`: pending tasks with estimated duration and hidden complexity
44
  - `flow_score`, `social_debt`, `calendar_churn`: core scoring metrics
45
  - `current_slot`, `current_time`, `recovery_state`, `mute_comms`: live execution state
 
 
 
 
 
 
 
 
 
 
benchmark_tasks.py DELETED
@@ -1,189 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import os
4
- from dataclasses import dataclass
5
- from typing import Any, Callable
6
-
7
- from focus_resource_env import DEEP_WORK, EMPTY, MEETING, FocusResourceEnv, Task
8
-
9
-
10
- StepRecord = dict[str, Any]
11
- TaskSetup = Callable[[FocusResourceEnv], None]
12
- TaskGrader = Callable[[list[StepRecord]], float]
13
-
14
-
15
- def _reset_state(env: FocusResourceEnv) -> None:
16
- env.timeline[:] = EMPTY
17
- env.meeting_meta = {}
18
- env.task_buffer = []
19
- env.current_slot = 0
20
- env.current_work_streak_slots = 0
21
- env.recovery_remaining = 0
22
- env.mute_comms = False
23
- env.social_debt = 0.0
24
- env.calendar_churn = 0
25
- env.flow_score = 0.0
26
- env.last_executed_kind = EMPTY
27
- env.interruptions = 0
28
- env.invalid_actions = 0
29
-
30
-
31
- def _set_meeting(
32
- env: FocusResourceEnv,
33
- *,
34
- start: int,
35
- length: int,
36
- priority: int,
37
- meeting_id: int,
38
- ) -> None:
39
- env._place_meeting(start, length, priority, meeting_id)
40
-
41
-
42
- def _normalized_total_score(env: FocusResourceEnv) -> float:
43
- max_score = max(1.0, (env.timeline_length * 0.5) ** 2)
44
- return min(1.0, max(0.0, env._total_score() / max_score))
45
-
46
-
47
- def setup_quiet_morning(env: FocusResourceEnv) -> None:
48
- _reset_state(env)
49
- env.distraction_risk = 0.65
50
- env.task_buffer = [
51
- Task(duration=2, hidden_complexity=1.0),
52
- Task(duration=3, hidden_complexity=1.0),
53
- Task(duration=2, hidden_complexity=1.25),
54
- ]
55
- _set_meeting(env, start=5, length=1, priority=4, meeting_id=1)
56
- _set_meeting(env, start=7, length=1, priority=3, meeting_id=2)
57
-
58
-
59
- def setup_meeting_surgery(env: FocusResourceEnv) -> None:
60
- _reset_state(env)
61
- env.distraction_risk = 0.10
62
- env.task_buffer = [
63
- Task(duration=2, hidden_complexity=1.0),
64
- Task(duration=2, hidden_complexity=1.25),
65
- Task(duration=1, hidden_complexity=1.0),
66
- ]
67
- _set_meeting(env, start=1, length=1, priority=2, meeting_id=1)
68
- _set_meeting(env, start=3, length=1, priority=2, meeting_id=2)
69
- _set_meeting(env, start=6, length=2, priority=8, meeting_id=3)
70
-
71
-
72
- def setup_delivery_triage(env: FocusResourceEnv) -> None:
73
- _reset_state(env)
74
- env.distraction_risk = 0.25
75
- env.task_buffer = [
76
- Task(duration=3, hidden_complexity=1.5),
77
- Task(duration=2, hidden_complexity=1.0),
78
- Task(duration=1, hidden_complexity=1.0),
79
- ]
80
- _set_meeting(env, start=4, length=1, priority=9, meeting_id=1)
81
- _set_meeting(env, start=8, length=2, priority=7, meeting_id=2)
82
-
83
-
84
- def grade_quiet_morning(trajectory: list[StepRecord]) -> float:
85
- if not trajectory:
86
- return 0.0
87
- first_action = int(trajectory[0]["action"]["operation"])
88
- final = trajectory[-1]["observation"]
89
- final_score = float(final["flow_score"])
90
- transition_count = sum(1 for step in trajectory if step["info"]["transition_info"]["interrupted"])
91
- scheduled = sum(1 for slot in final["timeline"] if int(slot) == DEEP_WORK)
92
-
93
- score = 0.0
94
- score += 0.25 if first_action == 3 else 0.0
95
- score += min(0.45, final_score / 6.0)
96
- score += 0.15 if transition_count == 0 else 0.0
97
- score += min(0.15, scheduled / 6.0)
98
- return min(1.0, round(score, 4))
99
-
100
-
101
- def grade_meeting_surgery(trajectory: list[StepRecord]) -> float:
102
- if not trajectory:
103
- return 0.0
104
- final = trajectory[-1]["observation"]
105
- flow = float(final["flow_score"])
106
- debt = float(final["social_debt"])
107
- churn = int(final["calendar_churn"])
108
- reschedules = sum(
109
- 1
110
- for step in trajectory
111
- if step["info"].get("action_info", {}).get("status") == "meeting_rescheduled"
112
- )
113
-
114
- score = 0.0
115
- score += min(0.40, flow / 5.0)
116
- score += 0.20 if reschedules >= 1 else 0.0
117
- score += 0.20 if 1 <= churn <= 2 else max(0.0, 0.20 - (0.10 * abs(churn - 1)))
118
- score += max(0.0, 0.20 - (debt / 8.0))
119
- return min(1.0, round(score, 4))
120
-
121
-
122
- def grade_delivery_triage(trajectory: list[StepRecord]) -> float:
123
- if not trajectory:
124
- return 0.0
125
- final = trajectory[-1]["observation"]
126
- total = float(final["flow_score"]) - float(final["social_debt"]) - float(final["calendar_churn"])
127
- invalid_actions = sum(
128
- 1
129
- for step in trajectory
130
- if str(step["info"].get("action_info", {}).get("status", "")).startswith("invalid")
131
- )
132
- remaining_tasks = len(final["task_buffer"])
133
- scheduled = sum(1 for slot in final["timeline"] if int(slot) == DEEP_WORK)
134
-
135
- score = 0.0
136
- score += min(0.45, max(0.0, total) / 6.0)
137
- score += min(0.25, scheduled / 8.0)
138
- score += 0.20 if remaining_tasks <= 1 else 0.10 if remaining_tasks == 2 else 0.0
139
- score += max(0.0, 0.10 - (0.05 * invalid_actions))
140
- return min(1.0, round(score, 4))
141
-
142
-
143
- @dataclass(frozen=True)
144
- class TaskSpec:
145
- name: str
146
- description: str
147
- setup: TaskSetup
148
- grader: TaskGrader
149
-
150
-
151
- TASK_SPECS: dict[str, TaskSpec] = {
152
- "quiet-morning": TaskSpec(
153
- name="quiet-morning",
154
- description="High-noise morning where the agent should mute comms early and protect an uninterrupted work block.",
155
- setup=setup_quiet_morning,
156
- grader=grade_quiet_morning,
157
- ),
158
- "meeting-surgery": TaskSpec(
159
- name="meeting-surgery",
160
- description="A fragmented calendar where the agent should improve flow with limited, selective meeting moves.",
161
- setup=setup_meeting_surgery,
162
- grader=grade_meeting_surgery,
163
- ),
164
- "delivery-triage": TaskSpec(
165
- name="delivery-triage",
166
- description="A constrained day with hidden task complexity where the agent must schedule useful work without spiraling debt.",
167
- setup=setup_delivery_triage,
168
- grader=grade_delivery_triage,
169
- ),
170
- }
171
-
172
-
173
- DEFAULT_TASK_NAME = "quiet-morning"
174
-
175
-
176
- def get_task_spec(task_name: str | None) -> TaskSpec:
177
- normalized = (task_name or os.getenv("TASK_NAME") or DEFAULT_TASK_NAME).strip()
178
- return TASK_SPECS.get(normalized, TASK_SPECS[DEFAULT_TASK_NAME])
179
-
180
-
181
- def apply_task(env: FocusResourceEnv, task_name: str | None) -> TaskSpec:
182
- spec = get_task_spec(task_name)
183
- spec.setup(env)
184
- return spec
185
-
186
-
187
- def grade_trajectory(task_name: str, trajectory: list[StepRecord]) -> float:
188
- spec = get_task_spec(task_name)
189
- return spec.grader(trajectory)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
engineer-manager DELETED
@@ -1 +0,0 @@
1
- Subproject commit 40c10a7795c79e546608c011d3aff8820e1e479c
 
 
graders.py DELETED
@@ -1,68 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import Any
4
-
5
-
6
- TASK_NAMES = {
7
- 0: "quiet-morning",
8
- 1: "meeting-surgery",
9
- 2: "delivery-triage",
10
- }
11
-
12
-
13
- def _normalize_reward(reward: float) -> float:
14
- return min(max(float(reward), 0.0), 1.0)
15
-
16
-
17
- def _state_task_id(state: Any) -> int | None:
18
- if not isinstance(state, dict):
19
- return None
20
- task_id = state.get("task_id")
21
- if isinstance(task_id, int):
22
- return task_id
23
- task_name = state.get("task_name")
24
- if isinstance(task_name, str):
25
- for index, name in TASK_NAMES.items():
26
- if name == task_name:
27
- return index
28
- metadata = state.get("metadata")
29
- if isinstance(metadata, dict):
30
- nested_task_id = metadata.get("task_id")
31
- if isinstance(nested_task_id, int):
32
- return nested_task_id
33
- return None
34
-
35
-
36
- def grade_task_0(state: dict, reward: float) -> float:
37
- return _normalize_reward(reward if _state_task_id(state) == 0 else 0.0)
38
-
39
-
40
- def grade_task_1(state: dict, reward: float) -> float:
41
- return _normalize_reward(reward if _state_task_id(state) == 1 else 0.0)
42
-
43
-
44
- def grade_task_2(state: dict, reward: float) -> float:
45
- return _normalize_reward(reward if _state_task_id(state) == 2 else 0.0)
46
-
47
-
48
- GRADERS = {
49
- "engineer_manager_task_0": grade_task_0,
50
- "engineer_manager_task_1": grade_task_1,
51
- "engineer_manager_task_2": grade_task_2,
52
- }
53
-
54
-
55
- TASK_GRADER_PAIRS = [
56
- ("engineer_manager_task_0", grade_task_0),
57
- ("engineer_manager_task_1", grade_task_1),
58
- ("engineer_manager_task_2", grade_task_2),
59
- ]
60
-
61
-
62
- __all__ = [
63
- "grade_task_0",
64
- "grade_task_1",
65
- "grade_task_2",
66
- "GRADERS",
67
- "TASK_GRADER_PAIRS",
68
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
inference.py CHANGED
@@ -3,21 +3,15 @@ import json
3
  import math
4
  import os
5
  import textwrap
6
- from dataclasses import dataclass
7
  from typing import Any
8
 
9
  from openai import OpenAI
10
  from openenv.core.generic_client import GenericEnvClient
11
 
12
- try:
13
- from server.engineer_manager_environment import EngineerManagerEnvironment
14
- except ImportError:
15
- EngineerManagerEnvironment = None # type: ignore[assignment]
16
-
17
 
18
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
19
  MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
20
- API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
21
  LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
22
  OPENENV_BASE_URL = os.getenv("OPENENV_BASE_URL")
23
  TASK_NAME = os.getenv("TASK_NAME", "engineer-manager")
@@ -52,52 +46,15 @@ SYSTEM_PROMPT = textwrap.dedent(
52
  ).strip()
53
 
54
 
55
- @dataclass
56
- class _EnvResult:
57
- observation: dict[str, Any]
58
- reward: float | None
59
- done: bool
60
-
61
-
62
- class _InProcessEnvClient:
63
- def __init__(self) -> None:
64
- if EngineerManagerEnvironment is None:
65
- raise RuntimeError("Bundled EngineerManagerEnvironment is unavailable")
66
- self._env = EngineerManagerEnvironment()
67
-
68
- async def connect(self) -> None:
69
- return None
70
-
71
- async def reset(self) -> _EnvResult:
72
- observation = self._env.reset().model_dump()
73
- return _EnvResult(
74
- observation=dict(observation),
75
- reward=float(observation.get("reward") or 0.0),
76
- done=bool(observation.get("done")),
77
- )
78
-
79
- async def step(self, action: dict[str, int]) -> _EnvResult:
80
- observation = self._env.step(type("Action", (), action)()).model_dump()
81
- return _EnvResult(
82
- observation=dict(observation),
83
- reward=float(observation.get("reward") or 0.0),
84
- done=bool(observation.get("done")),
85
- )
86
-
87
- async def close(self) -> None:
88
- return None
89
 
90
 
91
  def _sanitize_field(value: Any) -> str:
92
- return str(value).replace("\r", " ").replace("\n", " ").strip()
93
-
94
-
95
- def _format_error(error: str | None) -> str:
96
- return "null" if error in (None, "") else _sanitize_field(error)
97
-
98
-
99
- def _action_to_text(action: dict[str, int]) -> str:
100
- return f'{{"target_slot":{int(action["target_slot"])},"operation":{int(action["operation"])}}}'
101
 
102
 
103
  def log_start(task: str, env: str, model: str) -> None:
@@ -107,10 +64,17 @@ def log_start(task: str, env: str, model: str) -> None:
107
  )
108
 
109
 
110
- def log_step(step: int, action: str, reward: float, done: bool, error: str | None) -> None:
 
 
 
 
 
 
 
111
  print(
112
  f"[STEP] step={step} action={_sanitize_field(action)} reward={reward:.2f} "
113
- f"done={str(done).lower()} error={_format_error(error)}",
114
  flush=True,
115
  )
116
 
@@ -127,12 +91,15 @@ def estimate_max_flow_score(timeline: list[int]) -> float:
127
  slot_count = len(timeline)
128
  if slot_count <= 0:
129
  return 1.0
130
- return max(1.0, (slot_count * 0.5) ** 2)
 
131
 
132
 
133
  def normalize_score(total_reward: float, observation: dict[str, Any]) -> float:
134
- max_score = estimate_max_flow_score(observation.get("timeline") or [])
135
- return min(1.0, max(0.0, total_reward / max_score))
 
 
136
 
137
 
138
  def first_future_slot(observation: dict[str, Any], kind: int) -> int | None:
@@ -144,12 +111,17 @@ def first_future_slot(observation: dict[str, Any], kind: int) -> int | None:
144
  return None
145
 
146
 
 
 
 
 
147
  def build_user_prompt(
148
  step: int,
149
  observation: dict[str, Any],
150
  rewards: list[float],
151
  history: list[str],
152
  ) -> str:
 
153
  metadata = observation.get("metadata") or {}
154
  return textwrap.dedent(
155
  f"""
@@ -162,10 +134,10 @@ def build_user_prompt(
162
  social_debt={float(observation.get("social_debt", 0.0)):.2f}
163
  calendar_churn={int(observation.get("calendar_churn", 0))}
164
  recovery_state={int(observation.get("recovery_state", 0))}
165
- timeline={json.dumps(observation.get("timeline", []), separators=(",", ":"))}
166
  task_buffer={json.dumps(observation.get("task_buffer", []), separators=(",", ":"))}
167
  last_rewards={",".join(f"{reward:.2f}" for reward in rewards[-5:]) or "none"}
168
- recent_history={json.dumps(history[-5:], separators=(",", ":"))}
169
  last_metadata={json.dumps(metadata, separators=(",", ":"))}
170
  Choose the single next action.
171
  """
@@ -176,24 +148,15 @@ def choose_fallback_action(observation: dict[str, Any]) -> dict[str, int]:
176
  current_slot = int(observation.get("current_slot", 0))
177
  distraction_risk = float(observation.get("distraction_risk", 0.0))
178
  mute_comms = bool(observation.get("mute_comms", False))
179
- recovery_state = int(observation.get("recovery_state", 0))
180
- timeline = observation.get("timeline") or []
181
-
182
- if current_slot == 0 and distraction_risk > 0.0 and not mute_comms:
183
  return {"target_slot": current_slot, "operation": 3}
184
 
185
- if recovery_state > 0:
186
- return {"target_slot": current_slot, "operation": 0}
187
-
188
- if current_slot < len(timeline) and int(timeline[current_slot]) == 0 and observation.get("task_buffer"):
189
- return {"target_slot": current_slot, "operation": 1}
190
-
191
- empty_slot = first_future_slot(observation, 0)
192
  if empty_slot is not None and observation.get("task_buffer"):
193
  return {"target_slot": empty_slot, "operation": 1}
194
 
195
  meeting_slot = first_future_slot(observation, 2)
196
- if meeting_slot is not None:
197
  return {"target_slot": meeting_slot, "operation": 2}
198
 
199
  return {"target_slot": current_slot, "operation": 0}
@@ -201,8 +164,8 @@ def choose_fallback_action(observation: dict[str, Any]) -> dict[str, int]:
201
 
202
  def coerce_action(raw_text: str, observation: dict[str, Any]) -> dict[str, int]:
203
  timeline = observation.get("timeline") or []
204
- fallback = choose_fallback_action(observation)
205
  max_slot = max(0, len(timeline) - 1)
 
206
  try:
207
  data = json.loads(raw_text)
208
  target_slot = int(data["target_slot"])
@@ -212,7 +175,8 @@ def coerce_action(raw_text: str, observation: dict[str, Any]) -> dict[str, int]:
212
 
213
  if operation not in {0, 1, 2, 3}:
214
  return fallback
215
- return {"target_slot": min(max(target_slot, 0), max_slot), "operation": operation}
 
216
 
217
 
218
  def get_model_action(
@@ -239,81 +203,67 @@ def get_model_action(
239
  return choose_fallback_action(observation)
240
 
241
 
242
- async def create_env() -> Any:
243
  if OPENENV_BASE_URL:
244
  env = GenericEnvClient(base_url=OPENENV_BASE_URL)
245
  await env.connect()
246
  return env
247
 
248
- if LOCAL_IMAGE_NAME:
249
- try:
250
- return await GenericEnvClient.from_docker_image(LOCAL_IMAGE_NAME)
251
- except Exception:
252
- pass
253
-
254
- env = _InProcessEnvClient()
255
- await env.connect()
256
- return env
257
 
258
 
259
  async def main() -> None:
 
 
260
  env = None
261
  rewards: list[float] = []
262
  history: list[str] = []
263
  steps_taken = 0
264
  success = False
265
  score = 0.0
266
- observation: dict[str, Any] = {}
267
- completed = False
268
 
269
  log_start(TASK_NAME, BENCHMARK, MODEL_NAME)
270
 
271
  try:
272
- client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY) if API_KEY else None
273
  env = await create_env()
274
  result = await env.reset()
275
  observation = dict(result.observation)
276
 
277
  for step in range(1, MAX_STEPS + 1):
278
  if result.done:
279
- completed = True
280
  break
281
 
282
- if client is None:
283
- action = choose_fallback_action(observation)
284
- else:
285
- action = get_model_action(client, step, observation, rewards, history)
286
- action_text = _action_to_text(action)
287
- step_error: str | None = None
288
 
289
- try:
290
- result = await env.step(action)
291
- observation = dict(result.observation)
292
- reward = float(result.reward or 0.0)
293
- done = bool(result.done)
294
- metadata = observation.get("metadata") or {}
295
- step_error = metadata.get("last_action_error")
296
- except Exception as exc:
297
- reward = 0.0
298
- done = True
299
- step_error = str(exc)
300
 
301
  rewards.append(reward)
302
  steps_taken = step
303
- log_step(step, action_text, reward, done, step_error)
 
 
 
 
 
304
  history.append(
305
- f"step={step} action={action_text} reward={reward:.2f} error={_format_error(step_error)}"
 
 
306
  )
307
 
308
  if done:
309
- completed = True
310
  break
311
 
312
- score = round(normalize_score(math.fsum(rewards), observation), 2)
313
- success = completed and score >= 0.0
314
- except Exception:
315
- success = False
316
- score = 0.0
317
  finally:
318
  if env is not None:
319
  try:
 
3
  import math
4
  import os
5
  import textwrap
 
6
  from typing import Any
7
 
8
  from openai import OpenAI
9
  from openenv.core.generic_client import GenericEnvClient
10
 
 
 
 
 
 
11
 
12
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
13
  MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
14
+ HF_TOKEN = os.getenv("HF_TOKEN")
15
  LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
16
  OPENENV_BASE_URL = os.getenv("OPENENV_BASE_URL")
17
  TASK_NAME = os.getenv("TASK_NAME", "engineer-manager")
 
46
  ).strip()
47
 
48
 
49
+ def _require_env(name: str, value: str | None) -> str:
50
+ if value:
51
+ return value
52
+ raise RuntimeError(f"Missing required environment variable: {name}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
 
55
  def _sanitize_field(value: Any) -> str:
56
+ text = str(value)
57
+ return text.replace("\r", " ").replace("\n", " ").strip()
 
 
 
 
 
 
 
58
 
59
 
60
  def log_start(task: str, env: str, model: str) -> None:
 
64
  )
65
 
66
 
67
+ def log_step(
68
+ step: int,
69
+ action: str,
70
+ reward: float,
71
+ done: bool,
72
+ error: str | None,
73
+ ) -> None:
74
+ error_text = "null" if error in (None, "") else _sanitize_field(error)
75
  print(
76
  f"[STEP] step={step} action={_sanitize_field(action)} reward={reward:.2f} "
77
+ f"done={str(done).lower()} error={error_text}",
78
  flush=True,
79
  )
80
 
 
91
  slot_count = len(timeline)
92
  if slot_count <= 0:
93
  return 1.0
94
+ hours = slot_count * 0.5
95
+ return max(1.0, hours * hours)
96
 
97
 
98
  def normalize_score(total_reward: float, observation: dict[str, Any]) -> float:
99
+ timeline = observation.get("timeline") or []
100
+ max_score = estimate_max_flow_score(timeline)
101
+ normalized = total_reward / max_score
102
+ return min(1.0, max(0.0, normalized))
103
 
104
 
105
  def first_future_slot(observation: dict[str, Any], kind: int) -> int | None:
 
111
  return None
112
 
113
 
114
+ def first_future_empty_slot(observation: dict[str, Any]) -> int | None:
115
+ return first_future_slot(observation, 0)
116
+
117
+
118
  def build_user_prompt(
119
  step: int,
120
  observation: dict[str, Any],
121
  rewards: list[float],
122
  history: list[str],
123
  ) -> str:
124
+ timeline = observation.get("timeline") or []
125
  metadata = observation.get("metadata") or {}
126
  return textwrap.dedent(
127
  f"""
 
134
  social_debt={float(observation.get("social_debt", 0.0)):.2f}
135
  calendar_churn={int(observation.get("calendar_churn", 0))}
136
  recovery_state={int(observation.get("recovery_state", 0))}
137
+ timeline={timeline}
138
  task_buffer={json.dumps(observation.get("task_buffer", []), separators=(",", ":"))}
139
  last_rewards={",".join(f"{reward:.2f}" for reward in rewards[-5:]) or "none"}
140
+ recent_history={json.dumps(history[-5:])}
141
  last_metadata={json.dumps(metadata, separators=(",", ":"))}
142
  Choose the single next action.
143
  """
 
148
  current_slot = int(observation.get("current_slot", 0))
149
  distraction_risk = float(observation.get("distraction_risk", 0.0))
150
  mute_comms = bool(observation.get("mute_comms", False))
151
+ if distraction_risk >= 0.2 and not mute_comms:
 
 
 
152
  return {"target_slot": current_slot, "operation": 3}
153
 
154
+ empty_slot = first_future_empty_slot(observation)
 
 
 
 
 
 
155
  if empty_slot is not None and observation.get("task_buffer"):
156
  return {"target_slot": empty_slot, "operation": 1}
157
 
158
  meeting_slot = first_future_slot(observation, 2)
159
+ if meeting_slot is not None and current_slot <= meeting_slot:
160
  return {"target_slot": meeting_slot, "operation": 2}
161
 
162
  return {"target_slot": current_slot, "operation": 0}
 
164
 
165
  def coerce_action(raw_text: str, observation: dict[str, Any]) -> dict[str, int]:
166
  timeline = observation.get("timeline") or []
 
167
  max_slot = max(0, len(timeline) - 1)
168
+ fallback = choose_fallback_action(observation)
169
  try:
170
  data = json.loads(raw_text)
171
  target_slot = int(data["target_slot"])
 
175
 
176
  if operation not in {0, 1, 2, 3}:
177
  return fallback
178
+ target_slot = min(max(target_slot, 0), max_slot)
179
+ return {"target_slot": target_slot, "operation": operation}
180
 
181
 
182
  def get_model_action(
 
203
  return choose_fallback_action(observation)
204
 
205
 
206
+ async def create_env() -> GenericEnvClient:
207
  if OPENENV_BASE_URL:
208
  env = GenericEnvClient(base_url=OPENENV_BASE_URL)
209
  await env.connect()
210
  return env
211
 
212
+ image_name = _require_env("LOCAL_IMAGE_NAME", LOCAL_IMAGE_NAME)
213
+ return await GenericEnvClient.from_docker_image(image_name)
 
 
 
 
 
 
 
214
 
215
 
216
  async def main() -> None:
217
+ api_key = _require_env("HF_TOKEN", HF_TOKEN)
218
+ client = OpenAI(base_url=API_BASE_URL, api_key=api_key)
219
  env = None
220
  rewards: list[float] = []
221
  history: list[str] = []
222
  steps_taken = 0
223
  success = False
224
  score = 0.0
 
 
225
 
226
  log_start(TASK_NAME, BENCHMARK, MODEL_NAME)
227
 
228
  try:
 
229
  env = await create_env()
230
  result = await env.reset()
231
  observation = dict(result.observation)
232
 
233
  for step in range(1, MAX_STEPS + 1):
234
  if result.done:
 
235
  break
236
 
237
+ action = get_model_action(client, step, observation, rewards, history)
238
+ result = await env.step(action)
239
+ observation = dict(result.observation)
 
 
 
240
 
241
+ reward = float(result.reward or 0.0)
242
+ done = bool(result.done)
243
+ metadata = observation.get("metadata") or {}
244
+ error = metadata.get("last_action_error")
 
 
 
 
 
 
 
245
 
246
  rewards.append(reward)
247
  steps_taken = step
248
+
249
+ action_text = (
250
+ f"target_slot={int(action['target_slot'])},operation={int(action['operation'])}"
251
+ )
252
+ log_step(step, action_text, reward, done, error)
253
+
254
  history.append(
255
+ f"step={step} action={action_text} reward={reward:.2f} "
256
+ f"flow={float(observation.get('flow_score', 0.0)):.2f} "
257
+ f"debt={float(observation.get('social_debt', 0.0)):.2f}"
258
  )
259
 
260
  if done:
 
261
  break
262
 
263
+ total_reward = math.fsum(rewards)
264
+ score = normalize_score(total_reward, observation if "observation" in locals() else {})
265
+ score = round(score, 2)
266
+ success = score > 0.0
 
267
  finally:
268
  if env is not None:
269
  try:
openenv.yaml CHANGED
@@ -4,82 +4,3 @@ type: space
4
  runtime: fastapi
5
  app: server.app:app
6
  port: 8000
7
- version: 0.1.0
8
- entry_point: server.engineer_manager_environment:EngineerManagerEnvironment
9
- api:
10
- base_url: /
11
- endpoints:
12
- reset:
13
- method: POST
14
- path: /reset
15
- step:
16
- method: POST
17
- path: /step
18
- state:
19
- method: GET
20
- path: /state
21
- tasks:
22
- method: GET
23
- path: /tasks
24
- grader:
25
- method: POST
26
- path: /grader
27
- tasks:
28
- - id: engineer_manager_task_0
29
- task_id: quiet-morning
30
- name: quiet-morning
31
- difficulty: easy
32
- description: High-noise morning where the agent should mute comms early and protect an uninterrupted work block.
33
- max_steps: 32
34
- reset_params:
35
- task_id: 0
36
- action_schema:
37
- target_slot: integer slot index within the workday
38
- operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
39
- task_id: quiet-morning
40
- grader: graders:grade_task_0
41
- graders:
42
- - graders:grade_task_0
43
- reward_range:
44
- - 0.0
45
- - 1.0
46
- - id: engineer_manager_task_1
47
- task_id: meeting-surgery
48
- name: meeting-surgery
49
- difficulty: medium
50
- description: Fragmented calendar where selective meeting moves should improve flow.
51
- max_steps: 32
52
- reset_params:
53
- task_id: 1
54
- action_schema:
55
- target_slot: integer slot index within the workday
56
- operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
57
- task_id: meeting-surgery
58
- grader: graders:grade_task_1
59
- graders:
60
- - graders:grade_task_1
61
- reward_range:
62
- - 0.0
63
- - 1.0
64
- - id: engineer_manager_task_2
65
- task_id: delivery-triage
66
- name: delivery-triage
67
- difficulty: hard
68
- description: Constrained delivery day with hidden task complexity and tighter tradeoffs.
69
- max_steps: 32
70
- reset_params:
71
- task_id: 2
72
- action_schema:
73
- target_slot: integer slot index within the workday
74
- operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
75
- task_id: delivery-triage
76
- grader: graders:grade_task_2
77
- graders:
78
- - graders:grade_task_2
79
- reward_range:
80
- - 0.0
81
- - 1.0
82
- graders:
83
- - graders:grade_task_0
84
- - graders:grade_task_1
85
- - graders:grade_task_2
 
4
  runtime: fastapi
5
  app: server.app:app
6
  port: 8000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server/app.py CHANGED
@@ -7,10 +7,6 @@ from textwrap import dedent
7
  import uvicorn
8
  from fastapi.responses import HTMLResponse, JSONResponse, PlainTextResponse, RedirectResponse, Response
9
  from openenv.core.env_server.http_server import create_fastapi_app
10
- from pydantic import BaseModel
11
-
12
- from graders import grade_task_0, grade_task_1, grade_task_2
13
- from tasks import TASKS
14
 
15
  try:
16
  from ..models import EngineerManagerAction, EngineerManagerObservation
@@ -33,12 +29,6 @@ app = create_fastapi_app(
33
  max_concurrent_envs=2,
34
  )
35
 
36
-
37
- class GraderRequest(BaseModel):
38
- task_id: str
39
- state: dict
40
- reward: float
41
-
42
  WEB_CSS = dedent(
43
  """\
44
  :root {
@@ -446,8 +436,6 @@ def web_js() -> PlainTextResponse:
446
  @app.get("/favicon.ico", include_in_schema=False)
447
  def favicon() -> Response:
448
  return Response(status_code=204)
449
-
450
-
451
  @app.get("/manifest.json", include_in_schema=False)
452
  def manifest() -> JSONResponse:
453
  return JSONResponse(
@@ -463,42 +451,6 @@ def manifest() -> JSONResponse:
463
  )
464
 
465
 
466
- @app.get("/tasks", include_in_schema=False)
467
- def tasks() -> list[dict]:
468
- return TASKS
469
-
470
-
471
- @app.post("/grader", include_in_schema=False)
472
- def grader(request: GraderRequest) -> JSONResponse:
473
- task_index_map = {
474
- "quiet-morning": 0,
475
- "engineer_manager_task_0": 0,
476
- "meeting-surgery": 1,
477
- "engineer_manager_task_1": 1,
478
- "delivery-triage": 2,
479
- "engineer_manager_task_2": 2,
480
- }
481
- grader_fn_map = {
482
- "quiet-morning": grade_task_0,
483
- "meeting-surgery": grade_task_1,
484
- "delivery-triage": grade_task_2,
485
- "engineer_manager_task_0": grade_task_0,
486
- "engineer_manager_task_1": grade_task_1,
487
- "engineer_manager_task_2": grade_task_2,
488
- }
489
- grader_fn = grader_fn_map.get(request.task_id)
490
- if grader_fn is None:
491
- return JSONResponse(
492
- {"error": f"Unknown task_id: {request.task_id}", "score": 0.0, "passed": False},
493
- status_code=400,
494
- )
495
- state = dict(request.state)
496
- if "task_id" not in state or state["task_id"] is None:
497
- state["task_id"] = task_index_map[request.task_id]
498
- score = float(grader_fn(state, request.reward))
499
- return JSONResponse({"task_id": request.task_id, "score": score, "passed": score >= 0.0, "reward": score})
500
-
501
-
502
  def run(host: str = "0.0.0.0", port: int = 8000) -> None:
503
  """Run the OpenEnv HTTP server."""
504
  uvicorn.run(app, host=host, port=port)
 
7
  import uvicorn
8
  from fastapi.responses import HTMLResponse, JSONResponse, PlainTextResponse, RedirectResponse, Response
9
  from openenv.core.env_server.http_server import create_fastapi_app
 
 
 
 
10
 
11
  try:
12
  from ..models import EngineerManagerAction, EngineerManagerObservation
 
29
  max_concurrent_envs=2,
30
  )
31
 
 
 
 
 
 
 
32
  WEB_CSS = dedent(
33
  """\
34
  :root {
 
436
  @app.get("/favicon.ico", include_in_schema=False)
437
  def favicon() -> Response:
438
  return Response(status_code=204)
 
 
439
  @app.get("/manifest.json", include_in_schema=False)
440
  def manifest() -> JSONResponse:
441
  return JSONResponse(
 
451
  )
452
 
453
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
  def run(host: str = "0.0.0.0", port: int = 8000) -> None:
455
  """Run the OpenEnv HTTP server."""
456
  uvicorn.run(app, host=host, port=port)
server/engineer_manager_environment.py CHANGED
@@ -3,12 +3,10 @@
3
  from __future__ import annotations
4
 
5
  from uuid import uuid4
6
- import os
7
 
8
  from openenv.core.env_server.interfaces import Environment, EnvironmentMetadata
9
  from openenv.core.env_server.types import State
10
 
11
- from benchmark_tasks import TASK_SPECS, apply_task
12
  from focus_resource_env import FocusResourceEnv
13
 
14
  try:
@@ -30,18 +28,14 @@ class EngineerManagerEnvironment(
30
  end_hour: str = "17:00",
31
  distraction_risk: float = 0.15,
32
  seed: int | None = 7,
33
- task_name: str | None = None,
34
  ) -> None:
35
  super().__init__()
36
  self._start_hour = start_hour
37
  self._end_hour = end_hour
38
  self._distraction_risk = distraction_risk
39
  self._seed = seed
40
- self._task_name = task_name or os.getenv("TASK_NAME")
41
- self._task_id = 0
42
  self._step_count = 0
43
  self._episode_id = str(uuid4())
44
- self._trajectory: list[dict[str, object]] = []
45
  self._env = FocusResourceEnv(
46
  start_hour=start_hour,
47
  end_hour=end_hour,
@@ -53,30 +47,18 @@ class EngineerManagerEnvironment(
53
  self,
54
  seed: int | None = None,
55
  episode_id: str | None = None,
56
- task_name: str | None = None,
57
- task_id: int | None = None,
58
  **_: object,
59
  ) -> EngineerManagerObservation:
60
  self._seed = self._seed if seed is None else seed
61
- task_names = ["quiet-morning", "meeting-surgery", "delivery-triage"]
62
- if task_id is not None and 0 <= int(task_id) < len(task_names):
63
- self._task_id = int(task_id)
64
- self._task_name = task_names[self._task_id]
65
- else:
66
- self._task_name = task_name or self._task_name or os.getenv("TASK_NAME")
67
- self._task_id = task_names.index(self._task_name) if self._task_name in task_names else 0
68
  self._episode_id = episode_id or str(uuid4())
69
  self._step_count = 0
70
- self._trajectory = []
71
  self._env = FocusResourceEnv(
72
  start_hour=self._start_hour,
73
  end_hour=self._end_hour,
74
  distraction_risk=self._distraction_risk,
75
  seed=self._seed,
76
  )
77
- self._env.reset()
78
- apply_task(self._env, self._task_name)
79
- return self._to_observation(self._env._observation(), reward=0.0, done=False)
80
 
81
  def step(
82
  self,
@@ -89,15 +71,6 @@ class EngineerManagerEnvironment(
89
  (action.target_slot, action.operation)
90
  )
91
  self._step_count += 1
92
- self._trajectory.append(
93
- {
94
- "action": {"target_slot": int(action.target_slot), "operation": int(action.operation)},
95
- "observation": observation,
96
- "reward": float(reward),
97
- "done": bool(done),
98
- "info": info,
99
- }
100
- )
101
  return self._to_observation(observation, reward=reward, done=done, info=info)
102
 
103
  @property
@@ -114,8 +87,7 @@ class EngineerManagerEnvironment(
114
  name="Engineer Manager",
115
  description=(
116
  "Manage a workday by scheduling deep work, rescheduling meetings, "
117
- "and controlling communication noise. "
118
- f"Available tasks: {', '.join(sorted(TASK_SPECS))}."
119
  ),
120
  version="0.1.0",
121
  )
@@ -131,21 +103,5 @@ class EngineerManagerEnvironment(
131
  payload = dict(observation)
132
  payload["reward"] = reward
133
  payload["done"] = done
134
- metadata = dict(info or {})
135
- metadata["task_name"] = self._task_name
136
- metadata["task_id"] = self._task_id
137
- metadata["episode_metrics"] = {
138
- "interruptions": int(self._env.interruptions),
139
- "invalid_actions": int(self._env.invalid_actions),
140
- "remaining_tasks": len(self._env.task_buffer),
141
- "scheduled_work_slots": sum(1 for slot in self._env.timeline if int(slot) == 1),
142
- "successful_reschedules": sum(
143
- 1
144
- for step in self._trajectory
145
- if step["info"].get("action_info", {}).get("status") == "meeting_rescheduled"
146
- ),
147
- "total_score": float(self._env._total_score()),
148
- "grader_score": min(max(float(reward or 0.0), 0.0), 1.0),
149
- }
150
- payload["metadata"] = metadata
151
  return EngineerManagerObservation.model_validate(payload)
 
3
  from __future__ import annotations
4
 
5
  from uuid import uuid4
 
6
 
7
  from openenv.core.env_server.interfaces import Environment, EnvironmentMetadata
8
  from openenv.core.env_server.types import State
9
 
 
10
  from focus_resource_env import FocusResourceEnv
11
 
12
  try:
 
28
  end_hour: str = "17:00",
29
  distraction_risk: float = 0.15,
30
  seed: int | None = 7,
 
31
  ) -> None:
32
  super().__init__()
33
  self._start_hour = start_hour
34
  self._end_hour = end_hour
35
  self._distraction_risk = distraction_risk
36
  self._seed = seed
 
 
37
  self._step_count = 0
38
  self._episode_id = str(uuid4())
 
39
  self._env = FocusResourceEnv(
40
  start_hour=start_hour,
41
  end_hour=end_hour,
 
47
  self,
48
  seed: int | None = None,
49
  episode_id: str | None = None,
 
 
50
  **_: object,
51
  ) -> EngineerManagerObservation:
52
  self._seed = self._seed if seed is None else seed
 
 
 
 
 
 
 
53
  self._episode_id = episode_id or str(uuid4())
54
  self._step_count = 0
 
55
  self._env = FocusResourceEnv(
56
  start_hour=self._start_hour,
57
  end_hour=self._end_hour,
58
  distraction_risk=self._distraction_risk,
59
  seed=self._seed,
60
  )
61
+ return self._to_observation(self._env.reset(), reward=0.0, done=False)
 
 
62
 
63
  def step(
64
  self,
 
71
  (action.target_slot, action.operation)
72
  )
73
  self._step_count += 1
 
 
 
 
 
 
 
 
 
74
  return self._to_observation(observation, reward=reward, done=done, info=info)
75
 
76
  @property
 
87
  name="Engineer Manager",
88
  description=(
89
  "Manage a workday by scheduling deep work, rescheduling meetings, "
90
+ "and controlling communication noise."
 
91
  ),
92
  version="0.1.0",
93
  )
 
103
  payload = dict(observation)
104
  payload["reward"] = reward
105
  payload["done"] = done
106
+ payload["metadata"] = info or {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  return EngineerManagerObservation.model_validate(payload)
tasks.py DELETED
@@ -1,71 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from benchmark_tasks import TASK_SPECS
4
-
5
-
6
- TASKS = [
7
- {
8
- "id": "engineer_manager_task_0",
9
- "task_id": "quiet-morning",
10
- "name": "quiet-morning",
11
- "difficulty": "easy",
12
- "description": TASK_SPECS["quiet-morning"].description,
13
- "max_steps": 32,
14
- "reset_params": {"task_id": 0},
15
- "action_schema": {
16
- "target_slot": "integer slot index within the workday",
17
- "operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
18
- "task_id": "quiet-morning",
19
- },
20
- "grader": "graders:grade_task_0",
21
- "graders": ["graders:grade_task_0"],
22
- "reward_range": [0.0, 1.0],
23
- },
24
- {
25
- "id": "engineer_manager_task_1",
26
- "task_id": "meeting-surgery",
27
- "name": "meeting-surgery",
28
- "difficulty": "medium",
29
- "description": TASK_SPECS["meeting-surgery"].description,
30
- "max_steps": 32,
31
- "reset_params": {"task_id": 1},
32
- "action_schema": {
33
- "target_slot": "integer slot index within the workday",
34
- "operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
35
- "task_id": "meeting-surgery",
36
- },
37
- "grader": "graders:grade_task_1",
38
- "graders": ["graders:grade_task_1"],
39
- "reward_range": [0.0, 1.0],
40
- },
41
- {
42
- "id": "engineer_manager_task_2",
43
- "task_id": "delivery-triage",
44
- "name": "delivery-triage",
45
- "difficulty": "hard",
46
- "description": TASK_SPECS["delivery-triage"].description,
47
- "max_steps": 32,
48
- "reset_params": {"task_id": 2},
49
- "action_schema": {
50
- "target_slot": "integer slot index within the workday",
51
- "operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
52
- "task_id": "delivery-triage",
53
- },
54
- "grader": "graders:grade_task_2",
55
- "graders": ["graders:grade_task_2"],
56
- "reward_range": [0.0, 1.0],
57
- },
58
- ]
59
-
60
-
61
- TASK_ID_TO_INDEX = {task["task_id"]: index for index, task in enumerate(TASKS)}
62
-
63
-
64
- TASK_GRADER_PAIRS = [
65
- ("engineer_manager_task_0", "graders:grade_task_0"),
66
- ("engineer_manager_task_1", "graders:grade_task_1"),
67
- ("engineer_manager_task_2", "graders:grade_task_2"),
68
- ]
69
-
70
-
71
- __all__ = ["TASKS", "TASK_ID_TO_INDEX", "TASK_GRADER_PAIRS"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
validate-submission.sh CHANGED
@@ -19,6 +19,13 @@ else
19
  RED='' GREEN='' YELLOW='' BLUE='' BOLD='' NC=''
20
  fi
21
 
 
 
 
 
 
 
 
22
  log() { printf "[%s] %b\n" "$(date +%H:%M:%S)" "$*"; }
23
  pass() { log "${GREEN}${BOLD}PASS${NC} -- $1"; }
24
  fail() { log "${RED}${BOLD}FAIL${NC} -- $1"; }
@@ -84,7 +91,7 @@ fi
84
  pass "Environment is ready."
85
 
86
  log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
87
- HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST \
88
  -H "Content-Type: application/json" -d '{}' \
89
  "$PING_URL/reset" --max-time 20 || echo "000")
90
 
 
19
  RED='' GREEN='' YELLOW='' BLUE='' BOLD='' NC=''
20
  fi
21
 
22
+ CURL_ARGS=(-s)
23
+ case "$(uname -s 2>/dev/null || printf unknown)" in
24
+ MINGW*|MSYS*|CYGWIN*)
25
+ CURL_ARGS+=(--ssl-no-revoke)
26
+ ;;
27
+ esac
28
+
29
  log() { printf "[%s] %b\n" "$(date +%H:%M:%S)" "$*"; }
30
  pass() { log "${GREEN}${BOLD}PASS${NC} -- $1"; }
31
  fail() { log "${RED}${BOLD}FAIL${NC} -- $1"; }
 
91
  pass "Environment is ready."
92
 
93
  log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
94
+ HTTP_CODE=$(curl "${CURL_ARGS[@]}" -o /dev/null -w "%{http_code}" -X POST \
95
  -H "Content-Type: application/json" -d '{}' \
96
  "$PING_URL/reset" --max-time 20 || echo "000")
97