Spaces:
Sleeping
Sleeping
local-update
#1
by arnavm7 - opened
- README.md +0 -10
- benchmark_tasks.py +0 -189
- engineer-manager +0 -1
- graders.py +0 -68
- inference.py +60 -110
- openenv.yaml +0 -79
- server/app.py +0 -48
- server/engineer_manager_environment.py +3 -47
- tasks.py +0 -71
- validate-submission.sh +8 -1
README.md
CHANGED
|
@@ -43,13 +43,3 @@ openenv validate http://127.0.0.1:8000
|
|
| 43 |
- `task_buffer`: pending tasks with estimated duration and hidden complexity
|
| 44 |
- `flow_score`, `social_debt`, `calendar_churn`: core scoring metrics
|
| 45 |
- `current_slot`, `current_time`, `recovery_state`, `mute_comms`: live execution state
|
| 46 |
-
|
| 47 |
-
## Built-in benchmark tasks
|
| 48 |
-
|
| 49 |
-
Set `TASK_NAME` to select a deterministic scenario before reset. Available tasks:
|
| 50 |
-
|
| 51 |
-
- `quiet-morning`: high-noise start where muting comms early and protecting focus is rewarded
|
| 52 |
-
- `meeting-surgery`: fragmented calendar where selective meeting moves should improve flow
|
| 53 |
-
- `delivery-triage`: constrained delivery day with hidden task complexity and tighter tradeoffs
|
| 54 |
-
|
| 55 |
-
Each task has a grader in [benchmark_tasks.py](/C:/Users/arshi/OneDrive/Desktop/idk/engineer-manager/benchmark_tasks.py:1). The environment also exposes task metadata and the current grader score in `observation.metadata.episode_metrics.grader_score`.
|
|
|
|
| 43 |
- `task_buffer`: pending tasks with estimated duration and hidden complexity
|
| 44 |
- `flow_score`, `social_debt`, `calendar_churn`: core scoring metrics
|
| 45 |
- `current_slot`, `current_time`, `recovery_state`, `mute_comms`: live execution state
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmark_tasks.py
DELETED
|
@@ -1,189 +0,0 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
import os
|
| 4 |
-
from dataclasses import dataclass
|
| 5 |
-
from typing import Any, Callable
|
| 6 |
-
|
| 7 |
-
from focus_resource_env import DEEP_WORK, EMPTY, MEETING, FocusResourceEnv, Task
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
StepRecord = dict[str, Any]
|
| 11 |
-
TaskSetup = Callable[[FocusResourceEnv], None]
|
| 12 |
-
TaskGrader = Callable[[list[StepRecord]], float]
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def _reset_state(env: FocusResourceEnv) -> None:
|
| 16 |
-
env.timeline[:] = EMPTY
|
| 17 |
-
env.meeting_meta = {}
|
| 18 |
-
env.task_buffer = []
|
| 19 |
-
env.current_slot = 0
|
| 20 |
-
env.current_work_streak_slots = 0
|
| 21 |
-
env.recovery_remaining = 0
|
| 22 |
-
env.mute_comms = False
|
| 23 |
-
env.social_debt = 0.0
|
| 24 |
-
env.calendar_churn = 0
|
| 25 |
-
env.flow_score = 0.0
|
| 26 |
-
env.last_executed_kind = EMPTY
|
| 27 |
-
env.interruptions = 0
|
| 28 |
-
env.invalid_actions = 0
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
def _set_meeting(
|
| 32 |
-
env: FocusResourceEnv,
|
| 33 |
-
*,
|
| 34 |
-
start: int,
|
| 35 |
-
length: int,
|
| 36 |
-
priority: int,
|
| 37 |
-
meeting_id: int,
|
| 38 |
-
) -> None:
|
| 39 |
-
env._place_meeting(start, length, priority, meeting_id)
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
def _normalized_total_score(env: FocusResourceEnv) -> float:
|
| 43 |
-
max_score = max(1.0, (env.timeline_length * 0.5) ** 2)
|
| 44 |
-
return min(1.0, max(0.0, env._total_score() / max_score))
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
def setup_quiet_morning(env: FocusResourceEnv) -> None:
|
| 48 |
-
_reset_state(env)
|
| 49 |
-
env.distraction_risk = 0.65
|
| 50 |
-
env.task_buffer = [
|
| 51 |
-
Task(duration=2, hidden_complexity=1.0),
|
| 52 |
-
Task(duration=3, hidden_complexity=1.0),
|
| 53 |
-
Task(duration=2, hidden_complexity=1.25),
|
| 54 |
-
]
|
| 55 |
-
_set_meeting(env, start=5, length=1, priority=4, meeting_id=1)
|
| 56 |
-
_set_meeting(env, start=7, length=1, priority=3, meeting_id=2)
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
def setup_meeting_surgery(env: FocusResourceEnv) -> None:
|
| 60 |
-
_reset_state(env)
|
| 61 |
-
env.distraction_risk = 0.10
|
| 62 |
-
env.task_buffer = [
|
| 63 |
-
Task(duration=2, hidden_complexity=1.0),
|
| 64 |
-
Task(duration=2, hidden_complexity=1.25),
|
| 65 |
-
Task(duration=1, hidden_complexity=1.0),
|
| 66 |
-
]
|
| 67 |
-
_set_meeting(env, start=1, length=1, priority=2, meeting_id=1)
|
| 68 |
-
_set_meeting(env, start=3, length=1, priority=2, meeting_id=2)
|
| 69 |
-
_set_meeting(env, start=6, length=2, priority=8, meeting_id=3)
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
def setup_delivery_triage(env: FocusResourceEnv) -> None:
|
| 73 |
-
_reset_state(env)
|
| 74 |
-
env.distraction_risk = 0.25
|
| 75 |
-
env.task_buffer = [
|
| 76 |
-
Task(duration=3, hidden_complexity=1.5),
|
| 77 |
-
Task(duration=2, hidden_complexity=1.0),
|
| 78 |
-
Task(duration=1, hidden_complexity=1.0),
|
| 79 |
-
]
|
| 80 |
-
_set_meeting(env, start=4, length=1, priority=9, meeting_id=1)
|
| 81 |
-
_set_meeting(env, start=8, length=2, priority=7, meeting_id=2)
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
def grade_quiet_morning(trajectory: list[StepRecord]) -> float:
|
| 85 |
-
if not trajectory:
|
| 86 |
-
return 0.0
|
| 87 |
-
first_action = int(trajectory[0]["action"]["operation"])
|
| 88 |
-
final = trajectory[-1]["observation"]
|
| 89 |
-
final_score = float(final["flow_score"])
|
| 90 |
-
transition_count = sum(1 for step in trajectory if step["info"]["transition_info"]["interrupted"])
|
| 91 |
-
scheduled = sum(1 for slot in final["timeline"] if int(slot) == DEEP_WORK)
|
| 92 |
-
|
| 93 |
-
score = 0.0
|
| 94 |
-
score += 0.25 if first_action == 3 else 0.0
|
| 95 |
-
score += min(0.45, final_score / 6.0)
|
| 96 |
-
score += 0.15 if transition_count == 0 else 0.0
|
| 97 |
-
score += min(0.15, scheduled / 6.0)
|
| 98 |
-
return min(1.0, round(score, 4))
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
def grade_meeting_surgery(trajectory: list[StepRecord]) -> float:
|
| 102 |
-
if not trajectory:
|
| 103 |
-
return 0.0
|
| 104 |
-
final = trajectory[-1]["observation"]
|
| 105 |
-
flow = float(final["flow_score"])
|
| 106 |
-
debt = float(final["social_debt"])
|
| 107 |
-
churn = int(final["calendar_churn"])
|
| 108 |
-
reschedules = sum(
|
| 109 |
-
1
|
| 110 |
-
for step in trajectory
|
| 111 |
-
if step["info"].get("action_info", {}).get("status") == "meeting_rescheduled"
|
| 112 |
-
)
|
| 113 |
-
|
| 114 |
-
score = 0.0
|
| 115 |
-
score += min(0.40, flow / 5.0)
|
| 116 |
-
score += 0.20 if reschedules >= 1 else 0.0
|
| 117 |
-
score += 0.20 if 1 <= churn <= 2 else max(0.0, 0.20 - (0.10 * abs(churn - 1)))
|
| 118 |
-
score += max(0.0, 0.20 - (debt / 8.0))
|
| 119 |
-
return min(1.0, round(score, 4))
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
def grade_delivery_triage(trajectory: list[StepRecord]) -> float:
|
| 123 |
-
if not trajectory:
|
| 124 |
-
return 0.0
|
| 125 |
-
final = trajectory[-1]["observation"]
|
| 126 |
-
total = float(final["flow_score"]) - float(final["social_debt"]) - float(final["calendar_churn"])
|
| 127 |
-
invalid_actions = sum(
|
| 128 |
-
1
|
| 129 |
-
for step in trajectory
|
| 130 |
-
if str(step["info"].get("action_info", {}).get("status", "")).startswith("invalid")
|
| 131 |
-
)
|
| 132 |
-
remaining_tasks = len(final["task_buffer"])
|
| 133 |
-
scheduled = sum(1 for slot in final["timeline"] if int(slot) == DEEP_WORK)
|
| 134 |
-
|
| 135 |
-
score = 0.0
|
| 136 |
-
score += min(0.45, max(0.0, total) / 6.0)
|
| 137 |
-
score += min(0.25, scheduled / 8.0)
|
| 138 |
-
score += 0.20 if remaining_tasks <= 1 else 0.10 if remaining_tasks == 2 else 0.0
|
| 139 |
-
score += max(0.0, 0.10 - (0.05 * invalid_actions))
|
| 140 |
-
return min(1.0, round(score, 4))
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
@dataclass(frozen=True)
|
| 144 |
-
class TaskSpec:
|
| 145 |
-
name: str
|
| 146 |
-
description: str
|
| 147 |
-
setup: TaskSetup
|
| 148 |
-
grader: TaskGrader
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
TASK_SPECS: dict[str, TaskSpec] = {
|
| 152 |
-
"quiet-morning": TaskSpec(
|
| 153 |
-
name="quiet-morning",
|
| 154 |
-
description="High-noise morning where the agent should mute comms early and protect an uninterrupted work block.",
|
| 155 |
-
setup=setup_quiet_morning,
|
| 156 |
-
grader=grade_quiet_morning,
|
| 157 |
-
),
|
| 158 |
-
"meeting-surgery": TaskSpec(
|
| 159 |
-
name="meeting-surgery",
|
| 160 |
-
description="A fragmented calendar where the agent should improve flow with limited, selective meeting moves.",
|
| 161 |
-
setup=setup_meeting_surgery,
|
| 162 |
-
grader=grade_meeting_surgery,
|
| 163 |
-
),
|
| 164 |
-
"delivery-triage": TaskSpec(
|
| 165 |
-
name="delivery-triage",
|
| 166 |
-
description="A constrained day with hidden task complexity where the agent must schedule useful work without spiraling debt.",
|
| 167 |
-
setup=setup_delivery_triage,
|
| 168 |
-
grader=grade_delivery_triage,
|
| 169 |
-
),
|
| 170 |
-
}
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
DEFAULT_TASK_NAME = "quiet-morning"
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
def get_task_spec(task_name: str | None) -> TaskSpec:
|
| 177 |
-
normalized = (task_name or os.getenv("TASK_NAME") or DEFAULT_TASK_NAME).strip()
|
| 178 |
-
return TASK_SPECS.get(normalized, TASK_SPECS[DEFAULT_TASK_NAME])
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
def apply_task(env: FocusResourceEnv, task_name: str | None) -> TaskSpec:
|
| 182 |
-
spec = get_task_spec(task_name)
|
| 183 |
-
spec.setup(env)
|
| 184 |
-
return spec
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
def grade_trajectory(task_name: str, trajectory: list[StepRecord]) -> float:
|
| 188 |
-
spec = get_task_spec(task_name)
|
| 189 |
-
return spec.grader(trajectory)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
engineer-manager
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
Subproject commit 40c10a7795c79e546608c011d3aff8820e1e479c
|
|
|
|
|
|
graders.py
DELETED
|
@@ -1,68 +0,0 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
from typing import Any
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
TASK_NAMES = {
|
| 7 |
-
0: "quiet-morning",
|
| 8 |
-
1: "meeting-surgery",
|
| 9 |
-
2: "delivery-triage",
|
| 10 |
-
}
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
def _normalize_reward(reward: float) -> float:
|
| 14 |
-
return min(max(float(reward), 0.0), 1.0)
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
def _state_task_id(state: Any) -> int | None:
|
| 18 |
-
if not isinstance(state, dict):
|
| 19 |
-
return None
|
| 20 |
-
task_id = state.get("task_id")
|
| 21 |
-
if isinstance(task_id, int):
|
| 22 |
-
return task_id
|
| 23 |
-
task_name = state.get("task_name")
|
| 24 |
-
if isinstance(task_name, str):
|
| 25 |
-
for index, name in TASK_NAMES.items():
|
| 26 |
-
if name == task_name:
|
| 27 |
-
return index
|
| 28 |
-
metadata = state.get("metadata")
|
| 29 |
-
if isinstance(metadata, dict):
|
| 30 |
-
nested_task_id = metadata.get("task_id")
|
| 31 |
-
if isinstance(nested_task_id, int):
|
| 32 |
-
return nested_task_id
|
| 33 |
-
return None
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
def grade_task_0(state: dict, reward: float) -> float:
|
| 37 |
-
return _normalize_reward(reward if _state_task_id(state) == 0 else 0.0)
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
def grade_task_1(state: dict, reward: float) -> float:
|
| 41 |
-
return _normalize_reward(reward if _state_task_id(state) == 1 else 0.0)
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
def grade_task_2(state: dict, reward: float) -> float:
|
| 45 |
-
return _normalize_reward(reward if _state_task_id(state) == 2 else 0.0)
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
GRADERS = {
|
| 49 |
-
"engineer_manager_task_0": grade_task_0,
|
| 50 |
-
"engineer_manager_task_1": grade_task_1,
|
| 51 |
-
"engineer_manager_task_2": grade_task_2,
|
| 52 |
-
}
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
TASK_GRADER_PAIRS = [
|
| 56 |
-
("engineer_manager_task_0", grade_task_0),
|
| 57 |
-
("engineer_manager_task_1", grade_task_1),
|
| 58 |
-
("engineer_manager_task_2", grade_task_2),
|
| 59 |
-
]
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
__all__ = [
|
| 63 |
-
"grade_task_0",
|
| 64 |
-
"grade_task_1",
|
| 65 |
-
"grade_task_2",
|
| 66 |
-
"GRADERS",
|
| 67 |
-
"TASK_GRADER_PAIRS",
|
| 68 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
inference.py
CHANGED
|
@@ -3,21 +3,15 @@ import json
|
|
| 3 |
import math
|
| 4 |
import os
|
| 5 |
import textwrap
|
| 6 |
-
from dataclasses import dataclass
|
| 7 |
from typing import Any
|
| 8 |
|
| 9 |
from openai import OpenAI
|
| 10 |
from openenv.core.generic_client import GenericEnvClient
|
| 11 |
|
| 12 |
-
try:
|
| 13 |
-
from server.engineer_manager_environment import EngineerManagerEnvironment
|
| 14 |
-
except ImportError:
|
| 15 |
-
EngineerManagerEnvironment = None # type: ignore[assignment]
|
| 16 |
-
|
| 17 |
|
| 18 |
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 19 |
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
|
| 20 |
-
|
| 21 |
LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
|
| 22 |
OPENENV_BASE_URL = os.getenv("OPENENV_BASE_URL")
|
| 23 |
TASK_NAME = os.getenv("TASK_NAME", "engineer-manager")
|
|
@@ -52,52 +46,15 @@ SYSTEM_PROMPT = textwrap.dedent(
|
|
| 52 |
).strip()
|
| 53 |
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
done: bool
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
class _InProcessEnvClient:
|
| 63 |
-
def __init__(self) -> None:
|
| 64 |
-
if EngineerManagerEnvironment is None:
|
| 65 |
-
raise RuntimeError("Bundled EngineerManagerEnvironment is unavailable")
|
| 66 |
-
self._env = EngineerManagerEnvironment()
|
| 67 |
-
|
| 68 |
-
async def connect(self) -> None:
|
| 69 |
-
return None
|
| 70 |
-
|
| 71 |
-
async def reset(self) -> _EnvResult:
|
| 72 |
-
observation = self._env.reset().model_dump()
|
| 73 |
-
return _EnvResult(
|
| 74 |
-
observation=dict(observation),
|
| 75 |
-
reward=float(observation.get("reward") or 0.0),
|
| 76 |
-
done=bool(observation.get("done")),
|
| 77 |
-
)
|
| 78 |
-
|
| 79 |
-
async def step(self, action: dict[str, int]) -> _EnvResult:
|
| 80 |
-
observation = self._env.step(type("Action", (), action)()).model_dump()
|
| 81 |
-
return _EnvResult(
|
| 82 |
-
observation=dict(observation),
|
| 83 |
-
reward=float(observation.get("reward") or 0.0),
|
| 84 |
-
done=bool(observation.get("done")),
|
| 85 |
-
)
|
| 86 |
-
|
| 87 |
-
async def close(self) -> None:
|
| 88 |
-
return None
|
| 89 |
|
| 90 |
|
| 91 |
def _sanitize_field(value: Any) -> str:
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
def _format_error(error: str | None) -> str:
|
| 96 |
-
return "null" if error in (None, "") else _sanitize_field(error)
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
def _action_to_text(action: dict[str, int]) -> str:
|
| 100 |
-
return f'{{"target_slot":{int(action["target_slot"])},"operation":{int(action["operation"])}}}'
|
| 101 |
|
| 102 |
|
| 103 |
def log_start(task: str, env: str, model: str) -> None:
|
|
@@ -107,10 +64,17 @@ def log_start(task: str, env: str, model: str) -> None:
|
|
| 107 |
)
|
| 108 |
|
| 109 |
|
| 110 |
-
def log_step(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
print(
|
| 112 |
f"[STEP] step={step} action={_sanitize_field(action)} reward={reward:.2f} "
|
| 113 |
-
f"done={str(done).lower()} error={
|
| 114 |
flush=True,
|
| 115 |
)
|
| 116 |
|
|
@@ -127,12 +91,15 @@ def estimate_max_flow_score(timeline: list[int]) -> float:
|
|
| 127 |
slot_count = len(timeline)
|
| 128 |
if slot_count <= 0:
|
| 129 |
return 1.0
|
| 130 |
-
|
|
|
|
| 131 |
|
| 132 |
|
| 133 |
def normalize_score(total_reward: float, observation: dict[str, Any]) -> float:
|
| 134 |
-
|
| 135 |
-
|
|
|
|
|
|
|
| 136 |
|
| 137 |
|
| 138 |
def first_future_slot(observation: dict[str, Any], kind: int) -> int | None:
|
|
@@ -144,12 +111,17 @@ def first_future_slot(observation: dict[str, Any], kind: int) -> int | None:
|
|
| 144 |
return None
|
| 145 |
|
| 146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
def build_user_prompt(
|
| 148 |
step: int,
|
| 149 |
observation: dict[str, Any],
|
| 150 |
rewards: list[float],
|
| 151 |
history: list[str],
|
| 152 |
) -> str:
|
|
|
|
| 153 |
metadata = observation.get("metadata") or {}
|
| 154 |
return textwrap.dedent(
|
| 155 |
f"""
|
|
@@ -162,10 +134,10 @@ def build_user_prompt(
|
|
| 162 |
social_debt={float(observation.get("social_debt", 0.0)):.2f}
|
| 163 |
calendar_churn={int(observation.get("calendar_churn", 0))}
|
| 164 |
recovery_state={int(observation.get("recovery_state", 0))}
|
| 165 |
-
timeline={
|
| 166 |
task_buffer={json.dumps(observation.get("task_buffer", []), separators=(",", ":"))}
|
| 167 |
last_rewards={",".join(f"{reward:.2f}" for reward in rewards[-5:]) or "none"}
|
| 168 |
-
recent_history={json.dumps(history[-5:]
|
| 169 |
last_metadata={json.dumps(metadata, separators=(",", ":"))}
|
| 170 |
Choose the single next action.
|
| 171 |
"""
|
|
@@ -176,24 +148,15 @@ def choose_fallback_action(observation: dict[str, Any]) -> dict[str, int]:
|
|
| 176 |
current_slot = int(observation.get("current_slot", 0))
|
| 177 |
distraction_risk = float(observation.get("distraction_risk", 0.0))
|
| 178 |
mute_comms = bool(observation.get("mute_comms", False))
|
| 179 |
-
|
| 180 |
-
timeline = observation.get("timeline") or []
|
| 181 |
-
|
| 182 |
-
if current_slot == 0 and distraction_risk > 0.0 and not mute_comms:
|
| 183 |
return {"target_slot": current_slot, "operation": 3}
|
| 184 |
|
| 185 |
-
|
| 186 |
-
return {"target_slot": current_slot, "operation": 0}
|
| 187 |
-
|
| 188 |
-
if current_slot < len(timeline) and int(timeline[current_slot]) == 0 and observation.get("task_buffer"):
|
| 189 |
-
return {"target_slot": current_slot, "operation": 1}
|
| 190 |
-
|
| 191 |
-
empty_slot = first_future_slot(observation, 0)
|
| 192 |
if empty_slot is not None and observation.get("task_buffer"):
|
| 193 |
return {"target_slot": empty_slot, "operation": 1}
|
| 194 |
|
| 195 |
meeting_slot = first_future_slot(observation, 2)
|
| 196 |
-
if meeting_slot is not None:
|
| 197 |
return {"target_slot": meeting_slot, "operation": 2}
|
| 198 |
|
| 199 |
return {"target_slot": current_slot, "operation": 0}
|
|
@@ -201,8 +164,8 @@ def choose_fallback_action(observation: dict[str, Any]) -> dict[str, int]:
|
|
| 201 |
|
| 202 |
def coerce_action(raw_text: str, observation: dict[str, Any]) -> dict[str, int]:
|
| 203 |
timeline = observation.get("timeline") or []
|
| 204 |
-
fallback = choose_fallback_action(observation)
|
| 205 |
max_slot = max(0, len(timeline) - 1)
|
|
|
|
| 206 |
try:
|
| 207 |
data = json.loads(raw_text)
|
| 208 |
target_slot = int(data["target_slot"])
|
|
@@ -212,7 +175,8 @@ def coerce_action(raw_text: str, observation: dict[str, Any]) -> dict[str, int]:
|
|
| 212 |
|
| 213 |
if operation not in {0, 1, 2, 3}:
|
| 214 |
return fallback
|
| 215 |
-
|
|
|
|
| 216 |
|
| 217 |
|
| 218 |
def get_model_action(
|
|
@@ -239,81 +203,67 @@ def get_model_action(
|
|
| 239 |
return choose_fallback_action(observation)
|
| 240 |
|
| 241 |
|
| 242 |
-
async def create_env() ->
|
| 243 |
if OPENENV_BASE_URL:
|
| 244 |
env = GenericEnvClient(base_url=OPENENV_BASE_URL)
|
| 245 |
await env.connect()
|
| 246 |
return env
|
| 247 |
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
return await GenericEnvClient.from_docker_image(LOCAL_IMAGE_NAME)
|
| 251 |
-
except Exception:
|
| 252 |
-
pass
|
| 253 |
-
|
| 254 |
-
env = _InProcessEnvClient()
|
| 255 |
-
await env.connect()
|
| 256 |
-
return env
|
| 257 |
|
| 258 |
|
| 259 |
async def main() -> None:
|
|
|
|
|
|
|
| 260 |
env = None
|
| 261 |
rewards: list[float] = []
|
| 262 |
history: list[str] = []
|
| 263 |
steps_taken = 0
|
| 264 |
success = False
|
| 265 |
score = 0.0
|
| 266 |
-
observation: dict[str, Any] = {}
|
| 267 |
-
completed = False
|
| 268 |
|
| 269 |
log_start(TASK_NAME, BENCHMARK, MODEL_NAME)
|
| 270 |
|
| 271 |
try:
|
| 272 |
-
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY) if API_KEY else None
|
| 273 |
env = await create_env()
|
| 274 |
result = await env.reset()
|
| 275 |
observation = dict(result.observation)
|
| 276 |
|
| 277 |
for step in range(1, MAX_STEPS + 1):
|
| 278 |
if result.done:
|
| 279 |
-
completed = True
|
| 280 |
break
|
| 281 |
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
action = get_model_action(client, step, observation, rewards, history)
|
| 286 |
-
action_text = _action_to_text(action)
|
| 287 |
-
step_error: str | None = None
|
| 288 |
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
done = bool(result.done)
|
| 294 |
-
metadata = observation.get("metadata") or {}
|
| 295 |
-
step_error = metadata.get("last_action_error")
|
| 296 |
-
except Exception as exc:
|
| 297 |
-
reward = 0.0
|
| 298 |
-
done = True
|
| 299 |
-
step_error = str(exc)
|
| 300 |
|
| 301 |
rewards.append(reward)
|
| 302 |
steps_taken = step
|
| 303 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
history.append(
|
| 305 |
-
f"step={step} action={action_text} reward={reward:.2f}
|
|
|
|
|
|
|
| 306 |
)
|
| 307 |
|
| 308 |
if done:
|
| 309 |
-
completed = True
|
| 310 |
break
|
| 311 |
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
success =
|
| 316 |
-
score = 0.0
|
| 317 |
finally:
|
| 318 |
if env is not None:
|
| 319 |
try:
|
|
|
|
| 3 |
import math
|
| 4 |
import os
|
| 5 |
import textwrap
|
|
|
|
| 6 |
from typing import Any
|
| 7 |
|
| 8 |
from openai import OpenAI
|
| 9 |
from openenv.core.generic_client import GenericEnvClient
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 13 |
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
|
| 14 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 15 |
LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
|
| 16 |
OPENENV_BASE_URL = os.getenv("OPENENV_BASE_URL")
|
| 17 |
TASK_NAME = os.getenv("TASK_NAME", "engineer-manager")
|
|
|
|
| 46 |
).strip()
|
| 47 |
|
| 48 |
|
| 49 |
+
def _require_env(name: str, value: str | None) -> str:
|
| 50 |
+
if value:
|
| 51 |
+
return value
|
| 52 |
+
raise RuntimeError(f"Missing required environment variable: {name}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
|
| 55 |
def _sanitize_field(value: Any) -> str:
|
| 56 |
+
text = str(value)
|
| 57 |
+
return text.replace("\r", " ").replace("\n", " ").strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
|
| 60 |
def log_start(task: str, env: str, model: str) -> None:
|
|
|
|
| 64 |
)
|
| 65 |
|
| 66 |
|
| 67 |
+
def log_step(
|
| 68 |
+
step: int,
|
| 69 |
+
action: str,
|
| 70 |
+
reward: float,
|
| 71 |
+
done: bool,
|
| 72 |
+
error: str | None,
|
| 73 |
+
) -> None:
|
| 74 |
+
error_text = "null" if error in (None, "") else _sanitize_field(error)
|
| 75 |
print(
|
| 76 |
f"[STEP] step={step} action={_sanitize_field(action)} reward={reward:.2f} "
|
| 77 |
+
f"done={str(done).lower()} error={error_text}",
|
| 78 |
flush=True,
|
| 79 |
)
|
| 80 |
|
|
|
|
| 91 |
slot_count = len(timeline)
|
| 92 |
if slot_count <= 0:
|
| 93 |
return 1.0
|
| 94 |
+
hours = slot_count * 0.5
|
| 95 |
+
return max(1.0, hours * hours)
|
| 96 |
|
| 97 |
|
| 98 |
def normalize_score(total_reward: float, observation: dict[str, Any]) -> float:
|
| 99 |
+
timeline = observation.get("timeline") or []
|
| 100 |
+
max_score = estimate_max_flow_score(timeline)
|
| 101 |
+
normalized = total_reward / max_score
|
| 102 |
+
return min(1.0, max(0.0, normalized))
|
| 103 |
|
| 104 |
|
| 105 |
def first_future_slot(observation: dict[str, Any], kind: int) -> int | None:
|
|
|
|
| 111 |
return None
|
| 112 |
|
| 113 |
|
| 114 |
+
def first_future_empty_slot(observation: dict[str, Any]) -> int | None:
|
| 115 |
+
return first_future_slot(observation, 0)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
def build_user_prompt(
|
| 119 |
step: int,
|
| 120 |
observation: dict[str, Any],
|
| 121 |
rewards: list[float],
|
| 122 |
history: list[str],
|
| 123 |
) -> str:
|
| 124 |
+
timeline = observation.get("timeline") or []
|
| 125 |
metadata = observation.get("metadata") or {}
|
| 126 |
return textwrap.dedent(
|
| 127 |
f"""
|
|
|
|
| 134 |
social_debt={float(observation.get("social_debt", 0.0)):.2f}
|
| 135 |
calendar_churn={int(observation.get("calendar_churn", 0))}
|
| 136 |
recovery_state={int(observation.get("recovery_state", 0))}
|
| 137 |
+
timeline={timeline}
|
| 138 |
task_buffer={json.dumps(observation.get("task_buffer", []), separators=(",", ":"))}
|
| 139 |
last_rewards={",".join(f"{reward:.2f}" for reward in rewards[-5:]) or "none"}
|
| 140 |
+
recent_history={json.dumps(history[-5:])}
|
| 141 |
last_metadata={json.dumps(metadata, separators=(",", ":"))}
|
| 142 |
Choose the single next action.
|
| 143 |
"""
|
|
|
|
| 148 |
current_slot = int(observation.get("current_slot", 0))
|
| 149 |
distraction_risk = float(observation.get("distraction_risk", 0.0))
|
| 150 |
mute_comms = bool(observation.get("mute_comms", False))
|
| 151 |
+
if distraction_risk >= 0.2 and not mute_comms:
|
|
|
|
|
|
|
|
|
|
| 152 |
return {"target_slot": current_slot, "operation": 3}
|
| 153 |
|
| 154 |
+
empty_slot = first_future_empty_slot(observation)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
if empty_slot is not None and observation.get("task_buffer"):
|
| 156 |
return {"target_slot": empty_slot, "operation": 1}
|
| 157 |
|
| 158 |
meeting_slot = first_future_slot(observation, 2)
|
| 159 |
+
if meeting_slot is not None and current_slot <= meeting_slot:
|
| 160 |
return {"target_slot": meeting_slot, "operation": 2}
|
| 161 |
|
| 162 |
return {"target_slot": current_slot, "operation": 0}
|
|
|
|
| 164 |
|
| 165 |
def coerce_action(raw_text: str, observation: dict[str, Any]) -> dict[str, int]:
|
| 166 |
timeline = observation.get("timeline") or []
|
|
|
|
| 167 |
max_slot = max(0, len(timeline) - 1)
|
| 168 |
+
fallback = choose_fallback_action(observation)
|
| 169 |
try:
|
| 170 |
data = json.loads(raw_text)
|
| 171 |
target_slot = int(data["target_slot"])
|
|
|
|
| 175 |
|
| 176 |
if operation not in {0, 1, 2, 3}:
|
| 177 |
return fallback
|
| 178 |
+
target_slot = min(max(target_slot, 0), max_slot)
|
| 179 |
+
return {"target_slot": target_slot, "operation": operation}
|
| 180 |
|
| 181 |
|
| 182 |
def get_model_action(
|
|
|
|
| 203 |
return choose_fallback_action(observation)
|
| 204 |
|
| 205 |
|
| 206 |
+
async def create_env() -> GenericEnvClient:
|
| 207 |
if OPENENV_BASE_URL:
|
| 208 |
env = GenericEnvClient(base_url=OPENENV_BASE_URL)
|
| 209 |
await env.connect()
|
| 210 |
return env
|
| 211 |
|
| 212 |
+
image_name = _require_env("LOCAL_IMAGE_NAME", LOCAL_IMAGE_NAME)
|
| 213 |
+
return await GenericEnvClient.from_docker_image(image_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
|
| 215 |
|
| 216 |
async def main() -> None:
|
| 217 |
+
api_key = _require_env("HF_TOKEN", HF_TOKEN)
|
| 218 |
+
client = OpenAI(base_url=API_BASE_URL, api_key=api_key)
|
| 219 |
env = None
|
| 220 |
rewards: list[float] = []
|
| 221 |
history: list[str] = []
|
| 222 |
steps_taken = 0
|
| 223 |
success = False
|
| 224 |
score = 0.0
|
|
|
|
|
|
|
| 225 |
|
| 226 |
log_start(TASK_NAME, BENCHMARK, MODEL_NAME)
|
| 227 |
|
| 228 |
try:
|
|
|
|
| 229 |
env = await create_env()
|
| 230 |
result = await env.reset()
|
| 231 |
observation = dict(result.observation)
|
| 232 |
|
| 233 |
for step in range(1, MAX_STEPS + 1):
|
| 234 |
if result.done:
|
|
|
|
| 235 |
break
|
| 236 |
|
| 237 |
+
action = get_model_action(client, step, observation, rewards, history)
|
| 238 |
+
result = await env.step(action)
|
| 239 |
+
observation = dict(result.observation)
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
+
reward = float(result.reward or 0.0)
|
| 242 |
+
done = bool(result.done)
|
| 243 |
+
metadata = observation.get("metadata") or {}
|
| 244 |
+
error = metadata.get("last_action_error")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
|
| 246 |
rewards.append(reward)
|
| 247 |
steps_taken = step
|
| 248 |
+
|
| 249 |
+
action_text = (
|
| 250 |
+
f"target_slot={int(action['target_slot'])},operation={int(action['operation'])}"
|
| 251 |
+
)
|
| 252 |
+
log_step(step, action_text, reward, done, error)
|
| 253 |
+
|
| 254 |
history.append(
|
| 255 |
+
f"step={step} action={action_text} reward={reward:.2f} "
|
| 256 |
+
f"flow={float(observation.get('flow_score', 0.0)):.2f} "
|
| 257 |
+
f"debt={float(observation.get('social_debt', 0.0)):.2f}"
|
| 258 |
)
|
| 259 |
|
| 260 |
if done:
|
|
|
|
| 261 |
break
|
| 262 |
|
| 263 |
+
total_reward = math.fsum(rewards)
|
| 264 |
+
score = normalize_score(total_reward, observation if "observation" in locals() else {})
|
| 265 |
+
score = round(score, 2)
|
| 266 |
+
success = score > 0.0
|
|
|
|
| 267 |
finally:
|
| 268 |
if env is not None:
|
| 269 |
try:
|
openenv.yaml
CHANGED
|
@@ -4,82 +4,3 @@ type: space
|
|
| 4 |
runtime: fastapi
|
| 5 |
app: server.app:app
|
| 6 |
port: 8000
|
| 7 |
-
version: 0.1.0
|
| 8 |
-
entry_point: server.engineer_manager_environment:EngineerManagerEnvironment
|
| 9 |
-
api:
|
| 10 |
-
base_url: /
|
| 11 |
-
endpoints:
|
| 12 |
-
reset:
|
| 13 |
-
method: POST
|
| 14 |
-
path: /reset
|
| 15 |
-
step:
|
| 16 |
-
method: POST
|
| 17 |
-
path: /step
|
| 18 |
-
state:
|
| 19 |
-
method: GET
|
| 20 |
-
path: /state
|
| 21 |
-
tasks:
|
| 22 |
-
method: GET
|
| 23 |
-
path: /tasks
|
| 24 |
-
grader:
|
| 25 |
-
method: POST
|
| 26 |
-
path: /grader
|
| 27 |
-
tasks:
|
| 28 |
-
- id: engineer_manager_task_0
|
| 29 |
-
task_id: quiet-morning
|
| 30 |
-
name: quiet-morning
|
| 31 |
-
difficulty: easy
|
| 32 |
-
description: High-noise morning where the agent should mute comms early and protect an uninterrupted work block.
|
| 33 |
-
max_steps: 32
|
| 34 |
-
reset_params:
|
| 35 |
-
task_id: 0
|
| 36 |
-
action_schema:
|
| 37 |
-
target_slot: integer slot index within the workday
|
| 38 |
-
operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
|
| 39 |
-
task_id: quiet-morning
|
| 40 |
-
grader: graders:grade_task_0
|
| 41 |
-
graders:
|
| 42 |
-
- graders:grade_task_0
|
| 43 |
-
reward_range:
|
| 44 |
-
- 0.0
|
| 45 |
-
- 1.0
|
| 46 |
-
- id: engineer_manager_task_1
|
| 47 |
-
task_id: meeting-surgery
|
| 48 |
-
name: meeting-surgery
|
| 49 |
-
difficulty: medium
|
| 50 |
-
description: Fragmented calendar where selective meeting moves should improve flow.
|
| 51 |
-
max_steps: 32
|
| 52 |
-
reset_params:
|
| 53 |
-
task_id: 1
|
| 54 |
-
action_schema:
|
| 55 |
-
target_slot: integer slot index within the workday
|
| 56 |
-
operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
|
| 57 |
-
task_id: meeting-surgery
|
| 58 |
-
grader: graders:grade_task_1
|
| 59 |
-
graders:
|
| 60 |
-
- graders:grade_task_1
|
| 61 |
-
reward_range:
|
| 62 |
-
- 0.0
|
| 63 |
-
- 1.0
|
| 64 |
-
- id: engineer_manager_task_2
|
| 65 |
-
task_id: delivery-triage
|
| 66 |
-
name: delivery-triage
|
| 67 |
-
difficulty: hard
|
| 68 |
-
description: Constrained delivery day with hidden task complexity and tighter tradeoffs.
|
| 69 |
-
max_steps: 32
|
| 70 |
-
reset_params:
|
| 71 |
-
task_id: 2
|
| 72 |
-
action_schema:
|
| 73 |
-
target_slot: integer slot index within the workday
|
| 74 |
-
operation: 0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms
|
| 75 |
-
task_id: delivery-triage
|
| 76 |
-
grader: graders:grade_task_2
|
| 77 |
-
graders:
|
| 78 |
-
- graders:grade_task_2
|
| 79 |
-
reward_range:
|
| 80 |
-
- 0.0
|
| 81 |
-
- 1.0
|
| 82 |
-
graders:
|
| 83 |
-
- graders:grade_task_0
|
| 84 |
-
- graders:grade_task_1
|
| 85 |
-
- graders:grade_task_2
|
|
|
|
| 4 |
runtime: fastapi
|
| 5 |
app: server.app:app
|
| 6 |
port: 8000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/app.py
CHANGED
|
@@ -7,10 +7,6 @@ from textwrap import dedent
|
|
| 7 |
import uvicorn
|
| 8 |
from fastapi.responses import HTMLResponse, JSONResponse, PlainTextResponse, RedirectResponse, Response
|
| 9 |
from openenv.core.env_server.http_server import create_fastapi_app
|
| 10 |
-
from pydantic import BaseModel
|
| 11 |
-
|
| 12 |
-
from graders import grade_task_0, grade_task_1, grade_task_2
|
| 13 |
-
from tasks import TASKS
|
| 14 |
|
| 15 |
try:
|
| 16 |
from ..models import EngineerManagerAction, EngineerManagerObservation
|
|
@@ -33,12 +29,6 @@ app = create_fastapi_app(
|
|
| 33 |
max_concurrent_envs=2,
|
| 34 |
)
|
| 35 |
|
| 36 |
-
|
| 37 |
-
class GraderRequest(BaseModel):
|
| 38 |
-
task_id: str
|
| 39 |
-
state: dict
|
| 40 |
-
reward: float
|
| 41 |
-
|
| 42 |
WEB_CSS = dedent(
|
| 43 |
"""\
|
| 44 |
:root {
|
|
@@ -446,8 +436,6 @@ def web_js() -> PlainTextResponse:
|
|
| 446 |
@app.get("/favicon.ico", include_in_schema=False)
|
| 447 |
def favicon() -> Response:
|
| 448 |
return Response(status_code=204)
|
| 449 |
-
|
| 450 |
-
|
| 451 |
@app.get("/manifest.json", include_in_schema=False)
|
| 452 |
def manifest() -> JSONResponse:
|
| 453 |
return JSONResponse(
|
|
@@ -463,42 +451,6 @@ def manifest() -> JSONResponse:
|
|
| 463 |
)
|
| 464 |
|
| 465 |
|
| 466 |
-
@app.get("/tasks", include_in_schema=False)
|
| 467 |
-
def tasks() -> list[dict]:
|
| 468 |
-
return TASKS
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
@app.post("/grader", include_in_schema=False)
|
| 472 |
-
def grader(request: GraderRequest) -> JSONResponse:
|
| 473 |
-
task_index_map = {
|
| 474 |
-
"quiet-morning": 0,
|
| 475 |
-
"engineer_manager_task_0": 0,
|
| 476 |
-
"meeting-surgery": 1,
|
| 477 |
-
"engineer_manager_task_1": 1,
|
| 478 |
-
"delivery-triage": 2,
|
| 479 |
-
"engineer_manager_task_2": 2,
|
| 480 |
-
}
|
| 481 |
-
grader_fn_map = {
|
| 482 |
-
"quiet-morning": grade_task_0,
|
| 483 |
-
"meeting-surgery": grade_task_1,
|
| 484 |
-
"delivery-triage": grade_task_2,
|
| 485 |
-
"engineer_manager_task_0": grade_task_0,
|
| 486 |
-
"engineer_manager_task_1": grade_task_1,
|
| 487 |
-
"engineer_manager_task_2": grade_task_2,
|
| 488 |
-
}
|
| 489 |
-
grader_fn = grader_fn_map.get(request.task_id)
|
| 490 |
-
if grader_fn is None:
|
| 491 |
-
return JSONResponse(
|
| 492 |
-
{"error": f"Unknown task_id: {request.task_id}", "score": 0.0, "passed": False},
|
| 493 |
-
status_code=400,
|
| 494 |
-
)
|
| 495 |
-
state = dict(request.state)
|
| 496 |
-
if "task_id" not in state or state["task_id"] is None:
|
| 497 |
-
state["task_id"] = task_index_map[request.task_id]
|
| 498 |
-
score = float(grader_fn(state, request.reward))
|
| 499 |
-
return JSONResponse({"task_id": request.task_id, "score": score, "passed": score >= 0.0, "reward": score})
|
| 500 |
-
|
| 501 |
-
|
| 502 |
def run(host: str = "0.0.0.0", port: int = 8000) -> None:
|
| 503 |
"""Run the OpenEnv HTTP server."""
|
| 504 |
uvicorn.run(app, host=host, port=port)
|
|
|
|
| 7 |
import uvicorn
|
| 8 |
from fastapi.responses import HTMLResponse, JSONResponse, PlainTextResponse, RedirectResponse, Response
|
| 9 |
from openenv.core.env_server.http_server import create_fastapi_app
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
try:
|
| 12 |
from ..models import EngineerManagerAction, EngineerManagerObservation
|
|
|
|
| 29 |
max_concurrent_envs=2,
|
| 30 |
)
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
WEB_CSS = dedent(
|
| 33 |
"""\
|
| 34 |
:root {
|
|
|
|
| 436 |
@app.get("/favicon.ico", include_in_schema=False)
|
| 437 |
def favicon() -> Response:
|
| 438 |
return Response(status_code=204)
|
|
|
|
|
|
|
| 439 |
@app.get("/manifest.json", include_in_schema=False)
|
| 440 |
def manifest() -> JSONResponse:
|
| 441 |
return JSONResponse(
|
|
|
|
| 451 |
)
|
| 452 |
|
| 453 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
def run(host: str = "0.0.0.0", port: int = 8000) -> None:
|
| 455 |
"""Run the OpenEnv HTTP server."""
|
| 456 |
uvicorn.run(app, host=host, port=port)
|
server/engineer_manager_environment.py
CHANGED
|
@@ -3,12 +3,10 @@
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
from uuid import uuid4
|
| 6 |
-
import os
|
| 7 |
|
| 8 |
from openenv.core.env_server.interfaces import Environment, EnvironmentMetadata
|
| 9 |
from openenv.core.env_server.types import State
|
| 10 |
|
| 11 |
-
from benchmark_tasks import TASK_SPECS, apply_task
|
| 12 |
from focus_resource_env import FocusResourceEnv
|
| 13 |
|
| 14 |
try:
|
|
@@ -30,18 +28,14 @@ class EngineerManagerEnvironment(
|
|
| 30 |
end_hour: str = "17:00",
|
| 31 |
distraction_risk: float = 0.15,
|
| 32 |
seed: int | None = 7,
|
| 33 |
-
task_name: str | None = None,
|
| 34 |
) -> None:
|
| 35 |
super().__init__()
|
| 36 |
self._start_hour = start_hour
|
| 37 |
self._end_hour = end_hour
|
| 38 |
self._distraction_risk = distraction_risk
|
| 39 |
self._seed = seed
|
| 40 |
-
self._task_name = task_name or os.getenv("TASK_NAME")
|
| 41 |
-
self._task_id = 0
|
| 42 |
self._step_count = 0
|
| 43 |
self._episode_id = str(uuid4())
|
| 44 |
-
self._trajectory: list[dict[str, object]] = []
|
| 45 |
self._env = FocusResourceEnv(
|
| 46 |
start_hour=start_hour,
|
| 47 |
end_hour=end_hour,
|
|
@@ -53,30 +47,18 @@ class EngineerManagerEnvironment(
|
|
| 53 |
self,
|
| 54 |
seed: int | None = None,
|
| 55 |
episode_id: str | None = None,
|
| 56 |
-
task_name: str | None = None,
|
| 57 |
-
task_id: int | None = None,
|
| 58 |
**_: object,
|
| 59 |
) -> EngineerManagerObservation:
|
| 60 |
self._seed = self._seed if seed is None else seed
|
| 61 |
-
task_names = ["quiet-morning", "meeting-surgery", "delivery-triage"]
|
| 62 |
-
if task_id is not None and 0 <= int(task_id) < len(task_names):
|
| 63 |
-
self._task_id = int(task_id)
|
| 64 |
-
self._task_name = task_names[self._task_id]
|
| 65 |
-
else:
|
| 66 |
-
self._task_name = task_name or self._task_name or os.getenv("TASK_NAME")
|
| 67 |
-
self._task_id = task_names.index(self._task_name) if self._task_name in task_names else 0
|
| 68 |
self._episode_id = episode_id or str(uuid4())
|
| 69 |
self._step_count = 0
|
| 70 |
-
self._trajectory = []
|
| 71 |
self._env = FocusResourceEnv(
|
| 72 |
start_hour=self._start_hour,
|
| 73 |
end_hour=self._end_hour,
|
| 74 |
distraction_risk=self._distraction_risk,
|
| 75 |
seed=self._seed,
|
| 76 |
)
|
| 77 |
-
self._env.reset()
|
| 78 |
-
apply_task(self._env, self._task_name)
|
| 79 |
-
return self._to_observation(self._env._observation(), reward=0.0, done=False)
|
| 80 |
|
| 81 |
def step(
|
| 82 |
self,
|
|
@@ -89,15 +71,6 @@ class EngineerManagerEnvironment(
|
|
| 89 |
(action.target_slot, action.operation)
|
| 90 |
)
|
| 91 |
self._step_count += 1
|
| 92 |
-
self._trajectory.append(
|
| 93 |
-
{
|
| 94 |
-
"action": {"target_slot": int(action.target_slot), "operation": int(action.operation)},
|
| 95 |
-
"observation": observation,
|
| 96 |
-
"reward": float(reward),
|
| 97 |
-
"done": bool(done),
|
| 98 |
-
"info": info,
|
| 99 |
-
}
|
| 100 |
-
)
|
| 101 |
return self._to_observation(observation, reward=reward, done=done, info=info)
|
| 102 |
|
| 103 |
@property
|
|
@@ -114,8 +87,7 @@ class EngineerManagerEnvironment(
|
|
| 114 |
name="Engineer Manager",
|
| 115 |
description=(
|
| 116 |
"Manage a workday by scheduling deep work, rescheduling meetings, "
|
| 117 |
-
"and controlling communication noise.
|
| 118 |
-
f"Available tasks: {', '.join(sorted(TASK_SPECS))}."
|
| 119 |
),
|
| 120 |
version="0.1.0",
|
| 121 |
)
|
|
@@ -131,21 +103,5 @@ class EngineerManagerEnvironment(
|
|
| 131 |
payload = dict(observation)
|
| 132 |
payload["reward"] = reward
|
| 133 |
payload["done"] = done
|
| 134 |
-
metadata =
|
| 135 |
-
metadata["task_name"] = self._task_name
|
| 136 |
-
metadata["task_id"] = self._task_id
|
| 137 |
-
metadata["episode_metrics"] = {
|
| 138 |
-
"interruptions": int(self._env.interruptions),
|
| 139 |
-
"invalid_actions": int(self._env.invalid_actions),
|
| 140 |
-
"remaining_tasks": len(self._env.task_buffer),
|
| 141 |
-
"scheduled_work_slots": sum(1 for slot in self._env.timeline if int(slot) == 1),
|
| 142 |
-
"successful_reschedules": sum(
|
| 143 |
-
1
|
| 144 |
-
for step in self._trajectory
|
| 145 |
-
if step["info"].get("action_info", {}).get("status") == "meeting_rescheduled"
|
| 146 |
-
),
|
| 147 |
-
"total_score": float(self._env._total_score()),
|
| 148 |
-
"grader_score": min(max(float(reward or 0.0), 0.0), 1.0),
|
| 149 |
-
}
|
| 150 |
-
payload["metadata"] = metadata
|
| 151 |
return EngineerManagerObservation.model_validate(payload)
|
|
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
from uuid import uuid4
|
|
|
|
| 6 |
|
| 7 |
from openenv.core.env_server.interfaces import Environment, EnvironmentMetadata
|
| 8 |
from openenv.core.env_server.types import State
|
| 9 |
|
|
|
|
| 10 |
from focus_resource_env import FocusResourceEnv
|
| 11 |
|
| 12 |
try:
|
|
|
|
| 28 |
end_hour: str = "17:00",
|
| 29 |
distraction_risk: float = 0.15,
|
| 30 |
seed: int | None = 7,
|
|
|
|
| 31 |
) -> None:
|
| 32 |
super().__init__()
|
| 33 |
self._start_hour = start_hour
|
| 34 |
self._end_hour = end_hour
|
| 35 |
self._distraction_risk = distraction_risk
|
| 36 |
self._seed = seed
|
|
|
|
|
|
|
| 37 |
self._step_count = 0
|
| 38 |
self._episode_id = str(uuid4())
|
|
|
|
| 39 |
self._env = FocusResourceEnv(
|
| 40 |
start_hour=start_hour,
|
| 41 |
end_hour=end_hour,
|
|
|
|
| 47 |
self,
|
| 48 |
seed: int | None = None,
|
| 49 |
episode_id: str | None = None,
|
|
|
|
|
|
|
| 50 |
**_: object,
|
| 51 |
) -> EngineerManagerObservation:
|
| 52 |
self._seed = self._seed if seed is None else seed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
self._episode_id = episode_id or str(uuid4())
|
| 54 |
self._step_count = 0
|
|
|
|
| 55 |
self._env = FocusResourceEnv(
|
| 56 |
start_hour=self._start_hour,
|
| 57 |
end_hour=self._end_hour,
|
| 58 |
distraction_risk=self._distraction_risk,
|
| 59 |
seed=self._seed,
|
| 60 |
)
|
| 61 |
+
return self._to_observation(self._env.reset(), reward=0.0, done=False)
|
|
|
|
|
|
|
| 62 |
|
| 63 |
def step(
|
| 64 |
self,
|
|
|
|
| 71 |
(action.target_slot, action.operation)
|
| 72 |
)
|
| 73 |
self._step_count += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
return self._to_observation(observation, reward=reward, done=done, info=info)
|
| 75 |
|
| 76 |
@property
|
|
|
|
| 87 |
name="Engineer Manager",
|
| 88 |
description=(
|
| 89 |
"Manage a workday by scheduling deep work, rescheduling meetings, "
|
| 90 |
+
"and controlling communication noise."
|
|
|
|
| 91 |
),
|
| 92 |
version="0.1.0",
|
| 93 |
)
|
|
|
|
| 103 |
payload = dict(observation)
|
| 104 |
payload["reward"] = reward
|
| 105 |
payload["done"] = done
|
| 106 |
+
payload["metadata"] = info or {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
return EngineerManagerObservation.model_validate(payload)
|
tasks.py
DELETED
|
@@ -1,71 +0,0 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
from benchmark_tasks import TASK_SPECS
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
TASKS = [
|
| 7 |
-
{
|
| 8 |
-
"id": "engineer_manager_task_0",
|
| 9 |
-
"task_id": "quiet-morning",
|
| 10 |
-
"name": "quiet-morning",
|
| 11 |
-
"difficulty": "easy",
|
| 12 |
-
"description": TASK_SPECS["quiet-morning"].description,
|
| 13 |
-
"max_steps": 32,
|
| 14 |
-
"reset_params": {"task_id": 0},
|
| 15 |
-
"action_schema": {
|
| 16 |
-
"target_slot": "integer slot index within the workday",
|
| 17 |
-
"operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
|
| 18 |
-
"task_id": "quiet-morning",
|
| 19 |
-
},
|
| 20 |
-
"grader": "graders:grade_task_0",
|
| 21 |
-
"graders": ["graders:grade_task_0"],
|
| 22 |
-
"reward_range": [0.0, 1.0],
|
| 23 |
-
},
|
| 24 |
-
{
|
| 25 |
-
"id": "engineer_manager_task_1",
|
| 26 |
-
"task_id": "meeting-surgery",
|
| 27 |
-
"name": "meeting-surgery",
|
| 28 |
-
"difficulty": "medium",
|
| 29 |
-
"description": TASK_SPECS["meeting-surgery"].description,
|
| 30 |
-
"max_steps": 32,
|
| 31 |
-
"reset_params": {"task_id": 1},
|
| 32 |
-
"action_schema": {
|
| 33 |
-
"target_slot": "integer slot index within the workday",
|
| 34 |
-
"operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
|
| 35 |
-
"task_id": "meeting-surgery",
|
| 36 |
-
},
|
| 37 |
-
"grader": "graders:grade_task_1",
|
| 38 |
-
"graders": ["graders:grade_task_1"],
|
| 39 |
-
"reward_range": [0.0, 1.0],
|
| 40 |
-
},
|
| 41 |
-
{
|
| 42 |
-
"id": "engineer_manager_task_2",
|
| 43 |
-
"task_id": "delivery-triage",
|
| 44 |
-
"name": "delivery-triage",
|
| 45 |
-
"difficulty": "hard",
|
| 46 |
-
"description": TASK_SPECS["delivery-triage"].description,
|
| 47 |
-
"max_steps": 32,
|
| 48 |
-
"reset_params": {"task_id": 2},
|
| 49 |
-
"action_schema": {
|
| 50 |
-
"target_slot": "integer slot index within the workday",
|
| 51 |
-
"operation": "0=idle, 1=schedule work, 2=reschedule meeting, 3=toggle mute comms",
|
| 52 |
-
"task_id": "delivery-triage",
|
| 53 |
-
},
|
| 54 |
-
"grader": "graders:grade_task_2",
|
| 55 |
-
"graders": ["graders:grade_task_2"],
|
| 56 |
-
"reward_range": [0.0, 1.0],
|
| 57 |
-
},
|
| 58 |
-
]
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
TASK_ID_TO_INDEX = {task["task_id"]: index for index, task in enumerate(TASKS)}
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
TASK_GRADER_PAIRS = [
|
| 65 |
-
("engineer_manager_task_0", "graders:grade_task_0"),
|
| 66 |
-
("engineer_manager_task_1", "graders:grade_task_1"),
|
| 67 |
-
("engineer_manager_task_2", "graders:grade_task_2"),
|
| 68 |
-
]
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
__all__ = ["TASKS", "TASK_ID_TO_INDEX", "TASK_GRADER_PAIRS"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
validate-submission.sh
CHANGED
|
@@ -19,6 +19,13 @@ else
|
|
| 19 |
RED='' GREEN='' YELLOW='' BLUE='' BOLD='' NC=''
|
| 20 |
fi
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
log() { printf "[%s] %b\n" "$(date +%H:%M:%S)" "$*"; }
|
| 23 |
pass() { log "${GREEN}${BOLD}PASS${NC} -- $1"; }
|
| 24 |
fail() { log "${RED}${BOLD}FAIL${NC} -- $1"; }
|
|
@@ -84,7 +91,7 @@ fi
|
|
| 84 |
pass "Environment is ready."
|
| 85 |
|
| 86 |
log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
|
| 87 |
-
HTTP_CODE=$(curl
|
| 88 |
-H "Content-Type: application/json" -d '{}' \
|
| 89 |
"$PING_URL/reset" --max-time 20 || echo "000")
|
| 90 |
|
|
|
|
| 19 |
RED='' GREEN='' YELLOW='' BLUE='' BOLD='' NC=''
|
| 20 |
fi
|
| 21 |
|
| 22 |
+
CURL_ARGS=(-s)
|
| 23 |
+
case "$(uname -s 2>/dev/null || printf unknown)" in
|
| 24 |
+
MINGW*|MSYS*|CYGWIN*)
|
| 25 |
+
CURL_ARGS+=(--ssl-no-revoke)
|
| 26 |
+
;;
|
| 27 |
+
esac
|
| 28 |
+
|
| 29 |
log() { printf "[%s] %b\n" "$(date +%H:%M:%S)" "$*"; }
|
| 30 |
pass() { log "${GREEN}${BOLD}PASS${NC} -- $1"; }
|
| 31 |
fail() { log "${RED}${BOLD}FAIL${NC} -- $1"; }
|
|
|
|
| 91 |
pass "Environment is ready."
|
| 92 |
|
| 93 |
log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
|
| 94 |
+
HTTP_CODE=$(curl "${CURL_ARGS[@]}" -o /dev/null -w "%{http_code}" -X POST \
|
| 95 |
-H "Content-Type: application/json" -d '{}' \
|
| 96 |
"$PING_URL/reset" --max-time 20 || echo "000")
|
| 97 |
|