Cyber-Machine committed on
Commit
9007754
·
verified ·
1 Parent(s): 2dfa6e3

feat: add benchmarks and improve inference module with Docker support

Browse files
Files changed (2) hide show
  1. README.md +14 -0
  2. inference.py +102 -51
README.md CHANGED
@@ -216,6 +216,20 @@ Typical downstream evaluation reads:
216
  - how many deadlines were missed
217
  - how much important work remained unfinished
218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  ## Local Development
220
 
221
  Validate the environment:
 
216
  - how many deadlines were missed
217
  - how much important work remained unfinished
218
 
219
+ ## Benchmarks
220
+
221
+ Verified self-contained inference run using:
222
+
223
+ 1. `qwen/qwen3.5-9b`
224
+
225
+ Results:
226
+
227
+ | Preset | Success | Steps | Score |
228
+ | -------- | ------- | ----- | ------- |
229
+ | `easy` | `true` | `11` | `0.952` |
230
+ | `medium` | `true` | `20` | `0.945` |
231
+ | `hard` | `true` | `45` | `0.652` |
232
+
233
  ## Local Development
234
 
235
  Validate the environment:
inference.py CHANGED
@@ -1,8 +1,12 @@
1
  from __future__ import annotations
2
 
 
3
  import json
4
  import os
 
 
5
  from dataclasses import dataclass
 
6
 
7
  from openai import OpenAI
8
  from workflow_arena import WorkflowArenaAction, WorkflowArenaEnv
@@ -12,7 +16,6 @@ from workflow_arena.models import (
12
  WorkflowArenaObservation,
13
  WorkflowTaskView,
14
  )
15
- from workflow_arena.presets import get_preset_config
16
 
17
  BENCHMARK = "WorkflowArena"
18
  PRESETS = [
@@ -20,7 +23,9 @@ PRESETS = [
20
  DifficultyPreset.MEDIUM,
21
  DifficultyPreset.HARD,
22
  ]
23
- DEFAULT_BASE_URL = os.getenv("WORKFLOW_ARENA_BASE_URL", "http://localhost:8000")
 
 
24
  TEMPERATURE = 0.0
25
  MAX_STEPS = 256
26
 
@@ -190,7 +195,54 @@ class EpisodeResult:
190
  rewards: list[float]
191
 
192
 
193
- def run_episode(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  client: OpenAI | None,
195
  model_name: str,
196
  preset: DifficultyPreset,
@@ -203,56 +255,53 @@ def run_episode(
203
 
204
  log_start(task=preset.value, env=BENCHMARK, model=model_name)
205
 
206
- with WorkflowArenaEnv(base_url=DEFAULT_BASE_URL).sync() as env:
207
- preset_config = get_preset_config(preset)
208
- result = env.reset(
209
- seed=seed,
210
- preset=preset.value,
211
- worker_count=preset_config.worker_count,
212
- )
213
- observation = result.observation
214
 
215
- while not observation.done and steps_taken < MAX_STEPS:
216
- try:
217
- if client is None:
218
- action = heuristic_action(observation)
219
- else:
220
- action = get_model_action(client, model_name, observation)
221
- except (
222
- Exception
223
- ): # pragma: no cover - network/model failures are expected sometimes
224
  action = heuristic_action(observation)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
- try:
227
- result = env.step(action)
228
- except (
229
- Exception
230
- ): # pragma: no cover - preserve log format and continue safely
231
- action = heuristic_action(observation)
232
- result = env.step(action)
233
-
234
- observation = result.observation
235
- reward = float(result.reward or 0.0)
236
- rewards.append(reward)
237
- steps_taken += 1
238
- log_step(
239
- step=steps_taken,
240
- action=action_to_log_string(action),
241
- reward=reward,
242
- done=bool(result.done),
243
- error=observation.validation_error,
244
- )
245
 
246
- success = is_success(observation)
247
- score = compute_score(observation) if observation.done else 0.0
248
- log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
249
 
250
  return EpisodeResult(
251
  success=success, steps=steps_taken, score=score, rewards=rewards
252
  )
253
 
254
 
255
- def main() -> None:
256
  api_base_url = os.environ["API_BASE_URL"]
257
  model_name = os.environ["MODEL_NAME"]
258
  api_key = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY")
@@ -261,14 +310,16 @@ def main() -> None:
261
 
262
  client = OpenAI(base_url=api_base_url, api_key=api_key)
263
 
264
- for index, preset in enumerate(PRESETS):
265
- run_episode(
266
- client=client,
267
- model_name=model_name,
268
- preset=preset,
269
- seed=100 + index,
270
- )
 
 
271
 
272
 
273
  if __name__ == "__main__":
274
- main()
 
1
  from __future__ import annotations
2
 
3
+ import asyncio
4
  import json
5
  import os
6
+ import subprocess
7
+ from contextlib import asynccontextmanager
8
  from dataclasses import dataclass
9
+ from pathlib import Path
10
 
11
  from openai import OpenAI
12
  from workflow_arena import WorkflowArenaAction, WorkflowArenaEnv
 
16
  WorkflowArenaObservation,
17
  WorkflowTaskView,
18
  )
 
19
 
20
  BENCHMARK = "WorkflowArena"
21
  PRESETS = [
 
23
  DifficultyPreset.MEDIUM,
24
  DifficultyPreset.HARD,
25
  ]
26
+ PROJECT_DIR = Path(__file__).resolve().parent
27
+ IMAGE_NAME = "workflow-arena-inference:latest"
28
+ DOCKERFILE_PATH = PROJECT_DIR / "server" / "Dockerfile"
29
  TEMPERATURE = 0.0
30
  MAX_STEPS = 256
31
 
 
195
  rewards: list[float]
196
 
197
 
198
+ def ensure_local_image() -> None:
199
+ try:
200
+ inspect_result = subprocess.run(
201
+ ["docker", "image", "inspect", IMAGE_NAME],
202
+ cwd=PROJECT_DIR,
203
+ stdout=subprocess.DEVNULL,
204
+ stderr=subprocess.DEVNULL,
205
+ check=False,
206
+ )
207
+ except OSError as exc:
208
+ raise RuntimeError(f"Failed to execute docker: {exc}") from exc
209
+
210
+ if inspect_result.returncode == 0:
211
+ return
212
+
213
+ try:
214
+ build_result = subprocess.run(
215
+ ["docker", "build", "-t", IMAGE_NAME, "-f", str(DOCKERFILE_PATH), "."],
216
+ cwd=PROJECT_DIR,
217
+ capture_output=True,
218
+ text=True,
219
+ check=False,
220
+ )
221
+ except OSError as exc:
222
+ raise RuntimeError(f"Failed to execute docker build: {exc}") from exc
223
+
224
+ if build_result.returncode != 0:
225
+ raise RuntimeError(
226
+ "Failed to build Docker image for inference.\n"
227
+ f"Command: docker build -t {IMAGE_NAME} -f {DOCKERFILE_PATH} .\n"
228
+ f"Exit code: {build_result.returncode}\n"
229
+ f"Stdout: {build_result.stdout}\n"
230
+ f"Stderr: {build_result.stderr}"
231
+ )
232
+
233
+
234
+ @asynccontextmanager
235
+ async def managed_env():
236
+ ensure_local_image()
237
+ env = await WorkflowArenaEnv.from_docker_image(IMAGE_NAME)
238
+ try:
239
+ yield env
240
+ finally:
241
+ await env.close()
242
+
243
+
244
+ async def run_episode(
245
+ env,
246
  client: OpenAI | None,
247
  model_name: str,
248
  preset: DifficultyPreset,
 
255
 
256
  log_start(task=preset.value, env=BENCHMARK, model=model_name)
257
 
258
+ result = await env.reset(
259
+ seed=seed,
260
+ preset=preset.value,
261
+ )
262
+ observation = result.observation
 
 
 
263
 
264
+ while not observation.done and steps_taken < MAX_STEPS:
265
+ try:
266
+ if client is None:
 
 
 
 
 
 
267
  action = heuristic_action(observation)
268
+ else:
269
+ action = get_model_action(client, model_name, observation)
270
+ except (
271
+ Exception
272
+ ): # pragma: no cover - network/model failures are expected sometimes
273
+ action = heuristic_action(observation)
274
+
275
+ try:
276
+ result = await env.step(action)
277
+ except (
278
+ Exception
279
+ ): # pragma: no cover - preserve log format and continue safely
280
+ action = heuristic_action(observation)
281
+ result = await env.step(action)
282
 
283
+ observation = result.observation
284
+ reward = float(result.reward or 0.0)
285
+ rewards.append(reward)
286
+ steps_taken += 1
287
+ log_step(
288
+ step=steps_taken,
289
+ action=action_to_log_string(action),
290
+ reward=reward,
291
+ done=bool(result.done),
292
+ error=observation.validation_error,
293
+ )
 
 
 
 
 
 
 
 
294
 
295
+ success = is_success(observation)
296
+ score = compute_score(observation) if observation.done else 0.0
297
+ log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
298
 
299
  return EpisodeResult(
300
  success=success, steps=steps_taken, score=score, rewards=rewards
301
  )
302
 
303
 
304
+ async def main() -> None:
305
  api_base_url = os.environ["API_BASE_URL"]
306
  model_name = os.environ["MODEL_NAME"]
307
  api_key = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY")
 
310
 
311
  client = OpenAI(base_url=api_base_url, api_key=api_key)
312
 
313
+ async with managed_env() as env:
314
+ for index, preset in enumerate(PRESETS):
315
+ await run_episode(
316
+ env=env,
317
+ client=client,
318
+ model_name=model_name,
319
+ preset=preset,
320
+ seed=100 + index,
321
+ )
322
 
323
 
324
  if __name__ == "__main__":
325
+ asyncio.run(main())