natnael kahssay Claude Sonnet 4.6 committed on
Commit
5d3d3ff
·
1 Parent(s): 002fe30

feat: multi-turn tool-using RL environment (RFC 005 pattern)

Browse files

Actions are now tool calls (read/edit/bash/submit) instead of one-shot
file submissions. The agent navigates the sandbox across multiple steps,
exactly like Claude Code, then calls submit to run vitest and get reward.

- MOAAction: {tool, params} replaces {file_path, content}
- MOAObservation: adds tool_result, user_messages, step_count
- moa_env.py: dispatches read/edit/bash/submit with sandbox path safety
- tasks.py: adds real user_messages per task (Step 1 intent alignment)
- Max 20 steps per episode; auto-submits on step 20

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

src/envs/moa_env/moa_env.py CHANGED
@@ -1,8 +1,16 @@
1
  """
2
- MOA Code Environment.
3
-
4
- Agent is given a task description + broken TypeScript file.
5
- Agent submits a fixed version. Tests run. Reward = test pass rate.
 
 
 
 
 
 
 
 
6
  """
7
 
8
  import os
@@ -15,18 +23,17 @@ from core.env_server import Action, Environment, Observation
15
  from .models import MOAAction, MOAObservation, MOAState
16
  from .tasks import load_task, TASKS
17
 
 
 
 
18
 
19
- class MOAEnv(Environment):
20
- """
21
- RL environment for TypeScript code tasks derived from real MOA dev sessions.
22
-
23
- reset() → gives agent task description + broken file + test file
24
- step() → agent submits fixed file → runs vitest → returns reward
25
- """
26
 
 
27
  def __init__(self):
28
  self._state = MOAState()
29
- self._task_index = 0 # cycle through tasks
 
 
30
 
31
  def reset(self) -> Observation:
32
  task_id = TASKS[self._task_index % len(TASKS)]["id"]
@@ -39,6 +46,7 @@ class MOAEnv(Environment):
39
  episode_id=str(uuid.uuid4()),
40
  step_count=0,
41
  current_task=task["description"],
 
42
  broken_file_path=task["source_file"],
43
  broken_file_content=task["broken_content"],
44
  test_file_content=task["test_file_content"],
@@ -49,30 +57,149 @@ class MOAEnv(Environment):
49
 
50
  return MOAObservation(
51
  task=task["description"],
 
52
  broken_file_path=task["source_file"],
53
  broken_file_content=task["broken_content"],
54
  test_file_content=task["test_file_content"],
 
 
55
  done=False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  )
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  def _make_sandbox(self, task: dict) -> str:
59
- """
60
- Copy moav2/src to a temp dir, symlink node_modules from the pre-installed
61
- /app/moav2 directory (avoids copying 700MB per request), then blank the
62
- target source file so the agent starts from scratch.
63
- """
64
  import shutil
65
  MOAV2 = "/app/moav2"
66
  sandbox = tempfile.mkdtemp(prefix="moa_env_")
67
 
68
- # Copy source tree (812KB) and config files
69
  shutil.copytree(os.path.join(MOAV2, "src"), os.path.join(sandbox, "src"))
70
  for f in ("package.json", "vitest.config.ts", "tsconfig.json"):
71
  src = os.path.join(MOAV2, f)
72
  if os.path.exists(src):
73
  shutil.copy(src, sandbox)
74
 
75
- # Symlink node_modules — no copy needed
76
  os.symlink(
77
  os.path.join(MOAV2, "node_modules"),
78
  os.path.join(sandbox, "node_modules"),
@@ -80,47 +207,15 @@ class MOAEnv(Environment):
80
 
81
  # Blank out the target file — agent must implement it
82
  broken_path = os.path.join(sandbox, task["source_file"])
 
83
  with open(broken_path, "w") as f:
84
  f.write(task["broken_content"])
85
 
86
  return sandbox
87
 
88
- def step(self, action: Action) -> Observation:
89
- if not isinstance(action, MOAAction):
90
- raise ValueError(f"Expected MOAAction, got {type(action)}")
91
-
92
- self._state.step_count += 1
93
-
94
- # write agent's fix into sandbox
95
- sandbox_file = os.path.join(
96
- self._state.sandbox_dir,
97
- action.file_path.lstrip("/"),
98
- )
99
- os.makedirs(os.path.dirname(sandbox_file), exist_ok=True)
100
- with open(sandbox_file, "w") as f:
101
- f.write(action.content)
102
-
103
- # run tests
104
- passed, total, output = self._run_tests()
105
- reward = passed / max(total, 1)
106
- done = (passed == total and total > 0) or self._state.step_count >= 10
107
-
108
- self._state.last_reward = reward
109
-
110
- return MOAObservation(
111
- task=self._state.current_task,
112
- broken_file_path=self._state.broken_file_path,
113
- broken_file_content=action.content, # show what agent submitted
114
- test_file_content=self._state.test_file_content,
115
- test_output=output,
116
- tests_passed=passed,
117
- tests_total=total,
118
- reward=reward,
119
- done=done,
120
- )
121
 
122
  def _run_tests(self) -> tuple[int, int, str]:
123
- """Run only the task's test file for speed (~500ms vs full suite)."""
124
  try:
125
  result = subprocess.run(
126
  ["npx", "vitest", "run", "--reporter=verbose",
 
1
  """
2
+ MOA Code Environment — multi-turn tool-using RL environment.
3
+
4
+ Following OpenEnv RFC 005 (agentic harnesses) pattern:
5
+ reset() → task + broken file stub — agent starts exploring
6
+ step(read/edit/bash) → tool result, no reward yet
7
+ step(submit) → runs vitest → reward = tests_passed / tests_total → done
8
+
9
+ The agent uses the same tool kit as Claude Code:
10
+ read — read any file in the sandbox
11
+ edit — apply an exact string replacement
12
+ bash — run a shell command in the sandbox (timeout 10s)
13
+ submit — trigger tests and end the episode
14
  """
15
 
16
  import os
 
23
  from .models import MOAAction, MOAObservation, MOAState
24
  from .tasks import load_task, TASKS
25
 
26
+ MAX_STEPS = 20
27
+ BASH_TIMEOUT = 10 # seconds per bash command
28
+ READ_MAX_CHARS = 8000
29
 
 
 
 
 
 
 
 
30
 
31
+ class MOAEnv(Environment):
32
  def __init__(self):
33
  self._state = MOAState()
34
+ self._task_index = 0
35
+
36
+ # ── reset ──────────────────────────────────────────────────────
37
 
38
  def reset(self) -> Observation:
39
  task_id = TASKS[self._task_index % len(TASKS)]["id"]
 
46
  episode_id=str(uuid.uuid4()),
47
  step_count=0,
48
  current_task=task["description"],
49
+ user_messages=task.get("user_messages", []),
50
  broken_file_path=task["source_file"],
51
  broken_file_content=task["broken_content"],
52
  test_file_content=task["test_file_content"],
 
57
 
58
  return MOAObservation(
59
  task=task["description"],
60
+ user_messages=task.get("user_messages", []),
61
  broken_file_path=task["source_file"],
62
  broken_file_content=task["broken_content"],
63
  test_file_content=task["test_file_content"],
64
+ tool="reset",
65
+ tool_result="",
66
  done=False,
67
+ step_count=0,
68
+ )
69
+
70
+ # ── step ───────────────────────────────────────────────────────
71
+
72
+ def step(self, action: Action) -> Observation:
73
+ if not isinstance(action, MOAAction):
74
+ raise ValueError(f"Expected MOAAction, got {type(action)}")
75
+
76
+ self._state.step_count += 1
77
+ tool = action.tool
78
+ params = action.params
79
+
80
+ # ── submit: run tests, end episode ──
81
+ if tool == "submit":
82
+ passed, total, output = self._run_tests()
83
+ reward = passed / max(total, 1)
84
+ self._state.last_reward = reward
85
+ return MOAObservation(
86
+ task=self._state.current_task,
87
+ user_messages=self._state.user_messages,
88
+ broken_file_path=self._state.broken_file_path,
89
+ tool="submit",
90
+ tool_result="",
91
+ test_output=output,
92
+ tests_passed=passed,
93
+ tests_total=total,
94
+ reward=reward,
95
+ done=True,
96
+ step_count=self._state.step_count,
97
+ )
98
+
99
+ # ── tool calls ──
100
+ try:
101
+ if tool == "read":
102
+ result = self._tool_read(params.get("path", ""))
103
+ elif tool == "edit":
104
+ result = self._tool_edit(
105
+ params.get("path", ""),
106
+ params.get("old_string", ""),
107
+ params.get("new_string", ""),
108
+ )
109
+ elif tool == "bash":
110
+ result = self._tool_bash(params.get("cmd", ""))
111
+ else:
112
+ result = f"Unknown tool '{tool}'. Available: read, edit, bash, submit"
113
+ except Exception as e:
114
+ result = f"Error: {e}"
115
+
116
+ # max steps → auto-submit
117
+ done = self._state.step_count >= MAX_STEPS
118
+ if done:
119
+ passed, total, output = self._run_tests()
120
+ reward = passed / max(total, 1)
121
+ self._state.last_reward = reward
122
+ else:
123
+ reward, passed, total, output = 0.0, 0, 0, ""
124
+
125
+ return MOAObservation(
126
+ task=self._state.current_task,
127
+ user_messages=self._state.user_messages,
128
+ broken_file_path=self._state.broken_file_path,
129
+ tool=tool,
130
+ tool_result=result,
131
+ test_output=output,
132
+ tests_passed=passed,
133
+ tests_total=total,
134
+ reward=reward,
135
+ done=done,
136
+ step_count=self._state.step_count,
137
  )
138
 
139
+ # ── tools ──────────────────────────────────────────────────────
140
+
141
+ def _sandbox_path(self, rel_path: str) -> str:
142
+ """Resolve a relative path to the sandbox, blocking directory traversal."""
143
+ clean = rel_path.lstrip("/")
144
+ full = os.path.realpath(os.path.join(self._state.sandbox_dir, clean))
145
+ if not full.startswith(os.path.realpath(self._state.sandbox_dir)):
146
+ raise ValueError("Path escapes sandbox")
147
+ return full
148
+
149
+ def _tool_read(self, path: str) -> str:
150
+ full = self._sandbox_path(path)
151
+ if not os.path.isfile(full):
152
+ return f"Error: file not found: {path}"
153
+ with open(full) as f:
154
+ content = f.read(READ_MAX_CHARS)
155
+ if len(content) == READ_MAX_CHARS:
156
+ content += "\n... (truncated)"
157
+ return content
158
+
159
+ def _tool_edit(self, path: str, old_string: str, new_string: str) -> str:
160
+ full = self._sandbox_path(path)
161
+ if not os.path.isfile(full):
162
+ return f"Error: file not found: {path}"
163
+ with open(full) as f:
164
+ original = f.read()
165
+ if old_string not in original:
166
+ return f"Error: old_string not found in {path}"
167
+ updated = original.replace(old_string, new_string, 1)
168
+ with open(full, "w") as f:
169
+ f.write(updated)
170
+ lines_changed = new_string.count("\n") - old_string.count("\n")
171
+ return f"Edited {path} ({lines_changed:+d} lines)"
172
+
173
+ def _tool_bash(self, cmd: str) -> str:
174
+ try:
175
+ result = subprocess.run(
176
+ cmd,
177
+ shell=True,
178
+ cwd=self._state.sandbox_dir,
179
+ capture_output=True,
180
+ text=True,
181
+ timeout=BASH_TIMEOUT,
182
+ )
183
+ out = (result.stdout + result.stderr)[-3000:]
184
+ return out if out else "(no output)"
185
+ except subprocess.TimeoutExpired:
186
+ return f"Error: command timed out after {BASH_TIMEOUT}s"
187
+ except Exception as e:
188
+ return f"Error: {e}"
189
+
190
+ # ── sandbox setup ──────────────────────────────────────────────
191
+
192
  def _make_sandbox(self, task: dict) -> str:
 
 
 
 
 
193
  import shutil
194
  MOAV2 = "/app/moav2"
195
  sandbox = tempfile.mkdtemp(prefix="moa_env_")
196
 
 
197
  shutil.copytree(os.path.join(MOAV2, "src"), os.path.join(sandbox, "src"))
198
  for f in ("package.json", "vitest.config.ts", "tsconfig.json"):
199
  src = os.path.join(MOAV2, f)
200
  if os.path.exists(src):
201
  shutil.copy(src, sandbox)
202
 
 
203
  os.symlink(
204
  os.path.join(MOAV2, "node_modules"),
205
  os.path.join(sandbox, "node_modules"),
 
207
 
208
  # Blank out the target file — agent must implement it
209
  broken_path = os.path.join(sandbox, task["source_file"])
210
+ os.makedirs(os.path.dirname(broken_path), exist_ok=True)
211
  with open(broken_path, "w") as f:
212
  f.write(task["broken_content"])
213
 
214
  return sandbox
215
 
216
+ # ── test runner ────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
 
218
  def _run_tests(self) -> tuple[int, int, str]:
 
219
  try:
220
  result = subprocess.run(
221
  ["npx", "vitest", "run", "--reporter=verbose",
src/envs/moa_env/models.py CHANGED
@@ -1,33 +1,57 @@
1
  """
2
  Models for the MOA Code Environment.
3
- Agent receives a task + broken file, submits a fixed file, gets scored by tests.
 
 
 
4
  """
5
 
6
  from __future__ import annotations
7
  from dataclasses import dataclass, field
8
- from typing import Optional
9
  from core.env_server import Action, Observation, State
10
 
11
 
12
  @dataclass
13
  class MOAAction(Action):
14
- """Agent submits a fixed version of the file."""
15
- file_path: str # which file they are fixing
16
- content: str # the fixed file contents
 
 
 
 
 
 
 
 
 
17
 
18
 
19
  @dataclass
20
  class MOAObservation(Observation):
21
  """What the agent sees at each step."""
22
- task: str = "" # collapsed task description
23
- broken_file_path: str = "" # path of the file to fix
24
- broken_file_content: str = "" # current (broken) content
25
- test_file_content: str = "" # the test file (so agent knows what must pass)
26
- test_output: str = "" # vitest output after submission
 
 
 
 
 
 
 
 
 
 
27
  tests_passed: int = 0
28
  tests_total: int = 0
 
29
  reward: float = 0.0
30
  done: bool = False
 
31
 
32
 
33
  @dataclass
@@ -36,9 +60,11 @@ class MOAState(State):
36
  episode_id: str = ""
37
  step_count: int = 0
38
  current_task: str = ""
 
39
  broken_file_path: str = ""
40
  broken_file_content: str = ""
41
  test_file_content: str = ""
42
  sandbox_dir: str = ""
43
  test_file: str = ""
44
  last_reward: float = 0.0
 
 
1
  """
2
  Models for the MOA Code Environment.
3
+
4
+ Multi-turn tool-using environment following OpenEnv RFC 005 (agentic harnesses).
5
+ The agent calls tools (read/edit/bash) across multiple steps, then submits to
6
+ trigger the test suite. Reward = tests_passed / tests_total on submit.
7
  """
8
 
9
  from __future__ import annotations
10
  from dataclasses import dataclass, field
11
+ from typing import List
12
  from core.env_server import Action, Observation, State
13
 
14
 
15
  @dataclass
16
  class MOAAction(Action):
17
+ """One tool call from the agent.
18
+
19
+ tool: "read" | "edit" | "bash" | "submit"
20
+ params: tool-specific parameters
21
+
22
+ read: {"path": "src/foo.ts"}
23
+ edit: {"path": "src/foo.ts", "old_string": "...", "new_string": "..."}
24
+ bash: {"cmd": "npx tsc --noEmit 2>&1 | head -20"}
25
+ submit: {} — runs the test suite and ends the episode
26
+ """
27
+ tool: str
28
+ params: dict = field(default_factory=dict)
29
 
30
 
31
  @dataclass
32
  class MOAObservation(Observation):
33
  """What the agent sees at each step."""
34
+ # Present on reset and every step so agent always has context
35
+ task: str = ""
36
+ broken_file_path: str = ""
37
+ user_messages: List[str] = field(default_factory=list)
38
+
39
+ # Set on reset only (initial state)
40
+ broken_file_content: str = ""
41
+ test_file_content: str = ""
42
+
43
+ # Set after each tool call
44
+ tool: str = "" # which tool was just called
45
+ tool_result: str = "" # output / result of the tool
46
+
47
+ # Set only on submit (final step)
48
+ test_output: str = ""
49
  tests_passed: int = 0
50
  tests_total: int = 0
51
+
52
  reward: float = 0.0
53
  done: bool = False
54
+ step_count: int = 0
55
 
56
 
57
  @dataclass
 
60
  episode_id: str = ""
61
  step_count: int = 0
62
  current_task: str = ""
63
+ user_messages: List[str] = field(default_factory=list)
64
  broken_file_path: str = ""
65
  broken_file_content: str = ""
66
  test_file_content: str = ""
67
  sandbox_dir: str = ""
68
  test_file: str = ""
69
  last_reward: float = 0.0
70
+ max_steps: int = 20
src/envs/moa_env/tasks.py CHANGED
@@ -2,6 +2,9 @@
2
  Tasks for the MOA RL environment.
3
  Uses real moav2 service files. Source and tests are pre-installed in /app/moav2.
4
  Each task blanks out one service file — the agent must re-implement it.
 
 
 
5
  """
6
  import os
7
 
@@ -10,6 +13,12 @@ MOAV2_DIR = "/app/moav2"
10
  TASKS = [
11
  {
12
  "id": "task_001",
 
 
 
 
 
 
13
  "description": (
14
  "Implement resolveModel() in model-resolver.ts. "
15
  "It maps (modelId, authMethod) to a Model object using the pi-ai registry. "
@@ -30,6 +39,12 @@ TASKS = [
30
  },
31
  {
32
  "id": "task_002",
 
 
 
 
 
 
33
  "description": (
34
  "Implement retry.ts with three exports: "
35
  "isRetryableError(e) returns true for HTTP 429/5xx and common retry keywords. "
@@ -53,6 +68,12 @@ TASKS = [
53
  },
54
  {
55
  "id": "task_003",
 
 
 
 
 
 
56
  "description": (
57
  "Implement EventStore in event-store.ts. "
58
  "It persists events to a DB with append(event), query(filter), "
 
2
  Tasks for the MOA RL environment.
3
  Uses real moav2 service files. Source and tests are pre-installed in /app/moav2.
4
  Each task blanks out one service file — the agent must re-implement it.
5
+
6
+ user_messages: real messages from MOA dev sessions. The model learns to map
7
+ terse, imprecise human intent directly to correct TypeScript implementations.
8
  """
9
  import os
10
 
 
13
  TASKS = [
14
  {
15
  "id": "task_001",
16
+ "user_messages": [
17
+ "the model resolver isn't working, it keeps throwing not implemented",
18
+ "we need it to support anthropic key, oauth, and vertex auth methods",
19
+ "if the registry lookup fails it should scan all providers",
20
+ "fall back to a custom config with providerBaseUrl if nothing works",
21
+ ],
22
  "description": (
23
  "Implement resolveModel() in model-resolver.ts. "
24
  "It maps (modelId, authMethod) to a Model object using the pi-ai registry. "
 
39
  },
40
  {
41
  "id": "task_002",
42
+ "user_messages": [
43
+ "add retry logic to the api client, it's failing on rate limits",
44
+ "need exponential backoff, start at 500ms",
45
+ "also needs to handle 429 and 5xx errors",
46
+ "the sleep function should respect an abort signal",
47
+ ],
48
  "description": (
49
  "Implement retry.ts with three exports: "
50
  "isRetryableError(e) returns true for HTTP 429/5xx and common retry keywords. "
 
68
  },
69
  {
70
  "id": "task_003",
71
+ "user_messages": [
72
+ "we need an event store that persists to the db",
73
+ "it needs append, query, search, count, and materialize",
74
+ "materialize should reconstruct session state from events",
75
+ "search should do full text search across event data",
76
+ ],
77
  "description": (
78
  "Implement EventStore in event-store.ts. "
79
  "It persists events to a DB with append(event), query(filter), "