natnael kahssay Claude Sonnet 4.6 committed on
Commit
5d3d3ff
·
1 Parent(s): 002fe30

feat: multi-turn tool-using RL environment (RFC 005 pattern)

Browse files

Actions are now tool calls (read/edit/bash/submit) instead of one-shot
file submissions. The agent navigates the sandbox across multiple steps,
exactly like Claude Code, then calls submit to run vitest and get reward.

- MOAAction: {tool, params} replaces {file_path, content}
- MOAObservation: adds tool_result, user_messages, step_count
- moa_env.py: dispatches read/edit/bash/submit with sandbox path safety
- tasks.py: adds real user_messages per task (Step 1 intent alignment)
- Max 20 steps per episode; auto-submits on step 20

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

src/envs/moa_env/moa_env.py CHANGED
@@ -1,8 +1,16 @@
1
  """
2
- MOA Code Environment.
3
-
4
- Agent is given a task description + broken TypeScript file.
5
- Agent submits a fixed version. Tests run. Reward = test pass rate.
 
 
 
 
 
 
 
 
6
  """
7
 
8
  import os
@@ -15,18 +23,17 @@ from core.env_server import Action, Environment, Observation
15
  from .models import MOAAction, MOAObservation, MOAState
16
  from .tasks import load_task, TASKS
17
 
 
 
 
18
 
19
- class MOAEnv(Environment):
20
- """
21
- RL environment for TypeScript code tasks derived from real MOA dev sessions.
22
-
23
- reset() → gives agent task description + broken file + test file
24
- step() → agent submits fixed file → runs vitest → returns reward
25
- """
26
 
 
27
  def __init__(self):
28
  self._state = MOAState()
29
- self._task_index = 0 # cycle through tasks
 
 
30
 
31
  def reset(self) -> Observation:
32
  task_id = TASKS[self._task_index % len(TASKS)]["id"]
@@ -39,6 +46,7 @@ class MOAEnv(Environment):
39
  episode_id=str(uuid.uuid4()),
40
  step_count=0,
41
  current_task=task["description"],
 
42
  broken_file_path=task["source_file"],
43
  broken_file_content=task["broken_content"],
44
  test_file_content=task["test_file_content"],
@@ -49,30 +57,149 @@ class MOAEnv(Environment):
49
 
50
  return MOAObservation(
51
  task=task["description"],
 
52
  broken_file_path=task["source_file"],
53
  broken_file_content=task["broken_content"],
54
  test_file_content=task["test_file_content"],
 
 
55
  done=False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  )
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  def _make_sandbox(self, task: dict) -> str:
59
- """
60
- Copy moav2/src to a temp dir, symlink node_modules from the pre-installed
61
- /app/moav2 directory (avoids copying 700MB per request), then blank the
62
- target source file so the agent starts from scratch.
63
- """
64
  import shutil
65
  MOAV2 = "/app/moav2"
66
  sandbox = tempfile.mkdtemp(prefix="moa_env_")
67
 
68
- # Copy source tree (812KB) and config files
69
  shutil.copytree(os.path.join(MOAV2, "src"), os.path.join(sandbox, "src"))
70
  for f in ("package.json", "vitest.config.ts", "tsconfig.json"):
71
  src = os.path.join(MOAV2, f)
72
  if os.path.exists(src):
73
  shutil.copy(src, sandbox)
74
 
75
- # Symlink node_modules — no copy needed
76
  os.symlink(
77
  os.path.join(MOAV2, "node_modules"),
78
  os.path.join(sandbox, "node_modules"),
@@ -80,47 +207,15 @@ class MOAEnv(Environment):
80
 
81
  # Blank out the target file — agent must implement it
82
  broken_path = os.path.join(sandbox, task["source_file"])
 
83
  with open(broken_path, "w") as f:
84
  f.write(task["broken_content"])
85
 
86
  return sandbox
87
 
88
- def step(self, action: Action) -> Observation:
89
- if not isinstance(action, MOAAction):
90
- raise ValueError(f"Expected MOAAction, got {type(action)}")
91
-
92
- self._state.step_count += 1
93
-
94
- # write agent's fix into sandbox
95
- sandbox_file = os.path.join(
96
- self._state.sandbox_dir,
97
- action.file_path.lstrip("/"),
98
- )
99
- os.makedirs(os.path.dirname(sandbox_file), exist_ok=True)
100
- with open(sandbox_file, "w") as f:
101
- f.write(action.content)
102
-
103
- # run tests
104
- passed, total, output = self._run_tests()
105
- reward = passed / max(total, 1)
106
- done = (passed == total and total > 0) or self._state.step_count >= 10
107
-
108
- self._state.last_reward = reward
109
-
110
- return MOAObservation(
111
- task=self._state.current_task,
112
- broken_file_path=self._state.broken_file_path,
113
- broken_file_content=action.content, # show what agent submitted
114
- test_file_content=self._state.test_file_content,
115
- test_output=output,
116
- tests_passed=passed,
117
- tests_total=total,
118
- reward=reward,
119
- done=done,
120
- )
121
 
122
  def _run_tests(self) -> tuple[int, int, str]:
123
- """Run only the task's test file for speed (~500ms vs full suite)."""
124
  try:
125
  result = subprocess.run(
126
  ["npx", "vitest", "run", "--reporter=verbose",
 
1
  """
2
+ MOA Code Environment — multi-turn tool-using RL environment.
3
+
4
+ Following OpenEnv RFC 005 (agentic harnesses) pattern:
5
+ reset() → task + broken file stub — agent starts exploring
6
+ step(read/edit/bash) → tool result, no reward yet
7
+ step(submit) → runs vitest → reward = tests_passed / tests_total → done
8
+
9
+ The agent uses the same tool kit as Claude Code:
10
+ read — read any file in the sandbox
11
+ edit — apply an exact string replacement
12
+ bash — run a shell command in the sandbox (timeout 10s)
13
+ submit — trigger tests and end the episode
14
  """
15
 
16
  import os
 
23
  from .models import MOAAction, MOAObservation, MOAState
24
  from .tasks import load_task, TASKS
25
 
26
+ MAX_STEPS = 20
27
+ BASH_TIMEOUT = 10 # seconds per bash command
28
+ READ_MAX_CHARS = 8000
29
 
 
 
 
 
 
 
 
30
 
31
+ class MOAEnv(Environment):
32
  def __init__(self):
33
  self._state = MOAState()
34
+ self._task_index = 0
35
+
36
+ # ── reset ──────────────────────────────────────────────────────
37
 
38
  def reset(self) -> Observation:
39
  task_id = TASKS[self._task_index % len(TASKS)]["id"]
 
46
  episode_id=str(uuid.uuid4()),
47
  step_count=0,
48
  current_task=task["description"],
49
+ user_messages=task.get("user_messages", []),
50
  broken_file_path=task["source_file"],
51
  broken_file_content=task["broken_content"],
52
  test_file_content=task["test_file_content"],
 
57
 
58
  return MOAObservation(
59
  task=task["description"],
60
+ user_messages=task.get("user_messages", []),
61
  broken_file_path=task["source_file"],
62
  broken_file_content=task["broken_content"],
63
  test_file_content=task["test_file_content"],
64
+ tool="reset",
65
+ tool_result="",
66
  done=False,
67
+ step_count=0,
68
+ )
69
+
70
+ # ── step ───────────────────────────────────────────────────────
71
+
72
+ def step(self, action: Action) -> Observation:
73
+ if not isinstance(action, MOAAction):
74
+ raise ValueError(f"Expected MOAAction, got {type(action)}")
75
+
76
+ self._state.step_count += 1
77
+ tool = action.tool
78
+ params = action.params
79
+
80
+ # ── submit: run tests, end episode ──
81
+ if tool == "submit":
82
+ passed, total, output = self._run_tests()
83
+ reward = passed / max(total, 1)
84
+ self._state.last_reward = reward
85
+ return MOAObservation(
86
+ task=self._state.current_task,
87
+ user_messages=self._state.user_messages,
88
+ broken_file_path=self._state.broken_file_path,
89
+ tool="submit",
90
+ tool_result="",
91
+ test_output=output,
92
+ tests_passed=passed,
93
+ tests_total=total,
94
+ reward=reward,
95
+ done=True,
96
+ step_count=self._state.step_count,
97
+ )
98
+
99
+ # ── tool calls ──
100
+ try:
101
+ if tool == "read":
102
+ result = self._tool_read(params.get("path", ""))
103
+ elif tool == "edit":
104
+ result = self._tool_edit(
105
+ params.get("path", ""),
106
+ params.get("old_string", ""),
107
+ params.get("new_string", ""),
108
+ )
109
+ elif tool == "bash":
110
+ result = self._tool_bash(params.get("cmd", ""))
111
+ else:
112
+ result = f"Unknown tool '{tool}'. Available: read, edit, bash, submit"
113
+ except Exception as e:
114
+ result = f"Error: {e}"
115
+
116
+ # max steps → auto-submit
117
+ done = self._state.step_count >= MAX_STEPS
118
+ if done:
119
+ passed, total, output = self._run_tests()
120
+ reward = passed / max(total, 1)
121
+ self._state.last_reward = reward
122
+ else:
123
+ reward, passed, total, output = 0.0, 0, 0, ""
124
+
125
+ return MOAObservation(
126
+ task=self._state.current_task,
127
+ user_messages=self._state.user_messages,
128
+ broken_file_path=self._state.broken_file_path,
129
+ tool=tool,
130
+ tool_result=result,
131
+ test_output=output,
132
+ tests_passed=passed,
133
+ tests_total=total,
134
+ reward=reward,
135
+ done=done,
136
+ step_count=self._state.step_count,
137
  )
138
 
139
+ # ── tools ──────────────────────────────────────────────────────
140
+
141
+ def _sandbox_path(self, rel_path: str) -> str:
142
+ """Resolve a relative path to the sandbox, blocking directory traversal."""
143
+ clean = rel_path.lstrip("/")
144
+ full = os.path.realpath(os.path.join(self._state.sandbox_dir, clean))
145
+ if not full.startswith(os.path.realpath(self._state.sandbox_dir)):
146
+ raise ValueError("Path escapes sandbox")
147
+ return full
148
+
149
+ def _tool_read(self, path: str) -> str:
150
+ full = self._sandbox_path(path)
151
+ if not os.path.isfile(full):
152
+ return f"Error: file not found: {path}"
153
+ with open(full) as f:
154
+ content = f.read(READ_MAX_CHARS)
155
+ if len(content) == READ_MAX_CHARS:
156
+ content += "\n... (truncated)"
157
+ return content
158
+
159
+ def _tool_edit(self, path: str, old_string: str, new_string: str) -> str:
160
+ full = self._sandbox_path(path)
161
+ if not os.path.isfile(full):
162
+ return f"Error: file not found: {path}"
163
+ with open(full) as f:
164
+ original = f.read()
165
+ if old_string not in original:
166
+ return f"Error: old_string not found in {path}"
167
+ updated = original.replace(old_string, new_string, 1)
168
+ with open(full, "w") as f:
169
+ f.write(updated)
170
+ lines_changed = new_string.count("\n") - old_string.count("\n")
171
+ return f"Edited {path} ({lines_changed:+d} lines)"
172
+
173
+ def _tool_bash(self, cmd: str) -> str:
174
+ try:
175
+ result = subprocess.run(
176
+ cmd,
177
+ shell=True,
178
+ cwd=self._state.sandbox_dir,
179
+ capture_output=True,
180
+ text=True,
181
+ timeout=BASH_TIMEOUT,
182
+ )
183
+ out = (result.stdout + result.stderr)[-3000:]
184
+ return out if out else "(no output)"
185
+ except subprocess.TimeoutExpired:
186
+ return f"Error: command timed out after {BASH_TIMEOUT}s"
187
+ except Exception as e:
188
+ return f"Error: {e}"
189
+
190
+ # ── sandbox setup ──────────────────────────────────────────────
191
+
192
  def _make_sandbox(self, task: dict) -> str:
 
 
 
 
 
193
  import shutil
194
  MOAV2 = "/app/moav2"
195
  sandbox = tempfile.mkdtemp(prefix="moa_env_")
196
 
 
197
  shutil.copytree(os.path.join(MOAV2, "src"), os.path.join(sandbox, "src"))
198
  for f in ("package.json", "vitest.config.ts", "tsconfig.json"):
199
  src = os.path.join(MOAV2, f)
200
  if os.path.exists(src):
201
  shutil.copy(src, sandbox)
202
 
 
203
  os.symlink(
204
  os.path.join(MOAV2, "node_modules"),
205
  os.path.join(sandbox, "node_modules"),
 
207
 
208
  # Blank out the target file — agent must implement it
209
  broken_path = os.path.join(sandbox, task["source_file"])
210
+ os.makedirs(os.path.dirname(broken_path), exist_ok=True)
211
  with open(broken_path, "w") as f:
212
  f.write(task["broken_content"])
213
 
214
  return sandbox
215
 
216
+ # ── test runner ────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
 
218
  def _run_tests(self) -> tuple[int, int, str]:
 
219
  try:
220
  result = subprocess.run(
221
  ["npx", "vitest", "run", "--reporter=verbose",
src/envs/moa_env/models.py CHANGED
@@ -1,33 +1,57 @@
1
  """
2
  Models for the MOA Code Environment.
3
- Agent receives a task + broken file, submits a fixed file, gets scored by tests.
 
 
 
4
  """
5
 
6
  from __future__ import annotations
7
  from dataclasses import dataclass, field
8
- from typing import Optional
9
  from core.env_server import Action, Observation, State
10
 
11
 
12
  @dataclass
13
  class MOAAction(Action):
14
- """Agent submits a fixed version of the file."""
15
- file_path: str # which file they are fixing
16
- content: str # the fixed file contents
 
 
 
 
 
 
 
 
 
17
 
18
 
19
  @dataclass
20
  class MOAObservation(Observation):
21
  """What the agent sees at each step."""
22
- task: str = "" # collapsed task description
23
- broken_file_path: str = "" # path of the file to fix
24
- broken_file_content: str = "" # current (broken) content
25
- test_file_content: str = "" # the test file (so agent knows what must pass)
26
- test_output: str = "" # vitest output after submission
 
 
 
 
 
 
 
 
 
 
27
  tests_passed: int = 0
28
  tests_total: int = 0
 
29
  reward: float = 0.0
30
  done: bool = False
 
31
 
32
 
33
  @dataclass
@@ -36,9 +60,11 @@ class MOAState(State):
36
  episode_id: str = ""
37
  step_count: int = 0
38
  current_task: str = ""
 
39
  broken_file_path: str = ""
40
  broken_file_content: str = ""
41
  test_file_content: str = ""
42
  sandbox_dir: str = ""
43
  test_file: str = ""
44
  last_reward: float = 0.0
 
 
1
  """
2
  Models for the MOA Code Environment.
3
+
4
+ Multi-turn tool-using environment following OpenEnv RFC 005 (agentic harnesses).
5
+ The agent calls tools (read/edit/bash) across multiple steps, then submits to
6
+ trigger the test suite. Reward = tests_passed / tests_total on submit.
7
  """
8
 
9
  from __future__ import annotations
10
  from dataclasses import dataclass, field
11
+ from typing import List
12
  from core.env_server import Action, Observation, State
13
 
14
 
15
  @dataclass
16
  class MOAAction(Action):
17
+ """One tool call from the agent.
18
+
19
+ tool: "read" | "edit" | "bash" | "submit"
20
+ params: tool-specific parameters
21
+
22
+ read: {"path": "src/foo.ts"}
23
+ edit: {"path": "src/foo.ts", "old_string": "...", "new_string": "..."}
24
+ bash: {"cmd": "npx tsc --noEmit 2>&1 | head -20"}
25
+ submit: {} — runs the test suite and ends the episode
26
+ """
27
+ tool: str
28
+ params: dict = field(default_factory=dict)
29
 
30
 
31
  @dataclass
32
  class MOAObservation(Observation):
33
  """What the agent sees at each step."""
34
+ # Present on reset and every step so agent always has context
35
+ task: str = ""
36
+ broken_file_path: str = ""
37
+ user_messages: List[str] = field(default_factory=list)
38
+
39
+ # Set on reset only (initial state)
40
+ broken_file_content: str = ""
41
+ test_file_content: str = ""
42
+
43
+ # Set after each tool call
44
+ tool: str = "" # which tool was just called
45
+ tool_result: str = "" # output / result of the tool
46
+
47
+ # Set only on submit (final step)
48
+ test_output: str = ""
49
  tests_passed: int = 0
50
  tests_total: int = 0
51
+
52
  reward: float = 0.0
53
  done: bool = False
54
+ step_count: int = 0
55
 
56
 
57
  @dataclass
 
60
  episode_id: str = ""
61
  step_count: int = 0
62
  current_task: str = ""
63
+ user_messages: List[str] = field(default_factory=list)
64
  broken_file_path: str = ""
65
  broken_file_content: str = ""
66
  test_file_content: str = ""
67
  sandbox_dir: str = ""
68
  test_file: str = ""
69
  last_reward: float = 0.0
70
+ max_steps: int = 20
src/envs/moa_env/tasks.py CHANGED
@@ -2,6 +2,9 @@
2
  Tasks for the MOA RL environment.
3
  Uses real moav2 service files. Source and tests are pre-installed in /app/moav2.
4
  Each task blanks out one service file — the agent must re-implement it.
 
 
 
5
  """
6
  import os
7
 
@@ -10,6 +13,12 @@ MOAV2_DIR = "/app/moav2"
10
  TASKS = [
11
  {
12
  "id": "task_001",
 
 
 
 
 
 
13
  "description": (
14
  "Implement resolveModel() in model-resolver.ts. "
15
  "It maps (modelId, authMethod) to a Model object using the pi-ai registry. "
@@ -30,6 +39,12 @@ TASKS = [
30
  },
31
  {
32
  "id": "task_002",
 
 
 
 
 
 
33
  "description": (
34
  "Implement retry.ts with three exports: "
35
  "isRetryableError(e) returns true for HTTP 429/5xx and common retry keywords. "
@@ -53,6 +68,12 @@ TASKS = [
53
  },
54
  {
55
  "id": "task_003",
 
 
 
 
 
 
56
  "description": (
57
  "Implement EventStore in event-store.ts. "
58
  "It persists events to a DB with append(event), query(filter), "
 
2
  Tasks for the MOA RL environment.
3
  Uses real moav2 service files. Source and tests are pre-installed in /app/moav2.
4
  Each task blanks out one service file — the agent must re-implement it.
5
+
6
+ user_messages: real messages from MOA dev sessions. The model learns to map
7
+ terse, imprecise human intent directly to correct TypeScript implementations.
8
  """
9
  import os
10
 
 
13
  TASKS = [
14
  {
15
  "id": "task_001",
16
+ "user_messages": [
17
+ "the model resolver isn't working, it keeps throwing not implemented",
18
+ "we need it to support anthropic key, oauth, and vertex auth methods",
19
+ "if the registry lookup fails it should scan all providers",
20
+ "fall back to a custom config with providerBaseUrl if nothing works",
21
+ ],
22
  "description": (
23
  "Implement resolveModel() in model-resolver.ts. "
24
  "It maps (modelId, authMethod) to a Model object using the pi-ai registry. "
 
39
  },
40
  {
41
  "id": "task_002",
42
+ "user_messages": [
43
+ "add retry logic to the api client, it's failing on rate limits",
44
+ "need exponential backoff, start at 500ms",
45
+ "also needs to handle 429 and 5xx errors",
46
+ "the sleep function should respect an abort signal",
47
+ ],
48
  "description": (
49
  "Implement retry.ts with three exports: "
50
  "isRetryableError(e) returns true for HTTP 429/5xx and common retry keywords. "
 
68
  },
69
  {
70
  "id": "task_003",
71
+ "user_messages": [
72
+ "we need an event store that persists to the db",
73
+ "it needs append, query, search, count, and materialize",
74
+ "materialize should reconstruct session state from events",
75
+ "search should do full text search across event data",
76
+ ],
77
  "description": (
78
  "Implement EventStore in event-store.ts. "
79
  "It persists events to a DB with append(event), query(filter), "