Add V1 env

Files changed (6) hide show

.gitignore +0 -2
README.md +89 -43
env/adapt_env.py +104 -0
env/executor.py +36 -0
env/test_cases.py +35 -0
environment.py +0 -80

.gitignore CHANGED Viewed

@@ -148,10 +148,8 @@ activemq-data/
 *.sage.py
 # Environments
-.env
 .envrc
 .venv
-env/
 venv/
 ENV/
 env.bak/

 *.sage.py
 # Environments
 .envrc
 .venv
 venv/
 ENV/
 env.bak/

README.md CHANGED Viewed

@@ -1,81 +1,127 @@
 # meta-rl-dsa-solver
-ADAPT (Adversarial DSA Tutor) is a minimal reinforcement learning environment for coding tasks. The current V0 environment is a pure Python class with no API dependency, so it can be used directly from a training loop with `env.reset()` and `env.step(...)`.
-## Current V0
-- Fixed DSA problem: given an integer `n`, return `n * 2`
-- Single test input: `5`
-- Expected output: `10`
-- Binary reward: `1.0` for correct output, `0.0` otherwise
-- Subprocess execution with a 2 second timeout
-## Run a Smoke Test
-From this directory:
-```powershell
-cd C:\Users\kaust\PycharmProjects\meta-rl-dsa-solver
-python3 -c "from environment import AdaptEnv; env=AdaptEnv(); print(env.reset()); print(env.step('n=int(input()); print(n*2)'))"
 ```
-Expected reward:
 ```text
-1.0
 ```
-## Use in Python
 ```python
-from environment import AdaptEnv
-env = AdaptEnv()
-obs = env.reset()
-print(obs)
-code = "n=int(input()); print(n*2)"
-result = env.step(code)
-print(result)
-assert result["reward"] == 1.0
 ```
-## Check Failure Cases
-Wrong answer:
 ```powershell
-python3 -c "from environment import AdaptEnv; env=AdaptEnv(); env.reset(); print(env.step('print(0)'))"
 ```
-Timeout:
 ```powershell
-python3 -c "from environment import AdaptEnv; env=AdaptEnv(); env.reset(); print(env.step('while True: pass'))"
 ```
-## Environment Contract
-`reset()` returns:
-```python
-{
-    "problem": "Given an integer n, return n * 2",
-    "input": "5",
-}
 ```
-`step(action: str)` returns:
 ```python
-{
-    "observation": "<program output or error>",
-    "reward": 1.0,
-    "done": True,
-    "info": {},
-}
 ```
-The implementation keeps the verifier pluggable so later versions can replace the single expected-output check with hidden tests, randomized inputs, or adaptive curriculum logic.

 # meta-rl-dsa-solver
+ADAPT (Adversarial DSA Tutor) is a minimal reinforcement learning environment for DSA code-generation tasks.
+The current implementation is V1: direct Python usage, no FastAPI, multiple test cases, hidden tests, subprocess execution, and verifier-based rewards.
+## Usage
+```python
+from env.adapt_env import AdaptEnv
+env = AdaptEnv()
+obs = env.reset()
+result = env.step("n=int(input())\nprint(n*2)")
+reward = result["reward"]
 ```
+Flow:
 ```text
+model -> generates code -> env.step(code) -> executor runs code -> verifier evaluates -> env returns result
 ```
+## Files
+- `env/adapt_env.py`: reset/step orchestration only
+- `env/executor.py`: subprocess execution with a 2 second timeout
+- `env/test_cases.py`: problem definition plus visible and hidden test cases
+## Observation
+`reset()` returns:
 ```python
+{
+    "problem": str,
+    "input_format": str,
+    "constraints": str,
+    "examples": list,
+    "visible_tests": list,
+}
+```
+Hidden tests are kept inside the environment and are not shown in the observation.
+## Step Result
+`step(code)` returns:
+```python
+{
+    "reward": float,
+    "done": bool,
+    "feedback": str,
+    "pass_rate": float,
+}
 ```
+## Verifier Requirement
+`env.step(code)` calls:
+```python
+from verifier.verifier import verify
+reward, metadata = verify(code, test_cases)
+```
+The verifier should return:
+```python
+(
+    1.0,
+    {
+        "pass_rate": 1.0,
+        "feedback": "All tests passed. Pass rate: 1.00",
+    },
+)
+```
+If `metadata` does not include `pass_rate` or `feedback`, the environment computes fallback values from executor results.
+## Smoke Checks
+From the repository root (replace the path below with the location of your own checkout):
 ```powershell
+cd C:\Users\kaust\PycharmProjects\meta-rl-dsa-solver
 ```
+Check reset and visible/hidden split:
 ```powershell
+python -B -c "from env.adapt_env import AdaptEnv; env=AdaptEnv(); print(env.reset()); print(len(env.visible_tests), len(env.hidden_tests))"
 ```
+Expected split:
+```text
+3 5
+```
+Check executor directly:
+```powershell
+python -B -c "from env.executor import run_code; print(run_code('n=int(input())\nprint(n*2)', '5\n'))"
 ```
+Expected output:
 ```python
+{'stdout': '10\n', 'stderr': '', 'exit_code': 0}
+```
+Once `verifier/verifier.py` exists, check the full environment:
+```powershell
+python -B -c "from env.adapt_env import AdaptEnv; env=AdaptEnv(); env.reset(); print(env.step('n=int(input())\nprint(n*2)'))"
 ```
+Check a wrong answer:
+```powershell
+python -B -c "from env.adapt_env import AdaptEnv; env=AdaptEnv(); env.reset(); print(env.step('n=int(input())\nprint(n+2)'))"
+```

env/adapt_env.py ADDED Viewed

	@@ -0,0 +1,104 @@

+from __future__ import annotations
+from typing import Any
+from env.executor import run_code
+from env.test_cases import VISIBLE_TEST_COUNT, load_problem, load_test_cases
+class AdaptEnv:
+    def __init__(self) -> None:
+        self.problem: dict[str, Any] = {}
+        self.test_cases: list[dict[str, str]] = []
+        self.visible_tests: list[dict[str, str]] = []
+        self.hidden_tests: list[dict[str, str]] = []
+        self.step_count = 0
+    def reset(self) -> dict:
+        self.problem = self._load_problem()
+        self.test_cases = load_test_cases()
+        self.visible_tests, self.hidden_tests = self._split_test_cases(self.test_cases)
+        self.step_count = 0
+        return self._build_observation()
+    def step(self, code: str) -> dict:
+        if not self.test_cases:
+            self.reset()
+        self.step_count += 1
+        run_results = self._run_all_tests(code)
+        reward, metadata = self._verify_code(code)
+        metadata = metadata or {}
+        pass_rate = float(metadata.get("pass_rate", self._compute_pass_rate(run_results)))
+        feedback = str(metadata.get("feedback") or self._build_feedback(run_results, pass_rate))
+        return {
+            "reward": float(reward),
+            "done": True,
+            "feedback": feedback,
+            "pass_rate": pass_rate,
+        }
+    def _load_problem(self) -> dict:
+        return load_problem()
+    def _split_test_cases(
+        self,
+        test_cases: list[dict[str, str]],
+    ) -> tuple[list[dict[str, str]], list[dict[str, str]]]:
+        visible_tests = test_cases[:VISIBLE_TEST_COUNT]
+        hidden_tests = test_cases[VISIBLE_TEST_COUNT:]
+        return visible_tests, hidden_tests
+    def _build_observation(self) -> dict:
+        return {
+            "problem": self.problem["problem"],
+            "input_format": self.problem["input_format"],
+            "constraints": self.problem["constraints"],
+            "examples": self.problem["examples"],
+            "visible_tests": self.visible_tests,
+        }
+    def _run_all_tests(self, code: str) -> list[dict[str, Any]]:
+        results = []
+        for test_case in self.test_cases:
+            execution = run_code(code, test_case["input"])
+            actual = execution["stdout"].strip()
+            expected = test_case["output"].strip()
+            results.append(
+                {
+                    "input": test_case["input"],
+                    "expected": expected,
+                    "actual": actual,
+                    "stderr": execution["stderr"].strip(),
+                    "exit_code": execution["exit_code"],
+                    "passed": execution["exit_code"] == 0 and actual == expected,
+                }
+            )
+        return results
+    def _verify_code(self, code: str) -> tuple[float, dict[str, Any]]:
+        from verifier.verifier import verify
+        return verify(code, self.test_cases)
+    def _compute_pass_rate(self, run_results: list[dict[str, Any]]) -> float:
+        if not run_results:
+            return 0.0
+        passed = sum(1 for result in run_results if result["passed"])
+        return passed / len(run_results)
+    def _build_feedback(self, run_results: list[dict[str, Any]], pass_rate: float) -> str:
+        for result in run_results:
+            if result["exit_code"] != 0:
+                error = result["stderr"] or "runtime error"
+                return f"Runtime error on input {result['input'].strip()}: {error}"
+            if not result["passed"]:
+                return (
+                    f"Failed on input {result['input'].strip()}: "
+                    f"expected {result['expected']}, got {result['actual']}"
+                )
+        return f"All tests passed. Pass rate: {pass_rate:.2f}"

env/executor.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from __future__ import annotations
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+TIMEOUT_SECONDS = 2
+def run_code(code: str, input_data: str) -> dict:
+    with tempfile.TemporaryDirectory() as tmpdir:
+        file_path = Path(tmpdir) / "submission.py"
+        file_path.write_text(code, encoding="utf-8")
+        try:
+            result = subprocess.run(
+                [sys.executable, str(file_path)],
+                input=input_data,
+                text=True,
+                capture_output=True,
+                timeout=TIMEOUT_SECONDS,
+            )
+        except subprocess.TimeoutExpired as exc:
+            return {
+                "stdout": exc.stdout or "",
+                "stderr": "Execution timed out",
+                "exit_code": -1,
+            }
+        return {
+            "stdout": result.stdout,
+            "stderr": result.stderr,
+            "exit_code": result.returncode,
+        }

env/test_cases.py ADDED Viewed

	@@ -0,0 +1,35 @@

+from __future__ import annotations
+# Problem statement and metadata returned by load_problem().
+PROBLEM = {
+    "problem": "Given an integer n, print n * 2.",
+    "input_format": "A single integer n.",
+    "constraints": "-10^9 <= n <= 10^9",
+    "examples": [
+        {"input": "2\n", "output": "4"},
+        {"input": "5\n", "output": "10"},
+    ],
+}
+# Every test case as a stdin payload ("input") and the expected stdout ("output").
+# The first two entries repeat the examples in PROBLEM.
+TEST_CASES = [
+    {"input": "2\n", "output": "4"},
+    {"input": "5\n", "output": "10"},
+    {"input": "0\n", "output": "0"},
+    {"input": "1\n", "output": "2"},
+    {"input": "-3\n", "output": "-6"},
+    {"input": "10\n", "output": "20"},
+    {"input": "999999\n", "output": "1999998"},
+    {"input": "-1000000000\n", "output": "-2000000000"},
+]
+# Number of leading TEST_CASES entries treated as visible; the rest are hidden.
+VISIBLE_TEST_COUNT = 3
+def load_problem() -> dict:
+    return dict(PROBLEM)
+def load_test_cases() -> list[dict[str, str]]:
+    return [dict(test_case) for test_case in TEST_CASES]

environment.py DELETED Viewed

@@ -1,80 +0,0 @@
-from __future__ import annotations
-import subprocess
-import tempfile
-from pathlib import Path
-from typing import Any, Callable
-class AdaptEnv:
-    def __init__(
-        self,
-        verifier: Callable[[str, str], tuple[float, dict[str, Any]]] | None = None,
-    ):
-        self.verifier = verifier
-        self.problem = ""
-        self.current_input = ""
-        self.expected_output = ""
-        self.step_count = 0
-        self.last_output = ""
-    def reset(self) -> dict[str, str]:
-        self.problem = "Given an integer n, return n * 2"
-        self.current_input = "5"
-        self.expected_output = "10"
-        self.step_count = 0
-        self.last_output = ""
-        return {
-            "problem": self.problem,
-            "input": self.current_input,
-        }
-    def step(self, action: str) -> dict[str, Any]:
-        if not self.problem:
-            self.reset()
-        self.step_count += 1
-        output = self._run_code(action)
-        reward = self._compute_reward(output)
-        self.last_output = output
-        return {
-            "observation": output,
-            "reward": reward,
-            "done": True,
-            "info": {},
-        }
-    def _run_code(self, code: str) -> str:
-        with tempfile.TemporaryDirectory() as tmpdir:
-            file_path = Path(tmpdir) / "submission.py"
-            file_path.write_text(code, encoding="utf-8")
-            try:
-                result = subprocess.run(
-                    ["python3", str(file_path)],
-                    input=self.current_input,
-                    text=True,
-                    capture_output=True,
-                    timeout=2,
-                )
-            except subprocess.TimeoutExpired:
-                return "ERROR: timeout"
-            except Exception as exc:
-                return f"ERROR: {exc}"
-            if result.returncode != 0:
-                stderr = result.stderr.strip()
-                return f"ERROR: {stderr or 'runtime error'}"
-            return result.stdout.strip()
-    def _compute_reward(self, output: str) -> float:
-        if self.verifier is not None:
-            reward, _info = self.verifier(output, self.expected_output)
-            return reward
-        if output == self.expected_output:
-            return 1.0
-        return 0.0