s-shah4 committed on
Commit ·
da8df85
1
Parent(s): 1b7b2a4
Add V1 env
Browse files- .gitignore +0 -2
- README.md +89 -43
- env/adapt_env.py +104 -0
- env/executor.py +36 -0
- env/test_cases.py +35 -0
- environment.py +0 -80
.gitignore
CHANGED
|
@@ -148,10 +148,8 @@ activemq-data/
|
|
| 148 |
*.sage.py
|
| 149 |
|
| 150 |
# Environments
|
| 151 |
-
.env
|
| 152 |
.envrc
|
| 153 |
.venv
|
| 154 |
-
env/
|
| 155 |
venv/
|
| 156 |
ENV/
|
| 157 |
env.bak/
|
|
|
|
| 148 |
*.sage.py
|
| 149 |
|
| 150 |
# Environments
|
|
|
|
| 151 |
.envrc
|
| 152 |
.venv
|
|
|
|
| 153 |
venv/
|
| 154 |
ENV/
|
| 155 |
env.bak/
|
README.md
CHANGED
|
@@ -1,81 +1,127 @@
|
|
| 1 |
# meta-rl-dsa-solver
|
| 2 |
|
| 3 |
-
ADAPT (Adversarial DSA Tutor) is a minimal reinforcement learning environment for
|
| 4 |
|
| 5 |
-
|
| 6 |
|
| 7 |
-
|
| 8 |
-
- Single test input: `5`
|
| 9 |
-
- Expected output: `10`
|
| 10 |
-
- Binary reward: `1.0` for correct output, `0.0` otherwise
|
| 11 |
-
- Subprocess execution with a 2 second timeout
|
| 12 |
|
| 13 |
-
|
|
|
|
| 14 |
|
| 15 |
-
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
| 20 |
```
|
| 21 |
|
| 22 |
-
|
| 23 |
|
| 24 |
```text
|
| 25 |
-
|
| 26 |
```
|
| 27 |
|
| 28 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
```python
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
-
|
| 34 |
|
| 35 |
-
|
| 36 |
-
print(obs)
|
| 37 |
|
| 38 |
-
|
| 39 |
-
result = env.step(code)
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
```
|
| 44 |
|
| 45 |
-
##
|
| 46 |
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
```powershell
|
| 50 |
-
|
| 51 |
```
|
| 52 |
|
| 53 |
-
|
| 54 |
|
| 55 |
```powershell
|
| 56 |
-
|
| 57 |
```
|
| 58 |
|
| 59 |
-
|
| 60 |
|
| 61 |
-
`
|
|
|
|
|
|
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
}
|
| 68 |
```
|
| 69 |
|
| 70 |
-
|
| 71 |
|
| 72 |
```python
|
| 73 |
-
{
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
| 79 |
```
|
| 80 |
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# meta-rl-dsa-solver
|
| 2 |
|
| 3 |
+
ADAPT (Adversarial DSA Tutor) is a minimal reinforcement learning environment for DSA code-generation tasks.
|
| 4 |
|
| 5 |
+
The current implementation is V1: direct Python usage, no FastAPI, multiple test cases, hidden tests, subprocess execution, and verifier-based rewards.
|
| 6 |
|
| 7 |
+
## Usage
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
+
```python
|
| 10 |
+
from env.adapt_env import AdaptEnv
|
| 11 |
|
| 12 |
+
env = AdaptEnv()
|
| 13 |
|
| 14 |
+
obs = env.reset()
|
| 15 |
+
result = env.step("n=int(input())\nprint(n*2)")
|
| 16 |
+
|
| 17 |
+
reward = result["reward"]
|
| 18 |
```
|
| 19 |
|
| 20 |
+
Flow:
|
| 21 |
|
| 22 |
```text
|
| 23 |
+
model -> generates code -> env.step(code) -> executor runs code -> verifier evaluates -> env returns result
|
| 24 |
```
|
| 25 |
|
| 26 |
+
## Files
|
| 27 |
+
|
| 28 |
+
- `env/adapt_env.py`: reset/step orchestration only
|
| 29 |
+
- `env/executor.py`: subprocess execution with a 2 second timeout
|
| 30 |
+
- `env/test_cases.py`: problem definition plus visible and hidden test cases
|
| 31 |
+
|
| 32 |
+
## Observation
|
| 33 |
+
|
| 34 |
+
`reset()` returns:
|
| 35 |
|
| 36 |
```python
|
| 37 |
+
{
|
| 38 |
+
"problem": str,
|
| 39 |
+
"input_format": str,
|
| 40 |
+
"constraints": str,
|
| 41 |
+
"examples": list,
|
| 42 |
+
"visible_tests": list,
|
| 43 |
+
}
|
| 44 |
+
```
|
| 45 |
|
| 46 |
+
Hidden tests are kept inside the environment and are not shown in the observation.
|
| 47 |
|
| 48 |
+
## Step Result
|
|
|
|
| 49 |
|
| 50 |
+
`step(code)` returns:
|
|
|
|
| 51 |
|
| 52 |
+
```python
|
| 53 |
+
{
|
| 54 |
+
"reward": float,
|
| 55 |
+
"done": bool,
|
| 56 |
+
"feedback": str,
|
| 57 |
+
"pass_rate": float,
|
| 58 |
+
}
|
| 59 |
```
|
| 60 |
|
| 61 |
+
## Verifier Requirement
|
| 62 |
|
| 63 |
+
`env.step(code)` calls:
|
| 64 |
+
|
| 65 |
+
```python
|
| 66 |
+
from verifier.verifier import verify
|
| 67 |
+
|
| 68 |
+
reward, metadata = verify(code, test_cases)
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
The verifier should return:
|
| 72 |
+
|
| 73 |
+
```python
|
| 74 |
+
(
|
| 75 |
+
1.0,
|
| 76 |
+
{
|
| 77 |
+
"pass_rate": 1.0,
|
| 78 |
+
"feedback": "All tests passed. Pass rate: 1.00",
|
| 79 |
+
},
|
| 80 |
+
)
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
If `metadata` does not include `pass_rate` or `feedback`, the environment computes fallback values from executor results.
|
| 84 |
+
|
| 85 |
+
## Smoke Checks
|
| 86 |
+
|
| 87 |
+
From this directory:
|
| 88 |
|
| 89 |
```powershell
|
| 90 |
+
cd path\to\meta-rl-dsa-solver
|
| 91 |
```
|
| 92 |
|
| 93 |
+
Check reset and visible/hidden split:
|
| 94 |
|
| 95 |
```powershell
|
| 96 |
+
python -B -c "from env.adapt_env import AdaptEnv; env=AdaptEnv(); print(env.reset()); print(len(env.visible_tests), len(env.hidden_tests))"
|
| 97 |
```
|
| 98 |
|
| 99 |
+
Expected split:
|
| 100 |
|
| 101 |
+
```text
|
| 102 |
+
3 5
|
| 103 |
+
```
|
| 104 |
|
| 105 |
+
Check executor directly:
|
| 106 |
+
|
| 107 |
+
```powershell
|
| 108 |
+
python -B -c "from env.executor import run_code; print(run_code('n=int(input())\nprint(n*2)', '5\n'))"
|
|
|
|
| 109 |
```
|
| 110 |
|
| 111 |
+
Expected output:
|
| 112 |
|
| 113 |
```python
|
| 114 |
+
{'stdout': '10\n', 'stderr': '', 'exit_code': 0}
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
Once `verifier/verifier.py` exists, check the full environment:
|
| 118 |
+
|
| 119 |
+
```powershell
|
| 120 |
+
python -B -c "from env.adapt_env import AdaptEnv; env=AdaptEnv(); env.reset(); print(env.step('n=int(input())\nprint(n*2)'))"
|
| 121 |
```
|
| 122 |
|
| 123 |
+
Check a wrong answer:
|
| 124 |
+
|
| 125 |
+
```powershell
|
| 126 |
+
python -B -c "from env.adapt_env import AdaptEnv; env=AdaptEnv(); env.reset(); print(env.step('n=int(input())\nprint(n+2)'))"
|
| 127 |
+
```
|
env/adapt_env.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
from env.executor import run_code
|
| 6 |
+
from env.test_cases import VISIBLE_TEST_COUNT, load_problem, load_test_cases
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class AdaptEnv:
    """Single-step environment that scores submitted code against test cases.

    ``reset()`` loads the problem statement and its test cases and returns an
    observation that exposes only the visible tests.  ``step(code)`` asks the
    verifier for a reward and returns a result dict with ``reward``, ``done``,
    ``feedback`` and ``pass_rate``.  Every episode ends after one step.
    """

    def __init__(self) -> None:
        self.problem: dict[str, Any] = {}
        self.test_cases: list[dict[str, str]] = []
        self.visible_tests: list[dict[str, str]] = []
        self.hidden_tests: list[dict[str, str]] = []
        self.step_count = 0

    def reset(self) -> dict:
        """Reload the problem and test cases and return a fresh observation."""
        self.problem = self._load_problem()
        self.test_cases = load_test_cases()
        self.visible_tests, self.hidden_tests = self._split_test_cases(self.test_cases)
        self.step_count = 0
        return self._build_observation()

    def step(self, code: str) -> dict:
        """Score ``code`` and return ``reward``, ``done``, ``feedback``, ``pass_rate``.

        The reward always comes from the verifier.  ``pass_rate`` and
        ``feedback`` are taken from the verifier metadata when present; the
        test cases are only executed locally when one of them is missing, so
        the submission is not run twice for nothing.
        """
        if not self.test_cases:
            self.reset()

        self.step_count += 1
        reward, metadata = self._verify_code(code)
        metadata = metadata or {}

        pass_rate = metadata.get("pass_rate")
        feedback = metadata.get("feedback")

        # A missing key and an explicit None/empty value both mean "not
        # provided"; float(None) would otherwise raise TypeError.
        if pass_rate is None or not feedback:
            run_results = self._run_all_tests(code)
            if pass_rate is None:
                pass_rate = self._compute_pass_rate(run_results)
            pass_rate = float(pass_rate)
            if not feedback:
                feedback = self._build_feedback(run_results, pass_rate)

        return {
            "reward": float(reward),
            "done": True,
            "feedback": str(feedback),
            "pass_rate": float(pass_rate),
        }

    def _load_problem(self) -> dict:
        """Return the problem definition via ``load_problem()``."""
        return load_problem()

    def _split_test_cases(
        self,
        test_cases: list[dict[str, str]],
    ) -> tuple[list[dict[str, str]], list[dict[str, str]]]:
        """Split into (visible, hidden): the first ``VISIBLE_TEST_COUNT`` are visible."""
        visible_tests = test_cases[:VISIBLE_TEST_COUNT]
        hidden_tests = test_cases[VISIBLE_TEST_COUNT:]
        return visible_tests, hidden_tests

    def _build_observation(self) -> dict:
        """Build the observation dict returned by ``reset()``.

        The examples and visible tests are deep-copied so that a caller
        mutating the observation cannot alter the test cases the environment
        scores against.
        """
        from copy import deepcopy

        return {
            "problem": self.problem["problem"],
            "input_format": self.problem["input_format"],
            "constraints": self.problem["constraints"],
            "examples": deepcopy(self.problem["examples"]),
            "visible_tests": deepcopy(self.visible_tests),
        }

    def _run_all_tests(self, code: str) -> list[dict[str, Any]]:
        """Run ``code`` once per test case and return one result dict per case.

        A case passes when the process exits with code 0 and its stripped
        stdout equals the stripped expected output.
        """
        results = []
        for test_case in self.test_cases:
            execution = run_code(code, test_case["input"])
            actual = execution["stdout"].strip()
            expected = test_case["output"].strip()
            results.append(
                {
                    "input": test_case["input"],
                    "expected": expected,
                    "actual": actual,
                    "stderr": execution["stderr"].strip(),
                    "exit_code": execution["exit_code"],
                    "passed": execution["exit_code"] == 0 and actual == expected,
                }
            )
        return results

    def _verify_code(self, code: str) -> tuple[float, dict[str, Any]]:
        """Delegate scoring to ``verifier.verifier.verify`` over all test cases.

        The import is done at call time, so this module can be imported and
        ``reset()`` used even when the verifier package is not importable.
        """
        from verifier.verifier import verify

        return verify(code, self.test_cases)

    def _compute_pass_rate(self, run_results: list[dict[str, Any]]) -> float:
        """Return the fraction of passed results, or 0.0 for an empty list."""
        if not run_results:
            return 0.0
        passed = sum(1 for result in run_results if result["passed"])
        return passed / len(run_results)

    def _build_feedback(self, run_results: list[dict[str, Any]], pass_rate: float) -> str:
        """Describe the first failing result, or report that all tests passed."""
        if not run_results:
            # Nothing was executed, so claiming success would be misleading.
            return "No test cases were run."

        for result in run_results:
            if result["exit_code"] != 0:
                error = result["stderr"] or "runtime error"
                return f"Runtime error on input {result['input'].strip()}: {error}"

            if not result["passed"]:
                return (
                    f"Failed on input {result['input'].strip()}: "
                    f"expected {result['expected']}, got {result['actual']}"
                )

        return f"All tests passed. Pass rate: {pass_rate:.2f}"
|
env/executor.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import subprocess
|
| 4 |
+
import sys
|
| 5 |
+
import tempfile
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Time limit, in seconds, passed as ``timeout`` to ``subprocess.run`` for each submission run.
TIMEOUT_SECONDS = 2
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def run_code(code: str, input_data: str, timeout: float | None = None) -> dict:
    """Run ``code`` as a Python script in a subprocess and capture its output.

    The source is written to ``submission.py`` inside a temporary directory
    and executed with the current interpreter (``sys.executable``), with
    ``input_data`` fed to its stdin.

    Args:
        code: Python source to execute.
        input_data: Text supplied on the process's standard input.
        timeout: Time limit in seconds; ``None`` (the default) uses
            ``TIMEOUT_SECONDS``.

    Returns:
        A dict with ``stdout`` (str), ``stderr`` (str) and ``exit_code``
        (int).  On timeout, ``stderr`` is ``"Execution timed out"``,
        ``exit_code`` is ``-1`` and ``stdout`` holds whatever output was
        captured before the limit was hit.
    """
    limit = TIMEOUT_SECONDS if timeout is None else timeout

    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = Path(tmpdir) / "submission.py"
        file_path.write_text(code, encoding="utf-8")

        try:
            result = subprocess.run(
                [sys.executable, str(file_path)],
                input=input_data,
                text=True,
                capture_output=True,
                timeout=limit,
            )
        except subprocess.TimeoutExpired as exc:
            partial = exc.stdout or ""
            # On POSIX the partial output attached to TimeoutExpired can be
            # bytes even though text=True was requested; normalise it so
            # callers always receive a str.
            if isinstance(partial, bytes):
                import locale

                partial = partial.decode(
                    locale.getpreferredencoding(False), errors="replace"
                )
            return {
                "stdout": partial,
                "stderr": "Execution timed out",
                "exit_code": -1,
            }

        return {
            "stdout": result.stdout,
            "stderr": result.stderr,
            "exit_code": result.returncode,
        }
|
env/test_cases.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
# Problem statement shown to the agent: description, input format,
# constraints, and worked input/output examples.
PROBLEM = {
    "problem": "Given an integer n, print n * 2.",
    "input_format": "A single integer n.",
    "constraints": "-10^9 <= n <= 10^9",
    "examples": [
        {"input": "2\n", "output": "4"},
        {"input": "5\n", "output": "10"},
    ],
}


# Full test suite: each case maps stdin text ("input") to the expected
# stdout text ("output").
TEST_CASES = [
    {"input": "2\n", "output": "4"},
    {"input": "5\n", "output": "10"},
    {"input": "0\n", "output": "0"},
    {"input": "1\n", "output": "2"},
    {"input": "-3\n", "output": "-6"},
    {"input": "10\n", "output": "20"},
    {"input": "999999\n", "output": "1999998"},
    {"input": "-1000000000\n", "output": "-2000000000"},
]


# Number of leading TEST_CASES entries treated as visible; the remainder are hidden.
VISIBLE_TEST_COUNT = 3
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def load_problem() -> dict:
    """Return an independent copy of ``PROBLEM``.

    A deep copy is used because ``PROBLEM["examples"]`` is a nested list of
    dicts; a shallow ``dict(PROBLEM)`` would share it with the module-level
    constant, letting callers mutate the canonical problem definition.
    """
    from copy import deepcopy

    return deepcopy(PROBLEM)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def load_test_cases() -> list[dict[str, str]]:
    """Return a new list holding a shallow copy of each entry in ``TEST_CASES``."""
    copies = []
    for case in TEST_CASES:
        copies.append(dict(case))
    return copies
|
environment.py
DELETED
|
@@ -1,80 +0,0 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
import subprocess
|
| 4 |
-
import tempfile
|
| 5 |
-
from pathlib import Path
|
| 6 |
-
from typing import Any, Callable
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
class AdaptEnv:
|
| 10 |
-
def __init__(
|
| 11 |
-
self,
|
| 12 |
-
verifier: Callable[[str, str], tuple[float, dict[str, Any]]] | None = None,
|
| 13 |
-
):
|
| 14 |
-
self.verifier = verifier
|
| 15 |
-
self.problem = ""
|
| 16 |
-
self.current_input = ""
|
| 17 |
-
self.expected_output = ""
|
| 18 |
-
self.step_count = 0
|
| 19 |
-
self.last_output = ""
|
| 20 |
-
|
| 21 |
-
def reset(self) -> dict[str, str]:
|
| 22 |
-
self.problem = "Given an integer n, return n * 2"
|
| 23 |
-
self.current_input = "5"
|
| 24 |
-
self.expected_output = "10"
|
| 25 |
-
self.step_count = 0
|
| 26 |
-
self.last_output = ""
|
| 27 |
-
|
| 28 |
-
return {
|
| 29 |
-
"problem": self.problem,
|
| 30 |
-
"input": self.current_input,
|
| 31 |
-
}
|
| 32 |
-
|
| 33 |
-
def step(self, action: str) -> dict[str, Any]:
|
| 34 |
-
if not self.problem:
|
| 35 |
-
self.reset()
|
| 36 |
-
|
| 37 |
-
self.step_count += 1
|
| 38 |
-
output = self._run_code(action)
|
| 39 |
-
reward = self._compute_reward(output)
|
| 40 |
-
self.last_output = output
|
| 41 |
-
|
| 42 |
-
return {
|
| 43 |
-
"observation": output,
|
| 44 |
-
"reward": reward,
|
| 45 |
-
"done": True,
|
| 46 |
-
"info": {},
|
| 47 |
-
}
|
| 48 |
-
|
| 49 |
-
def _run_code(self, code: str) -> str:
|
| 50 |
-
with tempfile.TemporaryDirectory() as tmpdir:
|
| 51 |
-
file_path = Path(tmpdir) / "submission.py"
|
| 52 |
-
file_path.write_text(code, encoding="utf-8")
|
| 53 |
-
|
| 54 |
-
try:
|
| 55 |
-
result = subprocess.run(
|
| 56 |
-
["python3", str(file_path)],
|
| 57 |
-
input=self.current_input,
|
| 58 |
-
text=True,
|
| 59 |
-
capture_output=True,
|
| 60 |
-
timeout=2,
|
| 61 |
-
)
|
| 62 |
-
except subprocess.TimeoutExpired:
|
| 63 |
-
return "ERROR: timeout"
|
| 64 |
-
except Exception as exc:
|
| 65 |
-
return f"ERROR: {exc}"
|
| 66 |
-
|
| 67 |
-
if result.returncode != 0:
|
| 68 |
-
stderr = result.stderr.strip()
|
| 69 |
-
return f"ERROR: {stderr or 'runtime error'}"
|
| 70 |
-
|
| 71 |
-
return result.stdout.strip()
|
| 72 |
-
|
| 73 |
-
def _compute_reward(self, output: str) -> float:
|
| 74 |
-
if self.verifier is not None:
|
| 75 |
-
reward, _info = self.verifier(output, self.expected_output)
|
| 76 |
-
return reward
|
| 77 |
-
|
| 78 |
-
if output == self.expected_output:
|
| 79 |
-
return 1.0
|
| 80 |
-
return 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|