Spaces:
Sleeping
Sleeping
databoysu commited on
Commit ·
5813a84
1
Parent(s): fdc5ba1
my_env
Browse files- README.md +55 -0
- my_env/__init__.py → __init__.py +5 -4
- my_env/client.py → client.py +19 -12
- context.py +94 -0
- environment.py +613 -0
- inference.py +376 -0
- models.py +75 -0
- my_env/README.md +0 -255
- my_env/models.py +0 -27
- my_env/openenv.yaml → openenv.yaml +1 -2
- pre-val.sh +185 -0
- my_env/pyproject.toml → pyproject.toml +5 -3
- sandbox.py +309 -0
- {my_env/server → server}/Dockerfile +0 -0
- {my_env/server → server}/__init__.py +0 -0
- {my_env/server → server}/app.py +17 -52
- {my_env/server → server}/my_env_environment.py +30 -69
- {my_env/server → server}/requirements.txt +2 -1
- tasks.py +683 -0
- my_env/uv.lock → uv.lock +0 -0
README.md
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Python Debugging Gym
|
| 3 |
+
emoji: 🐛
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: cyan
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
app_port: 8000
|
| 9 |
+
base_path: /web
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
+
- reinforcement-learning
|
| 13 |
+
- code-generation
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
# Python Debugging Gym
|
| 17 |
+
|
| 18 |
+
An OpenEnv-compatible RL environment where agents debug broken Python code by
|
| 19 |
+
iteratively viewing, editing, and testing code snippets until all tests pass.
|
| 20 |
+
|
| 21 |
+
## Environment Overview
|
| 22 |
+
|
| 23 |
+
- Action space:
|
| 24 |
+
`VIEW_CODE`, `RUN_TESTS`, `REPLACE_LINES`, `UNDO_EDIT`, `RESET_TO_ORIGINAL`, `SUBMIT`
|
| 25 |
+
- Observation includes:
|
| 26 |
+
`code_lines`, `localized_context`, `last_execution_output`, `syntax_error`, `test_results`
|
| 27 |
+
- Dense reward with step cost and final score on submit.
|
| 28 |
+
|
| 29 |
+
## Local Run
|
| 30 |
+
|
| 31 |
+
```bash
|
| 32 |
+
uv sync
|
| 33 |
+
uv run --project . server --port 8000
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
Server endpoints:
|
| 37 |
+
- `POST /reset`
|
| 38 |
+
- `POST /step`
|
| 39 |
+
- `GET /health`
|
| 40 |
+
- `WS /ws`
|
| 41 |
+
- `GET /web` (OpenEnv web UI)
|
| 42 |
+
|
| 43 |
+
## Deploy to Hugging Face Spaces
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
openenv push
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
## Validate Submission
|
| 50 |
+
|
| 51 |
+
From repo root (`RL_ENV_FINAL`):
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
./pre-val.sh https://<your-space>.hf.space ./my_env
|
| 55 |
+
```
|
my_env/__init__.py → __init__.py
RENAMED
|
@@ -4,13 +4,14 @@
|
|
| 4 |
# This source code is licensed under the BSD-style license found in the
|
| 5 |
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
-
"""
|
| 8 |
|
| 9 |
from .client import MyEnv
|
| 10 |
-
from .models import
|
| 11 |
|
| 12 |
__all__ = [
|
| 13 |
-
"
|
| 14 |
-
"
|
|
|
|
| 15 |
"MyEnv",
|
| 16 |
]
|
|
|
|
| 4 |
# This source code is licensed under the BSD-style license found in the
|
| 5 |
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
+
"""Python Debugging Gym OpenEnv package."""
|
| 8 |
|
| 9 |
from .client import MyEnv
|
| 10 |
+
from .models import CodeAction, CodeObservation, TestResult
|
| 11 |
|
| 12 |
__all__ = [
|
| 13 |
+
"CodeAction",
|
| 14 |
+
"CodeObservation",
|
| 15 |
+
"TestResult",
|
| 16 |
"MyEnv",
|
| 17 |
]
|
my_env/client.py → client.py
RENAMED
|
@@ -4,7 +4,7 @@
|
|
| 4 |
# This source code is licensed under the BSD-style license found in the
|
| 5 |
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
-
"""
|
| 8 |
|
| 9 |
from typing import Dict
|
| 10 |
|
|
@@ -12,11 +12,11 @@ from openenv.core import EnvClient
|
|
| 12 |
from openenv.core.client_types import StepResult
|
| 13 |
from openenv.core.env_server.types import State
|
| 14 |
|
| 15 |
-
from .models import
|
| 16 |
|
| 17 |
|
| 18 |
class MyEnv(
|
| 19 |
-
EnvClient[
|
| 20 |
):
|
| 21 |
"""
|
| 22 |
Client for the My Env Environment.
|
|
@@ -44,7 +44,7 @@ class MyEnv(
|
|
| 44 |
... client.close()
|
| 45 |
"""
|
| 46 |
|
| 47 |
-
def _step_payload(self, action:
|
| 48 |
"""
|
| 49 |
Convert MyAction to JSON payload for step message.
|
| 50 |
|
|
@@ -54,13 +54,11 @@ class MyEnv(
|
|
| 54 |
Returns:
|
| 55 |
Dictionary representation suitable for JSON encoding
|
| 56 |
"""
|
| 57 |
-
return
|
| 58 |
-
"message": action.message,
|
| 59 |
-
}
|
| 60 |
|
| 61 |
-
def _parse_result(self, payload: Dict) -> StepResult[
|
| 62 |
"""
|
| 63 |
-
Parse server response into StepResult[
|
| 64 |
|
| 65 |
Args:
|
| 66 |
payload: JSON response data from server
|
|
@@ -69,9 +67,18 @@ class MyEnv(
|
|
| 69 |
StepResult with MyObservation
|
| 70 |
"""
|
| 71 |
obs_data = payload.get("observation", {})
|
| 72 |
-
observation =
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
done=payload.get("done", False),
|
| 76 |
reward=payload.get("reward"),
|
| 77 |
metadata=obs_data.get("metadata", {}),
|
|
|
|
| 4 |
# This source code is licensed under the BSD-style license found in the
|
| 5 |
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
+
"""Client for the Python Debugging Gym OpenEnv environment."""
|
| 8 |
|
| 9 |
from typing import Dict
|
| 10 |
|
|
|
|
| 12 |
from openenv.core.client_types import StepResult
|
| 13 |
from openenv.core.env_server.types import State
|
| 14 |
|
| 15 |
+
from .models import CodeAction, CodeObservation, TestResult
|
| 16 |
|
| 17 |
|
| 18 |
class MyEnv(
|
| 19 |
+
EnvClient[CodeAction, CodeObservation, State]
|
| 20 |
):
|
| 21 |
"""
|
| 22 |
Client for the Python Debugging Gym environment.
|
|
|
|
| 44 |
... client.close()
|
| 45 |
"""
|
| 46 |
|
| 47 |
+
def _step_payload(self, action: CodeAction) -> Dict:
|
| 48 |
"""
|
| 49 |
Convert a CodeAction to the JSON payload for a step message.
|
| 50 |
|
|
|
|
| 54 |
Returns:
|
| 55 |
Dictionary representation suitable for JSON encoding
|
| 56 |
"""
|
| 57 |
+
return action.model_dump(exclude_none=True)
|
|
|
|
|
|
|
| 58 |
|
| 59 |
+
def _parse_result(self, payload: Dict) -> StepResult[CodeObservation]:
|
| 60 |
"""
|
| 61 |
+
Parse server response into StepResult[CodeObservation].
|
| 62 |
|
| 63 |
Args:
|
| 64 |
payload: JSON response data from server
|
|
|
|
| 67 |
StepResult with CodeObservation
|
| 68 |
"""
|
| 69 |
obs_data = payload.get("observation", {})
|
| 70 |
+
observation = CodeObservation(
|
| 71 |
+
code_lines=obs_data.get("code_lines", []),
|
| 72 |
+
localized_context=obs_data.get("localized_context", ""),
|
| 73 |
+
last_execution_output=obs_data.get("last_execution_output", ""),
|
| 74 |
+
syntax_error=obs_data.get("syntax_error", False),
|
| 75 |
+
test_results=[
|
| 76 |
+
TestResult(**item) for item in obs_data.get("test_results", [])
|
| 77 |
+
],
|
| 78 |
+
step_count=obs_data.get("step_count", 0),
|
| 79 |
+
steps_remaining=obs_data.get("steps_remaining", 0),
|
| 80 |
+
reward_last_step=obs_data.get("reward_last_step", 0.0),
|
| 81 |
+
info=obs_data.get("info", {}),
|
| 82 |
done=payload.get("done", False),
|
| 83 |
reward=payload.get("reward"),
|
| 84 |
metadata=obs_data.get("metadata", {}),
|
context.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
context.py — Layered Context Compaction
|
| 3 |
+
=========================================
|
| 4 |
+
|
| 5 |
+
PRINCIPLE 10 — Layered Context Compaction
|
| 6 |
+
For large files, returning the full source on every observation would rapidly
|
| 7 |
+
fill the agent's context window, leaving no room for reasoning.
|
| 8 |
+
|
| 9 |
+
Instead we return a *localized* view: a ±WINDOW_LINES slice of the code
|
| 10 |
+
centred on the last line that was edited. This gives the agent exactly the
|
| 11 |
+
context it needs — the neighbourhood of its most recent change — without
|
| 12 |
+
flooding the context with unrelated code.
|
| 13 |
+
|
| 14 |
+
This module is intentionally pure (no environment state dependencies) so
|
| 15 |
+
it can be unit-tested independently and reused across environment versions.
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from __future__ import annotations
|
| 19 |
+
|
| 20 |
+
from typing import List, Optional
|
| 21 |
+
|
| 22 |
+
# How many lines above and below the anchor to include
WINDOW_LINES: int = 10

# Maximum characters for the localized context block
# (Principle 9: all outputs must be bounded)
MAX_CONTEXT_CHARS: int = 2_000


def get_localized_context(
    code_lines: List[str],
    anchor_line: Optional[int],
    window: int = WINDOW_LINES,
) -> str:
    """
    Return a ±`window`-line slice of `code_lines` centred on `anchor_line`.

    Parameters
    ----------
    code_lines  : Full list of source lines (0-indexed internally).
    anchor_line : The 1-indexed line number of the most recent edit.
                  If None (no edits yet) returns an empty string.
                  Out-of-range values are clamped into the file, and the
                  header reports the clamped line so it always agrees with
                  the ▶ marker in the body.
    window      : Number of lines to show above and below the anchor.
                  Negative values are treated as 0 (anchor line only).

    Returns
    -------
    A formatted string with line numbers, annotated with the visible range
    and an anchor marker (▶).  The result never exceeds MAX_CONTEXT_CHARS
    characters; when it has to be cut, the truncation notice is counted
    against that budget.

    Example output (window=2, anchor_line=7, 12-line file)
    ------------------------------------------------------
    [Showing lines 5–9 of 12, anchor ▶ line 7]
       5 |         mid = (left + right) // 2
       6 |         if arr[mid] == target:
       7 ▶             return mid
       8 |         elif arr[mid] < target:
       9 |             left = mid + 1
    """
    if anchor_line is None or not code_lines:
        return ""

    total = len(code_lines)

    # A negative window would invert the slice bounds and produce a header
    # with an empty body, so floor it at zero.
    window = max(0, window)

    # Clamp anchor into valid range
    anchor_0 = max(0, min(anchor_line - 1, total - 1))
    anchor_1 = anchor_0 + 1

    # Compute slice bounds (inclusive on both ends, 0-indexed)
    start_0 = max(0, anchor_0 - window)
    end_0 = min(total - 1, anchor_0 + window)

    # Build header from the clamped anchor so it matches the marker below
    header = f"[Showing lines {start_0 + 1}–{end_0 + 1} of {total}, anchor ▶ line {anchor_1}]"

    # Build body
    body_lines = []
    for line_num, text in enumerate(code_lines[start_0:end_0 + 1], start=start_0 + 1):
        marker = "▶" if line_num == anchor_1 else "|"
        body_lines.append(f"{line_num:>4} {marker} {text}")

    result = header + "\n" + "\n".join(body_lines)

    # PRINCIPLE 9 — hard cap on output size.  Reserve room for the notice so
    # the returned string itself stays within MAX_CONTEXT_CHARS.
    if len(result) > MAX_CONTEXT_CHARS:
        suffix = "\n... [context truncated]"
        keep = max(0, MAX_CONTEXT_CHARS - len(suffix))
        result = result[:keep] + suffix

    return result
|
environment.py
ADDED
|
@@ -0,0 +1,613 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
environment.py — Python Debugging Gym (Core RL Environment)
|
| 3 |
+
=============================================================
|
| 4 |
+
|
| 5 |
+
PRINCIPLE 1 — You Don't Design the Control Flow
|
| 6 |
+
The agent decides the sequence of actions. step() is a pure router:
|
| 7 |
+
it receives whatever action the agent chose (in whatever order),
|
| 8 |
+
processes it, and returns the new state. There is no forced sequence,
|
| 9 |
+
no "you must VIEW_CODE before RUN_TESTS" gate. The system prompt
|
| 10 |
+
explains what tools exist; the agent decides how to use them.
|
| 11 |
+
|
| 12 |
+
PRINCIPLE 5 — Cost-Per-Turn Reward Logic
|
| 13 |
+
Each call to step() costs R_STEP_COST = -0.01. This makes the episode
|
| 14 |
+
a multi-turn budget problem: the agent is rewarded for solving quickly.
|
| 15 |
+
An agent that solves in 4 steps scores ~0.14 more than one that takes
|
| 16 |
+
18 steps to reach the same solution.
|
| 17 |
+
|
| 18 |
+
PRINCIPLE 7 — The Prompt is Code
|
| 19 |
+
The string returned by reset() is the agent's complete operational
|
| 20 |
+
contract for the session. It states: the goal, the available actions
|
| 21 |
+
(with exact JSON examples), the reward structure, the current code,
|
| 22 |
+
and the expected termination condition. Ambiguity in this string
|
| 23 |
+
directly causes off-task behaviour.
|
| 24 |
+
|
| 25 |
+
PRINCIPLE 10 — Layered Context Compaction
|
| 26 |
+
_build_observation() tracks `_last_edited_line` and passes it to
|
| 27 |
+
context.get_localized_context() to produce a focused ±10-line view
|
| 28 |
+
after each write action. This prevents the observation from inflating
|
| 29 |
+
the agent's context window on large files.
|
| 30 |
+
|
| 31 |
+
Reward table (dense, non-sparse — every step emits a signal):
|
| 32 |
+
+1.00 SUBMIT and ALL tests pass → episode solved
|
| 33 |
+
+0.10 RUN_TESTS called → information-gathering rewarded
|
| 34 |
+
+0.05 Per test transitioning fail→pass on a RUN_TESTS or SUBMIT
|
| 35 |
+
-0.01 Every step taken, plus -0.05 × k on the k-th consecutive repeat of the same action_type → efficiency pressure (Principle 5)
|
| 36 |
+
-0.10 Syntax error detected → broken code penalised immediately
|
| 37 |
+
-0.10 UNDO_EDIT or RESET_TO_ORIGINAL → backtracking discouraged
|
| 38 |
+
-0.02 Invalid line range supplied → hallucination deterrent
|
| 39 |
+
-0.20 SUBMIT with tests still failing
|
| 40 |
+
|
| 41 |
+
Max episode length: 50 steps.
|
| 42 |
+
"""
|
| 43 |
+
|
| 44 |
+
from __future__ import annotations
|
| 45 |
+
|
| 46 |
+
import random
|
| 47 |
+
import uuid
|
| 48 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 49 |
+
|
| 50 |
+
try:
|
| 51 |
+
from .context import get_localized_context
|
| 52 |
+
from .models import CodeAction, CodeObservation, TestResult
|
| 53 |
+
from .sandbox import check_syntax, run_code_with_tests
|
| 54 |
+
from .tasks import ALL_TASKS, TASKS_BY_DIFFICULTY
|
| 55 |
+
except ImportError:
|
| 56 |
+
from context import get_localized_context
|
| 57 |
+
from models import CodeAction, CodeObservation, TestResult
|
| 58 |
+
from sandbox import check_syntax, run_code_with_tests
|
| 59 |
+
from tasks import ALL_TASKS, TASKS_BY_DIFFICULTY
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# ---------------------------------------------------------------------------
|
| 63 |
+
# Reward constants
|
| 64 |
+
# ---------------------------------------------------------------------------
|
| 65 |
+
|
| 66 |
+
R_SUBMIT_ALL_PASS = +1.00
|
| 67 |
+
R_SUBMIT_FAIL = -0.20
|
| 68 |
+
R_SYNTAX_ERROR = -0.10
|
| 69 |
+
R_RUN_TESTS = +0.10
|
| 70 |
+
R_PER_NEW_PASS = +0.05
|
| 71 |
+
R_STEP_COST = -0.01 # PRINCIPLE 5 — every step has a cost
|
| 72 |
+
R_INVALID_LINE = -0.02
|
| 73 |
+
R_DESTRUCTIVE_PENALTY = -0.20
|
| 74 |
+
R_UNDO_RESET = -0.10 # Mini-Git backtracking penalty
|
| 75 |
+
|
| 76 |
+
MAX_STEPS: int = 50
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# ---------------------------------------------------------------------------
|
| 80 |
+
# System Prompt (PRINCIPLE 7 — The Prompt is Code)
|
| 81 |
+
# ---------------------------------------------------------------------------
|
| 82 |
+
# This string is the agent's entire operational contract.
|
| 83 |
+
# It must be:
|
| 84 |
+
# • Self-contained (no assumed context from training data)
|
| 85 |
+
# • Precise (exact JSON examples, not vague descriptions)
|
| 86 |
+
# • Non-directive about sequence (Principle 1: agent chooses order)
|
| 87 |
+
# • Complete (goal, tools, rewards, termination — nothing omitted)
|
| 88 |
+
|
| 89 |
+
_SYSTEM_PROMPT = """\
|
| 90 |
+
╔══════════════════════════════════════════════════════╗
|
| 91 |
+
║ PYTHON DEBUGGING GYM — EPISODE BRIEF ║
|
| 92 |
+
╚══════════════════════════════════════════════════════╝
|
| 93 |
+
|
| 94 |
+
GOAL
|
| 95 |
+
----
|
| 96 |
+
The Python source file shown below contains one or more bugs.
|
| 97 |
+
Your task is to find and fix every bug so that ALL unit tests pass, then
|
| 98 |
+
call SUBMIT to end the episode.
|
| 99 |
+
|
| 100 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 101 |
+
STANDARD OPERATING PROCEDURE (follow this state machine)
|
| 102 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 103 |
+
|
| 104 |
+
STEP 1 — ORIENT : Call VIEW_CODE to read the full file with line numbers.
|
| 105 |
+
STEP 2 — DIAGNOSE : Call RUN_TESTS to get the exact error message and traceback.
|
| 106 |
+
STEP 3 — FIX : Call REPLACE_LINES to correct the identified bug.
|
| 107 |
+
(Use UNDO_EDIT if the edit made things worse.)
|
| 108 |
+
STEP 4 — VERIFY : Call RUN_TESTS again to confirm the fix worked.
|
| 109 |
+
STEP 5 — REPEAT : If tests still fail, return to STEP 1 and re-read the code.
|
| 110 |
+
STEP 6 — SUBMIT : Once ALL tests pass, call SUBMIT.
|
| 111 |
+
|
| 112 |
+
⚠ Do NOT call VIEW_CODE more than once in a row. Each VIEW_CODE costs -0.01.
|
| 113 |
+
If you have already viewed the code, call RUN_TESTS next, not VIEW_CODE again.
|
| 114 |
+
|
| 115 |
+
⚠ THE ESCAPE HATCH RULE: If an edit results in a syntax error or an indentation error,
|
| 116 |
+
DO NOT try to manually fix spaces. IMMEDIATELY use UNDO_EDIT or RESET_TO_ORIGINAL.
|
| 117 |
+
|
| 118 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 119 |
+
AVAILABLE TOOLS (send one JSON object per turn)
|
| 120 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 121 |
+
|
| 122 |
+
1. VIEW_CODE — see the full file with line numbers
|
| 123 |
+
{{"thought": "<your reasoning>", "action_type": "VIEW_CODE"}}
|
| 124 |
+
|
| 125 |
+
2. RUN_TESTS — execute all unit tests; see pass/fail + output
|
| 126 |
+
{{"thought": "<your reasoning>", "action_type": "RUN_TESTS"}}
|
| 127 |
+
|
| 128 |
+
3. REPLACE_LINES — replace a contiguous block of lines (start to end, inclusive)
|
| 129 |
+
{{"thought": "<your reasoning>", "action_type": "REPLACE_LINES", "start_line": 3, "end_line": 5, "new_code_block": " x = 1\\n return x"}}
|
| 130 |
+
⚠ start_line and end_line are 1-indexed and INCLUSIVE.
|
| 131 |
+
⚠ new_code_block is a single string; separate lines with \\n (no trailing \\n).
|
| 132 |
+
⚠ Indentation is syntax in Python — include the correct leading spaces on every line.
|
| 133 |
+
⚠ The file grows or shrinks when the new block has more/fewer lines than the range.
|
| 134 |
+
⚠ After REPLACE_LINES, call RUN_TESTS (not VIEW_CODE) to verify the fix.
|
| 135 |
+
|
| 136 |
+
4. UNDO_EDIT — revert to the state before the most recent REPLACE_LINES (-0.10 penalty)
|
| 137 |
+
{{"thought": "<your reasoning>", "action_type": "UNDO_EDIT"}}
|
| 138 |
+
Use when an edit made things worse and you want to try a different approach.
|
| 139 |
+
No-op (with penalty) if there is no edit history.
|
| 140 |
+
|
| 141 |
+
5. RESET_TO_ORIGINAL — restore the pristine broken code from episode start (-0.10 penalty)
|
| 142 |
+
{{"thought": "<your reasoning>", "action_type": "RESET_TO_ORIGINAL"}}
|
| 143 |
+
Last resort only. Clears all undo history. Resets context anchor.
|
| 144 |
+
|
| 145 |
+
6. SUBMIT — declare the fix complete; ends the episode
|
| 146 |
+
{{"thought": "<your reasoning>", "action_type": "SUBMIT"}}
|
| 147 |
+
Only call SUBMIT when RUN_TESTS has confirmed ALL tests pass.
|
| 148 |
+
The episode ends immediately on SUBMIT, pass or fail.
|
| 149 |
+
|
| 150 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 151 |
+
REWARD SIGNALS (visible in observation.reward_last_step)
|
| 152 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 153 |
+
|
| 154 |
+
+1.00 SUBMIT and all tests pass ← primary objective
|
| 155 |
+
+0.10 RUN_TESTS called (any outcome) ← gathering info is good
|
| 156 |
+
+0.05 Per test newly passing vs last run ← incremental progress
|
| 157 |
+
-0.01 Every step taken ← solve efficiently
|
| 158 |
+
-0.10 Syntax error in current code ← fix broken syntax first
|
| 159 |
+
-0.10 UNDO_EDIT or RESET_TO_ORIGINAL ← backtracking is expensive
|
| 160 |
+
-0.02 Invalid line range sent ← use VIEW_CODE to check range
|
| 161 |
+
-0.20 SUBMIT with tests still failing ← verify before submitting
|
| 162 |
+
|
| 163 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 164 |
+
EPISODE PARAMETERS
|
| 165 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 166 |
+
|
| 167 |
+
Task : {task_name} ({difficulty})
|
| 168 |
+
Unit tests : {test_count} tests — ALL must pass
|
| 169 |
+
Max steps : {max_steps} (episode terminates at 0 steps remaining)
|
| 170 |
+
|
| 171 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 172 |
+
CURRENT CODE (this is the broken version — fix it)
|
| 173 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 174 |
+
|
| 175 |
+
{code_preview}
|
| 176 |
+
"""
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
# ---------------------------------------------------------------------------
|
| 180 |
+
# Environment
|
| 181 |
+
# ---------------------------------------------------------------------------
|
| 182 |
+
|
| 183 |
+
class PythonDebuggingGym:
|
| 184 |
+
"""
|
| 185 |
+
Gymnasium-compatible RL environment for Python debugging.
|
| 186 |
+
|
| 187 |
+
PRINCIPLE 1: step() is a pure action router — the agent chooses the
|
| 188 |
+
sequence. No internal gates, no forced ordering between actions.
|
| 189 |
+
|
| 190 |
+
Interface
|
| 191 |
+
---------
|
| 192 |
+
obs, system_prompt = env.reset()
|
| 193 |
+
obs, reward, done, info = env.step(action: CodeAction)
|
| 194 |
+
"""
|
| 195 |
+
|
| 196 |
+
metadata = {"name": "PythonDebuggingGym-v1", "render_modes": []}
|
| 197 |
+
|
| 198 |
+
def __init__(
|
| 199 |
+
self,
|
| 200 |
+
task_index: Optional[int] = None,
|
| 201 |
+
seed: Optional[int] = None,
|
| 202 |
+
):
|
| 203 |
+
self._task_index = task_index
|
| 204 |
+
self._rng = random.Random(seed)
|
| 205 |
+
|
| 206 |
+
# All mutable episode state lives here; reset() wipes every field.
|
| 207 |
+
self._code_lines: List[str] = []
|
| 208 |
+
self._task: Dict[str, Any] = {}
|
| 209 |
+
self._step_count: int = 0
|
| 210 |
+
self._prev_pass_count: int = 0
|
| 211 |
+
self._last_test_results: List[TestResult] = []
|
| 212 |
+
self._last_output: str = ""
|
| 213 |
+
self._last_edited_line: Optional[int] = None # PRINCIPLE 10
|
| 214 |
+
self._episode_id: str = ""
|
| 215 |
+
self._done: bool = False
|
| 216 |
+
self._cumulative_reward: float = 0.0
|
| 217 |
+
self._accumulated_step_costs: float = 0.0 # Hackathon compliance
|
| 218 |
+
# Mini-Git snapshot history (Phase 2)
|
| 219 |
+
self._original_code: List[str] = [] # pristine copy set at reset()
|
| 220 |
+
self._edit_history: List[List[str]] = [] # stack of pre-edit snapshots
|
| 221 |
+
# Curriculum learning — persists across episodes, incremented externally
|
| 222 |
+
self.training_step: int = 0
|
| 223 |
+
|
| 224 |
+
# ── Curriculum task sampler ──────────────────────────────────────────────
|
| 225 |
+
|
| 226 |
+
def _sample_task(self, task_override=None) -> Dict[str, Any]:
    """
    Evaluation-safe curriculum sampler.

    Priority order:
      1. task_override dict → return it directly (test/eval pinning)
      2. task_override int  → return ALL_TASKS[task_override]; when no
                               override is supplied, the task_index given to
                               __init__ (if any) is used the same way
      3. training_step == 0 → randomly sample from ALL_TASKS (judge-safe
                               default: works when reset() is called without
                               training_step ever being set)
      4. training_step  > 0 → curriculum bucketing:
                                 < 1000  → easy
                                 < 5000  → medium
                                 >= 5000 → hard
         Falls back to any non-empty bucket if the target bucket is empty.

    Raises:
        IndexError: a pinned integer index is outside ALL_TASKS.
        RuntimeError: no task is available to sample from.
    """
    if isinstance(task_override, dict):
        return task_override

    # reset() forwards its integer `task_index` here; previously only dicts
    # were honoured, so pinning by index was silently ignored.
    pinned = task_override if task_override is not None else self._task_index
    if isinstance(pinned, int) and not isinstance(pinned, bool):
        if not 0 <= pinned < len(ALL_TASKS):
            raise IndexError(
                f"task_index {pinned} is out of range for {len(ALL_TASKS)} tasks."
            )
        return ALL_TASKS[pinned]

    # Judge-safe default: no training_step set → random from all tasks
    if self.training_step == 0:
        if not ALL_TASKS:
            raise RuntimeError("ALL_TASKS is empty — check tasks.py.")
        return self._rng.choice(ALL_TASKS)

    # Curriculum mode (trainer increments training_step between episodes)
    if self.training_step < 1000:
        bucket = "easy"
    elif self.training_step < 5000:
        bucket = "medium"
    else:
        bucket = "hard"

    pool = TASKS_BY_DIFFICULTY.get(bucket, [])
    if not pool:
        # Fallback: any non-empty bucket rather than crashing
        for b in ("easy", "medium", "hard"):
            pool = TASKS_BY_DIFFICULTY.get(b, [])
            if pool:
                break
    if not pool:
        raise RuntimeError("TASKS_BY_DIFFICULTY is entirely empty — check tasks.py.")

    return self._rng.choice(pool)
|
| 269 |
+
|
| 270 |
+
# ── reset() ─────────────────────────────────────────────────────────────
|
| 271 |
+
|
| 272 |
+
def reset(
|
| 273 |
+
self, *, task_index: Optional[int] = None
|
| 274 |
+
) -> Tuple[CodeObservation, str]:
|
| 275 |
+
"""
|
| 276 |
+
Wipe all episode state; select a task; return initial observation + prompt.
|
| 277 |
+
|
| 278 |
+
State isolation guarantee: every mutable field is explicitly reset here.
|
| 279 |
+
Two things deliberately persist across episodes: the RNG stream (seeded
|
| 280 |
+
once in __init__, so it keeps advancing) and the `training_step` counter.
|
| 281 |
+
"""
|
| 282 |
+
self._task = self._sample_task(task_index)
|
| 283 |
+
|
| 284 |
+
# ── Complete state wipe ──────────────────────────────────────────
|
| 285 |
+
self._code_lines = list(self._task["code"]) # deep copy — no alias
|
| 286 |
+
self._step_count = 0
|
| 287 |
+
self._prev_pass_count = 0
|
| 288 |
+
self._last_test_results = []
|
| 289 |
+
self._last_output = ""
|
| 290 |
+
self._last_edited_line = None # no edits yet — localized_context will be empty
|
| 291 |
+
self._episode_id = str(uuid.uuid4())[:8]
|
| 292 |
+
self._done = False
|
| 293 |
+
self._cumulative_reward = 0.0
|
| 294 |
+
self._accumulated_step_costs = 0.0
|
| 295 |
+
# Mini-Git: seed pristine snapshot and clear history
|
| 296 |
+
self._original_code = list(self._task["code"]) # separate copy from _code_lines
|
| 297 |
+
self._edit_history = []
|
| 298 |
+
# Anti-Loop history
|
| 299 |
+
self._last_action: Optional[str] = None
|
| 300 |
+
self._consecutive_count: int = 0
|
| 301 |
+
|
| 302 |
+
obs = self._build_observation(reward=0.0)
|
| 303 |
+
|
| 304 |
+
# PRINCIPLE 7: build the operational contract string
|
| 305 |
+
system_prompt = _SYSTEM_PROMPT.format(
|
| 306 |
+
task_name = self._task["name"],
|
| 307 |
+
difficulty = self._task.get("difficulty", "unknown"),
|
| 308 |
+
test_count = len(self._task["tests"]),
|
| 309 |
+
max_steps = MAX_STEPS,
|
| 310 |
+
code_preview = obs.render_code(),
|
| 311 |
+
)
|
| 312 |
+
|
| 313 |
+
return obs, system_prompt
|
| 314 |
+
|
| 315 |
+
# ── step() ──────────────────────────────────────────────────────────────
|
| 316 |
+
|
| 317 |
+
def step(
    self, action: CodeAction
) -> Tuple[CodeObservation, float, bool, Dict[str, Any]]:
    """
    Apply one agent action and return ``(observation, reward, done, info)``.

    PRINCIPLE 1 — Pure router. Any valid action is accepted in any order;
    the only sequencing constraint is that SUBMIT ends the episode.
    step() does not enforce a workflow — it applies the action and returns
    the resulting state for the agent to reason about.

    PRINCIPLE 5 — R_STEP_COST is charged before routing, so no step is free.

    When the step budget (MAX_STEPS) is exhausted without a SUBMIT, the
    current code is evaluated automatically and the step reward is replaced
    by the clamped score ``passes / total - accumulated step costs``.

    Raises:
        RuntimeError: if called after the episode has finished.
    """
    if self._done:
        raise RuntimeError(
            "step() called on a finished episode. Call reset() first."
        )

    self._step_count += 1
    reward = R_STEP_COST  # PRINCIPLE 5: cost-per-turn baseline
    self._accumulated_step_costs += abs(R_STEP_COST)  # Hackathon compliance

    # ── Repetition Penalty (Anti-Loop) ───────────────────────────────
    # The penalty grows linearly with the length of the run of identical
    # action types.
    if action.action_type == self._last_action:
        self._consecutive_count += 1
        reward += -0.05 * self._consecutive_count
    else:
        self._consecutive_count = 0
    self._last_action = action.action_type

    # ── Route (PRINCIPLE 1: no forced sequence) ──────────────────────
    atype = action.action_type

    if atype == "VIEW_CODE":
        reward += self._act_view_code()

    elif atype == "RUN_TESTS":
        reward += self._act_run_tests()

    elif atype == "REPLACE_LINES":
        reward += self._act_replace_lines(
            action.start_line, action.end_line, action.new_code_block
        )

    elif atype == "UNDO_EDIT":
        reward += self._act_undo_edit()

    elif atype == "RESET_TO_ORIGINAL":
        reward += self._act_reset_to_original()

    elif atype == "SUBMIT":
        reward += self._act_submit()
        self._done = True

    # ── Max-steps termination ────────────────────────────────────────
    if self._step_count >= MAX_STEPS and not self._done:
        self._done = True
        # Deterministic clamp — never trust the LLM to call SUBMIT.
        # Evaluate the current code and produce a valid [0.0, 1.0] score
        # regardless of how the episode ended.
        _, results, syntax_err = run_code_with_tests(
            source=self._source(),
            test_callables=self._task["tests"],
        )
        # Publish the results of this final evaluation: the observation
        # built below must describe the code that was actually scored,
        # not whatever the last explicit RUN_TESTS happened to see.
        self._last_test_results = results
        total = len(results)
        passes = 0 if syntax_err else sum(1 for t in results if t.passed)
        raw = (passes / total if total > 0 else 0.0) - self._accumulated_step_costs
        reward = max(0.0, min(1.0, raw))
        self._last_output += (
            f"\n⚠ Max steps ({MAX_STEPS}) reached. "
            f"Auto-evaluated: {passes}/{total} tests passing. "
            f"Final score: {reward:.4f}"
        )

    self._cumulative_reward += reward
    obs = self._build_observation(reward=reward)
    info = {
        "episode_id": self._episode_id,
        "task": self._task["name"],
        "cumulative_reward": round(self._cumulative_reward, 4),
        "step": self._step_count,
    }
    if self._done:
        # The reported final score must stay within [0.0, 1.0]. On SUBMIT the
        # step reward can be negative (step cost and repetition penalty are
        # added on top of the submit score), hence the clamp.
        info["final_score"] = max(0.0, min(1.0, round(reward, 4)))

    return obs, round(reward, 4), self._done, info
|
| 406 |
+
|
| 407 |
+
# ── Action handlers ─────────────────────────────────────────────────────
|
| 408 |
+
# Each returns the delta reward (R_STEP_COST already applied by step()).
|
| 409 |
+
# Handlers update self._last_output and self._last_edited_line as needed.
|
| 410 |
+
|
| 411 |
+
def _act_view_code(self) -> float:
    """Write the full source, with 1-based line numbers, to the output buffer.

    The code itself and the localized-context anchor are left untouched.
    Returns a reward delta of 0.0.
    """
    numbered = [
        f"{line_no:>3} | {text}"
        for line_no, text in enumerate(self._code_lines, start=1)
    ]
    self._last_output = "=== Full Source ===\n" + "\n".join(numbered)
    return 0.0
|
| 421 |
+
|
| 422 |
+
def _act_run_tests(self) -> float:
    """
    Run the task's tests against the current code and reward progress.

    Stores the runner's text output and per-test results on the environment.
    Returns R_RUN_TESTS plus either R_SYNTAX_ERROR (the runner reported a
    syntax error) or R_PER_NEW_PASS for every test passing beyond the best
    pass count seen so far in this episode.
    """
    output, results, syntax_err = run_code_with_tests(
        source=self._source(),
        test_callables=self._task["tests"],
    )
    self._last_output = output
    self._last_test_results = results

    reward = R_RUN_TESTS  # information-gathering bonus (Principle 5)

    if syntax_err:
        reward += R_SYNTAX_ERROR
    else:
        current_pass = sum(1 for t in results if t.passed)
        new_passes = max(0, current_pass - self._prev_pass_count)
        reward += new_passes * R_PER_NEW_PASS
        # Keep a high-water mark instead of the latest count. Otherwise an
        # agent could break a passing test, re-run, fix it again and collect
        # R_PER_NEW_PASS for the same test over and over.
        self._prev_pass_count = max(self._prev_pass_count, current_pass)

    return reward
|
| 441 |
+
|
| 442 |
+
def _act_replace_lines(
    self, start_line: int, end_line: int, new_code_block: str
) -> float:
    """
    Replace the inclusive 1-based range [start_line, end_line] with new code.

    ``new_code_block`` is split on newlines; ``None`` is treated as "".
    Before mutating, the current code is pushed onto the undo stack, and
    afterwards the context anchor is moved to the last line of the new block.

    Returns:
        0.0 on success, R_DESTRUCTIVE_PENALTY when an empty replacement
        would wipe more than 5 lines, or R_INVALID_LINE for an inverted or
        out-of-bounds range. Rejected edits leave the code unchanged.
    """
    n = len(self._code_lines)

    if new_code_block is None:
        new_code_block = ""

    # ── Guard: Destructive Action (Anti-Deletion) ─────────────────────
    # The range is inclusive, so it spans end - start + 1 lines; comparing
    # the bare difference would let a 6-line wipe through.
    if len(new_code_block) == 0 and (end_line - start_line + 1) > 5:
        self._last_output = "Error: Cannot delete more than 5 lines at once."
        return R_DESTRUCTIVE_PENALTY

    # ── Guard: inverted range ─────────────────────────────────────────
    if start_line > end_line:
        self._last_output = (
            f"Error: start_line ({start_line}) > end_line ({end_line}). "
            "Inverted range rejected. Call VIEW_CODE to check the current line count."
        )
        return R_INVALID_LINE

    # ── Guard: out-of-bounds ──────────────────────────────────────────
    if start_line < 1 or start_line > n:
        self._last_output = (
            f"Error: start_line {start_line} is out of range [1, {n}]. "
            "Call VIEW_CODE to check the current line count."
        )
        return R_INVALID_LINE
    if end_line < 1 or end_line > n:
        self._last_output = (
            f"Error: end_line {end_line} is out of range [1, {n}]. "
            "Call VIEW_CODE to check the current line count."
        )
        return R_INVALID_LINE

    # ── Slice assignment (PRINCIPLE 1: pure data transformation) ──────
    start_idx = start_line - 1  # convert to 0-indexed
    end_idx = end_line          # exclusive upper bound for Python slice

    # ── Mini-Git: snapshot BEFORE mutating ────────────────────────────
    self._edit_history.append(list(self._code_lines))

    new_lines = new_code_block.split("\n")
    self._code_lines[start_idx:end_idx] = new_lines

    # ── Anchor context at END of new block (PRINCIPLE 10) ─────────────
    # If the agent replaces lines 5–10 with 20 new lines, the anchor
    # settles at start_line + len(new_lines) - 1, clamped to file length.
    new_end = start_line + len(new_lines) - 1
    self._last_edited_line = min(new_end, len(self._code_lines))

    replaced_count = end_line - start_line + 1
    self._last_output = (
        f"✏ Replaced lines {start_line}–{end_line} "
        f"({replaced_count} line(s)) with {len(new_lines)} new line(s).\n"
        f"File now has {len(self._code_lines)} lines total. "
        f"Context anchored at line {self._last_edited_line}. "
        "Call VIEW_CODE to re-orient before referencing line numbers."
    )
    return 0.0
|
| 502 |
+
|
| 503 |
+
def _act_submit(self) -> float:
    """
    Grade the current code and return the episode's final score.

    The score is the fraction of passing tests (0 when the runner reports a
    syntax error or there are no results) minus the accumulated step costs,
    clamped to [0.0, 1.0]. The runner output, extended with a verdict line,
    is left in the output buffer and the per-test results are stored.
    """
    report, outcomes, has_syntax_error = run_code_with_tests(
        source=self._source(),
        test_callables=self._task["tests"],
    )
    self._last_output = report
    self._last_test_results = outcomes

    n_tests = len(outcomes)
    n_passed = 0 if has_syntax_error else sum(1 for t in outcomes if t.passed)

    # ── Hackathon compliance: final score ∈ [0.0, 1.0] ───────────────
    fraction = n_passed / n_tests if n_tests > 0 else 0.0
    final_score = max(0.0, min(1.0, fraction - self._accumulated_step_costs))

    if has_syntax_error:
        self._last_output += "\n❌ SUBMIT rejected — syntax error in current code."
    elif n_passed == n_tests:
        self._last_output += (
            f"\n🎉 ALL {n_tests} TESTS PASS! Episode solved. "
            f"Final score: {final_score:.4f}"
        )
    else:
        still_failing = n_tests - n_passed
        self._last_output += (
            f"\n❌ SUBMIT — {still_failing}/{n_tests} tests still failing. "
            f"Final score: {final_score:.4f}"
        )

    return final_score
|
| 538 |
+
|
| 539 |
+
def _act_undo_edit(self) -> float:
    """
    Mini-Git UNDO: pop the most recent snapshot off the undo stack and make
    it the current code. With an empty stack the code is left as it is.

    In both cases the context anchor is cleared — after a rollback it may
    point at a line that no longer exists or no longer means the same
    thing — and R_UNDO_RESET is returned.
    """
    if self._edit_history:
        self._code_lines = self._edit_history.pop()
        self._last_output = (
            f"↩ UNDO_EDIT: reverted to previous state "
            f"({len(self._code_lines)} lines). "
            "Call VIEW_CODE to inspect the restored file."
        )
    else:
        self._last_output = (
            "⚠ UNDO_EDIT: no edit history — nothing to revert. "
            "The code is still at its current state."
        )

    # PRINCIPLE 10 desync fix: anchor is stale after rollback — wipe it.
    self._last_edited_line = None
    return R_UNDO_RESET
|
| 565 |
+
|
| 566 |
+
def _act_reset_to_original(self) -> float:
    """
    Mini-Git RESET: replace the working code with a fresh copy of the
    episode-start snapshot and empty the undo stack.

    The context anchor is cleared so the localized view cannot point at a
    ghost line in the restored code. Returns R_UNDO_RESET.
    """
    restored = list(self._original_code)  # new list, never an alias
    self._code_lines = restored
    self._edit_history = []
    self._last_output = (
        f"↺ RESET_TO_ORIGINAL: code restored to pristine episode state "
        f"({len(restored)} lines). All undo history cleared. "
        "Call VIEW_CODE to inspect the file."
    )

    # PRINCIPLE 10 desync fix: context anchor is meaningless after full reset.
    self._last_edited_line = None
    return R_UNDO_RESET
|
| 586 |
+
|
| 587 |
+
# ── Helpers ─────────────────────────────────────────────────────────────
|
| 588 |
+
|
| 589 |
+
def _source(self) -> str:
    """Return the working code as a single newline-joined string."""
    line_break = "\n"
    return line_break.join(self._code_lines)
|
| 591 |
+
|
| 592 |
+
def _build_observation(self, reward: float) -> CodeObservation:
    """
    Assemble the observation for the current state.

    The code and test results are copied into new lists, the syntax flag
    comes from check_syntax() on the current source, and the localized
    context from get_localized_context() anchored at the last edited line.
    ``reward`` is reported rounded to four decimals.
    """
    is_valid, _ = check_syntax(self._source())

    # PRINCIPLE 10: localized context around the last edit
    context = get_localized_context(self._code_lines, self._last_edited_line)

    task_info = {
        "episode_id": self._episode_id,
        "task_name": self._task.get("name", ""),
        "task_difficulty": self._task.get("difficulty", ""),
    }
    budget_left = max(0, MAX_STEPS - self._step_count)

    return CodeObservation(
        code_lines=list(self._code_lines),
        localized_context=context,
        last_execution_output=self._last_output,
        syntax_error=not is_valid,
        test_results=list(self._last_test_results),
        step_count=self._step_count,
        steps_remaining=budget_left,
        reward_last_step=round(reward, 4),
        done=self._done,
        info=task_info,
    )
|
inference.py
ADDED
|
@@ -0,0 +1,376 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
inference.py — Baseline Agent for Python Debugging Gym
|
| 3 |
+
=======================================================
|
| 4 |
+
Hackathon-compliant baseline script. Connects to the PythonDebuggingGym
|
| 5 |
+
WebSocket server and drives an OpenAI-compatible LLM to find and fix bugs.
|
| 6 |
+
|
| 7 |
+
Required environment variables:
|
| 8 |
+
HF_TOKEN API key / HuggingFace token passed as Bearer auth
|
| 9 |
+
MODEL_NAME Model identifier (default: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4)
|
| 10 |
+
API_BASE_URL OpenAI-compatible base URL (default: https://api.openai.com/v1)
|
| 11 |
+
|
| 12 |
+
Optional environment variables:
|
| 13 |
+
ENV_WS_URL WebSocket URL for the gym (default: ws://localhost:8000/ws)
|
| 14 |
+
|
| 15 |
+
Mandatory stdout log lines (zero deviation in spacing or formatting):
|
| 16 |
+
[START] task=<task_name> env=PythonDebuggingGym model=<model_name>
|
| 17 |
+
[STEP] step=<n> action=<action_type> reward=<r.rr> done=<true|false> error=<msg|null>
|
| 18 |
+
[END] success=<true|false> steps=<n> score=<s.sss> rewards=<r1,r2,...,rn>
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
import asyncio
|
| 24 |
+
import json
|
| 25 |
+
import os
|
| 26 |
+
import sys
|
| 27 |
+
from typing import Any
|
| 28 |
+
|
| 29 |
+
import websockets
|
| 30 |
+
from openai import OpenAI
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# ---------------------------------------------------------------------------
|
| 34 |
+
# Config (all readable from environment at import time)
|
| 35 |
+
# ---------------------------------------------------------------------------
|
| 36 |
+
|
| 37 |
+
API_BASE_URL: str = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
|
| 38 |
+
MODEL_NAME: str = os.getenv("MODEL_NAME", "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4")
|
| 39 |
+
HF_TOKEN: str = os.getenv("HF_TOKEN", "")
|
| 40 |
+
ENV_WS_URL: str = os.getenv("ENV_WS_URL", "ws://localhost:8000/ws")
|
| 41 |
+
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
# OpenAI client
|
| 44 |
+
# ---------------------------------------------------------------------------
|
| 45 |
+
|
| 46 |
+
_client = OpenAI(
|
| 47 |
+
api_key=HF_TOKEN or "sk-placeholder", # placeholder keeps the client from raising at init
|
| 48 |
+
base_url=API_BASE_URL,
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
# ---------------------------------------------------------------------------
|
| 52 |
+
# Agent instruction appended after the environment's own system prompt
|
| 53 |
+
# ---------------------------------------------------------------------------
|
| 54 |
+
|
| 55 |
+
_AGENT_SUFFIX = """\
|
| 56 |
+
|
| 57 |
+
=======================================================================
|
| 58 |
+
RESPONSE FORMAT (MANDATORY)
|
| 59 |
+
=======================================================================
|
| 60 |
+
Respond with ONLY a valid JSON object. No markdown, no code fences,
|
| 61 |
+
no explanation text — just the raw JSON.
|
| 62 |
+
|
| 63 |
+
Valid action schemas (choose exactly one per turn):
|
| 64 |
+
{"action_type": "VIEW_CODE"}
|
| 65 |
+
{"action_type": "RUN_TESTS"}
|
| 66 |
+
{"action_type": "REPLACE_LINES", "start_line": N, "end_line": M, "new_code_block": "line1\\nline2"}
|
| 67 |
+
{"action_type": "UNDO_EDIT"}
|
| 68 |
+
{"action_type": "RESET_TO_ORIGINAL"}
|
| 69 |
+
{"action_type": "SUBMIT"}
|
| 70 |
+
|
| 71 |
+
Rules for REPLACE_LINES:
|
| 72 |
+
- new_code_block: join multiple lines with \\n (literal backslash-n in the JSON string)
|
| 73 |
+
- Include exact Python indentation (leading spaces) on every line
|
| 74 |
+
- Do NOT include a trailing \\n character
|
| 75 |
+
- After REPLACE_LINES, call VIEW_CODE to re-orient before the next edit
|
| 76 |
+
|
| 77 |
+
Rules for UNDO_EDIT / RESET_TO_ORIGINAL:
|
| 78 |
+
- UNDO_EDIT reverts the last REPLACE_LINES. Use when an edit made things worse.
|
| 79 |
+
- RESET_TO_ORIGINAL restores the original broken code. Last resort only.
|
| 80 |
+
- Both cost -0.10. Prefer fixing forward over backtracking.
|
| 81 |
+
"""
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# ---------------------------------------------------------------------------
|
| 85 |
+
# Observation formatter
|
| 86 |
+
# ---------------------------------------------------------------------------
|
| 87 |
+
|
| 88 |
+
def _format_obs(obs: dict[str, Any]) -> str:
    """Render a CodeObservation dict as a compact text message for the LLM.

    Sections appear in a fixed order and are skipped when empty: syntax
    warning, context around the last edit, last execution output, test
    results. The remaining-steps line is always last.
    """
    sections: list[str] = []

    if obs.get("syntax_error"):
        sections.append("⚠ SYNTAX ERROR in current code — fix indentation/brackets first.\n")

    context = obs.get("localized_context", "")
    if context:
        sections.append(f"[Context around last edit]\n{context}\n")

    output = obs.get("last_execution_output", "")
    if output:
        sections.append(f"[Last execution output]\n{output}\n")

    results: list[dict] = obs.get("test_results", [])
    if results:
        rendered = []
        for entry in results:
            verdict = "PASS" if entry.get("passed") else "FAIL"
            detail = entry.get("error_message") or ""
            label = entry.get("test_name", "?")
            suffix = f": {detail}" if detail else ""
            rendered.append(f" {verdict} {label}" + suffix)
        sections.append("[Test results]\n" + "\n".join(rendered) + "\n")

    steps_left = obs.get("steps_remaining", 0)
    sections.append(f"[Steps remaining: {steps_left}]")

    return "\n".join(sections)
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
# ---------------------------------------------------------------------------
|
| 120 |
+
# LLM call
|
| 121 |
+
# ---------------------------------------------------------------------------
|
| 122 |
+
|
| 123 |
+
# Structured-output schema passed as ``response_format`` by _call_llm().
# It mirrors the action JSON the environment accepts, plus a mandatory
# "thought" field.
_ACTION_SCHEMA = {
    "type": "json_schema",
    "json_schema": {
        "name": "CodeAction",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "thought": {
                    "type": "string",
                    "description": "Mandatory reasoning before selecting action_type.",
                },
                "action_type": {
                    "type": "string",
                    "enum": [
                        "VIEW_CODE", "RUN_TESTS", "REPLACE_LINES",
                        "UNDO_EDIT", "RESET_TO_ORIGINAL", "SUBMIT",
                    ],
                },
                "start_line": {"type": ["integer", "null"]},
                "end_line": {"type": ["integer", "null"]},
                "new_code_block": {"type": ["string", "null"]},
            },
            # Strict mode requires every property to be listed as required;
            # the fields that only matter for REPLACE_LINES are nullable
            # instead, so other actions send null for them.
            "required": [
                "thought",
                "action_type",
                "start_line",
                "end_line",
                "new_code_block",
            ],
            "additionalProperties": False,
        },
    },
}
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def _call_llm(system_prompt: str, messages: list[dict]) -> str:
    """
    Query the configured model and return its raw text reply.

    The request is first sent with the json_schema ``response_format``.
    If that call raises, the same request is retried without it and
    _extract_json() is left to dig the action out of free-form text.

    When the reply's ``content`` is empty, the message's
    ``reasoning_content`` field is used instead (some reasoning models put
    their whole output there). An empty string is returned if both are
    empty.
    """
    request: dict = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": system_prompt + _AGENT_SUFFIX},
            *messages,
        ],
        "temperature": 0.0,
    }
    try:
        response = _client.chat.completions.create(
            **request,
            response_format=_ACTION_SCHEMA,
        )
    except Exception:
        # Backend doesn't support json_schema — fall back to free-form
        response = _client.chat.completions.create(**request)

    message = response.choices[0].message
    reply = message.content
    if reply:
        return reply

    # Fallback for reasoning models that leave ``content`` empty.
    try:
        reply = message.model_dump().get("reasoning_content", "") or ""
    except AttributeError:
        pass

    return reply or ""
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
# ---------------------------------------------------------------------------
|
| 195 |
+
# Constrained JSON extraction (works with any local or cloud model)
|
| 196 |
+
# ---------------------------------------------------------------------------
|
| 197 |
+
|
| 198 |
+
def _extract_json(text: str) -> dict:
    """
    Best-effort extraction of the action object from raw LLM output.

    Tries in order:
    1. Direct json.loads (model produced clean JSON)
    2. Decode a JSON object starting at each "{" in the text. This covers
       markdown fences and prose around the object, and — unlike a regex —
       copes with nested objects and with "}" inside string values such as
       new_code_block. An object containing "action_type" wins; otherwise
       the first object found is used.
    3. A PARSE_ERROR action, which the server is expected to reject.

    Only JSON objects are ever returned; a reply that is valid JSON but not
    an object (a list, a number, ...) is treated as a parse failure because
    callers use dict methods on the result.
    """
    stripped = text.strip()

    # 1. Direct parse
    try:
        parsed = json.loads(stripped)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # 2. Scan for an embedded object
    decoder = json.JSONDecoder()
    first_object = None
    start = stripped.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(stripped, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            if "action_type" in candidate:
                return candidate
            if first_object is None:
                first_object = candidate
        start = stripped.find("{", start + 1)
    if first_object is not None:
        return first_object

    # All extraction attempts failed.
    # Return an invalid action_type so Pydantic rejects it at the server,
    # the server returns an error envelope, and THAT error is fed back to
    # the LLM on the next turn — breaking the silent mask loop.
    # DO NOT default to VIEW_CODE here.
    return {"action_type": "PARSE_ERROR", "thought": f"Failed to parse LLM output as JSON: {text[:120]}"}
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
# ---------------------------------------------------------------------------
|
| 242 |
+
# Episode runner
|
| 243 |
+
# ---------------------------------------------------------------------------
|
| 244 |
+
|
| 245 |
+
async def run_episode(difficulty: str | None = None, show_thought: bool = False) -> None:
    """
    Connect to the gym, run one full episode with an LLM agent,
    and emit the three required log lines ([START], [STEP]..., [END]).

    Args:
        difficulty: optional value appended to the WebSocket URL as a
            ``difficulty`` query parameter.
        show_thought: print the "thought" field of each parsed action.

    Server error envelopes (responses without an "observation" key) are
    logged and also fed back to the model on the next turn. The episode is
    abandoned after too many consecutive rejections, since a rejected
    action yields no new observation and so no ``done`` flag.
    """
    # Upper bound on back-to-back rejected actions before giving up.
    max_consecutive_rejections = 10

    rewards: list[float] = []
    messages: list[dict] = []
    step: int = 0
    success: bool = False
    done: bool = False
    rejections: int = 0
    server_feedback: str = ""

    ws_url = ENV_WS_URL
    if difficulty:
        separator = "&" if "?" in ws_url else "?"
        ws_url = f"{ws_url}{separator}difficulty={difficulty}"

    async with websockets.connect(ws_url) as ws:

        # ── Receive initial observation + system prompt ──────────────────
        raw = await ws.recv()
        data = json.loads(raw)

        system_prompt = data.get("info", {}).get("system_prompt", "")
        obs = data.get("observation", {})
        task_name = obs.get("info", {}).get("task_name", "unknown")

        # ── [START] log line ─────────────────────────────────────────────
        print(
            f"[START] task={task_name} env=PythonDebuggingGym model={MODEL_NAME}",
            flush=True,
        )

        # ── RL loop ──────────────────────────────────────────────────────
        while True:
            step += 1
            error_str = "null"
            action_type = "VIEW_CODE"  # will be overwritten by a real parse

            # Build observation message for the LLM. A rejection from the
            # previous turn goes first so the model can see why its last
            # action had no effect.
            obs_text = _format_obs(obs)
            if server_feedback:
                obs_text = f"{server_feedback}\n\n{obs_text}"
                server_feedback = ""
            messages.append({"role": "user", "content": obs_text})

            # Call LLM
            try:
                llm_reply = _call_llm(system_prompt, messages)
                if os.getenv("DEBUG_LOG") == "1":
                    print(f"\n[DEBUG RAW LLM]: {llm_reply}\n", flush=True)
                action_json = _extract_json(llm_reply)
                action_type = action_json.get("action_type", "VIEW_CODE")
                messages.append({"role": "assistant", "content": llm_reply})
            except Exception as exc:
                # LLM call itself failed — surface the error in the log.
                # Send a harmless VIEW_CODE this turn and pass the error
                # text back as a user message so the model sees it.
                error_str = str(exc).replace("\n", " ")[:200]
                action_type = "VIEW_CODE"
                action_json = {"action_type": "VIEW_CODE"}
                messages.append({"role": "user", "content": f"[SYSTEM ERROR] {error_str}"})

            if show_thought:
                thought = action_json.get("thought", "")
                if thought:
                    print(f"\n[THOUGHT]: {thought}\n", flush=True)

            # Send action to the environment
            await ws.send(json.dumps({"action": action_json}))

            # Receive response
            raw = await ws.recv()
            data = json.loads(raw)

            # Server may return a validation-error envelope (no "observation" key)
            if "observation" not in data:
                # Flatten newlines: the [STEP] line must stay a single line.
                error_str = str(data.get("error", "server_error")).replace("\n", " ")[:200]
                reward, done = 0.0, False
                rejections += 1
                server_feedback = (
                    f"[SERVER ERROR] Your previous action was rejected: {error_str}"
                )
            else:
                rejections = 0
                reward = float(data.get("reward", 0.0))
                done = bool(data.get("done", False))
                obs = data.get("observation", {})

                if done:
                    test_results = obs.get("test_results", [])
                    total = len(test_results)
                    passes = sum(1 for t in test_results if t.get("passed"))
                    success = (total > 0 and passes == total)

            rewards.append(reward)

            # ── [STEP] log line ──────────────────────────────────────────
            done_str = "true" if done else "false"
            print(
                f"[STEP] step={step} action={action_type} "
                f"reward={reward:.2f} done={done_str} error={error_str}",
                flush=True,
            )

            if done:
                break  # server will auto-reset, but we exit after one episode
            if rejections >= max_consecutive_rejections:
                break  # no progress is possible; still emit [END] below

    # ── [END] log line ───────────────────────────────────────────────────────
    success_str = "true" if success else "false"
    # Pull clamped final_score from info dict if available, else derive from rewards
    final_score = data.get("info", {}).get("final_score", None) if done else None
    if final_score is None:
        final_score = max(0.0, min(1.0, sum(rewards)))
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(
        f"[END] success={success_str} steps={step} score={final_score:.3f} rewards={rewards_str}",
        flush=True,
    )
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
# ---------------------------------------------------------------------------
|
| 360 |
+
# Entry point
|
| 361 |
+
# ---------------------------------------------------------------------------
|
| 362 |
+
|
| 363 |
+
def main() -> None:
    """Parse the command-line flags and run a single episode."""
    import argparse

    parser = argparse.ArgumentParser(description="Run the Python debugging agent.")
    # --easy / --medium / --hard all store their level into ``difficulty``.
    for level, article in (("easy", "an"), ("medium", "a"), ("hard", "a")):
        parser.add_argument(
            f"--{level}",
            action="store_const",
            dest="difficulty",
            const=level,
            help=f"Run {article} {level} task.",
        )
    parser.add_argument(
        "--thought",
        action="store_true",
        dest="show_thought",
        help="Print the agent's chain-of-thought reasoning.",
    )

    options = parser.parse_args()
    episode = run_episode(
        difficulty=options.difficulty,
        show_thought=options.show_thought,
    )
    asyncio.run(episode)
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
if __name__ == "__main__":
|
| 376 |
+
main()
|
models.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic schema layer for the Python Debugging Gym OpenEnv environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Dict, List, Literal, Optional
|
| 6 |
+
|
| 7 |
+
from openenv.core.env_server.types import Action, Observation
|
| 8 |
+
from pydantic import BaseModel, Field, model_validator
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Closed set of action names accepted in ``CodeAction.action_type``.
ActionType = Literal[
    "VIEW_CODE",
    "RUN_TESTS",
    "REPLACE_LINES",
    "UNDO_EDIT",
    "RESET_TO_ORIGINAL",
    "SUBMIT",
]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class CodeAction(Action):
    """Structured action consumed by the environment.

    Only ``action_type`` is required. ``start_line``, ``end_line`` and
    ``new_code_block`` are optional in general but all three must be supplied
    when ``action_type`` is ``"REPLACE_LINES"``; this is enforced by
    ``validate_replace_fields`` after field validation.
    """

    thought: Optional[str] = Field(
        default=None,
        description="Optional reasoning string for debugging/traceability.",
    )
    action_type: ActionType = Field(
        ...,
        description="One of VIEW_CODE, RUN_TESTS, REPLACE_LINES, UNDO_EDIT, RESET_TO_ORIGINAL, SUBMIT.",
    )
    # Line bounds must be >= 1 when given.
    # NOTE(review): presumably 1-indexed and inclusive — confirm against the
    # environment's REPLACE_LINES handling.
    start_line: Optional[int] = Field(default=None, ge=1)
    end_line: Optional[int] = Field(default=None, ge=1)
    new_code_block: Optional[str] = Field(default=None)

    @model_validator(mode="after")
    def validate_replace_fields(self) -> "CodeAction":
        """Require the three edit fields when the action is REPLACE_LINES.

        Raises:
            ValueError: if ``start_line``, ``end_line`` or ``new_code_block``
                is ``None`` for a ``REPLACE_LINES`` action (checked in that
                order; the first missing field is reported).
        """
        if self.action_type == "REPLACE_LINES":
            if self.start_line is None:
                raise ValueError("REPLACE_LINES requires start_line.")
            if self.end_line is None:
                raise ValueError("REPLACE_LINES requires end_line.")
            if self.new_code_block is None:
                raise ValueError("REPLACE_LINES requires new_code_block.")
        return self
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class TestResult(BaseModel):
    """Per-test execution outcome."""

    # Name of the test this result belongs to.
    test_name: str
    # Whether the test passed.
    passed: bool
    # Optional failure text; defaults to None.
    error_message: Optional[str] = None
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class CodeObservation(Observation):
    """Full observation returned after each step.

    Every field has a default, so an empty observation can be constructed
    without arguments.
    """

    code_lines: List[str] = Field(default_factory=list)
    localized_context: str = Field(default="")
    last_execution_output: str = Field(default="")
    syntax_error: bool = Field(default=False)
    test_results: List[TestResult] = Field(default_factory=list)
    step_count: int = Field(default=0)
    steps_remaining: int = Field(default=0)
    reward_last_step: float = Field(default=0.0)
    info: Dict[str, Any] = Field(default_factory=dict)

    def render_code(self) -> str:
        """Render source with 1-indexed line numbers for prompts.

        Returns:
            ``"<empty>"`` when ``code_lines`` is empty; otherwise one output
            line per source line in the form ``"  N | <line>"``, with the
            number right-aligned to a width of three and lines joined by
            newlines.
        """
        if not self.code_lines:
            return "<empty>"
        return "\n".join(
            f"{number:>3} | {line}"
            for number, line in enumerate(self.code_lines, start=1)
        )
|
my_env/README.md
DELETED
|
@@ -1,255 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: My Env Environment Server
|
| 3 |
-
emoji: 🖥️
|
| 4 |
-
colorFrom: indigo
|
| 5 |
-
colorTo: indigo
|
| 6 |
-
sdk: docker
|
| 7 |
-
pinned: false
|
| 8 |
-
app_port: 8000
|
| 9 |
-
base_path: /web
|
| 10 |
-
tags:
|
| 11 |
-
- openenv
|
| 12 |
-
---
|
| 13 |
-
|
| 14 |
-
# My Env Environment
|
| 15 |
-
|
| 16 |
-
A simple test environment that echoes back messages. Perfect for testing the env APIs as well as demonstrating environment usage patterns.
|
| 17 |
-
|
| 18 |
-
## Quick Start
|
| 19 |
-
|
| 20 |
-
The simplest way to use the My Env environment is through the `MyEnv` class:
|
| 21 |
-
|
| 22 |
-
```python
|
| 23 |
-
from my_env import MyAction, MyEnv
|
| 24 |
-
|
| 25 |
-
try:
|
| 26 |
-
# Create environment from Docker image
|
| 27 |
-
my_envenv = MyEnv.from_docker_image("my_env-env:latest")
|
| 28 |
-
|
| 29 |
-
# Reset
|
| 30 |
-
result = my_envenv.reset()
|
| 31 |
-
print(f"Reset: {result.observation.echoed_message}")
|
| 32 |
-
|
| 33 |
-
# Send multiple messages
|
| 34 |
-
messages = ["Hello, World!", "Testing echo", "Final message"]
|
| 35 |
-
|
| 36 |
-
for msg in messages:
|
| 37 |
-
result = my_envenv.step(MyAction(message=msg))
|
| 38 |
-
print(f"Sent: '{msg}'")
|
| 39 |
-
print(f" → Echoed: '{result.observation.echoed_message}'")
|
| 40 |
-
print(f" → Length: {result.observation.message_length}")
|
| 41 |
-
print(f" → Reward: {result.reward}")
|
| 42 |
-
|
| 43 |
-
finally:
|
| 44 |
-
# Always clean up
|
| 45 |
-
my_envenv.close()
|
| 46 |
-
```
|
| 47 |
-
|
| 48 |
-
That's it! The `MyEnv.from_docker_image()` method handles:
|
| 49 |
-
- Starting the Docker container
|
| 50 |
-
- Waiting for the server to be ready
|
| 51 |
-
- Connecting to the environment
|
| 52 |
-
- Container cleanup when you call `close()`
|
| 53 |
-
|
| 54 |
-
## Building the Docker Image
|
| 55 |
-
|
| 56 |
-
Before using the environment, you need to build the Docker image:
|
| 57 |
-
|
| 58 |
-
```bash
|
| 59 |
-
# From project root
|
| 60 |
-
docker build -t my_env-env:latest -f server/Dockerfile .
|
| 61 |
-
```
|
| 62 |
-
|
| 63 |
-
## Deploying to Hugging Face Spaces
|
| 64 |
-
|
| 65 |
-
You can easily deploy your OpenEnv environment to Hugging Face Spaces using the `openenv push` command:
|
| 66 |
-
|
| 67 |
-
```bash
|
| 68 |
-
# From the environment directory (where openenv.yaml is located)
|
| 69 |
-
openenv push
|
| 70 |
-
|
| 71 |
-
# Or specify options
|
| 72 |
-
openenv push --namespace my-org --private
|
| 73 |
-
```
|
| 74 |
-
|
| 75 |
-
The `openenv push` command will:
|
| 76 |
-
1. Validate that the directory is an OpenEnv environment (checks for `openenv.yaml`)
|
| 77 |
-
2. Prepare a custom build for Hugging Face Docker space (enables web interface)
|
| 78 |
-
3. Upload to Hugging Face (ensuring you're logged in)
|
| 79 |
-
|
| 80 |
-
### Prerequisites
|
| 81 |
-
|
| 82 |
-
- Authenticate with Hugging Face: The command will prompt for login if not already authenticated
|
| 83 |
-
|
| 84 |
-
### Options
|
| 85 |
-
|
| 86 |
-
- `--directory`, `-d`: Directory containing the OpenEnv environment (defaults to current directory)
|
| 87 |
-
- `--repo-id`, `-r`: Repository ID in format 'username/repo-name' (defaults to 'username/env-name' from openenv.yaml)
|
| 88 |
-
- `--base-image`, `-b`: Base Docker image to use (overrides Dockerfile FROM)
|
| 89 |
-
- `--private`: Deploy the space as private (default: public)
|
| 90 |
-
|
| 91 |
-
### Examples
|
| 92 |
-
|
| 93 |
-
```bash
|
| 94 |
-
# Push to your personal namespace (defaults to username/env-name from openenv.yaml)
|
| 95 |
-
openenv push
|
| 96 |
-
|
| 97 |
-
# Push to a specific repository
|
| 98 |
-
openenv push --repo-id my-org/my-env
|
| 99 |
-
|
| 100 |
-
# Push with a custom base image
|
| 101 |
-
openenv push --base-image ghcr.io/meta-pytorch/openenv-base:latest
|
| 102 |
-
|
| 103 |
-
# Push as a private space
|
| 104 |
-
openenv push --private
|
| 105 |
-
|
| 106 |
-
# Combine options
|
| 107 |
-
openenv push --repo-id my-org/my-env --base-image custom-base:latest --private
|
| 108 |
-
```
|
| 109 |
-
|
| 110 |
-
After deployment, your space will be available at:
|
| 111 |
-
`https://huggingface.co/spaces/<repo-id>`
|
| 112 |
-
|
| 113 |
-
The deployed space includes:
|
| 114 |
-
- **Web Interface** at `/web` - Interactive UI for exploring the environment
|
| 115 |
-
- **API Documentation** at `/docs` - Full OpenAPI/Swagger interface
|
| 116 |
-
- **Health Check** at `/health` - Container health monitoring
|
| 117 |
-
- **WebSocket** at `/ws` - Persistent session endpoint for low-latency interactions
|
| 118 |
-
|
| 119 |
-
## Environment Details
|
| 120 |
-
|
| 121 |
-
### Action
|
| 122 |
-
**MyAction**: Contains a single field
|
| 123 |
-
- `message` (str) - The message to echo back
|
| 124 |
-
|
| 125 |
-
### Observation
|
| 126 |
-
**MyObservation**: Contains the echo response and metadata
|
| 127 |
-
- `echoed_message` (str) - The message echoed back
|
| 128 |
-
- `message_length` (int) - Length of the message
|
| 129 |
-
- `reward` (float) - Reward based on message length (length × 0.1)
|
| 130 |
-
- `done` (bool) - Always False for echo environment
|
| 131 |
-
- `metadata` (dict) - Additional info like step count
|
| 132 |
-
|
| 133 |
-
### Reward
|
| 134 |
-
The reward is calculated as: `message_length × 0.1`
|
| 135 |
-
- "Hi" → reward: 0.2
|
| 136 |
-
- "Hello, World!" → reward: 1.3
|
| 137 |
-
- Empty message → reward: 0.0
|
| 138 |
-
|
| 139 |
-
## Advanced Usage
|
| 140 |
-
|
| 141 |
-
### Connecting to an Existing Server
|
| 142 |
-
|
| 143 |
-
If you already have a My Env environment server running, you can connect directly:
|
| 144 |
-
|
| 145 |
-
```python
|
| 146 |
-
from my_env import MyEnv
|
| 147 |
-
|
| 148 |
-
# Connect to existing server
|
| 149 |
-
my_envenv = MyEnv(base_url="<ENV_HTTP_URL_HERE>")
|
| 150 |
-
|
| 151 |
-
# Use as normal
|
| 152 |
-
result = my_envenv.reset()
|
| 153 |
-
result = my_envenv.step(MyAction(message="Hello!"))
|
| 154 |
-
```
|
| 155 |
-
|
| 156 |
-
Note: When connecting to an existing server, `my_envenv.close()` will NOT stop the server.
|
| 157 |
-
|
| 158 |
-
### Using the Context Manager
|
| 159 |
-
|
| 160 |
-
The client supports context manager usage for automatic connection management:
|
| 161 |
-
|
| 162 |
-
```python
|
| 163 |
-
from my_env import MyAction, MyEnv
|
| 164 |
-
|
| 165 |
-
# Connect with context manager (auto-connects and closes)
|
| 166 |
-
with MyEnv(base_url="http://localhost:8000") as env:
|
| 167 |
-
result = env.reset()
|
| 168 |
-
print(f"Reset: {result.observation.echoed_message}")
|
| 169 |
-
# Multiple steps with low latency
|
| 170 |
-
for msg in ["Hello", "World", "!"]:
|
| 171 |
-
result = env.step(MyAction(message=msg))
|
| 172 |
-
print(f"Echoed: {result.observation.echoed_message}")
|
| 173 |
-
```
|
| 174 |
-
|
| 175 |
-
The client uses WebSocket connections for:
|
| 176 |
-
- **Lower latency**: No HTTP connection overhead per request
|
| 177 |
-
- **Persistent session**: Server maintains your environment state
|
| 178 |
-
- **Efficient for episodes**: Better for many sequential steps
|
| 179 |
-
|
| 180 |
-
### Concurrent WebSocket Sessions
|
| 181 |
-
|
| 182 |
-
The server supports multiple concurrent WebSocket connections. To enable this,
|
| 183 |
-
modify `server/app.py` to use factory mode:
|
| 184 |
-
|
| 185 |
-
```python
|
| 186 |
-
# In server/app.py - use factory mode for concurrent sessions
|
| 187 |
-
app = create_app(
|
| 188 |
-
MyEnvironment, # Pass class, not instance
|
| 189 |
-
MyAction,
|
| 190 |
-
MyObservation,
|
| 191 |
-
max_concurrent_envs=4, # Allow 4 concurrent sessions
|
| 192 |
-
)
|
| 193 |
-
```
|
| 194 |
-
|
| 195 |
-
Then multiple clients can connect simultaneously:
|
| 196 |
-
|
| 197 |
-
```python
|
| 198 |
-
from my_env import MyAction, MyEnv
|
| 199 |
-
from concurrent.futures import ThreadPoolExecutor
|
| 200 |
-
|
| 201 |
-
def run_episode(client_id: int):
|
| 202 |
-
with MyEnv(base_url="http://localhost:8000") as env:
|
| 203 |
-
result = env.reset()
|
| 204 |
-
for i in range(10):
|
| 205 |
-
result = env.step(MyAction(message=f"Client {client_id}, step {i}"))
|
| 206 |
-
return client_id, result.observation.message_length
|
| 207 |
-
|
| 208 |
-
# Run 4 episodes concurrently
|
| 209 |
-
with ThreadPoolExecutor(max_workers=4) as executor:
|
| 210 |
-
results = list(executor.map(run_episode, range(4)))
|
| 211 |
-
```
|
| 212 |
-
|
| 213 |
-
## Development & Testing
|
| 214 |
-
|
| 215 |
-
### Direct Environment Testing
|
| 216 |
-
|
| 217 |
-
Test the environment logic directly without starting the HTTP server:
|
| 218 |
-
|
| 219 |
-
```bash
|
| 220 |
-
# From the server directory
|
| 221 |
-
python3 server/my_env_environment.py
|
| 222 |
-
```
|
| 223 |
-
|
| 224 |
-
This verifies that:
|
| 225 |
-
- Environment resets correctly
|
| 226 |
-
- Step executes actions properly
|
| 227 |
-
- State tracking works
|
| 228 |
-
- Rewards are calculated correctly
|
| 229 |
-
|
| 230 |
-
### Running Locally
|
| 231 |
-
|
| 232 |
-
Run the server locally for development:
|
| 233 |
-
|
| 234 |
-
```bash
|
| 235 |
-
uvicorn server.app:app --reload
|
| 236 |
-
```
|
| 237 |
-
|
| 238 |
-
## Project Structure
|
| 239 |
-
|
| 240 |
-
```
|
| 241 |
-
my_env/
|
| 242 |
-
├── .dockerignore # Docker build exclusions
|
| 243 |
-
├── __init__.py # Module exports
|
| 244 |
-
├── README.md # This file
|
| 245 |
-
├── openenv.yaml # OpenEnv manifest
|
| 246 |
-
├── pyproject.toml # Project metadata and dependencies
|
| 247 |
-
├── uv.lock # Locked dependencies (generated)
|
| 248 |
-
├── client.py # MyEnv client
|
| 249 |
-
├── models.py # Action and Observation models
|
| 250 |
-
└── server/
|
| 251 |
-
├── __init__.py # Server module exports
|
| 252 |
-
├── my_env_environment.py # Core environment logic
|
| 253 |
-
├── app.py # FastAPI application (HTTP + WebSocket endpoints)
|
| 254 |
-
└── Dockerfile # Container image definition
|
| 255 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
my_env/models.py
DELETED
|
@@ -1,27 +0,0 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
-
"""
|
| 8 |
-
Data models for the My Env Environment.
|
| 9 |
-
|
| 10 |
-
The my_env environment is a simple test environment that echoes back messages.
|
| 11 |
-
"""
|
| 12 |
-
|
| 13 |
-
from openenv.core.env_server.types import Action, Observation
|
| 14 |
-
from pydantic import Field
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
class MyAction(Action):
|
| 18 |
-
"""Action for the My Env environment - just a message to echo."""
|
| 19 |
-
|
| 20 |
-
message: str = Field(..., description="Message to echo back")
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
class MyObservation(Observation):
|
| 24 |
-
"""Observation from the My Env environment - the echoed message."""
|
| 25 |
-
|
| 26 |
-
echoed_message: str = Field(default="", description="The echoed message")
|
| 27 |
-
message_length: int = Field(default=0, description="Length of the echoed message")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
my_env/openenv.yaml → openenv.yaml
RENAMED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
spec_version: 1
|
| 2 |
-
name:
|
| 3 |
type: space
|
| 4 |
runtime: fastapi
|
| 5 |
app: server.app:app
|
| 6 |
port: 8000
|
| 7 |
-
|
|
|
|
| 1 |
spec_version: 1
|
| 2 |
+
name: python_debugging_gym
|
| 3 |
type: space
|
| 4 |
runtime: fastapi
|
| 5 |
app: server.app:app
|
| 6 |
port: 8000
|
|
|
pre-val.sh
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# validate-submission.sh — OpenEnv Submission Validator
|
| 4 |
+
#
|
| 5 |
+
# Checks that your HF Space is live, Docker image builds, and openenv validate passes.
|
| 6 |
+
#
|
| 7 |
+
# Prerequisites:
|
| 8 |
+
# - Docker: https://docs.docker.com/get-docker/
|
| 9 |
+
# - openenv-core: pip install openenv-core
|
| 10 |
+
# - curl (usually pre-installed)
|
| 11 |
+
#
|
| 12 |
+
# Run:
|
| 13 |
+
# curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
|
| 14 |
+
#
|
| 15 |
+
# Or download and run locally:
|
| 16 |
+
# chmod +x validate-submission.sh
|
| 17 |
+
# ./validate-submission.sh <ping_url> [repo_dir]
|
| 18 |
+
#
|
| 19 |
+
# Arguments:
|
| 20 |
+
# ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)
|
| 21 |
+
# repo_dir Path to your repo (default: current directory)
|
| 22 |
+
#
|
| 23 |
+
# Examples:
|
| 24 |
+
# ./validate-submission.sh https://my-team.hf.space
|
| 25 |
+
# ./validate-submission.sh https://my-team.hf.space ./my-repo
|
| 26 |
+
#
|
| 27 |
+
|
| 28 |
+
set -uo pipefail
|
| 29 |
+
|
| 30 |
+
DOCKER_BUILD_TIMEOUT=600
|
| 31 |
+
if [ -t 1 ]; then
|
| 32 |
+
RED='\033[0;31m'
|
| 33 |
+
GREEN='\033[0;32m'
|
| 34 |
+
YELLOW='\033[1;33m'
|
| 35 |
+
BOLD='\033[1m'
|
| 36 |
+
NC='\033[0m'
|
| 37 |
+
else
|
| 38 |
+
RED='' GREEN='' YELLOW='' BOLD='' NC=''
|
| 39 |
+
fi
|
| 40 |
+
|
| 41 |
+
# run_with_timeout SECS CMD [ARGS...]
# Run CMD with a wall-clock limit of SECS seconds and return its exit status.
# Uses `timeout` when available, then `gtimeout`, and otherwise falls back to
# a background job plus a sleeping watcher that kills the job after SECS.
run_with_timeout() {
  local secs="$1"; shift
  if command -v timeout &>/dev/null; then
    timeout "$secs" "$@"
  elif command -v gtimeout &>/dev/null; then
    gtimeout "$secs" "$@"
  else
    # No timeout utility found: run the command in the background and start a
    # watcher subshell that kills it once the time limit has elapsed.
    "$@" &
    local pid=$!
    ( sleep "$secs" && kill "$pid" 2>/dev/null ) &
    local watcher=$!
    wait "$pid" 2>/dev/null
    local rc=$?
    # The command has finished (or was killed); stop and reap the watcher.
    kill "$watcher" 2>/dev/null
    wait "$watcher" 2>/dev/null
    return $rc
  fi
}
|
| 59 |
+
|
| 60 |
+
# portable_mktemp [PREFIX]
# Create a temporary file and print its path. The file is created under
# ${TMPDIR:-/tmp} with the name PREFIX-XXXXXX (PREFIX defaults to "validate");
# if that invocation fails, fall back to a bare `mktemp`.
portable_mktemp() {
  local prefix="${1:-validate}"
  mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
}
|
| 64 |
+
|
| 65 |
+
CLEANUP_FILES=()
|
| 66 |
+
cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
|
| 67 |
+
trap cleanup EXIT
|
| 68 |
+
|
| 69 |
+
PING_URL="${1:-}"
|
| 70 |
+
REPO_DIR="${2:-.}"
|
| 71 |
+
|
| 72 |
+
if [ -z "$PING_URL" ]; then
|
| 73 |
+
printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
|
| 74 |
+
printf "\n"
|
| 75 |
+
printf " ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
|
| 76 |
+
printf " repo_dir Path to your repo (default: current directory)\n"
|
| 77 |
+
exit 1
|
| 78 |
+
fi
|
| 79 |
+
|
| 80 |
+
if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
|
| 81 |
+
printf "Error: directory '%s' not found\n" "${2:-.}"
|
| 82 |
+
exit 1
|
| 83 |
+
fi
|
| 84 |
+
PING_URL="${PING_URL%/}"
|
| 85 |
+
export PING_URL
|
| 86 |
+
PASS=0
|
| 87 |
+
|
| 88 |
+
# Output helpers.
# log:     print a message prefixed with the current UTC time (HH:MM:SS);
#          the message is formatted with %b so backslash escapes are expanded.
# pass:    log a PASSED line and increment the PASS counter.
# fail:    log a FAILED line.
# hint:    print an indented hint; the text is formatted with %b, so it is
#          NOT treated as a printf format string.
# stop_at: print which step validation stopped at and exit with status 1.
log()  { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
fail() { log "${RED}FAILED${NC} -- $1"; }
hint() { printf "  ${YELLOW}Hint:${NC} %b\n" "$1"; }
stop_at() {
  printf "\n"
  printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
  exit 1
}
|
| 97 |
+
|
| 98 |
+
printf "\n"
|
| 99 |
+
printf "${BOLD}========================================${NC}\n"
|
| 100 |
+
printf "${BOLD} OpenEnv Submission Validator${NC}\n"
|
| 101 |
+
printf "${BOLD}========================================${NC}\n"
|
| 102 |
+
log "Repo: $REPO_DIR"
|
| 103 |
+
log "Ping URL: $PING_URL"
|
| 104 |
+
printf "\n"
|
| 105 |
+
|
| 106 |
+
log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."

CURL_OUTPUT=$(portable_mktemp "validate-curl")
CLEANUP_FILES+=("$CURL_OUTPUT")
# curl's --write-out already prints "000" when no HTTP response was received,
# so appending another "000" inside the command substitution would yield
# "000000" and the "not reachable" branch below would never match. Replace the
# captured value on failure instead of appending to it.
# stderr goes to /dev/null so it does not compete with -o for the same file
# (curl runs with -s, so nothing useful is lost).
HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
  -H "Content-Type: application/json" -d '{}' \
  "$PING_URL/reset" --max-time 30 2>/dev/null) || HTTP_CODE="000"

if [ "$HTTP_CODE" = "200" ]; then
  pass "HF Space is live and responds to /reset"
elif [ "$HTTP_CODE" = "000" ]; then
  fail "HF Space not reachable (connection failed or timed out)"
  hint "Check your network connection and that the Space is running."
  # hint prints its argument via %b (not as a printf format), so the percent
  # sign must not be doubled here or the suggested command would be wrong.
  hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
  stop_at "Step 1"
else
  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
  hint "Make sure your Space is running and the URL is correct."
  hint "Try opening $PING_URL in your browser first."
  stop_at "Step 1"
fi
|
| 127 |
+
|
| 128 |
+
log "${BOLD}Step 2/3: Running docker build${NC} ..."
|
| 129 |
+
|
| 130 |
+
if ! command -v docker &>/dev/null; then
|
| 131 |
+
fail "docker command not found"
|
| 132 |
+
hint "Install Docker: https://docs.docker.com/get-docker/"
|
| 133 |
+
stop_at "Step 2"
|
| 134 |
+
fi
|
| 135 |
+
|
| 136 |
+
if [ -f "$REPO_DIR/Dockerfile" ]; then
|
| 137 |
+
DOCKER_CONTEXT="$REPO_DIR"
|
| 138 |
+
elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
|
| 139 |
+
DOCKER_CONTEXT="$REPO_DIR/server"
|
| 140 |
+
else
|
| 141 |
+
fail "No Dockerfile found in repo root or server/ directory"
|
| 142 |
+
stop_at "Step 2"
|
| 143 |
+
fi
|
| 144 |
+
|
| 145 |
+
log " Found Dockerfile in $DOCKER_CONTEXT"
|
| 146 |
+
|
| 147 |
+
BUILD_OK=false
|
| 148 |
+
BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
|
| 149 |
+
|
| 150 |
+
if [ "$BUILD_OK" = true ]; then
|
| 151 |
+
pass "Docker build succeeded"
|
| 152 |
+
else
|
| 153 |
+
fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
|
| 154 |
+
printf "%s\n" "$BUILD_OUTPUT" | tail -20
|
| 155 |
+
stop_at "Step 2"
|
| 156 |
+
fi
|
| 157 |
+
|
| 158 |
+
log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
|
| 159 |
+
|
| 160 |
+
if ! command -v openenv &>/dev/null; then
|
| 161 |
+
fail "openenv command not found"
|
| 162 |
+
hint "Install it: pip install openenv-core"
|
| 163 |
+
stop_at "Step 3"
|
| 164 |
+
fi
|
| 165 |
+
|
| 166 |
+
VALIDATE_OK=false
|
| 167 |
+
VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
|
| 168 |
+
|
| 169 |
+
if [ "$VALIDATE_OK" = true ]; then
|
| 170 |
+
pass "openenv validate passed"
|
| 171 |
+
[ -n "$VALIDATE_OUTPUT" ] && log " $VALIDATE_OUTPUT"
|
| 172 |
+
else
|
| 173 |
+
fail "openenv validate failed"
|
| 174 |
+
printf "%s\n" "$VALIDATE_OUTPUT"
|
| 175 |
+
stop_at "Step 3"
|
| 176 |
+
fi
|
| 177 |
+
|
| 178 |
+
printf "\n"
|
| 179 |
+
printf "${BOLD}========================================${NC}\n"
|
| 180 |
+
printf "${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
|
| 181 |
+
printf "${GREEN}${BOLD} Your submission is ready to submit.${NC}\n"
|
| 182 |
+
printf "${BOLD}========================================${NC}\n"
|
| 183 |
+
printf "\n"
|
| 184 |
+
|
| 185 |
+
exit 0
|
my_env/pyproject.toml → pyproject.toml
RENAMED
|
@@ -9,9 +9,9 @@ requires = ["setuptools>=45", "wheel"]
|
|
| 9 |
build-backend = "setuptools.build_meta"
|
| 10 |
|
| 11 |
[project]
|
| 12 |
-
name = "openenv-
|
| 13 |
version = "0.1.0"
|
| 14 |
-
description = "
|
| 15 |
requires-python = ">=3.10"
|
| 16 |
dependencies = [
|
| 17 |
# Core OpenEnv runtime (provides FastAPI server + HTTP client types)
|
|
@@ -26,6 +26,8 @@ dependencies = [
|
|
| 26 |
# "gymnasium>=0.29.0",
|
| 27 |
# "openspiel>=1.0.0",
|
| 28 |
# "smolagents>=1.22.0,<2",
|
|
|
|
|
|
|
| 29 |
]
|
| 30 |
|
| 31 |
[project.optional-dependencies]
|
|
@@ -42,4 +44,4 @@ server = "my_env.server.app:main"
|
|
| 42 |
[tool.setuptools]
|
| 43 |
include-package-data = true
|
| 44 |
packages = ["my_env", "my_env.server"]
|
| 45 |
-
package-dir = { "my_env" = ".", "my_env.server" = "server" }
|
|
|
|
| 9 |
build-backend = "setuptools.build_meta"
|
| 10 |
|
| 11 |
[project]
|
| 12 |
+
name = "openenv-python-debugging-gym"
|
| 13 |
version = "0.1.0"
|
| 14 |
+
description = "Python Debugging Gym environment for OpenEnv"
|
| 15 |
requires-python = ">=3.10"
|
| 16 |
dependencies = [
|
| 17 |
# Core OpenEnv runtime (provides FastAPI server + HTTP client types)
|
|
|
|
| 26 |
# "gymnasium>=0.29.0",
|
| 27 |
# "openspiel>=1.0.0",
|
| 28 |
# "smolagents>=1.22.0,<2",
|
| 29 |
+
"openai>=1.30.0",
|
| 30 |
+
"websockets>=12.0",
|
| 31 |
]
|
| 32 |
|
| 33 |
[project.optional-dependencies]
|
|
|
|
| 44 |
[tool.setuptools]
|
| 45 |
include-package-data = true
|
| 46 |
packages = ["my_env", "my_env.server"]
|
| 47 |
+
package-dir = { "my_env" = ".", "my_env.server" = "server" }
|
sandbox.py
ADDED
|
@@ -0,0 +1,309 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
sandbox.py — Safe Code Execution Sandbox
|
| 3 |
+
=========================================
|
| 4 |
+
|
| 5 |
+
PRINCIPLE 2 — Errors are Data, Not Control Flow
|
| 6 |
+
If the agent writes code that throws SyntaxError, AssertionError, TypeError,
|
| 7 |
+
RecursionError, or ANY other exception, the environment must NOT crash or
|
| 8 |
+
propagate that exception to the server loop. Every possible failure mode is
|
| 9 |
+
caught inside the child process, serialized into a string, and returned as
|
| 10 |
+
structured data in the CodeObservation. The agent then reads this error text
|
| 11 |
+
and adapts on its next turn.
|
| 12 |
+
|
| 13 |
+
PRINCIPLE 8 — Security is Per Invocation
|
| 14 |
+
The sandbox executes arbitrary LLM-generated Python code. Two defences:
|
| 15 |
+
1. TIMEOUT: The worker process is hard-killed (SIGKILL after SIGTERM) after
|
| 16 |
+
EXEC_TIMEOUT_SECONDS. This stops while-True loops and CPU-exhaustion.
|
| 17 |
+
2. RESTRICTED BUILTINS: exec() receives a controlled __builtins__ dict with
|
| 18 |
+
dangerous callables (open, __import__, eval, exec, compile, breakpoint,
|
| 19 |
+
input) replaced with safe stubs that raise RuntimeError. This prevents
|
| 20 |
+
the agent from escaping the sandbox via filesystem or subprocess access.
|
| 21 |
+
|
| 22 |
+
PRINCIPLE 9 — Optimizations are MVP Requirements
|
| 23 |
+
Python tracebacks can be thousands of lines. We tail-truncate to the last
|
| 24 |
+
MAX_OUTPUT_CHARS characters. The tail of a traceback is the most actionable
|
| 25 |
+
part (it contains the actual exception, not the call stack preamble).
|
| 26 |
+
Prefix '[...truncated N chars...]' is added so the agent knows output was cut.
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
from __future__ import annotations
|
| 30 |
+
|
| 31 |
+
import ast
|
| 32 |
+
import io
|
| 33 |
+
import inspect
|
| 34 |
+
import multiprocessing
|
| 35 |
+
import signal
|
| 36 |
+
import sys
|
| 37 |
+
import textwrap
|
| 38 |
+
import traceback
|
| 39 |
+
from typing import Any, Callable, Dict, List, Tuple
|
| 40 |
+
|
| 41 |
+
try:
|
| 42 |
+
from .models import TestResult
|
| 43 |
+
except ImportError:
|
| 44 |
+
from models import TestResult
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# ---------------------------------------------------------------------------
|
| 48 |
+
# Constants
|
| 49 |
+
# ---------------------------------------------------------------------------
|
| 50 |
+
|
| 51 |
+
EXEC_TIMEOUT_SECONDS: int = 5 # Hard wall-clock kill limit (Principle 8)
|
| 52 |
+
MAX_OUTPUT_CHARS: int = 1_000 # Tail-truncate limit (Principle 9)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# ---------------------------------------------------------------------------
|
| 56 |
+
# Restricted builtins (Principle 8)
|
| 57 |
+
# ---------------------------------------------------------------------------
|
| 58 |
+
|
| 59 |
+
def _make_safe_stub(name: str) -> Callable:
|
| 60 |
+
"""Return a callable that raises RuntimeError — used to block dangerous builtins."""
|
| 61 |
+
def _stub(*args, **kwargs):
|
| 62 |
+
raise RuntimeError(
|
| 63 |
+
f"'{name}' is disabled in the sandbox. "
|
| 64 |
+
"Do not attempt to access the filesystem, import modules dynamically, "
|
| 65 |
+
"or execute arbitrary code within your solution."
|
| 66 |
+
)
|
| 67 |
+
_stub.__name__ = name
|
| 68 |
+
return _stub
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# Whitelist: safe builtins the agent's code is allowed to use.
|
| 72 |
+
# Everything not in this dict is blocked.
|
| 73 |
+
# Builtins exposed unchanged; each is registered under its own `__name__`.
_ALLOWED_BUILTIN_OBJECTS = (
    # Type constructors
    int, float, str, bool,
    list, dict, set, tuple,
    bytes, bytearray, frozenset,
    complex,
    # Inspection / iteration
    len, range, enumerate, zip,
    map, filter, reversed, sorted,
    iter, next, sum, min, max,
    abs, round, divmod, pow,
    # Introspection
    isinstance, issubclass, type,
    hasattr, getattr, setattr,
    callable, repr, hash, id,
    # I/O (stdout only — stderr is captured separately)
    print,
    # Exceptions & control
    Exception, ValueError, TypeError,
    KeyError, IndexError, AttributeError,
    StopIteration, RuntimeError,
    AssertionError, NotImplementedError,
    OverflowError, ZeroDivisionError,
    RecursionError, MemoryError,
    KeyboardInterrupt,
    BaseException,
    # Functional
    any, all,
    chr, ord, hex, oct, bin,
    format,
    object, property, staticmethod,
    classmethod, super,
)

# Builtins replaced by stubs that raise RuntimeError (Principle 8).
_BLOCKED_BUILTIN_NAMES = (
    "open",
    "__import__",
    "eval",
    "exec",
    "compile",
    "breakpoint",
    "input",
    "globals",
    "locals",
    "vars",
    "dir",
)

# NOTE(review): the `_worker` function in this file passes the module's full
# `__builtins__` to exec(), not this dict — confirm whether this whitelist is
# still meant to be applied anywhere.
_SAFE_BUILTINS: Dict[str, Any] = {
    **{allowed.__name__: allowed for allowed in _ALLOWED_BUILTIN_OBJECTS},
    **{blocked: _make_safe_stub(blocked) for blocked in _BLOCKED_BUILTIN_NAMES},
    "__loader__": None,
    "__spec__": None,
}
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
# ---------------------------------------------------------------------------
|
| 123 |
+
# Output truncation (Principle 9)
|
| 124 |
+
# ---------------------------------------------------------------------------
|
| 125 |
+
|
| 126 |
+
def _tail_truncate(s: str, limit: int = MAX_OUTPUT_CHARS) -> str:
    """
    Return the TAIL of `s`, keeping at most `limit` characters of it.

    Rationale: Python tracebacks print in chronological call order — the most
    actionable information (the actual exception type and message) appears at
    the very END of the traceback, not the beginning. Tail-truncation therefore
    preserves the signal the agent needs while discarding verbose call stacks.

    When truncation happens, a '[...truncated N chars...]' line is prepended,
    so the returned string is longer than `limit` by the length of that marker.
    A `limit` of zero or less keeps none of `s` (only the marker is returned
    for non-empty input).
    """
    limit = max(limit, 0)
    dropped = len(s) - limit
    if dropped <= 0:
        return s
    # Slice from an explicit start index: `s[-limit:]` would return the WHOLE
    # string when limit == 0, because -0 == 0.
    return f"[...truncated {dropped} chars...]\n" + s[dropped:]
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
# ---------------------------------------------------------------------------
|
| 142 |
+
# Worker (runs in isolated child process)
|
| 143 |
+
# ---------------------------------------------------------------------------
|
| 144 |
+
|
| 145 |
+
def _worker(
    source: str,
    test_sources: List[str],
    result_queue: multiprocessing.Queue,
) -> None:
    """
    Isolated execution unit. Ordinary failures never propagate — they become data.

    Compiles and executes `source`, then executes each entry of `test_sources`
    in the same namespace and calls the last top-level function that entry
    defines, passing the namespace as its only argument. While running, stdout
    and stderr are both redirected into one in-memory buffer.

    One `(output, test_results, had_syntax_error)` tuple is put on
    `result_queue`:
        syntax error   → (error text, [], True)
        exec() crash   → (captured output + traceback, [], False)
        otherwise      → (captured output, [one result dict per test], False)
    Errors that are not `Exception` subclasses (e.g. SystemExit) are not
    caught here; in that case nothing is put on the queue.

    PRINCIPLE 2: Every exception path is caught and serialized.

    NOTE(security): the agent's code runs with this module's full
    `__builtins__`, NOT the `_SAFE_BUILTINS` whitelist — this was done so that
    class definitions (which need `__build_class__`) keep working. Isolation
    therefore relies on the surrounding process, not on restricted builtins.
    """
    buf = io.StringIO()
    old_stdout, old_stderr = sys.stdout, sys.stderr
    sys.stdout = sys.stderr = buf

    test_results: List[Dict] = []
    had_syntax_error = False

    try:
        # ── Phase 1: Syntax check ─────────────────────────────────────────
        # Compile before exec() so SyntaxError is caught cleanly.
        try:
            code_obj = compile(source, "<agent_code>", "exec")
        except SyntaxError as exc:
            had_syntax_error = True
            # Restore streams before writing the error
            sys.stdout, sys.stderr = old_stdout, old_stderr
            err = f"SyntaxError at line {exc.lineno}: {exc.msg}\n >> {exc.text or ''}"
            result_queue.put((_tail_truncate(err), [], True))
            return

        # ── Phase 2: Execute agent code into its own namespace ───────────
        # Full __builtins__ is used to prevent __build_class__ errors for
        # class-based tasks (see the security note in the docstring).
        namespace: Dict[str, Any] = {"__builtins__": __builtins__}
        try:
            exec(code_obj, namespace)  # noqa: S102
        except Exception:  # noqa: BLE001
            # PRINCIPLE 2: execution crash is data, not a crash
            tb = traceback.format_exc()
            sys.stdout, sys.stderr = old_stdout, old_stderr
            result_queue.put((_tail_truncate(buf.getvalue() + "\n" + tb), [], False))
            return

        # ── Phase 3: Run each test function ──────────────────────────────
        # PRINCIPLE 2: each test is isolated inside its own try-except so a
        # crash in test N does not prevent tests N+1..M from running.
        for test_src in test_sources:
            # Resolve the name BEFORE exec() so that a test whose source fails
            # to execute is still reported under its own name.
            def_names = [
                ln.split("(")[0].replace("def ", "").strip()
                for ln in test_src.splitlines()
                if ln.startswith("def ")
            ]
            fn_name = def_names[-1] if def_names else "<unknown>"
            try:
                # Inject the test function into the existing namespace so it
                # can access the agent's defined symbols.
                exec(test_src, namespace)  # noqa: S102

                namespace[fn_name](namespace)
                test_results.append({"test_name": fn_name, "passed": True})

            except AssertionError as exc:
                # PRINCIPLE 2: assertion failure is structured data
                test_results.append({
                    "test_name": fn_name,
                    "passed": False,
                    "error_message": _tail_truncate(
                        f"AssertionError: {exc}" if str(exc) else "AssertionError (no message)"
                    ),
                })
            except Exception:  # noqa: BLE001
                # PRINCIPLE 2: all other exceptions also become structured data
                test_results.append({
                    "test_name": fn_name,
                    "passed": False,
                    "error_message": _tail_truncate(traceback.format_exc()),
                })

    except Exception:  # noqa: BLE001
        # Catch-all for any unexpected failure in the harness itself
        traceback.print_exc(file=buf)
    finally:
        sys.stdout, sys.stderr = old_stdout, old_stderr

    captured = _tail_truncate(buf.getvalue())
    result_queue.put((captured, test_results, had_syntax_error))
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
# ---------------------------------------------------------------------------
|
| 237 |
+
# Public API
|
| 238 |
+
# ---------------------------------------------------------------------------
|
| 239 |
+
|
| 240 |
+
def check_syntax(source: str) -> Tuple[bool, str]:
    """
    Fast syntax check via ast.parse() — no execution, no subprocess overhead.

    Returns (is_valid, error_description); the description is "" when the
    source parses. Called on every observation build to keep the syntax_error
    field current, so it reports unparsable input as data instead of raising.
    """
    try:
        ast.parse(source)
    except SyntaxError as exc:
        return False, f"SyntaxError at line {exc.lineno}: {exc.msg}"
    except ValueError as exc:
        # compile()/ast.parse() raise ValueError rather than SyntaxError for
        # some malformed input (e.g. embedded NUL bytes on older interpreters).
        return False, f"ValueError: {exc}"
    return True, ""
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
def run_code_with_tests(
    source: str,
    test_callables: List[Callable],
    timeout: int = EXEC_TIMEOUT_SECONDS,
) -> Tuple[str, List[TestResult], bool]:
    """
    Execute `source` in a child process and run each test callable against it.

    The test callables are shipped to the child as source text (obtained with
    inspect.getsource) and the child's result tuple comes back over a
    multiprocessing.Queue.

    PRINCIPLE 8 — hard timeout enforced via multiprocessing:
        no result within `timeout` seconds → SIGTERM → SIGKILL → proceed.

    PRINCIPLE 2 — all outcomes return as data:
        timeout     → ("⏱ timed out", [], False)
        dead proc   → ("process exited unexpectedly", [], False)
        normal run  → (stdout_stderr, [TestResult...], had_syntax_error)

    The result is read from the queue BEFORE the child is joined: a child
    that has put data on a queue does not exit until that data is consumed,
    so joining first can stall until the timeout for large results.

    Returns
    -------
    (output_str, test_results, had_syntax_error)
    """
    # Local imports: only this function needs them.
    import queue
    import time

    # Serialise callables → source strings (required for pickling across processes)
    test_sources = [
        textwrap.dedent(inspect.getsource(fn))
        for fn in test_callables
    ]

    q: multiprocessing.Queue = multiprocessing.Queue()
    proc = multiprocessing.Process(
        target=_worker,
        args=(source, test_sources, q),
        daemon=True,   # Dies automatically if parent exits
    )
    proc.start()

    # Wait for the result in short slices so that a child which dies without
    # reporting is noticed immediately instead of after the full timeout.
    payload = None
    deadline = time.monotonic() + timeout
    while payload is None:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        try:
            payload = q.get(timeout=min(0.05, remaining))
        except queue.Empty:
            if proc.is_alive():
                continue
            # Child is gone: one last look in case its result landed between
            # the poll above and the liveness check.
            try:
                payload = q.get(timeout=0.1)
            except queue.Empty:
                pass
            break

    timed_out = payload is None and proc.is_alive()
    if not timed_out:
        proc.join(2)   # Result received (or child dead): let it finish exiting

    # PRINCIPLE 8 — hard kill (SIGTERM first, SIGKILL if still alive)
    if proc.is_alive():
        proc.terminate()
        proc.join(2)          # Give it 2s to handle SIGTERM gracefully
        if proc.is_alive():
            proc.kill()       # SIGKILL — unconditional
            proc.join()

    if timed_out:
        return (
            f"⏱ Execution timed out after {timeout}s. "
            "Your code contains an infinite loop or is too slow. "
            "Fix the logic and try again.",
            [],
            False,
        )

    if payload is None:
        return "Process exited unexpectedly with no output.", [], False

    raw_output, raw_results, syntax_err = payload
    test_results = [TestResult(**r) for r in raw_results]
    return raw_output, test_results, syntax_err
|
{my_env/server → server}/Dockerfile
RENAMED
|
File without changes
|
{my_env/server → server}/__init__.py
RENAMED
|
File without changes
|
{my_env/server → server}/app.py
RENAMED
|
@@ -4,29 +4,7 @@
|
|
| 4 |
# This source code is licensed under the BSD-style license found in the
|
| 5 |
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
-
"""
|
| 8 |
-
FastAPI application for the My Env Environment.
|
| 9 |
-
|
| 10 |
-
This module creates an HTTP server that exposes the MyEnvironment
|
| 11 |
-
over HTTP and WebSocket endpoints, compatible with EnvClient.
|
| 12 |
-
|
| 13 |
-
Endpoints:
|
| 14 |
-
- POST /reset: Reset the environment
|
| 15 |
-
- POST /step: Execute an action
|
| 16 |
-
- GET /state: Get current environment state
|
| 17 |
-
- GET /schema: Get action/observation schemas
|
| 18 |
-
- WS /ws: WebSocket endpoint for persistent sessions
|
| 19 |
-
|
| 20 |
-
Usage:
|
| 21 |
-
# Development (with auto-reload):
|
| 22 |
-
uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
|
| 23 |
-
|
| 24 |
-
# Production:
|
| 25 |
-
uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
|
| 26 |
-
|
| 27 |
-
# Or run directly:
|
| 28 |
-
python -m server.app
|
| 29 |
-
"""
|
| 30 |
|
| 31 |
try:
|
| 32 |
from openenv.core.env_server.http_server import create_app
|
|
@@ -36,49 +14,36 @@ except Exception as e: # pragma: no cover
|
|
| 36 |
) from e
|
| 37 |
|
| 38 |
try:
|
| 39 |
-
from ..models import
|
| 40 |
from .my_env_environment import MyEnvironment
|
| 41 |
-
except
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
from server.my_env_environment import MyEnvironment
|
| 44 |
|
| 45 |
|
| 46 |
# Create the app with web interface and README integration
|
| 47 |
app = create_app(
|
| 48 |
MyEnvironment,
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
env_name="
|
| 52 |
max_concurrent_envs=1, # increase this number to allow more concurrent WebSocket sessions
|
| 53 |
)
|
| 54 |
|
| 55 |
|
| 56 |
-
def main(
|
| 57 |
-
"""
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
This function enables running the server without Docker:
|
| 61 |
-
uv run --project . server
|
| 62 |
-
uv run --project . server --port 8001
|
| 63 |
-
python -m my_env.server.app
|
| 64 |
-
|
| 65 |
-
Args:
|
| 66 |
-
host: Host address to bind to (default: "0.0.0.0")
|
| 67 |
-
port: Port number to listen on (default: 8000)
|
| 68 |
-
|
| 69 |
-
For production deployments, consider using uvicorn directly with
|
| 70 |
-
multiple workers:
|
| 71 |
-
uvicorn my_env.server.app:app --workers 4
|
| 72 |
-
"""
|
| 73 |
import uvicorn
|
| 74 |
|
|
|
|
|
|
|
| 75 |
uvicorn.run(app, host=host, port=port)
|
| 76 |
|
| 77 |
|
| 78 |
if __name__ == "__main__":
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
parser = argparse.ArgumentParser()
|
| 82 |
-
parser.add_argument("--port", type=int, default=8000)
|
| 83 |
-
args = parser.parse_args()
|
| 84 |
-
main(port=args.port)
|
|
|
|
| 4 |
# This source code is licensed under the BSD-style license found in the
|
| 5 |
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
+
"""FastAPI entry point for the Python Debugging Gym OpenEnv environment."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
try:
|
| 10 |
from openenv.core.env_server.http_server import create_app
|
|
|
|
| 14 |
) from e
|
| 15 |
|
| 16 |
try:
|
| 17 |
+
from ..models import CodeAction, CodeObservation
|
| 18 |
from .my_env_environment import MyEnvironment
|
| 19 |
+
except ImportError:
|
| 20 |
+
import sys
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
|
| 23 |
+
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
| 24 |
+
from models import CodeAction, CodeObservation
|
| 25 |
from server.my_env_environment import MyEnvironment
|
| 26 |
|
| 27 |
|
| 28 |
# Create the app with web interface and README integration
|
| 29 |
app = create_app(
|
| 30 |
MyEnvironment,
|
| 31 |
+
CodeAction,
|
| 32 |
+
CodeObservation,
|
| 33 |
+
env_name="python_debugging_gym",
|
| 34 |
max_concurrent_envs=1, # increase this number to allow more concurrent WebSocket sessions
|
| 35 |
)
|
| 36 |
|
| 37 |
|
| 38 |
+
def main() -> None:
    """Start uvicorn serving `app`, bound to the HOST / PORT environment variables.

    HOST defaults to "0.0.0.0" and PORT to 8000 when unset.
    """
    import os

    import uvicorn

    env = os.environ
    bind_host = env.get("HOST", "0.0.0.0")
    bind_port = int(env.get("PORT", "8000"))
    uvicorn.run(app, host=bind_host, port=bind_port)
|
| 46 |
|
| 47 |
|
| 48 |
if __name__ == "__main__":
|
| 49 |
+
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
{my_env/server → server}/my_env_environment.py
RENAMED
|
@@ -4,40 +4,21 @@
|
|
| 4 |
# This source code is licensed under the BSD-style license found in the
|
| 5 |
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
-
"""
|
| 8 |
-
My Env Environment Implementation.
|
| 9 |
-
|
| 10 |
-
A simple test environment that echoes back messages sent to it.
|
| 11 |
-
Perfect for testing HTTP server infrastructure.
|
| 12 |
-
"""
|
| 13 |
-
|
| 14 |
-
from uuid import uuid4
|
| 15 |
|
| 16 |
from openenv.core.env_server.interfaces import Environment
|
| 17 |
from openenv.core.env_server.types import State
|
| 18 |
|
| 19 |
try:
|
| 20 |
-
from ..
|
|
|
|
| 21 |
except ImportError:
|
| 22 |
-
from
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
class MyEnvironment(Environment):
|
| 26 |
-
"""
|
| 27 |
-
A simple echo environment that echoes back messages.
|
| 28 |
-
|
| 29 |
-
This environment is designed for testing the HTTP server infrastructure.
|
| 30 |
-
It maintains minimal state and simply echoes back whatever message it receives.
|
| 31 |
-
|
| 32 |
-
Example:
|
| 33 |
-
>>> env = MyEnvironment()
|
| 34 |
-
>>> obs = env.reset()
|
| 35 |
-
>>> print(obs.echoed_message) # "My Env environment ready!"
|
| 36 |
-
>>>
|
| 37 |
-
>>> obs = env.step(MyAction(message="Hello"))
|
| 38 |
-
>>> print(obs.echoed_message) # "Hello"
|
| 39 |
-
>>> print(obs.message_length) # 5
|
| 40 |
-
"""
|
| 41 |
|
| 42 |
# Enable concurrent WebSocket sessions.
|
| 43 |
# Set to True if your environment isolates state between instances.
|
|
@@ -46,52 +27,32 @@ class MyEnvironment(Environment):
|
|
| 46 |
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
| 47 |
|
| 48 |
def __init__(self):
|
| 49 |
-
|
| 50 |
-
self._state = State(episode_id=
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
Returns:
|
| 58 |
-
MyObservation with a ready message
|
| 59 |
-
"""
|
| 60 |
-
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 61 |
-
self._reset_count += 1
|
| 62 |
-
|
| 63 |
-
return MyObservation(
|
| 64 |
-
echoed_message="My Env environment ready!",
|
| 65 |
-
message_length=0,
|
| 66 |
-
done=False,
|
| 67 |
-
reward=0.0,
|
| 68 |
)
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
# Simple reward: longer messages get higher rewards
|
| 86 |
-
reward = length * 0.1
|
| 87 |
-
|
| 88 |
-
return MyObservation(
|
| 89 |
-
echoed_message=message,
|
| 90 |
-
message_length=length,
|
| 91 |
-
done=False,
|
| 92 |
-
reward=reward,
|
| 93 |
-
metadata={"original_message": message, "step": self._state.step_count},
|
| 94 |
)
|
|
|
|
| 95 |
|
| 96 |
@property
|
| 97 |
def state(self) -> State:
|
|
|
|
| 4 |
# This source code is licensed under the BSD-style license found in the
|
| 5 |
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
+
"""OpenEnv adapter around the PythonDebuggingGym core environment."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
from openenv.core.env_server.interfaces import Environment
|
| 10 |
from openenv.core.env_server.types import State
|
| 11 |
|
| 12 |
try:
|
| 13 |
+
from ..environment import PythonDebuggingGym
|
| 14 |
+
from ..models import CodeAction, CodeObservation
|
| 15 |
except ImportError:
|
| 16 |
+
from environment import PythonDebuggingGym
|
| 17 |
+
from models import CodeAction, CodeObservation
|
| 18 |
|
| 19 |
|
| 20 |
class MyEnvironment(Environment):
|
| 21 |
+
"""Environment implementation compatible with OpenEnv's server interface."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# Enable concurrent WebSocket sessions.
|
| 24 |
# Set to True if your environment isolates state between instances.
|
|
|
|
| 27 |
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
| 28 |
|
| 29 |
def __init__(self):
|
| 30 |
+
self._gym = PythonDebuggingGym()
|
| 31 |
+
self._state = State(episode_id="", step_count=0)
|
| 32 |
+
|
| 33 |
+
def reset(self) -> CodeObservation:
|
| 34 |
+
obs, system_prompt = self._gym.reset()
|
| 35 |
+
self._state = State(
|
| 36 |
+
episode_id=obs.info.get("episode_id", ""),
|
| 37 |
+
step_count=obs.step_count,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
)
|
| 39 |
+
metadata = dict(obs.metadata or {})
|
| 40 |
+
metadata["system_prompt"] = system_prompt
|
| 41 |
+
obs.metadata = metadata
|
| 42 |
+
return obs
|
| 43 |
+
|
| 44 |
+
def step(self, action: CodeAction) -> CodeObservation: # type: ignore[override]
|
| 45 |
+
obs, reward, done, info = self._gym.step(action)
|
| 46 |
+
obs.reward = reward
|
| 47 |
+
obs.done = done
|
| 48 |
+
metadata = dict(obs.metadata or {})
|
| 49 |
+
metadata.update(info)
|
| 50 |
+
obs.metadata = metadata
|
| 51 |
+
self._state = State(
|
| 52 |
+
episode_id=obs.info.get("episode_id", ""),
|
| 53 |
+
step_count=obs.step_count,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
)
|
| 55 |
+
return obs
|
| 56 |
|
| 57 |
@property
|
| 58 |
def state(self) -> State:
|
{my_env/server → server}/requirements.txt
RENAMED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
openenv[core]>=0.2.0
|
| 2 |
fastapi>=0.115.0
|
| 3 |
uvicorn>=0.24.0
|
| 4 |
-
|
|
|
|
| 5 |
|
| 6 |
|
|
|
|
| 1 |
openenv[core]>=0.2.0
|
| 2 |
fastapi>=0.115.0
|
| 3 |
uvicorn>=0.24.0
|
| 4 |
+
openai>=1.30.0
|
| 5 |
+
websockets>=12.0
|
| 6 |
|
| 7 |
|
tasks.py
ADDED
|
@@ -0,0 +1,683 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
tasks.py — Static Task Registry
|
| 3 |
+
================================
|
| 4 |
+
This is a "dumb" registry. Tasks are hardcoded dicts representing
|
| 5 |
+
curated buggy programs generated offline via MutationEngine.
|
| 6 |
+
|
| 7 |
+
Exported symbols:
|
| 8 |
+
TASKS_BY_DIFFICULTY Dict[str, List[Dict]] — tasks grouped by tier
|
| 9 |
+
ALL_TASKS List[Dict] — flat list for random sampling
|
| 10 |
+
|
| 11 |
+
Run mutation_engine.py + dataset_generator.py locally (offline) to
|
| 12 |
+
generate new candidates, curate the best ones, and add them here.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
from typing import Any, Callable, Dict, List
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# ---------------------------------------------------------------------------
|
| 20 |
+
# Test helpers (module-level; accept namespace dict, raise AssertionError)
|
| 21 |
+
# ---------------------------------------------------------------------------
|
| 22 |
+
|
| 23 |
+
# ── sum_even_numbers ────────────────────────────────────────────────────────
|
| 24 |
+
|
| 25 |
+
def _tse_1(ns):
|
| 26 |
+
res = ns["sum_even_numbers"]([1, 2, 3, 4])
|
| 27 |
+
assert res == 6, f"Test failed: input=[1, 2, 3, 4], expected=6, got={res}"
|
| 28 |
+
def _tse_2(ns):
|
| 29 |
+
res = ns["sum_even_numbers"]([])
|
| 30 |
+
assert res == 0, f"Test failed: input=[], expected=0, got={res}"
|
| 31 |
+
def _tse_3(ns):
|
| 32 |
+
res = ns["sum_even_numbers"]([1, 3, 5])
|
| 33 |
+
assert res == 0, f"Test failed: input=[1, 3, 5], expected=0, got={res}"
|
| 34 |
+
def _tse_4(ns):
|
| 35 |
+
res = ns["sum_even_numbers"]([2, 2, 2])
|
| 36 |
+
assert res == 6, f"Test failed: input=[2, 2, 2], expected=6, got={res}"
|
| 37 |
+
|
| 38 |
+
# ── reverse_string ──────────────────────────────────────────────────────────
|
| 39 |
+
|
| 40 |
+
def _trs_1(ns):
|
| 41 |
+
res = ns["reverse_string"]("abc")
|
| 42 |
+
assert res == "cba", f"Test failed: input='abc', expected='cba', got={res!r}"
|
| 43 |
+
def _trs_2(ns):
|
| 44 |
+
res = ns["reverse_string"]("")
|
| 45 |
+
assert res == "", f"Test failed: input='', expected='', got={res!r}"
|
| 46 |
+
def _trs_3(ns):
|
| 47 |
+
res = ns["reverse_string"]("a")
|
| 48 |
+
assert res == "a", f"Test failed: input='a', expected='a', got={res!r}"
|
| 49 |
+
def _trs_4(ns):
|
| 50 |
+
res = ns["reverse_string"]("abcd")
|
| 51 |
+
assert res == "dcba", f"Test failed: input='abcd', expected='dcba', got={res!r}"
|
| 52 |
+
|
| 53 |
+
# ── binary_search ───────────────────────────────────────────────────────────
|
| 54 |
+
|
| 55 |
+
def _tbs_1(ns):
|
| 56 |
+
res = ns["binary_search"]([1, 2, 3, 4, 5], 3)
|
| 57 |
+
assert res == 2, f"Test failed: input=([1, 2, 3, 4, 5], 3), expected=2, got={res}"
|
| 58 |
+
def _tbs_2(ns):
|
| 59 |
+
res = ns["binary_search"]([1, 2, 3, 4, 5], 6)
|
| 60 |
+
assert res == -1, f"Test failed: input=([1, 2, 3, 4, 5], 6), expected=-1, got={res}"
|
| 61 |
+
def _tbs_3(ns):
|
| 62 |
+
res = ns["binary_search"]([], 1)
|
| 63 |
+
assert res == -1, f"Test failed: input=([], 1), expected=-1, got={res}"
|
| 64 |
+
def _tbs_4(ns):
|
| 65 |
+
res = ns["binary_search"]([7], 7)
|
| 66 |
+
assert res == 0, f"Test failed: input=([7], 7), expected=0, got={res}"
|
| 67 |
+
|
| 68 |
+
# ── flatten ─────────────────────────────────────────────────────────────────
|
| 69 |
+
|
| 70 |
+
def _tfl_1(ns):
|
| 71 |
+
res = ns["flatten"]([1, [2, 3]])
|
| 72 |
+
assert res == [1, 2, 3], f"Test failed: input=[1, [2, 3]], expected=[1, 2, 3], got={res}"
|
| 73 |
+
def _tfl_2(ns):
|
| 74 |
+
res = ns["flatten"]([])
|
| 75 |
+
assert res == [], f"Test failed: input=[], expected=[], got={res}"
|
| 76 |
+
def _tfl_3(ns):
|
| 77 |
+
res = ns["flatten"]([1, [2, [3]]])
|
| 78 |
+
assert res == [1, 2, 3], f"Test failed: input=[1, [2, [3]]], expected=[1, 2, 3], got={res}"
|
| 79 |
+
def _tfl_4(ns):
|
| 80 |
+
res = ns["flatten"]([[1], [2, 3], [4]])
|
| 81 |
+
assert res == [1, 2, 3, 4], f"Test failed: input=[[1], [2, 3], [4]], expected=[1, 2, 3, 4], got={res}"
|
| 82 |
+
|
| 83 |
+
# ── word_count ──────────────────────────────────────────────────────────────
|
| 84 |
+
|
| 85 |
+
def _twc_1(ns):
|
| 86 |
+
res = ns["word_count"]("hello world hello")
|
| 87 |
+
assert res == {"hello": 2, "world": 1}, f"Test failed: input='hello world hello', expected={{'hello': 2, 'world': 1}}, got={res}"
|
| 88 |
+
def _twc_2(ns):
|
| 89 |
+
res = ns["word_count"]("Hi, hi!")
|
| 90 |
+
assert res == {"hi": 2}, f"Test failed: input='Hi, hi!', expected={{'hi': 2}}, got={res}"
|
| 91 |
+
def _twc_3(ns):
|
| 92 |
+
res = ns["word_count"]("")
|
| 93 |
+
assert res == {}, f"Test failed: input='', expected={{}}, got={res}"
|
| 94 |
+
def _twc_4(ns):
|
| 95 |
+
res = ns["word_count"]("Hello HELLO hello")
|
| 96 |
+
assert res == {"hello": 3}, f"Test failed: input='Hello HELLO hello', expected={{'hello': 3}}, got={res}"
|
| 97 |
+
|
| 98 |
+
# ── lru_cache ───────────────────────────────────────────────────────────────
|
| 99 |
+
|
| 100 |
+
def _tlru_1(ns):
|
| 101 |
+
C = ns["LRUCache"]
|
| 102 |
+
c = C(2); c.put(1, 1); c.put(2, 2)
|
| 103 |
+
res = c.get(1)
|
| 104 |
+
assert res == 1, f"Test failed: Capacity 2. Added (1,1), then (2,2). Expected get(1) to be 1, got {res}"
|
| 105 |
+
|
| 106 |
+
def _tlru_2(ns):
|
| 107 |
+
C = ns["LRUCache"]
|
| 108 |
+
c = C(1); c.put(1, 1); c.put(2, 2)
|
| 109 |
+
res = c.get(1)
|
| 110 |
+
assert res == -1, f"Test failed: Capacity 1. Added (1,1), then (2,2). Expected key 1 to be evicted (return -1), got {res}"
|
| 111 |
+
|
| 112 |
+
def _tlru_3(ns):
|
| 113 |
+
C = ns["LRUCache"]
|
| 114 |
+
c = C(2); c.put(1, 1); c.put(2, 2); c.get(1); c.put(3, 3)
|
| 115 |
+
res = c.get(2)
|
| 116 |
+
assert res == -1, f"Test failed: Capacity 2. Added (1,1), then (2,2), got(1), added (3,3). Expected key 2 to be evicted (return -1) since 1 was promoted during get(1), got {res}. Did you promote key 1 during get()?"
|
| 117 |
+
|
| 118 |
+
# ── valid_parentheses ────────────────────────────────────────────────────────
|
| 119 |
+
|
| 120 |
+
def _tvp_1(ns):
|
| 121 |
+
res = ns["is_valid"]("()")
|
| 122 |
+
assert res == True, f"Test failed: input='()', expected=True, got={res}"
|
| 123 |
+
def _tvp_2(ns):
|
| 124 |
+
res = ns["is_valid"]("(]")
|
| 125 |
+
assert res == False, f"Test failed: input='(]', expected=False, got={res}"
|
| 126 |
+
def _tvp_3(ns):
|
| 127 |
+
res = ns["is_valid"]("([{}])")
|
| 128 |
+
assert res == True, f"Test failed: input='([{{}}])', expected=True, got={res}"
|
| 129 |
+
def _tvp_4(ns):
    """Check that ``is_valid`` accepts the empty string.

    Args:
        ns: Namespace mapping holding the function under test under the
            key ``"is_valid"``.

    Raises:
        AssertionError: If the result does not compare equal to ``True``.
    """
    res = ns["is_valid"]("")
    assert res == True, f"Test failed: input='', expected=True, got={res}"
|
| 132 |
+
|
| 133 |
+
# ── merge_intervals ──────────────────────────────────────────────────────────
|
| 134 |
+
|
| 135 |
+
def _tmi_1(ns):
    """Check that two overlapping intervals are merged into one.

    Args:
        ns: Namespace mapping holding the function under test under the
            key ``"merge_intervals"``.

    Raises:
        AssertionError: If the result is not ``[[1, 6]]``.
    """
    res = ns["merge_intervals"]([[1, 3], [2, 6]])
    assert res == [[1, 6]], f"Test failed: input=[[1, 3], [2, 6]], expected=[[1, 6]], got={res}"
|
| 138 |
+
def _tmi_2(ns):
    """Check that intervals that merely touch at an endpoint are merged.

    Args:
        ns: Namespace mapping holding the function under test under the
            key ``"merge_intervals"``.

    Raises:
        AssertionError: If the result is not ``[[1, 5]]``.
    """
    res = ns["merge_intervals"]([[1, 4], [4, 5]])
    assert res == [[1, 5]], f"Test failed: input=[[1, 4], [4, 5]], expected=[[1, 5]], got={res}"
|
| 141 |
+
def _tmi_3(ns):
    """Check that disjoint intervals are returned unmerged.

    Args:
        ns: Namespace mapping holding the function under test under the
            key ``"merge_intervals"``.

    Raises:
        AssertionError: If the result is not ``[[1, 2], [3, 4]]``.
    """
    res = ns["merge_intervals"]([[1, 2], [3, 4]])
    assert res == [[1, 2], [3, 4]], f"Test failed: input=[[1, 2], [3, 4]], expected=[[1, 2], [3, 4]], got={res}"
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
# ---------------------------------------------------------------------------
|
| 147 |
+
# Static task registry
|
| 148 |
+
# ---------------------------------------------------------------------------
|
| 149 |
+
|
| 150 |
+
def _t(name, description, code, solution, tests, difficulty, bug_type):
    """Bundle the pieces of one debugging task into a plain dict.

    Args:
        name: Identifier of the task.
        description: Instruction text for the task.
        code: The buggy snippet, one string per source line.
        solution: The reference fix, one string per source line.
        tests: Callables that each take a namespace mapping and assert on
            the object under test.
        difficulty: Difficulty label (``"easy"``, ``"medium"``, ``"hard"``
            in this file).
        bug_type: Label for the kind of bug injected into ``code``.

    Returns:
        A dict with the keys ``name``, ``description``, ``code``,
        ``solution``, ``tests``, ``difficulty`` and ``bug_type``.
    """
    return dict(
        name=name, description=description,
        code=code, solution=solution,
        tests=tests, difficulty=difficulty, bug_type=bug_type,
    )
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
# ── EASY ──────────────────────────────────────────────────────────────────
|
| 159 |
+
|
| 160 |
+
# Easy task: the parity check uses ``!=`` instead of ``==``, so the buggy
# snippet adds up the odd numbers rather than the even ones.
TASK_SUM_EVEN_WRONG_OP = _t(
    name="sum_even_wrong_condition",
    description="Debug the sum_even_numbers function so it passes all tests.",
    difficulty="easy",
    bug_type="wrong_operator",
    code=[
        "def sum_even_numbers(nums):",
        "    total = 0",
        "    for n in nums:",
        "        if n % 2 != 0:",
        "            total += n",
        "    return total",
    ],
    solution=[
        "def sum_even_numbers(nums):",
        "    total = 0",
        "    for n in nums:",
        "        if n % 2 == 0:",
        "            total += n",
        "    return total",
    ],
    tests=[_tse_1, _tse_2, _tse_3, _tse_4],
)
|
| 183 |
+
|
| 184 |
+
# Easy task: the accumulator is updated with ``-=`` instead of ``+=``.
# NOTE(review): the constant and task names say "missing init/accumulator",
# but the injected bug is a wrong operator (which matches ``bug_type``).
TASK_SUM_EVEN_MISSING_INIT = _t(
    name="sum_even_missing_accumulator",
    description="Debug the sum_even_numbers function so it passes all tests.",
    difficulty="easy",
    bug_type="wrong_operator",
    code=[
        "def sum_even_numbers(nums):",
        "    total = 0",
        "    for n in nums:",
        "        if n % 2 == 0:",
        "            total -= n",
        "    return total",
    ],
    solution=[
        "def sum_even_numbers(nums):",
        "    total = 0",
        "    for n in nums:",
        "        if n % 2 == 0:",
        "            total += n",
        "    return total",
    ],
    tests=[_tse_1, _tse_2, _tse_3, _tse_4],
)
|
| 207 |
+
|
| 208 |
+
# Easy task: the slice step is -2 instead of -1, so every other character
# is skipped while reversing.
TASK_REVERSE_WRONG_STEP = _t(
    name="reverse_string_wrong_step",
    description="Debug the reverse_string function so it passes all tests.",
    difficulty="easy",
    bug_type="off_by_one",
    code=[
        "def reverse_string(s):",
        "    return s[::-2]",
    ],
    solution=[
        "def reverse_string(s):",
        "    return s[::-1]",
    ],
    tests=[_trs_1, _trs_2, _trs_3, _trs_4],
)
|
| 223 |
+
|
| 224 |
+
# Easy task: the slice step is +1, so the string is returned unchanged.
TASK_REVERSE_NO_REVERSE = _t(
    name="reverse_string_returns_original",
    description="Debug the reverse_string function so it passes all tests.",
    difficulty="easy",
    bug_type="wrong_operator",
    code=[
        "def reverse_string(s):",
        "    return s[::1]",
    ],
    solution=[
        "def reverse_string(s):",
        "    return s[::-1]",
    ],
    tests=[_trs_1, _trs_2, _trs_3, _trs_4],
)
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
# ── MEDIUM ────────────────────────────────────────────────────────────────
|
| 242 |
+
|
| 243 |
+
# Medium task: ``right`` starts at ``len(arr)`` instead of ``len(arr) - 1``,
# so ``mid`` can index one past the end of the list.
TASK_BS_OFF_BY_ONE = _t(
    name="binary_search_off_by_one",
    description="Debug the binary_search function so it passes all tests.",
    difficulty="medium",
    bug_type="off_by_one",
    code=[
        "def binary_search(arr, target):",
        "    left, right = 0, len(arr)",
        "    while left <= right:",
        "        mid = (left + right) // 2",
        "        if arr[mid] == target:",
        "            return mid",
        "        elif arr[mid] < target:",
        "            left = mid + 1",
        "        else:",
        "            right = mid - 1",
        "    return -1",
    ],
    solution=[
        "def binary_search(arr, target):",
        "    left, right = 0, len(arr) - 1",
        "    while left <= right:",
        "        mid = (left + right) // 2",
        "        if arr[mid] == target:",
        "            return mid",
        "        elif arr[mid] < target:",
        "            left = mid + 1",
        "        else:",
        "            right = mid - 1",
        "    return -1",
    ],
    tests=[_tbs_1, _tbs_2, _tbs_3, _tbs_4],
)
|
| 276 |
+
|
| 277 |
+
# Medium task: ``mid`` is computed as ``left + right`` without halving.
# The buggy snippet also carries a bounds guard that returns -1 early; the
# reference solution has neither the wrong midpoint nor the guard.
TASK_BS_WRONG_MID = _t(
    name="binary_search_wrong_mid",
    description="Debug the binary_search function so it passes all tests.",
    difficulty="medium",
    bug_type="wrong_operator",
    code=[
        "def binary_search(arr, target):",
        "    left, right = 0, len(arr) - 1",
        "    while left <= right:",
        "        mid = left + right",
        "        if mid >= len(arr):",
        "            return -1",
        "        if arr[mid] == target:",
        "            return mid",
        "        elif arr[mid] < target:",
        "            left = mid + 1",
        "        else:",
        "            right = mid - 1",
        "    return -1",
    ],
    solution=[
        "def binary_search(arr, target):",
        "    left, right = 0, len(arr) - 1",
        "    while left <= right:",
        "        mid = (left + right) // 2",
        "        if arr[mid] == target:",
        "            return mid",
        "        elif arr[mid] < target:",
        "            left = mid + 1",
        "        else:",
        "            right = mid - 1",
        "    return -1",
    ],
    tests=[_tbs_1, _tbs_2, _tbs_3, _tbs_4],
)
|
| 312 |
+
|
| 313 |
+
# Medium task: nested lists are appended as-is instead of being flattened
# recursively and extended into the result.
TASK_FLATTEN_APPEND = _t(
    name="flatten_missing_recursion",
    description="Debug the flatten function so it passes all tests.",
    difficulty="medium",
    bug_type="wrong_function_call",
    code=[
        "def flatten(lst):",
        "    result = []",
        "    for item in lst:",
        "        if isinstance(item, list):",
        "            result.append(item)",
        "        else:",
        "            result.append(item)",
        "    return result",
    ],
    solution=[
        "def flatten(lst):",
        "    result = []",
        "    for item in lst:",
        "        if isinstance(item, list):",
        "            result.extend(flatten(item))",
        "        else:",
        "            result.append(item)",
        "    return result",
    ],
    tests=[_tfl_1, _tfl_2, _tfl_3, _tfl_4],
)
|
| 340 |
+
|
| 341 |
+
# Medium task: the ``isinstance`` check is negated, so the recursive branch
# runs for non-list items and lists are appended unflattened.
TASK_FLATTEN_LOGIC_INVERT = _t(
    name="flatten_inverted_branch",
    description="Debug the flatten function so it passes all tests.",
    difficulty="medium",
    bug_type="logic_inversion",
    code=[
        "def flatten(lst):",
        "    result = []",
        "    for item in lst:",
        "        if not isinstance(item, list):",
        "            result.extend(flatten(item))",
        "        else:",
        "            result.append(item)",
        "    return result",
    ],
    solution=[
        "def flatten(lst):",
        "    result = []",
        "    for item in lst:",
        "        if isinstance(item, list):",
        "            result.extend(flatten(item))",
        "        else:",
        "            result.append(item)",
        "    return result",
    ],
    tests=[_tfl_1, _tfl_2, _tfl_3, _tfl_4],
)
|
| 368 |
+
|
| 369 |
+
# Medium task: the text is never lower-cased, so differently-cased spellings
# of the same word are counted separately.
TASK_WC_NO_LOWER = _t(
    name="word_count_no_lower",
    description="Debug the word_count function so it passes all tests.",
    difficulty="medium",
    bug_type="missing_return",
    code=[
        "import string",
        "def word_count(text):",
        "    for p in string.punctuation:",
        "        text = text.replace(p, '')",
        "    words = text.split()",
        "    counts = {}",
        "    for w in words:",
        "        counts[w] = counts.get(w, 0) + 1",
        "    return counts",
    ],
    solution=[
        "import string",
        "def word_count(text):",
        "    text = text.lower()",
        "    for p in string.punctuation:",
        "        text = text.replace(p, '')",
        "    words = text.split()",
        "    counts = {}",
        "    for w in words:",
        "        counts[w] = counts.get(w, 0) + 1",
        "    return counts",
    ],
    tests=[_twc_1, _twc_2, _twc_3, _twc_4],
)
|
| 399 |
+
|
| 400 |
+
# Medium task: punctuation is never stripped, so tokens such as "hi," and
# "hi!" are counted as distinct words.
TASK_WC_NO_PUNCT = _t(
    name="word_count_no_punct_strip",
    description="Debug the word_count function so it passes all tests.",
    difficulty="medium",
    bug_type="missing_return",
    code=[
        "def word_count(text):",
        "    text = text.lower()",
        "    words = text.split()",
        "    counts = {}",
        "    for w in words:",
        "        counts[w] = counts.get(w, 0) + 1",
        "    return counts",
    ],
    solution=[
        "import string",
        "def word_count(text):",
        "    text = text.lower()",
        "    for p in string.punctuation:",
        "        text = text.replace(p, '')",
        "    words = text.split()",
        "    counts = {}",
        "    for w in words:",
        "        counts[w] = counts.get(w, 0) + 1",
        "    return counts",
    ],
    tests=[_twc_1, _twc_2, _twc_3, _twc_4],
)
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
# ── HARD ──────────────────────────────────────────────────────────────────
|
| 431 |
+
|
| 432 |
+
# Hard task: the capacity check in ``put`` is off by one (``>`` instead of
# ``>=``), so the cache holds one entry more than its capacity before it
# starts evicting.
# The buggy snippet was previously identical to the solution, which made the
# task already solved on reset; the bug below is what differs from it.
TASK_LRU_WRONG_EVICT = _t(
    name="lru_cache_wrong_eviction",
    description="Debug the LRUCache function so it passes all tests.",
    difficulty="hard",
    bug_type="off_by_one",
    code=[
        "class LRUCache:",
        "    def __init__(self, capacity):",
        "        self.capacity = capacity",
        "        self.cache = []",
        "    def get(self, key):",
        "        for i, (k, v) in enumerate(self.cache):",
        "            if k == key:",
        "                self.cache.append(self.cache.pop(i))",
        "                return v",
        "        return -1",
        "    def put(self, key, value):",
        "        for i, (k, _) in enumerate(self.cache):",
        "            if k == key:",
        "                self.cache.pop(i)",
        "                break",
        "        if len(self.cache) > self.capacity:",
        "            self.cache.pop(0)",
        "        self.cache.append((key, value))",
    ],
    solution=[
        "class LRUCache:",
        "    def __init__(self, capacity):",
        "        self.capacity = capacity",
        "        self.cache = []",
        "    def get(self, key):",
        "        for i, (k, v) in enumerate(self.cache):",
        "            if k == key:",
        "                self.cache.append(self.cache.pop(i))",
        "                return v",
        "        return -1",
        "    def put(self, key, value):",
        "        for i, (k, _) in enumerate(self.cache):",
        "            if k == key:",
        "                self.cache.pop(i)",
        "                break",
        "        if len(self.cache) >= self.capacity:",
        "            self.cache.pop(0)",
        "        self.cache.append((key, value))",
    ],
    tests=[_tlru_1, _tlru_2, _tlru_3],
)
|
| 479 |
+
|
| 480 |
+
# Hard task: ``get`` returns the value without moving the entry to the end
# of the list, so a read does not protect a key from eviction.
TASK_LRU_NO_PROMOTE = _t(
    name="lru_cache_no_promotion",
    description="Debug the LRUCache function so it passes all tests.",
    difficulty="hard",
    bug_type="missing_return",
    code=[
        "class LRUCache:",
        "    def __init__(self, capacity):",
        "        self.capacity = capacity",
        "        self.cache = []",
        "    def get(self, key):",
        "        for i, (k, v) in enumerate(self.cache):",
        "            if k == key:",
        "                return v",
        "        return -1",
        "    def put(self, key, value):",
        "        for i, (k, _) in enumerate(self.cache):",
        "            if k == key:",
        "                self.cache.pop(i)",
        "                break",
        "        if len(self.cache) >= self.capacity:",
        "            self.cache.pop(0)",
        "        self.cache.append((key, value))",
    ],
    solution=[
        "class LRUCache:",
        "    def __init__(self, capacity):",
        "        self.capacity = capacity",
        "        self.cache = []",
        "    def get(self, key):",
        "        for i, (k, v) in enumerate(self.cache):",
        "            if k == key:",
        "                self.cache.append(self.cache.pop(i))",
        "                return v",
        "        return -1",
        "    def put(self, key, value):",
        "        for i, (k, _) in enumerate(self.cache):",
        "            if k == key:",
        "                self.cache.pop(i)",
        "                break",
        "        if len(self.cache) >= self.capacity:",
        "            self.cache.pop(0)",
        "        self.cache.append((key, value))",
    ],
    tests=[_tlru_1, _tlru_2, _tlru_3],
)
|
| 526 |
+
|
| 527 |
+
# Hard task: the closer-to-opener mapping pairs every closing bracket with
# the wrong opening bracket.
TASK_VP_WRONG_MAPPING = _t(
    name="valid_parentheses_wrong_mapping",
    description="Debug the is_valid function so it passes all tests.",
    difficulty="hard",
    bug_type="wrong_operator",
    code=[
        "def is_valid(s):",
        "    stack = []",
        "    mapping = {')': '[', ']': '{', '}': '('}",
        "    for c in s:",
        "        if c in mapping.values():",
        "            stack.append(c)",
        "        elif c in mapping:",
        "            if not stack or stack.pop() != mapping[c]:",
        "                return False",
        "    return len(stack) == 0",
    ],
    solution=[
        "def is_valid(s):",
        "    stack = []",
        "    mapping = {')': '(', ']': '[', '}': '{'}",
        "    for c in s:",
        "        if c in mapping.values():",
        "            stack.append(c)",
        "        elif c in mapping:",
        "            if not stack or stack.pop() != mapping[c]:",
        "                return False",
        "    return len(stack) == 0",
    ],
    tests=[_tvp_1, _tvp_2, _tvp_3, _tvp_4],
)
|
| 558 |
+
|
| 559 |
+
def _tvp_unmatched_closer(ns):
    """Check that a closing bracket with no pending opener is rejected.

    This is the only input shape that exercises the missing ``not stack``
    guard: the buggy snippet calls ``stack.pop()`` on an empty list here.

    Args:
        ns: Namespace mapping holding the function under test under the
            key ``"is_valid"``.

    Raises:
        AssertionError: If the result does not compare equal to ``False``.
    """
    res = ns["is_valid"](")")
    assert res == False, f"Test failed: input=')', expected=False, got={res}"


# Hard task: the ``not stack`` guard before ``stack.pop()`` is missing.
# The four shared is_valid tests never present a closer on an empty stack,
# so the buggy snippet passed all of them; the extra test above makes the
# bug observable.
TASK_VP_MISSING_EMPTY_CHECK = _t(
    name="valid_parentheses_no_empty_check",
    description="Debug the is_valid function so it passes all tests.",
    difficulty="hard",
    bug_type="logic_inversion",
    code=[
        "def is_valid(s):",
        "    stack = []",
        "    mapping = {')': '(', ']': '[', '}': '{'}",
        "    for c in s:",
        "        if c in mapping.values():",
        "            stack.append(c)",
        "        elif c in mapping:",
        "            if stack.pop() != mapping[c]:",
        "                return False",
        "    return len(stack) == 0",
    ],
    solution=[
        "def is_valid(s):",
        "    stack = []",
        "    mapping = {')': '(', ']': '[', '}': '{'}",
        "    for c in s:",
        "        if c in mapping.values():",
        "            stack.append(c)",
        "        elif c in mapping:",
        "            if not stack or stack.pop() != mapping[c]:",
        "                return False",
        "    return len(stack) == 0",
    ],
    tests=[_tvp_1, _tvp_2, _tvp_3, _tvp_4, _tvp_unmatched_closer],
)
|
| 590 |
+
|
| 591 |
+
# Hard task: the "start a new interval" check uses ``<=`` instead of ``<``,
# so intervals that only touch at an endpoint (e.g. [1, 4] and [4, 5]) are
# not merged.
# The two snippets were previously inverted: the "buggy" code passed every
# merge_intervals test while the "solution" (``<=`` with ``min``) failed on
# both overlapping and touching intervals.
TASK_MI_STRICT_OVERLAP = _t(
    name="merge_intervals_strict_overlap",
    description="Debug the merge_intervals function so it passes all tests.",
    difficulty="hard",
    bug_type="wrong_operator",
    code=[
        "def merge_intervals(intervals):",
        "    intervals.sort()",
        "    merged = []",
        "    for interval in intervals:",
        "        if not merged or merged[-1][1] <= interval[0]:",
        "            merged.append(list(interval))",
        "        else:",
        "            merged[-1][1] = max(merged[-1][1], interval[1])",
        "    return merged",
    ],
    solution=[
        "def merge_intervals(intervals):",
        "    intervals.sort()",
        "    merged = []",
        "    for interval in intervals:",
        "        if not merged or merged[-1][1] < interval[0]:",
        "            merged.append(list(interval))",
        "        else:",
        "            merged[-1][1] = max(merged[-1][1], interval[1])",
        "    return merged",
    ],
    tests=[_tmi_1, _tmi_2, _tmi_3],
)
|
| 620 |
+
|
| 621 |
+
def _tmi_unsorted(ns):
    """Check that overlapping intervals given out of order are still merged.

    Args:
        ns: Namespace mapping holding the function under test under the
            key ``"merge_intervals"``.

    Raises:
        AssertionError: If the result is not ``[[1, 6]]``.
    """
    res = ns["merge_intervals"]([[2, 6], [1, 3]])
    assert res == [[1, 6]], f"Test failed: input=[[2, 6], [1, 3]], expected=[[1, 6]], got={res}"


# Hard task: the input is never sorted before the merge pass.
# The three shared merge_intervals tests all use already-sorted input, so the
# buggy snippet passed every one of them; the extra test above supplies
# unsorted input to make the bug observable.
TASK_MI_NO_SORT = _t(
    name="merge_intervals_missing_sort",
    description="Debug the merge_intervals function so it passes all tests.",
    difficulty="hard",
    bug_type="missing_return",
    code=[
        "def merge_intervals(intervals):",
        "    merged = []",
        "    for interval in intervals:",
        "        if not merged or merged[-1][1] < interval[0]:",
        "            merged.append(list(interval))",
        "        else:",
        "            merged[-1][1] = max(merged[-1][1], interval[1])",
        "    return merged",
    ],
    solution=[
        "def merge_intervals(intervals):",
        "    intervals.sort()",
        "    merged = []",
        "    for interval in intervals:",
        "        if not merged or merged[-1][1] < interval[0]:",
        "            merged.append(list(interval))",
        "        else:",
        "            merged[-1][1] = max(merged[-1][1], interval[1])",
        "    return merged",
    ],
    tests=[_tmi_1, _tmi_2, _tmi_3, _tmi_unsorted],
)
|
| 649 |
+
|
| 650 |
+
|
| 651 |
+
# ---------------------------------------------------------------------------
|
| 652 |
+
# Registries
|
| 653 |
+
# ---------------------------------------------------------------------------
|
| 654 |
+
|
| 655 |
+
# Task dicts grouped by difficulty label. The keys match the ``difficulty``
# value each task was built with.
TASKS_BY_DIFFICULTY: Dict[str, List[Dict]] = {
    "easy": [
        TASK_SUM_EVEN_WRONG_OP,
        TASK_SUM_EVEN_MISSING_INIT,
        TASK_REVERSE_WRONG_STEP,
        TASK_REVERSE_NO_REVERSE,
    ],
    "medium": [
        TASK_BS_OFF_BY_ONE,
        TASK_BS_WRONG_MID,
        TASK_FLATTEN_APPEND,
        TASK_FLATTEN_LOGIC_INVERT,
        TASK_WC_NO_LOWER,
        TASK_WC_NO_PUNCT,
    ],
    "hard": [
        TASK_LRU_WRONG_EVICT,
        TASK_LRU_NO_PROMOTE,
        TASK_VP_WRONG_MAPPING,
        TASK_VP_MISSING_EMPTY_CHECK,
        TASK_MI_STRICT_OVERLAP,
        TASK_MI_NO_SORT,
    ],
}
|
| 679 |
+
|
| 680 |
+
# Flat list — used for random sampling when training_step is not set
|
| 681 |
+
# Buckets are concatenated in the dict's insertion order: easy, medium, hard.
ALL_TASKS: List[Dict] = [
    t for bucket in TASKS_BY_DIFFICULTY.values() for t in bucket
]
|
my_env/uv.lock → uv.lock
RENAMED
|
File without changes
|