databoysu committed on
Commit
5813a84
·
1 Parent(s): fdc5ba1
README.md ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Python Debugging Gym
3
+ emoji: 🐛
4
+ colorFrom: blue
5
+ colorTo: cyan
6
+ sdk: docker
7
+ pinned: false
8
+ app_port: 8000
9
+ base_path: /web
10
+ tags:
11
+ - openenv
12
+ - reinforcement-learning
13
+ - code-generation
14
+ ---
15
+
16
+ # Python Debugging Gym
17
+
18
+ An OpenEnv-compatible RL environment where agents debug broken Python code by
19
+ iteratively viewing, editing, and testing code snippets until all tests pass.
20
+
21
+ ## Environment Overview
22
+
23
+ - Action space:
24
+ `VIEW_CODE`, `RUN_TESTS`, `REPLACE_LINES`, `UNDO_EDIT`, `RESET_TO_ORIGINAL`, `SUBMIT`
25
+ - Observation includes:
26
+ `code_lines`, `localized_context`, `last_execution_output`, `syntax_error`, `test_results`
27
+ - Dense reward with step cost and final score on submit.
28
+
29
+ ## Local Run
30
+
31
+ ```bash
32
+ uv sync
33
+ uv run --project . server --port 8000
34
+ ```
35
+
36
+ Server endpoints:
37
+ - `POST /reset`
38
+ - `POST /step`
39
+ - `GET /health`
40
+ - `WS /ws`
41
+ - `GET /web` (OpenEnv web UI)
42
+
43
+ ## Deploy to Hugging Face Spaces
44
+
45
+ ```bash
46
+ openenv push
47
+ ```
48
+
49
+ ## Validate Submission
50
+
51
+ From repo root (`RL_ENV_FINAL`):
52
+
53
+ ```bash
54
+ ./pre-val.sh https://<your-space>.hf.space ./my_env
55
+ ```
my_env/__init__.py → __init__.py RENAMED
@@ -4,13 +4,14 @@
4
  # This source code is licensed under the BSD-style license found in the
5
  # LICENSE file in the root directory of this source tree.
6
 
7
- """My Env Environment."""
8
 
9
  from .client import MyEnv
10
- from .models import MyAction, MyObservation
11
 
12
  __all__ = [
13
- "MyAction",
14
- "MyObservation",
 
15
  "MyEnv",
16
  ]
 
4
  # This source code is licensed under the BSD-style license found in the
5
  # LICENSE file in the root directory of this source tree.
6
 
7
+ """Python Debugging Gym OpenEnv package."""
8
 
9
  from .client import MyEnv
10
+ from .models import CodeAction, CodeObservation, TestResult
11
 
12
  __all__ = [
13
+ "CodeAction",
14
+ "CodeObservation",
15
+ "TestResult",
16
  "MyEnv",
17
  ]
my_env/client.py → client.py RENAMED
@@ -4,7 +4,7 @@
4
  # This source code is licensed under the BSD-style license found in the
5
  # LICENSE file in the root directory of this source tree.
6
 
7
- """My Env Environment Client."""
8
 
9
  from typing import Dict
10
 
@@ -12,11 +12,11 @@ from openenv.core import EnvClient
12
  from openenv.core.client_types import StepResult
13
  from openenv.core.env_server.types import State
14
 
15
- from .models import MyAction, MyObservation
16
 
17
 
18
  class MyEnv(
19
- EnvClient[MyAction, MyObservation, State]
20
  ):
21
  """
22
  Client for the My Env Environment.
@@ -44,7 +44,7 @@ class MyEnv(
44
  ... client.close()
45
  """
46
 
47
- def _step_payload(self, action: MyAction) -> Dict:
48
  """
49
  Convert MyAction to JSON payload for step message.
50
 
@@ -54,13 +54,11 @@ class MyEnv(
54
  Returns:
55
  Dictionary representation suitable for JSON encoding
56
  """
57
- return {
58
- "message": action.message,
59
- }
60
 
61
- def _parse_result(self, payload: Dict) -> StepResult[MyObservation]:
62
  """
63
- Parse server response into StepResult[MyObservation].
64
 
65
  Args:
66
  payload: JSON response data from server
@@ -69,9 +67,18 @@ class MyEnv(
69
  StepResult with MyObservation
70
  """
71
  obs_data = payload.get("observation", {})
72
- observation = MyObservation(
73
- echoed_message=obs_data.get("echoed_message", ""),
74
- message_length=obs_data.get("message_length", 0),
 
 
 
 
 
 
 
 
 
75
  done=payload.get("done", False),
76
  reward=payload.get("reward"),
77
  metadata=obs_data.get("metadata", {}),
 
4
  # This source code is licensed under the BSD-style license found in the
5
  # LICENSE file in the root directory of this source tree.
6
 
7
+ """Client for the Python Debugging Gym OpenEnv environment."""
8
 
9
  from typing import Dict
10
 
 
12
  from openenv.core.client_types import StepResult
13
  from openenv.core.env_server.types import State
14
 
15
+ from .models import CodeAction, CodeObservation, TestResult
16
 
17
 
18
  class MyEnv(
19
+ EnvClient[CodeAction, CodeObservation, State]
20
  ):
21
  """
22
  Client for the My Env Environment.
 
44
  ... client.close()
45
  """
46
 
47
+ def _step_payload(self, action: CodeAction) -> Dict:
48
  """
49
  Convert MyAction to JSON payload for step message.
50
 
 
54
  Returns:
55
  Dictionary representation suitable for JSON encoding
56
  """
57
+ return action.model_dump(exclude_none=True)
 
 
58
 
59
+ def _parse_result(self, payload: Dict) -> StepResult[CodeObservation]:
60
  """
61
+ Parse server response into StepResult[CodeObservation].
62
 
63
  Args:
64
  payload: JSON response data from server
 
67
  StepResult with MyObservation
68
  """
69
  obs_data = payload.get("observation", {})
70
+ observation = CodeObservation(
71
+ code_lines=obs_data.get("code_lines", []),
72
+ localized_context=obs_data.get("localized_context", ""),
73
+ last_execution_output=obs_data.get("last_execution_output", ""),
74
+ syntax_error=obs_data.get("syntax_error", False),
75
+ test_results=[
76
+ TestResult(**item) for item in obs_data.get("test_results", [])
77
+ ],
78
+ step_count=obs_data.get("step_count", 0),
79
+ steps_remaining=obs_data.get("steps_remaining", 0),
80
+ reward_last_step=obs_data.get("reward_last_step", 0.0),
81
+ info=obs_data.get("info", {}),
82
  done=payload.get("done", False),
83
  reward=payload.get("reward"),
84
  metadata=obs_data.get("metadata", {}),
context.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ context.py — Layered Context Compaction
3
+ =========================================
4
+
5
+ PRINCIPLE 10 — Layered Context Compaction
6
+ For large files, returning the full source on every observation would rapidly
7
+ fill the agent's context window, leaving no room for reasoning.
8
+
9
+ Instead we return a *localized* view: a ±WINDOW_LINES slice of the code
10
+ centred on the last line that was edited. This gives the agent exactly the
11
+ context it needs — the neighbourhood of its most recent change — without
12
+ flooding the context with unrelated code.
13
+
14
+ This module is intentionally pure (no environment state dependencies) so
15
+ it can be unit-tested independently and reused across environment versions.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from typing import List, Optional
21
+
22
# How many lines above and below the anchor to include
WINDOW_LINES: int = 10

# Maximum characters for the localized context block
# (Principle 9: all outputs must be bounded)
MAX_CONTEXT_CHARS: int = 2_000


def get_localized_context(
    code_lines: List[str],
    anchor_line: Optional[int],
    window: int = WINDOW_LINES,
) -> str:
    """
    Return a ±`window`-line slice of `code_lines` centred on `anchor_line`.

    Parameters
    ----------
    code_lines  : Full list of source lines (0-indexed internally).
    anchor_line : The 1-indexed line number of the most recent edit.
                  If None (no edits yet) returns an empty string.
                  Out-of-range values are clamped into [1, len(code_lines)].
    window      : Number of lines to show above and below the anchor.
                  Negative values are treated as 0 (anchor line only).

    Returns
    -------
    A formatted string with line numbers, never longer than
    MAX_CONTEXT_CHARS (the truncation notice is counted against the cap),
    annotated with the visible range and an anchor marker (▶).
    Returns "" when `anchor_line` is None or `code_lines` is empty.

    Example output (window=2, anchor_line=7, 12 lines in the file)
    ---------------------------------------------------------------
    [Showing lines 5–9 of 12, anchor ▶ line 7]
       5 | mid = (left + right) // 2
       6 | if arr[mid] == target:
       7 ▶ return mid
       8 | elif arr[mid] < target:
       9 | left = mid + 1
    """
    if anchor_line is None or not code_lines:
        return ""

    total = len(code_lines)

    # A negative window would produce an inverted (empty) range.
    window = max(0, window)

    # Clamp anchor into valid range
    anchor_0 = max(0, min(anchor_line - 1, total - 1))

    # Compute slice bounds (inclusive on both ends, 0-indexed)
    start_0 = max(0, anchor_0 - window)
    end_0 = min(total - 1, anchor_0 + window)

    # Build header. Report the *clamped* anchor so the header always agrees
    # with the line that actually carries the ▶ marker in the body.
    header = (
        f"[Showing lines {start_0 + 1}–{end_0 + 1} of {total}, "
        f"anchor ▶ line {anchor_0 + 1}]"
    )

    # Build body
    body_lines = [
        f"{i + 1:>4} {'▶' if i == anchor_0 else '|'} {code_lines[i]}"
        for i in range(start_0, end_0 + 1)
    ]

    result = header + "\n" + "\n".join(body_lines)

    # PRINCIPLE 9 — hard cap on output size. Room for the truncation notice
    # is reserved so the final string never exceeds MAX_CONTEXT_CHARS.
    if len(result) > MAX_CONTEXT_CHARS:
        suffix = "\n... [context truncated]"
        keep = max(0, MAX_CONTEXT_CHARS - len(suffix))
        result = result[:keep] + suffix

    return result
environment.py ADDED
@@ -0,0 +1,613 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ environment.py — Python Debugging Gym (Core RL Environment)
3
+ =============================================================
4
+
5
+ PRINCIPLE 1 — You Don't Design the Control Flow
6
+ The agent decides the sequence of actions. step() is a pure router:
7
+ it receives whatever action the agent chose (in whatever order),
8
+ processes it, and returns the new state. There is no forced sequence,
9
+ no "you must VIEW_CODE before RUN_TESTS" gate. The system prompt
10
+ explains what tools exist; the agent decides how to use them.
11
+
12
+ PRINCIPLE 5 — Cost-Per-Turn Reward Logic
13
+ Each call to step() costs R_STEP_COST = -0.01. This makes the episode
14
+ a multi-turn budget problem: the agent is rewarded for solving quickly.
15
+ An agent that solves in 4 steps scores ~0.14 more than one that takes
16
+ 18 steps to reach the same solution.
17
+
18
+ PRINCIPLE 7 — The Prompt is Code
19
+ The string returned by reset() is the agent's complete operational
20
+ contract for the session. It states: the goal, the available actions
21
+ (with exact JSON examples), the reward structure, the current code,
22
+ and the expected termination condition. Ambiguity in this string
23
+ directly causes off-task behaviour.
24
+
25
+ PRINCIPLE 10 — Layered Context Compaction
26
+ _build_observation() tracks `_last_edited_line` and passes it to
27
+ context.get_localized_context() to produce a focused ±10-line view
28
+ after each write action. This prevents the observation from inflating
29
+ the agent's context window on large files.
30
+
31
+ Reward table (dense, non-sparse — every step emits a signal):
32
+ +1.00 SUBMIT and ALL tests pass → episode solved
33
+ +0.10 RUN_TESTS called → information-gathering rewarded
34
+ +0.05 Per test transitioning fail→pass on a RUN_TESTS or SUBMIT
35
+ -0.01 Every step taken → efficiency pressure (Principle 5)
36
+ -0.10 Syntax error detected → broken code penalised immediately
37
+ -0.10 UNDO_EDIT or RESET_TO_ORIGINAL → backtracking discouraged
38
+ -0.02 Invalid line range supplied → hallucination deterrent
39
+ -0.20 SUBMIT with tests still failing
40
+
41
+ Max episode length: 50 steps.
42
+ """
43
+
44
+ from __future__ import annotations
45
+
46
+ import random
47
+ import uuid
48
+ from typing import Any, Dict, List, Optional, Tuple
49
+
50
+ try:
51
+ from .context import get_localized_context
52
+ from .models import CodeAction, CodeObservation, TestResult
53
+ from .sandbox import check_syntax, run_code_with_tests
54
+ from .tasks import ALL_TASKS, TASKS_BY_DIFFICULTY
55
+ except ImportError:
56
+ from context import get_localized_context
57
+ from models import CodeAction, CodeObservation, TestResult
58
+ from sandbox import check_syntax, run_code_with_tests
59
+ from tasks import ALL_TASKS, TASKS_BY_DIFFICULTY
60
+
61
+
62
+ # ---------------------------------------------------------------------------
63
+ # Reward constants
64
+ # ---------------------------------------------------------------------------
65
+
66
+ R_SUBMIT_ALL_PASS = +1.00
67
+ R_SUBMIT_FAIL = -0.20
68
+ R_SYNTAX_ERROR = -0.10
69
+ R_RUN_TESTS = +0.10
70
+ R_PER_NEW_PASS = +0.05
71
+ R_STEP_COST = -0.01 # PRINCIPLE 5 — every step has a cost
72
+ R_INVALID_LINE = -0.02
73
+ R_DESTRUCTIVE_PENALTY = -0.20
74
+ R_UNDO_RESET = -0.10 # Mini-Git backtracking penalty
75
+
76
+ MAX_STEPS: int = 50
77
+
78
+
79
+ # ---------------------------------------------------------------------------
80
+ # System Prompt (PRINCIPLE 7 — The Prompt is Code)
81
+ # ---------------------------------------------------------------------------
82
+ # This string is the agent's entire operational contract.
83
+ # It must be:
84
+ # • Self-contained (no assumed context from training data)
85
+ # • Precise (exact JSON examples, not vague descriptions)
86
+ # • Non-directive about sequence (Principle 1: agent chooses order)
87
+ # • Complete (goal, tools, rewards, termination — nothing omitted)
88
+
89
+ _SYSTEM_PROMPT = """\
90
+ ╔══════════════════════════════════════════════════════╗
91
+ ║ PYTHON DEBUGGING GYM — EPISODE BRIEF ║
92
+ ╚══════════════════════════════════════════════════════╝
93
+
94
+ GOAL
95
+ ----
96
+ The Python source file shown below contains one or more bugs.
97
+ Your task is to find and fix every bug so that ALL unit tests pass, then
98
+ call SUBMIT to end the episode.
99
+
100
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
101
+ STANDARD OPERATING PROCEDURE (follow this state machine)
102
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
103
+
104
+ STEP 1 — ORIENT : Call VIEW_CODE to read the full file with line numbers.
105
+ STEP 2 — DIAGNOSE : Call RUN_TESTS to get the exact error message and traceback.
106
+ STEP 3 — FIX : Call REPLACE_LINES to correct the identified bug.
107
+ (Use UNDO_EDIT if the edit made things worse.)
108
+ STEP 4 — VERIFY : Call RUN_TESTS again to confirm the fix worked.
109
+ STEP 5 — REPEAT : If tests still fail, return to STEP 1 and re-read the code.
110
+ STEP 6 — SUBMIT : Once ALL tests pass, call SUBMIT.
111
+
112
+ ⚠ Do NOT call VIEW_CODE more than once in a row. Each VIEW_CODE costs -0.01.
113
+ If you have already viewed the code, call RUN_TESTS next, not VIEW_CODE again.
114
+
115
+ ⚠ THE ESCAPE HATCH RULE: If an edit results in a syntax error or an indentation error,
116
+ DO NOT try to manually fix spaces. IMMEDIATELY use UNDO_EDIT or RESET_TO_ORIGINAL.
117
+
118
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
119
+ AVAILABLE TOOLS (send one JSON object per turn)
120
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
121
+
122
+ 1. VIEW_CODE — see the full file with line numbers
123
+ {{"thought": "<your reasoning>", "action_type": "VIEW_CODE"}}
124
+
125
+ 2. RUN_TESTS — execute all unit tests; see pass/fail + output
126
+ {{"thought": "<your reasoning>", "action_type": "RUN_TESTS"}}
127
+
128
+ 3. REPLACE_LINES — replace a contiguous block of lines (start to end, inclusive)
129
+ {{"thought": "<your reasoning>", "action_type": "REPLACE_LINES", "start_line": 3, "end_line": 5, "new_code_block": " x = 1\\n return x"}}
130
+ ⚠ start_line and end_line are 1-indexed and INCLUSIVE.
131
+ ⚠ new_code_block is a single string; separate lines with \\n (no trailing \\n).
132
+ ⚠ Indentation is syntax in Python — include the correct leading spaces on every line.
133
+ ⚠ The file grows or shrinks when the new block has more/fewer lines than the range.
134
+ ⚠ After REPLACE_LINES, call RUN_TESTS (not VIEW_CODE) to verify the fix.
135
+
136
+ 4. UNDO_EDIT — revert to the state before the most recent REPLACE_LINES (-0.10 penalty)
137
+ {{"thought": "<your reasoning>", "action_type": "UNDO_EDIT"}}
138
+ Use when an edit made things worse and you want to try a different approach.
139
+ No-op (with penalty) if there is no edit history.
140
+
141
+ 5. RESET_TO_ORIGINAL — restore the pristine broken code from episode start (-0.10 penalty)
142
+ {{"thought": "<your reasoning>", "action_type": "RESET_TO_ORIGINAL"}}
143
+ Last resort only. Clears all undo history. Resets context anchor.
144
+
145
+ 6. SUBMIT — declare the fix complete; ends the episode
146
+ {{"thought": "<your reasoning>", "action_type": "SUBMIT"}}
147
+ Only call SUBMIT when RUN_TESTS has confirmed ALL tests pass.
148
+ The episode ends immediately on SUBMIT, pass or fail.
149
+
150
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
151
+ REWARD SIGNALS (visible in observation.reward_last_step)
152
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
153
+
154
+ +1.00 SUBMIT and all tests pass ← primary objective
155
+ +0.10 RUN_TESTS called (any outcome) ← gathering info is good
156
+ +0.05 Per test newly passing vs last run ← incremental progress
157
+ -0.01 Every step taken ← solve efficiently
158
+ -0.10 Syntax error in current code ← fix broken syntax first
159
+ -0.10 UNDO_EDIT or RESET_TO_ORIGINAL ← backtracking is expensive
160
+ -0.02 Invalid line range sent ← use VIEW_CODE to check range
161
+ -0.20 SUBMIT with tests still failing ← verify before submitting
162
+
163
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
164
+ EPISODE PARAMETERS
165
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
166
+
167
+ Task : {task_name} ({difficulty})
168
+ Unit tests : {test_count} tests — ALL must pass
169
+ Max steps : {max_steps} (episode terminates at 0 steps remaining)
170
+
171
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
172
+ CURRENT CODE (this is the broken version — fix it)
173
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
174
+
175
+ {code_preview}
176
+ """
177
+
178
+
179
+ # ---------------------------------------------------------------------------
180
+ # Environment
181
+ # ---------------------------------------------------------------------------
182
+
183
class PythonDebuggingGym:
    """
    RL environment in which an agent debugs broken Python code.

    PRINCIPLE 1: step() is a pure router — the agent chooses the
    sequence. No internal gates, no forced ordering between actions.
    (The environment itself is stateful: all episode state lives on the
    instance and is wiped by reset().)

    Interface
    ---------
    obs, system_prompt = env.reset()
    obs, reward, done, info = env.step(action: CodeAction)
    """

    metadata = {"name": "PythonDebuggingGym-v1", "render_modes": []}

    def __init__(
        self,
        task_index: Optional[int] = None,
        seed: Optional[int] = None,
    ):
        """
        Create the environment.

        Parameters
        ----------
        task_index : Optional index into ALL_TASKS. When given, every
                     reset() uses that task unless reset() is passed its
                     own override.
        seed       : Seed for the task-sampling RNG.
        """
        self._task_index = task_index
        self._rng = random.Random(seed)

        # All mutable episode state lives here; reset() wipes every field.
        self._code_lines: List[str] = []
        self._task: Dict[str, Any] = {}
        self._step_count: int = 0
        self._prev_pass_count: int = 0
        self._last_test_results: List[TestResult] = []
        self._last_output: str = ""
        self._last_edited_line: Optional[int] = None  # PRINCIPLE 10
        self._episode_id: str = ""
        self._done: bool = False
        self._cumulative_reward: float = 0.0
        self._accumulated_step_costs: float = 0.0  # Hackathon compliance
        # Mini-Git snapshot history (Phase 2)
        self._original_code: List[str] = []  # pristine copy set at reset()
        self._edit_history: List[List[str]] = []  # stack of pre-edit snapshots
        # Anti-Loop history. Declared here (not only in reset()) so the
        # attributes exist for the whole lifetime of the instance.
        self._last_action: Optional[str] = None
        self._consecutive_count: int = 0
        # Curriculum learning — persists across episodes, incremented externally
        self.training_step: int = 0

    # ── Curriculum task sampler ──────────────────────────────────────────────

    def _sample_task(self, task_override=None) -> Dict[str, Any]:
        """
        Evaluation-safe curriculum sampler.

        Priority order:
        1. task_override dict → return it directly (test/eval pinning)
        2. integer index      → ALL_TASKS[index]; taken from task_override,
                                or from the task_index given to __init__
                                when no override is supplied
        3. training_step == 0 → randomly sample from ALL_TASKS (default when
                                the caller never sets training_step)
        4. training_step > 0  → curriculum bucketing:
                                  < 1000  → easy
                                  < 5000  → medium
                                  >= 5000 → hard
           Falls back to any non-empty bucket if the target bucket is empty.

        Raises
        ------
        IndexError   : a pinned integer index is outside ALL_TASKS.
        RuntimeError : no task is available to sample from.
        """
        if isinstance(task_override, dict):
            return task_override

        # Integer pinning: an explicit override wins over the constructor's
        # task_index. Without this, both values were silently ignored.
        pinned = task_override if task_override is not None else self._task_index
        if isinstance(pinned, int) and not isinstance(pinned, bool):
            if not 0 <= pinned < len(ALL_TASKS):
                raise IndexError(
                    f"task_index {pinned} is out of range for "
                    f"{len(ALL_TASKS)} available task(s)."
                )
            return ALL_TASKS[pinned]

        # Judge-safe default: no training_step set → random from all tasks
        if self.training_step == 0:
            if not ALL_TASKS:
                raise RuntimeError("ALL_TASKS is empty — check tasks.py.")
            return self._rng.choice(ALL_TASKS)

        # Curriculum mode (trainer increments training_step between episodes)
        if self.training_step < 1000:
            bucket = "easy"
        elif self.training_step < 5000:
            bucket = "medium"
        else:
            bucket = "hard"

        pool = TASKS_BY_DIFFICULTY.get(bucket, [])
        if not pool:
            # Fallback: any non-empty bucket rather than crashing
            for b in ("easy", "medium", "hard"):
                pool = TASKS_BY_DIFFICULTY.get(b, [])
                if pool:
                    break
        if not pool:
            raise RuntimeError("TASKS_BY_DIFFICULTY is entirely empty — check tasks.py.")

        return self._rng.choice(pool)

    # ── reset() ─────────────────────────────────────────────────────────────

    def reset(
        self, *, task_index: Optional[int] = None
    ) -> Tuple[CodeObservation, str]:
        """
        Wipe all episode state; select a task; return initial observation + prompt.

        Parameters
        ----------
        task_index : Optional task pin for this episode (an index into
                     ALL_TASKS, or a full task dict). Overrides the
                     task_index given to __init__.

        State isolation guarantee: every mutable episode field is explicitly
        reset here. The one deliberate exception is the RNG: it is created
        once in __init__ and is NOT re-seeded, so successive resets draw
        successive tasks from the same seeded stream.
        """
        self._task = self._sample_task(task_index)

        # ── Complete state wipe ──────────────────────────────────────────
        self._code_lines = list(self._task["code"])  # copy — no alias
        self._step_count = 0
        self._prev_pass_count = 0
        self._last_test_results = []
        self._last_output = ""
        self._last_edited_line = None  # no edits yet — localized_context will be empty
        self._episode_id = str(uuid.uuid4())[:8]
        self._done = False
        self._cumulative_reward = 0.0
        self._accumulated_step_costs = 0.0
        # Mini-Git: seed pristine snapshot and clear history
        self._original_code = list(self._task["code"])  # separate copy from _code_lines
        self._edit_history = []
        # Anti-Loop history
        self._last_action = None
        self._consecutive_count = 0

        obs = self._build_observation(reward=0.0)

        # PRINCIPLE 7: build the operational contract string
        system_prompt = _SYSTEM_PROMPT.format(
            task_name=self._task["name"],
            difficulty=self._task.get("difficulty", "unknown"),
            test_count=len(self._task["tests"]),
            max_steps=MAX_STEPS,
            code_preview=obs.render_code(),
        )

        return obs, system_prompt

    # ── step() ──────────────────────────────────────────────────────────────

    def step(
        self, action: CodeAction
    ) -> Tuple[CodeObservation, float, bool, Dict[str, Any]]:
        """
        PRINCIPLE 1 — Pure router. Accept any valid action in any order.

        The only sequencing constraint is that SUBMIT ends the episode.
        All other actions can be called in any combination and in any order.
        step() does NOT enforce a workflow — it applies the action and returns
        the resulting state for the agent to reason about.

        PRINCIPLE 5 — R_STEP_COST is applied before routing so it is
        impossible to take a "free" step — every turn has a cost.

        Returns
        -------
        (observation, reward, done, info). `info` carries "final_score"
        (clamped to [0.0, 1.0]) once the episode is done.

        Raises
        ------
        RuntimeError : step() is called before reset(), or after the
                       episode has finished.
        """
        if not self._task:
            # Fail with a clear message instead of a KeyError further down.
            raise RuntimeError(
                "step() called before reset(). Call reset() first."
            )
        if self._done:
            raise RuntimeError(
                "step() called on a finished episode. Call reset() first."
            )

        self._step_count += 1
        reward = R_STEP_COST  # PRINCIPLE 5: cost-per-turn baseline
        self._accumulated_step_costs += abs(R_STEP_COST)  # Hackathon compliance

        # ── Route (PRINCIPLE 1: no forced sequence) ──────────────────────
        atype = action.action_type

        # ── Repetition Penalty (Anti-Loop) ───────────────────────────────
        # The penalty grows linearly with the length of the current run of
        # identical actions.
        if atype == self._last_action:
            self._consecutive_count += 1
            reward += -0.05 * self._consecutive_count
        else:
            self._consecutive_count = 0
        self._last_action = atype

        if atype == "VIEW_CODE":
            reward += self._act_view_code()

        elif atype == "RUN_TESTS":
            reward += self._act_run_tests()

        elif atype == "REPLACE_LINES":
            reward += self._act_replace_lines(
                action.start_line, action.end_line, action.new_code_block
            )

        elif atype == "UNDO_EDIT":
            reward += self._act_undo_edit()

        elif atype == "RESET_TO_ORIGINAL":
            reward += self._act_reset_to_original()

        elif atype == "SUBMIT":
            reward += self._act_submit()
            self._done = True

        else:
            # Unknown action: the step still costs R_STEP_COST, but tell the
            # agent why nothing happened instead of failing silently.
            self._last_output = (
                f"Error: unknown action_type {atype!r}. Valid actions: "
                "VIEW_CODE, RUN_TESTS, REPLACE_LINES, UNDO_EDIT, "
                "RESET_TO_ORIGINAL, SUBMIT."
            )

        # ── Max-steps termination ────────────────────────────────────────
        if self._step_count >= MAX_STEPS and not self._done:
            self._done = True
            # Deterministic clamp — never trust the LLM to call SUBMIT.
            # Evaluate the current code and produce a valid [0.0, 1.0] score
            # regardless of how the episode ended. This REPLACES the reward
            # computed above for the final step.
            _, results, syntax_err = run_code_with_tests(
                source=self._source(),
                test_callables=self._task["tests"],
            )
            total = len(results)
            passes = 0 if syntax_err else sum(1 for t in results if t.passed)
            raw = (passes / total if total > 0 else 0.0) - self._accumulated_step_costs
            reward = max(0.0, min(1.0, raw))
            self._last_output += (
                f"\n⚠ Max steps ({MAX_STEPS}) reached. "
                f"Auto-evaluated: {passes}/{total} tests passing. "
                f"Final score: {reward:.4f}"
            )

        self._cumulative_reward += reward
        obs = self._build_observation(reward=reward)
        info = {
            "episode_id": self._episode_id,
            "task": self._task["name"],
            "cumulative_reward": round(self._cumulative_reward, 4),
            "step": self._step_count,
        }
        if self._done:
            # The reported final score must be strictly within [0.0, 1.0].
            # On SUBMIT the step reward can be negative (step cost and
            # repetition penalty are added to the submit score), so clamp.
            info["final_score"] = max(0.0, min(1.0, round(reward, 4)))

        return obs, round(reward, 4), self._done, info

    # ── Action handlers ─────────────────────────────────────────────────────
    # Each returns the delta reward (R_STEP_COST already applied by step()).
    # Handlers update self._last_output and self._last_edited_line as needed.

    def _act_view_code(self) -> float:
        """Render the whole file with 1-indexed line numbers; reward delta 0."""
        self._last_output = (
            "=== Full Source ===\n" +
            "\n".join(
                f"{i + 1:>3} | {line}"
                for i, line in enumerate(self._code_lines)
            )
        )
        # VIEW_CODE does not change the code — localized_context stays where it was
        return 0.0

    def _act_run_tests(self) -> float:
        """
        Run the task's tests against the current code.

        Returns R_RUN_TESTS, plus R_SYNTAX_ERROR on a syntax error, or plus
        R_PER_NEW_PASS for each test passing beyond the previous run's count.
        """
        output, results, syntax_err = run_code_with_tests(
            source=self._source(),
            test_callables=self._task["tests"],
        )
        self._last_output = output
        self._last_test_results = results

        reward = R_RUN_TESTS  # information-gathering bonus

        if syntax_err:
            reward += R_SYNTAX_ERROR
        else:
            current_pass = sum(1 for t in results if t.passed)
            new_passes = max(0, current_pass - self._prev_pass_count)
            reward += new_passes * R_PER_NEW_PASS
            self._prev_pass_count = current_pass

        return reward

    def _act_replace_lines(
        self,
        start_line: Optional[int],
        end_line: Optional[int],
        new_code_block: Optional[str],
    ) -> float:
        """
        Replace lines start_line..end_line (1-indexed, inclusive) with
        `new_code_block` split on newlines.

        Returns 0.0 on success, R_INVALID_LINE for a missing / inverted /
        out-of-bounds range, and R_DESTRUCTIVE_PENALTY when an empty block
        would wipe more than 5 lines. The code is untouched on any error.
        """
        n = len(self._code_lines)

        if new_code_block is None:
            new_code_block = ""

        # ── Guard: missing range ──────────────────────────────────────────
        # Checked first: the arithmetic below would raise TypeError on None.
        if start_line is None or end_line is None:
            self._last_output = (
                "Error: REPLACE_LINES requires both start_line and end_line. "
                "Call VIEW_CODE to check the current line count."
            )
            return R_INVALID_LINE

        # ── Guard: Destructive Action (Anti-Deletion) ─────────────────────
        # The range is inclusive, so it covers end - start + 1 lines.
        if len(new_code_block) == 0 and (end_line - start_line + 1) > 5:
            self._last_output = "Error: Cannot delete more than 5 lines at once."
            return R_DESTRUCTIVE_PENALTY

        # ── Guard: inverted range ─────────────────────────────────────────
        if start_line > end_line:
            self._last_output = (
                f"Error: start_line ({start_line}) > end_line ({end_line}). "
                "Inverted range rejected. Call VIEW_CODE to check the current line count."
            )
            return R_INVALID_LINE

        # ── Guard: out-of-bounds ──────────────────────────────────────────
        if start_line < 1 or start_line > n:
            self._last_output = (
                f"Error: start_line {start_line} is out of range [1, {n}]. "
                "Call VIEW_CODE to check the current line count."
            )
            return R_INVALID_LINE
        if end_line < 1 or end_line > n:
            self._last_output = (
                f"Error: end_line {end_line} is out of range [1, {n}]. "
                "Call VIEW_CODE to check the current line count."
            )
            return R_INVALID_LINE

        # ── Slice assignment (PRINCIPLE 1: pure data transformation) ──────
        start_idx = start_line - 1  # convert to 0-indexed
        end_idx = end_line  # exclusive upper bound for Python slice

        # ── Mini-Git: snapshot BEFORE mutating (Phase 2) ─────────────────
        self._edit_history.append(list(self._code_lines))

        # NOTE: "".split("\n") is [""], so an empty block leaves one blank
        # line in place of the replaced range.
        new_lines = new_code_block.split("\n")
        self._code_lines[start_idx:end_idx] = new_lines

        # ── Anchor context at END of new block (PRINCIPLE 10) ─────────────
        # If the agent replaces lines 5–10 with 20 new lines, the anchor
        # settles at start_line + len(new_lines) - 1, clamped to file length.
        new_end = start_line + len(new_lines) - 1
        self._last_edited_line = min(new_end, len(self._code_lines))

        replaced_count = end_line - start_line + 1
        self._last_output = (
            f"✏ Replaced lines {start_line}–{end_line} "
            f"({replaced_count} line(s)) with {len(new_lines)} new line(s).\n"
            f"File now has {len(self._code_lines)} lines total. "
            f"Context anchored at line {self._last_edited_line}. "
            "Call VIEW_CODE to re-orient before referencing line numbers."
        )
        return 0.0

    def _act_submit(self) -> float:
        """
        Run the tests one last time and return the final score:
        (tests passed / total) minus accumulated step costs, clamped to
        [0.0, 1.0]. A syntax error counts as zero tests passing.
        """
        output, results, syntax_err = run_code_with_tests(
            source=self._source(),
            test_callables=self._task["tests"],
        )
        self._last_output = output
        self._last_test_results = results

        total = len(results)
        passes = 0 if syntax_err else sum(1 for t in results if t.passed)

        if syntax_err:
            self._last_output += "\n❌ SUBMIT rejected — syntax error in current code."

        # ── Hackathon compliance: final score ∈ [0.0, 1.0] ───────────────
        # raw = (tests_passed / total) - accumulated_step_costs
        # Then clamped so the grader always receives a value in spec.
        proportion = passes / total if total > 0 else 0.0
        raw_score = proportion - self._accumulated_step_costs
        final_score = max(0.0, min(1.0, raw_score))

        if not syntax_err:
            if passes == total:
                self._last_output += (
                    f"\n🎉 ALL {total} TESTS PASS! Episode solved. "
                    f"Final score: {final_score:.4f}"
                )
            else:
                fail_count = total - passes
                self._last_output += (
                    f"\n❌ SUBMIT — {fail_count}/{total} tests still failing. "
                    f"Final score: {final_score:.4f}"
                )

        return final_score

    def _act_undo_edit(self) -> float:
        """
        Mini-Git UNDO: restore the code snapshot from immediately before the
        most recent REPLACE_LINES call. Applies R_UNDO_RESET penalty, also
        when there is nothing to undo.

        CRITICAL (Phase 2, point 4 — Context Desync Watchout):
        _last_edited_line is set to None so context.py does not anchor the
        localized view to a line that may no longer exist or mean the same
        thing after the revert.
        """
        if not self._edit_history:
            self._last_output = (
                "⚠ UNDO_EDIT: no edit history — nothing to revert. "
                "The code is still at its current state."
            )
        else:
            self._code_lines = self._edit_history.pop()
            self._last_output = (
                f"↩ UNDO_EDIT: reverted to previous state "
                f"({len(self._code_lines)} lines). "
                "Call VIEW_CODE to inspect the restored file."
            )

        # PRINCIPLE 10 desync fix: anchor is stale after rollback — wipe it.
        self._last_edited_line = None
        return R_UNDO_RESET

    def _act_reset_to_original(self) -> float:
        """
        Mini-Git RESET: restore the pristine episode-start code and clear the
        entire undo stack. Applies R_UNDO_RESET penalty.

        CRITICAL (Phase 2, point 4 — Context Desync Watchout):
        _last_edited_line is set to None to prevent context.py from anchoring
        to a ghost line in the freshly-restored original code.
        """
        self._code_lines = list(self._original_code)  # fresh copy — no alias
        self._edit_history = []  # clear stack
        self._last_output = (
            f"↺ RESET_TO_ORIGINAL: code restored to pristine episode state "
            f"({len(self._code_lines)} lines). All undo history cleared. "
            "Call VIEW_CODE to inspect the file."
        )

        # PRINCIPLE 10 desync fix: context anchor is meaningless after full reset.
        self._last_edited_line = None
        return R_UNDO_RESET

    # ── Helpers ─────────────────────────────────────────────────────────────

    def _source(self) -> str:
        """Return the current code as one newline-joined string."""
        return "\n".join(self._code_lines)

    def _build_observation(self, reward: float) -> CodeObservation:
        """Snapshot the current episode state into a CodeObservation."""
        syntax_valid, _ = check_syntax(self._source())

        # PRINCIPLE 10: localized context — a window of lines around the last edit
        localized = get_localized_context(self._code_lines, self._last_edited_line)

        return CodeObservation(
            code_lines=list(self._code_lines),
            localized_context=localized,
            last_execution_output=self._last_output,
            syntax_error=not syntax_valid,
            test_results=list(self._last_test_results),
            step_count=self._step_count,
            steps_remaining=max(0, MAX_STEPS - self._step_count),
            reward_last_step=round(reward, 4),
            done=self._done,
            info={
                "episode_id": self._episode_id,
                "task_name": self._task.get("name", ""),
                "task_difficulty": self._task.get("difficulty", ""),
            },
        )
inference.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ inference.py — Baseline Agent for Python Debugging Gym
3
+ =======================================================
4
+ Hackathon-compliant baseline script. Connects to the PythonDebuggingGym
5
+ WebSocket server and drives an OpenAI-compatible LLM to find and fix bugs.
6
+
7
+ Required environment variables:
8
+ HF_TOKEN API key / HuggingFace token passed as Bearer auth
9
+ MODEL_NAME Model identifier (default: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4)
10
+ API_BASE_URL OpenAI-compatible base URL (default: https://api.openai.com/v1)
11
+
12
+ Optional environment variables:
13
+ ENV_WS_URL WebSocket URL for the gym (default: ws://localhost:8000/ws)
14
+
15
+ Mandatory stdout log lines (zero deviation in spacing or formatting):
16
+ [START] task=<task_name> env=PythonDebuggingGym model=<model_name>
17
+ [STEP] step=<n> action=<action_type> reward=<r.rr> done=<true|false> error=<msg|null>
18
+ [END] success=<true|false> steps=<n> score=<s.sss> rewards=<r1,r2,...,rn>
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import asyncio
24
+ import json
25
+ import os
26
+ import sys
27
+ from typing import Any
28
+
29
+ import websockets
30
+ from openai import OpenAI
31
+
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # Config (all readable from environment at import time)
35
+ # ---------------------------------------------------------------------------
36
+
37
+ API_BASE_URL: str = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
38
+ MODEL_NAME: str = os.getenv("MODEL_NAME", "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4")
39
+ HF_TOKEN: str = os.getenv("HF_TOKEN", "")
40
+ ENV_WS_URL: str = os.getenv("ENV_WS_URL", "ws://localhost:8000/ws")
41
+
42
+ # ---------------------------------------------------------------------------
43
+ # OpenAI client
44
+ # ---------------------------------------------------------------------------
45
+
46
+ _client = OpenAI(
47
+ api_key=HF_TOKEN or "sk-placeholder", # placeholder keeps the client from raising at init
48
+ base_url=API_BASE_URL,
49
+ )
50
+
51
+ # ---------------------------------------------------------------------------
52
+ # Agent instruction appended after the environment's own system prompt
53
+ # ---------------------------------------------------------------------------
54
+
55
+ _AGENT_SUFFIX = """\
56
+
57
+ =======================================================================
58
+ RESPONSE FORMAT (MANDATORY)
59
+ =======================================================================
60
+ Respond with ONLY a valid JSON object. No markdown, no code fences,
61
+ no explanation text — just the raw JSON.
62
+
63
+ Valid action schemas (choose exactly one per turn):
64
+ {"action_type": "VIEW_CODE"}
65
+ {"action_type": "RUN_TESTS"}
66
+ {"action_type": "REPLACE_LINES", "start_line": N, "end_line": M, "new_code_block": "line1\\nline2"}
67
+ {"action_type": "UNDO_EDIT"}
68
+ {"action_type": "RESET_TO_ORIGINAL"}
69
+ {"action_type": "SUBMIT"}
70
+
71
+ Rules for REPLACE_LINES:
72
+ - new_code_block: join multiple lines with \\n (literal backslash-n in the JSON string)
73
+ - Include exact Python indentation (leading spaces) on every line
74
+ - Do NOT include a trailing \\n character
75
+ - After REPLACE_LINES, call VIEW_CODE to re-orient before the next edit
76
+
77
+ Rules for UNDO_EDIT / RESET_TO_ORIGINAL:
78
+ - UNDO_EDIT reverts the last REPLACE_LINES. Use when an edit made things worse.
79
+ - RESET_TO_ORIGINAL restores the original broken code. Last resort only.
80
+ - Both cost -0.10. Prefer fixing forward over backtracking.
81
+ """
82
+
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # Observation formatter
86
+ # ---------------------------------------------------------------------------
87
+
88
+ def _format_obs(obs: dict[str, Any]) -> str:
89
+ """Convert a CodeObservation dict into a compact string for the LLM."""
90
+ parts: list[str] = []
91
+
92
+ if obs.get("syntax_error"):
93
+ parts.append("⚠ SYNTAX ERROR in current code — fix indentation/brackets first.\n")
94
+
95
+ localized = obs.get("localized_context", "")
96
+ if localized:
97
+ parts.append(f"[Context around last edit]\n{localized}\n")
98
+
99
+ last_out = obs.get("last_execution_output", "")
100
+ if last_out:
101
+ parts.append(f"[Last execution output]\n{last_out}\n")
102
+
103
+ test_results: list[dict] = obs.get("test_results", [])
104
+ if test_results:
105
+ lines = []
106
+ for t in test_results:
107
+ status = "PASS" if t.get("passed") else "FAIL"
108
+ msg = t.get("error_message") or ""
109
+ name = t.get("test_name", "?")
110
+ lines.append(f" {status} {name}" + (f": {msg}" if msg else ""))
111
+ parts.append("[Test results]\n" + "\n".join(lines) + "\n")
112
+
113
+ remaining = obs.get("steps_remaining", 0)
114
+ parts.append(f"[Steps remaining: {remaining}]")
115
+
116
+ return "\n".join(parts)
117
+
118
+
119
+ # ---------------------------------------------------------------------------
120
+ # LLM call
121
+ # ---------------------------------------------------------------------------
122
+
123
+ _ACTION_SCHEMA = {
124
+ "type": "json_schema",
125
+ "json_schema": {
126
+ "name": "CodeAction",
127
+ "strict": True,
128
+ "schema": {
129
+ "type": "object",
130
+ "properties": {
131
+ "thought": {
132
+ "type": "string",
133
+ "description": "Mandatory reasoning before selecting action_type.",
134
+ },
135
+ "action_type": {
136
+ "type": "string",
137
+ "enum": [
138
+ "VIEW_CODE", "RUN_TESTS", "REPLACE_LINES",
139
+ "UNDO_EDIT", "RESET_TO_ORIGINAL", "SUBMIT",
140
+ ],
141
+ },
142
+ "start_line": {"type": ["integer", "null"]},
143
+ "end_line": {"type": ["integer", "null"]},
144
+ "new_code_block": {"type": ["string", "null"]},
145
+ },
146
+ "required": ["thought", "action_type"],
147
+ "additionalProperties": False,
148
+ },
149
+ },
150
+ }
151
+
152
+
153
def _call_llm(system_prompt: str, messages: list[dict]) -> str:
    """
    Send the conversation to the configured LLM and return its raw text reply.

    The system message is ``system_prompt`` followed by ``_AGENT_SUFFIX``;
    ``messages`` is appended after it unchanged. The request is first made
    with the json_schema ``response_format``; if that call raises for any
    reason, the same request is retried once without ``response_format`` and
    the caller is expected to extract JSON from free-form text.

    Returns:
        The reply content, or the ``reasoning_content`` field when content is
        empty, or "" when neither is available.
    """
    conversation = [{"role": "system", "content": system_prompt + _AGENT_SUFFIX}]
    conversation.extend(messages)
    request: dict = {
        "model": MODEL_NAME,
        "messages": conversation,
        "temperature": 0.0,
    }

    try:
        completion = _client.chat.completions.create(
            **request,
            response_format=_ACTION_SCHEMA,
        )
    except Exception:
        # Best-effort fallback: treat any failure as "structured output not
        # supported" and retry as a plain completion.
        completion = _client.chat.completions.create(**request)

    reply = completion.choices[0].message
    text = reply.content
    if text:
        return text

    # Some reasoning models put their whole output in reasoning_content and
    # leave content empty; look there before giving up.
    try:
        text = reply.model_dump().get("reasoning_content", "") or ""
    except AttributeError:
        pass

    return text or ""
192
+
193
+
194
+ # ---------------------------------------------------------------------------
195
+ # Constrained JSON extraction (works with any local or cloud model)
196
+ # ---------------------------------------------------------------------------
197
+
198
+ def _extract_json(text: str) -> dict:
199
+ """
200
+ Best-effort JSON extraction from raw LLM output.
201
+
202
+ Tries in order:
203
+ 1. Direct json.loads (model produced clean JSON)
204
+ 2. Strip ```json ... ``` / ``` ... ``` markdown fences
205
+ 3. Regex: grab first {...} block in the text
206
+ 4. Safe fallback: {"action_type": "VIEW_CODE"}
207
+ """
208
+ import re
209
+
210
+ # 1. Direct parse
211
+ stripped = text.strip()
212
+ try:
213
+ return json.loads(stripped)
214
+ except json.JSONDecodeError:
215
+ pass
216
+
217
+ # 2. Markdown code fence ```json\n{...}\n```
218
+ fence = re.search(r"```(?:json)?\s*({.*?})\s*```", stripped, re.DOTALL)
219
+ if fence:
220
+ try:
221
+ return json.loads(fence.group(1))
222
+ except json.JSONDecodeError:
223
+ pass
224
+
225
+ # 3. First {...} block anywhere in the text
226
+ brace = re.search(r"({.*?})", stripped, re.DOTALL)
227
+ if brace:
228
+ try:
229
+ return json.loads(brace.group(1))
230
+ except json.JSONDecodeError:
231
+ pass
232
+
233
+ # All extraction attempts failed.
234
+ # Return an invalid action_type so Pydantic rejects it at the server,
235
+ # the server returns an error envelope, and THAT error is fed back to
236
+ # the LLM on the next turn — breaking the silent mask loop.
237
+ # DO NOT default to VIEW_CODE here.
238
+ return {"action_type": "PARSE_ERROR", "thought": f"Failed to parse LLM output as JSON: {text[:120]}"}
239
+
240
+
241
+ # ---------------------------------------------------------------------------
242
+ # Episode runner
243
+ # ---------------------------------------------------------------------------
244
+
245
async def run_episode(
    difficulty: str | None = None,
    show_thought: bool = False,
    max_consecutive_errors: int = 10,
) -> None:
    """
    Connect to the gym, run one full episode with an LLM agent,
    and emit the three required log lines ([START], [STEP]..., [END]).

    Args:
        difficulty: Optional difficulty name appended to the WebSocket URL as
            a ``difficulty`` query parameter.
        show_thought: When True, print the ``thought`` field of each parsed
            action.
        max_consecutive_errors: Stop the episode after this many server error
            envelopes in a row (should be >= 1). Without a bound, a model that
            keeps producing rejected actions would loop forever, because the
            loop otherwise ends only when the server reports ``done``.

    Error feedback: when the LLM call fails or the server rejects an action,
    the error text is prepended to the next observation message so the model
    actually sees what went wrong. The chat history is kept strictly
    alternating user/assistant: an observation that never received a reply is
    removed before the next one is added.
    """
    rewards: list[float] = []
    messages: list[dict] = []
    step: int = 0
    success: bool = False
    done: bool = False
    consecutive_errors: int = 0
    pending_note: str = ""  # error text to show the model on the next turn

    ws_url = ENV_WS_URL
    if difficulty:
        separator = "&" if "?" in ws_url else "?"
        ws_url = f"{ws_url}{separator}difficulty={difficulty}"

    async with websockets.connect(ws_url) as ws:

        # ── Receive initial observation + system prompt ──────────────────
        raw = await ws.recv()
        data = json.loads(raw)

        system_prompt = data.get("info", {}).get("system_prompt", "")
        obs = data.get("observation", {})
        task_name = obs.get("info", {}).get("task_name", "unknown")

        # ── [START] log line ─────────────────────────────────────────────
        print(
            f"[START] task={task_name} env=PythonDebuggingGym model={MODEL_NAME}",
            flush=True,
        )

        # ── RL loop ──────────────────────────────────────────────────────
        while True:
            step += 1
            error_str = "null"
            action_type = "VIEW_CODE"  # will be overwritten by a real parse

            # Build the observation message, led by any error from last turn.
            obs_text = _format_obs(obs)
            if pending_note:
                obs_text = f"{pending_note}\n{obs_text}"
                pending_note = ""
            messages.append({"role": "user", "content": obs_text})

            # Call LLM
            try:
                llm_reply = _call_llm(system_prompt, messages)
                if os.getenv("DEBUG_LOG") == "1":
                    print(f"\n[DEBUG RAW LLM]: {llm_reply}\n", flush=True)
                action_json = _extract_json(llm_reply)
                action_type = action_json.get("action_type", "VIEW_CODE")
                messages.append({"role": "assistant", "content": llm_reply})
            except Exception as exc:
                # The LLM call (or parsing) failed. Send a harmless VIEW_CODE
                # this turn, report the error in the [STEP] line, and carry
                # the error text into the next observation message.
                error_str = str(exc).replace("\n", " ")[:200] or type(exc).__name__
                action_type = "VIEW_CODE"
                action_json = {"action_type": "VIEW_CODE"}
                # Drop the observation that got no reply so roles alternate.
                messages.pop()
                pending_note = f"[SYSTEM ERROR] {error_str}"

            if show_thought:
                thought = action_json.get("thought", "")
                if thought:
                    print(f"\n[THOUGHT]: {thought}\n", flush=True)

            # Send action to the environment
            await ws.send(json.dumps({"action": action_json}))

            # Receive response
            raw = await ws.recv()
            data = json.loads(raw)

            # Server may return a validation-error envelope (no "observation" key)
            if "observation" not in data:
                # Keep the log line single-line and bounded in length.
                error_str = str(data.get("error", "server_error")).replace("\n", " ")[:200]
                reward, done = 0.0, False
                consecutive_errors += 1
                server_note = f"[SERVER ERROR] {error_str}"
                pending_note = f"{pending_note}\n{server_note}" if pending_note else server_note
            else:
                # Treat a missing/null reward as 0.0 instead of crashing.
                reward = float(data.get("reward") or 0.0)
                done = bool(data.get("done", False))
                obs = data.get("observation", {})
                consecutive_errors = 0

                if done:
                    test_results = obs.get("test_results", [])
                    total = len(test_results)
                    passes = sum(1 for t in test_results if t.get("passed"))
                    success = (total > 0 and passes == total)

            rewards.append(reward)

            # ── [STEP] log line ──────────────────────────────────────────
            done_str = "true" if done else "false"
            print(
                f"[STEP] step={step} action={action_type} "
                f"reward={reward:.2f} done={done_str} error={error_str}",
                flush=True,
            )

            if done:
                break  # server will auto-reset, but we exit after one episode
            if consecutive_errors >= max_consecutive_errors:
                break  # give up: the server keeps rejecting every action

    # ── [END] log line ───────────────────────────────────────────────────────
    success_str = "true" if success else "false"
    # Pull clamped final_score from info dict if available, else derive from rewards
    final_score = data.get("info", {}).get("final_score", None) if done else None
    if final_score is None:
        final_score = max(0.0, min(1.0, sum(rewards)))
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(
        f"[END] success={success_str} steps={step} score={final_score:.3f} rewards={rewards_str}",
        flush=True,
    )
357
+
358
+
359
+ # ---------------------------------------------------------------------------
360
+ # Entry point
361
+ # ---------------------------------------------------------------------------
362
+
363
def main() -> None:
    """Parse command-line flags and run a single agent episode."""
    import argparse

    cli = argparse.ArgumentParser(description="Run the Python debugging agent.")

    # --easy / --medium / --hard all write to the same `difficulty` destination.
    for level, article in (("easy", "an"), ("medium", "a"), ("hard", "a")):
        cli.add_argument(
            f"--{level}",
            action="store_const",
            dest="difficulty",
            const=level,
            help=f"Run {article} {level} task.",
        )
    cli.add_argument(
        "--thought",
        action="store_true",
        dest="show_thought",
        help="Print the agent's chain-of-thought reasoning.",
    )

    options = cli.parse_args()
    episode = run_episode(
        difficulty=options.difficulty,
        show_thought=options.show_thought,
    )
    asyncio.run(episode)
373
+
374
+
375
+ if __name__ == "__main__":
376
+ main()
models.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic schema layer for the Python Debugging Gym OpenEnv environment."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, List, Literal, Optional
6
+
7
+ from openenv.core.env_server.types import Action, Observation
8
+ from pydantic import BaseModel, Field, model_validator
9
+
10
+
11
+ ActionType = Literal[
12
+ "VIEW_CODE",
13
+ "RUN_TESTS",
14
+ "REPLACE_LINES",
15
+ "UNDO_EDIT",
16
+ "RESET_TO_ORIGINAL",
17
+ "SUBMIT",
18
+ ]
19
+
20
+
21
class CodeAction(Action):
    """
    Structured action consumed by the environment.

    ``action_type`` is mandatory. ``start_line``, ``end_line`` and
    ``new_code_block`` are optional in general but all three must be supplied
    when ``action_type`` is REPLACE_LINES.
    """

    # Free-form reasoning; not required by any validation below.
    thought: Optional[str] = Field(
        default=None,
        description="Optional reasoning string for debugging/traceability.",
    )
    action_type: ActionType = Field(
        ...,
        description="One of VIEW_CODE, RUN_TESTS, REPLACE_LINES, UNDO_EDIT, RESET_TO_ORIGINAL, SUBMIT.",
    )
    # Line numbers must be >= 1 when given.
    start_line: Optional[int] = Field(default=None, ge=1)
    end_line: Optional[int] = Field(default=None, ge=1)
    new_code_block: Optional[str] = Field(default=None)

    @model_validator(mode="after")
    def validate_replace_fields(self) -> "CodeAction":
        """Reject a REPLACE_LINES action that is missing any edit field."""
        if self.action_type != "REPLACE_LINES":
            return self
        # Checked in this order so the first missing field is the one reported.
        for required_field in ("start_line", "end_line", "new_code_block"):
            if getattr(self, required_field) is None:
                raise ValueError(f"REPLACE_LINES requires {required_field}.")
        return self
46
+
47
+
48
class TestResult(BaseModel):
    """Outcome of running a single test."""

    # Identifier of the test this result belongs to.
    test_name: str
    # True when the test passed.
    passed: bool
    # Failure detail, if any; defaults to None.
    error_message: Optional[str] = None
54
+
55
+
56
class CodeObservation(Observation):
    """Full observation returned to the agent after each step."""

    # Current source, one entry per line.
    code_lines: List[str] = Field(default_factory=list)
    # Text excerpt of the code around the most recent edit.
    localized_context: str = Field(default="")
    # Message produced by the last action.
    last_execution_output: str = Field(default="")
    # True when the current source does not pass the syntax check.
    syntax_error: bool = Field(default=False)
    # Per-test outcomes from the most recent test run.
    test_results: List[TestResult] = Field(default_factory=list)
    step_count: int = Field(default=0)
    steps_remaining: int = Field(default=0)
    reward_last_step: float = Field(default=0.0)
    # Extra metadata about the episode.
    info: Dict[str, Any] = Field(default_factory=dict)

    def render_code(self) -> str:
        """Return the source with 1-indexed line numbers, or "<empty>" if there is none."""
        if not self.code_lines:
            return "<empty>"
        numbered = []
        for line_no, text in enumerate(self.code_lines, start=1):
            # Line number right-aligned to width 3, then " | ", then the line.
            numbered.append(f"{line_no:>3} | {text}")
        return "\n".join(numbered)
my_env/README.md DELETED
@@ -1,255 +0,0 @@
1
- ---
2
- title: My Env Environment Server
3
- emoji: 🖥️
4
- colorFrom: indigo
5
- colorTo: indigo
6
- sdk: docker
7
- pinned: false
8
- app_port: 8000
9
- base_path: /web
10
- tags:
11
- - openenv
12
- ---
13
-
14
- # My Env Environment
15
-
16
- A simple test environment that echoes back messages. Perfect for testing the env APIs as well as demonstrating environment usage patterns.
17
-
18
- ## Quick Start
19
-
20
- The simplest way to use the My Env environment is through the `MyEnv` class:
21
-
22
- ```python
23
- from my_env import MyAction, MyEnv
24
-
25
- try:
26
- # Create environment from Docker image
27
- my_envenv = MyEnv.from_docker_image("my_env-env:latest")
28
-
29
- # Reset
30
- result = my_envenv.reset()
31
- print(f"Reset: {result.observation.echoed_message}")
32
-
33
- # Send multiple messages
34
- messages = ["Hello, World!", "Testing echo", "Final message"]
35
-
36
- for msg in messages:
37
- result = my_envenv.step(MyAction(message=msg))
38
- print(f"Sent: '{msg}'")
39
- print(f" → Echoed: '{result.observation.echoed_message}'")
40
- print(f" → Length: {result.observation.message_length}")
41
- print(f" → Reward: {result.reward}")
42
-
43
- finally:
44
- # Always clean up
45
- my_envenv.close()
46
- ```
47
-
48
- That's it! The `MyEnv.from_docker_image()` method handles:
49
- - Starting the Docker container
50
- - Waiting for the server to be ready
51
- - Connecting to the environment
52
- - Container cleanup when you call `close()`
53
-
54
- ## Building the Docker Image
55
-
56
- Before using the environment, you need to build the Docker image:
57
-
58
- ```bash
59
- # From project root
60
- docker build -t my_env-env:latest -f server/Dockerfile .
61
- ```
62
-
63
- ## Deploying to Hugging Face Spaces
64
-
65
- You can easily deploy your OpenEnv environment to Hugging Face Spaces using the `openenv push` command:
66
-
67
- ```bash
68
- # From the environment directory (where openenv.yaml is located)
69
- openenv push
70
-
71
- # Or specify options
72
- openenv push --namespace my-org --private
73
- ```
74
-
75
- The `openenv push` command will:
76
- 1. Validate that the directory is an OpenEnv environment (checks for `openenv.yaml`)
77
- 2. Prepare a custom build for Hugging Face Docker space (enables web interface)
78
- 3. Upload to Hugging Face (ensuring you're logged in)
79
-
80
- ### Prerequisites
81
-
82
- - Authenticate with Hugging Face: The command will prompt for login if not already authenticated
83
-
84
- ### Options
85
-
86
- - `--directory`, `-d`: Directory containing the OpenEnv environment (defaults to current directory)
87
- - `--repo-id`, `-r`: Repository ID in format 'username/repo-name' (defaults to 'username/env-name' from openenv.yaml)
88
- - `--base-image`, `-b`: Base Docker image to use (overrides Dockerfile FROM)
89
- - `--private`: Deploy the space as private (default: public)
90
-
91
- ### Examples
92
-
93
- ```bash
94
- # Push to your personal namespace (defaults to username/env-name from openenv.yaml)
95
- openenv push
96
-
97
- # Push to a specific repository
98
- openenv push --repo-id my-org/my-env
99
-
100
- # Push with a custom base image
101
- openenv push --base-image ghcr.io/meta-pytorch/openenv-base:latest
102
-
103
- # Push as a private space
104
- openenv push --private
105
-
106
- # Combine options
107
- openenv push --repo-id my-org/my-env --base-image custom-base:latest --private
108
- ```
109
-
110
- After deployment, your space will be available at:
111
- `https://huggingface.co/spaces/<repo-id>`
112
-
113
- The deployed space includes:
114
- - **Web Interface** at `/web` - Interactive UI for exploring the environment
115
- - **API Documentation** at `/docs` - Full OpenAPI/Swagger interface
116
- - **Health Check** at `/health` - Container health monitoring
117
- - **WebSocket** at `/ws` - Persistent session endpoint for low-latency interactions
118
-
119
- ## Environment Details
120
-
121
- ### Action
122
- **MyAction**: Contains a single field
123
- - `message` (str) - The message to echo back
124
-
125
- ### Observation
126
- **MyObservation**: Contains the echo response and metadata
127
- - `echoed_message` (str) - The message echoed back
128
- - `message_length` (int) - Length of the message
129
- - `reward` (float) - Reward based on message length (length × 0.1)
130
- - `done` (bool) - Always False for echo environment
131
- - `metadata` (dict) - Additional info like step count
132
-
133
- ### Reward
134
- The reward is calculated as: `message_length × 0.1`
135
- - "Hi" → reward: 0.2
136
- - "Hello, World!" → reward: 1.3
137
- - Empty message → reward: 0.0
138
-
139
- ## Advanced Usage
140
-
141
- ### Connecting to an Existing Server
142
-
143
- If you already have a My Env environment server running, you can connect directly:
144
-
145
- ```python
146
- from my_env import MyEnv
147
-
148
- # Connect to existing server
149
- my_envenv = MyEnv(base_url="<ENV_HTTP_URL_HERE>")
150
-
151
- # Use as normal
152
- result = my_envenv.reset()
153
- result = my_envenv.step(MyAction(message="Hello!"))
154
- ```
155
-
156
- Note: When connecting to an existing server, `my_envenv.close()` will NOT stop the server.
157
-
158
- ### Using the Context Manager
159
-
160
- The client supports context manager usage for automatic connection management:
161
-
162
- ```python
163
- from my_env import MyAction, MyEnv
164
-
165
- # Connect with context manager (auto-connects and closes)
166
- with MyEnv(base_url="http://localhost:8000") as env:
167
- result = env.reset()
168
- print(f"Reset: {result.observation.echoed_message}")
169
- # Multiple steps with low latency
170
- for msg in ["Hello", "World", "!"]:
171
- result = env.step(MyAction(message=msg))
172
- print(f"Echoed: {result.observation.echoed_message}")
173
- ```
174
-
175
- The client uses WebSocket connections for:
176
- - **Lower latency**: No HTTP connection overhead per request
177
- - **Persistent session**: Server maintains your environment state
178
- - **Efficient for episodes**: Better for many sequential steps
179
-
180
- ### Concurrent WebSocket Sessions
181
-
182
- The server supports multiple concurrent WebSocket connections. To enable this,
183
- modify `server/app.py` to use factory mode:
184
-
185
- ```python
186
- # In server/app.py - use factory mode for concurrent sessions
187
- app = create_app(
188
- MyEnvironment, # Pass class, not instance
189
- MyAction,
190
- MyObservation,
191
- max_concurrent_envs=4, # Allow 4 concurrent sessions
192
- )
193
- ```
194
-
195
- Then multiple clients can connect simultaneously:
196
-
197
- ```python
198
- from my_env import MyAction, MyEnv
199
- from concurrent.futures import ThreadPoolExecutor
200
-
201
- def run_episode(client_id: int):
202
- with MyEnv(base_url="http://localhost:8000") as env:
203
- result = env.reset()
204
- for i in range(10):
205
- result = env.step(MyAction(message=f"Client {client_id}, step {i}"))
206
- return client_id, result.observation.message_length
207
-
208
- # Run 4 episodes concurrently
209
- with ThreadPoolExecutor(max_workers=4) as executor:
210
- results = list(executor.map(run_episode, range(4)))
211
- ```
212
-
213
- ## Development & Testing
214
-
215
- ### Direct Environment Testing
216
-
217
- Test the environment logic directly without starting the HTTP server:
218
-
219
- ```bash
220
- # From the server directory
221
- python3 server/my_env_environment.py
222
- ```
223
-
224
- This verifies that:
225
- - Environment resets correctly
226
- - Step executes actions properly
227
- - State tracking works
228
- - Rewards are calculated correctly
229
-
230
- ### Running Locally
231
-
232
- Run the server locally for development:
233
-
234
- ```bash
235
- uvicorn server.app:app --reload
236
- ```
237
-
238
- ## Project Structure
239
-
240
- ```
241
- my_env/
242
- ├── .dockerignore # Docker build exclusions
243
- ├── __init__.py # Module exports
244
- ├── README.md # This file
245
- ├── openenv.yaml # OpenEnv manifest
246
- ├── pyproject.toml # Project metadata and dependencies
247
- ├── uv.lock # Locked dependencies (generated)
248
- ├── client.py # MyEnv client
249
- ├── models.py # Action and Observation models
250
- └── server/
251
- ├── __init__.py # Server module exports
252
- ├── my_env_environment.py # Core environment logic
253
- ├── app.py # FastAPI application (HTTP + WebSocket endpoints)
254
- └── Dockerfile # Container image definition
255
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
my_env/models.py DELETED
@@ -1,27 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """
8
- Data models for the My Env Environment.
9
-
10
- The my_env environment is a simple test environment that echoes back messages.
11
- """
12
-
13
- from openenv.core.env_server.types import Action, Observation
14
- from pydantic import Field
15
-
16
-
17
- class MyAction(Action):
18
- """Action for the My Env environment - just a message to echo."""
19
-
20
- message: str = Field(..., description="Message to echo back")
21
-
22
-
23
- class MyObservation(Observation):
24
- """Observation from the My Env environment - the echoed message."""
25
-
26
- echoed_message: str = Field(default="", description="The echoed message")
27
- message_length: int = Field(default=0, description="Length of the echoed message")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
my_env/openenv.yaml → openenv.yaml RENAMED
@@ -1,7 +1,6 @@
1
  spec_version: 1
2
- name: my_env
3
  type: space
4
  runtime: fastapi
5
  app: server.app:app
6
  port: 8000
7
-
 
1
  spec_version: 1
2
+ name: python_debugging_gym
3
  type: space
4
  runtime: fastapi
5
  app: server.app:app
6
  port: 8000
 
pre-val.sh ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # pre-val.sh (upstream name: validate-submission.sh) — OpenEnv Submission Validator
4
+ #
5
+ # Checks that your HF Space is live, Docker image builds, and openenv validate passes.
6
+ #
7
+ # Prerequisites:
8
+ # - Docker: https://docs.docker.com/get-docker/
9
+ # - openenv-core: pip install openenv-core
10
+ # - curl (usually pre-installed)
11
+ #
12
+ # Run:
13
+ # curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
14
+ #
15
+ # Or download and run locally:
16
+ # chmod +x validate-submission.sh
17
+ # ./validate-submission.sh <ping_url> [repo_dir]
18
+ #
19
+ # Arguments:
20
+ # ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)
21
+ # repo_dir Path to your repo (default: current directory)
22
+ #
23
+ # Examples:
24
+ # ./validate-submission.sh https://my-team.hf.space
25
+ # ./validate-submission.sh https://my-team.hf.space ./my-repo
26
+ #
27
+
28
+ set -uo pipefail
29
+
30
+ DOCKER_BUILD_TIMEOUT=600
31
+ if [ -t 1 ]; then
32
+ RED='\033[0;31m'
33
+ GREEN='\033[0;32m'
34
+ YELLOW='\033[1;33m'
35
+ BOLD='\033[1m'
36
+ NC='\033[0m'
37
+ else
38
+ RED='' GREEN='' YELLOW='' BOLD='' NC=''
39
+ fi
40
+
41
+ run_with_timeout() {
42
+ local secs="$1"; shift
43
+ if command -v timeout &>/dev/null; then
44
+ timeout "$secs" "$@"
45
+ elif command -v gtimeout &>/dev/null; then
46
+ gtimeout "$secs" "$@"
47
+ else
48
+ "$@" &
49
+ local pid=$!
50
+ ( sleep "$secs" && kill "$pid" 2>/dev/null ) &
51
+ local watcher=$!
52
+ wait "$pid" 2>/dev/null
53
+ local rc=$?
54
+ kill "$watcher" 2>/dev/null
55
+ wait "$watcher" 2>/dev/null
56
+ return $rc
57
+ fi
58
+ }
59
+
60
+ portable_mktemp() {
61
+ local prefix="${1:-validate}"
62
+ mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
63
+ }
64
+
65
+ CLEANUP_FILES=()
66
+ cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
67
+ trap cleanup EXIT
68
+
69
+ PING_URL="${1:-}"
70
+ REPO_DIR="${2:-.}"
71
+
72
+ if [ -z "$PING_URL" ]; then
73
+ printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
74
+ printf "\n"
75
+ printf " ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
76
+ printf " repo_dir Path to your repo (default: current directory)\n"
77
+ exit 1
78
+ fi
79
+
80
+ if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
81
+ printf "Error: directory '%s' not found\n" "${2:-.}"
82
+ exit 1
83
+ fi
84
+ PING_URL="${PING_URL%/}"
85
+ export PING_URL
86
+ PASS=0
87
+
88
+ log() { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
89
+ pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
90
+ fail() { log "${RED}FAILED${NC} -- $1"; }
91
+ hint() { printf " ${YELLOW}Hint:${NC} %b\n" "$1"; }
92
+ stop_at() {
93
+ printf "\n"
94
+ printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
95
+ exit 1
96
+ }
97
+
98
+ printf "\n"
99
+ printf "${BOLD}========================================${NC}\n"
100
+ printf "${BOLD} OpenEnv Submission Validator${NC}\n"
101
+ printf "${BOLD}========================================${NC}\n"
102
+ log "Repo: $REPO_DIR"
103
+ log "Ping URL: $PING_URL"
104
+ printf "\n"
105
+
106
+ log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
107
+
108
# Step 1: POST an empty JSON body to $PING_URL/reset and require HTTP 200.
CURL_OUTPUT=$(portable_mktemp "validate-curl")
CLEANUP_FILES+=("$CURL_OUTPUT")

# The response body goes to the temp file via -o; stderr is discarded rather
# than redirected onto that same file. When the connection fails, curl's -w
# already prints "000", so no extra fallback text is appended to it (that
# would yield "000000" and skip the "not reachable" branch below).
HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
  -H "Content-Type: application/json" -d '{}' \
  "$PING_URL/reset" --max-time 30 2>/dev/null)
# Cover the case where curl printed nothing at all.
HTTP_CODE="${HTTP_CODE:-000}"

if [ "$HTTP_CODE" = "200" ]; then
  pass "HF Space is live and responds to /reset"
elif [ "$HTTP_CODE" = "000" ]; then
  fail "HF Space not reachable (connection failed or timed out)"
  hint "Check your network connection and that the Space is running."
  # hint passes this text as a printf argument (not the format string), so a
  # single % is what gets printed.
  hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
  stop_at "Step 1"
else
  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
  hint "Make sure your Space is running and the URL is correct."
  hint "Try opening $PING_URL in your browser first."
  stop_at "Step 1"
fi
127
+
128
+ log "${BOLD}Step 2/3: Running docker build${NC} ..."
129
+
130
+ if ! command -v docker &>/dev/null; then
131
+ fail "docker command not found"
132
+ hint "Install Docker: https://docs.docker.com/get-docker/"
133
+ stop_at "Step 2"
134
+ fi
135
+
136
+ if [ -f "$REPO_DIR/Dockerfile" ]; then
137
+ DOCKER_CONTEXT="$REPO_DIR"
138
+ elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
139
+ DOCKER_CONTEXT="$REPO_DIR/server"
140
+ else
141
+ fail "No Dockerfile found in repo root or server/ directory"
142
+ stop_at "Step 2"
143
+ fi
144
+
145
+ log " Found Dockerfile in $DOCKER_CONTEXT"
146
+
147
+ BUILD_OK=false
148
+ BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
149
+
150
+ if [ "$BUILD_OK" = true ]; then
151
+ pass "Docker build succeeded"
152
+ else
153
+ fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
154
+ printf "%s\n" "$BUILD_OUTPUT" | tail -20
155
+ stop_at "Step 2"
156
+ fi
157
+
158
+ log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
159
+
160
+ if ! command -v openenv &>/dev/null; then
161
+ fail "openenv command not found"
162
+ hint "Install it: pip install openenv-core"
163
+ stop_at "Step 3"
164
+ fi
165
+
166
+ VALIDATE_OK=false
167
+ VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
168
+
169
+ if [ "$VALIDATE_OK" = true ]; then
170
+ pass "openenv validate passed"
171
+ [ -n "$VALIDATE_OUTPUT" ] && log " $VALIDATE_OUTPUT"
172
+ else
173
+ fail "openenv validate failed"
174
+ printf "%s\n" "$VALIDATE_OUTPUT"
175
+ stop_at "Step 3"
176
+ fi
177
+
178
+ printf "\n"
179
+ printf "${BOLD}========================================${NC}\n"
180
+ printf "${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
181
+ printf "${GREEN}${BOLD} Your submission is ready to submit.${NC}\n"
182
+ printf "${BOLD}========================================${NC}\n"
183
+ printf "\n"
184
+
185
+ exit 0
my_env/pyproject.toml → pyproject.toml RENAMED
@@ -9,9 +9,9 @@ requires = ["setuptools>=45", "wheel"]
9
  build-backend = "setuptools.build_meta"
10
 
11
  [project]
12
- name = "openenv-my_env"
13
  version = "0.1.0"
14
- description = "My Env environment for OpenEnv"
15
  requires-python = ">=3.10"
16
  dependencies = [
17
  # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
@@ -26,6 +26,8 @@ dependencies = [
26
  # "gymnasium>=0.29.0",
27
  # "openspiel>=1.0.0",
28
  # "smolagents>=1.22.0,<2",
 
 
29
  ]
30
 
31
  [project.optional-dependencies]
@@ -42,4 +44,4 @@ server = "my_env.server.app:main"
42
  [tool.setuptools]
43
  include-package-data = true
44
  packages = ["my_env", "my_env.server"]
45
- package-dir = { "my_env" = ".", "my_env.server" = "server" }
 
9
  build-backend = "setuptools.build_meta"
10
 
11
  [project]
12
+ name = "openenv-python-debugging-gym"
13
  version = "0.1.0"
14
+ description = "Python Debugging Gym environment for OpenEnv"
15
  requires-python = ">=3.10"
16
  dependencies = [
17
  # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
 
26
  # "gymnasium>=0.29.0",
27
  # "openspiel>=1.0.0",
28
  # "smolagents>=1.22.0,<2",
29
+ "openai>=1.30.0",
30
+ "websockets>=12.0",
31
  ]
32
 
33
  [project.optional-dependencies]
 
44
  [tool.setuptools]
45
  include-package-data = true
46
  packages = ["my_env", "my_env.server"]
47
+ package-dir = { "my_env" = ".", "my_env.server" = "server" }
sandbox.py ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ sandbox.py — Safe Code Execution Sandbox
3
+ =========================================
4
+
5
+ PRINCIPLE 2 — Errors are Data, Not Control Flow
6
+ If the agent writes code that throws SyntaxError, AssertionError, TypeError,
7
+ RecursionError, or ANY other exception, the environment must NOT crash or
8
+ propagate that exception to the server loop. Every possible failure mode is
9
+ caught inside the child process, serialized into a string, and returned as
10
+ structured data in the CodeObservation. The agent then reads this error text
11
+ and adapts on its next turn.
12
+
13
+ PRINCIPLE 8 — Security is Per Invocation
14
+ The sandbox executes arbitrary LLM-generated Python code. Two defences:
15
+ 1. TIMEOUT: The worker process is hard-killed (SIGKILL after SIGTERM) after
16
+ EXEC_TIMEOUT_SECONDS. This stops while-True loops and CPU-exhaustion.
17
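+ 2. RESTRICTED BUILTINS: the module defines _SAFE_BUILTINS, a controlled
18
+ __builtins__ dict in which dangerous callables (open, __import__, eval, exec,
19
+ compile, breakpoint, input) are replaced with safe stubs that raise
20
+ RuntimeError, intended to stop the agent from escaping the sandbox via
+ filesystem or subprocess access.
+ NOTE: _worker currently passes the full __builtins__ to exec() (so that
+ class-based tasks work), so within this module the whitelist is defined but
+ not applied; only the timeout defence is active.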
21
+
22
+ PRINCIPLE 9 — Optimizations are MVP Requirements
23
+ Python tracebacks can be thousands of lines. We tail-truncate to the last
24
+ MAX_OUTPUT_CHARS characters. The tail of a traceback is the most actionable
25
+ part (it contains the actual exception, not the call stack preamble).
26
+ Prefix '[...truncated N chars...]' is added so the agent knows output was cut.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import ast
32
+ import io
33
+ import inspect
34
+ import multiprocessing
35
+ import signal
36
+ import sys
37
+ import textwrap
38
+ import traceback
39
+ from typing import Any, Callable, Dict, List, Tuple
40
+
41
+ try:
42
+ from .models import TestResult
43
+ except ImportError:
44
+ from models import TestResult
45
+
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # Constants
49
+ # ---------------------------------------------------------------------------
50
+
51
+ EXEC_TIMEOUT_SECONDS: int = 5 # Hard wall-clock kill limit (Principle 8)
52
+ MAX_OUTPUT_CHARS: int = 1_000 # Tail-truncate limit (Principle 9)
53
+
54
+
55
+ # ---------------------------------------------------------------------------
56
+ # Restricted builtins (Principle 8)
57
+ # ---------------------------------------------------------------------------
58
+
59
def _make_safe_stub(name: str) -> Callable:
    """Build a stand-in for a blocked builtin that raises RuntimeError when called.

    The returned function accepts any arguments and carries ``name`` as its
    ``__name__`` so the stub identifies the builtin it replaces.
    """
    message = (
        f"'{name}' is disabled in the sandbox. "
        "Do not attempt to access the filesystem, import modules dynamically, "
        "or execute arbitrary code within your solution."
    )

    def _stub(*args, **kwargs):
        raise RuntimeError(message)

    _stub.__name__ = name
    return _stub
69
+
70
+
71
# Builtins whitelist: maps each permitted builtin name to the real object and
# each blocked name to a stub that raises RuntimeError when called.
# NOTE(review): nothing in this module passes this dict to exec() — _worker
# uses the full __builtins__ — so confirm whether another module relies on it.

# Real builtins exposed under their own names (each object's __name__ is its key).
_ALLOWED_BUILTIN_OBJECTS = (
    # Type constructors
    int, float, str, bool,
    list, dict, set, tuple,
    bytes, bytearray, frozenset,
    complex,
    # Inspection / iteration
    len, range, enumerate, zip,
    map, filter, reversed, sorted,
    iter, next, sum, min, max,
    abs, round, divmod, pow,
    # Introspection
    isinstance, issubclass, type,
    hasattr, getattr, setattr,
    callable, repr, hash, id,
    # Output
    print,
    # Exceptions & control
    Exception, ValueError, TypeError,
    KeyError, IndexError, AttributeError,
    StopIteration, RuntimeError,
    AssertionError, NotImplementedError,
    OverflowError, ZeroDivisionError,
    RecursionError, MemoryError,
    KeyboardInterrupt,
    BaseException,
    # Functional / formatting / class machinery
    any, all,
    chr, ord, hex, oct, bin,
    format,
    object, property, staticmethod,
    classmethod, super,
)

# Names replaced by raising stubs (Principle 8).
_BLOCKED_BUILTIN_NAMES = (
    "open", "__import__", "eval", "exec", "compile", "breakpoint",
    "input", "globals", "locals", "vars", "dir",
)

_SAFE_BUILTINS: Dict[str, Any] = {
    **{obj.__name__: obj for obj in _ALLOWED_BUILTIN_OBJECTS},
    **{blocked: _make_safe_stub(blocked) for blocked in _BLOCKED_BUILTIN_NAMES},
    "__loader__": None,
    "__spec__": None,
}
120
+
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # Output truncation (Principle 9)
124
+ # ---------------------------------------------------------------------------
125
+
126
def _tail_truncate(s: str, limit: int = MAX_OUTPUT_CHARS) -> str:
    """
    Return the TAIL of `s`, bounded to `limit` characters.

    Rationale: Python tracebacks print in chronological call order, so the
    exception type and message sit at the very END. Keeping the tail preserves
    that signal while discarding the call-stack preamble.

    Args:
        s: Text to bound.
        limit: Maximum number of characters of `s` to keep. Values below zero
            are treated as zero.

    Returns:
        `s` unchanged when it already fits; otherwise a
        '[...truncated N chars...]' header line followed by the last `limit`
        characters of `s`.
    """
    keep = max(limit, 0)
    if len(s) <= keep:
        return s
    dropped = len(s) - keep
    # Slice from an explicit start index: `s[-keep:]` would return the WHOLE
    # string when keep == 0, because -0 == 0.
    return f"[...truncated {dropped} chars...]\n" + s[len(s) - keep:]
139
+
140
+
141
+ # ---------------------------------------------------------------------------
142
+ # Worker (runs in isolated child process)
143
+ # ---------------------------------------------------------------------------
144
+
145
def _worker(
    source: str,
    test_sources: List[str],
    result_queue: multiprocessing.Queue,
) -> None:
    """
    Isolated execution unit, run as the target of a child process.

    Compiles and executes `source`, then executes each test source in the same
    namespace and calls the last function it defines with that namespace as
    its only argument.

    PRINCIPLE 2: every failure becomes data. Exceptions raised by agent or
    test code — including SystemExit and KeyboardInterrupt — are caught and
    serialized instead of terminating the worker without a result.

    NOTE(review): exec() receives the FULL `__builtins__` here (needed for
    class definitions); the `_SAFE_BUILTINS` whitelist is not applied, so the
    only isolation this function relies on is the separate process.

    Args:
        source: Agent code to compile and execute.
        test_sources: Source text of the test functions, one per entry.
        result_queue: Receives exactly one tuple
            ``(output, test_results, had_syntax_error)`` where `output` is the
            tail-truncated captured stdout/stderr (or error text) and
            `test_results` is a list of dicts with keys ``test_name``,
            ``passed`` and, on failure, ``error_message``.
    """
    buf = io.StringIO()
    old_stdout, old_stderr = sys.stdout, sys.stderr
    sys.stdout = sys.stderr = buf

    test_results: List[Dict] = []

    try:
        # ── Phase 1: Syntax check ─────────────────────────────────────────
        # Compile before exec() so SyntaxError is reported cleanly.
        try:
            code_obj = compile(source, "<agent_code>", "exec")
        except SyntaxError as exc:
            err = f"SyntaxError at line {exc.lineno}: {exc.msg}\n >> {exc.text or ''}"
            result_queue.put((_tail_truncate(err), [], True))
            return

        # ── Phase 2: Execute agent code into a shared namespace ──────────
        # Full __builtins__ so class statements (__build_class__) work.
        namespace: Dict[str, Any] = {"__builtins__": __builtins__}
        try:
            exec(code_obj, namespace)  # noqa: S102
        except BaseException:  # noqa: BLE001
            # BaseException on purpose: agent code calling exit() or raising
            # KeyboardInterrupt must still produce a result for the parent.
            tb = traceback.format_exc()
            result_queue.put((_tail_truncate(buf.getvalue() + "\n" + tb), [], False))
            return

        # ── Phase 3: Run each test function ──────────────────────────────
        # Each test has its own try/except so a crash in test N does not
        # prevent tests N+1..M from running.
        for test_src in test_sources:
            fn_name = "<unknown>"
            try:
                # Resolve the name (last top-level `def`) BEFORE executing the
                # source, so a test that fails to load is still reported
                # under its own name.
                fn_name = [
                    ln.split("(")[0].replace("def ", "").strip()
                    for ln in test_src.splitlines()
                    if ln.startswith("def ")
                ][-1]

                # Define the test in the agent's namespace so it can reach
                # the agent's symbols, then call it with that namespace.
                exec(test_src, namespace)  # noqa: S102
                namespace[fn_name](namespace)
                test_results.append({"test_name": fn_name, "passed": True})

            except AssertionError as exc:
                test_results.append({
                    "test_name": fn_name,
                    "passed": False,
                    "error_message": _tail_truncate(
                        f"AssertionError: {exc}" if str(exc) else "AssertionError (no message)"
                    ),
                })
            except BaseException:  # noqa: BLE001
                # Includes SystemExit/KeyboardInterrupt raised by agent code
                # under test; record it and continue with the next test.
                test_results.append({
                    "test_name": fn_name,
                    "passed": False,
                    "error_message": _tail_truncate(traceback.format_exc()),
                })

    except Exception:  # noqa: BLE001
        # Unexpected failure in the harness itself: record it in the output.
        traceback.print_exc(file=buf)
    finally:
        # Also runs on the early returns above.
        sys.stdout, sys.stderr = old_stdout, old_stderr

    captured = _tail_truncate(buf.getvalue())
    result_queue.put((captured, test_results, False))
234
+
235
+
236
+ # ---------------------------------------------------------------------------
237
+ # Public API
238
+ # ---------------------------------------------------------------------------
239
+
240
def check_syntax(source: str) -> Tuple[bool, str]:
    """
    Check whether `source` compiles, without executing it.

    Uses the same ``compile(..., "exec")`` call as the sandbox worker so both
    agree on validity. Parsing alone accepts code the compiler rejects, such
    as ``return`` at module level.

    Args:
        source: Python source text to check.

    Returns:
        ``(True, "")`` when the source compiles, otherwise
        ``(False, description)`` where the description starts with
        ``"SyntaxError"``.
    """
    try:
        compile(source, "<agent_code>", "exec")
    except SyntaxError as exc:
        return False, f"SyntaxError at line {exc.lineno}: {exc.msg}"
    except ValueError as exc:
        # Older interpreters report NUL bytes in the source as ValueError
        # rather than SyntaxError; surface it as data as well.
        return False, f"SyntaxError: {exc}"
    return True, ""
252
+
253
+
254
def run_code_with_tests(
    source: str,
    test_callables: List[Callable],
    timeout: int = EXEC_TIMEOUT_SECONDS,
) -> Tuple[str, List[TestResult], bool]:
    """
    Run `source` and the given tests in a child process with a hard timeout.

    Each test callable is converted to its dedented source text and handed to
    `_worker`, which runs in a daemon `multiprocessing.Process`.

    Outcomes (all returned as data):
        timeout   -> (timeout message, [], False); the child is terminated,
                     and killed if it is still alive two seconds later.
        no result -> ("Process exited unexpectedly with no output.", [], False)
        normal    -> (captured output, [TestResult, ...], had_syntax_error)

    Returns
    -------
    (output_str, test_results, had_syntax_error)
    """
    # Callables are shipped to the child as source text.
    test_sources = []
    for fn in test_callables:
        test_sources.append(textwrap.dedent(inspect.getsource(fn)))

    result_queue: multiprocessing.Queue = multiprocessing.Queue()
    worker = multiprocessing.Process(
        target=_worker,
        args=(source, test_sources, result_queue),
        daemon=True,  # do not outlive the parent
    )
    worker.start()
    # NOTE(review): the child is joined before the queue is read; this relies
    # on the worker's payload being small (its text is tail-truncated).
    worker.join(timeout)

    if worker.is_alive():
        # Escalate: terminate first, kill only if that was not enough.
        worker.terminate()
        worker.join(2)
        if worker.is_alive():
            worker.kill()
            worker.join()
        timeout_message = (
            f"⏱ Execution timed out after {timeout}s. "
            "Your code contains an infinite loop or is too slow. "
            "Fix the logic and try again."
        )
        return timeout_message, [], False

    if result_queue.empty():
        return "Process exited unexpectedly with no output.", [], False

    output, raw_results, had_syntax_error = result_queue.get_nowait()
    parsed_results = [TestResult(**entry) for entry in raw_results]
    return output, parsed_results, had_syntax_error
{my_env/server → server}/Dockerfile RENAMED
File without changes
{my_env/server → server}/__init__.py RENAMED
File without changes
{my_env/server → server}/app.py RENAMED
@@ -4,29 +4,7 @@
4
  # This source code is licensed under the BSD-style license found in the
5
  # LICENSE file in the root directory of this source tree.
6
 
7
- """
8
- FastAPI application for the My Env Environment.
9
-
10
- This module creates an HTTP server that exposes the MyEnvironment
11
- over HTTP and WebSocket endpoints, compatible with EnvClient.
12
-
13
- Endpoints:
14
- - POST /reset: Reset the environment
15
- - POST /step: Execute an action
16
- - GET /state: Get current environment state
17
- - GET /schema: Get action/observation schemas
18
- - WS /ws: WebSocket endpoint for persistent sessions
19
-
20
- Usage:
21
- # Development (with auto-reload):
22
- uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
23
-
24
- # Production:
25
- uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
26
-
27
- # Or run directly:
28
- python -m server.app
29
- """
30
 
31
  try:
32
  from openenv.core.env_server.http_server import create_app
@@ -36,49 +14,36 @@ except Exception as e: # pragma: no cover
36
  ) from e
37
 
38
  try:
39
- from ..models import MyAction, MyObservation
40
  from .my_env_environment import MyEnvironment
41
- except ModuleNotFoundError:
42
- from models import MyAction, MyObservation
 
 
 
 
43
  from server.my_env_environment import MyEnvironment
44
 
45
 
46
  # Create the app with web interface and README integration
47
  app = create_app(
48
  MyEnvironment,
49
- MyAction,
50
- MyObservation,
51
- env_name="my_env",
52
  max_concurrent_envs=1, # increase this number to allow more concurrent WebSocket sessions
53
  )
54
 
55
 
56
- def main(host: str = "0.0.0.0", port: int = 8000):
57
- """
58
- Entry point for direct execution via uv run or python -m.
59
-
60
- This function enables running the server without Docker:
61
- uv run --project . server
62
- uv run --project . server --port 8001
63
- python -m my_env.server.app
64
-
65
- Args:
66
- host: Host address to bind to (default: "0.0.0.0")
67
- port: Port number to listen on (default: 8000)
68
-
69
- For production deployments, consider using uvicorn directly with
70
- multiple workers:
71
- uvicorn my_env.server.app:app --workers 4
72
- """
73
  import uvicorn
74
 
 
 
75
  uvicorn.run(app, host=host, port=port)
76
 
77
 
78
  if __name__ == "__main__":
79
- import argparse
80
-
81
- parser = argparse.ArgumentParser()
82
- parser.add_argument("--port", type=int, default=8000)
83
- args = parser.parse_args()
84
- main(port=args.port)
 
4
  # This source code is licensed under the BSD-style license found in the
5
  # LICENSE file in the root directory of this source tree.
6
 
7
+ """FastAPI entry point for the Python Debugging Gym OpenEnv environment."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  try:
10
  from openenv.core.env_server.http_server import create_app
 
14
  ) from e
15
 
16
  try:
17
+ from ..models import CodeAction, CodeObservation
18
  from .my_env_environment import MyEnvironment
19
+ except ImportError:
20
+ import sys
21
+ from pathlib import Path
22
+
23
+ sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
24
+ from models import CodeAction, CodeObservation
25
  from server.my_env_environment import MyEnvironment
26
 
27
 
28
  # Create the app with web interface and README integration
29
  app = create_app(
30
  MyEnvironment,
31
+ CodeAction,
32
+ CodeObservation,
33
+ env_name="python_debugging_gym",
34
  max_concurrent_envs=1, # increase this number to allow more concurrent WebSocket sessions
35
  )
36
 
37
 
38
def main() -> None:
    """Entry point for local and container execution.

    Host and port come from ``--host`` / ``--port`` on the command line when
    given, otherwise from the ``HOST`` / ``PORT`` environment variables,
    otherwise ``0.0.0.0:8000``. Honouring ``--port`` keeps the documented
    ``server --port 8000`` invocation working.
    """
    import argparse
    import os

    import uvicorn

    parser = argparse.ArgumentParser(
        description="Run the Python Debugging Gym server."
    )
    parser.add_argument("--host", default=os.environ.get("HOST", "0.0.0.0"))
    parser.add_argument(
        "--port", type=int, default=int(os.environ.get("PORT", "8000"))
    )
    # parse_known_args: tolerate unrelated arguments supplied by a launcher.
    args, _unknown = parser.parse_known_args()

    uvicorn.run(app, host=args.host, port=args.port)
46
 
47
 
48
  if __name__ == "__main__":
49
+ main()
 
 
 
 
 
{my_env/server → server}/my_env_environment.py RENAMED
@@ -4,40 +4,21 @@
4
  # This source code is licensed under the BSD-style license found in the
5
  # LICENSE file in the root directory of this source tree.
6
 
7
- """
8
- My Env Environment Implementation.
9
-
10
- A simple test environment that echoes back messages sent to it.
11
- Perfect for testing HTTP server infrastructure.
12
- """
13
-
14
- from uuid import uuid4
15
 
16
  from openenv.core.env_server.interfaces import Environment
17
  from openenv.core.env_server.types import State
18
 
19
  try:
20
- from ..models import MyAction, MyObservation
 
21
  except ImportError:
22
- from models import MyAction, MyObservation
 
23
 
24
 
25
  class MyEnvironment(Environment):
26
- """
27
- A simple echo environment that echoes back messages.
28
-
29
- This environment is designed for testing the HTTP server infrastructure.
30
- It maintains minimal state and simply echoes back whatever message it receives.
31
-
32
- Example:
33
- >>> env = MyEnvironment()
34
- >>> obs = env.reset()
35
- >>> print(obs.echoed_message) # "My Env environment ready!"
36
- >>>
37
- >>> obs = env.step(MyAction(message="Hello"))
38
- >>> print(obs.echoed_message) # "Hello"
39
- >>> print(obs.message_length) # 5
40
- """
41
 
42
  # Enable concurrent WebSocket sessions.
43
  # Set to True if your environment isolates state between instances.
@@ -46,52 +27,32 @@ class MyEnvironment(Environment):
46
  SUPPORTS_CONCURRENT_SESSIONS: bool = True
47
 
48
  def __init__(self):
49
- """Initialize the my_env environment."""
50
- self._state = State(episode_id=str(uuid4()), step_count=0)
51
- self._reset_count = 0
52
-
53
- def reset(self) -> MyObservation:
54
- """
55
- Reset the environment.
56
-
57
- Returns:
58
- MyObservation with a ready message
59
- """
60
- self._state = State(episode_id=str(uuid4()), step_count=0)
61
- self._reset_count += 1
62
-
63
- return MyObservation(
64
- echoed_message="My Env environment ready!",
65
- message_length=0,
66
- done=False,
67
- reward=0.0,
68
  )
69
-
70
- def step(self, action: MyAction) -> MyObservation: # type: ignore[override]
71
- """
72
- Execute a step in the environment by echoing the message.
73
-
74
- Args:
75
- action: MyAction containing the message to echo
76
-
77
- Returns:
78
- MyObservation with the echoed message and its length
79
- """
80
- self._state.step_count += 1
81
-
82
- message = action.message
83
- length = len(message)
84
-
85
- # Simple reward: longer messages get higher rewards
86
- reward = length * 0.1
87
-
88
- return MyObservation(
89
- echoed_message=message,
90
- message_length=length,
91
- done=False,
92
- reward=reward,
93
- metadata={"original_message": message, "step": self._state.step_count},
94
  )
 
95
 
96
  @property
97
  def state(self) -> State:
 
4
  # This source code is licensed under the BSD-style license found in the
5
  # LICENSE file in the root directory of this source tree.
6
 
7
+ """OpenEnv adapter around the PythonDebuggingGym core environment."""
 
 
 
 
 
 
 
8
 
9
  from openenv.core.env_server.interfaces import Environment
10
  from openenv.core.env_server.types import State
11
 
12
  try:
13
+ from ..environment import PythonDebuggingGym
14
+ from ..models import CodeAction, CodeObservation
15
  except ImportError:
16
+ from environment import PythonDebuggingGym
17
+ from models import CodeAction, CodeObservation
18
 
19
 
20
  class MyEnvironment(Environment):
21
+ """Environment implementation compatible with OpenEnv's server interface."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  # Enable concurrent WebSocket sessions.
24
  # Set to True if your environment isolates state between instances.
 
27
  SUPPORTS_CONCURRENT_SESSIONS: bool = True
28
 
29
def __init__(self):
    """Create the wrapped gym and an empty placeholder State."""
    gym = PythonDebuggingGym()
    placeholder = State(episode_id="", step_count=0)
    self._gym = gym
    self._state = placeholder
32
+
33
def reset(self) -> CodeObservation:
    """Reset the wrapped gym and return its first observation.

    The gym's system prompt is stored under ``metadata["system_prompt"]`` and
    the State is rebuilt from the observation's episode id and step count.
    """
    observation, prompt = self._gym.reset()

    episode_id = observation.info.get("episode_id", "")
    self._state = State(episode_id=episode_id, step_count=observation.step_count)

    merged = dict(observation.metadata or {})
    merged["system_prompt"] = prompt
    observation.metadata = merged
    return observation
43
+
44
def step(self, action: CodeAction) -> CodeObservation:  # type: ignore[override]
    """Forward `action` to the wrapped gym and return the updated observation.

    Reward and done flag are copied onto the observation, the gym's info dict
    is merged into its metadata, and the State is rebuilt from the
    observation's episode id and step count.
    """
    observation, reward, done, info = self._gym.step(action)
    observation.reward = reward
    observation.done = done

    merged = dict(observation.metadata or {})
    merged.update(info)
    observation.metadata = merged

    episode_id = observation.info.get("episode_id", "")
    self._state = State(episode_id=episode_id, step_count=observation.step_count)
    return observation
56
 
57
  @property
58
  def state(self) -> State:
{my_env/server → server}/requirements.txt RENAMED
@@ -1,6 +1,7 @@
1
  openenv[core]>=0.2.0
2
  fastapi>=0.115.0
3
  uvicorn>=0.24.0
4
-
 
5
 
6
 
 
1
  openenv[core]>=0.2.0
2
  fastapi>=0.115.0
3
  uvicorn>=0.24.0
4
+ openai>=1.30.0
5
+ websockets>=12.0
6
 
7
 
tasks.py ADDED
@@ -0,0 +1,683 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ tasks.py — Static Task Registry
3
+ ================================
4
+ This is a "dumb" registry. Tasks are hardcoded dicts representing
5
+ curated buggy programs generated offline via MutationEngine.
6
+
7
+ Exported symbols:
8
+ TASKS_BY_DIFFICULTY Dict[str, List[Dict]] — tasks grouped by tier
9
+ ALL_TASKS List[Dict] — flat list for random sampling
10
+
11
+ Run mutation_engine.py + dataset_generator.py locally (offline) to
12
+ generate new candidates, curate the best ones, and add them here.
13
+ """
14
+
15
+ from __future__ import annotations
16
+ from typing import Any, Callable, Dict, List
17
+
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Test helpers (module-level; accept namespace dict, raise AssertionError)
21
+ # ---------------------------------------------------------------------------
22
+
23
+ # ── sum_even_numbers ────────────────────────────────────────────────────────
24
+
25
def _tse_1(ns):
    """sum_even_numbers on a mixed list."""
    got = ns["sum_even_numbers"]([1, 2, 3, 4])
    assert got == 6, f"Test failed: input=[1, 2, 3, 4], expected=6, got={got}"


def _tse_2(ns):
    """sum_even_numbers on an empty list."""
    got = ns["sum_even_numbers"]([])
    assert got == 0, f"Test failed: input=[], expected=0, got={got}"


def _tse_3(ns):
    """sum_even_numbers on odd values only."""
    got = ns["sum_even_numbers"]([1, 3, 5])
    assert got == 0, f"Test failed: input=[1, 3, 5], expected=0, got={got}"


def _tse_4(ns):
    """sum_even_numbers on repeated even values."""
    got = ns["sum_even_numbers"]([2, 2, 2])
    assert got == 6, f"Test failed: input=[2, 2, 2], expected=6, got={got}"
37
+
38
+ # ── reverse_string ──────────────────────────────────────────────────────────
39
+
40
def _trs_1(ns):
    """reverse_string on a three-character string."""
    got = ns["reverse_string"]("abc")
    assert got == "cba", f"Test failed: input='abc', expected='cba', got={got!r}"


def _trs_2(ns):
    """reverse_string on the empty string."""
    got = ns["reverse_string"]("")
    assert got == "", f"Test failed: input='', expected='', got={got!r}"


def _trs_3(ns):
    """reverse_string on a single character."""
    got = ns["reverse_string"]("a")
    assert got == "a", f"Test failed: input='a', expected='a', got={got!r}"


def _trs_4(ns):
    """reverse_string on a four-character string."""
    got = ns["reverse_string"]("abcd")
    assert got == "dcba", f"Test failed: input='abcd', expected='dcba', got={got!r}"
52
+
53
+ # ── binary_search ───────────────────────────────────────────────────────────
54
+
55
def _tbs_1(ns):
    """binary_search finds a present target."""
    got = ns["binary_search"]([1, 2, 3, 4, 5], 3)
    assert got == 2, f"Test failed: input=([1, 2, 3, 4, 5], 3), expected=2, got={got}"


def _tbs_2(ns):
    """binary_search reports a missing target."""
    got = ns["binary_search"]([1, 2, 3, 4, 5], 6)
    assert got == -1, f"Test failed: input=([1, 2, 3, 4, 5], 6), expected=-1, got={got}"


def _tbs_3(ns):
    """binary_search on an empty list."""
    got = ns["binary_search"]([], 1)
    assert got == -1, f"Test failed: input=([], 1), expected=-1, got={got}"


def _tbs_4(ns):
    """binary_search on a single-element list."""
    got = ns["binary_search"]([7], 7)
    assert got == 0, f"Test failed: input=([7], 7), expected=0, got={got}"
67
+
68
+ # ── flatten ─────────────────────────────────────────────────────────────────
69
+
70
def _tfl_1(ns):
    """flatten with one level of nesting."""
    got = ns["flatten"]([1, [2, 3]])
    assert got == [1, 2, 3], f"Test failed: input=[1, [2, 3]], expected=[1, 2, 3], got={got}"


def _tfl_2(ns):
    """flatten on an empty list."""
    got = ns["flatten"]([])
    assert got == [], f"Test failed: input=[], expected=[], got={got}"


def _tfl_3(ns):
    """flatten with two levels of nesting."""
    got = ns["flatten"]([1, [2, [3]]])
    assert got == [1, 2, 3], f"Test failed: input=[1, [2, [3]]], expected=[1, 2, 3], got={got}"


def _tfl_4(ns):
    """flatten on a list of lists."""
    got = ns["flatten"]([[1], [2, 3], [4]])
    assert got == [1, 2, 3, 4], f"Test failed: input=[[1], [2, 3], [4]], expected=[1, 2, 3, 4], got={got}"
82
+
83
+ # ── word_count ──────────────────────────────────────────────────────────────
84
+
85
def _twc_1(ns):
    """word_count with a repeated word."""
    got = ns["word_count"]("hello world hello")
    assert got == {"hello": 2, "world": 1}, f"Test failed: input='hello world hello', expected={{'hello': 2, 'world': 1}}, got={got}"


def _twc_2(ns):
    """word_count with punctuation and mixed case."""
    got = ns["word_count"]("Hi, hi!")
    assert got == {"hi": 2}, f"Test failed: input='Hi, hi!', expected={{'hi': 2}}, got={got}"


def _twc_3(ns):
    """word_count on the empty string."""
    got = ns["word_count"]("")
    assert got == {}, f"Test failed: input='', expected={{}}, got={got}"


def _twc_4(ns):
    """word_count with case variants of one word."""
    got = ns["word_count"]("Hello HELLO hello")
    assert got == {"hello": 3}, f"Test failed: input='Hello HELLO hello', expected={{'hello': 3}}, got={got}"
97
+
98
+ # ── lru_cache ───────────────────────────────────────────────────────────────
99
+
100
def _tlru_1(ns):
    """LRUCache returns a stored value while under capacity."""
    cache = ns["LRUCache"](2)
    cache.put(1, 1)
    cache.put(2, 2)
    got = cache.get(1)
    assert got == 1, f"Test failed: Capacity 2. Added (1,1), then (2,2). Expected get(1) to be 1, got {got}"


def _tlru_2(ns):
    """LRUCache of capacity 1 evicts the older key."""
    cache = ns["LRUCache"](1)
    cache.put(1, 1)
    cache.put(2, 2)
    got = cache.get(1)
    assert got == -1, f"Test failed: Capacity 1. Added (1,1), then (2,2). Expected key 1 to be evicted (return -1), got {got}"


def _tlru_3(ns):
    """LRUCache.get promotes a key, so the other key is evicted next."""
    cache = ns["LRUCache"](2)
    cache.put(1, 1)
    cache.put(2, 2)
    cache.get(1)
    cache.put(3, 3)
    got = cache.get(2)
    assert got == -1, f"Test failed: Capacity 2. Added (1,1), then (2,2), got(1), added (3,3). Expected key 2 to be evicted (return -1) since 1 was promoted during get(1), got {got}. Did you promote key 1 during get()?"
117
+
118
+ # ── valid_parentheses ────────────────────────────────────────────────────────
119
+
120
def _tvp_1(ns):
    """is_valid on one matched pair."""
    got = ns["is_valid"]("()")
    assert got == True, f"Test failed: input='()', expected=True, got={got}"


def _tvp_2(ns):
    """is_valid on mismatched bracket kinds."""
    got = ns["is_valid"]("(]")
    assert got == False, f"Test failed: input='(]', expected=False, got={got}"


def _tvp_3(ns):
    """is_valid on properly nested brackets."""
    got = ns["is_valid"]("([{}])")
    assert got == True, f"Test failed: input='([{{}}])', expected=True, got={got}"


def _tvp_4(ns):
    """is_valid on the empty string."""
    got = ns["is_valid"]("")
    assert got == True, f"Test failed: input='', expected=True, got={got}"
132
+
133
+ # ── merge_intervals ──────────────────────────────────────────────────────────
134
+
135
def _tmi_1(ns):
    """merge_intervals on overlapping intervals."""
    got = ns["merge_intervals"]([[1, 3], [2, 6]])
    assert got == [[1, 6]], f"Test failed: input=[[1, 3], [2, 6]], expected=[[1, 6]], got={got}"


def _tmi_2(ns):
    """merge_intervals on intervals that touch at an endpoint."""
    got = ns["merge_intervals"]([[1, 4], [4, 5]])
    assert got == [[1, 5]], f"Test failed: input=[[1, 4], [4, 5]], expected=[[1, 5]], got={got}"


def _tmi_3(ns):
    """merge_intervals on disjoint intervals."""
    got = ns["merge_intervals"]([[1, 2], [3, 4]])
    assert got == [[1, 2], [3, 4]], f"Test failed: input=[[1, 2], [3, 4]], expected=[[1, 2], [3, 4]], got={got}"
144
+
145
+
146
+ # ---------------------------------------------------------------------------
147
+ # Static task registry
148
+ # ---------------------------------------------------------------------------
149
+
150
def _t(name, description, code, solution, tests, difficulty, bug_type):
    """Bundle the fields of one task into a plain dict record."""
    return {
        "name": name,
        "description": description,
        "code": code,
        "solution": solution,
        "tests": tests,
        "difficulty": difficulty,
        "bug_type": bug_type,
    }
156
+
157
+
158
+ # ── EASY ──────────────────────────────────────────────────────────────────
159
+
160
# Injected bug: the parity check is inverted (`!= 0`), so odd numbers are
# accumulated instead of even ones.
# Snippet lines carry explicit 4-space nesting so that the joined snippet
# parses as Python.
TASK_SUM_EVEN_WRONG_OP = _t(
    name="sum_even_wrong_condition",
    description="Debug the sum_even_numbers function so it passes all tests.",
    difficulty="easy",
    bug_type="wrong_operator",
    code=[
        "def sum_even_numbers(nums):",
        "    total = 0",
        "    for n in nums:",
        "        if n % 2 != 0:",
        "            total += n",
        "    return total",
    ],
    solution=[
        "def sum_even_numbers(nums):",
        "    total = 0",
        "    for n in nums:",
        "        if n % 2 == 0:",
        "            total += n",
        "    return total",
    ],
    tests=[_tse_1, _tse_2, _tse_3, _tse_4],
)
183
+
184
# Injected bug: even numbers are subtracted (`-=`) instead of added.
# NOTE(review): the task name says "missing_accumulator", but the snippet does
# initialise `total`; the name is kept as-is in case callers key on it — confirm.
# Snippet lines carry explicit 4-space nesting so that the joined snippet
# parses as Python.
TASK_SUM_EVEN_MISSING_INIT = _t(
    name="sum_even_missing_accumulator",
    description="Debug the sum_even_numbers function so it passes all tests.",
    difficulty="easy",
    bug_type="wrong_operator",
    code=[
        "def sum_even_numbers(nums):",
        "    total = 0",
        "    for n in nums:",
        "        if n % 2 == 0:",
        "            total -= n",
        "    return total",
    ],
    solution=[
        "def sum_even_numbers(nums):",
        "    total = 0",
        "    for n in nums:",
        "        if n % 2 == 0:",
        "            total += n",
        "    return total",
    ],
    tests=[_tse_1, _tse_2, _tse_3, _tse_4],
)
207
+
208
# Injected bug: the slice step is -2 instead of -1, so every other character
# is skipped while walking the string backwards.
# NOTE(review): the snippet body shows a single leading space in this view;
# it is reproduced unchanged — confirm against the original file.
TASK_REVERSE_WRONG_STEP = _t(
    "reverse_string_wrong_step",
    "Debug the reverse_string function so it passes all tests.",
    code=["def reverse_string(s):", " return s[::-2]"],
    solution=["def reverse_string(s):", " return s[::-1]"],
    tests=[_trs_1, _trs_2, _trs_3, _trs_4],
    difficulty="easy",
    bug_type="off_by_one",
)
223
+
224
# Injected bug: the slice step is +1, so the string is returned unchanged
# instead of reversed.
# NOTE(review): the snippet body shows a single leading space in this view;
# it is reproduced unchanged — confirm against the original file.
TASK_REVERSE_NO_REVERSE = _t(
    "reverse_string_returns_original",
    "Debug the reverse_string function so it passes all tests.",
    code=["def reverse_string(s):", " return s[::1]"],
    solution=["def reverse_string(s):", " return s[::-1]"],
    tests=[_trs_1, _trs_2, _trs_3, _trs_4],
    difficulty="easy",
    bug_type="wrong_operator",
)
239
+
240
+
241
+ # ── MEDIUM ────────────────────────────────────────────────────────────────
242
+
243
# Injected bug: `right` starts at len(arr) instead of len(arr) - 1, so
# `arr[mid]` can index one position past the end of the list.
# Snippet lines carry explicit 4-space nesting so that the joined snippet
# parses as Python.
TASK_BS_OFF_BY_ONE = _t(
    name="binary_search_off_by_one",
    description="Debug the binary_search function so it passes all tests.",
    difficulty="medium",
    bug_type="off_by_one",
    code=[
        "def binary_search(arr, target):",
        "    left, right = 0, len(arr)",
        "    while left <= right:",
        "        mid = (left + right) // 2",
        "        if arr[mid] == target:",
        "            return mid",
        "        elif arr[mid] < target:",
        "            left = mid + 1",
        "        else:",
        "            right = mid - 1",
        "    return -1",
    ],
    solution=[
        "def binary_search(arr, target):",
        "    left, right = 0, len(arr) - 1",
        "    while left <= right:",
        "        mid = (left + right) // 2",
        "        if arr[mid] == target:",
        "            return mid",
        "        elif arr[mid] < target:",
        "            left = mid + 1",
        "        else:",
        "            right = mid - 1",
        "    return -1",
    ],
    tests=[_tbs_1, _tbs_2, _tbs_3, _tbs_4],
)
276
+
277
# Injected bug: `mid` is computed as left + right (no halving); the extra
# `mid >= len(arr)` guard in the buggy snippet returns -1 instead of indexing
# out of range. The solution drops the guard and uses integer halving.
# Snippet lines carry explicit 4-space nesting so that the joined snippet
# parses as Python.
TASK_BS_WRONG_MID = _t(
    name="binary_search_wrong_mid",
    description="Debug the binary_search function so it passes all tests.",
    difficulty="medium",
    bug_type="wrong_operator",
    code=[
        "def binary_search(arr, target):",
        "    left, right = 0, len(arr) - 1",
        "    while left <= right:",
        "        mid = left + right",
        "        if mid >= len(arr):",
        "            return -1",
        "        if arr[mid] == target:",
        "            return mid",
        "        elif arr[mid] < target:",
        "            left = mid + 1",
        "        else:",
        "            right = mid - 1",
        "    return -1",
    ],
    solution=[
        "def binary_search(arr, target):",
        "    left, right = 0, len(arr) - 1",
        "    while left <= right:",
        "        mid = (left + right) // 2",
        "        if arr[mid] == target:",
        "            return mid",
        "        elif arr[mid] < target:",
        "            left = mid + 1",
        "        else:",
        "            right = mid - 1",
        "    return -1",
    ],
    tests=[_tbs_1, _tbs_2, _tbs_3, _tbs_4],
)
312
+
313
# Injected bug: nested lists are appended as-is instead of being flattened
# recursively with `result.extend(flatten(item))`.
# Snippet lines carry explicit 4-space nesting so that the joined snippet
# parses as Python.
TASK_FLATTEN_APPEND = _t(
    name="flatten_missing_recursion",
    description="Debug the flatten function so it passes all tests.",
    difficulty="medium",
    bug_type="wrong_function_call",
    code=[
        "def flatten(lst):",
        "    result = []",
        "    for item in lst:",
        "        if isinstance(item, list):",
        "            result.append(item)",
        "        else:",
        "            result.append(item)",
        "    return result",
    ],
    solution=[
        "def flatten(lst):",
        "    result = []",
        "    for item in lst:",
        "        if isinstance(item, list):",
        "            result.extend(flatten(item))",
        "        else:",
        "            result.append(item)",
        "    return result",
    ],
    tests=[_tfl_1, _tfl_2, _tfl_3, _tfl_4],
)
340
+
341
# Injected bug: the isinstance check is negated, so recursion is applied to
# non-list items while nested lists are appended whole.
# Snippet lines carry explicit 4-space nesting so that the joined snippet
# parses as Python.
TASK_FLATTEN_LOGIC_INVERT = _t(
    name="flatten_inverted_branch",
    description="Debug the flatten function so it passes all tests.",
    difficulty="medium",
    bug_type="logic_inversion",
    code=[
        "def flatten(lst):",
        "    result = []",
        "    for item in lst:",
        "        if not isinstance(item, list):",
        "            result.extend(flatten(item))",
        "        else:",
        "            result.append(item)",
        "    return result",
    ],
    solution=[
        "def flatten(lst):",
        "    result = []",
        "    for item in lst:",
        "        if isinstance(item, list):",
        "            result.extend(flatten(item))",
        "        else:",
        "            result.append(item)",
        "    return result",
    ],
    tests=[_tfl_1, _tfl_2, _tfl_3, _tfl_4],
)
368
+
369
# Injected bug: the `text = text.lower()` normalisation step is missing, so
# words differing only in case are counted separately.
# NOTE(review): bug_type is "missing_return" although the snippet still
# returns `counts`; the label is kept in case consumers key on it — confirm.
# Snippet lines carry explicit 4-space nesting so that the joined snippet
# parses as Python.
TASK_WC_NO_LOWER = _t(
    name="word_count_no_lower",
    description="Debug the word_count function so it passes all tests.",
    difficulty="medium",
    bug_type="missing_return",
    code=[
        "import string",
        "def word_count(text):",
        "    for p in string.punctuation:",
        "        text = text.replace(p, '')",
        "    words = text.split()",
        "    counts = {}",
        "    for w in words:",
        "        counts[w] = counts.get(w, 0) + 1",
        "    return counts",
    ],
    solution=[
        "import string",
        "def word_count(text):",
        "    text = text.lower()",
        "    for p in string.punctuation:",
        "        text = text.replace(p, '')",
        "    words = text.split()",
        "    counts = {}",
        "    for w in words:",
        "        counts[w] = counts.get(w, 0) + 1",
        "    return counts",
    ],
    tests=[_twc_1, _twc_2, _twc_3, _twc_4],
)
399
+
400
# Injected bug: punctuation is never stripped (the `string.punctuation` loop
# and its `import string` are absent), so tokens keep attached punctuation.
# NOTE(review): bug_type is "missing_return" although the snippet still
# returns `counts`; the label is kept in case consumers key on it — confirm.
# Snippet lines carry explicit 4-space nesting so that the joined snippet
# parses as Python.
TASK_WC_NO_PUNCT = _t(
    name="word_count_no_punct_strip",
    description="Debug the word_count function so it passes all tests.",
    difficulty="medium",
    bug_type="missing_return",
    code=[
        "def word_count(text):",
        "    text = text.lower()",
        "    words = text.split()",
        "    counts = {}",
        "    for w in words:",
        "        counts[w] = counts.get(w, 0) + 1",
        "    return counts",
    ],
    solution=[
        "import string",
        "def word_count(text):",
        "    text = text.lower()",
        "    for p in string.punctuation:",
        "        text = text.replace(p, '')",
        "    words = text.split()",
        "    counts = {}",
        "    for w in words:",
        "        counts[w] = counts.get(w, 0) + 1",
        "    return counts",
    ],
    tests=[_twc_1, _twc_2, _twc_3, _twc_4],
)
428
+
429
+
430
+ # ── HARD ──────────────────────────────────────────────────────────────────
431
+
432
# Injected bug: the eviction threshold in put() is off by one (`>` instead of
# `>=`), so the cache grows to capacity + 1 entries before anything is
# evicted and the least-recently-used key survives one insert too long.
# The buggy snippet must differ from the solution; otherwise the task starts
# out already solved.
# Snippet lines carry explicit 4-space nesting so that the joined snippet
# parses as Python.
TASK_LRU_WRONG_EVICT = _t(
    name="lru_cache_wrong_eviction",
    description="Debug the LRUCache function so it passes all tests.",
    difficulty="hard",
    bug_type="off_by_one",
    code=[
        "class LRUCache:",
        "    def __init__(self, capacity):",
        "        self.capacity = capacity",
        "        self.cache = []",
        "    def get(self, key):",
        "        for i, (k, v) in enumerate(self.cache):",
        "            if k == key:",
        "                self.cache.append(self.cache.pop(i))",
        "                return v",
        "        return -1",
        "    def put(self, key, value):",
        "        for i, (k, _) in enumerate(self.cache):",
        "            if k == key:",
        "                self.cache.pop(i)",
        "                break",
        "        if len(self.cache) > self.capacity:",
        "            self.cache.pop(0)",
        "        self.cache.append((key, value))",
    ],
    solution=[
        "class LRUCache:",
        "    def __init__(self, capacity):",
        "        self.capacity = capacity",
        "        self.cache = []",
        "    def get(self, key):",
        "        for i, (k, v) in enumerate(self.cache):",
        "            if k == key:",
        "                self.cache.append(self.cache.pop(i))",
        "                return v",
        "        return -1",
        "    def put(self, key, value):",
        "        for i, (k, _) in enumerate(self.cache):",
        "            if k == key:",
        "                self.cache.pop(i)",
        "                break",
        "        if len(self.cache) >= self.capacity:",
        "            self.cache.pop(0)",
        "        self.cache.append((key, value))",
    ],
    tests=[_tlru_1, _tlru_2, _tlru_3],
)
479
+
480
# Injected bug: get() returns the value without moving the entry to the end
# of the list, so a read does not refresh the key's recency.
# NOTE(review): bug_type is "missing_return" although get() does return; the
# label is kept in case consumers key on it — confirm.
# Snippet lines carry explicit 4-space nesting so that the joined snippet
# parses as Python.
TASK_LRU_NO_PROMOTE = _t(
    name="lru_cache_no_promotion",
    description="Debug the LRUCache function so it passes all tests.",
    difficulty="hard",
    bug_type="missing_return",
    code=[
        "class LRUCache:",
        "    def __init__(self, capacity):",
        "        self.capacity = capacity",
        "        self.cache = []",
        "    def get(self, key):",
        "        for i, (k, v) in enumerate(self.cache):",
        "            if k == key:",
        "                return v",
        "        return -1",
        "    def put(self, key, value):",
        "        for i, (k, _) in enumerate(self.cache):",
        "            if k == key:",
        "                self.cache.pop(i)",
        "                break",
        "        if len(self.cache) >= self.capacity:",
        "            self.cache.pop(0)",
        "        self.cache.append((key, value))",
    ],
    solution=[
        "class LRUCache:",
        "    def __init__(self, capacity):",
        "        self.capacity = capacity",
        "        self.cache = []",
        "    def get(self, key):",
        "        for i, (k, v) in enumerate(self.cache):",
        "            if k == key:",
        "                self.cache.append(self.cache.pop(i))",
        "                return v",
        "        return -1",
        "    def put(self, key, value):",
        "        for i, (k, _) in enumerate(self.cache):",
        "            if k == key:",
        "                self.cache.pop(i)",
        "                break",
        "        if len(self.cache) >= self.capacity:",
        "            self.cache.pop(0)",
        "        self.cache.append((key, value))",
    ],
    tests=[_tlru_1, _tlru_2, _tlru_3],
)
526
+
527
# Injected bug: the closer -> opener mapping is rotated, so each closing
# bracket is compared against the wrong opening bracket.
# Snippet lines carry explicit 4-space nesting so that the joined snippet
# parses as Python.
TASK_VP_WRONG_MAPPING = _t(
    name="valid_parentheses_wrong_mapping",
    description="Debug the is_valid function so it passes all tests.",
    difficulty="hard",
    bug_type="wrong_operator",
    code=[
        "def is_valid(s):",
        "    stack = []",
        "    mapping = {')': '[', ']': '{', '}': '('}",
        "    for c in s:",
        "        if c in mapping.values():",
        "            stack.append(c)",
        "        elif c in mapping:",
        "            if not stack or stack.pop() != mapping[c]:",
        "                return False",
        "    return len(stack) == 0",
    ],
    solution=[
        "def is_valid(s):",
        "    stack = []",
        "    mapping = {')': '(', ']': '[', '}': '{'}",
        "    for c in s:",
        "        if c in mapping.values():",
        "            stack.append(c)",
        "        elif c in mapping:",
        "            if not stack or stack.pop() != mapping[c]:",
        "                return False",
        "    return len(stack) == 0",
    ],
    tests=[_tvp_1, _tvp_2, _tvp_3, _tvp_4],
)
558
+
559
def _tvp_5(ns):
    """Check that a lone closing bracket is rejected.

    This is the only input in the task's test list that reaches the
    closing-bracket branch with an empty stack, which is the case the
    injected bug mishandles.
    """
    res = ns["is_valid"](")")
    assert res == False, f"Test failed: input=')', expected=False, got={res}"


# Injected bug: the `not stack` guard is missing, so a closing bracket seen
# with an empty stack calls pop() on an empty list instead of returning False.
# The inputs "()", "(]", "([{}])" and "" never hit that path, so `_tvp_5` is
# included to make the buggy snippet fail at least one test.
# NOTE(review): bug_type is "logic_inversion" although the bug is a dropped
# guard; the label is kept in case consumers key on it — confirm.
# Snippet lines carry explicit 4-space nesting so that the joined snippet
# parses as Python.
TASK_VP_MISSING_EMPTY_CHECK = _t(
    name="valid_parentheses_no_empty_check",
    description="Debug the is_valid function so it passes all tests.",
    difficulty="hard",
    bug_type="logic_inversion",
    code=[
        "def is_valid(s):",
        "    stack = []",
        "    mapping = {')': '(', ']': '[', '}': '{'}",
        "    for c in s:",
        "        if c in mapping.values():",
        "            stack.append(c)",
        "        elif c in mapping:",
        "            if stack.pop() != mapping[c]:",
        "                return False",
        "    return len(stack) == 0",
    ],
    solution=[
        "def is_valid(s):",
        "    stack = []",
        "    mapping = {')': '(', ']': '[', '}': '{'}",
        "    for c in s:",
        "        if c in mapping.values():",
        "            stack.append(c)",
        "        elif c in mapping:",
        "            if not stack or stack.pop() != mapping[c]:",
        "                return False",
        "    return len(stack) == 0",
    ],
    tests=[_tvp_1, _tvp_2, _tvp_3, _tvp_4, _tvp_5],
)
590
+
591
# Injected bug: the "start a new interval" test uses `<=`, so intervals that
# merely touch (e.g. [1, 4] and [4, 5]) are not merged.
# The reference solution uses `<` together with max(): with `<=` or min() it
# would itself fail the touching-intervals test ([[1, 4], [4, 5]] -> [[1, 5]]).
# Snippet lines carry explicit 4-space nesting so that the joined snippet
# parses as Python.
TASK_MI_STRICT_OVERLAP = _t(
    name="merge_intervals_strict_overlap",
    description="Debug the merge_intervals function so it passes all tests.",
    difficulty="hard",
    bug_type="wrong_operator",
    code=[
        "def merge_intervals(intervals):",
        "    intervals.sort()",
        "    merged = []",
        "    for interval in intervals:",
        "        if not merged or merged[-1][1] <= interval[0]:",
        "            merged.append(list(interval))",
        "        else:",
        "            merged[-1][1] = max(merged[-1][1], interval[1])",
        "    return merged",
    ],
    solution=[
        "def merge_intervals(intervals):",
        "    intervals.sort()",
        "    merged = []",
        "    for interval in intervals:",
        "        if not merged or merged[-1][1] < interval[0]:",
        "            merged.append(list(interval))",
        "        else:",
        "            merged[-1][1] = max(merged[-1][1], interval[1])",
        "    return merged",
    ],
    tests=[_tmi_1, _tmi_2, _tmi_3],
)
620
+
621
def _tmi_4(ns):
    """Check that intervals given out of order are still merged.

    The other merge_intervals tests all pass inputs that are already sorted,
    so this is the only one that depends on the function sorting first.
    """
    res = ns["merge_intervals"]([[2, 6], [1, 3]])
    assert res == [[1, 6]], f"Test failed: input=[[2, 6], [1, 3]], expected=[[1, 6]], got={res}"


# Injected bug: the `intervals.sort()` call is missing, so the single pass
# only works when the input already arrives sorted by start.
# `_tmi_4` supplies unsorted input; without it the buggy snippet passes every
# test in the list.
# NOTE(review): bug_type is "missing_return" although the snippet returns
# `merged`; the label is kept in case consumers key on it — confirm.
# Snippet lines carry explicit 4-space nesting so that the joined snippet
# parses as Python.
TASK_MI_NO_SORT = _t(
    name="merge_intervals_missing_sort",
    description="Debug the merge_intervals function so it passes all tests.",
    difficulty="hard",
    bug_type="missing_return",
    code=[
        "def merge_intervals(intervals):",
        "    merged = []",
        "    for interval in intervals:",
        "        if not merged or merged[-1][1] < interval[0]:",
        "            merged.append(list(interval))",
        "        else:",
        "            merged[-1][1] = max(merged[-1][1], interval[1])",
        "    return merged",
    ],
    solution=[
        "def merge_intervals(intervals):",
        "    intervals.sort()",
        "    merged = []",
        "    for interval in intervals:",
        "        if not merged or merged[-1][1] < interval[0]:",
        "            merged.append(list(interval))",
        "        else:",
        "            merged[-1][1] = max(merged[-1][1], interval[1])",
        "    return merged",
    ],
    tests=[_tmi_1, _tmi_2, _tmi_3, _tmi_4],
)
649
+
650
+
651
+ # ---------------------------------------------------------------------------
652
+ # Registries
653
+ # ---------------------------------------------------------------------------
654
+
655
# Task records grouped by difficulty tier; tiers are inserted in the order
# easy, medium, hard.
TASKS_BY_DIFFICULTY: Dict[str, List[Dict]] = dict(
    easy=[
        TASK_SUM_EVEN_WRONG_OP,
        TASK_SUM_EVEN_MISSING_INIT,
        TASK_REVERSE_WRONG_STEP,
        TASK_REVERSE_NO_REVERSE,
    ],
    medium=[
        TASK_BS_OFF_BY_ONE,
        TASK_BS_WRONG_MID,
        TASK_FLATTEN_APPEND,
        TASK_FLATTEN_LOGIC_INVERT,
        TASK_WC_NO_LOWER,
        TASK_WC_NO_PUNCT,
    ],
    hard=[
        TASK_LRU_WRONG_EVICT,
        TASK_LRU_NO_PROMOTE,
        TASK_VP_WRONG_MAPPING,
        TASK_VP_MISSING_EMPTY_CHECK,
        TASK_MI_STRICT_OVERLAP,
        TASK_MI_NO_SORT,
    ],
)
679
+
680
+ # Flat list — used for random sampling when training_step is not set
681
# Every task record in one list, concatenated tier by tier in the iteration
# order of TASKS_BY_DIFFICULTY.
ALL_TASKS: List[Dict] = [
    task
    for tier_tasks in TASKS_BY_DIFFICULTY.values()
    for task in tier_tasks
]
my_env/uv.lock → uv.lock RENAMED
File without changes