databoysu committed on
Commit
100d601
·
1 Parent(s): ba3fae8
Files changed (7) hide show
  1. .gitignore +1 -0
  2. Dockerfile +52 -0
  3. README.md +4 -4
  4. client.py +4 -1
  5. inference.py +182 -321
  6. openenv.yaml +1 -1
  7. pre-val.sh +14 -6
.gitignore CHANGED
@@ -2,3 +2,4 @@
2
  .agents
3
  .env
4
  uv.lock
 
 
2
  .agents
3
  .env
4
  uv.lock
5
+ claude.md
Dockerfile ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
8
+ FROM ${BASE_IMAGE} AS builder
9
+
10
+ WORKDIR /app
11
+
12
+ RUN apt-get update && \
13
+ apt-get install -y --no-install-recommends git && \
14
+ rm -rf /var/lib/apt/lists/*
15
+
16
+ COPY . /app/env
17
+ WORKDIR /app/env
18
+
19
+ RUN if ! command -v uv >/dev/null 2>&1; then \
20
+ curl -LsSf https://astral.sh/uv/install.sh | sh && \
21
+ mv /root/.local/bin/uv /usr/local/bin/uv && \
22
+ mv /root/.local/bin/uvx /usr/local/bin/uvx; \
23
+ fi
24
+
25
+ RUN --mount=type=cache,target=/root/.cache/uv \
26
+ if [ -f uv.lock ]; then \
27
+ uv sync --frozen --no-install-project --no-editable; \
28
+ else \
29
+ uv sync --no-install-project --no-editable; \
30
+ fi
31
+
32
+ RUN --mount=type=cache,target=/root/.cache/uv \
33
+ if [ -f uv.lock ]; then \
34
+ uv sync --frozen --no-editable; \
35
+ else \
36
+ uv sync --no-editable; \
37
+ fi
38
+
39
+ FROM ${BASE_IMAGE}
40
+
41
+ WORKDIR /app
42
+
43
+ COPY --from=builder /app/env/.venv /app/.venv
44
+ COPY --from=builder /app/env /app/env
45
+
46
+ ENV PATH="/app/.venv/bin:$PATH"
47
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
48
+
49
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
50
+ CMD curl -f http://localhost:7860/health || exit 1
51
+
52
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 7860"]
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: blue
5
  colorTo: cyan
6
  sdk: docker
7
  pinned: false
8
- app_port: 8000
9
  base_path: /web
10
  tags:
11
  - openenv
@@ -30,7 +30,7 @@ iteratively viewing, editing, and testing code snippets until all tests pass.
30
 
31
  ```bash
32
  uv sync
33
- uv run --project . server --port 8000
34
  ```
35
 
36
  Server endpoints:
@@ -48,8 +48,8 @@ openenv push
48
 
49
  ## Validate Submission
50
 
51
- From repo root (`RL_ENV_FINAL`):
52
 
53
  ```bash
54
- ./pre-val.sh https://<your-space>.hf.space ./my_env
55
  ```
 
5
  colorTo: cyan
6
  sdk: docker
7
  pinned: false
8
+ app_port: 7860
9
  base_path: /web
10
  tags:
11
  - openenv
 
30
 
31
  ```bash
32
  uv sync
33
+ uv run --project . server --port 7860
34
  ```
35
 
36
  Server endpoints:
 
48
 
49
  ## Validate Submission
50
 
51
+ From the repo root:
52
 
53
  ```bash
54
+ ./pre-val.sh https://<your-space>.hf.space .
55
  ```
client.py CHANGED
@@ -12,7 +12,10 @@ from openenv.core import EnvClient
12
  from openenv.core.client_types import StepResult
13
  from openenv.core.env_server.types import State
14
 
15
- from .models import CodeAction, CodeObservation, TestResult
 
 
 
16
 
17
 
18
  class MyEnv(
 
12
  from openenv.core.client_types import StepResult
13
  from openenv.core.env_server.types import State
14
 
15
+ try:
16
+ from .models import CodeAction, CodeObservation, TestResult
17
+ except ImportError:
18
+ from models import CodeAction, CodeObservation, TestResult
19
 
20
 
21
  class MyEnv(
inference.py CHANGED
@@ -1,21 +1,16 @@
1
  """
2
- inference.py Baseline Agent for Python Debugging Gym
3
- =======================================================
4
- Hackathon-compliant baseline script. Connects to the PythonDebuggingGym
5
- WebSocket server and drives an OpenAI-compatible LLM to find and fix bugs.
6
-
7
- Required environment variables:
8
- HF_TOKEN API key / HuggingFace token passed as Bearer auth
9
- MODEL_NAME Model identifier (default: nvidia/nemotron-3-nano-4b)
10
- API_BASE_URL OpenAI-compatible base URL (default: https://api.openai.com/v1)
11
-
12
- Optional environment variables:
13
- ENV_WS_URL WebSocket URL for the gym (default: ws://localhost:8000/ws)
14
-
15
- Mandatory stdout log lines (zero deviation in spacing or formatting):
16
- [START] task=<task_name> env=PythonDebuggingGym model=<model_name>
17
- [STEP] step=<n> action=<action_type> reward=<r.rr> done=<true|false> error=<msg|null>
18
- [END] success=<true|false> steps=<n> score=<s.sss> rewards=<r1,r2,...,rn>
19
  """
20
 
21
  from __future__ import annotations
@@ -23,198 +18,61 @@ from __future__ import annotations
23
  import asyncio
24
  import json
25
  import os
26
- import sys
27
  from typing import Any
28
 
29
- import websockets
30
  from openai import OpenAI
31
 
 
32
 
33
- # ---------------------------------------------------------------------------
34
- # Config (all readable from environment at import time)
35
- # ---------------------------------------------------------------------------
36
 
37
- API_BASE_URL: str = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
38
- MODEL_NAME: str = os.getenv("MODEL_NAME", "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8")
39
- HF_TOKEN: str = os.getenv("HF_TOKEN", "")
40
- ENV_WS_URL: str = os.getenv("ENV_WS_URL", "ws://localhost:7860/ws")
41
 
42
- # ---------------------------------------------------------------------------
43
- # OpenAI client
44
- # ---------------------------------------------------------------------------
 
 
45
 
46
- _client = OpenAI(
47
- api_key=HF_TOKEN or "sk-placeholder", # placeholder keeps the client from raising at init
48
- base_url=API_BASE_URL,
 
 
 
49
  )
50
 
51
- # ---------------------------------------------------------------------------
52
- # Agent instruction appended after the environment's own system prompt
53
- # ---------------------------------------------------------------------------
54
-
55
- _AGENT_SUFFIX = """\
56
-
57
- =======================================================================
58
- RESPONSE FORMAT (MANDATORY)
59
- =======================================================================
60
- Respond with ONLY a valid JSON object. No markdown, no code fences,
61
- no explanation text — just the raw JSON.
62
-
63
- Valid action schemas (choose exactly one per turn):
64
- {"action_type": "VIEW_CODE"}
65
- {"action_type": "RUN_TESTS"}
66
- {"action_type": "REPLACE_LINES", "start_line": N, "end_line": M, "new_code_block": "line1\\nline2"}
67
- {"action_type": "UNDO_EDIT"}
68
- {"action_type": "RESET_TO_ORIGINAL"}
69
- {"action_type": "SUBMIT"}
70
-
71
- Rules for REPLACE_LINES:
72
- - new_code_block: join multiple lines with \\n (literal backslash-n in the JSON string)
73
- - Include exact Python indentation (leading spaces) on every line
74
- - Do NOT include a trailing \\n character
75
- - After REPLACE_LINES, call VIEW_CODE to re-orient before the next edit
76
-
77
- Rules for UNDO_EDIT / RESET_TO_ORIGINAL:
78
- - UNDO_EDIT reverts the last REPLACE_LINES. Use when an edit made things worse.
79
- - RESET_TO_ORIGINAL restores the original broken code. Last resort only.
80
- - Both cost -0.10. Prefer fixing forward over backtracking.
81
- """
82
 
 
 
83
 
84
- # ---------------------------------------------------------------------------
85
- # Observation formatter
86
- # ---------------------------------------------------------------------------
87
-
88
- def _format_obs(obs: dict[str, Any]) -> str:
89
- """Convert a CodeObservation dict into a compact string for the LLM."""
90
- parts: list[str] = []
91
-
92
- if obs.get("syntax_error"):
93
- parts.append("⚠ SYNTAX ERROR in current code — fix indentation/brackets first.\n")
94
-
95
- localized = obs.get("localized_context", "")
96
- if localized:
97
- parts.append(f"[Context around last edit]\n{localized}\n")
98
-
99
- last_out = obs.get("last_execution_output", "")
100
- if last_out:
101
- parts.append(f"[Last execution output]\n{last_out}\n")
102
-
103
- test_results: list[dict] = obs.get("test_results", [])
104
- if test_results:
105
- lines = []
106
- for t in test_results:
107
- status = "PASS" if t.get("passed") else "FAIL"
108
- msg = t.get("error_message") or ""
109
- name = t.get("test_name", "?")
110
- lines.append(f" {status} {name}" + (f": {msg}" if msg else ""))
111
- parts.append("[Test results]\n" + "\n".join(lines) + "\n")
112
-
113
- remaining = obs.get("steps_remaining", 0)
114
- parts.append(f"[Steps remaining: {remaining}]")
115
-
116
- return "\n".join(parts)
117
-
118
-
119
- # ---------------------------------------------------------------------------
120
- # LLM call
121
- # ---------------------------------------------------------------------------
122
-
123
- _ACTION_SCHEMA = {
124
- "type": "json_schema",
125
- "json_schema": {
126
- "name": "CodeAction",
127
- "strict": True,
128
- "schema": {
129
- "type": "object",
130
- "properties": {
131
- "thought": {
132
- "type": "string",
133
- "description": "Mandatory reasoning before selecting action_type.",
134
- },
135
- "action_type": {
136
- "type": "string",
137
- "enum": [
138
- "VIEW_CODE", "RUN_TESTS", "REPLACE_LINES",
139
- "UNDO_EDIT", "RESET_TO_ORIGINAL", "SUBMIT",
140
- ],
141
- },
142
- "start_line": {"type": ["integer", "null"]},
143
- "end_line": {"type": ["integer", "null"]},
144
- "new_code_block": {"type": ["string", "null"]},
145
- },
146
- "required": ["thought", "action_type"],
147
- "additionalProperties": False,
148
- },
149
- },
150
- }
151
-
152
-
153
- def _call_llm(system_prompt: str, messages: list[dict]) -> str:
154
- """
155
- Call the configured LLM and return the raw text reply.
156
-
157
- Tries json_schema structured output first (LM Studio / vLLM / newer
158
- llama.cpp all support this). Falls back to a plain call if the backend
159
- raises an error for the response_format parameter — _extract_json()
160
- then handles extraction from free-form text.
161
- """
162
- base_kwargs: dict = dict(
163
- model=MODEL_NAME,
164
- messages=[
165
- {"role": "system", "content": system_prompt + _AGENT_SUFFIX},
166
- *messages,
167
- ],
168
- temperature=0.0,
169
- )
170
- try:
171
- response = _client.chat.completions.create(
172
- **base_kwargs,
173
- response_format=_ACTION_SCHEMA,
174
- )
175
- except Exception:
176
- # Backend doesn't support json_schema — fall back to free-form
177
- response = _client.chat.completions.create(**base_kwargs)
178
-
179
- msg = response.choices[0].message
180
- content = msg.content
181
-
182
- # Fallback for reasoning models (e.g., via LM Studio) that place their
183
- # entire output in the reasoning_content field instead of content.
184
- if not content:
185
- try:
186
- msg_dict = msg.model_dump()
187
- content = msg_dict.get("reasoning_content", "") or ""
188
- except AttributeError:
189
- pass
190
-
191
- return content or ""
192
 
 
 
 
 
 
 
193
 
194
- # ---------------------------------------------------------------------------
195
- # Constrained JSON extraction (works with any local or cloud model)
196
- # ---------------------------------------------------------------------------
197
 
198
- def _extract_json(text: str) -> dict:
199
- """
200
- Best-effort JSON extraction from raw LLM output.
 
 
 
201
 
202
- Tries in order:
203
- 1. Direct json.loads (model produced clean JSON)
204
- 2. Strip ```json ... ``` / ``` ... ``` markdown fences
205
- 3. Regex: grab first {...} block in the text
206
- 4. Safe fallback: {"action_type": "VIEW_CODE"}
207
- """
208
- import re
209
 
210
- # 1. Direct parse
211
  stripped = text.strip()
212
  try:
213
  return json.loads(stripped)
214
  except json.JSONDecodeError:
215
  pass
216
 
217
- # 2. Markdown code fence ```json\n{...}\n```
218
  fence = re.search(r"```(?:json)?\s*({.*?})\s*```", stripped, re.DOTALL)
219
  if fence:
220
  try:
@@ -222,155 +80,158 @@ def _extract_json(text: str) -> dict:
222
  except json.JSONDecodeError:
223
  pass
224
 
225
- # 3. First {...} block anywhere in the text
226
- brace = re.search(r"({.*?})", stripped, re.DOTALL)
227
- if brace:
228
  try:
229
- return json.loads(brace.group(1))
230
  except json.JSONDecodeError:
231
  pass
232
 
233
- # All extraction attempts failed.
234
- # Return an invalid action_type so Pydantic rejects it at the server,
235
- # the server returns an error envelope, and THAT error is fed back to
236
- # the LLM on the next turn — breaking the silent mask loop.
237
- # DO NOT default to VIEW_CODE here.
238
- return {"action_type": "PARSE_ERROR", "thought": f"Failed to parse LLM output as JSON: {text[:120]}"}
239
-
240
-
241
- # ---------------------------------------------------------------------------
242
- # Episode runner
243
- # ---------------------------------------------------------------------------
244
-
245
- async def run_episode(difficulty: str = None, show_thought: bool = False) -> None:
246
- """
247
- Connect to the gym, run one full episode with an LLM agent,
248
- and emit the three required log lines.
249
- """
250
- rewards: list[float] = []
251
- step: int = 0
252
- system_prompt: str = ""
253
- task_name: str = "unknown"
254
- messages: list[dict] = []
255
- success: bool = False
256
- obs: dict = {}
257
-
258
- ws_url = ENV_WS_URL
259
- if difficulty:
260
- separator = "&" if "?" in ws_url else "?"
261
- ws_url = f"{ws_url}{separator}difficulty={difficulty}"
262
-
263
- async with websockets.connect(ws_url) as ws:
264
-
265
- # ── Receive initial observation + system prompt ──────────────────
266
- raw = await ws.recv()
267
- data = json.loads(raw)
268
-
269
- system_prompt = data.get("info", {}).get("system_prompt", "")
270
- obs = data.get("observation", {})
271
- task_name = obs.get("info", {}).get("task_name", "unknown")
272
-
273
- # ── [START] log line ─────────────────────────────────────────────
274
- print(
275
- f"[START] task={task_name} env=PythonDebuggingGym model={MODEL_NAME}",
276
- flush=True,
277
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
 
279
- # ── RL loop ──────────────────────────────────────────────────────
280
- while True:
281
- step += 1
282
- error_str = "null"
283
- action_type = "VIEW_CODE" # will be overwritten by a real parse
284
 
285
- # Build observation message for the LLM
286
- obs_text = _format_obs(obs)
287
- messages.append({"role": "user", "content": obs_text})
 
 
 
 
 
 
288
 
289
- # Call LLM
290
- try:
291
- llm_reply = _call_llm(system_prompt, messages)
292
- if os.getenv("DEBUG_LOG") == "1":
293
- print(f"\n[DEBUG RAW LLM]: {llm_reply}\n", flush=True) # see what model actually outputs
294
- action_json = _extract_json(llm_reply)
295
- action_type = action_json.get("action_type", "VIEW_CODE")
296
- messages.append({"role": "assistant", "content": llm_reply})
297
- except Exception as exc:
298
- # LLM call itself failed — surface error in log, do NOT mask as VIEW_CODE.
299
- # Send a harmless VIEW_CODE this turn but pass the error text back as
300
- # the next user message so the model sees what went wrong.
301
- error_str = str(exc).replace("\n", " ")[:200]
302
- action_type = "VIEW_CODE"
303
- action_json = {"action_type": "VIEW_CODE"}
304
- messages.append({"role": "user", "content": f"[SYSTEM ERROR] {error_str}"})
305
-
306
- if show_thought:
307
- thought = action_json.get("thought", "")
308
- if thought:
309
- print(f"\n[THOUGHT]: {thought}\n", flush=True)
310
-
311
- # Send action to the environment
312
- await ws.send(json.dumps({"action": action_json}))
313
-
314
- # Receive response
315
- raw = await ws.recv()
316
- data = json.loads(raw)
317
-
318
- # Server may return a validation-error envelope (no "observation" key)
319
- if "observation" not in data:
320
- error_str = str(data.get("error", "server_error"))[:200]
321
- reward, done = 0.0, False
322
- else:
323
- reward = float(data.get("reward", 0.0))
324
- done = bool(data.get("done", False))
325
- obs = data.get("observation", {})
326
-
327
- if done:
328
- test_results = obs.get("test_results", [])
329
- total = len(test_results)
330
- passes = sum(1 for t in test_results if t.get("passed"))
331
- success = (total > 0 and passes == total)
332
 
333
- rewards.append(reward)
 
334
 
335
- # ── [STEP] log line ──────────────────────────────────────────
336
- done_str = "true" if done else "false"
337
- print(
338
- f"[STEP] step={step} action={action_type} "
339
- f"reward={reward:.2f} done={done_str} error={error_str}",
340
- flush=True,
341
- )
342
 
343
- if done:
344
- break # server will auto-reset, but we exit after one episode
345
-
346
- # ── [END] log line ───────────────────────────────────────────────────────
347
- success_str = "true" if success else "false"
348
- # Pull clamped final_score from info dict if available, else derive from rewards
349
- final_score = data.get("info", {}).get("final_score", None) if done else None
350
- if final_score is None:
351
- final_score = max(0.0, min(1.0, sum(rewards)))
352
- rewards_str = ",".join(f"{r:.2f}" for r in rewards)
353
- print(
354
- f"[END] success={success_str} steps={step} score={final_score:.3f} rewards={rewards_str}",
355
- flush=True,
356
- )
 
 
 
 
357
 
 
 
 
358
 
359
- # ---------------------------------------------------------------------------
360
- # Entry point
361
- # ---------------------------------------------------------------------------
 
362
 
363
- def main() -> None:
364
- import argparse
365
- parser = argparse.ArgumentParser(description="Run the Python debugging agent.")
366
- parser.add_argument("--easy", action="store_const", dest="difficulty", const="easy", help="Run an easy task.")
367
- parser.add_argument("--medium", action="store_const", dest="difficulty", const="medium", help="Run a medium task.")
368
- parser.add_argument("--hard", action="store_const", dest="difficulty", const="hard", help="Run a hard task.")
369
- parser.add_argument("--thought", action="store_true", dest="show_thought", help="Print the agent's chain-of-thought reasoning.")
370
-
371
- args = parser.parse_args()
372
- asyncio.run(run_episode(difficulty=args.difficulty, show_thought=args.show_thought))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
 
374
 
375
  if __name__ == "__main__":
376
- main()
 
1
  """
2
+ Inference script for Python Debugging Gym.
3
+
4
+ Mandatory env vars expected in deployment config:
5
+ API_BASE_URL
6
+ MODEL_NAME
7
+ HF_TOKEN
8
+ LOCAL_IMAGE_NAME (required if using MyEnv.from_docker_image)
9
+
10
+ This script prints exactly:
11
+ [START] ...
12
+ [STEP] ...
13
+ [END] ...
 
 
 
 
 
14
  """
15
 
16
  from __future__ import annotations
 
18
  import asyncio
19
  import json
20
  import os
21
+ import re
22
  from typing import Any
23
 
 
24
  from openai import OpenAI
25
 
26
+ from my_env import CodeAction, MyEnv
27
 
 
 
 
28
 
29
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
30
+ MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
31
+ HF_TOKEN = os.getenv("HF_TOKEN", "")
32
+ LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME", "")
33
 
34
+ ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:7860")
35
+ TASK_NAME = os.getenv("TASK_NAME", "python_debugging_gym")
36
+ BENCHMARK = os.getenv("BENCHMARK", "python_debugging_gym")
37
+ MAX_STEPS = int(os.getenv("MAX_STEPS", "50"))
38
+ SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.99"))
39
 
40
+ SYSTEM_PROMPT = (
41
+ "You are controlling a Python debugging RL environment. "
42
+ "Return only JSON for one action.\n"
43
+ 'Allowed action_type values: VIEW_CODE, RUN_TESTS, REPLACE_LINES, UNDO_EDIT, RESET_TO_ORIGINAL, SUBMIT.\n'
44
+ "For REPLACE_LINES include start_line, end_line, new_code_block.\n"
45
+ "Prefer RUN_TESTS after edits and SUBMIT only when all tests pass."
46
  )
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
def log_start(task: str, env: str, model: str) -> None:
    """Write the ``[START]`` record for an episode to stdout.

    Args:
        task: Task identifier reported in the ``task=`` field.
        env: Environment/benchmark name reported in the ``env=`` field.
        model: Model identifier reported in the ``model=`` field.
    """
    record = f"[START] task={task} env={env} model={model}"
    # Flush immediately so the record is visible even if the process dies.
    print(record, flush=True)
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
def log_step(step: int, action: str, reward: float, done: bool, error: str | None) -> None:
    """Write one ``[STEP]`` record to stdout.

    Args:
        step: 1-based step index.
        action: Name of the action that was executed.
        reward: Reward for the step, rendered with two decimals.
        done: Episode-finished flag, rendered lowercase (``true``/``false``).
        error: Error text for the step; ``None`` or an empty string is
            rendered as the literal ``null``.
    """
    done_text = str(done).lower()
    error_text = error or "null"
    record = f"[STEP] step={step} action={action} reward={reward:.2f} done={done_text} error={error_text}"
    print(record, flush=True)
59
 
 
 
 
60
 
61
def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
    """Write the closing ``[END]`` record to stdout.

    Args:
        success: Episode success flag, rendered lowercase.
        steps: Number of steps taken.
        score: Final score, rendered with three decimals.
        rewards: Per-step rewards, rendered as a comma-separated list with
            two decimals each (empty when no rewards were collected).
    """
    formatted_rewards = [format(value, ".2f") for value in rewards]
    rewards_str = ",".join(formatted_rewards)
    success_text = str(success).lower()
    record = f"[END] success={success_text} steps={steps} score={score:.3f} rewards={rewards_str}"
    print(record, flush=True)
67
 
 
 
 
 
 
 
 
68
 
69
def _first_json_object(text: str) -> dict[str, Any] | None:
    """Return the first decodable JSON object embedded in *text*, or None."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one complete value starting at `start` and
            # ignores trailing text, so nested braces and braces inside
            # string values are handled by the real JSON parser.
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = text.find("{", start + 1)
    return None


def _extract_json(text: str) -> dict[str, Any]:
    """Best-effort extraction of a JSON object from raw model output.

    Attempts, in order:
      1. parse the whole (stripped) text as JSON;
      2. parse the first JSON object inside a Markdown code fence;
      3. parse the first JSON object found anywhere in the text.

    Only JSON *objects* are accepted; a reply that parses to a list, number
    or string is treated as unparseable so callers always receive a dict.

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded object, or ``{"action_type": "RUN_TESTS"}`` when no JSON
        object can be recovered.
    """
    stripped = text.strip()

    try:
        parsed = json.loads(stripped)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Fenced blocks take priority over loose braces elsewhere in the reply.
    for fenced in re.findall(r"```(?:json)?\s*(.*?)```", stripped, re.DOTALL):
        found = _first_json_object(fenced)
        if found is not None:
            return found

    found = _first_json_object(stripped)
    if found is not None:
        return found

    return {"action_type": "RUN_TESTS"}
91
+
92
+
93
def _build_observation_text(observation: Any) -> str:
    """Render an observation as the plain-text summary sent to the model.

    Reads ``step_count``, ``steps_remaining``, ``syntax_error``,
    ``localized_context``, ``last_execution_output`` and ``code_lines`` from
    *observation*. Only the first 30 entries of ``code_lines`` are included;
    a missing/empty ``code_lines`` yields an empty preview.
    """
    lines = observation.code_lines
    preview = "\n".join(lines[:30]) if lines else ""

    # The two multi-line sections carry a trailing newline so that joining
    # with "\n" leaves a blank line after each of them.
    sections = [
        f"step_count={observation.step_count}",
        f"steps_remaining={observation.steps_remaining}",
        f"syntax_error={observation.syntax_error}",
        f"localized_context=\n{observation.localized_context}\n",
        f"last_execution_output=\n{observation.last_execution_output}\n",
        f"code_preview=\n{preview}",
    ]
    return "\n".join(sections)
103
+
104
+
105
def _get_model_action(client: OpenAI, observation: Any, history: list[str]) -> dict[str, Any]:
    """Ask the model for the next action and return it as a plain dict.

    The prompt contains the rendered observation plus the last five history
    entries (or ``none``). The call is best-effort: any failure while calling
    the model or parsing its reply, and any reply that is not a JSON object
    with a recognised ``action_type``, results in the safe default action
    ``{"action_type": "RUN_TESTS"}`` instead of an exception.

    Args:
        client: OpenAI-compatible client used for the chat completion.
        observation: Current environment observation.
        history: Short textual log of previous steps.

    Returns:
        A dict whose ``action_type`` is one of the allowed action names.
    """
    # A tuple (not a set) so that an unhashable action_type coming back from
    # the model (e.g. a list) compares unequal instead of raising TypeError.
    allowed_actions = (
        "VIEW_CODE",
        "RUN_TESTS",
        "REPLACE_LINES",
        "UNDO_EDIT",
        "RESET_TO_ORIGINAL",
        "SUBMIT",
    )
    fallback: dict[str, Any] = {"action_type": "RUN_TESTS"}

    obs_text = _build_observation_text(observation)
    recent_history = "\n".join(history[-5:]) if history else "none"
    user_prompt = (
        "Pick the single best next action and return only JSON.\n\n"
        f"{obs_text}\n\n"
        f"history:\n{recent_history}"
    )

    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.0,
            max_tokens=300,
            stream=False,
        )
        response_text = (completion.choices[0].message.content or "").strip()
        action = _extract_json(response_text)
    except Exception:
        # Deliberate best-effort: a failed model call must not abort the
        # episode, so fall back to a harmless action.
        return fallback

    # The parser may hand back any JSON value; only a dict with a known
    # action_type is usable by the caller.
    if not isinstance(action, dict) or action.get("action_type") not in allowed_actions:
        return fallback

    return action
139
+
140
+
141
def _to_code_action(action_dict: dict[str, Any]) -> CodeAction:
    """Convert a raw action dict into a ``CodeAction``.

    ``action_type`` defaults to ``RUN_TESTS`` when absent; ``thought``,
    ``start_line``, ``end_line`` and ``new_code_block`` are passed through
    (``None`` when absent). If constructing the action fails for any reason,
    a bare ``RUN_TESTS`` action is returned instead.
    """
    optional_fields = ("thought", "start_line", "end_line", "new_code_block")
    kwargs: dict[str, Any] = {"action_type": action_dict.get("action_type", "RUN_TESTS")}
    kwargs.update((name, action_dict.get(name)) for name in optional_fields)

    try:
        action = CodeAction(**kwargs)
    except Exception:
        # Construction rejected the payload; degrade to the safe default.
        action = CodeAction(action_type="RUN_TESTS")
    return action
153
 
 
 
 
 
 
154
 
155
def _compute_score(step_result: Any, rewards: list[float]) -> float:
    """Return the episode score clamped to ``[0.0, 1.0]``.

    The score is taken from ``final_score`` in the observation's ``metadata``
    mapping, then from its ``info`` mapping, and finally from the sum of
    *rewards* when neither provides a value.

    Args:
        step_result: Last step result; only ``step_result.observation`` is read.
        rewards: Per-step rewards collected during the episode.

    Returns:
        A float in ``[0.0, 1.0]``. A NaN score is reported as ``0.0``.
    """
    import math  # local import: not part of the module's top-level imports

    observation = step_result.observation

    raw = None
    for attr in ("metadata", "info"):
        # Either attribute may be missing or None on a given observation.
        mapping = getattr(observation, attr, None) or {}
        raw = mapping.get("final_score")
        if raw is not None:
            break
    if raw is None:
        raw = sum(rewards)

    try:
        value = float(raw)
    except (TypeError, ValueError):
        # Unusable final_score: fall back to the accumulated rewards.
        value = float(sum(rewards))

    # min(1.0, nan) evaluates to 1.0, so NaN must be rejected before clamping
    # or it would be reported as a perfect score.
    if math.isnan(value):
        return 0.0
    return max(0.0, min(1.0, value))
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
async def main() -> None:
    """Run one episode against the environment and emit the log records.

    Connects to the environment (a local Docker image when
    ``LOCAL_IMAGE_NAME`` is set, otherwise the server at ``ENV_BASE_URL``),
    resets it, then alternates model queries and environment steps for at
    most ``MAX_STEPS`` steps or until the environment reports ``done``.

    A ``[START]`` record, at least one ``[STEP]`` record when a failure
    occurs before any step, and a final ``[END]`` record are always printed,
    including when setup or a step raises.
    """
    env: MyEnv | None = None
    rewards: list[float] = []
    history: list[str] = []
    steps_taken = 0
    score = 0.0
    success = False
    started = False

    try:
        # Built inside the try so that a constructor failure still goes
        # through the START/STEP/END reporting below.
        client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)

        if LOCAL_IMAGE_NAME:
            env = await MyEnv.from_docker_image(LOCAL_IMAGE_NAME)
        else:
            env = MyEnv(base_url=ENV_BASE_URL)

        result = await env.reset()
        task_name = result.observation.info.get("task_name") or TASK_NAME
        log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
        started = True

        for step in range(1, MAX_STEPS + 1):
            if result.done:
                break

            # The model call is synchronous; run it in a worker thread so the
            # event loop stays responsive while waiting for the reply.
            # NOTE(review): presumably this also keeps the env connection
            # serviced during long completions — confirm against EnvClient.
            action_dict = await asyncio.to_thread(
                _get_model_action, client, result.observation, history
            )
            action = _to_code_action(action_dict)
            result = await env.step(action)

            reward = float(result.reward or 0.0)
            done = bool(result.done)
            action_str = action.action_type

            obs_meta = result.observation.metadata or {}
            error = obs_meta.get("last_action_error")
            if error is not None:
                # Keep the [STEP] record on a single line.
                error = str(error).replace("\n", " ")

            rewards.append(reward)
            steps_taken = step
            history.append(f"step={step} action={action_str} reward={reward:.2f}")
            log_step(step=step, action=action_str, reward=reward, done=done, error=error)

            if done:
                break

        score = _compute_score(result, rewards)
        success = score >= SUCCESS_SCORE_THRESHOLD

    except Exception as exc:
        # Top-level boundary: report the failure through the log records
        # instead of letting the script die without an [END] line.
        if not started:
            log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
            started = True
        msg = str(exc).replace("\n", " ")
        if steps_taken == 0:
            # Emit a placeholder step carrying the error so the output always
            # contains at least one [STEP] record.
            log_step(step=1, action="RUN_TESTS", reward=0.0, done=False, error=msg)
            steps_taken = 1
            rewards.append(0.0)
        score = 0.0
        success = False
    finally:
        if env is not None:
            try:
                await env.close()
            except Exception:
                # Best-effort cleanup; a close failure must not mask [END].
                pass
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
234
 
235
 
236
  if __name__ == "__main__":
237
+ asyncio.run(main())
openenv.yaml CHANGED
@@ -3,4 +3,4 @@ name: python_debugging_gym
3
  type: space
4
  runtime: fastapi
5
  app: server.app:app
6
- port: 8000
 
3
  type: space
4
  runtime: fastapi
5
  app: server.app:app
6
+ port: 7860
pre-val.sh CHANGED
@@ -135,17 +135,19 @@ fi
135
 
136
  if [ -f "$REPO_DIR/Dockerfile" ]; then
137
  DOCKER_CONTEXT="$REPO_DIR"
 
138
  elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
139
- DOCKER_CONTEXT="$REPO_DIR/server"
 
140
  else
141
  fail "No Dockerfile found in repo root or server/ directory"
142
  stop_at "Step 2"
143
  fi
144
 
145
- log " Found Dockerfile in $DOCKER_CONTEXT"
146
 
147
  BUILD_OK=false
148
- BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
149
 
150
  if [ "$BUILD_OK" = true ]; then
151
  pass "Docker build succeeded"
@@ -157,14 +159,20 @@ fi
157
 
158
  log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
159
 
160
- if ! command -v openenv &>/dev/null; then
 
 
 
 
 
161
  fail "openenv command not found"
162
  hint "Install it: pip install openenv-core"
 
163
  stop_at "Step 3"
164
  fi
165
 
166
  VALIDATE_OK=false
167
- VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
168
 
169
  if [ "$VALIDATE_OK" = true ]; then
170
  pass "openenv validate passed"
@@ -182,4 +190,4 @@ printf "${GREEN}${BOLD} Your submission is ready to submit.${NC}\n"
182
  printf "${BOLD}========================================${NC}\n"
183
  printf "\n"
184
 
185
- exit 0
 
135
 
136
  if [ -f "$REPO_DIR/Dockerfile" ]; then
137
  DOCKER_CONTEXT="$REPO_DIR"
138
+ DOCKERFILE_PATH="$REPO_DIR/Dockerfile"
139
  elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
140
+ DOCKER_CONTEXT="$REPO_DIR"
141
+ DOCKERFILE_PATH="$REPO_DIR/server/Dockerfile"
142
  else
143
  fail "No Dockerfile found in repo root or server/ directory"
144
  stop_at "Step 2"
145
  fi
146
 
147
+ log " Found Dockerfile at $DOCKERFILE_PATH"
148
 
149
  BUILD_OK=false
150
+ BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build -f "$DOCKERFILE_PATH" "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
151
 
152
  if [ "$BUILD_OK" = true ]; then
153
  pass "Docker build succeeded"
 
159
 
160
  log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
161
 
162
+ OPENENV_BIN=""
163
+ if command -v openenv &>/dev/null; then
164
+ OPENENV_BIN="openenv"
165
+ elif [ -x "$REPO_DIR/.venv/bin/openenv" ]; then
166
+ OPENENV_BIN="$REPO_DIR/.venv/bin/openenv"
167
+ else
168
  fail "openenv command not found"
169
  hint "Install it: pip install openenv-core"
170
+ hint "Or create a local venv in the repo with .venv/bin/openenv available."
171
  stop_at "Step 3"
172
  fi
173
 
174
  VALIDATE_OK=false
175
+ VALIDATE_OUTPUT=$(cd "$REPO_DIR" && "$OPENENV_BIN" validate 2>&1) && VALIDATE_OK=true
176
 
177
  if [ "$VALIDATE_OK" = true ]; then
178
  pass "openenv validate passed"
 
190
  printf "${BOLD}========================================${NC}\n"
191
  printf "\n"
192
 
193
+ exit 0