databoysu committed on
Commit
20ef9ad
·
1 Parent(s): 2e11c6a

TraceFix-RL v1 deploy

Browse files
README.md CHANGED
@@ -58,6 +58,7 @@ Tasks are organized in `tasks.py` into three tiers.
58
  Focus: data-structure invariants, eviction/promotion logic, bracket mapping, and interval merging edge behavior.
59
 
60
  Every task follows the same schema:
 
61
  - `name`, `description`, `difficulty`, `bug_type`
62
  - `code`: buggy implementation (line list)
63
  - `solution`: reference implementation
@@ -83,6 +84,7 @@ uv run --project . server
83
  ```
84
 
85
  Server endpoints:
 
86
  - `POST /reset`
87
  - `POST /step`
88
  - `GET /health`
@@ -100,6 +102,26 @@ Server endpoints:
100
  `[START]`, one or more `[STEP]`, then `[END]`.
101
  - Final score is clamped to `[0, 1]`.
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  ## Docker + Deployment
104
 
105
  Build locally:
 
58
  Focus: data-structure invariants, eviction/promotion logic, bracket mapping, and interval merging edge behavior.
59
 
60
  Every task follows the same schema:
61
+
62
  - `name`, `description`, `difficulty`, `bug_type`
63
  - `code`: buggy implementation (line list)
64
  - `solution`: reference implementation
 
84
  ```
85
 
86
  Server endpoints:
87
+
88
  - `POST /reset`
89
  - `POST /step`
90
  - `GET /health`
 
102
  `[START]`, one or more `[STEP]`, then `[END]`.
103
  - Final score is clamped to `[0, 1]`.
104
 
105
+ ## Inference Flags
106
+
107
+ `inference.py` supports:
108
+
109
+ - `--easy`: run episode using easy-tier curriculum sampling.
110
+ - `--medium`: run episode using medium-tier curriculum sampling.
111
+ - `--hard`: run episode using hard-tier curriculum sampling.
112
+ - `--debug`: print raw model response snippets for troubleshooting.
113
+
114
+ Example:
115
+
116
+ ```bash
117
+ python inference.py --medium --debug
118
+ ```
119
+
120
+ The script also enforces a model-thinking/output cap:
121
+
122
+ - The `THINKING_TOKEN_LIMIT` environment variable (default `512`) is used as `max_tokens` in model calls.
123
+ - `thought` content is hard-truncated to `THINKING_TOKEN_LIMIT * 4` characters (a rough 4-characters-per-token estimate) before action validation to prevent oversized payloads.
124
+
125
  ## Docker + Deployment
126
 
127
  Build locally:
__pycache__/client.cpython-312.pyc CHANGED
Binary files a/__pycache__/client.cpython-312.pyc and b/__pycache__/client.cpython-312.pyc differ
 
__pycache__/context.cpython-312.pyc CHANGED
Binary files a/__pycache__/context.cpython-312.pyc and b/__pycache__/context.cpython-312.pyc differ
 
__pycache__/environment.cpython-312.pyc CHANGED
Binary files a/__pycache__/environment.cpython-312.pyc and b/__pycache__/environment.cpython-312.pyc differ
 
__pycache__/models.cpython-312.pyc CHANGED
Binary files a/__pycache__/models.cpython-312.pyc and b/__pycache__/models.cpython-312.pyc differ
 
__pycache__/sandbox.cpython-312.pyc CHANGED
Binary files a/__pycache__/sandbox.cpython-312.pyc and b/__pycache__/sandbox.cpython-312.pyc differ
 
inference.py CHANGED
@@ -15,17 +15,23 @@ This script prints exactly:
15
 
16
  from __future__ import annotations
17
 
 
18
  import asyncio
19
  import json
20
  import os
21
  import re
22
- from typing import Any
 
 
23
 
24
  from openai import OpenAI
25
 
26
  try:
27
  from tracefix_rl import CodeAction, TraceFixRLEnv
28
- except ImportError:
 
 
 
29
  from client import TraceFixRLEnv
30
  from models import CodeAction
31
 
@@ -40,6 +46,9 @@ TASK_NAME = os.getenv("TASK_NAME", "tracefix_rl")
40
  BENCHMARK = os.getenv("BENCHMARK", "tracefix_rl")
41
  MAX_STEPS = int(os.getenv("MAX_STEPS", "50"))
42
  SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.99"))
 
 
 
43
 
44
  SYSTEM_PROMPT = (
45
  "You are controlling a Python debugging RL environment. "
@@ -54,7 +63,7 @@ def log_start(task: str, env: str, model: str) -> None:
54
  print(f"[START] task={task} env={env} model={model}", flush=True)
55
 
56
 
57
- def log_step(step: int, action: str, reward: float, done: bool, error: str | None) -> None:
58
  error_value = error if error else "null"
59
  print(
60
  f"[STEP] step={step} action={action} reward={reward:.2f} done={str(done).lower()} error={error_value}",
@@ -106,7 +115,9 @@ def _build_observation_text(observation: Any) -> str:
106
  )
107
 
108
 
109
- def _get_model_action(client: OpenAI, observation: Any, history: list[str]) -> dict[str, Any]:
 
 
110
  obs_text = _build_observation_text(observation)
111
  user_prompt = (
112
  "Pick the single best next action and return only JSON.\n\n"
@@ -121,10 +132,12 @@ def _get_model_action(client: OpenAI, observation: Any, history: list[str]) -> d
121
  {"role": "user", "content": user_prompt},
122
  ],
123
  temperature=0.0,
124
- max_tokens=300,
125
  stream=False,
126
  )
127
  response_text = (completion.choices[0].message.content or "").strip()
 
 
128
  action = _extract_json(response_text)
129
  except Exception:
130
  action = {"action_type": "RUN_TESTS"}
@@ -143,9 +156,13 @@ def _get_model_action(client: OpenAI, observation: Any, history: list[str]) -> d
143
 
144
 
145
  def _to_code_action(action_dict: dict[str, Any]) -> CodeAction:
 
 
 
 
146
  payload = {
147
  "action_type": action_dict.get("action_type", "RUN_TESTS"),
148
- "thought": action_dict.get("thought"),
149
  "start_line": action_dict.get("start_line"),
150
  "end_line": action_dict.get("end_line"),
151
  "new_code_block": action_dict.get("new_code_block"),
@@ -167,10 +184,10 @@ def _compute_score(step_result: Any, rewards: list[float]) -> float:
167
  return max(0.0, min(1.0, float(raw)))
168
 
169
 
170
- async def main() -> None:
171
  client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
172
 
173
- env: TraceFixRLEnv | None = None
174
  rewards: list[float] = []
175
  history: list[str] = []
176
  steps_taken = 0
@@ -184,7 +201,11 @@ async def main() -> None:
184
  else:
185
  env = TraceFixRLEnv(base_url=ENV_BASE_URL)
186
 
187
- result = await env.reset()
 
 
 
 
188
  task_name = result.observation.info.get("task_name") or TASK_NAME
189
  log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
190
  started = True
@@ -193,7 +214,7 @@ async def main() -> None:
193
  if result.done:
194
  break
195
 
196
- action_dict = _get_model_action(client, result.observation, history)
197
  action = _to_code_action(action_dict)
198
  result = await env.step(action)
199
 
@@ -238,4 +259,20 @@ async def main() -> None:
238
 
239
 
240
  if __name__ == "__main__":
241
- asyncio.run(main())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  from __future__ import annotations
17
 
18
+ import argparse
19
  import asyncio
20
  import json
21
  import os
22
  import re
23
+ import sys
24
+ from pathlib import Path
25
+ from typing import Any, Optional
26
 
27
  from openai import OpenAI
28
 
29
  try:
30
  from tracefix_rl import CodeAction, TraceFixRLEnv
31
+ except Exception:
32
+ ROOT_DIR = Path(__file__).resolve().parent
33
+ if str(ROOT_DIR) not in sys.path:
34
+ sys.path.insert(0, str(ROOT_DIR))
35
  from client import TraceFixRLEnv
36
  from models import CodeAction
37
 
 
46
  BENCHMARK = os.getenv("BENCHMARK", "tracefix_rl")
47
  MAX_STEPS = int(os.getenv("MAX_STEPS", "50"))
48
  SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.99"))
49
+ THINKING_TOKEN_LIMIT = int(os.getenv("THINKING_TOKEN_LIMIT", "512"))
50
+ # Rough character budget (~4 characters per token) used to hard-truncate `thought` before sending it to the server.
51
+ THINKING_CHAR_LIMIT = THINKING_TOKEN_LIMIT * 4
52
 
53
  SYSTEM_PROMPT = (
54
  "You are controlling a Python debugging RL environment. "
 
63
  print(f"[START] task={task} env={env} model={model}", flush=True)
64
 
65
 
66
+ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
67
  error_value = error if error else "null"
68
  print(
69
  f"[STEP] step={step} action={action} reward={reward:.2f} done={str(done).lower()} error={error_value}",
 
115
  )
116
 
117
 
118
+ def _get_model_action(
119
+ client: OpenAI, observation: Any, history: list[str], debug: bool = False
120
+ ) -> dict[str, Any]:
121
  obs_text = _build_observation_text(observation)
122
  user_prompt = (
123
  "Pick the single best next action and return only JSON.\n\n"
 
132
  {"role": "user", "content": user_prompt},
133
  ],
134
  temperature=0.0,
135
+ max_tokens=THINKING_TOKEN_LIMIT,
136
  stream=False,
137
  )
138
  response_text = (completion.choices[0].message.content or "").strip()
139
+ if debug:
140
+ print(f"[DEBUG] raw_model_response={response_text[:500]}", flush=True)
141
  action = _extract_json(response_text)
142
  except Exception:
143
  action = {"action_type": "RUN_TESTS"}
 
156
 
157
 
158
  def _to_code_action(action_dict: dict[str, Any]) -> CodeAction:
159
+ thought = action_dict.get("thought")
160
+ if isinstance(thought, str):
161
+ thought = thought[:THINKING_CHAR_LIMIT]
162
+
163
  payload = {
164
  "action_type": action_dict.get("action_type", "RUN_TESTS"),
165
+ "thought": thought,
166
  "start_line": action_dict.get("start_line"),
167
  "end_line": action_dict.get("end_line"),
168
  "new_code_block": action_dict.get("new_code_block"),
 
184
  return max(0.0, min(1.0, float(raw)))
185
 
186
 
187
+ async def run(difficulty: Optional[str] = None, debug: bool = False) -> None:
188
  client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
189
 
190
+ env: Optional[TraceFixRLEnv] = None
191
  rewards: list[float] = []
192
  history: list[str] = []
193
  steps_taken = 0
 
201
  else:
202
  env = TraceFixRLEnv(base_url=ENV_BASE_URL)
203
 
204
+ if difficulty:
205
+ reset_kwargs = {"difficulty": difficulty}
206
+ result = await env.reset(**reset_kwargs)
207
+ else:
208
+ result = await env.reset()
209
  task_name = result.observation.info.get("task_name") or TASK_NAME
210
  log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
211
  started = True
 
214
  if result.done:
215
  break
216
 
217
+ action_dict = _get_model_action(client, result.observation, history, debug=debug)
218
  action = _to_code_action(action_dict)
219
  result = await env.step(action)
220
 
 
259
 
260
 
261
  if __name__ == "__main__":
262
+ parser = argparse.ArgumentParser(description="Run TraceFix-RL inference baseline.")
263
+ group = parser.add_mutually_exclusive_group()
264
+ group.add_argument("--easy", action="store_true", help="Run on easy curriculum tier.")
265
+ group.add_argument("--medium", action="store_true", help="Run on medium curriculum tier.")
266
+ group.add_argument("--hard", action="store_true", help="Run on hard curriculum tier.")
267
+ parser.add_argument("--debug", action="store_true", help="Print debug model output snippets.")
268
+ args = parser.parse_args()
269
+
270
+ difficulty: Optional[str] = None
271
+ if args.easy:
272
+ difficulty = "easy"
273
+ elif args.medium:
274
+ difficulty = "medium"
275
+ elif args.hard:
276
+ difficulty = "hard"
277
+
278
+ asyncio.run(run(difficulty=difficulty, debug=args.debug))
server/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/server/__pycache__/__init__.cpython-312.pyc and b/server/__pycache__/__init__.cpython-312.pyc differ
 
server/tracefix_rl_environment.py CHANGED
@@ -20,7 +20,14 @@ class TraceFixRLEnvironment(Environment):
20
  self._gym = TraceFixRLGym()
21
  self._state = State(episode_id="", step_count=0)
22
 
23
- def reset(self) -> CodeObservation:
 
 
 
 
 
 
 
24
  obs, system_prompt = self._gym.reset()
25
  self._state = State(
26
  episode_id=obs.info.get("episode_id", ""),
 
20
  self._gym = TraceFixRLGym()
21
  self._state = State(episode_id="", step_count=0)
22
 
23
+ def reset(self, difficulty: str | None = None) -> CodeObservation:
24
+ if difficulty == "easy":
25
+ self._gym.training_step = 1
26
+ elif difficulty == "medium":
27
+ self._gym.training_step = 2000
28
+ elif difficulty == "hard":
29
+ self._gym.training_step = 6000
30
+
31
  obs, system_prompt = self._gym.reset()
32
  self._state = State(
33
  episode_id=obs.info.get("episode_id", ""),