Souravdanyal committed on
Commit
511ea3a
·
1 Parent(s): d510c1d

fixing errors....

Browse files
inference.py CHANGED
@@ -11,7 +11,7 @@ Usage:
11
  STDOUT FORMAT (strictly required by evaluator):
12
  [START] task=<id> env=<benchmark> model=<model>
13
  [STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>
14
- [END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
15
  """
16
 
17
  import os, sys, json, time, argparse, requests, re
@@ -25,26 +25,30 @@ HF_TOKEN = os.environ.get("HF_TOKEN", "")
25
  ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
26
  BENCHMARK = "code-debug-env"
27
  MAX_STEPS = 5
 
28
 
29
  client = OpenAI(api_key=HF_TOKEN or "dummy", base_url=API_BASE_URL)
30
 
31
  # ── Logging — STRICT FORMAT ───────────────────────────────────────────────────
32
- def log_start(task_id, env, model):
33
  print(f"[START] task={task_id} env={env} model={model}", flush=True)
34
 
35
- def log_step(step, action, reward, done, error):
36
- print(f"[STEP] step={step} action={action} reward={reward:.2f} done={str(done).lower()} error={error or 'null'}", flush=True)
 
 
37
 
38
- def log_end(success, steps, rewards):
39
- print(f"[END] success={str(success).lower()} steps={steps} rewards={','.join(f'{r:.2f}' for r in rewards)}", flush=True)
 
40
 
41
  # ── Env client ────────────────────────────────────────────────────────────────
42
- def env_reset(url, difficulty):
43
  r = requests.post(f"{url}/reset", json={"difficulty": difficulty}, timeout=30)
44
  r.raise_for_status()
45
  return r.json()
46
 
47
- def env_step(url, fixed_code, explanation=None):
48
  payload = {"fixed_code": fixed_code}
49
  if explanation:
50
  payload["explanation"] = explanation
@@ -90,25 +94,27 @@ def _parse_llm_response(raw: str, buggy_code: str) -> dict:
90
 
91
  # Find JSON boundaries
92
  start = raw.find("{")
93
- end = raw.rfind("}") + 1
94
  if start >= 0 and end > start:
95
  raw = raw[start:end]
96
 
97
  # Try direct parse
98
  try:
99
  parsed = json.loads(raw)
100
- return {"fixed_code": parsed.get("fixed_code", ""), "explanation": parsed.get("explanation")}
 
 
 
101
  except json.JSONDecodeError:
102
  pass
103
 
104
- # Fix control characters (literal newlines inside JSON strings)
105
  try:
106
- fixed = re.sub(r'(?<!\\)\n', r'\\n', raw)
107
- fixed = re.sub(r'(?<!\\)\t', r'\\t', raw)
108
- fixed = re.sub(r'(?<!\\)\r', r'\\r', raw)
109
  parsed = json.loads(fixed)
110
- # Unescape the fixed_code back to real newlines
111
- code = parsed.get("fixed_code", "")
112
  if "\\n" in code:
113
  code = code.replace("\\n", "\n").replace("\\t", "\t")
114
  return {"fixed_code": code, "explanation": parsed.get("explanation")}
@@ -116,28 +122,42 @@ def _parse_llm_response(raw: str, buggy_code: str) -> dict:
116
  pass
117
 
118
  # Last resort: regex extraction
119
- code_match = re.search(r'"fixed_code"\s*:\s*"((?:[^"\\]|\\.)*)"\s*[,}]', raw, re.DOTALL)
120
- exp_match = re.search(r'"explanation"\s*:\s*"((?:[^"\\]|\\.)*)"\s*[,}]', raw, re.DOTALL)
121
 
122
  if code_match:
123
  code = code_match.group(1).replace("\\n", "\n").replace("\\t", "\t")
124
- exp = exp_match.group(1).replace("\\n", "\n") if exp_match else None
125
  return {"fixed_code": code, "explanation": exp}
126
 
127
- # Complete fallback
128
  return {"fixed_code": buggy_code, "explanation": None}
129
 
130
 
131
- def call_llm(buggy_code, instructions, difficulty, feedback=None, attempt=1, prev_code=None):
132
- content = f"Difficulty: {difficulty}\nInstructions: {instructions}\n\nBuggy code:\n```python\n{buggy_code}\n```\n"
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  if feedback and attempt > 1:
135
- content += f"\nPREVIOUS FIX FAILED. Feedback:\n{feedback}\n\nYour previous code:\n```python\n{prev_code or ''}\n```\n"
136
- content += "ANALYZE THE FEEDBACK CAREFULLY:\n"
137
- content += "- Look at Input/Expected/Got for each failing test\n"
138
- content += "- If Got shows wrong rotation direction: use lst[-k:] + lst[:-k] for RIGHT rotate\n"
139
- content += "- If TimeoutError: add visited=set([start]) before queue in graph code\n"
140
- content += "- Try a COMPLETELY DIFFERENT fix.\n"
 
 
 
141
 
142
  if difficulty == "hard":
143
  hint_match = re.search(r'[Mm]ention[:\s]+([^.]+?)(?:\.|$)', instructions)
@@ -151,7 +171,7 @@ def call_llm(buggy_code, instructions, difficulty, feedback=None, attempt=1, pre
151
  model=MODEL_NAME,
152
  messages=[
153
  {"role": "system", "content": SYSTEM_PROMPT},
154
- {"role": "user", "content": content}
155
  ],
156
  max_tokens=1500,
157
  temperature=0.1 if attempt == 1 else 0.4,
@@ -164,23 +184,27 @@ def call_llm(buggy_code, instructions, difficulty, feedback=None, attempt=1, pre
164
 
165
 
166
  # ── Episode ───────────────────────────────────────────────────────────────────
167
- def run_episode(env_url, difficulty):
168
- data = env_reset(env_url, difficulty)
169
- obs = data["observation"]
 
170
  task_id = obs["task_id"]
171
  buggy_code = obs["buggy_code"]
172
  instructions = obs["instructions"]
173
 
174
  log_start(task_id, BENCHMARK, MODEL_NAME)
175
 
176
- rewards, steps_taken, success = [], 0, False
177
- last_feedback, last_code = None, None
 
 
 
178
 
179
  for attempt in range(1, MAX_STEPS + 1):
180
  steps_taken = attempt
181
- action = call_llm(buggy_code, instructions, difficulty, last_feedback, attempt, last_code)
182
- code = action["fixed_code"]
183
- last_code = code
184
 
185
  if not code or not code.strip():
186
  log_step(attempt, "empty_submission", 0.0, False, "empty_code")
@@ -194,9 +218,9 @@ def run_episode(env_url, difficulty):
194
  rewards.append(0.0)
195
  continue
196
 
197
- reward = result.get("reward", 0.0)
198
- done = result.get("done", False)
199
- obs_r = result.get("observation", {})
200
  last_feedback = obs_r.get("feedback", "")
201
 
202
  log_step(attempt, f"fix_{difficulty}_attempt{attempt}", reward, done, None)
@@ -207,27 +231,35 @@ def run_episode(env_url, difficulty):
207
  if done:
208
  break
209
 
210
- log_end(success, steps_taken, rewards)
 
 
 
 
 
211
  return success, steps_taken, rewards
212
 
213
 
214
  # ── Main ──────────────────────────────────────────────────────────────────────
215
  def main():
216
  parser = argparse.ArgumentParser(description="Code Debug Environment Baseline Agent")
217
- parser.add_argument("--url", default=ENV_URL)
218
- parser.add_argument("--difficulty", default=None, choices=["easy","medium","hard","all"])
219
  args = parser.parse_args()
220
- url = args.url.rstrip("/")
221
 
 
222
  try:
223
  requests.get(f"{url}/health", timeout=10).raise_for_status()
224
- print(f"# Environment healthy at {url}", flush=True)
225
  except Exception as e:
226
  print(f"# Health check failed: {e}", file=sys.stderr)
227
  sys.exit(1)
228
 
229
- diffs = ["easy","medium","hard"] if args.difficulty in (None,"all") else [args.difficulty]
230
- all_rewards, successes = [], []
 
 
231
 
232
  for d in diffs:
233
  ok, _, rewards = run_episode(url, d)
@@ -235,8 +267,12 @@ def main():
235
  successes.append(ok)
236
  time.sleep(0.5)
237
 
238
- avg = round(sum(all_rewards)/len(all_rewards), 3) if all_rewards else 0.0
239
- print(f"# SUMMARY: {sum(successes)}/{len(diffs)} tasks solved | avg_reward={avg}", flush=True)
 
 
 
 
240
 
241
  if __name__ == "__main__":
242
- main()
 
11
  STDOUT FORMAT (strictly required by evaluator):
12
  [START] task=<id> env=<benchmark> model=<model>
13
  [STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>
14
+ [END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...,rn>
15
  """
16
 
17
  import os, sys, json, time, argparse, requests, re
 
25
  ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
26
  BENCHMARK = "code-debug-env"
27
  MAX_STEPS = 5
28
+ SUCCESS_SCORE_THRESHOLD = 0.5
29
 
30
  client = OpenAI(api_key=HF_TOKEN or "dummy", base_url=API_BASE_URL)
31
 
32
  # ── Logging — STRICT FORMAT ───────────────────────────────────────────────────
33
+ def log_start(task_id: str, env: str, model: str) -> None:
34
  print(f"[START] task={task_id} env={env} model={model}", flush=True)
35
 
36
+ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
37
+ error_val = error if error else "null"
38
+ done_val = str(done).lower()
39
+ print(f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}", flush=True)
40
 
41
+ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
42
+ rewards_str = ",".join(f"{r:.2f}" for r in rewards)
43
+ print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
44
 
45
  # ── Env client ────────────────────────────────────────────────────────────────
46
+ def env_reset(url: str, difficulty: str) -> dict:
47
  r = requests.post(f"{url}/reset", json={"difficulty": difficulty}, timeout=30)
48
  r.raise_for_status()
49
  return r.json()
50
 
51
+ def env_step(url: str, fixed_code: str, explanation: Optional[str] = None) -> dict:
52
  payload = {"fixed_code": fixed_code}
53
  if explanation:
54
  payload["explanation"] = explanation
 
94
 
95
  # Find JSON boundaries
96
  start = raw.find("{")
97
+ end = raw.rfind("}") + 1
98
  if start >= 0 and end > start:
99
  raw = raw[start:end]
100
 
101
  # Try direct parse
102
  try:
103
  parsed = json.loads(raw)
104
+ return {
105
+ "fixed_code": parsed.get("fixed_code", ""),
106
+ "explanation": parsed.get("explanation"),
107
+ }
108
  except json.JSONDecodeError:
109
  pass
110
 
111
+ # Fix literal control characters inside JSON strings
112
  try:
113
+ fixed = re.sub(r'(?<!\\)\n', r'\\n', raw)
114
+ fixed = re.sub(r'(?<!\\)\t', r'\\t', fixed)
115
+ fixed = re.sub(r'(?<!\\)\r', r'\\r', fixed)
116
  parsed = json.loads(fixed)
117
+ code = parsed.get("fixed_code", "")
 
118
  if "\\n" in code:
119
  code = code.replace("\\n", "\n").replace("\\t", "\t")
120
  return {"fixed_code": code, "explanation": parsed.get("explanation")}
 
122
  pass
123
 
124
  # Last resort: regex extraction
125
+ code_match = re.search(r'"fixed_code"\s*:\s*"((?:[^"\\]|\\.)*)"', raw, re.DOTALL)
126
+ exp_match = re.search(r'"explanation"\s*:\s*"((?:[^"\\]|\\.)*)"', raw, re.DOTALL)
127
 
128
  if code_match:
129
  code = code_match.group(1).replace("\\n", "\n").replace("\\t", "\t")
130
+ exp = exp_match.group(1).replace("\\n", "\n") if exp_match else None
131
  return {"fixed_code": code, "explanation": exp}
132
 
133
+ # Complete fallback — return buggy code unchanged
134
  return {"fixed_code": buggy_code, "explanation": None}
135
 
136
 
137
+ def call_llm(
138
+ buggy_code: str,
139
+ instructions: str,
140
+ difficulty: str,
141
+ feedback: Optional[str] = None,
142
+ attempt: int = 1,
143
+ prev_code: Optional[str] = None,
144
+ ) -> dict:
145
+ content = (
146
+ f"Difficulty: {difficulty}\n"
147
+ f"Instructions: {instructions}\n\n"
148
+ f"Buggy code:\n```python\n{buggy_code}\n```\n"
149
+ )
150
 
151
  if feedback and attempt > 1:
152
+ content += (
153
+ f"\nPREVIOUS FIX FAILED. Feedback:\n{feedback}\n\n"
154
+ f"Your previous code:\n```python\n{prev_code or ''}\n```\n"
155
+ "ANALYZE THE FEEDBACK CAREFULLY:\n"
156
+ "- Look at Input/Expected/Got for each failing test\n"
157
+ "- If Got shows wrong rotation direction: use lst[-k:] + lst[:-k] for RIGHT rotate\n"
158
+ "- If TimeoutError: add visited=set([start]) before queue in graph code\n"
159
+ "- Try a COMPLETELY DIFFERENT fix.\n"
160
+ )
161
 
162
  if difficulty == "hard":
163
  hint_match = re.search(r'[Mm]ention[:\s]+([^.]+?)(?:\.|$)', instructions)
 
171
  model=MODEL_NAME,
172
  messages=[
173
  {"role": "system", "content": SYSTEM_PROMPT},
174
+ {"role": "user", "content": content},
175
  ],
176
  max_tokens=1500,
177
  temperature=0.1 if attempt == 1 else 0.4,
 
184
 
185
 
186
  # ── Episode ───────────────────────────────────────────────────────────────────
187
+ def run_episode(env_url: str, difficulty: str) -> tuple:
188
+ """Run one full episode. Returns (success, steps_taken, rewards)."""
189
+ data = env_reset(env_url, difficulty)
190
+ obs = data["observation"]
191
  task_id = obs["task_id"]
192
  buggy_code = obs["buggy_code"]
193
  instructions = obs["instructions"]
194
 
195
  log_start(task_id, BENCHMARK, MODEL_NAME)
196
 
197
+ rewards: List[float] = []
198
+ steps_taken = 0
199
+ success = False
200
+ last_feedback = None
201
+ last_code = None
202
 
203
  for attempt in range(1, MAX_STEPS + 1):
204
  steps_taken = attempt
205
+ action = call_llm(buggy_code, instructions, difficulty, last_feedback, attempt, last_code)
206
+ code = action["fixed_code"]
207
+ last_code = code
208
 
209
  if not code or not code.strip():
210
  log_step(attempt, "empty_submission", 0.0, False, "empty_code")
 
218
  rewards.append(0.0)
219
  continue
220
 
221
+ reward = result.get("reward", 0.0)
222
+ done = result.get("done", False)
223
+ obs_r = result.get("observation", {})
224
  last_feedback = obs_r.get("feedback", "")
225
 
226
  log_step(attempt, f"fix_{difficulty}_attempt{attempt}", reward, done, None)
 
231
  if done:
232
  break
233
 
234
+ # Compute normalised score for this episode (best reward achieved)
235
+ score = max(rewards) if rewards else 0.0
236
+ score = min(max(score, 0.0), 1.0)
237
+ success = success or (score >= SUCCESS_SCORE_THRESHOLD)
238
+
239
+ log_end(success, steps_taken, score, rewards)
240
  return success, steps_taken, rewards
241
 
242
 
243
  # ── Main ──────────────────────────────────────────────────────────────────────
244
  def main():
245
  parser = argparse.ArgumentParser(description="Code Debug Environment Baseline Agent")
246
+ parser.add_argument("--url", default=ENV_URL)
247
+ parser.add_argument("--difficulty", default=None, choices=["easy", "medium", "hard", "all"])
248
  args = parser.parse_args()
249
+ url = args.url.rstrip("/")
250
 
251
+ # Health check
252
  try:
253
  requests.get(f"{url}/health", timeout=10).raise_for_status()
254
+ print(f"# Environment healthy at {url}", file=sys.stderr, flush=True)
255
  except Exception as e:
256
  print(f"# Health check failed: {e}", file=sys.stderr)
257
  sys.exit(1)
258
 
259
+ diffs = ["easy", "medium", "hard"] if args.difficulty in (None, "all") else [args.difficulty]
260
+
261
+ all_rewards: List[float] = []
262
+ successes: List[bool] = []
263
 
264
  for d in diffs:
265
  ok, _, rewards = run_episode(url, d)
 
267
  successes.append(ok)
268
  time.sleep(0.5)
269
 
270
+ avg = round(sum(all_rewards) / len(all_rewards), 3) if all_rewards else 0.0
271
+ print(
272
+ f"# SUMMARY: {sum(successes)}/{len(diffs)} tasks solved | avg_reward={avg}",
273
+ file=sys.stderr, flush=True,
274
+ )
275
+
276
 
277
  if __name__ == "__main__":
278
+ main()
server/__pycache__/app.cpython-310.pyc CHANGED
Binary files a/server/__pycache__/app.cpython-310.pyc and b/server/__pycache__/app.cpython-310.pyc differ
 
server/__pycache__/environment.cpython-310.pyc CHANGED
Binary files a/server/__pycache__/environment.cpython-310.pyc and b/server/__pycache__/environment.cpython-310.pyc differ
 
server/tasks/__pycache__/task_easy.cpython-310.pyc CHANGED
Binary files a/server/tasks/__pycache__/task_easy.cpython-310.pyc and b/server/tasks/__pycache__/task_easy.cpython-310.pyc differ
 
server/tasks/__pycache__/task_hard.cpython-310.pyc CHANGED
Binary files a/server/tasks/__pycache__/task_hard.cpython-310.pyc and b/server/tasks/__pycache__/task_hard.cpython-310.pyc differ
 
server/tasks/__pycache__/task_medium.cpython-310.pyc CHANGED
Binary files a/server/tasks/__pycache__/task_medium.cpython-310.pyc and b/server/tasks/__pycache__/task_medium.cpython-310.pyc differ