Souravdanyal committed on
Commit
66d8c67
·
1 Parent(s): 2efa047
inference.py CHANGED
@@ -11,7 +11,7 @@ Usage:
11
  STDOUT FORMAT (strictly required by evaluator - plaintext):
12
  [START] task=<id> env=<benchmark> model=<model>
13
  [STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>
14
- [END] success=<true|false> steps=<n> score=<0.000> rewards=[<r1>,<r2>,...]
15
  """
16
 
17
  import os, sys, json, time, argparse, requests, re
@@ -31,6 +31,9 @@ API_BASE_URL = os.getenv("API_BASE_URL", "https://api.groq.com/openai/v1")
31
  MODEL_NAME = os.getenv("MODEL_NAME", "llama-3.1-8b-instant")
32
  HF_TOKEN = os.getenv("HF_TOKEN")
33
  HF_TOKEN_SOURCE = "HF_TOKEN"
 
 
 
34
  if not HF_TOKEN:
35
  HF_TOKEN = os.getenv("hf_token")
36
  HF_TOKEN_SOURCE = "hf_token"
@@ -55,12 +58,12 @@ def _normalize_token(value: str) -> str:
55
  def _format_error(error: Optional[str]) -> str:
56
  if error is None:
57
  return "null"
58
- cleaned = _normalize_token(error)
59
- return cleaned if cleaned else "null"
60
 
61
 
62
  def _format_rewards(rewards: List[float]) -> str:
63
- return "[" + ",".join(f"{round(r, 2):.2f}" for r in rewards) + "]"
64
 
65
 
66
  def log_start(task_id: str, env: str, model: str) -> None:
@@ -78,7 +81,7 @@ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[
78
 
79
  def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
80
  print(
81
- f"[END] success={_format_bool(success)} steps={steps} score={round(score, 3):.3f} "
82
  f"rewards={_format_rewards(rewards)}",
83
  flush=True,
84
  )
@@ -241,43 +244,43 @@ def run_episode(env_url: str, difficulty: str) -> tuple:
241
  last_feedback = None
242
  last_code = None
243
 
244
- for attempt in range(1, MAX_STEPS + 1):
245
- steps_taken = attempt
246
- action = call_llm(buggy_code, instructions, difficulty, last_feedback, attempt, last_code)
247
- code = action["fixed_code"]
248
- last_code = code
249
-
250
- if not code or not code.strip():
251
- log_step(attempt, "empty_submission", 0.0, False, "empty_code")
252
- rewards.append(0.0)
253
- continue
254
-
255
- try:
256
- result = env_step(env_url, code, action.get("explanation"))
257
- except Exception as e:
258
- log_step(attempt, "step_failed", 0.0, False, str(e)[:60])
259
- rewards.append(0.0)
260
- continue
261
-
262
- reward = result.get("reward", 0.0)
263
- done = result.get("done", False)
264
- obs_r = result.get("observation", {})
265
- last_feedback = obs_r.get("feedback", "")
266
-
267
- log_step(attempt, f"fix_{difficulty}_attempt{attempt}", reward, done, None)
268
- rewards.append(reward)
269
-
270
- if reward >= 1.0:
271
- success = True
272
- if done:
273
- break
274
-
275
- # Compute normalised score for this episode (best reward achieved)
276
- score = max(rewards) if rewards else 0.0
277
- score = min(max(score, 0.0), 1.0)
278
- success = success or (score >= SUCCESS_SCORE_THRESHOLD)
279
-
280
- log_end(success, steps_taken, score, rewards)
281
  return success, steps_taken, rewards
282
 
283
 
@@ -291,7 +294,7 @@ def main():
291
 
292
  if not HF_TOKEN:
293
  print(
294
- "# Missing HF_TOKEN (or lowercase hf_token).",
295
  file=sys.stderr,
296
  flush=True,
297
  )
 
11
  STDOUT FORMAT (strictly required by evaluator - plaintext):
12
  [START] task=<id> env=<benchmark> model=<model>
13
  [STEP] step=<n> action=<str> reward=<0.00> done=<true|false> error=<msg|null>
14
+ [END] success=<true|false> steps=<n> score=<0.00> rewards=<r1,r2,...>
15
  """
16
 
17
  import os, sys, json, time, argparse, requests, re
 
31
  MODEL_NAME = os.getenv("MODEL_NAME", "llama-3.1-8b-instant")
32
  HF_TOKEN = os.getenv("HF_TOKEN")
33
  HF_TOKEN_SOURCE = "HF_TOKEN"
34
+ if not HF_TOKEN:
35
+ HF_TOKEN = os.getenv("API_KEY")
36
+ HF_TOKEN_SOURCE = "API_KEY"
37
  if not HF_TOKEN:
38
  HF_TOKEN = os.getenv("hf_token")
39
  HF_TOKEN_SOURCE = "hf_token"
 
58
  def _format_error(error: Optional[str]) -> str:
59
  if error is None:
60
  return "null"
61
+ text = str(error).replace("\r", "\\r").replace("\n", "\\n")
62
+ return text if text else "null"
63
 
64
 
65
  def _format_rewards(rewards: List[float]) -> str:
66
+ return ",".join(f"{round(r, 2):.2f}" for r in rewards)
67
 
68
 
69
  def log_start(task_id: str, env: str, model: str) -> None:
 
81
 
82
  def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
83
  print(
84
+ f"[END] success={_format_bool(success)} steps={steps} score={round(score, 2):.2f} "
85
  f"rewards={_format_rewards(rewards)}",
86
  flush=True,
87
  )
 
244
  last_feedback = None
245
  last_code = None
246
 
247
+ try:
248
+ for attempt in range(1, MAX_STEPS + 1):
249
+ steps_taken = attempt
250
+ action = call_llm(buggy_code, instructions, difficulty, last_feedback, attempt, last_code)
251
+ code = action.get("fixed_code") or ""
252
+ last_code = code
253
+
254
+ reward = 0.0
255
+ done = False
256
+ step_error: Optional[str] = None
257
+ try:
258
+ result = env_step(env_url, code, action.get("explanation"))
259
+ reward = result.get("reward", 0.0)
260
+ done = result.get("done", False)
261
+ obs_r = result.get("observation", {})
262
+ if isinstance(obs_r, dict):
263
+ last_feedback = obs_r.get("feedback", "")
264
+ step_error = obs_r.get("last_action_error")
265
+ if step_error is None:
266
+ step_error = obs_r.get("error")
267
+ except Exception as e:
268
+ step_error = str(e)
269
+
270
+ log_step(attempt, f"fix_{difficulty}_attempt{attempt}", reward, done, step_error)
271
+ rewards.append(reward)
272
+
273
+ if reward >= 1.0:
274
+ success = True
275
+ if done:
276
+ break
277
+ finally:
278
+ # Compute normalized score for this episode and always emit [END].
279
+ score = max(rewards) if rewards else 0.0
280
+ score = min(max(score, 0.0), 1.0)
281
+ success = success or (score >= SUCCESS_SCORE_THRESHOLD)
282
+ log_end(success, steps_taken, score, rewards)
283
+
284
  return success, steps_taken, rewards
285
 
286
 
 
294
 
295
  if not HF_TOKEN:
296
  print(
297
+ "# Missing API key. Set HF_TOKEN (or API_KEY / lowercase hf_token).",
298
  file=sys.stderr,
299
  flush=True,
300
  )
validator/__pycache__/pre_submit_check.cpython-39.pyc CHANGED
Binary files a/validator/__pycache__/pre_submit_check.cpython-39.pyc and b/validator/__pycache__/pre_submit_check.cpython-39.pyc differ
 
validator/pre_submit_check.py CHANGED
@@ -157,11 +157,13 @@ def run_checks(base_url: str):
157
  has_step = "[STEP] step=" in content
158
  has_end = "[END] success=" in content
159
  avoids_json_logs = "print(json.dumps(log_entry)" not in content
 
160
  check("inference.py emits [START] logs", has_start)
161
  check("inference.py emits [STEP] logs", has_step)
162
  check("inference.py emits [END] logs", has_end)
163
  check("inference.py avoids JSON log dict dumps", avoids_json_logs)
164
- all_passed &= has_start and has_step and has_end and avoids_json_logs
 
165
  except Exception as e:
166
  check("inference.py log format", False, str(e))
167
  all_passed = False
 
157
  has_step = "[STEP] step=" in content
158
  has_end = "[END] success=" in content
159
  avoids_json_logs = "print(json.dumps(log_entry)" not in content
160
+ rewards_csv = "rewards=[" not in content
161
  check("inference.py emits [START] logs", has_start)
162
  check("inference.py emits [STEP] logs", has_step)
163
  check("inference.py emits [END] logs", has_end)
164
  check("inference.py avoids JSON log dict dumps", avoids_json_logs)
165
+ check("inference.py emits CSV rewards in [END]", rewards_csv)
166
+ all_passed &= has_start and has_step and has_end and avoids_json_logs and rewards_csv
167
  except Exception as e:
168
  check("inference.py log format", False, str(e))
169
  all_passed = False