PRANAV05092003 committed on
Commit
96c1a25
·
1 Parent(s): 7f6de27

Fixed structured logging format for validator

Browse files
Files changed (1) hide show
  1. inference.py +8 -8
inference.py CHANGED
@@ -213,7 +213,7 @@ def run_episode(client: Optional[OpenAI], task_id: str, episode_num: int) -> flo
213
  state = get_state()
214
 
215
  # STRICT logging format required by evaluator.
216
- print(f"START {task_id}", flush=True)
217
 
218
  cumulative_reward = 0.0
219
 
@@ -228,7 +228,7 @@ def run_episode(client: Optional[OpenAI], task_id: str, episode_num: int) -> flo
228
  cumulative_reward += raw_reward
229
 
230
  # STRICT logging format required by evaluator.
231
- print(f"STEP {int(action)}", flush=True)
232
 
233
  if result.get("done") or result.get("terminated") or result.get("truncated"):
234
  break
@@ -237,7 +237,7 @@ def run_episode(client: Optional[OpenAI], task_id: str, episode_num: int) -> flo
237
  task_score = grade(task_id, final_state.get("current_code", ""))
238
 
239
  # STRICT logging format required by evaluator.
240
- print(f"END {task_score:.4f}", flush=True)
241
 
242
  return task_score
243
 
@@ -326,11 +326,11 @@ def run_all_tasks() -> Dict[str, float]:
326
  for _ in range(5):
327
  state = get_state()
328
  action = _choose_action_name(str(state.get("current_code", "")), task_id)
329
- print(f"STEP {int(action)}", flush=True)
330
  step_env(action)
331
  final_state = get_state()
332
  score = float(grade(task_id, final_state.get("current_code", "")))
333
- print(f"END {float(score):.4f}", flush=True)
334
  scores.append(score)
335
  if task_id == "rename_variables":
336
  results["easy"] = score
@@ -345,18 +345,18 @@ def run_all_tasks() -> Dict[str, float]:
345
  else:
346
  # Local in-process execution (fast + no network recursion).
347
  for task_id in task_plan:
348
- print(f"START {task_id}", flush=True)
349
  env.reset(seed=0, task_id=task_id)
350
  for _ in range(5):
351
  st = env.state()
352
  code = str(st.current_code)
353
  action = int(_choose_action_name(code, task_id))
354
- print(f"STEP {int(action)}", flush=True)
355
  env.step(action)
356
  st = env.state()
357
  task = registry.get_task(task_id)
358
  score = float(task.grade_against_expected(st.current_code)) if task is not None else 0.0
359
- print(f"END {float(score):.4f}", flush=True)
360
  scores.append(score)
361
  if task_id == "rename_variables":
362
  results["easy"] = score
 
213
  state = get_state()
214
 
215
  # STRICT logging format required by evaluator.
216
+ print(f"[START] task={task_id}", flush=True)
217
 
218
  cumulative_reward = 0.0
219
 
 
228
  cumulative_reward += raw_reward
229
 
230
  # STRICT logging format required by evaluator.
231
+ print(f"[STEP] action={int(action)}", flush=True)
232
 
233
  if result.get("done") or result.get("terminated") or result.get("truncated"):
234
  break
 
237
  task_score = grade(task_id, final_state.get("current_code", ""))
238
 
239
  # STRICT logging format required by evaluator.
240
+ print(f"[END] task={task_id} score={task_score:.4f}", flush=True)
241
 
242
  return task_score
243
 
 
326
  for _ in range(5):
327
  state = get_state()
328
  action = _choose_action_name(str(state.get("current_code", "")), task_id)
329
+ print(f"[STEP] action={int(action)}", flush=True)
330
  step_env(action)
331
  final_state = get_state()
332
  score = float(grade(task_id, final_state.get("current_code", "")))
333
+ print(f"[END] task={task_id} score={float(score):.4f}", flush=True)
334
  scores.append(score)
335
  if task_id == "rename_variables":
336
  results["easy"] = score
 
345
  else:
346
  # Local in-process execution (fast + no network recursion).
347
  for task_id in task_plan:
348
+ print(f"[START] task={task_id}", flush=True)
349
  env.reset(seed=0, task_id=task_id)
350
  for _ in range(5):
351
  st = env.state()
352
  code = str(st.current_code)
353
  action = int(_choose_action_name(code, task_id))
354
+ print(f"[STEP] action={int(action)}", flush=True)
355
  env.step(action)
356
  st = env.state()
357
  task = registry.get_task(task_id)
358
  score = float(task.grade_against_expected(st.current_code)) if task is not None else 0.0
359
+ print(f"[END] task={task_id} score={float(score):.4f}", flush=True)
360
  scores.append(score)
361
  if task_id == "rename_variables":
362
  results["easy"] = score