Commit ·
96c1a25
1
Parent(s): 7f6de27
Fixed structured logging format for validator
Browse files - inference.py +8 -8
inference.py
CHANGED
|
@@ -213,7 +213,7 @@ def run_episode(client: Optional[OpenAI], task_id: str, episode_num: int) -> flo
|
|
| 213 |
state = get_state()
|
| 214 |
|
| 215 |
# STRICT logging format required by evaluator.
|
| 216 |
-
print(f"START {task_id}", flush=True)
|
| 217 |
|
| 218 |
cumulative_reward = 0.0
|
| 219 |
|
|
@@ -228,7 +228,7 @@ def run_episode(client: Optional[OpenAI], task_id: str, episode_num: int) -> flo
|
|
| 228 |
cumulative_reward += raw_reward
|
| 229 |
|
| 230 |
# STRICT logging format required by evaluator.
|
| 231 |
-
print(f"STEP {int(action)}", flush=True)
|
| 232 |
|
| 233 |
if result.get("done") or result.get("terminated") or result.get("truncated"):
|
| 234 |
break
|
|
@@ -237,7 +237,7 @@ def run_episode(client: Optional[OpenAI], task_id: str, episode_num: int) -> flo
|
|
| 237 |
task_score = grade(task_id, final_state.get("current_code", ""))
|
| 238 |
|
| 239 |
# STRICT logging format required by evaluator.
|
| 240 |
-
print(f"END {task_score:.4f}", flush=True)
|
| 241 |
|
| 242 |
return task_score
|
| 243 |
|
|
@@ -326,11 +326,11 @@ def run_all_tasks() -> Dict[str, float]:
|
|
| 326 |
for _ in range(5):
|
| 327 |
state = get_state()
|
| 328 |
action = _choose_action_name(str(state.get("current_code", "")), task_id)
|
| 329 |
-
print(f"STEP {int(action)}", flush=True)
|
| 330 |
step_env(action)
|
| 331 |
final_state = get_state()
|
| 332 |
score = float(grade(task_id, final_state.get("current_code", "")))
|
| 333 |
-
print(f"END {float(score):.4f}", flush=True)
|
| 334 |
scores.append(score)
|
| 335 |
if task_id == "rename_variables":
|
| 336 |
results["easy"] = score
|
|
@@ -345,18 +345,18 @@ def run_all_tasks() -> Dict[str, float]:
|
|
| 345 |
else:
|
| 346 |
# Local in-process execution (fast + no network recursion).
|
| 347 |
for task_id in task_plan:
|
| 348 |
-
print(f"START {task_id}", flush=True)
|
| 349 |
env.reset(seed=0, task_id=task_id)
|
| 350 |
for _ in range(5):
|
| 351 |
st = env.state()
|
| 352 |
code = str(st.current_code)
|
| 353 |
action = int(_choose_action_name(code, task_id))
|
| 354 |
-
print(f"STEP {int(action)}", flush=True)
|
| 355 |
env.step(action)
|
| 356 |
st = env.state()
|
| 357 |
task = registry.get_task(task_id)
|
| 358 |
score = float(task.grade_against_expected(st.current_code)) if task is not None else 0.0
|
| 359 |
-
print(f"END {float(score):.4f}", flush=True)
|
| 360 |
scores.append(score)
|
| 361 |
if task_id == "rename_variables":
|
| 362 |
results["easy"] = score
|
|
|
|
| 213 |
state = get_state()
|
| 214 |
|
| 215 |
# STRICT logging format required by evaluator.
|
| 216 |
+
print(f"[START] task={task_id}", flush=True)
|
| 217 |
|
| 218 |
cumulative_reward = 0.0
|
| 219 |
|
|
|
|
| 228 |
cumulative_reward += raw_reward
|
| 229 |
|
| 230 |
# STRICT logging format required by evaluator.
|
| 231 |
+
print(f"[STEP] action={int(action)}", flush=True)
|
| 232 |
|
| 233 |
if result.get("done") or result.get("terminated") or result.get("truncated"):
|
| 234 |
break
|
|
|
|
| 237 |
task_score = grade(task_id, final_state.get("current_code", ""))
|
| 238 |
|
| 239 |
# STRICT logging format required by evaluator.
|
| 240 |
+
print(f"[END] task={task_id} score={task_score:.4f}", flush=True)
|
| 241 |
|
| 242 |
return task_score
|
| 243 |
|
|
|
|
| 326 |
for _ in range(5):
|
| 327 |
state = get_state()
|
| 328 |
action = _choose_action_name(str(state.get("current_code", "")), task_id)
|
| 329 |
+
print(f"[STEP] action={int(action)}", flush=True)
|
| 330 |
step_env(action)
|
| 331 |
final_state = get_state()
|
| 332 |
score = float(grade(task_id, final_state.get("current_code", "")))
|
| 333 |
+
print(f"[END] task={task_id} score={float(score):.4f}", flush=True)
|
| 334 |
scores.append(score)
|
| 335 |
if task_id == "rename_variables":
|
| 336 |
results["easy"] = score
|
|
|
|
| 345 |
else:
|
| 346 |
# Local in-process execution (fast + no network recursion).
|
| 347 |
for task_id in task_plan:
|
| 348 |
+
print(f"[START] task={task_id}", flush=True)
|
| 349 |
env.reset(seed=0, task_id=task_id)
|
| 350 |
for _ in range(5):
|
| 351 |
st = env.state()
|
| 352 |
code = str(st.current_code)
|
| 353 |
action = int(_choose_action_name(code, task_id))
|
| 354 |
+
print(f"[STEP] action={int(action)}", flush=True)
|
| 355 |
env.step(action)
|
| 356 |
st = env.state()
|
| 357 |
task = registry.get_task(task_id)
|
| 358 |
score = float(task.grade_against_expected(st.current_code)) if task is not None else 0.0
|
| 359 |
+
print(f"[END] task={task_id} score={float(score):.4f}", flush=True)
|
| 360 |
scores.append(score)
|
| 361 |
if task_id == "rename_variables":
|
| 362 |
results["easy"] = score
|