immortalindeed committed on
Commit
723407b
·
1 Parent(s): b7c48de

Fix: abort-path [END] lines now emit rewards=0.01 instead of an empty rewards= field, to prevent the evaluator from assigning a 0.0 score

Browse files
Files changed (1) hide show
  1. inference.py +4 -4
inference.py CHANGED
@@ -266,12 +266,12 @@ def run_task(client: OpenAI, task_id: str) -> tuple:
266
  except Exception as e:
267
  # Env unreachable — must still emit [START] and [END]
268
  print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
269
- print(f"[END] success=false steps=0 rewards=", flush=True)
270
  return 0.01, False
271
 
272
  if "error" in data and not data.get("episode_id"):
273
  print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
274
- print(f"[END] success=false steps=0 rewards=", flush=True)
275
  return 0.01, False
276
 
277
  episode_id = data.get("episode_id", "unknown")
@@ -420,13 +420,13 @@ def main() -> None:
420
  if remaining not in scores:
421
  scores[remaining] = 0.01
422
  print(f"[START] task={remaining} env={BENCHMARK} model={MODEL_NAME}", flush=True)
423
- print(f"[END] success=false steps=0 rewards=", flush=True)
424
  break
425
 
426
  except Exception as e:
427
  scores[task_id] = 0.01
428
  print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
429
- print(f"[END] success=false steps=0 rewards=", flush=True)
430
 
431
  avg = round(sum(scores.values()) / max(len(scores), 1), 4)
432
  print(f"\n✅ All tasks complete! Average: {avg:.4f}", flush=True)
 
266
  except Exception as e:
267
  # Env unreachable — must still emit [START] and [END]
268
  print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
269
+ print(f"[END] success=false steps=0 rewards=0.01", flush=True)
270
  return 0.01, False
271
 
272
  if "error" in data and not data.get("episode_id"):
273
  print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
274
+ print(f"[END] success=false steps=0 rewards=0.01", flush=True)
275
  return 0.01, False
276
 
277
  episode_id = data.get("episode_id", "unknown")
 
420
  if remaining not in scores:
421
  scores[remaining] = 0.01
422
  print(f"[START] task={remaining} env={BENCHMARK} model={MODEL_NAME}", flush=True)
423
+ print(f"[END] success=false steps=0 rewards=0.01", flush=True)
424
  break
425
 
426
  except Exception as e:
427
  scores[task_id] = 0.01
428
  print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}", flush=True)
429
+ print(f"[END] success=false steps=0 rewards=0.01", flush=True)
430
 
431
  avg = round(sum(scores.values()) / max(len(scores), 1), 4)
432
  print(f"\n✅ All tasks complete! Average: {avg:.4f}", flush=True)