immortalindeed committed on
Commit
1ecd7e1
·
1 Parent(s): 3466d21

Skip benchmark store on fatal API errors (402/401/403)

Browse files
Files changed (1) hide show
  1. inference.py +17 -7
inference.py CHANGED
@@ -376,6 +376,8 @@ def main() -> None:
376
  return
377
 
378
  scores = {}
 
 
379
  for task_id in TASKS:
380
  try:
381
  score, is_fatal = run_task(client, task_id)
@@ -383,9 +385,10 @@ def main() -> None:
383
 
384
  # If we hit a fatal API error (402/401/403), stop ALL remaining tasks
385
  if is_fatal:
 
386
  print(f"\n🚫 Fatal API error on {task_id}. Stopping all remaining tasks.", flush=True)
387
  print(f" Likely cause: invalid token, no credits, or unauthorized access.", flush=True)
388
- # Fill remaining tasks with 0.01
389
  for remaining in TASKS:
390
  if remaining not in scores:
391
  scores[remaining] = 0.01
@@ -402,12 +405,19 @@ def main() -> None:
402
  print(f"\n✅ All tasks complete! Average: {avg:.2f}", flush=True)
403
  print(json.dumps({"final_scores": scores}), flush=True)
404
 
405
- try:
406
- from server.benchmark_store import append_result
407
- append_result(MODEL_NAME, MODEL_NAME, scores)
408
- print(f"💾 Results saved (avg: {avg:.4f})", flush=True)
409
- except Exception as e:
410
- print(f"⚠️ Failed to save results to disk: {e}", flush=True)
 
 
 
 
 
 
 
411
 
412
 
413
  if __name__ == "__main__":
 
376
  return
377
 
378
  scores = {}
379
+ had_fatal_error = False
380
+
381
  for task_id in TASKS:
382
  try:
383
  score, is_fatal = run_task(client, task_id)
 
385
 
386
  # If we hit a fatal API error (402/401/403), stop ALL remaining tasks
387
  if is_fatal:
388
+ had_fatal_error = True
389
  print(f"\n🚫 Fatal API error on {task_id}. Stopping all remaining tasks.", flush=True)
390
  print(f" Likely cause: invalid token, no credits, or unauthorized access.", flush=True)
391
+ # Emit mandatory [START]/[END] lines for remaining tasks (spec compliance)
392
  for remaining in TASKS:
393
  if remaining not in scores:
394
  scores[remaining] = 0.01
 
405
  print(f"\n✅ All tasks complete! Average: {avg:.2f}", flush=True)
406
  print(json.dumps({"final_scores": scores}), flush=True)
407
 
408
+ # Only save to disk if the run was NOT killed by a fatal API error.
409
+ # A run where the model had no credits or invalid token produces all-0.01
410
+ # scores that would corrupt the benchmark history.
411
+ if had_fatal_error:
412
+ print(f"⚠️ Results NOT saved — run was aborted due to a fatal API error (invalid token / no credits).", flush=True)
413
+ print(f" Fix your API key/credits and re-run to get valid scores.", flush=True)
414
+ else:
415
+ try:
416
+ from server.benchmark_store import append_result
417
+ append_result(MODEL_NAME, MODEL_NAME, scores)
418
+ print(f"💾 Results saved (avg: {avg:.4f})", flush=True)
419
+ except Exception as e:
420
+ print(f"⚠️ Failed to save results to disk: {e}", flush=True)
421
 
422
 
423
  if __name__ == "__main__":