Spaces:
Running
Running
Commit · 1ecd7e1
1
Parent(s): 3466d21
Skip benchmark store on fatal API errors (402/401/403)
Browse files
- inference.py (+17 -7)
inference.py
CHANGED
|
@@ -376,6 +376,8 @@ def main() -> None:
|
|
| 376 |
return
|
| 377 |
|
| 378 |
scores = {}
|
|
|
|
|
|
|
| 379 |
for task_id in TASKS:
|
| 380 |
try:
|
| 381 |
score, is_fatal = run_task(client, task_id)
|
|
@@ -383,9 +385,10 @@ def main() -> None:
|
|
| 383 |
|
| 384 |
# If we hit a fatal API error (402/401/403), stop ALL remaining tasks
|
| 385 |
if is_fatal:
|
|
|
|
| 386 |
print(f"\n🚫 Fatal API error on {task_id}. Stopping all remaining tasks.", flush=True)
|
| 387 |
print(f" Likely cause: invalid token, no credits, or unauthorized access.", flush=True)
|
| 388 |
-
#
|
| 389 |
for remaining in TASKS:
|
| 390 |
if remaining not in scores:
|
| 391 |
scores[remaining] = 0.01
|
|
@@ -402,12 +405,19 @@ def main() -> None:
|
|
| 402 |
print(f"\n✅ All tasks complete! Average: {avg:.2f}", flush=True)
|
| 403 |
print(json.dumps({"final_scores": scores}), flush=True)
|
| 404 |
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 411 |
|
| 412 |
|
| 413 |
if __name__ == "__main__":
|
|
|
|
| 376 |
return
|
| 377 |
|
| 378 |
scores = {}
|
| 379 |
+
had_fatal_error = False
|
| 380 |
+
|
| 381 |
for task_id in TASKS:
|
| 382 |
try:
|
| 383 |
score, is_fatal = run_task(client, task_id)
|
|
|
|
| 385 |
|
| 386 |
# If we hit a fatal API error (402/401/403), stop ALL remaining tasks
|
| 387 |
if is_fatal:
|
| 388 |
+
had_fatal_error = True
|
| 389 |
print(f"\n🚫 Fatal API error on {task_id}. Stopping all remaining tasks.", flush=True)
|
| 390 |
print(f" Likely cause: invalid token, no credits, or unauthorized access.", flush=True)
|
| 391 |
+
# Emit mandatory [START]/[END] lines for remaining tasks (spec compliance)
|
| 392 |
for remaining in TASKS:
|
| 393 |
if remaining not in scores:
|
| 394 |
scores[remaining] = 0.01
|
|
|
|
| 405 |
print(f"\n✅ All tasks complete! Average: {avg:.2f}", flush=True)
|
| 406 |
print(json.dumps({"final_scores": scores}), flush=True)
|
| 407 |
|
| 408 |
+
# Only save to disk if the run was NOT killed by a fatal API error.
|
| 409 |
+
# A run where the model had no credits or invalid token produces all-0.01
|
| 410 |
+
# scores that would corrupt the benchmark history.
|
| 411 |
+
if had_fatal_error:
|
| 412 |
+
print(f"⚠️ Results NOT saved — run was aborted due to a fatal API error (invalid token / no credits).", flush=True)
|
| 413 |
+
print(f" Fix your API key/credits and re-run to get valid scores.", flush=True)
|
| 414 |
+
else:
|
| 415 |
+
try:
|
| 416 |
+
from server.benchmark_store import append_result
|
| 417 |
+
append_result(MODEL_NAME, MODEL_NAME, scores)
|
| 418 |
+
print(f"💾 Results saved (avg: {avg:.4f})", flush=True)
|
| 419 |
+
except Exception as e:
|
| 420 |
+
print(f"⚠️ Failed to save results to disk: {e}", flush=True)
|
| 421 |
|
| 422 |
|
| 423 |
if __name__ == "__main__":
|