Spaces:
Sleeping
Sleeping
Reject sorry final_answer while Aristotle pending (max 5 rejections)
Browse files
When the agent tries to finalize with sorry-containing code while
Aristotle jobs are still running, reject it and force the agent
to keep working. Cap at 5 rejections to avoid infinite cost burn
(B2 test burned $8 on 23 rejections).
Validated: B2 ran 70+ minutes and received a completed Aristotle result;
the agent was forced to continue working instead of quitting at 14 min.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
agent.py
CHANGED
|
@@ -372,10 +372,29 @@ async def run_agent_job(job: JobState) -> None:
|
|
| 372 |
|
| 373 |
# Handle final_answer
|
| 374 |
if fn_name == "final_answer":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
job.answer = fn_args.get("answer", "")
|
| 376 |
-
job.best_lean_code =
|
| 377 |
job.best_code_verified = fn_args.get("verified", False)
|
| 378 |
-
job.best_code_sorry_free =
|
| 379 |
job.set_phase(JobPhase.COMPLETED)
|
| 380 |
job.finished_at = time.time()
|
| 381 |
job.add_status("Research complete!")
|
|
|
|
| 372 |
|
| 373 |
# Handle final_answer
|
| 374 |
if fn_name == "final_answer":
|
| 375 |
+
lean_code = fn_args.get("lean_code", job.best_lean_code)
|
| 376 |
+
has_sorry = "sorry" in lean_code
|
| 377 |
+
has_pending_aristotle = any(
|
| 378 |
+
aj.get("status") in ("SUBMITTED", "QUEUED", "IN_PROGRESS")
|
| 379 |
+
for aj in job.aristotle_jobs
|
| 380 |
+
)
|
| 381 |
+
# Reject premature finalization: sorry + Aristotle still running + budget remaining
|
| 382 |
+
# But cap rejections at 5 to avoid infinite cost burn
|
| 383 |
+
reject_count = sum(1 for s in job.status_log if 'Rejected final_answer' in s)
|
| 384 |
+
if has_sorry and has_pending_aristotle and job.total_cost < MAX_COST_PER_QUERY * 0.8 and reject_count < 5:
|
| 385 |
+
job.add_status("Rejected final_answer: proof has sorry and Aristotle is still running. Keep working!")
|
| 386 |
+
job.messages.append({
|
| 387 |
+
"role": "tool",
|
| 388 |
+
"tool_call_id": tool_call.id,
|
| 389 |
+
"content": "REJECTED: Do not give up while Aristotle jobs are still running. Your proof has sorry — keep trying to fill it yourself, or wait for Aristotle results. Check Aristotle status with check_aristotle_status.",
|
| 390 |
+
})
|
| 391 |
+
job.save()
|
| 392 |
+
continue
|
| 393 |
+
|
| 394 |
job.answer = fn_args.get("answer", "")
|
| 395 |
+
job.best_lean_code = lean_code
|
| 396 |
job.best_code_verified = fn_args.get("verified", False)
|
| 397 |
+
job.best_code_sorry_free = not has_sorry
|
| 398 |
job.set_phase(JobPhase.COMPLETED)
|
| 399 |
job.finished_at = time.time()
|
| 400 |
job.add_status("Research complete!")
|