Vilin97 Claude Opus 4.6 (1M context) committed on
Commit
d64d4c5
·
1 Parent(s): 7d872a8

Reject sorry final_answer while Aristotle pending (max 5 rejections)

Browse files

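When the agent tries to finalize with sorry-containing code while
Aristotle jobs are still running (status SUBMITTED, QUEUED, or
IN_PROGRESS) and less than 80% of the per-query cost budget has been
spent, reject the final_answer and force the agent to keep working.
Cap at 5 rejections per job to avoid infinite cost burn
(B2 test burned $8 on 23 rejections).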

Validated: B2 ran 70+ minutes, received completed Aristotle result,
agent was forced to continue working instead of quitting at 14 min.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (1) hide show
  1. agent.py +21 -2
agent.py CHANGED
@@ -372,10 +372,29 @@ async def run_agent_job(job: JobState) -> None:
372
 
373
  # Handle final_answer
374
  if fn_name == "final_answer":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  job.answer = fn_args.get("answer", "")
376
- job.best_lean_code = fn_args.get("lean_code", job.best_lean_code)
377
  job.best_code_verified = fn_args.get("verified", False)
378
- job.best_code_sorry_free = "sorry" not in job.best_lean_code
379
  job.set_phase(JobPhase.COMPLETED)
380
  job.finished_at = time.time()
381
  job.add_status("Research complete!")
 
372
 
373
  # Handle final_answer
374
  if fn_name == "final_answer":
375
+ lean_code = fn_args.get("lean_code", job.best_lean_code)
376
+ has_sorry = "sorry" in lean_code
377
+ has_pending_aristotle = any(
378
+ aj.get("status") in ("SUBMITTED", "QUEUED", "IN_PROGRESS")
379
+ for aj in job.aristotle_jobs
380
+ )
381
+ # Reject premature finalization: sorry + Aristotle still running + budget remaining
382
+ # But cap rejections at 5 to avoid infinite cost burn
383
+ reject_count = sum(1 for s in job.status_log if 'Rejected final_answer' in s)
384
+ if has_sorry and has_pending_aristotle and job.total_cost < MAX_COST_PER_QUERY * 0.8 and reject_count < 5:
385
+ job.add_status("Rejected final_answer: proof has sorry and Aristotle is still running. Keep working!")
386
+ job.messages.append({
387
+ "role": "tool",
388
+ "tool_call_id": tool_call.id,
389
+ "content": "REJECTED: Do not give up while Aristotle jobs are still running. Your proof has sorry — keep trying to fill it yourself, or wait for Aristotle results. Check Aristotle status with check_aristotle_status.",
390
+ })
391
+ job.save()
392
+ continue
393
+
394
  job.answer = fn_args.get("answer", "")
395
+ job.best_lean_code = lean_code
396
  job.best_code_verified = fn_args.get("verified", False)
397
+ job.best_code_sorry_free = not has_sorry
398
  job.set_phase(JobPhase.COMPLETED)
399
  job.finished_at = time.time()
400
  job.add_status("Research complete!")