"""
Minimal script to check if tasks in solved_tasks.jsonl were fully completed and verified.
Uses an LLM to assess completion status and adds the result to each row.
"""

import argparse
import json
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed

import litellm
from dotenv import load_dotenv
from pydantic import BaseModel

load_dotenv()


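# Structured-output schema. litellm accepts a Pydantic model as
# response_format (on providers that support structured outputs);
# `reasoning` is listed first so the model writes its explanation
# before committing to the two booleans.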
class CompletionCheck(BaseModel):
    reasoning: str
    completed: bool
    verified: bool


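# Judge prompt; {question}, {solution}, and {trace} are filled in per row
# by check_row().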
PROMPT = """You are evaluating whether an AI agent fully completed a task AND verified its completion.

Task: {question}

Agent's final answer: {solution}

Agent's trace (tool calls and responses):
{trace}

Evaluate:
1. **completed**: Did the agent actually complete the task? (not just explain what could be done, but actually do it)
2. **verified**: Did the agent verify/confirm that the task was completed correctly? (e.g., checked output, validated results, confirmed success)

Be strict:
- If the agent asked for more information or said "please provide...", it's NOT completed.
- If the agent only explained how to do something but didn't do it, it's NOT completed.
- If the agent only made a plan for completing the task but didn't carry it out, it's NOT completed.
- If there's an error in the trace and no recovery, it's NOT completed.
- If the agent didn't check/confirm in some way that the code/command completed successfully or that the result is correct, it's NOT verified.

Return JSON with: completed (bool), verified (bool), reasoning (brief explanation)."""


def format_trace(messages: list) -> str:
    """Format messages trace for the prompt."""
    if not messages:
        return "(No trace)"

    parts = []
    for msg in messages:
        role = msg.get("role", "unknown")
        if role == "system":
            continue

        content = msg.get("content", "")
        tool_calls = msg.get("tool_calls", [])

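        # Record tool calls by name only; arguments are omitted to keep
        # the prompt compact.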
        if tool_calls:
            for tc in tool_calls:
                if isinstance(tc, dict) and "function" in tc:
                    name = tc["function"].get("name", "?")
                    parts.append(f"[TOOL CALL] {name}")

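        # Truncate very long messages: keep the first 4000 and last 1000
        # characters so both the setup and the outcome survive.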
        if content:
            if len(content) > 5000:
                content = content[:4000] + "..." + content[-1000:]
            parts.append(f"[{role.upper()}] {content}")

    return "\n".join(parts) if parts else "(Empty trace)"


def check_row(row: dict, model: str) -> CompletionCheck | None:
    """Check if a single task was completed and verified."""
    prompt = PROMPT.format(
        question=row["question"],
        solution=row.get("solution", "(No solution)"),
        trace=format_trace(row.get("messages", [])),
    )

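    # Passing the Pydantic class as response_format asks litellm for
    # structured output (on providers that support it), so the reply can
    # be parsed directly with model_validate_json. Errors are reported to
    # stderr and returned as None so one bad row doesn't abort the batch.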
    try:
        response = litellm.completion(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            response_format=CompletionCheck,
            timeout=60,
        )
        return CompletionCheck.model_validate_json(response.choices[0].message.content)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return None


def main():
    parser = argparse.ArgumentParser(description="Check task completion status")
    parser.add_argument("--infile", type=str, default="eval/solved_tasks.jsonl")
    parser.add_argument(
        "--outfile", type=str, default="eval/solved_tasks_checked.jsonl"
    )
    parser.add_argument(
        "--model", type=str, default="anthropic/claude-sonnet-4-5-20250929"
    )
    parser.add_argument("--max-concurrent", type=int, default=30)
    args = parser.parse_args()

    print(f"Loading {args.infile}...")
    rows = []
    with open(args.infile) as f:
        for line in f:
            rows.append(json.loads(line))
    print(f"Loaded {len(rows)} rows")

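    # Fan out one LLM call per row; the futures dict maps each future back
    # to its row index so results stay aligned with the input order.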
| | print(f"Checking completion with {args.model}...") |
| | with ThreadPoolExecutor(max_workers=args.max_concurrent) as executor: |
| | futures = { |
| | executor.submit(check_row, row, args.model): i for i, row in enumerate(rows) |
| | } |
| | results = [None] * len(rows) |
| |
|
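        # as_completed yields futures as they finish; rewriting the same
        # console line serves as a lightweight progress indicator.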
        for future in as_completed(futures):
            idx = futures[future]
            results[idx] = future.result()
            print(
                f"Done: {sum(1 for r in results if r is not None)}/{len(rows)}",
                end="\r",
            )

    print()

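    # Merge the check results back into the rows; rows whose check failed
    # get None flags so they can be filtered out or retried later.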
    output_rows = []
    for row, result in zip(rows, results):
        if result:
            row["task_completed"] = result.completed
            row["task_verified"] = result.verified
            row["completion_reasoning"] = result.reasoning
        else:
            row["task_completed"] = None
            row["task_verified"] = None
            row["completion_reasoning"] = "Error during check"
        output_rows.append(row)

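    # Write JSONL; default=str stringifies anything json can't serialize
    # natively (e.g. datetimes) instead of raising.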
| | print(f"Writing to {args.outfile}...") |
| | with open(args.outfile, "w") as f: |
| | for row in output_rows: |
| | f.write(json.dumps(row, default=str) + "\n") |
| |
|
| | |
| | completed = sum(1 for r in results if r and r.completed) |
| | verified = sum(1 for r in results if r and r.verified) |
| | print("\nSummary:") |
| | print(f" Completed: {completed}/{len(rows)}") |
| | print(f" Verified: {verified}/{len(rows)}") |
| |
|
| |
|
if __name__ == "__main__":
    main()
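
# Example invocation, with defaults shown explicitly ("check_completion.py"
# is a hypothetical filename for this script):
#   python check_completion.py --infile eval/solved_tasks.jsonl \
#       --outfile eval/solved_tasks_checked.jsonl --max-concurrent 30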