Spaces:
Running
Running
| [ | |
| { | |
| "name": "bpo-benchmark", | |
| "user_info": [], | |
| "test_cases": [ | |
| { | |
| "name": "task_43", | |
| "description": "Tests agent handling of non-standard error format. The candidate_source_requisition_details tool returns {\"err\": \"not_found\"} instead of standard RequisitionNotFoundResponse when given an invalid ID.", | |
| "intent": "Get detailed info for requisition 99999BR", | |
| "difficulty": "medium", | |
| "expected_output": { | |
| "response": "Requisition 99999BR was not found. The system returned a non-standard error indicating the requisition does not exist.", | |
| "keywords": [ | |
| "99999BR", | |
| "not found|not_found|doesn't exist|does not exist|invalid" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "candidate_source_requisition_details", | |
| "args": { | |
| "requisition_id": "99999BR" | |
| } | |
| } | |
| ], | |
| "error_handling": { | |
| "error_type": "undocumented_error_format", | |
| "should_report_error": true, | |
| "should_retry": false, | |
| "expected_behavior": "Agent should parse non-standard error format and report not found" | |
| }, | |
| "tool_call_results": [ | |
| { | |
| "name": "candidate_source_requisition_details", | |
| "result": { | |
| "err": "not_found", | |
| "req": "99999BR" | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "task_44", | |
| "description": "Tests agent handling of undocumented pagination. The candidate_source_list_all_sources tool returns only a page of results with a next_page token not described in any schema.", | |
| "intent": "List all available sourcing channels for 05958BR", | |
| "difficulty": "hard", | |
| "expected_output": { | |
| "response": "The available sourcing channels for 05958BR include multiple sources. The first page of results lists CyberSec Jobs, Dice, and GitHub; the response is paginated, showing 3 of a total count of 7 channels, with more results available.", | |
| "keywords": [ | |
| "source|sources|channels", | |
| "05958BR" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "candidate_source_list_all_sources", | |
| "args": { | |
| "requisition_id": "05958BR" | |
| } | |
| } | |
| ], | |
| "error_handling": { | |
| "error_type": "undocumented_pagination", | |
| "should_report_error": false, | |
| "should_retry": false, | |
| "expected_behavior": "Agent should detect and handle pagination, noting there are more results" | |
| }, | |
| "tool_call_results": [ | |
| { | |
| "name": "candidate_source_list_all_sources", | |
| "result": { | |
| "requisition_id": "05958BR", | |
| "sources": [ | |
| { | |
| "name": "CyberSec Jobs", | |
| "index": 0 | |
| }, | |
| { | |
| "name": "Dice", | |
| "index": 1 | |
| }, | |
| { | |
| "name": "GitHub", | |
| "index": 2 | |
| } | |
| ], | |
| "total_count": 7, | |
| "page_size": 3, | |
| "page": 1, | |
| "next_page": "eyJvZmZzZXQiOjMsInJlcV9pZCI6IjA1OTU4QlIifQ==", | |
| "has_more": true | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "task_45", | |
| "description": "Tests agent handling of undocumented rate limiting info in response body. The candidate_source_batch_metrics tool includes X-RateLimit headers embedded in the JSON response.", | |
| "intent": "Fetch batch metrics for all sources on 05958BR", | |
| "difficulty": "medium", | |
| "expected_output": { | |
| "response": "Batch metrics for 05958BR show candidate counts and hire data across all sourcing channels including LinkedIn, Dice, Indeed, GitHub, Referral, CyberSec Jobs, and Internal.", | |
| "keywords": [ | |
| "metrics|batch", | |
| "candidates|hires|sources", | |
| "05958BR" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "candidate_source_batch_metrics", | |
| "args": { | |
| "requisition_id": "05958BR" | |
| } | |
| } | |
| ], | |
| "error_handling": { | |
| "error_type": "undocumented_rate_limiting", | |
| "should_report_error": false, | |
| "should_retry": false, | |
| "expected_behavior": "Agent should process the metrics data and optionally note rate limit information" | |
| }, | |
| "tool_call_results": [ | |
| { | |
| "name": "candidate_source_batch_metrics", | |
| "result": { | |
| "requisition_id": "05958BR", | |
| "metrics": { | |
| "CyberSec Jobs": { | |
| "candidates": 200, | |
| "hires": 3, | |
| "reviewed": 161 | |
| }, | |
| "Dice": { | |
| "candidates": 516, | |
| "hires": 11, | |
| "reviewed": 57 | |
| }, | |
| "GitHub": { | |
| "candidates": 468, | |
| "hires": 10, | |
| "reviewed": 356 | |
| }, | |
| "Indeed": { | |
| "candidates": 410, | |
| "hires": 0, | |
| "reviewed": 316 | |
| }, | |
| "Internal": { | |
| "candidates": 400, | |
| "hires": 5, | |
| "reviewed": 296 | |
| }, | |
| "LinkedIn": { | |
| "candidates": 519, | |
| "hires": 7, | |
| "reviewed": 390 | |
| }, | |
| "Referral": { | |
| "candidates": 400, | |
| "hires": 4, | |
| "reviewed": 280 | |
| } | |
| }, | |
| "X-RateLimit-Limit": 100, | |
| "X-RateLimit-Remaining": 97, | |
| "X-RateLimit-Reset": "2025-05-01T00:00:00Z", | |
| "X-RateLimit-Window": "1h" | |
| } | |
| } | |
| ] | |
| } | |
| } | |
| ] | |
| } | |
| ] | |