[ { "name": "bpo-benchmark", "user_info": [], "test_cases": [ { "name": "task_34", "description": "Tests agent handling of an untyped, schemaless response. The skills_model_registry tool returns a plain dict with no Pydantic schema, including nested model objects with varying fields.", "intent": "What ML models are registered for 05958BR?", "difficulty": "medium", "expected_output": { "response": "The following ML models are registered for 05958BR: Skill relevance classifier (v2.1.0, active), SLA impact regression model (v1.4.2, active), and Funnel conversion recommender (v3.0.0-beta, staging).", "keywords": [ "Skill relevance classifier", "SLA impact regression model", "Funnel conversion recommender", "active|staging" ], "tool_calls": [ { "name": "skills_model_registry", "args": { "requisition_id": "05958BR" } } ], "error_handling": { "error_type": "missing_output_schema", "should_report_error": false, "should_retry": false, "expected_behavior": "Agent should infer structure from the untyped response and present model info" }, "tool_call_results": [ { "name": "skills_model_registry", "result": { "requisition_id": "05958BR", "models": [ { "name": "Skill relevance classifier", "version": "2.1.0", "status": "active", "last_trained": "2024-11-15", "accuracy": 0.87 }, { "name": "SLA impact regression model", "version": "1.4.2", "status": "active", "last_trained": "2024-10-01", "r_squared": 0.72 }, { "name": "Funnel conversion recommender", "version": "3.0.0-beta", "status": "staging", "last_trained": "2025-01-20", "precision": 0.81 } ], "registry_updated": "2025-04-29" } } ] } }, { "name": "task_35", "description": "Tests agent handling of undocumented/extra input parameters. 
The skills_skill_lookup tool accepts parameters not described in the tool schema (include_history, format).", "intent": "Look up the skill Python for requisition 05958BR", "difficulty": "medium", "expected_output": { "response": "Python occurs in 200 of the 2,913 similar candidates for requisition 05958BR (an occurrence rate of 6.9%), showing its prevalence in the candidate pool.", "keywords": [ "Python", "05958BR", "occurrence|count|rate" ], "tool_calls": [ { "name": "skills_skill_lookup", "args": { "requisition_id": "05958BR", "skill_name": "Python" } } ], "error_handling": { "error_type": "missing_input_schema", "should_report_error": false, "should_retry": false, "expected_behavior": "Agent should infer required parameters and call the tool correctly" }, "tool_call_results": [ { "name": "skills_skill_lookup", "result": { "requisition_id": "05958BR", "skill_name": "Python", "occurrence_count": 200, "total_candidates": 2913, "occurrence_rate": 6.9 } } ] } }, { "name": "task_36", "description": "Tests agent handling of response with missing required fields. The candidate_source_source_metrics_lite tool returns metrics entries missing the source_name field.", "intent": "Get a lightweight summary of source metrics for 05958BR", "difficulty": "hard", "expected_output": { "response": "Source metrics for 05958BR show candidate counts and hire counts per source. 
Note: source names are missing from the lightweight view, so the metrics cannot be attributed to specific sources.", "keywords": [ "metrics|source", "candidate|hire", "05958BR" ], "tool_calls": [ { "name": "candidate_source_source_metrics_lite", "args": { "requisition_id": "05958BR" } } ], "error_handling": { "error_type": "missing_fields", "should_report_error": true, "should_retry": false, "expected_behavior": "Agent should handle partial data and note missing source names" }, "tool_call_results": [ { "name": "candidate_source_source_metrics_lite", "result": { "requisition_id": "05958BR", "metrics": [ { "candidate_count": 200, "hire_count": 3, "sla_met_count": 108 }, { "candidate_count": 516, "hire_count": 11, "sla_met_count": 54 }, { "candidate_count": 468, "hire_count": 10, "sla_met_count": 320 }, { "candidate_count": 410, "hire_count": 0, "sla_met_count": 272 }, { "candidate_count": 400, "hire_count": 5, "sla_met_count": 281 }, { "candidate_count": 519, "hire_count": 7, "sla_met_count": 370 }, { "candidate_count": 400, "hire_count": 4, "sla_met_count": 266 } ], "note": "Lightweight view — some fields may be omitted for performance." } } ] } }, { "name": "task_37", "description": "Tests agent handling of wrong field types in response. 
The candidate_source_volume_report tool returns its count fields (candidate_count, hire_count, total_candidates) as strings, e.g. candidate_count as '519' instead of int 519.", "intent": "Generate a volume report for 05958BR", "difficulty": "medium", "expected_output": { "response": "Volume report for 05958BR shows candidate counts by source, with LinkedIn, Dice, and GitHub among the top contributors.", "keywords": [ "volume|report", "candidates|count", "05958BR" ], "tool_calls": [ { "name": "candidate_source_volume_report", "args": { "requisition_id": "05958BR" } } ], "error_handling": { "error_type": "wrong_field_types", "should_report_error": false, "should_retry": false, "expected_behavior": "Agent should handle type coercion (string to int) transparently" }, "tool_call_results": [ { "name": "candidate_source_volume_report", "result": { "requisition_id": "05958BR", "metrics": [ { "source_name": "CyberSec Jobs", "candidate_count": "200", "hire_count": "3", "review_rate": "80.5%" }, { "source_name": "Dice", "candidate_count": "516", "hire_count": "11", "review_rate": "11.0%" }, { "source_name": "GitHub", "candidate_count": "468", "hire_count": "10", "review_rate": "76.1%" }, { "source_name": "Indeed", "candidate_count": "410", "hire_count": "0", "review_rate": "77.1%" }, { "source_name": "Internal", "candidate_count": "400", "hire_count": "5", "review_rate": "74.0%" }, { "source_name": "LinkedIn", "candidate_count": "519", "hire_count": "7", "review_rate": "75.1%" }, { "source_name": "Referral", "candidate_count": "400", "hire_count": "4", "review_rate": "70.0%" } ], "total_candidates": "2913" } } ] } } ] } ]