Spaces:
Running
Running
| [ | |
| { | |
| "name": "bpo-benchmark", | |
| "user_info": [], | |
| "test_cases": [ | |
| { | |
| "name": "task_34", | |
| "description": "Tests agent handling of an untyped (schema-less) response. The skills_model_registry tool returns a plain dict with no Pydantic schema, including nested model objects whose metric fields vary (accuracy, r_squared, precision).", | |
| "intent": "What ML models are registered for 05958BR?", | |
| "difficulty": "medium", | |
| "expected_output": { | |
| "response": "The following ML models are registered for 05958BR: Skill relevance classifier (v2.1.0, active), SLA impact regression model (v1.4.2, active), and Funnel conversion recommender (v3.0.0-beta, staging).", | |
| "keywords": [ | |
| "Skill relevance classifier", | |
| "SLA impact regression model", | |
| "Funnel conversion recommender", | |
| "active|staging" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "skills_model_registry", | |
| "args": { | |
| "requisition_id": "05958BR" | |
| } | |
| } | |
| ], | |
| "error_handling": { | |
| "error_type": "missing_output_schema", | |
| "should_report_error": false, | |
| "should_retry": false, | |
| "expected_behavior": "Agent should infer structure from the untyped response and present model info" | |
| }, | |
| "tool_call_results": [ | |
| { | |
| "name": "skills_model_registry", | |
| "result": { | |
| "requisition_id": "05958BR", | |
| "models": [ | |
| { | |
| "name": "Skill relevance classifier", | |
| "version": "2.1.0", | |
| "status": "active", | |
| "last_trained": "2024-11-15", | |
| "accuracy": 0.87 | |
| }, | |
| { | |
| "name": "SLA impact regression model", | |
| "version": "1.4.2", | |
| "status": "active", | |
| "last_trained": "2024-10-01", | |
| "r_squared": 0.72 | |
| }, | |
| { | |
| "name": "Funnel conversion recommender", | |
| "version": "3.0.0-beta", | |
| "status": "staging", | |
| "last_trained": "2025-01-20", | |
| "precision": 0.81 | |
| } | |
| ], | |
| "registry_updated": "2025-04-29" | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "task_35", | |
| "description": "Tests agent handling of undocumented/extra input parameters. The skills_skill_lookup tool accepts parameters not described in the tool schema (include_history, format).", | |
| "intent": "Look up the skill Python for requisition 05958BR", | |
| "difficulty": "medium", | |
| "expected_output": { | |
| "response": "Python for requisition 05958BR has an occurrence count of 200 across 2913 similar candidates (an occurrence rate of 6.9%), showing its prevalence in the candidate pool.", | |
| "keywords": [ | |
| "Python", | |
| "05958BR", | |
| "occurrence|count|rate" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "skills_skill_lookup", | |
| "args": { | |
| "requisition_id": "05958BR", | |
| "skill_name": "Python" | |
| } | |
| } | |
| ], | |
| "error_handling": { | |
| "error_type": "missing_input_schema", | |
| "should_report_error": false, | |
| "should_retry": false, | |
| "expected_behavior": "Agent should infer required parameters and call the tool correctly" | |
| }, | |
| "tool_call_results": [ | |
| { | |
| "name": "skills_skill_lookup", | |
| "result": { | |
| "requisition_id": "05958BR", | |
| "skill_name": "Python", | |
| "occurrence_count": 200, | |
| "total_candidates": 2913, | |
| "occurrence_rate": 6.9 | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "task_36", | |
| "description": "Tests agent handling of a response with missing required fields. The candidate_source_source_metrics_lite tool returns metrics entries that omit the source_name field.", | |
| "intent": "Get a lightweight summary of source metrics for 05958BR", | |
| "difficulty": "hard", | |
| "expected_output": { | |
| "response": "Source metrics for 05958BR show candidate counts and hire counts per source. Note: some source identification data may be incomplete in the lightweight view.", | |
| "keywords": [ | |
| "metrics|source", | |
| "candidate|hire", | |
| "05958BR" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "candidate_source_source_metrics_lite", | |
| "args": { | |
| "requisition_id": "05958BR" | |
| } | |
| } | |
| ], | |
| "error_handling": { | |
| "error_type": "missing_fields", | |
| "should_report_error": true, | |
| "should_retry": false, | |
| "expected_behavior": "Agent should handle partial data and note missing source names" | |
| }, | |
| "tool_call_results": [ | |
| { | |
| "name": "candidate_source_source_metrics_lite", | |
| "result": { | |
| "requisition_id": "05958BR", | |
| "metrics": [ | |
| { | |
| "candidate_count": 200, | |
| "hire_count": 3, | |
| "sla_met_count": 108 | |
| }, | |
| { | |
| "candidate_count": 516, | |
| "hire_count": 11, | |
| "sla_met_count": 54 | |
| }, | |
| { | |
| "candidate_count": 468, | |
| "hire_count": 10, | |
| "sla_met_count": 320 | |
| }, | |
| { | |
| "candidate_count": 410, | |
| "hire_count": 0, | |
| "sla_met_count": 272 | |
| }, | |
| { | |
| "candidate_count": 400, | |
| "hire_count": 5, | |
| "sla_met_count": 281 | |
| }, | |
| { | |
| "candidate_count": 519, | |
| "hire_count": 7, | |
| "sla_met_count": 370 | |
| }, | |
| { | |
| "candidate_count": 400, | |
| "hire_count": 4, | |
| "sla_met_count": 266 | |
| } | |
| ], | |
| "note": "Lightweight view — some fields may be omitted for performance." | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "task_37", | |
| "description": "Tests agent handling of wrong field types in a response. The candidate_source_volume_report tool returns numeric fields (candidate_count, hire_count, total_candidates) as strings, e.g. candidate_count as the string '519' instead of the int 519.", | |
| "intent": "Generate a volume report for 05958BR", | |
| "difficulty": "medium", | |
| "expected_output": { | |
| "response": "Volume report for 05958BR shows candidate counts by source, with LinkedIn, Dice, and GitHub among the top contributors.", | |
| "keywords": [ | |
| "volume|report", | |
| "candidates|count", | |
| "05958BR" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "candidate_source_volume_report", | |
| "args": { | |
| "requisition_id": "05958BR" | |
| } | |
| } | |
| ], | |
| "error_handling": { | |
| "error_type": "wrong_field_types", | |
| "should_report_error": false, | |
| "should_retry": false, | |
| "expected_behavior": "Agent should handle type coercion (string to int) transparently" | |
| }, | |
| "tool_call_results": [ | |
| { | |
| "name": "candidate_source_volume_report", | |
| "result": { | |
| "requisition_id": "05958BR", | |
| "metrics": [ | |
| { | |
| "source_name": "CyberSec Jobs", | |
| "candidate_count": "200", | |
| "hire_count": "3", | |
| "review_rate": "80.5%" | |
| }, | |
| { | |
| "source_name": "Dice", | |
| "candidate_count": "516", | |
| "hire_count": "11", | |
| "review_rate": "11.0%" | |
| }, | |
| { | |
| "source_name": "GitHub", | |
| "candidate_count": "468", | |
| "hire_count": "10", | |
| "review_rate": "76.1%" | |
| }, | |
| { | |
| "source_name": "Indeed", | |
| "candidate_count": "410", | |
| "hire_count": "0", | |
| "review_rate": "77.1%" | |
| }, | |
| { | |
| "source_name": "Internal", | |
| "candidate_count": "400", | |
| "hire_count": "5", | |
| "review_rate": "74.0%" | |
| }, | |
| { | |
| "source_name": "LinkedIn", | |
| "candidate_count": "519", | |
| "hire_count": "7", | |
| "review_rate": "75.1%" | |
| }, | |
| { | |
| "source_name": "Referral", | |
| "candidate_count": "400", | |
| "hire_count": "4", | |
| "review_rate": "70.0%" | |
| } | |
| ], | |
| "total_candidates": "2913" | |
| } | |
| } | |
| ] | |
| } | |
| } | |
| ] | |
| } | |
| ] | |