BPO-Bench / data /tasks_schema_violations.json
haroldshipibm's picture
Upload folder using huggingface_hub
d075a5b verified
[
{
"name": "bpo-benchmark",
"user_info": [],
"test_cases": [
{
"name": "task_34",
"description": "Tests agent handling of untyped/unschema'd response. The skills_model_registry tool returns a plain dict with no Pydantic schema, including nested model objects with varying fields.",
"intent": "What ML models are registered for 05958BR?",
"difficulty": "medium",
"expected_output": {
"response": "The following ML models are registered for 05958BR: Skill relevance classifier (v2.1.0, active), SLA impact regression model (v1.4.2, active), and Funnel conversion recommender (v3.0.0-beta, staging).",
"keywords": [
"Skill relevance classifier",
"SLA impact regression model",
"Funnel conversion recommender",
"active|staging"
],
"tool_calls": [
{
"name": "skills_model_registry",
"args": {
"requisition_id": "05958BR"
}
}
],
"error_handling": {
"error_type": "missing_output_schema",
"should_report_error": false,
"should_retry": false,
"expected_behavior": "Agent should infer structure from the untyped response and present model info"
},
"tool_call_results": [
{
"name": "skills_model_registry",
"result": {
"requisition_id": "05958BR",
"models": [
{
"name": "Skill relevance classifier",
"version": "2.1.0",
"status": "active",
"last_trained": "2024-11-15",
"accuracy": 0.87
},
{
"name": "SLA impact regression model",
"version": "1.4.2",
"status": "active",
"last_trained": "2024-10-01",
"r_squared": 0.72
},
{
"name": "Funnel conversion recommender",
"version": "3.0.0-beta",
"status": "staging",
"last_trained": "2025-01-20",
"precision": 0.81
}
],
"registry_updated": "2025-04-29"
}
}
]
}
},
{
"name": "task_35",
"description": "Tests agent handling of undocumented/extra input parameters. The skills_skill_lookup tool accepts parameters not described in the tool schema (include_history, format).",
"intent": "Look up the skill Python for requisition 05958BR",
"difficulty": "medium",
"expected_output": {
"response": "Python for requisition 05958BR has an occurrence count across similar candidates, showing its prevalence in the candidate pool.",
"keywords": [
"Python",
"05958BR",
"occurrence|count|rate"
],
"tool_calls": [
{
"name": "skills_skill_lookup",
"args": {
"requisition_id": "05958BR",
"skill_name": "Python"
}
}
],
"error_handling": {
"error_type": "missing_input_schema",
"should_report_error": false,
"should_retry": false,
"expected_behavior": "Agent should infer required parameters and call the tool correctly"
},
"tool_call_results": [
{
"name": "skills_skill_lookup",
"result": {
"requisition_id": "05958BR",
"skill_name": "Python",
"occurrence_count": 200,
"total_candidates": 2913,
"occurrence_rate": 6.9
}
}
]
}
},
{
"name": "task_36",
"description": "Tests agent handling of response with missing required fields. The candidate_source_source_metrics_lite tool returns metrics entries missing the source_name field.",
"intent": "Get a lightweight summary of source metrics for 05958BR",
"difficulty": "hard",
"expected_output": {
"response": "Source metrics for 05958BR show candidate counts and hire counts per source. Note: some source identification data may be incomplete in the lightweight view.",
"keywords": [
"metrics|source",
"candidate|hire",
"05958BR"
],
"tool_calls": [
{
"name": "candidate_source_source_metrics_lite",
"args": {
"requisition_id": "05958BR"
}
}
],
"error_handling": {
"error_type": "missing_fields",
"should_report_error": true,
"should_retry": false,
"expected_behavior": "Agent should handle partial data and note missing source names"
},
"tool_call_results": [
{
"name": "candidate_source_source_metrics_lite",
"result": {
"requisition_id": "05958BR",
"metrics": [
{
"candidate_count": 200,
"hire_count": 3,
"sla_met_count": 108
},
{
"candidate_count": 516,
"hire_count": 11,
"sla_met_count": 54
},
{
"candidate_count": 468,
"hire_count": 10,
"sla_met_count": 320
},
{
"candidate_count": 410,
"hire_count": 0,
"sla_met_count": 272
},
{
"candidate_count": 400,
"hire_count": 5,
"sla_met_count": 281
},
{
"candidate_count": 519,
"hire_count": 7,
"sla_met_count": 370
},
{
"candidate_count": 400,
"hire_count": 4,
"sla_met_count": 266
}
],
"note": "Lightweight view — some fields may be omitted for performance."
}
}
]
}
},
{
"name": "task_37",
"description": "Tests agent handling of wrong field types in response. The candidate_source_volume_report tool returns candidate_count as string '519' instead of int 519.",
"intent": "Generate a volume report for 05958BR",
"difficulty": "medium",
"expected_output": {
"response": "Volume report for 05958BR shows candidate counts by source, with LinkedIn, Dice, and GitHub among the top contributors.",
"keywords": [
"volume|report",
"candidates|count",
"05958BR"
],
"tool_calls": [
{
"name": "candidate_source_volume_report",
"args": {
"requisition_id": "05958BR"
}
}
],
"error_handling": {
"error_type": "wrong_field_types",
"should_report_error": false,
"should_retry": false,
"expected_behavior": "Agent should handle type coercion (string to int) transparently"
},
"tool_call_results": [
{
"name": "candidate_source_volume_report",
"result": {
"requisition_id": "05958BR",
"metrics": [
{
"source_name": "CyberSec Jobs",
"candidate_count": "200",
"hire_count": "3",
"review_rate": "80.5%"
},
{
"source_name": "Dice",
"candidate_count": "516",
"hire_count": "11",
"review_rate": "11.0%"
},
{
"source_name": "GitHub",
"candidate_count": "468",
"hire_count": "10",
"review_rate": "76.1%"
},
{
"source_name": "Indeed",
"candidate_count": "410",
"hire_count": "0",
"review_rate": "77.1%"
},
{
"source_name": "Internal",
"candidate_count": "400",
"hire_count": "5",
"review_rate": "74.0%"
},
{
"source_name": "LinkedIn",
"candidate_count": "519",
"hire_count": "7",
"review_rate": "75.1%"
},
{
"source_name": "Referral",
"candidate_count": "400",
"hire_count": "4",
"review_rate": "70.0%"
}
],
"total_candidates": "2913"
}
}
]
}
}
]
}
]