Spaces:

ibm-research
/

BPO-Bench

Running

App Files Community

BPO-Bench / data /tasks_schema_violations.json

haroldshipibm

Upload folder using huggingface_hub

d075a5b verified 7 days ago

raw

history blame contribute delete

9.58 kB

	[
	{
	"name": "bpo-benchmark",
	"user_info": [],
	"test_cases": [
	{
	"name": "task_34",
	"description": "Tests agent handling of untyped/unschema'd response. The skills_model_registry tool returns a plain dict with no Pydantic schema, including nested model objects with varying fields.",
	"intent": "What ML models are registered for 05958BR?",
	"difficulty": "medium",
	"expected_output": {
	"response": "The following ML models are registered for 05958BR: Skill relevance classifier (v2.1.0, active), SLA impact regression model (v1.4.2, active), and Funnel conversion recommender (v3.0.0-beta, staging).",
	"keywords": [
	"Skill relevance classifier",
	"SLA impact regression model",
	"Funnel conversion recommender",
	"active\|staging"
	],
	"tool_calls": [
	{
	"name": "skills_model_registry",
	"args": {
	"requisition_id": "05958BR"
	}
	}
	],
	"error_handling": {
	"error_type": "missing_output_schema",
	"should_report_error": false,
	"should_retry": false,
	"expected_behavior": "Agent should infer structure from the untyped response and present model info"
	},
	"tool_call_results": [
	{
	"name": "skills_model_registry",
	"result": {
	"requisition_id": "05958BR",
	"models": [
	{
	"name": "Skill relevance classifier",
	"version": "2.1.0",
	"status": "active",
	"last_trained": "2024-11-15",
	"accuracy": 0.87
	},
	{
	"name": "SLA impact regression model",
	"version": "1.4.2",
	"status": "active",
	"last_trained": "2024-10-01",
	"r_squared": 0.72
	},
	{
	"name": "Funnel conversion recommender",
	"version": "3.0.0-beta",
	"status": "staging",
	"last_trained": "2025-01-20",
	"precision": 0.81
	}
	],
	"registry_updated": "2025-04-29"
	}
	}
	]
	}
	},
	{
	"name": "task_35",
	"description": "Tests agent handling of undocumented/extra input parameters. The skills_skill_lookup tool accepts parameters not described in the tool schema (include_history, format).",
	"intent": "Look up the skill Python for requisition 05958BR",
	"difficulty": "medium",
	"expected_output": {
	"response": "Python for requisition 05958BR has an occurrence count across similar candidates, showing its prevalence in the candidate pool.",
	"keywords": [
	"Python",
	"05958BR",
	"occurrence\|count\|rate"
	],
	"tool_calls": [
	{
	"name": "skills_skill_lookup",
	"args": {
	"requisition_id": "05958BR",
	"skill_name": "Python"
	}
	}
	],
	"error_handling": {
	"error_type": "missing_input_schema",
	"should_report_error": false,
	"should_retry": false,
	"expected_behavior": "Agent should infer required parameters and call the tool correctly"
	},
	"tool_call_results": [
	{
	"name": "skills_skill_lookup",
	"result": {
	"requisition_id": "05958BR",
	"skill_name": "Python",
	"occurrence_count": 200,
	"total_candidates": 2913,
	"occurrence_rate": 6.9
	}
	}
	]
	}
	},
	{
	"name": "task_36",
	"description": "Tests agent handling of response with missing required fields. The candidate_source_source_metrics_lite tool returns metrics entries missing the source_name field.",
	"intent": "Get a lightweight summary of source metrics for 05958BR",
	"difficulty": "hard",
	"expected_output": {
	"response": "Source metrics for 05958BR show candidate counts and hire counts per source. Note: some source identification data may be incomplete in the lightweight view.",
	"keywords": [
	"metrics\|source",
	"candidate\|hire",
	"05958BR"
	],
	"tool_calls": [
	{
	"name": "candidate_source_source_metrics_lite",
	"args": {
	"requisition_id": "05958BR"
	}
	}
	],
	"error_handling": {
	"error_type": "missing_fields",
	"should_report_error": true,
	"should_retry": false,
	"expected_behavior": "Agent should handle partial data and note missing source names"
	},
	"tool_call_results": [
	{
	"name": "candidate_source_source_metrics_lite",
	"result": {
	"requisition_id": "05958BR",
	"metrics": [
	{
	"candidate_count": 200,
	"hire_count": 3,
	"sla_met_count": 108
	},
	{
	"candidate_count": 516,
	"hire_count": 11,
	"sla_met_count": 54
	},
	{
	"candidate_count": 468,
	"hire_count": 10,
	"sla_met_count": 320
	},
	{
	"candidate_count": 410,
	"hire_count": 0,
	"sla_met_count": 272
	},
	{
	"candidate_count": 400,
	"hire_count": 5,
	"sla_met_count": 281
	},
	{
	"candidate_count": 519,
	"hire_count": 7,
	"sla_met_count": 370
	},
	{
	"candidate_count": 400,
	"hire_count": 4,
	"sla_met_count": 266
	}
	],
	"note": "Lightweight view — some fields may be omitted for performance."
	}
	}
	]
	}
	},
	{
	"name": "task_37",
	"description": "Tests agent handling of wrong field types in response. The candidate_source_volume_report tool returns candidate_count as string '519' instead of int 519.",
	"intent": "Generate a volume report for 05958BR",
	"difficulty": "medium",
	"expected_output": {
	"response": "Volume report for 05958BR shows candidate counts by source, with LinkedIn, Dice, and GitHub among the top contributors.",
	"keywords": [
	"volume\|report",
	"candidates\|count",
	"05958BR"
	],
	"tool_calls": [
	{
	"name": "candidate_source_volume_report",
	"args": {
	"requisition_id": "05958BR"
	}
	}
	],
	"error_handling": {
	"error_type": "wrong_field_types",
	"should_report_error": false,
	"should_retry": false,
	"expected_behavior": "Agent should handle type coercion (string to int) transparently"
	},
	"tool_call_results": [
	{
	"name": "candidate_source_volume_report",
	"result": {
	"requisition_id": "05958BR",
	"metrics": [
	{
	"source_name": "CyberSec Jobs",
	"candidate_count": "200",
	"hire_count": "3",
	"review_rate": "80.5%"
	},
	{
	"source_name": "Dice",
	"candidate_count": "516",
	"hire_count": "11",
	"review_rate": "11.0%"
	},
	{
	"source_name": "GitHub",
	"candidate_count": "468",
	"hire_count": "10",
	"review_rate": "76.1%"
	},
	{
	"source_name": "Indeed",
	"candidate_count": "410",
	"hire_count": "0",
	"review_rate": "77.1%"
	},
	{
	"source_name": "Internal",
	"candidate_count": "400",
	"hire_count": "5",
	"review_rate": "74.0%"
	},
	{
	"source_name": "LinkedIn",
	"candidate_count": "519",
	"hire_count": "7",
	"review_rate": "75.1%"
	},
	{
	"source_name": "Referral",
	"candidate_count": "400",
	"hire_count": "4",
	"review_rate": "70.0%"
	}
	],
	"total_candidates": "2913"
	}
	}
	]
	}
	}
	]
	}
	]