Spaces:

ibm-research
/

BPO-Bench

Running

File size: 9,583 Bytes

d075a5b

[
  {
    "name": "bpo-benchmark",
    "user_info": [],
    "test_cases": [
      {
        "name": "task_34",
        "description": "Tests agent handling of an untyped, schema-less response. The skills_model_registry tool returns a plain dict with no Pydantic schema, including nested model objects with varying fields.",
        "intent": "What ML models are registered for 05958BR?",
        "difficulty": "medium",
        "expected_output": {
          "response": "The following ML models are registered for 05958BR: Skill relevance classifier (v2.1.0, active), SLA impact regression model (v1.4.2, active), and Funnel conversion recommender (v3.0.0-beta, staging).",
          "keywords": [
            "Skill relevance classifier",
            "SLA impact regression model",
            "Funnel conversion recommender",
            "active|staging"
          ],
          "tool_calls": [
            {
              "name": "skills_model_registry",
              "args": {
                "requisition_id": "05958BR"
              }
            }
          ],
          "error_handling": {
            "error_type": "missing_output_schema",
            "should_report_error": false,
            "should_retry": false,
            "expected_behavior": "Agent should infer structure from the untyped response and present model info"
          },
          "tool_call_results": [
            {
              "name": "skills_model_registry",
              "result": {
                "requisition_id": "05958BR",
                "models": [
                  {
                    "name": "Skill relevance classifier",
                    "version": "2.1.0",
                    "status": "active",
                    "last_trained": "2024-11-15",
                    "accuracy": 0.87
                  },
                  {
                    "name": "SLA impact regression model",
                    "version": "1.4.2",
                    "status": "active",
                    "last_trained": "2024-10-01",
                    "r_squared": 0.72
                  },
                  {
                    "name": "Funnel conversion recommender",
                    "version": "3.0.0-beta",
                    "status": "staging",
                    "last_trained": "2025-01-20",
                    "precision": 0.81
                  }
                ],
                "registry_updated": "2025-04-29"
              }
            }
          ]
        }
      },
      {
        "name": "task_35",
        "description": "Tests agent handling of undocumented/extra input parameters. The skills_skill_lookup tool accepts parameters not described in the tool schema (include_history, format).",
        "intent": "Look up the skill Python for requisition 05958BR",
        "difficulty": "medium",
        "expected_output": {
          "response": "Python for requisition 05958BR has an occurrence count across similar candidates, showing its prevalence in the candidate pool.",
          "keywords": [
            "Python",
            "05958BR",
            "occurrence|count|rate"
          ],
          "tool_calls": [
            {
              "name": "skills_skill_lookup",
              "args": {
                "requisition_id": "05958BR",
                "skill_name": "Python"
              }
            }
          ],
          "error_handling": {
            "error_type": "missing_input_schema",
            "should_report_error": false,
            "should_retry": false,
            "expected_behavior": "Agent should infer required parameters and call the tool correctly"
          },
          "tool_call_results": [
            {
              "name": "skills_skill_lookup",
              "result": {
                "requisition_id": "05958BR",
                "skill_name": "Python",
                "occurrence_count": 200,
                "total_candidates": 2913,
                "occurrence_rate": 6.9
              }
            }
          ]
        }
      },
      {
        "name": "task_36",
        "description": "Tests agent handling of response with missing required fields. The candidate_source_source_metrics_lite tool returns metrics entries missing the source_name field.",
        "intent": "Get a lightweight summary of source metrics for 05958BR",
        "difficulty": "hard",
        "expected_output": {
          "response": "Source metrics for 05958BR show candidate counts and hire counts per source. Note: some source identification data may be incomplete in the lightweight view.",
          "keywords": [
            "metrics|source",
            "candidate|hire",
            "05958BR"
          ],
          "tool_calls": [
            {
              "name": "candidate_source_source_metrics_lite",
              "args": {
                "requisition_id": "05958BR"
              }
            }
          ],
          "error_handling": {
            "error_type": "missing_fields",
            "should_report_error": true,
            "should_retry": false,
            "expected_behavior": "Agent should handle partial data and note missing source names"
          },
          "tool_call_results": [
            {
              "name": "candidate_source_source_metrics_lite",
              "result": {
                "requisition_id": "05958BR",
                "metrics": [
                  {
                    "candidate_count": 200,
                    "hire_count": 3,
                    "sla_met_count": 108
                  },
                  {
                    "candidate_count": 516,
                    "hire_count": 11,
                    "sla_met_count": 54
                  },
                  {
                    "candidate_count": 468,
                    "hire_count": 10,
                    "sla_met_count": 320
                  },
                  {
                    "candidate_count": 410,
                    "hire_count": 0,
                    "sla_met_count": 272
                  },
                  {
                    "candidate_count": 400,
                    "hire_count": 5,
                    "sla_met_count": 281
                  },
                  {
                    "candidate_count": 519,
                    "hire_count": 7,
                    "sla_met_count": 370
                  },
                  {
                    "candidate_count": 400,
                    "hire_count": 4,
                    "sla_met_count": 266
                  }
                ],
                "note": "Lightweight view — some fields may be omitted for performance."
              }
            }
          ]
        }
      },
      {
        "name": "task_37",
        "description": "Tests agent handling of wrong field types in response. The candidate_source_volume_report tool returns numeric fields as strings (e.g., candidate_count as string '519' instead of int 519; hire_count and total_candidates are likewise strings).",
        "intent": "Generate a volume report for 05958BR",
        "difficulty": "medium",
        "expected_output": {
          "response": "Volume report for 05958BR shows candidate counts by source, with LinkedIn, Dice, and GitHub among the top contributors.",
          "keywords": [
            "volume|report",
            "candidates|count",
            "05958BR"
          ],
          "tool_calls": [
            {
              "name": "candidate_source_volume_report",
              "args": {
                "requisition_id": "05958BR"
              }
            }
          ],
          "error_handling": {
            "error_type": "wrong_field_types",
            "should_report_error": false,
            "should_retry": false,
            "expected_behavior": "Agent should handle type coercion (string to int) transparently"
          },
          "tool_call_results": [
            {
              "name": "candidate_source_volume_report",
              "result": {
                "requisition_id": "05958BR",
                "metrics": [
                  {
                    "source_name": "CyberSec Jobs",
                    "candidate_count": "200",
                    "hire_count": "3",
                    "review_rate": "80.5%"
                  },
                  {
                    "source_name": "Dice",
                    "candidate_count": "516",
                    "hire_count": "11",
                    "review_rate": "11.0%"
                  },
                  {
                    "source_name": "GitHub",
                    "candidate_count": "468",
                    "hire_count": "10",
                    "review_rate": "76.1%"
                  },
                  {
                    "source_name": "Indeed",
                    "candidate_count": "410",
                    "hire_count": "0",
                    "review_rate": "77.1%"
                  },
                  {
                    "source_name": "Internal",
                    "candidate_count": "400",
                    "hire_count": "5",
                    "review_rate": "74.0%"
                  },
                  {
                    "source_name": "LinkedIn",
                    "candidate_count": "519",
                    "hire_count": "7",
                    "review_rate": "75.1%"
                  },
                  {
                    "source_name": "Referral",
                    "candidate_count": "400",
                    "hire_count": "4",
                    "review_rate": "70.0%"
                  }
                ],
                "total_candidates": "2913"
              }
            }
          ]
        }
      }
    ]
  }
]