Spaces:
Running
Running
| [ | |
| { | |
| "name": "bpo-benchmark", | |
| "user_info": [], | |
| "test_cases": [ | |
| { | |
| "name": "task_1", | |
| "description": "Lists sources ranked by SLA success rate. | Explanation: CyberSec Jobs was identified as the lowest-performing source because its SLA success rate is 67 %, well below Indeed (86 %), GitHub (90 %), Dice (95 %), and the other sources returned by the API.", | |
| "intent": "For requisition 05958BR, which source has the lowest SLA performance?", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "CyberSec Jobs with 67%", | |
| "keywords": [ | |
| "CyberSec Jobs", | |
| "67%|67 %|67" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "candidate_source_sla_per_source", | |
| "args": {} | |
| } | |
| ], | |
| "tool_call_results": [ | |
| { | |
| "name": "candidate_source_sla_per_source", | |
| "result": { | |
| "metrics": [ | |
| { | |
| "source_name": "CyberSec Jobs", | |
| "sla_percentage": 67 | |
| }, | |
| { | |
| "source_name": "Indeed", | |
| "sla_percentage": 86 | |
| }, | |
| { | |
| "source_name": "GitHub", | |
| "sla_percentage": 90 | |
| }, | |
| { | |
| "source_name": "Dice", | |
| "sla_percentage": 95 | |
| }, | |
| { | |
| "source_name": "Internal", | |
| "sla_percentage": 95 | |
| }, | |
| { | |
| "source_name": "LinkedIn", | |
| "sla_percentage": 95 | |
| }, | |
| { | |
| "source_name": "Referral", | |
| "sla_percentage": 95 | |
| } | |
| ] | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "task_2", | |
| "description": "Asks for the missing requisition id. | Explanation: The query lacks a requisition ID which is required for the API call.", | |
| "intent": "What's the percentage of hires and the total hires per source?", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "Can you share the requisition ID (e.g., 05958BR or 05959BR)?", | |
| "keywords": [ | |
| "requisition|req", | |
| "ID|id|identifier", | |
| "missing|without|share|provide|required" | |
| ], | |
| "tool_calls": [] | |
| } | |
| }, | |
| { | |
| "name": "task_3", | |
| "description": "Shows each source's candidate volume and offer/hire success metrics for jobs similar to 05958BR. | Explanation: Candidate counts and percentages were taken from the candidate-volume API; hire counts and offer-acceptance rates were taken from the recommendation-summary API. The two tables were joined on \"source_name\", producing a combined view of volume and effectiveness for the three leading sources. | Note: Cross-references performance and volume per source. Requires joining APIs on 'source_name'.", | |
| "intent": "For requisitions like 05958BR, which sources provided the most candidates, and how effective were they at converting to hires?", | |
| "difficulty": "medium", | |
| "expected_output": { | |
| "response": "LinkedIn: 519 candidates (18%), 7 hires. Offer acceptance rate: 70%. Dice: 516 candidates (18%), 11 hires. Offer acceptance rate: 79%. GitHub: 468 candidates (16%), 10 hires. Offer acceptance rate: 77%.", | |
| "keywords": [ | |
| "LinkedIn", | |
| "Dice", | |
| "GitHub", | |
| "Offer acceptance rate", | |
| "519", | |
| "516", | |
| "468", | |
| "18%|18 %|18", | |
| "70%|70 %|70", | |
| "79%|79 %|79", | |
| "77%|77 %|77", | |
| "hires" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "candidate_source_candidate_volume_by_source", | |
| "args": {} | |
| }, | |
| { | |
| "name": "candidate_source_source_recommendation_summary", | |
| "args": {} | |
| } | |
| ], | |
| "tool_call_results": [ | |
| { | |
| "name": "candidate_source_candidate_volume_by_source", | |
| "result": { | |
| "job_id": "05958BR", | |
| "total_candidate_volume": 2913, | |
| "metrics": [ | |
| { | |
| "source_name": "LinkedIn", | |
| "candidate_volume": 519, | |
| "percentage": 18 | |
| }, | |
| { | |
| "source_name": "Dice", | |
| "candidate_volume": 516, | |
| "percentage": 18 | |
| }, | |
| { | |
| "source_name": "GitHub", | |
| "candidate_volume": 468, | |
| "percentage": 16 | |
| }, | |
| { | |
| "source_name": "Indeed", | |
| "candidate_volume": 410, | |
| "percentage": 14 | |
| }, | |
| { | |
| "source_name": "Internal", | |
| "candidate_volume": 400, | |
| "percentage": 14 | |
| }, | |
| { | |
| "source_name": "Referral", | |
| "candidate_volume": 400, | |
| "percentage": 14 | |
| }, | |
| { | |
| "source_name": "CyberSec Jobs", | |
| "candidate_volume": 200, | |
| "percentage": 7 | |
| } | |
| ], | |
| "heading": "For requisitions similar to 05958BR, there were 2913 candidates over the past three years. Here's how many candidates came from each source (with percentages from the total number):" | |
| } | |
| }, | |
| { | |
| "name": "candidate_source_source_recommendation_summary", | |
| "result": { | |
| "total_requisitions": 40, | |
| "metrics": [ | |
| { | |
| "source_name": "CyberSec Jobs", | |
| "jobs_filled_percentage": 2, | |
| "first_round_review_percentage": 80, | |
| "offer_acceptance_rate": 67, | |
| "total_hires": 3 | |
| }, | |
| { | |
| "source_name": "Dice", | |
| "jobs_filled_percentage": 2, | |
| "first_round_review_percentage": 11, | |
| "offer_acceptance_rate": 79, | |
| "total_hires": 11 | |
| }, | |
| { | |
| "source_name": "GitHub", | |
| "jobs_filled_percentage": 2, | |
| "first_round_review_percentage": 76, | |
| "offer_acceptance_rate": 77, | |
| "total_hires": 10 | |
| }, | |
| { | |
| "source_name": "Indeed", | |
| "jobs_filled_percentage": 0, | |
| "first_round_review_percentage": 77, | |
| "offer_acceptance_rate": 0, | |
| "total_hires": 0 | |
| }, | |
| { | |
| "source_name": "Internal", | |
| "jobs_filled_percentage": 2, | |
| "first_round_review_percentage": 74, | |
| "offer_acceptance_rate": 70, | |
| "total_hires": 5 | |
| }, | |
| { | |
| "source_name": "LinkedIn", | |
| "jobs_filled_percentage": 2, | |
| "first_round_review_percentage": 75, | |
| "offer_acceptance_rate": 70, | |
| "total_hires": 7 | |
| }, | |
| { | |
| "source_name": "Referral", | |
| "jobs_filled_percentage": 2, | |
| "first_round_review_percentage": 70, | |
| "offer_acceptance_rate": 62, | |
| "total_hires": 4 | |
| } | |
| ] | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "task_4", | |
| "description": "Asks for the missing requisition id. | Explanation: The query lacks a requisition ID which is required for the API call.", | |
| "intent": "Did Dice provide a good funnel conversion rate?", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "Can you share the requisition ID (e.g., 05958BR or 05959BR)?", | |
| "keywords": [ | |
| "requisition|req", | |
| "ID|id|identifier", | |
| "missing|without|share|provide|required" | |
| ], | |
| "tool_calls": [] | |
| } | |
| }, | |
| { | |
| "name": "task_5", | |
| "description": "Asks for the missing requisition id. | Explanation: The query lacks a requisition ID which is required for the API call.", | |
| "intent": "Should I include the skill Python? What is its impact on SLA, fill rate, and overall relevance?", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "Can you share the requisition ID (e.g., 05958BR or 05959BR)?", | |
| "keywords": [ | |
| "requisition|req", | |
| "ID|id|identifier", | |
| "missing|without|share|provide|required" | |
| ], | |
| "tool_calls": [] | |
| } | |
| }, | |
| { | |
| "name": "task_6", | |
| "description": "Recommends top-performing sources by combining SLA success, candidate volume, and funnel effectiveness. | Explanation: Each source received a weighted score (50 % SLA success, 30 % candidate volume share, 20 % offer-conversion rate). Dice and LinkedIn tied for top SLA (100 %) and high volume; GitHub's conversion (2.8 %), the highest of the three recommended sources, offset its 80 % SLA. Indeed scored 0 on SLA and offers, so it was excluded. | Note: This benchmark tests multi-criteria decision-making and cross-API synthesis.", | |
| "intent": "What are the best sources to prioritize for 05959BR?", | |
| "difficulty": "hard", | |
| "expected_output": { | |
| "response": "You should prioritize Dice, GitHub, and LinkedIn. Dice and LinkedIn both met SLA 100% of the time and brought in 18% of all candidates. Dice had a strong offer conversion rate (2.7%), and GitHub had the highest conversion of the three (2.8%) despite slightly lower SLA. Indeed should be avoided due to 0% SLA and 0% offer conversion.", | |
| "keywords": [ | |
| "Dice", | |
| "GitHub", | |
| "LinkedIn", | |
| "SLA", | |
| "Indeed" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "candidate_source_sla_per_source", | |
| "args": {} | |
| }, | |
| { | |
| "name": "candidate_source_candidate_volume_by_source", | |
| "args": {} | |
| }, | |
| { | |
| "name": "candidate_source_funnel_conversion_by_source", | |
| "args": {} | |
| } | |
| ], | |
| "tool_call_results": [ | |
| { | |
| "name": "candidate_source_sla_per_source", | |
| "result": { | |
| "metrics": [ | |
| { | |
| "source_name": "Indeed", | |
| "sla_percentage": 0 | |
| }, | |
| { | |
| "source_name": "CyberSec Jobs", | |
| "sla_percentage": 70 | |
| }, | |
| { | |
| "source_name": "GitHub", | |
| "sla_percentage": 80 | |
| }, | |
| { | |
| "source_name": "Internal", | |
| "sla_percentage": 85 | |
| }, | |
| { | |
| "source_name": "Dice", | |
| "sla_percentage": 100 | |
| }, | |
| { | |
| "source_name": "LinkedIn", | |
| "sla_percentage": 100 | |
| }, | |
| { | |
| "source_name": "Referral", | |
| "sla_percentage": 100 | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "candidate_source_candidate_volume_by_source", | |
| "result": { | |
| "job_id": "05959BR", | |
| "total_candidate_volume": 2913, | |
| "metrics": [ | |
| { | |
| "source_name": "Dice", | |
| "candidate_volume": 525, | |
| "percentage": 18 | |
| }, | |
| { | |
| "source_name": "LinkedIn", | |
| "candidate_volume": 525, | |
| "percentage": 18 | |
| }, | |
| { | |
| "source_name": "GitHub", | |
| "candidate_volume": 465, | |
| "percentage": 16 | |
| }, | |
| { | |
| "source_name": "Internal", | |
| "candidate_volume": 403, | |
| "percentage": 14 | |
| }, | |
| { | |
| "source_name": "Indeed", | |
| "candidate_volume": 400, | |
| "percentage": 14 | |
| }, | |
| { | |
| "source_name": "Referral", | |
| "candidate_volume": 400, | |
| "percentage": 14 | |
| }, | |
| { | |
| "source_name": "CyberSec Jobs", | |
| "candidate_volume": 195, | |
| "percentage": 7 | |
| } | |
| ], | |
| "heading": "For requisitions similar to 05959BR, there were 2913 candidates over the past three years. Here's how many candidates came from each source (with percentages from the total number):" | |
| } | |
| }, | |
| { | |
| "name": "candidate_source_funnel_conversion_by_source", | |
| "result": { | |
| "job_id": "05959BR", | |
| "metrics": [ | |
| { | |
| "source_name": "CyberSec Jobs", | |
| "first_round_review_percentage": 80.5, | |
| "interview_rate": 18.5, | |
| "offer_acceptance_rate": 3.1 | |
| }, | |
| { | |
| "source_name": "Dice", | |
| "first_round_review_percentage": 76.0, | |
| "interview_rate": 9.9, | |
| "offer_acceptance_rate": 2.7 | |
| }, | |
| { | |
| "source_name": "GitHub", | |
| "first_round_review_percentage": 72.0, | |
| "interview_rate": 16.6, | |
| "offer_acceptance_rate": 2.8 | |
| }, | |
| { | |
| "source_name": "Indeed", | |
| "first_round_review_percentage": 72.2, | |
| "interview_rate": 14.8, | |
| "offer_acceptance_rate": 0.0 | |
| }, | |
| { | |
| "source_name": "Internal", | |
| "first_round_review_percentage": 76.9, | |
| "interview_rate": 19.6, | |
| "offer_acceptance_rate": 2.5 | |
| }, | |
| { | |
| "source_name": "LinkedIn", | |
| "first_round_review_percentage": 70.1, | |
| "interview_rate": 21.0, | |
| "offer_acceptance_rate": 1.9 | |
| }, | |
| { | |
| "source_name": "Referral", | |
| "first_round_review_percentage": 74.5, | |
| "interview_rate": 20.5, | |
| "offer_acceptance_rate": 2.0 | |
| } | |
| ] | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "task_7", | |
| "description": "Asks for the missing requisition id. | Explanation: The query lacks a requisition ID which is required for the API call.", | |
| "intent": "Out of these skills — Python, Quantum Physics, Cyber Engineering, Risk Analysis, Wireshark — which ones negatively affect SLA performance?", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "Can you share the requisition ID (e.g., 05958BR or 05959BR)?", | |
| "keywords": [ | |
| "requisition|req", | |
| "ID|id|identifier", | |
| "missing|without|share|provide|required" | |
| ], | |
| "tool_calls": [] | |
| } | |
| }, | |
| { | |
| "name": "task_8", | |
| "description": "Returns the definition of the SLA metric for the given requisition. | Explanation: The definitions-and-methodology endpoint contains a JSON field \"sla\" holding the textual definition; the agent extracted that string verbatim. | Note: Tests the agent's ability to locate and return a specific definition.", | |
| "intent": "How is the SLA metric defined for 05958BR?", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "SLA is defined as 'Percentage of candidates reviewed within the defined SLA window (e.g., 48 hours)'.", | |
| "keywords": [ | |
| "SLA", | |
| "Percentage", | |
| "reviewed", | |
| "window" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "candidate_source_definitions_and_methodology", | |
| "args": {} | |
| } | |
| ], | |
| "tool_call_results": [ | |
| { | |
| "name": "candidate_source_definitions_and_methodology", | |
| "result": { | |
| "job_id": "05958BR", | |
| "definitions": { | |
| "sla": "Percentage of candidates reviewed within the defined SLA window (e.g., 48 hours)", | |
| "time_to_fill": "Average time from job posting to accepted offer", | |
| "success_rate": "Ratio of candidates who accepted offers out of those interviewed" | |
| }, | |
| "calculation_notes": "Metrics are computed from 1047 requisitions over the last 1.4 years. Funnel stats are based on system timestamps and recruiter actions in ATS.", | |
| "top_metrics_considered": [ | |
| "SLA %", | |
| "First round review %", | |
| "Offer acceptance rate", | |
| "Candidate volume", | |
| "Total hires" | |
| ] | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "task_9", | |
| "description": "Returns the number of requisitions used to compute the reported metrics. | Explanation: The methodology response's \"calculation_notes\" field begins \"Metrics are computed from 1047 requisitions\"; the agent parsed the integer 1047 and returned it. | Note: Tests string parsing / information extraction from notes field.", | |
| "intent": "How many requisitions were used to compute these metrics for 05958BR?", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "Metrics are computed from 1047 requisitions.", | |
| "keywords": [ | |
| "1047", | |
| "requisitions" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "candidate_source_definitions_and_methodology", | |
| "args": {} | |
| } | |
| ], | |
| "tool_call_results": [ | |
| { | |
| "name": "candidate_source_definitions_and_methodology", | |
| "result": { | |
| "job_id": "05958BR", | |
| "definitions": { | |
| "sla": "Percentage of candidates reviewed within the defined SLA window (e.g., 48 hours)", | |
| "time_to_fill": "Average time from job posting to accepted offer", | |
| "success_rate": "Ratio of candidates who accepted offers out of those interviewed" | |
| }, | |
| "calculation_notes": "Metrics are computed from 1047 requisitions over the last 1.4 years. Funnel stats are based on system timestamps and recruiter actions in ATS.", | |
| "top_metrics_considered": [ | |
| "SLA %", | |
| "First round review %", | |
| "Offer acceptance rate", | |
| "Candidate volume", | |
| "Total hires" | |
| ] | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "task_10", | |
| "description": "Returns the list of top metrics considered for source evaluation. | Explanation: The agent read the \"top_metrics_considered\" array from the methodology API response and returned the metrics in the same order. | Note: Tests structured list extraction and formatting.", | |
| "intent": "What are the top metrics considered when evaluating candidate sources for 05958BR?", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "The top metrics considered are: SLA %, First round review %, Offer acceptance rate, Candidate volume, Total hires.", | |
| "keywords": [ | |
| "SLA", | |
| "First round review", | |
| "Offer acceptance", | |
| "Candidate volume", | |
| "Total hires" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "candidate_source_definitions_and_methodology", | |
| "args": {} | |
| } | |
| ], | |
| "tool_call_results": [ | |
| { | |
| "name": "candidate_source_definitions_and_methodology", | |
| "result": { | |
| "job_id": "05958BR", | |
| "definitions": { | |
| "sla": "Percentage of candidates reviewed within the defined SLA window (e.g., 48 hours)", | |
| "time_to_fill": "Average time from job posting to accepted offer", | |
| "success_rate": "Ratio of candidates who accepted offers out of those interviewed" | |
| }, | |
| "calculation_notes": "Metrics are computed from 1047 requisitions over the last 1.4 years. Funnel stats are based on system timestamps and recruiter actions in ATS.", | |
| "top_metrics_considered": [ | |
| "SLA %", | |
| "First round review %", | |
| "Offer acceptance rate", | |
| "Candidate volume", | |
| "Total hires" | |
| ] | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "task_11", | |
| "description": "Loops through the provided list of models and reports which ones were used. | Explanation: The agent compared each provided model name against the \"models_involved\" array returned by data-sources-used API and reported matches (used) or non-matches (not used). | Note: Tests loop-based reasoning and partial matching for list membership.", | |
| "intent": "Were the following models used to generate metrics for 05958BR: SLA impact regression model, Candidate ranking model, Skill relevance classifier?", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "Yes, 'SLA impact regression model' and 'Skill relevance classifier' were used. 'Candidate ranking model' was not listed among the models involved.", | |
| "keywords": [ | |
| "SLA impact regression model", | |
| "Skill relevance classifier", | |
| "Candidate ranking model" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "skills_data_sources_used", | |
| "args": {} | |
| } | |
| ], | |
| "tool_call_results": [ | |
| { | |
| "name": "skills_data_sources_used", | |
| "result": { | |
| "requisition_id": "05958BR", | |
| "datasets_used": [ | |
| "Historical hiring success data", | |
| "Requisition skill tagging", | |
| "Funnel conversion metrics", | |
| "Candidate quality feedback" | |
| ], | |
| "models_involved": [ | |
| "Skill relevance classifier", | |
| "SLA impact regression model", | |
| "Funnel conversion recommender" | |
| ] | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "task_12", | |
| "description": "Loops through the provided list of data sources and reports which ones were used. | Explanation: Each candidate data source was checked against the \"datasets_used\" array from data-sources-used API; two matched and one did not, which the agent reported accordingly. | Note: Tests loop-based reasoning and partial matching for list membership.", | |
| "intent": "Were the following data sources used to compute the metrics for 05958BR: Historical hiring success data, Job description embeddings, Funnel conversion metrics?", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "Yes, 'Historical hiring success data' and 'Funnel conversion metrics' were used. 'Job description embeddings' was not listed among the data sources.", | |
| "keywords": [ | |
| "Historical hiring success data", | |
| "Funnel conversion metrics", | |
| "Job description embeddings" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "skills_data_sources_used", | |
| "args": {} | |
| } | |
| ], | |
| "tool_call_results": [ | |
| { | |
| "name": "skills_data_sources_used", | |
| "result": { | |
| "requisition_id": "05958BR", | |
| "datasets_used": [ | |
| "Historical hiring success data", | |
| "Requisition skill tagging", | |
| "Funnel conversion metrics", | |
| "Candidate quality feedback" | |
| ], | |
| "models_involved": [ | |
| "Skill relevance classifier", | |
| "SLA impact regression model", | |
| "Funnel conversion recommender" | |
| ] | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "task_13", | |
| "description": "Combines model lookup, retrieves actual SLA delta, and returns SLA definition. | Explanation: The SLA-impact API showed a 0 % delta for Python; data-sources-used API confirmed the 'SLA impact regression model' was involved; the methodology API supplied the formal SLA definition. These three pieces were combined into one coherent answer. | Note: Agent must combine numerical result (delta), model lookup, and formal definition into unified answer.", | |
| "intent": "For 05958BR, when evaluating the SLA impact of Python, which models were used, what was the SLA delta, and what is the system definition of SLA?", | |
| "difficulty": "hard", | |
| "expected_output": { | |
| "response": "'SLA impact regression model' was used. The SLA delta for Python was 0.0%. SLA is defined as 'Percentage of candidates reviewed within the defined SLA window (e.g., 48 hours)'.", | |
| "keywords": [ | |
| "SLA impact regression model", | |
| "0.0%|0.0 %|0.0|0%|0 %|0", | |
| "SLA", | |
| "Percentage", | |
| "reviewed", | |
| "window" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "skills_skill_impact_sla", | |
| "args": {} | |
| }, | |
| { | |
| "name": "skills_data_sources_used", | |
| "args": {} | |
| }, | |
| { | |
| "name": "candidate_source_definitions_and_methodology", | |
| "args": {} | |
| } | |
| ], | |
| "tool_call_results": [ | |
| { | |
| "name": "skills_skill_impact_sla", | |
| "result": { | |
| "requisition_id": "05958BR", | |
| "skill_name": "Python", | |
| "sla_achievement_with_skill": 90, | |
| "sla_achievement_without_skill": 90, | |
| "delta": 0 | |
| } | |
| }, | |
| { | |
| "name": "skills_data_sources_used", | |
| "result": { | |
| "requisition_id": "05958BR", | |
| "datasets_used": [ | |
| "Historical hiring success data", | |
| "Requisition skill tagging", | |
| "Funnel conversion metrics", | |
| "Candidate quality feedback" | |
| ], | |
| "models_involved": [ | |
| "Skill relevance classifier", | |
| "SLA impact regression model", | |
| "Funnel conversion recommender" | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "candidate_source_definitions_and_methodology", | |
| "result": { | |
| "job_id": "05958BR", | |
| "definitions": { | |
| "sla": "Percentage of candidates reviewed within the defined SLA window (e.g., 48 hours)", | |
| "time_to_fill": "Average time from job posting to accepted offer", | |
| "success_rate": "Ratio of candidates who accepted offers out of those interviewed" | |
| }, | |
| "calculation_notes": "Metrics are computed from 1047 requisitions over the last 1.4 years. Funnel stats are based on system timestamps and recruiter actions in ATS.", | |
| "top_metrics_considered": [ | |
| "SLA %", | |
| "First round review %", | |
| "Offer acceptance rate", | |
| "Candidate volume", | |
| "Total hires" | |
| ] | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "task_14", | |
| "description": "States that Risk Analysis negatively affects SLA and lists the datasets that informed the analysis. | Explanation: The skill-analysis API flagged Risk Analysis as negatively correlated with SLA. The data-sources-used API listed the four datasets underpinning the evaluation, and both results were consolidated in the response. | Note: Correlation wording corrected to match API ('highly negative impact on SLA').", | |
| "intent": "Was 'Risk Analysis' considered historically effective, and what data sources informed this analysis for 05958BR?", | |
| "difficulty": "medium", | |
| "expected_output": { | |
| "response": "'Risk Analysis' is **not** considered effective: historical analysis shows it is correlated with a **highly negative impact on SLA**. The evaluation used these data sources: Historical hiring success data, Requisition skill tagging, Funnel conversion metrics, and Candidate quality feedback.", | |
| "keywords": [ | |
| "Risk Analysis", | |
| "not", | |
| "effective", | |
| "highly negative impact on SLA", | |
| "SLA", | |
| "Historical hiring success data", | |
| "Requisition skill tagging" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "skills_skill_analysis", | |
| "args": {} | |
| }, | |
| { | |
| "name": "skills_data_sources_used", | |
| "args": {} | |
| } | |
| ], | |
| "tool_call_results": [ | |
| { | |
| "name": "skills_skill_analysis", | |
| "result": { | |
| "historical_jobs": 40, | |
| "input_skills": [], | |
| "historical_skills_with_analysis": [ | |
| { | |
| "name": "AWS", | |
| "skill_occurrence": 948, | |
| "correlation": "slightly positive impact on SLA" | |
| }, | |
| { | |
| "name": "IT Support", | |
| "skill_occurrence": 868, | |
| "correlation": "slightly positive impact on SLA" | |
| }, | |
| { | |
| "name": "NIST Cybersecurity Framework", | |
| "skill_occurrence": 816, | |
| "correlation": "slightly negative impact on SLA" | |
| }, | |
| { | |
| "name": "Incident Management", | |
| "skill_occurrence": 748, | |
| "correlation": "slightly negative impact on SLA" | |
| }, | |
| { | |
| "name": "Firewalls", | |
| "skill_occurrence": 744, | |
| "correlation": "slightly negative impact on SLA" | |
| }, | |
| { | |
| "name": "Cloud Security", | |
| "skill_occurrence": 592, | |
| "correlation": "slightly negative impact on SLA" | |
| }, | |
| { | |
| "name": "Risk Assessment", | |
| "skill_occurrence": 570, | |
| "correlation": "slightly negative impact on SLA" | |
| }, | |
| { | |
| "name": "Risk Analysis", | |
| "skill_occurrence": 371, | |
| "correlation": "highly negative impact on SLA" | |
| }, | |
| { | |
| "name": "Microsoft PowerShell", | |
| "skill_occurrence": 289, | |
| "correlation": "slightly negative impact on SLA" | |
| }, | |
| { | |
| "name": "Python", | |
| "skill_occurrence": 200, | |
| "correlation": "slightly negative impact on SLA" | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "skills_data_sources_used", | |
| "result": { | |
| "requisition_id": "05958BR", | |
| "datasets_used": [ | |
| "Historical hiring success data", | |
| "Requisition skill tagging", | |
| "Funnel conversion metrics", | |
| "Candidate quality feedback" | |
| ], | |
| "models_involved": [ | |
| "Skill relevance classifier", | |
| "SLA impact regression model", | |
| "Funnel conversion recommender" | |
| ] | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "task_15", | |
| "description": "Explains that no provided skills hurt SLA, no provided sources require avoidance, and lists datasets used. | Explanation: The agent looped over each skill, querying SLA-impact and skill-analysis APIs; none yielded a negative delta. It then looped over each source, combining SLA-per-source and funnel-conversion data; all three met both SLA and offer benchmarks, so none were flagged. Finally, the datasets_used array was fetched once and reported. | Note: The offer-acceptance figures quoted in the response (79 %, 77 %, 62 %) correspond to the source-recommendation-summary API rather than the funnel-conversion payload recorded here (2.7 %, 2.8 %, 2.0 %), and the skill-analysis payload labels Python as 'slightly negative impact on SLA' even though its SLA-impact delta is 0.", | |
| "intent": "For requisition 05958BR, which of the following skills — Python, Quantum Physics, Wireshark — negatively impact SLA, and which of the following sources — Dice, GitHub, Referral — should be avoided based on low offer conversion or SLA performance? Also, which data sources were used to compute these insights?", | |
| "difficulty": "hard", | |
| "expected_output": { | |
| "response": "None of the listed skills show a negative SLA impact in the available data: Python's SLA delta is 0 %, and Quantum Physics and Wireshark are not present in the analysis. Likewise, Dice (SLA 95 %, offer-acceptance 79 %), GitHub (SLA 90 %, offer-acceptance 77 %), and Referral (SLA 95 %, offer-acceptance 62 %) all perform satisfactorily, so none of them need to be avoided. These insights were produced using the following datasets: Historical hiring success data, Requisition skill tagging, Funnel conversion metrics, and Candidate quality feedback.", | |
| "keywords": [ | |
| "Python", | |
| "Quantum Physics", | |
| "Wireshark", | |
| "Dice", | |
| "GitHub", | |
| "Referral", | |
| "SLA", | |
| "0%|0 %|0", | |
| "95%|95 %|95", | |
| "79%|79 %|79", | |
| "90%|90 %|90", | |
| "77%|77 %|77", | |
| "62%|62 %|62", | |
| "Historical hiring success data", | |
| "Requisition skill tagging", | |
| "Funnel conversion metrics", | |
| "Candidate quality feedback" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "skills_skill_impact_sla", | |
| "args": {} | |
| }, | |
| { | |
| "name": "skills_skill_analysis", | |
| "args": {} | |
| }, | |
| { | |
| "name": "skills_data_sources_used", | |
| "args": {} | |
| }, | |
| { | |
| "name": "candidate_source_funnel_conversion_by_source", | |
| "args": {} | |
| }, | |
| { | |
| "name": "candidate_source_sla_per_source", | |
| "args": {} | |
| } | |
| ], | |
| "tool_call_results": [ | |
| { | |
| "name": "skills_skill_impact_sla", | |
| "result": { | |
| "requisition_id": "05958BR", | |
| "skill_name": "Python", | |
| "sla_achievement_with_skill": 90, | |
| "sla_achievement_without_skill": 90, | |
| "delta": 0 | |
| } | |
| }, | |
| { | |
| "name": "skills_skill_analysis", | |
| "result": { | |
| "historical_jobs": 40, | |
| "input_skills": [], | |
| "historical_skills_with_analysis": [ | |
| { | |
| "name": "AWS", | |
| "skill_occurrence": 948, | |
| "correlation": "slightly positive impact on SLA" | |
| }, | |
| { | |
| "name": "IT Support", | |
| "skill_occurrence": 868, | |
| "correlation": "slightly positive impact on SLA" | |
| }, | |
| { | |
| "name": "NIST Cybersecurity Framework", | |
| "skill_occurrence": 816, | |
| "correlation": "slightly negative impact on SLA" | |
| }, | |
| { | |
| "name": "Incident Management", | |
| "skill_occurrence": 748, | |
| "correlation": "slightly negative impact on SLA" | |
| }, | |
| { | |
| "name": "Firewalls", | |
| "skill_occurrence": 744, | |
| "correlation": "slightly negative impact on SLA" | |
| }, | |
| { | |
| "name": "Cloud Security", | |
| "skill_occurrence": 592, | |
| "correlation": "slightly negative impact on SLA" | |
| }, | |
| { | |
| "name": "Risk Assessment", | |
| "skill_occurrence": 570, | |
| "correlation": "slightly negative impact on SLA" | |
| }, | |
| { | |
| "name": "Risk Analysis", | |
| "skill_occurrence": 371, | |
| "correlation": "highly negative impact on SLA" | |
| }, | |
| { | |
| "name": "Microsoft PowerShell", | |
| "skill_occurrence": 289, | |
| "correlation": "slightly negative impact on SLA" | |
| }, | |
| { | |
| "name": "Python", | |
| "skill_occurrence": 200, | |
| "correlation": "slightly negative impact on SLA" | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "skills_data_sources_used", | |
| "result": { | |
| "requisition_id": "05958BR", | |
| "datasets_used": [ | |
| "Historical hiring success data", | |
| "Requisition skill tagging", | |
| "Funnel conversion metrics", | |
| "Candidate quality feedback" | |
| ], | |
| "models_involved": [ | |
| "Skill relevance classifier", | |
| "SLA impact regression model", | |
| "Funnel conversion recommender" | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "candidate_source_funnel_conversion_by_source", | |
| "result": { | |
| "job_id": "05958BR", | |
| "metrics": [ | |
| { | |
| "source_name": "CyberSec Jobs", | |
| "first_round_review_percentage": 80.5, | |
| "interview_rate": 19.0, | |
| "offer_acceptance_rate": 3.0 | |
| }, | |
| { | |
| "source_name": "Dice", | |
| "first_round_review_percentage": 11.0, | |
| "interview_rate": 6.8, | |
| "offer_acceptance_rate": 2.7 | |
| }, | |
| { | |
| "source_name": "GitHub", | |
| "first_round_review_percentage": 76.1, | |
| "interview_rate": 23.7, | |
| "offer_acceptance_rate": 2.8 | |
| }, | |
| { | |
| "source_name": "Indeed", | |
| "first_round_review_percentage": 77.1, | |
| "interview_rate": 22.0, | |
| "offer_acceptance_rate": 0.0 | |
| }, | |
| { | |
| "source_name": "Internal", | |
| "first_round_review_percentage": 74.0, | |
| "interview_rate": 18.5, | |
| "offer_acceptance_rate": 2.5 | |
| }, | |
| { | |
| "source_name": "LinkedIn", | |
| "first_round_review_percentage": 75.1, | |
| "interview_rate": 20.4, | |
| "offer_acceptance_rate": 1.9 | |
| }, | |
| { | |
| "source_name": "Referral", | |
| "first_round_review_percentage": 70.0, | |
| "interview_rate": 17.0, | |
| "offer_acceptance_rate": 2.0 | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "candidate_source_sla_per_source", | |
| "result": { | |
| "metrics": [ | |
| { | |
| "source_name": "CyberSec Jobs", | |
| "sla_percentage": 67 | |
| }, | |
| { | |
| "source_name": "Indeed", | |
| "sla_percentage": 86 | |
| }, | |
| { | |
| "source_name": "GitHub", | |
| "sla_percentage": 90 | |
| }, | |
| { | |
| "source_name": "Dice", | |
| "sla_percentage": 95 | |
| }, | |
| { | |
| "source_name": "Internal", | |
| "sla_percentage": 95 | |
| }, | |
| { | |
| "source_name": "LinkedIn", | |
| "sla_percentage": 95 | |
| }, | |
| { | |
| "source_name": "Referral", | |
| "sla_percentage": 95 | |
| } | |
| ] | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "task_16", | |
| "description": "Apologises for missing capability; optionally provides an illustrative optimisation if the feature existed. | Explanation: No endpoint returns or accepts free-text job descriptions, so optimisation is unsupported. The hypothetical section shows what the answer could look like if such an endpoint were added. | Note: Demonstrates graceful degradation plus a forward-looking example.", | |
| "intent": "Help me optimize the job description for 05959BR.", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "Sorry — the available APIs do not expose the current job-description text, so I cannot directly optimise it.", | |
| "keywords": [ | |
| "APIs|API", | |
| "job-description|job description", | |
| "cannot|can't" | |
| ], | |
| "tool_calls": [] | |
| } | |
| }, | |
| { | |
| "name": "task_17", | |
| "description": "Prompts the user for the missing job ID instead of guessing. | Explanation: Illustrates a clarification turn when a required parameter (requisition_id) is missing. | Note: Tests conversational error-handling with zero API usage.", | |
| "intent": "Which sourcing channel is the most effective for this job?", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "Can you share the requisition ID (e.g., 05958BR or 05959BR)?", | |
| "keywords": [ | |
| "requisition|req", | |
| "ID|id|identifier", | |
| "missing|without|share|provide|required" | |
| ], | |
| "tool_calls": [] | |
| } | |
| }, | |
| { | |
| "name": "task_18", | |
| "description": "Explains unsupported request; shows what a successful answer might include. | Explanation: No endpoints expose hiring-manager data; a possible future answer is sketched for context. | Note: Covers gap #3: hiring-manager analytics.", | |
| "intent": "Who's the hiring manager for 05959BR and how responsive is she?", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "Sorry — current APIs don't return hiring-manager names or responsiveness metrics, so I can't answer that.", | |
| "keywords": [ | |
| "APIs", | |
| "hiring-manager", | |
| "responsive", | |
| "can't|cannot" | |
| ], | |
| "tool_calls": [] | |
| } | |
| }, | |
| { | |
| "name": "task_19", | |
| "description": "States limitation; optional mock table shows desired granularity. | Explanation: The existing funnel-conversion API only returns percentages by source, not absolute counts or durations per stage. | Note: Covers gap #4: full funnel metrics.", | |
| "intent": "Show me the average candidate counts and days spent in each funnel stage for roles like 05959BR.", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "I'm sorry, but none of the available APIs provide stage-by-stage candidate counts or time-in-status metrics, so I can't generate a funnel table.", | |
| "keywords": [ | |
| "APIs|API", | |
| "stage", | |
| "candidate counts", | |
| "time-in-status", | |
| "funnel" | |
| ], | |
| "tool_calls": [] | |
| } | |
| }, | |
| { | |
| "name": "task_20", | |
| "description": "Returns start/end dates, last update date, and requisition count. | Explanation: Pulled time_frame_start, time_frame_end, data_last_updated and total_requisitions_analysed from the metadata-and-timeframe endpoint. | Note: Demonstrates a fully supported recency / frequency query (gap #5).", | |
| "intent": "What's the data timeframe for 05958BR and how many similar requisitions were analysed?", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "The metrics cover 9 Oct 2023 – 15 Mar 2025 (≈ 18 months) and were last updated on 29 Apr 2025. A total of 40 similar requisitions were analysed.", | |
| "keywords": [ | |
| "9 Oct 2023|Oct 9, 2023", | |
| "15 Mar 2025|Mar 15, 2025", | |
| "29 Apr 2025|Apr 29, 2025", | |
| "40", | |
| "requisitions" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "candidate_source_metadata_and_timeframe", | |
| "args": {} | |
| } | |
| ], | |
| "tool_call_results": [ | |
| { | |
| "name": "candidate_source_metadata_and_timeframe", | |
| "result": { | |
| "job_id": "05958BR", | |
| "time_frame_start": "2023-10-09", | |
| "time_frame_end": "2025-03-15", | |
| "data_last_updated": "2025-04-29", | |
| "total_requisitions_analysed": 40 | |
| } | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "name": "task_21", | |
| "description": "Gracefully declines unsupported analysis; shows sample comparative table. | Explanation: Only the definition endpoint describes *what* time-to-fill means; no endpoint provides per-source values to compute deltas. | Note: Covers gap #6.", | |
| "intent": "How does average time-to-fill compare to SLA for each sourcing channel?", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "I'm afraid I can't provide that comparison because the current APIs don't expose time-to-fill broken down by source.", | |
| "keywords": [ | |
| "APIs", | |
| "time-to-fill", | |
| "source" | |
| ], | |
| "tool_calls": [] | |
| } | |
| }, | |
| { | |
| "name": "task_22", | |
| "description": "Explains unsupported geographic/channel filter and gives mock estimation. | Explanation: No endpoints accept country/channel parameters; the example illustrates desired behaviour. | Note: Covers gap #7: region & channel-specific analytics.", | |
| "intent": "If we posted this role internally in France only, how long would it take to fill?", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "Current APIs don't allow filtering by geography or posting channel, so I'm unable to estimate a France-only internal posting timeline.", | |
| "keywords": [ | |
| "Sorry|no|unable|can't|cannot", | |
| "APIs|API|filtering|filter", | |
| "France|geography|geographic|region|channel", | |
| "internal|posting" | |
| ], | |
| "tool_calls": [] | |
| } | |
| }, | |
| { | |
| "name": "task_23", | |
| "description": "States limitation and shows example list of near-deadline requisitions. | Explanation: No endpoint surfaces open requisitions with SLA dates. The hypothetical section demonstrates the desired list format. | Note: Covers gap #8 and introduces a potential future endpoint.", | |
| "intent": "List all requisitions that are within 30 days of their SLA deadline.", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "Sorry — the API suite doesn't provide live requisition status or SLA countdowns, so I can't generate that list.", | |
| "keywords": [ | |
| "API|APIs", | |
| "SLA", | |
| "requisition|req", | |
| "status", | |
| "countdown|countdowns", | |
| "deadline|list" | |
| ], | |
| "tool_calls": [] | |
| } | |
| }, | |
| { | |
| "name": "task_24", | |
| "description": "Graceful 'ID not found' message with suggested alternatives. | Explanation: Because 05960BR does not exist, the assistant returns a polite error plus four close-match IDs (simulating fuzzy search in the ATS). No API call is made for a bad ID. | Note: Error-handling scenario for invalid requisition IDs.", | |
| "intent": "Show candidate funnel for job id 05960BR", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "No job can be found with the ID 05960BR.\nDid you want to use one of the following job IDs instead?\n• UZLXBR — Sourcing Manager\n• F50HBR — Offering Manager\n• MJZ1BR — Offering Manager\n• 5TTKBR — Delivery Analyst", | |
| "keywords": [ | |
| "05960BR", | |
| "No job", | |
| "can be found|not found" | |
| ], | |
| "tool_calls": [] | |
| } | |
| }, | |
| { | |
| "name": "task_25", | |
| "description": "Explains unsupported request and sketches desired output. | Explanation: There is no /job-details/ endpoint. The hypothetical section illustrates what the response would look like if such an endpoint became available. | Note: Completes coverage for 'full requisition card' requests.", | |
| "intent": "Show me the details of UZLXBR", | |
| "difficulty": "easy", | |
| "expected_output": { | |
| "response": "Sorry — none of the current APIs provide full job-card details (title, location, hiring-manager email, etc.), so I can't display that information.", | |
| "keywords": [ | |
| "APIs", | |
| "job-card", | |
| "details" | |
| ], | |
| "tool_calls": [] | |
| } | |
| }, | |
| { | |
| "name": "task_26", | |
| "description": "Returns average candidate count for comparable requisitions. | Explanation: candidate-volume-by-source returns `total_candidate_volume = 2913`; metadata-and-timeframe shows `total_requisitions_analysed = 40`. Dividing 2913 ÷ 40 ≈ 73 yields the average. | Note: Covers the repeated average candidate volume questions.", | |
| "intent": "How many candidates do we usually get for postings similar to 05959BR?", | |
| "difficulty": "medium", | |
| "expected_output": { | |
| "response": "On average, similar postings attract **73 candidates**.", | |
| "keywords": [ | |
| "73", | |
| "candidates", | |
| "average" | |
| ], | |
| "tool_calls": [ | |
| { | |
| "name": "candidate_source_candidate_volume_by_source", | |
| "args": {} | |
| }, | |
| { | |
| "name": "candidate_source_metadata_and_timeframe", | |
| "args": {} | |
| } | |
| ], | |
| "tool_call_results": [ | |
| { | |
| "name": "candidate_source_candidate_volume_by_source", | |
| "result": { | |
| "job_id": "05959BR", | |
| "total_candidate_volume": 2913, | |
| "metrics": [ | |
| { | |
| "source_name": "Dice", | |
| "candidate_volume": 525, | |
| "percentage": 18 | |
| }, | |
| { | |
| "source_name": "LinkedIn", | |
| "candidate_volume": 525, | |
| "percentage": 18 | |
| }, | |
| { | |
| "source_name": "GitHub", | |
| "candidate_volume": 465, | |
| "percentage": 16 | |
| }, | |
| { | |
| "source_name": "Internal", | |
| "candidate_volume": 403, | |
| "percentage": 14 | |
| }, | |
| { | |
| "source_name": "Indeed", | |
| "candidate_volume": 400, | |
| "percentage": 14 | |
| }, | |
| { | |
| "source_name": "Referral", | |
| "candidate_volume": 400, | |
| "percentage": 14 | |
| }, | |
| { | |
| "source_name": "CyberSec Jobs", | |
| "candidate_volume": 195, | |
| "percentage": 7 | |
| } | |
| ], | |
| "heading": "For requisitions similar to 05959BR, there were 2913 candidates over the past three years. Here's how many candidates came from each source (with percentages from the total number):" | |
| } | |
| }, | |
| { | |
| "name": "candidate_source_metadata_and_timeframe", | |
| "result": { | |
| "job_id": "05959BR", | |
| "time_frame_start": "2023-10-09", | |
| "time_frame_end": "2025-03-15", | |
| "data_last_updated": "2025-04-29", | |
| "total_requisitions_analysed": 40 | |
| } | |
| } | |
| ] | |
| } | |
| } | |
| ] | |
| } | |
| ] | |