"""Keyword-based evaluation for BPO benchmark.""" from typing import List, Dict, Any def check_keywords(response: str, expected_keywords: List[str]) -> Dict[str, Any]: """ Check if response contains expected keywords (supports OR with |). Args: response: The agent's response text expected_keywords: List of keywords to check. Each keyword can contain alternatives separated by | (e.g., "67%|67 %|67") Returns: Dictionary with found/missing keywords, match rate, and pass status """ found = [] missing = [] for keyword in expected_keywords: alternatives = keyword.split("|") if any(alt.lower() in response.lower() for alt in alternatives): found.append(keyword) else: missing.append(keyword) match_rate = len(found) / len(expected_keywords) if expected_keywords else 1.0 return { "found": found, "missing": missing, "match_rate": match_rate, "passed": len(missing) == 0 } def evaluate_task(task: Dict[str, Any], response: str, tool_calls: List[Dict[str, Any]]) -> Dict[str, Any]: """ Evaluate a single task. Args: task: Task definition from tasks.json response: The agent's response text tool_calls: List of tool calls made by the agent Returns: Evaluation result dictionary """ expected_output = task.get("expected_output", {}) keywords = expected_output.get("keywords", []) result = check_keywords(response, keywords) # Extract tool names from tool calls tool_names = [] for tc in tool_calls: if isinstance(tc, dict): name = tc.get("name") or tc.get("function", {}).get("name", "") if name: tool_names.append(name) elif isinstance(tc, str): tool_names.append(tc) # Check expected tool calls expected_tools = expected_output.get("tool_calls", []) expected_tool_names = [t.get("name", "") for t in expected_tools if isinstance(t, dict)] # Calculate tool call accuracy if expected_tool_names: matched_tools = sum(1 for t in expected_tool_names if any(t in tn for tn in tool_names)) tool_accuracy = matched_tools / len(expected_tool_names) else: # No tools expected - check that none were called or that's acceptable tool_accuracy = 1.0 if not tool_names else 0.5 # Calculate API count accuracy (lenient: correct if actual >= expected) api_call_count = len(tool_names) expected_api_count = len(expected_tool_names) api_count_correct = 1 if api_call_count >= expected_api_count else 0 return { "task_id": task.get("name", "unknown"), "difficulty": task.get("difficulty", "unknown"), "intent": task.get("intent", ""), "response": response, "expected_keywords": keywords, "found_keywords": result["found"], "missing_keywords": result["missing"], "match_rate": result["match_rate"], "passed": result["passed"], "tool_calls": tool_names, "expected_tool_calls": expected_tool_names, "tool_accuracy": tool_accuracy, "api_call_count": api_call_count, "expected_api_count": expected_api_count, "api_count_correct": api_count_correct, } def calculate_summary(results: List[Dict[str, Any]]) -> Dict[str, Any]: """ Calculate summary statistics from evaluation results. Args: results: List of evaluation results from evaluate_task Returns: Summary dictionary with pass rates and averages """ if not results: return { "total_tasks": 0, "passed": 0, "pass_rate": 0.0, "avg_match_rate": 0.0, "avg_tool_accuracy": 0.0, "api_count_accuracy": 0.0, "by_difficulty": {}, } total = len(results) passed = sum(1 for r in results if r.get("passed", False)) avg_match = sum(r.get("match_rate", 0) for r in results) / total avg_tool = sum(r.get("tool_accuracy", 0) for r in results) / total api_count_correct = sum(r.get("api_count_correct", 0) for r in results) # Group by difficulty by_difficulty = {} for r in results: diff = r.get("difficulty", "unknown") if diff not in by_difficulty: by_difficulty[diff] = {"total": 0, "passed": 0} by_difficulty[diff]["total"] += 1 if r.get("passed", False): by_difficulty[diff]["passed"] += 1 for diff in by_difficulty: by_difficulty[diff]["pass_rate"] = ( by_difficulty[diff]["passed"] / by_difficulty[diff]["total"] ) return { "total_tasks": total, "passed": passed, "pass_rate": passed / total, "avg_match_rate": avg_match, "avg_tool_accuracy": avg_tool, "api_count_accuracy": api_count_correct / total, "by_difficulty": by_difficulty, }