"""Pocket Automator benchmark utilities — metrics, confusion matrix, failure reports."""

from __future__ import annotations

import json
from collections import defaultdict
from pathlib import Path
from typing import Any

try:
    from src.skill_utils import (
        normalize_param,
        normalize_skill,
        parameters_match,
    )
except ImportError:
    from skill_utils import (
        normalize_param,
        normalize_skill,
        parameters_match,
    )

PROJECT_ROOT = Path(__file__).resolve().parent.parent
DATA_DIR = PROJECT_ROOT / "data"
BENCHMARK_PROMPTS_PATH = DATA_DIR / "pocket_benchmark_prompts.json"
BENCHMARK_RESULTS_PATH = DATA_DIR / "pocket_benchmark_results.json"
BENCHMARK_REPORT_PATH = DATA_DIR / "pocket_benchmark_report.txt"

# Skills covered by the benchmark; also the default row/column order of the
# confusion matrix.
BENCHMARK_SKILLS: tuple[str, ...] = (
    "create_alarm",
    "whatsapp_send_message",
    "spotify_pause",
    "spotify_play_playlist",
    "spotify_search_play",
    "uber_request_ride",
    "gmail_send_email",
    "calendar_create_event",
    "slack_open_channel",
)

# Maps each benchmark skill to the domain it is grouped under.
DOMAIN_BY_SKILL: dict[str, str] = {
    "create_alarm": "alarms",
    "whatsapp_send_message": "whatsapp",
    "spotify_pause": "spotify",
    "spotify_play_playlist": "spotify",
    "spotify_search_play": "spotify",
    "uber_request_ride": "uber",
    "gmail_send_email": "gmail",
    "calendar_create_event": "calendar",
    "slack_open_channel": "slack",
}

# Confusion-matrix column that collects predictions which are missing or whose
# skill is not one of the matrix columns.
_UNMATCHED_LABEL = ""

# Minimum widths used when rendering the confusion matrix as text.
_MIN_ROW_LABEL_WIDTH = 28
_MIN_COLUMN_WIDTH = 18


def load_benchmark_prompts(path: Path | None = None) -> list[dict[str, Any]]:
    """Load the benchmark prompt cases from a JSON file.

    Args:
        path: File to read; defaults to ``BENCHMARK_PROMPTS_PATH``.

    Returns:
        The decoded JSON document (annotated as a list of case dicts).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    benchmark_path = path or BENCHMARK_PROMPTS_PATH
    with benchmark_path.open(encoding="utf-8") as handle:
        return json.load(handle)


def canonical_intent(skill: str, parameters: dict | None = None) -> str:
    """Serialize a skill and its parameters into a compact, key-sorted JSON string.

    Every parameter value is converted with ``str`` and passed through
    ``normalize_param`` before serialization.

    Args:
        skill: Skill name, emitted unchanged.
        parameters: Parameter mapping; ``None`` is treated as empty.

    Returns:
        A JSON string with no whitespace between tokens and sorted keys.
    """
    # ``sort_keys=True`` already fixes the key order, so no pre-sorting is needed.
    normalized = {
        key: normalize_param(str(value))
        for key, value in (parameters or {}).items()
    }
    payload = {"skill": skill, "parameters": normalized}
    return json.dumps(payload, separators=(",", ":"), sort_keys=True)


def _resolve_skill(predicted: dict | None) -> Any:
    """Return the predicted skill, preferring its normalized form.

    Falls back to the raw ``"skill"`` value when ``normalize_skill`` returns a
    falsy result. Returns ``None`` when there is no prediction at all.
    """
    if not predicted:
        return None
    raw_skill = predicted.get("skill")
    return normalize_skill(raw_skill) or raw_skill


def exact_intent_match(
    predicted: dict | None,
    expected_skill: str,
    expected_parameters: dict | None = None,
) -> bool:
    """Check whether a prediction matches the expected intent exactly.

    The match requires the same skill, the same set of parameter names, and
    parameter values accepted by ``parameters_match``.

    Args:
        predicted: Parsed prediction, or ``None``/empty when nothing was parsed.
        expected_skill: Skill the prediction must resolve to.
        expected_parameters: Expected parameters; ``None`` is treated as empty.

    Returns:
        ``True`` only when skill, parameter names and parameter values all match.
    """
    if not predicted:
        return False
    if _resolve_skill(predicted) != expected_skill:
        return False

    expected_params = expected_parameters or {}
    predicted_params = predicted.get("parameters") or {}
    # Key views compare like sets, so extra or missing parameter names fail here.
    if predicted_params.keys() != expected_params.keys():
        return False
    return parameters_match(predicted_params, expected_params)


def evaluate_prediction(
    predicted: dict | None,
    expected: dict[str, Any],
) -> dict[str, bool]:
    """Score one prediction against its expected intent.

    Args:
        predicted: Parsed prediction, or ``None`` when nothing was parsed.
        expected: Expected intent with a ``"skill"`` key and optional
            ``"parameters"``.

    Returns:
        A dict with ``skill_correct``, ``parameter_correct`` and
        ``exact_json_match`` flags. ``parameter_correct`` is only ever true
        when the skill is also correct.
    """
    expected_skill = expected["skill"]
    # A null "parameters" entry is treated as empty on both sides, matching
    # what ``exact_intent_match`` does.
    expected_params = expected.get("parameters") or {}

    skill_correct = (
        predicted is not None and _resolve_skill(predicted) == expected_skill
    )
    parameter_correct = skill_correct and bool(
        parameters_match(predicted.get("parameters") or {}, expected_params)
    )
    return {
        "skill_correct": skill_correct,
        "parameter_correct": parameter_correct,
        "exact_json_match": exact_intent_match(
            predicted, expected_skill, expected_params
        ),
    }


def build_confusion_matrix(
    results: list[dict[str, Any]],
    skills: tuple[str, ...] | None = None,
) -> dict[str, dict[str, int]]:
    """Count predicted skills per expected skill.

    Args:
        results: Result records carrying ``"expected"`` and optionally
            ``"predicted"``.
        skills: Skills used as rows and columns; defaults to
            ``BENCHMARK_SKILLS``.

    Returns:
        ``matrix[expected][predicted]`` counts. Missing predictions and
        predicted skills outside the column set are counted under the
        empty-string label. An expected skill that is not in ``skills`` gets
        its own row instead of raising ``KeyError``.
    """
    columns = [*(skills or BENCHMARK_SKILLS), _UNMATCHED_LABEL]
    matrix: dict[str, dict[str, int]] = {
        expected: dict.fromkeys(columns, 0) for expected in columns[:-1]
    }

    for result in results:
        expected_skill = result["expected"]["skill"]
        # Results for skills outside the configured list still get counted.
        row = matrix.setdefault(expected_skill, dict.fromkeys(columns, 0))

        predicted_skill = _resolve_skill(result.get("predicted"))
        if predicted_skill not in row:
            predicted_skill = _UNMATCHED_LABEL
        row[predicted_skill] += 1

    return matrix


def summarize_metrics(results: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate per-result flags into counts and accuracy rates.

    Args:
        results: Result records carrying the three boolean check flags.

    Returns:
        A dict with ``total``, the three correct-counts, and the three rates.
        All rates are ``0.0`` when ``results`` is empty.
    """
    total = len(results)
    counts = {
        flag: sum(1 for result in results if result[flag])
        for flag in ("skill_correct", "parameter_correct", "exact_json_match")
    }

    def rate(count: int) -> float:
        # Guard the empty benchmark so the summary never divides by zero.
        return count / total if total else 0.0

    return {
        "total": total,
        **counts,
        "skill_accuracy": rate(counts["skill_correct"]),
        "parameter_accuracy": rate(counts["parameter_correct"]),
        "exact_json_match_rate": rate(counts["exact_json_match"]),
    }


def _classify_failure(result: dict[str, Any]) -> str:
    """Name the first check a failed result did not pass."""
    if not result["skill_correct"]:
        return "skill"
    if not result["parameter_correct"]:
        return "parameters"
    # Skill and parameter values matched, so only the exact-match check failed.
    return "json_format"


def build_failure_report(results: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Collect every result that is not an exact match, with its failure type.

    Args:
        results: Result records as produced by ``record_result``.

    Returns:
        One dict per failing result, in input order, each with a
        single-element ``failure_types`` list (``"skill"``, ``"parameters"``
        or ``"json_format"``).
    """
    return [
        {
            "id": result.get("id"),
            "prompt": result["prompt"],
            "domain": result.get("domain"),
            "styles": result.get("styles", []),
            "expected": result["expected"],
            "predicted": result.get("predicted"),
            "raw_output": result.get("raw_output"),
            "failure_types": [_classify_failure(result)],
        }
        for result in results
        if not result["exact_json_match"]
    ]


def format_confusion_matrix(matrix: dict[str, dict[str, int]]) -> str:
    """Render a confusion matrix as an aligned plain-text table.

    Column labels are taken from the first row. Widths grow with the longest
    label so that long skill names neither run together nor drift away from
    their counts.

    Args:
        matrix: ``matrix[expected][predicted]`` counts.

    Returns:
        The table as text. An empty matrix yields just the header and rule.
    """
    corner_label = "expected \\ predicted"
    first_row = next(iter(matrix.values()), {})
    predicted_labels = list(first_row)

    row_width = max(
        _MIN_ROW_LABEL_WIDTH,
        len(corner_label) + 2,
        *(len(str(expected)) + 2 for expected in matrix),
    )
    column_width = max(
        _MIN_COLUMN_WIDTH,
        *(len(str(label)) + 2 for label in predicted_labels),
    )

    header = f"{corner_label:<{row_width}}" + "".join(
        f"{label:>{column_width}}" for label in predicted_labels
    )
    lines = [header, "-" * len(header)]
    for expected, counts in matrix.items():
        cells = "".join(
            f"{counts.get(label, 0):>{column_width}}" for label in predicted_labels
        )
        lines.append(f"{expected:<{row_width}}" + cells)
    return "\n".join(lines)


def _format_failure(index: int, failure: dict[str, Any]) -> list[str]:
    """Render one failure entry as report lines, starting with a blank line."""

    def or_placeholder(value: Any, placeholder: str) -> Any:
        # Failure records always carry these keys, so a ``.get`` default would
        # never apply; absence shows up as ``None`` instead.
        return placeholder if value is None else value

    predicted = failure.get("predicted")
    if predicted:
        predicted_text = json.dumps(predicted, separators=(",", ":"))
    else:
        # Nothing parsed: show the raw model output instead.
        predicted_text = or_placeholder(failure.get("raw_output"), "null")

    return [
        "",
        f"[{index}] {or_placeholder(failure.get('id'), 'n/a')} — "
        f"{', '.join(failure['failure_types'])}",
        f"Prompt: {failure['prompt']}",
        f"Domain: {or_placeholder(failure.get('domain'), 'n/a')}",
        f"Styles: {', '.join(failure.get('styles') or [])}",
        f"Expected: {json.dumps(failure['expected'], separators=(',', ':'))}",
        f"Predicted: {predicted_text}",
    ]


def _format_failure_breakdown(failures: list[dict[str, Any]]) -> list[str]:
    """Render per-style and per-domain failure counts, or nothing if there are none."""
    style_failures: dict[str, int] = defaultdict(int)
    domain_failures: dict[str, int] = defaultdict(int)
    for failure in failures:
        for style in failure.get("styles") or []:
            style_failures[style] += 1
        domain = failure.get("domain")
        if domain:
            domain_failures[domain] += 1

    if not (style_failures or domain_failures):
        return []

    lines = ["", "Failure breakdown", "-----------------"]
    if style_failures:
        lines.append("By style:")
        lines.extend(
            f" {style}: {count}" for style, count in sorted(style_failures.items())
        )
    if domain_failures:
        lines.append("By domain:")
        lines.extend(
            f" {domain}: {count}" for domain, count in sorted(domain_failures.items())
        )
    return lines


def format_benchmark_report(
    metrics: dict[str, Any],
    matrix: dict[str, dict[str, int]],
    failures: list[dict[str, Any]],
    *,
    title: str = "Pocket Automator Benchmark Report",
) -> str:
    """Build the human-readable benchmark report.

    Args:
        metrics: Output of ``summarize_metrics``.
        matrix: Output of ``build_confusion_matrix``.
        failures: Output of ``build_failure_report``.
        title: Heading printed on the first line.

    Returns:
        The report text, ending with a newline. A missing id or domain is
        shown as ``n/a`` and a missing raw output as ``null``.
    """
    total = metrics["total"]
    lines = [
        title,
        "=" * len(title),
        "",
        "Metrics",
        "-------",
        f"Total prompts: {total}",
        f"Skill accuracy: {metrics['skill_correct']}/{total} "
        f"({metrics['skill_accuracy']:.1%})",
        f"Parameter accuracy: {metrics['parameter_correct']}/{total} "
        f"({metrics['parameter_accuracy']:.1%})",
        f"Exact JSON match: {metrics['exact_json_match']}/{total} "
        f"({metrics['exact_json_match_rate']:.1%})",
        "",
        "Confusion Matrix (rows=expected, cols=predicted)",
        "------------------------------------------------",
        format_confusion_matrix(matrix),
        "",
        f"Failure Report ({len(failures)} failures)",
        "--------------",
    ]

    if not failures:
        lines.append("No failures — perfect score.")
    else:
        for index, failure in enumerate(failures, start=1):
            lines.extend(_format_failure(index, failure))
        lines.extend(_format_failure_breakdown(failures))

    return "\n".join(lines) + "\n"


def record_result(
    case: dict[str, Any],
    raw_output: str,
    predicted: dict | None,
) -> dict[str, Any]:
    """Combine a benchmark case, the model output and its scores into one record.

    Args:
        case: Benchmark case with ``"prompt"`` and ``"expected"`` plus optional
            ``"id"``, ``"domain"`` and ``"styles"``.
        raw_output: Unparsed model output.
        predicted: Parsed prediction, or ``None`` when parsing failed.

    Returns:
        The case fields, ``predicted``, ``raw_output`` and the three flags
        from ``evaluate_prediction``.
    """
    expected = case["expected"]
    return {
        "id": case.get("id"),
        "prompt": case["prompt"],
        "domain": case.get("domain"),
        "styles": case.get("styles", []),
        "expected": expected,
        "predicted": predicted,
        "raw_output": raw_output,
        **evaluate_prediction(predicted, expected),
    }


def save_benchmark_outputs(
    results: list[dict[str, Any]],
    *,
    results_path: Path | None = None,
    report_path: Path | None = None,
) -> tuple[dict[str, Any], str]:
    """Compute all benchmark outputs and write them to disk.

    Writes a JSON file (metrics, confusion matrix, failures, raw results) and
    a plain-text report. Missing parent directories are created first.

    Args:
        results: Result records as produced by ``record_result``.
        results_path: JSON destination; defaults to ``BENCHMARK_RESULTS_PATH``.
        report_path: Report destination; defaults to ``BENCHMARK_REPORT_PATH``.

    Returns:
        The metrics dict and the report text.

    Raises:
        OSError: If a directory cannot be created or a file cannot be written.
    """
    metrics = summarize_metrics(results)
    matrix = build_confusion_matrix(results)
    failures = build_failure_report(results)
    report = format_benchmark_report(metrics, matrix, failures)

    payload = {
        "metrics": metrics,
        "confusion_matrix": matrix,
        "failures": failures,
        "results": results,
    }
    outputs = (
        (results_path or BENCHMARK_RESULTS_PATH, json.dumps(payload, indent=2)),
        (report_path or BENCHMARK_REPORT_PATH, report),
    )
    for destination, text in outputs:
        # A fresh checkout may not have the data directory yet.
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(text, encoding="utf-8")

    return metrics, report