| """Pocket Automator benchmark utilities — metrics, confusion matrix, failure reports.""" |
|
|
| from __future__ import annotations |
|
|
| import json |
| from collections import defaultdict |
| from pathlib import Path |
| from typing import Any |
|
|
| try: |
| from src.skill_utils import ( |
| normalize_param, |
| normalize_skill, |
| parameters_match, |
| ) |
| except ImportError: |
| from skill_utils import ( |
| normalize_param, |
| normalize_skill, |
| parameters_match, |
| ) |
|
|
# The project root is the parent of the directory containing this module;
# benchmark inputs and outputs are kept in its "data" folder.
_MODULE_FILE = Path(__file__).resolve()
PROJECT_ROOT = _MODULE_FILE.parent.parent
DATA_DIR = PROJECT_ROOT.joinpath("data")

# Default locations of the prompt set, the JSON results and the text report.
BENCHMARK_PROMPTS_PATH = DATA_DIR.joinpath("pocket_benchmark_prompts.json")
BENCHMARK_RESULTS_PATH = DATA_DIR.joinpath("pocket_benchmark_results.json")
BENCHMARK_REPORT_PATH = DATA_DIR.joinpath("pocket_benchmark_report.txt")
|
|
# Single source of truth: (skill, domain) pairs in benchmark order.
_SKILL_DOMAIN_PAIRS: tuple[tuple[str, str], ...] = (
    ("create_alarm", "alarms"),
    ("whatsapp_send_message", "whatsapp"),
    ("spotify_pause", "spotify"),
    ("spotify_play_playlist", "spotify"),
    ("spotify_search_play", "spotify"),
    ("uber_request_ride", "uber"),
    ("gmail_send_email", "gmail"),
    ("calendar_create_event", "calendar"),
    ("slack_open_channel", "slack"),
)

# Skill identifiers covered by the benchmark, in their canonical order.
BENCHMARK_SKILLS: tuple[str, ...] = tuple(skill for skill, _ in _SKILL_DOMAIN_PAIRS)

# Lookup from skill identifier to the domain it belongs to.
DOMAIN_BY_SKILL: dict[str, str] = dict(_SKILL_DOMAIN_PAIRS)
|
|
|
|
def load_benchmark_prompts(path: Path | None = None) -> list[dict[str, Any]]:
    """Read the benchmark prompt set from a UTF-8 JSON file.

    Args:
        path: File to read; ``BENCHMARK_PROMPTS_PATH`` is used when omitted.

    Returns:
        The decoded JSON document.
    """
    source = BENCHMARK_PROMPTS_PATH if not path else path
    return json.loads(source.read_text(encoding="utf-8"))
|
|
|
|
def canonical_intent(skill: str, parameters: dict | None = None) -> str:
    """Serialize a skill call as a compact JSON string with sorted keys.

    Every parameter value is converted with ``str`` and passed through
    ``normalize_param`` before serialization; a missing or empty parameter
    mapping is serialized as ``{}``.
    """
    cleaned: dict = {}
    for name, raw_value in sorted((parameters or {}).items()):
        cleaned[name] = normalize_param(str(raw_value))

    return json.dumps(
        {"skill": skill, "parameters": cleaned},
        sort_keys=True,
        separators=(",", ":"),
    )
|
|
|
|
def exact_intent_match(
    predicted: dict | None,
    expected_skill: str,
    expected_parameters: dict | None = None,
) -> bool:
    """Check a prediction against the expected skill and parameters.

    The prediction matches only when its skill (after ``normalize_skill``,
    falling back to the raw value) equals ``expected_skill``, its parameter
    keys are exactly the expected keys, and ``parameters_match`` accepts the
    two parameter mappings. A missing or empty prediction never matches.
    """
    if not predicted:
        return False

    raw_skill = predicted.get("skill")
    if (normalize_skill(raw_skill) or raw_skill) != expected_skill:
        return False

    wanted = expected_parameters or {}
    given = predicted.get("parameters") or {}
    # Key views compare like sets, so this rejects both extra and missing keys.
    return given.keys() == wanted.keys() and parameters_match(given, wanted)
|
|
|
|
def evaluate_prediction(
    predicted: dict | None,
    expected: dict[str, Any],
) -> dict[str, bool]:
    """Score one prediction against its expected intent.

    Args:
        predicted: Parsed model output, or ``None`` when nothing usable was
            produced. Anything that is not a non-empty dict is scored as a
            complete miss instead of raising.
        expected: Expected intent; must contain ``"skill"`` and may contain
            ``"parameters"``.

    Returns:
        A dict with three flags:
        ``skill_correct`` (the normalized predicted skill equals the expected
        one), ``parameter_correct`` (skill is correct and ``parameters_match``
        accepts the parameters) and ``exact_json_match`` (the verdict of
        ``exact_intent_match``).

    Raises:
        KeyError: If ``expected`` has no ``"skill"`` entry.
    """
    expected_skill = expected["skill"]
    # `or {}` rather than a .get default: an explicit null parameter block is
    # treated as "no parameters", the same way exact_intent_match treats it.
    expected_params = expected.get("parameters") or {}

    if not predicted or not isinstance(predicted, dict):
        return {
            "skill_correct": False,
            "parameter_correct": False,
            "exact_json_match": False,
        }

    raw_skill = predicted.get("skill")
    skill_correct = (normalize_skill(raw_skill) or raw_skill) == expected_skill
    parameter_correct = skill_correct and bool(
        parameters_match(predicted.get("parameters") or {}, expected_params)
    )
    exact_json = exact_intent_match(predicted, expected_skill, expected_params)

    return {
        "skill_correct": skill_correct,
        "parameter_correct": parameter_correct,
        "exact_json_match": exact_json,
    }
|
|
|
|
def build_confusion_matrix(
    results: list[dict[str, Any]],
    skills: tuple[str, ...] | None = None,
) -> dict[str, dict[str, int]]:
    """Count predicted skills per expected skill.

    Args:
        results: Benchmark records; each needs ``["expected"]["skill"]`` and
            may carry a ``"predicted"`` dict.
        skills: Skills to use as rows and columns, in order. Falls back to
            ``BENCHMARK_SKILLS`` when omitted or empty.

    Returns:
        ``matrix[expected][predicted] -> count``. Every row has one column per
        skill plus a trailing ``"<invalid>"`` column, which collects missing
        predictions, predictions without a skill, and skills that are not a
        column. Expected skills that are not in the skill list are appended
        as extra rows and columns instead of raising ``KeyError``.
    """
    skill_list = list(skills or BENCHMARK_SKILLS)

    # Make room for expected skills outside the configured list so that one
    # unexpected benchmark case cannot abort the whole summary.
    known = set(skill_list)
    for result in results:
        expected_skill = result["expected"]["skill"]
        if expected_skill not in known:
            known.add(expected_skill)
            skill_list.append(expected_skill)

    columns = [*skill_list, "<invalid>"]
    matrix: dict[str, dict[str, int]] = {
        expected: dict.fromkeys(columns, 0) for expected in skill_list
    }

    for result in results:
        row = matrix[result["expected"]["skill"]]
        predicted = result.get("predicted")
        raw_skill = predicted.get("skill") if isinstance(predicted, dict) else None

        predicted_skill = "<invalid>"
        if raw_skill:
            candidate = normalize_skill(raw_skill) or raw_skill
            # Only strings can be column labels; anything else is invalid.
            if isinstance(candidate, str) and candidate in row:
                predicted_skill = candidate

        row[predicted_skill] += 1

    return matrix
|
|
|
|
def summarize_metrics(results: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate per-result flags into counts and accuracy ratios.

    Each result must carry the ``skill_correct``, ``parameter_correct`` and
    ``exact_json_match`` flags. With no results, every count is ``0`` and
    every ratio is ``0.0``.
    """
    total = len(results)
    if not total:
        return {
            "total": 0,
            "skill_accuracy": 0.0,
            "parameter_accuracy": 0.0,
            "exact_json_match_rate": 0.0,
            "skill_correct": 0,
            "parameter_correct": 0,
            "exact_json_match": 0,
        }

    tallies = {"skill_correct": 0, "parameter_correct": 0, "exact_json_match": 0}
    for flag in tallies:
        for entry in results:
            if entry[flag]:
                tallies[flag] += 1

    return {
        "total": total,
        **tallies,
        "skill_accuracy": tallies["skill_correct"] / total,
        "parameter_accuracy": tallies["parameter_correct"] / total,
        "exact_json_match_rate": tallies["exact_json_match"] / total,
    }
|
|
|
|
def build_failure_report(results: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Collect every result that is not an exact JSON match.

    Each failure is tagged with a single type: ``"skill"`` when the skill was
    wrong, ``"parameters"`` when the skill was right but the parameters were
    not, and ``"json_format"`` when both were right yet the exact match
    still failed.
    """
    report: list[dict[str, Any]] = []
    failing = (entry for entry in results if not entry["exact_json_match"])

    for entry in failing:
        if entry["skill_correct"]:
            kind = "json_format" if entry["parameter_correct"] else "parameters"
        else:
            kind = "skill"

        report.append(
            {
                "id": entry.get("id"),
                "prompt": entry["prompt"],
                "domain": entry.get("domain"),
                "styles": entry.get("styles", []),
                "expected": entry["expected"],
                "predicted": entry.get("predicted"),
                "raw_output": entry.get("raw_output"),
                "failure_types": [kind],
            }
        )

    return report
|
|
|
|
def format_confusion_matrix(matrix: dict[str, dict[str, int]]) -> str:
    """Render a confusion matrix as an aligned plain-text table.

    Args:
        matrix: ``matrix[expected][predicted] -> count``. The column order is
            taken from the first row; a count missing from a later row is
            rendered as ``0``.

    Returns:
        A header line, a dashed rule of the same length, and one line per
        expected label, joined with newlines. The label column is at least 28
        characters wide and each count column at least 18; both grow to fit
        longer labels so adjacent headers never run together. An empty matrix
        yields just the header and rule.
    """
    corner_label = "expected \\ predicted"
    min_label_width, min_column_width, gap = 28, 18, 2

    predicted_labels = list(next(iter(matrix.values()), {}))

    # Size columns from the labels: several skill names are longer than the
    # minimum width and would otherwise butt up against their neighbours.
    label_width = max(
        [min_label_width, *(len(str(name)) + gap for name in matrix)]
    )
    column_widths = [
        max(min_column_width, len(str(label)) + gap) for label in predicted_labels
    ]

    header = f"{corner_label:<{label_width}}" + "".join(
        f"{label:>{width}}" for label, width in zip(predicted_labels, column_widths)
    )
    lines = [header, "-" * len(header)]

    for expected, counts in matrix.items():
        cells = "".join(
            f"{counts.get(label, 0):>{width}}"
            for label, width in zip(predicted_labels, column_widths)
        )
        lines.append(f"{expected:<{label_width}}" + cells)

    return "\n".join(lines)
|
|
|
|
def _format_failure_entry(index: int, failure: dict[str, Any]) -> list[str]:
    """Return the report lines (with a leading blank line) for one failure."""
    # Failure records may store these keys with a None value, so a .get
    # default would never apply; substitute the placeholder on None instead.
    case_id = failure.get("id")
    domain = failure.get("domain")
    predicted = failure.get("predicted")

    if predicted:
        predicted_text = json.dumps(predicted, separators=(",", ":"))
    else:
        raw_output = failure.get("raw_output")
        predicted_text = "null" if raw_output is None else raw_output

    return [
        "",
        f"[{index}] {'n/a' if case_id is None else case_id} — {', '.join(failure['failure_types'])}",
        f"Prompt: {failure['prompt']}",
        f"Domain: {'n/a' if domain is None else domain}",
        f"Styles: {', '.join(failure.get('styles') or [])}",
        f"Expected: {json.dumps(failure['expected'], separators=(',', ':'))}",
        f"Predicted: {predicted_text}",
    ]


def format_benchmark_report(
    metrics: dict[str, Any],
    matrix: dict[str, dict[str, int]],
    failures: list[dict[str, Any]],
    *,
    title: str = "Pocket Automator Benchmark Report",
) -> str:
    """Build the human-readable benchmark report.

    Args:
        metrics: Aggregate counts and ratios; the keys read are ``total``,
            ``skill_correct``, ``parameter_correct``, ``exact_json_match``,
            ``skill_accuracy``, ``parameter_accuracy`` and
            ``exact_json_match_rate``.
        matrix: Confusion matrix, rendered with ``format_confusion_matrix``.
        failures: Failure records; each needs ``prompt``, ``expected`` and
            ``failure_types`` and may carry ``id``, ``domain``, ``styles``,
            ``predicted`` and ``raw_output``.
        title: Heading printed on the first line.

    Returns:
        The report text, ending with a newline. It contains the metrics, the
        confusion matrix, one entry per failure, and, when any failure has a
        style or domain, a breakdown of failure counts by style and domain.
        A missing or ``None`` id/domain is shown as ``n/a``; when there is no
        parsed prediction the raw output is shown, or ``null`` if that is
        absent too.
    """
    total = metrics["total"]
    lines = [
        title,
        "=" * len(title),
        "",
        "Metrics",
        "-------",
        f"Total prompts: {total}",
        f"Skill accuracy: {metrics['skill_correct']}/{total} ({metrics['skill_accuracy']:.1%})",
        f"Parameter accuracy: {metrics['parameter_correct']}/{total} ({metrics['parameter_accuracy']:.1%})",
        f"Exact JSON match: {metrics['exact_json_match']}/{total} ({metrics['exact_json_match_rate']:.1%})",
        "",
        "Confusion Matrix (rows=expected, cols=predicted)",
        "------------------------------------------------",
        format_confusion_matrix(matrix),
        "",
        f"Failure Report ({len(failures)} failures)",
        "--------------",
    ]

    if not failures:
        lines.append("No failures — perfect score.")
    for index, failure in enumerate(failures, start=1):
        lines.extend(_format_failure_entry(index, failure))

    style_failures: dict[str, int] = defaultdict(int)
    domain_failures: dict[str, int] = defaultdict(int)
    for failure in failures:
        for style in failure.get("styles") or []:
            style_failures[style] += 1
        domain = failure.get("domain")
        if domain:
            domain_failures[domain] += 1

    if style_failures or domain_failures:
        lines.extend(["", "Failure breakdown", "-----------------"])
        if style_failures:
            lines.append("By style:")
            lines.extend(
                f" {style}: {count}" for style, count in sorted(style_failures.items())
            )
        if domain_failures:
            lines.append("By domain:")
            lines.extend(
                f" {domain}: {count}" for domain, count in sorted(domain_failures.items())
            )

    return "\n".join(lines) + "\n"
|
|
|
|
def record_result(
    case: dict[str, Any],
    raw_output: str,
    predicted: dict | None,
) -> dict[str, Any]:
    """Combine a benchmark case, the model output and its scores into one record.

    The record copies ``id``, ``prompt``, ``domain``, ``styles`` and
    ``expected`` from the case, stores the parsed prediction and the raw
    output, and merges in the flags returned by ``evaluate_prediction``.
    """
    expected = case["expected"]
    verdicts = evaluate_prediction(predicted, expected)

    record: dict[str, Any] = dict(
        id=case.get("id"),
        prompt=case["prompt"],
        domain=case.get("domain"),
        styles=case.get("styles", []),
        expected=expected,
        predicted=predicted,
        raw_output=raw_output,
    )
    record.update(verdicts)
    return record
|
|
|
|
def save_benchmark_outputs(
    results: list[dict[str, Any]],
    *,
    results_path: Path | None = None,
    report_path: Path | None = None,
) -> tuple[dict[str, Any], str]:
    """Summarize benchmark results and write them to disk.

    Args:
        results: Scored benchmark records.
        results_path: Destination of the JSON payload (metrics, confusion
            matrix, failures and raw results); defaults to
            ``BENCHMARK_RESULTS_PATH``.
        report_path: Destination of the text report; defaults to
            ``BENCHMARK_REPORT_PATH``.

    Returns:
        ``(metrics, report)``: the aggregate metrics dict and the report text
        that was written.
    """
    metrics = summarize_metrics(results)
    matrix = build_confusion_matrix(results)
    failures = build_failure_report(results)
    report = format_benchmark_report(metrics, matrix, failures)

    resolved_results_path = results_path or BENCHMARK_RESULTS_PATH
    resolved_report_path = report_path or BENCHMARK_REPORT_PATH

    payload = {
        "metrics": metrics,
        "confusion_matrix": matrix,
        "failures": failures,
        "results": results,
    }

    # Create the target folders first so a missing output directory does not
    # discard the results of a completed benchmark run.
    for target in (resolved_results_path, resolved_report_path):
        target.parent.mkdir(parents=True, exist_ok=True)

    resolved_results_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    resolved_report_path.write_text(report, encoding="utf-8")

    return metrics, report
|
|