android-skill-router / src /pocket_benchmark.py
kriyanshi's picture
Ship v2 intent extraction with API, demo UI, eval, and benchmark suite.
40a90bb
Raw
History Blame Contribute Delete
10.5 kB
"""Pocket Automator benchmark utilities — metrics, confusion matrix, failure reports."""
from __future__ import annotations
import json
from collections import defaultdict
from pathlib import Path
from typing import Any
try:
from src.skill_utils import (
normalize_param,
normalize_skill,
parameters_match,
)
except ImportError:
from skill_utils import (
normalize_param,
normalize_skill,
parameters_match,
)
# Two directory levels above this module's resolved path.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Directory holding the benchmark input and output files named below.
DATA_DIR = PROJECT_ROOT / "data"
# Default input file read by load_benchmark_prompts().
BENCHMARK_PROMPTS_PATH = DATA_DIR / "pocket_benchmark_prompts.json"
# Default JSON results file written by save_benchmark_outputs().
BENCHMARK_RESULTS_PATH = DATA_DIR / "pocket_benchmark_results.json"
# Default plain-text report file written by save_benchmark_outputs().
BENCHMARK_REPORT_PATH = DATA_DIR / "pocket_benchmark_report.txt"
# Skill identifiers used as the default label set by build_confusion_matrix().
BENCHMARK_SKILLS: tuple[str, ...] = (
    "create_alarm",
    "whatsapp_send_message",
    "spotify_pause",
    "spotify_play_playlist",
    "spotify_search_play",
    "uber_request_ride",
    "gmail_send_email",
    "calendar_create_event",
    "slack_open_channel",
)
# Domain label for each benchmark skill (several skills may share one domain).
DOMAIN_BY_SKILL: dict[str, str] = {
    "create_alarm": "alarms",
    "whatsapp_send_message": "whatsapp",
    "spotify_pause": "spotify",
    "spotify_play_playlist": "spotify",
    "spotify_search_play": "spotify",
    "uber_request_ride": "uber",
    "gmail_send_email": "gmail",
    "calendar_create_event": "calendar",
    "slack_open_channel": "slack",
}
def load_benchmark_prompts(path: Path | None = None) -> list[dict[str, Any]]:
benchmark_path = path or BENCHMARK_PROMPTS_PATH
with benchmark_path.open(encoding="utf-8") as handle:
return json.load(handle)
def canonical_intent(skill: str, parameters: dict | None = None) -> str:
params = parameters or {}
normalized = {key: normalize_param(str(value)) for key, value in sorted(params.items())}
payload = {"skill": skill, "parameters": normalized}
return json.dumps(payload, separators=(",", ":"), sort_keys=True)
def exact_intent_match(
predicted: dict | None,
expected_skill: str,
expected_parameters: dict | None = None,
) -> bool:
if not predicted:
return False
expected_params = expected_parameters or {}
predicted_skill = normalize_skill(predicted.get("skill")) or predicted.get("skill")
if predicted_skill != expected_skill:
return False
predicted_params = predicted.get("parameters") or {}
if set(predicted_params.keys()) != set(expected_params.keys()):
return False
return parameters_match(predicted_params, expected_params)
def evaluate_prediction(
predicted: dict | None,
expected: dict[str, Any],
) -> dict[str, bool]:
expected_skill = expected["skill"]
expected_params = expected.get("parameters", {})
skill_correct = predicted is not None and (
normalize_skill(predicted.get("skill")) or predicted.get("skill")
) == expected_skill
parameter_correct = skill_correct and parameters_match(
predicted.get("parameters", {}),
expected_params,
)
exact_json = exact_intent_match(predicted, expected_skill, expected_params)
return {
"skill_correct": skill_correct,
"parameter_correct": parameter_correct,
"exact_json_match": exact_json,
}
def build_confusion_matrix(
results: list[dict[str, Any]],
skills: tuple[str, ...] | None = None,
) -> dict[str, dict[str, int]]:
skill_list = list(skills or BENCHMARK_SKILLS)
matrix: dict[str, dict[str, int]] = {
expected: {predicted: 0 for predicted in skill_list + ["<invalid>"]}
for expected in skill_list
}
for result in results:
expected_skill = result["expected"]["skill"]
predicted = result.get("predicted")
if predicted and predicted.get("skill"):
predicted_skill = normalize_skill(predicted.get("skill")) or predicted.get("skill")
if predicted_skill not in matrix[expected_skill]:
predicted_skill = "<invalid>"
else:
predicted_skill = "<invalid>"
matrix[expected_skill][predicted_skill] += 1
return matrix
def summarize_metrics(results: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate per-result flags into counts and accuracy ratios.

    Args:
        results: Result records carrying ``skill_correct``,
            ``parameter_correct`` and ``exact_json_match`` flags.

    Returns:
        A dict with ``total``, one count per flag, and the matching ratios
        (``skill_accuracy``, ``parameter_accuracy``,
        ``exact_json_match_rate``). With no results, all counts are ``0``
        and all ratios are ``0.0``.
    """
    flags = ("skill_correct", "parameter_correct", "exact_json_match")
    rate_names = ("skill_accuracy", "parameter_accuracy", "exact_json_match_rate")

    total = len(results)
    if not total:
        # Nothing to divide by; ratios are listed before counts in this case.
        return {
            "total": 0,
            **dict.fromkeys(rate_names, 0.0),
            **dict.fromkeys(flags, 0),
        }

    hits = {flag: sum(bool(entry[flag]) for entry in results) for flag in flags}
    rates = {rate: hits[flag] / total for rate, flag in zip(rate_names, flags)}
    return {"total": total, **hits, **rates}
def build_failure_report(results: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Collect one failure record for every result that is not an exact match.

    Args:
        results: Scored result records.

    Returns:
        One dict per failing result with its identifying fields, expected
        and predicted intents, raw output, and a single-item
        ``failure_types`` list: ``"skill"``, ``"parameters"`` or
        ``"json_format"``.
    """

    def classify(entry: dict[str, Any]) -> str:
        # The earliest failing check names the failure.
        if not entry["skill_correct"]:
            return "skill"
        if not entry["parameter_correct"]:
            return "parameters"
        return "json_format"

    report: list[dict[str, Any]] = []
    for entry in results:
        if entry["exact_json_match"]:
            continue
        kind = classify(entry)
        report.append(
            {
                "id": entry.get("id"),
                "prompt": entry["prompt"],
                "domain": entry.get("domain"),
                "styles": entry.get("styles", []),
                "expected": entry["expected"],
                "predicted": entry.get("predicted"),
                "raw_output": entry.get("raw_output"),
                "failure_types": [kind],
            }
        )
    return report
def format_confusion_matrix(matrix: dict[str, dict[str, int]]) -> str:
    """Render a confusion matrix as an aligned plain-text table.

    Column labels come from the first row. The row-label column is at least
    28 characters wide and each count column at least 18; both widen so the
    longest label always keeps one separating space. Without this, labels
    longer than 17 characters run together in the header and drift out of
    line with their counts.

    Args:
        matrix: ``matrix[expected][predicted]`` counts.

    Returns:
        Header line, dashed rule, then one line per expected skill, joined
        with newlines. An empty matrix yields just the header and rule. A
        column missing from a row is shown as ``0``.
    """
    corner_label = "expected \\ predicted"
    # Default to no columns for an empty matrix rather than raising StopIteration.
    first_row = next(iter(matrix.values()), {})
    predicted_labels = list(first_row)

    column_width = max([18, *(len(str(label)) + 1 for label in predicted_labels)])
    row_label_width = max([28, *(len(str(expected)) + 1 for expected in matrix)])

    header = f"{corner_label:<{row_label_width}}" + "".join(
        f"{label:>{column_width}}" for label in predicted_labels
    )
    lines = [header, "-" * len(header)]
    for expected, counts in matrix.items():
        cells = "".join(
            f"{counts.get(label, 0):>{column_width}}" for label in predicted_labels
        )
        lines.append(f"{expected:<{row_label_width}}" + cells)
    return "\n".join(lines)
def _format_failure_entry(index: int, failure: dict[str, Any]) -> list[str]:
    """Render one failure record as report lines, starting with a blank separator."""
    case_id = failure.get("id")
    domain = failure.get("domain")
    predicted = failure.get("predicted")
    if predicted:
        predicted_text = json.dumps(predicted, separators=(",", ":"))
    else:
        # No parsed prediction: show the raw model output, or "null" if absent.
        raw_output = failure.get("raw_output")
        predicted_text = "null" if raw_output is None else raw_output
    failure_kinds = ", ".join(failure["failure_types"])
    return [
        "",
        # Records may store these keys with a None value, so test the value
        # itself rather than relying on a ``.get`` default.
        f"[{index}] {'n/a' if case_id is None else case_id} — {failure_kinds}",
        f"Prompt: {failure['prompt']}",
        f"Domain: {'n/a' if domain is None else domain}",
        f"Styles: {', '.join(failure.get('styles') or [])}",
        f"Expected: {json.dumps(failure['expected'], separators=(',', ':'))}",
        f"Predicted: {predicted_text}",
    ]


def _format_failure_breakdown(failures: list[dict[str, Any]]) -> list[str]:
    """Render per-style and per-domain failure counts; empty when neither is known."""
    style_failures: dict[str, int] = defaultdict(int)
    domain_failures: dict[str, int] = defaultdict(int)
    for failure in failures:
        for style in failure.get("styles") or []:
            style_failures[style] += 1
        domain = failure.get("domain")
        if domain:
            domain_failures[domain] += 1

    if not (style_failures or domain_failures):
        return []
    lines = ["", "Failure breakdown", "-----------------"]
    if style_failures:
        lines.append("By style:")
        lines.extend(f" {style}: {count}" for style, count in sorted(style_failures.items()))
    if domain_failures:
        lines.append("By domain:")
        lines.extend(f" {domain}: {count}" for domain, count in sorted(domain_failures.items()))
    return lines


def format_benchmark_report(
    metrics: dict[str, Any],
    matrix: dict[str, dict[str, int]],
    failures: list[dict[str, Any]],
    *,
    title: str = "Pocket Automator Benchmark Report",
) -> str:
    """Build the plain-text benchmark report.

    The report has a metrics section, the confusion matrix, one entry per
    failure, and (when any failure has styles or a domain) a breakdown of
    failure counts by style and by domain.

    Args:
        metrics: Summary with ``total``, the three ``*_correct`` /
            ``exact_json_match`` counts and their ratios.
        matrix: Confusion matrix passed to ``format_confusion_matrix``.
        failures: Failure records with ``prompt``, ``expected`` and
            ``failure_types``, plus optional ``id``, ``domain``, ``styles``,
            ``predicted`` and ``raw_output``.
        title: Heading printed on the first line.

    Returns:
        The report text, terminated by a newline.
    """
    total = metrics["total"]
    lines = [
        title,
        "=" * len(title),
        "",
        "Metrics",
        "-------",
        f"Total prompts: {total}",
        f"Skill accuracy: {metrics['skill_correct']}/{total} ({metrics['skill_accuracy']:.1%})",
        f"Parameter accuracy: {metrics['parameter_correct']}/{total} ({metrics['parameter_accuracy']:.1%})",
        f"Exact JSON match: {metrics['exact_json_match']}/{total} ({metrics['exact_json_match_rate']:.1%})",
        "",
        "Confusion Matrix (rows=expected, cols=predicted)",
        "------------------------------------------------",
        format_confusion_matrix(matrix),
        "",
        f"Failure Report ({len(failures)} failures)",
        "--------------",
    ]

    if not failures:
        lines.append("No failures — perfect score.")
    else:
        for index, failure in enumerate(failures, start=1):
            lines.extend(_format_failure_entry(index, failure))
        lines.extend(_format_failure_breakdown(failures))

    return "\n".join(lines) + "\n"
def record_result(
    case: dict[str, Any],
    raw_output: str,
    predicted: dict | None,
) -> dict[str, Any]:
    """Combine a benchmark case, the model output and its scores into one record.

    Args:
        case: Benchmark case with ``prompt`` and ``expected`` keys and
            optional ``id``, ``domain`` and ``styles``.
        raw_output: Unparsed model output.
        predicted: Parsed prediction, or ``None``.

    Returns:
        The case fields, ``predicted`` and ``raw_output``, followed by the
        flags returned by ``evaluate_prediction``.
    """
    expected = case["expected"]
    scores = evaluate_prediction(predicted, expected)

    record: dict[str, Any] = {
        "id": case.get("id"),
        "prompt": case["prompt"],
        "domain": case.get("domain"),
        "styles": case.get("styles", []),
        "expected": expected,
        "predicted": predicted,
        "raw_output": raw_output,
    }
    record.update(scores)
    return record
def save_benchmark_outputs(
    results: list[dict[str, Any]],
    *,
    results_path: Path | None = None,
    report_path: Path | None = None,
) -> tuple[dict[str, Any], str]:
    """Summarize a benchmark run and write its JSON results and text report.

    Args:
        results: Scored result records.
        results_path: Destination for the JSON payload (metrics, confusion
            matrix, failures, results); a falsy value selects
            ``BENCHMARK_RESULTS_PATH``.
        report_path: Destination for the text report; a falsy value selects
            ``BENCHMARK_REPORT_PATH``.

    Returns:
        ``(metrics, report)``: the summary dict and the report text.
    """
    metrics = summarize_metrics(results)
    matrix = build_confusion_matrix(results)
    failures = build_failure_report(results)
    report = format_benchmark_report(metrics, matrix, failures)

    payload = {
        "metrics": metrics,
        "confusion_matrix": matrix,
        "failures": failures,
        "results": results,
    }
    results_target = results_path or BENCHMARK_RESULTS_PATH
    report_target = report_path or BENCHMARK_REPORT_PATH

    # Create missing output directories so a finished run is not lost to a
    # FileNotFoundError at write time.
    for target in (results_target, report_target):
        target.parent.mkdir(parents=True, exist_ok=True)

    results_target.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    report_target.write_text(report, encoding="utf-8")
    return metrics, report