android-skill-router / scripts /generate_pocket_benchmark.py
kriyanshi's picture
Ship v2 intent extraction with API, demo UI, eval, and benchmark suite.
40a90bb
Raw
History Blame Contribute Delete
34.6 kB
#!/usr/bin/env python3
"""Generate the Pocket Automator real-world benchmark suite.

Produces data/pocket_benchmark_prompts.json — 200 held-out prompts designed to
measure generalization with slang, typos, incomplete phrasing, and casual speech.
Prompts are validated against train_intent.jsonl and eval_intent_prompts.json:
an exact or near-duplicate overlap with either file fails the run instead of
being silently dropped.

Usage:
    python scripts/generate_pocket_benchmark.py
    python scripts/generate_pocket_benchmark.py --validate-only
"""
from __future__ import annotations

import argparse
import json
import re
import sys
from collections import Counter
from pathlib import Path
# Repository root: two directory levels above this file.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Put the repository root first on sys.path so the `src` package import below
# resolves when this file is executed directly as a script.
sys.path.insert(0, str(PROJECT_ROOT))
from src.pocket_benchmark import BENCHMARK_SKILLS, DOMAIN_BY_SKILL # noqa: E402
# Training data read by load_existing_prompts(): one JSON record per line.
TRAIN_PATH = PROJECT_ROOT / "data" / "train_intent.jsonl"
# Evaluation cases read by load_existing_prompts(): a JSON collection of
# objects that each carry a "prompt" field.
EVAL_PATH = PROJECT_ROOT / "data" / "eval_intent_prompts.json"
# Destination file written by main().
OUTPUT_PATH = PROJECT_ROOT / "data" / "pocket_benchmark_prompts.json"
# Exact number of benchmark cases that validate_cases() requires.
TARGET_COUNT = 200
def _case(
    prompt: str,
    skill: str,
    parameters: dict,
    *,
    styles: list[str],
) -> dict:
    """Assemble a single benchmark record.

    Args:
        prompt: The user utterance to benchmark.
        skill: Name of the skill the prompt should resolve to.
        parameters: Parameter values expected to be extracted from the prompt.
        styles: Style tags describing the phrasing (keyword-only).

    Returns:
        A dict with the keys ``prompt``, ``expected``, ``domain`` and
        ``styles``, in that order. ``domain`` is looked up in
        ``DOMAIN_BY_SKILL``, so an unknown ``skill`` makes that lookup fail.
    """
    expectation = {"skill": skill, "parameters": parameters}
    record: dict = {"prompt": prompt, "expected": expectation}
    record["domain"] = DOMAIN_BY_SKILL[skill]
    record["styles"] = styles
    return record
# ---------------------------------------------------------------------------
# Curated benchmark cases — phrasing intentionally unlike training templates
# ---------------------------------------------------------------------------
BENCHMARK_CASES: list[dict] = [
# --- Alarms (22) ---
_case("yo set an alrm for like 5:45 tmrw morning pls", "create_alarm", {"time": "5:45", "day": "tmrw morning"}, styles=["slang", "typo", "conversational"]),
_case("need to b up at 6ish on monday ngl", "create_alarm", {"time": "6ish", "day": "monday"}, styles=["slang", "incomplete", "conversational"]),
_case("6am alarm pls", "create_alarm", {"time": "6am"}, styles=["incomplete"]),
_case("wake me 7:20 wednesday thx", "create_alarm", {"time": "7:20", "day": "wednesday"}, styles=["incomplete", "conversational"]),
_case("uh can u do a 10:15 pm thing tonight", "create_alarm", {"time": "10:15 pm", "day": "tonight"}, styles=["conversational", "incomplete"]),
_case("tomorow 4:30 wake up plz", "create_alarm", {"time": "4:30", "day": "tomorow"}, styles=["typo", "incomplete"]),
_case("deadass need alarm sunday noon", "create_alarm", {"time": "noon", "day": "sunday"}, styles=["slang", "incomplete"]),
_case("ok wait actually make it 8:05 am next tuesday instead", "create_alarm", {"time": "8:05 am", "day": "next tuesday"}, styles=["conversational"]),
_case("4:50 alarm for sat pls", "create_alarm", {"time": "4:50", "day": "sat"}, styles=["slang", "incomplete"]),
_case("gonna crash early — buzz me at 9:30 pm today", "create_alarm", {"time": "9:30 pm", "day": "today"}, styles=["slang", "conversational"]),
_case("hey siri vibes but pocket automator: 5 am thurs", "create_alarm", {"time": "5 am", "day": "thurs"}, styles=["slang", "conversational"]),
_case("lmk when its 6:30 — jk just set alarm 6:30 friday", "create_alarm", {"time": "6:30", "day": "friday"}, styles=["slang", "conversational"]),
_case("alarm?? 7:45 am this weekend", "create_alarm", {"time": "7:45 am", "day": "this weekend"}, styles=["incomplete", "conversational"]),
_case("pls dont let me oversleep — 6:10 tomorrow", "create_alarm", {"time": "6:10", "day": "tomorrow"}, styles=["conversational"]),
_case("5:55am wakeup call monday pls", "create_alarm", {"time": "5:55am", "day": "monday"}, styles=["incomplete"]),
_case("set wakey wakey 8am tmr", "create_alarm", {"time": "8am", "day": "tmr"}, styles=["slang", "incomplete"]),
_case("lowkey need 11:11 pm alarm tonight", "create_alarm", {"time": "11:11 pm", "day": "tonight"}, styles=["slang"]),
_case("can i get a 7 am buzzer next monday", "create_alarm", {"time": "7 am", "day": "next monday"}, styles=["conversational"]),
_case("alarm at half past six tomorrow morning", "create_alarm", {"time": "half past six", "day": "tomorrow morning"}, styles=["conversational"]),
_case("6:00 alarm 4 wednesday", "create_alarm", {"time": "6:00", "day": "wednesday"}, styles=["typo", "incomplete"]),
_case("wake me up round 9 on sunday", "create_alarm", {"time": "9", "day": "sunday"}, styles=["slang", "incomplete"]),
_case("fr need 5:15 am alarm this tuesday", "create_alarm", {"time": "5:15 am", "day": "this tuesday"}, styles=["slang"]),
# --- WhatsApp (22) ---
_case("hit up zoe on whatsapp say im omw", "whatsapp_send_message", {"contact": "zoe", "message": "im omw"}, styles=["slang", "typo", "incomplete"]),
_case("wa msg marcus 'running 20 min late'", "whatsapp_send_message", {"contact": "marcus", "message": "running 20 min late"}, styles=["slang", "incomplete"]),
_case("tell roomie on watsapp dishes are done", "whatsapp_send_message", {"contact": "roomie", "message": "dishes are done"}, styles=["slang", "typo"]),
_case("whastapp elena — meeting got moved to 4", "whatsapp_send_message", {"contact": "elena", "message": "meeting got moved to 4"}, styles=["typo", "incomplete", "conversational"]),
_case("shoot kevin from work a text on whatsapp: heads up im wfh", "whatsapp_send_message", {"contact": "kevin from work", "message": "heads up im wfh"}, styles=["slang", "conversational"]),
_case("msg grandpa on whatsapp happy bday!!", "whatsapp_send_message", {"contact": "grandpa", "message": "happy bday!!"}, styles=["slang", "typo"]),
_case("yo ping sis saying im outside", "whatsapp_send_message", {"contact": "sis", "message": "im outside"}, styles=["slang", "incomplete"]),
_case("whatsap daniel 'got ur package'", "whatsapp_send_message", {"contact": "daniel", "message": "got ur package"}, styles=["typo", "incomplete"]),
_case("can u text aunt lisa on whatsapp that flight lands 6", "whatsapp_send_message", {"contact": "aunt lisa", "message": "flight lands 6"}, styles=["conversational", "incomplete"]),
_case("drop a whatsapp to coach mike — practice cancelled", "whatsapp_send_message", {"contact": "coach mike", "message": "practice cancelled"}, styles=["slang", "incomplete"]),
_case("whatsapp my landlord rent sent", "whatsapp_send_message", {"contact": "my landlord", "message": "rent sent"}, styles=["incomplete"]),
_case("lemme msg hannah real quick on whatsapp: u free later?", "whatsapp_send_message", {"contact": "hannah", "message": "u free later?"}, styles=["slang", "conversational"]),
_case("send whatsapp to benji thx for covering my shift", "whatsapp_send_message", {"contact": "benji", "message": "thx for covering my shift"}, styles=["slang", "incomplete"]),
_case("text whatsapp to carpool group running late again sry", "whatsapp_send_message", {"contact": "carpool group", "message": "running late again sry"}, styles=["slang", "incomplete"]),
_case("ok whatsapp olivia saying docs r uploaded", "whatsapp_send_message", {"contact": "olivia", "message": "docs r uploaded"}, styles=["slang", "typo", "conversational"]),
_case("wahtapp msg to tomas — keys under mat", "whatsapp_send_message", {"contact": "tomas", "message": "keys under mat"}, styles=["typo", "incomplete"]),
_case("just tell jules on whatsapp im grabbing boba", "whatsapp_send_message", {"contact": "jules", "message": "im grabbing boba"}, styles=["slang", "conversational"]),
_case("whatsapp neighbor hey can u sign for delivery", "whatsapp_send_message", {"contact": "neighbor", "message": "hey can u sign for delivery"}, styles=["incomplete", "conversational"]),
_case("msg whatsapp to dr patel appointment confirmed", "whatsapp_send_message", {"contact": "dr patel", "message": "appointment confirmed"}, styles=["incomplete"]),
_case("hit my brother on whatsapp w/ 'u still coming?'", "whatsapp_send_message", {"contact": "my brother", "message": "u still coming?"}, styles=["slang", "incomplete"]),
_case("whatsapp bestie ngl im stressed af today", "whatsapp_send_message", {"contact": "bestie", "message": "ngl im stressed af today"}, styles=["slang", "incomplete"]),
_case("send whatsapp msg to felix — slide deck is ready", "whatsapp_send_message", {"contact": "felix", "message": "slide deck is ready"}, styles=["incomplete"]),
# --- Spotify pause (22) ---
_case("spotfy stop rn", "spotify_pause", {}, styles=["typo", "slang", "incomplete"]),
_case("yo pause the music", "spotify_pause", {}, styles=["slang", "incomplete"]),
_case("mute spotify pls im on a call", "spotify_pause", {}, styles=["conversational"]),
_case("can u kill the spotify track", "spotify_pause", {}, styles=["slang", "conversational"]),
_case("hold up pause spotify", "spotify_pause", {}, styles=["slang", "incomplete"]),
_case("spoitfy pause rq", "spotify_pause", {}, styles=["typo", "slang", "incomplete"]),
_case("stop spotify playback thx", "spotify_pause", {}, styles=["conversational"]),
_case("shhh pause spotify", "spotify_pause", {}, styles=["incomplete", "conversational"]),
_case("spotify off for a sec", "spotify_pause", {}, styles=["slang", "incomplete"]),
_case("pause whats playing on spotify", "spotify_pause", {}, styles=["incomplete"]),
_case("ok stop spotify music", "spotify_pause", {}, styles=["conversational", "incomplete"]),
_case("spotify halt plz", "spotify_pause", {}, styles=["incomplete"]),
_case("silence spotify real quick", "spotify_pause", {}, styles=["conversational"]),
_case("pause spotify — landlord knocking", "spotify_pause", {}, styles=["conversational"]),
_case("spotify stop the banger lol", "spotify_pause", {}, styles=["slang", "conversational"]),
_case("cut spotify audio", "spotify_pause", {}, styles=["slang", "incomplete"]),
_case("spotify pause pls headphones dying", "spotify_pause", {}, styles=["conversational"]),
_case("uh pause spotify", "spotify_pause", {}, styles=["conversational", "incomplete"]),
_case("stop the spotify pls", "spotify_pause", {}, styles=["incomplete"]),
_case("spotify freeze for now pls", "spotify_pause", {}, styles=["incomplete", "conversational"]),
_case("pause my spotify session", "spotify_pause", {}, styles=["incomplete"]),
_case("spotify quiet for now", "spotify_pause", {}, styles=["incomplete", "conversational"]),
# --- Spotify play playlist (22) ---
_case("put on my midnight drives mix on spotify", "spotify_play_playlist", {"playlist": "midnight drives mix"}, styles=["conversational"]),
_case("spotify play sunday scaries", "spotify_play_playlist", {"playlist": "sunday scaries"}, styles=["incomplete"]),
_case("queue up deep focus vibes on spotify", "spotify_play_playlist", {"playlist": "deep focus vibes"}, styles=["slang"]),
_case("play my 2010s throwbacks spotify", "spotify_play_playlist", {"playlist": "2010s throwbacks"}, styles=["incomplete"]),
_case("spotfy playlist: rainy day jazz", "spotify_play_playlist", {"playlist": "rainy day jazz"}, styles=["typo", "incomplete"]),
_case("yo spin the gym beast mode playlist", "spotify_play_playlist", {"playlist": "gym beast mode"}, styles=["slang", "incomplete"]),
_case("spotify — my cooking playlist pls", "spotify_play_playlist", {"playlist": "cooking"}, styles=["incomplete", "conversational"]),
_case("start spotify playlist called roadtrip 2024", "spotify_play_playlist", {"playlist": "roadtrip 2024"}, styles=["incomplete"]),
_case("play hyperpop mix on spotify", "spotify_play_playlist", {"playlist": "hyperpop mix"}, styles=["incomplete"]),
_case("spotify my chillhop essentials thx", "spotify_play_playlist", {"playlist": "chillhop essentials"}, styles=["incomplete", "conversational"]),
_case("can u play the study grind playlist on spotify", "spotify_play_playlist", {"playlist": "study grind"}, styles=["conversational"]),
_case("spotify playlist sleep sounds go", "spotify_play_playlist", {"playlist": "sleep sounds"}, styles=["incomplete"]),
_case("throw on indie roadtrip on spotify", "spotify_play_playlist", {"playlist": "indie roadtrip"}, styles=["slang", "incomplete"]),
_case("spotify play my liked tracks playlist", "spotify_play_playlist", {"playlist": "liked tracks"}, styles=["incomplete"]),
_case("play spotify playlist bossa nova mornings", "spotify_play_playlist", {"playlist": "bossa nova mornings"}, styles=["incomplete"]),
_case("spotify — queue phonk workout", "spotify_play_playlist", {"playlist": "phonk workout"}, styles=["incomplete"]),
_case("put spotify on my cozy rainy day list", "spotify_play_playlist", {"playlist": "cozy rainy day"}, styles=["conversational", "incomplete"]),
_case("play the spotify playlist french cafe", "spotify_play_playlist", {"playlist": "french cafe"}, styles=["incomplete"]),
_case("spotify start my pregame playlist", "spotify_play_playlist", {"playlist": "pregame"}, styles=["slang", "incomplete"]),
_case("ok spotify play downtempo focus", "spotify_play_playlist", {"playlist": "downtempo focus"}, styles=["conversational", "incomplete"]),
_case("spotify my discover mix playlist pls", "spotify_play_playlist", {"playlist": "discover mix"}, styles=["incomplete"]),
_case("play playlist morning commute on spotify", "spotify_play_playlist", {"playlist": "morning commute"}, styles=["incomplete"]),
# --- Spotify search play (22) ---
_case("spotify find bad bunny and play", "spotify_search_play", {"query": "bad bunny"}, styles=["incomplete"]),
_case("search spotify for lofi hip hop then play", "spotify_search_play", {"query": "lofi hip hop"}, styles=["conversational"]),
_case("play some arctic monkeys on spotify", "spotify_search_play", {"query": "arctic monkeys"}, styles=["conversational", "incomplete"]),
_case("spotfy search billie eilish play it", "spotify_search_play", {"query": "billie eilish"}, styles=["typo", "incomplete"]),
_case("look up sabrina carpenter on spotify n play", "spotify_search_play", {"query": "sabrina carpenter"}, styles=["slang", "incomplete"]),
_case("spotify: search neo soul and hit play", "spotify_search_play", {"query": "neo soul"}, styles=["incomplete"]),
_case("find tyler the creator on spotify and play", "spotify_search_play", {"query": "tyler the creator"}, styles=["incomplete"]),
_case("spotify play after searching radiohead", "spotify_search_play", {"query": "radiohead"}, styles=["incomplete"]),
_case("yo spotify search dnb and play", "spotify_search_play", {"query": "dnb"}, styles=["slang", "incomplete"]),
_case("can u find frank ocean on spotify and play it", "spotify_search_play", {"query": "frank ocean"}, styles=["conversational"]),
_case("spotify search classical piano play", "spotify_search_play", {"query": "classical piano"}, styles=["incomplete"]),
_case("spotify search play some drake tracks", "spotify_search_play", {"query": "drake tracks"}, styles=["incomplete"]),
_case("spotify find latin reggaeton and play", "spotify_search_play", {"query": "latin reggaeton"}, styles=["incomplete"]),
_case("search spotify for aphex twin then play", "spotify_search_play", {"query": "aphex twin"}, styles=["incomplete"]),
_case("spotify play whatever u find for synthwave", "spotify_search_play", {"query": "synthwave"}, styles=["slang", "conversational"]),
_case("find mitski on spotify play pls", "spotify_search_play", {"query": "mitski"}, styles=["incomplete", "conversational"]),
_case("soptify search play khruangbin", "spotify_search_play", {"query": "khruangbin"}, styles=["typo", "incomplete"]),
_case("look up burna boy spotify and play", "spotify_search_play", {"query": "burna boy"}, styles=["incomplete"]),
_case("spotify search play some afrobeats", "spotify_search_play", {"query": "afrobeats"}, styles=["incomplete"]),
_case("find fred again on spotify n play it", "spotify_search_play", {"query": "fred again"}, styles=["slang", "incomplete"]),
_case("spotify search play 90s r&b", "spotify_search_play", {"query": "90s r&b"}, styles=["incomplete"]),
_case("search spotify play tame impala", "spotify_search_play", {"query": "tame impala"}, styles=["incomplete"]),
# --- Uber (22) ---
_case("ubber to jfk pls", "uber_request_ride", {"destination": "jfk"}, styles=["typo", "incomplete", "conversational"]),
_case("need a ride to 42 oak street on uber", "uber_request_ride", {"destination": "42 oak street"}, styles=["conversational"]),
_case("hop in an uber to the vet clinic", "uber_request_ride", {"destination": "the vet clinic"}, styles=["slang"]),
_case("ubr me to whole foods on 5th", "uber_request_ride", {"destination": "whole foods on 5th"}, styles=["typo", "slang", "incomplete"]),
_case("get uber to coworking space downtown", "uber_request_ride", {"destination": "coworking space downtown"}, styles=["incomplete"]),
_case("book ride to mom's house via uber", "uber_request_ride", {"destination": "mom's house"}, styles=["incomplete"]),
_case("uber — take me to pier 39", "uber_request_ride", {"destination": "pier 39"}, styles=["incomplete", "conversational"]),
_case("can u call uber to stanford campus", "uber_request_ride", {"destination": "stanford campus"}, styles=["conversational"]),
_case("ride to airport terminal 2 uber", "uber_request_ride", {"destination": "airport terminal 2"}, styles=["incomplete"]),
_case("uber to the sushi place on main", "uber_request_ride", {"destination": "the sushi place on main"}, styles=["incomplete", "conversational"]),
_case("need uber to ikea rn", "uber_request_ride", {"destination": "ikea"}, styles=["slang", "incomplete"]),
_case("request uber to brooklyn bridge park", "uber_request_ride", {"destination": "brooklyn bridge park"}, styles=["incomplete"]),
_case("uber drop me at la guardia", "uber_request_ride", {"destination": "la guardia"}, styles=["slang", "incomplete"]),
_case("get a cab on uber to dentist on maple ave", "uber_request_ride", {"destination": "dentist on maple ave"}, styles=["conversational"]),
_case("uber to friends apartment 8b", "uber_request_ride", {"destination": "friends apartment 8b"}, styles=["incomplete"]),
_case("yo uber to the concert venue", "uber_request_ride", {"destination": "the concert venue"}, styles=["slang", "incomplete"]),
_case("book uber ride to seattle tacoma airport", "uber_request_ride", {"destination": "seattle tacoma airport"}, styles=["incomplete"]),
_case("uber pls to union station", "uber_request_ride", {"destination": "union station"}, styles=["incomplete", "conversational"]),
_case("ride share to target on broadway uber", "uber_request_ride", {"destination": "target on broadway"}, styles=["incomplete"]),
_case("uber me over to the marriott hotel", "uber_request_ride", {"destination": "the marriott hotel"}, styles=["slang", "incomplete"]),
_case("need uber to physio appointment on elm", "uber_request_ride", {"destination": "physio appointment on elm"}, styles=["incomplete"]),
_case("call uber to the dog park", "uber_request_ride", {"destination": "the dog park"}, styles=["incomplete"]),
# --- Gmail (22) ---
_case("gmal email recruiter saying im interested in the role", "gmail_send_email", {"recipient": "recruiter", "message": "im interested in the role"}, styles=["typo", "incomplete"]),
_case("shoot an email via gmail to finance@acme.co — invoice overdue", "gmail_send_email", {"recipient": "finance@acme.co", "message": "invoice overdue"}, styles=["slang", "conversational"]),
_case("gmail landlord asking about lease renewal", "gmail_send_email", {"recipient": "landlord", "message": "asking about lease renewal"}, styles=["incomplete"]),
_case("email my manager on gmail: heads up sick day tomorrow", "gmail_send_email", {"recipient": "my manager", "message": "heads up sick day tomorrow"}, styles=["conversational"]),
_case("compose gmail to nora.park@design.io portfolio link attached", "gmail_send_email", {"recipient": "nora.park@design.io", "message": "portfolio link attached"}, styles=["incomplete"]),
_case("gmail msg to thesis advisor draft attached for review", "gmail_send_email", {"recipient": "thesis advisor", "message": "draft attached for review"}, styles=["incomplete"]),
_case("send gmail to carpool buddy running 10 late", "gmail_send_email", {"recipient": "carpool buddy", "message": "running 10 late"}, styles=["slang", "incomplete"]),
_case("write gmail to billing@saas.com cancel subscription pls", "gmail_send_email", {"recipient": "billing@saas.com", "message": "cancel subscription pls"}, styles=["conversational"]),
_case("gmail email mentor thx for the intro", "gmail_send_email", {"recipient": "mentor", "message": "thx for the intro"}, styles=["slang", "incomplete"]),
_case("ok gmail to ops-team@startup.dev deploy failed rollback done", "gmail_send_email", {"recipient": "ops-team@startup.dev", "message": "deploy failed rollback done"}, styles=["conversational", "incomplete"]),
_case("email thru gmail to dentist office reschedule appt", "gmail_send_email", {"recipient": "dentist office", "message": "reschedule appt"}, styles=["incomplete"]),
_case("gmail send to contractor quote approved lets proceed", "gmail_send_email", {"recipient": "contractor", "message": "quote approved lets proceed"}, styles=["incomplete"]),
_case("hey gmail my professor question about deadline extension", "gmail_send_email", {"recipient": "my professor", "message": "question about deadline extension"}, styles=["conversational", "incomplete"]),
_case("gmail to hr@corp.net pto request for next week", "gmail_send_email", {"recipient": "hr@corp.net", "message": "pto request for next week"}, styles=["incomplete"]),
_case("send email gmail to vendor payment sent today", "gmail_send_email", {"recipient": "vendor", "message": "payment sent today"}, styles=["incomplete"]),
_case("gmial client@studio.com mockups ready for feedback", "gmail_send_email", {"recipient": "client@studio.com", "message": "mockups ready for feedback"}, styles=["typo", "incomplete"]),
_case("compose email on gmail to roommate utilities split attached", "gmail_send_email", {"recipient": "roommate", "message": "utilities split attached"}, styles=["incomplete"]),
_case("gmail to travel agent pls confirm hotel booking", "gmail_send_email", {"recipient": "travel agent", "message": "pls confirm hotel booking"}, styles=["conversational"]),
_case("email gmail coach thanks for the session notes", "gmail_send_email", {"recipient": "coach", "message": "thanks for the session notes"}, styles=["incomplete"]),
_case("gmail msg to editor final chapter attached", "gmail_send_email", {"recipient": "editor", "message": "final chapter attached"}, styles=["incomplete"]),
_case("send gmail to internship lead following up on application", "gmail_send_email", {"recipient": "internship lead", "message": "following up on application"}, styles=["incomplete"]),
_case("gmail email to dad flight details attached", "gmail_send_email", {"recipient": "dad", "message": "flight details attached"}, styles=["incomplete"]),
# --- Calendar (24) ---
_case("put dentist on calender next thurs 3pm", "calendar_create_event", {"title": "dentist", "date": "next thurs", "time": "3pm"}, styles=["typo", "incomplete"]),
_case("schedule 1:1 w/ manager monday 11am", "calendar_create_event", {"title": "1:1 w/ manager", "date": "monday", "time": "11am"}, styles=["slang", "incomplete"]),
_case("add calendar block focus time tuesday 9-11", "calendar_create_event", {"title": "focus time", "date": "tuesday", "time": "9-11"}, styles=["incomplete"]),
_case("book therapy session friday 5 pm on calendar", "calendar_create_event", {"title": "therapy session", "date": "friday", "time": "5 pm"}, styles=["incomplete"]),
_case("cal event: team retro wednesday 4pm", "calendar_create_event", {"title": "team retro", "date": "wednesday", "time": "4pm"}, styles=["slang", "incomplete"]),
_case("put parent teacher conf on calendar oct 12 6pm", "calendar_create_event", {"title": "parent teacher conf", "date": "oct 12", "time": "6pm"}, styles=["incomplete"]),
_case("schedule gym class saturday 8 am calendar", "calendar_create_event", {"title": "gym class", "date": "saturday", "time": "8 am"}, styles=["incomplete"]),
_case("add calendar entry flight to nyc sunday 7:45 am", "calendar_create_event", {"title": "flight to nyc", "date": "sunday", "time": "7:45 am"}, styles=["incomplete"]),
_case("calendar — lunch w/ priya tomorrow noon", "calendar_create_event", {"title": "lunch w/ priya", "date": "tomorrow", "time": "noon"}, styles=["slang", "incomplete", "conversational"]),
_case("set cal event sprint planning next monday 10", "calendar_create_event", {"title": "sprint planning", "date": "next monday", "time": "10"}, styles=["slang", "incomplete"]),
_case("put car inspection on calendar tuesday 2:30 pm", "calendar_create_event", {"title": "car inspection", "date": "tuesday", "time": "2:30 pm"}, styles=["incomplete"]),
_case("scheduel interview prep wednesday 6 pm", "calendar_create_event", {"title": "interview prep", "date": "wednesday", "time": "6 pm"}, styles=["typo", "incomplete"]),
_case("add to calendar vet appt thursday 1 pm", "calendar_create_event", {"title": "vet appt", "date": "thursday", "time": "1 pm"}, styles=["incomplete"]),
_case("calendar event board presentation friday 9 am", "calendar_create_event", {"title": "board presentation", "date": "friday", "time": "9 am"}, styles=["incomplete"]),
_case("book calendar slot date night saturday 8 pm", "calendar_create_event", {"title": "date night", "date": "saturday", "time": "8 pm"}, styles=["incomplete"]),
_case("put standup notes review on cal monday 9:30", "calendar_create_event", {"title": "standup notes review", "date": "monday", "time": "9:30"}, styles=["slang", "incomplete"]),
_case("schedule movers arrival next saturday 10 am", "calendar_create_event", {"title": "movers arrival", "date": "next saturday", "time": "10 am"}, styles=["incomplete"]),
_case("add calendar tax appointment april 5 11 am", "calendar_create_event", {"title": "tax appointment", "date": "april 5", "time": "11 am"}, styles=["incomplete"]),
_case("calender block deep work thursday morning 8", "calendar_create_event", {"title": "deep work", "date": "thursday morning", "time": "8"}, styles=["typo", "incomplete"]),
_case("put wedding rehearsal on calendar june 20 5 pm", "calendar_create_event", {"title": "wedding rehearsal", "date": "june 20", "time": "5 pm"}, styles=["incomplete"]),
_case("schedule product demo next wednesday 3 pm", "calendar_create_event", {"title": "product demo", "date": "next wednesday", "time": "3 pm"}, styles=["incomplete"]),
_case("add calendar coffee chat friday 10:30 am", "calendar_create_event", {"title": "coffee chat", "date": "friday", "time": "10:30 am"}, styles=["incomplete"]),
_case("book calendar orthodontist monday 4:15 pm", "calendar_create_event", {"title": "orthodontist", "date": "monday", "time": "4:15 pm"}, styles=["incomplete"]),
_case("put hackathon submission deadline on calendar sunday midnight", "calendar_create_event", {"title": "hackathon submission deadline", "date": "sunday", "time": "midnight"}, styles=["incomplete"]),
# --- Slack (22) ---
_case("open slack #incidents pls", "slack_open_channel", {"channel": "incidents"}, styles=["incomplete", "conversational"]),
_case("jump to product-launch channel on slack", "slack_open_channel", {"channel": "product-launch"}, styles=["slang", "incomplete"]),
_case("slack go to team-watercooler", "slack_open_channel", {"channel": "team-watercooler"}, styles=["incomplete"]),
_case("pull up slack channel ios-bugs", "slack_open_channel", {"channel": "ios-bugs"}, styles=["slang", "incomplete"]),
_case("switch slack to growth-experiments", "slack_open_channel", {"channel": "growth-experiments"}, styles=["incomplete"]),
_case("show slack channel design-system", "slack_open_channel", {"channel": "design-system"}, styles=["incomplete"]),
_case("navigate slack to platform-oncall", "slack_open_channel", {"channel": "platform-oncall"}, styles=["incomplete"]),
_case("open the slack channel ml-research thx", "slack_open_channel", {"channel": "ml-research"}, styles=["conversational"]),
_case("slack channel customer-escalations open it", "slack_open_channel", {"channel": "customer-escalations"}, styles=["incomplete"]),
_case("take me to #backend-standup in slack", "slack_open_channel", {"channel": "backend-standup"}, styles=["incomplete"]),
_case("open slack for channel release-train", "slack_open_channel", {"channel": "release-train"}, styles=["incomplete"]),
_case("slack — show me hiring-pipeline channel", "slack_open_channel", {"channel": "hiring-pipeline"}, styles=["conversational", "incomplete"]),
_case("go slack channel infra-alerts", "slack_open_channel", {"channel": "infra-alerts"}, styles=["incomplete"]),
_case("open slak channel mobile-crash-reports", "slack_open_channel", {"channel": "mobile-crash-reports"}, styles=["typo", "incomplete"]),
_case("pull slack up on qa-automation channel", "slack_open_channel", {"channel": "qa-automation"}, styles=["incomplete"]),
_case("switch to slack channel docs-and-wiki", "slack_open_channel", {"channel": "docs-and-wiki"}, styles=["incomplete"]),
_case("open slack growth-metrics channel rq", "slack_open_channel", {"channel": "growth-metrics"}, styles=["slang", "incomplete"]),
_case("show me slack #sales-wins", "slack_open_channel", {"channel": "sales-wins"}, styles=["incomplete"]),
_case("navigate to slack channel partner-integrations", "slack_open_channel", {"channel": "partner-integrations"}, styles=["incomplete"]),
_case("open slack channel called observability", "slack_open_channel", {"channel": "observability"}, styles=["incomplete"]),
_case("slack open eng-hiring channel pls", "slack_open_channel", {"channel": "eng-hiring"}, styles=["incomplete", "conversational"]),
_case("jump slack to ai-playground channel", "slack_open_channel", {"channel": "ai-playground"}, styles=["slang", "incomplete"]),
]
def normalize_prompt(text: str) -> str:
    """Return ``text`` lowercased with every whitespace run collapsed to one space.

    Leading and trailing whitespace is removed as part of the collapse, so two
    prompts that differ only in case or spacing normalize to the same string.
    """
    # str.split() with no arguments already discards leading/trailing
    # whitespace and never yields empty pieces.
    words = text.lower().split()
    return " ".join(words)
def token_set(text: str) -> set[str]:
    """Return the distinct tokens found in ``text``.

    The text is lowercased first; a token is a maximal run of the characters
    ``a-z``, ``0-9`` and ``@``. Every other character acts as a separator.
    """
    lowered = text.lower()
    return {match.group(0) for match in re.finditer(r"[a-z0-9@]+", lowered)}
def jaccard_similarity(left: str, right: str) -> float:
    """Return the Jaccard similarity of the token sets of two strings.

    Both strings are tokenized with ``token_set``. The result is the number of
    shared tokens divided by the number of distinct tokens overall, or ``0.0``
    when either string produces no tokens.
    """
    tokens_a, tokens_b = token_set(left), token_set(right)
    if not (tokens_a and tokens_b):
        return 0.0
    shared = tokens_a.intersection(tokens_b)
    combined = tokens_a.union(tokens_b)
    return len(shared) / len(combined)
def load_existing_prompts() -> set[str]:
    """Collect the normalized prompts already present in the train and eval data.

    Reads ``TRAIN_PATH`` as JSON Lines, taking the ``content`` of every message
    whose ``role`` is ``"user"``, and ``EVAL_PATH`` as a JSON collection of
    cases, taking each case's ``prompt``. A file that does not exist is skipped,
    so the result is empty when neither file is present.

    Returns:
        The set of prompts after ``normalize_prompt`` has been applied.

    Raises:
        json.JSONDecodeError: If a non-blank line or the eval file is not
            valid JSON.
        KeyError: If a record lacks one of the expected fields.
    """
    prompts: set[str] = set()

    if TRAIN_PATH.exists():
        with TRAIN_PATH.open(encoding="utf-8") as handle:
            for line in handle:
                # JSONL files often end with (or contain) blank lines, and
                # json.loads rejects an empty document, so skip them.
                if not line.strip():
                    continue
                record = json.loads(line)
                prompts.update(
                    normalize_prompt(message["content"])
                    for message in record["messages"]
                    if message["role"] == "user"
                )

    if EVAL_PATH.exists():
        with EVAL_PATH.open(encoding="utf-8") as handle:
            prompts.update(
                normalize_prompt(case["prompt"]) for case in json.load(handle)
            )

    return prompts
def validate_cases(
    cases: list[dict],
    existing: set[str],
    *,
    similarity_threshold: float = 0.85,
    min_style_coverage: int = 20,
) -> list[str]:
    """Check the benchmark cases and return every problem found.

    Checks performed:
      * the number of cases equals ``TARGET_COUNT``;
      * no two cases share the same normalized prompt;
      * no prompt matches an entry of ``existing`` exactly, or reaches
        ``similarity_threshold`` Jaccard similarity with one;
      * every case uses a skill listed in ``BENCHMARK_SKILLS`` and every such
        skill is used at least once;
      * each of the four style tags appears on at least
        ``min_style_coverage`` cases.

    Args:
        cases: Benchmark records with ``prompt``, ``expected.skill`` and an
            optional ``styles`` list.
        existing: Normalized prompts that the benchmark must not overlap.
        similarity_threshold: Jaccard similarity at or above which a prompt
            counts as a near-duplicate of an existing one.
        min_style_coverage: Minimum number of cases required per style tag.

    Returns:
        Human-readable error messages; an empty list means the cases are valid.
    """
    errors: list[str] = []
    if len(cases) != TARGET_COUNT:
        errors.append(f"Expected {TARGET_COUNT} prompts, got {len(cases)}")

    # Set iteration order for strings varies between interpreter runs; sort
    # once so the near-duplicate that gets reported is reproducible.
    ordered_existing = sorted(existing)

    seen: set[str] = set()
    skill_counts: Counter[str] = Counter()
    style_counts: Counter[str] = Counter()
    for index, case in enumerate(cases, start=1):
        prompt = case["prompt"]
        normalized = normalize_prompt(prompt)
        skill = case["expected"]["skill"]

        if normalized in seen:
            errors.append(f"Duplicate benchmark prompt: {prompt}")
        seen.add(normalized)

        if normalized in existing:
            # An exact match would normally also trip the similarity check
            # below; report the overlap once rather than twice.
            errors.append(f"Exact overlap with training/eval: {prompt}")
        else:
            for existing_prompt in ordered_existing:
                similarity = jaccard_similarity(normalized, existing_prompt)
                if similarity >= similarity_threshold:
                    errors.append(
                        f"High similarity ({similarity:.2f}) "
                        f"with existing prompt '{existing_prompt}': {prompt}"
                    )
                    # One near-duplicate is enough to flag the prompt.
                    break

        skill_counts[skill] += 1
        style_counts.update(case.get("styles", []))
        if skill not in BENCHMARK_SKILLS:
            errors.append(f"Case {index}: unsupported skill {skill}")

    for skill in BENCHMARK_SKILLS:
        if skill_counts[skill] == 0:
            errors.append(f"Missing skill coverage: {skill}")

    for style in ("slang", "typo", "incomplete", "conversational"):
        count = style_counts[style]
        if count < min_style_coverage:
            errors.append(
                f"Insufficient '{style}' coverage: {count} (need >= {min_style_coverage})"
            )
    return errors
def assign_ids(cases: list[dict]) -> list[dict]:
    """Return copies of ``cases`` with a sequential ``id`` key placed first.

    Ids are ``pa-001``, ``pa-002``, ... in input order (zero-padded to at least
    three digits). The input dicts are not modified. Because the case's own
    keys are merged after the generated id, a case that already carries an
    ``id`` keeps its own value.
    """
    return [
        {"id": f"pa-{position:03d}", **case}
        for position, case in enumerate(cases, start=1)
    ]
def main() -> None:
    """Validate the curated benchmark cases and write them to ``OUTPUT_PATH``.

    The cases in ``BENCHMARK_CASES`` receive ids and are validated against the
    prompts returned by ``load_existing_prompts``. If validation reports any
    error, every error is printed and the process exits with status 1. With
    ``--validate-only`` the function stops after a successful validation;
    otherwise it writes the cases as indented JSON and prints the number of
    cases per skill.
    """
    parser = argparse.ArgumentParser(description="Generate Pocket Automator benchmark prompts.")
    parser.add_argument(
        "--validate-only",
        action="store_true",
        # The flag checks the in-code BENCHMARK_CASES; OUTPUT_PATH is never read.
        help="Validate the curated benchmark cases without writing the output file.",
    )
    args = parser.parse_args()

    existing = load_existing_prompts()
    cases = assign_ids(BENCHMARK_CASES)
    errors = validate_cases(cases, existing)
    if errors:
        print("Validation failed:")
        for error in errors:
            print(f" - {error}")
        sys.exit(1)

    if args.validate_only:
        print(f"Validation passed for {len(cases)} benchmark prompts.")
        return

    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with OUTPUT_PATH.open("w", encoding="utf-8") as handle:
        json.dump(cases, handle, indent=2)
        # json.dump emits no trailing newline; add one so the file ends cleanly.
        handle.write("\n")

    skill_counts = Counter(case["expected"]["skill"] for case in cases)
    print(f"Wrote {len(cases)} benchmark prompts to {OUTPUT_PATH}")
    print("Skill distribution:")
    for skill in BENCHMARK_SKILLS:
        # A Counter returns 0 for skills that never occurred.
        print(f" {skill}: {skill_counts[skill]}")


if __name__ == "__main__":
    main()