#!/usr/bin/env python3 """Generate the Pocket Automator real-world benchmark suite. Produces data/pocket_benchmark_prompts.json — 200 held-out prompts designed to measure generalization with slang, typos, incomplete phrasing, and casual speech. Prompts are filtered against train_intent.jsonl and eval_intent_prompts.json. Usage: python scripts/generate_pocket_benchmark.py python scripts/generate_pocket_benchmark.py --validate-only """ from __future__ import annotations import argparse import json import re import sys from pathlib import Path PROJECT_ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(PROJECT_ROOT)) from src.pocket_benchmark import BENCHMARK_SKILLS, DOMAIN_BY_SKILL # noqa: E402 TRAIN_PATH = PROJECT_ROOT / "data" / "train_intent.jsonl" EVAL_PATH = PROJECT_ROOT / "data" / "eval_intent_prompts.json" OUTPUT_PATH = PROJECT_ROOT / "data" / "pocket_benchmark_prompts.json" TARGET_COUNT = 200 def _case( prompt: str, skill: str, parameters: dict, *, styles: list[str], ) -> dict: return { "prompt": prompt, "expected": {"skill": skill, "parameters": parameters}, "domain": DOMAIN_BY_SKILL[skill], "styles": styles, } # --------------------------------------------------------------------------- # Curated benchmark cases — phrasing intentionally unlike training templates # --------------------------------------------------------------------------- BENCHMARK_CASES: list[dict] = [ # --- Alarms (22) --- _case("yo set an alrm for like 5:45 tmrw morning pls", "create_alarm", {"time": "5:45", "day": "tmrw morning"}, styles=["slang", "typo", "conversational"]), _case("need to b up at 6ish on monday ngl", "create_alarm", {"time": "6ish", "day": "monday"}, styles=["slang", "incomplete", "conversational"]), _case("6am alarm pls", "create_alarm", {"time": "6am"}, styles=["incomplete"]), _case("wake me 7:20 wednesday thx", "create_alarm", {"time": "7:20", "day": "wednesday"}, styles=["incomplete", "conversational"]), _case("uh can u do a 10:15 pm thing tonight", "create_alarm", {"time": "10:15 pm", "day": "tonight"}, styles=["conversational", "incomplete"]), _case("tomorow 4:30 wake up plz", "create_alarm", {"time": "4:30", "day": "tomorow"}, styles=["typo", "incomplete"]), _case("deadass need alarm sunday noon", "create_alarm", {"time": "noon", "day": "sunday"}, styles=["slang", "incomplete"]), _case("ok wait actually make it 8:05 am next tuesday instead", "create_alarm", {"time": "8:05 am", "day": "next tuesday"}, styles=["conversational"]), _case("4:50 alarm for sat pls", "create_alarm", {"time": "4:50", "day": "sat"}, styles=["slang", "incomplete"]), _case("gonna crash early — buzz me at 9:30 pm today", "create_alarm", {"time": "9:30 pm", "day": "today"}, styles=["slang", "conversational"]), _case("hey siri vibes but pocket automator: 5 am thurs", "create_alarm", {"time": "5 am", "day": "thurs"}, styles=["slang", "conversational"]), _case("lmk when its 6:30 — jk just set alarm 6:30 friday", "create_alarm", {"time": "6:30", "day": "friday"}, styles=["slang", "conversational"]), _case("alarm?? 7:45 am this weekend", "create_alarm", {"time": "7:45 am", "day": "this weekend"}, styles=["incomplete", "conversational"]), _case("pls dont let me oversleep — 6:10 tomorrow", "create_alarm", {"time": "6:10", "day": "tomorrow"}, styles=["conversational"]), _case("5:55am wakeup call monday pls", "create_alarm", {"time": "5:55am", "day": "monday"}, styles=["incomplete"]), _case("set wakey wakey 8am tmr", "create_alarm", {"time": "8am", "day": "tmr"}, styles=["slang", "incomplete"]), _case("lowkey need 11:11 pm alarm tonight", "create_alarm", {"time": "11:11 pm", "day": "tonight"}, styles=["slang"]), _case("can i get a 7 am buzzer next monday", "create_alarm", {"time": "7 am", "day": "next monday"}, styles=["conversational"]), _case("alarm at half past six tomorrow morning", "create_alarm", {"time": "half past six", "day": "tomorrow morning"}, styles=["conversational"]), _case("6:00 alarm 4 wednesday", "create_alarm", {"time": "6:00", "day": "wednesday"}, styles=["typo", "incomplete"]), _case("wake me up round 9 on sunday", "create_alarm", {"time": "9", "day": "sunday"}, styles=["slang", "incomplete"]), _case("fr need 5:15 am alarm this tuesday", "create_alarm", {"time": "5:15 am", "day": "this tuesday"}, styles=["slang"]), # --- WhatsApp (22) --- _case("hit up zoe on whatsapp say im omw", "whatsapp_send_message", {"contact": "zoe", "message": "im omw"}, styles=["slang", "typo", "incomplete"]), _case("wa msg marcus 'running 20 min late'", "whatsapp_send_message", {"contact": "marcus", "message": "running 20 min late"}, styles=["slang", "incomplete"]), _case("tell roomie on watsapp dishes are done", "whatsapp_send_message", {"contact": "roomie", "message": "dishes are done"}, styles=["slang", "typo"]), _case("whastapp elena — meeting got moved to 4", "whatsapp_send_message", {"contact": "elena", "message": "meeting got moved to 4"}, styles=["typo", "incomplete", "conversational"]), _case("shoot kevin from work a text on whatsapp: heads up im wfh", "whatsapp_send_message", {"contact": "kevin from work", "message": "heads up im wfh"}, styles=["slang", "conversational"]), _case("msg grandpa on whatsapp happy bday!!", "whatsapp_send_message", {"contact": "grandpa", "message": "happy bday!!"}, styles=["slang", "typo"]), _case("yo ping sis saying im outside", "whatsapp_send_message", {"contact": "sis", "message": "im outside"}, styles=["slang", "incomplete"]), _case("whatsap daniel 'got ur package'", "whatsapp_send_message", {"contact": "daniel", "message": "got ur package"}, styles=["typo", "incomplete"]), _case("can u text aunt lisa on whatsapp that flight lands 6", "whatsapp_send_message", {"contact": "aunt lisa", "message": "flight lands 6"}, styles=["conversational", "incomplete"]), _case("drop a whatsapp to coach mike — practice cancelled", "whatsapp_send_message", {"contact": "coach mike", "message": "practice cancelled"}, styles=["slang", "incomplete"]), _case("whatsapp my landlord rent sent", "whatsapp_send_message", {"contact": "my landlord", "message": "rent sent"}, styles=["incomplete"]), _case("lemme msg hannah real quick on whatsapp: u free later?", "whatsapp_send_message", {"contact": "hannah", "message": "u free later?"}, styles=["slang", "conversational"]), _case("send whatsapp to benji thx for covering my shift", "whatsapp_send_message", {"contact": "benji", "message": "thx for covering my shift"}, styles=["slang", "incomplete"]), _case("text whatsapp to carpool group running late again sry", "whatsapp_send_message", {"contact": "carpool group", "message": "running late again sry"}, styles=["slang", "incomplete"]), _case("ok whatsapp olivia saying docs r uploaded", "whatsapp_send_message", {"contact": "olivia", "message": "docs r uploaded"}, styles=["slang", "typo", "conversational"]), _case("wahtapp msg to tomas — keys under mat", "whatsapp_send_message", {"contact": "tomas", "message": "keys under mat"}, styles=["typo", "incomplete"]), _case("just tell jules on whatsapp im grabbing boba", "whatsapp_send_message", {"contact": "jules", "message": "im grabbing boba"}, styles=["slang", "conversational"]), _case("whatsapp neighbor hey can u sign for delivery", "whatsapp_send_message", {"contact": "neighbor", "message": "hey can u sign for delivery"}, styles=["incomplete", "conversational"]), _case("msg whatsapp to dr patel appointment confirmed", "whatsapp_send_message", {"contact": "dr patel", "message": "appointment confirmed"}, styles=["incomplete"]), _case("hit my brother on whatsapp w/ 'u still coming?'", "whatsapp_send_message", {"contact": "my brother", "message": "u still coming?"}, styles=["slang", "incomplete"]), _case("whatsapp bestie ngl im stressed af today", "whatsapp_send_message", {"contact": "bestie", "message": "ngl im stressed af today"}, styles=["slang", "incomplete"]), _case("send whatsapp msg to felix — slide deck is ready", "whatsapp_send_message", {"contact": "felix", "message": "slide deck is ready"}, styles=["incomplete"]), # --- Spotify pause (22) --- _case("spotfy stop rn", "spotify_pause", {}, styles=["typo", "slang", "incomplete"]), _case("yo pause the music", "spotify_pause", {}, styles=["slang", "incomplete"]), _case("mute spotify pls im on a call", "spotify_pause", {}, styles=["conversational"]), _case("can u kill the spotify track", "spotify_pause", {}, styles=["slang", "conversational"]), _case("hold up pause spotify", "spotify_pause", {}, styles=["slang", "incomplete"]), _case("spoitfy pause rq", "spotify_pause", {}, styles=["typo", "slang", "incomplete"]), _case("stop spotify playback thx", "spotify_pause", {}, styles=["conversational"]), _case("shhh pause spotify", "spotify_pause", {}, styles=["incomplete", "conversational"]), _case("spotify off for a sec", "spotify_pause", {}, styles=["slang", "incomplete"]), _case("pause whats playing on spotify", "spotify_pause", {}, styles=["incomplete"]), _case("ok stop spotify music", "spotify_pause", {}, styles=["conversational", "incomplete"]), _case("spotify halt plz", "spotify_pause", {}, styles=["incomplete"]), _case("silence spotify real quick", "spotify_pause", {}, styles=["conversational"]), _case("pause spotify — landlord knocking", "spotify_pause", {}, styles=["conversational"]), _case("spotify stop the banger lol", "spotify_pause", {}, styles=["slang", "conversational"]), _case("cut spotify audio", "spotify_pause", {}, styles=["slang", "incomplete"]), _case("spotify pause pls headphones dying", "spotify_pause", {}, styles=["conversational"]), _case("uh pause spotify", "spotify_pause", {}, styles=["conversational", "incomplete"]), _case("stop the spotify pls", "spotify_pause", {}, styles=["incomplete"]), _case("spotify freeze for now pls", "spotify_pause", {}, styles=["incomplete", "conversational"]), _case("pause my spotify session", "spotify_pause", {}, styles=["incomplete"]), _case("spotify quiet for now", "spotify_pause", {}, styles=["incomplete", "conversational"]), # --- Spotify play playlist (22) --- _case("put on my midnight drives mix on spotify", "spotify_play_playlist", {"playlist": "midnight drives mix"}, styles=["conversational"]), _case("spotify play sunday scaries", "spotify_play_playlist", {"playlist": "sunday scaries"}, styles=["incomplete"]), _case("queue up deep focus vibes on spotify", "spotify_play_playlist", {"playlist": "deep focus vibes"}, styles=["slang"]), _case("play my 2010s throwbacks spotify", "spotify_play_playlist", {"playlist": "2010s throwbacks"}, styles=["incomplete"]), _case("spotfy playlist: rainy day jazz", "spotify_play_playlist", {"playlist": "rainy day jazz"}, styles=["typo", "incomplete"]), _case("yo spin the gym beast mode playlist", "spotify_play_playlist", {"playlist": "gym beast mode"}, styles=["slang", "incomplete"]), _case("spotify — my cooking playlist pls", "spotify_play_playlist", {"playlist": "cooking"}, styles=["incomplete", "conversational"]), _case("start spotify playlist called roadtrip 2024", "spotify_play_playlist", {"playlist": "roadtrip 2024"}, styles=["incomplete"]), _case("play hyperpop mix on spotify", "spotify_play_playlist", {"playlist": "hyperpop mix"}, styles=["incomplete"]), _case("spotify my chillhop essentials thx", "spotify_play_playlist", {"playlist": "chillhop essentials"}, styles=["incomplete", "conversational"]), _case("can u play the study grind playlist on spotify", "spotify_play_playlist", {"playlist": "study grind"}, styles=["conversational"]), _case("spotify playlist sleep sounds go", "spotify_play_playlist", {"playlist": "sleep sounds"}, styles=["incomplete"]), _case("throw on indie roadtrip on spotify", "spotify_play_playlist", {"playlist": "indie roadtrip"}, styles=["slang", "incomplete"]), _case("spotify play my liked tracks playlist", "spotify_play_playlist", {"playlist": "liked tracks"}, styles=["incomplete"]), _case("play spotify playlist bossa nova mornings", "spotify_play_playlist", {"playlist": "bossa nova mornings"}, styles=["incomplete"]), _case("spotify — queue phonk workout", "spotify_play_playlist", {"playlist": "phonk workout"}, styles=["incomplete"]), _case("put spotify on my cozy rainy day list", "spotify_play_playlist", {"playlist": "cozy rainy day"}, styles=["conversational", "incomplete"]), _case("play the spotify playlist french cafe", "spotify_play_playlist", {"playlist": "french cafe"}, styles=["incomplete"]), _case("spotify start my pregame playlist", "spotify_play_playlist", {"playlist": "pregame"}, styles=["slang", "incomplete"]), _case("ok spotify play downtempo focus", "spotify_play_playlist", {"playlist": "downtempo focus"}, styles=["conversational", "incomplete"]), _case("spotify my discover mix playlist pls", "spotify_play_playlist", {"playlist": "discover mix"}, styles=["incomplete"]), _case("play playlist morning commute on spotify", "spotify_play_playlist", {"playlist": "morning commute"}, styles=["incomplete"]), # --- Spotify search play (22) --- _case("spotify find bad bunny and play", "spotify_search_play", {"query": "bad bunny"}, styles=["incomplete"]), _case("search spotify for lofi hip hop then play", "spotify_search_play", {"query": "lofi hip hop"}, styles=["conversational"]), _case("play some arctic monkeys on spotify", "spotify_search_play", {"query": "arctic monkeys"}, styles=["conversational", "incomplete"]), _case("spotfy search billie eilish play it", "spotify_search_play", {"query": "billie eilish"}, styles=["typo", "incomplete"]), _case("look up sabrina carpenter on spotify n play", "spotify_search_play", {"query": "sabrina carpenter"}, styles=["slang", "incomplete"]), _case("spotify: search neo soul and hit play", "spotify_search_play", {"query": "neo soul"}, styles=["incomplete"]), _case("find tyler the creator on spotify and play", "spotify_search_play", {"query": "tyler the creator"}, styles=["incomplete"]), _case("spotify play after searching radiohead", "spotify_search_play", {"query": "radiohead"}, styles=["incomplete"]), _case("yo spotify search dnb and play", "spotify_search_play", {"query": "dnb"}, styles=["slang", "incomplete"]), _case("can u find frank ocean on spotify and play it", "spotify_search_play", {"query": "frank ocean"}, styles=["conversational"]), _case("spotify search classical piano play", "spotify_search_play", {"query": "classical piano"}, styles=["incomplete"]), _case("spotify search play some drake tracks", "spotify_search_play", {"query": "drake tracks"}, styles=["incomplete"]), _case("spotify find latin reggaeton and play", "spotify_search_play", {"query": "latin reggaeton"}, styles=["incomplete"]), _case("search spotify for aphex twin then play", "spotify_search_play", {"query": "aphex twin"}, styles=["incomplete"]), _case("spotify play whatever u find for synthwave", "spotify_search_play", {"query": "synthwave"}, styles=["slang", "conversational"]), _case("find mitski on spotify play pls", "spotify_search_play", {"query": "mitski"}, styles=["incomplete", "conversational"]), _case("soptify search play khruangbin", "spotify_search_play", {"query": "khruangbin"}, styles=["typo", "incomplete"]), _case("look up burna boy spotify and play", "spotify_search_play", {"query": "burna boy"}, styles=["incomplete"]), _case("spotify search play some afrobeats", "spotify_search_play", {"query": "afrobeats"}, styles=["incomplete"]), _case("find fred again on spotify n play it", "spotify_search_play", {"query": "fred again"}, styles=["slang", "incomplete"]), _case("spotify search play 90s r&b", "spotify_search_play", {"query": "90s r&b"}, styles=["incomplete"]), _case("search spotify play tame impala", "spotify_search_play", {"query": "tame impala"}, styles=["incomplete"]), # --- Uber (22) --- _case("ubber to jfk pls", "uber_request_ride", {"destination": "jfk"}, styles=["typo", "incomplete", "conversational"]), _case("need a ride to 42 oak street on uber", "uber_request_ride", {"destination": "42 oak street"}, styles=["conversational"]), _case("hop in an uber to the vet clinic", "uber_request_ride", {"destination": "the vet clinic"}, styles=["slang"]), _case("ubr me to whole foods on 5th", "uber_request_ride", {"destination": "whole foods on 5th"}, styles=["typo", "slang", "incomplete"]), _case("get uber to coworking space downtown", "uber_request_ride", {"destination": "coworking space downtown"}, styles=["incomplete"]), _case("book ride to mom's house via uber", "uber_request_ride", {"destination": "mom's house"}, styles=["incomplete"]), _case("uber — take me to pier 39", "uber_request_ride", {"destination": "pier 39"}, styles=["incomplete", "conversational"]), _case("can u call uber to stanford campus", "uber_request_ride", {"destination": "stanford campus"}, styles=["conversational"]), _case("ride to airport terminal 2 uber", "uber_request_ride", {"destination": "airport terminal 2"}, styles=["incomplete"]), _case("uber to the sushi place on main", "uber_request_ride", {"destination": "the sushi place on main"}, styles=["incomplete", "conversational"]), _case("need uber to ikea rn", "uber_request_ride", {"destination": "ikea"}, styles=["slang", "incomplete"]), _case("request uber to brooklyn bridge park", "uber_request_ride", {"destination": "brooklyn bridge park"}, styles=["incomplete"]), _case("uber drop me at la guardia", "uber_request_ride", {"destination": "la guardia"}, styles=["slang", "incomplete"]), _case("get a cab on uber to dentist on maple ave", "uber_request_ride", {"destination": "dentist on maple ave"}, styles=["conversational"]), _case("uber to friends apartment 8b", "uber_request_ride", {"destination": "friends apartment 8b"}, styles=["incomplete"]), _case("yo uber to the concert venue", "uber_request_ride", {"destination": "the concert venue"}, styles=["slang", "incomplete"]), _case("book uber ride to seattle tacoma airport", "uber_request_ride", {"destination": "seattle tacoma airport"}, styles=["incomplete"]), _case("uber pls to union station", "uber_request_ride", {"destination": "union station"}, styles=["incomplete", "conversational"]), _case("ride share to target on broadway uber", "uber_request_ride", {"destination": "target on broadway"}, styles=["incomplete"]), _case("uber me over to the marriott hotel", "uber_request_ride", {"destination": "the marriott hotel"}, styles=["slang", "incomplete"]), _case("need uber to physio appointment on elm", "uber_request_ride", {"destination": "physio appointment on elm"}, styles=["incomplete"]), _case("call uber to the dog park", "uber_request_ride", {"destination": "the dog park"}, styles=["incomplete"]), # --- Gmail (22) --- _case("gmal email recruiter saying im interested in the role", "gmail_send_email", {"recipient": "recruiter", "message": "im interested in the role"}, styles=["typo", "incomplete"]), _case("shoot an email via gmail to finance@acme.co — invoice overdue", "gmail_send_email", {"recipient": "finance@acme.co", "message": "invoice overdue"}, styles=["slang", "conversational"]), _case("gmail landlord asking about lease renewal", "gmail_send_email", {"recipient": "landlord", "message": "asking about lease renewal"}, styles=["incomplete"]), _case("email my manager on gmail: heads up sick day tomorrow", "gmail_send_email", {"recipient": "my manager", "message": "heads up sick day tomorrow"}, styles=["conversational"]), _case("compose gmail to nora.park@design.io portfolio link attached", "gmail_send_email", {"recipient": "nora.park@design.io", "message": "portfolio link attached"}, styles=["incomplete"]), _case("gmail msg to thesis advisor draft attached for review", "gmail_send_email", {"recipient": "thesis advisor", "message": "draft attached for review"}, styles=["incomplete"]), _case("send gmail to carpool buddy running 10 late", "gmail_send_email", {"recipient": "carpool buddy", "message": "running 10 late"}, styles=["slang", "incomplete"]), _case("write gmail to billing@saas.com cancel subscription pls", "gmail_send_email", {"recipient": "billing@saas.com", "message": "cancel subscription pls"}, styles=["conversational"]), _case("gmail email mentor thx for the intro", "gmail_send_email", {"recipient": "mentor", "message": "thx for the intro"}, styles=["slang", "incomplete"]), _case("ok gmail to ops-team@startup.dev deploy failed rollback done", "gmail_send_email", {"recipient": "ops-team@startup.dev", "message": "deploy failed rollback done"}, styles=["conversational", "incomplete"]), _case("email thru gmail to dentist office reschedule appt", "gmail_send_email", {"recipient": "dentist office", "message": "reschedule appt"}, styles=["incomplete"]), _case("gmail send to contractor quote approved lets proceed", "gmail_send_email", {"recipient": "contractor", "message": "quote approved lets proceed"}, styles=["incomplete"]), _case("hey gmail my professor question about deadline extension", "gmail_send_email", {"recipient": "my professor", "message": "question about deadline extension"}, styles=["conversational", "incomplete"]), _case("gmail to hr@corp.net pto request for next week", "gmail_send_email", {"recipient": "hr@corp.net", "message": "pto request for next week"}, styles=["incomplete"]), _case("send email gmail to vendor payment sent today", "gmail_send_email", {"recipient": "vendor", "message": "payment sent today"}, styles=["incomplete"]), _case("gmial client@studio.com mockups ready for feedback", "gmail_send_email", {"recipient": "client@studio.com", "message": "mockups ready for feedback"}, styles=["typo", "incomplete"]), _case("compose email on gmail to roommate utilities split attached", "gmail_send_email", {"recipient": "roommate", "message": "utilities split attached"}, styles=["incomplete"]), _case("gmail to travel agent pls confirm hotel booking", "gmail_send_email", {"recipient": "travel agent", "message": "pls confirm hotel booking"}, styles=["conversational"]), _case("email gmail coach thanks for the session notes", "gmail_send_email", {"recipient": "coach", "message": "thanks for the session notes"}, styles=["incomplete"]), _case("gmail msg to editor final chapter attached", "gmail_send_email", {"recipient": "editor", "message": "final chapter attached"}, styles=["incomplete"]), _case("send gmail to internship lead following up on application", "gmail_send_email", {"recipient": "internship lead", "message": "following up on application"}, styles=["incomplete"]), _case("gmail email to dad flight details attached", "gmail_send_email", {"recipient": "dad", "message": "flight details attached"}, styles=["incomplete"]), # --- Calendar (24) --- _case("put dentist on calender next thurs 3pm", "calendar_create_event", {"title": "dentist", "date": "next thurs", "time": "3pm"}, styles=["typo", "incomplete"]), _case("schedule 1:1 w/ manager monday 11am", "calendar_create_event", {"title": "1:1 w/ manager", "date": "monday", "time": "11am"}, styles=["slang", "incomplete"]), _case("add calendar block focus time tuesday 9-11", "calendar_create_event", {"title": "focus time", "date": "tuesday", "time": "9-11"}, styles=["incomplete"]), _case("book therapy session friday 5 pm on calendar", "calendar_create_event", {"title": "therapy session", "date": "friday", "time": "5 pm"}, styles=["incomplete"]), _case("cal event: team retro wednesday 4pm", "calendar_create_event", {"title": "team retro", "date": "wednesday", "time": "4pm"}, styles=["slang", "incomplete"]), _case("put parent teacher conf on calendar oct 12 6pm", "calendar_create_event", {"title": "parent teacher conf", "date": "oct 12", "time": "6pm"}, styles=["incomplete"]), _case("schedule gym class saturday 8 am calendar", "calendar_create_event", {"title": "gym class", "date": "saturday", "time": "8 am"}, styles=["incomplete"]), _case("add calendar entry flight to nyc sunday 7:45 am", "calendar_create_event", {"title": "flight to nyc", "date": "sunday", "time": "7:45 am"}, styles=["incomplete"]), _case("calendar — lunch w/ priya tomorrow noon", "calendar_create_event", {"title": "lunch w/ priya", "date": "tomorrow", "time": "noon"}, styles=["slang", "incomplete", "conversational"]), _case("set cal event sprint planning next monday 10", "calendar_create_event", {"title": "sprint planning", "date": "next monday", "time": "10"}, styles=["slang", "incomplete"]), _case("put car inspection on calendar tuesday 2:30 pm", "calendar_create_event", {"title": "car inspection", "date": "tuesday", "time": "2:30 pm"}, styles=["incomplete"]), _case("scheduel interview prep wednesday 6 pm", "calendar_create_event", {"title": "interview prep", "date": "wednesday", "time": "6 pm"}, styles=["typo", "incomplete"]), _case("add to calendar vet appt thursday 1 pm", "calendar_create_event", {"title": "vet appt", "date": "thursday", "time": "1 pm"}, styles=["incomplete"]), _case("calendar event board presentation friday 9 am", "calendar_create_event", {"title": "board presentation", "date": "friday", "time": "9 am"}, styles=["incomplete"]), _case("book calendar slot date night saturday 8 pm", "calendar_create_event", {"title": "date night", "date": "saturday", "time": "8 pm"}, styles=["incomplete"]), _case("put standup notes review on cal monday 9:30", "calendar_create_event", {"title": "standup notes review", "date": "monday", "time": "9:30"}, styles=["slang", "incomplete"]), _case("schedule movers arrival next saturday 10 am", "calendar_create_event", {"title": "movers arrival", "date": "next saturday", "time": "10 am"}, styles=["incomplete"]), _case("add calendar tax appointment april 5 11 am", "calendar_create_event", {"title": "tax appointment", "date": "april 5", "time": "11 am"}, styles=["incomplete"]), _case("calender block deep work thursday morning 8", "calendar_create_event", {"title": "deep work", "date": "thursday morning", "time": "8"}, styles=["typo", "incomplete"]), _case("put wedding rehearsal on calendar june 20 5 pm", "calendar_create_event", {"title": "wedding rehearsal", "date": "june 20", "time": "5 pm"}, styles=["incomplete"]), _case("schedule product demo next wednesday 3 pm", "calendar_create_event", {"title": "product demo", "date": "next wednesday", "time": "3 pm"}, styles=["incomplete"]), _case("add calendar coffee chat friday 10:30 am", "calendar_create_event", {"title": "coffee chat", "date": "friday", "time": "10:30 am"}, styles=["incomplete"]), _case("book calendar orthodontist monday 4:15 pm", "calendar_create_event", {"title": "orthodontist", "date": "monday", "time": "4:15 pm"}, styles=["incomplete"]), _case("put hackathon submission deadline on calendar sunday midnight", "calendar_create_event", {"title": "hackathon submission deadline", "date": "sunday", "time": "midnight"}, styles=["incomplete"]), # --- Slack (22) --- _case("open slack #incidents pls", "slack_open_channel", {"channel": "incidents"}, styles=["incomplete", "conversational"]), _case("jump to product-launch channel on slack", "slack_open_channel", {"channel": "product-launch"}, styles=["slang", "incomplete"]), _case("slack go to team-watercooler", "slack_open_channel", {"channel": "team-watercooler"}, styles=["incomplete"]), _case("pull up slack channel ios-bugs", "slack_open_channel", {"channel": "ios-bugs"}, styles=["slang", "incomplete"]), _case("switch slack to growth-experiments", "slack_open_channel", {"channel": "growth-experiments"}, styles=["incomplete"]), _case("show slack channel design-system", "slack_open_channel", {"channel": "design-system"}, styles=["incomplete"]), _case("navigate slack to platform-oncall", "slack_open_channel", {"channel": "platform-oncall"}, styles=["incomplete"]), _case("open the slack channel ml-research thx", "slack_open_channel", {"channel": "ml-research"}, styles=["conversational"]), _case("slack channel customer-escalations open it", "slack_open_channel", {"channel": "customer-escalations"}, styles=["incomplete"]), _case("take me to #backend-standup in slack", "slack_open_channel", {"channel": "backend-standup"}, styles=["incomplete"]), _case("open slack for channel release-train", "slack_open_channel", {"channel": "release-train"}, styles=["incomplete"]), _case("slack — show me hiring-pipeline channel", "slack_open_channel", {"channel": "hiring-pipeline"}, styles=["conversational", "incomplete"]), _case("go slack channel infra-alerts", "slack_open_channel", {"channel": "infra-alerts"}, styles=["incomplete"]), _case("open slak channel mobile-crash-reports", "slack_open_channel", {"channel": "mobile-crash-reports"}, styles=["typo", "incomplete"]), _case("pull slack up on qa-automation channel", "slack_open_channel", {"channel": "qa-automation"}, styles=["incomplete"]), _case("switch to slack channel docs-and-wiki", "slack_open_channel", {"channel": "docs-and-wiki"}, styles=["incomplete"]), _case("open slack growth-metrics channel rq", "slack_open_channel", {"channel": "growth-metrics"}, styles=["slang", "incomplete"]), _case("show me slack #sales-wins", "slack_open_channel", {"channel": "sales-wins"}, styles=["incomplete"]), _case("navigate to slack channel partner-integrations", "slack_open_channel", {"channel": "partner-integrations"}, styles=["incomplete"]), _case("open slack channel called observability", "slack_open_channel", {"channel": "observability"}, styles=["incomplete"]), _case("slack open eng-hiring channel pls", "slack_open_channel", {"channel": "eng-hiring"}, styles=["incomplete", "conversational"]), _case("jump slack to ai-playground channel", "slack_open_channel", {"channel": "ai-playground"}, styles=["slang", "incomplete"]), ] def normalize_prompt(text: str) -> str: return " ".join(text.lower().strip().split()) def token_set(text: str) -> set[str]: return set(re.findall(r"[a-z0-9@]+", text.lower())) def jaccard_similarity(left: str, right: str) -> float: left_tokens = token_set(left) right_tokens = token_set(right) if not left_tokens or not right_tokens: return 0.0 return len(left_tokens & right_tokens) / len(left_tokens | right_tokens) def load_existing_prompts() -> set[str]: prompts: set[str] = set() if TRAIN_PATH.exists(): with TRAIN_PATH.open(encoding="utf-8") as handle: for line in handle: record = json.loads(line) for message in record["messages"]: if message["role"] == "user": prompts.add(normalize_prompt(message["content"])) if EVAL_PATH.exists(): with EVAL_PATH.open(encoding="utf-8") as handle: for case in json.load(handle): prompts.add(normalize_prompt(case["prompt"])) return prompts def validate_cases(cases: list[dict], existing: set[str]) -> list[str]: errors: list[str] = [] seen: set[str] = set() if len(cases) != TARGET_COUNT: errors.append(f"Expected {TARGET_COUNT} prompts, got {len(cases)}") skill_counts: dict[str, int] = {} style_counts: dict[str, int] = {} for index, case in enumerate(cases, start=1): prompt = case["prompt"] normalized = normalize_prompt(prompt) skill = case["expected"]["skill"] if normalized in seen: errors.append(f"Duplicate benchmark prompt: {prompt}") seen.add(normalized) if normalized in existing: errors.append(f"Exact overlap with training/eval: {prompt}") for existing_prompt in existing: if jaccard_similarity(normalized, existing_prompt) >= 0.85: errors.append( f"High similarity ({jaccard_similarity(normalized, existing_prompt):.2f}) " f"with existing prompt '{existing_prompt}': {prompt}" ) break skill_counts[skill] = skill_counts.get(skill, 0) + 1 for style in case.get("styles", []): style_counts[style] = style_counts.get(style, 0) + 1 if skill not in BENCHMARK_SKILLS: errors.append(f"Case {index}: unsupported skill {skill}") for skill in BENCHMARK_SKILLS: count = skill_counts.get(skill, 0) if count == 0: errors.append(f"Missing skill coverage: {skill}") for style in ("slang", "typo", "incomplete", "conversational"): count = style_counts.get(style, 0) if count < 20: errors.append(f"Insufficient '{style}' coverage: {count} (need >= 20)") return errors def assign_ids(cases: list[dict]) -> list[dict]: output: list[dict] = [] for index, case in enumerate(cases, start=1): output.append({"id": f"pa-{index:03d}", **case}) return output def main() -> None: parser = argparse.ArgumentParser(description="Generate Pocket Automator benchmark prompts.") parser.add_argument( "--validate-only", action="store_true", help="Validate existing benchmark file without rewriting it.", ) args = parser.parse_args() existing = load_existing_prompts() cases = assign_ids(BENCHMARK_CASES) errors = validate_cases(cases, existing) if errors: print("Validation failed:") for error in errors: print(f" - {error}") sys.exit(1) if args.validate_only: print(f"Validation passed for {len(cases)} benchmark prompts.") return OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True) with OUTPUT_PATH.open("w", encoding="utf-8") as handle: json.dump(cases, handle, indent=2) handle.write("\n") skill_counts = {} for case in cases: skill = case["expected"]["skill"] skill_counts[skill] = skill_counts.get(skill, 0) + 1 print(f"Wrote {len(cases)} benchmark prompts to {OUTPUT_PATH}") print("Skill distribution:") for skill in BENCHMARK_SKILLS: print(f" {skill}: {skill_counts.get(skill, 0)}") if __name__ == "__main__": main()