| |
| """Generate training JSONL with prompt variations for each skill in skills.jsonl.""" |
|
|
| import json |
| import re |
| import sys |
| from pathlib import Path |
|
|
# Repository root, taken as the grandparent directory of this file.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Prepend the root to the import path so project-local packages resolve when
# this file is executed directly.
sys.path.insert(0, str(PROJECT_ROOT))
|
|
| from src.classifier_prompt import SYSTEM_PROMPT |
|
|
# Skill definitions read by this script and the training set it writes.
INPUT_FILE = PROJECT_ROOT.joinpath("data", "skills.jsonl")
OUTPUT_FILE = PROJECT_ROOT.joinpath("data", "train.jsonl")
# Upper bound passed to the prompt de-duplicator by every per-skill generator.
VARIATIONS_PER_SKILL = 30
|
|
| |
# Hand-picked prompt -> skill pairs. Keys are the exact user prompts; values
# are skill identifiers. Insertion order is the order in which they are
# offered to the output.
MUST_INCLUDE_PROMPTS: dict[str, str] = {
    "play my workout playlist": "spotify_play_playlist",
    "turn bluetooth on": "bluetooth_enable",
    "wake me up tomorrow morning": "create_alarm",
    "send ri a message on whatsapp": "whatsapp_send_message",
    "send ri a message": "whatsapp_send_message",
    "send alex a message": "whatsapp_send_message",
    "open the engineering channel in slack": "slack_open_channel",
    "pause spotify": "spotify_pause",
    "book an uber to the airport": "uber_request_ride",
    "search pasta recipes on youtube": "youtube_search",
    "email my team saying project update": "gmail_send_email",
    "send gmail to boss saying i'll be late": "gmail_send_email",
    "write mail to john@company.com saying hello": "gmail_send_email",
    "compose an email to sarah@gmail.com thanks for your help": "gmail_send_email",
    "find parag shah in contacts": "contacts_search",
    "look up john smith in my contacts": "contacts_search",
    "search contact parag shah": "contacts_search",
    "open slack channel data contributors": "slack_open_channel",
    "go to general channel in slack": "slack_open_channel",
    "navigate to data contributors on slack": "slack_open_channel",
}
|
|
|
|
| def _unique_lower(prompts: list[str], limit: int) -> list[str]: |
| seen: set[str] = set() |
| result: list[str] = [] |
| for prompt in prompts: |
| key = prompt.lower().strip() |
| if not key or key in seen: |
| continue |
| seen.add(key) |
| result.append(prompt.strip()) |
| if len(result) >= limit: |
| break |
| return result |
|
|
|
|
| def _assistant_content(skill: str) -> str: |
| return json.dumps({"skill": skill}, separators=(",", ":")) |
|
|
|
|
def _record(skill: str, user_content: str) -> dict:
    """Build one training example: system, user and assistant messages.

    The assistant message carries the JSON-encoded ``skill`` label.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": user_content}
    assistant_turn = {"role": "assistant", "content": _assistant_content(skill)}
    return {"messages": [system_turn, user_turn, assistant_turn]}
|
|
|
|
| def _parse_alarm(task: str) -> tuple[str, str]: |
| time_match = re.search( |
| r"(\d{1,2}(?::\d{2})?\s*(?:am|pm)|seven|eight|nine|six|five)", |
| task, |
| re.IGNORECASE, |
| ) |
| when_match = re.search( |
| r"(tomorrow(?:\s+morning)?|monday|tuesday|wednesday|thursday|friday|saturday|sunday|today)", |
| task, |
| re.IGNORECASE, |
| ) |
| time_str = time_match.group(1).lower() if time_match else "7 am" |
| when_str = when_match.group(1).lower() if when_match else "tomorrow" |
| return time_str, when_str |
|
|
|
|
def generate_create_alarm(task: str) -> list[str]:
    """Return prompt variations that ask to create an alarm.

    The time and day are parsed from ``task`` and substituted into a fixed
    list of templates; the original task is always the first candidate.

    Args:
        task: The seed alarm request.

    Returns:
        At most ``VARIATIONS_PER_SKILL`` distinct prompts.
    """
    time_str, when_str = _parse_alarm(task)
    # The parser may hand back "tomorrow morning". Templates that append
    # "morning" themselves use the bare day so the word is not doubled
    # ("... tomorrow morning morning").
    day = re.sub(r"\s+morning$", "", when_str)
    return _unique_lower(
        [
            task,
            f"create alarm for {time_str} {when_str}",
            f"set alarm for {time_str} {when_str}",
            f"wake me up at {time_str} {when_str}",
            f"alarm {when_str} {time_str}",
            f"set a {time_str} alarm for {when_str}",
            f"create {time_str} alarm {day} morning",
            f"put an alarm for {time_str} {when_str}",
            f"schedule alarm {time_str} {when_str}",
            f"set wake up alarm {time_str} {when_str}",
            f"alarm at {time_str} {day} morning",
            f"create {when_str} alarm {time_str}",
            f"set my alarm for {time_str} {when_str}",
            f"need alarm {when_str} {time_str}",
            f"{time_str} alarm {when_str} please",
            f"wake up alarm {when_str} {time_str}",
            f"set alarm {time_str} {when_str}",
            f"alarm me at {time_str} {when_str}",
            f"set daily alarm {time_str}",
            f"wake me at {time_str} {when_str}",
            "wake me up tomorrow morning",
            f"set alarm {day} morning {time_str}",
            f"alarm for {time_str} {when_str}",
            f"create early alarm {time_str} {when_str}",
            f"alarm {when_str} {time_str} sharp",
            f"schedule {time_str} wake up {when_str}",
            f"put {time_str} alarm on for {when_str}",
            f"set {when_str} {time_str} alarm",
            f"remind me at {time_str} {when_str}",
            "set an alarm",
            f"wake me up at {time_str}",
            "create a morning alarm",
            f"alarm for {day} morning",
        ],
        VARIATIONS_PER_SKILL,
    )
|
|
|
|
| def _parse_calendar(task: str) -> tuple[str, str, str]: |
| time_match = re.search(r"(\d{1,2}\s*(?:am|pm)|noon)", task, re.IGNORECASE) |
| when_match = re.search( |
| r"(tomorrow|monday|tuesday|wednesday|thursday|friday|saturday|sunday|next\s+\w+day)", |
| task, |
| re.IGNORECASE, |
| ) |
| title_match = re.search( |
| r"(?:stating|titled|called|about|with)\s+(.+)$", |
| task, |
| re.IGNORECASE, |
| ) |
| time_str = time_match.group(1).lower() if time_match else "4 pm" |
| when_str = when_match.group(1).lower() if when_match else "tomorrow" |
| title = title_match.group(1).strip() if title_match else "meeting" |
| return when_str, time_str, title |
|
|
|
|
def generate_calendar_create_event(task: str) -> list[str]:
    """Return prompt variations that ask to create a calendar event.

    Day, time and title are parsed from ``task`` and substituted into
    templates, which are mixed with fixed example prompts. The original task
    is always the first candidate.

    Args:
        task: The seed calendar request.

    Returns:
        At most ``VARIATIONS_PER_SKILL`` distinct prompts.
    """
    when_str, time_str, title = _parse_calendar(task)
    # One template names the part of day; derive it from the parsed time so
    # an am time is not paired with "afternoon".
    part_of_day = "morning" if time_str.endswith("am") else "afternoon"
    return _unique_lower(
        [
            task,
            f"create calendar event for {when_str} {time_str} {title}",
            f"add a meeting {when_str} at {time_str} {title}",
            f"schedule {title} {when_str} {time_str}",
            f"put {title} on my calendar {when_str} at {time_str}",
            f"create event {when_str} {time_str} {title}",
            f"add calendar event {when_str} {time_str} {title}",
            f"schedule {when_str} {time_str} {title}",
            f"book a meeting {when_str} at {time_str} {title}",
            f"create a calendar entry {when_str} {time_str} {title}",
            f"add meeting {title} {when_str} {part_of_day} {time_str}",
            f"set up a meeting {when_str} {time_str} {title}",
            f"calendar meeting {when_str} {time_str} {title}",
            "create event for team standup monday 9 am",
            "add dentist appointment friday 2 pm",
            "schedule lunch with alex wednesday noon",
            "put birthday party on calendar saturday 6 pm",
            "create calendar event project review thursday 3 pm",
            "add doctor visit next tuesday 10 am",
            "schedule interview monday 11 am",
            "create meeting friday 5 pm sync with team",
            "add gym session tomorrow 7 am to calendar",
            "schedule coffee with sam thursday 4 pm",
            "create calendar reminder presentation monday 2 pm",
            "add flight to calendar sunday 8 am",
            "schedule call with client wednesday 3 pm",
            "put dinner reservation on calendar tonight 8 pm",
            "create event study session saturday 10 am",
            "add yoga class to calendar tuesday 6 pm",
            "schedule 1 on 1 with manager friday 1 pm",
            f"create calendar event {when_str} {time_str} {title}",
        ],
        VARIATIONS_PER_SKILL,
    )
|
|
|
|
def generate_wifi_enable(task: str) -> list[str]:
    """Return ``task`` followed by fixed phrasings for turning Wi-Fi on.

    The combined list is de-duplicated and capped at ``VARIATIONS_PER_SKILL``.
    """
    phrasings = (
        "enable wifi",
        "turn on wifi",
        "switch on wifi",
        "activate wifi",
        "wifi on",
        "turn wifi on please",
        "enable wifi on my phone",
        "switch wifi on",
        "start wifi",
        "put wifi on",
        "can you turn on wifi",
        "wifi enable",
        "turn on my wifi",
        "enable wireless network",
        "activate wifi connection",
        "switch on my wifi",
        "wifi on now",
        "please enable wifi",
        "turn wifi back on",
        "enable wifi settings",
        "flip wifi on",
        "get wifi running",
        "power on wifi",
        "i need wifi on",
        "enable wlan",
        "turn wlan on",
        "wifi should be on",
        "make sure wifi is enabled",
        "set wifi to on",
        "connect to wifi",
    )
    candidates = [task, *phrasings]
    return _unique_lower(candidates, VARIATIONS_PER_SKILL)
|
|
|
|
def generate_bluetooth_enable(task: str) -> list[str]:
    """Return ``task`` followed by fixed phrasings for turning Bluetooth on.

    The combined list is de-duplicated and capped at ``VARIATIONS_PER_SKILL``.
    """
    phrasings = (
        "turn on bluetooth",
        "enable bluetooth",
        "switch on bluetooth",
        "activate bluetooth",
        "bluetooth on",
        "turn bluetooth on please",
        "enable bluetooth on my phone",
        "switch bluetooth on",
        "start bluetooth",
        "put bluetooth on",
        "can you turn on bluetooth",
        "bluetooth enable",
        "turn on my bluetooth",
        "enable the bluetooth radio",
        "activate bluetooth connection",
        "switch on my bluetooth",
        "bluetooth on now",
        "please enable bluetooth",
        "turn bluetooth back on",
        "enable bluetooth settings",
        "flip bluetooth on",
        "get bluetooth running",
        "power on bluetooth",
        "connect bluetooth turn it on",
        "i need bluetooth on",
        "enable bt",
        "turn bt on",
        "bluetooth should be on",
        "make sure bluetooth is on",
        "set bluetooth to on",
    )
    candidates = [task, *phrasings]
    return _unique_lower(candidates, VARIATIONS_PER_SKILL)
|
|
|
|
| def _parse_whatsapp(task: str) -> tuple[str, str]: |
| message_match = re.search( |
| r"(?:message|text|send|say(?:ing)?)\s+(.+?)\s+to\s+(\w+)", |
| task, |
| re.IGNORECASE, |
| ) |
| if message_match: |
| return message_match.group(2).lower(), message_match.group(1).lower() |
| to_match = re.search(r"to\s+(\w+)", task, re.IGNORECASE) |
| person = to_match.group(1).lower() if to_match else "biraj" |
| return person, "hi" |
|
|
|
|
def generate_whatsapp_send_message(task: str) -> list[str]:
    """Return prompt variations that ask to send a WhatsApp message.

    The recipient and text parsed from ``task`` fill the templated prompts;
    fixed example prompts sit between the two templated groups. The result is
    de-duplicated and capped at ``VARIATIONS_PER_SKILL``.
    """
    person, message = _parse_whatsapp(task)
    leading_templates = (
        f"message {message} to {person} on whatsapp",
        f"text {message} to {person} on whatsapp",
        f"send {message} to {person} on whatsapp",
        f"whatsapp {person} saying {message}",
        f"message {person} on whatsapp {message}",
        f"text {person} {message} on whatsapp",
        f"send a whatsapp to {person} saying {message}",
        f"whatsapp message {person} {message}",
        f"tell {person} {message} on whatsapp",
        f"shoot {person} a whatsapp saying {message}",
    )
    fixed_examples = (
        "message ri saying see you soon",
        "text mom on whatsapp i'm on my way",
        "send whatsapp to dad saying good morning",
        "whatsapp sarah saying thanks",
        "message alex on whatsapp be there in 5",
        "text john on whatsapp hello",
        "send a message to priya on whatsapp hi",
        "whatsapp anita saying see you tonight",
        "message my friend on whatsapp hey",
        "text rahul on whatsapp running late",
        "send whatsapp message to maya saying ok",
        "whatsapp karan saying call me",
        "message neha on whatsapp good night",
        "text sam on whatsapp what's up",
    )
    trailing_templates = (
        f"send {message} via whatsapp to {person}",
        f"drop {person} a whatsapp saying {message}",
        f"ping {person} on whatsapp with {message}",
        f"write to {person} on whatsapp {message}",
        f"whatsapp {person} {message}",
        f"send {person} a whatsapp text saying {message}",
    )
    candidates = [task, *leading_templates, *fixed_examples, *trailing_templates]
    return _unique_lower(candidates, VARIATIONS_PER_SKILL)
|
|
|
|
def generate_camera_take_photo(task: str) -> list[str]:
    """Return ``task`` followed by fixed phrasings for taking a photo.

    The combined list is de-duplicated and capped at ``VARIATIONS_PER_SKILL``.
    """
    phrasings = (
        "take a photo with the camera",
        "open camera and take a picture",
        "snap a photo",
        "click a picture with camera",
        "take a picture now",
        "open the camera app and shoot",
        "capture a photo",
        "take a selfie",
        "open camera and snap a pic",
        "shoot a picture",
        "click a photo please",
        "take photo with rear camera",
        "open camera take picture",
        "snap a quick photo",
        "capture an image with camera",
        "take a picture of this",
        "open camera and photograph",
        "click picture using camera",
        "take a shot with camera",
        "launch camera and take photo",
        "grab a photo with camera",
        "take a camera picture",
        "open camera snap photo",
        "photograph this with camera",
        "take pic with camera app",
        "shoot photo now",
        "camera open and click picture",
        "take a quick picture",
        "open camera capture photo",
    )
    candidates = [task, *phrasings]
    return _unique_lower(candidates, VARIATIONS_PER_SKILL)
|
|
|
|
| def _parse_slack_channel(task: str) -> str: |
| match = re.search(r"channel\s+(.+)$", task, re.IGNORECASE) |
| return match.group(1).strip().lower() if match else "general" |
|
|
|
|
def generate_slack_open_channel(task: str) -> list[str]:
    """Return prompt variations that ask to open a Slack channel.

    Candidates are, in order: ``task``, fixed example prompts, templates
    filled with the channel parsed from ``task``, more fixed examples and a
    final template. The result is de-duplicated and capped at
    ``VARIATIONS_PER_SKILL``.
    """
    channel = _parse_slack_channel(task)
    opening_examples = (
        "open the engineering channel in slack",
        "open slack channel data contributors",
        "go to general channel in slack",
        "navigate to data contributors on slack",
        "switch to #general in slack",
        "open the data contributors slack channel",
        "take me to engineering on slack",
        "show me the random channel on slack",
    )
    channel_templates = (
        f"open slack channel {channel}",
        f"go to {channel} channel on slack",
        f"open the {channel} slack channel",
        f"switch to {channel} in slack",
        f"show me {channel} channel slack",
        f"navigate to {channel} on slack",
        f"open slack {channel}",
        f"pull up {channel} slack channel",
        f"take me to {channel} on slack",
        f"open channel {channel} in slack",
        f"slack open {channel}",
        f"go to slack channel {channel}",
        f"open the slack channel called {channel}",
        f"show {channel} slack",
    )
    closing_examples = (
        "open engineering channel on slack",
        "go to general channel in slack",
        "open slack channel random",
        "switch to announcements on slack",
        "open product team channel slack",
        "navigate to design reviews slack channel",
        "open slack channel support tickets",
        "go to marketing channel on slack",
        "open devops alerts slack channel",
        "show me general slack channel",
        "open slack channel project alpha",
        "go to team updates on slack",
        "open hiring channel in slack",
        "switch to backend team slack channel",
        "open slack channel daily standup",
    )
    candidates = [
        task,
        *opening_examples,
        *channel_templates,
        *closing_examples,
        f"navigate to {channel} slack",
    ]
    return _unique_lower(candidates, VARIATIONS_PER_SKILL)
|
|
|
|
def generate_spotify_pause(task: str) -> list[str]:
    """Return ``task`` followed by fixed phrasings for pausing Spotify.

    The combined list is de-duplicated and capped at ``VARIATIONS_PER_SKILL``.
    """
    phrasings = (
        "pause spotify",
        "stop spotify music",
        "pause the song on spotify",
        "stop playing on spotify",
        "pause playback spotify",
        "hold the music on spotify",
        "pause spotify playback",
        "stop spotify for now",
        "pause what's playing on spotify",
        "mute spotify pause it",
        "pause my spotify music",
        "stop the music spotify",
        "pause current track spotify",
        "spotify pause",
        "pause the spotify player",
        "stop spotify song",
        "pause spotify please",
        "halt spotify music",
        "pause spotify now",
        "stop playback on spotify",
        "pause the audio on spotify",
        "spotify stop playing",
        "pause my song on spotify",
        "freeze spotify playback",
        "pause spotify music player",
        "stop spotify temporarily",
        "pause whatever is on spotify",
        "spotify pause music",
        "hold spotify",
    )
    candidates = [task, *phrasings]
    return _unique_lower(candidates, VARIATIONS_PER_SKILL)
|
|
|
|
| def _parse_playlist(task: str) -> str: |
| match = re.search(r"play\s+(.+?)(?:\s+(?:playlist|from))", task, re.IGNORECASE) |
| if match: |
| return match.group(1).strip().lower() |
| return "liked songs" |
|
|
|
|
def generate_spotify_play_playlist(task: str) -> list[str]:
    """Return prompt variations that ask Spotify to play a playlist.

    Templates filled with the playlist parsed from ``task`` are interleaved
    with fixed example prompts. The result is de-duplicated and capped at
    ``VARIATIONS_PER_SKILL``.
    """
    playlist = _parse_playlist(task)
    leading_templates = (
        f"play my {playlist} on spotify",
        f"start my {playlist} playlist spotify",
        f"put on {playlist} spotify",
        f"play {playlist} on spotify",
    )
    first_examples = (
        "open spotify and play liked songs",
        "play my edm playlist",
        "start spotify workout playlist",
        "play workout playlist on spotify",
        "put on my music spotify",
        "play my playlist on spotify",
        "start my chill playlist spotify",
        "play discover weekly spotify",
        "put on daily mix spotify",
        "play my running playlist",
        "start spotify and play my favorites",
        "play release radar on spotify",
        "put on party playlist spotify",
        "play road trip playlist spotify",
        "start my morning playlist on spotify",
        "play focus playlist spotify",
        "put on jazz playlist spotify",
        "play top hits playlist",
        "start liked songs on spotify",
        "play my saved songs spotify",
    )
    second_examples = (
        "spotify play my playlist",
        "play music from my liked songs",
        "start playing my spotify playlist",
        "play my spotify liked songs list",
    )
    candidates = [
        task,
        *leading_templates,
        *first_examples,
        f"open {playlist} playlist spotify",
        *second_examples,
        f"play {playlist} playlist from spotify",
    ]
    return _unique_lower(candidates, VARIATIONS_PER_SKILL)
|
|
|
|
| def _parse_destination(task: str) -> str: |
| match = re.search(r"(?:uber\s+(?:for|to)\s+|ride\s+to\s+)(.+)$", task, re.IGNORECASE) |
| if match: |
| return match.group(1).strip().lower() |
| match = re.search(r"for\s+(.+)$", task, re.IGNORECASE) |
| return match.group(1).strip().lower() if match else "the airport" |
|
|
|
|
def generate_uber_request_ride(task: str) -> list[str]:
    """Return prompt variations that ask to request an Uber ride.

    Templates filled with the destination parsed from ``task`` surround a
    group of fixed example prompts. The result is de-duplicated and capped at
    ``VARIATIONS_PER_SKILL``.
    """
    dest = _parse_destination(task)
    leading_templates = (
        f"get an uber to {dest}",
        f"uber to {dest}",
        f"book uber to {dest}",
        f"call an uber to {dest}",
        f"request uber ride to {dest}",
        f"uber me to {dest}",
        f"get a ride to {dest} on uber",
        f"book a cab to {dest} via uber",
    )
    fixed_examples = (
        "uber to the airport please",
        "get an uber to downtown",
        "book uber to 123 main street",
        "call uber to my office",
        "request a ride to central station on uber",
        "uber home from here",
        "get me an uber to the mall",
        "book uber to union square",
        "call a ride to the hotel on uber",
        "uber to marina bay",
        "get uber to work",
        "book cab via uber to the restaurant",
        "request uber to the train station",
    )
    trailing_templates = (
        f"uber to {dest} now",
        f"need an uber to {dest}",
        f"schedule uber to {dest}",
        f"ride to {dest} using uber",
        f"open uber and go to {dest}",
        f"find uber to {dest}",
        f"get ride to {dest}",
        f"uber for {dest} please",
    )
    candidates = [task, *leading_templates, *fixed_examples, *trailing_templates]
    return _unique_lower(candidates, VARIATIONS_PER_SKILL)
|
|
|
|
| def _parse_person(task: str) -> str: |
| match = re.search( |
| r"search\s+(.+?)\s+(?:on\s+)?linkedin", |
| task, |
| re.IGNORECASE, |
| ) |
| if match: |
| return match.group(1).strip().lower() |
| return "arya sheth" |
|
|
|
|
def generate_linkedin_search_person(task: str) -> list[str]:
    """Return prompt variations that ask to find a person on LinkedIn.

    The name parsed from ``task`` fills the templated prompts, which are
    mixed with fixed example prompts. The original task is always the first
    candidate.

    Args:
        task: The seed LinkedIn search request.

    Returns:
        At most ``VARIATIONS_PER_SKILL`` distinct prompts.
    """
    person = _parse_person(task)
    # One template uses only the first name. Indexing an empty split would
    # raise IndexError, so fall back to the name as-is.
    name_parts = person.split()
    first_name = name_parts[0] if name_parts else person
    return _unique_lower(
        [
            task,
            f"look up {person} on linkedin",
            f"find {person}'s linkedin profile",
            f"linkedin search for {person}",
            f"open linkedin and search {person}",
            f"can you find {person} on linkedin",
            f"search for {person} on linked in",
            f"look for {person} on linkedin",
            "find parag shah on linkedin",
            f"search {person} linkedin",
            f"find someone's linkedin profile for {person}",
            "linkedin lookup for priya mehta",
            "search linkedin for john smith",
            "find jane doe on linkedin",
            "look up my colleague on linkedin",
            "search for a person on linkedin named arya",
            f"find this person's linkedin {person}",
            f"go to linkedin and search {person}",
            f"pull up {person} on linkedin",
            f"show me {person} on linkedin",
            f"i need to find {person} on linkedin",
            f"help me search linkedin for {person}",
            f"linkedin find {person}",
            f"search the name {person} on linkedin",
            f"look up a contact on linkedin {person}",
            f"find {person} professional profile",
            f"search linkedin profiles for {person}",
            f"open linkedin search {person}",
            f"find {first_name} on linkedin",
            f"linkedin search {person}",
        ],
        VARIATIONS_PER_SKILL,
    )
|
|
|
|
| def _parse_search_query(task: str, app: str) -> str: |
| match = re.search( |
| rf"search\s+(.+?)\s+(?:on|and|from)\s+{app}", |
| task, |
| re.IGNORECASE, |
| ) |
| if match: |
| return match.group(1).strip().lower() |
| match = re.search(r"search\s+(.+)$", task, re.IGNORECASE) |
| return match.group(1).strip().lower() if match else "music" |
|
|
|
|
def generate_spotify_search_play(task: str) -> list[str]:
    """Return prompt variations that ask to search Spotify and play the result.

    Templates filled with the query parsed from ``task`` surround a group of
    fixed example prompts. The result is de-duplicated and capped at
    ``VARIATIONS_PER_SKILL``.
    """
    query = _parse_search_query(task, "spotify")
    leading_templates = (
        f"find {query} on spotify and play it",
        f"play {query} music on spotify",
        f"search for {query} on spotify and start playing",
        f"look up {query} tracks on spotify and play",
        f"spotify search {query} and play",
        f"find and play {query} on spotify",
        f"search spotify for {query} music and play",
        f"play some {query} on spotify",
        f"open spotify search {query} and play",
    )
    fixed_examples = (
        "search jazz on spotify and play",
        "find lo fi beats on spotify and play them",
        "search taylor swift on spotify and play",
        "play classical music search spotify",
        "find workout music on spotify and play",
        "search bollywood songs spotify play",
        "look up rock music on spotify and play",
        "search chill vibes spotify and play",
        "find podcast on spotify and play",
        "search drake on spotify and play his songs",
        "play hip hop search spotify",
        "search acoustic covers spotify play",
        "find study music on spotify and play",
        "search pop hits spotify and play",
        "look for rain sounds on spotify and play",
    )
    trailing_templates = (
        f"search {query} playlist spotify play now",
        f"find {query} spotify play",
        f"spotify play {query} after searching",
        f"search and play {query} tracks spotify",
        f"put on {query} from spotify search",
    )
    candidates = [task, *leading_templates, *fixed_examples, *trailing_templates]
    return _unique_lower(candidates, VARIATIONS_PER_SKILL)
|
|
|
|
def generate_youtube_search(task: str) -> list[str]:
    """Return prompt variations that ask to search YouTube.

    The query parsed from ``task`` fills the templated prompts, which are
    mixed with fixed example prompts. The original task is always the first
    candidate.

    Args:
        task: The seed YouTube search request.

    Returns:
        At most ``VARIATIONS_PER_SKILL`` distinct prompts.
    """
    query = _parse_search_query(task, "youtube")
    return _unique_lower(
        [
            task,
            f"look up {query} on youtube",
            f"find {query} videos on youtube",
            f"youtube search {query}",
            f"search for {query} on youtube",
            f"play {query} on youtube",
            f"show me {query} on youtube",
            f"open youtube and search {query}",
            # The templates wrapping `query` stay topic-neutral: the query
            # can be anything, so food-specific wording ("cooking",
            # "recipes") would produce nonsense for most tasks.
            f"find videos about {query} youtube",
            f"search youtube for {query}",
            f"look for {query} on youtube",
            f"youtube {query}",
            "search pasta recipes on youtube",
            "find workout videos on youtube",
            "look up python tutorials on youtube",
            "search for cat videos on youtube",
            "youtube search lo fi beats",
            "find travel vlogs on youtube",
            "search meditation music on youtube",
            "look up guitar lessons on youtube",
            "find review videos on youtube for phones",
            "search stand up comedy on youtube",
            "youtube how to bake bread",
            "look for documentary on youtube",
            "search asmr videos on youtube",
            "find yoga classes on youtube",
            "youtube search morning routine",
            "look up news on youtube",
            "search diy crafts on youtube",
            f"pull up {query} videos youtube",
        ],
        VARIATIONS_PER_SKILL,
    )
|
|
|
|
| def _parse_contact(task: str) -> str: |
| match = re.search(r"search\s+(.+?)\s+in\s+contacts", task, re.IGNORECASE) |
| return match.group(1).strip().lower() if match else "parag shah" |
|
|
|
|
def generate_contacts_search(task: str) -> list[str]:
    """Return prompt variations that ask to look someone up in contacts.

    The name parsed from ``task`` fills the templated prompts, which are
    mixed with fixed example prompts. The original task is always the first
    candidate.

    Args:
        task: The seed contacts search request.

    Returns:
        At most ``VARIATIONS_PER_SKILL`` distinct prompts.
    """
    contact = _parse_contact(task)
    # One template uses only the first name. Indexing an empty split would
    # raise IndexError, so fall back to the name as-is.
    name_parts = contact.split()
    first_name = name_parts[0] if name_parts else contact
    return _unique_lower(
        [
            task,
            "find parag shah in contacts",
            "look up john smith in my contacts",
            "search contact parag shah",
            "find mom in my contacts",
            "search contacts for dad",
            f"find {contact} in contacts",
            f"look up {contact} in my contacts",
            f"contacts search {contact}",
            f"find {contact} contact",
            f"search contacts for {contact}",
            f"look for {contact} in phone contacts",
            f"open contacts and search {contact}",
            f"find {contact}'s number in contacts",
            f"search my contacts for {contact}",
            f"contacts find {contact}",
            f"look up {contact} contact info",
            f"search for {first_name} in contacts",
            "find arya sheth in contacts",
            "search contacts arya sheth",
            "look up john smith in contacts",
            # "find mom in my contacts" and "search contacts for dad" appear
            # once, near the top; repeating them here had no effect because
            # duplicates are dropped.
            "find sarah's contact",
            "look up alex in contacts app",
            "search phone contacts for maya",
            "find contact named biraj",
            "search contacts ri",
            "look for dentist in contacts",
            "find doctor contact in phone",
            f"search my phone book for {contact}",
            f"contacts lookup {contact}",
            f"find {contact} phone number contacts",
            f"search address book {contact}",
            f"open contacts search {contact}",
        ],
        VARIATIONS_PER_SKILL,
    )
|
|
|
|
| def _parse_email(task: str) -> tuple[str, str]: |
| email_match = re.search(r"[\w.+-]+@[\w.-]+\.\w+", task) |
| body_match = re.search( |
| r"(?:saying|with|body|subject)\s+(.+)$", |
| task, |
| re.IGNORECASE, |
| ) |
| email = email_match.group(0).lower() if email_match else "user@example.com" |
| body = body_match.group(1).strip() if body_match else "hello" |
| return email, body |
|
|
|
|
def generate_gmail_send_email(task: str) -> list[str]:
    """Return prompt variations that ask to send an email through Gmail.

    Fixed example prompts alternate with templates filled from the address
    and body parsed out of ``task``. The result is de-duplicated and capped
    at ``VARIATIONS_PER_SKILL``.
    """
    email, body = _parse_email(task)
    opening_examples = (
        "email my team saying project update",
        "send gmail to boss saying i'll be late",
        "write mail to john@company.com saying hello",
        "compose an email to sarah@gmail.com thanks for your help",
        "send mail using gmail",
        "write an email to the team",
        "compose mail saying hello world",
    )
    leading_templates = (
        f"email {email} {body} via gmail",
        f"compose gmail to {email} saying {body}",
        f"send email to {email} {body}",
        f"gmail {email} with {body}",
        f"write email to {email} saying {body}",
        f"send a gmail message {body} to {email}",
        f"email {email} from gmail {body}",
        f"compose email {body} to {email}",
        f"send {body} email to {email} gmail",
    )
    middle_examples = (
        "email john@company.com saying project update",
        "send gmail to sarah@gmail.com subject meeting notes",
        "compose email to boss saying i'll be late",
        "send mail to team@work.com weekly report",
        "gmail alex@gmail.com thanks for your help",
        "email mom saying happy birthday",
        "send email to hr@company.com vacation request",
        "compose gmail to client proposal attached",
        "email professor asking about the assignment",
        "send follow up email via gmail",
        "gmail send to contact@gmail.com hello there",
        "write and send email to partner@gmail.com",
    )
    trailing_templates = (
        f"send gmail message saying {body}",
        f"email kriyanshi@gmail.com {body}",
        f"shoot an email to {email} {body}",
        f"mail {email} with {body}",
        f"send gmail {body} to {email}",
        f"compose and send {body} to {email}",
        f"gmail to {email} body {body}",
        f"send an email through gmail {body}",
    )
    candidates = [
        task,
        *opening_examples,
        *leading_templates,
        *middle_examples,
        *trailing_templates,
    ]
    return _unique_lower(candidates, VARIATIONS_PER_SKILL)
|
|
|
|
def generate_generic(skill: str, task: str) -> list[str]:
    """Return simple prefix/suffix variations of ``task``.

    This is the fallback for skills without a dedicated generator. It yields
    the task, its lower-cased form, and that form wrapped with a few polite
    prefixes and suffixes -- at most eight distinct prompts, which the
    de-duplicator may cap further at ``VARIATIONS_PER_SKILL``.

    Args:
        skill: Skill identifier. Accepted so the signature matches how the
            fallback is invoked; it does not influence the output.
        task: The seed request.

    Returns:
        The distinct prompts, original task first.
    """
    normalized = task.lower().strip()
    prefixes = ("please ", "can you ", "i need to ", "help me ")
    suffixes = (" please", " now", " for me")
    prompts = [task, normalized]
    prompts.extend(f"{prefix}{normalized}" for prefix in prefixes)
    prompts.extend(f"{normalized}{suffix}" for suffix in suffixes)
    # No padding up to VARIATIONS_PER_SKILL: repeated copies of `normalized`
    # are discarded by the de-duplication below, so padding cannot increase
    # the number of prompts returned.
    return _unique_lower(prompts, VARIATIONS_PER_SKILL)
|
|
|
|
# Skill identifier -> function that turns a seed task into prompt variations.
GENERATORS = dict(
    create_alarm=generate_create_alarm,
    calendar_create_event=generate_calendar_create_event,
    wifi_enable=generate_wifi_enable,
    bluetooth_enable=generate_bluetooth_enable,
    whatsapp_send_message=generate_whatsapp_send_message,
    camera_take_photo=generate_camera_take_photo,
    slack_open_channel=generate_slack_open_channel,
    spotify_pause=generate_spotify_pause,
    spotify_play_playlist=generate_spotify_play_playlist,
    uber_request_ride=generate_uber_request_ride,
    linkedin_search_person=generate_linkedin_search_person,
    spotify_search_play=generate_spotify_search_play,
    youtube_search=generate_youtube_search,
    contacts_search=generate_contacts_search,
    gmail_send_email=generate_gmail_send_email,
)
|
|
|
|
def generate_contrastive_examples() -> list[dict]:
    """Build hard-negative records: similar wording, different app and skill.

    For each sample name, the same search phrasing is emitted once per target
    app with that app's skill label. A fixed set of messaging, alarm, Slack
    and search examples follows.
    """
    search_names = ("parag shah", "arya sheth", "john smith", "mom", "sarah")
    # (skill, prompt template) pairs, emitted in this order for every name.
    per_name_templates = (
        ("contacts_search", "search {name} in contacts"),
        ("contacts_search", "find {name} in my contacts"),
        ("linkedin_search_person", "search {name} on linkedin"),
        ("linkedin_search_person", "find {name} on linkedin"),
        ("youtube_search", "search {name} on youtube"),
        ("youtube_search", "look up {name} videos on youtube"),
        ("spotify_search_play", "search {name} on spotify and play"),
        ("spotify_search_play", "find {name} music on spotify and play it"),
    )
    fixed_examples = (
        ("gmail_send_email", "email boss saying i'll be late"),
        ("gmail_send_email", "send mail to team@work.com weekly report"),
        ("gmail_send_email", "compose gmail to client proposal attached"),
        ("whatsapp_send_message", "message boss on whatsapp running late"),
        ("whatsapp_send_message", "text team on whatsapp meeting moved"),
        ("whatsapp_send_message", "send ri a message on whatsapp"),
        ("whatsapp_send_message", "send ri a message"),
        ("whatsapp_send_message", "send alex a message"),
        ("gmail_send_email", "send ri an email"),
        ("create_alarm", "wake me up tomorrow morning"),
        ("slack_open_channel", "open the engineering channel in slack"),
        ("slack_open_channel", "go to data contributors channel on slack"),
        ("slack_open_channel", "switch to announcements in slack"),
        ("youtube_search", "search pasta recipes on youtube"),
        ("contacts_search", "find parag shah in contacts"),
        ("gmail_send_email", "write mail to john@company.com saying hello"),
    )

    records: list[dict] = [
        _record(skill, template.format(name=name))
        for name in search_names
        for skill, template in per_name_templates
    ]
    records += [_record(skill, prompt) for skill, prompt in fixed_examples]
    return records
|
|
|
|
def dedupe_skills(skills: list[dict]) -> list[dict]:
    """Keep only the first entry for each ``"skill"`` value, in input order."""
    first_by_skill: dict[str, dict] = {}
    for entry in skills:
        # setdefault stores an entry only when its skill is new; dict
        # insertion order then reproduces first-occurrence order.
        first_by_skill.setdefault(entry["skill"], entry)
    return list(first_by_skill.values())
|
|
|
|
def generate_variations(skill: str, task: str) -> list[str]:
    """Return prompt variations for ``task`` using the generator for ``skill``.

    Skills without an entry in ``GENERATORS`` go through ``generate_generic``.
    """
    if skill in GENERATORS:
        return GENERATORS[skill](task)
    return generate_generic(skill, task)
|
|
|
|
def load_skills(path: Path) -> list[dict]:
    """Read a JSONL file and return one parsed object per non-blank line."""
    with path.open(encoding="utf-8") as handle:
        # Strip each line first so whitespace-only lines are skipped too.
        return [
            json.loads(stripped)
            for raw in handle
            if (stripped := raw.strip())
        ]
|
|
|
|
def main() -> None:
    """Build the training set and write it to ``OUTPUT_FILE`` as JSONL.

    Records are collected in three passes -- per-skill generated variations,
    ``MUST_INCLUDE_PROMPTS``, then the contrastive examples. A prompt is kept
    only the first time its lower-cased, stripped text is seen. When a prompt
    reappears under a *different* skill the later label is dropped and a
    warning is written to stderr, so conflicting labels are visible instead
    of silently lost. A per-skill count summary is printed at the end.
    """
    # Function-scope import: only this function needs it.
    from collections import Counter

    skills = dedupe_skills(load_skills(INPUT_FILE))
    records: list[dict] = []
    # Normalized prompt -> skill of the record that was kept for it.
    skill_by_prompt: dict[str, str] = {}
    # Counted as records are added, so the summary does not need to re-parse
    # each record's assistant JSON.
    skill_counts: Counter = Counter()

    def add_record(skill: str, prompt: str) -> None:
        key = prompt.lower().strip()
        if not key:
            return
        kept_skill = skill_by_prompt.get(key)
        if kept_skill is not None:
            if kept_skill != skill:
                print(
                    f"warning: prompt {key!r} already labelled {kept_skill!r}; "
                    f"ignoring conflicting label {skill!r}",
                    file=sys.stderr,
                )
            return
        skill_by_prompt[key] = skill
        records.append(_record(skill, prompt))
        skill_counts[skill] += 1

    for entry in skills:
        skill = entry["skill"]
        task = entry["task"]
        for prompt in generate_variations(skill, task):
            add_record(skill, prompt)

    for prompt, skill in MUST_INCLUDE_PROMPTS.items():
        add_record(skill, prompt)

    # Contrastive examples arrive as finished records; pull the prompt and
    # label back out so they pass through the same de-duplication.
    for record in generate_contrastive_examples():
        prompt = record["messages"][1]["content"]
        skill = json.loads(record["messages"][2]["content"])["skill"]
        add_record(skill, prompt)

    with OUTPUT_FILE.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)

    print(f"Wrote {len(records)} training examples to {OUTPUT_FILE}")
    print(f"Skills: {len(skills)} unique, contrastive + must-include prompts merged")
    for skill, count in sorted(skill_counts.items()):
        print(f" {skill}: {count}")
|
|
|
|
# Run the generator only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
|
|