| |
| """Generate intent-extraction training data with skill + parameters. |
| |
| Produces: |
| data/train_intent.jsonl — SFT examples for Unsloth (1000+ per skill) |
| data/eval_intent_prompts.json — held-out evaluation prompts with expected intents |
| |
| Usage: |
| python scripts/generate_intent_dataset.py |
| python scripts/generate_intent_dataset.py --examples-per-skill 1200 |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import random |
| import sys |
| from pathlib import Path |
|
|
| PROJECT_ROOT = Path(__file__).resolve().parent.parent |
| sys.path.insert(0, str(PROJECT_ROOT)) |
|
|
| from src.classifier_prompt import INTENT_SYSTEM_PROMPT |
|
|
# Files under <project>/data. The two inputs are presumably read with
# load_schemas / load_skills — the call sites are not visible in this chunk.
SCHEMAS_FILE = PROJECT_ROOT / "data" / "skill_schemas.json"
SKILLS_FILE = PROJECT_ROOT / "data" / "skills.jsonl"
# Outputs named in the module docstring: SFT training examples and held-out
# evaluation prompts with their expected intents.
TRAIN_OUTPUT = PROJECT_ROOT / "data" / "train_intent.jsonl"
EVAL_OUTPUT = PROJECT_ROOT / "data" / "eval_intent_prompts.json"


# Default for the --examples-per-skill CLI option.
DEFAULT_EXAMPLES_PER_SKILL = 1000
# Default for the --eval-per-skill CLI option.
EVAL_PROMPTS_PER_SKILL = 6
# NOTE(review): presumably seeds the random.Random handed to the generators;
# its use is outside this chunk — confirm.
RANDOM_SEED = 42
|
|
| |
| |
| |
|
|
| CONTACTS = [ |
| "Ri", "Biraj", "Sarah", "Alex", "Mom", "Dad", "Priya", "Maya", "Rahul", |
| "Neha", "Sam", "Anita", "Karan", "Boss", "John Smith", "Parag Shah", |
| "Arya Sheth", "Jane Doe", "Emma Wilson", "David Chen", "Lisa Park", |
| "Michael Brown", "Sophie Martin", "James Lee", "Nina Patel", "Tom", |
| "Ravi", "Sneha", "Vikram", "Ananya", "Chris", "Jordan", "Taylor", |
| "Morgan", "Casey", "Jamie", "Riley", "Avery", "Quinn", "Drew", |
| "grandma", "uncle raj", "aunt meera", "my roommate", "the landlord", |
| "dentist office", "doctor", "plumber", "electrician", |
| ] |
|
|
| MESSAGES = [ |
| "I'll be late", "see you soon", "thanks", "good morning", "on my way", |
| "be there in 5", "running late", "call me", "what's up", "good night", |
| "hello", "hi there", "ok", "sounds good", "got it", "no worries", |
| "can we reschedule", "meeting moved to 3 pm", "i'm stuck in traffic", |
| "leaving now", "almost there", "happy birthday", "congrats", |
| "let me know when you're free", "pick up milk on the way home", |
| "dinner at 8", "see you tonight", "thanks for your help", |
| "project update attached", "weekly report is ready", |
| "i'll send the docs tomorrow", "running 10 minutes late", |
| "are you available for a call", "let's catch up this weekend", |
| "the package arrived", "payment sent", "invoice attached", |
| "reminder for tomorrow's meeting", "flight lands at 6 pm", |
| ] |
|
|
| ALARM_TIMES = [ |
| "5 am", "5:30 am", "6 am", "6:15 am", "6:30 am", "7 am", "7:30 am", |
| "8 am", "8:15 am", "9 am", "10 am", "11 am", "noon", "12:30 pm", |
| "1 pm", "2 pm", "3 pm", "4 pm", "5 pm", "6 pm", "7 pm", "8 pm", |
| "9 pm", "10 pm", "11 pm", |
| ] |
|
|
| ALARM_DAYS = [ |
| "today", "tomorrow", "tomorrow morning", "monday", "tuesday", |
| "wednesday", "thursday", "friday", "saturday", "sunday", |
| "next monday", "next tuesday", "next friday", "this weekend", |
| ] |
|
|
| CALENDAR_TITLES = [ |
| "team standup", "dentist appointment", "lunch with alex", "project review", |
| "doctor visit", "interview", "gym session", "coffee with sam", |
| "presentation", "flight", "call with client", "dinner reservation", |
| "study session", "yoga class", "1 on 1 with manager", "birthday party", |
| "meeting with kriyanshi", "sync with team", "board meeting", |
| "parent teacher conference", "car service", "vet appointment", |
| "book club", "therapy session", "haircut", "tax appointment", |
| ] |
|
|
| CALENDAR_DATES = [ |
| "tomorrow", "today", "monday", "tuesday", "wednesday", "thursday", |
| "friday", "saturday", "sunday", "next tuesday", "next friday", |
| "next monday", "this weekend", "next week", |
| ] |
|
|
| CALENDAR_TIMES = [ |
| "7 am", "8 am", "9 am", "10 am", "11 am", "noon", "1 pm", "2 pm", |
| "3 pm", "4 pm", "5 pm", "6 pm", "7 pm", "8 pm", |
| ] |
|
|
| PLAYLISTS = [ |
| "liked songs", "workout", "chill", "edm", "discover weekly", "daily mix", |
| "release radar", "party", "road trip", "morning", "focus", "jazz", |
| "top hits", "running", "study", "acoustic", "hip hop", "classical", |
| "bollywood", "lo fi", "rain sounds", "sleep", "meditation", "rock", |
| "indie", "pop hits", "throwback", "summer vibes", "late night", |
| "gym pump", "coding focus", "driving", "cooking", "cleaning", |
| ] |
|
|
| YOUTUBE_QUERIES = [ |
| "pasta recipes", "workout videos", "python tutorials", "cat videos", |
| "lo fi beats", "travel vlogs", "meditation music", "guitar lessons", |
| "phone review", "stand up comedy", "how to bake bread", "documentary", |
| "asmr videos", "yoga classes", "morning routine", "news", |
| "diy crafts", "ghibli food", "korean street food", "home renovation", |
| "machine learning basics", "react tutorial", "interview tips", |
| "stretching routine", "origami tutorial", "watercolor painting", |
| "budget travel tips", "meal prep ideas", "indoor plants care", |
| "car maintenance", "photography tips", "skincare routine", |
| ] |
|
|
| SPOTIFY_QUERIES = [ |
| "jazz", "lo fi beats", "taylor swift", "classical music", "workout music", |
| "bollywood songs", "rock music", "chill vibes", "drake", "hip hop", |
| "acoustic covers", "study music", "pop hits", "rain sounds", "edm", |
| "indie folk", "synthwave", "reggaeton", "k-pop", "country hits", |
| "piano covers", "ambient", "80s hits", "90s throwback", "focus beats", |
| ] |
|
|
| SLACK_CHANNELS = [ |
| "general", "engineering", "data contributors", "random", "announcements", |
| "product team", "design reviews", "support tickets", "marketing", |
| "devops alerts", "project alpha", "team updates", "hiring", "backend team", |
| "daily standup", "sales", "customer success", "incidents", "on-call", |
| "frontend", "mobile", "qa", "releases", "watercooler", "leadership", |
| ] |
|
|
| DESTINATIONS = [ |
| "the airport", "downtown", "123 main street", "my office", "central station", |
| "home", "the mall", "union square", "marina bay", "work", "the restaurant", |
| "the train station", "pratishtha apartment unnamed road", "grand central", |
| "times square", "golden gate bridge", "hotel california", "city hospital", |
| "university campus", "grocery store", "gym", "coffee shop on 5th ave", |
| "convention center", "stadium", "library", "pharmacy", "post office", |
| ] |
|
|
| LINKEDIN_NAMES = [ |
| "arya sheth", "parag shah", "john smith", "jane doe", "priya mehta", |
| "david chen", "emma wilson", "michael brown", "sophie martin", "james lee", |
| "nina patel", "sarah johnson", "robert kim", "lisa anderson", "mark taylor", |
| "amanda white", "chris evans", "rachel green", "kevin hart", "olivia brown", |
| ] |
|
|
| EMAIL_RECIPIENTS = [ |
| "boss", "team", "mom", "hr", "client", "professor", "partner", |
| "john@company.com", "sarah@gmail.com", "team@work.com", "hr@company.com", |
| "alex@gmail.com", "client@startup.io", "professor@university.edu", |
| "kriyanshishah06@gmail.com", "contact@gmail.com", "partner@gmail.com", |
| "manager@corp.com", "support@service.com", "billing@company.com", |
| "recruiter@jobs.com", "design@agency.com", "dev@startup.io", |
| ] |
|
|
| EMAIL_MESSAGES = [ |
| "project update", "i'll be late", "hello", "thanks for your help", |
| "weekly report", "vacation request", "proposal attached", "meeting notes", |
| "follow up", "hello-world", "invoice attached", "contract for review", |
| "interview confirmation", "quarterly results", "onboarding docs", |
| "feedback on the design", "can we reschedule", "out of office notice", |
| "happy birthday", "congratulations on the promotion", |
| ] |
|
|
| NO_PARAM_PROMPTS: dict[str, list[str]] = { |
| "wifi_enable": [ |
| "enable wifi", "turn on wifi", "switch on wifi", "activate wifi", |
| "wifi on", "turn wifi on please", "enable wifi on my phone", |
| "switch wifi on", "start wifi", "put wifi on", "can you turn on wifi", |
| "wifi enable", "turn on my wifi", "enable wireless network", |
| "activate wifi connection", "switch on my wifi", "wifi on now", |
| "please enable wifi", "turn wifi back on", "enable wifi settings", |
| "flip wifi on", "get wifi running", "power on wifi", "i need wifi on", |
| "enable wlan", "turn wlan on", "wifi should be on", |
| "make sure wifi is enabled", "set wifi to on", "connect to wifi", |
| "please turn on wifi", "wifi needs to be on", "switch wlan on now", |
| "turn the wifi on", "enable my wifi connection", "get wifi on", |
| "wifi activation please", "power up wifi", "wifi switch on", |
| ], |
| "bluetooth_enable": [ |
| "turn on bluetooth", "enable bluetooth", "switch on bluetooth", |
| "activate bluetooth", "bluetooth on", "turn bluetooth on please", |
| "enable bluetooth on my phone", "switch bluetooth on", "start bluetooth", |
| "put bluetooth on", "can you turn on bluetooth", "bluetooth enable", |
| "turn on my bluetooth", "enable the bluetooth radio", |
| "activate bluetooth connection", "switch on my bluetooth", |
| "bluetooth on now", "please enable bluetooth", "turn bluetooth back on", |
| "enable bluetooth settings", "flip bluetooth on", "get bluetooth running", |
| "power on bluetooth", "connect bluetooth turn it on", "i need bluetooth on", |
| "enable bt", "turn bt on", "bluetooth should be on", |
| "make sure bluetooth is on", "set bluetooth to on", "turn bluetooth on now", |
| "bluetooth needs to be enabled", "switch bt on", "activate my bluetooth", |
| ], |
| "spotify_pause": [ |
| "pause spotify", "stop spotify music", "pause the song on spotify", |
| "stop playing on spotify", "pause playback spotify", "hold the music on spotify", |
| "pause spotify playback", "stop spotify for now", "pause what's playing on spotify", |
| "mute spotify pause it", "pause my spotify music", "stop the music spotify", |
| "pause current track spotify", "spotify pause", "pause the spotify player", |
| "stop spotify song", "pause spotify please", "halt spotify music", |
| "pause spotify now", "stop playback on spotify", "pause the audio on spotify", |
| "spotify stop playing", "pause my song on spotify", "freeze spotify playback", |
| "pause spotify music player", "stop spotify temporarily", |
| "pause whatever is on spotify", "spotify pause music", "hold spotify", |
| "stop the spotify track", "pause music on spotify", "silence spotify", |
| ], |
| "camera_take_photo": [ |
| "take a photo with the camera", "open camera and take a picture", |
| "snap a photo", "click a picture with camera", "take a picture now", |
| "open the camera app and shoot", "capture a photo", "take a selfie", |
| "open camera and snap a pic", "shoot a picture", "click a photo please", |
| "take photo with rear camera", "open camera take picture", |
| "snap a quick photo", "capture an image with camera", "take a picture of this", |
| "open camera and photograph", "click picture using camera", |
| "take a shot with camera", "launch camera and take photo", |
| "grab a photo with camera", "take a camera picture", |
| "open camera snap photo", "photograph this with camera", |
| "take pic with camera app", "shoot photo now", |
| "camera open and click picture", "take a quick picture", |
| "open camera capture photo", "open camera and click a picture", |
| ], |
| } |
|
|
| |
| |
| |
|
|
# Prompt templates for the create_alarm skill.
#
# Every template must reference BOTH {time} and {day}: expand_combinations
# samples a value for each pool key and stores all of them in the label, so a
# template that omits a slot would yield a label containing a value the prompt
# never mentions. The former day-less entry "wake me up at {time}" was removed
# for that reason (its {time} {day} form is already listed).
ALARM_TEMPLATES = [
    "create alarm for {time} {day}",
    "set alarm for {time} {day}",
    "wake me up at {time} {day}",
    "alarm {day} {time}",
    "set a {time} alarm for {day}",
    "put an alarm for {time} {day}",
    "schedule alarm {time} {day}",
    "set wake up alarm {time} {day}",
    "alarm at {time} {day}",
    "set my alarm for {time} {day}",
    "need alarm {day} {time}",
    "{time} alarm {day} please",
    "wake up alarm {day} {time}",
    "alarm me at {time} {day}",
    "wake me at {time} {day}",
    "alarm for {time} {day}",
    "schedule {time} wake up {day}",
    "remind me at {time} {day}",
    "set an alarm for {time} {day}",
    "can you set alarm {time} {day}",
    "i need to wake up at {time} {day}",
    "please alarm {time} {day}",
    "set {day} morning alarm {time}",
]
|
|
| CALENDAR_TEMPLATES = [ |
| "create calendar event for {date} {time} {title}", |
| "add a meeting {date} at {time} {title}", |
| "schedule {title} {date} {time}", |
| "put {title} on my calendar {date} at {time}", |
| "create event {date} {time} {title}", |
| "add calendar event {date} {time} {title}", |
| "book a meeting {date} at {time} {title}", |
| "create a calendar entry {date} {time} {title}", |
| "set up a meeting {date} {time} {title}", |
| "calendar meeting {date} {time} {title}", |
| "add {title} to calendar {date} {time}", |
| "schedule {title} on {date} at {time}", |
| "put {title} on calendar {date} {time}", |
| "create meeting {date} {time} {title}", |
| "add {title} appointment {date} {time}", |
| ] |
|
|
| WHATSAPP_TEMPLATES = [ |
| "message {message} to {contact} on whatsapp", |
| "text {message} to {contact} on whatsapp", |
| "send {message} to {contact} on whatsapp", |
| "whatsapp {contact} saying {message}", |
| "message {contact} on whatsapp {message}", |
| "text {contact} {message} on whatsapp", |
| "send a whatsapp to {contact} saying {message}", |
| "whatsapp message {contact} {message}", |
| "tell {contact} {message} on whatsapp", |
| "shoot {contact} a whatsapp saying {message}", |
| "send {message} via whatsapp to {contact}", |
| "drop {contact} a whatsapp saying {message}", |
| "ping {contact} on whatsapp with {message}", |
| "write to {contact} on whatsapp {message}", |
| "whatsapp {contact} {message}", |
| "send {contact} a whatsapp text saying {message}", |
| "text {contact} on whatsapp saying {message}", |
| ] |
|
|
| SLACK_TEMPLATES = [ |
| "open slack channel {channel}", |
| "go to {channel} channel in slack", |
| "open the {channel} slack channel", |
| "switch to {channel} in slack", |
| "show me {channel} channel on slack", |
| "navigate to {channel} on slack", |
| "open slack {channel}", |
| "pull up {channel} slack channel", |
| "take me to {channel} on slack", |
| "open channel {channel} in slack", |
| "slack open {channel}", |
| "go to slack channel {channel}", |
| "open the slack channel called {channel}", |
| "show {channel} slack", |
| "open the {channel} channel in slack", |
| "navigate to {channel} slack channel", |
| "switch to #{channel} in slack", |
| ] |
|
|
| PLAYLIST_TEMPLATES = [ |
| "play my {playlist} on spotify", |
| "start my {playlist} playlist spotify", |
| "put on {playlist} spotify", |
| "play {playlist} on spotify", |
| "play {playlist} playlist from spotify", |
| "open {playlist} playlist spotify", |
| "spotify play my {playlist}", |
| "play music from my {playlist}", |
| "start playing my {playlist} spotify playlist", |
| "play my spotify {playlist}", |
| "put on my {playlist} playlist on spotify", |
| "start {playlist} on spotify", |
| "play the {playlist} playlist", |
| "spotify start {playlist}", |
| "queue up {playlist} on spotify", |
| ] |
|
|
| UBER_TEMPLATES = [ |
| "get an uber to {destination}", |
| "uber to {destination}", |
| "book uber to {destination}", |
| "call an uber to {destination}", |
| "request uber ride to {destination}", |
| "uber me to {destination}", |
| "get a ride to {destination} on uber", |
| "book a cab to {destination} via uber", |
| "uber to {destination} now", |
| "need an uber to {destination}", |
| "schedule uber to {destination}", |
| "ride to {destination} using uber", |
| "open uber and go to {destination}", |
| "find uber to {destination}", |
| "get ride to {destination}", |
| "uber for {destination} please", |
| "book an uber to {destination}", |
| "search and uber for {destination}", |
| ] |
|
|
| LINKEDIN_TEMPLATES = [ |
| "search {name} on linkedin", |
| "look up {name} on linkedin", |
| "find {name}'s linkedin profile", |
| "linkedin search for {name}", |
| "open linkedin and search {name}", |
| "can you find {name} on linkedin", |
| "search for {name} on linked in", |
| "look for {name} on linkedin", |
| "find {name} on linkedin", |
| "linkedin find {name}", |
| "search linkedin for {name}", |
| "pull up {name} on linkedin", |
| "show me {name} on linkedin", |
| "i need to find {name} on linkedin", |
| "help me search linkedin for {name}", |
| ] |
|
|
| SPOTIFY_SEARCH_TEMPLATES = [ |
| "search {query} on spotify and play", |
| "find {query} on spotify and play it", |
| "play {query} music on spotify", |
| "search for {query} on spotify and start playing", |
| "look up {query} tracks on spotify and play", |
| "spotify search {query} and play", |
| "find and play {query} on spotify", |
| "search spotify for {query} music and play", |
| "play some {query} on spotify", |
| "open spotify search {query} and play", |
| "search {query} playlist spotify play now", |
| "find {query} spotify play", |
| "spotify play {query} after searching", |
| "search and play {query} tracks spotify", |
| "put on {query} from spotify search", |
| ] |
|
|
| YOUTUBE_TEMPLATES = [ |
| "search {query} on youtube", |
| "look up {query} on youtube", |
| "find {query} videos on youtube", |
| "youtube search {query}", |
| "search for {query} on youtube", |
| "play {query} on youtube", |
| "show me {query} on youtube", |
| "open youtube and search {query}", |
| "find {query} on youtube", |
| "youtube {query}", |
| "look for {query} videos on youtube", |
| "search youtube for {query}", |
| "find videos about {query} on youtube", |
| "youtube look up {query}", |
| ] |
|
|
# Prompt templates for the contacts_search skill.
#
# Every template must contain the {contact} slot. A literal entry
# ("find parag shah in contacts") used to sit in this list; because the label
# is built from a randomly sampled contact, it produced prompts naming
# "parag shah" labelled with an unrelated contact. It was removed — the
# placeholder form "find {contact} in contacts" is already present.
CONTACTS_TEMPLATES = [
    "search {contact} in contacts",
    "find {contact} in contacts",
    "look up {contact} in my contacts",
    "search contact {contact}",
    "find {contact} contact",
    "search contacts for {contact}",
    "look for {contact} in phone contacts",
    "open contacts and search {contact}",
    "find {contact}'s number in contacts",
    "search my contacts for {contact}",
    "contacts find {contact}",
    "look up {contact} contact info",
    "search my phone book for {contact}",
    "contacts lookup {contact}",
    "find {contact} phone number contacts",
    "search address book {contact}",
    "open contacts search {contact}",
]
|
|
| GMAIL_TEMPLATES = [ |
| "send mail using gmail to {recipient} saying {message}", |
| "email {recipient} saying {message}", |
| "send gmail to {recipient} saying {message}", |
| "write mail to {recipient} saying {message}", |
| "compose an email to {recipient} {message}", |
| "email {recipient} {message} via gmail", |
| "compose gmail to {recipient} saying {message}", |
| "send email to {recipient} {message}", |
| "gmail {recipient} with {message}", |
| "write email to {recipient} saying {message}", |
| "send a gmail message {message} to {recipient}", |
| "email {recipient} from gmail {message}", |
| "compose email {message} to {recipient}", |
| "send {message} email to {recipient} gmail", |
| "shoot an email to {recipient} {message}", |
| "mail {recipient} with {message}", |
| "send gmail {message} to {recipient}", |
| "compose and send {message} to {recipient}", |
| ] |
|
|
| |
| |
| |
|
|
|
|
def load_schemas(path: Path) -> dict:
    """Read the UTF-8 file at *path* and return its parsed JSON content."""
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)
|
|
|
|
def load_skills(path: Path) -> list[dict]:
    """Parse a JSON Lines file into a list, one entry per non-blank line.

    Lines are stripped of surrounding whitespace before parsing; lines that
    are empty after stripping are ignored.
    """
    with path.open(encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(entry) for entry in stripped_lines if entry]
|
|
|
|
def format_intent(skill: str, parameters: dict) -> str:
    """Serialize an intent as compact JSON (no whitespace after separators).

    The object has exactly two keys, in this order: ``skill`` and
    ``parameters``.
    """
    payload = {"skill": skill, "parameters": parameters}
    compact_separators = (",", ":")
    return json.dumps(payload, separators=compact_separators)
|
|
|
|
def make_record(prompt: str, skill: str, parameters: dict) -> dict:
    """Wrap one example as a three-turn chat record.

    The record has a single ``messages`` key holding, in order: the system
    turn (``INTENT_SYSTEM_PROMPT``), the user turn (*prompt*), and the
    assistant turn (the intent serialized by ``format_intent``).
    """
    system_turn = {"role": "system", "content": INTENT_SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": prompt}
    assistant_turn = {"role": "assistant", "content": format_intent(skill, parameters)}
    return {"messages": [system_turn, user_turn, assistant_turn]}
|
|
|
|
def expand_combinations(
    templates: list[str],
    param_pools: dict[str, list[str]],
    limit: int,
    rng: random.Random,
) -> list[tuple[str, dict]]:
    """Sample unique (prompt, parameters) pairs without building the full product.

    Each attempt picks a random template and one random value per pool key,
    formats the prompt, and (about 30% of the time) wraps it with a random
    polite prefix and trailing filler. Prompts are de-duplicated
    case-insensitively.

    Only templates whose placeholders are exactly the keys of *param_pools*
    are used. A template that omits a key (or has no placeholder at all)
    would otherwise yield a label containing a value the prompt never
    mentions; a template with an unknown placeholder cannot be formatted.

    Args:
        templates: ``str.format`` templates with named placeholders.
        param_pools: Maps each placeholder name to its candidate values.
        limit: Maximum number of examples to return.
        rng: Random source; all sampling goes through it.

    Returns:
        Up to *limit* ``(prompt, params)`` tuples. Fewer are returned when
        ``limit * 50`` attempts do not produce enough unique prompts, and an
        empty list when *limit* is not positive, no template is usable, or
        any pool is empty.

    Raises:
        ValueError: If a template is not a valid format string.
    """
    # Function-scope import keeps this block self-contained.
    from string import Formatter

    # rng.choice raises IndexError on an empty sequence, so bail out early.
    if limit <= 0 or any(not pool for pool in param_pools.values()):
        return []

    required_fields = set(param_pools)
    formatter = Formatter()
    usable_templates = [
        template
        for template in templates
        if {
            field
            for _, field, _, _ in formatter.parse(template)
            if field is not None
        }
        == required_fields
    ]
    if not usable_templates:
        return []

    prefixes = ["please ", "can you ", "i need to ", ""]
    fillers = ["", " thanks", " asap", " please", " now"]
    examples: list[tuple[str, dict]] = []
    seen: set[str] = set()
    attempts = 0
    max_attempts = limit * 50  # bounds the loop when the pools are too small

    while len(examples) < limit and attempts < max_attempts:
        attempts += 1
        template = rng.choice(usable_templates)
        params = {key: rng.choice(pool) for key, pool in param_pools.items()}
        prompt = template.format(**params)
        if rng.random() > 0.7:
            prompt = f"{rng.choice(prefixes)}{prompt}{rng.choice(fillers)}"
        key = prompt.lower().strip()
        if not key or key in seen:
            continue
        seen.add(key)
        examples.append((prompt.strip(), params))

    return examples
|
|
|
|
def generate_no_param_examples(skill: str, limit: int, rng: random.Random) -> list[tuple[str, dict]]:
    """Build up to *limit* unique prompts for a skill that takes no parameters.

    Seed prompts come from ``NO_PARAM_PROMPTS[skill]`` and are shuffled with
    *rng*. Phase one adds each seed plus every single-prefix and
    single-suffix variant. If that is not enough, phase two samples random
    prefix + seed + suffix combinations (with an occasional trailing filler)
    for at most ``limit * 20`` attempts. Prompts are de-duplicated
    case-insensitively.

    Args:
        skill: Key into ``NO_PARAM_PROMPTS``.
        limit: Maximum number of examples to return.
        rng: Random source used for shuffling and sampling.

    Returns:
        ``(prompt, {})`` tuples, at most *limit* of them. An empty list is
        returned when *limit* is not positive or the skill has no seed
        prompts.
    """
    prompts = list(NO_PARAM_PROMPTS.get(skill, []))
    rng.shuffle(prompts)

    # Without seeds, rng.choice below would raise IndexError; with a negative
    # limit, result[:limit] would return a wrong-sized non-empty slice.
    if limit <= 0 or not prompts:
        return []

    prefixes = ["please ", "can you ", "i need to ", "help me ", "quickly ", "just "]
    suffixes = [" please", " now", " for me", " on my phone", " right away"]
    seen: set[str] = set()
    result: list[tuple[str, dict]] = []

    def add(prompt: str) -> None:
        """Record *prompt* unless it is blank or already seen (case-insensitive)."""
        key = prompt.lower().strip()
        if key and key not in seen:
            seen.add(key)
            result.append((prompt.strip(), {}))

    # Phase one: deterministic variants of every seed prompt.
    for prompt in prompts:
        add(prompt)
        for prefix in prefixes:
            add(f"{prefix}{prompt}")
        for suffix in suffixes:
            add(f"{prompt}{suffix}")
        # The cap is checked once per seed, so the slice trims any overshoot.
        if len(result) >= limit:
            return result[:limit]

    # Phase two: random combinations until the quota or attempt budget is hit.
    fillers = ["thanks", "asap", "when you can", "if possible", "real quick"]
    attempt = 0
    while len(result) < limit and attempt < limit * 20:
        attempt += 1
        base = rng.choice(prompts)
        variant = (
            f"{rng.choice(prefixes)}{base}{rng.choice(suffixes)}"
            f"{'' if rng.random() > 0.3 else ' ' + rng.choice(fillers)}"
        )
        add(variant)

    return result[:limit]
|
|
|
|
def generate_skill_examples(
    skill: str,
    limit: int,
    rng: random.Random,
) -> list[tuple[str, dict]]:
    """Produce up to *limit* ``(prompt, parameters)`` examples for *skill*.

    Skills listed in ``NO_PARAM_PROMPTS`` are delegated to
    ``generate_no_param_examples``. Every other known skill is expanded from
    its template list and parameter pools via ``expand_combinations``.
    A skill that is in neither table yields an empty list.
    """
    if skill in NO_PARAM_PROMPTS:
        return generate_no_param_examples(skill, limit, rng)

    # skill -> (prompt templates, placeholder name -> candidate values)
    recipes = {
        "create_alarm": (
            ALARM_TEMPLATES,
            {"time": ALARM_TIMES, "day": ALARM_DAYS},
        ),
        "calendar_create_event": (
            CALENDAR_TEMPLATES,
            {"title": CALENDAR_TITLES, "date": CALENDAR_DATES, "time": CALENDAR_TIMES},
        ),
        "whatsapp_send_message": (
            WHATSAPP_TEMPLATES,
            {"contact": CONTACTS, "message": MESSAGES},
        ),
        "slack_open_channel": (SLACK_TEMPLATES, {"channel": SLACK_CHANNELS}),
        "spotify_play_playlist": (PLAYLIST_TEMPLATES, {"playlist": PLAYLISTS}),
        "uber_request_ride": (UBER_TEMPLATES, {"destination": DESTINATIONS}),
        "linkedin_search_person": (LINKEDIN_TEMPLATES, {"name": LINKEDIN_NAMES}),
        "spotify_search_play": (SPOTIFY_SEARCH_TEMPLATES, {"query": SPOTIFY_QUERIES}),
        "youtube_search": (YOUTUBE_TEMPLATES, {"query": YOUTUBE_QUERIES}),
        "contacts_search": (CONTACTS_TEMPLATES, {"contact": CONTACTS}),
        "gmail_send_email": (
            GMAIL_TEMPLATES,
            {"recipient": EMAIL_RECIPIENTS, "message": EMAIL_MESSAGES},
        ),
    }

    recipe = recipes.get(skill)
    if recipe is None:
        return []

    skill_templates, skill_pools = recipe
    return expand_combinations(skill_templates, skill_pools, limit, rng)
|
|
|
|
def generate_contrastive_examples() -> list[dict]:
    """Build hard-negative records: the same entity routed to different skills.

    For each of five fixed names, eight prompts are emitted that differ only
    in the target app (contacts, linkedin, youtube, spotify), each labelled
    with the matching skill and parameter key. Eight fixed one-off examples
    are appended afterwards.
    """
    shared_entities = ("parag shah", "arya sheth", "john smith", "mom", "sarah")

    # (prompt pattern, skill, parameter key that receives the entity)
    per_entity_cases = (
        ("search {name} in contacts", "contacts_search", "contact"),
        ("find {name} in my contacts", "contacts_search", "contact"),
        ("search {name} on linkedin", "linkedin_search_person", "name"),
        ("find {name} on linkedin", "linkedin_search_person", "name"),
        ("search {name} on youtube", "youtube_search", "query"),
        ("look up {name} videos on youtube", "youtube_search", "query"),
        ("search {name} on spotify and play", "spotify_search_play", "query"),
        ("find {name} music on spotify and play it", "spotify_search_play", "query"),
    )

    # (prompt, skill, parameters) for the fixed one-off examples
    fixed_cases = (
        (
            "email boss saying i'll be late",
            "gmail_send_email",
            {"recipient": "boss", "message": "i'll be late"},
        ),
        (
            "message boss on whatsapp running late",
            "whatsapp_send_message",
            {"contact": "boss", "message": "running late"},
        ),
        (
            "open the engineering channel in slack",
            "slack_open_channel",
            {"channel": "engineering"},
        ),
        (
            "search pasta recipes on youtube",
            "youtube_search",
            {"query": "pasta recipes"},
        ),
        (
            "find parag shah in contacts",
            "contacts_search",
            {"contact": "parag shah"},
        ),
        (
            "send ri a message on whatsapp saying see you soon",
            "whatsapp_send_message",
            {"contact": "ri", "message": "see you soon"},
        ),
        (
            "play my workout playlist",
            "spotify_play_playlist",
            {"playlist": "workout"},
        ),
        (
            "wake me up tomorrow morning",
            "create_alarm",
            {"time": "7 am", "day": "tomorrow morning"},
        ),
    )

    records: list[dict] = [
        make_record(pattern.format(name=entity), skill, {param_key: entity})
        for entity in shared_entities
        for pattern, skill, param_key in per_entity_cases
    ]
    for prompt, skill, parameters in fixed_cases:
        records.append(make_record(prompt, skill, parameters))
    return records
|
|
|
|
def generate_eval_prompts(
    train_prompts: set[str],
    per_skill: int,
    rng: random.Random,
) -> list[dict]:
    """Select held-out evaluation prompts that do not occur in the training set.

    For every skill, the hand-written candidates are shuffled with *rng* and
    taken in that order, skipping any whose lowercased, stripped text is in
    *train_prompts*, until *per_skill* cases have been kept.

    Args:
        train_prompts: Training prompts to exclude. Membership is tested with
            the lowercased, stripped candidate, so entries must be normalized
            the same way.
        per_skill: Maximum number of cases to keep per skill. Zero or a
            negative value keeps none.
        rng: Random source used to shuffle each skill's candidates.

    Returns:
        A list of ``{"prompt": ..., "expected": {"skill": ..., "parameters": ...}}``
        dicts, grouped by skill in the order the skills are declared below.
    """
    eval_cases: list[dict] = []

    eval_templates = {
        "create_alarm": [
            ("set a 6 am alarm for monday", {"time": "6 am", "day": "monday"}),
            ("wake me up at 8:30 am next friday", {"time": "8:30 am", "day": "next friday"}),
            ("alarm tomorrow 5:30 am", {"time": "5:30 am", "day": "tomorrow"}),
            ("schedule 9 pm alarm tonight", {"time": "9 pm", "day": "tonight"}),
            ("i need a 7:15 am alarm wednesday", {"time": "7:15 am", "day": "wednesday"}),
            ("put alarm for 6:45 am saturday", {"time": "6:45 am", "day": "saturday"}),
            ("wake up at noon tomorrow", {"time": "noon", "day": "tomorrow"}),
        ],
        "calendar_create_event": [
            ("add team standup to my calendar tuesday 10 am", {"title": "team standup", "date": "tuesday", "time": "10 am"}),
            ("book dentist appointment next friday 3 pm", {"title": "dentist appointment", "date": "next friday", "time": "3 pm"}),
            ("put lunch with priya on calendar wednesday noon", {"title": "lunch with priya", "date": "wednesday", "time": "noon"}),
            ("schedule code review thursday 2 pm", {"title": "code review", "date": "thursday", "time": "2 pm"}),
            ("add flight to calendar sunday 8 am", {"title": "flight", "date": "sunday", "time": "8 am"}),
            ("create event hackathon demo monday 4 pm", {"title": "hackathon demo", "date": "monday", "time": "4 pm"}),
        ],
        "wifi_enable": [
            ("turn on wifi", {}),
            ("please enable wifi on my phone", {}),
            ("switch wlan on", {}),
            ("activate my wifi connection", {}),
            ("i need wifi enabled", {}),
            ("get wifi running please", {}),
        ],
        "bluetooth_enable": [
            ("turn bluetooth on", {}),
            ("enable bluetooth please", {}),
            ("switch on my bluetooth", {}),
            ("bluetooth should be on", {}),
            ("power on bluetooth now", {}),
            ("activate bluetooth radio", {}),
        ],
        "whatsapp_send_message": [
            ("text mom on whatsapp i'm on my way", {"contact": "mom", "message": "i'm on my way"}),
            ("whatsapp sarah saying thanks", {"contact": "sarah", "message": "thanks"}),
            ("message alex on whatsapp be there in 5", {"contact": "alex", "message": "be there in 5"}),
            ("send ri a message saying i'll be late", {"contact": "ri", "message": "i'll be late"}),
            ("tell biraj on whatsapp see you tonight", {"contact": "biraj", "message": "see you tonight"}),
            ("ping priya on whatsapp with hello", {"contact": "priya", "message": "hello"}),
        ],
        "camera_take_photo": [
            ("snap a quick photo", {}),
            ("open camera and take a picture", {}),
            ("capture a photo with the camera", {}),
            ("take a selfie now", {}),
            ("shoot a picture please", {}),
            ("launch camera and snap a pic", {}),
        ],
        "slack_open_channel": [
            ("go to general channel in slack", {"channel": "general"}),
            ("open slack channel random", {"channel": "random"}),
            ("navigate to design reviews slack channel", {"channel": "design reviews"}),
            ("open the engineering channel in slack", {"channel": "engineering"}),
            ("switch to announcements in slack", {"channel": "announcements"}),
            ("show me data contributors on slack", {"channel": "data contributors"}),
        ],
        "spotify_pause": [
            ("pause spotify playback", {}),
            ("stop the music on spotify", {}),
            ("hold spotify for now", {}),
            ("freeze spotify playback", {}),
            ("halt the spotify player", {}),
            ("spotify stop playing", {}),
        ],
        "spotify_play_playlist": [
            ("start my chill playlist on spotify", {"playlist": "chill"}),
            ("play discover weekly on spotify", {"playlist": "discover weekly"}),
            ("put on my liked songs spotify", {"playlist": "liked songs"}),
            ("play my workout playlist", {"playlist": "workout"}),
            ("queue up road trip on spotify", {"playlist": "road trip"}),
            ("start focus playlist spotify", {"playlist": "focus"}),
        ],
        "uber_request_ride": [
            ("get an uber to the airport", {"destination": "the airport"}),
            ("book uber to downtown", {"destination": "downtown"}),
            ("request a ride to central station on uber", {"destination": "central station"}),
            ("uber me to golden gate bridge", {"destination": "golden gate bridge"}),
            ("need a ride to city hospital via uber", {"destination": "city hospital"}),
            ("call uber to convention center", {"destination": "convention center"}),
        ],
        "linkedin_search_person": [
            ("look up jane doe on linkedin", {"name": "jane doe"}),
            ("find parag shah's linkedin profile", {"name": "parag shah"}),
            ("search linkedin for john smith", {"name": "john smith"}),
            ("pull up sophie martin on linkedin", {"name": "sophie martin"}),
            ("find david chen on linkedin", {"name": "david chen"}),
            ("linkedin search nina patel", {"name": "nina patel"}),
        ],
        "spotify_search_play": [
            ("find jazz on spotify and play it", {"query": "jazz"}),
            ("search lo fi beats on spotify and play them", {"query": "lo fi beats"}),
            ("spotify search taylor swift and play", {"query": "taylor swift"}),
            ("search k-pop on spotify and play", {"query": "k-pop"}),
            ("find ambient music on spotify and play", {"query": "ambient"}),
            ("play reggaeton after searching spotify", {"query": "reggaeton"}),
        ],
        "youtube_search": [
            ("search pasta recipes on youtube", {"query": "pasta recipes"}),
            ("find workout videos on youtube", {"query": "workout videos"}),
            ("look up python tutorials on youtube", {"query": "python tutorials"}),
            ("youtube search morning routine", {"query": "morning routine"}),
            ("find korean street food on youtube", {"query": "korean street food"}),
            ("search machine learning basics on youtube", {"query": "machine learning basics"}),
        ],
        "contacts_search": [
            ("find mom in my contacts", {"contact": "mom"}),
            ("search contacts for dad", {"contact": "dad"}),
            ("look up john smith in contacts", {"contact": "john smith"}),
            ("find dentist office in contacts", {"contact": "dentist office"}),
            ("search my contacts for maya", {"contact": "maya"}),
            ("contacts lookup ri", {"contact": "ri"}),
        ],
        "gmail_send_email": [
            ("email boss saying i'll be late", {"recipient": "boss", "message": "i'll be late"}),
            ("send gmail to sarah@gmail.com subject meeting notes", {"recipient": "sarah@gmail.com", "message": "meeting notes"}),
            ("compose email to client proposal attached", {"recipient": "client", "message": "proposal attached"}),
            ("write mail to hr@company.com vacation request", {"recipient": "hr@company.com", "message": "vacation request"}),
            ("send email to professor asking about assignment", {"recipient": "professor", "message": "asking about assignment"}),
            ("gmail team@work.com weekly report", {"recipient": "team@work.com", "message": "weekly report"}),
        ],
    }

    for skill, candidates in eval_templates.items():
        # Shuffle unconditionally so RNG consumption does not depend on per_skill.
        rng.shuffle(candidates)
        kept = 0
        for prompt, params in candidates:
            # Check the cap BEFORE appending so per_skill <= 0 keeps nothing.
            if kept >= per_skill:
                break
            if prompt.lower().strip() in train_prompts:
                continue
            eval_cases.append(
                {
                    "prompt": prompt,
                    "expected": {"skill": skill, "parameters": params},
                }
            )
            kept += 1

    return eval_cases
|
|
|
|
def main() -> None:
    """Generate the intent training set and the held-out eval prompts.

    Parses the command-line options, builds training examples for every
    skill that has a schema, appends the contrastive examples, and drops
    prompts that repeat (compared lower-cased and stripped). The records are
    shuffled with the seeded RNG and written to ``TRAIN_OUTPUT`` as JSON
    Lines; the eval prompts are written to ``EVAL_OUTPUT`` as indented JSON.

    A per-skill summary is printed at the end. Skills whose unique example
    count is below ``--examples-per-skill`` are marked ``LOW`` -- including
    skills that ended up with no examples at all.
    """
    parser = argparse.ArgumentParser(description="Generate intent extraction dataset.")
    parser.add_argument(
        "--examples-per-skill",
        type=int,
        default=DEFAULT_EXAMPLES_PER_SKILL,
        help=f"Training examples per skill (default: {DEFAULT_EXAMPLES_PER_SKILL})",
    )
    parser.add_argument(
        "--eval-per-skill",
        type=int,
        default=EVAL_PROMPTS_PER_SKILL,
        help=f"Eval prompts per skill (default: {EVAL_PROMPTS_PER_SKILL})",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=RANDOM_SEED,
        help=f"Random seed (default: {RANDOM_SEED})",
    )
    args = parser.parse_args()

    # One seeded RNG is threaded through every randomised step; the order of
    # the calls below must stay fixed for a given seed to reproduce the files.
    rng = random.Random(args.seed)
    schemas = load_schemas(SCHEMAS_FILE)
    skills = load_skills(SKILLS_FILE)

    records: list[dict] = []
    train_prompts: set[str] = set()
    skill_counts: dict[str, int] = {}

    def claim_prompt(prompt: str) -> bool:
        """Reserve *prompt* for the training set; False if already taken."""
        key = prompt.lower().strip()
        if key in train_prompts:
            return False
        train_prompts.add(key)
        return True

    for entry in skills:
        skill = entry["skill"]
        if skill not in schemas:
            print(f"Warning: no schema for skill {skill}, skipping")
            continue

        # Register the skill before generating so that a skill yielding zero
        # unique examples still appears in the summary and is flagged LOW.
        skill_counts.setdefault(skill, 0)

        examples = generate_skill_examples(skill, args.examples_per_skill, rng)
        for prompt, params in examples:
            if not claim_prompt(prompt):
                continue
            records.append(make_record(prompt, skill, params))
            skill_counts[skill] += 1

    for record in generate_contrastive_examples():
        # messages[1] is read as the prompt, messages[2] as the JSON answer.
        if not claim_prompt(record["messages"][1]["content"]):
            continue
        records.append(record)
        skill = json.loads(record["messages"][2]["content"])["skill"]
        skill_counts[skill] = skill_counts.get(skill, 0) + 1

    rng.shuffle(records)

    with TRAIN_OUTPUT.open("w", encoding="utf-8") as handle:
        for record in records:
            handle.write(json.dumps(record) + "\n")

    # Eval prompts are generated after training so they can be checked
    # against the full set of training prompts.
    eval_prompts = generate_eval_prompts(train_prompts, args.eval_per_skill, rng)
    with EVAL_OUTPUT.open("w", encoding="utf-8") as handle:
        json.dump(eval_prompts, handle, indent=2)
        handle.write("\n")

    print(f"Wrote {len(records)} training examples to {TRAIN_OUTPUT}")
    print(f"Wrote {len(eval_prompts)} eval prompts to {EVAL_OUTPUT}")
    print(f"Skills: {len(skill_counts)}")

    # Evaluate the threshold once so the listing and the warning always agree.
    low_skills: list[str] = []
    for skill, count in sorted(skill_counts.items()):
        if count >= args.examples_per_skill:
            status = "OK"
        else:
            status = "LOW"
            low_skills.append(skill)
        print(f"  {skill}: {count} [{status}]")

    if low_skills:
        print(f"\nWarning: {len(low_skills)} skills below target count: {', '.join(low_skills)}")
|
|
|
# Run the generator only when this file is executed as a script, so that
# importing the module does not trigger dataset generation.
if __name__ == "__main__":
    main()
|
|