#!/usr/bin/env python3 """Generate intent-extraction training data with skill + parameters. Produces: data/train_intent.jsonl — SFT examples for Unsloth (1000+ per skill) data/eval_intent_prompts.json — held-out evaluation prompts with expected intents Usage: python scripts/generate_intent_dataset.py python scripts/generate_intent_dataset.py --examples-per-skill 1200 """ from __future__ import annotations import argparse import json import random import sys from pathlib import Path PROJECT_ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(PROJECT_ROOT)) from src.classifier_prompt import INTENT_SYSTEM_PROMPT # noqa: E402 SCHEMAS_FILE = PROJECT_ROOT / "data" / "skill_schemas.json" SKILLS_FILE = PROJECT_ROOT / "data" / "skills.jsonl" TRAIN_OUTPUT = PROJECT_ROOT / "data" / "train_intent.jsonl" EVAL_OUTPUT = PROJECT_ROOT / "data" / "eval_intent_prompts.json" DEFAULT_EXAMPLES_PER_SKILL = 1000 EVAL_PROMPTS_PER_SKILL = 6 RANDOM_SEED = 42 # --------------------------------------------------------------------------- # Synthetic entity pools # --------------------------------------------------------------------------- CONTACTS = [ "Ri", "Biraj", "Sarah", "Alex", "Mom", "Dad", "Priya", "Maya", "Rahul", "Neha", "Sam", "Anita", "Karan", "Boss", "John Smith", "Parag Shah", "Arya Sheth", "Jane Doe", "Emma Wilson", "David Chen", "Lisa Park", "Michael Brown", "Sophie Martin", "James Lee", "Nina Patel", "Tom", "Ravi", "Sneha", "Vikram", "Ananya", "Chris", "Jordan", "Taylor", "Morgan", "Casey", "Jamie", "Riley", "Avery", "Quinn", "Drew", "grandma", "uncle raj", "aunt meera", "my roommate", "the landlord", "dentist office", "doctor", "plumber", "electrician", ] MESSAGES = [ "I'll be late", "see you soon", "thanks", "good morning", "on my way", "be there in 5", "running late", "call me", "what's up", "good night", "hello", "hi there", "ok", "sounds good", "got it", "no worries", "can we reschedule", "meeting moved to 3 pm", "i'm stuck in traffic", "leaving now", "almost there", "happy birthday", "congrats", "let me know when you're free", "pick up milk on the way home", "dinner at 8", "see you tonight", "thanks for your help", "project update attached", "weekly report is ready", "i'll send the docs tomorrow", "running 10 minutes late", "are you available for a call", "let's catch up this weekend", "the package arrived", "payment sent", "invoice attached", "reminder for tomorrow's meeting", "flight lands at 6 pm", ] ALARM_TIMES = [ "5 am", "5:30 am", "6 am", "6:15 am", "6:30 am", "7 am", "7:30 am", "8 am", "8:15 am", "9 am", "10 am", "11 am", "noon", "12:30 pm", "1 pm", "2 pm", "3 pm", "4 pm", "5 pm", "6 pm", "7 pm", "8 pm", "9 pm", "10 pm", "11 pm", ] ALARM_DAYS = [ "today", "tomorrow", "tomorrow morning", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "next monday", "next tuesday", "next friday", "this weekend", ] CALENDAR_TITLES = [ "team standup", "dentist appointment", "lunch with alex", "project review", "doctor visit", "interview", "gym session", "coffee with sam", "presentation", "flight", "call with client", "dinner reservation", "study session", "yoga class", "1 on 1 with manager", "birthday party", "meeting with kriyanshi", "sync with team", "board meeting", "parent teacher conference", "car service", "vet appointment", "book club", "therapy session", "haircut", "tax appointment", ] CALENDAR_DATES = [ "tomorrow", "today", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "next tuesday", "next friday", "next monday", "this weekend", "next week", ] CALENDAR_TIMES = [ "7 am", "8 am", "9 am", "10 am", "11 am", "noon", "1 pm", "2 pm", "3 pm", "4 pm", "5 pm", "6 pm", "7 pm", "8 pm", ] PLAYLISTS = [ "liked songs", "workout", "chill", "edm", "discover weekly", "daily mix", "release radar", "party", "road trip", "morning", "focus", "jazz", "top hits", "running", "study", "acoustic", "hip hop", "classical", "bollywood", "lo fi", "rain sounds", "sleep", "meditation", "rock", "indie", "pop hits", "throwback", "summer vibes", "late night", "gym pump", "coding focus", "driving", "cooking", "cleaning", ] YOUTUBE_QUERIES = [ "pasta recipes", "workout videos", "python tutorials", "cat videos", "lo fi beats", "travel vlogs", "meditation music", "guitar lessons", "phone review", "stand up comedy", "how to bake bread", "documentary", "asmr videos", "yoga classes", "morning routine", "news", "diy crafts", "ghibli food", "korean street food", "home renovation", "machine learning basics", "react tutorial", "interview tips", "stretching routine", "origami tutorial", "watercolor painting", "budget travel tips", "meal prep ideas", "indoor plants care", "car maintenance", "photography tips", "skincare routine", ] SPOTIFY_QUERIES = [ "jazz", "lo fi beats", "taylor swift", "classical music", "workout music", "bollywood songs", "rock music", "chill vibes", "drake", "hip hop", "acoustic covers", "study music", "pop hits", "rain sounds", "edm", "indie folk", "synthwave", "reggaeton", "k-pop", "country hits", "piano covers", "ambient", "80s hits", "90s throwback", "focus beats", ] SLACK_CHANNELS = [ "general", "engineering", "data contributors", "random", "announcements", "product team", "design reviews", "support tickets", "marketing", "devops alerts", "project alpha", "team updates", "hiring", "backend team", "daily standup", "sales", "customer success", "incidents", "on-call", "frontend", "mobile", "qa", "releases", "watercooler", "leadership", ] DESTINATIONS = [ "the airport", "downtown", "123 main street", "my office", "central station", "home", "the mall", "union square", "marina bay", "work", "the restaurant", "the train station", "pratishtha apartment unnamed road", "grand central", "times square", "golden gate bridge", "hotel california", "city hospital", "university campus", "grocery store", "gym", "coffee shop on 5th ave", "convention center", "stadium", "library", "pharmacy", "post office", ] LINKEDIN_NAMES = [ "arya sheth", "parag shah", "john smith", "jane doe", "priya mehta", "david chen", "emma wilson", "michael brown", "sophie martin", "james lee", "nina patel", "sarah johnson", "robert kim", "lisa anderson", "mark taylor", "amanda white", "chris evans", "rachel green", "kevin hart", "olivia brown", ] EMAIL_RECIPIENTS = [ "boss", "team", "mom", "hr", "client", "professor", "partner", "john@company.com", "sarah@gmail.com", "team@work.com", "hr@company.com", "alex@gmail.com", "client@startup.io", "professor@university.edu", "kriyanshishah06@gmail.com", "contact@gmail.com", "partner@gmail.com", "manager@corp.com", "support@service.com", "billing@company.com", "recruiter@jobs.com", "design@agency.com", "dev@startup.io", ] EMAIL_MESSAGES = [ "project update", "i'll be late", "hello", "thanks for your help", "weekly report", "vacation request", "proposal attached", "meeting notes", "follow up", "hello-world", "invoice attached", "contract for review", "interview confirmation", "quarterly results", "onboarding docs", "feedback on the design", "can we reschedule", "out of office notice", "happy birthday", "congratulations on the promotion", ] NO_PARAM_PROMPTS: dict[str, list[str]] = { "wifi_enable": [ "enable wifi", "turn on wifi", "switch on wifi", "activate wifi", "wifi on", "turn wifi on please", "enable wifi on my phone", "switch wifi on", "start wifi", "put wifi on", "can you turn on wifi", "wifi enable", "turn on my wifi", "enable wireless network", "activate wifi connection", "switch on my wifi", "wifi on now", "please enable wifi", "turn wifi back on", "enable wifi settings", "flip wifi on", "get wifi running", "power on wifi", "i need wifi on", "enable wlan", "turn wlan on", "wifi should be on", "make sure wifi is enabled", "set wifi to on", "connect to wifi", "please turn on wifi", "wifi needs to be on", "switch wlan on now", "turn the wifi on", "enable my wifi connection", "get wifi on", "wifi activation please", "power up wifi", "wifi switch on", ], "bluetooth_enable": [ "turn on bluetooth", "enable bluetooth", "switch on bluetooth", "activate bluetooth", "bluetooth on", "turn bluetooth on please", "enable bluetooth on my phone", "switch bluetooth on", "start bluetooth", "put bluetooth on", "can you turn on bluetooth", "bluetooth enable", "turn on my bluetooth", "enable the bluetooth radio", "activate bluetooth connection", "switch on my bluetooth", "bluetooth on now", "please enable bluetooth", "turn bluetooth back on", "enable bluetooth settings", "flip bluetooth on", "get bluetooth running", "power on bluetooth", "connect bluetooth turn it on", "i need bluetooth on", "enable bt", "turn bt on", "bluetooth should be on", "make sure bluetooth is on", "set bluetooth to on", "turn bluetooth on now", "bluetooth needs to be enabled", "switch bt on", "activate my bluetooth", ], "spotify_pause": [ "pause spotify", "stop spotify music", "pause the song on spotify", "stop playing on spotify", "pause playback spotify", "hold the music on spotify", "pause spotify playback", "stop spotify for now", "pause what's playing on spotify", "mute spotify pause it", "pause my spotify music", "stop the music spotify", "pause current track spotify", "spotify pause", "pause the spotify player", "stop spotify song", "pause spotify please", "halt spotify music", "pause spotify now", "stop playback on spotify", "pause the audio on spotify", "spotify stop playing", "pause my song on spotify", "freeze spotify playback", "pause spotify music player", "stop spotify temporarily", "pause whatever is on spotify", "spotify pause music", "hold spotify", "stop the spotify track", "pause music on spotify", "silence spotify", ], "camera_take_photo": [ "take a photo with the camera", "open camera and take a picture", "snap a photo", "click a picture with camera", "take a picture now", "open the camera app and shoot", "capture a photo", "take a selfie", "open camera and snap a pic", "shoot a picture", "click a photo please", "take photo with rear camera", "open camera take picture", "snap a quick photo", "capture an image with camera", "take a picture of this", "open camera and photograph", "click picture using camera", "take a shot with camera", "launch camera and take photo", "grab a photo with camera", "take a camera picture", "open camera snap photo", "photograph this with camera", "take pic with camera app", "shoot photo now", "camera open and click picture", "take a quick picture", "open camera capture photo", "open camera and click a picture", ], } # --------------------------------------------------------------------------- # Prompt templates (placeholders: {contact}, {message}, {time}, {day}, etc.) # --------------------------------------------------------------------------- ALARM_TEMPLATES = [ "create alarm for {time} {day}", "set alarm for {time} {day}", "wake me up at {time} {day}", "alarm {day} {time}", "set a {time} alarm for {day}", "put an alarm for {time} {day}", "schedule alarm {time} {day}", "set wake up alarm {time} {day}", "alarm at {time} {day}", "set my alarm for {time} {day}", "need alarm {day} {time}", "{time} alarm {day} please", "wake up alarm {day} {time}", "alarm me at {time} {day}", "wake me at {time} {day}", "alarm for {time} {day}", "schedule {time} wake up {day}", "remind me at {time} {day}", "wake me up at {time}", "set an alarm for {time} {day}", "can you set alarm {time} {day}", "i need to wake up at {time} {day}", "please alarm {time} {day}", "set {day} morning alarm {time}", ] CALENDAR_TEMPLATES = [ "create calendar event for {date} {time} {title}", "add a meeting {date} at {time} {title}", "schedule {title} {date} {time}", "put {title} on my calendar {date} at {time}", "create event {date} {time} {title}", "add calendar event {date} {time} {title}", "book a meeting {date} at {time} {title}", "create a calendar entry {date} {time} {title}", "set up a meeting {date} {time} {title}", "calendar meeting {date} {time} {title}", "add {title} to calendar {date} {time}", "schedule {title} on {date} at {time}", "put {title} on calendar {date} {time}", "create meeting {date} {time} {title}", "add {title} appointment {date} {time}", ] WHATSAPP_TEMPLATES = [ "message {message} to {contact} on whatsapp", "text {message} to {contact} on whatsapp", "send {message} to {contact} on whatsapp", "whatsapp {contact} saying {message}", "message {contact} on whatsapp {message}", "text {contact} {message} on whatsapp", "send a whatsapp to {contact} saying {message}", "whatsapp message {contact} {message}", "tell {contact} {message} on whatsapp", "shoot {contact} a whatsapp saying {message}", "send {message} via whatsapp to {contact}", "drop {contact} a whatsapp saying {message}", "ping {contact} on whatsapp with {message}", "write to {contact} on whatsapp {message}", "whatsapp {contact} {message}", "send {contact} a whatsapp text saying {message}", "text {contact} on whatsapp saying {message}", ] SLACK_TEMPLATES = [ "open slack channel {channel}", "go to {channel} channel in slack", "open the {channel} slack channel", "switch to {channel} in slack", "show me {channel} channel on slack", "navigate to {channel} on slack", "open slack {channel}", "pull up {channel} slack channel", "take me to {channel} on slack", "open channel {channel} in slack", "slack open {channel}", "go to slack channel {channel}", "open the slack channel called {channel}", "show {channel} slack", "open the {channel} channel in slack", "navigate to {channel} slack channel", "switch to #{channel} in slack", ] PLAYLIST_TEMPLATES = [ "play my {playlist} on spotify", "start my {playlist} playlist spotify", "put on {playlist} spotify", "play {playlist} on spotify", "play {playlist} playlist from spotify", "open {playlist} playlist spotify", "spotify play my {playlist}", "play music from my {playlist}", "start playing my {playlist} spotify playlist", "play my spotify {playlist}", "put on my {playlist} playlist on spotify", "start {playlist} on spotify", "play the {playlist} playlist", "spotify start {playlist}", "queue up {playlist} on spotify", ] UBER_TEMPLATES = [ "get an uber to {destination}", "uber to {destination}", "book uber to {destination}", "call an uber to {destination}", "request uber ride to {destination}", "uber me to {destination}", "get a ride to {destination} on uber", "book a cab to {destination} via uber", "uber to {destination} now", "need an uber to {destination}", "schedule uber to {destination}", "ride to {destination} using uber", "open uber and go to {destination}", "find uber to {destination}", "get ride to {destination}", "uber for {destination} please", "book an uber to {destination}", "search and uber for {destination}", ] LINKEDIN_TEMPLATES = [ "search {name} on linkedin", "look up {name} on linkedin", "find {name}'s linkedin profile", "linkedin search for {name}", "open linkedin and search {name}", "can you find {name} on linkedin", "search for {name} on linked in", "look for {name} on linkedin", "find {name} on linkedin", "linkedin find {name}", "search linkedin for {name}", "pull up {name} on linkedin", "show me {name} on linkedin", "i need to find {name} on linkedin", "help me search linkedin for {name}", ] SPOTIFY_SEARCH_TEMPLATES = [ "search {query} on spotify and play", "find {query} on spotify and play it", "play {query} music on spotify", "search for {query} on spotify and start playing", "look up {query} tracks on spotify and play", "spotify search {query} and play", "find and play {query} on spotify", "search spotify for {query} music and play", "play some {query} on spotify", "open spotify search {query} and play", "search {query} playlist spotify play now", "find {query} spotify play", "spotify play {query} after searching", "search and play {query} tracks spotify", "put on {query} from spotify search", ] YOUTUBE_TEMPLATES = [ "search {query} on youtube", "look up {query} on youtube", "find {query} videos on youtube", "youtube search {query}", "search for {query} on youtube", "play {query} on youtube", "show me {query} on youtube", "open youtube and search {query}", "find {query} on youtube", "youtube {query}", "look for {query} videos on youtube", "search youtube for {query}", "find videos about {query} on youtube", "youtube look up {query}", ] CONTACTS_TEMPLATES = [ "search {contact} in contacts", "find {contact} in contacts", "look up {contact} in my contacts", "search contact {contact}", "find {contact} contact", "search contacts for {contact}", "look for {contact} in phone contacts", "open contacts and search {contact}", "find {contact}'s number in contacts", "search my contacts for {contact}", "contacts find {contact}", "look up {contact} contact info", "search my phone book for {contact}", "contacts lookup {contact}", "find {contact} phone number contacts", "search address book {contact}", "open contacts search {contact}", "find parag shah in contacts", ] GMAIL_TEMPLATES = [ "send mail using gmail to {recipient} saying {message}", "email {recipient} saying {message}", "send gmail to {recipient} saying {message}", "write mail to {recipient} saying {message}", "compose an email to {recipient} {message}", "email {recipient} {message} via gmail", "compose gmail to {recipient} saying {message}", "send email to {recipient} {message}", "gmail {recipient} with {message}", "write email to {recipient} saying {message}", "send a gmail message {message} to {recipient}", "email {recipient} from gmail {message}", "compose email {message} to {recipient}", "send {message} email to {recipient} gmail", "shoot an email to {recipient} {message}", "mail {recipient} with {message}", "send gmail {message} to {recipient}", "compose and send {message} to {recipient}", ] # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def load_schemas(path: Path) -> dict: with path.open(encoding="utf-8") as handle: return json.load(handle) def load_skills(path: Path) -> list[dict]: skills = [] with path.open(encoding="utf-8") as handle: for line in handle: line = line.strip() if line: skills.append(json.loads(line)) return skills def format_intent(skill: str, parameters: dict) -> str: return json.dumps({"skill": skill, "parameters": parameters}, separators=(",", ":")) def make_record(prompt: str, skill: str, parameters: dict) -> dict: return { "messages": [ {"role": "system", "content": INTENT_SYSTEM_PROMPT}, {"role": "user", "content": prompt}, {"role": "assistant", "content": format_intent(skill, parameters)}, ] } def expand_combinations( templates: list[str], param_pools: dict[str, list[str]], limit: int, rng: random.Random, ) -> list[tuple[str, dict]]: """Sample template × parameter combinations without building the full cartesian product.""" pool_keys = list(param_pools.keys()) examples: list[tuple[str, dict]] = [] seen: set[str] = set() attempts = 0 max_attempts = limit * 50 fillers = ["", " thanks", " asap", " please", " now"] while len(examples) < limit and attempts < max_attempts: attempts += 1 template = rng.choice(templates) params = {key: rng.choice(pool) for key, pool in param_pools.items()} try: prompt = template.format(**params) except KeyError: continue if rng.random() > 0.7: prompt = f"{rng.choice(['please ', 'can you ', 'i need to ', ''] )}{prompt}{rng.choice(fillers)}" key = prompt.lower().strip() if not key or key in seen: continue seen.add(key) examples.append((prompt.strip(), params)) return examples def generate_no_param_examples(skill: str, limit: int, rng: random.Random) -> list[tuple[str, dict]]: prompts = list(NO_PARAM_PROMPTS.get(skill, [])) rng.shuffle(prompts) prefixes = ["please ", "can you ", "i need to ", "help me ", "quickly ", "just "] suffixes = [" please", " now", " for me", " on my phone", " right away"] seen: set[str] = set() result: list[tuple[str, dict]] = [] def add(prompt: str) -> bool: key = prompt.lower().strip() if not key or key in seen: return False seen.add(key) result.append((prompt.strip(), {})) return True for prompt in prompts: add(prompt) for prefix in prefixes: add(f"{prefix}{prompt}") for suffix in suffixes: add(f"{prompt}{suffix}") if len(result) >= limit: return result[:limit] fillers = ["thanks", "asap", "when you can", "if possible", "real quick"] attempt = 0 while len(result) < limit and attempt < limit * 20: attempt += 1 base = rng.choice(prompts) variant = ( f"{rng.choice(prefixes)}{base}{rng.choice(suffixes)}" f"{'' if rng.random() > 0.3 else ' ' + rng.choice(fillers)}" ) add(variant) return result[:limit] def generate_skill_examples( skill: str, limit: int, rng: random.Random, ) -> list[tuple[str, dict]]: if skill in NO_PARAM_PROMPTS: return generate_no_param_examples(skill, limit, rng) generators = { "create_alarm": lambda: expand_combinations( ALARM_TEMPLATES, {"time": ALARM_TIMES, "day": ALARM_DAYS}, limit, rng, ), "calendar_create_event": lambda: expand_combinations( CALENDAR_TEMPLATES, {"title": CALENDAR_TITLES, "date": CALENDAR_DATES, "time": CALENDAR_TIMES}, limit, rng, ), "whatsapp_send_message": lambda: expand_combinations( WHATSAPP_TEMPLATES, {"contact": CONTACTS, "message": MESSAGES}, limit, rng, ), "slack_open_channel": lambda: expand_combinations( SLACK_TEMPLATES, {"channel": SLACK_CHANNELS}, limit, rng, ), "spotify_play_playlist": lambda: expand_combinations( PLAYLIST_TEMPLATES, {"playlist": PLAYLISTS}, limit, rng, ), "uber_request_ride": lambda: expand_combinations( UBER_TEMPLATES, {"destination": DESTINATIONS}, limit, rng, ), "linkedin_search_person": lambda: expand_combinations( LINKEDIN_TEMPLATES, {"name": LINKEDIN_NAMES}, limit, rng, ), "spotify_search_play": lambda: expand_combinations( SPOTIFY_SEARCH_TEMPLATES, {"query": SPOTIFY_QUERIES}, limit, rng, ), "youtube_search": lambda: expand_combinations( YOUTUBE_TEMPLATES, {"query": YOUTUBE_QUERIES}, limit, rng, ), "contacts_search": lambda: expand_combinations( CONTACTS_TEMPLATES, {"contact": CONTACTS}, limit, rng, ), "gmail_send_email": lambda: expand_combinations( GMAIL_TEMPLATES, {"recipient": EMAIL_RECIPIENTS, "message": EMAIL_MESSAGES}, limit, rng, ), } generator = generators.get(skill) if not generator: return [] return generator() def generate_contrastive_examples() -> list[dict]: """Hard negatives: same entity, different app → different skill + parameters.""" records: list[dict] = [] names = ["parag shah", "arya sheth", "john smith", "mom", "sarah"] for name in names: records.extend( [ make_record(f"search {name} in contacts", "contacts_search", {"contact": name}), make_record(f"find {name} in my contacts", "contacts_search", {"contact": name}), make_record(f"search {name} on linkedin", "linkedin_search_person", {"name": name}), make_record(f"find {name} on linkedin", "linkedin_search_person", {"name": name}), make_record(f"search {name} on youtube", "youtube_search", {"query": name}), make_record(f"look up {name} videos on youtube", "youtube_search", {"query": name}), make_record( f"search {name} on spotify and play", "spotify_search_play", {"query": name}, ), make_record( f"find {name} music on spotify and play it", "spotify_search_play", {"query": name}, ), ] ) records.extend( [ make_record( "email boss saying i'll be late", "gmail_send_email", {"recipient": "boss", "message": "i'll be late"}, ), make_record( "message boss on whatsapp running late", "whatsapp_send_message", {"contact": "boss", "message": "running late"}, ), make_record( "open the engineering channel in slack", "slack_open_channel", {"channel": "engineering"}, ), make_record( "search pasta recipes on youtube", "youtube_search", {"query": "pasta recipes"}, ), make_record( "find parag shah in contacts", "contacts_search", {"contact": "parag shah"}, ), make_record( "send ri a message on whatsapp saying see you soon", "whatsapp_send_message", {"contact": "ri", "message": "see you soon"}, ), make_record( "play my workout playlist", "spotify_play_playlist", {"playlist": "workout"}, ), make_record( "wake me up tomorrow morning", "create_alarm", {"time": "7 am", "day": "tomorrow morning"}, ), ] ) return records def generate_eval_prompts( train_prompts: set[str], per_skill: int, rng: random.Random, ) -> list[dict]: """Generate held-out eval prompts not present in training.""" eval_cases: list[dict] = [] eval_templates = { "create_alarm": [ ("set a 6 am alarm for monday", {"time": "6 am", "day": "monday"}), ("wake me up at 8:30 am next friday", {"time": "8:30 am", "day": "next friday"}), ("alarm tomorrow 5:30 am", {"time": "5:30 am", "day": "tomorrow"}), ("schedule 9 pm alarm tonight", {"time": "9 pm", "day": "tonight"}), ("i need a 7:15 am alarm wednesday", {"time": "7:15 am", "day": "wednesday"}), ("put alarm for 6:45 am saturday", {"time": "6:45 am", "day": "saturday"}), ("wake up at noon tomorrow", {"time": "noon", "day": "tomorrow"}), ], "calendar_create_event": [ ("add team standup to my calendar tuesday 10 am", {"title": "team standup", "date": "tuesday", "time": "10 am"}), ("book dentist appointment next friday 3 pm", {"title": "dentist appointment", "date": "next friday", "time": "3 pm"}), ("put lunch with priya on calendar wednesday noon", {"title": "lunch with priya", "date": "wednesday", "time": "noon"}), ("schedule code review thursday 2 pm", {"title": "code review", "date": "thursday", "time": "2 pm"}), ("add flight to calendar sunday 8 am", {"title": "flight", "date": "sunday", "time": "8 am"}), ("create event hackathon demo monday 4 pm", {"title": "hackathon demo", "date": "monday", "time": "4 pm"}), ], "wifi_enable": [ ("turn on wifi", {}), ("please enable wifi on my phone", {}), ("switch wlan on", {}), ("activate my wifi connection", {}), ("i need wifi enabled", {}), ("get wifi running please", {}), ], "bluetooth_enable": [ ("turn bluetooth on", {}), ("enable bluetooth please", {}), ("switch on my bluetooth", {}), ("bluetooth should be on", {}), ("power on bluetooth now", {}), ("activate bluetooth radio", {}), ], "whatsapp_send_message": [ ("text mom on whatsapp i'm on my way", {"contact": "mom", "message": "i'm on my way"}), ("whatsapp sarah saying thanks", {"contact": "sarah", "message": "thanks"}), ("message alex on whatsapp be there in 5", {"contact": "alex", "message": "be there in 5"}), ("send ri a message saying i'll be late", {"contact": "ri", "message": "i'll be late"}), ("tell biraj on whatsapp see you tonight", {"contact": "biraj", "message": "see you tonight"}), ("ping priya on whatsapp with hello", {"contact": "priya", "message": "hello"}), ], "camera_take_photo": [ ("snap a quick photo", {}), ("open camera and take a picture", {}), ("capture a photo with the camera", {}), ("take a selfie now", {}), ("shoot a picture please", {}), ("launch camera and snap a pic", {}), ], "slack_open_channel": [ ("go to general channel in slack", {"channel": "general"}), ("open slack channel random", {"channel": "random"}), ("navigate to design reviews slack channel", {"channel": "design reviews"}), ("open the engineering channel in slack", {"channel": "engineering"}), ("switch to announcements in slack", {"channel": "announcements"}), ("show me data contributors on slack", {"channel": "data contributors"}), ], "spotify_pause": [ ("pause spotify playback", {}), ("stop the music on spotify", {}), ("hold spotify for now", {}), ("freeze spotify playback", {}), ("halt the spotify player", {}), ("spotify stop playing", {}), ], "spotify_play_playlist": [ ("start my chill playlist on spotify", {"playlist": "chill"}), ("play discover weekly on spotify", {"playlist": "discover weekly"}), ("put on my liked songs spotify", {"playlist": "liked songs"}), ("play my workout playlist", {"playlist": "workout"}), ("queue up road trip on spotify", {"playlist": "road trip"}), ("start focus playlist spotify", {"playlist": "focus"}), ], "uber_request_ride": [ ("get an uber to the airport", {"destination": "the airport"}), ("book uber to downtown", {"destination": "downtown"}), ("request a ride to central station on uber", {"destination": "central station"}), ("uber me to golden gate bridge", {"destination": "golden gate bridge"}), ("need a ride to city hospital via uber", {"destination": "city hospital"}), ("call uber to convention center", {"destination": "convention center"}), ], "linkedin_search_person": [ ("look up jane doe on linkedin", {"name": "jane doe"}), ("find parag shah's linkedin profile", {"name": "parag shah"}), ("search linkedin for john smith", {"name": "john smith"}), ("pull up sophie martin on linkedin", {"name": "sophie martin"}), ("find david chen on linkedin", {"name": "david chen"}), ("linkedin search nina patel", {"name": "nina patel"}), ], "spotify_search_play": [ ("find jazz on spotify and play it", {"query": "jazz"}), ("search lo fi beats on spotify and play them", {"query": "lo fi beats"}), ("spotify search taylor swift and play", {"query": "taylor swift"}), ("search k-pop on spotify and play", {"query": "k-pop"}), ("find ambient music on spotify and play", {"query": "ambient"}), ("play reggaeton after searching spotify", {"query": "reggaeton"}), ], "youtube_search": [ ("search pasta recipes on youtube", {"query": "pasta recipes"}), ("find workout videos on youtube", {"query": "workout videos"}), ("look up python tutorials on youtube", {"query": "python tutorials"}), ("youtube search morning routine", {"query": "morning routine"}), ("find korean street food on youtube", {"query": "korean street food"}), ("search machine learning basics on youtube", {"query": "machine learning basics"}), ], "contacts_search": [ ("find mom in my contacts", {"contact": "mom"}), ("search contacts for dad", {"contact": "dad"}), ("look up john smith in contacts", {"contact": "john smith"}), ("find dentist office in contacts", {"contact": "dentist office"}), ("search my contacts for maya", {"contact": "maya"}), ("contacts lookup ri", {"contact": "ri"}), ], "gmail_send_email": [ ("email boss saying i'll be late", {"recipient": "boss", "message": "i'll be late"}), ("send gmail to sarah@gmail.com subject meeting notes", {"recipient": "sarah@gmail.com", "message": "meeting notes"}), ("compose email to client proposal attached", {"recipient": "client", "message": "proposal attached"}), ("write mail to hr@company.com vacation request", {"recipient": "hr@company.com", "message": "vacation request"}), ("send email to professor asking about assignment", {"recipient": "professor", "message": "asking about assignment"}), ("gmail team@work.com weekly report", {"recipient": "team@work.com", "message": "weekly report"}), ], } for skill, templates in eval_templates.items(): added = 0 rng.shuffle(templates) for prompt, params in templates: key = prompt.lower().strip() if key in train_prompts: continue eval_cases.append( { "prompt": prompt, "expected": {"skill": skill, "parameters": params}, } ) added += 1 if added >= per_skill: break return eval_cases def main() -> None: parser = argparse.ArgumentParser(description="Generate intent extraction dataset.") parser.add_argument( "--examples-per-skill", type=int, default=DEFAULT_EXAMPLES_PER_SKILL, help=f"Training examples per skill (default: {DEFAULT_EXAMPLES_PER_SKILL})", ) parser.add_argument( "--eval-per-skill", type=int, default=EVAL_PROMPTS_PER_SKILL, help=f"Eval prompts per skill (default: {EVAL_PROMPTS_PER_SKILL})", ) parser.add_argument( "--seed", type=int, default=RANDOM_SEED, help=f"Random seed (default: {RANDOM_SEED})", ) args = parser.parse_args() rng = random.Random(args.seed) schemas = load_schemas(SCHEMAS_FILE) skills = load_skills(SKILLS_FILE) records: list[dict] = [] train_prompts: set[str] = set() skill_counts: dict[str, int] = {} for entry in skills: skill = entry["skill"] if skill not in schemas: print(f"Warning: no schema for skill {skill}, skipping") continue examples = generate_skill_examples(skill, args.examples_per_skill, rng) for prompt, params in examples: key = prompt.lower().strip() if key in train_prompts: continue train_prompts.add(key) records.append(make_record(prompt, skill, params)) skill_counts[skill] = skill_counts.get(skill, 0) + 1 for record in generate_contrastive_examples(): prompt = record["messages"][1]["content"] key = prompt.lower().strip() if key in train_prompts: continue train_prompts.add(key) records.append(record) skill = json.loads(record["messages"][2]["content"])["skill"] skill_counts[skill] = skill_counts.get(skill, 0) + 1 rng.shuffle(records) with TRAIN_OUTPUT.open("w", encoding="utf-8") as handle: for record in records: handle.write(json.dumps(record) + "\n") eval_prompts = generate_eval_prompts(train_prompts, args.eval_per_skill, rng) with EVAL_OUTPUT.open("w", encoding="utf-8") as handle: json.dump(eval_prompts, handle, indent=2) handle.write("\n") print(f"Wrote {len(records)} training examples to {TRAIN_OUTPUT}") print(f"Wrote {len(eval_prompts)} eval prompts to {EVAL_OUTPUT}") print(f"Skills: {len(skill_counts)}") for skill, count in sorted(skill_counts.items()): status = "OK" if count >= args.examples_per_skill else "LOW" print(f" {skill}: {count} [{status}]") low_skills = [s for s, c in skill_counts.items() if c < args.examples_per_skill] if low_skills: print(f"\nWarning: {len(low_skills)} skills below target count: {', '.join(low_skills)}") if __name__ == "__main__": main()