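"""
generate_training_data.py - Generate preference-pair training data for function calling

This script builds preference pairs (a JSON function call as the chosen response
and a prose reply as the rejected response) across several schema types and
patterns, to teach robust zero-shot function calling. The pairs are written to
tool_pairs_large.jsonl and the number of pairs generated is reported at run time.
"""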
|
|
| import json |
| import random |
| from typing import List, Dict |
|
|
def create_training_pair(schema: Dict, question: str, good_response: str, bad_response: str) -> Dict:
    """Assemble one preference record for a schema/question combination.

    The prompt is laid out as: a fixed system turn, the schema pretty-printed
    as JSON between <schema> tags, the user turn holding ``question``, and an
    opened assistant turn that the completion is expected to continue.

    Args:
        schema: Function schema to embed in the prompt (must be JSON-serializable).
        question: The user request placed in the user turn.
        good_response: Text stored under "chosen".
        bad_response: Text stored under "rejected".

    Returns:
        A dict with the keys "prompt", "chosen" and "rejected".
    """
    system_turn = (
        "<|im_start|>system\n"
        "You are a helpful assistant that calls functions by responding with "
        "valid JSON when given a schema. Always respond with JSON function "
        "calls only, never prose.<|im_end|>\n"
    )
    schema_section = f"<schema>\n{json.dumps(schema, indent=2)}\n</schema>\n"
    user_turn = f"<|im_start|>user\n{question}<|im_end|>\n"
    assistant_opening = "<|im_start|>assistant\n"

    # Sections are separated by one blank line; the assistant turn follows the
    # user turn directly.
    prompt_text = "\n".join((system_turn, schema_section, user_turn)) + assistant_opening

    record = {}
    record["prompt"] = prompt_text
    record["chosen"] = good_response
    record["rejected"] = bad_response
    return record
|
|
def _format_call(name: str, arguments: Dict) -> str:
    """Serialize a function call as the JSON text used for "chosen" responses.

    Going through json.dumps (rather than pasting values into a string
    template) guarantees that quotes, backslashes and newlines inside argument
    values are escaped, so the result always parses as JSON.
    """
    return json.dumps({"name": name, "arguments": arguments})


def generate_diverse_schemas_and_pairs() -> List[Dict]:
    """Build the preference pairs for every example defined in this function.

    Each example tuple below yields one record from ``create_training_pair``:
    the "chosen" response is a JSON function call matching the schema, and the
    "rejected" response is a prose reply that does not call the function.

    Returns:
        A list of dicts with the keys "prompt", "chosen" and "rejected", in
        the order the example groups appear below.
    """
    pairs = []

    # ------------------------------------------------------------------
    # Financial operations
    # ------------------------------------------------------------------
    financial_schemas = [
        {
            "name": "get_stock_price",
            "description": "Get current stock price for a ticker",
            "parameters": {
                "type": "object",
                "properties": {"ticker": {"type": "string"}},
                "required": ["ticker"]
            }
        },
        {
            "name": "transfer_money",
            "description": "Transfer money between accounts",
            "parameters": {
                "type": "object",
                "properties": {
                    "from_account": {"type": "string"},
                    "to_account": {"type": "string"},
                    "amount": {"type": "number"},
                    "currency": {"type": "string"}
                },
                "required": ["from_account", "to_account", "amount"]
            }
        },
        # Defined but not yet used by any example below.
        {
            "name": "calculate_compound_interest",
            "description": "Calculate compound interest on investment",
            "parameters": {
                "type": "object",
                "properties": {
                    "principal": {"type": "number"},
                    "rate": {"type": "number"},
                    "time": {"type": "number"},
                    "frequency": {"type": "integer"}
                },
                "required": ["principal", "rate", "time"]
            }
        }
    ]

    financial_questions = [
        ("What's Tesla stock trading at?", "TSLA"),
        ("Check the price of Bitcoin", "BTC-USD"),
        ("What's Apple's current price?", "AAPL"),
        ("How much is Microsoft worth?", "MSFT"),
        ("Get Netflix stock price", "NFLX")
    ]

    for q, ticker in financial_questions:
        pairs.append(create_training_pair(
            financial_schemas[0], q,
            _format_call("get_stock_price", {"ticker": ticker}),
            f"I'll check the current stock price for {ticker}. Let me get that information for you."
        ))

    transfer_examples = [
        ("Send $500 from my checking to savings", "checking", "savings", 500),
        ("Transfer 1000 euros from account A to account B", "A", "B", 1000),
        ("Move $250 from wallet to investment account", "wallet", "investment", 250)
    ]

    for q, from_acc, to_acc, amount in transfer_examples:
        pairs.append(create_training_pair(
            financial_schemas[1], q,
            _format_call(
                "transfer_money",
                {"from_account": from_acc, "to_account": to_acc, "amount": amount},
            ),
            f"I'll help you transfer ${amount} from {from_acc} to {to_acc}. Let me process that transaction."
        ))

    # ------------------------------------------------------------------
    # Communication
    # ------------------------------------------------------------------
    comm_schemas = [
        {
            "name": "send_email",
            "description": "Send an email message",
            "parameters": {
                "type": "object",
                "properties": {
                    "to": {"type": "string"},
                    "subject": {"type": "string"},
                    "body": {"type": "string"},
                    "cc": {"type": "array", "items": {"type": "string"}}
                },
                "required": ["to", "subject", "body"]
            }
        },
        {
            "name": "send_sms",
            "description": "Send SMS text message",
            "parameters": {
                "type": "object",
                "properties": {
                    "phone": {"type": "string"},
                    "message": {"type": "string"}
                },
                "required": ["phone", "message"]
            }
        },
        # Defined but not yet used by any example below.
        {
            "name": "schedule_meeting",
            "description": "Schedule a meeting with participants",
            "parameters": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "participants": {"type": "array", "items": {"type": "string"}},
                    "datetime": {"type": "string"},
                    "duration": {"type": "integer"}
                },
                "required": ["title", "participants", "datetime"]
            }
        }
    ]

    email_examples = [
        ("Email John about the project deadline", "john@company.com", "Project Deadline", "Hi John, wanted to discuss the upcoming project deadline."),
        ("Send Sarah the meeting notes", "sarah@team.com", "Meeting Notes", "Hi Sarah, here are the notes from today's meeting."),
        ("Message the team about tomorrow's standup", "team@company.com", "Standup Tomorrow", "Reminder: standup meeting tomorrow at 9am.")
    ]

    for q, to, subject, body in email_examples:
        pairs.append(create_training_pair(
            comm_schemas[0], q,
            _format_call("send_email", {"to": to, "subject": subject, "body": body}),
            f"I'll send an email to {to} with the subject '{subject}'. Let me compose that message for you."
        ))

    sms_examples = [
        ("Text mom that I'll be late", "+1234567890", "Running late, will be there in 20 minutes"),
        ("Send SMS to 555-0123 saying meeting is cancelled", "555-0123", "Meeting cancelled"),
        ("Message Bob at +1987654321 about dinner plans", "+1987654321", "Are we still on for dinner tonight?")
    ]

    for q, phone, message in sms_examples:
        pairs.append(create_training_pair(
            comm_schemas[1], q,
            _format_call("send_sms", {"phone": phone, "message": message}),
            f"I'll send a text message to {phone}. Let me send that SMS for you."
        ))

    # ------------------------------------------------------------------
    # Data analytics
    # ------------------------------------------------------------------
    data_schemas = [
        {
            "name": "query_database",
            "description": "Execute SQL query on database",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string"},
                    "database": {"type": "string"},
                    "limit": {"type": "integer"}
                },
                "required": ["query"]
            }
        },
        # Defined but not yet used by any example below.
        {
            "name": "generate_report",
            "description": "Generate analytics report",
            "parameters": {
                "type": "object",
                "properties": {
                    "report_type": {"type": "string"},
                    "date_range": {"type": "string"},
                    "metrics": {"type": "array", "items": {"type": "string"}}
                },
                "required": ["report_type", "date_range"]
            }
        }
    ]

    query_examples = [
        ("Find all users who signed up last week", "SELECT * FROM users WHERE created_at >= DATE_SUB(NOW(), INTERVAL 1 WEEK)"),
        ("Get top 10 selling products", "SELECT product_name, SUM(quantity) as total_sales FROM orders GROUP BY product_name ORDER BY total_sales DESC LIMIT 10"),
        ("Show revenue by month this year", "SELECT MONTH(order_date) as month, SUM(total) as revenue FROM orders WHERE YEAR(order_date) = YEAR(NOW()) GROUP BY MONTH(order_date)")
    ]

    for q, query in query_examples:
        pairs.append(create_training_pair(
            data_schemas[0], q,
            _format_call("query_database", {"query": query}),
            f"I'll run a database query to {q.lower()}. Let me execute that SQL for you."
        ))

    # ------------------------------------------------------------------
    # File operations
    # ------------------------------------------------------------------
    file_schemas = [
        {
            "name": "create_file",
            "description": "Create a new file with content",
            "parameters": {
                "type": "object",
                "properties": {
                    "filename": {"type": "string"},
                    "content": {"type": "string"},
                    "encoding": {"type": "string"}
                },
                "required": ["filename", "content"]
            }
        },
        # Defined but not yet used by any example below.
        {
            "name": "backup_files",
            "description": "Backup files to specified location",
            "parameters": {
                "type": "object",
                "properties": {
                    "source_path": {"type": "string"},
                    "backup_path": {"type": "string"},
                    "compression": {"type": "boolean"}
                },
                "required": ["source_path", "backup_path"]
            }
        }
    ]

    # These contents contain newlines and double quotes, which is why the
    # chosen response must be produced by a real JSON serializer.
    file_examples = [
        ("Create a file called report.txt with the quarterly results", "report.txt", "Q3 2024 Quarterly Results\n\nRevenue: $2.5M\nGrowth: 15%"),
        ("Make a new file notes.md with meeting summary", "notes.md", "# Meeting Summary\n\n- Discussed project timeline\n- Reviewed budget\n- Next steps assigned"),
        ("Create config.json with default settings", "config.json", '{"debug": false, "port": 8080, "host": "localhost"}')
    ]

    for q, filename, content in file_examples:
        pairs.append(create_training_pair(
            file_schemas[0], q,
            _format_call("create_file", {"filename": filename, "content": content}),
            f"I'll create the file {filename} with your content. Let me write that file for you."
        ))

    # ------------------------------------------------------------------
    # Weather / location
    # ------------------------------------------------------------------
    location_schemas = [
        {
            "name": "get_weather",
            "description": "Get weather information for location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string"},
                    "units": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                    "forecast_days": {"type": "integer"}
                },
                "required": ["location"]
            }
        },
        # Defined but not yet used by any example below.
        {
            "name": "find_restaurants",
            "description": "Find restaurants near location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string"},
                    "cuisine": {"type": "string"},
                    "rating_min": {"type": "number"}
                },
                "required": ["location"]
            }
        }
    ]

    weather_examples = [
        ("What's the weather in San Francisco?", "San Francisco"),
        ("Check weather for Tokyo in celsius", "Tokyo"),
        ("How's the weather in London today?", "London")
    ]

    for q, location in weather_examples:
        pairs.append(create_training_pair(
            location_schemas[0], q,
            _format_call("get_weather", {"location": location}),
            f"I'll check the current weather conditions in {location} for you."
        ))

    # ------------------------------------------------------------------
    # Calculations
    # ------------------------------------------------------------------
    calc_schemas = [
        {
            "name": "calculate_tip",
            "description": "Calculate tip amount for bill",
            "parameters": {
                "type": "object",
                "properties": {
                    "bill_amount": {"type": "number"},
                    "tip_percentage": {"type": "number"},
                    "split_ways": {"type": "integer"}
                },
                "required": ["bill_amount", "tip_percentage"]
            }
        },
        # The two schemas below are defined but not yet used by any example.
        {
            "name": "convert_currency",
            "description": "Convert between currencies",
            "parameters": {
                "type": "object",
                "properties": {
                    "amount": {"type": "number"},
                    "from_currency": {"type": "string"},
                    "to_currency": {"type": "string"}
                },
                "required": ["amount", "from_currency", "to_currency"]
            }
        },
        {
            "name": "calculate_distance",
            "description": "Calculate distance between two points",
            "parameters": {
                "type": "object",
                "properties": {
                    "from_location": {"type": "string"},
                    "to_location": {"type": "string"},
                    "unit": {"type": "string", "enum": ["miles", "kilometers"]}
                },
                "required": ["from_location", "to_location"]
            }
        }
    ]

    tip_examples = [
        ("What's 20% tip on $85?", 85, 20),
        ("Calculate 15% tip for a $42 bill", 42, 15),
        ("How much tip for $156 at 18%?", 156, 18)
    ]

    for q, amount, tip in tip_examples:
        pairs.append(create_training_pair(
            calc_schemas[0], q,
            _format_call("calculate_tip", {"bill_amount": amount, "tip_percentage": tip}),
            f"I'll calculate the {tip}% tip on ${amount} for you. Let me do that math."
        ))

    # ------------------------------------------------------------------
    # Scheduling
    # ------------------------------------------------------------------
    schedule_schemas = [
        {
            "name": "create_reminder",
            "description": "Create a reminder for specific time",
            "parameters": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "datetime": {"type": "string"},
                    "priority": {"type": "string", "enum": ["low", "medium", "high"]}
                },
                "required": ["title", "datetime"]
            }
        },
        # Defined but not yet used by any example below.
        {
            "name": "book_appointment",
            "description": "Book appointment with service provider",
            "parameters": {
                "type": "object",
                "properties": {
                    "service": {"type": "string"},
                    "provider": {"type": "string"},
                    "datetime": {"type": "string"},
                    "duration": {"type": "integer"}
                },
                "required": ["service", "datetime"]
            }
        }
    ]

    reminder_examples = [
        ("Remind me to call mom tomorrow at 6pm", "Call mom", "tomorrow 6pm"),
        ("Set reminder for dentist appointment Friday 2pm", "Dentist appointment", "Friday 2pm"),
        ("Remind me about the meeting on Monday 9am", "Team meeting", "Monday 9am")
    ]

    # The loop variable is named `when` (not `datetime`) so it does not shadow
    # the standard-library module name.
    for q, title, when in reminder_examples:
        pairs.append(create_training_pair(
            schedule_schemas[0], q,
            _format_call("create_reminder", {"title": title, "datetime": when}),
            f"I'll set up a reminder for {title} at {when}."
        ))

    return pairs
|
|
def _schema_name(prompt: str) -> str:
    """Return the "name" field of the JSON schema embedded between <schema> tags."""
    schema_text = prompt.split("<schema>")[1].split("</schema>")[0]
    return json.loads(schema_text)["name"]


def _question_text(prompt: str) -> str:
    """Return the user question embedded in the prompt's user turn."""
    return prompt.split("<|im_start|>user")[1].split("<|im_end|>")[0].strip()


def main():
    """Generate the training pairs, save them as JSONL, and print a summary.

    Writes one JSON object per line to ``tool_pairs_large.jsonl`` in the
    current working directory, then prints the number of pairs per function
    schema and a sample pair.
    """
    print("Generating comprehensive training data...")

    pairs = generate_diverse_schemas_and_pairs()

    print(f"Generated {len(pairs)} training pairs")

    # Count pairs per function from the data itself so the summary cannot
    # drift from what was actually generated. The name is read from the
    # schema in the prompt, which is always valid JSON.
    coverage = {}
    for pair in pairs:
        name = _schema_name(pair["prompt"])
        coverage[name] = coverage.get(name, 0) + 1

    print("Coverage:")
    for name, count in coverage.items():
        print(f"   - {name}: {count} pairs")

    # Explicit encoding keeps the output independent of the platform default.
    with open("tool_pairs_large.jsonl", "w", encoding="utf-8") as f:
        f.writelines(json.dumps(pair) + "\n" for pair in pairs)

    print("Saved to tool_pairs_large.jsonl")
    print("This should significantly improve training quality!")

    if not pairs:
        # Nothing to sample from.
        return

    print("\nSample pair:")
    sample = pairs[0]
    print(f"Schema: {_schema_name(sample['prompt'])}")
    print(f"Question: {_question_text(sample['prompt'])}")
    print(f"Response: {sample['chosen']}")


if __name__ == "__main__":
    main()