| """ |
| generate_json_syntax_training.py - Ultra-Focused JSON Syntax Training |
| |
| This script creates training data specifically targeting the "Expecting ',' delimiter" |
| errors that are the root cause of our 93% failure rate. |
| |
| Analysis of failures shows the model has issues with: |
| 1. String parameters containing quotes and special characters |
| 2. Proper JSON object structure and comma placement |
| 3. Consistent quote escaping in nested parameters |
| """ |
|
|
| import json |
| import random |
| from typing import List, Dict, Any |
|
|
def create_training_pair(schema: Dict, question: str, good_response: str, bad_response: str) -> Dict:
    """Assemble one preference-training record for a function-calling prompt.

    The prompt is a ChatML-style transcript: a system turn, the schema
    pretty-printed as JSON inside ``<schema>`` tags, the user question, and
    an opened (unfinished) assistant turn.

    Args:
        schema: Function schema; serialized with ``json.dumps(indent=2)``.
        question: Text of the user turn.
        good_response: Completion stored under ``"chosen"``.
        bad_response: Completion stored under ``"rejected"``.

    Returns:
        A dict with the keys ``"prompt"``, ``"chosen"`` and ``"rejected"``.
    """
    schema_json = json.dumps(schema, indent=2)
    prompt_lines = [
        "<|im_start|>system",
        "You are a helpful assistant that calls functions by responding with valid JSON when given a schema. Always respond with JSON function calls only, never prose.<|im_end|>",
        "",
        "<schema>",
        schema_json,
        "</schema>",
        "",
        "<|im_start|>user",
        f"{question}<|im_end|>",
        "<|im_start|>assistant",
        "",  # the prompt ends with a newline after the opened assistant turn
    ]
    return dict(
        prompt="\n".join(prompt_lines),
        chosen=good_response,
        rejected=bad_response,
    )
|
|
def generate_simple_json_patterns():
    """Return the two introductory training pairs.

    The first pair contrasts a valid one-argument call with a prose reply;
    the second contrasts a valid two-argument call with the same JSON
    missing the comma between its arguments.

    Returns:
        A list of records built by ``create_training_pair``, in the order
        listed below.
    """
    one_param_schema = {
        "name": "simple_function",
        "description": "Simple function with one parameter",
        "parameters": {
            "type": "object",
            "properties": {"text": {"type": "string"}},
            "required": ["text"],
        },
    }
    two_param_schema = {
        "name": "two_param_function",
        "description": "Function with two parameters",
        "parameters": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "age": {"type": "integer"},
            },
            "required": ["name", "age"],
        },
    }

    # Each row: (schema, user question, chosen response, rejected response).
    cases = [
        (
            one_param_schema,
            "Call with hello world",
            '{"name": "simple_function", "arguments": {"text": "hello world"}}',
            "I'll call the function with hello world",
        ),
        (
            two_param_schema,
            "Call with name John and age 25",
            '{"name": "two_param_function", "arguments": {"name": "John", "age": 25}}',
            # Deliberately invalid: no comma between the two arguments.
            '{"name": "two_param_function", "arguments": {"name": "John" "age": 25}}',
        ),
    ]
    return [create_training_pair(*case) for case in cases]
|
|
def generate_string_escaping_patterns():
    """Return three training pairs whose arguments are free-text strings.

    Each chosen response is a valid JSON call carrying sentence-like or
    SQL-like string values; each rejected response is a prose reply.

    Returns:
        A list of records built by ``create_training_pair``, in the order
        listed below.
    """
    analyze_text_schema = {
        "name": "analyze_text",
        "description": "Analyze text content",
        "parameters": {
            "type": "object",
            "properties": {
                "content": {"type": "string"},
                "type": {"type": "string"},
            },
            "required": ["content", "type"],
        },
    }
    send_message_schema = {
        "name": "send_message",
        "description": "Send a message",
        "parameters": {
            "type": "object",
            "properties": {
                "to": {"type": "string"},
                "subject": {"type": "string"},
                "body": {"type": "string"},
            },
            "required": ["to", "subject", "body"],
        },
    }
    process_query_schema = {
        "name": "process_query",
        "description": "Process database query",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {"type": "string"},
                "database": {"type": "string"},
            },
            "required": ["query", "database"],
        },
    }

    # Each row: (schema, user question, chosen response, rejected response).
    cases = [
        (
            analyze_text_schema,
            "Analyze this text: The CEO said we have made tremendous progress this quarter",
            '{"name": "analyze_text", "arguments": {"content": "The CEO said we have made tremendous progress this quarter", "type": "analysis"}}',
            'I will analyze that text for you',
        ),
        (
            send_message_schema,
            "Send email to john@company.com with subject Meeting Update and body The meeting has been rescheduled to tomorrow at 2 PM",
            '{"name": "send_message", "arguments": {"to": "john@company.com", "subject": "Meeting Update", "body": "The meeting has been rescheduled to tomorrow at 2 PM"}}',
            'I will send that email for you',
        ),
        (
            process_query_schema,
            "Run query SELECT name FROM users WHERE created_at > 2023-01-01 on the main database",
            '{"name": "process_query", "arguments": {"query": "SELECT name FROM users WHERE created_at > 2023-01-01", "database": "main"}}',
            'I will run that database query for you',
        ),
    ]
    return [create_training_pair(*case) for case in cases]
|
|
def generate_complex_parameter_patterns():
    """Return two training pairs that mix argument types.

    One call combines integer, boolean and string arguments; the other
    passes an array of strings. Each rejected response is a prose reply.

    Returns:
        A list of records built by ``create_training_pair``, in the order
        listed below.
    """
    configure_system_schema = {
        "name": "configure_system",
        "description": "Configure system settings",
        "parameters": {
            "type": "object",
            "properties": {
                "timeout": {"type": "integer"},
                "enabled": {"type": "boolean"},
                "level": {"type": "string"},
            },
            "required": ["timeout", "enabled"],
        },
    }
    process_files_schema = {
        "name": "process_files",
        "description": "Process multiple files",
        "parameters": {
            "type": "object",
            "properties": {
                "files": {"type": "array", "items": {"type": "string"}},
                "operation": {"type": "string"},
            },
            "required": ["files", "operation"],
        },
    }

    # Each row: (schema, user question, chosen response, rejected response).
    cases = [
        (
            configure_system_schema,
            "Set timeout to 30 seconds, enable the system, and set level to debug",
            '{"name": "configure_system", "arguments": {"timeout": 30, "enabled": true, "level": "debug"}}',
            'I will configure the system with those settings',
        ),
        (
            process_files_schema,
            "Process files data.csv, results.json, and report.pdf with merge operation",
            '{"name": "process_files", "arguments": {"files": ["data.csv", "results.json", "report.pdf"], "operation": "merge"}}',
            'I will process those files for you',
        ),
    ]
    return [create_training_pair(*case) for case in cases]
|
|
def generate_exact_failure_patterns():
    """Return five training pairs for the multi-argument tool schemas.

    The schemas cover document summarization, sentiment analysis, weather
    forecasts, currency conversion and SQL optimization. Every chosen
    response is a valid JSON call that fills all declared properties; every
    rejected response is a prose reply.

    NOTE(review): the original author describes these schemas as matching
    the ones that fail in evaluation; that cannot be confirmed from this
    file.

    Returns:
        A list of records built by ``create_training_pair``, in the order
        listed below.
    """
    summarize_schema = {
        "name": "summarize_document",
        "description": "Summarize document content",
        "parameters": {
            "type": "object",
            "properties": {
                "document_url": {"type": "string"},
                "summary_length": {"type": "string"},
                "target_audience": {"type": "string"},
            },
            "required": ["document_url"],
        },
    }
    sentiment_schema = {
        "name": "analyze_sentiment",
        "description": "Analyze text sentiment",
        "parameters": {
            "type": "object",
            "properties": {
                "text": {"type": "string"},
                "language": {"type": "string"},
                "include_emotions": {"type": "boolean"},
            },
            "required": ["text"],
        },
    }
    weather_schema = {
        "name": "get_weather_forecast",
        "description": "Get weather forecast",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string"},
                "days": {"type": "integer"},
                "units": {"type": "string"},
                "include_hourly": {"type": "boolean"},
            },
            "required": ["location", "days"],
        },
    }
    currency_schema = {
        "name": "convert_currency",
        "description": "Convert currency amounts",
        "parameters": {
            "type": "object",
            "properties": {
                "amount": {"type": "number"},
                "from_currency": {"type": "string"},
                "to_currency": {"type": "string"},
                "include_fees": {"type": "boolean"},
            },
            "required": ["amount", "from_currency", "to_currency"],
        },
    }
    optimize_query_schema = {
        "name": "optimize_database_query",
        "description": "Optimize database query",
        "parameters": {
            "type": "object",
            "properties": {
                "sql_query": {"type": "string"},
                "database_type": {"type": "string"},
                "performance_target": {"type": "string"},
            },
            "required": ["sql_query", "database_type"],
        },
    }

    # Each row: (schema, user question, chosen response, rejected response).
    cases = [
        (
            summarize_schema,
            "Summarize the document at https://example.com/report.pdf for executives with brief length",
            '{"name": "summarize_document", "arguments": {"document_url": "https://example.com/report.pdf", "summary_length": "brief", "target_audience": "executive"}}',
            'I will summarize that document for executives',
        ),
        (
            sentiment_schema,
            "Analyze sentiment of this text: The product was excellent and delivery was fast with emotion details in English",
            '{"name": "analyze_sentiment", "arguments": {"text": "The product was excellent and delivery was fast", "language": "en", "include_emotions": true}}',
            'I will analyze the sentiment of that text',
        ),
        (
            weather_schema,
            "Get 3-day weather forecast for New York in metric units with hourly details",
            '{"name": "get_weather_forecast", "arguments": {"location": "New York", "days": 3, "units": "metric", "include_hourly": true}}',
            'I will get the weather forecast for New York',
        ),
        (
            currency_schema,
            "Convert 100 US dollars to Euros with fees included",
            '{"name": "convert_currency", "arguments": {"amount": 100, "from_currency": "USD", "to_currency": "EUR", "include_fees": true}}',
            'I will convert that currency amount for you',
        ),
        (
            optimize_query_schema,
            "Optimize this MySQL query for speed: SELECT id, name FROM users WHERE active = 1",
            '{"name": "optimize_database_query", "arguments": {"sql_query": "SELECT id, name FROM users WHERE active = 1", "database_type": "mysql", "performance_target": "speed"}}',
            'I will optimize that database query for you',
        ),
    ]
    return [create_training_pair(*case) for case in cases]
|
|
def main():
    """Build the JSON-syntax preference dataset and write it as JSONL.

    Collects the examples from the four pattern generators, appends extra
    copies of the string, complex and exact-failure groups so that they
    dominate the dataset, writes one JSON object per line to
    ``tool_pairs_json_syntax.jsonl`` in the current working directory, and
    prints a summary of the composition.

    Returns:
        The total number of examples written.
    """
    # Extra copies of the emphasized groups appended after the first pass;
    # together with that first pass each emphasized example appears
    # ``total_copies`` times.
    extra_repetitions = 10
    total_copies = extra_repetitions + 1

    print("🎯 Generating Ultra-Focused JSON Syntax Training...")

    all_examples = []

    print("Adding simple JSON patterns...")
    base_examples = generate_simple_json_patterns()
    all_examples.extend(base_examples)

    print("Adding string escaping patterns...")
    string_examples = generate_string_escaping_patterns()
    all_examples.extend(string_examples)

    print("Adding complex parameter patterns...")
    complex_examples = generate_complex_parameter_patterns()
    all_examples.extend(complex_examples)

    print("Adding exact failure patterns...")
    failure_examples = generate_exact_failure_patterns()
    all_examples.extend(failure_examples)

    print(f"Adding {extra_repetitions}x repetitions of exact failure patterns...")
    for _ in range(extra_repetitions):
        # Order within each repetition is kept: failure, string, complex.
        all_examples.extend(failure_examples)
        all_examples.extend(string_examples)
        all_examples.extend(complex_examples)

    output_file = "tool_pairs_json_syntax.jsonl"
    # Explicit encoding so the output does not depend on the platform locale.
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(example) + "\n" for example in all_examples)

    print(f"✅ Generated {len(all_examples)} ultra-focused training examples")
    print(f"💾 Saved to {output_file}")

    # The simple patterns are added once; every other group is added once
    # plus ``extra_repetitions`` more times.
    categories = {
        "Simple JSON patterns": len(base_examples),
        "String escaping patterns": len(string_examples) * total_copies,
        "Complex parameters": len(complex_examples) * total_copies,
        "Exact failure patterns": len(failure_examples) * total_copies,
    }

    print("\nUltra-Focused Training Composition:")
    for category, count in categories.items():
        print(f" {category}: {count} examples")

    print("\n🎯 Ultra-Focused Approach:")
    print(f" • {total_copies}x repetition of exact failing patterns")
    print(" • Progressive complexity from simple to exact failures")
    print(" • JSON syntax comma and quote handling emphasis")
    print(" • Directly targeting 'Expecting , delimiter' errors")

    return len(all_examples)
|
|
# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()