"""
Robustness Testing for Dynamic Function-Calling Agent

Tests model stability with:
1. Shuffled JSON key order
2. Distractor text before schema
3. Noisy prompts (shuffled keys and distractor text combined)

Quick test that doesn't require retraining.
"""

import json
import random

from test_constrained_model import load_trained_model, constrained_json_generate, create_json_schema


def shuffle_json_keys(obj):
    """Return a copy of a JSON-like value with every dict's key order shuffled.

    Dicts are rebuilt with their items in a random order (using the
    module-level ``random`` generator), lists are rebuilt element by element,
    and the function recurses into both. Any other value is returned as-is.
    The input containers are not mutated.

    Args:
        obj: A JSON-like value (dict, list, or scalar).

    Returns:
        A new dict/list mirroring ``obj`` with shuffled key order, or ``obj``
        itself when it is neither a dict nor a list.
    """
    if isinstance(obj, dict):
        items = list(obj.items())
        random.shuffle(items)
        # Dicts preserve insertion order, so rebuilding from the shuffled
        # items is what changes the serialized key order.
        return {key: shuffle_json_keys(value) for key, value in items}
    if isinstance(obj, list):
        return [shuffle_json_keys(item) for item in obj]
    return obj


def add_distractor_text(schema_str):
    """Prefix a schema string with one randomly chosen distractor sentence.

    Args:
        schema_str: The serialized schema text to be prefixed.

    Returns:
        The chosen distractor sentence, a blank line, then ``schema_str``.
    """
    distractors = (
        "Note: This is a complex API with many parameters.",
        "Important: Please review all requirements carefully.",
        "Warning: Some fields may be optional depending on context.",
        "Info: This function supports multiple data formats.",
        "Reminder: Check authentication before making calls.",
    )
    return f"{random.choice(distractors)}\n\n{schema_str}"


def _build_prompt(schema_text, query):
    """Render the chat prompt for one run from the schema text and user query."""
    return (
        "<|im_start|>system\n"
        "You are a helpful assistant that calls functions by responding with "
        "valid JSON when given a schema. Always respond with JSON function "
        "calls only, never prose.<|im_end|>\n"
        "\n"
        "<schema>\n"
        f"{schema_text}\n"
        "</schema>\n"
        "\n"
        "<|im_start|>user\n"
        f"{query}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )


def _success_rate(outcomes):
    """Return the percentage of truthy entries in ``outcomes`` (0.0 if empty)."""
    if not outcomes:
        return 0.0
    return (sum(outcomes) / len(outcomes)) * 100


def test_robustness():
    """Run robustness tests on the function calling agent.

    For each query, generation is attempted under four prompt variants:
    the unmodified schema, a key-shuffled schema, the schema preceded by a
    distractor sentence, and both perturbations together. Per-scenario
    success rates and their difference from the baseline are printed.

    Returns:
        dict mapping each scenario name ("baseline", "shuffled_keys",
        "with_distractors", "both_shuffled_and_distractors") to the list of
        per-query success flags reported by ``constrained_json_generate``.
    """
    print("🧪 Starting Robustness Tests...")

    model, tokenizer = load_trained_model()

    base_schema = {
        "name": "get_weather_forecast",
        "description": "Get weather forecast for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City name"},
                "days": {"type": "integer", "description": "Number of days", "minimum": 1},
                "units": {"type": "string", "enum": ["metric", "imperial"]},
                "include_hourly": {"type": "boolean", "default": False},
            },
            "required": ["location", "days"],
        },
    }

    test_queries = [
        "Get 3-day weather for Paris",
        "Weather forecast for Tokyo, 5 days, metric units",
        "I need the weather for London for the next week",
    ]

    # (results key, console label, shuffle schema keys?, prepend distractor?)
    scenarios = (
        ("baseline", "✅ Baseline", False, False),
        ("shuffled_keys", "🔀 Shuffled", True, False),
        ("with_distractors", "🎭 Distractor", False, True),
        ("both_shuffled_and_distractors", "🔀🎭 Both", True, True),
    )
    results = {key: [] for key, _label, _shuffle, _distract in scenarios}

    print("\n📋 Running test scenarios...")

    for query in test_queries:
        print(f"\n📝 Query: '{query}'")

        for key, label, shuffle, distract in scenarios:
            # Each shuffled scenario draws its own fresh key ordering.
            schema_dict = shuffle_json_keys(base_schema) if shuffle else base_schema
            schema = create_json_schema(schema_dict)
            schema_text = json.dumps(schema_dict, indent=2)
            if distract:
                schema_text = add_distractor_text(schema_text)

            prompt = _build_prompt(schema_text, query)
            _response, success, error = constrained_json_generate(
                model, tokenizer, prompt, schema
            )
            results[key].append(success)
            print(f" {label}: {'✓' if success else '✗'}")
            if not success and error:
                # Surface the failure reason instead of silently dropping it.
                print(f"    error: {error}")

    print("\n📊 Robustness Test Results:")
    print("=" * 50)

    for test_name, test_results in results.items():
        title = test_name.replace("_", " ").title()
        rate = _success_rate(test_results)
        print(f"{title}: {rate:.1f}% ({sum(test_results)}/{len(test_results)})")

    print("\n🎯 Analysis:")
    baseline_rate = _success_rate(results["baseline"])

    for test_name, test_results in results.items():
        if test_name == "baseline":
            continue
        diff = _success_rate(test_results) - baseline_rate
        # Green: within 10 points of baseline; yellow: within 20; red: worse.
        if diff >= -10:
            status = "🟢"
        elif diff >= -20:
            status = "🟡"
        else:
            status = "🔴"
        title = test_name.replace("_", " ").title()
        print(f"{status} {title}: {diff:+.1f}% vs baseline")

    return results


if __name__ == "__main__":
    # Run the robustness suite when this file is executed as a script.
    test_robustness()