| { | |
| "summary_metrics": { | |
| "json_validity_rate": 1.0, | |
| "workflow_accuracy": 0.9701492537313433, | |
| "status_accuracy": 0.5733333333333334, | |
| "required_field_presence_accuracy": 0.2857142857142857, | |
| "unsafe_rejection_accuracy": 1.0, | |
| "false_route_rate": 0.0 | |
| }, | |
| "per_example_results": [ | |
| { | |
| "id": "eval-0001", | |
| "case_type": "success", | |
| "input": "cron 0 2 * * * target claims-sync env dev timezone America/New_York", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_scheduler_job" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "dev", | |
| "job_name": "finance-nightly-job", | |
| "schedule": "0 2 * * *", | |
| "target": "claims-sync", | |
| "team": "finance", | |
| "timezone": "America/New_York" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_scheduler_job" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "job_name": "claims-sync-scheduled-job", | |
| "schedule": "0 2 * * *", | |
| "target": "claims-sync", | |
| "environment": "dev" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "Please confirm the selected workflow and parameters." | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_scheduler_job", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": true, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Router confidence is between 0.55 and 0.80." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "job_name": "claims-sync-scheduled-job", | |
| "schedule": "0 2 * * *", | |
| "target": "claims-sync", | |
| "environment": "dev" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Router confidence is between 0.55 and 0.80." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got requires_confirmation", | |
| "missing expected parameter keys: team, timezone" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0002", | |
| "case_type": "success", | |
| "input": "ticket: mlops production api, runtime .NET, region Central US, diagnostics on", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_web_app" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "app_name": "mlops-web-app", | |
| "diagnostics_enabled": true, | |
| "environment": "prod", | |
| "region": "centralus", | |
| "runtime": "dotnet8", | |
| "team": "mlops" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_web_app" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "region": "centralus", | |
| "runtime": "dotnet8", | |
| "environment": "prod", | |
| "diagnostics_enabled": true | |
| }, | |
| "missing_fields": [ | |
| "app_name" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_web_app", | |
| "missing_fields": [ | |
| "app_name" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: app_name" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: app_name" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "region": "centralus", | |
| "runtime": "dotnet8", | |
| "environment": "prod", | |
| "diagnostics_enabled": true | |
| }, | |
| "missing_fields": [ | |
| "app_name" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: app_name" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got needs_clarification", | |
| "missing expected parameter keys: app_name, team" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0003", | |
| "case_type": "success", | |
| "input": "Create a nightly scheduler job named reporting-nightly-job for claims-sync in production.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_scheduler_job" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "prod", | |
| "job_name": "reporting-nightly-job", | |
| "schedule": "0 9 * * *", | |
| "target": "claims-sync", | |
| "team": "reporting", | |
| "timezone": "America/Los_Angeles" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_scheduler_job" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95, | |
| "parameters": { | |
| "job_name": "reporting-nightly-job", | |
| "schedule": "0 2 * * *", | |
| "target": "claims-sync", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_scheduler_job", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95, | |
| "accepted": true, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95, | |
| "parameters": { | |
| "job_name": "reporting-nightly-job", | |
| "schedule": "0 2 * * *", | |
| "target": "claims-sync", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "missing expected parameter keys: team, timezone" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0004", | |
| "case_type": "success", | |
| "input": "identity request: growth service account, env staging, name growth-svc", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_service_account" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "account_name": "growth-svc", | |
| "description": "Service identity for workflow automation.", | |
| "environment": "staging", | |
| "team": "growth" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_service_account" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_service_account", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "environment": "staging", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [ | |
| "account_name", | |
| "team" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What account name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_service_account", | |
| "missing_fields": [ | |
| "account_name", | |
| "team" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: account_name, team" | |
| ], | |
| "clarifying_question": "What account name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account", | |
| "confidence": 0.93, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: account_name, team" | |
| ], | |
| "clarifying_question": "What account name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "environment": "staging", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [ | |
| "account_name", | |
| "team" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: account_name, team" | |
| ], | |
| "clarifying_question": "What account name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got needs_clarification", | |
| "missing expected parameter keys: account_name, team" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0005", | |
| "case_type": "missing_fields", | |
| "input": "daily reporting job, details later", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.74, | |
| "workflow": "create_scheduler_job" | |
| } | |
| ], | |
| "clarifying_question": "What job name should RouterCore use?", | |
| "confidence": 0.74, | |
| "failure_reasons": [ | |
| "Missing required fields: job_name, schedule, environment" | |
| ], | |
| "missing_fields": [ | |
| "job_name", | |
| "schedule", | |
| "environment" | |
| ], | |
| "parameters": { | |
| "target": "reporting" | |
| }, | |
| "status": "needs_clarification", | |
| "workflow": "create_scheduler_job" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "schedule": "0 9 * * *" | |
| }, | |
| "missing_fields": [ | |
| "job_name", | |
| "target", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What job name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_scheduler_job", | |
| "missing_fields": [ | |
| "job_name", | |
| "target", | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: job_name, target, environment" | |
| ], | |
| "clarifying_question": "What job name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: job_name, target, environment" | |
| ], | |
| "clarifying_question": "What job name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "schedule": "0 9 * * *" | |
| }, | |
| "missing_fields": [ | |
| "job_name", | |
| "target", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: job_name, target, environment" | |
| ], | |
| "clarifying_question": "What job name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0006", | |
| "case_type": "success", | |
| "input": "Create a cool storage bucket named platform-bucket in West US for development.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_storage_bucket" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "bucket_name": "platform-bucket", | |
| "environment": "dev", | |
| "region": "westus", | |
| "storage_class": "cool", | |
| "team": "platform" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_storage_bucket" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "bucket_name": "platform-bucket", | |
| "region": "westus", | |
| "environment": "dev", | |
| "storage_class": "cool" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_storage_bucket", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "routed", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93, | |
| "accepted": true, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "routed", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "bucket_name": "platform-bucket", | |
| "region": "westus", | |
| "environment": "dev", | |
| "storage_class": "cool" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "missing expected parameter keys: team" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0007", | |
| "case_type": "success", | |
| "input": "Grant reporting-user reader access to staging-bucket in development.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "dev", | |
| "principal": "reporting-user", | |
| "role": "reader", | |
| "scope": "staging-bucket" | |
| }, | |
| "status": "routed", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "principal": "reporting-user", | |
| "role": "reader", | |
| "scope": "staging-bucket", | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| }, | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "accepted": false, | |
| "requires_confirmation": true, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Workflow is high risk and requires human confirmation." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "principal": "reporting-user", | |
| "role": "reader", | |
| "scope": "staging-bucket", | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| }, | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Workflow is high risk and requires human confirmation." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got requires_confirmation" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0008", | |
| "case_type": "missing_fields", | |
| "input": "bucket needed for reporting, no location picked yet", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.74, | |
| "workflow": "create_storage_bucket" | |
| } | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?", | |
| "confidence": 0.74, | |
| "failure_reasons": [ | |
| "Missing required fields: bucket_name, region, environment" | |
| ], | |
| "missing_fields": [ | |
| "bucket_name", | |
| "region", | |
| "environment" | |
| ], | |
| "parameters": { | |
| "team": "reporting" | |
| }, | |
| "status": "needs_clarification", | |
| "workflow": "create_storage_bucket" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69, | |
| "parameters": {}, | |
| "missing_fields": [ | |
| "bucket_name", | |
| "region", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_storage_bucket", | |
| "missing_fields": [ | |
| "bucket_name", | |
| "region", | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: bucket_name, region, environment" | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: bucket_name, region, environment" | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69, | |
| "parameters": {}, | |
| "missing_fields": [ | |
| "bucket_name", | |
| "region", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: bucket_name, region, environment" | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0009", | |
| "case_type": "success", | |
| "input": "cron 0 9 * * * target model-refresh env dev timezone UTC", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_scheduler_job" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "dev", | |
| "job_name": "finance-nightly-job", | |
| "schedule": "0 9 * * *", | |
| "target": "model-refresh", | |
| "team": "finance", | |
| "timezone": "UTC" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_scheduler_job" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "job_name": "model-refresh-scheduled-job", | |
| "schedule": "0 9 * * *", | |
| "target": "model-refresh", | |
| "environment": "dev", | |
| "timezone": "UTC" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "Please confirm the selected workflow and parameters." | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_scheduler_job", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": true, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Router confidence is between 0.55 and 0.80." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "job_name": "model-refresh-scheduled-job", | |
| "schedule": "0 9 * * *", | |
| "target": "model-refresh", | |
| "environment": "dev", | |
| "timezone": "UTC" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Router confidence is between 0.55 and 0.80." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got requires_confirmation", | |
| "missing expected parameter keys: team" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0010", | |
| "case_type": "success", | |
| "input": "ticket: mlops staging api, runtime Python, region Central US, diagnostics on", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_web_app" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "app_name": "mlops-web-app", | |
| "diagnostics_enabled": true, | |
| "environment": "staging", | |
| "region": "centralus", | |
| "runtime": "python311", | |
| "team": "mlops" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_web_app" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "region": "centralus", | |
| "runtime": "python311", | |
| "environment": "staging", | |
| "diagnostics_enabled": true | |
| }, | |
| "missing_fields": [ | |
| "app_name" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_web_app", | |
| "missing_fields": [ | |
| "app_name" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: app_name" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: app_name" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "region": "centralus", | |
| "runtime": "python311", | |
| "environment": "staging", | |
| "diagnostics_enabled": true | |
| }, | |
| "missing_fields": [ | |
| "app_name" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: app_name" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got needs_clarification", | |
| "missing expected parameter keys: app_name, team" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0011", | |
| "case_type": "success", | |
| "input": "infra: bucket for claims, env prod, region centralus, class archive", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_storage_bucket" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "bucket_name": "claims-bucket", | |
| "environment": "prod", | |
| "region": "centralus", | |
| "storage_class": "archive", | |
| "team": "claims" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_storage_bucket" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "region": "centralus", | |
| "environment": "prod", | |
| "storage_class": "archive" | |
| }, | |
| "missing_fields": [ | |
| "bucket_name" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_storage_bucket", | |
| "missing_fields": [ | |
| "bucket_name" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: bucket_name" | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: bucket_name" | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "region": "centralus", | |
| "environment": "prod", | |
| "storage_class": "archive" | |
| }, | |
| "missing_fields": [ | |
| "bucket_name" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: bucket_name" | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got needs_clarification", | |
| "missing expected parameter keys: bucket_name, team" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0012", | |
| "case_type": "success", | |
| "input": "Create a archive storage bucket named finance-bucket in East US for staging.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_storage_bucket" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "bucket_name": "finance-bucket", | |
| "environment": "staging", | |
| "region": "eastus", | |
| "storage_class": "archive", | |
| "team": "finance" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_storage_bucket" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "bucket_name": "finance-bucket", | |
| "region": "eastus", | |
| "environment": "staging", | |
| "storage_class": "archive" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_storage_bucket", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "routed", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93, | |
| "accepted": true, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "routed", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "bucket_name": "finance-bucket", | |
| "region": "eastus", | |
| "environment": "staging", | |
| "storage_class": "archive" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "missing expected parameter keys: team" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0013", | |
| "case_type": "success", | |
| "input": "Give analyst the contributor role on reporting-project.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "dev", | |
| "principal": "analyst", | |
| "role": "contributor", | |
| "scope": "reporting-project" | |
| }, | |
| "status": "routed", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "role": "contributor", | |
| "scope": "reporting-project." | |
| }, | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: principal" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: principal" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "role": "contributor", | |
| "scope": "reporting-project." | |
| }, | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: principal" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got needs_clarification", | |
| "missing expected parameter keys: environment, principal" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0014", | |
| "case_type": "missing_fields", | |
| "input": "daily reporting job, details later", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.74, | |
| "workflow": "create_scheduler_job" | |
| } | |
| ], | |
| "clarifying_question": "What job name should RouterCore use?", | |
| "confidence": 0.74, | |
| "failure_reasons": [ | |
| "Missing required fields: job_name, schedule, environment" | |
| ], | |
| "missing_fields": [ | |
| "job_name", | |
| "schedule", | |
| "environment" | |
| ], | |
| "parameters": { | |
| "target": "reporting" | |
| }, | |
| "status": "needs_clarification", | |
| "workflow": "create_scheduler_job" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "schedule": "0 9 * * *" | |
| }, | |
| "missing_fields": [ | |
| "job_name", | |
| "target", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What job name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_scheduler_job", | |
| "missing_fields": [ | |
| "job_name", | |
| "target", | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: job_name, target, environment" | |
| ], | |
| "clarifying_question": "What job name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: job_name, target, environment" | |
| ], | |
| "clarifying_question": "What job name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "schedule": "0 9 * * *" | |
| }, | |
| "missing_fields": [ | |
| "job_name", | |
| "target", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: job_name, target, environment" | |
| ], | |
| "clarifying_question": "What job name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0015", | |
| "case_type": "success", | |
| "input": "Create a staging Python web app for the platform team in Central US.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_web_app" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "app_name": "platform-web-app", | |
| "diagnostics_enabled": true, | |
| "environment": "staging", | |
| "region": "centralus", | |
| "runtime": "python311", | |
| "team": "platform" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_web_app" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "app_name": "platform-web-app", | |
| "region": "centralus", | |
| "runtime": "python311", | |
| "environment": "staging", | |
| "team": "platform", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_web_app", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "routed", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "accepted": true, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "routed", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "app_name": "platform-web-app", | |
| "region": "centralus", | |
| "runtime": "python311", | |
| "environment": "staging", | |
| "team": "platform", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0016", | |
| "case_type": "success", | |
| "input": "Set up blob storage for the mlops team in US Central for development.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_storage_bucket" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "bucket_name": "mlops-bucket", | |
| "environment": "dev", | |
| "region": "us-central1", | |
| "storage_class": "cool", | |
| "team": "mlops" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_storage_bucket" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "bucket_name": "mlops-bucket", | |
| "region": "us-central1", | |
| "environment": "dev", | |
| "team": "mlops" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_storage_bucket", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "routed", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93, | |
| "accepted": true, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "routed", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "bucket_name": "mlops-bucket", | |
| "region": "us-central1", | |
| "environment": "dev", | |
| "team": "mlops" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "missing expected parameter keys: storage_class" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0017", | |
| "case_type": "risky_rejected", | |
| "input": "Give John the admin role across all production.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.88, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.88, | |
| "failure_reasons": [ | |
| "Blocked owner/admin IAM grant." | |
| ], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "prod", | |
| "principal": "john", | |
| "role": "admin", | |
| "scope": "production" | |
| }, | |
| "status": "rejected", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "role": "admin", | |
| "scope": "production", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": { | |
| "role": "admin" | |
| }, | |
| "failure_reasons": [ | |
| "Missing required fields: principal", | |
| "One or more fields contains a blocked value." | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "rejected", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Blocked value for role: admin" | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "rejected", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "role": "admin", | |
| "scope": "production", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Blocked value for role: admin" | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0018", | |
| "case_type": "success", | |
| "input": "Set up a daily job for billing-export for the security team in development.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_scheduler_job" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "dev", | |
| "job_name": "security-nightly-job", | |
| "schedule": "0 9 * * *", | |
| "target": "billing-export", | |
| "team": "security", | |
| "timezone": "UTC" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_scheduler_job" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "job_name": "billing-export-scheduled-job", | |
| "schedule": "0 9 * * *", | |
| "target": "billing-export", | |
| "environment": "dev", | |
| "team": "security" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_scheduler_job", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.93, | |
| "accepted": true, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "job_name": "billing-export-scheduled-job", | |
| "schedule": "0 9 * * *", | |
| "target": "billing-export", | |
| "environment": "dev", | |
| "team": "security" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "missing expected parameter keys: timezone" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0019", | |
| "case_type": "success", | |
| "input": "Need an automation identity for team finance in production.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_service_account" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "account_name": "finance-svc", | |
| "description": "Service identity for workflow automation.", | |
| "environment": "prod", | |
| "team": "finance" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_service_account" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "account_name": "finance-svc", | |
| "team": "finance", | |
| "environment": "prod", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "Please confirm the selected workflow and parameters." | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_service_account", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": true, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Router confidence is between 0.55 and 0.80." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "account_name": "finance-svc", | |
| "team": "finance", | |
| "environment": "prod", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Router confidence is between 0.55 and 0.80." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got requires_confirmation" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0020", | |
| "case_type": "ambiguous", | |
| "input": "Set up reporting.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.38, | |
| "workflow": "create_web_app" | |
| }, | |
| { | |
| "confidence": 0.31, | |
| "workflow": "create_service_account" | |
| } | |
| ], | |
| "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", | |
| "confidence": 0.34, | |
| "failure_reasons": [ | |
| "Request is ambiguous across multiple workflows." | |
| ], | |
| "missing_fields": [], | |
| "parameters": {}, | |
| "status": "needs_clarification", | |
| "workflow": null | |
| }, | |
| "actual_router_output": { | |
| "status": "needs_clarification", | |
| "workflow": null, | |
| "confidence": 0.25, | |
| "parameters": {}, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.25 | |
| }, | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.23 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "No workflow keywords matched with enough confidence." | |
| ], | |
| "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": null, | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Router did not select a workflow." | |
| ], | |
| "clarifying_question": "Which workflow should this request use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": null, | |
| "confidence": 0.25, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "No authoritative workflow could be selected." | |
| ], | |
| "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": null, | |
| "confidence": 0.25, | |
| "parameters": {}, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.25 | |
| }, | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.23 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "No authoritative workflow could be selected." | |
| ], | |
| "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0021", | |
| "case_type": "success", | |
| "input": "cron 0 9 * * * target reporting env staging timezone America/Los_Angeles", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_scheduler_job" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "staging", | |
| "job_name": "security-nightly-job", | |
| "schedule": "0 9 * * *", | |
| "target": "reporting", | |
| "team": "security", | |
| "timezone": "America/Los_Angeles" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_scheduler_job" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "job_name": "reporting-scheduled-job", | |
| "schedule": "0 9 * * *", | |
| "target": "reporting", | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "Please confirm the selected workflow and parameters." | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_scheduler_job", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": true, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Router confidence is between 0.55 and 0.80." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "job_name": "reporting-scheduled-job", | |
| "schedule": "0 9 * * *", | |
| "target": "reporting", | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Router confidence is between 0.55 and 0.80." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got requires_confirmation", | |
| "missing expected parameter keys: team, timezone" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0022", | |
| "case_type": "risky_rejected", | |
| "input": "iam urgent: John needs admin on prod.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.88, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.88, | |
| "failure_reasons": [ | |
| "Blocked owner/admin IAM grant." | |
| ], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "prod", | |
| "principal": "john", | |
| "role": "admin", | |
| "scope": "production" | |
| }, | |
| "status": "rejected", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "role": "admin", | |
| "scope": "prod.", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": { | |
| "role": "admin" | |
| }, | |
| "failure_reasons": [ | |
| "Missing required fields: principal", | |
| "One or more fields contains a blocked value." | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "rejected", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Blocked value for role: admin" | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "rejected", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "role": "admin", | |
| "scope": "prod.", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Blocked value for role: admin" | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0023", | |
| "case_type": "success", | |
| "input": "Need a small .NET app named reporting-web-app in westus for reporting.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_web_app" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "app_name": "reporting-web-app", | |
| "diagnostics_enabled": true, | |
| "environment": "prod", | |
| "region": "westus", | |
| "runtime": "dotnet8", | |
| "team": "reporting" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_web_app" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "app_name": "reporting-web-app", | |
| "region": "westus", | |
| "runtime": "dotnet8", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_web_app", | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "app_name": "reporting-web-app", | |
| "region": "westus", | |
| "runtime": "dotnet8", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got needs_clarification", | |
| "missing expected parameter keys: environment, team" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0024", | |
| "case_type": "risky_rejected", | |
| "input": "Grant Jane admin access to production.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.88, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.88, | |
| "failure_reasons": [ | |
| "Blocked owner/admin IAM grant." | |
| ], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "prod", | |
| "principal": "jane", | |
| "role": "admin", | |
| "scope": "production" | |
| }, | |
| "status": "rejected", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "principal": "jane", | |
| "role": "admin", | |
| "scope": "production.", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": { | |
| "role": "admin" | |
| }, | |
| "failure_reasons": [ | |
| "One or more fields contains a blocked value." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "rejected", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Blocked value for role: admin" | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "rejected", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "principal": "jane", | |
| "role": "admin", | |
| "scope": "production.", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Blocked value for role: admin" | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0025", | |
| "case_type": "success", | |
| "input": "Need a small .NET app named reporting-web-app in centralus for reporting.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_web_app" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "app_name": "reporting-web-app", | |
| "diagnostics_enabled": false, | |
| "environment": "dev", | |
| "region": "centralus", | |
| "runtime": "dotnet8", | |
| "team": "reporting" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_web_app" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "app_name": "reporting-web-app", | |
| "region": "centralus", | |
| "runtime": "dotnet8", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_web_app", | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "app_name": "reporting-web-app", | |
| "region": "centralus", | |
| "runtime": "dotnet8", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got needs_clarification", | |
| "missing expected parameter keys: environment, team" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0026", | |
| "case_type": "success", | |
| "input": "Grant deploy-bot reader access to staging-bucket in development.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "dev", | |
| "principal": "deploy-bot", | |
| "role": "reader", | |
| "scope": "staging-bucket" | |
| }, | |
| "status": "routed", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "principal": "deploy-bot", | |
| "role": "reader", | |
| "scope": "staging-bucket", | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| }, | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "accepted": false, | |
| "requires_confirmation": true, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Workflow is high risk and requires human confirmation." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "principal": "deploy-bot", | |
| "role": "reader", | |
| "scope": "staging-bucket", | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| }, | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Workflow is high risk and requires human confirmation." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got requires_confirmation" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0027", | |
| "case_type": "success", | |
| "input": "Need a small Python app named platform-web-app in westus for platform.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_web_app" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "app_name": "platform-web-app", | |
| "diagnostics_enabled": false, | |
| "environment": "staging", | |
| "region": "westus", | |
| "runtime": "python311", | |
| "team": "platform" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_web_app" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "app_name": "platform-web-app", | |
| "region": "westus", | |
| "runtime": "python311", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_web_app", | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "app_name": "platform-web-app", | |
| "region": "westus", | |
| "runtime": "python311", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got needs_clarification", | |
| "missing expected parameter keys: environment, team" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0028", | |
| "case_type": "success", | |
| "input": "Grant jane viewer access to staging-bucket in development.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "dev", | |
| "principal": "jane", | |
| "role": "viewer", | |
| "scope": "staging-bucket" | |
| }, | |
| "status": "routed", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "principal": "jane", | |
| "role": "viewer", | |
| "scope": "staging-bucket", | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| }, | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "accepted": false, | |
| "requires_confirmation": true, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Workflow is high risk and requires human confirmation." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "principal": "jane", | |
| "role": "viewer", | |
| "scope": "staging-bucket", | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| }, | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Workflow is high risk and requires human confirmation." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got requires_confirmation" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0029", | |
| "case_type": "confirmation_required", | |
| "input": "iam: ops-lead role contributor scope production", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.82, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.82, | |
| "failure_reasons": [ | |
| "High-risk IAM change requires confirmation." | |
| ], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "prod", | |
| "principal": "ops-lead", | |
| "role": "contributor", | |
| "scope": "production" | |
| }, | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "role": "contributor", | |
| "scope": "production", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: principal" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: principal" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "role": "contributor", | |
| "scope": "production", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: principal" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected requires_confirmation, got needs_clarification", | |
| "missing expected parameter keys: principal" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0030", | |
| "case_type": "missing_fields", | |
| "input": "iam access needed for deploy-bot, scope TBD", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.74, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": "What role should RouterCore use?", | |
| "confidence": 0.74, | |
| "failure_reasons": [ | |
| "Missing required fields: role, scope" | |
| ], | |
| "missing_fields": [ | |
| "role", | |
| "scope" | |
| ], | |
| "parameters": { | |
| "principal": "deploy-bot" | |
| }, | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "scope": "deploy-bot" | |
| }, | |
| "missing_fields": [ | |
| "principal", | |
| "role" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [ | |
| "principal", | |
| "role" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: principal, role" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: principal, role" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "scope": "deploy-bot" | |
| }, | |
| "missing_fields": [ | |
| "principal", | |
| "role" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: principal, role" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0031", | |
| "case_type": "success", | |
| "input": "Grant deploy-bot viewer access to staging-bucket in production.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "prod", | |
| "principal": "deploy-bot", | |
| "role": "viewer", | |
| "scope": "staging-bucket" | |
| }, | |
| "status": "routed", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "principal": "deploy-bot", | |
| "role": "viewer", | |
| "scope": "staging-bucket", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| }, | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "accepted": false, | |
| "requires_confirmation": true, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Workflow is high risk and requires human confirmation.", | |
| "IAM request targets production or broad-scope permissions." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "principal": "deploy-bot", | |
| "role": "viewer", | |
| "scope": "staging-bucket", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| }, | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Workflow is high risk and requires human confirmation.", | |
| "IAM request targets production or broad-scope permissions." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got requires_confirmation" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0032", | |
| "case_type": "missing_fields", | |
| "input": "permission request for jane", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.74, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": "What role should RouterCore use?", | |
| "confidence": 0.74, | |
| "failure_reasons": [ | |
| "Missing required fields: role, scope" | |
| ], | |
| "missing_fields": [ | |
| "role", | |
| "scope" | |
| ], | |
| "parameters": { | |
| "principal": "jane" | |
| }, | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "scope": "request" | |
| }, | |
| "missing_fields": [ | |
| "principal", | |
| "role" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [ | |
| "principal", | |
| "role" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: principal, role" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: principal, role" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "scope": "request" | |
| }, | |
| "missing_fields": [ | |
| "principal", | |
| "role" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: principal, role" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0033", | |
| "case_type": "missing_fields", | |
| "input": "bucket needed for security, no location picked yet", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.74, | |
| "workflow": "create_storage_bucket" | |
| } | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?", | |
| "confidence": 0.74, | |
| "failure_reasons": [ | |
| "Missing required fields: bucket_name, region, environment" | |
| ], | |
| "missing_fields": [ | |
| "bucket_name", | |
| "region", | |
| "environment" | |
| ], | |
| "parameters": { | |
| "team": "security" | |
| }, | |
| "status": "needs_clarification", | |
| "workflow": "create_storage_bucket" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69, | |
| "parameters": {}, | |
| "missing_fields": [ | |
| "bucket_name", | |
| "region", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_storage_bucket", | |
| "missing_fields": [ | |
| "bucket_name", | |
| "region", | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: bucket_name, region, environment" | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: bucket_name, region, environment" | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69, | |
| "parameters": {}, | |
| "missing_fields": [ | |
| "bucket_name", | |
| "region", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: bucket_name, region, environment" | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0034", | |
| "case_type": "confirmation_required", | |
| "input": "iam: ops-lead role reader scope production", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.82, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.82, | |
| "failure_reasons": [ | |
| "High-risk IAM change requires confirmation." | |
| ], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "prod", | |
| "principal": "ops-lead", | |
| "role": "reader", | |
| "scope": "production" | |
| }, | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "role": "reader", | |
| "scope": "production", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: principal" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: principal" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "role": "reader", | |
| "scope": "production", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: principal" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected requires_confirmation, got needs_clarification", | |
| "missing expected parameter keys: principal" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0035", | |
| "case_type": "missing_fields", | |
| "input": "identity needed for team reporting", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.74, | |
| "workflow": "create_service_account" | |
| } | |
| ], | |
| "clarifying_question": "What account name should RouterCore use?", | |
| "confidence": 0.74, | |
| "failure_reasons": [ | |
| "Missing required fields: account_name, environment" | |
| ], | |
| "missing_fields": [ | |
| "account_name", | |
| "environment" | |
| ], | |
| "parameters": { | |
| "team": "reporting" | |
| }, | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "account_name": "reporting-svc", | |
| "team": "reporting", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_service_account", | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "account_name": "reporting-svc", | |
| "team": "reporting", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0036", | |
| "case_type": "success", | |
| "input": "Create a nightly scheduler job named growth-nightly-job for model-refresh in production.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_scheduler_job" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "prod", | |
| "job_name": "growth-nightly-job", | |
| "schedule": "0 9 * * *", | |
| "target": "model-refresh", | |
| "team": "growth", | |
| "timezone": "America/Los_Angeles" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_scheduler_job" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95, | |
| "parameters": { | |
| "job_name": "growth-nightly-job", | |
| "schedule": "0 2 * * *", | |
| "target": "model-refresh", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_scheduler_job", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95, | |
| "accepted": true, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95, | |
| "parameters": { | |
| "job_name": "growth-nightly-job", | |
| "schedule": "0 2 * * *", | |
| "target": "model-refresh", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "missing expected parameter keys: team, timezone" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0037", | |
| "case_type": "success", | |
| "input": "Create a service account named security-svc for the security team in production.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_service_account" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "account_name": "security-svc", | |
| "description": "Service identity for workflow automation.", | |
| "environment": "prod", | |
| "team": "security" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_service_account" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "account_name": "security-svc", | |
| "team": "security", | |
| "environment": "prod", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "Please confirm the selected workflow and parameters." | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_service_account", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": true, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Router confidence is between 0.55 and 0.80." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "account_name": "security-svc", | |
| "team": "security", | |
| "environment": "prod", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Router confidence is between 0.55 and 0.80." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got requires_confirmation" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0038", | |
| "case_type": "success", | |
| "input": "Create a production Node.js web app for the growth team in US Central.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_web_app" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "app_name": "growth-web-app", | |
| "diagnostics_enabled": true, | |
| "environment": "prod", | |
| "region": "us-central1", | |
| "runtime": "nodejs20", | |
| "team": "growth" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_web_app" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "app_name": "growth-web-app", | |
| "region": "us-central1", | |
| "runtime": "nodejs20", | |
| "environment": "prod", | |
| "team": "growth", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_web_app", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "routed", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "accepted": true, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "routed", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "app_name": "growth-web-app", | |
| "region": "us-central1", | |
| "runtime": "nodejs20", | |
| "environment": "prod", | |
| "team": "growth", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0039", | |
| "case_type": "success", | |
| "input": "Create a standard storage bucket named platform-bucket in US Central for development.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_storage_bucket" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "bucket_name": "platform-bucket", | |
| "environment": "dev", | |
| "region": "us-central1", | |
| "storage_class": "standard", | |
| "team": "platform" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_storage_bucket" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "bucket_name": "platform-bucket", | |
| "region": "us-central1", | |
| "environment": "dev", | |
| "storage_class": "standard" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_storage_bucket", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "routed", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93, | |
| "accepted": true, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "routed", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "bucket_name": "platform-bucket", | |
| "region": "us-central1", | |
| "environment": "dev", | |
| "storage_class": "standard" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "missing expected parameter keys: team" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0040", | |
| "case_type": "ambiguous", | |
| "input": "Prep access and automation for the new project.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.38, | |
| "workflow": "create_service_account" | |
| }, | |
| { | |
| "confidence": 0.31, | |
| "workflow": "create_scheduler_job" | |
| } | |
| ], | |
| "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", | |
| "confidence": 0.34, | |
| "failure_reasons": [ | |
| "Request is ambiguous across multiple workflows." | |
| ], | |
| "missing_fields": [], | |
| "parameters": {}, | |
| "status": "needs_clarification", | |
| "workflow": null | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "scope": "for" | |
| }, | |
| "missing_fields": [ | |
| "principal", | |
| "role" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [ | |
| "principal", | |
| "role" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: principal, role" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: principal, role" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "scope": "for" | |
| }, | |
| "missing_fields": [ | |
| "principal", | |
| "role" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: principal, role" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0041", | |
| "case_type": "success", | |
| "input": "Create a nightly scheduler job named claims-nightly-job for billing-export in staging.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_scheduler_job" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "staging", | |
| "job_name": "claims-nightly-job", | |
| "schedule": "0 9 * * *", | |
| "target": "billing-export", | |
| "team": "claims", | |
| "timezone": "America/Los_Angeles" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_scheduler_job" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95, | |
| "parameters": { | |
| "job_name": "claims-nightly-job", | |
| "schedule": "0 2 * * *", | |
| "target": "billing-export", | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_scheduler_job", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95, | |
| "accepted": true, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95, | |
| "parameters": { | |
| "job_name": "claims-nightly-job", | |
| "schedule": "0 2 * * *", | |
| "target": "billing-export", | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "missing expected parameter keys: team, timezone" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0042", | |
| "case_type": "confirmation_required", | |
| "input": "iam: jane role contributor scope all reporting resources", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.82, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.82, | |
| "failure_reasons": [ | |
| "High-risk IAM change requires confirmation." | |
| ], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "staging", | |
| "principal": "jane", | |
| "role": "contributor", | |
| "scope": "all reporting resources" | |
| }, | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "role": "contributor" | |
| }, | |
| "missing_fields": [ | |
| "principal", | |
| "scope" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [ | |
| "principal", | |
| "scope" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: principal, scope" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: principal, scope" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "role": "contributor" | |
| }, | |
| "missing_fields": [ | |
| "principal", | |
| "scope" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: principal, scope" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected requires_confirmation, got needs_clarification", | |
| "missing expected parameter keys: environment, principal, scope" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0043", | |
| "case_type": "ambiguous", | |
| "input": "Set up reporting.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.38, | |
| "workflow": "create_service_account" | |
| }, | |
| { | |
| "confidence": 0.31, | |
| "workflow": "create_storage_bucket" | |
| } | |
| ], | |
| "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", | |
| "confidence": 0.34, | |
| "failure_reasons": [ | |
| "Request is ambiguous across multiple workflows." | |
| ], | |
| "missing_fields": [], | |
| "parameters": {}, | |
| "status": "needs_clarification", | |
| "workflow": null | |
| }, | |
| "actual_router_output": { | |
| "status": "needs_clarification", | |
| "workflow": null, | |
| "confidence": 0.25, | |
| "parameters": {}, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.25 | |
| }, | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.23 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "No workflow keywords matched with enough confidence." | |
| ], | |
| "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": null, | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Router did not select a workflow." | |
| ], | |
| "clarifying_question": "Which workflow should this request use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": null, | |
| "confidence": 0.25, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "No authoritative workflow could be selected." | |
| ], | |
| "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": null, | |
| "confidence": 0.25, | |
| "parameters": {}, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.25 | |
| }, | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.23 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "No authoritative workflow could be selected." | |
| ], | |
| "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0044", | |
| "case_type": "success", | |
| "input": "Need a small Node.js app named growth-web-app in westus for growth.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_web_app" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "app_name": "growth-web-app", | |
| "diagnostics_enabled": false, | |
| "environment": "prod", | |
| "region": "westus", | |
| "runtime": "nodejs20", | |
| "team": "growth" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_web_app" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "app_name": "growth-web-app", | |
| "region": "westus", | |
| "runtime": "nodejs20", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_web_app", | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "app_name": "growth-web-app", | |
| "region": "westus", | |
| "runtime": "nodejs20", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got needs_clarification", | |
| "missing expected parameter keys: environment, team" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0045", | |
| "case_type": "missing_fields", | |
| "input": "need api for reporting, details TBD", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.74, | |
| "workflow": "create_web_app" | |
| } | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?", | |
| "confidence": 0.74, | |
| "failure_reasons": [ | |
| "Missing required fields: app_name, region, environment" | |
| ], | |
| "missing_fields": [ | |
| "app_name", | |
| "region", | |
| "environment" | |
| ], | |
| "parameters": { | |
| "runtime": "python311", | |
| "team": "reporting" | |
| }, | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [ | |
| "app_name", | |
| "region", | |
| "runtime", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_web_app", | |
| "missing_fields": [ | |
| "app_name", | |
| "region", | |
| "runtime", | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: app_name, region, runtime, environment" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: app_name, region, runtime, environment" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [ | |
| "app_name", | |
| "region", | |
| "runtime", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: app_name, region, runtime, environment" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0046", | |
| "case_type": "success", | |
| "input": "Grant jane reader access to reporting-project in staging.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "staging", | |
| "principal": "jane", | |
| "role": "reader", | |
| "scope": "reporting-project" | |
| }, | |
| "status": "routed", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "principal": "jane", | |
| "role": "reader", | |
| "scope": "reporting-project", | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "accepted": false, | |
| "requires_confirmation": true, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Workflow is high risk and requires human confirmation." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "principal": "jane", | |
| "role": "reader", | |
| "scope": "reporting-project", | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Workflow is high risk and requires human confirmation." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got requires_confirmation" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0047", | |
| "case_type": "success", | |
| "input": "Give analyst the viewer role on claims-app.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "prod", | |
| "principal": "analyst", | |
| "role": "viewer", | |
| "scope": "claims-app" | |
| }, | |
| "status": "routed", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [ | |
| "app_name", | |
| "region", | |
| "runtime", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| }, | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_web_app", | |
| "missing_fields": [ | |
| "app_name", | |
| "region", | |
| "runtime", | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: app_name, region, runtime, environment" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: app_name, region, runtime, environment" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [ | |
| "app_name", | |
| "region", | |
| "runtime", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| }, | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: app_name, region, runtime, environment" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got needs_clarification", | |
| "workflow mismatch: expected grant_iam_role, got create_web_app", | |
| "missing expected parameter keys: environment, principal, role, scope" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0048", | |
| "case_type": "success", | |
| "input": "Create a nightly scheduler job named reporting-nightly-job for claims-sync in production.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_scheduler_job" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "prod", | |
| "job_name": "reporting-nightly-job", | |
| "schedule": "0 9 * * *", | |
| "target": "claims-sync", | |
| "team": "reporting", | |
| "timezone": "America/New_York" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_scheduler_job" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95, | |
| "parameters": { | |
| "job_name": "reporting-nightly-job", | |
| "schedule": "0 2 * * *", | |
| "target": "claims-sync", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_scheduler_job", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95, | |
| "accepted": true, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95, | |
| "parameters": { | |
| "job_name": "reporting-nightly-job", | |
| "schedule": "0 2 * * *", | |
| "target": "claims-sync", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "missing expected parameter keys: team, timezone" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0049", | |
| "case_type": "success", | |
| "input": "ticket: finance staging api, runtime Python, region West US, diagnostics on", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_web_app" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "app_name": "finance-web-app", | |
| "diagnostics_enabled": true, | |
| "environment": "staging", | |
| "region": "westus", | |
| "runtime": "python311", | |
| "team": "finance" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_web_app" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "region": "westus", | |
| "runtime": "python311", | |
| "environment": "staging", | |
| "diagnostics_enabled": true | |
| }, | |
| "missing_fields": [ | |
| "app_name" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_web_app", | |
| "missing_fields": [ | |
| "app_name" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: app_name" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: app_name" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "region": "westus", | |
| "runtime": "python311", | |
| "environment": "staging", | |
| "diagnostics_enabled": true | |
| }, | |
| "missing_fields": [ | |
| "app_name" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: app_name" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got needs_clarification", | |
| "missing expected parameter keys: app_name, team" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0050", | |
| "case_type": "ambiguous", | |
| "input": "Prep access and automation for the new project.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.38, | |
| "workflow": "create_scheduler_job" | |
| }, | |
| { | |
| "confidence": 0.31, | |
| "workflow": "create_service_account" | |
| } | |
| ], | |
| "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", | |
| "confidence": 0.34, | |
| "failure_reasons": [ | |
| "Request is ambiguous across multiple workflows." | |
| ], | |
| "missing_fields": [], | |
| "parameters": {}, | |
| "status": "needs_clarification", | |
| "workflow": null | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "scope": "for" | |
| }, | |
| "missing_fields": [ | |
| "principal", | |
| "role" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [ | |
| "principal", | |
| "role" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: principal, role" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: principal, role" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "scope": "for" | |
| }, | |
| "missing_fields": [ | |
| "principal", | |
| "role" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: principal, role" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0051", | |
| "case_type": "success", | |
| "input": "cron 0 9 * * * target model-refresh env staging timezone America/New_York", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_scheduler_job" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "staging", | |
| "job_name": "finance-nightly-job", | |
| "schedule": "0 9 * * *", | |
| "target": "model-refresh", | |
| "team": "finance", | |
| "timezone": "America/New_York" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_scheduler_job" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "job_name": "model-refresh-scheduled-job", | |
| "schedule": "0 9 * * *", | |
| "target": "model-refresh", | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "Please confirm the selected workflow and parameters." | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_scheduler_job", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": true, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Router confidence is between 0.55 and 0.80." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "job_name": "model-refresh-scheduled-job", | |
| "schedule": "0 9 * * *", | |
| "target": "model-refresh", | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Router confidence is between 0.55 and 0.80." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got requires_confirmation", | |
| "missing expected parameter keys: team, timezone" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0052", | |
| "case_type": "success", | |
| "input": "Give john the editor role on dev-subsystem.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "staging", | |
| "principal": "john", | |
| "role": "editor", | |
| "scope": "dev-subsystem" | |
| }, | |
| "status": "routed", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "role": "editor", | |
| "scope": "dev-subsystem.", | |
| "environment": "dev" | |
| }, | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: principal" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: principal" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "role": "editor", | |
| "scope": "dev-subsystem.", | |
| "environment": "dev" | |
| }, | |
| "missing_fields": [ | |
| "principal" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: principal" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got needs_clarification", | |
| "missing expected parameter keys: principal" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0053", | |
| "case_type": "missing_fields", | |
| "input": "Set up a reporting schedule.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.74, | |
| "workflow": "create_scheduler_job" | |
| } | |
| ], | |
| "clarifying_question": "What job name should RouterCore use?", | |
| "confidence": 0.74, | |
| "failure_reasons": [ | |
| "Missing required fields: job_name, schedule, environment" | |
| ], | |
| "missing_fields": [ | |
| "job_name", | |
| "schedule", | |
| "environment" | |
| ], | |
| "parameters": { | |
| "target": "reporting" | |
| }, | |
| "status": "needs_clarification", | |
| "workflow": "create_scheduler_job" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "parameters": {}, | |
| "missing_fields": [ | |
| "job_name", | |
| "schedule", | |
| "target", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What job name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_scheduler_job", | |
| "missing_fields": [ | |
| "job_name", | |
| "schedule", | |
| "target", | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: job_name, schedule, target, environment" | |
| ], | |
| "clarifying_question": "What job name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: job_name, schedule, target, environment" | |
| ], | |
| "clarifying_question": "What job name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69, | |
| "parameters": {}, | |
| "missing_fields": [ | |
| "job_name", | |
| "schedule", | |
| "target", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: job_name, schedule, target, environment" | |
| ], | |
| "clarifying_question": "What job name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0054", | |
| "case_type": "success", | |
| "input": "Create a nightly scheduler job named growth-nightly-job for reporting in staging.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_scheduler_job" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "staging", | |
| "job_name": "growth-nightly-job", | |
| "schedule": "0 2 * * *", | |
| "target": "reporting", | |
| "team": "growth", | |
| "timezone": "America/New_York" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_scheduler_job" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95, | |
| "parameters": { | |
| "job_name": "growth-nightly-job", | |
| "schedule": "0 2 * * *", | |
| "target": "reporting", | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_scheduler_job", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95, | |
| "accepted": true, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95, | |
| "parameters": { | |
| "job_name": "growth-nightly-job", | |
| "schedule": "0 2 * * *", | |
| "target": "reporting", | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.95 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "missing expected parameter keys: team, timezone" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0055", | |
| "case_type": "ambiguous", | |
| "input": "Make the nightly thing happen.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.38, | |
| "workflow": "create_service_account" | |
| }, | |
| { | |
| "confidence": 0.31, | |
| "workflow": "create_storage_bucket" | |
| } | |
| ], | |
| "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", | |
| "confidence": 0.34, | |
| "failure_reasons": [ | |
| "Request is ambiguous across multiple workflows." | |
| ], | |
| "missing_fields": [], | |
| "parameters": {}, | |
| "status": "needs_clarification", | |
| "workflow": null | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_web_app", | |
| "confidence": 0.58, | |
| "parameters": { | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [ | |
| "app_name", | |
| "region", | |
| "runtime", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.58 | |
| }, | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.58 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_web_app", | |
| "missing_fields": [ | |
| "app_name", | |
| "region", | |
| "runtime", | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: app_name, region, runtime, environment" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.58, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: app_name, region, runtime, environment" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.58, | |
| "parameters": { | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [ | |
| "app_name", | |
| "region", | |
| "runtime", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.58 | |
| }, | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.58 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: app_name, region, runtime, environment" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0056", | |
| "case_type": "success", | |
| "input": "identity request: growth service account, env prod, name growth-svc", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_service_account" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "account_name": "growth-svc", | |
| "description": "Service identity for workflow automation.", | |
| "environment": "prod", | |
| "team": "growth" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_service_account" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_service_account", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "environment": "prod", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [ | |
| "account_name", | |
| "team" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What account name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_service_account", | |
| "missing_fields": [ | |
| "account_name", | |
| "team" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: account_name, team" | |
| ], | |
| "clarifying_question": "What account name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account", | |
| "confidence": 0.93, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: account_name, team" | |
| ], | |
| "clarifying_question": "What account name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "environment": "prod", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [ | |
| "account_name", | |
| "team" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: account_name, team" | |
| ], | |
| "clarifying_question": "What account name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got needs_clarification", | |
| "missing expected parameter keys: account_name, team" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0057", | |
| "case_type": "success", | |
| "input": "Create a production .NET web app for the reporting team in West US.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_web_app" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "app_name": "reporting-web-app", | |
| "diagnostics_enabled": false, | |
| "environment": "prod", | |
| "region": "westus", | |
| "runtime": "dotnet8", | |
| "team": "reporting" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_web_app" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "app_name": "reporting-web-app", | |
| "region": "westus", | |
| "runtime": "dotnet8", | |
| "environment": "prod", | |
| "team": "reporting", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_web_app", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "routed", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "accepted": true, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "routed", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "app_name": "reporting-web-app", | |
| "region": "westus", | |
| "runtime": "dotnet8", | |
| "environment": "prod", | |
| "team": "reporting", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0058", | |
| "case_type": "missing_fields", | |
| "input": "service account request, owner team security", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.74, | |
| "workflow": "create_service_account" | |
| } | |
| ], | |
| "clarifying_question": "What account name should RouterCore use?", | |
| "confidence": 0.74, | |
| "failure_reasons": [ | |
| "Missing required fields: account_name, environment" | |
| ], | |
| "missing_fields": [ | |
| "account_name", | |
| "environment" | |
| ], | |
| "parameters": { | |
| "team": "security" | |
| }, | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "account_name": "security-svc", | |
| "team": "security", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_service_account", | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "account_name": "security-svc", | |
| "team": "security", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0059", | |
| "case_type": "success", | |
| "input": "Create a production .NET web app for the security team in West US.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_web_app" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "app_name": "security-web-app", | |
| "diagnostics_enabled": true, | |
| "environment": "prod", | |
| "region": "westus", | |
| "runtime": "dotnet8", | |
| "team": "security" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_web_app" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "app_name": "security-web-app", | |
| "region": "westus", | |
| "runtime": "dotnet8", | |
| "environment": "prod", | |
| "team": "security", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_web_app", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "routed", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "accepted": true, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "routed", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "app_name": "security-web-app", | |
| "region": "westus", | |
| "runtime": "dotnet8", | |
| "environment": "prod", | |
| "team": "security", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0060", | |
| "case_type": "success", | |
| "input": "Set up a daily job for reporting for the growth team in development.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_scheduler_job" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "dev", | |
| "job_name": "growth-nightly-job", | |
| "schedule": "0 2 * * *", | |
| "target": "reporting", | |
| "team": "growth", | |
| "timezone": "UTC" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_scheduler_job" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "job_name": "reporting-scheduled-job", | |
| "schedule": "0 9 * * *", | |
| "target": "reporting", | |
| "environment": "dev", | |
| "team": "growth" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_scheduler_job", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.93, | |
| "accepted": true, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "job_name": "reporting-scheduled-job", | |
| "schedule": "0 9 * * *", | |
| "target": "reporting", | |
| "environment": "dev", | |
| "team": "growth" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "missing expected parameter keys: timezone" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0061", | |
| "case_type": "success", | |
| "input": "Set up a daily job for reporting for the reporting team in staging.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_scheduler_job" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "staging", | |
| "job_name": "reporting-nightly-job", | |
| "schedule": "0 9 * * *", | |
| "target": "reporting", | |
| "team": "reporting", | |
| "timezone": "America/New_York" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_scheduler_job" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "job_name": "reporting-scheduled-job", | |
| "schedule": "0 9 * * *", | |
| "target": "reporting", | |
| "environment": "staging", | |
| "team": "reporting" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_scheduler_job", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.93, | |
| "accepted": true, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "routed", | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "job_name": "reporting-scheduled-job", | |
| "schedule": "0 9 * * *", | |
| "target": "reporting", | |
| "environment": "staging", | |
| "team": "reporting" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Route accepted for execution preview only." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "missing expected parameter keys: timezone" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0062", | |
| "case_type": "success", | |
| "input": "Need an automation identity for team growth in development.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_service_account" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "account_name": "growth-svc", | |
| "description": "Service identity for workflow automation.", | |
| "environment": "dev", | |
| "team": "growth" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_service_account" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "account_name": "growth-svc", | |
| "team": "growth", | |
| "environment": "dev", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "Please confirm the selected workflow and parameters." | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "create_service_account", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": true, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Router confidence is between 0.55 and 0.80." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "account_name": "growth-svc", | |
| "team": "growth", | |
| "environment": "dev", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Router confidence is between 0.55 and 0.80." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got requires_confirmation" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0063", | |
| "case_type": "ambiguous", | |
| "input": "Prep access and automation for the new project.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.38, | |
| "workflow": "grant_iam_role" | |
| }, | |
| { | |
| "confidence": 0.31, | |
| "workflow": "create_service_account" | |
| } | |
| ], | |
| "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", | |
| "confidence": 0.34, | |
| "failure_reasons": [ | |
| "Request is ambiguous across multiple workflows." | |
| ], | |
| "missing_fields": [], | |
| "parameters": {}, | |
| "status": "needs_clarification", | |
| "workflow": null | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "scope": "for" | |
| }, | |
| "missing_fields": [ | |
| "principal", | |
| "role" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [ | |
| "principal", | |
| "role" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: principal, role" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: principal, role" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "scope": "for" | |
| }, | |
| "missing_fields": [ | |
| "principal", | |
| "role" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: principal, role" | |
| ], | |
| "clarifying_question": "What principal should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0064", | |
| "case_type": "missing_fields", | |
| "input": "Create a Python web app for the growth team.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.74, | |
| "workflow": "create_web_app" | |
| } | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?", | |
| "confidence": 0.74, | |
| "failure_reasons": [ | |
| "Missing required fields: app_name, region, environment" | |
| ], | |
| "missing_fields": [ | |
| "app_name", | |
| "region", | |
| "environment" | |
| ], | |
| "parameters": { | |
| "runtime": "python311", | |
| "team": "growth" | |
| }, | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "app_name": "growth-web-app", | |
| "runtime": "python311", | |
| "team": "growth", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [ | |
| "region", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What region should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_web_app", | |
| "missing_fields": [ | |
| "region", | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: region, environment" | |
| ], | |
| "clarifying_question": "What region should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: region, environment" | |
| ], | |
| "clarifying_question": "What region should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "app_name": "growth-web-app", | |
| "runtime": "python311", | |
| "team": "growth", | |
| "diagnostics_enabled": false | |
| }, | |
| "missing_fields": [ | |
| "region", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: region, environment" | |
| ], | |
| "clarifying_question": "What region should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0065", | |
| "case_type": "missing_fields", | |
| "input": "bucket needed for mlops, no location picked yet", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.74, | |
| "workflow": "create_storage_bucket" | |
| } | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?", | |
| "confidence": 0.74, | |
| "failure_reasons": [ | |
| "Missing required fields: bucket_name, region, environment" | |
| ], | |
| "missing_fields": [ | |
| "bucket_name", | |
| "region", | |
| "environment" | |
| ], | |
| "parameters": { | |
| "team": "mlops" | |
| }, | |
| "status": "needs_clarification", | |
| "workflow": "create_storage_bucket" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69, | |
| "parameters": {}, | |
| "missing_fields": [ | |
| "bucket_name", | |
| "region", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_storage_bucket", | |
| "missing_fields": [ | |
| "bucket_name", | |
| "region", | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: bucket_name, region, environment" | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: bucket_name, region, environment" | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69, | |
| "parameters": {}, | |
| "missing_fields": [ | |
| "bucket_name", | |
| "region", | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: bucket_name, region, environment" | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0066", | |
| "case_type": "success", | |
| "input": "Give jane the viewer role on staging-bucket.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "prod", | |
| "principal": "jane", | |
| "role": "viewer", | |
| "scope": "staging-bucket" | |
| }, | |
| "status": "routed", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [ | |
| "bucket_name", | |
| "region" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| }, | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_storage_bucket", | |
| "missing_fields": [ | |
| "bucket_name", | |
| "region" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: bucket_name, region" | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: bucket_name, region" | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "environment": "staging" | |
| }, | |
| "missing_fields": [ | |
| "bucket_name", | |
| "region" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_storage_bucket", | |
| "confidence": 0.69 | |
| }, | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: bucket_name, region" | |
| ], | |
| "clarifying_question": "What bucket name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got needs_clarification", | |
| "workflow mismatch: expected grant_iam_role, got create_storage_bucket", | |
| "missing expected parameter keys: principal, role, scope" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0067", | |
| "case_type": "missing_fields", | |
| "input": "Create a service account for the mlops team.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.74, | |
| "workflow": "create_service_account" | |
| } | |
| ], | |
| "clarifying_question": "What account name should RouterCore use?", | |
| "confidence": 0.74, | |
| "failure_reasons": [ | |
| "Missing required fields: account_name, environment" | |
| ], | |
| "missing_fields": [ | |
| "account_name", | |
| "environment" | |
| ], | |
| "parameters": { | |
| "team": "mlops" | |
| }, | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "account_name": "mlops-svc", | |
| "team": "mlops", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_service_account", | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "account_name": "mlops-svc", | |
| "team": "mlops", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0068", | |
| "case_type": "missing_fields", | |
| "input": "service account request, owner team security", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.74, | |
| "workflow": "create_service_account" | |
| } | |
| ], | |
| "clarifying_question": "What account name should RouterCore use?", | |
| "confidence": 0.74, | |
| "failure_reasons": [ | |
| "Missing required fields: account_name, environment" | |
| ], | |
| "missing_fields": [ | |
| "account_name", | |
| "environment" | |
| ], | |
| "parameters": { | |
| "team": "security" | |
| }, | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "account_name": "security-svc", | |
| "team": "security", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_service_account", | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "account_name": "security-svc", | |
| "team": "security", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0069", | |
| "case_type": "success", | |
| "input": "ticket: reporting development api, runtime Node.js, region West US, diagnostics on", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_web_app" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "app_name": "reporting-web-app", | |
| "diagnostics_enabled": false, | |
| "environment": "dev", | |
| "region": "westus", | |
| "runtime": "nodejs20", | |
| "team": "reporting" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_web_app" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "region": "westus", | |
| "runtime": "nodejs20", | |
| "environment": "dev", | |
| "diagnostics_enabled": true | |
| }, | |
| "missing_fields": [ | |
| "app_name" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_web_app", | |
| "missing_fields": [ | |
| "app_name" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: app_name" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: app_name" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_web_app", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "region": "westus", | |
| "runtime": "nodejs20", | |
| "environment": "dev", | |
| "diagnostics_enabled": true | |
| }, | |
| "missing_fields": [ | |
| "app_name" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: app_name" | |
| ], | |
| "clarifying_question": "What app name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got needs_clarification", | |
| "missing expected parameter keys: app_name, team" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0070", | |
| "case_type": "ambiguous", | |
| "input": "Set up reporting.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.38, | |
| "workflow": "create_storage_bucket" | |
| }, | |
| { | |
| "confidence": 0.31, | |
| "workflow": "create_scheduler_job" | |
| } | |
| ], | |
| "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", | |
| "confidence": 0.34, | |
| "failure_reasons": [ | |
| "Request is ambiguous across multiple workflows." | |
| ], | |
| "missing_fields": [], | |
| "parameters": {}, | |
| "status": "needs_clarification", | |
| "workflow": null | |
| }, | |
| "actual_router_output": { | |
| "status": "needs_clarification", | |
| "workflow": null, | |
| "confidence": 0.25, | |
| "parameters": {}, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.25 | |
| }, | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.23 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "No workflow keywords matched with enough confidence." | |
| ], | |
| "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": null, | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Router did not select a workflow." | |
| ], | |
| "clarifying_question": "Which workflow should this request use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": null, | |
| "confidence": 0.25, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "No authoritative workflow could be selected." | |
| ], | |
| "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": null, | |
| "confidence": 0.25, | |
| "parameters": {}, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.25 | |
| }, | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.23 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "No authoritative workflow could be selected." | |
| ], | |
| "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0071", | |
| "case_type": "risky_rejected", | |
| "input": "Remove all security from production.", | |
| "expected": { | |
| "candidate_workflows": [], | |
| "clarifying_question": null, | |
| "confidence": 0.88, | |
| "failure_reasons": [ | |
| "Unsafe destructive production request." | |
| ], | |
| "missing_fields": [], | |
| "parameters": {}, | |
| "status": "rejected", | |
| "workflow": null | |
| }, | |
| "actual_router_output": { | |
| "status": "needs_clarification", | |
| "workflow": null, | |
| "confidence": 0.25, | |
| "parameters": {}, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.25 | |
| }, | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.23 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "No workflow keywords matched with enough confidence." | |
| ], | |
| "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": null, | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Router did not select a workflow." | |
| ], | |
| "clarifying_question": "Which workflow should this request use?" | |
| }, | |
| "policy_decision": { | |
| "status": "rejected", | |
| "workflow": null, | |
| "confidence": 0.25, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Unsafe phrase matched: remove all security" | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "rejected", | |
| "workflow": null, | |
| "confidence": 0.25, | |
| "parameters": {}, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_web_app", | |
| "confidence": 0.25 | |
| }, | |
| { | |
| "workflow": "create_scheduler_job", | |
| "confidence": 0.23 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Unsafe phrase matched: remove all security" | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0072", | |
| "case_type": "confirmation_required", | |
| "input": "Grant deploy-bot contributor access to production.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.82, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.82, | |
| "failure_reasons": [ | |
| "High-risk IAM change requires confirmation." | |
| ], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "prod", | |
| "principal": "deploy-bot", | |
| "role": "contributor", | |
| "scope": "production" | |
| }, | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "principal": "deploy-bot", | |
| "role": "contributor", | |
| "scope": "production.", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "accepted": false, | |
| "requires_confirmation": true, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Workflow is high risk and requires human confirmation.", | |
| "IAM request targets production or broad-scope permissions." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "principal": "deploy-bot", | |
| "role": "contributor", | |
| "scope": "production.", | |
| "environment": "prod" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Workflow is high risk and requires human confirmation.", | |
| "IAM request targets production or broad-scope permissions." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0073", | |
| "case_type": "missing_fields", | |
| "input": "service account request, owner team security", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.74, | |
| "workflow": "create_service_account" | |
| } | |
| ], | |
| "clarifying_question": "What account name should RouterCore use?", | |
| "confidence": 0.74, | |
| "failure_reasons": [ | |
| "Missing required fields: account_name, environment" | |
| ], | |
| "missing_fields": [ | |
| "account_name", | |
| "environment" | |
| ], | |
| "parameters": { | |
| "team": "security" | |
| }, | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account" | |
| }, | |
| "actual_router_output": { | |
| "status": "requires_confirmation", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "account_name": "security-svc", | |
| "team": "security", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_service_account", | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account", | |
| "confidence": 0.69, | |
| "parameters": { | |
| "account_name": "security-svc", | |
| "team": "security", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [ | |
| "environment" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.69 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: environment" | |
| ], | |
| "clarifying_question": "What environment should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "pass" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0074", | |
| "case_type": "success", | |
| "input": "identity request: finance service account, env prod, name finance-svc", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "create_service_account" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "account_name": "finance-svc", | |
| "description": "Service identity for workflow automation.", | |
| "environment": "prod", | |
| "team": "finance" | |
| }, | |
| "status": "routed", | |
| "workflow": "create_service_account" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "create_service_account", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "environment": "prod", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [ | |
| "account_name", | |
| "team" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": "What account name should RouterCore use?" | |
| }, | |
| "validation_result": { | |
| "valid": false, | |
| "workflow": "create_service_account", | |
| "missing_fields": [ | |
| "account_name", | |
| "team" | |
| ], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [ | |
| "Missing required fields: account_name, team" | |
| ], | |
| "clarifying_question": "What account name should RouterCore use?" | |
| }, | |
| "policy_decision": { | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account", | |
| "confidence": 0.93, | |
| "accepted": false, | |
| "requires_confirmation": false, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Missing required fields: account_name, team" | |
| ], | |
| "clarifying_question": "What account name should RouterCore use?" | |
| }, | |
| "actual": { | |
| "status": "needs_clarification", | |
| "workflow": "create_service_account", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "environment": "prod", | |
| "description": "Generated from RouterCore request preview." | |
| }, | |
| "missing_fields": [ | |
| "account_name", | |
| "team" | |
| ], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "create_service_account", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Missing required fields: account_name, team" | |
| ], | |
| "clarifying_question": "What account name should RouterCore use?" | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got needs_clarification", | |
| "missing expected parameter keys: account_name, team" | |
| ] | |
| }, | |
| { | |
| "id": "eval-0075", | |
| "case_type": "success", | |
| "input": "Grant deploy-bot editor access to reporting-project in development.", | |
| "expected": { | |
| "candidate_workflows": [ | |
| { | |
| "confidence": 0.92, | |
| "workflow": "grant_iam_role" | |
| } | |
| ], | |
| "clarifying_question": null, | |
| "confidence": 0.92, | |
| "failure_reasons": [], | |
| "missing_fields": [], | |
| "parameters": { | |
| "environment": "dev", | |
| "principal": "deploy-bot", | |
| "role": "editor", | |
| "scope": "reporting-project" | |
| }, | |
| "status": "routed", | |
| "workflow": "grant_iam_role" | |
| }, | |
| "actual_router_output": { | |
| "status": "routed", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "principal": "deploy-bot", | |
| "role": "editor", | |
| "scope": "reporting-project", | |
| "environment": "dev" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "validation_result": { | |
| "valid": true, | |
| "workflow": "grant_iam_role", | |
| "missing_fields": [], | |
| "invalid_fields": {}, | |
| "blocked_fields": {}, | |
| "failure_reasons": [], | |
| "clarifying_question": null | |
| }, | |
| "policy_decision": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "accepted": false, | |
| "requires_confirmation": true, | |
| "execution_allowed": false, | |
| "reasons": [ | |
| "Workflow is high risk and requires human confirmation." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "actual": { | |
| "status": "requires_confirmation", | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93, | |
| "parameters": { | |
| "principal": "deploy-bot", | |
| "role": "editor", | |
| "scope": "reporting-project", | |
| "environment": "dev" | |
| }, | |
| "missing_fields": [], | |
| "candidate_workflows": [ | |
| { | |
| "workflow": "grant_iam_role", | |
| "confidence": 0.93 | |
| } | |
| ], | |
| "failure_reasons": [ | |
| "Workflow is high risk and requires human confirmation." | |
| ], | |
| "clarifying_question": null | |
| }, | |
| "pass_fail_notes": [ | |
| "status mismatch: expected routed, got requires_confirmation" | |
| ] | |
| } | |
| ] | |
| } |