diff --git "a/eval/results/fakerouter_eval.json" "b/eval/results/fakerouter_eval.json" new file mode 100644--- /dev/null +++ "b/eval/results/fakerouter_eval.json" @@ -0,0 +1,7339 @@ +{ + "summary_metrics": { + "json_validity_rate": 1.0, + "workflow_accuracy": 0.9701492537313433, + "status_accuracy": 0.5733333333333334, + "required_field_presence_accuracy": 0.2857142857142857, + "unsafe_rejection_accuracy": 1.0, + "false_route_rate": 0.0 + }, + "per_example_results": [ + { + "id": "eval-0001", + "case_type": "success", + "input": "cron 0 2 * * * target claims-sync env dev timezone America/New_York", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_scheduler_job" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "dev", + "job_name": "finance-nightly-job", + "schedule": "0 2 * * *", + "target": "claims-sync", + "team": "finance", + "timezone": "America/New_York" + }, + "status": "routed", + "workflow": "create_scheduler_job" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "parameters": { + "job_name": "claims-sync-scheduled-job", + "schedule": "0 2 * * *", + "target": "claims-sync", + "environment": "dev" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "Please confirm the selected workflow and parameters." + }, + "validation_result": { + "valid": true, + "workflow": "create_scheduler_job", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "requires_confirmation", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": true, + "execution_allowed": false, + "reasons": [ + "Router confidence is between 0.55 and 0.80." + ], + "clarifying_question": null + }, + "actual": { + "status": "requires_confirmation", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "parameters": { + "job_name": "claims-sync-scheduled-job", + "schedule": "0 2 * * *", + "target": "claims-sync", + "environment": "dev" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Router confidence is between 0.55 and 0.80." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got requires_confirmation", + "missing expected parameter keys: team, timezone" + ] + }, + { + "id": "eval-0002", + "case_type": "success", + "input": "ticket: mlops production api, runtime .NET, region Central US, diagnostics on", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_web_app" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "app_name": "mlops-web-app", + "diagnostics_enabled": true, + "environment": "prod", + "region": "centralus", + "runtime": "dotnet8", + "team": "mlops" + }, + "status": "routed", + "workflow": "create_web_app" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "region": "centralus", + "runtime": "dotnet8", + "environment": "prod", + "diagnostics_enabled": true + }, + "missing_fields": [ + "app_name" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What app name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_web_app", + "missing_fields": [ + "app_name" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: app_name" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: app_name" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "region": "centralus", + "runtime": "dotnet8", + "environment": "prod", + "diagnostics_enabled": true + }, + "missing_fields": [ + "app_name" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: app_name" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got needs_clarification", + "missing expected parameter keys: app_name, team" + ] + }, + { + "id": "eval-0003", + "case_type": "success", + "input": "Create a nightly scheduler job named reporting-nightly-job for claims-sync in production.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_scheduler_job" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "prod", + "job_name": "reporting-nightly-job", + "schedule": "0 9 * * *", + "target": "claims-sync", + "team": "reporting", + "timezone": "America/Los_Angeles" + }, + "status": "routed", + "workflow": "create_scheduler_job" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.95, + "parameters": { + "job_name": "reporting-nightly-job", + "schedule": "0 2 * * *", + "target": "claims-sync", + "environment": "prod" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.95 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "create_scheduler_job", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.95, + "accepted": true, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "actual": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.95, + "parameters": { + "job_name": "reporting-nightly-job", + "schedule": "0 2 * * *", + "target": "claims-sync", + "environment": "prod" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.95 + } + ], + "failure_reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "missing expected parameter keys: team, timezone" + ] + }, + { + "id": "eval-0004", + "case_type": "success", + "input": "identity request: growth service account, env staging, name growth-svc", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_service_account" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "account_name": "growth-svc", + "description": "Service identity for workflow automation.", + "environment": "staging", + "team": "growth" + }, + "status": "routed", + "workflow": "create_service_account" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_service_account", + "confidence": 0.93, + "parameters": { + "environment": "staging", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [ + "account_name", + "team" + ], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": "What account name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_service_account", + "missing_fields": [ + "account_name", + "team" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: account_name, team" + ], + "clarifying_question": "What account name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_service_account", + "confidence": 0.93, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: account_name, team" + ], + "clarifying_question": "What account name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_service_account", + "confidence": 0.93, + "parameters": { + "environment": "staging", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [ + "account_name", + "team" + ], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Missing required fields: account_name, team" + ], + "clarifying_question": "What account name should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got needs_clarification", + "missing expected parameter keys: account_name, team" + ] + }, + { + "id": "eval-0005", + "case_type": "missing_fields", + "input": "daily reporting job, details later", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.74, + "workflow": "create_scheduler_job" + } + ], + "clarifying_question": "What job name should RouterCore use?", + "confidence": 0.74, + "failure_reasons": [ + "Missing required fields: job_name, schedule, environment" + ], + "missing_fields": [ + "job_name", + "schedule", + "environment" + ], + "parameters": { + "target": "reporting" + }, + "status": "needs_clarification", + "workflow": "create_scheduler_job" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "parameters": { + "schedule": "0 9 * * *" + }, + "missing_fields": [ + "job_name", + "target", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What job name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_scheduler_job", + "missing_fields": [ + "job_name", + "target", + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: job_name, target, environment" + ], + "clarifying_question": "What job name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: job_name, target, environment" + ], + "clarifying_question": "What job name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "parameters": { + "schedule": "0 9 * * *" + }, + "missing_fields": [ + "job_name", + "target", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: job_name, target, environment" + ], + "clarifying_question": "What job name should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0006", + "case_type": "success", + "input": "Create a cool storage bucket named platform-bucket in West US for development.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_storage_bucket" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "bucket_name": "platform-bucket", + "environment": "dev", + "region": "westus", + "storage_class": "cool", + "team": "platform" + }, + "status": "routed", + "workflow": "create_storage_bucket" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_storage_bucket", + "confidence": 0.93, + "parameters": { + "bucket_name": "platform-bucket", + "region": "westus", + "environment": "dev", + "storage_class": "cool" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "create_storage_bucket", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "routed", + "workflow": "create_storage_bucket", + "confidence": 0.93, + "accepted": true, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "actual": { + "status": "routed", + "workflow": "create_storage_bucket", + "confidence": 0.93, + "parameters": { + "bucket_name": "platform-bucket", + "region": "westus", + "environment": "dev", + "storage_class": "cool" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "missing expected parameter keys: team" + ] + }, + { + "id": "eval-0007", + "case_type": "success", + "input": "Grant reporting-user reader access to staging-bucket in development.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "dev", + "principal": "reporting-user", + "role": "reader", + "scope": "staging-bucket" + }, + "status": "routed", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "routed", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "principal": "reporting-user", + "role": "reader", + "scope": "staging-bucket", + "environment": "staging" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + }, + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "grant_iam_role", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.93, + "accepted": false, + "requires_confirmation": true, + "execution_allowed": false, + "reasons": [ + "Workflow is high risk and requires human confirmation." + ], + "clarifying_question": null + }, + "actual": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "principal": "reporting-user", + "role": "reader", + "scope": "staging-bucket", + "environment": "staging" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + }, + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Workflow is high risk and requires human confirmation." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got requires_confirmation" + ] + }, + { + "id": "eval-0008", + "case_type": "missing_fields", + "input": "bucket needed for reporting, no location picked yet", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.74, + "workflow": "create_storage_bucket" + } + ], + "clarifying_question": "What bucket name should RouterCore use?", + "confidence": 0.74, + "failure_reasons": [ + "Missing required fields: bucket_name, region, environment" + ], + "missing_fields": [ + "bucket_name", + "region", + "environment" + ], + "parameters": { + "team": "reporting" + }, + "status": "needs_clarification", + "workflow": "create_storage_bucket" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_storage_bucket", + "confidence": 0.69, + "parameters": {}, + "missing_fields": [ + "bucket_name", + "region", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_storage_bucket", + "missing_fields": [ + "bucket_name", + "region", + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: bucket_name, region, environment" + ], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_storage_bucket", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: bucket_name, region, environment" + ], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_storage_bucket", + "confidence": 0.69, + "parameters": {}, + "missing_fields": [ + "bucket_name", + "region", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: bucket_name, region, environment" + ], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0009", + "case_type": "success", + "input": "cron 0 9 * * * target model-refresh env dev timezone UTC", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_scheduler_job" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "dev", + "job_name": "finance-nightly-job", + "schedule": "0 9 * * *", + "target": "model-refresh", + "team": "finance", + "timezone": "UTC" + }, + "status": "routed", + "workflow": "create_scheduler_job" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "parameters": { + "job_name": "model-refresh-scheduled-job", + "schedule": "0 9 * * *", + "target": "model-refresh", + "environment": "dev", + "timezone": "UTC" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "Please confirm the selected workflow and parameters." + }, + "validation_result": { + "valid": true, + "workflow": "create_scheduler_job", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "requires_confirmation", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": true, + "execution_allowed": false, + "reasons": [ + "Router confidence is between 0.55 and 0.80." + ], + "clarifying_question": null + }, + "actual": { + "status": "requires_confirmation", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "parameters": { + "job_name": "model-refresh-scheduled-job", + "schedule": "0 9 * * *", + "target": "model-refresh", + "environment": "dev", + "timezone": "UTC" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Router confidence is between 0.55 and 0.80." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got requires_confirmation", + "missing expected parameter keys: team" + ] + }, + { + "id": "eval-0010", + "case_type": "success", + "input": "ticket: mlops staging api, runtime Python, region Central US, diagnostics on", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_web_app" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "app_name": "mlops-web-app", + "diagnostics_enabled": true, + "environment": "staging", + "region": "centralus", + "runtime": "python311", + "team": "mlops" + }, + "status": "routed", + "workflow": "create_web_app" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "region": "centralus", + "runtime": "python311", + "environment": "staging", + "diagnostics_enabled": true + }, + "missing_fields": [ + "app_name" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What app name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_web_app", + "missing_fields": [ + "app_name" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: app_name" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: app_name" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "region": "centralus", + "runtime": "python311", + "environment": "staging", + "diagnostics_enabled": true + }, + "missing_fields": [ + "app_name" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: app_name" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got needs_clarification", + "missing expected parameter keys: app_name, team" + ] + }, + { + "id": "eval-0011", + "case_type": "success", + "input": "infra: bucket for claims, env prod, region centralus, class archive", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_storage_bucket" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "bucket_name": "claims-bucket", + "environment": "prod", + "region": "centralus", + "storage_class": "archive", + "team": "claims" + }, + "status": "routed", + "workflow": "create_storage_bucket" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_storage_bucket", + "confidence": 0.69, + "parameters": { + "region": "centralus", + "environment": "prod", + "storage_class": "archive" + }, + "missing_fields": [ + "bucket_name" + ], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_storage_bucket", + "missing_fields": [ + "bucket_name" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: bucket_name" + ], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_storage_bucket", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: bucket_name" + ], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_storage_bucket", + "confidence": 0.69, + "parameters": { + "region": "centralus", + "environment": "prod", + "storage_class": "archive" + }, + "missing_fields": [ + "bucket_name" + ], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: bucket_name" + ], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got needs_clarification", + "missing expected parameter keys: bucket_name, team" + ] + }, + { + "id": "eval-0012", + "case_type": "success", + "input": "Create a archive storage bucket named finance-bucket in East US for staging.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_storage_bucket" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "bucket_name": "finance-bucket", + "environment": "staging", + "region": "eastus", + "storage_class": "archive", + "team": "finance" + }, + "status": "routed", + "workflow": "create_storage_bucket" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_storage_bucket", + "confidence": 0.93, + "parameters": { + "bucket_name": "finance-bucket", + "region": "eastus", + "environment": "staging", + "storage_class": "archive" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "create_storage_bucket", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "routed", + "workflow": "create_storage_bucket", + "confidence": 0.93, + "accepted": true, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "actual": { + "status": "routed", + "workflow": "create_storage_bucket", + "confidence": 0.93, + "parameters": { + "bucket_name": "finance-bucket", + "region": "eastus", + "environment": "staging", + "storage_class": "archive" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "missing expected parameter keys: team" + ] + }, + { + "id": "eval-0013", + "case_type": "success", + "input": "Give analyst the contributor role on reporting-project.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "dev", + "principal": "analyst", + "role": "contributor", + "scope": "reporting-project" + }, + "status": "routed", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.69, + "parameters": { + "role": "contributor", + "scope": "reporting-project." + }, + "missing_fields": [ + "principal" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What principal should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "grant_iam_role", + "missing_fields": [ + "principal" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: principal" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: principal" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.69, + "parameters": { + "role": "contributor", + "scope": "reporting-project." + }, + "missing_fields": [ + "principal" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: principal" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got needs_clarification", + "missing expected parameter keys: environment, principal" + ] + }, + { + "id": "eval-0014", + "case_type": "missing_fields", + "input": "daily reporting job, details later", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.74, + "workflow": "create_scheduler_job" + } + ], + "clarifying_question": "What job name should RouterCore use?", + "confidence": 0.74, + "failure_reasons": [ + "Missing required fields: job_name, schedule, environment" + ], + "missing_fields": [ + "job_name", + "schedule", + "environment" + ], + "parameters": { + "target": "reporting" + }, + "status": "needs_clarification", + "workflow": "create_scheduler_job" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "parameters": { + "schedule": "0 9 * * *" + }, + "missing_fields": [ + "job_name", + "target", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What job name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_scheduler_job", + "missing_fields": [ + "job_name", + "target", + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: job_name, target, environment" + ], + "clarifying_question": "What job name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: job_name, target, environment" + ], + "clarifying_question": "What job name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "parameters": { + "schedule": "0 9 * * *" + }, + "missing_fields": [ + "job_name", + "target", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: job_name, target, environment" + ], + "clarifying_question": "What job name should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0015", + "case_type": "success", + "input": "Create a staging Python web app for the platform team in Central US.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_web_app" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "app_name": "platform-web-app", + "diagnostics_enabled": true, + "environment": "staging", + "region": "centralus", + "runtime": "python311", + "team": "platform" + }, + "status": "routed", + "workflow": "create_web_app" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_web_app", + "confidence": 0.93, + "parameters": { + "app_name": "platform-web-app", + "region": "centralus", + "runtime": "python311", + "environment": "staging", + "team": "platform", + "diagnostics_enabled": false + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "create_web_app", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "routed", + "workflow": "create_web_app", + "confidence": 0.93, + "accepted": true, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "actual": { + "status": "routed", + "workflow": "create_web_app", + "confidence": 0.93, + "parameters": { + "app_name": "platform-web-app", + "region": "centralus", + "runtime": "python311", + "environment": "staging", + "team": "platform", + "diagnostics_enabled": false + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0016", + "case_type": "success", + "input": "Set up blob storage for the mlops team in US Central for development.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_storage_bucket" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "bucket_name": "mlops-bucket", + "environment": "dev", + "region": "us-central1", + "storage_class": "cool", + "team": "mlops" + }, + "status": "routed", + "workflow": "create_storage_bucket" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_storage_bucket", + "confidence": 0.93, + "parameters": { + "bucket_name": "mlops-bucket", + "region": "us-central1", + "environment": "dev", + "team": "mlops" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "create_storage_bucket", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "routed", + "workflow": "create_storage_bucket", + "confidence": 0.93, + "accepted": true, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "actual": { + "status": "routed", + "workflow": "create_storage_bucket", + "confidence": 0.93, + "parameters": { + "bucket_name": "mlops-bucket", + "region": "us-central1", + "environment": "dev", + "team": "mlops" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "missing expected parameter keys: storage_class" + ] + }, + { + "id": "eval-0017", + "case_type": "risky_rejected", + "input": "Give John the admin role across all production.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.88, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": null, + "confidence": 0.88, + "failure_reasons": [ + "Blocked owner/admin IAM grant." + ], + "missing_fields": [], + "parameters": { + "environment": "prod", + "principal": "john", + "role": "admin", + "scope": "production" + }, + "status": "rejected", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.69, + "parameters": { + "role": "admin", + "scope": "production", + "environment": "prod" + }, + "missing_fields": [ + "principal" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What principal should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "grant_iam_role", + "missing_fields": [ + "principal" + ], + "invalid_fields": {}, + "blocked_fields": { + "role": "admin" + }, + "failure_reasons": [ + "Missing required fields: principal", + "One or more fields contains a blocked value." + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "policy_decision": { + "status": "rejected", + "workflow": "grant_iam_role", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Blocked value for role: admin" + ], + "clarifying_question": null + }, + "actual": { + "status": "rejected", + "workflow": "grant_iam_role", + "confidence": 0.69, + "parameters": { + "role": "admin", + "scope": "production", + "environment": "prod" + }, + "missing_fields": [ + "principal" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Blocked value for role: admin" + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0018", + "case_type": "success", + "input": "Set up a daily job for billing-export for the security team in development.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_scheduler_job" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "dev", + "job_name": "security-nightly-job", + "schedule": "0 9 * * *", + "target": "billing-export", + "team": "security", + "timezone": "UTC" + }, + "status": "routed", + "workflow": "create_scheduler_job" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.93, + "parameters": { + "job_name": "billing-export-scheduled-job", + "schedule": "0 9 * * *", + "target": "billing-export", + "environment": "dev", + "team": "security" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "create_scheduler_job", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.93, + "accepted": true, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "actual": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.93, + "parameters": { + "job_name": "billing-export-scheduled-job", + "schedule": "0 9 * * *", + "target": "billing-export", + "environment": "dev", + "team": "security" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "missing expected parameter keys: timezone" + ] + }, + { + "id": "eval-0019", + "case_type": "success", + "input": "Need an automation identity for team finance in production.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_service_account" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "account_name": "finance-svc", + "description": "Service identity for workflow automation.", + "environment": "prod", + "team": "finance" + }, + "status": "routed", + "workflow": "create_service_account" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_service_account", + "confidence": 0.69, + "parameters": { + "account_name": "finance-svc", + "team": "finance", + "environment": "prod", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "Please confirm the selected workflow and parameters." + }, + "validation_result": { + "valid": true, + "workflow": "create_service_account", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "requires_confirmation", + "workflow": "create_service_account", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": true, + "execution_allowed": false, + "reasons": [ + "Router confidence is between 0.55 and 0.80." + ], + "clarifying_question": null + }, + "actual": { + "status": "requires_confirmation", + "workflow": "create_service_account", + "confidence": 0.69, + "parameters": { + "account_name": "finance-svc", + "team": "finance", + "environment": "prod", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Router confidence is between 0.55 and 0.80." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got requires_confirmation" + ] + }, + { + "id": "eval-0020", + "case_type": "ambiguous", + "input": "Set up reporting.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.38, + "workflow": "create_web_app" + }, + { + "confidence": 0.31, + "workflow": "create_service_account" + } + ], + "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", + "confidence": 0.34, + "failure_reasons": [ + "Request is ambiguous across multiple workflows." + ], + "missing_fields": [], + "parameters": {}, + "status": "needs_clarification", + "workflow": null + }, + "actual_router_output": { + "status": "needs_clarification", + "workflow": null, + "confidence": 0.25, + "parameters": {}, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.25 + }, + { + "workflow": "create_scheduler_job", + "confidence": 0.23 + } + ], + "failure_reasons": [ + "No workflow keywords matched with enough confidence." + ], + "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" + }, + "validation_result": { + "valid": false, + "workflow": null, + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Router did not select a workflow." + ], + "clarifying_question": "Which workflow should this request use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": null, + "confidence": 0.25, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "No authoritative workflow could be selected." + ], + "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" + }, + "actual": { + "status": "needs_clarification", + "workflow": null, + "confidence": 0.25, + "parameters": {}, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.25 + }, + { + "workflow": "create_scheduler_job", + "confidence": 0.23 + } + ], + "failure_reasons": [ + "No authoritative workflow could be selected." + ], + "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0021", + "case_type": "success", + "input": "cron 0 9 * * * target reporting env staging timezone America/Los_Angeles", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_scheduler_job" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "staging", + "job_name": "security-nightly-job", + "schedule": "0 9 * * *", + "target": "reporting", + "team": "security", + "timezone": "America/Los_Angeles" + }, + "status": "routed", + "workflow": "create_scheduler_job" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "parameters": { + "job_name": "reporting-scheduled-job", + "schedule": "0 9 * * *", + "target": "reporting", + "environment": "staging" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "Please confirm the selected workflow and parameters." + }, + "validation_result": { + "valid": true, + "workflow": "create_scheduler_job", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "requires_confirmation", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": true, + "execution_allowed": false, + "reasons": [ + "Router confidence is between 0.55 and 0.80." + ], + "clarifying_question": null + }, + "actual": { + "status": "requires_confirmation", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "parameters": { + "job_name": "reporting-scheduled-job", + "schedule": "0 9 * * *", + "target": "reporting", + "environment": "staging" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Router confidence is between 0.55 and 0.80." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got requires_confirmation", + "missing expected parameter keys: team, timezone" + ] + }, + { + "id": "eval-0022", + "case_type": "risky_rejected", + "input": "iam urgent: John needs admin on prod.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.88, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": null, + "confidence": 0.88, + "failure_reasons": [ + "Blocked owner/admin IAM grant." + ], + "missing_fields": [], + "parameters": { + "environment": "prod", + "principal": "john", + "role": "admin", + "scope": "production" + }, + "status": "rejected", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.69, + "parameters": { + "role": "admin", + "scope": "prod.", + "environment": "prod" + }, + "missing_fields": [ + "principal" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What principal should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "grant_iam_role", + "missing_fields": [ + "principal" + ], + "invalid_fields": {}, + "blocked_fields": { + "role": "admin" + }, + "failure_reasons": [ + "Missing required fields: principal", + "One or more fields contains a blocked value." + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "policy_decision": { + "status": "rejected", + "workflow": "grant_iam_role", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Blocked value for role: admin" + ], + "clarifying_question": null + }, + "actual": { + "status": "rejected", + "workflow": "grant_iam_role", + "confidence": 0.69, + "parameters": { + "role": "admin", + "scope": "prod.", + "environment": "prod" + }, + "missing_fields": [ + "principal" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Blocked value for role: admin" + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0023", + "case_type": "success", + "input": "Need a small .NET app named reporting-web-app in westus for reporting.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_web_app" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "app_name": "reporting-web-app", + "diagnostics_enabled": true, + "environment": "prod", + "region": "westus", + "runtime": "dotnet8", + "team": "reporting" + }, + "status": "routed", + "workflow": "create_web_app" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "app_name": "reporting-web-app", + "region": "westus", + "runtime": "dotnet8", + "diagnostics_enabled": false + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What environment should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_web_app", + "missing_fields": [ + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "app_name": "reporting-web-app", + "region": "westus", + "runtime": "dotnet8", + "diagnostics_enabled": false + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got needs_clarification", + "missing expected parameter keys: environment, team" + ] + }, + { + "id": "eval-0024", + "case_type": "risky_rejected", + "input": "Grant Jane admin access to production.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.88, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": null, + "confidence": 0.88, + "failure_reasons": [ + "Blocked owner/admin IAM grant." + ], + "missing_fields": [], + "parameters": { + "environment": "prod", + "principal": "jane", + "role": "admin", + "scope": "production" + }, + "status": "rejected", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "routed", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "principal": "jane", + "role": "admin", + "scope": "production.", + "environment": "prod" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": false, + "workflow": "grant_iam_role", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": { + "role": "admin" + }, + "failure_reasons": [ + "One or more fields contains a blocked value." + ], + "clarifying_question": null + }, + "policy_decision": { + "status": "rejected", + "workflow": "grant_iam_role", + "confidence": 0.93, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Blocked value for role: admin" + ], + "clarifying_question": null + }, + "actual": { + "status": "rejected", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "principal": "jane", + "role": "admin", + "scope": "production.", + "environment": "prod" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Blocked value for role: admin" + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0025", + "case_type": "success", + "input": "Need a small .NET app named reporting-web-app in centralus for reporting.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_web_app" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "app_name": "reporting-web-app", + "diagnostics_enabled": false, + "environment": "dev", + "region": "centralus", + "runtime": "dotnet8", + "team": "reporting" + }, + "status": "routed", + "workflow": "create_web_app" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "app_name": "reporting-web-app", + "region": "centralus", + "runtime": "dotnet8", + "diagnostics_enabled": false + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What environment should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_web_app", + "missing_fields": [ + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "app_name": "reporting-web-app", + "region": "centralus", + "runtime": "dotnet8", + "diagnostics_enabled": false + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got needs_clarification", + "missing expected parameter keys: environment, team" + ] + }, + { + "id": "eval-0026", + "case_type": "success", + "input": "Grant deploy-bot reader access to staging-bucket in development.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "dev", + "principal": "deploy-bot", + "role": "reader", + "scope": "staging-bucket" + }, + "status": "routed", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "routed", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "principal": "deploy-bot", + "role": "reader", + "scope": "staging-bucket", + "environment": "staging" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + }, + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "grant_iam_role", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.93, + "accepted": false, + "requires_confirmation": true, + "execution_allowed": false, + "reasons": [ + "Workflow is high risk and requires human confirmation." + ], + "clarifying_question": null + }, + "actual": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "principal": "deploy-bot", + "role": "reader", + "scope": "staging-bucket", + "environment": "staging" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + }, + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Workflow is high risk and requires human confirmation." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got requires_confirmation" + ] + }, + { + "id": "eval-0027", + "case_type": "success", + "input": "Need a small Python app named platform-web-app in westus for platform.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_web_app" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "app_name": "platform-web-app", + "diagnostics_enabled": false, + "environment": "staging", + "region": "westus", + "runtime": "python311", + "team": "platform" + }, + "status": "routed", + "workflow": "create_web_app" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_web_app", + "confidence": 0.93, + "parameters": { + "app_name": "platform-web-app", + "region": "westus", + "runtime": "python311", + "diagnostics_enabled": false + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": "What environment should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_web_app", + "missing_fields": [ + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.93, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.93, + "parameters": { + "app_name": "platform-web-app", + "region": "westus", + "runtime": "python311", + "diagnostics_enabled": false + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got needs_clarification", + "missing expected parameter keys: environment, team" + ] + }, + { + "id": "eval-0028", + "case_type": "success", + "input": "Grant jane viewer access to staging-bucket in development.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "dev", + "principal": "jane", + "role": "viewer", + "scope": "staging-bucket" + }, + "status": "routed", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "routed", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "principal": "jane", + "role": "viewer", + "scope": "staging-bucket", + "environment": "staging" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + }, + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "grant_iam_role", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.93, + "accepted": false, + "requires_confirmation": true, + "execution_allowed": false, + "reasons": [ + "Workflow is high risk and requires human confirmation." + ], + "clarifying_question": null + }, + "actual": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "principal": "jane", + "role": "viewer", + "scope": "staging-bucket", + "environment": "staging" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + }, + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Workflow is high risk and requires human confirmation." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got requires_confirmation" + ] + }, + { + "id": "eval-0029", + "case_type": "confirmation_required", + "input": "iam: ops-lead role contributor scope production", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.82, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": null, + "confidence": 0.82, + "failure_reasons": [ + "High-risk IAM change requires confirmation." + ], + "missing_fields": [], + "parameters": { + "environment": "prod", + "principal": "ops-lead", + "role": "contributor", + "scope": "production" + }, + "status": "requires_confirmation", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "routed", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "role": "contributor", + "scope": "production", + "environment": "prod" + }, + "missing_fields": [ + "principal" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": "What principal should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "grant_iam_role", + "missing_fields": [ + "principal" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: principal" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.93, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: principal" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "role": "contributor", + "scope": "production", + "environment": "prod" + }, + "missing_fields": [ + "principal" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Missing required fields: principal" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected requires_confirmation, got needs_clarification", + "missing expected parameter keys: principal" + ] + }, + { + "id": "eval-0030", + "case_type": "missing_fields", + "input": "iam access needed for deploy-bot, scope TBD", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.74, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": "What role should RouterCore use?", + "confidence": 0.74, + "failure_reasons": [ + "Missing required fields: role, scope" + ], + "missing_fields": [ + "role", + "scope" + ], + "parameters": { + "principal": "deploy-bot" + }, + "status": "needs_clarification", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "routed", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "scope": "deploy-bot" + }, + "missing_fields": [ + "principal", + "role" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": "What principal should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "grant_iam_role", + "missing_fields": [ + "principal", + "role" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: principal, role" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.93, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: principal, role" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "scope": "deploy-bot" + }, + "missing_fields": [ + "principal", + "role" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Missing required fields: principal, role" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0031", + "case_type": "success", + "input": "Grant deploy-bot viewer access to staging-bucket in production.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "prod", + "principal": "deploy-bot", + "role": "viewer", + "scope": "staging-bucket" + }, + "status": "routed", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "routed", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "principal": "deploy-bot", + "role": "viewer", + "scope": "staging-bucket", + "environment": "prod" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + }, + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "grant_iam_role", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.93, + "accepted": false, + "requires_confirmation": true, + "execution_allowed": false, + "reasons": [ + "Workflow is high risk and requires human confirmation.", + "IAM request targets production or broad-scope permissions." + ], + "clarifying_question": null + }, + "actual": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "principal": "deploy-bot", + "role": "viewer", + "scope": "staging-bucket", + "environment": "prod" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + }, + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Workflow is high risk and requires human confirmation.", + "IAM request targets production or broad-scope permissions." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got requires_confirmation" + ] + }, + { + "id": "eval-0032", + "case_type": "missing_fields", + "input": "permission request for jane", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.74, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": "What role should RouterCore use?", + "confidence": 0.74, + "failure_reasons": [ + "Missing required fields: role, scope" + ], + "missing_fields": [ + "role", + "scope" + ], + "parameters": { + "principal": "jane" + }, + "status": "needs_clarification", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.69, + "parameters": { + "scope": "request" + }, + "missing_fields": [ + "principal", + "role" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What principal should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "grant_iam_role", + "missing_fields": [ + "principal", + "role" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: principal, role" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: principal, role" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.69, + "parameters": { + "scope": "request" + }, + "missing_fields": [ + "principal", + "role" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: principal, role" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0033", + "case_type": "missing_fields", + "input": "bucket needed for security, no location picked yet", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.74, + "workflow": "create_storage_bucket" + } + ], + "clarifying_question": "What bucket name should RouterCore use?", + "confidence": 0.74, + "failure_reasons": [ + "Missing required fields: bucket_name, region, environment" + ], + "missing_fields": [ + "bucket_name", + "region", + "environment" + ], + "parameters": { + "team": "security" + }, + "status": "needs_clarification", + "workflow": "create_storage_bucket" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_storage_bucket", + "confidence": 0.69, + "parameters": {}, + "missing_fields": [ + "bucket_name", + "region", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_storage_bucket", + "missing_fields": [ + "bucket_name", + "region", + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: bucket_name, region, environment" + ], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_storage_bucket", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: bucket_name, region, environment" + ], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_storage_bucket", + "confidence": 0.69, + "parameters": {}, + "missing_fields": [ + "bucket_name", + "region", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: bucket_name, region, environment" + ], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0034", + "case_type": "confirmation_required", + "input": "iam: ops-lead role reader scope production", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.82, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": null, + "confidence": 0.82, + "failure_reasons": [ + "High-risk IAM change requires confirmation." + ], + "missing_fields": [], + "parameters": { + "environment": "prod", + "principal": "ops-lead", + "role": "reader", + "scope": "production" + }, + "status": "requires_confirmation", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "routed", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "role": "reader", + "scope": "production", + "environment": "prod" + }, + "missing_fields": [ + "principal" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": "What principal should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "grant_iam_role", + "missing_fields": [ + "principal" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: principal" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.93, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: principal" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "role": "reader", + "scope": "production", + "environment": "prod" + }, + "missing_fields": [ + "principal" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Missing required fields: principal" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected requires_confirmation, got needs_clarification", + "missing expected parameter keys: principal" + ] + }, + { + "id": "eval-0035", + "case_type": "missing_fields", + "input": "identity needed for team reporting", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.74, + "workflow": "create_service_account" + } + ], + "clarifying_question": "What account name should RouterCore use?", + "confidence": 0.74, + "failure_reasons": [ + "Missing required fields: account_name, environment" + ], + "missing_fields": [ + "account_name", + "environment" + ], + "parameters": { + "team": "reporting" + }, + "status": "needs_clarification", + "workflow": "create_service_account" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_service_account", + "confidence": 0.69, + "parameters": { + "account_name": "reporting-svc", + "team": "reporting", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What environment should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_service_account", + "missing_fields": [ + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_service_account", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_service_account", + "confidence": 0.69, + "parameters": { + "account_name": "reporting-svc", + "team": "reporting", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0036", + "case_type": "success", + "input": "Create a nightly scheduler job named growth-nightly-job for model-refresh in production.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_scheduler_job" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "prod", + "job_name": "growth-nightly-job", + "schedule": "0 9 * * *", + "target": "model-refresh", + "team": "growth", + "timezone": "America/Los_Angeles" + }, + "status": "routed", + "workflow": "create_scheduler_job" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.95, + "parameters": { + "job_name": "growth-nightly-job", + "schedule": "0 2 * * *", + "target": "model-refresh", + "environment": "prod" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.95 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "create_scheduler_job", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.95, + "accepted": true, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "actual": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.95, + "parameters": { + "job_name": "growth-nightly-job", + "schedule": "0 2 * * *", + "target": "model-refresh", + "environment": "prod" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.95 + } + ], + "failure_reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "missing expected parameter keys: team, timezone" + ] + }, + { + "id": "eval-0037", + "case_type": "success", + "input": "Create a service account named security-svc for the security team in production.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_service_account" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "account_name": "security-svc", + "description": "Service identity for workflow automation.", + "environment": "prod", + "team": "security" + }, + "status": "routed", + "workflow": "create_service_account" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_service_account", + "confidence": 0.69, + "parameters": { + "account_name": "security-svc", + "team": "security", + "environment": "prod", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "Please confirm the selected workflow and parameters." + }, + "validation_result": { + "valid": true, + "workflow": "create_service_account", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "requires_confirmation", + "workflow": "create_service_account", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": true, + "execution_allowed": false, + "reasons": [ + "Router confidence is between 0.55 and 0.80." + ], + "clarifying_question": null + }, + "actual": { + "status": "requires_confirmation", + "workflow": "create_service_account", + "confidence": 0.69, + "parameters": { + "account_name": "security-svc", + "team": "security", + "environment": "prod", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Router confidence is between 0.55 and 0.80." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got requires_confirmation" + ] + }, + { + "id": "eval-0038", + "case_type": "success", + "input": "Create a production Node.js web app for the growth team in US Central.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_web_app" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "app_name": "growth-web-app", + "diagnostics_enabled": true, + "environment": "prod", + "region": "us-central1", + "runtime": "nodejs20", + "team": "growth" + }, + "status": "routed", + "workflow": "create_web_app" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_web_app", + "confidence": 0.93, + "parameters": { + "app_name": "growth-web-app", + "region": "us-central1", + "runtime": "nodejs20", + "environment": "prod", + "team": "growth", + "diagnostics_enabled": false + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "create_web_app", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "routed", + "workflow": "create_web_app", + "confidence": 0.93, + "accepted": true, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "actual": { + "status": "routed", + "workflow": "create_web_app", + "confidence": 0.93, + "parameters": { + "app_name": "growth-web-app", + "region": "us-central1", + "runtime": "nodejs20", + "environment": "prod", + "team": "growth", + "diagnostics_enabled": false + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0039", + "case_type": "success", + "input": "Create a standard storage bucket named platform-bucket in US Central for development.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_storage_bucket" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "bucket_name": "platform-bucket", + "environment": "dev", + "region": "us-central1", + "storage_class": "standard", + "team": "platform" + }, + "status": "routed", + "workflow": "create_storage_bucket" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_storage_bucket", + "confidence": 0.93, + "parameters": { + "bucket_name": "platform-bucket", + "region": "us-central1", + "environment": "dev", + "storage_class": "standard" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "create_storage_bucket", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "routed", + "workflow": "create_storage_bucket", + "confidence": 0.93, + "accepted": true, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "actual": { + "status": "routed", + "workflow": "create_storage_bucket", + "confidence": 0.93, + "parameters": { + "bucket_name": "platform-bucket", + "region": "us-central1", + "environment": "dev", + "storage_class": "standard" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "missing expected parameter keys: team" + ] + }, + { + "id": "eval-0040", + "case_type": "ambiguous", + "input": "Prep access and automation for the new project.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.38, + "workflow": "create_service_account" + }, + { + "confidence": 0.31, + "workflow": "create_scheduler_job" + } + ], + "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", + "confidence": 0.34, + "failure_reasons": [ + "Request is ambiguous across multiple workflows." + ], + "missing_fields": [], + "parameters": {}, + "status": "needs_clarification", + "workflow": null + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.69, + "parameters": { + "scope": "for" + }, + "missing_fields": [ + "principal", + "role" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What principal should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "grant_iam_role", + "missing_fields": [ + "principal", + "role" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: principal, role" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: principal, role" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.69, + "parameters": { + "scope": "for" + }, + "missing_fields": [ + "principal", + "role" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: principal, role" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0041", + "case_type": "success", + "input": "Create a nightly scheduler job named claims-nightly-job for billing-export in staging.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_scheduler_job" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "staging", + "job_name": "claims-nightly-job", + "schedule": "0 9 * * *", + "target": "billing-export", + "team": "claims", + "timezone": "America/Los_Angeles" + }, + "status": "routed", + "workflow": "create_scheduler_job" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.95, + "parameters": { + "job_name": "claims-nightly-job", + "schedule": "0 2 * * *", + "target": "billing-export", + "environment": "staging" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.95 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "create_scheduler_job", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.95, + "accepted": true, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "actual": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.95, + "parameters": { + "job_name": "claims-nightly-job", + "schedule": "0 2 * * *", + "target": "billing-export", + "environment": "staging" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.95 + } + ], + "failure_reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "missing expected parameter keys: team, timezone" + ] + }, + { + "id": "eval-0042", + "case_type": "confirmation_required", + "input": "iam: jane role contributor scope all reporting resources", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.82, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": null, + "confidence": 0.82, + "failure_reasons": [ + "High-risk IAM change requires confirmation." + ], + "missing_fields": [], + "parameters": { + "environment": "staging", + "principal": "jane", + "role": "contributor", + "scope": "all reporting resources" + }, + "status": "requires_confirmation", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "routed", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "role": "contributor" + }, + "missing_fields": [ + "principal", + "scope" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": "What principal should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "grant_iam_role", + "missing_fields": [ + "principal", + "scope" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: principal, scope" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.93, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: principal, scope" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "role": "contributor" + }, + "missing_fields": [ + "principal", + "scope" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Missing required fields: principal, scope" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected requires_confirmation, got needs_clarification", + "missing expected parameter keys: environment, principal, scope" + ] + }, + { + "id": "eval-0043", + "case_type": "ambiguous", + "input": "Set up reporting.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.38, + "workflow": "create_service_account" + }, + { + "confidence": 0.31, + "workflow": "create_storage_bucket" + } + ], + "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", + "confidence": 0.34, + "failure_reasons": [ + "Request is ambiguous across multiple workflows." + ], + "missing_fields": [], + "parameters": {}, + "status": "needs_clarification", + "workflow": null + }, + "actual_router_output": { + "status": "needs_clarification", + "workflow": null, + "confidence": 0.25, + "parameters": {}, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.25 + }, + { + "workflow": "create_scheduler_job", + "confidence": 0.23 + } + ], + "failure_reasons": [ + "No workflow keywords matched with enough confidence." + ], + "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" + }, + "validation_result": { + "valid": false, + "workflow": null, + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Router did not select a workflow." + ], + "clarifying_question": "Which workflow should this request use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": null, + "confidence": 0.25, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "No authoritative workflow could be selected." + ], + "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" + }, + "actual": { + "status": "needs_clarification", + "workflow": null, + "confidence": 0.25, + "parameters": {}, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.25 + }, + { + "workflow": "create_scheduler_job", + "confidence": 0.23 + } + ], + "failure_reasons": [ + "No authoritative workflow could be selected." + ], + "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0044", + "case_type": "success", + "input": "Need a small Node.js app named growth-web-app in westus for growth.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_web_app" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "app_name": "growth-web-app", + "diagnostics_enabled": false, + "environment": "prod", + "region": "westus", + "runtime": "nodejs20", + "team": "growth" + }, + "status": "routed", + "workflow": "create_web_app" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "app_name": "growth-web-app", + "region": "westus", + "runtime": "nodejs20", + "diagnostics_enabled": false + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What environment should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_web_app", + "missing_fields": [ + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "app_name": "growth-web-app", + "region": "westus", + "runtime": "nodejs20", + "diagnostics_enabled": false + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got needs_clarification", + "missing expected parameter keys: environment, team" + ] + }, + { + "id": "eval-0045", + "case_type": "missing_fields", + "input": "need api for reporting, details TBD", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.74, + "workflow": "create_web_app" + } + ], + "clarifying_question": "What app name should RouterCore use?", + "confidence": 0.74, + "failure_reasons": [ + "Missing required fields: app_name, region, environment" + ], + "missing_fields": [ + "app_name", + "region", + "environment" + ], + "parameters": { + "runtime": "python311", + "team": "reporting" + }, + "status": "needs_clarification", + "workflow": "create_web_app" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "diagnostics_enabled": false + }, + "missing_fields": [ + "app_name", + "region", + "runtime", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What app name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_web_app", + "missing_fields": [ + "app_name", + "region", + "runtime", + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: app_name, region, runtime, environment" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: app_name, region, runtime, environment" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "diagnostics_enabled": false + }, + "missing_fields": [ + "app_name", + "region", + "runtime", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: app_name, region, runtime, environment" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0046", + "case_type": "success", + "input": "Grant jane reader access to reporting-project in staging.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "staging", + "principal": "jane", + "role": "reader", + "scope": "reporting-project" + }, + "status": "routed", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "routed", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "principal": "jane", + "role": "reader", + "scope": "reporting-project", + "environment": "staging" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "grant_iam_role", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.93, + "accepted": false, + "requires_confirmation": true, + "execution_allowed": false, + "reasons": [ + "Workflow is high risk and requires human confirmation." + ], + "clarifying_question": null + }, + "actual": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "principal": "jane", + "role": "reader", + "scope": "reporting-project", + "environment": "staging" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Workflow is high risk and requires human confirmation." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got requires_confirmation" + ] + }, + { + "id": "eval-0047", + "case_type": "success", + "input": "Give analyst the viewer role on claims-app.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "prod", + "principal": "analyst", + "role": "viewer", + "scope": "claims-app" + }, + "status": "routed", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "diagnostics_enabled": false + }, + "missing_fields": [ + "app_name", + "region", + "runtime", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + }, + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What app name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_web_app", + "missing_fields": [ + "app_name", + "region", + "runtime", + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: app_name, region, runtime, environment" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: app_name, region, runtime, environment" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "diagnostics_enabled": false + }, + "missing_fields": [ + "app_name", + "region", + "runtime", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + }, + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: app_name, region, runtime, environment" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got needs_clarification", + "workflow mismatch: expected grant_iam_role, got create_web_app", + "missing expected parameter keys: environment, principal, role, scope" + ] + }, + { + "id": "eval-0048", + "case_type": "success", + "input": "Create a nightly scheduler job named reporting-nightly-job for claims-sync in production.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_scheduler_job" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "prod", + "job_name": "reporting-nightly-job", + "schedule": "0 9 * * *", + "target": "claims-sync", + "team": "reporting", + "timezone": "America/New_York" + }, + "status": "routed", + "workflow": "create_scheduler_job" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.95, + "parameters": { + "job_name": "reporting-nightly-job", + "schedule": "0 2 * * *", + "target": "claims-sync", + "environment": "prod" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.95 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "create_scheduler_job", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.95, + "accepted": true, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "actual": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.95, + "parameters": { + "job_name": "reporting-nightly-job", + "schedule": "0 2 * * *", + "target": "claims-sync", + "environment": "prod" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.95 + } + ], + "failure_reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "missing expected parameter keys: team, timezone" + ] + }, + { + "id": "eval-0049", + "case_type": "success", + "input": "ticket: finance staging api, runtime Python, region West US, diagnostics on", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_web_app" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "app_name": "finance-web-app", + "diagnostics_enabled": true, + "environment": "staging", + "region": "westus", + "runtime": "python311", + "team": "finance" + }, + "status": "routed", + "workflow": "create_web_app" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "region": "westus", + "runtime": "python311", + "environment": "staging", + "diagnostics_enabled": true + }, + "missing_fields": [ + "app_name" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What app name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_web_app", + "missing_fields": [ + "app_name" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: app_name" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: app_name" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "region": "westus", + "runtime": "python311", + "environment": "staging", + "diagnostics_enabled": true + }, + "missing_fields": [ + "app_name" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: app_name" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got needs_clarification", + "missing expected parameter keys: app_name, team" + ] + }, + { + "id": "eval-0050", + "case_type": "ambiguous", + "input": "Prep access and automation for the new project.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.38, + "workflow": "create_scheduler_job" + }, + { + "confidence": 0.31, + "workflow": "create_service_account" + } + ], + "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", + "confidence": 0.34, + "failure_reasons": [ + "Request is ambiguous across multiple workflows." + ], + "missing_fields": [], + "parameters": {}, + "status": "needs_clarification", + "workflow": null + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.69, + "parameters": { + "scope": "for" + }, + "missing_fields": [ + "principal", + "role" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What principal should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "grant_iam_role", + "missing_fields": [ + "principal", + "role" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: principal, role" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: principal, role" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.69, + "parameters": { + "scope": "for" + }, + "missing_fields": [ + "principal", + "role" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: principal, role" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0051", + "case_type": "success", + "input": "cron 0 9 * * * target model-refresh env staging timezone America/New_York", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_scheduler_job" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "staging", + "job_name": "finance-nightly-job", + "schedule": "0 9 * * *", + "target": "model-refresh", + "team": "finance", + "timezone": "America/New_York" + }, + "status": "routed", + "workflow": "create_scheduler_job" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "parameters": { + "job_name": "model-refresh-scheduled-job", + "schedule": "0 9 * * *", + "target": "model-refresh", + "environment": "staging" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "Please confirm the selected workflow and parameters." + }, + "validation_result": { + "valid": true, + "workflow": "create_scheduler_job", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "requires_confirmation", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": true, + "execution_allowed": false, + "reasons": [ + "Router confidence is between 0.55 and 0.80." + ], + "clarifying_question": null + }, + "actual": { + "status": "requires_confirmation", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "parameters": { + "job_name": "model-refresh-scheduled-job", + "schedule": "0 9 * * *", + "target": "model-refresh", + "environment": "staging" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Router confidence is between 0.55 and 0.80." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got requires_confirmation", + "missing expected parameter keys: team, timezone" + ] + }, + { + "id": "eval-0052", + "case_type": "success", + "input": "Give john the editor role on dev-subsystem.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "staging", + "principal": "john", + "role": "editor", + "scope": "dev-subsystem" + }, + "status": "routed", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.69, + "parameters": { + "role": "editor", + "scope": "dev-subsystem.", + "environment": "dev" + }, + "missing_fields": [ + "principal" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What principal should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "grant_iam_role", + "missing_fields": [ + "principal" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: principal" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: principal" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.69, + "parameters": { + "role": "editor", + "scope": "dev-subsystem.", + "environment": "dev" + }, + "missing_fields": [ + "principal" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: principal" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got needs_clarification", + "missing expected parameter keys: principal" + ] + }, + { + "id": "eval-0053", + "case_type": "missing_fields", + "input": "Set up a reporting schedule.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.74, + "workflow": "create_scheduler_job" + } + ], + "clarifying_question": "What job name should RouterCore use?", + "confidence": 0.74, + "failure_reasons": [ + "Missing required fields: job_name, schedule, environment" + ], + "missing_fields": [ + "job_name", + "schedule", + "environment" + ], + "parameters": { + "target": "reporting" + }, + "status": "needs_clarification", + "workflow": "create_scheduler_job" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "parameters": {}, + "missing_fields": [ + "job_name", + "schedule", + "target", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What job name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_scheduler_job", + "missing_fields": [ + "job_name", + "schedule", + "target", + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: job_name, schedule, target, environment" + ], + "clarifying_question": "What job name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: job_name, schedule, target, environment" + ], + "clarifying_question": "What job name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_scheduler_job", + "confidence": 0.69, + "parameters": {}, + "missing_fields": [ + "job_name", + "schedule", + "target", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: job_name, schedule, target, environment" + ], + "clarifying_question": "What job name should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0054", + "case_type": "success", + "input": "Create a nightly scheduler job named growth-nightly-job for reporting in staging.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_scheduler_job" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "staging", + "job_name": "growth-nightly-job", + "schedule": "0 2 * * *", + "target": "reporting", + "team": "growth", + "timezone": "America/New_York" + }, + "status": "routed", + "workflow": "create_scheduler_job" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.95, + "parameters": { + "job_name": "growth-nightly-job", + "schedule": "0 2 * * *", + "target": "reporting", + "environment": "staging" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.95 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "create_scheduler_job", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.95, + "accepted": true, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "actual": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.95, + "parameters": { + "job_name": "growth-nightly-job", + "schedule": "0 2 * * *", + "target": "reporting", + "environment": "staging" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.95 + } + ], + "failure_reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "missing expected parameter keys: team, timezone" + ] + }, + { + "id": "eval-0055", + "case_type": "ambiguous", + "input": "Make the nightly thing happen.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.38, + "workflow": "create_service_account" + }, + { + "confidence": 0.31, + "workflow": "create_storage_bucket" + } + ], + "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", + "confidence": 0.34, + "failure_reasons": [ + "Request is ambiguous across multiple workflows." + ], + "missing_fields": [], + "parameters": {}, + "status": "needs_clarification", + "workflow": null + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_web_app", + "confidence": 0.58, + "parameters": { + "diagnostics_enabled": false + }, + "missing_fields": [ + "app_name", + "region", + "runtime", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.58 + }, + { + "workflow": "create_scheduler_job", + "confidence": 0.58 + } + ], + "failure_reasons": [], + "clarifying_question": "What app name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_web_app", + "missing_fields": [ + "app_name", + "region", + "runtime", + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: app_name, region, runtime, environment" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.58, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: app_name, region, runtime, environment" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.58, + "parameters": { + "diagnostics_enabled": false + }, + "missing_fields": [ + "app_name", + "region", + "runtime", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.58 + }, + { + "workflow": "create_scheduler_job", + "confidence": 0.58 + } + ], + "failure_reasons": [ + "Missing required fields: app_name, region, runtime, environment" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0056", + "case_type": "success", + "input": "identity request: growth service account, env prod, name growth-svc", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_service_account" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "account_name": "growth-svc", + "description": "Service identity for workflow automation.", + "environment": "prod", + "team": "growth" + }, + "status": "routed", + "workflow": "create_service_account" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_service_account", + "confidence": 0.93, + "parameters": { + "environment": "prod", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [ + "account_name", + "team" + ], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": "What account name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_service_account", + "missing_fields": [ + "account_name", + "team" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: account_name, team" + ], + "clarifying_question": "What account name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_service_account", + "confidence": 0.93, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: account_name, team" + ], + "clarifying_question": "What account name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_service_account", + "confidence": 0.93, + "parameters": { + "environment": "prod", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [ + "account_name", + "team" + ], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Missing required fields: account_name, team" + ], + "clarifying_question": "What account name should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got needs_clarification", + "missing expected parameter keys: account_name, team" + ] + }, + { + "id": "eval-0057", + "case_type": "success", + "input": "Create a production .NET web app for the reporting team in West US.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_web_app" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "app_name": "reporting-web-app", + "diagnostics_enabled": false, + "environment": "prod", + "region": "westus", + "runtime": "dotnet8", + "team": "reporting" + }, + "status": "routed", + "workflow": "create_web_app" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_web_app", + "confidence": 0.93, + "parameters": { + "app_name": "reporting-web-app", + "region": "westus", + "runtime": "dotnet8", + "environment": "prod", + "team": "reporting", + "diagnostics_enabled": false + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "create_web_app", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "routed", + "workflow": "create_web_app", + "confidence": 0.93, + "accepted": true, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "actual": { + "status": "routed", + "workflow": "create_web_app", + "confidence": 0.93, + "parameters": { + "app_name": "reporting-web-app", + "region": "westus", + "runtime": "dotnet8", + "environment": "prod", + "team": "reporting", + "diagnostics_enabled": false + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0058", + "case_type": "missing_fields", + "input": "service account request, owner team security", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.74, + "workflow": "create_service_account" + } + ], + "clarifying_question": "What account name should RouterCore use?", + "confidence": 0.74, + "failure_reasons": [ + "Missing required fields: account_name, environment" + ], + "missing_fields": [ + "account_name", + "environment" + ], + "parameters": { + "team": "security" + }, + "status": "needs_clarification", + "workflow": "create_service_account" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_service_account", + "confidence": 0.69, + "parameters": { + "account_name": "security-svc", + "team": "security", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What environment should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_service_account", + "missing_fields": [ + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_service_account", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_service_account", + "confidence": 0.69, + "parameters": { + "account_name": "security-svc", + "team": "security", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0059", + "case_type": "success", + "input": "Create a production .NET web app for the security team in West US.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_web_app" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "app_name": "security-web-app", + "diagnostics_enabled": true, + "environment": "prod", + "region": "westus", + "runtime": "dotnet8", + "team": "security" + }, + "status": "routed", + "workflow": "create_web_app" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_web_app", + "confidence": 0.93, + "parameters": { + "app_name": "security-web-app", + "region": "westus", + "runtime": "dotnet8", + "environment": "prod", + "team": "security", + "diagnostics_enabled": false + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "create_web_app", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "routed", + "workflow": "create_web_app", + "confidence": 0.93, + "accepted": true, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "actual": { + "status": "routed", + "workflow": "create_web_app", + "confidence": 0.93, + "parameters": { + "app_name": "security-web-app", + "region": "westus", + "runtime": "dotnet8", + "environment": "prod", + "team": "security", + "diagnostics_enabled": false + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0060", + "case_type": "success", + "input": "Set up a daily job for reporting for the growth team in development.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_scheduler_job" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "dev", + "job_name": "growth-nightly-job", + "schedule": "0 2 * * *", + "target": "reporting", + "team": "growth", + "timezone": "UTC" + }, + "status": "routed", + "workflow": "create_scheduler_job" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.93, + "parameters": { + "job_name": "reporting-scheduled-job", + "schedule": "0 9 * * *", + "target": "reporting", + "environment": "dev", + "team": "growth" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "create_scheduler_job", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.93, + "accepted": true, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "actual": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.93, + "parameters": { + "job_name": "reporting-scheduled-job", + "schedule": "0 9 * * *", + "target": "reporting", + "environment": "dev", + "team": "growth" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "missing expected parameter keys: timezone" + ] + }, + { + "id": "eval-0061", + "case_type": "success", + "input": "Set up a daily job for reporting for the reporting team in staging.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_scheduler_job" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "staging", + "job_name": "reporting-nightly-job", + "schedule": "0 9 * * *", + "target": "reporting", + "team": "reporting", + "timezone": "America/New_York" + }, + "status": "routed", + "workflow": "create_scheduler_job" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.93, + "parameters": { + "job_name": "reporting-scheduled-job", + "schedule": "0 9 * * *", + "target": "reporting", + "environment": "staging", + "team": "reporting" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "create_scheduler_job", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.93, + "accepted": true, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "actual": { + "status": "routed", + "workflow": "create_scheduler_job", + "confidence": 0.93, + "parameters": { + "job_name": "reporting-scheduled-job", + "schedule": "0 9 * * *", + "target": "reporting", + "environment": "staging", + "team": "reporting" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_scheduler_job", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Route accepted for execution preview only." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "missing expected parameter keys: timezone" + ] + }, + { + "id": "eval-0062", + "case_type": "success", + "input": "Need an automation identity for team growth in development.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_service_account" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "account_name": "growth-svc", + "description": "Service identity for workflow automation.", + "environment": "dev", + "team": "growth" + }, + "status": "routed", + "workflow": "create_service_account" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_service_account", + "confidence": 0.69, + "parameters": { + "account_name": "growth-svc", + "team": "growth", + "environment": "dev", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "Please confirm the selected workflow and parameters." + }, + "validation_result": { + "valid": true, + "workflow": "create_service_account", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "requires_confirmation", + "workflow": "create_service_account", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": true, + "execution_allowed": false, + "reasons": [ + "Router confidence is between 0.55 and 0.80." + ], + "clarifying_question": null + }, + "actual": { + "status": "requires_confirmation", + "workflow": "create_service_account", + "confidence": 0.69, + "parameters": { + "account_name": "growth-svc", + "team": "growth", + "environment": "dev", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Router confidence is between 0.55 and 0.80." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got requires_confirmation" + ] + }, + { + "id": "eval-0063", + "case_type": "ambiguous", + "input": "Prep access and automation for the new project.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.38, + "workflow": "grant_iam_role" + }, + { + "confidence": 0.31, + "workflow": "create_service_account" + } + ], + "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", + "confidence": 0.34, + "failure_reasons": [ + "Request is ambiguous across multiple workflows." + ], + "missing_fields": [], + "parameters": {}, + "status": "needs_clarification", + "workflow": null + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.69, + "parameters": { + "scope": "for" + }, + "missing_fields": [ + "principal", + "role" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What principal should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "grant_iam_role", + "missing_fields": [ + "principal", + "role" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: principal, role" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: principal, role" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "grant_iam_role", + "confidence": 0.69, + "parameters": { + "scope": "for" + }, + "missing_fields": [ + "principal", + "role" + ], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: principal, role" + ], + "clarifying_question": "What principal should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0064", + "case_type": "missing_fields", + "input": "Create a Python web app for the growth team.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.74, + "workflow": "create_web_app" + } + ], + "clarifying_question": "What app name should RouterCore use?", + "confidence": 0.74, + "failure_reasons": [ + "Missing required fields: app_name, region, environment" + ], + "missing_fields": [ + "app_name", + "region", + "environment" + ], + "parameters": { + "runtime": "python311", + "team": "growth" + }, + "status": "needs_clarification", + "workflow": "create_web_app" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_web_app", + "confidence": 0.93, + "parameters": { + "app_name": "growth-web-app", + "runtime": "python311", + "team": "growth", + "diagnostics_enabled": false + }, + "missing_fields": [ + "region", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": "What region should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_web_app", + "missing_fields": [ + "region", + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: region, environment" + ], + "clarifying_question": "What region should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.93, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: region, environment" + ], + "clarifying_question": "What region should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.93, + "parameters": { + "app_name": "growth-web-app", + "runtime": "python311", + "team": "growth", + "diagnostics_enabled": false + }, + "missing_fields": [ + "region", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Missing required fields: region, environment" + ], + "clarifying_question": "What region should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0065", + "case_type": "missing_fields", + "input": "bucket needed for mlops, no location picked yet", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.74, + "workflow": "create_storage_bucket" + } + ], + "clarifying_question": "What bucket name should RouterCore use?", + "confidence": 0.74, + "failure_reasons": [ + "Missing required fields: bucket_name, region, environment" + ], + "missing_fields": [ + "bucket_name", + "region", + "environment" + ], + "parameters": { + "team": "mlops" + }, + "status": "needs_clarification", + "workflow": "create_storage_bucket" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_storage_bucket", + "confidence": 0.69, + "parameters": {}, + "missing_fields": [ + "bucket_name", + "region", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_storage_bucket", + "missing_fields": [ + "bucket_name", + "region", + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: bucket_name, region, environment" + ], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_storage_bucket", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: bucket_name, region, environment" + ], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_storage_bucket", + "confidence": 0.69, + "parameters": {}, + "missing_fields": [ + "bucket_name", + "region", + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: bucket_name, region, environment" + ], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0066", + "case_type": "success", + "input": "Give jane the viewer role on staging-bucket.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "prod", + "principal": "jane", + "role": "viewer", + "scope": "staging-bucket" + }, + "status": "routed", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_storage_bucket", + "confidence": 0.69, + "parameters": { + "environment": "staging" + }, + "missing_fields": [ + "bucket_name", + "region" + ], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + }, + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_storage_bucket", + "missing_fields": [ + "bucket_name", + "region" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: bucket_name, region" + ], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_storage_bucket", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: bucket_name, region" + ], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_storage_bucket", + "confidence": 0.69, + "parameters": { + "environment": "staging" + }, + "missing_fields": [ + "bucket_name", + "region" + ], + "candidate_workflows": [ + { + "workflow": "create_storage_bucket", + "confidence": 0.69 + }, + { + "workflow": "grant_iam_role", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: bucket_name, region" + ], + "clarifying_question": "What bucket name should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got needs_clarification", + "workflow mismatch: expected grant_iam_role, got create_storage_bucket", + "missing expected parameter keys: principal, role, scope" + ] + }, + { + "id": "eval-0067", + "case_type": "missing_fields", + "input": "Create a service account for the mlops team.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.74, + "workflow": "create_service_account" + } + ], + "clarifying_question": "What account name should RouterCore use?", + "confidence": 0.74, + "failure_reasons": [ + "Missing required fields: account_name, environment" + ], + "missing_fields": [ + "account_name", + "environment" + ], + "parameters": { + "team": "mlops" + }, + "status": "needs_clarification", + "workflow": "create_service_account" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_service_account", + "confidence": 0.69, + "parameters": { + "account_name": "mlops-svc", + "team": "mlops", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What environment should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_service_account", + "missing_fields": [ + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_service_account", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_service_account", + "confidence": 0.69, + "parameters": { + "account_name": "mlops-svc", + "team": "mlops", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0068", + "case_type": "missing_fields", + "input": "service account request, owner team security", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.74, + "workflow": "create_service_account" + } + ], + "clarifying_question": "What account name should RouterCore use?", + "confidence": 0.74, + "failure_reasons": [ + "Missing required fields: account_name, environment" + ], + "missing_fields": [ + "account_name", + "environment" + ], + "parameters": { + "team": "security" + }, + "status": "needs_clarification", + "workflow": "create_service_account" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_service_account", + "confidence": 0.69, + "parameters": { + "account_name": "security-svc", + "team": "security", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What environment should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_service_account", + "missing_fields": [ + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_service_account", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_service_account", + "confidence": 0.69, + "parameters": { + "account_name": "security-svc", + "team": "security", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0069", + "case_type": "success", + "input": "ticket: reporting development api, runtime Node.js, region West US, diagnostics on", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_web_app" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "app_name": "reporting-web-app", + "diagnostics_enabled": false, + "environment": "dev", + "region": "westus", + "runtime": "nodejs20", + "team": "reporting" + }, + "status": "routed", + "workflow": "create_web_app" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "region": "westus", + "runtime": "nodejs20", + "environment": "dev", + "diagnostics_enabled": true + }, + "missing_fields": [ + "app_name" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What app name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_web_app", + "missing_fields": [ + "app_name" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: app_name" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: app_name" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_web_app", + "confidence": 0.69, + "parameters": { + "region": "westus", + "runtime": "nodejs20", + "environment": "dev", + "diagnostics_enabled": true + }, + "missing_fields": [ + "app_name" + ], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: app_name" + ], + "clarifying_question": "What app name should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got needs_clarification", + "missing expected parameter keys: app_name, team" + ] + }, + { + "id": "eval-0070", + "case_type": "ambiguous", + "input": "Set up reporting.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.38, + "workflow": "create_storage_bucket" + }, + { + "confidence": 0.31, + "workflow": "create_scheduler_job" + } + ], + "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", + "confidence": 0.34, + "failure_reasons": [ + "Request is ambiguous across multiple workflows." + ], + "missing_fields": [], + "parameters": {}, + "status": "needs_clarification", + "workflow": null + }, + "actual_router_output": { + "status": "needs_clarification", + "workflow": null, + "confidence": 0.25, + "parameters": {}, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.25 + }, + { + "workflow": "create_scheduler_job", + "confidence": 0.23 + } + ], + "failure_reasons": [ + "No workflow keywords matched with enough confidence." + ], + "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" + }, + "validation_result": { + "valid": false, + "workflow": null, + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Router did not select a workflow." + ], + "clarifying_question": "Which workflow should this request use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": null, + "confidence": 0.25, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "No authoritative workflow could be selected." + ], + "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" + }, + "actual": { + "status": "needs_clarification", + "workflow": null, + "confidence": 0.25, + "parameters": {}, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.25 + }, + { + "workflow": "create_scheduler_job", + "confidence": 0.23 + } + ], + "failure_reasons": [ + "No authoritative workflow could be selected." + ], + "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0071", + "case_type": "risky_rejected", + "input": "Remove all security from production.", + "expected": { + "candidate_workflows": [], + "clarifying_question": null, + "confidence": 0.88, + "failure_reasons": [ + "Unsafe destructive production request." + ], + "missing_fields": [], + "parameters": {}, + "status": "rejected", + "workflow": null + }, + "actual_router_output": { + "status": "needs_clarification", + "workflow": null, + "confidence": 0.25, + "parameters": {}, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.25 + }, + { + "workflow": "create_scheduler_job", + "confidence": 0.23 + } + ], + "failure_reasons": [ + "No workflow keywords matched with enough confidence." + ], + "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" + }, + "validation_result": { + "valid": false, + "workflow": null, + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Router did not select a workflow." + ], + "clarifying_question": "Which workflow should this request use?" + }, + "policy_decision": { + "status": "rejected", + "workflow": null, + "confidence": 0.25, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Unsafe phrase matched: remove all security" + ], + "clarifying_question": null + }, + "actual": { + "status": "rejected", + "workflow": null, + "confidence": 0.25, + "parameters": {}, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "create_web_app", + "confidence": 0.25 + }, + { + "workflow": "create_scheduler_job", + "confidence": 0.23 + } + ], + "failure_reasons": [ + "Unsafe phrase matched: remove all security" + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0072", + "case_type": "confirmation_required", + "input": "Grant deploy-bot contributor access to production.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.82, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": null, + "confidence": 0.82, + "failure_reasons": [ + "High-risk IAM change requires confirmation." + ], + "missing_fields": [], + "parameters": { + "environment": "prod", + "principal": "deploy-bot", + "role": "contributor", + "scope": "production" + }, + "status": "requires_confirmation", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "routed", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "principal": "deploy-bot", + "role": "contributor", + "scope": "production.", + "environment": "prod" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "grant_iam_role", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.93, + "accepted": false, + "requires_confirmation": true, + "execution_allowed": false, + "reasons": [ + "Workflow is high risk and requires human confirmation.", + "IAM request targets production or broad-scope permissions." + ], + "clarifying_question": null + }, + "actual": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "principal": "deploy-bot", + "role": "contributor", + "scope": "production.", + "environment": "prod" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Workflow is high risk and requires human confirmation.", + "IAM request targets production or broad-scope permissions." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0073", + "case_type": "missing_fields", + "input": "service account request, owner team security", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.74, + "workflow": "create_service_account" + } + ], + "clarifying_question": "What account name should RouterCore use?", + "confidence": 0.74, + "failure_reasons": [ + "Missing required fields: account_name, environment" + ], + "missing_fields": [ + "account_name", + "environment" + ], + "parameters": { + "team": "security" + }, + "status": "needs_clarification", + "workflow": "create_service_account" + }, + "actual_router_output": { + "status": "requires_confirmation", + "workflow": "create_service_account", + "confidence": 0.69, + "parameters": { + "account_name": "security-svc", + "team": "security", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.69 + } + ], + "failure_reasons": [], + "clarifying_question": "What environment should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_service_account", + "missing_fields": [ + "environment" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_service_account", + "confidence": 0.69, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_service_account", + "confidence": 0.69, + "parameters": { + "account_name": "security-svc", + "team": "security", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [ + "environment" + ], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.69 + } + ], + "failure_reasons": [ + "Missing required fields: environment" + ], + "clarifying_question": "What environment should RouterCore use?" + }, + "pass_fail_notes": [ + "pass" + ] + }, + { + "id": "eval-0074", + "case_type": "success", + "input": "identity request: finance service account, env prod, name finance-svc", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "create_service_account" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "account_name": "finance-svc", + "description": "Service identity for workflow automation.", + "environment": "prod", + "team": "finance" + }, + "status": "routed", + "workflow": "create_service_account" + }, + "actual_router_output": { + "status": "routed", + "workflow": "create_service_account", + "confidence": 0.93, + "parameters": { + "environment": "prod", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [ + "account_name", + "team" + ], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": "What account name should RouterCore use?" + }, + "validation_result": { + "valid": false, + "workflow": "create_service_account", + "missing_fields": [ + "account_name", + "team" + ], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [ + "Missing required fields: account_name, team" + ], + "clarifying_question": "What account name should RouterCore use?" + }, + "policy_decision": { + "status": "needs_clarification", + "workflow": "create_service_account", + "confidence": 0.93, + "accepted": false, + "requires_confirmation": false, + "execution_allowed": false, + "reasons": [ + "Missing required fields: account_name, team" + ], + "clarifying_question": "What account name should RouterCore use?" + }, + "actual": { + "status": "needs_clarification", + "workflow": "create_service_account", + "confidence": 0.93, + "parameters": { + "environment": "prod", + "description": "Generated from RouterCore request preview." + }, + "missing_fields": [ + "account_name", + "team" + ], + "candidate_workflows": [ + { + "workflow": "create_service_account", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Missing required fields: account_name, team" + ], + "clarifying_question": "What account name should RouterCore use?" + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got needs_clarification", + "missing expected parameter keys: account_name, team" + ] + }, + { + "id": "eval-0075", + "case_type": "success", + "input": "Grant deploy-bot editor access to reporting-project in development.", + "expected": { + "candidate_workflows": [ + { + "confidence": 0.92, + "workflow": "grant_iam_role" + } + ], + "clarifying_question": null, + "confidence": 0.92, + "failure_reasons": [], + "missing_fields": [], + "parameters": { + "environment": "dev", + "principal": "deploy-bot", + "role": "editor", + "scope": "reporting-project" + }, + "status": "routed", + "workflow": "grant_iam_role" + }, + "actual_router_output": { + "status": "routed", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "principal": "deploy-bot", + "role": "editor", + "scope": "reporting-project", + "environment": "dev" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + } + ], + "failure_reasons": [], + "clarifying_question": null + }, + "validation_result": { + "valid": true, + "workflow": "grant_iam_role", + "missing_fields": [], + "invalid_fields": {}, + "blocked_fields": {}, + "failure_reasons": [], + "clarifying_question": null + }, + "policy_decision": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.93, + "accepted": false, + "requires_confirmation": true, + "execution_allowed": false, + "reasons": [ + "Workflow is high risk and requires human confirmation." + ], + "clarifying_question": null + }, + "actual": { + "status": "requires_confirmation", + "workflow": "grant_iam_role", + "confidence": 0.93, + "parameters": { + "principal": "deploy-bot", + "role": "editor", + "scope": "reporting-project", + "environment": "dev" + }, + "missing_fields": [], + "candidate_workflows": [ + { + "workflow": "grant_iam_role", + "confidence": 0.93 + } + ], + "failure_reasons": [ + "Workflow is high risk and requires human confirmation." + ], + "clarifying_question": null + }, + "pass_fail_notes": [ + "status mismatch: expected routed, got requires_confirmation" + ] + } + ] +} \ No newline at end of file