{ "summary_metrics": { "json_validity_rate": 1.0, "workflow_accuracy": 0.9701492537313433, "status_accuracy": 0.5733333333333334, "required_field_presence_accuracy": 0.2857142857142857, "unsafe_rejection_accuracy": 1.0, "false_route_rate": 0.0 }, "per_example_results": [ { "id": "eval-0001", "case_type": "success", "input": "cron 0 2 * * * target claims-sync env dev timezone America/New_York", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_scheduler_job" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "dev", "job_name": "finance-nightly-job", "schedule": "0 2 * * *", "target": "claims-sync", "team": "finance", "timezone": "America/New_York" }, "status": "routed", "workflow": "create_scheduler_job" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_scheduler_job", "confidence": 0.69, "parameters": { "job_name": "claims-sync-scheduled-job", "schedule": "0 2 * * *", "target": "claims-sync", "environment": "dev" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "Please confirm the selected workflow and parameters." }, "validation_result": { "valid": true, "workflow": "create_scheduler_job", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "requires_confirmation", "workflow": "create_scheduler_job", "confidence": 0.69, "accepted": false, "requires_confirmation": true, "execution_allowed": false, "reasons": [ "Router confidence is between 0.55 and 0.80." ], "clarifying_question": null }, "actual": { "status": "requires_confirmation", "workflow": "create_scheduler_job", "confidence": 0.69, "parameters": { "job_name": "claims-sync-scheduled-job", "schedule": "0 2 * * *", "target": "claims-sync", "environment": "dev" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.69 } ], "failure_reasons": [ "Router confidence is between 0.55 and 0.80." ], "clarifying_question": null }, "pass_fail_notes": [ "status mismatch: expected routed, got requires_confirmation", "missing expected parameter keys: team, timezone" ] }, { "id": "eval-0002", "case_type": "success", "input": "ticket: mlops production api, runtime .NET, region Central US, diagnostics on", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_web_app" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "app_name": "mlops-web-app", "diagnostics_enabled": true, "environment": "prod", "region": "centralus", "runtime": "dotnet8", "team": "mlops" }, "status": "routed", "workflow": "create_web_app" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "region": "centralus", "runtime": "dotnet8", "environment": "prod", "diagnostics_enabled": true }, "missing_fields": [ "app_name" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What app name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_web_app", "missing_fields": [ "app_name" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: app_name" ], "clarifying_question": "What app name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: app_name" ], "clarifying_question": "What app name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "region": "centralus", "runtime": "dotnet8", "environment": "prod", "diagnostics_enabled": true }, "missing_fields": [ "app_name" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: app_name" ], "clarifying_question": "What app name should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected routed, got needs_clarification", "missing expected parameter keys: app_name, team" ] }, { "id": "eval-0003", "case_type": "success", "input": "Create a nightly scheduler job named reporting-nightly-job for claims-sync in production.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_scheduler_job" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "prod", "job_name": "reporting-nightly-job", "schedule": "0 9 * * *", "target": "claims-sync", "team": "reporting", "timezone": "America/Los_Angeles" }, "status": "routed", "workflow": "create_scheduler_job" }, "actual_router_output": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.95, "parameters": { "job_name": "reporting-nightly-job", "schedule": "0 2 * * *", "target": "claims-sync", "environment": "prod" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.95 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "create_scheduler_job", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.95, "accepted": true, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "actual": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.95, "parameters": { "job_name": "reporting-nightly-job", "schedule": "0 2 * * *", "target": "claims-sync", "environment": "prod" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.95 } ], "failure_reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "pass_fail_notes": [ "missing expected parameter keys: team, timezone" ] }, { "id": "eval-0004", "case_type": "success", "input": "identity request: growth service account, env staging, name growth-svc", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_service_account" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "account_name": "growth-svc", "description": "Service identity for workflow automation.", "environment": "staging", "team": "growth" }, "status": "routed", "workflow": "create_service_account" }, "actual_router_output": { "status": "routed", "workflow": "create_service_account", "confidence": 0.93, "parameters": { "environment": "staging", "description": "Generated from RouterCore request preview." }, "missing_fields": [ "account_name", "team" ], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": "What account name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_service_account", "missing_fields": [ "account_name", "team" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: account_name, team" ], "clarifying_question": "What account name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_service_account", "confidence": 0.93, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: account_name, team" ], "clarifying_question": "What account name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_service_account", "confidence": 0.93, "parameters": { "environment": "staging", "description": "Generated from RouterCore request preview." }, "missing_fields": [ "account_name", "team" ], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.93 } ], "failure_reasons": [ "Missing required fields: account_name, team" ], "clarifying_question": "What account name should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected routed, got needs_clarification", "missing expected parameter keys: account_name, team" ] }, { "id": "eval-0005", "case_type": "missing_fields", "input": "daily reporting job, details later", "expected": { "candidate_workflows": [ { "confidence": 0.74, "workflow": "create_scheduler_job" } ], "clarifying_question": "What job name should RouterCore use?", "confidence": 0.74, "failure_reasons": [ "Missing required fields: job_name, schedule, environment" ], "missing_fields": [ "job_name", "schedule", "environment" ], "parameters": { "target": "reporting" }, "status": "needs_clarification", "workflow": "create_scheduler_job" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_scheduler_job", "confidence": 0.69, "parameters": { "schedule": "0 9 * * *" }, "missing_fields": [ "job_name", "target", "environment" ], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What job name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_scheduler_job", "missing_fields": [ "job_name", "target", "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: job_name, target, environment" ], "clarifying_question": "What job name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_scheduler_job", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: job_name, target, environment" ], "clarifying_question": "What job name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_scheduler_job", "confidence": 0.69, "parameters": { "schedule": "0 9 * * *" }, "missing_fields": [ "job_name", "target", "environment" ], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: job_name, target, environment" ], "clarifying_question": "What job name should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0006", "case_type": "success", "input": "Create a cool storage bucket named platform-bucket in West US for development.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_storage_bucket" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "bucket_name": "platform-bucket", "environment": "dev", "region": "westus", "storage_class": "cool", "team": "platform" }, "status": "routed", "workflow": "create_storage_bucket" }, "actual_router_output": { "status": "routed", "workflow": "create_storage_bucket", "confidence": 0.93, "parameters": { "bucket_name": "platform-bucket", "region": "westus", "environment": "dev", "storage_class": "cool" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "create_storage_bucket", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "routed", "workflow": "create_storage_bucket", "confidence": 0.93, "accepted": true, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "actual": { "status": "routed", "workflow": "create_storage_bucket", "confidence": 0.93, "parameters": { "bucket_name": "platform-bucket", "region": "westus", "environment": "dev", "storage_class": "cool" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.93 } ], "failure_reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "pass_fail_notes": [ "missing expected parameter keys: team" ] }, { "id": "eval-0007", "case_type": "success", "input": "Grant reporting-user reader access to staging-bucket in development.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "grant_iam_role" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "dev", "principal": "reporting-user", "role": "reader", "scope": "staging-bucket" }, "status": "routed", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "routed", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "principal": "reporting-user", "role": "reader", "scope": "staging-bucket", "environment": "staging" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 }, { "workflow": "create_storage_bucket", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "grant_iam_role", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.93, "accepted": false, "requires_confirmation": true, "execution_allowed": false, "reasons": [ "Workflow is high risk and requires human confirmation." ], "clarifying_question": null }, "actual": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "principal": "reporting-user", "role": "reader", "scope": "staging-bucket", "environment": "staging" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 }, { "workflow": "create_storage_bucket", "confidence": 0.69 } ], "failure_reasons": [ "Workflow is high risk and requires human confirmation." ], "clarifying_question": null }, "pass_fail_notes": [ "status mismatch: expected routed, got requires_confirmation" ] }, { "id": "eval-0008", "case_type": "missing_fields", "input": "bucket needed for reporting, no location picked yet", "expected": { "candidate_workflows": [ { "confidence": 0.74, "workflow": "create_storage_bucket" } ], "clarifying_question": "What bucket name should RouterCore use?", "confidence": 0.74, "failure_reasons": [ "Missing required fields: bucket_name, region, environment" ], "missing_fields": [ "bucket_name", "region", "environment" ], "parameters": { "team": "reporting" }, "status": "needs_clarification", "workflow": "create_storage_bucket" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_storage_bucket", "confidence": 0.69, "parameters": {}, "missing_fields": [ "bucket_name", "region", "environment" ], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What bucket name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_storage_bucket", "missing_fields": [ "bucket_name", "region", "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: bucket_name, region, environment" ], "clarifying_question": "What bucket name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_storage_bucket", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: bucket_name, region, environment" ], "clarifying_question": "What bucket name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_storage_bucket", "confidence": 0.69, "parameters": {}, "missing_fields": [ "bucket_name", "region", "environment" ], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: bucket_name, region, environment" ], "clarifying_question": "What bucket name should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0009", "case_type": "success", "input": "cron 0 9 * * * target model-refresh env dev timezone UTC", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_scheduler_job" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "dev", "job_name": "finance-nightly-job", "schedule": "0 9 * * *", "target": "model-refresh", "team": "finance", "timezone": "UTC" }, "status": "routed", "workflow": "create_scheduler_job" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_scheduler_job", "confidence": 0.69, "parameters": { "job_name": "model-refresh-scheduled-job", "schedule": "0 9 * * *", "target": "model-refresh", "environment": "dev", "timezone": "UTC" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "Please confirm the selected workflow and parameters." }, "validation_result": { "valid": true, "workflow": "create_scheduler_job", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "requires_confirmation", "workflow": "create_scheduler_job", "confidence": 0.69, "accepted": false, "requires_confirmation": true, "execution_allowed": false, "reasons": [ "Router confidence is between 0.55 and 0.80." ], "clarifying_question": null }, "actual": { "status": "requires_confirmation", "workflow": "create_scheduler_job", "confidence": 0.69, "parameters": { "job_name": "model-refresh-scheduled-job", "schedule": "0 9 * * *", "target": "model-refresh", "environment": "dev", "timezone": "UTC" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.69 } ], "failure_reasons": [ "Router confidence is between 0.55 and 0.80." ], "clarifying_question": null }, "pass_fail_notes": [ "status mismatch: expected routed, got requires_confirmation", "missing expected parameter keys: team" ] }, { "id": "eval-0010", "case_type": "success", "input": "ticket: mlops staging api, runtime Python, region Central US, diagnostics on", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_web_app" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "app_name": "mlops-web-app", "diagnostics_enabled": true, "environment": "staging", "region": "centralus", "runtime": "python311", "team": "mlops" }, "status": "routed", "workflow": "create_web_app" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "region": "centralus", "runtime": "python311", "environment": "staging", "diagnostics_enabled": true }, "missing_fields": [ "app_name" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What app name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_web_app", "missing_fields": [ "app_name" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: app_name" ], "clarifying_question": "What app name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: app_name" ], "clarifying_question": "What app name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "region": "centralus", "runtime": "python311", "environment": "staging", "diagnostics_enabled": true }, "missing_fields": [ "app_name" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: app_name" ], "clarifying_question": "What app name should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected routed, got needs_clarification", "missing expected parameter keys: app_name, team" ] }, { "id": "eval-0011", "case_type": "success", "input": "infra: bucket for claims, env prod, region centralus, class archive", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_storage_bucket" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "bucket_name": "claims-bucket", "environment": "prod", "region": "centralus", "storage_class": "archive", "team": "claims" }, "status": "routed", "workflow": "create_storage_bucket" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_storage_bucket", "confidence": 0.69, "parameters": { "region": "centralus", "environment": "prod", "storage_class": "archive" }, "missing_fields": [ "bucket_name" ], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What bucket name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_storage_bucket", "missing_fields": [ "bucket_name" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: bucket_name" ], "clarifying_question": "What bucket name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_storage_bucket", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: bucket_name" ], "clarifying_question": "What bucket name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_storage_bucket", "confidence": 0.69, "parameters": { "region": "centralus", "environment": "prod", "storage_class": "archive" }, "missing_fields": [ "bucket_name" ], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: bucket_name" ], "clarifying_question": "What bucket name should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected routed, got needs_clarification", "missing expected parameter keys: bucket_name, team" ] }, { "id": "eval-0012", "case_type": "success", "input": "Create a archive storage bucket named finance-bucket in East US for staging.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_storage_bucket" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "bucket_name": "finance-bucket", "environment": "staging", "region": "eastus", "storage_class": "archive", "team": "finance" }, "status": "routed", "workflow": "create_storage_bucket" }, "actual_router_output": { "status": "routed", "workflow": "create_storage_bucket", "confidence": 0.93, "parameters": { "bucket_name": "finance-bucket", "region": "eastus", "environment": "staging", "storage_class": "archive" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "create_storage_bucket", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "routed", "workflow": "create_storage_bucket", "confidence": 0.93, "accepted": true, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "actual": { "status": "routed", "workflow": "create_storage_bucket", "confidence": 0.93, "parameters": { "bucket_name": "finance-bucket", "region": "eastus", "environment": "staging", "storage_class": "archive" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.93 } ], "failure_reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "pass_fail_notes": [ "missing expected parameter keys: team" ] }, { "id": "eval-0013", "case_type": "success", "input": "Give analyst the contributor role on reporting-project.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "grant_iam_role" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "dev", "principal": "analyst", "role": "contributor", "scope": "reporting-project" }, "status": "routed", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.69, "parameters": { "role": "contributor", "scope": "reporting-project." }, "missing_fields": [ "principal" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What principal should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "grant_iam_role", "missing_fields": [ "principal" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: principal" ], "clarifying_question": "What principal should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: principal" ], "clarifying_question": "What principal should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.69, "parameters": { "role": "contributor", "scope": "reporting-project." }, "missing_fields": [ "principal" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: principal" ], "clarifying_question": "What principal should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected routed, got needs_clarification", "missing expected parameter keys: environment, principal" ] }, { "id": "eval-0014", "case_type": "missing_fields", "input": "daily reporting job, details later", "expected": { "candidate_workflows": [ { "confidence": 0.74, "workflow": "create_scheduler_job" } ], "clarifying_question": "What job name should RouterCore use?", "confidence": 0.74, "failure_reasons": [ "Missing required fields: job_name, schedule, environment" ], "missing_fields": [ "job_name", "schedule", "environment" ], "parameters": { "target": "reporting" }, "status": "needs_clarification", "workflow": "create_scheduler_job" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_scheduler_job", "confidence": 0.69, "parameters": { "schedule": "0 9 * * *" }, "missing_fields": [ "job_name", "target", "environment" ], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What job name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_scheduler_job", "missing_fields": [ "job_name", "target", "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: job_name, target, environment" ], "clarifying_question": "What job name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_scheduler_job", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: job_name, target, environment" ], "clarifying_question": "What job name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_scheduler_job", "confidence": 0.69, "parameters": { "schedule": "0 9 * * *" }, "missing_fields": [ "job_name", "target", "environment" ], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: job_name, target, environment" ], "clarifying_question": "What job name should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0015", "case_type": "success", "input": "Create a staging Python web app for the platform team in Central US.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_web_app" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "app_name": "platform-web-app", "diagnostics_enabled": true, "environment": "staging", "region": "centralus", "runtime": "python311", "team": "platform" }, "status": "routed", "workflow": "create_web_app" }, "actual_router_output": { "status": "routed", "workflow": "create_web_app", "confidence": 0.93, "parameters": { "app_name": "platform-web-app", "region": "centralus", "runtime": "python311", "environment": "staging", "team": "platform", "diagnostics_enabled": false }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "create_web_app", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "routed", "workflow": "create_web_app", "confidence": 0.93, "accepted": true, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "actual": { "status": "routed", "workflow": "create_web_app", "confidence": 0.93, "parameters": { "app_name": "platform-web-app", "region": "centralus", "runtime": "python311", "environment": "staging", "team": "platform", "diagnostics_enabled": false }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.93 } ], "failure_reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0016", "case_type": "success", "input": "Set up blob storage for the mlops team in US Central for development.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_storage_bucket" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "bucket_name": "mlops-bucket", "environment": "dev", "region": "us-central1", "storage_class": "cool", "team": "mlops" }, "status": "routed", "workflow": "create_storage_bucket" }, "actual_router_output": { "status": "routed", "workflow": "create_storage_bucket", "confidence": 0.93, "parameters": { "bucket_name": "mlops-bucket", "region": "us-central1", "environment": "dev", "team": "mlops" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "create_storage_bucket", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "routed", "workflow": "create_storage_bucket", "confidence": 0.93, "accepted": true, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "actual": { "status": "routed", "workflow": "create_storage_bucket", "confidence": 0.93, "parameters": { "bucket_name": "mlops-bucket", "region": "us-central1", "environment": "dev", "team": "mlops" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.93 } ], "failure_reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "pass_fail_notes": [ "missing expected parameter keys: storage_class" ] }, { "id": "eval-0017", "case_type": "risky_rejected", "input": "Give John the admin role across all production.", "expected": { "candidate_workflows": [ { "confidence": 0.88, "workflow": "grant_iam_role" } ], "clarifying_question": null, "confidence": 0.88, "failure_reasons": [ "Blocked owner/admin IAM grant." ], "missing_fields": [], "parameters": { "environment": "prod", "principal": "john", "role": "admin", "scope": "production" }, "status": "rejected", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.69, "parameters": { "role": "admin", "scope": "production", "environment": "prod" }, "missing_fields": [ "principal" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What principal should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "grant_iam_role", "missing_fields": [ "principal" ], "invalid_fields": {}, "blocked_fields": { "role": "admin" }, "failure_reasons": [ "Missing required fields: principal", "One or more fields contains a blocked value." ], "clarifying_question": "What principal should RouterCore use?" }, "policy_decision": { "status": "rejected", "workflow": "grant_iam_role", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Blocked value for role: admin" ], "clarifying_question": null }, "actual": { "status": "rejected", "workflow": "grant_iam_role", "confidence": 0.69, "parameters": { "role": "admin", "scope": "production", "environment": "prod" }, "missing_fields": [ "principal" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [ "Blocked value for role: admin" ], "clarifying_question": null }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0018", "case_type": "success", "input": "Set up a daily job for billing-export for the security team in development.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_scheduler_job" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "dev", "job_name": "security-nightly-job", "schedule": "0 9 * * *", "target": "billing-export", "team": "security", "timezone": "UTC" }, "status": "routed", "workflow": "create_scheduler_job" }, "actual_router_output": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.93, "parameters": { "job_name": "billing-export-scheduled-job", "schedule": "0 9 * * *", "target": "billing-export", "environment": "dev", "team": "security" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "create_scheduler_job", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.93, "accepted": true, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "actual": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.93, "parameters": { "job_name": "billing-export-scheduled-job", "schedule": "0 9 * * *", "target": "billing-export", "environment": "dev", "team": "security" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.93 } ], "failure_reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "pass_fail_notes": [ "missing expected parameter keys: timezone" ] }, { "id": "eval-0019", "case_type": "success", "input": "Need an automation identity for team finance in production.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_service_account" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "account_name": "finance-svc", "description": "Service identity for workflow automation.", "environment": "prod", "team": "finance" }, "status": "routed", "workflow": "create_service_account" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_service_account", "confidence": 0.69, "parameters": { "account_name": "finance-svc", "team": "finance", "environment": "prod", "description": "Generated from RouterCore request preview." }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "Please confirm the selected workflow and parameters." }, "validation_result": { "valid": true, "workflow": "create_service_account", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "requires_confirmation", "workflow": "create_service_account", "confidence": 0.69, "accepted": false, "requires_confirmation": true, "execution_allowed": false, "reasons": [ "Router confidence is between 0.55 and 0.80." ], "clarifying_question": null }, "actual": { "status": "requires_confirmation", "workflow": "create_service_account", "confidence": 0.69, "parameters": { "account_name": "finance-svc", "team": "finance", "environment": "prod", "description": "Generated from RouterCore request preview." }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.69 } ], "failure_reasons": [ "Router confidence is between 0.55 and 0.80." ], "clarifying_question": null }, "pass_fail_notes": [ "status mismatch: expected routed, got requires_confirmation" ] }, { "id": "eval-0020", "case_type": "ambiguous", "input": "Set up reporting.", "expected": { "candidate_workflows": [ { "confidence": 0.38, "workflow": "create_web_app" }, { "confidence": 0.31, "workflow": "create_service_account" } ], "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", "confidence": 0.34, "failure_reasons": [ "Request is ambiguous across multiple workflows." ], "missing_fields": [], "parameters": {}, "status": "needs_clarification", "workflow": null }, "actual_router_output": { "status": "needs_clarification", "workflow": null, "confidence": 0.25, "parameters": {}, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.25 }, { "workflow": "create_scheduler_job", "confidence": 0.23 } ], "failure_reasons": [ "No workflow keywords matched with enough confidence." ], "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" }, "validation_result": { "valid": false, "workflow": null, "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Router did not select a workflow." ], "clarifying_question": "Which workflow should this request use?" }, "policy_decision": { "status": "needs_clarification", "workflow": null, "confidence": 0.25, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "No authoritative workflow could be selected." ], "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" }, "actual": { "status": "needs_clarification", "workflow": null, "confidence": 0.25, "parameters": {}, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.25 }, { "workflow": "create_scheduler_job", "confidence": 0.23 } ], "failure_reasons": [ "No authoritative workflow could be selected." ], "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0021", "case_type": "success", "input": "cron 0 9 * * * target reporting env staging timezone America/Los_Angeles", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_scheduler_job" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "staging", "job_name": "security-nightly-job", "schedule": "0 9 * * *", "target": "reporting", "team": "security", "timezone": "America/Los_Angeles" }, "status": "routed", "workflow": "create_scheduler_job" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_scheduler_job", "confidence": 0.69, "parameters": { "job_name": "reporting-scheduled-job", "schedule": "0 9 * * *", "target": "reporting", "environment": "staging" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "Please confirm the selected workflow and parameters." }, "validation_result": { "valid": true, "workflow": "create_scheduler_job", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "requires_confirmation", "workflow": "create_scheduler_job", "confidence": 0.69, "accepted": false, "requires_confirmation": true, "execution_allowed": false, "reasons": [ "Router confidence is between 0.55 and 0.80." ], "clarifying_question": null }, "actual": { "status": "requires_confirmation", "workflow": "create_scheduler_job", "confidence": 0.69, "parameters": { "job_name": "reporting-scheduled-job", "schedule": "0 9 * * *", "target": "reporting", "environment": "staging" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.69 } ], "failure_reasons": [ "Router confidence is between 0.55 and 0.80." ], "clarifying_question": null }, "pass_fail_notes": [ "status mismatch: expected routed, got requires_confirmation", "missing expected parameter keys: team, timezone" ] }, { "id": "eval-0022", "case_type": "risky_rejected", "input": "iam urgent: John needs admin on prod.", "expected": { "candidate_workflows": [ { "confidence": 0.88, "workflow": "grant_iam_role" } ], "clarifying_question": null, "confidence": 0.88, "failure_reasons": [ "Blocked owner/admin IAM grant." ], "missing_fields": [], "parameters": { "environment": "prod", "principal": "john", "role": "admin", "scope": "production" }, "status": "rejected", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.69, "parameters": { "role": "admin", "scope": "prod.", "environment": "prod" }, "missing_fields": [ "principal" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What principal should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "grant_iam_role", "missing_fields": [ "principal" ], "invalid_fields": {}, "blocked_fields": { "role": "admin" }, "failure_reasons": [ "Missing required fields: principal", "One or more fields contains a blocked value." ], "clarifying_question": "What principal should RouterCore use?" }, "policy_decision": { "status": "rejected", "workflow": "grant_iam_role", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Blocked value for role: admin" ], "clarifying_question": null }, "actual": { "status": "rejected", "workflow": "grant_iam_role", "confidence": 0.69, "parameters": { "role": "admin", "scope": "prod.", "environment": "prod" }, "missing_fields": [ "principal" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [ "Blocked value for role: admin" ], "clarifying_question": null }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0023", "case_type": "success", "input": "Need a small .NET app named reporting-web-app in westus for reporting.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_web_app" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "app_name": "reporting-web-app", "diagnostics_enabled": true, "environment": "prod", "region": "westus", "runtime": "dotnet8", "team": "reporting" }, "status": "routed", "workflow": "create_web_app" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "app_name": "reporting-web-app", "region": "westus", "runtime": "dotnet8", "diagnostics_enabled": false }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What environment should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_web_app", "missing_fields": [ "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "app_name": "reporting-web-app", "region": "westus", "runtime": "dotnet8", "diagnostics_enabled": false }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected routed, got needs_clarification", "missing expected parameter keys: environment, team" ] }, { "id": "eval-0024", "case_type": "risky_rejected", "input": "Grant Jane admin access to production.", "expected": { "candidate_workflows": [ { "confidence": 0.88, "workflow": "grant_iam_role" } ], "clarifying_question": null, "confidence": 0.88, "failure_reasons": [ "Blocked owner/admin IAM grant." ], "missing_fields": [], "parameters": { "environment": "prod", "principal": "jane", "role": "admin", "scope": "production" }, "status": "rejected", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "routed", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "principal": "jane", "role": "admin", "scope": "production.", "environment": "prod" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": false, "workflow": "grant_iam_role", "missing_fields": [], "invalid_fields": {}, "blocked_fields": { "role": "admin" }, "failure_reasons": [ "One or more fields contains a blocked value." ], "clarifying_question": null }, "policy_decision": { "status": "rejected", "workflow": "grant_iam_role", "confidence": 0.93, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Blocked value for role: admin" ], "clarifying_question": null }, "actual": { "status": "rejected", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "principal": "jane", "role": "admin", "scope": "production.", "environment": "prod" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 } ], "failure_reasons": [ "Blocked value for role: admin" ], "clarifying_question": null }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0025", "case_type": "success", "input": "Need a small .NET app named reporting-web-app in centralus for reporting.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_web_app" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "app_name": "reporting-web-app", "diagnostics_enabled": false, "environment": "dev", "region": "centralus", "runtime": "dotnet8", "team": "reporting" }, "status": "routed", "workflow": "create_web_app" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "app_name": "reporting-web-app", "region": "centralus", "runtime": "dotnet8", "diagnostics_enabled": false }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What environment should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_web_app", "missing_fields": [ "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "app_name": "reporting-web-app", "region": "centralus", "runtime": "dotnet8", "diagnostics_enabled": false }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected routed, got needs_clarification", "missing expected parameter keys: environment, team" ] }, { "id": "eval-0026", "case_type": "success", "input": "Grant deploy-bot reader access to staging-bucket in development.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "grant_iam_role" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "dev", "principal": "deploy-bot", "role": "reader", "scope": "staging-bucket" }, "status": "routed", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "routed", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "principal": "deploy-bot", "role": "reader", "scope": "staging-bucket", "environment": "staging" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 }, { "workflow": "create_storage_bucket", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "grant_iam_role", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.93, "accepted": false, "requires_confirmation": true, "execution_allowed": false, "reasons": [ "Workflow is high risk and requires human confirmation." ], "clarifying_question": null }, "actual": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "principal": "deploy-bot", "role": "reader", "scope": "staging-bucket", "environment": "staging" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 }, { "workflow": "create_storage_bucket", "confidence": 0.69 } ], "failure_reasons": [ "Workflow is high risk and requires human confirmation." ], "clarifying_question": null }, "pass_fail_notes": [ "status mismatch: expected routed, got requires_confirmation" ] }, { "id": "eval-0027", "case_type": "success", "input": "Need a small Python app named platform-web-app in westus for platform.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_web_app" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "app_name": "platform-web-app", "diagnostics_enabled": false, "environment": "staging", "region": "westus", "runtime": "python311", "team": "platform" }, "status": "routed", "workflow": "create_web_app" }, "actual_router_output": { "status": "routed", "workflow": "create_web_app", "confidence": 0.93, "parameters": { "app_name": "platform-web-app", "region": "westus", "runtime": "python311", "diagnostics_enabled": false }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": "What environment should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_web_app", "missing_fields": [ "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.93, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.93, "parameters": { "app_name": "platform-web-app", "region": "westus", "runtime": "python311", "diagnostics_enabled": false }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.93 } ], "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected routed, got needs_clarification", "missing expected parameter keys: environment, team" ] }, { "id": "eval-0028", "case_type": "success", "input": "Grant jane viewer access to staging-bucket in development.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "grant_iam_role" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "dev", "principal": "jane", "role": "viewer", "scope": "staging-bucket" }, "status": "routed", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "routed", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "principal": "jane", "role": "viewer", "scope": "staging-bucket", "environment": "staging" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 }, { "workflow": "create_storage_bucket", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "grant_iam_role", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.93, "accepted": false, "requires_confirmation": true, "execution_allowed": false, "reasons": [ "Workflow is high risk and requires human confirmation." ], "clarifying_question": null }, "actual": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "principal": "jane", "role": "viewer", "scope": "staging-bucket", "environment": "staging" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 }, { "workflow": "create_storage_bucket", "confidence": 0.69 } ], "failure_reasons": [ "Workflow is high risk and requires human confirmation." ], "clarifying_question": null }, "pass_fail_notes": [ "status mismatch: expected routed, got requires_confirmation" ] }, { "id": "eval-0029", "case_type": "confirmation_required", "input": "iam: ops-lead role contributor scope production", "expected": { "candidate_workflows": [ { "confidence": 0.82, "workflow": "grant_iam_role" } ], "clarifying_question": null, "confidence": 0.82, "failure_reasons": [ "High-risk IAM change requires confirmation." ], "missing_fields": [], "parameters": { "environment": "prod", "principal": "ops-lead", "role": "contributor", "scope": "production" }, "status": "requires_confirmation", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "routed", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "role": "contributor", "scope": "production", "environment": "prod" }, "missing_fields": [ "principal" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": "What principal should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "grant_iam_role", "missing_fields": [ "principal" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: principal" ], "clarifying_question": "What principal should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.93, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: principal" ], "clarifying_question": "What principal should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "role": "contributor", "scope": "production", "environment": "prod" }, "missing_fields": [ "principal" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 } ], "failure_reasons": [ "Missing required fields: principal" ], "clarifying_question": "What principal should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected requires_confirmation, got needs_clarification", "missing expected parameter keys: principal" ] }, { "id": "eval-0030", "case_type": "missing_fields", "input": "iam access needed for deploy-bot, scope TBD", "expected": { "candidate_workflows": [ { "confidence": 0.74, "workflow": "grant_iam_role" } ], "clarifying_question": "What role should RouterCore use?", "confidence": 0.74, "failure_reasons": [ "Missing required fields: role, scope" ], "missing_fields": [ "role", "scope" ], "parameters": { "principal": "deploy-bot" }, "status": "needs_clarification", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "routed", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "scope": "deploy-bot" }, "missing_fields": [ "principal", "role" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": "What principal should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "grant_iam_role", "missing_fields": [ "principal", "role" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: principal, role" ], "clarifying_question": "What principal should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.93, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: principal, role" ], "clarifying_question": "What principal should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "scope": "deploy-bot" }, "missing_fields": [ "principal", "role" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 } ], "failure_reasons": [ "Missing required fields: principal, role" ], "clarifying_question": "What principal should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0031", "case_type": "success", "input": "Grant deploy-bot viewer access to staging-bucket in production.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "grant_iam_role" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "prod", "principal": "deploy-bot", "role": "viewer", "scope": "staging-bucket" }, "status": "routed", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "routed", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "principal": "deploy-bot", "role": "viewer", "scope": "staging-bucket", "environment": "prod" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 }, { "workflow": "create_storage_bucket", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "grant_iam_role", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.93, "accepted": false, "requires_confirmation": true, "execution_allowed": false, "reasons": [ "Workflow is high risk and requires human confirmation.", "IAM request targets production or broad-scope permissions." ], "clarifying_question": null }, "actual": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "principal": "deploy-bot", "role": "viewer", "scope": "staging-bucket", "environment": "prod" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 }, { "workflow": "create_storage_bucket", "confidence": 0.69 } ], "failure_reasons": [ "Workflow is high risk and requires human confirmation.", "IAM request targets production or broad-scope permissions." ], "clarifying_question": null }, "pass_fail_notes": [ "status mismatch: expected routed, got requires_confirmation" ] }, { "id": "eval-0032", "case_type": "missing_fields", "input": "permission request for jane", "expected": { "candidate_workflows": [ { "confidence": 0.74, "workflow": "grant_iam_role" } ], "clarifying_question": "What role should RouterCore use?", "confidence": 0.74, "failure_reasons": [ "Missing required fields: role, scope" ], "missing_fields": [ "role", "scope" ], "parameters": { "principal": "jane" }, "status": "needs_clarification", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.69, "parameters": { "scope": "request" }, "missing_fields": [ "principal", "role" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What principal should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "grant_iam_role", "missing_fields": [ "principal", "role" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: principal, role" ], "clarifying_question": "What principal should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: principal, role" ], "clarifying_question": "What principal should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.69, "parameters": { "scope": "request" }, "missing_fields": [ "principal", "role" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: principal, role" ], "clarifying_question": "What principal should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0033", "case_type": "missing_fields", "input": "bucket needed for security, no location picked yet", "expected": { "candidate_workflows": [ { "confidence": 0.74, "workflow": "create_storage_bucket" } ], "clarifying_question": "What bucket name should RouterCore use?", "confidence": 0.74, "failure_reasons": [ "Missing required fields: bucket_name, region, environment" ], "missing_fields": [ "bucket_name", "region", "environment" ], "parameters": { "team": "security" }, "status": "needs_clarification", "workflow": "create_storage_bucket" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_storage_bucket", "confidence": 0.69, "parameters": {}, "missing_fields": [ "bucket_name", "region", "environment" ], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What bucket name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_storage_bucket", "missing_fields": [ "bucket_name", "region", "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: bucket_name, region, environment" ], "clarifying_question": "What bucket name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_storage_bucket", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: bucket_name, region, environment" ], "clarifying_question": "What bucket name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_storage_bucket", "confidence": 0.69, "parameters": {}, "missing_fields": [ "bucket_name", "region", "environment" ], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: bucket_name, region, environment" ], "clarifying_question": "What bucket name should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0034", "case_type": "confirmation_required", "input": "iam: ops-lead role reader scope production", "expected": { "candidate_workflows": [ { "confidence": 0.82, "workflow": "grant_iam_role" } ], "clarifying_question": null, "confidence": 0.82, "failure_reasons": [ "High-risk IAM change requires confirmation." ], "missing_fields": [], "parameters": { "environment": "prod", "principal": "ops-lead", "role": "reader", "scope": "production" }, "status": "requires_confirmation", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "routed", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "role": "reader", "scope": "production", "environment": "prod" }, "missing_fields": [ "principal" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": "What principal should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "grant_iam_role", "missing_fields": [ "principal" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: principal" ], "clarifying_question": "What principal should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.93, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: principal" ], "clarifying_question": "What principal should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "role": "reader", "scope": "production", "environment": "prod" }, "missing_fields": [ "principal" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 } ], "failure_reasons": [ "Missing required fields: principal" ], "clarifying_question": "What principal should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected requires_confirmation, got needs_clarification", "missing expected parameter keys: principal" ] }, { "id": "eval-0035", "case_type": "missing_fields", "input": "identity needed for team reporting", "expected": { "candidate_workflows": [ { "confidence": 0.74, "workflow": "create_service_account" } ], "clarifying_question": "What account name should RouterCore use?", "confidence": 0.74, "failure_reasons": [ "Missing required fields: account_name, environment" ], "missing_fields": [ "account_name", "environment" ], "parameters": { "team": "reporting" }, "status": "needs_clarification", "workflow": "create_service_account" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_service_account", "confidence": 0.69, "parameters": { "account_name": "reporting-svc", "team": "reporting", "description": "Generated from RouterCore request preview." }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What environment should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_service_account", "missing_fields": [ "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_service_account", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_service_account", "confidence": 0.69, "parameters": { "account_name": "reporting-svc", "team": "reporting", "description": "Generated from RouterCore request preview." }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0036", "case_type": "success", "input": "Create a nightly scheduler job named growth-nightly-job for model-refresh in production.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_scheduler_job" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "prod", "job_name": "growth-nightly-job", "schedule": "0 9 * * *", "target": "model-refresh", "team": "growth", "timezone": "America/Los_Angeles" }, "status": "routed", "workflow": "create_scheduler_job" }, "actual_router_output": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.95, "parameters": { "job_name": "growth-nightly-job", "schedule": "0 2 * * *", "target": "model-refresh", "environment": "prod" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.95 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "create_scheduler_job", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.95, "accepted": true, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "actual": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.95, "parameters": { "job_name": "growth-nightly-job", "schedule": "0 2 * * *", "target": "model-refresh", "environment": "prod" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.95 } ], "failure_reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "pass_fail_notes": [ "missing expected parameter keys: team, timezone" ] }, { "id": "eval-0037", "case_type": "success", "input": "Create a service account named security-svc for the security team in production.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_service_account" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "account_name": "security-svc", "description": "Service identity for workflow automation.", "environment": "prod", "team": "security" }, "status": "routed", "workflow": "create_service_account" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_service_account", "confidence": 0.69, "parameters": { "account_name": "security-svc", "team": "security", "environment": "prod", "description": "Generated from RouterCore request preview." }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "Please confirm the selected workflow and parameters." }, "validation_result": { "valid": true, "workflow": "create_service_account", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "requires_confirmation", "workflow": "create_service_account", "confidence": 0.69, "accepted": false, "requires_confirmation": true, "execution_allowed": false, "reasons": [ "Router confidence is between 0.55 and 0.80." ], "clarifying_question": null }, "actual": { "status": "requires_confirmation", "workflow": "create_service_account", "confidence": 0.69, "parameters": { "account_name": "security-svc", "team": "security", "environment": "prod", "description": "Generated from RouterCore request preview." }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.69 } ], "failure_reasons": [ "Router confidence is between 0.55 and 0.80." ], "clarifying_question": null }, "pass_fail_notes": [ "status mismatch: expected routed, got requires_confirmation" ] }, { "id": "eval-0038", "case_type": "success", "input": "Create a production Node.js web app for the growth team in US Central.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_web_app" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "app_name": "growth-web-app", "diagnostics_enabled": true, "environment": "prod", "region": "us-central1", "runtime": "nodejs20", "team": "growth" }, "status": "routed", "workflow": "create_web_app" }, "actual_router_output": { "status": "routed", "workflow": "create_web_app", "confidence": 0.93, "parameters": { "app_name": "growth-web-app", "region": "us-central1", "runtime": "nodejs20", "environment": "prod", "team": "growth", "diagnostics_enabled": false }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "create_web_app", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "routed", "workflow": "create_web_app", "confidence": 0.93, "accepted": true, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "actual": { "status": "routed", "workflow": "create_web_app", "confidence": 0.93, "parameters": { "app_name": "growth-web-app", "region": "us-central1", "runtime": "nodejs20", "environment": "prod", "team": "growth", "diagnostics_enabled": false }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.93 } ], "failure_reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0039", "case_type": "success", "input": "Create a standard storage bucket named platform-bucket in US Central for development.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_storage_bucket" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "bucket_name": "platform-bucket", "environment": "dev", "region": "us-central1", "storage_class": "standard", "team": "platform" }, "status": "routed", "workflow": "create_storage_bucket" }, "actual_router_output": { "status": "routed", "workflow": "create_storage_bucket", "confidence": 0.93, "parameters": { "bucket_name": "platform-bucket", "region": "us-central1", "environment": "dev", "storage_class": "standard" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "create_storage_bucket", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "routed", "workflow": "create_storage_bucket", "confidence": 0.93, "accepted": true, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "actual": { "status": "routed", "workflow": "create_storage_bucket", "confidence": 0.93, "parameters": { "bucket_name": "platform-bucket", "region": "us-central1", "environment": "dev", "storage_class": "standard" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.93 } ], "failure_reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "pass_fail_notes": [ "missing expected parameter keys: team" ] }, { "id": "eval-0040", "case_type": "ambiguous", "input": "Prep access and automation for the new project.", "expected": { "candidate_workflows": [ { "confidence": 0.38, "workflow": "create_service_account" }, { "confidence": 0.31, "workflow": "create_scheduler_job" } ], "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", "confidence": 0.34, "failure_reasons": [ "Request is ambiguous across multiple workflows." ], "missing_fields": [], "parameters": {}, "status": "needs_clarification", "workflow": null }, "actual_router_output": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.69, "parameters": { "scope": "for" }, "missing_fields": [ "principal", "role" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What principal should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "grant_iam_role", "missing_fields": [ "principal", "role" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: principal, role" ], "clarifying_question": "What principal should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: principal, role" ], "clarifying_question": "What principal should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.69, "parameters": { "scope": "for" }, "missing_fields": [ "principal", "role" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: principal, role" ], "clarifying_question": "What principal should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0041", "case_type": "success", "input": "Create a nightly scheduler job named claims-nightly-job for billing-export in staging.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_scheduler_job" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "staging", "job_name": "claims-nightly-job", "schedule": "0 9 * * *", "target": "billing-export", "team": "claims", "timezone": "America/Los_Angeles" }, "status": "routed", "workflow": "create_scheduler_job" }, "actual_router_output": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.95, "parameters": { "job_name": "claims-nightly-job", "schedule": "0 2 * * *", "target": "billing-export", "environment": "staging" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.95 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "create_scheduler_job", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.95, "accepted": true, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "actual": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.95, "parameters": { "job_name": "claims-nightly-job", "schedule": "0 2 * * *", "target": "billing-export", "environment": "staging" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.95 } ], "failure_reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "pass_fail_notes": [ "missing expected parameter keys: team, timezone" ] }, { "id": "eval-0042", "case_type": "confirmation_required", "input": "iam: jane role contributor scope all reporting resources", "expected": { "candidate_workflows": [ { "confidence": 0.82, "workflow": "grant_iam_role" } ], "clarifying_question": null, "confidence": 0.82, "failure_reasons": [ "High-risk IAM change requires confirmation." ], "missing_fields": [], "parameters": { "environment": "staging", "principal": "jane", "role": "contributor", "scope": "all reporting resources" }, "status": "requires_confirmation", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "routed", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "role": "contributor" }, "missing_fields": [ "principal", "scope" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": "What principal should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "grant_iam_role", "missing_fields": [ "principal", "scope" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: principal, scope" ], "clarifying_question": "What principal should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.93, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: principal, scope" ], "clarifying_question": "What principal should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "role": "contributor" }, "missing_fields": [ "principal", "scope" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 } ], "failure_reasons": [ "Missing required fields: principal, scope" ], "clarifying_question": "What principal should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected requires_confirmation, got needs_clarification", "missing expected parameter keys: environment, principal, scope" ] }, { "id": "eval-0043", "case_type": "ambiguous", "input": "Set up reporting.", "expected": { "candidate_workflows": [ { "confidence": 0.38, "workflow": "create_service_account" }, { "confidence": 0.31, "workflow": "create_storage_bucket" } ], "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", "confidence": 0.34, "failure_reasons": [ "Request is ambiguous across multiple workflows." ], "missing_fields": [], "parameters": {}, "status": "needs_clarification", "workflow": null }, "actual_router_output": { "status": "needs_clarification", "workflow": null, "confidence": 0.25, "parameters": {}, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.25 }, { "workflow": "create_scheduler_job", "confidence": 0.23 } ], "failure_reasons": [ "No workflow keywords matched with enough confidence." ], "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" }, "validation_result": { "valid": false, "workflow": null, "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Router did not select a workflow." ], "clarifying_question": "Which workflow should this request use?" }, "policy_decision": { "status": "needs_clarification", "workflow": null, "confidence": 0.25, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "No authoritative workflow could be selected." ], "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" }, "actual": { "status": "needs_clarification", "workflow": null, "confidence": 0.25, "parameters": {}, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.25 }, { "workflow": "create_scheduler_job", "confidence": 0.23 } ], "failure_reasons": [ "No authoritative workflow could be selected." ], "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0044", "case_type": "success", "input": "Need a small Node.js app named growth-web-app in westus for growth.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_web_app" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "app_name": "growth-web-app", "diagnostics_enabled": false, "environment": "prod", "region": "westus", "runtime": "nodejs20", "team": "growth" }, "status": "routed", "workflow": "create_web_app" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "app_name": "growth-web-app", "region": "westus", "runtime": "nodejs20", "diagnostics_enabled": false }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What environment should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_web_app", "missing_fields": [ "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "app_name": "growth-web-app", "region": "westus", "runtime": "nodejs20", "diagnostics_enabled": false }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected routed, got needs_clarification", "missing expected parameter keys: environment, team" ] }, { "id": "eval-0045", "case_type": "missing_fields", "input": "need api for reporting, details TBD", "expected": { "candidate_workflows": [ { "confidence": 0.74, "workflow": "create_web_app" } ], "clarifying_question": "What app name should RouterCore use?", "confidence": 0.74, "failure_reasons": [ "Missing required fields: app_name, region, environment" ], "missing_fields": [ "app_name", "region", "environment" ], "parameters": { "runtime": "python311", "team": "reporting" }, "status": "needs_clarification", "workflow": "create_web_app" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "diagnostics_enabled": false }, "missing_fields": [ "app_name", "region", "runtime", "environment" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What app name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_web_app", "missing_fields": [ "app_name", "region", "runtime", "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: app_name, region, runtime, environment" ], "clarifying_question": "What app name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: app_name, region, runtime, environment" ], "clarifying_question": "What app name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "diagnostics_enabled": false }, "missing_fields": [ "app_name", "region", "runtime", "environment" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: app_name, region, runtime, environment" ], "clarifying_question": "What app name should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0046", "case_type": "success", "input": "Grant jane reader access to reporting-project in staging.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "grant_iam_role" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "staging", "principal": "jane", "role": "reader", "scope": "reporting-project" }, "status": "routed", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "routed", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "principal": "jane", "role": "reader", "scope": "reporting-project", "environment": "staging" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "grant_iam_role", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.93, "accepted": false, "requires_confirmation": true, "execution_allowed": false, "reasons": [ "Workflow is high risk and requires human confirmation." ], "clarifying_question": null }, "actual": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "principal": "jane", "role": "reader", "scope": "reporting-project", "environment": "staging" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 } ], "failure_reasons": [ "Workflow is high risk and requires human confirmation." ], "clarifying_question": null }, "pass_fail_notes": [ "status mismatch: expected routed, got requires_confirmation" ] }, { "id": "eval-0047", "case_type": "success", "input": "Give analyst the viewer role on claims-app.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "grant_iam_role" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "prod", "principal": "analyst", "role": "viewer", "scope": "claims-app" }, "status": "routed", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "diagnostics_enabled": false }, "missing_fields": [ "app_name", "region", "runtime", "environment" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 }, { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What app name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_web_app", "missing_fields": [ "app_name", "region", "runtime", "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: app_name, region, runtime, environment" ], "clarifying_question": "What app name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: app_name, region, runtime, environment" ], "clarifying_question": "What app name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "diagnostics_enabled": false }, "missing_fields": [ "app_name", "region", "runtime", "environment" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 }, { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: app_name, region, runtime, environment" ], "clarifying_question": "What app name should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected routed, got needs_clarification", "workflow mismatch: expected grant_iam_role, got create_web_app", "missing expected parameter keys: environment, principal, role, scope" ] }, { "id": "eval-0048", "case_type": "success", "input": "Create a nightly scheduler job named reporting-nightly-job for claims-sync in production.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_scheduler_job" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "prod", "job_name": "reporting-nightly-job", "schedule": "0 9 * * *", "target": "claims-sync", "team": "reporting", "timezone": "America/New_York" }, "status": "routed", "workflow": "create_scheduler_job" }, "actual_router_output": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.95, "parameters": { "job_name": "reporting-nightly-job", "schedule": "0 2 * * *", "target": "claims-sync", "environment": "prod" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.95 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "create_scheduler_job", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.95, "accepted": true, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "actual": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.95, "parameters": { "job_name": "reporting-nightly-job", "schedule": "0 2 * * *", "target": "claims-sync", "environment": "prod" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.95 } ], "failure_reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "pass_fail_notes": [ "missing expected parameter keys: team, timezone" ] }, { "id": "eval-0049", "case_type": "success", "input": "ticket: finance staging api, runtime Python, region West US, diagnostics on", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_web_app" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "app_name": "finance-web-app", "diagnostics_enabled": true, "environment": "staging", "region": "westus", "runtime": "python311", "team": "finance" }, "status": "routed", "workflow": "create_web_app" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "region": "westus", "runtime": "python311", "environment": "staging", "diagnostics_enabled": true }, "missing_fields": [ "app_name" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What app name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_web_app", "missing_fields": [ "app_name" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: app_name" ], "clarifying_question": "What app name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: app_name" ], "clarifying_question": "What app name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "region": "westus", "runtime": "python311", "environment": "staging", "diagnostics_enabled": true }, "missing_fields": [ "app_name" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: app_name" ], "clarifying_question": "What app name should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected routed, got needs_clarification", "missing expected parameter keys: app_name, team" ] }, { "id": "eval-0050", "case_type": "ambiguous", "input": "Prep access and automation for the new project.", "expected": { "candidate_workflows": [ { "confidence": 0.38, "workflow": "create_scheduler_job" }, { "confidence": 0.31, "workflow": "create_service_account" } ], "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", "confidence": 0.34, "failure_reasons": [ "Request is ambiguous across multiple workflows." ], "missing_fields": [], "parameters": {}, "status": "needs_clarification", "workflow": null }, "actual_router_output": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.69, "parameters": { "scope": "for" }, "missing_fields": [ "principal", "role" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What principal should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "grant_iam_role", "missing_fields": [ "principal", "role" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: principal, role" ], "clarifying_question": "What principal should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: principal, role" ], "clarifying_question": "What principal should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.69, "parameters": { "scope": "for" }, "missing_fields": [ "principal", "role" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: principal, role" ], "clarifying_question": "What principal should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0051", "case_type": "success", "input": "cron 0 9 * * * target model-refresh env staging timezone America/New_York", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_scheduler_job" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "staging", "job_name": "finance-nightly-job", "schedule": "0 9 * * *", "target": "model-refresh", "team": "finance", "timezone": "America/New_York" }, "status": "routed", "workflow": "create_scheduler_job" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_scheduler_job", "confidence": 0.69, "parameters": { "job_name": "model-refresh-scheduled-job", "schedule": "0 9 * * *", "target": "model-refresh", "environment": "staging" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "Please confirm the selected workflow and parameters." }, "validation_result": { "valid": true, "workflow": "create_scheduler_job", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "requires_confirmation", "workflow": "create_scheduler_job", "confidence": 0.69, "accepted": false, "requires_confirmation": true, "execution_allowed": false, "reasons": [ "Router confidence is between 0.55 and 0.80." ], "clarifying_question": null }, "actual": { "status": "requires_confirmation", "workflow": "create_scheduler_job", "confidence": 0.69, "parameters": { "job_name": "model-refresh-scheduled-job", "schedule": "0 9 * * *", "target": "model-refresh", "environment": "staging" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.69 } ], "failure_reasons": [ "Router confidence is between 0.55 and 0.80." ], "clarifying_question": null }, "pass_fail_notes": [ "status mismatch: expected routed, got requires_confirmation", "missing expected parameter keys: team, timezone" ] }, { "id": "eval-0052", "case_type": "success", "input": "Give john the editor role on dev-subsystem.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "grant_iam_role" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "staging", "principal": "john", "role": "editor", "scope": "dev-subsystem" }, "status": "routed", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.69, "parameters": { "role": "editor", "scope": "dev-subsystem.", "environment": "dev" }, "missing_fields": [ "principal" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What principal should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "grant_iam_role", "missing_fields": [ "principal" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: principal" ], "clarifying_question": "What principal should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: principal" ], "clarifying_question": "What principal should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.69, "parameters": { "role": "editor", "scope": "dev-subsystem.", "environment": "dev" }, "missing_fields": [ "principal" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: principal" ], "clarifying_question": "What principal should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected routed, got needs_clarification", "missing expected parameter keys: principal" ] }, { "id": "eval-0053", "case_type": "missing_fields", "input": "Set up a reporting schedule.", "expected": { "candidate_workflows": [ { "confidence": 0.74, "workflow": "create_scheduler_job" } ], "clarifying_question": "What job name should RouterCore use?", "confidence": 0.74, "failure_reasons": [ "Missing required fields: job_name, schedule, environment" ], "missing_fields": [ "job_name", "schedule", "environment" ], "parameters": { "target": "reporting" }, "status": "needs_clarification", "workflow": "create_scheduler_job" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_scheduler_job", "confidence": 0.69, "parameters": {}, "missing_fields": [ "job_name", "schedule", "target", "environment" ], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What job name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_scheduler_job", "missing_fields": [ "job_name", "schedule", "target", "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: job_name, schedule, target, environment" ], "clarifying_question": "What job name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_scheduler_job", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: job_name, schedule, target, environment" ], "clarifying_question": "What job name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_scheduler_job", "confidence": 0.69, "parameters": {}, "missing_fields": [ "job_name", "schedule", "target", "environment" ], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: job_name, schedule, target, environment" ], "clarifying_question": "What job name should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0054", "case_type": "success", "input": "Create a nightly scheduler job named growth-nightly-job for reporting in staging.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_scheduler_job" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "staging", "job_name": "growth-nightly-job", "schedule": "0 2 * * *", "target": "reporting", "team": "growth", "timezone": "America/New_York" }, "status": "routed", "workflow": "create_scheduler_job" }, "actual_router_output": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.95, "parameters": { "job_name": "growth-nightly-job", "schedule": "0 2 * * *", "target": "reporting", "environment": "staging" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.95 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "create_scheduler_job", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.95, "accepted": true, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "actual": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.95, "parameters": { "job_name": "growth-nightly-job", "schedule": "0 2 * * *", "target": "reporting", "environment": "staging" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.95 } ], "failure_reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "pass_fail_notes": [ "missing expected parameter keys: team, timezone" ] }, { "id": "eval-0055", "case_type": "ambiguous", "input": "Make the nightly thing happen.", "expected": { "candidate_workflows": [ { "confidence": 0.38, "workflow": "create_service_account" }, { "confidence": 0.31, "workflow": "create_storage_bucket" } ], "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", "confidence": 0.34, "failure_reasons": [ "Request is ambiguous across multiple workflows." ], "missing_fields": [], "parameters": {}, "status": "needs_clarification", "workflow": null }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_web_app", "confidence": 0.58, "parameters": { "diagnostics_enabled": false }, "missing_fields": [ "app_name", "region", "runtime", "environment" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.58 }, { "workflow": "create_scheduler_job", "confidence": 0.58 } ], "failure_reasons": [], "clarifying_question": "What app name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_web_app", "missing_fields": [ "app_name", "region", "runtime", "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: app_name, region, runtime, environment" ], "clarifying_question": "What app name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.58, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: app_name, region, runtime, environment" ], "clarifying_question": "What app name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.58, "parameters": { "diagnostics_enabled": false }, "missing_fields": [ "app_name", "region", "runtime", "environment" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.58 }, { "workflow": "create_scheduler_job", "confidence": 0.58 } ], "failure_reasons": [ "Missing required fields: app_name, region, runtime, environment" ], "clarifying_question": "What app name should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0056", "case_type": "success", "input": "identity request: growth service account, env prod, name growth-svc", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_service_account" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "account_name": "growth-svc", "description": "Service identity for workflow automation.", "environment": "prod", "team": "growth" }, "status": "routed", "workflow": "create_service_account" }, "actual_router_output": { "status": "routed", "workflow": "create_service_account", "confidence": 0.93, "parameters": { "environment": "prod", "description": "Generated from RouterCore request preview." }, "missing_fields": [ "account_name", "team" ], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": "What account name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_service_account", "missing_fields": [ "account_name", "team" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: account_name, team" ], "clarifying_question": "What account name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_service_account", "confidence": 0.93, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: account_name, team" ], "clarifying_question": "What account name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_service_account", "confidence": 0.93, "parameters": { "environment": "prod", "description": "Generated from RouterCore request preview." }, "missing_fields": [ "account_name", "team" ], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.93 } ], "failure_reasons": [ "Missing required fields: account_name, team" ], "clarifying_question": "What account name should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected routed, got needs_clarification", "missing expected parameter keys: account_name, team" ] }, { "id": "eval-0057", "case_type": "success", "input": "Create a production .NET web app for the reporting team in West US.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_web_app" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "app_name": "reporting-web-app", "diagnostics_enabled": false, "environment": "prod", "region": "westus", "runtime": "dotnet8", "team": "reporting" }, "status": "routed", "workflow": "create_web_app" }, "actual_router_output": { "status": "routed", "workflow": "create_web_app", "confidence": 0.93, "parameters": { "app_name": "reporting-web-app", "region": "westus", "runtime": "dotnet8", "environment": "prod", "team": "reporting", "diagnostics_enabled": false }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "create_web_app", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "routed", "workflow": "create_web_app", "confidence": 0.93, "accepted": true, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "actual": { "status": "routed", "workflow": "create_web_app", "confidence": 0.93, "parameters": { "app_name": "reporting-web-app", "region": "westus", "runtime": "dotnet8", "environment": "prod", "team": "reporting", "diagnostics_enabled": false }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.93 } ], "failure_reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0058", "case_type": "missing_fields", "input": "service account request, owner team security", "expected": { "candidate_workflows": [ { "confidence": 0.74, "workflow": "create_service_account" } ], "clarifying_question": "What account name should RouterCore use?", "confidence": 0.74, "failure_reasons": [ "Missing required fields: account_name, environment" ], "missing_fields": [ "account_name", "environment" ], "parameters": { "team": "security" }, "status": "needs_clarification", "workflow": "create_service_account" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_service_account", "confidence": 0.69, "parameters": { "account_name": "security-svc", "team": "security", "description": "Generated from RouterCore request preview." }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What environment should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_service_account", "missing_fields": [ "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_service_account", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_service_account", "confidence": 0.69, "parameters": { "account_name": "security-svc", "team": "security", "description": "Generated from RouterCore request preview." }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0059", "case_type": "success", "input": "Create a production .NET web app for the security team in West US.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_web_app" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "app_name": "security-web-app", "diagnostics_enabled": true, "environment": "prod", "region": "westus", "runtime": "dotnet8", "team": "security" }, "status": "routed", "workflow": "create_web_app" }, "actual_router_output": { "status": "routed", "workflow": "create_web_app", "confidence": 0.93, "parameters": { "app_name": "security-web-app", "region": "westus", "runtime": "dotnet8", "environment": "prod", "team": "security", "diagnostics_enabled": false }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "create_web_app", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "routed", "workflow": "create_web_app", "confidence": 0.93, "accepted": true, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "actual": { "status": "routed", "workflow": "create_web_app", "confidence": 0.93, "parameters": { "app_name": "security-web-app", "region": "westus", "runtime": "dotnet8", "environment": "prod", "team": "security", "diagnostics_enabled": false }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.93 } ], "failure_reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0060", "case_type": "success", "input": "Set up a daily job for reporting for the growth team in development.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_scheduler_job" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "dev", "job_name": "growth-nightly-job", "schedule": "0 2 * * *", "target": "reporting", "team": "growth", "timezone": "UTC" }, "status": "routed", "workflow": "create_scheduler_job" }, "actual_router_output": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.93, "parameters": { "job_name": "reporting-scheduled-job", "schedule": "0 9 * * *", "target": "reporting", "environment": "dev", "team": "growth" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "create_scheduler_job", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.93, "accepted": true, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "actual": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.93, "parameters": { "job_name": "reporting-scheduled-job", "schedule": "0 9 * * *", "target": "reporting", "environment": "dev", "team": "growth" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.93 } ], "failure_reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "pass_fail_notes": [ "missing expected parameter keys: timezone" ] }, { "id": "eval-0061", "case_type": "success", "input": "Set up a daily job for reporting for the reporting team in staging.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_scheduler_job" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "staging", "job_name": "reporting-nightly-job", "schedule": "0 9 * * *", "target": "reporting", "team": "reporting", "timezone": "America/New_York" }, "status": "routed", "workflow": "create_scheduler_job" }, "actual_router_output": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.93, "parameters": { "job_name": "reporting-scheduled-job", "schedule": "0 9 * * *", "target": "reporting", "environment": "staging", "team": "reporting" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "create_scheduler_job", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.93, "accepted": true, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "actual": { "status": "routed", "workflow": "create_scheduler_job", "confidence": 0.93, "parameters": { "job_name": "reporting-scheduled-job", "schedule": "0 9 * * *", "target": "reporting", "environment": "staging", "team": "reporting" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_scheduler_job", "confidence": 0.93 } ], "failure_reasons": [ "Route accepted for execution preview only." ], "clarifying_question": null }, "pass_fail_notes": [ "missing expected parameter keys: timezone" ] }, { "id": "eval-0062", "case_type": "success", "input": "Need an automation identity for team growth in development.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_service_account" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "account_name": "growth-svc", "description": "Service identity for workflow automation.", "environment": "dev", "team": "growth" }, "status": "routed", "workflow": "create_service_account" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_service_account", "confidence": 0.69, "parameters": { "account_name": "growth-svc", "team": "growth", "environment": "dev", "description": "Generated from RouterCore request preview." }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "Please confirm the selected workflow and parameters." }, "validation_result": { "valid": true, "workflow": "create_service_account", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "requires_confirmation", "workflow": "create_service_account", "confidence": 0.69, "accepted": false, "requires_confirmation": true, "execution_allowed": false, "reasons": [ "Router confidence is between 0.55 and 0.80." ], "clarifying_question": null }, "actual": { "status": "requires_confirmation", "workflow": "create_service_account", "confidence": 0.69, "parameters": { "account_name": "growth-svc", "team": "growth", "environment": "dev", "description": "Generated from RouterCore request preview." }, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.69 } ], "failure_reasons": [ "Router confidence is between 0.55 and 0.80." ], "clarifying_question": null }, "pass_fail_notes": [ "status mismatch: expected routed, got requires_confirmation" ] }, { "id": "eval-0063", "case_type": "ambiguous", "input": "Prep access and automation for the new project.", "expected": { "candidate_workflows": [ { "confidence": 0.38, "workflow": "grant_iam_role" }, { "confidence": 0.31, "workflow": "create_service_account" } ], "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", "confidence": 0.34, "failure_reasons": [ "Request is ambiguous across multiple workflows." ], "missing_fields": [], "parameters": {}, "status": "needs_clarification", "workflow": null }, "actual_router_output": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.69, "parameters": { "scope": "for" }, "missing_fields": [ "principal", "role" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What principal should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "grant_iam_role", "missing_fields": [ "principal", "role" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: principal, role" ], "clarifying_question": "What principal should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: principal, role" ], "clarifying_question": "What principal should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "grant_iam_role", "confidence": 0.69, "parameters": { "scope": "for" }, "missing_fields": [ "principal", "role" ], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: principal, role" ], "clarifying_question": "What principal should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0064", "case_type": "missing_fields", "input": "Create a Python web app for the growth team.", "expected": { "candidate_workflows": [ { "confidence": 0.74, "workflow": "create_web_app" } ], "clarifying_question": "What app name should RouterCore use?", "confidence": 0.74, "failure_reasons": [ "Missing required fields: app_name, region, environment" ], "missing_fields": [ "app_name", "region", "environment" ], "parameters": { "runtime": "python311", "team": "growth" }, "status": "needs_clarification", "workflow": "create_web_app" }, "actual_router_output": { "status": "routed", "workflow": "create_web_app", "confidence": 0.93, "parameters": { "app_name": "growth-web-app", "runtime": "python311", "team": "growth", "diagnostics_enabled": false }, "missing_fields": [ "region", "environment" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": "What region should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_web_app", "missing_fields": [ "region", "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: region, environment" ], "clarifying_question": "What region should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.93, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: region, environment" ], "clarifying_question": "What region should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.93, "parameters": { "app_name": "growth-web-app", "runtime": "python311", "team": "growth", "diagnostics_enabled": false }, "missing_fields": [ "region", "environment" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.93 } ], "failure_reasons": [ "Missing required fields: region, environment" ], "clarifying_question": "What region should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0065", "case_type": "missing_fields", "input": "bucket needed for mlops, no location picked yet", "expected": { "candidate_workflows": [ { "confidence": 0.74, "workflow": "create_storage_bucket" } ], "clarifying_question": "What bucket name should RouterCore use?", "confidence": 0.74, "failure_reasons": [ "Missing required fields: bucket_name, region, environment" ], "missing_fields": [ "bucket_name", "region", "environment" ], "parameters": { "team": "mlops" }, "status": "needs_clarification", "workflow": "create_storage_bucket" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_storage_bucket", "confidence": 0.69, "parameters": {}, "missing_fields": [ "bucket_name", "region", "environment" ], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What bucket name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_storage_bucket", "missing_fields": [ "bucket_name", "region", "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: bucket_name, region, environment" ], "clarifying_question": "What bucket name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_storage_bucket", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: bucket_name, region, environment" ], "clarifying_question": "What bucket name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_storage_bucket", "confidence": 0.69, "parameters": {}, "missing_fields": [ "bucket_name", "region", "environment" ], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: bucket_name, region, environment" ], "clarifying_question": "What bucket name should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0066", "case_type": "success", "input": "Give jane the viewer role on staging-bucket.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "grant_iam_role" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "prod", "principal": "jane", "role": "viewer", "scope": "staging-bucket" }, "status": "routed", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_storage_bucket", "confidence": 0.69, "parameters": { "environment": "staging" }, "missing_fields": [ "bucket_name", "region" ], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.69 }, { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What bucket name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_storage_bucket", "missing_fields": [ "bucket_name", "region" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: bucket_name, region" ], "clarifying_question": "What bucket name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_storage_bucket", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: bucket_name, region" ], "clarifying_question": "What bucket name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_storage_bucket", "confidence": 0.69, "parameters": { "environment": "staging" }, "missing_fields": [ "bucket_name", "region" ], "candidate_workflows": [ { "workflow": "create_storage_bucket", "confidence": 0.69 }, { "workflow": "grant_iam_role", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: bucket_name, region" ], "clarifying_question": "What bucket name should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected routed, got needs_clarification", "workflow mismatch: expected grant_iam_role, got create_storage_bucket", "missing expected parameter keys: principal, role, scope" ] }, { "id": "eval-0067", "case_type": "missing_fields", "input": "Create a service account for the mlops team.", "expected": { "candidate_workflows": [ { "confidence": 0.74, "workflow": "create_service_account" } ], "clarifying_question": "What account name should RouterCore use?", "confidence": 0.74, "failure_reasons": [ "Missing required fields: account_name, environment" ], "missing_fields": [ "account_name", "environment" ], "parameters": { "team": "mlops" }, "status": "needs_clarification", "workflow": "create_service_account" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_service_account", "confidence": 0.69, "parameters": { "account_name": "mlops-svc", "team": "mlops", "description": "Generated from RouterCore request preview." }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What environment should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_service_account", "missing_fields": [ "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_service_account", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_service_account", "confidence": 0.69, "parameters": { "account_name": "mlops-svc", "team": "mlops", "description": "Generated from RouterCore request preview." }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0068", "case_type": "missing_fields", "input": "service account request, owner team security", "expected": { "candidate_workflows": [ { "confidence": 0.74, "workflow": "create_service_account" } ], "clarifying_question": "What account name should RouterCore use?", "confidence": 0.74, "failure_reasons": [ "Missing required fields: account_name, environment" ], "missing_fields": [ "account_name", "environment" ], "parameters": { "team": "security" }, "status": "needs_clarification", "workflow": "create_service_account" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_service_account", "confidence": 0.69, "parameters": { "account_name": "security-svc", "team": "security", "description": "Generated from RouterCore request preview." }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What environment should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_service_account", "missing_fields": [ "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_service_account", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_service_account", "confidence": 0.69, "parameters": { "account_name": "security-svc", "team": "security", "description": "Generated from RouterCore request preview." }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0069", "case_type": "success", "input": "ticket: reporting development api, runtime Node.js, region West US, diagnostics on", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_web_app" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "app_name": "reporting-web-app", "diagnostics_enabled": false, "environment": "dev", "region": "westus", "runtime": "nodejs20", "team": "reporting" }, "status": "routed", "workflow": "create_web_app" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "region": "westus", "runtime": "nodejs20", "environment": "dev", "diagnostics_enabled": true }, "missing_fields": [ "app_name" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What app name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_web_app", "missing_fields": [ "app_name" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: app_name" ], "clarifying_question": "What app name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: app_name" ], "clarifying_question": "What app name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_web_app", "confidence": 0.69, "parameters": { "region": "westus", "runtime": "nodejs20", "environment": "dev", "diagnostics_enabled": true }, "missing_fields": [ "app_name" ], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: app_name" ], "clarifying_question": "What app name should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected routed, got needs_clarification", "missing expected parameter keys: app_name, team" ] }, { "id": "eval-0070", "case_type": "ambiguous", "input": "Set up reporting.", "expected": { "candidate_workflows": [ { "confidence": 0.38, "workflow": "create_storage_bucket" }, { "confidence": 0.31, "workflow": "create_scheduler_job" } ], "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?", "confidence": 0.34, "failure_reasons": [ "Request is ambiguous across multiple workflows." ], "missing_fields": [], "parameters": {}, "status": "needs_clarification", "workflow": null }, "actual_router_output": { "status": "needs_clarification", "workflow": null, "confidence": 0.25, "parameters": {}, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.25 }, { "workflow": "create_scheduler_job", "confidence": 0.23 } ], "failure_reasons": [ "No workflow keywords matched with enough confidence." ], "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" }, "validation_result": { "valid": false, "workflow": null, "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Router did not select a workflow." ], "clarifying_question": "Which workflow should this request use?" }, "policy_decision": { "status": "needs_clarification", "workflow": null, "confidence": 0.25, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "No authoritative workflow could be selected." ], "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" }, "actual": { "status": "needs_clarification", "workflow": null, "confidence": 0.25, "parameters": {}, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.25 }, { "workflow": "create_scheduler_job", "confidence": 0.23 } ], "failure_reasons": [ "No authoritative workflow could be selected." ], "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0071", "case_type": "risky_rejected", "input": "Remove all security from production.", "expected": { "candidate_workflows": [], "clarifying_question": null, "confidence": 0.88, "failure_reasons": [ "Unsafe destructive production request." ], "missing_fields": [], "parameters": {}, "status": "rejected", "workflow": null }, "actual_router_output": { "status": "needs_clarification", "workflow": null, "confidence": 0.25, "parameters": {}, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.25 }, { "workflow": "create_scheduler_job", "confidence": 0.23 } ], "failure_reasons": [ "No workflow keywords matched with enough confidence." ], "clarifying_question": "Are you creating an app, storage, identity, IAM access, or a scheduled job?" }, "validation_result": { "valid": false, "workflow": null, "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Router did not select a workflow." ], "clarifying_question": "Which workflow should this request use?" }, "policy_decision": { "status": "rejected", "workflow": null, "confidence": 0.25, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Unsafe phrase matched: remove all security" ], "clarifying_question": null }, "actual": { "status": "rejected", "workflow": null, "confidence": 0.25, "parameters": {}, "missing_fields": [], "candidate_workflows": [ { "workflow": "create_web_app", "confidence": 0.25 }, { "workflow": "create_scheduler_job", "confidence": 0.23 } ], "failure_reasons": [ "Unsafe phrase matched: remove all security" ], "clarifying_question": null }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0072", "case_type": "confirmation_required", "input": "Grant deploy-bot contributor access to production.", "expected": { "candidate_workflows": [ { "confidence": 0.82, "workflow": "grant_iam_role" } ], "clarifying_question": null, "confidence": 0.82, "failure_reasons": [ "High-risk IAM change requires confirmation." ], "missing_fields": [], "parameters": { "environment": "prod", "principal": "deploy-bot", "role": "contributor", "scope": "production" }, "status": "requires_confirmation", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "routed", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "principal": "deploy-bot", "role": "contributor", "scope": "production.", "environment": "prod" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "grant_iam_role", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.93, "accepted": false, "requires_confirmation": true, "execution_allowed": false, "reasons": [ "Workflow is high risk and requires human confirmation.", "IAM request targets production or broad-scope permissions." ], "clarifying_question": null }, "actual": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "principal": "deploy-bot", "role": "contributor", "scope": "production.", "environment": "prod" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 } ], "failure_reasons": [ "Workflow is high risk and requires human confirmation.", "IAM request targets production or broad-scope permissions." ], "clarifying_question": null }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0073", "case_type": "missing_fields", "input": "service account request, owner team security", "expected": { "candidate_workflows": [ { "confidence": 0.74, "workflow": "create_service_account" } ], "clarifying_question": "What account name should RouterCore use?", "confidence": 0.74, "failure_reasons": [ "Missing required fields: account_name, environment" ], "missing_fields": [ "account_name", "environment" ], "parameters": { "team": "security" }, "status": "needs_clarification", "workflow": "create_service_account" }, "actual_router_output": { "status": "requires_confirmation", "workflow": "create_service_account", "confidence": 0.69, "parameters": { "account_name": "security-svc", "team": "security", "description": "Generated from RouterCore request preview." }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.69 } ], "failure_reasons": [], "clarifying_question": "What environment should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_service_account", "missing_fields": [ "environment" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_service_account", "confidence": 0.69, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_service_account", "confidence": 0.69, "parameters": { "account_name": "security-svc", "team": "security", "description": "Generated from RouterCore request preview." }, "missing_fields": [ "environment" ], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.69 } ], "failure_reasons": [ "Missing required fields: environment" ], "clarifying_question": "What environment should RouterCore use?" }, "pass_fail_notes": [ "pass" ] }, { "id": "eval-0074", "case_type": "success", "input": "identity request: finance service account, env prod, name finance-svc", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "create_service_account" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "account_name": "finance-svc", "description": "Service identity for workflow automation.", "environment": "prod", "team": "finance" }, "status": "routed", "workflow": "create_service_account" }, "actual_router_output": { "status": "routed", "workflow": "create_service_account", "confidence": 0.93, "parameters": { "environment": "prod", "description": "Generated from RouterCore request preview." }, "missing_fields": [ "account_name", "team" ], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": "What account name should RouterCore use?" }, "validation_result": { "valid": false, "workflow": "create_service_account", "missing_fields": [ "account_name", "team" ], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [ "Missing required fields: account_name, team" ], "clarifying_question": "What account name should RouterCore use?" }, "policy_decision": { "status": "needs_clarification", "workflow": "create_service_account", "confidence": 0.93, "accepted": false, "requires_confirmation": false, "execution_allowed": false, "reasons": [ "Missing required fields: account_name, team" ], "clarifying_question": "What account name should RouterCore use?" }, "actual": { "status": "needs_clarification", "workflow": "create_service_account", "confidence": 0.93, "parameters": { "environment": "prod", "description": "Generated from RouterCore request preview." }, "missing_fields": [ "account_name", "team" ], "candidate_workflows": [ { "workflow": "create_service_account", "confidence": 0.93 } ], "failure_reasons": [ "Missing required fields: account_name, team" ], "clarifying_question": "What account name should RouterCore use?" }, "pass_fail_notes": [ "status mismatch: expected routed, got needs_clarification", "missing expected parameter keys: account_name, team" ] }, { "id": "eval-0075", "case_type": "success", "input": "Grant deploy-bot editor access to reporting-project in development.", "expected": { "candidate_workflows": [ { "confidence": 0.92, "workflow": "grant_iam_role" } ], "clarifying_question": null, "confidence": 0.92, "failure_reasons": [], "missing_fields": [], "parameters": { "environment": "dev", "principal": "deploy-bot", "role": "editor", "scope": "reporting-project" }, "status": "routed", "workflow": "grant_iam_role" }, "actual_router_output": { "status": "routed", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "principal": "deploy-bot", "role": "editor", "scope": "reporting-project", "environment": "dev" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 } ], "failure_reasons": [], "clarifying_question": null }, "validation_result": { "valid": true, "workflow": "grant_iam_role", "missing_fields": [], "invalid_fields": {}, "blocked_fields": {}, "failure_reasons": [], "clarifying_question": null }, "policy_decision": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.93, "accepted": false, "requires_confirmation": true, "execution_allowed": false, "reasons": [ "Workflow is high risk and requires human confirmation." ], "clarifying_question": null }, "actual": { "status": "requires_confirmation", "workflow": "grant_iam_role", "confidence": 0.93, "parameters": { "principal": "deploy-bot", "role": "editor", "scope": "reporting-project", "environment": "dev" }, "missing_fields": [], "candidate_workflows": [ { "workflow": "grant_iam_role", "confidence": 0.93 } ], "failure_reasons": [ "Workflow is high risk and requires human confirmation." ], "clarifying_question": null }, "pass_fail_notes": [ "status mismatch: expected routed, got requires_confirmation" ] } ] }