Spaces:
Running
Running
Expand benchmark dataset to 108 incidents
Browse files- README.md +9 -8
- incidents.py +105 -0
- openenv.yaml +6 -6
- tests/test_env.py +2 -1
README.md
CHANGED
|
@@ -22,13 +22,13 @@ The environment is built for the OpenEnv hackathon requirements:
|
|
| 22 |
|
| 23 |
## Overview
|
| 24 |
|
| 25 |
-
The dataset contains
|
| 26 |
|
| 27 |
| Task | Difficulty | Count | Objective |
|
| 28 |
|---|---|---:|---|
|
| 29 |
-
| `task1` | easy |
|
| 30 |
-
| `task2` | medium |
|
| 31 |
-
| `task3` | hard |
|
| 32 |
|
| 33 |
The incidents cover realistic production scenarios such as payment failures, queue backlogs, regional network loss, failed deploys, infrastructure saturation, third-party degradation, and failover decisions.
|
| 34 |
|
|
@@ -228,13 +228,14 @@ Latest local deterministic baseline:
|
|
| 228 |
|
| 229 |
| Metric | Value |
|
| 230 |
|---|---:|
|
| 231 |
-
| Episodes |
|
| 232 |
-
| Average score | 0.
|
| 233 |
| `task1` average | 1.0000 |
|
| 234 |
-
| `task2` average | 0.
|
| 235 |
| `task3` average | 1.0000 |
|
| 236 |
|
| 237 |
-
|
|
|
|
| 238 |
|
| 239 |
## Quick API Example
|
| 240 |
|
|
|
|
| 22 |
|
| 23 |
## Overview
|
| 24 |
|
| 25 |
+
The dataset contains 108 incidents across three task families:
|
| 26 |
|
| 27 |
| Task | Difficulty | Count | Objective |
|
| 28 |
|---|---|---:|---|
|
| 29 |
+
| `task1` | easy | 36 | Predict incident severity as `SEV1`, `SEV2`, or `SEV3` |
|
| 30 |
+
| `task2` | medium | 36 | Predict the most likely root cause domain |
|
| 31 |
+
| `task3` | hard | 36 | Predict the best immediate operational action |
|
| 32 |
|
| 33 |
The incidents cover realistic production scenarios such as payment failures, queue backlogs, regional network loss, failed deploys, infrastructure saturation, third-party degradation, and failover decisions.
|
| 34 |
|
|
|
|
| 228 |
|
| 229 |
| Metric | Value |
|
| 230 |
|---|---:|
|
| 231 |
+
| Episodes | 108 |
|
| 232 |
+
| Average score | 0.9954 |
|
| 233 |
| `task1` average | 1.0000 |
|
| 234 |
+
| `task2` average | 0.9861 |
|
| 235 |
| `task3` average | 1.0000 |
|
| 236 |
|
| 237 |
+
This deterministic local run completed in about `1.34s` on the current machine.
|
| 238 |
+
Results are written by default to `/tmp/outputs/baseline_scores.json`.
|
| 239 |
|
| 240 |
## Quick API Example
|
| 241 |
|
incidents.py
CHANGED
|
@@ -459,3 +459,108 @@ TICKETS = [
|
|
| 459 |
}
|
| 460 |
|
| 461 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
}
|
| 460 |
|
| 461 |
]
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
def _make_ticket(
    incident_id: str,
    task_type: str,
    alert_text: str,
    context: dict,
    expected_field: str,
    expected_value: str,
) -> dict:
    """Build one ticket dict from the fields of a seed row.

    Args:
        incident_id: Identifier such as ``"INC-037"``.
        task_type: Task family label, e.g. ``"task1"``.
        alert_text: Alert line describing the incident.
        context: Structured metadata that accompanies the alert.
        expected_field: Name of the ground-truth key, e.g. ``"severity"``.
        expected_value: Correct answer stored under ``expected_field``.

    Returns:
        A new dict with the keys ``incident_id``, ``task_type``,
        ``alert_text``, ``context`` and ``ground_truth``, where
        ``ground_truth`` is the single pair
        ``{expected_field: expected_value}``.
    """
    return {
        "incident_id": incident_id,
        "task_type": task_type,
        "alert_text": alert_text,
        # Shallow copy: mutating a ticket's context later must not write
        # through to the seed row the ticket was built from.
        "context": dict(context),
        "ground_truth": {expected_field: expected_value},
    }
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
# Additional task1 (severity) seed rows, INC-037 through INC-061.
# Row layout: (incident_id, alert_text, context, expected severity label).
# NOTE(review): context keys/types vary by row -- e.g. "user_impact" is the
# string "cosmetic" in INC-039 but False in INC-042, and INC-055 reports
# "http_500_rate" as the string "100%" rather than a numeric error_rate_pct.
# Confirm every consumer tolerates the mixed schema.
_EXPANDED_TASK1 = [
    ("INC-037", "[CRITICAL] Checkout API returning 502 for 58% of requests. Revenue impact confirmed during peak sale.", {"service": "checkout-api", "error_rate_pct": 58, "affected_users": 87000, "revenue_impact": True, "region": "us-west-2"}, "SEV1"),
    ("INC-038", "[WARNING] Billing dashboard latency elevated to 4200ms. 11% of invoice lookups timing out.", {"service": "billing-dashboard", "p95_latency_ms": 4200, "timeout_rate_pct": 11, "region": "eu-central-1", "revenue_impact": False}, "SEV2"),
    ("INC-039", "[INFO] Employee directory avatar placeholders rendering incorrectly. Internal only and purely cosmetic.", {"service": "employee-directory", "affected_users": "internal only", "user_impact": "cosmetic", "release": "2026.04.12"}, "SEV3"),
    ("INC-040", "[CRITICAL] Search edge returning errors globally. Error rate reached 73% across web and mobile clients.", {"service": "search-edge", "error_rate_pct": 73, "region": "global", "affected_channels": ["web", "mobile"], "on_call_notified": True}, "SEV1"),
    ("INC-041", "[WARNING] Refund worker backlog growing. 12,400 refund jobs delayed by roughly 27 minutes.", {"service": "refund-worker", "queue_backlog": 12400, "avg_delay_min": 27, "consumer_lag": "high", "revenue_impact": False}, "SEV2"),
    ("INC-042", "[INFO] Analytics dashboard legend text overlaps on one widget. No user-facing impact.", {"service": "analytics-dashboard", "impact": "cosmetic", "reported_by": "internal QA", "user_impact": False}, "SEV3"),
    ("INC-043", "[CRITICAL] Trading API rejecting 47% of high-value order placements. Revenue impact confirmed.", {"service": "trading-api", "error_rate_pct": 47, "affected_segment": "institutional", "revenue_impact": True, "region": "us-east-1"}, "SEV1"),
    ("INC-044", "[WARNING] Product catalog reads slowed to p99 3600ms. 9% of requests timing out in one region.", {"service": "catalog-read", "p99_latency_ms": 3600, "timeout_rate_pct": 9, "region": "ap-south-1", "cache_hit_rate": 61}, "SEV2"),
    ("INC-045", "[INFO] Settings page icon alignment shifted after theme update. Cosmetic only.", {"service": "settings-ui", "impact": "cosmetic", "affected_users": 430, "last_deploy": "20m ago"}, "SEV3"),
    ("INC-046", "[CRITICAL] OTP verification failures reached 41% across all regions. Login and checkout MFA both impacted.", {"service": "otp-service", "error_rate_pct": 41, "affected_flows": ["login", "checkout_mfa"], "region": "global", "on_call_notified": True}, "SEV1"),
    ("INC-047", "[WARNING] Partner sync jobs delayed by 19 minutes. Queue depth increasing but customer traffic remains steady.", {"service": "partner-sync", "queue_delay_min": 19, "queue_backlog": 3900, "revenue_impact": False, "region": "us-east-2"}, "SEV2"),
    ("INC-048", "[INFO] Internal admin export button style broke after CSS refactor. Internal only.", {"service": "admin-console", "affected_users": "internal only", "user_impact": "cosmetic", "last_deploy": "45m ago"}, "SEV3"),
    ("INC-049", "[CRITICAL] Reservation service timing out for 82% of hotel bookings. 95,000 users affected.", {"service": "reservation-service", "error_rate_pct": 82, "affected_users": 95000, "region": "eu-west-1", "revenue_impact": True}, "SEV1"),
    ("INC-050", "[WARNING] Session refresh tokens intermittently failing in one region. Error rate holding at 14%.", {"service": "session-service", "error_rate_pct": 14, "region": "sa-east-1", "affected_flows": ["token_refresh"], "revenue_impact": False}, "SEV2"),
    ("INC-051", "[INFO] Support knowledge-base thumbnails missing on article cards. No user-facing impact for customers.", {"service": "support-kb", "affected_users": "internal only", "impact": "cosmetic", "user_impact": False}, "SEV3"),
    ("INC-052", "[CRITICAL] Shipment label generation returning 500 for 52% of attempts. Revenue impact confirmed for same-day orders.", {"service": "label-service", "error_rate_pct": 52, "affected_segment": "same-day delivery", "revenue_impact": True, "region": "us-central-1"}, "SEV1"),
    ("INC-053", "[WARNING] Recommendation API latency spiked to 5100ms. 7% of requests timing out.", {"service": "recommendation-api", "p99_latency_ms": 5100, "timeout_rate_pct": 7, "region": "eu-north-1", "cache_status": "degraded"}, "SEV2"),
    ("INC-054", "[INFO] Marketing site promo banner shifted below the fold after a CSS tweak. Cosmetic only.", {"service": "marketing-site", "impact": "cosmetic", "last_deploy": "15m ago", "user_impact": "cosmetic"}, "SEV3"),
    ("INC-055", "[CRITICAL] Payroll export API failing for 100% of scheduled runs. Finance operations blocked.", {"service": "payroll-export", "http_500_rate": "100%", "affected_users": "internal finance", "region": "us-east-1", "on_call_notified": True}, "SEV1"),
    ("INC-056", "[WARNING] Loyalty points processor lagging behind by 14 minutes. Customers see delayed balances.", {"service": "loyalty-points", "avg_delay_min": 14, "affected_users_pct": 9, "revenue_impact": False, "region": "us-west-1"}, "SEV2"),
    ("INC-057", "[INFO] Observability annotation drawer not rendering markdown bullets correctly. No user-facing impact.", {"service": "observability-ui", "impact": "cosmetic", "user_impact": False, "reported_by": "internal SRE"}, "SEV3"),
    ("INC-058", "[CRITICAL] Wallet debit flow failing for 44% of attempts worldwide. Revenue dependency is high.", {"service": "wallet-debit", "error_rate_pct": 44, "region": "global", "revenue_dependency": "high", "affected_flows": ["wallet_topup", "wallet_pay"]}, "SEV1"),
    ("INC-059", "[WARNING] Fraud-score enrichment responses slowed significantly. 15% of requests timing out.", {"service": "fraud-enrichment", "timeout_rate_pct": 15, "p99_latency_ms": 3900, "region": "eu-west-2", "revenue_impact": False}, "SEV2"),
    ("INC-060", "[INFO] CMS preview page using fallback font after stylesheet cache miss. Purely cosmetic.", {"service": "cms-preview", "impact": "cosmetic", "user_impact": "cosmetic", "last_deploy": "1h ago"}, "SEV3"),
    ("INC-061", "[CRITICAL] Invoice generation failing for 65% of enterprise accounts. Revenue impact confirmed.", {"service": "invoice-engine", "error_rate_pct": 65, "affected_segment": "enterprise", "revenue_impact": True, "region": "us-east-1"}, "SEV1"),
]
|
| 508 |
+
|
| 509 |
+
# Additional task2 (root cause) seed rows, INC-062 through INC-085.
# Row layout: (incident_id, alert_text, context, expected root-cause domain).
# NOTE(review): INC-077 and INC-082 spell the answer "DATABASE" verbatim in
# alert_text; confirm that label leakage is intentional for the benchmark.
# NOTE(review): the database name is keyed "db" in some rows (INC-062,
# INC-072) and "database" in others (INC-067, INC-077, INC-082).
_EXPANDED_TASK2 = [
    ("INC-062", "[CRITICAL] PostgreSQL vacuum lag increasing rapidly. Connection pool pinned at 480/500 on primary.", {"db": "postgres-orders", "connection_pool": "480/500", "replica_lag_sec": 41, "slow_query_count": 122}, "DATABASE"),
    ("INC-063", "[CRITICAL] Packet loss 27% on transit link. Route instability causing inter-region request failures.", {"packet_loss_pct": 27, "route_state": "flapping", "affected_regions": ["us-east-1", "ca-central-1"], "traceroute": "drops at transit hop 5"}, "NETWORK"),
    ("INC-064", "[ERROR] Exception flood began after deploy. Stack trace points to inventory_rules.py.", {"service": "inventory-service", "exception": "IllegalStateException", "stack_trace": "inventory_rules.py:118", "last_deploy": "15m ago"}, "APPLICATION"),
    ("INC-065", "[WARNING] SendGrid webhook acknowledgements failing. Vendor status page reports degraded mail delivery.", {"vendor": "SendGrid", "webhook_failures": 540, "sendgrid_status": "degraded", "our_service_health": "healthy"}, "THIRD_PARTY"),
    ("INC-066", "[CRITICAL] Kubernetes cluster degraded. Node NotReady events triggered pod evictions in checkout namespace.", {"cluster": "prod-checkout", "nodes_not_ready": 4, "evicted_pods": 29, "namespace": "checkout", "cause": "EC2 maintenance"}, "INFRASTRUCTURE"),
    ("INC-067", "[CRITICAL] Database slow query storm detected. Write queries blocked behind lock contention on Postgres.", {"database": "customer-profile", "slow_query_count": 301, "write_queries_blocked": True, "replica_lag_sec": 19}, "DATABASE"),
    ("INC-068", "[CRITICAL] Cross-region API calls failing during route flap. Traceroute drops beyond the carrier edge.", {"cross_region_calls": "failing", "route_flap": True, "traceroute": "carrier edge timeout", "packet_loss_pct": 18}, "NETWORK"),
    ("INC-069", "[ERROR] Crash loop started immediately after deploy. Code path in tax_adapter raises a NullPointerException.", {"service": "tax-adapter", "crash_count": 37, "exception": "NullPointerException", "deploy_version": "2026.04.12.4"}, "APPLICATION"),
    ("INC-070", "[WARNING] Stripe external API returning 502s. Our workers healthy but payment confirmations delayed.", {"vendor": "Stripe", "external_api_errors": 502, "worker_health": "healthy", "webhook_queue": 220}, "THIRD_PARTY"),
    ("INC-071", "[CRITICAL] Node memory pressure spread across the analytics cluster after EC2 spot interruption.", {"cluster": "analytics-prod", "node_memory_pressure": True, "spot_interruption": True, "pods_pending": 41}, "INFRASTRUCTURE"),
    ("INC-072", "[CRITICAL] Postgres replica lag hit 88 seconds. Connection pool exhausted and disk spill rising.", {"db": "postgres-ledger", "replica_lag_sec": 88, "connection_pool": "500/500", "disk_spill": True}, "DATABASE"),
    ("INC-073", "[CRITICAL] BGP convergence instability causing packet loss on network fabric between edge POPs.", {"bgp_flap": True, "packet_loss_pct": 21, "affected_regions": ["sin", "mum"], "provider": "Equinix"}, "NETWORK"),
    ("INC-074", "[ERROR] TimeoutException rate surged after release. No infrastructure alarms fired.", {"service": "pricing-engine", "exception": "TimeoutException", "last_deploy": "8m ago", "infra_health": "normal"}, "APPLICATION"),
    ("INC-075", "[WARNING] Twilio callback deliveries failing. Vendor dashboard marked degraded for messaging webhooks.", {"vendor": "Twilio", "callback_failures": 610, "twilio_status": "degraded", "webhook_latency_ms": 2600}, "THIRD_PARTY"),
    ("INC-076", "[CRITICAL] Cluster autoscaler could not recover capacity. Multiple Node NotReady alerts remain active.", {"cluster": "media-prod", "nodes_not_ready": 5, "pod_restarts": 84, "autoscaler_state": "stalled"}, "INFRASTRUCTURE"),
    ("INC-077", "[WARNING] DATABASE CPU saturation causing slow query growth after migration batch kicked off.", {"database": "ledger-db", "db_cpu_pct": 94, "slow_query_count": 143, "migration_job": "running"}, "DATABASE"),
    ("INC-078", "[CRITICAL] Traceroute drops observed at transit hop 8. Cross-region route instability persists.", {"traceroute": "drops at transit hop 8", "packet_loss_pct": 24, "cross_region_errors": 1900, "network_team_paged": True}, "NETWORK"),
    ("INC-079", "[ERROR] Random crash reports tied to auth middleware. Stack trace repeats in the same code path.", {"service": "auth-gateway", "crash_logs": True, "stack_trace": "auth_middleware.py:52", "deploy": "none"}, "APPLICATION"),
    ("INC-080", "[WARNING] External API vendor degraded. Webhook retries growing while our services remain healthy.", {"vendor": "Mapbox", "external_api_status": "degraded", "webhook_retries": 1180, "our_service_health": "healthy"}, "THIRD_PARTY"),
    ("INC-081", "[CRITICAL] Kubernetes control plane healthy, but worker cluster showing NotReady nodes and pod evictions.", {"cluster": "streaming-prod", "nodes_not_ready": 3, "evicted_pods": 24, "namespace": "transcoding"}, "INFRASTRUCTURE"),
    ("INC-082", "[CRITICAL] Write queries spilling to disk on DATABASE primary. Slow query count jumped above 400.", {"database": "orders-db", "write_queries": "spilling", "slow_query_count": 417, "replica_lag_sec": 33}, "DATABASE"),
    ("INC-083", "[CRITICAL] Network route instability causing 31% packet loss on partner edge connectivity.", {"route_state": "unstable", "packet_loss_pct": 31, "partner_edge": "degraded", "traceroute": "loss after peering route"}, "NETWORK"),
    ("INC-084", "[ERROR] Deploy triggered exception burst in settlement service. Stack trace points to a new code branch.", {"service": "settlement-service", "exception": "ValueError", "stack_trace": "settlement_flow.py:211", "last_deploy": "12m ago"}, "APPLICATION"),
    ("INC-085", "[WARNING] Vendor outage degrading payment gateway callbacks. Stripe webhook backlog keeps climbing.", {"vendor": "Stripe", "webhook_backlog": 930, "stripe_status": "incident", "our_service_health": "healthy"}, "THIRD_PARTY"),
]
|
| 535 |
+
|
| 536 |
+
# Additional task3 (operational action) seed rows, INC-086 through INC-108.
# Row layout: (incident_id, alert_text, context, expected action label).
# NOTE(review): several alert texts name the expected action outright
# (e.g. "Rollback" in INC-086/INC-100/INC-107, "Failover" in INC-096);
# confirm that is intentional for the benchmark.
_EXPANDED_TASK3 = [
    ("INC-086", "[CRITICAL] Error rate spiked immediately after deploy. Previous stable image already validated. Rollback window open.", {"service": "checkout-api", "recent_deploy_caused": True, "previous_stable": "2026.04.10.1", "rollback_tested": True}, "ROLLBACK"),
    ("INC-087", "[WARNING] Search queue depth exploded during flash sale. CPU pinned and autoscaler already at max_replicas.", {"service": "search-aggregator", "queue_depth": 18200, "cpu_pct": 97, "max_replicas": 24, "event": "flash sale"}, "SCALE_UP"),
    ("INC-088", "[ERROR] Inventory worker deadlock detected. Process not responding and health check failing continuously.", {"service": "inventory-worker", "deadlock_detected": True, "health_check": "failing", "process_state": "not responding"}, "RESTART_SERVICE"),
    ("INC-089", "[CRITICAL] Primary database down. Read replica healthy but writes failing across all tenants.", {"db": "tenant-primary", "primary_down": True, "read_replica": "healthy", "writes_failing": True}, "FAILOVER"),
    ("INC-090", "[WARNING] Stripe vendor incident causing 429s and webhook retries. Local infra healthy.", {"vendor": "Stripe", "webhook_retries": 760, "stripe_status": "degraded", "our_system": "healthy"}, "NOTIFY_VENDOR"),
    ("INC-091", "[INFO] Minor UI glitch on profile badges after CSS tweak. Cosmetic issue only.", {"service": "profile-ui", "impact": "cosmetic", "reported_users": 27}, "NO_ACTION"),
    ("INC-092", "[WARNING] Latency and retries increased gradually without a single obvious trigger. Mixed signals across services.", {"service": "document-renderer", "latency_ms": 2400, "retry_rate_pct": 8, "synthetic_probe_status": "passing"}, "INVESTIGATE"),
    ("INC-093", "[CRITICAL] Immediately after deploy, checkout failures jumped to 61%. Previous stable release is available.", {"service": "checkout-core", "immediately_after_deploy": True, "previous_stable": "2026.04.11.2", "error_rate_pct": 61}, "ROLLBACK"),
    ("INC-094", "[WARNING] Queue backlog growing fast during traffic spike. CPU saturated and autoscaler at MAX_REPLICAS.", {"service": "feed-generator", "traffic_spike": True, "queue_depth": 9700, "cpu_pct": 95, "max_replicas": 18}, "SCALE_UP"),
    ("INC-095", "[ERROR] Recommendation worker appears stuck. No response on health check endpoint for 11 minutes.", {"service": "recommendation-worker", "stuck": True, "health_check": "failing", "last_response": "11m ago"}, "RESTART_SERVICE"),
    ("INC-096", "[CRITICAL] Primary RDS unreachable. Failover has not happened yet and writes are failing.", {"db": "orders-rds-primary", "primary_rds": "unreachable", "failover": "pending", "writes_failing": True}, "FAILOVER"),
    ("INC-097", "[WARNING] Twilio vendor API returning intermittent errors. Our retry workers remain healthy.", {"vendor": "Twilio", "api_errors_pct": 23, "retry_workers": "healthy", "callback_delay_sec": 90}, "NOTIFY_VENDOR"),
    ("INC-098", "[INFO] Minor UI glitch on internal reporting theme toggle. Cosmetic only.", {"service": "reporting-ui", "impact": "cosmetic", "affected_users": "internal only"}, "NO_ACTION"),
    ("INC-099", "[WARNING] Slow degradation observed with mixed latency and retry symptoms. No single fix stands out yet.", {"service": "export-service", "latency_ms": 2800, "retry_rate_pct": 6, "error_rate_pct": 2}, "INVESTIGATE"),
    ("INC-100", "[CRITICAL] Recent deploy caused auth failures and rollback candidate already passed smoke tests.", {"service": "auth-api", "recent_deploy_caused": True, "rollback_tested": True, "previous_stable": "2026.04.11.6"}, "ROLLBACK"),
    ("INC-101", "[WARNING] CPU at 99% with queue growth on image pipeline. Autoscaler already capped at max replicas.", {"service": "image-pipeline", "cpu_pct": 99, "queue_depth": 11100, "autoscaler": "max_replicas"}, "SCALE_UP"),
    ("INC-102", "[ERROR] Scheduler deadlock detected. Process not responding to health check and jobs are stalled.", {"service": "scheduler", "deadlock_detected": True, "health_check": "failing", "job_backlog": 4400}, "RESTART_SERVICE"),
    ("INC-103", "[CRITICAL] Read replica healthy while primary down. Writes failing and customer operations blocked.", {"db": "ledger-primary", "read_replica": "healthy", "primary_down": True, "writes_failing": True}, "FAILOVER"),
    ("INC-104", "[WARNING] SendGrid vendor degradation causing delivery failures for transactional mail.", {"vendor": "SendGrid", "delivery_failures": 1300, "sendgrid_status": "investigating", "our_infra": "healthy"}, "NOTIFY_VENDOR"),
    ("INC-105", "[INFO] Cosmetic issue on loyalty badge colors after stylesheet refresh. Minor UI glitch only.", {"service": "loyalty-ui", "impact": "cosmetic", "reported_users": 61}, "NO_ACTION"),
    ("INC-106", "[WARNING] Memory usage trending upward slowly, but service still responds. Root cause not isolated yet.", {"service": "document-cache", "memory_growth": True, "probe_status": "passing", "error_rate_pct": 1}, "INVESTIGATE"),
    ("INC-107", "[CRITICAL] Recent deploy caused 54% login failures. Previous stable artifact is ready for rollback.", {"service": "login-orchestrator", "recent_deploy_caused": True, "error_rate_pct": 54, "previous_stable": "2026.04.10.9"}, "ROLLBACK"),
    ("INC-108", "[WARNING] Traffic spike from campaign launch pushed CPU to 92%. Queue depth climbing and autoscaler at max_replicas.", {"service": "campaign-router", "traffic_spike": True, "cpu_pct": 92, "queue_depth": 8600, "max_replicas": 16}, "SCALE_UP"),
]
|
| 561 |
+
|
| 562 |
+
# Append the expanded seed rows to TICKETS as full ticket dicts.
# One (task_type, ground-truth field, rows) entry per task family replaces
# three copy-pasted comprehensions that differed only in those values.
# A list (not a generator) is built first so TICKETS is left untouched if
# any row fails to unpack into its four fields.
TICKETS.extend(
    [
        _make_ticket(incident_id, task_type, alert_text, context, expected_field, expected_value)
        for task_type, expected_field, rows in (
            ("task1", "severity", _EXPANDED_TASK1),
            ("task2", "root_cause", _EXPANDED_TASK2),
            ("task3", "action", _EXPANDED_TASK3),
        )
        for incident_id, alert_text, context, expected_value in rows
    ]
)
|
openenv.yaml
CHANGED
|
@@ -83,18 +83,18 @@ tasks:
|
|
| 83 |
reward: "1.0 exact | 0.4 safe investigate fallback | 0.25 related action | 0.0 wrong"
|
| 84 |
|
| 85 |
dataset:
|
| 86 |
-
total_tickets:
|
| 87 |
split:
|
| 88 |
-
task1:
|
| 89 |
-
task2:
|
| 90 |
-
task3:
|
| 91 |
|
| 92 |
baseline:
|
| 93 |
script: inference.py
|
| 94 |
required_env_vars: [API_BASE_URL, MODEL_NAME, HF_TOKEN]
|
| 95 |
optional_env_vars: [ENV_URL]
|
| 96 |
-
latest_local_score: 0.
|
| 97 |
-
latest_local_episodes:
|
| 98 |
|
| 99 |
reproducibility:
|
| 100 |
inference_temperature: 0.0
|
|
|
|
| 83 |
reward: "1.0 exact | 0.4 safe investigate fallback | 0.25 related action | 0.0 wrong"
|
| 84 |
|
| 85 |
dataset:
|
| 86 |
+
total_tickets: 108
|
| 87 |
split:
|
| 88 |
+
task1: 36
|
| 89 |
+
task2: 36
|
| 90 |
+
task3: 36
|
| 91 |
|
| 92 |
baseline:
|
| 93 |
script: inference.py
|
| 94 |
required_env_vars: [API_BASE_URL, MODEL_NAME, HF_TOKEN]
|
| 95 |
optional_env_vars: [ENV_URL]
|
| 96 |
+
latest_local_score: 0.9954
|
| 97 |
+
latest_local_episodes: 108
|
| 98 |
|
| 99 |
reproducibility:
|
| 100 |
inference_temperature: 0.0
|
tests/test_env.py
CHANGED
|
@@ -4,6 +4,7 @@ from fastapi.testclient import TestClient
|
|
| 4 |
|
| 5 |
from app import app, completed_states, sessions
|
| 6 |
from environment import IncidentEnv, validate_ticket_dataset
|
|
|
|
| 7 |
from models import IncidentAction, IncidentState, TaskType
|
| 8 |
|
| 9 |
|
|
@@ -45,7 +46,7 @@ class IncidentEnvApiTests(unittest.TestCase):
|
|
| 45 |
response = self.client.get("/tickets")
|
| 46 |
self.assertEqual(response.status_code, 200)
|
| 47 |
body = response.json()
|
| 48 |
-
self.assertEqual(body["count"],
|
| 49 |
self.assertEqual(body["tickets"][0]["incident_id"], "INC-001")
|
| 50 |
self.assertIn("expected_field", body["tickets"][0])
|
| 51 |
self.assertNotIn("ground_truth", body["tickets"][0])
|
|
|
|
| 4 |
|
| 5 |
from app import app, completed_states, sessions
|
| 6 |
from environment import IncidentEnv, validate_ticket_dataset
|
| 7 |
+
from incidents import TICKETS
|
| 8 |
from models import IncidentAction, IncidentState, TaskType
|
| 9 |
|
| 10 |
|
|
|
|
| 46 |
response = self.client.get("/tickets")
|
| 47 |
self.assertEqual(response.status_code, 200)
|
| 48 |
body = response.json()
|
| 49 |
+
self.assertEqual(body["count"], len(TICKETS))
|
| 50 |
self.assertEqual(body["tickets"][0]["incident_id"], "INC-001")
|
| 51 |
self.assertIn("expected_field", body["tickets"][0])
|
| 52 |
self.assertNotIn("ground_truth", body["tickets"][0])
|