# incident-triage-env / incidents.py
# Author: XcodeAddy
# Commit 4b84bac: Expand benchmark dataset to 108 incidents
#----- Edited file--------------
# incidents.py
# Hand-written benchmark tickets (INC-001 .. INC-036).
#
# Every entry is a dict with the same five keys:
#   "incident_id"  - unique "INC-NNN" string
#   "task_type"    - "task1", "task2" or "task3"
#   "alert_text"   - the alert message for the incident
#   "context"      - free-form dict of supporting signals; keys vary per incident
#   "ground_truth" - single-key dict holding the expected answer; the key
#                    follows task_type:
#                      task1 -> "severity"   (SEV1 / SEV2 / SEV3)
#                      task2 -> "root_cause" (DATABASE / NETWORK / APPLICATION /
#                                             THIRD_PARTY / INFRASTRUCTURE)
#                      task3 -> "action"     (ROLLBACK / SCALE_UP / RESTART_SERVICE /
#                                             FAILOVER / NOTIFY_VENDOR / INVESTIGATE /
#                                             NO_ACTION)
# The value sets listed above are the ones that occur in this literal.
TICKETS = [
# TASK 1: Severity Classification
{
"incident_id": "INC-001",
"task_type": "task1",
"alert_text": "[CRITICAL] Payment service returning HTTP 503. Error rate: 94%. Affected users: ~120,000. Revenue impact confirmed.",
"context": {
"service": "payment-service",
"error_rate_pct": 94,
"affected_users": 120000,
"region": "us-east-1",
"last_deploy": "2h ago",
"on_call_notified": True
},
"ground_truth": {"severity": "SEV1"}
},
{
"incident_id": "INC-002",
"task_type": "task1",
"alert_text": "[WARNING] Checkout latency elevated. p99 response time: 4800ms (threshold: 2000ms). 18% of requests timing out.",
"context": {
"service": "checkout-service",
"p99_latency_ms": 4800,
"timeout_rate_pct": 18,
"db_connections": "82/100",
"region": "eu-west-1"
},
"ground_truth": {"severity": "SEV2"}
},
{
"incident_id": "INC-003",
"task_type": "task1",
"alert_text": "[INFO] Admin dashboard CSS assets returning 404. Static file path misconfigured after deploy.",
"context": {
"service": "admin-ui",
"affected_users": "internal only",
"error_type": "404 on /static/main.css",
"last_deploy": "30m ago",
"user_impact": "cosmetic"
},
"ground_truth": {"severity": "SEV3"}
},
{
"incident_id": "INC-004",
"task_type": "task1",
"alert_text": "[CRITICAL] Auth service down. All login attempts failing with 500. SSO token validation endpoint unreachable.",
"context": {
"service": "auth-service",
"http_500_rate": "100%",
"affected_flows": ["login", "token_refresh", "SSO"],
"pod_status": "CrashLoopBackOff",
"region": "global"
},
"ground_truth": {"severity": "SEV1"}
},
{
"incident_id": "INC-005",
"task_type": "task1",
"alert_text": "[WARNING] Notification service email queue backlog growing. 14,000 emails pending. Delivery delay: ~22 minutes.",
"context": {
"service": "notification-service",
"queue_backlog": 14000,
"avg_delay_min": 22,
"consumer_lag": "high",
"revenue_impact": False
},
"ground_truth": {"severity": "SEV2"}
},
# TASK 2: Root Cause Classification
{
"incident_id": "INC-006",
"task_type": "task2",
"alert_text": "[CRITICAL] PostgreSQL replica lag: 94 seconds. Write queries spilling to disk. Connection pool exhausted on primary.",
"context": {
"db": "postgres-primary",
"replica_lag_sec": 94,
"connection_pool": "500/500",
"disk_spill": True,
"slow_query_count": 312
},
"ground_truth": {"root_cause": "DATABASE"}
},
{
"incident_id": "INC-007",
"task_type": "task2",
"alert_text": "[CRITICAL] Packet loss 38% between us-east-1 and eu-west-1. Cross-region API calls failing. BGP route flapping detected.",
"context": {
"packet_loss_pct": 38,
"affected_regions": ["us-east-1", "eu-west-1"],
"bgp_flap": True,
"provider": "AWS",
"traceroute": "drops at transit hop 7"
},
"ground_truth": {"root_cause": "NETWORK"}
},
{
"incident_id": "INC-008",
"task_type": "task2",
"alert_text": "[ERROR] NullPointerException in order-processing-service. Stack trace points to discount_calculator.py line 84. Deploy happened 40min ago.",
"context": {
"service": "order-processing",
"exception": "NullPointerException",
"file": "discount_calculator.py",
"line": 84,
"last_deploy": "40min ago",
"git_commit": "a3f9c21"
},
"ground_truth": {"root_cause": "APPLICATION"}
},
{
"incident_id": "INC-009",
"task_type": "task2",
"alert_text": "[WARNING] Stripe webhook delivery failures spiking. 503s from Stripe API. Stripe status page shows degraded payment processing.",
"context": {
"vendor": "Stripe",
"webhook_failures": 840,
"stripe_status": "degraded",
"our_service_health": "healthy",
"stripe_status_url": "https://status.stripe.com"
},
"ground_truth": {"root_cause": "THIRD_PARTY"}
},
{
"incident_id": "INC-010",
"task_type": "task2",
"alert_text": "[CRITICAL] Node group in Kubernetes cluster terminated. 6/10 worker nodes NotReady. Pods evicted across analytics namespace.",
"context": {
"cluster": "prod-k8s-us-east",
"nodes_not_ready": 6,
"total_nodes": 10,
"evicted_pods": 47,
"namespace": "analytics",
"cause": "EC2 spot interruption"
},
"ground_truth": {"root_cause": "INFRASTRUCTURE"}
},
# TASK 3: Recommended Action
{
"incident_id": "INC-011",
"task_type": "task3",
"alert_text": "[CRITICAL] API error rate jumped from 0.2% to 67% immediately after deploy v2.4.1. Rollback candidate identified.",
"context": {
"service": "api-gateway",
"error_rate_before": "0.2%",
"error_rate_after": "67%",
"deploy_version": "v2.4.1",
"previous_stable": "v2.4.0",
"rollback_tested": True
},
"ground_truth": {"action": "ROLLBACK"}
},
{
"incident_id": "INC-012",
"task_type": "task3",
"alert_text": "[WARNING] Search service CPU at 98%. Request queue growing. Pod autoscaler at max replicas. Flash sale traffic spike ongoing.",
"context": {
"service": "search-service",
"cpu_pct": 98,
"current_replicas": 20,
"max_replicas_configured": 20,
"queue_depth": 9400,
"event": "flash sale"
},
"ground_truth": {"action": "SCALE_UP"}
},
{
"incident_id": "INC-013",
"task_type": "task3",
"alert_text": "[ERROR] Worker service stuck in deadlock. Memory usage flat at 99%. Process not responding to health checks. No deploy in 6 days.",
"context": {
"service": "background-worker",
"memory_pct": 99,
"health_check": "failing",
"last_deploy_days_ago": 6,
"deadlock_detected": True
},
"ground_truth": {"action": "RESTART_SERVICE"}
},
{
"incident_id": "INC-014",
"task_type": "task3",
"alert_text": "[CRITICAL] Primary RDS instance unresponsive. Failover to read replica not yet triggered. Data writes failing across all services.",
"context": {
"db": "rds-postgres-primary",
"status": "unresponsive",
"read_replica": "healthy",
"auto_failover": "disabled",
"write_failure_rate": "100%"
},
"ground_truth": {"action": "FAILOVER"}
},
{
"incident_id": "INC-015",
"task_type": "task3",
"alert_text": "[WARNING] SendGrid bounce rate at 34% for transactional emails. Delivery failures concentrated on @yahoo.com domains. No infra changes.",
"context": {
"vendor": "SendGrid",
"bounce_rate_pct": 34,
"affected_domains": ["yahoo.com"],
"our_infra_changes": False,
"sendgrid_status": "investigating"
},
"ground_truth": {"action": "NOTIFY_VENDOR"}
},
# NOTE: INC-016 is a task1 (severity) ticket even though it follows the
# Task 3 entries. Its alert is tagged [INFO] while the ground truth is SEV1.
{
"incident_id": "INC-016",
"task_type": "task1",
"alert_text": "[INFO] Cart service intermittently failing for premium users only. Error rate: 12%.",
"context": {
"service": "cart-service",
"error_rate_pct": 12,
"affected_segment": "premium users",
"revenue_dependency": "high",
"region": "global"
},
"ground_truth": {"severity": "SEV1"}
},
# TASK 1: Severity (Ambiguous + Edge)
{
"incident_id": "INC-017",
"task_type": "task1",
"alert_text": "[WARNING] API latency increased to 3.2s. Error rate low (2%) but affecting checkout flow.",
"context": {
"service": "api-service",
"latency_ms": 3200,
"error_rate_pct": 2,
"business_impact": "checkout delay"
},
"ground_truth": {"severity": "SEV2"}
},
{
"incident_id": "INC-018",
"task_type": "task1",
"alert_text": "[CRITICAL] Cart service failing for 40% users. Premium users impacted more. Revenue drop observed.",
"context": {
"error_rate_pct": 40,
"affected_segment": "premium",
"revenue_impact": True
},
"ground_truth": {"severity": "SEV1"}
},
{
"incident_id": "INC-019",
"task_type": "task1",
"alert_text": "[INFO] Logging service delay in ingestion pipeline. No user-facing impact.",
"context": {
"service": "logging",
"delay_sec": 120,
"user_impact": False
},
"ground_truth": {"severity": "SEV3"}
},
# TASK 2: Root Cause (Confusing Signals)
{
"incident_id": "INC-020",
"task_type": "task2",
"alert_text": "[CRITICAL] API failures with DB latency high and packet loss observed.",
"context": {
"db_latency_ms": 2800,
"packet_loss_pct": 15,
"recent_deploy": False
},
"ground_truth": {"root_cause": "NETWORK"}
},
{
"incident_id": "INC-021",
"task_type": "task2",
"alert_text": "[ERROR] Service throwing timeout exceptions. No infra alerts. Code deployed 10 mins ago.",
"context": {
"exception": "TimeoutException",
"deploy_time": "10m ago",
"infra_health": "normal"
},
"ground_truth": {"root_cause": "APPLICATION"}
},
{
"incident_id": "INC-022",
"task_type": "task2",
"alert_text": "[WARNING] DB CPU high and slow queries increasing gradually.",
"context": {
"db_cpu_pct": 92,
"slow_queries": 210,
"replica_lag": 5
},
"ground_truth": {"root_cause": "DATABASE"}
},
{
"incident_id": "INC-023",
"task_type": "task2",
"alert_text": "[CRITICAL] Multiple pods evicted. Node memory pressure warnings.",
"context": {
"pods_evicted": 30,
"node_memory_pressure": True,
"cluster_health": "degraded"
},
"ground_truth": {"root_cause": "INFRASTRUCTURE"}
},
# TASK 3: Action (Ambiguous Decisions)
{
"incident_id": "INC-024",
"task_type": "task3",
"alert_text": "[WARNING] CPU high but traffic spike detected. Autoscaling already active.",
"context": {
"cpu_pct": 90,
"traffic_spike": True,
"autoscaling": "active"
},
"ground_truth": {"action": "SCALE_UP"}
},
{
"incident_id": "INC-025",
"task_type": "task3",
"alert_text": "[ERROR] New deploy caused minor errors (5%). System stable otherwise.",
"context": {
"error_rate": 5,
"deploy": "recent",
"system_stability": "mostly stable"
},
"ground_truth": {"action": "INVESTIGATE"}
},
{
"incident_id": "INC-026",
"task_type": "task3",
"alert_text": "[CRITICAL] Service stuck. No response. Health checks failing continuously.",
"context": {
"health_check": "failing",
"response": "none",
"deploy": "old"
},
"ground_truth": {"action": "RESTART_SERVICE"}
},
{
"incident_id": "INC-027",
"task_type": "task3",
"alert_text": "[WARNING] Vendor API returning intermittent failures.",
"context": {
"vendor": "Twilio",
"failure_rate": 18,
"our_system": "healthy"
},
"ground_truth": {"action": "NOTIFY_VENDOR"}
},
{
"incident_id": "INC-028",
"task_type": "task3",
"alert_text": "[CRITICAL] DB primary down, replica healthy.",
"context": {
"primary_status": "down",
"replica": "healthy",
"writes": "failing"
},
"ground_truth": {"action": "FAILOVER"}
},
# HARD CASES (REAL THINKING)
# This section mixes task1, task2 and task3 tickets; rely on each entry's
# "task_type" rather than its position in the list.
{
"incident_id": "INC-029",
"task_type": "task3",
"alert_text": "[WARNING] Latency increased after deploy but no errors observed.",
"context": {
"latency": 2500,
"error_rate": 0,
"deploy": "recent"
},
"ground_truth": {"action": "INVESTIGATE"}
},
{
"incident_id": "INC-030",
"task_type": "task2",
"alert_text": "[CRITICAL] Failures observed. External API slow and DB connections also high.",
"context": {
"external_api_latency": 3000,
"db_connections": "95%",
"recent_deploy": False
},
"ground_truth": {"root_cause": "THIRD_PARTY"}
},
{
"incident_id": "INC-031",
"task_type": "task1",
"alert_text": "[WARNING] Partial outage in recommendation engine. Affects 10% users.",
"context": {
"affected_users_pct": 10,
"service": "recommendation",
"revenue_impact": "low"
},
"ground_truth": {"severity": "SEV2"}
},
{
"incident_id": "INC-032",
"task_type": "task2",
"alert_text": "[ERROR] Random crashes in service. No infra issues. No recent deploy.",
"context": {
"crash_logs": True,
"infra_health": "good",
"deploy": "none"
},
"ground_truth": {"root_cause": "APPLICATION"}
},
{
"incident_id": "INC-033",
"task_type": "task3",
"alert_text": "[INFO] Minor UI glitch reported by users.",
"context": {
"impact": "cosmetic",
"users_affected": 50
},
"ground_truth": {"action": "NO_ACTION"}
},
{
"incident_id": "INC-034",
"task_type": "task1",
"alert_text": "[CRITICAL] Login failures spike to 70% but only in one region.",
"context": {
"failure_rate": 70,
"region": "ap-south-1",
"global_impact": False
},
"ground_truth": {"severity": "SEV1"}
},
{
"incident_id": "INC-035",
"task_type": "task2",
"alert_text": "[WARNING] Increased retries and timeouts. Network stable. DB stable.",
"context": {
"timeouts": True,
"network": "stable",
"db": "stable"
},
"ground_truth": {"root_cause": "APPLICATION"}
},
{
"incident_id": "INC-036",
"task_type": "task3",
"alert_text": "[WARNING] Memory leak suspected. Service degrading slowly.",
"context": {
"memory_growth": True,
"crash": False,
"impact": "gradual"
},
"ground_truth": {"action": "INVESTIGATE"}
}
]
def _make_ticket(
    incident_id: str,
    task_type: str,
    alert_text: str,
    context: dict,
    expected_field: str,
    expected_value: str,
) -> dict:
    """Build one ticket dict in the same shape as the entries of ``TICKETS``.

    Args:
        incident_id: Identifier stored under ``"incident_id"``.
        task_type: Task label stored under ``"task_type"``.
        alert_text: Alert message stored under ``"alert_text"``.
        context: Supporting signals stored under ``"context"``. The dict is
            stored by reference, not copied.
        expected_field: Name of the single key placed in ``"ground_truth"``.
        expected_value: Expected answer stored under ``expected_field``.

    Returns:
        A new dict with the keys ``"incident_id"``, ``"task_type"``,
        ``"alert_text"``, ``"context"`` and ``"ground_truth"``, in that order.
    """
    return {
        "incident_id": incident_id,
        "task_type": task_type,
        "alert_text": alert_text,
        "context": context,
        "ground_truth": {expected_field: expected_value},
    }
# Additional task1 (severity) rows, INC-037 .. INC-061.
# Each row is a 4-tuple: (incident_id, alert_text, context dict, expected severity).
# The rows are turned into full ticket dicts via _make_ticket when TICKETS is
# extended further down in this file.
_EXPANDED_TASK1 = [
("INC-037", "[CRITICAL] Checkout API returning 502 for 58% of requests. Revenue impact confirmed during peak sale.", {"service": "checkout-api", "error_rate_pct": 58, "affected_users": 87000, "revenue_impact": True, "region": "us-west-2"}, "SEV1"),
("INC-038", "[WARNING] Billing dashboard latency elevated to 4200ms. 11% of invoice lookups timing out.", {"service": "billing-dashboard", "p95_latency_ms": 4200, "timeout_rate_pct": 11, "region": "eu-central-1", "revenue_impact": False}, "SEV2"),
("INC-039", "[INFO] Employee directory avatar placeholders rendering incorrectly. Internal only and purely cosmetic.", {"service": "employee-directory", "affected_users": "internal only", "user_impact": "cosmetic", "release": "2026.04.12"}, "SEV3"),
("INC-040", "[CRITICAL] Search edge returning errors globally. Error rate reached 73% across web and mobile clients.", {"service": "search-edge", "error_rate_pct": 73, "region": "global", "affected_channels": ["web", "mobile"], "on_call_notified": True}, "SEV1"),
("INC-041", "[WARNING] Refund worker backlog growing. 12,400 refund jobs delayed by roughly 27 minutes.", {"service": "refund-worker", "queue_backlog": 12400, "avg_delay_min": 27, "consumer_lag": "high", "revenue_impact": False}, "SEV2"),
("INC-042", "[INFO] Analytics dashboard legend text overlaps on one widget. No user-facing impact.", {"service": "analytics-dashboard", "impact": "cosmetic", "reported_by": "internal QA", "user_impact": False}, "SEV3"),
("INC-043", "[CRITICAL] Trading API rejecting 47% of high-value order placements. Revenue impact confirmed.", {"service": "trading-api", "error_rate_pct": 47, "affected_segment": "institutional", "revenue_impact": True, "region": "us-east-1"}, "SEV1"),
("INC-044", "[WARNING] Product catalog reads slowed to p99 3600ms. 9% of requests timing out in one region.", {"service": "catalog-read", "p99_latency_ms": 3600, "timeout_rate_pct": 9, "region": "ap-south-1", "cache_hit_rate": 61}, "SEV2"),
("INC-045", "[INFO] Settings page icon alignment shifted after theme update. Cosmetic only.", {"service": "settings-ui", "impact": "cosmetic", "affected_users": 430, "last_deploy": "20m ago"}, "SEV3"),
("INC-046", "[CRITICAL] OTP verification failures reached 41% across all regions. Login and checkout MFA both impacted.", {"service": "otp-service", "error_rate_pct": 41, "affected_flows": ["login", "checkout_mfa"], "region": "global", "on_call_notified": True}, "SEV1"),
("INC-047", "[WARNING] Partner sync jobs delayed by 19 minutes. Queue depth increasing but customer traffic remains steady.", {"service": "partner-sync", "queue_delay_min": 19, "queue_backlog": 3900, "revenue_impact": False, "region": "us-east-2"}, "SEV2"),
("INC-048", "[INFO] Internal admin export button style broke after CSS refactor. Internal only.", {"service": "admin-console", "affected_users": "internal only", "user_impact": "cosmetic", "last_deploy": "45m ago"}, "SEV3"),
("INC-049", "[CRITICAL] Reservation service timing out for 82% of hotel bookings. 95,000 users affected.", {"service": "reservation-service", "error_rate_pct": 82, "affected_users": 95000, "region": "eu-west-1", "revenue_impact": True}, "SEV1"),
("INC-050", "[WARNING] Session refresh tokens intermittently failing in one region. Error rate holding at 14%.", {"service": "session-service", "error_rate_pct": 14, "region": "sa-east-1", "affected_flows": ["token_refresh"], "revenue_impact": False}, "SEV2"),
("INC-051", "[INFO] Support knowledge-base thumbnails missing on article cards. No user-facing impact for customers.", {"service": "support-kb", "affected_users": "internal only", "impact": "cosmetic", "user_impact": False}, "SEV3"),
("INC-052", "[CRITICAL] Shipment label generation returning 500 for 52% of attempts. Revenue impact confirmed for same-day orders.", {"service": "label-service", "error_rate_pct": 52, "affected_segment": "same-day delivery", "revenue_impact": True, "region": "us-central-1"}, "SEV1"),
("INC-053", "[WARNING] Recommendation API latency spiked to 5100ms. 7% of requests timing out.", {"service": "recommendation-api", "p99_latency_ms": 5100, "timeout_rate_pct": 7, "region": "eu-north-1", "cache_status": "degraded"}, "SEV2"),
("INC-054", "[INFO] Marketing site promo banner shifted below the fold after a CSS tweak. Cosmetic only.", {"service": "marketing-site", "impact": "cosmetic", "last_deploy": "15m ago", "user_impact": "cosmetic"}, "SEV3"),
("INC-055", "[CRITICAL] Payroll export API failing for 100% of scheduled runs. Finance operations blocked.", {"service": "payroll-export", "http_500_rate": "100%", "affected_users": "internal finance", "region": "us-east-1", "on_call_notified": True}, "SEV1"),
("INC-056", "[WARNING] Loyalty points processor lagging behind by 14 minutes. Customers see delayed balances.", {"service": "loyalty-points", "avg_delay_min": 14, "affected_users_pct": 9, "revenue_impact": False, "region": "us-west-1"}, "SEV2"),
("INC-057", "[INFO] Observability annotation drawer not rendering markdown bullets correctly. No user-facing impact.", {"service": "observability-ui", "impact": "cosmetic", "user_impact": False, "reported_by": "internal SRE"}, "SEV3"),
("INC-058", "[CRITICAL] Wallet debit flow failing for 44% of attempts worldwide. Revenue dependency is high.", {"service": "wallet-debit", "error_rate_pct": 44, "region": "global", "revenue_dependency": "high", "affected_flows": ["wallet_topup", "wallet_pay"]}, "SEV1"),
("INC-059", "[WARNING] Fraud-score enrichment responses slowed significantly. 15% of requests timing out.", {"service": "fraud-enrichment", "timeout_rate_pct": 15, "p99_latency_ms": 3900, "region": "eu-west-2", "revenue_impact": False}, "SEV2"),
("INC-060", "[INFO] CMS preview page using fallback font after stylesheet cache miss. Purely cosmetic.", {"service": "cms-preview", "impact": "cosmetic", "user_impact": "cosmetic", "last_deploy": "1h ago"}, "SEV3"),
("INC-061", "[CRITICAL] Invoice generation failing for 65% of enterprise accounts. Revenue impact confirmed.", {"service": "invoice-engine", "error_rate_pct": 65, "affected_segment": "enterprise", "revenue_impact": True, "region": "us-east-1"}, "SEV1"),
]
# Additional task2 (root cause) rows, INC-062 .. INC-085.
# Each row is a 4-tuple: (incident_id, alert_text, context dict, expected root cause).
# The rows cycle through DATABASE, NETWORK, APPLICATION, THIRD_PARTY and
# INFRASTRUCTURE in that order.
_EXPANDED_TASK2 = [
("INC-062", "[CRITICAL] PostgreSQL vacuum lag increasing rapidly. Connection pool pinned at 480/500 on primary.", {"db": "postgres-orders", "connection_pool": "480/500", "replica_lag_sec": 41, "slow_query_count": 122}, "DATABASE"),
("INC-063", "[CRITICAL] Packet loss 27% on transit link. Route instability causing inter-region request failures.", {"packet_loss_pct": 27, "route_state": "flapping", "affected_regions": ["us-east-1", "ca-central-1"], "traceroute": "drops at transit hop 5"}, "NETWORK"),
("INC-064", "[ERROR] Exception flood began after deploy. Stack trace points to inventory_rules.py.", {"service": "inventory-service", "exception": "IllegalStateException", "stack_trace": "inventory_rules.py:118", "last_deploy": "15m ago"}, "APPLICATION"),
("INC-065", "[WARNING] SendGrid webhook acknowledgements failing. Vendor status page reports degraded mail delivery.", {"vendor": "SendGrid", "webhook_failures": 540, "sendgrid_status": "degraded", "our_service_health": "healthy"}, "THIRD_PARTY"),
("INC-066", "[CRITICAL] Kubernetes cluster degraded. Node NotReady events triggered pod evictions in checkout namespace.", {"cluster": "prod-checkout", "nodes_not_ready": 4, "evicted_pods": 29, "namespace": "checkout", "cause": "EC2 maintenance"}, "INFRASTRUCTURE"),
("INC-067", "[CRITICAL] Database slow query storm detected. Write queries blocked behind lock contention on Postgres.", {"database": "customer-profile", "slow_query_count": 301, "write_queries_blocked": True, "replica_lag_sec": 19}, "DATABASE"),
("INC-068", "[CRITICAL] Cross-region API calls failing during route flap. Traceroute drops beyond the carrier edge.", {"cross_region_calls": "failing", "route_flap": True, "traceroute": "carrier edge timeout", "packet_loss_pct": 18}, "NETWORK"),
("INC-069", "[ERROR] Crash loop started immediately after deploy. Code path in tax_adapter raises a NullPointerException.", {"service": "tax-adapter", "crash_count": 37, "exception": "NullPointerException", "deploy_version": "2026.04.12.4"}, "APPLICATION"),
("INC-070", "[WARNING] Stripe external API returning 502s. Our workers healthy but payment confirmations delayed.", {"vendor": "Stripe", "external_api_errors": 502, "worker_health": "healthy", "webhook_queue": 220}, "THIRD_PARTY"),
("INC-071", "[CRITICAL] Node memory pressure spread across the analytics cluster after EC2 spot interruption.", {"cluster": "analytics-prod", "node_memory_pressure": True, "spot_interruption": True, "pods_pending": 41}, "INFRASTRUCTURE"),
("INC-072", "[CRITICAL] Postgres replica lag hit 88 seconds. Connection pool exhausted and disk spill rising.", {"db": "postgres-ledger", "replica_lag_sec": 88, "connection_pool": "500/500", "disk_spill": True}, "DATABASE"),
("INC-073", "[CRITICAL] BGP convergence instability causing packet loss on network fabric between edge POPs.", {"bgp_flap": True, "packet_loss_pct": 21, "affected_regions": ["sin", "mum"], "provider": "Equinix"}, "NETWORK"),
("INC-074", "[ERROR] TimeoutException rate surged after release. No infrastructure alarms fired.", {"service": "pricing-engine", "exception": "TimeoutException", "last_deploy": "8m ago", "infra_health": "normal"}, "APPLICATION"),
("INC-075", "[WARNING] Twilio callback deliveries failing. Vendor dashboard marked degraded for messaging webhooks.", {"vendor": "Twilio", "callback_failures": 610, "twilio_status": "degraded", "webhook_latency_ms": 2600}, "THIRD_PARTY"),
("INC-076", "[CRITICAL] Cluster autoscaler could not recover capacity. Multiple Node NotReady alerts remain active.", {"cluster": "media-prod", "nodes_not_ready": 5, "pod_restarts": 84, "autoscaler_state": "stalled"}, "INFRASTRUCTURE"),
("INC-077", "[WARNING] DATABASE CPU saturation causing slow query growth after migration batch kicked off.", {"database": "ledger-db", "db_cpu_pct": 94, "slow_query_count": 143, "migration_job": "running"}, "DATABASE"),
("INC-078", "[CRITICAL] Traceroute drops observed at transit hop 8. Cross-region route instability persists.", {"traceroute": "drops at transit hop 8", "packet_loss_pct": 24, "cross_region_errors": 1900, "network_team_paged": True}, "NETWORK"),
("INC-079", "[ERROR] Random crash reports tied to auth middleware. Stack trace repeats in the same code path.", {"service": "auth-gateway", "crash_logs": True, "stack_trace": "auth_middleware.py:52", "deploy": "none"}, "APPLICATION"),
("INC-080", "[WARNING] External API vendor degraded. Webhook retries growing while our services remain healthy.", {"vendor": "Mapbox", "external_api_status": "degraded", "webhook_retries": 1180, "our_service_health": "healthy"}, "THIRD_PARTY"),
("INC-081", "[CRITICAL] Kubernetes control plane healthy, but worker cluster showing NotReady nodes and pod evictions.", {"cluster": "streaming-prod", "nodes_not_ready": 3, "evicted_pods": 24, "namespace": "transcoding"}, "INFRASTRUCTURE"),
("INC-082", "[CRITICAL] Write queries spilling to disk on DATABASE primary. Slow query count jumped above 400.", {"database": "orders-db", "write_queries": "spilling", "slow_query_count": 417, "replica_lag_sec": 33}, "DATABASE"),
("INC-083", "[CRITICAL] Network route instability causing 31% packet loss on partner edge connectivity.", {"route_state": "unstable", "packet_loss_pct": 31, "partner_edge": "degraded", "traceroute": "loss after peering route"}, "NETWORK"),
("INC-084", "[ERROR] Deploy triggered exception burst in settlement service. Stack trace points to a new code branch.", {"service": "settlement-service", "exception": "ValueError", "stack_trace": "settlement_flow.py:211", "last_deploy": "12m ago"}, "APPLICATION"),
("INC-085", "[WARNING] Vendor outage degrading payment gateway callbacks. Stripe webhook backlog keeps climbing.", {"vendor": "Stripe", "webhook_backlog": 930, "stripe_status": "incident", "our_service_health": "healthy"}, "THIRD_PARTY"),
]
# Additional task3 (recommended action) rows, INC-086 .. INC-108.
# Each row is a 4-tuple: (incident_id, alert_text, context dict, expected action).
# The rows cycle through ROLLBACK, SCALE_UP, RESTART_SERVICE, FAILOVER,
# NOTIFY_VENDOR, NO_ACTION and INVESTIGATE in that order.
_EXPANDED_TASK3 = [
("INC-086", "[CRITICAL] Error rate spiked immediately after deploy. Previous stable image already validated. Rollback window open.", {"service": "checkout-api", "recent_deploy_caused": True, "previous_stable": "2026.04.10.1", "rollback_tested": True}, "ROLLBACK"),
("INC-087", "[WARNING] Search queue depth exploded during flash sale. CPU pinned and autoscaler already at max_replicas.", {"service": "search-aggregator", "queue_depth": 18200, "cpu_pct": 97, "max_replicas": 24, "event": "flash sale"}, "SCALE_UP"),
("INC-088", "[ERROR] Inventory worker deadlock detected. Process not responding and health check failing continuously.", {"service": "inventory-worker", "deadlock_detected": True, "health_check": "failing", "process_state": "not responding"}, "RESTART_SERVICE"),
("INC-089", "[CRITICAL] Primary database down. Read replica healthy but writes failing across all tenants.", {"db": "tenant-primary", "primary_down": True, "read_replica": "healthy", "writes_failing": True}, "FAILOVER"),
("INC-090", "[WARNING] Stripe vendor incident causing 429s and webhook retries. Local infra healthy.", {"vendor": "Stripe", "webhook_retries": 760, "stripe_status": "degraded", "our_system": "healthy"}, "NOTIFY_VENDOR"),
("INC-091", "[INFO] Minor UI glitch on profile badges after CSS tweak. Cosmetic issue only.", {"service": "profile-ui", "impact": "cosmetic", "reported_users": 27}, "NO_ACTION"),
("INC-092", "[WARNING] Latency and retries increased gradually without a single obvious trigger. Mixed signals across services.", {"service": "document-renderer", "latency_ms": 2400, "retry_rate_pct": 8, "synthetic_probe_status": "passing"}, "INVESTIGATE"),
("INC-093", "[CRITICAL] Immediately after deploy, checkout failures jumped to 61%. Previous stable release is available.", {"service": "checkout-core", "immediately_after_deploy": True, "previous_stable": "2026.04.11.2", "error_rate_pct": 61}, "ROLLBACK"),
("INC-094", "[WARNING] Queue backlog growing fast during traffic spike. CPU saturated and autoscaler at MAX_REPLICAS.", {"service": "feed-generator", "traffic_spike": True, "queue_depth": 9700, "cpu_pct": 95, "max_replicas": 18}, "SCALE_UP"),
("INC-095", "[ERROR] Recommendation worker appears stuck. No response on health check endpoint for 11 minutes.", {"service": "recommendation-worker", "stuck": True, "health_check": "failing", "last_response": "11m ago"}, "RESTART_SERVICE"),
("INC-096", "[CRITICAL] Primary RDS unreachable. Failover has not happened yet and writes are failing.", {"db": "orders-rds-primary", "primary_rds": "unreachable", "failover": "pending", "writes_failing": True}, "FAILOVER"),
("INC-097", "[WARNING] Twilio vendor API returning intermittent errors. Our retry workers remain healthy.", {"vendor": "Twilio", "api_errors_pct": 23, "retry_workers": "healthy", "callback_delay_sec": 90}, "NOTIFY_VENDOR"),
("INC-098", "[INFO] Minor UI glitch on internal reporting theme toggle. Cosmetic only.", {"service": "reporting-ui", "impact": "cosmetic", "affected_users": "internal only"}, "NO_ACTION"),
("INC-099", "[WARNING] Slow degradation observed with mixed latency and retry symptoms. No single fix stands out yet.", {"service": "export-service", "latency_ms": 2800, "retry_rate_pct": 6, "error_rate_pct": 2}, "INVESTIGATE"),
("INC-100", "[CRITICAL] Recent deploy caused auth failures and rollback candidate already passed smoke tests.", {"service": "auth-api", "recent_deploy_caused": True, "rollback_tested": True, "previous_stable": "2026.04.11.6"}, "ROLLBACK"),
("INC-101", "[WARNING] CPU at 99% with queue growth on image pipeline. Autoscaler already capped at max replicas.", {"service": "image-pipeline", "cpu_pct": 99, "queue_depth": 11100, "autoscaler": "max_replicas"}, "SCALE_UP"),
("INC-102", "[ERROR] Scheduler deadlock detected. Process not responding to health check and jobs are stalled.", {"service": "scheduler", "deadlock_detected": True, "health_check": "failing", "job_backlog": 4400}, "RESTART_SERVICE"),
("INC-103", "[CRITICAL] Read replica healthy while primary down. Writes failing and customer operations blocked.", {"db": "ledger-primary", "read_replica": "healthy", "primary_down": True, "writes_failing": True}, "FAILOVER"),
("INC-104", "[WARNING] SendGrid vendor degradation causing delivery failures for transactional mail.", {"vendor": "SendGrid", "delivery_failures": 1300, "sendgrid_status": "investigating", "our_infra": "healthy"}, "NOTIFY_VENDOR"),
("INC-105", "[INFO] Cosmetic issue on loyalty badge colors after stylesheet refresh. Minor UI glitch only.", {"service": "loyalty-ui", "impact": "cosmetic", "reported_users": 61}, "NO_ACTION"),
("INC-106", "[WARNING] Memory usage trending upward slowly, but service still responds. Root cause not isolated yet.", {"service": "document-cache", "memory_growth": True, "probe_status": "passing", "error_rate_pct": 1}, "INVESTIGATE"),
("INC-107", "[CRITICAL] Recent deploy caused 54% login failures. Previous stable artifact is ready for rollback.", {"service": "login-orchestrator", "recent_deploy_caused": True, "error_rate_pct": 54, "previous_stable": "2026.04.10.9"}, "ROLLBACK"),
("INC-108", "[WARNING] Traffic spike from campaign launch pushed CPU to 92%. Queue depth climbing and autoscaler at max_replicas.", {"service": "campaign-router", "traffic_spike": True, "cpu_pct": 92, "queue_depth": 8600, "max_replicas": 16}, "SCALE_UP"),
]
# Append the compact rows to TICKETS as full ticket dicts, keeping the
# task1 -> task2 -> task3 order. Each source table is paired with its task
# label and the ground-truth key that its last tuple element belongs under.
# The whole list is built before extend() runs, so a malformed row raises
# without leaving TICKETS partially extended.
TICKETS.extend(
    [
        _make_ticket(ticket_id, task_label, message, signals, answer_key, answer)
        for task_label, answer_key, rows in (
            ("task1", "severity", _EXPANDED_TASK1),
            ("task2", "root_cause", _EXPANDED_TASK2),
            ("task3", "action", _EXPANDED_TASK3),
        )
        for ticket_id, message, signals, answer in rows
    ]
)