# =============================================================================
# Prometheus Alerting Rules — Platform Health
# =============================================================================
# PrometheusRule custom resource defining four alert groups: infrastructure,
# application, security, and SLO burn rate. Every rule carries a `severity`
# and a `team` label.
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: platform-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably matched by the Prometheus `ruleSelector` of the
    # kube-prometheus-stack release — confirm against the Prometheus CR.
    release: kube-prometheus-stack
spec:
  groups:
    # --- Infrastructure Alerts ---
    - name: infrastructure
      rules:
        # Scrape target of the node-exporter job has been failing for 5 minutes.
        - alert: NodeDown
          expr: up{job="node-exporter"} == 0
          for: 5m
          labels:
            severity: critical
            team: platform
          annotations:
            summary: "Node {{ $labels.instance }} is down"
            runbook: "https://runbook.platform.internal/node-down"

        # Less than 10% of total memory is available, sustained for 10 minutes.
        - alert: HighMemoryUsage
          expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
          for: 10m
          labels:
            severity: warning
            team: platform
          annotations:
            summary: "Node {{ $labels.instance }} has <10% memory available"

        # Less than 15% of a filesystem is available, sustained for 10 minutes.
        # NOTE(review): no fstype/mountpoint filter, so every filesystem series
        # is evaluated and the summary does not say which one — consider
        # filtering pseudo filesystems and adding the mountpoint to the summary.
        - alert: DiskSpaceLow
          expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.15
          for: 10m
          labels:
            severity: warning
            team: platform
          annotations:
            summary: "Node {{ $labels.instance }} has <15% disk space"

        # Container restart counter has a non-zero rate over the last 15 minutes.
        # NOTE(review): a single restart keeps the 15m rate above zero for
        # longer than the 5m `for` window, so one restart is enough to fire —
        # confirm this sensitivity is intended.
        - alert: PodCrashLooping
          expr: rate(kube_pod_container_status_restarts_total[15m]) > 0
          for: 5m
          labels:
            severity: warning
            team: platform
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"

    # --- Application Alerts ---
    - name: application
      rules:
        # More than 5% of requests per service return a 5xx status code.
        # Both sides are aggregated by `service` before dividing: dividing the
        # raw vectors would match series on their full label set (including
        # `code`), so every 5xx series would be divided by itself and yield 1.
        - alert: HighErrorRate
          expr: |
            sum by (service) (rate(http_requests_total{code=~"5.."}[5m]))
              /
            sum by (service) (rate(http_requests_total[5m]))
              > 0.05
          for: 5m
          labels:
            severity: critical
            team: app
          annotations:
            summary: "{{ $labels.service }} error rate >5%"
            runbook: "https://runbook.platform.internal/high-error-rate"

        # 99th percentile request duration above 2 seconds for 10 minutes.
        - alert: HighLatency
          expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 2
          for: 10m
          labels:
            severity: warning
            team: app
          annotations:
            summary: "{{ $labels.service }} P99 latency >2s"

        # Fewer than 2 connections left in the pool for 5 minutes.
        - alert: DatabaseConnectionPoolExhausted
          expr: db_connection_pool_available < 2
          for: 5m
          labels:
            severity: critical
            team: app
          annotations:
            summary: "DB connection pool nearly exhausted"

    # --- Security Alerts ---
    - name: security
      rules:
        # Falco has reported at least one event with priority "Critical".
        # NOTE(review): the `_total` suffix suggests a cumulative counter; if
        # so, this comparison stays true after the first event and the alert
        # never resolves — confirm the metric type and consider `increase()`.
        - alert: FalcoRuntimeAlert
          expr: falco_events_total{priority="Critical"} > 0
          for: 1m
          labels:
            severity: critical
            team: security
          annotations:
            summary: "Falco critical event: {{ $labels.rule }}"
            runbook: "https://runbook.platform.internal/falco-alert"

        # A CRITICAL-severity vulnerability has been reported for an hour.
        - alert: TrivyCriticalVulnerability
          expr: trivy_vulnerability_id{severity="CRITICAL"} > 0
          for: 1h
          labels:
            severity: critical
            team: security
          annotations:
            summary: "Critical CVE {{ $labels.vulnerability_id }} in {{ $labels.image }}"

    # --- SLO Burn Rate Alerts ---
    - name: slo-burn-rate
      rules:
        # 1h error ratio per service exceeds 14.4 times 0.001.
        # NOTE(review): 0.001 is presumably the error budget of a 99.9% SLO —
        # confirm against the SLO definition.
        # Both sides are aggregated by `service` before dividing; without the
        # aggregation each 5xx series is divided by itself and the ratio is
        # always 1, which makes the threshold meaningless.
        - alert: HighErrorBudgetBurn
          expr: |
            (
              sum by (service) (rate(http_requests_total{code=~"5.."}[1h]))
                /
              sum by (service) (rate(http_requests_total[1h]))
            ) > (14.4 * 0.001)
          for: 5m
          labels:
            severity: critical
            team: platform
          annotations:
            summary: "Error budget burning too fast — 1h burn rate exceeds 14.4x threshold"