---
# PrometheusRule (Prometheus Operator CRD) holding the platform alerting
# rules, organised into four groups: infrastructure, application, security
# and slo-burn-rate. Every rule carries `severity` and `team` labels.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: platform-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably this label is what the Prometheus
    # ruleSelector matches on — confirm against the Prometheus CR.
    release: kube-prometheus-stack
spec:
  groups:
    # ---- Node- and pod-level health --------------------------------------
    - name: infrastructure
      rules:
        # A node-exporter scrape target has been failing for 5 minutes.
        - alert: NodeDown
          expr: up{job="node-exporter"} == 0
          for: 5m
          labels:
            severity: critical
            team: platform
          annotations:
            summary: "Node {{ $labels.instance }} is down"
            runbook: "https://runbook.platform.internal/node-down"

        # Less than 10% of total memory is available, sustained for 10m.
        - alert: HighMemoryUsage
          expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
          for: 10m
          labels:
            severity: warning
            team: platform
          annotations:
            summary: "Node {{ $labels.instance }} has <10% memory available"

        # Less than 15% of a filesystem is free, sustained for 10m. The
        # expression yields one series per filesystem, so the summary names
        # the mountpoint to tell the resulting alerts apart.
        - alert: DiskSpaceLow
          expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.15
          for: 10m
          labels:
            severity: warning
            team: platform
          annotations:
            summary: "Node {{ $labels.instance }} has <15% disk space on {{ $labels.mountpoint }}"

        # A container restarted at least twice within 15 minutes.
        # `rate(...[15m]) > 0` stays true for the whole window after a single
        # restart, so with `for: 5m` one isolated restart was enough to fire.
        - alert: PodCrashLooping
          expr: increase(kube_pod_container_status_restarts_total[15m]) >= 2
          for: 5m
          labels:
            severity: warning
            team: platform
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"

    # ---- Request-serving health of applications --------------------------
    - name: application
      rules:
        # More than 5% of requests per service answered with a 5xx code.
        # Both sides are aggregated by `service`: dividing the raw vectors
        # matches series on every label (including `code`), which divides
        # each 5xx series by itself and always yields 1.
        - alert: HighErrorRate
          expr: |-
            sum by (service) (rate(http_requests_total{code=~"5.."}[5m]))
            /
            sum by (service) (rate(http_requests_total[5m]))
            > 0.05
          for: 5m
          labels:
            severity: critical
            team: app
          annotations:
            summary: "{{ $labels.service }} error rate >5%"
            runbook: "https://runbook.platform.internal/high-error-rate"

        # 99th percentile request duration per service above 2 seconds.
        # Buckets are summed by (le, service) so the quantile is computed
        # per service, matching the label used in the summary, rather than
        # once per raw label combination.
        - alert: HighLatency
          expr: |-
            histogram_quantile(
              0.99,
              sum by (le, service) (rate(http_request_duration_seconds_bucket[5m]))
            ) > 2
          for: 10m
          labels:
            severity: warning
            team: app
          annotations:
            summary: "{{ $labels.service }} P99 latency >2s"

        # Fewer than 2 connections reported available, sustained for 5m.
        - alert: DatabaseConnectionPoolExhausted
          expr: db_connection_pool_available < 2
          for: 5m
          labels:
            severity: critical
            team: app
          annotations:
            summary: "DB connection pool nearly exhausted"

    # ---- Runtime security and vulnerability findings ---------------------
    - name: security
      rules:
        # New Falco events with priority "Critical" in the last 5 minutes.
        # Comparing the cumulative counter itself to 0 never resolves once a
        # single event has been counted. The second clause covers a series
        # that did not exist 5 minutes ago: increase() cannot see the jump
        # from "absent" to the first sample, so the first event of a rule
        # would otherwise be missed.
        - alert: FalcoRuntimeAlert
          expr: |-
            increase(falco_events_total{priority="Critical"}[5m]) > 0
            or
            (
              falco_events_total{priority="Critical"} > 0
              unless
              falco_events_total{priority="Critical"} offset 5m
            )
          for: 1m
          labels:
            severity: critical
            team: security
          annotations:
            summary: "Falco critical event: {{ $labels.rule }}"
            runbook: "https://runbook.platform.internal/falco-alert"

        # A CRITICAL vulnerability series has been present for an hour.
        # NOTE(review): confirm that the exporter really emits
        # severity="CRITICAL" (upper case) and the `vulnerability_id` /
        # `image` labels referenced in the summary.
        - alert: TrivyCriticalVulnerability
          expr: trivy_vulnerability_id{severity="CRITICAL"} > 0
          for: 1h
          labels:
            severity: critical
            team: security
          annotations:
            summary: "Critical CVE {{ $labels.vulnerability_id }} in {{ $labels.image }}"

    # ---- Error-budget burn ------------------------------------------------
    - name: slo-burn-rate
      rules:
        # 1h error ratio per service above 14.4 * 0.001.
        # NOTE(review): 0.001 is presumably the error budget of a 99.9% SLO
        # and 14.4 the burn-rate factor — confirm against the SLO definition.
        # Aggregated on both sides for the same reason as HighErrorRate: the
        # unaggregated division matched each 5xx series against itself.
        - alert: HighErrorBudgetBurn
          expr: |-
            (
              sum by (service) (rate(http_requests_total{code=~"5.."}[1h]))
              /
              sum by (service) (rate(http_requests_total[1h]))
            ) > (14.4 * 0.001)
          for: 5m
          labels:
            severity: critical
            team: platform
          annotations:
            summary: "Error budget burning too fast — 1h burn rate exceeds 14.4x threshold"