# =============================================================================
# Prometheus Alerting Rules — Platform Health
# =============================================================================
---
# PrometheusRule custom resource that carries the platform alerting groups.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  namespace: monitoring
  name: platform-alerts
  labels:
    # NOTE(review): presumably the label the kube-prometheus-stack Prometheus
    # instance selects rules by — confirm it matches the Helm release name.
    release: kube-prometheus-stack
spec:
  # The alert groups listed below are the entries of this sequence.
  groups:
# --- Infrastructure Alerts ---
    # Node- and pod-level health alerts, all routed to team "platform".
    - name: infrastructure
      rules:
        # The node-exporter scrape target has reported up == 0 for 5 minutes.
        - alert: NodeDown
          expr: up{job="node-exporter"} == 0
          for: 5m
          labels:
            severity: critical
            team: platform
          annotations:
            summary: "Node {{ $labels.instance }} is down"
            runbook: "https://runbook.platform.internal/node-down"

        # Less than 10% of total memory has been available for 10 minutes.
        - alert: HighMemoryUsage
          expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
          for: 10m
          labels:
            severity: warning
            team: platform
          annotations:
            summary: "Node {{ $labels.instance }} has <10% memory available"

        # Less than 15% of a filesystem has been free for 10 minutes.
        # Pseudo/ephemeral filesystems are excluded: the unfiltered ratio is
        # evaluated for every mounted filesystem, so tmpfs/overlay mounts
        # would otherwise raise alerts that say nothing about real disks.
        # The alert fires once per filesystem, so the summary names the
        # mountpoint to make each instance distinguishable.
        - alert: DiskSpaceLow
          expr: |-
            (
              node_filesystem_avail_bytes{fstype!~"tmpfs|ramfs|devtmpfs|overlay|squashfs"}
              /
              node_filesystem_size_bytes{fstype!~"tmpfs|ramfs|devtmpfs|overlay|squashfs"}
            ) < 0.15
          for: 10m
          labels:
            severity: warning
            team: platform
          annotations:
            summary: "Node {{ $labels.instance }} has <15% disk space on {{ $labels.mountpoint }}"

        # A container has been in the CrashLoopBackOff waiting state at some
        # point in each of the last 5 minutes' worth of evaluations.
        # The previous expression, rate(restarts_total[15m]) > 0, stays true
        # for the whole 15m window after ONE restart, so with `for: 5m` any
        # single restart was reported as a crash loop. Checking the waiting
        # reason targets the actual crash-loop state instead.
        - alert: PodCrashLooping
          expr: |-
            max_over_time(
              kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}[5m]
            ) >= 1
          for: 5m
          labels:
            severity: warning
            team: platform
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
# --- Application Alerts ---
    # Request-level and dependency alerts, all routed to team "app".
    - name: application
      rules:
        # More than 5% of a service's requests returned 5xx over 5 minutes.
        # Both sides are aggregated with `sum by (service)` so the division
        # matches on `service` only. Un-aggregated, every 5xx series on the
        # left (which carries a `code` label) is matched one-to-one with the
        # identical 5xx series on the right, so the ratio was always 1 as
        # soon as any 5xx traffic existed, regardless of total traffic.
        # Relies on the `service` label that the summary already references.
        - alert: HighErrorRate
          expr: |-
            (
              sum by (service) (rate(http_requests_total{code=~"5.."}[5m]))
              /
              sum by (service) (rate(http_requests_total[5m]))
            ) > 0.05
          for: 5m
          labels:
            severity: critical
            team: app
          annotations:
            summary: "{{ $labels.service }} error rate >5%"
            runbook: "https://runbook.platform.internal/high-error-rate"

        # 0.99 quantile of request duration above 2 (seconds, per the metric
        # name) for 10 minutes.
        # NOTE(review): the buckets are not aggregated, so the quantile is
        # computed per raw label combination and one alert is raised for
        # each — confirm this is intended rather than a per-service P99.
        - alert: HighLatency
          expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 2
          for: 10m
          labels:
            severity: warning
            team: app
          annotations:
            summary: "{{ $labels.service }} P99 latency >2s"

        # db_connection_pool_available has stayed below 2 for 5 minutes.
        - alert: DatabaseConnectionPoolExhausted
          expr: db_connection_pool_available < 2
          for: 5m
          labels:
            severity: critical
            team: app
          annotations:
            summary: "DB connection pool nearly exhausted"
# --- Security Alerts ---
    # Runtime-detection and vulnerability-scan alerts, routed to team "security".
    - name: security
      rules:
        # A Falco event with priority "Critical" was recorded recently.
        # The metric is cumulative (presumably a counter, per its `_total`
        # suffix — confirm), so the previous `> 0` comparison stayed true
        # forever after the first event and the alert could never resolve.
        # increase() only sees NEW events in the last 5 minutes. The `or`
        # branch covers a series that did not exist 5 minutes ago, because
        # increase() cannot observe the initial jump of a newly created
        # series (e.g. the first event for a given rule).
        - alert: FalcoRuntimeAlert
          expr: |-
            increase(falco_events_total{priority="Critical"}[5m]) > 0
            or
            (
              falco_events_total{priority="Critical"} > 0
              unless
              falco_events_total{priority="Critical"} offset 5m
            )
          for: 1m
          labels:
            severity: critical
            team: security
          annotations:
            summary: "Falco critical event: {{ $labels.rule }}"
            runbook: "https://runbook.platform.internal/falco-alert"

        # A trivy_vulnerability_id series with severity "CRITICAL" has been
        # above 0 for 1 hour.
        # NOTE(review): the summary reads the `vulnerability_id` and `image`
        # labels — verify the exporter in use actually exposes those names.
        - alert: TrivyCriticalVulnerability
          expr: trivy_vulnerability_id{severity="CRITICAL"} > 0
          for: 1h
          labels:
            severity: critical
            team: security
          annotations:
            summary: "Critical CVE {{ $labels.vulnerability_id }} in {{ $labels.image }}"
# --- SLO Burn Rate Alerts ---
    # Error-budget burn alert, routed to team "platform".
    - name: slo-burn-rate
      rules:
        # The 1h error ratio exceeds 14.4 times 0.001.
        # NOTE(review): 0.001 is presumably the error budget of a 99.9%
        # availability SLO — confirm against the SLO definition.
        # Both rates are wrapped in sum() so the division compares total
        # 5xx traffic with total traffic. Un-aggregated, each 5xx series
        # was matched one-to-one with itself, so the ratio was 1 whenever
        # any 5xx request occurred and the alert ignored the real error
        # ratio. The summary uses no series labels, so a global sum is
        # sufficient here.
        - alert: HighErrorBudgetBurn
          expr: |
            (
              sum(rate(http_requests_total{code=~"5.."}[1h]))
              /
              sum(rate(http_requests_total[1h]))
            ) > (14.4 * 0.001)
          for: 5m
          labels:
            severity: critical
            team: platform
          annotations:
            summary: "Error budget burning too fast — 1h burn rate exceeds 14.4x threshold"