File size: 3,937 Bytes
7c19d46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# =============================================================================
# Prometheus Alerting Rules — Platform Health
# =============================================================================

# PrometheusRule custom resource (monitoring.coreos.com/v1) that bundles the
# platform's alerting rules into named rule groups.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: platform-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably the label the Prometheus Operator's ruleSelector
    # matches on so that these rules get loaded — confirm against the
    # Prometheus custom resource in this cluster.
    release: kube-prometheus-stack
spec:
  # One entry per rule group; every rule carries `severity` and `team` labels.
  groups:
    # --- Infrastructure Alerts ---
    - name: infrastructure
      rules:
        # Critical: a node-exporter scrape target has reported `up == 0`
        # continuously for five minutes.
        - alert: NodeDown
          expr: 'up{job="node-exporter"} == 0'
          for: 5m
          labels:
            team: platform
            severity: critical
          annotations:
            runbook: 'https://runbook.platform.internal/node-down'
            summary: 'Node {{ $labels.instance }} is down'

        # Warning: the ratio of available to total memory on a node has stayed
        # below 0.1 (10%) for ten minutes.
        - alert: HighMemoryUsage
          expr: >-
            (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
          for: 10m
          labels:
            team: platform
            severity: warning
          annotations:
            summary: 'Node {{ $labels.instance }} has <10% memory available'

        # Warning: a writable, real filesystem has had less than 15% of its
        # space available for ten minutes.
        #
        # The fstype filter drops pseudo, layered and image filesystems (tmpfs,
        # overlay, squashfs, ...) whose fill level is not actionable, and the
        # node_filesystem_readonly clause drops read-only mounts that nobody can
        # free space on. Without them the alert can fire permanently on mounts
        # such as squashfs images, which are always 100% used.
        - alert: DiskSpaceLow
          expr: |
            (
              node_filesystem_avail_bytes{fstype!~"tmpfs|ramfs|overlay|squashfs|iso9660"}
              /
              node_filesystem_size_bytes{fstype!~"tmpfs|ramfs|overlay|squashfs|iso9660"}
            ) < 0.15
            and
            node_filesystem_readonly{fstype!~"tmpfs|ramfs|overlay|squashfs|iso9660"} == 0
          for: 10m
          labels:
            severity: warning
            team: platform
          annotations:
            summary: "Node {{ $labels.instance }} has <15% disk space"
            # A node usually has several filesystems; name the one that is low.
            description: "Filesystem {{ $labels.mountpoint }} ({{ $labels.device }}) on {{ $labels.instance }} has {{ $value | humanizePercentage }} space available"

        # Warning: a container's restart counter has shown a non-zero 15-minute
        # rate for five minutes. Note that a single restart keeps the rate above
        # zero for as long as it stays inside the 15-minute window.
        - alert: PodCrashLooping
          expr: >-
            rate(kube_pod_container_status_restarts_total[15m]) > 0
          for: 5m
          labels:
            team: platform
            severity: warning
          annotations:
            summary: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping'

    # --- Application Alerts ---
    - name: application
      rules:
        # Critical: more than 5% of a service's requests returned a 5xx status
        # over the last five minutes, sustained for five minutes.
        #
        # Both sides are summed per service before dividing. Dividing the raw
        # vectors matches series one-to-one on identical label sets, so every
        # 5xx series would be divided by itself (ratio 1) and the alert would
        # fire on any 5xx traffic at all, regardless of the real error rate.
        # If the metric carries no `service` label the sums collapse to a
        # single, global ratio.
        - alert: HighErrorRate
          expr: |
            (
              sum by (service) (rate(http_requests_total{code=~"5.."}[5m]))
              /
              sum by (service) (rate(http_requests_total[5m]))
            ) > 0.05
          for: 5m
          labels:
            severity: critical
            team: app
          annotations:
            summary: "{{ $labels.service }} error rate >5%"
            runbook: "https://runbook.platform.internal/high-error-rate"

        # Warning: the 0.99 quantile estimated from the request-duration
        # histogram (5-minute bucket rates) has exceeded 2 seconds for ten
        # minutes. The buckets are not aggregated, so the quantile is evaluated
        # separately for every label combination the metric exposes.
        - alert: HighLatency
          expr: >-
            histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 2
          for: 10m
          labels:
            team: app
            severity: warning
          annotations:
            summary: '{{ $labels.service }} P99 latency >2s'

        # Critical: fewer than two connections have been reported as available
        # in the database connection pool for five minutes.
        - alert: DatabaseConnectionPoolExhausted
          expr: 'db_connection_pool_available < 2'
          for: 5m
          labels:
            team: app
            severity: critical
          annotations:
            summary: 'DB connection pool nearly exhausted'

    # --- Security Alerts ---
    - name: security
      rules:
        # Critical: Falco recorded at least one new Critical-priority event in
        # the last five minutes.
        #
        # The metric is a monotonically increasing counter, so comparing its raw
        # value with 0 would keep the alert firing forever after the first
        # event. The rule therefore looks at the increase over a window instead.
        # The second clause covers a series that did not exist five minutes ago:
        # its first sample already carries the event, and increase() cannot see
        # a jump from "absent" to 1.
        - alert: FalcoRuntimeAlert
          expr: |
            (increase(falco_events_total{priority="Critical"}[5m]) > 0)
            or
            (
              falco_events_total{priority="Critical"} > 0
              unless
              falco_events_total{priority="Critical"} offset 5m
            )
          for: 1m
          labels:
            severity: critical
            team: security
          annotations:
            summary: "Falco critical event: {{ $labels.rule }}"
            runbook: "https://runbook.platform.internal/falco-alert"

        # Critical: a series of trivy_vulnerability_id with severity "CRITICAL"
        # has been above zero for one hour.
        # NOTE(review): confirm that the exporter in use really emits upper-case
        # severity values and the `vulnerability_id` / `image` labels referenced
        # in the summary; otherwise the alert never matches or renders blanks.
        - alert: TrivyCriticalVulnerability
          expr: 'trivy_vulnerability_id{severity="CRITICAL"} > 0'
          for: 1h
          labels:
            team: security
            severity: critical
          annotations:
            summary: 'Critical CVE {{ $labels.vulnerability_id }} in {{ $labels.image }}'

    # --- SLO Burn Rate Alerts ---
    - name: slo-burn-rate
      rules:
        # Critical: the 1-hour 5xx ratio exceeds 14.4 times 0.001, sustained for
        # five minutes. 14.4 is the burn-rate factor named in the summary; 0.001
        # is presumably the error budget of a 99.9% SLO — confirm with the SLO
        # owner.
        #
        # Both sides are summed per service before dividing. Dividing the raw
        # vectors matches series one-to-one on identical label sets, so every
        # 5xx series would be divided by itself (ratio 1) and the alert would
        # fire on any 5xx traffic at all. If the metric carries no `service`
        # label the sums collapse to a single, global ratio.
        - alert: HighErrorBudgetBurn
          expr: |
            (
              sum by (service) (rate(http_requests_total{code=~"5.."}[1h]))
              /
              sum by (service) (rate(http_requests_total[1h]))
            ) > (14.4 * 0.001)
          for: 5m
          labels:
            severity: critical
            team: platform
          annotations:
            summary: "Error budget burning too fast — 1h burn rate exceeds 14.4x threshold"