File size: 3,937 Bytes

7c19d46

# =============================================================================
# Prometheus Alerting Rules — Platform Health
# =============================================================================

# PrometheusRule custom resource (monitoring.coreos.com/v1 API group) that
# carries the platform alerting rule groups defined under spec.groups.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: platform-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably the label that the kube-prometheus-stack
    # Prometheus ruleSelector matches on so these rules get loaded —
    # confirm it equals the actual Helm release name.
    release: kube-prometheus-stack
spec:
  groups:
    # --- Infrastructure Alerts ---
    # Node and pod health alerts, all routed to the platform team.
    - name: infrastructure
      rules:
        # A node-exporter scrape target has reported up == 0 (scrape failing)
        # continuously for 5 minutes.
        - alert: NodeDown
          expr: up{job="node-exporter"} == 0
          for: 5m
          labels:
            severity: critical
            team: platform
          annotations:
            summary: "Node {{ $labels.instance }} is down"
            runbook: "https://runbook.platform.internal/node-down"

        # Available memory has stayed below 10% of total memory for 10 minutes.
        - alert: HighMemoryUsage
          expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
          for: 10m
          labels:
            severity: warning
            team: platform
          annotations:
            summary: "Node {{ $labels.instance }} has <10% memory available"

        # A writable filesystem has had less than 15% space available for
        # 10 minutes.
        # The `and node_filesystem_readonly == 0` clause drops read-only
        # mounts (e.g. squashfs/iso9660 images), which are permanently "full"
        # and would otherwise keep this alert firing forever. `and` keeps the
        # left-hand value, so $value is still the available-space ratio.
        - alert: DiskSpaceLow
          expr: |-
            (
              node_filesystem_avail_bytes
              /
              node_filesystem_size_bytes
            ) < 0.15
            and
            node_filesystem_readonly == 0
          for: 10m
          labels:
            severity: warning
            team: platform
          annotations:
            summary: "Node {{ $labels.instance }} has <15% disk space"
            # One alert is raised per filesystem, so spell out which one.
            description: "Filesystem {{ $labels.device }} mounted at {{ $labels.mountpoint }} on {{ $labels.instance }} has {{ $value | humanizePercentage }} space available"

        # A container has been in the CrashLoopBackOff waiting state at some
        # point during each of the last 5 minutes of evaluations.
        # The previous expression, rate(restarts_total[15m]) > 0, stays true
        # for 15 minutes after a single restart, so one OOM kill or failed
        # liveness probe was reported as a crash loop. The waiting-reason
        # series is only set once the kubelet is actually backing off.
        # max_over_time bridges the short "running" gaps between restarts so
        # the `for` timer is not reset on every attempt.
        - alert: PodCrashLooping
          expr: |-
            max_over_time(
              kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}[5m]
            ) >= 1
          for: 5m
          labels:
            severity: warning
            team: platform
          annotations:
            summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"

    # --- Application Alerts ---
    # Request-level health alerts for HTTP services, routed to the app team.
    - name: application
      rules:
        # More than 5% of a service's requests have returned 5xx for 5 minutes.
        # Both sides are aggregated with `sum by (service)` before dividing.
        # Without aggregation, `/` matches series one-to-one on every label,
        # including `code`, so each 5xx series was divided by itself: the
        # ratio was always 1 and the alert fired on any sustained 5xx traffic.
        # Grouping by `service` also keeps the label the summary template
        # reads (assumes the metric carries a `service` label, as the
        # original summary already did).
        - alert: HighErrorRate
          expr: |-
            sum by (service) (rate(http_requests_total{code=~"5.."}[5m]))
            /
            sum by (service) (rate(http_requests_total[5m]))
            > 0.05
          for: 5m
          labels:
            severity: critical
            team: app
          annotations:
            summary: "{{ $labels.service }} error rate >5%"
            runbook: "https://runbook.platform.internal/high-error-rate"

        # A service's 99th-percentile request duration has exceeded 2 seconds
        # for 10 minutes.
        # Buckets are summed per (service, le) first so the quantile describes
        # the whole service rather than every individual raw label set;
        # `le` must be preserved for histogram_quantile to work.
        - alert: HighLatency
          expr: |-
            histogram_quantile(
              0.99,
              sum by (service, le) (rate(http_request_duration_seconds_bucket[5m]))
            ) > 2
          for: 10m
          labels:
            severity: warning
            team: app
          annotations:
            summary: "{{ $labels.service }} P99 latency >2s"

        # Fewer than 2 pool connections have been available for 5 minutes.
        - alert: DatabaseConnectionPoolExhausted
          expr: db_connection_pool_available < 2
          for: 5m
          labels:
            severity: critical
            team: app
          annotations:
            summary: "DB connection pool nearly exhausted"
            # `instance` is the target label Prometheus attaches to scraped
            # series; it identifies which process's pool is running dry.
            description: "{{ $value }} connection(s) available on {{ $labels.instance }}"

    # --- Security Alerts ---
    # Runtime-detection and vulnerability-scan alerts, routed to the
    # security team.
    - name: security
      rules:
        # New Critical-priority Falco events were recorded in the last
        # 5 minutes.
        # falco_events_total is treated as a monotonic counter (per its
        # _total suffix), so the old `> 0` test stayed true forever after the
        # first event and the alert could never resolve.
        #   * branch 1: increase() detects increments on an existing series;
        #   * branch 2: catches a series that did not exist 5 minutes ago,
        #     because increase() needs two samples and therefore cannot see
        #     the very first event of a newly created series.
        - alert: FalcoRuntimeAlert
          expr: |-
            increase(falco_events_total{priority="Critical"}[5m]) > 0
            or
            (
              falco_events_total{priority="Critical"}
              unless
              falco_events_total{priority="Critical"} offset 5m
            ) > 0
          for: 1m
          labels:
            severity: critical
            team: security
          annotations:
            summary: "Falco critical event: {{ $labels.rule }}"
            runbook: "https://runbook.platform.internal/falco-alert"

        # A critical-severity vulnerability has been reported for at least
        # 1 hour.
        # The severity matcher is case-insensitive so it matches both
        # "CRITICAL" and "Critical" (a superset of the original matcher).
        # NOTE(review): trivy-operator appears to document this metric with
        # severity="Critical" and with labels `vuln_id` and
        # `image_repository`/`image_tag`; confirm that `vulnerability_id` and
        # `image` exist in this deployment, otherwise the summary renders
        # blanks.
        - alert: TrivyCriticalVulnerability
          expr: trivy_vulnerability_id{severity=~"(?i)critical"} > 0
          for: 1h
          labels:
            severity: critical
            team: security
          annotations:
            summary: "Critical CVE {{ $labels.vulnerability_id }} in {{ $labels.image }}"

    # --- SLO Burn Rate Alerts ---
    # Error-budget burn-rate alert over a 1-hour window.
    - name: slo-burn-rate
      rules:
        # The 1h 5xx ratio of a service has exceeded 14.4 * 0.001 (1.44%)
        # for 5 minutes.
        # NOTE(review): 0.001 is presumably the error budget of a 99.9%
        # availability SLO and 14.4 the fast-burn factor — confirm against
        # the SLO definition.
        # Both sides are aggregated with `sum by (service)` before dividing.
        # Without aggregation, `/` matches series one-to-one on every label,
        # including `code`, so each 5xx series was divided by itself: the
        # ratio was always 1 and any 5xx traffic tripped the alert.
        - alert: HighErrorBudgetBurn
          expr: |
            (
              sum by (service) (rate(http_requests_total{code=~"5.."}[1h]))
              /
              sum by (service) (rate(http_requests_total[1h]))
            ) > (14.4 * 0.001)
          for: 5m
          labels:
            severity: critical
            team: platform
          annotations:
            summary: "Error budget burning too fast — 1h burn rate exceeds 14.4x threshold"
            # One alert is raised per service; name it and show the ratio.
            description: "{{ $labels.service }} is failing {{ $value | humanizePercentage }} of requests over the last hour"