refactor: merged structure - model at center, DevSecOps wrapped around it

9d4d5c7 verified 26 days ago

3.94 kB

	# =============================================================================
	# Prometheus Alerting Rules — Platform Health
	# =============================================================================

	apiVersion: monitoring.coreos.com/v1
	kind: PrometheusRule
	metadata:
	name: platform-alerts
	namespace: monitoring
	labels:
	release: kube-prometheus-stack
	spec:
	groups:
	# --- Infrastructure Alerts ---
	- name: infrastructure
	rules:
	- alert: NodeDown
	expr: up{job="node-exporter"} == 0
	for: 5m
	labels:
	severity: critical
	team: platform
	annotations:
	summary: "Node {{ $labels.instance }} is down"
	runbook: "https://runbook.platform.internal/node-down"

	- alert: HighMemoryUsage
	expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
	for: 10m
	labels:
	severity: warning
	team: platform
	annotations:
	summary: "Node {{ $labels.instance }} has <10% memory available"

	- alert: DiskSpaceLow
	expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.15
	for: 10m
	labels:
	severity: warning
	team: platform
	annotations:
	summary: "Node {{ $labels.instance }} has <15% disk space"

	- alert: PodCrashLooping
	expr: rate(kube_pod_container_status_restarts_total[15m]) > 0
	for: 5m
	labels:
	severity: warning
	team: platform
	annotations:
	summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"

	# --- Application Alerts ---
	- name: application
	rules:
	- alert: HighErrorRate
	expr: rate(http_requests_total{code=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
	for: 5m
	labels:
	severity: critical
	team: app
	annotations:
	summary: "{{ $labels.service }} error rate >5%"
	runbook: "https://runbook.platform.internal/high-error-rate"

	- alert: HighLatency
	expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 2
	for: 10m
	labels:
	severity: warning
	team: app
	annotations:
	summary: "{{ $labels.service }} P99 latency >2s"

	- alert: DatabaseConnectionPoolExhausted
	expr: db_connection_pool_available < 2
	for: 5m
	labels:
	severity: critical
	team: app
	annotations:
	summary: "DB connection pool nearly exhausted"

	# --- Security Alerts ---
	- name: security
	rules:
	- alert: FalcoRuntimeAlert
	expr: falco_events_total{priority="Critical"} > 0
	for: 1m
	labels:
	severity: critical
	team: security
	annotations:
	summary: "Falco critical event: {{ $labels.rule }}"
	runbook: "https://runbook.platform.internal/falco-alert"

	- alert: TrivyCriticalVulnerability
	expr: trivy_vulnerability_id{severity="CRITICAL"} > 0
	for: 1h
	labels:
	severity: critical
	team: security
	annotations:
	summary: "Critical CVE {{ $labels.vulnerability_id }} in {{ $labels.image }}"

	# --- SLO Burn Rate Alerts ---
	- name: slo-burn-rate
	rules:
	- alert: HighErrorBudgetBurn
	expr: \|
	(
	rate(http_requests_total{code=~"5.."}[1h])
	/
	rate(http_requests_total[1h])
	) > (14.4 * 0.001)
	for: 5m
	labels:
	severity: critical
	team: platform
	annotations:
	summary: "Error budget burning too fast — 1h burn rate exceeds 14.4x threshold"