shaikhsalman committed on
Commit
5a0146e
·
verified ·
1 Parent(s): 20b0f17

Upload k8s/base/slos/slos.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. k8s/base/slos/slos.yaml +68 -0
k8s/base/slos/slos.yaml ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# =============================================================================
# Service Level Objectives — Platform SLOs
# =============================================================================
# SLOs define reliability targets. Error budgets = 100% - SLO.
# Burn rate alerts fire when error budget is consumed too fast.
#
# Conventions used throughout this file:
#   * SLO target       : 99.95% availability  -> error budget ratio = 0.0005
#   * SLO window       : 30 days
#   * Burn rate        : observed error ratio / error budget ratio
#                        (1x = budget lasts exactly the 30d window)
# The literal 0.0005 appears in several expressions below; if the SLO target
# changes, every occurrence must be updated together.
# =============================================================================

# --- API Availability SLO: 99.95% (21.9 min/month error budget) ---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: slo-api-availability
  namespace: monitoring
  labels:
    release: kube-prometheus-stack
    slo: "true"
spec:
  groups:
    - name: slo.api.availability
      rules:
        # SLI, short window: fraction of requests over the last 5m whose
        # `code` label is not 5xx.
        - record: slo:api_availability:rate5m
          expr: |
            sum(rate(http_requests_total{code!~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))

        # SLI, long window: same ratio over the last 1h. Used as the primary
        # window of the fast-burn alert below.
        - record: slo:api_availability:rate1h
          expr: |
            sum(rate(http_requests_total{code!~"5.."}[1h]))
            /
            sum(rate(http_requests_total[1h]))

        # Fast-burn alert (multi-window): 14.4x burn rate.
        #   14.4x sustained for 1h consumes 2% of the 30d budget
        #   (14.4 * 1h / 720h), i.e. the whole budget in ~2 days.
        # The 1h window establishes that the burn is significant; the 5m
        # window makes the alert stop firing soon after the errors stop.
        # Threshold = burn rate * error budget ratio = 14.4 * 0.0005.
        - alert: SLOAPIAvailabilityBurnRateHigh
          expr: |
            (
              (1 - slo:api_availability:rate1h) > (14.4 * 0.0005)
            )
            and
            (
              (1 - slo:api_availability:rate5m) > (14.4 * 0.0005)
            )
          for: 5m
          labels:
            severity: critical
            slo: api-availability
          annotations:
            summary: "API availability SLO budget burning too fast"
            runbook: "https://runbook.platform.internal/slo-api-burn"

    - name: slo.api.latency
      rules:
        # Latency SLI: P99 request duration (seconds) over the last 5m,
        # aggregated across all series of the histogram.
        # Latency SLO: P99 < 2s.
        - record: slo:api_latency_p99:rate5m
          expr: |
            histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))

        # Threshold alert: fires when P99 stays above 2s for 10m.
        # NOTE: despite the name this is not a burn-rate alert; the name is
        # kept unchanged so existing routing/silences keep matching.
        - alert: SLOAPILatencyBurnRateHigh
          expr: |
            slo:api_latency_p99:rate5m > 2
          for: 10m
          labels:
            severity: warning
            slo: api-latency
          annotations:
            summary: "API P99 latency exceeding 2s SLO"

    - name: slo.error_budget
      # The 30d range queries below are expensive; evaluate this group less
      # often than the global default.
      interval: 5m
      rules:
        # SLI over the full SLO window: fraction of non-5xx requests in the
        # last 30d. NOTE(review): exact only if Prometheus retention is at
        # least 30d; with shorter retention it covers the retained period.
        - record: slo:api_availability:rate30d
          expr: |
            sum(rate(http_requests_total{code!~"5.."}[30d]))
            /
            sum(rate(http_requests_total[30d]))

        # Remaining error budget as a ratio over the 30d window:
        #    1  = budget untouched
        #    0  = budget exactly exhausted
        #   <0  = budget overspent
        # Computed as 1 - (observed error ratio / allowed error ratio).
        - record: slo:error_budget_remaining:ratio
          expr: |
            1 - (
              (1 - slo:api_availability:rate30d)
              /
              0.0005
            )