File size: 2,527 Bytes
d00203b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# ServiceMonitor: tells the Prometheus Operator to scrape the VoiceForge
# backend Service's /metrics endpoint every 30 seconds.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: voiceforge-backend
  namespace: monitoring
  labels:
    app: voiceforge-backend
    # NOTE(review): presumably matched by the Prometheus resource's
    # serviceMonitorSelector - confirm against the Prometheus installation.
    release: prometheus
spec:
  # Pick Services labelled app=voiceforge-backend ...
  selector:
    matchLabels:
      app: voiceforge-backend
  # ... but only inside the voiceforge namespace.
  namespaceSelector:
    matchNames:
      - voiceforge
  # Copy the Service's `app` label onto every scraped series. The alert
  # expressions in this file select on app="voiceforge-backend"; without this
  # the operator only attaches job/namespace/service/pod/container/endpoint,
  # so those selectors would match nothing unless the application exports an
  # `app` label itself.
  targetLabels:
    - app
  endpoints:
    - port: http
      path: /metrics
      interval: 30s
      # Must stay below `interval`; a scrape that takes longer is abandoned.
      scrapeTimeout: 10s
---
# PrometheusRule: alerting rules for the VoiceForge workloads, all kept in a
# single rule group named voiceforge.rules.
kind: PrometheusRule
apiVersion: monitoring.coreos.com/v1
metadata:
  namespace: monitoring
  name: voiceforge-alerts
  labels:
    # NOTE(review): presumably matched by the Prometheus resource's
    # ruleSelector - confirm against the Prometheus installation.
    release: prometheus
    app: voiceforge
spec:
  groups:
    - name: voiceforge.rules
      # Each entry below is one independent alert definition.
      rules:
        # Critical when more than 5% of backend requests over the trailing five
        # minutes carried a 5xx status, and that holds for five minutes.
        - alert: HighErrorRate
          annotations:
            description: 'Error rate is {{ $value | humanizePercentage }} (threshold: 5%)'
            summary: 'High error rate on VoiceForge API'
          labels:
            severity: critical
          for: 5m
          # 5xx request rate divided by the total request rate. The folded
          # scalar joins the lines below with single spaces.
          expr: >-
            sum(rate(http_requests_total{app="voiceforge-backend",status=~"5.."}[5m]))
            / sum(rate(http_requests_total{app="voiceforge-backend"}[5m]))
            > 0.05

        # Warns when the service-wide 95th-percentile request latency stays above
        # two seconds for five minutes.
        - alert: HighLatency
          # Bucket rates are summed by `le` before the quantile is taken, so a
          # single p95 is computed across all series of the backend. Without the
          # aggregation histogram_quantile runs once per label set and raises a
          # separate alert for every series that crosses the threshold.
          expr: |-
            histogram_quantile(
              0.95,
              sum by (le) (
                rate(http_request_duration_seconds_bucket{app="voiceforge-backend"}[5m])
              )
            ) > 2
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "High latency on VoiceForge API"
            description: "p95 latency is {{ $value }}s (threshold: 2s)"

        # Warns when a container in the voiceforge namespace has restarted more
        # than three times within the last hour, and that holds for five minutes.
        - alert: PodRestarts
          annotations:
            description: 'Pod {{ $labels.pod }} has restarted {{ $value }} times in the last hour'
            summary: 'Pod restarts detected'
          labels:
            severity: warning
          for: 5m
          # Growth of the restart counter over a one-hour window. The folded
          # scalar joins the lines below with a single space.
          expr: >-
            increase(kube_pod_container_status_restarts_total{namespace="voiceforge"}[1h])
            > 3

        # Warns when VoiceForge pods burn, on average, more than 0.8 CPU cores
        # for ten minutes.
        - alert: HighCPU
          # container_cpu_usage_seconds_total is a cumulative counter of CPU
          # seconds, so it must go through rate() to become "cores in use".
          # Comparing the raw counter against 0.8 would fire permanently once a
          # container has consumed 0.8 CPU-seconds in total.
          expr: avg(rate(container_cpu_usage_seconds_total{pod=~"voiceforge-.*"}[5m])) > 0.8
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High CPU usage on VoiceForge pods"
            # $value is in cores, so the percentage is relative to one core.
            description: "Average CPU usage is {{ $value | humanizePercentage }}"

        # Critical when VoiceForge containers use, on average, more than 90% of
        # their configured memory limit for ten minutes.
        - alert: HighMemory
          # The usage/limit ratio is computed per container and only then
          # averaged. The `> 0` filter drops containers whose limit is reported
          # as 0 (no limit configured); dividing by those would produce +Inf and
          # keep this critical alert firing permanently. Both sides are reduced
          # to (namespace, pod, container) so the division matches one-to-one.
          # NOTE(review): container_memory_usage_bytes includes page cache;
          # consider container_memory_working_set_bytes - confirm intent.
          expr: |-
            avg(
              sum by (namespace, pod, container) (
                container_memory_usage_bytes{pod=~"voiceforge-.*"}
              )
              /
              (
                sum by (namespace, pod, container) (
                  container_spec_memory_limit_bytes{pod=~"voiceforge-.*"}
                ) > 0
              )
            ) > 0.9
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "High memory usage on VoiceForge pods"
            description: "Memory usage is at {{ $value | humanizePercentage }}"