saemstunes committed on
Commit
5e1e0b6
·
verified ·
1 Parent(s): 780d2f2

Update monitoring/alerts.yml

Browse files
Files changed (1) hide show
  1. monitoring/alerts.yml +103 -17
monitoring/alerts.yml CHANGED
@@ -1,38 +1,124 @@
1
  groups:
2
- - name: saems_ai_alerts
3
  rules:
4
  - alert: HighErrorRate
5
- expr: rate(ai_inference_errors_total[5m]) > 0.05
6
- for: 2m
7
  labels:
8
  severity: warning
 
9
  annotations:
10
- summary: "High error rate detected"
11
- description: "Error rate is above 5% for the last 5 minutes"
12
 
13
- - alert: SlowResponseTime
14
- expr: ai_inference_duration_seconds{quantile="0.95"} > 10
15
  for: 2m
16
  labels:
17
- severity: warning
 
18
  annotations:
19
- summary: "Slow response times detected"
20
- description: "95th percentile response time is above 10 seconds"
21
 
22
- - alert: HighCPUUsage
23
- expr: ai_system_cpu_percent > 80
24
  for: 5m
25
  labels:
26
  severity: warning
 
27
  annotations:
28
- summary: "High CPU usage"
29
- description: "CPU usage is above 80% for 5 minutes"
30
 
31
- - alert: HighMemoryUsage
32
  expr: ai_system_memory_percent > 90
33
  for: 5m
34
  labels:
35
- severity: critical
 
36
  annotations:
37
  summary: "High memory usage"
38
- description: "Memory usage is above 90% for 5 minutes"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  groups:
2
+ - name: saems_tunes_ai_alerts
3
  rules:
4
  - alert: HighErrorRate
5
+ expr: ai_error_rate_percent > 10
6
+ for: 5m
7
  labels:
8
  severity: warning
9
+ service: ai-system
10
  annotations:
11
+ summary: "High error rate in AI system"
12
+ description: "AI system error rate is {{ $value }}% for more than 5 minutes"
13
 
14
+ - alert: CriticalErrorRate
15
+ expr: ai_error_rate_percent > 25
16
  for: 2m
17
  labels:
18
+ severity: critical
19
+ service: ai-system
20
  annotations:
21
+ summary: "Critical error rate in AI system"
22
+ description: "AI system error rate is {{ $value }}% for more than 2 minutes"
23
 
24
+ - alert: HighResponseTime
25
+ expr: ai_response_time_95th_percentile > 30
26
  for: 5m
27
  labels:
28
  severity: warning
29
+ service: ai-system
30
  annotations:
31
+ summary: "High response time in AI system"
32
+ description: "AI system 95th percentile response time is {{ $value }}s"
33
 
34
+ - alert: SystemMemoryHigh
35
  expr: ai_system_memory_percent > 90
36
  for: 5m
37
  labels:
38
+ severity: warning
39
+ service: infrastructure
40
  annotations:
41
  summary: "High memory usage"
42
+ description: "System memory usage is {{ $value }}%"
43
+
44
+ - alert: SystemCPUHigh
45
+ expr: ai_system_cpu_percent > 85
46
+ for: 5m
47
+ labels:
48
+ severity: warning
49
+ service: infrastructure
50
+ annotations:
51
+ summary: "High CPU usage"
52
+ description: "System CPU usage is {{ $value }}%"
53
+
54
+ - alert: DiskSpaceLow
55
+ expr: ai_system_disk_percent > 90
56
+ for: 2m
57
+ labels:
58
+ severity: critical
59
+ service: infrastructure
60
+ annotations:
61
+ summary: "Low disk space"
62
+ description: "Disk usage is {{ $value }}%"
63
+
64
+ - alert: ModelInferenceErrors
65
+ expr: rate(ai_inference_requests_total{status="error"}[5m]) > 0.1
66
+ for: 3m
67
+ labels:
68
+ severity: warning
69
+ service: ai-system
70
+ annotations:
71
+ summary: "High model inference error rate"
72
+ description: "Model inference error rate is high"
73
+
74
+ - alert: CacheHitRateLow
75
+ expr: ai_cache_hit_rate_percent < 20
76
+ for: 10m
77
+ labels:
78
+ severity: warning
79
+ service: ai-system
80
+ annotations:
81
+ summary: "Low cache hit rate"
82
+ description: "Cache hit rate is {{ $value }}%"
83
+
84
+ - alert: ThroughputSpike
85
+ expr: rate(ai_inference_requests_total[5m]) > 100
86
+ for: 2m
87
+ labels:
88
+ severity: warning
89
+ service: ai-system
90
+ annotations:
91
+ summary: "High request throughput"
92
+ description: "Request throughput is {{ $value }} requests/second"
93
+
94
+ - alert: SecurityThreatDetected
95
+ expr: rate(security_threats_total[5m]) > 5
96
+ for: 1m
97
+ labels:
98
+ severity: critical
99
+ service: security
100
+ annotations:
101
+ summary: "High security threat rate"
102
+ description: "Security threat detection rate is {{ $value }} threats/second"
103
+
104
+ - name: supabase_alerts
105
+ rules:
106
+ - alert: SupabaseConnectionFailed
107
+ expr: up{job="saems-tunes-ai"} == 0
108
+ for: 1m
109
+ labels:
110
+ severity: critical
111
+ service: database
112
+ annotations:
113
+ summary: "Supabase connection failed"
114
+ description: "Cannot connect to Supabase database"
115
+
116
+ - alert: SupabaseHighLatency
117
+ expr: supabase_query_duration_seconds{quantile="0.95"} > 5
118
+ for: 5m
119
+ labels:
120
+ severity: warning
121
+ service: database
122
+ annotations:
123
+ summary: "High Supabase query latency"
124
+ description: "Supabase 95th percentile query latency is {{ $value }}s"