STA-AI / monitoring /alerts.yml
saemstunes's picture
Update monitoring/alerts.yml
5e1e0b6 verified
groups:
# Alerting rules for the AI service: error rates, latency, host resources,
# inference errors, cache efficiency, throughput and security events.
- name: saems_tunes_ai_alerts
  rules:
  # Warning once ai_error_rate_percent has stayed above 10 for 5 minutes.
  - alert: HighErrorRate
    expr: ai_error_rate_percent > 10
    for: 5m
    labels:
      severity: warning
      service: ai-system
    annotations:
      summary: "High error rate in AI system"
      description: "AI system error rate is {{ $value }}% for more than 5 minutes"

  # Critical escalation of HighErrorRate: above 25 for 2 minutes.
  - alert: CriticalErrorRate
    expr: ai_error_rate_percent > 25
    for: 2m
    labels:
      severity: critical
      service: ai-system
    annotations:
      summary: "Critical error rate in AI system"
      description: "AI system error rate is {{ $value }}% for more than 2 minutes"

  # Warning once ai_response_time_95th_percentile has stayed above 30 for
  # 5 minutes (the description reports the value in seconds).
  - alert: HighResponseTime
    expr: ai_response_time_95th_percentile > 30
    for: 5m
    labels:
      severity: warning
      service: ai-system
    annotations:
      summary: "High response time in AI system"
      description: "AI system 95th percentile response time is {{ $value }}s"

  # Warning once ai_system_memory_percent has stayed above 90 for 5 minutes.
  - alert: SystemMemoryHigh
    expr: ai_system_memory_percent > 90
    for: 5m
    labels:
      severity: warning
      service: infrastructure
    annotations:
      summary: "High memory usage"
      description: "System memory usage is {{ $value }}%"

  # Warning once ai_system_cpu_percent has stayed above 85 for 5 minutes.
  - alert: SystemCPUHigh
    expr: ai_system_cpu_percent > 85
    for: 5m
    labels:
      severity: warning
      service: infrastructure
    annotations:
      summary: "High CPU usage"
      description: "System CPU usage is {{ $value }}%"

  # Critical once ai_system_disk_percent (disk *usage*) has stayed above 90
  # for 2 minutes.
  - alert: DiskSpaceLow
    expr: ai_system_disk_percent > 90
    for: 2m
    labels:
      severity: critical
      service: infrastructure
    annotations:
      summary: "Low disk space"
      description: "Disk usage is {{ $value }}%"

  # Warning once the per-second rate of inference requests with
  # status="error" (averaged over 5m) has stayed above 0.1 for 3 minutes.
  - alert: ModelInferenceErrors
    expr: rate(ai_inference_requests_total{status="error"}[5m]) > 0.1
    for: 3m
    labels:
      severity: warning
      service: ai-system
    annotations:
      summary: "High model inference error rate"
      description: "Model inference error rate is high"

  # Warning once ai_cache_hit_rate_percent has stayed below 20 for 10 minutes.
  - alert: CacheHitRateLow
    expr: ai_cache_hit_rate_percent < 20
    for: 10m
    labels:
      severity: warning
      service: ai-system
    annotations:
      summary: "Low cache hit rate"
      description: "Cache hit rate is {{ $value }}%"

  # Warning once the request rate has stayed above 100 for 2 minutes.
  # rate() yields a per-second average, so $value is requests/second.
  # NOTE(review): if the threshold was meant as 100 requests per *minute*,
  # the expression needs `* 60` - confirm the intended unit.
  - alert: ThroughputSpike
    expr: rate(ai_inference_requests_total[5m]) > 100
    for: 2m
    labels:
      severity: warning
      service: ai-system
    annotations:
      summary: "High request throughput"
      description: "Request throughput is {{ $value }} requests/second"

  # Critical once the rate of security_threats_total has stayed above 5 for
  # 1 minute. rate() yields a per-second average, so $value is threats/second.
  # NOTE(review): if the threshold was meant as 5 threats per *minute*,
  # the expression needs `* 60` - confirm the intended unit.
  - alert: SecurityThreatDetected
    expr: rate(security_threats_total[5m]) > 5
    for: 1m
    labels:
      severity: critical
      service: security
    annotations:
      summary: "High security threat rate"
      description: "Security threat detection rate is {{ $value }} threats/second"
# Alerting rules labelled service=database (Supabase-related).
- name: supabase_alerts
  rules:
  # Critical once the `up` series for scrape job "saems-tunes-ai" has been 0
  # for 1 minute. `up` is Prometheus's own scrape-health metric, so this
  # detects that the job's target cannot be scraped; it does not probe
  # Supabase directly.
  # NOTE(review): alert name and summary are kept for routing compatibility;
  # consider a dedicated Supabase connectivity metric if one is exported.
  - alert: SupabaseConnectionFailed
    expr: up{job="saems-tunes-ai"} == 0
    for: 1m
    labels:
      severity: critical
      service: database
    annotations:
      summary: "Supabase connection failed"
      description: "Scrape target for job saems-tunes-ai is down; the service and its Supabase connectivity cannot be verified"

  # Warning once the 0.95-quantile series of supabase_query_duration_seconds
  # has stayed above 5 for 5 minutes.
  - alert: SupabaseHighLatency
    expr: supabase_query_duration_seconds{quantile="0.95"} > 5
    for: 5m
    labels:
      severity: warning
      service: database
    annotations:
      summary: "High Supabase query latency"
      description: "Supabase 95th percentile query latency is {{ $value }}s"