DaCrow13
feat: Add Prometheus and Alertmanager services with alert rules, configurations, and a verification report.
a0e85b1
# Prometheus alerting rules for the hopcroft_alerts group.
# Each rule must hold continuously for its `for:` duration before it fires.
groups:
  - name: hopcroft_alerts
    rules:
      # Fires when a scrape target's `up` series has been 0 for one minute.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.instance }} is down"
          description: "The job {{ $labels.job }} has been down for more than 1 minute."

      # Fires when more than 10% of requests returned a 5xx status over the
      # last 5 minutes. Aggregation keeps the `instance` label on both sides
      # of the division so that the summary template below can render it; a
      # bare sum() would drop every label and leave {{ $labels.instance }}
      # empty.
      - alert: HighErrorRate
        expr: |
          sum by (instance) (rate(hopcroft_requests_total{http_status=~"5.."}[5m]))
          /
          sum by (instance) (rate(hopcroft_requests_total[5m])) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.instance }}"
          description: "Error rate is above 10% for the last 5 minutes (current value: {{ $value | printf \"%.2f\" }})."

      # Fires when the 95th-percentile request duration, computed per
      # endpoint from the histogram buckets, stays above 2 seconds.
      - alert: SlowRequests
        expr: histogram_quantile(0.95, sum by (le, endpoint) (rate(hopcroft_request_duration_seconds_bucket[5m]))) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow requests on {{ $labels.endpoint }}"
          description: "95th percentile of request latency is above 2s (current value: {{ $value | printf \"%.2f\" }}s)."