# Prometheus alerting rules built on the hopcroft_* request metrics and the
# built-in `up` series. Each rule must hold for its `for:` duration before the
# alert fires.
groups:
  - name: hopcroft_alerts
    rules:
      # A scrape target has reported up == 0 continuously for one minute.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.instance }} is down"
          description: "The job {{ $labels.job }} has been down for more than 1 minute."

      # Share of requests answered with a 5xx status, computed per instance.
      # Both sides aggregate `by (instance)` so that the `instance` label
      # survives the sum() and is available to the summary template; a bare
      # sum() drops every label and would render an empty instance name.
      # A series with no traffic yields NaN, which does not satisfy `> 0.1`.
      - alert: HighErrorRate
        expr: |
          sum by (instance) (rate(hopcroft_requests_total{http_status=~"5.."}[5m]))
          /
          sum by (instance) (rate(hopcroft_requests_total[5m]))
          > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.instance }}"
          description: "Error rate is above 10% for the last 5 minutes (current value: {{ $value | printf \"%.2f\" }})."

      # 95th-percentile request latency per endpoint, estimated from the
      # duration histogram buckets; `le` must be kept for histogram_quantile.
      # The folded scalar (>-) joins these lines back into a single-line query.
      - alert: SlowRequests
        expr: >-
          histogram_quantile(0.95,
          sum by (le, endpoint) (rate(hopcroft_request_duration_seconds_bucket[5m])))
          > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow requests on {{ $labels.endpoint }}"
          description: "95th percentile of request latency is above 2s (current value: {{ $value | printf \"%.2f\" }}s)."