File size: 1,186 Bytes
a0e85b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# Alerting rule group. The groups -> rules -> alert/expr/for layout matches the
# Prometheus rule-file format (assumed consumer; confirm against the loader).
groups:
  - name: hopcroft_alerts
    rules:
      # Critical alert: the `up` series has been equal to 0 for one full minute.
      # The comparison keeps the series labels, so `instance` and `job` are
      # available to the annotation templates below.
      - alert: ServiceDown
        expr: 'up == 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Service {{ $labels.instance }} is down'
          description: >-
            The job {{ $labels.job }} has been down for more than 1 minute.

      # Warning alert: for a given instance, responses whose http_status matches
      # 5xx make up more than 10% of the 5-minute request rate, sustained for
      # 5 minutes.
      #
      # Both sums aggregate by `instance` on purpose: a bare sum() drops every
      # label, which would leave the instance reference in the summary empty.
      # The two sides then match one-to-one on `instance` for the division.
      - alert: HighErrorRate
        expr: |
          sum by (instance) (rate(hopcroft_requests_total{http_status=~"5.."}[5m]))
          /
          sum by (instance) (rate(hopcroft_requests_total[5m])) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.instance }}"
          # The expression yields a ratio (0.1 == 10%); humanizePercentage
          # renders it in the same unit the sentence uses.
          description: "Error rate is above 10% for the last 5 minutes (current value: {{ $value | humanizePercentage }})."

      # Warning alert: the 95th-percentile request duration, computed per
      # endpoint from 5-minute bucket rates, has stayed above 2 for 5 minutes.
      # `endpoint` is kept in the aggregation so the summary can name it; `le`
      # is kept because histogram_quantile needs the bucket boundaries.
      - alert: SlowRequests
        expr: >-
          histogram_quantile(0.95, sum by (le, endpoint) (rate(hopcroft_request_duration_seconds_bucket[5m]))) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Slow requests on {{ $labels.endpoint }}'
          description: '95th percentile of request latency is above 2s (current value: {{ $value | printf "%.2f" }}s).'