Spaces:

DaCrow13
/

Hopcroft-Skill-Classification

Running

DaCrow13

feat: Add Prometheus and Alertmanager services with alert rules, configurations, and a verification report.

a0e85b1 about 1 month ago

1.19 kB

	groups:
	- name: hopcroft_alerts
	rules:
	- alert: ServiceDown
	expr: up == 0
	for: 1m
	labels:
	severity: critical
	annotations:
	summary: "Service {{ $labels.instance }} is down"
	description: "The job {{ $labels.job }} has been down for more than 1 minute."

	- alert: HighErrorRate
	expr: \|
	sum(rate(hopcroft_requests_total{http_status=~"5.."}[5m]))
	/
	sum(rate(hopcroft_requests_total[5m])) > 0.1
	for: 5m
	labels:
	severity: warning
	annotations:
	summary: "High error rate on {{ $labels.instance }}"
	description: "Error rate is above 10% for the last 5 minutes (current value: {{ $value \| printf \"%.2f\" }})."

	- alert: SlowRequests
	expr: histogram_quantile(0.95, sum by (le, endpoint) (rate(hopcroft_request_duration_seconds_bucket[5m]))) > 2
	for: 5m
	labels:
	severity: warning
	annotations:
	summary: "Slow requests on {{ $labels.endpoint }}"
	description: "95th percentile of request latency is above 2s (current value: {{ $value \| printf \"%.2f\" }}s)."