# smartclass-ops/monitoring/smartclass_alerts.yml
#
# Add Prometheus alert rules
#
# 359fa21 verified 18 days ago
#
# 5.32 kB

	# SmartClass Alert Rules for Prometheus
	groups:
	- name: smartclass_edge_alerts
	rules:
	# ─── Edge Pipeline Performance ──────────────────────────────────
	- alert: SmartClassLowFPS
	expr: smartclass_edge_fps < 5
	for: 60s
	labels:
	severity: warning
	team: edge
	annotations:
	summary: "Low FPS on edge node {{ $labels.instance }}"
	description: >
	Edge node {{ $labels.instance }} (section {{ $labels.section }})
	has FPS of {{ $value \| printf "%.1f" }} (threshold: 5).
	Check pipeline load, camera feed, or CPU throttling.
	runbook_url: "https://docs.smartclass.internal/runbooks/low-fps"

	- alert: SmartClassHighRecognitionLatency
	expr: smartclass_recognition_latency_seconds > 0.05
	for: 60s
	labels:
	severity: warning
	team: edge
	annotations:
	summary: "High recognition latency on {{ $labels.instance }}"
	description: >
	Recognition latency is {{ $value \| printf "%.3f" }}s
	(threshold: 50ms) on {{ $labels.instance }}.
	Consider optimizing the model or reducing concurrent faces.

	# ─── Hardware Health ────────────────────────────────────────────
	- alert: SmartClassHighCPUTemp
	expr: smartclass_cpu_temperature_celsius > 80
	for: 60s
	labels:
	severity: critical
	team: infra
	annotations:
	summary: "Critical CPU temperature on {{ $labels.instance }}"
	description: >
	CPU temperature is {{ $value \| printf "%.1f" }}°C
	(threshold: 80°C) on {{ $labels.instance }}.
	Check cooling system, reduce workload, or shut down to prevent damage.
	runbook_url: "https://docs.smartclass.internal/runbooks/high-cpu-temp"

	- alert: SmartClassHighMemoryUsage
	expr: smartclass_memory_usage_percent > 85
	for: 60s
	labels:
	severity: warning
	team: infra
	annotations:
	summary: "High memory usage on {{ $labels.instance }}"
	description: >
	Memory usage is {{ $value \| printf "%.1f" }}%
	(threshold: 85%) on {{ $labels.instance }}.
	Check for memory leaks, large FAISS indices, or excessive buffering.

	# ─── Connectivity ──────────────────────────────────────────────
	- alert: SmartClassOfflineQueueBacklog
	expr: smartclass_offline_queue_size > 1000
	for: 2m
	labels:
	severity: warning
	team: infra
	annotations:
	summary: "Large offline queue on {{ $labels.instance }}"
	description: >
	Offline queue has {{ $value }} events pending
	(threshold: 1000) on {{ $labels.instance }}.
	Check network connectivity to Redis/central server.
	runbook_url: "https://docs.smartclass.internal/runbooks/offline-queue"

	- alert: SmartClassEdgeUnreachable
	expr: up{job="smartclass-edge"} == 0
	for: 2m
	labels:
	severity: critical
	team: infra
	annotations:
	summary: "Edge node {{ $labels.instance }} is unreachable"
	description: >
	Edge node {{ $labels.instance }} (section {{ $labels.section }})
	has been unreachable for more than 2 minutes.
	Check device power, network, and Docker container status.
	runbook_url: "https://docs.smartclass.internal/runbooks/edge-unreachable"

	- name: smartclass_api_alerts
	rules:
	# ─── API Server Health ─────────────────────────────────────────
	- alert: SmartClassAPIHighLatency
	expr: histogram_quantile(0.95, rate(smartclass_api_request_duration_seconds_bucket[5m])) > 2
	for: 60s
	labels:
	severity: warning
	team: backend
	annotations:
	summary: "High API latency (p95 > 2s)"
	description: >
	95th percentile API latency is {{ $value \| printf "%.2f" }}s.
	Check database query performance and Redis connectivity.

	- alert: SmartClassAPIHighErrorRate
	expr: rate(smartclass_api_requests_total{status=~"5.."}[5m]) / rate(smartclass_api_requests_total[5m]) > 0.05
	for: 60s
	labels:
	severity: critical
	team: backend
	annotations:
	summary: "High API error rate (>5%)"
	description: >
	API 5xx error rate is {{ $value \| printf "%.2f" }}%.
	Check API logs, database health, and service dependencies.

	- alert: SmartClassAPIDown
	expr: up{job="smartclass-api"} == 0
	for: 30s
	labels:
	severity: critical
	team: backend
	annotations:
	summary: "SmartClass API is down"
	description: >
	The SmartClass API server is unreachable.
	Check container status: docker compose ps api