# smartclass-ops/monitoring/smartclass_alerts.yml
# balaji958685's picture
# Add Prometheus alert rules
# 359fa21 verified
# SmartClass Alert Rules for Prometheus
groups:
- name: smartclass_edge_alerts
  # Edge-node rules: pipeline performance, hardware health, and connectivity.
  rules:
  # ─── Edge Pipeline Performance ──────────────────────────────────
  # Fires when the reported FPS stays below 5 for 60s.
  - alert: SmartClassLowFPS
    expr: smartclass_edge_fps < 5
    for: 60s
    labels:
      severity: warning
      team: edge
    annotations:
      summary: "Low FPS on edge node {{ $labels.instance }}"
      # NOTE(review): assumes the series carries a `section` label; confirm.
      description: >-
        Edge node {{ $labels.instance }} (section {{ $labels.section }})
        has FPS of {{ $value | printf "%.1f" }} (threshold: 5).
        Check pipeline load, camera feed, or CPU throttling.
      runbook_url: "https://docs.smartclass.internal/runbooks/low-fps"
  # Fires when recognition latency stays above 0.05s (50ms) for 60s.
  - alert: SmartClassHighRecognitionLatency
    expr: smartclass_recognition_latency_seconds > 0.05
    for: 60s
    labels:
      severity: warning
      team: edge
    annotations:
      summary: "High recognition latency on {{ $labels.instance }}"
      # The value is rendered in seconds; the threshold text is in ms.
      description: >-
        Recognition latency is {{ $value | printf "%.3f" }}s
        (threshold: 50ms) on {{ $labels.instance }}.
        Consider optimizing the model or reducing concurrent faces.
  # ─── Hardware Health ────────────────────────────────────────────
  # Fires when the CPU temperature stays above 80 for 60s.
  - alert: SmartClassHighCPUTemp
    expr: smartclass_cpu_temperature_celsius > 80
    for: 60s
    labels:
      severity: critical
      team: infra
    annotations:
      summary: "Critical CPU temperature on {{ $labels.instance }}"
      description: >-
        CPU temperature is {{ $value | printf "%.1f" }}°C
        (threshold: 80°C) on {{ $labels.instance }}.
        Check cooling system, reduce workload, or shut down to prevent damage.
      runbook_url: "https://docs.smartclass.internal/runbooks/high-cpu-temp"
  # Fires when memory usage stays above 85 (percent) for 60s.
  - alert: SmartClassHighMemoryUsage
    expr: smartclass_memory_usage_percent > 85
    for: 60s
    labels:
      severity: warning
      team: infra
    annotations:
      summary: "High memory usage on {{ $labels.instance }}"
      description: >-
        Memory usage is {{ $value | printf "%.1f" }}%
        (threshold: 85%) on {{ $labels.instance }}.
        Check for memory leaks, large FAISS indices, or excessive buffering.
  # ─── Connectivity ──────────────────────────────────────────────
  # Fires when the offline queue holds more than 1000 events for 2m.
  - alert: SmartClassOfflineQueueBacklog
    expr: smartclass_offline_queue_size > 1000
    for: 2m
    labels:
      severity: warning
      team: infra
    annotations:
      summary: "Large offline queue on {{ $labels.instance }}"
      description: >-
        Offline queue has {{ $value }} events pending
        (threshold: 1000) on {{ $labels.instance }}.
        Check network connectivity to Redis/central server.
      runbook_url: "https://docs.smartclass.internal/runbooks/offline-queue"
  # Fires when a smartclass-edge scrape target reports up == 0 for 2m.
  - alert: SmartClassEdgeUnreachable
    expr: up{job="smartclass-edge"} == 0
    for: 2m
    labels:
      severity: critical
      team: infra
    annotations:
      summary: "Edge node {{ $labels.instance }} is unreachable"
      # NOTE(review): assumes the target carries a `section` label; confirm.
      description: >-
        Edge node {{ $labels.instance }} (section {{ $labels.section }})
        has been unreachable for more than 2 minutes.
        Check device power, network, and Docker container status.
      runbook_url: "https://docs.smartclass.internal/runbooks/edge-unreachable"
- name: smartclass_api_alerts
  # API-server rules: request latency, 5xx error ratio, and availability.
  rules:
  # ─── API Server Health ─────────────────────────────────────────
  # Fires when the 5m-window p95 request duration stays above 2s for 60s.
  # NOTE(review): with no aggregation inside histogram_quantile, the quantile
  # is computed separately for every label set of the histogram; wrap the
  # rate() in `sum by (le) (...)` if one service-wide p95 is intended.
  - alert: SmartClassAPIHighLatency
    expr: histogram_quantile(0.95, rate(smartclass_api_request_duration_seconds_bucket[5m])) > 2
    for: 60s
    labels:
      severity: warning
      team: backend
    annotations:
      summary: "High API latency (p95 > 2s)"
      description: >-
        95th percentile API latency is {{ $value | printf "%.2f" }}s.
        Check database query performance and Redis connectivity.
  # Fires when more than 5% of requests over the last 5m returned 5xx,
  # sustained for 60s.
  # Both sides are aggregated with sum() so the division is between two
  # label-less vectors. Dividing the raw rates would match each 5xx series
  # only against itself, giving a ratio of 1 whenever any 5xx occurs.
  - alert: SmartClassAPIHighErrorRate
    expr: |-
      sum(rate(smartclass_api_requests_total{status=~"5.."}[5m]))
        /
      sum(rate(smartclass_api_requests_total[5m]))
        > 0.05
    for: 60s
    labels:
      severity: critical
      team: backend
    annotations:
      summary: "High API error rate (>5%)"
      # $value is a 0-1 ratio; humanizePercentage renders it as a percentage.
      description: >-
        API 5xx error rate is {{ $value | humanizePercentage }}.
        Check API logs, database health, and service dependencies.
  # Fires when the smartclass-api scrape target reports up == 0 for 30s.
  - alert: SmartClassAPIDown
    expr: up{job="smartclass-api"} == 0
    for: 30s
    labels:
      severity: critical
      team: backend
    annotations:
      summary: "SmartClass API is down"
      description: >-
        The SmartClass API server is unreachable.
        Check container status: docker compose ps api