# SmartClass Alert Rules for Prometheus
#
# Two rule groups are defined:
#   - smartclass_edge_alerts: pipeline performance, hardware health and
#     connectivity of the edge nodes.
#   - smartclass_api_alerts: latency, error rate and availability of the API.
#
# Every rule carries a `severity` and a `team` label. Annotations use
# Prometheus template syntax ({{ $labels.<name> }}, {{ $value }}).
---
groups:
  - name: smartclass_edge_alerts
    rules:
      # ─── Edge Pipeline Performance ──────────────────────────────────
      # Fires when the reported FPS stays below 5 for 60s.
      - alert: SmartClassLowFPS
        expr: smartclass_edge_fps < 5
        for: 60s
        labels:
          severity: warning
          team: edge
        annotations:
          summary: "Low FPS on edge node {{ $labels.instance }}"
          description: >
            Edge node {{ $labels.instance }} (section {{ $labels.section }})
            has FPS of {{ $value | printf "%.1f" }} (threshold: 5).
            Check pipeline load, camera feed, or CPU throttling.
          runbook_url: "https://docs.smartclass.internal/runbooks/low-fps"

      # Fires when recognition latency stays above 0.05s (50ms) for 60s.
      - alert: SmartClassHighRecognitionLatency
        expr: smartclass_recognition_latency_seconds > 0.05
        for: 60s
        labels:
          severity: warning
          team: edge
        annotations:
          summary: "High recognition latency on {{ $labels.instance }}"
          description: >
            Recognition latency is {{ $value | printf "%.3f" }}s
            (threshold: 50ms) on {{ $labels.instance }}.
            Consider optimizing the model or reducing concurrent faces.

      # ─── Hardware Health ────────────────────────────────────────────
      # Fires when the CPU temperature stays above 80°C for 60s.
      - alert: SmartClassHighCPUTemp
        expr: smartclass_cpu_temperature_celsius > 80
        for: 60s
        labels:
          severity: critical
          team: infra
        annotations:
          summary: "Critical CPU temperature on {{ $labels.instance }}"
          description: >
            CPU temperature is {{ $value | printf "%.1f" }}°C
            (threshold: 80°C) on {{ $labels.instance }}.
            Check cooling system, reduce workload, or shut down to prevent
            damage.
          runbook_url: "https://docs.smartclass.internal/runbooks/high-cpu-temp"

      # Fires when memory usage stays above 85% for 60s.
      - alert: SmartClassHighMemoryUsage
        expr: smartclass_memory_usage_percent > 85
        for: 60s
        labels:
          severity: warning
          team: infra
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: >
            Memory usage is {{ $value | printf "%.1f" }}%
            (threshold: 85%) on {{ $labels.instance }}.
            Check for memory leaks, large FAISS indices, or excessive
            buffering.

      # ─── Connectivity ──────────────────────────────────────────────
      # Fires when more than 1000 events stay queued for 2 minutes.
      - alert: SmartClassOfflineQueueBacklog
        expr: smartclass_offline_queue_size > 1000
        for: 2m
        labels:
          severity: warning
          team: infra
        annotations:
          summary: "Large offline queue on {{ $labels.instance }}"
          description: >
            Offline queue has {{ $value }} events pending (threshold: 1000)
            on {{ $labels.instance }}.
            Check network connectivity to Redis/central server.
          runbook_url: "https://docs.smartclass.internal/runbooks/offline-queue"

      # Fires when a target of the smartclass-edge job has failed its scrape
      # (up == 0) for 2 minutes.
      # NOTE(review): the description reads {{ $labels.section }}; confirm the
      # scrape config attaches a `section` label to these targets.
      - alert: SmartClassEdgeUnreachable
        expr: up{job="smartclass-edge"} == 0
        for: 2m
        labels:
          severity: critical
          team: infra
        annotations:
          summary: "Edge node {{ $labels.instance }} is unreachable"
          description: >
            Edge node {{ $labels.instance }} (section {{ $labels.section }})
            has been unreachable for more than 2 minutes.
            Check device power, network, and Docker container status.
          runbook_url: "https://docs.smartclass.internal/runbooks/edge-unreachable"

  - name: smartclass_api_alerts
    rules:
      # ─── API Server Health ─────────────────────────────────────────
      # Fires when the 95th percentile request duration over a 5m window
      # stays above 2s for 60s.
      # NOTE(review): without an aggregation the quantile is computed
      # separately for every label set of the bucket series; wrap the rate in
      # `sum by (le) (...)` if one service-wide p95 is intended — confirm.
      - alert: SmartClassAPIHighLatency
        expr: |
          histogram_quantile(
            0.95,
            rate(smartclass_api_request_duration_seconds_bucket[5m])
          ) > 2
        for: 60s
        labels:
          severity: warning
          team: backend
        annotations:
          summary: "High API latency (p95 > 2s)"
          description: >
            95th percentile API latency is {{ $value | printf "%.2f" }}s.
            Check database query performance and Redis connectivity.

      # Fires when more than 5% of requests over a 5m window are 5xx, for 60s.
      # Both sides are summed before dividing: the two selectors differ in
      # their `status` matcher, and one-to-one vector matching only pairs
      # series whose labels are all equal, so an un-aggregated division would
      # compare each 5xx series with itself and always yield 1.
      - alert: SmartClassAPIHighErrorRate
        expr: |
          sum(rate(smartclass_api_requests_total{status=~"5.."}[5m]))
            /
          sum(rate(smartclass_api_requests_total[5m]))
            > 0.05
        for: 60s
        labels:
          severity: critical
          team: backend
        annotations:
          summary: "High API error rate (>5%)"
          # $value is a ratio in [0, 1]; humanizePercentage renders it as a
          # percentage instead of printing e.g. "0.05%" for a 5% error rate.
          description: >
            API 5xx error rate is {{ $value | humanizePercentage }}.
            Check API logs, database health, and service dependencies.

      # Fires when the smartclass-api target has failed its scrape (up == 0)
      # for 30 seconds.
      - alert: SmartClassAPIDown
        expr: up{job="smartclass-api"} == 0
        for: 30s
        labels:
          severity: critical
          team: backend
        annotations:
          summary: "SmartClass API is down"
          description: >
            The SmartClass API server is unreachable.
            Check container status: docker compose ps api