# SmartClass Alert Rules for Prometheus
groups:
  - name: smartclass_edge_alerts
    rules:
      # ─── Edge Pipeline Performance ──────────────────────────────────
      # Warning for the edge team: the smartclass_edge_fps value has stayed
      # below 5 for the whole 60s hold period.
      - alert: SmartClassLowFPS
        expr: smartclass_edge_fps < 5
        for: 60s
        labels:
          team: edge
          severity: warning
        annotations:
          runbook_url: 'https://docs.smartclass.internal/runbooks/low-fps'
          summary: 'Low FPS on edge node {{ $labels.instance }}'
          # Folded scalar: the line breaks below are rendered as spaces.
          description: >
            Edge node {{ $labels.instance }}
            (section {{ $labels.section }}) has FPS of
            {{ $value | printf "%.1f" }} (threshold: 5).
            Check pipeline load, camera feed, or CPU throttling.

      # Warning for the edge team: smartclass_recognition_latency_seconds has
      # stayed above 0.05 (i.e. 50 ms) for the whole 60s hold period.
      - alert: SmartClassHighRecognitionLatency
        expr: smartclass_recognition_latency_seconds > 0.05
        for: 60s
        labels:
          team: edge
          severity: warning
        annotations:
          summary: 'High recognition latency on {{ $labels.instance }}'
          # Folded scalar: the line breaks below are rendered as spaces.
          description: >
            Recognition latency is
            {{ $value | printf "%.3f" }}s (threshold: 50ms)
            on {{ $labels.instance }}.
            Consider optimizing the model or reducing concurrent faces.

      # ─── Hardware Health ────────────────────────────────────────────
      # Critical alert for the infra team: smartclass_cpu_temperature_celsius
      # has stayed above 80 for the whole 60s hold period.
      - alert: SmartClassHighCPUTemp
        expr: smartclass_cpu_temperature_celsius > 80
        for: 60s
        labels:
          team: infra
          severity: critical
        annotations:
          runbook_url: 'https://docs.smartclass.internal/runbooks/high-cpu-temp'
          summary: 'Critical CPU temperature on {{ $labels.instance }}'
          # Folded scalar: the line breaks below are rendered as spaces.
          description: >
            CPU temperature is {{ $value | printf "%.1f" }}°C (threshold: 80°C)
            on {{ $labels.instance }}.
            Check cooling system, reduce workload,
            or shut down to prevent damage.

      # Warning for the infra team: smartclass_memory_usage_percent has stayed
      # above 85 for the whole 60s hold period.
      - alert: SmartClassHighMemoryUsage
        expr: smartclass_memory_usage_percent > 85
        for: 60s
        labels:
          team: infra
          severity: warning
        annotations:
          summary: 'High memory usage on {{ $labels.instance }}'
          # Folded scalar: the line breaks below are rendered as spaces.
          description: >
            Memory usage is {{ $value | printf "%.1f" }}% (threshold: 85%)
            on {{ $labels.instance }}.
            Check for memory leaks, large FAISS indices,
            or excessive buffering.

      # ─── Connectivity ──────────────────────────────────────────────
      # Warning for the infra team: smartclass_offline_queue_size has stayed
      # above 1000 for the whole 2m hold period.
      - alert: SmartClassOfflineQueueBacklog
        expr: smartclass_offline_queue_size > 1000
        for: 2m
        labels:
          severity: warning
          team: infra
        annotations:
          summary: "Large offline queue on {{ $labels.instance }}"
          # $value is a float64; printing it bare switches to scientific
          # notation for large backlogs (e.g. 1.5e+06). "%.0f" always renders
          # a plain whole number, matching how the other rules format $value.
          description: >
            Offline queue has {{ $value | printf "%.0f" }} events pending
            (threshold: 1000) on {{ $labels.instance }}.
            Check network connectivity to Redis/central server.
          runbook_url: "https://docs.smartclass.internal/runbooks/offline-queue"

      # Critical alert for the infra team: the `up` series of a target in the
      # "smartclass-edge" job has been 0 for the whole 2m hold period.
      - alert: SmartClassEdgeUnreachable
        expr: up{job="smartclass-edge"} == 0
        for: 2m
        labels:
          team: infra
          severity: critical
        annotations:
          runbook_url: 'https://docs.smartclass.internal/runbooks/edge-unreachable'
          summary: 'Edge node {{ $labels.instance }} is unreachable'
          # Folded scalar: the line breaks below are rendered as spaces.
          description: >
            Edge node {{ $labels.instance }}
            (section {{ $labels.section }})
            has been unreachable for more than 2 minutes.
            Check device power, network,
            and Docker container status.

  - name: smartclass_api_alerts
    rules:
      # ─── API Server Health ─────────────────────────────────────────
      # Warning for the backend team: the 95th-percentile request duration,
      # estimated from the histogram buckets over a 5m rate window, has stayed
      # above 2 seconds for the whole 60s hold period.
      # The buckets are not aggregated, so the quantile is evaluated
      # separately for every label combination of the bucket series.
      - alert: SmartClassAPIHighLatency
        expr: histogram_quantile(0.95, rate(smartclass_api_request_duration_seconds_bucket[5m])) > 2
        for: 60s
        labels:
          team: backend
          severity: warning
        annotations:
          summary: 'High API latency (p95 > 2s)'
          # Folded scalar: the line breaks below are rendered as spaces.
          description: >
            95th percentile API latency is
            {{ $value | printf "%.2f" }}s.
            Check database query performance
            and Redis connectivity.

      # Critical alert for the backend team: more than 5% of API requests over
      # the 5m rate window returned a 5xx status, sustained for 60s.
      #
      # Both sides of the division are wrapped in sum() on purpose. Without
      # aggregation, PromQL matches series one-to-one on identical label sets,
      # so each 5xx series would only be divided by itself (ratio always 1)
      # and the alert would fire on any sustained 5xx traffic at all.
      #
      # The ratio is scaled by 100 so that $value is a real percentage and the
      # "%" suffix in the description is accurate.
      - alert: SmartClassAPIHighErrorRate
        expr: >-
          100 * sum(rate(smartclass_api_requests_total{status=~"5.."}[5m]))
          / sum(rate(smartclass_api_requests_total[5m]))
          > 5
        for: 60s
        labels:
          severity: critical
          team: backend
        annotations:
          summary: "High API error rate (>5%)"
          description: >
            API 5xx error rate is {{ $value | printf "%.2f" }}%.
            Check API logs, database health, and service dependencies.

      # Critical alert for the backend team: the `up` series of a target in
      # the "smartclass-api" job has been 0 for the whole 30s hold period.
      - alert: SmartClassAPIDown
        expr: up{job="smartclass-api"} == 0
        for: 30s
        labels:
          team: backend
          severity: critical
        annotations:
          summary: 'SmartClass API is down'
          # Folded scalar: the line breaks below are rendered as spaces.
          description: >
            The SmartClass API server
            is unreachable.
            Check container status:
            docker compose ps api