---
# ServiceMonitor: scrapes the VoiceForge backend's /metrics endpoint.
# Selects Services labelled app=voiceforge-backend in the "voiceforge"
# namespace and scrapes their "http" port every 30s with a 10s timeout.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: voiceforge-backend
  namespace: monitoring
  labels:
    app: voiceforge-backend
    # NOTE(review): presumably matched by the Prometheus instance's
    # serviceMonitorSelector - confirm against the Prometheus CR.
    release: prometheus
spec:
  selector:
    matchLabels:
      app: voiceforge-backend
  namespaceSelector:
    matchNames:
      - voiceforge
  endpoints:
    - port: http
      path: /metrics
      interval: 30s
      scrapeTimeout: 10s
---
# PrometheusRule: alerting rules for the VoiceForge backend and its pods.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: voiceforge-alerts
  namespace: monitoring
  labels:
    app: voiceforge
    release: prometheus
spec:
  groups:
    - name: voiceforge.rules
      rules:
        # Ratio of 5xx responses to all responses over 5m exceeds 5%.
        - alert: HighErrorRate
          expr: >-
            sum(rate(http_requests_total{app="voiceforge-backend",status=~"5.."}[5m]))
            / sum(rate(http_requests_total{app="voiceforge-backend"}[5m]))
            > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "High error rate on VoiceForge API"
            description: "Error rate is {{ $value | humanizePercentage }} (threshold: 5%)"

        # p95 request duration over 5m exceeds 2 seconds.
        # NOTE(review): without `sum by (le) (...)` the quantile is computed
        # per individual bucket series rather than across the whole service -
        # confirm that per-series alerting is intended.
        - alert: HighLatency
          expr: >-
            histogram_quantile(0.95,
            rate(http_request_duration_seconds_bucket{app="voiceforge-backend"}[5m]))
            > 2
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "High latency on VoiceForge API"
            description: "p95 latency is {{ $value }}s (threshold: 2s)"

        # A container in the voiceforge namespace restarted more than
        # 3 times within the last hour.
        - alert: PodRestarts
          expr: >-
            increase(kube_pod_container_status_restarts_total{namespace="voiceforge"}[1h])
            > 3
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod restarts detected"
            description: "Pod {{ $labels.pod }} has restarted {{ $value }} times in the last hour"

        # container_cpu_usage_seconds_total is a cumulative counter, so it
        # must be wrapped in rate() to obtain CPU cores in use; comparing the
        # raw counter to 0.8 would fire permanently once any container had
        # accumulated 0.8 CPU-seconds. The threshold is therefore 0.8 cores
        # averaged over the matching series.
        # NOTE(review): if the intent is "80% of the CPU limit", divide by
        # the containers' CPU limit instead - confirm.
        - alert: HighCPU
          expr: >-
            avg(rate(container_cpu_usage_seconds_total{pod=~"voiceforge-.*"}[5m]))
            > 0.8
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High CPU usage on VoiceForge pods"
            description: "Average CPU usage is {{ $value | humanizePercentage }}"

        # Average memory usage relative to the average memory limit of the
        # matching series exceeds 90%.
        # NOTE(review): series for containers without a memory limit may
        # report a limit of 0 and skew the average - verify.
        - alert: HighMemory
          expr: >-
            avg(container_memory_usage_bytes{pod=~"voiceforge-.*"})
            / avg(container_spec_memory_limit_bytes{pod=~"voiceforge-.*"})
            > 0.9
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "High memory usage on VoiceForge pods"
            description: "Memory usage is at {{ $value | humanizePercentage }}"