Spaces:
Paused
Paused
# Prometheus alerting rules and Alertmanager configuration
# Prometheus alerting rules for the fraud-detection stack.
#
# Each rule has the same shape: `expr` is the PromQL condition, `for` is how
# long the condition must hold before the alert fires, `labels` attach the
# `severity` and `component` values, and `annotations` carry the
# human-readable summary and description.
#
# NOTE(review): this file also defines a top-level `alertmanager_config` key.
# Prometheus rule files normally contain only `groups`, so presumably a
# deployment step splits the two sections apart - confirm.
groups:
  - name: fraud_detection_alerts
    interval: 30s
    rules:
      # System Health Alerts

      # Available memory below 20% of total, i.e. usage above 80%.
      - alert: HighMemoryUsage
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 20
        for: 5m
        labels:
          severity: warning
          component: system
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is above 80% for more than 5 minutes"

      # 100 minus the average idle percentage per instance gives CPU usage.
      - alert: HighCPUUsage
        expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          component: system
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80% for more than 5 minutes"

      # Available bytes below 10% of the filesystem size.
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 10
        for: 5m
        labels:
          severity: critical
          component: system
        annotations:
          summary: "Disk space critically low"
          description: "Less than 10% disk space available"

      # Application Alerts

      # Share of 5xx responses among all responses, per job. A bare
      # rate(...5xx...) is an absolute requests-per-second figure per series,
      # not the 5% share the description promises, so divide by total traffic.
      - alert: HighErrorRate
        expr: |-
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
            / sum by (job) (rate(http_requests_total[5m]))
            > 0.05
        for: 2m
        labels:
          severity: critical
          component: application
        annotations:
          summary: "High error rate detected"
          description: "More than 5% of requests are failing with 5xx errors"

      # 95th percentile of the request-duration histogram over 5 minutes.
      - alert: SlowResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "Slow API response times"
          description: "95th percentile response time is above 2 seconds"

      # increase() counts detections over the whole 10-minute window, which is
      # what the description states; rate() would be a per-second average.
      - alert: HighFraudDetectionRate
        expr: increase(fraud_detections_total[10m]) > 100
        for: 5m
        labels:
          severity: warning
          component: fraud_engine
        annotations:
          summary: "Unusually high fraud detection rate"
          description: "More than 100 fraud cases detected per 10 minutes"

      # Database Alerts

      # Compare the available-connection gauge directly. The difference
      # db_pool_size - db_pool_available is the number of connections in use,
      # so testing that against 2 would fire on a nearly idle pool instead.
      - alert: DatabaseConnectionPoolExhausted
        expr: db_pool_available < 2
        for: 2m
        labels:
          severity: critical
          component: database
        annotations:
          summary: "Database connection pool nearly exhausted"
          description: "Less than 2 database connections available"

      # Mean query duration: rate of the duration sum over rate of the count.
      - alert: SlowDatabaseQueries
        expr: rate(db_query_duration_seconds_sum[5m]) / rate(db_query_duration_seconds_count[5m]) > 1
        for: 5m
        labels:
          severity: warning
          component: database
        annotations:
          summary: "Slow database queries detected"
          description: "Average query time is above 1 second"

      # Service Availability

      # `up` is 0 when the scrape of the target fails.
      - alert: ServiceDown
        expr: up{job="fraud-detection-backend"} == 0
        for: 1m
        labels:
          severity: critical
          component: application
        annotations:
          summary: "Service is down"
          description: "Fraud detection backend service is not responding"

      - alert: DatabaseDown
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: critical
          component: database
        annotations:
          summary: "Database is down"
          description: "PostgreSQL database is not responding"
# Alertmanager configuration, stored as a literal block string.
#
# Routing: alerts are grouped by `alertname` and `component`. Alerts labelled
# `severity: critical` go to the `pagerduty` receiver, `severity: warning`
# goes to `slack`, and anything else falls through to the `default` email
# receiver.
#
# The Slack `text` template emits a newline after each alert description so
# that several grouped alerts do not run together on one line.
#
# NOTE(review): the `${...}` placeholders look like environment-variable
# references. Alertmanager is not known to expand environment variables in
# its config file, so presumably a templating step (e.g. envsubst) renders
# this string before it is loaded - confirm.
alertmanager_config: |
  global:
    resolve_timeout: 5m
  route:
    group_by: ['alertname', 'component']
    group_wait: 10s
    group_interval: 10s
    repeat_interval: 12h
    receiver: 'default'
    routes:
      - match:
          severity: critical
        receiver: 'pagerduty'
        continue: true
      - match:
          severity: warning
        receiver: 'slack'
  receivers:
    - name: 'default'
      email_configs:
        - to: 'ops@example.com'
          from: 'alertmanager@example.com'
          smarthost: 'smtp.example.com:587'
          auth_username: 'alertmanager@example.com'
          auth_password: '${SMTP_PASSWORD}'
    - name: 'slack'
      slack_configs:
        - api_url: '${SLACK_WEBHOOK_URL}'
          channel: '#fraud-detection-alerts'
          title: '{{ .GroupLabels.alertname }}'
          text: '{{ range .Alerts }}{{ .Annotations.description }}{{ "\n" }}{{ end }}'
    - name: 'pagerduty'
      pagerduty_configs:
        - service_key: '${PAGERDUTY_SERVICE_KEY}'
          description: '{{ .GroupLabels.alertname }}: {{ .GroupLabels.component }}'