---
# zenith-backend/monitoring/alerting.yml
# owner: teoat
# deploy: sync from main Sun Jan 11 18:43:53 WIT 2026
# commit: 4a2ab42
# Prometheus alerting rules, plus an embedded Alertmanager configuration.
# Prometheus alerting rules for the fraud-detection backend.
# All rules are evaluated every 30s; `for:` windows debounce flapping.
groups:
  - name: fraud_detection_alerts
    interval: 30s
    rules:
      # ---- System health ----
      - alert: HighMemoryUsage
        # Available/total < 20% == usage above 80%.
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 20
        for: 5m
        labels:
          severity: warning
          component: system
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is above 80% for more than 5 minutes"

      - alert: HighCPUUsage
        # 100 minus mean idle fraction per instance, over a 5m rate window.
        expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          component: system
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80% for more than 5 minutes"

      - alert: DiskSpaceLow
        # NOTE(review): matches every filesystem series, including tmpfs/overlay
        # pseudo-mounts — consider an fstype/mountpoint filter; confirm intent.
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 10
        for: 5m
        labels:
          severity: critical
          component: system
        annotations:
          summary: "Disk space critically low"
          description: "Less than 10% disk space available"

      # ---- Application ----
      - alert: HighErrorRate
        # Absolute rate of 5xx responses per second; the description speaks of
        # a percentage — NOTE(review): a ratio against total requests may be
        # intended; confirm against dashboards before changing.
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05
        for: 2m
        labels:
          severity: critical
          component: application
        annotations:
          summary: "High error rate detected"
          description: "More than 5% of requests are failing with 5xx errors"

      - alert: SlowResponseTime
        # p95 latency from the request-duration histogram.
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "Slow API response times"
          description: "95th percentile response time is above 2 seconds"

      - alert: HighFraudDetectionRate
        # Per-second rate over 10m; threshold 100 — NOTE(review): description
        # says "per 10 minutes", which would be a much lower per-second value;
        # confirm which unit the counter and threshold are meant to use.
        expr: rate(fraud_detections_total[10m]) > 100
        for: 5m
        labels:
          severity: warning
          component: fraud_engine
        annotations:
          summary: "Unusually high fraud detection rate"
          description: "More than 100 fraud cases detected per 10 minutes"

      # ---- Database ----
      - alert: DatabaseConnectionPoolExhausted
        # Fires when fewer than 2 connections remain free.
        expr: db_pool_size - db_pool_available < 2
        for: 2m
        labels:
          severity: critical
          component: database
        annotations:
          summary: "Database connection pool nearly exhausted"
          description: "Less than 2 database connections available"

      - alert: SlowDatabaseQueries
        # sum/count of the duration histogram == mean query latency.
        expr: rate(db_query_duration_seconds_sum[5m]) / rate(db_query_duration_seconds_count[5m]) > 1
        for: 5m
        labels:
          severity: warning
          component: database
        annotations:
          summary: "Slow database queries detected"
          description: "Average query time is above 1 second"

      # ---- Service availability ----
      - alert: ServiceDown
        expr: up{job="fraud-detection-backend"} == 0
        for: 1m
        labels:
          severity: critical
          component: application
        annotations:
          summary: "Service is down"
          description: "Fraud detection backend service is not responding"

      - alert: DatabaseDown
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: critical
          component: database
        annotations:
          summary: "Database is down"
          description: "PostgreSQL database is not responding"
# Alertmanager configuration, embedded as a literal block scalar.
# NOTE(review): Prometheus rejects unknown top-level keys in a rules file, so
# this key presumably gets extracted by deploy tooling before loading —
# confirm, or move it to a standalone alertmanager.yml.
alertmanager_config: |
  global:
    resolve_timeout: 5m

  route:
    group_by: ['alertname', 'component']
    group_wait: 10s
    # NOTE(review): group_interval of 10s is far below the 5m default and can
    # re-notify very aggressively — confirm this is intentional.
    group_interval: 10s
    repeat_interval: 12h
    receiver: 'default'
    routes:
      # critical -> PagerDuty; continue: true lets the alert also hit later
      # routes (so critical alerts are NOT additionally sent to slack unless
      # they also carry severity: warning, which label values never do).
      - match:
          severity: critical
        receiver: 'pagerduty'
        continue: true
      # warning -> Slack channel.
      - match:
          severity: warning
        receiver: 'slack'

  receivers:
    - name: 'default'
      email_configs:
        - to: 'ops@example.com'
          from: 'alertmanager@example.com'
          smarthost: 'smtp.example.com:587'
          auth_username: 'alertmanager@example.com'
          # NOTE(review): Alertmanager does not expand ${ENV} placeholders
          # natively — these must be substituted by deploy tooling (e.g.
          # envsubst) before the file is loaded; confirm the pipeline does so.
          auth_password: '${SMTP_PASSWORD}'
    - name: 'slack'
      slack_configs:
        - api_url: '${SLACK_WEBHOOK_URL}'
          channel: '#fraud-detection-alerts'
          title: '{{ .GroupLabels.alertname }}'
          text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
    - name: 'pagerduty'
      pagerduty_configs:
        # NOTE(review): service_key targets the deprecated PagerDuty Events
        # API v1; newer integrations use routing_key (Events API v2) — verify
        # which integration type the PagerDuty service was created with.
        - service_key: '${PAGERDUTY_SERVICE_KEY}'
          description: '{{ .GroupLabels.alertname }}: {{ .GroupLabels.component }}'