# Prometheus Alerting Rules for Simple378 Fraud Detection
# Save this file as: backend/config/prometheus_alerts.yml
#
# To use with Prometheus, add to prometheus.yml (adjust the path so it
# resolves from the directory containing prometheus.yml):
#   rule_files:
#     - "prometheus_alerts.yml"
groups:
  - name: 378x492_alerts
    interval: 30s
    rules:
      # High Error Rate Alert
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
            / sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: warning
          component: backend
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} over the last 5 minutes"
      # Critical Error Rate
      - alert: CriticalErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
            / sum(rate(http_requests_total[5m])) > 0.1
        for: 2m
        labels:
          severity: critical
          component: backend
        annotations:
          summary: "Critical error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} - immediate action required"
      # Slow Query Alert
      - alert: SlowDatabaseQueries
        expr: |
          rate(db_slow_queries_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
          component: database
        annotations:
          summary: "High number of slow database queries"
          description: "{{ $value }} slow queries per second in the last 5 minutes"
      # Database Query Errors
      - alert: DatabaseQueryErrors
        expr: |
          rate(db_query_errors_total[5m]) > 0.01
        for: 2m
        labels:
          severity: critical
          component: database
        annotations:
          summary: "Database query errors detected"
          description: "{{ $value }} failed database queries per second"
      # High Memory Usage
      - alert: HighMemoryUsage
        expr: |
          (process_resident_memory_bytes / 1024 / 1024) > 1024
        for: 10m
        labels:
          severity: warning
          component: backend
        annotations:
          summary: "High memory usage detected"
          description: "Process is using {{ $value }}MB of memory"
      # Critical Memory Usage
      - alert: CriticalMemoryUsage
        expr: |
          (process_resident_memory_bytes / 1024 / 1024) > 2048
        for: 5m
        labels:
          severity: critical
          component: backend
        annotations:
          summary: "Critical memory usage - possible memory leak"
          description: "Process is using {{ $value }}MB of memory - investigate immediately"
      # High CPU Usage
      - alert: HighCPUUsage
        expr: |
          rate(process_cpu_seconds_total[5m]) > 0.8
        for: 10m
        labels:
          severity: warning
          component: backend
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is {{ $value | humanizePercentage }}"
      # Response Time Degradation
      - alert: SlowResponseTime
        expr: |
          histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
          component: backend
        annotations:
          summary: "API response time degradation"
          description: "95th percentile response time is {{ $value }}s"
      # Critical Response Time
      - alert: CriticalResponseTime
        expr: |
          histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 5
        for: 2m
        labels:
          severity: critical
          component: backend
        annotations:
          summary: "Critical API response time"
          description: "95th percentile response time is {{ $value }}s - investigate immediately"
      # Cache Performance
      - alert: LowCacheHitRate
        expr: |
          (rate(cache_hits_total[10m]) / (rate(cache_hits_total[10m]) + rate(cache_misses_total[10m]))) < 0.7
        for: 15m
        labels:
          severity: warning
          component: cache
        annotations:
          summary: "Low cache hit rate"
          description: "Cache hit rate is {{ $value | humanizePercentage }} - consider cache tuning"
      # Fraud Detection Performance
      - alert: FraudDetectionFailures
        expr: |
          rate(fraud_detection_errors_total[5m]) > 0.01
        for: 5m
        labels:
          severity: warning
          component: fraud_detection
        annotations:
          summary: "Fraud detection failures detected"
          description: "{{ $value }} fraud detection errors per second"
      # File Processing Issues
      - alert: FileProcessingFailures
        expr: |
          rate(file_processing_errors_total[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
          component: evidence_processing
        annotations:
          summary: "High file processing failure rate"
          description: "{{ $value }} file processing errors per second"
      # CSRF Protection Violations
      # Counts all 403 responses; narrow this with a path matcher on the
      # CSRF-protected endpoints if other handlers also return 403.
      - alert: CSRFViolations
        expr: |
          rate(http_requests_total{status="403"}[5m]) > 1
        for: 5m
        labels:
          severity: warning
          component: security
        annotations:
          summary: "Potential CSRF attack detected"
          description: "{{ $value }} forbidden (403) responses per second - possible CSRF attack"
      # Rate Limit Exceeded
      - alert: RateLimitExceeded
        expr: |
          rate(http_requests_total{status="429"}[5m]) > 10
        for: 5m
        labels:
          severity: info
          component: security
        annotations:
          summary: "High rate limit violations"
          description: "{{ $value }} rate limit violations per second"
      # Disk Space Warning
      - alert: LowDiskSpace
        expr: |
          (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.2
        for: 10m
        labels:
          severity: warning
          component: infrastructure
        annotations:
          summary: "Low disk space"
          description: "Only {{ $value | humanizePercentage }} disk space remaining"
      # Critical Disk Space
      - alert: CriticalDiskSpace
        expr: |
          (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.1
        for: 5m
        labels:
          severity: critical
          component: infrastructure
        annotations:
          summary: "Critical disk space - immediate action required"
          description: "Only {{ $value | humanizePercentage }} disk space remaining"
      # Service Down
      - alert: ServiceDown
        expr: |
          up{job="378x492-backend"} == 0
        for: 1m
        labels:
          severity: critical
          component: backend
        annotations:
          summary: "Simple378 backend service is down"
          description: "The backend service has been down for more than 1 minute"
# Alert Routing Configuration (for Alertmanager)
# Save as: backend/config/alertmanager.yml
#
#   route:
#     receiver: 'team-notifications'
#     group_by: ['alertname', 'severity']
#     group_wait: 30s
#     group_interval: 5m
#     repeat_interval: 4h
#
#     routes:
#       - match:
#           severity: critical
#         receiver: 'critical-alerts'
#         repeat_interval: 1h
#
#       - match:
#           severity: warning
#         receiver: 'warning-alerts'
#         repeat_interval: 4h
#
#   receivers:
#     - name: 'team-notifications'
#       # Configure your notification channels here
#       # Examples: email, Slack, PagerDuty, etc.
#
#     - name: 'critical-alerts'
#       # Critical alerts go here (SMS, PagerDuty, etc.)
#
#     - name: 'warning-alerts'
#       # Warning alerts (email, Slack, etc.)
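#
# Point Prometheus at Alertmanager in prometheus.yml (9093 is Alertmanager's
# default port - adjust the target to your deployment):
#   alerting:
#     alertmanagers:
#       - static_configs:
#           - targets: ["localhost:9093"]
#
# Validate the Alertmanager config with amtool (ships with Alertmanager):
#   amtool check-config backend/config/alertmanager.yml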