"""
Alert firing engine.

Alerts fire based on metric thresholds — the agent sees what fired
but must investigate to find why. Alert correlation (multiple alerts
from a cascading failure) is represented by shared source timestamps.
"""
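
# Example (hypothetical names and times): a cascading failure that began at
# 14:05 can surface as several alerts whose "firing_since" values match,
# e.g. HighErrorRate on api-gateway and ServiceDown on user-db both reporting
# 2025-01-15T14:05:00Z. The shared timestamp is the correlation hint.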
|
|
from __future__ import annotations

from typing import Any, Dict, List

from .service import ServiceState


_ALERT_RULES = [
    {
        "name": "HighErrorRate",
        "field": "error_rate_percent",
        "threshold": 10.0,
        "severity": "critical",
        "description": "{service}: error rate {value:.1f}% exceeds threshold 10%",
    },
    {
        "name": "HighMemoryUsage",
        "field": "memory_percent",
        "threshold": 80.0,
        "severity": "critical",
        "description": "{service}: memory usage {value:.0f}% exceeds threshold 80%",
    },
    {
        "name": "HighLatencyP99",
        "field": "latency_p99_ms",
        "threshold": 1000.0,
        "severity": "warning",
        "description": "{service}: p99 latency {value:.0f}ms exceeds threshold 1000ms",
    },
    {
        "name": "HighLatencyP95",
        "field": "latency_p95_ms",
        "threshold": 500.0,
        "severity": "warning",
        "description": "{service}: p95 latency {value:.0f}ms exceeds threshold 500ms",
    },
    {
        "name": "HighCPU",
        "field": "cpu_percent",
        "threshold": 80.0,
        "severity": "warning",
        "description": "{service}: CPU usage {value:.0f}% exceeds threshold 80%",
    },
    {
        "name": "ServiceDown",
        "field": "status",
        "threshold": "down",
        "severity": "critical",
        "description": "{service}: service is DOWN — health check failing",
    },
    {
        "name": "ServiceDegraded",
        "field": "status",
        "threshold": "degraded",
        "severity": "warning",
        "description": "{service}: service is DEGRADED — partial failures detected",
    },
    {
        "name": "LowRequestRate",
        "field": "requests_per_sec",
        "threshold": 100.0,
        "severity": "warning",
        "description": "{service}: request rate {value:.0f} rps dropped below threshold 100 rps",
        "below": True,
    },
]
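
# Each rule names the ServiceState field it watches, the threshold it is
# compared against, a severity, and a description template rendered with the
# service name and offending value; rules with "below": True fire when the
# value drops under the threshold instead of exceeding it. A minimal
# rendering sketch (the service name and value here are hypothetical):
#
#   _ALERT_RULES[0]["description"].format(service="checkout", value=23.4)
#   -> "checkout: error rate 23.4% exceeds threshold 10%"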
|
|
|
|
def evaluate_alerts(
    services: Dict[str, ServiceState],
    current_minute: int,
) -> List[Dict[str, Any]]:
    """
    Evaluate all alert rules against current service states.

    Returns a list of firing alert dicts (alert_id, severity,
    source_service, description, firing_since, rule_name), sorted
    with critical alerts first.
    """
    alerts: List[Dict[str, Any]] = []
    alert_counter = 0

    for svc_name, svc in services.items():
        for rule in _ALERT_RULES:
            field = rule["field"]
            threshold = rule["threshold"]

            # Status rules compare against a state string rather than a
            # numeric threshold.
            if field == "status":
                if svc.status == threshold:
                    alert_counter += 1
                    # The alert has been firing for as long as the service
                    # has been in the offending state.
                    ticks_in_state = (
                        svc.ticks_in_down
                        if threshold == "down"
                        else svc.ticks_in_degraded
                    )
                    alerts.append({
                        "alert_id": f"alert-{alert_counter:03d}",
                        "severity": rule["severity"],
                        "source_service": svc_name,
                        # Status descriptions have no {value} placeholder.
                        "description": rule["description"].format(
                            service=svc_name, value=0),
                        "firing_since": f"2025-01-15T14:{max(0, current_minute - ticks_in_state):02d}:00Z",
                        "rule_name": rule["name"],
                    })
                continue

            # Numeric rules: "below" rules fire when the value drops under
            # the threshold, all others when it exceeds it.
            value = getattr(svc, field, 0)
            is_below = rule.get("below", False)
            triggered = value < threshold if is_below else value > threshold

            if triggered:
                alert_counter += 1
                # Estimate how long this alert has been firing by counting how
                # many of the last 10 metric-history samples also breach the
                # threshold. History samples are looked up first by the field
                # name without its _percent/_ms suffix, then by the full name.
                history_key = field.replace("_percent", "").replace("_ms", "")
                breaches = 0
                for sample in svc.metric_history[-10:]:
                    past_value = sample.get(history_key, sample.get(field, 0))
                    breached = (
                        past_value < threshold if is_below else past_value > threshold
                    )
                    if breached:
                        breaches += 1
                ticks_firing = max(1, breaches)
                alerts.append({
                    "alert_id": f"alert-{alert_counter:03d}",
                    "severity": rule["severity"],
                    "source_service": svc_name,
                    "description": rule["description"].format(
                        service=svc_name, value=value),
                    "firing_since": f"2025-01-15T14:{max(0, current_minute - ticks_firing):02d}:00Z",
                    "rule_name": rule["name"],
                })

    # Critical alerts first so the most urgent problems surface at the top.
    severity_order = {"critical": 0, "warning": 1, "info": 2}
    alerts.sort(key=lambda a: severity_order.get(a["severity"], 9))

    return alerts
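

# Usage sketch (not part of the engine). The service names, metric values,
# and SimpleNamespace stand-ins for ServiceState below are hypothetical;
# evaluate_alerts() only needs attribute access, so duck typing is enough for
# a quick demonstration. Run as a module (e.g. `python -m <package>.alerts`)
# so the relative import above resolves.
if __name__ == "__main__":
    from types import SimpleNamespace

    fake_services = {
        # Healthy except for an error-rate spike that began 5 minutes ago.
        "api-gateway": SimpleNamespace(
            status="ok",
            ticks_in_down=0,
            ticks_in_degraded=0,
            error_rate_percent=23.4,
            memory_percent=55.0,
            latency_p99_ms=420.0,
            latency_p95_ms=180.0,
            cpu_percent=40.0,
            requests_per_sec=850.0,
            metric_history=[{"error_rate": 1.0}] * 5 + [{"error_rate": 25.0}] * 5,
        ),
        # Down for the same 5 minutes: the matching firing_since timestamps
        # are the correlation hint described in the module docstring.
        "user-db": SimpleNamespace(
            status="down",
            ticks_in_down=5,
            ticks_in_degraded=0,
            error_rate_percent=0.0,
            memory_percent=30.0,
            latency_p99_ms=0.0,
            latency_p95_ms=0.0,
            cpu_percent=5.0,
            requests_per_sec=150.0,
            metric_history=[{}] * 10,
        ),
    }

    for alert in evaluate_alerts(fake_services, current_minute=10):
        print(alert["alert_id"], alert["severity"], alert["rule_name"],
              alert["source_service"], alert["firing_since"])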
|
|