Aethero_github / Aethero_App /monitoring /aetheros_rules.yml
xvadur's picture
Add complete Aethero_App and aethero_protocol directories
46f737d
# Prometheus rule group holding the AetheroOS alerting rules that follow.
groups:
- name: 'AetheroOS Alerts'
  rules:
# Agent Health Alerts
# Critical: an `up` series whose job matches aetheros_agents.* has stayed at 0
# for one minute.
# NOTE(review): the annotations read $labels.agent_id; confirm the scrape
# configuration attaches agent_id to the targets, otherwise it renders empty.
- alert: AgentDown
  expr: 'up{job=~"aetheros_agents.*"} == 0'
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: 'Agent {{ $labels.agent_id }} is down'
    description: 'Agent {{ $labels.agent_id }} has been down for more than 1 minute'
# Warning: the mean response time (5m rate of _sum divided by 5m rate of
# _count) has stayed above 2 for five minutes.
- alert: HighAgentLatency
  # Folded scalar: the lines below are joined with single spaces into one
  # PromQL expression.
  expr: >-
    rate(aetheros_agent_response_time_seconds_sum[5m])
    / rate(aetheros_agent_response_time_seconds_count[5m])
    > 2
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'High latency for {{ $labels.agent_id }}'
    description: 'Agent {{ $labels.agent_id }} has response time > 2s for 5 minutes'
# Reflection Quality Alerts
# Warning: aetheros_reflection_quality_score has stayed below 0.7 for
# fifteen minutes. Evaluated per series; the annotations name no label.
- alert: LowReflectionQuality
  expr: 'aetheros_reflection_quality_score < 0.7'
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Low reflection quality detected'
    description: 'Reflection quality score has been below 0.7 for 15 minutes'
# Critical: the processed-reflections counter showed no increase across a
# 15-minute window, and that condition has held for five minutes.
# NOTE(review): if the series is missing altogether the comparison yields no
# result and this rule stays silent; confirm whether an absent() guard is
# wanted.
- alert: ReflectionProcessingStalled
  expr: 'rate(aetheros_reflections_processed_total[15m]) == 0'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: 'Reflection processing has stalled'
    description: 'No reflections have been processed in the last 15 minutes'
# Memory System Alerts
# Warning: the 0.95 quantile of aetheros_mem_latency, computed from bucket
# rates aggregated by `le` only, has stayed above 0.5 for five minutes.
# NOTE(review): the description equates 0.5 with 500ms, so the histogram is
# presumably recorded in seconds; confirm against the instrumentation.
- alert: HighMemoryLatency
  # Folded scalar: the lines below are joined with single spaces into one
  # PromQL expression.
  expr: >-
    histogram_quantile(0.95,
    sum(rate(aetheros_mem_latency_bucket[5m])) by (le))
    > 0.5
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'High memory system latency'
    description: '95th percentile of memory operations taking >500ms'
# Warning: the ratio of failed to total memory operations (5m rates) has
# stayed above 0.01 for five minutes.
# NOTE(review): the division matches series on identical label sets; confirm
# both counters are exported with the same labels.
- alert: HighMemoryErrorRate
  # Folded scalar: the lines below are joined with single spaces into one
  # PromQL expression.
  expr: >-
    rate(aetheros_mem_operations_error_total[5m])
    / rate(aetheros_mem_operations_total[5m])
    > 0.01
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'High memory operation error rate'
    description: 'Memory operation error rate > 1% for 5 minutes'
# Pipeline Execution Alerts
# Critical: successful pipeline executions divided by all executions (5m
# rates, each summed across every series) has stayed below 0.95 for fifteen
# minutes.
- alert: LowPipelineSuccessRate
  # Folded scalar: the lines below are joined with single spaces into one
  # PromQL expression.
  expr: >-
    sum(rate(aetheros_pipeline_executions_success[5m]))
    / sum(rate(aetheros_pipeline_executions_total[5m]))
    < 0.95
  for: 15m
  labels:
    severity: critical
  annotations:
    summary: 'Low pipeline success rate'
    description: 'Pipeline success rate below 95% for 15 minutes'
# Warning: the 0.95 quantile of pipeline duration, computed from bucket rates
# aggregated by `le` only, has stayed above 300 seconds for fifteen minutes.
- alert: LongPipelineDuration
  # Folded scalar: the lines below are joined with single spaces into one
  # PromQL expression.
  expr: >-
    histogram_quantile(0.95,
    sum(rate(aetheros_pipeline_duration_seconds_bucket[5m])) by (le))
    > 300
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'Long pipeline execution times'
    description: '95th percentile of pipeline executions taking >5 minutes'
# Resource Usage Alerts
# Warning: an agent process has consumed more than 0.8 CPU-seconds per second
# (5m rate) for ten minutes. The threshold is in cores, so ">80%" in the
# description means 80% of a single core.
- alert: HighCPUUsage
  expr: 'rate(process_cpu_seconds_total{job=~"aetheros_agents.*"}[5m]) > 0.8'
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: 'High CPU usage for {{ $labels.agent_id }}'
    description: 'Agent {{ $labels.agent_id }} CPU usage >80% for 10 minutes'
# Warning: an agent process's resident memory has exceeded 80% of its host's
# total memory for ten minutes.
#
# The two metrics come from different exporters, so their `job` and
# `instance` labels differ and a plain `/` (one-to-one matching on every
# label) returns no series at all. Both sides are therefore given a `host`
# label derived from `instance` with any ":port" suffix removed, and joined
# many-to-one on that label. group_left keeps the agent-side labels, so
# agent_id is still available to the annotations.
# NOTE(review): assumes `instance` is "host:port" (or a bare host) on both
# scrape jobs and that each host exposes a single node_memory_MemTotal_bytes
# series; confirm against the scrape configuration.
- alert: HighMemoryUsage
  expr: |-
    label_replace(
      process_resident_memory_bytes{job=~"aetheros_agents.*"},
      "host", "$1", "instance", "([^:]+)(:[0-9]+)?"
    )
    / on (host) group_left ()
    label_replace(
      node_memory_MemTotal_bytes,
      "host", "$1", "instance", "([^:]+)(:[0-9]+)?"
    )
    > 0.8
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: 'High memory usage for {{ $labels.agent_id }}'
    description: 'Agent {{ $labels.agent_id }} memory usage >80% for 10 minutes'
# System Health Alerts
# Warning: an agent has produced more than 0.05 errors per second (5m rate,
# summed per agent_id) for five minutes.
#
# The threshold is an absolute rate, not a share of requests: the expression
# has no request-count denominator. The description states the value in
# errors per second so it matches what is actually evaluated.
- alert: HighErrorRate
  expr: 'sum(rate(aetheros_error_total[5m])) by (agent_id) > 0.05'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'High error rate for {{ $labels.agent_id }}'
    description: 'Error rate >0.05 errors/s for {{ $labels.agent_id }} over 5 minutes'
# Warning: the total queue size for an agent has stayed above 1000 for five
# minutes.
#
# The queue size is compared directly rather than through rate(): rate() is
# meant for counters and treats every decrease as a counter reset, so on a
# queue-size gauge it reports a meaningless value instead of the backlog the
# description talks about. The "over 5 minutes" part is supplied by `for`.
# NOTE(review): assumes aetheros_agent_queue_size is a gauge, as its name and
# the description suggest; confirm against the instrumentation.
- alert: SystemOverload
  expr: 'sum by (agent_id) (aetheros_agent_queue_size) > 1000'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'System overload for {{ $labels.agent_id }}'
    description: 'Queue size >1000 for {{ $labels.agent_id }} over 5 minutes'