| """ |
| ActionExplainer β post-hoc natural language explanations for agent decisions. |
| |
| Explanations are derived from the *current observation* at decision time, |
| not from the model internals. This is honest and transparent: we describe |
| what the system state shows, which is exactly what a human NOC engineer |
| would say when justifying the same action. |
| """ |
|
|
| from __future__ import annotations |
|
|
| from .models import ActionType, IncidentType, SystemMetrics |
|
|
|
|
| class ActionExplainer: |
| """ |
| Maps (action, metrics, incident_type) β a human-readable explanation string. |
| |
| Each explanation surfaces the dominant signal that makes the action |
| reasonable. If the action is wrong/irrelevant, that is also noted β |
| useful during the demo to show the agent has learned to avoid bad moves. |
| """ |
|
|
| def explain( |
| self, |
| action: ActionType, |
| metrics: SystemMetrics, |
| incident_type: IncidentType, |
| ) -> str: |
| if action == ActionType.DO_NOTHING: |
| return self._do_nothing(metrics) |
| if action == ActionType.RESTART_SERVICE: |
| return self._restart_service(metrics) |
| if action == ActionType.THROTTLE_CPU: |
| return self._throttle_cpu(metrics) |
| if action == ActionType.CLEAR_CACHE: |
| return self._clear_cache(metrics) |
| if action == ActionType.REROUTE_TRAFFIC: |
| return self._reroute_traffic(metrics) |
| if action == ActionType.SCALE_UP: |
| return self._scale_up(metrics) |
| return f"Agent applied action. System health: {metrics.health_score:.0%}." |
|
|
| |
| |
| |
|
|
| @staticmethod |
| def _do_nothing(metrics: SystemMetrics) -> str: |
| if metrics.health_score > 0.75: |
| return "System metrics are within acceptable range. Monitoring β no intervention needed." |
| return ( |
| f"Agent chose to wait. Health score is {metrics.health_score:.0%}. " |
| "Observing for one step before acting." |
| ) |
|
|
| @staticmethod |
| def _restart_service(metrics: SystemMetrics) -> str: |
| reasons: list[str] = [] |
| if metrics.service_healthy < 1.0: |
| reasons.append(f"service is partially down ({metrics.service_healthy:.0%} health)") |
| if metrics.error_rate > 0.35: |
| reasons.append(f"error rate is critically high ({metrics.error_rate:.0%})") |
| if not reasons: |
| reasons.append("attempting to clear transient faults via process restart") |
| return "Restarting service β " + "; ".join(reasons) + "." |
|
|
| @staticmethod |
| def _throttle_cpu(metrics: SystemMetrics) -> str: |
| if metrics.cpu_usage > 0.80: |
| return ( |
| f"CPU is saturated at {metrics.cpu_usage:.0%}. " |
| "Throttling runaway process to free cycles and reduce latency." |
| ) |
| return ( |
| f"CPU usage is {metrics.cpu_usage:.0%}. " |
| "Throttling as a precaution β marginal effect expected." |
| ) |
|
|
| @staticmethod |
| def _clear_cache(metrics: SystemMetrics) -> str: |
| if metrics.memory_usage > 0.75: |
| return ( |
| f"Memory usage is {metrics.memory_usage:.0%}. " |
| "Clearing application cache to reclaim heap space and prevent OOM." |
| ) |
| return ( |
| f"Memory usage is {metrics.memory_usage:.0%}. " |
| "Cache flush has limited benefit at this memory level." |
| ) |
|
|
| @staticmethod |
| def _reroute_traffic(metrics: SystemMetrics) -> str: |
| reasons: list[str] = [] |
| if metrics.latency > 0.40: |
| reasons.append(f"latency is {metrics.latency * 500:.0f} ms") |
| if metrics.packet_loss > 0.10: |
| reasons.append(f"packet loss is {metrics.packet_loss:.0%}") |
| if not reasons: |
| reasons.append("network metrics are within normal range β limited benefit expected") |
| return "Rerouting traffic β " + "; ".join(reasons) + "." |
|
|
| @staticmethod |
| def _scale_up(metrics: SystemMetrics) -> str: |
| if metrics.cpu_usage > 0.70 or metrics.memory_usage > 0.70: |
| return ( |
| f"Resource pressure detected (CPU {metrics.cpu_usage:.0%}, " |
| f"Mem {metrics.memory_usage:.0%}). Scaling up to distribute load." |
| ) |
| return ( |
| "Scaling up to add capacity headroom. " |
| "Effective but slow β better for sustained overload than spikes." |
| ) |
|
|