noc_agent / explainability.py
srini047's picture
Upload folder using huggingface_hub
5899fec verified
"""
ActionExplainer — post-hoc natural language explanations for agent decisions.
Explanations are derived from the *current observation* at decision time,
not from the model internals. This is honest and transparent: we describe
what the system state shows, which is exactly what a human NOC engineer
would say when justifying the same action.
"""
from __future__ import annotations
from .models import ActionType, IncidentType, SystemMetrics
class ActionExplainer:
    """Build post-hoc, human-readable explanations for agent actions.

    Maps (action, metrics, incident_type) → a human-readable explanation string.

    Each explanation surfaces the dominant signal that makes the action
    reasonable. If the action is wrong/irrelevant, that is also noted —
    useful during the demo to show the agent has learned to avoid bad moves.

    NOTE(review): every metric is rendered with the ``%`` format type, so the
    builders presumably expect fractions in the 0-1 range — confirm against
    the ``SystemMetrics`` definition.
    """

    def explain(
        self,
        action: ActionType,
        metrics: SystemMetrics,
        incident_type: IncidentType,  # reserved for future incident-aware explanations
    ) -> str:
        """Return a one-line explanation for ``action`` given ``metrics``.

        Args:
            action: The action the agent took.
            metrics: The observation at decision time; only the attributes
                read by the selected builder are accessed.
            incident_type: Currently unused; accepted so callers can already
                pass it.

        Returns:
            The explanation produced by the matching per-action builder, or a
            generic health summary when ``action`` matches none of the known
            action types.
        """
        if action == ActionType.DO_NOTHING:
            return self._do_nothing(metrics)
        if action == ActionType.RESTART_SERVICE:
            return self._restart_service(metrics)
        if action == ActionType.THROTTLE_CPU:
            return self._throttle_cpu(metrics)
        if action == ActionType.CLEAR_CACHE:
            return self._clear_cache(metrics)
        if action == ActionType.REROUTE_TRAFFIC:
            return self._reroute_traffic(metrics)
        if action == ActionType.SCALE_UP:
            return self._scale_up(metrics)
        # Unknown action: fall back to a neutral message instead of raising.
        return f"Agent applied action. System health: {metrics.health_score:.0%}."

    # ------------------------------------------------------------------
    # Per-action explanation builders
    # ------------------------------------------------------------------

    @staticmethod
    def _do_nothing(metrics: SystemMetrics) -> str:
        """Explain a no-op: routine monitoring above 0.75 health, else a wait."""
        if metrics.health_score > 0.75:
            return "System metrics are within acceptable range. Monitoring — no intervention needed."
        return (
            f"Agent chose to wait. Health score is {metrics.health_score:.0%}. "
            "Observing for one step before acting."
        )

    @staticmethod
    def _restart_service(metrics: SystemMetrics) -> str:
        """Explain a restart, citing degraded service health and/or error rate."""
        reasons: list[str] = []
        if metrics.service_healthy < 1.0:
            reasons.append(f"service is partially down ({metrics.service_healthy:.0%} health)")
        if metrics.error_rate > 0.35:
            reasons.append(f"error rate is critically high ({metrics.error_rate:.0%})")
        if not reasons:
            # Neither signal fired, so describe the restart as a generic remedy.
            reasons.append("attempting to clear transient faults via process restart")
        return "Restarting service — " + "; ".join(reasons) + "."

    @staticmethod
    def _throttle_cpu(metrics: SystemMetrics) -> str:
        """Explain CPU throttling: justified above 0.80 usage, else precautionary."""
        if metrics.cpu_usage > 0.80:
            return (
                f"CPU is saturated at {metrics.cpu_usage:.0%}. "
                "Throttling runaway process to free cycles and reduce latency."
            )
        return (
            f"CPU usage is {metrics.cpu_usage:.0%}. "
            "Throttling as a precaution — marginal effect expected."
        )

    @staticmethod
    def _clear_cache(metrics: SystemMetrics) -> str:
        """Explain a cache flush: justified above 0.75 memory usage, else low value."""
        if metrics.memory_usage > 0.75:
            return (
                f"Memory usage is {metrics.memory_usage:.0%}. "
                "Clearing application cache to reclaim heap space and prevent OOM."
            )
        return (
            f"Memory usage is {metrics.memory_usage:.0%}. "
            "Cache flush has limited benefit at this memory level."
        )

    @staticmethod
    def _reroute_traffic(metrics: SystemMetrics) -> str:
        """Explain a traffic reroute, citing high latency and/or packet loss."""
        reasons: list[str] = []
        if metrics.latency > 0.40:
            # NOTE(review): the factor 500 presumably rescales a normalized
            # latency to milliseconds — confirm against the environment.
            reasons.append(f"latency is {metrics.latency * 500:.0f} ms")
        if metrics.packet_loss > 0.10:
            reasons.append(f"packet loss is {metrics.packet_loss:.0%}")
        if not reasons:
            # Neither network signal fired, so flag the reroute as low value.
            reasons.append("network metrics are within normal range — limited benefit expected")
        return "Rerouting traffic — " + "; ".join(reasons) + "."

    @staticmethod
    def _scale_up(metrics: SystemMetrics) -> str:
        """Explain a scale-up: justified when CPU or memory exceeds 0.70."""
        if metrics.cpu_usage > 0.70 or metrics.memory_usage > 0.70:
            return (
                f"Resource pressure detected (CPU {metrics.cpu_usage:.0%}, "
                f"Mem {metrics.memory_usage:.0%}). Scaling up to distribute load."
            )
        return (
            "Scaling up to add capacity headroom. "
            "Effective but slow — better for sustained overload than spikes."
        )