Spaces:

srini047
/

noc_agent

Sleeping

App Files Files Community

noc_agent / explainability.py

srini047

Upload folder using huggingface_hub

5899fec verified about 1 month ago

raw

history blame contribute delete

4.64 kB

	"""
	ActionExplainer — post-hoc natural language explanations for agent decisions.

	Explanations are derived from the current observation at decision time,
	not from the model internals. This is honest and transparent: we describe
	what the system state shows, which is exactly what a human NOC engineer
	would say when justifying the same action.
	"""

	from __future__ import annotations

	from .models import ActionType, IncidentType, SystemMetrics


	class ActionExplainer:
	    """
	    Produce a short, human-readable justification for an agent action.

	    The text is built purely from the metrics snapshot passed in: each
	    builder checks one or two readings against a fixed threshold and
	    reports the reading that motivates the action. When no reading crosses
	    its threshold the message says so, which makes low-value actions
	    visible to whoever is reading the log.

	    NOTE(review): readings are formatted as percentages, so they are
	    presumably normalised to the 0..1 range — confirm against SystemMetrics.
	    """

	    def explain(
	        self,
	        action: ActionType,
	        metrics: SystemMetrics,
	        incident_type: IncidentType,  # accepted for future use; not consulted yet
	    ) -> str:
	        """
	        Return the explanation for ``action`` given the current ``metrics``.

	        The action is compared against each known ``ActionType`` member in a
	        fixed order and the first match selects the builder. An action that
	        matches none of them gets a generic sentence reporting the health
	        score. ``incident_type`` is currently ignored.
	        """
	        builders = (
	            (ActionType.DO_NOTHING, self._do_nothing),
	            (ActionType.RESTART_SERVICE, self._restart_service),
	            (ActionType.THROTTLE_CPU, self._throttle_cpu),
	            (ActionType.CLEAR_CACHE, self._clear_cache),
	            (ActionType.REROUTE_TRAFFIC, self._reroute_traffic),
	            (ActionType.SCALE_UP, self._scale_up),
	        )
	        for known_action, build in builders:
	            if action == known_action:
	                return build(metrics)
	        return f"Agent applied action. System health: {metrics.health_score:.0%}."

	    # ------------------------------------------------------------------
	    # One message builder per action
	    # ------------------------------------------------------------------

	    @staticmethod
	    def _do_nothing(metrics: SystemMetrics) -> str:
	        """Explain a no-op: either all is well (health above 75%) or the agent is waiting."""
	        score = metrics.health_score
	        # Negated ">" (rather than "<=") so any value that fails the
	        # comparison lands in the "wait" branch.
	        if not score > 0.75:
	            return (
	                f"Agent chose to wait. Health score is {score:.0%}. "
	                "Observing for one step before acting."
	            )
	        return "System metrics are within acceptable range. Monitoring — no intervention needed."

	    @staticmethod
	    def _restart_service(metrics: SystemMetrics) -> str:
	        """Explain a restart, citing degraded service health and/or an error rate above 35%."""
	        causes: list[str] = []
	        if metrics.service_healthy < 1.0:
	            causes.append(f"service is partially down ({metrics.service_healthy:.0%} health)")
	        if metrics.error_rate > 0.35:
	            causes.append(f"error rate is critically high ({metrics.error_rate:.0%})")
	        # An empty join is falsy, so the generic rationale is used when
	        # neither signal fired.
	        detail = "; ".join(causes) or "attempting to clear transient faults via process restart"
	        return f"Restarting service — {detail}."

	    @staticmethod
	    def _throttle_cpu(metrics: SystemMetrics) -> str:
	        """Explain CPU throttling; wording depends on whether CPU usage exceeds 80%."""
	        saturated = metrics.cpu_usage > 0.80
	        lead = "CPU is saturated at" if saturated else "CPU usage is"
	        follow_up = (
	            "Throttling runaway process to free cycles and reduce latency."
	            if saturated
	            else "Throttling as a precaution — marginal effect expected."
	        )
	        return f"{lead} {metrics.cpu_usage:.0%}. {follow_up}"

	    @staticmethod
	    def _clear_cache(metrics: SystemMetrics) -> str:
	        """Explain a cache flush; wording depends on whether memory usage exceeds 75%."""
	        advice = (
	            "Clearing application cache to reclaim heap space and prevent OOM."
	            if metrics.memory_usage > 0.75
	            else "Cache flush has limited benefit at this memory level."
	        )
	        return f"Memory usage is {metrics.memory_usage:.0%}. {advice}"

	    @staticmethod
	    def _reroute_traffic(metrics: SystemMetrics) -> str:
	        """
	        Explain a traffic reroute, citing latency above 0.40 and/or packet loss above 10%.

	        Latency is shown in milliseconds as ``latency * 500``.
	        """
	        causes: list[str] = []
	        if metrics.latency > 0.40:
	            causes.append(f"latency is {metrics.latency * 500:.0f} ms")
	        if metrics.packet_loss > 0.10:
	            causes.append(f"packet loss is {metrics.packet_loss:.0%}")
	        # Fall back to the "nothing wrong with the network" note when no
	        # signal fired.
	        detail = (
	            "; ".join(causes)
	            or "network metrics are within normal range — limited benefit expected"
	        )
	        return f"Rerouting traffic — {detail}."

	    @staticmethod
	    def _scale_up(metrics: SystemMetrics) -> str:
	        """Explain a scale-up; cites CPU and memory when either exceeds 70%."""
	        cpu, mem = metrics.cpu_usage, metrics.memory_usage
	        under_pressure = cpu > 0.70 or mem > 0.70
	        if not under_pressure:
	            return (
	                "Scaling up to add capacity headroom. "
	                "Effective but slow — better for sustained overload than spikes."
	            )
	        return (
	            f"Resource pressure detected (CPU {cpu:.0%}, "
	            f"Mem {mem:.0%}). Scaling up to distribute load."
	        )