# Source: Hugging Face Space srini047/noc_agent (status: Sleeping) — file size 4,643 bytes, commit 5899fec

"""
ActionExplainer — post-hoc natural language explanations for agent decisions.

Explanations are derived from the *current observation* at decision time,
not from the model internals.  This is honest and transparent: we describe
what the system state shows, which is exactly what a human NOC engineer
would say when justifying the same action.
"""

from __future__ import annotations

from .models import ActionType, IncidentType, SystemMetrics


class ActionExplainer:
    """
    Turns an agent decision into a short, human-readable justification.

    Given the chosen action and the metrics observed at that moment, the
    matching builder reports whichever readings cross its thresholds as the
    reason for acting.  When nothing crosses a threshold the text says so
    instead (for example "limited benefit expected"), so a weakly motivated
    action is visible in the explanation itself.
    """

    def explain(
        self,
        action: ActionType,
        metrics: SystemMetrics,
        incident_type: IncidentType,  # accepted for future use; not consulted yet
    ) -> str:
        """
        Return the explanation sentence for ``action`` given ``metrics``.

        Known actions are delegated to their dedicated builder.  Any other
        action falls back to a generic sentence reporting the health score.
        """
        # Ordered (action, builder) pairs, scanned with ``==`` top to bottom.
        builders = (
            (ActionType.DO_NOTHING, self._do_nothing),
            (ActionType.RESTART_SERVICE, self._restart_service),
            (ActionType.THROTTLE_CPU, self._throttle_cpu),
            (ActionType.CLEAR_CACHE, self._clear_cache),
            (ActionType.REROUTE_TRAFFIC, self._reroute_traffic),
            (ActionType.SCALE_UP, self._scale_up),
        )
        for known_action, build in builders:
            if action == known_action:
                return build(metrics)
        return f"Agent applied action. System health: {metrics.health_score:.0%}."

    # ------------------------------------------------------------------
    # One builder per action; each reads only the metrics it reports on
    # ------------------------------------------------------------------

    @staticmethod
    def _do_nothing(metrics: SystemMetrics) -> str:
        """Explain a no-op: all is well above 75% health, otherwise a wait."""
        health = metrics.health_score
        looks_healthy = health > 0.75
        if looks_healthy:
            return "System metrics are within acceptable range. Monitoring — no intervention needed."
        waiting = f"Agent chose to wait. Health score is {health:.0%}. "
        return waiting + "Observing for one step before acting."

    @staticmethod
    def _restart_service(metrics: SystemMetrics) -> str:
        """Explain a restart, citing degraded service and/or a high error rate."""
        service_level = metrics.service_healthy
        error_rate = metrics.error_rate

        findings = []
        if service_level < 1.0:
            findings.append(f"service is partially down ({service_level:.0%} health)")
        if error_rate > 0.35:
            findings.append(f"error rate is critically high ({error_rate:.0%})")

        # With no triggering signal, describe the restart as a generic remedy.
        fallback = "attempting to clear transient faults via process restart"
        detail = "; ".join(findings) if findings else fallback
        return f"Restarting service — {detail}."

    @staticmethod
    def _throttle_cpu(metrics: SystemMetrics) -> str:
        """Explain CPU throttling: decisive above 80% usage, else precautionary."""
        cpu = metrics.cpu_usage
        if cpu > 0.80:
            headline = f"CPU is saturated at {cpu:.0%}. "
            rationale = "Throttling runaway process to free cycles and reduce latency."
        else:
            headline = f"CPU usage is {cpu:.0%}. "
            rationale = "Throttling as a precaution — marginal effect expected."
        return headline + rationale

    @staticmethod
    def _clear_cache(metrics: SystemMetrics) -> str:
        """Explain a cache flush: worthwhile above 75% memory, else marginal."""
        memory = metrics.memory_usage
        # Both outcomes open with the same memory reading.
        reading = f"Memory usage is {memory:.0%}. "
        verdict = (
            "Clearing application cache to reclaim heap space and prevent OOM."
            if memory > 0.75
            else "Cache flush has limited benefit at this memory level."
        )
        return reading + verdict

    @staticmethod
    def _reroute_traffic(metrics: SystemMetrics) -> str:
        """Explain rerouting, citing high latency and/or packet loss."""
        latency = metrics.latency
        loss = metrics.packet_loss

        findings = []
        if latency > 0.40:
            # NOTE(review): the x500 factor presumably maps a normalised
            # latency onto milliseconds — confirm against the metrics model.
            findings.append(f"latency is {latency * 500:.0f} ms")
        if loss > 0.10:
            findings.append(f"packet loss is {loss:.0%}")

        fallback = "network metrics are within normal range — limited benefit expected"
        detail = "; ".join(findings) if findings else fallback
        return f"Rerouting traffic — {detail}."

    @staticmethod
    def _scale_up(metrics: SystemMetrics) -> str:
        """Explain scaling up: resource pressure above 70%, else extra headroom."""
        cpu = metrics.cpu_usage
        memory = metrics.memory_usage
        under_pressure = cpu > 0.70 or memory > 0.70
        if not under_pressure:
            return (
                "Scaling up to add capacity headroom. "
                "Effective but slow — better for sustained overload than spikes."
            )
        return (
            f"Resource pressure detected (CPU {cpu:.0%}, "
            f"Mem {memory:.0%}). Scaling up to distribute load."
        )