File size: 4,643 Bytes
5899fec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""
ActionExplainer — post-hoc natural language explanations for agent decisions.

Explanations are derived from the *current observation* at decision time,
not from the model internals.  This is honest and transparent: we describe
what the system state shows, which is exactly what a human NOC engineer
would say when justifying the same action.
"""

from __future__ import annotations

from .models import ActionType, IncidentType, SystemMetrics


class ActionExplainer:
    """
    Maps (action, metrics, incident_type) → a human-readable explanation string.

    Each explanation surfaces the dominant signal that makes the action
    reasonable.  If the action is wrong/irrelevant, that is also noted —
    useful during the demo to show the agent has learned to avoid bad moves.

    The per-action builders are static and stateless: each one only reads
    attributes of the metrics object it is given and formats them into text.
    Metric values are rendered with the ``%`` format spec, so they are
    presumably fractions in the 0–1 range (confirm against ``SystemMetrics``).
    """

    def explain(
        self,
        action: ActionType,
        metrics: SystemMetrics,
        incident_type: IncidentType,  # reserved for future incident-aware explanations
    ) -> str:
        """
        Return the explanation sentence for *action* given *metrics*.

        Args:
            action: The action the agent took.
            metrics: Observation at decision time; the builders read its
                metric attributes (e.g. ``health_score``, ``cpu_usage``).
            incident_type: Currently unused; accepted so callers do not have
                to change once incident-aware explanations are added.

        Returns:
            The text produced by the builder for *action*, or a generic
            health summary when the action has no dedicated builder.
        """
        # Ordered (action, builder) pairs.  Matching uses ``==`` rather than a
        # dict lookup so the comparison semantics stay exactly those of an
        # explicit ``if action == ...`` chain.
        builders = (
            (ActionType.DO_NOTHING, self._do_nothing),
            (ActionType.RESTART_SERVICE, self._restart_service),
            (ActionType.THROTTLE_CPU, self._throttle_cpu),
            (ActionType.CLEAR_CACHE, self._clear_cache),
            (ActionType.REROUTE_TRAFFIC, self._reroute_traffic),
            (ActionType.SCALE_UP, self._scale_up),
        )
        for known_action, build in builders:
            if action == known_action:
                return build(metrics)
        # Fallback for any action without a dedicated builder.
        return f"Agent applied action. System health: {metrics.health_score:.0%}."

    # ------------------------------------------------------------------
    # Per-action explanation builders
    # ------------------------------------------------------------------

    @staticmethod
    def _do_nothing(metrics: SystemMetrics) -> str:
        """Explain a no-op: healthy system (score > 0.75) vs. deliberate wait."""
        if metrics.health_score > 0.75:
            return "System metrics are within acceptable range. Monitoring — no intervention needed."
        return (
            f"Agent chose to wait. Health score is {metrics.health_score:.0%}. "
            "Observing for one step before acting."
        )

    @staticmethod
    def _restart_service(metrics: SystemMetrics) -> str:
        """Explain a restart, listing every triggering signal that is present."""
        reasons: list[str] = []
        if metrics.service_healthy < 1.0:
            reasons.append(f"service is partially down ({metrics.service_healthy:.0%} health)")
        if metrics.error_rate > 0.35:
            reasons.append(f"error rate is critically high ({metrics.error_rate:.0%})")
        if not reasons:
            # Neither signal fired: describe the restart as a generic remedy.
            reasons.append("attempting to clear transient faults via process restart")
        return "Restarting service — " + "; ".join(reasons) + "."

    @staticmethod
    def _throttle_cpu(metrics: SystemMetrics) -> str:
        """Explain CPU throttling: justified above 80% usage, precautionary otherwise."""
        if metrics.cpu_usage > 0.80:
            return (
                f"CPU is saturated at {metrics.cpu_usage:.0%}. "
                "Throttling runaway process to free cycles and reduce latency."
            )
        return (
            f"CPU usage is {metrics.cpu_usage:.0%}. "
            "Throttling as a precaution — marginal effect expected."
        )

    @staticmethod
    def _clear_cache(metrics: SystemMetrics) -> str:
        """Explain a cache flush: justified above 75% memory usage, low value otherwise."""
        if metrics.memory_usage > 0.75:
            return (
                f"Memory usage is {metrics.memory_usage:.0%}. "
                "Clearing application cache to reclaim heap space and prevent OOM."
            )
        return (
            f"Memory usage is {metrics.memory_usage:.0%}. "
            "Cache flush has limited benefit at this memory level."
        )

    @staticmethod
    def _reroute_traffic(metrics: SystemMetrics) -> str:
        """Explain a reroute, listing the network signals (latency, loss) that are elevated."""
        reasons: list[str] = []
        if metrics.latency > 0.40:
            # NOTE(review): the factor 500 presumably converts a normalised
            # latency (1.0 == 500 ms) to milliseconds — confirm against the
            # environment's scaling.
            reasons.append(f"latency is {metrics.latency * 500:.0f} ms")
        if metrics.packet_loss > 0.10:
            reasons.append(f"packet loss is {metrics.packet_loss:.0%}")
        if not reasons:
            # No network signal is elevated: flag the action as low-value.
            reasons.append("network metrics are within normal range — limited benefit expected")
        return "Rerouting traffic — " + "; ".join(reasons) + "."

    @staticmethod
    def _scale_up(metrics: SystemMetrics) -> str:
        """Explain scaling up: resource pressure if CPU or memory exceeds 70%."""
        if metrics.cpu_usage > 0.70 or metrics.memory_usage > 0.70:
            return (
                f"Resource pressure detected (CPU {metrics.cpu_usage:.0%}, "
                f"Mem {metrics.memory_usage:.0%}). Scaling up to distribute load."
            )
        return (
            "Scaling up to add capacity headroom. "
            "Effective but slow — better for sustained overload than spikes."
        )