File size: 5,325 Bytes
5fe9036
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""
Service simulation helpers — generates alerts, formats data, cascades dependency health.
"""

from typing import Any, Dict, List, Set, Tuple

from models import ServiceStatus


def generate_alerts(
    services: Dict[str, Any],
    scenario_alerts: List[str],
    fixed_services: Set[str],
) -> List[str]:
    """Build the list of active alert lines from current service state.

    Each service whose ``"status"`` is DOWN or DEGRADED and whose name is
    not in ``fixed_services`` contributes one alert line (SEV-1 for DOWN,
    SEV-2 for DEGRADED). When no service qualifies, a single informational
    "all healthy" line is returned instead.

    Args:
        services: Mapping of service name to a dict with a ``"status"`` key.
        scenario_alerts: Accepted for interface compatibility; this function
            does not read it.
        fixed_services: Names of services whose alerts are suppressed.

    Returns:
        Alert lines in ``services`` iteration order, or the one-element
        "no active alerts" list.
    """
    active: List[str] = []
    for name, info in services.items():
        state = info["status"]
        # Fixed services never raise an alert, whatever their status.
        if name in fixed_services:
            continue
        if state == ServiceStatus.DOWN:
            active.append(f"[ALERT SEV-1] {name}: service is DOWN, 0 healthy pods")
        elif state == ServiceStatus.DEGRADED:
            active.append(f"[ALERT SEV-2] {name}: service is DEGRADED")
    return active or ["[INFO] All services HEALTHY — no active alerts."]


def recompute_health(
    services: Dict[str, Any],
    dependencies: Dict[str, List[str]],
    fixed_services: Set[str],
    root_cause_map: Dict[str, str],
) -> Dict[str, Any]:
    """Walk the dependency graph and return services with updated health.

    ``services`` itself is not modified: each per-service dict is
    shallow-copied and the copies are updated and returned.

    Rules:
    - A service in ``fixed_services`` becomes HEALTHY and is not re-derived
      from its dependencies.
    - A root-cause service (a key of ``root_cause_map``) that has not been
      fixed keeps its current status.
    - Any other service with at least one known dependency becomes DEGRADED
      if any dependency is DOWN or DEGRADED, otherwise HEALTHY. A DOWN
      dependency deliberately yields DEGRADED (not DOWN) downstream.
    - A service with no dependencies, or whose dependencies are all absent
      from ``services``, keeps its current status.
    - Names in ``dependencies`` that are absent from ``services`` are
      skipped instead of raising ``KeyError``.

    Args:
        services: Mapping of service name to a dict with a ``"status"`` key.
        dependencies: Mapping of service name to the names it depends on.
        fixed_services: Names of services that have been remediated.
        root_cause_map: Mapping whose keys are the root-cause service names;
            the values are not read here.

    Returns:
        A new mapping of service name to updated (copied) service dict.
    """
    updated = {name: dict(info) for name, info in services.items()}

    # First, mark remediated services as healthy.
    for svc_name in fixed_services:
        if svc_name in updated:
            updated[svc_name]["status"] = ServiceStatus.HEALTHY

    # These filters do not change between rounds, so apply them once:
    # skip unknown services, fixed services, unfixed root causes, and
    # services without dependencies.
    candidates = [
        (svc_name, deps)
        for svc_name, deps in dependencies.items()
        if deps
        and svc_name in updated
        and svc_name not in fixed_services
        and svc_name not in root_cause_map
    ]

    # In an acyclic graph every round settles at least one more level, so
    # len(dependencies) rounds always suffice; the bound also guarantees
    # termination if the graph contains a cycle. Never fewer than 5 rounds.
    max_rounds = max(5, len(dependencies))
    for _ in range(max_rounds):
        changed = False
        for svc_name, deps in candidates:
            dep_statuses = [updated[d]["status"] for d in deps if d in updated]
            if not dep_statuses:
                continue

            if any(
                s == ServiceStatus.DOWN or s == ServiceStatus.DEGRADED
                for s in dep_statuses
            ):
                # Downstream of DOWN or DEGRADED is DEGRADED.
                new_status = ServiceStatus.DEGRADED
            else:
                new_status = ServiceStatus.HEALTHY

            if updated[svc_name]["status"] != new_status:
                updated[svc_name]["status"] = new_status
                changed = True

        if not changed:
            break

    return updated


def format_metrics(metrics_list: List[Dict[str, Any]]) -> str:
    """Format time-series metrics into a readable fixed-width table.

    Columns are the union of keys across all rows, in first-seen order, so
    a key that only appears in a later row still gets a column. A row that
    lacks a column's key shows an empty cell. Each cell is left-aligned and
    padded to 18 characters; longer values are not truncated.

    Args:
        metrics_list: Rows of metrics, one dict per sample.

    Returns:
        The table as newline-joined text (header, dashed rule, one line per
        row), or a fixed "no metrics" message when ``metrics_list`` is empty.
    """
    if not metrics_list:
        return "No metrics available for this service."

    # A dict keeps insertion order and ignores repeats, giving an ordered
    # union of keys across rows.
    columns: Dict[Any, None] = {}
    for row in metrics_list:
        columns.update(dict.fromkeys(row))
    keys = list(columns)

    header = "  ".join(f"{k:<18}" for k in keys)
    lines = [header, "-" * len(header)]
    lines.extend(
        "  ".join(f"{str(row.get(k, '')):<18}" for k in keys)
        for row in metrics_list
    )
    return "\n".join(lines)


def format_logs(log_lines: List[str]) -> str:
    """Return the log lines joined by newlines, or a placeholder if empty."""
    return "\n".join(log_lines) if log_lines else "No logs available for this service."


def format_traces(trace_lines: List[str]) -> str:
    """Return the trace lines joined by newlines, or a placeholder if empty."""
    return "\n".join(trace_lines) if trace_lines else "No traces available for this service."


def format_deploy_history(deploy_lines: List[str]) -> str:
    """Return deploy-history lines joined by newlines, or a placeholder if empty."""
    if deploy_lines:
        return "\n".join(deploy_lines)
    return "No deploy history available for this service."


def format_dependencies(deps: List[str]) -> str:
    """Return a comma-separated ``Dependencies: ...`` line, or a placeholder if empty."""
    if deps:
        joined = ", ".join(deps)
        return "Dependencies: " + joined
    return "This service has no upstream dependencies."


def format_runbook(runbook: str) -> str:
    """Return the runbook text unchanged, or a placeholder when it is empty."""
    return runbook or "No runbook available for this service."


def format_config_diff(config_data: Dict[str, str]) -> str:
    """Render the ``"diff"`` and ``"current"`` entries of a config record.

    Returns a placeholder message when ``config_data`` is empty. Otherwise
    the diff section (if present) comes first, followed by the current
    config section (if present); the two are joined by a newline, and the
    current section starts with its own newline. Other keys are ignored, so
    a non-empty dict with neither key yields an empty string.
    """
    if not config_data:
        return "No config data available for this service."

    sections = (
        [f"Config diff: {config_data['diff']}"] if "diff" in config_data else []
    )
    if "current" in config_data:
        sections.append(f"\nCurrent config:\n{config_data['current']}")
    return "\n".join(sections)


def ping_service(status: ServiceStatus, service_name: str) -> str:
    """Return a simulated health-check ping line for a service.

    HEALTHY yields a fast 200 OK line, DEGRADED a slow 200 OK line, and any
    other status a "connection refused" line.
    """
    if status == ServiceStatus.HEALTHY:
        return f"PING {service_name}: responding on :8080/healthz — 200 OK (latency: 5ms)"
    if status == ServiceStatus.DEGRADED:
        return f"PING {service_name}: responding on :8080/healthz — 200 OK (latency: 1200ms, SLOW)"
    # Every remaining status is reported as unreachable.
    return f"PING {service_name}: connection refused on :8080/healthz — service unreachable"