# File size: 7,143 Bytes
# 34bd75f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
"""
Interactive simulation runner — demonstrates OnCallEnv with optimal agent strategies.
Shows step-by-step investigation and remediation for all 6 tasks.
"""

import sys
sys.path.insert(0, ".")

from environment import OnCallEnvironment
from models import Action
from graders import grade_task


def banner(text: str):
    """Print ``text`` as a section heading framed by two 70-character ``=`` rules.

    A blank line is emitted before the upper rule, and the heading itself is
    indented by two spaces.
    """
    rule = "=" * 70
    print("\n" + rule, f"  {text}", rule, sep="\n")


def run_task(env: OnCallEnvironment, task_id: str, actions: list[str]):
    """Replay a scripted command sequence against one task and report the outcome.

    The environment is reset to ``task_id`` and the initial observation (task,
    goal, step budget, services, alerts) is printed. Each command in ``actions``
    is then submitted in order; for every step at most the first five lines of
    the action result are echoed, followed by the running reward total. Replay
    stops early as soon as a step response is flagged as done. Finally the
    environment state and the grader's score for it are printed.

    Args:
        env: Environment to drive; it is reset at the start of the call.
        task_id: Identifier of the task to load, also handed to the grader.
        actions: Command strings to execute, one per step.

    Returns:
        The ``score`` attribute of the environment's final state.
    """
    preview_limit = 5  # result lines echoed per step before truncating

    # --- Initial observation -------------------------------------------------
    initial = env.reset(task_id)
    print(f"\n  Task:       {initial.task_id}")
    print(f"  Goal:       {initial.goal}")
    print(f"  Max steps:  {initial.max_steps}")
    print(f"  Services:   {', '.join(initial.services)}")
    print("  Alerts:")
    for alert in initial.alerts:
        print(f"    [{alert.severity.upper()}] {alert.service}: {alert.message}")
    print()

    # --- Scripted steps ------------------------------------------------------
    for step_no, command in enumerate(actions, start=1):
        outcome = env.step(Action(command=command))
        observation = outcome.observation
        if observation.last_action_error:
            verdict = "ERROR"
        else:
            verdict = "OK"
        print(f"  Step {step_no}: {command}")

        # A missing result is treated as an empty string, i.e. one blank line.
        result_lines = (observation.last_action_result or "").split("\n")
        hidden = len(result_lines) - preview_limit
        for text in result_lines[:preview_limit]:
            print(f"    | {text}")
        if hidden > 0:
            print(f"    | ... ({hidden} more lines)")
        print(f"    [{verdict}] Running score: {outcome.reward.total}")
        print()

        if not outcome.done:
            continue
        print(f"  >>> Episode finished: {outcome.info.get('reason', '?')}")
        break

    # --- Final report --------------------------------------------------------
    final_state = env.state()
    graded = grade_task(task_id, final_state)
    print("\n  -- Final Results --")
    print(f"  Env Score:     {final_state.score}")
    print(f"  Grader Score:  {graded}")
    print("  Breakdown:")
    for component, value in final_state.reward_breakdown.items():
        print(f"    {component:20s} {value:+.3f}")
    print(f"  Steps used:     {final_state.step}")
    print(f"  Root cause ID:  {final_state.root_cause_identified}")
    print(f"  Remediation:    {final_state.remediation_applied}")
    return final_state.score


def main():
    """Run every scripted scenario on one environment and print a score summary.

    Each scenario is announced with a banner and replayed through ``run_task``;
    the returned score is stored under the scenario's task id. Afterwards a
    summary table lists each score and the arithmetic mean of all of them.
    """
    # One entry per scenario, in presentation order:
    # (banner title, task id, scripted command sequence).
    scenarios = [
        # Easy
        (
            "EASY: Memory Leak in Payment Service",
            "easy_memory_leak",
            [
                "check_logs payment-service",
                "check_metrics payment-service",
                "check_metrics api-gateway",
                "restart_service payment-service",
                "mark_resolved payment-service memory leak OOM out of memory causing repeated kills",
            ],
        ),
        # Medium
        (
            "MEDIUM: Cascading Connection Pool Exhaustion",
            "medium_cascading_failure",
            [
                "check_metrics api-gateway",
                "check_logs api-gateway",
                "check_dependencies api-gateway",
                "check_metrics order-service",
                "check_logs order-service",
                "check_config order-service",
                "update_config order-service db_pool_size 50",
                "mark_resolved order-service connection pool exhausted db_pool_size config changed to 5 by auto-scaler",
            ],
        ),
        # Hard (cache)
        (
            "HARD: Subtle Cache Bug Causing Cross-Service Degradation",
            "hard_cache_degradation",
            [
                "check_metrics api-gateway",
                "check_metrics order-service",
                "check_metrics product-service",
                "check_metrics cache-service",
                "check_logs cache-service",
                "check_deploy_history cache-service",
                "check_metrics postgres-primary",
                "rollback_deploy cache-service",
                "mark_resolved cache-service deployment changed key hashing algorithm causing 60% cache miss rate",
            ],
        ),
        # Medium (DNS)
        (
            "MEDIUM: DNS Misconfiguration Causing Intermittent Failures",
            "medium_dns_misconfiguration",
            [
                "check_metrics order-service",
                "check_logs order-service",
                "check_config order-service",
                "check_metrics inventory-service",
                "check_metrics api-gateway",
                "update_config order-service inventory_host inventory-service.internal",
                "mark_resolved order-service dns hostname misconfiguration inventory_host pointed to decommissioned host",
            ],
        ),
        # Hard (replication)
        (
            "HARD: Database Replication Lag from Runaway Batch Job",
            "hard_replication_lag",
            [
                "check_metrics user-service",
                "check_logs user-service",
                "check_metrics order-service",
                "check_logs order-service",
                "check_metrics postgres-primary",
                "check_logs postgres-primary",
                "check_config postgres-primary",
                "check_metrics postgres-replica",
                "update_config postgres-primary batch_job_enabled false",
                "mark_resolved postgres-primary batch job nightly_aggregation running during peak hours causing replication lag",
            ],
        ),
        # Expert (multiple root causes)
        (
            "EXPERT: Simultaneous Bad Deployment and Config Drift",
            "expert_multi_root_cause",
            [
                "check_metrics api-gateway",
                "check_logs api-gateway",
                "check_metrics search-service",
                "check_logs search-service",
                "check_deploy_history search-service",
                "check_metrics order-service",
                "check_logs order-service",
                "check_config order-service",
                "check_metrics elasticsearch",
                "rollback_deploy search-service",
                "update_config order-service db_pool_size 50",
                "mark_resolved search-service bad deployment v3.1.0 broke elasticsearch query AND order-service db_pool_size config reduced to 3 by capacity-planner both issues fixed",
            ],
        ),
    ]

    env = OnCallEnvironment()
    scores = {}
    for title, task_id, commands in scenarios:
        banner(title)
        scores[task_id] = run_task(env, task_id, commands)

    # Summary table: one row per task, then the mean across all tasks.
    banner("SIMULATION SUMMARY")
    running_total = 0.0
    for task_id, task_score in scores.items():
        print(f"  {task_id:35s}  {task_score:.3f}")
        running_total += task_score
    mean_score = running_total / len(scores)
    print(f"  {'':35s}  -----")
    print(f"  {'Average':35s}  {mean_score:.3f}")
    print()


# Run the simulation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()