File size: 11,222 Bytes
06b4790
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4887b5f
06b4790
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df52e99
06b4790
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4887b5f
06b4790
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4887b5f
06b4790
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
#!/usr/bin/env python3
"""
validate.py — Pre-submission validation script.

Run this before submitting to confirm all checklist items pass:
    python validate.py

Exit code 0 = all checks passed.
Exit code 1 = one or more checks failed.
"""
import sys
import os
import random
import traceback

# Put this script's own directory first on the import path so the lazy
# `env`, `models` and `graders` imports inside main() resolve to the files
# sitting next to validate.py.
sys.path.insert(0, os.path.dirname(__file__))

# Status markers wrapped in ANSI SGR colour codes (92 = bright green,
# 91 = bright red, 93 = bright yellow; "\033[0m" resets the colour).
PASS = "\033[92m✓\033[0m"
FAIL = "\033[91m✗\033[0m"
WARN = "\033[93m!\033[0m"  # not referenced by the code visible in this file

# Names of the checks that failed; check() appends to it and main() reads it
# to build the summary and pick the exit code.
failures = []


def check(name: str, fn) -> bool:
    """Run one validation check and print a pass/fail line for it.

    Args:
        name: Human-readable label printed in the report and, on failure,
            appended to the module-level ``failures`` list.
        fn: Zero-argument callable. Returning ``True`` or ``None`` means the
            check passed; any other return value is printed as the failure
            reason. Raising an ``Exception`` also counts as a failure.

    Returns:
        ``True`` if the check passed, ``False`` otherwise.
    """
    try:
        result = fn()
    except Exception as e:
        # A bare ``assert cond`` raises with an empty message; fall back to
        # the exception class name so the report line is never blank.
        print(f"  {FAIL}  {name}: {str(e) or type(e).__name__}")
        traceback.print_exc()
        failures.append(name)
        return False

    if result is True or result is None:
        print(f"  {PASS}  {name}")
        return True

    # Any other return value (typically a string) is the failure reason.
    print(f"  {FAIL}  {name}: {result}")
    failures.append(name)
    return False


# Every task id the checks below iterate over; kept in one place so the
# per-task checks and the "all N tasks" label cannot drift apart.
_TASK_IDS = ("easy", "medium", "hard", "bonus", "security", "database", "failover")


def _require(condition, message: str) -> None:
    """Raise ``AssertionError(message)`` unless *condition* is truthy.

    Used instead of the ``assert`` statement, which is compiled away under
    ``python -O`` and would make every validation check pass vacuously.
    """
    if not condition:
        raise AssertionError(message)


def _random_rollout(env, rng, step_cap: int, on_result=None) -> None:
    """Step *env* with random action types chosen by *rng*.

    Stops as soon as a step result reports ``done`` or after *step_cap*
    steps, whichever comes first. When *on_result* is given it is called
    with every step result before the ``done`` flag is examined.
    """
    # Imported lazily so an import failure is reported by check() instead of
    # aborting the whole script at load time.
    from models import Action, ActionType

    action_types = list(ActionType)  # built once, not once per step
    for _ in range(step_cap):
        result = env.step(Action(action_type=rng.choice(action_types)))
        if on_result is not None:
            on_result(result)
        if result.done:
            break


def _grade(env, task_id: str):
    """Return ``grade_episode``'s score for what *env* has played so far."""
    from graders.grader import grade_episode

    s = env.state()
    return grade_episode(
        task_id, s.action_history, s.ground_truth_root_cause,
        s.ground_truth_fix, s.incident_resolved, s.total_reward,
    )


def _failing_service(env) -> str:
    """Derive a service name from *env*'s ground-truth root cause.

    Removes ``"memory_leak_"`` and turns underscores into hyphens.
    NOTE(review): presumably easy-task root causes look like
    ``memory_leak_<service>`` — confirm against the scenario generator.
    """
    root_cause = env.state().ground_truth_root_cause
    return root_cause.replace("memory_leak_", "").replace("_", "-")


def main():
    """Run every validation check, print a report, and exit.

    Exits with status 0 when no check failed and 1 otherwise. Project
    modules are imported inside each check so that a broken import is
    reported as a failed check rather than crashing the script.
    """
    print("\n=== DevOps Incident Response — OpenEnv Validation ===\n")

    # --- Imports ---
    print("[ Imports ]")

    def check_imports():
        # The imports themselves are the test; the names are deliberately unused.
        from env import DevOpsIncidentEnv  # noqa: F401
        from models import Action, ActionType, Observation, StepResult, State  # noqa: F401
        from graders.grader import grade_episode  # noqa: F401
        return True

    check("All modules import cleanly", check_imports)

    # --- Reset returns valid Observation ---
    print("\n[ reset() ]")

    def check_reset_easy():
        from env import DevOpsIncidentEnv
        obs = DevOpsIncidentEnv(task_id="easy", seed=42).reset()
        _require(obs.step == 0, f"expected step 0 after reset, got {obs.step}")
        _require(len(obs.services) > 0, "observation lists no services")
        _require(len(obs.active_alerts) > 0, "observation lists no active alerts")
        _require(obs.task_id == "easy", f"task_id is {obs.task_id!r}, expected 'easy'")
        return True

    def check_reset_all_tasks():
        from env import DevOpsIncidentEnv
        for task_id in _TASK_IDS:
            obs = DevOpsIncidentEnv(task_id=task_id, seed=42).reset()
            _require(obs.task_id == task_id, f"task_id mismatch for {task_id}")
            _require(obs.max_steps > 0, f"max_steps={obs.max_steps} for {task_id}")
        return True

    def check_reset_reproducible():
        from env import DevOpsIncidentEnv
        # Three fresh environments with the same seed must expose the same
        # first-service memory reading.
        results = []
        for _ in range(3):
            obs = DevOpsIncidentEnv(task_id="easy", seed=42).reset()
            results.append(obs.services[0].memory_percent)
        _require(len(set(results)) == 1,
                 f"Different results for same seed: {results}")
        return True

    def check_seed_variety():
        from env import DevOpsIncidentEnv
        roots = set()
        for seed in range(10):
            env = DevOpsIncidentEnv(task_id="easy", seed=seed)
            env.reset()
            roots.add(env.state().ground_truth_root_cause)
        _require(len(roots) > 1, f"All seeds produce same scenario: {roots}")
        return True

    check("reset() returns valid Observation for easy task", check_reset_easy)
    check(f"reset() works for all {len(_TASK_IDS)} tasks", check_reset_all_tasks)
    check("Same seed always produces same episode", check_reset_reproducible)
    check("Different seeds produce different scenarios", check_seed_variety)

    # --- step() ---
    print("\n[ step() ]")

    def check_step_returns_result():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType, StepResult
        env = DevOpsIncidentEnv(task_id="easy", seed=42)
        env.reset()
        result = env.step(Action(action_type=ActionType.NOOP))
        _require(isinstance(result, StepResult),
                 f"step() returned {type(result).__name__}, expected StepResult")
        _require(isinstance(result.reward, float),
                 f"reward is {type(result.reward).__name__}, expected float")
        _require(isinstance(result.done, bool),
                 f"done is {type(result.done).__name__}, expected bool")
        _require(result.observation.step == 1,
                 f"expected observation.step 1, got {result.observation.step}")
        return True

    def check_step_reward_in_range():
        from env import DevOpsIncidentEnv

        def reward_in_range(result):
            _require(-1.0 <= result.reward <= 1.0,
                     f"reward={result.reward} out of range")

        # One RNG shared across all tasks, so the action stream continues
        # from task to task.
        rng = random.Random(0)
        for task_id in _TASK_IDS:
            env = DevOpsIncidentEnv(task_id=task_id, seed=42)
            env.reset()
            _random_rollout(env, rng, 30, on_result=reward_in_range)
        return True

    def check_max_steps_terminates():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        env = DevOpsIncidentEnv(task_id="easy", seed=42)
        env.reset()
        # Hard ceiling on NOOP steps. NOTE(review): presumably >= the easy
        # task's max_steps — confirm against the environment's configuration.
        step_ceiling = 20
        steps = 0
        done = False
        while not done:
            done = env.step(Action(action_type=ActionType.NOOP)).done
            steps += 1
            _require(steps <= step_ceiling, "Episode never terminated")
        return True

    check("step() returns valid StepResult", check_step_returns_result)
    check("step() rewards always in [-1.0, 1.0]", check_step_reward_in_range)
    check("Episode terminates at max_steps", check_max_steps_terminates)

    # --- state() ---
    print("\n[ state() ]")

    def check_state_has_ground_truth():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        env = DevOpsIncidentEnv(task_id="medium", seed=42)
        env.reset()
        env.step(Action(action_type=ActionType.NOOP))
        s = env.state()
        _require(s.ground_truth_root_cause != "", "ground_truth_root_cause is empty")
        _require(s.ground_truth_fix != "", "ground_truth_fix is empty")
        _require(len(s.action_history) == 1,
                 f"expected 1 action in history, got {len(s.action_history)}")
        return True

    check("state() returns ground truth and action history", check_state_has_ground_truth)

    # --- Graders ---
    print("\n[ Graders ]")

    def check_graders_in_range():
        from env import DevOpsIncidentEnv
        rng = random.Random(99)
        for task_id in _TASK_IDS:
            env = DevOpsIncidentEnv(task_id=task_id, seed=42)
            env.reset()
            _random_rollout(env, rng, 30)
            score = _grade(env, task_id)
            _require(0.0 <= score <= 1.0, f"{task_id} score={score} out of [0,1]")
        return True

    def check_graders_not_constant():
        from env import DevOpsIncidentEnv
        scores = []
        for seed in [1, 2, 3, 42, 99]:
            # Fresh RNG per episode, derived from the environment seed.
            rng = random.Random(seed * 7)
            env = DevOpsIncidentEnv(task_id="easy", seed=seed)
            env.reset()
            _random_rollout(env, rng, 15)
            scores.append(_grade(env, "easy"))
        _require(len(set(scores)) > 1, f"Grader returns constant score: {scores}")
        return True

    def check_optimal_agent_scores_high():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        # Easy task optimal sequence: inspect, diagnose, then restart the
        # failing service.
        env = DevOpsIncidentEnv(task_id="easy", seed=42)
        env.reset()
        failing = _failing_service(env)
        for act in [
            Action(action_type=ActionType.READ_LOGS, service=failing),
            Action(action_type=ActionType.READ_METRICS, service=failing),
            Action(action_type=ActionType.DIAGNOSE, root_cause=f"memory leak {failing}"),
            Action(action_type=ActionType.RESTART_SERVICE, service=failing),
        ]:
            if env.step(act).done:
                break
        score = _grade(env, "easy")
        _require(score >= 0.85, f"Optimal agent scored only {score:.3f} on easy")
        return True

    check("All graders return scores in [0.0, 1.0]", check_graders_in_range)
    check("Grader does not return constant scores across episodes", check_graders_not_constant)
    check("Optimal agent scores >= 0.85 on easy task", check_optimal_agent_scores_high)

    # --- Collateral damage penalty ---
    print("\n[ Reward shaping ]")

    def check_collateral_damage_penalty():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        env = DevOpsIncidentEnv(task_id="easy", seed=42)
        env.reset()
        healthy = [svc for svc in env.state().current_observation.services
                   if svc.status == "healthy"]
        _require(healthy, "No healthy services to test with")
        result = env.step(Action(action_type=ActionType.RESTART_SERVICE,
                                 service=healthy[0].name))
        _require(result.reward < 0,
                 f"Expected negative reward for healthy restart, got {result.reward}")
        return True

    def check_info_gathering_rewarded():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        env = DevOpsIncidentEnv(task_id="easy", seed=42)
        env.reset()
        failing = _failing_service(env)
        result = env.step(Action(action_type=ActionType.READ_LOGS, service=failing))
        _require(result.reward > 0,
                 f"Expected positive reward for reading failing service logs, got {result.reward}")
        return True

    check("Restarting healthy service gives negative reward", check_collateral_damage_penalty)
    check("Reading failing service logs gives positive reward", check_info_gathering_rewarded)

    # --- Files present ---
    print("\n[ Required files ]")

    base_dir = os.path.dirname(__file__)
    for fname in ["openenv.yaml", "Dockerfile", "requirements.txt",
                  "inference.py", "README.md", "env.py", "api.py"]:
        path = os.path.join(base_dir, fname)
        # p=path binds the current path now; a plain closure would see only
        # the last loop value. The lambda yields True or a "Missing: ..."
        # string, which check() prints as the failure reason.
        check(f"{fname} exists", lambda p=path: os.path.exists(p) or f"Missing: {p}")

    # --- Summary ---
    print()
    if not failures:
        print(f"{PASS} All checks passed! Ready to submit.\n")
        sys.exit(0)
    else:
        print(f"{FAIL} {len(failures)} check(s) failed: {failures}\n")
        sys.exit(1)


# Run the validation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()