#!/usr/bin/env python3
"""
validate.py — Pre-submission validation script.

Run this before submitting to confirm all checklist items pass:

    python validate.py

Exit code 0 = all checks passed. Exit code 1 = one or more checks failed.
"""
import sys
import os
import random
import traceback

# Make sibling project modules (env, models, graders) importable regardless
# of the directory this script is launched from.
sys.path.insert(0, os.path.dirname(__file__))

# ANSI-colored status markers for terminal output.
PASS = "\033[92m✓\033[0m"
FAIL = "\033[91m✗\033[0m"
WARN = "\033[93m!\033[0m"

# Every task id the environment must support.  Kept in a single module-level
# constant so the individual checks below cannot drift out of sync (the
# original repeated this literal list in three separate checks).
ALL_TASKS = ["easy", "medium", "hard", "bonus", "security", "database", "failover"]

# Names of checks that did not pass; consulted at the end for the exit code.
failures = []


def check(name: str, fn):
    """Run one named check and report its outcome.

    Args:
        name: Human-readable check name, printed with a pass/fail marker.
        fn: Zero-argument callable.  Returning True or None counts as a
            pass; any other return value is printed as a failure message;
            a raised exception is a failure and its traceback is printed.

    Returns:
        True if the check passed, False otherwise.  Failures are also
        recorded in the module-level ``failures`` list.
    """
    try:
        result = fn()
        if result is True or result is None:
            print(f" {PASS} {name}")
            return True
        else:
            print(f" {FAIL} {name}: {result}")
            failures.append(name)
            return False
    except Exception as e:
        print(f" {FAIL} {name}: {e}")
        traceback.print_exc()
        failures.append(name)
        return False


def main():
    """Run the full validation suite; exit 0 if all pass, 1 otherwise."""
    print("\n=== DevOps Incident Response — OpenEnv Validation ===\n")

    # --- Imports ---
    print("[ Imports ]")

    def check_imports():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType, Observation, StepResult, State
        from graders.grader import grade_episode
        return True

    check("All modules import cleanly", check_imports)

    # --- Reset returns valid Observation ---
    print("\n[ reset() ]")

    def check_reset_easy():
        from env import DevOpsIncidentEnv
        env = DevOpsIncidentEnv(task_id="easy", seed=42)
        obs = env.reset()
        assert obs.step == 0
        assert len(obs.services) > 0
        assert len(obs.active_alerts) > 0
        assert obs.task_id == "easy"
        return True

    def check_reset_all_tasks():
        from env import DevOpsIncidentEnv
        for task_id in ALL_TASKS:
            env = DevOpsIncidentEnv(task_id=task_id, seed=42)
            obs = env.reset()
            assert obs.task_id == task_id, f"task_id mismatch for {task_id}"
            assert obs.max_steps > 0
        return True

    def check_reset_reproducible():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        # Same seed three times must yield identical initial state.
        results = []
        for _ in range(3):
            env = DevOpsIncidentEnv(task_id="easy", seed=42)
            obs = env.reset()
            results.append(obs.services[0].memory_percent)
        assert len(set(results)) == 1, f"Different results for same seed: {results}"
        return True

    def check_seed_variety():
        from env import DevOpsIncidentEnv
        # Distinct seeds should produce more than one root-cause scenario.
        roots = set()
        for seed in range(10):
            env = DevOpsIncidentEnv(task_id="easy", seed=seed)
            env.reset()
            s = env.state()
            roots.add(s.ground_truth_root_cause)
        assert len(roots) > 1, f"All seeds produce same scenario: {roots}"
        return True

    check("reset() returns valid Observation for easy task", check_reset_easy)
    check("reset() works for all 7 tasks", check_reset_all_tasks)
    check("Same seed always produces same episode", check_reset_reproducible)
    check("Different seeds produce different scenarios", check_seed_variety)

    # --- step() ---
    print("\n[ step() ]")

    def check_step_returns_result():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType, StepResult
        env = DevOpsIncidentEnv(task_id="easy", seed=42)
        env.reset()
        result = env.step(Action(action_type=ActionType.NOOP))
        assert isinstance(result, StepResult)
        assert isinstance(result.reward, float)
        assert isinstance(result.done, bool)
        assert result.observation.step == 1
        return True

    def check_step_reward_in_range():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        rng = random.Random(0)
        for task_id in ALL_TASKS:
            env = DevOpsIncidentEnv(task_id=task_id, seed=42)
            env.reset()
            done = False
            steps = 0
            while not done and steps < 30:
                action = Action(action_type=rng.choice(list(ActionType)))
                result = env.step(action)
                assert -1.0 <= result.reward <= 1.0, f"reward={result.reward} out of range"
                done = result.done
                steps += 1
        return True

    def check_max_steps_terminates():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        env = DevOpsIncidentEnv(task_id="easy", seed=42)
        env.reset()
        done = False
        steps = 0
        # Hard safety cap: a buggy env that never sets done must fail this
        # check (steps reaches 100 > 20 below) instead of hanging the
        # validator forever, as the original unbounded loop would.
        while not done and steps < 100:
            result = env.step(Action(action_type=ActionType.NOOP))
            done = result.done
            steps += 1
        assert steps <= 20, "Episode never terminated"
        return True

    check("step() returns valid StepResult", check_step_returns_result)
    check("step() rewards always in [-1.0, 1.0]", check_step_reward_in_range)
    check("Episode terminates at max_steps", check_max_steps_terminates)

    # --- state() ---
    print("\n[ state() ]")

    def check_state_has_ground_truth():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        env = DevOpsIncidentEnv(task_id="medium", seed=42)
        env.reset()
        env.step(Action(action_type=ActionType.NOOP))
        s = env.state()
        assert s.ground_truth_root_cause != ""
        assert s.ground_truth_fix != ""
        assert len(s.action_history) == 1
        return True

    check("state() returns ground truth and action history", check_state_has_ground_truth)

    # --- Graders ---
    print("\n[ Graders ]")

    def check_graders_in_range():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        from graders.grader import grade_episode
        rng = random.Random(99)
        for task_id in ALL_TASKS:
            env = DevOpsIncidentEnv(task_id=task_id, seed=42)
            env.reset()
            done = False
            steps = 0
            while not done and steps < 30:
                action = Action(action_type=rng.choice(list(ActionType)))
                result = env.step(action)
                done = result.done
                steps += 1
            s = env.state()
            score = grade_episode(
                task_id,
                s.action_history,
                s.ground_truth_root_cause,
                s.ground_truth_fix,
                s.incident_resolved,
                s.total_reward,
            )
            assert 0.0 <= score <= 1.0, f"{task_id} score={score} out of [0,1]"
        return True

    def check_graders_not_constant():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        from graders.grader import grade_episode
        # Random play across several seeds should not always score the same.
        scores = []
        for seed in [1, 2, 3, 42, 99]:
            rng = random.Random(seed * 7)
            env = DevOpsIncidentEnv(task_id="easy", seed=seed)
            env.reset()
            done = False
            steps = 0
            while not done and steps < 15:
                action = Action(action_type=rng.choice(list(ActionType)))
                result = env.step(action)
                done = result.done
                steps += 1
            s = env.state()
            score = grade_episode(
                "easy",
                s.action_history,
                s.ground_truth_root_cause,
                s.ground_truth_fix,
                s.incident_resolved,
                s.total_reward,
            )
            scores.append(score)
        assert len(set(scores)) > 1, f"Grader returns constant score: {scores}"
        return True

    def check_optimal_agent_scores_high():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        from graders.grader import grade_episode
        # Easy task optimal sequence: inspect, diagnose, then restart the
        # service named in the ground-truth root cause.
        env = DevOpsIncidentEnv(task_id="easy", seed=42)
        env.reset()
        s0 = env.state()
        # NOTE(review): assumes root-cause strings look like
        # "memory_leak_<service_name>" with underscores for hyphens — confirm
        # against the scenario generator.
        failing = s0.ground_truth_root_cause.replace("memory_leak_", "").replace("_", "-")
        for act in [
            Action(action_type=ActionType.READ_LOGS, service=failing),
            Action(action_type=ActionType.READ_METRICS, service=failing),
            Action(action_type=ActionType.DIAGNOSE, root_cause=f"memory leak {failing}"),
            Action(action_type=ActionType.RESTART_SERVICE, service=failing),
        ]:
            result = env.step(act)
            if result.done:
                break
        s = env.state()
        score = grade_episode(
            "easy",
            s.action_history,
            s.ground_truth_root_cause,
            s.ground_truth_fix,
            s.incident_resolved,
            s.total_reward,
        )
        assert score >= 0.85, f"Optimal agent scored only {score:.3f} on easy"
        return True

    check("All graders return scores in [0.0, 1.0]", check_graders_in_range)
    check("Grader does not return constant scores across episodes", check_graders_not_constant)
    check("Optimal agent scores >= 0.85 on easy task", check_optimal_agent_scores_high)

    # --- Collateral damage penalty ---
    print("\n[ Reward shaping ]")

    def check_collateral_damage_penalty():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        env = DevOpsIncidentEnv(task_id="easy", seed=42)
        env.reset()
        s0 = env.state()
        healthy = [svc for svc in s0.current_observation.services if svc.status == "healthy"]
        assert len(healthy) > 0, "No healthy services to test with"
        result = env.step(Action(action_type=ActionType.RESTART_SERVICE, service=healthy[0].name))
        assert result.reward < 0, f"Expected negative reward for healthy restart, got {result.reward}"
        return True

    def check_info_gathering_rewarded():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        env = DevOpsIncidentEnv(task_id="easy", seed=42)
        env.reset()
        s0 = env.state()
        failing = s0.ground_truth_root_cause.replace("memory_leak_", "").replace("_", "-")
        result = env.step(Action(action_type=ActionType.READ_LOGS, service=failing))
        assert result.reward > 0, f"Expected positive reward for reading failing service logs, got {result.reward}"
        return True

    check("Restarting healthy service gives negative reward", check_collateral_damage_penalty)
    check("Reading failing service logs gives positive reward", check_info_gathering_rewarded)

    # --- Files present ---
    print("\n[ Required files ]")
    for fname in ["openenv.yaml", "Dockerfile", "requirements.txt",
                  "inference.py", "README.md", "env.py", "api.py"]:
        path = os.path.join(os.path.dirname(__file__), fname)
        # Default-arg binding (p=path) freezes the path per iteration,
        # avoiding the late-binding closure pitfall in this loop.
        check(f"{fname} exists", lambda p=path: os.path.exists(p) or f"Missing: {p}")

    # --- Summary ---
    print()
    if not failures:
        print(f"{PASS} All checks passed! Ready to submit.\n")
        sys.exit(0)
    else:
        print(f"{FAIL} {len(failures)} check(s) failed: {failures}\n")
        sys.exit(1)


if __name__ == "__main__":
    main()