#!/usr/bin/env python3
"""
Pre-submission validation script for the MLOps Firefighter environment.

Checks all requirements from the hackathon rubric:
  1. openenv.yaml exists and is valid
  2. Typed Pydantic models exist
  3. step()/reset()/state() work correctly
  4. 3+ tasks with graders
  5. Grader scores in 0.0–1.0 range
  6. All required endpoints respond
  7. Baseline produces scores
  8. Dockerfile exists

Every check is recorded through ``check()``; ``main()`` prints a summary and
returns a process exit code (0 when every check passed, 1 otherwise).
"""
import json
import sys
import os

try:
    import yaml
except ImportError:
    # Like every other optional dependency in this script, a missing PyYAML is
    # reported as a failed check in section 1 rather than as a traceback.
    yaml = None

_ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

# Make the project modules (models, tasks, server.*) importable.
sys.path.insert(0, _ROOT_DIR)
sys.path.insert(0, os.path.join(_ROOT_DIR, "server"))

PASS = "✅"
FAIL = "❌"

# Accumulates (check name, outcome) pairs in the order the checks ran.
results = []


def check(name: str, condition: bool, detail: str = ""):
    """Record one check outcome, print it, and return ``condition``.

    Args:
        name: Label shown in the report and in the failure summary.
        condition: Outcome of the check; truthy means the check passed.
        detail: Optional text appended to the printed line after an em dash.

    Returns:
        ``condition`` unchanged, so the call can be used inside an ``if``.
    """
    status = PASS if condition else FAIL
    results.append((name, condition))
    msg = f" {status} {name}"
    if detail:
        msg += f" — {detail}"
    print(msg)
    return condition


def _check_manifest() -> None:
    """Section 1: openenv.yaml exists, parses, and has the expected keys."""
    print("[1/8] OpenEnv manifest (openenv.yaml)")
    yaml_path = os.path.join(_ROOT_DIR, "openenv.yaml")
    if not check("openenv.yaml exists", os.path.exists(yaml_path)):
        return

    if yaml is None:
        check("openenv.yaml parses", False, "PyYAML is not installed")
        return
    try:
        with open(yaml_path, encoding="utf-8") as f:
            manifest = yaml.safe_load(f)
    except (OSError, ValueError, yaml.YAMLError) as e:
        check("openenv.yaml parses", False, str(e))
        return
    # An empty file loads as None and a scalar/list file is not a manifest;
    # either would make the key lookups below raise instead of failing cleanly.
    if not isinstance(manifest, dict):
        check(
            "openenv.yaml parses",
            False,
            f"top level is {type(manifest).__name__}, expected a mapping",
        )
        return

    check("Has name", "name" in manifest)
    check("Has version", "version" in manifest)
    check("Has description", "description" in manifest)
    tasks = manifest.get("tasks")
    check("Has tasks", isinstance(tasks, (list, tuple, dict)) and len(tasks) >= 3)
    # A bare ``tags:`` key loads as None, hence ``or []``.
    check("Has 'openenv' tag", "openenv" in (manifest.get("tags") or []))


def _check_models() -> None:
    """Section 2: the models module exposes the expected Pydantic types."""
    print("\n[2/8] Typed Pydantic models")
    try:
        # MLOpsObservation is imported only to prove that it is importable.
        from models import MLOpsAction, MLOpsObservation, ActionType  # noqa: F401

        check("MLOpsAction importable", True)
        check("MLOpsObservation importable", True)
        check("ActionType enum exists", len(ActionType) >= 10)
        # Verify they're Pydantic
        action = MLOpsAction(action_type=ActionType.INSPECT_METRICS)
        check("MLOpsAction is Pydantic", hasattr(action, "model_dump"))
    except Exception as e:
        check("Models import", False, str(e))


def _check_environment() -> None:
    """Section 3: reset()/step()/state() on the environment class."""
    print("\n[3/8] Environment interface (reset/step/state)")
    try:
        from server.environment import MLOpsFirefighterEnvironment

        env = MLOpsFirefighterEnvironment()
        obs = env.reset(task_id="task_threshold_drift")
        check("reset() returns observation", obs is not None)
        check("reset() obs has done=False", obs.done is False)
        check("reset() obs has step_number=0", obs.step_number == 0)

        from models import MLOpsAction, ActionType

        obs2 = env.step(MLOpsAction(action_type=ActionType.INSPECT_METRICS))
        check("step() returns observation", obs2 is not None)
        check("step() increments step_number", obs2.step_number == 1)
        check("step() returns reward", isinstance(obs2.reward, float))

        st = env.state()
        check("state() returns dict", isinstance(st, dict))
        check("state() has episode_id", "episode_id" in st)
        check("state() has step_count", "step_count" in st)
    except Exception as e:
        check("Environment interface", False, str(e))


def _check_tasks() -> None:
    """Section 4: at least three tasks, covering easy/medium/hard."""
    print("\n[4/8] Task definitions")
    try:
        from tasks import ALL_TASKS

        check("3+ tasks defined", len(ALL_TASKS) >= 3)
        difficulties = {t.difficulty for t in ALL_TASKS.values()}
        check("Has easy task", "easy" in difficulties)
        check("Has medium task", "medium" in difficulties)
        check("Has hard task", "hard" in difficulties)
        for tid, task in ALL_TASKS.items():
            check(f"Task '{tid}' has root_causes", len(task.root_causes) > 0)
            check(f"Task '{tid}' has diagnostics", len(task.required_diagnostics) > 0)
            check(f"Task '{tid}' has remediations", len(task.correct_remediations) > 0)
    except Exception as e:
        check("Tasks", False, str(e))


def _check_grader() -> None:
    """Section 5: grader scores stay in [0, 1] and separate good from empty runs."""
    print("\n[5/8] Grader scoring (0.0–1.0)")
    try:
        from tasks import grade_episode, ALL_TASKS
        # Not used below; kept so that a broken models module also fails here.
        from models import ActionType  # noqa: F401

        for tid, task in ALL_TASKS.items():
            # Perfect
            score, _ = grade_episode(
                task=task,
                actions_taken=[{"action_type": d.value} for d in task.required_diagnostics],
                diagnosis_submitted={"root_cause": task.root_causes[0]},
                remediation_applied=[r.value for r in task.correct_remediations],
                total_steps=len(task.required_diagnostics) + 2,
            )
            check(f"'{tid}' perfect score in [0,1]", 0.0 <= score <= 1.0, f"{score:.3f}")

            # Empty
            score_z, _ = grade_episode(
                task=task,
                actions_taken=[],
                diagnosis_submitted=None,
                remediation_applied=[],
                total_steps=task.max_steps,
            )
            check(f"'{tid}' empty score in [0,1]", 0.0 <= score_z <= 1.0, f"{score_z:.3f}")

            # Partial credit varies
            check(
                f"'{tid}' grader differentiates",
                score > score_z,
                f"perfect={score:.3f} > empty={score_z:.3f}",
            )
    except Exception as e:
        check("Grader", False, str(e))


def _check_endpoints():
    """Section 6: exercise the HTTP endpoints through FastAPI's TestClient.

    Returns:
        The TestClient if it could be created (even when a later endpoint
        check raised), otherwise ``None``.
    """
    print("\n[6/8] HTTP endpoints")
    client = None
    try:
        from fastapi.testclient import TestClient
        from server.app import app

        client = TestClient(app)

        r = client.get("/health")
        check("/health returns 200", r.status_code == 200)

        r = client.get("/tasks")
        check("/tasks returns 200", r.status_code == 200)
        check("/tasks has action_schema", "action_schema" in r.json())

        r = client.post("/reset", json={"task_id": "task_threshold_drift"})
        check("/reset returns 200", r.status_code == 200)

        r = client.post("/step", json={"action_type": "inspect_metrics"})
        check("/step returns 200", r.status_code == 200)

        r = client.get("/state")
        check("/state returns 200", r.status_code == 200)

        # Complete an episode for grader test
        client.post("/reset", json={"task_id": "task_threshold_drift"})
        client.post("/step", json={"action_type": "inspect_metrics"})
        client.post(
            "/step",
            json={
                "action_type": "submit_diagnosis",
                "parameters": {"root_cause": "test", "summary": "t"},
            },
        )
        r = client.post("/grader", json={})
        check("/grader returns 200", r.status_code == 200)

        r = client.post("/baseline")
        check("/baseline returns 200", r.status_code == 200)
        check("/baseline has scores", "average_score" in r.json())
    except Exception as e:
        check("Endpoints", False, str(e))
    return client


def _check_baseline(client) -> None:
    """Section 7: /baseline reports a positive average and per-task scores in [0, 1].

    Args:
        client: The TestClient returned by the endpoint section, or ``None``
            when it could not be created.
    """
    print("\n[7/8] Baseline scoring")
    if client is None:
        # Say why, instead of surfacing an unbound-variable error message.
        check("Baseline", False, "HTTP test client unavailable (see endpoint checks)")
        return
    try:
        r = client.post("/baseline")
        data = r.json()
        avg = data["average_score"]
        check("Baseline avg score > 0", avg > 0, f"avg={avg}")
        for tid, result in data["baseline_results"].items():
            s = result["score"]
            check(f"Baseline '{tid}' in [0,1]", 0.0 <= s <= 1.0, f"{s:.3f}")
    except Exception as e:
        check("Baseline", False, str(e))


def _check_dockerfile() -> None:
    """Section 8: a Dockerfile exists and contains FROM, EXPOSE and CMD."""
    print("\n[8/8] Dockerfile")
    df_path = os.path.join(_ROOT_DIR, "Dockerfile")
    if not check("Dockerfile exists", os.path.exists(df_path)):
        return
    try:
        with open(df_path, encoding="utf-8") as f:
            content = f.read()
    except (OSError, ValueError) as e:
        check("Dockerfile readable", False, str(e))
        return
    check("Dockerfile has FROM", "FROM" in content)
    check("Dockerfile has EXPOSE", "EXPOSE" in content)
    check("Dockerfile has CMD", "CMD" in content)


def _print_summary() -> int:
    """Print the pass/fail summary of ``results`` and return the exit code."""
    total = len(results)
    passed = sum(1 for _, ok in results if ok)
    failed = total - passed

    print("\n" + "=" * 60)
    if failed == 0:
        print(f" {PASS} ALL {total} CHECKS PASSED — Ready to submit!")
    else:
        print(f" {FAIL} {failed}/{total} checks failed")
        for name, ok in results:
            if not ok:
                print(f" - {name}")
    print("=" * 60 + "\n")
    return 0 if failed == 0 else 1


def main():
    """Run all eight validation sections in order and print a summary.

    Returns:
        0 if every recorded check passed, 1 otherwise.
    """
    print("\n" + "=" * 60)
    print(" MLOps Firefighter — Pre-Submission Validator")
    print("=" * 60 + "\n")

    _check_manifest()
    _check_models()
    _check_environment()
    _check_tasks()
    _check_grader()
    client = _check_endpoints()
    _check_baseline(client)
    _check_dockerfile()

    return _print_summary()


if __name__ == "__main__":
    sys.exit(main())