Spaces:

ShubhanKamat
/

mlops-firefighter

Running

App Files Files Community

mlops-firefighter / validate.py

ShubhanKamat

MLOps Firefighter - OpenEnv environment

670f19f about 2 months ago

raw

history blame contribute delete

8.25 kB

	#!/usr/bin/env python3
	"""
	Pre-submission validation script for the MLOps Firefighter environment.

	Checks all requirements from the hackathon rubric:
	1. openenv.yaml exists and is valid
	2. Typed Pydantic models exist
	3. step()/reset()/state() work correctly
	4. 3+ tasks with graders
	5. Grader scores in 0.0–1.0 range
	6. All required endpoints respond
	7. Baseline produces scores
	8. Dockerfile exists
	"""

	import json
	import sys
	import os
	import yaml

	# Make both the project root and its "server" package directory importable.
	# The slice assignment prepends in one step; "server" ends up first on the
	# path with the project root right behind it.
	_HERE = os.path.dirname(os.path.abspath(__file__))
	sys.path[:0] = [os.path.join(_HERE, "server"), _HERE]

	# Status markers printed next to every check.
	PASS = "✅"
	FAIL = "❌"
	# Accumulates (check name, outcome) pairs in the order the checks ran.
	results = []


	def check(name: str, condition: bool, detail: str = ""):
	    """Record the outcome of one validation check and print it.

	    Args:
	        name: Label of the check, shown in the output and in the summary.
	        condition: Outcome of the check; truthy means it passed.
	        detail: Optional extra text appended to the printed line.

	    Returns:
	        ``condition``, unchanged, so callers can branch on the outcome.
	    """
	    results.append((name, condition))
	    marker = FAIL if not condition else PASS
	    suffix = f" — {detail}" if detail else ""
	    print(f" {marker} {name}" + suffix)
	    return condition


	def _validate_manifest(yaml_path):
	    """Section 1: check that openenv.yaml exists, parses, and has the required keys."""
	    print("[1/8] OpenEnv manifest (openenv.yaml)")
	    if not check("openenv.yaml exists", os.path.exists(yaml_path)):
	        return

	    manifest = None
	    problem = ""
	    try:
	        with open(yaml_path, encoding="utf-8") as f:
	            manifest = yaml.safe_load(f)
	    except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
	        problem = str(e)
	    else:
	        if not isinstance(manifest, dict):
	            # An empty file loads as None; a top-level list or scalar is
	            # equally unusable for the key lookups below.
	            problem = f"top-level value is {type(manifest).__name__}, expected a mapping"
	    if not check("openenv.yaml parses to a mapping", isinstance(manifest, dict), problem):
	        return

	    check("Has name", "name" in manifest)
	    check("Has version", "version" in manifest)
	    check("Has description", "description" in manifest)
	    # A key that is present but null must count as a failed check, not a crash.
	    tasks = manifest.get("tasks")
	    check("Has tasks", isinstance(tasks, (list, dict)) and len(tasks) >= 3)
	    tags = manifest.get("tags") or []
	    check("Has 'openenv' tag", isinstance(tags, (list, str)) and "openenv" in tags)


	def _validate_models():
	    """Section 2: check that the typed models import and look like Pydantic models."""
	    print("\n[2/8] Typed Pydantic models")
	    try:
	        from models import MLOpsAction, MLOpsObservation, ActionType
	        check("MLOpsAction importable", True)
	        check("MLOpsObservation importable", True)
	        check("ActionType enum exists", len(ActionType) >= 10)
	        # Verify they're Pydantic
	        a = MLOpsAction(action_type=ActionType.INSPECT_METRICS)
	        check("MLOpsAction is Pydantic", hasattr(a, "model_dump"))
	    except Exception as e:  # any import/construction error is a failed check
	        check("Models import", False, str(e))


	def _validate_environment():
	    """Section 3: exercise reset(), step() and state() on a fresh environment."""
	    print("\n[3/8] Environment interface (reset/step/state)")
	    try:
	        from server.environment import MLOpsFirefighterEnvironment
	        env = MLOpsFirefighterEnvironment()

	        obs = env.reset(task_id="task_threshold_drift")
	        check("reset() returns observation", obs is not None)
	        check("reset() obs has done=False", obs.done is False)
	        check("reset() obs has step_number=0", obs.step_number == 0)

	        from models import MLOpsAction, ActionType
	        obs2 = env.step(MLOpsAction(action_type=ActionType.INSPECT_METRICS))
	        check("step() returns observation", obs2 is not None)
	        check("step() increments step_number", obs2.step_number == 1)
	        check("step() returns reward", isinstance(obs2.reward, float))

	        st = env.state()
	        check("state() returns dict", isinstance(st, dict))
	        check("state() has episode_id", "episode_id" in st)
	        check("state() has step_count", "step_count" in st)
	    except Exception as e:
	        check("Environment interface", False, str(e))


	def _validate_tasks():
	    """Section 4: check that at least three tasks of varied difficulty are defined."""
	    print("\n[4/8] Task definitions")
	    try:
	        from tasks import ALL_TASKS
	        check("3+ tasks defined", len(ALL_TASKS) >= 3)
	        difficulties = {t.difficulty for t in ALL_TASKS.values()}
	        check("Has easy task", "easy" in difficulties)
	        check("Has medium task", "medium" in difficulties)
	        check("Has hard task", "hard" in difficulties)
	        for tid, task in ALL_TASKS.items():
	            check(f"Task '{tid}' has root_causes", len(task.root_causes) > 0)
	            check(f"Task '{tid}' has diagnostics", len(task.required_diagnostics) > 0)
	            check(f"Task '{tid}' has remediations", len(task.correct_remediations) > 0)
	    except Exception as e:
	        check("Tasks", False, str(e))


	def _validate_grader():
	    """Section 5: check that grader scores stay in [0, 1] and separate good from empty runs."""
	    print("\n[5/8] Grader scoring (0.0–1.0)")
	    try:
	        from tasks import grade_episode, ALL_TASKS
	        from models import ActionType
	        for tid, task in ALL_TASKS.items():
	            # Perfect
	            score, _ = grade_episode(
	                task=task,
	                actions_taken=[{"action_type": d.value} for d in task.required_diagnostics],
	                diagnosis_submitted={"root_cause": task.root_causes[0]},
	                remediation_applied=[r.value for r in task.correct_remediations],
	                total_steps=len(task.required_diagnostics) + 2,
	            )
	            check(f"'{tid}' perfect score in [0,1]", 0.0 <= score <= 1.0, f"{score:.3f}")

	            # Empty
	            score_z, _ = grade_episode(
	                task=task, actions_taken=[], diagnosis_submitted=None,
	                remediation_applied=[], total_steps=task.max_steps,
	            )
	            check(f"'{tid}' empty score in [0,1]", 0.0 <= score_z <= 1.0, f"{score_z:.3f}")

	            # Partial credit varies
	            check(f"'{tid}' grader differentiates", score > score_z,
	                  f"perfect={score:.3f} > empty={score_z:.3f}")
	    except Exception as e:
	        check("Grader", False, str(e))


	def _validate_endpoints():
	    """Section 6: hit every HTTP endpoint; return the test client, or None if it could not be built."""
	    print("\n[6/8] HTTP endpoints")
	    client = None
	    try:
	        from fastapi.testclient import TestClient
	        from server.app import app
	        client = TestClient(app)

	        r = client.get("/health")
	        check("/health returns 200", r.status_code == 200)

	        r = client.get("/tasks")
	        check("/tasks returns 200", r.status_code == 200)
	        check("/tasks has action_schema", "action_schema" in r.json())

	        r = client.post("/reset", json={"task_id": "task_threshold_drift"})
	        check("/reset returns 200", r.status_code == 200)

	        r = client.post("/step", json={"action_type": "inspect_metrics"})
	        check("/step returns 200", r.status_code == 200)

	        r = client.get("/state")
	        check("/state returns 200", r.status_code == 200)

	        # Complete an episode for grader test
	        client.post("/reset", json={"task_id": "task_threshold_drift"})
	        client.post("/step", json={"action_type": "inspect_metrics"})
	        client.post("/step", json={"action_type": "submit_diagnosis",
	                                   "parameters": {"root_cause": "test", "summary": "t"}})
	        r = client.post("/grader", json={})
	        check("/grader returns 200", r.status_code == 200)

	        r = client.post("/baseline")
	        check("/baseline returns 200", r.status_code == 200)
	        check("/baseline has scores", "average_score" in r.json())
	    except Exception as e:
	        check("Endpoints", False, str(e))
	    return client


	def _validate_baseline(client):
	    """Section 7: check that the /baseline endpoint yields per-task scores in [0, 1]."""
	    print("\n[7/8] Baseline scoring")
	    if client is None:
	        # The client is created in the endpoint section; say so explicitly
	        # instead of surfacing an unbound-variable error.
	        check("Baseline", False, "HTTP test client could not be created (see endpoint checks)")
	        return
	    try:
	        r = client.post("/baseline")
	        data = r.json()
	        avg = data["average_score"]
	        check("Baseline avg score > 0", avg > 0, f"avg={avg}")
	        for tid, result in data["baseline_results"].items():
	            s = result["score"]
	            check(f"Baseline '{tid}' in [0,1]", 0.0 <= s <= 1.0, f"{s:.3f}")
	    except Exception as e:
	        check("Baseline", False, str(e))


	def _validate_dockerfile(df_path):
	    """Section 8: check that a Dockerfile exists and contains FROM, EXPOSE and CMD."""
	    print("\n[8/8] Dockerfile")
	    if not check("Dockerfile exists", os.path.exists(df_path)):
	        return
	    # Only plain substring tests follow, so undecodable bytes are replaced
	    # rather than allowed to abort the run.
	    with open(df_path, encoding="utf-8", errors="replace") as f:
	        content = f.read()
	    check("Dockerfile has FROM", "FROM" in content)
	    check("Dockerfile has EXPOSE", "EXPOSE" in content)
	    check("Dockerfile has CMD", "CMD" in content)


	def _summarize():
	    """Print the totals plus the names of failed checks; return the exit code."""
	    total = len(results)
	    passed = sum(1 for _, ok in results if ok)
	    failed = total - passed

	    print("\n" + "=" * 60)
	    if failed == 0:
	        print(f" {PASS} ALL {total} CHECKS PASSED — Ready to submit!")
	    else:
	        print(f" {FAIL} {failed}/{total} checks failed")
	        for name, ok in results:
	            if not ok:
	                print(f" - {name}")
	    print("=" * 60 + "\n")

	    return 0 if failed == 0 else 1


	def main():
	    """Run the eight pre-submission validation sections and print a summary.

	    Every section reports its outcomes through ``check()``. Sections that
	    import or call project code catch any exception and record it as a
	    failed check, so a broken component does not stop the later sections
	    from running.

	    Returns:
	        0 when every recorded check passed, 1 otherwise; the caller uses
	        this as the process exit status.
	    """
	    print("\n" + "=" * 60)
	    print(" MLOps Firefighter — Pre-Submission Validator")
	    print("=" * 60 + "\n")

	    # Resolve files relative to this script, not the current directory.
	    base_dir = os.path.dirname(os.path.abspath(__file__))

	    _validate_manifest(os.path.join(base_dir, "openenv.yaml"))
	    _validate_models()
	    _validate_environment()
	    _validate_tasks()
	    _validate_grader()
	    client = _validate_endpoints()
	    _validate_baseline(client)
	    _validate_dockerfile(os.path.join(base_dir, "Dockerfile"))

	    return _summarize()


	# Script entry point: the validator's return value becomes the exit status.
	if __name__ == "__main__":
	    raise SystemExit(main())