Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| validate.py — Pre-Submission Validation Script | |
| =============================================== | |
| Run this before submitting to catch any disqualifying issues. | |
| Usage: | |
| python validate.py # full validation (no server needed) | |
| python validate.py --url <ENV_BASE_URL> # also ping a live server | |
| Exit code 0 = all checks passed. | |
| Exit code 1 = one or more checks failed. | |
| """ | |
| import importlib | |
| import json | |
| import os | |
| import subprocess | |
| import sys | |
| import time | |
| from pathlib import Path | |
| from typing import Callable, List, Optional, Tuple | |
# ---------------------------------------------------------------------------
# ANSI colours
# ---------------------------------------------------------------------------
# Raw escape sequences used to colour terminal output.
GREEN = "\033[92m"
RED = "\033[91m"
YELLOW = "\033[93m"
RESET = "\033[0m"
BOLD = "\033[1m"

# Pre-rendered status badges: colour + label + reset.
PASS = GREEN + "✓ PASS" + RESET
FAIL = RED + "✗ FAIL" + RESET
WARN = YELLOW + "⚠ WARN" + RESET

# Directory containing this script; every check resolves project files against it.
ROOT = Path(__file__).parent.resolve()

# Accumulated check outcomes as (name, passed, detail) tuples.
results: List[Tuple[str, bool, str]] = []
def check(name: str) -> Callable:
    """Decorator factory — registers a check function and records its result.

    The decorated function must return a ``(passed, detail)`` pair. Calling the
    wrapper runs the function, appends ``(name, passed, detail)`` to the
    module-level ``results`` list and prints a PASS/FAIL line followed by the
    detail text, one line at a time. An exception raised by the function is
    recorded as a failed check instead of propagating.

    Args:
        name: Human-readable label shown in the output and stored in ``results``.

    Returns:
        A decorator producing a wrapper that returns ``None``.
    """
    import functools  # local import: the module header does not import functools

    def decorator(fn: Callable) -> Callable:
        # wraps() keeps the check's __name__/__doc__ and exposes __wrapped__.
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            try:
                passed, detail = fn(*args, **kwargs)
            except Exception as exc:
                passed, detail = False, f"Exception: {exc}"
            results.append((name, passed, detail))
            status = PASS if passed else FAIL
            print(f" {status} {name}")
            if detail:
                prefix = " "
                # str() guards against a check returning a non-string detail.
                for line in str(detail).splitlines():
                    print(f"{prefix}{line}")
        return wrapper
    return decorator
| # --------------------------------------------------------------------------- | |
| # Checks | |
| # --------------------------------------------------------------------------- | |
def check_openenv_yaml():
    """Validate ``openenv.yaml`` at the project root.

    With PyYAML installed the file is parsed and must be a mapping holding the
    keys ``spec_version``, ``name``, ``app``, ``port`` and ``tasks``, with at
    least three tasks. Without PyYAML only a substring check for those key
    names is performed.

    Returns:
        ``(passed, detail)`` — ``detail`` explains the failure or summarises
        the spec version and task names.
    """
    required = ["spec_version", "name", "app", "port", "tasks"]
    p = ROOT / "openenv.yaml"
    if not p.exists():
        return False, "openenv.yaml not found"
    # Explicit encoding so the result does not depend on the platform locale.
    text = p.read_text(encoding="utf-8")
    try:
        import yaml  # type: ignore
    except ImportError:
        # Fallback: naive key check
        missing = [k for k in required if k not in text]
        if missing:
            return False, f"Missing keys: {missing}"
        return True, "yaml library not installed — basic text check passed"
    try:
        data = yaml.safe_load(text)
    except yaml.YAMLError as exc:
        return False, f"openenv.yaml is not valid YAML: {exc}"
    # An empty file parses to None and a scalar/list document has no keys.
    if not isinstance(data, dict):
        return False, "openenv.yaml must contain a top-level mapping"
    missing = [k for k in required if k not in data]
    if missing:
        return False, f"Missing keys in openenv.yaml: {missing}"
    tasks = data.get("tasks") or []  # `tasks:` with no value parses to None
    if not isinstance(tasks, list):
        return False, f"'tasks' must be a list, found {type(tasks).__name__}"
    if len(tasks) < 3:
        return False, f"Need at least 3 tasks, found {len(tasks)}"
    # Tolerate malformed task entries instead of raising KeyError/TypeError.
    task_names = [t.get("name") if isinstance(t, dict) else t for t in tasks]
    return True, f"spec_version={data['spec_version']} | tasks={task_names}"
def check_dockerfile():
    """Check that a Dockerfile exists at the project root and looks complete.

    The file passes when its text contains the substrings ``FROM``, ``COPY``,
    ``CMD`` and ``8000``. Returns ``(passed, detail)``.
    """
    dockerfile = ROOT / "Dockerfile"
    if not dockerfile.exists():
        return False, "Dockerfile not found at project root"
    content = dockerfile.read_text()
    # (label reported on failure, substring that must appear in the file)
    expectations = (
        ("FROM", "FROM"),
        ("COPY", "COPY"),
        ("CMD", "CMD"),
        ("port 8000", "8000"),
    )
    absent = [label for label, needle in expectations if needle not in content]
    if absent:
        return False, f"Dockerfile may be incomplete — missing: {absent}"
    return True, "Dockerfile is valid"
def check_inference_exists():
    """Report whether ``inference.py`` is present at the project root.

    Returns ``(True, <path>)`` when the file exists, otherwise
    ``(False, <reason>)``.
    """
    script = ROOT / "inference.py"
    if script.exists():
        return True, str(script)
    return False, "inference.py not found — must be at the project root"
def check_inference_format():
    """Check that ``inference.py`` mentions every required log marker.

    This is a plain substring search over the file text; it does not run the
    script. Returns ``(passed, detail)``.
    """
    script = ROOT / "inference.py"
    if not script.exists():
        return False, "inference.py not found"
    source = script.read_text()
    # Order matters: absent markers are reported in this order.
    expected_markers = (
        "[START]",
        "[STEP]",
        "[END]",
        "task=",
        "reward=",
        "success=",
        "steps=",
        "rewards=",
    )
    absent = [marker for marker in expected_markers if marker not in source]
    if absent:
        return False, f"Missing log markers: {absent}"
    return True, "All required log markers present"
def check_openai_usage():
    """Check that ``inference.py`` imports the OpenAI client and names the env vars.

    Both tests are substring searches over the file text: one of the two
    import forms must appear, and all of ``API_BASE_URL``, ``MODEL_NAME`` and
    ``HF_TOKEN`` must be mentioned. Returns ``(passed, detail)``.
    """
    script = ROOT / "inference.py"
    if not script.exists():
        return False, "inference.py not found"
    source = script.read_text()

    import_forms = ("from openai import OpenAI", "import openai")
    if not any(form in source for form in import_forms):
        return False, "OpenAI client import not found"

    env_vars = ("API_BASE_URL", "MODEL_NAME", "HF_TOKEN")
    if not all(var in source for var in env_vars):
        return False, "Required env vars (API_BASE_URL / MODEL_NAME / HF_TOKEN) not referenced"

    return True, "OpenAI client + required env vars found"
def check_models():
    """Import the project's ``models`` module and instantiate its three types.

    Constructs ``DispatchTriageAction``, ``DispatchTriageObservation`` and
    ``DispatchTriageState`` with minimal arguments; any exception raised while
    importing or constructing is reported as a failure.

    Returns:
        ``(passed, detail)`` — on success ``detail`` shows the action and the
        state's ``difficulty`` attribute.
    """
    # Put the project root first on sys.path without stacking a duplicate
    # entry every time one of the checks runs.
    root = str(ROOT)
    if root in sys.path:
        sys.path.remove(root)
    sys.path.insert(0, root)
    try:
        models = importlib.import_module("models")
        action = models.DispatchTriageAction(incident_id=0, unit_id=1)
        # Constructed only to prove the observation model accepts these fields.
        models.DispatchTriageObservation(
            done=False, reward=0.0,
            incidents=[], units=[],
            dispatch_count=0, message="test", score_so_far=0.0,
        )
        state = models.DispatchTriageState()
        return True, f"Action={action} | State difficulty={state.difficulty}"
    except Exception as exc:
        return False, str(exc)
def check_environment_logic():
    """Smoke-test ``reset()`` and one ``step()`` for each difficulty level.

    For ``easy``, ``medium`` and ``hard`` a fresh environment is reset, the
    initial observation is validated, one dispatch is made with the first
    unresolved incident and the first available unit, and ``score_so_far`` is
    required to stay within ``[0.0, 1.0]``.

    Returns:
        ``(passed, detail)`` — ``detail`` is either the first failure or a
        per-difficulty summary joined with `` | ``.
    """
    # Put the project root first on sys.path without stacking duplicates.
    root = str(ROOT)
    if root in sys.path:
        sys.path.remove(root)
    sys.path.insert(0, root)
    try:
        env_mod = importlib.import_module("server.Dispatch_triage_env_environment")
        models = importlib.import_module("models")
        Env = env_mod.DispatchTriageEnvironment
        Action = models.DispatchTriageAction
    except Exception as exc:
        return False, f"Import failed: {exc}"

    def _require(condition, message):
        """Raise ValueError(message) unless condition holds.

        Used instead of ``assert`` so the validation still runs under
        ``python -O``, which strips assert statements.
        """
        if not condition:
            raise ValueError(message)

    report = []
    for difficulty in ["easy", "medium", "hard"]:
        try:
            env = Env()
            obs = env.reset(difficulty=difficulty)
            _require(not obs.done, "reset() returned done=True")
            _require(len(obs.incidents) > 0, "no incidents in observation")
            _require(len(obs.units) > 0, "no units in observation")
            _require(
                0.0 <= obs.score_so_far <= 1.0,
                f"score_so_far out of range: {obs.score_so_far}",
            )
            # Take one valid step. A default of None avoids a bare
            # StopIteration, whose empty message made failures unreadable.
            inc = next((i for i in obs.incidents if not i.resolved), None)
            _require(inc is not None, "no unresolved incident to dispatch to")
            unit = next((u for u in obs.units if u.available), None)
            _require(unit is not None, "no available unit to dispatch")
            obs2 = env.step(Action(incident_id=inc.id, unit_id=unit.id))
            _require(
                0.0 <= obs2.score_so_far <= 1.0,
                f"score_so_far out of range after step: {obs2.score_so_far}",
            )
            report.append(f"{difficulty}: score={obs2.score_so_far:.4f} done={obs2.done}")
        except Exception as exc:
            return False, f"{difficulty} failed: {exc}"
    return True, " | ".join(report)
def check_reward_range():
    """Play up to 20 greedy steps per difficulty and range-check the score.

    Each step dispatches the first available unit to the first unresolved
    incident. After every step ``score_so_far`` must lie in ``[0.0, 1.0]``;
    every violation is collected. Returns ``(passed, detail)``.
    """
    sys.path.insert(0, str(ROOT))
    try:
        environment_module = importlib.import_module("server.Dispatch_triage_env_environment")
        models_module = importlib.import_module("models")
        env_cls = environment_module.DispatchTriageEnvironment
        action_cls = models_module.DispatchTriageAction
    except Exception as exc:
        return False, f"Import failed: {exc}"

    max_steps = 20
    violations = []
    for level in ("easy", "medium", "hard"):
        episode = env_cls()
        observation = episode.reset(difficulty=level)
        for _step in range(max_steps):
            if observation.done:
                break
            open_incidents = [inc for inc in observation.incidents if not inc.resolved]
            free_units = [unit for unit in observation.units if unit.available]
            # Stop as soon as there is nothing left to pair up.
            if not (open_incidents and free_units):
                break
            dispatch = action_cls(
                incident_id=open_incidents[0].id,
                unit_id=free_units[0].id,
            )
            observation = episode.step(dispatch)
            score = observation.score_so_far
            if not (0.0 <= score <= 1.0):
                violations.append(f"{level}: reward={score}")

    if violations:
        return False, f"Out-of-range rewards: {violations}"
    return True, "All rewards in [0.0, 1.0] across easy/medium/hard"
def check_cascade_penalty():
    """Compare two fixed dispatch orders on the ``hard`` scenario.

    The first order handles incident 1 before incident 0; the second handles
    incident 0 before incident 1. The check passes only when the second order
    ends with a strictly lower ``score_so_far``. Returns ``(passed, detail)``.
    """
    sys.path.insert(0, str(ROOT))
    try:
        environment_module = importlib.import_module("server.Dispatch_triage_env_environment")
        models_module = importlib.import_module("models")
        env_cls = environment_module.DispatchTriageEnvironment
        action_cls = models_module.DispatchTriageAction
    except Exception as exc:
        return False, f"Import failed: {exc}"

    def _final_score(dispatch_plan):
        """Play (incident_id, unit_id) pairs on a fresh hard episode; return the last score."""
        episode = env_cls()
        observation = episode.reset(difficulty="hard")
        for incident_id, unit_id in dispatch_plan:
            observation = episode.step(action_cls(incident_id=incident_id, unit_id=unit_id))
        return observation.score_so_far

    # Optimal: resolve gas leak (id=1) before cardiac (id=0)
    optimal_score = _final_score([
        (2, 1),  # fire → fire_truck
        (1, 0),  # gas → ambulance (wrong type but no cascade)
        (0, 2),  # cardiac after gas resolved
    ])
    # Sub-optimal: dispatch cardiac (id=0) before gas (id=1)
    bad_score = _final_score([
        (2, 1),  # fire → fire_truck
        (0, 0),  # cardiac BEFORE gas → cascade penalty
        (1, 2),  # gas after cardiac
    ])

    if not (bad_score >= optimal_score):
        return True, (
            f"Correct order: {optimal_score:.4f} | Wrong order (cascade): {bad_score:.4f} — "
            "penalty is functioning correctly"
        )
    return False, (
        f"Cascade penalty not working: wrong order score ({bad_score:.4f}) "
        f">= correct order score ({optimal_score:.4f})"
    )
def check_task_count():
    """Validate the shape of the environment module's ``SCENARIOS`` mapping.

    Requires at least three difficulty levels; each level must have as many
    ``_meta`` entries as ``incidents`` and at least three incidents and three
    units. Any exception is reported as a failure. Returns ``(passed, detail)``.
    """
    sys.path.insert(0, str(ROOT))
    try:
        environment_module = importlib.import_module("server.Dispatch_triage_env_environment")
        scenarios = environment_module.SCENARIOS
        diffs = list(scenarios.keys())
        if len(diffs) < 3:
            return False, f"Only {len(diffs)} difficulty levels: {diffs}"

        for level in scenarios:
            spec = scenarios[level]
            incident_total = len(spec["incidents"])
            unit_total = len(spec["units"])
            meta_total = len(spec["_meta"])
            if incident_total != meta_total:
                return False, f"{level}: incidents ({incident_total}) vs _meta ({meta_total}) count mismatch"
            if incident_total < 3 or unit_total < 3:
                return False, f"{level}: need ≥3 incidents and ≥3 units, got {incident_total}/{unit_total}"

        per_level = [len(spec["incidents"]) for spec in scenarios.values()]
        return True, f"Difficulties: {diffs} | incidents per level: {per_level}"
    except Exception as exc:
        return False, str(exc)
def check_pyproject():
    """Check that ``pyproject.toml`` exists and mentions the required packages.

    The test is a substring search for ``openenv-core`` and ``openai`` in the
    file text. Returns ``(passed, detail)``.
    """
    manifest = ROOT / "pyproject.toml"
    if not manifest.exists():
        return False, "pyproject.toml not found"
    contents = manifest.read_text()
    wanted = ["openenv-core", "openai"]
    absent = [package for package in wanted if package not in contents]
    if not absent:
        return True, f"Found: {wanted}"
    return False, f"Missing dependencies: {absent}"
def check_readme():
    """Check that ``README.md`` describes the dispatch environment.

    The lower-cased README text must contain every domain term and none of
    the echo-environment template phrases. Returns ``(passed, detail)``.
    """
    readme = ROOT / "README.md"
    if not readme.exists():
        return False, "README.md not found"
    lowered = readme.read_text().lower()

    # Must reference dispatch-specific terms
    domain_terms = ("incident", "dispatch", "unit", "ambulance", "reward")
    absent = [term for term in domain_terms if term not in lowered]
    if absent:
        return False, f"README missing domain terms: {absent} (may still be echo template)"

    # Must NOT still contain echo-env boilerplate
    template_terms = ("echoed_message", "message_length", "echo environment")
    leftovers = [term for term in template_terms if term in lowered]
    if leftovers:
        return False, f"README still contains echo-environment template text: {leftovers}"

    return True, "README covers the dispatch environment correctly"
| # --------------------------------------------------------------------------- | |
| # Optional: live server ping | |
| # --------------------------------------------------------------------------- | |
def check_live_server(url: str) -> None:
    """Ping a running server and test reset() via HTTP.

    Performs ``GET <url>/health`` (expects status 200) and ``POST <url>/reset``
    with ``{"difficulty": "easy"}`` (expects a JSON object that has non-empty
    ``incidents`` and ``units``, either at the top level or under
    ``observation``). Each outcome is appended to the module-level ``results``
    list and printed; network and parsing errors are recorded as failures
    rather than raised.

    Args:
        url: Base URL of the server; a trailing slash is ignored.
    """
    import urllib.request
    import urllib.error

    base = url.rstrip("/")
    print(f"\n{BOLD}[Live Server Check] {url}{RESET}")

    # Health check
    name = "GET /health returns 200"
    try:
        # Context manager closes the HTTP response even on early exit.
        with urllib.request.urlopen(f"{base}/health", timeout=10) as resp:
            status = resp.status
        if status == 200:
            results.append((name, True, f"status={status}"))
            print(f" {PASS} {name}")
        else:
            results.append((name, False, f"status={status}"))
            print(f" {FAIL} {name}")
    except Exception as exc:
        results.append((name, False, str(exc)))
        print(f" {FAIL} {name} — {exc}")

    # POST /reset
    name = "POST /reset responds correctly"
    try:
        payload = json.dumps({"difficulty": "easy"}).encode()
        req = urllib.request.Request(
            f"{base}/reset",
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=15) as resp:
            body = json.loads(resp.read())
        if not isinstance(body, dict):
            raise ValueError(f"expected a JSON object, got {type(body).__name__}")
        # Accept either flat observation or nested StepResult
        obs = body.get("observation", body)
        if not isinstance(obs, dict):
            raise ValueError(f"'observation' is not a JSON object: {type(obs).__name__}")
        has_inc = "incidents" in obs and len(obs["incidents"]) > 0
        has_uni = "units" in obs and len(obs["units"]) > 0
        if has_inc and has_uni:
            results.append((name, True, f"incidents={len(obs['incidents'])} units={len(obs['units'])}"))
            print(f" {PASS} {name}")
        else:
            results.append((name, False, f"Response missing incidents/units: {list(obs.keys())}"))
            print(f" {FAIL} {name}")
    except Exception as exc:
        results.append((name, False, str(exc)))
        print(f" {FAIL} {name} — {exc}")
| # --------------------------------------------------------------------------- | |
| # Main | |
| # --------------------------------------------------------------------------- | |
def main() -> int:
    """Run every validation check, print a summary and return the exit code.

    Parses the optional ``--url`` argument, runs the static and environment
    checks, optionally pings a live server, then prints how many recorded
    checks passed along with the details of each failure.

    Returns:
        0 when every recorded check passed, 1 otherwise.
    """
    import argparse

    def _run(fn: Callable) -> None:
        """Run one check and make sure its outcome lands in ``results``.

        A check that returns ``None`` or has already appended to ``results``
        was recorded by the ``@check`` wrapper and is left alone. A plain
        function returning ``(passed, detail)`` would otherwise have its
        result thrown away, leaving ``results`` empty and the summary
        reporting success; its outcome is recorded through ``check()``.
        """
        recorded_before = len(results)
        try:
            outcome = fn()
        except Exception as exc:
            # Same policy as the @check wrapper: a crash is a failed check.
            outcome = (False, f"Exception: {exc}")
        if outcome is None or len(results) != recorded_before:
            return
        label = getattr(fn, "__name__", repr(fn))
        check(label)(lambda: outcome)()

    parser = argparse.ArgumentParser(description="Pre-submission validation for Dispatch Triage Env")
    parser.add_argument("--url", help="Live server URL to ping (optional)", default=None)
    args = parser.parse_args()

    print(f"\n{BOLD}{'='*60}{RESET}")
    print(f"{BOLD} Dispatch Triage Env — Pre-Submission Validation{RESET}")
    print(f"{BOLD}{'='*60}{RESET}\n")

    # Run all registered checks
    print(f"{BOLD}[Static / Logic Checks]{RESET}")
    for static_check in (
        check_openenv_yaml,
        check_dockerfile,
        check_inference_exists,
        check_inference_format,
        check_openai_usage,
        check_pyproject,
        check_readme,
    ):
        _run(static_check)

    print(f"\n{BOLD}[Environment Logic Checks]{RESET}")
    for logic_check in (
        check_models,
        check_environment_logic,
        check_reward_range,
        check_cascade_penalty,
        check_task_count,
    ):
        _run(logic_check)

    # Optional live server (records its own results)
    if args.url:
        check_live_server(args.url)

    # Summary
    passed = sum(1 for _, ok, _ in results if ok)
    total = len(results)
    failed = [(n, d) for n, ok, d in results if not ok]
    print(f"\n{BOLD}{'='*60}{RESET}")
    print(f"{BOLD} Results: {passed}/{total} checks passed{RESET}")
    if failed:
        print(f"\n{RED}{BOLD} FAILED CHECKS:{RESET}")
        for name, detail in failed:
            print(f" {RED}✗ {name}{RESET}")
            if detail:
                for line in detail.splitlines():
                    print(f" {line}")
        print(f"\n{RED}Submission is NOT ready. Fix the issues above.{RESET}\n")
        return 1
    else:
        print(f"\n{GREEN}{BOLD} All checks passed! Submission is ready.{RESET}\n")
        return 0


if __name__ == "__main__":
    sys.exit(main())