Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| validate.py — Pre-submission validation script. | |
| Checks all OpenEnv compliance requirements before submitting. | |
| Run: python validate.py | |
| """ | |
| import sys | |
| import json | |
| import yaml | |
| import importlib | |
| from pathlib import Path | |
# Status markers printed in front of every check result.
# NOTE(review): the previous literals were mojibake (they look like UTF-8
# emoji bytes decoded through a Greek single-byte code page); restored here
# to the symbols those byte sequences encode.
PASS = "✅"
FAIL = "❌"
WARN = "⚠️ "

# Labels of failed checks and of raised warnings, in the order they occurred.
# check() and warn() append to these; the final summary reads them.
errors = []
warnings = []
def check(condition: bool, label: str, detail: str = ""):
    """Report one mandatory check and record it when it fails.

    Args:
        condition: Outcome of the check; truthy means it passed.
        label: Short description printed next to the status marker.
        detail: Optional extra text appended after the label on failure.

    A passing check prints the PASS marker and the label. A failing check
    prints the FAIL marker, the label and the detail (if any), and appends
    the label to the module-level ``errors`` list.
    """
    if not condition:
        suffix = f": {detail}" if detail else ""
        print(f" {FAIL} {label}" + suffix)
        errors.append(label)
        return
    print(f" {PASS} {label}")
def warn(condition: bool, label: str, detail: str = ""):
    """Report one advisory check and record it as a warning when it fails.

    Args:
        condition: Outcome of the check; truthy means it passed.
        label: Short description printed next to the status marker.
        detail: Optional extra text appended after the label on failure.

    A passing check prints the PASS marker and the label. A failing check
    prints the WARN marker, the label and the detail (if any), and appends
    the label to the module-level ``warnings`` list.
    """
    if not condition:
        suffix = f": {detail}" if detail else ""
        print(f" {WARN} {label}" + suffix)
        warnings.append(label)
        return
    print(f" {PASS} {label}")
# Banner identifying this validation run.
# The title's dash was mojibake in the previous text; restored to an em dash.
print("\n" + "=" * 60)
print(" OpenEnv Validation — email-triage-env")
print("=" * 60 + "\n")
# --- 1. File structure -------------------------------------------------------
# Every listed path must exist relative to the current working directory.
print("1. Required files")
required_files = [
    "openenv.yaml",
    "Dockerfile",
    "requirements.txt",
    "inference.py",
    "README.md",
    "models.py",
    "environment.py",
    "server.py",
    "graders.py",
    "dataset.py",
]
for filename in required_files:
    is_present = Path(filename).exists()
    check(is_present, filename)
# --- 2. openenv.yaml ---------------------------------------------------------
# Parse the spec file and verify that the expected top-level keys are present.
print("\n2. openenv.yaml spec")
try:
    # Explicit UTF-8 so parsing does not depend on the platform locale.
    with open("openenv.yaml", encoding="utf-8") as fh:
        cfg = yaml.safe_load(fh)
    # An empty file loads as None and a scalar/list document is not a
    # mapping either; fail with a clear message instead of a TypeError
    # raised by the first `in` test below.
    if not isinstance(cfg, dict):
        raise ValueError(
            f"top-level document must be a mapping, got {type(cfg).__name__}"
        )
    check("name" in cfg, "has name field")
    check("version" in cfg, "has version field")
    # Only sized containers are counted, so a null or scalar `tasks` value
    # fails this one check rather than aborting the rest of the section.
    tasks = cfg.get("tasks")
    check(
        isinstance(tasks, (list, tuple, dict)) and len(tasks) >= 3,
        "has 3+ tasks",
    )
    check("endpoints" in cfg, "has endpoints section")
    check("observation_space" in cfg, "has observation_space")
    check("action_space" in cfg, "has action_space")
except Exception as e:
    # Missing file, YAML syntax errors and the mapping check above all
    # surface as one failed check carrying the exception text.
    check(False, "openenv.yaml parseable", str(e))
# --- 3. Pydantic models ------------------------------------------------------
# A single import statement brings in all five models; if it succeeds, each
# model is reported as importable, then a sample Reward is range-checked.
print("\n3. Typed models (Pydantic)")
try:
    from models import Observation, Action, Reward, StepResponse, EnvState

    for model_name in ("Observation", "Action", "Reward", "StepResponse", "EnvState"):
        check(True, f"{model_name} model imports")

    # Validate field ranges
    sample_reward = Reward(value=0.5, feedback="test")
    in_unit_interval = 0.0 <= sample_reward.value <= 1.0
    check(in_unit_interval, "Reward value in [0.0, 1.0]")
except Exception as e:
    check(False, "Models import cleanly", str(e))
# --- 4. Environment API ------------------------------------------------------
# Exercise reset(), state() and step() on a fresh environment, then confirm
# that each of the three tasks yields at least one email after reset().
print("\n4. Environment API (reset/step/state)")
try:
    from environment import EmailTriageEnv
    from models import Action, UrgencyLevel, EmailCategory, EmailAction

    triage_env = EmailTriageEnv()

    # reset()
    first_obs = triage_env.reset("task_easy")
    check(first_obs is not None, "reset() returns Observation")
    check(first_obs.current_email is not None, "reset() observation has current_email")
    check(first_obs.task_id == "task_easy", "reset() sets task_id")

    # state()
    env_state = triage_env.state()
    check(env_state is not None, "state() returns EnvState")
    check(env_state.task_id == "task_easy", "state() has correct task_id")

    # step() with a fixed, arbitrary action
    probe_action = Action(
        urgency=UrgencyLevel.MEDIUM,
        category=EmailCategory.SPAM,
        action=EmailAction.DELETE,
    )
    step_result = triage_env.step(probe_action)
    check(step_result is not None, "step() returns StepResponse")
    check(0.0 <= step_result.reward.value <= 1.0, "step() reward in [0.0, 1.0]")
    check(isinstance(step_result.done, bool), "step() returns done boolean")
    episode_id = step_result.info.get("episode_id")
    check(episode_id is not None, "step() info has episode_id")

    # All 3 tasks, each on its own environment instance
    for task_name in ["task_easy", "task_medium", "task_hard"]:
        task_obs = EmailTriageEnv().reset(task_name)
        check(task_obs.emails_remaining > 0, f"task {task_name} has emails")
except Exception as e:
    check(False, "Environment API works", str(e))
# --- 5. Graders --------------------------------------------------------------
# Grade the first three emails of every task with one fixed action and verify
# that each returned reward value lies in [0.0, 1.0].
print("\n5. Task graders (3 tasks, scores in [0,1])")
try:
    from graders import grade
    from dataset import TASK_EMAILS
    # Imported here as well so this section does not depend on the imports
    # inside section 4's try block having succeeded; otherwise a failure
    # there would show up here as an unrelated NameError.
    from models import Action, UrgencyLevel, EmailCategory, EmailAction

    # The probe action is identical for every email, so build it once.
    # NOTE(review): assumes grade() does not mutate the action it is given.
    probe_action = Action(
        urgency=UrgencyLevel.MEDIUM,
        category=EmailCategory.OTHER,
        action=EmailAction.ARCHIVE,
    )
    for tid in ["task_easy", "task_medium", "task_hard"]:
        # Spot-check only the first 3 emails of the task.
        rewards = [
            grade(tid, probe_action, email).value
            for email in TASK_EMAILS[tid][:3]
        ]
        all_valid = all(0.0 <= v <= 1.0 for v in rewards)
        check(all_valid, f"{tid} grader scores in [0.0, 1.0]", str(rewards))
except Exception as e:
    check(False, "Graders work", str(e))
# --- 6. Dockerfile -----------------------------------------------------------
# Plain substring checks on the Dockerfile text; the file is not parsed.
print("\n6. Dockerfile")
try:
    # Explicit UTF-8: the default encoding is locale-dependent and can fail
    # on non-ASCII bytes under non-UTF-8 locales.
    dockerfile = Path("Dockerfile").read_text(encoding="utf-8")
    check("FROM python" in dockerfile, "has Python base image")
    check("EXPOSE" in dockerfile, "has EXPOSE directive")
    check("HEALTHCHECK" in dockerfile, "has HEALTHCHECK directive")
    # Either a mention of uvicorn or any CMD instruction satisfies this check.
    check("uvicorn" in dockerfile or "CMD" in dockerfile, "has CMD to start server")
except Exception as e:
    check(False, "Dockerfile readable", str(e))
# --- 7. inference.py ---------------------------------------------------------
# Substring checks on the script's source text; the script is not executed.
print("\n7. inference.py")
try:
    # Explicit UTF-8: with the locale default, a source file containing
    # non-ASCII characters can raise UnicodeDecodeError on some platforms.
    src = Path("inference.py").read_text(encoding="utf-8")
    # (text that must appear in the source, label reported for the check)
    inference_requirements = [
        ("API_BASE_URL", "reads API_BASE_URL"),
        ("MODEL_NAME", "reads MODEL_NAME"),
        ("HF_TOKEN", "reads HF_TOKEN"),
        ("[START]", "emits [START] log"),
        ("[STEP]", "emits [STEP] log"),
        ("[END]", "emits [END] log"),
        ("OpenAI(", "uses OpenAI client"),
    ]
    for needle, label in inference_requirements:
        check(needle in src, label)
except Exception as e:
    check(False, "inference.py readable", str(e))
# --- 8. README ---------------------------------------------------------------
# Case-insensitive keyword checks on the README text.
print("\n8. README.md")
try:
    # Explicit UTF-8: a README containing non-ASCII characters can fail to
    # decode under a non-UTF-8 locale default.
    readme = Path("README.md").read_text(encoding="utf-8").lower()
    check("action" in readme, "documents action space")
    check("observation" in readme, "documents observation space")
    check("task" in readme, "describes tasks")
    check("docker" in readme, "includes Docker instructions")
    # Either keyword is accepted as evidence of baseline scores.
    check("baseline" in readme or "score" in readme, "includes baseline scores")
except Exception as e:
    check(False, "README.md readable", str(e))
# --- Summary -----------------------------------------------------------------
# Print the overall verdict, list failed checks and warnings, and exit with
# status 0 when no check failed, 1 otherwise.
# The em dash and bullet characters were mojibake in the previous text and
# are restored here.
print("\n" + "=" * 60)
if not errors:
    print(f" {PASS} ALL CHECKS PASSED — Ready to submit!")
else:
    print(f" {FAIL} {len(errors)} check(s) FAILED:")
    for failed_label in errors:
        print(f" • {failed_label}")
if warnings:
    print(f"\n {WARN} {len(warnings)} warning(s):")
    for warning_label in warnings:
        print(f" • {warning_label}")
print("=" * 60 + "\n")
# Warnings never affect the exit status; only failed checks do.
sys.exit(1 if errors else 0)