"""
validate.py
-----------
Pre-submission validation script.
Checks all OpenEnv spec requirements locally before submitting.

Usage:
    python validate.py

Exit code 0 = all checks pass.
Exit code 1 = one or more checks failed.
"""

import json
import sys
import traceback
from typing import Callable, List, Tuple

# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────

PASS = "✅"
FAIL = "❌"
SKIP = "⏭ "

# One (check name, passed?, failure message) tuple per executed check.
results: List[Tuple[str, bool, str]] = []


def _describe_failure(exc: BaseException) -> str:
    """Return a non-empty, human-readable description of *exc*.

    Most checks in this file use message-less ``assert`` statements, for
    which ``str(exc)`` is the empty string.  In that case fall back to the
    exception type plus the innermost traceback frame (function, line number
    and, when available, the source text of the failing line).
    """
    text = str(exc)
    if text:
        return text

    label = type(exc).__name__
    frames = traceback.extract_tb(exc.__traceback__)
    if not frames:
        return label

    origin = frames[-1]  # innermost frame: where the exception was raised
    location = f"{origin.name}:{origin.lineno}"
    if origin.line:
        return f"{label} at {location}: {origin.line}"
    return f"{label} at {location}"


def check(name: str, fn: Callable[[], None]) -> None:
    """Run one validation callable and record/print its outcome.

    Args:
        name: Label shown in the console output and in the final summary.
        fn: Zero-argument callable; it signals failure by raising any
            ``Exception`` (typically ``AssertionError``).

    The outcome is appended to the module-level ``results`` list as
    ``(name, passed, message)``.  Exceptions are never propagated so that a
    failing check does not prevent later checks from running.
    """
    try:
        fn()
    except Exception as e:
        message = _describe_failure(e)
        results.append((name, False, message))
        print(f" {FAIL} {name}")
        print(f" {message}")
    else:
        results.append((name, True, ""))
        print(f" {PASS} {name}")


# ─────────────────────────────────────────────────────────────────────────────
# Checks
# ─────────────────────────────────────────────────────────────────────────────

def check_imports():
    """Verify that the project modules used by the other checks import."""
    # The names are imported only to prove they resolve; none is used here.
    from env.schemas import Observation, Action, Reward, StepResult, ResetResult, StateResult  # noqa: F401
    from tasks.task1.environment import Task1Environment  # noqa: F401
    from tasks.task1.grader import Task1Grader  # noqa: F401
    from data.data_loader import load_contracts  # noqa: F401


def check_openenv_yaml():
    """Verify openenv.yaml parses and declares the required top-level keys."""
    import yaml

    with open("openenv.yaml", encoding="utf-8") as f:
        spec = yaml.safe_load(f)

    # safe_load returns None for an empty document; fail with a clear message
    # instead of a TypeError from the membership tests below.
    assert isinstance(spec, dict), "openenv.yaml must contain a top-level mapping"
    assert "name" in spec
    assert "tasks" in spec
    assert len(spec["tasks"]) >= 3, "Need at least 3 tasks defined"
    assert "observation_space" in spec
    assert "action_space" in spec
    assert "reward" in spec


def check_pydantic_models():
    """Instantiate each schema model and read back one field from it."""
    from env.schemas import Observation, Action, ActionType, Reward, StepResult, ResetResult, StateResult

    # Instantiate each model
    obs = Observation(
        task_id="t1",
        contract_name="C",
        contract_description="D",
        available_actions=["submit"],
    )
    assert obs.task_id == "t1"

    action = Action(action_type=ActionType.LIST_FUNCTIONS)
    assert action.action_type == ActionType.LIST_FUNCTIONS

    reward = Reward(value=1.0, reason="test")
    assert reward.value == 1.0

    step = StepResult(observation=obs, reward=reward, done=False)
    assert not step.done

    reset = ResetResult(observation=obs)
    assert reset.observation.task_id == "t1"

    state = StateResult(
        task_id="t1",
        contract_name="C",
        step_count=0,
        cumulative_reward=0.0,
        done=False,
    )
    assert state.step_count == 0


def check_data_loading():
    """Verify the dataset loads and has at least 3 well-formed vulnerable entries."""
    from data.data_loader import load_contracts, get_all_vulnerable_entries

    contracts = load_contracts()
    assert len(contracts) >= 1, "No contracts loaded"

    entries = get_all_vulnerable_entries(contracts)
    assert len(entries) >= 3, f"Need >= 3 vulnerable functions, got {len(entries)}"

    for _contract, fn in entries:
        assert fn.get("vulnerable") is True
        assert fn.get("vulnerability_details") is not None
        assert "issue" in fn["vulnerability_details"]


def check_env_reset():
    """Verify reset() returns a fresh, non-terminal observation for task 1."""
    from tasks.task1.environment import Task1Environment

    env = Task1Environment()
    result = env.reset(seed=42)
    assert result.observation is not None
    assert result.observation.task_id == "task1_vuln_detection"
    assert result.observation.contract_name != ""
    assert not result.observation.done
    assert result.observation.step_count == 0


def check_env_step():
    """Verify step() returns an observation, float reward, bool done and info."""
    from tasks.task1.environment import Task1Environment
    from env.schemas import Action, ActionType

    env = Task1Environment()
    env.reset(seed=42)
    result = env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    assert result.observation is not None
    assert isinstance(result.reward.value, float)
    assert isinstance(result.done, bool)
    assert "info" in result.model_dump()


def check_env_state():
    """Verify state() reports the task id, contract name and target function."""
    from tasks.task1.environment import Task1Environment

    env = Task1Environment()
    env.reset(seed=42)
    state = env.state()
    assert state.task_id == "task1_vuln_detection"
    assert state.contract_name != ""
    assert state.target_function is not None  # exposed for debugging


def check_grader_scores_in_range():
    """Verify grader scores lie in [0.0, 1.0] and match the expected values."""
    from tasks.task1.grader import Task1Grader

    # (target function, issue, submitted function, submitted vuln, expected)
    cases = [
        ("withdraw", "Reentrancy vulnerability", "withdraw", "reentrancy", 1.0),
        ("withdraw", "Reentrancy vulnerability", "withdraw", "something else", 0.5),
        ("withdraw", "Reentrancy vulnerability", "deposit", "reentrancy", 0.0),
    ]
    for tf, issue, sf, sv, expected in cases:
        g = Task1Grader(tf, issue)
        score = g.grade_submission(sf, sv)
        assert 0.0 <= score <= 1.0, f"Score {score} out of range"
        assert abs(score - expected) < 0.01, f"Expected {expected}, got {score}"


def check_grader_deterministic():
    """Verify grading the same submission twice yields the same score (1.0)."""
    from tasks.task1.grader import Task1Grader

    g = Task1Grader("withdraw", "Reentrancy vulnerability")
    s1 = g.grade_submission("withdraw", "reentrancy")
    s2 = g.grade_submission("withdraw", "reentrancy")
    assert s1 == s2 == 1.0, "Grader must be deterministic"


def check_reward_shaping():
    """Verify reward is non-binary (multiple distinct values across steps)."""
    from tasks.task1.environment import Task1Environment
    from env.schemas import Action, ActionType

    env = Task1Environment()
    env.reset(seed=1)
    rewards = set()
    for at in [ActionType.LIST_FUNCTIONS, ActionType.GET_FILE_METADATA, ActionType.GET_CALL_GRAPH]:
        r = env.step(Action(action_type=at))
        rewards.add(round(r.reward.value, 4))

    # Should have at least 2 distinct shaping reward values
    assert len(rewards) >= 2, f"Expected multiple reward values, got {rewards}"


def check_episode_boundary():
    """Episode must end after submit and raise on subsequent step."""
    from tasks.task1.environment import Task1Environment
    from env.schemas import Action, ActionType

    env = Task1Environment()
    env.reset(seed=2)
    env.step(Action(action_type=ActionType.SUBMIT, params={
        "function_name": "withdraw",
        "vulnerability_type": "test",
    }))

    try:
        env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    except RuntimeError:
        pass  # Expected
    else:
        raise AssertionError("Should have raised RuntimeError after episode end")


def check_repeated_query_penalty():
    """Verify that repeating the same query yields a reward of -0.40."""
    from tasks.task1.environment import Task1Environment
    from env.schemas import Action, ActionType

    env = Task1Environment()
    env.reset(seed=3)
    env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    r = env.step(Action(action_type=ActionType.LIST_FUNCTIONS))

    # Compare with a tolerance: the reward is a float and may not be
    # bit-identical to the literal -0.40.
    expected = -0.40
    assert abs(r.reward.value - expected) < 1e-6, \
        f"Expected -0.40 for repeated query, got {r.reward.value}"


def check_tasks_list():
    """All three tasks must be listed (even if placeholders)."""
    from tasks.task2 import __all__ as t2  # noqa
    from tasks.task3 import __all__ as t3  # noqa


def check_dockerfile_exists():
    """Verify a Dockerfile exists, mentions port 7860 and has a start command."""
    import os

    assert os.path.exists("Dockerfile"), "Dockerfile is missing"
    with open("Dockerfile", encoding="utf-8") as f:
        content = f.read()
    assert "7860" in content, "Dockerfile must EXPOSE 7860 (HF Spaces)"
    assert "uvicorn" in content or "CMD" in content


def check_inference_script():
    """Verify inference.py exists and references the required env variables."""
    import os

    assert os.path.exists("inference.py"), "inference.py is missing"
    with open("inference.py", encoding="utf-8") as f:
        content = f.read()
    assert "OPENAI_API_KEY" in content or "HF_TOKEN" in content, \
        "inference.py must read API credentials from env vars"
    assert "API_BASE_URL" in content
    assert "MODEL_NAME" in content


def check_baseline_json_schema():
    """baseline_scores.json must have valid schema if it exists."""
    import os

    if not os.path.exists("baseline_scores.json"):
        return  # OK — file is generated at runtime

    with open("baseline_scores.json", encoding="utf-8") as f:
        data = json.load(f)
    assert "tasks" in data
    for task in data["tasks"]:
        score = task["avg_grader_score"]
        assert 0.0 <= score <= 1.0, f"Score {score} out of range"


# ─────────────────────────────────────────────────────────────────────────────
# Runner
# ─────────────────────────────────────────────────────────────────────────────

def main():
    """Run every check in order, print a summary and exit with 0 or 1."""
    # Every check relies on `assert`; with assertions compiled out the whole
    # suite would pass vacuously, so refuse to run in that mode.
    if not __debug__:
        print(
            f"{FAIL} Assertions are disabled (python -O / PYTHONOPTIMIZE); "
            "re-run without optimization."
        )
        sys.exit(1)

    print("=" * 60)
    print("OpenEnv Pre-Submission Validation")
    print("=" * 60)

    all_checks = [
        ("Python imports", check_imports),
        ("openenv.yaml format", check_openenv_yaml),
        ("Pydantic model types", check_pydantic_models),
        ("Dataset loading (3+ vulns)", check_data_loading),
        ("env.reset() → ResetResult", check_env_reset),
        ("env.step() → StepResult", check_env_step),
        ("env.state() → StateResult", check_env_state),
        ("Grader scores in [0.0, 1.0]", check_grader_scores_in_range),
        ("Grader is deterministic", check_grader_deterministic),
        ("Reward shaping (non-binary)", check_reward_shaping),
        ("Episode boundary (done=True)", check_episode_boundary),
        ("Repeated query penalty", check_repeated_query_penalty),
        ("Task 2 & 3 placeholders", check_tasks_list),
        ("Dockerfile exists + port", check_dockerfile_exists),
        ("inference.py exists + vars", check_inference_script),
        ("baseline_scores.json schema", check_baseline_json_schema),
    ]

    print()
    for name, fn in all_checks:
        check(name, fn)

    print()
    passed = sum(1 for _, ok, _ in results if ok)
    total = len(results)
    failed = [(n, msg) for n, ok, msg in results if not ok]

    print("=" * 60)
    print(f"Results: {passed}/{total} checks passed")

    if failed:
        print("\nFailed checks:")
        for name, msg in failed:
            print(f" {FAIL} {name}: {msg}")
        print()
        print("❌ VALIDATION FAILED — fix the issues above before submitting.")
        sys.exit(1)
    else:
        print()
        print("✅ ALL CHECKS PASSED — ready to submit!")
        sys.exit(0)


if __name__ == "__main__":
    main()