"""
validate.py
-----------
Pre-submission validation — 23 checks across all three tasks.

Usage:
    python validate.py

Exit 0 = all pass. Exit 1 = failures.

The checks are written with ``assert`` statements, so the script must not be
run with ``python -O`` (which strips them); ``main`` refuses to run in that
mode instead of reporting a meaningless success.
"""

import json
import sys
from typing import Callable, List, Tuple

PASS = "✅"
FAIL = "❌"

# One (check name, passed?, failure detail) tuple per executed check, in run order.
results: List[Tuple[str, bool, str]] = []


def check(name: str, fn: Callable) -> None:
    """Run a single check, record the outcome in ``results`` and print it.

    Args:
        name: Human-readable label shown in the report.
        fn: Zero-argument callable; it signals failure by raising.

    Any ``Exception`` raised by ``fn`` is caught and recorded as a failure, so
    one broken check never prevents the remaining ones from running.
    """
    try:
        fn()
    except Exception as exc:  # top-level boundary: every failure is reported
        kind = type(exc).__name__
        text = str(exc)
        # A bare ``assert`` has an empty message; always include the exception
        # type so the report never shows a failure without any explanation.
        detail = f"{kind}: {text}" if text else kind
        results.append((name, False, detail))
        print(f" {FAIL} {name}\n {detail}")
    else:
        results.append((name, True, ""))
        print(f" {PASS} {name}")


# ── Checks ────────────────────────────────────────────────────────────────────

def check_imports():
    """Import the schema, environment, grader and data-loader modules.

    The imported names are intentionally unused: the check passes as long as
    every import succeeds.
    """
    from env.schemas import Observation, Action, Reward, StepResult, ResetResult, StateResult, ActionType
    from tasks.task1.environment import Task1Environment
    from tasks.task1.grader import Task1Grader
    from tasks.task2.environment import Task2Environment
    from tasks.task2.grader import Task2Grader
    from tasks.task3.environment import Task3Environment
    from tasks.task3.grader import Task3Grader
    from data.data_loader import load_contracts


def check_openenv_yaml():
    """Check that ``openenv.yaml`` has the required keys and enough tasks."""
    import yaml

    with open("openenv.yaml", encoding="utf-8") as f:
        spec = yaml.safe_load(f)
    assert "name" in spec and len(spec.get("tasks", [])) >= 3
    assert "observation_space" in spec and "action_space" in spec and "reward" in spec
    tasks = spec["tasks"]
    active = [t for t in tasks if t.get("status") == "active"]
    assert len(active) >= 2, f"Expected >=2 active tasks, got {len(active)}"


def check_pydantic_models():
    """Instantiate the schema models with representative arguments."""
    from env.schemas import Observation, Action, ActionType, Reward, StepResult, ResetResult

    obs = Observation(task_id="t", contract_name="C", contract_description="D",
                      available_actions=[])
    for at in [ActionType.LIST_FUNCTIONS, ActionType.SUBMIT_PROPERTY,
               ActionType.GET_PROPERTY_SPECIFICATION, ActionType.SUBMIT_FUNCTION]:
        Action(action_type=at)
    Reward(value=-1.5, reason="test")
    StepResult(observation=obs, reward=Reward(value=0, reason=""), done=False)


def check_data_loading():
    """Check the dataset exposes at least three entries of each kind.

    Every task-3 entry must additionally carry non-empty ``property_english``
    and ``property_formal`` fields.
    """
    from data.data_loader import (load_contracts, get_all_vulnerable_entries,
                                  get_all_property_entries, get_all_task3_entries)

    c = load_contracts()
    assert len(get_all_vulnerable_entries(c)) >= 3
    assert len(get_all_property_entries(c)) >= 3
    entries = get_all_task3_entries(c)
    assert len(entries) >= 3, f"Need >=3 task3 entries, got {len(entries)}"
    for _, fn in entries:
        t3 = fn.get("task3", {})
        assert t3.get("property_english"), f"{fn['name']} missing property_english"
        assert t3.get("property_formal"), f"{fn['name']} missing property_formal"


def check_t1_env():
    """Exercise reset, one step and state on the task-1 environment."""
    from tasks.task1.environment import Task1Environment
    from env.schemas import Action, ActionType

    env = Task1Environment()
    r = env.reset(seed=42)
    assert r.observation.task_id == "task1_vuln_detection"
    s = env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    assert s.reward.value == -0.05 and s.observation.step_count == 1
    assert env.state().target_function is not None


def check_t2_env():
    """Reset the task-2 environment and run each of its six browse actions."""
    from tasks.task2.environment import Task2Environment
    from env.schemas import Action, ActionType

    env = Task2Environment()
    r = env.reset(seed=42)
    assert r.observation.task_id == "task2_property_discovery"
    assert "target_function" in r.observation.extra
    for at in [ActionType.GET_FUNCTION_CODE, ActionType.GET_FUNCTION_NATSPEC,
               ActionType.GET_FILE_NATSPEC, ActionType.GET_SIGNATURE,
               ActionType.GET_RELATED_FUNCTIONS, ActionType.GET_SIMILAR_RULE]:
        env.step(Action(action_type=at))


def check_t3_env():
    """Reset the task-3 environment and check browse actions cost reward."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    r = env.reset(seed=42)
    assert r.observation.task_id == "task3_rule_checker"
    assert "property_english" in r.observation.extra
    prop = r.observation.extra["property_english"]
    assert len(prop) > 10, "property_english too short"
    for at in [ActionType.LIST_FUNCTIONS, ActionType.GET_PROPERTY_SPECIFICATION,
               ActionType.GET_CALL_GRAPH, ActionType.GET_STATE_VARIABLE]:
        s = env.step(Action(action_type=at))
        assert s.reward.value < 0, f"{at.value} should have negative shaping reward"


def check_t3_action_costs():
    """Check the first-use cost of three task-3 browse actions."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    # This environment is only reset, never stepped; each cost below is
    # measured on its own freshly reset environment.
    env = Task3Environment()
    env.reset(seed=42)
    costs = {
        ActionType.GET_PROPERTY_SPECIFICATION: -0.03,
        ActionType.LIST_FUNCTIONS: -0.05,
        ActionType.GET_CALL_GRAPH: -0.08,
    }
    for at, expected in costs.items():
        e2 = Task3Environment()
        e2.reset(seed=42)
        s = e2.step(Action(action_type=at))
        assert abs(s.reward.value - expected) < 0.001, \
            f"{at.value}: expected {expected}, got {s.reward.value}"


def check_t3_function_metadata():
    """Check ``GET_FUNCTION_METADATA`` for ``withdraw`` reports visibility."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=43)
    s = env.step(Action(action_type=ActionType.GET_FUNCTION_METADATA,
                        params={"function_name": "withdraw"}))
    assert "Visibility" in s.observation.last_action_result
    assert s.reward.value == -0.05


def check_t3_submit_correct():
    """Submitting the target function ends the episode with reward 5.0."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=42)
    target = env.state().target_function
    s = env.step(Action(action_type=ActionType.SUBMIT_FUNCTION,
                        params={"function_name": target}))
    assert s.done and s.reward.value == 5.0, \
        f"Expected reward=5.0, got {s.reward.value}"


def check_t3_submit_subfunction():
    """Submitting a subfunction of the target earns the partial reward 1.5."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    # seed 45 → bid with subfunction getPrice
    env = Task3Environment()
    env.reset(seed=45)
    assert env.state().target_function == "bid"
    s = env.step(Action(action_type=ActionType.SUBMIT_FUNCTION,
                        params={"function_name": "getPrice"}))
    assert s.done and s.reward.value == 1.5, \
        f"Expected partial reward=1.5, got {s.reward.value}"


def check_t3_submit_wrong():
    """Submitting ``constructor`` ends the episode with reward -1.5."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=42)
    s = env.step(Action(action_type=ActionType.SUBMIT_FUNCTION,
                        params={"function_name": "constructor"}))
    assert s.done and s.reward.value == -1.5


def check_t3_one_submit_only():
    """Stepping after a submission must raise ``RuntimeError``."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=42)
    env.step(Action(action_type=ActionType.SUBMIT_FUNCTION,
                    params={"function_name": "deposit"}))
    try:
        env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    except RuntimeError:
        return
    raise AssertionError("Should raise RuntimeError after done")


def check_t3_repeated_penalty():
    """Repeating ``LIST_FUNCTIONS`` yields reward -0.40 on the second call."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=42)
    env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    s = env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    assert s.reward.value == -0.40


def check_t1_grader():
    """Check the task-1 grader's 1.0 / 0.5 / 0.0 rubric."""
    from tasks.task1.grader import Task1Grader

    g = Task1Grader("withdraw", "Reentrancy vulnerability")
    assert g.grade_submission("withdraw", "reentrancy") == 1.0
    assert g.grade_submission("withdraw", "vague") == 0.5
    assert g.grade_submission("deposit", "reentrancy") == 0.0


def check_t2_grader():
    """Check the task-2 grader on every property entry in the dataset.

    For each entry the reference property must score at least 0.65, an empty
    answer must score 0.0, and grading must be deterministic.
    """
    from tasks.task2.grader import Task2Grader
    from data.data_loader import load_contracts, get_all_property_entries

    def score_of(result):
        # The original assertions indexed the result with [0] in one place and
        # compared it to a float in another, which cannot both hold. Accept
        # either a bare score or a sequence whose first item is the score.
        # NOTE(review): confirm the return shape of Task2Grader.grade.
        if isinstance(result, (tuple, list)):
            return result[0]
        return result

    entries = list(get_all_property_entries(load_contracts()))
    assert entries, "No property entries to grade"
    # The collapsed source does not show whether the empty-answer and
    # determinism assertions ran per entry or once after the loop; running
    # them per entry covers both readings.
    for _, fn in entries:
        g = Task2Grader(fn["name"], fn["property"])
        assert score_of(g.grade(fn["property"])) >= 0.65
        assert score_of(g.grade("")) == 0.0
        s = g.grade("test")
        assert s == g.grade("test")  # deterministic


def check_t3_grader():
    """Check the task-3 grader's scores, rewards and case-insensitivity."""
    from tasks.task3.grader import Task3Grader

    g = Task3Grader("withdraw", ["deposit"], "some rule")
    assert g.grade("withdraw") == 1.0
    assert g.grade("WITHDRAW") == 1.0  # case-insensitive
    assert g.grade("deposit") == 0.3
    assert g.grade("constructor") == 0.0
    s, r = g.grade_and_reward("withdraw")
    assert s == 1.0 and r == 5.0
    s, r = g.grade_and_reward("deposit")
    assert s == 0.3 and r == 1.5
    s, r = g.grade_and_reward("other")
    assert s == 0.0 and r == -1.5


def check_reward_shaping():
    """Check that different task-3 browse actions yield different rewards."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=1)
    rewards = {env.step(Action(action_type=at)).reward.value
               for at in [ActionType.LIST_FUNCTIONS,
                          ActionType.GET_PROPERTY_SPECIFICATION,
                          ActionType.GET_CALL_GRAPH]}
    assert len(rewards) >= 2


def check_app_imports():
    """Check ``/health`` responds 200 and ``/tasks`` lists three active tasks."""
    from app import app
    from fastapi.testclient import TestClient

    client = TestClient(app)
    r = client.get("/health")
    assert r.status_code == 200
    tasks = client.get("/tasks").json()["tasks"]
    active = [t for t in tasks if t["status"] == "active"]
    assert len(active) == 3, f"Expected 3 active tasks, got {len(active)}: {active}"


def check_t3_http_reset():
    """Check ``POST /reset`` for task 3 returns the expected observation."""
    from app import app
    from fastapi.testclient import TestClient

    client = TestClient(app)
    r = client.post("/reset", json={"task_id": "task3_rule_checker", "seed": 42})
    assert r.status_code == 200
    obs = r.json()["observation"]
    assert obs["task_id"] == "task3_rule_checker"
    assert "property_english" in obs["extra"]


def check_dockerfile():
    """Check a ``Dockerfile`` exists and mentions port 7860 and a start command."""
    import os

    assert os.path.exists("Dockerfile"), "Dockerfile not found"
    # Context manager closes the handle; explicit encoding avoids depending
    # on the platform's locale default.
    with open("Dockerfile", encoding="utf-8") as f:
        c = f.read()
    assert "7860" in c and ("uvicorn" in c or "CMD" in c)


def check_inference_script():
    """Check ``inference.py`` exists and contains the expected identifiers."""
    import os

    assert os.path.exists("inference.py"), "inference.py not found"
    with open("inference.py", encoding="utf-8") as f:
        c = f.read()
    assert "HF_TOKEN" in c and "API_BASE_URL" in c and "MODEL_NAME" in c
    assert "Task3Environment" in c or "run_task3" in c
    assert "submit_function" in c


def check_baseline_json():
    """Check scores in ``baseline_scores.json`` lie in [0, 1], if the file exists.

    The file is optional: when it is absent the check passes.
    """
    import os

    if not os.path.exists("baseline_scores.json"):
        return
    with open("baseline_scores.json", encoding="utf-8") as f:
        data = json.load(f)
    for t in data.get("tasks", []):
        assert 0.0 <= t["avg_grader_score"] <= 1.0


# ── Runner ────────────────────────────────────────────────────────────────────

# (label, check function) pairs, executed in this order by main().
ALL_CHECKS = [
    ("Python imports (T1+T2+T3)", check_imports),
    ("openenv.yaml: 3 tasks, ≥2 active", check_openenv_yaml),
    ("Pydantic models (all ActionTypes)", check_pydantic_models),
    ("Dataset: vuln+property+task3 entries", check_data_loading),
    ("T1 env: reset/step/state", check_t1_env),
    ("T2 env: reset + 6 browse actions", check_t2_env),
    ("T3 env: reset + browse actions", check_t3_env),
    ("T3 action costs (formalized -0.03)", check_t3_action_costs),
    ("T3 get_function_metadata", check_t3_function_metadata),
    ("T3 submit correct → +5.0", check_t3_submit_correct),
    ("T3 submit subfunction → +1.5", check_t3_submit_subfunction),
    ("T3 submit wrong → -1.5", check_t3_submit_wrong),
    ("T3 one submit per episode", check_t3_one_submit_only),
    ("T3 repeated query → -0.40", check_t3_repeated_penalty),
    ("T1 grader: 0/0.5/1.0 rubric", check_t1_grader),
    ("T2 grader: all 11 properties", check_t2_grader),
    ("T3 grader: 1.0/0.3/0.0 + case-ins.", check_t3_grader),
    ("Reward shaping non-binary (T3)", check_reward_shaping),
    ("FastAPI: 3 active tasks", check_app_imports),
    ("FastAPI: T3 reset endpoint", check_t3_http_reset),
    ("Dockerfile + port 7860", check_dockerfile),
    ("inference.py: T3 code present", check_inference_script),
    ("baseline_scores.json schema", check_baseline_json),
]


def main():
    """Run every entry of ``ALL_CHECKS``, print a summary and exit.

    Exits with status 0 when all checks pass and 1 otherwise. Also exits with
    status 1, before running anything, when assertions are disabled.
    """
    if not __debug__:
        # Under ``python -O`` every ``assert`` is stripped, so all checks would
        # trivially "pass"; refuse rather than report a false success.
        print(f"{FAIL} Assertions are disabled (python -O); run without -O.")
        sys.exit(1)

    print("=" * 64)
    print("OpenEnv Pre-Submission Validation (Task 1 + 2 + 3)")
    print("=" * 64)
    print()
    for name, fn in ALL_CHECKS:
        check(name, fn)

    passed = sum(1 for _, ok, _ in results if ok)
    total = len(results)
    failed = [(n, m) for n, ok, m in results if not ok]
    print()
    print("=" * 64)
    print(f"Results: {passed}/{total} checks passed")
    if failed:
        print("\nFailed checks:")
        for n, m in failed:
            print(f" {FAIL} {n}: {m}")
        print("\n❌ VALIDATION FAILED")
        sys.exit(1)
    else:
        print("\n✅ ALL CHECKS PASSED — ready to submit!")
        sys.exit(0)


if __name__ == "__main__":
    main()