code-debug-env / validator /pre_submit_check.py
Souravdanyal's picture
working
66d8c67
#!/usr/bin/env python3
# validator/pre_submit_check.py
# Run this BEFORE submitting to catch any disqualifying issues.
#
# Usage:
# python validator/pre_submit_check.py
# python validator/pre_submit_check.py --url https://your-space.hf.space
import os
import sys
import json
import argparse
import requests
# Status markers printed in front of each check line.
# PASS was a mis-decoded byte sequence; restored to the intended check-mark emoji
# so it matches the FAIL / WARN markers.
PASS = "✅"
FAIL = "❌"
WARN = "⚠️"

# One dict per recorded check: {"check": name, "passed": bool, "detail": str}.
results = []
def check(name: str, passed: bool, detail: str = ""):
    """Record the outcome of one check, print a status line, and return ``passed``.

    The outcome is appended to the module-level ``results`` list as a dict with
    the keys ``check``, ``passed`` and ``detail``. The printed line carries the
    PASS or FAIL marker, the check name and, when ``detail`` is non-empty, the
    detail after a colon.
    """
    results.append({"check": name, "passed": passed, "detail": detail})
    marker = FAIL if not passed else PASS
    line = f" {marker} {name}"
    if detail:
        line += f": {detail}"
    print(line)
    return passed
# Difficulty levels exercised against the /reset endpoint.
_DIFFICULTIES = ("easy", "medium", "hard")


def _reward_in_unit_interval(value) -> bool:
    """Return True when ``value`` is a real number (not a bool) within [0.0, 1.0]."""
    # bool is a subclass of int, so a JSON ``true`` would otherwise pass as 1.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return False
    return 0.0 <= value <= 1.0


def _check_health(base_url: str) -> bool:
    """CHECK 1: GET /health must return HTTP 200 and a JSON body with status == "ok"."""
    print("[ CHECK 1 ] Health endpoint")
    try:
        r = requests.get(f"{base_url}/health", timeout=10)
        passed = r.status_code == 200 and r.json().get("status") == "ok"
        return check("GET /health returns 200 with status=ok", passed,
                     f"HTTP {r.status_code}")
    except Exception as e:
        # Any failure (connection error, non-JSON body, unexpected shape) is
        # reported as a failed check instead of aborting the whole run.
        return check("GET /health", False, str(e))


def _check_reset(base_url: str) -> bool:
    """CHECK 2: POST /reset must return an observation with the expected keys."""
    print("\n[ CHECK 2 ] POST /reset")
    required = ("task_id", "difficulty", "buggy_code", "instructions")
    ok = True
    for difficulty in _DIFFICULTIES:
        try:
            r = requests.post(f"{base_url}/reset",
                              json={"difficulty": difficulty}, timeout=15)
            obs = r.json().get("observation", {})
            passed = r.status_code == 200 and all(k in obs for k in required)
            check(f"reset(difficulty='{difficulty}') returns valid observation", passed,
                  f"task_id={obs.get('task_id', 'MISSING')}")
        except Exception as e:
            passed = check(f"reset(difficulty='{difficulty}')", False, str(e))
        ok = ok and passed
    return ok


def _check_step(base_url: str) -> bool:
    """CHECK 3: POST /step must return a numeric reward in [0.0, 1.0] and a bool done."""
    print("\n[ CHECK 3 ] POST /step")
    try:
        # Reset first to get a fresh task.
        r = requests.post(f"{base_url}/reset", json={"difficulty": "easy"}, timeout=15)
        buggy_code = r.json()["observation"]["buggy_code"]
        # Submit the buggy code as-is (reward may be 0, that's fine).
        r = requests.post(f"{base_url}/step", json={"fixed_code": buggy_code}, timeout=15)
        data = r.json()
        has_done = isinstance(data.get("done"), bool)
        # Type is validated before comparing so a non-numeric reward is reported
        # as a failed range check rather than as a TypeError message.
        reward_ok = _reward_in_unit_interval(data.get("reward"))
        passed = r.status_code == 200 and reward_ok and has_done
        return check("step() returns reward in [0.0, 1.0] and done flag", passed,
                     f"reward={data.get('reward')}, done={data.get('done')}")
    except Exception as e:
        return check("POST /step", False, str(e))


def _check_state(base_url: str) -> bool:
    """CHECK 4: GET /state must return episode_id, step_count and difficulty."""
    print("\n[ CHECK 4 ] GET /state")
    try:
        r = requests.get(f"{base_url}/state", timeout=10)
        data = r.json()
        has_fields = all(k in data for k in ("episode_id", "step_count", "difficulty"))
        passed = r.status_code == 200 and has_fields
        return check("GET /state returns episode_id, step_count, difficulty", passed)
    except Exception as e:
        return check("GET /state", False, str(e))


def _check_difficulties(base_url: str) -> bool:
    """CHECK 5: each requested difficulty must be echoed back in the observation."""
    print("\n[ CHECK 5 ] All 3 task difficulties functional")
    ok = True
    for difficulty in _DIFFICULTIES:
        try:
            r = requests.post(f"{base_url}/reset",
                              json={"difficulty": difficulty}, timeout=15)
            obs = r.json()["observation"]
            passed = obs.get("difficulty") == difficulty
            check(f"difficulty='{difficulty}' task loads correctly",
                  passed, f"got difficulty={obs.get('difficulty')}")
        except Exception as e:
            passed = check(f"difficulty='{difficulty}'", False, str(e))
        ok = ok and passed
    return ok


def _check_reward_range(base_url: str) -> bool:
    """CHECK 6: submitting a known fix must yield a reward within [0.0, 1.0]."""
    print("\n[ CHECK 6 ] Reward range validation (correct fix)")
    try:
        # NOTE(review): this import only resolves when the project root is
        # importable (sys.path[0] is the script's own directory when run as a
        # file) - confirm how the validator is expected to be launched.
        from server.tasks.task_easy import EASY_TASKS
        task = EASY_TASKS[0]
        # Start an easy episode. NOTE(review): the request only names the
        # difficulty, so the served task is presumably not guaranteed to be
        # EASY_TASKS[0]; only the reward *range* is asserted below.
        requests.post(f"{base_url}/reset", json={"difficulty": "easy"}, timeout=15)
        # Submit the known correct fix.
        r = requests.post(f"{base_url}/step",
                          json={"fixed_code": task["fixed_code"]}, timeout=15)
        reward = r.json().get("reward", -1)
        return check("Submitting correct fix yields reward in [0.0, 1.0]",
                     _reward_in_unit_interval(reward), f"reward={reward}")
    except Exception as e:
        return check("Reward range check", False, str(e))


def _check_project_structure() -> bool:
    """CHECK 7: required project files must exist relative to the working directory."""
    print("\n[ CHECK 7 ] Project structure")
    required_files = [
        "openenv.yaml",
        "inference.py",
        "models.py",
        "server/app.py",
        "server/environment.py",
        "server/Dockerfile",
        "server/requirements.txt",
        "pyproject.toml",
        "README.md",
    ]
    # Build the full list first so every file is reported, not just the first miss.
    outcomes = [check(f"File exists: {fname}", os.path.exists(fname))
                for fname in required_files]
    return all(outcomes)


def _check_inference_log_format() -> bool:
    """CHECK 8: inference.py must contain the expected log markers."""
    print("\n[ CHECK 8 ] inference.py log format")
    try:
        # Explicit UTF-8 so the result does not depend on the platform's
        # default text encoding.
        with open("inference.py", encoding="utf-8") as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        return check("inference.py log format", False, str(e))
    outcomes = [
        check("inference.py emits [START] logs", "[START] task=" in content),
        check("inference.py emits [STEP] logs", "[STEP] step=" in content),
        check("inference.py emits [END] logs", "[END] success=" in content),
        check("inference.py avoids JSON log dict dumps",
              "print(json.dumps(log_entry)" not in content),
        # Passes when no bracketed "rewards=[" list literal appears in the file.
        check("inference.py emits CSV rewards in [END]", "rewards=[" not in content),
    ]
    return all(outcomes)


def run_checks(base_url: str) -> bool:
    """Run every pre-submission check against ``base_url`` and print a summary.

    Checks 1-6 issue HTTP requests to the environment at ``base_url``; checks
    7-8 inspect files relative to the current working directory. Every check is
    always executed, and each outcome is recorded through ``check``.

    Args:
        base_url: Base URL of the running environment, without a trailing slash.

    Returns:
        True when every check of this run passed, otherwise False.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(" Code Debug Environment — Pre-Submission Validator")
    print(f" Target: {base_url}")
    print(f"{banner}\n")

    # Remember where this run starts in the shared ``results`` list so the
    # summary is not skewed by entries recorded by an earlier invocation.
    first_result = len(results)

    # A list literal (not a chained ``and``) so that no check is short-circuited.
    outcomes = [
        _check_health(base_url),
        _check_reset(base_url),
        _check_step(base_url),
        _check_state(base_url),
        _check_difficulties(base_url),
        _check_reward_range(base_url),
        _check_project_structure(),
        _check_inference_log_format(),
    ]
    all_passed = all(outcomes)

    run_results = results[first_result:]
    passed_count = sum(1 for entry in run_results if entry["passed"])
    print(f"\n{banner}")
    print(f" Results: {passed_count}/{len(run_results)} checks passed")
    if all_passed:
        print(f" {PASS} ALL CHECKS PASSED — you are safe to submit!")
    else:
        print(f" {FAIL} FAILED CHECKS — fix these before submitting:")
        for entry in run_results:
            if not entry["passed"]:
                print(f" • {entry['check']}")
    print(f"{banner}\n")
    return all_passed
if __name__ == "__main__":
    # Script entry point: read the target URL from the command line, run the
    # checks, and exit with status 0 on success or 1 on any failure.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--url",
        default="http://localhost:7860",
        help="Base URL of the running environment",
    )
    # Drop trailing slashes so endpoint paths can be appended directly.
    target = cli.parse_args().url.rstrip("/")
    sys.exit(1 if not run_checks(target) else 0)