#!/usr/bin/env bash
# verify.sh — smoke-test the PR Review OpenEnv project without Docker or live LLM.
#
# Usage: ./verify.sh
# Environment:
#   VERIFY_VERBOSE=1  also show the stderr of every checked command
#                     (hidden by default to keep the report compact)
set -euo pipefail
# Run from the script's own directory so every relative path below resolves.
cd -- "$(dirname -- "$0")"

PASS=0
FAIL=0

#######################################
# Run a command and record whether it succeeded.
# Globals:   PASS, FAIL (incremented); VERIFY_VERBOSE (read)
# Arguments: $1   - label printed next to the verdict
#            $2.. - command and arguments to execute
# Outputs:   " PASS <label>" or " FAIL <label>" on stdout. The command's own
#            stdout passes through; its stderr is shown only when
#            VERIFY_VERBOSE=1.
# Returns:   0 always, so a failing check never aborts the script under set -e.
#######################################
_check() {
  local label=$1
  shift
  local status=0
  if [[ "${VERIFY_VERBOSE:-0}" == "1" ]]; then
    "$@" || status=$?
  else
    # Intentional: keep tracebacks and "not found" noise out of the report.
    "$@" 2>/dev/null || status=$?
  fi
  if (( status == 0 )); then
    printf " PASS %s\n" "$label"
    PASS=$((PASS + 1))
  else
    printf " FAIL %s\n" "$label"
    FAIL=$((FAIL + 1))
  fi
}

echo "=== PR Review OpenEnv — verify.sh ==="
echo ""

# ---------------------------------------------------------------------------
# 1. Required files
# ---------------------------------------------------------------------------
echo "[1] Required files"
for f in \
  openenv.yaml \
  Dockerfile \
  requirements.txt \
  README.md \
  src/__init__.py \
  src/api.py \
  src/env.py \
  src/models.py \
  src/grader.py \
  inference.py; do
  _check "$f" test -f "$f"
done

# ---------------------------------------------------------------------------
# 2. Scenario JSON files
# ---------------------------------------------------------------------------
echo ""
echo "[2] Scenario JSON files"

# Count via a glob array instead of parsing `ls`; nullglob makes an empty
# directory count as zero rather than as one literal pattern.
_check "at least 15 scenario files" bash -c '
  shopt -s nullglob
  scenario_files=(data/scenarios/*.json)
  [ "${#scenario_files[@]}" -ge 15 ]'

# Every scenario must be valid JSON and carry the keys asserted below.
_check "all JSON files parse cleanly" venv/bin/python - <<'EOF'
import json, glob, sys
files = glob.glob("data/scenarios/*.json")
for p in files:
    with open(p) as f:
        d = json.load(f)
    for key in ("pr_title", "pr_description", "diff", "ground_truth"):
        assert key in d, f"Missing '{key}' in {p}"
    gt = d["ground_truth"]
    assert "bugs" in gt and "should_approve" in gt, f"Bad ground_truth in {p}"
print(f" {len(files)} files OK")
EOF

_check "at least one should_approve=true scenario" venv/bin/python - <<'EOF'
import json, glob
files = glob.glob("data/scenarios/*.json")
clean = [p for p in files if json.load(open(p))["ground_truth"]["should_approve"]]
assert len(clean) >= 1, "No approvable scenario found"
EOF
# ---------------------------------------------------------------------------
# 3. Python imports (package style — no PYTHONPATH tricks)
# ---------------------------------------------------------------------------
printf '\n%s\n' "[3] Python imports"

# Each entry is "<module>|<statement>": the module names the check, the
# statement is what the interpreter is asked to execute.
import_checks=(
  'src.grader|from src.grader import grade'
  'src.models|from src.models import PRReviewAction, PRReviewObservation, PRReviewReward'
  'src.env|from src.env import PRReviewEnv'
  'src.api|import src.api'
)
for entry in "${import_checks[@]}"; do
  _check "${entry%%|*} imports" venv/bin/python -c "${entry#*|}"
done

# ---------------------------------------------------------------------------
# 4. Grader unit tests
# ---------------------------------------------------------------------------
printf '\n%s\n' "[4] Grader"

# Both expected bug groups are matched by one comment and the verdict is
# "reject": the score must be exactly 1.0.
_check "perfect score = 1.0" venv/bin/python - <<'EOF'
from src.grader import grade
gt = {"bugs": [["off-by-one", "IndexError"], ["inner loop"]], "should_approve": False}
r = grade(gt, ["off-by-one in inner loop causes IndexError"], "reject")
assert r["score"] == 1.0, r
EOF

# No comments plus an "approve" verdict on a buggy PR: the score must be 0.0.
_check "zero score = 0.0" venv/bin/python - <<'EOF'
from src.grader import grade
gt = {"bugs": [["null"]], "should_approve": False}
r = grade(gt, [], "approve")
assert r["score"] == 0.0, r
EOF

# A bug-free PR that is approved must score 1.0.
_check "clean PR approved = 1.0" venv/bin/python - <<'EOF'
from src.grader import grade
r = grade({"bugs": [], "should_approve": True}, [], "approve")
assert r["score"] == 1.0, r
EOF

# The keyword "null" must not be credited for a comment that only says
# "nullable".
_check "no false positive: null not in nullable" venv/bin/python - <<'EOF'
from src.grader import grade
r = grade({"bugs": [["null"]], "should_approve": False}, ["nullable field"], "reject")
assert r["bugs_found"] == 0, r
EOF

# Rejecting an approvable PR must report a -0.2 penalty and a score below 1.0.
_check "false rejection penalty applied" venv/bin/python - <<'EOF'
from src.grader import grade
r = grade({"bugs": [], "should_approve": True}, [], "reject")
assert r["false_rejection_penalty"] == -0.2, r
assert r["score"] < 1.0, r
EOF
# ---------------------------------------------------------------------------
# 5. PRReviewEnv unit tests
# ---------------------------------------------------------------------------
printf '\n%s\n' "[5] PRReviewEnv"

# A fresh episode exposes a diff and a title, starts at step 0, and is open.
_check "reset returns valid observation" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv
e = PRReviewEnv(task="easy")
obs = e.reset()
assert obs.diff and obs.pr_title
assert obs.step_count == 0 and not obs.done
EOF

# A comment advances the step counter but leaves the episode running; the
# reward value must lie in [-1.0, 1.0].
_check "comment step: not done, step_count incremented" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv
from src.models import PRReviewAction
e = PRReviewEnv(task="easy"); e.reset()
obs, rew, done, _ = e.step(PRReviewAction(action_type="comment", body="test"))
assert not done and obs.step_count == 1 and -1.0 <= rew.value <= 1.0
EOF

# Approving ends the episode; the reward must lie in [-0.3, 0.3] and the info
# mapping must contain "score".
_check "approve step: done=True, score set" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv
from src.models import PRReviewAction
e = PRReviewEnv(task="easy"); e.reset()
_, rew, done, info = e.step(PRReviewAction(action_type="approve"))
assert done and -0.3 <= rew.value <= 0.3 and "score" in info
EOF

# Requesting changes also ends the episode.
_check "request_changes step: done=True" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv
from src.models import PRReviewAction
e = PRReviewEnv(task="medium"); e.reset()
_, _, done, _ = e.step(PRReviewAction(action_type="request_changes"))
assert done
EOF

# An unknown task name must raise ValueError from the constructor; any other
# outcome (including no exception at all) fails the check.
_check "invalid task raises ValueError" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv
try:
    PRReviewEnv(task="impossible")
    assert False
except ValueError:
    pass
EOF

# Every tier must be able to reset into an observation with a scenario id.
_check "all three task tiers have scenarios" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv
for task in ("easy", "medium", "hard"):
    e = PRReviewEnv(task=task)
    obs = e.reset()
    assert obs.scenario_id, f"No scenario_id for task={task}"
EOF
# ---------------------------------------------------------------------------
# 6. FastAPI endpoints
# ---------------------------------------------------------------------------
printf '\n%s\n' "[6] FastAPI endpoints"

# The root route must answer with exactly {"status": "ok"}.
_check "GET / returns ok" venv/bin/python - <<'EOF'
from fastapi.testclient import TestClient
import src.api as api
with TestClient(api.app) as c:
    r = c.get("/")
    assert r.json() == {"status": "ok"}
EOF

# /reset must succeed for every tier and return an open observation that
# carries "diff" and "step_count".
_check "POST /reset returns observation" venv/bin/python - <<'EOF'
from fastapi.testclient import TestClient
import src.api as api
with TestClient(api.app) as c:
    for task in ("easy", "medium", "hard"):
        r = c.post(f"/reset?task={task}")
        assert r.status_code == 200, r.text
        obs = r.json()
        assert "diff" in obs and "step_count" in obs and not obs["done"]
EOF

# A comment action must return 200 with done == False.
_check "POST /step comment keeps episode open" venv/bin/python - <<'EOF'
from fastapi.testclient import TestClient
import src.api as api
with TestClient(api.app) as c:
    c.post("/reset?task=easy")
    r = c.post("/step", json={"action_type": "comment", "body": "found a bug"})
    assert r.status_code == 200 and r.json()["done"] == False
EOF

# An approve action must return 200 with done == True and a reward value
# within [-0.3, 0.3].
_check "POST /step approve ends episode" venv/bin/python - <<'EOF'
from fastapi.testclient import TestClient
import src.api as api
with TestClient(api.app) as c:
    c.post("/reset?task=easy")
    r = c.post("/step", json={"action_type": "approve", "body": ""})
    assert r.status_code == 200 and r.json()["done"] == True
    assert -0.3 <= r.json()["reward"]["value"] <= 0.3
EOF

# After a reset to "hard", /state must report that task and an open episode.
_check "GET /state returns state" venv/bin/python - <<'EOF'
from fastapi.testclient import TestClient
import src.api as api
with TestClient(api.app) as c:
    c.post("/reset?task=hard")
    r = c.get("/state")
    assert r.status_code == 200
    s = r.json()
    assert s["task"] == "hard" and s["done"] == False
EOF
# ---------------------------------------------------------------------------
# 7. openenv.yaml structure
# ---------------------------------------------------------------------------
printf '\n%s\n' "[7] openenv.yaml"

# Identity keys must start a line.
for key in name version; do
  _check "has ${key} field" grep -q "^${key}:" openenv.yaml
done
# Each difficulty tier must appear as an "id: <tier>" entry.
for tier in easy medium hard; do
  _check "has ${tier} task" grep -q "id: ${tier}" openenv.yaml
done
# Space and reward declarations must start a line.
for key in observation_space action_space reward_range; do
  _check "has ${key}" grep -q "^${key}:" openenv.yaml
done

# ---------------------------------------------------------------------------
# 8. inference.py structure (via grep — avoids needing live env vars)
# ---------------------------------------------------------------------------
printf '\n%s\n' "[8] inference.py structure"

# Both settings must be assigned from an os.environ["<NAME>"] lookup.
for setting in MODEL_NAME HF_TOKEN; do
  _check "${setting} from os.environ" \
    grep -q "${setting}.*=.*os\.environ\[\"${setting}\"\]" inference.py
done
# The three logging helpers must be defined.
for helper in log_start log_step log_end; do
  _check "${helper} defined" grep -q "def ${helper}" inference.py
done
# Each tier name must appear as a double-quoted string literal.
for tier in easy medium hard; do
  _check "runs ${tier} task" grep -q "\"${tier}\"" inference.py
done

# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
summary_rule="======================================="
printf '\n%s\n PASSED: %d\n FAILED: %d\n%s\n' \
  "$summary_rule" "$PASS" "$FAIL" "$summary_rule"

# The script's exit status: success only when no check failed.
[[ "$FAIL" -eq 0 ]]