Spaces:
Sleeping
Sleeping
# verify.sh — smoke-test the PR Review OpenEnv project without Docker or live LLM.

# Strict mode: abort on command failure, on use of an unset variable, and on
# a failure anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail

# Work from the directory that holds this script so every relative path
# used by the checks below resolves the same way regardless of the caller's cwd.
cd "$(dirname "$0")"

# Running tallies, updated by _check and reported in the summary.
FAIL=0
PASS=0
# _check LABEL CMD [ARGS...]
#
# Run CMD with ARGS and record the outcome under LABEL.
#   - exit status 0     -> prints " PASS LABEL" and increments PASS
#   - any other status  -> prints " FAIL LABEL" and increments FAIL
# stdin is passed through untouched, so callers may feed CMD a here-document.
#
# By default the command's stderr is discarded to keep the report compact.
# Set VERIFY_VERBOSE to any non-empty value to let stderr through, which makes
# a failing check diagnosable without editing the script.
_check() {
    local label=$1; shift
    local rc=0

    # "|| rc=$?" keeps a failing command from tripping `set -e` in the caller.
    if [ -n "${VERIFY_VERBOSE:-}" ]; then
        "$@" || rc=$?
    else
        "$@" 2>/dev/null || rc=$?
    fi

    if [ "$rc" -eq 0 ]; then
        printf " PASS %s\n" "$label"
        PASS=$((PASS + 1))
    else
        printf " FAIL %s\n" "$label"
        FAIL=$((FAIL + 1))
    fi
}
# Banner followed by a blank line.
printf '%s\n' "=== PR Review OpenEnv — verify.sh ===" ""

# ---------------------------------------------------------------------------
# 1. Required files
# ---------------------------------------------------------------------------
printf '%s\n' "[1] Required files"

# Files that must exist, relative to the script directory. Each one is
# reported as its own PASS/FAIL line, labelled with its path.
required_files=(
    openenv.yaml
    Dockerfile
    requirements.txt
    README.md
    src/__init__.py
    src/api.py
    src/env.py
    src/models.py
    src/grader.py
    inference.py
)
for path in "${required_files[@]}"; do
    _check "$path" test -f "$path"
done
# ---------------------------------------------------------------------------
# 2. Scenario JSON files
# ---------------------------------------------------------------------------
echo ""
echo "[2] Scenario JSON files"

# Count matches with a glob expanded into an array instead of parsing `ls`
# output; nullglob makes a non-matching pattern expand to zero entries.
_check "at least 15 scenario files" bash -c \
    'shopt -s nullglob; files=(data/scenarios/*.json); [ "${#files[@]}" -ge 15 ]'

# Every scenario must be valid JSON and carry the keys checked below.
# Files are opened as UTF-8 explicitly (rather than the locale default) and
# closed deterministically via `with`.
_check "all JSON files parse cleanly" venv/bin/python - <<'EOF'
import glob
import json

files = glob.glob("data/scenarios/*.json")
for p in files:
    with open(p, encoding="utf-8") as f:
        d = json.load(f)
    for key in ("pr_title", "pr_description", "diff", "ground_truth"):
        assert key in d, f"Missing '{key}' in {p}"
    gt = d["ground_truth"]
    assert "bugs" in gt and "should_approve" in gt, f"Bad ground_truth in {p}"
print(f" {len(files)} files OK")
EOF

# At least one scenario must have a truthy ground_truth.should_approve.
_check "at least one should_approve=true scenario" venv/bin/python - <<'EOF'
import glob
import json


def should_approve(path):
    # Context manager closes the handle instead of leaking it.
    with open(path, encoding="utf-8") as f:
        return json.load(f)["ground_truth"]["should_approve"]


files = glob.glob("data/scenarios/*.json")
clean = [p for p in files if should_approve(p)]
assert len(clean) >= 1, "No approvable scenario found"
EOF
# ---------------------------------------------------------------------------
# 3. Python imports (package style — no PYTHONPATH tricks)
# ---------------------------------------------------------------------------
printf '%s\n' "" "[3] Python imports"

# Each module is imported in a fresh interpreter so one failing import
# does not mask the others.
_check "src.grader imports" \
    venv/bin/python -c 'from src.grader import grade'
_check "src.models imports" \
    venv/bin/python -c 'from src.models import PRReviewAction, PRReviewObservation, PRReviewReward'
_check "src.env imports" \
    venv/bin/python -c 'from src.env import PRReviewEnv'
_check "src.api imports" \
    venv/bin/python -c 'import src.api'
# ---------------------------------------------------------------------------
# 4. Grader unit tests
# ---------------------------------------------------------------------------
printf '%s\n' "" "[4] Grader"

# Each heredoc is a standalone Python script; a failed assert makes the
# interpreter exit non-zero, which _check reports as FAIL.

_check "perfect score = 1.0" venv/bin/python - <<'EOF'
from src.grader import grade

truth = {
    "bugs": [["off-by-one", "IndexError"], ["inner loop"]],
    "should_approve": False,
}
comments = ["off-by-one in inner loop causes IndexError"]
result = grade(truth, comments, "reject")
assert result["score"] == 1.0, result
EOF

_check "zero score = 0.0" venv/bin/python - <<'EOF'
from src.grader import grade

truth = {"bugs": [["null"]], "should_approve": False}
result = grade(truth, [], "approve")
assert result["score"] == 0.0, result
EOF

_check "clean PR approved = 1.0" venv/bin/python - <<'EOF'
from src.grader import grade

truth = {"bugs": [], "should_approve": True}
result = grade(truth, [], "approve")
assert result["score"] == 1.0, result
EOF

_check "no false positive: null not in nullable" venv/bin/python - <<'EOF'
from src.grader import grade

truth = {"bugs": [["null"]], "should_approve": False}
result = grade(truth, ["nullable field"], "reject")
assert result["bugs_found"] == 0, result
EOF

_check "false rejection penalty applied" venv/bin/python - <<'EOF'
from src.grader import grade

truth = {"bugs": [], "should_approve": True}
result = grade(truth, [], "reject")
assert result["false_rejection_penalty"] == -0.2, result
assert result["score"] < 1.0, result
EOF
# ---------------------------------------------------------------------------
# 5. PRReviewEnv unit tests
# ---------------------------------------------------------------------------
printf '%s\n' "" "[5] PRReviewEnv"

_check "reset returns valid observation" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv

env = PRReviewEnv(task="easy")
first = env.reset()
assert first.diff and first.pr_title
assert first.step_count == 0 and not first.done
EOF

_check "comment step: not done, step_count incremented" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv
from src.models import PRReviewAction

env = PRReviewEnv(task="easy")
env.reset()
action = PRReviewAction(action_type="comment", body="test")
obs, reward, done, _ = env.step(action)
assert not done and obs.step_count == 1 and -1.0 <= reward.value <= 1.0
EOF

_check "approve step: done=True, score set" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv
from src.models import PRReviewAction

env = PRReviewEnv(task="easy")
env.reset()
action = PRReviewAction(action_type="approve")
_, reward, done, info = env.step(action)
assert done and -0.3 <= reward.value <= 0.3 and "score" in info
EOF

_check "request_changes step: done=True" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv
from src.models import PRReviewAction

env = PRReviewEnv(task="medium")
env.reset()
action = PRReviewAction(action_type="request_changes")
_, _, done, _ = env.step(action)
assert done
EOF

# Constructing the env with an unknown task must raise ValueError; reaching
# the `else` branch means nothing was raised, which is a failure.
_check "invalid task raises ValueError" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv

try:
    PRReviewEnv(task="impossible")
except ValueError:
    pass
else:
    raise AssertionError
EOF

_check "all three task tiers have scenarios" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv

for tier in ("easy", "medium", "hard"):
    env = PRReviewEnv(task=tier)
    first = env.reset()
    assert first.scenario_id, f"No scenario_id for task={tier}"
EOF
# ---------------------------------------------------------------------------
# 6. FastAPI endpoints
# ---------------------------------------------------------------------------
printf '%s\n' "" "[6] FastAPI endpoints"

# All endpoint checks drive the app in-process through FastAPI's TestClient,
# so no server has to be running.

_check "GET / returns ok" venv/bin/python - <<'EOF'
from fastapi.testclient import TestClient

import src.api as api

with TestClient(api.app) as client:
    resp = client.get("/")
    assert resp.json() == {"status": "ok"}
EOF

_check "POST /reset returns observation" venv/bin/python - <<'EOF'
from fastapi.testclient import TestClient

import src.api as api

with TestClient(api.app) as client:
    for tier in ("easy", "medium", "hard"):
        resp = client.post(f"/reset?task={tier}")
        assert resp.status_code == 200, resp.text
        payload = resp.json()
        assert "diff" in payload and "step_count" in payload and not payload["done"]
EOF

_check "POST /step comment keeps episode open" venv/bin/python - <<'EOF'
from fastapi.testclient import TestClient

import src.api as api

with TestClient(api.app) as client:
    client.post("/reset?task=easy")
    action = {"action_type": "comment", "body": "found a bug"}
    resp = client.post("/step", json=action)
    assert resp.status_code == 200
    assert resp.json()["done"] == False
EOF

_check "POST /step approve ends episode" venv/bin/python - <<'EOF'
from fastapi.testclient import TestClient

import src.api as api

with TestClient(api.app) as client:
    client.post("/reset?task=easy")
    action = {"action_type": "approve", "body": ""}
    resp = client.post("/step", json=action)
    assert resp.status_code == 200
    assert resp.json()["done"] == True
    assert -0.3 <= resp.json()["reward"]["value"] <= 0.3
EOF

_check "GET /state returns state" venv/bin/python - <<'EOF'
from fastapi.testclient import TestClient

import src.api as api

with TestClient(api.app) as client:
    client.post("/reset?task=hard")
    resp = client.get("/state")
    assert resp.status_code == 200
    state = resp.json()
    assert state["task"] == "hard" and state["done"] == False
EOF
# ---------------------------------------------------------------------------
# 7. openenv.yaml structure
# ---------------------------------------------------------------------------
printf '%s\n' "" "[7] openenv.yaml"

# Top-level metadata keys, anchored at the start of a line.
for key in name version; do
    _check "has $key field" grep -q "^$key:" openenv.yaml
done

# One task entry per difficulty tier (matched anywhere on a line).
for tier in easy medium hard; do
    _check "has $tier task" grep -q "id: $tier" openenv.yaml
done

# Remaining top-level keys, anchored at the start of a line.
for key in observation_space action_space reward_range; do
    _check "has $key" grep -q "^$key:" openenv.yaml
done
# ---------------------------------------------------------------------------
# 8. inference.py structure (via grep — avoids needing live env vars)
# ---------------------------------------------------------------------------
printf '%s\n' "" "[8] inference.py structure"

# Settings must be read with os.environ["..."] subscript syntax.
_check "MODEL_NAME from os.environ" grep -q \
    'MODEL_NAME.*=.*os\.environ\["MODEL_NAME"\]' inference.py
_check "HF_TOKEN from os.environ" grep -q \
    'HF_TOKEN.*=.*os\.environ\["HF_TOKEN"\]' inference.py

# The three logging helpers must be defined.
for fn in log_start log_step log_end; do
    _check "$fn defined" grep -q "def $fn" inference.py
done

# Each task tier must appear as a double-quoted string literal.
for tier in easy medium hard; do
    _check "runs $tier task" grep -q "\"$tier\"" inference.py
done
# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
rule="======================================="
printf '%s\n' "" "$rule"
printf " PASSED: %d\n FAILED: %d\n" "$PASS" "$FAIL"
printf '%s\n' "$rule"

# Last command of the script: its status becomes the script's exit status,
# so the run succeeds only when no check failed.
test "$FAIL" -eq 0