meta-hackathon / verify.sh
Rushhaabhhh's picture
Added code for hackathon
8d96200 verified
#!/usr/bin/env bash
# verify.sh — smoke-test the PR Review OpenEnv project without Docker or live LLM.
#
# Exit status is 0 only when no check failed.

# Strict mode: stop on unhandled errors, unset variables and failed pipelines.
set -o errexit -o nounset -o pipefail

# Every path used below is relative, so work from the script's own directory.
script_dir=$(dirname "$0")
cd "$script_dir"

# Running tallies; _check increments one of them per check.
PASS=0 FAIL=0
#######################################
# Run one check command and tally the outcome.
# Globals:
#   PASS (written)  - incremented when the command exits 0
#   FAIL (written)  - incremented when the command exits non-zero
#   VERIFY_VERBOSE (read, optional) - when set to 1 the command's stderr is
#     passed through so the reason for a FAIL is visible; otherwise stderr
#     is discarded to keep the report compact (the historical behaviour)
# Arguments:
#   $1  - label printed next to PASS/FAIL
#   $2… - the command and its arguments
# Inputs:
#   stdin is inherited by the command, so a here-doc attached to the
#   _check call is what the command reads.
# Outputs:
#   " PASS <label>" or " FAIL <label>" on stdout, after any stdout the
#   command itself produced.
# Returns:
#   0 for both outcomes; a failing check is only counted, so the remaining
#   checks still run under `set -e`.
#######################################
_check() {
  local label=$1
  shift
  local status=0
  # `|| status=$?` records the exit code without tripping errexit.
  if [[ "${VERIFY_VERBOSE:-0}" == "1" ]]; then
    "$@" || status=$?
  else
    "$@" 2>/dev/null || status=$?
  fi
  if ((status == 0)); then
    printf " PASS %s\n" "$label"
    PASS=$((PASS + 1))
  else
    printf " FAIL %s\n" "$label"
    FAIL=$((FAIL + 1))
  fi
}
printf '%s\n' "=== PR Review OpenEnv — verify.sh ===" ""
# ---------------------------------------------------------------------------
# 1. Required files
# ---------------------------------------------------------------------------
printf '%s\n' "[1] Required files"
# Each entry must exist as a regular file relative to the current directory.
required_files=(
  openenv.yaml
  Dockerfile
  requirements.txt
  README.md
  src/__init__.py
  src/api.py
  src/env.py
  src/models.py
  src/grader.py
  inference.py
)
for required in "${required_files[@]}"; do
  _check "$required" test -f "$required"
done
# ---------------------------------------------------------------------------
# 2. Scenario JSON files
# ---------------------------------------------------------------------------
echo ""
echo "[2] Scenario JSON files"
# Count scenario files with a glob array instead of parsing `ls` output:
# nullglob makes "no match" an empty array, and odd file names cannot skew
# the count the way a line count of `ls` can.
_check "at least 15 scenario files" bash -c \
  'shopt -s nullglob; files=(data/scenarios/*.json); (( ${#files[@]} >= 15 ))'
# Every scenario must be valid JSON and carry the keys asserted below.
# The here-doc is fed to Python verbatim, so block bodies must be indented.
_check "all JSON files parse cleanly" venv/bin/python - <<'EOF'
import glob
import json

files = glob.glob("data/scenarios/*.json")
for p in files:
    with open(p) as f:
        d = json.load(f)
    for key in ("pr_title", "pr_description", "diff", "ground_truth"):
        assert key in d, f"Missing '{key}' in {p}"
    gt = d["ground_truth"]
    assert "bugs" in gt and "should_approve" in gt, f"Bad ground_truth in {p}"
print(f" {len(files)} files OK")
EOF
# At least one scenario must have a truthy ground_truth.should_approve.
_check "at least one should_approve=true scenario" venv/bin/python - <<'EOF'
import glob
import json


def should_approve(path):
    # Close each file promptly instead of leaking the handle.
    with open(path) as f:
        return json.load(f)["ground_truth"]["should_approve"]


clean = [p for p in glob.glob("data/scenarios/*.json") if should_approve(p)]
assert len(clean) >= 1, "No approvable scenario found"
EOF
# ---------------------------------------------------------------------------
# 3. Python imports (package style — no PYTHONPATH tricks)
# ---------------------------------------------------------------------------
printf '%s\n' "" "[3] Python imports"
# One entry per module: "<module>|<import statement to execute>".
import_checks=(
  "src.grader|from src.grader import grade"
  "src.models|from src.models import PRReviewAction, PRReviewObservation, PRReviewReward"
  "src.env|from src.env import PRReviewEnv"
  "src.api|import src.api"
)
for entry in "${import_checks[@]}"; do
  # Text before the first "|" is the module name, the rest is the statement.
  _check "${entry%%|*} imports" venv/bin/python -c "${entry#*|}"
done
# ---------------------------------------------------------------------------
# 4. Grader unit tests
# ---------------------------------------------------------------------------
printf '%s\n' "" "[4] Grader"
# Each here-doc is a tiny Python script; a failed assert makes the check FAIL.

# All listed bugs mentioned and verdict "reject" -> score must be exactly 1.0.
_check "perfect score = 1.0" venv/bin/python - <<'EOF'
from src.grader import grade

truth = {
    "bugs": [["off-by-one", "IndexError"], ["inner loop"]],
    "should_approve": False,
}
result = grade(truth, ["off-by-one in inner loop causes IndexError"], "reject")
assert result["score"] == 1.0, result
EOF

# No comments and an "approve" verdict on a buggy PR -> score must be 0.0.
_check "zero score = 0.0" venv/bin/python - <<'EOF'
from src.grader import grade

truth = {"bugs": [["null"]], "should_approve": False}
result = grade(truth, [], "approve")
assert result["score"] == 0.0, result
EOF

# A bug-free PR that gets approved -> score must be 1.0.
_check "clean PR approved = 1.0" venv/bin/python - <<'EOF'
from src.grader import grade

truth = {"bugs": [], "should_approve": True}
result = grade(truth, [], "approve")
assert result["score"] == 1.0, result
EOF

# The keyword "null" must not be counted as found inside "nullable".
_check "no false positive: null not in nullable" venv/bin/python - <<'EOF'
from src.grader import grade

truth = {"bugs": [["null"]], "should_approve": False}
result = grade(truth, ["nullable field"], "reject")
assert result["bugs_found"] == 0, result
EOF

# Rejecting an approvable PR must report the -0.2 penalty and score below 1.0.
_check "false rejection penalty applied" venv/bin/python - <<'EOF'
from src.grader import grade

truth = {"bugs": [], "should_approve": True}
result = grade(truth, [], "reject")
assert result["false_rejection_penalty"] == -0.2, result
assert result["score"] < 1.0, result
EOF
# ---------------------------------------------------------------------------
# 5. PRReviewEnv unit tests
# ---------------------------------------------------------------------------
echo ""
echo "[5] PRReviewEnv"
# The here-docs are passed to Python verbatim, so compound statements
# (try/except, for) need their bodies indented to be valid Python.

# reset() must yield a populated observation at step 0 that is not done.
_check "reset returns valid observation" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv

e = PRReviewEnv(task="easy")
obs = e.reset()
assert obs.diff and obs.pr_title
assert obs.step_count == 0 and not obs.done
EOF

# A "comment" action advances the step counter without ending the episode.
_check "comment step: not done, step_count incremented" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv
from src.models import PRReviewAction

e = PRReviewEnv(task="easy")
e.reset()
obs, rew, done, _ = e.step(PRReviewAction(action_type="comment", body="test"))
assert not done and obs.step_count == 1 and -1.0 <= rew.value <= 1.0
EOF

# "approve" ends the episode; reward must lie in [-0.3, 0.3] and info has a score.
_check "approve step: done=True, score set" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv
from src.models import PRReviewAction

e = PRReviewEnv(task="easy")
e.reset()
_, rew, done, info = e.step(PRReviewAction(action_type="approve"))
assert done and -0.3 <= rew.value <= 0.3 and "score" in info
EOF

# "request_changes" also ends the episode.
_check "request_changes step: done=True" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv
from src.models import PRReviewAction

e = PRReviewEnv(task="medium")
e.reset()
_, _, done, _ = e.step(PRReviewAction(action_type="request_changes"))
assert done
EOF

# An unknown task name must raise ValueError. The AssertionError from
# `assert False` is not caught below, so a silent constructor fails the check.
_check "invalid task raises ValueError" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv

try:
    PRReviewEnv(task="impossible")
    assert False
except ValueError:
    pass
EOF

# Every tier must be able to reset into a scenario with a scenario_id.
_check "all three task tiers have scenarios" venv/bin/python - <<'EOF'
from src.env import PRReviewEnv

for task in ("easy", "medium", "hard"):
    e = PRReviewEnv(task=task)
    obs = e.reset()
    assert obs.scenario_id, f"No scenario_id for task={task}"
EOF
# ---------------------------------------------------------------------------
# 6. FastAPI endpoints
# ---------------------------------------------------------------------------
echo ""
echo "[6] FastAPI endpoints"
# The app is exercised through fastapi.testclient.TestClient. The here-docs
# reach Python verbatim, so each `with` body must be indented to parse.

# GET / must answer exactly {"status": "ok"}.
_check "GET / returns ok" venv/bin/python - <<'EOF'
from fastapi.testclient import TestClient
import src.api as api

with TestClient(api.app) as c:
    r = c.get("/")
    assert r.json() == {"status": "ok"}
EOF

# POST /reset must return 200 and a not-done observation for every tier.
_check "POST /reset returns observation" venv/bin/python - <<'EOF'
from fastapi.testclient import TestClient
import src.api as api

with TestClient(api.app) as c:
    for task in ("easy", "medium", "hard"):
        r = c.post(f"/reset?task={task}")
        assert r.status_code == 200, r.text
        obs = r.json()
        assert "diff" in obs and "step_count" in obs and not obs["done"]
EOF

# A comment step must leave the episode open.
_check "POST /step comment keeps episode open" venv/bin/python - <<'EOF'
from fastapi.testclient import TestClient
import src.api as api

with TestClient(api.app) as c:
    c.post("/reset?task=easy")
    r = c.post("/step", json={"action_type": "comment", "body": "found a bug"})
    assert r.status_code == 200 and r.json()["done"] == False
EOF

# An approve step must close the episode with a reward value in [-0.3, 0.3].
_check "POST /step approve ends episode" venv/bin/python - <<'EOF'
from fastapi.testclient import TestClient
import src.api as api

with TestClient(api.app) as c:
    c.post("/reset?task=easy")
    r = c.post("/step", json={"action_type": "approve", "body": ""})
    assert r.status_code == 200 and r.json()["done"] == True
    assert -0.3 <= r.json()["reward"]["value"] <= 0.3
EOF

# GET /state must report the task chosen at reset and done == False.
_check "GET /state returns state" venv/bin/python - <<'EOF'
from fastapi.testclient import TestClient
import src.api as api

with TestClient(api.app) as c:
    c.post("/reset?task=hard")
    r = c.get("/state")
    assert r.status_code == 200
    s = r.json()
    assert s["task"] == "hard" and s["done"] == False
EOF
# ---------------------------------------------------------------------------
# 7. openenv.yaml structure
# ---------------------------------------------------------------------------
printf '%s\n' "" "[7] openenv.yaml"
# Plain-text probes of the manifest; the order matches the report layout.
for key in name version; do
  _check "has $key field" grep -q "^${key}:" openenv.yaml
done
for tier in easy medium hard; do
  _check "has $tier task" grep -q "id: ${tier}" openenv.yaml
done
for key in observation_space action_space reward_range; do
  _check "has $key" grep -q "^${key}:" openenv.yaml
done
# ---------------------------------------------------------------------------
# 8. inference.py structure (via grep — avoids needing live env vars)
# ---------------------------------------------------------------------------
printf '%s\n' "" "[8] inference.py structure"
# Settings must be read as os.environ["<NAME>"] and assigned to <NAME>.
for env_name in MODEL_NAME HF_TOKEN; do
  env_pattern="${env_name}"'.*=.*os\.environ\["'"${env_name}"'"\]'
  _check "${env_name} from os.environ" grep -q "$env_pattern" inference.py
done
# The three logging helpers must be defined.
for helper in log_start log_step log_end; do
  _check "${helper} defined" grep -q "def ${helper}" inference.py
done
# Each task tier must appear as a double-quoted string literal.
for tier in easy medium hard; do
  _check "runs ${tier} task" grep -q "\"${tier}\"" inference.py
done
# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
# Print the tallies between two rules, then turn any failure into a
# non-zero exit status for the whole script.
rule="======================================="
printf '%s\n' "" "$rule"
printf " PASSED: %d\n" "$PASS"
printf " FAILED: %d\n" "$FAIL"
printf '%s\n' "$rule"
if ((FAIL != 0)); then
  exit 1
fi