Spaces:

Rushhaabhhh
/

meta-hackathon

Sleeping

App Files Files Community

meta-hackathon / verify.sh

Rushhaabhhh

Added code for hackathon

8d96200 verified about 2 months ago

raw

history blame contribute delete

9.52 kB

	#!/usr/bin/env bash
	# verify.sh — smoke-test the PR Review OpenEnv project without Docker or live LLM.
	#
	# Strict mode: abort on unhandled errors, on unset variables, and when any
	# stage of a pipeline fails.
	set -o errexit -o nounset -o pipefail

	# Run from the directory that holds this script so that the relative paths
	# used by the checks below resolve.
	cd "$(dirname "$0")"

	# Running tallies of check outcomes.
	PASS=0 FAIL=0

	#######################################
	# Run one check command and record whether it succeeded.
	# Globals:
	#   PASS            (written) incremented when the command exits 0
	#   FAIL            (written) incremented when the command exits non-zero
	#   VERIFY_VERBOSE  (read, optional) when non-empty, the command's stderr
	#                   is shown instead of being discarded
	# Arguments:
	#   $1   - label printed next to the PASS/FAIL verdict
	#   $2.. - the command (and its arguments) to execute
	# Inputs:
	#   stdin is passed through to the command (callers feed heredocs this way).
	# Outputs:
	#   " PASS <label>" or " FAIL <label>" on stdout, plus whatever the command
	#   itself writes to stdout.
	# Returns:
	#   Always 0, so a failing check never trips `set -e`.
	#######################################
	_check() {
	  local label=$1
	  shift

	  local status=0
	  if [[ -n "${VERIFY_VERBOSE:-}" ]]; then
	    # Opt-in diagnostics: let the command's stderr through.
	    "$@" || status=$?
	  else
	    # Default: keep the report compact by discarding the command's stderr.
	    "$@" 2>/dev/null || status=$?
	  fi

	  if ((status == 0)); then
	    printf " PASS %s\n" "$label"
	    PASS=$((PASS + 1))
	  else
	    printf " FAIL %s\n" "$label"
	    FAIL=$((FAIL + 1))
	  fi
	}

	echo "=== PR Review OpenEnv — verify.sh ==="
	echo

	# ---------------------------------------------------------------------------
	# 1. Required files
	# ---------------------------------------------------------------------------
	# Each listed path must exist as a regular file (relative to the current
	# working directory); one PASS/FAIL line is reported per path.
	echo "[1] Required files"
	required_files=(
	  openenv.yaml
	  Dockerfile
	  requirements.txt
	  README.md
	  src/__init__.py
	  src/api.py
	  src/env.py
	  src/models.py
	  src/grader.py
	  inference.py
	)
	for required in "${required_files[@]}"; do
	  _check "$required" test -f "$required"
	done

	# ---------------------------------------------------------------------------
	# 2. Scenario JSON files
	# ---------------------------------------------------------------------------
	# The heredocs below use <<- so that the leading tab on each body line and on
	# the EOF terminator is stripped before Python reads the program; Python's
	# own block nesting is written with spaces, which <<- leaves untouched.
	echo ""
	echo "[2] Scenario JSON files"
	# Count matches with a glob-filled array instead of parsing `ls` output;
	# nullglob makes an unmatched pattern expand to zero entries.
	_check "at least 15 scenario files" bash -c \
	  'shopt -s nullglob; files=(data/scenarios/*.json); (( ${#files[@]} >= 15 ))'
	# Every scenario must be valid JSON and carry the keys asserted below.
	_check "all JSON files parse cleanly" venv/bin/python - <<-'EOF'
	import json, glob
	files = glob.glob("data/scenarios/*.json")
	for p in files:
	    with open(p) as f:
	        d = json.load(f)
	    for key in ("pr_title", "pr_description", "diff", "ground_truth"):
	        assert key in d, f"Missing '{key}' in {p}"
	    gt = d["ground_truth"]
	    assert "bugs" in gt and "should_approve" in gt, f"Bad ground_truth in {p}"
	print(f" {len(files)} files OK")
	EOF
	# At least one scenario must have a truthy ground_truth.should_approve.
	_check "at least one should_approve=true scenario" venv/bin/python - <<-'EOF'
	import json, glob

	def should_approve(path):
	    # Open via a context manager so the file handle is closed promptly.
	    with open(path) as f:
	        return json.load(f)["ground_truth"]["should_approve"]

	files = glob.glob("data/scenarios/*.json")
	clean = [p for p in files if should_approve(p)]
	assert len(clean) >= 1, "No approvable scenario found"
	EOF

	# ---------------------------------------------------------------------------
	# 3. Python imports (package style — no PYTHONPATH tricks)
	# ---------------------------------------------------------------------------
	echo
	echo "[3] Python imports"
	# Parallel lists: module name (used in the label) and the import statement
	# that the interpreter must be able to execute.
	import_modules=(src.grader src.models src.env src.api)
	import_stmts=(
	  "from src.grader import grade"
	  "from src.models import PRReviewAction, PRReviewObservation, PRReviewReward"
	  "from src.env import PRReviewEnv"
	  "import src.api"
	)
	for idx in "${!import_modules[@]}"; do
	  _check "${import_modules[idx]} imports" \
	    venv/bin/python -c "${import_stmts[idx]}"
	done

	# ---------------------------------------------------------------------------
	# 4. Grader unit tests
	# ---------------------------------------------------------------------------
	# Each check pipes a short Python program into the interpreter on stdin.
	# <<- is used so the leading tab on body lines and on the EOF terminator is
	# stripped; with plain << a tab-indented EOF would never end the heredoc.
	echo ""
	echo "[4] Grader"
	# All expected keywords mentioned + correct verdict -> score of exactly 1.0.
	_check "perfect score = 1.0" venv/bin/python - <<-'EOF'
	from src.grader import grade
	gt = {"bugs": [["off-by-one", "IndexError"], ["inner loop"]], "should_approve": False}
	r = grade(gt, ["off-by-one in inner loop causes IndexError"], "reject")
	assert r["score"] == 1.0, r
	EOF
	# No comments + wrong verdict on a buggy PR -> score of exactly 0.0.
	_check "zero score = 0.0" venv/bin/python - <<-'EOF'
	from src.grader import grade
	gt = {"bugs": [["null"]], "should_approve": False}
	r = grade(gt, [], "approve")
	assert r["score"] == 0.0, r
	EOF
	# A bug-free PR that is approved scores 1.0.
	_check "clean PR approved = 1.0" venv/bin/python - <<-'EOF'
	from src.grader import grade
	r = grade({"bugs": [], "should_approve": True}, [], "approve")
	assert r["score"] == 1.0, r
	EOF
	# The keyword "null" must not be counted as found inside "nullable".
	_check "no false positive: null not in nullable" venv/bin/python - <<-'EOF'
	from src.grader import grade
	r = grade({"bugs": [["null"]], "should_approve": False}, ["nullable field"], "reject")
	assert r["bugs_found"] == 0, r
	EOF
	# Rejecting an approvable PR must report a -0.2 penalty and lower the score.
	_check "false rejection penalty applied" venv/bin/python - <<-'EOF'
	from src.grader import grade
	r = grade({"bugs": [], "should_approve": True}, [], "reject")
	assert r["false_rejection_penalty"] == -0.2, r
	assert r["score"] < 1.0, r
	EOF

	# ---------------------------------------------------------------------------
	# 5. PRReviewEnv unit tests
	# ---------------------------------------------------------------------------
	# Heredocs use <<- so the leading tab on body lines and on the EOF terminator
	# is stripped; nested Python blocks are indented with spaces, which <<-
	# preserves.
	echo ""
	echo "[5] PRReviewEnv"
	# A fresh episode exposes a diff and title, starts at step 0 and is not done.
	_check "reset returns valid observation" venv/bin/python - <<-'EOF'
	from src.env import PRReviewEnv
	e = PRReviewEnv(task="easy")
	obs = e.reset()
	assert obs.diff and obs.pr_title
	assert obs.step_count == 0 and not obs.done
	EOF
	# A comment action advances the step counter without ending the episode.
	_check "comment step: not done, step_count incremented" venv/bin/python - <<-'EOF'
	from src.env import PRReviewEnv
	from src.models import PRReviewAction
	e = PRReviewEnv(task="easy"); e.reset()
	obs, rew, done, _ = e.step(PRReviewAction(action_type="comment", body="test"))
	assert not done and obs.step_count == 1 and -1.0 <= rew.value <= 1.0
	EOF
	# An approve action ends the episode and reports a score in the info dict.
	_check "approve step: done=True, score set" venv/bin/python - <<-'EOF'
	from src.env import PRReviewEnv
	from src.models import PRReviewAction
	e = PRReviewEnv(task="easy"); e.reset()
	_, rew, done, info = e.step(PRReviewAction(action_type="approve"))
	assert done and -0.3 <= rew.value <= 0.3 and "score" in info
	EOF
	# A request_changes action also ends the episode.
	_check "request_changes step: done=True" venv/bin/python - <<-'EOF'
	from src.env import PRReviewEnv
	from src.models import PRReviewAction
	e = PRReviewEnv(task="medium"); e.reset()
	_, _, done, _ = e.step(PRReviewAction(action_type="request_changes"))
	assert done
	EOF
	# Constructing the env with an unknown task name must raise ValueError.
	_check "invalid task raises ValueError" venv/bin/python - <<-'EOF'
	from src.env import PRReviewEnv
	try:
	    PRReviewEnv(task="impossible")
	except ValueError:
	    pass
	else:
	    raise AssertionError("expected ValueError for an unknown task")
	EOF
	# Every tier must be able to reset into a scenario with a non-empty id.
	_check "all three task tiers have scenarios" venv/bin/python - <<-'EOF'
	from src.env import PRReviewEnv
	for task in ("easy", "medium", "hard"):
	    e = PRReviewEnv(task=task)
	    obs = e.reset()
	    assert obs.scenario_id, f"No scenario_id for task={task}"
	EOF

	# ---------------------------------------------------------------------------
	# 6. FastAPI endpoints
	# ---------------------------------------------------------------------------
	# The app is exercised in-process through fastapi.testclient.TestClient.
	# Heredocs use <<- so the leading tab on body lines and on the EOF terminator
	# is stripped; nested Python blocks are indented with spaces, which <<-
	# preserves.
	echo ""
	echo "[6] FastAPI endpoints"
	_check "GET / returns ok" venv/bin/python - <<-'EOF'
	from fastapi.testclient import TestClient
	import src.api as api
	with TestClient(api.app) as c:
	    r = c.get("/")
	    assert r.json() == {"status": "ok"}
	EOF
	# /reset must succeed for every tier and return a not-yet-done observation.
	_check "POST /reset returns observation" venv/bin/python - <<-'EOF'
	from fastapi.testclient import TestClient
	import src.api as api
	with TestClient(api.app) as c:
	    for task in ("easy", "medium", "hard"):
	        r = c.post(f"/reset?task={task}")
	        assert r.status_code == 200, r.text
	        obs = r.json()
	        assert "diff" in obs and "step_count" in obs and not obs["done"]
	EOF
	_check "POST /step comment keeps episode open" venv/bin/python - <<-'EOF'
	from fastapi.testclient import TestClient
	import src.api as api
	with TestClient(api.app) as c:
	    c.post("/reset?task=easy")
	    r = c.post("/step", json={"action_type": "comment", "body": "found a bug"})
	    assert r.status_code == 200 and r.json()["done"] == False
	EOF
	_check "POST /step approve ends episode" venv/bin/python - <<-'EOF'
	from fastapi.testclient import TestClient
	import src.api as api
	with TestClient(api.app) as c:
	    c.post("/reset?task=easy")
	    r = c.post("/step", json={"action_type": "approve", "body": ""})
	    assert r.status_code == 200 and r.json()["done"] == True
	    assert -0.3 <= r.json()["reward"]["value"] <= 0.3
	EOF
	_check "GET /state returns state" venv/bin/python - <<-'EOF'
	from fastapi.testclient import TestClient
	import src.api as api
	with TestClient(api.app) as c:
	    c.post("/reset?task=hard")
	    r = c.get("/state")
	    assert r.status_code == 200
	    s = r.json()
	    assert s["task"] == "hard" and s["done"] == False
	EOF

	# ---------------------------------------------------------------------------
	# 7. openenv.yaml structure
	# ---------------------------------------------------------------------------
	# Plain-text greps only: keys are matched at the start of a line, task ids
	# anywhere on a line.
	echo
	echo "[7] openenv.yaml"
	for key in name version; do
	  _check "has ${key} field" grep -q "^${key}:" openenv.yaml
	done
	for tier in easy medium hard; do
	  _check "has ${tier} task" grep -q "id: ${tier}" openenv.yaml
	done
	for key in observation_space action_space reward_range; do
	  _check "has ${key}" grep -q "^${key}:" openenv.yaml
	done

	# ---------------------------------------------------------------------------
	# 8. inference.py structure (via grep — avoids needing live env vars)
	# ---------------------------------------------------------------------------
	echo
	echo "[8] inference.py structure"
	# Settings assigned from os.environ["<NAME>"]. Inside double quotes the
	# backslashes before '.', '[' and ']' are kept, so grep receives them as
	# escapes; each unescaped '.' matches the single character around '='.
	for var in MODEL_NAME HF_TOKEN; do
	  _check "${var} from os.environ" \
	    grep -q "${var}.=.os\.environ\[\"${var}\"\]" inference.py
	done
	# Logging helpers that must be defined.
	for fn in log_start log_step log_end; do
	  _check "${fn} defined" \
	    grep -q "def ${fn}" inference.py
	done
	# Each tier name must appear as a double-quoted string.
	for tier in easy medium hard; do
	  _check "runs ${tier} task" grep -q "\"${tier}\"" inference.py
	done

	# ---------------------------------------------------------------------------
	# Summary
	# ---------------------------------------------------------------------------
	# Report the tallies. The final test is the last command of the script, so
	# its result (zero only when no check failed) becomes the exit status.
	divider="======================================="
	echo
	echo "$divider"
	printf " PASSED: %d\n" "$PASS"
	printf " FAILED: %d\n" "$FAIL"
	echo "$divider"
	[[ "$FAIL" -eq 0 ]]