# openenv-rl-environment / scripts/validate_submission.sh
# (repository-listing metadata preserved as comments so the script parses:
#  commit aae9736 by Sid8421 — "Improve README, tests, and validation
#  script for RL environment")
#!/usr/bin/env bash
# Validation driver for the RL environment submission: runs the test suite,
# verifies the manifest/API/task invariants via inline Python, then smoke-tests
# the inference script. Exits non-zero on the first failing check (set -e).
set -euo pipefail
echo "[validate] Running pytest"
# Full test suite; -q for quiet output. Under set -e, a test failure aborts here.
python -m pytest -q
echo "[validate] Running grader determinism/bounds checks"
# NOTE(review): the full run above already executed tests/test_graders.py;
# this focused re-run is redundant unless it is an intentional double-check
# (e.g. catching nondeterministic grader output across runs) — confirm intent.
python -m pytest -q tests/test_graders.py
echo "[validate] Verifying openenv.yaml parses"
# Parse the manifest with the interpreter's YAML library and assert the
# top-level keys the platform requires are present.
python - <<'PY'
"""Validate that openenv.yaml parses to a mapping with all required keys."""
import yaml

with open("openenv.yaml", "r", encoding="utf-8") as f:
    data = yaml.safe_load(f)
# safe_load returns None for an empty file, and a list/scalar for non-mapping
# documents; in either case `k not in data` would raise a confusing TypeError.
# Fail with an explicit message instead.
if not isinstance(data, dict):
    raise SystemExit(f"openenv.yaml did not parse to a mapping (got {type(data).__name__})")
required = ["name", "version", "description", "action_space", "observation_space", "reward_description"]
missing = [k for k in required if k not in data]
if missing:
    raise SystemExit(f"openenv.yaml missing required keys: {missing}")
print("openenv.yaml OK")
PY
echo "[validate] Verifying API endpoints and reset/step/state behavior"
# Exercise the FastAPI app in-process (no server needed): GET /, POST /reset,
# POST /step with one benign action, then GET /state for the session.
python - <<'PY'
"""Smoke-test the HTTP surface of the environment server."""
from fastapi.testclient import TestClient

from server.app import app

client = TestClient(app)

# Root endpoint must respond.
r = client.get("/")
if r.status_code != 200:
    raise SystemExit(f"GET / failed with status {r.status_code}")

# /reset must mint a session we can act against.
reset_resp = client.post("/reset", json={"task_id": "task_easy_1"})
if reset_resp.status_code != 200:
    raise SystemExit(f"POST /reset failed with status {reset_resp.status_code}")
payload = reset_resp.json()
session_id = payload.get("session_id")
if not session_id:
    raise SystemExit("/reset response missing session_id")

# One benign action against the fresh session.
step_resp = client.post(
    "/step",
    json={
        "session_id": session_id,
        "action": {"action_type": "check_policy", "parameters": {}},
    },
)
if step_resp.status_code != 200:
    raise SystemExit(f"POST /step failed with status {step_resp.status_code}")

# Pass the session id via params= so it is URL-encoded; raw f-string
# interpolation into the query string would break if the id ever contains
# reserved characters (&, =, +, spaces, ...).
state_resp = client.get("/state", params={"session_id": session_id})
if state_resp.status_code != 200:
    raise SystemExit(f"GET /state failed with status {state_resp.status_code}")

print("API endpoint checks OK")
PY
echo "[validate] Verifying task difficulty progression and reward ranges"
# Run one scripted "canonical" trajectory per difficulty tier and assert the
# final reported reward stays within [0, 1].
python - <<'PY'
"""Check TASKS covers all difficulty tiers and canonical runs score in [0, 1]."""
from env.environment import SupportTicketEnv
from env.models import Action
from env.tasks import TASKS

# Difficulty coverage: every expected tier must appear at least once.
difficulties = {task["difficulty"] for task in TASKS.values()}
expected = {"easy", "medium", "hard"}
if not expected.issubset(difficulties):
    raise SystemExit(f"Missing expected difficulties: {expected - difficulties}")

# Canonical (intended-solution) action sequences, one per difficulty tier.
canonical = {
    "task_easy_1": [
        Action(action_type="check_policy", parameters={}),
        Action(action_type="issue_refund", parameters={"amount": "full"}),
        Action(action_type="close_ticket", parameters={"resolution": "refunded"}),
    ],
    "task_medium_1": [
        Action(action_type="check_policy", parameters={}),
        Action(action_type="reply_to_customer", parameters={"message": "Policy explained - no refund"}),
        Action(action_type="close_ticket", parameters={"resolution": "policy_explained"}),
    ],
    "task_hard_1": [
        Action(action_type="fetch_user_data", parameters={"user_id": "USR-C3"}),
        Action(action_type="reply_to_customer", parameters={"message": "Escalating to billing tier 2."}),
        Action(action_type="escalate", parameters={"reason": "billing_tier2"}),
    ],
}

# Fail fast with a clear message if a canonical task id drifts out of TASKS,
# rather than surfacing whatever error SupportTicketEnv raises for unknown ids.
unknown = [task_id for task_id in canonical if task_id not in TASKS]
if unknown:
    raise SystemExit(f"Canonical task ids missing from TASKS: {unknown}")

for task_id, actions in canonical.items():
    env = SupportTicketEnv(task_id=task_id)
    env.reset()
    final_score = 0.0
    for a in actions:
        _, _, done, info = env.step(a)
        # Track the most recent reported reward; keep the previous value if a
        # step's info dict omits "current_reward".
        final_score = info.get("current_reward", final_score)
        if done:
            break
    if not (0.0 <= final_score <= 1.0):
        raise SystemExit(f"Score out of range for {task_id}: {final_score}")

print("Task checks OK")
PY
echo "[validate] Running baseline evaluation harness"
# Presumably evaluate.py exits non-zero on failure — verify; under set -e
# (enabled at the top of this script) that would abort validation here.
python evaluate.py
echo "[validate] Checking inference script smoke-run and timing"
# Provide safe defaults so the smoke-run works without real credentials;
# callers may override any of these via the environment.
export API_BASE_URL="${API_BASE_URL:-https://api.openai.com/v1}"
export MODEL_NAME="${MODEL_NAME:-gpt-4o}"
export HF_TOKEN="${HF_TOKEN:-dummy-key}"
# Time the run and capture stdout+stderr (timing output included) in the log.
# "|| true" is deliberate: a failing inference run is diagnosed below via the
# expected lifecycle markers in the log, not via the exit code.
/usr/bin/time -p python inference.py > /tmp/inference_validation.log 2>&1 || true
# Every lifecycle marker must appear in the log. -F treats each bracketed
# marker as a fixed string (no regex escaping needed); -- guards against the
# marker ever being mistaken for an option.
for marker in '[START]' '[STEP]' '[END]'; do
  if ! grep -qF -- "$marker" /tmp/inference_validation.log; then
    echo "Missing $marker in inference output"
    exit 1
  fi
done
echo "[validate] All non-docker validation checks completed"