#!/usr/bin/env bash
# Fail fast: abort on any command error (-e), unset variable (-u),
# or failure anywhere in a pipeline (pipefail).
set -euo pipefail

# Run the full test suite first, then re-run the grader tests on their own
# so determinism/bounds failures are reported as a distinct validation stage.
echo "[validate] Running pytest"
python -m pytest -q
echo "[validate] Running grader determinism/bounds checks"
python -m pytest -q tests/test_graders.py
echo "[validate] Verifying openenv.yaml parses"
# Parse the manifest and verify the required top-level keys are present.
# NOTE: the here-doc body below was previously unindented and would have
# crashed with IndentationError before any check ran.
python - <<'PY'
import yaml

with open("openenv.yaml", "r", encoding="utf-8") as f:
    data = yaml.safe_load(f)

# safe_load returns None for an empty file (and may return a scalar/list);
# guard so the membership test below cannot raise TypeError.
if not isinstance(data, dict):
    raise SystemExit(f"openenv.yaml did not parse to a mapping: {type(data).__name__}")

required = ["name", "version", "description", "action_space", "observation_space", "reward_description"]
missing = [k for k in required if k not in data]
if missing:
    raise SystemExit(f"openenv.yaml missing required keys: {missing}")
print("openenv.yaml OK")
PY
echo "[validate] Verifying API endpoints and reset/step/state behavior"
# Exercise the FastAPI app in-process: root, /reset, /step, /state.
# NOTE: the here-doc body below was previously unindented and would have
# crashed with IndentationError before any check ran.
python - <<'PY'
from fastapi.testclient import TestClient
from server.app import app

client = TestClient(app)

# Root endpoint must be reachable.
r = client.get("/")
if r.status_code != 200:
    raise SystemExit(f"GET / failed with status {r.status_code}")

# /reset must create a session and hand back its id.
reset_resp = client.post("/reset", json={"task_id": "task_easy_1"})
if reset_resp.status_code != 200:
    raise SystemExit(f"POST /reset failed with status {reset_resp.status_code}")
payload = reset_resp.json()
session_id = payload.get("session_id")
if not session_id:
    raise SystemExit("/reset response missing session_id")

# /step must accept an action for that session.
step_resp = client.post(
    "/step",
    json={
        "session_id": session_id,
        "action": {"action_type": "check_policy", "parameters": {}},
    },
)
if step_resp.status_code != 200:
    raise SystemExit(f"POST /step failed with status {step_resp.status_code}")

# /state must return the session's current state.
state_resp = client.get(f"/state?session_id={session_id}")
if state_resp.status_code != 200:
    raise SystemExit(f"GET /state failed with status {state_resp.status_code}")
print("API endpoint checks OK")
PY
echo "[validate] Verifying task difficulty progression and reward ranges"
# Check difficulty coverage, then run a canonical action trace per task and
# confirm the reported reward stays inside [0, 1].
# NOTE: the here-doc body below was previously unindented and would have
# crashed with IndentationError before any check ran.
python - <<'PY'
from env.tasks import TASKS
from env.environment import SupportTicketEnv
from env.models import Action

# Difficulty coverage: the task set must span easy/medium/hard.
difficulties = {task["difficulty"] for task in TASKS.values()}
expected = {"easy", "medium", "hard"}
if not expected.issubset(difficulties):
    raise SystemExit(f"Missing expected difficulties: {expected - difficulties}")

# Reward range check across canonical task runs.
canonical = {
    "task_easy_1": [
        Action(action_type="check_policy", parameters={}),
        Action(action_type="issue_refund", parameters={"amount": "full"}),
        Action(action_type="close_ticket", parameters={"resolution": "refunded"}),
    ],
    "task_medium_1": [
        Action(action_type="check_policy", parameters={}),
        Action(action_type="reply_to_customer", parameters={"message": "Policy explained - no refund"}),
        Action(action_type="close_ticket", parameters={"resolution": "policy_explained"}),
    ],
    "task_hard_1": [
        Action(action_type="fetch_user_data", parameters={"user_id": "USR-C3"}),
        Action(action_type="reply_to_customer", parameters={"message": "Escalating to billing tier 2."}),
        Action(action_type="escalate", parameters={"reason": "billing_tier2"}),
    ],
}

for task_id, actions in canonical.items():
    env = SupportTicketEnv(task_id=task_id)
    env.reset()
    final_score = 0.0
    for a in actions:
        _, _, done, info = env.step(a)
        # Keep the last reported reward; fall back to the previous value
        # if a step omits the key.
        final_score = info.get("current_reward", final_score)
        if done:
            break
    if not (0.0 <= final_score <= 1.0):
        raise SystemExit(f"Score out of range for {task_id}: {final_score}")
print("Task checks OK")
PY
echo "[validate] Running baseline evaluation harness"
python evaluate.py

echo "[validate] Checking inference script smoke-run and timing"
# Supply harmless defaults so the smoke-run works without a configured env.
export API_BASE_URL="${API_BASE_URL:-https://api.openai.com/v1}"
export MODEL_NAME="${MODEL_NAME:-gpt-4o}"
export HF_TOKEN="${HF_TOKEN:-dummy-key}"

# Time the run; tolerate a nonzero exit (|| true) — the marker checks
# below are what decide pass/fail for this stage.
/usr/bin/time -p python inference.py > /tmp/inference_validation.log 2>&1 || true

# The log must contain each lifecycle marker, checked in START/STEP/END order.
for marker in START STEP END; do
  if ! grep -q "\[${marker}\]" /tmp/inference_validation.log; then
    echo "Missing [${marker}] in inference output"
    exit 1
  fi
done

echo "[validate] All non-docker validation checks completed"