File size: 4,272 Bytes
aae9736
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env bash
# Non-docker validation driver: runs the test suite, config checks, API smoke
# tests, task/reward sanity checks, the baseline eval, and an inference
# smoke-run. -e aborts on the first failing command, -u on unset variables,
# and pipefail makes a failure anywhere in a pipeline fatal.
set -euo pipefail

echo "[validate] Running pytest"
# Full repository test suite.
python -m pytest -q

echo "[validate] Running grader determinism/bounds checks"
# Runs the grader tests on their own so a grader regression is reported
# explicitly (presumably already covered by the full run above — the
# duplication looks intentional for clearer output).
python -m pytest -q tests/test_graders.py

echo "[validate] Verifying openenv.yaml parses"
python - <<'PY'
"""Sanity-check openenv.yaml: it must parse and carry every required key."""
import yaml

# Keys the environment manifest is required to declare.
REQUIRED_KEYS = (
    "name",
    "version",
    "description",
    "action_space",
    "observation_space",
    "reward_description",
)

with open("openenv.yaml", "r", encoding="utf-8") as handle:
    manifest = yaml.safe_load(handle)

# Collect every required key the manifest lacks, preserving declaration order.
missing = [key for key in REQUIRED_KEYS if key not in manifest]
if missing:
    raise SystemExit(f"openenv.yaml missing required keys: {missing}")

print("openenv.yaml OK")
PY

echo "[validate] Verifying API endpoints and reset/step/state behavior"
python - <<'PY'
"""Smoke-test the FastAPI app: /, /reset, /step and /state must all return 200."""
from fastapi.testclient import TestClient
from server.app import app

client = TestClient(app)


def require_ok(response, label):
    # Abort the whole validation run on any non-200 response.
    if response.status_code != 200:
        raise SystemExit(f"{label} failed with status {response.status_code}")


require_ok(client.get("/"), "GET /")

reset_resp = client.post("/reset", json={"task_id": "task_easy_1"})
require_ok(reset_resp, "POST /reset")

# /reset must hand back a session id for the follow-up calls to use.
session_id = reset_resp.json().get("session_id")
if not session_id:
    raise SystemExit("/reset response missing session_id")

step_payload = {
    "session_id": session_id,
    "action": {"action_type": "check_policy", "parameters": {}},
}
require_ok(client.post("/step", json=step_payload), "POST /step")

require_ok(client.get(f"/state?session_id={session_id}"), "GET /state")

print("API endpoint checks OK")
PY

echo "[validate] Verifying task difficulty progression and reward ranges"
python - <<'PY'
"""Check difficulty coverage and that canonical rollouts score within [0, 1]."""
from env.tasks import TASKS
from env.environment import SupportTicketEnv
from env.models import Action


def make_actions(specs):
    # Build Action objects from (action_type, parameters) pairs.
    return [Action(action_type=kind, parameters=params) for kind, params in specs]


# Every expected difficulty tier must appear at least once among the tasks.
expected = {"easy", "medium", "hard"}
difficulties = {task["difficulty"] for task in TASKS.values()}
if not expected.issubset(difficulties):
    raise SystemExit(f"Missing expected difficulties: {expected - difficulties}")

# One canonical action sequence per difficulty tier.
canonical = {
    "task_easy_1": make_actions([
        ("check_policy", {}),
        ("issue_refund", {"amount": "full"}),
        ("close_ticket", {"resolution": "refunded"}),
    ]),
    "task_medium_1": make_actions([
        ("check_policy", {}),
        ("reply_to_customer", {"message": "Policy explained - no refund"}),
        ("close_ticket", {"resolution": "policy_explained"}),
    ]),
    "task_hard_1": make_actions([
        ("fetch_user_data", {"user_id": "USR-C3"}),
        ("reply_to_customer", {"message": "Escalating to billing tier 2."}),
        ("escalate", {"reason": "billing_tier2"}),
    ]),
}

for task_id, actions in canonical.items():
    env = SupportTicketEnv(task_id=task_id)
    env.reset()
    score = 0.0
    for action in actions:
        _, _, done, info = env.step(action)
        # Track the most recent reward reported by the environment; keep the
        # previous value if the step omits it.
        score = info.get("current_reward", score)
        if done:
            break
    if not 0.0 <= score <= 1.0:
        raise SystemExit(f"Score out of range for {task_id}: {score}")

print("Task checks OK")
PY

echo "[validate] Running baseline evaluation harness"
python evaluate.py

echo "[validate] Checking inference script smoke-run and timing"
# Provide fallback values so the smoke-run works without a configured env.
export API_BASE_URL="${API_BASE_URL:-https://api.openai.com/v1}"
export MODEL_NAME="${MODEL_NAME:-gpt-4o}"
export HF_TOKEN="${HF_TOKEN:-dummy-key}"

# Best-effort run: even if inference.py exits non-zero we still inspect the
# captured log (stdout + stderr, including /usr/bin/time's report) below.
INFERENCE_LOG=/tmp/inference_validation.log
/usr/bin/time -p python inference.py > "$INFERENCE_LOG" 2>&1 || true

# The inference output must contain each lifecycle marker; fail on the first
# one that is absent (same order as before: START, then STEP, then END).
for marker in START STEP END; do
  if ! grep -q "\[${marker}\]" "$INFERENCE_LOG"; then
    echo "Missing [${marker}] in inference output"
    exit 1
  fi
done

echo "[validate] All non-docker validation checks completed"