# run_tests.sh — PayOps v2 full test suite
# Revision ffea7f4 — Iteration_4: expand task bank 20→30, add variants for all
# tasks, fix baseline policy for chain-gated CRIT tasks.
# Usage: bash payops_env/run_tests.sh [BASE_URL]
# Requires the server to be running. Default: http://localhost:8000
# Target server: first CLI argument, falling back to the local default.
BASE=${1:-http://localhost:8000}
# Running tallies updated by the check helpers and printed in the summary.
PASS=0 FAIL=0 SKIP=0
# ── helpers ──────────────────────────────────────────────────────────────────
#######################################
# Assert that a response matches an extended regular expression.
# Globals:   PASS, FAIL (incremented)
# Arguments: $1 - test name shown in the report
#            $2 - text to search (typically a JSON response body)
#            $3 - ERE pattern that must match somewhere in $2
# Outputs:   one coloured result line; on failure also the pattern and the
#            first 200 characters of the text
#######################################
check() {
  local name="$1"
  local got="$2"
  local want="$3"
  # A here-string feeds the text verbatim (echo would swallow values such as
  # "-n" or "-e"), and "--" stops grep from parsing a pattern that starts
  # with "-" as an option.
  if grep -qE -- "$want" <<<"$got"; then
    printf " \033[32m✓\033[0m %s\n" "$name"
    PASS=$((PASS+1))
  else
    printf " \033[31m✗\033[0m %s\n" "$name"
    printf " expected pattern : %s\n" "$want"
    # Character-based truncation never splits a multibyte sequence.
    printf " got : %s\n" "${got:0:200}"
    FAIL=$((FAIL+1))
  fi
}
#######################################
# Assert that a response does NOT match an extended regular expression.
# Globals:   PASS, FAIL (incremented)
# Arguments: $1 - test name shown in the report
#            $2 - text to search
#            $3 - ERE pattern that must not match anywhere in $2
# Outputs:   one coloured result line (failure line includes the pattern)
#######################################
check_absent() {
  local name="$1"
  local got="$2"
  local absent="$3"
  # Here-string + "--": with echo/bare grep, text like "-e" or a pattern that
  # starts with "-" made grep fail, which this helper would report as a PASS.
  if grep -qE -- "$absent" <<<"$got"; then
    printf " \033[31m✗\033[0m %s (unexpected: %s)\n" "$name" "$absent"
    FAIL=$((FAIL+1))
  else
    printf " \033[32m✓\033[0m %s\n" "$name"
    PASS=$((PASS+1))
  fi
}
# POST a JSON action to the /step endpoint and print the response body.
# Arguments: $1 - JSON payload
step() {
  local payload=$1
  curl -s -X POST "$BASE/step" -H "Content-Type: application/json" -d "$payload"
}
# Start a fresh episode via POST /reset with seed 0; the response is discarded.
reset() {
  local -r seed_payload='{"seed":0}'
  curl -s -X POST "$BASE/reset" \
    -H 'Content-Type: application/json' \
    -d "$seed_payload" >/dev/null
}
# Print a blank line followed by a "── title ──" group heading.
# Arguments: $1 - heading text
section() {
  printf '\n── %s ──\n' "$1"
}
# Opening banner: suite title plus the server under test.
printf '\n'
printf '%s\n' \
  "╔══════════════════════════════════════════════════════╗" \
  "║ PayOps v2 — Full Post-Main Test Suite ║" \
  "║ Target: $BASE" \
  "╚══════════════════════════════════════════════════════╝"
printf '\n'
# ═══════════════════════════════════════════════════════════
# GROUP A — Server / Infrastructure
# ═══════════════════════════════════════════════════════════
section "GROUP A — Server / Infrastructure"

# Each endpoint is fetched once and all related assertions run against the
# captured body. The URL is quoted so it is never word-split or globbed.
HEALTH_RESP="$(curl -s "$BASE/health")"
SCHEMA_RESP="$(curl -s "$BASE/schema")"

# A-01 Health check
check "A-01 GET /health → status ok" "$HEALTH_RESP" '"status":"ok"'

# A-02 Version is v2 (dots escaped so they only match a literal ".")
check "A-02 GET /health → version 2.0.0" "$HEALTH_RESP" '"version":"2\.0\.0"'

# A-03 Schema action model present
check "A-03 GET /schema → PayOpsAction schema" "$SCHEMA_RESP" '"PayOpsAction"'

# A-04 Schema observation model present
check "A-04 GET /schema → PayOpsObservation schema" "$SCHEMA_RESP" '"PayOpsObservation"'

# A-05 Schema state model present
check "A-05 GET /schema → PayOpsState schema" "$SCHEMA_RESP" '"PayOpsState"'
# ═══════════════════════════════════════════════════════════
# GROUP B — Tasks endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP B — Tasks endpoint"

# One fetch of /tasks feeds every assertion in this group.
TASKS_RESP="$(curl -s "$BASE/tasks")"

# B-01 30 tasks
check "B-01 GET /tasks → count=30" \
  "$TASKS_RESP" '"count":30'

# B-02..B-05 All 4 difficulty tiers present
check "B-02 Tasks include 'easy' tier" "$TASKS_RESP" '"difficulty":"easy"'
check "B-03 Tasks include 'medium' tier" "$TASKS_RESP" '"difficulty":"medium"'
check "B-04 Tasks include 'hard' tier" "$TASKS_RESP" '"difficulty":"hard"'
check "B-05 Tasks include 'critical' tier" "$TASKS_RESP" '"difficulty":"critical"'

# B-06 Regulatory tasks present
check "B-06 Tasks include regulatory_action flag" \
  "$TASKS_RESP" '"regulatory_action":true'

# B-07 Multi-step chain tasks present (looks for a task with chain_total 3)
check "B-07 Tasks include chain_total > 1" \
  "$TASKS_RESP" '"chain_total":3'

# B-08 requires_investigation populated
check "B-08 Tasks include requires_investigation" \
  "$TASKS_RESP" '"requires_investigation":\['
# ═══════════════════════════════════════════════════════════
# GROUP C — Reset
# ═══════════════════════════════════════════════════════════
section "GROUP C — Reset"

# Body-less POST /reset; the first observation is checked field by field.
RESET_RESP="$(curl -s -X POST "$BASE/reset")"

check "C-01 POST /reset → returns EASY-001" "$RESET_RESP" '"task_id":"EASY-001"'
check "C-02 POST /reset → done=false" "$RESET_RESP" '"done":false'
# Dot escaped so only a literal "5.0" satisfies the budget check.
check "C-03 POST /reset → budget_remaining=5.0" "$RESET_RESP" '"budget_remaining":5\.0'
check "C-04 POST /reset → risk_score present" "$RESET_RESP" '"risk_score":'
check "C-05 POST /reset → ml_confidence present" "$RESET_RESP" '"ml_confidence":'
check "C-06 POST /reset → chain_total present" "$RESET_RESP" '"chain_total":'
# ═══════════════════════════════════════════════════════════
# GROUP D — Terminal Actions (correct decisions)
# ═══════════════════════════════════════════════════════════
section "GROUP D — Terminal Actions (correct decisions)"
reset

# Each step below is the expected terminal action for the current task, so
# the episode walks EASY-001 … MED-003 in order. Reward patterns escape the
# dot: an unescaped "1.0" would also accept values such as "100".

# D-01 approve correct → reward=1.0
check "D-01 EASY-001 approve → reward=1.0" \
  "$(step '{"action_type":"approve","transaction_id":"TXN-E001"}')" \
  '"reward":1\.0'

# D-02 reject correct → reward=1.0
check "D-02 EASY-002 reject → reward=1.0" \
  "$(step '{"action_type":"reject","transaction_id":"TXN-E002"}')" \
  '"reward":1\.0'

# D-03 approve correct (refund) → reward=1.0
check "D-03 EASY-003 approve → reward=1.0" \
  "$(step '{"action_type":"approve","transaction_id":"TXN-E003"}')" \
  '"reward":1\.0'

# D-04 flag correct → reward=1.0
check "D-04 EASY-004 flag → reward=1.0" \
  "$(step '{"action_type":"flag","transaction_id":"TXN-E004"}')" \
  '"reward":1\.0'

# D-04b approve correct (mortgage repayment) → reward=1.0
check "D-04b EASY-005 approve → reward=1.0" \
  "$(step '{"action_type":"approve","transaction_id":"TXN-E005"}')" \
  '"reward":1\.0'

# D-04c flag correct (duplicate payment) → reward=1.0
check "D-04c EASY-006 flag → reward=1.0" \
  "$(step '{"action_type":"flag","transaction_id":"TXN-E006"}')" \
  '"reward":1\.0'

# D-05 escalate correct → reward=1.0
check "D-05 MED-001 escalate → reward=1.0" \
  "$(step '{"action_type":"escalate","transaction_id":"TXN-M001"}')" \
  '"reward":1\.0'

# D-06 hold correct → reward=1.0
check "D-06 MED-002 hold → reward=1.0" \
  "$(step '{"action_type":"hold","transaction_id":"TXN-M002"}')" \
  '"reward":1\.0'

# D-07 task advances after terminal
R=$(step '{"action_type":"flag","transaction_id":"TXN-M003"}')
check "D-07 After flag on MED-003, next task is MED-004" "$R" '"task_id":"MED-004"'
# ═══════════════════════════════════════════════════════════
# GROUP E — Terminal Actions (wrong decisions / partial credit)
# ═══════════════════════════════════════════════════════════
section "GROUP E — Wrong actions & partial credit"
reset

# E-01 approve when the correct action is reject → -1.0 (fraud approval)
step '{"action_type":"approve","transaction_id":"TXN-E001"}' > /dev/null  # advance past E001
check "E-01 Approve fraud (EASY-002) → reward=-1.0" \
  "$(step '{"action_type":"approve","transaction_id":"TXN-E002"}')" \
  '"reward":-1\.0'

reset
step '{"action_type":"approve","transaction_id":"TXN-E001"}' > /dev/null
step '{"action_type":"reject","transaction_id":"TXN-E002"}' > /dev/null
# E-02 reject when the correct action is approve → -0.5
check "E-02 Reject legit (EASY-003) → reward=-0.5" \
  "$(step '{"action_type":"reject","transaction_id":"TXN-E003"}')" \
  '"reward":-0\.5'

reset
# E-03 partial credit — escalate is the correct action on MED-001;
# flagging instead should earn a partial (non-zero) reward.
# Play the six EASY tasks correctly to reach MED-001.
step '{"action_type":"approve","transaction_id":"TXN-E001"}' > /dev/null
step '{"action_type":"reject","transaction_id":"TXN-E002"}' > /dev/null
step '{"action_type":"approve","transaction_id":"TXN-E003"}' > /dev/null
step '{"action_type":"flag","transaction_id":"TXN-E004"}' > /dev/null
step '{"action_type":"approve","transaction_id":"TXN-E005"}' > /dev/null
step '{"action_type":"flag","transaction_id":"TXN-E006"}' > /dev/null
R=$(step '{"action_type":"flag","transaction_id":"TXN-M001"}')
# Requires a non-zero digit after "0." so a reward of 0.0 cannot pass a
# test that claims "reward > 0".
check "E-03 Partial credit: flag on MED-001 (correct=escalate) → reward > 0" \
  "$R" '"reward":0\.[0-9]*[1-9]'
# ═══════════════════════════════════════════════════════════
# GROUP F — Investigation Sub-Actions
# ═══════════════════════════════════════════════════════════
section "GROUP F — Investigation sub-actions"
reset

# Numeric patterns escape the dot so they only match the literal value.
# The "reward=0.0" checks also require a non-digit (or end of text) after
# the value; otherwise a reward such as 0.05 would satisfy them.

# F-01 inspect → reward=0.15, stays on same task
R=$(step '{"action_type":"inspect","transaction_id":"TXN-E001"}')
check "F-01 inspect → reward=0.15" "$R" '"reward":0\.15'
check "F-02 inspect → does NOT advance task (still EASY-001)" "$R" '"task_id":"EASY-001"'
check "F-03 inspect → inspection_notes populated" "$R" '"inspection_notes":"'
check "F-04 inspect → budget_remaining=4.9" "$R" '"budget_remaining":4\.9'

# F-05 inspect again on same task → reward=0.0 (no double-dip)
check "F-05 second inspect → reward=0.0" \
  "$(step '{"action_type":"inspect","transaction_id":"TXN-E001"}')" \
  '"reward":0\.0([^0-9]|$)'

# Advance to EASY-002
step '{"action_type":"approve","transaction_id":"TXN-E001"}' > /dev/null

# F-06 request_docs → reward=0.15, docs_notes populated
R=$(step '{"action_type":"request_docs","transaction_id":"TXN-E002"}')
check "F-06 request_docs → reward=0.15" "$R" '"reward":0\.15'
check "F-07 request_docs → docs_notes populated" "$R" '"docs_notes":"'
check "F-08 request_docs → budget_remaining=4.6 (cost=0.2)" "$R" '"budget_remaining":4\.6'

# F-09 request_docs again → reward=0.0
check "F-09 second request_docs → reward=0.0" \
  "$(step '{"action_type":"request_docs","transaction_id":"TXN-E002"}')" \
  '"reward":0\.0([^0-9]|$)'
step '{"action_type":"reject","transaction_id":"TXN-E002"}' > /dev/null  # advance

# F-10 verify_kyc → reward=0.15, kyc_notes populated
R=$(step '{"action_type":"verify_kyc","transaction_id":"TXN-E003"}')
check "F-10 verify_kyc → reward=0.15" "$R" '"reward":0\.15'
check "F-11 verify_kyc → kyc_notes populated" "$R" '"kyc_notes":"'
check "F-12 verify_kyc → budget cost=0.2 deducted" "$R" '"budget_remaining":4\.[0-9]'
step '{"action_type":"approve","transaction_id":"TXN-E003"}' > /dev/null

# F-13 contact_sender → reward=0.15, contact_notes populated
R=$(step '{"action_type":"contact_sender","transaction_id":"TXN-E004"}')
check "F-13 contact_sender → reward=0.15" "$R" '"reward":0\.15'
check "F-14 contact_sender → contact_notes populated" "$R" '"contact_notes":"'
check "F-15 contact_sender → budget cost=0.3 deducted" "$R" '"budget_remaining":3\.[0-9]'
step '{"action_type":"flag","transaction_id":"TXN-E004"}' > /dev/null

# F-16 file_sar → reward=0.15, docs_notes mentions SAR
# NOTE(review): only four terminals have been played since the reset, so the
# active task is presumably EASY-005 (the other groups walk EASY-001…006
# before MED-001). The TXN-M001 id looks stale — confirm whether the server
# validates transaction_id before relying on this step.
R=$(step '{"action_type":"file_sar","transaction_id":"TXN-M001"}')
check "F-16 file_sar → reward=0.15" "$R" '"reward":0\.15'
check "F-17 file_sar → docs_notes mentions SAR" "$R" '"docs_notes":"SAR'
check "F-18 file_sar → budget cost=0.05 deducted" "$R" '"budget_remaining":3\.[0-9]'
# ═══════════════════════════════════════════════════════════
# GROUP G — /state endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP G — State endpoint"

# Inspects the episode left in progress by the preceding steps; the URL is
# quoted so it is never word-split or globbed.
STATE="$(curl -s "$BASE/state")"

check "G-01 GET /state → episode_id set" "$STATE" '"episode_id":"'
check "G-02 GET /state → step_count > 0" "$STATE" '"step_count":[1-9]'
check "G-03 GET /state → budget_spent > 0" "$STATE" '"budget_spent":[0-9]'
check "G-04 GET /state → investigation_actions_used list" "$STATE" '"investigation_actions_used":\['
check "G-05 GET /state → recent_decisions list" "$STATE" '"recent_decisions":\['
check "G-06 GET /state → correct_decisions >= 0" "$STATE" '"correct_decisions":[0-9]'
# ═══════════════════════════════════════════════════════════
# GROUP H — /grader endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP H — Grader endpoint"

# Field-presence checks on the grader report for the current episode.
GRADER="$(curl -s "$BASE/grader")"

check "H-01 GET /grader → normalised_score present" "$GRADER" '"normalised_score":'
check "H-02 GET /grader → total_reward present" "$GRADER" '"total_reward":'
check "H-03 GET /grader → budget_spent present" "$GRADER" '"budget_spent":'
check "H-04 GET /grader → budget_penalty present" "$GRADER" '"budget_penalty":'
check "H-05 GET /grader → per_task array" "$GRADER" '"per_task":\['
check "H-06 GET /grader → reward_breakdown in per_task" "$GRADER" '"reward_breakdown":'
check "H-07 GET /grader → passed field present" "$GRADER" '"passed":'
# ═══════════════════════════════════════════════════════════
# GROUP I — /replay endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP I — /replay endpoint (offline eval)"

# I-01 Replay with all 30 correct terminal actions (no investigation)
# Hard/critical tasks should be penalised (×0.80) → 0.7–0.9 range
# Task order: EASY×6, MED×8, HARD×10, CRIT×6
# Correct actions:
#   EASY: approve reject approve flag approve flag
#   MED:  escalate hold flag flag hold escalate hold flag
#   HARD: escalate reject reject approve escalate flag reject reject escalate reject
#   CRIT: approve reject escalate reject reject escalate
REPLAY_PERFECT='{"actions":["approve","reject","approve","flag","approve","flag",
"escalate","hold","flag","flag","hold","escalate","hold","flag",
"escalate","reject","reject","approve","escalate","flag","reject","reject","escalate","reject",
"approve","reject","escalate","reject","reject","escalate"]}'
R=$(curl -s -X POST "$BASE/replay" \
  -H "Content-Type: application/json" \
  -d "$REPLAY_PERFECT")
check "I-01 Replay correct terminals (no investigation) → score<1.0 (hard/critical penalised)" "$R" '"normalised_score":0\.[7-9]'
check "I-02 Replay → passed=true (score>0.5 despite penalty)" "$R" '"passed":true'
# Dots escaped so "0.0" / "0.1" below only match the literal value.
check "I-03 Replay → budget_spent=0.0 (no inv actions)" "$R" '"budget_spent":0\.0'

# I-04 Replay with investigation actions included (inspect before first task)
REPLAY_WITH_INV='{"actions":["inspect","approve","reject","approve","flag","approve","flag",
"escalate","hold","flag","flag","hold","escalate","hold","flag",
"escalate","reject","reject","approve","escalate","flag","reject","reject","escalate","reject",
"approve","reject","escalate","reject","reject","escalate"]}'
R=$(curl -s -X POST "$BASE/replay" \
  -H "Content-Type: application/json" \
  -d "$REPLAY_WITH_INV")
check "I-04 Replay with inspect → budget_spent=0.1" "$R" '"budget_spent":0\.1'

# I-05 Replay with invalid action → 422
check "I-05 Replay invalid action → 422 error" \
  "$(curl -s -X POST "$BASE/replay" \
    -H 'Content-Type: application/json' \
    -d '{"actions":["delete","approve"]}')" \
  "Invalid action"

# I-06/I-07 Replay result has per_task breakdown ($R is still the I-04 reply)
check "I-06 Replay → per_task array present" "$R" '"per_task":\['
check "I-07 Replay → reward_breakdown in each task" "$R" '"reward_breakdown":'

# I-08 Replay with confidences
REPLAY_CONF='{"actions":["approve","reject"],"confidences":[0.95,0.90]}'
check "I-08 Replay with confidences → ok" \
  "$(curl -s -X POST "$BASE/replay" \
    -H 'Content-Type: application/json' \
    -d "$REPLAY_CONF")" \
  '"normalised_score":'
# ═══════════════════════════════════════════════════════════
# GROUP J — /baseline endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP J — Baseline agent"

# A single POST /baseline; every check below inspects its report.
BASELINE="$(curl -s -X POST "$BASE/baseline")"

check "J-01 POST /baseline → normalised_score present" "$BASELINE" '"normalised_score":'
check "J-02 POST /baseline → total_reward present" "$BASELINE" '"total_reward":'
check "J-03 POST /baseline → steps > 0" "$BASELINE" '"steps":[1-9]'
check "J-04 POST /baseline → per_task scores present" "$BASELINE" '"scores":\['
check "J-05 POST /baseline → score >= 0.5 (passes)" "$BASELINE" '"normalised_score":(1\.0|0\.[5-9])'
# ═══════════════════════════════════════════════════════════
# GROUP K — /analytics endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP K — Analytics endpoint"

# Run a full episode first so analytics has data
reset
BASE="$BASE" python3 - <<'PYEOF'
import json
import os
import ssl
import urllib.request

BASE = os.environ.get("BASE", "http://localhost:8000")

# Certificate and hostname verification are switched off for these requests
# (presumably to allow self-signed HTTPS targets — confirm before pointing
# this at anything sensitive).
_ssl = ssl.create_default_context()
_ssl.check_hostname = False
_ssl.verify_mode = ssl.CERT_NONE


def post(path, body=None):
    """POST an optional JSON body to BASE+path and return the decoded reply."""
    req = urllib.request.Request(
        f"{BASE}{path}",
        data=json.dumps(body).encode() if body else None,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req, context=_ssl) as r:
        return json.loads(r.read())


post("/reset", {"seed": 0})

# EASY x6 + MED x8 + HARD x10: no chain gate, terminal only
TERMINAL_ONLY = [
    ("approve", "TXN-E001"), ("reject", "TXN-E002"), ("approve", "TXN-E003"),
    ("flag", "TXN-E004"), ("approve", "TXN-E005"), ("flag", "TXN-E006"),
    ("escalate", "TXN-M001"), ("hold", "TXN-M002"), ("flag", "TXN-M003"),
    ("flag", "TXN-M004"), ("hold", "TXN-M005"), ("escalate", "TXN-M006"),
    ("hold", "TXN-M007"), ("flag", "TXN-M008"),
    ("escalate", "TXN-H001"), ("reject", "TXN-H002"), ("reject", "TXN-H003"),
    ("approve", "TXN-H004"), ("escalate", "TXN-H005"), ("flag", "TXN-H006"),
    ("reject", "TXN-H007"), ("reject", "TXN-H008"), ("escalate", "TXN-H009"),
    ("reject", "TXN-H010"),
]
for action, txn in TERMINAL_ONLY:
    post("/step", {"action_type": action, "transaction_id": txn})

# CRIT x6: chain-gated — must provide chain_min investigation steps first.
# Each entry lists the investigation actions followed by the terminal action.
CHAIN_GATED = [
    ("TXN-C001", ["inspect", "request_docs", "approve"]),
    ("TXN-C002", ["inspect", "reject"]),
    ("TXN-C003", ["request_docs", "inspect", "escalate"]),
    ("TXN-C004", ["inspect", "reject"]),
    ("TXN-C005", ["inspect", "verify_kyc", "reject"]),
    ("TXN-C006", ["inspect", "escalate"]),
]
for txn, actions in CHAIN_GATED:
    for action in actions:
        post("/step", {"action_type": action, "transaction_id": txn})
PYEOF

ANA="$(curl -s "$BASE/analytics")"
check "K-01 GET /analytics → episodes_completed >= 1" "$ANA" '"episodes_completed":[1-9]'
check "K-02 GET /analytics → best_score present" "$ANA" '"best_score":'
check "K-03 GET /analytics → avg_score present" "$ANA" '"avg_score":'
check "K-04 GET /analytics → avg_budget_spent present" "$ANA" '"avg_budget_spent":'
check "K-05 GET /analytics → current_episode present" "$ANA" '"current_episode":'
check "K-06 GET /analytics → by_difficulty present" "$ANA" '"by_difficulty":'
check "K-07 GET /analytics → easy accuracy" "$ANA" '"easy":'
check "K-08 GET /analytics → critical accuracy" "$ANA" '"critical":'
# ═══════════════════════════════════════════════════════════
# GROUP L — /leaderboard endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP L — Leaderboard endpoint"

# Expects at least one recorded episode (a full one is played in group K).
LB="$(curl -s "$BASE/leaderboard")"

check "L-01 GET /leaderboard → count >= 1 (episode recorded)" "$LB" '"count":[1-9]'
check "L-02 GET /leaderboard → entries array present" "$LB" '"entries":\['
check "L-03 GET /leaderboard → episode_id in entry" "$LB" '"episode_id":"'
check "L-04 GET /leaderboard → normalised_score in entry" "$LB" '"normalised_score":'
check "L-05 GET /leaderboard → timestamp in entry" "$LB" '"timestamp":"'
check "L-06 GET /leaderboard → budget_spent in entry" "$LB" '"budget_spent":'
# ═══════════════════════════════════════════════════════════
# GROUP M — Perfect episode score
# ═══════════════════════════════════════════════════════════
section "GROUP M — Perfect episode (all 30 correct + investigation on hard/critical)"
reset
BASE="$BASE" python3 - <<'PYEOF'
import json
import os
import ssl
import urllib.request

BASE = os.environ.get("BASE", "http://localhost:8000")

# Certificate and hostname verification are switched off for these requests
# (presumably to allow self-signed HTTPS targets — confirm before pointing
# this at anything sensitive).
_ssl = ssl.create_default_context()
_ssl.check_hostname = False
_ssl.verify_mode = ssl.CERT_NONE


def post(path, body=None):
    """POST an optional JSON body to BASE+path and return the decoded reply."""
    req = urllib.request.Request(
        f"{BASE}{path}",
        data=json.dumps(body).encode() if body else None,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req, context=_ssl) as r:
        return json.loads(r.read())


post("/reset", {"seed": 0})

# Each entry: transaction id, then the actions to send in order. The last
# action of every entry is the terminal decision.
EPISODE = [
    # Easy x6 + Medium x8: terminal only
    ("TXN-E001", ["approve"]),
    ("TXN-E002", ["reject"]),
    ("TXN-E003", ["approve"]),
    ("TXN-E004", ["flag"]),
    ("TXN-E005", ["approve"]),
    ("TXN-E006", ["flag"]),
    ("TXN-M001", ["escalate"]),
    ("TXN-M002", ["hold"]),
    ("TXN-M003", ["flag"]),
    ("TXN-M004", ["flag"]),
    ("TXN-M005", ["hold"]),
    ("TXN-M006", ["escalate"]),
    ("TXN-M007", ["hold"]),
    ("TXN-M008", ["flag"]),
    # Hard x10: required investigation sub-action(s) before each terminal
    ("TXN-H001", ["inspect", "escalate"]),
    ("TXN-H002", ["contact_sender", "reject"]),
    ("TXN-H003", ["inspect", "reject"]),
    ("TXN-H004", ["inspect", "approve"]),
    ("TXN-H005", ["inspect", "contact_sender", "escalate"]),
    ("TXN-H006", ["inspect", "flag"]),
    ("TXN-H007", ["contact_sender", "reject"]),
    ("TXN-H008", ["contact_sender", "reject"]),
    ("TXN-H009", ["verify_kyc", "escalate"]),
    ("TXN-H010", ["contact_sender", "reject"]),
    # Critical x6: required investigation sub-actions before each terminal
    ("TXN-C001", ["inspect", "request_docs", "approve"]),
    ("TXN-C002", ["inspect", "reject"]),
    ("TXN-C003", ["request_docs", "inspect", "escalate"]),
    ("TXN-C004", ["inspect", "contact_sender", "reject"]),
    ("TXN-C005", ["inspect", "verify_kyc", "reject"]),
    ("TXN-C006", ["request_docs", "escalate"]),
]
for txn, actions in EPISODE:
    for action in actions:
        post("/step", {"action_type": action, "transaction_id": txn})
PYEOF

GRADER="$(curl -s "$BASE/grader")"
# Dots escaped so only the literal values 1.0 / 0.0 satisfy the checks.
check "M-01 Perfect episode → normalised_score=1.0" "$GRADER" '"normalised_score":1\.0'
check "M-02 Perfect episode → passed=true" "$GRADER" '"passed":true'
check "M-03 Perfect episode → budget_penalty=0.0" "$GRADER" '"budget_penalty":0\.0'
# ═══════════════════════════════════════════════════════════
# GROUP N — Difficulty weighting
# ═══════════════════════════════════════════════════════════
section "GROUP N — Difficulty weighting in grader"

# A critical task correct should contribute more weight than easy
REPLAY_CRIT='{"actions":["approve","reject","approve","flag","approve","flag",
"escalate","hold","flag","flag","hold","escalate","hold","flag",
"escalate","reject","reject","approve","escalate","flag","reject","reject","escalate","reject",
"approve","reject","escalate","reject","reject","escalate"]}'
FULL=$(curl -s -X POST "$BASE/replay" \
  -H "Content-Type: application/json" \
  -d "$REPLAY_CRIT")

check "N-01 Full correct replay → max_possible_reward includes weights" \
  "$FULL" '"max_possible_reward":4[0-9]\.'

# N-02 compares the two totals numerically in Python: prints DIFF when
# total_reward is more than 0.01 below max_possible_reward, EQUAL otherwise.
# The replay body is passed on stdin via a here-string.
check "N-02 Correct-but-no-investigation → total_reward below max (penalty applied)" \
  "$(python3 -c '
import json, sys
d = json.load(sys.stdin)
print("DIFF" if d["total_reward"] < d["max_possible_reward"] - 0.01 else "EQUAL")
' <<<"$FULL")" "DIFF"
| # ═══════════════════════════════════════════════════════════ | |
| # GROUP O — Budget mechanics | |
| # ═══════════════════════════════════════════════════════════ | |
| section "GROUP O — Budget mechanics" | |
| reset | |
| # Burn through budget with contact_sender (cost=0.3) × 5 = 1.5 | |
| # then request_docs × 5 = 1.0, then verify_kyc × 5 = 1.0 (total = 3.5 so far ≤5) | |
| # Total exceeding: won't exceed in these few calls, test budget tracking | |
| step '{"action_type":"contact_sender","transaction_id":"TXN-E001"}' > /dev/null # -0.30 | |
| step '{"action_type":"contact_sender","transaction_id":"TXN-E001"}' > /dev/null # dup, but cost still deducted | |
| step '{"action_type":"request_docs","transaction_id":"TXN-E001"}' > /dev/null # -0.20 | |
| step '{"action_type":"verify_kyc","transaction_id":"TXN-E001"}' > /dev/null # -0.20 | |
| step '{"action_type":"file_sar","transaction_id":"TXN-E001"}' > /dev/null # -0.05 | |
| R=$(step '{"action_type":"approve","transaction_id":"TXN-E001"}') | |
| # budget_remaining = 5.0 - 0.30 - 0.30 - 0.20 - 0.20 - 0.05 = 3.95 | |
| check "O-01 Budget correctly deducted after multiple inv actions" \ | |
| "$R" '"budget_remaining":3\.[0-9]' | |
| # grader shows budget_spent | |
| GRADER="$(curl -s $BASE/grader)" | |
| check "O-02 Grader → budget_spent > 0" "$GRADER" '"budget_spent":[1-9]' | |
| # replay that overshoots budget → budget_penalty > 0 | |
| HEAVY='{"actions":["contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","approve","reject","approve","flag","escalate","hold","flag","flag","hold","escalate","escalate","reject","reject","approve","escalate","flag","approve","reject","escalate","reject"]}' | |
| R=$(curl -s -X POST $BASE/replay \ | |
| -H "Content-Type: application/json" \ | |
| -d "$HEAVY") | |
| check "O-03 Over-budget replay → budget_penalty > 0" "$R" '"budget_penalty":0\.[0-9]*[1-9]' | |
| check "O-04 Over-budget replay → budget_overspend > 0" "$R" '"budget_overspend":0\.[1-9]' | |
# ═══════════════════════════════════════════════════════════
# GROUP P — Edge cases & errors
# ═══════════════════════════════════════════════════════════
section "GROUP P — Edge cases & error handling"

# P-01 Step right after a body-less reset, using a transaction id that is
# not in the task bank → the reply must still carry a reward field
check "P-01 Step on fresh env (reset then done) → handled" \
  "$(curl -s -X POST "$BASE/reset" > /dev/null && curl -s -X POST "$BASE/step" \
    -H 'Content-Type: application/json' \
    -d '{"action_type":"approve","transaction_id":"TXN-NONE"}')" \
  '"reward"'

# P-02 Unknown action_type → 422
check "P-02 Unknown action_type → 422" \
  "$(curl -s -X POST "$BASE/step" \
    -H 'Content-Type: application/json' \
    -d '{"action_type":"nuke","transaction_id":"TXN-E001"}')" \
  "Invalid action_type"

# P-03 Missing action_type field → 422
check "P-03 Missing action_type field → error" \
  "$(curl -s -X POST "$BASE/step" \
    -H 'Content-Type: application/json' \
    -d '{"transaction_id":"TXN-E001"}')" \
  '"detail"'

# P-04 Empty replay actions list
check "P-04 Replay empty actions → returns score" \
  "$(curl -s -X POST "$BASE/replay" \
    -H 'Content-Type: application/json' \
    -d '{"actions":[]}')" \
  '"normalised_score"'

# P-05 Replay with confidences shorter than actions → still works
check "P-05 Replay with partial confidences → score present" \
  "$(curl -s -X POST "$BASE/replay" \
    -H 'Content-Type: application/json' \
    -d '{"actions":["approve","reject"],"confidences":[0.9]}')" \
  '"normalised_score"'

# P-06 Grader with no terminal action since the last reset → either a score
# or an error payload is accepted
check "P-06 Grader before actions → handled gracefully" \
  "$(curl -s "$BASE/grader")" \
  '"normalised_score"|"error"'

# P-07 Analytics right after a fresh reset → a message or data is accepted
reset
check "P-07 Analytics after fresh reset → message or data" \
  "$(curl -s "$BASE/analytics")" \
  '"message"|"episodes_completed"'
# ═══════════════════════════════════════════════════════════
# GROUP Q — WebSocket
# ═══════════════════════════════════════════════════════════
section "GROUP Q — WebSocket (requires wscat or python)"

# The probe is a plain HTTP GET against /ws made from Python. Without python3
# the check is counted as skipped instead of failed.
if ! command -v python3 >/dev/null 2>&1; then
  printf " \033[33m-\033[0m Q-01 WebSocket check skipped (no python3)\n"
  SKIP=$((SKIP+1))
else
  WS_RESULT=$(BASE=$BASE python3 - <<'PYEOF'
import os, ssl, urllib.request, json
try:
    BASE = os.environ.get("BASE", "http://localhost:8000")
    _ssl = ssl.create_default_context()
    _ssl.check_hostname = False
    _ssl.verify_mode = ssl.CERT_NONE
    # ws upgrade check via HTTP (will get 426 Upgrade Required — proves endpoint exists)
    req = urllib.request.Request(f"{BASE}/ws")
    try:
        urllib.request.urlopen(req, context=_ssl)
    except Exception as e:
        msg = str(e)
        # NOTE(review): a 404 is also accepted as success here — confirm this
        # is intentional, since it weakens the "endpoint exists" claim.
        if "426" in msg or "101" in msg or "Switching" in msg or "upgrade" in msg.lower() or "404" in msg:
            print("WS_OK")
        else:
            print(f"WS_UNKNOWN:{msg[:60]}")
except Exception as e:
    print(f"WS_ERR:{e}")
PYEOF
)
  check "Q-01 WS /ws endpoint exists (426 upgrade = correct)" \
    "$WS_RESULT" "WS_OK"
fi
# ═══════════════════════════════════════════════════════════
# SUMMARY
# ═══════════════════════════════════════════════════════════
# Boxed totals line; skipped checks are reported but not counted in TOTAL.
printf '\n'
printf '%s\n' "╔══════════════════════════════════════════════════════╗"
TOTAL=$((PASS+FAIL))
if [ "$FAIL" -ne 0 ]; then
  printf "║ Results: %d/%d passed, %d failed" "$PASS" "$TOTAL" "$FAIL"
else
  printf "║ ✓ All %d tests passed" "$TOTAL"
fi
if [ "$SKIP" -gt 0 ]; then
  printf " (%d skipped)" "$SKIP"
fi
printf '\n'
printf '%s\n' "╚══════════════════════════════════════════════════════╝"
printf '\n'

# The script's exit status is that of this final test: non-zero when any
# check failed.
test "$FAIL" -eq 0