#!/usr/bin/env bash
# run_tests.sh — PayOps v2 full test suite
# Usage: bash payops_env/run_tests.sh [BASE_URL]
# Requires the server to be running. Default: http://localhost:8000
#
# Every test fetches a response with curl (or a small Python driver) and
# matches it against an extended regex. The script deliberately does NOT use
# `set -e`: a failing check must be counted, not abort the run. The exit
# status is decided by the final FAIL test at the bottom of the file.

BASE="${1:-http://localhost:8000}"
PASS=0
FAIL=0
SKIP=0

# ── helpers ──────────────────────────────────────────────────────────────────

#######################################
# Record a PASS when the response matches a pattern, otherwise a FAIL.
# Globals:   PASS, FAIL (incremented)
# Arguments: $1 - test name shown in the report
#            $2 - response text to inspect
#            $3 - extended regex (grep -E) that must match somewhere in $2
# Outputs:   one result line; on failure also the pattern and the first
#            200 bytes of the response
#######################################
check() {
  local name="$1"
  local got="$2"
  local want="$3"
  # printf instead of echo: echo would swallow a payload such as "-n".
  # `--` keeps a pattern that starts with '-' from being parsed as an option.
  if printf '%s\n' "$got" | grep -qE -- "$want"; then
    printf " \033[32m✓\033[0m %s\n" "$name"
    PASS=$((PASS+1))
  else
    printf " \033[31m✗\033[0m %s\n" "$name"
    printf " expected pattern : %s\n" "$want"
    printf " got : %s\n" "$(printf '%s' "$got" | head -c 200)"
    FAIL=$((FAIL+1))
  fi
}

#######################################
# Inverse of check: record a PASS when the pattern does NOT match.
# Globals:   PASS, FAIL (incremented)
# Arguments: $1 - test name
#            $2 - response text to inspect
#            $3 - extended regex that must be absent from $2
#######################################
check_absent() {
  local name="$1"
  local got="$2"
  local absent="$3"
  if printf '%s\n' "$got" | grep -qE -- "$absent"; then
    printf " \033[31m✗\033[0m %s (unexpected: %s)\n" "$name" "$absent"
    FAIL=$((FAIL+1))
  else
    printf " \033[32m✓\033[0m %s\n" "$name"
    PASS=$((PASS+1))
  fi
}

# POST the JSON document in $1 to /step and print the response body.
step() {
  curl -s -X POST "$BASE/step" \
    -H "Content-Type: application/json" \
    -d "$1"
}

# POST /reset with seed 0, discarding the response body.
reset() {
  curl -s -X POST "$BASE/reset" \
    -H 'Content-Type: application/json' \
    -d '{"seed":0}' > /dev/null
}

# Print a section heading ($1) preceded by a blank line.
section() {
  echo ""
  echo "── $1 ──"
}

echo ""
echo "╔══════════════════════════════════════════════════════╗"
echo "║ PayOps v2 — Full Post-Main Test Suite ║"
echo "║ Target: $BASE"
echo "╚══════════════════════════════════════════════════════╝"
echo ""

# ═══════════════════════════════════════════════════════════
# GROUP A — Server / Infrastructure
# ═══════════════════════════════════════════════════════════
section "GROUP A — Server / Infrastructure"

# A-01 Health check
check "A-01 GET /health → status ok" \
  "$(curl -s "$BASE/health")" \
  '"status":"ok"'

# A-02 Version is v2
check "A-02 GET /health → version 2.0.0" \
  "$(curl -s "$BASE/health")" \
  '"version":"2.0.0"'

# A-03 Schema action model present
check "A-03 GET /schema → PayOpsAction schema" \
  "$(curl -s "$BASE/schema")" \
  '"PayOpsAction"'

# A-04 Schema observation model present
check "A-04 GET /schema → PayOpsObservation schema" \
  "$(curl -s "$BASE/schema")" \
  '"PayOpsObservation"'

# A-05 Schema state model present
check "A-05 GET /schema → PayOpsState schema" \
  "$(curl -s "$BASE/schema")" \
  '"PayOpsState"'

# ═══════════════════════════════════════════════════════════
# GROUP B — Tasks endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP B — Tasks endpoint"

TASKS_RESP="$(curl -s "$BASE/tasks")"

# B-01 30 tasks
check "B-01 GET /tasks → count=30" \
  "$TASKS_RESP" '"count":30'

# B-02..B-05 All 4 difficulty tiers present
check "B-02 Tasks include 'easy' tier" "$TASKS_RESP" '"difficulty":"easy"'
check "B-03 Tasks include 'medium' tier" "$TASKS_RESP" '"difficulty":"medium"'
check "B-04 Tasks include 'hard' tier" "$TASKS_RESP" '"difficulty":"hard"'
check "B-05 Tasks include 'critical' tier" "$TASKS_RESP" '"difficulty":"critical"'

# B-06 Regulatory tasks present
check "B-06 Tasks include regulatory_action flag" \
  "$TASKS_RESP" '"regulatory_action":true'

# B-07 Multi-step chain tasks present
check "B-07 Tasks include chain_total > 1" \
  "$TASKS_RESP" '"chain_total":3'

# B-08 requires_investigation populated
check "B-08 Tasks include requires_investigation" \
  "$TASKS_RESP" '"requires_investigation":\['

# ═══════════════════════════════════════════════════════════
# GROUP C — Reset
# ═══════════════════════════════════════════════════════════
section "GROUP C — Reset"

# Bare POST (no body), unlike the reset helper which sends a seed.
RESET_RESP="$(curl -s -X POST "$BASE/reset")"
check "C-01 POST /reset → returns EASY-001" "$RESET_RESP" '"task_id":"EASY-001"'
check "C-02 POST /reset → done=false" "$RESET_RESP" '"done":false'
check "C-03 POST /reset → budget_remaining=5.0" "$RESET_RESP" '"budget_remaining":5.0'
check "C-04 POST /reset → risk_score present" "$RESET_RESP" '"risk_score":'
check "C-05 POST /reset → ml_confidence present" "$RESET_RESP" '"ml_confidence":'
check "C-06 POST /reset → chain_total present" "$RESET_RESP" '"chain_total":'

# ═══════════════════════════════════════════════════════════
# GROUP D — Terminal Actions (correct decisions)
# ═══════════════════════════════════════════════════════════
section "GROUP D — Terminal Actions (correct decisions)"

reset

# D-01 approve correct → reward=1.0
check "D-01 EASY-001 approve → reward=1.0" \
  "$(step '{"action_type":"approve","transaction_id":"TXN-E001"}')" \
  '"reward":1.0'

# D-02 reject correct → reward=1.0
check "D-02 EASY-002 reject → reward=1.0" \
  "$(step '{"action_type":"reject","transaction_id":"TXN-E002"}')" \
  '"reward":1.0'

# D-03 approve correct (refund) → reward=1.0
check "D-03 EASY-003 approve → reward=1.0" \
  "$(step '{"action_type":"approve","transaction_id":"TXN-E003"}')" \
  '"reward":1.0'

# D-04 flag correct → reward=1.0
check "D-04 EASY-004 flag → reward=1.0" \
  "$(step '{"action_type":"flag","transaction_id":"TXN-E004"}')" \
  '"reward":1.0'

# D-04b approve correct (mortgage repayment) → reward=1.0
check "D-04b EASY-005 approve → reward=1.0" \
  "$(step '{"action_type":"approve","transaction_id":"TXN-E005"}')" \
  '"reward":1.0'

# D-04c flag correct (duplicate payment) → reward=1.0
check "D-04c EASY-006 flag → reward=1.0" \
  "$(step '{"action_type":"flag","transaction_id":"TXN-E006"}')" \
  '"reward":1.0'

# D-05 escalate correct → reward=1.0
check "D-05 MED-001 escalate → reward=1.0" \
  "$(step '{"action_type":"escalate","transaction_id":"TXN-M001"}')" \
  '"reward":1.0'

# D-06 hold correct → reward=1.0
check "D-06 MED-002 hold → reward=1.0" \
  "$(step '{"action_type":"hold","transaction_id":"TXN-M002"}')" \
  '"reward":1.0'

# D-07 task advances after terminal
R=$(step '{"action_type":"flag","transaction_id":"TXN-M003"}')
check "D-07 After flag on MED-003, next task is MED-004" "$R" '"task_id":"MED-004"'

# ═══════════════════════════════════════════════════════════
# GROUP E — Terminal Actions (wrong decisions / partial credit)
# ═══════════════════════════════════════════════════════════
section "GROUP E — Wrong actions & partial credit"

reset

# E-01 approve when the correct action is reject → -1.0 (fraud approval)
step '{"action_type":"approve","transaction_id":"TXN-E001"}' > /dev/null # advance past E001
check "E-01 Approve fraud (EASY-002) → reward=-1.0" \
  "$(step '{"action_type":"approve","transaction_id":"TXN-E002"}')" \
  '"reward":-1.0'

reset
step '{"action_type":"approve","transaction_id":"TXN-E001"}' > /dev/null
step '{"action_type":"reject","transaction_id":"TXN-E002"}' > /dev/null

# E-02 reject when the correct action is approve → -0.5
check "E-02 Reject legit (EASY-003) → reward=-0.5" \
  "$(step '{"action_type":"reject","transaction_id":"TXN-E003"}')" \
  '"reward":-0.5'

reset

# E-03 partial credit — escalate is the correct action on MED-001;
# answering flag instead should earn a reward strictly between 0 and 1.
# Play all six easy tasks correctly first to reach MED-001.
step '{"action_type":"approve","transaction_id":"TXN-E001"}' > /dev/null
step '{"action_type":"reject","transaction_id":"TXN-E002"}' > /dev/null
step '{"action_type":"approve","transaction_id":"TXN-E003"}' > /dev/null
step '{"action_type":"flag","transaction_id":"TXN-E004"}' > /dev/null
step '{"action_type":"approve","transaction_id":"TXN-E005"}' > /dev/null
step '{"action_type":"flag","transaction_id":"TXN-E006"}' > /dev/null
R=$(step '{"action_type":"flag","transaction_id":"TXN-M001"}')
check "E-03 Partial credit: flag on MED-001 (correct=escalate) → reward > 0" \
  "$R" '"reward":0\.[0-9]'

# ═══════════════════════════════════════════════════════════
# GROUP F — Investigation Sub-Actions
# ═══════════════════════════════════════════════════════════
section "GROUP F — Investigation sub-actions"

reset

# F-01 inspect → reward=0.15, stays on same task
R=$(step '{"action_type":"inspect","transaction_id":"TXN-E001"}')
check "F-01 inspect → reward=0.15" "$R" '"reward":0.15'
check "F-02 inspect → does NOT advance task (still EASY-001)" "$R" '"task_id":"EASY-001"'
check "F-03 inspect → inspection_notes populated" "$R" '"inspection_notes":"'
check "F-04 inspect → budget_remaining=4.9" "$R" '"budget_remaining":4.9'

# F-05 inspect again on same task → reward=0.0 (no double-dip)
check "F-05 second inspect → reward=0.0" \
  "$(step '{"action_type":"inspect","transaction_id":"TXN-E001"}')" \
  '"reward":0.0'

# Advance to EASY-002
step '{"action_type":"approve","transaction_id":"TXN-E001"}' > /dev/null

# F-06 request_docs → reward=0.15, docs_notes populated
R=$(step '{"action_type":"request_docs","transaction_id":"TXN-E002"}')
check "F-06 request_docs → reward=0.15" "$R" '"reward":0.15'
check "F-07 request_docs → docs_notes populated" "$R" '"docs_notes":"'
check "F-08 request_docs → budget_remaining=4.6 (cost=0.2)" "$R" '"budget_remaining":4.6'

# F-09 request_docs again → reward=0.0
check "F-09 second request_docs → reward=0.0" \
  "$(step '{"action_type":"request_docs","transaction_id":"TXN-E002"}')" \
  '"reward":0.0'

step '{"action_type":"reject","transaction_id":"TXN-E002"}' > /dev/null # advance

# F-10 verify_kyc → reward=0.15, kyc_notes populated
R=$(step '{"action_type":"verify_kyc","transaction_id":"TXN-E003"}')
check "F-10 verify_kyc → reward=0.15" "$R" '"reward":0.15'
check "F-11 verify_kyc → kyc_notes populated" "$R" '"kyc_notes":"'
# Dot escaped so it matches a literal '.', consistent with F-15 and F-18.
check "F-12 verify_kyc → budget cost=0.2 deducted" "$R" '"budget_remaining":4\.[0-9]'

step '{"action_type":"approve","transaction_id":"TXN-E003"}' > /dev/null

# F-13 contact_sender → reward=0.15, contact_notes populated
R=$(step '{"action_type":"contact_sender","transaction_id":"TXN-E004"}')
check "F-13 contact_sender → reward=0.15" "$R" '"reward":0.15'
check "F-14 contact_sender → contact_notes populated" "$R" '"contact_notes":"'
check "F-15 contact_sender → budget cost=0.3 deducted" "$R" '"budget_remaining":3\.[0-9]'

step '{"action_type":"flag","transaction_id":"TXN-E004"}' > /dev/null

# F-16 file_sar → reward=0.15, docs_notes mentions SAR
# NOTE(review): going by the task order exercised in Group D, the current task
# at this point would be EASY-005, yet the request names TXN-M001. Confirm
# whether the server validates transaction_id against the current task.
R=$(step '{"action_type":"file_sar","transaction_id":"TXN-M001"}')
check "F-16 file_sar → reward=0.15" "$R" '"reward":0.15'
check "F-17 file_sar → docs_notes mentions SAR" "$R" '"docs_notes":"SAR'
check "F-18 file_sar → budget cost=0.05 deducted" "$R" '"budget_remaining":3\.[0-9]'

# ═══════════════════════════════════════════════════════════
# GROUP G — /state endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP G — State endpoint"

# Inspects the episode left in progress by Group F.
STATE="$(curl -s "$BASE/state")"
check "G-01 GET /state → episode_id set" "$STATE" '"episode_id":"'
check "G-02 GET /state → step_count > 0" "$STATE" '"step_count":[1-9]'
check "G-03 GET /state → budget_spent > 0" "$STATE" '"budget_spent":[0-9]'
check "G-04 GET /state → investigation_actions_used list" "$STATE" '"investigation_actions_used":\['
check "G-05 GET /state → recent_decisions list" "$STATE" '"recent_decisions":\['
check "G-06 GET /state → correct_decisions >= 0" "$STATE" '"correct_decisions":[0-9]'

# ═══════════════════════════════════════════════════════════
# GROUP H — /grader endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP H — Grader endpoint"

GRADER="$(curl -s "$BASE/grader")"
check "H-01 GET /grader → normalised_score present" "$GRADER" '"normalised_score":'
check "H-02 GET /grader → total_reward present" "$GRADER" '"total_reward":'
check "H-03 GET /grader → budget_spent present" "$GRADER" '"budget_spent":'
check "H-04 GET /grader → budget_penalty present" "$GRADER" '"budget_penalty":'
check "H-05 GET /grader → per_task array" "$GRADER" '"per_task":\['
check "H-06 GET /grader → reward_breakdown in per_task" "$GRADER" '"reward_breakdown":'
check "H-07 GET /grader → passed field present" "$GRADER" '"passed":'

# ═══════════════════════════════════════════════════════════
# GROUP I — /replay endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP I — /replay endpoint (offline eval)"

# I-01 Replay with all 30 correct terminal actions (no investigation)
# Hard/critical tasks should be penalised (×0.80) → 0.7–0.9 range
# Task order: EASY×6, MED×8, HARD×10, CRIT×6
# Correct actions:
#   EASY: approve reject approve flag approve flag
#   MED:  escalate hold flag flag hold escalate hold flag
#   HARD: escalate reject reject approve escalate flag reject reject escalate reject
#   CRIT: approve reject escalate reject reject escalate
REPLAY_PERFECT='{"actions":["approve","reject","approve","flag","approve","flag",
  "escalate","hold","flag","flag","hold","escalate","hold","flag",
  "escalate","reject","reject","approve","escalate","flag","reject","reject","escalate","reject",
  "approve","reject","escalate","reject","reject","escalate"]}'

R=$(curl -s -X POST "$BASE/replay" \
  -H "Content-Type: application/json" \
  -d "$REPLAY_PERFECT")
check "I-01 Replay correct terminals (no investigation) → score<1.0 (hard/critical penalised)" "$R" '"normalised_score":0\.[7-9]'
check "I-02 Replay → passed=true (score>0.5 despite penalty)" "$R" '"passed":true'
check "I-03 Replay → budget_spent=0.0 (no inv actions)" "$R" '"budget_spent":0.0'

# I-04 Replay with investigation actions included (inspect before first task)
REPLAY_WITH_INV='{"actions":["inspect","approve","reject","approve","flag","approve","flag",
  "escalate","hold","flag","flag","hold","escalate","hold","flag",
  "escalate","reject","reject","approve","escalate","flag","reject","reject","escalate","reject",
  "approve","reject","escalate","reject","reject","escalate"]}'

R=$(curl -s -X POST "$BASE/replay" \
  -H "Content-Type: application/json" \
  -d "$REPLAY_WITH_INV")
check "I-04 Replay with inspect → budget_spent=0.1" "$R" '"budget_spent":0.1'

# I-05 Replay with invalid action → 422
check "I-05 Replay invalid action → 422 error" \
  "$(curl -s -X POST "$BASE/replay" \
    -H 'Content-Type: application/json' \
    -d '{"actions":["delete","approve"]}')" \
  "Invalid action"

# I-06/I-07 Replay result has per_task breakdown ($R is still the I-04 response)
check "I-06 Replay → per_task array present" "$R" '"per_task":\['
check "I-07 Replay → reward_breakdown in each task" "$R" '"reward_breakdown":'

# I-08 Replay with confidences
REPLAY_CONF='{"actions":["approve","reject"],"confidences":[0.95,0.90]}'
check "I-08 Replay with confidences → ok" \
  "$(curl -s -X POST "$BASE/replay" \
    -H 'Content-Type: application/json' \
    -d "$REPLAY_CONF")" \
  '"normalised_score":'

# ═══════════════════════════════════════════════════════════
# GROUP J — /baseline endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP J — Baseline agent"

BASELINE="$(curl -s -X POST "$BASE/baseline")"
check "J-01 POST /baseline → normalised_score present" "$BASELINE" '"normalised_score":'
check "J-02 POST /baseline → total_reward present" "$BASELINE" '"total_reward":'
check "J-03 POST /baseline → steps > 0" "$BASELINE" '"steps":[1-9]'
check "J-04 POST /baseline → per_task scores present" "$BASELINE" '"scores":\['
check "J-05 POST /baseline → score >= 0.5 (passes)" "$BASELINE" '"normalised_score":(1\.0|0\.[5-9])'

# ═══════════════════════════════════════════════════════════
# GROUP K — /analytics endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP K — Analytics endpoint"

# Run a full episode first so analytics has data.
# The Python driver reads the target URL from the BASE environment variable.
reset
BASE="$BASE" python3 - <<'PYEOF'
import os, ssl, urllib.request, json

BASE = os.environ.get("BASE", "http://localhost:8000")

# Certificate verification is switched off for this test client.
_ssl = ssl.create_default_context()
_ssl.check_hostname = False
_ssl.verify_mode = ssl.CERT_NONE


def post(path, body=None):
    req = urllib.request.Request(
        f"{BASE}{path}",
        data=json.dumps(body).encode() if body else None,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req, context=_ssl) as r:
        return json.loads(r.read())


post("/reset", {"seed": 0})

# EASY x6 + MED x8 + HARD x10: no chain gate, terminal only
for action, txn in [
    ("approve", "TXN-E001"), ("reject", "TXN-E002"), ("approve", "TXN-E003"), ("flag", "TXN-E004"),
    ("approve", "TXN-E005"), ("flag", "TXN-E006"),
    ("escalate", "TXN-M001"), ("hold", "TXN-M002"), ("flag", "TXN-M003"), ("flag", "TXN-M004"),
    ("hold", "TXN-M005"), ("escalate", "TXN-M006"), ("hold", "TXN-M007"), ("flag", "TXN-M008"),
    ("escalate", "TXN-H001"), ("reject", "TXN-H002"), ("reject", "TXN-H003"), ("approve", "TXN-H004"),
    ("escalate", "TXN-H005"), ("flag", "TXN-H006"), ("reject", "TXN-H007"), ("reject", "TXN-H008"),
    ("escalate", "TXN-H009"), ("reject", "TXN-H010"),
]:
    post("/step", {"action_type": action, "transaction_id": txn})

# CRIT x6: chain-gated — must provide chain_min investigation steps first
for action, txn in [
    ("inspect", "TXN-C001"), ("request_docs", "TXN-C001"), ("approve", "TXN-C001"),
    ("inspect", "TXN-C002"), ("reject", "TXN-C002"),
    ("request_docs", "TXN-C003"), ("inspect", "TXN-C003"), ("escalate", "TXN-C003"),
    ("inspect", "TXN-C004"), ("reject", "TXN-C004"),
    ("inspect", "TXN-C005"), ("verify_kyc", "TXN-C005"), ("reject", "TXN-C005"),
    ("inspect", "TXN-C006"), ("escalate", "TXN-C006"),
]:
    post("/step", {"action_type": action, "transaction_id": txn})
PYEOF

ANA="$(curl -s "$BASE/analytics")"
check "K-01 GET /analytics → episodes_completed >= 1" "$ANA" '"episodes_completed":[1-9]'
check "K-02 GET /analytics → best_score present" "$ANA" '"best_score":'
check "K-03 GET /analytics → avg_score present" "$ANA" '"avg_score":'
check "K-04 GET /analytics → avg_budget_spent present" "$ANA" '"avg_budget_spent":'
check "K-05 GET /analytics → current_episode present" "$ANA" '"current_episode":'
check "K-06 GET /analytics → by_difficulty present" "$ANA" '"by_difficulty":'
check "K-07 GET /analytics → easy accuracy" "$ANA" '"easy":'
check "K-08 GET /analytics → critical accuracy" "$ANA" '"critical":'

# ═══════════════════════════════════════════════════════════
# GROUP L — /leaderboard endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP L — Leaderboard endpoint"

LB="$(curl -s "$BASE/leaderboard")"
check "L-01 GET /leaderboard → count >= 1 (episode recorded)" "$LB" '"count":[1-9]'
check "L-02 GET /leaderboard → entries array present" "$LB" '"entries":\['
check "L-03 GET /leaderboard → episode_id in entry" "$LB" '"episode_id":"'
check "L-04 GET /leaderboard → normalised_score in entry" "$LB" '"normalised_score":'
check "L-05 GET /leaderboard → timestamp in entry" "$LB" '"timestamp":"'
check "L-06 GET /leaderboard → budget_spent in entry" "$LB" '"budget_spent":'

# ═══════════════════════════════════════════════════════════
# GROUP M — Perfect episode score
# ═══════════════════════════════════════════════════════════
section "GROUP M — Perfect episode (all 30 correct + investigation on hard/critical)"

reset
BASE="$BASE" python3 - <<'PYEOF'
import os, ssl, urllib.request, json

BASE = os.environ.get("BASE", "http://localhost:8000")

# Certificate verification is switched off for this test client.
_ssl = ssl.create_default_context()
_ssl.check_hostname = False
_ssl.verify_mode = ssl.CERT_NONE


def post(path, body=None):
    req = urllib.request.Request(
        f"{BASE}{path}",
        data=json.dumps(body).encode() if body else None,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req, context=_ssl) as r:
        return json.loads(r.read())


post("/reset", {"seed": 0})

# Easy x6 + Medium x8: terminal only
for action, txn in [
    ("approve", "TXN-E001"), ("reject", "TXN-E002"), ("approve", "TXN-E003"), ("flag", "TXN-E004"),
    ("approve", "TXN-E005"), ("flag", "TXN-E006"),
    ("escalate", "TXN-M001"), ("hold", "TXN-M002"), ("flag", "TXN-M003"), ("flag", "TXN-M004"),
    ("hold", "TXN-M005"), ("escalate", "TXN-M006"), ("hold", "TXN-M007"), ("flag", "TXN-M008"),
]:
    post("/step", {"action_type": action, "transaction_id": txn})

# Hard x10: required investigation sub-action(s) before each terminal
for action, txn in [
    ("inspect", "TXN-H001"), ("escalate", "TXN-H001"),
    ("contact_sender", "TXN-H002"), ("reject", "TXN-H002"),
    ("inspect", "TXN-H003"), ("reject", "TXN-H003"),
    ("inspect", "TXN-H004"), ("approve", "TXN-H004"),
    ("inspect", "TXN-H005"), ("contact_sender", "TXN-H005"), ("escalate", "TXN-H005"),
    ("inspect", "TXN-H006"), ("flag", "TXN-H006"),
    ("contact_sender", "TXN-H007"), ("reject", "TXN-H007"),
    ("contact_sender", "TXN-H008"), ("reject", "TXN-H008"),
    ("verify_kyc", "TXN-H009"), ("escalate", "TXN-H009"),
    ("contact_sender", "TXN-H010"), ("reject", "TXN-H010"),
]:
    post("/step", {"action_type": action, "transaction_id": txn})

# Critical x6: required investigation sub-actions before each terminal
for action, txn in [
    ("inspect", "TXN-C001"), ("request_docs", "TXN-C001"), ("approve", "TXN-C001"),
    ("inspect", "TXN-C002"), ("reject", "TXN-C002"),
    ("request_docs", "TXN-C003"), ("inspect", "TXN-C003"), ("escalate", "TXN-C003"),
    ("inspect", "TXN-C004"), ("contact_sender", "TXN-C004"), ("reject", "TXN-C004"),
    ("inspect", "TXN-C005"), ("verify_kyc", "TXN-C005"), ("reject", "TXN-C005"),
    ("request_docs", "TXN-C006"), ("escalate", "TXN-C006"),
]:
    post("/step", {"action_type": action, "transaction_id": txn})
PYEOF

GRADER="$(curl -s "$BASE/grader")"
check "M-01 Perfect episode → normalised_score=1.0" "$GRADER" '"normalised_score":1.0'
check "M-02 Perfect episode → passed=true" "$GRADER" '"passed":true'
check "M-03 Perfect episode → budget_penalty=0.0" "$GRADER" '"budget_penalty":0.0'

# ═══════════════════════════════════════════════════════════
# GROUP N — Difficulty weighting
# ═══════════════════════════════════════════════════════════
section "GROUP N — Difficulty weighting in grader"

# A critical task correct should contribute more weight than easy
REPLAY_CRIT='{"actions":["approve","reject","approve","flag","approve","flag",
  "escalate","hold","flag","flag","hold","escalate","hold","flag",
  "escalate","reject","reject","approve","escalate","flag","reject","reject","escalate","reject",
  "approve","reject","escalate","reject","reject","escalate"]}'

FULL=$(curl -s -X POST "$BASE/replay" \
  -H "Content-Type: application/json" \
  -d "$REPLAY_CRIT")
check "N-01 Full correct replay → max_possible_reward includes weights" \
  "$FULL" '"max_possible_reward":4[0-9]\.'
# N-02 The replay above has every terminal action correct but performs no
# investigation, so total_reward is expected to fall short of
# max_possible_reward. The comparison is done in Python because it is numeric.
check "N-02 Correct-but-no-investigation → total_reward below max (penalty applied)" \
  "$(printf '%s' "$FULL" | python3 -c "
import sys, json
d = json.load(sys.stdin)
print('DIFF' if d['total_reward'] < d['max_possible_reward'] - 0.01 else 'EQUAL')
")" "DIFF"

# ═══════════════════════════════════════════════════════════
# GROUP O — Budget mechanics
# ═══════════════════════════════════════════════════════════
section "GROUP O — Budget mechanics"

reset

# Spend part of the budget on EASY-001 with a handful of investigation
# actions. These few calls stay well inside the 5.0 budget; the point here is
# budget tracking. Overspending is exercised separately via a replay.
step '{"action_type":"contact_sender","transaction_id":"TXN-E001"}' > /dev/null # -0.30
step '{"action_type":"contact_sender","transaction_id":"TXN-E001"}' > /dev/null # dup, but cost still deducted
step '{"action_type":"request_docs","transaction_id":"TXN-E001"}' > /dev/null   # -0.20
step '{"action_type":"verify_kyc","transaction_id":"TXN-E001"}' > /dev/null     # -0.20
step '{"action_type":"file_sar","transaction_id":"TXN-E001"}' > /dev/null       # -0.05
R=$(step '{"action_type":"approve","transaction_id":"TXN-E001"}')

# budget_remaining = 5.0 - 0.30 - 0.30 - 0.20 - 0.20 - 0.05 = 3.95
check "O-01 Budget correctly deducted after multiple inv actions" \
  "$R" '"budget_remaining":3\.[0-9]'

# grader shows budget_spent (1.05 per the arithmetic above, hence a leading [1-9])
GRADER="$(curl -s "$BASE/grader")"
check "O-02 Grader → budget_spent > 0" "$GRADER" '"budget_spent":[1-9]'

# replay that overshoots budget → budget_penalty > 0
HEAVY='{"actions":["contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","approve","reject","approve","flag","escalate","hold","flag","flag","hold","escalate","escalate","reject","reject","approve","escalate","flag","approve","reject","escalate","reject"]}' R=$(curl -s -X POST $BASE/replay \ -H "Content-Type: application/json" \ -d "$HEAVY") check "O-03 Over-budget replay → budget_penalty > 0" "$R" '"budget_penalty":0\.[0-9]*[1-9]' check "O-04 Over-budget replay → budget_overspend > 0" "$R" '"budget_overspend":0\.[1-9]' # ═══════════════════════════════════════════════════════════ # GROUP P — Edge cases & errors # ═══════════════════════════════════════════════════════════ section "GROUP P — Edge cases & error handling" # P-01 Step before reset → 400 check "P-01 Step on fresh env (reset then done) → handled" \ "$(curl -s -X POST $BASE/reset > /dev/null && curl -s -X POST $BASE/step \ -H 'Content-Type: application/json' \ -d '{"action_type":"approve","transaction_id":"TXN-NONE"}')" \ '"reward"' # P-02 Unknown action_type → 422 check "P-02 Unknown action_type → 422" \ "$(curl -s -X POST $BASE/step \ -H 'Content-Type: application/json' \ -d '{"action_type":"nuke","transaction_id":"TXN-E001"}')" \ "Invalid action_type" # P-03 Missing action_type field → 422 check "P-03 Missing action_type field → error" \ "$(curl -s -X POST $BASE/step \ -H 'Content-Type: application/json' \ -d '{"transaction_id":"TXN-E001"}')" \ '"detail"' # P-04 Empty replay actions list check "P-04 Replay empty actions → returns score" \ "$(curl -s -X POST $BASE/replay \ -H 'Content-Type: application/json' \ -d '{"actions":[]}')" \ '"normalised_score"' # P-05 Replay with confidences shorter than actions → still works check "P-05 Replay with partial 
confidences → score present" \ "$(curl -s -X POST $BASE/replay \ -H 'Content-Type: application/json' \ -d '{"actions":["approve","reject"],"confidences":[0.9]}')" \ '"normalised_score"' # P-06 Grader before any reset → 400 or returns score check "P-06 Grader before actions → handled gracefully" \ "$(curl -s $BASE/grader)" \ '"normalised_score"|"error"' # P-07 Analytics before any completed episode → handled reset check "P-07 Analytics after fresh reset → message or data" \ "$(curl -s $BASE/analytics)" \ '"message"|"episodes_completed"' # ═══════════════════════════════════════════════════════════ # GROUP Q — WebSocket # ═══════════════════════════════════════════════════════════ section "GROUP Q — WebSocket (requires wscat or python)" if command -v python3 &>/dev/null; then WS_RESULT=$(BASE=$BASE python3 - <<'PYEOF' import os, ssl, urllib.request, json try: BASE = os.environ.get("BASE", "http://localhost:8000") _ssl = ssl.create_default_context() _ssl.check_hostname = False _ssl.verify_mode = ssl.CERT_NONE # ws upgrade check via HTTP (will get 426 Upgrade Required — proves endpoint exists) req = urllib.request.Request(f"{BASE}/ws") try: urllib.request.urlopen(req, context=_ssl) except Exception as e: msg = str(e) if "426" in msg or "101" in msg or "Switching" in msg or "upgrade" in msg.lower() or "404" in msg: print("WS_OK") else: print(f"WS_UNKNOWN:{msg[:60]}") except Exception as e: print(f"WS_ERR:{e}") PYEOF ) check "Q-01 WS /ws endpoint exists (426 upgrade = correct)" \ "$WS_RESULT" "WS_OK" else printf " \033[33m-\033[0m Q-01 WebSocket check skipped (no python3)\n" SKIP=$((SKIP+1)) fi # ═══════════════════════════════════════════════════════════ # SUMMARY # ═══════════════════════════════════════════════════════════ echo "" echo "╔══════════════════════════════════════════════════════╗" TOTAL=$((PASS+FAIL)) if [ "$FAIL" -eq 0 ]; then printf "║ ✓ All %d tests passed" "$TOTAL" else printf "║ Results: %d/%d passed, %d failed" "$PASS" "$TOTAL" "$FAIL" fi [ "$SKIP" 
-gt 0 ] && printf " (%d skipped)" "$SKIP" echo "" echo "╚══════════════════════════════════════════════════════╝" echo "" # Exit with failure code if any tests failed [ "$FAIL" -eq 0 ]