payops_env / run_tests.sh
padmapriyagosakan's picture
Iteration_4: expand task bank 20→30, add variants for all tasks, fix baseline policy for chain-gated CRIT tasks
ffea7f4
Raw
History Blame Contribute Delete
35.9 kB
#!/usr/bin/env bash
# run_tests.sh — PayOps v2 full test suite
# Usage: bash payops_env/run_tests.sh [BASE_URL]
# Requires the server to be running. Default: http://localhost:8000
# Server root under test: first positional argument, or the local default.
BASE=${1:-http://localhost:8000}
# Running tallies of passed, failed and skipped checks; all start at zero.
PASS=0 FAIL=0 SKIP=0
# ── helpers ──────────────────────────────────────────────────────────────────
#######################################
# Record one test result: pass when GOT matches the regex WANT.
# Globals:   PASS, FAIL (one of them is incremented)
# Arguments: $1 - test name shown in the report
#            $2 - text to inspect (typically a response body)
#            $3 - extended regular expression expected to match $2
# Outputs:   one result line on stdout; on failure also the pattern and
#            the first 200 bytes of $2
#######################################
check() {
  local name="$1"
  local got="$2"
  local want="$3"
  # printf (not echo) so text such as "-n" or "-e" is passed through verbatim;
  # "--" keeps grep from reading a pattern that starts with "-" as an option.
  if printf '%s\n' "$got" | grep -qE -- "$want"; then
    printf " \033[32m✓\033[0m %s\n" "$name"
    PASS=$((PASS+1))
  else
    printf " \033[31m✗\033[0m %s\n" "$name"
    printf " expected pattern : %s\n" "$want"
    printf " got : %s\n" "$(printf '%s\n' "$got" | head -c 200)"
    FAIL=$((FAIL+1))
  fi
}
#######################################
# Record one test result: pass when GOT does NOT match the regex ABSENT.
# Globals:   PASS, FAIL (one of them is incremented)
# Arguments: $1 - test name shown in the report
#            $2 - text to inspect
#            $3 - extended regular expression that must not match $2
# Outputs:   one result line on stdout
#######################################
check_absent() {
  local name="$1"
  local got="$2"
  local absent="$3"
  # printf (not echo) so text such as "-n" is passed through verbatim; "--"
  # keeps a pattern starting with "-" from being parsed as a grep option,
  # which would otherwise make grep error out and be counted as a pass.
  if printf '%s\n' "$got" | grep -qE -- "$absent"; then
    printf " \033[31m✗\033[0m %s (unexpected: %s)\n" "$name" "$absent"
    FAIL=$((FAIL+1))
  else
    printf " \033[32m✓\033[0m %s\n" "$name"
    PASS=$((PASS+1))
  fi
}
# step JSON_BODY
# POSTs JSON_BODY to the /step endpoint under $BASE and writes the raw
# response body to stdout.
step() {
  local payload="$1"
  curl --silent --request POST \
    --header "Content-Type: application/json" \
    --data "$payload" \
    "$BASE/step"
}
# reset
# POSTs {"seed":0} to the /reset endpoint under $BASE and discards the
# response body.
reset() {
  curl --silent --request POST \
    --header 'Content-Type: application/json' \
    --data '{"seed":0}' \
    "$BASE/reset" > /dev/null
}
# section TITLE
# Prints a blank line followed by TITLE framed as a group heading.
section() {
  printf '\n── %s ──\n' "$1"
}
# Opening banner: suite title plus the server root being tested, with a
# blank line before and after the box.
printf '%s\n' \
  "" \
  "╔══════════════════════════════════════════════════════╗" \
  "║ PayOps v2 — Full Post-Main Test Suite ║" \
  "║ Target: $BASE" \
  "╚══════════════════════════════════════════════════════╝" \
  ""
# ═══════════════════════════════════════════════════════════
# GROUP A — Server / Infrastructure
# ═══════════════════════════════════════════════════════════
section "GROUP A — Server / Infrastructure"
# Each endpoint is fetched once and the body reused by every check on it.
# "$BASE" is quoted so a URL containing glob characters (e.g. an IPv6
# literal in brackets) or spaces reaches curl unchanged.
HEALTH_RESP="$(curl -s "$BASE/health")"
SCHEMA_RESP="$(curl -s "$BASE/schema")"
# A-01 Health check
check "A-01 GET /health → status ok" \
  "$HEALTH_RESP" \
  '"status":"ok"'
# A-02 Version is v2
check "A-02 GET /health → version 2.0.0" \
  "$HEALTH_RESP" \
  '"version":"2.0.0"'
# A-03 Schema action model present
check "A-03 GET /schema → PayOpsAction schema" \
  "$SCHEMA_RESP" \
  '"PayOpsAction"'
# A-04 Schema observation model present
check "A-04 GET /schema → PayOpsObservation schema" \
  "$SCHEMA_RESP" \
  '"PayOpsObservation"'
# A-05 Schema state model present
check "A-05 GET /schema → PayOpsState schema" \
  "$SCHEMA_RESP" \
  '"PayOpsState"'
# ═══════════════════════════════════════════════════════════
# GROUP B — Tasks endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP B — Tasks endpoint"
# One fetch of /tasks feeds every check in this group ("$BASE" quoted so the
# URL is never word-split or glob-expanded).
TASKS_RESP="$(curl -s "$BASE/tasks")"
# B-01 30 tasks
check "B-01 GET /tasks → count=30" \
  "$TASKS_RESP" '"count":30'
# B-02 All 4 difficulty tiers present
check "B-02 Tasks include 'easy' tier" "$TASKS_RESP" '"difficulty":"easy"'
check "B-03 Tasks include 'medium' tier" "$TASKS_RESP" '"difficulty":"medium"'
check "B-04 Tasks include 'hard' tier" "$TASKS_RESP" '"difficulty":"hard"'
check "B-05 Tasks include 'critical' tier" "$TASKS_RESP" '"difficulty":"critical"'
# B-06 Regulatory tasks present
check "B-06 Tasks include regulatory_action flag" \
  "$TASKS_RESP" '"regulatory_action":true'
# B-07 Multi-step chain tasks present: any chain_total of 2 or more counts,
# matching the "> 1" wording of the test name (previously only exactly 3).
check "B-07 Tasks include chain_total > 1" \
  "$TASKS_RESP" '"chain_total":([2-9]|[1-9][0-9])'
# B-08 requires_investigation populated
check "B-08 Tasks include requires_investigation" \
  "$TASKS_RESP" '"requires_investigation":\['
# ═══════════════════════════════════════════════════════════
# GROUP C — Reset
# ═══════════════════════════════════════════════════════════
section "GROUP C — Reset"
# POST /reset with no request body and inspect the first observation.
# "$BASE" is quoted so the URL is passed to curl as a single word.
RESET_RESP="$(curl -s -X POST "$BASE/reset")"
check "C-01 POST /reset → returns EASY-001" "$RESET_RESP" '"task_id":"EASY-001"'
check "C-02 POST /reset → done=false" "$RESET_RESP" '"done":false'
check "C-03 POST /reset → budget_remaining=5.0" "$RESET_RESP" '"budget_remaining":5.0'
check "C-04 POST /reset → risk_score present" "$RESET_RESP" '"risk_score":'
check "C-05 POST /reset → ml_confidence present" "$RESET_RESP" '"ml_confidence":'
check "C-06 POST /reset → chain_total present" "$RESET_RESP" '"chain_total":'
# ═══════════════════════════════════════════════════════════
# GROUP D — Terminal Actions (correct decisions)
# ═══════════════════════════════════════════════════════════
section "GROUP D — Terminal Actions (correct decisions)"
reset

# d_expect_full_reward LABEL ACTION TXN
# Submits ACTION for TXN through step and checks the response reports the
# full reward; LABEL is the test name without its "→ reward=1.0" suffix.
d_expect_full_reward() {
  check "$1 → reward=1.0" \
    "$(step "{\"action_type\":\"$2\",\"transaction_id\":\"$3\"}")" \
    '"reward":1.0'
}

# The tasks are played in order from the reset state, each with the action
# this suite treats as correct.
d_expect_full_reward "D-01 EASY-001 approve"  approve  TXN-E001
d_expect_full_reward "D-02 EASY-002 reject"   reject   TXN-E002
d_expect_full_reward "D-03 EASY-003 approve"  approve  TXN-E003   # refund
d_expect_full_reward "D-04 EASY-004 flag"     flag     TXN-E004
d_expect_full_reward "D-04b EASY-005 approve" approve  TXN-E005   # mortgage repayment
d_expect_full_reward "D-04c EASY-006 flag"    flag     TXN-E006   # duplicate payment
d_expect_full_reward "D-05 MED-001 escalate"  escalate TXN-M001
d_expect_full_reward "D-06 MED-002 hold"      hold     TXN-M002

# D-07 a terminal action moves the episode on to the next task
R=$(step '{"action_type":"flag","transaction_id":"TXN-M003"}')
check "D-07 After flag on MED-003, next task is MED-004" "$R" '"task_id":"MED-004"'
# ═══════════════════════════════════════════════════════════
# GROUP E — Terminal Actions (wrong decisions / partial credit)
# ═══════════════════════════════════════════════════════════
section "GROUP E — Wrong actions & partial credit"

# e_advance ACTION:TXN...
# Plays each ACTION:TXN pair in order through step and discards the
# responses; used to walk the episode forward to the task under test.
e_advance() {
  local pair
  for pair in "$@"; do
    step "{\"action_type\":\"${pair%%:*}\",\"transaction_id\":\"${pair#*:}\"}" > /dev/null
  done
}

# E-01 approving EASY-002, where the correct action is reject → -1.0
reset
e_advance approve:TXN-E001
check "E-01 Approve fraud (EASY-002) → reward=-1.0" \
  "$(step '{"action_type":"approve","transaction_id":"TXN-E002"}')" \
  '"reward":-1.0'

# E-02 rejecting EASY-003, where the correct action is approve → -0.5
reset
e_advance approve:TXN-E001 reject:TXN-E002
check "E-02 Reject legit (EASY-003) → reward=-0.5" \
  "$(step '{"action_type":"reject","transaction_id":"TXN-E003"}')" \
  '"reward":-0.5'

# E-03 partial credit: the correct action on MED-001 is escalate; answering
# flag instead is expected to earn a reward between 0 and 1.
reset
e_advance approve:TXN-E001 reject:TXN-E002 approve:TXN-E003 \
  flag:TXN-E004 approve:TXN-E005 flag:TXN-E006
R=$(step '{"action_type":"flag","transaction_id":"TXN-M001"}')
check "E-03 Partial credit: flag on MED-001 (correct=escalate) → reward > 0" \
  "$R" '"reward":0\.[0-9]'
# ═══════════════════════════════════════════════════════════
# GROUP F — Investigation Sub-Actions
# ═══════════════════════════════════════════════════════════
section "GROUP F — Investigation sub-actions"
reset
# F-01 inspect → reward=0.15, stays on same task
R=$(step '{"action_type":"inspect","transaction_id":"TXN-E001"}')
check "F-01 inspect → reward=0.15" "$R" '"reward":0.15'
check "F-02 inspect → does NOT advance task (still EASY-001)" "$R" '"task_id":"EASY-001"'
check "F-03 inspect → inspection_notes populated" "$R" '"inspection_notes":"'
check "F-04 inspect → budget_remaining=4.9" "$R" '"budget_remaining":4.9'
# F-05 inspect again on same task → reward=0.0 (no double-dip)
check "F-05 second inspect → reward=0.0" \
  "$(step '{"action_type":"inspect","transaction_id":"TXN-E001"}')" \
  '"reward":0.0'
# Advance to EASY-002
step '{"action_type":"approve","transaction_id":"TXN-E001"}' > /dev/null
# F-06 request_docs → reward=0.15, docs_notes populated
R=$(step '{"action_type":"request_docs","transaction_id":"TXN-E002"}')
check "F-06 request_docs → reward=0.15" "$R" '"reward":0.15'
check "F-07 request_docs → docs_notes populated" "$R" '"docs_notes":"'
check "F-08 request_docs → budget_remaining=4.6 (cost=0.2)" "$R" '"budget_remaining":4.6'
# F-09 request_docs again → reward=0.0
check "F-09 second request_docs → reward=0.0" \
  "$(step '{"action_type":"request_docs","transaction_id":"TXN-E002"}')" \
  '"reward":0.0'
step '{"action_type":"reject","transaction_id":"TXN-E002"}' > /dev/null # advance
# F-10 verify_kyc → reward=0.15, kyc_notes populated
R=$(step '{"action_type":"verify_kyc","transaction_id":"TXN-E003"}')
check "F-10 verify_kyc → reward=0.15" "$R" '"reward":0.15'
check "F-11 verify_kyc → kyc_notes populated" "$R" '"kyc_notes":"'
# Dot escaped so only a literal "4." prefix matches, like F-15 and F-18.
check "F-12 verify_kyc → budget cost=0.2 deducted" "$R" '"budget_remaining":4\.[0-9]'
step '{"action_type":"approve","transaction_id":"TXN-E003"}' > /dev/null
# F-13 contact_sender → reward=0.15, contact_notes populated
R=$(step '{"action_type":"contact_sender","transaction_id":"TXN-E004"}')
check "F-13 contact_sender → reward=0.15" "$R" '"reward":0.15'
check "F-14 contact_sender → contact_notes populated" "$R" '"contact_notes":"'
check "F-15 contact_sender → budget cost=0.3 deducted" "$R" '"budget_remaining":3\.[0-9]'
step '{"action_type":"flag","transaction_id":"TXN-E004"}' > /dev/null
# F-16 file_sar → reward=0.15, docs_notes mentions SAR
# The task that follows EASY-004 is EASY-005, so the request carries
# TXN-E005 (it previously sent TXN-M001, which is not the current task).
R=$(step '{"action_type":"file_sar","transaction_id":"TXN-E005"}')
check "F-16 file_sar → reward=0.15" "$R" '"reward":0.15'
check "F-17 file_sar → docs_notes mentions SAR" "$R" '"docs_notes":"SAR'
check "F-18 file_sar → budget cost=0.05 deducted" "$R" '"budget_remaining":3\.[0-9]'
# ═══════════════════════════════════════════════════════════
# GROUP G — /state endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP G — State endpoint"
# Snapshot of /state taken after the investigation steps of the previous
# group; "$BASE" is quoted so the URL reaches curl as one word.
STATE="$(curl -s "$BASE/state")"
check "G-01 GET /state → episode_id set" "$STATE" '"episode_id":"'
check "G-02 GET /state → step_count > 0" "$STATE" '"step_count":[1-9]'
# Requires a non-zero amount (e.g. 0.05 or 1.15); a bare [0-9] would also
# accept 0.0 and contradict the "> 0" in the test name.
check "G-03 GET /state → budget_spent > 0" "$STATE" '"budget_spent":(0\.0*[1-9]|[1-9])'
check "G-04 GET /state → investigation_actions_used list" "$STATE" '"investigation_actions_used":\['
check "G-05 GET /state → recent_decisions list" "$STATE" '"recent_decisions":\['
check "G-06 GET /state → correct_decisions >= 0" "$STATE" '"correct_decisions":[0-9]'
# ═══════════════════════════════════════════════════════════
# GROUP H — /grader endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP H — Grader endpoint"
# Field-presence checks on the /grader report for the episode in progress.
# "$BASE" is quoted so the URL is never word-split or glob-expanded.
GRADER="$(curl -s "$BASE/grader")"
check "H-01 GET /grader → normalised_score present" "$GRADER" '"normalised_score":'
check "H-02 GET /grader → total_reward present" "$GRADER" '"total_reward":'
check "H-03 GET /grader → budget_spent present" "$GRADER" '"budget_spent":'
check "H-04 GET /grader → budget_penalty present" "$GRADER" '"budget_penalty":'
check "H-05 GET /grader → per_task array" "$GRADER" '"per_task":\['
check "H-06 GET /grader → reward_breakdown in per_task" "$GRADER" '"reward_breakdown":'
check "H-07 GET /grader → passed field present" "$GRADER" '"passed":'
# ═══════════════════════════════════════════════════════════
# GROUP I — /replay endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP I — /replay endpoint (offline eval)"
# Every curl call below quotes "$BASE" so the URL is passed as a single word.
# I-01 Replay with all 30 correct terminal actions (no investigation)
# Hard/critical tasks should be penalised (×0.80) → 0.7–0.9 range
# Task order: EASY×6, MED×8, HARD×10, CRIT×6
# Correct actions:
# EASY: approve reject approve flag approve flag
# MED: escalate hold flag flag hold escalate hold flag
# HARD: escalate reject reject approve escalate flag reject reject escalate reject
# CRIT: approve reject escalate reject reject escalate
REPLAY_PERFECT='{"actions":["approve","reject","approve","flag","approve","flag",
"escalate","hold","flag","flag","hold","escalate","hold","flag",
"escalate","reject","reject","approve","escalate","flag","reject","reject","escalate","reject",
"approve","reject","escalate","reject","reject","escalate"]}'
R=$(curl -s -X POST "$BASE/replay" \
  -H "Content-Type: application/json" \
  -d "$REPLAY_PERFECT")
check "I-01 Replay correct terminals (no investigation) → score<1.0 (hard/critical penalised)" "$R" '"normalised_score":0\.[7-9]'
check "I-02 Replay → passed=true (score>0.5 despite penalty)" "$R" '"passed":true'
check "I-03 Replay → budget_spent=0.0 (no inv actions)" "$R" '"budget_spent":0.0'
# I-04 Replay with investigation actions included (inspect before first task)
REPLAY_WITH_INV='{"actions":["inspect","approve","reject","approve","flag","approve","flag",
"escalate","hold","flag","flag","hold","escalate","hold","flag",
"escalate","reject","reject","approve","escalate","flag","reject","reject","escalate","reject",
"approve","reject","escalate","reject","reject","escalate"]}'
R=$(curl -s -X POST "$BASE/replay" \
  -H "Content-Type: application/json" \
  -d "$REPLAY_WITH_INV")
check "I-04 Replay with inspect → budget_spent=0.1" "$R" '"budget_spent":0.1'
# I-05 Replay with invalid action → 422
check "I-05 Replay invalid action → 422 error" \
  "$(curl -s -X POST "$BASE/replay" \
    -H 'Content-Type: application/json' \
    -d '{"actions":["delete","approve"]}')" \
  "Invalid action"
# I-06 Replay result has per_task breakdown
# ($R still holds the response to the replay-with-inspect request above)
check "I-06 Replay → per_task array present" "$R" '"per_task":\['
check "I-07 Replay → reward_breakdown in each task" "$R" '"reward_breakdown":'
# I-08 Replay with confidences
REPLAY_CONF='{"actions":["approve","reject"],"confidences":[0.95,0.90]}'
check "I-08 Replay with confidences → ok" \
  "$(curl -s -X POST "$BASE/replay" \
    -H 'Content-Type: application/json' \
    -d "$REPLAY_CONF")" \
  '"normalised_score":'
# ═══════════════════════════════════════════════════════════
# GROUP J — /baseline endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP J — Baseline agent"
# POST /baseline and inspect the returned report; "$BASE" is quoted so the
# URL is passed to curl as a single word.
BASELINE="$(curl -s -X POST "$BASE/baseline")"
check "J-01 POST /baseline → normalised_score present" "$BASELINE" '"normalised_score":'
check "J-02 POST /baseline → total_reward present" "$BASELINE" '"total_reward":'
check "J-03 POST /baseline → steps > 0" "$BASELINE" '"steps":[1-9]'
check "J-04 POST /baseline → per_task scores present" "$BASELINE" '"scores":\['
check "J-05 POST /baseline → score >= 0.5 (passes)" "$BASELINE" '"normalised_score":(1\.0|0\.[5-9])'
# ═══════════════════════════════════════════════════════════
# GROUP K — /analytics endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP K — Analytics endpoint"
# Run a full episode first so analytics has data
reset
# The driver below is fed to python3 on stdin; BASE is handed over through
# the environment. The heredoc delimiter is quoted, so the shell expands
# nothing inside the Python source.
BASE="$BASE" python3 - <<'PYEOF'
import json
import os
import ssl
import urllib.request

BASE = os.environ.get("BASE", "http://localhost:8000")

# Certificate and hostname verification are switched off for these requests.
_ssl = ssl.create_default_context()
_ssl.check_hostname = False
_ssl.verify_mode = ssl.CERT_NONE


def post(path, body=None):
    """POST body as JSON to BASE + path and return the decoded JSON reply."""
    req = urllib.request.Request(
        f"{BASE}{path}",
        data=json.dumps(body).encode() if body else None,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req, context=_ssl) as r:
        return json.loads(r.read())


# (transaction id, actions sent for it in order; the last one is terminal)
PLAN = [
    # EASY x6 + MED x8 + HARD x10: no chain gate, terminal only
    ("TXN-E001", ["approve"]), ("TXN-E002", ["reject"]),
    ("TXN-E003", ["approve"]), ("TXN-E004", ["flag"]),
    ("TXN-E005", ["approve"]), ("TXN-E006", ["flag"]),
    ("TXN-M001", ["escalate"]), ("TXN-M002", ["hold"]),
    ("TXN-M003", ["flag"]), ("TXN-M004", ["flag"]),
    ("TXN-M005", ["hold"]), ("TXN-M006", ["escalate"]),
    ("TXN-M007", ["hold"]), ("TXN-M008", ["flag"]),
    ("TXN-H001", ["escalate"]), ("TXN-H002", ["reject"]),
    ("TXN-H003", ["reject"]), ("TXN-H004", ["approve"]),
    ("TXN-H005", ["escalate"]), ("TXN-H006", ["flag"]),
    ("TXN-H007", ["reject"]), ("TXN-H008", ["reject"]),
    ("TXN-H009", ["escalate"]), ("TXN-H010", ["reject"]),
    # CRIT x6: chain-gated — must provide chain_min investigation steps first
    ("TXN-C001", ["inspect", "request_docs", "approve"]),
    ("TXN-C002", ["inspect", "reject"]),
    ("TXN-C003", ["request_docs", "inspect", "escalate"]),
    ("TXN-C004", ["inspect", "reject"]),
    ("TXN-C005", ["inspect", "verify_kyc", "reject"]),
    ("TXN-C006", ["inspect", "escalate"]),
]

post("/reset", {"seed": 0})
for txn, actions in PLAN:
    for action in actions:
        post("/step", {"action_type": action, "transaction_id": txn})
PYEOF
ANA="$(curl -s "$BASE/analytics")"
check "K-01 GET /analytics → episodes_completed >= 1" "$ANA" '"episodes_completed":[1-9]'
check "K-02 GET /analytics → best_score present" "$ANA" '"best_score":'
check "K-03 GET /analytics → avg_score present" "$ANA" '"avg_score":'
check "K-04 GET /analytics → avg_budget_spent present" "$ANA" '"avg_budget_spent":'
check "K-05 GET /analytics → current_episode present" "$ANA" '"current_episode":'
check "K-06 GET /analytics → by_difficulty present" "$ANA" '"by_difficulty":'
check "K-07 GET /analytics → easy accuracy" "$ANA" '"easy":'
check "K-08 GET /analytics → critical accuracy" "$ANA" '"critical":'
# ═══════════════════════════════════════════════════════════
# GROUP L — /leaderboard endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP L — Leaderboard endpoint"
# Field-presence checks on /leaderboard; "$BASE" is quoted so the URL is
# passed to curl as a single word.
LB="$(curl -s "$BASE/leaderboard")"
check "L-01 GET /leaderboard → count >= 1 (episode recorded)" "$LB" '"count":[1-9]'
check "L-02 GET /leaderboard → entries array present" "$LB" '"entries":\['
check "L-03 GET /leaderboard → episode_id in entry" "$LB" '"episode_id":"'
check "L-04 GET /leaderboard → normalised_score in entry" "$LB" '"normalised_score":'
check "L-05 GET /leaderboard → timestamp in entry" "$LB" '"timestamp":"'
check "L-06 GET /leaderboard → budget_spent in entry" "$LB" '"budget_spent":'
# ═══════════════════════════════════════════════════════════
# GROUP M — Perfect episode score
# ═══════════════════════════════════════════════════════════
section "GROUP M — Perfect episode (all 30 correct + investigation on hard/critical)"
reset
# The driver below is fed to python3 on stdin; BASE is handed over through
# the environment. The heredoc delimiter is quoted, so the shell expands
# nothing inside the Python source.
BASE="$BASE" python3 - <<'PYEOF'
import json
import os
import ssl
import urllib.request

BASE = os.environ.get("BASE", "http://localhost:8000")

# Certificate and hostname verification are switched off for these requests.
_ssl = ssl.create_default_context()
_ssl.check_hostname = False
_ssl.verify_mode = ssl.CERT_NONE


def post(path, body=None):
    """POST body as JSON to BASE + path and return the decoded JSON reply."""
    req = urllib.request.Request(
        f"{BASE}{path}",
        data=json.dumps(body).encode() if body else None,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req, context=_ssl) as r:
        return json.loads(r.read())


# (transaction id, actions sent for it in order; the last one is terminal)
PLAN = [
    # Easy x6 + Medium x8: terminal only
    ("TXN-E001", ["approve"]), ("TXN-E002", ["reject"]),
    ("TXN-E003", ["approve"]), ("TXN-E004", ["flag"]),
    ("TXN-E005", ["approve"]), ("TXN-E006", ["flag"]),
    ("TXN-M001", ["escalate"]), ("TXN-M002", ["hold"]),
    ("TXN-M003", ["flag"]), ("TXN-M004", ["flag"]),
    ("TXN-M005", ["hold"]), ("TXN-M006", ["escalate"]),
    ("TXN-M007", ["hold"]), ("TXN-M008", ["flag"]),
    # Hard x10: investigation sub-action(s) before each terminal
    ("TXN-H001", ["inspect", "escalate"]),
    ("TXN-H002", ["contact_sender", "reject"]),
    ("TXN-H003", ["inspect", "reject"]),
    ("TXN-H004", ["inspect", "approve"]),
    ("TXN-H005", ["inspect", "contact_sender", "escalate"]),
    ("TXN-H006", ["inspect", "flag"]),
    ("TXN-H007", ["contact_sender", "reject"]),
    ("TXN-H008", ["contact_sender", "reject"]),
    ("TXN-H009", ["verify_kyc", "escalate"]),
    ("TXN-H010", ["contact_sender", "reject"]),
    # Critical x6: required investigation sub-actions before each terminal
    ("TXN-C001", ["inspect", "request_docs", "approve"]),
    ("TXN-C002", ["inspect", "reject"]),
    ("TXN-C003", ["request_docs", "inspect", "escalate"]),
    ("TXN-C004", ["inspect", "contact_sender", "reject"]),
    ("TXN-C005", ["inspect", "verify_kyc", "reject"]),
    ("TXN-C006", ["request_docs", "escalate"]),
]

post("/reset", {"seed": 0})
for txn, actions in PLAN:
    for action in actions:
        post("/step", {"action_type": action, "transaction_id": txn})
PYEOF
GRADER="$(curl -s "$BASE/grader")"
check "M-01 Perfect episode → normalised_score=1.0" "$GRADER" '"normalised_score":1.0'
check "M-02 Perfect episode → passed=true" "$GRADER" '"passed":true'
check "M-03 Perfect episode → budget_penalty=0.0" "$GRADER" '"budget_penalty":0.0'
# ═══════════════════════════════════════════════════════════
# GROUP N — Difficulty weighting
# ═══════════════════════════════════════════════════════════
section "GROUP N — Difficulty weighting in grader"
# A critical task correct should contribute more weight than easy
REPLAY_CRIT='{"actions":["approve","reject","approve","flag","approve","flag",
"escalate","hold","flag","flag","hold","escalate","hold","flag",
"escalate","reject","reject","approve","escalate","flag","reject","reject","escalate","reject",
"approve","reject","escalate","reject","reject","escalate"]}'
# "$BASE" is quoted so the URL is passed to curl as a single word.
FULL=$(curl -s -X POST "$BASE/replay" \
  -H "Content-Type: application/json" \
  -d "$REPLAY_CRIT")
check "N-01 Full correct replay → max_possible_reward includes weights" \
  "$FULL" '"max_possible_reward":4[0-9]\.'
# N-02 parses the replay JSON and prints DIFF when total_reward is more than
# 0.01 below max_possible_reward, EQUAL otherwise.
check "N-02 Correct-but-no-investigation → total_reward below max (penalty applied)" \
  "$(printf '%s\n' "$FULL" | python3 -c "
import sys,json
d=json.load(sys.stdin)
print('DIFF' if d['total_reward'] < d['max_possible_reward'] - 0.01 else 'EQUAL')
")" "DIFF"
# ═══════════════════════════════════════════════════════════
# GROUP O — Budget mechanics
# ═══════════════════════════════════════════════════════════
section "GROUP O — Budget mechanics"
reset
# Burn through budget with contact_sender (cost=0.3) × 5 = 1.5
# then request_docs × 5 = 1.0, then verify_kyc × 5 = 1.0 (total = 3.5 so far ≤5)
# Total exceeding: won't exceed in these few calls, test budget tracking
step '{"action_type":"contact_sender","transaction_id":"TXN-E001"}' > /dev/null # -0.30
step '{"action_type":"contact_sender","transaction_id":"TXN-E001"}' > /dev/null # dup, but cost still deducted
step '{"action_type":"request_docs","transaction_id":"TXN-E001"}' > /dev/null # -0.20
step '{"action_type":"verify_kyc","transaction_id":"TXN-E001"}' > /dev/null # -0.20
step '{"action_type":"file_sar","transaction_id":"TXN-E001"}' > /dev/null # -0.05
R=$(step '{"action_type":"approve","transaction_id":"TXN-E001"}')
# budget_remaining = 5.0 - 0.30 - 0.30 - 0.20 - 0.20 - 0.05 = 3.95
check "O-01 Budget correctly deducted after multiple inv actions" \
"$R" '"budget_remaining":3\.[0-9]'
# grader shows budget_spent
GRADER="$(curl -s $BASE/grader)"
check "O-02 Grader → budget_spent > 0" "$GRADER" '"budget_spent":[1-9]'
# replay that overshoots budget → budget_penalty > 0
HEAVY='{"actions":["contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","approve","reject","approve","flag","escalate","hold","flag","flag","hold","escalate","escalate","reject","reject","approve","escalate","flag","approve","reject","escalate","reject"]}'
R=$(curl -s -X POST $BASE/replay \
-H "Content-Type: application/json" \
-d "$HEAVY")
check "O-03 Over-budget replay → budget_penalty > 0" "$R" '"budget_penalty":0\.[0-9]*[1-9]'
check "O-04 Over-budget replay → budget_overspend > 0" "$R" '"budget_overspend":0\.[1-9]'
# ═══════════════════════════════════════════════════════════
# GROUP P — Edge cases & errors
# ═══════════════════════════════════════════════════════════
section "GROUP P — Edge cases & error handling"
# Every curl call below quotes "$BASE" so the URL is passed as a single word.
# P-01 Step right after a reset, using a transaction_id that matches no
# task in this suite: the response is still expected to carry a reward field.
check "P-01 Step on fresh env (reset then done) → handled" \
  "$(curl -s -X POST "$BASE/reset" > /dev/null && curl -s -X POST "$BASE/step" \
    -H 'Content-Type: application/json' \
    -d '{"action_type":"approve","transaction_id":"TXN-NONE"}')" \
  '"reward"'
# P-02 Unknown action_type → 422
check "P-02 Unknown action_type → 422" \
  "$(curl -s -X POST "$BASE/step" \
    -H 'Content-Type: application/json' \
    -d '{"action_type":"nuke","transaction_id":"TXN-E001"}')" \
  "Invalid action_type"
# P-03 Missing action_type field → 422
check "P-03 Missing action_type field → error" \
  "$(curl -s -X POST "$BASE/step" \
    -H 'Content-Type: application/json' \
    -d '{"transaction_id":"TXN-E001"}')" \
  '"detail"'
# P-04 Empty replay actions list
check "P-04 Replay empty actions → returns score" \
  "$(curl -s -X POST "$BASE/replay" \
    -H 'Content-Type: application/json' \
    -d '{"actions":[]}')" \
  '"normalised_score"'
# P-05 Replay with confidences shorter than actions → still works
check "P-05 Replay with partial confidences → score present" \
  "$(curl -s -X POST "$BASE/replay" \
    -H 'Content-Type: application/json' \
    -d '{"actions":["approve","reject"],"confidences":[0.9]}')" \
  '"normalised_score"'
# P-06 Grader queried at this point → either a score or an error body
check "P-06 Grader before actions → handled gracefully" \
  "$(curl -s "$BASE/grader")" \
  '"normalised_score"|"error"'
# P-07 Analytics right after a fresh reset → handled
reset
check "P-07 Analytics after fresh reset → message or data" \
  "$(curl -s "$BASE/analytics")" \
  '"message"|"episodes_completed"'
# ═══════════════════════════════════════════════════════════
# GROUP Q — WebSocket
# ═══════════════════════════════════════════════════════════
section "GROUP Q — WebSocket (requires wscat or python)"
if command -v python3 &>/dev/null; then
  # The probe is fed to python3 on stdin with BASE passed via the environment.
  # It prints WS_OK, WS_UNKNOWN:<reason> or WS_ERR:<reason>; it prints nothing
  # when the plain HTTP request succeeds, which check then reports as a failure.
  WS_RESULT=$(BASE="$BASE" python3 - <<'PYEOF'
import os, ssl, urllib.request, json
try:
    BASE = os.environ.get("BASE", "http://localhost:8000")
    # Certificate and hostname verification are switched off for this probe.
    _ssl = ssl.create_default_context()
    _ssl.check_hostname = False
    _ssl.verify_mode = ssl.CERT_NONE
    # ws upgrade check via HTTP (will get 426 Upgrade Required — proves endpoint exists)
    req = urllib.request.Request(f"{BASE}/ws")
    try:
        urllib.request.urlopen(req, context=_ssl)
    except Exception as e:
        msg = str(e)
        # NOTE(review): a 404 is accepted as success too, so a missing /ws
        # route is not distinguished from a present one — confirm intended.
        if "426" in msg or "101" in msg or "Switching" in msg or "upgrade" in msg.lower() or "404" in msg:
            print("WS_OK")
        else:
            print(f"WS_UNKNOWN:{msg[:60]}")
except Exception as e:
    print(f"WS_ERR:{e}")
PYEOF
  )
  check "Q-01 WS /ws endpoint exists (426 upgrade = correct)" \
    "$WS_RESULT" "WS_OK"
else
  printf " \033[33m-\033[0m Q-01 WebSocket check skipped (no python3)\n"
  SKIP=$((SKIP+1))
fi
# ═══════════════════════════════════════════════════════════
# SUMMARY
# ═══════════════════════════════════════════════════════════
# Final report: a boxed one-line summary, then an exit status that reflects
# whether any check failed.
TOTAL=$((PASS + FAIL))
printf '\n%s\n' "╔══════════════════════════════════════════════════════╗"
case "$FAIL" in
  0) printf "║ ✓ All %d tests passed" "$TOTAL" ;;
  *) printf "║ Results: %d/%d passed, %d failed" "$PASS" "$TOTAL" "$FAIL" ;;
esac
# Skipped checks are mentioned only when there are any.
if [ "$SKIP" -gt 0 ]; then
  printf " (%d skipped)" "$SKIP"
fi
printf '\n%s\n\n' "╚══════════════════════════════════════════════════════╝"
# Exit with failure code if any tests failed
test "$FAIL" -eq 0