#!/usr/bin/env bash
# run_tests.sh — PayOps v2 full test suite
# Usage: bash payops_env/run_tests.sh [BASE_URL]
# Requires the server to be running. Default: http://localhost:8000

# Base URL of the server under test: first CLI argument, or the local default.
# One trailing slash is dropped so "$BASE/health" never turns into "//health".
# BASE is intentionally not readonly: later sections pass it to python3 with a
# per-command "BASE=... python3" prefix, which a readonly variable would reject.
BASE="${1:-http://localhost:8000}"
BASE="${BASE%/}"
# Running tallies read by the summary at the end of the script.
PASS=0
FAIL=0
SKIP=0

# ── helpers ──────────────────────────────────────────────────────────────────

# check NAME GOT WANT
# Records a pass when GOT matches the extended regular expression WANT;
# otherwise records a failure and prints the expected pattern together with
# the first 200 characters of GOT.
# Globals:   PASS, FAIL (incremented)
# Arguments: $1 - test label, $2 - text to inspect, $3 - ERE pattern
# Outputs:   one ✓/✗ line (plus two detail lines on failure) to stdout
check() {
  local name="$1"
  local got="$2"
  local want="$3"
  # "--" keeps a pattern starting with "-" from being parsed as grep options;
  # the here-string (instead of echo) keeps a body such as "-n" intact.
  if grep -qE -- "$want" <<<"$got"; then
    printf "  \033[32m✓\033[0m  %s\n" "$name"
    PASS=$((PASS+1))
  else
    printf "  \033[31m✗\033[0m  %s\n" "$name"
    printf "     expected pattern : %s\n" "$want"
    printf "     got              : %s\n" "${got:0:200}"
    FAIL=$((FAIL+1))
  fi
}

# check_absent NAME GOT ABSENT
# Inverse of a match check: records a pass when GOT does NOT match the
# extended regular expression ABSENT, and a failure when it does.
# Globals:   PASS, FAIL (incremented)
# Arguments: $1 - test label, $2 - text to inspect, $3 - ERE that must not match
# Outputs:   one ✓/✗ line to stdout
check_absent() {
  local name="$1"
  local got="$2"
  local absent="$3"
  # "--" matters here: without it a pattern starting with "-" makes grep exit
  # with a usage error, which would be misread as "pattern absent".
  if grep -qE -- "$absent" <<<"$got"; then
    printf "  \033[31m✗\033[0m  %s  (unexpected: %s)\n" "$name" "$absent"
    FAIL=$((FAIL+1))
  else
    printf "  \033[32m✓\033[0m  %s\n" "$name"
    PASS=$((PASS+1))
  fi
}

# step JSON
# POSTs the given JSON document to "$BASE/step" and writes whatever curl
# returns to stdout. curl runs with -s, so no progress meter is shown.
step() {
  local payload="$1"
  local -a request=(
    -s
    -X POST
    "$BASE/step"
    -H "Content-Type: application/json"
    -d "$payload"
  )
  curl "${request[@]}"
}

# reset
# POSTs {"seed":0} to "$BASE/reset" and throws the response body away.
reset() {
  local -r endpoint="$BASE/reset"
  local -r body='{"seed":0}'
  curl -s -X POST "$endpoint" -H 'Content-Type: application/json' -d "$body" > /dev/null
}

# section TITLE
# Prints a blank line followed by TITLE framed as "── TITLE ──".
section() {
  printf '\n── %s ──\n' "$1"
}

# Opening banner: suite title and the base URL being tested, framed by blank
# lines. The "Target" row has no closing border because the URL length varies.
printf '\n'
printf '%s\n' \
  "╔══════════════════════════════════════════════════════╗" \
  "║    PayOps v2 — Full Post-Main Test Suite             ║" \
  "║    Target: $BASE" \
  "╚══════════════════════════════════════════════════════╝" \
  ""

# ═══════════════════════════════════════════════════════════
# GROUP A — Server / Infrastructure
# ═══════════════════════════════════════════════════════════
section "GROUP A — Server / Infrastructure"

# Each endpoint is fetched once and the body is reused by every check on it.
HEALTH_RESP="$(curl -s "$BASE/health")"
SCHEMA_RESP="$(curl -s "$BASE/schema")"

# A-01  Health check
check "A-01  GET /health → status ok" \
  "$HEALTH_RESP" \
  '"status":"ok"'

# A-02  Version is v2 (dots escaped so they only match a literal ".")
check "A-02  GET /health → version 2.0.0" \
  "$HEALTH_RESP" \
  '"version":"2\.0\.0"'

# A-03  Schema action model present
check "A-03  GET /schema → PayOpsAction schema" \
  "$SCHEMA_RESP" \
  '"PayOpsAction"'

# A-04  Schema observation model present
check "A-04  GET /schema → PayOpsObservation schema" \
  "$SCHEMA_RESP" \
  '"PayOpsObservation"'

# A-05  Schema state model present
check "A-05  GET /schema → PayOpsState schema" \
  "$SCHEMA_RESP" \
  '"PayOpsState"'

# ═══════════════════════════════════════════════════════════
# GROUP B — Tasks endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP B — Tasks endpoint"

# One fetch of /tasks; every B-* check inspects this same body.
TASKS_RESP="$(curl -s "$BASE/tasks")"

# B-01  30 tasks
check "B-01  GET /tasks → count=30" \
  "$TASKS_RESP" '"count":30'

# B-02..B-05  All 4 difficulty tiers present
check "B-02  Tasks include 'easy' tier" "$TASKS_RESP" '"difficulty":"easy"'
check "B-03  Tasks include 'medium' tier" "$TASKS_RESP" '"difficulty":"medium"'
check "B-04  Tasks include 'hard' tier" "$TASKS_RESP" '"difficulty":"hard"'
check "B-05  Tasks include 'critical' tier" "$TASKS_RESP" '"difficulty":"critical"'

# B-06  Regulatory tasks present
check "B-06  Tasks include regulatory_action flag" \
  "$TASKS_RESP" '"regulatory_action":true'

# B-07  Multi-step chain tasks present: any chain_total of 2 or more
check "B-07  Tasks include chain_total > 1" \
  "$TASKS_RESP" '"chain_total":([2-9]|[1-9][0-9])'

# B-08  requires_investigation populated (the key is followed by a JSON array)
check "B-08  Tasks include requires_investigation" \
  "$TASKS_RESP" '"requires_investigation":\['

# ═══════════════════════════════════════════════════════════
# GROUP C — Reset
# ═══════════════════════════════════════════════════════════
section "GROUP C — Reset"

# A body-less POST (unlike the reset helper, which sends a seed); the response
# is kept so the initial observation can be inspected.
RESET_RESP="$(curl -s -X POST "$BASE/reset")"
check "C-01  POST /reset → returns EASY-001" "$RESET_RESP" '"task_id":"EASY-001"'
check "C-02  POST /reset → done=false" "$RESET_RESP" '"done":false'
check "C-03  POST /reset → budget_remaining=5.0" "$RESET_RESP" '"budget_remaining":5\.0'
check "C-04  POST /reset → risk_score present" "$RESET_RESP" '"risk_score":'
check "C-05  POST /reset → ml_confidence present" "$RESET_RESP" '"ml_confidence":'
check "C-06  POST /reset → chain_total present" "$RESET_RESP" '"chain_total":'

# ═══════════════════════════════════════════════════════════
# GROUP D — Terminal Actions (correct decisions)
# ═══════════════════════════════════════════════════════════
section "GROUP D — Terminal Actions (correct decisions)"

# The checks below are order-dependent: each terminal action moves the episode
# on to the next task, so every step targets the task reached by the one before.
# Dots in the reward patterns are escaped so "1.0" cannot match e.g. "100".
reset

# D-01  approve correct → reward=1.0
check "D-01  EASY-001 approve → reward=1.0" \
  "$(step '{"action_type":"approve","transaction_id":"TXN-E001"}')" \
  '"reward":1\.0'

# D-02  reject correct → reward=1.0
check "D-02  EASY-002 reject → reward=1.0" \
  "$(step '{"action_type":"reject","transaction_id":"TXN-E002"}')" \
  '"reward":1\.0'

# D-03  approve correct (refund) → reward=1.0
check "D-03  EASY-003 approve → reward=1.0" \
  "$(step '{"action_type":"approve","transaction_id":"TXN-E003"}')" \
  '"reward":1\.0'

# D-04  flag correct → reward=1.0
check "D-04  EASY-004 flag → reward=1.0" \
  "$(step '{"action_type":"flag","transaction_id":"TXN-E004"}')" \
  '"reward":1\.0'

# D-04b  approve correct (mortgage repayment) → reward=1.0
check "D-04b  EASY-005 approve → reward=1.0" \
  "$(step '{"action_type":"approve","transaction_id":"TXN-E005"}')" \
  '"reward":1\.0'

# D-04c  flag correct (duplicate payment) → reward=1.0
check "D-04c  EASY-006 flag → reward=1.0" \
  "$(step '{"action_type":"flag","transaction_id":"TXN-E006"}')" \
  '"reward":1\.0'

# D-05  escalate correct → reward=1.0
check "D-05  MED-001 escalate → reward=1.0" \
  "$(step '{"action_type":"escalate","transaction_id":"TXN-M001"}')" \
  '"reward":1\.0'

# D-06  hold correct → reward=1.0
check "D-06  MED-002 hold → reward=1.0" \
  "$(step '{"action_type":"hold","transaction_id":"TXN-M002"}')" \
  '"reward":1\.0'

# D-07  task advances after terminal: the response to the MED-003 action
#       already carries the next task's id
R=$(step '{"action_type":"flag","transaction_id":"TXN-M003"}')
check "D-07  After flag on MED-003, next task is MED-004" "$R" '"task_id":"MED-004"'

# ═══════════════════════════════════════════════════════════
# GROUP E — Terminal Actions (wrong decisions / partial credit)
# ═══════════════════════════════════════════════════════════
section "GROUP E — Wrong actions & partial credit"

reset

# E-01  approve when the correct action is reject → -1.0 (fraud approval)
step '{"action_type":"approve","transaction_id":"TXN-E001"}' > /dev/null  # advance past E001
check "E-01  Approve fraud (EASY-002) → reward=-1.0" \
  "$(step '{"action_type":"approve","transaction_id":"TXN-E002"}')" \
  '"reward":-1\.0'

# Fresh episode, then answer the first two tasks correctly to reach EASY-003.
reset
step '{"action_type":"approve","transaction_id":"TXN-E001"}' > /dev/null
step '{"action_type":"reject","transaction_id":"TXN-E002"}' > /dev/null

# E-02  reject when the correct action is approve → -0.5
check "E-02  Reject legit (EASY-003) → reward=-0.5" \
  "$(step '{"action_type":"reject","transaction_id":"TXN-E003"}')" \
  '"reward":-0\.5'

reset
# E-03  partial credit — the correct action on MED-001 is escalate;
#        answering flag instead should still earn a positive fraction.
#        The six EASY tasks are answered correctly first to reach MED-001.
step '{"action_type":"approve","transaction_id":"TXN-E001"}' > /dev/null
step '{"action_type":"reject","transaction_id":"TXN-E002"}' > /dev/null
step '{"action_type":"approve","transaction_id":"TXN-E003"}' > /dev/null
step '{"action_type":"flag","transaction_id":"TXN-E004"}' > /dev/null
step '{"action_type":"approve","transaction_id":"TXN-E005"}' > /dev/null
step '{"action_type":"flag","transaction_id":"TXN-E006"}' > /dev/null
R=$(step '{"action_type":"flag","transaction_id":"TXN-M001"}')
# "0\.0*[1-9]" requires a non-zero digit, so a reward of 0.0 does not pass.
check "E-03  Partial credit: flag on MED-001 (correct=escalate) → reward > 0" \
  "$R" '"reward":0\.0*[1-9]'

# ═══════════════════════════════════════════════════════════
# GROUP F — Investigation Sub-Actions
# ═══════════════════════════════════════════════════════════
section "GROUP F — Investigation sub-actions"

# Budget figures below accumulate across the whole group (there is a single
# reset at the top), and repeated sub-actions are expected to be charged again.
reset

# F-01  inspect → reward=0.15, stays on same task
R=$(step '{"action_type":"inspect","transaction_id":"TXN-E001"}')
check "F-01  inspect → reward=0.15" "$R" '"reward":0\.15'
check "F-02  inspect → does NOT advance task (still EASY-001)" "$R" '"task_id":"EASY-001"'
check "F-03  inspect → inspection_notes populated" "$R" '"inspection_notes":"'
check "F-04  inspect → budget_remaining=4.9" "$R" '"budget_remaining":4\.9'

# F-05  inspect again on same task → reward=0.0 (no double-dip)
check "F-05  second inspect → reward=0.0" \
  "$(step '{"action_type":"inspect","transaction_id":"TXN-E001"}')" \
  '"reward":0\.0'

# Advance to EASY-002
step '{"action_type":"approve","transaction_id":"TXN-E001"}' > /dev/null

# F-06  request_docs → reward=0.15, docs_notes populated
R=$(step '{"action_type":"request_docs","transaction_id":"TXN-E002"}')
check "F-06  request_docs → reward=0.15" "$R" '"reward":0\.15'
check "F-07  request_docs → docs_notes populated" "$R" '"docs_notes":"'
check "F-08  request_docs → budget_remaining=4.6 (cost=0.2)" "$R" '"budget_remaining":4\.6'

# F-09  request_docs again → reward=0.0
check "F-09  second request_docs → reward=0.0" \
  "$(step '{"action_type":"request_docs","transaction_id":"TXN-E002"}')" \
  '"reward":0\.0'

step '{"action_type":"reject","transaction_id":"TXN-E002"}' > /dev/null  # advance

# F-10  verify_kyc → reward=0.15, kyc_notes populated
R=$(step '{"action_type":"verify_kyc","transaction_id":"TXN-E003"}')
check "F-10  verify_kyc → reward=0.15" "$R" '"reward":0\.15'
check "F-11  verify_kyc → kyc_notes populated" "$R" '"kyc_notes":"'
check "F-12  verify_kyc → budget cost=0.2 deducted" "$R" '"budget_remaining":4\.[0-9]'

step '{"action_type":"approve","transaction_id":"TXN-E003"}' > /dev/null

# F-13  contact_sender → reward=0.15, contact_notes populated
R=$(step '{"action_type":"contact_sender","transaction_id":"TXN-E004"}')
check "F-13  contact_sender → reward=0.15" "$R" '"reward":0\.15'
check "F-14  contact_sender → contact_notes populated" "$R" '"contact_notes":"'
check "F-15  contact_sender → budget cost=0.3 deducted" "$R" '"budget_remaining":3\.[0-9]'

step '{"action_type":"flag","transaction_id":"TXN-E004"}' > /dev/null

# F-16  file_sar → reward=0.15, docs_notes mentions SAR
# Four tasks have been completed, so the episode is on EASY-005 here; the
# request names that task's transaction like every other step in this file.
# NOTE(review): this used to send TXN-M001 — confirm the server does not rely
# on the transaction id to select the task.
R=$(step '{"action_type":"file_sar","transaction_id":"TXN-E005"}')
check "F-16  file_sar → reward=0.15" "$R" '"reward":0\.15'
check "F-17  file_sar → docs_notes mentions SAR" "$R" '"docs_notes":"SAR'
check "F-18  file_sar → budget cost=0.05 deducted" "$R" '"budget_remaining":3\.[0-9]'

# ═══════════════════════════════════════════════════════════
# GROUP G — /state endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP G — State endpoint"

# Snapshot of the episode left behind by the previous group (no reset here).
STATE="$(curl -s "$BASE/state")"
check "G-01  GET /state → episode_id set" "$STATE" '"episode_id":"'
check "G-02  GET /state → step_count > 0" "$STATE" '"step_count":[1-9]'
# Accepts 0.x with a non-zero digit, or any value starting with 1-9; plain 0 fails.
check "G-03  GET /state → budget_spent > 0" "$STATE" '"budget_spent":(0\.0*[1-9]|[1-9])'
check "G-04  GET /state → investigation_actions_used list" "$STATE" '"investigation_actions_used":\['
check "G-05  GET /state → recent_decisions list" "$STATE" '"recent_decisions":\['
check "G-06  GET /state → correct_decisions >= 0" "$STATE" '"correct_decisions":[0-9]'

# ═══════════════════════════════════════════════════════════
# GROUP H — /grader endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP H — Grader endpoint"

# Shape-only checks: each one looks for a key in the grader response,
# not for a particular value.
GRADER="$(curl -s "$BASE/grader")"
check "H-01  GET /grader → normalised_score present" "$GRADER" '"normalised_score":'
check "H-02  GET /grader → total_reward present" "$GRADER" '"total_reward":'
check "H-03  GET /grader → budget_spent present" "$GRADER" '"budget_spent":'
check "H-04  GET /grader → budget_penalty present" "$GRADER" '"budget_penalty":'
check "H-05  GET /grader → per_task array" "$GRADER" '"per_task":\['
check "H-06  GET /grader → reward_breakdown in per_task" "$GRADER" '"reward_breakdown":'
check "H-07  GET /grader → passed field present" "$GRADER" '"passed":'

# ═══════════════════════════════════════════════════════════
# GROUP I — /replay endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP I — /replay endpoint (offline eval)"

# I-01  Replay with all 30 correct terminal actions (no investigation)
#        Hard/critical tasks should be penalised (×0.80) → 0.7–0.9 range
# Task order: EASY×6, MED×8, HARD×10, CRIT×6
# Correct actions:
#   EASY: approve reject approve flag approve flag
#   MED:  escalate hold flag flag hold escalate hold flag
#   HARD: escalate reject reject approve escalate flag reject reject escalate reject
#   CRIT: approve reject escalate reject reject escalate
REPLAY_PERFECT='{"actions":["approve","reject","approve","flag","approve","flag",
  "escalate","hold","flag","flag","hold","escalate","hold","flag",
  "escalate","reject","reject","approve","escalate","flag","reject","reject","escalate","reject",
  "approve","reject","escalate","reject","reject","escalate"]}'
R=$(curl -s -X POST "$BASE/replay" \
    -H "Content-Type: application/json" \
    -d "$REPLAY_PERFECT")
check "I-01  Replay correct terminals (no investigation) → score<1.0 (hard/critical penalised)" "$R" '"normalised_score":0\.[7-9]'
check "I-02  Replay → passed=true (score>0.5 despite penalty)" "$R" '"passed":true'
check "I-03  Replay → budget_spent=0.0 (no inv actions)" "$R" '"budget_spent":0\.0'

# I-04  Replay with investigation actions included (inspect before first task)
REPLAY_WITH_INV='{"actions":["inspect","approve","reject","approve","flag","approve","flag",
  "escalate","hold","flag","flag","hold","escalate","hold","flag",
  "escalate","reject","reject","approve","escalate","flag","reject","reject","escalate","reject",
  "approve","reject","escalate","reject","reject","escalate"]}'
R=$(curl -s -X POST "$BASE/replay" \
    -H "Content-Type: application/json" \
    -d "$REPLAY_WITH_INV")
check "I-04  Replay with inspect → budget_spent=0.1" "$R" '"budget_spent":0\.1'

# I-05  Replay with invalid action → 422
#        (only the response body is inspected; curl -s does not expose the status)
check "I-05  Replay invalid action → 422 error" \
  "$(curl -s -X POST "$BASE/replay" \
      -H 'Content-Type: application/json' \
      -d '{"actions":["delete","approve"]}')" \
  "Invalid action"

# I-06  Replay result has per_task breakdown
#        ($R still holds the I-04 response; I-05 did not overwrite it)
check "I-06  Replay → per_task array present" "$R" '"per_task":\['
check "I-07  Replay → reward_breakdown in each task" "$R" '"reward_breakdown":'

# I-08  Replay with confidences
REPLAY_CONF='{"actions":["approve","reject"],"confidences":[0.95,0.90]}'
check "I-08  Replay with confidences → ok" \
  "$(curl -s -X POST "$BASE/replay" \
      -H 'Content-Type: application/json' \
      -d "$REPLAY_CONF")" \
  '"normalised_score":'

# ═══════════════════════════════════════════════════════════
# GROUP J — /baseline endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP J — Baseline agent"

# One body-less POST; all J-* checks read the same response.
BASELINE="$(curl -s -X POST "$BASE/baseline")"
check "J-01  POST /baseline → normalised_score present" "$BASELINE" '"normalised_score":'
check "J-02  POST /baseline → total_reward present" "$BASELINE" '"total_reward":'
check "J-03  POST /baseline → steps > 0" "$BASELINE" '"steps":[1-9]'
check "J-04  POST /baseline → per_task scores present" "$BASELINE" '"scores":\['
# Passes for 1.0 or any 0.5–0.9x score.
check "J-05  POST /baseline → score >= 0.5 (passes)" "$BASELINE" '"normalised_score":(1\.0|0\.[5-9])'

# ═══════════════════════════════════════════════════════════
# GROUP K — /analytics endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP K — Analytics endpoint"

# Run a full episode first so analytics has data.
# BASE is not exported, so it is handed to the Python driver through a
# per-command environment assignment.
reset
BASE="$BASE" python3 - <<'PYEOF'
import os, ssl, urllib.request, json
BASE = os.environ.get("BASE", "http://localhost:8000")
# NOTE(review): certificate and hostname verification are switched off,
# presumably so self-signed https targets can be tested. Do not reuse this
# context outside the test suite.
_ssl = ssl.create_default_context()
_ssl.check_hostname = False
_ssl.verify_mode = ssl.CERT_NONE
def post(path, body=None):
    # POST `body` as JSON (no payload when body is falsy) and decode the reply.
    req = urllib.request.Request(f"{BASE}{path}",
        data=json.dumps(body).encode() if body else None,
        headers={"Content-Type": "application/json"}, method="POST")
    with urllib.request.urlopen(req, context=_ssl) as r: return json.loads(r.read())

post("/reset", {"seed": 0})
# EASY x6 + MED x8 + HARD x10: no chain gate, terminal only
for action, txn in [
  ("approve","TXN-E001"),("reject","TXN-E002"),("approve","TXN-E003"),("flag","TXN-E004"),
  ("approve","TXN-E005"),("flag","TXN-E006"),
  ("escalate","TXN-M001"),("hold","TXN-M002"),("flag","TXN-M003"),("flag","TXN-M004"),
  ("hold","TXN-M005"),("escalate","TXN-M006"),("hold","TXN-M007"),("flag","TXN-M008"),
  ("escalate","TXN-H001"),("reject","TXN-H002"),("reject","TXN-H003"),("approve","TXN-H004"),
  ("escalate","TXN-H005"),("flag","TXN-H006"),("reject","TXN-H007"),("reject","TXN-H008"),
  ("escalate","TXN-H009"),("reject","TXN-H010"),
]:
    post("/step", {"action_type": action, "transaction_id": txn})
# CRIT x6: chain-gated — must provide chain_min investigation steps first
post("/step", {"action_type": "inspect",       "transaction_id": "TXN-C001"})
post("/step", {"action_type": "request_docs",  "transaction_id": "TXN-C001"})
post("/step", {"action_type": "approve",       "transaction_id": "TXN-C001"})
post("/step", {"action_type": "inspect",       "transaction_id": "TXN-C002"})
post("/step", {"action_type": "reject",        "transaction_id": "TXN-C002"})
post("/step", {"action_type": "request_docs",  "transaction_id": "TXN-C003"})
post("/step", {"action_type": "inspect",       "transaction_id": "TXN-C003"})
post("/step", {"action_type": "escalate",      "transaction_id": "TXN-C003"})
post("/step", {"action_type": "inspect",       "transaction_id": "TXN-C004"})
post("/step", {"action_type": "reject",        "transaction_id": "TXN-C004"})
post("/step", {"action_type": "inspect",       "transaction_id": "TXN-C005"})
post("/step", {"action_type": "verify_kyc",    "transaction_id": "TXN-C005"})
post("/step", {"action_type": "reject",        "transaction_id": "TXN-C005"})
post("/step", {"action_type": "inspect",       "transaction_id": "TXN-C006"})
post("/step", {"action_type": "escalate",      "transaction_id": "TXN-C006"})
PYEOF

# If the driver above fails (non-2xx reply, python3 missing) its error goes to
# the terminal and the checks below still run against whatever data exists.
ANA="$(curl -s "$BASE/analytics")"
check "K-01  GET /analytics → episodes_completed >= 1" "$ANA" '"episodes_completed":[1-9]'
check "K-02  GET /analytics → best_score present" "$ANA" '"best_score":'
check "K-03  GET /analytics → avg_score present" "$ANA" '"avg_score":'
check "K-04  GET /analytics → avg_budget_spent present" "$ANA" '"avg_budget_spent":'
check "K-05  GET /analytics → current_episode present" "$ANA" '"current_episode":'
check "K-06  GET /analytics → by_difficulty present" "$ANA" '"by_difficulty":'
check "K-07  GET /analytics → easy accuracy" "$ANA" '"easy":'
check "K-08  GET /analytics → critical accuracy" "$ANA" '"critical":'

# ═══════════════════════════════════════════════════════════
# GROUP L — /leaderboard endpoint
# ═══════════════════════════════════════════════════════════
section "GROUP L — Leaderboard endpoint"

# L-01 expects at least one recorded entry, so this group depends on an
# episode having been completed earlier in the run.
LB="$(curl -s "$BASE/leaderboard")"
check "L-01  GET /leaderboard → count >= 1 (episode recorded)" "$LB" '"count":[1-9]'
check "L-02  GET /leaderboard → entries array present" "$LB" '"entries":\['
check "L-03  GET /leaderboard → episode_id in entry" "$LB" '"episode_id":"'
check "L-04  GET /leaderboard → normalised_score in entry" "$LB" '"normalised_score":'
check "L-05  GET /leaderboard → timestamp in entry" "$LB" '"timestamp":"'
check "L-06  GET /leaderboard → budget_spent in entry" "$LB" '"budget_spent":'

# ═══════════════════════════════════════════════════════════
# GROUP M — Perfect episode score
# ═══════════════════════════════════════════════════════════
section "GROUP M — Perfect episode (all 30 correct + investigation on hard/critical)"

# BASE is not exported, so it is handed to the Python driver through a
# per-command environment assignment.
reset
BASE="$BASE" python3 - <<'PYEOF'
import os, ssl, urllib.request, json
BASE = os.environ.get("BASE", "http://localhost:8000")
# NOTE(review): certificate and hostname verification are switched off,
# presumably so self-signed https targets can be tested. Do not reuse this
# context outside the test suite.
_ssl = ssl.create_default_context()
_ssl.check_hostname = False
_ssl.verify_mode = ssl.CERT_NONE
def post(path, body=None):
    # POST `body` as JSON (no payload when body is falsy) and decode the reply.
    req = urllib.request.Request(f"{BASE}{path}",
        data=json.dumps(body).encode() if body else None,
        headers={"Content-Type": "application/json"}, method="POST")
    with urllib.request.urlopen(req, context=_ssl) as r: return json.loads(r.read())

post("/reset", {"seed": 0})
# Easy x6 + Medium x8: terminal only
for action, txn in [
  ("approve","TXN-E001"),("reject","TXN-E002"),("approve","TXN-E003"),("flag","TXN-E004"),
  ("approve","TXN-E005"),("flag","TXN-E006"),
  ("escalate","TXN-M001"),("hold","TXN-M002"),("flag","TXN-M003"),("flag","TXN-M004"),
  ("hold","TXN-M005"),("escalate","TXN-M006"),("hold","TXN-M007"),("flag","TXN-M008"),
]:
    post("/step", {"action_type": action, "transaction_id": txn})
# Hard x10: at least one investigation sub-action before each terminal
post("/step", {"action_type": "inspect",         "transaction_id": "TXN-H001"})
post("/step", {"action_type": "escalate",        "transaction_id": "TXN-H001"})
post("/step", {"action_type": "contact_sender",  "transaction_id": "TXN-H002"})
post("/step", {"action_type": "reject",          "transaction_id": "TXN-H002"})
post("/step", {"action_type": "inspect",         "transaction_id": "TXN-H003"})
post("/step", {"action_type": "reject",          "transaction_id": "TXN-H003"})
post("/step", {"action_type": "inspect",         "transaction_id": "TXN-H004"})
post("/step", {"action_type": "approve",         "transaction_id": "TXN-H004"})
post("/step", {"action_type": "inspect",         "transaction_id": "TXN-H005"})
post("/step", {"action_type": "contact_sender",  "transaction_id": "TXN-H005"})
post("/step", {"action_type": "escalate",        "transaction_id": "TXN-H005"})
post("/step", {"action_type": "inspect",         "transaction_id": "TXN-H006"})
post("/step", {"action_type": "flag",            "transaction_id": "TXN-H006"})
post("/step", {"action_type": "contact_sender",  "transaction_id": "TXN-H007"})
post("/step", {"action_type": "reject",          "transaction_id": "TXN-H007"})
post("/step", {"action_type": "contact_sender",  "transaction_id": "TXN-H008"})
post("/step", {"action_type": "reject",          "transaction_id": "TXN-H008"})
post("/step", {"action_type": "verify_kyc",      "transaction_id": "TXN-H009"})
post("/step", {"action_type": "escalate",        "transaction_id": "TXN-H009"})
post("/step", {"action_type": "contact_sender",  "transaction_id": "TXN-H010"})
post("/step", {"action_type": "reject",          "transaction_id": "TXN-H010"})
# Critical x6: required investigation sub-actions before each terminal
post("/step", {"action_type": "inspect",         "transaction_id": "TXN-C001"})
post("/step", {"action_type": "request_docs",    "transaction_id": "TXN-C001"})
post("/step", {"action_type": "approve",         "transaction_id": "TXN-C001"})
post("/step", {"action_type": "inspect",         "transaction_id": "TXN-C002"})
post("/step", {"action_type": "reject",          "transaction_id": "TXN-C002"})
post("/step", {"action_type": "request_docs",    "transaction_id": "TXN-C003"})
post("/step", {"action_type": "inspect",         "transaction_id": "TXN-C003"})
post("/step", {"action_type": "escalate",        "transaction_id": "TXN-C003"})
post("/step", {"action_type": "inspect",         "transaction_id": "TXN-C004"})
post("/step", {"action_type": "contact_sender",  "transaction_id": "TXN-C004"})
post("/step", {"action_type": "reject",          "transaction_id": "TXN-C004"})
post("/step", {"action_type": "inspect",         "transaction_id": "TXN-C005"})
post("/step", {"action_type": "verify_kyc",      "transaction_id": "TXN-C005"})
post("/step", {"action_type": "reject",          "transaction_id": "TXN-C005"})
post("/step", {"action_type": "request_docs",    "transaction_id": "TXN-C006"})
post("/step", {"action_type": "escalate",        "transaction_id": "TXN-C006"})
PYEOF

# Grade the episode the driver just played.
GRADER="$(curl -s "$BASE/grader")"
check "M-01  Perfect episode → normalised_score=1.0" "$GRADER" '"normalised_score":1\.0'
check "M-02  Perfect episode → passed=true" "$GRADER" '"passed":true'
check "M-03  Perfect episode → budget_penalty=0.0" "$GRADER" '"budget_penalty":0\.0'

# ═══════════════════════════════════════════════════════════
# GROUP N — Difficulty weighting
# ═══════════════════════════════════════════════════════════
section "GROUP N — Difficulty weighting in grader"

# A critical task correct should contribute more weight than easy
# (30 terminal actions, no investigation sub-actions).
REPLAY_CRIT='{"actions":["approve","reject","approve","flag","approve","flag",
  "escalate","hold","flag","flag","hold","escalate","hold","flag",
  "escalate","reject","reject","approve","escalate","flag","reject","reject","escalate","reject",
  "approve","reject","escalate","reject","reject","escalate"]}'

FULL=$(curl -s -X POST "$BASE/replay" \
    -H "Content-Type: application/json" \
    -d "$REPLAY_CRIT")
check "N-01  Full correct replay → max_possible_reward includes weights" \
  "$FULL" '"max_possible_reward":4[0-9]\.'
# N-02 compares the two totals numerically in Python: prints DIFF when
# total_reward is more than 0.01 below max_possible_reward, EQUAL otherwise.
# If $FULL is not valid JSON, Python prints nothing to stdout and the check fails.
check "N-02  Correct-but-no-investigation → total_reward below max (penalty applied)" \
  "$(python3 -c "
import sys,json
d=json.load(sys.stdin)
print('DIFF' if d['total_reward'] < d['max_possible_reward'] - 0.01 else 'EQUAL')
" <<<"$FULL")" "DIFF"

# ═══════════════════════════════════════════════════════════
# GROUP O — Budget mechanics
# ═══════════════════════════════════════════════════════════
section "GROUP O — Budget mechanics"

reset

# Burn through budget with contact_sender (cost=0.3) × 5 = 1.5
# then request_docs × 5 = 1.0, then verify_kyc × 5 = 1.0  (total = 3.5 so far ≤5)
# Total exceeding: won't exceed in these few calls, test budget tracking
step '{"action_type":"contact_sender","transaction_id":"TXN-E001"}' > /dev/null  # -0.30
step '{"action_type":"contact_sender","transaction_id":"TXN-E001"}' > /dev/null  # dup, but cost still deducted
step '{"action_type":"request_docs","transaction_id":"TXN-E001"}' > /dev/null    # -0.20
step '{"action_type":"verify_kyc","transaction_id":"TXN-E001"}' > /dev/null      # -0.20
step '{"action_type":"file_sar","transaction_id":"TXN-E001"}' > /dev/null        # -0.05
R=$(step '{"action_type":"approve","transaction_id":"TXN-E001"}')
# budget_remaining = 5.0 - 0.30 - 0.30 - 0.20 - 0.20 - 0.05 = 3.95
check "O-01  Budget correctly deducted after multiple inv actions" \
  "$R" '"budget_remaining":3\.[0-9]'

# grader shows budget_spent
GRADER="$(curl -s $BASE/grader)"
check "O-02  Grader → budget_spent > 0" "$GRADER" '"budget_spent":[1-9]'

# replay that overshoots budget → budget_penalty > 0
HEAVY='{"actions":["contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","contact_sender","approve","reject","approve","flag","escalate","hold","flag","flag","hold","escalate","escalate","reject","reject","approve","escalate","flag","approve","reject","escalate","reject"]}'
R=$(curl -s -X POST $BASE/replay \
    -H "Content-Type: application/json" \
    -d "$HEAVY")
check "O-03  Over-budget replay → budget_penalty > 0" "$R" '"budget_penalty":0\.[0-9]*[1-9]'
check "O-04  Over-budget replay → budget_overspend > 0" "$R" '"budget_overspend":0\.[1-9]'

# ═══════════════════════════════════════════════════════════
# GROUP P — Edge cases & errors
# ═══════════════════════════════════════════════════════════
section "GROUP P — Edge cases & error handling"

# Only response bodies are inspected in this group; curl -s does not surface
# HTTP status codes, so the "→ 422" labels describe the expected server reply.

# P-01  First step right after a body-less reset, using an arbitrary
#        transaction id → the reply must still carry a reward
check "P-01  Step on fresh env (reset then done) → handled" \
  "$(curl -s -X POST "$BASE/reset" > /dev/null && curl -s -X POST "$BASE/step" \
      -H 'Content-Type: application/json' \
      -d '{"action_type":"approve","transaction_id":"TXN-NONE"}')" \
  '"reward"'

# P-02  Unknown action_type → 422
check "P-02  Unknown action_type → 422" \
  "$(curl -s -X POST "$BASE/step" \
      -H 'Content-Type: application/json' \
      -d '{"action_type":"nuke","transaction_id":"TXN-E001"}')" \
  "Invalid action_type"

# P-03  Missing action_type field → 422
check "P-03  Missing action_type field → error" \
  "$(curl -s -X POST "$BASE/step" \
      -H 'Content-Type: application/json' \
      -d '{"transaction_id":"TXN-E001"}')" \
  '"detail"'

# P-04  Empty replay actions list
check "P-04  Replay empty actions → returns score" \
  "$(curl -s -X POST "$BASE/replay" \
      -H 'Content-Type: application/json' \
      -d '{"actions":[]}')" \
  '"normalised_score"'

# P-05  Replay with confidences shorter than actions → still works
check "P-05  Replay with partial confidences → score present" \
  "$(curl -s -X POST "$BASE/replay" \
      -H 'Content-Type: application/json' \
      -d '{"actions":["approve","reject"],"confidences":[0.9]}')" \
  '"normalised_score"'

# P-06  Grader on the barely-started episode left by P-01 → a score or an
#        error object are both acceptable
check "P-06  Grader before actions → handled gracefully" \
  "$(curl -s "$BASE/grader")" \
  '"normalised_score"|"error"'

# P-07  Analytics immediately after a reset → either a message or data
reset
check "P-07  Analytics after fresh reset → message or data" \
  "$(curl -s "$BASE/analytics")" \
  '"message"|"episodes_completed"'

# ═══════════════════════════════════════════════════════════
# GROUP Q — WebSocket
# ═══════════════════════════════════════════════════════════
section "GROUP Q — WebSocket (requires wscat or python)"

# The probe is a plain HTTP GET against /ws; no WebSocket handshake is made.
# It prints exactly one token: WS_OK, WS_UNKNOWN:<detail> or WS_ERR:<detail>.
if command -v python3 &>/dev/null; then
  WS_RESULT=$(BASE="$BASE" python3 - <<'PYEOF'
import os, ssl, urllib.error, urllib.request

def looks_like_upgrade(msg):
    # Textual hints that the server wanted a protocol upgrade.
    return "Switching" in msg or "upgrade" in msg.lower()

try:
    BASE = os.environ.get("BASE", "http://localhost:8000")
    # NOTE(review): certificate and hostname verification are switched off,
    # presumably so self-signed https targets can be tested.
    _ssl = ssl.create_default_context()
    _ssl.check_hostname = False
    _ssl.verify_mode = ssl.CERT_NONE
    # ws upgrade check via HTTP (will get 426 Upgrade Required — proves endpoint exists)
    req = urllib.request.Request(f"{BASE}/ws")
    try:
        resp = urllib.request.urlopen(req, context=_ssl)
    except urllib.error.HTTPError as e:
        msg = str(e)
        # Compare the numeric status rather than searching the message, so
        # unrelated text such as "[Errno 101] Network is unreachable" is not
        # mistaken for a 101 response.
        # NOTE(review): 404 is accepted too — presumably because the server's
        # HTTP router answers 404 for a plain GET on a WebSocket-only route;
        # confirm, since it also lets a genuinely missing endpoint pass.
        if e.code in (101, 404, 426) or looks_like_upgrade(msg):
            print("WS_OK")
        else:
            print(f"WS_UNKNOWN:{msg[:60]}")
    except Exception as e:
        msg = str(e)
        if looks_like_upgrade(msg):
            print("WS_OK")
        else:
            print(f"WS_UNKNOWN:{msg[:60]}")
    else:
        # A plain 2xx reply is not an upgrade; report it so a failure is not silent.
        print(f"WS_UNKNOWN:HTTP {resp.status}")
except Exception as e:
    print(f"WS_ERR:{e}")
PYEOF
)
  check "Q-01  WS /ws endpoint exists (426 upgrade = correct)" \
    "$WS_RESULT" "WS_OK"
else
  # No interpreter available: count the test as skipped rather than failed.
  printf "  \033[33m-\033[0m  Q-01  WebSocket check skipped (no python3)\n"
  SKIP=$((SKIP+1))
fi

# ═══════════════════════════════════════════════════════════
# SUMMARY
# ═══════════════════════════════════════════════════════════
# Skipped tests are reported separately and are not part of TOTAL.
TOTAL=$((PASS+FAIL))
printf '\n%s\n' "╔══════════════════════════════════════════════════════╗"
if (( FAIL == 0 )); then
  printf "║  ✓  All %d tests passed" "$TOTAL"
else
  printf "║  Results: %d/%d passed, %d failed" "$PASS" "$TOTAL" "$FAIL"
fi
if (( SKIP > 0 )); then
  printf "  (%d skipped)" "$SKIP"
fi
# The leading newline terminates the result row, which was printed without one.
printf '\n%s\n\n' "╚══════════════════════════════════════════════════════╝"

# Last command decides the script's exit status: zero only when nothing failed.
(( FAIL == 0 ))