ChargeBackOps / tests /test_arbitration.py
mitudrudutta's picture
feat: tighten EscalationROI, add ambiguous medium case, LLM note judge wrapper
e32a33b
"""Unit tests for scenarios.arbitration
These tests pin the three bands of the arbitration ruling (merchant-wins,
issuer-wins, deterministic coin flip) and the $250-per-side fee accounting so
a regression in the terminal-round math shows up before the end-to-end env
tests do.
"""
from __future__ import annotations
from dataclasses import replace
from scenarios.arbitration import (
ARB_FEE_PER_SIDE,
ARB_ISSUER_WIN_THRESHOLD,
ARB_MERCHANT_WIN_THRESHOLD,
ArbitrationOutcome,
_coin_flip_merchant_wins,
arbitration_ruling,
)
from scenarios.simulation import CaseProgress, get_task
# Shared module-level fixtures, loaded once at import time: the
# "goods_not_received_easy" task and its first case, which the ruling tests
# below pass to ``arbitration_ruling``.
_TASK = get_task("goods_not_received_easy")
_CASE = _TASK.cases[0]
def _progress(attached: list[str]) -> CaseProgress:
    """Return a fresh ``CaseProgress`` whose attached evidence ids copy *attached*."""
    progress = CaseProgress()
    # Store a new list so the caller's list is not shared with the progress object.
    progress.attached_evidence_ids = [*attached]
    return progress
def test_merchant_wins_on_strong_packet():
    """Required + 2 helpful → score 0.8 clears the 0.65 bar → MERCHANT_WINS."""
    evidence_ids = [
        "E1-ORDER-CONF",
        "E1-DELIVERY-SCAN",
        "E1-SIGNATURE",
        "E1-SUPPORT-ACK",
    ]
    verdict = arbitration_ruling(_CASE, _progress(evidence_ids))

    assert verdict.evidence_strength_score >= ARB_MERCHANT_WIN_THRESHOLD
    assert verdict.outcome == ArbitrationOutcome.MERCHANT_WINS
    assert verdict.arb_fee_per_side == ARB_FEE_PER_SIDE

    # Winning merchant is expected to net the disputed amount minus its own fee.
    expected_pnl = _CASE.amount - ARB_FEE_PER_SIDE
    assert verdict.merchant_net_pnl == expected_pnl
def test_issuer_wins_on_empty_packet():
    """Score 0 sits below the 0.35 floor → ISSUER_WINS, merchant eats amount + fee."""
    verdict = arbitration_ruling(_CASE, _progress([]))

    assert verdict.evidence_strength_score <= ARB_ISSUER_WIN_THRESHOLD
    assert verdict.outcome == ArbitrationOutcome.ISSUER_WINS

    # Losing merchant is expected to forfeit the disputed amount and pay its fee.
    expected_pnl = -_CASE.amount - ARB_FEE_PER_SIDE
    assert verdict.merchant_net_pnl == expected_pnl
def test_ambiguity_band_uses_deterministic_coin_flip():
    """Scores in (0.35, 0.65) map to a case_id-keyed coin flip — reproducible."""
    # Two helpful-only evidence ids → 0.4 band score, no required subset.
    helpful_only = _progress(["E1-SIGNATURE", "E1-SUPPORT-ACK"])
    first = arbitration_ruling(_CASE, helpful_only)
    second = arbitration_ruling(_CASE, helpful_only)

    # Ruling the same packet twice must give the same outcome.
    assert first.outcome == second.outcome
    assert (
        ARB_ISSUER_WIN_THRESHOLD
        < first.evidence_strength_score
        < ARB_MERCHANT_WIN_THRESHOLD
    )

    # Inside the band the outcome must follow the coin flip for this case_id.
    if _coin_flip_merchant_wins(_CASE.case_id):
        expected = ArbitrationOutcome.MERCHANT_WINS
    else:
        expected = ArbitrationOutcome.ISSUER_WINS
    assert first.outcome == expected
def test_coin_flip_varies_across_case_ids():
    """Changing only the case_id must change the coin-flip answer for some cases.

    If every case_id produced the same flip, the ambiguity band wouldn't
    actually be 50/50 across the benchmark — this test guards against that.
    """
    observed = set()
    for index in range(20):
        observed.add(_coin_flip_merchant_wins(f"CB-TEST-{index}"))

    # Both answers must show up somewhere among the 20 synthetic case ids.
    assert observed == {True, False}
def test_ruling_is_pure():
    """Same inputs, same outputs — required for reproducible benchmarks."""
    strong_packet = _progress(
        ["E1-ORDER-CONF", "E1-DELIVERY-SCAN", "E1-SIGNATURE", "E1-SUPPORT-ACK"]
    )
    baseline = arbitration_ruling(_CASE, strong_packet)
    repeat = arbitration_ruling(_CASE, strong_packet)
    assert baseline == repeat

    # A second case_id clone with identical evidence should give the same
    # MERCHANT_WINS outcome (score is above 0.65, so no coin-flip involved).
    clone_case = replace(_CASE, case_id="CB-CLONE-1")
    clone_ruling = arbitration_ruling(clone_case, strong_packet)
    assert clone_ruling.outcome == baseline.outcome
    assert clone_ruling.merchant_net_pnl == baseline.merchant_net_pnl