ChargeBackOps / scenarios /simulation.py
mitudrudutta's picture
feat: Implement wait_for_updates action for handling delayed cases and evidence
2dedffd
"""Internal task definitions and runtime types for ChargebackOps."""
from __future__ import annotations
from dataclasses import dataclass, field, replace
from typing import Literal
# Names of the synthetic merchant systems. Used as the keys of
# ``InternalCase.evidence_by_system`` and as ``InternalEvidence.source_system``.
SystemName = Literal["orders", "payment", "shipping", "support", "refunds", "risk"]
# Resolution strategies a case can declare as optimal, acceptable, or recommended.
StrategyName = Literal["contest", "accept_chargeback", "issue_refund"]
@dataclass(frozen=True)
class InternalEvidence:
    """Evidence item stored in a synthetic merchant system.

    Instances are immutable (``frozen=True``); derive modified copies with
    ``dataclasses.replace``.
    """

    # Identifier referenced by the owning case's ``*_evidence_ids`` tuples.
    evidence_id: str
    # Merchant system that holds this item.
    source_system: SystemName
    # Short label for the item.
    title: str
    # Free-text description of what the item shows.
    summary: str
    # Classification flags; every flag defaults to False.
    helpful: bool = False
    harmful: bool = False
    required: bool = False
@dataclass(frozen=True)
class InternalCase:
    """Synthetic chargeback case definition.

    Instances are immutable (``frozen=True``); derive variants with
    ``dataclasses.replace``. Note that ``evidence_by_system`` is a plain
    ``dict`` and is therefore still mutable in place.
    """

    # --- Identity and dispute facts ---
    case_id: str
    order_id: str
    customer_id: str
    amount: float
    currency: str
    reason_code: str
    summary: str
    inspection_notes: str
    # Step-based deadline for the case.
    # NOTE(review): presumably compared against the episode step counter — confirm in the runtime.
    deadline_step: int
    # --- Expected handling ---
    optimal_strategy: StrategyName
    # Additional strategies listed as acceptable; may be empty.
    acceptable_strategies: tuple[StrategyName, ...]
    policy_guidance: str
    # Must be a tuple of strings; a single requirement needs a trailing comma.
    policy_requirements: tuple[str, ...]
    recommended_strategy: StrategyName
    resolution_summary: str
    # Relative weight of the case.
    # NOTE(review): presumably used when aggregating scores — confirm in the grader.
    weight: float
    # --- Evidence ---
    # Evidence items grouped by the system that holds them.
    evidence_by_system: dict[SystemName, tuple[InternalEvidence, ...]]
    # Evidence ids grouped by classification; each defaults to an empty tuple.
    required_evidence_ids: tuple[str, ...] = ()
    helpful_evidence_ids: tuple[str, ...] = ()
    harmful_evidence_ids: tuple[str, ...] = ()
    # Card network metadata — mirrors real dispute identifiers
    card_network: str = "visa"
    network_reason_code: str = ""
    response_window_days: int = 30
    compelling_evidence_category: str = ""
    # Issuer-perceived complexity multiplier in (0, 1].
    # Lower values dampen evidence_strength_score so harder cases land in the
    # ambiguity band and exercise the multi-round dispute path.
    dispute_complexity: float = 1.0
    # Long-horizon backlog controls. Defaults keep existing tasks immediate.
    arrival_step: int = 0
    issuer_response_delay_steps: int = 0
    evidence_response_delay_steps: int = 0
    delayed_systems: tuple[SystemName, ...] = ()
@dataclass(frozen=True)
class TaskScenario:
    """One benchmark task.

    An immutable bundle of task metadata plus the cases that belong to it.
    """

    # Key under which the task is registered and looked up.
    task_id: str
    title: str
    difficulty: Literal["easy", "medium", "hard", "nightmare"]
    objective: str
    description: str
    # Step budget for the task.
    max_steps: int
    # Cases that make up the task.
    cases: tuple[InternalCase, ...]
@dataclass
class CaseProgress:
    """Mutable runtime state for one case.

    Every field has a default, so ``CaseProgress()`` yields a fresh, open
    case. Container fields use ``default_factory`` so instances never share
    a list, set, or dict.
    NOTE(review): the code that updates these fields lives outside this
    module; the field comments below are inferred from names and defaults —
    confirm against the runtime.
    """

    # --- Investigation progress ---
    inspected: bool = False
    policy_retrieved: bool = False
    revealed_systems: set[SystemName] = field(default_factory=set)
    retrieved_evidence_ids: set[str] = field(default_factory=set)
    attached_evidence_ids: list[str] = field(default_factory=list)
    # --- Resolution state ---
    current_strategy: StrategyName | None = None
    final_resolution: str | None = None
    resolution_status: str = "open"
    resolved_at_step: int | None = None
    # --- Counters and flags ---
    duplicate_queries: int = 0
    invalid_actions: int = 0
    submit_attempts: int = 0
    deadline_penalized: bool = False
    notes: list[str] = field(default_factory=list)
    representment_note: str | None = None
    # multi-round dispute lifecycle
    round_number: int = 1
    issuer_decisions: list[str] = field(default_factory=list)
    issuer_rationales: list[str] = field(default_factory=list)
    pre_arb_evidence_added: list[str] = field(default_factory=list)
    arbitration_outcome: str | None = None
    arb_fees_paid: float = 0.0
    final_economic_outcome: float | None = None
    # --- Pending (delayed) work; None / empty means nothing is pending ---
    pending_issuer_round_number: int | None = None
    pending_issuer_due_step: int | None = None
    merchant_submitted_at_step: int | None = None
    # Maps a system name to an int.
    # NOTE(review): presumably the step at which the delayed evidence becomes available — confirm.
    pending_evidence_systems: dict[SystemName, int] = field(default_factory=dict)
@dataclass
class ActionRecord:
    """Runtime action history.

    One record per action; all fields are required.
    """

    step_index: int
    action_type: str
    # ``None`` is allowed for actions that are not tied to a specific case.
    case_id: str | None
    outcome: str
    reward: float
def _ev(
    evidence_id: str,
    source_system: SystemName,
    title: str,
    summary: str,
    *,
    helpful: bool = False,
    harmful: bool = False,
    required: bool = False,
) -> InternalEvidence:
    """Shorthand constructor for an :class:`InternalEvidence` record.

    The four descriptive fields are positional; the classification flags are
    keyword-only and all default to ``False``.
    """
    flags = {"helpful": helpful, "harmful": harmful, "required": required}
    # Positional order matches the field order declared on InternalEvidence.
    return InternalEvidence(evidence_id, source_system, title, summary, **flags)
# Hand-authored benchmark tasks, keyed by ``task_id``. Each entry bundles one
# or more ``InternalCase`` definitions together with the evidence held by every
# synthetic merchant system.
TASKS: dict[str, TaskScenario] = {
    "goods_not_received_easy": TaskScenario(
        task_id="goods_not_received_easy",
        title="Delivered But Disputed",
        difficulty="easy",
        objective="Contest a goods-not-received chargeback with the right delivery proof before the deadline.",
        description=(
            "A single e-commerce dispute where carrier confirmation and the order confirmation "
            "are enough to win. The task teaches the standard representment loop."
        ),
        max_steps=10,
        cases=(
            InternalCase(
                case_id="CB-E1",
                order_id="ORD-7410",
                customer_id="CUST-1001",
                amount=129.99,
                currency="USD",
                reason_code="goods_not_received",
                summary="Cardholder claims the package never arrived.",
                inspection_notes=(
                    "Order shipped the same day. Merchant policy requires carrier proof plus the original order confirmation "
                    "for goods-not-received disputes."
                ),
                deadline_step=8,
                optimal_strategy="contest",
                acceptable_strategies=(),
                policy_guidance=(
                    "For goods-not-received disputes, prove the merchandise was fulfilled to the billed customer with "
                    "order confirmation and carrier delivery evidence."
                ),
                policy_requirements=(
                    "order confirmation",
                    "carrier delivery confirmation",
                ),
                recommended_strategy="contest",
                resolution_summary="Strong delivery proof exists. Contesting should recover the funds.",
                weight=1.0,
                required_evidence_ids=("E1-ORDER-CONF", "E1-DELIVERY-SCAN"),
                helpful_evidence_ids=(
                    "E1-SIGNATURE",
                    "E1-SUPPORT-ACK",
                ),
                harmful_evidence_ids=(),
                card_network="visa",
                network_reason_code="13.1",
                response_window_days=30,
                compelling_evidence_category="CE 3.5 — Merchandise Not Received",
                evidence_by_system={
                    "orders": (
                        _ev(
                            "E1-ORDER-CONF",
                            "orders",
                            "Order confirmation",
                            "Order confirmation email and checkout receipt showing the billed customer, shipping address, and SKU.",
                            helpful=True,
                            required=True,
                        ),
                    ),
                    "payment": (
                        _ev(
                            "E1-AUTH",
                            "payment",
                            "Authorization record",
                            "Authorization approved and captured successfully.",
                        ),
                    ),
                    "shipping": (
                        _ev(
                            "E1-DELIVERY-SCAN",
                            "shipping",
                            "Carrier delivery scan",
                            "Carrier tracking shows delivered to the customer address two days after shipment.",
                            helpful=True,
                            required=True,
                        ),
                        _ev(
                            "E1-SIGNATURE",
                            "shipping",
                            "Doorstep photo confirmation",
                            "Carrier stored a package photo at the delivery location.",
                            helpful=True,
                        ),
                    ),
                    "support": (
                        _ev(
                            "E1-SUPPORT-ACK",
                            "support",
                            "Support ticket acknowledgement",
                            "Customer contacted support to ask if the package was left at the front desk after delivery.",
                            helpful=True,
                        ),
                    ),
                    "refunds": (
                        _ev(
                            "E1-NO-REFUND",
                            "refunds",
                            "Refund ledger",
                            "No refund or goodwill credit was issued before the dispute opened.",
                        ),
                    ),
                    "risk": (
                        _ev(
                            "E1-RISK",
                            "risk",
                            "Risk summary",
                            "Low-risk order with no fraud flags.",
                        ),
                    ),
                },
            ),
        ),
    ),
    "fraud_signal_ambiguity": TaskScenario(
        task_id="fraud_signal_ambiguity",
        title="Fraud Signal Ambiguity",
        difficulty="easy",
        objective="Choose whether to contest a CNP fraud dispute and curate only the evidence that helps.",
        description=(
            "A card-not-present fraud dispute with mixed signals. Strong account-linkage evidence exists, "
            "but payment mismatch artifacts will hurt the case if attached."
        ),
        max_steps=10,
        cases=(
            InternalCase(
                case_id="CB-M1",
                order_id="ORD-8821",
                customer_id="CUST-2048",
                amount=480.0,
                currency="USD",
                reason_code="fraud_cnp",
                summary="Issuer filed a card-not-present fraud dispute on a high-value electronics order.",
                inspection_notes=(
                    "The order used a known account and device, but AVS/CVV mismatches were present. "
                    "Winning requires emphasizing customer-account linkage and avoiding mismatch artifacts."
                ),
                deadline_step=7,
                optimal_strategy="contest",
                acceptable_strategies=(),
                policy_guidance=(
                    "For CNP fraud disputes, contest only when you can link the cardholder to the account or device history. "
                    "Do not attach evidence that strengthens the issuer's fraud narrative."
                ),
                policy_requirements=(
                    "prior good order linkage",
                    "customer account confirmation",
                ),
                recommended_strategy="contest",
                resolution_summary="Contest with strong account-linkage evidence. Conceding this case forfeits defensible revenue.",
                weight=1.1,
                required_evidence_ids=("M1-PRIOR-ORDERS", "M1-ACCOUNT-CHAT"),
                helpful_evidence_ids=(
                    "M1-DELIVERY",
                    "M1-ORDER",
                    "M1-VELOCITY",
                ),
                harmful_evidence_ids=("M1-AVS-MISMATCH", "M1-CVV-MISMATCH"),
                card_network="visa",
                network_reason_code="10.4",
                response_window_days=30,
                compelling_evidence_category="CE 3.6 — Fraud, Card-Absent Environment",
                evidence_by_system={
                    "orders": (
                        _ev(
                            "M1-ORDER",
                            "orders",
                            "Order receipt",
                            "Checkout receipt showing customer account id, shipping address, and same email as prior purchases.",
                            helpful=True,
                        ),
                    ),
                    "payment": (
                        _ev(
                            "M1-AVS-MISMATCH",
                            "payment",
                            "AVS mismatch detail",
                            "Street-number mismatch was recorded at authorization time.",
                            harmful=True,
                        ),
                        _ev(
                            "M1-CVV-MISMATCH",
                            "payment",
                            "CVV mismatch detail",
                            "CVV did not fully match at authorization time.",
                            harmful=True,
                        ),
                        _ev(
                            "M1-AUTH",
                            "payment",
                            "Authorization capture",
                            "Payment was successfully authorized and captured.",
                        ),
                    ),
                    "shipping": (
                        _ev(
                            "M1-DELIVERY",
                            "shipping",
                            "Carrier delivery confirmation",
                            "Package was delivered to the saved customer address two days later.",
                            helpful=True,
                        ),
                    ),
                    "support": (
                        _ev(
                            "M1-ACCOUNT-CHAT",
                            "support",
                            "Authenticated support chat",
                            "Customer logged into the account and confirmed the delivery window in chat before shipment.",
                            helpful=True,
                            required=True,
                        ),
                    ),
                    "refunds": (
                        _ev(
                            "M1-NO-REFUND",
                            "refunds",
                            "Refund ledger",
                            "No refund or cancellation was issued prior to the dispute.",
                        ),
                    ),
                    "risk": (
                        _ev(
                            "M1-PRIOR-ORDERS",
                            "risk",
                            "Prior account activity",
                            "Same account, same device fingerprint, and three prior fulfilled orders without disputes.",
                            helpful=True,
                            required=True,
                        ),
                        _ev(
                            "M1-VELOCITY",
                            "risk",
                            "Velocity check",
                            "No abnormal velocity or proxy usage detected.",
                            helpful=True,
                        ),
                    ),
                },
            ),
        ),
    ),
    "queue_optimization_hard": TaskScenario(
        task_id="queue_optimization_hard",
        title="Dispute Queue Optimization",
        difficulty="hard",
        objective="Maximize recovery across a queue of disputes while respecting deadlines and avoiding weak contests.",
        description=(
            "A real operations queue with three disputes. Two should be actioned quickly, and one should be conceded. "
            "The step budget leaves little room for waste."
        ),
        max_steps=18,
        cases=(
            InternalCase(
                case_id="CB-H1",
                order_id="ORD-9901",
                customer_id="CUST-4100",
                amount=860.0,
                currency="USD",
                reason_code="goods_not_received",
                summary="High-value furniture delivery disputed as not received.",
                inspection_notes=(
                    "Carrier stored both a delivery scan and signature. This is the highest-value recoverable case in the queue."
                ),
                deadline_step=14,
                optimal_strategy="contest",
                acceptable_strategies=(),
                policy_guidance=(
                    "Use merchant receipt plus carrier proof for goods-not-received disputes. This case is strong if contested on time."
                ),
                policy_requirements=(
                    "order confirmation",
                    "signature-backed delivery proof",
                ),
                recommended_strategy="contest",
                resolution_summary="Contest immediately with the signature-backed delivery packet.",
                weight=1.7,
                required_evidence_ids=("H1-ORDER-CONF", "H1-SIGNATURE"),
                helpful_evidence_ids=(
                    "H1-DELIVERY-SCAN",
                ),
                harmful_evidence_ids=(),
                card_network="mastercard",
                network_reason_code="4855",
                response_window_days=45,
                compelling_evidence_category="Goods or Services Not Provided",
                dispute_complexity=0.60,
                evidence_by_system={
                    "orders": (
                        _ev(
                            "H1-ORDER-CONF",
                            "orders",
                            "Order invoice",
                            "Signed furniture order invoice with billing and delivery address.",
                            helpful=True,
                            required=True,
                        ),
                    ),
                    "payment": (
                        _ev(
                            "H1-AUTH",
                            "payment",
                            "Captured payment",
                            "Payment authorization and capture both succeeded.",
                        ),
                    ),
                    "shipping": (
                        _ev(
                            "H1-SIGNATURE",
                            "shipping",
                            "Delivery signature",
                            "Carrier recorded a recipient signature at the shipping address.",
                            helpful=True,
                            required=True,
                        ),
                        _ev(
                            "H1-DELIVERY-SCAN",
                            "shipping",
                            "Final-mile delivery scan",
                            "Tracking confirms delivery within the promised window.",
                            helpful=True,
                        ),
                    ),
                    "support": (
                        _ev(
                            "H1-SUPPORT",
                            "support",
                            "Support history",
                            "No delivery complaint was opened before the dispute.",
                        ),
                    ),
                    "refunds": (
                        _ev(
                            "H1-NO-REFUND",
                            "refunds",
                            "Refund ledger",
                            "No refund was issued.",
                        ),
                    ),
                    "risk": (
                        _ev(
                            "H1-RISK",
                            "risk",
                            "Risk summary",
                            "Low-risk order. No notable fraud flags.",
                        ),
                    ),
                },
            ),
            InternalCase(
                case_id="CB-H2",
                order_id="ORD-9902",
                customer_id="CUST-4101",
                amount=240.0,
                currency="USD",
                reason_code="fraud_cnp",
                summary="Apparel order disputed as unauthorized.",
                inspection_notes=(
                    "The account is new, there is no durable linkage to the cardholder, and the payment record contains mismatch artifacts. "
                    "This case should be conceded."
                ),
                deadline_step=14,
                optimal_strategy="accept_chargeback",
                acceptable_strategies=("issue_refund",),
                policy_guidance=(
                    "Do not contest when you lack durable account or device linkage. Avoid wasting steps on weak fraud disputes."
                ),
                # The trailing comma is required: without it the parentheses
                # only group a bare str, and iterating the field would yield
                # single characters instead of one requirement.
                policy_requirements=("cardholder linkage evidence",),
                recommended_strategy="accept_chargeback",
                resolution_summary="Concede the dispute. Contesting wastes portfolio value.",
                weight=0.8,
                required_evidence_ids=(),
                helpful_evidence_ids=(),
                harmful_evidence_ids=("H2-AVS", "H2-CVV"),
                card_network="mastercard",
                network_reason_code="4837",
                response_window_days=45,
                compelling_evidence_category="No Cardholder Authorization",
                evidence_by_system={
                    "orders": (
                        _ev(
                            "H2-ORDER",
                            "orders",
                            "Order receipt",
                            "Guest checkout with a new shipping address and no prior order history.",
                        ),
                    ),
                    "payment": (
                        _ev(
                            "H2-AVS",
                            "payment",
                            "AVS mismatch detail",
                            "Street and postal code mismatches were present.",
                            harmful=True,
                        ),
                        _ev(
                            "H2-CVV",
                            "payment",
                            "CVV mismatch detail",
                            "CVV did not match.",
                            harmful=True,
                        ),
                    ),
                    "shipping": (
                        _ev(
                            "H2-DELIVERY",
                            "shipping",
                            "Carrier delivery confirmation",
                            "Delivered to a new address without signature.",
                        ),
                    ),
                    "support": (
                        _ev(
                            "H2-SUPPORT",
                            "support",
                            "Support log",
                            "No authenticated support interactions were recorded.",
                        ),
                    ),
                    "refunds": (
                        _ev(
                            "H2-NO-REFUND",
                            "refunds",
                            "Refund ledger",
                            "No refund issued before the chargeback.",
                        ),
                    ),
                    "risk": (
                        _ev(
                            "H2-RISK",
                            "risk",
                            "Risk summary",
                            "Elevated risk score and no positive account history.",
                        ),
                    ),
                },
            ),
            InternalCase(
                case_id="CB-H3",
                order_id="ORD-9903",
                customer_id="CUST-4102",
                amount=320.0,
                currency="USD",
                reason_code="credit_not_processed",
                summary="Subscriber canceled before renewal and says the credit was never processed.",
                inspection_notes=(
                    "The merchant missed the promised refund SLA. This should be resolved fast with a refund, not a contest."
                ),
                deadline_step=4,
                optimal_strategy="issue_refund",
                acceptable_strategies=("accept_chargeback",),
                policy_guidance=(
                    "If the merchant failed to process a promised credit, refund immediately or concede. Contesting is not supportable."
                ),
                policy_requirements=(
                    "proof of cancellation request",
                    "refund status check",
                ),
                recommended_strategy="issue_refund",
                resolution_summary="Refund immediately. Delay turns a manageable loss into a deadline miss.",
                weight=1.2,
                required_evidence_ids=(),
                helpful_evidence_ids=("H3-CANCEL", "H3-NO-REFUND"),
                harmful_evidence_ids=(),
                card_network="visa",
                network_reason_code="13.6",
                response_window_days=30,
                compelling_evidence_category="CE 3.4 — Credit Not Processed",
                evidence_by_system={
                    "orders": (
                        _ev(
                            "H3-ORDER",
                            "orders",
                            "Renewal invoice",
                            "Subscription renewed automatically for the annual plan.",
                        ),
                    ),
                    "payment": (
                        _ev(
                            "H3-PAYMENT",
                            "payment",
                            "Captured renewal payment",
                            "Renewal payment settled successfully.",
                        ),
                    ),
                    "shipping": (),
                    "support": (
                        _ev(
                            "H3-CANCEL",
                            "support",
                            "Cancellation request",
                            "Customer requested cancellation before renewal and support promised a refund within five business days.",
                            helpful=True,
                        ),
                    ),
                    "refunds": (
                        _ev(
                            "H3-NO-REFUND",
                            "refunds",
                            "Refund ledger",
                            "No refund has been issued as of the dispute open date.",
                            helpful=True,
                        ),
                    ),
                    "risk": (),
                },
            ),
        ),
    ),
    "pre_arb_recovery_medium": TaskScenario(
        task_id="pre_arb_recovery_medium",
        title="Pre-Arbitration Recovery",
        difficulty="medium",
        objective=(
            "Win a goods-not-received dispute that requires recovering compelling "
            "evidence in round 2 instead of burning $250 on arbitration."
        ),
        description=(
            "Required evidence is split across orders and support. A round-1 packet "
            "from the default systems will fall short and the issuer will request "
            "compelling evidence. Querying support in round 2 unlocks the missing "
            "proof; jumping straight to arbitration concedes a $250 fee on a "
            "packet the issuer would have accepted."
        ),
        max_steps=12,
        cases=(
            InternalCase(
                case_id="CB-P1",
                order_id="ORD-7710",
                customer_id="CUST-3300",
                amount=700.0,
                currency="USD",
                reason_code="goods_not_received",
                summary=(
                    "Customer denies receipt of a $700 electronics order. "
                    "Authenticated support transcript proves delivery acknowledgement."
                ),
                inspection_notes=(
                    "The order was delivered, but the strongest acknowledgement lives "
                    "in the support transcript — not in the orders or shipping system. "
                    "A first-pass packet will be missing required evidence."
                ),
                deadline_step=10,
                optimal_strategy="contest",
                acceptable_strategies=(),
                policy_guidance=(
                    "Goods-not-received disputes need order confirmation plus a "
                    "delivery acknowledgement. If the support transcript is the only "
                    "delivery acknowledgement, attach it through the pre-arbitration "
                    "response — do not skip straight to arbitration."
                ),
                policy_requirements=(
                    "order confirmation",
                    "support delivery acknowledgement",
                ),
                recommended_strategy="contest",
                resolution_summary=(
                    "Recover the support acknowledgement in pre-arb. Escalating to "
                    "arbitration without it forfeits $250 on a winnable case."
                ),
                weight=1.3,
                required_evidence_ids=("P1-ORDER-CONF", "P1-SUPPORT-CONF"),
                helpful_evidence_ids=("P1-DELIVERY-SCAN", "P1-RISK-CLEAR"),
                harmful_evidence_ids=(),
                card_network="visa",
                network_reason_code="13.1",
                response_window_days=30,
                compelling_evidence_category="CE 3.5 — Merchandise Not Received",
                evidence_by_system={
                    "orders": (
                        _ev(
                            "P1-ORDER-CONF",
                            "orders",
                            "Order confirmation",
                            "Order receipt with billed customer, shipping address, and SKU.",
                            helpful=True,
                            required=True,
                        ),
                    ),
                    "payment": (
                        _ev(
                            "P1-AUTH",
                            "payment",
                            "Authorization capture",
                            "Authorization approved and captured cleanly.",
                        ),
                    ),
                    "shipping": (
                        _ev(
                            "P1-DELIVERY-SCAN",
                            "shipping",
                            "Carrier delivery scan",
                            "Carrier tracking shows the package delivered to the saved address.",
                            helpful=True,
                        ),
                    ),
                    "support": (
                        _ev(
                            "P1-SUPPORT-CONF",
                            "support",
                            "Authenticated support acknowledgement",
                            "Customer logged in and confirmed receipt of the package in chat the next day.",
                            helpful=True,
                            required=True,
                        ),
                    ),
                    "refunds": (
                        _ev(
                            "P1-NO-REFUND",
                            "refunds",
                            "Refund ledger",
                            "No refund or goodwill credit was issued before the dispute opened.",
                        ),
                    ),
                    "risk": (
                        _ev(
                            "P1-RISK-CLEAR",
                            "risk",
                            "Risk summary",
                            "Account has clean device fingerprint and prior fulfilled orders.",
                            helpful=True,
                        ),
                    ),
                },
            ),
        ),
    ),
}
def _rewrite_case_ids(case: InternalCase, new_case_id: str) -> InternalCase:
    """Return a copy of *case* renamed to *new_case_id* with namespaced evidence.

    Every evidence id becomes ``"<new_case_id>-<old id>"`` so that clones of
    the same blueprint never share evidence ids. The order and customer ids
    are rebuilt from the last two characters of *new_case_id*. The input case
    is left untouched.

    Raises:
        KeyError: if a required/helpful/harmful id on *case* does not belong
            to any item in ``case.evidence_by_system``.
    """
    systems = case.evidence_by_system

    # Old evidence id -> case-local evidence id, for every item the case owns.
    id_map = {
        item.evidence_id: f"{new_case_id}-{item.evidence_id}"
        for items in systems.values()
        for item in items
    }

    rewritten_systems = {
        system_name: tuple(
            replace(item, evidence_id=id_map[item.evidence_id]) for item in items
        )
        for system_name, items in systems.items()
    }

    def _remap(evidence_ids):
        # Plain indexing on purpose: an unknown id must surface as KeyError.
        return tuple(id_map[eid] for eid in evidence_ids)

    suffix = new_case_id[-2:]
    return replace(
        case,
        case_id=new_case_id,
        order_id=f"ORD-LH-{suffix}",
        customer_id=f"CUST-LH-{suffix}",
        evidence_by_system=rewritten_systems,
        required_evidence_ids=_remap(case.required_evidence_ids),
        helpful_evidence_ids=_remap(case.helpful_evidence_ids),
        harmful_evidence_ids=_remap(case.harmful_evidence_ids),
    )
def _marathon_case(
    base: InternalCase,
    *,
    case_id: str,
    amount: float,
    arrival_step: int,
    deadline_step: int,
    weight: float,
    issuer_delay: int = 0,
    evidence_delay: int = 0,
    delayed_systems: tuple[SystemName, ...] = (),
    dispute_complexity: float | None = None,
    summary_prefix: str = "",
) -> InternalCase:
    """Clone *base* into a backlog case with its own ids, economics and timing.

    The blueprint is first re-identified via ``_rewrite_case_ids`` and then
    overridden with the supplied amount, deadline, weight, arrival step and
    delay settings. ``dispute_complexity=None`` keeps the blueprint's value,
    and an empty ``summary_prefix`` leaves the summary unchanged.
    """
    clone = _rewrite_case_ids(base, case_id)

    if summary_prefix:
        summary = f"{summary_prefix}{clone.summary}"
    else:
        summary = clone.summary

    if dispute_complexity is None:
        complexity = clone.dispute_complexity
    else:
        complexity = dispute_complexity

    return replace(
        clone,
        amount=amount,
        summary=summary,
        deadline_step=deadline_step,
        weight=weight,
        arrival_step=arrival_step,
        issuer_response_delay_steps=issuer_delay,
        evidence_response_delay_steps=evidence_delay,
        delayed_systems=delayed_systems,
        dispute_complexity=complexity,
    )
def _build_monthly_dispute_backlog_marathon() -> TaskScenario:
    """Assemble the wave-based, 60-step dispute backlog task (Theme #2 flagship).

    The scenario stays inside the ChargebackOps domain but stretches the
    planning horizon: twelve cases are cloned from the hand-authored
    blueprints in ``TASKS`` and arrive in waves, some evidence systems answer
    asynchronously, and issuer reviews come back several steps after
    submission. A locally greedy policy can still solve single cases but
    loses portfolio value when it forgets pending work or ignores upcoming
    deadlines.
    """
    easy_delivery = TASKS["goods_not_received_easy"].cases[0]
    fraud_strong = TASKS["fraud_signal_ambiguity"].cases[0]
    pre_arb = TASKS["pre_arb_recovery_medium"].cases[0]
    hard_delivery, weak_fraud, refund_due = TASKS["queue_optimization_hard"].cases

    # One (blueprint, overrides) row per backlog case, in arrival order.
    # Keys left out of a row fall back to the defaults of ``_marathon_case``.
    wave_plan = (
        (
            refund_due,
            dict(
                case_id="CB-L01",
                amount=320.0,
                arrival_step=0,
                deadline_step=6,
                weight=1.2,
                summary_prefix="[Wave 1 urgent refund] ",
            ),
        ),
        (
            easy_delivery,
            dict(
                case_id="CB-L02",
                amount=430.0,
                arrival_step=0,
                deadline_step=20,
                weight=1.1,
                issuer_delay=3,
                delayed_systems=("shipping",),
                evidence_delay=2,
                dispute_complexity=0.78,
                summary_prefix="[Wave 1 delayed carrier file] ",
            ),
        ),
        (
            fraud_strong,
            dict(
                case_id="CB-L03",
                amount=880.0,
                arrival_step=0,
                deadline_step=24,
                weight=1.6,
                issuer_delay=4,
                delayed_systems=("risk",),
                evidence_delay=2,
                dispute_complexity=0.70,
                summary_prefix="[Wave 1 high-value CNP] ",
            ),
        ),
        (
            weak_fraud,
            dict(
                case_id="CB-L04",
                amount=240.0,
                arrival_step=0,
                deadline_step=14,
                weight=0.8,
                summary_prefix="[Wave 1 weak fraud] ",
            ),
        ),
        (
            pre_arb,
            dict(
                case_id="CB-L05",
                amount=700.0,
                arrival_step=8,
                deadline_step=34,
                weight=1.4,
                issuer_delay=3,
                delayed_systems=("support",),
                evidence_delay=2,
                dispute_complexity=0.68,
                summary_prefix="[Wave 2 pre-arb recovery] ",
            ),
        ),
        (
            hard_delivery,
            dict(
                case_id="CB-L06",
                amount=1200.0,
                arrival_step=10,
                deadline_step=40,
                weight=1.9,
                issuer_delay=5,
                delayed_systems=("shipping", "support"),
                evidence_delay=2,
                dispute_complexity=0.62,
                summary_prefix="[Wave 2 enterprise furniture] ",
            ),
        ),
        (
            refund_due,
            dict(
                case_id="CB-L07",
                amount=180.0,
                arrival_step=15,
                deadline_step=25,
                weight=0.9,
                summary_prefix="[Wave 2 small refund SLA] ",
            ),
        ),
        (
            fraud_strong,
            dict(
                case_id="CB-L08",
                amount=980.0,
                arrival_step=18,
                deadline_step=45,
                weight=1.7,
                issuer_delay=4,
                delayed_systems=("support", "risk"),
                evidence_delay=3,
                dispute_complexity=0.65,
                summary_prefix="[Wave 3 returning-customer CNP] ",
            ),
        ),
        (
            weak_fraud,
            dict(
                case_id="CB-L09",
                amount=310.0,
                arrival_step=25,
                deadline_step=38,
                weight=0.8,
                summary_prefix="[Wave 3 weak guest checkout] ",
            ),
        ),
        (
            easy_delivery,
            dict(
                case_id="CB-L10",
                amount=560.0,
                arrival_step=30,
                deadline_step=55,
                weight=1.2,
                issuer_delay=3,
                delayed_systems=("shipping",),
                evidence_delay=2,
                dispute_complexity=0.74,
                summary_prefix="[Wave 4 delivery proof] ",
            ),
        ),
        (
            pre_arb,
            dict(
                case_id="CB-L11",
                amount=750.0,
                arrival_step=35,
                deadline_step=58,
                weight=1.5,
                issuer_delay=3,
                delayed_systems=("support",),
                evidence_delay=2,
                dispute_complexity=0.66,
                summary_prefix="[Wave 4 support acknowledgement] ",
            ),
        ),
        (
            refund_due,
            dict(
                case_id="CB-L12",
                amount=90.0,
                arrival_step=42,
                deadline_step=52,
                weight=0.6,
                summary_prefix="[Wave 5 low-value refund] ",
            ),
        ),
    )
    backlog = tuple(
        _marathon_case(blueprint, **overrides) for blueprint, overrides in wave_plan
    )

    return TaskScenario(
        task_id="monthly_dispute_backlog_marathon",
        title="Monthly Dispute Backlog Marathon",
        difficulty="nightmare",
        objective=(
            "Manage a 60-step month-end chargeback backlog with wave arrivals, "
            "delayed evidence, delayed issuer responses, and arbitration ROI."
        ),
        description=(
            "A professional long-horizon backlog: twelve disputes arrive across "
            "the episode. The agent must remember pending issuer reviews, revisit "
            "delayed evidence, triage urgent refunds, contest positive-EV cases, "
            "and avoid spending $250 arbitration fees on weak packets."
        ),
        max_steps=60,
        cases=backlog,
    )
# Register the marathon at import time. It is added after the literal above
# because its builder clones cases from the hand-authored TASKS entries.
TASKS["monthly_dispute_backlog_marathon"] = _build_monthly_dispute_backlog_marathon()
def get_task(task_id: str) -> TaskScenario:
    """Resolve *task_id* to a task, building it on demand for derived ids.

    Lookup order:

    1. a key of ``TASKS``;
    2. ``generated_{difficulty}_s{seed}`` — produced by the case generator;
    3. ``iso_{difficulty}_{index}`` — built from ISO rows shuffled with a
       fixed seed;
    4. ``stripe_{difficulty}_{index}`` — built from fetched Stripe disputes.

    An ISO or Stripe id whose builder yields no task falls through to the
    error below.

    Raises:
        ValueError: if no source can provide a task for *task_id*.
    """
    try:
        return TASKS[task_id]
    except KeyError:
        pass

    import re

    # Generated ids: generated_{difficulty}_s{seed}
    generated = re.match(r"^generated_(easy|medium|hard|nightmare)_s(\d+)$", task_id)
    if generated:
        try:
            from .case_generator import generate_task
        except ImportError:  # pragma: no cover
            from case_generator import generate_task
        level, seed_text = generated.groups()
        return generate_task(int(seed_text), difficulty=level)  # type: ignore[arg-type]

    # ISO-derived ids: iso_{difficulty}_{index}
    iso = re.match(r"^iso_(easy|medium|hard)_(\d+)$", task_id)
    if iso:
        try:
            from .iso_adapter import build_iso_task, load_iso_rows
        except ImportError:  # pragma: no cover
            from iso_adapter import build_iso_task, load_iso_rows
        level, index_text = iso.groups()
        position = int(index_text)
        rows = load_iso_rows()
        if rows:
            import random as _rng_mod

            # Fixed seed: the same index always maps to the same rows.
            ordering = list(rows)
            _rng_mod.Random(42).shuffle(ordering)
            iso_task = build_iso_task(
                ordering,
                difficulty=level,
                start_index=position * 4,
                task_index=position,
            )
            if iso_task is not None:
                return iso_task

    # Stripe-derived ids: stripe_{difficulty}_{index}
    stripe = re.match(r"^stripe_(easy|medium|hard)_(\d+)$", task_id)
    if stripe:
        try:
            from connectors.stripe_sandbox import fetch_disputes, build_stripe_task
        except ImportError:  # pragma: no cover
            from ..connectors.stripe_sandbox import fetch_disputes, build_stripe_task
        level, index_text = stripe.groups()
        disputes = fetch_disputes(limit=10)
        stripe_task = build_stripe_task(
            disputes, difficulty=level, task_index=int(index_text)
        )
        if stripe_task is not None:
            return stripe_task

    known = ", ".join(TASKS)
    raise ValueError(f"Unknown task_id '{task_id}'. Available: {known}")
def list_tasks() -> list[TaskScenario]:
    """Return the fixed benchmark task catalog.

    The catalog is deterministic and identical across all deployments:

    - **Showcase** (5): hand-crafted built-in tasks taken from ``TASKS``.
    - **Generated holdout** (7): seeded tasks never used for agent tuning.

    ISO replay tasks are available via ``list_iso_tasks()`` and the
    ``/generate`` endpoint but are excluded from the default catalog so
    that scores and task counts are always comparable.
    """
    try:
        from .case_generator import generate_task
    except ImportError:  # pragma: no cover
        from case_generator import generate_task

    # --- Showcase split (fixed, hand-crafted) ---
    showcase_ids = (
        "goods_not_received_easy",
        "fraud_signal_ambiguity",
        "pre_arb_recovery_medium",
        "queue_optimization_hard",
        "monthly_dispute_backlog_marathon",
    )
    catalog = [TASKS[task_id] for task_id in showcase_ids]

    # --- Generated holdout split (seeded, never used for tuning) ---
    holdout_specs = (
        (42, "easy"),
        (17, "medium"),
        (99, "medium"),
        (7, "hard"),
        (53, "hard"),
        (31, "nightmare"),
        (77, "nightmare"),
    )
    catalog.extend(
        generate_task(seed=seed, difficulty=level) for seed, level in holdout_specs
    )
    return catalog
def list_iso_tasks() -> list[TaskScenario]:
    """Return the ISO 20022 replay suite: one easy, one medium, one hard task.

    Errors from the adapter are deliberately not caught, so data or import
    problems surface instead of being hidden behind an empty list.
    """
    try:
        from .iso_adapter import generate_iso_suite
    except ImportError:  # pragma: no cover
        from iso_adapter import generate_iso_suite

    suite = generate_iso_suite(easy_count=1, medium_count=1, hard_count=1)
    return suite