Spaces:
Sleeping
Sleeping
| """Internal task definitions and runtime types for ChargebackOps.""" | |
| from __future__ import annotations | |
| from dataclasses import dataclass, field, replace | |
| from typing import Literal | |
# Closed set of synthetic merchant system names. Used in this file as the
# ``source_system`` of an evidence item and as the key type of
# ``InternalCase.evidence_by_system``.
SystemName = Literal["orders", "payment", "shipping", "support", "refunds", "risk"]
# Closed set of resolution strategies a case can be tagged with
# (optimal / acceptable / recommended / currently chosen).
StrategyName = Literal["contest", "accept_chargeback", "issue_refund"]
# Frozen dataclass: instances are built with keyword arguments by ``_ev`` and
# cloned with ``dataclasses.replace`` rather than mutated, and they live in the
# shared module-level TASKS catalog.
@dataclass(frozen=True)
class InternalEvidence:
    """Evidence item stored in a synthetic merchant system.

    The three boolean flags classify the item for a case; all default to
    ``False``, which is how neutral filler evidence is declared in TASKS.
    """

    # Identifier referenced by the ``*_evidence_ids`` tuples on InternalCase.
    evidence_id: str
    # System the item belongs to; matches its key in ``evidence_by_system``.
    source_system: SystemName
    # Short human-readable label.
    title: str
    # One-sentence description of what the artifact shows.
    summary: str
    helpful: bool = False
    harmful: bool = False
    required: bool = False
# Frozen dataclass: cases are declared once in the module-level TASKS catalog
# and derived variants are produced with ``dataclasses.replace`` (see
# ``_rewrite_case_ids`` / ``_marathon_case``) instead of in-place mutation.
@dataclass(frozen=True)
class InternalCase:
    """Synthetic chargeback case definition.

    Fields without defaults must be supplied for every case; the trailing
    groups (evidence id tuples, card-network metadata, complexity, and
    long-horizon controls) are optional and default to "immediate, simple"
    behaviour.
    """

    case_id: str
    order_id: str
    customer_id: str
    amount: float
    currency: str
    reason_code: str
    summary: str
    inspection_notes: str
    # Step index by which the case must be handled.
    # NOTE(review): inclusive/exclusive semantics are defined by the runtime,
    # not visible in this file.
    deadline_step: int
    optimal_strategy: StrategyName
    # Additional strategies that are tolerated besides ``optimal_strategy``;
    # empty when only the optimal strategy is acceptable.
    acceptable_strategies: tuple[StrategyName, ...]
    policy_guidance: str
    policy_requirements: tuple[str, ...]
    recommended_strategy: StrategyName
    resolution_summary: str
    weight: float
    # Evidence available per merchant system, keyed by system name.
    evidence_by_system: dict[SystemName, tuple[InternalEvidence, ...]]
    # Evidence ids grouped by their role for this case.
    required_evidence_ids: tuple[str, ...] = ()
    helpful_evidence_ids: tuple[str, ...] = ()
    harmful_evidence_ids: tuple[str, ...] = ()
    # Card network metadata — mirrors real dispute identifiers
    card_network: str = "visa"
    network_reason_code: str = ""
    response_window_days: int = 30
    compelling_evidence_category: str = ""
    # Issuer-perceived complexity multiplier in (0, 1].
    # Lower values dampen evidence_strength_score so harder cases land in the
    # ambiguity band and exercise the multi-round dispute path.
    dispute_complexity: float = 1.0
    # Long-horizon backlog controls. Defaults keep existing tasks immediate.
    arrival_step: int = 0
    issuer_response_delay_steps: int = 0
    evidence_response_delay_steps: int = 0
    delayed_systems: tuple[SystemName, ...] = ()
# Frozen dataclass: scenarios are catalog entries shared through the
# module-level TASKS mapping and are constructed with keyword arguments.
@dataclass(frozen=True)
class TaskScenario:
    """One benchmark task.

    A task bundles presentation metadata with a step budget and the tuple of
    cases that make up its dispute queue.
    """

    task_id: str
    title: str
    difficulty: Literal["easy", "medium", "hard", "nightmare"]
    objective: str
    description: str
    # Step budget for the whole task (all cases share it).
    max_steps: int
    cases: tuple[InternalCase, ...]
# Must be a dataclass: the ``field(default_factory=...)`` declarations below
# only produce fresh per-instance containers when processed by ``@dataclass``.
# Left unfrozen because this is the mutable half of the model.
@dataclass
class CaseProgress:
    """Mutable runtime state for one case.

    Every field has a default, so ``CaseProgress()`` yields a pristine,
    unopened case. Container fields use ``default_factory`` so instances never
    share sets, lists, or dicts.
    """

    inspected: bool = False
    policy_retrieved: bool = False
    revealed_systems: set[SystemName] = field(default_factory=set)
    retrieved_evidence_ids: set[str] = field(default_factory=set)
    attached_evidence_ids: list[str] = field(default_factory=list)
    current_strategy: StrategyName | None = None
    final_resolution: str | None = None
    resolution_status: str = "open"
    resolved_at_step: int | None = None
    # Counters for wasted or rejected agent actions.
    duplicate_queries: int = 0
    invalid_actions: int = 0
    submit_attempts: int = 0
    deadline_penalized: bool = False
    notes: list[str] = field(default_factory=list)
    representment_note: str | None = None
    # multi-round dispute lifecycle
    round_number: int = 1
    issuer_decisions: list[str] = field(default_factory=list)
    issuer_rationales: list[str] = field(default_factory=list)
    pre_arb_evidence_added: list[str] = field(default_factory=list)
    arbitration_outcome: str | None = None
    arb_fees_paid: float = 0.0
    final_economic_outcome: float | None = None
    # Bookkeeping for delayed issuer reviews and delayed evidence systems.
    # NOTE(review): presumably the dict maps a system to the step at which its
    # evidence becomes available — confirm against the runtime.
    pending_issuer_round_number: int | None = None
    pending_issuer_due_step: int | None = None
    merchant_submitted_at_step: int | None = None
    pending_evidence_systems: dict[SystemName, int] = field(default_factory=dict)
# Dataclass so records can be built from the five fields directly; without
# the decorator the class would have no generated ``__init__``.
@dataclass
class ActionRecord:
    """Runtime action history.

    One entry describes a single action taken at a given step.
    """

    step_index: int
    action_type: str
    # ``None`` is allowed for actions that are not tied to a specific case.
    case_id: str | None
    outcome: str
    reward: float
def _ev(
    evidence_id: str,
    source_system: SystemName,
    title: str,
    summary: str,
    *,
    helpful: bool = False,
    harmful: bool = False,
    required: bool = False,
) -> InternalEvidence:
    """Build one ``InternalEvidence`` with compact call-site syntax.

    The four descriptive values are positional; the classification flags are
    keyword-only and default to ``False`` so neutral items need no flags.
    """
    classification = {"helpful": helpful, "harmful": harmful, "required": required}
    return InternalEvidence(
        evidence_id=evidence_id,
        source_system=source_system,
        title=title,
        summary=summary,
        **classification,
    )
# Hand-authored benchmark catalog, keyed by task id. Each key equals the
# ``task_id`` of its scenario. Evidence ids listed in the ``*_evidence_ids``
# tuples of a case refer to ``_ev`` items declared in that case's
# ``evidence_by_system`` mapping.
TASKS: dict[str, TaskScenario] = {
    # Single-case task: a goods-not-received dispute with complete delivery proof.
    "goods_not_received_easy": TaskScenario(
        task_id="goods_not_received_easy",
        title="Delivered But Disputed",
        difficulty="easy",
        objective="Contest a goods-not-received chargeback with the right delivery proof before the deadline.",
        description=(
            "A single e-commerce dispute where carrier confirmation and the order confirmation "
            "are enough to win. The task teaches the standard representment loop."
        ),
        max_steps=10,
        cases=(
            InternalCase(
                case_id="CB-E1",
                order_id="ORD-7410",
                customer_id="CUST-1001",
                amount=129.99,
                currency="USD",
                reason_code="goods_not_received",
                summary="Cardholder claims the package never arrived.",
                inspection_notes=(
                    "Order shipped the same day. Merchant policy requires carrier proof plus the original order confirmation "
                    "for goods-not-received disputes."
                ),
                deadline_step=8,
                optimal_strategy="contest",
                acceptable_strategies=(),
                policy_guidance=(
                    "For goods-not-received disputes, prove the merchandise was fulfilled to the billed customer with "
                    "order confirmation and carrier delivery evidence."
                ),
                policy_requirements=(
                    "order confirmation",
                    "carrier delivery confirmation",
                ),
                recommended_strategy="contest",
                resolution_summary="Strong delivery proof exists. Contesting should recover the funds.",
                weight=1.0,
                required_evidence_ids=("E1-ORDER-CONF", "E1-DELIVERY-SCAN"),
                helpful_evidence_ids=(
                    "E1-SIGNATURE",
                    "E1-SUPPORT-ACK",
                ),
                harmful_evidence_ids=(),
                card_network="visa",
                network_reason_code="13.1",
                response_window_days=30,
                compelling_evidence_category="CE 3.5 — Merchandise Not Received",
                evidence_by_system={
                    "orders": (
                        _ev(
                            "E1-ORDER-CONF",
                            "orders",
                            "Order confirmation",
                            "Order confirmation email and checkout receipt showing the billed customer, shipping address, and SKU.",
                            helpful=True,
                            required=True,
                        ),
                    ),
                    "payment": (
                        _ev(
                            "E1-AUTH",
                            "payment",
                            "Authorization record",
                            "Authorization approved and captured successfully.",
                        ),
                    ),
                    "shipping": (
                        _ev(
                            "E1-DELIVERY-SCAN",
                            "shipping",
                            "Carrier delivery scan",
                            "Carrier tracking shows delivered to the customer address two days after shipment.",
                            helpful=True,
                            required=True,
                        ),
                        # NOTE(review): the id says SIGNATURE but the artifact
                        # described is a doorstep photo; the id is kept as-is
                        # because it is referenced in helpful_evidence_ids.
                        _ev(
                            "E1-SIGNATURE",
                            "shipping",
                            "Doorstep photo confirmation",
                            "Carrier stored a package photo at the delivery location.",
                            helpful=True,
                        ),
                    ),
                    "support": (
                        _ev(
                            "E1-SUPPORT-ACK",
                            "support",
                            "Support ticket acknowledgement",
                            "Customer contacted support to ask if the package was left at the front desk after delivery.",
                            helpful=True,
                        ),
                    ),
                    "refunds": (
                        _ev(
                            "E1-NO-REFUND",
                            "refunds",
                            "Refund ledger",
                            "No refund or goodwill credit was issued before the dispute opened.",
                        ),
                    ),
                    "risk": (
                        _ev(
                            "E1-RISK",
                            "risk",
                            "Risk summary",
                            "Low-risk order with no fraud flags.",
                        ),
                    ),
                },
            ),
        ),
    ),
    # Single-case task: a CNP fraud dispute where some payment artifacts are harmful.
    "fraud_signal_ambiguity": TaskScenario(
        task_id="fraud_signal_ambiguity",
        title="Fraud Signal Ambiguity",
        difficulty="easy",
        objective="Choose whether to contest a CNP fraud dispute and curate only the evidence that helps.",
        description=(
            "A card-not-present fraud dispute with mixed signals. Strong account-linkage evidence exists, "
            "but payment mismatch artifacts will hurt the case if attached."
        ),
        max_steps=10,
        cases=(
            InternalCase(
                case_id="CB-M1",
                order_id="ORD-8821",
                customer_id="CUST-2048",
                amount=480.0,
                currency="USD",
                reason_code="fraud_cnp",
                summary="Issuer filed a card-not-present fraud dispute on a high-value electronics order.",
                inspection_notes=(
                    "The order used a known account and device, but AVS/CVV mismatches were present. "
                    "Winning requires emphasizing customer-account linkage and avoiding mismatch artifacts."
                ),
                deadline_step=7,
                optimal_strategy="contest",
                acceptable_strategies=(),
                policy_guidance=(
                    "For CNP fraud disputes, contest only when you can link the cardholder to the account or device history. "
                    "Do not attach evidence that strengthens the issuer's fraud narrative."
                ),
                policy_requirements=(
                    "prior good order linkage",
                    "customer account confirmation",
                ),
                recommended_strategy="contest",
                resolution_summary="Contest with strong account-linkage evidence. Conceding this case forfeits defensible revenue.",
                weight=1.1,
                required_evidence_ids=("M1-PRIOR-ORDERS", "M1-ACCOUNT-CHAT"),
                helpful_evidence_ids=(
                    "M1-DELIVERY",
                    "M1-ORDER",
                    "M1-VELOCITY",
                ),
                harmful_evidence_ids=("M1-AVS-MISMATCH", "M1-CVV-MISMATCH"),
                card_network="visa",
                network_reason_code="10.4",
                response_window_days=30,
                compelling_evidence_category="CE 3.6 — Fraud, Card-Absent Environment",
                evidence_by_system={
                    "orders": (
                        _ev(
                            "M1-ORDER",
                            "orders",
                            "Order receipt",
                            "Checkout receipt showing customer account id, shipping address, and same email as prior purchases.",
                            helpful=True,
                        ),
                    ),
                    "payment": (
                        _ev(
                            "M1-AVS-MISMATCH",
                            "payment",
                            "AVS mismatch detail",
                            "Street-number mismatch was recorded at authorization time.",
                            harmful=True,
                        ),
                        _ev(
                            "M1-CVV-MISMATCH",
                            "payment",
                            "CVV mismatch detail",
                            "CVV did not fully match at authorization time.",
                            harmful=True,
                        ),
                        _ev(
                            "M1-AUTH",
                            "payment",
                            "Authorization capture",
                            "Payment was successfully authorized and captured.",
                        ),
                    ),
                    "shipping": (
                        _ev(
                            "M1-DELIVERY",
                            "shipping",
                            "Carrier delivery confirmation",
                            "Package was delivered to the saved customer address two days later.",
                            helpful=True,
                        ),
                    ),
                    "support": (
                        _ev(
                            "M1-ACCOUNT-CHAT",
                            "support",
                            "Authenticated support chat",
                            "Customer logged into the account and confirmed the delivery window in chat before shipment.",
                            helpful=True,
                            required=True,
                        ),
                    ),
                    "refunds": (
                        _ev(
                            "M1-NO-REFUND",
                            "refunds",
                            "Refund ledger",
                            "No refund or cancellation was issued prior to the dispute.",
                        ),
                    ),
                    "risk": (
                        _ev(
                            "M1-PRIOR-ORDERS",
                            "risk",
                            "Prior account activity",
                            "Same account, same device fingerprint, and three prior fulfilled orders without disputes.",
                            helpful=True,
                            required=True,
                        ),
                        _ev(
                            "M1-VELOCITY",
                            "risk",
                            "Velocity check",
                            "No abnormal velocity or proxy usage detected.",
                            helpful=True,
                        ),
                    ),
                },
            ),
        ),
    ),
    # Three-case queue: one strong contest, one concession, one urgent refund.
    "queue_optimization_hard": TaskScenario(
        task_id="queue_optimization_hard",
        title="Dispute Queue Optimization",
        difficulty="hard",
        objective="Maximize recovery across a queue of disputes while respecting deadlines and avoiding weak contests.",
        description=(
            "A real operations queue with three disputes. Two should be actioned quickly, and one should be conceded. "
            "The step budget leaves little room for waste."
        ),
        max_steps=18,
        cases=(
            InternalCase(
                case_id="CB-H1",
                order_id="ORD-9901",
                customer_id="CUST-4100",
                amount=860.0,
                currency="USD",
                reason_code="goods_not_received",
                summary="High-value furniture delivery disputed as not received.",
                inspection_notes=(
                    "Carrier stored both a delivery scan and signature. This is the highest-value recoverable case in the queue."
                ),
                deadline_step=14,
                optimal_strategy="contest",
                acceptable_strategies=(),
                policy_guidance=(
                    "Use merchant receipt plus carrier proof for goods-not-received disputes. This case is strong if contested on time."
                ),
                policy_requirements=(
                    "order confirmation",
                    "signature-backed delivery proof",
                ),
                recommended_strategy="contest",
                resolution_summary="Contest immediately with the signature-backed delivery packet.",
                weight=1.7,
                required_evidence_ids=("H1-ORDER-CONF", "H1-SIGNATURE"),
                helpful_evidence_ids=(
                    "H1-DELIVERY-SCAN",
                ),
                harmful_evidence_ids=(),
                card_network="mastercard",
                network_reason_code="4855",
                response_window_days=45,
                compelling_evidence_category="Goods or Services Not Provided",
                dispute_complexity=0.60,
                evidence_by_system={
                    "orders": (
                        _ev(
                            "H1-ORDER-CONF",
                            "orders",
                            "Order invoice",
                            "Signed furniture order invoice with billing and delivery address.",
                            helpful=True,
                            required=True,
                        ),
                    ),
                    "payment": (
                        _ev(
                            "H1-AUTH",
                            "payment",
                            "Captured payment",
                            "Payment authorization and capture both succeeded.",
                        ),
                    ),
                    "shipping": (
                        _ev(
                            "H1-SIGNATURE",
                            "shipping",
                            "Delivery signature",
                            "Carrier recorded a recipient signature at the shipping address.",
                            helpful=True,
                            required=True,
                        ),
                        _ev(
                            "H1-DELIVERY-SCAN",
                            "shipping",
                            "Final-mile delivery scan",
                            "Tracking confirms delivery within the promised window.",
                            helpful=True,
                        ),
                    ),
                    "support": (
                        _ev(
                            "H1-SUPPORT",
                            "support",
                            "Support history",
                            "No delivery complaint was opened before the dispute.",
                        ),
                    ),
                    "refunds": (
                        _ev(
                            "H1-NO-REFUND",
                            "refunds",
                            "Refund ledger",
                            "No refund was issued.",
                        ),
                    ),
                    "risk": (
                        _ev(
                            "H1-RISK",
                            "risk",
                            "Risk summary",
                            "Low-risk order. No notable fraud flags.",
                        ),
                    ),
                },
            ),
            InternalCase(
                case_id="CB-H2",
                order_id="ORD-9902",
                customer_id="CUST-4101",
                amount=240.0,
                currency="USD",
                reason_code="fraud_cnp",
                summary="Apparel order disputed as unauthorized.",
                inspection_notes=(
                    "The account is new, there is no durable linkage to the cardholder, and the payment record contains mismatch artifacts. "
                    "This case should be conceded."
                ),
                deadline_step=14,
                optimal_strategy="accept_chargeback",
                acceptable_strategies=("issue_refund",),
                policy_guidance=(
                    "Do not contest when you lack durable account or device linkage. Avoid wasting steps on weak fraud disputes."
                ),
                # The trailing comma is required: without it the parentheses
                # are just grouping and the field would hold a bare str, which
                # iterates character by character.
                policy_requirements=("cardholder linkage evidence",),
                recommended_strategy="accept_chargeback",
                resolution_summary="Concede the dispute. Contesting wastes portfolio value.",
                weight=0.8,
                required_evidence_ids=(),
                helpful_evidence_ids=(),
                harmful_evidence_ids=("H2-AVS", "H2-CVV"),
                card_network="mastercard",
                network_reason_code="4837",
                response_window_days=45,
                compelling_evidence_category="No Cardholder Authorization",
                evidence_by_system={
                    "orders": (
                        _ev(
                            "H2-ORDER",
                            "orders",
                            "Order receipt",
                            "Guest checkout with a new shipping address and no prior order history.",
                        ),
                    ),
                    "payment": (
                        _ev(
                            "H2-AVS",
                            "payment",
                            "AVS mismatch detail",
                            "Street and postal code mismatches were present.",
                            harmful=True,
                        ),
                        _ev(
                            "H2-CVV",
                            "payment",
                            "CVV mismatch detail",
                            "CVV did not match.",
                            harmful=True,
                        ),
                    ),
                    "shipping": (
                        _ev(
                            "H2-DELIVERY",
                            "shipping",
                            "Carrier delivery confirmation",
                            "Delivered to a new address without signature.",
                        ),
                    ),
                    "support": (
                        _ev(
                            "H2-SUPPORT",
                            "support",
                            "Support log",
                            "No authenticated support interactions were recorded.",
                        ),
                    ),
                    "refunds": (
                        _ev(
                            "H2-NO-REFUND",
                            "refunds",
                            "Refund ledger",
                            "No refund issued before the chargeback.",
                        ),
                    ),
                    "risk": (
                        _ev(
                            "H2-RISK",
                            "risk",
                            "Risk summary",
                            "Elevated risk score and no positive account history.",
                        ),
                    ),
                },
            ),
            InternalCase(
                case_id="CB-H3",
                order_id="ORD-9903",
                customer_id="CUST-4102",
                amount=320.0,
                currency="USD",
                reason_code="credit_not_processed",
                summary="Subscriber canceled before renewal and says the credit was never processed.",
                inspection_notes=(
                    "The merchant missed the promised refund SLA. This should be resolved fast with a refund, not a contest."
                ),
                deadline_step=4,
                optimal_strategy="issue_refund",
                acceptable_strategies=("accept_chargeback",),
                policy_guidance=(
                    "If the merchant failed to process a promised credit, refund immediately or concede. Contesting is not supportable."
                ),
                policy_requirements=(
                    "proof of cancellation request",
                    "refund status check",
                ),
                recommended_strategy="issue_refund",
                resolution_summary="Refund immediately. Delay turns a manageable loss into a deadline miss.",
                weight=1.2,
                required_evidence_ids=(),
                helpful_evidence_ids=("H3-CANCEL", "H3-NO-REFUND"),
                harmful_evidence_ids=(),
                card_network="visa",
                network_reason_code="13.6",
                response_window_days=30,
                compelling_evidence_category="CE 3.4 — Credit Not Processed",
                evidence_by_system={
                    "orders": (
                        _ev(
                            "H3-ORDER",
                            "orders",
                            "Renewal invoice",
                            "Subscription renewed automatically for the annual plan.",
                        ),
                    ),
                    "payment": (
                        _ev(
                            "H3-PAYMENT",
                            "payment",
                            "Captured renewal payment",
                            "Renewal payment settled successfully.",
                        ),
                    ),
                    # This case has no shipping or risk artifacts; the keys are
                    # still present with empty tuples.
                    "shipping": (),
                    "support": (
                        _ev(
                            "H3-CANCEL",
                            "support",
                            "Cancellation request",
                            "Customer requested cancellation before renewal and support promised a refund within five business days.",
                            helpful=True,
                        ),
                    ),
                    "refunds": (
                        _ev(
                            "H3-NO-REFUND",
                            "refunds",
                            "Refund ledger",
                            "No refund has been issued as of the dispute open date.",
                            helpful=True,
                        ),
                    ),
                    "risk": (),
                },
            ),
        ),
    ),
    # Single-case task: required evidence is split across orders and support.
    "pre_arb_recovery_medium": TaskScenario(
        task_id="pre_arb_recovery_medium",
        title="Pre-Arbitration Recovery",
        difficulty="medium",
        objective=(
            "Win a goods-not-received dispute that requires recovering compelling "
            "evidence in round 2 instead of burning $250 on arbitration."
        ),
        description=(
            "Required evidence is split across orders and support. A round-1 packet "
            "from the default systems will fall short and the issuer will request "
            "compelling evidence. Querying support in round 2 unlocks the missing "
            "proof; jumping straight to arbitration concedes a $250 fee on a "
            "packet the issuer would have accepted."
        ),
        max_steps=12,
        cases=(
            InternalCase(
                case_id="CB-P1",
                order_id="ORD-7710",
                customer_id="CUST-3300",
                amount=700.0,
                currency="USD",
                reason_code="goods_not_received",
                summary=(
                    "Customer denies receipt of a $700 electronics order. "
                    "Authenticated support transcript proves delivery acknowledgement."
                ),
                inspection_notes=(
                    "The order was delivered, but the strongest acknowledgement lives "
                    "in the support transcript — not in the orders or shipping system. "
                    "A first-pass packet will be missing required evidence."
                ),
                deadline_step=10,
                optimal_strategy="contest",
                acceptable_strategies=(),
                policy_guidance=(
                    "Goods-not-received disputes need order confirmation plus a "
                    "delivery acknowledgement. If the support transcript is the only "
                    "delivery acknowledgement, attach it through the pre-arbitration "
                    "response — do not skip straight to arbitration."
                ),
                policy_requirements=(
                    "order confirmation",
                    "support delivery acknowledgement",
                ),
                recommended_strategy="contest",
                resolution_summary=(
                    "Recover the support acknowledgement in pre-arb. Escalating to "
                    "arbitration without it forfeits $250 on a winnable case."
                ),
                weight=1.3,
                required_evidence_ids=("P1-ORDER-CONF", "P1-SUPPORT-CONF"),
                helpful_evidence_ids=("P1-DELIVERY-SCAN", "P1-RISK-CLEAR"),
                harmful_evidence_ids=(),
                card_network="visa",
                network_reason_code="13.1",
                response_window_days=30,
                compelling_evidence_category="CE 3.5 — Merchandise Not Received",
                evidence_by_system={
                    "orders": (
                        _ev(
                            "P1-ORDER-CONF",
                            "orders",
                            "Order confirmation",
                            "Order receipt with billed customer, shipping address, and SKU.",
                            helpful=True,
                            required=True,
                        ),
                    ),
                    "payment": (
                        _ev(
                            "P1-AUTH",
                            "payment",
                            "Authorization capture",
                            "Authorization approved and captured cleanly.",
                        ),
                    ),
                    "shipping": (
                        _ev(
                            "P1-DELIVERY-SCAN",
                            "shipping",
                            "Carrier delivery scan",
                            "Carrier tracking shows the package delivered to the saved address.",
                            helpful=True,
                        ),
                    ),
                    "support": (
                        _ev(
                            "P1-SUPPORT-CONF",
                            "support",
                            "Authenticated support acknowledgement",
                            "Customer logged in and confirmed receipt of the package in chat the next day.",
                            helpful=True,
                            required=True,
                        ),
                    ),
                    "refunds": (
                        _ev(
                            "P1-NO-REFUND",
                            "refunds",
                            "Refund ledger",
                            "No refund or goodwill credit was issued before the dispute opened.",
                        ),
                    ),
                    "risk": (
                        _ev(
                            "P1-RISK-CLEAR",
                            "risk",
                            "Risk summary",
                            "Account has clean device fingerprint and prior fulfilled orders.",
                            helpful=True,
                        ),
                    ),
                },
            ),
        ),
    ),
}
def _rewrite_case_ids(case: InternalCase, new_case_id: str) -> InternalCase:
    """Return a copy of *case* re-keyed under *new_case_id*.

    Every evidence id is prefixed with ``"<new_case_id>-"`` and the three
    ``*_evidence_ids`` tuples are remapped to match. ``order_id`` and
    ``customer_id`` are regenerated from the last two characters of
    *new_case_id*. The input case is not modified.

    Raises ``KeyError`` if one of the ``*_evidence_ids`` tuples names an id
    that does not appear in ``case.evidence_by_system``.
    """
    # Old evidence id -> case-local evidence id, across all systems.
    id_map: dict[str, str] = {
        item.evidence_id: f"{new_case_id}-{item.evidence_id}"
        for items in case.evidence_by_system.values()
        for item in items
    }

    rewritten_systems: dict[SystemName, tuple[InternalEvidence, ...]] = {
        system_name: tuple(
            replace(item, evidence_id=id_map[item.evidence_id]) for item in items
        )
        for system_name, items in case.evidence_by_system.items()
    }

    def _remap(evidence_ids: tuple[str, ...]) -> tuple[str, ...]:
        return tuple(id_map[evidence_id] for evidence_id in evidence_ids)

    suffix = new_case_id[-2:]
    return replace(
        case,
        case_id=new_case_id,
        order_id=f"ORD-LH-{suffix}",
        customer_id=f"CUST-LH-{suffix}",
        evidence_by_system=rewritten_systems,
        required_evidence_ids=_remap(case.required_evidence_ids),
        helpful_evidence_ids=_remap(case.helpful_evidence_ids),
        harmful_evidence_ids=_remap(case.harmful_evidence_ids),
    )
def _marathon_case(
    base: InternalCase,
    *,
    case_id: str,
    amount: float,
    arrival_step: int,
    deadline_step: int,
    weight: float,
    issuer_delay: int = 0,
    evidence_delay: int = 0,
    delayed_systems: tuple[SystemName, ...] = (),
    dispute_complexity: float | None = None,
    summary_prefix: str = "",
) -> InternalCase:
    """Derive one backlog case from a hand-authored blueprint.

    The blueprint is first cloned under *case_id* (which also re-keys its
    evidence ids), then the economic and scheduling fields are overridden.
    ``dispute_complexity=None`` keeps the blueprint's own complexity, and an
    empty *summary_prefix* leaves the summary untouched.
    """
    cloned = _rewrite_case_ids(base, case_id)

    if summary_prefix:
        summary = f"{summary_prefix}{cloned.summary}"
    else:
        summary = cloned.summary

    if dispute_complexity is None:
        complexity = cloned.dispute_complexity
    else:
        complexity = dispute_complexity

    overrides = {
        "amount": amount,
        "summary": summary,
        "deadline_step": deadline_step,
        "weight": weight,
        "arrival_step": arrival_step,
        "issuer_response_delay_steps": issuer_delay,
        "evidence_response_delay_steps": evidence_delay,
        "delayed_systems": delayed_systems,
        "dispute_complexity": complexity,
    }
    return replace(cloned, **overrides)
def _build_monthly_dispute_backlog_marathon() -> TaskScenario:
    """Assemble the twelve-case, 60-step "nightmare" backlog scenario.

    Every case is a clone of one of the six hand-authored cases already in
    TASKS, given its own case id, amount, weight, arrival step and deadline.
    Contestable clones additionally carry an issuer response delay, an
    evidence delay and the systems that delay applies to; refund and
    weak-fraud clones keep the immediate defaults.

    Must be called after the hand-authored TASKS entries exist, since the
    blueprints are read from that mapping.
    """
    # Blueprints, looked up from the hand-authored catalog.
    easy_delivery = TASKS["goods_not_received_easy"].cases[0]
    fraud_strong = TASKS["fraud_signal_ambiguity"].cases[0]
    pre_arb = TASKS["pre_arb_recovery_medium"].cases[0]
    queue_cases = TASKS["queue_optimization_hard"].cases
    hard_delivery, weak_fraud, refund_due = queue_cases

    backlog = (
        # --- Wave 1: everything below is present from step 0 ---
        _marathon_case(
            refund_due,
            case_id="CB-L01",
            summary_prefix="[Wave 1 urgent refund] ",
            amount=320.0,
            weight=1.2,
            arrival_step=0,
            deadline_step=6,
        ),
        _marathon_case(
            easy_delivery,
            case_id="CB-L02",
            summary_prefix="[Wave 1 delayed carrier file] ",
            amount=430.0,
            weight=1.1,
            arrival_step=0,
            deadline_step=20,
            issuer_delay=3,
            evidence_delay=2,
            delayed_systems=("shipping",),
            dispute_complexity=0.78,
        ),
        _marathon_case(
            fraud_strong,
            case_id="CB-L03",
            summary_prefix="[Wave 1 high-value CNP] ",
            amount=880.0,
            weight=1.6,
            arrival_step=0,
            deadline_step=24,
            issuer_delay=4,
            evidence_delay=2,
            delayed_systems=("risk",),
            dispute_complexity=0.70,
        ),
        _marathon_case(
            weak_fraud,
            case_id="CB-L04",
            summary_prefix="[Wave 1 weak fraud] ",
            amount=240.0,
            weight=0.8,
            arrival_step=0,
            deadline_step=14,
        ),
        # --- Wave 2 ---
        _marathon_case(
            pre_arb,
            case_id="CB-L05",
            summary_prefix="[Wave 2 pre-arb recovery] ",
            amount=700.0,
            weight=1.4,
            arrival_step=8,
            deadline_step=34,
            issuer_delay=3,
            evidence_delay=2,
            delayed_systems=("support",),
            dispute_complexity=0.68,
        ),
        _marathon_case(
            hard_delivery,
            case_id="CB-L06",
            summary_prefix="[Wave 2 enterprise furniture] ",
            amount=1200.0,
            weight=1.9,
            arrival_step=10,
            deadline_step=40,
            issuer_delay=5,
            evidence_delay=2,
            delayed_systems=("shipping", "support"),
            dispute_complexity=0.62,
        ),
        _marathon_case(
            refund_due,
            case_id="CB-L07",
            summary_prefix="[Wave 2 small refund SLA] ",
            amount=180.0,
            weight=0.9,
            arrival_step=15,
            deadline_step=25,
        ),
        # --- Wave 3 ---
        _marathon_case(
            fraud_strong,
            case_id="CB-L08",
            summary_prefix="[Wave 3 returning-customer CNP] ",
            amount=980.0,
            weight=1.7,
            arrival_step=18,
            deadline_step=45,
            issuer_delay=4,
            evidence_delay=3,
            delayed_systems=("support", "risk"),
            dispute_complexity=0.65,
        ),
        _marathon_case(
            weak_fraud,
            case_id="CB-L09",
            summary_prefix="[Wave 3 weak guest checkout] ",
            amount=310.0,
            weight=0.8,
            arrival_step=25,
            deadline_step=38,
        ),
        # --- Wave 4 ---
        _marathon_case(
            easy_delivery,
            case_id="CB-L10",
            summary_prefix="[Wave 4 delivery proof] ",
            amount=560.0,
            weight=1.2,
            arrival_step=30,
            deadline_step=55,
            issuer_delay=3,
            evidence_delay=2,
            delayed_systems=("shipping",),
            dispute_complexity=0.74,
        ),
        _marathon_case(
            pre_arb,
            case_id="CB-L11",
            summary_prefix="[Wave 4 support acknowledgement] ",
            amount=750.0,
            weight=1.5,
            arrival_step=35,
            deadline_step=58,
            issuer_delay=3,
            evidence_delay=2,
            delayed_systems=("support",),
            dispute_complexity=0.66,
        ),
        # --- Wave 5 ---
        _marathon_case(
            refund_due,
            case_id="CB-L12",
            summary_prefix="[Wave 5 low-value refund] ",
            amount=90.0,
            weight=0.6,
            arrival_step=42,
            deadline_step=52,
        ),
    )

    return TaskScenario(
        task_id="monthly_dispute_backlog_marathon",
        title="Monthly Dispute Backlog Marathon",
        difficulty="nightmare",
        objective=(
            "Manage a 60-step month-end chargeback backlog with wave arrivals, "
            "delayed evidence, delayed issuer responses, and arbitration ROI."
        ),
        description=(
            "A professional long-horizon backlog: twelve disputes arrive across "
            "the episode. The agent must remember pending issuer reviews, revisit "
            "delayed evidence, triage urgent refunds, contest positive-EV cases, "
            "and avoid spending $250 arbitration fees on weak packets."
        ),
        max_steps=60,
        cases=backlog,
    )
# Registered after the TASKS literal on purpose: the builder reads the
# hand-authored scenarios out of TASKS to use as blueprints, so those entries
# must already exist when it runs at import time.
TASKS["monthly_dispute_backlog_marathon"] = _build_monthly_dispute_backlog_marathon()
def get_task(task_id: str) -> TaskScenario:
    """Look up a built-in task or build one from a dynamic task id.

    Resolution order:

    1. An exact key in ``TASKS``.
    2. ``generated_{easy|medium|hard|nightmare}_s{seed}`` — delegated to
       ``case_generator.generate_task``.
    3. ``iso_{easy|medium|hard}_{index}`` — built from the ISO rows after a
       fixed-seed shuffle, starting at row ``index * 4``.
    4. ``stripe_{easy|medium|hard}_{index}`` — built from up to 10 disputes
       fetched by the Stripe sandbox connector.

    Patterns must match the whole id; an id with trailing characters
    (including a trailing newline) is treated as unknown.

    Raises:
        ValueError: if the id matches nothing, or if it matches the ISO or
            Stripe pattern but no task could be built (no rows loaded, or
            the builder returned ``None``).
    """
    if task_id in TASKS:
        return TASKS[task_id]

    import re

    # Support generated task ids: generated_{difficulty}_s{seed}
    # ``fullmatch`` rather than ``match`` with ``^...$``: ``$`` also matches
    # just before a trailing newline, which would accept malformed ids.
    m = re.fullmatch(r"generated_(easy|medium|hard|nightmare)_s(\d+)", task_id)
    if m:
        try:
            from .case_generator import generate_task
        except ImportError:  # pragma: no cover
            from case_generator import generate_task
        difficulty = m.group(1)
        seed = int(m.group(2))
        return generate_task(seed, difficulty=difficulty)  # type: ignore[arg-type]

    # Support ISO-derived task ids: iso_{difficulty}_{index}
    m_iso = re.fullmatch(r"iso_(easy|medium|hard)_(\d+)", task_id)
    if m_iso:
        try:
            from .iso_adapter import build_iso_task, load_iso_rows
        except ImportError:  # pragma: no cover
            from iso_adapter import build_iso_task, load_iso_rows
        difficulty = m_iso.group(1)
        task_index = int(m_iso.group(2))
        rows = load_iso_rows()
        if rows:
            import random as _rng_mod

            # Fixed seed so the same id always maps to the same rows.
            shuffled = list(rows)
            _rng_mod.Random(42).shuffle(shuffled)
            task = build_iso_task(
                shuffled,
                difficulty=difficulty,
                start_index=task_index * 4,
                task_index=task_index,
            )
            if task is not None:
                return task
        # No rows or no task: fall through to the ValueError below.

    # Support Stripe-derived task ids: stripe_{difficulty}_{index}
    m_stripe = re.fullmatch(r"stripe_(easy|medium|hard)_(\d+)", task_id)
    if m_stripe:
        try:
            from connectors.stripe_sandbox import fetch_disputes, build_stripe_task
        except ImportError:  # pragma: no cover
            from ..connectors.stripe_sandbox import fetch_disputes, build_stripe_task
        disputes = fetch_disputes(limit=10)
        task = build_stripe_task(
            disputes, difficulty=m_stripe.group(1), task_index=int(m_stripe.group(2))
        )
        if task is not None:
            return task
        # No task built: fall through to the ValueError below.

    raise ValueError(f"Unknown task_id '{task_id}'. Available: {', '.join(TASKS)}")
def list_tasks() -> list[TaskScenario]:
    """Return the fixed default benchmark catalog as a new list.

    The result is the concatenation of two splits, in this order:

    - **Showcase** (5): the hand-crafted tasks registered in ``TASKS``.
    - **Generated holdout** (7): tasks produced by ``generate_task`` from a
      fixed list of ``(seed, difficulty)`` pairs; these are intended never
      to be used for agent tuning.

    ISO replay tasks are available via ``list_iso_tasks()`` and the
    ``/generate`` endpoint but are excluded from the default catalog so
    that scores and task counts are always comparable.
    """
    try:
        from .case_generator import generate_task
    except ImportError:  # pragma: no cover
        from case_generator import generate_task

    # --- Showcase split (fixed, hand-crafted) ---
    showcase_ids = (
        "goods_not_received_easy",
        "fraud_signal_ambiguity",
        "pre_arb_recovery_medium",
        "queue_optimization_hard",
        "monthly_dispute_backlog_marathon",
    )
    # --- Generated holdout split (seeded, never used for tuning) ---
    holdout_specs = (
        (42, "easy"),
        (17, "medium"),
        (99, "medium"),
        (7, "hard"),
        (53, "hard"),
        (31, "nightmare"),
        (77, "nightmare"),
    )

    catalog = [TASKS[task_id] for task_id in showcase_ids]
    catalog.extend(
        generate_task(seed=seed, difficulty=level) for seed, level in holdout_specs
    )
    return catalog
def list_iso_tasks() -> list[TaskScenario]:
    """Return the ISO 20022 replay suite: one easy, one medium, one hard task.

    Nothing is caught around the suite generation, so data or import
    problems propagate to the caller instead of producing an empty list.
    """
    try:
        from .iso_adapter import generate_iso_suite
    except ImportError:  # pragma: no cover
        from iso_adapter import generate_iso_suite

    suite_sizes = {"easy_count": 1, "medium_count": 1, "hard_count": 1}
    return generate_iso_suite(**suite_sizes)