Spaces:
Sleeping
Sleeping
| """Baseline runner for ChargebackOps.""" | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import time | |
| from dataclasses import dataclass | |
| from typing import Any | |
| from openai import OpenAI | |
| from pydantic import BaseModel, Field | |
| try: | |
| from ..evaluation.grading import grade_episode | |
| from ..core.models import BaselineRunResult, BaselineTaskResult, ChargebackOpsAction | |
| from ..server.chargeback_ops_environment import ChargebackOpsEnvironment | |
| from ..scenarios.simulation import list_tasks | |
| except ImportError: # pragma: no cover | |
| from evaluation.grading import grade_episode | |
| from core.models import BaselineRunResult, BaselineTaskResult, ChargebackOpsAction | |
| from server.chargeback_ops_environment import ChargebackOpsEnvironment | |
| from scenarios.simulation import list_tasks | |
| try: # pragma: no cover | |
| from dotenv import load_dotenv | |
| except ImportError: # pragma: no cover | |
| load_dotenv = None | |
| if load_dotenv is not None: # pragma: no cover | |
| load_dotenv() | |
# Provider name; it is also one of the keys of DEFAULT_MODELS below.
# NOTE(review): presumably the provider used when none is configured — confirm against the caller.
DEFAULT_PROVIDER = "openrouter"
# NOTE(review): presumably caps how many candidate actions are shown to the model — not used in the visible code, confirm.
MAX_LLM_CANDIDATES = 4
# NOTE(review): presumably the max-token limit passed on provider requests — not used in the visible code, confirm.
MAX_PROVIDER_RESPONSE_TOKENS = 200
# Model name keyed by provider name.
DEFAULT_MODELS = {
    "openrouter": "openai/gpt-oss-120b",
    "groq": "llama-3.3-70b-versatile",
    "openai": "gpt-4.1-mini",
    "anthropic": "claude-sonnet-4-20250514",
    "google": "gemini-2.5-flash",
}
# Ordered fallback: try each until one succeeds.
# Each entry is a (provider, model_name) pair; only three of the five
# providers in DEFAULT_MODELS appear in the chain.
_FALLBACK_CHAIN: list[tuple[str, str]] = [
    ("openrouter", "openai/gpt-oss-120b"),
    ("google", "gemini-2.5-flash"),
    ("groq", "llama-3.3-70b-versatile"),
]
def _provider_timeout_seconds() -> float:
    """Return the provider request timeout in seconds.

    The value comes from ``BASELINE_REQUEST_TIMEOUT_SECONDS`` (default
    ``"15"``) and is clamped to a minimum of 1.0. Text that cannot be parsed
    as a float yields 4.0.
    """
    configured = os.environ.get("BASELINE_REQUEST_TIMEOUT_SECONDS", "15")
    try:
        parsed = float(configured)
    except ValueError:
        # Unparseable setting: use the fixed fallback rather than the default.
        return 4.0
    return max(1.0, parsed)
def _provider_retry_attempts() -> int:
    """Return how many retries are allowed after the first provider call.

    The value comes from ``PROVIDER_RATE_LIMIT_RETRIES`` (default ``"2"``)
    and is never negative. Text that cannot be parsed as an int yields 0.
    """
    configured = os.environ.get("PROVIDER_RATE_LIMIT_RETRIES", "2")
    try:
        parsed = int(configured)
    except ValueError:
        # Unparseable setting disables retries entirely.
        return 0
    return max(0, parsed)
def _provider_retry_backoff_seconds() -> float:
    """Return the base delay, in seconds, used between provider retries.

    The value comes from ``PROVIDER_RETRY_BACKOFF_SECONDS`` (default
    ``"1.0"``) and is clamped to a minimum of 0.1. Text that cannot be parsed
    as a float yields 0.5.
    """
    configured = os.environ.get("PROVIDER_RETRY_BACKOFF_SECONDS", "1.0")
    try:
        parsed = float(configured)
    except ValueError:
        # Unparseable setting: use the fixed fallback delay.
        return 0.5
    return max(0.1, parsed)
def _strict_llm_mode() -> bool:
    """Report whether ``STRICT_LLM_MODE`` is set to an affirmative value.

    The comparison ignores surrounding whitespace and letter case; an unset
    variable counts as disabled.
    """
    raw_flag = os.getenv("STRICT_LLM_MODE", "")
    normalized = raw_flag.strip().lower()
    truthy_spellings = ("1", "true", "yes", "on")
    return normalized in truthy_spellings
def _should_retry_provider_error(exc: Exception) -> bool:
    """Decide whether a provider failure should be retried.

    The decision is made purely on the exception's class name, so no
    provider SDK exception type has to be imported here.
    """
    retryable_names = (
        "RateLimitError",
        "APITimeoutError",
        "APIConnectionError",
        "InternalServerError",
    )
    error_name = exc.__class__.__name__
    return error_name in retryable_names
def _chat_completion_with_retry(client: OpenAI, **kwargs):
    """Call ``client.chat.completions.create`` and retry retryable failures.

    One initial attempt is made plus ``_provider_retry_attempts()`` retries.
    An exception is re-raised at once when it is not retryable according to
    ``_should_retry_provider_error`` or when the attempts are used up.
    Between attempts the wait grows linearly: base backoff times the
    1-based attempt number.

    Args:
        client: Client exposing ``chat.completions.create``.
        **kwargs: Forwarded unchanged to ``create``.

    Returns:
        Whatever ``create`` returns on the first successful attempt.

    Raises:
        Exception: The provider error from the failing attempt.
        RuntimeError: If the loop finishes without returning or raising.
    """
    total_attempts = 1 + _provider_retry_attempts()
    base_delay = _provider_retry_backoff_seconds()
    final_index = total_attempts - 1
    for attempt_index in range(total_attempts):
        try:
            return client.chat.completions.create(**kwargs)
        except Exception as exc:
            attempts_exhausted = attempt_index >= final_index
            if attempts_exhausted or not _should_retry_provider_error(exc):
                raise
            time.sleep(base_delay * (attempt_index + 1))
    # Only reachable when the attempt count is below one, so no call was made.
    raise RuntimeError("Provider completion failed without raising an exception.")
class CandidateChoice(BaseModel):
    """Structured choice returned by an LLM provider."""

    # Index of the chosen candidate; Field(ge=0) rejects negative values.
    # NOTE(review): presumably an index into the candidate list shown to the model — confirm against the caller.
    candidate_index: int = Field(ge=0)
    # Free-text justification for the choice; required, since no default is given.
    rationale: str
@dataclass
class CandidateAction:
    """One valid candidate action for the baseline policy.

    The class is built with keyword arguments
    (``CandidateAction(action=..., summary=...)``), so it needs the
    ``__init__`` that ``@dataclass`` generates; bare annotations alone would
    make every construction raise ``TypeError``.
    """

    # The environment action that would be executed for this candidate.
    action: ChargebackOpsAction
    # Human-readable, one-sentence description of the action.
    summary: str
@dataclass
class ProviderConfig:
    """Resolved provider configuration.

    Declared as a dataclass so the annotated fields become constructor
    arguments; without the decorator the class accepts no arguments at all.
    """

    # Provider name, e.g. one of the keys of DEFAULT_MODELS.
    provider: str
    # Model name to request from that provider.
    model_name: str
def _best_open_case(queue: list[dict[str, Any]]) -> dict[str, Any] | None:
    """Pick the most urgent open case from ``queue``.

    Cases are compared by fewest ``steps_until_deadline`` first and, on a
    tie, by largest ``amount``. If several cases are still tied, the one that
    appears first in ``queue`` wins. Returns ``None`` when no case is open.
    """
    still_open = [entry for entry in queue if entry["status"] == "open"]
    if not still_open:
        return None

    def urgency(entry: dict[str, Any]) -> tuple[Any, Any]:
        # Negated amount so that a larger amount sorts earlier.
        return entry["steps_until_deadline"], -entry["amount"]

    # min() returns the first minimal element, like a stable sort's head.
    return min(still_open, key=urgency)
# Opening sentences of the representment note, keyed by dispute reason code.
_NOTE_TEMPLATES: dict[str, str] = {
    "goods_not_received": (
        "Order confirmation and carrier delivery confirmation establish fulfillment. "
        "The shipment was delivered to the customer address on file."
    ),
    "fraud_cnp": (
        "Prior good order linkage and customer account confirmation tie the cardholder "
        "to the transaction. Risk analysis and support records confirm legitimacy."
    ),
    "product_not_as_described": (
        "Product listing verification confirms the item matches the description. "
        "Return policy documentation shows the customer bypassed the return process."
    ),
    "service_not_provided": (
        "Service completion record and customer acknowledgment confirm the service "
        "was delivered as agreed. Booking confirmation and delivery records attached."
    ),
    "credit_not_processed": (
        "Refund record and payment confirmation document the credit processing timeline. "
        "Transaction records confirm the refund was issued per policy."
    ),
    "duplicate_processing": (
        "Payment records confirm duplicate charge identification. "
        "Refund documentation attached to support resolution."
    ),
}


def _build_representment_note(visible_case: dict[str, Any]) -> str:
    """Generate a representment note summarizing the dispute contest rationale.

    The note starts with the template for the case's ``reason_code`` (or a
    generic sentence for unknown codes), then lists the policy requirements
    and finally the IDs of attached evidence that is not flagged as harmful.

    Args:
        visible_case: Case view; the keys read are ``reason_code``,
            ``policy`` (with ``requirements``) and ``attached_evidence``
            (items with ``evidence_id``). All of them are optional.

    Returns:
        The note text, truncated to at most 500 characters.
    """
    # A present-but-null reason code is handled like a missing one instead of
    # crashing on ``None.replace``.
    reason = visible_case.get("reason_code") or ""
    note = _NOTE_TEMPLATES.get(reason)
    if note is None:
        note = f"Contesting {reason.replace('_', ' ')} dispute with attached evidence."

    # Inject policy requirement keywords directly for claims coverage scoring.
    policy = visible_case.get("policy")
    if policy:
        requirements = policy.get("requirements") or []
        if requirements:
            note += " Evidence covers: " + ", ".join(requirements) + "."

    # Reference evidence IDs directly for coherence scoring.
    attached = visible_case.get("attached_evidence") or []
    safe_ids = [
        item["evidence_id"] for item in attached if not _is_harmful_evidence(item)
    ]
    if safe_ids:
        note += " Supporting evidence: " + ", ".join(safe_ids) + "."

    return note[:500]
def _visible_case_deadline(queue: list[dict[str, Any]], case_id: str) -> int:
    """Return ``steps_until_deadline`` for the first queue entry matching ``case_id``.

    When no entry has that ``case_id`` the sentinel 999 is returned.
    """
    matching_deadlines = (
        entry["steps_until_deadline"]
        for entry in queue
        if entry["case_id"] == case_id
    )
    return next(matching_deadlines, 999)
# Substrings that mark a piece of evidence as likely to hurt the merchant's case.
_NEGATIVE_SIGNAL_KEYWORDS = {
    "mismatch",
    "failed",
    "declined",
    "suspicious",
    "flagged",
    "fraud risk",
    "unauthorized",
    "rejected",
    "invalid",
    "expired",
    "violation",
    "non-compliant",
    "discrepancy",
    "inconsistent",
    "unverified",
}

# Rank returned for evidence that must never be attached.
_HARMFUL_RANK = 999
# Rank returned when no keyword tier matches.
_UNRANKED = 4
# (rank, keywords) tiers, checked in order; the first tier with a hit wins.
_ATTACH_RANK_TIERS = (
    (0, ("signature", "completion", "booking", "listing")),
    (1, ("duplicate", "delivery", "prior", "account", "authenticated")),
    # "cancel" also covers "cancellation", so no separate entry is needed.
    (2, ("return policy", "refund", "cancel", "confirmation")),
)


def _evidence_text(item: dict[str, Any]) -> str:
    """Return the lower-cased title and summary of an evidence item."""
    title = item.get("title") or ""
    summary = item.get("summary") or ""
    return (title + " " + summary).lower()


def _is_harmful_evidence(item: dict[str, Any]) -> bool:
    """Conservative heuristic: flag evidence with negative-signal language.

    Returns True when the item's title or summary contains any entry of
    ``_NEGATIVE_SIGNAL_KEYWORDS`` as a substring (case-insensitive). Missing
    or null title/summary fields are treated as empty text.
    """
    text = _evidence_text(item)
    return any(keyword in text for keyword in _NEGATIVE_SIGNAL_KEYWORDS)


def _rank_attachable(item: dict[str, Any]) -> int:
    """Rank an evidence item for attachment; lower ranks are attached first.

    Returns 999 for evidence flagged by ``_is_harmful_evidence``, otherwise
    0, 1 or 2 according to the first keyword tier found in the item's title
    and summary, and 4 when no tier matches.
    """
    # Share the harmful check with _is_harmful_evidence so both helpers
    # always agree on what counts as harmful.
    if _is_harmful_evidence(item):
        return _HARMFUL_RANK
    text = _evidence_text(item)
    for rank, keywords in _ATTACH_RANK_TIERS:
        if any(keyword in text for keyword in keywords):
            return rank
    return _UNRANKED
def _batch_attachable_ids(
    retrieved_items: list[dict[str, Any]], attached_ids: set[str]
) -> list[str]:
    """Return IDs of retrieved evidence worth attaching, best rank first.

    Items that are already attached or that ``_rank_attachable`` scores as
    999 are left out. Items with equal rank keep their retrieval order.
    """
    scored: list[tuple[int, str]] = []
    for entry in retrieved_items:
        if entry["evidence_id"] in attached_ids:
            continue
        rank = _rank_attachable(entry)
        if rank < 999:
            scored.append((rank, entry["evidence_id"]))
    # list.sort is stable, so ties stay in retrieval order.
    scored.sort(key=lambda pair: pair[0])
    return [evidence_id for _, evidence_id in scored]
| def candidate_actions(observation: dict[str, Any]) -> list[CandidateAction]: | |
| """Build a prioritized candidate set from the current observation.""" | |
| queue = observation["queue"] | |
| visible_case = observation.get("visible_case") | |
| open_cases = [case for case in queue if case["status"] == "open"] | |
| candidates: list[CandidateAction] = [] | |
| if not open_cases and "wait_for_updates" in observation.get("available_actions", []): | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction(action_type="wait_for_updates"), | |
| summary="Wait for delayed issuer reviews, delayed evidence, or future case arrivals.", | |
| ) | |
| ) | |
| return candidates | |
| # Step cost estimates per reason code (select_case + full workflow). | |
| _FAST_REASON_CODES = { | |
| "goods_not_received", | |
| "credit_not_processed", | |
| "duplicate_processing", | |
| } | |
| _STEP_COST_ESTIMATE = { | |
| "goods_not_received": 6, # select + 2 queries + attach + strategy + submit | |
| "credit_not_processed": 3, # select + strategy + resolve | |
| "duplicate_processing": 3, # select + strategy + resolve | |
| "fraud_cnp": 8, # select + policy + 2-3 queries + attach + strategy + submit | |
| "product_not_as_described": 8, # select + policy + 2-3 queries + attach + strategy + submit | |
| "service_not_provided": 7, # select + policy + 2 queries + attach + strategy + submit | |
| } | |
| def _case_priority(item): | |
| return ( | |
| item["steps_until_deadline"], | |
| 0 if item["reason_code"] in _FAST_REASON_CODES else 1, | |
| -item["amount"], | |
| ) | |
| if visible_case is None: | |
| steps_remaining = observation.get("steps_remaining", 999) | |
| # Smart triage: if total estimated cost > budget, fast-concede the cheapest-to-lose cases first. | |
| if len(open_cases) > 1: | |
| total_cost = sum( | |
| _STEP_COST_ESTIMATE.get(c["reason_code"], 7) for c in open_cases | |
| ) | |
| if total_cost > steps_remaining: | |
| # Budget can't fit all cases. Strategy: | |
| # 1. Handle deterministic-strategy cases first (cheapest, guaranteed outcome). | |
| # 2. Then prioritize highest-amount cases with tightest deadlines. | |
| # 3. Cases that can't fit get auto-conceded by the per-case budget check. | |
| def _triage_key(c): | |
| is_fast = c["reason_code"] in _FAST_REASON_CODES | |
| # Fast cases go first (tier 0), then by amount descending (highest value first). | |
| return (0 if is_fast else 1, -c["amount"]) | |
| ordered = sorted(open_cases, key=_triage_key) | |
| for case in ordered: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="select_case", case_id=case["case_id"] | |
| ), | |
| summary=( | |
| f"Select case {case['case_id']} ({case['reason_code']}, amount ${case['amount']}, " | |
| f"deadline in {case['steps_until_deadline']} steps)." | |
| ), | |
| ) | |
| ) | |
| return candidates | |
| for case in sorted(open_cases, key=_case_priority): | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="select_case", case_id=case["case_id"] | |
| ), | |
| summary=( | |
| f"Select case {case['case_id']} ({case['reason_code']}, amount ${case['amount']}, " | |
| f"deadline in {case['steps_until_deadline']} steps)." | |
| ), | |
| ) | |
| ) | |
| return candidates | |
| case_id = visible_case["case_id"] | |
| if visible_case["status"] != "open": | |
| for case in sorted(open_cases, key=_case_priority): | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="select_case", case_id=case["case_id"] | |
| ), | |
| summary=( | |
| f"Switch to open case {case['case_id']} (deadline in {case['steps_until_deadline']} steps, " | |
| f"amount ${case['amount']})." | |
| ), | |
| ) | |
| ) | |
| if not candidates and "wait_for_updates" in observation.get("available_actions", []): | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction(action_type="wait_for_updates"), | |
| summary="Wait because selected case is blocked and no open case is currently available.", | |
| ) | |
| ) | |
| return candidates | |
| # Round 2 (pre-arbitration). Issuer rejected the round-1 packet and is | |
| # asking for compelling evidence. Three legal moves: respond_to_pre_arb, | |
| # escalate_to_arbitration, accept_arbitration_loss. | |
| available = set(observation.get("available_actions", [])) | |
| if "respond_to_pre_arb" in available: | |
| retrieved_items_r2 = visible_case.get("retrieved_evidence", []) | |
| attached_ids_r2 = { | |
| item["evidence_id"] for item in visible_case.get("attached_evidence", []) | |
| } | |
| compelling_ids = [ | |
| item["evidence_id"] | |
| for item in retrieved_items_r2 | |
| if item["evidence_id"] not in attached_ids_r2 | |
| and not _is_harmful_evidence(item) | |
| ] | |
| compelling_ids = sorted( | |
| compelling_ids, | |
| key=lambda eid: _rank_attachable( | |
| next( | |
| item | |
| for item in retrieved_items_r2 | |
| if item["evidence_id"] == eid | |
| ) | |
| ), | |
| )[:2] | |
| if compelling_ids: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="respond_to_pre_arb", | |
| case_id=case_id, | |
| compelling_evidence_ids=compelling_ids, | |
| note=_build_representment_note(visible_case), | |
| ), | |
| summary=( | |
| f"Respond to pre-arbitration with compelling evidence " | |
| f"{', '.join(compelling_ids)} for case {case_id}." | |
| ), | |
| ) | |
| ) | |
| return candidates | |
| # No retrieved compelling evidence left. Try querying an unrevealed | |
| # merchant system before giving up — round-2 budget often allows it | |
| # and one extra +0.15 pre_arb piece can clear the 0.60 acceptance bar. | |
| # Order matters: support/risk/refunds tend to hold compelling pieces; | |
| # payment is mostly auth records and harmful AVS/CVV mismatches. | |
| revealed = set(visible_case.get("systems_revealed", [])) | |
| all_systems = ("support", "risk", "refunds", "shipping", "orders", "payment") | |
| unrevealed = [s for s in all_systems if s not in revealed] | |
| if unrevealed and "query_system" in available: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="query_system", | |
| case_id=case_id, | |
| system_name=unrevealed[0], | |
| ), | |
| summary=( | |
| f"Query {unrevealed[0]} for compelling evidence " | |
| f"on case {case_id} before deciding to escalate." | |
| ), | |
| ) | |
| ) | |
| return candidates | |
| # No compelling evidence anywhere. Decide on ROI: arbitration costs | |
| # $250/side. Use the EV rule: escalate iff p_win * amount > arb_fee. | |
| # Round-2 arbitration score is typically in the ambiguity band | |
| # (P~0.5), so escalate when amount > 2 * 250 = 500. | |
| amount = float(visible_case.get("amount", 0.0)) | |
| if amount >= 500.0 and "escalate_to_arbitration" in available: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="escalate_to_arbitration", | |
| case_id=case_id, | |
| ), | |
| summary=( | |
| f"Escalate case {case_id} to arbitration " | |
| f"(amount ${amount:.0f} clears the EV break-even)." | |
| ), | |
| ) | |
| ) | |
| return candidates | |
| if "accept_arbitration_loss" in available: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="accept_arbitration_loss", | |
| case_id=case_id, | |
| ), | |
| summary=( | |
| f"Accept arbitration loss on case {case_id} — no " | |
| f"compelling evidence and amount below ROI cutoff." | |
| ), | |
| ) | |
| ) | |
| return candidates | |
| current_deadline = _visible_case_deadline(queue, case_id) | |
| best_other = _best_open_case( | |
| [case for case in open_cases if case["case_id"] != case_id] | |
| ) | |
| # Only switch to an urgent other case if the current case isn't close to completion. | |
| # "Close" means: strategy is set and evidence attached (1 step to submit), | |
| # OR evidence is attached and strategy just needs to be set (2 steps to finish). | |
| _has_attached = len(visible_case.get("attached_evidence", [])) >= 1 | |
| current_near_completion = ( | |
| visible_case.get("current_strategy") == "contest" and _has_attached | |
| ) or ( | |
| _has_attached | |
| and visible_case.get("current_strategy") is None | |
| and current_deadline >= 2 | |
| ) | |
| if ( | |
| best_other is not None | |
| and best_other["steps_until_deadline"] <= 1 | |
| and current_deadline > 1 | |
| and not current_near_completion | |
| ): | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="select_case", case_id=best_other["case_id"] | |
| ), | |
| summary=( | |
| f"Switch to case {best_other['case_id']} immediately because its deadline is in " | |
| f"{best_other['steps_until_deadline']} steps." | |
| ), | |
| ) | |
| ) | |
| reason_code = visible_case["reason_code"] | |
| # Reason codes with deterministic strategies — no need to retrieve policy. | |
| # Only codes where the optimal strategy NEVER varies across generated/ISO cases. | |
| # fraud_cnp, product_not_as_described, service_not_provided all vary. | |
| _DETERMINISTIC_STRATEGY: dict[str, str] = { | |
| "goods_not_received": "contest", | |
| "credit_not_processed": "issue_refund", | |
| "duplicate_processing": "issue_refund", | |
| } | |
| steps_remaining = observation.get("steps_remaining", 999) | |
| budget_per_case = steps_remaining / max(len(open_cases), 1) | |
| policy = visible_case.get("policy") | |
| if policy is None: | |
| if reason_code in _DETERMINISTIC_STRATEGY: | |
| inferred_strategy = _DETERMINISTIC_STRATEGY[reason_code] | |
| else: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="retrieve_policy", case_id=case_id | |
| ), | |
| summary="Retrieve the chargeback policy for the selected reason code.", | |
| ) | |
| ) | |
| inferred_strategy = None | |
| else: | |
| guidance_text = policy.get("guidance", "").lower() | |
| if ( | |
| "do not contest" in guidance_text | |
| or "concede" in guidance_text | |
| or "not supportable" in guidance_text | |
| ): | |
| inferred_strategy = "accept_chargeback" | |
| elif ( | |
| "refund immediately" in guidance_text | |
| or "refund" in guidance_text | |
| and "contest" not in guidance_text | |
| ): | |
| inferred_strategy = "issue_refund" | |
| else: | |
| inferred_strategy = "contest" | |
| # How many steps remain before this case's deadline. | |
| # After querying, we still need: attach(1) + set_strategy(1) + submit/resolve(1) = 3 steps. | |
| # If policy isn't retrieved yet, add 1 for retrieve_policy. | |
| _FIXED_COST = 3 # attach + strategy + submit | |
| steps_to_deadline = current_deadline # steps_until_deadline from the queue | |
| policy_cost = 0 if visible_case.get("policy") is not None else 1 | |
| max_queries_before_deadline = max(0, steps_to_deadline - _FIXED_COST - policy_cost) | |
| systems_revealed = set(visible_case.get("systems_revealed", [])) | |
| current_strategy = visible_case.get("current_strategy") | |
| retrieved_items = visible_case.get("retrieved_evidence", []) | |
| attached_evidence = visible_case.get("attached_evidence", []) | |
| attached_ids = {item["evidence_id"] for item in attached_evidence} | |
| attachable_ids = _batch_attachable_ids(retrieved_items, attached_ids) | |
| # Detect harmful evidence already attached — must remove before submit. | |
| harmful_attached_ids = [ | |
| item["evidence_id"] for item in attached_evidence if _is_harmful_evidence(item) | |
| ] | |
| # ── HARMFUL CLEANUP: if harmful evidence is attached, remove it immediately ── | |
| if harmful_attached_ids: | |
| candidates.insert( | |
| 0, | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="remove_evidence", | |
| case_id=case_id, | |
| evidence_ids=harmful_attached_ids, | |
| ), | |
| summary=f"Remove harmful evidence {', '.join(harmful_attached_ids)} before submission.", | |
| ), | |
| ) | |
| return candidates | |
| # ── DEADLINE URGENCY: if near deadline and we have evidence, submit/resolve NOW ── | |
| if current_deadline <= 1: | |
| if ( | |
| current_strategy is not None | |
| and len(attached_ids) >= 1 | |
| and current_strategy == "contest" | |
| ): | |
| candidates.insert( | |
| 0, | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="submit_representment", | |
| case_id=case_id, | |
| note=_build_representment_note(visible_case), | |
| ), | |
| summary=f"URGENT: Submit representment for {case_id} — deadline imminent.", | |
| ), | |
| ) | |
| return candidates | |
| if current_strategy in {"accept_chargeback", "issue_refund"}: | |
| candidates.insert( | |
| 0, | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="resolve_case", | |
| case_id=case_id, | |
| strategy=current_strategy, | |
| ), | |
| summary=f"URGENT: Resolve {case_id} with {current_strategy} — deadline imminent.", | |
| ), | |
| ) | |
| return candidates | |
| # ── TIGHT BUDGET: fast-concede if not enough steps to contest this case ── | |
| # Full contest costs ~7 steps (policy + 2-3 queries + attach + strategy + submit). | |
| # Fast-concede when: | |
| # (a) Not enough global steps remaining, OR | |
| # (b) Multi-case scenario where this case is lower-value and budget can't fit all. | |
| _est_cost = ( | |
| _STEP_COST_ESTIMATE.get(reason_code, 7) - 1 | |
| ) # subtract select_case already done | |
| # Minimum contest: policy(1) + query(1) + attach(1) + strategy(1) + submit(1) = 5 steps. | |
| _MIN_CONTEST_STEPS = 5 | |
| _should_fast_concede = False | |
| if ( | |
| policy is None | |
| and reason_code not in _DETERMINISTIC_STRATEGY | |
| and current_strategy is None | |
| and not systems_revealed | |
| ): | |
| if ( | |
| steps_remaining < _MIN_CONTEST_STEPS | |
| or current_deadline < _MIN_CONTEST_STEPS | |
| ): | |
| # Not enough steps or deadline to even minimally contest. | |
| _should_fast_concede = True | |
| elif len(open_cases) > 1: | |
| # Multi-case triage: concede if total cost > budget and this case is lowest-value. | |
| total_cost = sum( | |
| _STEP_COST_ESTIMATE.get(c["reason_code"], 7) for c in open_cases | |
| ) | |
| if total_cost > steps_remaining: | |
| lowest_amount = min(c["amount"] for c in open_cases) | |
| this_amount = next( | |
| c["amount"] for c in open_cases if c["case_id"] == case_id | |
| ) | |
| if this_amount <= lowest_amount: | |
| _should_fast_concede = True | |
| if _should_fast_concede: | |
| fallback = "issue_refund" | |
| candidates.insert( | |
| 0, | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="resolve_case", | |
| case_id=case_id, | |
| strategy=fallback, | |
| ), | |
| summary=f"Budget too tight to contest — fast-resolve {case_id} with {fallback}.", | |
| ), | |
| ) | |
| return candidates | |
| # ── BUDGET PRESSURE: if more open cases than steps, fast-resolve concedable ── | |
| if steps_remaining <= len(open_cases) * 2 and inferred_strategy in { | |
| "accept_chargeback", | |
| "issue_refund", | |
| }: | |
| target_strat = inferred_strategy | |
| if current_strategy != target_strat: | |
| candidates.insert( | |
| 0, | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="set_strategy", | |
| case_id=case_id, | |
| strategy=target_strat, | |
| ), | |
| summary=f"Fast-set strategy to {target_strat} under budget pressure.", | |
| ), | |
| ) | |
| return candidates | |
| candidates.insert( | |
| 0, | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="resolve_case", | |
| case_id=case_id, | |
| strategy=target_strat, | |
| ), | |
| summary=f"Fast-resolve {case_id} with {target_strat} under budget pressure.", | |
| ), | |
| ) | |
| return candidates | |
| if reason_code == "goods_not_received": | |
| for system_name in ["orders", "shipping"]: | |
| if system_name not in systems_revealed: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="query_system", | |
| case_id=case_id, | |
| system_name=system_name, | |
| ), | |
| summary=f"Query the {system_name} system for evidence on case {case_id}.", | |
| ) | |
| ) | |
| if attachable_ids: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="add_evidence", | |
| case_id=case_id, | |
| evidence_ids=attachable_ids, | |
| ), | |
| summary=f"Attach the strongest delivery evidence for case {case_id}.", | |
| ) | |
| ) | |
| if current_strategy != "contest": | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="set_strategy", | |
| case_id=case_id, | |
| strategy="contest", | |
| ), | |
| summary="Set the strategy to contest the dispute.", | |
| ) | |
| ) | |
| if len(attached_ids) >= 2: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="submit_representment", | |
| case_id=case_id, | |
| note=_build_representment_note(visible_case), | |
| ), | |
| summary="Submit the current representment package.", | |
| ) | |
| ) | |
| elif reason_code == "fraud_cnp": | |
| should_contest = inferred_strategy == "contest" | |
| if should_contest: | |
| # Under tight budgets or deadline pressure, skip optional 'orders' query. | |
| fraud_systems = ["risk", "support", "orders"] | |
| unrevealed_fraud = [s for s in fraud_systems if s not in systems_revealed] | |
| if ( | |
| len(unrevealed_fraud) > max_queries_before_deadline | |
| or budget_per_case < 7 | |
| ): | |
| fraud_systems = ["risk", "support"] | |
| unrevealed_fraud = [ | |
| s for s in fraud_systems if s not in systems_revealed | |
| ] | |
| for system_name in unrevealed_fraud: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="query_system", | |
| case_id=case_id, | |
| system_name=system_name, | |
| ), | |
| summary=f"Query the {system_name} system for evidence on case {case_id}.", | |
| ) | |
| ) | |
| if attachable_ids: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="add_evidence", | |
| case_id=case_id, | |
| evidence_ids=attachable_ids, | |
| ), | |
| summary=f"Attach the strongest account-linkage evidence for case {case_id}.", | |
| ) | |
| ) | |
| if current_strategy != "contest": | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="set_strategy", | |
| case_id=case_id, | |
| strategy="contest", | |
| ), | |
| summary="Set the strategy to contest the dispute.", | |
| ) | |
| ) | |
| if len(attached_ids) >= 2: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="submit_representment", | |
| case_id=case_id, | |
| note=_build_representment_note(visible_case), | |
| ), | |
| summary="Submit the current representment package.", | |
| ) | |
| ) | |
| if current_strategy != "accept_chargeback": | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="set_strategy", | |
| case_id=case_id, | |
| strategy="accept_chargeback", | |
| ), | |
| summary="Set the strategy to accept the chargeback.", | |
| ) | |
| ) | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="resolve_case", | |
| case_id=case_id, | |
| strategy="accept_chargeback", | |
| ), | |
| summary="Concede the dispute and accept the chargeback.", | |
| ) | |
| ) | |
| elif reason_code in {"credit_not_processed", "duplicate_processing"}: | |
| # Fast-path: set strategy and resolve immediately — don't waste steps querying | |
| if current_strategy != "issue_refund": | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="set_strategy", | |
| case_id=case_id, | |
| strategy="issue_refund", | |
| ), | |
| summary="Set the strategy to issue a refund immediately.", | |
| ) | |
| ) | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="resolve_case", | |
| case_id=case_id, | |
| strategy="issue_refund", | |
| ), | |
| summary="Resolve the case by issuing a refund.", | |
| ) | |
| ) | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="resolve_case", | |
| case_id=case_id, | |
| strategy="accept_chargeback", | |
| ), | |
| summary="Accept the chargeback as a fallback resolution.", | |
| ) | |
| ) | |
| elif reason_code == "product_not_as_described": | |
| if inferred_strategy in {"accept_chargeback", "issue_refund"}: | |
| # Guidance says concede — fast-path | |
| target = inferred_strategy | |
| if current_strategy != target: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="set_strategy", case_id=case_id, strategy=target | |
| ), | |
| summary=f"Set strategy to {target} — listing defense not supportable.", | |
| ) | |
| ) | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="resolve_case", case_id=case_id, strategy=target | |
| ), | |
| summary=f"Resolve with {target} — conceding per policy guidance.", | |
| ) | |
| ) | |
| else: | |
| # Under deadline pressure, skip shipping (least critical for this reason code). | |
| pna_systems = ["orders", "support", "shipping"] | |
| unrevealed = [s for s in pna_systems if s not in systems_revealed] | |
| if len(unrevealed) > max_queries_before_deadline: | |
| pna_systems = ["orders", "support"] # Drop shipping | |
| unrevealed = [s for s in pna_systems if s not in systems_revealed] | |
| for system_name in unrevealed: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="query_system", | |
| case_id=case_id, | |
| system_name=system_name, | |
| ), | |
| summary=f"Query the {system_name} system for listing and return-process evidence on case {case_id}.", | |
| ) | |
| ) | |
| if attachable_ids: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="add_evidence", | |
| case_id=case_id, | |
| evidence_ids=attachable_ids, | |
| ), | |
| summary=f"Attach listing accuracy and return-policy evidence for case {case_id}.", | |
| ) | |
| ) | |
| if current_strategy != "contest": | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="set_strategy", | |
| case_id=case_id, | |
| strategy="contest", | |
| ), | |
| summary="Set the strategy to contest the dispute.", | |
| ) | |
| ) | |
| if len(attached_ids) >= 2: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="submit_representment", | |
| case_id=case_id, | |
| note=_build_representment_note(visible_case), | |
| ), | |
| summary="Submit the current representment package.", | |
| ) | |
| ) | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="resolve_case", | |
| case_id=case_id, | |
| strategy="issue_refund", | |
| ), | |
| summary="Issue a refund as a fallback if the listing defense is not supportable.", | |
| ) | |
| ) | |
| elif reason_code == "service_not_provided": | |
| if inferred_strategy in {"accept_chargeback", "issue_refund"}: | |
| target = inferred_strategy | |
| if current_strategy != target: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="set_strategy", case_id=case_id, strategy=target | |
| ), | |
| summary=f"Set strategy to {target} — service defense not supportable.", | |
| ) | |
| ) | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="resolve_case", case_id=case_id, strategy=target | |
| ), | |
| summary=f"Resolve with {target} — conceding per policy guidance.", | |
| ) | |
| ) | |
| else: | |
| snp_systems = ["orders", "support"] | |
| unrevealed_snp = [s for s in snp_systems if s not in systems_revealed] | |
| if len(unrevealed_snp) > max_queries_before_deadline: | |
| snp_systems = [ | |
| "support" | |
| ] # Support is most critical for service disputes. | |
| unrevealed_snp = [s for s in snp_systems if s not in systems_revealed] | |
| for system_name in unrevealed_snp: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="query_system", | |
| case_id=case_id, | |
| system_name=system_name, | |
| ), | |
| summary=f"Query the {system_name} system for booking and completion evidence on case {case_id}.", | |
| ) | |
| ) | |
| if attachable_ids: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="add_evidence", | |
| case_id=case_id, | |
| evidence_ids=attachable_ids, | |
| ), | |
| summary=f"Attach booking and completion evidence for case {case_id}.", | |
| ) | |
| ) | |
| if current_strategy != "contest": | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="set_strategy", | |
| case_id=case_id, | |
| strategy="contest", | |
| ), | |
| summary="Set the strategy to contest the dispute.", | |
| ) | |
| ) | |
| if len(attached_ids) >= 2: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="submit_representment", | |
| case_id=case_id, | |
| note=_build_representment_note(visible_case), | |
| ), | |
| summary="Submit the current representment package.", | |
| ) | |
| ) | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="resolve_case", | |
| case_id=case_id, | |
| strategy="issue_refund", | |
| ), | |
| summary="Issue a refund as a fallback if the service-delivery defense is weak.", | |
| ) | |
| ) | |
| elif inferred_strategy in {"accept_chargeback", "issue_refund"}: | |
| for system_name in ["support", "refunds", "payment"]: | |
| if system_name not in systems_revealed: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="query_system", | |
| case_id=case_id, | |
| system_name=system_name, | |
| ), | |
| summary=f"Query the {system_name} system for concession evidence on case {case_id}.", | |
| ) | |
| ) | |
| if current_strategy != inferred_strategy: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="set_strategy", | |
| case_id=case_id, | |
| strategy=inferred_strategy, | |
| ), | |
| summary=f"Set the strategy to {inferred_strategy}.", | |
| ) | |
| ) | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="resolve_case", | |
| case_id=case_id, | |
| strategy=inferred_strategy, | |
| ), | |
| summary=f"Resolve the case with strategy {inferred_strategy}.", | |
| ) | |
| ) | |
| else: | |
| for system_name in ["orders", "support", "shipping", "risk"]: | |
| if system_name not in systems_revealed: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="query_system", | |
| case_id=case_id, | |
| system_name=system_name, | |
| ), | |
| summary=f"Query the {system_name} system for additional evidence on case {case_id}.", | |
| ) | |
| ) | |
| if attachable_ids: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="add_evidence", | |
| case_id=case_id, | |
| evidence_ids=attachable_ids, | |
| ), | |
| summary=f"Attach the strongest currently available evidence for case {case_id}.", | |
| ) | |
| ) | |
| if current_strategy != "contest": | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="set_strategy", | |
| case_id=case_id, | |
| strategy="contest", | |
| ), | |
| summary="Set the strategy to contest the dispute.", | |
| ) | |
| ) | |
| if len(attached_ids) >= 1: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="submit_representment", | |
| case_id=case_id, | |
| note=_build_representment_note(visible_case), | |
| ), | |
| summary="Submit the current representment package.", | |
| ) | |
| ) | |
| if ( | |
| visible_case.get("inspection_notes") is None | |
| and observation["steps_remaining"] > 3 | |
| ): | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction(action_type="inspect_case", case_id=case_id), | |
| summary="Inspect the selected case to reveal merchant notes.", | |
| ) | |
| ) | |
| for case in sorted( | |
| open_cases, key=lambda item: (item["steps_until_deadline"], -item["amount"]) | |
| ): | |
| if case["case_id"] != case_id: | |
| candidates.append( | |
| CandidateAction( | |
| action=ChargebackOpsAction( | |
| action_type="select_case", case_id=case["case_id"] | |
| ), | |
| summary=( | |
| f"Switch to case {case['case_id']} (deadline in {case['steps_until_deadline']} steps, " | |
| f"amount ${case['amount']})." | |
| ), | |
| ) | |
| ) | |
| return candidates | |
| def _heuristic_pick(candidates: list[CandidateAction]) -> CandidateAction: | |
| return candidates[0] | |
| def _obvious_next_action( | |
| observation: dict[str, Any], | |
| candidates: list[CandidateAction], | |
| ) -> CandidateAction | None: | |
| """Skip provider calls for deterministic housekeeping actions. | |
| This preserves live model decisions for genuine branching states while keeping | |
| baseline/inference runtime inside hackathon-friendly bounds. | |
| """ | |
| if not candidates: | |
| return None | |
| # Single candidate = no decision to make. | |
| if len(candidates) == 1: | |
| return candidates[0] | |
| first = candidates[0] | |
| visible_case = observation.get("visible_case") | |
| queue = observation["queue"] | |
| if visible_case is None: | |
| open_cases = [case for case in queue if case["status"] == "open"] | |
| if len(open_cases) == 1: | |
| return first | |
| urgent_cases = [ | |
| case for case in open_cases if case["steps_until_deadline"] <= 1 | |
| ] | |
| if ( | |
| len(urgent_cases) == 1 | |
| and first.action.action_type == "select_case" | |
| and first.action.case_id == urgent_cases[0]["case_id"] | |
| ): | |
| return first | |
| return None | |
| if visible_case["status"] != "open": | |
| return first if first.action.action_type == "select_case" else None | |
| # Strategy selection: the heuristic already derives the optimal strategy | |
| # from policy + retrieved evidence. The LLM has no additional signal that | |
| # improves this specific call — invoking it here has only caused regressions | |
| # on fraud_signal_ambiguity and generated_medium_s99 where the model picks | |
| # a concede-style strategy over the correct contest. | |
| if first.action.action_type == "set_strategy": | |
| return first | |
| if first.action.action_type in { | |
| "retrieve_policy", | |
| "add_evidence", | |
| "remove_evidence", | |
| "submit_representment", | |
| "resolve_case", | |
| }: | |
| return first | |
| if first.action.action_type == "query_system": | |
| current_strategy = visible_case.get("current_strategy") | |
| if visible_case.get("policy") is None or current_strategy in {None, "contest"}: | |
| return first | |
| if first.action.action_type == "select_case": | |
| current_case_id = visible_case["case_id"] | |
| current_deadline = next( | |
| ( | |
| case["steps_until_deadline"] | |
| for case in queue | |
| if case["case_id"] == current_case_id | |
| ), | |
| 999, | |
| ) | |
| target_deadline = next( | |
| ( | |
| case["steps_until_deadline"] | |
| for case in queue | |
| if case["case_id"] == first.action.case_id | |
| ), | |
| 999, | |
| ) | |
| if target_deadline < current_deadline: | |
| return first | |
| return None | |
def _safe_json_loads(text: str) -> CandidateChoice | None:
    """Parse a provider reply into a ``CandidateChoice``, tolerating extra text.

    The whole reply is validated first. If that fails, the span from the first
    ``{`` to the last ``}`` is validated instead. Returns ``None`` when neither
    attempt yields a valid choice.
    """
    try:
        return CandidateChoice.model_validate_json(text)
    except Exception:
        pass

    # Salvage a JSON object embedded in surrounding prose or code fences.
    opening = text.find("{")
    closing = text.rfind("}")
    if opening == -1 or closing <= opening:
        return None
    try:
        return CandidateChoice.model_validate_json(text[opening : closing + 1])
    except Exception:
        return None
| def _compact_queue_item(case: dict[str, Any]) -> dict[str, Any]: | |
| return { | |
| "case_id": case["case_id"], | |
| "reason_code": case["reason_code"], | |
| "amount": case["amount"], | |
| "status": case["status"], | |
| "steps_until_deadline": case["steps_until_deadline"], | |
| } | |
| def _compact_visible_case(visible_case: dict[str, Any] | None) -> dict[str, Any] | None: | |
| if visible_case is None: | |
| return None | |
| return { | |
| "case_id": visible_case["case_id"], | |
| "reason_code": visible_case["reason_code"], | |
| "current_strategy": visible_case.get("current_strategy"), | |
| "systems_revealed": visible_case.get("systems_revealed", []), | |
| "attached_evidence": [ | |
| item["title"] for item in visible_case.get("attached_evidence", [])[:4] | |
| ], | |
| "retrieved_evidence": [ | |
| item["title"] for item in visible_case.get("retrieved_evidence", [])[:6] | |
| ], | |
| "policy": ( | |
| { | |
| "guidance": visible_case["policy"]["guidance"], | |
| "required_evidence": visible_case["policy"]["required_evidence"], | |
| } | |
| if visible_case.get("policy") | |
| else None | |
| ), | |
| "submission_status": visible_case.get("submission_status"), | |
| } | |
def _provider_payload(
    observation: dict[str, Any],
    candidates: list[CandidateAction],
) -> tuple[list[CandidateAction], str]:
    """Build the shortlist and the compact JSON prompt sent to a provider.

    Returns:
        ``(shortlist, payload)`` where ``shortlist`` holds at most
        ``MAX_LLM_CANDIDATES`` leading candidates and ``payload`` is a
        whitespace-free JSON string describing the observation and the
        shortlist (each entry as ``index`` + ``summary``).
    """
    shortlist = candidates[:MAX_LLM_CANDIDATES]
    request: dict[str, Any] = {
        "task_id": observation["task_id"],
        "steps_remaining": observation["steps_remaining"],
        "selected_case_id": observation.get("selected_case_id"),
        "queue": [_compact_queue_item(entry) for entry in observation["queue"]],
        "visible_case": _compact_visible_case(observation.get("visible_case")),
    }
    # Indices refer to positions in the shortlist, not the full candidate list.
    request["candidates"] = [
        {"index": position, "summary": option.summary}
        for position, option in enumerate(shortlist)
    ]
    return shortlist, json.dumps(request, separators=(",", ":"))
def _resolve_provider(
    provider: str | None,
    model_name: str | None,
) -> ProviderConfig:
    """Resolve the provider and model from arguments, environment, then defaults.

    Provider precedence: ``provider`` argument, ``BASELINE_PROVIDER``,
    ``DEFAULT_PROVIDER`` (lower-cased). Model precedence: ``model_name``
    argument, ``BASELINE_MODEL``, the provider's entry in ``DEFAULT_MODELS``,
    and finally ``"openai/gpt-oss-120b"`` for unknown providers.
    """
    provider_key = (
        provider or os.getenv("BASELINE_PROVIDER") or DEFAULT_PROVIDER
    ).lower()

    requested_model = model_name or os.getenv("BASELINE_MODEL")
    if not requested_model:
        requested_model = DEFAULT_MODELS.get(provider_key, "openai/gpt-oss-120b")

    return ProviderConfig(provider=provider_key, model_name=requested_model)
def _openai_compatible_client(config: ProviderConfig) -> OpenAI | None:
    """Create an ``OpenAI`` SDK client for an OpenAI-compatible provider.

    Returns ``None`` when the provider is not one of ``openai``,
    ``openrouter``, ``groq`` or ``google``, or when its API-key environment
    variable is unset or empty. SDK-level retries are disabled
    (``max_retries=0``).
    """
    timeout_seconds = _provider_timeout_seconds()

    # provider -> (API-key env var, base URL or None for the SDK default)
    routes: dict[str, tuple[str, str | None]] = {
        "openai": ("OPENAI_API_KEY", None),
        "openrouter": ("OPENROUTER_API_KEY", "https://openrouter.ai/api/v1"),
        "groq": ("GROQ_API_KEY", "https://api.groq.com/openai/v1"),
        "google": (
            "GOOGLE_API_KEY",
            "https://generativelanguage.googleapis.com/v1beta/openai/",
        ),
    }
    route = routes.get(config.provider)
    if route is None:
        return None

    key_env, base_url = route
    api_key = os.getenv(key_env)
    if not api_key:
        return None

    client_kwargs: dict[str, Any] = {
        "api_key": api_key,
        "timeout": timeout_seconds,
        "max_retries": 0,
    }
    if base_url is not None:
        client_kwargs["base_url"] = base_url

    if config.provider == "openrouter":
        # Optional attribution headers for OpenRouter.
        extra_headers: dict[str, str] = {}
        referer = os.getenv("OPENROUTER_HTTP_REFERER")
        if referer:
            extra_headers["HTTP-Referer"] = referer
        app_title = os.getenv("OPENROUTER_APP_TITLE")
        if app_title:
            extra_headers["X-OpenRouter-Title"] = app_title
            # Keep the legacy header for compatibility with older OpenRouter examples.
            extra_headers["X-Title"] = app_title
        client_kwargs["default_headers"] = extra_headers or None

    return OpenAI(**client_kwargs)
def _provider_pick(
    config: ProviderConfig,
    observation: dict[str, Any],
    candidates: list[CandidateAction],
) -> tuple[CandidateAction, bool, bool, str | None]:
    """Ask one provider to choose among the shortlisted candidates.

    Args:
        config: Provider name and model to call.
        observation: Dumped observation used to build the prompt payload.
        candidates: Candidate actions, preferred one first. Must be non-empty.

    Returns:
        ``(candidate, attempted, succeeded, error_label)``:

        * ``candidate`` is the provider's choice on success, otherwise the
          first shortlisted candidate.
        * ``attempted`` is ``False`` when no call was made (unknown provider,
          missing API key, or the ``anthropic`` package is not installed).
        * ``succeeded`` is ``True`` only when a parseable choice came back.
        * ``error_label`` is the exception class name or
          ``"InvalidJSONResponse"`` on an attempted-but-failed call, else
          ``None``.
    """
    shortlist, payload = _provider_payload(observation, candidates)
    default_choice = shortlist[0]

    def _interpret(reply_text: str) -> tuple[CandidateAction, bool, bool, str | None]:
        # Shared by both provider families: parse the reply and clamp the
        # returned index into the shortlist's range.
        choice = _safe_json_loads(reply_text)
        if choice is None:
            return default_choice, True, False, "InvalidJSONResponse"
        index = min(max(choice.candidate_index, 0), len(shortlist) - 1)
        return shortlist[index], True, True, None

    if config.provider in {"openai", "openrouter", "groq", "google"}:
        client = _openai_compatible_client(config)
        if client is None:
            return default_choice, False, False, None
        system_prompt = (
            "You are a merchant chargeback dispute analyst. Pick the single best next action from the ordered candidate list. "
            "The candidates are pre-sorted by a deterministic heuristic — candidate 0 is usually correct. Deviate only when you spot a concrete reason. "
            "\n"
            "Reason-code → optimal strategy (follow unless evidence clearly contradicts):\n"
            " goods_not_received → contest (with order + delivery proof)\n"
            " fraud_cnp → contest when account linkage exists, otherwise concede\n"
            " product_not_as_described → contest (with listing + return policy proof)\n"
            " service_not_provided → contest (with completion log)\n"
            " credit_not_processed → issue_refund immediately\n"
            " duplicate_processing → issue_refund immediately\n"
            "\n"
            "Priorities: (1) resolve cases whose deadline is 1 step away before anything else, "
            "(2) prefer the highest-$ open case when budget is tight, "
            "(3) never attach harmful evidence (AVS/CVV mismatch on fraud_cnp, GPS anomalies on goods_not_received), "
            "(4) when multiple candidates look equivalent, take candidate 0.\n"
            'Return only JSON: {"candidate_index": N, "rationale": "brief reason"}'
        )
        try:
            response = _chat_completion_with_retry(
                client,
                model=config.model_name,
                temperature=0,
                max_tokens=MAX_PROVIDER_RESPONSE_TOKENS,
                response_format={"type": "json_object"},
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": payload},
                ],
            )
            # An empty reply is treated as "{}" and handled by the parser.
            return _interpret(response.choices[0].message.content or "{}")
        except Exception as exc:
            # Best-effort: any provider/SDK failure falls back to candidate 0.
            return default_choice, True, False, exc.__class__.__name__

    if config.provider == "anthropic":
        api_key = os.getenv("ANTHROPIC_API_KEY")
        if not api_key:
            return default_choice, False, False, None
        try:  # pragma: no cover
            from anthropic import Anthropic
        except ImportError:  # pragma: no cover
            return default_choice, False, False, None
        try:  # pragma: no cover
            client = Anthropic(
                api_key=api_key,
                timeout=_provider_timeout_seconds(),
                max_retries=0,
            )
            response = client.messages.create(
                model=config.model_name,
                # Same response budget as the OpenAI-compatible branch.
                max_tokens=MAX_PROVIDER_RESPONSE_TOKENS,
                temperature=0,
                system=(
                    "You are a merchant chargeback analyst. Pick the single best next action. "
                    "Return only JSON with candidate_index and rationale."
                ),
                messages=[{"role": "user", "content": payload}],
            )
            reply_text = "".join(
                block.text
                for block in response.content
                if getattr(block, "type", "") == "text"
            )
            return _interpret(reply_text)
        except Exception as exc:
            return default_choice, True, False, exc.__class__.__name__

    # Unknown provider: nothing to call.
    return default_choice, False, False, None
def _provider_pick_with_fallback(
    config: ProviderConfig,
    observation: dict[str, Any],
    candidates: list[CandidateAction],
) -> tuple[CandidateAction, bool, bool, str | None]:
    """Try the primary provider, then walk the fallback chain on failure.

    Returns:
        ``(candidate, attempted, succeeded, error_label)`` with the same
        meaning as ``_provider_pick``. ``attempted`` is ``True`` if the primary
        or any fallback provider was actually called. When every provider
        fails, ``error_label`` is the primary's label if it has one, otherwise
        the first fallback label, otherwise ``"AllProvidersFailed"``.
    """
    candidate, attempted, succeeded, error = _provider_pick(
        config, observation, candidates
    )
    if succeeded:
        return candidate, attempted, succeeded, error

    for fb_provider, fb_model in _FALLBACK_CHAIN:
        if fb_provider == config.provider:
            continue
        fb_config = ProviderConfig(provider=fb_provider, model_name=fb_model)
        # _provider_pick reports attempted=False when the provider has no API
        # key, so no separate client needs to be built just to probe for one.
        candidate, fb_attempted, fb_succeeded, fb_error = _provider_pick(
            fb_config,
            observation,
            candidates,
        )
        if fb_succeeded:
            return candidate, True, True, None
        # Record that a real call was made so callers can count and report it.
        attempted = attempted or fb_attempted
        error = error or fb_error

    return candidate, attempted, False, error or "AllProvidersFailed"
def run_baseline(
    provider: str | None = None,
    model_name: str | None = None,
) -> BaselineRunResult:
    """Run the baseline across all built-in tasks.

    Each task is played in a fresh ``ChargebackOpsEnvironment`` until the
    observation reports ``done`` or no candidate actions remain. Obvious steps
    are taken directly; otherwise the provider (with fallback chain) decides
    when an API key for the configured provider is present, and the heuristic
    decides when it is not.

    Args:
        provider: Provider override; resolved by ``_resolve_provider``.
        model_name: Model override; resolved by ``_resolve_provider``.

    Returns:
        A ``BaselineRunResult`` with per-task results, provider call counters,
        error-label counts, the run ``mode`` and the average score (``0.0``
        when there are no tasks).

    Raises:
        RuntimeError: If strict LLM mode is enabled and an attempted provider
            decision failed.
    """
    config = _resolve_provider(provider, model_name)

    key_env_by_provider = {
        "openai": "OPENAI_API_KEY",
        "openrouter": "OPENROUTER_API_KEY",
        "groq": "GROQ_API_KEY",
        "anthropic": "ANTHROPIC_API_KEY",
        "google": "GOOGLE_API_KEY",
    }
    key_env = key_env_by_provider.get(config.provider)
    has_provider_key = bool(key_env and os.getenv(key_env))

    provider_calls_attempted = 0
    provider_calls_succeeded = 0
    provider_errors: dict[str, int] = {}
    task_results: list[BaselineTaskResult] = []

    for task in list_tasks():
        env = ChargebackOpsEnvironment()
        observation = env.reset(task_id=task.task_id)
        while not observation.done:
            observation_payload = observation.model_dump()
            candidates = candidate_actions(observation_payload)
            if not candidates:
                break

            # Covers the single-candidate case as well as other housekeeping
            # steps that need no model call.
            candidate = _obvious_next_action(observation_payload, candidates)
            if candidate is None:
                if has_provider_key:
                    candidate, attempted, succeeded, error_label = (
                        _provider_pick_with_fallback(
                            config,
                            observation_payload,
                            candidates,
                        )
                    )
                    provider_calls_attempted += int(attempted)
                    provider_calls_succeeded += int(succeeded)
                    if attempted and not succeeded and error_label is not None:
                        provider_errors[error_label] = (
                            provider_errors.get(error_label, 0) + 1
                        )
                    if _strict_llm_mode() and attempted and not succeeded:
                        raise RuntimeError(
                            "STRICT_LLM_MODE is enabled and the provider decision failed, "
                            "so heuristic fallback is not allowed."
                        )
                else:
                    candidate = _heuristic_pick(candidates)
            observation = env.step(candidate.action)

        # Prefer the environment's own report; grade explicitly if it is absent.
        report = env.state.grader_report or grade_episode(
            task,
            env._progress_by_case,  # type: ignore[attr-defined]
            env.state.step_count,
            env.state.episode_id or "",
            completed=env.state.completed,
        )
        task_results.append(
            BaselineTaskResult(
                task_id=task.task_id,
                title=task.title,
                score=report.normalized_score,
                steps_used=env.state.step_count,
                final_status=report.summary,
            )
        )

    # Guard against an empty task list instead of dividing by zero.
    if task_results:
        average_score = round(
            sum(task_result.score for task_result in task_results)
            / len(task_results),
            4,
        )
    else:
        average_score = 0.0

    # Zero successes (which includes zero attempts) means every decision came
    # from the heuristic.
    if provider_calls_succeeded == 0:
        mode = "heuristic_fallback"
    elif provider_calls_succeeded < provider_calls_attempted:
        mode = f"{config.provider}_with_fallback"
    else:
        mode = config.provider

    return BaselineRunResult(
        provider=config.provider,
        model_name=config.model_name,
        mode=mode,
        provider_calls_attempted=provider_calls_attempted,
        provider_calls_succeeded=provider_calls_succeeded,
        provider_errors=provider_errors,
        task_results=task_results,
        average_score=average_score,
    )
def main() -> None:
    """CLI entry point: run the baseline and print the result as indented JSON."""
    result = run_baseline()
    rendered = json.dumps(result.model_dump(), indent=2)
    print(rendered)
# Run the CLI only when this file is executed directly, not when imported.
if __name__ == "__main__":  # pragma: no cover
    main()