Spaces:

mitudrudutta
/

ChargeBackOps

Sleeping

App Files Files Community

ChargeBackOps / runners /baseline_runner.py

mitudrudutta

feat: Implement wait_for_updates action for handling delayed cases and evidence

2dedffd about 2 months ago

raw

history blame contribute delete

62.5 kB

	"""Baseline runner for ChargebackOps."""

	from __future__ import annotations

	import json
	import os
	import time
	from dataclasses import dataclass
	from typing import Any

	from openai import OpenAI
	from pydantic import BaseModel, Field

	try:
	from ..evaluation.grading import grade_episode
	from ..core.models import BaselineRunResult, BaselineTaskResult, ChargebackOpsAction
	from ..server.chargeback_ops_environment import ChargebackOpsEnvironment
	from ..scenarios.simulation import list_tasks
	except ImportError: # pragma: no cover
	from evaluation.grading import grade_episode
	from core.models import BaselineRunResult, BaselineTaskResult, ChargebackOpsAction
	from server.chargeback_ops_environment import ChargebackOpsEnvironment
	from scenarios.simulation import list_tasks

	try: # pragma: no cover
	from dotenv import load_dotenv
	except ImportError: # pragma: no cover
	load_dotenv = None

	if load_dotenv is not None: # pragma: no cover
	load_dotenv()

	DEFAULT_PROVIDER = "openrouter"
	MAX_LLM_CANDIDATES = 4
	MAX_PROVIDER_RESPONSE_TOKENS = 200
	DEFAULT_MODELS = {
	"openrouter": "openai/gpt-oss-120b",
	"groq": "llama-3.3-70b-versatile",
	"openai": "gpt-4.1-mini",
	"anthropic": "claude-sonnet-4-20250514",
	"google": "gemini-2.5-flash",
	}
	# Ordered fallback: try each until one succeeds.
	_FALLBACK_CHAIN: list[tuple[str, str]] = [
	("openrouter", "openai/gpt-oss-120b"),
	("google", "gemini-2.5-flash"),
	("groq", "llama-3.3-70b-versatile"),
	]


	def _provider_timeout_seconds() -> float:
	raw_value = os.getenv("BASELINE_REQUEST_TIMEOUT_SECONDS", "15")
	try:
	return max(1.0, float(raw_value))
	except ValueError:
	return 4.0


	def _provider_retry_attempts() -> int:
	raw_value = os.getenv("PROVIDER_RATE_LIMIT_RETRIES", "2")
	try:
	return max(0, int(raw_value))
	except ValueError:
	return 0


	def _provider_retry_backoff_seconds() -> float:
	raw_value = os.getenv("PROVIDER_RETRY_BACKOFF_SECONDS", "1.0")
	try:
	return max(0.1, float(raw_value))
	except ValueError:
	return 0.5


	def _strict_llm_mode() -> bool:
	return os.getenv("STRICT_LLM_MODE", "").strip().lower() in {
	"1",
	"true",
	"yes",
	"on",
	}


	def _should_retry_provider_error(exc: Exception) -> bool:
	return exc.__class__.__name__ in {
	"RateLimitError",
	"APITimeoutError",
	"APIConnectionError",
	"InternalServerError",
	}


	def _chat_completion_with_retry(client: OpenAI, **kwargs):
	last_exc: Exception \| None = None
	max_attempts = 1 + _provider_retry_attempts()
	backoff = _provider_retry_backoff_seconds()
	for attempt in range(max_attempts):
	try:
	return client.chat.completions.create(**kwargs)
	except Exception as exc:
	last_exc = exc
	if attempt >= max_attempts - 1 or not _should_retry_provider_error(exc):
	raise
	time.sleep(backoff * (attempt + 1))
	if last_exc is not None:
	raise last_exc
	raise RuntimeError("Provider completion failed without raising an exception.")


	class CandidateChoice(BaseModel):
	"""Structured choice returned by an LLM provider."""

	candidate_index: int = Field(ge=0)
	rationale: str


	@dataclass
	class CandidateAction:
	"""One valid candidate action for the baseline policy."""

	action: ChargebackOpsAction
	summary: str


	@dataclass(frozen=True)
	class ProviderConfig:
	"""Resolved provider configuration."""

	provider: str
	model_name: str


	def _best_open_case(queue: list[dict[str, Any]]) -> dict[str, Any] \| None:
	open_cases = [case for case in queue if case["status"] == "open"]
	if not open_cases:
	return None
	return sorted(
	open_cases,
	key=lambda item: (item["steps_until_deadline"], -item["amount"]),
	)[0]


	_NOTE_TEMPLATES: dict[str, str] = {
	"goods_not_received": (
	"Order confirmation and carrier delivery confirmation establish fulfillment. "
	"The shipment was delivered to the customer address on file."
	),
	"fraud_cnp": (
	"Prior good order linkage and customer account confirmation tie the cardholder "
	"to the transaction. Risk analysis and support records confirm legitimacy."
	),
	"product_not_as_described": (
	"Product listing verification confirms the item matches the description. "
	"Return policy documentation shows the customer bypassed the return process."
	),
	"service_not_provided": (
	"Service completion record and customer acknowledgment confirm the service "
	"was delivered as agreed. Booking confirmation and delivery records attached."
	),
	"credit_not_processed": (
	"Refund record and payment confirmation document the credit processing timeline. "
	"Transaction records confirm the refund was issued per policy."
	),
	"duplicate_processing": (
	"Payment records confirm duplicate charge identification. "
	"Refund documentation attached to support resolution."
	),
	}


	def _build_representment_note(visible_case: dict[str, Any]) -> str:
	"""Generate a representment note summarizing the dispute contest rationale."""
	reason = visible_case.get("reason_code", "")
	base = _NOTE_TEMPLATES.get(
	reason, f"Contesting {reason.replace('_', ' ')} dispute with attached evidence."
	)

	# Inject policy requirement keywords directly for claims coverage scoring.
	policy = visible_case.get("policy")
	if policy:
	requirements = policy.get("requirements", [])
	if requirements:
	base += " Evidence covers: " + ", ".join(requirements) + "."
	guidance = policy.get("guidance", "")
	if guidance and "contest" in guidance.lower():
	# Extract requirement phrases from guidance text.
	for word in guidance.split():
	clean = word.strip(".,;:").lower()
	if len(clean) > 4 and clean not in base.lower():
	pass # Already covered by requirements list

	# Reference evidence IDs directly for coherence scoring.
	attached = visible_case.get("attached_evidence", [])
	if attached:
	eids = [e["evidence_id"] for e in attached if not _is_harmful_evidence(e)]
	if eids:
	base += " Supporting evidence: " + ", ".join(eids) + "."

	return base[:500]


	def _visible_case_deadline(queue: list[dict[str, Any]], case_id: str) -> int:
	for case in queue:
	if case["case_id"] == case_id:
	return case["steps_until_deadline"]
	return 999


	_NEGATIVE_SIGNAL_KEYWORDS = {
	"mismatch",
	"failed",
	"declined",
	"suspicious",
	"flagged",
	"fraud risk",
	"unauthorized",
	"rejected",
	"invalid",
	"expired",
	"violation",
	"non-compliant",
	"discrepancy",
	"inconsistent",
	"unverified",
	}


	def _is_harmful_evidence(item: dict[str, Any]) -> bool:
	"""Conservative heuristic: flag evidence with negative-signal language."""
	text = (item.get("title", "") + " " + item.get("summary", "")).lower()
	return any(kw in text for kw in _NEGATIVE_SIGNAL_KEYWORDS)


	def _rank_attachable(item: dict[str, Any]) -> int:
	text = (item["title"] + " " + item["summary"]).lower()
	if any(kw in text for kw in _NEGATIVE_SIGNAL_KEYWORDS):
	return 999
	if "signature" in text:
	return 0
	if "completion" in text or "booking" in text:
	return 0
	if "listing" in text:
	return 0
	if "duplicate" in text:
	return 1
	if "delivery" in text:
	return 1
	if "prior" in text or "account" in text or "authenticated" in text:
	return 1
	if "return policy" in text or "refund" in text or "cancel" in text:
	return 2
	if "confirmation" in text:
	return 2
	if "cancellation" in text:
	return 2
	return 4


	def _batch_attachable_ids(
	retrieved_items: list[dict[str, Any]], attached_ids: set[str]
	) -> list[str]:
	filtered = [
	item
	for item in retrieved_items
	if item["evidence_id"] not in attached_ids and _rank_attachable(item) < 999
	]
	filtered.sort(key=_rank_attachable)
	return [item["evidence_id"] for item in filtered]


	def candidate_actions(observation: dict[str, Any]) -> list[CandidateAction]:
	"""Build a prioritized candidate set from the current observation."""

	queue = observation["queue"]
	visible_case = observation.get("visible_case")
	open_cases = [case for case in queue if case["status"] == "open"]
	candidates: list[CandidateAction] = []

	if not open_cases and "wait_for_updates" in observation.get("available_actions", []):
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(action_type="wait_for_updates"),
	summary="Wait for delayed issuer reviews, delayed evidence, or future case arrivals.",
	)
	)
	return candidates

	# Step cost estimates per reason code (select_case + full workflow).
	_FAST_REASON_CODES = {
	"goods_not_received",
	"credit_not_processed",
	"duplicate_processing",
	}
	_STEP_COST_ESTIMATE = {
	"goods_not_received": 6, # select + 2 queries + attach + strategy + submit
	"credit_not_processed": 3, # select + strategy + resolve
	"duplicate_processing": 3, # select + strategy + resolve
	"fraud_cnp": 8, # select + policy + 2-3 queries + attach + strategy + submit
	"product_not_as_described": 8, # select + policy + 2-3 queries + attach + strategy + submit
	"service_not_provided": 7, # select + policy + 2 queries + attach + strategy + submit
	}

	def _case_priority(item):
	return (
	item["steps_until_deadline"],
	0 if item["reason_code"] in _FAST_REASON_CODES else 1,
	-item["amount"],
	)

	if visible_case is None:
	steps_remaining = observation.get("steps_remaining", 999)
	# Smart triage: if total estimated cost > budget, fast-concede the cheapest-to-lose cases first.
	if len(open_cases) > 1:
	total_cost = sum(
	_STEP_COST_ESTIMATE.get(c["reason_code"], 7) for c in open_cases
	)
	if total_cost > steps_remaining:
	# Budget can't fit all cases. Strategy:
	# 1. Handle deterministic-strategy cases first (cheapest, guaranteed outcome).
	# 2. Then prioritize highest-amount cases with tightest deadlines.
	# 3. Cases that can't fit get auto-conceded by the per-case budget check.
	def _triage_key(c):
	is_fast = c["reason_code"] in _FAST_REASON_CODES
	# Fast cases go first (tier 0), then by amount descending (highest value first).
	return (0 if is_fast else 1, -c["amount"])

	ordered = sorted(open_cases, key=_triage_key)
	for case in ordered:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="select_case", case_id=case["case_id"]
	),
	summary=(
	f"Select case {case['case_id']} ({case['reason_code']}, amount ${case['amount']}, "
	f"deadline in {case['steps_until_deadline']} steps)."
	),
	)
	)
	return candidates

	for case in sorted(open_cases, key=_case_priority):
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="select_case", case_id=case["case_id"]
	),
	summary=(
	f"Select case {case['case_id']} ({case['reason_code']}, amount ${case['amount']}, "
	f"deadline in {case['steps_until_deadline']} steps)."
	),
	)
	)
	return candidates

	case_id = visible_case["case_id"]
	if visible_case["status"] != "open":
	for case in sorted(open_cases, key=_case_priority):
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="select_case", case_id=case["case_id"]
	),
	summary=(
	f"Switch to open case {case['case_id']} (deadline in {case['steps_until_deadline']} steps, "
	f"amount ${case['amount']})."
	),
	)
	)
	if not candidates and "wait_for_updates" in observation.get("available_actions", []):
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(action_type="wait_for_updates"),
	summary="Wait because selected case is blocked and no open case is currently available.",
	)
	)
	return candidates

	# Round 2 (pre-arbitration). Issuer rejected the round-1 packet and is
	# asking for compelling evidence. Three legal moves: respond_to_pre_arb,
	# escalate_to_arbitration, accept_arbitration_loss.
	available = set(observation.get("available_actions", []))
	if "respond_to_pre_arb" in available:
	retrieved_items_r2 = visible_case.get("retrieved_evidence", [])
	attached_ids_r2 = {
	item["evidence_id"] for item in visible_case.get("attached_evidence", [])
	}
	compelling_ids = [
	item["evidence_id"]
	for item in retrieved_items_r2
	if item["evidence_id"] not in attached_ids_r2
	and not _is_harmful_evidence(item)
	]
	compelling_ids = sorted(
	compelling_ids,
	key=lambda eid: _rank_attachable(
	next(
	item
	for item in retrieved_items_r2
	if item["evidence_id"] == eid
	)
	),
	)[:2]
	if compelling_ids:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="respond_to_pre_arb",
	case_id=case_id,
	compelling_evidence_ids=compelling_ids,
	note=_build_representment_note(visible_case),
	),
	summary=(
	f"Respond to pre-arbitration with compelling evidence "
	f"{', '.join(compelling_ids)} for case {case_id}."
	),
	)
	)
	return candidates
	# No retrieved compelling evidence left. Try querying an unrevealed
	# merchant system before giving up — round-2 budget often allows it
	# and one extra +0.15 pre_arb piece can clear the 0.60 acceptance bar.
	# Order matters: support/risk/refunds tend to hold compelling pieces;
	# payment is mostly auth records and harmful AVS/CVV mismatches.
	revealed = set(visible_case.get("systems_revealed", []))
	all_systems = ("support", "risk", "refunds", "shipping", "orders", "payment")
	unrevealed = [s for s in all_systems if s not in revealed]
	if unrevealed and "query_system" in available:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="query_system",
	case_id=case_id,
	system_name=unrevealed[0],
	),
	summary=(
	f"Query {unrevealed[0]} for compelling evidence "
	f"on case {case_id} before deciding to escalate."
	),
	)
	)
	return candidates
	# No compelling evidence anywhere. Decide on ROI: arbitration costs
	# $250/side. Use the EV rule: escalate iff p_win * amount > arb_fee.
	# Round-2 arbitration score is typically in the ambiguity band
	# (P~0.5), so escalate when amount > 2 * 250 = 500.
	amount = float(visible_case.get("amount", 0.0))
	if amount >= 500.0 and "escalate_to_arbitration" in available:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="escalate_to_arbitration",
	case_id=case_id,
	),
	summary=(
	f"Escalate case {case_id} to arbitration "
	f"(amount ${amount:.0f} clears the EV break-even)."
	),
	)
	)
	return candidates
	if "accept_arbitration_loss" in available:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="accept_arbitration_loss",
	case_id=case_id,
	),
	summary=(
	f"Accept arbitration loss on case {case_id} — no "
	f"compelling evidence and amount below ROI cutoff."
	),
	)
	)
	return candidates

	current_deadline = _visible_case_deadline(queue, case_id)
	best_other = _best_open_case(
	[case for case in open_cases if case["case_id"] != case_id]
	)
	# Only switch to an urgent other case if the current case isn't close to completion.
	# "Close" means: strategy is set and evidence attached (1 step to submit),
	# OR evidence is attached and strategy just needs to be set (2 steps to finish).
	_has_attached = len(visible_case.get("attached_evidence", [])) >= 1
	current_near_completion = (
	visible_case.get("current_strategy") == "contest" and _has_attached
	) or (
	_has_attached
	and visible_case.get("current_strategy") is None
	and current_deadline >= 2
	)
	if (
	best_other is not None
	and best_other["steps_until_deadline"] <= 1
	and current_deadline > 1
	and not current_near_completion
	):
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="select_case", case_id=best_other["case_id"]
	),
	summary=(
	f"Switch to case {best_other['case_id']} immediately because its deadline is in "
	f"{best_other['steps_until_deadline']} steps."
	),
	)
	)

	reason_code = visible_case["reason_code"]

	# Reason codes with deterministic strategies — no need to retrieve policy.
	# Only codes where the optimal strategy NEVER varies across generated/ISO cases.
	# fraud_cnp, product_not_as_described, service_not_provided all vary.
	_DETERMINISTIC_STRATEGY: dict[str, str] = {
	"goods_not_received": "contest",
	"credit_not_processed": "issue_refund",
	"duplicate_processing": "issue_refund",
	}

	steps_remaining = observation.get("steps_remaining", 999)
	budget_per_case = steps_remaining / max(len(open_cases), 1)

	policy = visible_case.get("policy")
	if policy is None:
	if reason_code in _DETERMINISTIC_STRATEGY:
	inferred_strategy = _DETERMINISTIC_STRATEGY[reason_code]
	else:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="retrieve_policy", case_id=case_id
	),
	summary="Retrieve the chargeback policy for the selected reason code.",
	)
	)
	inferred_strategy = None
	else:
	guidance_text = policy.get("guidance", "").lower()
	if (
	"do not contest" in guidance_text
	or "concede" in guidance_text
	or "not supportable" in guidance_text
	):
	inferred_strategy = "accept_chargeback"
	elif (
	"refund immediately" in guidance_text
	or "refund" in guidance_text
	and "contest" not in guidance_text
	):
	inferred_strategy = "issue_refund"
	else:
	inferred_strategy = "contest"
	# How many steps remain before this case's deadline.
	# After querying, we still need: attach(1) + set_strategy(1) + submit/resolve(1) = 3 steps.
	# If policy isn't retrieved yet, add 1 for retrieve_policy.
	_FIXED_COST = 3 # attach + strategy + submit
	steps_to_deadline = current_deadline # steps_until_deadline from the queue
	policy_cost = 0 if visible_case.get("policy") is not None else 1
	max_queries_before_deadline = max(0, steps_to_deadline - _FIXED_COST - policy_cost)

	systems_revealed = set(visible_case.get("systems_revealed", []))
	current_strategy = visible_case.get("current_strategy")
	retrieved_items = visible_case.get("retrieved_evidence", [])
	attached_evidence = visible_case.get("attached_evidence", [])
	attached_ids = {item["evidence_id"] for item in attached_evidence}
	attachable_ids = _batch_attachable_ids(retrieved_items, attached_ids)

	# Detect harmful evidence already attached — must remove before submit.
	harmful_attached_ids = [
	item["evidence_id"] for item in attached_evidence if _is_harmful_evidence(item)
	]

	# ── HARMFUL CLEANUP: if harmful evidence is attached, remove it immediately ──
	if harmful_attached_ids:
	candidates.insert(
	0,
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="remove_evidence",
	case_id=case_id,
	evidence_ids=harmful_attached_ids,
	),
	summary=f"Remove harmful evidence {', '.join(harmful_attached_ids)} before submission.",
	),
	)
	return candidates

	# ── DEADLINE URGENCY: if near deadline and we have evidence, submit/resolve NOW ──
	if current_deadline <= 1:
	if (
	current_strategy is not None
	and len(attached_ids) >= 1
	and current_strategy == "contest"
	):
	candidates.insert(
	0,
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="submit_representment",
	case_id=case_id,
	note=_build_representment_note(visible_case),
	),
	summary=f"URGENT: Submit representment for {case_id} — deadline imminent.",
	),
	)
	return candidates
	if current_strategy in {"accept_chargeback", "issue_refund"}:
	candidates.insert(
	0,
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="resolve_case",
	case_id=case_id,
	strategy=current_strategy,
	),
	summary=f"URGENT: Resolve {case_id} with {current_strategy} — deadline imminent.",
	),
	)
	return candidates

	# ── TIGHT BUDGET: fast-concede if not enough steps to contest this case ──
	# Full contest costs ~7 steps (policy + 2-3 queries + attach + strategy + submit).
	# Fast-concede when:
	# (a) Not enough global steps remaining, OR
	# (b) Multi-case scenario where this case is lower-value and budget can't fit all.
	_est_cost = (
	_STEP_COST_ESTIMATE.get(reason_code, 7) - 1
	) # subtract select_case already done
	# Minimum contest: policy(1) + query(1) + attach(1) + strategy(1) + submit(1) = 5 steps.
	_MIN_CONTEST_STEPS = 5
	_should_fast_concede = False
	if (
	policy is None
	and reason_code not in _DETERMINISTIC_STRATEGY
	and current_strategy is None
	and not systems_revealed
	):
	if (
	steps_remaining < _MIN_CONTEST_STEPS
	or current_deadline < _MIN_CONTEST_STEPS
	):
	# Not enough steps or deadline to even minimally contest.
	_should_fast_concede = True
	elif len(open_cases) > 1:
	# Multi-case triage: concede if total cost > budget and this case is lowest-value.
	total_cost = sum(
	_STEP_COST_ESTIMATE.get(c["reason_code"], 7) for c in open_cases
	)
	if total_cost > steps_remaining:
	lowest_amount = min(c["amount"] for c in open_cases)
	this_amount = next(
	c["amount"] for c in open_cases if c["case_id"] == case_id
	)
	if this_amount <= lowest_amount:
	_should_fast_concede = True
	if _should_fast_concede:
	fallback = "issue_refund"
	candidates.insert(
	0,
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="resolve_case",
	case_id=case_id,
	strategy=fallback,
	),
	summary=f"Budget too tight to contest — fast-resolve {case_id} with {fallback}.",
	),
	)
	return candidates

	# ── BUDGET PRESSURE: if more open cases than steps, fast-resolve concedable ──
	if steps_remaining <= len(open_cases) * 2 and inferred_strategy in {
	"accept_chargeback",
	"issue_refund",
	}:
	target_strat = inferred_strategy
	if current_strategy != target_strat:
	candidates.insert(
	0,
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="set_strategy",
	case_id=case_id,
	strategy=target_strat,
	),
	summary=f"Fast-set strategy to {target_strat} under budget pressure.",
	),
	)
	return candidates
	candidates.insert(
	0,
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="resolve_case",
	case_id=case_id,
	strategy=target_strat,
	),
	summary=f"Fast-resolve {case_id} with {target_strat} under budget pressure.",
	),
	)
	return candidates

	if reason_code == "goods_not_received":
	for system_name in ["orders", "shipping"]:
	if system_name not in systems_revealed:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="query_system",
	case_id=case_id,
	system_name=system_name,
	),
	summary=f"Query the {system_name} system for evidence on case {case_id}.",
	)
	)
	if attachable_ids:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="add_evidence",
	case_id=case_id,
	evidence_ids=attachable_ids,
	),
	summary=f"Attach the strongest delivery evidence for case {case_id}.",
	)
	)
	if current_strategy != "contest":
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="set_strategy",
	case_id=case_id,
	strategy="contest",
	),
	summary="Set the strategy to contest the dispute.",
	)
	)
	if len(attached_ids) >= 2:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="submit_representment",
	case_id=case_id,
	note=_build_representment_note(visible_case),
	),
	summary="Submit the current representment package.",
	)
	)

	elif reason_code == "fraud_cnp":
	should_contest = inferred_strategy == "contest"
	if should_contest:
	# Under tight budgets or deadline pressure, skip optional 'orders' query.
	fraud_systems = ["risk", "support", "orders"]
	unrevealed_fraud = [s for s in fraud_systems if s not in systems_revealed]
	if (
	len(unrevealed_fraud) > max_queries_before_deadline
	or budget_per_case < 7
	):
	fraud_systems = ["risk", "support"]
	unrevealed_fraud = [
	s for s in fraud_systems if s not in systems_revealed
	]
	for system_name in unrevealed_fraud:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="query_system",
	case_id=case_id,
	system_name=system_name,
	),
	summary=f"Query the {system_name} system for evidence on case {case_id}.",
	)
	)
	if attachable_ids:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="add_evidence",
	case_id=case_id,
	evidence_ids=attachable_ids,
	),
	summary=f"Attach the strongest account-linkage evidence for case {case_id}.",
	)
	)
	if current_strategy != "contest":
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="set_strategy",
	case_id=case_id,
	strategy="contest",
	),
	summary="Set the strategy to contest the dispute.",
	)
	)
	if len(attached_ids) >= 2:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="submit_representment",
	case_id=case_id,
	note=_build_representment_note(visible_case),
	),
	summary="Submit the current representment package.",
	)
	)
	if current_strategy != "accept_chargeback":
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="set_strategy",
	case_id=case_id,
	strategy="accept_chargeback",
	),
	summary="Set the strategy to accept the chargeback.",
	)
	)
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="resolve_case",
	case_id=case_id,
	strategy="accept_chargeback",
	),
	summary="Concede the dispute and accept the chargeback.",
	)
	)

	elif reason_code in {"credit_not_processed", "duplicate_processing"}:
	# Fast-path: set strategy and resolve immediately — don't waste steps querying
	if current_strategy != "issue_refund":
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="set_strategy",
	case_id=case_id,
	strategy="issue_refund",
	),
	summary="Set the strategy to issue a refund immediately.",
	)
	)
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="resolve_case",
	case_id=case_id,
	strategy="issue_refund",
	),
	summary="Resolve the case by issuing a refund.",
	)
	)
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="resolve_case",
	case_id=case_id,
	strategy="accept_chargeback",
	),
	summary="Accept the chargeback as a fallback resolution.",
	)
	)

	elif reason_code == "product_not_as_described":
	if inferred_strategy in {"accept_chargeback", "issue_refund"}:
	# Guidance says concede — fast-path
	target = inferred_strategy
	if current_strategy != target:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="set_strategy", case_id=case_id, strategy=target
	),
	summary=f"Set strategy to {target} — listing defense not supportable.",
	)
	)
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="resolve_case", case_id=case_id, strategy=target
	),
	summary=f"Resolve with {target} — conceding per policy guidance.",
	)
	)
	else:
	# Under deadline pressure, skip shipping (least critical for this reason code).
	pna_systems = ["orders", "support", "shipping"]
	unrevealed = [s for s in pna_systems if s not in systems_revealed]
	if len(unrevealed) > max_queries_before_deadline:
	pna_systems = ["orders", "support"] # Drop shipping
	unrevealed = [s for s in pna_systems if s not in systems_revealed]
	for system_name in unrevealed:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="query_system",
	case_id=case_id,
	system_name=system_name,
	),
	summary=f"Query the {system_name} system for listing and return-process evidence on case {case_id}.",
	)
	)
	if attachable_ids:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="add_evidence",
	case_id=case_id,
	evidence_ids=attachable_ids,
	),
	summary=f"Attach listing accuracy and return-policy evidence for case {case_id}.",
	)
	)
	if current_strategy != "contest":
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="set_strategy",
	case_id=case_id,
	strategy="contest",
	),
	summary="Set the strategy to contest the dispute.",
	)
	)
	if len(attached_ids) >= 2:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="submit_representment",
	case_id=case_id,
	note=_build_representment_note(visible_case),
	),
	summary="Submit the current representment package.",
	)
	)
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="resolve_case",
	case_id=case_id,
	strategy="issue_refund",
	),
	summary="Issue a refund as a fallback if the listing defense is not supportable.",
	)
	)

	elif reason_code == "service_not_provided":
	if inferred_strategy in {"accept_chargeback", "issue_refund"}:
	target = inferred_strategy
	if current_strategy != target:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="set_strategy", case_id=case_id, strategy=target
	),
	summary=f"Set strategy to {target} — service defense not supportable.",
	)
	)
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="resolve_case", case_id=case_id, strategy=target
	),
	summary=f"Resolve with {target} — conceding per policy guidance.",
	)
	)
	else:
	snp_systems = ["orders", "support"]
	unrevealed_snp = [s for s in snp_systems if s not in systems_revealed]
	if len(unrevealed_snp) > max_queries_before_deadline:
	snp_systems = [
	"support"
	] # Support is most critical for service disputes.
	unrevealed_snp = [s for s in snp_systems if s not in systems_revealed]
	for system_name in unrevealed_snp:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="query_system",
	case_id=case_id,
	system_name=system_name,
	),
	summary=f"Query the {system_name} system for booking and completion evidence on case {case_id}.",
	)
	)
	if attachable_ids:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="add_evidence",
	case_id=case_id,
	evidence_ids=attachable_ids,
	),
	summary=f"Attach booking and completion evidence for case {case_id}.",
	)
	)
	if current_strategy != "contest":
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="set_strategy",
	case_id=case_id,
	strategy="contest",
	),
	summary="Set the strategy to contest the dispute.",
	)
	)
	if len(attached_ids) >= 2:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="submit_representment",
	case_id=case_id,
	note=_build_representment_note(visible_case),
	),
	summary="Submit the current representment package.",
	)
	)
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="resolve_case",
	case_id=case_id,
	strategy="issue_refund",
	),
	summary="Issue a refund as a fallback if the service-delivery defense is weak.",
	)
	)

	elif inferred_strategy in {"accept_chargeback", "issue_refund"}:
	for system_name in ["support", "refunds", "payment"]:
	if system_name not in systems_revealed:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="query_system",
	case_id=case_id,
	system_name=system_name,
	),
	summary=f"Query the {system_name} system for concession evidence on case {case_id}.",
	)
	)
	if current_strategy != inferred_strategy:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="set_strategy",
	case_id=case_id,
	strategy=inferred_strategy,
	),
	summary=f"Set the strategy to {inferred_strategy}.",
	)
	)
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="resolve_case",
	case_id=case_id,
	strategy=inferred_strategy,
	),
	summary=f"Resolve the case with strategy {inferred_strategy}.",
	)
	)

	else:
	for system_name in ["orders", "support", "shipping", "risk"]:
	if system_name not in systems_revealed:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="query_system",
	case_id=case_id,
	system_name=system_name,
	),
	summary=f"Query the {system_name} system for additional evidence on case {case_id}.",
	)
	)
	if attachable_ids:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="add_evidence",
	case_id=case_id,
	evidence_ids=attachable_ids,
	),
	summary=f"Attach the strongest currently available evidence for case {case_id}.",
	)
	)
	if current_strategy != "contest":
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="set_strategy",
	case_id=case_id,
	strategy="contest",
	),
	summary="Set the strategy to contest the dispute.",
	)
	)
	if len(attached_ids) >= 1:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="submit_representment",
	case_id=case_id,
	note=_build_representment_note(visible_case),
	),
	summary="Submit the current representment package.",
	)
	)

	if (
	visible_case.get("inspection_notes") is None
	and observation["steps_remaining"] > 3
	):
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(action_type="inspect_case", case_id=case_id),
	summary="Inspect the selected case to reveal merchant notes.",
	)
	)

	for case in sorted(
	open_cases, key=lambda item: (item["steps_until_deadline"], -item["amount"])
	):
	if case["case_id"] != case_id:
	candidates.append(
	CandidateAction(
	action=ChargebackOpsAction(
	action_type="select_case", case_id=case["case_id"]
	),
	summary=(
	f"Switch to case {case['case_id']} (deadline in {case['steps_until_deadline']} steps, "
	f"amount ${case['amount']})."
	),
	)
	)

	return candidates


	def _heuristic_pick(candidates: list[CandidateAction]) -> CandidateAction:
	return candidates[0]


	def _obvious_next_action(
	observation: dict[str, Any],
	candidates: list[CandidateAction],
	) -> CandidateAction \| None:
	"""Skip provider calls for deterministic housekeeping actions.

	This preserves live model decisions for genuine branching states while keeping
	baseline/inference runtime inside hackathon-friendly bounds.
	"""

	if not candidates:
	return None

	# Single candidate = no decision to make.
	if len(candidates) == 1:
	return candidates[0]

	first = candidates[0]
	visible_case = observation.get("visible_case")
	queue = observation["queue"]

	if visible_case is None:
	open_cases = [case for case in queue if case["status"] == "open"]
	if len(open_cases) == 1:
	return first
	urgent_cases = [
	case for case in open_cases if case["steps_until_deadline"] <= 1
	]
	if (
	len(urgent_cases) == 1
	and first.action.action_type == "select_case"
	and first.action.case_id == urgent_cases[0]["case_id"]
	):
	return first
	return None

	if visible_case["status"] != "open":
	return first if first.action.action_type == "select_case" else None

	# Strategy selection: the heuristic already derives the optimal strategy
	# from policy + retrieved evidence. The LLM has no additional signal that
	# improves this specific call — invoking it here has only caused regressions
	# on fraud_signal_ambiguity and generated_medium_s99 where the model picks
	# a concede-style strategy over the correct contest.
	if first.action.action_type == "set_strategy":
	return first

	if first.action.action_type in {
	"retrieve_policy",
	"add_evidence",
	"remove_evidence",
	"submit_representment",
	"resolve_case",
	}:
	return first

	if first.action.action_type == "query_system":
	current_strategy = visible_case.get("current_strategy")
	if visible_case.get("policy") is None or current_strategy in {None, "contest"}:
	return first

	if first.action.action_type == "select_case":
	current_case_id = visible_case["case_id"]
	current_deadline = next(
	(
	case["steps_until_deadline"]
	for case in queue
	if case["case_id"] == current_case_id
	),
	999,
	)
	target_deadline = next(
	(
	case["steps_until_deadline"]
	for case in queue
	if case["case_id"] == first.action.case_id
	),
	999,
	)
	if target_deadline < current_deadline:
	return first

	return None


	def _safe_json_loads(text: str) -> CandidateChoice \| None:
	try:
	return CandidateChoice.model_validate_json(text)
	except Exception:
	start = text.find("{")
	end = text.rfind("}")
	if start == -1 or end == -1 or end <= start:
	return None
	try:
	return CandidateChoice.model_validate_json(text[start : end + 1])
	except Exception:
	return None


	def _compact_queue_item(case: dict[str, Any]) -> dict[str, Any]:
	return {
	"case_id": case["case_id"],
	"reason_code": case["reason_code"],
	"amount": case["amount"],
	"status": case["status"],
	"steps_until_deadline": case["steps_until_deadline"],
	}


	def _compact_visible_case(visible_case: dict[str, Any] \| None) -> dict[str, Any] \| None:
	if visible_case is None:
	return None
	return {
	"case_id": visible_case["case_id"],
	"reason_code": visible_case["reason_code"],
	"current_strategy": visible_case.get("current_strategy"),
	"systems_revealed": visible_case.get("systems_revealed", []),
	"attached_evidence": [
	item["title"] for item in visible_case.get("attached_evidence", [])[:4]
	],
	"retrieved_evidence": [
	item["title"] for item in visible_case.get("retrieved_evidence", [])[:6]
	],
	"policy": (
	{
	"guidance": visible_case["policy"]["guidance"],
	"required_evidence": visible_case["policy"]["required_evidence"],
	}
	if visible_case.get("policy")
	else None
	),
	"submission_status": visible_case.get("submission_status"),
	}


	def _provider_payload(
	observation: dict[str, Any],
	candidates: list[CandidateAction],
	) -> tuple[list[CandidateAction], str]:
	shortlist = candidates[: min(MAX_LLM_CANDIDATES, len(candidates))]
	payload = json.dumps(
	{
	"task_id": observation["task_id"],
	"steps_remaining": observation["steps_remaining"],
	"selected_case_id": observation.get("selected_case_id"),
	"queue": [_compact_queue_item(case) for case in observation["queue"]],
	"visible_case": _compact_visible_case(observation.get("visible_case")),
	"candidates": [
	{"index": idx, "summary": candidate.summary}
	for idx, candidate in enumerate(shortlist)
	],
	},
	separators=(",", ":"),
	)
	return shortlist, payload


	def _resolve_provider(
	provider: str \| None,
	model_name: str \| None,
	) -> ProviderConfig:
	chosen_provider = (
	provider or os.getenv("BASELINE_PROVIDER") or DEFAULT_PROVIDER
	).lower()
	chosen_model = (
	model_name
	or os.getenv("BASELINE_MODEL")
	or DEFAULT_MODELS.get(
	chosen_provider,
	"openai/gpt-oss-120b",
	)
	)
	return ProviderConfig(provider=chosen_provider, model_name=chosen_model)


	def _openai_compatible_client(config: ProviderConfig) -> OpenAI \| None:
	timeout_seconds = _provider_timeout_seconds()
	if config.provider == "openai":
	api_key = os.getenv("OPENAI_API_KEY")
	return (
	OpenAI(api_key=api_key, timeout=timeout_seconds, max_retries=0)
	if api_key
	else None
	)
	if config.provider == "openrouter":
	api_key = os.getenv("OPENROUTER_API_KEY")
	if not api_key:
	return None
	headers = {}
	if os.getenv("OPENROUTER_HTTP_REFERER"):
	headers["HTTP-Referer"] = os.getenv("OPENROUTER_HTTP_REFERER", "")
	if os.getenv("OPENROUTER_APP_TITLE"):
	app_title = os.getenv("OPENROUTER_APP_TITLE", "")
	headers["X-OpenRouter-Title"] = app_title
	# Keep the legacy header for compatibility with older OpenRouter examples.
	headers["X-Title"] = app_title
	return OpenAI(
	api_key=api_key,
	base_url="https://openrouter.ai/api/v1",
	default_headers=headers or None,
	timeout=timeout_seconds,
	max_retries=0,
	)
	if config.provider == "groq":
	api_key = os.getenv("GROQ_API_KEY")
	if not api_key:
	return None
	return OpenAI(
	api_key=api_key,
	base_url="https://api.groq.com/openai/v1",
	timeout=timeout_seconds,
	max_retries=0,
	)
	if config.provider == "google":
	api_key = os.getenv("GOOGLE_API_KEY")
	if not api_key:
	return None
	return OpenAI(
	api_key=api_key,
	base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
	timeout=timeout_seconds,
	max_retries=0,
	)
	return None


	def _provider_pick(
	config: ProviderConfig,
	observation: dict[str, Any],
	candidates: list[CandidateAction],
	) -> tuple[CandidateAction, bool, bool, str \| None]:
	shortlist, payload = _provider_payload(observation, candidates)

	if config.provider in {"openai", "openrouter", "groq", "google"}:
	client = _openai_compatible_client(config)
	if client is None:
	return shortlist[0], False, False, None
	try:
	response = _chat_completion_with_retry(
	client,
	model=config.model_name,
	temperature=0,
	max_tokens=MAX_PROVIDER_RESPONSE_TOKENS,
	response_format={"type": "json_object"},
	messages=[
	{
	"role": "system",
	"content": (
	"You are a merchant chargeback dispute analyst. Pick the single best next action from the ordered candidate list. "
	"The candidates are pre-sorted by a deterministic heuristic — candidate 0 is usually correct. Deviate only when you spot a concrete reason. "
	"\n"
	"Reason-code → optimal strategy (follow unless evidence clearly contradicts):\n"
	" goods_not_received → contest (with order + delivery proof)\n"
	" fraud_cnp → contest when account linkage exists, otherwise concede\n"
	" product_not_as_described → contest (with listing + return policy proof)\n"
	" service_not_provided → contest (with completion log)\n"
	" credit_not_processed → issue_refund immediately\n"
	" duplicate_processing → issue_refund immediately\n"
	"\n"
	"Priorities: (1) resolve cases whose deadline is 1 step away before anything else, "
	"(2) prefer the highest-$ open case when budget is tight, "
	"(3) never attach harmful evidence (AVS/CVV mismatch on fraud_cnp, GPS anomalies on goods_not_received), "
	"(4) when multiple candidates look equivalent, take candidate 0.\n"
	'Return only JSON: {"candidate_index": N, "rationale": "brief reason"}'
	),
	},
	{"role": "user", "content": payload},
	],
	)
	content = response.choices[0].message.content or "{}"
	choice = _safe_json_loads(content)
	if choice is None:
	return shortlist[0], True, False, "InvalidJSONResponse"
	index = min(max(choice.candidate_index, 0), len(shortlist) - 1)
	return shortlist[index], True, True, None
	except Exception as exc:
	return shortlist[0], True, False, exc.__class__.__name__

	if config.provider == "anthropic":
	api_key = os.getenv("ANTHROPIC_API_KEY")
	if not api_key:
	return shortlist[0], False, False, None
	try: # pragma: no cover
	from anthropic import Anthropic
	except ImportError: # pragma: no cover
	return shortlist[0], False, False, None
	try: # pragma: no cover
	client = Anthropic(
	api_key=api_key,
	timeout=_provider_timeout_seconds(),
	max_retries=0,
	)
	response = client.messages.create(
	model=config.model_name,
	max_tokens=200,
	temperature=0,
	system=(
	"You are a merchant chargeback analyst. Pick the single best next action. "
	"Return only JSON with candidate_index and rationale."
	),
	messages=[{"role": "user", "content": payload}],
	)
	text = "".join(
	block.text
	for block in response.content
	if getattr(block, "type", "") == "text"
	)
	choice = _safe_json_loads(text)
	if choice is None:
	return shortlist[0], True, False, "InvalidJSONResponse"
	index = min(max(choice.candidate_index, 0), len(shortlist) - 1)
	return shortlist[index], True, True, None
	except Exception as exc:
	return shortlist[0], True, False, exc.__class__.__name__

	return shortlist[0], False, False, None


	def _provider_pick_with_fallback(
	config: ProviderConfig,
	observation: dict[str, Any],
	candidates: list[CandidateAction],
	) -> tuple[CandidateAction, bool, bool, str \| None]:
	"""Try the primary provider, then walk the fallback chain on failure."""
	candidate, attempted, succeeded, error = _provider_pick(
	config, observation, candidates
	)
	if succeeded:
	return candidate, attempted, succeeded, error

	for fb_provider, fb_model in _FALLBACK_CHAIN:
	if fb_provider == config.provider:
	continue
	fb_config = ProviderConfig(provider=fb_provider, model_name=fb_model)
	fb_client = _openai_compatible_client(fb_config)
	if fb_client is None:
	continue
	candidate, fb_attempted, fb_succeeded, fb_error = _provider_pick(
	fb_config,
	observation,
	candidates,
	)
	if fb_succeeded:
	return candidate, True, True, None

	return candidate, attempted, False, error or "AllProvidersFailed"


	def run_baseline(
	provider: str \| None = None,
	model_name: str \| None = None,
	) -> BaselineRunResult:
	"""Run the baseline across all built-in tasks."""

	config = _resolve_provider(provider, model_name)
	has_provider_key = any(
	[
	config.provider == "openai" and bool(os.getenv("OPENAI_API_KEY")),
	config.provider == "openrouter" and bool(os.getenv("OPENROUTER_API_KEY")),
	config.provider == "groq" and bool(os.getenv("GROQ_API_KEY")),
	config.provider == "anthropic" and bool(os.getenv("ANTHROPIC_API_KEY")),
	config.provider == "google" and bool(os.getenv("GOOGLE_API_KEY")),
	]
	)
	provider_calls_attempted = 0
	provider_calls_succeeded = 0
	provider_errors: dict[str, int] = {}

	task_results: list[BaselineTaskResult] = []
	for task in list_tasks():
	env = ChargebackOpsEnvironment()
	observation = env.reset(task_id=task.task_id)
	while not observation.done:
	observation_payload = observation.model_dump()
	candidates = candidate_actions(observation_payload)
	if not candidates:
	break
	if len(candidates) == 1:
	candidate = candidates[0]
	observation = env.step(candidate.action)
	continue
	obvious_candidate = _obvious_next_action(observation_payload, candidates)
	if obvious_candidate is not None:
	observation = env.step(obvious_candidate.action)
	continue
	if has_provider_key:
	candidate, attempted, succeeded, error_label = (
	_provider_pick_with_fallback(
	config,
	observation_payload,
	candidates,
	)
	)
	provider_calls_attempted += int(attempted)
	provider_calls_succeeded += int(succeeded)
	if attempted and not succeeded and error_label is not None:
	provider_errors[error_label] = (
	provider_errors.get(error_label, 0) + 1
	)
	if _strict_llm_mode() and attempted and not succeeded:
	raise RuntimeError(
	"STRICT_LLM_MODE is enabled and the provider decision failed, "
	"so heuristic fallback is not allowed."
	)
	else:
	candidate = _heuristic_pick(candidates)
	observation = env.step(candidate.action)

	report = env.state.grader_report or grade_episode(
	task,
	env._progress_by_case, # type: ignore[attr-defined]
	env.state.step_count,
	env.state.episode_id or "",
	completed=env.state.completed,
	)
	task_results.append(
	BaselineTaskResult(
	task_id=task.task_id,
	title=task.title,
	score=report.normalized_score,
	steps_used=env.state.step_count,
	final_status=report.summary,
	)
	)

	average_score = round(
	sum(task_result.score for task_result in task_results) / len(task_results),
	4,
	)
	if provider_calls_attempted == 0:
	mode = "heuristic_fallback"
	elif provider_calls_succeeded == 0:
	mode = "heuristic_fallback"
	elif provider_calls_succeeded < provider_calls_attempted:
	mode = f"{config.provider}_with_fallback"
	else:
	mode = config.provider
	return BaselineRunResult(
	provider=config.provider,
	model_name=config.model_name,
	mode=mode,
	provider_calls_attempted=provider_calls_attempted,
	provider_calls_succeeded=provider_calls_succeeded,
	provider_errors=provider_errors,
	task_results=task_results,
	average_score=average_score,
	)


	def main() -> None:
	"""CLI entry point."""

	print(json.dumps(run_baseline().model_dump(), indent=2))


	if __name__ == "__main__": # pragma: no cover
	main()