"""Gradio demo UI for ChargebackOps."""
from __future__ import annotations
import base64
import os
from pathlib import Path
from typing import Any, Callable
# Ensure matplotlib has a writable config dir on locked-down hosts (e.g. HF
# Spaces). Guarded so importing this module from a notebook doesn't pollute
# the user's environment unnecessarily.
if not os.environ.get("MPLCONFIGDIR"):
os.environ["MPLCONFIGDIR"] = "/tmp/matplotlib"
import gradio as gr
try:
from ..core.models import ChargebackOpsAction
from ..evaluation.rubrics import (
CASE_DIMENSION_NAMES,
CASE_DIMENSION_WEIGHTS,
)
from ..runners.baseline_runner import (
_heuristic_pick,
_obvious_next_action,
candidate_actions,
)
from ..runners.benchmark_runner import POLICY_REGISTRY
from ..scenarios.simulation import get_task, list_tasks
from .chargeback_ops_environment import ChargebackOpsEnvironment
except ImportError: # pragma: no cover
from core.models import ChargebackOpsAction
from evaluation.rubrics import (
CASE_DIMENSION_NAMES,
CASE_DIMENSION_WEIGHTS,
)
from runners.baseline_runner import (
_heuristic_pick,
_obvious_next_action,
candidate_actions,
)
from runners.benchmark_runner import POLICY_REGISTRY
from scenarios.simulation import get_task, list_tasks
from server.chargeback_ops_environment import ChargebackOpsEnvironment
# OpenAI-compatible LLM policy is optional — the demo gracefully degrades to
# scripted policies if the openai SDK or runners.inference is unavailable.
try: # pragma: no cover — exercised only when LLM policy is selected
from openai import OpenAI # noqa: F401
try:
from ..runners.inference import _pick_with_openai_client
except ImportError:
from runners.inference import _pick_with_openai_client
_LLM_POLICY_AVAILABLE = True
except Exception: # pragma: no cover
_pick_with_openai_client = None # type: ignore[assignment]
_LLM_POLICY_AVAILABLE = False
# Path to the bundled hero figures (used by the Training Results tab).
# ``parents[1]`` is the directory one level above the directory that holds
# this module, so figures are looked up in ``<that directory>/docs/figures``.
_FIGURES_DIR = Path(__file__).resolve().parents[1] / "docs" / "figures"
# ---------------------------------------------------------------------------
# Static metadata
# ---------------------------------------------------------------------------
# Human-readable display labels for the 8 rubric dimensions (in canonical order).
# ``_environment_tab_markdown`` zips this tuple positionally with
# ``CASE_DIMENSION_WEIGHTS`` and ``_DIMENSION_SCORING``, so entry *i* here must
# describe the same dimension as entry *i* there. ``zip`` stops at the shortest
# input, so a length mismatch silently drops rows from the rendered table.
_DIMENSION_LABELS: tuple[str, ...] = (
"Strategy Correctness",
"Evidence Quality",
"Packet Validity",
"Deadline Compliance",
"Efficiency",
"Outcome Quality",
"Note Quality",
"Escalation ROI",
)
# Per-dimension scoring summary (kept short so the table fits on one screen).
# Same ordering contract as ``_DIMENSION_LABELS``: entry *i* is the scoring
# blurb shown next to label *i* in the Environment tab's grading table.
_DIMENSION_SCORING: tuple[str, ...] = (
"1.0 optimal · 0.35 acceptable · 0.0 wrong",
"Required + helpful coverage; harmful evidence penalised",
"Binary: all required evidence + zero harmful",
"Binary: case resolved before deadline",
"Penalises waste; rewards early concession",
"1.0 optimal · 0.4 acceptable · 0.0 wrong",
"Policy keywords + evidence references",
"EV-rational arbitration: P(win)·amount vs $250 fee",
)
# Selectable policies for the Run-Episode radio, as (label shown to user,
# policy key) pairs. The four scripted policies come first, ordered
# best → worst so the radio reads top-to-bottom as a discrimination ladder;
# the optional LLM policy is listed last. ``run_episode`` special-cases the
# "llm" key and looks every other key up in ``POLICY_REGISTRY``.
_POLICY_CHOICES: tuple[tuple[str, str], ...] = (
("Heuristic — EV-rational baseline", "heuristic"),
("Escalate-all — contest then always escalate", "escalate_all"),
("Concede-all — always accept the chargeback", "concede_all"),
("Naive — submit empty packet, no evidence", "naive"),
("LLM (OpenAI-compatible API)", "llm"),
)
# Reverse lookup (policy key → display label), used for the "policy: …" part
# of the status lines that ``run_episode`` yields.
_POLICY_LABEL_BY_KEY: dict[str, str] = {
key: label for label, key in _POLICY_CHOICES
}
# Subset used by the Compare tab — scripted-only, deterministic, no API calls.
# ``run_compare`` runs these in the order listed (worst → best), and every key
# is indexed directly in ``POLICY_REGISTRY`` by ``_run_one_episode_sync``.
_COMPARE_POLICIES: tuple[str, ...] = (
"naive",
"concede_all",
"escalate_all",
"heuristic",
)
# One-click presets for the Run-Episode tab. Each preset is
# (button_label, task_id, generated_flag, difficulty, seed, recommended_policy, blurb).
# ``build_demo`` creates one button per entry (labelled with ``button_label``)
# and, on click, writes the remaining fields into the task dropdown, the
# "Generated" checkbox, the difficulty radio, the seed box and the policy
# radio; ``blurb`` is shown as a one-line description under the buttons.
# ``difficulty`` and ``seed`` are only made visible when ``generated_flag`` is
# true.
# NOTE(review): the "Compare all 4 policies" preset only fills the Run-Episode
# inputs like every other preset — it does not switch tabs; its blurb asks the
# user to open the Compare tab themselves.
_PRESETS: tuple[tuple[str, str, bool, str, int, str, str], ...] = (
(
"Easy contestable",
"goods_not_received_easy",
False,
"easy",
42,
"heuristic",
"Goods-not-received with strong evidence — heuristic should win round 1.",
),
(
"Queue optimization (hard)",
"queue_optimization_hard",
False,
"hard",
42,
"heuristic",
"Triage a heterogeneous queue under tight deadlines — exercises EV reasoning.",
),
(
"Long-horizon backlog",
"monthly_dispute_backlog_marathon",
False,
"medium",
42,
"heuristic",
"12 cases over 60 steps with delayed evidence; tests scheduling + waiting.",
),
(
"Generated nightmare",
"generated_nightmare_s31",
True,
"nightmare",
31,
"heuristic",
"Adversarial parametric task — even the heuristic struggles.",
),
(
"Compare all 4 policies",
"goods_not_received_easy",
False,
"easy",
42,
"heuristic",
"Open the Compare tab — same task, all four scripted policies side-by-side.",
),
)
# ---------------------------------------------------------------------------
# CSS
# ---------------------------------------------------------------------------
# Stylesheet for the dashboard widgets, written for a dark background.
# Several class names are produced by code in this module and must stay in
# sync with it: ``urgency-*`` / ``status-*`` are chosen in ``_queue_html``,
# ``round-1`` … ``round-3`` are built in ``_round_panel_html``, ``pnl-pos`` /
# ``pnl-neg`` are chosen in ``_arbitration_panel_html``, and the ``dec-*`` /
# ``outcome-*`` names are the values of ``_DEC_CLASS``.
_CSS = """
.dashboard-header { text-align: center; padding: 16px 0 8px 0; }
.dashboard-header h1 { margin: 0; font-size: 28px; }
.dashboard-header p { margin: 4px 0 0 0; color: #888; font-size: 14px; }
.score-big { text-align: center; padding: 12px 0; }
.score-big .value { font-size: 56px; font-weight: 800; line-height: 1.1; }
.score-big .label { font-size: 13px; color: #888; margin-top: 4px; }
.bar-row { display: flex; align-items: center; margin: 4px 0; font-size: 13px; }
.bar-row .bar-label { width: 80px; flex-shrink: 0; }
.bar-row .bar-track { flex: 1; background: #2a2a2a; border-radius: 4px; height: 18px; overflow: hidden; margin: 0 8px; }
.bar-row .bar-fill { height: 100%; border-radius: 4px; transition: width 0.3s; }
.bar-row .bar-value { width: 44px; text-align: right; flex-shrink: 0; }
.case-card { border: 1px solid #3a3a3a; border-radius: 8px; padding: 14px; margin: 8px 0; background: #1a1a1a; }
.case-card .case-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px; }
.case-card .case-header .case-id { font-weight: 700; font-size: 15px; }
.case-card .case-header .case-meta { font-size: 12px; color: #999; }
.case-card .case-notes { font-size: 11px; color: #777; margin-top: 8px; }
.queue-table { width: 100%; border-collapse: collapse; font-size: 13px; }
.queue-table th { text-align: left; padding: 8px; border-bottom: 2px solid #444; font-weight: 600; color: #ccc; }
.queue-table td { padding: 8px; border-bottom: 1px solid #2a2a2a; }
.queue-table tr:hover { background: #1e1e1e; }
.urgency-crit { color: #ef4444; font-weight: 700; }
.urgency-warn { color: #eab308; font-weight: 600; }
.urgency-ok { color: #22c55e; }
.status-open { color: #3b82f6; }
.status-done { color: #22c55e; }
.status-fail { color: #ef4444; }
.budget-section { padding: 8px 0; }
.budget-section .budget-label { display: flex; justify-content: space-between; font-size: 13px; margin-bottom: 3px; }
.color-green { color: #22c55e; }
.color-yellow { color: #eab308; }
.color-red { color: #ef4444; }
.color-blue { color: #3b82f6; }
.round-panel { border: 1px solid #3a3a3a; border-radius: 8px; padding: 12px 14px; margin: 8px 0; background: #1a1a1a; }
.round-panel .panel-title { font-weight: 700; font-size: 13px; color: #ccc; margin-bottom: 6px; text-transform: uppercase; letter-spacing: 0.5px; }
.round-badge { display: inline-block; padding: 3px 10px; border-radius: 12px; font-size: 12px; font-weight: 700; margin-right: 8px; }
.round-1 { background: #1e3a8a; color: #93c5fd; }
.round-2 { background: #78350f; color: #fcd34d; }
.round-3 { background: #7f1d1d; color: #fca5a5; }
.issuer-quote { font-style: italic; color: #d4d4d4; font-size: 13px; padding: 6px 10px; border-left: 3px solid #6366f1; margin: 6px 0; background: #15171f; }
.issuer-decision { font-weight: 700; font-size: 13px; }
.dec-accept { color: #22c55e; }
.dec-request { color: #eab308; }
.dec-escalate { color: #ef4444; }
.arb-panel { border: 1px solid #7f1d1d; border-radius: 8px; padding: 12px 14px; margin: 8px 0; background: #1a0e0e; }
.arb-row { display: flex; justify-content: space-between; padding: 4px 0; font-size: 13px; }
.arb-row .arb-label { color: #999; }
.arb-row .arb-value { font-weight: 700; }
.outcome-merchant { color: #22c55e; }
.outcome-issuer { color: #ef4444; }
.pnl-pos { color: #22c55e; font-weight: 800; }
.pnl-neg { color: #ef4444; font-weight: 800; }
"""
# ---------------------------------------------------------------------------
# HTML builders
# ---------------------------------------------------------------------------
def _bar_html(label: str, value: float, color: str) -> str:
"""Render one labelled score bar as an HTML snippet.

``value`` is expected on a 0–1 scale: it is converted to a whole-number
percentage clamped to 0–100, while the numeric readout prints the raw value
with two decimals. ``color`` is a CSS colour supplied by the caller
(``_grader_html`` passes the result of ``_score_color``).
"""
# Clamp so out-of-range scores cannot produce a percentage outside 0–100.
pct = max(0, min(100, int(value * 100)))
return (
f'
'
f'
{label}'
f'
'
f'
{value:.2f}'
f"
"
)
def _score_color(v: float) -> str:
    """Map a score to a traffic-light hex colour.

    Scores of at least 0.8 are green, scores of at least 0.4 are yellow and
    everything else (including negative values) is red.
    """
    # Thresholds are checked from the highest floor down; first match wins.
    for floor, colour in ((0.8, "#22c55e"), (0.4, "#eab308")):
        if v >= floor:
            return colour
    return "#ef4444"
def _queue_html(observation) -> str:
"""Render the dispute queue as an HTML table, one row per queued case.

Reads ``observation.queue``; when it is empty a short "No cases."
placeholder is returned instead of a table. Each row shows the case id,
card network + network reason code, reason code, amount, steps until the
deadline and the case status.
"""
if not observation.queue:
return "No cases.
"
rows = ""
for c in observation.queue:
sl = c.steps_until_deadline
# Deadline urgency: <=1 step left is critical, <=3 is a warning, else ok.
if sl <= 1:
urg_cls, urg_icon = "urgency-crit", "!!"
elif sl <= 3:
urg_cls, urg_icon = "urgency-warn", "!"
else:
urg_cls, urg_icon = "urgency-ok", ""
# Status colouring: "open" is in progress, the three listed terminal
# statuses count as done, and any other status is styled as a failure.
if c.status == "open":
st_cls = "status-open"
elif c.status in ("won", "refunded", "accepted_chargeback"):
st_cls = "status-done"
else:
st_cls = "status-fail"
st_label = c.status.replace("_", " ").title()
net = f"{c.card_network.upper()} {c.network_reason_code}"
rows += (
f""
f"| {c.case_id} | "
f"{net} | "
f"{c.reason_code.replace('_', ' ')} | "
f"${c.amount:,.2f} | "
f'{urg_icon} {sl} | '
f'{st_label} | '
f"
"
)
return (
f''
f"| Case | Network | Reason | "
f"Amount | Deadline | "
f"Status |
"
f"{rows}
"
)
def _budget_html(steps_used: int, max_steps: int, score: float) -> str:
"""Render the step-budget / score summary as an HTML snippet.

Shows how many steps remain out of ``max_steps`` and the current score with
three decimals. The budget colour goes green → yellow → red as the share of
used steps crosses 50 % and 80 %.
"""
# ``max(1, max_steps)`` avoids a division by zero when the budget is 0.
steps_pct = min(100, int(100 * steps_used / max(1, max_steps)))
score_pct = min(100, int(100 * score))
remaining = max_steps - steps_used
if steps_pct < 50:
budget_color = "#22c55e"
elif steps_pct < 80:
budget_color = "#eab308"
else:
budget_color = "#ef4444"
return f"""
Steps{remaining} left of {max_steps}
Score{score:.3f}
"""
# CSS class used to colour an issuer decision or an arbitration outcome.
# Keys are the raw decision/outcome strings read from the visible case;
# callers use ``.get(value, "")`` so unknown values simply get no class.
_DEC_CLASS = {
"accept": "dec-accept",
"request_more_evidence": "dec-request",
"escalate_to_arbitration": "dec-escalate",
"merchant_wins": "outcome-merchant",
"issuer_wins": "outcome-issuer",
}
def _round_panel_html(
observation, history: list[dict[str, str]] | None = None
) -> str:
"""Render the visible case's round panel, including a chronological
issuer-message log so multi-round disputes show every R1/R2/R3 message.
``history`` is a list of ``{round, decision, rationale}`` dicts the caller
accumulates across steps.

Returns an empty string when the observation has no visible case. When
``history`` is empty or ``None``, only the last issuer decision/rationale
stored on the visible case is shown. Any pre-arbitration evidence ids on the
case are listed at the end of the panel.

NOTE(review): case ids, rationales and evidence ids are interpolated into
the markup as-is, with no HTML escaping in this function — fine while they
come from the simulator; confirm before feeding untrusted text.
"""
vc = observation.visible_case
if vc is None:
return ""
# A missing/zero round number is treated as round 1; rounds above 3 reuse
# the round-3 badge style.
rnd = vc.round_number or 1
badge_cls = f"round-{min(rnd, 3)}"
rnd_label = {1: "Representment", 2: "Pre-Arbitration", 3: "Arbitration"}.get(rnd, f"Round {rnd}")
body = (
f''
f'R{rnd}'
f'{rnd_label} · case {vc.case_id}'
f'
'
)
# Show full issuer-message history if we have it, else fall back to the
# last-message snapshot from the observation.
rendered_any = False
if history:
for entry in history:
ent_rnd = entry.get("round", "?")
ent_dec = entry.get("decision") or ""
ent_rat = entry.get("rationale") or ""
# Non-numeric round markers (such as the "?" default) fall back to the
# round-1 badge; numeric rounds are capped at the round-3 style.
ent_badge_cls = f"round-{min(int(ent_rnd) if str(ent_rnd).isdigit() else 1, 3)}"
dec_cls = _DEC_CLASS.get(ent_dec, "")
dec_pretty = ent_dec.replace("_", " ").title() if ent_dec else "(no decision)"
body += (
f''
f'R{ent_rnd}'
f'Issuer: {dec_pretty}'
f'
'
)
if ent_rat:
body += f'“{ent_rat}”
'
rendered_any = True
if not rendered_any and vc.last_issuer_decision:
dec = vc.last_issuer_decision
dec_cls = _DEC_CLASS.get(dec, "")
dec_pretty = dec.replace("_", " ").title()
body += f'Issuer: {dec_pretty}
'
if vc.last_issuer_rationale:
body += f'“{vc.last_issuer_rationale}”
'
if vc.pre_arb_evidence_added:
ids = ", ".join(vc.pre_arb_evidence_added)
body += (
f''
f'Pre-arb evidence added: {ids}
'
)
return f'{body}
'
def _arbitration_panel_html(observation) -> str:
"""Render the arbitration-outcome panel for the visible case.

Returns an empty string unless there is a visible case that already has an
``arbitration_outcome``. Otherwise shows the ruling, the arbitration fees
paid and the final P&L.
"""
vc = observation.visible_case
if vc is None or vc.arbitration_outcome is None:
return ""
outcome = vc.arbitration_outcome
outcome_cls = _DEC_CLASS.get(outcome, "")
outcome_label = outcome.replace("_", " ").title()
pnl = vc.final_economic_outcome
# A missing P&L is styled as negative and printed as "n/a"; zero counts as
# non-negative.
pnl_cls = "pnl-pos" if (pnl is not None and pnl >= 0) else "pnl-neg"
pnl_str = f"${pnl:+,.2f}" if pnl is not None else "n/a"
fees = vc.arb_fees_paid or 0.0
return (
f''
f'
ARBArbitration Outcome
'
f'
Ruling'
f'{outcome_label}
'
f'
Arb fees paid'
f'${fees:,.2f}
'
f'
Final P&L'
f'{pnl_str}
'
f'
'
)
def _grader_html(report: dict | None) -> str:
"""Render a grader report dict as HTML: headline score plus per-case bars.

``report`` is the dumped grader report; the keys read here are
``normalized_score``, ``summary`` and ``case_reports`` (each case report
providing ``case_id``, ``final_resolution``, ``weighted_score``, ``notes``
and one value per rubric dimension). Missing keys default to 0 / "".
Returns an empty string when ``report`` is ``None`` or empty.
"""
if not report:
return ""
score = report.get("normalized_score", 0)
summary = report.get("summary", "")
sc = _score_color(score)
html = (
f''
f'
{score:.3f}
'
f'
{summary}
'
f"
"
)
# (short label, case-report key, weight label) per rubric dimension.
# NOTE(review): the percentage strings are display labels hard-coded here;
# they are not read from ``CASE_DIMENSION_WEIGHTS`` — keep them in sync.
dims = [
("Strategy", "strategy_correctness", "20%"),
("Evidence", "evidence_quality", "15%"),
("Packet", "packet_validity", "10%"),
("Deadline", "deadline_compliance", "10%"),
("Efficiency", "efficiency", "10%"),
("Outcome", "outcome_quality", "10%"),
("Note", "note_quality", "5%"),
("Esc ROI", "escalation_roi", "20%"),
]
for case in report.get("case_reports", []):
cid = case.get("case_id", "")
res = case.get("final_resolution", "")
ws = case.get("weighted_score", 0)
bars = ""
for label, key, weight in dims:
v = case.get(key, 0)
bars += _bar_html(f"{label} ({weight})", v, _score_color(v))
notes = case.get("notes", "")
notes_html = f'{notes}
' if notes else ""
html += (
f''
f'"
f"{bars}{notes_html}"
f"
"
)
return html
# ---------------------------------------------------------------------------
# Episode runner (generator — yields per step)
# ---------------------------------------------------------------------------
def _resolve_task_id(task_id: str, generated: bool, difficulty: str, seed: int) -> str:
    """Return the task id to run.

    Catalog tasks keep ``task_id`` unchanged; generated tasks get a synthetic
    id built from ``difficulty`` and ``seed``.
    """
    return f"generated_{difficulty}_s{seed}" if generated else task_id
def _build_llm_policy(
    base_url: str, api_key: str, model_name: str
) -> tuple[Callable[[dict[str, Any]], ChargebackOpsAction | None], str]:
    """Return ``(policy_fn, label)`` calling an OpenAI-compatible chat model.

    The policy mirrors the production inference pipeline in
    :mod:`runners.inference`: candidate generation + obvious-action shortcut +
    LLM pick over the shortlist. On any LLM failure (network, parse, missing
    key) it falls back to the heuristic so the demo never freezes mid-stream.

    UI fields take precedence; blanks fall back to environment variables, so
    HF Space operators can wire credentials via Space Secrets without the
    public demo asking visitors for keys. Resolution order:

    * **Base URL** — UI value, then ``API_BASE_URL``, then the Groq endpoint
      if ``GROQ_API_KEY`` is set, then the OpenRouter endpoint if
      ``OPENROUTER_API_KEY`` is set, else the Hugging Face router.
    * **API key** — UI value, then the variable that belongs to the resolved
      provider (``GROQ_API_KEY`` / ``OPENROUTER_API_KEY`` / ``HF_TOKEN``),
      then ``HF_TOKEN`` / ``API_KEY`` / ``OPENROUTER_API_KEY`` /
      ``GROQ_API_KEY`` in that order.
    * **Model** — UI value, then ``MODEL_NAME``, then a provider default.

    Args:
        base_url: Endpoint typed in the UI; blank means "resolve from env".
        api_key: Key typed in the UI; blank means "resolve from env".
        model_name: Model typed in the UI; blank means "resolve from env".

    Returns:
        A policy callable taking the dumped observation dict, and a display
        label of the form ``"LLM (<model name>)"``.

    Raises:
        RuntimeError: If the ``openai`` SDK / inference helper is unavailable,
            or if no API key can be resolved.
    """
    if not _LLM_POLICY_AVAILABLE or _pick_with_openai_client is None:
        raise RuntimeError(
            "openai SDK is not available — install `openai` to use the LLM policy."
        )

    base_url = (base_url or "").strip()
    api_key = (api_key or "").strip()
    model_name = (model_name or "").strip()

    # Resolve the endpoint and provider *before* the key, so that the key
    # picked from the environment is the one that belongs to that provider.
    if not base_url:
        base_url = os.getenv("API_BASE_URL", "").strip()

    provider = ""
    if base_url:
        lowered = base_url.lower()
        if "groq" in lowered:
            provider = "groq"
        elif "openrouter" in lowered:
            provider = "openrouter"
        elif "huggingface" in lowered or "hf.space" in lowered:
            provider = "hf"
        elif "openai.com" in lowered:
            provider = "openai"
    elif os.getenv("GROQ_API_KEY"):
        base_url, provider = "https://api.groq.com/openai/v1", "groq"
    elif os.getenv("OPENROUTER_API_KEY"):
        base_url, provider = "https://openrouter.ai/api/v1", "openrouter"
    else:
        base_url, provider = "https://router.huggingface.co/v1", "hf"

    if not api_key:
        # Previously the first non-empty variable in a fixed order won, so an
        # HF_TOKEN (commonly present on Spaces) was sent to Groq/OpenRouter
        # even when the matching provider key was set. Try the provider's own
        # variable first, then keep the documented order as the fallback.
        key_vars = ["HF_TOKEN", "API_KEY", "OPENROUTER_API_KEY", "GROQ_API_KEY"]
        preferred = {
            "groq": "GROQ_API_KEY",
            "openrouter": "OPENROUTER_API_KEY",
            "hf": "HF_TOKEN",
        }.get(provider)
        if preferred is not None:
            key_vars.remove(preferred)
            key_vars.insert(0, preferred)
        candidates = ((os.getenv(name) or "").strip() for name in key_vars)
        api_key = next((value for value in candidates if value), "")

    if not model_name:
        model_name = os.getenv("MODEL_NAME", "").strip()
    if not model_name:
        # Provider-appropriate defaults — every option here works without
        # the user having to look up a model card. The lookup always yields a
        # non-empty name, so no further "model missing" check is needed.
        provider_defaults = {
            "groq": "llama-3.3-70b-versatile",
            "openrouter": "meta-llama/llama-3.1-8b-instruct:free",
            "openai": "gpt-4o-mini",
            "hf": "Qwen/Qwen2.5-72B-Instruct",
        }
        model_name = provider_defaults.get(provider, "Qwen/Qwen2.5-72B-Instruct")

    if not api_key:
        raise RuntimeError(
            "No API key — type one in the UI, or set HF_TOKEN / API_KEY / "
            "OPENROUTER_API_KEY / GROQ_API_KEY in the environment (HF Space "
            "Secrets work too)."
        )

    # Short timeout and no retries: a slow provider should degrade to the
    # heuristic quickly instead of stalling the streamed episode.
    client = OpenAI(
        base_url=base_url,
        api_key=api_key,
        timeout=15.0,
        max_retries=0,
    )

    def policy_fn(observation: dict[str, Any]) -> ChargebackOpsAction | None:
        cands = candidate_actions(observation)
        if not cands:
            return None
        # Skip the model call when there is nothing to choose between.
        if len(cands) == 1:
            return cands[0].action
        obvious = _obvious_next_action(observation, cands)
        if obvious is not None:
            return obvious.action
        try:
            pick, _ok, _err = _pick_with_openai_client(
                client, model_name, observation, cands
            )
            return pick.action
        except Exception:
            # Deliberate best-effort: any provider/parse failure falls back
            # to the heuristic so the episode keeps running.
            return _heuristic_pick(cands).action

    label = f"LLM ({model_name})"
    return policy_fn, label
def _result_badge(result: str | None) -> str:
    """Prefix a step result string with a status marker for fast scanning.

    ``✗`` marks results mentioning an error/rejection/invalid input/failure,
    ``⚠`` marks no-ops and ignored/skipped actions, and ``✓`` marks everything
    else. Matching is case-insensitive, and the failure keywords win when both
    groups match. A missing or empty result yields ``"· (no result)"``.
    """
    if not result:
        return "· (no result)"

    text = str(result)
    lowered = text.lower()

    def _mentions(*needles: str) -> bool:
        return any(needle in lowered for needle in needles)

    if _mentions("error", "reject", "invalid", "fail"):
        marker = "✗"
    elif _mentions("no-op", "noop", "ignored", "skipped"):
        marker = "⚠"
    else:
        marker = "✓"
    return f"{marker} {text}"
def _resolve_max_steps(observation, task_id: str) -> int:
    """Return the step budget to show in the budget bar.

    The value in ``observation.info["current_task_max_steps"]`` is used when it
    is a positive integer. Otherwise the budget is read from the task
    definition for ``task_id``; if that lookup fails for any reason, 60 is
    returned so the bar can still be drawn.
    """
    budget = observation.info.get("current_task_max_steps")
    if not isinstance(budget, int) or budget <= 0:
        try:
            return int(get_task(task_id).max_steps)
        except Exception:  # pragma: no cover — defensive
            return 60
    return budget
def run_episode(
    task_id: str,
    generated: bool,
    difficulty: str,
    seed: int,
    policy: str = "heuristic",
    llm_base_url: str = "",
    llm_api_key: str = "",
    llm_model: str = "",
):
    """Run one episode and stream a UI frame after every step.

    This is a generator: every yielded value is an 8-tuple in the order of the
    outputs wired to the "Run Episode" button — status markdown, queue HTML,
    budget HTML, trace rows, round-panel HTML, arbitration-panel HTML, grader
    HTML and the raw grader report dict (``None`` until the final frame).

    Args:
        task_id: Catalog task id; ignored when ``generated`` is true.
        generated: Run a generated task built from ``difficulty`` and ``seed``.
        difficulty: Difficulty forwarded to ``env.reset``.
        seed: Seed forwarded to ``env.reset`` (coerced with ``int``).
        policy: Policy key; ``"llm"`` builds an LLM policy, any other key is
            looked up in ``POLICY_REGISTRY`` and unknown keys fall back to
            ``"heuristic"``.
        llm_base_url: Endpoint for the LLM policy (blank → environment).
        llm_api_key: API key for the LLM policy (blank → environment).
        llm_model: Model name for the LLM policy (blank → environment).

    If the LLM policy cannot be built, the episode runs with the heuristic and
    the policy label shown in every later status line says so — the dedicated
    error frame alone is replaced by the next frame almost immediately. If the
    policy or ``env.step`` raises, an error frame is yielded and the episode
    stops without a final report.
    """
    tid = _resolve_task_id(task_id, generated, difficulty, int(seed))
    env = ChargebackOpsEnvironment()
    obs = env.reset(task_id=tid, difficulty=difficulty, seed=int(seed))
    max_steps = _resolve_max_steps(obs, tid)
    rows: list[list[Any]] = []

    policy_fn: Callable[[dict[str, Any]], ChargebackOpsAction | None] | None = None
    if policy == "llm":
        try:
            policy_fn, policy_label = _build_llm_policy(
                llm_base_url, llm_api_key, llm_model
            )
        except Exception as exc:
            err_md = (
                f"### LLM policy unavailable\n"
                f"`{type(exc).__name__}: {exc}`\n\n"
                f"Falling back to **heuristic** for this run."
            )
            policy = "heuristic"
            policy_fn = POLICY_REGISTRY["heuristic"]
            # Keep the fallback visible: the error frame below is overwritten
            # by the header frame right after it, but this label is repeated
            # in the header, every step status and the final summary.
            policy_label = (
                f"{_POLICY_LABEL_BY_KEY[policy]} "
                f"(fallback — LLM unavailable: {type(exc).__name__})"
            )
            yield (
                err_md,
                _queue_html(obs),
                _budget_html(0, max_steps, 0.0),
                [],
                "",
                "",
                "",
                None,
            )
    if policy_fn is None:
        policy_fn = POLICY_REGISTRY.get(policy) or POLICY_REGISTRY["heuristic"]
        if policy not in POLICY_REGISTRY:
            policy = "heuristic"
        policy_label = _POLICY_LABEL_BY_KEY.get(policy, policy)

    # Per-case issuer-message log: case_id -> [{"round","decision","rationale"}]
    issuer_log: dict[str, list[dict[str, str]]] = {}

    def _maybe_log_issuer_msg(observation) -> None:
        """Append the visible case's latest issuer message to its log."""
        vc = observation.visible_case
        if vc is None or not vc.last_issuer_decision:
            return
        log = issuer_log.setdefault(vc.case_id, [])
        entry = {
            "round": str(vc.round_number or 1),
            "decision": vc.last_issuer_decision or "",
            "rationale": vc.last_issuer_rationale or "",
        }
        # Avoid duplicating the same message on adjacent steps.
        if not log or log[-1] != entry:
            log.append(entry)

    def _current_history(observation) -> list[dict[str, str]]:
        """Return the issuer-message log of the currently visible case."""
        vc = observation.visible_case
        if vc is None:
            return []
        return issuer_log.get(vc.case_id, [])

    def _frame(status_md, observation, steps_used, grader_html="", raw_report=None):
        """Build one 8-tuple UI frame from the given observation."""
        return (
            status_md,
            _queue_html(observation),
            _budget_html(steps_used, max_steps, observation.progress_score),
            # Copy the rows so later appends do not mutate frames already sent.
            [row[:] for row in rows],
            _round_panel_html(observation, _current_history(observation)),
            _arbitration_panel_html(observation),
            grader_html,
            raw_report,
        )

    header = (
        f"### {obs.task_title}\n"
        f"`{obs.task_id}` — {len(obs.queue)} case(s), "
        f"{max_steps} steps, **{obs.difficulty}** · policy: **{policy_label}**"
    )
    # The opening frame always shows an untouched budget (0 steps, score 0.0).
    yield (
        header,
        _queue_html(obs),
        _budget_html(0, max_steps, 0.0),
        [row[:] for row in rows],
        _round_panel_html(obs, _current_history(obs)),
        _arbitration_panel_html(obs),
        "",
        None,
    )

    step = 0
    while not obs.done:
        payload = obs.model_dump()
        try:
            action = policy_fn(payload)
        except Exception as exc:  # pragma: no cover — surface in UI
            err_md = (
                f"### Policy error\n"
                f"`{policy}` raised `{type(exc).__name__}: {exc}` on step {step + 1}. "
                f"Halting episode."
            )
            yield _frame(err_md, obs, step)
            return
        if action is None:
            # The policy has nothing left to do; finish with the current state.
            break

        step += 1
        try:
            obs = env.step(action)
        except Exception as exc:  # pragma: no cover — surface in UI
            err_md = (
                f"### Environment error\n"
                f"`env.step({action.action_type})` raised "
                f"`{type(exc).__name__}: {exc}` on step {step}. "
                f"Halting episode."
            )
            # Record the failed action in the trace; ``obs`` is still the
            # observation from before the failed step.
            rows.append(
                [
                    step,
                    action.action_type,
                    action.case_id or "",
                    action.system_name or "",
                    action.strategy or "",
                    0.0,
                    f"✗ error: {type(exc).__name__}",
                ]
            )
            yield _frame(err_md, obs, step)
            return

        _maybe_log_issuer_msg(obs)
        rows.append(
            [
                step,
                action.action_type,
                action.case_id or obs.selected_case_id or "",
                action.system_name or "",
                action.strategy or "",
                round(obs.reward or 0.0, 4),
                _result_badge(obs.last_action_result),
            ]
        )
        status_md = (
            f"**Step {step}** — `{action.action_type}` "
            f"→ reward **{round(obs.reward or 0.0, 4)}** · policy: **{policy_label}**"
        )
        grader = (
            _grader_html(obs.grader_report.model_dump()) if obs.grader_report else ""
        )
        yield _frame(status_md, obs, step, grader)

    report = obs.grader_report.model_dump() if obs.grader_report else None
    sc = f"{obs.grader_report.normalized_score:.3f}" if obs.grader_report else "n/a"
    final_md = (
        f"### Done — score **{sc}** in **{len(rows)}** steps "
        f"· policy: **{policy_label}**"
    )
    # Only the final frame carries the raw report for the JSON accordion.
    yield _frame(final_md, obs, step, _grader_html(report), report)
# ---------------------------------------------------------------------------
# Compare tab — run all four scripted policies on the same task in series and
# render a single side-by-side bar chart of the final scores plus a per-policy
# summary table (score, steps, grader summary).
# ---------------------------------------------------------------------------
def _run_one_episode_sync(task_id: str, policy_key: str) -> dict[str, Any]:
    """Synchronously run a single scripted-policy episode and return a summary.

    Cheap because every policy in :data:`_COMPARE_POLICIES` is pure-Python and
    fully offline (no provider calls).

    Args:
        task_id: Task id passed to ``env.reset``.
        policy_key: Key into ``POLICY_REGISTRY``.

    Returns:
        A dict with ``policy`` (the key), ``score`` (normalized grader score,
        ``0.0`` when no grader report exists), ``steps`` (number of successful
        ``env.step`` calls) and ``summary`` (the grader summary). If the policy
        or the environment raised, ``summary`` starts with a note naming the
        failure, so an aborted run is not mistaken for a genuinely low score.

    Raises:
        KeyError: If ``policy_key`` is not in ``POLICY_REGISTRY``.
    """
    env = ChargebackOpsEnvironment()
    obs = env.reset(task_id=task_id)
    policy_fn = POLICY_REGISTRY[policy_key]

    steps = 0
    failure = ""
    while not obs.done:
        try:
            action = policy_fn(obs.model_dump())
        except Exception as exc:
            failure = f"policy raised {type(exc).__name__}: {exc}"
            break
        if action is None:
            break
        try:
            obs = env.step(action)
        except Exception as exc:
            failure = f"env.step raised {type(exc).__name__}: {exc}"
            break
        steps += 1

    report = obs.grader_report
    summary = report.summary if report else ""
    if failure:
        # Best-effort is kept (the comparison still renders), but the reason
        # the episode stopped early is now visible in the summary column.
        summary = f"⚠ halted after {steps} step(s): {failure}. {summary}".strip()
    return {
        "policy": policy_key,
        "score": float(report.normalized_score if report else 0.0),
        "steps": steps,
        "summary": summary,
    }
def run_compare(task_id: str, generated: bool, difficulty: str, seed: int):
"""Run all four scripted policies on the same task and render a chart.

The task id is resolved like in ``run_episode``; each policy in
``_COMPARE_POLICIES`` is then run once via ``_run_one_episode_sync``.

Returns a 3-tuple matching the Compare tab outputs: explanatory markdown,
an HTML bar chart (preceded by a title line with the heuristic − naive
score delta), and table rows of ``[policy, score, steps, summary]``.

NOTE(review): ``difficulty`` and ``seed`` only influence the run through the
resolved task id — ``_run_one_episode_sync`` calls ``env.reset`` with the
task id alone; confirm that is sufficient for generated tasks.
"""
tid = _resolve_task_id(task_id, generated, difficulty, int(seed))
results = [_run_one_episode_sync(tid, p) for p in _COMPARE_POLICIES]
# Bar-chart HTML (CSS-only, no extra deps).
# Bars are scaled relative to the best score; ``or 1.0`` keeps the divisor
# non-zero when every policy scored 0.
max_score = max((r["score"] for r in results), default=1.0) or 1.0
bars = ""
for r in results:
pct = int(round(100 * r["score"] / max(0.001, max_score)))
color = _score_color(r["score"])
bars += (
f''
f'
{r["policy"]}'
f'
'
f'
'
f'{r["score"]:.3f} · {r["steps"]} steps'
f'
'
)
# Discrimination delta.
by_policy = {r["policy"]: r["score"] for r in results}
delta = by_policy.get("heuristic", 0.0) - by_policy.get("naive", 0.0)
# NOTE(review): the "+" below is a literal prefix, so a negative delta would
# render as "+-0.123"; ``{delta:+.3f}`` would print the sign correctly.
title = (
f''
f'Task: {tid} · '
f'Discrimination delta (heuristic − naive) = '
f''
f'+{delta:.3f}'
f'
'
)
md = (
f"### Side-by-side: 4 scripted policies on the same task\n"
f"Same `task_id`, same `seed`, no provider calls. The discrimination "
f"gradient (`naive` → `concede_all` → `escalate_all` → `heuristic`) "
f"is the empirical evidence behind the README's `+0.813` claim."
)
table_rows = [
[r["policy"], f"{r['score']:.3f}", r["steps"], r["summary"]]
for r in results
]
return md, title + '' + bars + "
", table_rows
# ---------------------------------------------------------------------------
# Build Gradio app
# ---------------------------------------------------------------------------
def build_demo() -> gr.Blocks:
"""Build and return the Gradio ``Blocks`` app for the demo.

The app has six tabs: Run Episode (streams ``run_episode``), Compare
policies (``run_compare``), Task Catalog (built from ``list_tasks()``),
Environment, Rubric Tree and Training Results. All event wiring happens
here; nothing is launched.
"""
tasks = list_tasks()
task_ids = [t.task_id for t in tasks]
# Fall back to a known task id so the dropdowns still have a value when the
# catalog is empty.
default = task_ids[0] if task_ids else "goods_not_received_easy"
with gr.Blocks(title="ChargebackOps") as demo:
# Inject CSS (Gradio 6 moved css= to launch(); ")
# Header + context links
gr.HTML(
'"
)
with gr.Tabs():
# ── Tab 1: Run Episode ────────────────────────────────
with gr.Tab("Run Episode"):
# Preset buttons row — one-click task+policy configuration.
gr.Markdown("**Quick presets** — click any to load a known-good configuration.")
with gr.Row():
preset_buttons = [
gr.Button(p[0], size="sm", scale=1) for p in _PRESETS
]
preset_blurb = gr.Markdown("")
with gr.Row():
dd_task = gr.Dropdown(
label="Task", choices=task_ids, value=default, scale=3
)
cb_gen = gr.Checkbox(label="Generated", value=False, scale=1)
rd_diff = gr.Radio(
["easy", "medium", "hard", "nightmare"],
label="Difficulty",
value="easy",
visible=False,
scale=2,
)
nb_seed = gr.Number(
label="Seed", value=42, precision=0, visible=False, scale=1
)
with gr.Row():
rd_policy = gr.Radio(
choices=list(_POLICY_CHOICES),
value="heuristic",
label="Policy",
scale=4,
)
btn_run = gr.Button("Run Episode", variant="primary", scale=1)
# LLM-policy inputs — only visible when "LLM" is selected.
with gr.Accordion(
"LLM policy settings (used when 'LLM' is selected above)",
open=False,
visible=False,
) as llm_accordion:
gr.Markdown(
"Bring your own OpenAI-compatible endpoint. Defaults match the "
"Hugging Face router; OpenRouter, Groq, Together, Fireworks, "
"and Anthropic-compatible gateways all work. **Leave fields "
"blank** to inherit `HF_TOKEN` / `OPENROUTER_API_KEY` / "
"`GROQ_API_KEY` / `API_BASE_URL` / `MODEL_NAME` from the "
"environment (set them as Space Secrets when deploying)."
)
# NOTE(review): the Base URL and Model boxes are pre-filled, and
# ``_build_llm_policy`` only reads ``API_BASE_URL`` / ``MODEL_NAME``
# when the corresponding field is blank — so those two variables are
# ignored unless the user clears the boxes first.
with gr.Row():
tb_llm_base = gr.Textbox(
label="Base URL",
value="https://router.huggingface.co/v1",
scale=2,
)
tb_llm_model = gr.Textbox(
label="Model",
value="Qwen/Qwen2.5-72B-Instruct",
scale=2,
)
tb_llm_key = gr.Textbox(
label="API key",
value="",
type="password",
scale=2,
)
md_status = gr.Markdown(
"Pick a task + policy and click **Run Episode**. Run the same task "
"under each of the four scripted policies (heuristic, escalate-all, "
"concede-all, naive) to reproduce the discrimination gradient — naive "
"→ 0.000, concede-all → ~0.44, escalate-all → ~0.77, heuristic → ~0.81. "
"Or pick **LLM** and bring your own model. For a side-by-side view, "
"open the **Compare policies** tab."
)
with gr.Row(equal_height=True):
with gr.Column(scale=3):
html_queue = gr.HTML(label="Dispute Queue")
with gr.Column(scale=1, min_width=200):
html_budget = gr.HTML(label="Budget")
df_trace = gr.Dataframe(
headers=[
"#",
"Action",
"Case",
"System",
"Strategy",
"Reward",
"Result",
],
datatype=["number", "str", "str", "str", "str", "number", "str"],
interactive=False,
wrap=True,
label="Step Trace (✓ accepted · ⚠ no-op · ✗ rejected)",
)
with gr.Row(equal_height=True):
with gr.Column(scale=1):
html_round = gr.HTML(label="Dispute Round (issuer messages)")
with gr.Column(scale=1):
html_arb = gr.HTML(label="Arbitration")
html_grader = gr.HTML(label="Grader Report")
with gr.Accordion("Raw grader JSON (export-friendly)", open=False):
json_raw = gr.JSON(label="Raw JSON", show_label=False)
# ``run_episode`` is a generator; each 8-tuple it yields fills these
# eight outputs in order. Note the input order: the API key box is
# passed before the model box, matching the function's parameters.
btn_run.click(
fn=run_episode,
inputs=[
dd_task, cb_gen, rd_diff, nb_seed, rd_policy,
tb_llm_base, tb_llm_key, tb_llm_model,
],
outputs=[
md_status,
html_queue,
html_budget,
df_trace,
html_round,
html_arb,
html_grader,
json_raw,
],
)
# Generated-checkbox visibility callback.
def _toggle_generated(generated: bool):
return (
gr.update(visible=generated),
gr.update(visible=generated),
)
cb_gen.change(
fn=_toggle_generated,
inputs=[cb_gen],
outputs=[rd_diff, nb_seed],
)
# Show LLM accordion only when 'llm' policy is selected.
def _toggle_llm(policy: str):
return gr.update(visible=(policy == "llm"), open=(policy == "llm"))
rd_policy.change(
fn=_toggle_llm, inputs=[rd_policy], outputs=[llm_accordion]
)
# Wire each preset button to populate the inputs atomically.
# The factory binds one preset per handler, so each button keeps its own
# values instead of sharing the loop variable.
def _make_preset_handler(preset):
label, t_id, gen, diff, seed_v, pol, blurb = preset
def _apply():
return (
t_id, # dd_task
gen, # cb_gen
gr.update(value=diff, visible=gen), # rd_diff
gr.update(value=seed_v, visible=gen), # nb_seed
pol, # rd_policy
gr.update(visible=(pol == "llm")), # llm_accordion
f"**Preset:** {label} — {blurb}", # preset_blurb
)
return _apply
for btn, preset in zip(preset_buttons, _PRESETS):
btn.click(
fn=_make_preset_handler(preset),
inputs=[],
outputs=[
dd_task,
cb_gen,
rd_diff,
nb_seed,
rd_policy,
llm_accordion,
preset_blurb,
],
)
# ── Tab 2: Compare policies ──────────────────────────
with gr.Tab("Compare policies"):
gr.Markdown(
"Run all four scripted policies on the **same task / seed** and see "
"the discrimination gradient at a glance. No provider calls, no LLM, "
"fully deterministic — this is the empirical evidence behind the "
"README's `+0.813` discrimination delta claim."
)
with gr.Row():
cmp_task = gr.Dropdown(
label="Task", choices=task_ids, value=default, scale=3
)
cmp_gen = gr.Checkbox(label="Generated", value=False, scale=1)
cmp_diff = gr.Radio(
["easy", "medium", "hard", "nightmare"],
label="Difficulty",
value="easy",
visible=False,
scale=2,
)
cmp_seed = gr.Number(
label="Seed", value=42, precision=0, visible=False, scale=1
)
btn_cmp = gr.Button("Run all 4 policies", variant="primary")
cmp_md = gr.Markdown("")
cmp_html = gr.HTML(label="Final-score comparison")
cmp_table = gr.Dataframe(
headers=["Policy", "Score", "Steps", "Summary"],
datatype=["str", "str", "number", "str"],
interactive=False,
wrap=True,
label="Per-policy summary",
)
btn_cmp.click(
fn=run_compare,
inputs=[cmp_task, cmp_gen, cmp_diff, cmp_seed],
outputs=[cmp_md, cmp_html, cmp_table],
)
# Reuses the visibility callback defined in the Run Episode tab.
cmp_gen.change(
fn=_toggle_generated,
inputs=[cmp_gen],
outputs=[cmp_diff, cmp_seed],
)
# ── Tab 3: Task Catalog ──────────────────────────────
with gr.Tab("Task Catalog"):
catalog_rows = []
for t in tasks:
# Distinct "<NETWORK> <reason code>" pairs across the task's cases.
nets = sorted(
{
f"{c.card_network.upper()} {c.network_reason_code}"
for c in t.cases
}
)
catalog_rows.append(
[
t.task_id,
t.title,
t.difficulty,
len(t.cases),
t.max_steps,
", ".join(nets),
t.objective,
]
)
gr.Dataframe(
value=catalog_rows,
headers=[
"Task ID",
"Title",
"Difficulty",
"Cases",
"Steps",
"Networks",
"Objective",
],
interactive=False,
wrap=True,
label=f"{len(tasks)}-Task Benchmark Catalog",
)
# ── Tab 4: Environment Info ───────────────────────────
with gr.Tab("Environment"):
gr.Markdown(_environment_tab_markdown())
# ── Tab 5: Rubric Tree ────────────────────────────────
with gr.Tab("Rubric Tree"):
gr.Markdown(
"Live introspection of `env.rubric.named_rubrics()` — the same composable "
"OpenEnv `Rubric` tree that grades every step. Weights and structure below "
"are read from the running environment, not hardcoded."
)
gr.HTML(_rubric_tree_html())
gr.Markdown(
"See [`docs/METHOD.md`](https://github.com/MitudruDutta/chargebackops/blob/main/docs/METHOD.md) "
"and [`docs/SPECIFICATION_GAMING.md`](https://github.com/MitudruDutta/chargebackops/blob/main/docs/SPECIFICATION_GAMING.md) "
"for the full design and the GRPO failure-mode write-up."
)
# ── Tab 6: Training Results ───────────────────────────
with gr.Tab("Training Results"):
gr.Markdown(_training_tab_markdown())
# (caption, file name) pairs; each file name is passed to
# ``_figure_data_uri`` and skipped with a link when that returns None.
for caption, fname in (
(
"**Cross-iteration training curve.** Iter 3 plateaued below the "
"heuristic at 0.728. Iter 5 plateaued *bit-exactly* at the heuristic "
"at 0.8132 — the signature of the eval-fallback exploit, not "
"convergent learning.",
"training_curve_cross_iter.png",
),
(
"**Iter-5 eval-score attribution.** The trained policy contributes "
"0.000 (every action is rejected by env validation). The eval rollout "
"helper's heuristic-fallback path contributes 0.8132 — i.e. all of it.",
"gaming_attribution.png",
),
(
"**Scripted-policy discrimination gradient.** The 8-dimension "
"`WeightedSum` plus the deadline `Gate` defeats every degenerate "
"policy: empty-packet zeros out, concede-all caps at 0.44, "
"escalate-all caps at 0.77.",
"discrimination_gradient.png",
),
(
"**8-dimension OpenEnv rubric weights**, grouped by category "
"(decision / packet / process / terminal). 40% of reward sits on "
"decision + terminal — where economically irrational policies "
"bleed money fastest.",
"rubric_weights.png",
),
(
"**Iter-5 per-difficulty curves.** Post-step-80 plateau is the "
"fallback heuristic across every difficulty band; see "
"SPECIFICATION_GAMING.md for the diagnosis.",
"training_curve_by_family.png",
),
):
src = _figure_data_uri(fname)
if src is None:
gr.Markdown(
f"_(figure `{fname}` not bundled — see "
f"[`docs/figures/{fname}`](https://github.com/MitudruDutta/chargebackops/blob/main/docs/figures/{fname}))_"
)
continue
gr.Markdown(caption)
gr.HTML(
f'
'
)
return demo
# ---------------------------------------------------------------------------
# Tab content builders (called once at app build; keep cheap)
# ---------------------------------------------------------------------------
def _environment_tab_markdown() -> str:
    """Build the Markdown shown on the Environment tab.

    Two figures are read live so they follow the source of truth: the action
    *count* in the first heading (from ``core.models.ActionType``) and the
    per-dimension weights in the grading table (from
    ``evaluation.rubrics.CASE_DIMENSION_WEIGHTS``). The grouped action names,
    the merchant-system list, the scripted-policy table and the card-network
    table are hard-coded in this function and must be edited by hand.

    Returns:
        A Markdown string containing the action space, merchant systems,
        grading table, scripted-policy table and card-network reason codes.
    """
    from typing import get_args

    # Relative import first with an absolute fallback: the same order the
    # module-level imports use, so the already-loaded ``core.models`` is
    # reused instead of importing a second copy under a different name.
    try:
        from ..core.models import ActionType  # type: ignore[attr-defined]
    except ImportError:  # pragma: no cover
        from core.models import ActionType  # type: ignore[attr-defined]

    # ``get_args`` is the public accessor for the members of a ``Literal``;
    # it yields ``()`` for objects that are not parameterised typing forms.
    actions: tuple[str, ...] = get_args(ActionType)
    n_actions = len(actions)

    # Display grouping of the action names (static, not derived from
    # ``ActionType``).
    r1 = (
        "select_case", "inspect_case", "query_system", "retrieve_policy",
        "add_evidence", "remove_evidence", "set_strategy",
        "submit_representment", "resolve_case",
    )
    r23 = ("respond_to_pre_arb", "escalate_to_arbitration", "accept_arbitration_loss")
    long_horizon = ("wait_for_updates",)

    def _join(items: tuple[str, ...]) -> str:
        """Format names as inline code separated by middle dots."""
        return " · ".join(f"`{name}`" for name in items)

    # One table row per rubric dimension; the three sequences are aligned by
    # index and ``zip`` stops at the shortest of them.
    rubric_rows = "\n".join(
        f"| {label} | {int(round(weight * 100))}% | {scoring} |"
        for label, weight, scoring in zip(
            _DIMENSION_LABELS, CASE_DIMENSION_WEIGHTS, _DIMENSION_SCORING
        )
    )

    return (
        f"## Action Space ({n_actions} typed actions)\n\n"
        f"**Round 1 — Representment:** {_join(r1)}\n\n"
        f"**Round 2/3 — Pre-arb & Arbitration:** {_join(r23)}\n\n"
        f"**Long-horizon backlog:** {_join(long_horizon)}\n\n"
        "## Merchant Systems (6)\n\n"
        "`orders` · `payment` · `shipping` · "
        "`support` · `refunds` · `risk`\n\n"
        "## Grading (8 dimensions)\n\n"
        "Weights are read live from `evaluation.rubrics.CASE_DIMENSION_WEIGHTS`.\n\n"
        "| Dimension | Weight | Scoring |\n"
        "|---|---|---|\n"
        f"{rubric_rows}\n\n"
        "## Scripted policies (Run Episode tab)\n\n"
        "| Policy | What it does | Headline avg |\n"
        "|---|---|---|\n"
        "| `naive` | Submit empty packet, no evidence, no policy work | 0.000 |\n"
        "| `concede_all` | Always set strategy `accept_chargeback` and resolve | 0.444 |\n"
        "| `escalate_all` | Contest like the heuristic, then always escalate | 0.767 |\n"
        "| `heuristic` | EV-rational, fully offline | **0.813** |\n\n"
        "## Card Networks\n\n"
        "| Reason Code | Visa | Mastercard |\n"
        "|---|---|---|\n"
        "| Goods Not Received | 13.1 (30 days) | 4855 (45 days) |\n"
        "| Fraud CNP | 10.4 (30 days) | 4837 (45 days) |\n"
        "| Credit Not Processed | 13.6 (30 days) | 4860 (45 days) |\n"
        "| Duplicate Processing | 12.4 (30 days) | 4834 (45 days) |\n"
        "| Product Not As Described | 13.3 (30 days) | 4853 (45 days) |\n"
        "| Service Not Provided | 13.1 (30 days) | 4855 (45 days) |\n"
    )
def _rubric_tree_html() -> str:
    """Render the live ``env.rubric.named_rubrics()`` tree as HTML.

    Also explicitly surfaces the deadline ``Gate(CaseAbandonedRubric)`` that
    sits on top of the per-case ``WeightedSum`` — OpenEnv's default walk
    iterates registered child rubrics only, and the Gate is a sibling of the
    aggregator inside :class:`CaseRubric`.

    If constructing the environment or walking its rubric raises for any
    reason (e.g. an old OpenEnv build), the exception is caught and a short
    "Could not introspect rubric tree" message naming the exception type and
    text is returned instead, so the demo never breaks on this tab.

    Returns:
        The row fragments concatenated into one string, or the fallback
        message described above.
    """
    # A throwaway environment is built only to read its rubric tree; the
    # walk is materialised into a list inside the ``try`` so that any lazy
    # failure is also caught here.
    try:
        env = ChargebackOpsEnvironment()
        named = list(env.rubric.named_rubrics())
    except Exception as exc: # pragma: no cover — defensive fallback
        return (
            f"Could not introspect rubric tree: "
            f"{type(exc).__name__}: {exc}"
        )
    # Map weights onto leaf rubrics by name. CASE_DIMENSION_NAMES is the
    # canonical order the WeightedSum was built with; weights align by index.
    weight_by_dim = dict(zip(CASE_DIMENSION_NAMES, CASE_DIMENSION_WEIGHTS))
    rows: list[str] = []
    # Header row: Path / Class / Weight-or-Role columns.
    rows.append(
        ""
        "| Path | Class | Weight / Role |
"
    )
    # Explicitly inject the deadline gate row above the aggregator subtree,
    # since some OpenEnv versions don't yield it via named_rubrics().
    deadline_gate_injected = False
    for path, rubric in named:
        cls_name = type(rubric).__name__
        # Only the first ``WeightedSum`` whose path ends in "aggregator"
        # triggers the synthetic gate row; the flag prevents repeats.
        if (
            not deadline_gate_injected
            and cls_name == "WeightedSum"
            and path.endswith("aggregator")
        ):
            # Everything before the last dot (the path itself if it has none).
            parent = path.rsplit(".", 1)[0]
            rows.append(
                f"{' ' * (parent.count('.') * 4 + 4)}"
                f"{parent}.deadline_gate | "
                f"Gate(CaseAbandonedRubric) | "
                f"hard-zero on miss |
"
            )
            deadline_gate_injected = True
        # Default marker for rubrics that carry no dimension weight.
        weight_str = "—"
        # A dimension matches when its snake_case name, converted to
        # CamelCase with a "Rubric" suffix, equals the rubric's class name;
        # the first match supplies the percentage.
        for dim_name, weight in weight_by_dim.items():
            tag = "".join(part.capitalize() for part in dim_name.split("_")) + "Rubric"
            if cls_name == tag:
                weight_str = f"{int(round(weight * 100))}%"
                break
        # Indentation grows by four units per dot in the rubric path.
        depth = path.count(".")
        indent = " " * (depth * 4)
        rows.append(
            f"{indent}{path or '(root)'} | "
            f"{cls_name} | "
            f"{weight_str} |
"
        )
    rows.append("
")
    return "".join(rows)
# ---------------------------------------------------------------------------
# Training Results helpers
# ---------------------------------------------------------------------------
def _figure_data_uri(filename: str) -> str | None:
    """Encode a bundled figure as an inline ``data:image/png`` URI.

    The file is looked up under ``_FIGURES_DIR``. Inlining the bytes keeps the
    Training Results tab independent of how the hosting platform (HF Spaces,
    a FastAPI sub-mount, ...) routes static assets.

    Returns:
        The base64 data URI, or ``None`` when the path is not a regular file
        or reading it raises ``OSError``.
    """
    figure = _FIGURES_DIR / filename
    if figure.is_file():
        try:
            raw = figure.read_bytes()
        except OSError:
            # The file vanished or is unreadable: treat it as not bundled.
            raw = None
        if raw is not None:
            return "data:image/png;base64," + base64.b64encode(raw).decode("ascii")
    return None
def _training_tab_markdown() -> str:
return (
"## Real training, end-to-end\n\n"
"**Pipeline.** Qwen2.5-3B fp16 + LoRA r=16 on a single Colab T4. Phase A is "
"supervised fine-tuning on heuristic rollouts; Phase B is GRPO with an outcome-"
"based reward (terminal $-PnL after the model's action plus a heuristic tail-"
"rollout). The training loop **connects to the live `ChargebackOpsEnvironment`** "
"— every gradient step is graded by the same rubric and same Issuer adversary "
"the eval uses. There is no static dataset shortcut.\n\n"
"**Five iterations, three failure modes.** Iter 1 produced total gradient "
"collapse (group reward variance ≈ 0). Iter 3 broke through to non-zero gradient "
"but plateaued at 0.728. **Iter 5 ran 200 GRPO steps and uncovered a reproducible "
"specification-gaming exploit** where the model emits invalid `accept_case` "
"actions, triggers the eval rollout helper's heuristic-fallback path, and "
"scores bit-exactly the heuristic baseline at 0.8132. The full diagnosis is in "
"[`SPECIFICATION_GAMING.md`](https://github.com/MitudruDutta/chargebackops/blob/main/docs/SPECIFICATION_GAMING.md).\n\n"
"**Honest trained-vs-untrained delta:** the SFT step at 0.536 — **+0.08 absolute, "
"+18% relative** over the untrained Qwen2.5-3B base — is the only legitimate "
"model-attributable improvement on iter 5. We document this honestly because "
"the failure mode itself is a research artefact future GRPO recipes can target "
"as a benchmark.\n\n"
"**Reproduce.** "
"[Latest training run (Colab — iter 5, 200 GRPO steps)](https://colab.research.google.com/drive/1GtLH6_b10oHlAnnGq4hnBkcGJ-pE_za5?usp=sharing) · "
"[Previous training run (Colab — iter 3, 62 GRPO steps)](https://colab.research.google.com/drive/1AjG3Sv7FnMeOSls6JMzTunkMzlJi_ySu?usp=sharing) · "
"[`notebooks/train_merchant_agent.ipynb`](https://github.com/MitudruDutta/chargebackops/blob/main/notebooks/train_merchant_agent.ipynb)\n"
)