Spaces:

mitudrudutta
/

ChargeBackOps

Sleeping

App Files Files Community

mitudrudutta committed on Apr 26

Commit

862cfc4

1 Parent(s): 4d7c179

Implement code changes to enhance functionality and improve performance

Browse files

Files changed (1) hide show

server/demo_ui.py +957 -78

server/demo_ui.py CHANGED Viewed

@@ -2,32 +2,164 @@
 from __future__ import annotations
 import os
-from typing import Any
-os.environ.setdefault("MPLCONFIGDIR", "/tmp/matplotlib")
 import gradio as gr
 try:
-    from ..evaluation.agent_brutal_audit import _bad_policy_action
     from ..runners.baseline_runner import (
         _heuristic_pick,
         _obvious_next_action,
         candidate_actions,
     )
-    from ..scenarios.simulation import list_tasks
     from .chargeback_ops_environment import ChargebackOpsEnvironment
 except ImportError:  # pragma: no cover
-    from evaluation.agent_brutal_audit import _bad_policy_action
     from runners.baseline_runner import (
         _heuristic_pick,
         _obvious_next_action,
         candidate_actions,
     )
-    from scenarios.simulation import list_tasks
     from server.chargeback_ops_environment import ChargebackOpsEnvironment
 # ---------------------------------------------------------------------------
 # CSS
@@ -205,7 +337,16 @@ _DEC_CLASS = {
 }
-def _round_panel_html(observation) -> str:
     vc = observation.visible_case
     if vc is None:
         return ""
@@ -221,14 +362,34 @@ def _round_panel_html(observation) -> str:
         f'</div>'
     )
-    if vc.last_issuer_decision:
         dec = vc.last_issuer_decision
         dec_cls = _DEC_CLASS.get(dec, "")
         dec_pretty = dec.replace("_", " ").title()
         body += f'<div class="issuer-decision {dec_cls}">Issuer: {dec_pretty}</div>'
-    if vc.last_issuer_rationale:
-        body += f'<div class="issuer-quote">&ldquo;{vc.last_issuer_rationale}&rdquo;</div>'
     if vc.pre_arb_evidence_added:
         ids = ", ".join(vc.pre_arb_evidence_added)
@@ -329,18 +490,218 @@ def _resolve_task_id(task_id: str, generated: bool, difficulty: str, seed: int)
     return task_id
 def run_episode(
-    task_id: str, generated: bool, difficulty: str, seed: int, policy: str = "heuristic"
 ):
     tid = _resolve_task_id(task_id, generated, difficulty, int(seed))
     env = ChargebackOpsEnvironment()
     obs = env.reset(task_id=tid, difficulty=difficulty, seed=int(seed))
-    max_steps = obs.info.get("current_task_max_steps", 10)
     rows: list[list[Any]] = []
-    policy_label = (
-        "Heuristic" if policy == "heuristic" else "Naive (concede-everything)"
-    )
     header = (
         f"### {obs.task_title}\n"
         f"`{obs.task_id}` &mdash; {len(obs.queue)} case(s), "
@@ -351,7 +712,7 @@ def run_episode(
         _queue_html(obs),
         _budget_html(0, max_steps, 0.0),
         [row[:] for row in rows],
-        _round_panel_html(obs),
         _arbitration_panel_html(obs),
         "",
         None,
@@ -359,19 +720,65 @@ def run_episode(
     step = 0
     while not obs.done:
-        if policy == "bad":
-            action = _bad_policy_action(obs)
-            summary_action = action
-        else:
-            payload = obs.model_dump()
-            cands = candidate_actions(payload)
-            if not cands:
-                break
-            pick = _obvious_next_action(payload, cands) or _heuristic_pick(cands)
-            action = pick.action
-            summary_action = pick.action
         step += 1
-        obs = env.step(action)
         rows.append(
             [
                 step,
@@ -380,7 +787,7 @@ def run_episode(
                 summary_action.system_name or "",
                 summary_action.strategy or "",
                 round(obs.reward or 0.0, 4),
-                obs.last_action_result,
             ]
         )
@@ -396,7 +803,7 @@ def run_episode(
             _queue_html(obs),
             _budget_html(step, max_steps, obs.progress_score),
             [row[:] for row in rows],
-            _round_panel_html(obs),
             _arbitration_panel_html(obs),
             grader,
             None,
@@ -404,19 +811,109 @@ def run_episode(
     report = obs.grader_report.model_dump() if obs.grader_report else None
     sc = f"{obs.grader_report.normalized_score:.3f}" if obs.grader_report else "n/a"
-    final_md = f"### Done &mdash; score **{sc}** in **{len(rows)}** steps"
     yield (
         final_md,
         _queue_html(obs),
         _budget_html(step, max_steps, obs.progress_score),
         [row[:] for row in rows],
-        _round_panel_html(obs),
         _arbitration_panel_html(obs),
         _grader_html(report),
         report,
     )
 # ---------------------------------------------------------------------------
 # Build Gradio app
 # ---------------------------------------------------------------------------
@@ -431,17 +928,38 @@ def build_demo() -> gr.Blocks:
         # Inject CSS (Gradio 6 moved css= to launch(); <style> tag works everywhere)
         gr.HTML(f"<style>{_CSS}</style>")
-        # Header
         gr.HTML(
             '<div class="dashboard-header">'
             "<h1>ChargebackOps</h1>"
-            "<p>Merchant chargeback dispute environment &mdash; OpenEnv benchmark</p>"
             "</div>"
         )
         with gr.Tabs():
             # ── Tab 1: Run Episode ────────────────────────────────
             with gr.Tab("Run Episode"):
                 with gr.Row():
                     dd_task = gr.Dropdown(
                         label="Task", choices=task_ids, value=default, scale=3
@@ -451,25 +969,60 @@ def build_demo() -> gr.Blocks:
                         ["easy", "medium", "hard", "nightmare"],
                         label="Difficulty",
                         value="easy",
                         scale=2,
                     )
-                    nb_seed = gr.Number(label="Seed", value=42, precision=0, scale=1)
                 with gr.Row():
                     rd_policy = gr.Radio(
-                        choices=[
-                            ("Heuristic (smart baseline)", "heuristic"),
-                            ("Naive (always concede)", "bad"),
-                        ],
                         value="heuristic",
                         label="Policy",
                         scale=4,
                     )
                     btn_run = gr.Button("Run Episode", variant="primary", scale=1)
                 md_status = gr.Markdown(
-                    "Pick a task + policy and click **Run Episode**. Compare **Heuristic** vs "
-                    "**Naive** to see how the 8-dimension rubric &mdash; including escalation ROI &mdash; "
-                    "separates an EV-rational agent from a lazy one."
                 )
                 with gr.Row(equal_height=True):
@@ -491,21 +1044,25 @@ def build_demo() -> gr.Blocks:
                     datatype=["number", "str", "str", "str", "str", "number", "str"],
                     interactive=False,
                     wrap=True,
-                    label="Step Trace",
                 )
                 with gr.Row(equal_height=True):
                     with gr.Column(scale=1):
-                        html_round = gr.HTML(label="Dispute Round")
                     with gr.Column(scale=1):
                         html_arb = gr.HTML(label="Arbitration")
                 html_grader = gr.HTML(label="Grader Report")
-                json_raw = gr.JSON(label="Raw JSON", visible=False)
                 btn_run.click(
                     fn=run_episode,
-                    inputs=[dd_task, cb_gen, rd_diff, nb_seed, rd_policy],
                     outputs=[
                         md_status,
                         html_queue,
@@ -518,7 +1075,104 @@ def build_demo() -> gr.Blocks:
                     ],
                 )
-            # ── Tab 2: Task Catalog ───────────────────────────────
             with gr.Tab("Task Catalog"):
                 catalog_rows = []
                 for t in tasks:
@@ -557,37 +1211,262 @@ def build_demo() -> gr.Blocks:
             # ── Tab 3: Environment Info ───────────────────────────
             with gr.Tab("Environment"):
                 gr.Markdown(
-                    "## Action Space (12 typed actions)\n\n"
-                    "**Round 1 — Representment:** `select_case` &middot; `inspect_case` &middot; "
-                    "`query_system` &middot; `retrieve_policy` &middot; `add_evidence` &middot; "
-                    "`remove_evidence` &middot; `set_strategy` &middot; `submit_representment` &middot; "
-                    "`resolve_case`\n\n"
-                    "**Round 2/3 — Pre-arb &amp; Arbitration:** `respond_to_pre_arb` &middot; "
-                    "`escalate_to_arbitration` &middot; `accept_arbitration_loss`\n\n"
-                    "## Merchant Systems (6)\n\n"
-                    "`orders` &middot; `payment` &middot; `shipping` &middot; "
-                    "`support` &middot; `refunds` &middot; `risk`\n\n"
-                    "## Grading (8 dimensions)\n\n"
-                    "| Dimension | Weight | Scoring |\n"
-                    "|---|---|---|\n"
-                    "| Strategy Correctness | 20% | 1.0 optimal, 0.35 acceptable, 0.0 wrong |\n"
-                    "| Evidence Quality | 15% | Required + helpful coverage, harmful penalty |\n"
-                    "| Packet Validity | 10% | Binary: all required, zero harmful |\n"
-                    "| Deadline Compliance | 10% | Binary: resolved before deadline |\n"
-                    "| Efficiency | 10% | Penalises waste, rewards early concession |\n"
-                    "| Outcome Quality | 10% | 1.0 optimal, 0.4 acceptable, 0.0 wrong |\n"
-                    "| Note Quality | 5% | Policy keywords + evidence refs |\n"
-                    "| Escalation ROI | 20% | EV-rational arbitration: P(win)·amount vs $250 fee |\n\n"
-                    "## Card Networks\n\n"
-                    "| Reason Code | Visa | Mastercard |\n"
-                    "|---|---|---|\n"
-                    "| Goods Not Received | 13.1 (30 days) | 4855 (45 days) |\n"
-                    "| Fraud CNP | 10.4 (30 days) | 4837 (45 days) |\n"
-                    "| Credit Not Processed | 13.6 (30 days) | 4860 (45 days) |\n"
-                    "| Duplicate Processing | 12.4 (30 days) | 4834 (45 days) |\n"
-                    "| Product Not As Described | 13.3 (30 days) | 4853 (45 days) |\n"
-                    "| Service Not Provided | 13.1 (30 days) | 4855 (45 days) |\n"
                 )
     return demo

 from __future__ import annotations
+import base64
 import os
+from pathlib import Path
+from typing import Any, Callable
+# Ensure matplotlib has a writable config dir on locked-down hosts (e.g. HF
+# Spaces). Guarded so importing this module from a notebook doesn't pollute
+# the user's environment unnecessarily.
+if not os.environ.get("MPLCONFIGDIR"):
+    os.environ["MPLCONFIGDIR"] = "/tmp/matplotlib"
 import gradio as gr
 try:
+    from ..core.models import ChargebackOpsAction
+    from ..evaluation.rubrics import (
+        CASE_DIMENSION_NAMES,
+        CASE_DIMENSION_WEIGHTS,
+    )
     from ..runners.baseline_runner import (
         _heuristic_pick,
         _obvious_next_action,
         candidate_actions,
     )
+    from ..runners.benchmark_runner import POLICY_REGISTRY
+    from ..scenarios.simulation import get_task, list_tasks
     from .chargeback_ops_environment import ChargebackOpsEnvironment
 except ImportError:  # pragma: no cover
+    from core.models import ChargebackOpsAction
+    from evaluation.rubrics import (
+        CASE_DIMENSION_NAMES,
+        CASE_DIMENSION_WEIGHTS,
+    )
     from runners.baseline_runner import (
         _heuristic_pick,
         _obvious_next_action,
         candidate_actions,
     )
+    from runners.benchmark_runner import POLICY_REGISTRY
+    from scenarios.simulation import get_task, list_tasks
     from server.chargeback_ops_environment import ChargebackOpsEnvironment
+# OpenAI-compatible LLM policy is optional — the demo gracefully degrades to
+# scripted policies if the openai SDK or runners.inference is unavailable.
+try:  # pragma: no cover — exercised only when LLM policy is selected
+    from openai import OpenAI  # noqa: F401
+    try:
+        from ..runners.inference import _pick_with_openai_client
+    except ImportError:
+        from runners.inference import _pick_with_openai_client
+    _LLM_POLICY_AVAILABLE = True
+except Exception:  # pragma: no cover
+    _pick_with_openai_client = None  # type: ignore[assignment]
+    _LLM_POLICY_AVAILABLE = False
+# Path to the bundled hero figures (used by the Training Results tab).
+_FIGURES_DIR = Path(__file__).resolve().parents[1] / "docs" / "figures"
+# ---------------------------------------------------------------------------
+# Static metadata
+# ---------------------------------------------------------------------------
+# Human-readable display labels for the 8 rubric dimensions (in canonical order).
+_DIMENSION_LABELS: tuple[str, ...] = (
+    "Strategy Correctness",
+    "Evidence Quality",
+    "Packet Validity",
+    "Deadline Compliance",
+    "Efficiency",
+    "Outcome Quality",
+    "Note Quality",
+    "Escalation ROI",
+)
+# Per-dimension scoring summary (kept short so the table fits on one screen).
+_DIMENSION_SCORING: tuple[str, ...] = (
+    "1.0 optimal · 0.35 acceptable · 0.0 wrong",
+    "Required + helpful coverage; harmful evidence penalised",
+    "Binary: all required evidence + zero harmful",
+    "Binary: case resolved before deadline",
+    "Penalises waste; rewards early concession",
+    "1.0 optimal · 0.4 acceptable · 0.0 wrong",
+    "Policy keywords + evidence references",
+    "EV-rational arbitration: P(win)·amount vs $250 fee",
+)
+# Selectable scripted policies (label shown to user → registry key).
+# Order is intentional: best → worst, so radio top-to-bottom reads as a
+# discrimination ladder.
+_POLICY_CHOICES: tuple[tuple[str, str], ...] = (
+    ("Heuristic — EV-rational baseline", "heuristic"),
+    ("Escalate-all — contest then always escalate", "escalate_all"),
+    ("Concede-all — always accept the chargeback", "concede_all"),
+    ("Naive — submit empty packet, no evidence", "naive"),
+    ("LLM (OpenAI-compatible API)", "llm"),
+)
+_POLICY_LABEL_BY_KEY: dict[str, str] = {
+    key: label for label, key in _POLICY_CHOICES
+}
+# Subset used by the Compare tab — scripted-only, deterministic, no API calls.
+_COMPARE_POLICIES: tuple[str, ...] = (
+    "naive",
+    "concede_all",
+    "escalate_all",
+    "heuristic",
+)
+# One-click presets for the Run-Episode tab. Each preset is
+# (button_label, task_id, generated_flag, difficulty, seed, recommended_policy, blurb).
+_PRESETS: tuple[tuple[str, str, bool, str, int, str, str], ...] = (
+    (
+        "Easy contestable",
+        "goods_not_received_easy",
+        False,
+        "easy",
+        42,
+        "heuristic",
+        "Goods-not-received with strong evidence — heuristic should win round 1.",
+    ),
+    (
+        "Queue optimization (hard)",
+        "queue_optimization_hard",
+        False,
+        "hard",
+        42,
+        "heuristic",
+        "Triage a heterogeneous queue under tight deadlines — exercises EV reasoning.",
+    ),
+    (
+        "Long-horizon backlog",
+        "monthly_dispute_backlog_marathon",
+        False,
+        "medium",
+        42,
+        "heuristic",
+        "12 cases over 60 steps with delayed evidence; tests scheduling + waiting.",
+    ),
+    (
+        "Generated nightmare",
+        "generated_nightmare_s31",
+        True,
+        "nightmare",
+        31,
+        "heuristic",
+        "Adversarial parametric task — even the heuristic struggles.",
+    ),
+    (
+        "Compare all 4 policies",
+        "goods_not_received_easy",
+        False,
+        "easy",
+        42,
+        "heuristic",
+        "Open the Compare tab — same task, all four scripted policies side-by-side.",
+    ),
+)
 # ---------------------------------------------------------------------------
 # CSS
 }
+def _round_panel_html(
+    observation, history: list[dict[str, str]] | None = None
+) -> str:
+    """Render the visible case's round panel, including a chronological
+    issuer-message log so multi-round disputes show every R1/R2/R3 message.
+    ``history`` is a list of ``{round, decision, rationale}`` dicts the caller
+    accumulates across steps.
+    """
     vc = observation.visible_case
     if vc is None:
         return ""
         f'</div>'
     )
+    # Show full issuer-message history if we have it, else fall back to the
+    # last-message snapshot from the observation.
+    rendered_any = False
+    if history:
+        for entry in history:
+            ent_rnd = entry.get("round", "?")
+            ent_dec = entry.get("decision") or ""
+            ent_rat = entry.get("rationale") or ""
+            ent_badge_cls = f"round-{min(int(ent_rnd) if str(ent_rnd).isdigit() else 1, 3)}"
+            dec_cls = _DEC_CLASS.get(ent_dec, "")
+            dec_pretty = ent_dec.replace("_", " ").title() if ent_dec else "(no decision)"
+            body += (
+                f'<div style="margin-top:8px;">'
+                f'<span class="round-badge {ent_badge_cls}">R{ent_rnd}</span>'
+                f'<span class="issuer-decision {dec_cls}">Issuer: {dec_pretty}</span>'
+                f'</div>'
+            )
+            if ent_rat:
+                body += f'<div class="issuer-quote">&ldquo;{ent_rat}&rdquo;</div>'
+            rendered_any = True
+    if not rendered_any and vc.last_issuer_decision:
         dec = vc.last_issuer_decision
         dec_cls = _DEC_CLASS.get(dec, "")
         dec_pretty = dec.replace("_", " ").title()
         body += f'<div class="issuer-decision {dec_cls}">Issuer: {dec_pretty}</div>'
+        if vc.last_issuer_rationale:
+            body += f'<div class="issuer-quote">&ldquo;{vc.last_issuer_rationale}&rdquo;</div>'
     if vc.pre_arb_evidence_added:
         ids = ", ".join(vc.pre_arb_evidence_added)
     return task_id
def _build_llm_policy(
    base_url: str, api_key: str, model_name: str
) -> tuple[Callable[[dict[str, Any]], ChargebackOpsAction | None], str]:
    """Return ``(policy_fn, label)`` calling an OpenAI-compatible chat model.

    The policy mirrors the production inference pipeline in
    :mod:`runners.inference`: candidate generation + obvious-action shortcut +
    LLM pick over the shortlist. On any LLM failure (network, parse, missing
    key) it falls back to the heuristic so the demo never freezes mid-stream.

    UI fields take precedence; blanks fall back to environment variables so
    HF Space operators can wire credentials via Space Secrets without the
    public demo asking visitors for keys:

    * endpoint: ``API_BASE_URL``, else Groq / OpenRouter when their key
      variable is set, else the Hugging Face router;
    * key: the variable belonging to the resolved provider first
      (``GROQ_API_KEY`` / ``OPENROUTER_API_KEY`` / ``HF_TOKEN``), then the
      generic chain ``HF_TOKEN`` / ``API_KEY`` / ``OPENROUTER_API_KEY`` /
      ``GROQ_API_KEY``;
    * model: ``MODEL_NAME``, else a per-provider default.

    Args:
        base_url: OpenAI-compatible endpoint typed in the UI (may be blank).
        api_key: API key typed in the UI (may be blank).
        model_name: Model identifier typed in the UI (may be blank).

    Returns:
        A callable mapping a dumped observation to the next action (or
        ``None`` when there are no candidate actions), and a display label.

    Raises:
        RuntimeError: If the LLM dependencies are unavailable or no API key
            can be resolved.
    """
    if not _LLM_POLICY_AVAILABLE or _pick_with_openai_client is None:
        raise RuntimeError(
            "openai SDK is not available — install `openai` to use the LLM policy."
        )

    base_url = (base_url or "").strip()
    api_key = (api_key or "").strip()
    model_name = (model_name or "").strip()

    # --- Endpoint / provider -------------------------------------------
    # Resolve the provider from an explicit base URL first, then from which
    # key variable is set in the environment, so a sensible default model
    # can be chosen even when only a key is provided.
    provider = ""
    if not base_url:
        base_url = os.getenv("API_BASE_URL", "").strip()
    if base_url:
        lowered = base_url.lower()
        if "groq" in lowered:
            provider = "groq"
        elif "openrouter" in lowered:
            provider = "openrouter"
        elif "huggingface" in lowered or "hf.space" in lowered:
            provider = "hf"
        elif "openai.com" in lowered:
            provider = "openai"
    elif os.getenv("GROQ_API_KEY"):
        base_url, provider = "https://api.groq.com/openai/v1", "groq"
    elif os.getenv("OPENROUTER_API_KEY"):
        base_url, provider = "https://openrouter.ai/api/v1", "openrouter"
    else:
        base_url, provider = "https://router.huggingface.co/v1", "hf"

    # --- API key ---------------------------------------------------------
    # The key must match the endpoint chosen above. Previously the key was
    # picked with HF_TOKEN first while the endpoint was picked with
    # GROQ_API_KEY first, so having both set sent the HF token to Groq.
    # Prefer the resolved provider's own variable, then the generic chain.
    if not api_key:
        provider_key_vars = {
            "groq": ("GROQ_API_KEY",),
            "openrouter": ("OPENROUTER_API_KEY",),
            "hf": ("HF_TOKEN",),
        }
        generic_key_vars = (
            "HF_TOKEN",
            "API_KEY",
            "OPENROUTER_API_KEY",
            "GROQ_API_KEY",
        )
        for var in (*provider_key_vars.get(provider, ()), *generic_key_vars):
            candidate = (os.getenv(var) or "").strip()
            if candidate:
                api_key = candidate
                break
    if not api_key:
        raise RuntimeError(
            "No API key — type one in the UI, or set HF_TOKEN / API_KEY / "
            "OPENROUTER_API_KEY / GROQ_API_KEY in the environment (HF Space "
            "Secrets work too)."
        )

    # --- Model -----------------------------------------------------------
    if not model_name:
        model_name = os.getenv("MODEL_NAME", "").strip()
    if not model_name:
        # Provider-appropriate defaults so the user does not have to look up
        # a model card; unknown providers get the HF default.
        provider_defaults = {
            "groq": "llama-3.3-70b-versatile",
            "openrouter": "meta-llama/llama-3.1-8b-instruct:free",
            "openai": "gpt-4o-mini",
            "hf": "Qwen/Qwen2.5-72B-Instruct",
        }
        model_name = provider_defaults.get(provider, "Qwen/Qwen2.5-72B-Instruct")

    client = OpenAI(
        base_url=base_url,
        api_key=api_key,
        timeout=15.0,
        max_retries=0,
    )

    def policy_fn(observation: dict[str, Any]) -> ChargebackOpsAction | None:
        cands = candidate_actions(observation)
        if not cands:
            return None
        if len(cands) == 1:
            # Nothing to choose between — skip the provider call.
            return cands[0].action
        obvious = _obvious_next_action(observation, cands)
        if obvious is not None:
            return obvious.action
        try:
            pick, _ok, _err = _pick_with_openai_client(
                client, model_name, observation, cands
            )
            return pick.action
        except Exception:
            # Deliberate best-effort: any LLM failure degrades to the
            # heuristic instead of halting the streamed episode.
            return _heuristic_pick(cands).action

    label = f"LLM ({model_name})"
    return policy_fn, label
+def _result_badge(result: str | None) -> str:
+    """Prefix a step result string with a status emoji for fast scanning.
+    Distinguishes accepted/no-op/rejected so the trace dataframe self-narrates.
+    """
+    if not result:
+        return "· (no result)"
+    text = str(result)
+    lowered = text.lower()
+    if "error" in lowered or "reject" in lowered or "invalid" in lowered or "fail" in lowered:
+        return f"✗ {text}"
+    if "no-op" in lowered or "noop" in lowered or "ignored" in lowered or "skipped" in lowered:
+        return f"⚠ {text}"
+    return f"✓ {text}"
+def _resolve_max_steps(observation, task_id: str) -> int:
+    """Pull the task budget from the observation; fall back to the task definition.
+    The legacy implementation defaulted to 10 if the observation field was absent,
+    which silently mis-rendered the budget bar. The env always populates
+    ``info.current_task_max_steps`` after ``reset``; if it ever doesn't, we read
+    the task object directly so the bar still reflects truth.
+    """
+    cap = observation.info.get("current_task_max_steps")
+    if isinstance(cap, int) and cap > 0:
+        return cap
+    try:
+        return int(get_task(task_id).max_steps)
+    except Exception:  # pragma: no cover — defensive
+        return 60
 def run_episode(
+    task_id: str,
+    generated: bool,
+    difficulty: str,
+    seed: int,
+    policy: str = "heuristic",
+    llm_base_url: str = "",
+    llm_api_key: str = "",
+    llm_model: str = "",
 ):
     tid = _resolve_task_id(task_id, generated, difficulty, int(seed))
     env = ChargebackOpsEnvironment()
     obs = env.reset(task_id=tid, difficulty=difficulty, seed=int(seed))
+    max_steps = _resolve_max_steps(obs, tid)
     rows: list[list[Any]] = []
+    policy_fn: Callable[[dict[str, Any]], ChargebackOpsAction | None] | None = None
+    if policy == "llm":
+        try:
+            policy_fn, policy_label = _build_llm_policy(
+                llm_base_url, llm_api_key, llm_model
+            )
+        except Exception as exc:
+            err_md = (
+                f"### LLM policy unavailable\n"
+                f"`{type(exc).__name__}: {exc}`\n\n"
+                f"Falling back to **heuristic** for this run."
+            )
+            policy = "heuristic"
+            policy_fn = POLICY_REGISTRY["heuristic"]
+            policy_label = _POLICY_LABEL_BY_KEY[policy]
+            yield (
+                err_md,
+                _queue_html(obs),
+                _budget_html(0, max_steps, 0.0),
+                [],
+                "",
+                "",
+                "",
+                None,
+            )
+    if policy_fn is None:
+        policy_fn = POLICY_REGISTRY.get(policy) or POLICY_REGISTRY["heuristic"]
+        if policy not in POLICY_REGISTRY:
+            policy = "heuristic"
+        policy_label = _POLICY_LABEL_BY_KEY.get(policy, policy)
+    # Per-case issuer-message log: case_id -> [{"round","decision","rationale"}]
+    issuer_log: dict[str, list[dict[str, str]]] = {}
+    def _maybe_log_issuer_msg(observation) -> None:
+        vc = observation.visible_case
+        if vc is None or not vc.last_issuer_decision:
+            return
+        log = issuer_log.setdefault(vc.case_id, [])
+        entry = {
+            "round": str(vc.round_number or 1),
+            "decision": vc.last_issuer_decision or "",
+            "rationale": vc.last_issuer_rationale or "",
+        }
+        # Avoid duplicating the same message on adjacent steps.
+        if not log or log[-1] != entry:
+            log.append(entry)
+    def _current_history(observation) -> list[dict[str, str]]:
+        vc = observation.visible_case
+        if vc is None:
+            return []
+        return issuer_log.get(vc.case_id, [])
     header = (
         f"### {obs.task_title}\n"
         f"`{obs.task_id}` &mdash; {len(obs.queue)} case(s), "
         _queue_html(obs),
         _budget_html(0, max_steps, 0.0),
         [row[:] for row in rows],
+        _round_panel_html(obs, _current_history(obs)),
         _arbitration_panel_html(obs),
         "",
         None,
     step = 0
     while not obs.done:
+        payload = obs.model_dump()
+        try:
+            action = policy_fn(payload)
+        except Exception as exc:  # pragma: no cover — surface in UI
+            err_md = (
+                f"### Policy error\n"
+                f"`{policy}` raised `{type(exc).__name__}: {exc}` on step {step + 1}. "
+                f"Halting episode."
+            )
+            yield (
+                err_md,
+                _queue_html(obs),
+                _budget_html(step, max_steps, obs.progress_score),
+                [row[:] for row in rows],
+                _round_panel_html(obs, _current_history(obs)),
+                _arbitration_panel_html(obs),
+                "",
+                None,
+            )
+            return
+        if action is None:
+            break
+        summary_action = action
         step += 1
+        try:
+            obs = env.step(action)
+        except Exception as exc:  # pragma: no cover — surface in UI
+            err_md = (
+                f"### Environment error\n"
+                f"`env.step({summary_action.action_type})` raised "
+                f"`{type(exc).__name__}: {exc}` on step {step}. "
+                f"Halting episode."
+            )
+            rows.append(
+                [
+                    step,
+                    summary_action.action_type,
+                    summary_action.case_id or "",
+                    summary_action.system_name or "",
+                    summary_action.strategy or "",
+                    0.0,
+                    f"✗ error: {type(exc).__name__}",
+                ]
+            )
+            yield (
+                err_md,
+                _queue_html(obs),
+                _budget_html(step, max_steps, obs.progress_score),
+                [row[:] for row in rows],
+                _round_panel_html(obs, _current_history(obs)),
+                _arbitration_panel_html(obs),
+                "",
+                None,
+            )
+            return
+        _maybe_log_issuer_msg(obs)
         rows.append(
             [
                 step,
                 summary_action.system_name or "",
                 summary_action.strategy or "",
                 round(obs.reward or 0.0, 4),
+                _result_badge(obs.last_action_result),
             ]
         )
             _queue_html(obs),
             _budget_html(step, max_steps, obs.progress_score),
             [row[:] for row in rows],
+            _round_panel_html(obs, _current_history(obs)),
             _arbitration_panel_html(obs),
             grader,
             None,
     report = obs.grader_report.model_dump() if obs.grader_report else None
     sc = f"{obs.grader_report.normalized_score:.3f}" if obs.grader_report else "n/a"
+    final_md = (
+        f"### Done &mdash; score **{sc}** in **{len(rows)}** steps "
+        f"&middot; policy: **{policy_label}**"
+    )
     yield (
         final_md,
         _queue_html(obs),
         _budget_html(step, max_steps, obs.progress_score),
         [row[:] for row in rows],
+        _round_panel_html(obs, _current_history(obs)),
         _arbitration_panel_html(obs),
         _grader_html(report),
         report,
     )
+# ---------------------------------------------------------------------------
+# Compare tab — run all four scripted policies on the same task in series and
+# render a single side-by-side bar chart of the final scores plus a per-case
+# per-dimension breakdown.
+# ---------------------------------------------------------------------------
def _run_one_episode_sync(
    task_id: str,
    policy_key: str,
    difficulty: str | None = None,
    seed: int | None = None,
) -> dict[str, Any]:
    """Synchronously run a single scripted-policy episode and return a summary.

    Args:
        task_id: Resolved task identifier passed to ``env.reset``.
        policy_key: Key into ``POLICY_REGISTRY``.
        difficulty: Optional difficulty forwarded to ``env.reset``; omitted
            from the call when ``None``.
        seed: Optional seed forwarded to ``env.reset``; omitted when ``None``.

    Returns:
        Dict with ``policy``, ``score`` (0.0 when no grader report exists),
        ``steps`` and ``summary``. If the policy or the environment raised,
        ``summary`` starts with an ``aborted`` note naming the exception
        instead of hiding the failure behind a bare 0.0 score.
    """
    env = ChargebackOpsEnvironment()
    # Forward difficulty/seed only when supplied so existing two-argument
    # callers keep the exact reset call they had before.
    reset_kwargs: dict[str, Any] = {"task_id": task_id}
    if difficulty is not None:
        reset_kwargs["difficulty"] = difficulty
    if seed is not None:
        reset_kwargs["seed"] = int(seed)
    obs = env.reset(**reset_kwargs)

    policy_fn = POLICY_REGISTRY[policy_key]
    steps = 0
    error = ""
    while not obs.done:
        try:
            action = policy_fn(obs.model_dump())
            if action is None:
                break
            obs = env.step(action)
        except Exception as exc:
            # Best-effort: stop this episode but keep the comparison going,
            # and remember why so the results table can show it.
            error = f"{type(exc).__name__}: {exc}"
            break
        steps += 1

    report = obs.grader_report
    summary = report.summary if report else ""
    if error:
        note = f"aborted after {steps} step(s): {error}"
        summary = f"{note} | {summary}" if summary else note
    return {
        "policy": policy_key,
        "score": float(report.normalized_score if report else 0.0),
        "steps": steps,
        "summary": summary,
    }


def run_compare(task_id: str, generated: bool, difficulty: str, seed: int):
    """Run all four scripted policies on the same task and render a chart.

    Args:
        task_id: Task chosen in the UI.
        generated: Whether a generated task was requested.
        difficulty: Difficulty chosen in the UI.
        seed: Seed chosen in the UI.

    Returns:
        ``(markdown, chart_html, table_rows)`` where ``table_rows`` holds one
        ``[policy, score, steps, summary]`` row per compared policy.
    """
    from html import escape  # local import: only needed for the task label

    tid = _resolve_task_id(task_id, generated, difficulty, int(seed))
    # Reset with the same difficulty/seed that ``run_episode`` uses so both
    # tabs evaluate the same episode for identical inputs.
    results = [
        _run_one_episode_sync(tid, p, difficulty=difficulty, seed=int(seed))
        for p in _COMPARE_POLICIES
    ]

    # Bar-chart HTML (CSS-only, no extra deps). Bars are scaled relative to
    # the best score; fall back to 1.0 when every score is zero.
    max_score = max((r["score"] for r in results), default=1.0) or 1.0
    scale = max(0.001, max_score)
    bar_parts = []
    for r in results:
        # Clamp so an out-of-range score cannot produce an invalid CSS width.
        pct = min(100, max(0, int(round(100 * r["score"] / scale))))
        color = _score_color(r["score"])
        bar_parts.append(
            f'<div class="bar-row" style="margin:6px 0;">'
            f'<span class="bar-label" style="width:130px;">{r["policy"]}</span>'
            f'<div class="bar-track" style="flex:1;height:22px;">'
            f'<div class="bar-fill" style="width:{pct}%;background:{color};height:100%;"></div>'
            f'</div>'
            f'<span class="bar-value" style="width:120px;">'
            f'{r["score"]:.3f} · {r["steps"]} steps</span>'
            f'</div>'
        )
    bars = "".join(bar_parts)

    # Discrimination delta. ``:+.3f`` emits the correct sign; the previous
    # hard-coded "+" rendered negative deltas as "+-0.123".
    by_policy = {r["policy"]: r["score"] for r in results}
    delta = by_policy.get("heuristic", 0.0) - by_policy.get("naive", 0.0)
    title = (
        f'<div style="margin:8px 0;font-size:14px;">'
        f'<b>Task</b>: <code>{escape(str(tid))}</code> &middot; '
        f'<b>Discrimination delta</b> (heuristic − naive) = '
        f'<span style="color:{_score_color(delta)};">'
        f'<b>{delta:+.3f}</b></span>'
        f'</div>'
    )
    md = (
        "### Side-by-side: 4 scripted policies on the same task\n"
        "Same `task_id`, same `seed`, no provider calls. The discrimination "
        "gradient (`naive` → `concede_all` → `escalate_all` → `heuristic`) "
        "is the empirical evidence behind the README's `+0.813` claim."
    )
    table_rows = [
        [r["policy"], f"{r['score']:.3f}", r["steps"], r["summary"]]
        for r in results
    ]
    return md, title + '<div style="padding:8px 0;">' + bars + "</div>", table_rows
 # ---------------------------------------------------------------------------
 # Build Gradio app
 # ---------------------------------------------------------------------------
         # Inject CSS (Gradio 6 moved css= to launch(); <style> tag works everywhere)
         gr.HTML(f"<style>{_CSS}</style>")
+        # Header + context links
         gr.HTML(
             '<div class="dashboard-header">'
             "<h1>ChargebackOps</h1>"
+            "<p>Merchant chargeback dispute environment &mdash; an OpenEnv benchmark for "
+            "cost-asymmetric multi-round LLM agents</p>"
+            '<div style="margin-top:8px;">'
+            '<a href="https://github.com/MitudruDutta/chargebackops" target="_blank" '
+            'style="margin:0 6px;color:#3b82f6;text-decoration:none;">📦 GitHub</a> '
+            '<a href="https://huggingface.co/spaces/mitudrudutta/ChargeBackOps" target="_blank" '
+            'style="margin:0 6px;color:#FFD21E;text-decoration:none;">🤗 HF Space</a> '
+            '<a href="https://youtu.be/7dz37JTTMo4" target="_blank" '
+            'style="margin:0 6px;color:#FF0000;text-decoration:none;">📺 Walkthrough</a> '
+            '<a href="https://colab.research.google.com/drive/1GtLH6_b10oHlAnnGq4hnBkcGJ-pE_za5" target="_blank" '
+            'style="margin:0 6px;color:#F9AB00;text-decoration:none;">🧪 Training Colab</a> '
+            '<a href="https://github.com/meta-pytorch/OpenEnv" target="_blank" '
+            'style="margin:0 6px;color:#0668E1;text-decoration:none;">🦙 Meta OpenEnv</a>'
+            "</div>"
             "</div>"
         )
         with gr.Tabs():
             # ── Tab 1: Run Episode ────────────────────────────────
             with gr.Tab("Run Episode"):
+                # Preset buttons row — one-click task+policy configuration.
+                gr.Markdown("**Quick presets** — click any to load a known-good configuration.")
+                with gr.Row():
+                    preset_buttons = [
+                        gr.Button(p[0], size="sm", scale=1) for p in _PRESETS
+                    ]
+                preset_blurb = gr.Markdown("")
                 with gr.Row():
                     dd_task = gr.Dropdown(
                         label="Task", choices=task_ids, value=default, scale=3
                         ["easy", "medium", "hard", "nightmare"],
                         label="Difficulty",
                         value="easy",
+                        visible=False,
                         scale=2,
                     )
+                    nb_seed = gr.Number(
+                        label="Seed", value=42, precision=0, visible=False, scale=1
+                    )
                 with gr.Row():
                     rd_policy = gr.Radio(
+                        choices=list(_POLICY_CHOICES),
                         value="heuristic",
                         label="Policy",
                         scale=4,
                     )
                     btn_run = gr.Button("Run Episode", variant="primary", scale=1)
+                # LLM-policy inputs — only visible when "LLM" is selected.
+                with gr.Accordion(
+                    "LLM policy settings (used when 'LLM' is selected above)",
+                    open=False,
+                    visible=False,
+                ) as llm_accordion:
+                    gr.Markdown(
+                        "Bring your own OpenAI-compatible endpoint. Defaults match the "
+                        "Hugging Face router; OpenRouter, Groq, Together, Fireworks, "
+                        "and Anthropic-compatible gateways all work. **Leave fields "
+                        "blank** to inherit `HF_TOKEN` / `OPENROUTER_API_KEY` / "
+                        "`GROQ_API_KEY` / `API_BASE_URL` / `MODEL_NAME` from the "
+                        "environment (set them as Space Secrets when deploying)."
+                    )
+                    with gr.Row():
+                        tb_llm_base = gr.Textbox(
+                            label="Base URL",
+                            value="https://router.huggingface.co/v1",
+                            scale=2,
+                        )
+                        tb_llm_model = gr.Textbox(
+                            label="Model",
+                            value="Qwen/Qwen2.5-72B-Instruct",
+                            scale=2,
+                        )
+                        tb_llm_key = gr.Textbox(
+                            label="API key",
+                            value="",
+                            type="password",
+                            scale=2,
+                        )
                 md_status = gr.Markdown(
+                    "Pick a task + policy and click **Run Episode**. Run the same task "
+                    "under each of the four scripted policies (heuristic, escalate-all, "
+                    "concede-all, naive) to reproduce the discrimination gradient — naive "
+                    "→ 0.000, concede-all → ~0.44, escalate-all → ~0.77, heuristic → ~0.81. "
+                    "Or pick **LLM** and bring your own model. For a side-by-side view, "
+                    "open the **Compare policies** tab."
                 )
                 with gr.Row(equal_height=True):
                     datatype=["number", "str", "str", "str", "str", "number", "str"],
                     interactive=False,
                     wrap=True,
+                    label="Step Trace (✓ accepted · ⚠ no-op · ✗ rejected)",
                 )
                 with gr.Row(equal_height=True):
                     with gr.Column(scale=1):
+                        html_round = gr.HTML(label="Dispute Round (issuer messages)")
                     with gr.Column(scale=1):
                         html_arb = gr.HTML(label="Arbitration")
                 html_grader = gr.HTML(label="Grader Report")
+                with gr.Accordion("Raw grader JSON (export-friendly)", open=False):
+                    json_raw = gr.JSON(label="Raw JSON", show_label=False)
                 btn_run.click(
                     fn=run_episode,
+                    inputs=[
+                        dd_task, cb_gen, rd_diff, nb_seed, rd_policy,
+                        tb_llm_base, tb_llm_key, tb_llm_model,
+                    ],
                     outputs=[
                         md_status,
                         html_queue,
                     ],
                 )
+                # Generated-checkbox visibility callback.
+                def _toggle_generated(generated: bool):
+                    return (
+                        gr.update(visible=generated),
+                        gr.update(visible=generated),
+                    )
+                cb_gen.change(
+                    fn=_toggle_generated,
+                    inputs=[cb_gen],
+                    outputs=[rd_diff, nb_seed],
+                )
+                # Show LLM accordion only when 'llm' policy is selected.
+                def _toggle_llm(policy: str):
+                    return gr.update(visible=(policy == "llm"), open=(policy == "llm"))
+                rd_policy.change(
+                    fn=_toggle_llm, inputs=[rd_policy], outputs=[llm_accordion]
+                )
+                # Wire each preset button to populate the inputs atomically.
+                def _make_preset_handler(preset):
+                    label, t_id, gen, diff, seed_v, pol, blurb = preset
+                    def _apply():
+                        return (
+                            t_id,                              # dd_task
+                            gen,                               # cb_gen
+                            gr.update(value=diff, visible=gen),  # rd_diff
+                            gr.update(value=seed_v, visible=gen),  # nb_seed
+                            pol,                               # rd_policy
+                            gr.update(visible=(pol == "llm")),  # llm_accordion
+                            f"**Preset:** {label} — {blurb}",   # preset_blurb
+                        )
+                    return _apply
+                for btn, preset in zip(preset_buttons, _PRESETS):
+                    btn.click(
+                        fn=_make_preset_handler(preset),
+                        inputs=[],
+                        outputs=[
+                            dd_task,
+                            cb_gen,
+                            rd_diff,
+                            nb_seed,
+                            rd_policy,
+                            llm_accordion,
+                            preset_blurb,
+                        ],
+                    )
+            # ── Tab 2: Compare policies ──────────────────────────
+            with gr.Tab("Compare policies"):
+                gr.Markdown(
+                    "Run all four scripted policies on the **same task / seed** and see "
+                    "the discrimination gradient at a glance. No provider calls, no LLM, "
+                    "fully deterministic — this is the empirical evidence behind the "
+                    "README's `+0.813` discrimination delta claim."
+                )
+                with gr.Row():
+                    cmp_task = gr.Dropdown(
+                        label="Task", choices=task_ids, value=default, scale=3
+                    )
+                    cmp_gen = gr.Checkbox(label="Generated", value=False, scale=1)
+                    cmp_diff = gr.Radio(
+                        ["easy", "medium", "hard", "nightmare"],
+                        label="Difficulty",
+                        value="easy",
+                        visible=False,
+                        scale=2,
+                    )
+                    cmp_seed = gr.Number(
+                        label="Seed", value=42, precision=0, visible=False, scale=1
+                    )
+                btn_cmp = gr.Button("Run all 4 policies", variant="primary")
+                cmp_md = gr.Markdown("")
+                cmp_html = gr.HTML(label="Final-score comparison")
+                cmp_table = gr.Dataframe(
+                    headers=["Policy", "Score", "Steps", "Summary"],
+                    datatype=["str", "str", "number", "str"],
+                    interactive=False,
+                    wrap=True,
+                    label="Per-policy summary",
+                )
+                btn_cmp.click(
+                    fn=run_compare,
+                    inputs=[cmp_task, cmp_gen, cmp_diff, cmp_seed],
+                    outputs=[cmp_md, cmp_html, cmp_table],
+                )
+                cmp_gen.change(
+                    fn=_toggle_generated,
+                    inputs=[cmp_gen],
+                    outputs=[cmp_diff, cmp_seed],
+                )
+            # ── Tab 3: Task Catalog ──────────────────────────────
             with gr.Tab("Task Catalog"):
                 catalog_rows = []
                 for t in tasks:
             # ── Tab 4: Environment Info ───────────────────────────
             with gr.Tab("Environment"):
+                gr.Markdown(_environment_tab_markdown())
+            # ── Tab 5: Rubric Tree ────────────────────────────────
+            with gr.Tab("Rubric Tree"):
                 gr.Markdown(
+                    "Live introspection of `env.rubric.named_rubrics()` — the same composable "
+                    "OpenEnv `Rubric` tree that grades every step. Weights and structure below "
+                    "are read from the running environment, not hardcoded."
                 )
+                gr.HTML(_rubric_tree_html())
+                gr.Markdown(
+                    "See [`docs/METHOD.md`](https://github.com/MitudruDutta/chargebackops/blob/main/docs/METHOD.md) "
+                    "and [`docs/SPECIFICATION_GAMING.md`](https://github.com/MitudruDutta/chargebackops/blob/main/docs/SPECIFICATION_GAMING.md) "
+                    "for the full design and the GRPO failure-mode write-up."
+                )
+            # ── Tab 6: Training Results ───────────────────────────
+            with gr.Tab("Training Results"):
+                gr.Markdown(_training_tab_markdown())
+                for caption, fname in (
+                    (
+                        "**Cross-iteration training curve.** Iter 3 plateaued below the "
+                        "heuristic at 0.728. Iter 5 plateaued *bit-exactly* at the heuristic "
+                        "at 0.8132 — the signature of the eval-fallback exploit, not "
+                        "convergent learning.",
+                        "training_curve_cross_iter.png",
+                    ),
+                    (
+                        "**Iter-5 eval-score attribution.** The trained policy contributes "
+                        "0.000 (every action is rejected by env validation). The eval rollout "
+                        "helper's heuristic-fallback path contributes 0.8132 — i.e. all of it.",
+                        "gaming_attribution.png",
+                    ),
+                    (
+                        "**Scripted-policy discrimination gradient.** The 8-dimension "
+                        "`WeightedSum` plus the deadline `Gate` defeats every degenerate "
+                        "policy: empty-packet zeros out, concede-all caps at 0.44, "
+                        "escalate-all caps at 0.77.",
+                        "discrimination_gradient.png",
+                    ),
+                    (
+                        "**8-dimension OpenEnv rubric weights**, grouped by category "
+                        "(decision / packet / process / terminal). 40% of reward sits on "
+                        "decision + terminal — where economically irrational policies "
+                        "bleed money fastest.",
+                        "rubric_weights.png",
+                    ),
+                    (
+                        "**Iter-5 per-difficulty curves.** Post-step-80 plateau is the "
+                        "fallback heuristic across every difficulty band; see "
+                        "SPECIFICATION_GAMING.md for the diagnosis.",
+                        "training_curve_by_family.png",
+                    ),
+                ):
+                    src = _figure_data_uri(fname)
+                    if src is None:
+                        gr.Markdown(
+                            f"_(figure `{fname}` not bundled — see "
+                            f"[`docs/figures/{fname}`](https://github.com/MitudruDutta/chargebackops/blob/main/docs/figures/{fname}))_"
+                        )
+                        continue
+                    gr.Markdown(caption)
+                    gr.HTML(
+                        f'<img src="{src}" style="width:100%;max-width:1100px;'
+                        f'border:1px solid #2a2a2a;border-radius:6px;margin:6px 0;" '
+                        f'alt="{fname}" />'
+                    )
     return demo
+# ---------------------------------------------------------------------------
+# Tab content builders (called once at app build; keep cheap)
+# ---------------------------------------------------------------------------
+def _environment_tab_markdown() -> str:
+    """Render the Environment tab content from live constants.
+    Reads action types from ``core.models.ActionType`` and the rubric weights
+    from ``evaluation.rubrics.CASE_DIMENSION_WEIGHTS`` so this tab can never
+    drift from the source of truth.
+    """
+    try:
+        from core.models import ActionType  # type: ignore[attr-defined]
+    except ImportError:  # pragma: no cover
+        from ..core.models import ActionType  # type: ignore[attr-defined]
+    # ``Literal`` exposes its members via ``__args__``.
+    actions: tuple[str, ...] = tuple(getattr(ActionType, "__args__", ()))
+    n_actions = len(actions)
+    r1 = (
+        "select_case", "inspect_case", "query_system", "retrieve_policy",
+        "add_evidence", "remove_evidence", "set_strategy",
+        "submit_representment", "resolve_case",
+    )
+    r23 = ("respond_to_pre_arb", "escalate_to_arbitration", "accept_arbitration_loss")
+    long_horizon = ("wait_for_updates",)
+    def _join(items: tuple[str, ...]) -> str:
+        return " &middot; ".join(f"`{name}`" for name in items)
+    rubric_rows = "\n".join(
+        f"| {label} | {int(round(weight * 100))}% | {scoring} |"
+        for label, weight, scoring in zip(
+            _DIMENSION_LABELS, CASE_DIMENSION_WEIGHTS, _DIMENSION_SCORING
+        )
+    )
+    return (
+        f"## Action Space ({n_actions} typed actions)\n\n"
+        f"**Round 1 — Representment:** {_join(r1)}\n\n"
+        f"**Round 2/3 — Pre-arb &amp; Arbitration:** {_join(r23)}\n\n"
+        f"**Long-horizon backlog:** {_join(long_horizon)}\n\n"
+        "## Merchant Systems (6)\n\n"
+        "`orders` &middot; `payment` &middot; `shipping` &middot; "
+        "`support` &middot; `refunds` &middot; `risk`\n\n"
+        "## Grading (8 dimensions)\n\n"
+        "Weights are read live from `evaluation.rubrics.CASE_DIMENSION_WEIGHTS`.\n\n"
+        "| Dimension | Weight | Scoring |\n"
+        "|---|---|---|\n"
+        f"{rubric_rows}\n\n"
+        "## Scripted policies (Run Episode tab)\n\n"
+        "| Policy | What it does | Headline avg |\n"
+        "|---|---|---|\n"
+        "| `naive` | Submit empty packet, no evidence, no policy work | 0.000 |\n"
+        "| `concede_all` | Always set strategy `accept_chargeback` and resolve | 0.444 |\n"
+        "| `escalate_all` | Contest like the heuristic, then always escalate | 0.767 |\n"
+        "| `heuristic` | EV-rational, fully offline | **0.813** |\n\n"
+        "## Card Networks\n\n"
+        "| Reason Code | Visa | Mastercard |\n"
+        "|---|---|---|\n"
+        "| Goods Not Received | 13.1 (30 days) | 4855 (45 days) |\n"
+        "| Fraud CNP | 10.4 (30 days) | 4837 (45 days) |\n"
+        "| Credit Not Processed | 13.6 (30 days) | 4860 (45 days) |\n"
+        "| Duplicate Processing | 12.4 (30 days) | 4834 (45 days) |\n"
+        "| Product Not As Described | 13.3 (30 days) | 4853 (45 days) |\n"
+        "| Service Not Provided | 13.1 (30 days) | 4855 (45 days) |\n"
+    )
+def _rubric_tree_html() -> str:
+    """Render the live ``env.rubric.named_rubrics()`` tree as nested HTML.
+    Also explicitly surfaces the deadline ``Gate(CaseAbandonedRubric)`` that
+    sits on top of the per-case ``WeightedSum`` — OpenEnv's default walk
+    iterates registered child rubrics only, and the Gate is a sibling of the
+    aggregator inside :class:`CaseRubric`.
+    Falls back to a static snapshot if introspection fails for any reason
+    (e.g. an old OpenEnv build) so the demo never breaks on this tab.
+    """
+    try:
+        env = ChargebackOpsEnvironment()
+        named = list(env.rubric.named_rubrics())
+    except Exception as exc:  # pragma: no cover — defensive fallback
+        return (
+            f"<pre style='color:#ef4444;'>Could not introspect rubric tree: "
+            f"{type(exc).__name__}: {exc}</pre>"
+        )
+    # Map weights onto leaf rubrics by name. CASE_DIMENSION_NAMES is the
+    # canonical order the WeightedSum was built with; weights align by index.
+    weight_by_dim = dict(zip(CASE_DIMENSION_NAMES, CASE_DIMENSION_WEIGHTS))
+    rows: list[str] = []
+    rows.append(
+        "<table class='queue-table' style='font-family:ui-monospace,monospace;'>"
+        "<tr><th>Path</th><th>Class</th><th>Weight / Role</th></tr>"
+    )
+    # Explicitly inject the deadline gate row above the aggregator subtree,
+    # since some OpenEnv versions don't yield it via named_rubrics().
+    deadline_gate_injected = False
+    for path, rubric in named:
+        cls_name = type(rubric).__name__
+        if (
+            not deadline_gate_injected
+            and cls_name == "WeightedSum"
+            and path.endswith("aggregator")
+        ):
+            parent = path.rsplit(".", 1)[0]
+            rows.append(
+                f"<tr><td>{'&nbsp;' * (parent.count('.') * 4 + 4)}"
+                f"<code>{parent}.deadline_gate</code></td>"
+                f"<td>Gate(CaseAbandonedRubric)</td>"
+                f"<td style='text-align:right;color:#eab308;'>hard-zero on miss</td></tr>"
+            )
+            deadline_gate_injected = True
+        weight_str = "—"
+        for dim_name, weight in weight_by_dim.items():
+            tag = "".join(part.capitalize() for part in dim_name.split("_")) + "Rubric"
+            if cls_name == tag:
+                weight_str = f"{int(round(weight * 100))}%"
+                break
+        depth = path.count(".")
+        indent = "&nbsp;" * (depth * 4)
+        rows.append(
+            f"<tr><td>{indent}<code>{path or '(root)'}</code></td>"
+            f"<td>{cls_name}</td>"
+            f"<td style='text-align:right;'>{weight_str}</td></tr>"
+        )
+    rows.append("</table>")
+    return "".join(rows)
+# ---------------------------------------------------------------------------
+# Training Results helpers
+# ---------------------------------------------------------------------------
+def _figure_data_uri(filename: str) -> str | None:
+    """Return a base64 ``data:image/png`` URI for a bundled figure, or None.
+    Embedding figures inline avoids dependencies on the static-asset routing
+    of whatever host serves the demo (HF Spaces, FastAPI sub-mount, etc.).
+    """
+    path = _FIGURES_DIR / filename
+    if not path.is_file():
+        return None
+    try:
+        data = path.read_bytes()
+    except OSError:
+        return None
+    encoded = base64.b64encode(data).decode("ascii")
+    return f"data:image/png;base64,{encoded}"
+def _training_tab_markdown() -> str:
+    return (
+        "## Real training, end-to-end\n\n"
+        "**Pipeline.** Qwen2.5-3B fp16 + LoRA r=16 on a single Colab T4. Phase A is "
+        "supervised fine-tuning on heuristic rollouts; Phase B is GRPO with an outcome-"
+        "based reward (terminal $-PnL after the model's action plus a heuristic tail-"
+        "rollout). The training loop **connects to the live `ChargebackOpsEnvironment`** "
+        "— every gradient step is graded by the same rubric and same Issuer adversary "
+        "the eval uses. There is no static dataset shortcut.\n\n"
+        "**Five iterations, three failure modes.** Iter 1 produced total gradient "
+        "collapse (group reward variance ≈ 0). Iter 3 broke through to non-zero gradient "
+        "but plateaued at 0.728. **Iter 5 ran 200 GRPO steps and uncovered a reproducible "
+        "specification-gaming exploit** where the model emits invalid `accept_case` "
+        "actions, triggers the eval rollout helper's heuristic-fallback path, and "
+        "scores bit-exactly the heuristic baseline at 0.8132. The full diagnosis is in "
+        "[`SPECIFICATION_GAMING.md`](https://github.com/MitudruDutta/chargebackops/blob/main/docs/SPECIFICATION_GAMING.md).\n\n"
+        "**Honest trained-vs-untrained delta:** the SFT step at 0.536 — **+0.08 absolute, "
+        "+18% relative** over the untrained Qwen2.5-3B base — is the only legitimate "
+        "model-attributable improvement on iter 5. We document this honestly because "
+        "the failure mode itself is a research artefact future GRPO recipes can target "
+        "as a benchmark.\n\n"
+        "**Reproduce.** "
+        "[Latest training run (Colab — iter 5, 200 GRPO steps)](https://colab.research.google.com/drive/1GtLH6_b10oHlAnnGq4hnBkcGJ-pE_za5?usp=sharing) · "
+        "[Previous training run (Colab — iter 3, 62 GRPO steps)](https://colab.research.google.com/drive/1AjG3Sv7FnMeOSls6JMzTunkMzlJi_ySu?usp=sharing) · "
+        "[`notebooks/train_merchant_agent.ipynb`](https://github.com/MitudruDutta/chargebackops/blob/main/notebooks/train_merchant_agent.ipynb)\n"
+    )