mitudrudutta committed on
Commit
862cfc4
·
1 Parent(s): 4d7c179

Implement code changes to enhance functionality and improve performance

Browse files
Files changed (1) hide show
  1. server/demo_ui.py +957 -78
server/demo_ui.py CHANGED
@@ -2,32 +2,164 @@
2
 
3
  from __future__ import annotations
4
 
 
5
  import os
6
- from typing import Any
 
7
 
8
- os.environ.setdefault("MPLCONFIGDIR", "/tmp/matplotlib")
 
 
 
 
9
 
10
  import gradio as gr
11
 
12
  try:
13
- from ..evaluation.agent_brutal_audit import _bad_policy_action
 
 
 
 
14
  from ..runners.baseline_runner import (
15
  _heuristic_pick,
16
  _obvious_next_action,
17
  candidate_actions,
18
  )
19
- from ..scenarios.simulation import list_tasks
 
20
  from .chargeback_ops_environment import ChargebackOpsEnvironment
21
  except ImportError: # pragma: no cover
22
- from evaluation.agent_brutal_audit import _bad_policy_action
 
 
 
 
23
  from runners.baseline_runner import (
24
  _heuristic_pick,
25
  _obvious_next_action,
26
  candidate_actions,
27
  )
28
- from scenarios.simulation import list_tasks
 
29
  from server.chargeback_ops_environment import ChargebackOpsEnvironment
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  # ---------------------------------------------------------------------------
33
  # CSS
@@ -205,7 +337,16 @@ _DEC_CLASS = {
205
  }
206
 
207
 
208
- def _round_panel_html(observation) -> str:
 
 
 
 
 
 
 
 
 
209
  vc = observation.visible_case
210
  if vc is None:
211
  return ""
@@ -221,14 +362,34 @@ def _round_panel_html(observation) -> str:
221
  f'</div>'
222
  )
223
 
224
- if vc.last_issuer_decision:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  dec = vc.last_issuer_decision
226
  dec_cls = _DEC_CLASS.get(dec, "")
227
  dec_pretty = dec.replace("_", " ").title()
228
  body += f'<div class="issuer-decision {dec_cls}">Issuer: {dec_pretty}</div>'
229
-
230
- if vc.last_issuer_rationale:
231
- body += f'<div class="issuer-quote">&ldquo;{vc.last_issuer_rationale}&rdquo;</div>'
232
 
233
  if vc.pre_arb_evidence_added:
234
  ids = ", ".join(vc.pre_arb_evidence_added)
@@ -329,18 +490,218 @@ def _resolve_task_id(task_id: str, generated: bool, difficulty: str, seed: int)
329
  return task_id
330
 
331
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
  def run_episode(
333
- task_id: str, generated: bool, difficulty: str, seed: int, policy: str = "heuristic"
 
 
 
 
 
 
 
334
  ):
335
  tid = _resolve_task_id(task_id, generated, difficulty, int(seed))
336
  env = ChargebackOpsEnvironment()
337
  obs = env.reset(task_id=tid, difficulty=difficulty, seed=int(seed))
338
- max_steps = obs.info.get("current_task_max_steps", 10)
339
  rows: list[list[Any]] = []
340
 
341
- policy_label = (
342
- "Heuristic" if policy == "heuristic" else "Naive (concede-everything)"
343
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
  header = (
345
  f"### {obs.task_title}\n"
346
  f"`{obs.task_id}` &mdash; {len(obs.queue)} case(s), "
@@ -351,7 +712,7 @@ def run_episode(
351
  _queue_html(obs),
352
  _budget_html(0, max_steps, 0.0),
353
  [row[:] for row in rows],
354
- _round_panel_html(obs),
355
  _arbitration_panel_html(obs),
356
  "",
357
  None,
@@ -359,19 +720,65 @@ def run_episode(
359
 
360
  step = 0
361
  while not obs.done:
362
- if policy == "bad":
363
- action = _bad_policy_action(obs)
364
- summary_action = action
365
- else:
366
- payload = obs.model_dump()
367
- cands = candidate_actions(payload)
368
- if not cands:
369
- break
370
- pick = _obvious_next_action(payload, cands) or _heuristic_pick(cands)
371
- action = pick.action
372
- summary_action = pick.action
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  step += 1
374
- obs = env.step(action)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  rows.append(
376
  [
377
  step,
@@ -380,7 +787,7 @@ def run_episode(
380
  summary_action.system_name or "",
381
  summary_action.strategy or "",
382
  round(obs.reward or 0.0, 4),
383
- obs.last_action_result,
384
  ]
385
  )
386
 
@@ -396,7 +803,7 @@ def run_episode(
396
  _queue_html(obs),
397
  _budget_html(step, max_steps, obs.progress_score),
398
  [row[:] for row in rows],
399
- _round_panel_html(obs),
400
  _arbitration_panel_html(obs),
401
  grader,
402
  None,
@@ -404,19 +811,109 @@ def run_episode(
404
 
405
  report = obs.grader_report.model_dump() if obs.grader_report else None
406
  sc = f"{obs.grader_report.normalized_score:.3f}" if obs.grader_report else "n/a"
407
- final_md = f"### Done &mdash; score **{sc}** in **{len(rows)}** steps"
 
 
 
408
  yield (
409
  final_md,
410
  _queue_html(obs),
411
  _budget_html(step, max_steps, obs.progress_score),
412
  [row[:] for row in rows],
413
- _round_panel_html(obs),
414
  _arbitration_panel_html(obs),
415
  _grader_html(report),
416
  report,
417
  )
418
 
419
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
  # ---------------------------------------------------------------------------
421
  # Build Gradio app
422
  # ---------------------------------------------------------------------------
@@ -431,17 +928,38 @@ def build_demo() -> gr.Blocks:
431
  # Inject CSS (Gradio 6 moved css= to launch(); <style> tag works everywhere)
432
  gr.HTML(f"<style>{_CSS}</style>")
433
 
434
- # Header
435
  gr.HTML(
436
  '<div class="dashboard-header">'
437
  "<h1>ChargebackOps</h1>"
438
- "<p>Merchant chargeback dispute environment &mdash; OpenEnv benchmark</p>"
 
 
 
 
 
 
 
 
 
 
 
 
 
439
  "</div>"
440
  )
441
 
442
  with gr.Tabs():
443
  # ── Tab 1: Run Episode ────────────────────────────────
444
  with gr.Tab("Run Episode"):
 
 
 
 
 
 
 
 
445
  with gr.Row():
446
  dd_task = gr.Dropdown(
447
  label="Task", choices=task_ids, value=default, scale=3
@@ -451,25 +969,60 @@ def build_demo() -> gr.Blocks:
451
  ["easy", "medium", "hard", "nightmare"],
452
  label="Difficulty",
453
  value="easy",
 
454
  scale=2,
455
  )
456
- nb_seed = gr.Number(label="Seed", value=42, precision=0, scale=1)
 
 
457
  with gr.Row():
458
  rd_policy = gr.Radio(
459
- choices=[
460
- ("Heuristic (smart baseline)", "heuristic"),
461
- ("Naive (always concede)", "bad"),
462
- ],
463
  value="heuristic",
464
  label="Policy",
465
  scale=4,
466
  )
467
  btn_run = gr.Button("Run Episode", variant="primary", scale=1)
468
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  md_status = gr.Markdown(
470
- "Pick a task + policy and click **Run Episode**. Compare **Heuristic** vs "
471
- "**Naive** to see how the 8-dimension rubric &mdash; including escalation ROI &mdash; "
472
- "separates an EV-rational agent from a lazy one."
 
 
 
473
  )
474
 
475
  with gr.Row(equal_height=True):
@@ -491,21 +1044,25 @@ def build_demo() -> gr.Blocks:
491
  datatype=["number", "str", "str", "str", "str", "number", "str"],
492
  interactive=False,
493
  wrap=True,
494
- label="Step Trace",
495
  )
496
 
497
  with gr.Row(equal_height=True):
498
  with gr.Column(scale=1):
499
- html_round = gr.HTML(label="Dispute Round")
500
  with gr.Column(scale=1):
501
  html_arb = gr.HTML(label="Arbitration")
502
 
503
  html_grader = gr.HTML(label="Grader Report")
504
- json_raw = gr.JSON(label="Raw JSON", visible=False)
 
505
 
506
  btn_run.click(
507
  fn=run_episode,
508
- inputs=[dd_task, cb_gen, rd_diff, nb_seed, rd_policy],
 
 
 
509
  outputs=[
510
  md_status,
511
  html_queue,
@@ -518,7 +1075,104 @@ def build_demo() -> gr.Blocks:
518
  ],
519
  )
520
 
521
- # ── Tab 2: Task Catalog ───────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
522
  with gr.Tab("Task Catalog"):
523
  catalog_rows = []
524
  for t in tasks:
@@ -557,37 +1211,262 @@ def build_demo() -> gr.Blocks:
557
 
558
  # ── Tab 3: Environment Info ───────────────────────────
559
  with gr.Tab("Environment"):
 
 
 
 
560
  gr.Markdown(
561
- "## Action Space (12 typed actions)\n\n"
562
- "**Round 1 β€” Representment:** `select_case` &middot; `inspect_case` &middot; "
563
- "`query_system` &middot; `retrieve_policy` &middot; `add_evidence` &middot; "
564
- "`remove_evidence` &middot; `set_strategy` &middot; `submit_representment` &middot; "
565
- "`resolve_case`\n\n"
566
- "**Round 2/3 β€” Pre-arb &amp; Arbitration:** `respond_to_pre_arb` &middot; "
567
- "`escalate_to_arbitration` &middot; `accept_arbitration_loss`\n\n"
568
- "## Merchant Systems (6)\n\n"
569
- "`orders` &middot; `payment` &middot; `shipping` &middot; "
570
- "`support` &middot; `refunds` &middot; `risk`\n\n"
571
- "## Grading (8 dimensions)\n\n"
572
- "| Dimension | Weight | Scoring |\n"
573
- "|---|---|---|\n"
574
- "| Strategy Correctness | 20% | 1.0 optimal, 0.35 acceptable, 0.0 wrong |\n"
575
- "| Evidence Quality | 15% | Required + helpful coverage, harmful penalty |\n"
576
- "| Packet Validity | 10% | Binary: all required, zero harmful |\n"
577
- "| Deadline Compliance | 10% | Binary: resolved before deadline |\n"
578
- "| Efficiency | 10% | Penalises waste, rewards early concession |\n"
579
- "| Outcome Quality | 10% | 1.0 optimal, 0.4 acceptable, 0.0 wrong |\n"
580
- "| Note Quality | 5% | Policy keywords + evidence refs |\n"
581
- "| Escalation ROI | 20% | EV-rational arbitration: P(win)Β·amount vs $250 fee |\n\n"
582
- "## Card Networks\n\n"
583
- "| Reason Code | Visa | Mastercard |\n"
584
- "|---|---|---|\n"
585
- "| Goods Not Received | 13.1 (30 days) | 4855 (45 days) |\n"
586
- "| Fraud CNP | 10.4 (30 days) | 4837 (45 days) |\n"
587
- "| Credit Not Processed | 13.6 (30 days) | 4860 (45 days) |\n"
588
- "| Duplicate Processing | 12.4 (30 days) | 4834 (45 days) |\n"
589
- "| Product Not As Described | 13.3 (30 days) | 4853 (45 days) |\n"
590
- "| Service Not Provided | 13.1 (30 days) | 4855 (45 days) |\n"
591
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
592
 
593
  return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  from __future__ import annotations
4
 
5
+ import base64
6
  import os
7
+ from pathlib import Path
8
+ from typing import Any, Callable
9
 
10
+ # Ensure matplotlib has a writable config dir on locked-down hosts (e.g. HF
11
+ # Spaces). Guarded so importing this module from a notebook doesn't pollute
12
+ # the user's environment unnecessarily.
13
+ if not os.environ.get("MPLCONFIGDIR"):
14
+ os.environ["MPLCONFIGDIR"] = "/tmp/matplotlib"
15
 
16
  import gradio as gr
17
 
18
  try:
19
+ from ..core.models import ChargebackOpsAction
20
+ from ..evaluation.rubrics import (
21
+ CASE_DIMENSION_NAMES,
22
+ CASE_DIMENSION_WEIGHTS,
23
+ )
24
  from ..runners.baseline_runner import (
25
  _heuristic_pick,
26
  _obvious_next_action,
27
  candidate_actions,
28
  )
29
+ from ..runners.benchmark_runner import POLICY_REGISTRY
30
+ from ..scenarios.simulation import get_task, list_tasks
31
  from .chargeback_ops_environment import ChargebackOpsEnvironment
32
  except ImportError: # pragma: no cover
33
+ from core.models import ChargebackOpsAction
34
+ from evaluation.rubrics import (
35
+ CASE_DIMENSION_NAMES,
36
+ CASE_DIMENSION_WEIGHTS,
37
+ )
38
  from runners.baseline_runner import (
39
  _heuristic_pick,
40
  _obvious_next_action,
41
  candidate_actions,
42
  )
43
+ from runners.benchmark_runner import POLICY_REGISTRY
44
+ from scenarios.simulation import get_task, list_tasks
45
  from server.chargeback_ops_environment import ChargebackOpsEnvironment
46
 
47
+ # OpenAI-compatible LLM policy is optional — the demo gracefully degrades to
48
+ # scripted policies if the openai SDK or runners.inference is unavailable.
49
+ try: # pragma: no cover β€” exercised only when LLM policy is selected
50
+ from openai import OpenAI # noqa: F401
51
+ try:
52
+ from ..runners.inference import _pick_with_openai_client
53
+ except ImportError:
54
+ from runners.inference import _pick_with_openai_client
55
+ _LLM_POLICY_AVAILABLE = True
56
+ except Exception: # pragma: no cover
57
+ _pick_with_openai_client = None # type: ignore[assignment]
58
+ _LLM_POLICY_AVAILABLE = False
59
+
60
+ # Path to the bundled hero figures (used by the Training Results tab).
61
+ _FIGURES_DIR = Path(__file__).resolve().parents[1] / "docs" / "figures"
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Static metadata
66
+ # ---------------------------------------------------------------------------
67
+
68
# Human-readable display labels for the 8 rubric dimensions (in canonical order).
_DIMENSION_LABELS: tuple[str, ...] = (
    "Strategy Correctness",
    "Evidence Quality",
    "Packet Validity",
    "Deadline Compliance",
    "Efficiency",
    "Outcome Quality",
    "Note Quality",
    "Escalation ROI",
)

# Per-dimension scoring summary (kept short so the table fits on one screen).
# Entry i describes the dimension named by ``_DIMENSION_LABELS[i]``, so the two
# tuples must stay the same length and in the same order.
_DIMENSION_SCORING: tuple[str, ...] = (
    "1.0 optimal · 0.35 acceptable · 0.0 wrong",
    "Required + helpful coverage; harmful evidence penalised",
    "Binary: all required evidence + zero harmful",
    "Binary: case resolved before deadline",
    "Penalises waste; rewards early concession",
    "1.0 optimal · 0.4 acceptable · 0.0 wrong",
    "Policy keywords + evidence references",
    "EV-rational arbitration: P(win)·amount vs $250 fee",
)
91
+
92
# Selectable policies (label shown to user → policy key). The first four are
# scripted; the last one calls an OpenAI-compatible API.
# Order is intentional: best → worst, so radio top-to-bottom reads as a
# discrimination ladder.
_POLICY_CHOICES: tuple[tuple[str, str], ...] = (
    ("Heuristic — EV-rational baseline", "heuristic"),
    ("Escalate-all — contest then always escalate", "escalate_all"),
    ("Concede-all — always accept the chargeback", "concede_all"),
    ("Naive — submit empty packet, no evidence", "naive"),
    ("LLM (OpenAI-compatible API)", "llm"),
)
# Reverse lookup: policy key → display label.
_POLICY_LABEL_BY_KEY: dict[str, str] = {
    key: label for label, key in _POLICY_CHOICES
}
# Subset used by the Compare tab — scripted-only, deterministic, no API calls.
_COMPARE_POLICIES: tuple[str, ...] = (
    "naive",
    "concede_all",
    "escalate_all",
    "heuristic",
)
112
+
113
# One-click presets for the Run-Episode tab. Each preset is
# (button_label, task_id, generated_flag, difficulty, seed, recommended_policy, blurb).
_PRESETS: tuple[tuple[str, str, bool, str, int, str, str], ...] = (
    (
        "Easy contestable",
        "goods_not_received_easy",
        False,
        "easy",
        42,
        "heuristic",
        "Goods-not-received with strong evidence — heuristic should win round 1.",
    ),
    (
        "Queue optimization (hard)",
        "queue_optimization_hard",
        False,
        "hard",
        42,
        "heuristic",
        "Triage a heterogeneous queue under tight deadlines — exercises EV reasoning.",
    ),
    (
        "Long-horizon backlog",
        "monthly_dispute_backlog_marathon",
        False,
        "medium",
        42,
        "heuristic",
        "12 cases over 60 steps with delayed evidence; tests scheduling + waiting.",
    ),
    (
        "Generated nightmare",
        "generated_nightmare_s31",
        True,
        "nightmare",
        31,
        "heuristic",
        "Adversarial parametric task — even the heuristic struggles.",
    ),
    (
        "Compare all 4 policies",
        "goods_not_received_easy",
        False,
        "easy",
        42,
        "heuristic",
        "Open the Compare tab — same task, all four scripted policies side-by-side.",
    ),
)
162
+
163
 
164
  # ---------------------------------------------------------------------------
165
  # CSS
 
337
  }
338
 
339
 
340
+ def _round_panel_html(
341
+ observation, history: list[dict[str, str]] | None = None
342
+ ) -> str:
343
+ """Render the visible case's round panel, including a chronological
344
+ issuer-message log so multi-round disputes show every R1/R2/R3 message.
345
+
346
+ ``history`` is a list of ``{round, decision, rationale}`` dicts the caller
347
+ accumulates across steps.
348
+ """
349
+
350
  vc = observation.visible_case
351
  if vc is None:
352
  return ""
 
362
  f'</div>'
363
  )
364
 
365
+ # Show full issuer-message history if we have it, else fall back to the
366
+ # last-message snapshot from the observation.
367
+ rendered_any = False
368
+ if history:
369
+ for entry in history:
370
+ ent_rnd = entry.get("round", "?")
371
+ ent_dec = entry.get("decision") or ""
372
+ ent_rat = entry.get("rationale") or ""
373
+ ent_badge_cls = f"round-{min(int(ent_rnd) if str(ent_rnd).isdigit() else 1, 3)}"
374
+ dec_cls = _DEC_CLASS.get(ent_dec, "")
375
+ dec_pretty = ent_dec.replace("_", " ").title() if ent_dec else "(no decision)"
376
+ body += (
377
+ f'<div style="margin-top:8px;">'
378
+ f'<span class="round-badge {ent_badge_cls}">R{ent_rnd}</span>'
379
+ f'<span class="issuer-decision {dec_cls}">Issuer: {dec_pretty}</span>'
380
+ f'</div>'
381
+ )
382
+ if ent_rat:
383
+ body += f'<div class="issuer-quote">&ldquo;{ent_rat}&rdquo;</div>'
384
+ rendered_any = True
385
+
386
+ if not rendered_any and vc.last_issuer_decision:
387
  dec = vc.last_issuer_decision
388
  dec_cls = _DEC_CLASS.get(dec, "")
389
  dec_pretty = dec.replace("_", " ").title()
390
  body += f'<div class="issuer-decision {dec_cls}">Issuer: {dec_pretty}</div>'
391
+ if vc.last_issuer_rationale:
392
+ body += f'<div class="issuer-quote">&ldquo;{vc.last_issuer_rationale}&rdquo;</div>'
 
393
 
394
  if vc.pre_arb_evidence_added:
395
  ids = ", ".join(vc.pre_arb_evidence_added)
 
490
  return task_id
491
 
492
 
493
def _build_llm_policy(
    base_url: str, api_key: str, model_name: str
) -> tuple[Callable[[dict[str, Any]], ChargebackOpsAction | None], str]:
    """Return ``(policy_fn, label)`` calling an OpenAI-compatible chat model.

    The returned policy generates candidate actions, short-circuits when there
    is a single candidate or an obvious next action, and otherwise asks the
    LLM to pick from the shortlist. If the LLM pick raises for any reason, that
    step falls back to the heuristic pick so the demo never freezes mid-stream.

    Settings resolution (UI fields always take precedence over the environment):

    * ``base_url``: UI value, else ``API_BASE_URL``, else a default chosen from
      which provider key is present (Groq, then OpenRouter, then the Hugging
      Face router).
    * ``api_key``: UI value, else the environment. The env var belonging to the
      resolved provider is tried first so a credential is never sent to a
      different provider's endpoint; the remaining variables are then tried in
      the order ``HF_TOKEN`` / ``API_KEY`` / ``OPENROUTER_API_KEY`` /
      ``GROQ_API_KEY``.
    * ``model_name``: UI value, else ``MODEL_NAME``, else a per-provider default.

    Args:
        base_url: OpenAI-compatible endpoint; blank to resolve from the environment.
        api_key: Credential for that endpoint; blank to resolve from the environment.
        model_name: Chat model identifier; blank to resolve from the environment.

    Returns:
        A ``(policy_fn, label)`` pair. ``policy_fn`` maps an observation dict to
        an action, or ``None`` when there are no candidate actions. ``label`` is
        ``"LLM (<model_name>)"``.

    Raises:
        RuntimeError: if the LLM dependencies failed to import, or no API key
            could be resolved from the UI or the environment.
    """

    if not _LLM_POLICY_AVAILABLE or _pick_with_openai_client is None:
        raise RuntimeError(
            "openai SDK is not available — install `openai` to use the LLM policy."
        )

    base_url = (base_url or "").strip()
    api_key = (api_key or "").strip()
    model_name = (model_name or "").strip()

    # Resolve the provider from an explicit base_url first, then from which
    # key variable is set in the environment. Knowing the provider lets us
    # pick both the matching credential and a sensible default model.
    provider = ""
    if not base_url:
        base_url = os.getenv("API_BASE_URL", "").strip()
    if base_url:
        lowered = base_url.lower()
        if "groq" in lowered:
            provider = "groq"
        elif "openrouter" in lowered:
            provider = "openrouter"
        elif "huggingface" in lowered or "hf.space" in lowered:
            provider = "hf"
        elif "openai.com" in lowered:
            provider = "openai"
    elif os.getenv("GROQ_API_KEY"):
        base_url, provider = "https://api.groq.com/openai/v1", "groq"
    elif os.getenv("OPENROUTER_API_KEY"):
        base_url, provider = "https://openrouter.ai/api/v1", "openrouter"
    else:
        base_url, provider = "https://router.huggingface.co/v1", "hf"

    if not api_key:
        # Try the resolved provider's own variable before the generic order.
        # Without this, having both HF_TOKEN and GROQ_API_KEY set would pair
        # the HF token with the Groq endpoint.
        provider_key_env = {
            "groq": "GROQ_API_KEY",
            "openrouter": "OPENROUTER_API_KEY",
            "hf": "HF_TOKEN",
        }
        env_order = ["HF_TOKEN", "API_KEY", "OPENROUTER_API_KEY", "GROQ_API_KEY"]
        preferred = provider_key_env.get(provider)
        if preferred:
            env_order.remove(preferred)
            env_order.insert(0, preferred)
        for env_name in env_order:
            candidate_key = os.getenv(env_name)
            if candidate_key:
                api_key = candidate_key
                break

    if not model_name:
        model_name = os.getenv("MODEL_NAME", "").strip()
    if not model_name:
        # Per-provider default model; unknown providers get the HF default, so
        # model_name is always non-empty past this point.
        provider_defaults = {
            "groq": "llama-3.3-70b-versatile",
            "openrouter": "meta-llama/llama-3.1-8b-instruct:free",
            "openai": "gpt-4o-mini",
            "hf": "Qwen/Qwen2.5-72B-Instruct",
        }
        model_name = provider_defaults.get(provider, "Qwen/Qwen2.5-72B-Instruct")

    if not api_key:
        raise RuntimeError(
            "No API key — type one in the UI, or set HF_TOKEN / API_KEY / "
            "OPENROUTER_API_KEY / GROQ_API_KEY in the environment (HF Space "
            "Secrets work too)."
        )

    # Short timeout and no SDK retries: a slow provider should trigger the
    # heuristic fallback below rather than stall the streamed episode.
    client = OpenAI(
        base_url=base_url,
        api_key=api_key,
        timeout=15.0,
        max_retries=0,
    )

    def policy_fn(observation: dict[str, Any]) -> ChargebackOpsAction | None:
        cands = candidate_actions(observation)
        if not cands:
            return None
        if len(cands) == 1:
            return cands[0].action
        obvious = _obvious_next_action(observation, cands)
        if obvious is not None:
            return obvious.action
        try:
            pick, _ok, _err = _pick_with_openai_client(
                client, model_name, observation, cands
            )
            return pick.action
        except Exception:
            # Deliberate best-effort: any LLM failure degrades to the heuristic.
            return _heuristic_pick(cands).action

    label = f"LLM ({model_name})"
    return policy_fn, label
598
+
599
+
600
def _result_badge(result: str | None) -> str:
    """Prefix a step result string with a status glyph for fast scanning.

    Distinguishes accepted/no-op/rejected so the trace dataframe self-narrates.
    Matching is a case-insensitive substring test, and failure keywords are
    checked before no-op keywords.

    Args:
        result: The step's result text, or ``None``/empty when there is none.

    Returns:
        ``"· (no result)"`` for a falsy input; otherwise the text prefixed with
        ``✗`` (failure keyword present), ``⚠`` (no-op keyword present) or ``✓``.
    """

    if not result:
        return "· (no result)"
    text = str(result)
    lowered = text.lower()
    if any(marker in lowered for marker in ("error", "reject", "invalid", "fail")):
        return f"✗ {text}"
    if any(marker in lowered for marker in ("no-op", "noop", "ignored", "skipped")):
        return f"⚠ {text}"
    return f"✓ {text}"
615
+
616
+
617
def _resolve_max_steps(observation, task_id: str) -> int:
    """Pull the task budget from the observation; fall back to the task definition.

    Reads ``observation.info["current_task_max_steps"]`` and returns it when it
    is a positive integer. Otherwise the budget is read from
    ``get_task(task_id).max_steps``, and if that lookup fails for any reason a
    fixed default of 60 is returned so the budget bar can still render.

    Args:
        observation: Object exposing an ``info`` mapping (may be missing or ``None``).
        task_id: Task identifier used for the fallback lookup.

    Returns:
        The step budget as an ``int``.
    """

    # Tolerate a missing/None ``info`` instead of raising AttributeError.
    info = getattr(observation, "info", None) or {}
    cap = info.get("current_task_max_steps")
    # bool is a subclass of int; reject it so ``True`` is not reported as a budget.
    if isinstance(cap, int) and not isinstance(cap, bool) and cap > 0:
        return cap
    try:
        return int(get_task(task_id).max_steps)
    except Exception:  # pragma: no cover — defensive
        return 60
633
+
634
+
635
  def run_episode(
636
+ task_id: str,
637
+ generated: bool,
638
+ difficulty: str,
639
+ seed: int,
640
+ policy: str = "heuristic",
641
+ llm_base_url: str = "",
642
+ llm_api_key: str = "",
643
+ llm_model: str = "",
644
  ):
645
  tid = _resolve_task_id(task_id, generated, difficulty, int(seed))
646
  env = ChargebackOpsEnvironment()
647
  obs = env.reset(task_id=tid, difficulty=difficulty, seed=int(seed))
648
+ max_steps = _resolve_max_steps(obs, tid)
649
  rows: list[list[Any]] = []
650
 
651
+ policy_fn: Callable[[dict[str, Any]], ChargebackOpsAction | None] | None = None
652
+ if policy == "llm":
653
+ try:
654
+ policy_fn, policy_label = _build_llm_policy(
655
+ llm_base_url, llm_api_key, llm_model
656
+ )
657
+ except Exception as exc:
658
+ err_md = (
659
+ f"### LLM policy unavailable\n"
660
+ f"`{type(exc).__name__}: {exc}`\n\n"
661
+ f"Falling back to **heuristic** for this run."
662
+ )
663
+ policy = "heuristic"
664
+ policy_fn = POLICY_REGISTRY["heuristic"]
665
+ policy_label = _POLICY_LABEL_BY_KEY[policy]
666
+ yield (
667
+ err_md,
668
+ _queue_html(obs),
669
+ _budget_html(0, max_steps, 0.0),
670
+ [],
671
+ "",
672
+ "",
673
+ "",
674
+ None,
675
+ )
676
+ if policy_fn is None:
677
+ policy_fn = POLICY_REGISTRY.get(policy) or POLICY_REGISTRY["heuristic"]
678
+ if policy not in POLICY_REGISTRY:
679
+ policy = "heuristic"
680
+ policy_label = _POLICY_LABEL_BY_KEY.get(policy, policy)
681
+
682
+ # Per-case issuer-message log: case_id -> [{"round","decision","rationale"}]
683
+ issuer_log: dict[str, list[dict[str, str]]] = {}
684
+
685
+ def _maybe_log_issuer_msg(observation) -> None:
686
+ vc = observation.visible_case
687
+ if vc is None or not vc.last_issuer_decision:
688
+ return
689
+ log = issuer_log.setdefault(vc.case_id, [])
690
+ entry = {
691
+ "round": str(vc.round_number or 1),
692
+ "decision": vc.last_issuer_decision or "",
693
+ "rationale": vc.last_issuer_rationale or "",
694
+ }
695
+ # Avoid duplicating the same message on adjacent steps.
696
+ if not log or log[-1] != entry:
697
+ log.append(entry)
698
+
699
+ def _current_history(observation) -> list[dict[str, str]]:
700
+ vc = observation.visible_case
701
+ if vc is None:
702
+ return []
703
+ return issuer_log.get(vc.case_id, [])
704
+
705
  header = (
706
  f"### {obs.task_title}\n"
707
  f"`{obs.task_id}` &mdash; {len(obs.queue)} case(s), "
 
712
  _queue_html(obs),
713
  _budget_html(0, max_steps, 0.0),
714
  [row[:] for row in rows],
715
+ _round_panel_html(obs, _current_history(obs)),
716
  _arbitration_panel_html(obs),
717
  "",
718
  None,
 
720
 
721
  step = 0
722
  while not obs.done:
723
+ payload = obs.model_dump()
724
+ try:
725
+ action = policy_fn(payload)
726
+ except Exception as exc: # pragma: no cover β€” surface in UI
727
+ err_md = (
728
+ f"### Policy error\n"
729
+ f"`{policy}` raised `{type(exc).__name__}: {exc}` on step {step + 1}. "
730
+ f"Halting episode."
731
+ )
732
+ yield (
733
+ err_md,
734
+ _queue_html(obs),
735
+ _budget_html(step, max_steps, obs.progress_score),
736
+ [row[:] for row in rows],
737
+ _round_panel_html(obs, _current_history(obs)),
738
+ _arbitration_panel_html(obs),
739
+ "",
740
+ None,
741
+ )
742
+ return
743
+ if action is None:
744
+ break
745
+
746
+ summary_action = action
747
  step += 1
748
+ try:
749
+ obs = env.step(action)
750
+ except Exception as exc: # pragma: no cover β€” surface in UI
751
+ err_md = (
752
+ f"### Environment error\n"
753
+ f"`env.step({summary_action.action_type})` raised "
754
+ f"`{type(exc).__name__}: {exc}` on step {step}. "
755
+ f"Halting episode."
756
+ )
757
+ rows.append(
758
+ [
759
+ step,
760
+ summary_action.action_type,
761
+ summary_action.case_id or "",
762
+ summary_action.system_name or "",
763
+ summary_action.strategy or "",
764
+ 0.0,
765
+ f"βœ— error: {type(exc).__name__}",
766
+ ]
767
+ )
768
+ yield (
769
+ err_md,
770
+ _queue_html(obs),
771
+ _budget_html(step, max_steps, obs.progress_score),
772
+ [row[:] for row in rows],
773
+ _round_panel_html(obs, _current_history(obs)),
774
+ _arbitration_panel_html(obs),
775
+ "",
776
+ None,
777
+ )
778
+ return
779
+
780
+ _maybe_log_issuer_msg(obs)
781
+
782
  rows.append(
783
  [
784
  step,
 
787
  summary_action.system_name or "",
788
  summary_action.strategy or "",
789
  round(obs.reward or 0.0, 4),
790
+ _result_badge(obs.last_action_result),
791
  ]
792
  )
793
 
 
803
  _queue_html(obs),
804
  _budget_html(step, max_steps, obs.progress_score),
805
  [row[:] for row in rows],
806
+ _round_panel_html(obs, _current_history(obs)),
807
  _arbitration_panel_html(obs),
808
  grader,
809
  None,
 
811
 
812
  report = obs.grader_report.model_dump() if obs.grader_report else None
813
  sc = f"{obs.grader_report.normalized_score:.3f}" if obs.grader_report else "n/a"
814
+ final_md = (
815
+ f"### Done &mdash; score **{sc}** in **{len(rows)}** steps "
816
+ f"&middot; policy: **{policy_label}**"
817
+ )
818
  yield (
819
  final_md,
820
  _queue_html(obs),
821
  _budget_html(step, max_steps, obs.progress_score),
822
  [row[:] for row in rows],
823
+ _round_panel_html(obs, _current_history(obs)),
824
  _arbitration_panel_html(obs),
825
  _grader_html(report),
826
  report,
827
  )
828
 
829
 
830
+ # ---------------------------------------------------------------------------
831
+ # Compare tab β€” run all four scripted policies on the same task in series and
832
+ # render a single side-by-side bar chart of the final scores plus a per-case
833
+ # per-dimension breakdown.
834
+ # ---------------------------------------------------------------------------
835
+
836
+
837
def _run_one_episode_sync(task_id: str, policy_key: str) -> dict[str, Any]:
    """Run one episode to completion with a registry policy and summarise it.

    The episode stops when the observation reports ``done``, when the policy
    returns ``None``, or when either the policy or ``env.step`` raises. Only
    successfully applied steps are counted.

    Returns:
        A dict with keys ``policy``, ``score`` (0.0 when no grader report is
        available), ``steps`` and ``summary`` (empty string when no grader
        report is available).
    """

    environment = ChargebackOpsEnvironment()
    current = environment.reset(task_id=task_id)
    choose_action = POLICY_REGISTRY[policy_key]

    applied = 0
    while not current.done:
        # A policy failure, an exhausted policy, and an environment failure
        # all end the episode early without propagating.
        try:
            chosen = choose_action(current.model_dump())
            if chosen is None:
                break
            current = environment.step(chosen)
        except Exception:
            break
        applied += 1

    if current.grader_report:
        final_score = current.grader_report.normalized_score
    else:
        final_score = 0.0

    outcome: dict[str, Any] = {
        "policy": policy_key,
        "score": float(final_score),
        "steps": applied,
    }
    outcome["summary"] = current.grader_report.summary if current.grader_report else ""
    return outcome
867
+
868
+
869
def run_compare(task_id: str, generated: bool, difficulty: str, seed: int):
    """Run every policy in ``_COMPARE_POLICIES`` on one task and render the results.

    Args:
        task_id: Task selected in the UI.
        generated: Whether a generated task was requested.
        difficulty: Difficulty selected in the UI.
        seed: Seed selected in the UI (coerced with ``int``).

    Returns:
        A ``(markdown, html, table_rows)`` tuple: an explanatory markdown
        blurb, an HTML fragment holding the title line plus one CSS bar per
        policy, and one ``[policy, score, steps, summary]`` row per policy.

    NOTE(review): ``generated``, ``difficulty`` and ``seed`` reach the episodes
    only through ``_resolve_task_id``; each episode is started from the
    resolved task id alone.
    """

    from html import escape  # local import: keeps the file's import block untouched

    tid = _resolve_task_id(task_id, generated, difficulty, int(seed))
    results = [_run_one_episode_sync(tid, p) for p in _COMPARE_POLICIES]

    # Bar-chart HTML (CSS-only, no extra deps). Bars are scaled relative to the
    # best score; ``or 1.0`` avoids dividing by zero when every score is 0.
    max_score = max((r["score"] for r in results), default=1.0) or 1.0
    bar_parts = []
    for r in results:
        pct = int(round(100 * r["score"] / max(0.001, max_score)))
        color = _score_color(r["score"])
        bar_parts.append(
            f'<div class="bar-row" style="margin:6px 0;">'
            f'<span class="bar-label" style="width:130px;">{r["policy"]}</span>'
            f'<div class="bar-track" style="flex:1;height:22px;">'
            f'<div class="bar-fill" style="width:{pct}%;background:{color};height:100%;"></div>'
            f'</div>'
            f'<span class="bar-value" style="width:120px;">'
            f'{r["score"]:.3f} · {r["steps"]} steps</span>'
            f'</div>'
        )
    bars = "".join(bar_parts)

    # Discrimination delta. The ``+`` format flag prints the real sign, so a
    # negative delta reads "-0.123" rather than "+-0.123".
    by_policy = {r["policy"]: r["score"] for r in results}
    delta = by_policy.get("heuristic", 0.0) - by_policy.get("naive", 0.0)
    title = (
        f'<div style="margin:8px 0;font-size:14px;">'
        # The task id originates from UI input, so escape it before embedding.
        f'<b>Task</b>: <code>{escape(str(tid))}</code> &middot; '
        f'<b>Discrimination delta</b> (heuristic − naive) = '
        f'<span style="color:{_score_color(delta)};">'
        f'<b>{delta:+.3f}</b></span>'
        f'</div>'
    )

    md = (
        "### Side-by-side: 4 scripted policies on the same task\n"
        "Same `task_id`, same `seed`, no provider calls. The discrimination "
        "gradient (`naive` → `concede_all` → `escalate_all` → `heuristic`) "
        "is the empirical evidence behind the README's `+0.813` claim."
    )
    table_rows = [
        [r["policy"], f"{r['score']:.3f}", r["steps"], r["summary"]]
        for r in results
    ]
    return md, title + '<div style="padding:8px 0;">' + bars + "</div>", table_rows
915
+
916
+
917
  # ---------------------------------------------------------------------------
918
  # Build Gradio app
919
  # ---------------------------------------------------------------------------
 
928
  # Inject CSS (Gradio 6 moved css= to launch(); <style> tag works everywhere)
929
  gr.HTML(f"<style>{_CSS}</style>")
930
 
931
+ # Header + context links
932
  gr.HTML(
933
  '<div class="dashboard-header">'
934
  "<h1>ChargebackOps</h1>"
935
+ "<p>Merchant chargeback dispute environment &mdash; an OpenEnv benchmark for "
936
+ "cost-asymmetric multi-round LLM agents</p>"
937
+ '<div style="margin-top:8px;">'
938
+ '<a href="https://github.com/MitudruDutta/chargebackops" target="_blank" '
939
+ 'style="margin:0 6px;color:#3b82f6;text-decoration:none;">πŸ“¦ GitHub</a> '
940
+ '<a href="https://huggingface.co/spaces/mitudrudutta/ChargeBackOps" target="_blank" '
941
+ 'style="margin:0 6px;color:#FFD21E;text-decoration:none;">πŸ€— HF Space</a> '
942
+ '<a href="https://youtu.be/7dz37JTTMo4" target="_blank" '
943
+ 'style="margin:0 6px;color:#FF0000;text-decoration:none;">πŸ“Ί Walkthrough</a> '
944
+ '<a href="https://colab.research.google.com/drive/1GtLH6_b10oHlAnnGq4hnBkcGJ-pE_za5" target="_blank" '
945
+ 'style="margin:0 6px;color:#F9AB00;text-decoration:none;">πŸ§ͺ Training Colab</a> '
946
+ '<a href="https://github.com/meta-pytorch/OpenEnv" target="_blank" '
947
+ 'style="margin:0 6px;color:#0668E1;text-decoration:none;">πŸ¦™ Meta OpenEnv</a>'
948
+ "</div>"
949
  "</div>"
950
  )
951
 
952
  with gr.Tabs():
953
  # ── Tab 1: Run Episode ────────────────────────────────
954
  with gr.Tab("Run Episode"):
955
+ # Preset buttons row — one-click task+policy configuration.
956
+ gr.Markdown("**Quick presets** β€” click any to load a known-good configuration.")
957
+ with gr.Row():
958
+ preset_buttons = [
959
+ gr.Button(p[0], size="sm", scale=1) for p in _PRESETS
960
+ ]
961
+ preset_blurb = gr.Markdown("")
962
+
963
  with gr.Row():
964
  dd_task = gr.Dropdown(
965
  label="Task", choices=task_ids, value=default, scale=3
 
969
  ["easy", "medium", "hard", "nightmare"],
970
  label="Difficulty",
971
  value="easy",
972
+ visible=False,
973
  scale=2,
974
  )
975
+ nb_seed = gr.Number(
976
+ label="Seed", value=42, precision=0, visible=False, scale=1
977
+ )
978
  with gr.Row():
979
  rd_policy = gr.Radio(
980
+ choices=list(_POLICY_CHOICES),
 
 
 
981
  value="heuristic",
982
  label="Policy",
983
  scale=4,
984
  )
985
  btn_run = gr.Button("Run Episode", variant="primary", scale=1)
986
 
987
+ # LLM-policy inputs — only visible when "LLM" is selected.
988
+ with gr.Accordion(
989
+ "LLM policy settings (used when 'LLM' is selected above)",
990
+ open=False,
991
+ visible=False,
992
+ ) as llm_accordion:
993
+ gr.Markdown(
994
+ "Bring your own OpenAI-compatible endpoint. Defaults match the "
995
+ "Hugging Face router; OpenRouter, Groq, Together, Fireworks, "
996
+ "and Anthropic-compatible gateways all work. **Leave fields "
997
+ "blank** to inherit `HF_TOKEN` / `OPENROUTER_API_KEY` / "
998
+ "`GROQ_API_KEY` / `API_BASE_URL` / `MODEL_NAME` from the "
999
+ "environment (set them as Space Secrets when deploying)."
1000
+ )
1001
+ with gr.Row():
1002
+ tb_llm_base = gr.Textbox(
1003
+ label="Base URL",
1004
+ value="https://router.huggingface.co/v1",
1005
+ scale=2,
1006
+ )
1007
+ tb_llm_model = gr.Textbox(
1008
+ label="Model",
1009
+ value="Qwen/Qwen2.5-72B-Instruct",
1010
+ scale=2,
1011
+ )
1012
+ tb_llm_key = gr.Textbox(
1013
+ label="API key",
1014
+ value="",
1015
+ type="password",
1016
+ scale=2,
1017
+ )
1018
+
1019
  md_status = gr.Markdown(
1020
+ "Pick a task + policy and click **Run Episode**. Run the same task "
1021
+ "under each of the four scripted policies (heuristic, escalate-all, "
1022
+ "concede-all, naive) to reproduce the discrimination gradient β€” naive "
1023
+ "β†’ 0.000, concede-all β†’ ~0.44, escalate-all β†’ ~0.77, heuristic β†’ ~0.81. "
1024
+ "Or pick **LLM** and bring your own model. For a side-by-side view, "
1025
+ "open the **Compare policies** tab."
1026
  )
1027
 
1028
  with gr.Row(equal_height=True):
 
1044
  datatype=["number", "str", "str", "str", "str", "number", "str"],
1045
  interactive=False,
1046
  wrap=True,
1047
+ label="Step Trace (βœ“ accepted Β· ⚠ no-op Β· βœ— rejected)",
1048
  )
1049
 
1050
  with gr.Row(equal_height=True):
1051
  with gr.Column(scale=1):
1052
+ html_round = gr.HTML(label="Dispute Round (issuer messages)")
1053
  with gr.Column(scale=1):
1054
  html_arb = gr.HTML(label="Arbitration")
1055
 
1056
  html_grader = gr.HTML(label="Grader Report")
1057
+ with gr.Accordion("Raw grader JSON (export-friendly)", open=False):
1058
+ json_raw = gr.JSON(label="Raw JSON", show_label=False)
1059
 
1060
  btn_run.click(
1061
  fn=run_episode,
1062
+ inputs=[
1063
+ dd_task, cb_gen, rd_diff, nb_seed, rd_policy,
1064
+ tb_llm_base, tb_llm_key, tb_llm_model,
1065
+ ],
1066
  outputs=[
1067
  md_status,
1068
  html_queue,
 
1075
  ],
1076
  )
1077
 
1078
+ # Generated-checkbox visibility callback.
1079
+ def _toggle_generated(generated: bool):
1080
+ return (
1081
+ gr.update(visible=generated),
1082
+ gr.update(visible=generated),
1083
+ )
1084
+
1085
+ cb_gen.change(
1086
+ fn=_toggle_generated,
1087
+ inputs=[cb_gen],
1088
+ outputs=[rd_diff, nb_seed],
1089
+ )
1090
+
1091
+ # Show LLM accordion only when 'llm' policy is selected.
1092
+ def _toggle_llm(policy: str):
1093
+ return gr.update(visible=(policy == "llm"), open=(policy == "llm"))
1094
+
1095
+ rd_policy.change(
1096
+ fn=_toggle_llm, inputs=[rd_policy], outputs=[llm_accordion]
1097
+ )
1098
+
1099
+ # Wire each preset button to populate the inputs atomically.
1100
+ def _make_preset_handler(preset):
1101
+ label, t_id, gen, diff, seed_v, pol, blurb = preset
1102
+
1103
+ def _apply():
1104
+ return (
1105
+ t_id, # dd_task
1106
+ gen, # cb_gen
1107
+ gr.update(value=diff, visible=gen), # rd_diff
1108
+ gr.update(value=seed_v, visible=gen), # nb_seed
1109
+ pol, # rd_policy
1110
+ gr.update(visible=(pol == "llm")), # llm_accordion
1111
+ f"**Preset:** {label} β€” {blurb}", # preset_blurb
1112
+ )
1113
+
1114
+ return _apply
1115
+
1116
+ for btn, preset in zip(preset_buttons, _PRESETS):
1117
+ btn.click(
1118
+ fn=_make_preset_handler(preset),
1119
+ inputs=[],
1120
+ outputs=[
1121
+ dd_task,
1122
+ cb_gen,
1123
+ rd_diff,
1124
+ nb_seed,
1125
+ rd_policy,
1126
+ llm_accordion,
1127
+ preset_blurb,
1128
+ ],
1129
+ )
1130
+
1131
+ # ── Tab 2: Compare policies ──────────────────────────
1132
+ with gr.Tab("Compare policies"):
1133
+ gr.Markdown(
1134
+ "Run all four scripted policies on the **same task / seed** and see "
1135
+ "the discrimination gradient at a glance. No provider calls, no LLM, "
1136
+ "fully deterministic β€” this is the empirical evidence behind the "
1137
+ "README's `+0.813` discrimination delta claim."
1138
+ )
1139
+ with gr.Row():
1140
+ cmp_task = gr.Dropdown(
1141
+ label="Task", choices=task_ids, value=default, scale=3
1142
+ )
1143
+ cmp_gen = gr.Checkbox(label="Generated", value=False, scale=1)
1144
+ cmp_diff = gr.Radio(
1145
+ ["easy", "medium", "hard", "nightmare"],
1146
+ label="Difficulty",
1147
+ value="easy",
1148
+ visible=False,
1149
+ scale=2,
1150
+ )
1151
+ cmp_seed = gr.Number(
1152
+ label="Seed", value=42, precision=0, visible=False, scale=1
1153
+ )
1154
+ btn_cmp = gr.Button("Run all 4 policies", variant="primary")
1155
+ cmp_md = gr.Markdown("")
1156
+ cmp_html = gr.HTML(label="Final-score comparison")
1157
+ cmp_table = gr.Dataframe(
1158
+ headers=["Policy", "Score", "Steps", "Summary"],
1159
+ datatype=["str", "str", "number", "str"],
1160
+ interactive=False,
1161
+ wrap=True,
1162
+ label="Per-policy summary",
1163
+ )
1164
+ btn_cmp.click(
1165
+ fn=run_compare,
1166
+ inputs=[cmp_task, cmp_gen, cmp_diff, cmp_seed],
1167
+ outputs=[cmp_md, cmp_html, cmp_table],
1168
+ )
1169
+ cmp_gen.change(
1170
+ fn=_toggle_generated,
1171
+ inputs=[cmp_gen],
1172
+ outputs=[cmp_diff, cmp_seed],
1173
+ )
1174
+
1175
+ # ── Tab 3: Task Catalog ──────────────────────────────
1176
  with gr.Tab("Task Catalog"):
1177
  catalog_rows = []
1178
  for t in tasks:
 
1211
 
1212
  # ── Tab 4: Environment Info ───────────────────────────
1213
  with gr.Tab("Environment"):
1214
+ gr.Markdown(_environment_tab_markdown())
1215
+
1216
+ # ── Tab 5: Rubric Tree ────────────────────────────────
1217
+ with gr.Tab("Rubric Tree"):
1218
  gr.Markdown(
1219
+ "Live introspection of `env.rubric.named_rubrics()` β€” the same composable "
1220
+ "OpenEnv `Rubric` tree that grades every step. Weights and structure below "
1221
+ "are read from the running environment, not hardcoded."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1222
  )
1223
+ gr.HTML(_rubric_tree_html())
1224
+ gr.Markdown(
1225
+ "See [`docs/METHOD.md`](https://github.com/MitudruDutta/chargebackops/blob/main/docs/METHOD.md) "
1226
+ "and [`docs/SPECIFICATION_GAMING.md`](https://github.com/MitudruDutta/chargebackops/blob/main/docs/SPECIFICATION_GAMING.md) "
1227
+ "for the full design and the GRPO failure-mode write-up."
1228
+ )
1229
+
1230
+ # ── Tab 6: Training Results ───────────────────────────
1231
+ with gr.Tab("Training Results"):
1232
+ gr.Markdown(_training_tab_markdown())
1233
+ for caption, fname in (
1234
+ (
1235
+ "**Cross-iteration training curve.** Iter 3 plateaued below the "
1236
+ "heuristic at 0.728. Iter 5 plateaued *bit-exactly* at the heuristic "
1237
+ "at 0.8132 β€” the signature of the eval-fallback exploit, not "
1238
+ "convergent learning.",
1239
+ "training_curve_cross_iter.png",
1240
+ ),
1241
+ (
1242
+ "**Iter-5 eval-score attribution.** The trained policy contributes "
1243
+ "0.000 (every action is rejected by env validation). The eval rollout "
1244
+ "helper's heuristic-fallback path contributes 0.8132 β€” i.e. all of it.",
1245
+ "gaming_attribution.png",
1246
+ ),
1247
+ (
1248
+ "**Scripted-policy discrimination gradient.** The 8-dimension "
1249
+ "`WeightedSum` plus the deadline `Gate` defeats every degenerate "
1250
+ "policy: empty-packet zeros out, concede-all caps at 0.44, "
1251
+ "escalate-all caps at 0.77.",
1252
+ "discrimination_gradient.png",
1253
+ ),
1254
+ (
1255
+ "**8-dimension OpenEnv rubric weights**, grouped by category "
1256
+ "(decision / packet / process / terminal). 40% of reward sits on "
1257
+ "decision + terminal β€” where economically irrational policies "
1258
+ "bleed money fastest.",
1259
+ "rubric_weights.png",
1260
+ ),
1261
+ (
1262
+ "**Iter-5 per-difficulty curves.** Post-step-80 plateau is the "
1263
+ "fallback heuristic across every difficulty band; see "
1264
+ "SPECIFICATION_GAMING.md for the diagnosis.",
1265
+ "training_curve_by_family.png",
1266
+ ),
1267
+ ):
1268
+ src = _figure_data_uri(fname)
1269
+ if src is None:
1270
+ gr.Markdown(
1271
+ f"_(figure `{fname}` not bundled β€” see "
1272
+ f"[`docs/figures/{fname}`](https://github.com/MitudruDutta/chargebackops/blob/main/docs/figures/{fname}))_"
1273
+ )
1274
+ continue
1275
+ gr.Markdown(caption)
1276
+ gr.HTML(
1277
+ f'<img src="{src}" style="width:100%;max-width:1100px;'
1278
+ f'border:1px solid #2a2a2a;border-radius:6px;margin:6px 0;" '
1279
+ f'alt="{fname}" />'
1280
+ )
1281
 
1282
  return demo
1283
+
1284
+
1285
+ # ---------------------------------------------------------------------------
1286
+ # Tab content builders (called once at app build; keep cheap)
1287
+ # ---------------------------------------------------------------------------
1288
+
1289
+
1290
+ def _environment_tab_markdown() -> str:
1291
+ """Render the Environment tab content from live constants.
1292
+
1293
+ Reads action types from ``core.models.ActionType`` and the rubric weights
1294
+ from ``evaluation.rubrics.CASE_DIMENSION_WEIGHTS`` so this tab can never
1295
+ drift from the source of truth.
1296
+ """
1297
+
1298
+ try:
1299
+ from core.models import ActionType # type: ignore[attr-defined]
1300
+ except ImportError: # pragma: no cover
1301
+ from ..core.models import ActionType # type: ignore[attr-defined]
1302
+
1303
+ # ``Literal`` exposes its members via ``__args__``.
1304
+ actions: tuple[str, ...] = tuple(getattr(ActionType, "__args__", ()))
1305
+ n_actions = len(actions)
1306
+
1307
+ r1 = (
1308
+ "select_case", "inspect_case", "query_system", "retrieve_policy",
1309
+ "add_evidence", "remove_evidence", "set_strategy",
1310
+ "submit_representment", "resolve_case",
1311
+ )
1312
+ r23 = ("respond_to_pre_arb", "escalate_to_arbitration", "accept_arbitration_loss")
1313
+ long_horizon = ("wait_for_updates",)
1314
+
1315
+ def _join(items: tuple[str, ...]) -> str:
1316
+ return " &middot; ".join(f"`{name}`" for name in items)
1317
+
1318
+ rubric_rows = "\n".join(
1319
+ f"| {label} | {int(round(weight * 100))}% | {scoring} |"
1320
+ for label, weight, scoring in zip(
1321
+ _DIMENSION_LABELS, CASE_DIMENSION_WEIGHTS, _DIMENSION_SCORING
1322
+ )
1323
+ )
1324
+
1325
+ return (
1326
+ f"## Action Space ({n_actions} typed actions)\n\n"
1327
+ f"**Round 1 β€” Representment:** {_join(r1)}\n\n"
1328
+ f"**Round 2/3 β€” Pre-arb &amp; Arbitration:** {_join(r23)}\n\n"
1329
+ f"**Long-horizon backlog:** {_join(long_horizon)}\n\n"
1330
+ "## Merchant Systems (6)\n\n"
1331
+ "`orders` &middot; `payment` &middot; `shipping` &middot; "
1332
+ "`support` &middot; `refunds` &middot; `risk`\n\n"
1333
+ "## Grading (8 dimensions)\n\n"
1334
+ "Weights are read live from `evaluation.rubrics.CASE_DIMENSION_WEIGHTS`.\n\n"
1335
+ "| Dimension | Weight | Scoring |\n"
1336
+ "|---|---|---|\n"
1337
+ f"{rubric_rows}\n\n"
1338
+ "## Scripted policies (Run Episode tab)\n\n"
1339
+ "| Policy | What it does | Headline avg |\n"
1340
+ "|---|---|---|\n"
1341
+ "| `naive` | Submit empty packet, no evidence, no policy work | 0.000 |\n"
1342
+ "| `concede_all` | Always set strategy `accept_chargeback` and resolve | 0.444 |\n"
1343
+ "| `escalate_all` | Contest like the heuristic, then always escalate | 0.767 |\n"
1344
+ "| `heuristic` | EV-rational, fully offline | **0.813** |\n\n"
1345
+ "## Card Networks\n\n"
1346
+ "| Reason Code | Visa | Mastercard |\n"
1347
+ "|---|---|---|\n"
1348
+ "| Goods Not Received | 13.1 (30 days) | 4855 (45 days) |\n"
1349
+ "| Fraud CNP | 10.4 (30 days) | 4837 (45 days) |\n"
1350
+ "| Credit Not Processed | 13.6 (30 days) | 4860 (45 days) |\n"
1351
+ "| Duplicate Processing | 12.4 (30 days) | 4834 (45 days) |\n"
1352
+ "| Product Not As Described | 13.3 (30 days) | 4853 (45 days) |\n"
1353
+ "| Service Not Provided | 13.1 (30 days) | 4855 (45 days) |\n"
1354
+ )
1355
+
1356
+
1357
+ def _rubric_tree_html() -> str:
1358
+ """Render the live ``env.rubric.named_rubrics()`` tree as nested HTML.
1359
+
1360
+ Also explicitly surfaces the deadline ``Gate(CaseAbandonedRubric)`` that
1361
+ sits on top of the per-case ``WeightedSum`` β€” OpenEnv's default walk
1362
+ iterates registered child rubrics only, and the Gate is a sibling of the
1363
+ aggregator inside :class:`CaseRubric`.
1364
+
1365
+ Falls back to a static snapshot if introspection fails for any reason
1366
+ (e.g. an old OpenEnv build) so the demo never breaks on this tab.
1367
+ """
1368
+
1369
+ try:
1370
+ env = ChargebackOpsEnvironment()
1371
+ named = list(env.rubric.named_rubrics())
1372
+ except Exception as exc: # pragma: no cover β€” defensive fallback
1373
+ return (
1374
+ f"<pre style='color:#ef4444;'>Could not introspect rubric tree: "
1375
+ f"{type(exc).__name__}: {exc}</pre>"
1376
+ )
1377
+
1378
+ # Map weights onto leaf rubrics by name. CASE_DIMENSION_NAMES is the
1379
+ # canonical order the WeightedSum was built with; weights align by index.
1380
+ weight_by_dim = dict(zip(CASE_DIMENSION_NAMES, CASE_DIMENSION_WEIGHTS))
1381
+
1382
+ rows: list[str] = []
1383
+ rows.append(
1384
+ "<table class='queue-table' style='font-family:ui-monospace,monospace;'>"
1385
+ "<tr><th>Path</th><th>Class</th><th>Weight / Role</th></tr>"
1386
+ )
1387
+
1388
+ # Explicitly inject the deadline gate row above the aggregator subtree,
1389
+ # since some OpenEnv versions don't yield it via named_rubrics().
1390
+ deadline_gate_injected = False
1391
+ for path, rubric in named:
1392
+ cls_name = type(rubric).__name__
1393
+ if (
1394
+ not deadline_gate_injected
1395
+ and cls_name == "WeightedSum"
1396
+ and path.endswith("aggregator")
1397
+ ):
1398
+ parent = path.rsplit(".", 1)[0]
1399
+ rows.append(
1400
+ f"<tr><td>{'&nbsp;' * (parent.count('.') * 4 + 4)}"
1401
+ f"<code>{parent}.deadline_gate</code></td>"
1402
+ f"<td>Gate(CaseAbandonedRubric)</td>"
1403
+ f"<td style='text-align:right;color:#eab308;'>hard-zero on miss</td></tr>"
1404
+ )
1405
+ deadline_gate_injected = True
1406
+
1407
+ weight_str = "β€”"
1408
+ for dim_name, weight in weight_by_dim.items():
1409
+ tag = "".join(part.capitalize() for part in dim_name.split("_")) + "Rubric"
1410
+ if cls_name == tag:
1411
+ weight_str = f"{int(round(weight * 100))}%"
1412
+ break
1413
+ depth = path.count(".")
1414
+ indent = "&nbsp;" * (depth * 4)
1415
+ rows.append(
1416
+ f"<tr><td>{indent}<code>{path or '(root)'}</code></td>"
1417
+ f"<td>{cls_name}</td>"
1418
+ f"<td style='text-align:right;'>{weight_str}</td></tr>"
1419
+ )
1420
+ rows.append("</table>")
1421
+ return "".join(rows)
1422
+
1423
+
1424
+ # ---------------------------------------------------------------------------
1425
+ # Training Results helpers
1426
+ # ---------------------------------------------------------------------------
1427
+
1428
+
1429
+ def _figure_data_uri(filename: str) -> str | None:
1430
+ """Return a base64 ``data:image/png`` URI for a bundled figure, or None.
1431
+
1432
+ Embedding figures inline avoids dependencies on the static-asset routing
1433
+ of whatever host serves the demo (HF Spaces, FastAPI sub-mount, etc.).
1434
+ """
1435
+
1436
+ path = _FIGURES_DIR / filename
1437
+ if not path.is_file():
1438
+ return None
1439
+ try:
1440
+ data = path.read_bytes()
1441
+ except OSError:
1442
+ return None
1443
+ encoded = base64.b64encode(data).decode("ascii")
1444
+ return f"data:image/png;base64,{encoded}"
1445
+
1446
+
1447
+ def _training_tab_markdown() -> str:
1448
+ return (
1449
+ "## Real training, end-to-end\n\n"
1450
+ "**Pipeline.** Qwen2.5-3B fp16 + LoRA r=16 on a single Colab T4. Phase A is "
1451
+ "supervised fine-tuning on heuristic rollouts; Phase B is GRPO with an outcome-"
1452
+ "based reward (terminal $-PnL after the model's action plus a heuristic tail-"
1453
+ "rollout). The training loop **connects to the live `ChargebackOpsEnvironment`** "
1454
+ "β€” every gradient step is graded by the same rubric and same Issuer adversary "
1455
+ "the eval uses. There is no static dataset shortcut.\n\n"
1456
+ "**Five iterations, three failure modes.** Iter 1 produced total gradient "
1457
+ "collapse (group reward variance β‰ˆ 0). Iter 3 broke through to non-zero gradient "
1458
+ "but plateaued at 0.728. **Iter 5 ran 200 GRPO steps and uncovered a reproducible "
1459
+ "specification-gaming exploit** where the model emits invalid `accept_case` "
1460
+ "actions, triggers the eval rollout helper's heuristic-fallback path, and "
1461
+ "scores bit-exactly the heuristic baseline at 0.8132. The full diagnosis is in "
1462
+ "[`SPECIFICATION_GAMING.md`](https://github.com/MitudruDutta/chargebackops/blob/main/docs/SPECIFICATION_GAMING.md).\n\n"
1463
+ "**Honest trained-vs-untrained delta:** the SFT step at 0.536 β€” **+0.08 absolute, "
1464
+ "+18% relative** over the untrained Qwen2.5-3B base β€” is the only legitimate "
1465
+ "model-attributable improvement on iter 5. We document this honestly because "
1466
+ "the failure mode itself is a research artefact future GRPO recipes can target "
1467
+ "as a benchmark.\n\n"
1468
+ "**Reproduce.** "
1469
+ "[Latest training run (Colab β€” iter 5, 200 GRPO steps)](https://colab.research.google.com/drive/1GtLH6_b10oHlAnnGq4hnBkcGJ-pE_za5?usp=sharing) Β· "
1470
+ "[Previous training run (Colab β€” iter 3, 62 GRPO steps)](https://colab.research.google.com/drive/1AjG3Sv7FnMeOSls6JMzTunkMzlJi_ySu?usp=sharing) Β· "
1471
+ "[`notebooks/train_merchant_agent.ipynb`](https://github.com/MitudruDutta/chargebackops/blob/main/notebooks/train_merchant_agent.ipynb)\n"
1472
+ )