Spaces:
Sleeping
Sleeping
Commit ·
862cfc4
1
Parent(s): 4d7c179
Implement code changes to enhance functionality and improve performance
Browse files- server/demo_ui.py +957 -78
server/demo_ui.py
CHANGED
|
@@ -2,32 +2,164 @@
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
|
|
| 5 |
import os
|
| 6 |
-
from
|
|
|
|
| 7 |
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
import gradio as gr
|
| 11 |
|
| 12 |
try:
|
| 13 |
-
from ..
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
from ..runners.baseline_runner import (
|
| 15 |
_heuristic_pick,
|
| 16 |
_obvious_next_action,
|
| 17 |
candidate_actions,
|
| 18 |
)
|
| 19 |
-
from ..
|
|
|
|
| 20 |
from .chargeback_ops_environment import ChargebackOpsEnvironment
|
| 21 |
except ImportError: # pragma: no cover
|
| 22 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
from runners.baseline_runner import (
|
| 24 |
_heuristic_pick,
|
| 25 |
_obvious_next_action,
|
| 26 |
candidate_actions,
|
| 27 |
)
|
| 28 |
-
from
|
|
|
|
| 29 |
from server.chargeback_ops_environment import ChargebackOpsEnvironment
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
# ---------------------------------------------------------------------------
|
| 33 |
# CSS
|
|
@@ -205,7 +337,16 @@ _DEC_CLASS = {
|
|
| 205 |
}
|
| 206 |
|
| 207 |
|
| 208 |
-
def _round_panel_html(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
vc = observation.visible_case
|
| 210 |
if vc is None:
|
| 211 |
return ""
|
|
@@ -221,14 +362,34 @@ def _round_panel_html(observation) -> str:
|
|
| 221 |
f'</div>'
|
| 222 |
)
|
| 223 |
|
| 224 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
dec = vc.last_issuer_decision
|
| 226 |
dec_cls = _DEC_CLASS.get(dec, "")
|
| 227 |
dec_pretty = dec.replace("_", " ").title()
|
| 228 |
body += f'<div class="issuer-decision {dec_cls}">Issuer: {dec_pretty}</div>'
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
body += f'<div class="issuer-quote">“{vc.last_issuer_rationale}”</div>'
|
| 232 |
|
| 233 |
if vc.pre_arb_evidence_added:
|
| 234 |
ids = ", ".join(vc.pre_arb_evidence_added)
|
|
@@ -329,18 +490,218 @@ def _resolve_task_id(task_id: str, generated: bool, difficulty: str, seed: int)
|
|
| 329 |
return task_id
|
| 330 |
|
| 331 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
def run_episode(
|
| 333 |
-
task_id: str,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
):
|
| 335 |
tid = _resolve_task_id(task_id, generated, difficulty, int(seed))
|
| 336 |
env = ChargebackOpsEnvironment()
|
| 337 |
obs = env.reset(task_id=tid, difficulty=difficulty, seed=int(seed))
|
| 338 |
-
max_steps =
|
| 339 |
rows: list[list[Any]] = []
|
| 340 |
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
header = (
|
| 345 |
f"### {obs.task_title}\n"
|
| 346 |
f"`{obs.task_id}` — {len(obs.queue)} case(s), "
|
|
@@ -351,7 +712,7 @@ def run_episode(
|
|
| 351 |
_queue_html(obs),
|
| 352 |
_budget_html(0, max_steps, 0.0),
|
| 353 |
[row[:] for row in rows],
|
| 354 |
-
_round_panel_html(obs),
|
| 355 |
_arbitration_panel_html(obs),
|
| 356 |
"",
|
| 357 |
None,
|
|
@@ -359,19 +720,65 @@ def run_episode(
|
|
| 359 |
|
| 360 |
step = 0
|
| 361 |
while not obs.done:
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 373 |
step += 1
|
| 374 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
rows.append(
|
| 376 |
[
|
| 377 |
step,
|
|
@@ -380,7 +787,7 @@ def run_episode(
|
|
| 380 |
summary_action.system_name or "",
|
| 381 |
summary_action.strategy or "",
|
| 382 |
round(obs.reward or 0.0, 4),
|
| 383 |
-
obs.last_action_result,
|
| 384 |
]
|
| 385 |
)
|
| 386 |
|
|
@@ -396,7 +803,7 @@ def run_episode(
|
|
| 396 |
_queue_html(obs),
|
| 397 |
_budget_html(step, max_steps, obs.progress_score),
|
| 398 |
[row[:] for row in rows],
|
| 399 |
-
_round_panel_html(obs),
|
| 400 |
_arbitration_panel_html(obs),
|
| 401 |
grader,
|
| 402 |
None,
|
|
@@ -404,19 +811,109 @@ def run_episode(
|
|
| 404 |
|
| 405 |
report = obs.grader_report.model_dump() if obs.grader_report else None
|
| 406 |
sc = f"{obs.grader_report.normalized_score:.3f}" if obs.grader_report else "n/a"
|
| 407 |
-
final_md =
|
|
|
|
|
|
|
|
|
|
| 408 |
yield (
|
| 409 |
final_md,
|
| 410 |
_queue_html(obs),
|
| 411 |
_budget_html(step, max_steps, obs.progress_score),
|
| 412 |
[row[:] for row in rows],
|
| 413 |
-
_round_panel_html(obs),
|
| 414 |
_arbitration_panel_html(obs),
|
| 415 |
_grader_html(report),
|
| 416 |
report,
|
| 417 |
)
|
| 418 |
|
| 419 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 420 |
# ---------------------------------------------------------------------------
|
| 421 |
# Build Gradio app
|
| 422 |
# ---------------------------------------------------------------------------
|
|
@@ -431,17 +928,38 @@ def build_demo() -> gr.Blocks:
|
|
| 431 |
# Inject CSS (Gradio 6 moved css= to launch(); <style> tag works everywhere)
|
| 432 |
gr.HTML(f"<style>{_CSS}</style>")
|
| 433 |
|
| 434 |
-
# Header
|
| 435 |
gr.HTML(
|
| 436 |
'<div class="dashboard-header">'
|
| 437 |
"<h1>ChargebackOps</h1>"
|
| 438 |
-
"<p>Merchant chargeback dispute environment — OpenEnv benchmark
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
"</div>"
|
| 440 |
)
|
| 441 |
|
| 442 |
with gr.Tabs():
|
| 443 |
# ββ Tab 1: Run Episode ββββββββββββββββββββββββββββββββ
|
| 444 |
with gr.Tab("Run Episode"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
with gr.Row():
|
| 446 |
dd_task = gr.Dropdown(
|
| 447 |
label="Task", choices=task_ids, value=default, scale=3
|
|
@@ -451,25 +969,60 @@ def build_demo() -> gr.Blocks:
|
|
| 451 |
["easy", "medium", "hard", "nightmare"],
|
| 452 |
label="Difficulty",
|
| 453 |
value="easy",
|
|
|
|
| 454 |
scale=2,
|
| 455 |
)
|
| 456 |
-
nb_seed = gr.Number(
|
|
|
|
|
|
|
| 457 |
with gr.Row():
|
| 458 |
rd_policy = gr.Radio(
|
| 459 |
-
choices=
|
| 460 |
-
("Heuristic (smart baseline)", "heuristic"),
|
| 461 |
-
("Naive (always concede)", "bad"),
|
| 462 |
-
],
|
| 463 |
value="heuristic",
|
| 464 |
label="Policy",
|
| 465 |
scale=4,
|
| 466 |
)
|
| 467 |
btn_run = gr.Button("Run Episode", variant="primary", scale=1)
|
| 468 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 469 |
md_status = gr.Markdown(
|
| 470 |
-
"Pick a task + policy and click **Run Episode**.
|
| 471 |
-
"
|
| 472 |
-
"
|
|
|
|
|
|
|
|
|
|
| 473 |
)
|
| 474 |
|
| 475 |
with gr.Row(equal_height=True):
|
|
@@ -491,21 +1044,25 @@ def build_demo() -> gr.Blocks:
|
|
| 491 |
datatype=["number", "str", "str", "str", "str", "number", "str"],
|
| 492 |
interactive=False,
|
| 493 |
wrap=True,
|
| 494 |
-
label="Step Trace",
|
| 495 |
)
|
| 496 |
|
| 497 |
with gr.Row(equal_height=True):
|
| 498 |
with gr.Column(scale=1):
|
| 499 |
-
html_round = gr.HTML(label="Dispute Round")
|
| 500 |
with gr.Column(scale=1):
|
| 501 |
html_arb = gr.HTML(label="Arbitration")
|
| 502 |
|
| 503 |
html_grader = gr.HTML(label="Grader Report")
|
| 504 |
-
|
|
|
|
| 505 |
|
| 506 |
btn_run.click(
|
| 507 |
fn=run_episode,
|
| 508 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
| 509 |
outputs=[
|
| 510 |
md_status,
|
| 511 |
html_queue,
|
|
@@ -518,7 +1075,104 @@ def build_demo() -> gr.Blocks:
|
|
| 518 |
],
|
| 519 |
)
|
| 520 |
|
| 521 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 522 |
with gr.Tab("Task Catalog"):
|
| 523 |
catalog_rows = []
|
| 524 |
for t in tasks:
|
|
@@ -557,37 +1211,262 @@ def build_demo() -> gr.Blocks:
|
|
| 557 |
|
| 558 |
# ββ Tab 3: Environment Info βββββββββββββββββββββββββββ
|
| 559 |
with gr.Tab("Environment"):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 560 |
gr.Markdown(
|
| 561 |
-
"
|
| 562 |
-
"
|
| 563 |
-
"
|
| 564 |
-
"`remove_evidence` · `set_strategy` · `submit_representment` · "
|
| 565 |
-
"`resolve_case`\n\n"
|
| 566 |
-
"**Round 2/3 β Pre-arb & Arbitration:** `respond_to_pre_arb` · "
|
| 567 |
-
"`escalate_to_arbitration` · `accept_arbitration_loss`\n\n"
|
| 568 |
-
"## Merchant Systems (6)\n\n"
|
| 569 |
-
"`orders` · `payment` · `shipping` · "
|
| 570 |
-
"`support` · `refunds` · `risk`\n\n"
|
| 571 |
-
"## Grading (8 dimensions)\n\n"
|
| 572 |
-
"| Dimension | Weight | Scoring |\n"
|
| 573 |
-
"|---|---|---|\n"
|
| 574 |
-
"| Strategy Correctness | 20% | 1.0 optimal, 0.35 acceptable, 0.0 wrong |\n"
|
| 575 |
-
"| Evidence Quality | 15% | Required + helpful coverage, harmful penalty |\n"
|
| 576 |
-
"| Packet Validity | 10% | Binary: all required, zero harmful |\n"
|
| 577 |
-
"| Deadline Compliance | 10% | Binary: resolved before deadline |\n"
|
| 578 |
-
"| Efficiency | 10% | Penalises waste, rewards early concession |\n"
|
| 579 |
-
"| Outcome Quality | 10% | 1.0 optimal, 0.4 acceptable, 0.0 wrong |\n"
|
| 580 |
-
"| Note Quality | 5% | Policy keywords + evidence refs |\n"
|
| 581 |
-
"| Escalation ROI | 20% | EV-rational arbitration: P(win)Β·amount vs $250 fee |\n\n"
|
| 582 |
-
"## Card Networks\n\n"
|
| 583 |
-
"| Reason Code | Visa | Mastercard |\n"
|
| 584 |
-
"|---|---|---|\n"
|
| 585 |
-
"| Goods Not Received | 13.1 (30 days) | 4855 (45 days) |\n"
|
| 586 |
-
"| Fraud CNP | 10.4 (30 days) | 4837 (45 days) |\n"
|
| 587 |
-
"| Credit Not Processed | 13.6 (30 days) | 4860 (45 days) |\n"
|
| 588 |
-
"| Duplicate Processing | 12.4 (30 days) | 4834 (45 days) |\n"
|
| 589 |
-
"| Product Not As Described | 13.3 (30 days) | 4853 (45 days) |\n"
|
| 590 |
-
"| Service Not Provided | 13.1 (30 days) | 4855 (45 days) |\n"
|
| 591 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 592 |
|
| 593 |
return demo
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
+
import base64
|
| 6 |
import os
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any, Callable
|
| 9 |
|
| 10 |
+
# Ensure matplotlib has a writable config dir on locked-down hosts (e.g. HF
|
| 11 |
+
# Spaces). Guarded so importing this module from a notebook doesn't pollute
|
| 12 |
+
# the user's environment unnecessarily.
|
| 13 |
+
if not os.environ.get("MPLCONFIGDIR"):
|
| 14 |
+
os.environ["MPLCONFIGDIR"] = "/tmp/matplotlib"
|
| 15 |
|
| 16 |
import gradio as gr
|
| 17 |
|
| 18 |
try:
|
| 19 |
+
from ..core.models import ChargebackOpsAction
|
| 20 |
+
from ..evaluation.rubrics import (
|
| 21 |
+
CASE_DIMENSION_NAMES,
|
| 22 |
+
CASE_DIMENSION_WEIGHTS,
|
| 23 |
+
)
|
| 24 |
from ..runners.baseline_runner import (
|
| 25 |
_heuristic_pick,
|
| 26 |
_obvious_next_action,
|
| 27 |
candidate_actions,
|
| 28 |
)
|
| 29 |
+
from ..runners.benchmark_runner import POLICY_REGISTRY
|
| 30 |
+
from ..scenarios.simulation import get_task, list_tasks
|
| 31 |
from .chargeback_ops_environment import ChargebackOpsEnvironment
|
| 32 |
except ImportError: # pragma: no cover
|
| 33 |
+
from core.models import ChargebackOpsAction
|
| 34 |
+
from evaluation.rubrics import (
|
| 35 |
+
CASE_DIMENSION_NAMES,
|
| 36 |
+
CASE_DIMENSION_WEIGHTS,
|
| 37 |
+
)
|
| 38 |
from runners.baseline_runner import (
|
| 39 |
_heuristic_pick,
|
| 40 |
_obvious_next_action,
|
| 41 |
candidate_actions,
|
| 42 |
)
|
| 43 |
+
from runners.benchmark_runner import POLICY_REGISTRY
|
| 44 |
+
from scenarios.simulation import get_task, list_tasks
|
| 45 |
from server.chargeback_ops_environment import ChargebackOpsEnvironment
|
| 46 |
|
| 47 |
+
# OpenAI-compatible LLM policy is optional — the demo gracefully degrades to
|
| 48 |
+
# scripted policies if the openai SDK or runners.inference is unavailable.
|
| 49 |
+
try: # pragma: no cover β exercised only when LLM policy is selected
|
| 50 |
+
from openai import OpenAI # noqa: F401
|
| 51 |
+
try:
|
| 52 |
+
from ..runners.inference import _pick_with_openai_client
|
| 53 |
+
except ImportError:
|
| 54 |
+
from runners.inference import _pick_with_openai_client
|
| 55 |
+
_LLM_POLICY_AVAILABLE = True
|
| 56 |
+
except Exception: # pragma: no cover
|
| 57 |
+
_pick_with_openai_client = None # type: ignore[assignment]
|
| 58 |
+
_LLM_POLICY_AVAILABLE = False
|
| 59 |
+
|
| 60 |
+
# Path to the bundled hero figures (used by the Training Results tab).
|
| 61 |
+
_FIGURES_DIR = Path(__file__).resolve().parents[1] / "docs" / "figures"
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# ---------------------------------------------------------------------------
|
| 65 |
+
# Static metadata
|
| 66 |
+
# ---------------------------------------------------------------------------
|
| 67 |
+
|
| 68 |
+
# Human-readable display labels for the 8 rubric dimensions (in canonical order).
|
| 69 |
+
_DIMENSION_LABELS: tuple[str, ...] = (
|
| 70 |
+
"Strategy Correctness",
|
| 71 |
+
"Evidence Quality",
|
| 72 |
+
"Packet Validity",
|
| 73 |
+
"Deadline Compliance",
|
| 74 |
+
"Efficiency",
|
| 75 |
+
"Outcome Quality",
|
| 76 |
+
"Note Quality",
|
| 77 |
+
"Escalation ROI",
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
# Per-dimension scoring summary (kept short so the table fits on one screen).
|
| 81 |
+
_DIMENSION_SCORING: tuple[str, ...] = (
    # One short scoring summary per rubric dimension, in the same order as
    # _DIMENSION_LABELS. The separator is a real middle dot (U+00B7); the
    # mis-decoded "Β·" byte pair must not leak into the rendered table.
    "1.0 optimal · 0.35 acceptable · 0.0 wrong",
    "Required + helpful coverage; harmful evidence penalised",
    "Binary: all required evidence + zero harmful",
    "Binary: case resolved before deadline",
    "Penalises waste; rewards early concession",
    "1.0 optimal · 0.4 acceptable · 0.0 wrong",
    "Policy keywords + evidence references",
    "EV-rational arbitration: P(win)·amount vs $250 fee",
)
|
| 91 |
+
|
| 92 |
+
# Selectable policies as (label shown to user, registry key) pairs.
# Order is intentional: best → worst, so the radio reads top-to-bottom as a
# discrimination ladder. The "llm" entry is not a scripted policy; it is
# resolved at run time from the LLM settings.
_POLICY_CHOICES: tuple[tuple[str, str], ...] = (
    ("Heuristic — EV-rational baseline", "heuristic"),
    ("Escalate-all — contest then always escalate", "escalate_all"),
    ("Concede-all — always accept the chargeback", "concede_all"),
    ("Naive — submit empty packet, no evidence", "naive"),
    ("LLM (OpenAI-compatible API)", "llm"),
)
|
| 102 |
+
_POLICY_LABEL_BY_KEY: dict[str, str] = {
|
| 103 |
+
key: label for label, key in _POLICY_CHOICES
|
| 104 |
+
}
|
| 105 |
+
# Subset used by the Compare tab β scripted-only, deterministic, no API calls.
|
| 106 |
+
_COMPARE_POLICIES: tuple[str, ...] = (
|
| 107 |
+
"naive",
|
| 108 |
+
"concede_all",
|
| 109 |
+
"escalate_all",
|
| 110 |
+
"heuristic",
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
# One-click presets for the Run-Episode tab. Each preset is
|
| 114 |
+
# (button_label, task_id, generated_flag, difficulty, seed, recommended_policy, blurb).
|
| 115 |
+
_PRESETS: tuple[tuple[str, str, bool, str, int, str, str], ...] = (
    # Each row: (button_label, task_id, generated_flag, difficulty, seed,
    #            recommended_policy, blurb).
    # Blurbs are user-facing text, so they carry real em dashes rather than
    # the mis-decoded "β" residue.
    (
        "Easy contestable",
        "goods_not_received_easy",
        False,
        "easy",
        42,
        "heuristic",
        "Goods-not-received with strong evidence — heuristic should win round 1.",
    ),
    (
        "Queue optimization (hard)",
        "queue_optimization_hard",
        False,
        "hard",
        42,
        "heuristic",
        "Triage a heterogeneous queue under tight deadlines — exercises EV reasoning.",
    ),
    (
        "Long-horizon backlog",
        "monthly_dispute_backlog_marathon",
        False,
        "medium",
        42,
        "heuristic",
        "12 cases over 60 steps with delayed evidence; tests scheduling + waiting.",
    ),
    (
        "Generated nightmare",
        "generated_nightmare_s31",
        True,
        "nightmare",
        31,
        "heuristic",
        "Adversarial parametric task — even the heuristic struggles.",
    ),
    (
        "Compare all 4 policies",
        "goods_not_received_easy",
        False,
        "easy",
        42,
        "heuristic",
        "Open the Compare tab — same task, all four scripted policies side-by-side.",
    ),
)
|
| 162 |
+
|
| 163 |
|
| 164 |
# ---------------------------------------------------------------------------
|
| 165 |
# CSS
|
|
|
|
| 337 |
}
|
| 338 |
|
| 339 |
|
| 340 |
+
def _round_panel_html(
|
| 341 |
+
observation, history: list[dict[str, str]] | None = None
|
| 342 |
+
) -> str:
|
| 343 |
+
"""Render the visible case's round panel, including a chronological
|
| 344 |
+
issuer-message log so multi-round disputes show every R1/R2/R3 message.
|
| 345 |
+
|
| 346 |
+
``history`` is a list of ``{round, decision, rationale}`` dicts the caller
|
| 347 |
+
accumulates across steps.
|
| 348 |
+
"""
|
| 349 |
+
|
| 350 |
vc = observation.visible_case
|
| 351 |
if vc is None:
|
| 352 |
return ""
|
|
|
|
| 362 |
f'</div>'
|
| 363 |
)
|
| 364 |
|
| 365 |
+
# Show full issuer-message history if we have it, else fall back to the
|
| 366 |
+
# last-message snapshot from the observation.
|
| 367 |
+
rendered_any = False
|
| 368 |
+
if history:
|
| 369 |
+
for entry in history:
|
| 370 |
+
ent_rnd = entry.get("round", "?")
|
| 371 |
+
ent_dec = entry.get("decision") or ""
|
| 372 |
+
ent_rat = entry.get("rationale") or ""
|
| 373 |
+
ent_badge_cls = f"round-{min(int(ent_rnd) if str(ent_rnd).isdigit() else 1, 3)}"
|
| 374 |
+
dec_cls = _DEC_CLASS.get(ent_dec, "")
|
| 375 |
+
dec_pretty = ent_dec.replace("_", " ").title() if ent_dec else "(no decision)"
|
| 376 |
+
body += (
|
| 377 |
+
f'<div style="margin-top:8px;">'
|
| 378 |
+
f'<span class="round-badge {ent_badge_cls}">R{ent_rnd}</span>'
|
| 379 |
+
f'<span class="issuer-decision {dec_cls}">Issuer: {dec_pretty}</span>'
|
| 380 |
+
f'</div>'
|
| 381 |
+
)
|
| 382 |
+
if ent_rat:
|
| 383 |
+
body += f'<div class="issuer-quote">“{ent_rat}”</div>'
|
| 384 |
+
rendered_any = True
|
| 385 |
+
|
| 386 |
+
if not rendered_any and vc.last_issuer_decision:
|
| 387 |
dec = vc.last_issuer_decision
|
| 388 |
dec_cls = _DEC_CLASS.get(dec, "")
|
| 389 |
dec_pretty = dec.replace("_", " ").title()
|
| 390 |
body += f'<div class="issuer-decision {dec_cls}">Issuer: {dec_pretty}</div>'
|
| 391 |
+
if vc.last_issuer_rationale:
|
| 392 |
+
body += f'<div class="issuer-quote">“{vc.last_issuer_rationale}”</div>'
|
|
|
|
| 393 |
|
| 394 |
if vc.pre_arb_evidence_added:
|
| 395 |
ids = ", ".join(vc.pre_arb_evidence_added)
|
|
|
|
| 490 |
return task_id
|
| 491 |
|
| 492 |
|
| 493 |
+
def _build_llm_policy(
    base_url: str, api_key: str, model_name: str
) -> tuple[Callable[[dict[str, Any]], ChargebackOpsAction | None], str]:
    """Return ``(policy_fn, label)`` calling an OpenAI-compatible chat model.

    The returned policy generates candidates, short-circuits when there is a
    single or "obvious" candidate, and otherwise asks the LLM to pick from the
    shortlist. On any failure inside the LLM pick it falls back to the
    heuristic so the demo never freezes mid-stream.

    UI fields take precedence; blanks fall back to environment variables
    (``API_BASE_URL``, ``MODEL_NAME``, and the key variables ``HF_TOKEN`` /
    ``API_KEY`` / ``OPENROUTER_API_KEY`` / ``GROQ_API_KEY``). This lets HF
    Space operators wire credentials via Space Secrets without the public
    demo asking visitors for keys.

    Args:
        base_url: OpenAI-compatible endpoint; blank means "resolve from env".
        api_key: API key; blank means "resolve from env".
        model_name: Model identifier; blank means ``MODEL_NAME`` or a
            provider-appropriate default.

    Returns:
        A ``(policy_fn, label)`` pair, where ``policy_fn`` maps an observation
        dict to an action (or ``None`` when there are no candidates) and
        ``label`` is ``"LLM (<model_name>)"``.

    Raises:
        RuntimeError: If the LLM dependencies failed to import, or no API key
            can be resolved from the UI or the environment.
    """

    if not _LLM_POLICY_AVAILABLE or _pick_with_openai_client is None:
        raise RuntimeError(
            "openai SDK is not available — install `openai` to use the LLM policy."
        )

    base_url = (base_url or "").strip()
    api_key = (api_key or "").strip()
    model_name = (model_name or "").strip()

    # Resolve the endpoint (and from it the provider) BEFORE choosing an env
    # key: explicit base_url first, then API_BASE_URL, then whichever
    # provider-specific key variable is present.
    provider: str = ""
    if not base_url:
        base_url = os.getenv("API_BASE_URL", "").strip()
    if base_url:
        lowered = base_url.lower()
        if "groq" in lowered:
            provider = "groq"
        elif "openrouter" in lowered:
            provider = "openrouter"
        elif "huggingface" in lowered or "hf.space" in lowered:
            provider = "hf"
        elif "openai.com" in lowered:
            provider = "openai"
    elif os.getenv("GROQ_API_KEY"):
        base_url, provider = "https://api.groq.com/openai/v1", "groq"
    elif os.getenv("OPENROUTER_API_KEY"):
        base_url, provider = "https://openrouter.ai/api/v1", "openrouter"
    else:
        base_url, provider = "https://router.huggingface.co/v1", "hf"

    if not api_key:
        # Prefer the key variable that belongs to the resolved provider so a
        # host with several secrets set (e.g. HF_TOKEN plus GROQ_API_KEY)
        # never sends one provider's token to another provider's endpoint.
        # The remaining variables keep their original fallback order.
        env_order = ["HF_TOKEN", "API_KEY", "OPENROUTER_API_KEY", "GROQ_API_KEY"]
        preferred = {
            "groq": "GROQ_API_KEY",
            "openrouter": "OPENROUTER_API_KEY",
            "hf": "HF_TOKEN",
            "openai": "API_KEY",
        }.get(provider)
        if preferred:
            env_order = [preferred] + [n for n in env_order if n != preferred]
        api_key = next(
            (val for val in (os.getenv(n, "").strip() for n in env_order) if val),
            "",
        )

    if not model_name:
        model_name = os.getenv("MODEL_NAME", "").strip()
    if not model_name:
        # Provider-appropriate defaults so the user does not have to look up
        # a model card. Unknown providers share the HF default, which means
        # model_name is always non-empty past this point.
        provider_defaults = {
            "groq": "llama-3.3-70b-versatile",
            "openrouter": "meta-llama/llama-3.1-8b-instruct:free",
            "openai": "gpt-4o-mini",
            "hf": "Qwen/Qwen2.5-72B-Instruct",
        }
        model_name = provider_defaults.get(provider, "Qwen/Qwen2.5-72B-Instruct")

    if not api_key:
        raise RuntimeError(
            "No API key — type one in the UI, or set HF_TOKEN / API_KEY / "
            "OPENROUTER_API_KEY / GROQ_API_KEY in the environment (HF Space "
            "Secrets work too)."
        )

    # Short timeout and no retries keep a slow or unreachable endpoint from
    # stalling the streamed episode; failures fall through to the heuristic.
    client = OpenAI(
        base_url=base_url,
        api_key=api_key,
        timeout=15.0,
        max_retries=0,
    )

    def policy_fn(observation: dict[str, Any]) -> ChargebackOpsAction | None:
        cands = candidate_actions(observation)
        if not cands:
            return None
        if len(cands) == 1:
            return cands[0].action
        obvious = _obvious_next_action(observation, cands)
        if obvious is not None:
            return obvious.action
        try:
            pick, _ok, _err = _pick_with_openai_client(
                client, model_name, observation, cands
            )
            return pick.action
        except Exception:
            # Deliberate best-effort: any LLM-side failure (including an
            # unusable pick) degrades to the scripted heuristic.
            return _heuristic_pick(cands).action

    label = f"LLM ({model_name})"
    return policy_fn, label
|
| 598 |
+
|
| 599 |
+
|
| 600 |
+
def _result_badge(result: str | None) -> str:
    """Prefix a step result string with a status glyph for fast scanning.

    The prefix distinguishes three outcomes so the trace dataframe
    self-narrates:

    * ``✗`` when the text mentions error / reject / invalid / fail,
    * ``⚠`` when it mentions no-op / noop / ignored / skipped,
    * ``✓`` otherwise.

    An empty or ``None`` result yields ``"· (no result)"``. Matching is a
    case-insensitive substring test, and failure markers win over no-op ones.

    NOTE(review): the glyphs were restored from mis-decoded text in which all
    three prefixes had collapsed to the same character; confirm they match the
    intended icons.
    """

    if not result:
        return "· (no result)"
    text = str(result)
    lowered = text.lower()
    if any(marker in lowered for marker in ("error", "reject", "invalid", "fail")):
        return f"✗ {text}"
    if any(marker in lowered for marker in ("no-op", "noop", "ignored", "skipped")):
        return f"⚠ {text}"
    return f"✓ {text}"
|
| 615 |
+
|
| 616 |
+
|
| 617 |
+
def _resolve_max_steps(observation, task_id: str) -> int:
    """Return the step budget for the current task.

    Lookup order:

    1. ``observation.info["current_task_max_steps"]`` when it is a positive
       integer.
    2. ``get_task(task_id).max_steps`` from the task definition.
    3. ``60`` as a last-resort default if the task lookup fails.

    The legacy implementation defaulted to 10 if the observation field was
    absent, which silently mis-rendered the budget bar. A missing or ``None``
    ``info`` is treated the same as a missing field, so the fallback chain
    still applies.

    Args:
        observation: Object exposing an ``info`` mapping (may be absent/None).
        task_id: Task identifier used for the task-definition fallback.

    Returns:
        The resolved maximum number of steps, always a positive ``int`` on
        the observation path.
    """

    info = getattr(observation, "info", None)
    cap = info.get("current_task_max_steps") if info else None
    # bool is an int subclass; a stray True must not be read as a budget of 1.
    if isinstance(cap, int) and not isinstance(cap, bool) and cap > 0:
        return cap
    try:
        return int(get_task(task_id).max_steps)
    except Exception:  # pragma: no cover — defensive
        return 60
|
| 633 |
+
|
| 634 |
+
|
| 635 |
def run_episode(
|
| 636 |
+
task_id: str,
|
| 637 |
+
generated: bool,
|
| 638 |
+
difficulty: str,
|
| 639 |
+
seed: int,
|
| 640 |
+
policy: str = "heuristic",
|
| 641 |
+
llm_base_url: str = "",
|
| 642 |
+
llm_api_key: str = "",
|
| 643 |
+
llm_model: str = "",
|
| 644 |
):
|
| 645 |
tid = _resolve_task_id(task_id, generated, difficulty, int(seed))
|
| 646 |
env = ChargebackOpsEnvironment()
|
| 647 |
obs = env.reset(task_id=tid, difficulty=difficulty, seed=int(seed))
|
| 648 |
+
max_steps = _resolve_max_steps(obs, tid)
|
| 649 |
rows: list[list[Any]] = []
|
| 650 |
|
| 651 |
+
policy_fn: Callable[[dict[str, Any]], ChargebackOpsAction | None] | None = None
|
| 652 |
+
if policy == "llm":
|
| 653 |
+
try:
|
| 654 |
+
policy_fn, policy_label = _build_llm_policy(
|
| 655 |
+
llm_base_url, llm_api_key, llm_model
|
| 656 |
+
)
|
| 657 |
+
except Exception as exc:
|
| 658 |
+
err_md = (
|
| 659 |
+
f"### LLM policy unavailable\n"
|
| 660 |
+
f"`{type(exc).__name__}: {exc}`\n\n"
|
| 661 |
+
f"Falling back to **heuristic** for this run."
|
| 662 |
+
)
|
| 663 |
+
policy = "heuristic"
|
| 664 |
+
policy_fn = POLICY_REGISTRY["heuristic"]
|
| 665 |
+
policy_label = _POLICY_LABEL_BY_KEY[policy]
|
| 666 |
+
yield (
|
| 667 |
+
err_md,
|
| 668 |
+
_queue_html(obs),
|
| 669 |
+
_budget_html(0, max_steps, 0.0),
|
| 670 |
+
[],
|
| 671 |
+
"",
|
| 672 |
+
"",
|
| 673 |
+
"",
|
| 674 |
+
None,
|
| 675 |
+
)
|
| 676 |
+
if policy_fn is None:
|
| 677 |
+
policy_fn = POLICY_REGISTRY.get(policy) or POLICY_REGISTRY["heuristic"]
|
| 678 |
+
if policy not in POLICY_REGISTRY:
|
| 679 |
+
policy = "heuristic"
|
| 680 |
+
policy_label = _POLICY_LABEL_BY_KEY.get(policy, policy)
|
| 681 |
+
|
| 682 |
+
# Per-case issuer-message log: case_id -> [{"round","decision","rationale"}]
|
| 683 |
+
issuer_log: dict[str, list[dict[str, str]]] = {}
|
| 684 |
+
|
| 685 |
+
def _maybe_log_issuer_msg(observation) -> None:
|
| 686 |
+
vc = observation.visible_case
|
| 687 |
+
if vc is None or not vc.last_issuer_decision:
|
| 688 |
+
return
|
| 689 |
+
log = issuer_log.setdefault(vc.case_id, [])
|
| 690 |
+
entry = {
|
| 691 |
+
"round": str(vc.round_number or 1),
|
| 692 |
+
"decision": vc.last_issuer_decision or "",
|
| 693 |
+
"rationale": vc.last_issuer_rationale or "",
|
| 694 |
+
}
|
| 695 |
+
# Avoid duplicating the same message on adjacent steps.
|
| 696 |
+
if not log or log[-1] != entry:
|
| 697 |
+
log.append(entry)
|
| 698 |
+
|
| 699 |
+
def _current_history(observation) -> list[dict[str, str]]:
|
| 700 |
+
vc = observation.visible_case
|
| 701 |
+
if vc is None:
|
| 702 |
+
return []
|
| 703 |
+
return issuer_log.get(vc.case_id, [])
|
| 704 |
+
|
| 705 |
header = (
|
| 706 |
f"### {obs.task_title}\n"
|
| 707 |
f"`{obs.task_id}` — {len(obs.queue)} case(s), "
|
|
|
|
| 712 |
_queue_html(obs),
|
| 713 |
_budget_html(0, max_steps, 0.0),
|
| 714 |
[row[:] for row in rows],
|
| 715 |
+
_round_panel_html(obs, _current_history(obs)),
|
| 716 |
_arbitration_panel_html(obs),
|
| 717 |
"",
|
| 718 |
None,
|
|
|
|
| 720 |
|
| 721 |
step = 0
|
| 722 |
while not obs.done:
|
| 723 |
+
payload = obs.model_dump()
|
| 724 |
+
try:
|
| 725 |
+
action = policy_fn(payload)
|
| 726 |
+
except Exception as exc: # pragma: no cover β surface in UI
|
| 727 |
+
err_md = (
|
| 728 |
+
f"### Policy error\n"
|
| 729 |
+
f"`{policy}` raised `{type(exc).__name__}: {exc}` on step {step + 1}. "
|
| 730 |
+
f"Halting episode."
|
| 731 |
+
)
|
| 732 |
+
yield (
|
| 733 |
+
err_md,
|
| 734 |
+
_queue_html(obs),
|
| 735 |
+
_budget_html(step, max_steps, obs.progress_score),
|
| 736 |
+
[row[:] for row in rows],
|
| 737 |
+
_round_panel_html(obs, _current_history(obs)),
|
| 738 |
+
_arbitration_panel_html(obs),
|
| 739 |
+
"",
|
| 740 |
+
None,
|
| 741 |
+
)
|
| 742 |
+
return
|
| 743 |
+
if action is None:
|
| 744 |
+
break
|
| 745 |
+
|
| 746 |
+
summary_action = action
|
| 747 |
step += 1
|
| 748 |
+
try:
|
| 749 |
+
obs = env.step(action)
|
| 750 |
+
except Exception as exc: # pragma: no cover β surface in UI
|
| 751 |
+
err_md = (
|
| 752 |
+
f"### Environment error\n"
|
| 753 |
+
f"`env.step({summary_action.action_type})` raised "
|
| 754 |
+
f"`{type(exc).__name__}: {exc}` on step {step}. "
|
| 755 |
+
f"Halting episode."
|
| 756 |
+
)
|
| 757 |
+
rows.append(
|
| 758 |
+
[
|
| 759 |
+
step,
|
| 760 |
+
summary_action.action_type,
|
| 761 |
+
summary_action.case_id or "",
|
| 762 |
+
summary_action.system_name or "",
|
| 763 |
+
summary_action.strategy or "",
|
| 764 |
+
0.0,
|
| 765 |
+
f"β error: {type(exc).__name__}",
|
| 766 |
+
]
|
| 767 |
+
)
|
| 768 |
+
yield (
|
| 769 |
+
err_md,
|
| 770 |
+
_queue_html(obs),
|
| 771 |
+
_budget_html(step, max_steps, obs.progress_score),
|
| 772 |
+
[row[:] for row in rows],
|
| 773 |
+
_round_panel_html(obs, _current_history(obs)),
|
| 774 |
+
_arbitration_panel_html(obs),
|
| 775 |
+
"",
|
| 776 |
+
None,
|
| 777 |
+
)
|
| 778 |
+
return
|
| 779 |
+
|
| 780 |
+
_maybe_log_issuer_msg(obs)
|
| 781 |
+
|
| 782 |
rows.append(
|
| 783 |
[
|
| 784 |
step,
|
|
|
|
| 787 |
summary_action.system_name or "",
|
| 788 |
summary_action.strategy or "",
|
| 789 |
round(obs.reward or 0.0, 4),
|
| 790 |
+
_result_badge(obs.last_action_result),
|
| 791 |
]
|
| 792 |
)
|
| 793 |
|
|
|
|
| 803 |
_queue_html(obs),
|
| 804 |
_budget_html(step, max_steps, obs.progress_score),
|
| 805 |
[row[:] for row in rows],
|
| 806 |
+
_round_panel_html(obs, _current_history(obs)),
|
| 807 |
_arbitration_panel_html(obs),
|
| 808 |
grader,
|
| 809 |
None,
|
|
|
|
| 811 |
|
| 812 |
report = obs.grader_report.model_dump() if obs.grader_report else None
|
| 813 |
sc = f"{obs.grader_report.normalized_score:.3f}" if obs.grader_report else "n/a"
|
| 814 |
+
final_md = (
|
| 815 |
+
f"### Done — score **{sc}** in **{len(rows)}** steps "
|
| 816 |
+
f"· policy: **{policy_label}**"
|
| 817 |
+
)
|
| 818 |
yield (
|
| 819 |
final_md,
|
| 820 |
_queue_html(obs),
|
| 821 |
_budget_html(step, max_steps, obs.progress_score),
|
| 822 |
[row[:] for row in rows],
|
| 823 |
+
_round_panel_html(obs, _current_history(obs)),
|
| 824 |
_arbitration_panel_html(obs),
|
| 825 |
_grader_html(report),
|
| 826 |
report,
|
| 827 |
)
|
| 828 |
|
| 829 |
|
| 830 |
+
# ---------------------------------------------------------------------------
|
| 831 |
+
# Compare tab — run all four scripted policies on the same task in series and
|
| 832 |
+
# render a single side-by-side bar chart of the final scores plus a per-case
|
| 833 |
+
# per-dimension breakdown.
|
| 834 |
+
# ---------------------------------------------------------------------------
|
| 835 |
+
|
| 836 |
+
|
| 837 |
+
def _run_one_episode_sync(task_id: str, policy_key: str) -> dict[str, Any]:
    """Synchronously run a single scripted-policy episode and return a summary.

    A fresh ``ChargebackOpsEnvironment`` is reset on ``task_id`` and stepped
    with ``POLICY_REGISTRY[policy_key]`` until the observation reports
    ``done``, the policy returns ``None``, or either the policy or the
    environment raises.

    The run stays best-effort (an exception halts the episode instead of
    propagating), but the failure is no longer silent: it is recorded under
    ``"error"`` and appended to ``"summary"`` so a crashed run cannot be
    mistaken for a legitimately low score.

    Args:
        task_id: Task identifier forwarded to ``env.reset``.
        policy_key: Key into ``POLICY_REGISTRY``.

    Returns:
        A dict with keys ``policy``, ``score`` (float, ``0.0`` when no grader
        report is available), ``steps`` (successful ``env.step`` calls),
        ``summary`` and ``error`` (empty string when the run was clean).

    Raises:
        KeyError: If ``policy_key`` is not present in ``POLICY_REGISTRY``.
    """
    env = ChargebackOpsEnvironment()
    obs = env.reset(task_id=task_id)
    policy_fn = POLICY_REGISTRY[policy_key]

    steps = 0
    error = ""
    while not obs.done:
        try:
            action = policy_fn(obs.model_dump())
            if action is None:
                break
            obs = env.step(action)
        except Exception as exc:
            # Deliberate best-effort boundary: keep the comparison running,
            # but remember why this episode stopped early.
            error = f"{type(exc).__name__}: {exc}"
            break
        steps += 1

    report = obs.grader_report
    score = report.normalized_score if report else 0.0
    summary = report.summary if report else ""
    if error:
        summary = f"{summary} [halted: {error}]".strip()

    return {
        "policy": policy_key,
        "score": float(score),
        "steps": steps,
        "summary": summary,
        "error": error,
    }
|
| 867 |
+
|
| 868 |
+
|
| 869 |
+
def run_compare(task_id: str, generated: bool, difficulty: str, seed: int):
    """Run every policy in ``_COMPARE_POLICIES`` on one task and render a chart.

    Args:
        task_id: Task selected in the UI.
        generated: Forwarded to ``_resolve_task_id``.
        difficulty: Forwarded to ``_resolve_task_id``.
        seed: Forwarded to ``_resolve_task_id`` after ``int()`` coercion.

    Returns:
        A 3-tuple ``(markdown, html, table_rows)``: an explanatory markdown
        blurb, the title plus CSS-only bar chart as one HTML string, and one
        ``[policy, score, steps, summary]`` row per policy.
    """
    tid = _resolve_task_id(task_id, generated, difficulty, int(seed))
    results = [_run_one_episode_sync(tid, p) for p in _COMPARE_POLICIES]

    # Bar-chart HTML (CSS-only, no extra deps). Bars are scaled against the
    # best score; the scale is loop-invariant, so compute it once. The
    # ``or 1.0`` / ``max(0.001, ...)`` guards avoid dividing by zero when
    # every policy scores 0.
    best_score = max((r["score"] for r in results), default=1.0) or 1.0
    scale = max(0.001, best_score)

    def _bar_row(r: dict[str, Any]) -> str:
        """Render one policy's bar as an HTML row."""
        pct = int(round(100 * r["score"] / scale))
        color = _score_color(r["score"])
        return (
            f'<div class="bar-row" style="margin:6px 0;">'
            f'<span class="bar-label" style="width:130px;">{r["policy"]}</span>'
            f'<div class="bar-track" style="flex:1;height:22px;">'
            f'<div class="bar-fill" style="width:{pct}%;background:{color};height:100%;"></div>'
            f'</div>'
            f'<span class="bar-value" style="width:120px;">'
            f'{r["score"]:.3f} Β· {r["steps"]} steps</span>'
            f'</div>'
        )

    bars = "".join(_bar_row(r) for r in results)

    # Discrimination delta.
    by_policy = {r["policy"]: r["score"] for r in results}
    delta = by_policy.get("heuristic", 0.0) - by_policy.get("naive", 0.0)
    title = (
        f'<div style="margin:8px 0;font-size:14px;">'
        f'<b>Task</b>: <code>{tid}</code> · '
        f'<b>Discrimination delta</b> (heuristic β naive) = '
        f'<span style="color:{_score_color(delta)};">'
        # The ``+`` format flag emits the correct sign for either direction;
        # a literal "+" prefix rendered negative deltas as "+-0.123".
        f'<b>{delta:+.3f}</b></span>'
        f'</div>'
    )

    md = (
        f"### Side-by-side: 4 scripted policies on the same task\n"
        f"Same `task_id`, same `seed`, no provider calls. The discrimination "
        f"gradient (`naive` β `concede_all` β `escalate_all` β `heuristic`) "
        f"is the empirical evidence behind the README's `+0.813` claim."
    )
    table_rows = [
        [r["policy"], f"{r['score']:.3f}", r["steps"], r["summary"]]
        for r in results
    ]
    return md, title + '<div style="padding:8px 0;">' + bars + "</div>", table_rows
|
| 915 |
+
|
| 916 |
+
|
| 917 |
# ---------------------------------------------------------------------------
|
| 918 |
# Build Gradio app
|
| 919 |
# ---------------------------------------------------------------------------
|
|
|
|
| 928 |
# Inject CSS (Gradio 6 moved css= to launch(); <style> tag works everywhere)
|
| 929 |
gr.HTML(f"<style>{_CSS}</style>")
|
| 930 |
|
| 931 |
+
# Header + context links
|
| 932 |
gr.HTML(
|
| 933 |
'<div class="dashboard-header">'
|
| 934 |
"<h1>ChargebackOps</h1>"
|
| 935 |
+
"<p>Merchant chargeback dispute environment — an OpenEnv benchmark for "
|
| 936 |
+
"cost-asymmetric multi-round LLM agents</p>"
|
| 937 |
+
'<div style="margin-top:8px;">'
|
| 938 |
+
'<a href="https://github.com/MitudruDutta/chargebackops" target="_blank" '
|
| 939 |
+
'style="margin:0 6px;color:#3b82f6;text-decoration:none;">π¦ GitHub</a> '
|
| 940 |
+
'<a href="https://huggingface.co/spaces/mitudrudutta/ChargeBackOps" target="_blank" '
|
| 941 |
+
'style="margin:0 6px;color:#FFD21E;text-decoration:none;">π€ HF Space</a> '
|
| 942 |
+
'<a href="https://youtu.be/7dz37JTTMo4" target="_blank" '
|
| 943 |
+
'style="margin:0 6px;color:#FF0000;text-decoration:none;">πΊ Walkthrough</a> '
|
| 944 |
+
'<a href="https://colab.research.google.com/drive/1GtLH6_b10oHlAnnGq4hnBkcGJ-pE_za5" target="_blank" '
|
| 945 |
+
'style="margin:0 6px;color:#F9AB00;text-decoration:none;">π§ͺ Training Colab</a> '
|
| 946 |
+
'<a href="https://github.com/meta-pytorch/OpenEnv" target="_blank" '
|
| 947 |
+
'style="margin:0 6px;color:#0668E1;text-decoration:none;">π¦ Meta OpenEnv</a>'
|
| 948 |
+
"</div>"
|
| 949 |
"</div>"
|
| 950 |
)
|
| 951 |
|
| 952 |
with gr.Tabs():
|
| 953 |
# ββ Tab 1: Run Episode ββββββββββββββββββββββββββββββββ
|
| 954 |
with gr.Tab("Run Episode"):
|
| 955 |
+
# Preset buttons row β one-click task+policy configuration.
|
| 956 |
+
gr.Markdown("**Quick presets** β click any to load a known-good configuration.")
|
| 957 |
+
with gr.Row():
|
| 958 |
+
preset_buttons = [
|
| 959 |
+
gr.Button(p[0], size="sm", scale=1) for p in _PRESETS
|
| 960 |
+
]
|
| 961 |
+
preset_blurb = gr.Markdown("")
|
| 962 |
+
|
| 963 |
with gr.Row():
|
| 964 |
dd_task = gr.Dropdown(
|
| 965 |
label="Task", choices=task_ids, value=default, scale=3
|
|
|
|
| 969 |
["easy", "medium", "hard", "nightmare"],
|
| 970 |
label="Difficulty",
|
| 971 |
value="easy",
|
| 972 |
+
visible=False,
|
| 973 |
scale=2,
|
| 974 |
)
|
| 975 |
+
nb_seed = gr.Number(
|
| 976 |
+
label="Seed", value=42, precision=0, visible=False, scale=1
|
| 977 |
+
)
|
| 978 |
with gr.Row():
|
| 979 |
rd_policy = gr.Radio(
|
| 980 |
+
choices=list(_POLICY_CHOICES),
|
|
|
|
|
|
|
|
|
|
| 981 |
value="heuristic",
|
| 982 |
label="Policy",
|
| 983 |
scale=4,
|
| 984 |
)
|
| 985 |
btn_run = gr.Button("Run Episode", variant="primary", scale=1)
|
| 986 |
|
| 987 |
+
# LLM-policy inputs β only visible when "LLM" is selected.
|
| 988 |
+
with gr.Accordion(
|
| 989 |
+
"LLM policy settings (used when 'LLM' is selected above)",
|
| 990 |
+
open=False,
|
| 991 |
+
visible=False,
|
| 992 |
+
) as llm_accordion:
|
| 993 |
+
gr.Markdown(
|
| 994 |
+
"Bring your own OpenAI-compatible endpoint. Defaults match the "
|
| 995 |
+
"Hugging Face router; OpenRouter, Groq, Together, Fireworks, "
|
| 996 |
+
"and Anthropic-compatible gateways all work. **Leave fields "
|
| 997 |
+
"blank** to inherit `HF_TOKEN` / `OPENROUTER_API_KEY` / "
|
| 998 |
+
"`GROQ_API_KEY` / `API_BASE_URL` / `MODEL_NAME` from the "
|
| 999 |
+
"environment (set them as Space Secrets when deploying)."
|
| 1000 |
+
)
|
| 1001 |
+
with gr.Row():
|
| 1002 |
+
tb_llm_base = gr.Textbox(
|
| 1003 |
+
label="Base URL",
|
| 1004 |
+
value="https://router.huggingface.co/v1",
|
| 1005 |
+
scale=2,
|
| 1006 |
+
)
|
| 1007 |
+
tb_llm_model = gr.Textbox(
|
| 1008 |
+
label="Model",
|
| 1009 |
+
value="Qwen/Qwen2.5-72B-Instruct",
|
| 1010 |
+
scale=2,
|
| 1011 |
+
)
|
| 1012 |
+
tb_llm_key = gr.Textbox(
|
| 1013 |
+
label="API key",
|
| 1014 |
+
value="",
|
| 1015 |
+
type="password",
|
| 1016 |
+
scale=2,
|
| 1017 |
+
)
|
| 1018 |
+
|
| 1019 |
md_status = gr.Markdown(
|
| 1020 |
+
"Pick a task + policy and click **Run Episode**. Run the same task "
|
| 1021 |
+
"under each of the four scripted policies (heuristic, escalate-all, "
|
| 1022 |
+
"concede-all, naive) to reproduce the discrimination gradient β naive "
|
| 1023 |
+
"β 0.000, concede-all β ~0.44, escalate-all β ~0.77, heuristic β ~0.81. "
|
| 1024 |
+
"Or pick **LLM** and bring your own model. For a side-by-side view, "
|
| 1025 |
+
"open the **Compare policies** tab."
|
| 1026 |
)
|
| 1027 |
|
| 1028 |
with gr.Row(equal_height=True):
|
|
|
|
| 1044 |
datatype=["number", "str", "str", "str", "str", "number", "str"],
|
| 1045 |
interactive=False,
|
| 1046 |
wrap=True,
|
| 1047 |
+
label="Step Trace (β accepted Β· β no-op Β· β rejected)",
|
| 1048 |
)
|
| 1049 |
|
| 1050 |
with gr.Row(equal_height=True):
|
| 1051 |
with gr.Column(scale=1):
|
| 1052 |
+
html_round = gr.HTML(label="Dispute Round (issuer messages)")
|
| 1053 |
with gr.Column(scale=1):
|
| 1054 |
html_arb = gr.HTML(label="Arbitration")
|
| 1055 |
|
| 1056 |
html_grader = gr.HTML(label="Grader Report")
|
| 1057 |
+
with gr.Accordion("Raw grader JSON (export-friendly)", open=False):
|
| 1058 |
+
json_raw = gr.JSON(label="Raw JSON", show_label=False)
|
| 1059 |
|
| 1060 |
btn_run.click(
|
| 1061 |
fn=run_episode,
|
| 1062 |
+
inputs=[
|
| 1063 |
+
dd_task, cb_gen, rd_diff, nb_seed, rd_policy,
|
| 1064 |
+
tb_llm_base, tb_llm_key, tb_llm_model,
|
| 1065 |
+
],
|
| 1066 |
outputs=[
|
| 1067 |
md_status,
|
| 1068 |
html_queue,
|
|
|
|
| 1075 |
],
|
| 1076 |
)
|
| 1077 |
|
| 1078 |
+
# Generated-checkbox visibility callback.
|
| 1079 |
+
def _toggle_generated(generated: bool):
|
| 1080 |
+
return (
|
| 1081 |
+
gr.update(visible=generated),
|
| 1082 |
+
gr.update(visible=generated),
|
| 1083 |
+
)
|
| 1084 |
+
|
| 1085 |
+
cb_gen.change(
|
| 1086 |
+
fn=_toggle_generated,
|
| 1087 |
+
inputs=[cb_gen],
|
| 1088 |
+
outputs=[rd_diff, nb_seed],
|
| 1089 |
+
)
|
| 1090 |
+
|
| 1091 |
+
# Show LLM accordion only when 'llm' policy is selected.
|
| 1092 |
+
def _toggle_llm(policy: str):
|
| 1093 |
+
return gr.update(visible=(policy == "llm"), open=(policy == "llm"))
|
| 1094 |
+
|
| 1095 |
+
rd_policy.change(
|
| 1096 |
+
fn=_toggle_llm, inputs=[rd_policy], outputs=[llm_accordion]
|
| 1097 |
+
)
|
| 1098 |
+
|
| 1099 |
+
# Wire each preset button to populate the inputs atomically.
|
| 1100 |
+
def _make_preset_handler(preset):
|
| 1101 |
+
label, t_id, gen, diff, seed_v, pol, blurb = preset
|
| 1102 |
+
|
| 1103 |
+
def _apply():
|
| 1104 |
+
return (
|
| 1105 |
+
t_id, # dd_task
|
| 1106 |
+
gen, # cb_gen
|
| 1107 |
+
gr.update(value=diff, visible=gen), # rd_diff
|
| 1108 |
+
gr.update(value=seed_v, visible=gen), # nb_seed
|
| 1109 |
+
pol, # rd_policy
|
| 1110 |
+
gr.update(visible=(pol == "llm")), # llm_accordion
|
| 1111 |
+
f"**Preset:** {label} β {blurb}", # preset_blurb
|
| 1112 |
+
)
|
| 1113 |
+
|
| 1114 |
+
return _apply
|
| 1115 |
+
|
| 1116 |
+
for btn, preset in zip(preset_buttons, _PRESETS):
|
| 1117 |
+
btn.click(
|
| 1118 |
+
fn=_make_preset_handler(preset),
|
| 1119 |
+
inputs=[],
|
| 1120 |
+
outputs=[
|
| 1121 |
+
dd_task,
|
| 1122 |
+
cb_gen,
|
| 1123 |
+
rd_diff,
|
| 1124 |
+
nb_seed,
|
| 1125 |
+
rd_policy,
|
| 1126 |
+
llm_accordion,
|
| 1127 |
+
preset_blurb,
|
| 1128 |
+
],
|
| 1129 |
+
)
|
| 1130 |
+
|
| 1131 |
+
# ββ Tab 2: Compare policies ββββββββββββββββββββββββββ
|
| 1132 |
+
with gr.Tab("Compare policies"):
|
| 1133 |
+
gr.Markdown(
|
| 1134 |
+
"Run all four scripted policies on the **same task / seed** and see "
|
| 1135 |
+
"the discrimination gradient at a glance. No provider calls, no LLM, "
|
| 1136 |
+
"fully deterministic β this is the empirical evidence behind the "
|
| 1137 |
+
"README's `+0.813` discrimination delta claim."
|
| 1138 |
+
)
|
| 1139 |
+
with gr.Row():
|
| 1140 |
+
cmp_task = gr.Dropdown(
|
| 1141 |
+
label="Task", choices=task_ids, value=default, scale=3
|
| 1142 |
+
)
|
| 1143 |
+
cmp_gen = gr.Checkbox(label="Generated", value=False, scale=1)
|
| 1144 |
+
cmp_diff = gr.Radio(
|
| 1145 |
+
["easy", "medium", "hard", "nightmare"],
|
| 1146 |
+
label="Difficulty",
|
| 1147 |
+
value="easy",
|
| 1148 |
+
visible=False,
|
| 1149 |
+
scale=2,
|
| 1150 |
+
)
|
| 1151 |
+
cmp_seed = gr.Number(
|
| 1152 |
+
label="Seed", value=42, precision=0, visible=False, scale=1
|
| 1153 |
+
)
|
| 1154 |
+
btn_cmp = gr.Button("Run all 4 policies", variant="primary")
|
| 1155 |
+
cmp_md = gr.Markdown("")
|
| 1156 |
+
cmp_html = gr.HTML(label="Final-score comparison")
|
| 1157 |
+
cmp_table = gr.Dataframe(
|
| 1158 |
+
headers=["Policy", "Score", "Steps", "Summary"],
|
| 1159 |
+
datatype=["str", "str", "number", "str"],
|
| 1160 |
+
interactive=False,
|
| 1161 |
+
wrap=True,
|
| 1162 |
+
label="Per-policy summary",
|
| 1163 |
+
)
|
| 1164 |
+
btn_cmp.click(
|
| 1165 |
+
fn=run_compare,
|
| 1166 |
+
inputs=[cmp_task, cmp_gen, cmp_diff, cmp_seed],
|
| 1167 |
+
outputs=[cmp_md, cmp_html, cmp_table],
|
| 1168 |
+
)
|
| 1169 |
+
cmp_gen.change(
|
| 1170 |
+
fn=_toggle_generated,
|
| 1171 |
+
inputs=[cmp_gen],
|
| 1172 |
+
outputs=[cmp_diff, cmp_seed],
|
| 1173 |
+
)
|
| 1174 |
+
|
| 1175 |
+
# ββ Tab 3: Task Catalog ββββββββββββββββββββββββββββββ
|
| 1176 |
with gr.Tab("Task Catalog"):
|
| 1177 |
catalog_rows = []
|
| 1178 |
for t in tasks:
|
|
|
|
| 1211 |
|
| 1212 |
# ── Tab 4: Environment Info ───────────────────────────
|
| 1213 |
with gr.Tab("Environment"):
|
| 1214 |
+
gr.Markdown(_environment_tab_markdown())
|
| 1215 |
+
|
| 1216 |
+
# ββ Tab 5: Rubric Tree ββββββββββββββββββββββββββββββββ
|
| 1217 |
+
with gr.Tab("Rubric Tree"):
|
| 1218 |
gr.Markdown(
|
| 1219 |
+
"Live introspection of `env.rubric.named_rubrics()` β the same composable "
|
| 1220 |
+
"OpenEnv `Rubric` tree that grades every step. Weights and structure below "
|
| 1221 |
+
"are read from the running environment, not hardcoded."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1222 |
)
|
| 1223 |
+
gr.HTML(_rubric_tree_html())
|
| 1224 |
+
gr.Markdown(
|
| 1225 |
+
"See [`docs/METHOD.md`](https://github.com/MitudruDutta/chargebackops/blob/main/docs/METHOD.md) "
|
| 1226 |
+
"and [`docs/SPECIFICATION_GAMING.md`](https://github.com/MitudruDutta/chargebackops/blob/main/docs/SPECIFICATION_GAMING.md) "
|
| 1227 |
+
"for the full design and the GRPO failure-mode write-up."
|
| 1228 |
+
)
|
| 1229 |
+
|
| 1230 |
+
# ββ Tab 6: Training Results βββββββββββββββββββββββββββ
|
| 1231 |
+
with gr.Tab("Training Results"):
|
| 1232 |
+
gr.Markdown(_training_tab_markdown())
|
| 1233 |
+
for caption, fname in (
|
| 1234 |
+
(
|
| 1235 |
+
"**Cross-iteration training curve.** Iter 3 plateaued below the "
|
| 1236 |
+
"heuristic at 0.728. Iter 5 plateaued *bit-exactly* at the heuristic "
|
| 1237 |
+
"at 0.8132 β the signature of the eval-fallback exploit, not "
|
| 1238 |
+
"convergent learning.",
|
| 1239 |
+
"training_curve_cross_iter.png",
|
| 1240 |
+
),
|
| 1241 |
+
(
|
| 1242 |
+
"**Iter-5 eval-score attribution.** The trained policy contributes "
|
| 1243 |
+
"0.000 (every action is rejected by env validation). The eval rollout "
|
| 1244 |
+
"helper's heuristic-fallback path contributes 0.8132 β i.e. all of it.",
|
| 1245 |
+
"gaming_attribution.png",
|
| 1246 |
+
),
|
| 1247 |
+
(
|
| 1248 |
+
"**Scripted-policy discrimination gradient.** The 8-dimension "
|
| 1249 |
+
"`WeightedSum` plus the deadline `Gate` defeats every degenerate "
|
| 1250 |
+
"policy: empty-packet zeros out, concede-all caps at 0.44, "
|
| 1251 |
+
"escalate-all caps at 0.77.",
|
| 1252 |
+
"discrimination_gradient.png",
|
| 1253 |
+
),
|
| 1254 |
+
(
|
| 1255 |
+
"**8-dimension OpenEnv rubric weights**, grouped by category "
|
| 1256 |
+
"(decision / packet / process / terminal). 40% of reward sits on "
|
| 1257 |
+
"decision + terminal β where economically irrational policies "
|
| 1258 |
+
"bleed money fastest.",
|
| 1259 |
+
"rubric_weights.png",
|
| 1260 |
+
),
|
| 1261 |
+
(
|
| 1262 |
+
"**Iter-5 per-difficulty curves.** Post-step-80 plateau is the "
|
| 1263 |
+
"fallback heuristic across every difficulty band; see "
|
| 1264 |
+
"SPECIFICATION_GAMING.md for the diagnosis.",
|
| 1265 |
+
"training_curve_by_family.png",
|
| 1266 |
+
),
|
| 1267 |
+
):
|
| 1268 |
+
src = _figure_data_uri(fname)
|
| 1269 |
+
if src is None:
|
| 1270 |
+
gr.Markdown(
|
| 1271 |
+
f"_(figure `{fname}` not bundled β see "
|
| 1272 |
+
f"[`docs/figures/{fname}`](https://github.com/MitudruDutta/chargebackops/blob/main/docs/figures/{fname}))_"
|
| 1273 |
+
)
|
| 1274 |
+
continue
|
| 1275 |
+
gr.Markdown(caption)
|
| 1276 |
+
gr.HTML(
|
| 1277 |
+
f'<img src="{src}" style="width:100%;max-width:1100px;'
|
| 1278 |
+
f'border:1px solid #2a2a2a;border-radius:6px;margin:6px 0;" '
|
| 1279 |
+
f'alt="{fname}" />'
|
| 1280 |
+
)
|
| 1281 |
|
| 1282 |
return demo
|
| 1283 |
+
|
| 1284 |
+
|
| 1285 |
+
# ---------------------------------------------------------------------------
|
| 1286 |
+
# Tab content builders (called once at app build; keep cheap)
|
| 1287 |
+
# ---------------------------------------------------------------------------
|
| 1288 |
+
|
| 1289 |
+
|
| 1290 |
+
def _environment_tab_markdown() -> str:
    """Render the Environment tab content as a markdown string.

    The action count comes from the members of ``core.models.ActionType``
    and the rubric weights from ``CASE_DIMENSION_WEIGHTS``. The grouping of
    actions into rounds is maintained by hand in this function, so any live
    action that is missing from those groups is listed under an extra
    "Other" heading rather than being counted but never shown.

    Returns:
        Markdown covering the action space, merchant systems, grading
        dimensions, scripted policies and card-network reason codes.
    """
    try:
        from core.models import ActionType  # type: ignore[attr-defined]
    except ImportError:  # pragma: no cover
        from ..core.models import ActionType  # type: ignore[attr-defined]

    # ``Literal`` exposes its members via ``__args__``.
    actions: tuple[str, ...] = tuple(getattr(ActionType, "__args__", ()))

    r1 = (
        "select_case", "inspect_case", "query_system", "retrieve_policy",
        "add_evidence", "remove_evidence", "set_strategy",
        "submit_representment", "resolve_case",
    )
    r23 = ("respond_to_pre_arb", "escalate_to_arbitration", "accept_arbitration_loss")
    long_horizon = ("wait_for_updates",)

    grouped = set(r1) | set(r23) | set(long_horizon)
    # Live actions the hand-written groups do not know about yet.
    ungrouped = tuple(name for name in actions if name not in grouped)
    # If introspection yields nothing (``ActionType`` has no ``__args__``),
    # fall back to the documented groups rather than advertising 0 actions.
    n_actions = len(actions) or len(grouped)

    def _join(items: tuple[str, ...]) -> str:
        return " · ".join(f"`{name}`" for name in items)

    other_section = f"**Other:** {_join(ungrouped)}\n\n" if ungrouped else ""

    # zip() stops at the shortest input, so the three sequences are expected
    # to be index-aligned and of equal length.
    rubric_rows = "\n".join(
        f"| {label} | {int(round(weight * 100))}% | {scoring} |"
        for label, weight, scoring in zip(
            _DIMENSION_LABELS, CASE_DIMENSION_WEIGHTS, _DIMENSION_SCORING
        )
    )

    return (
        f"## Action Space ({n_actions} typed actions)\n\n"
        f"**Round 1 β Representment:** {_join(r1)}\n\n"
        f"**Round 2/3 β Pre-arb & Arbitration:** {_join(r23)}\n\n"
        f"**Long-horizon backlog:** {_join(long_horizon)}\n\n"
        f"{other_section}"
        "## Merchant Systems (6)\n\n"
        "`orders` · `payment` · `shipping` · "
        "`support` · `refunds` · `risk`\n\n"
        "## Grading (8 dimensions)\n\n"
        "Weights are read live from `evaluation.rubrics.CASE_DIMENSION_WEIGHTS`.\n\n"
        "| Dimension | Weight | Scoring |\n"
        "|---|---|---|\n"
        f"{rubric_rows}\n\n"
        "## Scripted policies (Run Episode tab)\n\n"
        "| Policy | What it does | Headline avg |\n"
        "|---|---|---|\n"
        "| `naive` | Submit empty packet, no evidence, no policy work | 0.000 |\n"
        "| `concede_all` | Always set strategy `accept_chargeback` and resolve | 0.444 |\n"
        "| `escalate_all` | Contest like the heuristic, then always escalate | 0.767 |\n"
        "| `heuristic` | EV-rational, fully offline | **0.813** |\n\n"
        "## Card Networks\n\n"
        "| Reason Code | Visa | Mastercard |\n"
        "|---|---|---|\n"
        "| Goods Not Received | 13.1 (30 days) | 4855 (45 days) |\n"
        "| Fraud CNP | 10.4 (30 days) | 4837 (45 days) |\n"
        "| Credit Not Processed | 13.6 (30 days) | 4860 (45 days) |\n"
        "| Duplicate Processing | 12.4 (30 days) | 4834 (45 days) |\n"
        "| Product Not As Described | 13.3 (30 days) | 4853 (45 days) |\n"
        "| Service Not Provided | 13.1 (30 days) | 4855 (45 days) |\n"
    )
|
| 1355 |
+
|
| 1356 |
+
|
| 1357 |
+
def _rubric_tree_html() -> str:
    """Render the live ``env.rubric.named_rubrics()`` tree as an HTML table.

    One table row is emitted per ``(path, rubric)`` pair, indented by the
    number of dots in the path. A synthetic ``deadline_gate`` row is inserted
    once, immediately before the first ``WeightedSum`` whose path ends in
    ``aggregator``, at the same depth as that aggregator.

    Leaf rubrics are matched to weights by class name: a dimension called
    ``foo_bar`` is expected to be implemented by a class ``FooBarRubric``.

    Returns:
        The table HTML. If constructing the environment or walking its
        rubrics raises, a red ``<pre>`` block describing the (HTML-escaped)
        error is returned instead, so the tab still renders.
    """
    import html

    try:
        env = ChargebackOpsEnvironment()
        named = list(env.rubric.named_rubrics())
    except Exception as exc:  # pragma: no cover - defensive fallback
        # Escape the message: exception text may contain "<" or "&".
        detail = html.escape(f"{type(exc).__name__}: {exc}")
        return (
            f"<pre style='color:#ef4444;'>Could not introspect rubric tree: "
            f"{detail}</pre>"
        )

    # CASE_DIMENSION_NAMES and CASE_DIMENSION_WEIGHTS align by index. Build
    # the class-name -> weight lookup once instead of per rubric;
    # ``setdefault`` keeps the first match, like a break-on-first-hit scan.
    weight_by_dim = dict(zip(CASE_DIMENSION_NAMES, CASE_DIMENSION_WEIGHTS))
    weight_by_class: dict[str, Any] = {}
    for dim_name, weight in weight_by_dim.items():
        tag = "".join(part.capitalize() for part in dim_name.split("_")) + "Rubric"
        weight_by_class.setdefault(tag, weight)

    def _indent(depth: int) -> str:
        # Plain spaces collapse inside a table cell; non-breaking spaces
        # keep the tree indentation visible.
        return "&nbsp;" * (depth * 4)

    rows: list[str] = [
        "<table class='queue-table' style='font-family:ui-monospace,monospace;'>"
        "<tr><th>Path</th><th>Class</th><th>Weight / Role</th></tr>"
    ]

    deadline_gate_injected = False
    for path, rubric in named:
        cls_name = type(rubric).__name__
        depth = path.count(".")

        if (
            not deadline_gate_injected
            and cls_name == "WeightedSum"
            and path.endswith("aggregator")
        ):
            # The gate is shown as a sibling of the aggregator, so it shares
            # the aggregator's depth. ``rpartition`` yields an empty parent
            # for an undotted path, avoiding "aggregator.deadline_gate".
            parent = path.rpartition(".")[0]
            gate_path = f"{parent}.deadline_gate" if parent else "deadline_gate"
            rows.append(
                f"<tr><td>{_indent(depth)}"
                f"<code>{gate_path}</code></td>"
                f"<td>Gate(CaseAbandonedRubric)</td>"
                f"<td style='text-align:right;color:#eab308;'>hard-zero on miss</td></tr>"
            )
            deadline_gate_injected = True

        if cls_name in weight_by_class:
            weight_str = f"{int(round(weight_by_class[cls_name] * 100))}%"
        else:
            weight_str = "β"

        rows.append(
            f"<tr><td>{_indent(depth)}<code>{path or '(root)'}</code></td>"
            f"<td>{cls_name}</td>"
            f"<td style='text-align:right;'>{weight_str}</td></tr>"
        )

    rows.append("</table>")
    return "".join(rows)
|
| 1422 |
+
|
| 1423 |
+
|
| 1424 |
+
# ---------------------------------------------------------------------------
|
| 1425 |
+
# Training Results helpers
|
| 1426 |
+
# ---------------------------------------------------------------------------
|
| 1427 |
+
|
| 1428 |
+
|
| 1429 |
+
def _figure_data_uri(filename: str) -> str | None:
|
| 1430 |
+
"""Return a base64 ``data:image/png`` URI for a bundled figure, or None.
|
| 1431 |
+
|
| 1432 |
+
Embedding figures inline avoids dependencies on the static-asset routing
|
| 1433 |
+
of whatever host serves the demo (HF Spaces, FastAPI sub-mount, etc.).
|
| 1434 |
+
"""
|
| 1435 |
+
|
| 1436 |
+
path = _FIGURES_DIR / filename
|
| 1437 |
+
if not path.is_file():
|
| 1438 |
+
return None
|
| 1439 |
+
try:
|
| 1440 |
+
data = path.read_bytes()
|
| 1441 |
+
except OSError:
|
| 1442 |
+
return None
|
| 1443 |
+
encoded = base64.b64encode(data).decode("ascii")
|
| 1444 |
+
return f"data:image/png;base64,{encoded}"
|
| 1445 |
+
|
| 1446 |
+
|
| 1447 |
+
def _training_tab_markdown() -> str:
    """Return the static markdown shown at the top of the Training Results tab.

    The text is assembled from one entry per section (heading, pipeline,
    iterations, trained-vs-untrained delta, reproduce links); each entry
    carries its own trailing newlines, so the pieces are simply concatenated.
    """
    heading = "## Real training, end-to-end\n\n"

    pipeline = (
        "**Pipeline.** Qwen2.5-3B fp16 + LoRA r=16 on a single Colab T4. Phase A is "
        "supervised fine-tuning on heuristic rollouts; Phase B is GRPO with an outcome-"
        "based reward (terminal $-PnL after the model's action plus a heuristic tail-"
        "rollout). The training loop **connects to the live `ChargebackOpsEnvironment`** "
        "β every gradient step is graded by the same rubric and same Issuer adversary "
        "the eval uses. There is no static dataset shortcut.\n\n"
    )

    iterations = (
        "**Five iterations, three failure modes.** Iter 1 produced total gradient "
        "collapse (group reward variance β 0). Iter 3 broke through to non-zero gradient "
        "but plateaued at 0.728. **Iter 5 ran 200 GRPO steps and uncovered a reproducible "
        "specification-gaming exploit** where the model emits invalid `accept_case` "
        "actions, triggers the eval rollout helper's heuristic-fallback path, and "
        "scores bit-exactly the heuristic baseline at 0.8132. The full diagnosis is in "
        "[`SPECIFICATION_GAMING.md`](https://github.com/MitudruDutta/chargebackops/blob/main/docs/SPECIFICATION_GAMING.md).\n\n"
    )

    honest_delta = (
        "**Honest trained-vs-untrained delta:** the SFT step at 0.536 β **+0.08 absolute, "
        "+18% relative** over the untrained Qwen2.5-3B base β is the only legitimate "
        "model-attributable improvement on iter 5. We document this honestly because "
        "the failure mode itself is a research artefact future GRPO recipes can target "
        "as a benchmark.\n\n"
    )

    reproduce = (
        "**Reproduce.** "
        "[Latest training run (Colab β iter 5, 200 GRPO steps)](https://colab.research.google.com/drive/1GtLH6_b10oHlAnnGq4hnBkcGJ-pE_za5?usp=sharing) Β· "
        "[Previous training run (Colab β iter 3, 62 GRPO steps)](https://colab.research.google.com/drive/1AjG3Sv7FnMeOSls6JMzTunkMzlJi_ySu?usp=sharing) Β· "
        "[`notebooks/train_merchant_agent.ipynb`](https://github.com/MitudruDutta/chargebackops/blob/main/notebooks/train_merchant_agent.ipynb)\n"
    )

    sections = (heading, pipeline, iterations, honest_delta, reproduce)
    return "".join(sections)
|