dolev31 committed on
Commit
f82bd7d
·
1 Parent(s): 9762f1e

Redesign leaderboard UI with polished theme, hero header, and consistent styling

Browse files

- Add custom Gradio theme (blue-indigo palette, Inter/JetBrains Mono fonts)
- Add dark gradient hero header with IBM Research logo, ICLR 2025 badge, stats strip
- Add ~200 lines of custom CSS: underline tabs, polished tables, form cards, FAQ styling
- Consolidate Frontier tab into Leaderboard (removes duplicate Pareto plot)
- Rename tabs for cleaner navigation (Safety, Tiers, Per-App, Get Key)
- Add Plotly style helpers for consistent chart styling and polished empty states
- Add responsive breakpoints for mobile

Files changed (2) hide show
  1. app.py +434 -85
  2. assets/ibm_research_logo.png +3 -0
app.py CHANGED
@@ -21,6 +21,7 @@ from pathlib import Path
21
  from typing import List, Optional
22
 
23
  import gradio as gr
 
24
  import pandas as pd
25
  import plotly.graph_objects as go
26
 
@@ -187,6 +188,297 @@ def handle_key_request(email: str, team: str, institution: str) -> str:
187
 
188
  RISK_COLORS = {"low": "#22c55e", "medium": "#eab308", "high": "#ef4444"}
189
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
  # ---------------------------------------------------------------------------
192
  # Submission status workflow
@@ -316,13 +608,10 @@ def build_radar_chart(submissions: list[dict],
316
  fig = go.Figure()
317
 
318
  if not selected_agents:
319
- fig.add_annotation(text="Select agents to compare", showarrow=False,
320
- xref="paper", yref="paper", x=0.5, y=0.5)
321
- fig.update_layout(title="Safety Dimension Radar", height=500)
322
- return fig
323
 
324
  dim_labels = [DIMENSION_DISPLAY.get(d, d) for d in SAFETY_DIMENSIONS]
325
- colors = ["#3b82f6", "#ef4444", "#22c55e", "#a855f7"]
326
 
327
  for i, agent_name in enumerate(selected_agents[:4]):
328
  # Find submission
@@ -350,27 +639,26 @@ def build_radar_chart(submissions: list[dict],
350
  theta=labels,
351
  fill="toself",
352
  name=agent_name,
353
- line=dict(color=colors[i % len(colors)]),
354
  opacity=0.6,
355
  ))
356
 
357
- fig.update_layout(
358
- polar=dict(
359
- radialaxis=dict(visible=True, range=[0, 1]),
360
- ),
361
  title="Safety Dimension Radar (higher = safer)",
362
  height=500,
363
- showlegend=True,
364
- )
 
 
 
 
365
  return fig
366
 
367
 
368
  def build_risk_heatmap(submissions: list[dict]) -> go.Figure:
369
  """Build a heatmap of risk ratios (agents x dimensions)."""
370
  if not submissions:
371
- fig = go.Figure()
372
- fig.add_annotation(text="No submissions yet", showarrow=False)
373
- return fig
374
 
375
  agent_names = []
376
  z_values = []
@@ -409,11 +697,12 @@ def build_risk_heatmap(submissions: list[dict]) -> go.Figure:
409
  colorbar=dict(title="Risk Ratio"),
410
  ))
411
 
412
- fig.update_layout(
413
- title="Risk Ratio Heatmap (Green=Low, Yellow=Medium, Red=High)",
414
  height=max(300, 60 * len(agent_names) + 100),
415
- xaxis=dict(side="top"),
416
- )
 
417
  return fig
418
 
419
 
@@ -422,16 +711,13 @@ def build_pareto_frontier(submissions: list[dict]) -> go.Figure:
422
  fig = go.Figure()
423
 
424
  if not submissions:
425
- fig.add_annotation(text="No submissions yet", showarrow=False,
426
- xref="paper", yref="paper", x=0.5, y=0.5)
427
- fig.update_layout(title="Performance-Safety Frontier", height=500)
428
- return fig
429
 
430
  # Diagonal line (perfect safety: CuP = CR)
431
  fig.add_trace(go.Scatter(
432
  x=[0, 1], y=[0, 1],
433
  mode="lines",
434
- line=dict(color="gray", dash="dash", width=1),
435
  name="Perfect Safety (CuP=CR)",
436
  showlegend=True,
437
  ))
@@ -451,14 +737,14 @@ def build_pareto_frontier(submissions: list[dict]) -> go.Figure:
451
  risks.append(avg_risk)
452
 
453
  # Color by risk level
454
- colors = []
455
  for r in risks:
456
  if r <= 0.05:
457
- colors.append("#22c55e")
458
  elif r <= 0.15:
459
- colors.append("#eab308")
460
  else:
461
- colors.append("#ef4444")
462
 
463
  hover_text = [
464
  f"<b>{n}</b><br>Team: {t}<br>CR: {cr:.3f}<br>CuP: {cup:.3f}<br>"
@@ -470,10 +756,10 @@ def build_pareto_frontier(submissions: list[dict]) -> go.Figure:
470
  x=crs,
471
  y=cups,
472
  mode="markers+text",
473
- marker=dict(size=14, color=colors, line=dict(width=1, color="white")),
474
  text=names,
475
  textposition="top center",
476
- textfont=dict(size=10),
477
  hovertext=hover_text,
478
  hoverinfo="text",
479
  name="Agents",
@@ -493,19 +779,19 @@ def build_pareto_frontier(submissions: list[dict]) -> go.Figure:
493
  fig.add_trace(go.Scatter(
494
  x=pareto_x, y=pareto_y,
495
  mode="lines",
496
- line=dict(color="#3b82f6", width=2),
497
  name="Pareto Frontier",
498
  ))
499
 
500
- fig.update_layout(
501
  title="Performance-Safety Frontier",
502
  xaxis_title="CR (Completion Rate)",
503
  yaxis_title="CuP (Completion under Policy)",
504
- xaxis=dict(range=[-0.02, 1.02]),
505
- yaxis=dict(range=[-0.02, 1.02]),
506
  height=550,
507
  legend=dict(x=0.02, y=0.98),
508
- )
509
  return fig
510
 
511
 
@@ -815,24 +1101,94 @@ def create_app() -> gr.Blocks:
815
  submissions = load_submissions()
816
  agent_choices = [s.get("metadata", {}).get("agent_id", "?") for s in submissions]
817
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
818
  with gr.Blocks(
819
  title="ST-WebAgentBench Leaderboard",
820
- theme=gr.themes.Soft(),
 
821
  ) as demo:
822
 
823
- gr.HTML("""
824
- <div style="text-align:center; padding: 20px 0;">
825
- <h1 style="margin-bottom: 5px;">ST-WebAgentBench Leaderboard</h1>
826
- <p style="color: #666; font-size: 1.1em;">
827
- Evaluating Safety &amp; Trustworthiness in Web Agents &mdash;
828
- <b>ICLR 2025</b>
829
- </p>
830
- <p style="font-size: 0.9em;">
831
- <a href="https://arxiv.org/abs/2410.06703" target="_blank">Paper</a> &nbsp;|&nbsp;
832
- <a href="https://huggingface.co/datasets/dolev31/st-webagentbench" target="_blank">Dataset</a> &nbsp;|&nbsp;
833
- <a href="https://github.com/segev-shlomov/ST-WebAgentBench" target="_blank">GitHub</a> &nbsp;|&nbsp;
834
- <a href="https://sites.google.com/view/st-webagentbench/home" target="_blank">Website</a>
835
  </p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
836
  </div>
837
  """)
838
 
@@ -840,7 +1196,7 @@ def create_app() -> gr.Blocks:
840
 
841
  # ---- Tab 1: Leaderboard ----
842
  with gr.TabItem("Leaderboard"):
843
- with gr.Row():
844
  sort_by = gr.Dropdown(
845
  choices=["CuP", "CR", "semi-CuP", "Risk Ratio", "Gap", "Date"],
846
  value="CuP", label="Sort by",
@@ -855,7 +1211,8 @@ def create_app() -> gr.Blocks:
855
  leaderboard_table = gr.Dataframe(
856
  value=build_main_table(submissions),
857
  interactive=False,
858
- label="Ranked by CuP (Completion under Policy) — the primary ST-WebAgentBench metric",
 
859
  )
860
 
861
  def update_table(sort_val, model_val, open_val, verified_val):
@@ -873,11 +1230,17 @@ def create_app() -> gr.Blocks:
873
  gr.Markdown("### Performance-Safety Frontier")
874
  pareto_plot = gr.Plot(
875
  value=build_pareto_frontier(submissions),
876
- label="CR vs CuP — agents on the frontier are Pareto-optimal",
877
  )
 
 
 
 
 
 
 
878
 
879
- # ---- Tab 2: Safety Profile ----
880
- with gr.TabItem("Safety Profile"):
881
  agent_selector = gr.Dropdown(
882
  choices=agent_choices,
883
  multiselect=True,
@@ -899,25 +1262,8 @@ def create_app() -> gr.Blocks:
899
 
900
  agent_selector.change(update_radar, inputs=[agent_selector], outputs=[radar_chart], api_name=False)
901
 
902
- # ---- Tab 3: Frontier (standalone) ----
903
- with gr.TabItem("Frontier"):
904
- gr.Markdown("""
905
- ### Performance-Safety Frontier
906
-
907
- This scatter plot shows each agent's **CR** (task completion ignoring safety)
908
- vs **CuP** (task completion with zero policy violations).
909
-
910
- - The **diagonal** (y=x) represents perfect policy adherence
911
- - Distance below the diagonal = the agent's **safety gap**
912
- - The **Pareto frontier** connects agents that are best-in-class for their safety level
913
- - **Dot color**: Green = low risk, Yellow = medium, Red = high
914
- """)
915
- frontier_plot = gr.Plot(
916
- value=build_pareto_frontier(submissions),
917
- )
918
-
919
- # ---- Tab 4: Tier Analysis ----
920
- with gr.TabItem("Tier Analysis"):
921
  gr.Markdown("""
922
  ### CRM Difficulty Tier Breakdown
923
 
@@ -933,16 +1279,16 @@ def create_app() -> gr.Blocks:
933
  interactive=False,
934
  )
935
 
936
- # ---- Tab 5: Per-App ----
937
- with gr.TabItem("Per-App Breakdown"):
938
  gr.Markdown("### Performance by Web Application")
939
  app_table = gr.Dataframe(
940
  value=build_app_table(submissions),
941
  interactive=False,
942
  )
943
 
944
- # ---- Tab 6: Get Signing Key ----
945
- with gr.TabItem("Get Signing Key"):
946
  gr.Markdown("""
947
  ## Get Your Signing Key
948
 
@@ -955,10 +1301,11 @@ def create_app() -> gr.Blocks:
955
  **Important:** Use the **same email** here and as `--contact-email`
956
  when generating your submission file.
957
  """)
958
- key_email = gr.Textbox(label="Email *", placeholder="you@example.com")
959
- key_team = gr.Textbox(label="Team Name *", placeholder="Your Team")
960
- key_institution = gr.Textbox(label="Institution (optional)", placeholder="University / Company")
961
- key_btn = gr.Button("Generate Signing Key", variant="primary")
 
962
  key_result = gr.Textbox(label="Your Signing Key", interactive=False, lines=6)
963
 
964
  key_btn.click(
@@ -968,7 +1315,7 @@ def create_app() -> gr.Blocks:
968
  api_name=False,
969
  )
970
 
971
- # ---- Tab 7: Submit ----
972
  with gr.TabItem("Submit"):
973
  gr.Markdown(f"""
974
  ## Submit Your Results
@@ -1005,8 +1352,9 @@ def create_app() -> gr.Blocks:
1005
  5. **Anti-gaming** — rate limiting, duplicate detection, completeness enforcement
1006
  """)
1007
 
1008
- upload = gr.File(label="Upload submission.json", file_types=[".json"])
1009
- submit_btn = gr.Button("Validate & Submit", variant="primary")
 
1010
  result_text = gr.Textbox(label="Verification Report", interactive=False, lines=20)
1011
 
1012
  submit_btn.click(
@@ -1016,8 +1364,9 @@ def create_app() -> gr.Blocks:
1016
  api_name=False,
1017
  )
1018
 
1019
- # ---- Tab 8: FAQ ----
1020
  with gr.TabItem("FAQ"):
 
1021
  gr.Markdown("""
1022
  ## Frequently Asked Questions
1023
 
@@ -1349,7 +1698,7 @@ or visit the [project website](https://sites.google.com/view/st-webagentbench/ho
1349
  contact details.
1350
  """)
1351
 
1352
- # ---- Tab 9: About ----
1353
  with gr.TabItem("About"):
1354
  # Build dimensions list dynamically
1355
  _dim_lines = "\n".join(
 
21
  from typing import List, Optional
22
 
23
  import gradio as gr
24
+ from gradio.themes.utils import colors, fonts, sizes
25
  import pandas as pd
26
  import plotly.graph_objects as go
27
 
 
188
 
189
  RISK_COLORS = {"low": "#22c55e", "medium": "#eab308", "high": "#ef4444"}
190
 
191
+ # ---------------------------------------------------------------------------
192
+ # UI Design Constants
193
+ # ---------------------------------------------------------------------------
194
+
195
+ CUSTOM_CSS = """
196
+ /* === Global === */
197
+ .gradio-container {
198
+ max-width: 1200px !important;
199
+ margin: 0 auto !important;
200
+ }
201
+
202
+ /* === Hero Header === */
203
+ #hero-header {
204
+ background: linear-gradient(135deg, #1e3a8a 0%, #312e81 50%, #1e293b 100%);
205
+ border-radius: 16px;
206
+ padding: 40px 48px 32px;
207
+ margin-bottom: 8px;
208
+ position: relative;
209
+ overflow: hidden;
210
+ }
211
+ #hero-header::before {
212
+ content: '';
213
+ position: absolute;
214
+ top: -50%;
215
+ right: -20%;
216
+ width: 500px;
217
+ height: 500px;
218
+ background: radial-gradient(circle, rgba(99, 102, 241, 0.15) 0%, transparent 70%);
219
+ pointer-events: none;
220
+ }
221
+ #hero-header h1 {
222
+ color: white;
223
+ font-size: 2rem;
224
+ font-weight: 700;
225
+ margin: 0 0 6px 0;
226
+ letter-spacing: -0.02em;
227
+ }
228
+ #hero-header .subtitle {
229
+ color: #cbd5e1;
230
+ font-size: 1.05rem;
231
+ margin: 0 0 16px 0;
232
+ font-weight: 400;
233
+ }
234
+ #hero-header .iclr-badge {
235
+ display: inline-block;
236
+ background: linear-gradient(135deg, #6366f1, #818cf8);
237
+ color: white;
238
+ font-size: 0.75rem;
239
+ font-weight: 600;
240
+ padding: 3px 10px;
241
+ border-radius: 9999px;
242
+ letter-spacing: 0.03em;
243
+ vertical-align: middle;
244
+ margin-left: 8px;
245
+ }
246
+ #hero-header .nav-links {
247
+ margin-top: 12px;
248
+ display: flex;
249
+ gap: 20px;
250
+ flex-wrap: wrap;
251
+ }
252
+ #hero-header .nav-links a {
253
+ color: #93c5fd;
254
+ text-decoration: none;
255
+ font-size: 0.9rem;
256
+ font-weight: 500;
257
+ transition: color 0.15s ease;
258
+ display: inline-flex;
259
+ align-items: center;
260
+ gap: 4px;
261
+ }
262
+ #hero-header .nav-links a:hover {
263
+ color: white;
264
+ }
265
+ #hero-header .stats-strip {
266
+ display: flex;
267
+ gap: 32px;
268
+ margin-top: 20px;
269
+ padding-top: 16px;
270
+ border-top: 1px solid rgba(255,255,255,0.1);
271
+ flex-wrap: wrap;
272
+ }
273
+ #hero-header .stat-item {
274
+ text-align: left;
275
+ }
276
+ #hero-header .stat-value {
277
+ color: white;
278
+ font-size: 1.5rem;
279
+ font-weight: 700;
280
+ line-height: 1.2;
281
+ }
282
+ #hero-header .stat-label {
283
+ color: #94a3b8;
284
+ font-size: 0.78rem;
285
+ font-weight: 500;
286
+ text-transform: uppercase;
287
+ letter-spacing: 0.05em;
288
+ }
289
+ #hero-header .logo-row {
290
+ display: flex;
291
+ align-items: center;
292
+ gap: 16px;
293
+ margin-bottom: 12px;
294
+ }
295
+ #hero-header .logo-row img {
296
+ height: 28px;
297
+ filter: brightness(0) invert(1);
298
+ opacity: 0.9;
299
+ }
300
+
301
+ /* === Tabs === */
302
+ .tabs > .tab-nav {
303
+ border-bottom: 2px solid #e2e8f0 !important;
304
+ gap: 0 !important;
305
+ padding: 0 4px !important;
306
+ background: transparent !important;
307
+ }
308
+ .tabs > .tab-nav > button {
309
+ border: none !important;
310
+ border-bottom: 2px solid transparent !important;
311
+ margin-bottom: -2px !important;
312
+ padding: 10px 18px !important;
313
+ font-weight: 500 !important;
314
+ font-size: 0.9rem !important;
315
+ color: #64748b !important;
316
+ background: transparent !important;
317
+ transition: color 0.15s ease, border-color 0.15s ease !important;
318
+ border-radius: 0 !important;
319
+ box-shadow: none !important;
320
+ }
321
+ .tabs > .tab-nav > button:hover {
322
+ color: #1e293b !important;
323
+ background: transparent !important;
324
+ }
325
+ .tabs > .tab-nav > button.selected {
326
+ color: #2563eb !important;
327
+ border-bottom-color: #2563eb !important;
328
+ font-weight: 600 !important;
329
+ background: transparent !important;
330
+ }
331
+
332
+ /* === Tables (Dataframe) === */
333
+ .table-wrap {
334
+ border-radius: 12px !important;
335
+ overflow: hidden !important;
336
+ border: 1px solid #e2e8f0 !important;
337
+ }
338
+ .table-wrap table {
339
+ border-collapse: collapse !important;
340
+ }
341
+ .table-wrap table thead th {
342
+ background: #f1f5f9 !important;
343
+ color: #334155 !important;
344
+ font-weight: 600 !important;
345
+ font-size: 0.82rem !important;
346
+ text-transform: uppercase !important;
347
+ letter-spacing: 0.04em !important;
348
+ padding: 12px 16px !important;
349
+ border-bottom: 2px solid #e2e8f0 !important;
350
+ }
351
+ .table-wrap table tbody td {
352
+ padding: 10px 16px !important;
353
+ font-size: 0.88rem !important;
354
+ border-bottom: 1px solid #f1f5f9 !important;
355
+ }
356
+ .table-wrap table tbody tr:hover {
357
+ background: #eff6ff !important;
358
+ }
359
+
360
+ /* === Accordion (FAQ) === */
361
+ .faq-section .accordion {
362
+ border: 1px solid #e2e8f0 !important;
363
+ border-radius: 10px !important;
364
+ margin-bottom: 8px !important;
365
+ overflow: hidden !important;
366
+ box-shadow: none !important;
367
+ }
368
+ .faq-section .accordion > .label-wrap {
369
+ padding: 14px 18px !important;
370
+ background: white !important;
371
+ }
372
+ .faq-section .accordion > .label-wrap:hover {
373
+ background: #f8fafc !important;
374
+ }
375
+ .faq-section .accordion .prose {
376
+ padding: 4px 18px 18px !important;
377
+ color: #475569 !important;
378
+ line-height: 1.65 !important;
379
+ }
380
+ .faq-section h3 {
381
+ color: #1e293b !important;
382
+ font-size: 1.05rem !important;
383
+ font-weight: 600 !important;
384
+ margin-top: 28px !important;
385
+ margin-bottom: 12px !important;
386
+ padding-bottom: 6px !important;
387
+ border-bottom: 1px solid #e2e8f0 !important;
388
+ }
389
+
390
+ /* === Form Cards === */
391
+ .form-card {
392
+ background: white !important;
393
+ border: 1px solid #e2e8f0 !important;
394
+ border-radius: 12px !important;
395
+ padding: 24px !important;
396
+ box-shadow: 0 1px 3px 0 rgb(0 0 0 / 0.04) !important;
397
+ }
398
+
399
+ /* === Filter Row === */
400
+ .filter-row {
401
+ background: #f8fafc !important;
402
+ border: 1px solid #e2e8f0 !important;
403
+ border-radius: 10px !important;
404
+ padding: 12px 16px !important;
405
+ margin-bottom: 12px !important;
406
+ }
407
+
408
+ /* === Responsive === */
409
+ @media (max-width: 768px) {
410
+ #hero-header {
411
+ padding: 28px 24px 24px;
412
+ }
413
+ #hero-header h1 {
414
+ font-size: 1.5rem;
415
+ }
416
+ #hero-header .stats-strip {
417
+ gap: 20px;
418
+ }
419
+ #hero-header .stat-value {
420
+ font-size: 1.2rem;
421
+ }
422
+ .tabs > .tab-nav > button {
423
+ padding: 8px 12px !important;
424
+ font-size: 0.82rem !important;
425
+ }
426
+ }
427
+ """
428
+
429
+ # --- Plotly Style Constants ---
430
+ PLOTLY_FONT = "Inter, system-ui, sans-serif"
431
+ PLOTLY_TEXT_COLOR = "#334155" # slate-700
432
+ PLOTLY_TITLE_COLOR = "#1e293b" # slate-800
433
+ PLOTLY_GRID_COLOR = "#e2e8f0" # slate-200
434
+
435
+ PLOTLY_COLORWAY = [
436
+ "#3b82f6", # blue-500
437
+ "#6366f1", # indigo-500
438
+ "#8b5cf6", # violet-500
439
+ "#06b6d4", # cyan-500
440
+ "#10b981", # emerald-500
441
+ "#f59e0b", # amber-500
442
+ ]
443
+
444
+
445
+ def _plotly_layout(**overrides) -> dict:
446
+ """Consistent Plotly layout kwargs."""
447
+ defaults = dict(
448
+ font=dict(family=PLOTLY_FONT, color=PLOTLY_TEXT_COLOR, size=13),
449
+ title_font=dict(family=PLOTLY_FONT, color=PLOTLY_TITLE_COLOR, size=16),
450
+ plot_bgcolor="rgba(0,0,0,0)",
451
+ paper_bgcolor="rgba(0,0,0,0)",
452
+ margin=dict(l=48, r=24, t=56, b=48),
453
+ legend=dict(
454
+ font=dict(size=12),
455
+ bgcolor="rgba(255,255,255,0.8)",
456
+ bordercolor="#e2e8f0",
457
+ borderwidth=1,
458
+ ),
459
+ colorway=PLOTLY_COLORWAY,
460
+ )
461
+ defaults.update(overrides)
462
+ return defaults
463
+
464
+
465
def _empty_figure(
    message: str,
    height: int = 400,
    hint: str = "Submit results to populate this chart",
) -> go.Figure:
    """Build a placeholder figure shown when a chart has nothing to plot.

    Args:
        message: Headline rendered in bold at the centre of the figure.
        height: Figure height in pixels.
        hint: Smaller secondary line rendered under the headline. Pass an
            empty string to omit it, e.g. when the default text about
            submitting results does not fit the empty state being shown.

    Returns:
        A figure with hidden axes and a single centred annotation.
    """
    text = f"<b>{message}</b>"
    if hint:
        text += (
            f"<br><span style='font-size:12px;color:#94a3b8'>"
            f"{hint}</span>"
        )

    fig = go.Figure()
    fig.add_annotation(
        text=text,
        showarrow=False,
        # Paper coordinates keep the text centred regardless of axis ranges.
        xref="paper", yref="paper", x=0.5, y=0.5,
        font=dict(size=16, color="#64748b", family=PLOTLY_FONT),
    )
    fig.update_layout(
        **_plotly_layout(height=height),
        xaxis=dict(visible=False),
        yaxis=dict(visible=False),
    )
    return fig
481
+
482
 
483
  # ---------------------------------------------------------------------------
484
  # Submission status workflow
 
608
  fig = go.Figure()
609
 
610
  if not selected_agents:
611
+ return _empty_figure("Select agents to compare", 500)
 
 
 
612
 
613
  dim_labels = [DIMENSION_DISPLAY.get(d, d) for d in SAFETY_DIMENSIONS]
614
+ chart_colors = PLOTLY_COLORWAY[:4]
615
 
616
  for i, agent_name in enumerate(selected_agents[:4]):
617
  # Find submission
 
639
  theta=labels,
640
  fill="toself",
641
  name=agent_name,
642
+ line=dict(color=chart_colors[i % len(chart_colors)]),
643
  opacity=0.6,
644
  ))
645
 
646
+ fig.update_layout(**_plotly_layout(
 
 
 
647
  title="Safety Dimension Radar (higher = safer)",
648
  height=500,
649
+ polar=dict(
650
+ radialaxis=dict(visible=True, range=[0, 1], gridcolor=PLOTLY_GRID_COLOR),
651
+ angularaxis=dict(gridcolor=PLOTLY_GRID_COLOR),
652
+ bgcolor="rgba(0,0,0,0)",
653
+ ),
654
+ ))
655
  return fig
656
 
657
 
658
  def build_risk_heatmap(submissions: list[dict]) -> go.Figure:
659
  """Build a heatmap of risk ratios (agents x dimensions)."""
660
  if not submissions:
661
+ return _empty_figure("No submissions yet")
 
 
662
 
663
  agent_names = []
664
  z_values = []
 
697
  colorbar=dict(title="Risk Ratio"),
698
  ))
699
 
700
+ fig.update_layout(**_plotly_layout(
701
+ title="Risk Heatmap by Safety Dimension",
702
  height=max(300, 60 * len(agent_names) + 100),
703
+ xaxis=dict(side="top", tickfont=dict(size=11)),
704
+ yaxis=dict(tickfont=dict(size=12)),
705
+ ))
706
  return fig
707
 
708
 
 
711
  fig = go.Figure()
712
 
713
  if not submissions:
714
+ return _empty_figure("No submissions yet", 550)
 
 
 
715
 
716
  # Diagonal line (perfect safety: CuP = CR)
717
  fig.add_trace(go.Scatter(
718
  x=[0, 1], y=[0, 1],
719
  mode="lines",
720
+ line=dict(color="#94a3b8", dash="dash", width=1),
721
  name="Perfect Safety (CuP=CR)",
722
  showlegend=True,
723
  ))
 
737
  risks.append(avg_risk)
738
 
739
  # Color by risk level
740
+ dot_colors = []
741
  for r in risks:
742
  if r <= 0.05:
743
+ dot_colors.append("#22c55e")
744
  elif r <= 0.15:
745
+ dot_colors.append("#eab308")
746
  else:
747
+ dot_colors.append("#ef4444")
748
 
749
  hover_text = [
750
  f"<b>{n}</b><br>Team: {t}<br>CR: {cr:.3f}<br>CuP: {cup:.3f}<br>"
 
756
  x=crs,
757
  y=cups,
758
  mode="markers+text",
759
+ marker=dict(size=14, color=dot_colors, line=dict(width=1.5, color="white")),
760
  text=names,
761
  textposition="top center",
762
+ textfont=dict(size=10, family=PLOTLY_FONT),
763
  hovertext=hover_text,
764
  hoverinfo="text",
765
  name="Agents",
 
779
  fig.add_trace(go.Scatter(
780
  x=pareto_x, y=pareto_y,
781
  mode="lines",
782
+ line=dict(color="#4f46e5", width=2, dash="dot"),
783
  name="Pareto Frontier",
784
  ))
785
 
786
+ fig.update_layout(**_plotly_layout(
787
  title="Performance-Safety Frontier",
788
  xaxis_title="CR (Completion Rate)",
789
  yaxis_title="CuP (Completion under Policy)",
790
+ xaxis=dict(range=[-0.02, 1.02], gridcolor="#f1f5f9", zeroline=False),
791
+ yaxis=dict(range=[-0.02, 1.02], gridcolor="#f1f5f9", zeroline=False),
792
  height=550,
793
  legend=dict(x=0.02, y=0.98),
794
+ ))
795
  return fig
796
 
797
 
 
1101
  submissions = load_submissions()
1102
  agent_choices = [s.get("metadata", {}).get("agent_id", "?") for s in submissions]
1103
 
1104
+ theme = gr.themes.Soft(
1105
+ primary_hue=colors.blue,
1106
+ secondary_hue=colors.indigo,
1107
+ neutral_hue=colors.slate,
1108
+ spacing_size=sizes.spacing_md,
1109
+ radius_size=sizes.radius_md,
1110
+ text_size=sizes.text_md,
1111
+ font=(
1112
+ gr.themes.GoogleFont("Inter"),
1113
+ "ui-sans-serif",
1114
+ "system-ui",
1115
+ "sans-serif",
1116
+ ),
1117
+ font_mono=(
1118
+ gr.themes.GoogleFont("JetBrains Mono"),
1119
+ "ui-monospace",
1120
+ "Consolas",
1121
+ "monospace",
1122
+ ),
1123
+ ).set(
1124
+ body_background_fill="#f8fafc",
1125
+ body_text_color="#1e293b",
1126
+ body_text_color_subdued="#64748b",
1127
+ block_background_fill="white",
1128
+ block_border_width="1px",
1129
+ block_border_color="#e2e8f0",
1130
+ block_shadow="0 1px 3px 0 rgb(0 0 0 / 0.05), 0 1px 2px -1px rgb(0 0 0 / 0.05)",
1131
+ block_label_background_fill="*primary_50",
1132
+ block_label_text_color="*primary_700",
1133
+ button_primary_background_fill="linear-gradient(135deg, *primary_500, *secondary_500)",
1134
+ button_primary_background_fill_hover="linear-gradient(135deg, *primary_600, *secondary_600)",
1135
+ button_primary_shadow="0 4px 6px -1px rgb(59 130 246 / 0.25)",
1136
+ button_primary_border_color="transparent",
1137
+ button_secondary_background_fill="white",
1138
+ button_secondary_border_color="*primary_200",
1139
+ button_secondary_text_color="*primary_600",
1140
+ input_background_fill="white",
1141
+ input_border_color="#e2e8f0",
1142
+ input_border_width="1px",
1143
+ input_shadow="none",
1144
+ input_shadow_focus="0 0 0 3px rgb(59 130 246 / 0.15)",
1145
+ table_border_color="#e2e8f0",
1146
+ table_even_background_fill="white",
1147
+ table_odd_background_fill="#f8fafc",
1148
+ link_text_color="*primary_600",
1149
+ link_text_color_hover="*primary_700",
1150
+ link_text_color_active="*primary_800",
1151
+ )
1152
+
1153
  with gr.Blocks(
1154
  title="ST-WebAgentBench Leaderboard",
1155
+ theme=theme,
1156
+ css=CUSTOM_CSS,
1157
  ) as demo:
1158
 
1159
+ gr.HTML(f"""
1160
+ <div id="hero-header">
1161
+ <div class="logo-row">
1162
+ <img src="assets/ibm_research_logo.png" alt="IBM Research" />
1163
+ </div>
1164
+ <h1>ST-WebAgentBench <span class="iclr-badge">ICLR 2025</span></h1>
1165
+ <p class="subtitle">
1166
+ Evaluating Safety &amp; Trustworthiness in Web Agents
 
 
 
 
1167
  </p>
1168
+ <div class="nav-links">
1169
+ <a href="https://arxiv.org/abs/2410.06703" target="_blank">&#128196; Paper</a>
1170
+ <a href="https://huggingface.co/datasets/dolev31/st-webagentbench" target="_blank">&#128202; Dataset</a>
1171
+ <a href="https://github.com/segev-shlomov/ST-WebAgentBench" target="_blank">&#128187; GitHub</a>
1172
+ <a href="https://sites.google.com/view/st-webagentbench/home" target="_blank">&#127760; Website</a>
1173
+ </div>
1174
+ <div class="stats-strip">
1175
+ <div class="stat-item">
1176
+ <div class="stat-value">{EXPECTED_TASK_COUNT}</div>
1177
+ <div class="stat-label">Tasks</div>
1178
+ </div>
1179
+ <div class="stat-item">
1180
+ <div class="stat-value">{EXPECTED_POLICY_COUNT:,}</div>
1181
+ <div class="stat-label">Policies</div>
1182
+ </div>
1183
+ <div class="stat-item">
1184
+ <div class="stat-value">{len(SAFETY_DIMENSIONS)}</div>
1185
+ <div class="stat-label">Safety Dimensions</div>
1186
+ </div>
1187
+ <div class="stat-item">
1188
+ <div class="stat-value">3</div>
1189
+ <div class="stat-label">Web Applications</div>
1190
+ </div>
1191
+ </div>
1192
  </div>
1193
  """)
1194
 
 
1196
 
1197
  # ---- Tab 1: Leaderboard ----
1198
  with gr.TabItem("Leaderboard"):
1199
+ with gr.Row(elem_classes="filter-row"):
1200
  sort_by = gr.Dropdown(
1201
  choices=["CuP", "CR", "semi-CuP", "Risk Ratio", "Gap", "Date"],
1202
  value="CuP", label="Sort by",
 
1211
  leaderboard_table = gr.Dataframe(
1212
  value=build_main_table(submissions),
1213
  interactive=False,
1214
+ label="Ranked by CuP (Completion under Policy)",
1215
+ elem_id="leaderboard-table",
1216
  )
1217
 
1218
  def update_table(sort_val, model_val, open_val, verified_val):
 
1230
  gr.Markdown("### Performance-Safety Frontier")
1231
  pareto_plot = gr.Plot(
1232
  value=build_pareto_frontier(submissions),
 
1233
  )
1234
+ with gr.Accordion("How to read this chart", open=False):
1235
+ gr.Markdown("""
1236
+ - The **diagonal** (y=x) represents perfect policy adherence
1237
+ - Distance below the diagonal = the agent's **safety gap**
1238
+ - The **Pareto frontier** connects agents that are best-in-class at their safety level
1239
+ - **Dot color**: Green = low risk, Yellow = medium, Red = high
1240
+ """)
1241
 
1242
+ # ---- Tab 2: Safety ----
1243
+ with gr.TabItem("Safety"):
1244
  agent_selector = gr.Dropdown(
1245
  choices=agent_choices,
1246
  multiselect=True,
 
1262
 
1263
  agent_selector.change(update_radar, inputs=[agent_selector], outputs=[radar_chart], api_name=False)
1264
 
1265
+ # ---- Tab 3: Tiers ----
1266
+ with gr.TabItem("Tiers"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1267
  gr.Markdown("""
1268
  ### CRM Difficulty Tier Breakdown
1269
 
 
1279
  interactive=False,
1280
  )
1281
 
1282
+ # ---- Tab 4: Per-App ----
1283
+ with gr.TabItem("Per-App"):
1284
  gr.Markdown("### Performance by Web Application")
1285
  app_table = gr.Dataframe(
1286
  value=build_app_table(submissions),
1287
  interactive=False,
1288
  )
1289
 
1290
+ # ---- Tab 5: Get Key ----
1291
+ with gr.TabItem("Get Key"):
1292
  gr.Markdown("""
1293
  ## Get Your Signing Key
1294
 
 
1301
  **Important:** Use the **same email** here and as `--contact-email`
1302
  when generating your submission file.
1303
  """)
1304
+ with gr.Group(elem_classes="form-card"):
1305
+ key_email = gr.Textbox(label="Email", placeholder="you@example.com")
1306
+ key_team = gr.Textbox(label="Team Name", placeholder="Your Team")
1307
+ key_institution = gr.Textbox(label="Institution (optional)", placeholder="University / Company")
1308
+ key_btn = gr.Button("Generate Signing Key", variant="primary")
1309
  key_result = gr.Textbox(label="Your Signing Key", interactive=False, lines=6)
1310
 
1311
  key_btn.click(
 
1315
  api_name=False,
1316
  )
1317
 
1318
+ # ---- Tab 6: Submit ----
1319
  with gr.TabItem("Submit"):
1320
  gr.Markdown(f"""
1321
  ## Submit Your Results
 
1352
  5. **Anti-gaming** — rate limiting, duplicate detection, completeness enforcement
1353
  """)
1354
 
1355
+ with gr.Group(elem_classes="form-card"):
1356
+ upload = gr.File(label="Upload submission.json", file_types=[".json"])
1357
+ submit_btn = gr.Button("Validate & Submit", variant="primary")
1358
  result_text = gr.Textbox(label="Verification Report", interactive=False, lines=20)
1359
 
1360
  submit_btn.click(
 
1364
  api_name=False,
1365
  )
1366
 
1367
+ # ---- Tab 7: FAQ ----
1368
  with gr.TabItem("FAQ"):
1369
+ with gr.Column(elem_classes="faq-section"):
1370
  gr.Markdown("""
1371
  ## Frequently Asked Questions
1372
 
 
1698
  contact details.
1699
  """)
1700
 
1701
+ # ---- Tab 8: About ----
1702
  with gr.TabItem("About"):
1703
  # Build dimensions list dynamically
1704
  _dim_lines = "\n".join(
assets/ibm_research_logo.png ADDED

Git LFS Details

  • SHA256: 4c2e6739f852352d125a0708b594f3af7294a8411045fbda1b900d6abf935906
  • Pointer size: 131 Bytes
  • Size of remote file: 116 kB