Redesign leaderboard UI with polished theme, hero header, and consistent styling
Browse files
- Add custom Gradio theme (blue-indigo palette, Inter/JetBrains Mono fonts)
- Add dark gradient hero header with IBM Research logo, ICLR 2025 badge, stats strip
- Add ~200 lines of custom CSS: underline tabs, polished tables, form cards, FAQ styling
- Consolidate Frontier tab into Leaderboard (removes duplicate Pareto plot)
- Rename tabs for cleaner navigation (Safety, Tiers, Per-App, Get Key)
- Add Plotly style helpers for consistent chart styling and polished empty states
- Add responsive breakpoints for mobile
- app.py +434 -85
- assets/ibm_research_logo.png +3 -0
app.py
CHANGED
|
@@ -21,6 +21,7 @@ from pathlib import Path
|
|
| 21 |
from typing import List, Optional
|
| 22 |
|
| 23 |
import gradio as gr
|
|
|
|
| 24 |
import pandas as pd
|
| 25 |
import plotly.graph_objects as go
|
| 26 |
|
|
@@ -187,6 +188,297 @@ def handle_key_request(email: str, team: str, institution: str) -> str:
|
|
| 187 |
|
| 188 |
RISK_COLORS = {"low": "#22c55e", "medium": "#eab308", "high": "#ef4444"}
|
| 189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
|
| 191 |
# ---------------------------------------------------------------------------
|
| 192 |
# Submission status workflow
|
|
@@ -316,13 +608,10 @@ def build_radar_chart(submissions: list[dict],
|
|
| 316 |
fig = go.Figure()
|
| 317 |
|
| 318 |
if not selected_agents:
|
| 319 |
-
|
| 320 |
-
xref="paper", yref="paper", x=0.5, y=0.5)
|
| 321 |
-
fig.update_layout(title="Safety Dimension Radar", height=500)
|
| 322 |
-
return fig
|
| 323 |
|
| 324 |
dim_labels = [DIMENSION_DISPLAY.get(d, d) for d in SAFETY_DIMENSIONS]
|
| 325 |
-
|
| 326 |
|
| 327 |
for i, agent_name in enumerate(selected_agents[:4]):
|
| 328 |
# Find submission
|
|
@@ -350,27 +639,26 @@ def build_radar_chart(submissions: list[dict],
|
|
| 350 |
theta=labels,
|
| 351 |
fill="toself",
|
| 352 |
name=agent_name,
|
| 353 |
-
line=dict(color=
|
| 354 |
opacity=0.6,
|
| 355 |
))
|
| 356 |
|
| 357 |
-
fig.update_layout(
|
| 358 |
-
polar=dict(
|
| 359 |
-
radialaxis=dict(visible=True, range=[0, 1]),
|
| 360 |
-
),
|
| 361 |
title="Safety Dimension Radar (higher = safer)",
|
| 362 |
height=500,
|
| 363 |
-
|
| 364 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
return fig
|
| 366 |
|
| 367 |
|
| 368 |
def build_risk_heatmap(submissions: list[dict]) -> go.Figure:
|
| 369 |
"""Build a heatmap of risk ratios (agents x dimensions)."""
|
| 370 |
if not submissions:
|
| 371 |
-
|
| 372 |
-
fig.add_annotation(text="No submissions yet", showarrow=False)
|
| 373 |
-
return fig
|
| 374 |
|
| 375 |
agent_names = []
|
| 376 |
z_values = []
|
|
@@ -409,11 +697,12 @@ def build_risk_heatmap(submissions: list[dict]) -> go.Figure:
|
|
| 409 |
colorbar=dict(title="Risk Ratio"),
|
| 410 |
))
|
| 411 |
|
| 412 |
-
fig.update_layout(
|
| 413 |
-
title="Risk
|
| 414 |
height=max(300, 60 * len(agent_names) + 100),
|
| 415 |
-
xaxis=dict(side="top"),
|
| 416 |
-
|
|
|
|
| 417 |
return fig
|
| 418 |
|
| 419 |
|
|
@@ -422,16 +711,13 @@ def build_pareto_frontier(submissions: list[dict]) -> go.Figure:
|
|
| 422 |
fig = go.Figure()
|
| 423 |
|
| 424 |
if not submissions:
|
| 425 |
-
|
| 426 |
-
xref="paper", yref="paper", x=0.5, y=0.5)
|
| 427 |
-
fig.update_layout(title="Performance-Safety Frontier", height=500)
|
| 428 |
-
return fig
|
| 429 |
|
| 430 |
# Diagonal line (perfect safety: CuP = CR)
|
| 431 |
fig.add_trace(go.Scatter(
|
| 432 |
x=[0, 1], y=[0, 1],
|
| 433 |
mode="lines",
|
| 434 |
-
line=dict(color="
|
| 435 |
name="Perfect Safety (CuP=CR)",
|
| 436 |
showlegend=True,
|
| 437 |
))
|
|
@@ -451,14 +737,14 @@ def build_pareto_frontier(submissions: list[dict]) -> go.Figure:
|
|
| 451 |
risks.append(avg_risk)
|
| 452 |
|
| 453 |
# Color by risk level
|
| 454 |
-
|
| 455 |
for r in risks:
|
| 456 |
if r <= 0.05:
|
| 457 |
-
|
| 458 |
elif r <= 0.15:
|
| 459 |
-
|
| 460 |
else:
|
| 461 |
-
|
| 462 |
|
| 463 |
hover_text = [
|
| 464 |
f"<b>{n}</b><br>Team: {t}<br>CR: {cr:.3f}<br>CuP: {cup:.3f}<br>"
|
|
@@ -470,10 +756,10 @@ def build_pareto_frontier(submissions: list[dict]) -> go.Figure:
|
|
| 470 |
x=crs,
|
| 471 |
y=cups,
|
| 472 |
mode="markers+text",
|
| 473 |
-
marker=dict(size=14, color=
|
| 474 |
text=names,
|
| 475 |
textposition="top center",
|
| 476 |
-
textfont=dict(size=10),
|
| 477 |
hovertext=hover_text,
|
| 478 |
hoverinfo="text",
|
| 479 |
name="Agents",
|
|
@@ -493,19 +779,19 @@ def build_pareto_frontier(submissions: list[dict]) -> go.Figure:
|
|
| 493 |
fig.add_trace(go.Scatter(
|
| 494 |
x=pareto_x, y=pareto_y,
|
| 495 |
mode="lines",
|
| 496 |
-
line=dict(color="#
|
| 497 |
name="Pareto Frontier",
|
| 498 |
))
|
| 499 |
|
| 500 |
-
fig.update_layout(
|
| 501 |
title="Performance-Safety Frontier",
|
| 502 |
xaxis_title="CR (Completion Rate)",
|
| 503 |
yaxis_title="CuP (Completion under Policy)",
|
| 504 |
-
xaxis=dict(range=[-0.02, 1.02]),
|
| 505 |
-
yaxis=dict(range=[-0.02, 1.02]),
|
| 506 |
height=550,
|
| 507 |
legend=dict(x=0.02, y=0.98),
|
| 508 |
-
)
|
| 509 |
return fig
|
| 510 |
|
| 511 |
|
|
@@ -815,24 +1101,94 @@ def create_app() -> gr.Blocks:
|
|
| 815 |
submissions = load_submissions()
|
| 816 |
agent_choices = [s.get("metadata", {}).get("agent_id", "?") for s in submissions]
|
| 817 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 818 |
with gr.Blocks(
|
| 819 |
title="ST-WebAgentBench Leaderboard",
|
| 820 |
-
theme=
|
|
|
|
| 821 |
) as demo:
|
| 822 |
|
| 823 |
-
gr.HTML("""
|
| 824 |
-
<div
|
| 825 |
-
<
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
<
|
| 830 |
-
|
| 831 |
-
<a href="https://arxiv.org/abs/2410.06703" target="_blank">Paper</a> |
|
| 832 |
-
<a href="https://huggingface.co/datasets/dolev31/st-webagentbench" target="_blank">Dataset</a> |
|
| 833 |
-
<a href="https://github.com/segev-shlomov/ST-WebAgentBench" target="_blank">GitHub</a> |
|
| 834 |
-
<a href="https://sites.google.com/view/st-webagentbench/home" target="_blank">Website</a>
|
| 835 |
</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 836 |
</div>
|
| 837 |
""")
|
| 838 |
|
|
@@ -840,7 +1196,7 @@ def create_app() -> gr.Blocks:
|
|
| 840 |
|
| 841 |
# ---- Tab 1: Leaderboard ----
|
| 842 |
with gr.TabItem("Leaderboard"):
|
| 843 |
-
with gr.Row():
|
| 844 |
sort_by = gr.Dropdown(
|
| 845 |
choices=["CuP", "CR", "semi-CuP", "Risk Ratio", "Gap", "Date"],
|
| 846 |
value="CuP", label="Sort by",
|
|
@@ -855,7 +1211,8 @@ def create_app() -> gr.Blocks:
|
|
| 855 |
leaderboard_table = gr.Dataframe(
|
| 856 |
value=build_main_table(submissions),
|
| 857 |
interactive=False,
|
| 858 |
-
label="Ranked by CuP (Completion under Policy)
|
|
|
|
| 859 |
)
|
| 860 |
|
| 861 |
def update_table(sort_val, model_val, open_val, verified_val):
|
|
@@ -873,11 +1230,17 @@ def create_app() -> gr.Blocks:
|
|
| 873 |
gr.Markdown("### Performance-Safety Frontier")
|
| 874 |
pareto_plot = gr.Plot(
|
| 875 |
value=build_pareto_frontier(submissions),
|
| 876 |
-
label="CR vs CuP — agents on the frontier are Pareto-optimal",
|
| 877 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 878 |
|
| 879 |
-
# ---- Tab 2: Safety
|
| 880 |
-
with gr.TabItem("Safety
|
| 881 |
agent_selector = gr.Dropdown(
|
| 882 |
choices=agent_choices,
|
| 883 |
multiselect=True,
|
|
@@ -899,25 +1262,8 @@ def create_app() -> gr.Blocks:
|
|
| 899 |
|
| 900 |
agent_selector.change(update_radar, inputs=[agent_selector], outputs=[radar_chart], api_name=False)
|
| 901 |
|
| 902 |
-
# ---- Tab 3:
|
| 903 |
-
with gr.TabItem("
|
| 904 |
-
gr.Markdown("""
|
| 905 |
-
### Performance-Safety Frontier
|
| 906 |
-
|
| 907 |
-
This scatter plot shows each agent's **CR** (task completion ignoring safety)
|
| 908 |
-
vs **CuP** (task completion with zero policy violations).
|
| 909 |
-
|
| 910 |
-
- The **diagonal** (y=x) represents perfect policy adherence
|
| 911 |
-
- Distance below the diagonal = the agent's **safety gap**
|
| 912 |
-
- The **Pareto frontier** connects agents that are best-in-class for their safety level
|
| 913 |
-
- **Dot color**: Green = low risk, Yellow = medium, Red = high
|
| 914 |
-
""")
|
| 915 |
-
frontier_plot = gr.Plot(
|
| 916 |
-
value=build_pareto_frontier(submissions),
|
| 917 |
-
)
|
| 918 |
-
|
| 919 |
-
# ---- Tab 4: Tier Analysis ----
|
| 920 |
-
with gr.TabItem("Tier Analysis"):
|
| 921 |
gr.Markdown("""
|
| 922 |
### CRM Difficulty Tier Breakdown
|
| 923 |
|
|
@@ -933,16 +1279,16 @@ def create_app() -> gr.Blocks:
|
|
| 933 |
interactive=False,
|
| 934 |
)
|
| 935 |
|
| 936 |
-
# ---- Tab
|
| 937 |
-
with gr.TabItem("Per-App
|
| 938 |
gr.Markdown("### Performance by Web Application")
|
| 939 |
app_table = gr.Dataframe(
|
| 940 |
value=build_app_table(submissions),
|
| 941 |
interactive=False,
|
| 942 |
)
|
| 943 |
|
| 944 |
-
# ---- Tab
|
| 945 |
-
with gr.TabItem("Get
|
| 946 |
gr.Markdown("""
|
| 947 |
## Get Your Signing Key
|
| 948 |
|
|
@@ -955,10 +1301,11 @@ def create_app() -> gr.Blocks:
|
|
| 955 |
**Important:** Use the **same email** here and as `--contact-email`
|
| 956 |
when generating your submission file.
|
| 957 |
""")
|
| 958 |
-
|
| 959 |
-
|
| 960 |
-
|
| 961 |
-
|
|
|
|
| 962 |
key_result = gr.Textbox(label="Your Signing Key", interactive=False, lines=6)
|
| 963 |
|
| 964 |
key_btn.click(
|
|
@@ -968,7 +1315,7 @@ def create_app() -> gr.Blocks:
|
|
| 968 |
api_name=False,
|
| 969 |
)
|
| 970 |
|
| 971 |
-
# ---- Tab
|
| 972 |
with gr.TabItem("Submit"):
|
| 973 |
gr.Markdown(f"""
|
| 974 |
## Submit Your Results
|
|
@@ -1005,8 +1352,9 @@ def create_app() -> gr.Blocks:
|
|
| 1005 |
5. **Anti-gaming** — rate limiting, duplicate detection, completeness enforcement
|
| 1006 |
""")
|
| 1007 |
|
| 1008 |
-
|
| 1009 |
-
|
|
|
|
| 1010 |
result_text = gr.Textbox(label="Verification Report", interactive=False, lines=20)
|
| 1011 |
|
| 1012 |
submit_btn.click(
|
|
@@ -1016,8 +1364,9 @@ def create_app() -> gr.Blocks:
|
|
| 1016 |
api_name=False,
|
| 1017 |
)
|
| 1018 |
|
| 1019 |
-
# ---- Tab
|
| 1020 |
with gr.TabItem("FAQ"):
|
|
|
|
| 1021 |
gr.Markdown("""
|
| 1022 |
## Frequently Asked Questions
|
| 1023 |
|
|
@@ -1349,7 +1698,7 @@ or visit the [project website](https://sites.google.com/view/st-webagentbench/ho
|
|
| 1349 |
contact details.
|
| 1350 |
""")
|
| 1351 |
|
| 1352 |
-
# ---- Tab
|
| 1353 |
with gr.TabItem("About"):
|
| 1354 |
# Build dimensions list dynamically
|
| 1355 |
_dim_lines = "\n".join(
|
|
|
|
| 21 |
from typing import List, Optional
|
| 22 |
|
| 23 |
import gradio as gr
|
| 24 |
+
from gradio.themes.utils import colors, fonts, sizes
|
| 25 |
import pandas as pd
|
| 26 |
import plotly.graph_objects as go
|
| 27 |
|
|
|
|
| 188 |
|
| 189 |
RISK_COLORS = {"low": "#22c55e", "medium": "#eab308", "high": "#ef4444"}
|
| 190 |
|
| 191 |
+
# ---------------------------------------------------------------------------
|
| 192 |
+
# UI Design Constants
|
| 193 |
+
# ---------------------------------------------------------------------------
|
| 194 |
+
|
| 195 |
+
CUSTOM_CSS = """
|
| 196 |
+
/* === Global === */
|
| 197 |
+
.gradio-container {
|
| 198 |
+
max-width: 1200px !important;
|
| 199 |
+
margin: 0 auto !important;
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
/* === Hero Header === */
|
| 203 |
+
#hero-header {
|
| 204 |
+
background: linear-gradient(135deg, #1e3a8a 0%, #312e81 50%, #1e293b 100%);
|
| 205 |
+
border-radius: 16px;
|
| 206 |
+
padding: 40px 48px 32px;
|
| 207 |
+
margin-bottom: 8px;
|
| 208 |
+
position: relative;
|
| 209 |
+
overflow: hidden;
|
| 210 |
+
}
|
| 211 |
+
#hero-header::before {
|
| 212 |
+
content: '';
|
| 213 |
+
position: absolute;
|
| 214 |
+
top: -50%;
|
| 215 |
+
right: -20%;
|
| 216 |
+
width: 500px;
|
| 217 |
+
height: 500px;
|
| 218 |
+
background: radial-gradient(circle, rgba(99, 102, 241, 0.15) 0%, transparent 70%);
|
| 219 |
+
pointer-events: none;
|
| 220 |
+
}
|
| 221 |
+
#hero-header h1 {
|
| 222 |
+
color: white;
|
| 223 |
+
font-size: 2rem;
|
| 224 |
+
font-weight: 700;
|
| 225 |
+
margin: 0 0 6px 0;
|
| 226 |
+
letter-spacing: -0.02em;
|
| 227 |
+
}
|
| 228 |
+
#hero-header .subtitle {
|
| 229 |
+
color: #cbd5e1;
|
| 230 |
+
font-size: 1.05rem;
|
| 231 |
+
margin: 0 0 16px 0;
|
| 232 |
+
font-weight: 400;
|
| 233 |
+
}
|
| 234 |
+
#hero-header .iclr-badge {
|
| 235 |
+
display: inline-block;
|
| 236 |
+
background: linear-gradient(135deg, #6366f1, #818cf8);
|
| 237 |
+
color: white;
|
| 238 |
+
font-size: 0.75rem;
|
| 239 |
+
font-weight: 600;
|
| 240 |
+
padding: 3px 10px;
|
| 241 |
+
border-radius: 9999px;
|
| 242 |
+
letter-spacing: 0.03em;
|
| 243 |
+
vertical-align: middle;
|
| 244 |
+
margin-left: 8px;
|
| 245 |
+
}
|
| 246 |
+
#hero-header .nav-links {
|
| 247 |
+
margin-top: 12px;
|
| 248 |
+
display: flex;
|
| 249 |
+
gap: 20px;
|
| 250 |
+
flex-wrap: wrap;
|
| 251 |
+
}
|
| 252 |
+
#hero-header .nav-links a {
|
| 253 |
+
color: #93c5fd;
|
| 254 |
+
text-decoration: none;
|
| 255 |
+
font-size: 0.9rem;
|
| 256 |
+
font-weight: 500;
|
| 257 |
+
transition: color 0.15s ease;
|
| 258 |
+
display: inline-flex;
|
| 259 |
+
align-items: center;
|
| 260 |
+
gap: 4px;
|
| 261 |
+
}
|
| 262 |
+
#hero-header .nav-links a:hover {
|
| 263 |
+
color: white;
|
| 264 |
+
}
|
| 265 |
+
#hero-header .stats-strip {
|
| 266 |
+
display: flex;
|
| 267 |
+
gap: 32px;
|
| 268 |
+
margin-top: 20px;
|
| 269 |
+
padding-top: 16px;
|
| 270 |
+
border-top: 1px solid rgba(255,255,255,0.1);
|
| 271 |
+
flex-wrap: wrap;
|
| 272 |
+
}
|
| 273 |
+
#hero-header .stat-item {
|
| 274 |
+
text-align: left;
|
| 275 |
+
}
|
| 276 |
+
#hero-header .stat-value {
|
| 277 |
+
color: white;
|
| 278 |
+
font-size: 1.5rem;
|
| 279 |
+
font-weight: 700;
|
| 280 |
+
line-height: 1.2;
|
| 281 |
+
}
|
| 282 |
+
#hero-header .stat-label {
|
| 283 |
+
color: #94a3b8;
|
| 284 |
+
font-size: 0.78rem;
|
| 285 |
+
font-weight: 500;
|
| 286 |
+
text-transform: uppercase;
|
| 287 |
+
letter-spacing: 0.05em;
|
| 288 |
+
}
|
| 289 |
+
#hero-header .logo-row {
|
| 290 |
+
display: flex;
|
| 291 |
+
align-items: center;
|
| 292 |
+
gap: 16px;
|
| 293 |
+
margin-bottom: 12px;
|
| 294 |
+
}
|
| 295 |
+
#hero-header .logo-row img {
|
| 296 |
+
height: 28px;
|
| 297 |
+
filter: brightness(0) invert(1);
|
| 298 |
+
opacity: 0.9;
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
/* === Tabs === */
|
| 302 |
+
.tabs > .tab-nav {
|
| 303 |
+
border-bottom: 2px solid #e2e8f0 !important;
|
| 304 |
+
gap: 0 !important;
|
| 305 |
+
padding: 0 4px !important;
|
| 306 |
+
background: transparent !important;
|
| 307 |
+
}
|
| 308 |
+
.tabs > .tab-nav > button {
|
| 309 |
+
border: none !important;
|
| 310 |
+
border-bottom: 2px solid transparent !important;
|
| 311 |
+
margin-bottom: -2px !important;
|
| 312 |
+
padding: 10px 18px !important;
|
| 313 |
+
font-weight: 500 !important;
|
| 314 |
+
font-size: 0.9rem !important;
|
| 315 |
+
color: #64748b !important;
|
| 316 |
+
background: transparent !important;
|
| 317 |
+
transition: color 0.15s ease, border-color 0.15s ease !important;
|
| 318 |
+
border-radius: 0 !important;
|
| 319 |
+
box-shadow: none !important;
|
| 320 |
+
}
|
| 321 |
+
.tabs > .tab-nav > button:hover {
|
| 322 |
+
color: #1e293b !important;
|
| 323 |
+
background: transparent !important;
|
| 324 |
+
}
|
| 325 |
+
.tabs > .tab-nav > button.selected {
|
| 326 |
+
color: #2563eb !important;
|
| 327 |
+
border-bottom-color: #2563eb !important;
|
| 328 |
+
font-weight: 600 !important;
|
| 329 |
+
background: transparent !important;
|
| 330 |
+
}
|
| 331 |
+
|
| 332 |
+
/* === Tables (Dataframe) === */
|
| 333 |
+
.table-wrap {
|
| 334 |
+
border-radius: 12px !important;
|
| 335 |
+
overflow: hidden !important;
|
| 336 |
+
border: 1px solid #e2e8f0 !important;
|
| 337 |
+
}
|
| 338 |
+
.table-wrap table {
|
| 339 |
+
border-collapse: collapse !important;
|
| 340 |
+
}
|
| 341 |
+
.table-wrap table thead th {
|
| 342 |
+
background: #f1f5f9 !important;
|
| 343 |
+
color: #334155 !important;
|
| 344 |
+
font-weight: 600 !important;
|
| 345 |
+
font-size: 0.82rem !important;
|
| 346 |
+
text-transform: uppercase !important;
|
| 347 |
+
letter-spacing: 0.04em !important;
|
| 348 |
+
padding: 12px 16px !important;
|
| 349 |
+
border-bottom: 2px solid #e2e8f0 !important;
|
| 350 |
+
}
|
| 351 |
+
.table-wrap table tbody td {
|
| 352 |
+
padding: 10px 16px !important;
|
| 353 |
+
font-size: 0.88rem !important;
|
| 354 |
+
border-bottom: 1px solid #f1f5f9 !important;
|
| 355 |
+
}
|
| 356 |
+
.table-wrap table tbody tr:hover {
|
| 357 |
+
background: #eff6ff !important;
|
| 358 |
+
}
|
| 359 |
+
|
| 360 |
+
/* === Accordion (FAQ) === */
|
| 361 |
+
.faq-section .accordion {
|
| 362 |
+
border: 1px solid #e2e8f0 !important;
|
| 363 |
+
border-radius: 10px !important;
|
| 364 |
+
margin-bottom: 8px !important;
|
| 365 |
+
overflow: hidden !important;
|
| 366 |
+
box-shadow: none !important;
|
| 367 |
+
}
|
| 368 |
+
.faq-section .accordion > .label-wrap {
|
| 369 |
+
padding: 14px 18px !important;
|
| 370 |
+
background: white !important;
|
| 371 |
+
}
|
| 372 |
+
.faq-section .accordion > .label-wrap:hover {
|
| 373 |
+
background: #f8fafc !important;
|
| 374 |
+
}
|
| 375 |
+
.faq-section .accordion .prose {
|
| 376 |
+
padding: 4px 18px 18px !important;
|
| 377 |
+
color: #475569 !important;
|
| 378 |
+
line-height: 1.65 !important;
|
| 379 |
+
}
|
| 380 |
+
.faq-section h3 {
|
| 381 |
+
color: #1e293b !important;
|
| 382 |
+
font-size: 1.05rem !important;
|
| 383 |
+
font-weight: 600 !important;
|
| 384 |
+
margin-top: 28px !important;
|
| 385 |
+
margin-bottom: 12px !important;
|
| 386 |
+
padding-bottom: 6px !important;
|
| 387 |
+
border-bottom: 1px solid #e2e8f0 !important;
|
| 388 |
+
}
|
| 389 |
+
|
| 390 |
+
/* === Form Cards === */
|
| 391 |
+
.form-card {
|
| 392 |
+
background: white !important;
|
| 393 |
+
border: 1px solid #e2e8f0 !important;
|
| 394 |
+
border-radius: 12px !important;
|
| 395 |
+
padding: 24px !important;
|
| 396 |
+
box-shadow: 0 1px 3px 0 rgb(0 0 0 / 0.04) !important;
|
| 397 |
+
}
|
| 398 |
+
|
| 399 |
+
/* === Filter Row === */
|
| 400 |
+
.filter-row {
|
| 401 |
+
background: #f8fafc !important;
|
| 402 |
+
border: 1px solid #e2e8f0 !important;
|
| 403 |
+
border-radius: 10px !important;
|
| 404 |
+
padding: 12px 16px !important;
|
| 405 |
+
margin-bottom: 12px !important;
|
| 406 |
+
}
|
| 407 |
+
|
| 408 |
+
/* === Responsive === */
|
| 409 |
+
@media (max-width: 768px) {
|
| 410 |
+
#hero-header {
|
| 411 |
+
padding: 28px 24px 24px;
|
| 412 |
+
}
|
| 413 |
+
#hero-header h1 {
|
| 414 |
+
font-size: 1.5rem;
|
| 415 |
+
}
|
| 416 |
+
#hero-header .stats-strip {
|
| 417 |
+
gap: 20px;
|
| 418 |
+
}
|
| 419 |
+
#hero-header .stat-value {
|
| 420 |
+
font-size: 1.2rem;
|
| 421 |
+
}
|
| 422 |
+
.tabs > .tab-nav > button {
|
| 423 |
+
padding: 8px 12px !important;
|
| 424 |
+
font-size: 0.82rem !important;
|
| 425 |
+
}
|
| 426 |
+
}
|
| 427 |
+
"""
|
| 428 |
+
|
| 429 |
+
# --- Plotly Style Constants ---
|
| 430 |
+
PLOTLY_FONT = "Inter, system-ui, sans-serif"
|
| 431 |
+
PLOTLY_TEXT_COLOR = "#334155" # slate-700
|
| 432 |
+
PLOTLY_TITLE_COLOR = "#1e293b" # slate-800
|
| 433 |
+
PLOTLY_GRID_COLOR = "#e2e8f0" # slate-200
|
| 434 |
+
|
| 435 |
+
PLOTLY_COLORWAY = [
|
| 436 |
+
"#3b82f6", # blue-500
|
| 437 |
+
"#6366f1", # indigo-500
|
| 438 |
+
"#8b5cf6", # violet-500
|
| 439 |
+
"#06b6d4", # cyan-500
|
| 440 |
+
"#10b981", # emerald-500
|
| 441 |
+
"#f59e0b", # amber-500
|
| 442 |
+
]
|
| 443 |
+
|
| 444 |
+
|
| 445 |
+
def _plotly_layout(**overrides) -> dict:
    """Build the shared keyword arguments for ``fig.update_layout``.

    Starts from the common chart defaults (font, title font, transparent
    backgrounds, margins, legend styling, colorway) and applies
    ``overrides`` on top.

    Overrides are merged one level deep: when both the default and the
    override for a key are dicts (e.g. ``legend=dict(x=0.02, y=0.98)``),
    the override's entries are layered onto the default dict instead of
    replacing it wholesale. This way, positioning the legend does not throw
    away its shared font/background/border styling. Any other override
    replaces the default value outright.

    Args:
        **overrides: Layout properties to add to, or override in, the
            defaults.

    Returns:
        A new dict intended to be unpacked into ``fig.update_layout``.
    """
    layout = dict(
        font=dict(family=PLOTLY_FONT, color=PLOTLY_TEXT_COLOR, size=13),
        title_font=dict(family=PLOTLY_FONT, color=PLOTLY_TITLE_COLOR, size=16),
        plot_bgcolor="rgba(0,0,0,0)",
        paper_bgcolor="rgba(0,0,0,0)",
        margin=dict(l=48, r=24, t=56, b=48),
        legend=dict(
            font=dict(size=12),
            bgcolor="rgba(255,255,255,0.8)",
            bordercolor="#e2e8f0",
            borderwidth=1,
        ),
        colorway=PLOTLY_COLORWAY,
    )
    for key, value in overrides.items():
        base = layout.get(key)
        if isinstance(base, dict) and isinstance(value, dict):
            # Layer partial overrides (e.g. legend position) onto the default
            # sub-dict rather than discarding the default styling.
            layout[key] = {**base, **value}
        else:
            layout[key] = value
    return layout
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
def _empty_figure(message: str, height: int = 400) -> go.Figure:
    """Return a placeholder figure to show when a chart has no data.

    The figure has both axes hidden and carries a single annotation centred
    in paper coordinates: ``message`` in bold, followed by a smaller
    "Submit results to populate this chart" line.

    Args:
        message: Headline text for the placeholder; inserted as-is into the
            annotation markup.
        height: Figure height forwarded to the shared layout helper.

    Returns:
        A ``go.Figure`` containing only the annotation and the shared layout.
    """
    caption = (
        f"<b>{message}</b><br><span style='font-size:12px;color:#94a3b8'>"
        f"Submit results to populate this chart</span>"
    )

    # Shared styling first, then hide both axes so only the caption shows.
    layout_kwargs = _plotly_layout(height=height)
    layout_kwargs["xaxis"] = dict(visible=False)
    layout_kwargs["yaxis"] = dict(visible=False)

    placeholder = go.Figure()
    placeholder.add_annotation(
        text=caption,
        showarrow=False,
        xref="paper",
        yref="paper",
        x=0.5,
        y=0.5,
        font=dict(size=16, color="#64748b", family=PLOTLY_FONT),
    )
    placeholder.update_layout(**layout_kwargs)
    return placeholder
|
| 481 |
+
|
| 482 |
|
| 483 |
# ---------------------------------------------------------------------------
|
| 484 |
# Submission status workflow
|
|
|
|
| 608 |
fig = go.Figure()
|
| 609 |
|
| 610 |
if not selected_agents:
|
| 611 |
+
return _empty_figure("Select agents to compare", 500)
|
|
|
|
|
|
|
|
|
|
| 612 |
|
| 613 |
dim_labels = [DIMENSION_DISPLAY.get(d, d) for d in SAFETY_DIMENSIONS]
|
| 614 |
+
chart_colors = PLOTLY_COLORWAY[:4]
|
| 615 |
|
| 616 |
for i, agent_name in enumerate(selected_agents[:4]):
|
| 617 |
# Find submission
|
|
|
|
| 639 |
theta=labels,
|
| 640 |
fill="toself",
|
| 641 |
name=agent_name,
|
| 642 |
+
line=dict(color=chart_colors[i % len(chart_colors)]),
|
| 643 |
opacity=0.6,
|
| 644 |
))
|
| 645 |
|
| 646 |
+
fig.update_layout(**_plotly_layout(
|
|
|
|
|
|
|
|
|
|
| 647 |
title="Safety Dimension Radar (higher = safer)",
|
| 648 |
height=500,
|
| 649 |
+
polar=dict(
|
| 650 |
+
radialaxis=dict(visible=True, range=[0, 1], gridcolor=PLOTLY_GRID_COLOR),
|
| 651 |
+
angularaxis=dict(gridcolor=PLOTLY_GRID_COLOR),
|
| 652 |
+
bgcolor="rgba(0,0,0,0)",
|
| 653 |
+
),
|
| 654 |
+
))
|
| 655 |
return fig
|
| 656 |
|
| 657 |
|
| 658 |
def build_risk_heatmap(submissions: list[dict]) -> go.Figure:
|
| 659 |
"""Build a heatmap of risk ratios (agents x dimensions)."""
|
| 660 |
if not submissions:
|
| 661 |
+
return _empty_figure("No submissions yet")
|
|
|
|
|
|
|
| 662 |
|
| 663 |
agent_names = []
|
| 664 |
z_values = []
|
|
|
|
| 697 |
colorbar=dict(title="Risk Ratio"),
|
| 698 |
))
|
| 699 |
|
| 700 |
+
fig.update_layout(**_plotly_layout(
|
| 701 |
+
title="Risk Heatmap by Safety Dimension",
|
| 702 |
height=max(300, 60 * len(agent_names) + 100),
|
| 703 |
+
xaxis=dict(side="top", tickfont=dict(size=11)),
|
| 704 |
+
yaxis=dict(tickfont=dict(size=12)),
|
| 705 |
+
))
|
| 706 |
return fig
|
| 707 |
|
| 708 |
|
|
|
|
| 711 |
fig = go.Figure()
|
| 712 |
|
| 713 |
if not submissions:
|
| 714 |
+
return _empty_figure("No submissions yet", 550)
|
|
|
|
|
|
|
|
|
|
| 715 |
|
| 716 |
# Diagonal line (perfect safety: CuP = CR)
|
| 717 |
fig.add_trace(go.Scatter(
|
| 718 |
x=[0, 1], y=[0, 1],
|
| 719 |
mode="lines",
|
| 720 |
+
line=dict(color="#94a3b8", dash="dash", width=1),
|
| 721 |
name="Perfect Safety (CuP=CR)",
|
| 722 |
showlegend=True,
|
| 723 |
))
|
|
|
|
| 737 |
risks.append(avg_risk)
|
| 738 |
|
| 739 |
# Color by risk level
|
| 740 |
+
dot_colors = []
|
| 741 |
for r in risks:
|
| 742 |
if r <= 0.05:
|
| 743 |
+
dot_colors.append("#22c55e")
|
| 744 |
elif r <= 0.15:
|
| 745 |
+
dot_colors.append("#eab308")
|
| 746 |
else:
|
| 747 |
+
dot_colors.append("#ef4444")
|
| 748 |
|
| 749 |
hover_text = [
|
| 750 |
f"<b>{n}</b><br>Team: {t}<br>CR: {cr:.3f}<br>CuP: {cup:.3f}<br>"
|
|
|
|
| 756 |
x=crs,
|
| 757 |
y=cups,
|
| 758 |
mode="markers+text",
|
| 759 |
+
marker=dict(size=14, color=dot_colors, line=dict(width=1.5, color="white")),
|
| 760 |
text=names,
|
| 761 |
textposition="top center",
|
| 762 |
+
textfont=dict(size=10, family=PLOTLY_FONT),
|
| 763 |
hovertext=hover_text,
|
| 764 |
hoverinfo="text",
|
| 765 |
name="Agents",
|
|
|
|
| 779 |
fig.add_trace(go.Scatter(
|
| 780 |
x=pareto_x, y=pareto_y,
|
| 781 |
mode="lines",
|
| 782 |
+
line=dict(color="#4f46e5", width=2, dash="dot"),
|
| 783 |
name="Pareto Frontier",
|
| 784 |
))
|
| 785 |
|
| 786 |
+
fig.update_layout(**_plotly_layout(
|
| 787 |
title="Performance-Safety Frontier",
|
| 788 |
xaxis_title="CR (Completion Rate)",
|
| 789 |
yaxis_title="CuP (Completion under Policy)",
|
| 790 |
+
xaxis=dict(range=[-0.02, 1.02], gridcolor="#f1f5f9", zeroline=False),
|
| 791 |
+
yaxis=dict(range=[-0.02, 1.02], gridcolor="#f1f5f9", zeroline=False),
|
| 792 |
height=550,
|
| 793 |
legend=dict(x=0.02, y=0.98),
|
| 794 |
+
))
|
| 795 |
return fig
|
| 796 |
|
| 797 |
|
|
|
|
| 1101 |
submissions = load_submissions()
|
| 1102 |
agent_choices = [s.get("metadata", {}).get("agent_id", "?") for s in submissions]
|
| 1103 |
|
| 1104 |
+
theme = gr.themes.Soft(
|
| 1105 |
+
primary_hue=colors.blue,
|
| 1106 |
+
secondary_hue=colors.indigo,
|
| 1107 |
+
neutral_hue=colors.slate,
|
| 1108 |
+
spacing_size=sizes.spacing_md,
|
| 1109 |
+
radius_size=sizes.radius_md,
|
| 1110 |
+
text_size=sizes.text_md,
|
| 1111 |
+
font=(
|
| 1112 |
+
gr.themes.GoogleFont("Inter"),
|
| 1113 |
+
"ui-sans-serif",
|
| 1114 |
+
"system-ui",
|
| 1115 |
+
"sans-serif",
|
| 1116 |
+
),
|
| 1117 |
+
font_mono=(
|
| 1118 |
+
gr.themes.GoogleFont("JetBrains Mono"),
|
| 1119 |
+
"ui-monospace",
|
| 1120 |
+
"Consolas",
|
| 1121 |
+
"monospace",
|
| 1122 |
+
),
|
| 1123 |
+
).set(
|
| 1124 |
+
body_background_fill="#f8fafc",
|
| 1125 |
+
body_text_color="#1e293b",
|
| 1126 |
+
body_text_color_subdued="#64748b",
|
| 1127 |
+
block_background_fill="white",
|
| 1128 |
+
block_border_width="1px",
|
| 1129 |
+
block_border_color="#e2e8f0",
|
| 1130 |
+
block_shadow="0 1px 3px 0 rgb(0 0 0 / 0.05), 0 1px 2px -1px rgb(0 0 0 / 0.05)",
|
| 1131 |
+
block_label_background_fill="*primary_50",
|
| 1132 |
+
block_label_text_color="*primary_700",
|
| 1133 |
+
button_primary_background_fill="linear-gradient(135deg, *primary_500, *secondary_500)",
|
| 1134 |
+
button_primary_background_fill_hover="linear-gradient(135deg, *primary_600, *secondary_600)",
|
| 1135 |
+
button_primary_shadow="0 4px 6px -1px rgb(59 130 246 / 0.25)",
|
| 1136 |
+
button_primary_border_color="transparent",
|
| 1137 |
+
button_secondary_background_fill="white",
|
| 1138 |
+
button_secondary_border_color="*primary_200",
|
| 1139 |
+
button_secondary_text_color="*primary_600",
|
| 1140 |
+
input_background_fill="white",
|
| 1141 |
+
input_border_color="#e2e8f0",
|
| 1142 |
+
input_border_width="1px",
|
| 1143 |
+
input_shadow="none",
|
| 1144 |
+
input_shadow_focus="0 0 0 3px rgb(59 130 246 / 0.15)",
|
| 1145 |
+
table_border_color="#e2e8f0",
|
| 1146 |
+
table_even_background_fill="white",
|
| 1147 |
+
table_odd_background_fill="#f8fafc",
|
| 1148 |
+
link_text_color="*primary_600",
|
| 1149 |
+
link_text_color_hover="*primary_700",
|
| 1150 |
+
link_text_color_active="*primary_800",
|
| 1151 |
+
)
|
| 1152 |
+
|
| 1153 |
with gr.Blocks(
|
| 1154 |
title="ST-WebAgentBench Leaderboard",
|
| 1155 |
+
theme=theme,
|
| 1156 |
+
css=CUSTOM_CSS,
|
| 1157 |
) as demo:
|
| 1158 |
|
| 1159 |
+
gr.HTML(f"""
|
| 1160 |
+
<div id="hero-header">
|
| 1161 |
+
<div class="logo-row">
|
| 1162 |
+
<img src="assets/ibm_research_logo.png" alt="IBM Research" />
|
| 1163 |
+
</div>
|
| 1164 |
+
<h1>ST-WebAgentBench <span class="iclr-badge">ICLR 2025</span></h1>
|
| 1165 |
+
<p class="subtitle">
|
| 1166 |
+
Evaluating Safety & Trustworthiness in Web Agents
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1167 |
</p>
|
| 1168 |
+
<div class="nav-links">
|
| 1169 |
+
<a href="https://arxiv.org/abs/2410.06703" target="_blank">📄 Paper</a>
|
| 1170 |
+
<a href="https://huggingface.co/datasets/dolev31/st-webagentbench" target="_blank">📊 Dataset</a>
|
| 1171 |
+
<a href="https://github.com/segev-shlomov/ST-WebAgentBench" target="_blank">💻 GitHub</a>
|
| 1172 |
+
<a href="https://sites.google.com/view/st-webagentbench/home" target="_blank">🌐 Website</a>
|
| 1173 |
+
</div>
|
| 1174 |
+
<div class="stats-strip">
|
| 1175 |
+
<div class="stat-item">
|
| 1176 |
+
<div class="stat-value">{EXPECTED_TASK_COUNT}</div>
|
| 1177 |
+
<div class="stat-label">Tasks</div>
|
| 1178 |
+
</div>
|
| 1179 |
+
<div class="stat-item">
|
| 1180 |
+
<div class="stat-value">{EXPECTED_POLICY_COUNT:,}</div>
|
| 1181 |
+
<div class="stat-label">Policies</div>
|
| 1182 |
+
</div>
|
| 1183 |
+
<div class="stat-item">
|
| 1184 |
+
<div class="stat-value">{len(SAFETY_DIMENSIONS)}</div>
|
| 1185 |
+
<div class="stat-label">Safety Dimensions</div>
|
| 1186 |
+
</div>
|
| 1187 |
+
<div class="stat-item">
|
| 1188 |
+
<div class="stat-value">3</div>
|
| 1189 |
+
<div class="stat-label">Web Applications</div>
|
| 1190 |
+
</div>
|
| 1191 |
+
</div>
|
| 1192 |
</div>
|
| 1193 |
""")
|
| 1194 |
|
|
|
|
| 1196 |
|
| 1197 |
# ---- Tab 1: Leaderboard ----
|
| 1198 |
with gr.TabItem("Leaderboard"):
|
| 1199 |
+
with gr.Row(elem_classes="filter-row"):
|
| 1200 |
sort_by = gr.Dropdown(
|
| 1201 |
choices=["CuP", "CR", "semi-CuP", "Risk Ratio", "Gap", "Date"],
|
| 1202 |
value="CuP", label="Sort by",
|
|
|
|
| 1211 |
leaderboard_table = gr.Dataframe(
|
| 1212 |
value=build_main_table(submissions),
|
| 1213 |
interactive=False,
|
| 1214 |
+
label="Ranked by CuP (Completion under Policy)",
|
| 1215 |
+
elem_id="leaderboard-table",
|
| 1216 |
)
|
| 1217 |
|
| 1218 |
def update_table(sort_val, model_val, open_val, verified_val):
|
|
|
|
| 1230 |
gr.Markdown("### Performance-Safety Frontier")
|
| 1231 |
pareto_plot = gr.Plot(
|
| 1232 |
value=build_pareto_frontier(submissions),
|
|
|
|
| 1233 |
)
|
| 1234 |
+
with gr.Accordion("How to read this chart", open=False):
|
| 1235 |
+
gr.Markdown("""
|
| 1236 |
+
- The **diagonal** (y=x) represents perfect policy adherence
|
| 1237 |
+
- Distance below the diagonal = the agent's **safety gap**
|
| 1238 |
+
- The **Pareto frontier** connects agents that are best-in-class at their safety level
|
| 1239 |
+
- **Dot color**: Green = low risk, Yellow = medium, Red = high
|
| 1240 |
+
""")
|
| 1241 |
|
| 1242 |
+
# ---- Tab 2: Safety ----
|
| 1243 |
+
with gr.TabItem("Safety"):
|
| 1244 |
agent_selector = gr.Dropdown(
|
| 1245 |
choices=agent_choices,
|
| 1246 |
multiselect=True,
|
|
|
|
| 1262 |
|
| 1263 |
agent_selector.change(update_radar, inputs=[agent_selector], outputs=[radar_chart], api_name=False)
|
| 1264 |
|
| 1265 |
+
# ---- Tab 3: Tiers ----
|
| 1266 |
+
with gr.TabItem("Tiers"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1267 |
gr.Markdown("""
|
| 1268 |
### CRM Difficulty Tier Breakdown
|
| 1269 |
|
|
|
|
| 1279 |
interactive=False,
|
| 1280 |
)
|
| 1281 |
|
| 1282 |
+
# ---- Tab 4: Per-App ----
|
| 1283 |
+
with gr.TabItem("Per-App"):
|
| 1284 |
gr.Markdown("### Performance by Web Application")
|
| 1285 |
app_table = gr.Dataframe(
|
| 1286 |
value=build_app_table(submissions),
|
| 1287 |
interactive=False,
|
| 1288 |
)
|
| 1289 |
|
| 1290 |
+
# ---- Tab 5: Get Key ----
|
| 1291 |
+
with gr.TabItem("Get Key"):
|
| 1292 |
gr.Markdown("""
|
| 1293 |
## Get Your Signing Key
|
| 1294 |
|
|
|
|
| 1301 |
**Important:** Use the **same email** here and as `--contact-email`
|
| 1302 |
when generating your submission file.
|
| 1303 |
""")
|
| 1304 |
+
with gr.Group(elem_classes="form-card"):
|
| 1305 |
+
key_email = gr.Textbox(label="Email", placeholder="you@example.com")
|
| 1306 |
+
key_team = gr.Textbox(label="Team Name", placeholder="Your Team")
|
| 1307 |
+
key_institution = gr.Textbox(label="Institution (optional)", placeholder="University / Company")
|
| 1308 |
+
key_btn = gr.Button("Generate Signing Key", variant="primary")
|
| 1309 |
key_result = gr.Textbox(label="Your Signing Key", interactive=False, lines=6)
|
| 1310 |
|
| 1311 |
key_btn.click(
|
|
|
|
| 1315 |
api_name=False,
|
| 1316 |
)
|
| 1317 |
|
| 1318 |
+
# ---- Tab 6: Submit ----
|
| 1319 |
with gr.TabItem("Submit"):
|
| 1320 |
gr.Markdown(f"""
|
| 1321 |
## Submit Your Results
|
|
|
|
| 1352 |
5. **Anti-gaming** — rate limiting, duplicate detection, completeness enforcement
|
| 1353 |
""")
|
| 1354 |
|
| 1355 |
+
with gr.Group(elem_classes="form-card"):
|
| 1356 |
+
upload = gr.File(label="Upload submission.json", file_types=[".json"])
|
| 1357 |
+
submit_btn = gr.Button("Validate & Submit", variant="primary")
|
| 1358 |
result_text = gr.Textbox(label="Verification Report", interactive=False, lines=20)
|
| 1359 |
|
| 1360 |
submit_btn.click(
|
|
|
|
| 1364 |
api_name=False,
|
| 1365 |
)
|
| 1366 |
|
| 1367 |
+
# ---- Tab 7: FAQ ----
|
| 1368 |
with gr.TabItem("FAQ"):
|
| 1369 |
+
with gr.Column(elem_classes="faq-section"):
|
| 1370 |
gr.Markdown("""
|
| 1371 |
## Frequently Asked Questions
|
| 1372 |
|
|
|
|
| 1698 |
contact details.
|
| 1699 |
""")
|
| 1700 |
|
| 1701 |
+
# ---- Tab 8: About ----
|
| 1702 |
with gr.TabItem("About"):
|
| 1703 |
# Build dimensions list dynamically
|
| 1704 |
_dim_lines = "\n".join(
|
assets/ibm_research_logo.png
ADDED
|
Git LFS Details
|