Spaces:
Sleeping
Sleeping
import html
from datetime import datetime, timezone
from pathlib import Path

import gradio as gr
import pandas as pd
# Leaderboard data file, resolved relative to the current working directory.
CSV_PATH = Path("leaderboard.csv")

# Columns of the full breakdown (Appendix Table 6), in display order.
COLS = [
    "Model", "Score", "Completeness", "Grounding",
    "Success Rate", "Recovery Rate", "Flexibility", "Order",
    "Info Diversity", "Format", "Tradeoff",
    "Tool Calls", "# Turns",
    "Progress Tracking", "Goal Decomposition",
]

# Columns treated as percentages.
PERCENT_COLS = {
    "Success Rate", "Recovery Rate", "Flexibility", "Order",
    "Info Diversity", "Format", "Tradeoff",
}

# Shortened header labels, keyed by column name.
LABEL_MAP = {
    "Info Diversity": "Info Div.",
    "# Turns": "Turns",
    "Progress Tracking": "Progress",
    "Goal Decomposition": "Goal Decomp.",
}
# Light, Arena-like, high-contrast style (prevents "light bg + light text" issues).
#
# This stylesheet is handed to gr.Blocks(css=...) below. It has three parts:
#   1. Overrides of Gradio theme custom properties on .gradio-container.
#   2. The card / scroll wrapper / table rules for the classes emitted by
#      render_table (arena-card, arena-wrap, arena-table, num, model, rank).
#   3. Sticky positioning for the header row and the first column.
#
# NOTE(review): the sticky-column rules select :first-child, and prepare_df
# inserts "Rank" at position 0, so the pinned column appears to be Rank rather
# than Model as the in-CSS comment says -- confirm which one is intended.
ARENA_CSS = r"""
:root { color-scheme: light; }
html, body { background: #f6f7fb !important; }
/* Gradio theme tokens */
.gradio-container{
max-width: 1200px !important;
margin: 0 auto !important;
padding: 18px 16px 24px !important;
--body-background-fill: #f6f7fb !important;
--body-background-fill-hover: #f6f7fb !important;
--body-text-color: #0f172a !important;
--body-text-color-subdued: #334155 !important;
--block-background-fill: #ffffff !important;
--block-background-fill-hover: #ffffff !important;
--block-border-color: #e5e7eb !important;
--input-background-fill: #ffffff !important;
--input-background-fill-hover: #ffffff !important;
--input-border-color: #e2e8f0 !important;
--input-border-color-focus: #93c5fd !important;
--input-text-color: #0f172a !important;
--input-placeholder-color: #94a3b8 !important;
--button-secondary-background-fill: #ffffff !important;
--button-secondary-background-fill-hover: #f8fafc !important;
--button-secondary-border-color: #e2e8f0 !important;
--button-secondary-text-color: #0f172a !important;
--button-primary-background-fill: #2563eb !important;
--button-primary-background-fill-hover: #1d4ed8 !important;
--button-primary-border-color: #2563eb !important;
--button-primary-text-color: #ffffff !important;
--link-text-color: #2563eb !important;
--link-text-color-active: #1d4ed8 !important;
}
/* Arena table card */
.arena-card{
background: #ffffff;
border: 1px solid #e5e7eb;
border-radius: 14px;
box-shadow: 0 1px 2px rgba(16,24,40,0.06);
overflow: hidden;
}
.arena-wrap{ width: 100%; overflow-x: auto; }
table.arena-table{
width: 100%;
min-width: 1300px; /* wide table, scrolls horizontally */
border-collapse: separate;
border-spacing: 0;
font-size: 13px;
color: #0f172a;
}
/* IMPORTANT: override any global "prose table" borders */
table.arena-table th, table.arena-table td{
border: none !important;
overflow: visible !important;
text-overflow: clip !important;
}
table.arena-table thead th{
position: sticky;
top: 0;
z-index: 2;
background: #f8fafc;
color: #334155 !important;
font-weight: 650;
text-align: left;
padding: 10px 12px;
border-bottom: 1px solid #e2e8f0 !important;
white-space: nowrap;
}
table.arena-table tbody td{
padding: 10px 12px;
border-bottom: 1px solid #eef2f7 !important;
white-space: nowrap;
color: #0f172a !important;
}
table.arena-table tbody tr:nth-child(even){ background: #fbfdff; }
table.arena-table tbody tr:hover{ background: #f1f5f9; }
table.arena-table th.num, table.arena-table td.num{
text-align: right;
font-variant-numeric: tabular-nums;
}
table.arena-table td.model{ font-weight: 650; }
table.arena-table td.rank{ width: 52px; color: #64748b !important; }
/* optional: keep Model column visible while horizontal scrolling */
table.arena-table thead th:first-child,
table.arena-table tbody td:first-child{
position: sticky;
left: 0;
z-index: 3;
background: #f8fafc;
}
table.arena-table tbody td:first-child{
background: #ffffff;
}
"""
| def _to_float(x): | |
| """Parse numeric and percent strings into float for sorting.""" | |
| if x is None: | |
| return float("nan") | |
| if isinstance(x, (int, float)) and not pd.isna(x): | |
| return float(x) | |
| s = str(x).strip() | |
| if not s: | |
| return float("nan") | |
| if s.endswith("%"): | |
| s = s[:-1].strip() | |
| s = s.replace(",", "") | |
| try: | |
| return float(s) | |
| except Exception: | |
| return float("nan") | |
def load_df() -> pd.DataFrame:
    """Read the leaderboard CSV and return exactly the columns in COLS.

    Returns:
        A frame with the COLS columns in COLS order. Columns absent from the
        file are filled with empty strings. When CSV_PATH does not exist, an
        empty frame with those columns is returned.
    """
    if CSV_PATH.exists():
        frame = pd.read_csv(CSV_PATH)
        absent = [name for name in COLS if name not in frame.columns]
        for name in absent:
            frame[name] = ""
        return frame[COLS]
    return pd.DataFrame(columns=COLS)
def format_cell(col: str, val) -> str:
    """Turn one metric value into the text shown in its table cell.

    Args:
        col: Name of the column the value belongs to.
        val: Raw value from the data frame.

    Returns:
        "" for None or a NaN float; the stripped text unchanged for columns
        in PERCENT_COLS; otherwise the value with two decimals when it
        parses as a number, else the stripped text.
    """
    is_missing = val is None or (isinstance(val, float) and pd.isna(val))
    if is_missing:
        return ""

    text = str(val).strip()
    if col in PERCENT_COLS:
        return text

    number = _to_float(val)
    return text if pd.isna(number) else f"{number:.2f}"
def prepare_df(query: str, sort_by: str, descending: bool) -> pd.DataFrame:
    """Load the leaderboard, filter it by model name, sort it, and rank it.

    Args:
        query: Search text matched case-insensitively as a literal substring
            of the "Model" column. None, empty, or whitespace-only text
            disables filtering.
        sort_by: Column to sort on; ignored if it is not a column of the
            loaded frame.
        descending: Sort from largest to smallest when true.

    Returns:
        The filtered and sorted frame with a fresh 0..n-1 index and a
        1-based "Rank" column inserted first. Rows whose sort value is not
        numeric are placed last.
    """
    df = load_df()

    needle = (query or "").strip().lower()
    if needle:
        names = df["Model"].astype(str).str.lower()
        # regex=False: the text comes straight from the search box, and the
        # default regex mode raises re.error on input such as "(" or "c++".
        df = df[names.str.contains(needle, regex=False, na=False)]

    if sort_by in df.columns:
        df = (
            df.assign(_s=df[sort_by].map(_to_float))
            # A stable sort keeps tied rows in their CSV order.
            .sort_values(
                "_s",
                ascending=not descending,
                na_position="last",
                kind="mergesort",
            )
            .drop(columns=["_s"])
        )

    df = df.reset_index(drop=True)
    df.insert(0, "Rank", range(1, len(df) + 1))
    return df
def render_table(df: pd.DataFrame) -> str:
    """Render a prepared leaderboard frame as an HTML table inside a card.

    Args:
        df: Frame to display; every column becomes a table column.

    Returns:
        An HTML fragment. An empty frame yields a small "No entries found."
        card. Otherwise header labels are looked up in LABEL_MAP, "Rank" and
        "Model" values are shown as-is, all other values go through
        format_cell, and all text is HTML-escaped.
    """
    if df.empty:
        return "<div class='arena-card' style='padding:14px;'>No entries found.</div>"

    columns = list(df.columns)

    header_cells = []
    for name in columns:
        if name == "Rank":
            css = "rank num"
        elif name == "Model":
            css = ""
        else:
            css = "num"
        caption = LABEL_MAP.get(name, name)
        header_cells.append(f"<th class='{css}'>{html.escape(caption)}</th>")

    body_rows = []
    for _, record in df.iterrows():
        cells = []
        for name in columns:
            raw = record[name]
            if name == "Rank":
                css, shown = "rank num", raw
            elif name == "Model":
                css, shown = "model", raw
            else:
                css, shown = "num", format_cell(name, raw)
            cells.append(f"<td class='{css}'>{html.escape(str(shown))}</td>")
        body_rows.append("<tr>" + "".join(cells) + "</tr>")

    head_html = "".join(header_cells)
    body_html = "".join(body_rows)
    return f"""
<div class="arena-card">
<div class="arena-wrap">
<table class="arena-table">
<thead><tr>{head_html}</tr></thead>
<tbody>{body_html}</tbody>
</table>
</div>
</div>
"""
def update(q: str, s: str, d: bool) -> str:
    """Build the leaderboard HTML for the current control values.

    Args:
        q: Search text.
        s: Column to sort by.
        d: Whether to sort in descending order.

    Returns:
        The HTML produced by render_table for the frame from prepare_df.
    """
    frame = prepare_df(q, s, d)
    return render_table(frame)
# Every column except the model name is offered as a sort key.
SORT_CHOICES = [c for c in COLS if c != "Model"]

with gr.Blocks(title="ToolGym Leaderboard", css=ARENA_CSS) as demo:
    gr.Markdown("# 🏆 ToolGym Leaderboard")
    gr.Markdown("Full leaderboard breakdown. Update by editing `leaderboard.csv` via PR.")

    with gr.Row():
        query = gr.Textbox(label="Search", placeholder="e.g., deepseek, gemini, qwen ...")
        sort_by = gr.Dropdown(label="Sort by", choices=SORT_CHOICES, value="Score")
        descending = gr.Checkbox(value=True, label="Descending")

    table = gr.HTML()

    # Re-render the table whenever any control changes, and once on page load.
    controls = [query, sort_by, descending]
    for register in (query.change, sort_by.change, descending.change, demo.load):
        register(update, inputs=controls, outputs=table)

    # File modification time, formatted in UTC. Calling stat() directly
    # avoids an exists()/stat() race, and the timezone-aware conversion
    # replaces the deprecated datetime.utcfromtimestamp().
    try:
        mtime = CSV_PATH.stat().st_mtime
    except OSError:
        ts = ""
    else:
        ts = datetime.fromtimestamp(mtime, tz=timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    gr.Markdown(f"<small>Source: <code>leaderboard.csv</code>{(' · Last updated: ' + ts) if ts else ''}</small>")

    with gr.Accordion("Submit / Update", open=False):
        gr.Markdown(
            "- Open a PR editing `leaderboard.csv`.\n"
            "- Include: model name, evaluation setting/commit hash, and the metrics.\n"
        )

demo.launch()