# leaderboard / app.py
# xiziqiao's picture
# Update app.py
# 6a2f803 verified
import html
from datetime import datetime, timezone
from pathlib import Path

import gradio as gr
import pandas as pd
# Leaderboard data file, resolved relative to the working directory.
CSV_PATH = Path("leaderboard.csv")

# Table columns in display order: the full breakdown (Appendix Table 6).
COLS = [
    "Model", "Score", "Completeness", "Grounding",
    "Success Rate", "Recovery Rate", "Flexibility", "Order",
    "Info Diversity", "Format", "Tradeoff",
    "Tool Calls", "# Turns",
    "Progress Tracking", "Goal Decomposition",
]

# Columns whose cell text is displayed as stored rather than re-formatted
# as a two-decimal number.
PERCENT_COLS = {
    "Success Rate", "Recovery Rate", "Flexibility", "Order",
    "Info Diversity", "Format", "Tradeoff",
}

# Shortened header labels; a column that is not listed is shown under its
# own name.
LABEL_MAP = {
    "Info Diversity":     "Info Div.",
    "# Turns":            "Turns",
    "Progress Tracking":  "Progress",
    "Goal Decomposition": "Goal Decomp.",
}
# Stylesheet handed to ``gr.Blocks(css=...)``.
# Light, Arena-like, high-contrast style (prevents "light bg + light text" issues):
# it pins the colour scheme to light, overrides Gradio's theme variables on
# ``.gradio-container`` and styles the hand-rendered ``table.arena-table``.
# NOTE(review): the sticky ``:first-child`` rules pin whichever column is
# rendered first. ``prepare_df`` inserts "Rank" at position 0, so it is the
# Rank column (not Model, as the CSS comment says) that stays visible while
# scrolling horizontally -- confirm which one is intended.
ARENA_CSS = r"""
:root { color-scheme: light; }
html, body { background: #f6f7fb !important; }
/* Gradio theme tokens */
.gradio-container{
max-width: 1200px !important;
margin: 0 auto !important;
padding: 18px 16px 24px !important;
--body-background-fill: #f6f7fb !important;
--body-background-fill-hover: #f6f7fb !important;
--body-text-color: #0f172a !important;
--body-text-color-subdued: #334155 !important;
--block-background-fill: #ffffff !important;
--block-background-fill-hover: #ffffff !important;
--block-border-color: #e5e7eb !important;
--input-background-fill: #ffffff !important;
--input-background-fill-hover: #ffffff !important;
--input-border-color: #e2e8f0 !important;
--input-border-color-focus: #93c5fd !important;
--input-text-color: #0f172a !important;
--input-placeholder-color: #94a3b8 !important;
--button-secondary-background-fill: #ffffff !important;
--button-secondary-background-fill-hover: #f8fafc !important;
--button-secondary-border-color: #e2e8f0 !important;
--button-secondary-text-color: #0f172a !important;
--button-primary-background-fill: #2563eb !important;
--button-primary-background-fill-hover: #1d4ed8 !important;
--button-primary-border-color: #2563eb !important;
--button-primary-text-color: #ffffff !important;
--link-text-color: #2563eb !important;
--link-text-color-active: #1d4ed8 !important;
}
/* Arena table card */
.arena-card{
background: #ffffff;
border: 1px solid #e5e7eb;
border-radius: 14px;
box-shadow: 0 1px 2px rgba(16,24,40,0.06);
overflow: hidden;
}
.arena-wrap{ width: 100%; overflow-x: auto; }
table.arena-table{
width: 100%;
min-width: 1300px; /* wide table, scrolls horizontally */
border-collapse: separate;
border-spacing: 0;
font-size: 13px;
color: #0f172a;
}
/* IMPORTANT: override any global "prose table" borders */
table.arena-table th, table.arena-table td{
border: none !important;
overflow: visible !important;
text-overflow: clip !important;
}
table.arena-table thead th{
position: sticky;
top: 0;
z-index: 2;
background: #f8fafc;
color: #334155 !important;
font-weight: 650;
text-align: left;
padding: 10px 12px;
border-bottom: 1px solid #e2e8f0 !important;
white-space: nowrap;
}
table.arena-table tbody td{
padding: 10px 12px;
border-bottom: 1px solid #eef2f7 !important;
white-space: nowrap;
color: #0f172a !important;
}
table.arena-table tbody tr:nth-child(even){ background: #fbfdff; }
table.arena-table tbody tr:hover{ background: #f1f5f9; }
table.arena-table th.num, table.arena-table td.num{
text-align: right;
font-variant-numeric: tabular-nums;
}
table.arena-table td.model{ font-weight: 650; }
table.arena-table td.rank{ width: 52px; color: #64748b !important; }
/* optional: keep Model column visible while horizontal scrolling */
table.arena-table thead th:first-child,
table.arena-table tbody td:first-child{
position: sticky;
left: 0;
z-index: 3;
background: #f8fafc;
}
table.arena-table tbody td:first-child{
background: #ffffff;
}
"""
def _to_float(x):
"""Parse numeric and percent strings into float for sorting."""
if x is None:
return float("nan")
if isinstance(x, (int, float)) and not pd.isna(x):
return float(x)
s = str(x).strip()
if not s:
return float("nan")
if s.endswith("%"):
s = s[:-1].strip()
s = s.replace(",", "")
try:
return float(s)
except Exception:
return float("nan")
def load_df() -> pd.DataFrame:
    """Read the leaderboard CSV and return it restricted to ``COLS``.

    Columns named in ``COLS`` but absent from the file are added and filled
    with empty strings; columns in the file that are not in ``COLS`` are
    dropped.

    Returns:
        A frame whose columns are exactly ``COLS``, in that order. An empty
        frame with those columns is returned when ``CSV_PATH`` does not
        exist or contains no data at all.
    """
    try:
        df = pd.read_csv(CSV_PATH)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Attempting the read (instead of checking exists() first) avoids a
        # race with the file being removed, and a zero-byte file is treated
        # as "no entries yet" rather than crashing the UI callback.
        return pd.DataFrame(columns=COLS)

    for col in COLS:
        if col not in df.columns:
            df[col] = ""
    return df[COLS]
def format_cell(col: str, val) -> str:
    """Turn a raw cell value into the text shown in the table.

    Args:
        col: Name of the column the value belongs to.
        val: Raw value taken from the data frame.

    Returns:
        ``""`` for ``None`` or a float NaN; the whitespace-trimmed string
        form for columns in ``PERCENT_COLS``; a two-decimal rendering for any
        other value that ``_to_float`` can parse; otherwise the trimmed
        string form.
    """
    is_missing = val is None or (isinstance(val, float) and pd.isna(val))
    if is_missing:
        return ""

    text = str(val).strip()
    if col in PERCENT_COLS:
        return text

    number = _to_float(val)
    return text if pd.isna(number) else f"{number:.2f}"
def prepare_df(query: str, sort_by: str, descending: bool) -> pd.DataFrame:
    """Load the leaderboard, filter it by model name, sort it and add ranks.

    Args:
        query: Text searched for in the ``Model`` column. The match is a
            case-insensitive, literal substring match; an empty value
            disables filtering.
        sort_by: Column to sort on. Values are compared numerically via
            ``_to_float``; the argument is ignored when it is not one of the
            frame's columns.
        descending: Sort from largest to smallest when true. Unparseable
            values are placed last in either direction.

    Returns:
        The filtered and sorted frame with a fresh 0-based index and a
        1-based ``Rank`` column inserted as the first column.
    """
    df = load_df()

    if query:
        needle = query.lower().strip()
        model_names = df["Model"].astype(str).str.lower()
        # regex=False: the search box holds user-typed text, not a pattern.
        # With the default regex matching, input such as "c++" or "gpt-4("
        # raises re.error and "." silently matches any character.
        df = df[model_names.str.contains(needle, regex=False, na=False)]

    if sort_by in df.columns:
        df = df.assign(_s=df[sort_by].map(_to_float))
        df = df.sort_values(
            "_s", ascending=not descending, na_position="last"
        ).drop(columns=["_s"])

    df = df.reset_index(drop=True)
    # NOTE(review): ranks are numbered after filtering, so a search result
    # always starts at rank 1 -- confirm this is preferred over showing each
    # model's position in the full leaderboard.
    df.insert(0, "Rank", range(1, len(df) + 1))
    return df
def render_table(df: pd.DataFrame) -> str:
    """Render a prepared leaderboard frame as an HTML table card.

    Header labels come from ``LABEL_MAP`` (falling back to the column name).
    ``Rank`` and ``Model`` cells show the raw value, every other cell goes
    through ``format_cell``. All labels and cell texts are HTML-escaped.

    Args:
        df: Frame to render, one table row per frame row.

    Returns:
        The HTML markup, or a short "No entries found." card when the frame
        is empty.
    """
    if df.empty:
        return "<div class='arena-card' style='padding:14px;'>No entries found.</div>"

    columns = list(df.columns)

    def header_cell(name):
        # Rank gets its own class; every column except Model is right-aligned.
        if name == "Rank":
            css = "rank num"
        elif name == "Model":
            css = ""
        else:
            css = "num"
        return f"<th class='{css}'>{html.escape(LABEL_MAP.get(name, name))}</th>"

    def body_cell(name, raw):
        if name == "Rank":
            css, shown = "rank num", raw
        elif name == "Model":
            css, shown = "model", raw
        else:
            css, shown = "num", format_cell(name, raw)
        return f"<td class='{css}'>{html.escape(str(shown))}</td>"

    header_html = "".join(header_cell(name) for name in columns)
    body_html = "".join(
        "<tr>" + "".join(body_cell(name, record[name]) for name in columns) + "</tr>"
        for _, record in df.iterrows()
    )

    return (
        "\n"
        '<div class="arena-card">\n'
        '<div class="arena-wrap">\n'
        '<table class="arena-table">\n'
        f"<thead><tr>{header_html}</tr></thead>\n"
        f"<tbody>{body_html}</tbody>\n"
        "</table>\n"
        "</div>\n"
        "</div>\n"
    )
def update(q: str, s: str, d: bool) -> str:
    """Build the table HTML for the current state of the UI controls.

    Args:
        q: Search text, forwarded to ``prepare_df`` as ``query``.
        s: Column to sort by, forwarded as ``sort_by``.
        d: Whether to sort in descending order.

    Returns:
        The markup produced by ``render_table`` for the prepared frame.
    """
    prepared = prepare_df(q, s, d)
    return render_table(prepared)
# Every column except the model name can be chosen as the sort key.
SORT_CHOICES = [c for c in COLS if c != "Model"]

with gr.Blocks(title="ToolGym Leaderboard", css=ARENA_CSS) as demo:
    gr.Markdown("# 🏆 ToolGym Leaderboard")
    gr.Markdown("Full leaderboard breakdown. Update by editing `leaderboard.csv` via PR.")

    with gr.Row():
        query = gr.Textbox(label="Search", placeholder="e.g., deepseek, gemini, qwen ...")
        sort_by = gr.Dropdown(label="Sort by", choices=SORT_CHOICES, value="Score")
        descending = gr.Checkbox(value=True, label="Descending")

    table = gr.HTML()

    # Re-render the table whenever any control changes, and once on page load.
    controls = [query, sort_by, descending]
    for control in controls:
        control.change(update, inputs=controls, outputs=table)
    demo.load(update, inputs=controls, outputs=table)

    # File modification time, formatted once while the UI is being built.
    # A timezone-aware conversion replaces datetime.utcfromtimestamp(), which
    # is deprecated since Python 3.12; the rendered text is unchanged.
    ts = ""
    if CSV_PATH.exists():
        modified = datetime.fromtimestamp(CSV_PATH.stat().st_mtime, tz=timezone.utc)
        ts = modified.strftime("%Y-%m-%d %H:%M UTC")
    gr.Markdown(f"<small>Source: <code>leaderboard.csv</code>{(' · Last updated: ' + ts) if ts else ''}</small>")

    with gr.Accordion("Submit / Update", open=False):
        gr.Markdown(
            "- Open a PR editing `leaderboard.csv`.\n"
            "- Include: model name, evaluation setting/commit hash, and the metrics.\n"
        )

demo.launch()