xiziqiao committed on
Commit
6a2f803
verified
1 Parent(s): e7fdf18

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -108
app.py CHANGED
@@ -7,8 +7,8 @@ import gradio as gr
7
 
8
  CSV_PATH = Path("leaderboard.csv")
9
 
10
- # Full breakdown columns = Appendix Table 6 (same metrics, same names)
11
- FULL_COLS = [
12
  "Model",
13
  "Score",
14
  "Completeness",
@@ -26,19 +26,6 @@ FULL_COLS = [
26
  "Goal Decomposition",
27
  ]
28
 
29
- # A compact "Arena-like" summary view (close to your Table 2)
30
- SUMMARY_COLS = [
31
- "Model",
32
- "Score",
33
- "Completeness",
34
- "Grounding",
35
- "Recovery Rate",
36
- "Flexibility",
37
- "Format",
38
- "Tool Calls",
39
- "Goal Decomposition",
40
- ]
41
-
42
  PERCENT_COLS = {
43
  "Success Rate",
44
  "Recovery Rate",
@@ -56,12 +43,12 @@ LABEL_MAP = {
56
  "Goal Decomposition": "Goal Decomp.",
57
  }
58
 
 
59
  ARENA_CSS = r"""
60
- /* ===== Force a clean "Arena-like" light theme with proper contrast ===== */
61
  :root { color-scheme: light; }
62
  html, body { background: #f6f7fb !important; }
63
 
64
- /* ---- Gradio theme tokens (this fixes your "light bg + light text" issue) ---- */
65
  .gradio-container{
66
  max-width: 1200px !important;
67
  margin: 0 auto !important;
@@ -70,10 +57,10 @@ html, body { background: #f6f7fb !important; }
70
  --body-background-fill: #f6f7fb !important;
71
  --body-background-fill-hover: #f6f7fb !important;
72
 
73
- --body-text-color: #0f172a !important; /* main text */
74
- --body-text-color-subdued: #334155 !important; /* secondary text */
75
 
76
- --block-background-fill: #ffffff !important; /* panels */
77
  --block-background-fill-hover: #ffffff !important;
78
  --block-border-color: #e5e7eb !important;
79
 
@@ -98,24 +85,7 @@ html, body { background: #f6f7fb !important; }
98
  --link-text-color-active: #1d4ed8 !important;
99
  }
100
 
101
- /* Make markdown visibly dark (Gradio sometimes keeps dark-theme opacity) */
102
- .gradio-container .prose, .gradio-container .prose *{
103
- color: #0f172a !important;
104
- }
105
- .gradio-container .prose p{
106
- color: #334155 !important;
107
- }
108
-
109
- /* Make inline code pill readable */
110
- .gradio-container code{
111
- background: #f1f5f9 !important;
112
- border: 1px solid #e2e8f0 !important;
113
- color: #0f172a !important;
114
- padding: 1px 6px;
115
- border-radius: 6px;
116
- }
117
-
118
- /* ===== Arena table card ===== */
119
  .arena-card{
120
  background: #ffffff;
121
  border: 1px solid #e5e7eb;
@@ -127,33 +97,38 @@ html, body { background: #f6f7fb !important; }
127
 
128
  table.arena-table{
129
  width: 100%;
130
- min-width: 1100px;
131
  border-collapse: separate;
132
  border-spacing: 0;
133
  font-size: 13px;
134
  color: #0f172a;
135
  }
136
 
137
- /* Header */
 
 
 
 
 
 
138
  table.arena-table thead th{
139
  position: sticky;
140
  top: 0;
141
- z-index: 1;
142
  background: #f8fafc;
143
  color: #334155 !important;
144
  font-weight: 650;
145
  text-align: left;
146
  padding: 10px 12px;
147
- border-bottom: 1px solid #e2e8f0;
148
  white-space: nowrap;
149
  }
150
 
151
- /* Body */
152
  table.arena-table tbody td{
153
  padding: 10px 12px;
154
- border-bottom: 1px solid #eef2f7;
155
  white-space: nowrap;
156
- color: #0f172a !important; /* IMPORTANT: force readable text */
157
  }
158
 
159
  table.arena-table tbody tr:nth-child(even){ background: #fbfdff; }
@@ -163,8 +138,21 @@ table.arena-table th.num, table.arena-table td.num{
163
  text-align: right;
164
  font-variant-numeric: tabular-nums;
165
  }
166
- table.arena-table td.model{ font-weight: 600; }
 
167
  table.arena-table td.rank{ width: 52px; color: #64748b !important; }
 
 
 
 
 
 
 
 
 
 
 
 
168
  """
169
 
170
  def _to_float(x):
@@ -173,7 +161,6 @@ def _to_float(x):
173
  return float("nan")
174
  if isinstance(x, (int, float)) and not pd.isna(x):
175
  return float(x)
176
-
177
  s = str(x).strip()
178
  if not s:
179
  return float("nan")
@@ -187,44 +174,37 @@ def _to_float(x):
187
 
188
  def load_df() -> pd.DataFrame:
189
  if not CSV_PATH.exists():
190
- return pd.DataFrame(columns=FULL_COLS)
191
-
192
  df = pd.read_csv(CSV_PATH)
193
- for c in FULL_COLS:
194
  if c not in df.columns:
195
  df[c] = ""
196
- return df[FULL_COLS]
197
 
198
  def format_cell(col: str, val) -> str:
199
  if val is None or (isinstance(val, float) and pd.isna(val)):
200
  return ""
 
201
  if col in PERCENT_COLS:
202
- return str(val).strip()
203
  f = _to_float(val)
204
  if pd.isna(f):
205
- return str(val).strip()
206
  return f"{f:.2f}"
207
 
208
- def prepare_df(query: str, sort_by: str, descending: bool, view: str) -> pd.DataFrame:
209
  df = load_df()
210
-
211
- # Search (Arena-style: by model name)
212
  if query:
213
  q = query.lower().strip()
214
  df = df[df["Model"].astype(str).str.lower().str.contains(q, na=False)]
215
 
216
- cols = SUMMARY_COLS if view == "Summary" else FULL_COLS
217
-
218
- # Sort
219
  if sort_by in df.columns:
220
  df = df.assign(_s=df[sort_by].map(_to_float))
221
  df = df.sort_values("_s", ascending=not descending, na_position="last").drop(columns=["_s"])
222
 
223
- # Rank
224
  df = df.reset_index(drop=True)
225
  df.insert(0, "Rank", range(1, len(df) + 1))
226
-
227
- return df[["Rank"] + cols]
228
 
229
  def render_table(df: pd.DataFrame) -> str:
230
  if df.empty:
@@ -232,32 +212,28 @@ def render_table(df: pd.DataFrame) -> str:
232
 
233
  cols = list(df.columns)
234
 
235
- # Header
236
  ths = []
237
  for c in cols:
238
  label = LABEL_MAP.get(c, c)
239
- cls = []
240
  if c == "Rank":
241
- cls += ["rank", "num"]
242
- elif c == "Model":
243
- cls += ["model"]
244
- else:
245
- cls += ["num"]
246
- ths.append(f"<th class=\"{' '.join(cls)}\">{html.escape(label)}</th>")
247
-
248
- # Body
249
  rows = []
250
  for _, row in df.iterrows():
251
  tds = []
252
  for c in cols:
253
  if c == "Rank":
254
  cls = "rank num"
 
255
  elif c == "Model":
256
  cls = "model"
 
257
  else:
258
  cls = "num"
259
- val = format_cell(c, row[c])
260
- tds.append(f"<td class=\"{cls}\">{html.escape(str(val))}</td>")
261
  rows.append("<tr>" + "".join(tds) + "</tr>")
262
 
263
  return f"""
@@ -271,57 +247,36 @@ def render_table(df: pd.DataFrame) -> str:
271
  </div>
272
  """
273
 
274
- def update(query: str, view: str, sort_by: str, descending: bool) -> str:
275
- df = prepare_df(query, sort_by, descending, view)
276
- return render_table(df)
277
 
278
- def sort_choices(view: str):
279
- cols = SUMMARY_COLS if view == "Summary" else FULL_COLS
280
- return [c for c in cols if c not in ("Model",)]
281
 
282
  with gr.Blocks(title="ToolGym Leaderboard", css=ARENA_CSS) as demo:
283
  gr.Markdown("# 🏆 ToolGym Leaderboard")
284
- gr.Markdown("Arena-style leaderboard view (Summary / Full breakdown). Update by editing `leaderboard.csv` via PR.")
285
 
286
  with gr.Row():
287
  query = gr.Textbox(label="Search", placeholder="e.g., deepseek, gemini, qwen ...")
288
- view = gr.Radio(choices=["Summary", "Full breakdown"], value="Full breakdown", label="View")
289
-
290
- with gr.Row():
291
- sort_by = gr.Dropdown(choices=sort_choices("Full breakdown"), value="Score", label="Sort by")
292
  descending = gr.Checkbox(value=True, label="Descending")
293
 
294
  table = gr.HTML()
295
 
296
- # When switching view, update sort choices + refresh table
297
- def on_view_change(v, q, desc):
298
- new_choices = sort_choices(v)
299
- # keep Score as default if present
300
- default = "Score" if "Score" in new_choices else (new_choices[0] if new_choices else "")
301
- df = prepare_df(q, default, desc, v)
302
- return gr.Dropdown.update(choices=new_choices, value=default), render_table(df)
303
-
304
- view.change(on_view_change, inputs=[view, query, descending], outputs=[sort_by, table])
305
-
306
- # Regular refresh
307
- query.change(update, inputs=[query, view, sort_by, descending], outputs=table)
308
- sort_by.change(update, inputs=[query, view, sort_by, descending], outputs=table)
309
- descending.change(update, inputs=[query, view, sort_by, descending], outputs=table)
310
-
311
- # Initial render
312
- demo.load(update, inputs=[query, view, sort_by, descending], outputs=table)
313
 
314
- # Footer info
315
  if CSV_PATH.exists():
316
  ts = datetime.utcfromtimestamp(CSV_PATH.stat().st_mtime).strftime("%Y-%m-%d %H:%M UTC")
317
- gr.Markdown(f"<small>Source: <code>leaderboard.csv</code> · Last updated: {ts}</small>")
318
- else:
319
- gr.Markdown("<small>Source: <code>leaderboard.csv</code></small>")
320
 
321
  with gr.Accordion("Submit / Update", open=False):
322
  gr.Markdown(
323
- "- Open a PR that edits `leaderboard.csv`.\n"
324
- "- Please include: model name, evaluation setting/commit hash, and the metrics.\n"
325
  )
326
 
327
  demo.launch()
 
7
 
8
  CSV_PATH = Path("leaderboard.csv")
9
 
10
+ # Full breakdown columns (Appendix Table 6)
11
+ COLS = [
12
  "Model",
13
  "Score",
14
  "Completeness",
 
26
  "Goal Decomposition",
27
  ]
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  PERCENT_COLS = {
30
  "Success Rate",
31
  "Recovery Rate",
 
43
  "Goal Decomposition": "Goal Decomp.",
44
  }
45
 
46
+ # Light, Arena-like, high-contrast style (prevents "light bg + light text" issues)
47
  ARENA_CSS = r"""
 
48
  :root { color-scheme: light; }
49
  html, body { background: #f6f7fb !important; }
50
 
51
+ /* Gradio theme tokens */
52
  .gradio-container{
53
  max-width: 1200px !important;
54
  margin: 0 auto !important;
 
57
  --body-background-fill: #f6f7fb !important;
58
  --body-background-fill-hover: #f6f7fb !important;
59
 
60
+ --body-text-color: #0f172a !important;
61
+ --body-text-color-subdued: #334155 !important;
62
 
63
+ --block-background-fill: #ffffff !important;
64
  --block-background-fill-hover: #ffffff !important;
65
  --block-border-color: #e5e7eb !important;
66
 
 
85
  --link-text-color-active: #1d4ed8 !important;
86
  }
87
 
88
+ /* Arena table card */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  .arena-card{
90
  background: #ffffff;
91
  border: 1px solid #e5e7eb;
 
97
 
98
  table.arena-table{
99
  width: 100%;
100
+ min-width: 1300px; /* wide table, scrolls horizontally */
101
  border-collapse: separate;
102
  border-spacing: 0;
103
  font-size: 13px;
104
  color: #0f172a;
105
  }
106
 
107
+ /* IMPORTANT: override any global "prose table" borders */
108
+ table.arena-table th, table.arena-table td{
109
+ border: none !important;
110
+ overflow: visible !important;
111
+ text-overflow: clip !important;
112
+ }
113
+
114
  table.arena-table thead th{
115
  position: sticky;
116
  top: 0;
117
+ z-index: 2;
118
  background: #f8fafc;
119
  color: #334155 !important;
120
  font-weight: 650;
121
  text-align: left;
122
  padding: 10px 12px;
123
+ border-bottom: 1px solid #e2e8f0 !important;
124
  white-space: nowrap;
125
  }
126
 
 
127
  table.arena-table tbody td{
128
  padding: 10px 12px;
129
+ border-bottom: 1px solid #eef2f7 !important;
130
  white-space: nowrap;
131
+ color: #0f172a !important;
132
  }
133
 
134
  table.arena-table tbody tr:nth-child(even){ background: #fbfdff; }
 
138
  text-align: right;
139
  font-variant-numeric: tabular-nums;
140
  }
141
+
142
+ table.arena-table td.model{ font-weight: 650; }
143
  table.arena-table td.rank{ width: 52px; color: #64748b !important; }
144
+
145
+ /* optional: keep Model column visible while horizontal scrolling */
146
+ table.arena-table thead th:first-child,
147
+ table.arena-table tbody td:first-child{
148
+ position: sticky;
149
+ left: 0;
150
+ z-index: 3;
151
+ background: #f8fafc;
152
+ }
153
+ table.arena-table tbody td:first-child{
154
+ background: #ffffff;
155
+ }
156
  """
157
 
158
  def _to_float(x):
 
161
  return float("nan")
162
  if isinstance(x, (int, float)) and not pd.isna(x):
163
  return float(x)
 
164
  s = str(x).strip()
165
  if not s:
166
  return float("nan")
 
174
 
175
  def load_df() -> pd.DataFrame:
176
  if not CSV_PATH.exists():
177
+ return pd.DataFrame(columns=COLS)
 
178
  df = pd.read_csv(CSV_PATH)
179
+ for c in COLS:
180
  if c not in df.columns:
181
  df[c] = ""
182
+ return df[COLS]
183
 
184
  def format_cell(col: str, val) -> str:
185
  if val is None or (isinstance(val, float) and pd.isna(val)):
186
  return ""
187
+ s = str(val).strip()
188
  if col in PERCENT_COLS:
189
+ return s
190
  f = _to_float(val)
191
  if pd.isna(f):
192
+ return s
193
  return f"{f:.2f}"
194
 
195
+ def prepare_df(query: str, sort_by: str, descending: bool) -> pd.DataFrame:
196
  df = load_df()
 
 
197
  if query:
198
  q = query.lower().strip()
199
  df = df[df["Model"].astype(str).str.lower().str.contains(q, na=False)]
200
 
 
 
 
201
  if sort_by in df.columns:
202
  df = df.assign(_s=df[sort_by].map(_to_float))
203
  df = df.sort_values("_s", ascending=not descending, na_position="last").drop(columns=["_s"])
204
 
 
205
  df = df.reset_index(drop=True)
206
  df.insert(0, "Rank", range(1, len(df) + 1))
207
+ return df
 
208
 
209
  def render_table(df: pd.DataFrame) -> str:
210
  if df.empty:
 
212
 
213
  cols = list(df.columns)
214
 
 
215
  ths = []
216
  for c in cols:
217
  label = LABEL_MAP.get(c, c)
218
+ cls = "num" if c not in ("Model",) else ""
219
  if c == "Rank":
220
+ cls = "rank num"
221
+ ths.append(f"<th class='{cls}'>{html.escape(label)}</th>")
222
+
 
 
 
 
 
223
  rows = []
224
  for _, row in df.iterrows():
225
  tds = []
226
  for c in cols:
227
  if c == "Rank":
228
  cls = "rank num"
229
+ val = row[c]
230
  elif c == "Model":
231
  cls = "model"
232
+ val = row[c]
233
  else:
234
  cls = "num"
235
+ val = format_cell(c, row[c])
236
+ tds.append(f"<td class='{cls}'>{html.escape(str(val))}</td>")
237
  rows.append("<tr>" + "".join(tds) + "</tr>")
238
 
239
  return f"""
 
247
  </div>
248
  """
249
 
250
+ def update(q: str, s: str, d: bool) -> str:
251
+ return render_table(prepare_df(q, s, d))
 
252
 
253
+ SORT_CHOICES = [c for c in COLS if c != "Model"]
 
 
254
 
255
  with gr.Blocks(title="ToolGym Leaderboard", css=ARENA_CSS) as demo:
256
  gr.Markdown("# 🏆 ToolGym Leaderboard")
257
+ gr.Markdown("Full leaderboard breakdown. Update by editing `leaderboard.csv` via PR.")
258
 
259
  with gr.Row():
260
  query = gr.Textbox(label="Search", placeholder="e.g., deepseek, gemini, qwen ...")
261
+ sort_by = gr.Dropdown(label="Sort by", choices=SORT_CHOICES, value="Score")
 
 
 
262
  descending = gr.Checkbox(value=True, label="Descending")
263
 
264
  table = gr.HTML()
265
 
266
+ query.change(update, inputs=[query, sort_by, descending], outputs=table)
267
+ sort_by.change(update, inputs=[query, sort_by, descending], outputs=table)
268
+ descending.change(update, inputs=[query, sort_by, descending], outputs=table)
269
+ demo.load(update, inputs=[query, sort_by, descending], outputs=table)
 
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
+ ts = ""
272
  if CSV_PATH.exists():
273
  ts = datetime.utcfromtimestamp(CSV_PATH.stat().st_mtime).strftime("%Y-%m-%d %H:%M UTC")
274
+ gr.Markdown(f"<small>Source: <code>leaderboard.csv</code>{(' · Last updated: ' + ts) if ts else ''}</small>")
 
 
275
 
276
  with gr.Accordion("Submit / Update", open=False):
277
  gr.Markdown(
278
+ "- Open a PR editing `leaderboard.csv`.\n"
279
+ "- Include: model name, evaluation setting/commit hash, and the metrics.\n"
280
  )
281
 
282
  demo.launch()