jing084 committed
Commit 20c3de5 · verified · 1 Parent(s): 2507b23

Upload app.py


export MOECAP_RESULT_DIR="auto-cap/moe-cap-results"
python app.py
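
Each result file is a JSON object of metrics; `json_to_row` in the new app.py reads the keys shown below. A hypothetical record for orientation (key names come from the code, all values are invented):

# Hypothetical contents of one cap_metrics_*.json record
example_record = {
    "model_name": "org/model-name",      # a "/" in the name is rendered as a Hugging Face link
    "dataset": "gsm8k",
    "method": "example-backend",         # free-form label, shown as-is (hypothetical value)
    "precision": "bfloat16",
    "gsm8k_e2e_s": 812.4,                # end-to-end latency in seconds
    "gsm8k_bs": 32,                      # batch size
    "gpu_type": "NVIDIA-A100-PCIe-80GB",
    "correct": 1120,                     # accuracy = correct / total when both present,
    "total": 1319,                       # otherwise "exact_match" is used
    "decoding_throughput": 1450.7,       # tokens/s
    "prefill_tp": 20321.9,               # tokens/s
    "prefill_smbu": 0.41,                # S-MBU/S-MFU fractions, rendered as percentages
    "prefill_smfu": 0.37,
    "decoding_smbu": 0.22,
    "decoding_smfu": 0.18,
    "ttft": 0.35,                        # time to first token, seconds
    "tpot": 0.022,                       # time per output token, seconds
}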

Files changed (1)
  app.py +283 -481
app.py CHANGED
@@ -1,498 +1,300 @@
 #!/usr/bin/env python
 import os
- import datetime
- import socket
- import base64
- from threading import Thread

 import gradio as gr
 import pandas as pd
- import time
- from apscheduler.schedulers.background import BackgroundScheduler
-
- from huggingface_hub import snapshot_download
- from pytz import utc
-
- from src.display.about import (
-     CITATION_BUTTON_LABEL,
-     CITATION_BUTTON_TEXT,
-     EVALUATION_QUEUE_TEXT,
-     INTRODUCTION_TEXT,
-     LLM_BENCHMARKS_TEXT,
-     LLM_BENCHMARKS_DETAILS,
-     FAQ_TEXT,
-     TITLE,
-     ACKNOWLEDGEMENT_TEXT,
- )
-
- from src.display.css_html_js import custom_css
-
- from src.display.utils import (
-     BENCHMARK_COLS,
-     COLS,
-     EVAL_COLS,
-     EVAL_TYPES,
-     TYPES,
-     AutoEvalColumn,
-     ModelType,
-     InferenceFramework,
-     fields,
-     WeightType,
-     Precision,
-     GPUType
- )
-
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, \
-     QUEUE_REPO, REPO_ID, RESULTS_REPO, DEBUG_QUEUE_REPO, DEBUG_RESULTS_REPO
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
- from src.submission.submit import add_new_eval
- from src.utils import get_dataset_summary_table
-
- def get_args():
-     import argparse
-
-     parser = argparse.ArgumentParser(description="Run the LLM Leaderboard")
-     parser.add_argument("--debug", action="store_true", help="Run in debug mode")
-     return parser.parse_args()
-
- args = get_args()
- if args.debug:
-     print("Running in debug mode")
-     QUEUE_REPO = DEBUG_QUEUE_REPO
-     RESULTS_REPO = DEBUG_RESULTS_REPO
-
- def ui_snapshot_download(repo_id, local_dir, repo_type, tqdm_class, etag_timeout):
    try:
-         print(local_dir)
-         snapshot_download(
-             repo_id=repo_id, local_dir=local_dir, repo_type=repo_type, tqdm_class=tqdm_class, etag_timeout=etag_timeout
-         )
    except Exception as e:
-         restart_space()


- def restart_space():
-     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)


- def init_space():
-     # dataset_df = get_dataset_summary_table(file_path="blog/Hallucination-Leaderboard-Summary.csv")

-     if socket.gethostname() not in {"neuromancer"}:
-         # sync model_type with open-llm-leaderboard
-         ui_snapshot_download(
-             repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
        )
-         ui_snapshot_download(
-             repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
        )
-     raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, "", COLS, BENCHMARK_COLS)

-     finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
-         EVAL_REQUESTS_PATH, EVAL_COLS
-     )
-     # return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
-     return None, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
-
-
- def add_benchmark_columns(shown_columns):
-     benchmark_columns = []
-     for benchmark in BENCHMARK_COLS:
-         if benchmark in shown_columns:
-             for c in COLS:
-                 if benchmark in c and benchmark != c:
-                     benchmark_columns.append(c)
-     return benchmark_columns
-
-
- # Searching and filtering
- def update_table(
-     hidden_df: pd.DataFrame, columns: list, type_query: list, precision_query: list, size_query: list, query: str
- ):
-     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
-     filtered_df = filter_queries(query, filtered_df)
-     benchmark_columns = add_benchmark_columns(columns)
-     df = select_columns(filtered_df, columns + benchmark_columns)
-     return df
-
-
- def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
-     return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
-
-
- def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
-     # always_here_cols = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
-
-     always_here_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
-     dummy_col = [AutoEvalColumn.dummy.name]
-
-     # We use COLS to maintain sorting
-     filtered_df = df[
-         # always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
-         always_here_cols
-         + [c for c in COLS if c in df.columns and c in columns]
-         + dummy_col
-     ]
-     return filtered_df
-
-
- def filter_queries(query: str, filtered_df: pd.DataFrame):
-     final_df = []
-     if query != "":
-         queries = [q.strip() for q in query.split(";")]
-         for _q in queries:
-             _q = _q.strip()
-             if _q != "":
-                 temp_filtered_df = search_table(filtered_df, _q)
-                 if len(temp_filtered_df) > 0:
-                     final_df.append(temp_filtered_df)
-         if len(final_df) > 0:
-             filtered_df = pd.concat(final_df)
-             subset = [AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
-             filtered_df = filtered_df.drop_duplicates(subset=subset)
-     return filtered_df
-
-
- def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precision_query: list) -> pd.DataFrame:
-     # Show all models
-     filtered_df = df
-
-     type_emoji = [t[0] for t in type_query]
-     filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-     filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-     filtered_df = filtered_df.loc[df[AutoEvalColumn.inference_framework.name].isin(size_query)]
-
-     # numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
-     # params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
-     # mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
-     # filtered_df = filtered_df.loc[mask]
-
-     return filtered_df
-
- shown_columns = None
- dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
- leaderboard_df = original_df.copy()
-
- # def update_leaderboard_table():
- #     global leaderboard_df, shown_columns
- #     print("Updating leaderboard table")
- #     return leaderboard_df[
- #         [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
- #         + shown_columns.value
- #         + [AutoEvalColumn.dummy.name]
- #     ] if not leaderboard_df.empty else leaderboard_df
-
-
- # def update_hidden_leaderboard_table():
- #     global original_df
- #     return original_df[COLS] if original_df.empty is False else original_df
-
- # def update_dataset_table():
- #     global dataset_df
- #     return dataset_df
-
- # def update_finish_table():
- #     global finished_eval_queue_df
- #     return finished_eval_queue_df
-
- # def update_running_table():
- #     global running_eval_queue_df
- #     return running_eval_queue_df
-
- # def update_pending_table():
- #     global pending_eval_queue_df
- #     return pending_eval_queue_df
-
- # def update_finish_num():
- #     global finished_eval_queue_df
- #     return len(finished_eval_queue_df)
-
- # def update_running_num():
- #     global running_eval_queue_df
- #     return len(running_eval_queue_df)
-
- # def update_pending_num():
- #     global pending_eval_queue_df
- #     return len(pending_eval_queue_df)
-
- # triggered only once at startup => read query parameter if it exists
- def load_query(request: gr.Request):
-     query = request.query_params.get("query") or ""
-     return query
-
-
- def get_image_html(url, image_path):
-     with open(image_path, "rb") as image_file:
-         encoded_string = base64.b64encode(image_file.read()).decode()
-     return f'<a href="{url}" target="_blank"><img src="data:image/jpg;base64,{encoded_string}" alt="NetMind.AI Logo" style="width:100pt;"></a>'
-
-
- # Prepare the HTML content with the image
- image_html = get_image_html("https://netmind.ai/home", "./src/display/imgs/Netmind.AI_LOGO.jpg")
-
-
- demo = gr.Blocks(css=custom_css)
- with demo:
-     gr.HTML(TITLE)
-     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-     gr.HTML(ACKNOWLEDGEMENT_TEXT.format(image_html=image_html))
-
-     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("open-moe-llm-leaderboard", elem_id="llm-benchmark-tab-table", id=0):
-             with gr.Row():
-                 with gr.Column():
-                     with gr.Row():
-                         search_bar = gr.Textbox(
-                             placeholder=" 🔍 Model search (separate multiple queries with `;`)",
-                             show_label=False,
-                             elem_id="search-bar"
-                         )
-                     with gr.Row():
-                         shown_columns = gr.CheckboxGroup(
-                             choices=[
-                                 c.name
-                                 for c in fields(AutoEvalColumn)
-                                 if not c.hidden and not c.never_hidden and not c.dummy
-                             ],
-                             value=[
-                                 c.name
-                                 for c in fields(AutoEvalColumn)
-                                 if c.displayed_by_default and not c.hidden and not c.never_hidden
-                             ],
-                             label="Tasks",
-                             elem_id="column-select",
-                             interactive=True,
-                         )
-
-                 with gr.Column(min_width=320):
-                     filter_columns_size = gr.CheckboxGroup(
-                         label="Inference frameworks",
-                         choices=[t.to_str() for t in InferenceFramework],
-                         value=[t.to_str() for t in InferenceFramework],
-                         interactive=True,
-                         elem_id="filter-columns-size",
-                     )
-
-                     filter_columns_type = gr.CheckboxGroup(
-                         label="Model types",
-                         choices=[t.to_str() for t in ModelType],
-                         value=[t.to_str() for t in ModelType],
-                         interactive=True,
-                         elem_id="filter-columns-type",
-                     )
-
-                     filter_columns_precision = gr.CheckboxGroup(
-                         label="Precision",
-                         choices=[i.value.name for i in Precision],
-                         value=[i.value.name for i in Precision],
-                         interactive=True,
-                         elem_id="filter-columns-precision",
-                     )
-
-                     # filter_columns_size = gr.CheckboxGroup(
-                     #     label="Model sizes (in billions of parameters)",
-                     #     choices=list(NUMERIC_INTERVALS.keys()),
-                     #     value=list(NUMERIC_INTERVALS.keys()),
-                     #     interactive=True,
-                     #     elem_id="filter-columns-size",
-                     # )
-
-             # breakpoint()
-             benchmark_columns = add_benchmark_columns(shown_columns.value)
-             leaderboard_table = gr.components.Dataframe(
-                 value=(
-                     leaderboard_df[
-                         [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
-                         + shown_columns.value
-                         + benchmark_columns
-                         + [AutoEvalColumn.dummy.name]
-                     ]
-                     if leaderboard_df.empty is False
-                     else leaderboard_df
-                 ),
-                 headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value + benchmark_columns,
-                 datatype=TYPES,
-                 elem_id="leaderboard-table",
-                 interactive=False,
-                 visible=True,
-             )  # column_widths=["2%", "20%"]
-
-             # Dummy leaderboard for handling the case when the user uses backspace key
-             hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                 value=original_df[COLS] if original_df.empty is False else original_df,
-                 headers=COLS,
-                 datatype=TYPES,
-                 visible=False,
-             )
-
-             search_bar.submit(
-                 update_table,
-                 [
-                     hidden_leaderboard_table_for_search,
-                     shown_columns,
-                     filter_columns_type,
-                     filter_columns_precision,
-                     filter_columns_size,
-                     search_bar,
-                 ],
-                 leaderboard_table
-             )
-
-             # Check query parameter once at startup and update search bar
-             demo.load(load_query, inputs=[], outputs=[search_bar])
-
-             for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
-                 selector.change(
-                     update_table,
-                     [
-                         hidden_leaderboard_table_for_search,
-                         shown_columns,
-                         filter_columns_type,
-                         filter_columns_precision,
-                         filter_columns_size,
-                         search_bar,
-                     ],
-                     leaderboard_table,
-                     queue=True,
-                 )

-         # with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
-         #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-         #     dataset_table = gr.components.Dataframe(
-         #         value=dataset_df,
-         #         headers=list(dataset_df.columns),
-         #         datatype=["str", "markdown", "str", "str", "str"],
-         #         elem_id="dataset-table",
-         #         interactive=False,
-         #         visible=True,
-         #         column_widths=["15%", "20%"],
-         #     )
-
-         #     gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
-         #     gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
-
-         with gr.TabItem("Submit a model ", elem_id="llm-benchmark-tab-table", id=3):
-             with gr.Column():
-                 with gr.Row():
-                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                 with gr.Column():
-                     with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
-                         with gr.Row():
-                             finished_eval_table = gr.components.Dataframe(
-                                 value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5
-                             )
-
-                     with gr.Accordion(f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
-                         with gr.Row():
-                             running_eval_table = gr.components.Dataframe(
-                                 value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5
-                             )
-
-                     with gr.Accordion(f"⏳ Scheduled Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
-                         with gr.Row():
-                             pending_eval_table = gr.components.Dataframe(
-                                 value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5
-                             )
-
-             with gr.Row():
-                 gr.Markdown("# Submit your model here", elem_classes="markdown-text")
-
-             with gr.Row():
-                 inference_framework = gr.Dropdown(
-                     choices=[t.to_str() for t in InferenceFramework],
-                     label="Inference framework",
-                     multiselect=False,
-                     value=None,
-                     interactive=True,
-                 )
-
-                 gpu_type = gr.Dropdown(
-                     choices=[t.to_str() for t in GPUType],
-                     label="GPU type",
-                     multiselect=False,
-                     value="NVIDIA-A100-PCIe-80GB",
-                     interactive=True,
-                 )
-
-
-             with gr.Row():
-                 with gr.Column():
-                     model_name_textbox = gr.Textbox(label="Model name")
-                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                     private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
-                     model_type = gr.Dropdown(
-                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                         label="Model type",
-                         multiselect=False,
-                         value=None,
-                         interactive=True,
-                     )
-
-                 with gr.Column():
-                     precision = gr.Dropdown(
-                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                         label="Precision",
-                         multiselect=False,
-                         value="float32",
-                         interactive=True,
-                     )
-
-                     weight_type = gr.Dropdown(
-                         choices=[i.value.name for i in WeightType],
-                         label="Weights type",
-                         multiselect=False,
-                         value="Original",
-                         interactive=True,
-                     )
-
-                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-             submit_button = gr.Button("Submit Eval")
-             submission_result = gr.Markdown()
-             debug = gr.Checkbox(value=args.debug, label="Debug", visible=False)
-             submit_button.click(
-                 add_new_eval,
-                 [
-                     model_name_textbox,
-                     base_model_name_textbox,
-                     revision_name_textbox,
-                     precision,
-                     private,
-                     weight_type,
-                     model_type,
-                     inference_framework,
-                     debug,
-                     gpu_type
-                 ],
-                 submission_result,
-             )
-
-     with gr.Row():
-         with gr.Accordion("Citing this leaderboard", open=False):
-             citation_button = gr.Textbox(
-                 value=CITATION_BUTTON_TEXT,
-                 label=CITATION_BUTTON_LABEL,
-                 lines=20,
-                 elem_id="citation-button",
-                 show_copy_button=True,
-             )
-
- scheduler = BackgroundScheduler(timezone=utc)
-
- scheduler.add_job(restart_space, "interval", hours=6)
-
- def launch_backend():
-     import subprocess
-     from src.backend.envs import DEVICE
-
-     if DEVICE not in {"cpu"}:
-         _ = subprocess.run(["python", "backend-cli.py"])
-
- # Thread(target=periodic_init, daemon=True).start()
- # scheduler.add_job(launch_backend, "interval", seconds=120)
 if __name__ == "__main__":
-     scheduler.start()
-     demo.queue(default_concurrency_limit=40).launch()
-

 #!/usr/bin/env python
 import os
+ os.environ["GRADIO_LANGUAGE"] = "en"
+
+
+ RESULT_DIR = os.environ.get("MOECAP_RESULT_DIR")
+ if not RESULT_DIR:
+     raise RuntimeError(
+         "MOECAP_RESULT_DIR is not set. Please set MOECAP_RESULT_DIR before running app.py"
+     )
+
+ import json
+ from typing import List, Tuple

 import gradio as gr
 import pandas as pd
+ from datasets import load_dataset
+
+
+ def f2(x):
+     """Format to 2 decimal places if number, else return as-is."""
+     if isinstance(x, (int, float)):
+         return round(float(x), 2)
+     return x
+
+
+ def json_to_row(path: str, metrics: dict) -> dict:
+     model_name = metrics.get("model_name")
+     if not model_name:
+         model_name = "unknown-model"
+
+     dataset = metrics.get("dataset", "gsm8k")
+     method = metrics.get("method", "")
+     precision = metrics.get("precision", "")
+     gsm8k_e2e = metrics.get("gsm8k_e2e_s", None)
+     gsm8k_bs = metrics.get("gsm8k_bs", None)
+     gsm8k_gpu = metrics.get("gpu_type", "")
+
+     em = metrics.get("exact_match")
+     correct = metrics.get("correct")
+     total = metrics.get("total")
+     if isinstance(correct, (int, float)) and isinstance(total, (int, float)) and total > 0:
+         acc = correct / total
+     else:
+         acc = em
+
+     def pct(x):
+         return round(x * 100, 2) if isinstance(x, (int, float)) else None
+
+     if isinstance(model_name, str) and "/" in model_name:
+         hf_url = f"https://huggingface.co/{model_name}"
+         model_cell = f"<a href='{hf_url}' target='_blank'>{model_name}</a>"
+     else:
+         model_cell = model_name
+
+     row = {
+         "Model": model_cell,
+         "Dataset": dataset,
+         "Method": method,
+         "Precision": precision,
+         "GSM8K<br>E2E(s)": f2(gsm8k_e2e),
+         "GSM8K<br>bs": gsm8k_bs,
+         "GSM8K<br>GPU": gsm8k_gpu,
+         "GSM8K<br>Accuracy(%)": pct(acc),
+         "GSM8K<br>Decoding T/s": f2(metrics.get("decoding_throughput")),
+         "GSM8K<br>Prefill T/s": f2(metrics.get("prefill_tp")),
+
+         "GSM8K<br>Prefill<br>S-MBU(%)": pct(metrics.get("prefill_smbu")),
+         "GSM8K<br>Prefill<br>S-MFU(%)": pct(metrics.get("prefill_smfu")),
+         "GSM8K<br>Decoding<br>S-MBU(%)": pct(metrics.get("decoding_smbu")),
+         "GSM8K<br>Decoding<br>S-MFU(%)": pct(metrics.get("decoding_smfu")),
+
+         "TTFT(s)": f2(metrics.get("ttft")),
+         "TPOT(s)": f2(metrics.get("tpot")),
+     }
+     return row
+
+
+ # Upload handling
+
+ def build_leaderboard_from_files(files: List[gr.File], prev_rows: list | None = None):
+     if prev_rows is None:
+         prev_rows = []
+
+     if not files and prev_rows:
+         df = pd.DataFrame(prev_rows)
+         raw_models = set()
+         for cell in df["Model"].tolist():
+             if isinstance(cell, str) and "href" in cell:
+                 try:
+                     name = cell.split(">", 1)[1].split("<", 1)[0]
+                 except Exception:
+                     name = cell
+             else:
+                 name = cell
+             raw_models.add(name)
+         links = []
+         for name in sorted(raw_models):
+             if isinstance(name, str) and "/" in name:
+                 hf_url = f"https://huggingface.co/{name}"
+                 links.append(f"[{name}]({hf_url})")
+             else:
+                 links.append(str(name))
+         models_str = ", ".join(links)
+         summary_md = f"**Loaded {len(prev_rows)} result files.** \n**Models:** {models_str}"
+         table_html = df.to_html(escape=False, index=False, classes="metrics-table")
+         return summary_md, table_html, prev_rows
+
+     new_rows = []
+     if files:
+         for f in files:
+             path = f.name
+             try:
+                 with open(path, "r", encoding="utf-8") as fp:
+                     metrics = json.load(fp)
+                 new_rows.append(json_to_row(path, metrics))
+             except Exception:
+                 continue
+
+     all_rows = prev_rows + new_rows
+
+     if not all_rows:
+         empty_html = "<p>No files loaded.</p>"
+         return "No files uploaded.", empty_html, []
+
+     df = pd.DataFrame(all_rows)
+
+     raw_models = set()
+     for cell in df["Model"].tolist():
+         if isinstance(cell, str) and "href" in cell:
+             try:
+                 name = cell.split(">", 1)[1].split("<", 1)[0]
+             except Exception:
+                 name = cell
+         else:
+             name = cell
+         raw_models.add(name)
+     links = []
+     for name in sorted(raw_models):
+         if isinstance(name, str) and "/" in name:
+             hf_url = f"https://huggingface.co/{name}"
+             links.append(f"[{name}]({hf_url})")
+         else:
+             links.append(str(name))
+     models_str = ", ".join(links)
+     summary_md = f"**Loaded {len(all_rows)} result files.** \n**Models:** {models_str}"
+
+     table_html = df.to_html(escape=False, index=False, classes="metrics-table")
+
+     return summary_md, table_html, all_rows
+
+
+ def load_from_dir(dir_path: str):
    try:
+         ds = load_dataset(dir_path, split="train")
    except Exception as e:
+         empty_html = "<p>No files loaded.</p>"
+         return f"Failed to load dataset `{dir_path}`: {e}", empty_html
+
+     rows = []
+     for i, example in enumerate(ds):
+         if isinstance(example, dict):
+             metrics = example.get("metrics") or example.get("json") or example
+         else:
+             metrics = example
+         rows.append(json_to_row(f"{dir_path}#{i}", metrics))
+
+     if not rows:
+         empty_html = "<p>No records found.</p>"
+         return f"No records found in dataset `{dir_path}`.", empty_html
+
+     df = pd.DataFrame(rows)
+
+     raw_models = set()
+     for cell in df["Model"].tolist():
+         if isinstance(cell, str) and "href" in cell:
+             try:
+                 name = cell.split(">", 1)[1].split("<", 1)[0]
+             except Exception:
+                 name = cell
+         else:
+             name = cell
+         raw_models.add(name)
+     links = []
+     for name in sorted(raw_models):
+         if isinstance(name, str) and "/" in name:
+             hf_url = f"https://huggingface.co/{name}"
+             links.append(f"[{name}]({hf_url})")
+         else:
+             links.append(str(name))
+     models_str = ", ".join(links)
+     summary_md = (
+         f"**Loaded {len(rows)} result records from dataset `{dir_path}`.** \n"
+         f"**Models:** {models_str}"
+     )
+
+     table_html = df.to_html(escape=False, index=False, classes="metrics-table")
+
+     return summary_md, table_html
+
+
+ def auto_refresh_from_dir(dir_path: str):
+     """Timer callback: re-read the result directory and rebuild the table."""
+     return load_from_dir(dir_path)
+
+
+ # Gradio UI
+
+ def build_app() -> gr.Blocks:
+     row_css = """
+     .gradio-container table.metrics-table th,
+     .gradio-container table.metrics-table td {
+         padding-top: 10px;
+         padding-bottom: 10px;
+         padding-left: 8px;
+         padding-right: 8px;
+         border: 1px solid #e5e7eb;
+     }
+     .gradio-container table.metrics-table {
+         border-collapse: collapse;
+         width: 100%;
+     }
+     """
+
+     with gr.Blocks(title="MoE-CAP Dashboard", css=row_css) as demo:
+         gr.Markdown("# MoE-CAP Dashboard")
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown(
+                     "### Tasks\n"
+                     "- Mathematics Problem-Solving Performance — "
+                     "[**GSM8K**](https://arxiv.org/abs/2110.14168)\n\n"
+                     "### Columns and Metrics\n"
+                     "- Model \n"
+                     "- Dataset \n"
+                     "- Method \n"
+                     "- Precision \n"
+                     "- GSM8K E2E (s) \n"
+                     "- GSM8K Batch Size \n"
+                     "- GPU Type \n"
+                     "- GSM8K Accuracy (%) \n"
+                     "- Decoding Throughput (tokens/s) \n"
+                     "- Prefill Throughput (tokens/s) \n"
+                     "- Prefill S-MBU (%) \n"
+                     "- Prefill S-MFU (%) \n"
+                     "- Decoding S-MBU (%) \n"
+                     "- Decoding S-MFU (%) \n"
+                     "- TTFT (s) \n"
+                     "- TPOT (s)"
+                 )

+             with gr.Column(scale=1):
+                 # manual upload
+                 # files_input = gr.Files(
+                 #     label="Upload `cap_metrics_*.json` files",
+                 #     file_types=[".json"],
+                 #     file_count="multiple",
+                 # )
+                 # run_button = gr.Button("Parse Uploaded Files")
+
+                 dir_path = gr.Textbox(
+                     label="Load from output directory",
+                     value=RESULT_DIR,
+                     lines=1,
+                 )
+                 load_dir_button = gr.Button("Load from directory")

+                 # upload_summary = gr.Markdown(label="Upload Summary")
+                 # upload_table = gr.HTML(label="Upload Metrics")

+         summary_output = gr.Markdown(label="Directory Summary")
+         leaderboard_output = gr.HTML(label="Directory Metrics")

+         # run_button.click(
+         #     fn=build_leaderboard_from_files,
+         #     inputs=files_input,
+         #     outputs=[upload_summary, upload_table],
+         # )

+         load_dir_button.click(
+             fn=load_from_dir,
+             inputs=dir_path,
+             outputs=[summary_output, leaderboard_output],
        )
+
+         timer = gr.Timer(5.0)
+         timer.tick(
+             fn=auto_refresh_from_dir,
+             inputs=dir_path,
+             outputs=[summary_output, leaderboard_output],
        )

+     return demo
+

 if __name__ == "__main__":
+     app = build_app()
+     app.launch(server_port=7861)
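
Note that `load_from_dir` hands the textbox value directly to `datasets.load_dataset`, so the "output directory" can be a Hub dataset id (such as the `auto-cap/moe-cap-results` value exported above) or anything else `load_dataset` resolves. A minimal sketch of the call that both the "Load from directory" button and the 5-second timer end up making:

from datasets import load_dataset

# Same call load_from_dir makes on every refresh; split="train" is hard-coded there.
ds = load_dataset("auto-cap/moe-cap-results", split="train")
print(len(ds), "records")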