linzhengyu committed on
Commit
2576caa
·
1 Parent(s): e20b429

feat: list benchmarks in different tabs

Browse files
Files changed (4) hide show
  1. app.py +21 -9
  2. src/about.py +4 -0
  3. src/display/utils.py +1 -1
  4. uv.lock +1 -12
app.py CHANGED
@@ -13,11 +13,13 @@ from src.about import (
13
  INTRODUCTION_TEXT,
14
  LLM_BENCHMARKS_TEXT,
15
  TITLE,
 
16
  )
17
  from src.display.css_html_js import custom_css
18
  from src.display.utils import (
19
  BENCHMARK_COLS,
20
  COLS,
 
21
  EVAL_COLS,
22
  EVAL_TYPES,
23
  AutoEvalColumn,
@@ -76,17 +78,17 @@ LEADERBOARD_DF = get_leaderboard_df(
76
  ) = get_evaluation_queue_df(settings.EVAL_REQUESTS_PATH, EVAL_COLS)
77
 
78
 
79
- def init_leaderboard(dataframe: pd.DataFrame) -> Leaderboard:
80
  if dataframe is None or dataframe.empty:
81
  raise ValueError("Leaderboard DataFrame is empty or None.")
82
  # print("///// --- dataframe.head() --- /////", Markdown(dataframe.head().to_markdown() or "No data"))
83
  selected_columns = SelectColumns(
84
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
85
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
86
  label="Select Columns to Display:",
87
  )
88
  search_columns = [AutoEvalColumn.model.name, AutoEvalColumn.license.name]
89
- hidden_columns = [c.name for c in fields(AutoEvalColumn) if c.hidden]
90
  filter_columns = [
91
  ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
92
  ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
@@ -106,7 +108,7 @@ def init_leaderboard(dataframe: pd.DataFrame) -> Leaderboard:
106
  ]
107
  return Leaderboard(
108
  value=dataframe,
109
- datatype=[c.type for c in fields(AutoEvalColumn)],
110
  select_columns=selected_columns,
111
  search_columns=search_columns,
112
  hide_columns=hidden_columns,
@@ -122,13 +124,23 @@ with demo:
122
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
123
 
124
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
125
- with gr.TabItem("πŸ… LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
126
- leaderboard = init_leaderboard(LEADERBOARD_DF)
127
 
128
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
 
 
 
 
 
 
 
 
 
 
 
 
129
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
130
 
131
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
132
  with gr.Column():
133
  with gr.Row():
134
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
13
  INTRODUCTION_TEXT,
14
  LLM_BENCHMARKS_TEXT,
15
  TITLE,
16
+ BENCHMARKS,
17
  )
18
  from src.display.css_html_js import custom_css
19
  from src.display.utils import (
20
  BENCHMARK_COLS,
21
  COLS,
22
+ BASE_COLS,
23
  EVAL_COLS,
24
  EVAL_TYPES,
25
  AutoEvalColumn,
 
78
  ) = get_evaluation_queue_df(settings.EVAL_REQUESTS_PATH, EVAL_COLS)
79
 
80
 
81
+ def init_leaderboard(dataframe: pd.DataFrame, cols: list[str]) -> Leaderboard:
82
  if dataframe is None or dataframe.empty:
83
  raise ValueError("Leaderboard DataFrame is empty or None.")
84
  # print("///// --- dataframe.head() --- /////", Markdown(dataframe.head().to_markdown() or "No data"))
85
  selected_columns = SelectColumns(
86
+ default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.name in cols],
87
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden and c.name in cols],
88
  label="Select Columns to Display:",
89
  )
90
  search_columns = [AutoEvalColumn.model.name, AutoEvalColumn.license.name]
91
+ hidden_columns = [c.name for c in fields(AutoEvalColumn) if c.hidden and c.name in cols]
92
  filter_columns = [
93
  ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
94
  ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
 
108
  ]
109
  return Leaderboard(
110
  value=dataframe,
111
+ datatype=[c.type for c in fields(AutoEvalColumn) if c.name in cols],
112
  select_columns=selected_columns,
113
  search_columns=search_columns,
114
  hide_columns=hidden_columns,
 
124
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
125
 
126
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
 
 
127
 
128
+ for i, benchmark in enumerate(sorted(BENCHMARKS)):
129
+ with gr.TabItem(f"πŸ… {benchmark}", elem_id="llm-benchmark-tab-table", id=i):
130
+ benchmark_cols = [BENCHMARK_COL for BENCHMARK_COL in BENCHMARK_COLS if BENCHMARK_COL.startswith(benchmark)]
131
+ cols = BASE_COLS + benchmark_cols
132
+ BENCHMARK_DF = get_leaderboard_df(
133
+ settings.EVAL_RESULTS_PATH,
134
+ settings.EVAL_REQUESTS_PATH,
135
+ cols,
136
+ benchmark_cols,
137
+ )
138
+ leaderboard = init_leaderboard(BENCHMARK_DF, cols)
139
+
140
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=len(BENCHMARKS)):
141
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
142
 
143
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=len(BENCHMARKS) + 1):
144
  with gr.Column():
145
  with gr.Row():
146
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
src/about.py CHANGED
@@ -46,6 +46,10 @@ class Tasks(Enum):
46
  task8_3 = Task(benchmark="VSI (MCQ)", metric="rand", col_name="VSI (MCQ)(rand)")
47
 
48
 
 
 
 
 
49
  NUM_FEWSHOT = 0 # Change with your few shot
50
  # ---------------------------------------------------
51
 
 
46
  task8_3 = Task(benchmark="VSI (MCQ)", metric="rand", col_name="VSI (MCQ)(rand)")
47
 
48
 
49
+ BENCHMARKS = {m.value.benchmark for m in Tasks}
50
+ METRICS = {m.value.metric for m in Tasks}
51
+ COL_NAMES = {m.value.col_name for m in Tasks}
52
+
53
  NUM_FEWSHOT = 0 # Change with your few shot
54
  # ---------------------------------------------------
55
 
src/display/utils.py CHANGED
@@ -157,7 +157,7 @@ class Precision(Enum):
157
 
158
  # Column selection
159
  COLS: list[str] = [c.name for c in fields(AutoEvalColumnCls) if not c.hidden]
160
-
161
  EVAL_COLS: list[str] = [c.name for c in fields(EvalQueueColumnCls)]
162
  EVAL_TYPES: list[Literal["str", "number", "bool", "markdown"]] = [c.type for c in fields(EvalQueueColumnCls)]
163
 
 
157
 
158
  # Column selection
159
  COLS: list[str] = [c.name for c in fields(AutoEvalColumnCls) if not c.hidden]
160
+ BASE_COLS: list[str] = [c.name for c in fields(_AutoEvalColumnBase) if not c.hidden]
161
  EVAL_COLS: list[str] = [c.name for c in fields(EvalQueueColumnCls)]
162
  EVAL_TYPES: list[Literal["str", "number", "bool", "markdown"]] = [c.type for c in fields(EvalQueueColumnCls)]
163
 
uv.lock CHANGED
@@ -1,5 +1,5 @@
1
  version = 1
2
- revision = 2
3
  requires-python = "==3.10.*"
4
 
5
  [[package]]
@@ -687,7 +687,6 @@ dependencies = [
687
  { name = "python-dotenv" },
688
  { name = "rich" },
689
  { name = "sentencepiece" },
690
- { name = "tabulate" },
691
  { name = "tokenizers" },
692
  { name = "tqdm" },
693
  { name = "transformers" },
@@ -716,7 +715,6 @@ requires-dist = [
716
  { name = "python-dotenv", specifier = ">=1.2.1" },
717
  { name = "rich", specifier = ">=14.2.0" },
718
  { name = "sentencepiece" },
719
- { name = "tabulate", specifier = ">=0.9.0" },
720
  { name = "tokenizers", specifier = ">=0.15.0" },
721
  { name = "tqdm" },
722
  { name = "transformers" },
@@ -1307,15 +1305,6 @@ wheels = [
1307
  { url = "https://files.pythonhosted.org/packages/be/72/2db2f49247d0a18b4f1bb9a5a39a0162869acf235f3a96418363947b3d46/starlette-0.48.0-py3-none-any.whl", hash = "sha256:0764ca97b097582558ecb498132ed0c7d942f233f365b86ba37770e026510659", size = 73736, upload-time = "2025-09-13T08:41:03.869Z" },
1308
  ]
1309
 
1310
- [[package]]
1311
- name = "tabulate"
1312
- version = "0.9.0"
1313
- source = { registry = "https://pypi.org/simple" }
1314
- sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" }
1315
- wheels = [
1316
- { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" },
1317
- ]
1318
-
1319
  [[package]]
1320
  name = "tokenizers"
1321
  version = "0.22.1"
 
1
  version = 1
2
+ revision = 3
3
  requires-python = "==3.10.*"
4
 
5
  [[package]]
 
687
  { name = "python-dotenv" },
688
  { name = "rich" },
689
  { name = "sentencepiece" },
 
690
  { name = "tokenizers" },
691
  { name = "tqdm" },
692
  { name = "transformers" },
 
715
  { name = "python-dotenv", specifier = ">=1.2.1" },
716
  { name = "rich", specifier = ">=14.2.0" },
717
  { name = "sentencepiece" },
 
718
  { name = "tokenizers", specifier = ">=0.15.0" },
719
  { name = "tqdm" },
720
  { name = "transformers" },
 
1305
  { url = "https://files.pythonhosted.org/packages/be/72/2db2f49247d0a18b4f1bb9a5a39a0162869acf235f3a96418363947b3d46/starlette-0.48.0-py3-none-any.whl", hash = "sha256:0764ca97b097582558ecb498132ed0c7d942f233f365b86ba37770e026510659", size = 73736, upload-time = "2025-09-13T08:41:03.869Z" },
1306
  ]
1307
 
 
 
 
 
 
 
 
 
 
1308
  [[package]]
1309
  name = "tokenizers"
1310
  version = "0.22.1"