j_yoon.song committed on
Commit
0865d34
·
1 Parent(s): abb7c49
Files changed (5) hide show
  1. app.py +118 -33
  2. src/about.py +2 -2
  3. src/config.py +39 -0
  4. src/data/export_category_250618.csv +1 -33
  5. src/data_utils.py +0 -0
app.py CHANGED
@@ -3,6 +3,7 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from huggingface_hub import snapshot_download
 
6
 
7
  from src.about import (
8
  CITATION_BUTTON_LABEL,
@@ -57,37 +58,39 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
57
  pending_eval_queue_df,
58
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
 
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
- value=dataframe,
65
- datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
- interactive=False,
89
- )
 
90
 
 
91
 
92
  demo = gr.Blocks(css=custom_css)
93
  with demo:
@@ -95,8 +98,91 @@ with demo:
95
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
 
97
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -201,5 +287,4 @@ with demo:
201
  scheduler = BackgroundScheduler()
202
  scheduler.add_job(restart_space, "interval", seconds=1800)
203
  scheduler.start()
204
- print("test")
205
  demo.queue(default_concurrency_limit=40).launch()
 
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from huggingface_hub import snapshot_download
6
+ from src.data_utils import get_dataframe_category, get_dataframe_language
7
 
8
  from src.about import (
9
  CITATION_BUTTON_LABEL,
 
58
  pending_eval_queue_df,
59
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
60
 
61
+ # def init_leaderboard(dataframe):
62
+ # if dataframe is None or dataframe.empty:
63
+ # raise ValueError("Leaderboard DataFrame is empty or None.")
64
+ # return Leaderboard(
65
+ # value=dataframe,
66
+ # datatype=[c.type for c in fields(AutoEvalColumn)],
67
+ # select_columns=SelectColumns(
68
+ # default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
69
+ # cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
70
+ # label="Select Columns to Display:",
71
+ # ),
72
+ # search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
73
+ # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
74
+ # filter_columns=[
75
+ # ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
76
+ # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
77
+ # ColumnFilter(
78
+ # AutoEvalColumn.params.name,
79
+ # type="slider",
80
+ # min=0.01,
81
+ # max=150,
82
+ # label="Select the number of parameters (B)",
83
+ # ),
84
+ # ColumnFilter(
85
+ # AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
86
+ # ),
87
+ # ],
88
+ # bool_checkboxgroup_label="Hide models",
89
+ # interactive=False,
90
+ # )
91
+
92
 
93
+ tab_keys = ["Category", "Language"]
94
 
95
  demo = gr.Blocks(css=custom_css)
96
  with demo:
 
98
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
99
 
100
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
101
+
102
+ def search_leaderboard(query, df):
103
+ if not query.strip():
104
+ return df
105
+ filtered = df[df.apply(lambda row: row.astype(str).str.contains(query, case=False).any(), axis=1)]
106
+ return filtered
107
+
108
+ def update_modelselector_group(groups, df):
109
+ """
110
+ groups (gr.CheckboxGroup): List of currently selected models
111
+ df (DataFrame or gr.State): Current dataframe
112
+ """
113
+ print("groups:", groups)
114
+ if not groups:
115
+ return None
116
+
117
+ filtered_df = df[df["Group"].isin(groups)]
118
+ models = filtered_df["Model Name"].unique().tolist()
119
+
120
+ return models
121
+
122
+ def update_columnselector_group(columns, groups, df):
123
+ print("column groups:", groups)
124
+
125
+ columns = [c for c in columns if c in df.columns[:3]]
126
+
127
+ columns.extend(df.columns[3:])
128
+
129
+ print(columns)
130
+
131
+ return columns
132
+
133
+
134
+ def update_leaderboard(models, columns, df):
135
+ print("models:", models)
136
+ print("columns:", columns)
137
+
138
+ filtered_df = df[df["Model Name"].isin(models)]
139
+ filtered_columns = [c for c in df.columns if c in columns or c in ["Model Name"]]
140
+ filtered_df = filtered_df[filtered_columns]
141
+
142
+ for col in filtered_df.select_dtypes(include="number").columns:
143
+ filtered_df[col] = filtered_df[col].round(3)
144
+
145
+ return filtered_df
146
+
147
+ def get_models_by_group(df, groups):
148
+ return df[df["Group"].isin(groups)]["Model Name"].tolist()
149
+
150
+ for _, key in enumerate(tab_keys):
151
+ with gr.TabItem(key, visible=True):
152
+ if key == "Category":
153
+ df = get_dataframe_category()
154
+ else:
155
+ df = get_dataframe_language()
156
+ df_state = gr.State(df)
157
+
158
+ with gr.Row():
159
+ with gr.Column():
160
+ search_box = gr.Textbox(label="Search Model by Name")
161
+ group_list = df["Group"].unique().tolist()
162
+ group_selector = gr.CheckboxGroup(choices=df["Group"].unique().tolist(), value=group_list, label="Select Model Group")
163
+
164
+ if key == "Category":
165
+ column_selector = gr.CheckboxGroup(choices=df.columns.tolist()[3:], value=configs.ON_LOAD_COLUMNS_CATEGORY[3:], label="Select Columns")
166
+ else:
167
+ column_selector = gr.CheckboxGroup(choices=df.columns.tolist()[3:], value=configs.ON_LOAD_COLUMNS_LANG[3:], label="Select Columns")
168
+
169
+ with gr.Column():
170
+ with gr.Accordion("세부 사항", open=False):
171
+ model_group = df["Model Name"].tolist()
172
+ model_selector = gr.CheckboxGroup(choices=df["Model Name"].tolist(), value=model_group, label="Select Models")
173
+
174
+ ld = gr.DataFrame(
175
+ value=df.round(3)
176
+ )
177
+
178
+ # Define change functions for user interaction
179
+ search_box.change(fn=search_leaderboard, inputs=[search_box, df_state], outputs=ld)
180
+ group_selector.change(fn=update_modelselector_group, inputs=[group_selector, df_state], outputs=model_selector)
181
+ model_selector.change(fn=update_leaderboard, inputs=[model_selector, column_selector, df_state], outputs=ld)
182
+ column_selector.change(fn=update_leaderboard, inputs=[model_selector, column_selector, df_state], outputs=ld)
183
+
184
+ # with gr.TabItem("Docs"):
185
+ # gr.Markdown((Path(__file__).parent / "docs.md").read_text())
186
 
187
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
188
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
287
  scheduler = BackgroundScheduler()
288
  scheduler.add_job(restart_space, "interval", seconds=1800)
289
  scheduler.start()
 
290
  demo.queue(default_concurrency_limit=40).launch()
src/about.py CHANGED
@@ -21,11 +21,11 @@ NUM_FEWSHOT = 0 # Change with your few shot
21
 
22
 
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
- Intro text
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
 
21
 
22
 
23
  # Your leaderboard name
24
+ TITLE = """<h1 align="center" id="space-title">🥇 ProductivityBench (v1)</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
+ ProductivityBench is designed to evaluate LLMs for Productivity Assistants which stand for human's job productivity.
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
src/config.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ON_LOAD_COLUMNS_LANG = [
2
+ "Model Name",
3
+ "Group",
4
+ "Overall",
5
+ "KO",
6
+ "EN",
7
+ "JA",
8
+ "ZH",
9
+ "PL",
10
+ "DE",
11
+ "PT",
12
+ "ES",
13
+ "FR",
14
+ "IT",
15
+ "RU",
16
+ "VI"
17
+ ]
18
+
19
+ ON_LOAD_COLUMNS_CATEGORY = [
20
+ "Model Name",
21
+ "Group",
22
+ "Overall",
23
+ "Content Generation",
24
+ "Editing",
25
+ "Data Analysis",
26
+ "Reasoning",
27
+ "Samsung Knowledge",
28
+ "Hallucination",
29
+ "Safety",
30
+ "Repeatition",
31
+ "Summarization",
32
+ "Translation",
33
+ "Multi-Turn"
34
+ ]
35
+
36
+ COLUMN_GROUP_LIST = [
37
+ "Category",
38
+ "Language"
39
+ ]
src/data/export_category_250618.csv CHANGED
@@ -8,36 +8,4 @@
8
  "deepseek_r1" "DeepSeek" "55.27" "61.69" "54.76" "68.67" "68.00" "46.67" "51.67" "20.00" "46.67" "67.81" "49.00" "43.33"
9
  "deepseek_r1_0528" "DeepSeek" "52.60" "59.09" "51.19" "65.33" "65.00" "38.33" "43.33" "27.50" "53.33" "69.18" "41.33" "41.67"
10
  "deepseek_v3" "DeepSeek" "56.99" "62.99" "58.93" "58.00" "59.00" "36.67" "41.67" "25.00" "40.00" "72.60" "60.00" "46.67"
11
- "deepseek_v3_0324" "DeepSeek" "54.51" "55.84" "48.21" "63.33" "70.00" "43.33" "50.00" "20.00" "46.67" "72.95" "49.67" "43.33"
12
- "gemini-1.5-flash" "Gemini" "45.24" "50.65" "42.26" "46.67" "43.00" "20.00" "53.33" "21.25" "13.33" "66.44" "40.00" "39.44"
13
- "gemini-1.5-pro" "Gemini" "52.48" "57.14" "50.00" "50.00" "54.00" "43.33" "51.67" "33.75" "30.00" "69.52" "52.00" "40.56"
14
- "gemini-2.0-flash" "Gemini" "55.27" "54.55" "54.17" "56.00" "51.00" "58.33" "60.00" "20.00" "40.00" "74.32" "56.00" "42.22"
15
- "gemini-2.5-pro-05-06" "Gemini" "63.98" "62.99" "61.90" "70.67" "72.00" "48.33" "73.33" "23.75" "43.33" "78.77" "66.00" "52.78"
16
- "Gemma-2-27B-it" "Gemma" "43.14" "51.95" "38.10" "42.67" "29.00" "21.67" "48.33" "37.50" "20.00" "62.33" "41.00" "32.78"
17
- "Gemma-3-1B-it" "Gemma" "12.96" "25.32" "10.12" "15.33" "9.00" "0.00" "11.67" "27.50" "6.67" "22.60" "2.67" "6.11"
18
- "Gemma-3-4B-it" "Gemma" "29.61" "40.91" "28.57" "30.00" "20.00" "13.33" "20.00" "28.75" "10.00" "51.03" "22.00" "16.11"
19
- "Gemma-3-12B-it" "Gemma" "42.50" "51.30" "48.81" "37.33" "30.00" "23.33" "31.67" "33.75" "16.67" "66.44" "37.33" "28.33"
20
- "Gemma-3-27B-it" "Gemma" "44.09" "53.25" "44.64" "50.00" "39.00" "33.33" "45.00" "26.25" "23.33" "63.36" "33.67" "34.44"
21
- "gpt-4o" "GPT" "56.42" "61.04" "61.31" "58.67" "49.00" "45.00" "51.67" "35.00" "43.33" "73.29" "53.00" "45.56"
22
- "gpt-o1" "GPT" "67.92" "68.18" "76.19" "74.00" "69.00" "35.00" "65.00" "30.00" "66.67" "84.59" "66.67" "58.33"
23
- "gpt-o3" "GPT" "70.33" "76.62" "75.00" "74.67" "79.00" "53.33" "58.33" "23.75" "76.67" "83.56" "74.00" "53.89"
24
- "gpt-o4-mini" "GPT" "65.31" "75.97" "63.69" "76.00" "77.00" "41.67" "55.00" "30.00" "66.67" "81.85" "59.67" "51.67"
25
- "llama3_1_8b_inst" "Llama" "25.79" "37.66" "25.00" "31.33" "18.00" "13.33" "36.67" "23.75" "13.33" "37.67" "17.00" "15.00"
26
- "llama3_1_70b_inst" "Llama" "40.79" "45.45" "41.67" "49.33" "35.00" "23.33" "43.33" "21.25" "20.00" "54.79" "37.33" "32.22"
27
- "llama3_1_405b_fp8_inst" "Llama" "48.03" "50.00" "48.81" "52.67" "47.00" "30.00" "50.00" "22.50" "33.33" "64.04" "47.33" "36.67"
28
- "llama3_3_70b_inst" "Llama" "40.60" "48.70" "43.45" "45.33" "38.00" "16.67" "40.00" "20.00" "16.67" "58.56" "32.67" "33.89"
29
- "llama4_scout" "Llama" "44.98" "46.75" "39.88" "52.67" "43.00" "31.67" "41.67" "22.50" "23.33" "61.30" "44.00" "37.22"
30
- "llama4_maverick" "Llama" "51.65" "54.55" "43.45" "58.67" "55.00" "36.67" "55.00" "32.50" "16.67" "64.04" "53.33" "44.44"
31
- "Mixtral-8x7B-Instruct-v0.1" "Mistral" "22.81" "26.62" "16.07" "24.67" "13.00" "16.67" "38.33" "23.75" "23.33" "37.67" "13.00" "18.33"
32
- "phi-4" "Phi" "39.83" "45.45" "39.88" "47.33" "45.00" "16.67" "33.33" "46.25" "23.33" "51.71" "33.00" "27.78"
33
- "Qwen2-72B-Instruct" "Qwen" "39.52" "42.86" "38.69" "34.67" "31.00" "18.33" "51.67" "32.50" "23.33" "56.16" "37.33" "31.67"
34
- "Qwen2.5-14B-Instruct" "Qwen" "37.99" "45.45" "27.98" "36.67" "39.00" "21.67" "51.67" "32.50" "26.67" "54.45" "32.67" "28.89"
35
- "Qwen2.5-32B-Instruct" "Qwen" "43.84" "51.95" "38.10" "47.33" "45.00" "21.67" "55.00" "35.00" "20.00" "63.36" "36.00" "31.67"
36
- "Qwen2.5-72B-Instruct" "Qwen" "46.19" "52.60" "43.45" "50.67" "42.00" "23.33" "48.33" "37.50" "30.00" "65.41" "39.00" "36.11"
37
- "Qwen-QwQ-32B" "Qwen" "47.46" "54.55" "45.24" "65.33" "66.00" "25.00" "36.67" "21.25" "26.67" "65.07" "39.33" "29.44"
38
- "Qwen3-235B-A22B" "Qwen" "48.09" "59.74" "41.67" "65.33" "71.00" "33.33" "41.67" "20.00" "33.33" "66.44" "30.33" "38.89"
39
- "Gauss2.2-37B-Instruct-250430" "Gauss" "50.70" "52.60" "50.60" "43.33" "42.00" "28.33" "41.67" "26.25" "26.67" "71.23" "58.00" "40.00"
40
- "Gauss2.2-37B-Think-250430" "Gauss" "46.00" "57.14" "40.48" "59.33" "59.00" "26.67" "36.67" "20.00" "36.67" "60.62" "39.67" "32.78"
41
- "GaussO-Owl-Ultra-Think-250604" "Gauss" "57.05" "63.64" "52.98" "66.00" "57.00" "48.33" "55.00" "37.50" "33.33" "75.00" "53.67" "40.56"
42
- "GaussO-Owl-Ultra-Think-250423" "Gauss" "56.10" "61.04" "47.62" "68.00" "69.00" "48.33" "51.67" "23.75" "53.33" "69.86" "53.00" "44.44"
43
- "GaussO-Owl-Ultra-Instruct-250423" "Gauss" "58.58" "64.94" "55.95" "63.33" "69.00" "41.67" "53.33" "25.00" "36.67" "73.97" "60.00" "44.44"
 
8
  "deepseek_r1" "DeepSeek" "55.27" "61.69" "54.76" "68.67" "68.00" "46.67" "51.67" "20.00" "46.67" "67.81" "49.00" "43.33"
9
  "deepseek_r1_0528" "DeepSeek" "52.60" "59.09" "51.19" "65.33" "65.00" "38.33" "43.33" "27.50" "53.33" "69.18" "41.33" "41.67"
10
  "deepseek_v3" "DeepSeek" "56.99" "62.99" "58.93" "58.00" "59.00" "36.67" "41.67" "25.00" "40.00" "72.60" "60.00" "46.67"
11
+ "deepseek_v3_0324" "DeepSeek" "54.51" "55.84" "48.21" "63.33" "70.00" "43.33" "50.00" "20.00" "46.67" "72.95" "49.67" "43.33"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/data_utils.py ADDED
File without changes