.gitattributes CHANGED
@@ -25,7 +25,6 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,14 +1,46 @@
 ---
-title: test_space
-emoji: 👀
+title: Test Space
+emoji: 🥇
 colorFrom: green
-colorTo: red
+colorTo: indigo
 sdk: gradio
-sdk_version: 5.38.0
 app_file: app.py
-pinned: false
+pinned: true
 license: apache-2.0
-hf_oauth: true
+short_description: Duplicate this leaderboard to initialize your own!
+sdk_version: 5.19.0
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Start the configuration
+
+Most of the variables to change for a default leaderboard are in `src/envs.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
+
+Results files should have the following format and be stored as JSON files:
+```json
+{
+    "config": {
+        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
+        "model_name": "path of the model on the hub: org/model",
+        "model_sha": "revision on the hub",
+    },
+    "results": {
+        "task_name": {
+            "metric_name": score,
+        },
+        "task_name2": {
+            "metric_name": score,
+        }
+    }
+}
+```
+
+Request files are created automatically by this tool.
+
+If you encounter a problem on the Space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
+
+# Code logic for more complex edits
+
+You'll find:
+- the main table's column names and properties in `src/display/utils.py`
+- the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
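For reference, a minimal sketch of how a results file in the format above could be flattened into a single leaderboard row; the file path and task names here are placeholders, not part of the template:

```python
import json

# Hypothetical path: results files live in the results dataset repo.
with open("eval-results/org/model/results_2024-01-01.json") as f:
    data = json.load(f)

row = {
    "model": data["config"]["model_name"],
    "revision": data["config"]["model_sha"],
}
for task_name, metrics in data["results"].items():
    # One column per task; assumes a single metric per task, as in the sketch above.
    row[task_name] = next(iter(metrics.values()))

print(row)
```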
app.py CHANGED
@@ -1,21 +1,15 @@
 import gradio as gr
-
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 from src.data_utils import get_dataframe_category, get_dataframe_language
 import src.config as configs
-from utils import get_profile, get_organizations, get_profile_and_organizations, download_with_restart
-
 
 from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
-    EVALUATION_QUEUE_TEXT_OPTION1,
-    EVALUATION_QUEUE_TEXT_OPTION2,
-    EVALUATION_QUEUE_TEXT_OPTION3,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
@@ -27,45 +21,37 @@ from src.display.utils import (
     EVAL_COLS,
     EVAL_TYPES,
     AutoEvalColumn,
+    ModelType,
     fields,
     WeightType,
     Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval_option1, add_new_eval_option2
+from src.submission.submit import add_new_eval
 
 
-from handlers import (
-    search_leaderboard,
-    update_modelselector_group,
-    update_columnselector_group,
-    update_leaderboard,
-    get_models_by_group,
-)
-from ui import create_leaderboard_tab
-from constants import TAB_KEYS, TAB_NAMES, VLLM_VERSIONS
-
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
 ### Space initialisation
-download_with_restart(
-    snapshot_download,
-    repo_id=QUEUE_REPO,
-    local_dir=EVAL_REQUESTS_PATH,
-    repo_type="dataset",
-    token=TOKEN,
-    restart_func=restart_space
-)
-download_with_restart(
-    snapshot_download,
-    repo_id=RESULTS_REPO,
-    local_dir=EVAL_RESULTS_PATH,
-    repo_type="dataset",
-    token=TOKEN,
-    restart_func=restart_space
-)
+try:
+    print(EVAL_REQUESTS_PATH)
+    snapshot_download(
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+    )
+except Exception:
+    restart_space()
+
+
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
 (
     finished_eval_queue_df,
@@ -73,28 +59,132 @@ download_with_restart(
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
+# def init_leaderboard(dataframe):
+#     if dataframe is None or dataframe.empty:
+#         raise ValueError("Leaderboard DataFrame is empty or None.")
+#     return Leaderboard(
+#         value=dataframe,
+#         datatype=[c.type for c in fields(AutoEvalColumn)],
+#         select_columns=SelectColumns(
+#             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+#             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+#             label="Select Columns to Display:",
+#         ),
+#         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+#         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+#         filter_columns=[
+#             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+#             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+#             ColumnFilter(
+#                 AutoEvalColumn.params.name,
+#                 type="slider",
+#                 min=0.01,
+#                 max=150,
+#                 label="Select the number of parameters (B)",
+#             ),
+#             ColumnFilter(
+#                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+#             ),
+#         ],
+#         bool_checkboxgroup_label="Hide models",
+#         interactive=False,
+#     )
+
+
+tab_keys = ["Category", "Language"]
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-    user_state = gr.State()
-    organization_state = gr.State()
+
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        for _, key in enumerate(TAB_KEYS):
-            if key == "Category":
-                df = get_dataframe_category()
-                column_selector_value = configs.ON_LOAD_COLUMNS_CATEGORY[3:]
-            else:
-                df = get_dataframe_language()
-                column_selector_value = configs.ON_LOAD_COLUMNS_LANG[3:]
-            create_leaderboard_tab(
-                df,
-                key,
-                search_leaderboard,
-                update_modelselector_group,
-                update_leaderboard,
-                column_selector_value
-            )
+
+        def search_leaderboard(query, df):
+            if not query.strip():
+                return df
+            filtered = df[df.apply(lambda row: row.astype(str).str.contains(query, case=False).any(), axis=1)]
+            return filtered
+
+        def update_modelselector_group(groups, df):
+            """
+            groups (gr.CheckboxGroup): List of currently selected groups
+            df (DataFrame or gr.State): Current dataframe
+            """
+            print("groups:", groups)
+            if not groups:
+                return None
+
+            filtered_df = df[df["Group"].isin(groups)]
+            models = filtered_df["Model Name"].unique().tolist()
+
+            return models
+
+        def update_columnselector_group(columns, groups, df):
+            print("column groups:", groups)
+
+            columns = [c for c in columns if c in df.columns[:3]]
+
+            columns.extend(df.columns[3:])
+
+            print(columns)
+
+            return columns
+
+
+        def update_leaderboard(models, columns, df):
+            print("models:", models)
+            print("columns:", columns)
+
+            filtered_df = df[df["Model Name"].isin(models)]
+            filtered_columns = [c for c in df.columns if c in columns or c in ["Model Name"]]
+            filtered_df = filtered_df[filtered_columns]
+
+            for col in filtered_df.select_dtypes(include="number").columns:
+                filtered_df[col] = filtered_df[col].round(3)
+
+            return filtered_df
+
+        def get_models_by_group(df, groups):
+            return df[df["Group"].isin(groups)]["Model Name"].tolist()
+
+        for _, key in enumerate(tab_keys):
+            with gr.TabItem(key, visible=True):
+                if key == "Category":
+                    df = get_dataframe_category()
+                else:
+                    df = get_dataframe_language()
+                df_state = gr.State(df)
+
+                with gr.Row():
+                    with gr.Column():
+                        search_box = gr.Textbox(label="Search Model by Name")
+                        group_list = df["Group"].unique().tolist()
+                        group_selector = gr.CheckboxGroup(choices=df["Group"].unique().tolist(), value=group_list, label="Select Model Group")
+
+                        if key == "Category":
+                            column_selector = gr.CheckboxGroup(choices=df.columns.tolist()[3:], value=configs.ON_LOAD_COLUMNS_CATEGORY[3:], label="Select Columns")
+                        else:
+                            column_selector = gr.CheckboxGroup(choices=df.columns.tolist()[3:], value=configs.ON_LOAD_COLUMNS_LANG[3:], label="Select Columns")
+
+                    with gr.Column():
+                        with gr.Accordion("Details", open=False):
+                            model_group = df["Model Name"].tolist()
+                            model_selector = gr.CheckboxGroup(choices=df["Model Name"].tolist(), value=model_group, label="Select Models")
+
+                ld = gr.DataFrame(
+                    value=df.round(3)
+                )
+
+                # Define change functions for user interaction
+                search_box.change(fn=search_leaderboard, inputs=[search_box, df_state], outputs=ld)
+                group_selector.change(fn=update_modelselector_group, inputs=[group_selector, df_state], outputs=model_selector)
+                model_selector.change(fn=update_leaderboard, inputs=[model_selector, column_selector, df_state], outputs=ld)
+                column_selector.change(fn=update_leaderboard, inputs=[model_selector, column_selector, df_state], outputs=ld)
+
+        # with gr.TabItem("Docs"):
+        #     gr.Markdown((Path(__file__).parent / "docs.md").read_text())
+
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
@@ -103,23 +193,57 @@ with demo:
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION1, elem_classes="markdown-text")
+                with gr.Column():
+                    with gr.Accordion(
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            finished_eval_table = gr.components.Dataframe(
+                                value=finished_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+                    with gr.Accordion(
+                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            running_eval_table = gr.components.Dataframe(
+                                value=running_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
 
+                    with gr.Accordion(
+                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            pending_eval_table = gr.components.Dataframe(
+                                value=pending_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
             with gr.Row():
-                gr.Markdown("## ✉️✨ Submit your model here! (if vLLM inference is available)", elem_classes="markdown-text")
+                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
 
             with gr.Row():
                 with gr.Column():
-                    benchmark_type = gr.Dropdown(
-                        choices=["TRUEBench v0.1"],
-                        label="The name of the benchmark to be evaluated",
+                    model_name_textbox = gr.Textbox(label="Model name")
+                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                    model_type = gr.Dropdown(
+                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                        label="Model type",
                         multiselect=False,
-                        value="TRUEBench v0.1",
+                        value=None,
                         interactive=True,
                     )
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+
+                with gr.Column():
                     precision = gr.Dropdown(
                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
                         label="Precision",
@@ -127,118 +251,29 @@ with demo:
                         value="float16",
                         interactive=True,
                     )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-                    vllm_version_type = gr.Dropdown(
-                        choices=VLLM_VERSIONS,
-                        label="vLLM version",
+                    weight_type = gr.Dropdown(
+                        choices=[i.value.name for i in WeightType],
+                        label="Weights type",
                         multiselect=False,
-                        value="v0.9.2",
+                        value="Original",
                         interactive=True,
                     )
-                with gr.Column():
-                    temperature_textbox = gr.Textbox(label="Sampling Temperature (default: 1.0)", placeholder="1.0")
-                    top_p_textbox = gr.Textbox(label="Top-p (default: 1.0)", placeholder="1.0")
-                    top_k_textbox = gr.Textbox(label="Top-k (default: -1)", placeholder="-1")
-                    presence_penalty_textbox = gr.Textbox(label="Presence penalty (default: 0.0)", placeholder="0.0")
-                    frequency_penalty_textbox = gr.Textbox(label="Frequency penalty (default: 0.0)", placeholder="0.0")
-                    repetition_penalty_textbox = gr.Textbox(label="Repetition penalty (default: 1.0)", placeholder="1.0")
-
-            login_button = gr.LoginButton()
+                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
             submit_button = gr.Button("Submit Eval")
             submission_result = gr.Markdown()
-            event = submit_button.click(get_profile_and_organizations, inputs=[], outputs=[user_state, organization_state])
-            event.then(
-                add_new_eval_option1,
+            submit_button.click(
+                add_new_eval,
                 [
-                    benchmark_type,
                     model_name_textbox,
                     base_model_name_textbox,
                     revision_name_textbox,
                     precision,
-                    temperature_textbox,
-                    top_p_textbox,
-                    top_k_textbox,
-                    presence_penalty_textbox,
-                    frequency_penalty_textbox,
-                    repetition_penalty_textbox,
-                    vllm_version_type,
-                    user_state,
-                    organization_state
+                    weight_type,
+                    model_type,
                 ],
                 submission_result,
             )
-            with gr.Row():
-                gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION2, elem_classes="markdown-text")
-
-            with gr.Row():
-                gr.Markdown("## ✉️✨ Submit your model here! (if vLLM inference is unavailable)", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    benchmark_type2 = gr.Dropdown(
-                        choices=["TRUEBench v0.1"],
-                        label="The name of the benchmark to be evaluated",
-                        multiselect=False,
-                        value="TRUEBench v0.1",
-                        interactive=True,
-                    )
-                    model_name_textbox2 = gr.Textbox(label="Model name")
-                    revision_name_textbox2 = gr.Textbox(label="Revision commit", placeholder="main")
-                    precision2 = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    base_model_name_textbox2 = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-                with gr.Column():
-                    temperature_textbox2 = gr.Textbox(label="Sampling Temperature (default: 1.0)", placeholder="1.0")
-                    top_p_textbox2 = gr.Textbox(label="Top-p (default: 1.0)", placeholder="1.0")
-                    top_k_textbox2 = gr.Textbox(label="Top-k (default: -1)", placeholder="-1")
-                    presence_penalty_textbox2 = gr.Textbox(label="Presence penalty (default: 0.0)", placeholder="0.0")
-                    frequency_penalty_textbox2 = gr.Textbox(label="Frequency penalty (default: 0.0)", placeholder="0.0")
-                    repetition_penalty_textbox2 = gr.Textbox(label="Repetition penalty (default: 1.0)", placeholder="1.0")
-
-            with gr.Row():
-                with gr.Column():
-                    model_load_code_snippet_textbox = gr.Textbox(label="Code for model loading", lines=15, placeholder="model = AutoModel.from_pretrained('your model name', revision=revision)")
-                with gr.Column():
-                    inference_code_snippet_textbox = gr.Textbox(label="Code for inference", lines=15, placeholder="output = model(...)")
-                with gr.Column():
-                    terminate_code_snippet_textbox = gr.Textbox(label="Code for termination", lines=15)
-
-            login_button2 = gr.LoginButton()
-
-            submit_button2 = gr.Button("Submit Eval")
-            submission_result2 = gr.Markdown()
-            event2 = submit_button2.click(get_profile_and_organizations, inputs=[], outputs=[user_state, organization_state])
-            event2.then(
-                add_new_eval_option2,
-                [
-                    benchmark_type2,
-                    model_name_textbox2,
-                    base_model_name_textbox2,
-                    revision_name_textbox2,
-                    precision2,
-                    temperature_textbox2,
-                    top_p_textbox2,
-                    top_k_textbox2,
-                    presence_penalty_textbox2,
-                    frequency_penalty_textbox2,
-                    repetition_penalty_textbox2,
-                    model_load_code_snippet_textbox,
-                    inference_code_snippet_textbox,
-                    terminate_code_snippet_textbox,
-                    user_state,
-                    organization_state
-                ],
-                submission_result2,
-            )
-
-            with gr.Row():
-                gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION3, elem_classes="markdown-text")
 
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
@@ -250,8 +285,7 @@ with demo:
                 show_copy_button=True,
             )
 
-
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
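The inline `search_leaderboard` handler added above filters by a case-insensitive substring match across every column. A self-contained sketch of that behaviour on a toy frame (the rows are taken from the added CSVs):

```python
import pandas as pd

df = pd.DataFrame({
    "Model Name": ["claude-3-haiku-20240307", "deepseek_v3"],
    "Group": ["Claude", "DeepSeek"],
    "Overall": [40.60, 56.99],
})

query = "deepseek"
# Keep rows where any cell, rendered as a string, contains the query.
mask = df.apply(lambda row: row.astype(str).str.contains(query, case=False).any(), axis=1)
print(df[mask])  # only the deepseek_v3 row survives
```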
constants.py DELETED
@@ -1,36 +0,0 @@
-# constants.py
-
-TAB_KEYS = ["Category", "Language"]
-
-TAB_NAMES = {
-    "Category": "TRUEBench v0.1 (Category 🔧)",
-    "Language": "TRUEBench v0.1 (Language 🌎)"
-}
-
-VLLM_VERSIONS = [
-    "v0.9.2", "v0.9.2rc2", "v0.9.2rc1", "v0.9.1", "v0.9.1rc2", "v0.9.1rc1",
-    "v0.9.0.1", "v0.9.0", "v0.8.5", "v0.8.5.post1", "v0.8.4", "v0.8.3",
-    "v0.8.3rc1", "v0.8.2", "v0.8.1", "v0.8.0", "v0.8.0rc2", "v0.8.0rc1",
-    "v0.7.3", "v0.7.2", "v0.7.1", "v0.6.6", "v0.6.6.post1", "v0.6.5",
-    "v0.6.4.post1", "v0.6.4", "v0.6.3.post1", "v0.6.2", "v0.6.1",
-    "v0.6.1.post2", "v0.6.1.post1", "v0.6.0"
-]
-
-# Required leaderboard columns (must always be included)
-LEADERBOARD_REQUIRED_COLUMNS = [
-    "Model Name", "Group", "Overall", "Model Type", "Output Form", "Rank"
-]
-
-# Model badge mappings (centralized for both UI and backend)
-MODEL_TYPE_MAP = {
-    "deepseek_r1": "open",
-    "deepseek_r1_0528": "open",
-    "Qwen3-32B": "open",
-    "Gauss2.3-Think-250708": "closed"
-}
-OUTPUT_FORM_MAP = {
-    "deepseek_r1": "reasoning",
-    "deepseek_r1_0528": "normal",
-    "Qwen3-32B": "reasoning",
-    "Gauss2.3-Think-250708": "reasoning"
-}
handlers.py DELETED
@@ -1,86 +0,0 @@
-import pandas as pd
-
-def search_leaderboard(query, df, sort_col=None, sort_asc=True):
-    if not query.strip():
-        filtered = df
-    else:
-        filtered = df[df.apply(lambda row: row.astype(str).str.contains(query, case=False).any(), axis=1)]
-    if sort_col and sort_col in filtered.columns:
-        filtered = filtered.sort_values(sort_col, ascending=sort_asc).reset_index(drop=True)
-    return filtered
-
-def update_modelselector_group(groups, df):
-    """
-    groups (gr.CheckboxGroup): List of currently selected models
-    df (DataFrame or gr.State): Current dataframe
-    """
-    print("groups:", groups)
-    if not groups:
-        return None
-
-    filtered_df = df[df["Group"].isin(groups)]
-    models = filtered_df["Model Name"].unique().tolist()
-
-    return models
-
-def update_columnselector_group(columns, groups, df):
-    print("column groups:", groups)
-
-    columns = [c for c in columns if c in df.columns[:3]]
-
-    columns.extend(df.columns[3:])
-
-    print(columns)
-
-    return columns
-
-from constants import LEADERBOARD_REQUIRED_COLUMNS, MODEL_TYPE_MAP, OUTPUT_FORM_MAP
-
-def update_leaderboard(models, columns, df, sort_col=None, sort_asc=True):
-    print("models:", models)
-    print("columns:", columns)
-    print("sort_col:", sort_col, "sort_asc:", sort_asc)
-
-    # Always include the required columns
-    columns = list(dict.fromkeys(LEADERBOARD_REQUIRED_COLUMNS + list(columns)))
-
-    # Always include the columns needed to render badges and ranks
-    always_include = ["Model Name", "Model Type", "Output Form", "Rank"]
-    filtered_df = df[df["Model Name"].isin(models)].copy()
-
-    # Create the Model Type, Output Form, and Rank columns if they are missing
-    if "Model Type" not in filtered_df.columns:
-        filtered_df["Model Type"] = filtered_df["Model Name"].map(MODEL_TYPE_MAP).fillna("open")
-    if "Output Form" not in filtered_df.columns:
-        filtered_df["Output Form"] = filtered_df["Model Name"].map(OUTPUT_FORM_MAP).fillna("normal")
-    if "Rank" not in filtered_df.columns:
-        # Sort key: sort_col if present, otherwise Overall
-        rank_col = sort_col if sort_col and sort_col in filtered_df.columns else ("Overall" if "Overall" in filtered_df.columns else None)
-        if rank_col:
-            filtered_df = filtered_df.sort_values(rank_col, ascending=not sort_asc).reset_index(drop=True)
-            filtered_df["Rank"] = filtered_df.index + 1
-        else:
-            filtered_df["Rank"] = range(1, len(filtered_df) + 1)
-
-    # always_include columns are kept unconditionally
-    filtered_columns = [c for c in df.columns if c in columns or c in always_include]
-    for col in always_include:
-        if col not in filtered_columns:
-            filtered_columns.append(col)
-
-    # Deduplicate while preserving order
-    filtered_columns = list(dict.fromkeys(filtered_columns))
-    filtered_df = filtered_df[filtered_columns]
-
-    for col in filtered_df.select_dtypes(include="number").columns:
-        filtered_df[col] = filtered_df[col].round(3)
-
-    if sort_col and sort_col in filtered_df.columns:
-        filtered_df = filtered_df.sort_values(sort_col, ascending=sort_asc).reset_index(drop=True)
-        # Recompute Rank
-        filtered_df["Rank"] = filtered_df.index + 1
-
-    return filtered_df
-
-def get_models_by_group(df, groups):
-    return df[df["Group"].isin(groups)]["Model Name"].tolist()
src/about.py CHANGED
@@ -21,32 +21,23 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">🥇 Test Space</h1>"""
+TITLE = """<h1 align="center" id="space-title">🥇 ProductivityBench (v1)</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Leaderboards for LLM evaluation.
-
-*TRUE(Trustworthy Real-world Usage Evaluation)Bench* is designed to evaluate LLMs for Productivity Assistants which stand for human's job productivity.
+ProductivityBench is designed to evaluate LLMs as Productivity Assistants that support people's job productivity.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
-We utilize LLM Judge with human-crafted criteria to assess AI response.
+
+## Reproducibility
+To reproduce our results, here are the commands you can run:
+
 """
 
 EVALUATION_QUEUE_TEXT = """
-## Submission Policy
-For each benchmark:
-1. Each model affiliation (individual or organization) can submit up to 3 times within 24 hours.
-2. The same model can only be submitted once within 24 hours.
-3. Criteria for determining duplicate submissions:
-    - Benchmark name
-    - Model full name
-    - Sampling parameters, dtype, vLLM version, etc. are not subject to duplicate checking.
-4. Submissions are only allowed if the model's organization or username matches that of the submitter.
-
 ## Some good practices before submitting a model
 
 ### 1) Make sure you can load your model and tokenizer using AutoClasses:
@@ -69,50 +60,11 @@ This is a leaderboard for Open LLMs, and we'd love for as many people as possibl
 
 ### 4) Fill up your model card
 When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-"""
-
-EVALUATION_QUEUE_TEXT_OPTION1 = """
-# (Option 1) Submit HF model where vLLM inference is available
-1. Fill the information including model name, vLLM version, sampling hyperparameters.
-2. Sign in using the log-in button below.
-3. Press "Submit Eval" button to submit.
-"""
-
-EVALUATION_QUEUE_TEXT_OPTION2 = """
-# (Option 2) Submit HF model where vLLM inference is unavailable
-1. Fill the information same with Option 1 and code snippets of model loading, inference, and termination.
-2. Sign in using the log-in button below.
-3. Press "Submit Eval" button to submit.
-"""
-
-EVALUATION_QUEUE_TEXT_OPTION3 = """
-# (Option 3) Pull Request
-If Option 1 & 2 is unavailable, make [PR](https://huggingface.co/spaces/coms1580/test_space/discussions?new_pr=true) with [ADD_MODEL] prefix with contents as follows:
-
-```
-### Open-weight models:
-- Benchmark Name: [The name of benchmark to be evaluated]
-- HugingFace Model ID: [HF_MODEL_ID]
-- Pretty Name: [PRETTY_NAME]
-- Sampling parameters:
-    - Temperature
-    - Top-p
-    - Top-k
-    - Presence penalty
-    - Frequency penalty
-    - Repetition penalty
-- Supported by vLLM: [yes/no]
-    - (If yes) Version of vLLM
-    - (If no) Code snippets:
-        - Model loading
-        - Inference
-        - Termination
-
-### Misc.
-- Contact: [your email]
-- Description: [e.g., paper link, blog post, etc.]
-- Notes: [optional]
-```
+
+## In case of model failure
+If your model is displayed in the `FAILED` category, its execution stopped.
+Make sure you have followed the above steps first.
+If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
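The retained "good practices" section asks submitters to verify that their model loads with AutoClasses. A minimal check along those lines; the model id and revision are placeholders:

```python
from transformers import AutoConfig, AutoModel, AutoTokenizer

model_id = "org/model"  # placeholder
revision = "main"       # the commit you intend to submit

config = AutoConfig.from_pretrained(model_id, revision=revision)
model = AutoModel.from_pretrained(model_id, revision=revision)
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
```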
src/config.py CHANGED
@@ -24,9 +24,10 @@ ON_LOAD_COLUMNS_CATEGORY = [
     "Editing",
     "Data Analysis",
     "Reasoning",
+    "Samsung Knowledge",
     "Hallucination",
     "Safety",
-    "Repetition",
+    "Repeatition",
     "Summarization",
     "Translation",
     "Multi-Turn"
src/data/export_category_250618.csv ADDED
@@ -0,0 +1,11 @@
+"Model Name" "Group" "Overall" "Content Generation" "Editing" "Data Analysis" "Reasoning" "Samsung Knowledge" "Hallucination" "Safety" "Repeatition" "Summarization" "Translation" "Multi-Turn"
+"claude-3-haiku-20240307" "Claude" "40.60" "44.16" "36.90" "39.33" "21.00" "23.33" "43.33" "50.00" "30.00" "60.96" "40.00" "23.33"
+"claude-3-sonnet-20240229" "Claude" "44.47" "48.05" "42.26" "45.33" "32.00" "23.33" "45.00" "56.25" "36.67" "60.96" "46.33" "22.78"
+"claude-3-5-sonnet-20240620" "Claude" "56.35" "53.25" "54.17" "64.00" "49.00" "55.00" "60.00" "52.50" "40.00" "69.86" "58.67" "36.67"
+"claude-3-5-sonnet-20241022" "Claude" "58.45" "61.04" "55.36" "66.00" "54.00" "40.00" "63.33" "42.50" "40.00" "73.97" "62.33" "38.33"
+"claude-3-7-sonnet-20250219" "Claude" "56.99" "59.09" "59.52" "64.00" "54.00" "50.00" "65.00" "37.50" "50.00" "71.58" "55.33" "37.22"
+"claude-3-7-sonnet-20250219-thinking" "Claude" "58.70" "63.64" "58.33" "71.52" "68.00" "55.00" "62.71" "37.50" "50.00" "72.60" "55.00" "33.33"
+"deepseek_r1" "DeepSeek" "55.27" "61.69" "54.76" "68.67" "68.00" "46.67" "51.67" "20.00" "46.67" "67.81" "49.00" "43.33"
+"deepseek_r1_0528" "DeepSeek" "52.60" "59.09" "51.19" "65.33" "65.00" "38.33" "43.33" "27.50" "53.33" "69.18" "41.33" "41.67"
+"deepseek_v3" "DeepSeek" "56.99" "62.99" "58.93" "58.00" "59.00" "36.67" "41.67" "25.00" "40.00" "72.60" "60.00" "46.67"
+"deepseek_v3_0324" "DeepSeek" "54.51" "55.84" "48.21" "63.33" "70.00" "43.33" "50.00" "20.00" "46.67" "72.95" "49.67" "43.33"
src/data/export_category_250709.csv DELETED
Binary file (1.26 kB)
 
src/data/export_lang_250618.csv ADDED
@@ -0,0 +1,11 @@
+"Model Name" "Group" "Overall" "KO" "EN" "JA" "ZH" "PL" "DE" "PT" "ES" "FR" "IT" "RU" "VI"
+"claude-3-haiku-20240307" "Claude" "40.60" "31.87" "30.99" "41.54" "36.92" "52.24" "55.22" "56.72" "55.22" "68.66" "74.63" "50.75" "38.46"
+"claude-3-sonnet-20240229" "Claude" "44.47" "41.32" "33.19" "50.77" "38.46" "55.22" "52.24" "58.21" "61.19" "65.67" "67.16" "49.25" "44.62"
+"claude-3-5-sonnet-20240620" "Claude" "56.35" "55.60" "43.30" "73.85" "47.69" "64.18" "65.67" "70.15" "67.16" "76.12" "71.64" "65.67" "55.38"
+"claude-3-5-sonnet-20241022" "Claude" "58.45" "57.14" "47.91" "69.23" "49.23" "61.19" "62.69" "70.15" "71.64" "80.60" "73.13" "67.16" "60.00"
+"claude-3-7-sonnet-20250219" "Claude" "56.99" "55.82" "46.59" "63.08" "56.92" "68.66" "59.70" "64.18" "64.18" "74.63" "67.16" "64.18" "66.15"
+"claude-3-7-sonnet-20250219-thinking" "Claude" "58.70" "60.44" "50.11" "64.62" "44.62" "65.67" "67.16" "65.67" "50.75" "74.63" "70.15" "67.16" "63.08"
+"deepseek_r1" "DeepSeek" "55.27" "53.19" "50.99" "64.62" "44.62" "59.70" "64.18" "55.22" "58.21" "70.15" "67.16" "58.21" "53.85"
+"deepseek_r1_0528" "DeepSeek" "52.60" "48.79" "47.25" "58.46" "43.08" "52.24" "61.19" "68.66" "58.21" "62.69" "65.67" "61.19" "56.92"
+"deepseek_v3" "DeepSeek" "56.99" "53.41" "49.01" "66.15" "43.08" "59.70" "70.15" "67.16" "65.67" "79.10" "74.63" "58.21" "64.62"
+"deepseek_v3_0324" "DeepSeek" "54.51" "50.99" "49.67" "56.92" "43.08" "64.18" "68.66" "61.19" "56.72" "71.64" "62.69" "64.18" "52.31"
src/data/export_lang_250709.csv DELETED
Binary file (958 Bytes)
 
src/data_utils.py CHANGED
@@ -3,12 +3,12 @@ from pathlib import Path
 
 def get_dataframe_category():
     abs_path = Path(__file__).parent
-    df = pd.read_csv(str(abs_path / "data/export_category_250709.csv"), encoding='utf-16', delimiter=" ")
+    df = pd.read_csv(str(abs_path / "data/export_category_250618.csv"), encoding='utf-8', delimiter=" ")
     df = df.sort_values("Overall", ascending=False)
     return df
 
 def get_dataframe_language():
     abs_path = Path(__file__).parent
-    df = pd.read_csv(str(abs_path / "data/export_lang_250709.csv"), encoding='utf-16', delimiter=" ")
+    df = pd.read_csv(str(abs_path / "data/export_lang_250618.csv"), encoding='utf-8', delimiter=" ")
     df = df.sort_values("Overall", ascending=False)
     return df
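The loaders now expect UTF-8, space-delimited CSVs with quoted fields, matching the added export files. A self-contained sketch of that parsing, using inline data instead of the real export files (the real loaders additionally pass `encoding='utf-8'`):

```python
import io

import pandas as pd

csv_text = '"Model Name" "Group" "Overall"\n"deepseek_v3" "DeepSeek" "56.99"\n'
# delimiter=" " splits on spaces; the double quotes keep multi-word names intact.
df = pd.read_csv(io.StringIO(csv_text), delimiter=" ")
print(df.sort_values("Overall", ascending=False))
```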
src/display/css_html_js.py CHANGED
@@ -1,128 +1,5 @@
 custom_css = """
 
-/* Sort arrow/button styles */
-.sort-arrow, .sort-btn {
-    display: inline-flex;
-    align-items: center;
-    justify-content: center;
-    background: #23244a;
-    color: #ffd700 !important; /* always yellow */
-    border: 1.5px solid #ffd700; /* gold border */
-    border-radius: 6px;
-    font-size: 15px;
-    font-weight: 700;
-    margin-left: 6px;
-    margin-right: 2px;
-    padding: 2px 8px 2px 6px;
-    cursor: pointer;
-    transition: background 0.2s, color 0.2s, border 0.2s;
-    min-width: 28px;
-    min-height: 28px;
-    outline: none;
-}
-.sort-arrow.active, .sort-btn.active {
-    color: #ffd700 !important; /* gold */
-    border-color: #ffd700;
-    background: #1a237e;
-}
-.sort-arrow:hover, .sort-btn:hover {
-    background: #ffd700;
-    color: #23244a !important;
-    border-color: #ffd700;
-}
-.sort-arrow svg, .sort-btn svg {
-    margin-left: 2px;
-    margin-right: 0;
-    width: 1em;
-    height: 1em;
-    vertical-align: middle;
-}
-
-/* Enhanced leaderboard table styles */
-.pretty-leaderboard-table {
-    width: 100%;
-    border-collapse: separate;
-    border-spacing: 0;
-    background: rgba(30, 34, 54, 0.98);
-    border-radius: 16px;
-    box-shadow: 0 4px 24px 0 rgba(16, 152, 247, 0.10), 0 1.5px 6px 0 rgba(227, 84, 84, 0.08);
-    overflow: hidden;
-    margin-bottom: 24px;
-}
-.pretty-leaderboard-table th, .pretty-leaderboard-table td {
-    padding: 12px 16px;
-    text-align: left;
-    border-bottom: 1px solid #23244a;
-    font-size: 15px;
-}
-.pretty-leaderboard-table th {
-    background: linear-gradient(90deg, #23244a 0%, #1a237e 100%);
-    color: #F5F6F7;
-    font-weight: 700;
-    letter-spacing: 0.5px;
-    border-bottom: 2px solid #1098F7;
-}
-.pretty-leaderboard-table tr:nth-child(even) {
-    background: rgba(245, 246, 247, 0.03);
-}
-.pretty-leaderboard-table tr:hover {
-    background: rgba(16, 152, 247, 0.08);
-    transition: background 0.2s;
-}
-.pretty-leaderboard-table td {
-    color: #F5F6F7;
-    vertical-align: middle;
-}
-.pretty-leaderboard-table tr:last-child td {
-    border-bottom: none;
-}
-.pretty-leaderboard-table th:first-child, .pretty-leaderboard-table td:first-child {
-    border-top-left-radius: 16px;
-}
-.pretty-leaderboard-table th:last-child, .pretty-leaderboard-table td:last-child {
-    border-top-right-radius: 16px;
-}
-
-/* Enhanced score bar styles */
-.score-bar {
-    display: flex;
-    align-items: center;
-    gap: 12px;
-    width: 100%;
-}
-.score-bar-track {
-    flex-grow: 1;
-    height: 10px;
-    background: rgba(245, 246, 247, 0.12);
-    border-radius: 5px;
-    overflow: hidden;
-    max-width: 220px;
-    box-shadow: 0 1px 4px 0 rgba(16, 152, 247, 0.10);
-}
-.score-bar-fill {
-    height: 100%;
-    background: linear-gradient(90deg, #E35454, #1098F7);
-    border-radius: 5px;
-    transition: width 0.3s cubic-bezier(0.4,0,0.2,1);
-}
-.score-bar-value {
-    font-family: 'SF Mono', monospace;
-    font-weight: 600;
-    color: #F5F6F7;
-    min-width: 60px;
-    font-size: 14px;
-}
-
-body {
-    min-height: 100vh;
-    background: linear-gradient(135deg, #1a237e 0%, #311b92 100%);
-    background-image:
-        radial-gradient(rgba(255,255,255,0.12) 1.2px, transparent 1.2px),
-        radial-gradient(rgba(255,255,255,0.08) 1px, transparent 1px);
-    background-size: 40px 40px, 80px 80px;
-    background-position: 0 0, 20px 20px;
-}
-
 .markdown-text {
     font-size: 16px !important;
 }
@@ -145,15 +22,7 @@ body {
 }
 
 #leaderboard-table {
-    margin-top: 15px;
-    /* Space-themed background */
-    background: linear-gradient(135deg, #1a237e 0%, #311b92 100%);
-    position: relative;
-    background-image:
-        radial-gradient(rgba(255,255,255,0.15) 1.2px, transparent 1.2px),
-        radial-gradient(rgba(255,255,255,0.10) 1px, transparent 1px);
-    background-size: 40px 40px, 80px 80px;
-    background-position: 0 0, 20px 20px;
+    margin-top: 15px
 }
 
 #leaderboard-table-lite {
@@ -225,53 +94,6 @@ body {
 #box-filter > .form{
     border: 0
 }
-
-/* Model type and output form badge styles */
-.badge {
-    display: inline-block;
-    border-radius: 12px;
-    padding: 2px 10px;
-    font-size: 0.85em;
-    font-weight: 700;
-    margin-left: 6px;
-    box-shadow: 0 1px 4px rgba(0,0,0,0.10);
-    vertical-align: middle;
-}
-.badge-open {
-    background: linear-gradient(90deg, #2196f3, #21cbf3);
-    color: #fff;
-}
-.badge-closed {
-    background: linear-gradient(90deg, #757575, #bdbdbd);
-    color: #fff;
-}
-.badge-normal {
-    background: linear-gradient(90deg, #43a047, #66bb6a);
-    color: #fff;
-}
-.badge-reasoning {
-    background: linear-gradient(90deg, #8e24aa, #d500f9);
-    color: #fff;
-}
-
-/* Sort button styles */
-.sort-btn {
-    background: #23244a;
-    color: #F5F6F7;
-    border: 1px solid #1098F7;
-    border-radius: 6px;
-    font-size: 13px;
-    font-weight: 700;
-    margin-left: 4px;
-    margin-right: 2px;
-    padding: 2px 7px;
-    cursor: pointer;
-    transition: background 0.2s, color 0.2s;
-}
-.sort-btn:hover {
-    background: #1098F7;
-    color: #fff;
-}
 """
 
 get_window_url_params = """
@@ -281,105 +103,3 @@ get_window_url_params = """
     return url_params;
 }
 """
-
-def get_rank_badge(rank: int) -> str:
-    """
-    Returns HTML for a rank badge (1st, 2nd, 3rd) with appropriate styling.
-    """
-    badge_styles = {
-        1: ("1st", "linear-gradient(145deg, #ffd700, #ffc400)", "#000"),
-        2: ("2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff"),
-        3: ("3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff"),
-    }
-    if rank in badge_styles:
-        label, gradient, text_color = badge_styles[rank]
-        return f'''
-        <div style="
-            display: inline-flex;
-            align-items: center;
-            justify-content: center;
-            min-width: 48px;
-            padding: 4px 12px;
-            background: {gradient};
-            color: {text_color};
-            border-radius: 6px;
-            font-weight: 700;
-            font-size: 1em;
-            box-shadow: 0 2px 4px rgba(0,0,0,0.18);
-            border: 1.5px solid #fff2;
-        ">
-            {label}
-        </div>
-        '''
-    return f'''
-    <div style="
-        display: inline-flex;
-        align-items: center;
-        justify-content: center;
-        min-width: 28px;
-        color: #a1a1aa;
-        font-weight: 500;
-    ">
-        {rank}
-    </div>
-    '''
-
-def get_score_gauge(score: float, max_score: float = 1.0) -> str:
-    """
-    Returns HTML for an overall score gauge (progress bar style).
-    """
-    percent = min(max(score / max_score, 0), 1) * 100
-    return f'''
-    <div class="score-bar" style="margin: 0.5em 0;">
-        <div class="score-bar-track">
-            <div class="score-bar-fill" style="width: {percent}%;"></div>
-        </div>
-        <span class="score-bar-value">{score:.3f}</span>
-    </div>
-    '''
-
-def get_leaderboard_table_html(df) -> str:
-    """
-    Returns HTML for a pretty leaderboard table using badge and gauge.
-    Expects df to have columns: 'Model', 'Score', 'Model Type', 'Output Form'.
-    """
-    def get_type_badge(model_type):
-        if model_type == "open":
-            return '<span class="badge badge-open">open</span>'
-        else:
-            return '<span class="badge badge-closed">closed</span>'
-
-    def get_output_badge(output_form):
-        if output_form == "reasoning":
-            return '<span class="badge badge-reasoning">reasoning</span>'
-        else:
-            return '<span class="badge badge-normal">normal</span>'
-
-    html = ['<table class="pretty-leaderboard-table">']
-    # Header
-    html.append(
-        "<thead><tr>"
-        "<th>Rank</th>"
-        "<th>Model</th>"
-        "<th>Overall Score</th>"
-        "</tr></thead>"
-    )
-    html.append("<tbody>")
-    for idx, row in enumerate(df.itertuples(index=False), 1):
-        model = getattr(row, "Model", "")
-        score = getattr(row, "Score", 0.0)
-        model_type = getattr(row, "Model_Type", getattr(row, "Model Type", "open"))
-        output_form = getattr(row, "Output_Form", getattr(row, "Output Form", "normal"))
-        badge = get_rank_badge(idx)
-        gauge = get_score_gauge(score)
-        type_badge = get_type_badge(model_type)
-        output_badge = get_output_badge(output_form)
-        html.append(
-            f"<tr>"
-            f"<td>{badge}</td>"
-            f"<td>{model} {type_badge} {output_badge}</td>"
-            f"<td>{gauge}</td>"
-            f"</tr>"
-        )
-    html.append("</tbody></table>")
-    return "\n".join(html)
src/display/formatting.py CHANGED
@@ -25,128 +25,3 @@ def has_no_nan_values(df, columns):
 
 def has_nan_values(df, columns):
     return df[columns].isna().any(axis=1)
-
-def get_score_bar(score):
-    """
-    Generate HTML for a score bar with gradient styling.
-    Expects score in the range 0-100.
-    """
-    width = max(0, min(score, 100))  # Clamp to [0, 100]
-    return f"""
-    <div class="score-bar">
-        <div class="score-bar-track">
-            <div class="score-bar-fill" style="width: {width}%;"></div>
-        </div>
-        <span class="score-bar-value">{score:.3f}</span>
-    </div>
-    """
-
-def render_leaderboard_html(df, overall_col="average"):
-    """
-    Render a DataFrame as an HTML table, replacing the overall_col with a gauge bar.
-    """
-    from .formatting import get_score_bar
-    from src.display.css_html_js import get_rank_badge
-
-    def get_type_badge(model_type):
-        if model_type == "open":
-            return '<span class="badge badge-open">open</span>'
-        else:
-            return '<span class="badge badge-closed">closed</span>'
-
-    def get_output_badge(output_form):
-        if output_form == "reasoning":
-            return '<span class="badge badge-reasoning">reasoning</span>'
-        else:
-            return '<span class="badge badge-normal">normal</span>'
-
-    # Columns to hide
-    hidden_cols = ["Model", "Model Type", "Output Form", "Rank"]
-
-    # Build table header
-    def get_sort_arrow(col, sort_col, sort_asc):
-        # Show a sort button on every column except "Model Name" and "Group"
-        if col in {"Model Name", "Group"}:
-            return ""
-        # Only one button (▲ or ▼) is shown; clicking it flips asc
-        if col == sort_col:
-            # Flip the icon and data-asc depending on the current sort state
-            if sort_asc:
-                # Ascending state: ▼ icon, clicking sorts descending
-                svg = (
-                    '<svg width="14" height="14" viewBox="0 0 14 14" style="vertical-align:middle">'
-                    '<polygon points="3,5 11,5 7,11" fill="currentColor"/></svg>'
-                )
-                return (
-                    f'<span class="sort-arrow active" data-col="{col}" data-asc="false" aria-label="Sort descending">{svg}</span>'
-                )
-            else:
-                # Descending state: ▲ icon, clicking sorts ascending
-                svg = (
-                    '<svg width="14" height="14" viewBox="0 0 14 14" style="vertical-align:middle">'
-                    '<polygon points="7,3 11,9 3,9" fill="currentColor"/></svg>'
-                )
-                return (
-                    f'<span class="sort-arrow active" data-col="{col}" data-asc="true" aria-label="Sort ascending">{svg}</span>'
-                )
-        else:
-            # Column not currently sorted: ▲ (ascending) icon, clicking sorts ascending
-            svg = (
-                '<svg width="14" height="14" viewBox="0 0 14 14" style="vertical-align:middle">'
-                '<polygon points="7,3 11,9 3,9" fill="currentColor"/></svg>'
-            )
-            return (
-                f'<span class="sort-arrow" data-col="{col}" data-asc="true" aria-label="Sort ascending">{svg}</span>'
-            )
-
-    # Extract the sort state (passed via State, or defaults)
-    sort_col = getattr(df, "_sort_col", None) or (df.columns[0] if len(df.columns) > 0 else None)
-    sort_asc = getattr(df, "_sort_asc", None)
-    if sort_asc is None:
-        sort_asc = True
-
-    html = '<table class="pretty-leaderboard-table">\n<thead><tr>'
-    for col in df.columns:
-        if col in hidden_cols:
-            continue
-        html += f'<th>{col}{get_sort_arrow(col, sort_col, sort_asc)}</th>'
-    html += '</tr></thead>\n<tbody>\n'
-
-    # Build table rows
-    for idx, row in df.iterrows():
-        html += '<tr>'
-        for col in df.columns:
-            if col in hidden_cols:
-                continue
-            cell = row[col]
-            if col == overall_col:
-                try:
-                    cell_html = get_score_bar(float(cell))
-                except Exception:
-                    cell_html = str(cell)
-                html += f'<td>{cell_html}</td>'
-            elif col in ["Model Name"]:
-                # Highlight ranks 1-3, white from 4th place on, badges always shown
-                rank = row.get("Rank", None)
-                model_type = row.get("Model Type", None) or row.get("Model_Type", None)
-                output_form = row.get("Output Form", None) or row.get("Output_Form", None)
-                highlight_style = ""
-                if rank == 1 or rank == "1":
-                    highlight_style = "color: #ffd700; font-weight: bold; text-shadow: 0 0 4px #fff2;"
-                elif rank == 2 or rank == "2":
-                    highlight_style = "color: #b0b0b0; font-weight: bold;"
-                elif rank == 3 or rank == "3":
-                    highlight_style = "color: #cd7f32; font-weight: bold;"
-                else:
-                    highlight_style = "color: #fff; font-weight: 600;"
-                badge_html = ""
-                if model_type:
-                    badge_html += " " + get_type_badge(model_type)
-                if output_form:
-                    badge_html += " " + get_output_badge(output_form)
-                html += f'<td><span style="{highlight_style}">{cell}</span>{badge_html}</td>'
-            else:
-                html += f'<td>{cell}</td>'
-        html += '</tr>\n'
-    html += '</tbody></table>'
-    return html
src/display/utils.py CHANGED
@@ -21,26 +21,24 @@ class ColumnContent:
     never_hidden: bool = False
 
 ## Leaderboard columns
-from dataclasses import field
-
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, field(default_factory=lambda: ColumnContent("T", "str", True, never_hidden=True))])
-auto_eval_column_dict.append(["model", ColumnContent, field(default_factory=lambda: ColumnContent("Model", "markdown", True, never_hidden=True))])
-# Scores
-auto_eval_column_dict.append(["average", ColumnContent, field(default_factory=lambda: ColumnContent("Average ⬆️", "number", True))])
+auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+#Scores
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda t=task: ColumnContent(t.value.col_name, "number", True))])
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, field(default_factory=lambda: ColumnContent("Type", "str", False))])
-auto_eval_column_dict.append(["architecture", ColumnContent, field(default_factory=lambda: ColumnContent("Architecture", "str", False))])
-auto_eval_column_dict.append(["weight_type", ColumnContent, field(default_factory=lambda: ColumnContent("Weight type", "str", False, True))])
-auto_eval_column_dict.append(["precision", ColumnContent, field(default_factory=lambda: ColumnContent("Precision", "str", False))])
-auto_eval_column_dict.append(["license", ColumnContent, field(default_factory=lambda: ColumnContent("Hub License", "str", False))])
-auto_eval_column_dict.append(["params", ColumnContent, field(default_factory=lambda: ColumnContent("#Params (B)", "number", False))])
-auto_eval_column_dict.append(["likes", ColumnContent, field(default_factory=lambda: ColumnContent("Hub ❤️", "number", False))])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, field(default_factory=lambda: ColumnContent("Available on the hub", "bool", False))])
-auto_eval_column_dict.append(["revision", ColumnContent, field(default_factory=lambda: ColumnContent("Model sha", "str", False, False))])
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -93,8 +91,6 @@ class WeightType(Enum):
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
-    fp8 = ModelDetails("fp8")
-    int4 = ModelDetails("int4")
     Unknown = ModelDetails("?")
 
     def from_str(precision):
@@ -102,10 +98,6 @@ class Precision(Enum):
         return Precision.float16
     if precision in ["torch.bfloat16", "bfloat16"]:
         return Precision.bfloat16
-    if precision == "fp8":
-        return Precision.fp8
-    if precision == "int4":
-        return Precision.int4
     return Precision.Unknown
 
 # Column selection
@@ -115,3 +107,4 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+
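The revert from `field(default_factory=...)` to plain `ColumnContent` instances relies on `make_dataclass` accepting `(name, type, default)` triples, where a hashable third element simply becomes the field's default. A standalone illustration of the same pattern, with placeholder field names:

```python
from dataclasses import make_dataclass

# (name, type, default) triples, mirroring auto_eval_column_dict above.
Columns = make_dataclass(
    "Columns",
    [("model", str, "Model"), ("average", str, "Average")],
    frozen=True,
)
print(Columns().model, Columns().average)  # Model Average
```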
src/envs.py CHANGED
@@ -6,10 +6,10 @@ from huggingface_hub import HfApi
6
  # ----------------------------------
7
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
- OWNER = "coms1580" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
12
- REPO_ID = f"{OWNER}/test_space"
13
  QUEUE_REPO = f"{OWNER}/requests"
14
  RESULTS_REPO = f"{OWNER}/results"
15
 
 
6
  # ----------------------------------
7
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
+ OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
12
+ REPO_ID = f"{OWNER}/leaderboard"
13
  QUEUE_REPO = f"{OWNER}/requests"
14
  RESULTS_REPO = f"{OWNER}/results"
15
 
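After switching `OWNER` to your own org, both companion datasets have to exist or the Space will fail on startup. A hypothetical sanity check using the stock `HfApi.repo_exists` helper (the repo names simply mirror `QUEUE_REPO` and `RESULTS_REPO` above):

```python
import os
from huggingface_hub import HfApi

OWNER = "demo-leaderboard-backend"
api = HfApi(token=os.environ.get("HF_TOKEN"))

# Both repos must be dataset repos with the layout described in the README.
for repo_id in (f"{OWNER}/requests", f"{OWNER}/results"):
    if not api.repo_exists(repo_id, repo_type="dataset"):
        raise RuntimeError(f"Missing dataset repo: {repo_id}")
```
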
src/submission/check_validity.py CHANGED
@@ -88,13 +88,12 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
88
  continue
89
  with open(os.path.join(root, file), "r") as f:
90
  info = json.load(f)
91
-
92
- file_names.append(f"{info['benchmark']}_{info['model']}")
93
 
94
  # Select organisation
95
  if info["model"].count("/") == 0 or "submitted_time" not in info:
96
  continue
97
  organisation, _ = info["model"].split("/")
98
- users_to_submission_dates[organisation].extend([{"benchmark": info['benchmark'], "model": info["model"], "submitted_time": info["submitted_time"]}])
99
 
100
  return set(file_names), users_to_submission_dates
 
88
  continue
89
  with open(os.path.join(root, file), "r") as f:
90
  info = json.load(f)
91
+ file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
 
92
 
93
  # Select organisation
94
  if info["model"].count("/") == 0 or "submitted_time" not in info:
95
  continue
96
  organisation, _ = info["model"].split("/")
97
+ users_to_submission_dates[organisation].append(info["submitted_time"])
98
 
99
  return set(file_names), users_to_submission_dates
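
The rewritten helper deduplicates on model, revision, and precision instead of benchmark and model, and flattens the per-organisation history to plain timestamps. A small illustration of the key it builds (the JSON payload is an invented example that follows the template's `eval_entry` fields):

```python
import json

# Invented request-file payload following the template's eval_entry fields.
raw = '{"model": "my-org/my-model", "revision": "main", "precision": "float16", "submitted_time": "2024-01-01T00:00:00Z"}'
info = json.loads(raw)

# Key used for duplicate detection.
dedup_key = f"{info['model']}_{info['revision']}_{info['precision']}"
print(dedup_key)  # my-org/my-model_main_float16

# Per-organisation history is now just a list of timestamps.
organisation, _ = info["model"].split("/")
print(organisation, info["submitted_time"])  # my-org 2024-01-01T00:00:00Z
```
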
src/submission/submit.py CHANGED
@@ -1,7 +1,7 @@
1
  import json
2
  import os
3
  from datetime import datetime, timezone
4
- from typing import Optional
5
  from src.display.formatting import styled_error, styled_message, styled_warning
6
  from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
  from src.submission.check_validity import (
@@ -10,26 +10,17 @@ from src.submission.check_validity import (
10
  get_model_size,
11
  is_model_on_hub,
12
  )
13
- import gradio as gr
14
 
15
  REQUESTED_MODELS = None
16
  USERS_TO_SUBMISSION_DATES = None
17
 
18
- def add_new_eval_option1(
19
- benchmark: str,
20
  model: str,
21
  base_model: str,
22
  revision: str,
23
  precision: str,
24
- temperature: str,
25
- top_p: str,
26
- top_k: str,
27
- presence_penalty: str,
28
- frequency_penalty: str,
29
- repetition_penalty: str,
30
- vllm_version: str,
31
- user_state: str,
32
- organization_list: list
33
  ):
34
  global REQUESTED_MODELS
35
  global USERS_TO_SUBMISSION_DATES
@@ -43,174 +34,25 @@ def add_new_eval_option1(
43
  model_path = model.split("/")[1]
44
 
45
  precision = precision.split(" ")[0]
46
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S %z")
47
 
48
- # Check submitter qualification
49
- if user_name != user_state and user_name not in organization_list:
50
- return styled_error("The submitter does not have submission rights for this model.")
51
-
52
- # Does the organization submit more than three times in a day?
53
- submission_times = [item['submitted_time'] for item in USERS_TO_SUBMISSION_DATES[user_name] if item['benchmark'] == benchmark]
54
- submission_cnt = 0
55
- for i in range(len(submission_times)):
56
- hours_diff = (datetime.strptime(current_time, "%Y-%m-%dT%H:%M:%S %z") - datetime.strptime(submission_times[i], "%Y-%m-%dT%H:%M:%S %z")).total_seconds() / 3600
57
- if hours_diff <= 24:
58
- submission_cnt += 1
59
- if submission_cnt > 3:
60
- return styled_error("The organization already submitted three times for this benchmark today.")
61
 
62
  # Does the model actually exist?
63
  if revision == "":
64
  revision = "main"
65
 
66
- # Is the model info correctly filled?
67
- try:
68
- model_info = API.model_info(repo_id=model, revision=revision)
69
- except Exception:
70
- return styled_error("Could not get your model information. Please fill it up properly.")
71
-
72
- model_size = get_model_size(model_info=model_info, precision=precision)
73
-
74
- # Were the model card and license filled?
75
- try:
76
- license = model_info.cardData["license"]
77
- except Exception:
78
- return styled_error("Please select a license for your model.")
79
-
80
- modelcard_OK, error_msg = check_model_card(model)
81
- if not modelcard_OK:
82
- return styled_error(error_msg)
83
-
84
- if temperature == "":
85
- temperature = "1.0"
86
-
87
- if top_p == "":
88
- top_p = "1.0"
89
-
90
- if top_k == "":
91
- top_k = "-1"
92
-
93
- if presence_penalty == "":
94
- presence_penalty = "0.0"
95
-
96
- if frequency_penalty == "":
97
- frequency_penalty = "0.0"
98
-
99
- if repetition_penalty == "":
100
- repetition_penalty = "1.0"
101
-
102
- # Seems good, creating the eval
103
- print("Adding new eval")
104
-
105
- eval_entry = {
106
- "benchmark": benchmark,
107
- "model": model,
108
- "base_model": base_model,
109
- "revision": revision,
110
- "precision": precision,
111
- "status": "PENDING",
112
- "submitted_time": current_time,
113
- "likes": model_info.likes,
114
- "params": model_size,
115
- "license": license,
116
- "private": False,
117
- "temperature": float(temperature),
118
- "top_p": float(top_p),
119
- "top_k": float(top_k),
120
- "vllm_version": vllm_version,
121
- "presence_penalty": float(presence_penalty),
122
- "frequency_penalty": float(frequency_penalty),
123
- "repetition_penalty": float(repetition_penalty),
124
- "load_model_code": "None",
125
- "inference_code": "None",
126
- "termination_code": "None",
127
- }
128
-
129
- # Check for duplicate submission
130
- submission_times = [item['submitted_time'] for item in USERS_TO_SUBMISSION_DATES[user_name] if item['benchmark'] == benchmark and item['model'] == model]
131
- submission_cnt = 0
132
- for i in range(len(submission_times)):
133
- hours_diff = (datetime.strptime(current_time, "%Y-%m-%dT%H:%M:%S %z") - datetime.strptime(submission_times[i], "%Y-%m-%dT%H:%M:%S %z")).total_seconds() / 3600
134
- if hours_diff <= 24:
135
- submission_cnt += 1
136
- if submission_cnt > 1:
137
- return styled_warning("This model has been already submitted within 24 hours.")
138
-
139
- print("Creating eval file")
140
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
141
- os.makedirs(OUT_DIR, exist_ok=True)
142
- out_path = f"{OUT_DIR}/{benchmark}_{model_path}_eval_request_False.json"
143
 
144
- with open(out_path, "w") as f:
145
- f.write(json.dumps(eval_entry))
146
-
147
- print("Uploading eval file")
148
- API.upload_file(
149
- path_or_fileobj=out_path,
150
- path_in_repo=out_path.split("eval-queue/")[1],
151
- repo_id=QUEUE_REPO,
152
- repo_type="dataset",
153
- commit_message=f"Add {model} to eval queue",
154
- )
155
-
156
- # Remove the local file
157
- os.remove(out_path)
158
-
159
- return styled_message(
160
- "Your request has been submitted to the evaluation queue!"
161
-
162
- )
163
-
164
-
165
- def add_new_eval_option2(
166
- benchmark: str,
167
- model: str,
168
- base_model: str,
169
- revision: str,
170
- precision: str,
171
- temperature: str,
172
- top_p: str,
173
- top_k: str,
174
- presence_penalty: str,
175
- frequency_penalty: str,
176
- repetition_penalty: str,
177
- load_model_code: str,
178
- inference_code: str,
179
- termination_code: str,
180
- user_state: str,
181
- organization_list: list
182
- ):
183
- global REQUESTED_MODELS
184
- global USERS_TO_SUBMISSION_DATES
185
- if not REQUESTED_MODELS:
186
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
187
-
188
- user_name = ""
189
- model_path = model
190
- if "/" in model:
191
- user_name = model.split("/")[0]
192
- model_path = model.split("/")[1]
193
-
194
- precision = precision.split(" ")[0]
195
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S %z")
196
-
197
- # Check submitter qualification
198
- if user_name != user_state and user_name not in organization_list:
199
- return styled_error("The submitter does not have submission rights for this model.")
200
-
201
- # Does the organization submit more than three times in a day?
202
- submission_times = [item['submitted_time'] for item in USERS_TO_SUBMISSION_DATES[user_name] if item['benchmark'] == benchmark]
203
- submission_cnt = 0
204
- for i in range(len(submission_times)):
205
- hours_diff = (datetime.strptime(current_time, "%Y-%m-%dT%H:%M:%S %z") - datetime.strptime(submission_times[i], "%Y-%m-%dT%H:%M:%S %z")).total_seconds() / 3600
206
- if hours_diff <= 24:
207
- submission_cnt += 1
208
- if submission_cnt > 3:
209
- return styled_error("The organization already submitted three times for this benchmark today.")
210
-
211
- # Does the model actually exist?
212
- if revision == "":
213
- revision = "main"
214
 
215
  # Is the model info correctly filled?
216
  try:
@@ -224,71 +66,38 @@ def add_new_eval_option2(
224
  try:
225
  license = model_info.cardData["license"]
226
  except Exception:
227
- return styled_error("Please select a license for your model.")
228
 
229
  modelcard_OK, error_msg = check_model_card(model)
230
  if not modelcard_OK:
231
  return styled_error(error_msg)
232
 
233
- if temperature == "":
234
- temperature = "1.0"
235
-
236
- if top_p == "":
237
- top_p = "1.0"
238
-
239
- if top_k == "":
240
- top_k = "-1"
241
-
242
- if presence_penalty == "":
243
- presence_penalty = "0.0"
244
-
245
- if frequency_penalty == "":
246
- frequency_penalty = "0.0"
247
-
248
- if repetition_penalty == "":
249
- repetition_penalty = "1.0"
250
-
251
  # Seems good, creating the eval
252
  print("Adding new eval")
253
 
254
  eval_entry = {
255
- "benchmark": benchmark,
256
  "model": model,
257
  "base_model": base_model,
258
  "revision": revision,
259
  "precision": precision,
 
260
  "status": "PENDING",
261
  "submitted_time": current_time,
 
262
  "likes": model_info.likes,
263
  "params": model_size,
264
  "license": license,
265
  "private": False,
266
- "temperature": float(temperature),
267
- "top_p": float(top_p),
268
- "top_k": float(top_k),
269
- "vllm_version": "None",
270
- "presence_penalty": float(presence_penalty),
271
- "frequency_penalty": float(frequency_penalty),
272
- "repetition_penalty": float(repetition_penalty),
273
- "load_model_code": load_model_code,
274
- "inference_code": inference_code,
275
- "termination_code": termination_code
276
  }
277
-
278
  # Check for duplicate submission
279
- submission_times = [item['submitted_time'] for item in USERS_TO_SUBMISSION_DATES[user_name] if item['benchmark'] == benchmark and item['model'] == model]
280
- submission_cnt = 0
281
- for i in range(len(submission_times)):
282
- hours_diff = (datetime.strptime(current_time, "%Y-%m-%dT%H:%M:%S %z") - datetime.strptime(submission_times[i], "%Y-%m-%dT%H:%M:%S %z")).total_seconds() / 3600
283
- if hours_diff <= 24:
284
- submission_cnt += 1
285
- if submission_cnt > 1:
286
- return styled_warning("This model has been already submitted within 24 hours.")
287
 
288
  print("Creating eval file")
289
  OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
290
  os.makedirs(OUT_DIR, exist_ok=True)
291
- out_path = f"{OUT_DIR}/{benchmark}_{model_path}_eval_request_False.json"
292
 
293
  with open(out_path, "w") as f:
294
  f.write(json.dumps(eval_entry))
@@ -306,5 +115,5 @@ def add_new_eval_option2(
306
  os.remove(out_path)
307
 
308
  return styled_message(
309
- "Your request has been submitted to the evaluation queue!"
310
  )
 
1
  import json
2
  import os
3
  from datetime import datetime, timezone
4
+
5
  from src.display.formatting import styled_error, styled_message, styled_warning
6
  from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
  from src.submission.check_validity import (
 
10
  get_model_size,
11
  is_model_on_hub,
12
  )
 
13
 
14
  REQUESTED_MODELS = None
15
  USERS_TO_SUBMISSION_DATES = None
16
 
17
+ def add_new_eval(
 
18
  model: str,
19
  base_model: str,
20
  revision: str,
21
  precision: str,
22
+ weight_type: str,
23
+ model_type: str,
24
  ):
25
  global REQUESTED_MODELS
26
  global USERS_TO_SUBMISSION_DATES
 
34
  model_path = model.split("/")[1]
35
 
36
  precision = precision.split(" ")[0]
37
+ current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
 
39
+ if model_type is None or model_type == "":
40
+ return styled_error("Please select a model type.")
41
 
42
  # Does the model actually exist?
43
  if revision == "":
44
  revision = "main"
45
 
46
+ # Is the model on the hub?
47
+ if weight_type in ["Delta", "Adapter"]:
48
+ base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
+ if not base_model_on_hub:
50
+ return styled_error(f'Base model "{base_model}" {error}')
51
 
52
+ if not weight_type == "Adapter":
53
+ model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
+ if not model_on_hub:
55
+ return styled_error(f'Model "{model}" {error}')
56
 
57
  # Is the model info correctly filled?
58
  try:
 
66
  try:
67
  license = model_info.cardData["license"]
68
  except Exception:
69
+ return styled_error("Please select a license for your model")
70
 
71
  modelcard_OK, error_msg = check_model_card(model)
72
  if not modelcard_OK:
73
  return styled_error(error_msg)
74
 
75
  # Seems good, creating the eval
76
  print("Adding new eval")
77
 
78
  eval_entry = {
 
79
  "model": model,
80
  "base_model": base_model,
81
  "revision": revision,
82
  "precision": precision,
83
+ "weight_type": weight_type,
84
  "status": "PENDING",
85
  "submitted_time": current_time,
86
+ "model_type": model_type,
87
  "likes": model_info.likes,
88
  "params": model_size,
89
  "license": license,
90
  "private": False,
91
  }
92
+
93
  # Check for duplicate submission
94
+ if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
+ return styled_warning("This model has been already submitted.")
96
 
97
  print("Creating eval file")
98
  OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
  os.makedirs(OUT_DIR, exist_ok=True)
100
+ out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
 
102
  with open(out_path, "w") as f:
103
  f.write(json.dumps(eval_entry))
 
115
  os.remove(out_path)
116
 
117
  return styled_message(
118
+ "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
  )
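
For reference, the request-file path assembled above expands as follows; all values are illustrative:

```python
# Illustrative inputs; EVAL_REQUESTS_PATH is the local clone of QUEUE_REPO.
EVAL_REQUESTS_PATH = "eval-queue"
user_name, model_path = "my-org", "my-model"
precision, weight_type = "float16", "Original"

OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
print(out_path)  # eval-queue/my-org/my-model_eval_request_False_float16_Original.json
```
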
ui.py DELETED
@@ -1,228 +0,0 @@
1
- import gradio as gr
2
- import src.config as configs
3
- from constants import TAB_NAMES, MODEL_TYPE_MAP, OUTPUT_FORM_MAP
4
- from src.display.formatting import render_leaderboard_html
5
- from src.display.css_html_js import get_leaderboard_table_html, custom_css
6
- import pandas as pd
7
- from constants import LEADERBOARD_REQUIRED_COLUMNS
8
-
9
- def render_pretty_leaderboard_html(df):
10
- """
11
- Renders a pretty leaderboard table using badge and gauge.
12
- Supports both ['Model', 'Score'] and ['Model Name', 'Overall'] columns.
13
- Sorts by score descending and rounds for display.
14
- """
15
- # Flexible column mapping
16
- col_map = {}
17
- if "Model" in df.columns:
18
- col_map["Model"] = "Model"
19
- elif "Model Name" in df.columns:
20
- col_map["Model"] = "Model Name"
21
- else:
22
- return "<div style='color:red'>DataFrame must have a 'Model' or 'Model Name' column.</div>"
23
- if "Score" in df.columns:
24
- col_map["Score"] = "Score"
25
- elif "Overall" in df.columns:
26
- col_map["Score"] = "Overall"
27
- else:
28
- return "<div style='color:red'>DataFrame must have a 'Score' or 'Overall' column.</div>"
29
-
30
- # Example mappings for demonstration (expand as needed)
31
- model_type_map = MODEL_TYPE_MAP
32
- output_form_map = OUTPUT_FORM_MAP
33
-
34
- # Copy and rename for uniformity
35
- df2 = df.copy()
36
- df2 = df2.rename(columns={col_map["Model"]: "Model", col_map["Score"]: "Score"})
37
-
38
- # Print model names missing from the mappings (for debugging)
39
- missing_type = set(df2["Model"]) - set(model_type_map.keys())
40
- missing_output = set(df2["Model"]) - set(output_form_map.keys())
41
- if missing_type:
42
- print("Model Type 매핑 누락:", missing_type)
43
- if missing_output:
44
- print("Output Form 매핑 누락:", missing_output)
45
-
46
- # Add badge columns
47
- df2["Model Type"] = df2["Model"].map(model_type_map).fillna("open")
48
- df2["Output Form"] = df2["Model"].map(output_form_map).fillna("normal")
49
- # Drop NA, sort, round
50
- df2 = df2[["Model", "Score", "Model Type", "Output Form"]].dropna()
51
- df2["Score"] = pd.to_numeric(df2["Score"], errors="coerce").round(2)
52
- df2 = df2.sort_values("Score", ascending=False).reset_index(drop=True)
53
-
54
- return get_leaderboard_table_html(df2)
55
-
56
- def create_leaderboard_tab(df, key, search_leaderboard, update_modelselector_group, update_leaderboard, column_selector_value):
57
- """
58
- df: DataFrame to display
59
- key: "Category" or "Language"
60
- search_leaderboard, update_modelselector_group, update_leaderboard: handler functions
61
- column_selector_value: default columns to select
62
- """
63
- with gr.TabItem(
64
- TAB_NAMES[key],
65
- visible=True
66
- ):
67
- df_state = gr.State(df)
68
-
69
- with gr.Row():
70
- with gr.Column():
71
- search_box = gr.Textbox(label="Search Model by Name")
72
- group_list = df["Group"].unique().tolist()
73
- group_selector = gr.CheckboxGroup(
74
- choices=df["Group"].unique().tolist(),
75
- value=group_list,
76
- label="Select Model Group"
77
- )
78
- # Required columns are always included and cannot be unchecked (disabled)
79
- # Exclude "Model Name", "Group", "Overall" from the choices
80
- exclude_cols = {"Model Name", "Group", "Overall"}
81
- selectable_columns = [col for col in df.columns.tolist()[3:] if col not in exclude_cols]
82
- all_columns = list(dict.fromkeys(LEADERBOARD_REQUIRED_COLUMNS + selectable_columns))
83
- column_selector = gr.CheckboxGroup(
84
- choices=selectable_columns,
85
- value=[col for col in column_selector_value if col in selectable_columns],
86
- label="Select Columns"
87
- )
88
-
89
- with gr.Column():
90
- with gr.Accordion("Model List", open=False):
91
- model_group = df["Model Name"].tolist()
92
- model_selector = gr.CheckboxGroup(
93
- choices=df["Model Name"].tolist(),
94
- value=model_group,
95
- label="Select Models"
96
- )
97
-
98
- # Build a DataFrame with badge info (for the upper table)
99
- df_badge = df.copy()
100
- # Unify the model column name to "Model"
101
- if "Model Name" in df_badge.columns:
102
- df_badge["Model"] = df_badge["Model Name"]
103
- # Example mappings (extend the same way as below)
104
- model_type_map = MODEL_TYPE_MAP
105
- output_form_map = OUTPUT_FORM_MAP
106
- df_badge["Model Type"] = df_badge["Model"].map(model_type_map).fillna("open")
107
- df_badge["Output Form"] = df_badge["Model"].map(output_form_map).fillna("normal")
108
- df_badge = df_badge.sort_values("Overall" if "Overall" in df_badge.columns else "Score", ascending=False).reset_index(drop=True)
109
- df_badge["Rank"] = df_badge.index + 1
110
-
111
- # State for tracking the sort status (created once, reused afterwards)
112
- default_sort_col = "Overall" if "Overall" in df_badge.columns else "Score"
113
- sort_col_state = gr.State(default_sort_col)
114
- sort_asc_state = gr.State(False) # descending is the default
115
-
116
- # Sort function (uses the asc value passed from JS as-is)
117
- def sort_and_render(col, asc, models, columns, df_):
118
- print(f"[sort_and_render] called: col={col}, asc={asc}, models={models}, columns={columns}")
119
- filtered_df = update_leaderboard(models, columns, df_, col, asc)
120
- # Stash the sort state on the DataFrame so the header reflects it
121
- filtered_df._sort_col = col
122
- filtered_df._sort_asc = asc
123
- return render_leaderboard_html(filtered_df.round(3)), col, asc
124
-
125
- leaderboard_html = render_leaderboard_html(df_badge.round(3))
126
- leaderboard_html_comp = gr.HTML(value=leaderboard_html, elem_id="leaderboard-table")
127
-
128
- # Add a hidden textbox used as the sort trigger
129
- sort_trigger = gr.Textbox(visible=False, elem_id="sort-leaderboard-trigger")
130
-
131
- # Inject JS that writes a fresh value on every sort-arrow click (including the sort direction)
132
- sort_js = """
133
- <script>
134
- (function() {
135
- document.addEventListener('DOMContentLoaded', function() {
136
- const table = document.getElementById('leaderboard-table');
137
- if (!table) return;
138
- table.addEventListener('click', function(e) {
139
- const arrow = e.target.closest('.sort-arrow');
140
- if (arrow) {
141
- const col = arrow.getAttribute('data-col');
142
- const asc = arrow.getAttribute('data-asc');
143
- // Always write a new value to force a change event
144
- const trigger = document.querySelector('#sort-leaderboard-trigger input');
145
- if (trigger) {
146
- trigger.value = col + '|' + asc + '|' + Date.now();
147
- trigger.dispatchEvent(new Event('input', { bubbles: true }));
148
- trigger.dispatchEvent(new Event('change', { bubbles: true }));
149
- }
150
- }
151
- });
152
- });
153
- })();
154
- </script>
155
- """
156
- # Wiring so that sort-arrow clicks also call update_leaderboard
157
- def sort_trigger_change(col_val, models, columns, df_, prev_col, prev_asc):
158
- print(f"[sort_trigger.change] col_val={col_val}, prev_col={prev_col}, prev_asc={prev_asc}")
159
- col, asc = col_val.split('|')[0], col_val.split('|')[1].lower() == "true"
160
- return sort_and_render(col, asc, models, columns, df_)
161
-
162
- sort_trigger.change(
163
- fn=sort_trigger_change,
164
- inputs=[sort_trigger, model_selector, column_selector, df_state, sort_col_state, sort_asc_state],
165
- outputs=[leaderboard_html_comp, sort_col_state, sort_asc_state]
166
- )
167
-
168
- # Inject the custom JS into the top table
169
- leaderboard_html_comp.style = None # gr.HTML has no style parameter, so it is injected below
170
- leaderboard_html_comp.value += sort_js
171
-
172
- # Pretty leaderboard preview (uses only 'Model' and 'Score' columns)
173
- pretty_html = gr.HTML(value=render_pretty_leaderboard_html(df.round(3)))
174
-
175
- # Define change functions for user interaction
176
- # Every UI event refreshes via update_leaderboard → sort_and_render → render_leaderboard_html
177
- def filter_and_sort_search(query, df, sort_col, sort_asc):
178
- print(f"[filter_and_sort_search] sort_col={sort_col}, sort_asc={sort_asc}")
179
- filtered_df = search_leaderboard(query, df, sort_col, sort_asc)
180
- # Stash the sort state on the DataFrame so the header reflects it
181
- filtered_df._sort_col = sort_col
182
- filtered_df._sort_asc = sort_asc
183
- return render_leaderboard_html(filtered_df), sort_col, sort_asc
184
-
185
- def filter_and_sort_model(models, columns, df, sort_col, sort_asc):
186
- print(f"[filter_and_sort_model] sort_col={sort_col}, sort_asc={sort_asc}")
187
- filtered_df = update_leaderboard(models, columns, df, sort_col, sort_asc)
188
- filtered_df._sort_col = sort_col
189
- filtered_df._sort_asc = sort_asc
190
- return render_leaderboard_html(filtered_df), sort_col, sort_asc
191
-
192
- def filter_and_sort_column(models, columns, df, sort_col, sort_asc):
193
- print(f"[filter_and_sort_column] sort_col={sort_col}, sort_asc={sort_asc}")
194
- filtered_df = update_leaderboard(models, columns, df, sort_col, sort_asc)
195
- filtered_df._sort_col = sort_col
196
- filtered_df._sort_asc = sort_asc
197
- return render_leaderboard_html(filtered_df), sort_col, sort_asc
198
-
199
- search_box.change(
200
- fn=filter_and_sort_search,
201
- inputs=[search_box, df_state, sort_col_state, sort_asc_state],
202
- outputs=[leaderboard_html_comp, sort_col_state, sort_asc_state]
203
- )
204
-
205
- group_selector.change(fn=update_modelselector_group, inputs=[group_selector, df_state], outputs=model_selector)
206
- model_selector.change(
207
- fn=filter_and_sort_model,
208
- inputs=[model_selector, column_selector, df_state, sort_col_state, sort_asc_state],
209
- outputs=[leaderboard_html_comp, sort_col_state, sort_asc_state]
210
- )
211
-
212
- # Keep the latest sort_col and sort_asc when column_selector changes as well
213
- column_selector.change(
214
- fn=filter_and_sort_column,
215
- inputs=[model_selector, column_selector, df_state, sort_col_state, sort_asc_state],
216
- outputs=[leaderboard_html_comp, sort_col_state, sort_asc_state]
217
- )
218
-
219
- return {
220
- "search_box": search_box,
221
- "group_selector": group_selector,
222
- "column_selector": column_selector,
223
- "model_selector": model_selector,
224
- "leaderboard_html_comp": leaderboard_html_comp,
225
- "sort_trigger": sort_trigger,
226
- "df_state": df_state,
227
- "pretty_html": pretty_html
228
- }
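
The deleted sort wiring round-trips state through a hidden textbox: the injected JS writes `col + '|' + asc + '|' + Date.now()`, and `sort_trigger_change` splits it back apart, using the timestamp only to force a change event. A standalone sketch of that parse with a hypothetical payload:

```python
# Hypothetical payload written by the injected JS on a sort-arrow click.
col_val = "Overall|false|1700000000000"

# Mirrors sort_trigger_change: column name, direction, throwaway timestamp.
col = col_val.split("|")[0]
asc = col_val.split("|")[1].lower() == "true"
print(col, asc)  # Overall False
```
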
utils.py DELETED
@@ -1,42 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import gradio as gr
4
- from huggingface_hub import whoami
5
-
6
- def get_profile(profile: gr.OAuthProfile | None) -> str:
7
- if profile is None:
8
- return "Anonymous"
9
- return profile.username
10
-
11
- def get_organizations(oauth_token: gr.OAuthToken | None) -> str:
12
- if oauth_token is None:
13
- return "No Organization"
14
- org_names = [org["name"] for org in whoami(oauth_token.token)["orgs"]]
15
- return org_names
16
-
17
- def get_profile_and_organizations(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> tuple[str, str]:
18
- if profile is None:
19
- output_profile = "Anonymous"
20
- else:
21
- output_profile = profile.username
22
-
23
- if oauth_token is None:
24
- output_org = "No Organization"
25
- else:
26
- output_org = [org["name"] for org in whoami(oauth_token.token)["orgs"]]
27
-
28
- return output_profile, output_org
29
-
30
- def download_with_restart(snapshot_download_func, repo_id, local_dir, repo_type, token, restart_func):
31
- try:
32
- print(local_dir)
33
- snapshot_download_func(
34
- repo_id=repo_id,
35
- local_dir=local_dir,
36
- repo_type=repo_type,
37
- tqdm_class=None,
38
- etag_timeout=30,
39
- token=token
40
- )
41
- except Exception:
42
- restart_func()
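
For context, `download_with_restart` wrapped `snapshot_download` so that a failed download restarts the Space instead of leaving it half-initialized. A hedged usage sketch (the repo id is illustrative, and the restart callback is a placeholder; in the Space it would be `API.restart_space(REPO_ID)`):

```python
from huggingface_hub import snapshot_download

def restart_space():
    # Placeholder; in the Space this would call API.restart_space(REPO_ID).
    print("restarting space...")

download_with_restart(
    snapshot_download_func=snapshot_download,
    repo_id="demo-leaderboard-backend/requests",  # illustrative repo id
    local_dir="eval-queue",
    repo_type="dataset",
    token=None,
    restart_func=restart_space,
)
```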