wenjiao commited on
Commit
e0f982a
·
1 Parent(s): 58c6d37

refactor: update code for latest Gradio API

Browse files
Dockerfile DELETED
@@ -1,17 +0,0 @@
1
- FROM python:3.10-slim
2
-
3
- ENV DEBIAN_FRONTEND=noninteractive
4
- WORKDIR /app
5
-
6
- RUN apt-get update && apt-get install -y \
7
- git git-lfs ffmpeg libsm6 libxext6 libgl1 \
8
- && rm -rf /var/lib/apt/lists/* \
9
- && git lfs install
10
-
11
- RUN pip install --no-cache-dir -U pip setuptools wheel
12
-
13
- COPY requirements.txt .
14
- RUN pip install --no-cache-dir -r requirements.txt
15
-
16
- COPY . .
17
- CMD ["python", "app.py"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -3,8 +3,9 @@ title: Low-bit Quantized Open LLM Leaderboard
3
  emoji: 🏆
4
  colorFrom: green
5
  colorTo: indigo
6
- sdk: docker
7
- sdk_version: 4.31.5
 
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
 
3
  emoji: 🏆
4
  colorFrom: green
5
  colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 6.5.1
8
+ python_version: 3.11
9
  app_file: app.py
10
  pinned: true
11
  license: apache-2.0
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import os
2
-
3
  import gradio as gr
4
  import pandas as pd
5
  import re
@@ -24,9 +23,7 @@ from src.display.utils import (
24
  NUMERIC_INTERVALS,
25
  NUMERIC_MODELSIZE,
26
  TYPES,
27
- # 改为导入实例
28
  auto_eval_cols,
29
- eval_queue_cols,
30
  GroupDtype,
31
  ModelType,
32
  fields,
@@ -46,21 +43,36 @@ from src.tools.plots import (
46
  create_plot_df,
47
  create_scores_df,
48
  )
49
- from gradio_modal import Modal
50
  import plotly.graph_objects as go
51
 
52
  selected_indices = []
53
  selected_values = {}
54
  selected_dropdown_weight = 'All'
55
 
 
 
 
56
  precision_to_dtype = {
57
- "2bit": ["int2"], "3bit": ["int3"], "4bit": ["int4", "nf4", "fp4"],
58
- "8bit": ["int8"], "16bit": ['float16', 'bfloat16'], "32bit": ["float32"], "?": ["?"],
 
 
 
 
 
59
  }
60
 
61
  dtype_to_precision = {
62
- "int2": ["2bit"], "int3": ["3bit"], "int4": ["4bit"], "nf4": ["4bit"], "fp4": ["4bit"],
63
- "int8": ["8bit"], "float16": ["16bit"], "bfloat16": ["16bit"], "float32": ["32bit"], "?": ["?"],
 
 
 
 
 
 
 
 
64
  }
65
 
66
  current_weightDtype = ["int2", "int3", "int4", "nf4", "fp4", "?"]
@@ -68,7 +80,7 @@ current_computeDtype = ['int8', 'bfloat16', 'float16', 'float32']
68
  current_quant = [t.to_str() for t in QuantType if t != QuantType.QuantType_None]
69
  current_precision = ['2bit', '3bit', '4bit', '8bit', '?']
70
 
71
- # --- 工具函数保持不变 ---
72
  def display_sort(key):
73
  order = {"All": 0, "?": 1, "int2": 2, "int3": 3, "int4": 4, "fp4": 5, "nf4": 6, "float16": 7, "bfloat16": 8, "float32": 9}
74
  return order.get(key, float('inf'))
@@ -77,260 +89,758 @@ def comp_display_sort(key):
77
  order = {"All": 0, "?": 1, "int8": 2, "float16": 3, "bfloat16": 4, "float32": 5}
78
  return order.get(key, float('inf'))
79
 
80
- # --- 更新逻辑保持逻辑不变,仅做属性名适配 ---
81
  def update_quantization_types(selected_quant):
82
- global current_weightDtype, current_computeDtype, current_quant, current_precision
 
 
 
 
83
  if set(current_quant) == set(selected_quant):
84
- return [gr.Dropdown(choices=current_weightDtype, value=selected_dropdown_weight),
85
- gr.Dropdown(choices=current_computeDtype, value="All"),
86
- gr.CheckboxGroup(value=current_precision)]
 
 
 
 
87
  if any(value != '✖ None' for value in selected_quant):
88
  selected_weight = ['All', '?', 'int2', 'int3', 'int4', 'nf4', 'fp4', 'int8']
89
  selected_compute = ['All', '?', 'int8', 'float16', 'bfloat16', 'float32']
90
  selected_precision = ["2bit", "3bit", "4bit", "8bit", "?"]
91
- current_weightDtype, current_computeDtype, current_quant, current_precision = selected_weight, selected_compute, selected_quant, selected_precision
92
- return [gr.Dropdown(choices=selected_weight, value="All"),
93
- gr.Dropdown(choices=selected_compute, value="All"),
94
- gr.CheckboxGroup(value=selected_precision)]
 
 
 
 
 
 
 
95
 
96
  def update_Weight_Precision(temp_precisions):
97
- global current_weightDtype, current_computeDtype, current_quant, current_precision, selected_dropdown_weight
 
 
 
 
 
 
98
  if set(current_precision) == set(temp_precisions):
99
- return [gr.Dropdown(choices=current_weightDtype, value=selected_dropdown_weight),
100
- gr.Dropdown(choices=current_computeDtype, value="All"),
101
- gr.CheckboxGroup(value=current_precision),
102
- gr.CheckboxGroup(value=current_quant)]
103
- selected_weight, selected_compute = [], ['All', '?', 'int8', 'float16', 'bfloat16', 'float32']
 
 
 
 
104
  selected_quant = [t.to_str() for t in QuantType if t != QuantType.QuantType_None]
 
105
  if temp_precisions[-1] in ["16bit", "32bit"]:
106
  selected_precisions = [p for p in temp_precisions if p in ["16bit", "32bit"]]
107
  else:
108
  selected_precisions = [p for p in temp_precisions if p not in ["16bit", "32bit"]]
 
109
  current_precision = list(set(selected_precisions))
110
- if len(current_precision) > 1 or (selected_dropdown_weight != 'All' and set(dtype_to_precision.get(selected_dropdown_weight, [])) != set(current_precision)):
 
 
111
  selected_dropdown_weight = 'All'
 
 
 
 
 
112
  for precision in current_precision:
113
- if precision in precision_to_dtype: selected_weight.extend(precision_to_dtype[precision])
 
 
 
114
  if "16bit" in current_precision:
115
- selected_weight = [o for o in selected_weight if o in ["All", "?", "float16", "bfloat16"]]
116
- if "int8" in selected_compute: selected_compute.remove("int8")
 
 
117
  if "32bit" in current_precision:
118
- selected_weight = [o for o in selected_weight if o in ["All", "?", "float32"]]
119
- if "int8" in selected_compute: selected_compute.remove("int8")
120
- if "16bit" in current_precision or "32bit" in current_precision: selected_quant = ['✖ None']
121
- selected_weight = list(set(["All", "?"] + selected_weight))
122
- selected_compute = list(set(["All", "?"] + selected_compute))
123
- current_weightDtype, current_computeDtype, current_quant = selected_weight, selected_compute, selected_quant
124
- return [gr.Dropdown(choices=selected_weight, value=selected_dropdown_weight),
125
- gr.Dropdown(choices=selected_compute, value="All"),
126
- gr.CheckboxGroup(value=selected_precisions),
127
- gr.CheckboxGroup(value=selected_quant)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
  def update_Weight_Dtype(weight):
130
  global selected_dropdown_weight
131
- if weight == selected_dropdown_weight or weight == 'All': return current_precision
132
- selected_precisions = dtype_to_precision.get(weight, [])
133
- selected_dropdown_weight = weight
 
 
 
 
 
 
 
134
  return selected_precisions
135
 
 
 
 
136
  def restart_space():
137
  API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
138
 
 
139
  def init_space(full_init: bool = True):
 
140
  if full_init:
141
  try:
142
  branch = REPO.active_branch.name
143
  REPO.remotes.origin.pull(branch)
144
- snapshot_download(repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", etag_timeout=30)
145
- except Exception as e: print(str(e)); restart_space()
146
-
147
- raw_data, original_df = get_leaderboard_df(GIT_RESULTS_PATH, GIT_STATUS_PATH, DYNAMIC_INFO_FILE_PATH, COLS, BENCHMARK_COLS)
148
-
149
- # 防御补全:如果没数据也要有骨架,防止 KeyError 'Model'
150
- if original_df.empty:
151
- original_df = pd.DataFrame(columns=[c.name for c in fields(auto_eval_cols)])
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  leaderboard_df = original_df.copy()
 
154
  plot_df = create_plot_df(create_scores_df(raw_data))
155
- (f_q, r_q, p_q) = get_evaluation_queue_df(GIT_STATUS_PATH, EVAL_COLS)
156
- return leaderboard_df, original_df, plot_df, f_q, r_q, p_q
 
 
 
 
 
 
157
 
158
  leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
159
 
160
  def str_to_bool(value):
161
- return str(value).lower() == "true"
162
-
163
- def update_table(hidden_df, columns, type_query, precision_query, size_query, params_query, hide_models, query, compute_dtype, weight_dtype, double_quant, group_dtype):
164
- global current_weightDtype, current_computeDtype
165
- w_dt = current_weightDtype if weight_dtype in [['All'], 'All'] else [weight_dtype]
166
- c_dt = current_computeDtype if compute_dtype == 'All' else [compute_dtype]
167
- try: g_dt = [int(group_dtype)] if group_dtype != 'All' else [-1, 1024, 256, 128, 64, 32]
168
- except: g_dt = [-1]
169
- dq = [True, False] if double_quant == 'All' else [str_to_bool(double_quant)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
- filtered_df = filter_models(hidden_df, type_query, size_query, params_query, precision_query, hide_models, c_dt, w_dt, dq, g_dt)
172
  filtered_df = filter_queries(query, filtered_df)
173
- return select_columns(filtered_df, columns)
 
 
 
 
 
 
 
174
 
175
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
176
- if auto_eval_cols.dummy.name not in df.columns: return df
177
  return df[(df[auto_eval_cols.dummy.name].str.contains(query, case=False))]
178
 
 
179
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
180
  always_here_cols = [c.name for c in fields(auto_eval_cols) if c.never_hidden]
181
  dummy_col = [auto_eval_cols.dummy.name]
182
- # 动态取列,解决 KeyError
183
- req_cols = always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col
184
- return df[[c for c in req_cols if c in df.columns]]
 
 
 
185
 
186
  def filter_queries(query: str, filtered_df: pd.DataFrame):
187
- if query == "": return filtered_df
188
  final_df = []
189
- for _q in [q.strip() for q in query.split(";") if q.strip() != ""]:
190
- temp = search_table(filtered_df, _q)
191
- if len(temp) > 0: final_df.append(temp)
192
- if not final_df: return filtered_df
193
- return pd.concat(final_df).drop_duplicates(subset=[auto_eval_cols.model.name, auto_eval_cols.precision.name, auto_eval_cols.revision.name])
194
-
195
- def filter_models(df, type_query, size_query, params_query, precision_query, hide_models, compute_dtype, weight_dtype, double_quant, group_dtype):
196
- f_df = df.copy()
197
- # 增加列存在性检查,防止 KeyError
198
- check_cols = {
199
- auto_eval_cols.still_on_hub.name: lambda d: d[d[auto_eval_cols.still_on_hub.name] == True] if "Private or deleted" in hide_models else d,
200
- auto_eval_cols.merged.name: lambda d: d[d[auto_eval_cols.merged.name] == False] if "Contains a merge/moerge" in hide_models else d,
201
- auto_eval_cols.moe.name: lambda d: d[d[auto_eval_cols.moe.name] == False] if "MoE" in hide_models else d,
202
- auto_eval_cols.flagged.name: lambda d: d[d[auto_eval_cols.flagged.name] == False] if "Flagged" in hide_models else d,
203
- }
204
- for col, func in check_cols.items():
205
- if col in f_df.columns: f_df = func(f_df)
206
-
207
- type_emoji = [t[0] for t in type_query if t]
208
- type_emoji = [e for e in type_emoji if e != '✖'] if any(e != '✖' for e in type_emoji) else ['✖']
209
-
210
- if auto_eval_cols.model_type_symbol.name in f_df.columns:
211
- f_df = f_df[f_df[auto_eval_cols.model_type_symbol.name].isin(type_emoji)]
212
- if auto_eval_cols.precision.name in f_df.columns:
213
- f_df = f_df[f_df[auto_eval_cols.precision.name].isin(precision_query + ["None"])]
214
- if auto_eval_cols.weight_dtype.name in f_df.columns:
215
- f_df = f_df[f_df[auto_eval_cols.weight_dtype.name].isin(weight_dtype)]
216
-
217
- # 参数量数值区间过滤
218
- if auto_eval_cols.params.name in f_df.columns:
219
- numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
220
- params_col = pd.to_numeric(f_df[auto_eval_cols.params.name], errors="coerce")
221
- f_df = f_df[params_col.apply(lambda x: any(numeric_interval.contains(x)) if pd.notnull(x) else False)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
- return f_df
 
 
 
 
 
224
 
225
  def select(df, data: gr.SelectData):
226
- global selected_indices, selected_values
 
 
227
  selected_index = data.index[0]
228
- value = df.iloc[selected_index].iloc[1]
229
- match = re.search(r'<a[^>]+>([^<]+)</a>', value)
230
- if not match: return gr.CheckboxGroup(list(selected_values.keys()), value=list(selected_values.keys()))
231
- text_content = match.group(1)
232
  if selected_index in selected_indices:
233
  selected_indices.remove(selected_index)
234
- if text_content in selected_values: del selected_values[text_content]
 
 
 
 
 
 
 
235
  else:
236
  selected_indices.append(selected_index)
237
- selected_values[text_content] = value
 
 
 
 
 
 
 
238
  return gr.CheckboxGroup(list(selected_values.keys()), value=list(selected_values.keys()))
239
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  def generate_spider_chart(df, selected_keys):
241
  global selected_values
242
- current_sel = [selected_values[key] for key in selected_keys if key in selected_values]
243
- selected_rows = df[df.iloc[:, 1].isin(current_sel)]
244
- cleaned_rows = selected_rows.applymap(lambda x: re.sub(r'<[^>]*>', '', x) if isinstance(x, str) else x)
 
 
245
  fig = go.Figure()
246
- # 强制指定指标列
247
- metrics = ['Average ⬆️', 'ARC-c', 'ARC-e', 'Boolq', 'HellaSwag', 'Lambada', 'MMLU', 'Openbookqa', 'Piqa', 'Truthfulqa', 'Winogrande']
248
  for _, row in selected_rows.iterrows():
249
  fig.add_trace(go.Scatterpolar(
250
- r=[row.get(m, 0) for m in metrics],
251
- theta=metrics, fill='toself', name=re.sub(r'<[^>]*>', '', str(row.get('Model', 'Unknown')))
 
 
252
  ))
253
- fig.update_layout(polar=dict(radialaxis=dict(visible=False)), showlegend=True)
254
- return fig, cleaned_rows
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
 
256
- # --- 构建界面 ---
257
- demo = gr.Blocks(css=custom_css)
258
  with demo:
 
 
 
 
 
 
 
 
 
259
  gr.HTML(TITLE)
260
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
261
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
262
- with gr.TabItem("🏅 LLM Benchmark", id=0):
263
  with gr.Row():
264
  with gr.Column():
265
- search_bar = gr.Textbox(placeholder=" 🔍 Search model...", show_label=False)
266
- shown_columns = gr.CheckboxGroup(
267
- choices=[c.name for c in fields(auto_eval_cols) if not c.hidden and not c.never_hidden and not c.dummy],
268
- value=[c.name for c in fields(auto_eval_cols) if c.displayed_by_default and not c.hidden and not c.never_hidden],
269
- label="Select columns"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  )
271
  with gr.Column(min_width=320):
272
- filter_columns_type = gr.CheckboxGroup(label="Quantization types", choices=[t.to_str() for t in QuantType if t != QuantType.QuantType_None], value=[t.to_str() for t in QuantType if t != QuantType.QuantType_None])
273
- filter_columns_precision = gr.CheckboxGroup(label="Weight precision", choices=[i.value.name for i in Precision], value=[i.value.name for i in Precision if i.value.name not in ['16bit', '32bit']])
274
- with gr.Group():
275
- gr.HTML("<p style='padding: 0.7rem; background: #fff; margin: 0; color: #6b7280;'>Quantization config</p>")
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  with gr.Row():
277
- f_compute = gr.Dropdown(choices=[i.value.name for i in ComputeDtype], label="Compute Dtype", value="All")
278
- f_weight = gr.Dropdown(choices=[i.value.name for i in WeightDtype], label="Weight Dtype", value="All")
279
- f_double = gr.Dropdown(choices=["All", "True", "False"], label="Double Quant", value="All")
280
- f_group = gr.Dropdown(choices=[i.value.name for i in GroupDtype], label="Group Size", value="All")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
 
282
- model_comparison = gr.CheckboxGroup(label="Accuracy Comparison", choices=list(selected_values.keys()))
283
- spider_btn = gr.Button("Compare")
284
-
285
- # 对齐初始化列
286
- init_req = [c.name for c in fields(auto_eval_cols) if c.never_hidden] + shown_columns.value + [auto_eval_cols.dummy.name]
287
- init_act = [c for c in init_req if c in leaderboard_df.columns]
288
- if not init_act: init_act = [auto_eval_cols.model.name]
289
-
290
- leaderboard_table = gr.Dataframe(
291
- value=leaderboard_df[init_act],
292
- headers=init_act,
293
- datatype=TYPES, interactive=False,
294
- column_count=(len(init_act), "fixed")
 
 
 
 
 
 
 
 
 
 
 
295
  )
296
 
297
- with Modal(visible=False) as modal:
298
- map_p = gr.Plot()
299
- data_table = gr.Dataframe()
 
 
 
 
 
300
 
301
- leaderboard_table.select(select, leaderboard_table, model_comparison)
302
- spider_btn.click(generate_spider_chart, [leaderboard_table, model_comparison], [map_p, data_table])
303
- spider_btn.click(lambda: Modal(visible=True), None, modal)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
 
305
- hidden_leaderboard = gr.Dataframe(value=original_df[COLS] if set(COLS).issubset(original_df.columns) else original_df, visible=False)
306
- search_bar.submit(update_table, [hidden_leaderboard, shown_columns, filter_columns_type, filter_columns_precision, gr.State([]), gr.State([]), gr.State([]), search_bar, f_compute, f_weight, f_double, f_group], leaderboard_table)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
 
308
- with gr.TabItem("📈 Metrics through time", id=2):
309
  with gr.Row():
310
- gr.Plot(value=create_metric_plot_obj(plot_df, [auto_eval_cols.average.name], title="Average Over Time"))
311
- gr.Plot(value=create_metric_plot_obj(plot_df, BENCHMARK_COLS, title="Benchmarks Over Time"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
 
313
- with gr.TabItem("🚀 Submit", id=5):
314
  with gr.Column():
315
- gr.Markdown(EVALUATION_QUEUE_TEXT)
316
- model_name_textbox = gr.Textbox(label="Model name")
317
- revision_name_textbox = gr.Textbox(label="Revision", value="main")
318
- compute_type = gr.Dropdown(choices=[i.value.name for i in ComputeDtype if i.value.name != "All"], label="Compute dtype", value="float16")
319
- submit_button = gr.Button("Submit Eval")
320
- submission_result = gr.Markdown()
321
- submit_button.click(add_new_eval, [model_name_textbox, revision_name_textbox, gr.State(False), compute_type], submission_result)
322
-
323
- with gr.Accordion(f"✅ Finished Evaluations", open=False):
324
- # 修复对齐
325
- q_cols = [c for c in EVAL_COLS if c in finished_eval_queue_df.columns]
326
- if not q_cols: q_cols = list(finished_eval_queue_df.columns)
327
- gr.Dataframe(value=finished_eval_queue_df[q_cols], headers=q_cols, datatype=EVAL_TYPES, column_count=(len(q_cols), "fixed"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
 
329
  with gr.Row():
330
  with gr.Accordion("📙 Citation", open=False):
331
- gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=10, show_copy_button=True)
 
 
 
 
 
 
332
 
333
  scheduler = BackgroundScheduler()
334
- scheduler.add_job(restart_space, "interval", hours=3)
 
335
  scheduler.start()
336
- demo.queue(default_concurrency_limit=40).launch()
 
 
1
  import os
 
2
  import gradio as gr
3
  import pandas as pd
4
  import re
 
23
  NUMERIC_INTERVALS,
24
  NUMERIC_MODELSIZE,
25
  TYPES,
 
26
  auto_eval_cols,
 
27
  GroupDtype,
28
  ModelType,
29
  fields,
 
43
  create_plot_df,
44
  create_scores_df,
45
  )
 
46
  import plotly.graph_objects as go
47
 
48
  selected_indices = []
49
  selected_values = {}
50
  selected_dropdown_weight = 'All'
51
 
52
+ # Start ephemeral Spaces on PRs (see config in README.md)
53
+ #enable_space_ci()
54
+
55
  precision_to_dtype = {
56
+ "2bit": ["int2"],
57
+ "3bit": ["int3"],
58
+ "4bit": ["int4", "nf4", "fp4"],
59
+ "8bit": ["int8"],
60
+ "16bit": ['float16', 'bfloat16'],
61
+ "32bit": ["float32"],
62
+ "?": ["?"],
63
  }
64
 
65
  dtype_to_precision = {
66
+ "int2": ["2bit"],
67
+ "int3": ["3bit"],
68
+ "int4": ["4bit"],
69
+ "nf4": ["4bit"],
70
+ "fp4": ["4bit"],
71
+ "int8": ["8bit"],
72
+ "float16": ["16bit"],
73
+ "bfloat16": ["16bit"],
74
+ "float32": ["32bit"],
75
+ "?": ["?"],
76
  }
77
 
78
  current_weightDtype = ["int2", "int3", "int4", "nf4", "fp4", "?"]
 
80
  current_quant = [t.to_str() for t in QuantType if t != QuantType.QuantType_None]
81
  current_precision = ['2bit', '3bit', '4bit', '8bit', '?']
82
 
83
+
84
  def display_sort(key):
85
  order = {"All": 0, "?": 1, "int2": 2, "int3": 3, "int4": 4, "fp4": 5, "nf4": 6, "float16": 7, "bfloat16": 8, "float32": 9}
86
  return order.get(key, float('inf'))
 
89
  order = {"All": 0, "?": 1, "int8": 2, "float16": 3, "bfloat16": 4, "float32": 5}
90
  return order.get(key, float('inf'))
91
 
 
92
  def update_quantization_types(selected_quant):
93
+ global current_weightDtype
94
+ global current_computeDtype
95
+ global current_quant
96
+ global current_precision
97
+
98
  if set(current_quant) == set(selected_quant):
99
+ return [
100
+ gr.Dropdown(choices=current_weightDtype, value=selected_dropdown_weight),
101
+ gr.Dropdown(choices=current_computeDtype, value="All"),
102
+ gr.CheckboxGroup(value=current_precision),
103
+ ]
104
+
105
+ # print('update_quantization_types', selected_quant, current_quant)
106
  if any(value != '✖ None' for value in selected_quant):
107
  selected_weight = ['All', '?', 'int2', 'int3', 'int4', 'nf4', 'fp4', 'int8']
108
  selected_compute = ['All', '?', 'int8', 'float16', 'bfloat16', 'float32']
109
  selected_precision = ["2bit", "3bit", "4bit", "8bit", "?"]
110
+
111
+ current_weightDtype = selected_weight
112
+ current_computeDtype = selected_compute
113
+ current_quant = selected_quant
114
+ current_precision = selected_precision
115
+
116
+ return [
117
+ gr.Dropdown(choices=selected_weight, value="All"),
118
+ gr.Dropdown(choices=selected_compute, value="All"),
119
+ gr.CheckboxGroup(value=selected_precision),
120
+ ]
121
 
122
  def update_Weight_Precision(temp_precisions):
123
+ global current_weightDtype
124
+ global current_computeDtype
125
+ global current_quant
126
+ global current_precision
127
+ global selected_dropdown_weight
128
+
129
+ # print('temp_precisions', temp_precisions)
130
  if set(current_precision) == set(temp_precisions):
131
+ return [
132
+ gr.Dropdown(choices=current_weightDtype, value=selected_dropdown_weight),
133
+ gr.Dropdown(choices=current_computeDtype, value="All"),
134
+ gr.CheckboxGroup(value=current_precision),
135
+ gr.CheckboxGroup(value=current_quant),
136
+ ] # No update needed
137
+
138
+ selected_weight = []
139
+ selected_compute = ['All', '?', 'int8', 'float16', 'bfloat16', 'float32']
140
  selected_quant = [t.to_str() for t in QuantType if t != QuantType.QuantType_None]
141
+
142
  if temp_precisions[-1] in ["16bit", "32bit"]:
143
  selected_precisions = [p for p in temp_precisions if p in ["16bit", "32bit"]]
144
  else:
145
  selected_precisions = [p for p in temp_precisions if p not in ["16bit", "32bit"]]
146
+
147
  current_precision = list(set(selected_precisions))
148
+ # print('selected_dropdown_weight', selected_dropdown_weight)
149
+
150
+ if len(current_precision) > 1:
151
  selected_dropdown_weight = 'All'
152
+ elif selected_dropdown_weight != 'All' and set(dtype_to_precision[selected_dropdown_weight]) != set(current_precision):
153
+ selected_dropdown_weight = 'All'
154
+
155
+ # print('final', current_precision)
156
+ # Map selected_precisions to corresponding weights
157
  for precision in current_precision:
158
+ if precision in precision_to_dtype:
159
+ selected_weight.extend(precision_to_dtype[precision])
160
+
161
+ # Special rules for 16bit and 32bit
162
  if "16bit" in current_precision:
163
+ selected_weight = [option for option in selected_weight if option in ["All", "?", "float16", "bfloat16"]]
164
+ if "int8" in selected_compute:
165
+ selected_compute.remove("int8")
166
+
167
  if "32bit" in current_precision:
168
+ selected_weight = [option for option in selected_weight if option in ["All", "?", "float32"]]
169
+ if "int8" in selected_compute:
170
+ selected_compute.remove("int8")
171
+
172
+ if "16bit" in current_precision or "32bit" in current_precision:
173
+ selected_quant = ['✖ None']
174
+ if "16bit" in current_precision and "32bit" in current_precision:
175
+ selected_weight = ["All", "?", "float16", "bfloat16", "float32"]
176
+ # Ensure "All" and "?" options are included
177
+ selected_weight = ["All", "?"] + [opt for opt in selected_weight if opt not in ["All", "?"]]
178
+ selected_compute = ["All", "?"] + [opt for opt in selected_compute if opt not in ["All", "?"]]
179
+
180
+ # Remove duplicates
181
+ selected_weight = list(set(selected_weight))
182
+ selected_compute = list(set(selected_compute))
183
+
184
+ # Update global variables
185
+ current_weightDtype = selected_weight
186
+ current_computeDtype = selected_compute
187
+ current_quant = selected_quant
188
+
189
+ # Return updated components
190
+ return [
191
+ gr.Dropdown(choices=selected_weight, value=selected_dropdown_weight),
192
+ gr.Dropdown(choices=selected_compute, value="All"),
193
+ gr.CheckboxGroup(value=selected_precisions),
194
+ gr.CheckboxGroup(value=selected_quant),
195
+ ]
196
 
197
  def update_Weight_Dtype(weight):
198
  global selected_dropdown_weight
199
+ # print('update_Weight_Dtype', weight)
200
+ # Initialize selected_precisions
201
+ if weight == selected_dropdown_weight or weight == 'All':
202
+ return current_precision
203
+ else:
204
+ selected_precisions = []
205
+ selected_precisions.extend(dtype_to_precision[weight])
206
+ selected_dropdown_weight = weight
207
+ # print('selected_precisions', selected_precisions)
208
+ # Return updated components
209
  return selected_precisions
210
 
211
+
212
+
213
+
214
  def restart_space():
215
  API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
216
 
217
+
218
  def init_space(full_init: bool = True):
219
+
220
  if full_init:
221
  try:
222
  branch = REPO.active_branch.name
223
  REPO.remotes.origin.pull(branch)
224
+ except Exception as e:
225
+ # print(str(e))
226
+ restart_space()
 
 
 
 
 
227
 
228
+ try:
229
+ # print(DYNAMIC_INFO_PATH)
230
+ snapshot_download(
231
+ repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
232
+ )
233
+ except Exception:
234
+ restart_space()
235
+
236
+ raw_data, original_df = get_leaderboard_df(
237
+ results_path=GIT_RESULTS_PATH,
238
+ requests_path=GIT_STATUS_PATH,
239
+ dynamic_path=DYNAMIC_INFO_FILE_PATH,
240
+ cols=COLS,
241
+ benchmark_cols=BENCHMARK_COLS
242
+ )
243
+ # update_collections(original_df.copy())
244
  leaderboard_df = original_df.copy()
245
+
246
  plot_df = create_plot_df(create_scores_df(raw_data))
247
+
248
+ (
249
+ finished_eval_queue_df,
250
+ running_eval_queue_df,
251
+ pending_eval_queue_df,
252
+ ) = get_evaluation_queue_df(GIT_STATUS_PATH, EVAL_COLS)
253
+
254
+ return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
255
 
256
  leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
257
 
258
  def str_to_bool(value):
259
+ if str(value).lower() == "true":
260
+ return True
261
+ elif str(value).lower() == "false":
262
+ return False
263
+ else:
264
+ return False
265
+
266
+ # Searching and filtering
267
+ def update_table(
268
+ hidden_df: pd.DataFrame,
269
+ columns: list,
270
+ type_query: list,
271
+ precision_query: str,
272
+ size_query: list,
273
+ params_query: list,
274
+ hide_models: list,
275
+ query: str,
276
+ compute_dtype: str,
277
+ weight_dtype: str,
278
+ double_quant: str,
279
+ group_dtype: str
280
+ ):
281
+ global init_select
282
+ global current_weightDtype
283
+ global current_computeDtype
284
+
285
+ if weight_dtype == ['All'] or weight_dtype == 'All':
286
+ weight_dtype = current_weightDtype
287
+ else:
288
+ weight_dtype = [weight_dtype]
289
+
290
+ if compute_dtype == 'All':
291
+ compute_dtype = current_computeDtype
292
+ else:
293
+ compute_dtype = [compute_dtype]
294
+
295
+ if group_dtype == 'All':
296
+ group_dtype = [-1, 1024, 256, 128, 64, 32]
297
+ else:
298
+ try:
299
+ group_dtype = [int(group_dtype)]
300
+ except ValueError:
301
+ group_dtype = [-1]
302
+
303
+ if double_quant == 'All':
304
+ double_quant = [True, False]
305
+ else:
306
+ double_quant = [str_to_bool(double_quant)]
307
 
308
+ filtered_df = filter_models(df=hidden_df, type_query=type_query, size_query=size_query, precision_query=precision_query, hide_models=hide_models, compute_dtype=compute_dtype, weight_dtype=weight_dtype, double_quant=double_quant, group_dtype=group_dtype, params_query=params_query)
309
  filtered_df = filter_queries(query, filtered_df)
310
+ df = select_columns(filtered_df, columns)
311
+ return df
312
+
313
+
314
+ def load_query(request: gr.Request): # triggered only once at startup => read query parameter if it exists
315
+ query = request.query_params.get("query") or ""
316
+ return query, query # return one for the "search_bar", one for a hidden component that triggers a reload only if value has changed
317
+
318
 
319
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
 
320
  return df[(df[auto_eval_cols.dummy.name].str.contains(query, case=False))]
321
 
322
+
323
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
324
  always_here_cols = [c.name for c in fields(auto_eval_cols) if c.never_hidden]
325
  dummy_col = [auto_eval_cols.dummy.name]
326
+ # We use COLS to maintain sorting
327
+ filtered_df = df[
328
+ always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col
329
+ ]
330
+ return filtered_df
331
+
332
 
333
  def filter_queries(query: str, filtered_df: pd.DataFrame):
334
+ """Added by Abishek"""
335
  final_df = []
336
+ if query != "":
337
+ queries = [q.strip() for q in query.split(";")]
338
+ for _q in queries:
339
+ _q = _q.strip()
340
+ if _q != "":
341
+ temp_filtered_df = search_table(filtered_df, _q)
342
+ if len(temp_filtered_df) > 0:
343
+ final_df.append(temp_filtered_df)
344
+ if len(final_df) > 0:
345
+ filtered_df = pd.concat(final_df)
346
+ filtered_df = filtered_df.drop_duplicates(
347
+ subset=[auto_eval_cols.model.name, auto_eval_cols.precision.name, auto_eval_cols.revision.name]
348
+ )
349
+
350
+ return filtered_df
351
+
352
+
353
+ def filter_models(
def filter_models(
    df: pd.DataFrame, type_query: list, size_query: list, params_query: list, precision_query: list, hide_models: list, compute_dtype: list, weight_dtype: list, double_quant: list, group_dtype: list,
) -> pd.DataFrame:
    """Return the subset of the leaderboard dataframe matching every active filter.

    Args:
        df: Full leaderboard dataframe.
        type_query: Quantization-type strings; the first character is the emoji symbol.
        size_query / params_query: Keys into NUMERIC_INTERVALS / NUMERIC_MODELSIZE.
        precision_query: Allowed weight-precision names ("None" rows always pass).
        hide_models: Categories of models to hide (deleted, merges, MoE, flagged).
        compute_dtype / weight_dtype / double_quant / group_dtype: Allowed values
            for the corresponding quantization-config columns.

    Returns:
        The filtered dataframe (rows only removed, never added).
    """
    # Hide models that are no longer on the hub, unless the user asked to see them.
    if "Private or deleted" in hide_models:
        filtered_df = df[df[auto_eval_cols.still_on_hub.name] == True]
    else:
        filtered_df = df

    if "Contains a merge/moerge" in hide_models:
        filtered_df = filtered_df[filtered_df[auto_eval_cols.merged.name] == False]

    if "MoE" in hide_models:
        filtered_df = filtered_df[filtered_df[auto_eval_cols.moe.name] == False]

    if "Flagged" in hide_models:
        filtered_df = filtered_df[filtered_df[auto_eval_cols.flagged.name] == False]

    # The first character of each type string is its emoji symbol; '✖' is the
    # "no type" marker and is only matched when nothing else is selected.
    type_emoji = [t[0] for t in type_query]
    non_cross = [emoji for emoji in type_emoji if emoji != '✖']
    type_emoji = non_cross if non_cross else ['✖']

    # NOTE: masks are now built on filtered_df itself (the original built them on
    # df and relied on pandas index alignment when indexing the subset).
    filtered_df = filtered_df[filtered_df[auto_eval_cols.model_type_symbol.name].isin(type_emoji)]
    filtered_df = filtered_df[filtered_df[auto_eval_cols.precision.name].isin(precision_query + ["None"])]
    filtered_df = filtered_df[filtered_df[auto_eval_cols.weight_dtype.name].isin(weight_dtype)]
    filtered_df = filtered_df[filtered_df[auto_eval_cols.compute_dtype.name].isin(compute_dtype)]
    filtered_df = filtered_df[filtered_df[auto_eval_cols.double_quant.name].isin(double_quant)]
    filtered_df = filtered_df[filtered_df[auto_eval_cols.group_size.name].isin(group_dtype)]

    # Keep rows whose parameter count falls inside any selected interval;
    # non-numeric values coerce to NaN and are dropped.
    numeric_interval = pd.IntervalIndex(sorted(NUMERIC_INTERVALS[s] for s in size_query))
    params_column = pd.to_numeric(filtered_df[auto_eval_cols.params.name], errors="coerce")
    filtered_df = filtered_df[params_column.apply(lambda v: any(numeric_interval.contains(v)))]

    # Same treatment for on-disk model size (GB).
    numeric_interval_size = pd.IntervalIndex(sorted(NUMERIC_MODELSIZE[s] for s in params_query))
    size_column = pd.to_numeric(filtered_df[auto_eval_cols.model_size.name], errors="coerce")
    filtered_df = filtered_df[size_column.apply(lambda v: any(numeric_interval_size.contains(v)))]

    return filtered_df
 
def select(df, data: gr.SelectData):
    """Toggle the clicked leaderboard row in/out of the comparison selection.

    Maintains the module-level selected_indices / selected_values state and
    returns a refreshed CheckboxGroup listing the selected model names.
    """
    global selected_indices
    global selected_values

    selected_index = data.index[0]

    # The second column holds the model as a markdown hyperlink; pull the
    # human-readable name out once (the original duplicated this in both branches).
    value = df.iloc[selected_index].iloc[1]
    match = re.search(r'<a[^>]+>([^<]+)</a>', value)
    text_content = match.group(1) if match else None

    if selected_index in selected_indices:
        selected_indices.remove(selected_index)
        if text_content is not None and text_content in selected_values:
            del selected_values[text_content]
    else:
        selected_indices.append(selected_index)
        if text_content is not None:
            selected_values[text_content] = value

    return gr.CheckboxGroup(list(selected_values.keys()), value=list(selected_values.keys()))
def init_comparison_data():
    """Seed the comparison checkbox group from the current global selection."""
    global selected_values
    keys = list(selected_values.keys())
    return gr.CheckboxGroup(keys, value=keys)
def remove_html_tags(value):
    """Strip HTML tags from a string; non-string values pass through unchanged."""
    if not isinstance(value, str):
        return value
    return re.sub(r'<[^>]*>', '', value)
def show_modal():
    """Reveal the comparison modal by swapping in the visible CSS class."""
    return gr.update(visible=True, elem_classes="custom-modal")
def close_modal_logic():
    """Hide the comparison modal by swapping in the hidden CSS class."""
    return gr.update(visible=False, elem_classes="modal-hidden")
def generate_spider_chart(df, selected_keys):
    """Build a radar (spider) chart comparing the accuracy metrics of the
    models ticked in the comparison checkbox group.

    Returns the plotly figure and the matching rows with HTML stripped,
    for display inside the comparison modal.
    """
    global selected_values
    chosen_values = [selected_values[key] for key in selected_keys if key in selected_values]
    selected_rows = df[df.iloc[:, 1].isin(chosen_values)]
    cleaned_rows = selected_rows.map(remove_html_tags)

    metrics = ['Average ⬆️', 'ARC-c', 'ARC-e', 'Boolq', 'HellaSwag', 'Lambada',
               'MMLU', 'Openbookqa', 'Piqa', 'Truthfulqa', 'Winogrande']

    fig = go.Figure()
    for _, row in selected_rows.iterrows():
        fig.add_trace(go.Scatterpolar(
            r=[row[m] for m in metrics],
            theta=metrics,
            fill='toself',
            name=str(row['Model']),
        ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=False,
            )),
        showlegend=True,
        margin=dict(l=50, r=50, t=50, b=50),
        height=400,
        autosize=True,
    )

    return fig, cleaned_rows
# Apply the default filters once so the initial table matches the UI defaults.
leaderboard_df = filter_models(
    df=leaderboard_df,
    type_query=[t.to_str(" : ") for t in QuantType if t != QuantType.QuantType_None],
    size_query=list(NUMERIC_INTERVALS.keys()),
    params_query=list(NUMERIC_MODELSIZE.keys()),
    precision_query=[i.value.name for i in Precision],
    hide_models=["Private or deleted", "Contains a merge/moerge", "Flagged"],  # deleted, merges, flagged
    compute_dtype=[i.value.name for i in ComputeDtype],
    weight_dtype=[i.value.name for i in WeightDtype],
    double_quant=[True, False],
    group_dtype=[-1, 1024, 256, 128, 64, 32],
)

demo = gr.Blocks(fill_width=True)

with demo:
    # Hidden overlay for the "Compare" spider chart; show_modal()/close_modal_logic()
    # toggle it via visibility + CSS classes.
    with gr.Column(elem_classes="custom-modal", visible=False, elem_id="my-modal-container") as modal_window:
        with gr.Column(elem_classes="modal-content"):
            with gr.Column():
                comparison_plot_inside = gr.Plot()
                comparison_df_inside = gr.Dataframe(interactive=False)
            close_btn = gr.Button("Close", variant="primary")

    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
                with gr.Column():
                    with gr.Row(variant="compact"):
                        search_bar = gr.Textbox(
                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
                            show_label=False,
                            elem_id="search-bar",
                        )
                    with gr.Row():
                        shown_columns = gr.CheckboxGroup(
                            choices=[
                                c.name
                                for c in fields(auto_eval_cols)
                                if not c.hidden and not c.never_hidden and not c.dummy
                            ],
                            value=[
                                c.name
                                for c in fields(auto_eval_cols)
                                if c.displayed_by_default and not c.hidden and not c.never_hidden
                            ],
                            label="Select columns to show",
                            elem_id="column-select",
                            interactive=True,
                        )
                    with gr.Row():
                        filter_columns_parameters = gr.CheckboxGroup(
                            label="Model parameters (in billions of parameters)",
                            choices=list(NUMERIC_INTERVALS.keys()),
                            value=list(NUMERIC_INTERVALS.keys()),
                            interactive=True,
                            elem_id="filter-columns-size",
                        )
                    with gr.Row():
                        filter_columns_size = gr.CheckboxGroup(
                            label="Model sizes (GB, int4)",
                            choices=list(NUMERIC_MODELSIZE.keys()),
                            value=list(NUMERIC_MODELSIZE.keys()),
                            interactive=True,
                            elem_id="filter-columns-size",
                        )
                with gr.Column(min_width=320):
                    filter_columns_type = gr.CheckboxGroup(
                        label="Quantization types",
                        choices=[t.to_str() for t in QuantType if t != QuantType.QuantType_None],
                        value=[t.to_str() for t in QuantType if t != QuantType.QuantType_None],
                        interactive=True,
                        elem_id="filter-columns-type",
                    )
                    filter_columns_precision = gr.CheckboxGroup(
                        label="Weight precision",
                        # Full-precision (16/32-bit) rows are hidden by default.
                        choices=[i.value.name for i in Precision],
                        value=[i.value.name for i in Precision if (i.value.name != '16bit' and i.value.name != '32bit')],
                        interactive=True,
                        elem_id="filter-columns-precision",
                    )
                    with gr.Column(elem_id="quant-config-container") as config:
                        gr.HTML("<div class='quant-config-header'>Quantization config</div>")
                        with gr.Row():
                            filter_columns_computeDtype = gr.Dropdown(choices=[i.value.name for i in ComputeDtype], label="Compute Dtype", multiselect=False, value="All", interactive=True)
                            filter_columns_weightDtype = gr.Dropdown(choices=[i.value.name for i in WeightDtype], label="Weight Dtype", multiselect=False, value="All", interactive=True)
                            filter_columns_doubleQuant = gr.Dropdown(choices=["All", "True", "False"], label="Double Quant", multiselect=False, value="All", interactive=True)
                            filter_columns_groupDtype = gr.Dropdown(choices=[i.value.name for i in GroupDtype], label="Group Size", multiselect=False, value="All", interactive=True)

            with gr.Row():
                with gr.Column(scale=4):
                    model_comparison = gr.CheckboxGroup(label="Accuracy Comparison (Selected Models from Table)", choices=list(selected_values.keys()), value=list(selected_values.keys()), interactive=True, elem_id="model_comparison")
                with gr.Column(scale=1, min_width=150):
                    spider_btn = gr.Button("Compare", variant="primary", elem_id="compare-button-full")

            # Column ordering: first user-selected column, then the pinned
            # (never-hidden) columns, then the rest of the user selection.
            never_hidden_cols = [c.name for c in fields(auto_eval_cols) if c.never_hidden]
            user_cols = shown_columns.value
            if len(user_cols) > 0:
                first_user_col = [user_cols[0]]
                remaining_user_cols = user_cols[1:]
                final_cols = first_user_col + never_hidden_cols + remaining_user_cols
            else:
                final_cols = never_hidden_cols

            leaderboard_table = gr.components.Dataframe(
                value=leaderboard_df[final_cols + [auto_eval_cols.dummy.name]],
                headers=final_cols,
                datatype="markdown",
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
            )

            # Row clicks toggle models in the comparison set; "Compare" opens
            # the modal then fills it with the spider chart + cleaned rows.
            leaderboard_table.select(select, leaderboard_table, model_comparison)
            spider_btn.click(
                fn=show_modal,
                outputs=modal_window,
            ).then(
                fn=generate_spider_chart,
                inputs=[leaderboard_table, model_comparison],
                outputs=[comparison_plot_inside, comparison_df_inside],
            )
            close_btn.click(
                fn=close_modal_logic,
                outputs=modal_window,
            )
            demo.load(init_comparison_data, None, model_comparison)

            if "Weight type" not in original_df.columns:
                original_df["Weight type"] = "Unknown"

            # Dummy leaderboard for handling the case when the user uses backspace key
            hidden_leaderboard_table_for_search = gr.components.Dataframe(
                value=original_df[COLS],
                headers=COLS,
                datatype=TYPES,
                visible=False,
            )

            hide_models = gr.Textbox(
                placeholder="",
                show_label=False,
                elem_id="search-bar",
                value="",
                visible=False,
            )

            # Shared input list for every update_table trigger below.
            update_table_inputs = [
                hidden_leaderboard_table_for_search,
                shown_columns,
                filter_columns_type,
                filter_columns_precision,
                filter_columns_parameters,
                filter_columns_size,
                hide_models,
                search_bar,
                filter_columns_computeDtype,
                filter_columns_weightDtype,
                filter_columns_doubleQuant,
                filter_columns_groupDtype,
            ]

            search_bar.submit(update_table, update_table_inputs, leaderboard_table)

            # Keep the dependent filter widgets mutually consistent.
            filter_columns_type.change(
                update_quantization_types,
                [filter_columns_type],
                [filter_columns_weightDtype, filter_columns_computeDtype, filter_columns_precision],
            )
            filter_columns_precision.change(
                update_Weight_Precision,
                [filter_columns_precision],
                [filter_columns_weightDtype, filter_columns_computeDtype, filter_columns_precision, filter_columns_type],
            )
            filter_columns_weightDtype.change(
                update_Weight_Dtype,
                [filter_columns_weightDtype],
                [filter_columns_precision],
            )

            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, filter_columns_parameters, hide_models, filter_columns_computeDtype, filter_columns_weightDtype, filter_columns_doubleQuant, filter_columns_groupDtype]:
                selector.change(
                    update_table,
                    update_table_inputs,
                    leaderboard_table,
                    queue=True,
                )

        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
            with gr.Row():
                with gr.Column():
                    chart = create_metric_plot_obj(
                        plot_df,
                        [auto_eval_cols.average.name],
                        title="Average of Top Scores and Human Baseline Over Time (from last update)",
                    )
                    gr.Plot(value=chart, min_width=500)
                with gr.Column():
                    chart = create_metric_plot_obj(
                        plot_df,
                        BENCHMARK_COLS,
                        title="Top Scores and Human Baseline Over Time (from last update)",
                    )
                    gr.Plot(value=chart, min_width=500)

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=4):
            gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=5):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                with gr.Row():
                    gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

                with gr.Row():
                    with gr.Column():
                        model_name_textbox = gr.Textbox(label="Model name")
                        revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                        private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)

                    with gr.Column():
                        base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)",
                                                             visible=not IS_PUBLIC)
                        compute_type = gr.Dropdown(
                            choices=[i.value.name for i in ComputeDtype if i.value.name != "All"],
                            label="Compute dtype",
                            multiselect=False,
                            value="float16",
                            interactive=True,
                        )

                submit_button = gr.Button("Submit Eval")
                submission_result = gr.Markdown()
                submit_button.click(
                    add_new_eval,
                    [
                        model_name_textbox,
                        revision_name_textbox,
                        private,
                        compute_type,
                    ],
                    submission_result,
                )

            with gr.Column():
                with gr.Accordion(
                    f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                    open=False,
                ):
                    with gr.Row():
                        finished_eval_table = gr.components.Dataframe(
                            value=finished_eval_queue_df,
                            headers=EVAL_COLS,
                            datatype=EVAL_TYPES,
                            row_count=5,
                        )
                with gr.Accordion(
                    f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
                    open=False,
                ):
                    with gr.Row():
                        running_eval_table = gr.components.Dataframe(
                            value=running_eval_queue_df,
                            headers=EVAL_COLS,
                            datatype=EVAL_TYPES,
                            row_count=5,
                        )
                with gr.Accordion(
                    f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                    open=False,
                ):
                    with gr.Row():
                        pending_eval_table = gr.components.Dataframe(
                            value=pending_eval_queue_df,
                            headers=EVAL_COLS,
                            datatype=EVAL_TYPES,
                            row_count=5,
                        )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                buttons=["copy"],
            )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", hours=3)          # restart the Space every 3 hours
scheduler.add_job(update_dynamic_files, "interval", hours=12)  # refresh dynamic files every 12 hours
scheduler.start()

demo.queue(default_concurrency_limit=40).launch(css=custom_css)
requirements.txt CHANGED
@@ -1,20 +1,20 @@
1
- APScheduler==3.10.1
2
- black==23.11.0
3
- click==8.1.3
4
- datasets==2.14.5
5
- huggingface-hub>=0.18.0,<1.0.0
6
- matplotlib==3.7.1
7
- numpy==1.24.2
8
- pandas==2.0.0
9
- plotly==5.14.1
10
- python-dateutil==2.8.2
11
- requests==2.28.2
12
- sentencepiece
13
- tqdm==4.65.0
14
- transformers==4.39.0
15
- tokenizers>=0.15.0
16
- #gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.1 # CI !!!
17
- gradio==3.28.0
18
- GitPython==3.1.40
19
- pydantic==1.10.15
20
- plotly==5.14.1
 
1
+ pandas
2
+ numpy
3
+ matplotlib
4
+ plotly
5
+ apscheduler
6
+ tqdm
7
+ requests
8
+ python-dateutil
9
+
10
+
11
+ huggingface-hub
12
+
13
+ transformers
14
+ gradio==6.5.1
15
+
16
+ datasets
17
+ tokenizers
18
+ GitPython
19
+
20
+ pydantic>=2.0
src/display/css_html_js.py CHANGED
@@ -13,9 +13,112 @@ table th:first-child {
13
  white-space: nowrap;
14
  }
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  /* Full width space */
17
  .gradio-container {
18
- max-width: 95%!important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  }
20
 
21
  /* Text style and margins */
@@ -44,7 +147,7 @@ table th:first-child {
44
  background: none;
45
  border: none;
46
  }
47
-
48
  #search-bar {
49
  padding: 0px;
50
  }
@@ -71,7 +174,7 @@ table th:first-child {
71
 
72
  /* 100% scale*/
73
  @media (resolution: 96dpi), (min-resolution: 1dppx) and (max-resolution: 1.25dppx) {
74
- #model_comparison {
75
  height: 6rem !important;
76
  overflow: auto !important;
77
  }
@@ -93,13 +196,10 @@ table th:first-child {
93
  }
94
 
95
  #component-31 {
96
- margin-top: 0.5rem !important;
97
  }
98
  }
99
 
100
-
101
-
102
-
103
  #model_comparison {
104
  height: 6rem !important;
105
  overflow: auto !important;
@@ -109,8 +209,8 @@ table th:first-child {
109
  font-size: 0.7rem !important;
110
  }
111
 
112
- .tab-buttons button {
113
- font-size: 20px;
114
  }
115
 
116
  /* Filters style */
@@ -147,11 +247,3 @@ table th:first-child {
147
  border: 0
148
  }
149
  """
150
-
151
- get_window_url_params = """
152
- function(url_params) {
153
- const params = new URLSearchParams(window.location.search);
154
- url_params = Object.fromEntries(params);
155
- return url_params;
156
- }
157
- """
 
13
  white-space: nowrap;
14
  }
15
 
16
+ .custom-modal:not([style*="display: none"]):not(.hidden) {
17
+ position: fixed !important;
18
+ top: 0 !important;
19
+ left: 0 !important;
20
+ width: 100vw !important;
21
+ height: 100vh !important;
22
+ background-color: rgba(0, 0, 0, 0.85) !important;
23
+ z-index: 10000 !important;
24
+ display: block !important;
25
+ overflow-y: auto !important;
26
+ pointer-events: auto !important;
27
+ padding: 5vh 0 !important;
28
+ }
29
+
30
+ .custom-modal[style*="display: none"],
31
+ .custom-modal.hidden,
32
+ .modal-hidden {
33
+ display: none !important;
34
+ visibility: hidden !important;
35
+ pointer-events: none !important;
36
+ position: absolute !important;
37
+ width: 0 !important;
38
+ height: 0 !important;
39
+ }
40
+
41
+ .modal-content {
42
+ background: white !important;
43
+ padding: 30px 50px !important;
44
+ border-radius: 12px;
45
+ width: 85% !important;
46
+ max-width: 1100px;
47
+ margin: 0 auto !important;
48
+ display: block !important;
49
+ height: auto !important;
50
+ min-height: 200px !important;
51
+ max-height: none !important;
52
+ z-index: 10001;
53
+ text-align: center;
54
+ box-shadow: 0 4px 20px rgba(0,0,0,0.3);
55
+ }
56
+
57
+ .modal-content .gradio-plot,
58
+ .modal-content .plot-container {
59
+ height: 400px !important;
60
+ min-height: 400px !important;
61
+ max-height: 400px !important;
62
+ width: 100% !important;
63
+ }
64
+
65
+ .modal-content .gradio-dataframe {
66
+ height: auto !important;
67
+ min-height: 50px !important;
68
+ max-height: 400px !important;
69
+ overflow-y: auto !important;
70
+ }
71
+
72
+ .modal-content .form,
73
+ .modal-content .gap {
74
+ gap: 0 !important;
75
+ padding: 0 !important;
76
+ margin: 0 !important;
77
+ }
78
+
79
+ .modal-content > * {
80
+ margin: 0 auto 20px auto !important;
81
+ flex: none !important;
82
+ display: block !important;
83
+ }
84
+
85
  /* Full width space */
86
  .gradio-container {
87
+ display: flex !important;
88
+ flex-direction: column !important;
89
+ align-items: center !important;
90
+ width: 95% !important;
91
+ max-width: 95% !important;
92
+ margin-left: auto !important;
93
+ margin-right: auto !important;
94
+ }
95
+
96
+ .gradio-group {
97
+ background-color: #fff !important;
98
+ border: none !important;
99
+ box-shadow: none !important;
100
+ }
101
+
102
+ #compare-button-full {
103
+ height: 100% !important;
104
+ width: 100% !important;
105
+ display: flex !important;
106
+ align-items: center !important;
107
+ justify-content: center !important;
108
+ min-height: 100px;
109
+ }
110
+
111
+ #quant-config-container {
112
+ border: 1px solid #e5e7eb !important;
113
+ border-radius: 8px !important;
114
+ background-color: transparent !important;
115
+ }
116
+
117
+ .quant-config-header {
118
+ border-radius: 8px 8px 0 0;
119
+ font-weight: 600;
120
+ background-color: #fff;
121
+ color: #71717a;
122
  }
123
 
124
  /* Text style and margins */
 
147
  background: none;
148
  border: none;
149
  }
150
+
151
  #search-bar {
152
  padding: 0px;
153
  }
 
174
 
175
  /* 100% scale*/
176
  @media (resolution: 96dpi), (min-resolution: 1dppx) and (max-resolution: 1.25dppx) {
177
+ #model_comparison {
178
  height: 6rem !important;
179
  overflow: auto !important;
180
  }
 
196
  }
197
 
198
  #component-31 {
199
+ margin-top: 0.5rem !important;
200
  }
201
  }
202
 
 
 
 
203
  #model_comparison {
204
  height: 6rem !important;
205
  overflow: auto !important;
 
209
  font-size: 0.7rem !important;
210
  }
211
 
212
+ .tab-buttons > div > button {
213
+ font-size: 18px !important;
214
  }
215
 
216
  /* Filters style */
 
247
  border: 0
248
  }
249
  """
 
 
 
 
 
 
 
 
src/display/utils.py CHANGED
@@ -1,4 +1,4 @@
1
- from dataclasses import dataclass, make_dataclass
2
  from enum import Enum
3
 
4
  import pandas as pd
@@ -44,39 +44,74 @@ class ColumnContent:
44
 
45
  auto_eval_column_dict = []
46
  # Init
47
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
48
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
49
- #Scores
50
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  for task in Tasks:
52
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
53
- auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", True)])
54
- auto_eval_column_dict.append(["model_size", ColumnContent, ColumnContent("#Size (G)", "number", True)])
55
- # Dummy column for the search bar (hidden by the custom CSS)
56
- auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
57
- # Model information
58
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False, hidden=True)])
59
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
60
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
61
- auto_eval_column_dict.append(["quant_type", ColumnContent, ColumnContent("Quant type", "str", False)])
62
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
63
- auto_eval_column_dict.append(["weight_dtype", ColumnContent, ColumnContent("Weight dtype", "str", False)])
64
- auto_eval_column_dict.append(["compute_dtype", ColumnContent, ColumnContent("Compute dtype", "str", False)])
65
- auto_eval_column_dict.append(["merged", ColumnContent, ColumnContent("Merged", "bool", False, hidden=True)])
66
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
67
- # auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
68
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
69
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, hidden=True)])
70
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
71
- auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
72
- auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
73
- auto_eval_column_dict.append(["double_quant", ColumnContent, ColumnContent("Double Quant", "bool", False)])
74
- auto_eval_column_dict.append(["group_size", ColumnContent, ColumnContent("Group Size", "bool", False)])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  # We use make dataclass to dynamically fill the scores from Tasks
76
  # auto_eval_column_dict.sort(key=lambda x: x[0])
77
  sorted_columns = sorted(auto_eval_column_dict[3:], key=lambda x: x[0])
78
  sorted_auto_eval_column_dict = auto_eval_column_dict[:3] + sorted_columns
79
- AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 
80
 
81
  @dataclass(frozen=True)
82
  class EvalQueueColumn: # Queue column
@@ -87,31 +122,28 @@ class EvalQueueColumn: # Queue column
87
  weight_type = ColumnContent("weight_type", "str", "Original")
88
  status = ColumnContent("status", "str", True)
89
 
 
90
 
91
  baseline_row = {
92
- AutoEvalColumn.model.name: "<p>Baseline</p>",
93
- AutoEvalColumn.revision.name: "N/A",
94
- AutoEvalColumn.precision.name: None,
95
- AutoEvalColumn.merged.name: False,
96
- AutoEvalColumn.average.name: 31.0,
97
- AutoEvalColumn.arc.name: 25.0,
98
- # AutoEvalColumn.hellaswag.name: 25.0,
99
- # AutoEvalColumn.truthfulqa.name: 25.0,
100
- AutoEvalColumn.winogrande.name: 50.0,
101
- # AutoEvalColumn.gsm8k.name: 0.21,
102
- AutoEvalColumn.dummy.name: "baseline",
103
- AutoEvalColumn.model_type.name: "",
104
- AutoEvalColumn.flagged.name: False,
105
- # low-bite new params
106
- AutoEvalColumn.mmlu.name: 25.0,
107
- AutoEvalColumn.lambada_openai.name: 25.0,
108
- AutoEvalColumn.hellaswag.name: 25.0,
109
- AutoEvalColumn.piqa.name: 25.0,
110
- AutoEvalColumn.truthfulqa_mc.name: 25.0,
111
- AutoEvalColumn.openbookqa.name: 25.0,
112
- AutoEvalColumn.boolq.name: True,
113
- AutoEvalColumn.arc_easy.name: 25.0,
114
- AutoEvalColumn.double_quant.name: False,
115
  }
116
 
117
  # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
@@ -123,20 +155,16 @@ baseline_row = {
123
  # GSM8K: paper
124
  # Define the human baselines
125
  human_baseline_row = {
126
- AutoEvalColumn.model.name: "<p>Human performance</p>",
127
- AutoEvalColumn.revision.name: "N/A",
128
- AutoEvalColumn.precision.name: None,
129
- AutoEvalColumn.average.name: 92.75,
130
- AutoEvalColumn.merged.name: False,
131
- AutoEvalColumn.arc.name: 80.0,
132
- # AutoEvalColumn.hellaswag.name: 95.0,
133
- # AutoEvalColumn.mmlu.name: 89.8,
134
- # AutoEvalColumn.truthfulqa.name: 94.0,
135
- AutoEvalColumn.winogrande.name: 94.0,
136
- # AutoEvalColumn.gsm8k.name: 100,
137
- AutoEvalColumn.dummy.name: "human_baseline",
138
- AutoEvalColumn.model_type.name: "",
139
- AutoEvalColumn.flagged.name: False,
140
  }
141
 
142
  @dataclass
@@ -355,8 +383,8 @@ class Precision(Enum):
355
 
356
 
357
  # Column selection
358
- COLS = [c.name for c in fields(AutoEvalColumn)]
359
- TYPES = [c.type for c in fields(AutoEvalColumn)]
360
 
361
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
362
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
@@ -383,4 +411,4 @@ NUMERIC_MODELSIZE = {
383
  "~48": pd.Interval(36, 48, closed="right"),
384
  "~64": pd.Interval(48, 64, closed="right"),
385
  ">72": pd.Interval(64, 200, closed="right"),
386
- }
 
1
+ from dataclasses import dataclass, make_dataclass, field
2
  from enum import Enum
3
 
4
  import pandas as pd
 
44
 
45
  auto_eval_column_dict = []
46
  # Init
47
auto_eval_column_dict.append([
    "model_type_symbol",
    ColumnContent,
    # Restore the original column spec: ColumnContent(col_name, type,
    # displayed_by_default, never_hidden). The refactor passed
    # ("T", "Type", "str"), which shifts every positional argument by one and
    # drops never_hidden=True — but the symbol column must stay pinned, like
    # the "model" column below, for the never_hidden column-ordering logic.
    field(default_factory=lambda: ColumnContent("T", "str", True, never_hidden=True)),
])
52
+
53
+ auto_eval_column_dict.append([
54
+ "model",
55
+ ColumnContent,
56
+ field(default_factory=lambda: ColumnContent("Model", "markdown", True, never_hidden=True))
57
+ ])
58
+
59
+ # Scores
60
+ auto_eval_column_dict.append([
61
+ "average",
62
+ ColumnContent,
63
+ field(default_factory=lambda: ColumnContent("Average ⬆️", "number", True))
64
+ ])
65
+
66
  for task in Tasks:
67
+ auto_eval_column_dict.append([
68
+ task.name,
69
+ ColumnContent,
70
+ field(default_factory=lambda t=task: ColumnContent(t.value.col_name, "number", True))
71
+ ])
72
+
73
+ auto_eval_column_dict.append([
74
+ "params",
75
+ ColumnContent,
76
+ field(default_factory=lambda: ColumnContent("#Params (B)", "number", True))
77
+ ])
78
+
79
+ auto_eval_column_dict.append([
80
+ "model_size",
81
+ ColumnContent,
82
+ field(default_factory=lambda: ColumnContent("#Size (G)", "number", True))
83
+ ])
84
+
85
+ # Dummy column for the search bar
86
+ auto_eval_column_dict.append([
87
+ "dummy",
88
+ ColumnContent,
89
+ field(default_factory=lambda: ColumnContent("model_name_for_query", "str", False, dummy=True))
90
+ ])
91
+
92
+ auto_eval_column_dict.append(["model_type", ColumnContent, field(default_factory=lambda: ColumnContent("Type", "str", False, hidden=True))])
93
+ auto_eval_column_dict.append(["architecture", ColumnContent, field(default_factory=lambda: ColumnContent("Architecture", "str", False))])
94
+ auto_eval_column_dict.append(["weight_type", ColumnContent, field(default_factory=lambda: ColumnContent("Weight type", "str", False, True))])
95
+ auto_eval_column_dict.append(["quant_type", ColumnContent, field(default_factory=lambda: ColumnContent("Quant type", "str", False))])
96
+ auto_eval_column_dict.append(["precision", ColumnContent, field(default_factory=lambda: ColumnContent("Precision", "str", False))])
97
+ auto_eval_column_dict.append(["weight_dtype", ColumnContent, field(default_factory=lambda: ColumnContent("Weight dtype", "str", False))])
98
+ auto_eval_column_dict.append(["compute_dtype", ColumnContent, field(default_factory=lambda: ColumnContent("Compute dtype", "str", False))])
99
+ auto_eval_column_dict.append(["merged", ColumnContent, field(default_factory=lambda: ColumnContent("Merged", "bool", False, hidden=True))])
100
+ auto_eval_column_dict.append(["license", ColumnContent, field(default_factory=lambda: ColumnContent("Hub License", "str", False))])
101
+ auto_eval_column_dict.append(["likes", ColumnContent, field(default_factory=lambda: ColumnContent("Hub ❤️", "number", False))])
102
+ auto_eval_column_dict.append(["still_on_hub", ColumnContent, field(default_factory=lambda: ColumnContent("Available on the hub", "bool", False, hidden=True))])
103
+ auto_eval_column_dict.append(["revision", ColumnContent, field(default_factory=lambda: ColumnContent("Model sha", "str", False, False))])
104
+ auto_eval_column_dict.append(["flagged", ColumnContent, field(default_factory=lambda: ColumnContent("Flagged", "bool", False, hidden=True))])
105
+ auto_eval_column_dict.append(["moe", ColumnContent, field(default_factory=lambda: ColumnContent("MoE", "bool", False, hidden=True))])
106
+ auto_eval_column_dict.append(["double_quant", ColumnContent, field(default_factory=lambda: ColumnContent("Double Quant", "bool", False))])
107
+ auto_eval_column_dict.append(["group_size", ColumnContent, field(default_factory=lambda: ColumnContent("Group Size", "bool", False))])
108
  # We use make dataclass to dynamically fill the scores from Tasks
109
  # auto_eval_column_dict.sort(key=lambda x: x[0])
110
  sorted_columns = sorted(auto_eval_column_dict[3:], key=lambda x: x[0])
111
  sorted_auto_eval_column_dict = auto_eval_column_dict[:3] + sorted_columns
112
+ AutoEvalColumn = make_dataclass("AutoEvalColumn", sorted_auto_eval_column_dict, frozen=True)
113
+ auto_eval_cols = AutoEvalColumn()
114
+
115
 
116
  @dataclass(frozen=True)
117
  class EvalQueueColumn: # Queue column
 
122
  weight_type = ColumnContent("weight_type", "str", "Original")
123
  status = ColumnContent("status", "str", True)
124
 
125
+ eval_queue_cols = EvalQueueColumn()
126
 
127
  baseline_row = {
128
+ auto_eval_cols.model.name: "<p>Baseline</p>",
129
+ auto_eval_cols.revision.name: "N/A",
130
+ auto_eval_cols.precision.name: None,
131
+ auto_eval_cols.merged.name: False,
132
+ auto_eval_cols.average.name: 31.0,
133
+ auto_eval_cols.arc.name: 25.0,
134
+ auto_eval_cols.winogrande.name: 50.0,
135
+ auto_eval_cols.dummy.name: "baseline",
136
+ auto_eval_cols.model_type.name: "",
137
+ auto_eval_cols.flagged.name: False,
138
+ auto_eval_cols.mmlu.name: 25.0,
139
+ auto_eval_cols.lambada_openai.name: 25.0,
140
+ auto_eval_cols.hellaswag.name: 25.0,
141
+ auto_eval_cols.piqa.name: 25.0,
142
+ auto_eval_cols.truthfulqa_mc.name: 25.0,
143
+ auto_eval_cols.openbookqa.name: 25.0,
144
+ auto_eval_cols.boolq.name: True,
145
+ auto_eval_cols.arc_easy.name: 25.0,
146
+ auto_eval_cols.double_quant.name: False,
 
 
 
 
147
  }
148
 
149
  # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 
155
  # GSM8K: paper
156
  # Define the human baselines
157
  human_baseline_row = {
158
+ auto_eval_cols.model.name: "<p>Human performance</p>",
159
+ auto_eval_cols.revision.name: "N/A",
160
+ auto_eval_cols.precision.name: None,
161
+ auto_eval_cols.average.name: 92.75,
162
+ auto_eval_cols.merged.name: False,
163
+ auto_eval_cols.arc.name: 80.0,
164
+ auto_eval_cols.winogrande.name: 94.0,
165
+ auto_eval_cols.dummy.name: "human_baseline",
166
+ auto_eval_cols.model_type.name: "",
167
+ auto_eval_cols.flagged.name: False,
 
 
 
 
168
  }
169
 
170
  @dataclass
 
383
 
384
 
385
  # Column selection
386
+ COLS = [c.name for c in fields(auto_eval_cols)]
387
+ TYPES = [c.type for c in fields(auto_eval_cols)]
388
 
389
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
390
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
411
  "~48": pd.Interval(36, 48, closed="right"),
412
  "~64": pd.Interval(48, 64, closed="right"),
413
  ">72": pd.Interval(64, 200, closed="right"),
414
+ }
src/leaderboard/filter_models.py CHANGED
@@ -1,5 +1,5 @@
1
  from src.display.formatting import model_hyperlink
2
- from src.display.utils import AutoEvalColumn
3
 
4
  # Models which have been flagged by users as being problematic for a reason or another
5
  # (Model name to forum discussion link)
@@ -130,8 +130,9 @@ DO_NOT_SUBMIT_MODELS = [
130
 
131
  def flag_models(leaderboard_data: list[dict]):
132
  for model_data in leaderboard_data:
 
133
  # Merges and moes are flagged automatically
134
- if model_data[AutoEvalColumn.flagged.name] == True:
135
  flag_key = "merged"
136
  else:
137
  flag_key = model_data["model_name_for_query"]
@@ -143,11 +144,11 @@ def flag_models(leaderboard_data: list[dict]):
143
  f"See discussion #{issue_num}",
144
  )
145
  model_data[
146
- AutoEvalColumn.model.name
147
- ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
148
- model_data[AutoEvalColumn.flagged.name] = True
149
  else:
150
- model_data[AutoEvalColumn.flagged.name] = False
151
 
152
 
153
  def remove_forbidden_models(leaderboard_data: list[dict]):
 
1
  from src.display.formatting import model_hyperlink
2
+ from src.display.utils import auto_eval_cols
3
 
4
  # Models which have been flagged by users as being problematic for a reason or another
5
  # (Model name to forum discussion link)
 
130
 
131
  def flag_models(leaderboard_data: list[dict]):
132
  for model_data in leaderboard_data:
133
+ # 修改点 2:将 AutoEvalColumn 替换为 auto_eval_cols
134
  # Merges and moes are flagged automatically
135
+ if model_data.get(auto_eval_cols.flagged.name) == True:
136
  flag_key = "merged"
137
  else:
138
  flag_key = model_data["model_name_for_query"]
 
144
  f"See discussion #{issue_num}",
145
  )
146
  model_data[
147
+ auto_eval_cols.model.name
148
+ ] = f"{model_data[auto_eval_cols.model.name]} has been flagged! {issue_link}"
149
+ model_data[auto_eval_cols.flagged.name] = True
150
  else:
151
+ model_data[auto_eval_cols.flagged.name] = False
152
 
153
 
154
  def remove_forbidden_models(leaderboard_data: list[dict]):
src/leaderboard/read_evals.py CHANGED
@@ -11,7 +11,7 @@ import numpy as np
11
  from huggingface_hub import ModelCard
12
 
13
  from src.display.formatting import make_clickable_model
14
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, QuantType, WeightDtype, ComputeDtype
15
 
16
 
17
  @dataclass
@@ -60,10 +60,10 @@ class EvalResult:
60
  quant_type = QuantType.from_str(str(config.get("quant_type", "GPTQ")))
61
  weight_dtype = WeightDtype.from_str(data["task_info"].get("weight_dtype", "int4"))
62
  compute_dtype = ComputeDtype.from_str(data["task_info"].get("compute_dtype", "bfloat16"))
63
- # double_quant = data["quantization_config"].get("bnb_4bit_use_double_quant", False)
64
  model_params = round(float(config["model_params"]), 2)
65
  model_size = round(float(config["model_size"]), 2)
66
- # group_size = data["quantization_config"].get("group_size", -1)
67
  if data.get("quantization_config", None):
68
  double_quant = data["quantization_config"].get("bnb_4bit_use_double_quant", False)
69
  group_size = data["quantization_config"].get("group_size", -1)
@@ -81,7 +81,6 @@ class EvalResult:
81
 
82
  if local and org_and_model[0] != "Intel":
83
  org_and_model = config.get("model_name").split("/")
84
- # temporary "local"
85
  org_and_model = ["local", org_and_model[-1]]
86
  quant_type = QuantType.autoround
87
 
@@ -95,7 +94,7 @@ class EvalResult:
95
  result_key = f"{org}_{model}_{precision.value.name}"
96
  full_model = "/".join(org_and_model)
97
 
98
- # Extract results available in this file (some results are split in several files)
99
  results = {}
100
  for task in Tasks:
101
  task = task.value
@@ -137,19 +136,12 @@ class EvalResult:
137
  try:
138
  with open(request_file, "r") as f:
139
  request = json.load(f)
140
- # self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
141
- # self.precision = WeightType[request.get("weight_type", "Original")]
142
- # self.num_params = request.get("model_size", 0) / 2 # need fix
143
  self.date = request.get("submitted_time", "")
144
  self.architecture = request.get("architectures", "Unknown")
145
  self.status = request.get("status", "Failed")
146
  except Exception as e:
147
- print(requests_path, self.full_model,
148
- self.quant_type.value.name, self.precision.value.name,
149
- self.weight_dtype.value.name, self.compute_dtype.value.name)
150
  self.status = "Failed"
151
  print(f"Could not find request file for {self.org}/{self.model}")
152
- print(traceback.format_exc())
153
 
154
  def update_with_dynamic_file_dict(self, file_dict):
155
  self.license = file_dict.get("license", "?")
@@ -161,57 +153,67 @@ class EvalResult:
161
 
162
  def to_dict(self):
163
  """Converts the Eval Result to a dict compatible with our dataframe display"""
164
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
165
-
 
 
166
  data_dict = {
167
- "eval_name": self.eval_name, # not a column, just a save name,
168
- AutoEvalColumn.precision.name: self.precision.value.name,
169
- AutoEvalColumn.quant_type.name: self.quant_type.value.name,
170
- AutoEvalColumn.model_type_symbol.name: self.quant_type.value.symbol,
171
- AutoEvalColumn.weight_dtype.name: self.weight_dtype.value.name,
172
- AutoEvalColumn.compute_dtype.name: self.compute_dtype.value.name,
173
- AutoEvalColumn.double_quant.name: self.double_quant,
174
- AutoEvalColumn.model_type.name: self.model_type.value.name,
175
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
176
- AutoEvalColumn.architecture.name: self.architecture,
177
- AutoEvalColumn.model.name: make_clickable_model(self.full_model, self.result_file),
178
- AutoEvalColumn.dummy.name: self.full_model,
179
- AutoEvalColumn.revision.name: self.revision,
180
- AutoEvalColumn.average.name: average,
181
- AutoEvalColumn.license.name: self.license,
182
- AutoEvalColumn.likes.name: self.likes,
183
- AutoEvalColumn.params.name: self.num_params,
184
- AutoEvalColumn.model_size.name: self.model_size,
185
- AutoEvalColumn.group_size.name: self.group_size,
186
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
187
- AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
188
- AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
189
- AutoEvalColumn.flagged.name: self.flagged
190
  }
191
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  for task in Tasks:
193
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
194
 
195
  return data_dict
196
 
197
 
 
198
  def get_request_file_for_model(requests_path, model_name,
199
  quant_type, precision, weight_dtype, compute_dtype):
200
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
201
- # {model_path}_eval_request_{private}_{quant_type}_{precision}_{weight_dtype}_{compute_dtype}.json
202
  request_files = os.path.join(
203
  requests_path,
204
  f"{model_name}_eval_request_*.json",
205
  )
206
  request_files = glob.glob(request_files)
207
 
208
- # Select correct request file (precision)
209
  request_file = ""
210
  request_files = sorted(request_files, reverse=True)
211
  for tmp_request_file in request_files:
212
  with open(tmp_request_file, "r") as f:
213
  req_content = json.load(f)
214
- print(model_name, req_content["precision"], precision.split(".")[-1], str(req_content["quant_type"]), quant_type, req_content["weight_dtype"], weight_dtype.split(".")[-1],req_content["compute_dtype"], compute_dtype.split(".")[-1] )
215
  if (
216
  req_content["status"] in ["Finished"]
217
  and req_content["precision"] == precision.split(".")[-1]
@@ -236,48 +238,48 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
236
  model_result_filepaths = []
237
 
238
  for root, _, files in os.walk(results_path):
239
- # We should only have json files in model results
240
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):
241
  continue
242
 
243
- # Sort the files by date
244
  try:
245
  files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
246
- except dateutil.parser._parser.ParserError:
247
  files = [files[-1]]
248
 
249
  for file in files:
250
  model_result_filepaths.append(os.path.join(root, file))
251
 
252
- with open(dynamic_path) as f:
253
- dynamic_data = json.load(f)
 
 
254
 
255
  eval_results = {}
256
  for model_result_filepath in model_result_filepaths:
257
- # Creation of result
258
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
 
259
  eval_result.update_with_request_file(requests_path)
 
260
  if eval_result.full_model in dynamic_data:
261
- # eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
262
- # Hardcoding because of gating problem
263
  if "meta-llama" in eval_result.full_model:
264
  eval_result.still_on_hub = True
265
 
266
- # Store results of same eval together
267
  eval_name = eval_result.eval_name
268
- if eval_name in eval_results.keys():
269
  eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
270
  else:
271
  eval_results[eval_name] = eval_result
272
 
273
-
274
  results = []
275
  for v in eval_results.values():
276
  try:
277
  if v.status == "Finished":
278
- v.to_dict() # we test if the dict version is complete
279
  results.append(v)
280
- except KeyError: # not all eval values present
 
281
  continue
282
 
283
  return results
 
 
11
  from huggingface_hub import ModelCard
12
 
13
  from src.display.formatting import make_clickable_model
14
+ from src.display.utils import auto_eval_cols, ModelType, Tasks, Precision, WeightType, QuantType, WeightDtype, ComputeDtype
15
 
16
 
17
  @dataclass
 
60
  quant_type = QuantType.from_str(str(config.get("quant_type", "GPTQ")))
61
  weight_dtype = WeightDtype.from_str(data["task_info"].get("weight_dtype", "int4"))
62
  compute_dtype = ComputeDtype.from_str(data["task_info"].get("compute_dtype", "bfloat16"))
63
+
64
  model_params = round(float(config["model_params"]), 2)
65
  model_size = round(float(config["model_size"]), 2)
66
+
67
  if data.get("quantization_config", None):
68
  double_quant = data["quantization_config"].get("bnb_4bit_use_double_quant", False)
69
  group_size = data["quantization_config"].get("group_size", -1)
 
81
 
82
  if local and org_and_model[0] != "Intel":
83
  org_and_model = config.get("model_name").split("/")
 
84
  org_and_model = ["local", org_and_model[-1]]
85
  quant_type = QuantType.autoround
86
 
 
94
  result_key = f"{org}_{model}_{precision.value.name}"
95
  full_model = "/".join(org_and_model)
96
 
97
+ # Extract results
98
  results = {}
99
  for task in Tasks:
100
  task = task.value
 
136
  try:
137
  with open(request_file, "r") as f:
138
  request = json.load(f)
 
 
 
139
  self.date = request.get("submitted_time", "")
140
  self.architecture = request.get("architectures", "Unknown")
141
  self.status = request.get("status", "Failed")
142
  except Exception as e:
 
 
 
143
  self.status = "Failed"
144
  print(f"Could not find request file for {self.org}/{self.model}")
 
145
 
146
  def update_with_dynamic_file_dict(self, file_dict):
147
  self.license = file_dict.get("license", "?")
 
153
 
154
  def to_dict(self):
155
  """Converts the Eval Result to a dict compatible with our dataframe display"""
156
+
157
+ valid_results = [v for v in self.results.values() if v is not None]
158
+ average = sum(valid_results) / len(Tasks) if len(Tasks) > 0 else 0
159
+
160
  data_dict = {
161
+ "eval_name": self.eval_name,
162
+ "date": self.date,
163
+ auto_eval_cols.precision.name: self.precision.value.name,
164
+ auto_eval_cols.quant_type.name: self.quant_type.value.name,
165
+ auto_eval_cols.model_type_symbol.name: self.quant_type.value.symbol,
166
+ auto_eval_cols.weight_dtype.name: self.weight_dtype.value.name,
167
+ auto_eval_cols.compute_dtype.name: self.compute_dtype.value.name,
168
+ auto_eval_cols.model.name: make_clickable_model(self.full_model, self.result_file),
169
+ auto_eval_cols.revision.name: self.revision,
170
+ auto_eval_cols.average.name: average,
171
+ auto_eval_cols.model_size.name: self.model_size,
172
+ auto_eval_cols.dummy.name: self.full_model,
 
 
 
 
 
 
 
 
 
 
 
173
  }
174
 
175
+ data_dict[auto_eval_cols.still_on_hub.name] = self.still_on_hub
176
+ data_dict[auto_eval_cols.flagged.name] = self.flagged
177
+
178
+ if hasattr(auto_eval_cols, "double_quant"):
179
+ data_dict[auto_eval_cols.double_quant.name] = self.double_quant
180
+ if hasattr(auto_eval_cols, "architecture"):
181
+ data_dict[auto_eval_cols.architecture.name] = self.architecture
182
+ if hasattr(auto_eval_cols, "params"):
183
+ data_dict[auto_eval_cols.params.name] = self.num_params
184
+ if hasattr(auto_eval_cols, "license"):
185
+ data_dict[auto_eval_cols.license.name] = self.license
186
+ if hasattr(auto_eval_cols, "likes"):
187
+ data_dict[auto_eval_cols.likes.name] = self.likes
188
+ if hasattr(auto_eval_cols, "group_size"):
189
+ data_dict[auto_eval_cols.group_size.name] = self.group_size
190
+
191
+ if hasattr(auto_eval_cols, "merged"):
192
+ data_dict[auto_eval_cols.merged.name] = "merge" in (self.tags if self.tags else [])
193
+ if hasattr(auto_eval_cols, "moe"):
194
+ data_dict[auto_eval_cols.moe.name] = ("moe" in (self.tags if self.tags else [])) or "moe" in self.full_model.lower()
195
+
196
  for task in Tasks:
197
+ data_dict[task.value.col_name] = self.results.get(task.value.benchmark, 0)
198
 
199
  return data_dict
200
 
201
 
202
+
203
  def get_request_file_for_model(requests_path, model_name,
204
  quant_type, precision, weight_dtype, compute_dtype):
205
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
 
206
  request_files = os.path.join(
207
  requests_path,
208
  f"{model_name}_eval_request_*.json",
209
  )
210
  request_files = glob.glob(request_files)
211
 
 
212
  request_file = ""
213
  request_files = sorted(request_files, reverse=True)
214
  for tmp_request_file in request_files:
215
  with open(tmp_request_file, "r") as f:
216
  req_content = json.load(f)
 
217
  if (
218
  req_content["status"] in ["Finished"]
219
  and req_content["precision"] == precision.split(".")[-1]
 
238
  model_result_filepaths = []
239
 
240
  for root, _, files in os.walk(results_path):
 
241
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):
242
  continue
243
 
 
244
  try:
245
  files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
246
+ except Exception:
247
  files = [files[-1]]
248
 
249
  for file in files:
250
  model_result_filepaths.append(os.path.join(root, file))
251
 
252
+ dynamic_data = {}
253
+ if os.path.exists(dynamic_path):
254
+ with open(dynamic_path) as f:
255
+ dynamic_data = json.load(f)
256
 
257
  eval_results = {}
258
  for model_result_filepath in model_result_filepaths:
 
259
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
260
+
261
  eval_result.update_with_request_file(requests_path)
262
+
263
  if eval_result.full_model in dynamic_data:
264
+ eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
 
265
  if "meta-llama" in eval_result.full_model:
266
  eval_result.still_on_hub = True
267
 
 
268
  eval_name = eval_result.eval_name
269
+ if eval_name in eval_results:
270
  eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
271
  else:
272
  eval_results[eval_name] = eval_result
273
 
 
274
  results = []
275
  for v in eval_results.values():
276
  try:
277
  if v.status == "Finished":
278
+ v.to_dict()
279
  results.append(v)
280
+ except Exception as e:
281
+ print(f"Error processing {v.eval_name}: {e}")
282
  continue
283
 
284
  return results
285
+
src/populate.py CHANGED
@@ -4,7 +4,7 @@ import os
4
  import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
8
  from src.leaderboard.filter_models import filter_models_flags
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
 
@@ -12,20 +12,23 @@ from src.leaderboard.read_evals import get_raw_eval_results
12
  def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
13
  raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
14
  all_data_json = [v.to_dict() for v in raw_data]
15
- print(all_data_json)
16
  all_data_json.append(baseline_row)
17
  filter_models_flags(all_data_json)
18
- print("Keys in the first record of all_data_json:", all_data_json[0].keys())
19
-
20
 
21
  df = pd.DataFrame.from_records(all_data_json)
22
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
23
- print("Columns used in DataFrame:", cols, df.columns)
24
- df = df[cols].round(decimals=2)
25
-
 
 
 
26
 
27
- # filter out if any of the benchmarks have not been produced
28
- df = df[has_no_nan_values(df, benchmark_cols)]
 
 
29
  return raw_data, df
30
 
31
 
@@ -39,8 +42,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
39
  with open(file_path) as fp:
40
  data = json.load(fp)
41
 
42
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
43
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
44
 
45
  all_evals.append(data)
46
  elif ".md" not in entry:
@@ -51,14 +54,18 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
51
  with open(file_path) as fp:
52
  data = json.load(fp)
53
 
54
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
55
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
56
  all_evals.append(data)
57
 
58
  pending_list = [e for e in all_evals if e["status"] in ["Pending", "Rerun", "Waiting"]]
59
  running_list = [e for e in all_evals if e["status"] == "Running"]
60
  finished_list = [e for e in all_evals if e["status"].startswith("Finished") or e["status"] == "PENDING_NEW_EVAL"]
61
- df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
62
- df_running = pd.DataFrame.from_records(running_list, columns=cols)
63
- df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
64
- return df_finished[cols], df_running[cols], df_pending[cols]
 
 
 
 
 
4
  import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
+ from src.display.utils import auto_eval_cols, eval_queue_cols, baseline_row
8
  from src.leaderboard.filter_models import filter_models_flags
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
 
 
12
  def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
13
  raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
14
  all_data_json = [v.to_dict() for v in raw_data]
15
+
16
  all_data_json.append(baseline_row)
17
  filter_models_flags(all_data_json)
 
 
18
 
19
  df = pd.DataFrame.from_records(all_data_json)
20
+
21
+ avg_col = auto_eval_cols.average.name
22
+ if avg_col in df.columns:
23
+ df = df.sort_values(by=[avg_col], ascending=False)
24
+
25
+ existing_cols = [c for c in cols if c in df.columns]
26
+ df = df[existing_cols].round(decimals=2)
27
 
28
+ existing_benchmarks = [c for c in benchmark_cols if c in df.columns]
29
+ if existing_benchmarks:
30
+ df = df[has_no_nan_values(df, existing_benchmarks)]
31
+
32
  return raw_data, df
33
 
34
 
 
42
  with open(file_path) as fp:
43
  data = json.load(fp)
44
 
45
+ data[eval_queue_cols.model.name] = make_clickable_model(data["model"])
46
+ data[eval_queue_cols.revision.name] = data.get("revision", "main")
47
 
48
  all_evals.append(data)
49
  elif ".md" not in entry:
 
54
  with open(file_path) as fp:
55
  data = json.load(fp)
56
 
57
+ data[eval_queue_cols.model.name] = make_clickable_model(data["model"])
58
+ data[eval_queue_cols.revision.name] = data.get("revision", "main")
59
  all_evals.append(data)
60
 
61
  pending_list = [e for e in all_evals if e["status"] in ["Pending", "Rerun", "Waiting"]]
62
  running_list = [e for e in all_evals if e["status"] == "Running"]
63
  finished_list = [e for e in all_evals if e["status"].startswith("Finished") or e["status"] == "PENDING_NEW_EVAL"]
64
+
65
+ existing_q_cols = [c for c in cols if c in pd.DataFrame(all_evals).columns] if all_evals else cols
66
+
67
+ df_pending = pd.DataFrame.from_records(pending_list, columns=existing_q_cols)
68
+ df_running = pd.DataFrame.from_records(running_list, columns=existing_q_cols)
69
+ df_finished = pd.DataFrame.from_records(finished_list, columns=existing_q_cols)
70
+
71
+ return df_finished[existing_q_cols], df_running[existing_q_cols], df_pending[existing_q_cols]
src/tools/plots.py CHANGED
@@ -4,43 +4,44 @@ import plotly.express as px
4
  from plotly.graph_objs import Figure
5
 
6
  from src.leaderboard.filter_models import FLAGGED_MODELS
7
- from src.display.utils import human_baseline_row as HUMAN_BASELINE, AutoEvalColumn, Tasks, Task, BENCHMARK_COLS
8
  from src.leaderboard.read_evals import EvalResult
9
 
10
 
11
-
12
  def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
13
  """
14
  Generates a DataFrame containing the maximum scores until each date.
15
-
16
- :param results_df: A DataFrame containing result information including metric scores and dates.
17
- :return: A new DataFrame containing the maximum scores until each date for every metric.
18
  """
19
- # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
20
- results_df = pd.DataFrame(raw_data)
21
- #results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
22
- results_df.sort_values(by="date", inplace=True)
 
23
 
24
- # Step 2: Initialize the scores dictionary
25
- scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}
26
 
27
- # Step 3: Iterate over the rows of the DataFrame and update the scores dictionary
28
- for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]:
29
  current_max = 0
30
  last_date = ""
31
  column = task.col_name
 
32
  for _, row in results_df.iterrows():
33
- current_model = row["full_model"]
34
- # We ignore models that are flagged/no longer on the hub/not finished
35
- to_ignore = not row["still_on_hub"] or row["flagged"] or current_model in FLAGGED_MODELS or row["status"] != "Finished"
 
 
 
36
  if to_ignore:
37
  continue
38
 
39
- current_date = row["date"]
 
 
40
  if task.benchmark == "Average":
41
- current_score = np.mean(list(row["results"].values()))
42
  else:
43
- current_score = row["results"][task.benchmark]
44
 
45
  if current_score > current_max:
46
  if current_date == last_date and len(scores[column]) > 0:
@@ -50,57 +51,36 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
50
  current_max = current_score
51
  last_date = current_date
52
 
53
- # Step 4: Return all dictionaries as DataFrames
54
  return {k: pd.DataFrame(v) for k, v in scores.items()}
55
 
56
 
57
- def create_plot_df(scores_df: dict[str: pd.DataFrame]) -> pd.DataFrame:
58
- """
59
- Transforms the scores DataFrame into a new format suitable for plotting.
60
-
61
- :param scores_df: A DataFrame containing metric scores and dates.
62
- :return: A new DataFrame reshaped for plotting purposes.
63
- """
64
- # Initialize the list to store DataFrames
65
  dfs = []
66
 
67
- # Iterate over the cols and create a new DataFrame for each column
68
- for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
69
- d = scores_df[col].reset_index(drop=True)
70
- d["task"] = col
71
- dfs.append(d)
72
 
73
- # Concatenate all the created DataFrames
74
- concat_df = pd.concat(dfs, ignore_index=True)
75
 
76
- # Sort values by 'date'
77
  concat_df.sort_values(by="date", inplace=True)
78
  concat_df.reset_index(drop=True, inplace=True)
79
  return concat_df
80
 
81
 
82
- def create_metric_plot_obj(
83
- df: pd.DataFrame, metrics: list[str], title: str
84
- ) -> Figure:
85
- """
86
- Create a Plotly figure object with lines representing different metrics
87
- and horizontal dotted lines representing human baselines.
88
-
89
- :param df: The DataFrame containing the metric values, names, and dates.
90
- :param metrics: A list of strings representing the names of the metrics
91
- to be included in the plot.
92
- :param title: A string representing the title of the plot.
93
- :return: A Plotly figure object with lines representing metrics and
94
- horizontal dotted lines representing human baselines.
95
- """
96
 
97
- # Filter the DataFrame based on the specified metrics
98
  df = df[df["task"].isin(metrics)]
99
 
100
- # Filter the human baselines based on the specified metrics
101
  filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
102
 
103
- # Create a line figure using plotly express with specified markers and custom data
104
  fig = px.line(
105
  df,
106
  x="date",
@@ -111,33 +91,21 @@ def create_metric_plot_obj(
111
  title=title,
112
  )
113
 
114
- # Update hovertemplate for better hover interaction experience
115
  fig.update_traces(
116
- hovertemplate="<br>".join(
117
- [
118
- "Model Name: %{customdata[2]}",
119
- "Metric Name: %{customdata[0]}",
120
- "Date: %{x}",
121
- "Metric Value: %{y}",
122
- ]
123
- )
124
  )
125
 
126
- # Update the range of the y-axis
127
  fig.update_layout(yaxis_range=[0, 100])
 
128
 
129
- # Create a dictionary to hold the color mapping for each metric
130
- metric_color_mapping = {}
131
-
132
- # Map each metric name to its color in the figure
133
- for trace in fig.data:
134
- metric_color_mapping[trace.name] = trace.line.color
135
-
136
- # Iterate over filtered human baselines and add horizontal lines to the figure
137
  for metric, value in filtered_human_baselines.items():
138
- color = metric_color_mapping.get(metric, "blue") # Retrieve color from mapping; default to blue if not found
139
- location = "top left" if metric == "HellaSwag" else "bottom left" # Set annotation position
140
- # Add horizontal line with matched color and positioned annotation
141
  fig.add_hline(
142
  y=value,
143
  line_dash="dot",
@@ -148,9 +116,4 @@ def create_metric_plot_obj(
148
  line_color=color,
149
  )
150
 
151
- return fig
152
-
153
-
154
- # Example Usage:
155
- # human_baselines dictionary is defined.
156
- # chart = create_metric_plot_obj(scores_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], human_baselines, "Graph Title")
 
4
  from plotly.graph_objs import Figure
5
 
6
  from src.leaderboard.filter_models import FLAGGED_MODELS
7
+ from src.display.utils import human_baseline_row as HUMAN_BASELINE, auto_eval_cols, Tasks, Task, BENCHMARK_COLS
8
  from src.leaderboard.read_evals import EvalResult
9
 
10
 
 
11
  def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
12
  """
13
  Generates a DataFrame containing the maximum scores until each date.
 
 
 
14
  """
15
+ all_data = [v.to_dict() for v in raw_data]
16
+ results_df = pd.DataFrame(all_data)
17
+
18
+ if "date" in results_df.columns:
19
+ results_df.sort_values(by="date", inplace=True)
20
 
21
+ scores = {k: [] for k in BENCHMARK_COLS + [auto_eval_cols.average.name]}
 
22
 
23
+ for task in [t.value for t in Tasks] + [Task("Average", "avg", auto_eval_cols.average.name)]:
 
24
  current_max = 0
25
  last_date = ""
26
  column = task.col_name
27
+
28
  for _, row in results_df.iterrows():
29
+ current_model = row.get("dummy", "Unknown")
30
+
31
+ still_on_hub = row.get(auto_eval_cols.still_on_hub.name, True)
32
+ is_flagged = row.get(auto_eval_cols.flagged.name, False)
33
+
34
+ to_ignore = not still_on_hub or is_flagged or current_model in FLAGGED_MODELS
35
  if to_ignore:
36
  continue
37
 
38
+ current_date = row.get("date", "")
39
+ if not current_date: continue
40
+
41
  if task.benchmark == "Average":
42
+ current_score = row.get(auto_eval_cols.average.name, 0)
43
  else:
44
+ current_score = row.get(task.col_name, 0)
45
 
46
  if current_score > current_max:
47
  if current_date == last_date and len(scores[column]) > 0:
 
51
  current_max = current_score
52
  last_date = current_date
53
 
 
54
  return {k: pd.DataFrame(v) for k, v in scores.items()}
55
 
56
 
57
+ def create_plot_df(scores_df: dict[str, pd.DataFrame]) -> pd.DataFrame:
58
+ """Reshapes the scores DataFrame for plotting."""
 
 
 
 
 
 
59
  dfs = []
60
 
61
+ for col in BENCHMARK_COLS + [auto_eval_cols.average.name]:
62
+ if col in scores_df and not scores_df[col].empty:
63
+ d = scores_df[col].reset_index(drop=True)
64
+ d["task"] = col
65
+ dfs.append(d)
66
 
67
+ if not dfs:
68
+ return pd.DataFrame(columns=["model", "date", "score", "task"])
69
 
70
+ concat_df = pd.concat(dfs, ignore_index=True)
71
  concat_df.sort_values(by="date", inplace=True)
72
  concat_df.reset_index(drop=True, inplace=True)
73
  return concat_df
74
 
75
 
76
+ def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) -> Figure:
77
+ if df.empty:
78
+ return px.line(title="No data available")
 
 
 
 
 
 
 
 
 
 
 
79
 
 
80
  df = df[df["task"].isin(metrics)]
81
 
 
82
  filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
83
 
 
84
  fig = px.line(
85
  df,
86
  x="date",
 
91
  title=title,
92
  )
93
 
 
94
  fig.update_traces(
95
+ hovertemplate="<br>".join([
96
+ "Model Name: %{customdata[2]}",
97
+ "Metric Name: %{customdata[0]}",
98
+ "Date: %{x}",
99
+ "Metric Value: %{y}",
100
+ ])
 
 
101
  )
102
 
 
103
  fig.update_layout(yaxis_range=[0, 100])
104
+ metric_color_mapping = {trace.name: trace.line.color for trace in fig.data}
105
 
 
 
 
 
 
 
 
 
106
  for metric, value in filtered_human_baselines.items():
107
+ color = metric_color_mapping.get(metric, "blue")
108
+ location = "top left" if metric == "HellaSwag" else "bottom left"
 
109
  fig.add_hline(
110
  y=value,
111
  line_dash="dot",
 
116
  line_color=color,
117
  )
118
 
119
+ return fig