refactor: update code for latest Gradio API
Browse files- Dockerfile +0 -17
- README.md +3 -2
- app.py +684 -174
- requirements.txt +20 -20
- src/display/css_html_js.py +109 -17
- src/display/utils.py +97 -69
- src/leaderboard/filter_models.py +7 -6
- src/leaderboard/read_evals.py +56 -54
- src/populate.py +25 -18
- src/tools/plots.py +43 -80
Dockerfile
DELETED
|
@@ -1,17 +0,0 @@
|
|
| 1 |
-
FROM python:3.10-slim
|
| 2 |
-
|
| 3 |
-
ENV DEBIAN_FRONTEND=noninteractive
|
| 4 |
-
WORKDIR /app
|
| 5 |
-
|
| 6 |
-
RUN apt-get update && apt-get install -y \
|
| 7 |
-
git git-lfs ffmpeg libsm6 libxext6 libgl1 \
|
| 8 |
-
&& rm -rf /var/lib/apt/lists/* \
|
| 9 |
-
&& git lfs install
|
| 10 |
-
|
| 11 |
-
RUN pip install --no-cache-dir -U pip setuptools wheel
|
| 12 |
-
|
| 13 |
-
COPY requirements.txt .
|
| 14 |
-
RUN pip install --no-cache-dir -r requirements.txt
|
| 15 |
-
|
| 16 |
-
COPY . .
|
| 17 |
-
CMD ["python", "app.py"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -3,8 +3,9 @@ title: Low-bit Quantized Open LLM Leaderboard
|
|
| 3 |
emoji: 🏆
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: indigo
|
| 6 |
-
sdk:
|
| 7 |
-
sdk_version:
|
|
|
|
| 8 |
app_file: app.py
|
| 9 |
pinned: true
|
| 10 |
license: apache-2.0
|
|
|
|
| 3 |
emoji: 🏆
|
| 4 |
colorFrom: green
|
| 5 |
colorTo: indigo
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 6.5.1
|
| 8 |
+
python_version: 3.11
|
| 9 |
app_file: app.py
|
| 10 |
pinned: true
|
| 11 |
license: apache-2.0
|
app.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
import os
|
| 2 |
-
|
| 3 |
import gradio as gr
|
| 4 |
import pandas as pd
|
| 5 |
import re
|
|
@@ -24,9 +23,7 @@ from src.display.utils import (
|
|
| 24 |
NUMERIC_INTERVALS,
|
| 25 |
NUMERIC_MODELSIZE,
|
| 26 |
TYPES,
|
| 27 |
-
# 改为导入实例
|
| 28 |
auto_eval_cols,
|
| 29 |
-
eval_queue_cols,
|
| 30 |
GroupDtype,
|
| 31 |
ModelType,
|
| 32 |
fields,
|
|
@@ -46,21 +43,36 @@ from src.tools.plots import (
|
|
| 46 |
create_plot_df,
|
| 47 |
create_scores_df,
|
| 48 |
)
|
| 49 |
-
from gradio_modal import Modal
|
| 50 |
import plotly.graph_objects as go
|
| 51 |
|
| 52 |
selected_indices = []
|
| 53 |
selected_values = {}
|
| 54 |
selected_dropdown_weight = 'All'
|
| 55 |
|
|
|
|
|
|
|
|
|
|
| 56 |
precision_to_dtype = {
|
| 57 |
-
"2bit": ["int2"],
|
| 58 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
}
|
| 60 |
|
| 61 |
dtype_to_precision = {
|
| 62 |
-
"int2": ["2bit"],
|
| 63 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
}
|
| 65 |
|
| 66 |
current_weightDtype = ["int2", "int3", "int4", "nf4", "fp4", "?"]
|
|
@@ -68,7 +80,7 @@ current_computeDtype = ['int8', 'bfloat16', 'float16', 'float32']
|
|
| 68 |
current_quant = [t.to_str() for t in QuantType if t != QuantType.QuantType_None]
|
| 69 |
current_precision = ['2bit', '3bit', '4bit', '8bit', '?']
|
| 70 |
|
| 71 |
-
|
| 72 |
def display_sort(key):
|
| 73 |
order = {"All": 0, "?": 1, "int2": 2, "int3": 3, "int4": 4, "fp4": 5, "nf4": 6, "float16": 7, "bfloat16": 8, "float32": 9}
|
| 74 |
return order.get(key, float('inf'))
|
|
@@ -77,260 +89,758 @@ def comp_display_sort(key):
|
|
| 77 |
order = {"All": 0, "?": 1, "int8": 2, "float16": 3, "bfloat16": 4, "float32": 5}
|
| 78 |
return order.get(key, float('inf'))
|
| 79 |
|
| 80 |
-
# --- 更新逻辑保持逻辑不变,仅做属性名适配 ---
|
| 81 |
def update_quantization_types(selected_quant):
|
| 82 |
-
global current_weightDtype
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
if set(current_quant) == set(selected_quant):
|
| 84 |
-
return [
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
if any(value != '✖ None' for value in selected_quant):
|
| 88 |
selected_weight = ['All', '?', 'int2', 'int3', 'int4', 'nf4', 'fp4', 'int8']
|
| 89 |
selected_compute = ['All', '?', 'int8', 'float16', 'bfloat16', 'float32']
|
| 90 |
selected_precision = ["2bit", "3bit", "4bit", "8bit", "?"]
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
def update_Weight_Precision(temp_precisions):
|
| 97 |
-
global current_weightDtype
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
if set(current_precision) == set(temp_precisions):
|
| 99 |
-
return [
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
selected_quant = [t.to_str() for t in QuantType if t != QuantType.QuantType_None]
|
|
|
|
| 105 |
if temp_precisions[-1] in ["16bit", "32bit"]:
|
| 106 |
selected_precisions = [p for p in temp_precisions if p in ["16bit", "32bit"]]
|
| 107 |
else:
|
| 108 |
selected_precisions = [p for p in temp_precisions if p not in ["16bit", "32bit"]]
|
|
|
|
| 109 |
current_precision = list(set(selected_precisions))
|
| 110 |
-
|
|
|
|
|
|
|
| 111 |
selected_dropdown_weight = 'All'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
for precision in current_precision:
|
| 113 |
-
if precision in precision_to_dtype:
|
|
|
|
|
|
|
|
|
|
| 114 |
if "16bit" in current_precision:
|
| 115 |
-
selected_weight = [
|
| 116 |
-
if "int8" in selected_compute:
|
|
|
|
|
|
|
| 117 |
if "32bit" in current_precision:
|
| 118 |
-
selected_weight = [
|
| 119 |
-
if "int8" in selected_compute:
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
def update_Weight_Dtype(weight):
|
| 130 |
global selected_dropdown_weight
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
selected_dropdown_weight
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
return selected_precisions
|
| 135 |
|
|
|
|
|
|
|
|
|
|
| 136 |
def restart_space():
|
| 137 |
API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
|
| 138 |
|
|
|
|
| 139 |
def init_space(full_init: bool = True):
|
|
|
|
| 140 |
if full_init:
|
| 141 |
try:
|
| 142 |
branch = REPO.active_branch.name
|
| 143 |
REPO.remotes.origin.pull(branch)
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
raw_data, original_df = get_leaderboard_df(GIT_RESULTS_PATH, GIT_STATUS_PATH, DYNAMIC_INFO_FILE_PATH, COLS, BENCHMARK_COLS)
|
| 148 |
-
|
| 149 |
-
# 防御补全:如果没数据也要有骨架,防止 KeyError 'Model'
|
| 150 |
-
if original_df.empty:
|
| 151 |
-
original_df = pd.DataFrame(columns=[c.name for c in fields(auto_eval_cols)])
|
| 152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
leaderboard_df = original_df.copy()
|
|
|
|
| 154 |
plot_df = create_plot_df(create_scores_df(raw_data))
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
|
| 159 |
|
| 160 |
def str_to_bool(value):
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
-
filtered_df = filter_models(hidden_df, type_query, size_query,
|
| 172 |
filtered_df = filter_queries(query, filtered_df)
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
|
| 176 |
-
if auto_eval_cols.dummy.name not in df.columns: return df
|
| 177 |
return df[(df[auto_eval_cols.dummy.name].str.contains(query, case=False))]
|
| 178 |
|
|
|
|
| 179 |
def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
|
| 180 |
always_here_cols = [c.name for c in fields(auto_eval_cols) if c.never_hidden]
|
| 181 |
dummy_col = [auto_eval_cols.dummy.name]
|
| 182 |
-
#
|
| 183 |
-
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
def filter_queries(query: str, filtered_df: pd.DataFrame):
|
| 187 |
-
|
| 188 |
final_df = []
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
if
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
if
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
def select(df, data: gr.SelectData):
|
| 226 |
-
global selected_indices
|
|
|
|
|
|
|
| 227 |
selected_index = data.index[0]
|
| 228 |
-
value = df.iloc[selected_index].iloc[1]
|
| 229 |
-
match = re.search(r'<a[^>]+>([^<]+)</a>', value)
|
| 230 |
-
if not match: return gr.CheckboxGroup(list(selected_values.keys()), value=list(selected_values.keys()))
|
| 231 |
-
text_content = match.group(1)
|
| 232 |
if selected_index in selected_indices:
|
| 233 |
selected_indices.remove(selected_index)
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
else:
|
| 236 |
selected_indices.append(selected_index)
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
return gr.CheckboxGroup(list(selected_values.keys()), value=list(selected_values.keys()))
|
| 239 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
def generate_spider_chart(df, selected_keys):
|
| 241 |
global selected_values
|
| 242 |
-
|
| 243 |
-
selected_rows = df[df.iloc[:, 1].isin(
|
| 244 |
-
cleaned_rows = selected_rows.
|
|
|
|
|
|
|
| 245 |
fig = go.Figure()
|
| 246 |
-
# 强制指定指标列
|
| 247 |
-
metrics = ['Average ⬆️', 'ARC-c', 'ARC-e', 'Boolq', 'HellaSwag', 'Lambada', 'MMLU', 'Openbookqa', 'Piqa', 'Truthfulqa', 'Winogrande']
|
| 248 |
for _, row in selected_rows.iterrows():
|
| 249 |
fig.add_trace(go.Scatterpolar(
|
| 250 |
-
r=[row
|
| 251 |
-
theta=
|
|
|
|
|
|
|
| 252 |
))
|
| 253 |
-
fig.update_layout(
|
| 254 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
|
| 256 |
-
|
| 257 |
-
demo = gr.Blocks(css=custom_css)
|
| 258 |
with demo:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
gr.HTML(TITLE)
|
| 260 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
|
|
|
| 261 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 262 |
-
with gr.TabItem("🏅 LLM Benchmark", id=0):
|
| 263 |
with gr.Row():
|
| 264 |
with gr.Column():
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
)
|
| 271 |
with gr.Column(min_width=320):
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
with gr.Row():
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
-
|
| 283 |
-
spider_btn
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
)
|
| 296 |
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
|
| 305 |
-
|
| 306 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
|
| 308 |
-
with gr.TabItem("📈 Metrics through time", id=2):
|
| 309 |
with gr.Row():
|
| 310 |
-
gr.
|
| 311 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
|
| 313 |
-
with gr.TabItem("🚀 Submit", id=5):
|
| 314 |
with gr.Column():
|
| 315 |
-
gr.
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
|
| 329 |
with gr.Row():
|
| 330 |
with gr.Accordion("📙 Citation", open=False):
|
| 331 |
-
gr.Textbox(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
|
| 333 |
scheduler = BackgroundScheduler()
|
| 334 |
-
scheduler.add_job(restart_space, "interval", hours=3)
|
|
|
|
| 335 |
scheduler.start()
|
| 336 |
-
|
|
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
import pandas as pd
|
| 4 |
import re
|
|
|
|
| 23 |
NUMERIC_INTERVALS,
|
| 24 |
NUMERIC_MODELSIZE,
|
| 25 |
TYPES,
|
|
|
|
| 26 |
auto_eval_cols,
|
|
|
|
| 27 |
GroupDtype,
|
| 28 |
ModelType,
|
| 29 |
fields,
|
|
|
|
| 43 |
create_plot_df,
|
| 44 |
create_scores_df,
|
| 45 |
)
|
|
|
|
| 46 |
import plotly.graph_objects as go
|
| 47 |
|
| 48 |
selected_indices = []
|
| 49 |
selected_values = {}
|
| 50 |
selected_dropdown_weight = 'All'
|
| 51 |
|
| 52 |
+
# Start ephemeral Spaces on PRs (see config in README.md)
|
| 53 |
+
#enable_space_ci()
|
| 54 |
+
|
| 55 |
precision_to_dtype = {
|
| 56 |
+
"2bit": ["int2"],
|
| 57 |
+
"3bit": ["int3"],
|
| 58 |
+
"4bit": ["int4", "nf4", "fp4"],
|
| 59 |
+
"8bit": ["int8"],
|
| 60 |
+
"16bit": ['float16', 'bfloat16'],
|
| 61 |
+
"32bit": ["float32"],
|
| 62 |
+
"?": ["?"],
|
| 63 |
}
|
| 64 |
|
| 65 |
dtype_to_precision = {
|
| 66 |
+
"int2": ["2bit"],
|
| 67 |
+
"int3": ["3bit"],
|
| 68 |
+
"int4": ["4bit"],
|
| 69 |
+
"nf4": ["4bit"],
|
| 70 |
+
"fp4": ["4bit"],
|
| 71 |
+
"int8": ["8bit"],
|
| 72 |
+
"float16": ["16bit"],
|
| 73 |
+
"bfloat16": ["16bit"],
|
| 74 |
+
"float32": ["32bit"],
|
| 75 |
+
"?": ["?"],
|
| 76 |
}
|
| 77 |
|
| 78 |
current_weightDtype = ["int2", "int3", "int4", "nf4", "fp4", "?"]
|
|
|
|
| 80 |
current_quant = [t.to_str() for t in QuantType if t != QuantType.QuantType_None]
|
| 81 |
current_precision = ['2bit', '3bit', '4bit', '8bit', '?']
|
| 82 |
|
| 83 |
+
|
| 84 |
def display_sort(key):
|
| 85 |
order = {"All": 0, "?": 1, "int2": 2, "int3": 3, "int4": 4, "fp4": 5, "nf4": 6, "float16": 7, "bfloat16": 8, "float32": 9}
|
| 86 |
return order.get(key, float('inf'))
|
|
|
|
| 89 |
order = {"All": 0, "?": 1, "int8": 2, "float16": 3, "bfloat16": 4, "float32": 5}
|
| 90 |
return order.get(key, float('inf'))
|
| 91 |
|
|
|
|
| 92 |
def update_quantization_types(selected_quant):
|
| 93 |
+
global current_weightDtype
|
| 94 |
+
global current_computeDtype
|
| 95 |
+
global current_quant
|
| 96 |
+
global current_precision
|
| 97 |
+
|
| 98 |
if set(current_quant) == set(selected_quant):
|
| 99 |
+
return [
|
| 100 |
+
gr.Dropdown(choices=current_weightDtype, value=selected_dropdown_weight),
|
| 101 |
+
gr.Dropdown(choices=current_computeDtype, value="All"),
|
| 102 |
+
gr.CheckboxGroup(value=current_precision),
|
| 103 |
+
]
|
| 104 |
+
|
| 105 |
+
# print('update_quantization_types', selected_quant, current_quant)
|
| 106 |
if any(value != '✖ None' for value in selected_quant):
|
| 107 |
selected_weight = ['All', '?', 'int2', 'int3', 'int4', 'nf4', 'fp4', 'int8']
|
| 108 |
selected_compute = ['All', '?', 'int8', 'float16', 'bfloat16', 'float32']
|
| 109 |
selected_precision = ["2bit", "3bit", "4bit", "8bit", "?"]
|
| 110 |
+
|
| 111 |
+
current_weightDtype = selected_weight
|
| 112 |
+
current_computeDtype = selected_compute
|
| 113 |
+
current_quant = selected_quant
|
| 114 |
+
current_precision = selected_precision
|
| 115 |
+
|
| 116 |
+
return [
|
| 117 |
+
gr.Dropdown(choices=selected_weight, value="All"),
|
| 118 |
+
gr.Dropdown(choices=selected_compute, value="All"),
|
| 119 |
+
gr.CheckboxGroup(value=selected_precision),
|
| 120 |
+
]
|
| 121 |
|
| 122 |
def update_Weight_Precision(temp_precisions):
|
| 123 |
+
global current_weightDtype
|
| 124 |
+
global current_computeDtype
|
| 125 |
+
global current_quant
|
| 126 |
+
global current_precision
|
| 127 |
+
global selected_dropdown_weight
|
| 128 |
+
|
| 129 |
+
# print('temp_precisions', temp_precisions)
|
| 130 |
if set(current_precision) == set(temp_precisions):
|
| 131 |
+
return [
|
| 132 |
+
gr.Dropdown(choices=current_weightDtype, value=selected_dropdown_weight),
|
| 133 |
+
gr.Dropdown(choices=current_computeDtype, value="All"),
|
| 134 |
+
gr.CheckboxGroup(value=current_precision),
|
| 135 |
+
gr.CheckboxGroup(value=current_quant),
|
| 136 |
+
] # No update needed
|
| 137 |
+
|
| 138 |
+
selected_weight = []
|
| 139 |
+
selected_compute = ['All', '?', 'int8', 'float16', 'bfloat16', 'float32']
|
| 140 |
selected_quant = [t.to_str() for t in QuantType if t != QuantType.QuantType_None]
|
| 141 |
+
|
| 142 |
if temp_precisions[-1] in ["16bit", "32bit"]:
|
| 143 |
selected_precisions = [p for p in temp_precisions if p in ["16bit", "32bit"]]
|
| 144 |
else:
|
| 145 |
selected_precisions = [p for p in temp_precisions if p not in ["16bit", "32bit"]]
|
| 146 |
+
|
| 147 |
current_precision = list(set(selected_precisions))
|
| 148 |
+
# print('selected_dropdown_weight', selected_dropdown_weight)
|
| 149 |
+
|
| 150 |
+
if len(current_precision) > 1:
|
| 151 |
selected_dropdown_weight = 'All'
|
| 152 |
+
elif selected_dropdown_weight != 'All' and set(dtype_to_precision[selected_dropdown_weight]) != set(current_precision):
|
| 153 |
+
selected_dropdown_weight = 'All'
|
| 154 |
+
|
| 155 |
+
# print('final', current_precision)
|
| 156 |
+
# Map selected_precisions to corresponding weights
|
| 157 |
for precision in current_precision:
|
| 158 |
+
if precision in precision_to_dtype:
|
| 159 |
+
selected_weight.extend(precision_to_dtype[precision])
|
| 160 |
+
|
| 161 |
+
# Special rules for 16bit and 32bit
|
| 162 |
if "16bit" in current_precision:
|
| 163 |
+
selected_weight = [option for option in selected_weight if option in ["All", "?", "float16", "bfloat16"]]
|
| 164 |
+
if "int8" in selected_compute:
|
| 165 |
+
selected_compute.remove("int8")
|
| 166 |
+
|
| 167 |
if "32bit" in current_precision:
|
| 168 |
+
selected_weight = [option for option in selected_weight if option in ["All", "?", "float32"]]
|
| 169 |
+
if "int8" in selected_compute:
|
| 170 |
+
selected_compute.remove("int8")
|
| 171 |
+
|
| 172 |
+
if "16bit" in current_precision or "32bit" in current_precision:
|
| 173 |
+
selected_quant = ['✖ None']
|
| 174 |
+
if "16bit" in current_precision and "32bit" in current_precision:
|
| 175 |
+
selected_weight = ["All", "?", "float16", "bfloat16", "float32"]
|
| 176 |
+
# Ensure "All" and "?" options are included
|
| 177 |
+
selected_weight = ["All", "?"] + [opt for opt in selected_weight if opt not in ["All", "?"]]
|
| 178 |
+
selected_compute = ["All", "?"] + [opt for opt in selected_compute if opt not in ["All", "?"]]
|
| 179 |
+
|
| 180 |
+
# Remove duplicates
|
| 181 |
+
selected_weight = list(set(selected_weight))
|
| 182 |
+
selected_compute = list(set(selected_compute))
|
| 183 |
+
|
| 184 |
+
# Update global variables
|
| 185 |
+
current_weightDtype = selected_weight
|
| 186 |
+
current_computeDtype = selected_compute
|
| 187 |
+
current_quant = selected_quant
|
| 188 |
+
|
| 189 |
+
# Return updated components
|
| 190 |
+
return [
|
| 191 |
+
gr.Dropdown(choices=selected_weight, value=selected_dropdown_weight),
|
| 192 |
+
gr.Dropdown(choices=selected_compute, value="All"),
|
| 193 |
+
gr.CheckboxGroup(value=selected_precisions),
|
| 194 |
+
gr.CheckboxGroup(value=selected_quant),
|
| 195 |
+
]
|
| 196 |
|
| 197 |
def update_Weight_Dtype(weight):
|
| 198 |
global selected_dropdown_weight
|
| 199 |
+
# print('update_Weight_Dtype', weight)
|
| 200 |
+
# Initialize selected_precisions
|
| 201 |
+
if weight == selected_dropdown_weight or weight == 'All':
|
| 202 |
+
return current_precision
|
| 203 |
+
else:
|
| 204 |
+
selected_precisions = []
|
| 205 |
+
selected_precisions.extend(dtype_to_precision[weight])
|
| 206 |
+
selected_dropdown_weight = weight
|
| 207 |
+
# print('selected_precisions', selected_precisions)
|
| 208 |
+
# Return updated components
|
| 209 |
return selected_precisions
|
| 210 |
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
|
| 214 |
def restart_space():
|
| 215 |
API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
|
| 216 |
|
| 217 |
+
|
| 218 |
def init_space(full_init: bool = True):
|
| 219 |
+
|
| 220 |
if full_init:
|
| 221 |
try:
|
| 222 |
branch = REPO.active_branch.name
|
| 223 |
REPO.remotes.origin.pull(branch)
|
| 224 |
+
except Exception as e:
|
| 225 |
+
# print(str(e))
|
| 226 |
+
restart_space()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
+
try:
|
| 229 |
+
# print(DYNAMIC_INFO_PATH)
|
| 230 |
+
snapshot_download(
|
| 231 |
+
repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
|
| 232 |
+
)
|
| 233 |
+
except Exception:
|
| 234 |
+
restart_space()
|
| 235 |
+
|
| 236 |
+
raw_data, original_df = get_leaderboard_df(
|
| 237 |
+
results_path=GIT_RESULTS_PATH,
|
| 238 |
+
requests_path=GIT_STATUS_PATH,
|
| 239 |
+
dynamic_path=DYNAMIC_INFO_FILE_PATH,
|
| 240 |
+
cols=COLS,
|
| 241 |
+
benchmark_cols=BENCHMARK_COLS
|
| 242 |
+
)
|
| 243 |
+
# update_collections(original_df.copy())
|
| 244 |
leaderboard_df = original_df.copy()
|
| 245 |
+
|
| 246 |
plot_df = create_plot_df(create_scores_df(raw_data))
|
| 247 |
+
|
| 248 |
+
(
|
| 249 |
+
finished_eval_queue_df,
|
| 250 |
+
running_eval_queue_df,
|
| 251 |
+
pending_eval_queue_df,
|
| 252 |
+
) = get_evaluation_queue_df(GIT_STATUS_PATH, EVAL_COLS)
|
| 253 |
+
|
| 254 |
+
return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
|
| 255 |
|
| 256 |
leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
|
| 257 |
|
| 258 |
def str_to_bool(value):
|
| 259 |
+
if str(value).lower() == "true":
|
| 260 |
+
return True
|
| 261 |
+
elif str(value).lower() == "false":
|
| 262 |
+
return False
|
| 263 |
+
else:
|
| 264 |
+
return False
|
| 265 |
+
|
| 266 |
+
# Searching and filtering
|
| 267 |
+
def update_table(
|
| 268 |
+
hidden_df: pd.DataFrame,
|
| 269 |
+
columns: list,
|
| 270 |
+
type_query: list,
|
| 271 |
+
precision_query: str,
|
| 272 |
+
size_query: list,
|
| 273 |
+
params_query: list,
|
| 274 |
+
hide_models: list,
|
| 275 |
+
query: str,
|
| 276 |
+
compute_dtype: str,
|
| 277 |
+
weight_dtype: str,
|
| 278 |
+
double_quant: str,
|
| 279 |
+
group_dtype: str
|
| 280 |
+
):
|
| 281 |
+
global init_select
|
| 282 |
+
global current_weightDtype
|
| 283 |
+
global current_computeDtype
|
| 284 |
+
|
| 285 |
+
if weight_dtype == ['All'] or weight_dtype == 'All':
|
| 286 |
+
weight_dtype = current_weightDtype
|
| 287 |
+
else:
|
| 288 |
+
weight_dtype = [weight_dtype]
|
| 289 |
+
|
| 290 |
+
if compute_dtype == 'All':
|
| 291 |
+
compute_dtype = current_computeDtype
|
| 292 |
+
else:
|
| 293 |
+
compute_dtype = [compute_dtype]
|
| 294 |
+
|
| 295 |
+
if group_dtype == 'All':
|
| 296 |
+
group_dtype = [-1, 1024, 256, 128, 64, 32]
|
| 297 |
+
else:
|
| 298 |
+
try:
|
| 299 |
+
group_dtype = [int(group_dtype)]
|
| 300 |
+
except ValueError:
|
| 301 |
+
group_dtype = [-1]
|
| 302 |
+
|
| 303 |
+
if double_quant == 'All':
|
| 304 |
+
double_quant = [True, False]
|
| 305 |
+
else:
|
| 306 |
+
double_quant = [str_to_bool(double_quant)]
|
| 307 |
|
| 308 |
+
filtered_df = filter_models(df=hidden_df, type_query=type_query, size_query=size_query, precision_query=precision_query, hide_models=hide_models, compute_dtype=compute_dtype, weight_dtype=weight_dtype, double_quant=double_quant, group_dtype=group_dtype, params_query=params_query)
|
| 309 |
filtered_df = filter_queries(query, filtered_df)
|
| 310 |
+
df = select_columns(filtered_df, columns)
|
| 311 |
+
return df
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
def load_query(request: gr.Request): # triggered only once at startup => read query parameter if it exists
|
| 315 |
+
query = request.query_params.get("query") or ""
|
| 316 |
+
return query, query # return one for the "search_bar", one for a hidden component that triggers a reload only if value has changed
|
| 317 |
+
|
| 318 |
|
| 319 |
def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
|
|
|
|
| 320 |
return df[(df[auto_eval_cols.dummy.name].str.contains(query, case=False))]
|
| 321 |
|
| 322 |
+
|
| 323 |
def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
|
| 324 |
always_here_cols = [c.name for c in fields(auto_eval_cols) if c.never_hidden]
|
| 325 |
dummy_col = [auto_eval_cols.dummy.name]
|
| 326 |
+
# We use COLS to maintain sorting
|
| 327 |
+
filtered_df = df[
|
| 328 |
+
always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col
|
| 329 |
+
]
|
| 330 |
+
return filtered_df
|
| 331 |
+
|
| 332 |
|
| 333 |
def filter_queries(query: str, filtered_df: pd.DataFrame):
|
| 334 |
+
"""Added by Abishek"""
|
| 335 |
final_df = []
|
| 336 |
+
if query != "":
|
| 337 |
+
queries = [q.strip() for q in query.split(";")]
|
| 338 |
+
for _q in queries:
|
| 339 |
+
_q = _q.strip()
|
| 340 |
+
if _q != "":
|
| 341 |
+
temp_filtered_df = search_table(filtered_df, _q)
|
| 342 |
+
if len(temp_filtered_df) > 0:
|
| 343 |
+
final_df.append(temp_filtered_df)
|
| 344 |
+
if len(final_df) > 0:
|
| 345 |
+
filtered_df = pd.concat(final_df)
|
| 346 |
+
filtered_df = filtered_df.drop_duplicates(
|
| 347 |
+
subset=[auto_eval_cols.model.name, auto_eval_cols.precision.name, auto_eval_cols.revision.name]
|
| 348 |
+
)
|
| 349 |
+
|
| 350 |
+
return filtered_df
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
def filter_models(
|
| 354 |
+
df: pd.DataFrame, type_query: list, size_query: list, params_query:list, precision_query: list, hide_models: list, compute_dtype: list, weight_dtype: list, double_quant: list, group_dtype: list,
|
| 355 |
+
) -> pd.DataFrame:
|
| 356 |
+
# Show all models
|
| 357 |
+
if "Private or deleted" in hide_models:
|
| 358 |
+
filtered_df = df[df[auto_eval_cols.still_on_hub.name] == True]
|
| 359 |
+
else:
|
| 360 |
+
filtered_df = df
|
| 361 |
+
|
| 362 |
+
if "Contains a merge/moerge" in hide_models:
|
| 363 |
+
filtered_df = filtered_df[filtered_df[auto_eval_cols.merged.name] == False]
|
| 364 |
+
|
| 365 |
+
if "MoE" in hide_models:
|
| 366 |
+
filtered_df = filtered_df[filtered_df[auto_eval_cols.moe.name] == False]
|
| 367 |
+
|
| 368 |
+
if "Flagged" in hide_models:
|
| 369 |
+
filtered_df = filtered_df[filtered_df[auto_eval_cols.flagged.name] == False]
|
| 370 |
+
|
| 371 |
+
type_emoji = [t[0] for t in type_query]
|
| 372 |
+
if any(emoji != '✖' for emoji in type_emoji):
|
| 373 |
+
type_emoji = [emoji for emoji in type_emoji if emoji != '✖']
|
| 374 |
+
else:
|
| 375 |
+
type_emoji = ['✖']
|
| 376 |
+
|
| 377 |
+
filtered_df = filtered_df.loc[df[auto_eval_cols.model_type_symbol.name].isin(type_emoji)]
|
| 378 |
+
filtered_df = filtered_df.loc[df[auto_eval_cols.precision.name].isin(precision_query + ["None"])]
|
| 379 |
+
|
| 380 |
+
filtered_df = filtered_df.loc[df[auto_eval_cols.weight_dtype.name].isin(weight_dtype)]
|
| 381 |
+
|
| 382 |
+
filtered_df = filtered_df.loc[df[auto_eval_cols.compute_dtype.name].isin(compute_dtype)]
|
| 383 |
+
|
| 384 |
+
filtered_df = filtered_df.loc[df[auto_eval_cols.double_quant.name].isin(double_quant)]
|
| 385 |
+
|
| 386 |
+
filtered_df = filtered_df.loc[df[auto_eval_cols.group_size.name].isin(group_dtype)]
|
| 387 |
+
|
| 388 |
+
numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
|
| 389 |
+
params_column = pd.to_numeric(df[auto_eval_cols.params.name], errors="coerce")
|
| 390 |
+
mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
|
| 391 |
+
filtered_df = filtered_df.loc[mask]
|
| 392 |
|
| 393 |
+
numeric_interval_params = pd.IntervalIndex(sorted([NUMERIC_MODELSIZE[s] for s in params_query]))
|
| 394 |
+
params_column_params = pd.to_numeric(df[auto_eval_cols.model_size.name], errors="coerce")
|
| 395 |
+
mask_params = params_column_params.apply(lambda x: any(numeric_interval_params.contains(x)))
|
| 396 |
+
filtered_df = filtered_df.loc[mask_params]
|
| 397 |
+
|
| 398 |
+
return filtered_df
|
| 399 |
|
| 400 |
def select(df, data: gr.SelectData):
|
| 401 |
+
global selected_indices
|
| 402 |
+
global selected_values
|
| 403 |
+
|
| 404 |
selected_index = data.index[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
if selected_index in selected_indices:
|
| 406 |
selected_indices.remove(selected_index)
|
| 407 |
+
|
| 408 |
+
value = df.iloc[selected_index].iloc[1]
|
| 409 |
+
pattern = r'<a[^>]+>([^<]+)</a>'
|
| 410 |
+
match = re.search(pattern, value)
|
| 411 |
+
if match:
|
| 412 |
+
text_content = match.group(1)
|
| 413 |
+
if text_content in selected_values:
|
| 414 |
+
del selected_values[text_content]
|
| 415 |
else:
|
| 416 |
selected_indices.append(selected_index)
|
| 417 |
+
|
| 418 |
+
value = df.iloc[selected_index].iloc[1]
|
| 419 |
+
pattern = r'<a[^>]+>([^<]+)</a>'
|
| 420 |
+
match = re.search(pattern, value)
|
| 421 |
+
if match:
|
| 422 |
+
text_content = match.group(1)
|
| 423 |
+
selected_values[text_content] = value
|
| 424 |
+
|
| 425 |
return gr.CheckboxGroup(list(selected_values.keys()), value=list(selected_values.keys()))
|
| 426 |
|
| 427 |
+
def init_comparison_data():
|
| 428 |
+
global selected_values
|
| 429 |
+
return gr.CheckboxGroup(list(selected_values.keys()), value=list(selected_values.keys()))
|
| 430 |
+
|
| 431 |
+
def remove_html_tags(value):
|
| 432 |
+
if isinstance(value, str):
|
| 433 |
+
return re.sub(r'<[^>]*>', '', value)
|
| 434 |
+
return value
|
| 435 |
+
|
| 436 |
+
def show_modal():
|
| 437 |
+
return gr.update(visible=True, elem_classes="custom-modal")
|
| 438 |
+
|
| 439 |
+
def close_modal_logic():
|
| 440 |
+
return gr.update(visible=False, elem_classes="modal-hidden")
|
| 441 |
+
|
| 442 |
def generate_spider_chart(df, selected_keys):
    """Build a radar (spider) chart for the models ticked for comparison.

    Args:
        df: Leaderboard dataframe; column 1 holds the HTML model link that
            ``selected_values`` stores as its values.
        selected_keys: Model names currently ticked in the comparison
            checkbox group.

    Returns:
        Tuple of (plotly ``Figure`` with one trace per selected model,
        dataframe of the selected rows with HTML markup stripped).
    """
    global selected_values
    # Map checkbox labels back to the raw HTML cell values so the
    # corresponding rows can be located in the dataframe.
    wanted_cells = [selected_values[key] for key in selected_keys if key in selected_values]
    selected_rows = df[df.iloc[:, 1].isin(wanted_cells)]
    cleaned_rows = selected_rows.map(remove_html_tags)

    # One radial axis per benchmark (plus the overall average).
    axes = ['Average ⬆️', 'ARC-c', 'ARC-e', 'Boolq', 'HellaSwag', 'Lambada',
            'MMLU', 'Openbookqa', 'Piqa', 'Truthfulqa', 'Winogrande']

    fig = go.Figure()
    for _, row in selected_rows.iterrows():
        fig.add_trace(go.Scatterpolar(
            r=[row[axis] for axis in axes],
            theta=axes,
            fill='toself',
            name=str(row['Model']),
        ))
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=False)),
        showlegend=True,
        margin=dict(l=50, r=50, t=50, b=50),
        height=400,
        autosize=True,
    )
    return fig, cleaned_rows
|
| 469 |
+
|
| 470 |
+
leaderboard_df = filter_models(
|
| 471 |
+
df=leaderboard_df,
|
| 472 |
+
type_query=[t.to_str(" : ") for t in QuantType if t != QuantType.QuantType_None],
|
| 473 |
+
size_query=list(NUMERIC_INTERVALS.keys()),
|
| 474 |
+
params_query=list(NUMERIC_MODELSIZE.keys()),
|
| 475 |
+
precision_query=[i.value.name for i in Precision],
|
| 476 |
+
hide_models=["Private or deleted", "Contains a merge/moerge", "Flagged"], # Deleted, merges, flagged, MoEs,
|
| 477 |
+
compute_dtype=[i.value.name for i in ComputeDtype],
|
| 478 |
+
weight_dtype=[i.value.name for i in WeightDtype],
|
| 479 |
+
double_quant=[True, False],
|
| 480 |
+
group_dtype=[-1, 1024, 256, 128, 64, 32]
|
| 481 |
+
)
|
| 482 |
+
|
| 483 |
|
| 484 |
+
demo = gr.Blocks(fill_width=True)
|
|
|
|
| 485 |
with demo:
|
| 486 |
+
|
| 487 |
+
with gr.Column(elem_classes="custom-modal", visible=False, elem_id="my-modal-container") as modal_window:
|
| 488 |
+
with gr.Column(elem_classes="modal-content"):
|
| 489 |
+
with gr.Column():
|
| 490 |
+
comparison_plot_inside = gr.Plot()
|
| 491 |
+
comparison_df_inside = gr.Dataframe(interactive=False)
|
| 492 |
+
|
| 493 |
+
close_btn = gr.Button("Close", variant="primary")
|
| 494 |
+
|
| 495 |
gr.HTML(TITLE)
|
| 496 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 497 |
+
|
| 498 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 499 |
+
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
| 500 |
with gr.Row():
|
| 501 |
with gr.Column():
|
| 502 |
+
with gr.Row(variant="compact"):
|
| 503 |
+
search_bar = gr.Textbox(
|
| 504 |
+
placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
|
| 505 |
+
show_label=False,
|
| 506 |
+
elem_id="search-bar",
|
| 507 |
+
)
|
| 508 |
+
with gr.Row():
|
| 509 |
+
shown_columns = gr.CheckboxGroup(
|
| 510 |
+
choices=[
|
| 511 |
+
c.name
|
| 512 |
+
for c in fields(auto_eval_cols)
|
| 513 |
+
if not c.hidden and not c.never_hidden and not c.dummy
|
| 514 |
+
],
|
| 515 |
+
value=[
|
| 516 |
+
c.name
|
| 517 |
+
for c in fields(auto_eval_cols)
|
| 518 |
+
if c.displayed_by_default and not c.hidden and not c.never_hidden
|
| 519 |
+
],
|
| 520 |
+
label="Select columns to show",
|
| 521 |
+
elem_id="column-select",
|
| 522 |
+
interactive=True,
|
| 523 |
+
)
|
| 524 |
+
|
| 525 |
+
with gr.Row():
|
| 526 |
+
filter_columns_parameters = gr.CheckboxGroup(
|
| 527 |
+
label="Model parameters (in billions of parameters)",
|
| 528 |
+
choices=list(NUMERIC_INTERVALS.keys()),
|
| 529 |
+
value=list(NUMERIC_INTERVALS.keys()),
|
| 530 |
+
interactive=True,
|
| 531 |
+
elem_id="filter-columns-size",
|
| 532 |
+
)
|
| 533 |
+
with gr.Row():
|
| 534 |
+
filter_columns_size = gr.CheckboxGroup(
|
| 535 |
+
label="Model sizes (GB, int4)",
|
| 536 |
+
choices=list(NUMERIC_MODELSIZE.keys()),
|
| 537 |
+
value=list(NUMERIC_MODELSIZE.keys()),
|
| 538 |
+
interactive=True,
|
| 539 |
+
elem_id="filter-columns-size",
|
| 540 |
)
|
| 541 |
with gr.Column(min_width=320):
|
| 542 |
+
#with gr.Box(elem_id="box-filter"):
|
| 543 |
+
filter_columns_type = gr.CheckboxGroup(
|
| 544 |
+
label="Quantization types",
|
| 545 |
+
choices=[t.to_str() for t in QuantType if t != QuantType.QuantType_None],
|
| 546 |
+
value=[t.to_str() for t in QuantType if t != QuantType.QuantType_None],
|
| 547 |
+
interactive=True,
|
| 548 |
+
elem_id="filter-columns-type",
|
| 549 |
+
)
|
| 550 |
+
filter_columns_precision = gr.CheckboxGroup(
|
| 551 |
+
label="Weight precision",
|
| 552 |
+
choices=[i.value.name for i in Precision],
|
| 553 |
+
value=[i.value.name for i in Precision if ( i.value.name != '16bit' and i.value.name != '32bit')],
|
| 554 |
+
interactive=True,
|
| 555 |
+
elem_id="filter-columns-precision",
|
| 556 |
+
)
|
| 557 |
+
with gr.Column(elem_id="quant-config-container") as config:
|
| 558 |
+
gr.HTML("<div class='quant-config-header'>Quantization config</div>")
|
| 559 |
with gr.Row():
|
| 560 |
+
filter_columns_computeDtype = gr.Dropdown(choices=[i.value.name for i in ComputeDtype], label="Compute Dtype", multiselect=False, value="All", interactive=True,)
|
| 561 |
+
filter_columns_weightDtype = gr.Dropdown(choices=[i.value.name for i in WeightDtype], label="Weight Dtype", multiselect=False, value="All", interactive=True,)
|
| 562 |
+
filter_columns_doubleQuant = gr.Dropdown(choices=["All", "True", "False"], label="Double Quant", multiselect=False, value="All", interactive=True)
|
| 563 |
+
filter_columns_groupDtype = gr.Dropdown(choices=[i.value.name for i in GroupDtype], label="Group Size", multiselect=False, value="All", interactive=True,)
|
| 564 |
+
|
| 565 |
+
with gr.Row():
|
| 566 |
+
with gr.Column(scale=4):
|
| 567 |
+
model_comparison = gr.CheckboxGroup(label="Accuracy Comparison (Selected Models from Table)", choices=list(selected_values.keys()), value=list(selected_values.keys()), interactive=True, elem_id="model_comparison")
|
| 568 |
+
with gr.Column(scale=1, min_width=150):
|
| 569 |
+
spider_btn = gr.Button("Compare", variant="primary", elem_id="compare-button-full")
|
| 570 |
+
|
| 571 |
+
never_hidden_cols = [c.name for c in fields(auto_eval_cols) if c.never_hidden]
|
| 572 |
+
|
| 573 |
+
user_cols = shown_columns.value
|
| 574 |
+
|
| 575 |
+
if len(user_cols) > 0:
|
| 576 |
+
first_user_col = [user_cols[0]]
|
| 577 |
+
remaining_user_cols = user_cols[1:]
|
| 578 |
+
|
| 579 |
+
final_cols = first_user_col + never_hidden_cols + remaining_user_cols
|
| 580 |
+
else:
|
| 581 |
+
final_cols = never_hidden_cols
|
| 582 |
+
|
| 583 |
+
leaderboard_table = gr.components.Dataframe(
|
| 584 |
+
value=leaderboard_df[final_cols + [auto_eval_cols.dummy.name]],
|
| 585 |
+
headers=final_cols,
|
| 586 |
+
datatype="markdown",
|
| 587 |
+
elem_id="leaderboard-table",
|
| 588 |
+
interactive=False,
|
| 589 |
+
visible=True,
|
| 590 |
+
)
|
| 591 |
+
|
| 592 |
+
# with gr.BrowserModal(visible=False) as modal:
|
| 593 |
+
# map = gr.Plot()
|
| 594 |
+
# data_table = gr.Dataframe()
|
| 595 |
+
# gr.Column([map, data_table])
|
| 596 |
|
| 597 |
+
leaderboard_table.select(select, leaderboard_table, model_comparison)
|
| 598 |
+
spider_btn.click(
|
| 599 |
+
fn=show_modal,
|
| 600 |
+
outputs=modal_window
|
| 601 |
+
).then(
|
| 602 |
+
fn=generate_spider_chart,
|
| 603 |
+
inputs=[leaderboard_table, model_comparison],
|
| 604 |
+
outputs=[comparison_plot_inside, comparison_df_inside]
|
| 605 |
+
)
|
| 606 |
+
close_btn.click(
|
| 607 |
+
fn=close_modal_logic,
|
| 608 |
+
outputs=modal_window
|
| 609 |
+
)
|
| 610 |
+
demo.load(init_comparison_data, None, model_comparison)
|
| 611 |
+
|
| 612 |
+
if "Weight type" not in original_df.columns:
|
| 613 |
+
original_df["Weight type"] = "Unknown"
|
| 614 |
+
|
| 615 |
+
# Dummy leaderboard for handling the case when the user uses backspace key
|
| 616 |
+
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
| 617 |
+
value=original_df[COLS],
|
| 618 |
+
headers=COLS,
|
| 619 |
+
datatype=TYPES,
|
| 620 |
+
visible=False,
|
| 621 |
)
|
| 622 |
|
| 623 |
+
hide_models = gr.Textbox(
|
| 624 |
+
placeholder="",
|
| 625 |
+
show_label=False,
|
| 626 |
+
elem_id="search-bar",
|
| 627 |
+
value="",
|
| 628 |
+
visible=False,
|
| 629 |
+
|
| 630 |
+
)
|
| 631 |
|
| 632 |
+
search_bar.submit(
|
| 633 |
+
update_table,
|
| 634 |
+
[
|
| 635 |
+
hidden_leaderboard_table_for_search,
|
| 636 |
+
shown_columns,
|
| 637 |
+
filter_columns_type,
|
| 638 |
+
filter_columns_precision,
|
| 639 |
+
filter_columns_parameters,
|
| 640 |
+
filter_columns_size,
|
| 641 |
+
hide_models,
|
| 642 |
+
search_bar,
|
| 643 |
+
filter_columns_computeDtype,
|
| 644 |
+
filter_columns_weightDtype,
|
| 645 |
+
filter_columns_doubleQuant,
|
| 646 |
+
filter_columns_groupDtype
|
| 647 |
+
],
|
| 648 |
+
leaderboard_table,
|
| 649 |
+
)
|
| 650 |
|
| 651 |
+
"""
|
| 652 |
+
|
| 653 |
+
# Define a hidden component that will trigger a reload only if a query parameter has been set
|
| 654 |
+
hidden_search_bar = gr.Textbox(value="", visible=False)
|
| 655 |
+
hidden_search_bar.change(
|
| 656 |
+
update_table,
|
| 657 |
+
[
|
| 658 |
+
hidden_leaderboard_table_for_search,
|
| 659 |
+
shown_columns,
|
| 660 |
+
filter_columns_type,
|
| 661 |
+
filter_columns_precision,
|
| 662 |
+
filter_columns_size,
|
| 663 |
+
hide_models,
|
| 664 |
+
search_bar,
|
| 665 |
+
],
|
| 666 |
+
leaderboard_table,
|
| 667 |
+
)
|
| 668 |
+
# Check query parameter once at startup and update search bar + hidden component
|
| 669 |
+
demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])
|
| 670 |
+
|
| 671 |
+
"""
|
| 672 |
+
filter_columns_type.change(
|
| 673 |
+
update_quantization_types,
|
| 674 |
+
[filter_columns_type],
|
| 675 |
+
[filter_columns_weightDtype, filter_columns_computeDtype, filter_columns_precision]
|
| 676 |
+
)
|
| 677 |
+
|
| 678 |
+
filter_columns_precision.change(
|
| 679 |
+
update_Weight_Precision,
|
| 680 |
+
[filter_columns_precision],
|
| 681 |
+
[filter_columns_weightDtype, filter_columns_computeDtype, filter_columns_precision, filter_columns_type]
|
| 682 |
+
)
|
| 683 |
+
|
| 684 |
+
filter_columns_weightDtype.change(
|
| 685 |
+
update_Weight_Dtype,
|
| 686 |
+
[filter_columns_weightDtype],
|
| 687 |
+
[filter_columns_precision]
|
| 688 |
+
)
|
| 689 |
+
# filter_columns_computeDtype.change(
|
| 690 |
+
# Compute_Dtype_update,
|
| 691 |
+
# [filter_columns_computeDtype, filter_columns_precision],
|
| 692 |
+
# [filter_columns_precision, filter_columns_type]
|
| 693 |
+
# )
|
| 694 |
+
|
| 695 |
+
|
| 696 |
+
|
| 697 |
+
for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, filter_columns_parameters, hide_models, filter_columns_computeDtype, filter_columns_weightDtype, filter_columns_doubleQuant, filter_columns_groupDtype]:
|
| 698 |
+
selector.change(
|
| 699 |
+
update_table,
|
| 700 |
+
[
|
| 701 |
+
hidden_leaderboard_table_for_search,
|
| 702 |
+
shown_columns,
|
| 703 |
+
filter_columns_type,
|
| 704 |
+
filter_columns_precision,
|
| 705 |
+
filter_columns_parameters,
|
| 706 |
+
filter_columns_size,
|
| 707 |
+
hide_models,
|
| 708 |
+
search_bar,
|
| 709 |
+
filter_columns_computeDtype,
|
| 710 |
+
filter_columns_weightDtype,
|
| 711 |
+
filter_columns_doubleQuant,
|
| 712 |
+
filter_columns_groupDtype
|
| 713 |
+
],
|
| 714 |
+
leaderboard_table,
|
| 715 |
+
queue=True,
|
| 716 |
+
)
|
| 717 |
+
|
| 718 |
+
|
| 719 |
+
with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
|
| 720 |
+
with gr.Row():
|
| 721 |
+
with gr.Column():
|
| 722 |
+
chart = create_metric_plot_obj(
|
| 723 |
+
plot_df,
|
| 724 |
+
[auto_eval_cols.average.name],
|
| 725 |
+
title="Average of Top Scores and Human Baseline Over Time (from last update)",
|
| 726 |
+
)
|
| 727 |
+
gr.Plot(value=chart, min_width=500)
|
| 728 |
+
with gr.Column():
|
| 729 |
+
chart = create_metric_plot_obj(
|
| 730 |
+
plot_df,
|
| 731 |
+
BENCHMARK_COLS,
|
| 732 |
+
title="Top Scores and Human Baseline Over Time (from last update)",
|
| 733 |
+
)
|
| 734 |
+
gr.Plot(value=chart, min_width=500)
|
| 735 |
+
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
|
| 736 |
+
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 737 |
+
|
| 738 |
+
with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=4):
|
| 739 |
+
gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
|
| 740 |
+
|
| 741 |
+
with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=5):
|
| 742 |
+
with gr.Column():
|
| 743 |
+
with gr.Row():
|
| 744 |
+
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
| 745 |
+
|
| 746 |
+
with gr.Row():
|
| 747 |
+
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
|
| 748 |
|
|
|
|
| 749 |
with gr.Row():
|
| 750 |
+
with gr.Column():
|
| 751 |
+
model_name_textbox = gr.Textbox(label="Model name")
|
| 752 |
+
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
| 753 |
+
private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
|
| 754 |
+
|
| 755 |
+
with gr.Column():
|
| 756 |
+
"""
|
| 757 |
+
precision = gr.Dropdown(
|
| 758 |
+
choices=[i.value.name for i in Precision if i != Precision.Unknown],
|
| 759 |
+
label="Precision",
|
| 760 |
+
multiselect=False,
|
| 761 |
+
value="4bit",
|
| 762 |
+
interactive=True,
|
| 763 |
+
)
|
| 764 |
+
weight_type = gr.Dropdown(
|
| 765 |
+
choices=[i.value.name for i in WeightDtype],
|
| 766 |
+
label="Weights dtype",
|
| 767 |
+
multiselect=False,
|
| 768 |
+
value="int4",
|
| 769 |
+
interactive=True,
|
| 770 |
+
)
|
| 771 |
+
"""
|
| 772 |
+
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)",
|
| 773 |
+
visible=not IS_PUBLIC)
|
| 774 |
+
compute_type = gr.Dropdown(
|
| 775 |
+
choices=[i.value.name for i in ComputeDtype if i.value.name != "All"],
|
| 776 |
+
label="Compute dtype",
|
| 777 |
+
multiselect=False,
|
| 778 |
+
value="float16",
|
| 779 |
+
interactive=True,
|
| 780 |
+
)
|
| 781 |
+
|
| 782 |
+
submit_button = gr.Button("Submit Eval")
|
| 783 |
+
submission_result = gr.Markdown()
|
| 784 |
+
submit_button.click(
|
| 785 |
+
add_new_eval,
|
| 786 |
+
[
|
| 787 |
+
model_name_textbox,
|
| 788 |
+
revision_name_textbox,
|
| 789 |
+
private,
|
| 790 |
+
compute_type,
|
| 791 |
+
],
|
| 792 |
+
submission_result,
|
| 793 |
+
)
|
| 794 |
|
|
|
|
| 795 |
with gr.Column():
|
| 796 |
+
with gr.Accordion(
|
| 797 |
+
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
|
| 798 |
+
open=False,
|
| 799 |
+
):
|
| 800 |
+
with gr.Row():
|
| 801 |
+
finished_eval_table = gr.components.Dataframe(
|
| 802 |
+
value=finished_eval_queue_df,
|
| 803 |
+
headers=EVAL_COLS,
|
| 804 |
+
datatype=EVAL_TYPES,
|
| 805 |
+
row_count=5,
|
| 806 |
+
)
|
| 807 |
+
with gr.Accordion(
|
| 808 |
+
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
|
| 809 |
+
open=False,
|
| 810 |
+
):
|
| 811 |
+
with gr.Row():
|
| 812 |
+
running_eval_table = gr.components.Dataframe(
|
| 813 |
+
value=running_eval_queue_df,
|
| 814 |
+
headers=EVAL_COLS,
|
| 815 |
+
datatype=EVAL_TYPES,
|
| 816 |
+
row_count=5,
|
| 817 |
+
)
|
| 818 |
+
|
| 819 |
+
with gr.Accordion(
|
| 820 |
+
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
|
| 821 |
+
open=False,
|
| 822 |
+
):
|
| 823 |
+
with gr.Row():
|
| 824 |
+
pending_eval_table = gr.components.Dataframe(
|
| 825 |
+
value=pending_eval_queue_df,
|
| 826 |
+
headers=EVAL_COLS,
|
| 827 |
+
datatype=EVAL_TYPES,
|
| 828 |
+
row_count=5,
|
| 829 |
+
)
|
| 830 |
|
| 831 |
with gr.Row():
|
| 832 |
with gr.Accordion("📙 Citation", open=False):
|
| 833 |
+
citation_button = gr.Textbox(
|
| 834 |
+
value=CITATION_BUTTON_TEXT,
|
| 835 |
+
label=CITATION_BUTTON_LABEL,
|
| 836 |
+
lines=20,
|
| 837 |
+
elem_id="citation-button",
|
| 838 |
+
buttons=["copy"],
|
| 839 |
+
)
|
| 840 |
|
| 841 |
scheduler = BackgroundScheduler()
|
| 842 |
+
scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h
|
| 843 |
+
scheduler.add_job(update_dynamic_files, "interval", hours=12) # launched every 2 hour
|
| 844 |
scheduler.start()
|
| 845 |
+
|
| 846 |
+
demo.queue(default_concurrency_limit=40).launch(css=custom_css)
|
requirements.txt
CHANGED
|
@@ -1,20 +1,20 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
GitPython
|
| 19 |
-
|
| 20 |
-
|
|
|
|
| 1 |
+
pandas
|
| 2 |
+
numpy
|
| 3 |
+
matplotlib
|
| 4 |
+
plotly
|
| 5 |
+
apscheduler
|
| 6 |
+
tqdm
|
| 7 |
+
requests
|
| 8 |
+
python-dateutil
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
huggingface-hub
|
| 12 |
+
|
| 13 |
+
transformers
|
| 14 |
+
gradio==6.5.1
|
| 15 |
+
|
| 16 |
+
datasets
|
| 17 |
+
tokenizers
|
| 18 |
+
GitPython
|
| 19 |
+
|
| 20 |
+
pydantic>=2.0
|
src/display/css_html_js.py
CHANGED
|
@@ -13,9 +13,112 @@ table th:first-child {
|
|
| 13 |
white-space: nowrap;
|
| 14 |
}
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
/* Full width space */
|
| 17 |
.gradio-container {
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
}
|
| 20 |
|
| 21 |
/* Text style and margins */
|
|
@@ -44,7 +147,7 @@ table th:first-child {
|
|
| 44 |
background: none;
|
| 45 |
border: none;
|
| 46 |
}
|
| 47 |
-
|
| 48 |
#search-bar {
|
| 49 |
padding: 0px;
|
| 50 |
}
|
|
@@ -71,7 +174,7 @@ table th:first-child {
|
|
| 71 |
|
| 72 |
/* 100% scale*/
|
| 73 |
@media (resolution: 96dpi), (min-resolution: 1dppx) and (max-resolution: 1.25dppx) {
|
| 74 |
-
|
| 75 |
height: 6rem !important;
|
| 76 |
overflow: auto !important;
|
| 77 |
}
|
|
@@ -93,13 +196,10 @@ table th:first-child {
|
|
| 93 |
}
|
| 94 |
|
| 95 |
#component-31 {
|
| 96 |
-
|
| 97 |
}
|
| 98 |
}
|
| 99 |
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
#model_comparison {
|
| 104 |
height: 6rem !important;
|
| 105 |
overflow: auto !important;
|
|
@@ -109,8 +209,8 @@ table th:first-child {
|
|
| 109 |
font-size: 0.7rem !important;
|
| 110 |
}
|
| 111 |
|
| 112 |
-
.tab-buttons button {
|
| 113 |
-
font-size:
|
| 114 |
}
|
| 115 |
|
| 116 |
/* Filters style */
|
|
@@ -147,11 +247,3 @@ table th:first-child {
|
|
| 147 |
border: 0
|
| 148 |
}
|
| 149 |
"""
|
| 150 |
-
|
| 151 |
-
get_window_url_params = """
|
| 152 |
-
function(url_params) {
|
| 153 |
-
const params = new URLSearchParams(window.location.search);
|
| 154 |
-
url_params = Object.fromEntries(params);
|
| 155 |
-
return url_params;
|
| 156 |
-
}
|
| 157 |
-
"""
|
|
|
|
| 13 |
white-space: nowrap;
|
| 14 |
}
|
| 15 |
|
| 16 |
+
.custom-modal:not([style*="display: none"]):not(.hidden) {
|
| 17 |
+
position: fixed !important;
|
| 18 |
+
top: 0 !important;
|
| 19 |
+
left: 0 !important;
|
| 20 |
+
width: 100vw !important;
|
| 21 |
+
height: 100vh !important;
|
| 22 |
+
background-color: rgba(0, 0, 0, 0.85) !important;
|
| 23 |
+
z-index: 10000 !important;
|
| 24 |
+
display: block !important;
|
| 25 |
+
overflow-y: auto !important;
|
| 26 |
+
pointer-events: auto !important;
|
| 27 |
+
padding: 5vh 0 !important;
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
.custom-modal[style*="display: none"],
|
| 31 |
+
.custom-modal.hidden,
|
| 32 |
+
.modal-hidden {
|
| 33 |
+
display: none !important;
|
| 34 |
+
visibility: hidden !important;
|
| 35 |
+
pointer-events: none !important;
|
| 36 |
+
position: absolute !important;
|
| 37 |
+
width: 0 !important;
|
| 38 |
+
height: 0 !important;
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
.modal-content {
|
| 42 |
+
background: white !important;
|
| 43 |
+
padding: 30px 50px !important;
|
| 44 |
+
border-radius: 12px;
|
| 45 |
+
width: 85% !important;
|
| 46 |
+
max-width: 1100px;
|
| 47 |
+
margin: 0 auto !important;
|
| 48 |
+
display: block !important;
|
| 49 |
+
height: auto !important;
|
| 50 |
+
min-height: 200px !important;
|
| 51 |
+
max-height: none !important;
|
| 52 |
+
z-index: 10001;
|
| 53 |
+
text-align: center;
|
| 54 |
+
box-shadow: 0 4px 20px rgba(0,0,0,0.3);
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
.modal-content .gradio-plot,
|
| 58 |
+
.modal-content .plot-container {
|
| 59 |
+
height: 400px !important;
|
| 60 |
+
min-height: 400px !important;
|
| 61 |
+
max-height: 400px !important;
|
| 62 |
+
width: 100% !important;
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
.modal-content .gradio-dataframe {
|
| 66 |
+
height: auto !important;
|
| 67 |
+
min-height: 50px !important;
|
| 68 |
+
max-height: 400px !important;
|
| 69 |
+
overflow-y: auto !important;
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
.modal-content .form,
|
| 73 |
+
.modal-content .gap {
|
| 74 |
+
gap: 0 !important;
|
| 75 |
+
padding: 0 !important;
|
| 76 |
+
margin: 0 !important;
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
.modal-content > * {
|
| 80 |
+
margin: 0 auto 20px auto !important;
|
| 81 |
+
flex: none !important;
|
| 82 |
+
display: block !important;
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
/* Full width space */
|
| 86 |
.gradio-container {
|
| 87 |
+
display: flex !important;
|
| 88 |
+
flex-direction: column !important;
|
| 89 |
+
align-items: center !important;
|
| 90 |
+
width: 95% !important;
|
| 91 |
+
max-width: 95% !important;
|
| 92 |
+
margin-left: auto !important;
|
| 93 |
+
margin-right: auto !important;
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
.gradio-group {
|
| 97 |
+
background-color: #fff !important;
|
| 98 |
+
border: none !important;
|
| 99 |
+
box-shadow: none !important;
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
#compare-button-full {
|
| 103 |
+
height: 100% !important;
|
| 104 |
+
width: 100% !important;
|
| 105 |
+
display: flex !important;
|
| 106 |
+
align-items: center !important;
|
| 107 |
+
justify-content: center !important;
|
| 108 |
+
min-height: 100px;
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
#quant-config-container {
|
| 112 |
+
border: 1px solid #e5e7eb !important;
|
| 113 |
+
border-radius: 8px !important;
|
| 114 |
+
background-color: transparent !important;
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
.quant-config-header {
|
| 118 |
+
border-radius: 8px 8px 0 0;
|
| 119 |
+
font-weight: 600;
|
| 120 |
+
background-color: #fff;
|
| 121 |
+
color: #71717a;
|
| 122 |
}
|
| 123 |
|
| 124 |
/* Text style and margins */
|
|
|
|
| 147 |
background: none;
|
| 148 |
border: none;
|
| 149 |
}
|
| 150 |
+
|
| 151 |
#search-bar {
|
| 152 |
padding: 0px;
|
| 153 |
}
|
|
|
|
| 174 |
|
| 175 |
/* 100% scale*/
|
| 176 |
@media (resolution: 96dpi), (min-resolution: 1dppx) and (max-resolution: 1.25dppx) {
|
| 177 |
+
#model_comparison {
|
| 178 |
height: 6rem !important;
|
| 179 |
overflow: auto !important;
|
| 180 |
}
|
|
|
|
| 196 |
}
|
| 197 |
|
| 198 |
#component-31 {
|
| 199 |
+
margin-top: 0.5rem !important;
|
| 200 |
}
|
| 201 |
}
|
| 202 |
|
|
|
|
|
|
|
|
|
|
| 203 |
#model_comparison {
|
| 204 |
height: 6rem !important;
|
| 205 |
overflow: auto !important;
|
|
|
|
| 209 |
font-size: 0.7rem !important;
|
| 210 |
}
|
| 211 |
|
| 212 |
+
.tab-buttons > div > button {
|
| 213 |
+
font-size: 18px !important;
|
| 214 |
}
|
| 215 |
|
| 216 |
/* Filters style */
|
|
|
|
| 247 |
border: 0
|
| 248 |
}
|
| 249 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/display/utils.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from dataclasses import dataclass, make_dataclass
|
| 2 |
from enum import Enum
|
| 3 |
|
| 4 |
import pandas as pd
|
|
@@ -44,39 +44,74 @@ class ColumnContent:
|
|
| 44 |
|
| 45 |
auto_eval_column_dict = []
|
| 46 |
# Init
|
| 47 |
-
auto_eval_column_dict.append([
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
for task in Tasks:
|
| 52 |
-
auto_eval_column_dict.append([
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
auto_eval_column_dict.append([
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
auto_eval_column_dict.append([
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
auto_eval_column_dict.append([
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
# We use make dataclass to dynamically fill the scores from Tasks
|
| 76 |
# auto_eval_column_dict.sort(key=lambda x: x[0])
|
| 77 |
sorted_columns = sorted(auto_eval_column_dict[3:], key=lambda x: x[0])
|
| 78 |
sorted_auto_eval_column_dict = auto_eval_column_dict[:3] + sorted_columns
|
| 79 |
-
AutoEvalColumn = make_dataclass("AutoEvalColumn",
|
|
|
|
|
|
|
| 80 |
|
| 81 |
@dataclass(frozen=True)
|
| 82 |
class EvalQueueColumn: # Queue column
|
|
@@ -87,31 +122,28 @@ class EvalQueueColumn: # Queue column
|
|
| 87 |
weight_type = ColumnContent("weight_type", "str", "Original")
|
| 88 |
status = ColumnContent("status", "str", True)
|
| 89 |
|
|
|
|
| 90 |
|
| 91 |
baseline_row = {
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
AutoEvalColumn.openbookqa.name: 25.0,
|
| 112 |
-
AutoEvalColumn.boolq.name: True,
|
| 113 |
-
AutoEvalColumn.arc_easy.name: 25.0,
|
| 114 |
-
AutoEvalColumn.double_quant.name: False,
|
| 115 |
}
|
| 116 |
|
| 117 |
# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
|
|
@@ -123,20 +155,16 @@ baseline_row = {
|
|
| 123 |
# GSM8K: paper
|
| 124 |
# Define the human baselines
|
| 125 |
human_baseline_row = {
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
# AutoEvalColumn.gsm8k.name: 100,
|
| 137 |
-
AutoEvalColumn.dummy.name: "human_baseline",
|
| 138 |
-
AutoEvalColumn.model_type.name: "",
|
| 139 |
-
AutoEvalColumn.flagged.name: False,
|
| 140 |
}
|
| 141 |
|
| 142 |
@dataclass
|
|
@@ -355,8 +383,8 @@ class Precision(Enum):
|
|
| 355 |
|
| 356 |
|
| 357 |
# Column selection
|
| 358 |
-
COLS = [c.name for c in fields(
|
| 359 |
-
TYPES = [c.type for c in fields(
|
| 360 |
|
| 361 |
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
| 362 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
|
@@ -383,4 +411,4 @@ NUMERIC_MODELSIZE = {
|
|
| 383 |
"~48": pd.Interval(36, 48, closed="right"),
|
| 384 |
"~64": pd.Interval(48, 64, closed="right"),
|
| 385 |
">72": pd.Interval(64, 200, closed="right"),
|
| 386 |
-
}
|
|
|
|
| 1 |
+
from dataclasses import dataclass, make_dataclass, field
|
| 2 |
from enum import Enum
|
| 3 |
|
| 4 |
import pandas as pd
|
|
|
|
| 44 |
|
| 45 |
auto_eval_column_dict = []
|
| 46 |
# Init
|
| 47 |
+
auto_eval_column_dict.append([
|
| 48 |
+
"model_type_symbol",
|
| 49 |
+
ColumnContent,
|
| 50 |
+
field(default_factory=lambda: ColumnContent("T", "Type", "str"))
|
| 51 |
+
])
|
| 52 |
+
|
| 53 |
+
auto_eval_column_dict.append([
|
| 54 |
+
"model",
|
| 55 |
+
ColumnContent,
|
| 56 |
+
field(default_factory=lambda: ColumnContent("Model", "markdown", True, never_hidden=True))
|
| 57 |
+
])
|
| 58 |
+
|
| 59 |
+
# Scores
|
| 60 |
+
auto_eval_column_dict.append([
|
| 61 |
+
"average",
|
| 62 |
+
ColumnContent,
|
| 63 |
+
field(default_factory=lambda: ColumnContent("Average ⬆️", "number", True))
|
| 64 |
+
])
|
| 65 |
+
|
| 66 |
for task in Tasks:
|
| 67 |
+
auto_eval_column_dict.append([
|
| 68 |
+
task.name,
|
| 69 |
+
ColumnContent,
|
| 70 |
+
field(default_factory=lambda t=task: ColumnContent(t.value.col_name, "number", True))
|
| 71 |
+
])
|
| 72 |
+
|
| 73 |
+
auto_eval_column_dict.append([
|
| 74 |
+
"params",
|
| 75 |
+
ColumnContent,
|
| 76 |
+
field(default_factory=lambda: ColumnContent("#Params (B)", "number", True))
|
| 77 |
+
])
|
| 78 |
+
|
| 79 |
+
auto_eval_column_dict.append([
|
| 80 |
+
"model_size",
|
| 81 |
+
ColumnContent,
|
| 82 |
+
field(default_factory=lambda: ColumnContent("#Size (G)", "number", True))
|
| 83 |
+
])
|
| 84 |
+
|
| 85 |
+
# Dummy column for the search bar
|
| 86 |
+
auto_eval_column_dict.append([
|
| 87 |
+
"dummy",
|
| 88 |
+
ColumnContent,
|
| 89 |
+
field(default_factory=lambda: ColumnContent("model_name_for_query", "str", False, dummy=True))
|
| 90 |
+
])
|
| 91 |
+
|
| 92 |
+
auto_eval_column_dict.append(["model_type", ColumnContent, field(default_factory=lambda: ColumnContent("Type", "str", False, hidden=True))])
|
| 93 |
+
auto_eval_column_dict.append(["architecture", ColumnContent, field(default_factory=lambda: ColumnContent("Architecture", "str", False))])
|
| 94 |
+
auto_eval_column_dict.append(["weight_type", ColumnContent, field(default_factory=lambda: ColumnContent("Weight type", "str", False, True))])
|
| 95 |
+
auto_eval_column_dict.append(["quant_type", ColumnContent, field(default_factory=lambda: ColumnContent("Quant type", "str", False))])
|
| 96 |
+
auto_eval_column_dict.append(["precision", ColumnContent, field(default_factory=lambda: ColumnContent("Precision", "str", False))])
|
| 97 |
+
auto_eval_column_dict.append(["weight_dtype", ColumnContent, field(default_factory=lambda: ColumnContent("Weight dtype", "str", False))])
|
| 98 |
+
auto_eval_column_dict.append(["compute_dtype", ColumnContent, field(default_factory=lambda: ColumnContent("Compute dtype", "str", False))])
|
| 99 |
+
auto_eval_column_dict.append(["merged", ColumnContent, field(default_factory=lambda: ColumnContent("Merged", "bool", False, hidden=True))])
|
| 100 |
+
auto_eval_column_dict.append(["license", ColumnContent, field(default_factory=lambda: ColumnContent("Hub License", "str", False))])
|
| 101 |
+
auto_eval_column_dict.append(["likes", ColumnContent, field(default_factory=lambda: ColumnContent("Hub ❤️", "number", False))])
|
| 102 |
+
auto_eval_column_dict.append(["still_on_hub", ColumnContent, field(default_factory=lambda: ColumnContent("Available on the hub", "bool", False, hidden=True))])
|
| 103 |
+
auto_eval_column_dict.append(["revision", ColumnContent, field(default_factory=lambda: ColumnContent("Model sha", "str", False, False))])
|
| 104 |
+
auto_eval_column_dict.append(["flagged", ColumnContent, field(default_factory=lambda: ColumnContent("Flagged", "bool", False, hidden=True))])
|
| 105 |
+
auto_eval_column_dict.append(["moe", ColumnContent, field(default_factory=lambda: ColumnContent("MoE", "bool", False, hidden=True))])
|
| 106 |
+
auto_eval_column_dict.append(["double_quant", ColumnContent, field(default_factory=lambda: ColumnContent("Double Quant", "bool", False))])
|
| 107 |
+
auto_eval_column_dict.append(["group_size", ColumnContent, field(default_factory=lambda: ColumnContent("Group Size", "bool", False))])
|
| 108 |
# We use make dataclass to dynamically fill the scores from Tasks
|
| 109 |
# auto_eval_column_dict.sort(key=lambda x: x[0])
|
| 110 |
sorted_columns = sorted(auto_eval_column_dict[3:], key=lambda x: x[0])
|
| 111 |
sorted_auto_eval_column_dict = auto_eval_column_dict[:3] + sorted_columns
|
| 112 |
+
AutoEvalColumn = make_dataclass("AutoEvalColumn", sorted_auto_eval_column_dict, frozen=True)
|
| 113 |
+
auto_eval_cols = AutoEvalColumn()
|
| 114 |
+
|
| 115 |
|
| 116 |
@dataclass(frozen=True)
|
| 117 |
class EvalQueueColumn: # Queue column
|
|
|
|
| 122 |
weight_type = ColumnContent("weight_type", "str", "Original")
|
| 123 |
status = ColumnContent("status", "str", True)
|
| 124 |
|
| 125 |
+
eval_queue_cols = EvalQueueColumn()
|
| 126 |
|
| 127 |
baseline_row = {
|
| 128 |
+
auto_eval_cols.model.name: "<p>Baseline</p>",
|
| 129 |
+
auto_eval_cols.revision.name: "N/A",
|
| 130 |
+
auto_eval_cols.precision.name: None,
|
| 131 |
+
auto_eval_cols.merged.name: False,
|
| 132 |
+
auto_eval_cols.average.name: 31.0,
|
| 133 |
+
auto_eval_cols.arc.name: 25.0,
|
| 134 |
+
auto_eval_cols.winogrande.name: 50.0,
|
| 135 |
+
auto_eval_cols.dummy.name: "baseline",
|
| 136 |
+
auto_eval_cols.model_type.name: "",
|
| 137 |
+
auto_eval_cols.flagged.name: False,
|
| 138 |
+
auto_eval_cols.mmlu.name: 25.0,
|
| 139 |
+
auto_eval_cols.lambada_openai.name: 25.0,
|
| 140 |
+
auto_eval_cols.hellaswag.name: 25.0,
|
| 141 |
+
auto_eval_cols.piqa.name: 25.0,
|
| 142 |
+
auto_eval_cols.truthfulqa_mc.name: 25.0,
|
| 143 |
+
auto_eval_cols.openbookqa.name: 25.0,
|
| 144 |
+
auto_eval_cols.boolq.name: True,
|
| 145 |
+
auto_eval_cols.arc_easy.name: 25.0,
|
| 146 |
+
auto_eval_cols.double_quant.name: False,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
}
|
| 148 |
|
| 149 |
# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
|
|
|
|
| 155 |
# GSM8K: paper
|
| 156 |
# Define the human baselines
|
| 157 |
human_baseline_row = {
|
| 158 |
+
auto_eval_cols.model.name: "<p>Human performance</p>",
|
| 159 |
+
auto_eval_cols.revision.name: "N/A",
|
| 160 |
+
auto_eval_cols.precision.name: None,
|
| 161 |
+
auto_eval_cols.average.name: 92.75,
|
| 162 |
+
auto_eval_cols.merged.name: False,
|
| 163 |
+
auto_eval_cols.arc.name: 80.0,
|
| 164 |
+
auto_eval_cols.winogrande.name: 94.0,
|
| 165 |
+
auto_eval_cols.dummy.name: "human_baseline",
|
| 166 |
+
auto_eval_cols.model_type.name: "",
|
| 167 |
+
auto_eval_cols.flagged.name: False,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
}
|
| 169 |
|
| 170 |
@dataclass
|
|
|
|
| 383 |
|
| 384 |
|
| 385 |
# Column selection
|
| 386 |
+
COLS = [c.name for c in fields(auto_eval_cols)]
|
| 387 |
+
TYPES = [c.type for c in fields(auto_eval_cols)]
|
| 388 |
|
| 389 |
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
| 390 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
|
|
|
| 411 |
"~48": pd.Interval(36, 48, closed="right"),
|
| 412 |
"~64": pd.Interval(48, 64, closed="right"),
|
| 413 |
">72": pd.Interval(64, 200, closed="right"),
|
| 414 |
+
}
|
src/leaderboard/filter_models.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
from src.display.formatting import model_hyperlink
|
| 2 |
-
from src.display.utils import
|
| 3 |
|
| 4 |
# Models which have been flagged by users as being problematic for a reason or another
|
| 5 |
# (Model name to forum discussion link)
|
|
@@ -130,8 +130,9 @@ DO_NOT_SUBMIT_MODELS = [
|
|
| 130 |
|
| 131 |
def flag_models(leaderboard_data: list[dict]):
|
| 132 |
for model_data in leaderboard_data:
|
|
|
|
| 133 |
# Merges and moes are flagged automatically
|
| 134 |
-
if model_data
|
| 135 |
flag_key = "merged"
|
| 136 |
else:
|
| 137 |
flag_key = model_data["model_name_for_query"]
|
|
@@ -143,11 +144,11 @@ def flag_models(leaderboard_data: list[dict]):
|
|
| 143 |
f"See discussion #{issue_num}",
|
| 144 |
)
|
| 145 |
model_data[
|
| 146 |
-
|
| 147 |
-
] = f"{model_data[
|
| 148 |
-
model_data[
|
| 149 |
else:
|
| 150 |
-
model_data[
|
| 151 |
|
| 152 |
|
| 153 |
def remove_forbidden_models(leaderboard_data: list[dict]):
|
|
|
|
| 1 |
from src.display.formatting import model_hyperlink
|
| 2 |
+
from src.display.utils import auto_eval_cols
|
| 3 |
|
| 4 |
# Models which have been flagged by users as being problematic for a reason or another
|
| 5 |
# (Model name to forum discussion link)
|
|
|
|
| 130 |
|
| 131 |
def flag_models(leaderboard_data: list[dict]):
|
| 132 |
for model_data in leaderboard_data:
|
| 133 |
+
# 修改点 2:将 AutoEvalColumn 替换为 auto_eval_cols
|
| 134 |
# Merges and moes are flagged automatically
|
| 135 |
+
if model_data.get(auto_eval_cols.flagged.name) == True:
|
| 136 |
flag_key = "merged"
|
| 137 |
else:
|
| 138 |
flag_key = model_data["model_name_for_query"]
|
|
|
|
| 144 |
f"See discussion #{issue_num}",
|
| 145 |
)
|
| 146 |
model_data[
|
| 147 |
+
auto_eval_cols.model.name
|
| 148 |
+
] = f"{model_data[auto_eval_cols.model.name]} has been flagged! {issue_link}"
|
| 149 |
+
model_data[auto_eval_cols.flagged.name] = True
|
| 150 |
else:
|
| 151 |
+
model_data[auto_eval_cols.flagged.name] = False
|
| 152 |
|
| 153 |
|
| 154 |
def remove_forbidden_models(leaderboard_data: list[dict]):
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -11,7 +11,7 @@ import numpy as np
|
|
| 11 |
from huggingface_hub import ModelCard
|
| 12 |
|
| 13 |
from src.display.formatting import make_clickable_model
|
| 14 |
-
from src.display.utils import
|
| 15 |
|
| 16 |
|
| 17 |
@dataclass
|
|
@@ -60,10 +60,10 @@ class EvalResult:
|
|
| 60 |
quant_type = QuantType.from_str(str(config.get("quant_type", "GPTQ")))
|
| 61 |
weight_dtype = WeightDtype.from_str(data["task_info"].get("weight_dtype", "int4"))
|
| 62 |
compute_dtype = ComputeDtype.from_str(data["task_info"].get("compute_dtype", "bfloat16"))
|
| 63 |
-
|
| 64 |
model_params = round(float(config["model_params"]), 2)
|
| 65 |
model_size = round(float(config["model_size"]), 2)
|
| 66 |
-
|
| 67 |
if data.get("quantization_config", None):
|
| 68 |
double_quant = data["quantization_config"].get("bnb_4bit_use_double_quant", False)
|
| 69 |
group_size = data["quantization_config"].get("group_size", -1)
|
|
@@ -81,7 +81,6 @@ class EvalResult:
|
|
| 81 |
|
| 82 |
if local and org_and_model[0] != "Intel":
|
| 83 |
org_and_model = config.get("model_name").split("/")
|
| 84 |
-
# temporary "local"
|
| 85 |
org_and_model = ["local", org_and_model[-1]]
|
| 86 |
quant_type = QuantType.autoround
|
| 87 |
|
|
@@ -95,7 +94,7 @@ class EvalResult:
|
|
| 95 |
result_key = f"{org}_{model}_{precision.value.name}"
|
| 96 |
full_model = "/".join(org_and_model)
|
| 97 |
|
| 98 |
-
# Extract results
|
| 99 |
results = {}
|
| 100 |
for task in Tasks:
|
| 101 |
task = task.value
|
|
@@ -137,19 +136,12 @@ class EvalResult:
|
|
| 137 |
try:
|
| 138 |
with open(request_file, "r") as f:
|
| 139 |
request = json.load(f)
|
| 140 |
-
# self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
|
| 141 |
-
# self.precision = WeightType[request.get("weight_type", "Original")]
|
| 142 |
-
# self.num_params = request.get("model_size", 0) / 2 # need fix
|
| 143 |
self.date = request.get("submitted_time", "")
|
| 144 |
self.architecture = request.get("architectures", "Unknown")
|
| 145 |
self.status = request.get("status", "Failed")
|
| 146 |
except Exception as e:
|
| 147 |
-
print(requests_path, self.full_model,
|
| 148 |
-
self.quant_type.value.name, self.precision.value.name,
|
| 149 |
-
self.weight_dtype.value.name, self.compute_dtype.value.name)
|
| 150 |
self.status = "Failed"
|
| 151 |
print(f"Could not find request file for {self.org}/{self.model}")
|
| 152 |
-
print(traceback.format_exc())
|
| 153 |
|
| 154 |
def update_with_dynamic_file_dict(self, file_dict):
|
| 155 |
self.license = file_dict.get("license", "?")
|
|
@@ -161,57 +153,67 @@ class EvalResult:
|
|
| 161 |
|
| 162 |
def to_dict(self):
|
| 163 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
| 164 |
-
|
| 165 |
-
|
|
|
|
|
|
|
| 166 |
data_dict = {
|
| 167 |
-
"eval_name": self.eval_name,
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
AutoEvalColumn.revision.name: self.revision,
|
| 180 |
-
AutoEvalColumn.average.name: average,
|
| 181 |
-
AutoEvalColumn.license.name: self.license,
|
| 182 |
-
AutoEvalColumn.likes.name: self.likes,
|
| 183 |
-
AutoEvalColumn.params.name: self.num_params,
|
| 184 |
-
AutoEvalColumn.model_size.name: self.model_size,
|
| 185 |
-
AutoEvalColumn.group_size.name: self.group_size,
|
| 186 |
-
AutoEvalColumn.still_on_hub.name: self.still_on_hub,
|
| 187 |
-
AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
|
| 188 |
-
AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
|
| 189 |
-
AutoEvalColumn.flagged.name: self.flagged
|
| 190 |
}
|
| 191 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
for task in Tasks:
|
| 193 |
-
data_dict[task.value.col_name] = self.results
|
| 194 |
|
| 195 |
return data_dict
|
| 196 |
|
| 197 |
|
|
|
|
| 198 |
def get_request_file_for_model(requests_path, model_name,
|
| 199 |
quant_type, precision, weight_dtype, compute_dtype):
|
| 200 |
"""Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
|
| 201 |
-
# {model_path}_eval_request_{private}_{quant_type}_{precision}_{weight_dtype}_{compute_dtype}.json
|
| 202 |
request_files = os.path.join(
|
| 203 |
requests_path,
|
| 204 |
f"{model_name}_eval_request_*.json",
|
| 205 |
)
|
| 206 |
request_files = glob.glob(request_files)
|
| 207 |
|
| 208 |
-
# Select correct request file (precision)
|
| 209 |
request_file = ""
|
| 210 |
request_files = sorted(request_files, reverse=True)
|
| 211 |
for tmp_request_file in request_files:
|
| 212 |
with open(tmp_request_file, "r") as f:
|
| 213 |
req_content = json.load(f)
|
| 214 |
-
print(model_name, req_content["precision"], precision.split(".")[-1], str(req_content["quant_type"]), quant_type, req_content["weight_dtype"], weight_dtype.split(".")[-1],req_content["compute_dtype"], compute_dtype.split(".")[-1] )
|
| 215 |
if (
|
| 216 |
req_content["status"] in ["Finished"]
|
| 217 |
and req_content["precision"] == precision.split(".")[-1]
|
|
@@ -236,48 +238,48 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
|
|
| 236 |
model_result_filepaths = []
|
| 237 |
|
| 238 |
for root, _, files in os.walk(results_path):
|
| 239 |
-
# We should only have json files in model results
|
| 240 |
if len(files) == 0 or any([not f.endswith(".json") for f in files]):
|
| 241 |
continue
|
| 242 |
|
| 243 |
-
# Sort the files by date
|
| 244 |
try:
|
| 245 |
files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
|
| 246 |
-
except
|
| 247 |
files = [files[-1]]
|
| 248 |
|
| 249 |
for file in files:
|
| 250 |
model_result_filepaths.append(os.path.join(root, file))
|
| 251 |
|
| 252 |
-
|
| 253 |
-
|
|
|
|
|
|
|
| 254 |
|
| 255 |
eval_results = {}
|
| 256 |
for model_result_filepath in model_result_filepaths:
|
| 257 |
-
# Creation of result
|
| 258 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
|
|
|
| 259 |
eval_result.update_with_request_file(requests_path)
|
|
|
|
| 260 |
if eval_result.full_model in dynamic_data:
|
| 261 |
-
|
| 262 |
-
# Hardcoding because of gating problem
|
| 263 |
if "meta-llama" in eval_result.full_model:
|
| 264 |
eval_result.still_on_hub = True
|
| 265 |
|
| 266 |
-
# Store results of same eval together
|
| 267 |
eval_name = eval_result.eval_name
|
| 268 |
-
if eval_name in eval_results
|
| 269 |
eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
|
| 270 |
else:
|
| 271 |
eval_results[eval_name] = eval_result
|
| 272 |
|
| 273 |
-
|
| 274 |
results = []
|
| 275 |
for v in eval_results.values():
|
| 276 |
try:
|
| 277 |
if v.status == "Finished":
|
| 278 |
-
v.to_dict()
|
| 279 |
results.append(v)
|
| 280 |
-
except
|
|
|
|
| 281 |
continue
|
| 282 |
|
| 283 |
return results
|
|
|
|
|
|
| 11 |
from huggingface_hub import ModelCard
|
| 12 |
|
| 13 |
from src.display.formatting import make_clickable_model
|
| 14 |
+
from src.display.utils import auto_eval_cols, ModelType, Tasks, Precision, WeightType, QuantType, WeightDtype, ComputeDtype
|
| 15 |
|
| 16 |
|
| 17 |
@dataclass
|
|
|
|
| 60 |
quant_type = QuantType.from_str(str(config.get("quant_type", "GPTQ")))
|
| 61 |
weight_dtype = WeightDtype.from_str(data["task_info"].get("weight_dtype", "int4"))
|
| 62 |
compute_dtype = ComputeDtype.from_str(data["task_info"].get("compute_dtype", "bfloat16"))
|
| 63 |
+
|
| 64 |
model_params = round(float(config["model_params"]), 2)
|
| 65 |
model_size = round(float(config["model_size"]), 2)
|
| 66 |
+
|
| 67 |
if data.get("quantization_config", None):
|
| 68 |
double_quant = data["quantization_config"].get("bnb_4bit_use_double_quant", False)
|
| 69 |
group_size = data["quantization_config"].get("group_size", -1)
|
|
|
|
| 81 |
|
| 82 |
if local and org_and_model[0] != "Intel":
|
| 83 |
org_and_model = config.get("model_name").split("/")
|
|
|
|
| 84 |
org_and_model = ["local", org_and_model[-1]]
|
| 85 |
quant_type = QuantType.autoround
|
| 86 |
|
|
|
|
| 94 |
result_key = f"{org}_{model}_{precision.value.name}"
|
| 95 |
full_model = "/".join(org_and_model)
|
| 96 |
|
| 97 |
+
# Extract results
|
| 98 |
results = {}
|
| 99 |
for task in Tasks:
|
| 100 |
task = task.value
|
|
|
|
| 136 |
try:
|
| 137 |
with open(request_file, "r") as f:
|
| 138 |
request = json.load(f)
|
|
|
|
|
|
|
|
|
|
| 139 |
self.date = request.get("submitted_time", "")
|
| 140 |
self.architecture = request.get("architectures", "Unknown")
|
| 141 |
self.status = request.get("status", "Failed")
|
| 142 |
except Exception as e:
|
|
|
|
|
|
|
|
|
|
| 143 |
self.status = "Failed"
|
| 144 |
print(f"Could not find request file for {self.org}/{self.model}")
|
|
|
|
| 145 |
|
| 146 |
def update_with_dynamic_file_dict(self, file_dict):
|
| 147 |
self.license = file_dict.get("license", "?")
|
|
|
|
| 153 |
|
| 154 |
def to_dict(self):
|
| 155 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
| 156 |
+
|
| 157 |
+
valid_results = [v for v in self.results.values() if v is not None]
|
| 158 |
+
average = sum(valid_results) / len(Tasks) if len(Tasks) > 0 else 0
|
| 159 |
+
|
| 160 |
data_dict = {
|
| 161 |
+
"eval_name": self.eval_name,
|
| 162 |
+
"date": self.date,
|
| 163 |
+
auto_eval_cols.precision.name: self.precision.value.name,
|
| 164 |
+
auto_eval_cols.quant_type.name: self.quant_type.value.name,
|
| 165 |
+
auto_eval_cols.model_type_symbol.name: self.quant_type.value.symbol,
|
| 166 |
+
auto_eval_cols.weight_dtype.name: self.weight_dtype.value.name,
|
| 167 |
+
auto_eval_cols.compute_dtype.name: self.compute_dtype.value.name,
|
| 168 |
+
auto_eval_cols.model.name: make_clickable_model(self.full_model, self.result_file),
|
| 169 |
+
auto_eval_cols.revision.name: self.revision,
|
| 170 |
+
auto_eval_cols.average.name: average,
|
| 171 |
+
auto_eval_cols.model_size.name: self.model_size,
|
| 172 |
+
auto_eval_cols.dummy.name: self.full_model,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
}
|
| 174 |
|
| 175 |
+
data_dict[auto_eval_cols.still_on_hub.name] = self.still_on_hub
|
| 176 |
+
data_dict[auto_eval_cols.flagged.name] = self.flagged
|
| 177 |
+
|
| 178 |
+
if hasattr(auto_eval_cols, "double_quant"):
|
| 179 |
+
data_dict[auto_eval_cols.double_quant.name] = self.double_quant
|
| 180 |
+
if hasattr(auto_eval_cols, "architecture"):
|
| 181 |
+
data_dict[auto_eval_cols.architecture.name] = self.architecture
|
| 182 |
+
if hasattr(auto_eval_cols, "params"):
|
| 183 |
+
data_dict[auto_eval_cols.params.name] = self.num_params
|
| 184 |
+
if hasattr(auto_eval_cols, "license"):
|
| 185 |
+
data_dict[auto_eval_cols.license.name] = self.license
|
| 186 |
+
if hasattr(auto_eval_cols, "likes"):
|
| 187 |
+
data_dict[auto_eval_cols.likes.name] = self.likes
|
| 188 |
+
if hasattr(auto_eval_cols, "group_size"):
|
| 189 |
+
data_dict[auto_eval_cols.group_size.name] = self.group_size
|
| 190 |
+
|
| 191 |
+
if hasattr(auto_eval_cols, "merged"):
|
| 192 |
+
data_dict[auto_eval_cols.merged.name] = "merge" in (self.tags if self.tags else [])
|
| 193 |
+
if hasattr(auto_eval_cols, "moe"):
|
| 194 |
+
data_dict[auto_eval_cols.moe.name] = ("moe" in (self.tags if self.tags else [])) or "moe" in self.full_model.lower()
|
| 195 |
+
|
| 196 |
for task in Tasks:
|
| 197 |
+
data_dict[task.value.col_name] = self.results.get(task.value.benchmark, 0)
|
| 198 |
|
| 199 |
return data_dict
|
| 200 |
|
| 201 |
|
| 202 |
+
|
| 203 |
def get_request_file_for_model(requests_path, model_name,
|
| 204 |
quant_type, precision, weight_dtype, compute_dtype):
|
| 205 |
"""Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
|
|
|
|
| 206 |
request_files = os.path.join(
|
| 207 |
requests_path,
|
| 208 |
f"{model_name}_eval_request_*.json",
|
| 209 |
)
|
| 210 |
request_files = glob.glob(request_files)
|
| 211 |
|
|
|
|
| 212 |
request_file = ""
|
| 213 |
request_files = sorted(request_files, reverse=True)
|
| 214 |
for tmp_request_file in request_files:
|
| 215 |
with open(tmp_request_file, "r") as f:
|
| 216 |
req_content = json.load(f)
|
|
|
|
| 217 |
if (
|
| 218 |
req_content["status"] in ["Finished"]
|
| 219 |
and req_content["precision"] == precision.split(".")[-1]
|
|
|
|
| 238 |
model_result_filepaths = []
|
| 239 |
|
| 240 |
for root, _, files in os.walk(results_path):
|
|
|
|
| 241 |
if len(files) == 0 or any([not f.endswith(".json") for f in files]):
|
| 242 |
continue
|
| 243 |
|
|
|
|
| 244 |
try:
|
| 245 |
files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
|
| 246 |
+
except Exception:
|
| 247 |
files = [files[-1]]
|
| 248 |
|
| 249 |
for file in files:
|
| 250 |
model_result_filepaths.append(os.path.join(root, file))
|
| 251 |
|
| 252 |
+
dynamic_data = {}
|
| 253 |
+
if os.path.exists(dynamic_path):
|
| 254 |
+
with open(dynamic_path) as f:
|
| 255 |
+
dynamic_data = json.load(f)
|
| 256 |
|
| 257 |
eval_results = {}
|
| 258 |
for model_result_filepath in model_result_filepaths:
|
|
|
|
| 259 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
| 260 |
+
|
| 261 |
eval_result.update_with_request_file(requests_path)
|
| 262 |
+
|
| 263 |
if eval_result.full_model in dynamic_data:
|
| 264 |
+
eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
|
|
|
|
| 265 |
if "meta-llama" in eval_result.full_model:
|
| 266 |
eval_result.still_on_hub = True
|
| 267 |
|
|
|
|
| 268 |
eval_name = eval_result.eval_name
|
| 269 |
+
if eval_name in eval_results:
|
| 270 |
eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
|
| 271 |
else:
|
| 272 |
eval_results[eval_name] = eval_result
|
| 273 |
|
|
|
|
| 274 |
results = []
|
| 275 |
for v in eval_results.values():
|
| 276 |
try:
|
| 277 |
if v.status == "Finished":
|
| 278 |
+
v.to_dict()
|
| 279 |
results.append(v)
|
| 280 |
+
except Exception as e:
|
| 281 |
+
print(f"Error processing {v.eval_name}: {e}")
|
| 282 |
continue
|
| 283 |
|
| 284 |
return results
|
| 285 |
+
|
src/populate.py
CHANGED
|
@@ -4,7 +4,7 @@ import os
|
|
| 4 |
import pandas as pd
|
| 5 |
|
| 6 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
| 7 |
-
from src.display.utils import
|
| 8 |
from src.leaderboard.filter_models import filter_models_flags
|
| 9 |
from src.leaderboard.read_evals import get_raw_eval_results
|
| 10 |
|
|
@@ -12,20 +12,23 @@ from src.leaderboard.read_evals import get_raw_eval_results
|
|
| 12 |
def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
| 13 |
raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
|
| 14 |
all_data_json = [v.to_dict() for v in raw_data]
|
| 15 |
-
|
| 16 |
all_data_json.append(baseline_row)
|
| 17 |
filter_models_flags(all_data_json)
|
| 18 |
-
print("Keys in the first record of all_data_json:", all_data_json[0].keys())
|
| 19 |
-
|
| 20 |
|
| 21 |
df = pd.DataFrame.from_records(all_data_json)
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
| 29 |
return raw_data, df
|
| 30 |
|
| 31 |
|
|
@@ -39,8 +42,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
|
| 39 |
with open(file_path) as fp:
|
| 40 |
data = json.load(fp)
|
| 41 |
|
| 42 |
-
data[
|
| 43 |
-
data[
|
| 44 |
|
| 45 |
all_evals.append(data)
|
| 46 |
elif ".md" not in entry:
|
|
@@ -51,14 +54,18 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
|
| 51 |
with open(file_path) as fp:
|
| 52 |
data = json.load(fp)
|
| 53 |
|
| 54 |
-
data[
|
| 55 |
-
data[
|
| 56 |
all_evals.append(data)
|
| 57 |
|
| 58 |
pending_list = [e for e in all_evals if e["status"] in ["Pending", "Rerun", "Waiting"]]
|
| 59 |
running_list = [e for e in all_evals if e["status"] == "Running"]
|
| 60 |
finished_list = [e for e in all_evals if e["status"].startswith("Finished") or e["status"] == "PENDING_NEW_EVAL"]
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import pandas as pd
|
| 5 |
|
| 6 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
| 7 |
+
from src.display.utils import auto_eval_cols, eval_queue_cols, baseline_row
|
| 8 |
from src.leaderboard.filter_models import filter_models_flags
|
| 9 |
from src.leaderboard.read_evals import get_raw_eval_results
|
| 10 |
|
|
|
|
| 12 |
def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
| 13 |
raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
|
| 14 |
all_data_json = [v.to_dict() for v in raw_data]
|
| 15 |
+
|
| 16 |
all_data_json.append(baseline_row)
|
| 17 |
filter_models_flags(all_data_json)
|
|
|
|
|
|
|
| 18 |
|
| 19 |
df = pd.DataFrame.from_records(all_data_json)
|
| 20 |
+
|
| 21 |
+
avg_col = auto_eval_cols.average.name
|
| 22 |
+
if avg_col in df.columns:
|
| 23 |
+
df = df.sort_values(by=[avg_col], ascending=False)
|
| 24 |
+
|
| 25 |
+
existing_cols = [c for c in cols if c in df.columns]
|
| 26 |
+
df = df[existing_cols].round(decimals=2)
|
| 27 |
|
| 28 |
+
existing_benchmarks = [c for c in benchmark_cols if c in df.columns]
|
| 29 |
+
if existing_benchmarks:
|
| 30 |
+
df = df[has_no_nan_values(df, existing_benchmarks)]
|
| 31 |
+
|
| 32 |
return raw_data, df
|
| 33 |
|
| 34 |
|
|
|
|
| 42 |
with open(file_path) as fp:
|
| 43 |
data = json.load(fp)
|
| 44 |
|
| 45 |
+
data[eval_queue_cols.model.name] = make_clickable_model(data["model"])
|
| 46 |
+
data[eval_queue_cols.revision.name] = data.get("revision", "main")
|
| 47 |
|
| 48 |
all_evals.append(data)
|
| 49 |
elif ".md" not in entry:
|
|
|
|
| 54 |
with open(file_path) as fp:
|
| 55 |
data = json.load(fp)
|
| 56 |
|
| 57 |
+
data[eval_queue_cols.model.name] = make_clickable_model(data["model"])
|
| 58 |
+
data[eval_queue_cols.revision.name] = data.get("revision", "main")
|
| 59 |
all_evals.append(data)
|
| 60 |
|
| 61 |
pending_list = [e for e in all_evals if e["status"] in ["Pending", "Rerun", "Waiting"]]
|
| 62 |
running_list = [e for e in all_evals if e["status"] == "Running"]
|
| 63 |
finished_list = [e for e in all_evals if e["status"].startswith("Finished") or e["status"] == "PENDING_NEW_EVAL"]
|
| 64 |
+
|
| 65 |
+
existing_q_cols = [c for c in cols if c in pd.DataFrame(all_evals).columns] if all_evals else cols
|
| 66 |
+
|
| 67 |
+
df_pending = pd.DataFrame.from_records(pending_list, columns=existing_q_cols)
|
| 68 |
+
df_running = pd.DataFrame.from_records(running_list, columns=existing_q_cols)
|
| 69 |
+
df_finished = pd.DataFrame.from_records(finished_list, columns=existing_q_cols)
|
| 70 |
+
|
| 71 |
+
return df_finished[existing_q_cols], df_running[existing_q_cols], df_pending[existing_q_cols]
|
src/tools/plots.py
CHANGED
|
@@ -4,43 +4,44 @@ import plotly.express as px
|
|
| 4 |
from plotly.graph_objs import Figure
|
| 5 |
|
| 6 |
from src.leaderboard.filter_models import FLAGGED_MODELS
|
| 7 |
-
from src.display.utils import human_baseline_row as HUMAN_BASELINE,
|
| 8 |
from src.leaderboard.read_evals import EvalResult
|
| 9 |
|
| 10 |
|
| 11 |
-
|
| 12 |
def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
|
| 13 |
"""
|
| 14 |
Generates a DataFrame containing the maximum scores until each date.
|
| 15 |
-
|
| 16 |
-
:param results_df: A DataFrame containing result information including metric scores and dates.
|
| 17 |
-
:return: A new DataFrame containing the maximum scores until each date for every metric.
|
| 18 |
"""
|
| 19 |
-
|
| 20 |
-
results_df = pd.DataFrame(
|
| 21 |
-
|
| 22 |
-
|
|
|
|
| 23 |
|
| 24 |
-
|
| 25 |
-
scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}
|
| 26 |
|
| 27 |
-
|
| 28 |
-
for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]:
|
| 29 |
current_max = 0
|
| 30 |
last_date = ""
|
| 31 |
column = task.col_name
|
|
|
|
| 32 |
for _, row in results_df.iterrows():
|
| 33 |
-
current_model = row
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
| 36 |
if to_ignore:
|
| 37 |
continue
|
| 38 |
|
| 39 |
-
current_date = row
|
|
|
|
|
|
|
| 40 |
if task.benchmark == "Average":
|
| 41 |
-
current_score =
|
| 42 |
else:
|
| 43 |
-
current_score = row
|
| 44 |
|
| 45 |
if current_score > current_max:
|
| 46 |
if current_date == last_date and len(scores[column]) > 0:
|
|
@@ -50,57 +51,36 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
|
|
| 50 |
current_max = current_score
|
| 51 |
last_date = current_date
|
| 52 |
|
| 53 |
-
# Step 4: Return all dictionaries as DataFrames
|
| 54 |
return {k: pd.DataFrame(v) for k, v in scores.items()}
|
| 55 |
|
| 56 |
|
| 57 |
-
def create_plot_df(scores_df: dict[str
|
| 58 |
-
"""
|
| 59 |
-
Transforms the scores DataFrame into a new format suitable for plotting.
|
| 60 |
-
|
| 61 |
-
:param scores_df: A DataFrame containing metric scores and dates.
|
| 62 |
-
:return: A new DataFrame reshaped for plotting purposes.
|
| 63 |
-
"""
|
| 64 |
-
# Initialize the list to store DataFrames
|
| 65 |
dfs = []
|
| 66 |
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
| 75 |
|
| 76 |
-
|
| 77 |
concat_df.sort_values(by="date", inplace=True)
|
| 78 |
concat_df.reset_index(drop=True, inplace=True)
|
| 79 |
return concat_df
|
| 80 |
|
| 81 |
|
| 82 |
-
def create_metric_plot_obj(
|
| 83 |
-
df
|
| 84 |
-
|
| 85 |
-
"""
|
| 86 |
-
Create a Plotly figure object with lines representing different metrics
|
| 87 |
-
and horizontal dotted lines representing human baselines.
|
| 88 |
-
|
| 89 |
-
:param df: The DataFrame containing the metric values, names, and dates.
|
| 90 |
-
:param metrics: A list of strings representing the names of the metrics
|
| 91 |
-
to be included in the plot.
|
| 92 |
-
:param title: A string representing the title of the plot.
|
| 93 |
-
:return: A Plotly figure object with lines representing metrics and
|
| 94 |
-
horizontal dotted lines representing human baselines.
|
| 95 |
-
"""
|
| 96 |
|
| 97 |
-
# Filter the DataFrame based on the specified metrics
|
| 98 |
df = df[df["task"].isin(metrics)]
|
| 99 |
|
| 100 |
-
# Filter the human baselines based on the specified metrics
|
| 101 |
filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
|
| 102 |
|
| 103 |
-
# Create a line figure using plotly express with specified markers and custom data
|
| 104 |
fig = px.line(
|
| 105 |
df,
|
| 106 |
x="date",
|
|
@@ -111,33 +91,21 @@ def create_metric_plot_obj(
|
|
| 111 |
title=title,
|
| 112 |
)
|
| 113 |
|
| 114 |
-
# Update hovertemplate for better hover interaction experience
|
| 115 |
fig.update_traces(
|
| 116 |
-
hovertemplate="<br>".join(
|
| 117 |
-
[
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
]
|
| 123 |
-
)
|
| 124 |
)
|
| 125 |
|
| 126 |
-
# Update the range of the y-axis
|
| 127 |
fig.update_layout(yaxis_range=[0, 100])
|
|
|
|
| 128 |
|
| 129 |
-
# Create a dictionary to hold the color mapping for each metric
|
| 130 |
-
metric_color_mapping = {}
|
| 131 |
-
|
| 132 |
-
# Map each metric name to its color in the figure
|
| 133 |
-
for trace in fig.data:
|
| 134 |
-
metric_color_mapping[trace.name] = trace.line.color
|
| 135 |
-
|
| 136 |
-
# Iterate over filtered human baselines and add horizontal lines to the figure
|
| 137 |
for metric, value in filtered_human_baselines.items():
|
| 138 |
-
color = metric_color_mapping.get(metric, "blue")
|
| 139 |
-
location = "top left" if metric == "HellaSwag" else "bottom left"
|
| 140 |
-
# Add horizontal line with matched color and positioned annotation
|
| 141 |
fig.add_hline(
|
| 142 |
y=value,
|
| 143 |
line_dash="dot",
|
|
@@ -148,9 +116,4 @@ def create_metric_plot_obj(
|
|
| 148 |
line_color=color,
|
| 149 |
)
|
| 150 |
|
| 151 |
-
return fig
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
# Example Usage:
|
| 155 |
-
# human_baselines dictionary is defined.
|
| 156 |
-
# chart = create_metric_plot_obj(scores_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], human_baselines, "Graph Title")
|
|
|
|
| 4 |
from plotly.graph_objs import Figure
|
| 5 |
|
| 6 |
from src.leaderboard.filter_models import FLAGGED_MODELS
|
| 7 |
+
from src.display.utils import human_baseline_row as HUMAN_BASELINE, auto_eval_cols, Tasks, Task, BENCHMARK_COLS
|
| 8 |
from src.leaderboard.read_evals import EvalResult
|
| 9 |
|
| 10 |
|
|
|
|
| 11 |
def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
|
| 12 |
"""
|
| 13 |
Generates a DataFrame containing the maximum scores until each date.
|
|
|
|
|
|
|
|
|
|
| 14 |
"""
|
| 15 |
+
all_data = [v.to_dict() for v in raw_data]
|
| 16 |
+
results_df = pd.DataFrame(all_data)
|
| 17 |
+
|
| 18 |
+
if "date" in results_df.columns:
|
| 19 |
+
results_df.sort_values(by="date", inplace=True)
|
| 20 |
|
| 21 |
+
scores = {k: [] for k in BENCHMARK_COLS + [auto_eval_cols.average.name]}
|
|
|
|
| 22 |
|
| 23 |
+
for task in [t.value for t in Tasks] + [Task("Average", "avg", auto_eval_cols.average.name)]:
|
|
|
|
| 24 |
current_max = 0
|
| 25 |
last_date = ""
|
| 26 |
column = task.col_name
|
| 27 |
+
|
| 28 |
for _, row in results_df.iterrows():
|
| 29 |
+
current_model = row.get("dummy", "Unknown")
|
| 30 |
+
|
| 31 |
+
still_on_hub = row.get(auto_eval_cols.still_on_hub.name, True)
|
| 32 |
+
is_flagged = row.get(auto_eval_cols.flagged.name, False)
|
| 33 |
+
|
| 34 |
+
to_ignore = not still_on_hub or is_flagged or current_model in FLAGGED_MODELS
|
| 35 |
if to_ignore:
|
| 36 |
continue
|
| 37 |
|
| 38 |
+
current_date = row.get("date", "")
|
| 39 |
+
if not current_date: continue
|
| 40 |
+
|
| 41 |
if task.benchmark == "Average":
|
| 42 |
+
current_score = row.get(auto_eval_cols.average.name, 0)
|
| 43 |
else:
|
| 44 |
+
current_score = row.get(task.col_name, 0)
|
| 45 |
|
| 46 |
if current_score > current_max:
|
| 47 |
if current_date == last_date and len(scores[column]) > 0:
|
|
|
|
| 51 |
current_max = current_score
|
| 52 |
last_date = current_date
|
| 53 |
|
|
|
|
| 54 |
return {k: pd.DataFrame(v) for k, v in scores.items()}
|
| 55 |
|
| 56 |
|
| 57 |
+
def create_plot_df(scores_df: dict[str, pd.DataFrame]) -> pd.DataFrame:
|
| 58 |
+
"""Reshapes the scores DataFrame for plotting."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
dfs = []
|
| 60 |
|
| 61 |
+
for col in BENCHMARK_COLS + [auto_eval_cols.average.name]:
|
| 62 |
+
if col in scores_df and not scores_df[col].empty:
|
| 63 |
+
d = scores_df[col].reset_index(drop=True)
|
| 64 |
+
d["task"] = col
|
| 65 |
+
dfs.append(d)
|
| 66 |
|
| 67 |
+
if not dfs:
|
| 68 |
+
return pd.DataFrame(columns=["model", "date", "score", "task"])
|
| 69 |
|
| 70 |
+
concat_df = pd.concat(dfs, ignore_index=True)
|
| 71 |
concat_df.sort_values(by="date", inplace=True)
|
| 72 |
concat_df.reset_index(drop=True, inplace=True)
|
| 73 |
return concat_df
|
| 74 |
|
| 75 |
|
| 76 |
+
def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) -> Figure:
|
| 77 |
+
if df.empty:
|
| 78 |
+
return px.line(title="No data available")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
|
|
|
| 80 |
df = df[df["task"].isin(metrics)]
|
| 81 |
|
|
|
|
| 82 |
filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
|
| 83 |
|
|
|
|
| 84 |
fig = px.line(
|
| 85 |
df,
|
| 86 |
x="date",
|
|
|
|
| 91 |
title=title,
|
| 92 |
)
|
| 93 |
|
|
|
|
| 94 |
fig.update_traces(
|
| 95 |
+
hovertemplate="<br>".join([
|
| 96 |
+
"Model Name: %{customdata[2]}",
|
| 97 |
+
"Metric Name: %{customdata[0]}",
|
| 98 |
+
"Date: %{x}",
|
| 99 |
+
"Metric Value: %{y}",
|
| 100 |
+
])
|
|
|
|
|
|
|
| 101 |
)
|
| 102 |
|
|
|
|
| 103 |
fig.update_layout(yaxis_range=[0, 100])
|
| 104 |
+
metric_color_mapping = {trace.name: trace.line.color for trace in fig.data}
|
| 105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
for metric, value in filtered_human_baselines.items():
|
| 107 |
+
color = metric_color_mapping.get(metric, "blue")
|
| 108 |
+
location = "top left" if metric == "HellaSwag" else "bottom left"
|
|
|
|
| 109 |
fig.add_hline(
|
| 110 |
y=value,
|
| 111 |
line_dash="dot",
|
|
|
|
| 116 |
line_color=color,
|
| 117 |
)
|
| 118 |
|
| 119 |
+
return fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|