Spaces:
Runtime error
Runtime error
[ADD_MODEL]
#1
by Jongyoon-Song - opened
- .gitattributes +1 -1
- README.md +39 -7
- app.py +194 -160
- constants.py +0 -36
- handlers.py +0 -86
- src/about.py +10 -58
- src/config.py +2 -1
- src/data/export_category_250618.csv +11 -0
- src/data/export_category_250709.csv +0 -0
- src/data/export_lang_250618.csv +11 -0
- src/data/export_lang_250709.csv +0 -0
- src/data_utils.py +2 -2
- src/display/css_html_js.py +1 -281
- src/display/formatting.py +0 -125
- src/display/utils.py +15 -22
- src/envs.py +2 -2
- src/submission/check_validity.py +2 -3
- src/submission/submit.py +24 -215
- ui.py +0 -228
- utils.py +0 -42
.gitattributes
CHANGED
|
@@ -25,7 +25,6 @@
|
|
| 25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
*.wasm filter=lfs diff=lfs merge=lfs -text
|
|
@@ -33,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 28 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
*.wasm filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,14 +1,46 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: green
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 5.38.0
|
| 8 |
app_file: app.py
|
| 9 |
-
pinned:
|
| 10 |
license: apache-2.0
|
| 11 |
-
|
|
|
|
| 12 |
---
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Test Space
|
| 3 |
+
emoji: 🥇
|
| 4 |
colorFrom: green
|
| 5 |
+
colorTo: indigo
|
| 6 |
sdk: gradio
|
|
|
|
| 7 |
app_file: app.py
|
| 8 |
+
pinned: true
|
| 9 |
license: apache-2.0
|
| 10 |
+
short_description: Duplicate this leaderboard to initialize your own!
|
| 11 |
+
sdk_version: 5.19.0
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# Start the configuration
|
| 15 |
+
|
| 16 |
+
Most of the variables to change for a default leaderboard are in `src/envs.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
|
| 17 |
+
|
| 18 |
+
Results files should have the following format and be stored as json files:
|
| 19 |
+
```json
|
| 20 |
+
{
|
| 21 |
+
"config": {
|
| 22 |
+
"model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
|
| 23 |
+
"model_name": "path of the model on the hub: org/model",
|
| 24 |
+
"model_sha": "revision on the hub",
|
| 25 |
+
},
|
| 26 |
+
"results": {
|
| 27 |
+
"task_name": {
|
| 28 |
+
"metric_name": score,
|
| 29 |
+
},
|
| 30 |
+
"task_name2": {
|
| 31 |
+
"metric_name": score,
|
| 32 |
+
}
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
Request files are created automatically by this tool.
|
| 38 |
+
|
| 39 |
+
If you encounter a problem on the Space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
|
| 40 |
+
|
| 41 |
+
# Code logic for more complex edits
|
| 42 |
+
|
| 43 |
+
You'll find
|
| 44 |
+
- the main table's column names and properties in `src/display/utils.py`
|
| 45 |
+
- the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
|
| 46 |
+
- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
|
app.py
CHANGED
|
@@ -1,21 +1,15 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
|
| 3 |
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
|
| 4 |
import pandas as pd
|
| 5 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 6 |
from huggingface_hub import snapshot_download
|
| 7 |
from src.data_utils import get_dataframe_category, get_dataframe_language
|
| 8 |
import src.config as configs
|
| 9 |
-
from utils import get_profile, get_organizations, get_profile_and_organizations, download_with_restart
|
| 10 |
-
|
| 11 |
|
| 12 |
from src.about import (
|
| 13 |
CITATION_BUTTON_LABEL,
|
| 14 |
CITATION_BUTTON_TEXT,
|
| 15 |
EVALUATION_QUEUE_TEXT,
|
| 16 |
-
EVALUATION_QUEUE_TEXT_OPTION1,
|
| 17 |
-
EVALUATION_QUEUE_TEXT_OPTION2,
|
| 18 |
-
EVALUATION_QUEUE_TEXT_OPTION3,
|
| 19 |
INTRODUCTION_TEXT,
|
| 20 |
LLM_BENCHMARKS_TEXT,
|
| 21 |
TITLE,
|
|
@@ -27,45 +21,37 @@ from src.display.utils import (
|
|
| 27 |
EVAL_COLS,
|
| 28 |
EVAL_TYPES,
|
| 29 |
AutoEvalColumn,
|
|
|
|
| 30 |
fields,
|
| 31 |
WeightType,
|
| 32 |
Precision
|
| 33 |
)
|
| 34 |
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
| 35 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 36 |
-
from src.submission.submit import
|
| 37 |
|
| 38 |
|
| 39 |
-
from handlers import (
|
| 40 |
-
search_leaderboard,
|
| 41 |
-
update_modelselector_group,
|
| 42 |
-
update_columnselector_group,
|
| 43 |
-
update_leaderboard,
|
| 44 |
-
get_models_by_group,
|
| 45 |
-
)
|
| 46 |
-
from ui import create_leaderboard_tab
|
| 47 |
-
from constants import TAB_KEYS, TAB_NAMES, VLLM_VERSIONS
|
| 48 |
-
|
| 49 |
def restart_space():
|
| 50 |
API.restart_space(repo_id=REPO_ID)
|
| 51 |
|
| 52 |
### Space initialisation
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
snapshot_download
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
| 69 |
|
| 70 |
(
|
| 71 |
finished_eval_queue_df,
|
|
@@ -73,28 +59,132 @@ download_with_restart(
|
|
| 73 |
pending_eval_queue_df,
|
| 74 |
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
demo = gr.Blocks(css=custom_css)
|
| 77 |
with demo:
|
| 78 |
gr.HTML(TITLE)
|
| 79 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 80 |
-
|
| 81 |
-
organization_state = gr.State()
|
| 82 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 99 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 100 |
|
|
@@ -103,23 +193,57 @@ with demo:
|
|
| 103 |
with gr.Row():
|
| 104 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
| 105 |
|
| 106 |
-
with gr.
|
| 107 |
-
gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
with gr.Row():
|
| 110 |
-
gr.Markdown("#
|
| 111 |
|
| 112 |
with gr.Row():
|
| 113 |
with gr.Column():
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
|
|
|
|
|
|
| 117 |
multiselect=False,
|
| 118 |
-
value=
|
| 119 |
interactive=True,
|
| 120 |
)
|
| 121 |
-
|
| 122 |
-
|
| 123 |
precision = gr.Dropdown(
|
| 124 |
choices=[i.value.name for i in Precision if i != Precision.Unknown],
|
| 125 |
label="Precision",
|
|
@@ -127,118 +251,29 @@ with demo:
|
|
| 127 |
value="float16",
|
| 128 |
interactive=True,
|
| 129 |
)
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
label="vLLM version",
|
| 134 |
multiselect=False,
|
| 135 |
-
value="
|
| 136 |
interactive=True,
|
| 137 |
)
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
top_p_textbox = gr.Textbox(label="Top-p (default: 1.0)", placeholder="1.0")
|
| 141 |
-
top_k_textbox = gr.Textbox(label="Top-k (default: -1)", placeholder="-1")
|
| 142 |
-
presence_penalty_textbox = gr.Textbox(label="Presence penalty (default: 0.0)", placeholder="0.0")
|
| 143 |
-
frequency_penalty_textbox = gr.Textbox(label="Frequency penalty (default: 0.0)", placeholder="0.0")
|
| 144 |
-
repetition_penalty_textbox = gr.Textbox(label="Repetition penalty (default: 1.0)", placeholder="1.0")
|
| 145 |
-
|
| 146 |
-
login_button = gr.LoginButton()
|
| 147 |
submit_button = gr.Button("Submit Eval")
|
| 148 |
submission_result = gr.Markdown()
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
add_new_eval_option1,
|
| 152 |
[
|
| 153 |
-
benchmark_type,
|
| 154 |
model_name_textbox,
|
| 155 |
base_model_name_textbox,
|
| 156 |
revision_name_textbox,
|
| 157 |
precision,
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
top_k_textbox,
|
| 161 |
-
presence_penalty_textbox,
|
| 162 |
-
frequency_penalty_textbox,
|
| 163 |
-
repetition_penalty_textbox,
|
| 164 |
-
vllm_version_type,
|
| 165 |
-
user_state,
|
| 166 |
-
organization_state
|
| 167 |
],
|
| 168 |
submission_result,
|
| 169 |
)
|
| 170 |
-
with gr.Row():
|
| 171 |
-
gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION2, elem_classes="markdown-text")
|
| 172 |
-
|
| 173 |
-
with gr.Row():
|
| 174 |
-
gr.Markdown("## ✉️✨ Submit your model here! (if vLLM inference is unavailable)", elem_classes="markdown-text")
|
| 175 |
-
|
| 176 |
-
with gr.Row():
|
| 177 |
-
with gr.Column():
|
| 178 |
-
benchmark_type2 = gr.Dropdown(
|
| 179 |
-
choices=["TRUEBench v0.1"],
|
| 180 |
-
label="The name of the benchmark to be evaluated",
|
| 181 |
-
multiselect=False,
|
| 182 |
-
value="TRUEBench v0.1",
|
| 183 |
-
interactive=True,
|
| 184 |
-
)
|
| 185 |
-
model_name_textbox2 = gr.Textbox(label="Model name")
|
| 186 |
-
revision_name_textbox2 = gr.Textbox(label="Revision commit", placeholder="main")
|
| 187 |
-
precision2 = gr.Dropdown(
|
| 188 |
-
choices=[i.value.name for i in Precision if i != Precision.Unknown],
|
| 189 |
-
label="Precision",
|
| 190 |
-
multiselect=False,
|
| 191 |
-
value="float16",
|
| 192 |
-
interactive=True,
|
| 193 |
-
)
|
| 194 |
-
base_model_name_textbox2 = gr.Textbox(label="Base model (for delta or adapter weights)")
|
| 195 |
-
|
| 196 |
-
with gr.Column():
|
| 197 |
-
temperature_textbox2 = gr.Textbox(label="Sampling Temperature (default: 1.0)", placeholder="1.0")
|
| 198 |
-
top_p_textbox2 = gr.Textbox(label="Top-p (default: 1.0)", placeholder="1.0")
|
| 199 |
-
top_k_textbox2 = gr.Textbox(label="Top-k (default: -1)", placeholder="-1")
|
| 200 |
-
presence_penalty_textbox2 = gr.Textbox(label="Presence penalty (default: 0.0)", placeholder="0.0")
|
| 201 |
-
frequency_penalty_textbox2 = gr.Textbox(label="Frequency penalty (default: 0.0)", placeholder="0.0")
|
| 202 |
-
repetition_penalty_textbox2 = gr.Textbox(label="Repetition penalty (default: 1.0)", placeholder="1.0")
|
| 203 |
-
|
| 204 |
-
with gr.Row():
|
| 205 |
-
with gr.Column():
|
| 206 |
-
model_load_code_snippet_textbox = gr.Textbox(label="Code for model loading", lines=15, placeholder="model = AutoModel.from_pretrained('your model name', revision=revision)")
|
| 207 |
-
with gr.Column():
|
| 208 |
-
inference_code_snippet_textbox = gr.Textbox(label="Code for inference", lines=15, placeholder="output = model(...)")
|
| 209 |
-
with gr.Column():
|
| 210 |
-
terminate_code_snippet_textbox = gr.Textbox(label="Code for termination", lines=15)
|
| 211 |
-
|
| 212 |
-
login_button2 = gr.LoginButton()
|
| 213 |
-
|
| 214 |
-
submit_button2 = gr.Button("Submit Eval")
|
| 215 |
-
submission_result2 = gr.Markdown()
|
| 216 |
-
event2 = submit_button2.click(get_profile_and_organizations, inputs=[], outputs=[user_state, organization_state])
|
| 217 |
-
event2.then(
|
| 218 |
-
add_new_eval_option2,
|
| 219 |
-
[
|
| 220 |
-
benchmark_type2,
|
| 221 |
-
model_name_textbox2,
|
| 222 |
-
base_model_name_textbox2,
|
| 223 |
-
revision_name_textbox2,
|
| 224 |
-
precision2,
|
| 225 |
-
temperature_textbox2,
|
| 226 |
-
top_p_textbox2,
|
| 227 |
-
top_k_textbox2,
|
| 228 |
-
presence_penalty_textbox2,
|
| 229 |
-
frequency_penalty_textbox2,
|
| 230 |
-
repetition_penalty_textbox2,
|
| 231 |
-
model_load_code_snippet_textbox,
|
| 232 |
-
inference_code_snippet_textbox,
|
| 233 |
-
terminate_code_snippet_textbox,
|
| 234 |
-
user_state,
|
| 235 |
-
organization_state
|
| 236 |
-
],
|
| 237 |
-
submission_result2,
|
| 238 |
-
)
|
| 239 |
-
|
| 240 |
-
with gr.Row():
|
| 241 |
-
gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION3, elem_classes="markdown-text")
|
| 242 |
|
| 243 |
with gr.Row():
|
| 244 |
with gr.Accordion("📙 Citation", open=False):
|
|
@@ -250,8 +285,7 @@ with demo:
|
|
| 250 |
show_copy_button=True,
|
| 251 |
)
|
| 252 |
|
| 253 |
-
|
| 254 |
scheduler = BackgroundScheduler()
|
| 255 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 256 |
scheduler.start()
|
| 257 |
-
demo.queue(default_concurrency_limit=40).launch()
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
| 2 |
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
|
| 3 |
import pandas as pd
|
| 4 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 5 |
from huggingface_hub import snapshot_download
|
| 6 |
from src.data_utils import get_dataframe_category, get_dataframe_language
|
| 7 |
import src.config as configs
|
|
|
|
|
|
|
| 8 |
|
| 9 |
from src.about import (
|
| 10 |
CITATION_BUTTON_LABEL,
|
| 11 |
CITATION_BUTTON_TEXT,
|
| 12 |
EVALUATION_QUEUE_TEXT,
|
|
|
|
|
|
|
|
|
|
| 13 |
INTRODUCTION_TEXT,
|
| 14 |
LLM_BENCHMARKS_TEXT,
|
| 15 |
TITLE,
|
|
|
|
| 21 |
EVAL_COLS,
|
| 22 |
EVAL_TYPES,
|
| 23 |
AutoEvalColumn,
|
| 24 |
+
ModelType,
|
| 25 |
fields,
|
| 26 |
WeightType,
|
| 27 |
Precision
|
| 28 |
)
|
| 29 |
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
| 30 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 31 |
+
from src.submission.submit import add_new_eval
|
| 32 |
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
def restart_space():
|
| 35 |
API.restart_space(repo_id=REPO_ID)
|
| 36 |
|
| 37 |
### Space initialisation
|
| 38 |
+
try:
|
| 39 |
+
print(EVAL_REQUESTS_PATH)
|
| 40 |
+
snapshot_download(
|
| 41 |
+
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
| 42 |
+
)
|
| 43 |
+
except Exception:
|
| 44 |
+
restart_space()
|
| 45 |
+
try:
|
| 46 |
+
print(EVAL_RESULTS_PATH)
|
| 47 |
+
snapshot_download(
|
| 48 |
+
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
| 49 |
+
)
|
| 50 |
+
except Exception:
|
| 51 |
+
restart_space()
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
| 55 |
|
| 56 |
(
|
| 57 |
finished_eval_queue_df,
|
|
|
|
| 59 |
pending_eval_queue_df,
|
| 60 |
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
| 61 |
|
| 62 |
+
# def init_leaderboard(dataframe):
|
| 63 |
+
# if dataframe is None or dataframe.empty:
|
| 64 |
+
# raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 65 |
+
# return Leaderboard(
|
| 66 |
+
# value=dataframe,
|
| 67 |
+
# datatype=[c.type for c in fields(AutoEvalColumn)],
|
| 68 |
+
# select_columns=SelectColumns(
|
| 69 |
+
# default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
|
| 70 |
+
# cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
|
| 71 |
+
# label="Select Columns to Display:",
|
| 72 |
+
# ),
|
| 73 |
+
# search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
|
| 74 |
+
# hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
| 75 |
+
# filter_columns=[
|
| 76 |
+
# ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
|
| 77 |
+
# ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
|
| 78 |
+
# ColumnFilter(
|
| 79 |
+
# AutoEvalColumn.params.name,
|
| 80 |
+
# type="slider",
|
| 81 |
+
# min=0.01,
|
| 82 |
+
# max=150,
|
| 83 |
+
# label="Select the number of parameters (B)",
|
| 84 |
+
# ),
|
| 85 |
+
# ColumnFilter(
|
| 86 |
+
# AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
|
| 87 |
+
# ),
|
| 88 |
+
# ],
|
| 89 |
+
# bool_checkboxgroup_label="Hide models",
|
| 90 |
+
# interactive=False,
|
| 91 |
+
# )
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
tab_keys = ["Category", "Language"]
|
| 95 |
+
|
| 96 |
demo = gr.Blocks(css=custom_css)
|
| 97 |
with demo:
|
| 98 |
gr.HTML(TITLE)
|
| 99 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 100 |
+
|
|
|
|
| 101 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 102 |
+
|
| 103 |
+
def search_leaderboard(query, df):
|
| 104 |
+
if not query.strip():
|
| 105 |
+
return df
|
| 106 |
+
filtered = df[df.apply(lambda row: row.astype(str).str.contains(query, case=False).any(), axis=1)]
|
| 107 |
+
return filtered
|
| 108 |
+
|
| 109 |
+
def update_modelselector_group(groups, df):
|
| 110 |
+
"""
|
| 111 |
+
groups (gr.CheckboxGroup): List of currently selected models
|
| 112 |
+
df (DataFrame or gr.State): Current dataframe
|
| 113 |
+
"""
|
| 114 |
+
print("groups:", groups)
|
| 115 |
+
if not groups:
|
| 116 |
+
return None
|
| 117 |
+
|
| 118 |
+
filtered_df = df[df["Group"].isin(groups)]
|
| 119 |
+
models = filtered_df["Model Name"].unique().tolist()
|
| 120 |
+
|
| 121 |
+
return models
|
| 122 |
+
|
| 123 |
+
def update_columnselector_group(columns, groups, df):
|
| 124 |
+
print("column groups:", groups)
|
| 125 |
+
|
| 126 |
+
columns = [c for c in columns if c in df.columns[:3]]
|
| 127 |
+
|
| 128 |
+
columns.extend(df.columns[3:])
|
| 129 |
+
|
| 130 |
+
print(columns)
|
| 131 |
+
|
| 132 |
+
return columns
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def update_leaderboard(models, columns, df):
|
| 136 |
+
print("models:", models)
|
| 137 |
+
print("columns:", columns)
|
| 138 |
+
|
| 139 |
+
filtered_df = df[df["Model Name"].isin(models)]
|
| 140 |
+
filtered_columns = [c for c in df.columns if c in columns or c in ["Model Name"]]
|
| 141 |
+
filtered_df = filtered_df[filtered_columns]
|
| 142 |
+
|
| 143 |
+
for col in filtered_df.select_dtypes(include="number").columns:
|
| 144 |
+
filtered_df[col] = filtered_df[col].round(3)
|
| 145 |
+
|
| 146 |
+
return filtered_df
|
| 147 |
+
|
| 148 |
+
def get_models_by_group(df, groups):
|
| 149 |
+
return df[df["Group"].isin(groups)]["Model Name"].tolist()
|
| 150 |
+
|
| 151 |
+
for _, key in enumerate(tab_keys):
|
| 152 |
+
with gr.TabItem(key, visible=True):
|
| 153 |
+
if key == "Category":
|
| 154 |
+
df = get_dataframe_category()
|
| 155 |
+
else:
|
| 156 |
+
df = get_dataframe_language()
|
| 157 |
+
df_state = gr.State(df)
|
| 158 |
+
|
| 159 |
+
with gr.Row():
|
| 160 |
+
with gr.Column():
|
| 161 |
+
search_box = gr.Textbox(label="Search Model by Name")
|
| 162 |
+
group_list = df["Group"].unique().tolist()
|
| 163 |
+
group_selector = gr.CheckboxGroup(choices=df["Group"].unique().tolist(), value=group_list, label="Select Model Group")
|
| 164 |
+
|
| 165 |
+
if key == "Category":
|
| 166 |
+
column_selector = gr.CheckboxGroup(choices=df.columns.tolist()[3:], value=configs.ON_LOAD_COLUMNS_CATEGORY[3:], label="Select Columns")
|
| 167 |
+
else:
|
| 168 |
+
column_selector = gr.CheckboxGroup(choices=df.columns.tolist()[3:], value=configs.ON_LOAD_COLUMNS_LANG[3:], label="Select Columns")
|
| 169 |
+
|
| 170 |
+
with gr.Column():
|
| 171 |
+
with gr.Accordion("세부 사항", open=False):
|
| 172 |
+
model_group = df["Model Name"].tolist()
|
| 173 |
+
model_selector = gr.CheckboxGroup(choices=df["Model Name"].tolist(), value=model_group, label="Select Models")
|
| 174 |
+
|
| 175 |
+
ld = gr.DataFrame(
|
| 176 |
+
value=df.round(3)
|
| 177 |
+
)
|
| 178 |
+
|
| 179 |
+
# Define change functions for user interaction
|
| 180 |
+
search_box.change(fn=search_leaderboard, inputs=[search_box, df_state], outputs=ld)
|
| 181 |
+
group_selector.change(fn=update_modelselector_group, inputs=[group_selector, df_state], outputs=model_selector)
|
| 182 |
+
model_selector.change(fn=update_leaderboard, inputs=[model_selector, column_selector, df_state], outputs=ld)
|
| 183 |
+
column_selector.change(fn=update_leaderboard, inputs=[model_selector, column_selector, df_state], outputs=ld)
|
| 184 |
+
|
| 185 |
+
# with gr.TabItem("Docs"):
|
| 186 |
+
# gr.Markdown((Path(__file__).parent / "docs.md").read_text())
|
| 187 |
+
|
| 188 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 189 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 190 |
|
|
|
|
| 193 |
with gr.Row():
|
| 194 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
| 195 |
|
| 196 |
+
with gr.Column():
|
| 197 |
+
with gr.Accordion(
|
| 198 |
+
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
|
| 199 |
+
open=False,
|
| 200 |
+
):
|
| 201 |
+
with gr.Row():
|
| 202 |
+
finished_eval_table = gr.components.Dataframe(
|
| 203 |
+
value=finished_eval_queue_df,
|
| 204 |
+
headers=EVAL_COLS,
|
| 205 |
+
datatype=EVAL_TYPES,
|
| 206 |
+
row_count=5,
|
| 207 |
+
)
|
| 208 |
+
with gr.Accordion(
|
| 209 |
+
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
|
| 210 |
+
open=False,
|
| 211 |
+
):
|
| 212 |
+
with gr.Row():
|
| 213 |
+
running_eval_table = gr.components.Dataframe(
|
| 214 |
+
value=running_eval_queue_df,
|
| 215 |
+
headers=EVAL_COLS,
|
| 216 |
+
datatype=EVAL_TYPES,
|
| 217 |
+
row_count=5,
|
| 218 |
+
)
|
| 219 |
|
| 220 |
+
with gr.Accordion(
|
| 221 |
+
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
|
| 222 |
+
open=False,
|
| 223 |
+
):
|
| 224 |
+
with gr.Row():
|
| 225 |
+
pending_eval_table = gr.components.Dataframe(
|
| 226 |
+
value=pending_eval_queue_df,
|
| 227 |
+
headers=EVAL_COLS,
|
| 228 |
+
datatype=EVAL_TYPES,
|
| 229 |
+
row_count=5,
|
| 230 |
+
)
|
| 231 |
with gr.Row():
|
| 232 |
+
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
|
| 233 |
|
| 234 |
with gr.Row():
|
| 235 |
with gr.Column():
|
| 236 |
+
model_name_textbox = gr.Textbox(label="Model name")
|
| 237 |
+
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
| 238 |
+
model_type = gr.Dropdown(
|
| 239 |
+
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
| 240 |
+
label="Model type",
|
| 241 |
multiselect=False,
|
| 242 |
+
value=None,
|
| 243 |
interactive=True,
|
| 244 |
)
|
| 245 |
+
|
| 246 |
+
with gr.Column():
|
| 247 |
precision = gr.Dropdown(
|
| 248 |
choices=[i.value.name for i in Precision if i != Precision.Unknown],
|
| 249 |
label="Precision",
|
|
|
|
| 251 |
value="float16",
|
| 252 |
interactive=True,
|
| 253 |
)
|
| 254 |
+
weight_type = gr.Dropdown(
|
| 255 |
+
choices=[i.value.name for i in WeightType],
|
| 256 |
+
label="Weights type",
|
|
|
|
| 257 |
multiselect=False,
|
| 258 |
+
value="Original",
|
| 259 |
interactive=True,
|
| 260 |
)
|
| 261 |
+
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
| 262 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
submit_button = gr.Button("Submit Eval")
|
| 264 |
submission_result = gr.Markdown()
|
| 265 |
+
submit_button.click(
|
| 266 |
+
add_new_eval,
|
|
|
|
| 267 |
[
|
|
|
|
| 268 |
model_name_textbox,
|
| 269 |
base_model_name_textbox,
|
| 270 |
revision_name_textbox,
|
| 271 |
precision,
|
| 272 |
+
weight_type,
|
| 273 |
+
model_type,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
],
|
| 275 |
submission_result,
|
| 276 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
|
| 278 |
with gr.Row():
|
| 279 |
with gr.Accordion("📙 Citation", open=False):
|
|
|
|
| 285 |
show_copy_button=True,
|
| 286 |
)
|
| 287 |
|
|
|
|
| 288 |
scheduler = BackgroundScheduler()
|
| 289 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 290 |
scheduler.start()
|
| 291 |
+
demo.queue(default_concurrency_limit=40).launch()
|
constants.py
DELETED
|
@@ -1,36 +0,0 @@
|
|
| 1 |
-
# constants.py
|
| 2 |
-
|
| 3 |
-
TAB_KEYS = ["Category", "Language"]
|
| 4 |
-
|
| 5 |
-
TAB_NAMES = {
|
| 6 |
-
"Category": "TRUEBench v0.1 (Category 🔧)",
|
| 7 |
-
"Language": "TRUEBench v0.1 (Language 🌎)"
|
| 8 |
-
}
|
| 9 |
-
|
| 10 |
-
VLLM_VERSIONS = [
|
| 11 |
-
"v0.9.2", "v0.9.2rc2", "v0.9.2rc1", "v0.9.1", "v0.9.1rc2", "v0.9.1rc1",
|
| 12 |
-
"v0.9.0.1", "v0.9.0", "v0.8.5", "v0.8.5.post1", "v0.8.4", "v0.8.3",
|
| 13 |
-
"v0.8.3rc1", "v0.8.2", "v0.8.1", "v0.8.0", "v0.8.0rc2", "v0.8.0rc1",
|
| 14 |
-
"v0.7.3", "v0.7.2", "v0.7.1", "v0.6.6", "v0.6.6.post1", "v0.6.5",
|
| 15 |
-
"v0.6.4.post1", "v0.6.4", "v0.6.3.post1", "v0.6.2", "v0.6.1",
|
| 16 |
-
"v0.6.1.post2", "v0.6.1.post1", "v0.6.0"
|
| 17 |
-
]
|
| 18 |
-
|
| 19 |
-
# 리더보드 필수 컬럼(항상 포함되어야 함)
|
| 20 |
-
LEADERBOARD_REQUIRED_COLUMNS = [
|
| 21 |
-
"Model Name", "Group", "Overall", "Model Type", "Output Form", "Rank"
|
| 22 |
-
]
|
| 23 |
-
|
| 24 |
-
# Model badge mappings (centralized for both UI and backend)
|
| 25 |
-
MODEL_TYPE_MAP = {
|
| 26 |
-
"deepseek_r1": "open",
|
| 27 |
-
"deepseek_r1_0528": "open",
|
| 28 |
-
"Qwen3-32B": "open",
|
| 29 |
-
"Gauss2.3-Think-250708": "closed"
|
| 30 |
-
}
|
| 31 |
-
OUTPUT_FORM_MAP = {
|
| 32 |
-
"deepseek_r1": "reasoning",
|
| 33 |
-
"deepseek_r1_0528": "normal",
|
| 34 |
-
"Qwen3-32B": "reasoning",
|
| 35 |
-
"Gauss2.3-Think-250708": "reasoning"
|
| 36 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
handlers.py
DELETED
|
@@ -1,86 +0,0 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
-
|
| 3 |
-
def search_leaderboard(query, df, sort_col=None, sort_asc=True):
|
| 4 |
-
if not query.strip():
|
| 5 |
-
filtered = df
|
| 6 |
-
else:
|
| 7 |
-
filtered = df[df.apply(lambda row: row.astype(str).str.contains(query, case=False).any(), axis=1)]
|
| 8 |
-
if sort_col and sort_col in filtered.columns:
|
| 9 |
-
filtered = filtered.sort_values(sort_col, ascending=sort_asc).reset_index(drop=True)
|
| 10 |
-
return filtered
|
| 11 |
-
|
| 12 |
-
def update_modelselector_group(groups, df):
|
| 13 |
-
"""
|
| 14 |
-
groups (gr.CheckboxGroup): List of currently selected models
|
| 15 |
-
df (DataFrame or gr.State): Current dataframe
|
| 16 |
-
"""
|
| 17 |
-
print("groups:", groups)
|
| 18 |
-
if not groups:
|
| 19 |
-
return None
|
| 20 |
-
|
| 21 |
-
filtered_df = df[df["Group"].isin(groups)]
|
| 22 |
-
models = filtered_df["Model Name"].unique().tolist()
|
| 23 |
-
|
| 24 |
-
return models
|
| 25 |
-
|
| 26 |
-
def update_columnselector_group(columns, groups, df):
|
| 27 |
-
print("column groups:", groups)
|
| 28 |
-
|
| 29 |
-
columns = [c for c in columns if c in df.columns[:3]]
|
| 30 |
-
|
| 31 |
-
columns.extend(df.columns[3:])
|
| 32 |
-
|
| 33 |
-
print(columns)
|
| 34 |
-
|
| 35 |
-
return columns
|
| 36 |
-
|
| 37 |
-
from constants import LEADERBOARD_REQUIRED_COLUMNS, MODEL_TYPE_MAP, OUTPUT_FORM_MAP
|
| 38 |
-
|
| 39 |
-
def update_leaderboard(models, columns, df, sort_col=None, sort_asc=True):
|
| 40 |
-
print("models:", models)
|
| 41 |
-
print("columns:", columns)
|
| 42 |
-
print("sort_col:", sort_col, "sort_asc:", sort_asc)
|
| 43 |
-
|
| 44 |
-
# 필수 컬럼 항상 포함
|
| 45 |
-
columns = list(dict.fromkeys(LEADERBOARD_REQUIRED_COLUMNS + list(columns)))
|
| 46 |
-
|
| 47 |
-
# 뱃지/랭크 렌더링에 필요한 컬럼 항상 포함
|
| 48 |
-
always_include = ["Model Name", "Model Type", "Output Form", "Rank"]
|
| 49 |
-
filtered_df = df[df["Model Name"].isin(models)].copy()
|
| 50 |
-
|
| 51 |
-
# Model Type, Output Form, Rank 컬럼이 없으면 생성
|
| 52 |
-
if "Model Type" not in filtered_df.columns:
|
| 53 |
-
filtered_df["Model Type"] = filtered_df["Model Name"].map(MODEL_TYPE_MAP).fillna("open")
|
| 54 |
-
if "Output Form" not in filtered_df.columns:
|
| 55 |
-
filtered_df["Output Form"] = filtered_df["Model Name"].map(OUTPUT_FORM_MAP).fillna("normal")
|
| 56 |
-
if "Rank" not in filtered_df.columns:
|
| 57 |
-
# 정렬 기준: sort_col이 있으면 해당 컬럼, 없으면 Overall
|
| 58 |
-
rank_col = sort_col if sort_col and sort_col in filtered_df.columns else ("Overall" if "Overall" in filtered_df.columns else None)
|
| 59 |
-
if rank_col:
|
| 60 |
-
filtered_df = filtered_df.sort_values(rank_col, ascending=not sort_asc).reset_index(drop=True)
|
| 61 |
-
filtered_df["Rank"] = filtered_df.index + 1
|
| 62 |
-
else:
|
| 63 |
-
filtered_df["Rank"] = range(1, len(filtered_df) + 1)
|
| 64 |
-
|
| 65 |
-
# always_include 컬럼은 무조건 포함
|
| 66 |
-
filtered_columns = [c for c in df.columns if c in columns or c in always_include]
|
| 67 |
-
for col in always_include:
|
| 68 |
-
if col not in filtered_columns:
|
| 69 |
-
filtered_columns.append(col)
|
| 70 |
-
|
| 71 |
-
# 중복 제거 및 순서 보장
|
| 72 |
-
filtered_columns = list(dict.fromkeys(filtered_columns))
|
| 73 |
-
filtered_df = filtered_df[filtered_columns]
|
| 74 |
-
|
| 75 |
-
for col in filtered_df.select_dtypes(include="number").columns:
|
| 76 |
-
filtered_df[col] = filtered_df[col].round(3)
|
| 77 |
-
|
| 78 |
-
if sort_col and sort_col in filtered_df.columns:
|
| 79 |
-
filtered_df = filtered_df.sort_values(sort_col, ascending=sort_asc).reset_index(drop=True)
|
| 80 |
-
# Rank 재계산
|
| 81 |
-
filtered_df["Rank"] = filtered_df.index + 1
|
| 82 |
-
|
| 83 |
-
return filtered_df
|
| 84 |
-
|
| 85 |
-
def get_models_by_group(df, groups):
|
| 86 |
-
return df[df["Group"].isin(groups)]["Model Name"].tolist()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/about.py
CHANGED
|
@@ -21,32 +21,23 @@ NUM_FEWSHOT = 0 # Change with your few shot
|
|
| 21 |
|
| 22 |
|
| 23 |
# Your leaderboard name
|
| 24 |
-
TITLE = """<h1 align="center" id="space-title">🥇
|
| 25 |
|
| 26 |
# What does your leaderboard evaluate?
|
| 27 |
INTRODUCTION_TEXT = """
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
*TRUE (Trustworthy Real-world Usage Evaluation) Bench* is designed to evaluate LLMs used as Productivity Assistants, which support people's job productivity.
|
| 31 |
"""
|
| 32 |
|
| 33 |
# Which evaluations are you running? how can people reproduce what you have?
|
| 34 |
LLM_BENCHMARKS_TEXT = f"""
|
| 35 |
## How it works
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
| 37 |
"""
|
| 38 |
|
| 39 |
EVALUATION_QUEUE_TEXT = """
|
| 40 |
-
## Submission Policy
|
| 41 |
-
For each benchmark:
|
| 42 |
-
1. Each model affiliation (individual or organization) can submit up to 3 times within 24 hours.
|
| 43 |
-
2. The same model can only be submitted once within 24 hours.
|
| 44 |
-
3. Criteria for determining duplicate submissions:
|
| 45 |
-
- Benchmark name
|
| 46 |
-
- Model full name
|
| 47 |
-
- Sampling parameters, dtype, vLLM version, etc. are not subject to duplicate checking.
|
| 48 |
-
4. Submissions are only allowed if the model's organization or username matches that of the submitter.
|
| 49 |
-
|
| 50 |
## Some good practices before submitting a model
|
| 51 |
|
| 52 |
### 1) Make sure you can load your model and tokenizer using AutoClasses:
|
|
@@ -69,50 +60,11 @@ This is a leaderboard for Open LLMs, and we'd love for as many people as possibl
|
|
| 69 |
|
| 70 |
### 4) Fill up your model card
|
| 71 |
When we add extra information about models to the leaderboard, it will be automatically taken from the model card
|
| 72 |
-
"""
|
| 73 |
-
|
| 74 |
-
EVALUATION_QUEUE_TEXT_OPTION1 = """
|
| 75 |
-
# (Option 1) Submit HF model where vLLM inference is available
|
| 76 |
-
1. Fill in the information, including model name, vLLM version, and sampling hyperparameters.
|
| 77 |
-
2. Sign in using the log-in button below.
|
| 78 |
-
3. Press "Submit Eval" button to submit.
|
| 79 |
-
"""
|
| 80 |
-
|
| 81 |
-
EVALUATION_QUEUE_TEXT_OPTION2 = """
|
| 82 |
-
# (Option 2) Submit HF model where vLLM inference is unavailable
|
| 83 |
-
1. Fill in the same information as in Option 1, plus code snippets for model loading, inference, and termination.
|
| 84 |
-
2. Sign in using the log-in button below.
|
| 85 |
-
3. Press "Submit Eval" button to submit.
|
| 86 |
-
"""
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
```
|
| 93 |
-
### Open-weight models:
|
| 94 |
-
- Benchmark Name: [The name of benchmark to be evaluated]
|
| 95 |
-
- HuggingFace Model ID: [HF_MODEL_ID]
|
| 96 |
-
- Pretty Name: [PRETTY_NAME]
|
| 97 |
-
- Sampling parameters:
|
| 98 |
-
- Temperature
|
| 99 |
-
- Top-p
|
| 100 |
-
- Top-k
|
| 101 |
-
- Presence penalty
|
| 102 |
-
- Frequency penalty
|
| 103 |
-
- Repetition penalty
|
| 104 |
-
- Supported by vLLM: [yes/no]
|
| 105 |
-
- (If yes) Version of vLLM
|
| 106 |
-
- (If no) Code snippets:
|
| 107 |
-
- Model loading
|
| 108 |
-
- Inference
|
| 109 |
-
- Termination
|
| 110 |
-
|
| 111 |
-
### Misc.
|
| 112 |
-
- Contact: [your email]
|
| 113 |
-
- Description: [e.g., paper link, blog post, etc.]
|
| 114 |
-
- Notes: [optional]
|
| 115 |
-
```
|
| 116 |
"""
|
| 117 |
|
| 118 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
# Your leaderboard name
|
| 24 |
+
TITLE = """<h1 align="center" id="space-title">🥇 ProductivityBench (v1)</h1>"""
|
| 25 |
|
| 26 |
# What does your leaderboard evaluate?
|
| 27 |
INTRODUCTION_TEXT = """
|
| 28 |
+
ProductivityBench is designed to evaluate LLMs used as Productivity Assistants, which support people's job productivity.
|
|
|
|
|
|
|
| 29 |
"""
|
| 30 |
|
| 31 |
# Which evaluations are you running? how can people reproduce what you have?
|
| 32 |
LLM_BENCHMARKS_TEXT = f"""
|
| 33 |
## How it works
|
| 34 |
+
|
| 35 |
+
## Reproducibility
|
| 36 |
+
To reproduce our results, here are the commands you can run:
|
| 37 |
+
|
| 38 |
"""
|
| 39 |
|
| 40 |
EVALUATION_QUEUE_TEXT = """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
## Some good practices before submitting a model
|
| 42 |
|
| 43 |
### 1) Make sure you can load your model and tokenizer using AutoClasses:
|
|
|
|
| 60 |
|
| 61 |
### 4) Fill up your model card
|
| 62 |
When we add extra information about models to the leaderboard, it will be automatically taken from the model card
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
+
## In case of model failure
|
| 65 |
+
If your model is displayed in the `FAILED` category, its execution stopped.
|
| 66 |
+
Make sure you have followed the above steps first.
|
| 67 |
+
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
"""
|
| 69 |
|
| 70 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
src/config.py
CHANGED
|
@@ -24,9 +24,10 @@ ON_LOAD_COLUMNS_CATEGORY = [
|
|
| 24 |
"Editing",
|
| 25 |
"Data Analysis",
|
| 26 |
"Reasoning",
|
|
|
|
| 27 |
"Hallucination",
|
| 28 |
"Safety",
|
| 29 |
-
"
|
| 30 |
"Summarization",
|
| 31 |
"Translation",
|
| 32 |
"Multi-Turn"
|
|
|
|
| 24 |
"Editing",
|
| 25 |
"Data Analysis",
|
| 26 |
"Reasoning",
|
| 27 |
+
"Samsung Knowledge",
|
| 28 |
"Hallucination",
|
| 29 |
"Safety",
|
| 30 |
+
"Repeatition",
|
| 31 |
"Summarization",
|
| 32 |
"Translation",
|
| 33 |
"Multi-Turn"
|
src/data/export_category_250618.csv
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"Model Name" "Group" "Overall" "Content Generation" "Editing" "Data Analysis" "Reasoning" "Samsung Knowledge" "Hallucination" "Safety" "Repeatition" "Summarization" "Translation" "Multi-Turn"
|
| 2 |
+
"claude-3-haiku-20240307" "Claude" "40.60" "44.16" "36.90" "39.33" "21.00" "23.33" "43.33" "50.00" "30.00" "60.96" "40.00" "23.33"
|
| 3 |
+
"claude-3-sonnet-20240229" "Claude" "44.47" "48.05" "42.26" "45.33" "32.00" "23.33" "45.00" "56.25" "36.67" "60.96" "46.33" "22.78"
|
| 4 |
+
"claude-3-5-sonnet-20240620" "Claude" "56.35" "53.25" "54.17" "64.00" "49.00" "55.00" "60.00" "52.50" "40.00" "69.86" "58.67" "36.67"
|
| 5 |
+
"claude-3-5-sonnet-20241022" "Claude" "58.45" "61.04" "55.36" "66.00" "54.00" "40.00" "63.33" "42.50" "40.00" "73.97" "62.33" "38.33"
|
| 6 |
+
"claude-3-7-sonnet-20250219" "Claude" "56.99" "59.09" "59.52" "64.00" "54.00" "50.00" "65.00" "37.50" "50.00" "71.58" "55.33" "37.22"
|
| 7 |
+
"claude-3-7-sonnet-20250219-thinking" "Claude" "58.70" "63.64" "58.33" "71.52" "68.00" "55.00" "62.71" "37.50" "50.00" "72.60" "55.00" "33.33"
|
| 8 |
+
"deepseek_r1" "DeepSeek" "55.27" "61.69" "54.76" "68.67" "68.00" "46.67" "51.67" "20.00" "46.67" "67.81" "49.00" "43.33"
|
| 9 |
+
"deepseek_r1_0528" "DeepSeek" "52.60" "59.09" "51.19" "65.33" "65.00" "38.33" "43.33" "27.50" "53.33" "69.18" "41.33" "41.67"
|
| 10 |
+
"deepseek_v3" "DeepSeek" "56.99" "62.99" "58.93" "58.00" "59.00" "36.67" "41.67" "25.00" "40.00" "72.60" "60.00" "46.67"
|
| 11 |
+
"deepseek_v3_0324" "DeepSeek" "54.51" "55.84" "48.21" "63.33" "70.00" "43.33" "50.00" "20.00" "46.67" "72.95" "49.67" "43.33"
|
src/data/export_category_250709.csv
DELETED
|
Binary file (1.26 kB)
|
|
|
src/data/export_lang_250618.csv
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"Model Name" "Group" "Overall" "KO" "EN" "JA" "ZH" "PL" "DE" "PT" "ES" "FR" "IT" "RU" "VI"
|
| 2 |
+
"claude-3-haiku-20240307" "Claude" "40.60" "31.87" "30.99" "41.54" "36.92" "52.24" "55.22" "56.72" "55.22" "68.66" "74.63" "50.75" "38.46"
|
| 3 |
+
"claude-3-sonnet-20240229" "Claude" "44.47" "41.32" "33.19" "50.77" "38.46" "55.22" "52.24" "58.21" "61.19" "65.67" "67.16" "49.25" "44.62"
|
| 4 |
+
"claude-3-5-sonnet-20240620" "Claude" "56.35" "55.60" "43.30" "73.85" "47.69" "64.18" "65.67" "70.15" "67.16" "76.12" "71.64" "65.67" "55.38"
|
| 5 |
+
"claude-3-5-sonnet-20241022" "Claude" "58.45" "57.14" "47.91" "69.23" "49.23" "61.19" "62.69" "70.15" "71.64" "80.60" "73.13" "67.16" "60.00"
|
| 6 |
+
"claude-3-7-sonnet-20250219" "Claude" "56.99" "55.82" "46.59" "63.08" "56.92" "68.66" "59.70" "64.18" "64.18" "74.63" "67.16" "64.18" "66.15"
|
| 7 |
+
"claude-3-7-sonnet-20250219-thinking" "Claude" "58.70" "60.44" "50.11" "64.62" "44.62" "65.67" "67.16" "65.67" "50.75" "74.63" "70.15" "67.16" "63.08"
|
| 8 |
+
"deepseek_r1" "DeepSeek" "55.27" "53.19" "50.99" "64.62" "44.62" "59.70" "64.18" "55.22" "58.21" "70.15" "67.16" "58.21" "53.85"
|
| 9 |
+
"deepseek_r1_0528" "DeepSeek" "52.60" "48.79" "47.25" "58.46" "43.08" "52.24" "61.19" "68.66" "58.21" "62.69" "65.67" "61.19" "56.92"
|
| 10 |
+
"deepseek_v3" "DeepSeek" "56.99" "53.41" "49.01" "66.15" "43.08" "59.70" "70.15" "67.16" "65.67" "79.10" "74.63" "58.21" "64.62"
|
| 11 |
+
"deepseek_v3_0324" "DeepSeek" "54.51" "50.99" "49.67" "56.92" "43.08" "64.18" "68.66" "61.19" "56.72" "71.64" "62.69" "64.18" "52.31"
|
src/data/export_lang_250709.csv
DELETED
|
Binary file (958 Bytes)
|
|
|
src/data_utils.py
CHANGED
|
@@ -3,12 +3,12 @@ from pathlib import Path
|
|
| 3 |
|
| 4 |
def get_dataframe_category():
|
| 5 |
abs_path = Path(__file__).parent
|
| 6 |
-
df = pd.read_csv(str(abs_path / "data/
|
| 7 |
df = df.sort_values("Overall", ascending=False)
|
| 8 |
return df
|
| 9 |
|
| 10 |
def get_dataframe_language():
|
| 11 |
abs_path = Path(__file__).parent
|
| 12 |
-
df = pd.read_csv(str(abs_path / "data/
|
| 13 |
df = df.sort_values("Overall", ascending=False)
|
| 14 |
return df
|
|
|
|
| 3 |
|
| 4 |
def get_dataframe_category():
|
| 5 |
abs_path = Path(__file__).parent
|
| 6 |
+
df = pd.read_csv(str(abs_path / "data/export_category_250618.csv"), encoding='utf-8', delimiter=" ")
|
| 7 |
df = df.sort_values("Overall", ascending=False)
|
| 8 |
return df
|
| 9 |
|
| 10 |
def get_dataframe_language():
|
| 11 |
abs_path = Path(__file__).parent
|
| 12 |
+
df = pd.read_csv(str(abs_path / "data/export_lang_250618.csv"), encoding='utf-8', delimiter=" ")
|
| 13 |
df = df.sort_values("Overall", ascending=False)
|
| 14 |
return df
|
src/display/css_html_js.py
CHANGED
|
@@ -1,128 +1,5 @@
|
|
| 1 |
custom_css = """
|
| 2 |
|
| 3 |
-
/* Sort arrow/button styles */
|
| 4 |
-
.sort-arrow, .sort-btn {
|
| 5 |
-
display: inline-flex;
|
| 6 |
-
align-items: center;
|
| 7 |
-
justify-content: center;
|
| 8 |
-
background: #23244a;
|
| 9 |
-
color: #ffd700 !important; /* 항상 노란색 */
|
| 10 |
-
border: 1.5px solid #ffd700; /* 금색 테두리 */
|
| 11 |
-
border-radius: 6px;
|
| 12 |
-
font-size: 15px;
|
| 13 |
-
font-weight: 700;
|
| 14 |
-
margin-left: 6px;
|
| 15 |
-
margin-right: 2px;
|
| 16 |
-
padding: 2px 8px 2px 6px;
|
| 17 |
-
cursor: pointer;
|
| 18 |
-
transition: background 0.2s, color 0.2s, border 0.2s;
|
| 19 |
-
min-width: 28px;
|
| 20 |
-
min-height: 28px;
|
| 21 |
-
outline: none;
|
| 22 |
-
}
|
| 23 |
-
.sort-arrow.active, .sort-btn.active {
|
| 24 |
-
color: #ffd700 !important; /* 금색 */
|
| 25 |
-
border-color: #ffd700;
|
| 26 |
-
background: #1a237e;
|
| 27 |
-
}
|
| 28 |
-
.sort-arrow:hover, .sort-btn:hover {
|
| 29 |
-
background: #ffd700;
|
| 30 |
-
color: #23244a !important;
|
| 31 |
-
border-color: #ffd700;
|
| 32 |
-
}
|
| 33 |
-
.sort-arrow svg, .sort-btn svg {
|
| 34 |
-
margin-left: 2px;
|
| 35 |
-
margin-right: 0;
|
| 36 |
-
width: 1em;
|
| 37 |
-
height: 1em;
|
| 38 |
-
vertical-align: middle;
|
| 39 |
-
}
|
| 40 |
-
|
| 41 |
-
/* Enhanced leaderboard table styles */
|
| 42 |
-
.pretty-leaderboard-table {
|
| 43 |
-
width: 100%;
|
| 44 |
-
border-collapse: separate;
|
| 45 |
-
border-spacing: 0;
|
| 46 |
-
background: rgba(30, 34, 54, 0.98);
|
| 47 |
-
border-radius: 16px;
|
| 48 |
-
box-shadow: 0 4px 24px 0 rgba(16, 152, 247, 0.10), 0 1.5px 6px 0 rgba(227, 84, 84, 0.08);
|
| 49 |
-
overflow: hidden;
|
| 50 |
-
margin-bottom: 24px;
|
| 51 |
-
}
|
| 52 |
-
.pretty-leaderboard-table th, .pretty-leaderboard-table td {
|
| 53 |
-
padding: 12px 16px;
|
| 54 |
-
text-align: left;
|
| 55 |
-
border-bottom: 1px solid #23244a;
|
| 56 |
-
font-size: 15px;
|
| 57 |
-
}
|
| 58 |
-
.pretty-leaderboard-table th {
|
| 59 |
-
background: linear-gradient(90deg, #23244a 0%, #1a237e 100%);
|
| 60 |
-
color: #F5F6F7;
|
| 61 |
-
font-weight: 700;
|
| 62 |
-
letter-spacing: 0.5px;
|
| 63 |
-
border-bottom: 2px solid #1098F7;
|
| 64 |
-
}
|
| 65 |
-
.pretty-leaderboard-table tr:nth-child(even) {
|
| 66 |
-
background: rgba(245, 246, 247, 0.03);
|
| 67 |
-
}
|
| 68 |
-
.pretty-leaderboard-table tr:hover {
|
| 69 |
-
background: rgba(16, 152, 247, 0.08);
|
| 70 |
-
transition: background 0.2s;
|
| 71 |
-
}
|
| 72 |
-
.pretty-leaderboard-table td {
|
| 73 |
-
color: #F5F6F7;
|
| 74 |
-
vertical-align: middle;
|
| 75 |
-
}
|
| 76 |
-
.pretty-leaderboard-table tr:last-child td {
|
| 77 |
-
border-bottom: none;
|
| 78 |
-
}
|
| 79 |
-
.pretty-leaderboard-table th:first-child, .pretty-leaderboard-table td:first-child {
|
| 80 |
-
border-top-left-radius: 16px;
|
| 81 |
-
}
|
| 82 |
-
.pretty-leaderboard-table th:last-child, .pretty-leaderboard-table td:last-child {
|
| 83 |
-
border-top-right-radius: 16px;
|
| 84 |
-
}
|
| 85 |
-
|
| 86 |
-
/* Enhanced score bar styles */
|
| 87 |
-
.score-bar {
|
| 88 |
-
display: flex;
|
| 89 |
-
align-items: center;
|
| 90 |
-
gap: 12px;
|
| 91 |
-
width: 100%;
|
| 92 |
-
}
|
| 93 |
-
.score-bar-track {
|
| 94 |
-
flex-grow: 1;
|
| 95 |
-
height: 10px;
|
| 96 |
-
background: rgba(245, 246, 247, 0.12);
|
| 97 |
-
border-radius: 5px;
|
| 98 |
-
overflow: hidden;
|
| 99 |
-
max-width: 220px;
|
| 100 |
-
box-shadow: 0 1px 4px 0 rgba(16, 152, 247, 0.10);
|
| 101 |
-
}
|
| 102 |
-
.score-bar-fill {
|
| 103 |
-
height: 100%;
|
| 104 |
-
background: linear-gradient(90deg, #E35454, #1098F7);
|
| 105 |
-
border-radius: 5px;
|
| 106 |
-
transition: width 0.3s cubic-bezier(0.4,0,0.2,1);
|
| 107 |
-
}
|
| 108 |
-
.score-bar-value {
|
| 109 |
-
font-family: 'SF Mono', monospace;
|
| 110 |
-
font-weight: 600;
|
| 111 |
-
color: #F5F6F7;
|
| 112 |
-
min-width: 60px;
|
| 113 |
-
font-size: 14px;
|
| 114 |
-
}
|
| 115 |
-
|
| 116 |
-
body {
|
| 117 |
-
min-height: 100vh;
|
| 118 |
-
background: linear-gradient(135deg, #1a237e 0%, #311b92 100%);
|
| 119 |
-
background-image:
|
| 120 |
-
radial-gradient(rgba(255,255,255,0.12) 1.2px, transparent 1.2px),
|
| 121 |
-
radial-gradient(rgba(255,255,255,0.08) 1px, transparent 1px);
|
| 122 |
-
background-size: 40px 40px, 80px 80px;
|
| 123 |
-
background-position: 0 0, 20px 20px;
|
| 124 |
-
}
|
| 125 |
-
|
| 126 |
.markdown-text {
|
| 127 |
font-size: 16px !important;
|
| 128 |
}
|
|
@@ -145,15 +22,7 @@ body {
|
|
| 145 |
}
|
| 146 |
|
| 147 |
#leaderboard-table {
|
| 148 |
-
margin-top: 15px
|
| 149 |
-
/* Space-themed background */
|
| 150 |
-
background: linear-gradient(135deg, #1a237e 0%, #311b92 100%);
|
| 151 |
-
position: relative;
|
| 152 |
-
background-image:
|
| 153 |
-
radial-gradient(rgba(255,255,255,0.15) 1.2px, transparent 1.2px),
|
| 154 |
-
radial-gradient(rgba(255,255,255,0.10) 1px, transparent 1px);
|
| 155 |
-
background-size: 40px 40px, 80px 80px;
|
| 156 |
-
background-position: 0 0, 20px 20px;
|
| 157 |
}
|
| 158 |
|
| 159 |
#leaderboard-table-lite {
|
|
@@ -225,53 +94,6 @@ body {
|
|
| 225 |
#box-filter > .form{
|
| 226 |
border: 0
|
| 227 |
}
|
| 228 |
-
|
| 229 |
-
/* Model type and output form badge styles */
|
| 230 |
-
.badge {
|
| 231 |
-
display: inline-block;
|
| 232 |
-
border-radius: 12px;
|
| 233 |
-
padding: 2px 10px;
|
| 234 |
-
font-size: 0.85em;
|
| 235 |
-
font-weight: 700;
|
| 236 |
-
margin-left: 6px;
|
| 237 |
-
box-shadow: 0 1px 4px rgba(0,0,0,0.10);
|
| 238 |
-
vertical-align: middle;
|
| 239 |
-
}
|
| 240 |
-
.badge-open {
|
| 241 |
-
background: linear-gradient(90deg, #2196f3, #21cbf3);
|
| 242 |
-
color: #fff;
|
| 243 |
-
}
|
| 244 |
-
.badge-closed {
|
| 245 |
-
background: linear-gradient(90deg, #757575, #bdbdbd);
|
| 246 |
-
color: #fff;
|
| 247 |
-
}
|
| 248 |
-
.badge-normal {
|
| 249 |
-
background: linear-gradient(90deg, #43a047, #66bb6a);
|
| 250 |
-
color: #fff;
|
| 251 |
-
}
|
| 252 |
-
.badge-reasoning {
|
| 253 |
-
background: linear-gradient(90deg, #8e24aa, #d500f9);
|
| 254 |
-
color: #fff;
|
| 255 |
-
}
|
| 256 |
-
|
| 257 |
-
/* Sort button styles */
|
| 258 |
-
.sort-btn {
|
| 259 |
-
background: #23244a;
|
| 260 |
-
color: #F5F6F7;
|
| 261 |
-
border: 1px solid #1098F7;
|
| 262 |
-
border-radius: 6px;
|
| 263 |
-
font-size: 13px;
|
| 264 |
-
font-weight: 700;
|
| 265 |
-
margin-left: 4px;
|
| 266 |
-
margin-right: 2px;
|
| 267 |
-
padding: 2px 7px;
|
| 268 |
-
cursor: pointer;
|
| 269 |
-
transition: background 0.2s, color 0.2s;
|
| 270 |
-
}
|
| 271 |
-
.sort-btn:hover {
|
| 272 |
-
background: #1098F7;
|
| 273 |
-
color: #fff;
|
| 274 |
-
}
|
| 275 |
"""
|
| 276 |
|
| 277 |
get_window_url_params = """
|
|
@@ -281,105 +103,3 @@ get_window_url_params = """
|
|
| 281 |
return url_params;
|
| 282 |
}
|
| 283 |
"""
|
| 284 |
-
|
| 285 |
-
def get_rank_badge(rank: int) -> str:
|
| 286 |
-
"""
|
| 287 |
-
Returns HTML for a rank badge (1st, 2nd, 3rd) with appropriate styling.
|
| 288 |
-
"""
|
| 289 |
-
badge_styles = {
|
| 290 |
-
1: ("1st", "linear-gradient(145deg, #ffd700, #ffc400)", "#000"),
|
| 291 |
-
2: ("2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff"),
|
| 292 |
-
3: ("3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff"),
|
| 293 |
-
}
|
| 294 |
-
if rank in badge_styles:
|
| 295 |
-
label, gradient, text_color = badge_styles[rank]
|
| 296 |
-
return f'''
|
| 297 |
-
<div style="
|
| 298 |
-
display: inline-flex;
|
| 299 |
-
align-items: center;
|
| 300 |
-
justify-content: center;
|
| 301 |
-
min-width: 48px;
|
| 302 |
-
padding: 4px 12px;
|
| 303 |
-
background: {gradient};
|
| 304 |
-
color: {text_color};
|
| 305 |
-
border-radius: 6px;
|
| 306 |
-
font-weight: 700;
|
| 307 |
-
font-size: 1em;
|
| 308 |
-
box-shadow: 0 2px 4px rgba(0,0,0,0.18);
|
| 309 |
-
border: 1.5px solid #fff2;
|
| 310 |
-
">
|
| 311 |
-
{label}
|
| 312 |
-
</div>
|
| 313 |
-
'''
|
| 314 |
-
return f'''
|
| 315 |
-
<div style="
|
| 316 |
-
display: inline-flex;
|
| 317 |
-
align-items: center;
|
| 318 |
-
justify-content: center;
|
| 319 |
-
min-width: 28px;
|
| 320 |
-
color: #a1a1aa;
|
| 321 |
-
font-weight: 500;
|
| 322 |
-
">
|
| 323 |
-
{rank}
|
| 324 |
-
</div>
|
| 325 |
-
'''
|
| 326 |
-
|
| 327 |
-
def get_score_gauge(score: float, max_score: float = 1.0) -> str:
|
| 328 |
-
"""
|
| 329 |
-
Returns HTML for an overall score gauge (progress bar style).
|
| 330 |
-
"""
|
| 331 |
-
percent = min(max(score / max_score, 0), 1) * 100
|
| 332 |
-
return f'''
|
| 333 |
-
<div class="score-bar" style="margin: 0.5em 0;">
|
| 334 |
-
<div class="score-bar-track">
|
| 335 |
-
<div class="score-bar-fill" style="width: {percent}%;"></div>
|
| 336 |
-
</div>
|
| 337 |
-
<span class="score-bar-value">{score:.3f}</span>
|
| 338 |
-
</div>
|
| 339 |
-
'''
|
| 340 |
-
|
| 341 |
-
def get_leaderboard_table_html(df) -> str:
|
| 342 |
-
"""
|
| 343 |
-
Returns HTML for a pretty leaderboard table using badge and gauge.
|
| 344 |
-
Expects df to have columns: 'Model', 'Score', 'Model Type', 'Output Form'.
|
| 345 |
-
"""
|
| 346 |
-
def get_type_badge(model_type):
|
| 347 |
-
if model_type == "open":
|
| 348 |
-
return '<span class="badge badge-open">open</span>'
|
| 349 |
-
else:
|
| 350 |
-
return '<span class="badge badge-closed">closed</span>'
|
| 351 |
-
|
| 352 |
-
def get_output_badge(output_form):
|
| 353 |
-
if output_form == "reasoning":
|
| 354 |
-
return '<span class="badge badge-reasoning">reasoning</span>'
|
| 355 |
-
else:
|
| 356 |
-
return '<span class="badge badge-normal">normal</span>'
|
| 357 |
-
|
| 358 |
-
html = ['<table class="pretty-leaderboard-table">']
|
| 359 |
-
# Header
|
| 360 |
-
html.append(
|
| 361 |
-
"<thead><tr>"
|
| 362 |
-
"<th>Rank</th>"
|
| 363 |
-
"<th>Model</th>"
|
| 364 |
-
"<th>Overall Score</th>"
|
| 365 |
-
"</tr></thead>"
|
| 366 |
-
)
|
| 367 |
-
html.append("<tbody>")
|
| 368 |
-
for idx, row in enumerate(df.itertuples(index=False), 1):
|
| 369 |
-
model = getattr(row, "Model", "")
|
| 370 |
-
score = getattr(row, "Score", 0.0)
|
| 371 |
-
model_type = getattr(row, "Model_Type", getattr(row, "Model Type", "open"))
|
| 372 |
-
output_form = getattr(row, "Output_Form", getattr(row, "Output Form", "normal"))
|
| 373 |
-
badge = get_rank_badge(idx)
|
| 374 |
-
gauge = get_score_gauge(score)
|
| 375 |
-
type_badge = get_type_badge(model_type)
|
| 376 |
-
output_badge = get_output_badge(output_form)
|
| 377 |
-
html.append(
|
| 378 |
-
f"<tr>"
|
| 379 |
-
f"<td>{badge}</td>"
|
| 380 |
-
f"<td>{model} {type_badge} {output_badge}</td>"
|
| 381 |
-
f"<td>{gauge}</td>"
|
| 382 |
-
f"</tr>"
|
| 383 |
-
)
|
| 384 |
-
html.append("</tbody></table>")
|
| 385 |
-
return "\n".join(html)
|
|
|
|
| 1 |
custom_css = """
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
.markdown-text {
|
| 4 |
font-size: 16px !important;
|
| 5 |
}
|
|
|
|
| 22 |
}
|
| 23 |
|
| 24 |
#leaderboard-table {
|
| 25 |
+
margin-top: 15px
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
}
|
| 27 |
|
| 28 |
#leaderboard-table-lite {
|
|
|
|
| 94 |
#box-filter > .form{
|
| 95 |
border: 0
|
| 96 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
"""
|
| 98 |
|
| 99 |
get_window_url_params = """
|
|
|
|
| 103 |
return url_params;
|
| 104 |
}
|
| 105 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/display/formatting.py
CHANGED
|
@@ -25,128 +25,3 @@ def has_no_nan_values(df, columns):
|
|
| 25 |
|
| 26 |
def has_nan_values(df, columns):
|
| 27 |
return df[columns].isna().any(axis=1)
|
| 28 |
-
|
| 29 |
-
def get_score_bar(score):
|
| 30 |
-
"""
|
| 31 |
-
Generate HTML for a score bar with gradient styling.
|
| 32 |
-
Expects score in the range 0-100.
|
| 33 |
-
"""
|
| 34 |
-
width = max(0, min(score, 100)) # Clamp to [0, 100]
|
| 35 |
-
return f"""
|
| 36 |
-
<div class="score-bar">
|
| 37 |
-
<div class="score-bar-track">
|
| 38 |
-
<div class="score-bar-fill" style="width: {width}%;"></div>
|
| 39 |
-
</div>
|
| 40 |
-
<span class="score-bar-value">{score:.3f}</span>
|
| 41 |
-
</div>
|
| 42 |
-
"""
|
| 43 |
-
|
| 44 |
-
def render_leaderboard_html(df, overall_col="average"):
|
| 45 |
-
"""
|
| 46 |
-
Render a DataFrame as an HTML table, replacing the overall_col with a gauge bar.
|
| 47 |
-
"""
|
| 48 |
-
from .formatting import get_score_bar
|
| 49 |
-
from src.display.css_html_js import get_rank_badge
|
| 50 |
-
|
| 51 |
-
def get_type_badge(model_type):
|
| 52 |
-
if model_type == "open":
|
| 53 |
-
return '<span class="badge badge-open">open</span>'
|
| 54 |
-
else:
|
| 55 |
-
return '<span class="badge badge-closed">closed</span>'
|
| 56 |
-
|
| 57 |
-
def get_output_badge(output_form):
|
| 58 |
-
if output_form == "reasoning":
|
| 59 |
-
return '<span class="badge badge-reasoning">reasoning</span>'
|
| 60 |
-
else:
|
| 61 |
-
return '<span class="badge badge-normal">normal</span>'
|
| 62 |
-
|
| 63 |
-
# 숨길 컬럼
|
| 64 |
-
hidden_cols = ["Model", "Model Type", "Output Form", "Rank"]
|
| 65 |
-
|
| 66 |
-
# Build table header
|
| 67 |
-
def get_sort_arrow(col, sort_col, sort_asc):
|
| 68 |
-
# "Model Name", "Group" 컬럼을 제외한 모든 컬럼에 정렬 버튼 노출
|
| 69 |
-
if col in {"Model Name", "Group"}:
|
| 70 |
-
return ""
|
| 71 |
-
# 하나의 버튼(▲ 또는 ▼)만 노출, 클릭 시 asc가 반전됨
|
| 72 |
-
if col == sort_col:
|
| 73 |
-
# 현재 정렬 상태에 따라 아이콘과 data-asc를 반전
|
| 74 |
-
if sort_asc:
|
| 75 |
-
# 오름차순 상태: ▼ 아이콘, 클릭 시 내림차순
|
| 76 |
-
svg = (
|
| 77 |
-
'<svg width="14" height="14" viewBox="0 0 14 14" style="vertical-align:middle">'
|
| 78 |
-
'<polygon points="3,5 11,5 7,11" fill="currentColor"/></svg>'
|
| 79 |
-
)
|
| 80 |
-
return (
|
| 81 |
-
f'<span class="sort-arrow active" data-col="{col}" data-asc="false" aria-label="내림차순 정렬">{svg}</span>'
|
| 82 |
-
)
|
| 83 |
-
else:
|
| 84 |
-
# 내림차순 상태: ▲ 아이콘, 클릭 시 오름차순
|
| 85 |
-
svg = (
|
| 86 |
-
'<svg width="14" height="14" viewBox="0 0 14 14" style="vertical-align:middle">'
|
| 87 |
-
'<polygon points="7,3 11,9 3,9" fill="currentColor"/></svg>'
|
| 88 |
-
)
|
| 89 |
-
return (
|
| 90 |
-
f'<span class="sort-arrow active" data-col="{col}" data-asc="true" aria-label="오름차순 정렬">{svg}</span>'
|
| 91 |
-
)
|
| 92 |
-
else:
|
| 93 |
-
# 정렬 중이 아닌 컬럼: ▲(오름차순) 아이콘, 클릭 시 오름차순
|
| 94 |
-
svg = (
|
| 95 |
-
'<svg width="14" height="14" viewBox="0 0 14 14" style="vertical-align:middle">'
|
| 96 |
-
'<polygon points="7,3 11,9 3,9" fill="currentColor"/></svg>'
|
| 97 |
-
)
|
| 98 |
-
return (
|
| 99 |
-
f'<span class="sort-arrow" data-col="{col}" data-asc="true" aria-label="오름차순 정렬">{svg}</span>'
|
| 100 |
-
)
|
| 101 |
-
|
| 102 |
-
# 정렬 상태 추출 (State에서 전달받거나 기본값)
|
| 103 |
-
sort_col = getattr(df, "_sort_col", None) or (df.columns[0] if len(df.columns) > 0 else None)
|
| 104 |
-
sort_asc = getattr(df, "_sort_asc", None)
|
| 105 |
-
if sort_asc is None:
|
| 106 |
-
sort_asc = True
|
| 107 |
-
|
| 108 |
-
html = '<table class="pretty-leaderboard-table">\n<thead><tr>'
|
| 109 |
-
for col in df.columns:
|
| 110 |
-
if col in hidden_cols:
|
| 111 |
-
continue
|
| 112 |
-
html += f'<th>{col}{get_sort_arrow(col, sort_col, sort_asc)}</th>'
|
| 113 |
-
html += '</tr></thead>\n<tbody>\n'
|
| 114 |
-
|
| 115 |
-
# Build table rows
|
| 116 |
-
for idx, row in df.iterrows():
|
| 117 |
-
html += '<tr>'
|
| 118 |
-
for col in df.columns:
|
| 119 |
-
if col in hidden_cols:
|
| 120 |
-
continue
|
| 121 |
-
cell = row[col]
|
| 122 |
-
if col == overall_col:
|
| 123 |
-
try:
|
| 124 |
-
cell_html = get_score_bar(float(cell))
|
| 125 |
-
except Exception:
|
| 126 |
-
cell_html = str(cell)
|
| 127 |
-
html += f'<td>{cell_html}</td>'
|
| 128 |
-
elif col in ["Model Name"]:
|
| 129 |
-
# 1~3위 하이라이트 + 4등 이후 흰색 + 뱃지 항상 표시
|
| 130 |
-
rank = row.get("Rank", None)
|
| 131 |
-
model_type = row.get("Model Type", None) or row.get("Model_Type", None)
|
| 132 |
-
output_form = row.get("Output Form", None) or row.get("Output_Form", None)
|
| 133 |
-
highlight_style = ""
|
| 134 |
-
if rank == 1 or rank == "1":
|
| 135 |
-
highlight_style = "color: #ffd700; font-weight: bold; text-shadow: 0 0 4px #fff2;"
|
| 136 |
-
elif rank == 2 or rank == "2":
|
| 137 |
-
highlight_style = "color: #b0b0b0; font-weight: bold;"
|
| 138 |
-
elif rank == 3 or rank == "3":
|
| 139 |
-
highlight_style = "color: #cd7f32; font-weight: bold;"
|
| 140 |
-
else:
|
| 141 |
-
highlight_style = "color: #fff; font-weight: 600;"
|
| 142 |
-
badge_html = ""
|
| 143 |
-
if model_type:
|
| 144 |
-
badge_html += " " + get_type_badge(model_type)
|
| 145 |
-
if output_form:
|
| 146 |
-
badge_html += " " + get_output_badge(output_form)
|
| 147 |
-
html += f'<td><span style="{highlight_style}">{cell}</span>{badge_html}</td>'
|
| 148 |
-
else:
|
| 149 |
-
html += f'<td>{cell}</td>'
|
| 150 |
-
html += '</tr>\n'
|
| 151 |
-
html += '</tbody></table>'
|
| 152 |
-
return html
|
|
|
|
| 25 |
|
| 26 |
def has_nan_values(df, columns):
|
| 27 |
return df[columns].isna().any(axis=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/display/utils.py
CHANGED
|
@@ -21,26 +21,24 @@ class ColumnContent:
|
|
| 21 |
never_hidden: bool = False
|
| 22 |
|
| 23 |
## Leaderboard columns
|
| 24 |
-
from dataclasses import field
|
| 25 |
-
|
| 26 |
auto_eval_column_dict = []
|
| 27 |
# Init
|
| 28 |
-
auto_eval_column_dict.append(["model_type_symbol", ColumnContent,
|
| 29 |
-
auto_eval_column_dict.append(["model", ColumnContent,
|
| 30 |
-
#
|
| 31 |
-
auto_eval_column_dict.append(["average", ColumnContent,
|
| 32 |
for task in Tasks:
|
| 33 |
-
auto_eval_column_dict.append([task.name, ColumnContent,
|
| 34 |
# Model information
|
| 35 |
-
auto_eval_column_dict.append(["model_type", ColumnContent,
|
| 36 |
-
auto_eval_column_dict.append(["architecture", ColumnContent,
|
| 37 |
-
auto_eval_column_dict.append(["weight_type", ColumnContent,
|
| 38 |
-
auto_eval_column_dict.append(["precision", ColumnContent,
|
| 39 |
-
auto_eval_column_dict.append(["license", ColumnContent,
|
| 40 |
-
auto_eval_column_dict.append(["params", ColumnContent,
|
| 41 |
-
auto_eval_column_dict.append(["likes", ColumnContent,
|
| 42 |
-
auto_eval_column_dict.append(["still_on_hub", ColumnContent,
|
| 43 |
-
auto_eval_column_dict.append(["revision", ColumnContent,
|
| 44 |
|
| 45 |
# We use make dataclass to dynamically fill the scores from Tasks
|
| 46 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
|
@@ -93,8 +91,6 @@ class WeightType(Enum):
|
|
| 93 |
class Precision(Enum):
|
| 94 |
float16 = ModelDetails("float16")
|
| 95 |
bfloat16 = ModelDetails("bfloat16")
|
| 96 |
-
fp8 = ModelDetails("fp8")
|
| 97 |
-
int4 = ModelDetails("int4")
|
| 98 |
Unknown = ModelDetails("?")
|
| 99 |
|
| 100 |
def from_str(precision):
|
|
@@ -102,10 +98,6 @@ class Precision(Enum):
|
|
| 102 |
return Precision.float16
|
| 103 |
if precision in ["torch.bfloat16", "bfloat16"]:
|
| 104 |
return Precision.bfloat16
|
| 105 |
-
if precision == "fp8":
|
| 106 |
-
return Precision.fp8
|
| 107 |
-
if precision == "int4":
|
| 108 |
-
return Precision.int4
|
| 109 |
return Precision.Unknown
|
| 110 |
|
| 111 |
# Column selection
|
|
@@ -115,3 +107,4 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
|
| 115 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
| 116 |
|
| 117 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
|
|
|
|
|
| 21 |
never_hidden: bool = False
|
| 22 |
|
| 23 |
## Leaderboard columns
|
|
|
|
|
|
|
| 24 |
auto_eval_column_dict = []
|
| 25 |
# Init
|
| 26 |
+
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
| 27 |
+
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
| 28 |
+
#Scores
|
| 29 |
+
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
| 30 |
for task in Tasks:
|
| 31 |
+
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
| 32 |
# Model information
|
| 33 |
+
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
| 34 |
+
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
| 35 |
+
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
| 36 |
+
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
|
| 37 |
+
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
|
| 38 |
+
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
|
| 39 |
+
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
| 40 |
+
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
| 41 |
+
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
| 42 |
|
| 43 |
# We use make dataclass to dynamically fill the scores from Tasks
|
| 44 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
|
|
|
| 91 |
class Precision(Enum):
|
| 92 |
float16 = ModelDetails("float16")
|
| 93 |
bfloat16 = ModelDetails("bfloat16")
|
|
|
|
|
|
|
| 94 |
Unknown = ModelDetails("?")
|
| 95 |
|
| 96 |
def from_str(precision):
|
|
|
|
| 98 |
return Precision.float16
|
| 99 |
if precision in ["torch.bfloat16", "bfloat16"]:
|
| 100 |
return Precision.bfloat16
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
return Precision.Unknown
|
| 102 |
|
| 103 |
# Column selection
|
|
|
|
| 107 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
| 108 |
|
| 109 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
| 110 |
+
|
src/envs.py
CHANGED
|
@@ -6,10 +6,10 @@ from huggingface_hub import HfApi
|
|
| 6 |
# ----------------------------------
|
| 7 |
TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
|
| 8 |
|
| 9 |
-
OWNER = "
|
| 10 |
# ----------------------------------
|
| 11 |
|
| 12 |
-
REPO_ID = f"{OWNER}/
|
| 13 |
QUEUE_REPO = f"{OWNER}/requests"
|
| 14 |
RESULTS_REPO = f"{OWNER}/results"
|
| 15 |
|
|
|
|
| 6 |
# ----------------------------------
|
| 7 |
TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
|
| 8 |
|
| 9 |
+
OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
|
| 10 |
# ----------------------------------
|
| 11 |
|
| 12 |
+
REPO_ID = f"{OWNER}/leaderboard"
|
| 13 |
QUEUE_REPO = f"{OWNER}/requests"
|
| 14 |
RESULTS_REPO = f"{OWNER}/results"
|
| 15 |
|
src/submission/check_validity.py
CHANGED
|
@@ -88,13 +88,12 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
|
|
| 88 |
continue
|
| 89 |
with open(os.path.join(root, file), "r") as f:
|
| 90 |
info = json.load(f)
|
| 91 |
-
|
| 92 |
-
file_names.append(f"{info['benchmark']}_{info['model']}")
|
| 93 |
|
| 94 |
# Select organisation
|
| 95 |
if info["model"].count("/") == 0 or "submitted_time" not in info:
|
| 96 |
continue
|
| 97 |
organisation, _ = info["model"].split("/")
|
| 98 |
-
users_to_submission_dates[organisation].
|
| 99 |
|
| 100 |
return set(file_names), users_to_submission_dates
|
|
|
|
| 88 |
continue
|
| 89 |
with open(os.path.join(root, file), "r") as f:
|
| 90 |
info = json.load(f)
|
| 91 |
+
file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
|
|
|
|
| 92 |
|
| 93 |
# Select organisation
|
| 94 |
if info["model"].count("/") == 0 or "submitted_time" not in info:
|
| 95 |
continue
|
| 96 |
organisation, _ = info["model"].split("/")
|
| 97 |
+
users_to_submission_dates[organisation].append(info["submitted_time"])
|
| 98 |
|
| 99 |
return set(file_names), users_to_submission_dates
|
src/submission/submit.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
from datetime import datetime, timezone
|
| 4 |
-
|
| 5 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
| 6 |
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
|
| 7 |
from src.submission.check_validity import (
|
|
@@ -10,26 +10,17 @@ from src.submission.check_validity import (
|
|
| 10 |
get_model_size,
|
| 11 |
is_model_on_hub,
|
| 12 |
)
|
| 13 |
-
import gradio as gr
|
| 14 |
|
| 15 |
REQUESTED_MODELS = None
|
| 16 |
USERS_TO_SUBMISSION_DATES = None
|
| 17 |
|
| 18 |
-
def
|
| 19 |
-
benchmark: str,
|
| 20 |
model: str,
|
| 21 |
base_model: str,
|
| 22 |
revision: str,
|
| 23 |
precision: str,
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
top_k: str,
|
| 27 |
-
presence_penalty: str,
|
| 28 |
-
frequency_penalty: str,
|
| 29 |
-
repetition_penalty: str,
|
| 30 |
-
vllm_version: str,
|
| 31 |
-
user_state: str,
|
| 32 |
-
organization_list: list
|
| 33 |
):
|
| 34 |
global REQUESTED_MODELS
|
| 35 |
global USERS_TO_SUBMISSION_DATES
|
|
@@ -43,174 +34,25 @@ def add_new_eval_option1(
|
|
| 43 |
model_path = model.split("/")[1]
|
| 44 |
|
| 45 |
precision = precision.split(" ")[0]
|
| 46 |
-
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%
|
| 47 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
return styled_error("The submitter does not have submission rights for this model.")
|
| 51 |
-
|
| 52 |
-
# Does the organization submit more than three times in a day?
|
| 53 |
-
submission_times = [item['submitted_time'] for item in USERS_TO_SUBMISSION_DATES[user_name] if item['benchmark'] == benchmark]
|
| 54 |
-
submission_cnt = 0
|
| 55 |
-
for i in range(len(submission_times)):
|
| 56 |
-
hours_diff = (datetime.strptime(current_time, "%Y-%m-%dT%H:%M:%S %z") - datetime.strptime(submission_times[i], "%Y-%m-%dT%H:%M:%S %z")).total_seconds() / 3600
|
| 57 |
-
if hours_diff <= 24:
|
| 58 |
-
submission_cnt += 1
|
| 59 |
-
if submission_cnt > 3:
|
| 60 |
-
return styled_error("The organization already submitted three times for this benchmark today.")
|
| 61 |
|
| 62 |
# Does the model actually exist?
|
| 63 |
if revision == "":
|
| 64 |
revision = "main"
|
| 65 |
|
| 66 |
-
# Is the model
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
model_size = get_model_size(model_info=model_info, precision=precision)
|
| 73 |
-
|
| 74 |
-
# Were the model card and license filled?
|
| 75 |
-
try:
|
| 76 |
-
license = model_info.cardData["license"]
|
| 77 |
-
except Exception:
|
| 78 |
-
return styled_error("Please select a license for your model.")
|
| 79 |
-
|
| 80 |
-
modelcard_OK, error_msg = check_model_card(model)
|
| 81 |
-
if not modelcard_OK:
|
| 82 |
-
return styled_error(error_msg)
|
| 83 |
-
|
| 84 |
-
if temperature == "":
|
| 85 |
-
temperature = "1.0"
|
| 86 |
-
|
| 87 |
-
if top_p == "":
|
| 88 |
-
top_p = "1.0"
|
| 89 |
-
|
| 90 |
-
if top_k == "":
|
| 91 |
-
top_k = "-1"
|
| 92 |
-
|
| 93 |
-
if presence_penalty == "":
|
| 94 |
-
presence_penalty = "0.0"
|
| 95 |
-
|
| 96 |
-
if frequency_penalty == "":
|
| 97 |
-
frequency_penalty = "0.0"
|
| 98 |
-
|
| 99 |
-
if repetition_penalty == "":
|
| 100 |
-
repetition_penalty = "1.0"
|
| 101 |
-
|
| 102 |
-
# Seems good, creating the eval
|
| 103 |
-
print("Adding new eval")
|
| 104 |
-
|
| 105 |
-
eval_entry = {
|
| 106 |
-
"benchmark": benchmark,
|
| 107 |
-
"model": model,
|
| 108 |
-
"base_model": base_model,
|
| 109 |
-
"revision": revision,
|
| 110 |
-
"precision": precision,
|
| 111 |
-
"status": "PENDING",
|
| 112 |
-
"submitted_time": current_time,
|
| 113 |
-
"likes": model_info.likes,
|
| 114 |
-
"params": model_size,
|
| 115 |
-
"license": license,
|
| 116 |
-
"private": False,
|
| 117 |
-
"temperature": float(temperature),
|
| 118 |
-
"top_p": float(top_p),
|
| 119 |
-
"top_k": float(top_k),
|
| 120 |
-
"vllm_version": vllm_version,
|
| 121 |
-
"presence_penalty": float(presence_penalty),
|
| 122 |
-
"frequency_penalty": float(frequency_penalty),
|
| 123 |
-
"repetition_penalty": float(repetition_penalty),
|
| 124 |
-
"load_model_code": "None",
|
| 125 |
-
"inference_code": "None",
|
| 126 |
-
"termination_code": "None",
|
| 127 |
-
}
|
| 128 |
-
|
| 129 |
-
# Check for duplicate submission
|
| 130 |
-
submission_times = [item['submitted_time'] for item in USERS_TO_SUBMISSION_DATES[user_name] if item['benchmark'] == benchmark and item['model'] == model]
|
| 131 |
-
submission_cnt = 0
|
| 132 |
-
for i in range(len(submission_times)):
|
| 133 |
-
hours_diff = (datetime.strptime(current_time, "%Y-%m-%dT%H:%M:%S %z") - datetime.strptime(submission_times[i], "%Y-%m-%dT%H:%M:%S %z")).total_seconds() / 3600
|
| 134 |
-
if hours_diff <= 24:
|
| 135 |
-
submission_cnt += 1
|
| 136 |
-
if submission_cnt > 1:
|
| 137 |
-
return styled_warning("This model has been already submitted within 24 hours.")
|
| 138 |
-
|
| 139 |
-
print("Creating eval file")
|
| 140 |
-
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
|
| 141 |
-
os.makedirs(OUT_DIR, exist_ok=True)
|
| 142 |
-
out_path = f"{OUT_DIR}/{benchmark}_{model_path}_eval_request_False.json"
|
| 143 |
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
API.upload_file(
|
| 149 |
-
path_or_fileobj=out_path,
|
| 150 |
-
path_in_repo=out_path.split("eval-queue/")[1],
|
| 151 |
-
repo_id=QUEUE_REPO,
|
| 152 |
-
repo_type="dataset",
|
| 153 |
-
commit_message=f"Add {model} to eval queue",
|
| 154 |
-
)
|
| 155 |
-
|
| 156 |
-
# Remove the local file
|
| 157 |
-
os.remove(out_path)
|
| 158 |
-
|
| 159 |
-
return styled_message(
|
| 160 |
-
"Your request has been submitted to the evaluation queue!"
|
| 161 |
-
|
| 162 |
-
)
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
def add_new_eval_option2(
|
| 166 |
-
benchmark: str,
|
| 167 |
-
model: str,
|
| 168 |
-
base_model: str,
|
| 169 |
-
revision: str,
|
| 170 |
-
precision: str,
|
| 171 |
-
temperature: str,
|
| 172 |
-
top_p: str,
|
| 173 |
-
top_k: str,
|
| 174 |
-
presence_penalty: str,
|
| 175 |
-
frequency_penalty: str,
|
| 176 |
-
repetition_penalty: str,
|
| 177 |
-
load_model_code: str,
|
| 178 |
-
inference_code: str,
|
| 179 |
-
termination_code: str,
|
| 180 |
-
user_state: str,
|
| 181 |
-
organization_list: list
|
| 182 |
-
):
|
| 183 |
-
global REQUESTED_MODELS
|
| 184 |
-
global USERS_TO_SUBMISSION_DATES
|
| 185 |
-
if not REQUESTED_MODELS:
|
| 186 |
-
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
|
| 187 |
-
|
| 188 |
-
user_name = ""
|
| 189 |
-
model_path = model
|
| 190 |
-
if "/" in model:
|
| 191 |
-
user_name = model.split("/")[0]
|
| 192 |
-
model_path = model.split("/")[1]
|
| 193 |
-
|
| 194 |
-
precision = precision.split(" ")[0]
|
| 195 |
-
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S %z")
|
| 196 |
-
|
| 197 |
-
# Check submitter qualification
|
| 198 |
-
if user_name != user_state and user_name not in organization_list:
|
| 199 |
-
return styled_error("The submitter does not have submission rights for this model.")
|
| 200 |
-
|
| 201 |
-
# Does the organization submit more than three times in a day?
|
| 202 |
-
submission_times = [item['submitted_time'] for item in USERS_TO_SUBMISSION_DATES[user_name] if item['benchmark'] == benchmark]
|
| 203 |
-
submission_cnt = 0
|
| 204 |
-
for i in range(len(submission_times)):
|
| 205 |
-
hours_diff = (datetime.strptime(current_time, "%Y-%m-%dT%H:%M:%S %z") - datetime.strptime(submission_times[i], "%Y-%m-%dT%H:%M:%S %z")).total_seconds() / 3600
|
| 206 |
-
if hours_diff <= 24:
|
| 207 |
-
submission_cnt += 1
|
| 208 |
-
if submission_cnt > 3:
|
| 209 |
-
return styled_error("The organization already submitted three times for this benchmark today.")
|
| 210 |
-
|
| 211 |
-
# Does the model actually exist?
|
| 212 |
-
if revision == "":
|
| 213 |
-
revision = "main"
|
| 214 |
|
| 215 |
# Is the model info correctly filled?
|
| 216 |
try:
|
|
@@ -224,71 +66,38 @@ def add_new_eval_option2(
|
|
| 224 |
try:
|
| 225 |
license = model_info.cardData["license"]
|
| 226 |
except Exception:
|
| 227 |
-
return styled_error("Please select a license for your model
|
| 228 |
|
| 229 |
modelcard_OK, error_msg = check_model_card(model)
|
| 230 |
if not modelcard_OK:
|
| 231 |
return styled_error(error_msg)
|
| 232 |
|
| 233 |
-
if temperature == "":
|
| 234 |
-
temperature = "1.0"
|
| 235 |
-
|
| 236 |
-
if top_p == "":
|
| 237 |
-
top_p = "1.0"
|
| 238 |
-
|
| 239 |
-
if top_k == "":
|
| 240 |
-
top_k = "-1"
|
| 241 |
-
|
| 242 |
-
if presence_penalty == "":
|
| 243 |
-
presence_penalty = "0.0"
|
| 244 |
-
|
| 245 |
-
if frequency_penalty == "":
|
| 246 |
-
frequency_penalty = "0.0"
|
| 247 |
-
|
| 248 |
-
if repetition_penalty == "":
|
| 249 |
-
repetition_penalty = "1.0"
|
| 250 |
-
|
| 251 |
# Seems good, creating the eval
|
| 252 |
print("Adding new eval")
|
| 253 |
|
| 254 |
eval_entry = {
|
| 255 |
-
"benchmark": benchmark,
|
| 256 |
"model": model,
|
| 257 |
"base_model": base_model,
|
| 258 |
"revision": revision,
|
| 259 |
"precision": precision,
|
|
|
|
| 260 |
"status": "PENDING",
|
| 261 |
"submitted_time": current_time,
|
|
|
|
| 262 |
"likes": model_info.likes,
|
| 263 |
"params": model_size,
|
| 264 |
"license": license,
|
| 265 |
"private": False,
|
| 266 |
-
"temperature": float(temperature),
|
| 267 |
-
"top_p": float(top_p),
|
| 268 |
-
"top_k": float(top_k),
|
| 269 |
-
"vllm_version": "None",
|
| 270 |
-
"presence_penalty": float(presence_penalty),
|
| 271 |
-
"frequency_penalty": float(frequency_penalty),
|
| 272 |
-
"repetition_penalty": float(repetition_penalty),
|
| 273 |
-
"load_model_code": load_model_code,
|
| 274 |
-
"inference_code": inference_code,
|
| 275 |
-
"termination_code": termination_code
|
| 276 |
}
|
| 277 |
-
|
| 278 |
# Check for duplicate submission
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
for i in range(len(submission_times)):
|
| 282 |
-
hours_diff = (datetime.strptime(current_time, "%Y-%m-%dT%H:%M:%S %z") - datetime.strptime(submission_times[i], "%Y-%m-%dT%H:%M:%S %z")).total_seconds() / 3600
|
| 283 |
-
if hours_diff <= 24:
|
| 284 |
-
submission_cnt += 1
|
| 285 |
-
if submission_cnt > 1:
|
| 286 |
-
return styled_warning("This model has been already submitted within 24 hours.")
|
| 287 |
|
| 288 |
print("Creating eval file")
|
| 289 |
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
|
| 290 |
os.makedirs(OUT_DIR, exist_ok=True)
|
| 291 |
-
out_path = f"{OUT_DIR}/{
|
| 292 |
|
| 293 |
with open(out_path, "w") as f:
|
| 294 |
f.write(json.dumps(eval_entry))
|
|
@@ -306,5 +115,5 @@ def add_new_eval_option2(
|
|
| 306 |
os.remove(out_path)
|
| 307 |
|
| 308 |
return styled_message(
|
| 309 |
-
"Your request has been submitted to the evaluation queue!"
|
| 310 |
)
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
from datetime import datetime, timezone
|
| 4 |
+
|
| 5 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
| 6 |
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
|
| 7 |
from src.submission.check_validity import (
|
|
|
|
| 10 |
get_model_size,
|
| 11 |
is_model_on_hub,
|
| 12 |
)
|
|
|
|
| 13 |
|
| 14 |
REQUESTED_MODELS = None
|
| 15 |
USERS_TO_SUBMISSION_DATES = None
|
| 16 |
|
| 17 |
+
def add_new_eval(
|
|
|
|
| 18 |
model: str,
|
| 19 |
base_model: str,
|
| 20 |
revision: str,
|
| 21 |
precision: str,
|
| 22 |
+
weight_type: str,
|
| 23 |
+
model_type: str,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
):
|
| 25 |
global REQUESTED_MODELS
|
| 26 |
global USERS_TO_SUBMISSION_DATES
|
|
|
|
| 34 |
model_path = model.split("/")[1]
|
| 35 |
|
| 36 |
precision = precision.split(" ")[0]
|
| 37 |
+
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
| 38 |
|
| 39 |
+
if model_type is None or model_type == "":
|
| 40 |
+
return styled_error("Please select a model type.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
# Does the model actually exist?
|
| 43 |
if revision == "":
|
| 44 |
revision = "main"
|
| 45 |
|
| 46 |
+
# Is the model on the hub?
|
| 47 |
+
if weight_type in ["Delta", "Adapter"]:
|
| 48 |
+
base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
|
| 49 |
+
if not base_model_on_hub:
|
| 50 |
+
return styled_error(f'Base model "{base_model}" {error}')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
+
if not weight_type == "Adapter":
|
| 53 |
+
model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
|
| 54 |
+
if not model_on_hub:
|
| 55 |
+
return styled_error(f'Model "{model}" {error}')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
# Is the model info correctly filled?
|
| 58 |
try:
|
|
|
|
| 66 |
try:
|
| 67 |
license = model_info.cardData["license"]
|
| 68 |
except Exception:
|
| 69 |
+
return styled_error("Please select a license for your model")
|
| 70 |
|
| 71 |
modelcard_OK, error_msg = check_model_card(model)
|
| 72 |
if not modelcard_OK:
|
| 73 |
return styled_error(error_msg)
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
# Seems good, creating the eval
|
| 76 |
print("Adding new eval")
|
| 77 |
|
| 78 |
eval_entry = {
|
|
|
|
| 79 |
"model": model,
|
| 80 |
"base_model": base_model,
|
| 81 |
"revision": revision,
|
| 82 |
"precision": precision,
|
| 83 |
+
"weight_type": weight_type,
|
| 84 |
"status": "PENDING",
|
| 85 |
"submitted_time": current_time,
|
| 86 |
+
"model_type": model_type,
|
| 87 |
"likes": model_info.likes,
|
| 88 |
"params": model_size,
|
| 89 |
"license": license,
|
| 90 |
"private": False,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
}
|
| 92 |
+
|
| 93 |
# Check for duplicate submission
|
| 94 |
+
if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
|
| 95 |
+
return styled_warning("This model has been already submitted.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
print("Creating eval file")
|
| 98 |
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
|
| 99 |
os.makedirs(OUT_DIR, exist_ok=True)
|
| 100 |
+
out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
|
| 101 |
|
| 102 |
with open(out_path, "w") as f:
|
| 103 |
f.write(json.dumps(eval_entry))
|
|
|
|
| 115 |
os.remove(out_path)
|
| 116 |
|
| 117 |
return styled_message(
|
| 118 |
+
"Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
|
| 119 |
)
|
ui.py
DELETED
|
@@ -1,228 +0,0 @@
|
|
| 1 |
-
import gradio as gr
|
| 2 |
-
import src.config as configs
|
| 3 |
-
from constants import TAB_NAMES, MODEL_TYPE_MAP, OUTPUT_FORM_MAP
|
| 4 |
-
from src.display.formatting import render_leaderboard_html
|
| 5 |
-
from src.display.css_html_js import get_leaderboard_table_html, custom_css
|
| 6 |
-
import pandas as pd
|
| 7 |
-
from constants import LEADERBOARD_REQUIRED_COLUMNS
|
| 8 |
-
|
| 9 |
-
def render_pretty_leaderboard_html(df):
|
| 10 |
-
"""
|
| 11 |
-
Renders a pretty leaderboard table using badge and gauge.
|
| 12 |
-
Supports both ['Model', 'Score'] and ['Model Name', 'Overall'] columns.
|
| 13 |
-
Sorts by score descending and rounds for display.
|
| 14 |
-
"""
|
| 15 |
-
# Flexible column mapping
|
| 16 |
-
col_map = {}
|
| 17 |
-
if "Model" in df.columns:
|
| 18 |
-
col_map["Model"] = "Model"
|
| 19 |
-
elif "Model Name" in df.columns:
|
| 20 |
-
col_map["Model"] = "Model Name"
|
| 21 |
-
else:
|
| 22 |
-
return "<div style='color:red'>DataFrame must have a 'Model' or 'Model Name' column.</div>"
|
| 23 |
-
if "Score" in df.columns:
|
| 24 |
-
col_map["Score"] = "Score"
|
| 25 |
-
elif "Overall" in df.columns:
|
| 26 |
-
col_map["Score"] = "Overall"
|
| 27 |
-
else:
|
| 28 |
-
return "<div style='color:red'>DataFrame must have a 'Score' or 'Overall' column.</div>"
|
| 29 |
-
|
| 30 |
-
# Example mappings for demonstration (expand as needed)
|
| 31 |
-
model_type_map = MODEL_TYPE_MAP
|
| 32 |
-
output_form_map = OUTPUT_FORM_MAP
|
| 33 |
-
|
| 34 |
-
# Copy and rename for uniformity
|
| 35 |
-
df2 = df.copy()
|
| 36 |
-
df2 = df2.rename(columns={col_map["Model"]: "Model", col_map["Score"]: "Score"})
|
| 37 |
-
|
| 38 |
-
# 매핑 전후로 누락된 모델명을 출력 (디버깅용)
|
| 39 |
-
missing_type = set(df2["Model"]) - set(model_type_map.keys())
|
| 40 |
-
missing_output = set(df2["Model"]) - set(output_form_map.keys())
|
| 41 |
-
if missing_type:
|
| 42 |
-
print("Model Type 매핑 누락:", missing_type)
|
| 43 |
-
if missing_output:
|
| 44 |
-
print("Output Form 매핑 누락:", missing_output)
|
| 45 |
-
|
| 46 |
-
# Add badge columns
|
| 47 |
-
df2["Model Type"] = df2["Model"].map(model_type_map).fillna("open")
|
| 48 |
-
df2["Output Form"] = df2["Model"].map(output_form_map).fillna("normal")
|
| 49 |
-
# Drop NA, sort, round
|
| 50 |
-
df2 = df2[["Model", "Score", "Model Type", "Output Form"]].dropna()
|
| 51 |
-
df2["Score"] = pd.to_numeric(df2["Score"], errors="coerce").round(2)
|
| 52 |
-
df2 = df2.sort_values("Score", ascending=False).reset_index(drop=True)
|
| 53 |
-
|
| 54 |
-
return get_leaderboard_table_html(df2)
|
| 55 |
-
|
| 56 |
-
def create_leaderboard_tab(df, key, search_leaderboard, update_modelselector_group, update_leaderboard, column_selector_value):
|
| 57 |
-
"""
|
| 58 |
-
df: DataFrame to display
|
| 59 |
-
key: "Category" or "Language"
|
| 60 |
-
search_leaderboard, update_modelselector_group, update_leaderboard: handler functions
|
| 61 |
-
column_selector_value: default columns to select
|
| 62 |
-
"""
|
| 63 |
-
with gr.TabItem(
|
| 64 |
-
TAB_NAMES[key],
|
| 65 |
-
visible=True
|
| 66 |
-
):
|
| 67 |
-
df_state = gr.State(df)
|
| 68 |
-
|
| 69 |
-
with gr.Row():
|
| 70 |
-
with gr.Column():
|
| 71 |
-
search_box = gr.Textbox(label="Search Model by Name")
|
| 72 |
-
group_list = df["Group"].unique().tolist()
|
| 73 |
-
group_selector = gr.CheckboxGroup(
|
| 74 |
-
choices=df["Group"].unique().tolist(),
|
| 75 |
-
value=group_list,
|
| 76 |
-
label="Select Model Group"
|
| 77 |
-
)
|
| 78 |
-
# 필수 컬럼 항상 포함, 체크 해제 불가(disabled)
|
| 79 |
-
# 선택지에서 "Model Name", "Group", "Overall" 제외
|
| 80 |
-
exclude_cols = {"Model Name", "Group", "Overall"}
|
| 81 |
-
selectable_columns = [col for col in df.columns.tolist()[3:] if col not in exclude_cols]
|
| 82 |
-
all_columns = list(dict.fromkeys(LEADERBOARD_REQUIRED_COLUMNS + selectable_columns))
|
| 83 |
-
column_selector = gr.CheckboxGroup(
|
| 84 |
-
choices=selectable_columns,
|
| 85 |
-
value=[col for col in column_selector_value if col in selectable_columns],
|
| 86 |
-
label="Select Columns"
|
| 87 |
-
)
|
| 88 |
-
|
| 89 |
-
with gr.Column():
|
| 90 |
-
with gr.Accordion("Model List", open=False):
|
| 91 |
-
model_group = df["Model Name"].tolist()
|
| 92 |
-
model_selector = gr.CheckboxGroup(
|
| 93 |
-
choices=df["Model Name"].tolist(),
|
| 94 |
-
value=model_group,
|
| 95 |
-
label="Select Models"
|
| 96 |
-
)
|
| 97 |
-
|
| 98 |
-
# badge 정보 포함 DataFrame 생성 (위쪽 테이블용)
|
| 99 |
-
df_badge = df.copy()
|
| 100 |
-
# Model 컬럼명 통일
|
| 101 |
-
if "Model Name" in df_badge.columns:
|
| 102 |
-
df_badge["Model"] = df_badge["Model Name"]
|
| 103 |
-
# 예시 매핑 (아래쪽과 동일하게 확장)
|
| 104 |
-
model_type_map = MODEL_TYPE_MAP
|
| 105 |
-
output_form_map = OUTPUT_FORM_MAP
|
| 106 |
-
df_badge["Model Type"] = df_badge["Model"].map(model_type_map).fillna("open")
|
| 107 |
-
df_badge["Output Form"] = df_badge["Model"].map(output_form_map).fillna("normal")
|
| 108 |
-
df_badge = df_badge.sort_values("Overall" if "Overall" in df_badge.columns else "Score", ascending=False).reset_index(drop=True)
|
| 109 |
-
df_badge["Rank"] = df_badge.index + 1
|
| 110 |
-
|
| 111 |
-
# 정렬 상태 관리용 State (한 번만 생성, 이후 재사용)
|
| 112 |
-
default_sort_col = "Overall" if "Overall" in df_badge.columns else "Score"
|
| 113 |
-
sort_col_state = gr.State(default_sort_col)
|
| 114 |
-
sort_asc_state = gr.State(False) # 내림차순이 기본값
|
| 115 |
-
|
| 116 |
-
# 정렬 함수 (JS에서 넘긴 asc 값을 그대로 사용)
|
| 117 |
-
def sort_and_render(col, asc, models, columns, df_):
|
| 118 |
-
print(f"[sort_and_render] called: col={col}, asc={asc}, models={models}, columns={columns}")
|
| 119 |
-
filtered_df = update_leaderboard(models, columns, df_, col, asc)
|
| 120 |
-
# 정렬 상태를 DataFrame에 임시로 저장해 헤더에 반영
|
| 121 |
-
filtered_df._sort_col = col
|
| 122 |
-
filtered_df._sort_asc = asc
|
| 123 |
-
return render_leaderboard_html(filtered_df.round(3)), col, asc
|
| 124 |
-
|
| 125 |
-
leaderboard_html = render_leaderboard_html(df_badge.round(3))
|
| 126 |
-
leaderboard_html_comp = gr.HTML(value=leaderboard_html, elem_id="leaderboard-table")
|
| 127 |
-
|
| 128 |
-
# 정렬 트리거용 hidden textbox 추가
|
| 129 |
-
sort_trigger = gr.Textbox(visible=False, elem_id="sort-leaderboard-trigger")
|
| 130 |
-
|
| 131 |
-
# sort-arrow 클릭 시 항상 새로운 값으로 value를 변경하는 JS 삽입 (정렬 방향 포함)
|
| 132 |
-
sort_js = """
|
| 133 |
-
<script>
|
| 134 |
-
(function() {
|
| 135 |
-
document.addEventListener('DOMContentLoaded', function() {
|
| 136 |
-
const table = document.getElementById('leaderboard-table');
|
| 137 |
-
if (!table) return;
|
| 138 |
-
table.addEventListener('click', function(e) {
|
| 139 |
-
const arrow = e.target.closest('.sort-arrow');
|
| 140 |
-
if (arrow) {
|
| 141 |
-
const col = arrow.getAttribute('data-col');
|
| 142 |
-
const asc = arrow.getAttribute('data-asc');
|
| 143 |
-
// 항상 새로운 값으로 value를 변경하여 change 이벤트 강제 발생
|
| 144 |
-
const trigger = document.querySelector('#sort-leaderboard-trigger input');
|
| 145 |
-
if (trigger) {
|
| 146 |
-
trigger.value = col + '|' + asc + '|' + Date.now();
|
| 147 |
-
trigger.dispatchEvent(new Event('input', { bubbles: true }));
|
| 148 |
-
trigger.dispatchEvent(new Event('change', { bubbles: true }));
|
| 149 |
-
}
|
| 150 |
-
}
|
| 151 |
-
});
|
| 152 |
-
});
|
| 153 |
-
})();
|
| 154 |
-
</script>
|
| 155 |
-
"""
|
| 156 |
-
# 정렬 버튼 클릭 시에도 update_leaderboard를 호출하도록 wiring
|
| 157 |
-
def sort_trigger_change(col_val, models, columns, df_, prev_col, prev_asc):
|
| 158 |
-
print(f"[sort_trigger.change] col_val={col_val}, prev_col={prev_col}, prev_asc={prev_asc}")
|
| 159 |
-
col, asc = col_val.split('|')[0], col_val.split('|')[1].lower() == "true"
|
| 160 |
-
return sort_and_render(col, asc, models, columns, df_)
|
| 161 |
-
|
| 162 |
-
sort_trigger.change(
|
| 163 |
-
fn=sort_trigger_change,
|
| 164 |
-
inputs=[sort_trigger, model_selector, column_selector, df_state, sort_col_state, sort_asc_state],
|
| 165 |
-
outputs=[leaderboard_html_comp, sort_col_state, sort_asc_state]
|
| 166 |
-
)
|
| 167 |
-
|
| 168 |
-
# 커스텀 JS를 상단 테이블에 삽입
|
| 169 |
-
leaderboard_html_comp.style = None # gr.HTML에는 style 파라미터가 없으므로, 아래에서 삽입
|
| 170 |
-
leaderboard_html_comp.value += sort_js
|
| 171 |
-
|
| 172 |
-
# Pretty leaderboard preview (uses only 'Model' and 'Score' columns)
|
| 173 |
-
pretty_html = gr.HTML(value=render_pretty_leaderboard_html(df.round(3)))
|
| 174 |
-
|
| 175 |
-
# Define change functions for user interaction
|
| 176 |
-
# 모든 UI 이벤트에서 update_leaderboard → sort_and_render → render_leaderboard_html 순으로 갱신
|
| 177 |
-
def filter_and_sort_search(query, df, sort_col, sort_asc):
|
| 178 |
-
print(f"[filter_and_sort_search] sort_col={sort_col}, sort_asc={sort_asc}")
|
| 179 |
-
filtered_df = search_leaderboard(query, df, sort_col, sort_asc)
|
| 180 |
-
# 정렬 상태를 DataFrame에 임시로 저장해 헤더에 반영
|
| 181 |
-
filtered_df._sort_col = sort_col
|
| 182 |
-
filtered_df._sort_asc = sort_asc
|
| 183 |
-
return render_leaderboard_html(filtered_df), sort_col, sort_asc
|
| 184 |
-
|
| 185 |
-
def filter_and_sort_model(models, columns, df, sort_col, sort_asc):
|
| 186 |
-
print(f"[filter_and_sort_model] sort_col={sort_col}, sort_asc={sort_asc}")
|
| 187 |
-
filtered_df = update_leaderboard(models, columns, df, sort_col, sort_asc)
|
| 188 |
-
filtered_df._sort_col = sort_col
|
| 189 |
-
filtered_df._sort_asc = sort_asc
|
| 190 |
-
return render_leaderboard_html(filtered_df), sort_col, sort_asc
|
| 191 |
-
|
| 192 |
-
def filter_and_sort_column(models, columns, df, sort_col, sort_asc):
|
| 193 |
-
print(f"[filter_and_sort_column] sort_col={sort_col}, sort_asc={sort_asc}")
|
| 194 |
-
filtered_df = update_leaderboard(models, columns, df, sort_col, sort_asc)
|
| 195 |
-
filtered_df._sort_col = sort_col
|
| 196 |
-
filtered_df._sort_asc = sort_asc
|
| 197 |
-
return render_leaderboard_html(filtered_df), sort_col, sort_asc
|
| 198 |
-
|
| 199 |
-
search_box.change(
|
| 200 |
-
fn=filter_and_sort_search,
|
| 201 |
-
inputs=[search_box, df_state, sort_col_state, sort_asc_state],
|
| 202 |
-
outputs=[leaderboard_html_comp, sort_col_state, sort_asc_state]
|
| 203 |
-
)
|
| 204 |
-
|
| 205 |
-
group_selector.change(fn=update_modelselector_group, inputs=[group_selector, df_state], outputs=model_selector)
|
| 206 |
-
model_selector.change(
|
| 207 |
-
fn=filter_and_sort_model,
|
| 208 |
-
inputs=[model_selector, column_selector, df_state, sort_col_state, sort_asc_state],
|
| 209 |
-
outputs=[leaderboard_html_comp, sort_col_state, sort_asc_state]
|
| 210 |
-
)
|
| 211 |
-
|
| 212 |
-
# column_selector 변경 시에도 항상 최신 sort_col, sort_asc를 유지
|
| 213 |
-
column_selector.change(
|
| 214 |
-
fn=filter_and_sort_column,
|
| 215 |
-
inputs=[model_selector, column_selector, df_state, sort_col_state, sort_asc_state],
|
| 216 |
-
outputs=[leaderboard_html_comp, sort_col_state, sort_asc_state]
|
| 217 |
-
)
|
| 218 |
-
|
| 219 |
-
return {
|
| 220 |
-
"search_box": search_box,
|
| 221 |
-
"group_selector": group_selector,
|
| 222 |
-
"column_selector": column_selector,
|
| 223 |
-
"model_selector": model_selector,
|
| 224 |
-
"leaderboard_html_comp": leaderboard_html_comp,
|
| 225 |
-
"sort_trigger": sort_trigger,
|
| 226 |
-
"df_state": df_state,
|
| 227 |
-
"pretty_html": pretty_html
|
| 228 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils.py
DELETED
|
@@ -1,42 +0,0 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
import gradio as gr
|
| 4 |
-
from huggingface_hub import whoami
|
| 5 |
-
|
| 6 |
-
def get_profile(profile: gr.OAuthProfile | None) -> str:
|
| 7 |
-
if profile is None:
|
| 8 |
-
return "Anonymous"
|
| 9 |
-
return profile.username
|
| 10 |
-
|
| 11 |
-
def get_organizations(oauth_token: gr.OAuthToken | None) -> str:
|
| 12 |
-
if oauth_token is None:
|
| 13 |
-
return "No Organization"
|
| 14 |
-
org_names = [org["name"] for org in whoami(oauth_token.token)["orgs"]]
|
| 15 |
-
return org_names
|
| 16 |
-
|
| 17 |
-
def get_profile_and_organizations(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> tuple[str, str]:
|
| 18 |
-
if profile is None:
|
| 19 |
-
output_profile = "Anonymous"
|
| 20 |
-
else:
|
| 21 |
-
output_profile = profile.username
|
| 22 |
-
|
| 23 |
-
if oauth_token is None:
|
| 24 |
-
output_org = "No Organization"
|
| 25 |
-
else:
|
| 26 |
-
output_org = [org["name"] for org in whoami(oauth_token.token)["orgs"]]
|
| 27 |
-
|
| 28 |
-
return output_profile, output_org
|
| 29 |
-
|
| 30 |
-
def download_with_restart(snapshot_download_func, repo_id, local_dir, repo_type, token, restart_func):
|
| 31 |
-
try:
|
| 32 |
-
print(local_dir)
|
| 33 |
-
snapshot_download_func(
|
| 34 |
-
repo_id=repo_id,
|
| 35 |
-
local_dir=local_dir,
|
| 36 |
-
repo_type=repo_type,
|
| 37 |
-
tqdm_class=None,
|
| 38 |
-
etag_timeout=30,
|
| 39 |
-
token=token
|
| 40 |
-
)
|
| 41 |
-
except Exception:
|
| 42 |
-
restart_func()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|