Terry Zhuo committed · 2e84cf2
Parent(s): b65f8f6

update

Files changed:
- app.py +76 -109
- src/display/utils.py +2 -2
- src/tools/plots.py +12 -12
app.py
CHANGED

@@ -38,8 +38,8 @@ from src.envs import (
     DATA_VERSION,
     DATA_REPO,
     HARD_RESULT_REPO,
-    ELO_REPO,
-    HARD_ELO_REPO,
+    # ELO_REPO,  # Comment out
+    # HARD_ELO_REPO,  # Comment out
     SOLVE_REPO,
     HARD_SOLVE_REPO,
     HF_TOKEN,
@@ -51,7 +51,7 @@ from src.envs import (
 )
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.execute import generate_command, default_command, stream_logs, find_result_file
-from src.tools.plots import plot_elo_mle, plot_solve_rate
+from src.tools.plots import plot_solve_rate
 # from src.voting.vote_system import VoteManager, run_scheduler
 
 # Configure logging
@@ -66,10 +66,10 @@ DO_FULL_INIT = True  # os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
 NEW_DATA_ON_LEADERBOARD = True
 LEADERBOARD_DF = None
 HARD_LEADERBOARD_DF = None
-ELO_TASK_DF = None
-ELO_BENCH_DF = None
-HARD_ELO_TASK_DF = None
-HARD_ELO_BENCH_DF = None
+# ELO_TASK_DF = None  # Comment out
+# ELO_BENCH_DF = None  # Comment out
+# HARD_ELO_TASK_DF = None  # Comment out
+# HARD_ELO_BENCH_DF = None  # Comment out
 COMPLETE_SOLVE_DF = None
 INSTRUCT_SOLVE_DF = None
 HARD_COMPLETE_SOLVE_DF = None
@@ -154,10 +154,10 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
 def get_latest_data_leaderboard(
     leaderboard_initial_df = None,
     hard_leaderboard_initial_df = None,
-    elo_task_df = None,
-    elo_bench_df = None,
-    hard_elo_task_df = None,
-    hard_elo_bench_df = None,
+    # elo_task_df = None,  # Comment out
+    # elo_bench_df = None,  # Comment out
+    # hard_elo_task_df = None,  # Comment out
+    # hard_elo_bench_df = None,  # Comment out
     complete_solve_df = None,
     instruct_solve_df = None,
     hard_complete_solve_df = None,
@@ -166,10 +166,10 @@ def get_latest_data_leaderboard(
     global NEW_DATA_ON_LEADERBOARD
     global LEADERBOARD_DF
     global HARD_LEADERBOARD_DF
-    global ELO_TASK_DF
-    global ELO_BENCH_DF
-    global HARD_ELO_TASK_DF
-    global HARD_ELO_BENCH_DF
+    # global ELO_TASK_DF  # Comment out
+    # global ELO_BENCH_DF  # Comment out
+    # global HARD_ELO_TASK_DF  # Comment out
+    # global HARD_ELO_BENCH_DF  # Comment out
     global COMPLETE_SOLVE_DF
     global INSTRUCT_SOLVE_DF
     global HARD_COMPLETE_SOLVE_DF
@@ -182,7 +182,7 @@ def get_latest_data_leaderboard(
             "default",
             split="train",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             verification_mode="no_checks"
         ).filter(lambda x: x['complete'] is not None or x['instruct'] is not None)
         LEADERBOARD_DF = get_leaderboard_df(
@@ -194,7 +194,7 @@ def get_latest_data_leaderboard(
             "default",
             split="train",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             verification_mode="no_checks"
         ).filter(lambda x: x['complete'] is not None or x['instruct'] is not None)
         hard_leaderboard_df = get_leaderboard_df(
@@ -202,51 +202,23 @@ def get_latest_data_leaderboard(
             cols=COLS,
         )
         HARD_LEADERBOARD_DF = hard_leaderboard_df
-
-        elo_task_df = datasets.load_dataset(
-            ELO_REPO,
-            "default",
-            split="task_no_tie",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        elo_bench_df = datasets.load_dataset(
-            ELO_REPO,
-            "default",
-            split="benchmark_tie",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        ELO_TASK_DF = elo_task_df
-        ELO_BENCH_DF = elo_bench_df
-
-        hard_elo_task_df = datasets.load_dataset(
-            HARD_ELO_REPO,
-            "default",
-            split="task_no_tie",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        hard_elo_bench_df = datasets.load_dataset(
-            HARD_ELO_REPO,
-            "default",
-            split="benchmark_tie",
-            cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
-            verification_mode="no_checks"
-        ).to_pandas()
-        HARD_ELO_TASK_DF = hard_elo_task_df
-        HARD_ELO_BENCH_DF = hard_elo_bench_df
+
+        # Comment out Elo dataset loading
+        # elo_task_df = datasets.load_dataset(...)
+        # elo_bench_df = datasets.load_dataset(...)
+        # ELO_TASK_DF = elo_task_df
+        # ELO_BENCH_DF = elo_bench_df
+        # hard_elo_task_df = datasets.load_dataset(...)
+        # hard_elo_bench_df = datasets.load_dataset(...)
+        # HARD_ELO_TASK_DF = hard_elo_task_df
+        # HARD_ELO_BENCH_DF = hard_elo_bench_df
 
         complete_solve_df = datasets.load_dataset(
             SOLVE_REPO,
             "default",
             split="complete",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             verification_mode="no_checks"
         ).to_pandas()
         instruct_solve_df = datasets.load_dataset(
@@ -254,7 +226,7 @@ def get_latest_data_leaderboard(
             "default",
             split="instruct",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             verification_mode="no_checks"
         ).to_pandas()
         COMPLETE_SOLVE_DF = complete_solve_df
@@ -265,7 +237,7 @@ def get_latest_data_leaderboard(
             "default",
             split="complete",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             verification_mode="no_checks"
         ).to_pandas()
         hard_instruct_solve_df = datasets.load_dataset(
@@ -273,7 +245,7 @@ def get_latest_data_leaderboard(
             "default",
             split="instruct",
             cache_dir=HF_HOME,
-            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
             verification_mode="no_checks"
         ).to_pandas()
         HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
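
Note on the pattern the hunks above keep touching: every table in this Space is pulled with the same `datasets.load_dataset(...)` shape and converted to pandas. A minimal, self-contained sketch of that call (the repo id below is a placeholder for illustration, not one of the Space's configured repos such as SOLVE_REPO):

```python
import datasets

# Placeholder repo id; the real ids come from src.envs.
df = datasets.load_dataset(
    "user/some-results-repo",
    "default",
    split="complete",
    download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # reuse the local cache when present
    verification_mode="no_checks",  # skip split/checksum verification for speed
).to_pandas()
print(df.shape)
```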

@@ -283,18 +255,17 @@ def get_latest_data_leaderboard(
 
     else:
         LEADERBOARD_DF = leaderboard_initial_df
-
-        ELO_TASK_DF = elo_task_df
-        # ELO_BENCH_DF = elo_bench_df
-        # HARD_ELO_TASK_DF = hard_elo_task_df
-        HARD_ELO_BENCH_DF = hard_elo_bench_df
+        HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
+        # ELO_TASK_DF = elo_task_df  # Comment out
+        # ELO_BENCH_DF = elo_bench_df  # Comment out
+        # HARD_ELO_TASK_DF = hard_elo_task_df  # Comment out
+        # HARD_ELO_BENCH_DF = hard_elo_bench_df  # Comment out
         COMPLETE_SOLVE_DF = complete_solve_df
-
-
+        INSTRUCT_SOLVE_DF = instruct_solve_df
+        HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
         HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
 
-    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF,
-    # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
 
 
 def init_space():
@@ -303,19 +274,19 @@ def init_space():
     # Always redownload the leaderboard DataFrame
     global LEADERBOARD_DF
     global HARD_LEADERBOARD_DF
-    global ELO_TASK_DF
-    global ELO_BENCH_DF
-    global HARD_ELO_TASK_DF
-    global HARD_ELO_BENCH_DF
+    # global ELO_TASK_DF  # Comment out
+    # global ELO_BENCH_DF  # Comment out
+    # global HARD_ELO_TASK_DF  # Comment out
+    # global HARD_ELO_BENCH_DF  # Comment out
    global COMPLETE_SOLVE_DF
     global INSTRUCT_SOLVE_DF
     global HARD_COMPLETE_SOLVE_DF
     global HARD_INSTRUCT_SOLVE_DF
 
-    LEADERBOARD_DF, HARD_LEADERBOARD_DF,
+    LEADERBOARD_DF, HARD_LEADERBOARD_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
     # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
 
-    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF,
+    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
     # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
 
 # Initialize VoteManager
@@ -331,10 +302,7 @@ def init_space():
 
 # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
 # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
-LEADERBOARD_DF, HARD_LEADERBOARD_DF,
-    ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, \
-    COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, \
-    HARD_INSTRUCT_SOLVE_DF = init_space()
+LEADERBOARD_DF, HARD_LEADERBOARD_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = init_space()
 # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = init_space()
 
 # Data processing for plots now only on demand in the respective Gradio tab
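
The hunks above shrink the tuples returned by `get_latest_data_leaderboard` and `init_space` from ten values to six, matching the six components wired into `main_block.load(...)` at the bottom of the file. Gradio requires the function's return arity to equal the length of the `outputs` list; a toy sketch of that contract (component names here are illustrative, not the Space's):

```python
import gradio as gr
import pandas as pd

def refresh():
    # One return value per output component, in order.
    left = pd.DataFrame({"model": ["m1"], "score": [0.58]})
    right = pd.DataFrame({"model": ["m1"], "score": [0.41]})
    return left, right

with gr.Blocks() as demo:
    t1 = gr.Dataframe()
    t2 = gr.Dataframe()
    # Two returned values -> two outputs; a mismatch fails when the event fires,
    # which is why dropping the four Elo DataFrames touches both ends.
    demo.load(fn=refresh, inputs=None, outputs=[t1, t2])
```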

@@ -399,7 +367,6 @@ with main_block as demo:
     - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
     - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
     - `Average` is the average of `Complete` and `Instruct` when both are available.
-    - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
     - `#Act Params (B)` is the number of activated model parameters during inference.
     - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
     - For more details check the 📝 About section.
@@ -407,20 +374,21 @@ with main_block as demo:
     elem_classes="markdown-text",
 )
 
-            with gr.TabItem("📊 Elo Rating", id="hard_elo"):
-                with gr.Column():
-                    with gr.Group():
-                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
-                        hard_task_elo_map = gr.Plot()
-                        hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
-                        demo.load(plot_elo_mle, [hard_elo_task_gr],
-                                  hard_task_elo_map)
-                    with gr.Group():
-                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
-                        hard_bench_elo_map = gr.Plot()
-                        hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
-                        demo.load(plot_elo_mle, [hard_elo_bench_gr],
-                                  hard_bench_elo_map)
+            # Comment out or remove the Elo Rating tab
+            # with gr.TabItem("📊 Elo Rating", id="hard_elo"):
+            #     with gr.Column():
+            #         with gr.Group():
+            #             gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+            #             hard_task_elo_map = gr.Plot()
+            #             hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
+            #             demo.load(plot_elo_mle, [hard_elo_task_gr],
+            #                       hard_task_elo_map)
+            #         with gr.Group():
+            #             gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+            #             hard_bench_elo_map = gr.Plot()
+            #             hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
+            #             demo.load(plot_elo_mle, [hard_elo_bench_gr],
+            #                       hard_bench_elo_map)
 
             with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
                 with gr.Column():
@@ -448,27 +416,26 @@ with main_block as demo:
     - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
     - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
     - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
-    - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
-    - `size` is the amount of activated model weight during inference.
     - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
     - For more details check the 📝 About section.
     """,
     elem_classes="markdown-text",
 )
 
-            with gr.TabItem("📊 Elo Rating", id="full_elo"):
-                with gr.Column():
-                    with gr.Group():
-
-                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
-                        task_elo_map = gr.Plot()
-                        elo_task_gr = init_others(ELO_TASK_DF)
-                        demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
-                    with gr.Group():
-                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
-                        bench_elo_map = gr.Plot()
-                        elo_bench_gr = init_others(ELO_BENCH_DF)
-                        demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
+            # Comment out or remove the Elo Rating tab
+            # with gr.TabItem("📊 Elo Rating", id="full_elo"):
+            #     with gr.Column():
+            #         with gr.Group():
+            #
+            #             gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+            #             task_elo_map = gr.Plot()
+            #             elo_task_gr = init_others(ELO_TASK_DF)
+            #             demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
+            #         with gr.Group():
+            #             gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+            #             bench_elo_map = gr.Plot()
+            #             elo_bench_gr = init_others(ELO_BENCH_DF)
+            #             demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
 
             with gr.TabItem("🧩 Solve Rate", id="full_solve"):
                 with gr.Column():
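
Both info panels above cite a "calibrated Pass@1" score. Plain pass@k is conventionally computed with the unbiased estimator from the HumanEval paper; a short sketch of that formula (BigCodeBench's extra calibration step is not part of this commit and is not shown):

```python
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: n samples drawn, c of them pass the tests, budget k."""
    if n - c < k:
        return 1.0
    # 1 - C(n-c, k) / C(n, k), evaluated as a numerically stable running product
    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

print(pass_at_k(n=20, c=3, k=1))  # 0.15, i.e. c/n when k == 1
```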

@@ -602,7 +569,7 @@ with main_block as demo:
         show_copy_button=True,
     )
 
-    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard,
+    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
     # main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
     # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
     # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
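
For context on the feature being removed: the deleted leaderboard text described `Elo Rating` as a task-level bootstrap of a maximum-likelihood Elo fit, starting at 1000 and resampled 500 times. A rough, self-contained sketch of that general recipe (an illustrative reimplementation under assumed data shapes, not the pipeline that produced ELO_REPO):

```python
import numpy as np
import pandas as pd

def fit_elo_mle(battles: pd.DataFrame, scale=400.0, base=10.0,
                init=1000.0, lr=0.1, steps=2000) -> pd.Series:
    """battles: one row per pairwise outcome, columns ['winner', 'loser']."""
    models = sorted(set(battles["winner"]) | set(battles["loser"]))
    idx = {m: i for i, m in enumerate(models)}
    r = np.full(len(models), init)
    w = battles["winner"].map(idx).to_numpy()
    l = battles["loser"].map(idx).to_numpy()
    for _ in range(steps):
        # P(winner beats loser) under the Elo / Bradley-Terry model
        p = 1.0 / (1.0 + base ** ((r[l] - r[w]) / scale))
        g = np.zeros_like(r)
        np.add.at(g, w, 1.0 - p)     # push winners up by the surprise factor
        np.add.at(g, l, -(1.0 - p))  # and losers down symmetrically
        r += lr * g                  # gradient ascent on the log-likelihood
    return pd.Series(r, index=models)

def bootstrap_elo(battles: pd.DataFrame, rounds=500, seed=0) -> pd.DataFrame:
    """Refit on `rounds` resampled battle tables to estimate rating spread."""
    rng = np.random.default_rng(seed)
    fits = [fit_elo_mle(battles.sample(frac=1.0, replace=True,
                                       random_state=int(rng.integers(1 << 31))))
            for _ in range(rounds)]
    return pd.concat(fits, axis=1)  # one column per bootstrap round
```

The error bars drawn by the now-disabled `plot_elo_mle` would then be quantiles of each model's row in that bootstrap matrix.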
src/display/utils.py
CHANGED

@@ -54,7 +54,7 @@ column_map = {
     "complete": "Complete",
     "instruct": "Instruct",
     "average": "Average",
-    "elo_mle": "Elo Rating",
+    # "elo_mle": "Elo Rating",
     "link": "Link",
     "act_param": "#Act Params (B)",
     "size": "#Params (B)",
@@ -96,7 +96,7 @@ auto_eval_column_dict.append(["size_range", ColumnContent, ColumnContent(column_
 auto_eval_column_dict.append(["complete", ColumnContent, ColumnContent(column_map["complete"], "number", True)])
 auto_eval_column_dict.append(["instruct", ColumnContent, ColumnContent(column_map["instruct"], "number", True)])
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent(column_map["average"], "number", True)])
-auto_eval_column_dict.append(["elo_mle", ColumnContent, ColumnContent(column_map["elo_mle"], "number", True)])
+# auto_eval_column_dict.append(["elo_mle", ColumnContent, ColumnContent(column_map["elo_mle"], "number", True)])
 
 # Model information
 auto_eval_column_dict.append(["act_param", ColumnContent, ColumnContent(column_map["act_param"], "number", True)])
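
For orientation: in leaderboard codebases built on this template, `auto_eval_column_dict` rows are typically collapsed into a frozen dataclass via `make_dataclass`, which is why commenting out the `elo_mle` row removes the column everywhere downstream. A sketch of that mechanism (an assumption about the surrounding code, which this diff does not show):

```python
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool

auto_eval_column_dict = [
    ["average", ColumnContent, ColumnContent("Average", "number", True)],
    ["act_param", ColumnContent, ColumnContent("#Act Params (B)", "number", True)],
]

# Each [attr, type, default] triple becomes one field of the generated class.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print(AutoEvalColumn.average.name)  # -> "Average"
```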
src/tools/plots.py
CHANGED

@@ -3,18 +3,18 @@ import plotly.express as px
 import numpy as np
 
 
-def plot_elo_mle(df):
-    fig = px.scatter(df, x="model", y="rating", error_y="error_y",
-                     error_y_minus="error_y_minus",
-                     # title="Bootstrap of Elo MLE Estimates (BigCodeBench-Complete)"
-                     )
-    fig.update_layout(xaxis_title="Model",
-                      yaxis_title="Rating",
-                      autosize=True,
-                      # width=1300,
-                      # height=900,
-                      )
-    return fig
+# def plot_elo_mle(df):
+#     fig = px.scatter(df, x="model", y="rating", error_y="error_y",
+#                      error_y_minus="error_y_minus",
+#                      # title="Bootstrap of Elo MLE Estimates (BigCodeBench-Complete)"
+#                      )
+#     fig.update_layout(xaxis_title="Model",
+#                       yaxis_title="Rating",
+#                       autosize=True,
+#                       # width=1300,
+#                       # height=900,
+#                       )
+#     return fig
 
 
 def plot_solve_rate(df, task, rows=30, cols=38):
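
The disabled function above is a thin wrapper over plotly's asymmetric error bars. A runnable stand-in with synthetic ratings (all numbers made up), should the Elo tab ever return:

```python
import pandas as pd
import plotly.express as px

df = pd.DataFrame({
    "model": ["model-a", "model-b", "model-c"],
    "rating": [1105.0, 1010.0, 940.0],
    "error_y": [12.0, 15.0, 11.0],        # upper confidence offset
    "error_y_minus": [10.0, 14.0, 12.0],  # lower confidence offset
})

fig = px.scatter(df, x="model", y="rating",
                 error_y="error_y", error_y_minus="error_y_minus")
fig.update_layout(xaxis_title="Model", yaxis_title="Rating", autosize=True)
fig.show()
```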