Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
93fda91
1
Parent(s):
394f64e
fix bugs - 0819
Browse files
1) no hyperlink for "BM25" -> add hyperlink
2) unique key of df -> set to timestamp
- app.py +57 -15
- src/display/formatting.py +1 -1
- src/display/gradio_formatting.py +3 -3
- src/envs.py +10 -1
- src/read_evals.py +2 -2
- src/utils.py +2 -2
app.py
CHANGED
|
@@ -8,15 +8,57 @@ from src.about import (
|
|
| 8 |
TITLE,
|
| 9 |
EVALUATION_QUEUE_TEXT
|
| 10 |
)
|
| 11 |
-
from src.benchmarks import
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
from src.display.css_html_js import custom_css
|
| 14 |
-
from src.display.utils import
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
from src.display.gradio_listener import set_listeners
|
| 21 |
|
| 22 |
def restart_space():
|
|
@@ -32,7 +74,7 @@ except Exception as e:
|
|
| 32 |
print(f'failed to download')
|
| 33 |
restart_space()
|
| 34 |
|
| 35 |
-
raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/
|
| 36 |
|
| 37 |
original_df_qa = get_leaderboard_df(
|
| 38 |
raw_data, task='qa', metric=DEFAULT_METRIC_QA)
|
|
@@ -190,7 +232,7 @@ with demo:
|
|
| 190 |
queue=True
|
| 191 |
)
|
| 192 |
with gr.TabItem("Reranking Only", id=12):
|
| 193 |
-
lb_df_reranker = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] ==
|
| 194 |
lb_df_reranker = reset_rank(lb_df_reranker)
|
| 195 |
reranking_models_reranker = lb_df_reranker[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
|
| 196 |
with gr.Row():
|
|
@@ -199,7 +241,7 @@ with demo:
|
|
| 199 |
with gr.Column(scale=1):
|
| 200 |
search_bar_reranker = gr.Textbox(show_label=False, visible=False)
|
| 201 |
lb_table_reranker = get_leaderboard_table(lb_df_reranker, types_qa)
|
| 202 |
-
hidden_lb_df_reranker = original_df_qa[original_df_qa[COL_NAME_RETRIEVAL_MODEL] ==
|
| 203 |
hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
|
| 204 |
hidden_lb_table_reranker = get_leaderboard_table(
|
| 205 |
hidden_lb_df_reranker, types_qa, visible=False
|
|
@@ -345,7 +387,7 @@ with demo:
|
|
| 345 |
)
|
| 346 |
with gr.TabItem("Reranking Only", id=22):
|
| 347 |
lb_df_reranker_ldoc = leaderboard_df_long_doc[
|
| 348 |
-
leaderboard_df_long_doc[COL_NAME_RETRIEVAL_MODEL] ==
|
| 349 |
]
|
| 350 |
lb_df_reranker_ldoc = reset_rank(lb_df_reranker_ldoc)
|
| 351 |
reranking_models_reranker_ldoc = lb_df_reranker_ldoc[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
|
|
@@ -355,7 +397,7 @@ with demo:
|
|
| 355 |
with gr.Column(scale=1):
|
| 356 |
search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
|
| 357 |
lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, types_long_doc)
|
| 358 |
-
hidden_lb_df_reranker_ldoc = original_df_long_doc[original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] ==
|
| 359 |
hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
|
| 360 |
hidden_lb_table_reranker_ldoc = get_leaderboard_table(
|
| 361 |
hidden_lb_df_reranker_ldoc, types_long_doc, visible=False
|
|
@@ -414,8 +456,8 @@ with demo:
|
|
| 414 |
with gr.Row():
|
| 415 |
with gr.Column():
|
| 416 |
benchmark_version = gr.Dropdown(
|
| 417 |
-
|
| 418 |
-
value=
|
| 419 |
interactive=True,
|
| 420 |
label="AIR-Bench Version")
|
| 421 |
with gr.Row():
|
|
|
|
| 8 |
TITLE,
|
| 9 |
EVALUATION_QUEUE_TEXT
|
| 10 |
)
|
| 11 |
+
from src.benchmarks import (
|
| 12 |
+
DOMAIN_COLS_QA,
|
| 13 |
+
LANG_COLS_QA,
|
| 14 |
+
DOMAIN_COLS_LONG_DOC,
|
| 15 |
+
LANG_COLS_LONG_DOC,
|
| 16 |
+
METRIC_LIST,
|
| 17 |
+
DEFAULT_METRIC_QA,
|
| 18 |
+
DEFAULT_METRIC_LONG_DOC
|
| 19 |
+
)
|
| 20 |
from src.display.css_html_js import custom_css
|
| 21 |
+
from src.display.utils import (
|
| 22 |
+
COL_NAME_IS_ANONYMOUS,
|
| 23 |
+
COL_NAME_REVISION,
|
| 24 |
+
COL_NAME_TIMESTAMP,
|
| 25 |
+
COL_NAME_RERANKING_MODEL,
|
| 26 |
+
COL_NAME_RETRIEVAL_MODEL
|
| 27 |
+
)
|
| 28 |
+
from src.envs import (
|
| 29 |
+
API,
|
| 30 |
+
EVAL_RESULTS_PATH,
|
| 31 |
+
REPO_ID,
|
| 32 |
+
RESULTS_REPO,
|
| 33 |
+
TOKEN,
|
| 34 |
+
BM25_LINK,
|
| 35 |
+
BENCHMARK_VERSION_LIST,
|
| 36 |
+
LATEST_BENCHMARK_VERSION
|
| 37 |
+
)
|
| 38 |
+
from src.read_evals import (
|
| 39 |
+
get_raw_eval_results,
|
| 40 |
+
get_leaderboard_df
|
| 41 |
+
)
|
| 42 |
+
from src.utils import (
|
| 43 |
+
update_metric,
|
| 44 |
+
upload_file,
|
| 45 |
+
get_default_cols,
|
| 46 |
+
submit_results,
|
| 47 |
+
reset_rank,
|
| 48 |
+
remove_html
|
| 49 |
+
)
|
| 50 |
+
from src.display.gradio_formatting import (
|
| 51 |
+
get_version_dropdown,
|
| 52 |
+
get_search_bar,
|
| 53 |
+
get_reranking_dropdown,
|
| 54 |
+
get_metric_dropdown,
|
| 55 |
+
get_domain_dropdown,
|
| 56 |
+
get_language_dropdown,
|
| 57 |
+
get_anonymous_checkbox,
|
| 58 |
+
get_revision_and_ts_checkbox,
|
| 59 |
+
get_leaderboard_table,
|
| 60 |
+
get_noreranking_dropdown
|
| 61 |
+
)
|
| 62 |
from src.display.gradio_listener import set_listeners
|
| 63 |
|
| 64 |
def restart_space():
|
|
|
|
| 74 |
print(f'failed to download')
|
| 75 |
restart_space()
|
| 76 |
|
| 77 |
+
raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/{LATEST_BENCHMARK_VERSION}")
|
| 78 |
|
| 79 |
original_df_qa = get_leaderboard_df(
|
| 80 |
raw_data, task='qa', metric=DEFAULT_METRIC_QA)
|
|
|
|
| 232 |
queue=True
|
| 233 |
)
|
| 234 |
with gr.TabItem("Reranking Only", id=12):
|
| 235 |
+
lb_df_reranker = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
|
| 236 |
lb_df_reranker = reset_rank(lb_df_reranker)
|
| 237 |
reranking_models_reranker = lb_df_reranker[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
|
| 238 |
with gr.Row():
|
|
|
|
| 241 |
with gr.Column(scale=1):
|
| 242 |
search_bar_reranker = gr.Textbox(show_label=False, visible=False)
|
| 243 |
lb_table_reranker = get_leaderboard_table(lb_df_reranker, types_qa)
|
| 244 |
+
hidden_lb_df_reranker = original_df_qa[original_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
|
| 245 |
hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
|
| 246 |
hidden_lb_table_reranker = get_leaderboard_table(
|
| 247 |
hidden_lb_df_reranker, types_qa, visible=False
|
|
|
|
| 387 |
)
|
| 388 |
with gr.TabItem("Reranking Only", id=22):
|
| 389 |
lb_df_reranker_ldoc = leaderboard_df_long_doc[
|
| 390 |
+
leaderboard_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
|
| 391 |
]
|
| 392 |
lb_df_reranker_ldoc = reset_rank(lb_df_reranker_ldoc)
|
| 393 |
reranking_models_reranker_ldoc = lb_df_reranker_ldoc[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
|
|
|
|
| 397 |
with gr.Column(scale=1):
|
| 398 |
search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
|
| 399 |
lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, types_long_doc)
|
| 400 |
+
hidden_lb_df_reranker_ldoc = original_df_long_doc[original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
|
| 401 |
hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
|
| 402 |
hidden_lb_table_reranker_ldoc = get_leaderboard_table(
|
| 403 |
hidden_lb_df_reranker_ldoc, types_long_doc, visible=False
|
|
|
|
| 456 |
with gr.Row():
|
| 457 |
with gr.Column():
|
| 458 |
benchmark_version = gr.Dropdown(
|
| 459 |
+
BENCHMARK_VERSION_LIST,
|
| 460 |
+
value=LATEST_BENCHMARK_VERSION,
|
| 461 |
interactive=True,
|
| 462 |
label="AIR-Bench Version")
|
| 463 |
with gr.Row():
|
src/display/formatting.py
CHANGED
|
@@ -4,7 +4,7 @@ def model_hyperlink(link, model_name):
|
|
| 4 |
|
| 5 |
def make_clickable_model(model_name: str, model_link: str):
|
| 6 |
# link = f"https://huggingface.co/{model_name}"
|
| 7 |
-
if not model_link or not model_link.startswith("https://")
|
| 8 |
return model_name
|
| 9 |
return model_hyperlink(model_link, model_name)
|
| 10 |
|
|
|
|
| 4 |
|
| 5 |
def make_clickable_model(model_name: str, model_link: str):
|
| 6 |
# link = f"https://huggingface.co/{model_name}"
|
| 7 |
+
if not model_link or not model_link.startswith("https://"):
|
| 8 |
return model_name
|
| 9 |
return model_hyperlink(model_link, model_name)
|
| 10 |
|
src/display/gradio_formatting.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
|
| 3 |
|
| 4 |
def get_version_dropdown():
|
| 5 |
return gr.Dropdown(
|
| 6 |
-
choices=
|
| 7 |
-
value=
|
| 8 |
label="Select the version of AIR-Bench",
|
| 9 |
interactive=True
|
| 10 |
)
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
from src.envs import BENCHMARK_VERSION_LIST, LATEST_BENCHMARK_VERSION
|
| 3 |
|
| 4 |
def get_version_dropdown():
|
| 5 |
return gr.Dropdown(
|
| 6 |
+
choices=BENCHMARK_VERSION_LIST,
|
| 7 |
+
value=LATEST_BENCHMARK_VERSION,
|
| 8 |
label="Select the version of AIR-Bench",
|
| 9 |
interactive=True
|
| 10 |
)
|
src/envs.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import os
|
| 2 |
-
|
| 3 |
from huggingface_hub import HfApi
|
| 4 |
|
| 5 |
# Info to change for your repository
|
|
@@ -22,3 +22,12 @@ CACHE_PATH = os.getenv("HF_HOME", ".")
|
|
| 22 |
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval_results")
|
| 23 |
|
| 24 |
API = HfApi(token=TOKEN)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
from display.formatting import model_hyperlink
|
| 3 |
from huggingface_hub import HfApi
|
| 4 |
|
| 5 |
# Info to change for your repository
|
|
|
|
| 22 |
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval_results")
|
| 23 |
|
| 24 |
API = HfApi(token=TOKEN)
|
| 25 |
+
|
| 26 |
+
BM25_LINK = model_hyperlink("https://github.com/castorini/pyserini", "BM25")
|
| 27 |
+
|
| 28 |
+
BENCHMARK_VERSION_LIST = [
|
| 29 |
+
"AIR-Bench_24.04",
|
| 30 |
+
# "AIR-Bench_24.05",
|
| 31 |
+
]
|
| 32 |
+
|
| 33 |
+
LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[-1]
|
src/read_evals.py
CHANGED
|
@@ -174,8 +174,8 @@ def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
|
|
| 174 |
print(f"loading file failed. {model_result_filepath}")
|
| 175 |
continue
|
| 176 |
print(f'file loaded: {model_result_filepath}')
|
| 177 |
-
|
| 178 |
-
eval_results[
|
| 179 |
|
| 180 |
results = []
|
| 181 |
for k, v in eval_results.items():
|
|
|
|
| 174 |
print(f"loading file failed. {model_result_filepath}")
|
| 175 |
continue
|
| 176 |
print(f'file loaded: {model_result_filepath}')
|
| 177 |
+
timestamp = eval_result.timestamp
|
| 178 |
+
eval_results[timestamp] = eval_result
|
| 179 |
|
| 180 |
results = []
|
| 181 |
for k, v in eval_results.items():
|
src/utils.py
CHANGED
|
@@ -10,7 +10,7 @@ from src.benchmarks import BENCHMARK_COLS_QA, BENCHMARK_COLS_LONG_DOC, Benchmark
|
|
| 10 |
from src.display.formatting import styled_message, styled_error
|
| 11 |
from src.display.utils import COLS_QA, TYPES_QA, COLS_LONG_DOC, TYPES_LONG_DOC, COL_NAME_RANK, COL_NAME_AVG, \
|
| 12 |
COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL, COL_NAME_IS_ANONYMOUS, COL_NAME_TIMESTAMP, COL_NAME_REVISION, get_default_auto_eval_column_dict
|
| 13 |
-
from src.envs import API, SEARCH_RESULTS_REPO
|
| 14 |
from src.read_evals import FullEvalResult, get_leaderboard_df, calculate_mean
|
| 15 |
|
| 16 |
import re
|
|
@@ -251,7 +251,7 @@ def submit_results(
|
|
| 251 |
model_url: str,
|
| 252 |
reranking_model: str="",
|
| 253 |
reranking_model_url: str="",
|
| 254 |
-
version: str=
|
| 255 |
is_anonymous=False):
|
| 256 |
if not filepath.endswith(".zip"):
|
| 257 |
return styled_error(f"file uploading aborted. wrong file type: {filepath}")
|
|
|
|
| 10 |
from src.display.formatting import styled_message, styled_error
|
| 11 |
from src.display.utils import COLS_QA, TYPES_QA, COLS_LONG_DOC, TYPES_LONG_DOC, COL_NAME_RANK, COL_NAME_AVG, \
|
| 12 |
COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL, COL_NAME_IS_ANONYMOUS, COL_NAME_TIMESTAMP, COL_NAME_REVISION, get_default_auto_eval_column_dict
|
| 13 |
+
from src.envs import API, SEARCH_RESULTS_REPO, LATEST_BENCHMARK_VERSION
|
| 14 |
from src.read_evals import FullEvalResult, get_leaderboard_df, calculate_mean
|
| 15 |
|
| 16 |
import re
|
|
|
|
| 251 |
model_url: str,
|
| 252 |
reranking_model: str="",
|
| 253 |
reranking_model_url: str="",
|
| 254 |
+
version: str=LATEST_BENCHMARK_VERSION,
|
| 255 |
is_anonymous=False):
|
| 256 |
if not filepath.endswith(".zip"):
|
| 257 |
return styled_error(f"file uploading aborted. wrong file type: {filepath}")
|