Spaces:
Running
Running
linzhengyu
committed on
Commit
·
2576caa
1
Parent(s):
e20b429
feat: list benchmarks in different tabs
Browse files- app.py +21 -9
- src/about.py +4 -0
- src/display/utils.py +1 -1
- uv.lock +1 -12
app.py
CHANGED
|
@@ -13,11 +13,13 @@ from src.about import (
|
|
| 13 |
INTRODUCTION_TEXT,
|
| 14 |
LLM_BENCHMARKS_TEXT,
|
| 15 |
TITLE,
|
|
|
|
| 16 |
)
|
| 17 |
from src.display.css_html_js import custom_css
|
| 18 |
from src.display.utils import (
|
| 19 |
BENCHMARK_COLS,
|
| 20 |
COLS,
|
|
|
|
| 21 |
EVAL_COLS,
|
| 22 |
EVAL_TYPES,
|
| 23 |
AutoEvalColumn,
|
|
@@ -76,17 +78,17 @@ LEADERBOARD_DF = get_leaderboard_df(
|
|
| 76 |
) = get_evaluation_queue_df(settings.EVAL_REQUESTS_PATH, EVAL_COLS)
|
| 77 |
|
| 78 |
|
| 79 |
-
def init_leaderboard(dataframe: pd.DataFrame) -> Leaderboard:
|
| 80 |
if dataframe is None or dataframe.empty:
|
| 81 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 82 |
# print("///// --- dataframe.head() --- /////", Markdown(dataframe.head().to_markdown() or "No data"))
|
| 83 |
selected_columns = SelectColumns(
|
| 84 |
-
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
|
| 85 |
-
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
|
| 86 |
label="Select Columns to Display:",
|
| 87 |
)
|
| 88 |
search_columns = [AutoEvalColumn.model.name, AutoEvalColumn.license.name]
|
| 89 |
-
hidden_columns = [c.name for c in fields(AutoEvalColumn) if c.hidden]
|
| 90 |
filter_columns = [
|
| 91 |
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
|
| 92 |
ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
|
|
@@ -106,7 +108,7 @@ def init_leaderboard(dataframe: pd.DataFrame) -> Leaderboard:
|
|
| 106 |
]
|
| 107 |
return Leaderboard(
|
| 108 |
value=dataframe,
|
| 109 |
-
datatype=[c.type for c in fields(AutoEvalColumn)],
|
| 110 |
select_columns=selected_columns,
|
| 111 |
search_columns=search_columns,
|
| 112 |
hide_columns=hidden_columns,
|
|
@@ -122,13 +124,23 @@ with demo:
|
|
| 122 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 123 |
|
| 124 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 125 |
-
with gr.TabItem("π
LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
| 126 |
-
leaderboard = init_leaderboard(LEADERBOARD_DF)
|
| 127 |
|
| 128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 130 |
|
| 131 |
-
with gr.TabItem("π Submit here! ", elem_id="llm-benchmark-tab-table", id=
|
| 132 |
with gr.Column():
|
| 133 |
with gr.Row():
|
| 134 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
|
|
|
| 13 |
INTRODUCTION_TEXT,
|
| 14 |
LLM_BENCHMARKS_TEXT,
|
| 15 |
TITLE,
|
| 16 |
+
BENCHMARKS,
|
| 17 |
)
|
| 18 |
from src.display.css_html_js import custom_css
|
| 19 |
from src.display.utils import (
|
| 20 |
BENCHMARK_COLS,
|
| 21 |
COLS,
|
| 22 |
+
BASE_COLS,
|
| 23 |
EVAL_COLS,
|
| 24 |
EVAL_TYPES,
|
| 25 |
AutoEvalColumn,
|
|
|
|
| 78 |
) = get_evaluation_queue_df(settings.EVAL_REQUESTS_PATH, EVAL_COLS)
|
| 79 |
|
| 80 |
|
| 81 |
+
def init_leaderboard(dataframe: pd.DataFrame, cols: list[str]) -> Leaderboard:
|
| 82 |
if dataframe is None or dataframe.empty:
|
| 83 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 84 |
# print("///// --- dataframe.head() --- /////", Markdown(dataframe.head().to_markdown() or "No data"))
|
| 85 |
selected_columns = SelectColumns(
|
| 86 |
+
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.name in cols],
|
| 87 |
+
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden and c.name in cols],
|
| 88 |
label="Select Columns to Display:",
|
| 89 |
)
|
| 90 |
search_columns = [AutoEvalColumn.model.name, AutoEvalColumn.license.name]
|
| 91 |
+
hidden_columns = [c.name for c in fields(AutoEvalColumn) if c.hidden and c.name in cols]
|
| 92 |
filter_columns = [
|
| 93 |
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
|
| 94 |
ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
|
|
|
|
| 108 |
]
|
| 109 |
return Leaderboard(
|
| 110 |
value=dataframe,
|
| 111 |
+
datatype=[c.type for c in fields(AutoEvalColumn) if c.name in cols],
|
| 112 |
select_columns=selected_columns,
|
| 113 |
search_columns=search_columns,
|
| 114 |
hide_columns=hidden_columns,
|
|
|
|
| 124 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 125 |
|
| 126 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
|
|
|
|
|
|
| 127 |
|
| 128 |
+
for i, benchmark in enumerate[str](sorted(BENCHMARKS)):
|
| 129 |
+
with gr.TabItem(f"π
{benchmark}", elem_id="llm-benchmark-tab-table", id=i):
|
| 130 |
+
benchmark_cols = [BENCHMARK_COL for BENCHMARK_COL in BENCHMARK_COLS if BENCHMARK_COL.startswith(benchmark)]
|
| 131 |
+
cols = BASE_COLS + benchmark_cols
|
| 132 |
+
BENCHMARK_DF = get_leaderboard_df(
|
| 133 |
+
settings.EVAL_RESULTS_PATH,
|
| 134 |
+
settings.EVAL_REQUESTS_PATH,
|
| 135 |
+
cols,
|
| 136 |
+
benchmark_cols,
|
| 137 |
+
)
|
| 138 |
+
leaderboard = init_leaderboard(BENCHMARK_DF, cols)
|
| 139 |
+
|
| 140 |
+
with gr.TabItem("π About", elem_id="llm-benchmark-tab-table", id=len(BENCHMARKS)):
|
| 141 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 142 |
|
| 143 |
+
with gr.TabItem("π Submit here! ", elem_id="llm-benchmark-tab-table", id=len(BENCHMARKS) + 1):
|
| 144 |
with gr.Column():
|
| 145 |
with gr.Row():
|
| 146 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
src/about.py
CHANGED
|
@@ -46,6 +46,10 @@ class Tasks(Enum):
|
|
| 46 |
task8_3 = Task(benchmark="VSI (MCQ)", metric="rand", col_name="VSI (MCQ)(rand)")
|
| 47 |
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
NUM_FEWSHOT = 0 # Change with your few shot
|
| 50 |
# ---------------------------------------------------
|
| 51 |
|
|
|
|
| 46 |
task8_3 = Task(benchmark="VSI (MCQ)", metric="rand", col_name="VSI (MCQ)(rand)")
|
| 47 |
|
| 48 |
|
| 49 |
+
BENCHMARKS = {m.value.benchmark for m in Tasks}
|
| 50 |
+
METRICS = {m.value.metric for m in Tasks}
|
| 51 |
+
COL_NAMES = {m.value.col_name for m in Tasks}
|
| 52 |
+
|
| 53 |
NUM_FEWSHOT = 0 # Change with your few shot
|
| 54 |
# ---------------------------------------------------
|
| 55 |
|
src/display/utils.py
CHANGED
|
@@ -157,7 +157,7 @@ class Precision(Enum):
|
|
| 157 |
|
| 158 |
# Column selection
|
| 159 |
COLS: list[str] = [c.name for c in fields(AutoEvalColumnCls) if not c.hidden]
|
| 160 |
-
|
| 161 |
EVAL_COLS: list[str] = [c.name for c in fields(EvalQueueColumnCls)]
|
| 162 |
EVAL_TYPES: list[Literal["str", "number", "bool", "markdown"]] = [c.type for c in fields(EvalQueueColumnCls)]
|
| 163 |
|
|
|
|
| 157 |
|
| 158 |
# Column selection
|
| 159 |
COLS: list[str] = [c.name for c in fields(AutoEvalColumnCls) if not c.hidden]
|
| 160 |
+
BASE_COLS: list[str] = [c.name for c in fields(_AutoEvalColumnBase) if not c.hidden]
|
| 161 |
EVAL_COLS: list[str] = [c.name for c in fields(EvalQueueColumnCls)]
|
| 162 |
EVAL_TYPES: list[Literal["str", "number", "bool", "markdown"]] = [c.type for c in fields(EvalQueueColumnCls)]
|
| 163 |
|
uv.lock
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
version = 1
|
| 2 |
-
revision =
|
| 3 |
requires-python = "==3.10.*"
|
| 4 |
|
| 5 |
[[package]]
|
|
@@ -687,7 +687,6 @@ dependencies = [
|
|
| 687 |
{ name = "python-dotenv" },
|
| 688 |
{ name = "rich" },
|
| 689 |
{ name = "sentencepiece" },
|
| 690 |
-
{ name = "tabulate" },
|
| 691 |
{ name = "tokenizers" },
|
| 692 |
{ name = "tqdm" },
|
| 693 |
{ name = "transformers" },
|
|
@@ -716,7 +715,6 @@ requires-dist = [
|
|
| 716 |
{ name = "python-dotenv", specifier = ">=1.2.1" },
|
| 717 |
{ name = "rich", specifier = ">=14.2.0" },
|
| 718 |
{ name = "sentencepiece" },
|
| 719 |
-
{ name = "tabulate", specifier = ">=0.9.0" },
|
| 720 |
{ name = "tokenizers", specifier = ">=0.15.0" },
|
| 721 |
{ name = "tqdm" },
|
| 722 |
{ name = "transformers" },
|
|
@@ -1307,15 +1305,6 @@ wheels = [
|
|
| 1307 |
{ url = "https://files.pythonhosted.org/packages/be/72/2db2f49247d0a18b4f1bb9a5a39a0162869acf235f3a96418363947b3d46/starlette-0.48.0-py3-none-any.whl", hash = "sha256:0764ca97b097582558ecb498132ed0c7d942f233f365b86ba37770e026510659", size = 73736, upload-time = "2025-09-13T08:41:03.869Z" },
|
| 1308 |
]
|
| 1309 |
|
| 1310 |
-
[[package]]
|
| 1311 |
-
name = "tabulate"
|
| 1312 |
-
version = "0.9.0"
|
| 1313 |
-
source = { registry = "https://pypi.org/simple" }
|
| 1314 |
-
sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" }
|
| 1315 |
-
wheels = [
|
| 1316 |
-
{ url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" },
|
| 1317 |
-
]
|
| 1318 |
-
|
| 1319 |
[[package]]
|
| 1320 |
name = "tokenizers"
|
| 1321 |
version = "0.22.1"
|
|
|
|
| 1 |
version = 1
|
| 2 |
+
revision = 3
|
| 3 |
requires-python = "==3.10.*"
|
| 4 |
|
| 5 |
[[package]]
|
|
|
|
| 687 |
{ name = "python-dotenv" },
|
| 688 |
{ name = "rich" },
|
| 689 |
{ name = "sentencepiece" },
|
|
|
|
| 690 |
{ name = "tokenizers" },
|
| 691 |
{ name = "tqdm" },
|
| 692 |
{ name = "transformers" },
|
|
|
|
| 715 |
{ name = "python-dotenv", specifier = ">=1.2.1" },
|
| 716 |
{ name = "rich", specifier = ">=14.2.0" },
|
| 717 |
{ name = "sentencepiece" },
|
|
|
|
| 718 |
{ name = "tokenizers", specifier = ">=0.15.0" },
|
| 719 |
{ name = "tqdm" },
|
| 720 |
{ name = "transformers" },
|
|
|
|
| 1305 |
{ url = "https://files.pythonhosted.org/packages/be/72/2db2f49247d0a18b4f1bb9a5a39a0162869acf235f3a96418363947b3d46/starlette-0.48.0-py3-none-any.whl", hash = "sha256:0764ca97b097582558ecb498132ed0c7d942f233f365b86ba37770e026510659", size = 73736, upload-time = "2025-09-13T08:41:03.869Z" },
|
| 1306 |
]
|
| 1307 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1308 |
[[package]]
|
| 1309 |
name = "tokenizers"
|
| 1310 |
version = "0.22.1"
|