Spaces:
Runtime error
Runtime error
Commit
·
1301ce8
1
Parent(s):
634c9ed
add
Browse files
- app.py +9 -4
- src/about.py +13 -0
- src/display/css_html_js.py +0 -18
- src/display/utils.py +8 -6
- src/leaderboard/read_evals.py +23 -16
- src/populate.py +4 -4
app.py
CHANGED
|
@@ -50,7 +50,7 @@ def restart_space():
|
|
| 50 |
# restart_space()
|
| 51 |
|
| 52 |
|
| 53 |
-
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
| 54 |
|
| 55 |
(
|
| 56 |
finished_eval_queue_df,
|
|
@@ -111,10 +111,15 @@ with demo:
|
|
| 111 |
|
| 112 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 113 |
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
| 114 |
-
leaderboard = init_leaderboard(LEADERBOARD_DF)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
-
|
| 117 |
-
|
|
|
|
| 118 |
|
| 119 |
# with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
| 120 |
# with gr.Column():
|
|
|
|
| 50 |
# restart_space()
|
| 51 |
|
| 52 |
|
| 53 |
+
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS, task="Overall")
|
| 54 |
|
| 55 |
(
|
| 56 |
finished_eval_queue_df,
|
|
|
|
| 111 |
|
| 112 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 113 |
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
| 114 |
+
# leaderboard = init_leaderboard(LEADERBOARD_DF)
|
| 115 |
+
with gr.Tabs():
|
| 116 |
+
with gr.TabItem("Overall", elem_id="overall", id=0):
|
| 117 |
+
pdb.set_trace()
|
| 118 |
+
leaderboard = init_leaderboard(LEADERBOARD_DF)
|
| 119 |
|
| 120 |
+
|
| 121 |
+
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
| 122 |
+
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 123 |
|
| 124 |
# with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
| 125 |
# with gr.Column():
|
src/about.py
CHANGED
|
@@ -19,6 +19,19 @@ class Tasks(Enum):
|
|
| 19 |
task4 = Task("sudoku", "EM", "Sudoku")
|
| 20 |
task5 = Task("drop_quote", "EM", "Drop Quote")
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
NUM_FEWSHOT = 0 # Change with your few shot
|
| 23 |
# ---------------------------------------------------
|
| 24 |
|
|
|
|
| 19 |
task4 = Task("sudoku", "EM", "Sudoku")
|
| 20 |
task5 = Task("drop_quote", "EM", "Drop Quote")
|
| 21 |
|
| 22 |
+
@dataclass
|
| 23 |
+
class Metric:
|
| 24 |
+
short: str
|
| 25 |
+
col_name: str
|
| 26 |
+
|
| 27 |
+
class Metrics(Enum):
|
| 28 |
+
CR = Metric("CR", "Completion Rate")
|
| 29 |
+
S_Acc = Metric("S-Acc", "Subtask Accuracy")
|
| 30 |
+
EM = Metric("EM", "Exact Match")
|
| 31 |
+
PM_05 = Metric("PM-0.5", "Partial Match (0.5)")
|
| 32 |
+
Tokens = Metric("Tokens", "Tokens")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
NUM_FEWSHOT = 0 # Change with your few shot
|
| 36 |
# ---------------------------------------------------
|
| 37 |
|
src/display/css_html_js.py
CHANGED
|
@@ -1,22 +1,4 @@
|
|
| 1 |
custom_css = """
|
| 2 |
-
/* 修改排序按钮颜色、大小等 */
|
| 3 |
-
.gr-datatable .sorting:before,
|
| 4 |
-
.gr-datatable .sorting:after {
|
| 5 |
-
color: #007bff !important; /* 修改排序图标颜色 */
|
| 6 |
-
font-size: 16px !important; /* 调整大小 */
|
| 7 |
-
}
|
| 8 |
-
|
| 9 |
-
/* 鼠标悬停时改变颜色 */
|
| 10 |
-
.gr-datatable .sorting:hover:before,
|
| 11 |
-
.gr-datatable .sorting:hover:after {
|
| 12 |
-
color: #ff4500 !important; /* 悬停时变色 */
|
| 13 |
-
}
|
| 14 |
-
|
| 15 |
-
/* 激活的排序图标 */
|
| 16 |
-
.gr-datatable .sorting_asc:before,
|
| 17 |
-
.gr-datatable .sorting_desc:before {
|
| 18 |
-
color: #28a745 !important; /* 绿色代表当前排序状态 */
|
| 19 |
-
}
|
| 20 |
|
| 21 |
.markdown-text {
|
| 22 |
font-size: 16px !important;
|
|
|
|
| 1 |
custom_css = """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
.markdown-text {
|
| 4 |
font-size: 16px !important;
|
src/display/utils.py
CHANGED
|
@@ -3,7 +3,7 @@ from enum import Enum
|
|
| 3 |
|
| 4 |
import pandas as pd
|
| 5 |
|
| 6 |
-
from src.about import Tasks
|
| 7 |
|
| 8 |
def fields(raw_class):
|
| 9 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
|
@@ -25,13 +25,15 @@ auto_eval_column_dict = []
|
|
| 25 |
# Init
|
| 26 |
# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
| 27 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
|
|
|
| 28 |
# auto_eval_column_dict.append(["model", ColumnContent, field(default_factory=lambda: ColumnContent("Model", "markdown", True, never_hidden=True))])
|
| 29 |
#Scores
|
| 30 |
-
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("EM ⬆️", "number", True)])
|
| 31 |
# auto_eval_column_dict.append(["average", ColumnContent, field(default_factory=lambda: ColumnContent("Average ⬆️", "number", True))])
|
| 32 |
-
for task in Tasks:
|
| 33 |
-
|
| 34 |
-
|
|
|
|
| 35 |
# Model information
|
| 36 |
# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
| 37 |
# auto_eval_column_dict.append(["model_type", ColumnContent, field(default_factory=lambda: ColumnContent("Type", "str", False))])
|
|
@@ -39,7 +41,7 @@ for task in Tasks:
|
|
| 39 |
# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
| 40 |
# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
|
| 41 |
# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
|
| 42 |
-
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
|
| 43 |
# auto_eval_column_dict.append(["params", ColumnContent, field(default_factory=lambda: ColumnContent("#Params (B)", "number", False))])
|
| 44 |
# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
| 45 |
# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
|
|
|
| 3 |
|
| 4 |
import pandas as pd
|
| 5 |
|
| 6 |
+
from src.about import Tasks, Metrics
|
| 7 |
|
| 8 |
def fields(raw_class):
|
| 9 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
|
|
|
| 25 |
# Init
|
| 26 |
# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
| 27 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
| 28 |
+
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", True)])
|
| 29 |
# auto_eval_column_dict.append(["model", ColumnContent, field(default_factory=lambda: ColumnContent("Model", "markdown", True, never_hidden=True))])
|
| 30 |
#Scores
|
| 31 |
+
# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("EM ⬆️", "number", True)])
|
| 32 |
# auto_eval_column_dict.append(["average", ColumnContent, field(default_factory=lambda: ColumnContent("Average ⬆️", "number", True))])
|
| 33 |
+
# for task in Tasks:
|
| 34 |
+
# auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
| 35 |
+
for metric in Metrics:
|
| 36 |
+
auto_eval_column_dict.append([metric.name, ColumnContent, ColumnContent(metric.value.col_name, "number", True)])
|
| 37 |
# Model information
|
| 38 |
# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
| 39 |
# auto_eval_column_dict.append(["model_type", ColumnContent, field(default_factory=lambda: ColumnContent("Type", "str", False))])
|
|
|
|
| 41 |
# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
| 42 |
# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
|
| 43 |
# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
|
| 44 |
+
# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
|
| 45 |
# auto_eval_column_dict.append(["params", ColumnContent, field(default_factory=lambda: ColumnContent("#Params (B)", "number", False))])
|
| 46 |
# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
| 47 |
# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -8,7 +8,7 @@ import dateutil
|
|
| 8 |
import numpy as np
|
| 9 |
|
| 10 |
from src.display.formatting import make_clickable_model
|
| 11 |
-
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
|
| 12 |
from src.submission.check_validity import is_model_on_hub
|
| 13 |
|
| 14 |
|
|
@@ -34,7 +34,7 @@ class EvalResult:
|
|
| 34 |
link: str = ''
|
| 35 |
|
| 36 |
@classmethod
|
| 37 |
-
def init_from_json_file(self, json_filepath):
|
| 38 |
"""Inits the result from the specific model result file"""
|
| 39 |
with open(json_filepath) as fp:
|
| 40 |
data = json.load(fp)
|
|
@@ -73,16 +73,21 @@ class EvalResult:
|
|
| 73 |
|
| 74 |
# Extract results available in this file (some results are split in several files)
|
| 75 |
results = {}
|
| 76 |
-
for task in Tasks:
|
| 77 |
-
|
| 78 |
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
return self(
|
| 88 |
eval_name=result_key,
|
|
@@ -118,7 +123,7 @@ class EvalResult:
|
|
| 118 |
|
| 119 |
def to_dict(self):
|
| 120 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
| 121 |
-
average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
|
| 122 |
data_dict = {
|
| 123 |
"eval_name": self.eval_name, # not a column, just a save name,
|
| 124 |
# AutoEvalColumn.precision.name: self.precision.value.name,
|
|
@@ -128,15 +133,17 @@ class EvalResult:
|
|
| 128 |
# AutoEvalColumn.architecture.name: self.architecture,
|
| 129 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model, self.link),
|
| 130 |
# AutoEvalColumn.revision.name: self.revision,
|
| 131 |
-
AutoEvalColumn.average.name: average,
|
| 132 |
# AutoEvalColumn.license.name: self.license,
|
| 133 |
# AutoEvalColumn.likes.name: self.likes,
|
| 134 |
AutoEvalColumn.params.name: self.num_params,
|
| 135 |
# AutoEvalColumn.still_on_hub.name: self.still_on_hub,
|
| 136 |
}
|
| 137 |
|
| 138 |
-
for task in Tasks:
|
| 139 |
-
|
|
|
|
|
|
|
| 140 |
|
| 141 |
return data_dict
|
| 142 |
|
|
@@ -164,7 +171,7 @@ class EvalResult:
|
|
| 164 |
# return request_file
|
| 165 |
|
| 166 |
|
| 167 |
-
def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
|
| 168 |
"""From the path of the results folder root, extract all needed info for results"""
|
| 169 |
model_result_filepaths = []
|
| 170 |
|
|
@@ -185,7 +192,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
| 185 |
eval_results = {}
|
| 186 |
for model_result_filepath in model_result_filepaths:
|
| 187 |
# Creation of result
|
| 188 |
-
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
| 189 |
# eval_result.update_with_request_file(requests_path)
|
| 190 |
|
| 191 |
# Store results of same eval together
|
|
|
|
| 8 |
import numpy as np
|
| 9 |
|
| 10 |
from src.display.formatting import make_clickable_model
|
| 11 |
+
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, Metrics
|
| 12 |
from src.submission.check_validity import is_model_on_hub
|
| 13 |
|
| 14 |
|
|
|
|
| 34 |
link: str = ''
|
| 35 |
|
| 36 |
@classmethod
|
| 37 |
+
def init_from_json_file(self, json_filepath, task):
|
| 38 |
"""Inits the result from the specific model result file"""
|
| 39 |
with open(json_filepath) as fp:
|
| 40 |
data = json.load(fp)
|
|
|
|
| 73 |
|
| 74 |
# Extract results available in this file (some results are split in several files)
|
| 75 |
results = {}
|
| 76 |
+
# for task in Tasks:
|
| 77 |
+
# task = task.value
|
| 78 |
|
| 79 |
+
# # We average all scores of a given metric (not all metrics are present in all files)
|
| 80 |
+
# accs = np.array([float(v.get(task.metric, None)) for k, v in data["results"].items() if task.benchmark == k.lower()])
|
| 81 |
+
# if accs.size == 0 or any([acc is None for acc in accs]):
|
| 82 |
+
# continue
|
| 83 |
|
| 84 |
+
# mean_acc = np.mean(accs)
|
| 85 |
+
# results[task.benchmark] = mean_acc
|
| 86 |
+
# import pdb; pdb.set_trace()
|
| 87 |
+
for metric in Metrics:
|
| 88 |
+
metric = metric.value
|
| 89 |
+
|
| 90 |
+
results[metric.short] = data["results"][task][metric.short]
|
| 91 |
|
| 92 |
return self(
|
| 93 |
eval_name=result_key,
|
|
|
|
| 123 |
|
| 124 |
def to_dict(self):
|
| 125 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
| 126 |
+
# average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
|
| 127 |
data_dict = {
|
| 128 |
"eval_name": self.eval_name, # not a column, just a save name,
|
| 129 |
# AutoEvalColumn.precision.name: self.precision.value.name,
|
|
|
|
| 133 |
# AutoEvalColumn.architecture.name: self.architecture,
|
| 134 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model, self.link),
|
| 135 |
# AutoEvalColumn.revision.name: self.revision,
|
| 136 |
+
# AutoEvalColumn.average.name: average,
|
| 137 |
# AutoEvalColumn.license.name: self.license,
|
| 138 |
# AutoEvalColumn.likes.name: self.likes,
|
| 139 |
AutoEvalColumn.params.name: self.num_params,
|
| 140 |
# AutoEvalColumn.still_on_hub.name: self.still_on_hub,
|
| 141 |
}
|
| 142 |
|
| 143 |
+
# for task in Tasks:
|
| 144 |
+
# data_dict[task.value.col_name] = self.results[task.value.benchmark]
|
| 145 |
+
for metric in Metrics:
|
| 146 |
+
data_dict[metric.value.col_name] = self.results[metric.value.short]
|
| 147 |
|
| 148 |
return data_dict
|
| 149 |
|
|
|
|
| 171 |
# return request_file
|
| 172 |
|
| 173 |
|
| 174 |
+
def get_raw_eval_results(results_path: str, requests_path: str, task: str) -> list[EvalResult]:
|
| 175 |
"""From the path of the results folder root, extract all needed info for results"""
|
| 176 |
model_result_filepaths = []
|
| 177 |
|
|
|
|
| 192 |
eval_results = {}
|
| 193 |
for model_result_filepath in model_result_filepaths:
|
| 194 |
# Creation of result
|
| 195 |
+
eval_result = EvalResult.init_from_json_file(model_result_filepath, task)
|
| 196 |
# eval_result.update_with_request_file(requests_path)
|
| 197 |
|
| 198 |
# Store results of same eval together
|
src/populate.py
CHANGED
|
@@ -8,17 +8,17 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
|
|
| 8 |
from src.leaderboard.read_evals import get_raw_eval_results
|
| 9 |
|
| 10 |
|
| 11 |
-
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
| 12 |
"""Creates a dataframe from all the individual experiment results"""
|
| 13 |
-
raw_data = get_raw_eval_results(results_path, requests_path)
|
| 14 |
all_data_json = [v.to_dict() for v in raw_data]
|
| 15 |
|
| 16 |
df = pd.DataFrame.from_records(all_data_json)
|
| 17 |
-
df = df.sort_values(by=[
|
| 18 |
df = df[cols].round(decimals=2)
|
| 19 |
|
| 20 |
# filter out if any of the benchmarks have not been produced
|
| 21 |
-
df = df[has_no_nan_values(df, benchmark_cols)]
|
| 22 |
return df
|
| 23 |
|
| 24 |
|
|
|
|
| 8 |
from src.leaderboard.read_evals import get_raw_eval_results
|
| 9 |
|
| 10 |
|
| 11 |
+
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, task) -> pd.DataFrame:
|
| 12 |
"""Creates a dataframe from all the individual experiment results"""
|
| 13 |
+
raw_data = get_raw_eval_results(results_path, requests_path, task)
|
| 14 |
all_data_json = [v.to_dict() for v in raw_data]
|
| 15 |
|
| 16 |
df = pd.DataFrame.from_records(all_data_json)
|
| 17 |
+
df = df.sort_values(by=["Exact Match"], ascending=False)
|
| 18 |
df = df[cols].round(decimals=2)
|
| 19 |
|
| 20 |
# filter out if any of the benchmarks have not been produced
|
| 21 |
+
# df = df[has_no_nan_values(df, benchmark_cols)]
|
| 22 |
return df
|
| 23 |
|
| 24 |
|