Update app.py
Browse files
app.py
CHANGED
|
@@ -73,16 +73,17 @@ def get_dataframe_from_results(eval_results, split):
|
|
| 73 |
local_df = local_df.remove_columns(["url"])
|
| 74 |
local_df = local_df.rename_column("model", "Model name")
|
| 75 |
local_df = local_df.rename_column("model_family", "Model family")
|
| 76 |
-
# local_df = local_df.rename_column("score", "Average score (%)")
|
| 77 |
-
# for i in [1, 2, 3]:
|
| 78 |
-
# local_df = local_df.rename_column(f"score_level{i}", f"Level {i} score (%)")
|
| 79 |
df = pd.DataFrame(local_df)
|
| 80 |
df = df.sort_values(by=["completion_level"], ascending=False)
|
| 81 |
|
| 82 |
-
numeric_cols = [c for c in local_df.column_names
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
return df
|
| 87 |
|
| 88 |
|
|
@@ -103,20 +104,6 @@ TYPES = ["markdown", "str", "str", "str", "number", "number", "number", "number"
|
|
| 103 |
LEVELS = ["all", 1, 2, 3]
|
| 104 |
|
| 105 |
|
| 106 |
-
def round_and_pad(number, ndigits=2):
|
| 107 |
-
# 四舍五入到指定的小数位数
|
| 108 |
-
rounded_number = round(number, ndigits)
|
| 109 |
-
# 转换为字符串
|
| 110 |
-
number_str = str(rounded_number)
|
| 111 |
-
# 分离整数部分和小数部分
|
| 112 |
-
integer_part, decimal_part = number_str.split('.')
|
| 113 |
-
# 如果小数部分不足指定的位数,补零
|
| 114 |
-
while len(decimal_part) < ndigits:
|
| 115 |
-
decimal_part += '0'
|
| 116 |
-
# 拼接回去,并转换回数字
|
| 117 |
-
return '.'.join([integer_part, decimal_part])
|
| 118 |
-
|
| 119 |
-
|
| 120 |
def add_new_eval(
|
| 121 |
dataset_version: str,
|
| 122 |
model: str,
|
|
@@ -156,7 +143,6 @@ def add_new_eval(
|
|
| 156 |
comprehension = {'all': 0, 1: 0, 2: 0, 3: 0}
|
| 157 |
num = {'all': 0, 1: 0, 2: 0, 3: 0}
|
| 158 |
|
| 159 |
-
# with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
|
| 160 |
with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
|
| 161 |
with open(file_path, 'r') as f:
|
| 162 |
for ix, line in enumerate(f):
|
|
@@ -173,7 +159,8 @@ def add_new_eval(
|
|
| 173 |
level = int(gold_results[val_or_test][task_name]["Level"])
|
| 174 |
score = question_scorer(task, gold_results[val_or_test][task_name])
|
| 175 |
except KeyError:
|
| 176 |
-
return format_error(
|
|
|
|
| 177 |
|
| 178 |
scored_file.write(
|
| 179 |
json.dumps({
|
|
@@ -201,11 +188,11 @@ def add_new_eval(
|
|
| 201 |
success_rate['all'] += 1
|
| 202 |
|
| 203 |
for key in LEVELS:
|
| 204 |
-
success_rate[key] =
|
| 205 |
-
completion_level[key] =
|
| 206 |
-
expertise[key] =
|
| 207 |
-
reasoning[key] =
|
| 208 |
-
comprehension[key] =
|
| 209 |
|
| 210 |
print(success_rate, completion_level, expertise, reasoning, comprehension)
|
| 211 |
|
|
@@ -265,7 +252,8 @@ def refresh():
|
|
| 265 |
dataset_version,
|
| 266 |
token=TOKEN,
|
| 267 |
download_mode="force_redownload",
|
| 268 |
-
verification_mode="no_checks"
|
|
|
|
| 269 |
)
|
| 270 |
|
| 271 |
new_eval_dataframe = {}
|
|
|
|
| 73 |
local_df = local_df.remove_columns(["url"])
|
| 74 |
local_df = local_df.rename_column("model", "Model name")
|
| 75 |
local_df = local_df.rename_column("model_family", "Model family")
|
|
|
|
|
|
|
|
|
|
| 76 |
df = pd.DataFrame(local_df)
|
| 77 |
df = df.sort_values(by=["completion_level"], ascending=False)
|
| 78 |
|
| 79 |
+
numeric_cols = [c for c in local_df.column_names if c in ["expertise", "reasoning", "comprehension"]]
|
| 80 |
+
df[numeric_cols] = df[numeric_cols].round(decimals=2)
|
| 81 |
+
|
| 82 |
+
percent_cols = [c for c in local_df.column_names if c in ["success_rate", "completion_level"]]
|
| 83 |
+
df = df.style.format("{:.2%}", subset=percent_cols)
|
| 84 |
+
|
| 85 |
+
df = df[["Model name", "Model family", "organisation", "completion_level", "success_rate", "expertise", "reasoning",
|
| 86 |
+
"comprehension"]]
|
| 87 |
return df
|
| 88 |
|
| 89 |
|
|
|
|
| 104 |
LEVELS = ["all", 1, 2, 3]
|
| 105 |
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
def add_new_eval(
|
| 108 |
dataset_version: str,
|
| 109 |
model: str,
|
|
|
|
| 143 |
comprehension = {'all': 0, 1: 0, 2: 0, 3: 0}
|
| 144 |
num = {'all': 0, 1: 0, 2: 0, 3: 0}
|
| 145 |
|
|
|
|
| 146 |
with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
|
| 147 |
with open(file_path, 'r') as f:
|
| 148 |
for ix, line in enumerate(f):
|
|
|
|
| 159 |
level = int(gold_results[val_or_test][task_name]["Level"])
|
| 160 |
score = question_scorer(task, gold_results[val_or_test][task_name])
|
| 161 |
except KeyError:
|
| 162 |
+
return format_error(
|
| 163 |
+
f"{task_name} not found in split {val_or_test}. Are you sure you submitted the correct file?")
|
| 164 |
|
| 165 |
scored_file.write(
|
| 166 |
json.dumps({
|
|
|
|
| 188 |
success_rate['all'] += 1
|
| 189 |
|
| 190 |
for key in LEVELS:
|
| 191 |
+
success_rate[key] = success_rate[key] / num[key]
|
| 192 |
+
completion_level[key] = completion_level[key] / num[key] / 10
|
| 193 |
+
expertise[key] = expertise[key] / num[key]
|
| 194 |
+
reasoning[key] = reasoning[key] / num[key]
|
| 195 |
+
comprehension[key] = comprehension[key] / num[key]
|
| 196 |
|
| 197 |
print(success_rate, completion_level, expertise, reasoning, comprehension)
|
| 198 |
|
|
|
|
| 252 |
dataset_version,
|
| 253 |
token=TOKEN,
|
| 254 |
download_mode="force_redownload",
|
| 255 |
+
verification_mode="no_checks",
|
| 256 |
+
trust_remote_code=True
|
| 257 |
)
|
| 258 |
|
| 259 |
new_eval_dataframe = {}
|