Spaces: Runtime error
Commit 031a6d6 · Parent(s): 32b2b23
Zhejian committed: bugfix
Browse files:
- app.py       +103 -32
- env.py       +1 -1
- evaluator.py +3 -6
- schemas.py   +1 -0
- score.py     +12 -37
app.py
CHANGED

@@ -21,12 +21,10 @@ from content import (
     format_log,
 )
 from evaluator import Evaluator
-from score import init_evaluators, score_item
+from score import init_evaluators, score_item, load_agent_output_dataset, polish_scores
 from loguru import logger
 
 
-# from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-
 from datasets import load_dataset, VerificationMode, Dataset, concatenate_datasets
 
 from utils import parse_eval_dataset, parseaddr

@@ -41,7 +39,8 @@ from env import (
     SUBMISSION_DATASET,
     INTERNAL_DATASET,
     EVALUATE_RESULT_DATASET,
-    REPO_ID
+    REPO_ID,
+    CONTACT_DATASET
 )
 
 TOKEN = os.getenv("HF_TOKEN")

@@ -61,6 +60,41 @@ benchmark_dataset = parse_eval_dataset(benchmark_internal_evaluate_dataset) # ty
 evaluator_list = init_evaluators(benchmark_dataset, llm_config)
 
 
+
+def save_contact_info(contact_info):
+    import tempfile
+    import json
+
+    # Load the existing contact info
+    try:
+        contact_infos = load_dataset(
+            CONTACT_DATASET,
+            data_files=CONTACT_DATASET_FILE,
+            token=TOKEN,
+            download_mode="force_redownload",
+            verification_mode=VerificationMode.NO_CHECKS,
+            trust_remote_code=True
+        )
+        contact_info_list = list(contact_infos['train'])
+    except Exception as e:
+        print(f"Error loading contact info: {e}")
+        contact_info_list = []
+
+    # Append the new entry so the uploaded file actually contains it
+    contact_info_list.append(contact_info)
+
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as temp_file:
+        json.dump(contact_info_list, temp_file, default=str, indent=4)
+        temp_file_path = temp_file.name
+
+    API.upload_file(
+        path_or_fileobj=temp_file_path,
+        path_in_repo=CONTACT_DATASET_FILE,
+        repo_id=CONTACT_DATASET,
+        repo_type='dataset',
+        token=TOKEN,
+        commit_message=f"Add new contact: {contact_info['model']} by {contact_info['organisation']}"
+    )
+
+    os.unlink(temp_file_path)
+
 def get_dataframe_from_results(eval_results, split:str = 'train'):
     try:
         if hasattr(eval_results, "__getitem__"):

@@ -80,7 +114,6 @@ def get_dataframe_from_results(eval_results, split:str = 'train'):
             print(f"Error applying model hyperlink: {e}")
             pass
 
-        # Rename columns
         column_renames = {
             "model": "Agent name",
             "model_family": "Model family",

@@ -108,7 +141,6 @@ def get_dataframe_from_results(eval_results, split:str = 'train'):
         except:
             pass
 
-        # Process numeric values
         try:
             numeric_cols = [c for c in df.columns if "score" in c.lower()]
             if numeric_cols:

@@ -157,6 +189,14 @@ def add_new_eval(
     profile: gr.OAuthProfile,
 ):
     try:
+        # Check that the uploaded file is valid JSON
+        try:
+            with open(path_to_file, 'r', encoding='utf-8') as f:
+                json.load(f)  # try to parse the JSON
+        except json.JSONDecodeError:
+            return styled_error("Please upload a valid JSON file.")
+        except Exception as e:
+            return styled_error(f"File read error: {str(traceback.format_exc())}")
         if not LOCAL_DEBUG:
             print(profile)
             print(path_to_file)

@@ -168,12 +208,11 @@ def add_new_eval(
         if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=1):
             return styled_error("This account is not authorized to submit on CAIA.")
 
-        contact_infos = load_dataset(
+        contact_infos = load_dataset(CONTACT_DATASET, data_files=CONTACT_DATASET_FILE, token=TOKEN,
             download_mode="force_redownload",
             verification_mode=VerificationMode.NO_CHECKS,
             trust_remote_code=True)
 
-        # print("Contact infos features:", contact_infos['train'].features)
 
         user_submission_dates = []
         try:

@@ -185,6 +224,7 @@ def add_new_eval(
             print(f"Error getting user submission dates: {e}")
 
         user_submission_dates = sorted(user_submission_dates)
+        user_submission_dates = [date.strftime('%Y-%m-%d') if isinstance(date, pd.Timestamp) else datetime.datetime.strptime(str(date), '%Y-%m-%d %H:%M:%S.%f').strftime('%Y-%m-%d') for date in user_submission_dates if date]
         if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
            return styled_error("You already submitted once today, please try again tomorrow.")
 

@@ -228,31 +268,69 @@ def add_new_eval(
             "organisation": organisation,
             "username": profile.username,
             "mail": mail,
-            "date": pd.Timestamp(datetime.datetime.now())
+            "date": pd.Timestamp(datetime.datetime.now())
         }
-        print("contact_info", contact_info)
-        temp_file_path = "temp_contact_info.json"
-        with open(temp_file_path, 'w') as f:
-            json.dump(contact_info_list, f)
-        to_add = Dataset.from_list([contact_info], features=contact_infos['train'].features)
-        new_data= concatenate_datasets([contact_infos['train'], to_add])
-        contact_infos['train'] = new_data
         if LOCAL_DEBUG:
             print("mock uploaded contact info")
         else:
-
-            upload_file(
-                path_or_fileobj=temp_file_path,
-                path_in_repo=CONTACT_DATASET_FILE,  # target JSON file path
-                repo_id=INTERNAL_DATASET,
-                token=TOKEN
-            )
+            save_contact_info(contact_info)
 
         # SCORE SUBMISSION
         file_path = path_to_file.name
         print("Simulating scoring process...")
-
-
+
+
+        agent_output = load_agent_output_dataset(dataset_path=file_path)
+        agent_output_task_ids = set(output.task_id for output in agent_output)
+        benchmark_task_ids = set(item.task_id for item in benchmark_dataset)
+        if agent_output_task_ids != benchmark_task_ids:
+            return styled_error("The task IDs in agent outputs do not match the task IDs in benchmark dataset.")
+        l1,l2,l3 = [],[],[]
+        for output in agent_output:
+            task_id = output.task_id
+            to_evaluate_item = [item for item in benchmark_dataset if item.task_id == task_id]
+            if not to_evaluate_item:
+                # score,detail_result = 0.0, None
+                continue
+            else:
+                level = to_evaluate_item[0].level
+                score, detail_result = asyncio.run(score_item(evaluator_list=evaluator_list, agent_output_item=output, to_evaluate_item=to_evaluate_item[0]))
+                print(score, task_id, level)
+                if level == 1:
+                    l1.append((score, detail_result))
+                elif level == 2:
+                    l2.append((score, detail_result))
+                elif level == 3:
+                    l3.append((score, detail_result))
+        l1_scores = polish_scores([item[1] for item in l1])
+        l2_scores = polish_scores([item[1] for item in l2])
+        l3_scores = polish_scores([item[1] for item in l3])
+        print(l1_scores, l2_scores, l3_scores)
+
+        l1_total_score = sum(l1_scores) / len(l1_scores)
+        l2_total_score = sum(l2_scores) / len(l2_scores)
+        l3_total_score = sum(l3_scores) / len(l3_scores)
+
+        total_score = round((sum(l1_scores) + sum(l2_scores) + sum(l3_scores)) / (len(l1) + len(l2) + len(l3)), 2)
+
+        # add to eval_results
+        new_eval_result = {
+            "model": model,
+            "model_family": model_family,
+            "url": url,
+            "organisation": organisation,
+            "score": total_score,
+            "score_level1": l1_total_score,
+            "score_level2": l2_total_score,
+            "score_level3": l3_total_score,
+            "date": datetime.datetime.now().strftime("%Y-%m-%d")
+        }
+        print(new_eval_result)
+
+        eval_results_list = list(eval_results)
+        eval_results_list.append(new_eval_result)
+        eval_results = Dataset.from_list(eval_results_list, features=eval_results.features)
+        eval_results.push_to_hub(EVALUATE_RESULT_DATASET, token=TOKEN)
 
         return format_log(f"Model {model} was submitted successfully by {organisation}.\nPlease wait a few hours, then refresh the leaderboard to see your score.")
     except Exception as e:

@@ -318,13 +396,6 @@ with demo:
         ],
         submission_result,
     )
-    with gr.Row():
-        new_sub_btn = gr.Button("New submission")
-        new_sub_btn.click(
-            new_submission,
-            inputs=[],
-            outputs=[],
-        )
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=3600)
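A note on the new save_contact_info helper above: it swaps the old in-place concatenate_datasets update for a load, append, re-upload round trip against the Hub. Below is a minimal standalone sketch of that pattern, assuming huggingface_hub's HfApi and hf_hub_download; the repo id and file name are illustrative placeholders, not the values from env.py.

import json
import os
import tempfile

from huggingface_hub import HfApi, hf_hub_download

REPO_ID = "my-org/contact_info"              # hypothetical dataset repo
FILE_IN_REPO = "example_contact_info.json"   # hypothetical file in that repo
TOKEN = os.getenv("HF_TOKEN")

def append_record(record: dict) -> None:
    api = HfApi(token=TOKEN)
    # 1) Fetch the current JSON file; start from an empty list if it is missing
    try:
        local_path = hf_hub_download(repo_id=REPO_ID, filename=FILE_IN_REPO,
                                     repo_type="dataset", token=TOKEN)
        with open(local_path) as f:
            records = json.load(f)
    except Exception:
        records = []

    # 2) Append the new record and serialize to a temporary file
    records.append(record)
    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as tmp:
        json.dump(records, tmp, default=str, indent=4)
        tmp_path = tmp.name

    # 3) Re-upload the whole file; each call produces one commit on the repo
    api.upload_file(path_or_fileobj=tmp_path, path_in_repo=FILE_IN_REPO,
                    repo_id=REPO_ID, repo_type="dataset")
    os.unlink(tmp_path)

Because every submission rewrites the entire file in its own commit, two concurrent submissions can race and one append can be lost; the change accepts that trade-off for simplicity.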
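To make the aggregation above concrete: polish_scores returns a 3-tuple of (answer, reasoning, tool-use) point ratios per level, so each lX_total_score is the mean of three ratios, while total_score divides the nine summed ratios by the number of tasks. A toy recomputation with invented numbers shows that the two denominators differ (3 components vs. task count):

# Hypothetical per-level ratio tuples as polish_scores would return them
l1_scores = (0.90, 0.80, 0.70)
l2_scores = (0.60, 0.50, 0.40)
l3_scores = (0.30, 0.20, 0.10)
n_l1, n_l2, n_l3 = 4, 3, 3          # hypothetical task counts: len(l1), len(l2), len(l3)

l1_total = sum(l1_scores) / len(l1_scores)   # 2.4 / 3 = 0.8
l2_total = sum(l2_scores) / len(l2_scores)   # 1.5 / 3 = 0.5
l3_total = sum(l3_scores) / len(l3_scores)   # 0.6 / 3 = 0.2

# Overall score as computed in the diff: nine ratios summed, divided by 10 tasks
total = round((sum(l1_scores) + sum(l2_scores) + sum(l3_scores)) / (n_l1 + n_l2 + n_l3), 2)
print(l1_total, l2_total, l3_total, total)   # 0.8 0.5 0.2 0.45

Also worth noting: a level with zero matching tasks would make both polish_scores and the per-level mean divide by zero.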
env.py
CHANGED

@@ -49,4 +49,4 @@ CONTACT_DATASET = f"{OWNER}/contact_info"
 
 BENCHMARK_INTERNAL_EVALUATE_DATASET_FILE = f"{VERSION}/{os.getenv('BENCHMARK_INTERNAL_EVALUATE_DATASET', 'example_evaluate_data.json')}"
 EVALUATE_RESULT_DATASET_FILE = f"{VERSION}/{os.getenv('EVALUATE_RESULT_DATASET', 'example_result.json')}"
-CONTACT_DATASET_FILE = f"{
+CONTACT_DATASET_FILE = f"{os.getenv('CONTACT_DATASET_FILE', 'example_contact_info.json')}"
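The removed line had been cut off mid f-string, a SyntaxError the moment env.py is imported, which matches the "Runtime error" status at the top of the page; the added line completes it. The f-string wrapper is redundant here, since os.getenv already returns a string once a default is supplied, so an equivalent plain form would be:

import os

# Equivalent to the fixed line, without the redundant f-string wrapper
CONTACT_DATASET_FILE = os.getenv('CONTACT_DATASET_FILE', 'example_contact_info.json')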
evaluator.py
CHANGED

@@ -276,12 +276,8 @@ Evaluation Rules:
             continue
         return 0.0, None
 
-    async def a_evaluate(self, task_id:str, answer:Answer) -> EvaluateScore | None:
+    async def a_evaluate(self, task_id:str, answer:Answer, to_evaluate_item: BenchmarkItem) -> EvaluateScore | None:
         import asyncio
-        to_evaluate_item = [item for item in self.benchmark_data if item.task_id == task_id]
-        if not to_evaluate_item:
-            return None
-        to_evaluate_item = to_evaluate_item[0]
         tasks = [
             self.evaluate_answer(answer, to_evaluate_item),
             self.evaluate_reasoning(answer, to_evaluate_item),

@@ -311,6 +307,7 @@ Evaluation Rules:
         detail += f"Tool use score: {sum([item.score for item in tool_use_evaulate_result.items])} / {sum([item.points for item in benchmark_tool_use_items])}\n"
         for item in tool_use_evaulate_result.items:
             detail += f"{item.reason}\n"
+        print(detail)
         return EvaluateScore(
             model_name=self.model_name,
             answer_score=answer_evaulate_result.score,

@@ -327,6 +324,6 @@ Evaluation Rules:
 async def ensemble_evaluate(evaulator_list:list[Evaluator], answer:Answer, to_evaluate_item:BenchmarkItem) -> tuple[float, list[EvaluateScore]]:
     # for evaluator in evaulator_list:
     #     await evaluator.load_validate_data()
-    results = await asyncio.gather(*[evaluator.a_evaluate(to_evaluate_item.task_id, answer) for evaluator in evaulator_list])
+    results = await asyncio.gather(*[evaluator.a_evaluate(to_evaluate_item.task_id, answer, to_evaluate_item) for evaluator in evaulator_list])
     return sum([result.total_score for result in results if result]) / len([result for result in results if result]), [result for result in results if result]
 
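The signature change threads to_evaluate_item into a_evaluate directly, so each evaluator no longer re-scans self.benchmark_data by task_id (the caller of ensemble_evaluate has already resolved the item). A minimal sketch of the resulting fan-out-and-average pattern, using stubbed stand-ins for Evaluator and EvaluateScore:

import asyncio
from dataclasses import dataclass

@dataclass
class FakeScore:              # stand-in for EvaluateScore
    total_score: float

class FakeEvaluator:          # stand-in for Evaluator
    def __init__(self, bias: float):
        self.bias = bias

    async def a_evaluate(self, task_id: str, answer: str, item: dict) -> FakeScore | None:
        await asyncio.sleep(0)            # an LLM-judge call would go here
        return FakeScore(total_score=0.5 + self.bias)

async def ensemble(evaluators, answer, item):
    # Fan out to every evaluator concurrently, then average the valid results
    results = await asyncio.gather(*[e.a_evaluate(item["task_id"], answer, item)
                                     for e in evaluators])
    valid = [r for r in results if r]     # drop evaluators that returned None
    return sum(r.total_score for r in valid) / len(valid), valid

score, details = asyncio.run(
    ensemble([FakeEvaluator(0.1), FakeEvaluator(0.3)], "42", {"task_id": "t1"}))
print(score)  # 0.7, the mean of 0.6 and 0.8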
schemas.py
CHANGED

@@ -88,6 +88,7 @@ class QuestionData(BaseModel):
 
 class BenchmarkItem(BaseModel):
     task_id: str
+    level:Optional[int] = 1
     question: str = Field(description="The question to be answered")
     # answer: Answer = Field(description="The agent system output")
     evaluate: EvaluateData = Field(description="The evaluation result")
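Because the new level field is Optional[int] with a default of 1, benchmark JSON written before this commit (with no level key) still validates, and every legacy item lands in level 1. A quick check with a trimmed-down model (only the two fields needed to show the default):

from typing import Optional
from pydantic import BaseModel

class BenchmarkItemDemo(BaseModel):   # trimmed stand-in for BenchmarkItem
    task_id: str
    level: Optional[int] = 1

print(BenchmarkItemDemo(task_id="t1").level)           # 1 (default applied)
print(BenchmarkItemDemo(task_id="t2", level=3).level)  # 3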
score.py
CHANGED

@@ -37,43 +37,6 @@ async def run_evaluate(evaluator_list:list[Evaluator], agent_output_item:AgentOu
     )
     return await ensemble_evaluate(evaluator_list, answer, to_evaluate_item)
 
-# async def main():
-#     #load llm config
-#     parse_llm_config = llm_configs["parse_llm_config"]
-#     evaluate_llm_configs = llm_configs["evaluate_llm_configs"]
-#     #load agent output dataset
-#     agent_output_dataset = load_agent_output_dataset()
-#     #load evaluate dataset
-#     evaluator_list: list[Evaluator] = []
-#     for evaluate_llm_config in evaluate_llm_configs:
-#         for _ in range(3):
-#             evaluator = Evaluator(
-#                 dataset_path="dataset/example_evaluate_data.json",
-#                 parse_model=parse_llm_config["model_name"],
-#                 parse_model_api_key=parse_llm_config.get("api_key", None),
-#                 parse_model_base_url=parse_llm_config.get("base_url", None),
-#                 api_key=evaluate_llm_config.get("api_key", None),
-#                 model_name=evaluate_llm_config["model_name"],
-#                 base_url=evaluate_llm_config.get("base_url", None),
-#                 **evaluate_llm_config.get("model_params",{})
-#             )
-#             evaluator_list.append(evaluator)
-#     evaluate_dataset = await evaluator.load_validate_data()
-#     #evaluate
-#     # run parallel
-#     for agent_output_item in agent_output_dataset:
-#         task_id = agent_output_item.task_id
-#         to_evaluate_item = [item for item in evaluate_dataset if item.task_id == task_id][0]
-#         answer = Answer(
-#             answer=agent_output_item.answer,
-#             reasoning_steps=agent_output_item.reasoning_list,
-#             function_calls=agent_output_item.tool_use_list
-#         )
-#         score,results = await ensemble_evaluate(evaluator_list, answer, to_evaluate_item)
-#         print(f"Task ID: {task_id}")
-#         print(f"Score: {score}")
-#         # print(results)
-
 async def score_item(evaluator_list:list[Evaluator], agent_output_item:AgentOutputItem, to_evaluate_item:BenchmarkItem) -> tuple[float, list[EvaluateScore]]:
     answer = Answer(
         answer=agent_output_item.answer,

@@ -82,3 +45,15 @@ async def score_item(evaluator_list:list[Evaluator], agent_output_item:AgentOutp
     )
     return await ensemble_evaluate(evaluator_list, answer, to_evaluate_item)
 
+
+
+
+def polish_scores(scores:list[EvaluateScore]) -> tuple[float, float, float]:
+    answer_scores = [score.answer_score for score in scores]
+    total_answer_scores = [score.answer_total_score for score in scores]
+    reasoning_scores = [score.reasoning_score for score in scores]
+    total_reasoning_scores = [score.reasoning_total_score for score in scores]
+    tool_use_scores = [score.tool_use_score for score in scores]
+    total_tool_use_scores = [score.tool_use_total_score for score in scores]
+    return sum(answer_scores) / sum(total_answer_scores), sum(reasoning_scores) / sum(total_reasoning_scores), sum(tool_use_scores) / sum(total_tool_use_scores)
+
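The new polish_scores is a micro-average: earned points and available points are summed across all tasks before dividing, so a task with more available points weighs more than it would under a per-task mean. A worked example with two invented score records (plain dicts standing in for EvaluateScore):

# Two hypothetical records: (earned, available) points per component
scores = [
    dict(answer_score=8, answer_total_score=10, reasoning_score=3, reasoning_total_score=5,
         tool_use_score=1, tool_use_total_score=2),
    dict(answer_score=5, answer_total_score=10, reasoning_score=4, reasoning_total_score=5,
         tool_use_score=2, tool_use_total_score=2),
]

answer_ratio    = sum(s["answer_score"] for s in scores)    / sum(s["answer_total_score"] for s in scores)     # 13/20 = 0.65
reasoning_ratio = sum(s["reasoning_score"] for s in scores) / sum(s["reasoning_total_score"] for s in scores)  # 7/10 = 0.70
tool_ratio      = sum(s["tool_use_score"] for s in scores)  / sum(s["tool_use_total_score"] for s in scores)   # 3/4 = 0.75

print(answer_ratio, reasoning_ratio, tool_ratio)  # 0.65 0.7 0.75

An empty scores list makes every denominator zero, so a level with no tasks raises ZeroDivisionError in the caller.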