Zhejian committed on
Commit 031a6d6 · 1 Parent(s): 32b2b23
Files changed (5)
  1. app.py +103 -32
  2. env.py +1 -1
  3. evaluator.py +3 -6
  4. schemas.py +1 -0
  5. score.py +12 -37
app.py CHANGED
@@ -21,12 +21,10 @@ from content import (
21
  format_log,
22
  )
23
  from evaluator import Evaluator
24
- from score import init_evaluators, score_item
25
  from loguru import logger
26
 
27
 
28
- # from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
29
-
30
  from datasets import load_dataset, VerificationMode, Dataset, concatenate_datasets
31
 
32
  from utils import parse_eval_dataset, parseaddr
@@ -41,7 +39,8 @@ from env import (
41
  SUBMISSION_DATASET,
42
  INTERNAL_DATASET,
43
  EVALUATE_RESULT_DATASET,
44
- REPO_ID
 
45
  )
46
 
47
  TOKEN = os.getenv("HF_TOKEN")
@@ -61,6 +60,41 @@ benchmark_dataset = parse_eval_dataset(benchmark_internal_evaluate_dataset) # ty
61
  evaluator_list = init_evaluators(benchmark_dataset, llm_config)
62
 
63
 
64
  def get_dataframe_from_results(eval_results, split:str = 'train'):
65
  try:
66
  if hasattr(eval_results, "__getitem__"):
@@ -80,7 +114,6 @@ def get_dataframe_from_results(eval_results, split:str = 'train'):
80
  print(f"Error applying model hyperlink: {e}")
81
  pass
82
 
83
- # Rename columns
84
  column_renames = {
85
  "model": "Agent name",
86
  "model_family": "Model family",
@@ -108,7 +141,6 @@ def get_dataframe_from_results(eval_results, split:str = 'train'):
108
  except:
109
  pass
110
 
111
- # Process numeric values
112
  try:
113
  numeric_cols = [c for c in df.columns if "score" in c.lower()]
114
  if numeric_cols:
@@ -157,6 +189,14 @@ def add_new_eval(
157
  profile: gr.OAuthProfile,
158
  ):
159
  try:
160
  if not LOCAL_DEBUG:
161
  print(profile)
162
  print(path_to_file)
@@ -168,12 +208,11 @@ def add_new_eval(
168
  if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=1):
169
  return styled_error("This account is not authorized to submit on CAIA.")
170
 
171
- contact_infos = load_dataset(INTERNAL_DATASET, data_files=CONTACT_DATASET_FILE, token=TOKEN,
172
  download_mode="force_redownload",
173
  verification_mode=VerificationMode.NO_CHECKS,
174
  trust_remote_code=True)
175
 
176
- # print("Contact infos features:", contact_infos['train'].features)
177
 
178
  user_submission_dates = []
179
  try:
@@ -185,6 +224,7 @@ def add_new_eval(
185
  print(f"Error getting user submission dates: {e}")
186
 
187
  user_submission_dates = sorted(user_submission_dates)
 
188
  if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
189
  return styled_error("You already submitted once today, please try again tomorrow.")
190
 
@@ -228,31 +268,69 @@ def add_new_eval(
228
  "organisation": organisation,
229
  "username": profile.username,
230
  "mail": mail,
231
- "date": pd.Timestamp(datetime.datetime.now()).floor('ns')
232
  }
233
- print("contact_info", contact_info)
234
- temp_file_path = "temp_contact_info.json"
235
- with open(temp_file_path, 'w') as f:
236
- json.dump(contact_info_list, f)
237
- to_add = Dataset.from_list([contact_info], features=contact_infos['train'].features)
238
- new_data= concatenate_datasets([contact_infos['train'], to_add])
239
- contact_infos['train'] = new_data
240
  if LOCAL_DEBUG:
241
  print("mock uploaded contact info")
242
  else:
243
- contact_infos.push_to_hub(INTERNAL_DATASET, config_name = VERSION, token=TOKEN)
244
- upload_file(
245
- path_or_fileobj=temp_file_path,
246
- path_in_repo=CONTACT_DATASET_FILE, # target JSON file path
247
- repo_id=INTERNAL_DATASET,
248
- token=TOKEN
249
- )
250
 
251
  # SCORE SUBMISSION
252
  file_path = path_to_file.name
253
  print("模拟评分过程...")
254
- # with open(file_path, 'r') as f:
255
- # ...
256
 
257
  return format_log(f"Model {model} was submitted successfully by {organisation}.\nPlease wait a few hours, then refresh the leaderboard to see your score.")
258
  except Exception as e:
@@ -318,13 +396,6 @@ with demo:
318
  ],
319
  submission_result,
320
  )
321
- with gr.Row():
322
- new_sub_btn = gr.Button("New submission")
323
- new_sub_btn.click(
324
- new_submission,
325
- inputs=[],
326
- outputs=[],
327
- )
328
 
329
  scheduler = BackgroundScheduler()
330
  scheduler.add_job(restart_space, "interval", seconds=3600)
 
21
  format_log,
22
  )
23
  from evaluator import Evaluator
24
+ from score import init_evaluators, score_item, load_agent_output_dataset, polish_scores
25
  from loguru import logger
26
 
27
 
 
 
28
  from datasets import load_dataset, VerificationMode, Dataset, concatenate_datasets
29
 
30
  from utils import parse_eval_dataset, parseaddr
 
39
  SUBMISSION_DATASET,
40
  INTERNAL_DATASET,
41
  EVALUATE_RESULT_DATASET,
42
+ REPO_ID,
43
+ CONTACT_DATASET
44
  )
45
 
46
  TOKEN = os.getenv("HF_TOKEN")
 
60
  evaluator_list = init_evaluators(benchmark_dataset, llm_config)
61
 
62
 
63
+
64
+ def save_contact_info(contact_info):
65
+ import tempfile
66
+ import json
67
+
68
+ # Load existing contact info
69
+ try:
70
+ contact_infos = load_dataset(
71
+ CONTACT_DATASET,
72
+ data_files=CONTACT_DATASET_FILE,
73
+ token=TOKEN,
74
+ download_mode="force_redownload",
75
+ verification_mode=VerificationMode.NO_CHECKS,
76
+ trust_remote_code=True
77
+ )
78
+ contact_info_list = list(contact_infos['train'])
79
+ except Exception as e:
80
+ print(f"Error loading contact info: {e}")
81
+ contact_info_list = []
82
+
83
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as temp_file:
84
+ json.dump(contact_info_list, temp_file, default=str, indent=4)
85
+ temp_file_path = temp_file.name
86
+
87
+ API.upload_file(
88
+ path_or_fileobj=temp_file_path,
89
+ path_in_repo=CONTACT_DATASET_FILE,
90
+ repo_id=CONTACT_DATASET,
91
+ repo_type='dataset',
92
+ token=TOKEN,
93
+ commit_message=f"Add new contact: {contact_info['model']} by {contact_info['organisation']}"
94
+ )
95
+
96
+ os.unlink(temp_file_path)
97
+
98
  def get_dataframe_from_results(eval_results, split:str = 'train'):
99
  try:
100
  if hasattr(eval_results, "__getitem__"):
 
114
  print(f"Error applying model hyperlink: {e}")
115
  pass
116
 
 
117
  column_renames = {
118
  "model": "Agent name",
119
  "model_family": "Model family",
 
141
  except:
142
  pass
143
 
 
144
  try:
145
  numeric_cols = [c for c in df.columns if "score" in c.lower()]
146
  if numeric_cols:
 
189
  profile: gr.OAuthProfile,
190
  ):
191
  try:
192
+ # Check whether the uploaded file is valid JSON
193
+ try:
194
+ with open(path_to_file, 'r', encoding='utf-8') as f:
195
+ json.load(f) # try to parse the JSON
196
+ except json.JSONDecodeError:
197
+ return styled_error("Please upload a valid JSON file.")
198
+ except Exception as e:
199
+ return styled_error(f"File read error: {str(traceback.format_exc())}")
200
  if not LOCAL_DEBUG:
201
  print(profile)
202
  print(path_to_file)
 
208
  if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=1):
209
  return styled_error("This account is not authorized to submit on CAIA.")
210
 
211
+ contact_infos = load_dataset(CONTACT_DATASET, data_files=CONTACT_DATASET_FILE, token=TOKEN,
212
  download_mode="force_redownload",
213
  verification_mode=VerificationMode.NO_CHECKS,
214
  trust_remote_code=True)
215
 
 
216
 
217
  user_submission_dates = []
218
  try:
 
224
  print(f"Error getting user submission dates: {e}")
225
 
226
  user_submission_dates = sorted(user_submission_dates)
227
+ user_submission_dates = [date.strftime('%Y-%m-%d') if isinstance(date, pd.Timestamp) else datetime.datetime.strptime(str(date), '%Y-%m-%d %H:%M:%S.%f').strftime('%Y-%m-%d') for date in user_submission_dates if date]
228
  if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
229
  return styled_error("You already submitted once today, please try again tomorrow.")
230
 
 
268
  "organisation": organisation,
269
  "username": profile.username,
270
  "mail": mail,
271
+ "date": pd.Timestamp(datetime.datetime.now())
272
  }
273
  if LOCAL_DEBUG:
274
  print("mock uploaded contact info")
275
  else:
276
+ save_contact_info(contact_info)
277
 
278
  # SCORE SUBMISSION
279
  file_path = path_to_file.name
280
  print("模拟评分过程...")
281
+
282
+
283
+ agent_output = load_agent_output_dataset(dataset_path=file_path)
284
+ agent_output_task_ids = set(output.task_id for output in agent_output)
285
+ benchmark_task_ids = set(item.task_id for item in benchmark_dataset)
286
+ if agent_output_task_ids != benchmark_task_ids:
287
+ return styled_error("The task IDs in agent outputs do not match the task IDs in benchmark dataset.")
288
+ l1,l2,l3 = [],[],[]
289
+ for output in agent_output:
290
+ task_id = output.task_id
291
+ to_evaluate_item = [item for item in benchmark_dataset if item.task_id == task_id]
292
+ if not to_evaluate_item:
293
+ # score,detail_result = 0.0, None
294
+ continue
295
+ else:
296
+ level = to_evaluate_item[0].level
297
+ score, detail_result = asyncio.run(score_item(evaluator_list=evaluator_list, agent_output_item=output, to_evaluate_item=to_evaluate_item[0]))
298
+ print(score, task_id, level)
299
+ if level == 1:
300
+ l1.append((score, detail_result))
301
+ elif level == 2:
302
+ l2.append((score, detail_result))
303
+ elif level == 3:
304
+ l3.append((score, detail_result))
305
+ l1_scores = polish_scores([item[1] for item in l1])
306
+ l2_scores = polish_scores([item[1] for item in l2])
307
+ l3_scores = polish_scores([item[1] for item in l3])
308
+ print(l1_scores, l2_scores, l3_scores)
309
+
310
+ l1_total_score = sum(l1_scores) / len(l1_scores)
311
+ l2_total_score = sum(l2_scores) / len(l2_scores)
312
+ l3_total_score = sum(l3_scores) / len(l3_scores)
313
+
314
+ total_score = round((sum(l1_scores) + sum(l2_scores) + sum(l3_scores)) / (len(l1) + len(l2) + len(l3)), 2)
315
+
316
+ # add to eval_results
317
+ new_eval_result = {
318
+ "model": model,
319
+ "model_family": model_family,
320
+ "url": url,
321
+ "organisation": organisation,
322
+ "score": total_score,
323
+ "score_level1": l1_total_score,
324
+ "score_level2": l2_total_score,
325
+ "score_level3": l3_total_score,
326
+ "date": datetime.datetime.now().strftime("%Y-%m-%d")
327
+ }
328
+ print(new_eval_result)
329
+
330
+ eval_results_list = list(eval_results)
331
+ eval_results_list.append(new_eval_result)
332
+ eval_results = Dataset.from_list(eval_results_list, features=eval_results.features)
333
+ eval_results.push_to_hub(EVALUATE_RESULT_DATASET, token=TOKEN)
334
 
335
  return format_log(f"Model {model} was submitted successfully by {organisation}.\nPlease wait a few hours, then refresh the leaderboard to see your score.")
336
  except Exception as e:
 
396
  ],
397
  submission_result,
398
  )
399
 
400
  scheduler = BackgroundScheduler()
401
  scheduler.add_job(restart_space, "interval", seconds=3600)
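Note: the new submission path in app.py validates the uploaded JSON, buckets per-task scores by difficulty level, and averages them into the leaderboard entry. A simplified sketch of that level-wise averaging with hypothetical plain floats (the actual code works on the ratio tuples returned by polish_scores):

# Hypothetical per-task scores grouped by level; stand-ins for the real results.
scores_by_level = {1: [0.8, 0.6], 2: [0.5], 3: [0.9, 0.7, 0.4]}

# Average within each level, then across all tasks; a simplified view of the aggregation above.
level_averages = {
    level: round(sum(values) / len(values), 2)
    for level, values in scores_by_level.items()
}
all_values = [v for values in scores_by_level.values() for v in values]
total_score = round(sum(all_values) / len(all_values), 2)

print(level_averages)  # {1: 0.7, 2: 0.5, 3: 0.67}
print(total_score)     # 0.65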
env.py CHANGED
@@ -49,4 +49,4 @@ CONTACT_DATASET = f"{OWNER}/contact_info"
49
 
50
  BENCHMARK_INTERNAL_EVALUATE_DATASET_FILE = f"{VERSION}/{os.getenv('BENCHMARK_INTERNAL_EVALUATE_DATASET', 'example_evaluate_data.json')}"
51
  EVALUATE_RESULT_DATASET_FILE = f"{VERSION}/{os.getenv('EVALUATE_RESULT_DATASET', 'example_result.json')}"
52
- CONTACT_DATASET_FILE = f"{VERSION}/{os.getenv('CONTACT_DATASET', 'example_contact.json')}"
 
49
 
50
  BENCHMARK_INTERNAL_EVALUATE_DATASET_FILE = f"{VERSION}/{os.getenv('BENCHMARK_INTERNAL_EVALUATE_DATASET', 'example_evaluate_data.json')}"
51
  EVALUATE_RESULT_DATASET_FILE = f"{VERSION}/{os.getenv('EVALUATE_RESULT_DATASET', 'example_result.json')}"
52
+ CONTACT_DATASET_FILE = f"{os.getenv('CONTACT_DATASET_FILE', 'example_contact_info.json')}"
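Note: the env.py change reads the contact file name from its own CONTACT_DATASET_FILE environment variable and drops the version prefix. A quick sketch of the fallback behaviour:

import os

# Mirrors the new env.py line: falls back to the bundled example file when the variable is unset.
CONTACT_DATASET_FILE = f"{os.getenv('CONTACT_DATASET_FILE', 'example_contact_info.json')}"
print(CONTACT_DATASET_FILE)  # 'example_contact_info.json' unless the env var overrides it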
evaluator.py CHANGED
@@ -276,12 +276,8 @@ Evaluation Rules:
276
  continue
277
  return 0.0, None
278
 
279
- async def a_evaluate(self, task_id:str, answer:Answer) -> EvaluateScore | None:
280
  import asyncio
281
- to_evaluate_item = [item for item in self.benchmark_data if item.task_id == task_id]
282
- if not to_evaluate_item:
283
- return None
284
- to_evaluate_item = to_evaluate_item[0]
285
  tasks = [
286
  self.evaluate_answer(answer, to_evaluate_item),
287
  self.evaluate_reasoning(answer, to_evaluate_item),
@@ -311,6 +307,7 @@ Evaluation Rules:
311
  detail += f"Tool use score: {sum([item.score for item in tool_use_evaulate_result.items])} / {sum([item.points for item in benchmark_tool_use_items])}\n"
312
  for item in tool_use_evaulate_result.items:
313
  detail += f"{item.reason}\n"
 
314
  return EvaluateScore(
315
  model_name=self.model_name,
316
  answer_score=answer_evaulate_result.score,
@@ -327,6 +324,6 @@ Evaluation Rules:
327
  async def ensemble_evaluate(evaulator_list:list[Evaluator], answer:Answer, to_evaluate_item:BenchmarkItem) -> tuple[float, list[EvaluateScore]]:
328
  # for evaluator in evaulator_list:
329
  # await evaluator.load_validate_data()
330
- results = await asyncio.gather(*[evaluator.a_evaluate(to_evaluate_item.task_id, answer) for evaluator in evaulator_list])
331
  return sum([result.total_score for result in results if result]) / len([result for result in results if result]), [result for result in results if result]
332
 
 
276
  continue
277
  return 0.0, None
278
 
279
+ async def a_evaluate(self, task_id:str, answer:Answer, to_evaluate_item: BenchmarkItem) -> EvaluateScore | None:
280
  import asyncio
281
  tasks = [
282
  self.evaluate_answer(answer, to_evaluate_item),
283
  self.evaluate_reasoning(answer, to_evaluate_item),
 
307
  detail += f"Tool use score: {sum([item.score for item in tool_use_evaulate_result.items])} / {sum([item.points for item in benchmark_tool_use_items])}\n"
308
  for item in tool_use_evaulate_result.items:
309
  detail += f"{item.reason}\n"
310
+ print(detail)
311
  return EvaluateScore(
312
  model_name=self.model_name,
313
  answer_score=answer_evaulate_result.score,
 
324
  async def ensemble_evaluate(evaulator_list:list[Evaluator], answer:Answer, to_evaluate_item:BenchmarkItem) -> tuple[float, list[EvaluateScore]]:
325
  # for evaluator in evaulator_list:
326
  # await evaluator.load_validate_data()
327
+ results = await asyncio.gather(*[evaluator.a_evaluate(to_evaluate_item.task_id, answer, to_evaluate_item) for evaluator in evaulator_list])
328
  return sum([result.total_score for result in results if result]) / len([result for result in results if result]), [result for result in results if result]
329
 
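Note: a_evaluate now receives the BenchmarkItem from its caller instead of searching benchmark_data by task_id, while ensemble_evaluate still gathers all evaluators concurrently and averages the scores of those that returned a result. A runnable sketch of that gather-and-average pattern, using a hypothetical stand-in class rather than the real Evaluator:

import asyncio

class FakeEvaluator:
    # Stand-in for Evaluator; the real a_evaluate returns an EvaluateScore or None.
    def __init__(self, score):
        self._score = score

    async def a_evaluate(self, task_id, answer, to_evaluate_item):
        return self._score

async def demo():
    evaluators = [FakeEvaluator(0.5), FakeEvaluator(0.9), FakeEvaluator(None)]
    results = await asyncio.gather(
        *[e.a_evaluate("task-1", answer=None, to_evaluate_item=None) for e in evaluators]
    )
    # Keep only evaluators that produced a result, as ensemble_evaluate does.
    valid = [r for r in results if r is not None]
    print(round(sum(valid) / len(valid), 2))  # 0.7

asyncio.run(demo())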
schemas.py CHANGED
@@ -88,6 +88,7 @@ class QuestionData(BaseModel):
88
 
89
  class BenchmarkItem(BaseModel):
90
  task_id: str
 
91
  question: str = Field(description="The question to be answered")
92
  # answer: Answer = Field(description="The agent system output")
93
  evaluate: EvaluateData = Field(description="The evaluation result")
 
88
 
89
  class BenchmarkItem(BaseModel):
90
  task_id: str
91
+ level:Optional[int] = 1
92
  question: str = Field(description="The question to be answered")
93
  # answer: Answer = Field(description="The agent system output")
94
  evaluate: EvaluateData = Field(description="The evaluation result")
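Note: schemas.py adds an optional level field to BenchmarkItem, defaulting to 1 so existing data without the field still validates. A reduced stand-in model showing the effect (not the full BenchmarkItem definition):

from typing import Optional
from pydantic import BaseModel

class ItemSketch(BaseModel):
    # Reduced stand-in; the real BenchmarkItem also carries question and evaluate fields.
    task_id: str
    level: Optional[int] = 1

print(ItemSketch(task_id="t-001").level)           # 1 (default)
print(ItemSketch(task_id="t-002", level=3).level)  # 3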
score.py CHANGED
@@ -37,43 +37,6 @@ async def run_evaluate(evaluator_list:list[Evaluator], agent_output_item:AgentOu
37
  )
38
  return await ensemble_evaluate(evaluator_list, answer, to_evaluate_item)
39
 
40
- # async def main():
41
- # #load llm config
42
- # parse_llm_config = llm_configs["parse_llm_config"]
43
- # evaluate_llm_configs = llm_configs["evaluate_llm_configs"]
44
- # #load agent output dataset
45
- # agent_output_dataset = load_agent_output_dataset()
46
- # #load evaluate dataset
47
- # evaluator_list: list[Evaluator] = []
48
- # for evaluate_llm_config in evaluate_llm_configs:
49
- # for _ in range(3):
50
- # evaluator = Evaluator(
51
- # dataset_path="dataset/example_evaluate_data.json",
52
- # parse_model=parse_llm_config["model_name"],
53
- # parse_model_api_key=parse_llm_config.get("api_key", None),
54
- # parse_model_base_url=parse_llm_config.get("base_url", None),
55
- # api_key=evaluate_llm_config.get("api_key", None),
56
- # model_name=evaluate_llm_config["model_name"],
57
- # base_url=evaluate_llm_config.get("base_url", None),
58
- # **evaluate_llm_config.get("model_params",{})
59
- # )
60
- # evaluator_list.append(evaluator)
61
- # evaluate_dataset = await evaluator.load_validate_data()
62
- # #evaluate
63
- # # run parallel
64
- # for agent_output_item in agent_output_dataset:
65
- # task_id = agent_output_item.task_id
66
- # to_evaluate_item = [item for item in evaluate_dataset if item.task_id == task_id][0]
67
- # answer = Answer(
68
- # answer=agent_output_item.answer,
69
- # reasoning_steps=agent_output_item.reasoning_list,
70
- # function_calls=agent_output_item.tool_use_list
71
- # )
72
- # score,results = await ensemble_evaluate(evaluator_list, answer, to_evaluate_item)
73
- # print(f"Task ID: {task_id}")
74
- # print(f"Score: {score}")
75
- # # print(results)
76
-
77
  async def score_item(evaluator_list:list[Evaluator], agent_output_item:AgentOutputItem, to_evaluate_item:BenchmarkItem) -> tuple[float, list[EvaluateScore]]:
78
  answer = Answer(
79
  answer=agent_output_item.answer,
@@ -82,3 +45,15 @@ async def score_item(evaluator_list:list[Evaluator], agent_output_item:AgentOutp
82
  )
83
  return await ensemble_evaluate(evaluator_list, answer, to_evaluate_item)
84
 
37
  )
38
  return await ensemble_evaluate(evaluator_list, answer, to_evaluate_item)
39
 
40
  async def score_item(evaluator_list:list[Evaluator], agent_output_item:AgentOutputItem, to_evaluate_item:BenchmarkItem) -> tuple[float, list[EvaluateScore]]:
41
  answer = Answer(
42
  answer=agent_output_item.answer,
 
45
  )
46
  return await ensemble_evaluate(evaluator_list, answer, to_evaluate_item)
47
 
48
+
49
+
50
+
51
+ def polish_scores(scores:list[EvaluateScore]) -> tuple[float, float, float]:
52
+ answer_scores = [score.answer_score for score in scores]
53
+ total_answer_scores = [score.answer_total_score for score in scores]
54
+ reasoning_scores = [score.reasoning_score for score in scores]
55
+ total_reasoning_scores = [score.reasoning_total_score for score in scores]
56
+ tool_use_scores = [score.tool_use_score for score in scores]
57
+ total_tool_use_scores = [score.tool_use_total_score for score in scores]
58
+ return sum(answer_scores) / sum(total_answer_scores), sum(reasoning_scores) / sum(total_reasoning_scores), sum(tool_use_scores) / sum(total_tool_use_scores)
59
+
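Note: polish_scores pools the per-evaluator results into three ratios, summing answer, reasoning, and tool-use scores and dividing each by its summed maximum points. A small worked example with hypothetical records carrying the same fields the function reads, in place of real EvaluateScore objects:

from types import SimpleNamespace

# Hypothetical score records with the attributes polish_scores accesses.
records = [
    SimpleNamespace(answer_score=3, answer_total_score=4,
                    reasoning_score=2, reasoning_total_score=5,
                    tool_use_score=1, tool_use_total_score=2),
    SimpleNamespace(answer_score=1, answer_total_score=4,
                    reasoning_score=3, reasoning_total_score=5,
                    tool_use_score=2, tool_use_total_score=2),
]

answer_ratio = sum(r.answer_score for r in records) / sum(r.answer_total_score for r in records)
reasoning_ratio = sum(r.reasoning_score for r in records) / sum(r.reasoning_total_score for r in records)
tool_use_ratio = sum(r.tool_use_score for r in records) / sum(r.tool_use_total_score for r in records)

print(answer_ratio, reasoning_ratio, tool_use_ratio)  # 0.5 0.5 0.75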