Zhejian committed on
Commit e23f952 · 1 Parent(s): 5002e45
Files changed (2)
  1. app.py +19 -6
  2. evaluator.py +31 -11
app.py CHANGED
@@ -1,5 +1,6 @@
 import asyncio
 import datetime
+import threading
 import time
 from huggingface_hub import HfApi, list_repo_files
 from env import (
@@ -43,9 +44,9 @@ def format_score_result(score_results: list[EnsembleEvaluateScore]) -> tuple[flo
         l3.append(result.total_score)


-    l1_total_score = sum(l1) / len(l1) if len(l1) > 0 else 0
-    l2_total_score = sum(l2) / len(l2) if len(l2) > 0 else 0
-    l3_total_score = sum(l3) / len(l3) if len(l3) > 0 else 0
+    l1_total_score = round(sum(l1) / len(l1), 2) if len(l1) > 0 else 0
+    l2_total_score = round(sum(l2) / len(l2), 2) if len(l2) > 0 else 0
+    l3_total_score = round(sum(l3) / len(l3), 2) if len(l3) > 0 else 0

     total_score = round((sum(l1) + sum(l2) + sum(l3)) / (len(l1) + len(l2) + len(l3)), 2)
     return total_score, l1_total_score, l2_total_score, l3_total_score
@@ -56,8 +57,8 @@ def on_new_files(new_files):
     logger.info(f"New Files Found {new_files}")
     for file in new_files:
         file_name = file.split('/')[-1]
-        names = file_name.split('_')
-        model, organization = names[0], names[1]
+        names = file_name.split('<')
+        model, organization = names[0].split('>')[0], names[1].split('>')[0]

         json_data = read_json_file(file)
         if not json_data:
@@ -171,8 +172,20 @@ def monitor_hf_dataset(dataset_name, interval=60):
             on_new_files(new_files)
         last_files = current_files

+
+def start_monitoring_delayed(delay_seconds=30):
+    """Delay starting the monitoring task so the Space finishes starting up first."""
+    def start_monitor():
+        logger.info("Starting to monitor HuggingFace dataset changes...")
+        monitor_hf_dataset(SUBMISSION_DATASET, interval=60)
+
+    # Start the monitoring task in a thread
+    monitor_thread = threading.Thread(target=start_monitor, daemon=True)
+    threading.Timer(delay_seconds, monitor_thread.start).start()
+    logger.info(f"Monitoring task will start in {delay_seconds} seconds")
+
 if __name__ == "__main__":
-    monitor_hf_dataset(SUBMISSION_DATASET, interval=60)
+    start_monitoring_delayed(30)
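The core addition to app.py is the delayed-start pattern: a one-shot threading.Timer fires after delay_seconds and starts a daemon thread that runs the dataset-monitoring loop, so the Space can finish booting before polling begins. Below is a minimal, self-contained sketch of that pattern; poll_dataset, its print logging, and the short delays are illustrative placeholders rather than code from this commit.

```python
import threading
import time


def poll_dataset(interval=5):
    """Stand-in for monitor_hf_dataset: poll forever at a fixed interval."""
    while True:
        print("polling dataset for new files...")
        time.sleep(interval)


def start_monitoring_delayed(delay_seconds=3):
    """Start polling in a background thread after a startup delay."""
    # Daemon thread: it never blocks process shutdown.
    monitor_thread = threading.Thread(target=poll_dataset, daemon=True)
    # The timer fires once after delay_seconds and starts the thread.
    threading.Timer(delay_seconds, monitor_thread.start).start()
    print(f"monitoring will start in {delay_seconds} seconds")


if __name__ == "__main__":
    start_monitoring_delayed(3)
    # Keep the main thread alive so the daemon thread can run; in a Space
    # the web server's event loop plays this role.
    time.sleep(15)
```

Because the monitor runs as a daemon thread, it is killed whenever the process restarts, which is generally acceptable for a polling loop that re-lists the repository files on every interval.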
 
evaluator.py CHANGED
@@ -2,6 +2,7 @@ import json
 import asyncio
 import os
 from statistics import mean
+import traceback
 from typing import List, Optional, Type, TypeVar
 from tenacity import (
     retry,
@@ -143,9 +144,14 @@ NOTE:


     async def evaluate_reasoning(self, output_answer:Answer, benchmark_item:BenchmarkItem) -> tuple[float, Optional[ReasoningEvaluateResult]]:
+        system_prompt = """You are a professional evaluator for AI assistants in the crypto domain. You need to score the assistant's reasoning ability based on the given evaluation criteria and reasoning process. Please follow these steps during evaluation:
+1. Review the reasoning steps and understand whether each step's logic is relevant to the task and helps solve the problem.
+2. If there are no explicit reasoning steps, treat tool calls as an alternative form of reasoning steps and consider the reasoning process represented by the tool usage.
+3. Assess the completeness and rigor of the reasoning chain, judging whether each step is reasonable and accurate, and whether there are logical flaws or missing steps.
+4. Consider the information references and tool calls in the reasoning process, judge whether the information sources are sufficient, whether the tool usage is appropriate, and analyze the connections and dependencies between each step in the reasoning chain.
+5. According to the evaluation criteria, give a score for each criterion, with the score ranging from 0 to the maximum points for that criterion.
+"""
         reasoning_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.REASONING]
-        reasoning_step_prompt = "\n".join([step.to_prompt() for step in output_answer.reasoning_steps])
-        function_call_prompt = "\n".join([step.to_prompt(ignore_output=True) for step in output_answer.function_calls])
         if not reasoning_items:
             return 0.0, None
         prompt = f"""
@@ -153,25 +159,27 @@ Task ID: {benchmark_item.task_id}
 Question: {benchmark_item.question}
 To be evaluated Reasoning Steps:
 ```
-{reasoning_step_prompt}
+{"\n".join([step.to_prompt() for step in output_answer.reasoning_steps])}
 ```

 In addition, the following function calls are also part of the reasoning steps. The choose of the tool use and the arguments should be taken into account:
 ```
-{function_call_prompt}
+{"\n".join([step.to_prompt(ignore_output=True) for step in output_answer.function_calls])}
 ```

-Evaluation Rules:"""
+Evaluation Rules:
+"""
         for item in reasoning_items:
             prompt += f"{item.to_prompt()}\n"
         prompt += f"Now evaluate the reasoning steps based on the evaluation criteria, and give the score for each item in the range of 0 to the point the criteria worth."
+        # print(prompt)
         max_retries = 3
         retry_count = 0
         while retry_count < max_retries:
             try:
                 response = await self.client.chat.completions.create(
                     model=self.model_name,
-                    messages=[{"role": "user", "content": prompt}],
+                    messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}],
                     **self.model_params
                 )
                 content = response.choices[0].message.content
@@ -192,16 +200,21 @@ Evaluation Rules:"""
                 return 0.0, None

     async def evaluate_tool_use(self, output_answer:Answer, benchmark_item:BenchmarkItem) -> tuple[float, Optional[ToolUseEvaluateResult]]:
+        system_prompt = """You are a professional crypto AI assistant evaluator. You need to score the assistant's tools using ability according to the given criterias and the tool use output. When evaluating, you should follow the following steps:
+1. Take a brief look at the tool using, descriptions and input args, to make sure the tool using is correct/related to solving the task.
+2. Evaluate each step of the tool use to estimate the efficiency and accuracy of the tool use.
+3. Consider the continuity of tool calls: The return result of the previous tool call may affect the input arguments of the next tool call.
+"""
         tool_use_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.TOOL_USE]
         if not tool_use_items:
+            print(f"No tool use items for task {benchmark_item.task_id}")
             return 0.0, None
-        function_call_prompt = "\n".join([step.to_prompt(ignore_output=True) for step in output_answer.function_calls])
         prompt = f"""
 Task ID: {benchmark_item.task_id}
 Question: {benchmark_item.question}
 To be evaluated tool use:
 ```
-{function_call_prompt}
+{"\n".join([step.to_prompt() for step in output_answer.function_calls])}
 ```

 Evaluation Rules:
@@ -215,7 +228,7 @@ Evaluation Rules:
             try:
                 response = await self.client.chat.completions.create(
                     model=self.model_name,
-                    messages=[{"role": "user", "content": prompt}],
+                    messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}],
                     **self.model_params
                 )
                 content = response.choices[0].message.content
@@ -228,7 +241,7 @@ Evaluation Rules:
                     continue
                 return sum([item.score for item in result.items]), result
             except Exception as e:
-                print(f"Error evaluating tool use (attempt {retry_count + 1}/{max_retries}): {e}")
+                print(f"Error evaluating tool use (attempt {retry_count + 1}/{max_retries}): {traceback.format_exc()}")
                 retry_count += 1
                 if retry_count == max_retries:
                     return 0.0, None
@@ -237,6 +250,12 @@ Evaluation Rules:


     async def evaluate_answer(self, output_answer:Answer, benchmark_item:BenchmarkItem) -> tuple[float, Optional[AnswerEvaluateResult]]:
+        system_prompt = """You are a professional evaluator for crypto AI assistant answers. You need to score the AI assistant's final answer according to the given evaluation criteria. Please follow these steps during evaluation:
+1. Carefully read the task question and the AI assistant's final output, and determine whether the answer accurately and completely solves the task requirements and conforms to basic common sense.
+2. Check whether the facts, data, and reasoning process in the answer are correct, and whether there are logical errors, numerical errors, or fabricated facts.
+3. For specific numerical values, allow a certain range of error. If the criteria do not specify the error range, use a ±5% margin.
+4. For each evaluation criterion, give a score for each item, with the score ranging from 0 to the full score for that criterion.
+Please strictly follow the evaluation criteria to provide objective and fair scoring, and briefly explain your reasoning for the scores."""
         evaluate_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.ANSWER]
         if not evaluate_items:
             return 0.0, None
@@ -254,12 +273,13 @@ Evaluation Rules:
         for item in evaluate_items:
             prompt += f"{item.to_prompt()}\n"
         prompt += f"Now evaluate the output answer based on the evaluation criteria, and give the score for each item in the range of 0 to the point the criteria worth."
+        # print(prompt)
         max_retry = 3
        for _ in range(max_retry):
             try:
                 response = await self.client.chat.completions.create(
                     model=self.model_name,
-                    messages=[{"role": "user", "content": prompt}],
+                    messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}],
                     **self.model_params
                 )
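All three evaluator methods now follow the same call shape: a role-specific system prompt is sent alongside the task-specific user prompt, and failures inside the retry loop are logged with the full traceback via traceback.format_exc(). The sketch below shows that pattern in isolation, assuming an OpenAI-compatible async client; the model name, prompts, and retry count are placeholders, not values from this commit.

```python
import asyncio
import traceback

from openai import AsyncOpenAI

client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment


async def score_with_system_prompt(system_prompt: str, user_prompt: str,
                                    model: str = "gpt-4o-mini", max_retries: int = 3):
    """Send a system + user message pair, retrying on any failure."""
    retry_count = 0
    while retry_count < max_retries:
        try:
            response = await client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
            )
            return response.choices[0].message.content
        except Exception:
            # Log the full traceback, not just the exception message.
            print(f"attempt {retry_count + 1}/{max_retries} failed: {traceback.format_exc()}")
            retry_count += 1
    return None


if __name__ == "__main__":
    answer = asyncio.run(score_with_system_prompt(
        "You are an evaluator. Score the answer from 0 to 10 and explain briefly.",
        "Question: what is 2 + 2? Candidate answer: 4.",
    ))
    print(answer)
```

Printing traceback.format_exc() rather than the bare exception keeps the stack trace, which makes it easier to tell whether a retry was triggered by the API call itself or by parsing of the response.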