Spaces:

cyberco
/

CAIA-evaluate

Runtime error

App Files Files Community

Zhejian committed on Jun 3, 2025

Commit

e23f952

1 Parent(s): 5002e45

bugfix

Browse files

Files changed (2) hide show

app.py +19 -6
evaluator.py +31 -11

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import asyncio
 import datetime
 import time
 from huggingface_hub import HfApi, list_repo_files
 from env import (
@@ -43,9 +44,9 @@ def format_score_result(score_results: list[EnsembleEvaluateScore]) -> tuple[flo
             l3.append(result.total_score)
-    l1_total_score = sum(l1) / len(l1) if len(l1) > 0 else 0
-    l2_total_score = sum(l2) / len(l2) if len(l2) > 0 else 0
-    l3_total_score = sum(l3) / len(l3) if len(l3) > 0 else 0
     total_score = round((sum(l1) + sum(l2) + sum(l3)) / (len(l1) + len(l2) + len(l3)), 2)
     return total_score, l1_total_score, l2_total_score, l3_total_score
@@ -56,8 +57,8 @@ def on_new_files(new_files):
     logger.info(f"New Files Found {new_files}")
     for file in new_files:
         file_name = file.split('/')[-1]
-        names = file_name.split('_')
-        model, organization = names[0], names[1]
         json_data = read_json_file(file)
         if not json_data:
@@ -171,8 +172,20 @@ def monitor_hf_dataset(dataset_name, interval=60):
             on_new_files(new_files)
         last_files = current_files
 if __name__ == "__main__":
-    monitor_hf_dataset(SUBMISSION_DATASET, interval=60)

 import asyncio
 import datetime
+import threading
 import time
 from huggingface_hub import HfApi, list_repo_files
 from env import (
             l3.append(result.total_score)
+    l1_total_score = round(sum(l1) / len(l1),2) if len(l1) > 0 else 0
+    l2_total_score = round(sum(l2) / len(l2),2) if len(l2) > 0 else 0
+    l3_total_score = round(sum(l3) / len(l3),2) if len(l3) > 0 else 0
     total_score = round((sum(l1) + sum(l2) + sum(l3)) / (len(l1) + len(l2) + len(l3)), 2)
     return total_score, l1_total_score, l2_total_score, l3_total_score
     logger.info(f"New Files Found {new_files}")
     for file in new_files:
         file_name = file.split('/')[-1]
+        names = file_name.split('<')
+        model, organization = names[0].split('>')[0], names[1].split('>')[0]
         json_data = read_json_file(file)
         if not json_data:
             on_new_files(new_files)
         last_files = current_files
+def start_monitoring_delayed(delay_seconds=30):
+    """Start the monitoring task after a delay so the Space can finish starting up first."""
+    def start_monitor():
+        logger.info("开始监控 HuggingFace 数据集变化...")
+        monitor_hf_dataset(SUBMISSION_DATASET, interval=60)
+    # Use a thread to launch the monitoring task
+    monitor_thread = threading.Thread(target=start_monitor, daemon=True)
+    threading.Timer(delay_seconds, monitor_thread.start).start()
+    logger.info(f"监控任务将在 {delay_seconds} 秒后启动")
 if __name__ == "__main__":
+    start_monitoring_delayed(30)

evaluator.py CHANGED Viewed

@@ -2,6 +2,7 @@ import json
 import asyncio
 import os
 from statistics import mean
 from typing import List, Optional, Type, TypeVar
 from tenacity import (
     retry,
@@ -143,9 +144,14 @@ NOTE:
     async def evaluate_reasoning(self, output_answer:Answer, benchmark_item:BenchmarkItem) -> tuple[float, Optional[ReasoningEvaluateResult]]:
         reasoning_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.REASONING]
-        reasoning_step_prompt = "\n".join([step.to_prompt() for step in output_answer.reasoning_steps])
-        function_call_prompt = "\n".join([step.to_prompt(ignore_output=True) for step in output_answer.function_calls])
         if not reasoning_items:
             return 0.0, None
         prompt = f"""
@@ -153,25 +159,27 @@ Task ID: {benchmark_item.task_id}
 Question: {benchmark_item.question}
 To be evaluated Reasoning Steps:
 ```
-{reasoning_step_prompt}
 ```
 In addition, the following function calls are also part of the reasoning steps. The choose of the tool use and the arguments should be taken into account:
 ```
-{function_call_prompt}
 ```
-Evaluation Rules:"""
         for item in reasoning_items:
             prompt += f"{item.to_prompt()}\n"
         prompt += f"Now evaluate the reasoning steps based on the evaluation criteria, and give the score for each item in the range of 0 to the point the criteria worth."
         max_retries = 3
         retry_count = 0
         while retry_count < max_retries:
             try:
                 response = await self.client.chat.completions.create(
                     model=self.model_name,
-                    messages=[{"role": "user", "content": prompt}],
                     **self.model_params
                 )
                 content = response.choices[0].message.content
@@ -192,16 +200,21 @@ Evaluation Rules:"""
         return 0.0, None
     async def evaluate_tool_use(self, output_answer:Answer, benchmark_item:BenchmarkItem) -> tuple[float, Optional[ToolUseEvaluateResult]]:
         tool_use_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.TOOL_USE]
         if not tool_use_items:
             return 0.0, None
-        function_call_prompt = "\n".join([step.to_prompt(ignore_output=True) for step in output_answer.function_calls])
         prompt = f"""
 Task ID: {benchmark_item.task_id}
 Question: {benchmark_item.question}
 To be evaluated tool use:
 ```
-{function_call_prompt}
 ```
 Evaluation Rules:
@@ -215,7 +228,7 @@ Evaluation Rules:
             try:
                 response = await self.client.chat.completions.create(
                     model=self.model_name,
-                    messages=[{"role": "user", "content": prompt}],
                     **self.model_params
                 )
                 content = response.choices[0].message.content
@@ -228,7 +241,7 @@ Evaluation Rules:
                     continue
                 return sum([item.score for item in result.items]), result
             except Exception as e:
-                print(f"Error evaluating tool use (attempt {retry_count + 1}/{max_retries}): {e}")
                 retry_count += 1
                 if retry_count == max_retries:
                     return 0.0, None
@@ -237,6 +250,12 @@ Evaluation Rules:
     async def evaluate_answer(self, output_answer:Answer, benchmark_item:BenchmarkItem) -> tuple[float, Optional[AnswerEvaluateResult]]:
         evaluate_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.ANSWER]
         if not evaluate_items:
             return 0.0, None
@@ -254,12 +273,13 @@ Evaluation Rules:
         for item in evaluate_items:
             prompt += f"{item.to_prompt()}\n"
         prompt += f"Now evaluate the output answer based on the evaluation criteria, and give the score for each item in the range of 0 to the point the criteria worth."
         max_retry = 3
         for _ in range(max_retry):
             try:
                 response = await self.client.chat.completions.create(
                     model=self.model_name,
-                    messages=[{"role": "user", "content": prompt}],
                     **self.model_params
                 )

 import asyncio
 import os
 from statistics import mean
+import traceback
 from typing import List, Optional, Type, TypeVar
 from tenacity import (
     retry,
     async def evaluate_reasoning(self, output_answer:Answer, benchmark_item:BenchmarkItem) -> tuple[float, Optional[ReasoningEvaluateResult]]:
+        system_prompt = """You are a professional evaluator for AI assistants in the crypto domain. You need to score the assistant's reasoning ability based on the given evaluation criteria and reasoning process. Please follow these steps during evaluation:
+1. Review the reasoning steps and understand whether each step's logic is relevant to the task and helps solve the problem.
+2. If there are no explicit reasoning steps, treat tool calls as an alternative form of reasoning steps and consider the reasoning process represented by the tool usage.
+3. Assess the completeness and rigor of the reasoning chain, judging whether each step is reasonable and accurate, and whether there are logical flaws or missing steps.
+4. Consider the information references and tool calls in the reasoning process, judge whether the information sources are sufficient, whether the tool usage is appropriate, and analyze the connections and dependencies between each step in the reasoning chain.
+5. According to the evaluation criteria, give a score for each criterion, with the score ranging from 0 to the maximum points for that criterion.
+"""
         reasoning_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.REASONING]
         if not reasoning_items:
             return 0.0, None
         prompt = f"""
 Question: {benchmark_item.question}
 To be evaluated Reasoning Steps:
 ```
+{"\n".join([step.to_prompt() for step in output_answer.reasoning_steps])}
 ```
 In addition, the following function calls are also part of the reasoning steps. The choose of the tool use and the arguments should be taken into account:
 ```
+{"\n".join([step.to_prompt(ignore_output=True) for step in output_answer.function_calls])}
 ```
+Evaluation Rules:
+"""
         for item in reasoning_items:
             prompt += f"{item.to_prompt()}\n"
         prompt += f"Now evaluate the reasoning steps based on the evaluation criteria, and give the score for each item in the range of 0 to the point the criteria worth."
+        # print(prompt)
         max_retries = 3
         retry_count = 0
         while retry_count < max_retries:
             try:
                 response = await self.client.chat.completions.create(
                     model=self.model_name,
+                    messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}],
                     **self.model_params
                 )
                 content = response.choices[0].message.content
         return 0.0, None
     async def evaluate_tool_use(self, output_answer:Answer, benchmark_item:BenchmarkItem) -> tuple[float, Optional[ToolUseEvaluateResult]]:
+        system_prompt = """You are a professional crypto AI assistant evaluator. You need to score the assistant's tools using ability according to the given criterias and the tool use output. When evaluating, you should follow the following steps:
+1. Take a brief look at the tool using, descriptions and input args, to make sure the tool using is correct/related to solving the task.
+2. Evaluate each step of the tool use to estimate the efficiency and accuracy of the tool use.
+3. Consider the continuity of tool calls: The return result of the previous tool call may affect the input arguments of the next tool call.
+"""
         tool_use_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.TOOL_USE]
         if not tool_use_items:
+            print(f"No tool use items for task {benchmark_item.task_id}")
             return 0.0, None
         prompt = f"""
 Task ID: {benchmark_item.task_id}
 Question: {benchmark_item.question}
 To be evaluated tool use:
 ```
+{"\n".join([step.to_prompt() for step in output_answer.function_calls])}
 ```
 Evaluation Rules:
             try:
                 response = await self.client.chat.completions.create(
                     model=self.model_name,
+                    messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}],
                     **self.model_params
                 )
                 content = response.choices[0].message.content
                     continue
                 return sum([item.score for item in result.items]), result
             except Exception as e:
+                print(f"Error evaluating tool use (attempt {retry_count + 1}/{max_retries}): {traceback.format_exc()}")
                 retry_count += 1
                 if retry_count == max_retries:
                     return 0.0, None
     async def evaluate_answer(self, output_answer:Answer, benchmark_item:BenchmarkItem) -> tuple[float, Optional[AnswerEvaluateResult]]:
+        system_prompt = """You are a professional evaluator for crypto AI assistant answers. You need to score the AI assistant's final answer according to the given evaluation criteria. Please follow these steps during evaluation:
+1. Carefully read the task question and the AI assistant's final output, and determine whether the answer accurately and completely solves the task requirements and conforms to basic common sense.
+2. Check whether the facts, data, and reasoning process in the answer are correct, and whether there are logical errors, numerical errors, or fabricated facts.
+3. For specific numerical values, allow a certain range of error. If the criteria do not specify the error range, use a ±5% margin.
+4. For each evaluation criterion, give a score for each item, with the score ranging from 0 to the full score for that criterion.
+Please strictly follow the evaluation criteria to provide objective and fair scoring, and briefly explain your reasoning for the scores."""
         evaluate_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.ANSWER]
         if not evaluate_items:
             return 0.0, None
         for item in evaluate_items:
             prompt += f"{item.to_prompt()}\n"
         prompt += f"Now evaluate the output answer based on the evaluation criteria, and give the score for each item in the range of 0 to the point the criteria worth."
+        # print(prompt)
         max_retry = 3
         for _ in range(max_retry):
             try:
                 response = await self.client.chat.completions.create(
                     model=self.model_name,
+                    messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}],
                     **self.model_params
                 )