Zhejian committed on
Commit f3e6f32 · 1 Parent(s): 836d17c
Files changed (8)
  1. .gitignore +16 -0
  2. app.py +144 -0
  3. env.py +54 -0
  4. evaluator.py +347 -0
  5. requirements.txt +21 -0
  6. schemas.py +212 -0
  7. score.py +74 -0
  8. utils.py +81 -0
.gitignore ADDED
@@ -0,0 +1,16 @@
+ .vscode
+ .env
+ .env.local
+ .env.development.local
+ .env.test.local
+ .env.production.local
+
+
+ __pycache__/
+ pycache/
+ *.pyc
+ *.pyo
+ *.pyd
+ *.pyw
+ *.pyz
+ *.pywz
app.py ADDED
@@ -0,0 +1,144 @@
+ import asyncio
+ import datetime
+ import time
+ from huggingface_hub import list_repo_files
+ from env import (
+     REPO_ID, TOKEN, SUBMISSION_DATASET,
+     INTERNAL_DATASET, BENCHMARK_INTERNAL_EVALUATE_DATASET_FILE, EVALUATE_RESULT_DATASET,
+     llm_config
+ )
+ from loguru import logger
+ from schemas import AgentOutputItem, EnsembleEvaluateScore
+ from score import init_evaluators, score_in_threadpool
+
+ from datasets import load_dataset, VerificationMode, Dataset, concatenate_datasets
+
+ from utils import parse_eval_dataset
+
+ benchmark_internal_evaluate_dataset = load_dataset(INTERNAL_DATASET, data_files=BENCHMARK_INTERNAL_EVALUATE_DATASET_FILE, token=TOKEN, verification_mode=VerificationMode.NO_CHECKS, download_mode="force_redownload", trust_remote_code=True)
+
+
+ benchmark_dataset = parse_eval_dataset(benchmark_internal_evaluate_dataset)  # type: ignore
+ evaluator_list = init_evaluators(benchmark_dataset, llm_config)
+
+ def get_hf_dataset_files(dataset_name):
+     return set(list_repo_files(dataset_name, repo_type="dataset"))
+
+
+ def format_score_result(score_results: list[EnsembleEvaluateScore]) -> tuple[float, float, float, float]:
+     if len(score_results) == 0:
+         return 0.0, 0.0, 0.0, 0.0
+     l1, l2, l3 = [], [], []
+
+     for result in score_results:
+         if result.level == 1:
+             l1.append(
+                 result.total_score
+             )
+         elif result.level == 2:
+             l2.append(result.total_score)
+         elif result.level == 3:
+             l3.append(result.total_score)
+
+
+     l1_total_score = sum(l1) / len(l1) if len(l1) > 0 else 0
+     l2_total_score = sum(l2) / len(l2) if len(l2) > 0 else 0
+     l3_total_score = sum(l3) / len(l3) if len(l3) > 0 else 0
+
+     total_score = round((sum(l1) + sum(l2) + sum(l3)) / (len(l1) + len(l2) + len(l3)), 2)
+     return total_score, l1_total_score, l2_total_score, l3_total_score
+
+
+
+ def on_new_files(new_files):
+     logger.info(f"New Files Found {new_files}")
+     for file in new_files:
+         file_name = file.split('/')[-1]
+         names = file_name.split('_')
+         model, organization = names[0], names[1]
+
+         json_data = read_json_file(file)
+         if not json_data:
+             continue
+         agent_outputs = [AgentOutputItem(**item) for item in json_data]
+         score_results: list[EnsembleEvaluateScore] = asyncio.run(score_in_threadpool(
+             evaluator_list=evaluator_list,
+             agent_output_list=agent_outputs,
+             benchmark_data=benchmark_dataset
+         ))
+         total_score, l1_total_score, l2_total_score, l3_total_score = format_score_result(score_results)
+         # save to public result
+
+
+         # add to eval_results
+         new_eval_result = {
+             "model": model,
+             "model_family": "",
+             "url": "",
+             "organisation": organization,
+             "score": total_score,
+             "score_level1": l1_total_score,
+             "score_level2": l2_total_score,
+             "score_level3": l3_total_score,
+             "date": datetime.datetime.now().strftime("%Y-%m-%d")
+         }
+         print(new_eval_result)
+         eval_results = load_dataset(EVALUATE_RESULT_DATASET, token=TOKEN, split="train")
+         eval_results_list = list(eval_results)
+         eval_results_list.append(new_eval_result)
+         eval_results = Dataset.from_list(eval_results_list, features=eval_results.features)
+         eval_results.push_to_hub(EVALUATE_RESULT_DATASET, token=TOKEN)
+
+
+ def read_json_file(file_path):
+     """
+     Read JSON file and return its contents
+
+     Args:
+         file_path (str): Path to the JSON file
+
+     Returns:
+         dict/list: Contents of the JSON file
+     """
+     import json
+     from huggingface_hub import hf_hub_download
+
+     try:
+         # Download file from Hugging Face Hub
+         local_path = hf_hub_download(
+             repo_id=SUBMISSION_DATASET,
+             filename=file_path,
+             token=TOKEN
+         )
+
+         # Read JSON file
+         with open(local_path, 'r', encoding='utf-8') as f:
+             data = json.load(f)
+
+         logger.info(f"Successfully read file: {file_path}")
+         return data
+
+     except Exception as e:
+         logger.error(f"Error reading file {file_path}: {str(e)}")
+         return None
+
+
+
+
+ def monitor_hf_dataset(dataset_name, interval=60):
+     last_files = get_hf_dataset_files(dataset_name)
+     print(last_files)
+     while True:
+         time.sleep(interval)
+         current_files = get_hf_dataset_files(dataset_name)
+         print(current_files)
+         new_files = current_files - last_files
+         if new_files:
+             on_new_files(new_files)
+         last_files = current_files
+
+ if __name__ == "__main__":
+     monitor_hf_dataset(SUBMISSION_DATASET, interval=60)
+
+
+
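Note: app.py expects each submission to land in the submissions dataset as a JSON list of AgentOutputItem records, with the file basename following a model_organisation_... pattern (on_new_files splits the basename on "_" and takes the first two parts). A rough sketch of such an upload; the file name, repo path, and token below are placeholders, not values from this commit:

    # Sketch only: upload a submission that the monitor loop above would pick up.
    from huggingface_hub import upload_file

    upload_file(
        path_or_fileobj="my-agent_my-org_2025-01-01.json",  # local JSON list of AgentOutputItem records
        path_in_repo="my-agent_my-org_2025-01-01.json",     # hypothetical path; only the basename is parsed
        repo_id="cyberco/submissions_internal",             # SUBMISSION_DATASET from env.py
        repo_type="dataset",
        token="hf_xxx",                                     # placeholder write token
    )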
env.py ADDED
@@ -0,0 +1,54 @@
+ import os
+ OWNER = "cyberco"
+ VERSION = "2025_v1"
+ REPO_ID = f"{OWNER}/CAIA-Benchmark-Leaderboard"
+
+ TOKEN = os.getenv("HF_TOKEN")
+
+
+ SUBMISSION_DATASET_PUBLIC = f"{OWNER}/public_submissions"  # add the previously missing variable
+
+ INTERNAL_DATASET = f"{OWNER}/caia_internal"
+ EVALUATE_RESULT_DATASET = f"{OWNER}/public_results"
+ SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
+ CONTACT_DATASET = f"{OWNER}/contact_info"
+
+
+ BENCHMARK_INTERNAL_EVALUATE_DATASET_FILE = f"{VERSION}/{os.getenv('BENCHMARK_INTERNAL_EVALUATE_DATASET', 'example_evaluate_data.json')}"
+ EVALUATE_RESULT_DATASET_FILE = f"{VERSION}/{os.getenv('EVALUATE_RESULT_DATASET', 'example_result.json')}"
+ CONTACT_DATASET_FILE = f"{os.getenv('CONTACT_DATASET_FILE', 'example_contact_info.json')}"
+
+
+ llm_config = {
+     "parse_llm_config": {
+         "model_name": "gpt-4.1-mini-2025-04-14",
+         "api_key": os.getenv("OPENAI_API_KEY", None),
+         "model_params": {
+             "temperature": 0
+         }
+     },
+     "evaluate_llm_configs": [
+         {
+             "model_name": "o3-2025-04-16",
+             "api_key": os.getenv("OPENAI_API_KEY", None),
+             "model_params": {
+                 "reasoning_effort": "medium"
+             }
+         },
+         {
+             "model_name": "gpt-4.1",
+             "api_key": os.getenv("OPENAI_API_KEY", None),
+             "model_params": {
+                 "temperature": 0.2
+             }
+         },
+         {
+             "model_name": "deepseek-r1-250120",
+             "api_key": os.getenv("DEEPSEEK_API_KEY", None),
+             "base_url": os.getenv("DEEPSEEK_BASE_URL", None),
+             "model_params": {
+                 "temperature": 0.2
+             }
+         }
+     ]
+ }
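Note: env.py only reads configuration; the secrets themselves come from the process environment. A minimal sketch of the variables it expects, with placeholder values (DEEPSEEK_BASE_URL is passed to the OpenAI client, so it should point at an OpenAI-compatible endpoint):

    # Sketch only: set these before launching app.py (all values are placeholders).
    import os
    os.environ.setdefault("HF_TOKEN", "hf_xxx")           # read/write token for the Hub datasets
    os.environ.setdefault("OPENAI_API_KEY", "sk-xxx")     # parse model and OpenAI judge models
    os.environ.setdefault("DEEPSEEK_API_KEY", "sk-xxx")   # deepseek-r1 judge model
    os.environ.setdefault("DEEPSEEK_BASE_URL", "https://example.com/v1")  # placeholder endpoint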
evaluator.py ADDED
@@ -0,0 +1,347 @@
+ import json
+ import asyncio
+ import os
+ from statistics import mean
+ from typing import List, Optional, Type, TypeVar
+ from tenacity import (
+     retry,
+     retry_if_exception_type,
+     stop_after_attempt,
+     wait_exponential,
+ )
+ from pydantic import BaseModel
+ from schemas import (
+     Answer, EnsembleEvaluateScore, EvaluateData, QuestionData, BenchmarkItem,
+     EvaluateTarget, AnswerEvaluateResult, ReasoningEvaluateResult, ReasoningStep, ToolUse, ToolUseEvaluateResult,
+     EvaluateScore
+ )
+ from openai import AsyncClient
+ from utils import count_tokens, truncate_text
+
+
+
+ T = TypeVar("T", bound=BaseModel)
+
+
+ llm_config = {
+     "parse_llm_config": {
+         "model_name": "gpt-4.1-mini-2025-04-14",
+         "api_key": os.getenv("OPENAI_API_KEY", None),
+         "model_params": {
+             "temperature": 0
+         }
+     },
+     "evaluate_llm_configs": [
+         {
+             "model_name": "o3-2025-04-16",
+             "api_key": os.getenv("OPENAI_API_KEY", None),
+             "model_params": {
+                 "reasoning_effort": "medium"
+             }
+         },
+         {
+             "model_name": "gpt-4.1",
+             "api_key": os.getenv("OPENAI_API_KEY", None),
+             "base_url": "https://api.openai.com/v1",
+             "model_params": {
+                 "temperature": 0.2
+             }
+         },
+         {
+             "model_name": "deepseek-r1-250120",
+             "api_key": os.getenv("DEEPSEEK_API_KEY", None),
+             "base_url": os.getenv("DEEPSEEK_BASE_URL", None),
+             "model_params": {
+                 "temperature": 0.2
+             }
+         }
+     ]
+ }
+
+
+ class Evaluator:
+     def __init__(self,
+                  dataset: List[BenchmarkItem] = [],
+                  api_key: Optional[str] = None,
+                  model_name: str = "gpt-4.1",
+                  base_url: Optional[str] = None,
+                  parse_model: str = "gpt-4.1-mini",
+                  parse_model_api_key: Optional[str] = None,
+                  parse_model_base_url: Optional[str] = None,
+                  **model_params):
+         if not api_key or not parse_model_api_key:
+             raise ValueError("api_key and parse_model_api_key are required")
+         self.system_prompt = """
+         You are a helpful assistant that can evaluate the quality of a given answer.
+         """
+         # self.dataset_path = dataset_path
+         self.dataset = dataset
+         self.benchmark_data: List[BenchmarkItem] = []
+         self.model_name = model_name
+         self.base_url = base_url
+         self.parse_model = parse_model
+         self.model_params = model_params or {"temperature": 0.0}  # default parameters
+         self.parse_client = AsyncClient(api_key=parse_model_api_key, base_url=parse_model_base_url)
+         self.client = AsyncClient(api_key=api_key, base_url=self.base_url)
+         self.tool_output_max_tokens = 2000
+
+     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=15))
+     async def parse_str_to_format(self, string_output: Optional[str], target_data_class: Type[T]) -> Optional[T]:
+         if not string_output:
+             return None
+         try:
+             # the parse model always uses default parameters
+             response = await self.parse_client.beta.chat.completions.parse(
+                 model=self.parse_model,
+                 messages=[{"role": "user", "content": string_output}],
+                 response_format=target_data_class,
+                 temperature=0.0,
+             )
+             result = response.choices[0].message.parsed
+             if result:
+                 return result
+         except Exception as e:
+             print(f"Error parsing string to format: {e}")
+         return None
+
+
+
+     async def summarize_tool_use_output(self, question: str, tool_use_list: List[ToolUse]) -> list[ToolUse]:
+         """If the tool use output is too long, summarize it to keep only the important information"""
+         system_prompt = f"""
+         You are a helpful assistant that can summarize the tool use output. Your output format should be in the following format: "In order to solve <Task>, Invoked <tool_name> with <tool_input> and got <summarized_tool_output>"
+         NOTE:
+         1. Ignore the noise in the tool_output, only keep the important information that might help to solve/improve the possibility of solving the task.
+         2. If the tool_output is not related to the question, just summarize the tool_output to "No relevant information Found"
+         """
+         async def process_tool_use(tool_use: ToolUse) -> ToolUse:
+             if count_tokens(tool_use.tool_output, self.parse_model) > self.tool_output_max_tokens:
+                 user_prompt = f"""
+                 Question: {question}
+                 Tool use:
+                 {tool_use.to_prompt()}
+                 """
+
+                 response = await self.parse_client.chat.completions.create(
+                     model=self.parse_model,
+                     messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
+                     **self.model_params
+                 )
+
+                 content = response.choices[0].message.content
+                 if content:
+                     tool_use.tool_output = content
+                 else:
+                     tool_use.tool_output = truncate_text(tool_use.tool_output, self.parse_model, self.tool_output_max_tokens)
+
+             return tool_use
+
+         # process all tool_use entries in parallel
+         tasks = [process_tool_use(tool_use) for tool_use in tool_use_list]
+         tool_use_list = await asyncio.gather(*tasks)
+         return tool_use_list
+
+
+     async def evaluate_reasoning(self, output_answer: Answer, benchmark_item: BenchmarkItem) -> tuple[float, Optional[ReasoningEvaluateResult]]:
+         reasoning_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.REASONING]
+         reasoning_step_prompt = "\n".join([step.to_prompt() for step in output_answer.reasoning_steps])
+         function_call_prompt = "\n".join([step.to_prompt(ignore_output=True) for step in output_answer.function_calls])
+         if not reasoning_items:
+             return 0.0, None
+         prompt = f"""
+         Task ID: {benchmark_item.task_id}
+         Question: {benchmark_item.question}
+         To be evaluated Reasoning Steps:
+         ```
+         {reasoning_step_prompt}
+         ```
+
+         In addition, the following function calls are also part of the reasoning steps. The choice of the tool use and the arguments should be taken into account:
+         ```
+         {function_call_prompt}
+         ```
+
+         Evaluation Rules:"""
+         for item in reasoning_items:
+             prompt += f"{item.to_prompt()}\n"
+         prompt += f"Now evaluate the reasoning steps based on the evaluation criteria, and give the score for each item in the range of 0 to the points the criterion is worth."
+         # print(prompt)
+         max_retries = 3
+         retry_count = 0
+         while retry_count < max_retries:
+             try:
+                 response = await self.client.chat.completions.create(
+                     model=self.model_name,
+                     messages=[{"role": "user", "content": prompt}],
+                     **self.model_params
+                 )
+                 content = response.choices[0].message.content
+                 result = await self.parse_str_to_format(content, ReasoningEvaluateResult)
+                 if not result:
+                     retry_count += 1
+                     continue
+                 if sum([item.score for item in result.items]) > sum([item.points for item in reasoning_items]):
+                     retry_count += 1
+                     continue
+                 return sum([item.score for item in result.items]), result
+             except Exception as e:
+                 print(f"Error evaluating reasoning (attempt {retry_count + 1}/{max_retries}): {e}")
+                 retry_count += 1
+                 if retry_count == max_retries:
+                     return 0.0, None
+                 await asyncio.sleep(1)  # wait before retrying
+         return 0.0, None
+
+     async def evaluate_tool_use(self, output_answer: Answer, benchmark_item: BenchmarkItem) -> tuple[float, Optional[ToolUseEvaluateResult]]:
+         tool_use_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.TOOL_USE]
+         if not tool_use_items:
+             return 0.0, None
+         function_call_prompt = "\n".join([step.to_prompt(ignore_output=True) for step in output_answer.function_calls])
+         prompt = f"""
+         Task ID: {benchmark_item.task_id}
+         Question: {benchmark_item.question}
+         To be evaluated tool use:
+         ```
+         {function_call_prompt}
+         ```
+
+         Evaluation Rules:
+         """
+         for item in tool_use_items:
+             prompt += f"{item.to_prompt()}\n"
+         prompt += f"Now evaluate the tool use based on the evaluation criteria, and give the score for each item in the range of 0 to the points the criterion is worth."
+         max_retries = 3
+         retry_count = 0
+         while retry_count < max_retries:
+             try:
+                 response = await self.client.chat.completions.create(
+                     model=self.model_name,
+                     messages=[{"role": "user", "content": prompt}],
+                     **self.model_params
+                 )
+                 content = response.choices[0].message.content
+                 result = await self.parse_str_to_format(content, ToolUseEvaluateResult)
+                 if not result:
+                     retry_count += 1
+                     continue
+                 if sum([item.score for item in result.items]) > sum([item.points for item in tool_use_items]):
+                     retry_count += 1
+                     continue
+                 return sum([item.score for item in result.items]), result
+             except Exception as e:
+                 print(f"Error evaluating tool use (attempt {retry_count + 1}/{max_retries}): {e}")
+                 retry_count += 1
+                 if retry_count == max_retries:
+                     return 0.0, None
+                 await asyncio.sleep(1)  # wait before retrying
+         return 0.0, None
+
+
+     async def evaluate_answer(self, output_answer: Answer, benchmark_item: BenchmarkItem) -> tuple[float, Optional[AnswerEvaluateResult]]:
+         evaluate_items = [item for item in benchmark_item.evaluate.items if item.target == EvaluateTarget.ANSWER]
+         if not evaluate_items:
+             return 0.0, None
+
+         prompt = f"""
+         Task ID: {benchmark_item.task_id}
+         Question: {benchmark_item.question}
+         To be evaluated output:
+         ```
+         {output_answer.to_prompt()}
+         ```
+
+         Evaluation Rules:
+         """
+         for item in evaluate_items:
+             prompt += f"{item.to_prompt()}\n"
+         prompt += f"Now evaluate the output answer based on the evaluation criteria, and give the score for each item in the range of 0 to the points the criterion is worth."
+         # print(prompt)
+         max_retry = 3
+         for _ in range(max_retry):
+             try:
+                 response = await self.client.chat.completions.create(
+                     model=self.model_name,
+                     messages=[{"role": "user", "content": prompt}],
+                     **self.model_params
+                 )
+
+
+                 result = await self.parse_str_to_format(response.choices[0].message.content, AnswerEvaluateResult)
+                 if not result:
+                     continue
+                 if result.score > sum([item.points for item in evaluate_items]):
+                     continue
+                 return result.score, result
+             except Exception as e:
+                 print(f"Error evaluating answer: {e}")
+                 continue
+         return 0.0, None
+
+     async def a_evaluate(self, task_id: str, answer: Answer, to_evaluate_item: BenchmarkItem) -> EvaluateScore | None:
+         import asyncio
+         tasks = [
+             self.evaluate_answer(answer, to_evaluate_item),
+             self.evaluate_reasoning(answer, to_evaluate_item),
+             self.evaluate_tool_use(answer, to_evaluate_item),
+         ]
+         [(answer_score, answer_evaulate_result), (reasoning_score, reasoning_evaulate_result), (tool_use_score, tool_use_evaulate_result)] = await asyncio.gather(*tasks)
+
+         analysis = await self.analyze_evaulate_result(answer_evaulate_result, reasoning_evaulate_result, tool_use_evaulate_result, to_evaluate_item)
+         return analysis
+
+
+     async def analyze_evaulate_result(self,
+                                       answer_evaulate_result: AnswerEvaluateResult,
+                                       reasoning_evaulate_result: ReasoningEvaluateResult,
+                                       tool_use_evaulate_result: ToolUseEvaluateResult,
+                                       to_evaluate_item: BenchmarkItem) -> EvaluateScore:
+         """Analyze the evaluation results and produce the aggregated score"""
+         benchmark_answer_item = [item for item in to_evaluate_item.evaluate.items if item.target == EvaluateTarget.ANSWER][0]
+         benchmark_reasoning_items = [item for item in to_evaluate_item.evaluate.items if item.target == EvaluateTarget.REASONING]
+         benchmark_tool_use_items = [item for item in to_evaluate_item.evaluate.items if item.target == EvaluateTarget.TOOL_USE]
+         detail = ""
+         detail += f"Answer score: {answer_evaulate_result.score} / {benchmark_answer_item.points}\n"
+         detail += f"Reason: {answer_evaulate_result.reason}\n"
+         detail += f"Reasoning score: {sum([item.score for item in reasoning_evaulate_result.items])} / {sum([item.points for item in benchmark_reasoning_items])}\n"
+         for item in reasoning_evaulate_result.items:
+             detail += f"Reasoning step {item.step}: {item.reason} score: {item.score} / {benchmark_reasoning_items[item.step-1].points}\n"
+         detail += f"Tool use score: {sum([item.score for item in tool_use_evaulate_result.items])} / {sum([item.points for item in benchmark_tool_use_items])}\n"
+         for item in tool_use_evaulate_result.items:
+             detail += f"{item.reason}\n"
+         print(detail)
+         return EvaluateScore(
+             model_name=self.model_name,
+             answer_score=answer_evaulate_result.score,
+             answer_total_score=benchmark_answer_item.points,
+             reasoning_score=sum([item.score for item in reasoning_evaulate_result.items]),
+             reasoning_total_score=sum([item.points for item in benchmark_reasoning_items]),
+             tool_use_score=sum([item.score for item in tool_use_evaulate_result.items]),
+             tool_use_total_score=sum([item.points for item in benchmark_tool_use_items]),
+             total_score=answer_evaulate_result.score + sum([item.score for item in reasoning_evaulate_result.items]) + sum([item.score for item in tool_use_evaulate_result.items]),
+             evaluate_detail=detail,
+             task_id=to_evaluate_item.task_id,
+             level=to_evaluate_item.level or 1,
+             category=to_evaluate_item.category
+         )
+
+
+ async def ensemble_evaluate(evaulator_list: list[Evaluator], answer: Answer, to_evaluate_item: BenchmarkItem) -> EnsembleEvaluateScore:
+     # for evaluator in evaulator_list:
+     #     await evaluator.load_validate_data()
+     results: list[EvaluateScore | None] = await asyncio.gather(*[evaluator.a_evaluate(to_evaluate_item.task_id, answer, to_evaluate_item) for evaluator in evaulator_list])
+     results = [item for item in results if item]
+     return EnsembleEvaluateScore(
+         task_id=to_evaluate_item.task_id,
+         answer_total_score=mean([item.answer_total_score for item in results if item]),
+         reasoning_total_score=mean([item.reasoning_total_score for item in results if item]),
+         tool_use_total_score=mean([item.tool_use_total_score for item in results if item]),
+         total_score=mean([item.total_score for item in results if item]),
+         evaluate_detail="no detail",
+         answer_score=mean([item.answer_score for item in results if item]),
+         reasoning_score=mean([item.reasoning_score for item in results if item]),
+         tool_use_score=mean([item.tool_use_score for item in results if item]),
+         level=to_evaluate_item.level or 1,
+         category=to_evaluate_item.category,
+         model_name="ensemble result"
+     )
+
requirements.txt ADDED
@@ -0,0 +1,21 @@
+ APScheduler
+ black
+ datasets
+ gradio
+ gradio[oauth]
+ gradio_leaderboard==0.0.13
+ gradio_client
+ huggingface-hub>=0.18.0
+ matplotlib
+ numpy
+ pandas
+ python-dateutil
+ tqdm
+ transformers
+ tokenizers>=0.15.0
+ sentencepiece
+ pydantic==2.10.1
+ openai==1.78.1
+ tiktoken==0.9.0
+ tenacity==9.1.2
+ loguru
schemas.py ADDED
@@ -0,0 +1,212 @@
+ from math import isclose
+ from typing import List, Optional
+ from pydantic import BaseModel, Field, field_validator, model_validator
+ from enum import Enum
+
+ class EvaluateTarget(Enum):
+     ANSWER = "ANSWER"
+     REASONING = "REASONING"
+     TOOL_USE = "TOOL_USE"
+     SOURCES = "SOURCES"
+
+ class ToolUse(BaseModel):
+     call_id: str
+     tool_name: str
+     tool_description: str
+     tool_input: str
+     tool_output: str
+
+     def to_prompt(self, ignore_output: bool = False) -> str:
+         prompt = f"Tool Name: {self.tool_name}\n"
+         prompt += f"Tool Description: {self.tool_description}\n"
+         prompt += f"Tool Input: {self.tool_input}\n"
+         if not ignore_output:
+             prompt += f"Tool Output: {self.tool_output}\n"
+         return prompt
+
+
+ class ReasoningStep(BaseModel):
+     step: int
+     reasoning: Optional[str] = None
+     # function_call: Optional[ToolUse] = None
+
+     def to_prompt(self) -> str:
+         prompt = f"Step {self.step}:\n"
+         if self.reasoning:
+             prompt += f"Reasoning: {self.reasoning}\n"
+         # if self.function_call:
+         #     prompt += f"Function Call: {self.function_call.to_prompt()}\n"
+         return prompt
+
+ class Answer(BaseModel):
+     answer: str
+     reasoning_steps: List[ReasoningStep]
+     function_calls: List[ToolUse]
+     # sources: List[str]
+
+     def to_prompt(self) -> str:
+         prompt = f"Final Answer: {self.answer}\n"
+         return prompt
+
+
+
+ class EvaluateItem(BaseModel):
+     step: Optional[int] = None
+     target: EvaluateTarget
+     points: float
+     criteria: str
+
+     def to_prompt(self) -> str:
+         prompt = f"Step {self.step}:\n" if self.step else ""
+         prompt += f"Worth Points: {self.points}\n"
+         prompt += f"Criteria content: {self.criteria}\n"
+         return prompt
+
+ class EvaluateData(BaseModel):
+     items: List[EvaluateItem]
+
+     @field_validator('items')
+     @classmethod
+     def validate_total_points(cls, items: List[EvaluateItem]) -> List[EvaluateItem]:
+         total_points = sum(item.points for item in items)
+         if not isclose(total_points, 10.0, abs_tol=1e-6):
+             raise ValueError(f"The points of all evaluation items must sum to 10; current total: {total_points}")
+         return items
+
+
+ class QuestionData(BaseModel):
+     task_id: str
+     question: str
+     # tools: Optional[List[str]] = Field(description="The tools that can be used to answer the question")
+
+     def to_prompt(self) -> str:
+         prompt = f"Task ID: {self.task_id}\n"
+         prompt += f"Question: {self.question}\n"
+         return prompt
+
+
+
+ class BenchmarkItem(BaseModel):
+     task_id: str
+     level: Optional[int] = 1
+     category: str
+     question: str = Field(description="The question to be answered")
+     # answer: Answer = Field(description="The agent system output")
+     evaluate: EvaluateData = Field(description="The evaluation result")
+
+
+
+ class AnswerEvaluateResult(BaseModel):
+     reason: Optional[str] = None
+     score: float = Field(description="The score the answer is worth")
+
+     def __str__(self) -> str:
+         return f"Reason: {self.reason}\nScore: {self.score}"
+
+
+ class ReasoningEvaluateItem(BaseModel):
+     step: int
+     reason: Optional[str] = None
+     score: float = Field(description="The score the reasoning step is worth")
+
+     def __str__(self) -> str:
+         return f"Step: {self.step}\nReason: {self.reason}\nScore: {self.score}"
+
+ class ReasoningEvaluateResult(BaseModel):
+     items: List[ReasoningEvaluateItem]
+
+     def __str__(self) -> str:
+         return "\n".join([item.__str__() for item in self.items])
+
+
+ class ToolUseEvaluateItem(BaseModel):
+     reason: Optional[str] = None
+     score: float = Field(description="The score the tool use is worth")
+
+     def __str__(self) -> str:
+         return f"Reason: {self.reason}\nScore: {self.score}"
+
+ class ToolUseEvaluateResult(BaseModel):
+     items: List[ToolUseEvaluateItem]
+
+     def __str__(self) -> str:
+         return "\n".join([item.__str__() for item in self.items])
+
+
+
+ class AgentOutputItem(BaseModel):
+     task_id: str
+     answer: str
+     tool_use_list: List[ToolUse]
+     reasoning_list: List[ReasoningStep]
+
+     def to_prompt(self) -> str:
+         prompt = f"Task ID: {self.task_id}\n"
+         prompt += f"Answer: {self.answer}\n"
+         prompt += f"Tool Use List: {self.tool_use_list}\n"
+         prompt += f"Reasoning List: {self.reasoning_list}\n"
+         return prompt
+
+
+ class EvaluateScore(BaseModel):
+     answer_total_score: float = Field(description="The total score the answer is worth")
+     reasoning_total_score: float = Field(description="The total score the reasoning is worth")
+     tool_use_total_score: float = Field(description="The total score the tool use is worth")
+
+     answer_score: float = Field(description="The score the agent gets from the answer")
+     reasoning_score: float = Field(description="The score the agent gets from the reasoning")
+     tool_use_score: float = Field(description="The score the agent gets from the tool use")
+
+     total_score: float = Field(description="The total score of the agent")
+
+     evaluate_detail: Optional[str] = Field(description="The detail of the evaluation")
+     model_name: str
+     task_id: str
+     level: int
+     category: str
+
+
+     # @field_validator('total_score')
+     @field_validator('answer_score', 'reasoning_score', 'tool_use_score')
+     def non_negative(cls, v):
+         if v < 0:
+             raise ValueError('score cannot be negative')
+         return v
+
+     @field_validator('answer_score')
+     def check_answer_score(cls, v, info):
+         max_score = info.data.get('answer_total_score', 0)
+         if v > max_score:
+             raise ValueError('answer_score cannot exceed answer_total_score')
+         return v
+
+     @field_validator('reasoning_score')
+     def check_reasoning_score(cls, v, info):
+         max_score = info.data.get('reasoning_total_score', 0)
+         if v > max_score:
+             raise ValueError('reasoning_score cannot exceed reasoning_total_score')
+         return v
+
+     @field_validator('tool_use_score')
+     def check_tool_use_score(cls, v, info):
+         max_score = info.data.get('tool_use_total_score', 0)
+         if v > max_score:
+             raise ValueError('tool_use_score cannot exceed tool_use_total_score')
+         return v
+
+     @model_validator(mode='after')
+     def check_totals(self):
+         # Optional: cap the total score (the benchmark fixes it at 10 points)
+         if self.total_score > 10:
+             raise ValueError('total_score cannot exceed 10')
+
+         expected = self.answer_score + self.reasoning_score + self.tool_use_score
+         if not isclose(self.total_score, expected, abs_tol=1e-6):
+             raise ValueError(
+                 f'total_score ({self.total_score}) must equal the sum of '
+                 f'answer_score + reasoning_score + tool_use_score ({expected})'
+             )
+         return self
+
+ class EnsembleEvaluateScore(EvaluateScore):
+     ...
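For illustration, a single AgentOutputItem as it would appear in a submission file might be built like this; every field value below is invented:

    # Sketch only: construct and validate one submission record against the schemas above.
    from schemas import AgentOutputItem, ReasoningStep, ToolUse

    item = AgentOutputItem(
        task_id="task_001",  # must match a task_id in the benchmark dataset
        answer="The domain resolves to 203.0.113.7.",
        reasoning_list=[ReasoningStep(step=1, reasoning="Look up the domain with a DNS tool.")],
        tool_use_list=[ToolUse(
            call_id="call_1",
            tool_name="dns_lookup",
            tool_description="Resolve a domain name",
            tool_input="example.com",
            tool_output="203.0.113.7",
        )],
    )
    print(item.to_prompt())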
score.py ADDED
@@ -0,0 +1,74 @@
+ import asyncio
+ from concurrent.futures import ThreadPoolExecutor
+ import json
+ from typing import List
+ from evaluator import Evaluator, ensemble_evaluate
+ from schemas import AgentOutputItem, Answer, BenchmarkItem, EvaluateScore, EnsembleEvaluateScore
+
+
+ def init_evaluators(dataset: List[BenchmarkItem], llm_configs: dict) -> list[Evaluator]:
+     parse_llm_config = llm_configs["parse_llm_config"]
+     evaluate_llm_configs = llm_configs["evaluate_llm_configs"]
+     evaluator_list: list[Evaluator] = []
+     for evaluate_llm_config in evaluate_llm_configs:
+         for _ in range(3):
+             evaluator = Evaluator(
+                 dataset=dataset,
+                 parse_model=parse_llm_config["model_name"],
+                 parse_model_api_key=parse_llm_config.get("api_key", None),
+                 parse_model_base_url=parse_llm_config.get("base_url", None),
+                 api_key=evaluate_llm_config.get("api_key", None),
+                 model_name=evaluate_llm_config["model_name"],
+                 base_url=evaluate_llm_config.get("base_url", None),
+                 **evaluate_llm_config.get("model_params", {})
+             )
+             evaluator_list.append(evaluator)
+     return evaluator_list
+
+
+ def load_agent_output_dataset(dataset_path: str = "dataset/example_agent_output.json") -> list[AgentOutputItem]:
+     with open(dataset_path, "r") as f:
+         agent_output_dataset = json.load(f)
+     return [AgentOutputItem(**item) for item in agent_output_dataset]
+
+ async def run_evaluate(evaluator_list: list[Evaluator], agent_output_item: AgentOutputItem, to_evaluate_item: BenchmarkItem):
+     answer = Answer(
+         answer=agent_output_item.answer,
+         reasoning_steps=agent_output_item.reasoning_list,
+         function_calls=agent_output_item.tool_use_list
+     )
+     return await ensemble_evaluate(evaluator_list, answer, to_evaluate_item)
+
+ async def score_item(evaluator_list: list[Evaluator], agent_output_item: AgentOutputItem, to_evaluate_item: BenchmarkItem) -> EnsembleEvaluateScore:
+     answer = Answer(
+         answer=agent_output_item.answer,
+         reasoning_steps=agent_output_item.reasoning_list,
+         function_calls=agent_output_item.tool_use_list
+     )
+     return await ensemble_evaluate(evaluator_list, answer, to_evaluate_item)
+
+
+
+
+
+
+
+ async def score_in_threadpool(evaluator_list: list[Evaluator], agent_output_list: list[AgentOutputItem], benchmark_data: list[BenchmarkItem]) -> list[EnsembleEvaluateScore]:
+     with ThreadPoolExecutor(max_workers=max(1, min(5, len(agent_output_list)))) as executor:
+         futures = []
+         for agent_output_item in agent_output_list:
+             task_id = agent_output_item.task_id
+             to_evaluate_item = next((item for item in benchmark_data if item.task_id == task_id), None)
+
+             if to_evaluate_item:
+                 future = executor.submit(
+                     asyncio.run,
+                     score_item(
+                         evaluator_list=evaluator_list,
+                         agent_output_item=agent_output_item,
+                         to_evaluate_item=to_evaluate_item
+                     )
+                 )
+                 futures.append(future)
+
+         return [future.result() for future in futures]
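The same scoring path can be exercised offline, without the Hub monitor in app.py. A rough sketch, assuming local copies of the example benchmark and agent-output files and the llm_config from env.py (API keys must be set in the environment):

    # Sketch only: score a local agent-output file against a local benchmark file.
    import asyncio
    from datasets import load_dataset
    from env import llm_config
    from score import init_evaluators, load_agent_output_dataset, score_in_threadpool
    from utils import parse_eval_dataset

    benchmark = parse_eval_dataset(load_dataset("json", data_files="dataset/example_evaluate_data.json"))  # assumed local path
    evaluators = init_evaluators(benchmark, llm_config)
    outputs = load_agent_output_dataset("dataset/example_agent_output.json")
    scores = asyncio.run(score_in_threadpool(evaluators, outputs, benchmark))
    for s in scores:
        print(s.task_id, s.total_score)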
utils.py ADDED
@@ -0,0 +1,81 @@
+ import pandas as pd
+ import tiktoken
+ from typing import List, Optional
+ from email._parseaddr import AddressList as _AddressList
+
+ from schemas import BenchmarkItem, EvaluateData, EvaluateItem
+ from datasets import DatasetDict
+
+ def truncate_text(text: str, model: str = "gpt-4.1", max_tokens: Optional[int] = None) -> str:
+     """
+     Truncate text to specified token count using tiktoken
+
+     Args:
+         text: Text to be truncated
+         model: Model name to use, defaults to "gpt-4.1"
+         max_tokens: Maximum token count, if None then no truncation
+
+     Returns:
+         Truncated text
+     """
+     if not max_tokens:
+         return text
+
+     try:
+         encoding = tiktoken.encoding_for_model(model)
+     except KeyError:
+         # fall back to the cl100k_base encoder if the model has no registered encoder
+         encoding = tiktoken.get_encoding("cl100k_base")
+
+     tokens = encoding.encode(text)
+     if len(tokens) <= max_tokens:
+         return text
+
+     truncated_tokens = tokens[:max_tokens]
+     return encoding.decode(truncated_tokens)
+
+ def count_tokens(text: str, model: str = "gpt-4.1") -> int:
+     """
+     Count the number of tokens in a text using tiktoken
+
+     Args:
+         text: Text to count tokens
+         model: Model name to use, defaults to "gpt-4.1"
+
+     Returns:
+         Number of tokens in the text
+     """
+     try:
+         encoding = tiktoken.encoding_for_model(model)
+     except KeyError:
+         # fall back to the cl100k_base encoder if the model has no registered encoder
+         encoding = tiktoken.get_encoding("cl100k_base")
+     tokens = encoding.encode(text)
+     return len(tokens)
+
+
+
+ def parseaddr(addr):
+     """
+     Parse addr into its constituent realname and email address parts.
+
+     Return a tuple of realname and email address, unless the parse fails, in
+     which case return a 2-tuple of ('', '').
+     """
+     addrs = _AddressList(addr).addresslist
+     if not addrs:
+         return '', ''
+     return addrs[0]
+
+
+ def parse_eval_dataset(dataset: DatasetDict) -> List[BenchmarkItem]:
+
+     df = pd.DataFrame(dataset['train'])
+     benchmark_items: List[BenchmarkItem] = []
+     for index, row in df.iterrows():
+         benchmark_items.append(BenchmarkItem(
+             task_id=row['task_id'], level=row.get('level', 1),
+             question=row['question'], category=row.get('category', ""),
+             evaluate=EvaluateData(items=[EvaluateItem(**item) for item in row['evaluate']['items']])
+         ))
+     return benchmark_items