Spaces:

xiaobin123
/

Hawk

Runtime error

App Files Files Community

xiaobin123 commited on 6 days ago

Commit

07f1898

verified ·

1 Parent(s): b77683b

Delete gaia_test.py

Browse files

Files changed (1) hide show

gaia_test.py +0 -895

gaia_test.py DELETED Viewed

@@ -1,895 +0,0 @@
-"""
-测试脚本：支持 validation 和 test 数据集，支持全量、增量、混合三种模式
-- 全量模式（full）：不管文件是否存在，都删除重新开始
-- 增量模式（incremental）：如果文件存在则增量，不存在则全量执行
-- 混合模式（hybrid）：第一次时全量（文件不存在），后面就增量（文件存在）
-"""
-import argparse
-import json
-import os
-import re
-import time
-import traceback
-from collections import OrderedDict
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from threading import Lock
-import pandas as pd
-import requests
-from datasets import load_dataset
-from huggingface_hub import snapshot_download, hf_hub_download
-from pathlib import Path
-import shutil
-# --- 1. 配置区 ---
-BASE_URL = "http://localhost:5173/api/v1"
-CHAT_URL = f"{BASE_URL}/sessions/chat"
-UPLOAD_URL = f"{BASE_URL}/files"
-HEADERS = {"Authorization": "Bearer hawk_YhCZLQYqtPOwOiEyEgeCNdfAFAbrHtTUxQvRiaOInyekgVgE"}
-DATA_PATH = "./gaia_data"
-REQUEST_TIMEOUT = 1800
-MAX_CONCURRENT = 2
-file_lock = Lock()
-# 全局变量，根据 split 类型动态设置
-dataset = None
-OUTPUT_FILE = None
-SUBMISSION_FILE = None
-SPLIT_TYPE = None  # "validation" 或 "test"
-def check_and_download_dataset_files():
-    """
-    检查并下载完整的 GAIA 数据集文件到本地目录（包含 validation 和 test 的所有文件）
-    Returns:
-        bool: 如果文件已存在或下载成功返回 True，否则返回 False
-    """
-    base_target_dir = Path(DATA_PATH) / "2023"
-    validation_dir = base_target_dir / "validation"
-    test_dir = base_target_dir / "test"
-    # 检查两个目录是否都存在且有文件
-    validation_files = list(validation_dir.glob("*")) if validation_dir.exists() else []
-    test_files = list(test_dir.glob("*")) if test_dir.exists() else []
-    if validation_files and test_files:
-        print(f"✅ 检测到数据集文件已存在")
-        print(f"   validation 文件数: {len(validation_files)}")
-        print(f"   test 文件数: {len(test_files)}")
-        return True
-    # 需要下载数据集文件
-    print(f"📥 开始下载完整的 GAIA 数据集文件...")
-    print(f"   目标目录: {base_target_dir}")
-    try:
-        # 创建基础目录
-        base_target_dir.mkdir(parents=True, exist_ok=True)
-        # 下载完整数据集到临时目录，然后复制到目标目录
-        print("   步骤 1/4: 正在从 Hugging Face 下载完整数据集...")
-        print("   提示: 下载进度会显示在下方，请耐心等待...")
-        download_start = time.time()
-        # 使用 snapshot_download 下载完整数据集
-        cache_dir = snapshot_download(
-            repo_id="gaia-benchmark/GAIA",
-            repo_type="dataset",
-            local_dir=None,  # 使用默认缓存目录
-            resume_download=True
-        )
-        download_duration = time.time() - download_start
-        print(f"   ✅ 数据集下载完成，耗时 {download_duration:.2f} 秒")
-        cache_path = Path(cache_dir)
-        source_2023_dir = cache_path / "2023"
-        if not source_2023_dir.exists():
-            print(f"   ❌ 错误: 缓存目录中未找到 2023 目录")
-            return False
-        # 复制 validation 和 test 目录
-        print("   步骤 2/4: 正在复制 validation 文件...")
-        validation_source = source_2023_dir / "validation"
-        if validation_source.exists():
-            if validation_dir.exists():
-                shutil.rmtree(validation_dir)
-            shutil.copytree(validation_source, validation_dir)
-            validation_count = len(list(validation_dir.glob("*")))
-            print(f"   ✅ validation 文件复制完成，共 {validation_count} 个文件")
-        else:
-            print(f"   ⚠️  警告: 未找到 validation 目录")
-        print("   步骤 3/4: 正在复制 test 文件...")
-        test_source = source_2023_dir / "test"
-        if test_source.exists():
-            if test_dir.exists():
-                shutil.rmtree(test_dir)
-            shutil.copytree(test_source, test_dir)
-            test_count = len(list(test_dir.glob("*")))
-            print(f"   ✅ test 文件复制完成，共 {test_count} 个文件")
-        else:
-            print(f"   ⚠️  警告: 未找到 test 目录")
-        print("   步骤 4/4: 数据集文件准备完成！")
-        print(f"   目标目录: {base_target_dir}")
-        return True
-    except Exception as e:
-        print(f"   ❌ 下载数据集文件时出错: {e}")
-        import traceback
-        traceback.print_exc()
-        return False
-def build_ordered_record(task_id, question, level, agent_answer, duration, has_file,
-                         session_id=None, attachment_name=None, ground_truth=None, is_correct=None):
-    """
-    按照固定顺序构建记录字典，确保字段��序一致
-    Args:
-        task_id: 任务ID
-        question: 问题
-        level: 难度级别
-        agent_answer: Agent答案
-        duration: 执行时长
-        has_file: 是否有文件
-        session_id: 会话 ID
-        attachment_name: 附件名称（如果有附件）
-        ground_truth: 标准答案（仅validation数据集）
-        is_correct: 是否正确（仅validation数据集）
-    Returns:
-        OrderedDict: 按固定顺序排列的记录
-    """
-    record = OrderedDict()
-    record["task_id"] = task_id
-    record["question"] = question
-    record["level"] = level
-    record["duration"] = duration
-    record["has_file"] = has_file
-    # attachment_name: 如果有值就写入（即使 agent 出错也应该写入）
-    # 只要 attachment_name 不是 None 且不是空字符串，就写入
-    if attachment_name and attachment_name.strip():
-        record["attachment_name"] = attachment_name
-    # session_id: 如果有值就写入（如果 agent 出错可能为 None，不写入是合理的）
-    if session_id:
-        record["session_id"] = session_id
-    record["agent_answer"] = agent_answer
-    # validation 数据集特有字段
-    if ground_truth is not None:
-        record["ground_truth"] = ground_truth
-    if is_correct is not None:
-        record["is_correct"] = is_correct
-    return record
-def load_existing_results():
-    """
-    加载已有的测试结果文件
-    返回: dict, task_id -> 完整记录字典
-    """
-    if not os.path.exists(OUTPUT_FILE):
-        return {}
-    results = {}
-    try:
-        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
-            for line in f:
-                if not line.strip():
-                    continue
-                try:
-                    data = json.loads(line)
-                    task_id = data.get("task_id")
-                    if task_id:
-                        results[task_id] = data
-                except json.JSONDecodeError:
-                    continue
-        print(f"✅ 已加载 {len(results)} 条历史记录")
-    except Exception as e:
-        print(f"⚠️  加载历史记录时出错: {e}")
-        return {}
-    return results
-def update_result_in_file(task_id, new_record):
-    """
-    更新 jsonl 文件中指定 task_id 的记录
-    使用临时文件方式，确保线程安全
-    """
-    if not os.path.exists(OUTPUT_FILE):
-        # 如果文件不存在，直接写入
-        with file_lock:
-            with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
-                f.write(json.dumps(new_record, ensure_ascii=False) + "\n")
-        return
-    # 读取所有记录，更新指定记录，写回文件
-    with file_lock:
-        temp_file = OUTPUT_FILE + ".tmp"
-        updated = False
-        try:
-            with open(OUTPUT_FILE, "r", encoding="utf-8") as f_in, \
-                    open(temp_file, "w", encoding="utf-8") as f_out:
-                for line in f_in:
-                    if not line.strip():
-                        continue
-                    try:
-                        data = json.loads(line)
-                        if data.get("task_id") == task_id:
-                            # 更新这条记录
-                            f_out.write(json.dumps(new_record, ensure_ascii=False) + "\n")
-                            updated = True
-                        else:
-                            # 保持原记录
-                            f_out.write(line)
-                    except json.JSONDecodeError:
-                        continue
-                # 如果没找到要更新的记录，追加新记录
-                if not updated:
-                    f_out.write(json.dumps(new_record, ensure_ascii=False) + "\n")
-            # 替换原文件
-            os.replace(temp_file, OUTPUT_FILE)
-        except Exception as e:
-            # 如果出错，删除临时文件
-            if os.path.exists(temp_file):
-                os.remove(temp_file)
-            raise e
-def upload_file(local_path):
-    """上传文件并返回符合接口要求的 file_id 和 filename"""
-    try:
-        if not os.path.exists(local_path):
-            print(f"❌ 本地文件不存在: {local_path}")
-            return None
-        with open(local_path, 'rb') as f:
-            files = {'file': f}
-            response = requests.post(UPLOAD_URL, headers=HEADERS, files=files, timeout=60)
-            response.raise_for_status()
-            res_data = response.json()
-            if res_data.get("code") == 0:
-                file_info = res_data.get("data", {})
-                return {
-                    "file_id": file_info.get("file_id"),
-                    "filename": file_info.get("filename")
-                }
-            else:
-                print(f"❌ 上传接口返回错误: {res_data.get('msg')}")
-    except Exception as e:
-        print(f"❌ 文件上传异常 ({os.path.basename(local_path)}): {e}")
-    return None
-def extract_answer(text):
-    """从文本中提取答案"""
-    if not text:
-        return ""
-    pattern = r"(?si)<\s*answer\s*>\s*(.*?)\s*</\s*answer\s*>"
-    match = re.search(pattern, text)
-    if match:
-        ans = match.group(1).strip()
-        return re.sub(r'^["\']|["\']$', '', ans)
-    backup_pattern = r"(?i)answer\s*is[:：]\s*(.*)"
-    backup_match = re.search(backup_pattern, text)
-    if backup_match:
-        return backup_match.group(1).strip().rstrip('.')
-    lines = [l.strip() for l in text.strip().split('\n') if l.strip()]
-    return lines[-1] if lines else text.strip()
-def call_my_agent_safe(question, attachments=None, task_id=None):
-    """
-    发送对话请求，包含附件数组
-    Args:
-        question: 问题内容
-        attachments: 附件列表
-        task_id: 任务ID，用于确保会话隔离
-    Returns:
-        tuple: (parsed_answer, session_id, raw_content)
-    """
-    guided_prompt = (
-        f"{question}\n\n Important Requirement: \nprovide the final answer (the answer only, without explanation) inside the tags in the following format: <answer>your answer</answer>"
-    )
-    payload = {
-        "message": guided_prompt,
-        "streaming": False,
-        "attachments": attachments if attachments else [],
-        "recycle_sandbox": True,
-        # 明确指定创建新会话，避免会话内容混乱
-        # 如果 API 支持 session_id 参数，设置为 null 表示创建新会话
-        # 如果不支持，则不传递 session_id 参数（当前做法）
-    }
-    # 如果 API 支持，可以尝试以下方式之一来确保创建新会话：
-    # 1. payload["session_id"] = None  # 明确创建新会话
-    # 2. payload["new_session"] = True  # 如果 API 支持此参数
-    # 3. 在请求头中添加唯一标识
-    if task_id:
-        # 添加 task_id 作为请求标识，帮助后端区分不同请求，确保会话隔离
-        payload["task_id"] = task_id
-    # 在请求头中添加唯一标识，进一步确保请求隔离
-    # 如果后端支持，可以通过 X-Request-ID 或类似头部来区分请求
-    request_headers = HEADERS.copy()
-    if task_id:
-        # 添加 task_id 到请求头，帮助后端识别和隔离不同请求
-        request_headers["X-Task-ID"] = task_id
-    try:
-        response = requests.post(CHAT_URL, headers=request_headers, json=payload, timeout=(30, REQUEST_TIMEOUT))
-        response.raise_for_status()
-        res_data = response.json()
-        raw_content = (res_data.get("answer") or res_data.get("content") or res_data.get("response") or "").strip()
-        session_id = res_data.get("session_id")
-        parsed_answer = extract_answer(raw_content)
-        return parsed_answer, session_id, raw_content
-    except Exception as e:
-        error_traceback = traceback.format_exc()
-        return f"ERROR: {str(e)}", session_id, error_traceback
-def process_item(item, existing_results, mode):
-    """
-    处理单条数据：上传文件 -> 发起对话 -> 记录结果
-    hybrid + validation 模式下：如果记录已存在且 is_correct 为 true，则跳过 agent 调用，只刷新字段顺序
-    其他情况：所有记录都重新执行并刷新，确保字段顺序一致
-    （test 数据集没有 is_correct 字段，无法判断是否正确，所以总是重新执行）
-    Args:
-        item: 数据集项
-        existing_results: 已有结果字典
-        mode: 执行模式 ("full"、"incremental" 或 "hybrid")
-    """
-    task_id = item['task_id']
-    level = item.get('Level', 'Unknown')
-    question = item['Question']
-    file_name = item.get('file_name', "")
-    # hybrid + validation 模式下：如果记录已存在且成功，只刷新字段顺序，不调用 agent
-    # 只有 validation 数据集有 is_correct 字段，可以判断是否正确
-    if mode == "hybrid" and SPLIT_TYPE == "validation" and task_id in existing_results:
-        existing_record = existing_results[task_id]
-        if existing_record.get("is_correct", False):
-            # 已成功，只刷新字段顺序，不调用 agent
-            # 使用当前的 file_name 更新 attachment_name，确保数据一致性
-            current_has_file = bool(file_name)
-            current_attachment_name = file_name if file_name else None
-            record = build_ordered_record(
-                task_id=task_id,
-                question=existing_record.get("question", question),
-                level=existing_record.get("level", level),
-                agent_answer=existing_record.get("agent_answer", ""),
-                duration=existing_record.get("duration", 0),
-                has_file=current_has_file,
-                session_id=existing_record.get("session_id"),
-                attachment_name=current_attachment_name,
-                ground_truth=existing_record.get("ground_truth", ""),
-                is_correct=True
-            )
-            # 更新已有记录（刷新字段顺序）
-            update_result_in_file(task_id, record)
-            return task_id, True, "refreshed"
-    # 需要调用 agent 的情况（新记录、错误记录、或非 hybrid 模式）
-    attachments = []
-    # 1. 如果有文件，先执行上传
-    if file_name:
-        # 根据 split 类型选择不同的文件夹
-        folder = "validation" if SPLIT_TYPE == "validation" else "test"
-        local_file_path = os.path.abspath(os.path.join(DATA_PATH, "2023", folder, file_name))
-        upload_data = upload_file(local_file_path)
-        if upload_data:
-            attachments.append(upload_data)
-    # 2. 调用 Agent（传递 task_id 确保会话隔离）
-    start_time = time.time()
-    agent_answer, session_id, _ = call_my_agent_safe(question, attachments, task_id=task_id)
-    duration = time.time() - start_time
-    # 3. 构建记录（使用固定顺序）
-    if SPLIT_TYPE == "validation":
-        # validation 数据集：添加标准答案和正确性判断
-        ground_truth = str(item['Final answer']).strip()
-        clean_agent = str(agent_answer).lower().rstrip('.')
-        clean_gt = ground_truth.lower().rstrip('.')
-        is_correct = (clean_agent == clean_gt)
-        record = build_ordered_record(
-            task_id=task_id,
-            question=question,
-            level=level,
-            duration=round(duration, 2),
-            has_file=bool(file_name),
-            session_id=session_id,
-            attachment_name=file_name if file_name else None,
-            agent_answer=agent_answer,
-            ground_truth=ground_truth,
-            is_correct=is_correct
-        )
-        result_correct = is_correct
-    else:  # test 数据集：没有标准答案
-        record = build_ordered_record(
-            task_id=task_id,
-            question=question,
-            level=level,
-            duration=round(duration, 2),
-            has_file=bool(file_name),
-            session_id=session_id,
-            attachment_name=file_name if file_name else None,
-            agent_answer=agent_answer
-        )
-        result_correct = None
-    # 4. 更新或追加记录
-    if task_id in existing_results:
-        # 更新已有记录
-        update_result_in_file(task_id, record)
-        return task_id, result_correct, "updated"
-    else:
-        # 追加新记录
-        with file_lock:
-            with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
-                f.write(json.dumps(record, ensure_ascii=False) + "\n")
-        return task_id, result_correct, "new"
-def generate_submission():
-    """
-    生成官网提交格式文件
-    GAIA 提交格式要求：
-    - 文件格式：JSONL（每行一个 JSON 对象）
-    - 必需字段：task_id, model_answer
-    - 编码：UTF-8
-    - test 数据集需要包含所有 285 个用例的答案
-    """
-    if not os.path.exists(OUTPUT_FILE):
-        print(f"⚠️  警告：结果文件 {OUTPUT_FILE} 不存在，无法生成提交文件")
-        return
-    # 读取所有结果并按 task_id 排序（确保顺序一致）
-    results = []
-    with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
-        for line in f:
-            if not line.strip():
-                continue
-            try:
-                data = json.loads(line)
-                if "task_id" in data and "agent_answer" in data:
-                    results.append(data)
-            except json.JSONDecodeError:
-                continue
-    if not results:
-        print(f"⚠️  警告：结果文件 {OUTPUT_FILE} 中没有有效数据")
-        return
-    # 按 task_id 排序，确保顺序一致
-    results.sort(key=lambda x: x.get("task_id", ""))
-    # 生成提交文件
-    with open(SUBMISSION_FILE, "w", encoding="utf-8") as f_out:
-        for data in results:
-            submission_data = {
-                "task_id": data["task_id"],
-                "model_answer": str(data["agent_answer"])
-            }
-            f_out.write(json.dumps(submission_data, ensure_ascii=False) + "\n")
-    print(f"✅ 提交文件已生成: {SUBMISSION_FILE} (共 {len(results)} 条记录)")
-    # test 数据集验证：检查是否包含所有用例
-    if SPLIT_TYPE == "test":
-        expected_count = 285
-        if len(results) < expected_count:
-            print(f"⚠️  警告：test 数据集应该有 {expected_count} 个用例，当前只有 {len(results)} 个")
-        else:
-            print(f"✅ test 数据集已包含 {len(results)} 个用例，符合提交要求")
-def get_current_accuracy():
-    """
-    获取当前的整体正确率（仅 validation 数据集）
-    Returns:
-        float or None: 正确率（百分比），如果不是 validation 数据集或文件不存在则返回 None
-    """
-    # test 数据集没有标准答案，无法计算正确率
-    if SPLIT_TYPE != "validation":
-        return None
-    if not os.path.exists(OUTPUT_FILE):
-        return None
-    try:
-        results = []
-        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
-            for line in f:
-                if not line.strip():
-                    continue
-                try:
-                    data = json.loads(line)
-                    results.append(data)
-                except json.JSONDecodeError:
-                    continue
-        if not results:
-            return None
-        total = len(results)
-        correct = sum(1 for r in results if r.get("is_correct", False))
-        accuracy = (correct / total * 100) if total > 0 else 0.0
-        return accuracy
-    except Exception:
-        return None
-def generate_report():
-    """生成统计成绩单（仅 validation 数据集有标准答案，才生成成绩单）"""
-    # test 数据集没有标准答案，不生成成绩单
-    if SPLIT_TYPE != "validation":
-        return
-    if not os.path.exists(OUTPUT_FILE):
-        return
-    results = [json.loads(line) for line in open(OUTPUT_FILE, "r", encoding="utf-8")]
-    df = pd.DataFrame(results)
-    total = len(df)
-    acc = (df['is_correct'].sum() / total) * 100
-    print("\n" + "=" * 50)
-    print(f"测试完成! 总数: {total} | 总准确率: {acc:.2f}%")
-    print("=" * 50)
-def run_test_concurrent(num_questions=200, mode="hybrid", split="validation", threads=MAX_CONCURRENT, target_task_id=None):
-    """
-    测试主函数
-    Args:
-        num_questions: 要执行的用例数量
-        mode: 执行模式
-            - "full": 全量模式，不管文件是否存在，都删除重新开始
-            - "incremental": 增量模式，如果文件存在则增量，不存在则全量执行
-            - "hybrid": 混合模式，第一次时全量（文件不存在），后面就增量（文件存在）
-            - "error": 错误模式，只重新执行 agent_answer 包含 ERROR 的记录
-        split: 数据集类型，"validation" 或 "test"
-        threads: 并发线程数
-        target_task_id: 可选，指定要运行的 task_id，如果指定则只运行该用例
-    """
-    global dataset, OUTPUT_FILE, SUBMISSION_FILE, SPLIT_TYPE
-    # 设置全局变量
-    SPLIT_TYPE = split
-    # 根据 split 类型设置输出文件名
-    if split == "validation":
-        OUTPUT_FILE = "validation_results.jsonl"
-        SUBMISSION_FILE = "validation_submission.jsonl"
-        print("📥 正在检查 GAIA 验证集数据...")
-    else:  # test
-        OUTPUT_FILE = "test_results.jsonl"
-        SUBMISSION_FILE = "test_submission.jsonl"
-        print("📥 正在检查 GAIA 测试集数据...")
-    # 1. 检查并下载完整数据集文件（如果需要，包含 validation 和 test 的所有文件）
-    print("\n【步骤 1/2】检查数据集文件...")
-    check_and_download_dataset_files()
-    # 2. 加载数据集元数据（如果首次下载会显示下载进度）
-    print(f"\n【步骤 2/2】加载数据集元数据...")
-    print(f"   数据集: gaia-benchmark/GAIA (2023_all, split={split})")
-    print("   提示: 如果是首次下载，请耐心等待，下载进度会显示在下方...")
-    print("   如果已下载过，会直接从缓存加载，速度较快")
-    start_time = time.time()
-    dataset = load_dataset("gaia-benchmark/GAIA", "2023_all", split=split)
-    load_duration = time.time() - start_time
-    print(f"✅ 数据集元数据加载完成！共 {len(dataset)} 条记录，耗时 {load_duration:.2f} 秒\n")
-    # 1. 根据模式处理已有结果
-    file_exists = os.path.exists(OUTPUT_FILE)
-    if mode == "full":
-        # 全量模式：删除旧文件，从头开始
-        if file_exists:
-            os.remove(OUTPUT_FILE)
-            print("🔄 全量模式：已删除旧结果文件，从头开始执行")
-        existing_results = {}
-    elif mode == "incremental":
-        # 增量模式：如果文件存在则增量，不存在则全量执行
-        if file_exists:
-            existing_results = load_existing_results()
-            print(f"📋 增量模式：已加载 {len(existing_results)} 条历史记录")
-        else:
-            existing_results = {}
-            print("📋 增量模式：未找到历史记录，将全量执行")
-    elif mode == "error":
-        # 错误模式：只重新执行 agent_answer 包含 ERROR 的记录
-        if file_exists:
-            existing_results = load_existing_results()
-            print(f"📋 错误模式：已加载 {len(existing_results)} 条历史记录，将重新执行包含 ERROR 的记录")
-        else:
-            existing_results = {}
-            print("📋 错误模式：未找到历史记录，无法执行错误重试")
-    else:  # hybrid
-        # 混合模式：第一次时全量（文件不存在），后面就增量（文件存在）
-        if file_exists:
-            existing_results = load_existing_results()
-            print(f"📋 混合模式：检测到已有文件，进入增量模式（已加载 {len(existing_results)} 条历史记录）")
-        else:
-            existing_results = {}
-            print("📋 混合模式：首次执行，进入全量模式")
-    # 2. 筛选需要执行的用例
-    if target_task_id:
-        # 如果指定了 task_id，只运行该用例
-        print(f"🎯 指定运行 task_id: {target_task_id}")
-        tasks_to_run = []
-        found = False
-        for item in dataset:
-            if item['task_id'] == target_task_id:
-                tasks_to_run = [item]
-                found = True
-                break
-        if not found:
-            print(f"❌ 错误: 在 {split} 数据集中未找到 task_id: {target_task_id}")
-            return
-        num_to_run = 1
-    else:
-        # 正常模式，根据 num_questions 筛选
-        num_to_run = min(num_questions, len(dataset))
-        tasks_to_run = dataset.select(range(num_to_run))
-    # 统计需要执行的用例
-    tasks_to_execute = []
-    refresh_count = 0  # hybrid 模式下只刷新字段顺序的记录数
-    update_count = 0  # 需要重新调用 agent 的记录数
-    new_count = 0  # 新记录数
-    error_count = 0  # error 模式下包含 ERROR 的记录数
-    for item in tasks_to_run:
-        task_id = item['task_id']
-        if mode == "error":
-            # error 模式：只重新执行 agent_answer 包含 ERROR 的记录
-            if task_id in existing_results:
-                agent_answer = existing_results[task_id].get("agent_answer", "")
-                if agent_answer and "ERROR" in str(agent_answer):
-                    error_count += 1
-                    tasks_to_execute.append(item)
-            # 如果记录不存在或 agent_answer 不包含 ERROR，则跳过
-        else:
-            # 其他模式：正常处理
-            if task_id in existing_results:
-                # hybrid + validation 模式下：如果已成功，只刷新字段顺序
-                # test 数据集没有 is_correct 字段，无法判断是否正确，所以总是重新执行
-                if mode == "hybrid" and split == "validation":
-                    if existing_results[task_id].get("is_correct", False):
-                        refresh_count += 1
-                    else:
-                        update_count += 1
-                else:
-                    # 非 hybrid 模式，或 test 数据集：所有已有记录都需要重新执行
-                    update_count += 1
-            else:
-                new_count += 1
-            tasks_to_execute.append(item)
-    total_to_execute = len(tasks_to_execute)
-    print(f"\n📊 统计信息:")
-    print(f"   数据集: {split}")
-    print(f"   执行模式: {mode}")
-    if target_task_id:
-        print(f"   指定 task_id: {target_task_id}")
-    print(f"   总用例数: {num_to_run}")
-    if existing_results:
-        if mode == "error":
-            print(f"   需要执行: {total_to_execute} (包含 ERROR 的记录: {error_count})")
-        elif mode == "hybrid":
-            print(
-                f"   需要执行: {total_to_execute} (新用例: {new_count}, 刷新字段顺序: {refresh_count}, 重新测试: {update_count})")
-        else:
-            print(
-                f"   需要执行: {total_to_execute} (新用例: {new_count}, 刷新已有记录: {refresh_count + update_count})")
-    else:
-        print(f"   需要执行: {total_to_execute} (全量执行)")
-    print(f"🚀 开始测试 | 并发数: {threads} | 待执行: {total_to_execute}")
-    if total_to_execute == 0:
-        if mode == "error":
-            print("✅ 没有包含 ERROR 的记录，无需执行")
-        elif split == "validation":
-            print("✅ 所有用例已完成且正确，无需执行")
-        else:
-            print("✅ 所有用例已完成，无需执行")
-        generate_report()
-        generate_submission()
-        return
-    # 3. 并发执行
-    with ThreadPoolExecutor(max_workers=threads) as executor:
-        future_to_item = {executor.submit(process_item, item, existing_results, mode): item for item in
-                          tasks_to_execute}
-        done = 0
-        for future in as_completed(future_to_item):
-            done += 1
-            item = future_to_item[future]
-            tid = item['task_id']
-            try:
-                _, is_ok, status = future.result()
-                if status == "refreshed":
-                    status_icon = "🔄"
-                elif split == "validation":
-                    status_icon = "✅" if is_ok else "❌"
-                else:  # test
-                    status_icon = "✅"
-                # 计算并显示当前整体正确率
-                accuracy_info = ""
-                if split == "validation":
-                    current_accuracy = get_current_accuracy()
-                    if current_accuracy is not None:
-                        accuracy_info = f" | 当前正确率: {current_accuracy:.2f}%"
-                print(f"[{done}/{total_to_execute}] ID: {tid} | 状态: {status_icon} ({status}){accuracy_info}")
-            except Exception as e:
-                error_traceback = traceback.format_exc()
-                print(f"[{done}/{total_to_execute}] ID: {tid} 运行异常: {e}")
-                print(f"异常堆栈:\n{error_traceback}")
-    # 4. 生成报表
-    generate_report()
-    generate_submission()
-def print_help():
-    """打印详细的帮助信息"""
-    print("=" * 70)
-    print("GAIA 测试脚本 - 参数说明")
-    print("=" * 70)
-    print()
-    print("用法:")
-    print("  python gaia_test.py [参数]")
-    print()
-    print("参数说明:")
-    print()
-    print("  --split <类型>")
-    print("      数据集类型")
-    print("      可选值: validation, test")
-    print("      默认值: validation")
-    print("      说明:")
-    print("        - validation: 验证集，有标准答案，可以计算正确率")
-    print("        - test: 测试集，无标准答案，用于最终提交")
-    print()
-    print("  --mode <模式>")
-    print("      执行模式")
-    print("      可选值: full, incremental, hybrid, error")
-    print("      默认值: hybrid")
-    print("      说明:")
-    print("        - full: 全量模式，删除旧结果文件，从头开始执行")
-    print("        - incremental: 增量模式，如果文件存在则增量，不存在则全量执行")
-    print("        - hybrid: 混合模式（推荐），首次全量，后续增量")
-    print("                  在 hybrid 模式下，validation 数据集中已正确的记录")
-    print("                  只刷新字段顺序，不重新调用 agent")
-    print("        - error: 错误模式，只重新执行 agent_answer 包含 ERROR 的记录")
-    print()
-    print("  --num <数量>")
-    print("      要执行的用例数量")
-    print("      类型: 整数")
-    print("      默认值: 200")
-    print("      说明:")
-    print("        - test 数据集共 285 题，可以设置 --num 285 执行全部")
-    print("        - validation 数据集可以根据需要设置数量")
-    print()
-    print("  --threads <数量>")
-    print("      并发执行的线程数")
-    print("      类型: 整数")
-    print("      默认值: 2")
-    print("      说明:")
-    print("        - 根据服务器性能调整，过高可能导致服务器压力过大")
-    print("        - 建议范围: 1-4")
-    print()
-    print("  --task-id <task_id>")
-    print("      指定要运行的 task_id")
-    print("      类型: 字符串")
-    print("      默认值: 无（运行多个用例）")
-    print("      说明:")
-    print("        - 如果指定此参数，则只运行该 task_id 对应的用例")
-    print("        - 指定此参数时，--num 参数会被忽略")
-    print("        - 如果指定的 task_id 不存在，脚本会报错并退出")
-    print()
-    print("  -h, --help")
-    print("      显示此帮助信息并退出")
-    print()
-    print("示例:")
-    print("  # 使用默认参数（validation 数据集，hybrid 模式，200 题）")
-    print("  python gaia_test.py")
-    print()
-    print("  # 测试 test 数据集，执行全部 285 题")
-    print("  python gaia_test.py --split test --num 285")
-    print()
-    print("  # 使用 error 模式重新执行错误记录")
-    print("  python gaia_test.py --mode error")
-    print()
-    print("  # 使用全量模式，4 个并发线程")
-    print("  python gaia_test.py --mode full --threads 4")
-    print()
-    print("  # 运行指定的 task_id")
-    print("  python gaia_test.py --task-id c61d22de-5f6c-4958-a7f6-5e9707bd3466")
-    print()
-    print("=" * 70)
-    print("配置文件:")
-    print("  运行前请确保已正确配置 gaia_test.py 中的以下参数:")
-    print("    - BASE_URL: API 服务地址")
-    print("    - HEADERS: 认证 Token（必须修改）")
-    print("    - DATA_PATH: 数据文件路径")
-    print("    - REQUEST_TIMEOUT: 请求超时时间")
-    print("    - MAX_CONCURRENT: 最大并发数")
-    print("=" * 70)
-if __name__ == "__main__":
-    import sys
-    # 检查是否有 -h 或 --help 参数
-    if "-h" in sys.argv or "--help" in sys.argv:
-        print_help()
-        sys.exit(0)
-    parser = argparse.ArgumentParser(
-        description="GAIA 测试脚本（支持 validation 和 test 数据集，支持全量、增量、混合三种模式）",
-        formatter_class=argparse.RawDescriptionHelpFormatter
-    )
-    parser.add_argument(
-        "--split",
-        type=str,
-        choices=["validation", "test"],
-        default="validation",
-        help="数据集类型: 'validation' 验证集（有标准答案）、'test' 测试集（无标准答案，默认: validation）"
-    )
-    parser.add_argument(
-        "--mode",
-        type=str,
-        choices=["full", "incremental", "hybrid", "error"],
-        default="hybrid",
-        help="执行模式: 'full' 全量模式（删除旧文件重新执行）、'incremental' 增量模式（文件存在则增量，不存在则全量）、'hybrid' 混合模式（首次全量，后续增量，默认）、'error' 错误模式（只重新执行 agent_answer 包含 ERROR 的记录）"
-    )
-    parser.add_argument(
-        "--num",
-        type=int,
-        default=200,
-        help="要执行的用例数量（默认: 200，test 集共 285 题）"
-    )
-    parser.add_argument(
-        "--threads",
-        type=int,
-        default=MAX_CONCURRENT,
-        help="执行的并发数（默认: 2）"
-    )
-    parser.add_argument(
-        "--task-id",
-        type=str,
-        default=None,
-        help="指定要运行的 task_id，如果指定则只运行该用例（忽略 --num 参数）"
-    )
-    args = parser.parse_args()
-    run_test_concurrent(num_questions=args.num, mode=args.mode, split=args.split, threads=args.threads, target_task_id=args.task_id)