| """ | |
| 测试脚本:支持 validation 和 test 数据集,支持全量、增量、混合三种模式 | |
| - 全量模式(full):不管文件是否存在,都删除重新开始 | |
| - 增量模式(incremental):如果文件存在则增量,不存在则全量执行 | |
| - 混合模式(hybrid):第一次时全量(文件不存在),后面就增量(文件存在) | |
| """ | |
| import argparse | |
| import json | |
| import os | |
| import re | |
| import time | |
| import traceback | |
| from collections import OrderedDict | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| from threading import Lock | |
| import pandas as pd | |
| import requests | |
| from datasets import load_dataset | |
| from huggingface_hub import snapshot_download, hf_hub_download | |
| from pathlib import Path | |
| import shutil | |
# --- 1. Configuration ---
BASE_URL = "http://localhost:5173/api/v1"  # root of the agent API service
CHAT_URL = f"{BASE_URL}/sessions/chat"     # chat endpoint (non-streaming use below)
UPLOAD_URL = f"{BASE_URL}/files"           # attachment upload endpoint
# NOTE(review): hard-coded bearer token in source — consider loading it from an
# environment variable or config file instead of committing the secret.
HEADERS = {"Authorization": "Bearer hawk_YhCZLQYqtPOwOiEyEgeCNdfAFAbrHtTUxQvRiaOInyekgVgE"}
DATA_PATH = "./gaia_data"   # local root directory for the GAIA dataset files
REQUEST_TIMEOUT = 1800      # read timeout (seconds) for chat requests; connect timeout is 30s
MAX_CONCURRENT = 2          # default number of worker threads
file_lock = Lock()          # serializes all writes/rewrites of the results file
# Globals set dynamically by run_test_concurrent() based on the chosen split
dataset = None
OUTPUT_FILE = None
SUBMISSION_FILE = None
SPLIT_TYPE = None  # "validation" or "test"
def check_and_download_dataset_files():
    """Ensure the complete GAIA dataset files exist under DATA_PATH/2023.

    Checks for non-empty ``validation`` and ``test`` directories; if either is
    missing, downloads the full dataset snapshot from Hugging Face and copies
    both splits into place.

    Returns:
        bool: True if the files already exist or were downloaded and copied
        successfully, False otherwise.
    """
    base_target_dir = Path(DATA_PATH) / "2023"
    validation_dir = base_target_dir / "validation"
    test_dir = base_target_dir / "test"
    # The dataset counts as present only when BOTH split directories exist
    # and each contains at least one file.
    validation_files = list(validation_dir.glob("*")) if validation_dir.exists() else []
    test_files = list(test_dir.glob("*")) if test_dir.exists() else []
    if validation_files and test_files:
        print(f"✅ 检测到数据集文件已存在")
        print(f" validation 文件数: {len(validation_files)}")
        print(f" test 文件数: {len(test_files)}")
        return True
    # Files are missing: download the full snapshot.
    print(f"📥 开始下载完整的 GAIA 数据集文件...")
    print(f" 目标目录: {base_target_dir}")
    try:
        base_target_dir.mkdir(parents=True, exist_ok=True)
        print(" 步骤 1/4: 正在从 Hugging Face 下载完整数据集...")
        print(" 提示: 下载进度会显示在下方,请耐心等待...")
        download_start = time.time()
        # snapshot_download resumes interrupted downloads by default in current
        # huggingface_hub releases; the old resume_download=True flag is
        # deprecated (it only emitted a warning), so it has been dropped.
        cache_dir = snapshot_download(
            repo_id="gaia-benchmark/GAIA",
            repo_type="dataset",
            local_dir=None,  # use the default HF cache directory
        )
        download_duration = time.time() - download_start
        print(f" ✅ 数据集下载完成,耗时 {download_duration:.2f} 秒")
        cache_path = Path(cache_dir)
        source_2023_dir = cache_path / "2023"
        if not source_2023_dir.exists():
            print(f" ❌ 错误: 缓存目录中未找到 2023 目录")
            return False
        # Copy the validation split, replacing any partial local copy first.
        print(" 步骤 2/4: 正在复制 validation 文件...")
        validation_source = source_2023_dir / "validation"
        if validation_source.exists():
            if validation_dir.exists():
                shutil.rmtree(validation_dir)
            shutil.copytree(validation_source, validation_dir)
            validation_count = len(list(validation_dir.glob("*")))
            print(f" ✅ validation 文件复制完成,共 {validation_count} 个文件")
        else:
            print(f" ⚠️ 警告: 未找到 validation 目录")
        # Copy the test split the same way.
        print(" 步骤 3/4: 正在复制 test 文件...")
        test_source = source_2023_dir / "test"
        if test_source.exists():
            if test_dir.exists():
                shutil.rmtree(test_dir)
            shutil.copytree(test_source, test_dir)
            test_count = len(list(test_dir.glob("*")))
            print(f" ✅ test 文件复制完成,共 {test_count} 个文件")
        else:
            print(f" ⚠️ 警告: 未找到 test 目录")
        print(" 步骤 4/4: 数据集文件准备完成!")
        print(f" 目标目录: {base_target_dir}")
        return True
    except Exception as e:
        print(f" ❌ 下载数据集文件时出错: {e}")
        # traceback is already imported at module level; the original's
        # redundant local `import traceback` has been removed.
        traceback.print_exc()
        return False
def build_ordered_record(task_id, question, level, agent_answer, duration, has_file,
                         session_id=None, attachment_name=None, ground_truth=None, is_correct=None):
    """Assemble a result record whose keys always appear in a fixed order.

    Args:
        task_id: Task identifier.
        question: The question text.
        level: Difficulty level.
        agent_answer: The agent's answer.
        duration: Execution time in seconds.
        has_file: Whether the task has an attachment.
        session_id: Session id; included only when truthy.
        attachment_name: Attachment file name; included only when non-blank.
        ground_truth: Reference answer (validation split only).
        is_correct: Correctness flag (validation split only).

    Returns:
        OrderedDict: Record with a stable, fixed key order.
    """
    fields = [
        ("task_id", task_id),
        ("question", question),
        ("level", level),
        ("duration", duration),
        ("has_file", has_file),
    ]
    # An attachment name is recorded whenever it is non-empty after stripping
    # whitespace — even when the agent call itself failed.
    if attachment_name and attachment_name.strip():
        fields.append(("attachment_name", attachment_name))
    # A missing/falsy session id (e.g. after an agent error) is simply omitted.
    if session_id:
        fields.append(("session_id", session_id))
    fields.append(("agent_answer", agent_answer))
    # Fields specific to the validation split.
    if ground_truth is not None:
        fields.append(("ground_truth", ground_truth))
    if is_correct is not None:
        fields.append(("is_correct", is_correct))
    return OrderedDict(fields)
def load_existing_results():
    """Load previously saved results from OUTPUT_FILE.

    Returns:
        dict: Mapping of task_id -> full record dict; empty when the file is
        absent or unreadable.
    """
    if not os.path.exists(OUTPUT_FILE):
        return {}
    loaded = {}
    try:
        with open(OUTPUT_FILE, "r", encoding="utf-8") as fh:
            for raw in fh:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    record = json.loads(raw)
                except json.JSONDecodeError:
                    # Skip lines that are not valid JSON.
                    continue
                tid = record.get("task_id")
                if tid:
                    loaded[tid] = record
        print(f"✅ 已加载 {len(loaded)} 条历史记录")
    except Exception as e:
        print(f"⚠️ 加载历史记录时出错: {e}")
        return {}
    return loaded
def update_result_in_file(task_id, new_record):
    """Replace (or append) the record for *task_id* in OUTPUT_FILE.

    Rewrites the jsonl file through a temporary file while holding the global
    file lock, so concurrent workers cannot corrupt it.
    """
    if not os.path.exists(OUTPUT_FILE):
        # No file yet: create it with this single record.
        with file_lock:
            with open(OUTPUT_FILE, "w", encoding="utf-8") as fh:
                fh.write(json.dumps(new_record, ensure_ascii=False) + "\n")
        return
    serialized = json.dumps(new_record, ensure_ascii=False) + "\n"
    with file_lock:
        temp_file = OUTPUT_FILE + ".tmp"
        replaced = False
        try:
            with open(OUTPUT_FILE, "r", encoding="utf-8") as src, \
                    open(temp_file, "w", encoding="utf-8") as dst:
                for raw in src:
                    if not raw.strip():
                        continue
                    try:
                        entry = json.loads(raw)
                    except json.JSONDecodeError:
                        continue
                    if entry.get("task_id") == task_id:
                        # Overwrite the matching record.
                        dst.write(serialized)
                        replaced = True
                    else:
                        # Keep the original line verbatim.
                        dst.write(raw)
                # task_id not found: append the new record at the end.
                if not replaced:
                    dst.write(serialized)
            # Swap the rewritten file into place.
            os.replace(temp_file, OUTPUT_FILE)
        except Exception:
            # Clean up the temp file on any failure, then propagate.
            if os.path.exists(temp_file):
                os.remove(temp_file)
            raise
def upload_file(local_path):
    """Upload a local file; return the file_id/filename dict the chat API expects.

    Returns None when the file is missing, the API reports an error, or the
    request fails.
    """
    try:
        if not os.path.exists(local_path):
            print(f"❌ 本地文件不存在: {local_path}")
            return None
        with open(local_path, 'rb') as fh:
            response = requests.post(UPLOAD_URL, headers=HEADERS, files={'file': fh}, timeout=60)
            response.raise_for_status()
        res_data = response.json()
        if res_data.get("code") == 0:
            info = res_data.get("data", {})
            return {
                "file_id": info.get("file_id"),
                "filename": info.get("filename"),
            }
        print(f"❌ 上传接口返回错误: {res_data.get('msg')}")
    except Exception as e:
        print(f"❌ 文件上传异常 ({os.path.basename(local_path)}): {e}")
    return None
def extract_answer(text):
    """Extract the final answer from agent output.

    Tries, in order: an <answer>...</answer> tag, an "answer is:" phrase, and
    finally the last non-empty line of the text.
    """
    if not text:
        return ""
    # Primary format: <answer>...</answer>, case-insensitive, dot matches newline.
    tag_match = re.search(r"(?si)<\s*answer\s*>\s*(.*?)\s*</\s*answer\s*>", text)
    if tag_match:
        candidate = tag_match.group(1).strip()
        # Drop a single leading and/or trailing quote character.
        return re.sub(r'^["\']|["\']$', '', candidate)
    # Fallback: "answer is:" (ASCII or fullwidth colon), take the rest of the line.
    phrase_match = re.search(r"(?i)answer\s*is[::]\s*(.*)", text)
    if phrase_match:
        return phrase_match.group(1).strip().rstrip('.')
    # Last resort: the final non-empty line, or the stripped text itself.
    non_empty = [segment.strip() for segment in text.strip().split('\n') if segment.strip()]
    return non_empty[-1] if non_empty else text.strip()
def call_my_agent_safe(question, attachments=None, task_id=None):
    """Send a chat request to the agent, optionally with attachments.

    Args:
        question: Question text.
        attachments: List of attachment dicts ({"file_id", "filename"}).
        task_id: Task id used to help the backend isolate sessions.

    Returns:
        tuple: (parsed_answer, session_id, raw_content). On failure the first
        element is an "ERROR: ..." string and the last is the formatted
        traceback; session_id is None if the request never completed.
    """
    guided_prompt = (
        f"{question}\n\n Important Requirement: \nprovide the final answer (the answer only, without explanation) inside the tags in the following format: <answer>your answer</answer>"
    )
    payload = {
        "message": guided_prompt,
        "streaming": False,
        "attachments": attachments if attachments else [],
        "recycle_sandbox": True,
    }
    if task_id:
        # Tag the payload so the backend can distinguish concurrent requests.
        payload["task_id"] = task_id
    request_headers = HEADERS.copy()
    if task_id:
        # Also tag the headers; if the backend supports it, this further
        # isolates sessions between concurrent tasks.
        request_headers["X-Task-ID"] = task_id
    # BUG FIX: initialize before the try block. Previously, if requests.post
    # itself raised (timeout, connection error), session_id was never bound
    # and the except clause crashed with UnboundLocalError, masking the real
    # error instead of returning the "ERROR: ..." tuple.
    session_id = None
    try:
        response = requests.post(CHAT_URL, headers=request_headers, json=payload, timeout=(30, REQUEST_TIMEOUT))
        response.raise_for_status()
        res_data = response.json()
        raw_content = (res_data.get("answer") or res_data.get("content") or res_data.get("response") or "").strip()
        session_id = res_data.get("session_id")
        parsed_answer = extract_answer(raw_content)
        return parsed_answer, session_id, raw_content
    except Exception as e:
        error_traceback = traceback.format_exc()
        return f"ERROR: {str(e)}", session_id, error_traceback
def process_item(item, existing_results, mode):
    """Run one dataset item: upload its attachment, query the agent, record the result.

    In hybrid mode on the validation split, a record that already exists and is
    marked correct is not re-run; only its field order is refreshed. Everything
    else (including the whole test split, which has no is_correct field) is
    re-executed.

    Args:
        item: Dataset row.
        existing_results: Mapping task_id -> previously saved record.
        mode: "full", "incremental", "hybrid" or "error".

    Returns:
        tuple: (task_id, correctness_or_None, status) where status is one of
        "refreshed", "updated", "new".
    """
    task_id = item['task_id']
    level = item.get('Level', 'Unknown')
    question = item['Question']
    file_name = item.get('file_name', "")
    # Hybrid + validation: an already-correct record only gets its field order
    # refreshed — the agent is not called again.
    if mode == "hybrid" and SPLIT_TYPE == "validation" and task_id in existing_results:
        prior = existing_results[task_id]
        if prior.get("is_correct", False):
            # Rebuild with the current file_name so attachment data stays consistent.
            refreshed = build_ordered_record(
                task_id=task_id,
                question=prior.get("question", question),
                level=prior.get("level", level),
                agent_answer=prior.get("agent_answer", ""),
                duration=prior.get("duration", 0),
                has_file=bool(file_name),
                session_id=prior.get("session_id"),
                attachment_name=file_name if file_name else None,
                ground_truth=prior.get("ground_truth", ""),
                is_correct=True,
            )
            update_result_in_file(task_id, refreshed)
            return task_id, True, "refreshed"
    # Otherwise the agent must be (re-)invoked.
    attachments = []
    if file_name:
        # Attachment files live under the split-specific data folder.
        folder = "validation" if SPLIT_TYPE == "validation" else "test"
        local_file_path = os.path.abspath(os.path.join(DATA_PATH, "2023", folder, file_name))
        upload_data = upload_file(local_file_path)
        if upload_data:
            attachments.append(upload_data)
    started = time.time()
    agent_answer, session_id, _ = call_my_agent_safe(question, attachments, task_id=task_id)
    elapsed = time.time() - started
    if SPLIT_TYPE == "validation":
        # Validation split: compare against the reference answer
        # (case-insensitive, ignoring a trailing period).
        ground_truth = str(item['Final answer']).strip()
        is_correct = str(agent_answer).lower().rstrip('.') == ground_truth.lower().rstrip('.')
        record = build_ordered_record(
            task_id=task_id,
            question=question,
            level=level,
            duration=round(elapsed, 2),
            has_file=bool(file_name),
            session_id=session_id,
            attachment_name=file_name if file_name else None,
            agent_answer=agent_answer,
            ground_truth=ground_truth,
            is_correct=is_correct,
        )
        result_correct = is_correct
    else:
        # Test split: no reference answer available.
        record = build_ordered_record(
            task_id=task_id,
            question=question,
            level=level,
            duration=round(elapsed, 2),
            has_file=bool(file_name),
            session_id=session_id,
            attachment_name=file_name if file_name else None,
            agent_answer=agent_answer,
        )
        result_correct = None
    # Update an existing record in place, or append a brand-new one.
    if task_id in existing_results:
        update_result_in_file(task_id, record)
        return task_id, result_correct, "updated"
    with file_lock:
        with open(OUTPUT_FILE, "a", encoding="utf-8") as fh:
            fh.write(json.dumps(record, ensure_ascii=False) + "\n")
    return task_id, result_correct, "new"
def generate_submission():
    """Write the GAIA leaderboard submission file.

    Submission format: UTF-8 JSONL, one object per line, with the required
    fields task_id and model_answer. The test split should contain answers for
    all 285 cases.
    """
    if not os.path.exists(OUTPUT_FILE):
        print(f"⚠️ 警告:结果文件 {OUTPUT_FILE} 不存在,无法生成提交文件")
        return
    # Collect valid result lines (those with both required source fields).
    entries = []
    with open(OUTPUT_FILE, "r", encoding="utf-8") as fh:
        for raw in fh:
            if not raw.strip():
                continue
            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                continue
            if "task_id" in parsed and "agent_answer" in parsed:
                entries.append(parsed)
    if not entries:
        print(f"⚠️ 警告:结果文件 {OUTPUT_FILE} 中没有有效数据")
        return
    # Sort by task_id for a stable, reproducible submission order.
    entries.sort(key=lambda e: e.get("task_id", ""))
    with open(SUBMISSION_FILE, "w", encoding="utf-8") as out:
        for entry in entries:
            row = {
                "task_id": entry["task_id"],
                "model_answer": str(entry["agent_answer"]),
            }
            out.write(json.dumps(row, ensure_ascii=False) + "\n")
    print(f"✅ 提交文件已生成: {SUBMISSION_FILE} (共 {len(entries)} 条记录)")
    # For the test split, warn when the submission is incomplete.
    if SPLIT_TYPE == "test":
        expected_count = 285
        if len(entries) < expected_count:
            print(f"⚠️ 警告:test 数据集应该有 {expected_count} 个用例,当前只有 {len(entries)} 个")
        else:
            print(f"✅ test 数据集已包含 {len(entries)} 个用例,符合提交要求")
def get_current_accuracy():
    """Return the overall accuracy percentage (validation split only).

    Returns:
        float | None: Accuracy in percent, or None when not running the
        validation split, the results file is missing/unreadable, or it
        contains no records.
    """
    # The test split has no reference answers, so accuracy is undefined.
    if SPLIT_TYPE != "validation":
        return None
    if not os.path.exists(OUTPUT_FILE):
        return None
    try:
        records = []
        with open(OUTPUT_FILE, "r", encoding="utf-8") as fh:
            for raw in fh:
                if not raw.strip():
                    continue
                try:
                    records.append(json.loads(raw))
                except json.JSONDecodeError:
                    continue
        if not records:
            return None
        total = len(records)
        correct = sum(1 for rec in records if rec.get("is_correct", False))
        return (correct / total * 100) if total > 0 else 0.0
    except Exception:
        return None
def generate_report():
    """Print the accuracy scoreboard (validation split only).

    The test split has no reference answers, so no report is produced for it.
    Fixes over the original: the results file is read with a context manager
    (the old comprehension leaked the handle), blank/malformed JSONL lines are
    skipped instead of raising, an empty file no longer causes
    ZeroDivisionError, and a missing is_correct column no longer raises
    KeyError.
    """
    if SPLIT_TYPE != "validation":
        return
    if not os.path.exists(OUTPUT_FILE):
        return
    results = []
    with open(OUTPUT_FILE, "r", encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            try:
                results.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    total = len(results)
    if total == 0:
        # Nothing to report; avoid dividing by zero.
        return
    df = pd.DataFrame(results)
    # Records without an is_correct field count as incorrect.
    correct = df["is_correct"].fillna(False).sum() if "is_correct" in df.columns else 0
    acc = (correct / total) * 100
    print("\n" + "=" * 50)
    print(f"测试完成! 总数: {total} | 总准确率: {acc:.2f}%")
    print("=" * 50)
def run_test_concurrent(num_questions=200, mode="hybrid", split="validation", threads=MAX_CONCURRENT, target_task_id=None):
    """
    Main test driver.

    Args:
        num_questions: Number of cases to run.
        mode: Execution mode.
            - "full": delete any existing results file and start over.
            - "incremental": resume from the file if it exists, else run everything.
            - "hybrid": full on the first run (no file), incremental afterwards.
            - "error": re-run only records whose agent_answer contains ERROR.
        split: Dataset split, "validation" or "test".
        threads: Number of concurrent worker threads.
        target_task_id: Optional; when given, only that single case is run.
    """
    global dataset, OUTPUT_FILE, SUBMISSION_FILE, SPLIT_TYPE
    # Set the globals that the worker functions read.
    SPLIT_TYPE = split
    # Output file names depend on the split.
    if split == "validation":
        OUTPUT_FILE = "validation_results.jsonl"
        SUBMISSION_FILE = "validation_submission.jsonl"
        print("📥 正在检查 GAIA 验证集数据...")
    else:  # test
        OUTPUT_FILE = "test_results.jsonl"
        SUBMISSION_FILE = "test_submission.jsonl"
        print("📥 正在检查 GAIA 测试集数据...")
    # 1. Make sure the dataset files (both splits) are available locally.
    print("\n【步骤 1/2】检查数据集文件...")
    check_and_download_dataset_files()
    # 2. Load the dataset metadata (shows download progress on first use).
    print(f"\n【步骤 2/2】加载数据集元数据...")
    print(f" 数据集: gaia-benchmark/GAIA (2023_all, split={split})")
    print(" 提示: 如果是首次下载,请耐心等待,下载进度会显示在下方...")
    print(" 如果已下载过,会直接从缓存加载,速度较快")
    start_time = time.time()
    dataset = load_dataset("gaia-benchmark/GAIA", "2023_all", split=split)
    load_duration = time.time() - start_time
    print(f"✅ 数据集元数据加载完成!共 {len(dataset)} 条记录,耗时 {load_duration:.2f} 秒\n")
    # 1. Handle existing results according to the mode.
    file_exists = os.path.exists(OUTPUT_FILE)
    if mode == "full":
        # Full mode: delete the old file and start from scratch.
        if file_exists:
            os.remove(OUTPUT_FILE)
            print("🔄 全量模式:已删除旧结果文件,从头开始执行")
        existing_results = {}
    elif mode == "incremental":
        # Incremental mode: resume when the file exists, else run everything.
        if file_exists:
            existing_results = load_existing_results()
            print(f"📋 增量模式:已加载 {len(existing_results)} 条历史记录")
        else:
            existing_results = {}
            print("📋 增量模式:未找到历史记录,将全量执行")
    elif mode == "error":
        # Error mode: re-run only records whose agent_answer contains ERROR.
        if file_exists:
            existing_results = load_existing_results()
            print(f"📋 错误模式:已加载 {len(existing_results)} 条历史记录,将重新执行包含 ERROR 的记录")
        else:
            existing_results = {}
            print("📋 错误模式:未找到历史记录,无法执行错误重试")
    else:  # hybrid
        # Hybrid mode: full on the first run (no file), incremental afterwards.
        if file_exists:
            existing_results = load_existing_results()
            print(f"📋 混合模式:检测到已有文件,进入增量模式(已加载 {len(existing_results)} 条历史记录)")
        else:
            existing_results = {}
            print("📋 混合模式:首次执行,进入全量模式")
    # 2. Select which cases to run.
    if target_task_id:
        # A specific task_id was requested: run only that case.
        print(f"🎯 指定运行 task_id: {target_task_id}")
        tasks_to_run = []
        found = False
        for item in dataset:
            if item['task_id'] == target_task_id:
                tasks_to_run = [item]
                found = True
                break
        if not found:
            print(f"❌ 错误: 在 {split} 数据集中未找到 task_id: {target_task_id}")
            return
        num_to_run = 1
    else:
        # Normal mode: take the first num_questions cases.
        num_to_run = min(num_questions, len(dataset))
        tasks_to_run = dataset.select(range(num_to_run))
    # Classify the selected cases before running.
    tasks_to_execute = []
    refresh_count = 0  # hybrid mode: records that only get their field order refreshed
    update_count = 0  # records that need a fresh agent call
    new_count = 0  # records seen for the first time
    error_count = 0  # error mode: records whose answer contains ERROR
    for item in tasks_to_run:
        task_id = item['task_id']
        if mode == "error":
            # Error mode: only re-run records whose agent_answer contains ERROR.
            if task_id in existing_results:
                agent_answer = existing_results[task_id].get("agent_answer", "")
                if agent_answer and "ERROR" in str(agent_answer):
                    error_count += 1
                    tasks_to_execute.append(item)
            # Records that are absent or have no ERROR are skipped.
        else:
            # All other modes: every selected case is executed.
            if task_id in existing_results:
                # hybrid + validation: already-correct records are only refreshed.
                # The test split has no is_correct field, so it always re-runs.
                if mode == "hybrid" and split == "validation":
                    if existing_results[task_id].get("is_correct", False):
                        refresh_count += 1
                    else:
                        update_count += 1
                else:
                    # Non-hybrid mode, or test split: all existing records re-run.
                    update_count += 1
            else:
                new_count += 1
            tasks_to_execute.append(item)
    total_to_execute = len(tasks_to_execute)
    print(f"\n📊 统计信息:")
    print(f" 数据集: {split}")
    print(f" 执行模式: {mode}")
    if target_task_id:
        print(f" 指定 task_id: {target_task_id}")
    print(f" 总用例数: {num_to_run}")
    if existing_results:
        if mode == "error":
            print(f" 需要执行: {total_to_execute} (包含 ERROR 的记录: {error_count})")
        elif mode == "hybrid":
            print(
                f" 需要执行: {total_to_execute} (新用例: {new_count}, 刷新字段顺序: {refresh_count}, 重新测试: {update_count})")
        else:
            print(
                f" 需要执行: {total_to_execute} (新用例: {new_count}, 刷新已有记录: {refresh_count + update_count})")
    else:
        print(f" 需要执行: {total_to_execute} (全量执行)")
    print(f"🚀 开始测试 | 并发数: {threads} | 待执行: {total_to_execute}")
    if total_to_execute == 0:
        if mode == "error":
            print("✅ 没有包含 ERROR 的记录,无需执行")
        elif split == "validation":
            print("✅ 所有用例已完成且正确,无需执行")
        else:
            print("✅ 所有用例已完成,无需执行")
        generate_report()
        generate_submission()
        return
    # 3. Run the cases concurrently.
    with ThreadPoolExecutor(max_workers=threads) as executor:
        future_to_item = {executor.submit(process_item, item, existing_results, mode): item for item in
                          tasks_to_execute}
        done = 0
        for future in as_completed(future_to_item):
            done += 1
            item = future_to_item[future]
            tid = item['task_id']
            try:
                _, is_ok, status = future.result()
                if status == "refreshed":
                    status_icon = "🔄"
                elif split == "validation":
                    status_icon = "✅" if is_ok else "❌"
                else:  # test
                    status_icon = "✅"
                # Compute and show the running overall accuracy.
                accuracy_info = ""
                if split == "validation":
                    current_accuracy = get_current_accuracy()
                    if current_accuracy is not None:
                        accuracy_info = f" | 当前正确率: {current_accuracy:.2f}%"
                print(f"[{done}/{total_to_execute}] ID: {tid} | 状态: {status_icon} ({status}){accuracy_info}")
            except Exception as e:
                error_traceback = traceback.format_exc()
                print(f"[{done}/{total_to_execute}] ID: {tid} 运行异常: {e}")
                print(f"异常堆栈:\n{error_traceback}")
    # 4. Generate the report and the submission file.
    generate_report()
    generate_submission()
def print_help():
    """Print the detailed, hand-written CLI usage text (used for -h/--help)."""
    print("=" * 70)
    print("GAIA 测试脚本 - 参数说明")
    print("=" * 70)
    print()
    print("用法:")
    print(" python gaia_test.py [参数]")
    print()
    print("参数说明:")
    print()
    print(" --split <类型>")
    print(" 数据集类型")
    print(" 可选值: validation, test")
    print(" 默认值: validation")
    print(" 说明:")
    print(" - validation: 验证集,有标准答案,可以计算正确率")
    print(" - test: 测试集,无标准答案,用于最终提交")
    print()
    print(" --mode <模式>")
    print(" 执行模式")
    print(" 可选值: full, incremental, hybrid, error")
    print(" 默认值: hybrid")
    print(" 说明:")
    print(" - full: 全量模式,删除旧结果文件,从头开始执行")
    print(" - incremental: 增量模式,如果文件存在则增量,不存在则全量执行")
    print(" - hybrid: 混合模式(推荐),首次全量,后续增量")
    print(" 在 hybrid 模式下,validation 数据集中已正确的记录")
    print(" 只刷新字段顺序,不重新调用 agent")
    print(" - error: 错误模式,只重新执行 agent_answer 包含 ERROR 的记录")
    print()
    print(" --num <数量>")
    print(" 要执行的用例数量")
    print(" 类型: 整数")
    print(" 默认值: 200")
    print(" 说明:")
    print(" - test 数据集共 285 题,可以设置 --num 285 执行全部")
    print(" - validation 数据集可以根据需要设置数量")
    print()
    print(" --threads <数量>")
    print(" 并发执行的线程数")
    print(" 类型: 整数")
    print(" 默认值: 2")
    print(" 说明:")
    print(" - 根据服务器性能调整,过高可能导致服务器压力过大")
    print(" - 建议范围: 1-4")
    print()
    print(" --task-id <task_id>")
    print(" 指定要运行的 task_id")
    print(" 类型: 字符串")
    print(" 默认值: 无(运行多个用例)")
    print(" 说明:")
    print(" - 如果指定此参数,则只运行该 task_id 对应的用例")
    print(" - 指定此参数时,--num 参数会被忽略")
    print(" - 如果指定的 task_id 不存在,脚本会报错并退出")
    print()
    print(" -h, --help")
    print(" 显示此帮助信息并退出")
    print()
    print("示例:")
    print(" # 使用默认参数(validation 数据集,hybrid 模式,200 题)")
    print(" python gaia_test.py")
    print()
    print(" # 测试 test 数据集,执行全部 285 题")
    print(" python gaia_test.py --split test --num 285")
    print()
    print(" # 使用 error 模式重新执行错误记录")
    print(" python gaia_test.py --mode error")
    print()
    print(" # 使用全量模式,4 个并发线程")
    print(" python gaia_test.py --mode full --threads 4")
    print()
    print(" # 运行指定的 task_id")
    print(" python gaia_test.py --task-id c61d22de-5f6c-4958-a7f6-5e9707bd3466")
    print()
    print("=" * 70)
    print("配置文件:")
    print(" 运行前请确保已正确配置 gaia_test.py 中的以下参数:")
    print(" - BASE_URL: API 服务地址")
    print(" - HEADERS: 认证 Token(必须修改)")
    print(" - DATA_PATH: 数据文件路径")
    print(" - REQUEST_TIMEOUT: 请求超时时间")
    print(" - MAX_CONCURRENT: 最大并发数")
    print("=" * 70)
if __name__ == "__main__":
    import sys
    # Intercept -h/--help to show the detailed hand-written help text instead
    # of argparse's auto-generated one.
    if "-h" in sys.argv or "--help" in sys.argv:
        print_help()
        sys.exit(0)
    parser = argparse.ArgumentParser(
        description="GAIA 测试脚本(支持 validation 和 test 数据集,支持全量、增量、混合三种模式)",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "--split",
        type=str,
        choices=["validation", "test"],
        default="validation",
        help="数据集类型: 'validation' 验证集(有标准答案)、'test' 测试集(无标准答案,默认: validation)"
    )
    parser.add_argument(
        "--mode",
        type=str,
        choices=["full", "incremental", "hybrid", "error"],
        default="hybrid",
        help="执行模式: 'full' 全量模式(删除旧文件重新执行)、'incremental' 增量模式(文件存在则增量,不存在则全量)、'hybrid' 混合模式(首次全量,后续增量,默认)、'error' 错误模式(只重新执行 agent_answer 包含 ERROR 的记录)"
    )
    parser.add_argument(
        "--num",
        type=int,
        default=200,
        help="要执行的用例数量(默认: 200,test 集共 285 题)"
    )
    parser.add_argument(
        "--threads",
        type=int,
        default=MAX_CONCURRENT,
        help="执行的并发数(默认: 2)"
    )
    parser.add_argument(
        "--task-id",
        type=str,
        default=None,
        help="指定要运行的 task_id,如果指定则只运行该用例(忽略 --num 参数)"
    )
    args = parser.parse_args()
    # Kick off the concurrent test run with the parsed CLI options.
    run_test_concurrent(num_questions=args.num, mode=args.mode, split=args.split, threads=args.threads, target_task_id=args.task_id)