Upload 12 files
Browse files- .gitignore +82 -0
- README.md +69 -7
- agent.py +752 -0
- app.py +286 -0
- config.py +80 -0
- data_clean.csv +118 -0
- extension_tools.py +368 -0
- rag.py +518 -0
- rag_index/index.faiss +0 -0
- rag_index/index.pkl +3 -0
- requirements.txt +51 -0
- tools.py +945 -0
.gitignore
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GAIA Agent .gitignore
|
| 2 |
+
# HuggingFace Spaces 部署用
|
| 3 |
+
|
| 4 |
+
# ========================================
|
| 5 |
+
# 敏感文件 - 绝对不能上传
|
| 6 |
+
# ========================================
|
| 7 |
+
.env
|
| 8 |
+
.env.local
|
| 9 |
+
.env.*.local
|
| 10 |
+
|
| 11 |
+
# ========================================
|
| 12 |
+
# Python
|
| 13 |
+
# ========================================
|
| 14 |
+
__pycache__/
|
| 15 |
+
*.py[cod]
|
| 16 |
+
*$py.class
|
| 17 |
+
*.so
|
| 18 |
+
.Python
|
| 19 |
+
build/
|
| 20 |
+
develop-eggs/
|
| 21 |
+
dist/
|
| 22 |
+
downloads/
|
| 23 |
+
eggs/
|
| 24 |
+
.eggs/
|
| 25 |
+
lib/
|
| 26 |
+
lib64/
|
| 27 |
+
parts/
|
| 28 |
+
sdist/
|
| 29 |
+
var/
|
| 30 |
+
wheels/
|
| 31 |
+
*.egg-info/
|
| 32 |
+
.installed.cfg
|
| 33 |
+
*.egg
|
| 34 |
+
|
| 35 |
+
# ========================================
|
| 36 |
+
# 临时文件
|
| 37 |
+
# ========================================
|
| 38 |
+
temp/
|
| 39 |
+
*.tmp
|
| 40 |
+
*.temp
|
| 41 |
+
*.log
|
| 42 |
+
|
| 43 |
+
# ========================================
|
| 44 |
+
# IDE 和编辑器
|
| 45 |
+
# ========================================
|
| 46 |
+
.idea/
|
| 47 |
+
.vscode/
|
| 48 |
+
*.swp
|
| 49 |
+
*.swo
|
| 50 |
+
*~
|
| 51 |
+
.project
|
| 52 |
+
.pydevproject
|
| 53 |
+
.settings/
|
| 54 |
+
|
| 55 |
+
# ========================================
|
| 56 |
+
# Jupyter Notebook
|
| 57 |
+
# ========================================
|
| 58 |
+
.ipynb_checkpoints/
|
| 59 |
+
*.ipynb
|
| 60 |
+
|
| 61 |
+
# ========================================
|
| 62 |
+
# RAG 索引 - 保留预构建的索引以加快启动
|
| 63 |
+
# ========================================
|
| 64 |
+
# rag_index/ # 已注释,会上传索引
|
| 65 |
+
|
| 66 |
+
# ========================================
|
| 67 |
+
# Claude Code 配置
|
| 68 |
+
# ========================================
|
| 69 |
+
.claude/
|
| 70 |
+
|
| 71 |
+
# ========================================
|
| 72 |
+
# 测试文件
|
| 73 |
+
# ========================================
|
| 74 |
+
test_*.py
|
| 75 |
+
*_test.py
|
| 76 |
+
|
| 77 |
+
# ========================================
|
| 78 |
+
# 其他
|
| 79 |
+
# ========================================
|
| 80 |
+
*.parquet
|
| 81 |
+
sorted-*.md
|
| 82 |
+
convert_*.py
|
README.md
CHANGED
|
@@ -1,14 +1,76 @@
|
|
| 1 |
---
|
| 2 |
-
title: GAIA
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
| 11 |
-
short_description: A space for GAIA stage 1
|
| 12 |
---
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: GAIA Agent
|
| 3 |
+
emoji: "\U0001F916"
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: "4.0.0"
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# GAIA Agent
|
| 14 |
+
|
| 15 |
+
基于 LangGraph 的 GAIA 基准测试 Agent,用于 HuggingFace Agents Course Unit 4 评测。
|
| 16 |
+
|
| 17 |
+
## 功能特点
|
| 18 |
+
|
| 19 |
+
- **ReAct 架构**: 基于 LangGraph StateGraph 实现的推理-行动循环
|
| 20 |
+
- **多种搜索工具**: DuckDuckGo、Wikipedia、arXiv、Tavily、YouTube、StackOverflow
|
| 21 |
+
- **文件处理**: 支持 PDF、Excel、图片 OCR、音频转写
|
| 22 |
+
- **代码执行**: 安全的 Python 沙箱环境
|
| 23 |
+
- **RAG 支持**: 可选的知识库检索增强
|
| 24 |
+
|
| 25 |
+
## 工具列表
|
| 26 |
+
|
| 27 |
+
| 工具 | 功能 |
|
| 28 |
+
|------|------|
|
| 29 |
+
| `web_search` | DuckDuckGo 网络搜索 |
|
| 30 |
+
| `wikipedia_search` | 维基百科搜索 |
|
| 31 |
+
| `arxiv_search` | arXiv 学术论文搜索 |
|
| 32 |
+
| `tavily_search` | Tavily 高质量搜索 |
|
| 33 |
+
| `fetch_task_files` | 下载任务附件 |
|
| 34 |
+
| `read_file` | 读取本地文件 |
|
| 35 |
+
| `parse_pdf` | 解析 PDF 文件 |
|
| 36 |
+
| `parse_excel` | 解析 Excel 文件 |
|
| 37 |
+
| `calc` | 安全数学计算 |
|
| 38 |
+
| `run_python` | Python 代码执行 |
|
| 39 |
+
|
| 40 |
+
## 环境变量
|
| 41 |
+
|
| 42 |
+
在 HuggingFace Spaces 的 Settings > Secrets 中配置:
|
| 43 |
+
|
| 44 |
+
| 变量 | 必需 | 说明 |
|
| 45 |
+
|------|------|------|
|
| 46 |
+
| `OPENAI_API_KEY` | 是 | OpenAI API Key |
|
| 47 |
+
| `OPENAI_BASE_URL` | 否 | 自定义 API 端点 |
|
| 48 |
+
| `MODEL` | 否 | 模型名称,默认 gpt-4o-mini |
|
| 49 |
+
| `TAVILY_API_KEY` | 否 | Tavily API Key(可选) |
|
| 50 |
+
|
| 51 |
+
## 本地运行
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
# 安装依赖
|
| 55 |
+
pip install -r requirements.txt
|
| 56 |
+
|
| 57 |
+
# 配置环境变量
|
| 58 |
+
cp .env.example .env
|
| 59 |
+
# 编辑 .env 填入 API Key
|
| 60 |
+
|
| 61 |
+
# 启动应用
|
| 62 |
+
python app.py
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
访问 http://localhost:7860
|
| 66 |
+
|
| 67 |
+
## 技术栈
|
| 68 |
+
|
| 69 |
+
- [LangGraph](https://github.com/langchain-ai/langgraph) - Agent 框架
|
| 70 |
+
- [LangChain](https://github.com/langchain-ai/langchain) - LLM 工具链
|
| 71 |
+
- [Gradio](https://gradio.app/) - Web UI 框架
|
| 72 |
+
- [FAISS](https://github.com/facebookresearch/faiss) - 向量检索
|
| 73 |
+
|
| 74 |
+
## 许可证
|
| 75 |
+
|
| 76 |
+
MIT License
|
agent.py
ADDED
|
@@ -0,0 +1,752 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agent 核心模块 - GAIA LangGraph ReAct Agent
|
| 3 |
+
包含:AgentState, System Prompt, Graph 构建, 答案提取
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import re
|
| 7 |
+
from typing import Sequence, Literal, Annotated, Optional
|
| 8 |
+
|
| 9 |
+
from langchain_core.messages import BaseMessage, SystemMessage, HumanMessage, AIMessage
|
| 10 |
+
from langchain_openai import ChatOpenAI
|
| 11 |
+
from langgraph.graph import StateGraph, END
|
| 12 |
+
from langgraph.graph.message import add_messages
|
| 13 |
+
from langgraph.prebuilt import ToolNode
|
| 14 |
+
|
| 15 |
+
try:
|
| 16 |
+
from typing import TypedDict
|
| 17 |
+
except ImportError:
|
| 18 |
+
from typing_extensions import TypedDict
|
| 19 |
+
|
| 20 |
+
from config import (
|
| 21 |
+
OPENAI_BASE_URL,
|
| 22 |
+
OPENAI_API_KEY,
|
| 23 |
+
MODEL,
|
| 24 |
+
TEMPERATURE,
|
| 25 |
+
MAX_ITERATIONS,
|
| 26 |
+
DEBUG,
|
| 27 |
+
LLM_TIMEOUT,
|
| 28 |
+
RATE_LIMIT_RETRY_MAX,
|
| 29 |
+
RATE_LIMIT_RETRY_BASE_DELAY,
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
# 导入工具
|
| 33 |
+
from tools import BASE_TOOLS
|
| 34 |
+
|
| 35 |
+
# 尝试导入扩展工具
|
| 36 |
+
try:
|
| 37 |
+
from extension_tools import EXTENSION_TOOLS
|
| 38 |
+
ALL_TOOLS = BASE_TOOLS + EXTENSION_TOOLS
|
| 39 |
+
except ImportError as e:
|
| 40 |
+
print(f"⚠️ 扩展工具加载失败: {e}")
|
| 41 |
+
print(" 提示: 请确保安装了 pandas 和 openpyxl (pip install pandas openpyxl)")
|
| 42 |
+
EXTENSION_TOOLS = []
|
| 43 |
+
ALL_TOOLS = BASE_TOOLS
|
| 44 |
+
|
| 45 |
+
# 尝试导入 RAG 工具
|
| 46 |
+
try:
|
| 47 |
+
from rag import RAG_TOOLS
|
| 48 |
+
ALL_TOOLS = ALL_TOOLS + RAG_TOOLS
|
| 49 |
+
except ImportError:
|
| 50 |
+
RAG_TOOLS = []
|
| 51 |
+
|
| 52 |
+
# RAG 短路辅助(可选导入,不影响工具加载)
|
| 53 |
+
try:
|
| 54 |
+
from rag import rag_lookup_answer
|
| 55 |
+
except ImportError:
|
| 56 |
+
rag_lookup_answer = None
|
| 57 |
+
|
| 58 |
+
# 打印已加载的工具列表(调试用)
|
| 59 |
+
_tool_names = [t.name for t in ALL_TOOLS]
|
| 60 |
+
if DEBUG:
|
| 61 |
+
print(f"✓ 已加载 {len(ALL_TOOLS)} 个工具: {_tool_names}")
|
| 62 |
+
if 'parse_excel' not in _tool_names:
|
| 63 |
+
print("⚠️ 警告: parse_excel 工具未加载,Excel 文件处理将不可用!")
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# ========================================
|
| 67 |
+
# System Prompt 设计
|
| 68 |
+
# ========================================
|
| 69 |
+
|
| 70 |
+
SYSTEM_PROMPT = """你是一个专业的问答助手,专门解答GAIA基准测试中的各类问题。你需要准确、简洁地回答问题。
|
| 71 |
+
|
| 72 |
+
## 你的能力
|
| 73 |
+
|
| 74 |
+
你可以使用以下工具来获取信息和处理任务:
|
| 75 |
+
|
| 76 |
+
### 知识库工具(RAG)
|
| 77 |
+
- `rag_query(question)`: 查询知识库中的相似问题,获取解题策略建议。返回推荐的工具和解题步骤。**遇到复杂问题时优先使用!**
|
| 78 |
+
- `rag_retrieve(question)`: 仅检索相似问题,不生成建议。返回原始的相似问题和解法。
|
| 79 |
+
- `rag_stats()`: 查看知识库状态(文档数量等)。
|
| 80 |
+
|
| 81 |
+
### 信息获取工具
|
| 82 |
+
- `web_search(query)`: 使用DuckDuckGo搜索网络信息。适用于查找人物、事件、地点、组织等外部知识。
|
| 83 |
+
- `wikipedia_search(query)`: 在维基百科中搜索,返回简短摘要(3句话)。适用于快速确认人物/事件的基本信息。
|
| 84 |
+
- `wikipedia_page(title, section)`: 获取维基百科页面的完整内容。**需要详细数据(如专辑列表、获奖记录、作品年表)时必须用此工具!**
|
| 85 |
+
- `tavily_search(query)`: 使用Tavily进行高质量网络搜索,返回最多3条结果。需要API Key。
|
| 86 |
+
- `arxiv_search(query)`: 在arXiv上搜索学术论文,返回最多3条结果。适用于查找科学研究和学术文献。
|
| 87 |
+
|
| 88 |
+
### 文件处理工具
|
| 89 |
+
- `fetch_task_files(task_id)`: 从评分服务器下载任务附件。当问题涉及附件时必须先调用此工具。
|
| 90 |
+
- `read_file(file_path)`: 读取本地文件内容,支持txt/csv/json/zip等格式。**注意:不支持Excel和PDF!**
|
| 91 |
+
- `parse_pdf(file_path)`: 解析PDF文件,提取文本内容。**PDF文件必须用此工具!**
|
| 92 |
+
- `parse_excel(file_path)`: 解析Excel文件(.xlsx/.xls),返回表格内容。**Excel文件必须用此工具!**
|
| 93 |
+
- `image_ocr(file_path)`: 对图片进行OCR文字识别。
|
| 94 |
+
- `transcribe_audio(file_path)`: 将音频文件转写为文字。
|
| 95 |
+
- `analyze_image(file_path, question)`: 使用AI分析图片内容。
|
| 96 |
+
|
| 97 |
+
### 计算和代码工具
|
| 98 |
+
- `calc(expression)`: 执行安全的数学计算,如 "2+3*4" 或 "sqrt(16)"。适用于简单算术。
|
| 99 |
+
- `run_python(code)`: 在沙箱中执行Python代码。支持 import math/re/json/datetime/collections/random/string/itertools/functools 模块。适用于复杂数据处理、排序、过滤、日期计算等操作。
|
| 100 |
+
|
| 101 |
+
## 工具使用策略
|
| 102 |
+
|
| 103 |
+
### 优先级顺序
|
| 104 |
+
0. **先查知识库**【最高优先级】:
|
| 105 |
+
- 首先调用 `rag_query(question)` 查询知识库
|
| 106 |
+
- 如果返回"知识库匹配成功",**直接使用该答案作为最终回答**,不需要再调用其他工具
|
| 107 |
+
- 如果返回"知识库参考",参考答案和步骤选择后续工具
|
| 108 |
+
- 如果无匹配,按后续优先级使用其他工具
|
| 109 |
+
1. **有附件的问题**【重要】:
|
| 110 |
+
- 第一步:用 `fetch_task_files(task_id)` 下载文件
|
| 111 |
+
- 第二步:根据文件扩展名选择正确的读取工具:
|
| 112 |
+
* `.xlsx` / `.xls` → 必须用 `parse_excel(file_path)`
|
| 113 |
+
* `.pdf` → 必须用 `parse_pdf(file_path)`
|
| 114 |
+
* `.txt` / `.csv` / `.json` / `.md` → 用 `read_file(file_path)`
|
| 115 |
+
* `.png` / `.jpg` / `.jpeg` → 用 `image_ocr(file_path)` 或 `analyze_image(file_path, question)`
|
| 116 |
+
* `.mp3` / `.wav` → 用 `transcribe_audio(file_path)`
|
| 117 |
+
- 第三步:分析文件内容,进行必要的计算或处理
|
| 118 |
+
- **禁止**:下载文件后不要用 web_search 搜索,文件内容已经本地可用!
|
| 119 |
+
2. **需要外部信息**:
|
| 120 |
+
- **百科知识查询流程**【重要】:
|
| 121 |
+
* 第一步:用 `wikipedia_search(query)` 确认页面标题
|
| 122 |
+
* 第二步:如果需要详细数据(专辑列表、作品年表、获奖记录等),必须用 `wikipedia_page(title, section)` 获取完整内容
|
| 123 |
+
* 示例:查 Mercedes Sosa 专辑数 → `wikipedia_search("Mercedes Sosa")` → `wikipedia_page("Mercedes Sosa", "Discography")`
|
| 124 |
+
- 通用搜索: 使用 `web_search` 搜索其他网络信息
|
| 125 |
+
- 学术论文: 使用 `arxiv_search` 查找研究文献
|
| 126 |
+
- 高质量结果: 使用 `tavily_search` (如果配置了API Key)
|
| 127 |
+
3. **需要计算**: 简单算术用 `calc`,复杂处理用 `run_python`
|
| 128 |
+
4. **数据处理**: 使用 `run_python` 进行排序、过滤、统计等操作
|
| 129 |
+
|
| 130 |
+
### 工具使用原则
|
| 131 |
+
- **只有问题明确提到"attached file"或"附件"时才调用 `fetch_task_files`**,否则不要调用
|
| 132 |
+
- 每次只调用一个必要的工具,分析结果后再决定下一步
|
| 133 |
+
- 如果工具返回错误,尝试调整参数或换用其他工具
|
| 134 |
+
- 搜索时使用精确的关键词,避免过于宽泛
|
| 135 |
+
- 读取大文件时注意内容可能被截断,关注关键信息
|
| 136 |
+
- **如果 `wikipedia_search` 返回的摘要不足以回答问题,立即使用 `wikipedia_page` 获取完整内容**
|
| 137 |
+
|
| 138 |
+
## 思考过程
|
| 139 |
+
|
| 140 |
+
在回答问题前,请按以下步骤思考:
|
| 141 |
+
1. **理解问题**: 问题在问什么?需要什么类型的信息?
|
| 142 |
+
2. **咨询知识库**: 如果问题复杂或不确定解法,用 `rag_query` 查看相似问题的解题策略
|
| 143 |
+
3. **判断工具**: 根据问题类型和 RAG 建议,选择合适的工具
|
| 144 |
+
4. **执行获取**: 调用工具获取信息
|
| 145 |
+
5. **分析整合**: 分析工具返回的信息,提取关键答案
|
| 146 |
+
6. **格式化输出**: 按要求格式输出最终答案
|
| 147 |
+
|
| 148 |
+
## 答案格式要求【非常重要】
|
| 149 |
+
|
| 150 |
+
最终答案必须遵循以下格式:
|
| 151 |
+
- **数字答案**: 直接输出数字,如 `42` 而不是 "答案是42"
|
| 152 |
+
- **人名/地名**: 直接输出名称,如 `Albert Einstein` 而不是 "答案是Albert Einstein"
|
| 153 |
+
- **日期答案**: 使用标准格式 `YYYY-MM-DD` 或按问题要求的格式
|
| 154 |
+
- **列表答案**: 用逗号分隔,如 `A, B, C`
|
| 155 |
+
- **是/否答案**: 输出 `Yes` 或 `No`
|
| 156 |
+
|
| 157 |
+
⚠️ 最终回答时,只输出答案本身,不要包含:
|
| 158 |
+
- 不要说"答案是..."、"The answer is..."
|
| 159 |
+
- 不要添加解释或推理过程
|
| 160 |
+
- 不要使用"最终答案:"等前缀
|
| 161 |
+
|
| 162 |
+
## 错误恢复
|
| 163 |
+
|
| 164 |
+
如果遇到问题:
|
| 165 |
+
- 工具调用失败: 检查参数,尝试简化或换用其他工具
|
| 166 |
+
- 搜索无结果: 尝试不同的关键词组合
|
| 167 |
+
- 文件读取失败: 确认文件路径正确,检查文件格式
|
| 168 |
+
- 计算错误: 检查表达式语法,考虑使用Python代码
|
| 169 |
+
|
| 170 |
+
## 示例
|
| 171 |
+
|
| 172 |
+
问题: "Who was the first person to walk on the moon?"
|
| 173 |
+
正确答案: Neil Armstrong
|
| 174 |
+
错误答案: The answer is Neil Armstrong.
|
| 175 |
+
|
| 176 |
+
问题: "What is 15% of 200?"
|
| 177 |
+
正确答案: 30
|
| 178 |
+
错误答案: 15% of 200 is 30.
|
| 179 |
+
|
| 180 |
+
### 文件处理示例【重要】
|
| 181 |
+
|
| 182 |
+
问题: "[Task ID: abc123] The attached Excel file contains sales data. What is the total revenue?"
|
| 183 |
+
|
| 184 |
+
✅ 正确流程:
|
| 185 |
+
1. fetch_task_files("abc123") → 下载文件到本地路径
|
| 186 |
+
2. parse_excel("/path/to/file.xlsx") → 读取Excel内容,得到表格数据
|
| 187 |
+
3. calc("100+200+300") 或 run_python("...") → 计算总收入
|
| 188 |
+
4. 输出最终答案
|
| 189 |
+
|
| 190 |
+
❌ 错误流程:
|
| 191 |
+
1. fetch_task_files("abc123") → 下载文件
|
| 192 |
+
2. web_search("sales data total revenue") → 错!文件内容在本地,不需要搜索网络!
|
| 193 |
+
|
| 194 |
+
### RAG 辅助示例
|
| 195 |
+
|
| 196 |
+
问题: "How many studio albums did Mercedes Sosa release between 2000 and 2009?"
|
| 197 |
+
|
| 198 |
+
✅ 推荐流程:
|
| 199 |
+
1. rag_query("How many studio albums did Mercedes Sosa release between 2000 and 2009?") → 获取建议:使用 wikipedia_page 查 Discography
|
| 200 |
+
2. wikipedia_search("Mercedes Sosa") → 确认页面存在
|
| 201 |
+
3. wikipedia_page("Mercedes Sosa", "Discography") → 获取完整专辑列表
|
| 202 |
+
4. run_python("...") → 筛选 2000-2009 年的专辑并计数
|
| 203 |
+
5. 输出最终答案
|
| 204 |
+
|
| 205 |
+
RAG 的价值:直接告诉你该用 wikipedia_page 而不是 web_search,节省试错时间。
|
| 206 |
+
|
| 207 |
+
现在请回答用户的问题。"""
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
# ========================================
|
| 211 |
+
# Agent State 定义
|
| 212 |
+
# ========================================
|
| 213 |
+
|
| 214 |
+
class AgentState(TypedDict):
    """State threaded through the LangGraph agent loop."""
    # Conversation history; the `add_messages` reducer appends new messages
    # instead of replacing the list on each node update.
    messages: Annotated[Sequence[BaseMessage], add_messages]

    # Number of assistant iterations completed so far; used by the router
    # to force termination and avoid infinite tool loops.
    iteration_count: int
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
# ========================================
|
| 224 |
+
# LLM 初始化
|
| 225 |
+
# ========================================
|
| 226 |
+
|
| 227 |
+
# 全局 LLM 实例(避免每次迭代重复创建)
|
| 228 |
+
_llm_instance = None
|
| 229 |
+
_llm_with_tools = None
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def get_llm():
    """Return the shared ChatOpenAI instance, creating it lazily on first use."""
    global _llm_instance
    # Guard clause: reuse the cached client when it already exists.
    if _llm_instance is not None:
        return _llm_instance
    _llm_instance = ChatOpenAI(
        model=MODEL,
        temperature=TEMPERATURE,
        base_url=OPENAI_BASE_URL,
        api_key=OPENAI_API_KEY,
        timeout=LLM_TIMEOUT,
        max_retries=2,
    )
    return _llm_instance
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def get_llm_with_tools():
    """Return the shared tool-bound LLM, binding ALL_TOOLS on first call."""
    global _llm_with_tools
    # Guard clause: hand back the cached tool-bound client if present.
    if _llm_with_tools is not None:
        return _llm_with_tools
    _llm_with_tools = get_llm().bind_tools(ALL_TOOLS)
    return _llm_with_tools
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
def invoke_llm_with_retry(llm, messages, max_retries=None, base_delay=None):
    """
    Invoke the LLM with exponential-backoff retries on 429 rate-limit errors.

    Args:
        llm: LLM instance (anything exposing `.invoke(messages)`)
        messages: message list to send
        max_retries: maximum retry count; defaults to RATE_LIMIT_RETRY_MAX
        base_delay: base delay in seconds; defaults to RATE_LIMIT_RETRY_BASE_DELAY

    Returns:
        The LLM response.

    Raises:
        RateLimitError: when retries are exhausted.
        Any other exception from `llm.invoke` propagates immediately
        (non-429 errors are deliberately not retried).
    """
    # Local imports keep module import-time free of the openai dependency.
    import time
    from openai import RateLimitError

    if max_retries is None:
        max_retries = RATE_LIMIT_RETRY_MAX
    if base_delay is None:
        base_delay = RATE_LIMIT_RETRY_BASE_DELAY

    last_error = None

    for attempt in range(max_retries + 1):
        try:
            return llm.invoke(messages)
        except RateLimitError as e:
            last_error = e
            if attempt < max_retries:
                # Exponential backoff: base_delay * 2^attempt
                delay = base_delay * (2 ** attempt)
                print(f"[Rate Limit] 429 错误,第 {attempt + 1}/{max_retries + 1} 次尝试,等待 {delay:.1f} 秒后重试...")
                time.sleep(delay)
            else:
                print(f"[Rate Limit] 重试次数已耗尽 ({max_retries + 1} 次),抛出异常")
                raise
        # NOTE: the original also had `except Exception: raise`, a no-op
        # catch-and-reraise with an unused binding; removed — behavior is
        # identical since unhandled exceptions propagate anyway.

    # Unreachable in practice (the loop either returns or raises), kept as a
    # defensive fallback.
    if last_error:
        raise last_error
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
def create_llm():
    """Backward-compatible alias for get_llm(); kept for older callers."""
    return get_llm()
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
# ========================================
|
| 309 |
+
# Graph 节点定义
|
| 310 |
+
# ========================================
|
| 311 |
+
|
| 312 |
+
def assistant(state: AgentState) -> dict:
    """
    LLM reasoning node.

    Responsibilities:
    1. Build the full message list (system prompt + history).
    2. Pick the LLM variant: plain (no tools) when the iteration budget
       forces a final answer, otherwise the tool-bound singleton.
    3. Invoke it with rate-limit retries.
    4. Return the response and the bumped iteration counter.

    Fix: the original duplicated the identical try/invoke/except/raise
    sequence in all three branches; the branches now only choose the LLM
    and optional warning, and a single invocation follows.
    """
    messages = state["messages"]
    iteration = state.get("iteration_count", 0) + 1

    # Full prompt: system instructions first, then the running conversation.
    full_messages = [SystemMessage(content=SYSTEM_PROMPT)] + list(messages)

    if iteration >= MAX_ITERATIONS - 1:
        # Last chance: bind no tools so the model can only emit text.
        print(f"[Iteration {iteration}] FORCING FINAL ANSWER (no tools)")
        warning = f"""

⚠️ 【最后机会】已进行 {iteration} 次迭代,达到上限 {MAX_ITERATIONS}。
你必须立即给出最终答案!不要再调用任何工具!
直接根据已有信息输出答案。如果信息不足,给出最佳估计。
"""
        full_messages.append(SystemMessage(content=warning))
        llm = get_llm()
    elif iteration >= MAX_ITERATIONS - 2:
        # Near the cap: warn the model but still permit tool calls.
        warning = f"\n\n⚠️ 警告:已进行 {iteration} 次迭代,接近上限 {MAX_ITERATIONS},请尽快给出最终答案,不要再搜索。"
        full_messages.append(SystemMessage(content=warning))
        llm = get_llm_with_tools()
    else:
        # Normal path: cached tool-bound LLM singleton (avoids re-creation).
        llm = get_llm_with_tools()

    try:
        response = invoke_llm_with_retry(llm, full_messages)
    except Exception as e:
        print(f"[ERROR] LLM 调用失败: {type(e).__name__}: {str(e)}")
        raise

    # Always log iteration details for debugging.
    print(f"[Iteration {iteration}] LLM Response: {response.content[:200] if response.content else '(empty)'}...")
    if hasattr(response, 'tool_calls') and response.tool_calls:
        print(f"[Iteration {iteration}] Tool calls: {[tc['name'] for tc in response.tool_calls]}")

    return {
        "messages": [response],
        "iteration_count": iteration
    }
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
def should_continue(state: AgentState) -> Literal["tools", "end"]:
    """Route after the assistant node: execute tools or finish.

    Decision order:
    1. Iteration cap reached -> force end.
    2. Last message carries tool calls -> run the tool node.
    3. Otherwise -> the answer is ready, end.
    """
    iteration = state.get("iteration_count", 0)
    latest = state["messages"][-1]

    # Hard stop once the iteration budget is spent.
    if iteration >= MAX_ITERATIONS:
        print(f"[Router] Reached max iterations ({MAX_ITERATIONS}), forcing end")
        return "end"

    # getattr covers both "attribute missing" and "empty list" cases.
    if getattr(latest, "tool_calls", None):
        print(f"[Router] Has tool calls, continuing to tools")
        return "tools"

    print(f"[Router] No tool calls, ending")
    return "end"
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
# ========================================
|
| 403 |
+
# Graph 构建
|
| 404 |
+
# ========================================
|
| 405 |
+
|
| 406 |
+
def build_agent_graph():
    """
    Build and compile the agent graph.

    Flow:
        START -> assistant -> [should_continue] -> tools -> assistant -> ... -> END
    """
    workflow = StateGraph(AgentState)

    # Nodes: the reasoning step and the tool executor.
    workflow.add_node("assistant", assistant)
    workflow.add_node("tools", ToolNode(ALL_TOOLS))

    workflow.set_entry_point("assistant")

    # Router decides between another tool round and termination.
    workflow.add_conditional_edges(
        "assistant",
        should_continue,
        {"tools": "tools", "end": END},
    )

    # Tool output always flows back into the assistant.
    workflow.add_edge("tools", "assistant")

    return workflow.compile()
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
# ========================================
|
| 436 |
+
# 答案提取
|
| 437 |
+
# ========================================
|
| 438 |
+
|
| 439 |
+
def extract_final_answer(result: dict) -> str:
    """
    Extract the final answer from the agent result.

    Steps:
    1. Pick the best candidate message (prefer a tool-call-free AIMessage).
    2. Strip common English/Chinese answer prefixes.
    3. Strip trailing explanations.
    4. Unwrap JSON-style ``{"answer": ...}`` payloads.
    5. Normalize whitespace, quotes and numeric formatting.

    Returns the cleaned answer, or the fallback string when no usable
    content exists.
    """
    messages = result.get("messages", [])
    if not messages:
        print("[extract_final_answer] No messages in result")
        return "无法获取答案"

    # Prefer the AIMessage without tool_calls — that is the real final answer.
    content = None

    # First choice: AIMessage with content and no pending tool calls.
    for msg in reversed(messages):
        if isinstance(msg, AIMessage) and msg.content and str(msg.content).strip():
            if not (hasattr(msg, "tool_calls") and msg.tool_calls):
                content = msg.content
                break

    # Second choice: any AIMessage with content (even mid-tool-call).
    if content is None:
        for msg in reversed(messages):
            if isinstance(msg, AIMessage) and msg.content and str(msg.content).strip():
                content = msg.content
                break

    # Third choice: any message with content (possibly a ToolMessage).
    if content is None:
        for msg in reversed(messages):
            if hasattr(msg, "content") and msg.content and str(msg.content).strip():
                content = msg.content
                break

    print(f"[extract_final_answer] Raw content: {content[:500] if content else '(empty)'}...")

    if not content:
        print("[extract_final_answer] Empty content in all messages")
        return "无法获取答案"

    answer = content.strip()

    # Step 1: remove common leading phrases.
    prefix_patterns = [
        # English prefixes
        r'^(?:the\s+)?(?:final\s+)?answer\s*(?:is|:)\s*',
        r'^(?:the\s+)?result\s*(?:is|:)\s*',
        r'^(?:therefore|thus|so|hence)[,:]?\s*',
        r'^based\s+on\s+(?:the|my)\s+(?:analysis|research|calculations?)[,:]?\s*',
        r'^after\s+(?:analyzing|reviewing|checking)[^,]*[,:]?\s*',
        r'^according\s+to\s+[^,]*[,:]?\s*',
        # Chinese prefixes
        r'^(?:最终)?答案[是为::]\s*',
        r'^(?:结果|结论)[是为::]\s*',
        r'^(?:因此|所以|综上)[,,::]?\s*',
        r'^根据(?:以上)?(?:分析|信息|计算)[,,::]?\s*',
        r'^经过(?:分析|计算|查询)[,,::]?\s*',
    ]

    for pattern in prefix_patterns:
        answer = re.sub(pattern, '', answer, flags=re.IGNORECASE)

    # Step 2: remove trailing explanations.
    suffix_patterns = [
        r'\s*(?:This|That|The|It)\s+(?:is|was|represents|refers\s+to).*$',
        r'\s*[(\(].*[)\)]$',
        r'\s*[。\.]$',
        r'\s*\n\n.*$',  # drop extra paragraphs
    ]

    for pattern in suffix_patterns:
        answer = re.sub(pattern, '', answer, flags=re.IGNORECASE | re.DOTALL)

    # Step 3: unwrap JSON-formatted answers.
    json_patterns = [
        r'\{["\']?(?:final_?)?answer["\']?\s*:\s*["\']?([^"\'}\n]+)["\']?\}',
        r'"answer"\s*:\s*"([^"]+)"',
    ]
    for pattern in json_patterns:
        json_match = re.search(pattern, answer, re.IGNORECASE)
        if json_match:
            answer = json_match.group(1)
            break

    # Step 4: cleanup.
    answer = answer.strip()
    answer = re.sub(r'\s+', ' ', answer)  # collapse whitespace
    answer = answer.strip('"\'')  # strip surrounding quotes

    # Step 5: numeric formatting — drop thousands separators.
    if re.match(r'^[\d,\.]+$', answer):
        answer = answer.replace(',', '')

    return answer
|
| 539 |
+
|
| 540 |
+
|
| 541 |
+
def post_process_answer(answer: str, expected_type: str = None) -> str:
    """
    Normalize an answer according to an expected type hint.

    Args:
        answer: raw answer string
        expected_type: one of "number", "date", "boolean", "list" (or None)

    Returns:
        The normalized answer; falls back to the input unchanged when the
        type is unknown or nothing matches.
    """
    if expected_type == "number":
        # Strip thousands separators, then grab the first numeric token.
        number = re.search(r'-?\d+\.?\d*', answer.replace(',', ''))
        if number:
            return number.group()

    elif expected_type == "date":
        # Try to normalize to ISO YYYY-MM-DD.
        converters = (
            (r'(\d{4})-(\d{1,2})-(\d{1,2})',
             lambda m: f"{m.group(1)}-{int(m.group(2)):02d}-{int(m.group(3)):02d}"),
            (r'(\d{1,2})/(\d{1,2})/(\d{4})',
             lambda m: f"{m.group(3)}-{int(m.group(1)):02d}-{int(m.group(2)):02d}"),
        )
        for pattern, to_iso in converters:
            hit = re.search(pattern, answer)
            if hit:
                return to_iso(hit)

    elif expected_type == "boolean":
        normalized = answer.lower().strip()
        if normalized in ('yes', 'true', '是', '对', 'correct'):
            return "Yes"
        if normalized in ('no', 'false', '否', '不', '错', 'incorrect'):
            return "No"

    elif expected_type == "list":
        # Unify semicolon/ideographic separators into ", ".
        answer = re.sub(r'\s*[;;、]\s*', ', ', answer)

    return answer
|
| 579 |
+
|
| 580 |
+
|
| 581 |
+
# ========================================
|
| 582 |
+
# GaiaAgent 入口类
|
| 583 |
+
# ========================================
|
| 584 |
+
|
| 585 |
+
class GaiaAgent:
|
| 586 |
+
"""
|
| 587 |
+
GAIA Agent 入口类
|
| 588 |
+
|
| 589 |
+
使用方法:
|
| 590 |
+
agent = GaiaAgent()
|
| 591 |
+
answer = agent("Who founded Microsoft?")
|
| 592 |
+
"""
|
| 593 |
+
|
| 594 |
+
def __init__(self):
|
| 595 |
+
"""初始化 Agent"""
|
| 596 |
+
self.graph = build_agent_graph()
|
| 597 |
+
|
| 598 |
+
def _needs_reformatting(self, answer: str) -> bool:
|
| 599 |
+
"""检查答案是否需要重新格式化"""
|
| 600 |
+
if not answer or answer == "无法获取答案":
|
| 601 |
+
return False
|
| 602 |
+
indicators = [
|
| 603 |
+
answer.startswith('http'),
|
| 604 |
+
'URL:' in answer,
|
| 605 |
+
len(answer) > 300,
|
| 606 |
+
answer.count('\n') > 3,
|
| 607 |
+
answer.startswith('1.') and '2.' in answer,
|
| 608 |
+
answer.startswith('- '),
|
| 609 |
+
'...' in answer and len(answer) > 100,
|
| 610 |
+
]
|
| 611 |
+
return any(indicators)
|
| 612 |
+
|
| 613 |
+
def _force_format_answer(self, result: dict) -> str:
|
| 614 |
+
"""强制格式化答案"""
|
| 615 |
+
messages = result.get("messages", [])
|
| 616 |
+
format_prompt = (
|
| 617 |
+
"根据上述对话收集的信息,输出最终答案。\n\n"
|
| 618 |
+
"【强制要求】只输出答案本身,不要解释、不要前缀。\n"
|
| 619 |
+
"- 数字:直接输出(如 42)\n"
|
| 620 |
+
"- 人名/地名:直接输出(如 Albert Einstein)\n"
|
| 621 |
+
"- 日期:YYYY-MM-DD\n"
|
| 622 |
+
"- 是/否:Yes 或 No\n\n"
|
| 623 |
+
"最终答案:"
|
| 624 |
+
)
|
| 625 |
+
full_messages = [SystemMessage(content=SYSTEM_PROMPT)] + list(messages)
|
| 626 |
+
full_messages.append(HumanMessage(content=format_prompt))
|
| 627 |
+
llm = get_llm()
|
| 628 |
+
try:
|
| 629 |
+
print("[Reformat] Forcing answer formatting...")
|
| 630 |
+
response = invoke_llm_with_retry(llm, full_messages)
|
| 631 |
+
formatted = extract_final_answer({"messages": [response]})
|
| 632 |
+
print(f"[Reformat] Result: {formatted[:100]}...")
|
| 633 |
+
return formatted
|
| 634 |
+
except Exception as e:
|
| 635 |
+
print(f"[Reformat] Error: {e}")
|
| 636 |
+
return "无法获取答案"
|
| 637 |
+
|
| 638 |
+
def __call__(self, question: str, task_id: str = None) -> str:
|
| 639 |
+
"""
|
| 640 |
+
执行问答
|
| 641 |
+
|
| 642 |
+
Args:
|
| 643 |
+
question: 用户问题
|
| 644 |
+
task_id: 任务 ID(可选,用于下载附件)
|
| 645 |
+
|
| 646 |
+
Returns:
|
| 647 |
+
最终答案
|
| 648 |
+
"""
|
| 649 |
+
# 如果有 task_id,注入到问题中
|
| 650 |
+
if task_id:
|
| 651 |
+
question_with_id = f"[Task ID: {task_id}]\n\n{question}"
|
| 652 |
+
else:
|
| 653 |
+
question_with_id = question
|
| 654 |
+
|
| 655 |
+
# ===== RAG 前置短路:高置信度匹配直接返回 =====
|
| 656 |
+
try:
|
| 657 |
+
if rag_lookup_answer is not None:
|
| 658 |
+
hit = rag_lookup_answer(question, min_similarity=0.85)
|
| 659 |
+
if hit and hit.get("answer"):
|
| 660 |
+
print(f"[GaiaAgent] RAG short-circuit hit: similarity={hit.get('similarity', 0):.2f}")
|
| 661 |
+
if DEBUG:
|
| 662 |
+
print(f"[Final Answer] {hit['answer']}")
|
| 663 |
+
return str(hit["answer"]).strip()
|
| 664 |
+
except Exception as e:
|
| 665 |
+
if DEBUG:
|
| 666 |
+
print(f"[GaiaAgent] RAG short-circuit failed: {type(e).__name__}: {e}")
|
| 667 |
+
# ===== RAG 短路检查结束 =====
|
| 668 |
+
|
| 669 |
+
# 初始状态
|
| 670 |
+
initial_state = {
|
| 671 |
+
"messages": [HumanMessage(content=question_with_id)],
|
| 672 |
+
"iteration_count": 0
|
| 673 |
+
}
|
| 674 |
+
|
| 675 |
+
try:
|
| 676 |
+
# 执行 Agent
|
| 677 |
+
result = self.graph.invoke(initial_state)
|
| 678 |
+
|
| 679 |
+
# 提取答案
|
| 680 |
+
answer = extract_final_answer(result)
|
| 681 |
+
|
| 682 |
+
# 检查答案是否需要格式化
|
| 683 |
+
if self._needs_reformatting(answer):
|
| 684 |
+
print(f"[GaiaAgent] Answer needs reformatting: {answer[:50]}...")
|
| 685 |
+
answer = self._force_format_answer(result)
|
| 686 |
+
|
| 687 |
+
if DEBUG:
|
| 688 |
+
print(f"[Final Answer] {answer}")
|
| 689 |
+
|
| 690 |
+
return answer if answer else "无法获取答案"
|
| 691 |
+
|
| 692 |
+
except Exception as e:
|
| 693 |
+
import traceback
|
| 694 |
+
error_msg = f"Agent 执行出错: {type(e).__name__}: {str(e)}"
|
| 695 |
+
print(f"[ERROR] {error_msg}")
|
| 696 |
+
print(traceback.format_exc())
|
| 697 |
+
return error_msg
|
| 698 |
+
|
| 699 |
+
def run_with_history(self, messages: list) -> dict:
|
| 700 |
+
"""
|
| 701 |
+
带历史消息执行
|
| 702 |
+
|
| 703 |
+
Args:
|
| 704 |
+
messages: 消息历史列表
|
| 705 |
+
|
| 706 |
+
Returns:
|
| 707 |
+
完整结果字典
|
| 708 |
+
"""
|
| 709 |
+
initial_state = {
|
| 710 |
+
"messages": messages,
|
| 711 |
+
"iteration_count": 0
|
| 712 |
+
}
|
| 713 |
+
|
| 714 |
+
return self.graph.invoke(initial_state)
|
| 715 |
+
|
| 716 |
+
|
| 717 |
+
# ========================================
|
| 718 |
+
# 便捷函数
|
| 719 |
+
# ========================================
|
| 720 |
+
|
| 721 |
+
def run_agent(question: str, task_id: str = None) -> str:
|
| 722 |
+
"""
|
| 723 |
+
运行 Agent 的便捷函数
|
| 724 |
+
|
| 725 |
+
Args:
|
| 726 |
+
question: 用户问题
|
| 727 |
+
task_id: 任务 ID(可选)
|
| 728 |
+
|
| 729 |
+
Returns:
|
| 730 |
+
最终答案
|
| 731 |
+
"""
|
| 732 |
+
agent = GaiaAgent()
|
| 733 |
+
return agent(question, task_id)
|
| 734 |
+
|
| 735 |
+
|
| 736 |
+
# ========================================
|
| 737 |
+
# 测试
|
| 738 |
+
# ========================================
|
| 739 |
+
|
| 740 |
+
if __name__ == "__main__":
|
| 741 |
+
# 简单测试
|
| 742 |
+
agent = GaiaAgent()
|
| 743 |
+
|
| 744 |
+
# 测试计算
|
| 745 |
+
print("Test 1: Calculation")
|
| 746 |
+
answer = agent("What is 15% of 200?")
|
| 747 |
+
print(f"Answer: {answer}\n")
|
| 748 |
+
|
| 749 |
+
# 测试搜索
|
| 750 |
+
print("Test 2: Search")
|
| 751 |
+
answer = agent("Who founded Microsoft?")
|
| 752 |
+
print(f"Answer: {answer}\n")
|
app.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GAIA Agent - HuggingFace Spaces Evaluation Runner
|
| 3 |
+
基于 LangGraph 的 GAIA benchmark 评估智能体
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import time
|
| 8 |
+
import gradio as gr
|
| 9 |
+
import requests
|
| 10 |
+
import pandas as pd
|
| 11 |
+
|
| 12 |
+
from config import (
|
| 13 |
+
SCORING_API_URL,
|
| 14 |
+
DEBUG,
|
| 15 |
+
BATCH_QUESTION_DELAY,
|
| 16 |
+
)
|
| 17 |
+
from agent import GaiaAgent
|
| 18 |
+
|
| 19 |
+
# --- Constants ---
|
| 20 |
+
DEFAULT_API_URL = SCORING_API_URL
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# --- Agent Wrapper ---
|
| 24 |
+
class GAIAAgentWrapper:
|
| 25 |
+
"""
|
| 26 |
+
包装 GaiaAgent,适配 HuggingFace Spaces 评估接口
|
| 27 |
+
"""
|
| 28 |
+
def __init__(self):
|
| 29 |
+
print("Initializing GAIA Agent...")
|
| 30 |
+
self._agent = None
|
| 31 |
+
|
| 32 |
+
@property
|
| 33 |
+
def agent(self) -> GaiaAgent:
|
| 34 |
+
"""延迟初始化 Agent"""
|
| 35 |
+
if self._agent is None:
|
| 36 |
+
self._agent = GaiaAgent()
|
| 37 |
+
print("GAIA Agent initialized.")
|
| 38 |
+
return self._agent
|
| 39 |
+
|
| 40 |
+
def __call__(self, question: str, task_id: str = "") -> str:
|
| 41 |
+
"""
|
| 42 |
+
处理问题并返回答案
|
| 43 |
+
|
| 44 |
+
Args:
|
| 45 |
+
question: 问题文本
|
| 46 |
+
task_id: 任务 ID(用于下载附件)
|
| 47 |
+
|
| 48 |
+
Returns:
|
| 49 |
+
答案字符串
|
| 50 |
+
"""
|
| 51 |
+
if DEBUG:
|
| 52 |
+
print(f"Agent received question (first 100 chars): {question[:100]}...")
|
| 53 |
+
|
| 54 |
+
try:
|
| 55 |
+
if task_id:
|
| 56 |
+
answer = self.agent(question, task_id=task_id)
|
| 57 |
+
else:
|
| 58 |
+
answer = self.agent(question)
|
| 59 |
+
|
| 60 |
+
if DEBUG:
|
| 61 |
+
print(f"Agent returning answer: {answer[:100] if len(answer) > 100 else answer}")
|
| 62 |
+
|
| 63 |
+
return answer
|
| 64 |
+
except Exception as e:
|
| 65 |
+
error_msg = f"Agent error: {type(e).__name__}: {str(e)}"
|
| 66 |
+
print(error_msg)
|
| 67 |
+
return error_msg
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def run_and_submit_all(profile: gr.OAuthProfile | None):
|
| 71 |
+
"""
|
| 72 |
+
Fetches all questions, runs the GAIA Agent on them, submits all answers,
|
| 73 |
+
and displays the results.
|
| 74 |
+
"""
|
| 75 |
+
# --- Determine HF Space Runtime URL and Repo URL ---
|
| 76 |
+
space_id = os.getenv("SPACE_ID")
|
| 77 |
+
|
| 78 |
+
if profile:
|
| 79 |
+
username = f"{profile.username}"
|
| 80 |
+
print(f"User logged in: {username}")
|
| 81 |
+
else:
|
| 82 |
+
print("User not logged in.")
|
| 83 |
+
return "Please Login to Hugging Face with the button.", None
|
| 84 |
+
|
| 85 |
+
api_url = DEFAULT_API_URL
|
| 86 |
+
questions_url = f"{api_url}/questions"
|
| 87 |
+
submit_url = f"{api_url}/submit"
|
| 88 |
+
|
| 89 |
+
# 1. Instantiate Agent
|
| 90 |
+
try:
|
| 91 |
+
agent = GAIAAgentWrapper()
|
| 92 |
+
except Exception as e:
|
| 93 |
+
print(f"Error instantiating agent: {e}")
|
| 94 |
+
return f"Error initializing agent: {e}", None
|
| 95 |
+
|
| 96 |
+
# Agent code link for HuggingFace Spaces
|
| 97 |
+
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "local"
|
| 98 |
+
print(f"Agent code: {agent_code}")
|
| 99 |
+
|
| 100 |
+
# 2. Fetch Questions
|
| 101 |
+
print(f"Fetching questions from: {questions_url}")
|
| 102 |
+
try:
|
| 103 |
+
response = requests.get(questions_url, timeout=30)
|
| 104 |
+
response.raise_for_status()
|
| 105 |
+
questions_data = response.json()
|
| 106 |
+
if not questions_data:
|
| 107 |
+
print("Fetched questions list is empty.")
|
| 108 |
+
return "Fetched questions list is empty or invalid format.", None
|
| 109 |
+
print(f"Fetched {len(questions_data)} questions.")
|
| 110 |
+
except requests.exceptions.RequestException as e:
|
| 111 |
+
print(f"Error fetching questions: {e}")
|
| 112 |
+
return f"Error fetching questions: {e}", None
|
| 113 |
+
except requests.exceptions.JSONDecodeError as e:
|
| 114 |
+
print(f"Error decoding JSON response from questions endpoint: {e}")
|
| 115 |
+
return f"Error decoding server response for questions: {e}", None
|
| 116 |
+
except Exception as e:
|
| 117 |
+
print(f"An unexpected error occurred fetching questions: {e}")
|
| 118 |
+
return f"An unexpected error occurred fetching questions: {e}", None
|
| 119 |
+
|
| 120 |
+
# 3. Run Agent on all questions
|
| 121 |
+
results_log = []
|
| 122 |
+
answers_payload = []
|
| 123 |
+
total_questions = len(questions_data)
|
| 124 |
+
print(f"Running agent on {total_questions} questions...")
|
| 125 |
+
|
| 126 |
+
for idx, item in enumerate(questions_data):
|
| 127 |
+
task_id = item.get("task_id")
|
| 128 |
+
question_text = item.get("question")
|
| 129 |
+
|
| 130 |
+
if not task_id or question_text is None:
|
| 131 |
+
print(f"Skipping item with missing task_id or question: {item}")
|
| 132 |
+
continue
|
| 133 |
+
|
| 134 |
+
# Rate limit delay (skip first question)
|
| 135 |
+
if idx > 0 and BATCH_QUESTION_DELAY > 0:
|
| 136 |
+
print(f"Waiting {BATCH_QUESTION_DELAY}s before next question (rate limit)...")
|
| 137 |
+
time.sleep(BATCH_QUESTION_DELAY)
|
| 138 |
+
|
| 139 |
+
print(f"\n[{idx + 1}/{total_questions}] Processing task: {task_id}")
|
| 140 |
+
|
| 141 |
+
try:
|
| 142 |
+
submitted_answer = agent(question_text, task_id=task_id)
|
| 143 |
+
answers_payload.append({
|
| 144 |
+
"task_id": task_id,
|
| 145 |
+
"submitted_answer": submitted_answer
|
| 146 |
+
})
|
| 147 |
+
results_log.append({
|
| 148 |
+
"Task ID": task_id,
|
| 149 |
+
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
| 150 |
+
"Submitted Answer": submitted_answer
|
| 151 |
+
})
|
| 152 |
+
except Exception as e:
|
| 153 |
+
error_msg = f"AGENT ERROR: {type(e).__name__}: {e}"
|
| 154 |
+
print(f"Error running agent on task {task_id}: {e}")
|
| 155 |
+
results_log.append({
|
| 156 |
+
"Task ID": task_id,
|
| 157 |
+
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
| 158 |
+
"Submitted Answer": error_msg
|
| 159 |
+
})
|
| 160 |
+
|
| 161 |
+
if not answers_payload:
|
| 162 |
+
print("Agent did not produce any answers to submit.")
|
| 163 |
+
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 164 |
+
|
| 165 |
+
# 4. Prepare Submission
|
| 166 |
+
submission_data = {
|
| 167 |
+
"username": username.strip(),
|
| 168 |
+
"agent_code": agent_code,
|
| 169 |
+
"answers": answers_payload
|
| 170 |
+
}
|
| 171 |
+
status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
|
| 172 |
+
print(status_update)
|
| 173 |
+
|
| 174 |
+
# 5. Submit
|
| 175 |
+
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
| 176 |
+
try:
|
| 177 |
+
response = requests.post(submit_url, json=submission_data, timeout=60)
|
| 178 |
+
response.raise_for_status()
|
| 179 |
+
result_data = response.json()
|
| 180 |
+
final_status = (
|
| 181 |
+
f"Submission Successful!\n"
|
| 182 |
+
f"User: {result_data.get('username')}\n"
|
| 183 |
+
f"Overall Score: {result_data.get('score', 'N/A')}% "
|
| 184 |
+
f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
|
| 185 |
+
f"Message: {result_data.get('message', 'No message received.')}"
|
| 186 |
+
)
|
| 187 |
+
print("Submission successful.")
|
| 188 |
+
results_df = pd.DataFrame(results_log)
|
| 189 |
+
return final_status, results_df
|
| 190 |
+
except requests.exceptions.HTTPError as e:
|
| 191 |
+
error_detail = f"Server responded with status {e.response.status_code}."
|
| 192 |
+
try:
|
| 193 |
+
error_json = e.response.json()
|
| 194 |
+
error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
|
| 195 |
+
except requests.exceptions.JSONDecodeError:
|
| 196 |
+
error_detail += f" Response: {e.response.text[:500]}"
|
| 197 |
+
status_message = f"Submission Failed: {error_detail}"
|
| 198 |
+
print(status_message)
|
| 199 |
+
results_df = pd.DataFrame(results_log)
|
| 200 |
+
return status_message, results_df
|
| 201 |
+
except requests.exceptions.Timeout:
|
| 202 |
+
status_message = "Submission Failed: The request timed out."
|
| 203 |
+
print(status_message)
|
| 204 |
+
results_df = pd.DataFrame(results_log)
|
| 205 |
+
return status_message, results_df
|
| 206 |
+
except requests.exceptions.RequestException as e:
|
| 207 |
+
status_message = f"Submission Failed: Network error - {e}"
|
| 208 |
+
print(status_message)
|
| 209 |
+
results_df = pd.DataFrame(results_log)
|
| 210 |
+
return status_message, results_df
|
| 211 |
+
except Exception as e:
|
| 212 |
+
status_message = f"An unexpected error occurred during submission: {e}"
|
| 213 |
+
print(status_message)
|
| 214 |
+
results_df = pd.DataFrame(results_log)
|
| 215 |
+
return status_message, results_df
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
# --- Build Gradio Interface using Blocks ---
|
| 219 |
+
with gr.Blocks(title="GAIA Agent Evaluation") as demo:
|
| 220 |
+
gr.Markdown("# GAIA Agent Evaluation Runner")
|
| 221 |
+
gr.Markdown(
|
| 222 |
+
"""
|
| 223 |
+
**GAIA Agent** - 基于 LangGraph 的智能体,支持:
|
| 224 |
+
- RAG 知识库检索(高相似度直接返回答案)
|
| 225 |
+
- 网络搜索(DuckDuckGo)
|
| 226 |
+
- 文件处理(文本、ZIP、PDF、Excel)
|
| 227 |
+
- 代码执行(沙箱环境)
|
| 228 |
+
|
| 229 |
+
---
|
| 230 |
+
**Instructions:**
|
| 231 |
+
1. Log in to your Hugging Face account using the button below.
|
| 232 |
+
2. Click 'Run Evaluation & Submit All Answers' to start evaluation.
|
| 233 |
+
3. Wait for the agent to process all questions (this may take a while).
|
| 234 |
+
"""
|
| 235 |
+
)
|
| 236 |
+
|
| 237 |
+
gr.LoginButton()
|
| 238 |
+
|
| 239 |
+
run_button = gr.Button("Run Evaluation & Submit All Answers", variant="primary")
|
| 240 |
+
|
| 241 |
+
status_output = gr.Textbox(
|
| 242 |
+
label="Run Status / Submission Result",
|
| 243 |
+
lines=5,
|
| 244 |
+
interactive=False
|
| 245 |
+
)
|
| 246 |
+
results_table = gr.DataFrame(
|
| 247 |
+
label="Questions and Agent Answers",
|
| 248 |
+
wrap=True
|
| 249 |
+
)
|
| 250 |
+
|
| 251 |
+
run_button.click(
|
| 252 |
+
fn=run_and_submit_all,
|
| 253 |
+
outputs=[status_output, results_table]
|
| 254 |
+
)
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
if __name__ == "__main__":
|
| 258 |
+
print("\n" + "-" * 30 + " GAIA Agent Starting " + "-" * 30)
|
| 259 |
+
|
| 260 |
+
# Clear proxy settings for localhost
|
| 261 |
+
os.environ['NO_PROXY'] = 'localhost,127.0.0.1'
|
| 262 |
+
os.environ.pop('HTTP_PROXY', None)
|
| 263 |
+
os.environ.pop('HTTPS_PROXY', None)
|
| 264 |
+
os.environ.pop('http_proxy', None)
|
| 265 |
+
os.environ.pop('https_proxy', None)
|
| 266 |
+
|
| 267 |
+
# Check for SPACE_HOST and SPACE_ID at startup
|
| 268 |
+
space_host_startup = os.getenv("SPACE_HOST")
|
| 269 |
+
space_id_startup = os.getenv("SPACE_ID")
|
| 270 |
+
|
| 271 |
+
if space_host_startup:
|
| 272 |
+
print(f"SPACE_HOST found: {space_host_startup}")
|
| 273 |
+
print(f"Runtime URL: https://{space_host_startup}.hf.space")
|
| 274 |
+
else:
|
| 275 |
+
print("SPACE_HOST not found (running locally)")
|
| 276 |
+
|
| 277 |
+
if space_id_startup:
|
| 278 |
+
print(f"SPACE_ID found: {space_id_startup}")
|
| 279 |
+
print(f"Repo URL: https://huggingface.co/spaces/{space_id_startup}")
|
| 280 |
+
else:
|
| 281 |
+
print("SPACE_ID not found (running locally)")
|
| 282 |
+
|
| 283 |
+
print("-" * (60 + len(" GAIA Agent Starting ")) + "\n")
|
| 284 |
+
|
| 285 |
+
print("Launching GAIA Agent Evaluation Interface...")
|
| 286 |
+
demo.launch(debug=True, share=False)
|
config.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
配置管理模块 - GAIA Agent 配置
|
| 3 |
+
使用 .env 文件加载配置
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
|
| 10 |
+
# 加载 .env 文件(支持从父目录加载)
|
| 11 |
+
env_path = Path(__file__).parent / ".env"
|
| 12 |
+
if not env_path.exists():
|
| 13 |
+
env_path = Path(__file__).parent.parent / ".env"
|
| 14 |
+
load_dotenv(env_path)
|
| 15 |
+
|
| 16 |
+
# ========================================
|
| 17 |
+
# LLM 配置
|
| 18 |
+
# ========================================
|
| 19 |
+
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")
|
| 20 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 21 |
+
MODEL = os.getenv("MODEL", "gpt-4o-mini")
|
| 22 |
+
TEMPERATURE = float(os.getenv("TEMPERATURE", "0"))
|
| 23 |
+
|
| 24 |
+
# ========================================
|
| 25 |
+
# API 配置
|
| 26 |
+
# ========================================
|
| 27 |
+
SCORING_API_URL = os.getenv("SCORING_API_URL", "https://agents-course-unit4-scoring.hf.space")
|
| 28 |
+
|
| 29 |
+
# ========================================
|
| 30 |
+
# Agent 配置
|
| 31 |
+
# ========================================
|
| 32 |
+
MAX_ITERATIONS = int(os.getenv("MAX_ITERATIONS", "10"))
|
| 33 |
+
|
| 34 |
+
# ========================================
|
| 35 |
+
# 超时配置(秒)
|
| 36 |
+
# ========================================
|
| 37 |
+
TOOL_TIMEOUT = int(os.getenv("TOOL_TIMEOUT", "30"))
|
| 38 |
+
TOTAL_TIMEOUT = int(os.getenv("TOTAL_TIMEOUT", "300"))
|
| 39 |
+
LLM_TIMEOUT = int(os.getenv("LLM_TIMEOUT", "120")) # LLM 调用超时
|
| 40 |
+
|
| 41 |
+
# ========================================
|
| 42 |
+
# 搜索配置
|
| 43 |
+
# ========================================
|
| 44 |
+
SEARCH_MAX_RESULTS = int(os.getenv("SEARCH_MAX_RESULTS", "5"))
|
| 45 |
+
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "")
|
| 46 |
+
WIKIPEDIA_MAX_RESULTS = int(os.getenv("WIKIPEDIA_MAX_RESULTS", "2"))
|
| 47 |
+
ARXIV_MAX_RESULTS = int(os.getenv("ARXIV_MAX_RESULTS", "3"))
|
| 48 |
+
TAVILY_MAX_RESULTS = int(os.getenv("TAVILY_MAX_RESULTS", "3"))
|
| 49 |
+
|
| 50 |
+
# ========================================
|
| 51 |
+
# 文件处理配置
|
| 52 |
+
# ========================================
|
| 53 |
+
MAX_FILE_SIZE = int(os.getenv("MAX_FILE_SIZE", "10000"))
|
| 54 |
+
|
| 55 |
+
# ========================================
|
| 56 |
+
# RAG 配置
|
| 57 |
+
# ========================================
|
| 58 |
+
RAG_PERSIST_DIR = os.getenv("RAG_PERSIST_DIR", "./rag_index")
|
| 59 |
+
RAG_CSV_PATH = os.getenv("RAG_CSV_PATH", "data_clean.csv")
|
| 60 |
+
RAG_EMBEDDING_MODEL = os.getenv("RAG_EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
|
| 61 |
+
RAG_TOP_K = int(os.getenv("RAG_TOP_K", "3"))
|
| 62 |
+
|
| 63 |
+
# ========================================
|
| 64 |
+
# 速率限制配置
|
| 65 |
+
# ========================================
|
| 66 |
+
RATE_LIMIT_RETRY_MAX = int(os.getenv("RATE_LIMIT_RETRY_MAX", "5")) # 429错误最大重试次数
|
| 67 |
+
RATE_LIMIT_RETRY_BASE_DELAY = float(os.getenv("RATE_LIMIT_RETRY_BASE_DELAY", "10")) # 基础延迟秒数
|
| 68 |
+
BATCH_QUESTION_DELAY = float(os.getenv("BATCH_QUESTION_DELAY", "5")) # 批量测试问题间延迟秒数
|
| 69 |
+
|
| 70 |
+
# ========================================
|
| 71 |
+
# 调试配置
|
| 72 |
+
# ========================================
|
| 73 |
+
DEBUG = os.getenv("DEBUG", "false").lower() == "true"
|
| 74 |
+
|
| 75 |
+
# ========================================
|
| 76 |
+
# 路径配置
|
| 77 |
+
# ========================================
|
| 78 |
+
BASE_DIR = Path(__file__).parent
|
| 79 |
+
TEMP_DIR = BASE_DIR / "temp"
|
| 80 |
+
TEMP_DIR.mkdir(exist_ok=True)
|
data_clean.csv
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
content,metadata
|
| 2 |
+
"If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.","{""task_id"": ""e1fc63a2-da7a-432f-be78-7c4a95598703"", ""answer"": ""17"", ""steps"": ""1. Googled Eliud Kipchoge marathon pace to find 4min 37sec/mile\n2. Converted into fractions of hours.\n3. Found moon periapsis in miles (225,623 miles).\n4. Multiplied the two to find the number of hours and rounded to the nearest 100 hours."", ""tools"": ""1. A web browser.\n2. A search engine.\n3. A calculator."", ""num_steps"": 4, ""has_file"": false, ""level"": ""1""}"
|
| 3 |
+
How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.,"{""task_id"": ""8e867cd7-cff9-4e6c-867a-ff5ddc2550be"", ""answer"": ""3"", ""steps"": ""1. I did a search for Mercedes Sosa\n2. I went to the Wikipedia page for her\n3. I scrolled down to \""Studio albums\""\n4. I counted the ones between 2000 and 2009"", ""tools"": ""1. web browser\n2. google search"", ""num_steps"": 4, ""has_file"": false, ""level"": ""1""}"
|
| 4 |
+
"Here's a fun riddle that I think you'll enjoy.
|
| 5 |
+
|
| 6 |
+
You have been selected to play the final round of the hit new game show ""Pick That Ping-Pong"". In this round, you will be competing for a large cash prize. Your job will be to pick one of several different numbered ping-pong balls, and then the game will commence. The host describes how the game works.
|
| 7 |
+
|
| 8 |
+
A device consisting of a winding clear ramp and a series of pistons controls the outcome of the game. The ramp feeds balls onto a platform. The platform has room for three ping-pong balls at a time. The three balls on the platform are each aligned with one of three pistons. At each stage of the game, one of the three pistons will randomly fire, ejecting the ball it strikes. If the piston ejects the ball in the first position on the platform the balls in the second and third position on the platform each advance one space, and the next ball on the ramp advances to the third position. If the piston ejects the ball in the second position, the ball in the first position is released and rolls away, the ball in the third position advances two spaces to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform. If the piston ejects the ball in the third position, the ball in the first position is released and rolls away, the ball in the second position advances one space to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform.
|
| 9 |
+
|
| 10 |
+
The ramp begins with 100 numbered ping-pong balls, arranged in ascending order from 1 to 100. The host activates the machine and the first three balls, numbered 1, 2, and 3, advance to the platform. Before the random firing of the pistons begins, you are asked which of the 100 balls you would like to pick. If your pick is ejected by one of the pistons, you win the grand prize, $10,000.
|
| 11 |
+
|
| 12 |
+
Which ball should you choose to maximize your odds of winning the big prize? Please provide your answer as the number of the ball selected.","{""task_id"": ""ec09fa32-d03f-4bf8-84b0-1f16922c3ae4"", ""answer"": ""3"", ""steps"": ""Step 1: Evaluate the problem statement provided in my user's prompt\nStep 2: Consider the probability of any ball on the platform earning the prize.\nStep 3: Evaluate the ball in position one. The probability of it earning the prize, P1, is 1/3\nStep 4: Using a calculator, evaluate the ball in position two. The probability of it earning the prize, P2, is the difference between 1 and the product of the complementary probabilities for each trial\nP2 = 1 - (2/3)(2/3)\nP2 = 5/9\nStep 5: Using a calculator, evaluate the ball in position three. The probability of it earning the prize, P3, is the difference between 1 and the product of the complementary probabilities for each trial\nP3 = 1 - (2/3)(2/3)(2/3)\nP3 = 19/27\nStep 6: Consider the possible outcomes of numbers higher than 3.\nStep 7: For each trial, either 1 or 2 balls from the ramp will advance to the platform. For any given selection, there is a 50% chance that the ball advances to position 2 or position 3.\nStep 8: As position three holds the highest chance of earning the prize, select the only ball known to occupy position three with certainty, ball 3.\nStep 9: Report the correct answer to my user, \""3\"""", ""tools"": ""None"", ""num_steps"": 9, ""has_file"": false, ""level"": ""1""}"
|
| 13 |
+
"What was the volume in m^3 of the fish bag that was calculated in the University of Leicester paper ""Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?""","{""task_id"": ""5d0080cb-90d7-4712-bc33-848150e917d3"", ""answer"": ""0.1777"", ""steps"": ""1. Searched '\""Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?\""' on Google.\n2. Opened \""Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?\"" at https://journals.le.ac.uk/ojs1/index.php/jist/article/view/733.\n3. Clicked \""PDF\"".\n4. Found the calculations for the volume of the fish bag and noted them."", ""tools"": ""1. Web browser\n2. Search engine\n3. PDF access"", ""num_steps"": 4, ""has_file"": false, ""level"": ""1""}"
|
| 14 |
+
"In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?","{""task_id"": ""a1e91b78-d3d8-4675-bb8d-62741b4b68a6"", ""answer"": ""3"", ""steps"": ""1. Navigate to the YouTube link.\n2. Watch the video to see the highest number of bird species.\n3. Note the number."", ""tools"": ""1. Web browser\n2. Video parsing"", ""num_steps"": 3, ""has_file"": false, ""level"": ""1""}"
|
| 15 |
+
"Of the authors (First M. Last) that worked on the paper ""Pie Menus or Linear Menus, Which Is Better?"" in 2015, what was the title of the first paper authored by the one that had authored prior papers?","{""task_id"": ""46719c30-f4c3-4cad-be07-d5cb21eee6bb"", ""answer"": ""Mapping Human Oriented Information to Software Agents for Online Systems Usage"", ""steps"": ""1. Searched \""Pie Menus or Linear Menus, Which Is Better?\"" on Google.\n2. Opened \""Pie Menus or Linear Menus, Which Is Better?\"" on https://oda.oslomet.no/oda-xmlui/handle/10642/3162.\n3. Clicked each author's name.\n4. Noted the name that had no other papers listed.\n5. Searched \""Murano, Pietro\"" on Google.\n6. Opened http://www.pietromurano.org/.\n7. Clicked \""Publications\"".\n8. Found the earliest paper he contributed to."", ""tools"": ""1. Web browser\n2. Search engine"", ""num_steps"": 8, ""has_file"": false, ""level"": ""1""}"
|
| 16 |
+
"In Series 9, Episode 11 of Doctor Who, the Doctor is trapped inside an ever-shifting maze. What is this location called in the official script for the episode? Give the setting exactly as it appears in the first scene heading.","{""task_id"": ""4b6bb5f7-f634-410e-815d-e673ab7f8632"", ""answer"": ""THE CASTLE"", ""steps"": ""1. Search the web for “Doctor Who series 9 episode 11 official script”.\n2. Click result on the BBC website.\n3. Scroll through the PDF to read the script, noting that it takes place in a mechanical castle location.\n4. Scroll back to the first scene heading to note the answer, THE CASTLE"", ""tools"": ""1. Search engine\n2. Web browser\n3. PDF viewer"", ""num_steps"": 4, ""has_file"": false, ""level"": ""1""}"
|
| 17 |
+
"An office held a Secret Santa gift exchange where each of its twelve employees was assigned one other employee in the group to present with a gift. Each employee filled out a profile including three likes or hobbies. On the day of the gift exchange, only eleven gifts were given, each one specific to one of the recipient's interests. Based on the information in the document, who did not give a gift?","{""task_id"": ""cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb"", ""answer"": ""Fred"", ""steps"": ""1. Open the document.\n2. Look at gifts and recipient interests.\n3. Match Galileo Galilei biography (could apply to astronomy or books -> Miguel or Micah)\n4. Match fishing reel (only applies to fishing -> Harry)\n5. Match Raku programming guide (Perl language, but could also apply to JavaScript enthusiast - > Fred or Jun)\n6. Match chisel set (could apply to camping or woodworking, but Harry is already fulfilled -> Jun, so Raku guide is for Fred)\n7. Match custom dice (could apply to board games or tabletop RPGs -> Lucy or Sara)\n8. Match “War and Peace” American film copy (could apply to old movies or Audrey Hepburn -> Perry or Alex)\n9. Match yarn (only applies to knitting -> Micah, so the Galileo biography is for Miguel)\n10. Match \""One Piece\"" graphic novel (could apply to books or manga, but Micah already has yarn -> Alex, so the \""War and Peace\"" film is for Perry)\n11. Match \""War and Peace\"" novel (could apply to books or historical fiction novels, but Micah has yarn -> Tyson)\n12. Match Starbucks gift card (only applies to coffee -> Lucy, so the dice are for Sara)\n13. Match foam exercise mat (only applies to yoga -> Georgette)\n14. Note which recipients have gifts (Miguel, Harry, Fred, Jun, Sara, Perry, Micah, Alex, Tyson, Lucy, Georgette) and which does not (Rebecca).\n15. Find who was supposed to give Rebecca a gift (Fred)."", ""tools"": ""1. Word document access"", ""num_steps"": 15, ""has_file"": true, ""level"": ""1""}"
|
| 18 |
+
".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI","{""task_id"": ""2d83110e-a098-4ebb-9987-066c06fa42d0"", ""answer"": ""Right"", ""steps"": ""1. Read the instructions in reverse"", ""tools"": ""1. A word reversal tool / script"", ""num_steps"": 1, ""has_file"": false, ""level"": ""1""}"
|
| 19 |
+
"Each cell in the attached spreadsheet represents a plot of land. The color of the cell indicates who owns that plot. Green cells are plots owned by Earl Smith. Can Earl walk through every plot he owns (and no other plots) and return to his starting plot without backtracking? For this question, consider backtracking to be any instance where Earl would enter a plot of land he had already entered since leaving his starting plot.","{""task_id"": ""5cfb274c-0207-4aa7-9575-6ac0bd95d9b2"", ""answer"": ""No"", ""steps"": ""1. Open the spreadsheet\n2. Analyze the green cells.\n3. Note that the shape of Earl’s plots is not a loop. There are dead-ends that can’t be traversed without doubling back to a previously-traversed cell."", ""tools"": ""1. Excel\n2. Image recognition\n3. Color recognition"", ""num_steps"": 3, ""has_file"": true, ""level"": ""1""}"
|
| 20 |
+
"¬(A ∧ B) ↔ (¬A ∨ ¬B)
|
| 21 |
+
¬(A ∨ B) ↔ (¬A ∧ ¬B)
|
| 22 |
+
(A → B) ↔ (¬B → ¬A)
|
| 23 |
+
(A → B) ↔ (¬A ∨ B)
|
| 24 |
+
(¬A → B) ↔ (A ∨ ¬B)
|
| 25 |
+
¬(A → B) ↔ (A ∧ ¬B)
|
| 26 |
+
|
| 27 |
+
Which of the above is not logically equivalent to the rest? Provide the full statement that doesn't fit.","{""task_id"": ""27d5d136-8563-469e-92bf-fd103c28b57c"", ""answer"": ""(¬A → B) ↔ (A ∨ ¬B)"", ""steps"": ""1. Determine the truth values of the first statement: Recognize this is one of De Morgan's Laws showing how to distribute negation over the and conjunction - so it is a tautology.\n2. Determine the truth values of the second statement: Recognize this is one of De Morgan's Laws showing how to distribute negation over the or - so it is a tautology.\n3. Determine the truth values of the third statement: Recognize this is the definition of the contrapositive - so it is a tautology.\n4. Determine the truth values of the fourth statement: Recognize this as an alternative way of stating the conditional - so it is a tautology.\n5. Determine the truth values of the fifth statement: I don't recognize this, so check its truth values:\n6. A: True, B: True | (¬A → B) ↔ (A ∨ ¬B) = (¬T → T) ↔ (T ∨ ¬T) = (F → T) ↔ (T ∨ F) = T ↔ T = T\n7. A: True, B: False | (¬A → B) ↔ (A ∨ ¬B) = (¬T → F) ↔ (T ∨ ¬F) = (F → F) ↔ (T ∨ T) = T ↔ T = T\n8. A: False, B: True | (¬A → B) ↔ (A ∨ ¬B) = (¬F → T) ↔ (F ∨ ¬T) = (T → T) ↔ (F ∨ ¬T) = T ↔ (F ∨ F) = T ↔ F = F\n9. The fifth statement is not a tautology so is the statement that is not logically equivalent. We were asked for only one statement, so can stop here."", ""tools"": ""None"", ""num_steps"": 9, ""has_file"": false, ""level"": ""1""}"
|
| 28 |
+
"My family reunion is this week, and I was assigned the mashed potatoes to bring. The attendees include my married mother and father, my twin brother and his family, my aunt and her family, my grandma and her brother, her brother's daughter, and his daughter's family. All the adults but me have been married, and no one is divorced or remarried, but my grandpa and my grandma's sister-in-law passed away last year. All living spouses are attending. My brother has two children that are still kids, my aunt has one six-year-old, and my grandma's brother's daughter has three kids under 12. I figure each adult will eat about 1.5 potatoes of mashed potatoes and each kid will eat about 1/2 a potato of mashed potatoes, except my second cousins don't eat carbs. The average potato is about half a pound, and potatoes are sold in 5-pound bags. How many whole bags of potatoes do I need? Just give the number.","{""task_id"": ""dc28cf18-6431-458b-83ef-64b3ce566c10"", ""answer"": ""2"", ""steps"": ""1. Calculate the number of adults (mother, father, brother, brother's wife, aunt, aunt's husband, grandma, grandma's brother, grandma's brother's daughter, grandma's brother's daughter's husband, me = 11).\n2. Calculate the number of children (niece, nephew, cousin, grandma's brother's daughter's kids x3 = 6).\n3. Subtract the number of second cousins (grandma's brother's daughter's kids) (6 - 3 = 3).\n4. Calculate the adult potatoes (11 * 1.5 = 16.5).\n5. Calculate the child potatoes (3 * 0.5 = 1.5).\n6. Add to get the total potatoes (16.5 + 1.5 = 18).\n7. Multiply to get the pounds of potatoes (18 * 0.5 = 9 pounds).\n8. Calculate the number of 5-lb bags needed (9 / 5 = 1.8).\n9. Round up to get total bags (2)."", ""tools"": ""1. Calculator"", ""num_steps"": 9, ""has_file"": false, ""level"": ""1""}"
|
| 29 |
+
"In Emily Midkiff's June 2014 article in a journal named for the one of Hreidmar's sons that guarded his house, what word was quoted from two different authors in distaste for the nature of dragon depictions?","{""task_id"": ""b816bfce-3d80-4913-a07d-69b752ce6377"", ""answer"": ""fluffy"", ""steps"": ""1. Searched \""Hreidmar's sons\"" on Google.\n2. Opened https://en.wikipedia.org/wiki/Hrei%C3%B0marr.\n3. Noted Fafnir guarded his house.\n4. Searched \""Emily Midkiff June 2014 Fafnir\"" on Google.\n5. Opened \""Fafnir 2/2014 |\"" at http://journal.finfar.org/journal/archive/fafnir-22014/.\n6. Clicked the title '“Dragons are Tricksy”: The Uncanny Dragons of Children’s Literature'.\n7. Found the word in quotation marks from two different authors (Ruth Stein and Margaret Blount) in the text."", ""tools"": ""1. Web browser\n2. Search engine"", ""num_steps"": 7, ""has_file"": false, ""level"": ""1""}"
|
| 30 |
+
"Under DDC 633 on Bielefeld University Library's BASE, as of 2020, from what country was the unknown language article with a flag unique from the others?","{""task_id"": ""72e110e7-464c-453c-a309-90a95aed6538"", ""answer"": ""Guatemala"", ""steps"": ""1. Searched \""Bielefeld University Library's BASE\"" on Google.\n2. Opened https://www.base-search.net/.\n3. Clicked \""Browsing\"".\n4. Selected Clicked \""Dewey Decimal Classification (DDC) > 6 > 63 > 633.\n5. Refined to Unknown Language.\n6. Found the only article with a flag unique from the others in the search from pre-2020.\n7. Copied the country name from the institution."", ""tools"": ""1. Web browser\n2. Search engine"", ""num_steps"": 7, ""has_file"": false, ""level"": ""1""}"
|
| 31 |
+
"In the fictional language of Tizin, basic sentences are arranged with the Verb first, followed by the direct object, followed by the subject of the sentence. I want to express my love for apples to my Tizin friend.
|
| 32 |
+
|
| 33 |
+
The word that indicates oneself is ""Pa"" is the nominative form, ""Mato"" is the accusative form, and ""Sing"" is the genitive form.
|
| 34 |
+
|
| 35 |
+
The root verb that indicates an intense like for something is ""Maktay"". When it is used in the present, it is used in it's root form, when it is used in the preterit past, it is ""Tay"", and when it is used in the imperfect past, it is ""Aktay"". It is used differently than in English, and is better translated as ""is pleasing to"", meaning that the thing doing the liking is actually the object of the sentence rather than the subject.
|
| 36 |
+
|
| 37 |
+
The word for apples is borrowed from English in Tizin, and so it is ""Apple"" is the nominative form, ""Zapple"" is the accusative form, and ""Izapple"" is the genitive form.
|
| 38 |
+
|
| 39 |
+
Please translate ""I like apples"" to Tizin.","{""task_id"": ""42576abe-0deb-4869-8c63-225c2d75a95a"", ""answer"": ""Maktay mato apple"", ""steps"": ""1. Determine the order of words from the prompt (Verb - Object - Subject).\n2. Determine the present form of Like (\""Maktay\"")\n3. Determined that since the person doing the liking is the object of the sentence, the next word must be the one for oneself in object form.\n4. Determined the accusative form for onesself (\""mato\"").\n5. Determined the nominative form for apple. (\""apple\"").\n6. Put the words together in the correct order."", ""tools"": ""None"", ""num_steps"": 6, ""has_file"": false, ""level"": ""1""}"
|
| 40 |
+
"In Nature journal's Scientific Reports conference proceedings from 2012, in the article that did not mention plasmons or plasmonics, what nano-compound is studied? Don't use the prefix nano in your answer if there is one.","{""task_id"": ""b415aba4-4b68-4fc6-9b89-2c812e55a3e1"", ""answer"": ""diamond"", ""steps"": ""1. Searched \""nature scientific reports\"" on Google.\n2. Opened https://www.nature.com/srep/.\n3. Selected Explore Content > Research Articles.\n4. Filtered for Conference Proceedings from 2012.\n5. Opened each article link.\n6. Checked for \""plasmon\"" or \""plasmonic\"".\n7. Noted the nano-compound in the article that did not include either."", ""tools"": ""1. Web browser\n2. Search engine"", ""num_steps"": 7, ""has_file"": false, ""level"": ""1""}"
|
| 41 |
+
Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.,"{""task_id"": ""cca530fc-4052-43b2-b130-b30968d8aa44"", ""answer"": ""Rd5"", ""steps"": ""Step 1: Evaluate the position of the pieces in the chess position\nStep 2: Report the best move available for black: \""Rd5\"""", ""tools"": ""1. Image recognition tools"", ""num_steps"": 2, ""has_file"": true, ""level"": ""1""}"
|
| 42 |
+
"In the year 2022, and before December, what does ""R"" stand for in the three core policies of the type of content that was violated in the public logs on the Legume Wikipedia page?","{""task_id"": ""935e2cff-ae78-4218-b3f5-115589b19dae"", ""answer"": ""research"", ""steps"": ""1. Searched \""legume wikipedia\"" on Google.\n2. Opened \""Legume\"" on Wikipedia.\n3. Clicked \""View history\"".\n4. Clicked \""View logs for this page\"".\n5. Checked all types of logs.\n6. Set the date to November 2022.\n7. Followed the BLP link of the violation.\n8. Noted the meaning of \""R\""."", ""tools"": ""1. Web browser\n2. Search engine"", ""num_steps"": 8, ""has_file"": false, ""level"": ""1""}"
|
| 43 |
+
Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?,"{""task_id"": ""4fc2f1ae-8625-45b5-ab34-ad4433bc21f8"", ""answer"": ""FunkMonk"", ""steps"": ""1. Search \""Wikipedia featured articles promoted in november 2016\""\n2. Click through to the appropriate page and find the person who nominated Giganotosaurus."", ""tools"": ""1. web browser\n2. search engine"", ""num_steps"": 2, ""has_file"": false, ""level"": ""1""}"
|
| 44 |
+
"What writer is quoted by Merriam-Webster for the Word of the Day from June 27, 2022?","{""task_id"": ""5188369a-3bbe-43d8-8b94-11558f909a08"", ""answer"": ""Annie Levin"", ""steps"": ""1. Search \""merriam-webster word of the day\"" on Google search.\n2. Opened the top \""Word of the Day\"" result from the Merriam-Webster dictionary online.\n3. Clicked \""SEE ALL WORDS OF THE DAY\"" at the bottom.\n4. Scrolled down to June 27, 2022.\n5. Opened the Word of the Day (\""jingoism\"").\n6. Scrolled down and identified context quote for \""jingoism\"".\n7. Noted the name attributed to the quote. "", ""tools"": ""1. Web browser\n2. Search engine\n3. Audio capability"", ""num_steps"": 7, ""has_file"": false, ""level"": ""1""}"
|
| 45 |
+
"Given this table defining * on the set S = {a, b, c, d, e}
|
| 46 |
+
|
| 47 |
+
|*|a|b|c|d|e|
|
| 48 |
+
|---|---|---|---|---|---|
|
| 49 |
+
|a|a|b|c|b|d|
|
| 50 |
+
|b|b|c|a|e|c|
|
| 51 |
+
|c|c|a|b|b|a|
|
| 52 |
+
|d|b|e|b|e|d|
|
| 53 |
+
|e|d|b|a|d|c|
|
| 54 |
+
|
| 55 |
+
provide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.","{""task_id"": ""6f37996b-2ac7-44b0-8e68-6d28256631b4"", ""answer"": ""b, e"", ""steps"": ""1. Compile the markdown.\n2. Look at the table across the diagonal to see if any portions are not symmetrical.\n3. See that b * e != e * b, but all others are symmetrical."", ""tools"": ""1. Markdown"", ""num_steps"": 3, ""has_file"": false, ""level"": ""1""}"
|
| 56 |
+
"As a comma separated list with no whitespace, using the provided image provide all the fractions that use / as the fraction line and the answers to the sample problems. Order the list by the order in which the fractions appear.","{""task_id"": ""9318445f-fe6a-4e1b-acbf-c68228c9906a"", ""answer"": ""3/4,1/4,3/4,3/4,2/4,1/2,5/35,7/21,30/5,30/5,3/4,1/15,1/3,4/9,1/8,32/23,103/170"", ""steps"": ""1. Find the fractions that use / as the fraction line before the sample problems start: 3/4,1/4,3/4,3/4,2/4,1/2,5/35,7/21,30/5,30/5\n2. Solve the sample problems:\n3. Problem 1: 3/4\n4. Problem 2: 1/15\n5. Problem 3: 1/3\n6. Problem 4: 4/9\n7. Problem 5: 1/8\n8. Problem 6: 32/23\n9. Problem 7: 103/170\n10: Add them to the list. There were no more fractions with a / as the fraction line, so they can just be added in order: 3/4,1/4,3/4,3/4,2/4,1/2,5/35,7/21,30/5,30/5,3/4,1/15,1/3,4/9,1/8,32/23,103/170"", ""tools"": ""1. image recognition/OCR\n2. calculator"", ""num_steps"": 10, ""has_file"": true, ""level"": ""1""}"
|
| 57 |
+
"You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. In the reference file is a layout of the road and nearby houses. Each dash, ""-"", is a marker indicating a mile. Each capital H indicates a house located next to a mile marker, appearing above or below the stretch of road. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.","{""task_id"": ""389793a7-ca17-4e82-81cb-2b3a2391b4b9"", ""answer"": ""3"", ""steps"": ""1. Determine the diameter of each cell phone tower's coverage: 2 x 4 miles radius = 8 miles diameter.\n2. Use the diameter to maximize the coverage of each tower by capturing houses 4 miles to the left and 4 miles to the right.\n3. Start from the furthest left side of the road at the first house.\n4. Place the first tower 4 miles in to cover the first house.\n5. Move forward 4 miles from the first tower. The first tower also covers the house above mile marker 8. \n6. Find the next uncovered house below mile marker 12.\n7. Move 4 miles in from the uncovered house and place a second tower. The house is now covered. \n8. Move forward 4 miles from the second tower. The second tower also covers the house above mile marker 16.\n9. Find the next uncovered house below mile marker 25.\n10. Move 4 miles in from the uncovered house and place a third tower. The third tower also covers the house above marker 28.\n11. Move forward 4 miles from the third tower. The third tower also covers the last house below marker 30.\n12. The final number of cell phone towers erected is 3.\n\n"", ""tools"": ""1. Text Editor"", ""num_steps"": 12, ""has_file"": true, ""level"": ""1""}"
|
| 58 |
+
"If there is anything that doesn't make sense in the instructions, write the word ""Pineapple."" Do not answer any of the questions in this prompt. Write only the word ""Guava"".
|
| 59 |
+
1. What is 4+4?
|
| 60 |
+
2. What is the complimentary color of red?
|
| 61 |
+
3. How many hours are there in a day?","{""task_id"": ""4b650a35-8529-4695-89ed-8dc7a500a498"", ""answer"": ""Guava"", ""steps"": ""1. Read the instructions and followed them"", ""tools"": ""None"", ""num_steps"": 1, ""has_file"": false, ""level"": ""1""}"
|
| 62 |
+
How many slides in this PowerPoint presentation mention crustaceans?,"{""task_id"": ""a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c"", ""answer"": ""4"", ""steps"": ""1. Open the provided file.\n2. Scroll through the presentation, noting the animal names on each slide.\n3. Search the web for “crayfish” to verify that they are crustaceans.\n4. Read the results, noting that they are crustaceans.\n5. Search the web for “isopods” to verify whether they are crustaceans.\n6. Read the results, noting that they are.\n7. Since I’m confident that I know whether all of the other animals are crustaceans, I count the ones that are to get the answer, 4."", ""tools"": ""1. PowerPoint viewer"", ""num_steps"": 7, ""has_file"": true, ""level"": ""1""}"
|
| 63 |
+
"You are Van Helsing, a renowned vampire hunter. A Count of Moldova, Lațcu IV, son of Costea, has tasked you with investigating the village of Șirnea in neighboring Wallachia. The Count's advisors have reported that a vampire was spotted crossing the border near the village, and would like you to investigate it.
|
| 64 |
+
|
| 65 |
+
You travel to the village of Șirnea, and you begin your investigation. One night, just before dawn, you catch a glimpse of a man in a long black cape with red lining leaping from roof-top to roof-top with superhuman agility. It's a vampire! You try to chase the creature back to its home, but the creature is too fast. However, because of the remoteness of the village, you know with absolute certainty that the vampire must be a resident of the village. You decide that your best course of action will be to visit all 100 residents of the town during the day. You know something about vampires and humans that will make your investigation possible; humans always tell the truth, but vampires always lie.
|
| 66 |
+
|
| 67 |
+
In the afternoon, you go from house to house, speaking with all 100 residents of Șirnea. You ask everyone the same question: ""How many vampires are living in Șirnea"". Everyone in the village gives the same response, ""At least one of us is a human.""
|
| 68 |
+
|
| 69 |
+
How many residents of Șirnea have been turned into vampires?","{""task_id"": ""c714ab3a-da30-4603-bacd-d008800188b9"", ""answer"": ""100"", ""steps"": ""Step 1: Evaluate the problem statement posed by my user.\nStep 2: Consider one known possible case: 1 Vampire, 99 humans\nStep 3: Step through the possible case with the answer provided by every resident \""At least one of us is a human.\""\nFor humans, who always tell the truth, the answer \""At least one of us is a human.\"" is true for the known possible case\nFor the vampire, who always lies, the answer \""At least one of us is a human.\"" is true, which violates the rule requiring the vampire to lie\nDiscount the case 1 Vampire, 99 Humans as possible\nStep 4: Consider the worst case: 100 Vampires, 0 Humans\nStep 5: Step through the worst case with the answer provided by every resident \""At least one of us is a human.\""\nFor humans, who always tell the truth, the answer \""At least one of us is a human.\"" is false, but 0 humans provide this response, making this statement irrelevant\nFor the vampire, who always lies, the answer \""At least one of us is a human.\"" is false, which respects the rule requiring vampires to lie\nConfirm the worst case as a provisional answer: 100 Vampires, 0 humans, answer: \""100\""\nStep 6: Consider a case with only one human: 99 Vampires, 1 Human\nStep 7: Step through the case with the answer provided by every resident \""At least one of us is a human.\""\nFor humans, who always tell the truth, the answer \""At least one of us is a human.\"" is true\nFor the vampire, who always lies, the answer \""At least one of us is a human.\"" is true, which violates the rule requiring vampires to lie\nDiscount the case of 99 Vampires, 1 Human as possible\nStep 8: Report the correct response to my user, \""100\"""", ""tools"": ""None"", ""num_steps"": 8, ""has_file"": false, ""level"": ""1""}"
|
| 70 |
+
"Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.
|
| 71 |
+
|
| 72 |
+
What does Teal'c say in response to the question ""Isn't that hot?""","{""task_id"": ""9d191bce-651d-4746-be2d-7ef8ecadb9c2"", ""answer"": ""Extremely"", ""steps"": ""1. Follow the link\n2. Watch the clip until the question \""Isn't that hot\"" is asked\n3. Take note of the reply."", ""tools"": ""1. Web browser\n2. Video processing software\n3. Audio processing software"", ""num_steps"": 3, ""has_file"": false, ""level"": ""1""}"
|
| 73 |
+
"You are given this Excel file as a map. You start on the START cell and move toward the END cell. You are allowed to move two cells per turn, and you may move up, down, left, or right. You may not move fewer than two cells, and you may not move backward. You must avoid moving onto any blue cells. On the eleventh turn, what is the 6-digit hex code (without prefix) of the color of the cell where you land after moving?","{""task_id"": ""65afbc8a-89ca-4ad5-8d62-355bb401f61d"", ""answer"": ""F478A7"", ""steps"": ""1. Opened Map.xlsx.\n2. Counted 11 turns of 2 spaces each (22 spaces) along the path of non-blue cells.\n3. Opened cell formatting for the cell.\n4. Clicked the \""Fill\"" tab.\n5. Clicked \""More Colors...\""\n6. Noted the hex code of the color."", ""tools"": ""1. Access to Excel files\n2. Color recognition\n3. Calculator (or ability to count)"", ""num_steps"": 6, ""has_file"": true, ""level"": ""1""}"
|
| 74 |
+
What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?,"{""task_id"": ""cabe07ed-9eca-40ea-8ead-410ef5e83f91"", ""answer"": ""Louvrier"", ""steps"": ""1. Search for \""1.E Exercises LibreText Introductory Chemistry\""\n2. Read to see the horse doctor mentioned."", ""tools"": ""1. Web browser\n2. Search engine"", ""num_steps"": 2, ""has_file"": false, ""level"": ""1""}"
|
| 75 |
+
"I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:
|
| 76 |
+
|
| 77 |
+
milk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts
|
| 78 |
+
|
| 79 |
+
I need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.","{""task_id"": ""3cef3a44-215e-4aed-8e3b-b1e3f08063b7"", ""answer"": ""broccoli, celery, fresh basil, lettuce, sweet potatoes"", ""steps"": ""Step 1: Evaluate the list provided by my user, eliminating objects which are neither fruits nor vegetables:\nsweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\nStep 2: Remove all items from the list which are botanical fruits, leaving a list of vegetables:\nsweet potatoes, fresh basil, broccoli, celery, lettuce\nStep 3: Alphabetize the remaining list as requested by my user:\nbroccoli, celery, fresh basil, lettuce, sweet potatoes\nStep 4: Provide the correct response in the requested format:\n\""broccoli\ncelery\nfresh basil\nlettuce\nsweet potatoes\"""", ""tools"": ""No tools required"", ""num_steps"": 4, ""has_file"": false, ""level"": ""1""}"
|
| 80 |
+
"Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.
|
| 81 |
+
|
| 82 |
+
In your response, please only list the ingredients, not any measurements. So if the recipe calls for ""a pinch of salt"" or ""two cups of ripe strawberries"" the ingredients on the list would be ""salt"" and ""ripe strawberries"".
|
| 83 |
+
|
| 84 |
+
Please format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.","{""task_id"": ""99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3"", ""answer"": ""cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries"", ""steps"": ""Step 1: Load the file supplied to me by my user.\nStep 2: Using speech-to-text tools, convert the audio file to plain text and store it for the candidate word list:\n\n\""In a saucepan, combine ripe strawberries, granulated sugar, freshly squeezed lemon juice, and cornstarch. Cook the mixture over medium heat, stirring constantly, until it thickens to a smooth consistency. Remove from heat and stir in a dash of pure vanilla extract. Allow the strawberry pie filling to cool before using it as a delicious and fruity filling for your pie crust.\""\n\nStep 3: Evaluate the candidate word list and process it, stripping each ingredient encountered to a provisional response list:\n\nripe strawberries\ngranulated sugar\nfreshly squeezed lemon juice\ncornstarch\npure vanilla extract\n\nStep 4: Alphabetize the list of ingredients as requested by my user to create a finalized response:\n\ncornstarch\nfreshly squeezed lemon juice\ngranulated sugar\npure vanilla extract\nripe strawberries\n\nStep 5: Report the correct response to my user:\n\n\""cornstarch\nfreshly squeezed lemon juice\ngranulated sugar\npure vanilla extract\nripe strawberries\"""", ""tools"": ""1. A file interface\n2. A speech-to-text tool"", ""num_steps"": 5, ""has_file"": true, ""level"": ""1""}"
|
| 85 |
+
"In the Scikit-Learn July 2017 changelog, what other predictor base command received a bug fix? Just give the name, not a path.","{""task_id"": ""d0633230-7067-47a9-9dbf-ee11e0a2cdd6"", ""answer"": ""BaseLabelPropagation"", ""steps"": ""1. Searched \""Scikit-Learn July 2017 changelog\"" on Google.\n2. Opened \""Release History\"" from the Scikit-Learn website.\n3. Clicked \""Other versions\"" in the upper left.\n4. Opened the links, starting from the bottom, until one was found that included the \""July 2017\"" changelog under the News.\n5. Looked for the \""Bug fixes\"" section.\n6. Looked under \""Other predictors\"" in that section."", ""tools"": ""1. Web browser\n2. Search engine"", ""num_steps"": 6, ""has_file"": false, ""level"": ""1""}"
|
| 86 |
+
Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.,"{""task_id"": ""305ac316-eef6-4446-960a-92d80d542f82"", ""answer"": ""Wojciech"", ""steps"": ""1. Search \""Polish-language version of Everybody Loves Raymond\"" and pull up the Wiki page for Wszyscy kochają Romana.\n2. See that Bartłomiej Kasprzykowski is marked as playing Ray and go to his Wiki page.\n3. See that he is stated to have played Wojciech Płaska in Magda M."", ""tools"": ""None"", ""num_steps"": 3, ""has_file"": false, ""level"": ""1""}"
|
| 87 |
+
"On the BBC Earth YouTube video of the Top 5 Silliest Animal Moments, what species of bird is featured?","{""task_id"": ""0383a3ee-47a7-41a4-b493-519bdefe0488"", ""answer"": ""Rockhopper penguin"", ""steps"": ""1. Search \""top 5 silliest animal moments bbc earth youtube\"" on Google search.\n2. Open the top link to \""Top 5 Silliest Animal Moments! | BBC Earth - YouTube\"".\n3. Listen to the video until the species is named."", ""tools"": ""1. Web browser\n2. Search engine\n3. Video recognition tools"", ""num_steps"": 3, ""has_file"": false, ""level"": ""1""}"
|
| 88 |
+
What is the final numeric output from the attached Python code?,"{""task_id"": ""f918266a-b3e0-4914-865d-4faa564f1aef"", ""answer"": ""0"", ""steps"": ""1. Run the attached Python code"", ""tools"": ""1. Python"", ""num_steps"": 1, ""has_file"": true, ""level"": ""1""}"
|
| 89 |
+
How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?,"{""task_id"": ""11af4e1a-5f45-467d-9aeb-46f4bb0bf034"", ""answer"": ""6"", ""steps"": ""1. Search the internet for \""blocks in bert base\""\n2. Examine the search results page to locate the answer (12)\n3. Search the internet for \""attention is all you need layers\""\n4, Navigate to https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf from the search results page\n5. Examine the architecture section of the PDF to locate the answer (12)\n6. Calculate the difference between the two numbers"", ""tools"": ""1. Web browser\n2. Search engine\n3. Calculator"", ""num_steps"": 6, ""has_file"": false, ""level"": ""1""}"
|
| 90 |
+
"Bob was invited to participate in a game show, and he advanced to the final round. The final round offered Bob the chance to win a large sum by playing a game against the host. The host has 30 shiny prop coins, each of which is worth $1,000 if Bob manages to win them by playing the game. The host hides the coins in three different prize boxes and then shuffles their order. The only rule restricting the host's coin placement is that one box must contain at least 2 coins, and one box must contain 6 more coins than another box. In order to play, Bob must submit three guesses, one guess for the number of coins in each box. The box is then opened and the number of coins is revealed. If Bob's guess is a number greater than the number of coins in the box, Bob earns no coins. If Bob guesses a number equal to or less than the number of coins in the box, Bob wins a number of coins equal to his guess.
|
| 91 |
+
|
| 92 |
+
If Bob plays uses the optimal strategy, what's the minimum amount of money he can win from the game?","{""task_id"": ""e142056d-56ab-4352-b091-b56054bd1359"", ""answer"": ""16000"", ""steps"": ""Step 1: Evaluate the problem statement provided by my user, storing the relevant information: \n30 coins with a value of $1,000 distributed between 3 boxes.\nEach box must contain at least 2 coins\nOne box must contain 6 more coins than another\n\nStep 2: Evaluate the base distribution: 2-8-20, noting that two boxes must contain at least 8 coins\n\nStep 3: Evaluate the most even allowable distribution: 8,8,14, noting that two boxes must contain at least 8 coins\n\nStep 4: Evaluate a case where Bob guesses 8 for each box in the outlier distributions.\nStep 5: For the worst case 2-8-20 distribution, Bob wins 0+8+8 = 16 coins\nStep 6: For the 8-8-14 distribution, Bob wins 8+8+8 = 24 coins\nStep 7: Convert the worst-case coin count to a prize value, 16*$1,000 = $16,000\nStep 8: Report the correct answer to my user: \""$16,000\"""", ""tools"": ""1. A calculator"", ""num_steps"": 8, ""has_file"": false, ""level"": ""1""}"
|
| 93 |
+
"Pull out the sentence in the following 5x7 block of text. Read from left to right and use all of the letters in order:
|
| 94 |
+
|
| 95 |
+
THESE
|
| 96 |
+
AGULL
|
| 97 |
+
GLIDE
|
| 98 |
+
DPEAC
|
| 99 |
+
EFULL
|
| 100 |
+
YTOMY
|
| 101 |
+
CHAIR","{""task_id"": ""50ad0280-0819-4bd9-b275-5de32d3b5bcb"", ""answer"": ""The seagull glided peacefully to my chair."", ""steps"": ""1. I start with the first line, \""T H E S E\"" and proceed to the next, \""A G U L L\"". At this point, I am able to discern that \""A G U L L\"" is probably meant to be \""A GULL\"". However, I continue to read through the rest of the lines to get a sense of any other words that might jump out that would substantiate \""A GULL\"" being accurate both semantically and syntactically. 2. So now I am on the last line and decide to work backwards. \""CHAIR\"" is on the last line all by itself and this does seem a plausible fit as a full word rather than a fragment of another word. When I look to the line directly above \""Y T O M Y\"", the word \""my\"" jumps out and this is a natural accompaniment to the noun often used to indicate possession. \n3. Eliminating the \""MY\"" at the end of \""Y T O MY\"" leaves \""Y T O\"" remaining in the line and I immediately recognize the preposition \""TO\"". It is a this point I am fairly confident that \""TO MY CHAIR\"" is most likely accurate. Given that there is only a \""Y\"" left, I discern it is more than likely the end of a word located in the row above.\n4. I am now on the fifth row down and am looking at the letters \""E F U L L\"" Attaching the \""Y\"" left over from the sixth row below I see \""E F U L L Y\"" I recognize the word \""FULLY\"" I know it can stand alone as an adverb or it can serve as a suffix to a larger adverb.\n5. Detaching the \""FULLY\"", leaves the \""E\"" alone on the line. Knowing it does not represent a word on its own in the English language, I look to attach it to the line above (row 4).\n6. The fourth row reads \""D P E A C\"". Adding the \""E\"" to the end, the first word I can separate out is \""ACE\"". However \""ACEFULLY\"" is not a word nor does \""ACE FULLY TO MY CHAIR\"" make sense. 
When working my way left through the line, continuing to attach each letter as I go, I land on the \""P\"" and am fairly confident that the word is \""PEACEFULLY\"".\n7. Eliminating the \""PEAC\"" from the row leaves me left with a \""D\"". Now I look at the row above, row 3 and see that the row comprises the word \""GLIDE\"" Adding the \""D\"" to the end of the word would not only be permissible in terms of a displaying appropriate tense but it also makes sense as I add it to the fragment I have so far. I now can read \""GLIDED PEACEFULLY TO MY CHAIR\"".\n8. Now, I am on the second line and if I were to read it from there on down it would read \""A GULL GLIDED PEACEFULLY TO MY CHAIR\"". While this reads well and makes sense semantically and syntactically on its own, it does not make sense when I add the first row. THESE A GULL GLIDED PEACEFULLY TO MY CHAIR. So now I am left with the conclusion that \""A GULL\"" is not correct. Either it is part of a larger word or the letters need to be broken down further. At a quick glace, I can see that they don't make sense being broken down further so I leave \""GULL\"" and add the \""A\"" to the string above. Immediately my eye sees that \""A can be added to \""SE\"" to make \""SEA\"" and that the remaining\nletters spell the word \""THE\"" I now know the sentence reads \""The seagull glided peacefully to my chair."", ""tools"": ""None"", ""num_steps"": 8, ""has_file"": false, ""level"": ""1""}"
|
| 102 |
+
"On Cornell Law School website's legal information institute, under the fifth section of federal rules alphabetically, what word was deleted in the last amendment to the first rule in the article that has ""witnesses"" in the most titles as of 2021?","{""task_id"": ""7673d772-ef80-4f0f-a602-1bf4485c9b43"", ""answer"": ""inference"", ""steps"": ""1. Searched \""Cornell Law School legal information institute\"" on Google.\n2. Opened https://www.law.cornell.edu/.\n3. Clicked Get The Law > Federal Rules > Federal Rules of Evidence (fourth section down).\n4. Found the article that has \""witnesses\"" in the most titles (VII).\n5. Opened the first rule (701).\n6. Scrolled to the last amendment as of 2021 (2011 amendment).\n7. Found the word that was deleted (inference)."", ""tools"": ""1. Web browser\n2. Search engine"", ""num_steps"": 7, ""has_file"": false, ""level"": ""1""}"
|
| 103 |
+
"Of the cities within the United States where U.S. presidents were born, which two are the farthest apart from the westernmost to the easternmost going east, giving the city names only? Give them to me in alphabetical order, in a comma-separated list","{""task_id"": ""c365c1c7-a3db-4d5e-a9a1-66f56eae7865"", ""answer"": ""Braintree, Honolulu"", ""steps"": ""1. Searched \""cities where us presidents are born\"" on Google.\n2. Opened \""List of presidents of the United States by home state\"" on Wikipedia.\n3. Searched the eastern cities to find the easternmost one (Braintree, MA).\n4. Checked the westernmost city (Honolulu, HI)."", ""tools"": ""1. Search engine\n2. Web browser"", ""num_steps"": 4, ""has_file"": false, ""level"": ""1""}"
|
| 104 |
+
"According to Girls Who Code, how long did it take in years for the percentage of computer scientists that were women to change by 13% from a starting point of 37%?","{""task_id"": ""7d4a7d1d-cac6-44a8-96e8-ea9584a70825"", ""answer"": ""22"", ""steps"": ""1. Searched \""Girls Who Code\"" on Google.\n2. Opened https://girlswhocode.com/.\n3. Clicked \""About Us\"".\n4. Noted that the chart started at 37% and declined to 24%.\n5. Subtracted the marked years to find the number of years (2017 - 1995 = 22)."", ""tools"": ""1. Web browser\n2. Search engine\n3. Calculator"", ""num_steps"": 5, ""has_file"": false, ""level"": ""1""}"
|
| 105 |
+
What was the complete title of the book in which two James Beard Award winners recommended the restaurant where Ali Khan enjoyed a New Mexican staple in his cost-conscious TV show that started in 2015? Write the numbers in plain text if there are some in the title.,"{""task_id"": ""dc22a632-937f-4e6a-b72f-ba0ff3f5ff97"", ""answer"": ""Five Hundred Things To Eat Before It's Too Late: and the Very Best Places to Eat Them"", ""steps"": ""1. Searched \""Ali Khan New Mexico staple TV show\"" on Google.\n2. Opened \""Albuquerque | Cheap Eats\"" at https://www.cookingchanneltv.com/shows/cheap-eats/episodes/albuquerque.\n3. Noted the New Mexico staple and the list of restaurants.\n4. Searched \""Albuquerque Cheap Eats carne avodava\"" on Google.\n5. Confirmed the restaurant name (Papa Felipe's) from the results.\n6. Searched \""James Beard Award winners Papa Felipe's\"" on Google.\n7. Opened \""Papa Felipe's Mexican Restaurant - Albuquerque, New ...\"" at https://www.nmgastronome.com/?p=4572.\n8. Clicked the link on the book title.\n9. Copied the full book title from Amazon."", ""tools"": ""1. Web browser\n2. Search engine"", ""num_steps"": 9, ""has_file"": false, ""level"": ""1""}"
|
| 106 |
+
How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?,"{""task_id"": ""3f57289b-8c60-48be-bd80-01f8099ca449"", ""answer"": ""519"", ""steps"": ""1. Search \""yankee stats\"" to find their MLB stats page.\n2. Set the data to the 1977 regular season.\n3. Sort to find the most walks.\n4. See how many at bats the player had."", ""tools"": ""1. web browser\n2. search engine"", ""num_steps"": 4, ""has_file"": false, ""level"": ""1""}"
|
| 107 |
+
"In Audre Lorde’s poem “Father Son and Holy Ghost”, what is the number of the stanza in which some lines are indented?","{""task_id"": ""23dd907f-1261-4488-b21c-e9185af91d5e"", ""answer"": ""2"", ""steps"": ""1. Search the web for “Audre Lorde Father Son and Holy Ghost”.\n2. Click on Poetry Foundation result.\n3. Note the stanza that appears to have lines indented, the second one.\n4. Return to search results to confirm.\n5. Click on second result.\n6. Confirm that the indentation appears in the second stanza here as well."", ""tools"": ""1. Search engine\n2. Web browser"", ""num_steps"": 6, ""has_file"": false, ""level"": ""1""}"
|
| 108 |
+
"Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(
|
| 109 |
+
|
| 110 |
+
Could you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.","{""task_id"": ""1f975693-876d-457b-a649-393859e79bf3"", ""answer"": ""132, 133, 134, 197, 245"", ""steps"": ""Step 1: Load the file supplied by my user.\nStep 2: Using audio processing tools, convert the text of the audio file to speech:\n\n\""Before you all go, I want to remind you that the midterm is next week. Here's a little hint; you should be familiar with the differential equations on page 245, problems that are very similar to problems 32, 33, and 44 from that page might be on the test. And also some of you might want to brush up on the last page in the integration section, page 197. I know some of you struggled on last week's quiz. I foresee problem 22 from page 197 being on your midterm. Oh, and don't forget to brush up on the section on related rates, on pages 132, 133, and 134.\""\n\nStep 3: Evaluate the converted audio, recording each instance of page numbers: 245, 197, 197, 132, 133, 134\nStep 4: Sort the page numbers in ascending order, omitting duplicates, and store this list as the correct answer to my user's request: 132, 133, 134, 197, 245\nStep 5: Report the correct response to my user: \""132, 133, 134, 197, 245\"""", ""tools"": ""1. A file interface\n2. A speech-to-text audio processing tool"", ""num_steps"": 5, ""has_file"": true, ""level"": ""1""}"
|
| 111 |
+
"On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?","{""task_id"": ""840bfca7-4f7b-481a-8794-c560c340185d"", ""answer"": ""80GSFC21M0002"", ""steps"": ""1. Google \""June 6, 2023 Carolyn Collins Petersen Universe Today\""\n2. Find the relevant link to the scientific paper and follow that link\n3. Open the PDF. \n4. Search for NASA award number"", ""tools"": ""1. Web browser\n2. Search engine\n3. Access to academic journal websites"", ""num_steps"": 4, ""has_file"": false, ""level"": ""1""}"
|
| 112 |
+
What was the actual enrollment count of the clinical trial on H. pylori in acne vulgaris patients from Jan-May 2018 as listed on the NIH website?,"{""task_id"": ""a0068077-79f4-461a-adfe-75c1a4148545"", ""answer"": ""90"", ""steps"": ""1. Searched \""nih\"" on Google search.\n2. Clicked the top link to nih.gov.\n3. Searched \""h pylori acne\"" in the search box.\n4. Clicked \""More\"" and selected \""Clinical Trials\"".\n5. Clicked the result about H. Pylori and acne.\n6. Checked the date to confirm it was January to May 2018.\n7. Opened \""Tabular View\"".\n8. Scrolled down to Actual Enrollment and recorded the number."", ""tools"": ""1. Search engine\n2. Web browser"", ""num_steps"": 8, ""has_file"": false, ""level"": ""1""}"
|
| 113 |
+
Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.,"{""task_id"": ""bda648d7-d618-4883-88f4-3466eabd860e"", ""answer"": ""Saint Petersburg"", ""steps"": ""1. Search \""Kuznetzov Nedoshivina 2010\""\n2. Find the 2010 paper \""A catalogue of type specimens of the Tortricidae described by V. I. Kuznetzov from Vietnam and deposited in the Zoological Institute, St. Petersburg\"""", ""tools"": ""1. search engine"", ""num_steps"": 2, ""has_file"": false, ""level"": ""1""}"
|
| 114 |
+
"A standard Rubik’s cube has been broken into cubes making up its sides. The cubes are jumbled, and one is removed. There are 6 cubes with one colored face, 12 edge cubes with two colored faces, and 8 corner cubes with three colored faces. All blue cubes have been found. All cubes directly left, right, above, and below the orange center cube have been found, along with the center cube. The green corners have all been found, along with all green that borders yellow. For all orange cubes found, the opposite face’s cubes have been found. The removed cube has two colors on its faces. What are they? Answer using a comma separated list, with the colors ordered alphabetically.","{""task_id"": ""50ec8903-b81f-4257-9450-1085afd2c319"", ""answer"": ""green, white"", ""steps"": ""1. Set up a standard Rubik's cube (red opposite orange, white opposite yellow, green opposite blue).\n2. Eliminated blue cubes, along with adjacent colors.\n3. Eliminated orange cubes, along with adjacent colors.\n4. Eliminated green corners and the green/yellow edge.\n5. Eliminated red, opposite of orange, cubes and adjacent colors.\n6. Identified the last possible two-face cube."", ""tools"": ""1. Rubik's cube model"", ""num_steps"": 6, ""has_file"": false, ""level"": ""1""}"
|
| 115 |
+
"What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.","{""task_id"": ""cf106601-ab4f-4af9-b045-5295fe67b37d"", ""answer"": ""CUB"", ""steps"": ""1. Look up the 1928 Summer Olympics on Wikipedia\n2. Look at a table of athletes from countries.\n3. See that two countries had 1 and 2 athletes, so disregard those and choose the Cuba as CUB."", ""tools"": ""None"", ""num_steps"": 3, ""has_file"": false, ""level"": ""1""}"
|
| 116 |
+
"Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.","{""task_id"": ""a0c07678-e491-4bbc-8f0b-07405144218f"", ""answer"": ""Yoshida, Uehara"", ""steps"": ""1. Look up Taishō Tamai on Wikipedia\n2. See the pitcher with the number 18 (before) is Kōsei Yoshida and number 20 (after) is Kenta Uehara"", ""tools"": ""1. Wikipedia"", ""num_steps"": 2, ""has_file"": false, ""level"": ""1""}"
|
| 117 |
+
The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.,"{""task_id"": ""7bd855d8-463d-4ed5-93ca-5fe35145f733"", ""answer"": ""89706.00"", ""steps"": ""1. Open the attached file.\n2. Read the columns representing different menu items. Note that they all appear to be food except for the “soda” column.\n3. Write a function to sum the relevant columns.\n4. Ensure the answer follows the specified formatting."", ""tools"": ""1. Excel\n2. Calculator"", ""num_steps"": 4, ""has_file"": true, ""level"": ""1""}"
|
| 118 |
+
What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?,"{""task_id"": ""5a0c1adf-205e-4841-a666-7c3ef95def9d"", ""answer"": ""Claus"", ""steps"": ""1. Look at the Malko Competition page on Wikipedia\n2. Scan the winners to see that the 1983 winner, Claus Peter Flor is stated to be from East Germany."", ""tools"": ""None"", ""num_steps"": 2, ""has_file"": false, ""level"": ""1""}"
|
extension_tools.py
ADDED
|
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
扩展工具模块 - GAIA Agent 扩展功能
|
| 3 |
+
包含:parse_pdf, parse_excel, image_ocr, transcribe_audio
|
| 4 |
+
|
| 5 |
+
注意:这些工具需要额外的依赖库,如果导入失败会优雅降级。
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
from typing import Optional, List
|
| 10 |
+
from langchain_core.tools import tool
|
| 11 |
+
|
| 12 |
+
from config import MAX_FILE_SIZE, TOOL_TIMEOUT
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# ========================================
|
| 16 |
+
# PDF 解析工具
|
| 17 |
+
# ========================================
|
| 18 |
+
|
| 19 |
+
@tool
def parse_pdf(file_path: str, page_numbers: str = "all") -> str:
    """
    Parse a PDF file and extract its text content.

    Args:
        file_path: Path to the PDF file.
        page_numbers: Page selection:
            - "all": every page
            - "1": page 1 only
            - "1-5": pages 1 through 5
            - "1,3,5": pages 1, 3 and 5

    Returns:
        Extracted text, or a human-readable error message string.

    Limitations:
        - Scanned PDFs have no text layer; use OCR instead.
        - Complex layouts may produce out-of-order text.
    """
    try:
        import pdfplumber
    except ImportError:
        return "PDF 解析不可用:请安装 pdfplumber 库 (pip install pdfplumber)"

    if not os.path.exists(file_path):
        return f"文件不存在: {file_path}"

    if not file_path.lower().endswith('.pdf'):
        return f"不是 PDF 文件: {file_path}"

    try:
        with pdfplumber.open(file_path) as pdf:
            total_pages = len(pdf.pages)

            # Resolve the requested page selection.
            # Clamp the range start at 0: the old code allowed e.g. "0-5" to
            # produce range(-1, ...), and pdf.pages[-1] silently returns the
            # LAST page mislabeled as "第 0 页". Malformed specs ("abc") are
            # reported explicitly instead of surfacing as a generic parse error.
            try:
                if page_numbers == "all":
                    pages_to_read = range(total_pages)
                elif "-" in page_numbers:
                    start, end = map(int, page_numbers.split("-"))
                    pages_to_read = range(max(start - 1, 0), min(end, total_pages))
                elif "," in page_numbers:
                    pages_to_read = [int(p) - 1 for p in page_numbers.split(",")]
                    pages_to_read = [p for p in pages_to_read if 0 <= p < total_pages]
                else:
                    page_num = int(page_numbers) - 1
                    if 0 <= page_num < total_pages:
                        pages_to_read = [page_num]
                    else:
                        return f"页码超出范围,PDF 共有 {total_pages} 页"
            except ValueError:
                return f"无效的页码格式: {page_numbers}"

            # Extract text page by page, skipping pages with no text layer.
            text_parts = []
            for i in pages_to_read:
                page = pdf.pages[i]
                text = page.extract_text()
                if text:
                    text_parts.append(f"--- 第 {i + 1} 页 ---\n{text}")

            if not text_parts:
                return "PDF 中没有提取到文本内容(可能是扫描版,请尝试使用 OCR)"

            result = "\n\n".join(text_parts)

            # Truncate to keep tool output within the context budget.
            if len(result) > MAX_FILE_SIZE:
                return result[:MAX_FILE_SIZE] + f"\n\n... [内容已截断,共 {len(result)} 字符]"

            return result

    except Exception as e:
        return f"PDF 解析出错: {type(e).__name__}: {str(e)}"
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
# ========================================
|
| 94 |
+
# Excel 解析工具
|
| 95 |
+
# ========================================
|
| 96 |
+
|
| 97 |
+
@tool
def parse_excel(file_path: str, sheet_name: Optional[str] = None, max_rows: int = 100) -> str:
    """
    Parse the contents of an Excel workbook.

    Args:
        file_path: Excel file path (.xlsx, .xls).
        sheet_name: Worksheet name; defaults to the first sheet.
        max_rows: Maximum number of rows to read (default 100).

    Returns:
        The table content rendered as Markdown, plus sheet metadata,
        or a human-readable error message string.
    """
    try:
        import pandas as pd
    except ImportError:
        return "Excel 解析不可用:请安装 pandas 和 openpyxl 库"

    if not os.path.exists(file_path):
        return f"文件不存在: {file_path}"

    try:
        # Open the workbook exactly once under a context manager.
        # The previous implementation read the file twice (pd.read_excel +
        # pd.ExcelFile) and never closed the ExcelFile handle.
        with pd.ExcelFile(file_path) as excel_file:
            sheet_names = excel_file.sheet_names
            df = excel_file.parse(sheet_name if sheet_name else 0, nrows=max_rows)

        # Assemble the report: metadata header followed by the table.
        output = []
        output.append(f"工作表: {sheet_names}")
        output.append(f"当前读取: {sheet_name or sheet_names[0]}")
        output.append(f"数据形状: {df.shape[0]} 行 x {df.shape[1]} 列")
        output.append("")
        output.append(df.to_markdown(index=False))

        result = "\n".join(output)

        # Truncate to keep tool output within the context budget.
        if len(result) > MAX_FILE_SIZE:
            return result[:MAX_FILE_SIZE] + "\n\n... [内容已截断]"

        return result

    except Exception as e:
        return f"Excel 解析出错: {type(e).__name__}: {str(e)}"
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
# ========================================
|
| 152 |
+
# 图片 OCR 工具
|
| 153 |
+
# ========================================
|
| 154 |
+
|
| 155 |
+
@tool
def image_ocr(file_path: str, language: str = "eng") -> str:
    """
    Run OCR text recognition on an image.

    Args:
        file_path: Image path (png/jpg/jpeg/bmp/gif/tiff).
        language: Tesseract language code:
            - "eng": English
            - "chi_sim": Simplified Chinese
            - "chi_tra": Traditional Chinese
            - "eng+chi_sim": multiple languages

    Returns:
        The recognized text, or a human-readable error message string.

    Note:
        Requires the Tesseract OCR engine to be installed on the system.
    """
    try:
        import pytesseract
        from PIL import Image
    except ImportError:
        return "OCR 不可用:请安装 pytesseract 和 Pillow 库"

    if not os.path.exists(file_path):
        return f"文件不存在: {file_path}"

    # Validate the extension before handing the file to PIL.
    valid_extensions = {'.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff', '.tif'}
    ext = os.path.splitext(file_path)[1].lower()
    if ext not in valid_extensions:
        return f"不支持的图片格式: {ext},支持: {', '.join(valid_extensions)}"

    try:
        # Use a context manager so the image file handle is released
        # deterministically (the original left it open until GC).
        with Image.open(file_path) as image:
            text = pytesseract.image_to_string(image, lang=language)

        if not text.strip():
            return "图片中没有识别到文字内容"

        text = text.strip()

        # Truncate to keep tool output within the context budget.
        if len(text) > MAX_FILE_SIZE:
            return text[:MAX_FILE_SIZE] + "\n\n... [内容已截断]"

        return text

    except pytesseract.TesseractNotFoundError:
        return "OCR 引擎未安装:请安装 Tesseract OCR (https://github.com/tesseract-ocr/tesseract)"
    except Exception as e:
        return f"OCR 识别出错: {type(e).__name__}: {str(e)}"
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
# ========================================
|
| 215 |
+
# 音频转写工具
|
| 216 |
+
# ========================================
|
| 217 |
+
|
| 218 |
+
@tool
def transcribe_audio(file_path: str, language: str = "auto") -> str:
    """
    Transcribe an audio file to text using the OpenAI Whisper model.

    Args:
        file_path: Audio path (mp3/wav/m4a/ogg/flac).
        language: Language code — "auto" for auto-detection, or an ISO
            code such as "en", "zh", "ja".

    Returns:
        The transcription prefixed with the detected language, or a
        human-readable error message string.
    """
    try:
        import whisper
    except ImportError:
        return "音频转写不可用:请安装 openai-whisper 库 (pip install openai-whisper)"

    if not os.path.exists(file_path):
        return f"文件不存在: {file_path}"

    # Reject unsupported containers up front.
    valid_extensions = {'.mp3', '.wav', '.m4a', '.ogg', '.flac', '.wma', '.aac'}
    ext = os.path.splitext(file_path)[1].lower()
    if ext not in valid_extensions:
        return f"不支持的音频格式: {ext},支持: {', '.join(valid_extensions)}"

    try:
        # "base" model trades off speed against accuracy.
        model = whisper.load_model("base")

        # Only pin the language when the caller did not ask for auto-detect.
        transcribe_kwargs = {} if language == "auto" else {"language": language}
        result = model.transcribe(file_path, **transcribe_kwargs)

        text = result.get("text", "").strip()
        if not text:
            return "音频中没有识别到语音内容"

        # Prefix the transcript with what Whisper detected.
        detected_lang = result.get("language", "unknown")
        output = f"[检测到语言: {detected_lang}]\n\n{text}"

        # Truncate to keep tool output within the context budget.
        if len(output) > MAX_FILE_SIZE:
            return output[:MAX_FILE_SIZE] + "\n\n... [内容已截断]"

        return output

    except Exception as e:
        return f"音频转写出错: {type(e).__name__}: {str(e)}"
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
# ========================================
|
| 283 |
+
# 视觉分析工具(可选,基于多模态 LLM)
|
| 284 |
+
# ========================================
|
| 285 |
+
|
| 286 |
+
@tool
def analyze_image(file_path: str, question: str = "请描述这张图片的内容") -> str:
    """
    Analyze an image with a multimodal LLM.

    Suitable for:
    - describing image content,
    - extracting data from charts,
    - reading text embedded in pictures (smarter than plain OCR).

    Args:
        file_path: Image path.
        question: Question to ask about the image.

    Returns:
        The LLM's analysis, or a human-readable error message string.
    """
    try:
        import base64
        from langchain_openai import ChatOpenAI
        from langchain_core.messages import HumanMessage
        from config import OPENAI_BASE_URL, OPENAI_API_KEY, MODEL
    except ImportError:
        return "图片分析不可用:缺少必要的依赖"

    if not os.path.exists(file_path):
        return f"文件不存在: {file_path}"

    try:
        # Encode the raw image bytes for a data: URL payload.
        with open(file_path, "rb") as fh:
            encoded = base64.b64encode(fh.read()).decode("utf-8")

        # Map the extension to a MIME type; fall back to PNG when unknown.
        mime_type = {
            '.png': 'image/png',
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.gif': 'image/gif',
            '.webp': 'image/webp',
        }.get(os.path.splitext(file_path)[1].lower(), 'image/png')

        # Multimodal message: the question plus the inline image.
        content_parts = [
            {"type": "text", "text": question},
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime_type};base64,{encoded}"},
            },
        ]

        # 60s timeout and max_retries=1 guard against a hung endpoint.
        vision_llm = ChatOpenAI(
            model=MODEL,
            base_url=OPENAI_BASE_URL,
            api_key=OPENAI_API_KEY,
            timeout=60,
            max_retries=1,
        )
        return vision_llm.invoke([HumanMessage(content=content_parts)]).content

    except Exception as e:
        return f"图片分析出错: {type(e).__name__}: {str(e)}"
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
# ========================================
|
| 360 |
+
# 导出扩展工具列表
|
| 361 |
+
# ========================================
|
| 362 |
+
# Extension tools exported for agent registration. Each tool degrades
# gracefully: when its optional dependency is missing it returns an
# explanatory error string instead of raising.
EXTENSION_TOOLS = [
    parse_pdf,
    parse_excel,
    image_ocr,
    transcribe_audio,
    analyze_image,
]
|
rag.py
ADDED
|
@@ -0,0 +1,518 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
RAG 模块 - GAIA 知识库检索增强生成
|
| 3 |
+
基于 GAIA metadata 构建预置知识库,提供问题解题参考
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import csv
|
| 8 |
+
import json
|
| 9 |
+
from typing import Optional, List
|
| 10 |
+
|
| 11 |
+
from langchain_core.documents import Document
|
| 12 |
+
from langchain_core.tools import tool
|
| 13 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 14 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 15 |
+
from langchain_openai import ChatOpenAI
|
| 16 |
+
|
| 17 |
+
try:
|
| 18 |
+
from langchain_community.vectorstores import FAISS
|
| 19 |
+
except ImportError:
|
| 20 |
+
from langchain.vectorstores import FAISS
|
| 21 |
+
|
| 22 |
+
from config import (
|
| 23 |
+
OPENAI_BASE_URL,
|
| 24 |
+
OPENAI_API_KEY,
|
| 25 |
+
MODEL,
|
| 26 |
+
TEMPERATURE,
|
| 27 |
+
RAG_PERSIST_DIR,
|
| 28 |
+
RAG_CSV_PATH,
|
| 29 |
+
RAG_EMBEDDING_MODEL,
|
| 30 |
+
RAG_TOP_K,
|
| 31 |
+
DEBUG,
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
# 使用本地 HuggingFace Embedding(免费,无需 API)
|
| 35 |
+
try:
|
| 36 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 37 |
+
USE_LOCAL_EMBEDDING = True
|
| 38 |
+
except ImportError:
|
| 39 |
+
try:
|
| 40 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 41 |
+
USE_LOCAL_EMBEDDING = True
|
| 42 |
+
except ImportError:
|
| 43 |
+
from langchain_openai import OpenAIEmbeddings
|
| 44 |
+
USE_LOCAL_EMBEDDING = False
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# ========================================
|
| 48 |
+
# RAG Manager
|
| 49 |
+
# ========================================
|
| 50 |
+
|
| 51 |
+
class GAIARAGManager:
    """GAIA RAG manager.

    Builds a knowledge base from GAIA metadata and retrieves similar,
    previously-solved questions so the agent can reuse solving steps and
    tool suggestions. Advice generation deliberately avoids handing back
    the stored answer directly (callers that want the raw answer read it
    from document metadata themselves).
    """

    def __init__(self, persist_dir: str = RAG_PERSIST_DIR):
        # Directory where the FAISS index is persisted / loaded from.
        self.persist_dir = persist_dir

        # Heavy resources (embeddings, LLM, vector store) are lazily
        # initialized on first access.
        self._embeddings = None
        self._llm = None
        self._vectorstore = None
        self._initialized = False

        # The text splitter is lightweight, so create it eagerly.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", "。", ".", " ", ""]
        )

        # Prompt used to turn retrieved similar questions into advice.
        self.rag_prompt = ChatPromptTemplate.from_messages([
            ("system", """你是一个解题策略顾问。基于相似问题的解题经验,为新问题提供解题建议。

注意:
1. 只提供解题思路和工具建议,不要直接给出答案
2. 参考历史问题的解题步骤,但要根据新问题调整
3. 如果相似问题不太相关,明确说明

相似问题参考:
{context}"""),
            ("human", "新问题:{question}\n\n请给出解题建议:")
        ])

    @property
    def embeddings(self):
        """Lazily load and cache the embedding model."""
        if self._embeddings is None:
            if DEBUG:
                print("[RAG] 正在加载嵌入模型...")
            if USE_LOCAL_EMBEDDING:
                # Local HuggingFace embeddings: free, no API key needed.
                self._embeddings = HuggingFaceEmbeddings(
                    model_name=RAG_EMBEDDING_MODEL,
                    model_kwargs={'device': 'cpu'},
                    encode_kwargs={'normalize_embeddings': True}
                )
            else:
                self._embeddings = OpenAIEmbeddings(
                    base_url=OPENAI_BASE_URL,
                    api_key=OPENAI_API_KEY,
                )
            if DEBUG:
                print("[RAG] 嵌入模型加载完成")
        return self._embeddings

    @property
    def llm(self):
        """Lazily create and cache the chat LLM used for advice generation."""
        if self._llm is None:
            self._llm = ChatOpenAI(
                model=MODEL,
                temperature=TEMPERATURE,
                base_url=OPENAI_BASE_URL,
                api_key=OPENAI_API_KEY,
            )
        return self._llm

    @property
    def vectorstore(self) -> Optional[FAISS]:
        """Lazily load the vector store on first access (may remain None)."""
        if not self._initialized:
            self._load_index()
            self._initialized = True
        return self._vectorstore

    @vectorstore.setter
    def vectorstore(self, value):
        self._vectorstore = value

    def _load_index(self):
        """Load an existing FAISS index, or bootstrap from CSV if absent."""
        index_file = os.path.join(self.persist_dir, "index.faiss")
        if os.path.exists(index_file):
            try:
                self.vectorstore = FAISS.load_local(
                    self.persist_dir,
                    self.embeddings,
                    allow_dangerous_deserialization=True
                )
                if DEBUG:
                    print(f"[RAG] 已加载索引: {self.persist_dir}")
            except Exception as e:
                if DEBUG:
                    print(f"[RAG] 加载索引失败: {e}")
                self.vectorstore = None
        else:
            # No persisted index: try to initialize from the default CSV.
            self._init_from_csv()

    def _init_from_csv(self):
        """Initialize the vector store from the first candidate CSV that exists."""
        # Check several possible locations (configured path, module-relative).
        possible_paths = [
            RAG_CSV_PATH,
            os.path.join(os.path.dirname(__file__), RAG_CSV_PATH),
            os.path.join(os.path.dirname(__file__), "data_clean.csv"),
        ]

        for csv_path in possible_paths:
            if os.path.exists(csv_path):
                if DEBUG:
                    print(f"[RAG] 从 CSV 初始化: {csv_path}")
                self.load_csv(csv_path)
                return

        if DEBUG:
            print("[RAG] 未找到 CSV 文件,知识库为空")

    def load_csv(self, csv_path: str):
        """Load documents from a CSV file and (re)build the index.

        Expected CSV columns:
            content:  question text (embedded for retrieval)
            metadata: JSON metadata (answer, steps, tools, has_file)

        Raises:
            FileNotFoundError: if ``csv_path`` does not exist.
        """
        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"CSV 文件不存在: {csv_path}")

        documents = []
        with open(csv_path, newline="", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                content = row.get("content", "")
                if not content:
                    continue

                # Metadata is stored as a JSON string; tolerate bad rows.
                try:
                    metadata = json.loads(row.get("metadata", "{}"))
                except json.JSONDecodeError:
                    metadata = {}

                metadata["csv_source"] = csv_path
                documents.append(Document(page_content=content, metadata=metadata))

        if not documents:
            if DEBUG:
                print("[RAG] CSV 中没有有效文档")
            return

        # Build the vector store from all documents at once.
        self.vectorstore = FAISS.from_documents(documents, self.embeddings)

        # Persist to disk so later runs can skip re-embedding.
        os.makedirs(self.persist_dir, exist_ok=True)
        self.vectorstore.save_local(self.persist_dir)

        if DEBUG:
            print(f"[RAG] 已加载 {len(documents)} 条文档")

    def retrieve(self, query: str, k: int = RAG_TOP_K) -> List[Document]:
        """Return up to ``k`` documents most similar to ``query``.

        Args:
            query: Query text.
            k: Number of documents to return.

        Returns:
            Matching documents; empty list when no index is loaded.
        """
        if self.vectorstore is None:
            return []

        return self.vectorstore.similarity_search(query, k=k)

    def retrieve_with_scores(self, query: str, k: int = RAG_TOP_K) -> List[tuple]:
        """Return up to ``k`` ``(document, score)`` pairs for ``query``.

        The score is a FAISS L2 distance: smaller means more similar.

        Args:
            query: Query text.
            k: Number of documents to return.

        Returns:
            ``[(doc, score), ...]``; empty list when no index is loaded.
        """
        if self.vectorstore is None:
            return []

        return self.vectorstore.similarity_search_with_score(query, k=k)

    def get_solving_hints(self, question: str, k: int = RAG_TOP_K, score_threshold: float = 1.5) -> str:
        """Build a hint text from similar questions' steps and tools.

        Args:
            question: New question to solve.
            k: Number of documents to retrieve.
            score_threshold: Maximum accepted FAISS L2 distance
                (smaller distance = more similar).

        Returns:
            Hint text; empty string when nothing relevant is found.
        """
        docs_with_scores = self.retrieve_with_scores(question, k=k)

        if not docs_with_scores:
            return ""

        # Drop matches whose distance exceeds the threshold.
        relevant_docs = [(doc, score) for doc, score in docs_with_scores if score < score_threshold]

        if not relevant_docs:
            return ""

        hints = []
        for i, (doc, score) in enumerate(relevant_docs, 1):
            meta = doc.metadata
            steps = meta.get('steps', '')
            tools = meta.get('tools', '')
            has_file = meta.get('has_file', False)

            hint_parts = [f"### 参考 {i} (相似度: {1/(1+score):.2f})"]
            hint_parts.append(f"**相似问题**: {doc.page_content[:100]}...")

            if steps:
                hint_parts.append(f"**解题步骤**: {steps[:300]}...")
            if tools:
                hint_parts.append(f"**推荐工具**: {tools}")
            if has_file:
                hint_parts.append("**注意**: 该问题有附件文件")

            hints.append("\n".join(hint_parts))

        return "\n\n".join(hints)

    def query(self, question: str, k: int = RAG_TOP_K) -> str:
        """RAG query: retrieve similar questions, then generate advice via LLM.

        Args:
            question: User question.
            k: Number of documents to retrieve.

        Returns:
            LLM-generated solving advice, or a fallback message when the
            knowledge base has no match.
        """
        # 1. Retrieve related documents.
        docs = self.retrieve(question, k=k)

        if not docs:
            return "知识库中没有找到相似问题。建议直接分析问题并使用适当的工具。"

        # 2. Build the prompt context from the retrieved documents.
        context_parts = []
        for i, doc in enumerate(docs, 1):
            meta = doc.metadata
            # NOTE: `or 'N/A'` guards against metadata rows where the
            # 'answer' key exists but is null (slicing None would raise).
            context_parts.append(f"""
[相似问题 {i}]
问题: {doc.page_content}
解题步骤: {meta.get('steps', 'N/A')}
使用工具: {meta.get('tools', 'N/A')}
有附件: {'是' if meta.get('has_file') else '否'}
答案格式参考: {(meta.get('answer') or 'N/A')[:50]}...
""")

        context = "\n".join(context_parts)

        # 3. Ask the LLM for advice.
        chain = self.rag_prompt | self.llm
        response = chain.invoke({
            "context": context,
            "question": question
        })

        return response.content

    def get_stats(self) -> dict:
        """Return index statistics: status, doc_count, persist_dir."""
        if self.vectorstore is None:
            return {"status": "empty", "doc_count": 0}

        try:
            doc_count = self.vectorstore.index.ntotal
        except Exception:  # narrowed from bare except; ntotal may be absent
            doc_count = "unknown"

        return {
            "status": "loaded",
            "doc_count": doc_count,
            "persist_dir": self.persist_dir
        }
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
# ========================================
|
| 355 |
+
# 全局实例
|
| 356 |
+
# ========================================
|
| 357 |
+
|
| 358 |
+
# Process-wide singleton instance, created on first request.
_rag_manager: Optional[GAIARAGManager] = None


def get_rag_manager() -> GAIARAGManager:
    """Return the shared GAIARAGManager, constructing it lazily on first call."""
    global _rag_manager
    manager = _rag_manager
    if manager is None:
        manager = GAIARAGManager()
        _rag_manager = manager
    return manager
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
def _score_to_similarity(score) -> float:
|
| 370 |
+
"""FAISS L2 距离转 [0, 1] 相似度,处理异常值"""
|
| 371 |
+
try:
|
| 372 |
+
score_f = float(score)
|
| 373 |
+
except Exception:
|
| 374 |
+
return 0.0
|
| 375 |
+
if score_f != score_f: # NaN
|
| 376 |
+
return 0.0
|
| 377 |
+
if score_f < 0.0:
|
| 378 |
+
score_f = 0.0
|
| 379 |
+
return 1.0 / (1.0 + score_f)
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
def rag_lookup_answer(question: str, min_similarity: float = 0.85):
    """RAG short-circuit lookup: return the stored answer on a confident match.

    Returns:
        On a hit: ``{"answer": str, "similarity": float, "score": float,
        "metadata": dict}``.
        On a miss or any internal error: ``None``.
    """
    text = str(question).strip() if question else ""
    if not text:
        return None
    try:
        hits = get_rag_manager().retrieve_with_scores(text, k=1)
        if hits:
            doc, raw_score = hits[0]
            answer = (doc.metadata.get("answer") or "").strip()
            sim = _score_to_similarity(raw_score)
            if answer and sim > float(min_similarity):
                return {
                    "answer": answer,
                    "similarity": float(sim),
                    "score": float(raw_score),
                    "metadata": dict(doc.metadata),
                }
        return None
    except Exception as e:
        if DEBUG:
            print(f"[RAG] rag_lookup_answer failed: {type(e).__name__}: {e}")
        return None
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
# ========================================
|
| 417 |
+
# Agent 工具
|
| 418 |
+
# ========================================
|
| 419 |
+
|
| 420 |
+
@tool
def rag_query(question: str) -> str:
    """
    查询知识库。如果找到高度匹配的问题,直接返回答案;否则返回解题建议。

    适用于:
    - 快速查找已知问题的答案
    - 获取相似问题的解题思路和推荐工具

    Args:
        question: 用户问题

    Returns:
        匹配答案或解题建议
    """
    manager = get_rag_manager()

    # Retrieve with scores so confidence tiers can be applied.
    results = manager.retrieve_with_scores(question, k=3)
    if not results:
        return "知识库中没有找到相似问题。建议使用 web_search 等工具获取信息。"

    best_doc, best_score = results[0]
    # Use the shared distance->similarity helper so NaN / negative scores
    # are handled consistently with rag_lookup_answer.
    similarity = _score_to_similarity(best_score)

    # High similarity (>0.85): return the stored answer directly.
    if similarity > 0.85:
        answer = best_doc.metadata.get('answer', '')
        if answer:
            return f"【知识库匹配成功】相似度: {similarity:.2f}\n直接答案: {answer}\n请直接使用此答案作为最终回答。"

    # Medium similarity: return answers plus solving references.
    if similarity > 0.6:
        parts = []
        for i, (doc, score) in enumerate(results[:2], 1):
            sim = _score_to_similarity(score)
            meta = doc.metadata
            # `or 'N/A'` guards against a present-but-null 'steps' value,
            # which would make the slice raise TypeError.
            parts.append(
                f"[参考 {i}] 相似度: {sim:.2f}\n"
                f"问题: {doc.page_content[:100]}...\n"
                f"答案: {meta.get('answer', 'N/A')}\n"
                f"步骤: {(meta.get('steps') or 'N/A')[:200]}\n"
                f"工具: {meta.get('tools', 'N/A')}"
            )
        return "【知识库参考】\n" + "\n---\n".join(parts)

    # Low similarity: fall back to LLM-generated solving advice.
    return manager.query(question)
|
| 468 |
+
|
| 469 |
+
|
| 470 |
+
@tool
def rag_retrieve(query: str) -> str:
    """
    仅检索知识库中的相关文档片段,不生成建议。

    用于查看原始的相似问题和解题步骤。

    Args:
        query: 检索查询

    Returns:
        相关文档片段
    """
    hits = get_rag_manager().retrieve_with_scores(query, k=3)

    if not hits:
        return "知识库为空或未找到相关文档。"

    fragments = [
        f"""[{rank}] 相似度: {1/(1+dist):.2f}
问题: {doc.page_content[:200]}...
解题步骤: {doc.metadata.get('steps', 'N/A')[:200]}...
工具: {doc.metadata.get('tools', 'N/A')}
"""
        for rank, (doc, dist) in enumerate(hits, 1)
    ]
    return "\n---\n".join(fragments)
|
| 499 |
+
|
| 500 |
+
|
| 501 |
+
@tool
def rag_stats() -> str:
    """
    获取知识库统计信息。

    Returns:
        知识库状态和文档数量
    """
    stats = get_rag_manager().get_stats()
    return f"知识库状态: {stats['status']}, 文档数量: {stats['doc_count']}"
|
| 512 |
+
|
| 513 |
+
|
| 514 |
+
# ========================================
|
| 515 |
+
# 导出 RAG 工具
|
| 516 |
+
# ========================================
|
| 517 |
+
|
| 518 |
+
RAG_TOOLS = [rag_query, rag_retrieve, rag_stats]
|
rag_index/index.faiss
ADDED
|
Binary file (81.5 kB). View file
|
|
|
rag_index/index.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff44a2ea9960d8c4013b553cf3a8211f849e1e259f739e2aaf3e07f0fc0b34c8
|
| 3 |
+
size 58088
|
requirements.txt
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GAIA Agent 依赖清单
|
| 2 |
+
# 安装方式: pip install -r requirements.txt
|
| 3 |
+
|
| 4 |
+
# ========================================
|
| 5 |
+
# 核心依赖(必需)
|
| 6 |
+
# ========================================
|
| 7 |
+
langgraph>=0.2.0
|
| 8 |
+
langchain-core>=0.2.0
|
| 9 |
+
langchain-openai>=0.2.0
|
| 10 |
+
langchain-community>=0.2.0
|
| 11 |
+
requests>=2.31.0
|
| 12 |
+
python-dotenv>=1.0.0
|
| 13 |
+
gradio>=4.0.0
|
| 14 |
+
|
| 15 |
+
# ========================================
|
| 16 |
+
# 搜索工具依赖
|
| 17 |
+
# ========================================
|
| 18 |
+
ddgs>=6.0.0
|
| 19 |
+
wikipedia>=1.4.0
|
| 20 |
+
tavily-python>=0.3.0
|
| 21 |
+
arxiv>=2.0.0
|
| 22 |
+
|
| 23 |
+
# ========================================
|
| 24 |
+
# RAG 依赖
|
| 25 |
+
# ========================================
|
| 26 |
+
faiss-cpu>=1.8.0
|
| 27 |
+
langchain-huggingface>=0.1.0
|
| 28 |
+
sentence-transformers>=3.0.0
|
| 29 |
+
|
| 30 |
+
# ========================================
|
| 31 |
+
# 数据处理
|
| 32 |
+
# ========================================
|
| 33 |
+
pandas>=2.0.0
|
| 34 |
+
pyarrow>=14.0.0
|
| 35 |
+
|
| 36 |
+
# ========================================
|
| 37 |
+
# 扩展工具依赖(按需安装)
|
| 38 |
+
# ========================================
|
| 39 |
+
|
| 40 |
+
# PDF 解析
|
| 41 |
+
pdfplumber>=0.10.0
|
| 42 |
+
|
| 43 |
+
# Excel 解析
|
| 44 |
+
openpyxl>=3.1.0
|
| 45 |
+
|
| 46 |
+
# 图片 OCR(需要系统安装 Tesseract)
|
| 47 |
+
# pytesseract>=0.3.10
|
| 48 |
+
# Pillow>=10.0.0
|
| 49 |
+
|
| 50 |
+
# 音频转写(需要较大的模型文件)
|
| 51 |
+
# openai-whisper>=20231117
|
tools.py
ADDED
|
@@ -0,0 +1,945 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
核心工具模块 - GAIA Agent 基础工具
|
| 3 |
+
包含:web_search, fetch_task_files, read_file, calc, run_python
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
import json
|
| 9 |
+
import tempfile
|
| 10 |
+
import zipfile
|
| 11 |
+
from typing import Optional
|
| 12 |
+
|
| 13 |
+
import requests
|
| 14 |
+
from langchain_core.tools import tool
|
| 15 |
+
|
| 16 |
+
from config import (
|
| 17 |
+
SCORING_API_URL,
|
| 18 |
+
SEARCH_MAX_RESULTS,
|
| 19 |
+
MAX_FILE_SIZE,
|
| 20 |
+
TOOL_TIMEOUT,
|
| 21 |
+
TEMP_DIR,
|
| 22 |
+
TAVILY_API_KEY,
|
| 23 |
+
WIKIPEDIA_MAX_RESULTS,
|
| 24 |
+
ARXIV_MAX_RESULTS,
|
| 25 |
+
TAVILY_MAX_RESULTS,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
# DuckDuckGo 搜索 (新包名: ddgs)
|
| 29 |
+
try:
|
| 30 |
+
from ddgs import DDGS
|
| 31 |
+
except ImportError:
|
| 32 |
+
try:
|
| 33 |
+
from duckduckgo_search import DDGS
|
| 34 |
+
except ImportError:
|
| 35 |
+
DDGS = None
|
| 36 |
+
|
| 37 |
+
# Wikipedia 搜索
|
| 38 |
+
try:
|
| 39 |
+
import wikipedia
|
| 40 |
+
wikipedia.set_lang("en")
|
| 41 |
+
except ImportError:
|
| 42 |
+
wikipedia = None
|
| 43 |
+
|
| 44 |
+
# Tavily 搜索
|
| 45 |
+
try:
|
| 46 |
+
from tavily import TavilyClient
|
| 47 |
+
except ImportError:
|
| 48 |
+
TavilyClient = None
|
| 49 |
+
|
| 50 |
+
# arXiv 搜索
|
| 51 |
+
try:
|
| 52 |
+
import arxiv
|
| 53 |
+
except ImportError:
|
| 54 |
+
arxiv = None
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# ========================================
|
| 58 |
+
# 信息获取工具
|
| 59 |
+
# ========================================
|
| 60 |
+
|
| 61 |
+
@tool
def web_search(query: str, max_results: int = SEARCH_MAX_RESULTS) -> str:
    """
    使用 DuckDuckGo 搜索网络信息。

    适用场景:
    - 查找人物信息(生卒年、职业、成就等)
    - 查找事件详情(时间、地点、参与者等)
    - 查找组织/公司信息
    - 获取最新资讯

    Args:
        query: 搜索关键词,建议使用英文
        max_results: 返回结果数量,默认5条

    Returns:
        搜索结果摘要(标题+内容+URL)
    """
    if DDGS is None:
        return "搜索服务不可用:请安装 ddgs 库 (pip install ddgs)"

    try:
        hits = list(DDGS().text(query, max_results=max_results))

        if not hits:
            return f"没有找到与 '{query}' 相关的搜索结果。"

        lines = []
        for idx, hit in enumerate(hits, 1):
            lines.extend([
                f"{idx}. {hit.get('title', 'N/A')}",
                f" {hit.get('body', 'N/A')}",
                f" URL: {hit.get('href', 'N/A')}",
                "",
            ])
        return "\n".join(lines)

    except Exception as e:
        return f"搜索出错: {type(e).__name__}: {str(e)}"
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
@tool
def wikipedia_search(query: str, max_results: int = WIKIPEDIA_MAX_RESULTS) -> str:
    """
    在维基百科中搜索信息。

    适用场景:
    - 查找人物传记、历史事件
    - 获取概念定义和详细解释
    - 查找地理、科学、文化等百科知识

    Args:
        query: 搜索关键词,建议使用英文
        max_results: 返回结果数量,默认2条

    Returns:
        维基百科文章摘要
    """
    if wikipedia is None:
        return "Wikipedia 搜索不可用:请安装 wikipedia 库 (pip install wikipedia)"

    try:
        # Find candidate page titles for the query.
        search_results = wikipedia.search(query, results=max_results)

        if not search_results:
            return f"没有找到与 '{query}' 相关的维基百科文章。"

        output = []
        for i, title in enumerate(search_results, 1):
            try:
                # Fetch the page and a short summary for each title.
                page = wikipedia.page(title, auto_suggest=False)
                summary = wikipedia.summary(title, sentences=3, auto_suggest=False)
                output.append(f"{i}. {page.title}")
                output.append(f" {summary}")
                output.append(f" URL: {page.url}")
                output.append("")
            except wikipedia.exceptions.DisambiguationError as e:
                # Ambiguous title: fall back to the first candidate option.
                if e.options:
                    try:
                        page = wikipedia.page(e.options[0], auto_suggest=False)
                        summary = wikipedia.summary(e.options[0], sentences=3, auto_suggest=False)
                        output.append(f"{i}. {page.title}")
                        output.append(f" {summary}")
                        output.append(f" URL: {page.url}")
                        output.append("")
                    except Exception:  # narrowed from bare except (was hiding KeyboardInterrupt/SystemExit)
                        output.append(f"{i}. {title} (歧义页面,可选: {', '.join(e.options[:3])})")
                        output.append("")
            except wikipedia.exceptions.PageError:
                # Skip titles whose page no longer exists.
                continue

        return "\n".join(output) if output else f"没有找到与 '{query}' 相关的详细信息。"

    except Exception as e:
        return f"Wikipedia 搜索出错: {type(e).__name__}: {str(e)}"
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
@tool
def wikipedia_page(title: str, section: Optional[str] = None) -> str:
    """
    获取维基百科页面的完整内容。

    当 wikipedia_search 返回的摘要不够详细时使用此工具。
    特别适用于需要获取列表、表格、详细数据的场景(如专辑列表、获奖记录等)。

    Args:
        title: 页面标题(从 wikipedia_search 结果中获取)
        section: 可选,指定要获取的章节名(如 "Discography", "Awards")

    Returns:
        页面完整内容或指定章节内容
    """
    if wikipedia is None:
        return "Wikipedia 不可用:请安装 wikipedia 库 (pip install wikipedia)"

    try:
        page = wikipedia.page(title, auto_suggest=False)
        content = page.content

        # When a section is requested, try to slice it out of the content.
        if section:
            # Headings look like "== Section ==" / "=== Subsection ===";
            # the backreference \1 forces matching '=' counts on both sides.
            section_name = section.strip()
            heading_re = re.compile(r'^(=+)\s*(.+?)\s*\1\s*$', re.MULTILINE)
            headings = list(heading_re.finditer(content))

            # Pass 1: exact (case-insensitive) heading match.
            target_idx = None
            for i, m in enumerate(headings):
                if m.group(2).strip().lower() == section_name.lower():
                    target_idx = i
                    break

            # Pass 2: fuzzy match — requested name contained in a heading.
            matched_label = ""
            if target_idx is None:
                for i, m in enumerate(headings):
                    if section_name.lower() in m.group(2).strip().lower():
                        target_idx = i
                        matched_label = " (matched)"
                        break

            if target_idx is not None:
                # Section spans from after its heading up to the next
                # heading of the same or higher level (fewer/equal '=').
                level = len(headings[target_idx].group(1))
                start = headings[target_idx].end()
                end = len(content)
                for m in headings[target_idx + 1:]:
                    if len(m.group(1)) <= level:
                        end = m.start()
                        break
                section_text = content[start:end].strip()
                content = f"{headings[target_idx].group(0)}{matched_label}\n{section_text}"
            else:
                # Section not found: list available headings plus a preview.
                available = [m.group(2).strip() for m in headings][:20]
                content = (
                    f"未找到 '{section_name}' 章节。\n\n可用章节:\n"
                    + "\n".join(available)
                    + f"\n\n完整内容:\n{content[:3000]}"
                )

        # Assemble the output and cap its length (title and URL kept).
        output = f"Wikipedia 页面: {page.title}\nURL: {page.url}\n\n{content}"
        if len(output) > MAX_FILE_SIZE:
            return output[:MAX_FILE_SIZE] + f"\n\n... [内容已截断,共 {len(output)} 字符]"

        return output

    except wikipedia.exceptions.DisambiguationError as e:
        options = e.options[:10]
        return f"'{title}' 是一个歧义页面,请指定更具体的标题:\n" + "\n".join(f" - {opt}" for opt in options)
    except wikipedia.exceptions.PageError:
        return f"找不到标题为 '{title}' 的维基百科页面。请检查标题拼写或使用 wikipedia_search 搜索。"
    except Exception as e:
        return f"Wikipedia 页面获取出错: {type(e).__name__}: {str(e)}"
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
@tool
def tavily_search(query: str, max_results: int = TAVILY_MAX_RESULTS) -> str:
    """
    使用 Tavily 进行高质量网络搜索(需要 API Key)。

    适用场景:
    - 需要高质量、准确的搜索结果
    - 查找最新新闻和实时信息
    - 需要更智能的搜索结果排序

    Args:
        query: 搜索关键词
        max_results: 返回结果数量,默认3条

    Returns:
        搜索结果摘要
    """
    if TavilyClient is None:
        return "Tavily 搜索不可用:请安装 tavily-python 库 (pip install tavily-python)"

    if not TAVILY_API_KEY:
        return "Tavily 搜索不可用:请在 .env 文件中设置 TAVILY_API_KEY"

    try:
        response = TavilyClient(api_key=TAVILY_API_KEY).search(query, max_results=max_results)

        hits = response.get('results', [])
        if not hits:
            return f"没有找到与 '{query}' 相关的搜索结果。"

        lines = []
        for idx, item in enumerate(hits, 1):
            snippet = item.get('content', 'N/A')
            if len(snippet) > 300:
                snippet_line = f" {snippet[:300]}..."
            else:
                snippet_line = f" {snippet}"
            lines.extend([
                f"{idx}. {item.get('title', 'N/A')}",
                snippet_line,
                f" URL: {item.get('url', 'N/A')}",
                "",
            ])
        return "\n".join(lines)

    except Exception as e:
        return f"Tavily 搜索出错: {type(e).__name__}: {str(e)}"
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
@tool
|
| 291 |
+
def arxiv_search(query: str, max_results: int = ARXIV_MAX_RESULTS) -> str:
    """Search arXiv for academic papers.

    Best suited for:
    - finding recent research papers
    - searching scientific literature in a specific field
    - fetching abstracts and author information

    Args:
        query: Search keywords (English academic terms recommended).
        max_results: Number of results to return (defaults to the configured value).

    Returns:
        Paper details (title, authors, abstract, link), or a diagnostic message.
    """
    if arxiv is None:
        return "arXiv 搜索不可用:请安装 arxiv 库 (pip install arxiv)"

    try:
        searcher = arxiv.Client()
        request = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.Relevance,
        )

        papers = list(searcher.results(request))

        if not papers:
            return f"没有找到与 '{query}' 相关的 arXiv 论文。"

        lines = []
        for idx, entry in enumerate(papers, 1):
            # Show at most three authors, then summarize the rest.
            byline = ", ".join(a.name for a in entry.authors[:3])
            if len(entry.authors) > 3:
                byline += f" 等 ({len(entry.authors)} 位作者)"

            abstract = entry.summary
            if len(abstract) > 400:
                abstract = abstract[:400] + "..."

            lines.append(f"{idx}. {entry.title}")
            lines.append(f" 作者: {byline}")
            lines.append(f" 发布日期: {entry.published.strftime('%Y-%m-%d')}")
            lines.append(f" 摘要: {abstract}")
            lines.append(f" URL: {entry.entry_id}")
            lines.append("")

        return "\n".join(lines)

    except Exception as e:
        return f"arXiv 搜索出错: {type(e).__name__}: {str(e)}"
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
# ========================================
|
| 347 |
+
# YouTube 搜索工具
|
| 348 |
+
# ========================================
|
| 349 |
+
|
| 350 |
+
@tool
|
| 351 |
+
def youtube_search(query: str, max_results: int = 3) -> str:
    """Search YouTube for video information.

    Uses the `youtube_search` package when installed; otherwise falls back to a
    DuckDuckGo search restricted to youtube.com.

    Best suited for:
    - finding tutorial videos
    - searching videos on a specific topic
    - getting video title, channel and description

    Args:
        query: Search keywords.
        max_results: Number of results to return, default 3.

    Returns:
        Video details (title, channel, link), or a diagnostic message.
    """
    try:
        from youtube_search import YoutubeSearch
    except ImportError:
        # Fallback path: proxy the query through DuckDuckGo.
        if DDGS is None:
            return "YouTube 搜索不可用:请安装 youtube-search-python 库 (pip install youtube-search-python)"

        try:
            finder = DDGS()
            hits = list(finder.text(f"site:youtube.com {query}", max_results=max_results))

            if not hits:
                return f"没有找到与 '{query}' 相关的 YouTube 视频。"

            lines = []
            for idx, hit in enumerate(hits, 1):
                lines.append(f"{idx}. {hit.get('title', 'N/A')}")
                lines.append(f" URL: {hit.get('href', 'N/A')}")
                lines.append("")

            return "\n".join(lines)
        except Exception as e:
            return f"YouTube 搜索出错: {type(e).__name__}: {str(e)}"

    # Primary path: query YouTube directly via the youtube_search package.
    try:
        videos = YoutubeSearch(query, max_results=max_results).to_dict()

        if not videos:
            return f"没有找到与 '{query}' 相关的 YouTube 视频。"

        lines = []
        for idx, video in enumerate(videos, 1):
            suffix = video.get('url_suffix', '')
            watch_url = f"https://youtube.com{suffix}" if suffix else 'N/A'

            lines.append(f"{idx}. {video.get('title', 'N/A')}")
            lines.append(f" 频道: {video.get('channel', 'N/A')}")
            lines.append(f" 时长: {video.get('duration', 'N/A')} | 播放量: {video.get('views', 'N/A')}")
            lines.append(f" URL: {watch_url}")
            lines.append("")

        return "\n".join(lines)

    except Exception as e:
        return f"YouTube 搜索出错: {type(e).__name__}: {str(e)}"
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
# ========================================
|
| 421 |
+
# 新闻搜索工具
|
| 422 |
+
# ========================================
|
| 423 |
+
|
| 424 |
+
@tool
|
| 425 |
+
def news_search(query: str, max_results: int = 5) -> str:
    """Search for the latest news items.

    Best suited for:
    - recent news events
    - current hot topics
    - news coverage on a specific subject

    Args:
        query: Search keywords.
        max_results: Number of results to return, default 5.

    Returns:
        News headlines, sources and summaries, or a diagnostic message.
    """
    if DDGS is None:
        return "新闻搜索不可用:请安装 ddgs 库 (pip install ddgs)"

    try:
        hits = list(DDGS().news(query, max_results=max_results))

        if not hits:
            return f"没有找到与 '{query}' 相关的新闻。"

        lines = []
        for idx, hit in enumerate(hits, 1):
            snippet = hit.get('body', 'N/A')

            lines.append(f"{idx}. {hit.get('title', 'N/A')}")
            lines.append(f" 来源: {hit.get('source', 'N/A')} | 日期: {hit.get('date', 'N/A')}")
            # Keep summaries short; full articles belong behind the URL.
            if len(snippet) > 200:
                lines.append(f" {snippet[:200]}...")
            else:
                lines.append(f" {snippet}")
            lines.append(f" URL: {hit.get('url', 'N/A')}")
            lines.append("")

        return "\n".join(lines)

    except Exception as e:
        return f"新闻搜索出错: {type(e).__name__}: {str(e)}"
|
| 469 |
+
|
| 470 |
+
|
| 471 |
+
# ========================================
|
| 472 |
+
# StackOverflow 搜索工具
|
| 473 |
+
# ========================================
|
| 474 |
+
|
| 475 |
+
@tool
|
| 476 |
+
def stackoverflow_search(query: str, max_results: int = 3) -> str:
    """Search StackOverflow for programming questions and answers.

    Best suited for:
    - solutions to programming problems
    - fixes for code errors
    - technical discussions

    Args:
        query: Search keywords (include the language or tech stack if possible).
        max_results: Number of results to return, default 3.

    Returns:
        Question titles, answer counts and links, or a diagnostic message.
    """
    try:
        import requests

        # Query the public StackExchange API.
        endpoint = "https://api.stackexchange.com/2.3/search/advanced"
        query_params = {
            "order": "desc",
            "sort": "relevance",
            "q": query,
            "site": "stackoverflow",
            "pagesize": max_results,
            "filter": "withbody",
        }

        resp = requests.get(endpoint, params=query_params, timeout=TOOL_TIMEOUT)
        resp.raise_for_status()
        payload = resp.json()

        questions = payload.get('items', [])

        if not questions:
            return f"没有找到与 '{query}' 相关的 StackOverflow 问题。"

        lines = []
        for idx, q in enumerate(questions, 1):
            status = "✓ 已解答" if q.get('is_answered') else "○ 待解答"
            tag_list = ", ".join(q.get('tags', [])[:5])

            lines.append(f"{idx}. {q.get('title', 'N/A')}")
            lines.append(f" {status} | 得分: {q.get('score', 0)} | 回答数: {q.get('answer_count', 0)}")
            lines.append(f" 标签: {tag_list}")
            lines.append(f" URL: {q.get('link', 'N/A')}")
            lines.append("")

        return "\n".join(lines)

    except Exception as e:
        return f"StackOverflow 搜索出错: {type(e).__name__}: {str(e)}"
|
| 534 |
+
|
| 535 |
+
|
| 536 |
+
# ========================================
|
| 537 |
+
# Google 搜索工具
|
| 538 |
+
# ========================================
|
| 539 |
+
|
| 540 |
+
@tool
|
| 541 |
+
def google_search(query: str, max_results: int = 5) -> str:
    """General web search (Google-style results, proxied via DuckDuckGo).

    Best suited for:
    - broad web searches
    - locating official sites and authoritative sources
    - getting a diverse result set

    Args:
        query: Search keywords.
        max_results: Number of results to return, default 5.

    Returns:
        Search results (title + snippet + URL), or a diagnostic message.

    Note:
        Due to Google API restrictions this tool is implemented on top of
        DuckDuckGo and provides equivalent functionality.
    """
    # Reuse the DuckDuckGo backend used by the other search tools.
    if DDGS is None:
        return "Google 搜索不可用:请安装 ddgs 库 (pip install ddgs)"

    try:
        hits = list(DDGS().text(query, max_results=max_results))

        if not hits:
            return f"没有找到与 '{query}' 相关的搜索结果。"

        lines = []
        for idx, hit in enumerate(hits, 1):
            lines.append(f"{idx}. {hit.get('title', 'N/A')}")
            lines.append(f" {hit.get('body', 'N/A')}")
            lines.append(f" URL: {hit.get('href', 'N/A')}")
            lines.append("")

        return "\n".join(lines)

    except Exception as e:
        return f"Google 搜索出错: {type(e).__name__}: {str(e)}"
|
| 585 |
+
|
| 586 |
+
|
| 587 |
+
# ========================================
|
| 588 |
+
# 文件处理工具
|
| 589 |
+
# ========================================
|
| 590 |
+
|
| 591 |
+
@tool
|
| 592 |
+
def fetch_task_files(task_id: str) -> str:
    """Download the attachment associated with a task from the scoring server.

    Must be called first whenever a question involves an attachment; afterwards
    read the downloaded file with read_file or another dedicated tool.

    Args:
        task_id: Task ID (taken from the question).

    Returns:
        The local path of the downloaded file plus a next-step hint,
        or an error message.
    """
    try:
        url = f"{SCORING_API_URL}/files/{task_id}"
        response = requests.get(url, timeout=TOOL_TIMEOUT)

        # 404 simply means the task has no attachment — not an error.
        if response.status_code == 404:
            return "该任务没有附件文件。"

        response.raise_for_status()

        # Recover the original filename from the Content-Disposition header.
        content_disp = response.headers.get("Content-Disposition", "")
        filename_match = re.search(r'filename="?([^";\n]+)"?', content_disp)
        filename = filename_match.group(1) if filename_match else f"task_{task_id}_file"

        # Persist the payload under the temp directory.
        file_path = TEMP_DIR / filename

        with open(file_path, "wb") as f:
            f.write(response.content)

        file_size = len(response.content)
        file_ext = os.path.splitext(filename)[1].lower()

        # Suggest the right follow-up tool based on the file extension.
        next_step_hint = ""
        if file_ext in ['.xlsx', '.xls']:
            next_step_hint = "\n\n⚠️ 下一步:请立即使用 parse_excel(file_path) 工具读取此 Excel 文件内容,不要搜索网络。"
        elif file_ext == '.pdf':
            next_step_hint = "\n\n⚠️ 下一步:请立即使用 parse_pdf(file_path) 工具读取此 PDF 文件,不要搜索网络。"
        elif file_ext in ['.png', '.jpg', '.jpeg', '.gif', '.bmp']:
            next_step_hint = "\n\n⚠️ 下一步:请使用 image_ocr(file_path) 或 analyze_image(file_path, question) 工具处理此图片。"
        elif file_ext in ['.mp3', '.wav', '.m4a', '.ogg']:
            next_step_hint = "\n\n⚠️ 下一步:请使用 transcribe_audio(file_path) 工具转写此音频文件。"
        elif file_ext in ['.txt', '.csv', '.json', '.md', '.py', '.html', '.xml']:
            next_step_hint = "\n\n⚠️ 下一步:请立即使用 read_file(file_path) 工具读取此文件内容。"
        elif file_ext == '.zip':
            next_step_hint = "\n\n⚠️ 下一步:请使用 read_file(file_path) 工具解压此 ZIP 文件。"

        # BUG FIX: the message previously embedded the literal placeholder
        # "(unknown)" instead of the computed filename, which was never used.
        return f"文件已下载到: {file_path}\n文件大小: {file_size} 字节\n文件名: {filename}{next_step_hint}"

    except requests.Timeout:
        return f"下载超时({TOOL_TIMEOUT}秒),请稍后重试。"
    except Exception as e:
        return f"下载文件出错: {type(e).__name__}: {str(e)}"
|
| 649 |
+
|
| 650 |
+
|
| 651 |
+
@tool
|
| 652 |
+
def read_file(file_path: str, encoding: str = "utf-8") -> str:
    """Read a local file and return its content.

    Supported formats: txt, csv, json, py, html, xml, zip, md.

    Args:
        file_path: Full path of the file.
        encoding: Text encoding, default utf-8.

    Returns:
        The file content (truncated beyond the configured size limit).

    Notes:
        - ZIP archives are extracted automatically and their contents listed.
        - JSON files are pretty-printed when they parse.
        - PDF/Excel files require the dedicated extension tools.
    """
    try:
        if not os.path.exists(file_path):
            return f"文件不存在: {file_path}"

        file_ext = os.path.splitext(file_path)[1].lower()

        # ZIP archives: extract next to the archive and list the contents.
        if file_ext == '.zip':
            # BUG FIX: str.replace('.zip', ...) replaced the FIRST occurrence
            # anywhere in the path (corrupting e.g. "/data.zip/a.zip");
            # strip only the final extension instead.
            extract_dir = os.path.splitext(file_path)[0] + '_extracted'
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                zip_ref.extractall(extract_dir)

            files = os.listdir(extract_dir)
            file_list = "\n".join(f" - {f}" for f in files)
            return f"ZIP 已解压到: {extract_dir}\n包含文件:\n{file_list}"

        # Plain text: read with the requested encoding, skipping bad bytes.
        with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
            content = f.read()

        # Pretty-print JSON when it parses; otherwise keep the raw text.
        if file_ext == '.json':
            try:
                data = json.loads(content)
                content = json.dumps(data, indent=2, ensure_ascii=False)
            except json.JSONDecodeError:
                pass  # keep the original content

        # Cap the returned size to keep tool output manageable.
        if len(content) > MAX_FILE_SIZE:
            return content[:MAX_FILE_SIZE] + f"\n\n... [内容已截断,共 {len(content)} 字符]"

        return content

    except Exception as e:
        return f"读取文件出错: {type(e).__name__}: {str(e)}"
|
| 706 |
+
|
| 707 |
+
|
| 708 |
+
# ========================================
|
| 709 |
+
# 计算和代码工具
|
| 710 |
+
# ========================================
|
| 711 |
+
|
| 712 |
+
@tool
|
| 713 |
+
def calc(expression: str) -> str:
    """Safely evaluate a mathematical expression.

    Supported:
    - basic operators: +, -, *, /, **, %
    - math functions: sqrt, sin, cos, tan, log, log10, exp, floor, ceil, ...
    - constants: pi, e

    Expressions containing double underscores are rejected: even with empty
    builtins, eval() can be escaped through dunder attribute access
    (e.g. "().__class__"), so such input is blocked up front.

    Args:
        expression: A math expression such as "2+3*4" or "sqrt(16)".

    Returns:
        The computed result as a string, or an error message.
    """
    import math

    # Whitelisted functions and constants available to the expression.
    safe_dict = {
        # builtins
        'abs': abs, 'round': round, 'min': min, 'max': max,
        'sum': sum, 'pow': pow, 'len': len,
        # math functions
        'sqrt': math.sqrt, 'sin': math.sin, 'cos': math.cos,
        'tan': math.tan, 'log': math.log, 'log10': math.log10,
        'exp': math.exp, 'floor': math.floor, 'ceil': math.ceil,
        'asin': math.asin, 'acos': math.acos, 'atan': math.atan,
        'sinh': math.sinh, 'cosh': math.cosh, 'tanh': math.tanh,
        'degrees': math.degrees, 'radians': math.radians,
        'factorial': math.factorial, 'gcd': math.gcd,
        # constants
        'pi': math.pi, 'e': math.e,
    }

    try:
        expression = expression.strip()

        # SECURITY FIX: block dunder access, which allows sandbox escape
        # (e.g. "().__class__.__mro__[1].__subclasses__()") despite the
        # empty __builtins__ below.
        if "__" in expression:
            return "计算出错: 表达式包含非法字符 '__'"

        # Evaluate with no builtins and only the whitelisted names.
        result = eval(expression, {"__builtins__": {}}, safe_dict)

        # Format the result, avoiding float noise like "4.000000000000001".
        if isinstance(result, float):
            if result.is_integer():
                return str(int(result))
            return str(round(result, 10))

        return str(result)

    except ZeroDivisionError:
        return "计算出错: 除数不能为零"
    except ValueError as e:
        return f"计算出错: 无效的数学操作 - {str(e)}"
    except Exception as e:
        return f"计算出错: {type(e).__name__}: {str(e)}"
|
| 769 |
+
|
| 770 |
+
|
| 771 |
+
@tool
|
| 772 |
+
def run_python(code: str) -> str:
    """Execute Python code in a restricted sandbox and return its stdout.

    Modules that may be imported inside the sandbox:
    - math, re, json, datetime, collections, random, string, itertools, functools

    Available builtins:
    - types: list, dict, set, tuple, str, int, float, bool, bytes
    - functions: print, len, range, enumerate, zip, map, filter, sorted, reversed
    - numeric: sum, min, max, abs, round, pow, divmod, all, any
    - conversion: ord, chr, hex, bin, oct, isinstance, type, format, repr

    Typical uses:
    - complex math, data sorting/filtering, string processing, date arithmetic

    Args:
        code: Python source; results must be emitted with print().

    Returns:
        The captured standard output of the code, or an error message.

    Example:
        from datetime import datetime, timedelta
        today = datetime(2024, 1, 15)
        print(today + timedelta(days=30))

    NOTE(review): exec() with a builtins whitelist is NOT a hard security
    boundary — dunder-based escapes remain possible; treat input as trusted
    agent-generated code only.
    """
    import io
    import sys
    import math
    import re as re_module
    import json as json_module
    import datetime as datetime_module
    import collections as collections_module
    import random as random_module
    import string as string_module
    import itertools as itertools_module
    import functools as functools_module

    # Whitelist of modules the sandboxed code may import.
    ALLOWED_MODULES = {
        'math': math,
        're': re_module,
        'json': json_module,
        'datetime': datetime_module,
        'collections': collections_module,
        'random': random_module,
        'string': string_module,
        'itertools': itertools_module,
        'functools': functools_module,
    }

    def restricted_import(name, globals=None, locals=None, fromlist=(), level=0):
        """Restricted __import__ replacement: only whitelisted modules load."""
        if name not in ALLOWED_MODULES:
            raise ImportError(f"不允许导入模块 '{name}',可用模块: {', '.join(ALLOWED_MODULES.keys())}")
        return ALLOWED_MODULES[name]

    # Restricted set of builtins exposed to the sandboxed code.
    safe_builtins = {
        # types
        'list': list, 'dict': dict, 'set': set, 'tuple': tuple,
        'str': str, 'int': int, 'float': float, 'bool': bool,
        'bytes': bytes, 'bytearray': bytearray,
        # functions
        'print': print, 'len': len, 'range': range, 'enumerate': enumerate,
        'zip': zip, 'map': map, 'filter': filter, 'sorted': sorted,
        'reversed': reversed, 'iter': iter, 'next': next,
        'sum': sum, 'min': min, 'max': max, 'abs': abs, 'round': round,
        'pow': pow, 'divmod': divmod,
        'all': all, 'any': any,
        'isinstance': isinstance, 'type': type,
        'ord': ord, 'chr': chr,
        'hex': hex, 'bin': bin, 'oct': oct,
        'format': format, 'repr': repr,
        'hasattr': hasattr, 'getattr': getattr, 'setattr': setattr,
        'slice': slice, 'object': object,
        # import support (routed through the whitelist above)
        '__import__': restricted_import,
        # constants
        'True': True, 'False': False, 'None': None,
    }

    # Pre-injected names (usable directly, without an explicit import).
    preloaded = {
        'math': math,
        're': re_module,
        'json': json_module,
        'datetime': datetime_module.datetime,
        'date': datetime_module.date,
        'timedelta': datetime_module.timedelta,
        'Counter': collections_module.Counter,
        'defaultdict': collections_module.defaultdict,
        'OrderedDict': collections_module.OrderedDict,
        'random': random_module,
    }

    # Merge into the execution namespace.
    namespace = {"__builtins__": safe_builtins}
    namespace.update(preloaded)

    # Capture stdout so print() output can be returned to the caller.
    old_stdout = sys.stdout
    sys.stdout = io.StringIO()

    try:
        exec(code, namespace)
        output = sys.stdout.getvalue()

        if not output:
            return "代码执行成功,无输出。请使用 print() 输出结果。"

        # Cap the returned size to keep tool output manageable.
        if len(output) > MAX_FILE_SIZE:
            return output[:MAX_FILE_SIZE] + f"\n\n... [输出已截断,共 {len(output)} 字符]"

        return output.strip()

    except SyntaxError as e:
        return f"语法错误: 第 {e.lineno} 行 - {e.msg}"
    except NameError as e:
        return f"名称错误: {str(e)}(该函数或变量在沙箱中不可用)"
    except Exception as e:
        return f"执行出错: {type(e).__name__}: {str(e)}"
    finally:
        # Always restore the real stdout, even on error.
        sys.stdout = old_stdout
|
| 909 |
+
|
| 910 |
+
|
| 911 |
+
# ========================================
|
| 912 |
+
# 导出工具列表
|
| 913 |
+
# ========================================
|
| 914 |
+
BASE_TOOLS = [
    # Search tools
    web_search,
    wikipedia_search,
    wikipedia_page,  # fetches the full content of a Wikipedia page
    tavily_search,
    arxiv_search,
    youtube_search,
    news_search,
    stackoverflow_search,
    google_search,
    # File tools
    fetch_task_files,
    read_file,
    # Computation tools
    calc,
    run_python,
]
|
| 932 |
+
|
| 933 |
+
# 尝试导入扩展工具
|
| 934 |
+
# Optionally extend the tool set; both extras are best-effort imports so the
# base tools still work when the optional modules are absent.
try:
    from extension_tools import EXTENSION_TOOLS
    ALL_TOOLS = BASE_TOOLS + EXTENSION_TOOLS
except ImportError:
    # extension_tools not installed — fall back to the base set only.
    ALL_TOOLS = BASE_TOOLS

# Try to add the RAG tools on top of whatever ALL_TOOLS holds by now.
try:
    from rag import RAG_TOOLS
    ALL_TOOLS = ALL_TOOLS + RAG_TOOLS
except ImportError:
    pass  # rag module unavailable — silently skip
|