Spaces:
Sleeping
Sleeping
naclfish Claude Sonnet 4.6 committed on
Commit ·
2628a0b
1
Parent(s): 81917a3
Add tools/ folder, fix agent answer format and Wikipedia proxy
Browse files- Add tools/ with web_search (Serper), wikipedia_search, python_repl, file_handler
- Fix Wikipedia 403 by adding User-Agent header
- Fix answer format: enforce ENTIRE response = only final answer
- Fix list output: comma-separated, no Python brackets
- Add [Task ID] prefix to questions so agent uses correct task_id
- Increase max iterations from 10 to 15
- Add .gitignore to protect .env, data/, __pycache__
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- .gitignore +6 -0
- app.py +140 -7
- requirements.txt +4 -1
- test_agent.py +30 -0
- tools/__init__.py +6 -0
- tools/_session.py +5 -0
- tools/calculator.py +26 -0
- tools/file_handler.py +90 -0
- tools/search.py +40 -0
- tools/wikipedia.py +51 -0
.gitignore
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
*.pyo
|
| 5 |
+
data/
|
| 6 |
+
记忆.md
|
app.py
CHANGED
|
@@ -1,23 +1,150 @@
|
|
| 1 |
import os
|
|
|
|
|
|
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
import requests
|
| 4 |
import inspect
|
| 5 |
import pandas as pd
|
|
|
|
|
|
|
| 6 |
|
| 7 |
# (Keep Constants as is)
|
| 8 |
# --- Constants ---
|
| 9 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
# --- Basic Agent Definition ---
|
| 12 |
# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
|
| 13 |
class BasicAgent:
|
| 14 |
def __init__(self):
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
def __call__(self, question: str) -> str:
|
| 17 |
print(f"Agent received question (first 50 chars): {question[:50]}...")
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
| 23 |
"""
|
|
@@ -51,7 +178,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 51 |
# 2. Fetch Questions
|
| 52 |
print(f"Fetching questions from: {questions_url}")
|
| 53 |
try:
|
| 54 |
-
response = requests.get(questions_url, timeout=15)
|
| 55 |
response.raise_for_status()
|
| 56 |
questions_data = response.json()
|
| 57 |
if not questions_data:
|
|
@@ -80,7 +207,13 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 80 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 81 |
continue
|
| 82 |
try:
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 85 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
| 86 |
except Exception as e:
|
|
@@ -99,7 +232,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 99 |
# 5. Submit
|
| 100 |
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
| 101 |
try:
|
| 102 |
-
response = requests.post(submit_url, json=submission_data, timeout=60)
|
| 103 |
response.raise_for_status()
|
| 104 |
result_data = response.json()
|
| 105 |
final_status = (
|
|
|
|
| 1 |
import os
|
| 2 |
+
import json
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
load_dotenv()
|
| 5 |
import gradio as gr
|
| 6 |
import requests
|
| 7 |
import inspect
|
| 8 |
import pandas as pd
|
| 9 |
+
from tools import web_search, wikipedia_search, python_repl, download_and_read_file
|
| 10 |
+
from tools.file_handler import prefetch_file
|
| 11 |
|
| 12 |
# (Keep Constants as is)
|
| 13 |
# --- Constants ---
|
| 14 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 15 |
|
| 16 |
+
TOOL_FUNCTIONS = {
|
| 17 |
+
"web_search": web_search,
|
| 18 |
+
"wikipedia_search": wikipedia_search,
|
| 19 |
+
"python_repl": python_repl,
|
| 20 |
+
"download_and_read_file": download_and_read_file,
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
TOOL_SCHEMAS = [
|
| 24 |
+
{
|
| 25 |
+
"type": "function",
|
| 26 |
+
"function": {
|
| 27 |
+
"name": "web_search",
|
| 28 |
+
"description": "Search Google for current information. Use for factual questions, recent events, or any topic requiring web search.",
|
| 29 |
+
"parameters": {
|
| 30 |
+
"type": "object",
|
| 31 |
+
"properties": {"query": {"type": "string", "description": "Search query"}},
|
| 32 |
+
"required": ["query"],
|
| 33 |
+
},
|
| 34 |
+
},
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"type": "function",
|
| 38 |
+
"function": {
|
| 39 |
+
"name": "wikipedia_search",
|
| 40 |
+
"description": "Search Wikipedia for encyclopedic or historical information about a topic.",
|
| 41 |
+
"parameters": {
|
| 42 |
+
"type": "object",
|
| 43 |
+
"properties": {"query": {"type": "string", "description": "Topic to search"}},
|
| 44 |
+
"required": ["query"],
|
| 45 |
+
},
|
| 46 |
+
},
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"type": "function",
|
| 50 |
+
"function": {
|
| 51 |
+
"name": "python_repl",
|
| 52 |
+
"description": "Execute Python code for math, calculations, data analysis, or logic. Use print() to output results.",
|
| 53 |
+
"parameters": {
|
| 54 |
+
"type": "object",
|
| 55 |
+
"properties": {"code": {"type": "string", "description": "Python code to execute"}},
|
| 56 |
+
"required": ["code"],
|
| 57 |
+
},
|
| 58 |
+
},
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"type": "function",
|
| 62 |
+
"function": {
|
| 63 |
+
"name": "download_and_read_file",
|
| 64 |
+
"description": "Download and read a file attachment (CSV, Excel, text) for a given task_id from the question.",
|
| 65 |
+
"parameters": {
|
| 66 |
+
"type": "object",
|
| 67 |
+
"properties": {"task_id": {"type": "string", "description": "The task_id of the question"}},
|
| 68 |
+
"required": ["task_id"],
|
| 69 |
+
},
|
| 70 |
+
},
|
| 71 |
+
},
|
| 72 |
+
]
|
| 73 |
+
|
| 74 |
+
SYSTEM_PROMPT = (
|
| 75 |
+
"You are a precise research assistant. Solve each question step by step using tools.\n\n"
|
| 76 |
+
|
| 77 |
+
"STRICT RULES:\n"
|
| 78 |
+
"1. NEVER search for 'GAIA benchmark', 'GAIA answer', 'HuggingFace discussion', or any meta-search for pre-solved answers. Solve the problem yourself.\n"
|
| 79 |
+
"2. For ANY text manipulation (reversing, encoding, counting characters, etc.), ALWAYS use python_repl — never guess by eye.\n"
|
| 80 |
+
"3. Keep search queries SHORT and targeted (under 8 words). Never enumerate values (e.g. years) in one query.\n"
|
| 81 |
+
"4. If you have an [Attached file content] section, read it directly — do NOT call download_and_read_file again.\n"
|
| 82 |
+
"5. OUTPUT FORMAT: Your ENTIRE response must be ONLY the final answer — no explanation, no reasoning, no 'The answer is', no preamble. A single word, number, or comma-separated list. Nothing else.\n"
|
| 83 |
+
"6. Numbers: digits only (e.g. '42', not 'forty-two'). Names: as they appear in the source.\n"
|
| 84 |
+
"7. If a question involves reversing or encoding text, use python_repl to decode it first before reasoning.\n"
|
| 85 |
+
"8. Lists: output as plain comma-separated values (e.g. 'apple, banana, cherry') — NO brackets, NO quotes, NO Python syntax.\n"
|
| 86 |
+
"9. If the question has a [Task ID: xxx] prefix, use that exact value when calling download_and_read_file.\n"
|
| 87 |
+
"10. If an attached file is audio/image/video (marked [UNSUPPORTED]), do NOT call download_and_read_file — use web_search to find the answer instead.\n"
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
# --- Basic Agent Definition ---
|
| 91 |
# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
|
| 92 |
class BasicAgent:
|
| 93 |
def __init__(self):
|
| 94 |
+
self.api_key = os.getenv("DEEPSEEK_API_KEY")
|
| 95 |
+
self.model = os.getenv("DEEPSEEK_MODEL", "deepseek-chat")
|
| 96 |
+
self.api_url = "https://www.ggwk1.online/v1/chat/completions"
|
| 97 |
+
print("BasicAgent initialized with DeepSeek (native HTTP).")
|
| 98 |
+
|
| 99 |
+
def _call_llm(self, messages: list) -> dict:
|
| 100 |
+
headers = {
|
| 101 |
+
"Authorization": f"Bearer {self.api_key}",
|
| 102 |
+
"Content-Type": "application/json",
|
| 103 |
+
}
|
| 104 |
+
payload = {
|
| 105 |
+
"model": self.model,
|
| 106 |
+
"messages": messages,
|
| 107 |
+
"tools": TOOL_SCHEMAS,
|
| 108 |
+
"tool_choice": "auto",
|
| 109 |
+
"temperature": 0,
|
| 110 |
+
}
|
| 111 |
+
response = requests.post(
|
| 112 |
+
self.api_url, headers=headers, json=payload, timeout=60,
|
| 113 |
+
proxies={"http": None, "https": None}
|
| 114 |
+
)
|
| 115 |
+
response.raise_for_status()
|
| 116 |
+
return response.json()
|
| 117 |
+
|
| 118 |
def __call__(self, question: str) -> str:
|
| 119 |
print(f"Agent received question (first 50 chars): {question[:50]}...")
|
| 120 |
+
messages = [
|
| 121 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 122 |
+
{"role": "user", "content": question},
|
| 123 |
+
]
|
| 124 |
+
for _ in range(15): # max iterations
|
| 125 |
+
result = self._call_llm(messages)
|
| 126 |
+
choice = result["choices"][0]
|
| 127 |
+
message = choice["message"]
|
| 128 |
+
messages.append(message)
|
| 129 |
+
|
| 130 |
+
if choice["finish_reason"] == "tool_calls":
|
| 131 |
+
for tool_call in message.get("tool_calls", []):
|
| 132 |
+
fn_name = tool_call["function"]["name"]
|
| 133 |
+
fn_args = json.loads(tool_call["function"]["arguments"])
|
| 134 |
+
print(f" -> Tool: {fn_name}({fn_args})")
|
| 135 |
+
tool_result = TOOL_FUNCTIONS[fn_name](**fn_args)
|
| 136 |
+
print(f" <- Result (first 200): {str(tool_result)[:200]}")
|
| 137 |
+
messages.append({
|
| 138 |
+
"role": "tool",
|
| 139 |
+
"tool_call_id": tool_call["id"],
|
| 140 |
+
"content": str(tool_result),
|
| 141 |
+
})
|
| 142 |
+
else:
|
| 143 |
+
answer = message.get("content", "")
|
| 144 |
+
print(f"Agent answer: {answer}")
|
| 145 |
+
return answer
|
| 146 |
+
|
| 147 |
+
return "Max iterations reached without a final answer."
|
| 148 |
|
| 149 |
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
| 150 |
"""
|
|
|
|
| 178 |
# 2. Fetch Questions
|
| 179 |
print(f"Fetching questions from: {questions_url}")
|
| 180 |
try:
|
| 181 |
+
response = requests.get(questions_url, timeout=15, proxies={"http": None, "https": None})
|
| 182 |
response.raise_for_status()
|
| 183 |
questions_data = response.json()
|
| 184 |
if not questions_data:
|
|
|
|
| 207 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 208 |
continue
|
| 209 |
try:
|
| 210 |
+
# Pre-fetch file attachment and embed content directly in question
|
| 211 |
+
file_content = prefetch_file(task_id)
|
| 212 |
+
if file_content:
|
| 213 |
+
full_question = f"[Task ID: {task_id}]\n{question_text}\n\n[Attached file content]:\n{file_content}"
|
| 214 |
+
else:
|
| 215 |
+
full_question = f"[Task ID: {task_id}]\n{question_text}"
|
| 216 |
+
submitted_answer = agent(full_question)
|
| 217 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 218 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
| 219 |
except Exception as e:
|
|
|
|
| 232 |
# 5. Submit
|
| 233 |
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
| 234 |
try:
|
| 235 |
+
response = requests.post(submit_url, json=submission_data, timeout=60, proxies={"http": None, "https": None})
|
| 236 |
response.raise_for_status()
|
| 237 |
result_data = response.json()
|
| 238 |
final_status = (
|
requirements.txt
CHANGED
|
@@ -1,2 +1,5 @@
|
|
| 1 |
gradio
|
| 2 |
-
requests
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
gradio
|
| 2 |
+
requests
|
| 3 |
+
python-dotenv
|
| 4 |
+
pandas
|
| 5 |
+
openpyxl
|
test_agent.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Quick test script to debug a single question without launching Gradio.
|
| 3 |
+
Usage: python test_agent.py
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
load_dotenv()
|
| 9 |
+
|
| 10 |
+
from app import BasicAgent
|
| 11 |
+
from tools.file_handler import prefetch_file
|
| 12 |
+
|
| 13 |
+
agent = BasicAgent()
|
| 14 |
+
|
| 15 |
+
# --- Edit these to test any question ---
|
| 16 |
+
task_id = "test-task-id" # replace with real task_id if needed
|
| 17 |
+
question = 'If we reverse the word "tfel", what is the antonym of the result?'
|
| 18 |
+
# ----------------------------------------
|
| 19 |
+
|
| 20 |
+
file_content = prefetch_file(task_id)
|
| 21 |
+
if file_content:
|
| 22 |
+
full_question = f"{question}\n\n[Attached file content]:\n{file_content}"
|
| 23 |
+
print(f"[File found and attached, length={len(file_content)}]")
|
| 24 |
+
else:
|
| 25 |
+
full_question = question
|
| 26 |
+
print("[No file attachment]")
|
| 27 |
+
|
| 28 |
+
print(f"\nQuestion: {full_question[:200]}\n")
|
| 29 |
+
answer = agent(full_question)
|
| 30 |
+
print(f"\n=== Final Answer ===\n{answer}")
|
tools/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .search import web_search
|
| 2 |
+
from .wikipedia import wikipedia_search
|
| 3 |
+
from .calculator import python_repl
|
| 4 |
+
from .file_handler import download_and_read_file
|
| 5 |
+
|
| 6 |
+
__all__ = ["web_search", "wikipedia_search", "python_repl", "download_and_read_file"]
|
tools/_session.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
|
| 3 |
+
# Shared session that ignores ALL system proxy environment variables
|
| 4 |
+
_session = requests.Session()
|
| 5 |
+
_session.trust_env = False
|
tools/calculator.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import io
|
| 3 |
+
import traceback
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def python_repl(code: str) -> str:
|
| 7 |
+
"""Execute Python code for calculations or data processing."""
|
| 8 |
+
stdout_capture = io.StringIO()
|
| 9 |
+
stderr_capture = io.StringIO()
|
| 10 |
+
local_vars = {}
|
| 11 |
+
try:
|
| 12 |
+
sys.stdout = stdout_capture
|
| 13 |
+
sys.stderr = stderr_capture
|
| 14 |
+
exec(compile(code, "<string>", "exec"), {"__builtins__": __builtins__}, local_vars)
|
| 15 |
+
except Exception:
|
| 16 |
+
sys.stdout = sys.__stdout__
|
| 17 |
+
sys.stderr = sys.__stderr__
|
| 18 |
+
return f"Error:\n{traceback.format_exc()}"
|
| 19 |
+
finally:
|
| 20 |
+
sys.stdout = sys.__stdout__
|
| 21 |
+
sys.stderr = sys.__stderr__
|
| 22 |
+
output = stdout_capture.getvalue()
|
| 23 |
+
err = stderr_capture.getvalue()
|
| 24 |
+
if err:
|
| 25 |
+
return f"Stderr:\n{err}\nStdout:\n{output}"
|
| 26 |
+
return output if output else "Code executed successfully (no output)."
|
tools/file_handler.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from tools._session import _session
|
| 5 |
+
|
| 6 |
+
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 7 |
+
DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
|
| 8 |
+
os.makedirs(DATA_DIR, exist_ok=True)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _parse_file(file_path: str, content_bytes: bytes, ext: str) -> str:
|
| 12 |
+
"""Parse file content based on extension and return as string."""
|
| 13 |
+
try:
|
| 14 |
+
if ext == ".csv":
|
| 15 |
+
df = pd.read_csv(file_path)
|
| 16 |
+
return f"CSV file ({len(df)} rows, {len(df.columns)} columns):\n{df.to_string(index=False)}"
|
| 17 |
+
elif ext in (".xlsx", ".xls"):
|
| 18 |
+
# Read all sheets
|
| 19 |
+
xl = pd.ExcelFile(file_path)
|
| 20 |
+
parts = []
|
| 21 |
+
for sheet in xl.sheet_names:
|
| 22 |
+
df = xl.parse(sheet)
|
| 23 |
+
parts.append(f"Sheet '{sheet}' ({len(df)} rows, {len(df.columns)} columns):\n{df.to_string(index=False)}")
|
| 24 |
+
return "\n\n".join(parts)
|
| 25 |
+
elif ext in (".py", ".txt", ".md", ".json", ".xml", ".html", ""):
|
| 26 |
+
return f"File contents:\n{content_bytes.decode('utf-8', errors='replace')[:5000]}"
|
| 27 |
+
else:
|
| 28 |
+
try:
|
| 29 |
+
return f"File contents:\n{content_bytes.decode('utf-8', errors='replace')[:5000]}"
|
| 30 |
+
except Exception:
|
| 31 |
+
return f"Binary file, cannot display as text. Size: {len(content_bytes)} bytes."
|
| 32 |
+
except Exception as e:
|
| 33 |
+
return f"Failed to parse file: {e}"
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def prefetch_file(task_id: str) -> str | None:
|
| 37 |
+
"""
|
| 38 |
+
Try to download the file for a task_id.
|
| 39 |
+
Returns parsed file content string if found, None if no attachment exists.
|
| 40 |
+
Caches file to data/ directory.
|
| 41 |
+
"""
|
| 42 |
+
# Check cache first
|
| 43 |
+
cached = [f for f in os.listdir(DATA_DIR) if f.startswith(task_id)]
|
| 44 |
+
if cached:
|
| 45 |
+
file_path = os.path.join(DATA_DIR, cached[0])
|
| 46 |
+
ext = os.path.splitext(cached[0])[-1].lower()
|
| 47 |
+
with open(file_path, "rb") as f:
|
| 48 |
+
content_bytes = f.read()
|
| 49 |
+
return _parse_file(file_path, content_bytes, ext)
|
| 50 |
+
|
| 51 |
+
file_url = f"{DEFAULT_API_URL}/files/{task_id}"
|
| 52 |
+
try:
|
| 53 |
+
response = _session.get(file_url, timeout=30)
|
| 54 |
+
if response.status_code == 404:
|
| 55 |
+
return None
|
| 56 |
+
response.raise_for_status()
|
| 57 |
+
except Exception:
|
| 58 |
+
return None
|
| 59 |
+
|
| 60 |
+
# Determine extension
|
| 61 |
+
ext = ""
|
| 62 |
+
cd = response.headers.get("content-disposition", "")
|
| 63 |
+
if "filename=" in cd:
|
| 64 |
+
fname = cd.split("filename=")[-1].strip().strip('"')
|
| 65 |
+
ext = os.path.splitext(fname)[-1].lower()
|
| 66 |
+
content_type = response.headers.get("content-type", "")
|
| 67 |
+
if not ext:
|
| 68 |
+
if "csv" in content_type:
|
| 69 |
+
ext = ".csv"
|
| 70 |
+
elif "excel" in content_type or "spreadsheet" in content_type or "openxmlformats" in content_type:
|
| 71 |
+
ext = ".xlsx"
|
| 72 |
+
elif "text" in content_type:
|
| 73 |
+
ext = ".txt"
|
| 74 |
+
|
| 75 |
+
# Save to data/
|
| 76 |
+
file_path = os.path.join(DATA_DIR, f"{task_id}{ext}")
|
| 77 |
+
with open(file_path, "wb") as f:
|
| 78 |
+
f.write(response.content)
|
| 79 |
+
|
| 80 |
+
return _parse_file(file_path, response.content, ext)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def download_and_read_file(task_id: str) -> str:
|
| 84 |
+
"""Download and read a file attachment for a given task_id.
|
| 85 |
+
Supports CSV, Excel (.xlsx/.xls), and plain text files.
|
| 86 |
+
"""
|
| 87 |
+
result = prefetch_file(task_id)
|
| 88 |
+
if result is None:
|
| 89 |
+
return "No file attachment found for this task."
|
| 90 |
+
return result
|
tools/search.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from tools._session import _session
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def web_search(query: str) -> str:
|
| 7 |
+
"""Search Google via Serper API for current information.
|
| 8 |
+
Use this for factual questions, recent events, or anything requiring web search.
|
| 9 |
+
"""
|
| 10 |
+
api_key = os.getenv("SERPER_API_KEY")
|
| 11 |
+
if not api_key:
|
| 12 |
+
return "SERPER_API_KEY not set."
|
| 13 |
+
try:
|
| 14 |
+
response = _session.post(
|
| 15 |
+
"https://google.serper.dev/search",
|
| 16 |
+
headers={"X-API-KEY": api_key, "Content-Type": "application/json"},
|
| 17 |
+
data=json.dumps({"q": query, "num": 5}),
|
| 18 |
+
timeout=15,
|
| 19 |
+
)
|
| 20 |
+
response.raise_for_status()
|
| 21 |
+
data = response.json()
|
| 22 |
+
|
| 23 |
+
parts = []
|
| 24 |
+
# Answer box (direct answer)
|
| 25 |
+
if "answerBox" in data:
|
| 26 |
+
ab = data["answerBox"]
|
| 27 |
+
answer = ab.get("answer") or ab.get("snippet") or ""
|
| 28 |
+
if answer:
|
| 29 |
+
parts.append(f"Direct answer: {answer}")
|
| 30 |
+
|
| 31 |
+
# Organic results
|
| 32 |
+
for r in data.get("organic", [])[:5]:
|
| 33 |
+
title = r.get("title", "")
|
| 34 |
+
link = r.get("link", "")
|
| 35 |
+
snippet = r.get("snippet", "")
|
| 36 |
+
parts.append(f"Title: {title}\nURL: {link}\nSnippet: {snippet}")
|
| 37 |
+
|
| 38 |
+
return "\n---\n".join(parts) if parts else "No results found."
|
| 39 |
+
except Exception as e:
|
| 40 |
+
return f"Search error: {e}"
|
tools/wikipedia.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
|
| 4 |
+
# Wikipedia needs the local proxy to be accessible from China
|
| 5 |
+
_PROXY = os.getenv("LOCAL_PROXY", "http://127.0.0.1:7890")
|
| 6 |
+
_wiki_session = requests.Session()
|
| 7 |
+
_wiki_session.proxies = {"http": _PROXY, "https": _PROXY}
|
| 8 |
+
_wiki_session.trust_env = False
|
| 9 |
+
_wiki_session.headers.update({
|
| 10 |
+
"User-Agent": "Mozilla/5.0 (compatible; ResearchAgent/1.0; educational use)"
|
| 11 |
+
})
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def wikipedia_search(query: str) -> str:
|
| 15 |
+
"""Search Wikipedia for encyclopedic information."""
|
| 16 |
+
# Use Wikipedia REST API directly
|
| 17 |
+
search_url = "https://en.wikipedia.org/w/api.php"
|
| 18 |
+
try:
|
| 19 |
+
# Step 1: search for page title
|
| 20 |
+
search_resp = _wiki_session.get(search_url, timeout=15, params={
|
| 21 |
+
"action": "query",
|
| 22 |
+
"list": "search",
|
| 23 |
+
"srsearch": query,
|
| 24 |
+
"srlimit": 3,
|
| 25 |
+
"format": "json",
|
| 26 |
+
})
|
| 27 |
+
search_resp.raise_for_status()
|
| 28 |
+
results = search_resp.json().get("query", {}).get("search", [])
|
| 29 |
+
if not results:
|
| 30 |
+
return "No Wikipedia articles found."
|
| 31 |
+
|
| 32 |
+
# Step 2: fetch content of top result
|
| 33 |
+
title = results[0]["title"]
|
| 34 |
+
content_resp = _wiki_session.get(search_url, timeout=15, params={
|
| 35 |
+
"action": "query",
|
| 36 |
+
"titles": title,
|
| 37 |
+
"prop": "extracts",
|
| 38 |
+
"exintro": False,
|
| 39 |
+
"explaintext": True,
|
| 40 |
+
"format": "json",
|
| 41 |
+
})
|
| 42 |
+
content_resp.raise_for_status()
|
| 43 |
+
pages = content_resp.json().get("query", {}).get("pages", {})
|
| 44 |
+
page = next(iter(pages.values()))
|
| 45 |
+
text = page.get("extract", "")
|
| 46 |
+
if not text:
|
| 47 |
+
return f"Wikipedia article '{title}' has no extractable content."
|
| 48 |
+
url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
|
| 49 |
+
return f"Wikipedia: {title}\nURL: {url}\n\n{text[:3000]}"
|
| 50 |
+
except Exception as e:
|
| 51 |
+
return f"Wikipedia search failed: {e}"
|