DenisRz commited on
Commit
67d287e
·
1 Parent(s): 0ca5d0a

Initial upload: GAIA Agent

Browse files
README.md CHANGED
@@ -1,13 +1,42 @@
1
  ---
2
  title: GAIA Agent
3
- emoji: 🌍
4
- colorFrom: yellow
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 6.3.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: GAIA Agent
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: "4.44.0"
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
 
13
+ # 🤖 GAIA Agent
14
+
15
+ A general-purpose AI agent built for the **Hugging Face Agents Course** (Unit 4).
16
+
17
+ ## Features
18
+
19
+ - **22 Tools** organized by category:
20
+ - 📌 Web & Research: web_search, wikipedia_lookup, arxiv_search, webpage_fetch
21
+ - 📁 Files: read_file, download_file
22
+ - 🎬 Media: youtube_transcript, youtube_audio_transcribe, audio_transcribe, video_metadata, video_frame_analyze
23
+ - 💻 Code Execution: python_executor, javascript_executor, bash_executor
24
+ - 🔢 Mathematics: calculator, symbolic_math, matrix_operations, statistical_analysis
25
+ - 🖼️ Image Processing: image_analyze, image_manipulate, image_annotate, image_ocr
26
+
27
+ - **ReAct Architecture** using LangGraph
28
+ - **GPT-4o** for reasoning and vision tasks
29
+
30
+ ## Setup
31
+
32
+ 1. Set your secrets in the Space settings:
33
+ - `OPENAI_API_KEY`: Your OpenAI API key
34
+ - `TAVILY_API_KEY`: Your Tavily API key (for web search)
35
+
36
+ 2. Click "Run Full Evaluation" to test on all 20 GAIA questions
37
+
38
+ ## Resources
39
+
40
+ - [Course Page](https://huggingface.co/learn/agents-course/unit4/hands-on)
41
+ - [API Docs](https://agents-course-unit4-scoring.hf.space/docs)
42
+ - [Leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard)
agent.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ General-Purpose ReAct Agent with LangGraph
3
+
4
+ Uses the prebuilt create_react_agent which automatically handles:
5
+ - The Think → Act → Observe loop
6
+ - Tool calling and response routing
7
+ - Deciding when to stop
8
+
9
+ Much simpler than manually defining the graph!
10
+ """
11
+
12
+ import os
13
+ from typing import TypedDict, Optional
14
+ from dotenv import load_dotenv
15
+
16
+ from langgraph.prebuilt import create_react_agent
17
+ from langchain_openai import ChatOpenAI
18
+ from langchain_core.messages import HumanMessage, SystemMessage
19
+ import yaml
20
+
21
+ # Import all tools from the modular tools package
22
+ from tools import ALL_TOOLS, MODEL_NAME
23
+
24
+
25
# ============== SYSTEM PROMPT ==============

# Load API keys (e.g. OPENAI_API_KEY) from a local .env file when present.
load_dotenv()

# Resolve prompts.yaml relative to this file so imports work from any CWD.
_HERE = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(_HERE, "prompts.yaml"), "r", encoding="utf-8") as f:
    PROMPTS = yaml.safe_load(f)
# System prompt attached to every agent call (see create_agent below).
SYSTEM_PROMPT = PROMPTS["SYSTEM_PROMPT"]
33
+
34
+
35
# ============== CREATE THE AGENT ==============

def create_agent():
    """Build and return the prebuilt LangGraph ReAct agent.

    The chat model runs at temperature 0 for reproducible tool use, and the
    system prompt loaded from prompts.yaml is attached to every call.
    """
    chat_model = ChatOpenAI(
        model=MODEL_NAME,
        temperature=0,
    )

    # create_react_agent wires up the Think → Act → Observe loop,
    # tool routing, and the stop condition for us.
    return create_react_agent(
        model=chat_model,
        tools=ALL_TOOLS,
        prompt=SYSTEM_PROMPT,
    )
53
+
54
+
55
# Create global agent instance — built once at import time and shared by
# run_agent / run_agent_verbose below.
agent = create_agent()
57
+
58
+
59
+ # ============== MAIN INTERFACE ==============
60
+
61
+ def run_agent(question: str, task_id: str = "", file_name: str = "", local_file_path: str = None) -> str:
62
+ """
63
+ Run the ReAct agent on a question.
64
+
65
+ Args:
66
+ question: The question to answer
67
+ task_id: Optional GAIA task ID (for file downloads)
68
+ file_name: Optional filename hint
69
+ local_file_path: Optional local path to pre-downloaded file
70
+
71
+ Returns:
72
+ The agent's final answer
73
+ """
74
+ # Build message with context
75
+ user_message = question
76
+ if task_id:
77
+ user_message += f"\n\n[Task ID: {task_id}]"
78
+ if file_name:
79
+ user_message += f"\n[Attached file: {file_name}]"
80
+ if local_file_path:
81
+ user_message += f"\n[File already downloaded to: {local_file_path}]"
82
+ user_message += f"\n[Use read_file tool with this path to analyze the file]"
83
+
84
+ # Run agent
85
+ try:
86
+ result = agent.invoke({
87
+ "messages": [HumanMessage(content=user_message)]
88
+ })
89
+
90
+ # Get final answer from last message
91
+ final_message = result["messages"][-1]
92
+ answer = final_message.content
93
+
94
+ return answer
95
+
96
+ except Exception as e:
97
+ return f"Agent error: {str(e)}"
98
+
99
+
100
def run_agent_verbose(question: str, task_id: str = "", file_name: str = "", local_file_path: Optional[str] = None) -> str:
    """Run the agent while printing every reasoning / tool-call step.

    Args:
        question: The question to answer
        task_id: Optional GAIA task ID (for file downloads)
        file_name: Optional filename hint
        local_file_path: Optional local path to a pre-downloaded file

    Returns:
        The agent's final answer, or an "Error: ..." string on failure.

    Fix vs. the original: the final answer is now captured from the stream
    itself. The original called agent.invoke() after streaming, which ran
    the whole (slow, paid) agent a second time for every question.
    """
    user_message = question
    if task_id:
        user_message += f"\n\n[Task ID: {task_id}]"
    if file_name:
        user_message += f"\n[Attached file: {file_name}]"
    if local_file_path:
        user_message += f"\n[File already downloaded to: {local_file_path}]"
        user_message += "\n[Use read_file tool with this path to analyze the file]"

    print("\n" + "="*70)
    print("🤖 ReAct Agent - Verbose Mode")
    print("="*70)
    print(f"\n📝 Question: {question[:200]}{'...' if len(question) > 200 else ''}")
    if local_file_path:
        print(f"📎 File: {local_file_path}")
    print("\n" + "-"*70)

    try:
        step_count = 0
        answer = None  # last plain AI message seen in the stream

        # Stream through steps
        for step in agent.stream({"messages": [HumanMessage(content=user_message)]}):
            step_count += 1

            # Each step maps node name -> node output
            for node_name, node_output in step.items():
                print(f"\n🔄 Step {step_count} - {node_name}")
                print("-"*40)

                if "messages" in node_output:
                    for msg in node_output["messages"]:
                        msg_type = type(msg).__name__

                        # Show tool calls
                        if hasattr(msg, "tool_calls") and msg.tool_calls:
                            print(f"🔧 Tool calls requested:")
                            for tc in msg.tool_calls:
                                args_str = str(tc.get('args', {}))[:300]
                                print(f"   → {tc['name']}({args_str}{'...' if len(str(tc.get('args', {}))) > 300 else ''})")

                        # Show tool results
                        elif msg_type == "ToolMessage":
                            content = str(msg.content)[:300]
                            print(f"📋 Tool result: {content}{'...' if len(str(msg.content)) > 300 else ''}")

                        # Show AI reasoning; an AIMessage without tool calls is
                        # the (candidate) final answer — remember it so we do
                        # not have to run the whole agent a second time.
                        elif hasattr(msg, "content") and msg.content and msg_type == "AIMessage":
                            content = msg.content[:400]
                            print(f"💭 AI: {content}{'...' if len(msg.content) > 400 else ''}")
                            answer = msg.content

        if answer is None:
            # Fallback (should be rare): the stream produced no final AI
            # message — run once to obtain a final state.
            result = agent.invoke({"messages": [HumanMessage(content=user_message)]})
            answer = result["messages"][-1].content

        print("\n" + "="*70)
        print(f"✅ Final Answer: {answer}")
        print("="*70 + "\n")

        return answer

    except Exception as e:
        import traceback
        print(f"\n❌ Error: {str(e)}")
        traceback.print_exc()
        return f"Error: {str(e)}"
168
+
169
# ============== TEST ==============

if __name__ == "__main__":
    # Smoke test: list the loaded tools, then run one GAIA-style question
    # end-to-end with verbose step output. Requires API keys in the env.
    print("\n" + "="*70)
    print("Testing ReAct Agent (Prebuilt)")
    print("="*70)

    # Show available tools
    print(f"\n📦 Loaded {len(ALL_TOOLS)} tools:")
    for tool in ALL_TOOLS:
        print(f"  - {tool.name}")

    # Test with verbose output
    test_question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia."
    print(f"\n🧪 Test question: {test_question}")
    run_agent_verbose(test_question)
app.py ADDED
@@ -0,0 +1,598 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GAIA Agent - Gradio Application
3
+
4
+ This is the main entry point for the Hugging Face Space.
5
+ It provides a Gradio interface for running the GAIA evaluation
6
+ and submitting answers to the scoring API.
7
+
8
+ LOCAL DEBUGGING:
9
+ 1. Create a .env file with your API keys
10
+ 2. Run: python app.py
11
+ 3. Open http://localhost:7860 in your browser
12
+ """
13
+
14
+ import os
15
+ import tempfile
16
+ import gradio as gr
17
+ import requests
18
+ import pandas as pd
19
+ from typing import List, Dict, Any, Optional, Tuple
20
+ from dotenv import load_dotenv
21
+
22
+ # Load environment variables from .env file (for local development)
23
+ load_dotenv()
24
+
25
+ # Use the ReAct agent (multi-step reasoning)
26
+ from agent import run_agent, run_agent_verbose
27
+
28
# ============== CONFIGURATION ==============

# Scoring API base URL; override with the GAIA_API_BASE env var (e.g. a mock server).
API_BASE = os.getenv("GAIA_API_BASE", "https://agents-course-unit4-scoring.hf.space")
# True when the DEBUG_MODE env var is "true" (case-insensitive); passed to demo.launch().
DEBUG_MODE = os.getenv("DEBUG_MODE", "false").lower() == "true"
32
+
33
+ # ============== FILE HANDLING ==============
34
+
35
+ def fetch_task_file(task_id: str, file_name: str = "") -> Optional[str]:
36
+ """
37
+ Fetch a file attached to a GAIA task and save it locally.
38
+
39
+ Args:
40
+ task_id: The GAIA task ID
41
+ file_name: Expected filename (helps determine file type)
42
+
43
+ Returns:
44
+ Local file path if successful, None if no file or error
45
+ """
46
+ if not file_name:
47
+ return None
48
+
49
+ try:
50
+ url = f"{API_BASE}/files/{task_id}"
51
+ print(f"📥 Fetching file from: {url}")
52
+
53
+ response = requests.get(url, timeout=60)
54
+
55
+ if response.status_code == 200:
56
+ # Try to get filename from content-disposition header
57
+ content_disp = response.headers.get('content-disposition', '')
58
+ if 'filename=' in content_disp:
59
+ filename = content_disp.split('filename=')[1].strip('"\'')
60
+ else:
61
+ filename = file_name
62
+
63
+ # Save to temp directory
64
+ file_path = os.path.join(tempfile.gettempdir(), filename)
65
+ with open(file_path, 'wb') as f:
66
+ f.write(response.content)
67
+
68
+ file_size = len(response.content)
69
+ print(f"✅ File saved: {file_path} ({file_size} bytes)")
70
+ return file_path
71
+ else:
72
+ print(f"⚠️ File fetch failed: HTTP {response.status_code}")
73
+ return None
74
+ except Exception as e:
75
+ print(f"❌ Error fetching file: {e}")
76
+ return None
77
+
78
# ============== API FUNCTIONS ==============

def fetch_questions() -> List[Dict[str, Any]]:
    """Fetch every GAIA question from the evaluation API.

    Returns an empty list on any network, HTTP, or decoding failure.
    """
    try:
        resp = requests.get(f"{API_BASE}/questions", timeout=30)
        if resp.status_code == 200:
            return resp.json()
        print(f"Failed to fetch questions: {resp.status_code}")
    except Exception as e:
        print(f"Error fetching questions: {e}")
    return []
91
+
92
+
93
def fetch_random_question() -> Optional[Dict[str, Any]]:
    """Fetch one random GAIA question for quick testing; None on failure."""
    try:
        resp = requests.get(f"{API_BASE}/random-question", timeout=30)
        if resp.status_code == 200:
            return resp.json()
    except Exception as e:
        print(f"Error fetching random question: {e}")
    return None
102
+
103
+
104
def submit_answers(username: str, agent_code_url: str, answers: List[Dict[str, str]]) -> Optional[Dict[str, Any]]:
    """POST the collected answers to the scoring API.

    Returns the decoded JSON result, or None on any failure.
    """
    try:
        resp = requests.post(
            f"{API_BASE}/submit",
            json={
                "username": username,
                "agent_code": agent_code_url,
                "answers": answers,
            },
            timeout=120,
        )
        if resp.status_code == 200:
            return resp.json()
        print(f"Submission failed: {resp.status_code} - {resp.text}")
    except Exception as e:
        print(f"Error submitting answers: {e}")
    return None
126
+
127
+
128
# ============== LOCAL DEBUG FUNCTIONS ==============

def run_single_question_local(question_text: str, task_id: str, file_name: str) -> Tuple[str, str, str]:
    """Run the agent on a manually entered question (local debugging helper).

    Returns (question, answer, status) for the Gradio outputs.
    """
    if not question_text.strip():
        return "Please enter a question", "", ""

    task_id = task_id.strip() or "local_test"
    file_name = file_name.strip() or None

    banner = "=" * 60
    print(f"\n{banner}")
    print(f"LOCAL DEBUG - Running agent")
    print(f"Task ID: {task_id}")
    print(f"Question: {question_text[:200]}...")
    print(f"File: {file_name or 'None'}")
    print(f"{banner}\n")

    # Only real GAIA tasks have downloadable attachments.
    local_file_path = None
    if file_name and task_id != "local_test":
        local_file_path = fetch_task_file(task_id, file_name)

    try:
        answer = run_agent_verbose(question_text, task_id, file_name, local_file_path)
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error:\n{error_details}")
        return question_text, f"Error: {str(e)}\n\nDetails:\n{error_details}", "Failed"

    return question_text, answer, f"Processed task: {task_id}"
160
+
161
+
162
def run_random_question() -> Tuple[str, str, str, str, str]:
    """Fetch a random question from the API and run the agent on it.

    Returns (question, task_id, file_name, answer, status) for the UI.
    """
    question_data = fetch_random_question()
    if not question_data:
        return "Failed to fetch question", "", "", "", ""

    task_id = question_data.get("task_id", "unknown")
    question = question_data.get("question", "")
    file_name = question_data.get("file_name", "")
    level = question_data.get("Level", "?")

    sep = "=" * 60
    print(f"\n{sep}")
    print(f"RANDOM QUESTION from API")
    print(f"Task ID: {task_id}")
    print(f"Level: {level}")
    print(f"Question: {question[:200]}...")
    print(f"File: {file_name or 'None'}")
    print(f"{sep}\n")

    # Download the attachment (if any) before handing off to the agent.
    local_file_path = fetch_task_file(task_id, file_name) if file_name else None

    try:
        answer = run_agent_verbose(question, task_id, file_name if file_name else None, local_file_path)
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error:\n{error_details}")
        return question, task_id, file_name or "", f"Error: {str(e)}", "❌ Failed"

    return question, task_id, file_name or "", answer, f"✅ Task: {task_id} | Level: {level}"
196
+
197
+
198
def run_specific_question(task_id_input: str) -> Tuple[str, str, str, str, str]:
    """Look up a question by task ID and run the agent on it.

    Returns (question, task_id, file_name, answer, status) for the UI.
    """
    task_id_input = task_id_input.strip()
    if not task_id_input:
        return "Please enter a task ID", "", "", "", ""

    # There is no per-task endpoint, so scan the full question list.
    matching = [q for q in fetch_questions() if q.get("task_id") == task_id_input]
    if not matching:
        return f"Task ID not found: {task_id_input}", task_id_input, "", "", "❌ Not found"

    q = matching[0]
    task_id = q.get("task_id", "")
    question = q.get("question", "")
    file_name = q.get("file_name", "")
    level = q.get("Level", "?")

    sep = "=" * 60
    print(f"\n{sep}")
    print(f"SPECIFIC QUESTION: {task_id}")
    print(f"Level: {level}")
    print(f"Question: {question[:200]}...")
    print(f"File: {file_name or 'None'}")
    print(f"{sep}\n")

    # Download the attachment (if any) before handing off to the agent.
    local_file_path = fetch_task_file(task_id, file_name) if file_name else None

    try:
        answer = run_agent(question, task_id, file_name if file_name else None, local_file_path)
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error:\n{error_details}")
        return question, task_id, file_name or "", f"Error: {str(e)}", "❌ Failed"

    return question, task_id, file_name or "", answer, f"✅ Completed | Level: {level}"
239
+
240
+
241
+ def list_all_questions() -> pd.DataFrame:
242
+ """Fetch and display all available questions."""
243
+ questions = fetch_questions()
244
+
245
+ if not questions:
246
+ return pd.DataFrame({"error": ["Failed to fetch questions"]})
247
+
248
+ data = []
249
+ for q in questions:
250
+ data.append({
251
+ "task_id": q.get("task_id", "")[:20] + "...",
252
+ "question": q.get("question", "")[:80] + "...",
253
+ "file": q.get("file_name", "") or "-",
254
+ "level": q.get("Level", "?")
255
+ })
256
+
257
+ return pd.DataFrame(data)
258
+
259
+
260
def run_full_evaluation_local(username: str) -> Tuple[str, pd.DataFrame]:
    """
    Run full evaluation in local mode (without HF OAuth).

    Fetches every question, runs the agent on each one (pre-downloading any
    attached file), then submits all answers to the scoring API in one POST.

    Args:
        username: HuggingFace username to submit under.

    Returns:
        (status_message, results_dataframe) for the Gradio outputs.
    """
    if not username.strip():
        return "❌ Please enter your HuggingFace username", pd.DataFrame()

    username = username.strip()
    # Submission requires a link to the agent's code; assumes the Space is
    # named "GAIA-Agent" under this user — TODO confirm for other repo names.
    agent_code_url = f"https://huggingface.co/spaces/{username}/GAIA-Agent/tree/main"

    print(f"\n{'='*60}")
    print(f"FULL EVALUATION - LOCAL MODE")
    print(f"Username: {username}")
    print(f"Agent URL: {agent_code_url}")
    print(f"{'='*60}\n")

    # Fetch questions
    questions = fetch_questions()
    if not questions:
        return "❌ Failed to fetch questions from API.", pd.DataFrame()

    print(f"Fetched {len(questions)} questions")

    # Process each question
    results = []                   # rows for the UI results table
    answers_for_submission = []    # payload entries for the scoring API

    for i, q in enumerate(questions):
        task_id = q.get("task_id", "unknown")
        question = q.get("question", "")
        file_name = q.get("file_name", "")

        print(f"\n[{i+1}/{len(questions)}] Processing: {task_id}")
        print(f"Question: {question[:100]}...")

        # Pre-fetch file if attached
        local_file_path = None
        if file_name:
            local_file_path = fetch_task_file(task_id, file_name)

        try:
            answer = run_agent(question, task_id, file_name if file_name else None, local_file_path)
            print(f"Answer: {answer[:100]}...")

            results.append({
                "task_id": task_id[:15] + "...",
                "question": question[:60] + "...",
                "answer": answer[:80] + "..." if len(answer) > 80 else answer
            })

            answers_for_submission.append({
                "task_id": task_id,
                "submitted_answer": answer
            })

        except Exception as e:
            # A failed question still gets a (blank) submission entry so the
            # answer list stays aligned with the question list.
            print(f"Error: {e}")
            results.append({
                "task_id": task_id[:15] + "...",
                "question": question[:60] + "...",
                "answer": f"ERROR: {str(e)[:50]}"
            })
            answers_for_submission.append({
                "task_id": task_id,
                "submitted_answer": ""
            })

    # Submit answers
    print(f"\n{'='*60}")
    print("Submitting answers...")
    print(f"{'='*60}\n")

    submission_result = submit_answers(username, agent_code_url, answers_for_submission)

    df = pd.DataFrame(results)

    if submission_result:
        score = submission_result.get("score", "N/A")
        correct = submission_result.get("correct_count", "?")
        total = submission_result.get("total_count", len(questions))
        status = f"✅ Submitted!\n\n📊 Score: {score}\n✓ Correct: {correct}/{total}"
        print(f"\nFinal Score: {score} ({correct}/{total})")
    else:
        status = "❌ Submission failed. Check logs for details."

    return status, df
346
+
347
+
348
def run_full_evaluation_hf(profile: gr.OAuthProfile = None) -> Tuple[str, pd.DataFrame]:
    """Run the full evaluation for an OAuth-authenticated Space user.

    Gradio injects *profile* when the user is logged in; it is None otherwise.
    """
    if profile is not None:
        return run_full_evaluation_local(profile.username)
    return "❌ Please log in with your Hugging Face account first.", pd.DataFrame()
356
+
357
+
358
# ============== BUILD GRADIO INTERFACE ==============

def create_app():
    """Create and configure the Gradio application.

    Builds four tabs: a random-question smoke test, a by-task-ID debugger
    (with a question browser), a free-form manual input, and the full
    evaluation + submission flow. Returns the gr.Blocks demo (not launched).
    """

    # Check if running locally (SPACE_ID is set by the HF Space runtime).
    is_local = os.getenv("SPACE_ID") is None

    with gr.Blocks(title="GAIA Agent - Debug & Evaluation") as demo:

        gr.Markdown("""
        # 🤖 GAIA Agent - Debug & Evaluation Interface

        Built with **LangGraph** and **OpenAI GPT-4** for the HuggingFace Agents Course.
        """)

        # Show environment info
        env_info = "🖥️ **Local Mode**" if is_local else "☁️ **HuggingFace Space Mode**"
        api_key_status = "✅ API Key Set" if os.getenv("OPENAI_API_KEY") else "❌ OPENAI_API_KEY not set!"

        gr.Markdown(f"""
        **Environment:** {env_info} | **OpenAI:** {api_key_status}

        ---
        """)

        with gr.Tabs():

            # ============== TAB 1: Quick Test ==============
            with gr.TabItem("🧪 Quick Test"):
                gr.Markdown("### Test with a random question from the GAIA API")

                with gr.Row():
                    random_btn = gr.Button("🎲 Fetch & Run Random Question", variant="primary")

                with gr.Row():
                    with gr.Column():
                        random_question = gr.Textbox(label="Question", lines=4, interactive=False)
                        random_task_id = gr.Textbox(label="Task ID", lines=1, interactive=False)
                        random_file = gr.Textbox(label="Attached File", lines=1, interactive=False)
                    with gr.Column():
                        random_answer = gr.Textbox(label="Agent Answer", lines=4, interactive=False)
                        random_status = gr.Textbox(label="Status", lines=1, interactive=False)

                random_btn.click(
                    fn=run_random_question,
                    outputs=[random_question, random_task_id, random_file, random_answer, random_status]
                )

            # ============== TAB 2: Debug Specific ==============
            with gr.TabItem("🔍 Debug Specific Question"):
                gr.Markdown("### Run a specific question by Task ID")

                with gr.Row():
                    specific_task_input = gr.Textbox(
                        label="Task ID",
                        placeholder="e.g., 8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
                        lines=1
                    )
                    specific_btn = gr.Button("▶️ Run", variant="primary")

                with gr.Row():
                    with gr.Column():
                        specific_question = gr.Textbox(label="Question", lines=4, interactive=False)
                        specific_file = gr.Textbox(label="Attached File", lines=1, interactive=False)
                    with gr.Column():
                        specific_answer = gr.Textbox(label="Agent Answer", lines=4, interactive=False)
                        specific_status = gr.Textbox(label="Status", lines=1, interactive=False)

                # NOTE: specific_task_input is both the input and the second
                # output — run_specific_question echoes the resolved task_id
                # back into the same textbox.
                specific_btn.click(
                    fn=run_specific_question,
                    inputs=[specific_task_input],
                    outputs=[specific_question, specific_task_input, specific_file, specific_answer, specific_status]
                )

                gr.Markdown("---")
                gr.Markdown("### All Available Questions")

                with gr.Row():
                    list_btn = gr.Button("📋 Load Question List")

                questions_table = gr.Dataframe(
                    headers=["task_id", "question", "file", "level"],
                    label="Questions",
                    wrap=True
                )

                list_btn.click(fn=list_all_questions, outputs=[questions_table])

            # ============== TAB 3: Manual Input ==============
            with gr.TabItem("✏️ Manual Input"):
                gr.Markdown("### Test with custom question (for debugging)")

                with gr.Row():
                    with gr.Column():
                        manual_question = gr.Textbox(
                            label="Question",
                            lines=4,
                            placeholder="Enter your test question here..."
                        )
                        manual_task_id = gr.Textbox(
                            label="Task ID (optional)",
                            lines=1,
                            placeholder="test_001"
                        )
                        manual_file = gr.Textbox(
                            label="File Name (optional)",
                            lines=1,
                            placeholder="e.g., data.xlsx"
                        )
                    with gr.Column():
                        manual_answer = gr.Textbox(label="Agent Answer", lines=4, interactive=False)
                        manual_status = gr.Textbox(label="Status", lines=2, interactive=False)

                with gr.Row():
                    manual_btn = gr.Button("▶️ Run Agent", variant="primary")

                manual_btn.click(
                    fn=run_single_question_local,
                    inputs=[manual_question, manual_task_id, manual_file],
                    outputs=[manual_question, manual_answer, manual_status]
                )

            # ============== TAB 4: Full Evaluation ==============
            with gr.TabItem("🏆 Full Evaluation"):
                gr.Markdown("### Run all 20 questions and submit for scoring")

                if is_local:
                    # Local mode - manual username input
                    gr.Markdown("**Local Mode:** Enter your HuggingFace username to submit.")

                    with gr.Row():
                        username_input = gr.Textbox(
                            label="HuggingFace Username",
                            placeholder="your-username",
                            lines=1
                        )

                    with gr.Row():
                        full_eval_btn_local = gr.Button("🚀 Run Full Evaluation & Submit", variant="primary")

                    with gr.Row():
                        status_output_local = gr.Textbox(
                            label="Status",
                            lines=4,
                            interactive=False,
                            placeholder="Click 'Run Full Evaluation' to start..."
                        )

                    with gr.Row():
                        results_table_local = gr.Dataframe(
                            headers=["task_id", "question", "answer"],
                            label="Results",
                            wrap=True
                        )

                    full_eval_btn_local.click(
                        fn=run_full_evaluation_local,
                        inputs=[username_input],
                        outputs=[status_output_local, results_table_local]
                    )
                else:
                    # HF Space mode - OAuth login
                    gr.Markdown("**Space Mode:** Log in with HuggingFace to submit.")

                    with gr.Row():
                        login_btn = gr.LoginButton(variant="huggingface")

                    with gr.Row():
                        full_eval_btn_hf = gr.Button("🚀 Run Full Evaluation & Submit", variant="primary")

                    with gr.Row():
                        status_output_hf = gr.Textbox(
                            label="Status",
                            lines=4,
                            interactive=False,
                            placeholder="Log in and click 'Run Full Evaluation' to start..."
                        )

                    with gr.Row():
                        results_table_hf = gr.Dataframe(
                            headers=["task_id", "question", "answer"],
                            label="Results",
                            wrap=True
                        )

                    # No explicit inputs: Gradio supplies the gr.OAuthProfile
                    # argument of run_full_evaluation_hf automatically.
                    full_eval_btn_hf.click(
                        fn=run_full_evaluation_hf,
                        outputs=[status_output_hf, results_table_hf]
                    )

        gr.Markdown("""
        ---

        ### 📚 Resources
        - [Course Page](https://huggingface.co/learn/agents-course/unit4/hands-on)
        - [API Docs](https://agents-course-unit4-scoring.hf.space/docs)
        - [Leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard)

        ### 🔧 Local Setup
        ```bash
        # 1. Create .env file
        echo "OPENAI_API_KEY=sk-your-key-here" > .env

        # 2. Install dependencies
        pip install -r requirements.txt

        # 3. Run the app
        python app.py
        ```
        """)

    return demo
571
+
572
+
573
# ============== MAIN ==============

if __name__ == "__main__":
    # Entry point for local runs and for the HF Space container.
    print("\n" + "="*60)
    print("🤖 GAIA Agent - Starting Gradio Interface")
    print("="*60)

    # Check for API key (warn only — the UI also surfaces the status).
    if not os.getenv("OPENAI_API_KEY"):
        print("\n⚠️ WARNING: OPENAI_API_KEY not set!")
        print("   Create a .env file with: OPENAI_API_KEY=sk-your-key")
        print("   Or set it as an environment variable.\n")
    else:
        print("✅ OpenAI API Key detected")

    print(f"📡 GAIA API: {API_BASE}")
    print("="*60 + "\n")

    # Create and launch the app
    demo = create_app()
    demo.launch(
        server_name="0.0.0.0",  # Allow external connections
        server_port=7860,       # Standard HF Spaces port
        share=False,            # Set to True to get a public URL
        debug=DEBUG_MODE        # Enable debug mode for better error messages
    )
prompts.yaml ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SYSTEM_PROMPT: |
2
+ You are a general-purpose AI assistant that solves problems step by step.
3
+
4
+ You have access to a comprehensive set of tools organized by category:
5
+
6
+ 📌 WEB & RESEARCH:
7
+ - web_search: Search the web with FULL page content (Tavily) - returns direct answers + full content
8
+ - webpage_fetch: Fetch content from a specific URL (use for non-search URLs)
9
+ - wikipedia_lookup: Look up topics on Wikipedia
10
+ - arxiv_search: Search arXiv for academic papers and research
11
+
12
+ 📁 FILES:
13
+ - read_file: Read text, code, Excel, CSV, JSON files
14
+ - download_file: Download files from GAIA benchmark by task_id (usually pre-downloaded for you)
15
+
16
+ 🎬 MEDIA:
17
+ - youtube_transcript: Get transcripts/captions from YouTube videos
18
+ - youtube_audio_transcribe: Download and transcribe YouTube audio with Whisper
19
+ - audio_transcribe: Transcribe audio files (.mp3, .wav, etc.)
20
+ - video_metadata: Get video metadata (duration, resolution, title)
21
+ - video_frame_analyze: Extract and analyze video frames with GPT-4o vision
22
+
23
+ 💻 CODE EXECUTION:
24
+ - python_executor: Execute Python code (use print() to see output)
25
+ - javascript_executor: Execute JavaScript code with Node.js (use console.log())
26
+ - bash_executor: Execute shell commands (PowerShell on Windows, bash on Unix)
27
+
28
+ 🔢 MATHEMATICS:
29
+ - calculator: Evaluate mathematical expressions with high precision
30
+ - symbolic_math: Symbolic math (simplify, expand, factor, solve, differentiate, integrate)
31
+ - matrix_operations: Matrix math (determinant, inverse, eigenvalues, solve linear systems)
32
+ - statistical_analysis: Statistics (mean, median, std, correlation, regression)
33
+
34
+ 🖼️ IMAGE PROCESSING:
35
+ - image_analyze: Analyze images with GPT-4o vision (describe, answer questions)
36
+ - image_manipulate: Manipulate images (crop, rotate, resize, flip, grayscale, blur, etc.)
37
+ - image_annotate: Add annotations to images (text, rectangles, circles, arrows)
38
+ - image_ocr: Extract text from images using OCR
39
+
40
+ APPROACH:
41
+ 1. First, understand what the question is asking
42
+ 2. Think about what information or computation you need
43
+ 3. Use the most appropriate tools to gather information or perform actions
44
+ 4. Analyze the results carefully
45
+ 5. If you need more information, use different tools or approaches
46
+ 6. When you have enough information, provide your final answer
47
+
48
+ ⚠️ IMPORTANT STRATEGIES:
49
+
50
+ Web Research Workflow:
51
+ - web_search returns FULL page content directly - often no need for additional tools
52
+ - web_search also provides a direct answer when possible (shown as "DIRECT ANSWER")
53
+ - Use webpage_fetch only for specific URLs not from search (e.g., links found in content)
54
+ - For Wikipedia-specific queries, wikipedia_lookup may give more structured results
55
+
56
+ Avoiding Repetition:
57
+ - NEVER repeat the same tool call with identical or very similar arguments
58
+ - If a search doesn't give you what you need, try a DIFFERENT approach:
59
+ * Use webpage_fetch to read a URL from results
60
+ * Try a completely different search query
61
+ * Use a different tool (e.g., wikipedia_lookup instead of web_search)
62
+ * Use Python to process/analyze data you already have
63
+ - If you've tried 2-3 similar searches without success, CHANGE YOUR STRATEGY
64
+
65
+ Wikipedia Tips:
66
+ - For specific topics like discographies, try "Artist Name discography" as the topic
67
+ - If the main article doesn't have enough detail, search for more specific sub-topics
68
+ - Wikipedia articles often have structured data - consider using Python to parse it
69
+
70
+ TIPS:
71
+ - If text looks reversed or encoded, use Python to manipulate it: print(text[::-1])
72
+ - For complex math, use symbolic_math for algebra/calculus or calculator for arithmetic
73
+ - For matrix/linear algebra problems, use matrix_operations
74
+ - For statistical analysis, use statistical_analysis
75
+ - For current events or facts you're unsure about, search the web
76
+ - For academic papers, research articles, or scientific preprints, use arxiv_search
77
+ - For file-based questions: if a file path is provided, use read_file directly on that path
78
+ - If no file path is provided but file_name is given, use download_file first
79
+ - For image questions, use image_analyze to understand the content
80
+ - To prepare images for analysis (crop, rotate), use image_manipulate first
81
+ - To read text in images (screenshots, documents), use image_ocr
82
+ - You can chain multiple tools to solve complex problems
83
+ - When processing data, Python is often the most flexible choice
84
+ - For JSON manipulation, both Python and JavaScript work well
85
+
86
+ ⚠️ CRITICAL - FINAL ANSWER RULES:
87
+
88
+ Your final answer will be checked for EXACT MATCH. Follow these rules strictly:
89
+
90
+ 1. ANSWER ONLY - NO EXPLANATIONS:
91
+ - Give ONLY the answer itself, nothing else
92
+ - Do NOT include reasoning, context, or explanations in the final answer
93
+ - Do NOT repeat the question or say "The answer is..."
94
+
95
+ 2. BE CONSISTENT:
96
+ - Your final answer MUST match your analysis
97
+ - If your analysis found 5 items, answer "5" not "3"
98
+ - Double-check your work before giving the final answer
99
+
100
+ 3. FORMAT BY TYPE:
101
+ - Numbers: just the number → "42" or "3.14"
102
+ - Names: just the name → "Einstein" or "Albert"
103
+ - Lists: comma-separated, alphabetized if requested → "apple, banana, cherry"
104
+ - Yes/No questions: just "Yes" or "No"
105
+
106
+ 4. EXAMPLES:
107
+ ❌ WRONG: "Mercedes Sosa released 5 studio albums between 2000 and 2009. These were: Misa Criolla (2000), Acústico (2002)..."
108
+ ✅ CORRECT: "5"
109
+
110
+ ❌ WRONG: "The answer is Einstein, who developed the theory of relativity."
111
+ ✅ CORRECT: "Einstein"
112
+
113
+ ❌ WRONG: "Based on my research, the vegetables are: broccoli, celery, lettuce"
114
+ ✅ CORRECT: "broccoli, celery, lettuce"
requirements.txt ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core framework
2
+ gradio>=4.0.0
3
+ langgraph>=0.2.0
4
+ langchain>=0.2.0
5
+ langchain-openai>=0.1.0
6
+ langchain-community>=0.2.0
7
+ openai>=1.0.0
8
+
9
+ # Data processing
10
+ pandas>=2.0.0
11
+ openpyxl>=3.1.0
12
+ numpy>=1.26.0,<2.0.0
13
+
14
+ # Web & API
15
+ requests>=2.31.0
16
+ httpx>=0.25.0
17
+ beautifulsoup4>=4.12.0
18
+ # duckduckgo-search>=5.0.0 # Replaced by Tavily
19
+ wikipedia>=1.4.0
20
+ tavily-python>=0.3.0
21
+ arxiv>=2.0.0
22
+
23
+ # Media processing
24
+ youtube-transcript-api>=0.6.0
25
+ yt-dlp>=2024.0.0
26
+ pytube>=15.0.0
27
+
28
+ # Math & Statistics
29
+ sympy>=1.12
30
+ scipy>=1.11.0,<1.14.0
31
+
32
+ # Image processing
33
+ Pillow>=10.0.0
34
+ pytesseract>=0.3.10
35
+
36
+ # Configuration
37
+ PyYAML>=6.0.0
38
+ python-dotenv>=1.0.0
tools/__init__.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tools package for the GAIA Agent.
3
+
4
+ This package provides modular tools organized by category:
5
+ - web_tools: Web search and Wikipedia lookup
6
+ - file_tools: File reading and downloading
7
+ - media_tools: YouTube, audio, and video processing
8
+ - code_executors: Python, JavaScript, and Bash execution
9
+ - math_tools: Symbolic math, matrix operations, calculator, statistics
10
+ - image_tools: Image analysis, manipulation, annotation, and OCR
11
+ """
12
+
13
+ from tools.web_tools import (
14
+ web_search,
15
+ wikipedia_lookup,
16
+ arxiv_search,
17
+ webpage_fetch,
18
+ )
19
+
20
+ from tools.file_tools import (
21
+ read_file,
22
+ download_file,
23
+ )
24
+
25
+ from tools.media_tools import (
26
+ youtube_transcript,
27
+ youtube_audio_transcribe,
28
+ audio_transcribe,
29
+ video_metadata,
30
+ video_frame_analyze,
31
+ )
32
+
33
+ from tools.code_executors import (
34
+ python_executor,
35
+ javascript_executor,
36
+ bash_executor,
37
+ )
38
+
39
+ from tools.math_tools import (
40
+ symbolic_math,
41
+ matrix_operations,
42
+ calculator,
43
+ statistical_analysis,
44
+ )
45
+
46
+ from tools.image_tools import (
47
+ image_analyze,
48
+ image_manipulate,
49
+ image_annotate,
50
+ image_ocr,
51
+ )
52
+
53
+ # Export all tools as a list for easy agent registration
54
+ ALL_TOOLS = [
55
+ # Web tools
56
+ web_search,
57
+ wikipedia_lookup,
58
+ arxiv_search,
59
+ webpage_fetch,
60
+ # File tools
61
+ read_file,
62
+ download_file,
63
+ # Media tools
64
+ youtube_transcript,
65
+ youtube_audio_transcribe,
66
+ audio_transcribe,
67
+ video_metadata,
68
+ video_frame_analyze,
69
+ # Code executors
70
+ python_executor,
71
+ javascript_executor,
72
+ bash_executor,
73
+ # Math tools
74
+ symbolic_math,
75
+ matrix_operations,
76
+ calculator,
77
+ statistical_analysis,
78
+ # Image tools
79
+ image_analyze,
80
+ image_manipulate,
81
+ image_annotate,
82
+ image_ocr,
83
+ ]
84
+
85
+ # Configuration
86
+ MAX_ITERATIONS = 15
87
+ MODEL_NAME = "gpt-4o"
88
+
tools/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.9 kB). View file
 
tools/__pycache__/code_executors.cpython-311.pyc ADDED
Binary file (6 kB). View file
 
tools/__pycache__/file_tools.cpython-311.pyc ADDED
Binary file (4.89 kB). View file
 
tools/__pycache__/image_tools.cpython-311.pyc ADDED
Binary file (20.9 kB). View file
 
tools/__pycache__/math_tools.cpython-311.pyc ADDED
Binary file (19.4 kB). View file
 
tools/__pycache__/media_tools.cpython-311.pyc ADDED
Binary file (21 kB). View file
 
tools/__pycache__/web_tools.cpython-311.pyc ADDED
Binary file (12.8 kB). View file
 
tools/code_executors.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Code execution tools for the GAIA Agent.
3
+ Includes Python, JavaScript, and Bash executors with sandboxed execution.
4
+ """
5
+
6
+ import os
7
+ import subprocess
8
+ import tempfile
9
+ import shutil
10
+ from langchain_core.tools import tool
11
+
12
+
13
@tool
def python_executor(code: str) -> str:
    """Execute Python code and return the output.
    Use this for calculations, data processing, string manipulation, or any computation.
    The code should print() any results you want to see.

    Args:
        code: Python code to execute

    Returns:
        Captured stdout (plus stderr if any), or an error string.
    """
    import sys

    temp_path = None
    try:
        # Write the code to a temp script so it runs in a fresh interpreter.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as f:
            f.write(code)
            temp_path = f.name

        # sys.executable guarantees we run the same interpreter that hosts the
        # agent; a bare 'python' may be missing or point elsewhere in PATH.
        result = subprocess.run(
            [sys.executable, temp_path],
            capture_output=True,
            text=True,
            timeout=60
        )

        output = result.stdout.strip()
        if result.stderr:
            output += f"\nStderr: {result.stderr}"
        return output or "Code executed with no output."
    except subprocess.TimeoutExpired:
        return "Execution timed out (60s limit)"
    except Exception as e:
        return f"Execution error: {str(e)}"
    finally:
        # finally guarantees cleanup on every path (success, timeout, error);
        # the original could leak the script or unlink a missing file.
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)
48
+
49
+
50
@tool
def javascript_executor(code: str) -> str:
    """Execute JavaScript code using Node.js and return the output.
    Use this for JSON processing, string manipulation, or JavaScript-specific operations.
    Use console.log() to output results.

    Args:
        code: JavaScript code to execute

    Returns:
        Captured stdout (plus stderr if any), or an error string.
    """
    # Check if Node.js is available before writing anything to disk.
    if shutil.which("node") is None:
        return "Error: Node.js is not installed or not in PATH. Please install Node.js to use this tool."

    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(mode='w', suffix='.js', delete=False, encoding='utf-8') as f:
            f.write(code)
            temp_path = f.name

        result = subprocess.run(
            ['node', temp_path],
            capture_output=True,
            text=True,
            timeout=60
        )

        output = result.stdout.strip()
        if result.stderr:
            output += f"\nStderr: {result.stderr}"
        return output or "Code executed with no output."
    except subprocess.TimeoutExpired:
        return "Execution timed out (60s limit)"
    except Exception as e:
        return f"Execution error: {str(e)}"
    finally:
        # finally guarantees the temp script is removed on every path; the
        # original only unlinked inside except handlers and could leak it.
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)
89
+
90
+
91
@tool
def bash_executor(command: str) -> str:
    """Execute a Bash/Shell command and return the output.
    Use this for file operations, text processing with sed/awk/grep, or system commands.

    On Windows, this uses PowerShell. On Unix/Linux/Mac, this uses bash.

    Args:
        command: Shell command to execute

    Returns:
        Combined stdout/stderr (and exit code when nonzero), or an error string.
    """
    import platform

    try:
        # Pick the shell based on the host OS: PowerShell on Windows,
        # bash everywhere else.
        if platform.system().lower() == "windows":
            argv = ["powershell", "-NoProfile", "-NonInteractive", "-Command", command]
        else:
            argv = ["bash", "-c", command]

        proc = subprocess.run(
            argv,
            capture_output=True,
            text=True,
            timeout=60,
            cwd=tempfile.gettempdir(),  # run in temp directory for safety
        )

        report = proc.stdout.strip()
        if proc.stderr:
            report += f"\nStderr: {proc.stderr}"
        if proc.returncode != 0:
            report += f"\nExit code: {proc.returncode}"
        return report or "Command executed with no output."
    except subprocess.TimeoutExpired:
        return "Execution timed out (60s limit)"
    except Exception as e:
        return f"Execution error: {str(e)}"
131
+
tools/file_tools.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File handling tools for the GAIA Agent.
3
+ Includes file reading and downloading from GAIA API.
4
+ """
5
+
6
+ import os
7
+ import tempfile
8
+ import requests
9
+ import pandas as pd
10
+ from langchain_core.tools import tool
11
+
12
+
13
@tool
def read_file(file_path: str) -> str:
    """Read the contents of a file.
    Supports text files, code files, CSV, Excel, etc.

    Args:
        file_path: Path to the file to read

    Returns:
        A textual rendering of the file (truncated for large text/JSON),
        or an error string.
    """
    try:
        # Tabular formats are rendered through pandas so the agent sees
        # column names plus the full table.
        if file_path.endswith(('.xlsx', '.xls')):
            frame = pd.read_excel(file_path)
            return (
                f"Excel file with {len(frame)} rows, {len(frame.columns)} columns."
                f"\n\nColumns: {list(frame.columns)}\n\nData:\n{frame.to_string()}"
            )

        if file_path.endswith('.csv'):
            frame = pd.read_csv(file_path)
            return (
                f"CSV file with {len(frame)} rows, {len(frame.columns)} columns."
                f"\n\nColumns: {list(frame.columns)}\n\nData:\n{frame.to_string()}"
            )

        if file_path.endswith('.json'):
            import json
            with open(file_path, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)
            # Pretty-print, capped at 10k chars to keep tool output bounded.
            return f"JSON file contents:\n{json.dumps(payload, indent=2)[:10000]}"

        # Anything else is treated as text; undecodable bytes are ignored.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            return handle.read()[:10000]
    except Exception as e:
        return f"Error reading file: {str(e)}"
42
+
43
+
44
@tool
def download_file(task_id: str, file_name: str = "") -> str:
    """Download a file associated with a GAIA task.
    Returns the local file path where the file was saved.

    Args:
        task_id: The GAIA task ID
        file_name: Expected filename (helps determine file type)

    Returns:
        "File downloaded to: <path>" on success, or an error string.
    """
    try:
        api_base = "https://agents-course-unit4-scoring.hf.space"
        url = f"{api_base}/files/{task_id}"
        response = requests.get(url, timeout=60)

        if response.status_code == 200:
            content_disp = response.headers.get('content-disposition', '')
            if 'filename=' in content_disp:
                # The header may carry more parameters after ';' (e.g.
                # `filename="a.csv"; size=123`), so keep only the filename
                # token, drop quotes, and take basename() so a hostile header
                # cannot write outside the temp directory.
                raw = content_disp.split('filename=')[1].split(';')[0]
                filename = os.path.basename(raw.strip().strip('"\''))
            elif file_name:
                filename = file_name
            else:
                filename = f"{task_id}_file"
            if not filename:
                # Sanitization can leave an empty name; fall back to task id.
                filename = f"{task_id}_file"

            # Use a portable temp directory (works on Windows + Linux)
            file_path = os.path.join(tempfile.gettempdir(), filename)
            with open(file_path, 'wb') as f:
                f.write(response.content)

            return f"File downloaded to: {file_path}"
        else:
            return f"Download failed: HTTP {response.status_code}"
    except Exception as e:
        return f"Download error: {str(e)}"
77
+
tools/image_tools.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Image processing tools for the GAIA Agent.
3
+ Includes image analysis (GPT-4o vision), manipulation, annotation, and OCR.
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import tempfile
9
+ import base64
10
+ from typing import Optional
11
+ from langchain_core.tools import tool
12
+ import openai
13
+ from dotenv import load_dotenv
14
+
15
+ load_dotenv()
16
+ client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
17
+
18
+
19
@tool
def image_analyze(file_path: str, question: str) -> str:
    """Analyze an image (local path or URL) with GPT-4o vision.

    Use this to understand image contents, describe what's shown, read text,
    analyze diagrams, identify objects, or answer questions about images.

    Args:
        file_path: Path to the image file OR an http/https URL
        question: What you want to know about the image

    Returns:
        The model's answer, or an error string.
    """
    try:
        # Remote URLs are passed through untouched; local files are inlined
        # as a base64 data URL.
        if file_path.lower().startswith(("http://", "https://")):
            image_part = {"type": "image_url", "image_url": {"url": file_path}}
        else:
            with open(file_path, "rb") as handle:
                encoded = base64.b64encode(handle.read()).decode("utf-8")
            mime_by_ext = {
                "png": "image/png",
                "jpg": "image/jpeg",
                "jpeg": "image/jpeg",
                "gif": "image/gif",
                "webp": "image/webp",
            }
            # Unknown extensions default to PNG, matching the original tool.
            mime = mime_by_ext.get(file_path.lower().split('.')[-1], "image/png")
            image_part = {
                "type": "image_url",
                "image_url": {"url": f"data:{mime};base64,{encoded}"},
            }

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        image_part,
                    ],
                }
            ],
            max_tokens=800,
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Image analysis error: {str(e)}"
+
69
+
70
@tool
def image_manipulate(
    file_path: str,
    operation: str,
    params: str = "{}"
) -> str:
    """Manipulate an image file using PIL/Pillow.

    Operations available:
    - crop: Crop image. Params: {"box": [left, top, right, bottom]}
    - rotate: Rotate image. Params: {"angle": 90} (degrees, counterclockwise)
    - resize: Resize image. Params: {"width": 800, "height": 600} or {"scale": 0.5}
    - flip: Flip image. Params: {"direction": "horizontal"} or {"direction": "vertical"}
    - grayscale: Convert to grayscale. No params needed.
    - brightness: Adjust brightness. Params: {"factor": 1.5} (1.0 = original)
    - contrast: Adjust contrast. Params: {"factor": 1.5} (1.0 = original)
    - sharpen: Sharpen image. Params: {"factor": 2.0} (1.0 = original)
    - blur: Apply Gaussian blur. Params: {"radius": 2}
    - thumbnail: Create thumbnail. Params: {"size": [128, 128]}

    Args:
        file_path: Path to the image file
        operation: One of the operations listed above
        params: JSON string with operation parameters

    Returns:
        A status message including the output path, or an error string.
    """
    try:
        from PIL import Image, ImageEnhance, ImageFilter

        # Parse parameters
        try:
            p = json.loads(params) if params else {}
        except json.JSONDecodeError:
            return f"Error parsing params: {params}. Use JSON format like {{\"angle\": 90}}"

        # Open the image and capture its metadata up front.
        img = Image.open(file_path)
        original_format = img.format or "PNG"
        # Record the size NOW: re-opening the file later just to report the
        # original size (as the previous version did) leaks a file handle.
        original_size = img.size

        operation = operation.lower().strip()

        if operation == "crop":
            if "box" not in p:
                return "Error: crop requires 'box' param: {\"box\": [left, top, right, bottom]}"
            box = tuple(p["box"])
            img = img.crop(box)

        elif operation == "rotate":
            angle = p.get("angle", 90)
            # expand=True grows the canvas so rotated corners are not clipped.
            expand = p.get("expand", True)
            img = img.rotate(angle, expand=expand)

        elif operation == "resize":
            if "scale" in p:
                new_width = int(img.width * p["scale"])
                new_height = int(img.height * p["scale"])
            elif "width" in p and "height" in p:
                new_width = p["width"]
                new_height = p["height"]
            elif "width" in p:
                # Only one dimension given: preserve the aspect ratio.
                new_width = p["width"]
                new_height = int(img.height * (p["width"] / img.width))
            elif "height" in p:
                new_height = p["height"]
                new_width = int(img.width * (p["height"] / img.height))
            else:
                return "Error: resize requires 'width'/'height' or 'scale' param"
            img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

        elif operation == "flip":
            direction = p.get("direction", "horizontal")
            if direction == "horizontal":
                img = img.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
            elif direction == "vertical":
                img = img.transpose(Image.Transpose.FLIP_TOP_BOTTOM)
            else:
                return "Error: flip direction must be 'horizontal' or 'vertical'"

        elif operation == "grayscale":
            img = img.convert("L")

        elif operation == "brightness":
            factor = p.get("factor", 1.0)
            enhancer = ImageEnhance.Brightness(img)
            img = enhancer.enhance(factor)

        elif operation == "contrast":
            factor = p.get("factor", 1.0)
            enhancer = ImageEnhance.Contrast(img)
            img = enhancer.enhance(factor)

        elif operation == "sharpen":
            factor = p.get("factor", 2.0)
            enhancer = ImageEnhance.Sharpness(img)
            img = enhancer.enhance(factor)

        elif operation == "blur":
            radius = p.get("radius", 2)
            img = img.filter(ImageFilter.GaussianBlur(radius=radius))

        elif operation == "thumbnail":
            size = tuple(p.get("size", [128, 128]))
            # thumbnail() resizes in place, preserving aspect ratio.
            img.thumbnail(size, Image.Resampling.LANCZOS)

        else:
            return f"Unknown operation: {operation}. Available: crop, rotate, resize, flip, grayscale, brightness, contrast, sharpen, blur, thumbnail"

        # Save to temp file
        ext = file_path.lower().split('.')[-1]
        if ext not in ['jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp']:
            ext = 'png'

        output_path = os.path.join(tempfile.gettempdir(), f"manipulated_{os.path.basename(file_path)}")

        # JPEG cannot store alpha/palette modes; convert before saving.
        if ext in ['jpg', 'jpeg'] and img.mode in ['RGBA', 'LA', 'P']:
            img = img.convert('RGB')

        img.save(output_path, format=original_format if original_format else None)

        return f"Image manipulated successfully.\nOperation: {operation}\nOriginal size: {original_size}\nNew size: {img.size}\nSaved to: {output_path}"

    except ImportError:
        return "Error: Pillow is not installed. Please install it with: pip install Pillow"
    except Exception as e:
        return f"Image manipulation error: {str(e)}"
+ return f"Image manipulation error: {str(e)}"
195
+
196
+
197
@tool
def image_annotate(
    file_path: str,
    annotations: str
) -> str:
    """Add annotations (text, rectangles, circles, lines) to an image.

    Annotations format (JSON array):
    [
        {"type": "text", "text": "Label", "position": [x, y], "color": "red", "size": 20},
        {"type": "rectangle", "box": [x1, y1, x2, y2], "color": "blue", "width": 2},
        {"type": "circle", "center": [x, y], "radius": 50, "color": "green", "width": 2},
        {"type": "line", "start": [x1, y1], "end": [x2, y2], "color": "yellow", "width": 2},
        {"type": "arrow", "start": [x1, y1], "end": [x2, y2], "color": "red", "width": 2}
    ]

    Colors can be: "red", "green", "blue", "yellow", "white", "black", "orange", "purple", or RGB tuple like [255, 0, 0]

    Args:
        file_path: Path to the image file
        annotations: JSON string with list of annotations

    Returns:
        A status message including the output path, or an error string.
    """
    try:
        from PIL import Image, ImageDraw, ImageFont
        import math

        # Parse annotations
        try:
            annots = json.loads(annotations)
        except json.JSONDecodeError:
            return f"Error parsing annotations: {annotations}. Use JSON array format."

        # A single annotation object is accepted and wrapped in a list.
        if not isinstance(annots, list):
            annots = [annots]

        # Open the image; draw in RGBA so colors composite consistently.
        img = Image.open(file_path)
        if img.mode != 'RGBA':
            img = img.convert('RGBA')

        draw = ImageDraw.Draw(img)

        # Color mapping
        color_map = {
            "red": (255, 0, 0),
            "green": (0, 255, 0),
            "blue": (0, 0, 255),
            "yellow": (255, 255, 0),
            "white": (255, 255, 255),
            "black": (0, 0, 0),
            "orange": (255, 165, 0),
            "purple": (128, 0, 128),
            "cyan": (0, 255, 255),
            "magenta": (255, 0, 255),
        }

        def get_color(c):
            # Unknown names and malformed values fall back to red.
            if isinstance(c, str):
                return color_map.get(c.lower(), (255, 0, 0))
            elif isinstance(c, list):
                return tuple(c)
            return (255, 0, 0)

        # Try to load a font, fall back to default
        def get_font(size):
            try:
                # Try common font paths
                font_paths = [
                    "arial.ttf",
                    "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
                    "/System/Library/Fonts/Helvetica.ttc",
                    "C:/Windows/Fonts/arial.ttf",
                ]
                for fp in font_paths:
                    try:
                        return ImageFont.truetype(fp, size)
                    # Narrowed from a bare `except:` so KeyboardInterrupt /
                    # SystemExit still propagate instead of being swallowed.
                    except Exception:
                        continue
                return ImageFont.load_default()
            except Exception:
                return ImageFont.load_default()

        # Process each annotation
        for annot in annots:
            atype = annot.get("type", "").lower()
            color = get_color(annot.get("color", "red"))
            width = annot.get("width", 2)

            if atype == "text":
                text = annot.get("text", "")
                position = tuple(annot.get("position", [10, 10]))
                size = annot.get("size", 20)
                font = get_font(size)
                draw.text(position, text, fill=color, font=font)

            elif atype == "rectangle":
                box = annot.get("box", [0, 0, 100, 100])
                fill = annot.get("fill")
                fill_color = get_color(fill) if fill else None
                draw.rectangle(box, outline=color, width=width, fill=fill_color)

            elif atype == "circle":
                center = annot.get("center", [50, 50])
                radius = annot.get("radius", 25)
                # PIL draws ellipses from a bounding box, not center+radius.
                box = [center[0] - radius, center[1] - radius, center[0] + radius, center[1] + radius]
                fill = annot.get("fill")
                fill_color = get_color(fill) if fill else None
                draw.ellipse(box, outline=color, width=width, fill=fill_color)

            elif atype == "line":
                start = tuple(annot.get("start", [0, 0]))
                end = tuple(annot.get("end", [100, 100]))
                draw.line([start, end], fill=color, width=width)

            elif atype == "arrow":
                start = annot.get("start", [0, 0])
                end = annot.get("end", [100, 100])
                draw.line([tuple(start), tuple(end)], fill=color, width=width)

                # Draw arrowhead: two short strokes folded back from the tip.
                angle = math.atan2(end[1] - start[1], end[0] - start[0])
                arrow_length = 15
                arrow_angle = math.pi / 6  # 30 degrees

                p1 = (
                    end[0] - arrow_length * math.cos(angle - arrow_angle),
                    end[1] - arrow_length * math.sin(angle - arrow_angle)
                )
                p2 = (
                    end[0] - arrow_length * math.cos(angle + arrow_angle),
                    end[1] - arrow_length * math.sin(angle + arrow_angle)
                )
                draw.polygon([tuple(end), p1, p2], fill=color)

        # Save to temp file
        output_path = os.path.join(tempfile.gettempdir(), f"annotated_{os.path.basename(file_path)}")

        # Convert back to RGB if saving as JPEG (JPEG has no alpha channel)
        ext = file_path.lower().split('.')[-1]
        if ext in ['jpg', 'jpeg']:
            img = img.convert('RGB')

        img.save(output_path)

        return f"Image annotated successfully.\nAnnotations added: {len(annots)}\nSaved to: {output_path}"

    except ImportError:
        return "Error: Pillow is not installed. Please install it with: pip install Pillow"
    except Exception as e:
        return f"Image annotation error: {str(e)}"
+ return f"Image annotation error: {str(e)}"
347
+
348
+
349
@tool
def image_ocr(file_path: str, lang: str = "eng") -> str:
    """Extract text from an image using OCR (Optical Character Recognition).

    Uses Tesseract OCR engine. Requires tesseract to be installed on the system.

    Args:
        file_path: Path to the image file
        lang: Language code for OCR (default: "eng" for English).
              Common codes: eng, fra, deu, spa, ita, por, chi_sim, chi_tra, jpn, kor

    Returns:
        The recognized text (with word count and confidence when available),
        or an error string.
    """
    try:
        import pytesseract
        from PIL import Image

        # Open and preprocess image
        img = Image.open(file_path)

        # Tesseract works best on RGB or grayscale input.
        if img.mode not in ['RGB', 'L']:
            img = img.convert('RGB')

        # Extract text
        text = pytesseract.image_to_string(img, lang=lang)

        # Structured data (per-word confidence) is best-effort: if it fails,
        # fall back to returning the plain text result.
        try:
            data = pytesseract.image_to_data(img, lang=lang, output_type=pytesseract.Output.DICT)

            # conf values <= 0 mark non-word boxes; skip them for the average.
            confidences = [int(c) for c in data['conf'] if int(c) > 0]
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0
            word_count = len([w for w in data['text'] if w.strip()])

            return f"OCR Result:\n{'-'*40}\n{text.strip()}\n{'-'*40}\nWords detected: {word_count}\nAverage confidence: {avg_confidence:.1f}%"
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being silently swallowed.
        except Exception:
            return f"OCR Result:\n{'-'*40}\n{text.strip()}\n{'-'*40}"

    except ImportError as e:
        if "pytesseract" in str(e):
            return "Error: pytesseract is not installed. Please install it with: pip install pytesseract\nAlso ensure Tesseract OCR is installed on your system."
        return f"Import error: {str(e)}"
    except Exception as e:
        error_msg = str(e)
        if "tesseract" in error_msg.lower():
            return f"Tesseract OCR error: {error_msg}\n\nMake sure Tesseract is installed:\n- Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n- Mac: brew install tesseract\n- Linux: sudo apt install tesseract-ocr"
        return f"OCR error: {error_msg}"
396
+
tools/math_tools.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Mathematical computation tools for the GAIA Agent.
3
+ Includes symbolic math, matrix operations, calculator, and statistics.
4
+ """
5
+
6
+ import json
7
+ from langchain_core.tools import tool
8
+
9
+
10
@tool
def symbolic_math(expression: str, operation: str = "simplify", variable: str = "x") -> str:
    """Perform symbolic mathematics operations using SymPy.

    Operations available:
    - simplify: Simplify an expression
    - expand: Expand an expression
    - factor: Factor an expression
    - solve: Solve an equation (set equal to 0)
    - differentiate: Compute derivative with respect to variable
    - integrate: Compute indefinite integral with respect to variable
    - limit: Compute limit as variable approaches 0 (use expression like "sin(x)/x")
    - series: Compute Taylor series expansion

    Args:
        expression: Mathematical expression (use ** for power, e.g., "x**2 + 2*x + 1")
        operation: One of: simplify, expand, factor, solve, differentiate, integrate, limit, series
        variable: Variable to use for calculus operations (default: "x")
    """
    try:
        import sympy as sp
        from sympy.parsing.sympy_parser import parse_expr, standard_transformations, implicit_multiplication_application

        # Common single-letter symbols usable inside expressions.
        names = 'x y z t n a b c'
        symbol_map = dict(zip(names.split(), sp.symbols(names)))

        # Main variable for the calculus operations.
        var = symbol_map.get(variable, sp.Symbol(variable))

        # Parse with implicit multiplication so "2x" is read as 2*x.
        transformations = standard_transformations + (implicit_multiplication_application,)
        expr = parse_expr(expression, local_dict=symbol_map, transformations=transformations)

        op = operation.lower().strip()

        # Dispatch table (lazy lambdas, so only the chosen op runs).
        handlers = {
            "simplify": lambda: sp.simplify(expr),
            "expand": lambda: sp.expand(expr),
            "factor": lambda: sp.factor(expr),
            "solve": lambda: sp.solve(expr, var),
            "differentiate": lambda: sp.diff(expr, var),
            "diff": lambda: sp.diff(expr, var),
            "derivative": lambda: sp.diff(expr, var),
            "integrate": lambda: sp.integrate(expr, var),
            "integral": lambda: sp.integrate(expr, var),
            "limit": lambda: sp.limit(expr, var, 0),
            "series": lambda: sp.series(expr, var, 0, 6),
        }

        if op not in handlers:
            return f"Unknown operation: {op}. Available: simplify, expand, factor, solve, differentiate, integrate, limit, series"

        result = handlers[op]()
        return f"Input: {expression}\nOperation: {op}\nResult: {result}"

    except ImportError:
        return "Error: SymPy is not installed. Please install it with: pip install sympy"
    except Exception as e:
        return f"Symbolic math error: {str(e)}"
71
+
72
+
73
@tool
def matrix_operations(matrix_a: str, operation: str = "determinant", matrix_b: str = "") -> str:
    """Perform matrix operations using NumPy.

    Operations available:
    - determinant: Compute determinant of matrix_a
    - inverse: Compute inverse of matrix_a
    - transpose: Compute transpose of matrix_a
    - eigenvalues: Compute eigenvalues of matrix_a
    - eigenvectors: Compute eigenvectors of matrix_a
    - rank: Compute rank of matrix_a
    - trace: Compute trace of matrix_a
    - multiply: Matrix multiplication of matrix_a @ matrix_b
    - add: Element-wise addition of matrix_a + matrix_b
    - solve: Solve linear system Ax = b (matrix_a is A, matrix_b is b vector)

    Args:
        matrix_a: Matrix as JSON array, e.g., "[[1,2],[3,4]]"
        operation: One of: determinant, inverse, transpose, eigenvalues, eigenvectors, rank, trace, multiply, add, solve
        matrix_b: Second matrix for binary operations (as JSON array)
    """
    try:
        import numpy as np

        # First operand: must be valid JSON describing a numeric array.
        try:
            mat = np.array(json.loads(matrix_a), dtype=float)
        except json.JSONDecodeError:
            return f"Error parsing matrix_a: {matrix_a}. Use JSON format like [[1,2],[3,4]]"

        operation = operation.lower().strip()

        # Second operand is only needed (and only parsed) for binary ops.
        other = None
        if matrix_b and operation in ("multiply", "add", "solve"):
            try:
                other = np.array(json.loads(matrix_b), dtype=float)
            except json.JSONDecodeError:
                return f"Error parsing matrix_b: {matrix_b}. Use JSON format like [[1,2],[3,4]]"

        if operation == "determinant":
            if mat.shape[0] != mat.shape[1]:
                return "Error: Determinant requires a square matrix"
            outcome = np.linalg.det(mat)
        elif operation == "inverse":
            if mat.shape[0] != mat.shape[1]:
                return "Error: Inverse requires a square matrix"
            outcome = np.linalg.inv(mat)
        elif operation == "transpose":
            outcome = mat.T
        elif operation == "eigenvalues":
            if mat.shape[0] != mat.shape[1]:
                return "Error: Eigenvalues require a square matrix"
            outcome = np.linalg.eigvals(mat)
        elif operation == "eigenvectors":
            if mat.shape[0] != mat.shape[1]:
                return "Error: Eigenvectors require a square matrix"
            vals, vecs = np.linalg.eig(mat)
            outcome = {"eigenvalues": vals.tolist(), "eigenvectors": vecs.tolist()}
        elif operation == "rank":
            outcome = np.linalg.matrix_rank(mat)
        elif operation == "trace":
            outcome = np.trace(mat)
        elif operation == "multiply":
            if other is None:
                return "Error: multiply operation requires matrix_b"
            outcome = mat @ other
        elif operation == "add":
            if other is None:
                return "Error: add operation requires matrix_b"
            outcome = mat + other
        elif operation == "solve":
            if other is None:
                return "Error: solve operation requires matrix_b (the b vector)"
            outcome = np.linalg.solve(mat, other)
        else:
            return f"Unknown operation: {operation}. Available: determinant, inverse, transpose, eigenvalues, eigenvectors, rank, trace, multiply, add, solve"

        # Render the result in a readable form for the agent.
        if isinstance(outcome, np.ndarray):
            rendered = np.array2string(outcome, precision=6, suppress_small=True)
        elif isinstance(outcome, dict):
            rendered = json.dumps(outcome, indent=2)
        else:
            rendered = str(outcome)

        return f"Matrix A:\n{np.array2string(mat)}\nOperation: {operation}\nResult:\n{rendered}"

    except ImportError:
        return "Error: NumPy is not installed. Please install it with: pip install numpy"
    except np.linalg.LinAlgError as e:
        return f"Linear algebra error: {str(e)}"
    except Exception as e:
        return f"Matrix operation error: {str(e)}"
167
+
168
+
169
@tool
def calculator(expression: str) -> str:
    """Evaluate a mathematical expression with high precision.

    Supports standard math operations: +, -, *, /, ** (power), % (modulo)
    Also supports functions: sqrt, sin, cos, tan, log, log10, exp, abs, ceil, floor, round
    Constants: pi, e

    Args:
        expression: Mathematical expression to evaluate, e.g., "sqrt(2) * pi" or "2**10 + 5"

    Returns:
        A string "Expression: ...\nResult: ..." on success, or an "Error: ..." /
        "Math error: ..." / "Calculator error: ..." message on failure.
    """
    try:
        import math

        # Safe evaluation context: only math functions and constants are
        # visible to the expression. (The previous Decimal/getcontext setup
        # was dead code — eval() below works on floats, not Decimals.)
        safe_dict = {
            'sqrt': math.sqrt,
            'sin': math.sin,
            'cos': math.cos,
            'tan': math.tan,
            'asin': math.asin,
            'acos': math.acos,
            'atan': math.atan,
            'atan2': math.atan2,
            'log': math.log,
            'log10': math.log10,
            'log2': math.log2,
            'exp': math.exp,
            'pow': pow,
            'abs': abs,
            'ceil': math.ceil,
            'floor': math.floor,
            'round': round,
            'pi': math.pi,
            'e': math.e,
            'inf': math.inf,
            'factorial': math.factorial,
            'gcd': math.gcd,
            'lcm': math.lcm,
            'degrees': math.degrees,
            'radians': math.radians,
        }

        # Basic security check - only allow safe characters
        allowed_chars = set('0123456789+-*/.()%, abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_')
        if not all(c in allowed_chars for c in expression):
            return "Error: Expression contains invalid characters. Only math operations and functions are allowed."

        # Block dunder access (e.g. "pi.__class__"), which the character
        # whitelist alone permits. NOTE: eval() with stripped builtins is
        # still not a hardened sandbox — only feed it agent-generated input.
        if "__" in expression:
            return "Error: Expression contains invalid characters. Only math operations and functions are allowed."

        # Evaluate the expression with builtins removed
        result = eval(expression, {"__builtins__": {}}, safe_dict)

        # Format the result: render whole-number floats without a decimal
        # point; guard inf/nan first, since int() raises on them and would
        # otherwise surface as a confusing "Calculator error".
        if isinstance(result, float):
            if math.isfinite(result) and result == int(result):
                result_str = str(int(result))
            else:
                result_str = f"{result:.15g}"  # High precision but remove trailing zeros
        else:
            result_str = str(result)

        return f"Expression: {expression}\nResult: {result_str}"

    except ZeroDivisionError:
        return "Error: Division by zero"
    except ValueError as e:
        return f"Math error: {str(e)}"
    except Exception as e:
        return f"Calculator error: {str(e)}"
242
+
243
+
244
@tool
def statistical_analysis(data: str, operation: str = "describe") -> str:
    """Perform statistical analysis on numerical data.

    Operations available:
    - describe: Full statistical summary (mean, median, std, min, max, quartiles)
    - mean: Arithmetic mean
    - median: Median value
    - mode: Most frequent value
    - std: Standard deviation
    - var: Variance
    - correlation: Correlation coefficient (requires 2D data like [[x1,y1],[x2,y2],...])
    - regression: Linear regression (requires 2D data)
    - percentile: Compute 25th, 50th, 75th percentiles
    - zscore: Compute z-scores for each value

    Args:
        data: Numerical data as JSON array, e.g., "[1, 2, 3, 4, 5]" or "[[1,2],[3,4]]" for 2D
        operation: One of: describe, mean, median, mode, std, var, correlation, regression, percentile, zscore
    """
    try:
        # Imported lazily so a missing numpy/scipy degrades to a readable
        # error message instead of breaking module import.
        import numpy as np
        from scipy import stats as sp_stats

        # Parse the JSON payload into a float ndarray (1D or 2D).
        try:
            arr = np.array(json.loads(data), dtype=float)
        except json.JSONDecodeError:
            return f"Error parsing data: {data}. Use JSON format like [1, 2, 3, 4, 5]"

        # Normalize the operation name so "Mean " etc. still dispatch.
        operation = operation.lower().strip()

        if operation == "describe":
            if arr.ndim == 1:
                # Full summary for a flat series; everything cast to float
                # so the dict is JSON-serializable below.
                result = {
                    "count": len(arr),
                    "mean": float(np.mean(arr)),
                    "median": float(np.median(arr)),
                    "std": float(np.std(arr)),
                    "variance": float(np.var(arr)),
                    "min": float(np.min(arr)),
                    "max": float(np.max(arr)),
                    "25th_percentile": float(np.percentile(arr, 25)),
                    "50th_percentile": float(np.percentile(arr, 50)),
                    "75th_percentile": float(np.percentile(arr, 75)),
                    "sum": float(np.sum(arr)),
                }
            else:
                # 2D input: summarize column-wise instead.
                result = {
                    "shape": arr.shape,
                    "mean_per_column": np.mean(arr, axis=0).tolist(),
                    "std_per_column": np.std(arr, axis=0).tolist(),
                }
        elif operation == "mean":
            result = float(np.mean(arr))
        elif operation == "median":
            result = float(np.median(arr))
        elif operation == "mode":
            # keepdims=False gives scalar mode/count (scipy >= 1.9 API).
            mode_result = sp_stats.mode(arr.flatten(), keepdims=False)
            result = {"mode": float(mode_result.mode), "count": int(mode_result.count)}
        elif operation == "std":
            # NOTE: np.std defaults to the population std (ddof=0).
            result = float(np.std(arr))
        elif operation == "var":
            result = float(np.var(arr))
        elif operation == "correlation":
            if arr.ndim != 2 or arr.shape[1] != 2:
                return "Error: correlation requires 2D data with 2 columns, e.g., [[x1,y1],[x2,y2],...]"
            # Pearson correlation between column 0 (x) and column 1 (y).
            result = float(np.corrcoef(arr[:, 0], arr[:, 1])[0, 1])
        elif operation == "regression":
            if arr.ndim != 2 or arr.shape[1] != 2:
                return "Error: regression requires 2D data with 2 columns, e.g., [[x1,y1],[x2,y2],...]"
            # Ordinary least-squares fit of y on x.
            slope, intercept, r_value, p_value, std_err = sp_stats.linregress(arr[:, 0], arr[:, 1])
            result = {
                "slope": float(slope),
                "intercept": float(intercept),
                "r_squared": float(r_value**2),
                "p_value": float(p_value),
                "std_error": float(std_err),
                "equation": f"y = {slope:.6f}x + {intercept:.6f}"
            }
        elif operation == "percentile":
            result = {
                "25th": float(np.percentile(arr, 25)),
                "50th": float(np.percentile(arr, 50)),
                "75th": float(np.percentile(arr, 75)),
                "90th": float(np.percentile(arr, 90)),
                "95th": float(np.percentile(arr, 95)),
                "99th": float(np.percentile(arr, 99)),
            }
        elif operation == "zscore":
            # NOTE(review): constant input yields NaN z-scores (std == 0).
            zscores = sp_stats.zscore(arr.flatten())
            result = zscores.tolist()
        else:
            return f"Unknown operation: {operation}. Available: describe, mean, median, mode, std, var, correlation, regression, percentile, zscore"

        # Format result for the agent: dicts/lists as JSON, scalars as str.
        if isinstance(result, dict):
            result_str = json.dumps(result, indent=2)
        elif isinstance(result, list):
            result_str = json.dumps(result)
        else:
            result_str = str(result)

        return f"Data: {len(arr.flatten())} values\nOperation: {operation}\nResult:\n{result_str}"

    except ImportError as e:
        # Tell the operator which optional dependency is missing.
        missing = "scipy" if "scipy" in str(e) else "numpy"
        return f"Error: {missing} is not installed. Please install it with: pip install {missing}"
    except Exception as e:
        return f"Statistical analysis error: {str(e)}"
354
+
tools/media_tools.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Media processing tools for the GAIA Agent.
3
+ Includes YouTube transcript, audio transcription, and video analysis.
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import math
9
+ import shutil
10
+ import tempfile
11
+ import subprocess
12
+ from typing import Optional, List, Dict
13
+ from urllib.parse import urlparse, parse_qs
14
+
15
+ import requests
16
+ import openai
17
+ import base64
18
+ from langchain_core.tools import tool
19
+ from dotenv import load_dotenv
20
+
21
+ load_dotenv()
22
+ client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
23
+
24
+
25
+ def _extract_youtube_id(url: str) -> Optional[str]:
26
+ """Extract YouTube video ID from various URL formats."""
27
+ try:
28
+ u = urlparse(url)
29
+ host = (u.netloc or "").lower()
30
+ path = u.path or ""
31
+
32
+ # watch?v=VIDEO_ID
33
+ qs = parse_qs(u.query)
34
+ if "v" in qs and qs["v"]:
35
+ vid = qs["v"][0]
36
+ if len(vid) == 11:
37
+ return vid
38
+
39
+ # youtu.be/VIDEO_ID
40
+ if "youtu.be" in host:
41
+ seg = path.strip("/").split("/")
42
+ if seg and len(seg[0]) == 11:
43
+ return seg[0]
44
+
45
+ # /embed/VIDEO_ID, /shorts/VIDEO_ID, /live/VIDEO_ID
46
+ parts = path.strip("/").split("/")
47
+ for i, p in enumerate(parts[:-1]):
48
+ if p in {"embed", "shorts", "live"}:
49
+ vid = parts[i + 1]
50
+ if len(vid) == 11:
51
+ return vid
52
+
53
+ return None
54
+ except Exception:
55
+ return None
56
+
57
+
58
def _transcribe_audio_file(path: str) -> str:
    """Send a local audio file to OpenAI Whisper-1 and return the plain-text transcript."""
    with open(path, "rb") as audio_handle:
        # response_format="text" makes the API return a bare string.
        return client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_handle,
            response_format="text",
        )
67
+
68
+
69
+ def _encode_image_to_data_url(path: str) -> str:
70
+ """Helper to turn a local image into a data: URL for GPT-4o vision."""
71
+ with open(path, "rb") as img_file:
72
+ image_data = base64.b64encode(img_file.read()).decode("utf-8")
73
+ ext = path.lower().split('.')[-1]
74
+ media_type = {
75
+ "png": "image/png",
76
+ "jpg": "image/jpeg",
77
+ "jpeg": "image/jpeg",
78
+ "gif": "image/gif",
79
+ "webp": "image/webp",
80
+ }.get(ext, "image/png")
81
+ return f"data:{media_type};base64,{image_data}"
82
+
83
+
84
@tool
def youtube_transcript(video_url: str, languages: str = "en") -> str:
    """Get the transcript/captions from a YouTube video.

    First tries to get existing captions. If captions are disabled,
    falls back to downloading audio and transcribing with Whisper.

    Args:
        video_url: The YouTube video URL
        languages: Comma-separated language codes to prefer (default: "en")

    Returns:
        The transcript text (truncated to 8000 chars), or an explanatory
        error message if both captions and Whisper fallback fail.
    """
    video_id = _extract_youtube_id(video_url)
    if not video_id:
        return f"Could not extract video ID from: {video_url}"

    lang_list = [l.strip() for l in languages.split(",") if l.strip()]

    # First, try to get existing captions
    caption_error = None
    try:
        from youtube_transcript_api import YouTubeTranscriptApi

        # BUG FIX: the TranscriptList (which has the find_* helpers) comes
        # from list(), not fetch() — fetch() returns already-fetched
        # snippets and has no find_* methods, so the old code always fell
        # through to Whisper.
        api = YouTubeTranscriptApi()
        if hasattr(api, "list"):
            # youtube-transcript-api >= 1.0 instance API
            tlist = api.list(video_id)
        else:
            # pre-1.0 class-method API
            tlist = YouTubeTranscriptApi.list_transcripts(video_id)

        transcript = None

        # Preference order: manual captions, then auto-generated,
        # then whatever exists for the requested languages.
        for finder_name in (
            "find_manually_created_transcript",
            "find_generated_transcript",
            "find_transcript",
        ):
            if transcript is not None:
                break
            finder = getattr(tlist, finder_name)
            for lang in lang_list:
                try:
                    transcript = finder([lang])
                    break
                except Exception:
                    pass

        if transcript is not None:
            items = transcript.fetch()
            # Items are dicts in older releases and FetchedTranscriptSnippet
            # objects (with a .text attribute) in >= 1.0 — accept both.
            pieces = []
            for item in items:
                if isinstance(item, dict):
                    pieces.append(item.get("text", ""))
                else:
                    pieces.append(getattr(item, "text", ""))
            text = " ".join(pieces).strip()
            if text:
                return text[:8000]

    except Exception as e:
        # Captions might be disabled - we'll try fallback
        caption_error = f"{type(e).__name__}: {e}"

    # Fallback: Download audio and transcribe with Whisper
    try:
        return youtube_audio_transcribe.invoke({"video_url": video_url})
    except Exception as whisper_error:
        return (
            f"Transcript error: Captions unavailable and audio transcription failed.\n"
            f"Caption error: {caption_error or 'Unknown'}\n"
            f"Whisper error: {whisper_error}\n\n"
            f"Suggestion: Try using web_search to find information about this video instead."
        )
157
+
158
+
159
@tool
def youtube_audio_transcribe(video_url: str) -> str:
    """Download YouTube audio and transcribe with Whisper-1.

    Use when captions are unavailable or you want an audio-based transcript.

    Args:
        video_url: The YouTube video URL

    Raises:
        RuntimeError: If yt-dlp fails or produces no audio file.
    """
    video_id = _extract_youtube_id(video_url)
    if not video_id:
        return f"Could not extract video ID from: {video_id}" if False else f"Could not extract video ID from: {video_url}"

    # Download into a throwaway directory so the audio file is cleaned up
    # automatically, even if transcription raises.
    with tempfile.TemporaryDirectory() as tmpdir:
        # NOTE(review): the literal ".webm" name assumes yt-dlp writes to
        # this exact path regardless of the actual audio container chosen
        # by "bestaudio/best" — confirm against yt-dlp's -o handling.
        audio_path = f"{tmpdir}/{video_id}.webm"

        # Download audio using yt-dlp. The 25M cap matches Whisper's
        # upload limit; yt-dlp may skip (not fail) oversized files, which
        # the os.path.exists check below turns into an error.
        result = subprocess.run(
            [
                "yt-dlp",
                "-f", "bestaudio/best",
                "-o", audio_path,
                "--no-playlist",
                "--max-filesize", "25M",
                video_url,
            ],
            capture_output=True,
            text=True,
            timeout=120
        )

        if result.returncode != 0 or not os.path.exists(audio_path):
            # Raise (not return) so the caller (youtube_transcript) can
            # fold this into its combined error report.
            raise RuntimeError(f"yt-dlp failed.\nSTDERR:\n{result.stderr}\nSTDOUT:\n{result.stdout}")

        return _transcribe_audio_file(audio_path)
195
+
196
+
197
@tool
def audio_transcribe(file_path: str) -> str:
    """Transcribe an audio file to text using speech recognition.

    Args:
        file_path: Path to the audio file (.mp3, .wav, .m4a, etc.) or an http/https URL
    """
    try:
        if not file_path.lower().startswith(("http://", "https://")):
            # Local file: hand it to Whisper directly.
            return _transcribe_audio_file(file_path)

        # Remote file: download to a temp file first, then always clean up.
        with tempfile.NamedTemporaryFile(suffix=".audio", delete=False) as tmp:
            resp = requests.get(file_path, timeout=120)
            resp.raise_for_status()
            tmp.write(resp.content)
            local_path = tmp.name
        try:
            return _transcribe_audio_file(local_path)
        finally:
            os.unlink(local_path)
    except Exception as e:
        return f"Transcription error: {str(e)}"
220
+
221
+
222
@tool
def video_metadata(video_url: str) -> str:
    """Fetch coarse metadata for a video (duration, resolution, fps, title) using yt-dlp.

    Args:
        video_url: The video URL (YouTube or direct link)
    """
    command = [
        "yt-dlp",
        "--dump-single-json",
        "--no-playlist",
        "--no-warnings",
        video_url,
    ]
    try:
        proc = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=90,
        )
        if proc.returncode != 0:
            return f"Metadata error: yt-dlp failed.\nStdout: {proc.stdout[:4000]}\nStderr: {proc.stderr[:4000]}"

        info = json.loads(proc.stdout)
        # Keep only the fields the agent actually needs.
        summary = {
            "title": info.get("title"),
            "uploader": info.get("uploader"),
            "duration_seconds": info.get("duration"),
            "width": info.get("width"),
            "height": info.get("height"),
            "fps": info.get("fps"),
            "url": video_url,
        }
        return json.dumps(summary, indent=2)
    except Exception as e:
        return f"Metadata error: {str(e)}"
258
+
259
+
260
@tool
def video_frame_analyze(
    video_url: str,
    vision_task_prompt: str,
    scene_threshold: Optional[float] = None,
    scene_threshold_low: float = 0.2,
    scene_threshold_high: float = 0.4,
    max_frames: int = 120,
    batch_size: int = 6,
) -> str:
    """Download a video, extract scene-change frames, and run GPT-4o vision batches.

    Args:
        video_url: URL to the video (YouTube or direct)
        vision_task_prompt: Task for the vision model (e.g., count bird species per frame)
        scene_threshold: Optional direct ffmpeg scene threshold (0-1). If None, use mid of low/high.
        scene_threshold_low: Lower bound for threshold (default 0.2)
        scene_threshold_high: Upper bound for threshold (default 0.4)
        max_frames: Cap on frames to send to vision (downsamples if exceeded).
        batch_size: Number of frames per GPT-4o call (keep modest to control context size).

    Returns:
        JSON summary (threshold, frame counts, per-batch model responses),
        or an error string describing the failing stage.
    """
    # Manual mkdtemp + finally-rmtree (instead of a TemporaryDirectory
    # context) so early returns inside the try still get cleaned up.
    tmpdir = tempfile.mkdtemp(prefix="video_analyze_")
    try:
        video_path = os.path.join(tmpdir, "video.mp4")
        frame_dir = os.path.join(tmpdir, "frames")
        os.makedirs(frame_dir, exist_ok=True)

        # Step 1: obtain video (URL via yt-dlp, or local path copy)
        if video_url.lower().startswith(("http://", "https://")):
            # Ensure ffmpeg exists for merging
            if shutil.which("ffmpeg") is None and shutil.which("avconv") is None:
                return "Download error: ffmpeg/avconv not found in PATH; required for muxing."

            # Use an AVC/H.264 + m4a combination to avoid unsupported codecs, cap at 1080p.
            out_template = os.path.join(tmpdir, "video.%(ext)s")
            dl = subprocess.run(
                [
                    "yt-dlp",
                    "-f",
                    "bestvideo[ext=mp4][vcodec^=avc1][height<=1080]+bestaudio[ext=m4a]/best[ext=mp4]/best",
                    "--merge-output-format",
                    "mp4",
                    "--recode-video",
                    "mp4",
                    "--no-keep-video",
                    "--no-playlist",
                    "--no-warnings",
                    "-o",
                    out_template,
                    video_url,
                ],
                capture_output=True,
                text=True,
                timeout=240,
            )
            if dl.returncode != 0:
                return f"Download error: {dl.stderr[:4000] or dl.stdout[:4000]}"

            # Locate the merged/re-encoded mp4 (yt-dlp decides the exact name).
            candidates = [
                os.path.join(tmpdir, f)
                for f in os.listdir(tmpdir)
                if f.lower().endswith(".mp4")
            ]
            if not candidates:
                return (
                    "Download error: no mp4 produced after merge/recode. "
                    f"yt-dlp stdout: {dl.stdout[:2000]} stderr: {dl.stderr[:2000]}"
                )
            # Pick the largest mp4 (most likely the merged one)
            best_mp4 = max(candidates, key=lambda p: os.path.getsize(p))
            if os.path.getsize(best_mp4) < 1024:
                return (
                    "Download error: merged file is empty or too small. "
                    f"yt-dlp stdout: {dl.stdout[:2000]} stderr: {dl.stderr[:2000]}"
                )
            shutil.move(best_mp4, video_path)
        else:
            # Non-URL input is treated as a local filesystem path.
            if not os.path.exists(video_url):
                return f"Video path not found: {video_url}"
            shutil.copy2(video_url, video_path)

        # Step 2: choose scene threshold, clamped into [low, high].
        thr_low = max(0.0, min(1.0, scene_threshold_low))
        thr_high = max(thr_low, min(1.0, scene_threshold_high))
        if scene_threshold is not None:
            thr = max(thr_low, min(thr_high, scene_threshold))
        else:
            thr = (thr_low + thr_high) / 2.0

        # Step 3: extract one frame per detected scene change.
        ffmpeg_cmd = [
            "ffmpeg",
            "-i",
            video_path,
            "-vf",
            f"select='gt(scene,{thr})',showinfo",
            "-vsync",
            "vfr",
            os.path.join(frame_dir, "frame_%05d.jpg"),
        ]
        ff = subprocess.run(
            ffmpeg_cmd,
            capture_output=True,
            text=True,
            timeout=180,
        )
        # Sorted filenames keep frames in chronological order.
        frames = sorted(
            [
                os.path.join(frame_dir, f)
                for f in os.listdir(frame_dir)
                if f.lower().endswith((".jpg", ".jpeg", ".png", ".webp"))
            ]
        )
        if not frames:
            return f"No frames extracted with scene threshold {thr}. ffmpeg stderr: {ff.stderr[:2000]}"

        # Downsample evenly if there are more frames than max_frames.
        total_frames = len(frames)
        if total_frames > max_frames:
            step = math.ceil(total_frames / max_frames)
            frames = frames[::step]

        # Step 4: batch frames and call GPT-4o vision once per batch.
        batches = [frames[i : i + batch_size] for i in range(0, len(frames), batch_size)]
        batch_outputs: List[Dict[str, str]] = []

        for idx, batch in enumerate(batches, start=1):
            # One text part with the task, followed by the batch's images.
            content = [
                {
                    "type": "text",
                    "text": (
                        "You are a vision assistant. "
                        "For each image, run the requested task and return a compact JSON array "
                        "with objects: {frame_id, result}. "
                        "frame_id should match the filename. "
                        "Task:\n"
                        f"{vision_task_prompt}"
                    ),
                }
            ]
            for p in batch:
                data_url = _encode_image_to_data_url(p)
                content.append(
                    {
                        "type": "image_url",
                        "image_url": {"url": data_url},
                    }
                )

            resp = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": content}],
                max_tokens=1200,
            )
            # The model's answer is kept verbatim; parsing is left to the agent.
            batch_outputs.append(
                {
                    "batch_index": idx,
                    "frames": [os.path.basename(p) for p in batch],
                    "response": resp.choices[0].message.content,
                }
            )

        summary = {
            "scene_threshold_used": thr,
            "frames_extracted": total_frames,
            "frames_sent": len(frames),
            "batch_size": batch_size,
            "batches": batch_outputs,
        }
        return json.dumps(summary, indent=2)
    except Exception as e:
        return f"Video frame analyze error: {str(e)}"
    finally:
        # Always remove the working directory, including the downloaded video.
        shutil.rmtree(tmpdir, ignore_errors=True)
434
+
tools/web_tools.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Web-related tools for the GAIA Agent.
3
+ Includes web search, Wikipedia lookup, arXiv search, and webpage fetching.
4
+ """
5
+
6
+ import os
7
+ import re
8
+ import requests
9
+ from langchain_core.tools import tool
10
+ from tavily import TavilyClient
11
+ import wikipedia
12
+ import arxiv
13
+ from dotenv import load_dotenv
14
+
15
+ load_dotenv()
16
+
17
+
18
@tool
def web_search(query: str, include_content: bool = True, max_results: int = 5) -> str:
    """Search the web for current information with full page content.

    Use this for facts, news, people, places, events, or anything you need to look up.
    Returns search results WITH full page content, so you can get detailed information directly.

    Args:
        query: The search query
        include_content: If True, includes full page content (default: True)
        max_results: Number of results to return (default: 5)
    """
    try:
        # Fresh Tavily client per call; key comes from the environment (Space secret).
        client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
        response = client.search(
            query,
            max_results=max_results,
            include_raw_content=include_content,  # Get full page content
            include_answer=True,  # Get a direct answer if available
        )

        output = []

        # Include Tavily's direct answer if available
        if response.get("answer"):
            output.append(f"📌 DIRECT ANSWER: {response['answer']}")
            output.append("=" * 50)

        results = response.get("results", [])
        if not results:
            return "No search results found."

        for i, r in enumerate(results, 1):
            output.append(f"\n[{i}] {r.get('title', 'N/A')}")
            output.append(f"URL: {r.get('url', 'N/A')}")
            output.append(f"Snippet: {r.get('content', 'N/A')}")

            # Include full page content if available
            raw_content = r.get('raw_content')
            if raw_content:
                # Truncate to a reasonable length per result so the whole
                # tool output stays within the model's context budget.
                content_preview = raw_content[:3000]
                if len(raw_content) > 3000:
                    content_preview += "\n...[content truncated]"
                output.append(f"\nFull Content:\n{content_preview}")
            # Visual separator between results.
            output.append("-" * 40)

        return "\n".join(output)
    except Exception as e:
        return f"Search error: {str(e)}"
68
+
69
+
70
@tool
def wikipedia_lookup(topic: str) -> str:
    """Look up a topic on Wikipedia for detailed encyclopedic information.

    Args:
        topic: The topic to look up
    """
    def _render(page):
        # Shared formatting for whichever page we end up on.
        return f"Title: {page.title}\n\nSummary:\n{page.summary[:4000]}"

    try:
        matches = wikipedia.search(topic, results=3)
        if not matches:
            return f"No Wikipedia article found for: {topic}"

        try:
            return _render(wikipedia.page(matches[0], auto_suggest=False))
        except wikipedia.DisambiguationError as e:
            # Ambiguous title: fall back to the first disambiguation option.
            if e.options:
                return _render(wikipedia.page(e.options[0], auto_suggest=False))
            return f"Multiple matches found: {e.options[:5]}"
        except wikipedia.PageError:
            return f"Page not found: {matches[0]}"
    except Exception as e:
        return f"Wikipedia error: {str(e)}"
94
+
95
+
96
@tool
def arxiv_search(query: str, max_results: int = 5) -> str:
    """Search arXiv for academic papers and research articles.

    Use this for scientific papers, research, preprints, and academic publications.
    Returns paper titles, authors, abstracts, and arXiv IDs.

    Args:
        query: Search query (can include author names, titles, or topics)
        max_results: Maximum number of results to return (default: 5)
    """
    try:
        # Run the relevance-sorted search and materialize the results.
        papers = list(
            arxiv.Client().results(
                arxiv.Search(
                    query=query,
                    max_results=max_results,
                    sort_by=arxiv.SortCriterion.Relevance,
                )
            )
        )

        if not papers:
            return f"No arXiv papers found for: {query}"

        lines = []
        for idx, paper in enumerate(papers, 1):
            lines.append(f"[{idx}] {paper.title}")

            # Show at most 5 author names, noting the true count beyond that.
            author_line = f" Authors: {', '.join(a.name for a in paper.authors[:5])}"
            if len(paper.authors) > 5:
                author_line += f" et al. ({len(paper.authors)} total)"
            lines.append(author_line)

            lines.append(f" Published: {paper.published.strftime('%Y-%m-%d')}")
            lines.append(f" arXiv ID: {paper.entry_id.split('/')[-1]}")
            lines.append(f" Categories: {', '.join(paper.categories[:3])}")
            lines.append(f" PDF: {paper.pdf_url}")

            # Abstract clipped to ~500 chars with newlines flattened.
            abstract = paper.summary.replace('\n', ' ')[:500]
            if len(paper.summary) > 500:
                abstract += "..."
            lines.append(f" Abstract: {abstract}")
            lines.append("---")

        return "\n".join(lines)
    except Exception as e:
        return f"arXiv search error: {str(e)}"
141
+
142
+
143
@tool
def webpage_fetch(url: str, extract_links: bool = False) -> str:
    """Fetch and read the content of a webpage URL.

    Use this to read the full content of a page from search results.
    After web_search returns URLs, use this tool to get detailed information.

    Args:
        url: The URL to fetch (http or https)
        extract_links: If True, also extract and list links found on the page
    """
    try:
        # Desktop-browser User-Agent: some sites reject the default
        # python-requests UA.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        html = response.text

        # Try to use BeautifulSoup for better parsing
        try:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html, 'html.parser')

            # Remove script/style and common boilerplate containers.
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                element.decompose()

            # Get title
            # NOTE(review): soup.title.string can be None for an empty
            # <title>, which renders as the literal string "None".
            title = soup.title.string if soup.title else "No title"

            # Get main text content
            text = soup.get_text(separator='\n', strip=True)

            # Clean up excessive whitespace
            lines = [line.strip() for line in text.splitlines() if line.strip()]
            text = '\n'.join(lines)

            # Extract links if requested
            links_text = ""
            if extract_links:
                links = []
                for a in soup.find_all('a', href=True)[:20]:  # Limit to 20 links
                    href = a['href']
                    link_text = a.get_text(strip=True)[:50]
                    # Only absolute links are reported; relative hrefs are skipped.
                    if href.startswith('http'):
                        links.append(f" - {link_text}: {href}")
                if links:
                    links_text = "\n\nLinks found:\n" + "\n".join(links)

            # Truncate to reasonable length
            if len(text) > 8000:
                text = text[:8000] + "\n...[truncated]"

            return f"Title: {title}\nURL: {url}\n\nContent:\n{text}{links_text}"

        except ImportError:
            # Fallback: basic HTML tag stripping without BeautifulSoup
            # Remove script and style content
            html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
            html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)

            # Remove HTML tags
            text = re.sub(r'<[^>]+>', ' ', html)

            # Decode HTML entities (&amp; etc.)
            import html as html_module
            text = html_module.unescape(text)

            # Clean up whitespace
            text = re.sub(r'\s+', ' ', text).strip()

            # Truncate
            if len(text) > 8000:
                text = text[:8000] + "...[truncated]"

            return f"URL: {url}\n\nContent:\n{text}\n\n(Note: Install beautifulsoup4 for better parsing: pip install beautifulsoup4)"

    except requests.exceptions.Timeout:
        return f"Error: Request timed out for URL: {url}"
    except requests.exceptions.HTTPError as e:
        return f"HTTP Error {e.response.status_code}: Could not fetch {url}"
    except Exception as e:
        return f"Error fetching webpage: {str(e)}"
229
+