jebaponselvasingh committed on
Commit
d1dcd56
·
1 Parent(s): e0ff305

changes in the domain structure

Browse files
Files changed (5) hide show
  1. .env +0 -1
  2. __pycache__/agent_enhanced.cpython-312.pyc +0 -0
  3. agent_enhanced.py +549 -520
  4. app.py +160 -277
  5. requirements.txt +14 -4
.env DELETED
@@ -1 +0,0 @@
1
- OPENAI_API_KEY="sk-proj-[REDACTED — a real API key was committed here; deleting the .env file does not remove it from git history. Rotate/revoke this credential immediately and purge it from history (e.g. git-filter-repo), then supply the key via environment secrets instead of a tracked file.]"
 
 
__pycache__/agent_enhanced.cpython-312.pyc ADDED
Binary file (36.9 kB). View file
 
agent_enhanced.py CHANGED
@@ -1,33 +1,68 @@
1
  """
2
- Enhanced GAIA Agent with LangGraph
3
- Separate module for cleaner architecture and easier customization
4
  """
5
 
6
  import os
7
  import re
8
  import json
9
  import requests
10
- import tempfile
11
- from typing import TypedDict, Annotated, Sequence, Literal, Any
 
 
12
  import operator
 
 
 
13
 
14
  from langgraph.graph import StateGraph, END
15
  from langgraph.prebuilt import ToolNode
16
  from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, BaseMessage
17
  from langchain_core.tools import tool
18
- from langchain_openai import ChatOpenAI
19
  from langchain_community.tools import DuckDuckGoSearchResults
20
  from langchain_experimental.utilities import PythonREPL
21
  import pandas as pd
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  # ============ STATE DEFINITION ============
25
  class AgentState(TypedDict):
26
- """State maintained throughout the agent's execution."""
27
  messages: Annotated[Sequence[BaseMessage], operator.add]
28
  task_id: str
29
  file_path: str | None
30
- file_content: str | None
31
  iteration_count: int
32
  final_answer: str | None
33
 
@@ -36,74 +71,46 @@ class AgentState(TypedDict):
36
  @tool
37
  def web_search(query: str) -> str:
38
  """
39
- Search the web using DuckDuckGo for current information.
40
- Use this for questions about recent events, facts, statistics, or any information
41
- that might have changed or that you're uncertain about.
42
 
43
  Args:
44
- query: The search query string
45
-
46
- Returns:
47
- Search results with relevant snippets
48
  """
49
- import logging
50
-
51
- # Suppress non-critical errors from DuckDuckGo's internal engines
52
- # (Some engines like grokipedia may fail due to DNS issues, but others work fine)
53
- ddgs_logger = logging.getLogger("ddgs.ddgs")
54
- primp_logger = logging.getLogger("primp")
55
-
56
- # Store original levels
57
- ddgs_original = ddgs_logger.level if ddgs_logger.level else logging.NOTSET
58
- primp_original = primp_logger.level if primp_logger.level else logging.NOTSET
59
-
60
- # Suppress INFO level logs (which include non-critical engine errors)
61
- ddgs_logger.setLevel(logging.WARNING)
62
- primp_logger.setLevel(logging.WARNING)
63
 
64
  try:
65
- search = DuckDuckGoSearchResults(max_results=5, output_format="list")
66
  results = search.run(query)
67
 
68
- # Restore original logging levels
69
- if ddgs_original != logging.NOTSET:
70
- ddgs_logger.setLevel(ddgs_original)
71
- if primp_original != logging.NOTSET:
72
- primp_logger.setLevel(primp_original)
73
-
74
  if isinstance(results, list):
75
  formatted = []
76
  for r in results:
77
  if isinstance(r, dict):
78
- formatted.append(f"Title: {r.get('title', 'N/A')}\nSnippet: {r.get('snippet', 'N/A')}\nLink: {r.get('link', 'N/A')}")
79
- else:
80
- formatted.append(str(r))
81
- return "\n\n---\n\n".join(formatted)
82
- return str(results)
 
 
83
  except Exception as e:
84
- # Restore original logging levels even on exception
85
- if ddgs_original != logging.NOTSET:
86
- ddgs_logger.setLevel(ddgs_original)
87
- if primp_original != logging.NOTSET:
88
- primp_logger.setLevel(primp_original)
89
- return f"Search failed: {str(e)}. Try a different query or approach."
90
 
91
 
92
  @tool
93
  def python_executor(code: str) -> str:
94
  """
95
- Execute Python code for calculations, data analysis, or any computational task.
96
- You have access to standard libraries: math, statistics, datetime, json, re, collections.
 
97
 
98
  Args:
99
- code: Python code to execute. Print statements will show in output.
100
-
101
- Returns:
102
- The output/result of the code execution
103
  """
104
  try:
105
  repl = PythonREPL()
106
- # Add common imports to the code
107
  augmented_code = """
108
  import math
109
  import statistics
@@ -111,24 +118,28 @@ import datetime
111
  import json
112
  import re
113
  from collections import Counter, defaultdict
 
 
 
 
114
  """ + code
115
  result = repl.run(augmented_code)
116
- return result.strip() if result else "Code executed successfully with no output. Add print() to see results."
 
 
 
117
  except Exception as e:
118
- return f"Execution error: {str(e)}. Please fix the code and try again."
119
 
120
 
121
  @tool
122
  def read_file(file_path: str) -> str:
123
  """
124
- Read and extract content from various file types.
125
- Supports: PDF, TXT, MD, CSV, JSON, XLSX, XLS, PY, and other text files.
126
 
127
  Args:
128
- file_path: Path to the file to read
129
-
130
- Returns:
131
- The content of the file as a string
132
  """
133
  try:
134
  if not os.path.exists(file_path):
@@ -136,598 +147,616 @@ def read_file(file_path: str) -> str:
136
 
137
  file_lower = file_path.lower()
138
 
 
 
 
 
 
 
 
 
 
139
  if file_lower.endswith('.pdf'):
140
- from langchain_community.document_loaders import PyPDFLoader
141
- loader = PyPDFLoader(file_path)
142
- pages = loader.load()
143
- content = "\n\n--- Page Break ---\n\n".join([p.page_content for p in pages])
144
- return f"PDF Content ({len(pages)} pages):\n{content}"
145
-
146
- elif file_lower.endswith(('.xlsx', '.xls')):
147
- df = pd.read_excel(file_path, sheet_name=None) # Read all sheets
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  result = []
149
- for sheet_name, sheet_df in df.items():
150
- result.append(f"=== Sheet: {sheet_name} ===\n{sheet_df.to_string()}")
 
 
151
  return "\n\n".join(result)
152
 
153
- elif file_lower.endswith('.csv'):
 
154
  df = pd.read_csv(file_path)
155
- return f"CSV Data ({len(df)} rows):\n{df.to_string()}"
156
 
157
- elif file_lower.endswith('.json'):
 
158
  with open(file_path, 'r', encoding='utf-8') as f:
159
  data = json.load(f)
160
- return f"JSON Content:\n{json.dumps(data, indent=2)}"
161
-
162
- else: # Default: treat as text
163
- with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
164
- content = f.read()
165
- return f"File Content:\n{content}"
 
 
166
 
167
  except Exception as e:
168
- return f"Error reading file: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
 
171
  @tool
172
  def calculator(expression: str) -> str:
173
  """
174
- Evaluate a mathematical expression safely.
175
- Supports: basic arithmetic, trigonometry, logarithms, exponents, etc.
176
 
177
  Args:
178
- expression: Mathematical expression (e.g., "sqrt(16) + log(100, 10)")
179
-
180
- Returns:
181
- The numerical result as a string
182
  """
183
  try:
184
  import math
185
-
186
- # Define allowed functions and constants
187
  safe_dict = {
188
  'abs': abs, 'round': round, 'min': min, 'max': max,
189
- 'sum': sum, 'pow': pow, 'len': len,
190
  'sqrt': math.sqrt, 'log': math.log, 'log10': math.log10,
191
  'log2': math.log2, 'exp': math.exp,
192
  'sin': math.sin, 'cos': math.cos, 'tan': math.tan,
193
- 'asin': math.asin, 'acos': math.acos, 'atan': math.atan,
194
- 'sinh': math.sinh, 'cosh': math.cosh, 'tanh': math.tanh,
195
  'ceil': math.ceil, 'floor': math.floor,
196
- 'pi': math.pi, 'e': math.e,
197
- 'factorial': math.factorial, 'gcd': math.gcd,
198
- 'degrees': math.degrees, 'radians': math.radians,
199
  }
200
-
201
  result = eval(expression, {"__builtins__": {}}, safe_dict)
202
-
203
- # Format nicely
204
- if isinstance(result, float):
205
- if result.is_integer():
206
- return str(int(result))
207
- return f"{result:.10g}" # Remove trailing zeros
208
- return str(result)
209
-
210
  except Exception as e:
211
- return f"Calculation error: {str(e)}. Check your expression syntax."
212
 
213
 
214
  @tool
215
  def wikipedia_search(query: str) -> str:
216
  """
217
- Search Wikipedia for factual information about a specific topic.
218
- Best for: historical facts, biographies, scientific concepts, definitions.
219
 
220
  Args:
221
- query: The topic to search for on Wikipedia
222
-
223
- Returns:
224
- Summary and key information from relevant Wikipedia articles
225
  """
226
  try:
227
  import urllib.parse
228
-
229
- # Search for articles
230
  search_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={urllib.parse.quote(query)}&format=json&srlimit=3"
231
- response = requests.get(search_url, timeout=10)
232
  data = response.json()
233
 
234
- if 'query' not in data or 'search' not in data['query'] or not data['query']['search']:
235
  return f"No Wikipedia articles found for '{query}'"
236
 
237
- # Get full content of top result
238
- top_title = data['query']['search'][0]['title']
239
- content_url = f"https://en.wikipedia.org/w/api.php?action=query&prop=extracts&exintro=true&explaintext=true&titles={urllib.parse.quote(top_title)}&format=json"
240
-
241
- content_response = requests.get(content_url, timeout=10)
242
- content_data = content_response.json()
243
-
244
- pages = content_data.get('query', {}).get('pages', {})
245
- for page_id, page_data in pages.items():
246
- if page_id != '-1':
247
- title = page_data.get('title', '')
248
- extract = page_data.get('extract', 'No content available')
249
- return f"Wikipedia: {title}\n\n{extract[:2000]}"
250
-
251
- return "Could not retrieve article content."
252
-
253
  except Exception as e:
254
- return f"Wikipedia search failed: {str(e)}"
255
 
256
 
257
  @tool
258
- def analyze_image(image_path: str, question: str) -> str:
259
  """
260
- Analyze an image file and answer questions about it.
261
- Note: This is a placeholder - implement with vision model if needed.
262
 
263
  Args:
264
- image_path: Path to the image file
265
- question: What to analyze or find in the image
266
-
267
- Returns:
268
- Description or analysis of the image
269
  """
270
- # This is a placeholder - you can integrate with GPT-4V or other vision models
271
- return f"Image analysis not implemented. File: {image_path}, Question: {question}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
 
274
- # Collect all tools
275
- TOOLS = [web_search, python_executor, read_file, calculator, wikipedia_search]
276
 
277
 
278
  # ============ SYSTEM PROMPT ============
279
- SYSTEM_PROMPT = """You are an expert AI assistant designed to solve GAIA benchmark questions with maximum accuracy.
280
-
281
- ## Your Mission
282
- Provide PRECISE, EXACT answers. The benchmark uses EXACT STRING MATCHING, so your final answer must match the ground truth character-for-character.
283
-
284
- ## Critical Answer Formatting Rules (MUST FOLLOW)
285
-
286
- **DO NOT include "FINAL ANSWER:" or any prefix - just the answer itself.**
287
-
288
- 1. **Numbers**: Give just the number.
289
- - βœ… CORRECT: "42"
290
- - ❌ WRONG: "The answer is 42", "42 units", "Answer: 42"
291
-
292
- 2. **Names**: Exact spelling as found in sources. Check Wikipedia/official sources for correct spelling, capitalization, and punctuation.
293
- - βœ… CORRECT: "John Smith"
294
- - ❌ WRONG: "john smith", "John smith"
295
-
296
- 3. **Lists**: Comma-separated, NO spaces after commas.
297
- - βœ… CORRECT: "apple,banana,cherry"
298
- - ❌ WRONG: "apple, banana, cherry", "apple,banana, cherry"
299
-
300
- 4. **Dates**: Use the format specified in the question, or YYYY-MM-DD if not specified.
301
- - βœ… CORRECT: "2024-01-15" or "January 15, 2024" (if question asks for that format)
302
- - ❌ WRONG: "1/15/2024" (unless question asks for it)
303
-
304
- 5. **Yes/No**: Just "Yes" or "No" (capitalized, no period).
305
- - βœ… CORRECT: "Yes"
306
- - ❌ WRONG: "yes", "Yes.", "The answer is Yes"
307
-
308
- 6. **Counts**: Just the number.
309
- - βœ… CORRECT: "5"
310
- - ❌ WRONG: "5 items", "five", "There are 5"
311
-
312
- 7. **No explanations**: Your final response must contain ONLY the answer, nothing else.
313
- - βœ… CORRECT: "Paris"
314
- - ❌ WRONG: "The answer is Paris because..."
315
-
316
- ## Detailed Problem-Solving Strategy
317
-
318
- ### Step 1: Analyze the Question
319
- - Read the question word-by-word. What exactly is being asked?
320
- - Identify keywords: "what", "who", "when", "where", "how many", "calculate", "find"
321
- - Note any format requirements or constraints mentioned in the question
322
- - Check if the question references specific data, files, or time periods
323
-
324
- ### Step 2: File Priority (CRITICAL)
325
- - If a file is mentioned or available, you MUST read it FIRST before any other action
326
- - Files often contain the exact answer or the data needed to calculate it
327
- - After reading the file, carefully search through ALL content - don't miss details
328
- - For Excel/CSV files, examine ALL sheets and ALL columns
329
- - For PDFs, read ALL pages - answers can be anywhere in the document
330
-
331
- ### Step 3: Plan Your Approach
332
- - Based on the question type, decide which tools you need:
333
- - **Data extraction from file**: read_file (then possibly python_executor for analysis)
334
- - **Mathematical calculations**: python_executor or calculator
335
- - **Historical/factual information**: wikipedia_search first, then web_search if needed
336
- - **Current/recent information**: web_search
337
- - **Complex data analysis**: python_executor with pandas/numpy
338
- - Create a step-by-step plan before executing
339
-
340
- ### Step 4: Execute Systematically
341
- - Use ONE tool at a time, wait for results
342
- - For file-based questions: read file β†’ extract relevant data β†’ calculate/analyze β†’ verify
343
- - For fact-based questions: search β†’ verify from multiple sources if possible β†’ extract exact answer
344
- - For calculation questions: gather inputs β†’ perform calculation β†’ double-check math
345
- - If initial search doesn't yield results, try different query keywords
346
-
347
- ### Step 5: Verify and Cross-Check
348
- - Verify your answer matches what was asked
349
- - For names: double-check spelling, capitalization, punctuation
350
- - For numbers: verify calculations, check units, ensure precision
351
- - For dates: verify format matches question requirements
352
- - If you found information from one source, try to verify with another if time permits
353
- - For lists: ensure proper comma-separated format with NO spaces
354
-
355
- ### Step 6: Format Correctly
356
- - Remove ALL prefixes ("FINAL ANSWER:", "The answer is:", etc.)
357
- - Remove ALL explanations and context
358
- - Ensure exact formatting (spaces, commas, capitalization)
359
- - Double-check: is this the EXACT format the question expects?
360
-
361
- ## Available Tools
362
- - `read_file`: Read PDFs, spreadsheets, text files - USE THIS FIRST if a file is available
363
- - `web_search`: Current information, recent events, facts (use for recent/current info)
364
- - `wikipedia_search`: Historical facts, biographies, definitions (use for established facts)
365
- - `python_executor`: Calculations, data processing, analysis (use for complex calculations or data analysis)
366
- - `calculator`: Quick mathematical calculations (use for simple arithmetic)
367
-
368
- ## Tool Usage Guidelines
369
-
370
- ### Reading Files (HIGHEST PRIORITY)
371
- - ALWAYS read files FIRST if available
372
- - For Excel files: check ALL sheets, read ALL relevant columns
373
- - For PDFs: read ALL pages, search for keywords from the question
374
- - For CSV files: examine ALL rows, look for patterns
375
- - Extract numbers, names, dates EXACTLY as they appear
376
-
377
- ### Web Search Strategy
378
- - Use specific, targeted queries with key terms from the question
379
- - If first search doesn't help, try rephrasing with different keywords
380
- - Look for official sources, authoritative websites
381
- - Extract exact values (numbers, names) - don't round or approximate
382
-
383
- ### Wikipedia Search Strategy
384
- - Use exact terms or names from the question
385
- - Read the summary/intro carefully - it often contains the answer
386
- - Check spelling, capitalization, dates exactly as shown
387
- - For biographical questions, search for the person's name
388
-
389
- ### Python Execution
390
- - Use for calculations, data analysis, or processing file contents
391
- - Be explicit with calculations - show your work in code
392
- - Use appropriate precision - don't round unnecessarily
393
- - Print the final result clearly
394
-
395
- ### Calculator
396
- - Use for simple arithmetic operations
397
- - Preserve precision - use exact fractions if possible
398
- - Format output correctly (integers as integers, decimals as needed)
399
-
400
- ## Critical Reminders
401
- - NEVER include "FINAL ANSWER:" or any prefix in your response
402
- - NEVER add explanations or context to your final answer
403
- - ALWAYS verify spelling, capitalization, and formatting
404
- - ALWAYS read files first if they are available - don't skip this step
405
- - For file-based questions, the answer is almost always in the file
406
- - Extract exact values - don't approximate or round unless necessary
407
- - If uncertain about format, look for clues in the question itself
408
- - Never guess - use tools to find accurate information
409
- - Use multiple tools if needed - don't stop after the first result if unsure
410
- - Cross-reference important facts when possible
411
-
412
- ## When You're Ready to Answer
413
- - Review your final answer one more time
414
- - Ensure it's formatted correctly (no prefixes, no explanations)
415
- - Ensure spelling, capitalization, and punctuation are exact
416
- - Ensure numbers are precise
417
- - When satisfied, respond with ONLY the answer - nothing else
418
-
419
- Remember: Your final message must contain ONLY the answer, nothing else. The scoring system uses exact string matching."""
420
-
421
-
422
- # ============ LANGGRAPH AGENT ============
423
  class GAIAAgent:
424
- """LangGraph-based agent for GAIA benchmark."""
425
 
426
  def __init__(
427
  self,
428
- model_name: str = "gpt-4o",
429
- api_key: str = None,
430
  temperature: float = 0,
431
- max_iterations: int = 25
432
  ):
433
- """
434
- Initialize the GAIA agent.
435
-
436
- Args:
437
- model_name: OpenAI model to use
438
- api_key: OpenAI API key (or set OPENAI_API_KEY env var)
439
- temperature: Model temperature (0 for deterministic)
440
- max_iterations: Maximum tool-use iterations
441
- """
442
- self.model_name = model_name
443
  self.max_iterations = max_iterations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
 
445
- self.llm = ChatOpenAI(
446
- model=model_name,
447
- temperature=temperature,
448
- api_key=api_key or os.environ.get("OPENAI_API_KEY")
449
- )
450
  self.llm_with_tools = self.llm.bind_tools(TOOLS)
451
  self.graph = self._build_graph()
452
 
453
  def _build_graph(self) -> StateGraph:
454
- """Construct the LangGraph workflow."""
455
  workflow = StateGraph(AgentState)
456
-
457
- # Define nodes
458
  workflow.add_node("agent", self._agent_node)
459
  workflow.add_node("tools", ToolNode(TOOLS))
460
  workflow.add_node("extract_answer", self._extract_answer_node)
461
-
462
- # Set entry point
463
  workflow.set_entry_point("agent")
464
-
465
- # Define edges
466
- workflow.add_conditional_edges(
467
- "agent",
468
- self._route_agent_output,
469
- {
470
- "tools": "tools",
471
- "end": "extract_answer"
472
- }
473
- )
474
  workflow.add_edge("tools", "agent")
475
  workflow.add_edge("extract_answer", END)
476
-
477
  return workflow.compile()
478
 
479
  def _agent_node(self, state: AgentState) -> dict:
480
- """Process messages and decide on next action."""
481
- messages = state["messages"]
482
  iteration = state.get("iteration_count", 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
 
484
- # Add iteration warnings to guide the agent
485
  if iteration >= self.max_iterations - 2:
486
- warning_msg = "⚠️ CRITICAL: You have reached the iteration limit. You MUST provide your final answer NOW in your next response. Format: ONLY the answer itself, no prefixes like 'FINAL ANSWER:' or 'The answer is:' - just the answer."
487
- messages = list(messages) + [SystemMessage(content=warning_msg)]
488
  elif iteration >= self.max_iterations - 5:
489
- warning_msg = "⚠️ WARNING: Approaching iteration limit. Start wrapping up and provide your final answer soon. Remember: just the answer, no prefix."
490
- messages = list(messages) + [SystemMessage(content=warning_msg)]
491
- elif iteration >= self.max_iterations - 8:
492
- reminder_msg = "Reminder: When you're ready to answer, provide ONLY the final answer with no prefix like 'FINAL ANSWER:' or 'The answer is:'. Check your answer format carefully."
493
- messages = list(messages) + [SystemMessage(content=reminder_msg)]
494
 
495
  try:
496
  response = self.llm_with_tools.invoke(messages)
497
  except Exception as e:
498
- # Graceful error handling
499
- error_msg = AIMessage(content=f"Error during reasoning: {str(e)}. Please try a different approach or provide your best answer.")
500
- return {
501
- "messages": [error_msg],
502
- "iteration_count": iteration + 1
503
- }
504
-
505
- return {
506
- "messages": [response],
507
- "iteration_count": iteration + 1
508
- }
509
-
510
- def _route_agent_output(self, state: AgentState) -> Literal["tools", "end"]:
511
- """Determine whether to use tools or finish."""
512
- last_message = state["messages"][-1]
513
- iteration = state.get("iteration_count", 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
514
 
515
- # Force end if max iterations reached
516
- if iteration >= self.max_iterations:
 
 
 
517
  return "end"
518
-
519
- # Check if agent wants to use tools
520
- if hasattr(last_message, "tool_calls") and last_message.tool_calls:
521
  return "tools"
522
-
523
  return "end"
524
 
525
  def _extract_answer_node(self, state: AgentState) -> dict:
526
- """Extract and clean the final answer."""
527
- # Try to find the answer in the last few messages
528
  messages = state["messages"]
529
 
530
- # Look for answer in last message first
531
- last_message = messages[-1]
532
- content = last_message.content if hasattr(last_message, "content") else str(last_message)
533
-
534
- # If last message is empty or doesn't contain clear answer, check previous messages
535
- if not content or len(content.strip()) < 3:
536
- # Look backwards through messages for the last non-empty content
537
- for msg in reversed(messages[:-1]):
538
- msg_content = msg.content if hasattr(msg, "content") else str(msg)
539
- if msg_content and len(msg_content.strip()) >= 3:
540
- content = msg_content
541
  break
542
 
543
- # Also check if we have tool results that might contain the answer
544
- # Look for tool results in recent messages
545
- for msg in reversed(messages[-5:]): # Check last 5 messages
546
- if hasattr(msg, "content") and msg.content:
547
- # Sometimes answers are in tool responses
548
- if "result" in msg.content.lower() or "answer" in msg.content.lower():
549
- # Extract potential answer from tool response
550
- lines = msg.content.split('\n')
551
- for line in lines:
552
- line_lower = line.lower()
553
- if any(word in line_lower for word in ["the answer is", "result is", "found:", "value:", "equals"]):
554
- # Try to extract just the answer part
555
- content = line
556
- break
557
-
558
  answer = self._clean_answer(content)
559
-
560
  return {"final_answer": answer}
561
 
562
- def _clean_answer(self, raw_answer: str) -> str:
563
- """Clean and format the final answer for exact matching."""
564
- if not raw_answer:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
565
  return ""
566
 
567
- answer = raw_answer.strip()
 
 
 
 
 
568
 
569
- # Remove common prefixes (case-insensitive, with variations)
570
  prefixes = [
571
- "the answer is:", "the answer is", "answer is:",
572
- "answer:", "answer", "answer:",
573
- "final answer:", "final answer", "FINAL ANSWER:", "FINAL ANSWER",
574
- "the final answer is:", "the final answer is",
575
- "result:", "result", "result is:",
576
- "solution:", "solution", "solution is:",
577
- "the solution is:", "the solution is",
578
- "it is", "it's", "that is", "that's",
579
- "the value is:", "the value is", "value is:",
580
- "the result is:", "the result is",
581
- "found:", "found", "equals:", "equals", "is:",
582
- "according to the", "based on the", "from the",
583
  ]
 
 
584
 
585
- answer_lower = answer.lower()
586
- for prefix in prefixes:
587
- if answer_lower.startswith(prefix):
588
- answer = answer[len(prefix):].strip()
589
- # Remove any leading colon, dash, or space
590
- answer = answer.lstrip(':').lstrip('-').lstrip().strip()
591
- answer_lower = answer.lower()
592
-
593
- # Remove explanations after the answer (look for common patterns)
594
- # Split by common explanation starters
595
- explanation_markers = [" because", " since", " as", " due to", " which", " that", " - ", " (", " [", "\n\n"]
596
- for marker in explanation_markers:
597
- if marker in answer:
598
- # For some markers, split and take first part
599
- if marker in [" - ", "\n\n"]:
600
- answer = answer.split(marker)[0].strip()
601
- # For parentheses/brackets, be more careful
602
- elif marker in [" (", " ["]:
603
- # Only remove if it looks like an explanation
604
- idx = answer.find(marker)
605
- if idx > 0 and idx < len(answer) - 3: # Not at start/end
606
- # Check if it's likely an explanation (has words, not just numbers/dates)
607
- rest = answer[idx+1:]
608
- if any(char.isalpha() for char in rest[:20]): # Has letters in first 20 chars
609
- answer = answer[:idx].strip()
610
- else:
611
- # For words like "because", split and take first part
612
- parts = answer.split(marker, 1)
613
- if len(parts) > 1:
614
- answer = parts[0].strip()
615
-
616
- # Remove quotes if they wrap the entire answer
617
  if (answer.startswith('"') and answer.endswith('"')) or \
618
  (answer.startswith("'") and answer.endswith("'")):
619
- answer = answer[1:-1].strip()
620
-
621
- # Remove trailing periods, commas, or semicolons for single-word/number answers
622
- # But preserve trailing punctuation for dates or other formatted answers
623
- if answer and ' ' not in answer:
624
- # Don't remove trailing punctuation if it's part of a date format or URL
625
- if not (answer.count('-') == 2 or answer.count('/') == 2 or '://' in answer):
626
- answer = answer.rstrip('.,;:')
627
-
628
- # Remove leading/trailing whitespace and normalize internal whitespace
629
- # But preserve formatting for lists (comma-separated)
630
- if ',' in answer and ' ' not in answer.replace(',', '').replace(' ', ''):
631
- # Comma-separated list without spaces - keep as is
632
- answer = answer.strip()
633
- else:
634
- answer = ' '.join(answer.split())
635
 
636
- # Remove markdown formatting if present
637
- if answer.startswith('**') and answer.endswith('**'):
638
- answer = answer[2:-2].strip()
639
- if answer.startswith('*') and answer.endswith('*') and not answer.startswith('**'):
640
- answer = answer[1:-1].strip()
641
 
642
- # Remove code block markers if present
643
- if answer.startswith('```') and answer.endswith('```'):
644
- lines = answer.split('\n')
645
- if len(lines) > 2:
646
- answer = '\n'.join(lines[1:-1]).strip()
647
-
648
- # Final cleanup: remove any remaining explanation patterns at the end
649
- answer = answer.split('\n')[0].strip() # Take first line only
650
- answer = answer.split('.')[0].strip() if answer.count('.') > 1 else answer # Take first sentence if multiple
651
 
652
  return answer.strip()
653
 
654
  def run(self, question: str, task_id: str = "", file_path: str = None) -> str:
655
- """
656
- Run the agent on a question.
657
-
658
- Args:
659
- question: The GAIA question to answer
660
- task_id: Optional task identifier
661
- file_path: Optional path to associated file
662
-
663
- Returns:
664
- The agent's final answer
665
- """
666
- # Prepare the user message with file priority
667
  user_content = question
 
 
 
668
  if file_path and os.path.exists(file_path):
669
- # Strongly emphasize reading the file first with detailed instructions
670
- file_extension = os.path.splitext(file_path)[1].lower()
671
- file_instructions = ""
672
 
673
- if file_extension in ['.xlsx', '.xls', '.csv']:
674
- file_instructions = "This is a spreadsheet file. Read it completely and examine ALL sheets (if Excel) and ALL columns. The answer is likely a number, date, name, or value extracted from this data. After reading, you may need to perform calculations or analysis using python_executor."
675
- elif file_extension == '.pdf':
676
- file_instructions = "This is a PDF file. Read ALL pages carefully. The answer may be anywhere in the document - in tables, text, or images. Search for keywords from the question."
677
- else:
678
- file_instructions = "This is a text-based file. Read it completely and carefully. The answer is likely somewhere in this file - look for exact values, names, dates, or information that matches the question."
679
 
680
- user_content = f"""CRITICAL: A file is available at {file_path}
681
-
682
- {file_instructions}
683
-
684
- **You MUST read this file FIRST before doing anything else.** Do not search the web or use other tools until you have read the file completely. The answer is very likely in this file.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
685
 
686
  Question: {question}"""
687
 
688
- # Initialize state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
689
  initial_state: AgentState = {
690
- "messages": [
691
- SystemMessage(content=SYSTEM_PROMPT),
692
- HumanMessage(content=user_content)
693
- ],
694
  "task_id": task_id,
695
  "file_path": file_path,
696
- "file_content": None,
697
  "iteration_count": 0,
698
  "final_answer": None
699
  }
700
 
701
- # Execute the graph
702
  try:
703
- final_state = self.graph.invoke(
704
- initial_state,
705
- {"recursion_limit": self.max_iterations * 2 + 5}
706
- )
707
- answer = final_state.get("final_answer", "Unable to determine answer")
708
 
709
- # Final validation - ensure answer is not empty or error message
710
- if not answer or answer.startswith("Agent error:") or answer.startswith("Unable to determine"):
711
- # Try to extract from last message if available
712
- if final_state.get("messages"):
713
- last_msg = final_state["messages"][-1]
714
- if hasattr(last_msg, "content") and last_msg.content:
715
- answer = self._clean_answer(last_msg.content)
 
716
 
717
  return answer if answer else "Unable to determine answer"
718
  except Exception as e:
719
- # Log error for debugging but return a clean error message
720
- import logging
721
- logging.error(f"Agent execution error: {str(e)}")
722
  return f"Agent error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
723
 
724
 
725
- # ============ UTILITY FUNCTIONS ============
726
- def create_agent(api_key: str = None, model: str = "gpt-4o") -> GAIAAgent:
727
- """Factory function to create a configured agent."""
728
- return GAIAAgent(
729
- model_name=model,
730
- api_key=api_key,
731
- temperature=0,
732
- max_iterations=15
733
- )
 
1
  """
2
+ Enhanced GAIA Agent with LangGraph - Fixed Version
3
+ Supports Ollama (local) and OpenAI (production)
4
  """
5
 
6
  import os
7
  import re
8
  import json
9
  import requests
10
+ import time
11
+ import logging
12
+ import base64
13
+ from typing import TypedDict, Annotated, Sequence, Literal
14
  import operator
15
+ from dotenv import load_dotenv
16
+
17
+ load_dotenv()
18
 
19
  from langgraph.graph import StateGraph, END
20
  from langgraph.prebuilt import ToolNode
21
  from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, BaseMessage
22
  from langchain_core.tools import tool
 
23
  from langchain_community.tools import DuckDuckGoSearchResults
24
  from langchain_experimental.utilities import PythonREPL
25
  import pandas as pd
26
 
27
+ logging.basicConfig(level=logging.INFO)
28
+ logger = logging.getLogger(__name__)
29
+
30
+ # ============ CONFIGURATION ============
31
+ OLLAMA_MODEL = "qwen2.5:32b" # Vision-capable model for image support
32
+ OLLAMA_BASE_URL = "http://localhost:11434"
33
+ OPENAI_MODEL = "gpt-4o"
34
+
35
+ # Vision-capable Ollama models
36
+ VISION_MODEL_KEYWORDS = ["vision", "vl", "llava", "bakllava", "gemma3", "qwen2.5-vl", "llama3.2-vision"]
37
+
38
+
39
+ def _is_vision_model(model_name: str) -> bool:
40
+ """Check if the model name suggests vision capability."""
41
+ if not model_name:
42
+ return False
43
+ model_lower = model_name.lower()
44
+ return any(keyword in model_lower for keyword in VISION_MODEL_KEYWORDS)
45
+
46
+
47
+ def is_ollama_available() -> bool:
48
+ """Check if Ollama is running locally."""
49
+ try:
50
+ response = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=2)
51
+ return response.status_code == 200
52
+ except:
53
+ return False
54
+
55
+
56
+ def is_production() -> bool:
57
+ """Check if running on HuggingFace Spaces."""
58
+ return bool(os.environ.get("SPACE_ID"))
59
+
60
 
61
  # ============ STATE DEFINITION ============
62
  class AgentState(TypedDict):
 
63
  messages: Annotated[Sequence[BaseMessage], operator.add]
64
  task_id: str
65
  file_path: str | None
 
66
  iteration_count: int
67
  final_answer: str | None
68
 
 
71
  @tool
72
  def web_search(query: str) -> str:
73
  """
74
+ Search the web for current information using DuckDuckGo.
75
+ Use for recent events, facts, statistics, or information you're uncertain about.
 
76
 
77
  Args:
78
+ query: Search query string
 
 
 
79
  """
80
+ for name in ["ddgs.ddgs", "primp"]:
81
+ logging.getLogger(name).setLevel(logging.ERROR)
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
  try:
84
+ search = DuckDuckGoSearchResults(max_results=8, output_format="list")
85
  results = search.run(query)
86
 
 
 
 
 
 
 
87
  if isinstance(results, list):
88
  formatted = []
89
  for r in results:
90
  if isinstance(r, dict):
91
+ formatted.append(
92
+ f"Title: {r.get('title', 'N/A')}\n"
93
+ f"Snippet: {r.get('snippet', 'N/A')}\n"
94
+ f"Link: {r.get('link', 'N/A')}"
95
+ )
96
+ return "\n\n---\n\n".join(formatted) if formatted else "No results found."
97
+ return str(results) if results else "No results found."
98
  except Exception as e:
99
+ return f"Search failed: {e}"
 
 
 
 
 
100
 
101
 
102
  @tool
103
  def python_executor(code: str) -> str:
104
  """
105
+ Execute Python code for calculations, data analysis, or computational tasks.
106
+ Available libraries: math, statistics, datetime, json, re, collections, pandas, numpy.
107
+ Use print() to see output.
108
 
109
  Args:
110
+ code: Python code to execute
 
 
 
111
  """
112
  try:
113
  repl = PythonREPL()
 
114
  augmented_code = """
115
  import math
116
  import statistics
 
118
  import json
119
  import re
120
  from collections import Counter, defaultdict
121
+ import pandas as pd
122
+ import numpy as np
123
+ from fractions import Fraction
124
+ from decimal import Decimal
125
  """ + code
126
  result = repl.run(augmented_code)
127
+ output = result.strip() if result else "Code executed with no output. Use print()."
128
+ if len(output) > 5000:
129
+ output = output[:5000] + "\n... (truncated)"
130
+ return output
131
  except Exception as e:
132
+ return f"Execution error: {e}"
133
 
134
 
135
  @tool
136
  def read_file(file_path: str) -> str:
137
  """
138
+ Read content from files. Supports: PDF, TXT, CSV, JSON, XLSX, XLS, PY, MP3, WAV, images.
139
+ ALWAYS use this FIRST when a file is provided.
140
 
141
  Args:
142
+ file_path: Path to the file
 
 
 
143
  """
144
  try:
145
  if not os.path.exists(file_path):
 
147
 
148
  file_lower = file_path.lower()
149
 
150
+ # Audio files
151
+ if file_lower.endswith(('.mp3', '.wav', '.m4a', '.ogg', '.flac', '.webm')):
152
+ return _transcribe_audio(file_path)
153
+
154
+ # Image files - return path for vision model
155
+ if file_lower.endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp')):
156
+ return f"IMAGE_FILE:{file_path}"
157
+
158
+ # PDF files
159
  if file_lower.endswith('.pdf'):
160
+ try:
161
+ from langchain_community.document_loaders import PyPDFLoader
162
+ loader = PyPDFLoader(file_path)
163
+ pages = loader.load()
164
+ content = "\n\n--- Page Break ---\n\n".join([p.page_content for p in pages])
165
+ return f"PDF Content ({len(pages)} pages):\n{content}"
166
+ except Exception as e:
167
+ try:
168
+ import pdfplumber
169
+ with pdfplumber.open(file_path) as pdf:
170
+ text = []
171
+ for i, page in enumerate(pdf.pages):
172
+ page_text = page.extract_text() or ""
173
+ tables = page.extract_tables()
174
+ table_text = ""
175
+ for table in tables:
176
+ if table:
177
+ table_text += "\n[TABLE]\n"
178
+ for row in table:
179
+ table_text += " | ".join(str(c) if c else "" for c in row) + "\n"
180
+ text.append(f"Page {i+1}:\n{page_text}\n{table_text}")
181
+ return f"PDF Content:\n" + "\n\n".join(text)
182
+ except:
183
+ return f"Error reading PDF: {e}"
184
+
185
+ # Excel files
186
+ if file_lower.endswith(('.xlsx', '.xls')):
187
+ df_dict = pd.read_excel(file_path, sheet_name=None)
188
  result = []
189
+ for sheet_name, df in df_dict.items():
190
+ result.append(f"=== Sheet: {sheet_name} ({len(df)} rows) ===")
191
+ result.append(f"Columns: {list(df.columns)}")
192
+ result.append(df.to_string(max_rows=200))
193
  return "\n\n".join(result)
194
 
195
+ # CSV files
196
+ if file_lower.endswith('.csv'):
197
  df = pd.read_csv(file_path)
198
+ return f"CSV ({len(df)} rows):\nColumns: {list(df.columns)}\n{df.to_string(max_rows=200)}"
199
 
200
+ # JSON files
201
+ if file_lower.endswith('.json'):
202
  with open(file_path, 'r', encoding='utf-8') as f:
203
  data = json.load(f)
204
+ return f"JSON:\n{json.dumps(data, indent=2)}"
205
+
206
+ # Default: text
207
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
208
+ content = f.read()
209
+ if len(content) > 15000:
210
+ content = content[:15000] + "\n... (truncated)"
211
+ return f"File Content:\n{content}"
212
 
213
  except Exception as e:
214
+ return f"Error reading file: {e}"
215
+
216
+
217
+ def _transcribe_audio(file_path: str) -> str:
218
+ """Transcribe audio using local Whisper (faster-whisper)."""
219
+ try:
220
+ from faster_whisper import WhisperModel
221
+ # Use base model for speed, can be upgraded to "small", "medium", "large" for better accuracy
222
+ model = WhisperModel("base", device="cpu", compute_type="int8")
223
+ segments, info = model.transcribe(file_path, beam_size=5)
224
+ transcript = " ".join([segment.text for segment in segments])
225
+ return f"Audio Transcription:\n{transcript}"
226
+ except ImportError:
227
+ return "Error: faster-whisper not installed. Install with: pip install faster-whisper"
228
+ except Exception as e:
229
+ logger.error(f"Audio transcription error: {e}")
230
+ return f"Audio transcription failed: {e}"
231
 
232
 
233
  @tool
234
  def calculator(expression: str) -> str:
235
  """
236
+ Evaluate mathematical expressions safely.
 
237
 
238
  Args:
239
+ expression: Math expression like "sqrt(16) + log(100, 10)"
 
 
 
240
  """
241
  try:
242
  import math
 
 
243
  safe_dict = {
244
  'abs': abs, 'round': round, 'min': min, 'max': max,
245
+ 'sum': sum, 'pow': pow, 'int': int, 'float': float,
246
  'sqrt': math.sqrt, 'log': math.log, 'log10': math.log10,
247
  'log2': math.log2, 'exp': math.exp,
248
  'sin': math.sin, 'cos': math.cos, 'tan': math.tan,
 
 
249
  'ceil': math.ceil, 'floor': math.floor,
250
+ 'pi': math.pi, 'e': math.e, 'factorial': math.factorial,
 
 
251
  }
 
252
  result = eval(expression, {"__builtins__": {}}, safe_dict)
253
+ if isinstance(result, float) and result.is_integer():
254
+ return str(int(result))
255
+ return f"{result:.10g}" if isinstance(result, float) else str(result)
 
 
 
 
 
256
  except Exception as e:
257
+ return f"Calculation error: {e}"
258
 
259
 
260
  @tool
261
  def wikipedia_search(query: str) -> str:
262
  """
263
+ Search Wikipedia for factual information.
264
+ Best for historical facts, biographies, scientific concepts.
265
 
266
  Args:
267
+ query: Topic to search
 
 
 
268
  """
269
  try:
270
  import urllib.parse
 
 
271
  search_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={urllib.parse.quote(query)}&format=json&srlimit=3"
272
+ response = requests.get(search_url, timeout=15)
273
  data = response.json()
274
 
275
+ if 'query' not in data or not data['query'].get('search'):
276
  return f"No Wikipedia articles found for '{query}'"
277
 
278
+ results = []
279
+ for item in data['query']['search'][:2]:
280
+ title = item['title']
281
+ content_url = f"https://en.wikipedia.org/w/api.php?action=query&prop=extracts&exintro=false&explaintext=true&titles={urllib.parse.quote(title)}&format=json&exchars=4000"
282
+ content_response = requests.get(content_url, timeout=15)
283
+ pages = content_response.json().get('query', {}).get('pages', {})
284
+ for page_id, page_data in pages.items():
285
+ if page_id != '-1':
286
+ results.append(f"## {title}\n{page_data.get('extract', 'No content')}")
287
+
288
+ return "\n\n---\n\n".join(results) if results else "No content found."
 
 
 
 
 
289
  except Exception as e:
290
+ return f"Wikipedia search failed: {e}"
291
 
292
 
293
  @tool
294
+ def fetch_webpage(url: str) -> str:
295
  """
296
+ Fetch and extract text from a webpage URL.
 
297
 
298
  Args:
299
+ url: The webpage URL
 
 
 
 
300
  """
301
+ try:
302
+ headers = {'User-Agent': 'Mozilla/5.0 (compatible; GaiaBot/1.0)'}
303
+ response = requests.get(url, headers=headers, timeout=15)
304
+ response.raise_for_status()
305
+
306
+ try:
307
+ from bs4 import BeautifulSoup
308
+ soup = BeautifulSoup(response.text, 'html.parser')
309
+ for el in soup(['script', 'style', 'nav', 'footer', 'header']):
310
+ el.decompose()
311
+ text = soup.get_text(separator='\n', strip=True)
312
+ lines = [l.strip() for l in text.splitlines() if l.strip()]
313
+ text = '\n'.join(lines)
314
+ if len(text) > 10000:
315
+ text = text[:10000] + "\n... (truncated)"
316
+ return f"Webpage ({url}):\n{text}"
317
+ except ImportError:
318
+ return f"Raw HTML:\n{response.text[:10000]}"
319
+ except Exception as e:
320
+ return f"Failed to fetch: {e}"
321
 
322
 
323
+ TOOLS = [web_search, python_executor, read_file, calculator, wikipedia_search, fetch_webpage]
 
324
 
325
 
326
  # ============ SYSTEM PROMPT ============
327
+ SYSTEM_PROMPT = """You are an expert AI solving GAIA benchmark questions. Your goal is MAXIMUM ACCURACY.
328
+
329
+ ## CRITICAL: Answer Format (EXACT STRING MATCHING)
330
+ Your final answer must be ONLY the answer value - nothing else.
331
+
332
+ **Rules:**
333
+ - Numbers: "42" (not "The answer is 42")
334
+ - Names: Exact spelling "John Smith"
335
+ - Lists: Comma-separated, NO spaces: "apple,banana,cherry"
336
+ - Dates: Requested format or YYYY-MM-DD
337
+ - Yes/No: "Yes" or "No"
338
+ - NEVER use prefixes like "Answer:", "FINAL ANSWER:", etc.
339
+ - NEVER explain - just the answer
340
+
341
+ ## Strategy
342
+
343
+ 1. **If file provided**: Use read_file FIRST - answer is usually there
344
+ 2. **For calculations**: Use python_executor or calculator
345
+ 3. **For facts**: wikipedia_search for historical, web_search for current
346
+ 4. **For URLs in question**: Use fetch_webpage
347
+ 5. **Verify**: Check spelling, formatting, precision
348
+
349
+ ## When Ready
350
+ State ONLY the answer value. Nothing else."""
351
+
352
+
353
+ # ============ AGENT CLASS ============
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  class GAIAAgent:
355
+ """LangGraph agent for GAIA benchmark."""
356
 
357
  def __init__(
358
  self,
359
+ model_name: str = None,
 
360
  temperature: float = 0,
361
+ max_iterations: int = 25,
362
  ):
 
 
 
 
 
 
 
 
 
 
363
  self.max_iterations = max_iterations
364
+ self.use_openai = is_production() or not is_ollama_available()
365
+
366
+ if self.use_openai:
367
+ from langchain_openai import ChatOpenAI
368
+ api_key = os.environ.get("OPENAI_API_KEY")
369
+ if not api_key:
370
+ raise ValueError("OPENAI_API_KEY not found")
371
+ self.model_name = model_name or OPENAI_MODEL
372
+ self.llm = ChatOpenAI(model=self.model_name, temperature=temperature, api_key=api_key)
373
+ self.supports_vision = True # OpenAI models support vision
374
+ logger.info(f"Using OpenAI: {self.model_name}")
375
+ else:
376
+ from langchain_ollama import ChatOllama
377
+ self.model_name = model_name or OLLAMA_MODEL
378
+ self.llm = ChatOllama(model=self.model_name, base_url=OLLAMA_BASE_URL, temperature=temperature)
379
+ self.supports_vision = _is_vision_model(self.model_name)
380
+ logger.info(f"Using Ollama: {self.model_name} (vision: {self.supports_vision})")
381
 
 
 
 
 
 
382
  self.llm_with_tools = self.llm.bind_tools(TOOLS)
383
  self.graph = self._build_graph()
384
 
385
  def _build_graph(self) -> StateGraph:
 
386
  workflow = StateGraph(AgentState)
 
 
387
  workflow.add_node("agent", self._agent_node)
388
  workflow.add_node("tools", ToolNode(TOOLS))
389
  workflow.add_node("extract_answer", self._extract_answer_node)
 
 
390
  workflow.set_entry_point("agent")
391
+ workflow.add_conditional_edges("agent", self._route, {"tools": "tools", "end": "extract_answer"})
 
 
 
 
 
 
 
 
 
392
  workflow.add_edge("tools", "agent")
393
  workflow.add_edge("extract_answer", END)
 
394
  return workflow.compile()
395
 
396
  def _agent_node(self, state: AgentState) -> dict:
397
+ messages = list(state["messages"])
 
398
  iteration = state.get("iteration_count", 0)
399
+ file_path = state.get("file_path")
400
+
401
+ # If using Ollama vision and image exists, ensure image is included in the last user message
402
+ if not self.use_openai and self.supports_vision and file_path and os.path.exists(file_path):
403
+ ext = os.path.splitext(file_path)[1].lower()
404
+ is_image = ext in ['.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp']
405
+
406
+ if is_image:
407
+ # Check if the last message is a HumanMessage without image content
408
+ # If so, we need to add the image to it
409
+ last_msg = messages[-1] if messages else None
410
+ if isinstance(last_msg, HumanMessage):
411
+ # Check if message content is a string (text only) or list (multimodal)
412
+ if isinstance(last_msg.content, str):
413
+ # Convert text-only message to multimodal with image
414
+ try:
415
+ with open(file_path, "rb") as f:
416
+ image_data = base64.b64encode(f.read()).decode('utf-8')
417
+
418
+ media_type = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg",
419
+ "gif": "image/gif", "webp": "image/webp", "bmp": "image/bmp"}.get(ext.lstrip('.'), "image/png")
420
+
421
+ # Replace the last message with multimodal version
422
+ messages[-1] = HumanMessage(
423
+ content=[
424
+ {"type": "text", "text": last_msg.content},
425
+ {"type": "image_url", "image_url": {"url": f"data:{media_type};base64,{image_data}"}}
426
+ ]
427
+ )
428
+ except Exception as e:
429
+ logger.warning(f"Failed to add image to message: {e}")
430
 
 
431
  if iteration >= self.max_iterations - 2:
432
+ messages.append(SystemMessage(content="⚠️ FINAL: Provide answer NOW. Just the value."))
 
433
  elif iteration >= self.max_iterations - 5:
434
+ messages.append(SystemMessage(content="⚠️ Conclude soon. Provide the answer."))
435
+
436
+ if self.use_openai:
437
+ time.sleep(0.5)
 
438
 
439
  try:
440
  response = self.llm_with_tools.invoke(messages)
441
  except Exception as e:
442
+ error_str = str(e)
443
+ logger.error(f"LLM error: {error_str}")
444
+
445
+ # Check if error contains raw Python code (common with Ollama)
446
+ if "error parsing tool call" in error_str.lower() and "raw=" in error_str:
447
+ # Extract the raw code from the error message
448
+ try:
449
+ # Find the raw code between raw=' and '
450
+ match = re.search(r"raw='(.*?)'", error_str, re.DOTALL)
451
+ if match:
452
+ raw_code = match.group(1)
453
+ logger.info(f"Detected raw Python code, wrapping in python_executor tool call")
454
+
455
+ # Create a manual tool call for python_executor (dict format for langchain-core 0.3.x)
456
+ from langchain_core.messages import ToolMessage
457
+
458
+ tool_call_id = f"call_{int(time.time() * 1000)}"
459
+
460
+ # Execute the code directly via the tool
461
+ result = python_executor.invoke({"code": raw_code})
462
+
463
+ # Create a proper response with tool call (dict format)
464
+ tool_call_dict = {
465
+ "name": "python_executor",
466
+ "args": {"code": raw_code},
467
+ "id": tool_call_id
468
+ }
469
+ ai_msg = AIMessage(
470
+ content="",
471
+ tool_calls=[tool_call_dict]
472
+ )
473
+ tool_msg = ToolMessage(
474
+ content=result,
475
+ tool_call_id=tool_call_id
476
+ )
477
+ return {
478
+ "messages": [ai_msg, tool_msg],
479
+ "iteration_count": iteration + 1
480
+ }
481
+ except Exception as parse_error:
482
+ logger.error(f"Failed to extract code from error: {parse_error}")
483
+
484
+ return {"messages": [AIMessage(content="Error occurred.")], "iteration_count": iteration + 1}
485
 
486
+ return {"messages": [response], "iteration_count": iteration + 1}
487
+
488
+ def _route(self, state: AgentState) -> Literal["tools", "end"]:
489
+ last = state["messages"][-1]
490
+ if state.get("iteration_count", 0) >= self.max_iterations:
491
  return "end"
492
+ if hasattr(last, "tool_calls") and last.tool_calls:
 
 
493
  return "tools"
 
494
  return "end"
495
 
496
  def _extract_answer_node(self, state: AgentState) -> dict:
 
 
497
  messages = state["messages"]
498
 
499
+ # Find last substantive AI response
500
+ content = ""
501
+ for msg in reversed(messages):
502
+ if isinstance(msg, AIMessage) and msg.content:
503
+ c = msg.content.strip()
504
+ # Skip if it's clearly garbage/prompt repetition
505
+ if self._is_valid_answer_candidate(c):
506
+ content = c
 
 
 
507
  break
508
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
509
  answer = self._clean_answer(content)
 
510
  return {"final_answer": answer}
511
 
512
+ def _is_valid_answer_candidate(self, text: str) -> bool:
513
+ """Check if text looks like a valid answer, not garbage."""
514
+ if not text or len(text) < 1:
515
+ return False
516
+
517
+ text_lower = text.lower()
518
+
519
+ # Reject if it contains prompt text patterns
520
+ bad_patterns = [
521
+ "numbers: just", "format rules", "must follow",
522
+ "critical: answer format", "when ready", "your final answer",
523
+ "the benchmark uses", "exact string matching",
524
+ "no prefixes", "no explanations"
525
+ ]
526
+ if any(p in text_lower for p in bad_patterns):
527
+ return False
528
+
529
+ # Reject if it looks like the question was repeated
530
+ if "provide the correct next move" in text_lower:
531
+ return False
532
+ if text.startswith("Review the"):
533
+ return False
534
+
535
+ # Reject tool call syntax
536
+ if text.startswith("web_search(") or text.startswith("read_file("):
537
+ return False
538
+
539
+ return True
540
+
541
+ def _clean_answer(self, raw: str) -> str:
542
+ if not raw:
543
  return ""
544
 
545
+ answer = raw.strip()
546
+
547
+ # Remove markdown
548
+ answer = re.sub(r'\*\*(.+?)\*\*', r'\1', answer)
549
+ answer = re.sub(r'\*(.+?)\*', r'\1', answer)
550
+ answer = re.sub(r'`(.+?)`', r'\1', answer)
551
 
552
+ # Remove prefixes
553
  prefixes = [
554
+ r"^(?:the\s+)?(?:final\s+)?answer\s*(?:is)?:?\s*",
555
+ r"^result\s*:?\s*",
556
+ r"^therefore\s*,?\s*",
557
+ r"^thus\s*,?\s*",
558
+ r"^so\s*,?\s*",
 
 
 
 
 
 
 
559
  ]
560
+ for p in prefixes:
561
+ answer = re.sub(p, "", answer, flags=re.IGNORECASE)
562
 
563
+ # Remove quotes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
564
  if (answer.startswith('"') and answer.endswith('"')) or \
565
  (answer.startswith("'") and answer.endswith("'")):
566
+ answer = answer[1:-1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
567
 
568
+ # Take first line
569
+ answer = answer.split('\n')[0].strip()
 
 
 
570
 
571
+ # Remove trailing period for short answers
572
+ if answer.endswith('.') and len(answer.split()) <= 3:
573
+ answer = answer[:-1]
 
 
 
 
 
 
574
 
575
  return answer.strip()
576
 
577
  def run(self, question: str, task_id: str = "", file_path: str = None) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
578
  user_content = question
579
+ audio_transcript = None
580
+
581
+ # Handle files - dynamic image and audio detection
582
  if file_path and os.path.exists(file_path):
583
+ ext = os.path.splitext(file_path)[1].lower()
 
 
584
 
585
+ # Check for image files
586
+ is_image = ext in ['.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp']
587
+ is_audio = ext in ['.mp3', '.wav', '.m4a', '.ogg', '.flac', '.webm']
 
 
 
588
 
589
+ # Handle images with OpenAI vision
590
+ if is_image and self.use_openai:
591
+ return self._run_with_vision(question, task_id, file_path)
592
+
593
+ # Handle images with Ollama vision (if model supports it)
594
+ if is_image and not self.use_openai and self.supports_vision:
595
+ return self._run_with_ollama_vision(question, task_id, file_path)
596
+
597
+ # Handle audio files - transcribe first
598
+ if is_audio:
599
+ audio_transcript = _transcribe_audio(file_path)
600
+ # If transcription failed, continue with error message
601
+ if audio_transcript.startswith("Error:"):
602
+ logger.warning(f"Audio transcription failed: {audio_transcript}")
603
+ else:
604
+ # Combine question with audio transcript
605
+ user_content = f"{question}\n\n{audio_transcript}"
606
+
607
+ # Handle image + audio combination
608
+ if is_image and is_audio:
609
+ # This case is handled above - audio transcribed, image will be passed in messages
610
+ pass
611
+ elif is_image and not self.supports_vision:
612
+ # Image detected but model doesn't support vision
613
+ logger.warning(f"Image file detected but model {self.model_name} doesn't support vision")
614
+ return f"Error: Image file provided but model {self.model_name} doesn't support vision. Please use a vision-capable model like llama3.2-vision or qwen2.5-vl."
615
+
616
+ # Handle other file types
617
+ if not is_image and not is_audio:
618
+ file_hints = {
619
+ '.xlsx': "EXCEL file - use read_file to examine ALL sheets",
620
+ '.xls': "EXCEL file - use read_file to examine ALL sheets",
621
+ '.csv': "CSV file - use read_file, then python_executor for analysis",
622
+ '.pdf': "PDF file - use read_file to extract ALL text",
623
+ '.py': "Python file - use read_file to see the code",
624
+ }
625
+ hint = file_hints.get(ext, "Use read_file to examine contents")
626
+
627
+ user_content = f"""⚠️ FILE PROVIDED: {file_path}
628
+
629
+ {hint}
630
+
631
+ **Use read_file("{file_path}") FIRST.**
632
 
633
  Question: {question}"""
634
 
635
+ # Check for URLs in question
636
+ url_match = re.search(r'https?://[^\s]+', question)
637
+ if url_match:
638
+ user_content += f"\n\nπŸ’‘ URL detected: {url_match.group()} - Consider using fetch_webpage if needed."
639
+
640
+ # Build initial message - include image if using Ollama vision
641
+ initial_messages = [SystemMessage(content=SYSTEM_PROMPT)]
642
+
643
+ # If using Ollama vision and image exists, include image in message
644
+ if file_path and os.path.exists(file_path):
645
+ ext = os.path.splitext(file_path)[1].lower()
646
+ is_image = ext in ['.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp']
647
+
648
+ if is_image and not self.use_openai and self.supports_vision:
649
+ # Include image in HumanMessage for Ollama vision
650
+ try:
651
+ with open(file_path, "rb") as f:
652
+ image_data = base64.b64encode(f.read()).decode('utf-8')
653
+
654
+ media_type = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg",
655
+ "gif": "image/gif", "webp": "image/webp", "bmp": "image/bmp"}.get(ext.lstrip('.'), "image/png")
656
+
657
+ user_msg = HumanMessage(
658
+ content=[
659
+ {"type": "text", "text": user_content},
660
+ {"type": "image_url", "image_url": {"url": f"data:{media_type};base64,{image_data}"}}
661
+ ]
662
+ )
663
+ except Exception as e:
664
+ logger.error(f"Failed to encode image: {e}")
665
+ user_msg = HumanMessage(content=user_content)
666
+ else:
667
+ user_msg = HumanMessage(content=user_content)
668
+ else:
669
+ user_msg = HumanMessage(content=user_content)
670
+
671
+ initial_messages.append(user_msg)
672
+
673
  initial_state: AgentState = {
674
+ "messages": initial_messages,
 
 
 
675
  "task_id": task_id,
676
  "file_path": file_path,
 
677
  "iteration_count": 0,
678
  "final_answer": None
679
  }
680
 
 
681
  try:
682
+ final_state = self.graph.invoke(initial_state, {"recursion_limit": self.max_iterations * 2 + 10})
683
+ answer = final_state.get("final_answer", "")
 
 
 
684
 
685
+ if not answer or not self._is_valid_answer_candidate(answer):
686
+ # Try harder to find an answer
687
+ for msg in reversed(final_state.get("messages", [])):
688
+ if isinstance(msg, AIMessage) and msg.content:
689
+ candidate = self._clean_answer(msg.content)
690
+ if candidate and self._is_valid_answer_candidate(candidate):
691
+ answer = candidate
692
+ break
693
 
694
  return answer if answer else "Unable to determine answer"
695
  except Exception as e:
696
+ logger.error(f"Agent error: {e}")
 
 
697
  return f"Agent error: {str(e)}"
698
+
699
+ def _run_with_vision(self, question: str, task_id: str, image_path: str) -> str:
700
+ """Handle image questions using GPT-4o vision."""
701
+ try:
702
+ from openai import OpenAI
703
+ client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
704
+
705
+ # Read and encode image
706
+ with open(image_path, "rb") as f:
707
+ image_data = base64.b64encode(f.read()).decode('utf-8')
708
+
709
+ ext = os.path.splitext(image_path)[1].lower()
710
+ media_type = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg",
711
+ "gif": "image/gif", "webp": "image/webp"}.get(ext.lstrip('.'), "image/png")
712
+
713
+ response = client.chat.completions.create(
714
+ model="gpt-4o",
715
+ messages=[
716
+ {"role": "system", "content": "You are solving GAIA benchmark questions. Provide ONLY the answer value, no explanations or prefixes."},
717
+ {"role": "user", "content": [
718
+ {"type": "text", "text": question},
719
+ {"type": "image_url", "image_url": {"url": f"data:{media_type};base64,{image_data}"}}
720
+ ]}
721
+ ],
722
+ max_tokens=500,
723
+ temperature=0
724
+ )
725
+
726
+ answer = response.choices[0].message.content.strip()
727
+ return self._clean_answer(answer)
728
+ except Exception as e:
729
+ logger.error(f"Vision error: {e}")
730
+ return f"Vision error: {str(e)}"
731
+
732
+ def _run_with_ollama_vision(self, question: str, task_id: str, image_path: str) -> str:
733
+ """Handle image questions using Ollama vision models."""
734
+ try:
735
+ # Read and encode image
736
+ with open(image_path, "rb") as f:
737
+ image_data = base64.b64encode(f.read()).decode('utf-8')
738
+
739
+ ext = os.path.splitext(image_path)[1].lower()
740
+ media_type = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg",
741
+ "gif": "image/gif", "webp": "image/webp", "bmp": "image/bmp"}.get(ext.lstrip('.'), "image/png")
742
+
743
+ # Create message with image
744
+ message = HumanMessage(
745
+ content=[
746
+ {"type": "text", "text": question},
747
+ {"type": "image_url", "image_url": {"url": f"data:{media_type};base64,{image_data}"}}
748
+ ]
749
+ )
750
+
751
+ # Invoke model with system prompt and image message
752
+ response = self.llm.invoke([SystemMessage(content=SYSTEM_PROMPT), message])
753
+ answer = response.content if hasattr(response, 'content') else str(response)
754
+ return self._clean_answer(answer)
755
+ except Exception as e:
756
+ logger.error(f"Ollama vision error: {e}")
757
+ return f"Vision error: {str(e)}"
758
 
759
 
760
+ def create_agent() -> GAIAAgent:
761
+ """Create a configured agent."""
762
+ return GAIAAgent(temperature=0, max_iterations=25)
 
 
 
 
 
 
app.py CHANGED
@@ -6,166 +6,144 @@ import tempfile
6
  import json
7
  import logging
8
  from typing import Optional
 
9
 
10
- # Import the optimized agent from the separate module
11
- from agent_enhanced import GAIAAgent
 
12
 
13
- # ============ CONFIGURATION ============
14
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
15
 
16
- # Set up logging
17
  logging.basicConfig(level=logging.INFO)
18
  logger = logging.getLogger(__name__)
19
 
20
 
21
- # ============ API INTERACTION ============
22
- def fetch_questions(api_url: str = DEFAULT_API_URL, max_retries: int = 3) -> list:
23
- """Fetch all questions from the GAIA API with retry logic."""
24
- for attempt in range(max_retries):
25
  try:
26
  response = requests.get(f"{api_url}/questions", timeout=30)
27
  response.raise_for_status()
28
- return response.json()
29
- except requests.exceptions.RequestException as e:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  logger.warning(f"Attempt {attempt + 1} failed: {e}")
31
- if attempt == max_retries - 1:
32
- raise
33
  return []
34
 
35
- def fetch_random_question(api_url: str = DEFAULT_API_URL, max_retries: int = 3) -> dict:
36
- """Fetch a random question from the GAIA API with retry logic."""
37
- for attempt in range(max_retries):
 
38
  try:
39
  response = requests.get(f"{api_url}/random-question", timeout=30)
40
  response.raise_for_status()
41
  return response.json()
42
- except requests.exceptions.RequestException as e:
43
  logger.warning(f"Attempt {attempt + 1} failed: {e}")
44
- if attempt == max_retries - 1:
45
- raise
46
  return {}
47
 
48
- def fetch_file(task_id: str, api_url: str = DEFAULT_API_URL, max_retries: int = 3) -> Optional[str]:
49
- """Fetch a file associated with a task with retry logic."""
50
- for attempt in range(max_retries):
51
- try:
52
- response = requests.get(f"{api_url}/files/{task_id}", timeout=30)
53
- if response.status_code == 200:
54
- # Save to temp file
55
- content_disposition = response.headers.get('content-disposition', '')
56
- filename = f"task_{task_id}_file"
57
- if 'filename=' in content_disposition:
58
- filename = content_disposition.split('filename=')[1].strip('"')
59
-
60
- temp_dir = tempfile.mkdtemp()
61
- file_path = os.path.join(temp_dir, filename)
62
-
63
- with open(file_path, 'wb') as f:
64
- f.write(response.content)
65
-
66
- logger.info(f"Downloaded file: {file_path}")
67
- return file_path
68
- elif response.status_code == 404:
69
- logger.info(f"No file found for task {task_id}")
70
- return None
71
- except requests.exceptions.RequestException as e:
72
- logger.warning(f"File fetch attempt {attempt + 1} failed: {e}")
73
- if attempt == max_retries - 1:
74
- logger.error(f"Failed to fetch file for task {task_id}: {e}")
75
  return None
76
 
77
- def submit_answers(username: str, agent_code: str, answers: list, api_url: str = DEFAULT_API_URL, max_retries: int = 3) -> dict:
78
- """Submit answers to the GAIA API with retry logic."""
79
- payload = {
80
- "username": username,
81
- "agent_code": agent_code,
82
- "answers": answers
83
- }
84
-
85
- for attempt in range(max_retries):
86
- try:
87
- response = requests.post(f"{api_url}/submit", json=payload, timeout=60)
88
- response.raise_for_status()
89
- return response.json()
90
- except requests.exceptions.RequestException as e:
91
- logger.warning(f"Submission attempt {attempt + 1} failed: {e}")
92
- if attempt == max_retries - 1:
93
- raise
94
- return {}
95
 
 
 
 
 
 
 
96
 
97
- # ============ ANSWER VALIDATION ============
98
- def validate_answer_format(answer: str) -> tuple[bool, str]:
99
- """Validate answer format and return (is_valid, warning_message)."""
100
- if not answer or answer.strip() == "":
101
- return False, "Warning: Answer is empty"
102
-
103
- # Check for common prefixes that should be removed
104
- prefixes = ["FINAL ANSWER:", "The answer is:", "Answer:", "final answer:"]
105
- answer_lower = answer.lower()
106
- for prefix in prefixes:
107
- if answer_lower.startswith(prefix.lower()):
108
- return False, f"Warning: Answer contains prefix '{prefix}' which will be removed. Consider removing it."
109
-
110
- # Check for explanations (multiple sentences)
111
- if answer.count('.') > 1 or answer.count('because') > 0 or answer.count('since') > 0:
112
- return False, "Warning: Answer may contain explanations. Only the answer should be submitted."
113
-
114
- return True, ""
115
 
116
- # ============ GRADIO INTERFACE ============
117
- def run_agent_on_questions(openai_api_key: str, progress=gr.Progress()):
118
- """Run the agent on all GAIA questions."""
119
- if not openai_api_key:
120
- return "Please provide your OpenAI API key.", None
121
-
 
 
 
 
 
 
 
 
122
  try:
123
- # Initialize agent
124
  progress(0, desc="Initializing agent...")
125
- agent = GAIAAgent(api_key=openai_api_key)
126
 
127
- # Fetch questions
128
- progress(0.05, desc="Fetching questions from API...")
 
129
  questions = fetch_questions()
130
 
131
  if not questions:
132
- return "Error: Failed to fetch questions from API. Please try again.", None
133
 
134
- total_questions = len(questions)
135
  results = []
136
  answers_for_submission = []
137
 
138
  for i, q in enumerate(questions):
139
- progress((i + 1) / total_questions, desc=f"Processing question {i+1}/{total_questions}...")
140
 
141
  task_id = q.get("task_id", "")
142
  question_text = q.get("question", "")
143
 
144
- # Check if there's an associated file
145
  file_path = None
146
  if q.get("file_name"):
147
- progress((i + 0.5) / total_questions, desc=f"Downloading file for question {i+1}...")
148
  file_path = fetch_file(task_id)
149
 
150
- # Run agent
151
  try:
152
- progress((i + 0.7) / total_questions, desc=f"Agent reasoning for question {i+1}...")
153
  answer = agent.run(question_text, task_id, file_path)
154
-
155
- # Validate answer format
156
- is_valid, warning = validate_answer_format(answer)
157
- if not is_valid:
158
- logger.warning(f"Question {i+1} ({task_id}): {warning}")
159
-
160
  except Exception as e:
161
- logger.error(f"Error processing question {i+1} ({task_id}): {e}")
162
  answer = f"Error: {str(e)}"
163
 
164
  results.append({
165
  "Task ID": task_id,
166
- "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
167
  "Answer": answer,
168
- "Status": "βœ“" if answer and not answer.startswith("Error:") else "βœ—"
169
  })
170
 
171
  answers_for_submission.append({
@@ -173,31 +151,59 @@ def run_agent_on_questions(openai_api_key: str, progress=gr.Progress()):
173
  "submitted_answer": answer
174
  })
175
 
176
- # Cleanup temp file
177
  if file_path and os.path.exists(file_path):
178
  try:
179
  os.remove(file_path)
180
- # Also try to remove temp directory if empty
181
- temp_dir = os.path.dirname(file_path)
182
- if os.path.exists(temp_dir):
183
- try:
184
- os.rmdir(temp_dir)
185
- except:
186
- pass
187
- except Exception as e:
188
- logger.warning(f"Failed to cleanup file {file_path}: {e}")
189
 
190
  df = pd.DataFrame(results)
191
  progress(1.0, desc="Complete!")
192
  return df, answers_for_submission
193
 
194
  except Exception as e:
195
- logger.error(f"Error in run_agent_on_questions: {e}")
196
  return f"Error: {str(e)}", None
197
 
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  def submit_to_leaderboard(username: str, space_url: str, answers_json: str):
200
- """Submit answers to the leaderboard."""
201
  if not username or not space_url or not answers_json:
202
  return "Please fill in all fields and run the agent first."
203
 
@@ -205,207 +211,84 @@ def submit_to_leaderboard(username: str, space_url: str, answers_json: str):
205
  answers = json.loads(answers_json) if isinstance(answers_json, str) else answers_json
206
 
207
  if not isinstance(answers, list) or len(answers) == 0:
208
- return "Error: Answers must be a non-empty list. Please run the agent first."
209
 
210
- # Validate answer format before submission
211
- warnings = []
212
- for ans in answers:
213
- if "task_id" not in ans or "submitted_answer" not in ans:
214
- return "Error: Invalid answer format. Each answer must have 'task_id' and 'submitted_answer'."
215
- is_valid, warning = validate_answer_format(ans.get("submitted_answer", ""))
216
- if not is_valid:
217
- warnings.append(f"Task {ans.get('task_id')}: {warning}")
218
-
219
- # Ensure space URL ends with /tree/main
220
  if not space_url.endswith("/tree/main"):
221
  space_url = space_url.rstrip("/") + "/tree/main"
222
 
223
- # Submit to API
224
  result = submit_answers(username, space_url, answers)
225
-
226
- score = result.get("score", 0)
227
  print(result)
 
228
  correct = result.get("correct_count", 0)
229
  total = result.get("total_attempted", 0)
230
 
231
- warning_text = ""
232
- if warnings:
233
- warning_text = f"\n\n⚠️ **Warnings:**\n" + "\n".join(f"- {w}" for w in warnings[:5])
234
- if len(warnings) > 5:
235
- warning_text += f"\n- ... and {len(warnings) - 5} more warnings"
236
 
237
  return f"""
238
- ## Submission Successful! πŸŽ‰
239
 
240
  **Score:** {score:.1%}
241
  **Correct:** {correct}/{total}
242
 
243
- {'πŸ† **Congratulations!** Your agent scored above 30% and has earned the certificate!' if score > 0.3 else '❌ **Certificate Requirement:** Your agent must score above 30% to earn your certificate. Current score is below the threshold.'}
244
- {warning_text}
245
 
246
- Check the [leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard) to see your ranking!
247
  """
248
- except json.JSONDecodeError as e:
249
- return f"Error: Invalid JSON format. Please run the agent first.\nDetails: {str(e)}"
250
  except Exception as e:
251
  logger.error(f"Submission error: {e}")
252
- return f"Submission error: {str(e)}"
253
 
254
 
255
- def test_single_question(openai_api_key: str):
256
- """Test the agent on a single random question."""
257
- if not openai_api_key:
258
- return "Please provide your OpenAI API key.", "", "", ""
259
-
260
- try:
261
- agent = GAIAAgent(api_key=openai_api_key)
262
- question_data = fetch_random_question()
263
-
264
- if not question_data:
265
- return "Error: Failed to fetch question from API.", "", "", ""
266
-
267
- task_id = question_data.get("task_id", "")
268
- question_text = question_data.get("question", "")
269
-
270
- file_path = None
271
- if question_data.get("file_name"):
272
- file_path = fetch_file(task_id)
273
-
274
- answer = agent.run(question_text, task_id, file_path)
275
-
276
- # Validate answer format
277
- is_valid, warning = validate_answer_format(answer)
278
- validation_status = "βœ“ Valid format" if is_valid else f"⚠️ {warning}"
279
-
280
- # Cleanup temp file
281
- if file_path and os.path.exists(file_path):
282
- try:
283
- os.remove(file_path)
284
- temp_dir = os.path.dirname(file_path)
285
- if os.path.exists(temp_dir):
286
- try:
287
- os.rmdir(temp_dir)
288
- except:
289
- pass
290
- except Exception as e:
291
- logger.warning(f"Failed to cleanup file: {e}")
292
-
293
- return question_text, answer, task_id, validation_status
294
-
295
- except Exception as e:
296
- logger.error(f"Error in test_single_question: {e}")
297
- return f"Error: {str(e)}", "", "", ""
298
-
299
-
300
- # ============ BUILD GRADIO APP ============
301
- with gr.Blocks(title="GAIA Agent - LangGraph", theme=gr.themes.Soft()) as demo:
302
  gr.Markdown("""
303
- # πŸ€– GAIA Benchmark Agent (LangGraph)
304
 
305
- This agent uses **LangGraph** to solve GAIA benchmark questions. It has access to:
306
- - πŸ” Web Search (DuckDuckGo)
307
- - πŸ“š Wikipedia Search
308
- - 🐍 Python Code Execution
309
- - πŸ“„ File Reading (PDF, Text, Excel)
310
- - πŸ”’ Calculator
311
-
312
- ## Instructions
313
- 1. Enter your OpenAI API key
314
- 2. Test with a single question or run on all questions
315
- 3. Submit your answers to the leaderboard
316
  """)
317
 
318
- with gr.Row():
319
- openai_key = gr.Textbox(
320
- label="OpenAI API Key",
321
- type="password",
322
- placeholder="sk-...",
323
- info="Required for GPT-4o"
324
- )
325
 
326
  with gr.Tabs():
327
- with gr.TabItem("πŸ§ͺ Test Single Question"):
328
  test_btn = gr.Button("Fetch & Solve Random Question", variant="primary")
329
- test_question = gr.Textbox(label="Question", lines=5, interactive=False)
330
- test_answer = gr.Textbox(label="Agent's Answer", lines=3, interactive=False)
331
- test_task_id = gr.Textbox(label="Task ID", interactive=False)
332
- test_validation = gr.Textbox(label="Answer Validation", interactive=False)
333
 
334
- test_btn.click(
335
- test_single_question,
336
- inputs=[openai_key],
337
- outputs=[test_question, test_answer, test_task_id, test_validation]
338
- )
339
 
340
- with gr.TabItem("πŸš€ Run Full Benchmark"):
341
- run_btn = gr.Button("Run Agent on All Questions", variant="primary")
342
- results_table = gr.Dataframe(label="Results")
343
  answers_state = gr.State()
344
 
345
- run_btn.click(
346
- run_agent_on_questions,
347
- inputs=[openai_key],
348
- outputs=[results_table, answers_state]
349
- )
350
 
351
- with gr.TabItem("πŸ“€ Submit to Leaderboard"):
352
- gr.Markdown("""
353
- ### Submit Your Results
354
-
355
- After running the full benchmark, fill in your details and submit to the leaderboard.
356
-
357
- **Requirements:**
358
- - Your HuggingFace username
359
- - Your Space URL (must end with `/tree/main`)
360
- - Answers will be auto-filled after running the benchmark
361
- """)
362
 
363
  with gr.Row():
364
- username_input = gr.Textbox(
365
- label="HuggingFace Username",
366
- placeholder="your-username",
367
- info="Your HuggingFace account username"
368
- )
369
- space_url_input = gr.Textbox(
370
- label="Your Space URL",
371
- placeholder="https://huggingface.co/spaces/your-username/your-space",
372
- info="Full URL to your Space (will auto-append /tree/main if needed)"
373
- )
374
 
375
- answers_input = gr.Textbox(
376
- label="Answers JSON (auto-filled after running benchmark)",
377
- lines=10,
378
- placeholder="Run the full benchmark first...",
379
- info="This will be automatically populated after running the benchmark"
380
- )
381
-
382
- submit_btn = gr.Button("Submit to Leaderboard", variant="primary")
383
  submit_result = gr.Markdown()
384
 
385
- # Auto-fill answers when benchmark completes
386
- def format_answers(answers):
387
- if answers:
388
- return json.dumps(answers, indent=2)
389
- return ""
390
-
391
- answers_state.change(format_answers, inputs=[answers_state], outputs=[answers_input])
392
 
393
- submit_btn.click(
394
- submit_to_leaderboard,
395
- inputs=[username_input, space_url_input, answers_input],
396
- outputs=[submit_result]
397
- )
398
 
399
  gr.Markdown("""
400
  ---
401
- ### πŸ”— Links
402
- - [GAIA Benchmark](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
403
- - [Student Leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard)
404
- - [Course Unit 4](https://huggingface.co/learn/agents-course/en/unit4/hands-on)
405
- - [API Documentation](https://agents-course-unit4-scoring.hf.space/docs)
406
  """)
407
 
408
  if __name__ == "__main__":
409
- # For HuggingFace Spaces, use share=False
410
- # For local development, you can use share=True to get a public link
411
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
6
  import json
7
  import logging
8
  from typing import Optional
9
+ from dotenv import load_dotenv
10
 
11
+ load_dotenv()
12
+
13
+ from agent_enhanced import GAIAAgent, is_ollama_available, is_production
14
 
 
15
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
16
 
 
17
  logging.basicConfig(level=logging.INFO)
18
  logger = logging.getLogger(__name__)
19
 
20
 
21
+ def fetch_questions(api_url: str = DEFAULT_API_URL) -> list:
22
+ """Fetch all questions from the GAIA API."""
23
+ for attempt in range(3):
 
24
  try:
25
  response = requests.get(f"{api_url}/questions", timeout=30)
26
  response.raise_for_status()
27
+ questions = response.json()
28
+
29
+ # Print all questions with their task IDs
30
+ print("\n" + "="*80)
31
+ print("ALL QUESTIONS WITH TASK IDs:")
32
+ print("="*80)
33
+ for i, q in enumerate(questions, 1):
34
+ task_id = q.get("task_id", "N/A")
35
+ question_text = q.get("question", "N/A")
36
+ file_name = q.get("file_name", "")
37
+ print(f"\n[{i}] Task ID: {task_id}")
38
+ print(f" Question: {question_text[:200]}{'...' if len(question_text) > 200 else ''}")
39
+ if file_name:
40
+ print(f" File: {file_name}")
41
+ print("\n" + "="*80)
42
+ print(f"Total questions: {len(questions)}")
43
+ print("="*80 + "\n")
44
+
45
+ return questions
46
+ except Exception as e:
47
  logger.warning(f"Attempt {attempt + 1} failed: {e}")
 
 
48
  return []
49
 
50
+
51
+ def fetch_random_question(api_url: str = DEFAULT_API_URL) -> dict:
52
+ """Fetch a random question."""
53
+ for attempt in range(3):
54
  try:
55
  response = requests.get(f"{api_url}/random-question", timeout=30)
56
  response.raise_for_status()
57
  return response.json()
58
+ except Exception as e:
59
  logger.warning(f"Attempt {attempt + 1} failed: {e}")
 
 
60
  return {}
61
 
62
+
63
+ def fetch_file(task_id: str, api_url: str = DEFAULT_API_URL) -> Optional[str]:
64
+ """Fetch file for a task."""
65
+ try:
66
+ response = requests.get(f"{api_url}/files/{task_id}", timeout=30)
67
+ if response.status_code == 200:
68
+ content_disposition = response.headers.get('content-disposition', '')
69
+ filename = f"task_{task_id}_file"
70
+ if 'filename=' in content_disposition:
71
+ filename = content_disposition.split('filename=')[1].strip('"')
72
+
73
+ temp_dir = tempfile.mkdtemp()
74
+ file_path = os.path.join(temp_dir, filename)
75
+
76
+ with open(file_path, 'wb') as f:
77
+ f.write(response.content)
78
+
79
+ logger.info(f"Downloaded: {file_path}")
80
+ return file_path
81
+ elif response.status_code == 404:
82
+ return None
83
+ except Exception as e:
84
+ logger.error(f"File fetch failed: {e}")
 
 
 
 
85
  return None
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
+ def submit_answers(username: str, agent_code: str, answers: list, api_url: str = DEFAULT_API_URL) -> dict:
89
+ """Submit answers to API."""
90
+ payload = {"username": username, "agent_code": agent_code, "answers": answers}
91
+ response = requests.post(f"{api_url}/submit", json=payload, timeout=60)
92
+ response.raise_for_status()
93
+ return response.json()
94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
+ def get_env_status() -> str:
97
+ """Get environment status."""
98
+ if is_production():
99
+ return "☁️ **Production Mode** (HuggingFace Spaces) - Using OpenAI GPT-4o"
100
+ elif is_ollama_available():
101
+ return "🏠 **Local Mode** - Using Ollama"
102
+ elif os.environ.get("OPENAI_API_KEY"):
103
+ return "☁️ **Local + OpenAI** - Using OpenAI GPT-4o"
104
+ else:
105
+ return "⚠️ **No Backend** - Set OPENAI_API_KEY or start Ollama"
106
+
107
+
108
+ def run_agent_on_questions(progress=gr.Progress()):
109
+ """Run agent on all questions."""
110
  try:
111
+ env_info = get_env_status()
112
  progress(0, desc="Initializing agent...")
 
113
 
114
+ agent = GAIAAgent()
115
+
116
+ progress(0.05, desc="Fetching questions...")
117
  questions = fetch_questions()
118
 
119
  if not questions:
120
+ return "Error: Failed to fetch questions.", None
121
 
122
+ total = len(questions)
123
  results = []
124
  answers_for_submission = []
125
 
126
  for i, q in enumerate(questions):
127
+ progress((i + 1) / total, desc=f"Question {i+1}/{total}...")
128
 
129
  task_id = q.get("task_id", "")
130
  question_text = q.get("question", "")
131
 
 
132
  file_path = None
133
  if q.get("file_name"):
 
134
  file_path = fetch_file(task_id)
135
 
 
136
  try:
 
137
  answer = agent.run(question_text, task_id, file_path)
 
 
 
 
 
 
138
  except Exception as e:
139
+ logger.error(f"Error on question {i+1}: {e}")
140
  answer = f"Error: {str(e)}"
141
 
142
  results.append({
143
  "Task ID": task_id,
144
+ "Question": question_text,
145
  "Answer": answer,
146
+ "Status": "βœ“" if answer and not answer.startswith("Error:") and answer != "Unable to determine answer" else "βœ—"
147
  })
148
 
149
  answers_for_submission.append({
 
151
  "submitted_answer": answer
152
  })
153
 
154
+ # Cleanup
155
  if file_path and os.path.exists(file_path):
156
  try:
157
  os.remove(file_path)
158
+ os.rmdir(os.path.dirname(file_path))
159
+ except:
160
+ pass
 
 
 
 
 
 
161
 
162
  df = pd.DataFrame(results)
163
  progress(1.0, desc="Complete!")
164
  return df, answers_for_submission
165
 
166
  except Exception as e:
167
+ logger.error(f"Error: {e}")
168
  return f"Error: {str(e)}", None
169
 
170
 
171
+ def test_single_question():
172
+ """Test on a single random question."""
173
+ try:
174
+ agent = GAIAAgent()
175
+ question_data = fetch_random_question()
176
+
177
+ if not question_data:
178
+ return "Error: Failed to fetch question.", "", "", ""
179
+
180
+ task_id = question_data.get("task_id", "")
181
+ question_text = question_data.get("question", "")
182
+
183
+ file_path = None
184
+ if question_data.get("file_name"):
185
+ file_path = fetch_file(task_id)
186
+
187
+ answer = agent.run(question_text, task_id, file_path)
188
+
189
+ # Cleanup
190
+ if file_path and os.path.exists(file_path):
191
+ try:
192
+ os.remove(file_path)
193
+ os.rmdir(os.path.dirname(file_path))
194
+ except:
195
+ pass
196
+
197
+ status = "βœ“ Valid" if answer and not answer.startswith("Error") else "⚠️ Check answer"
198
+ return question_text, answer, task_id, status
199
+
200
+ except Exception as e:
201
+ logger.error(f"Error: {e}")
202
+ return f"Error: {str(e)}", "", "", ""
203
+
204
+
205
  def submit_to_leaderboard(username: str, space_url: str, answers_json: str):
206
+ """Submit to leaderboard."""
207
  if not username or not space_url or not answers_json:
208
  return "Please fill in all fields and run the agent first."
209
 
 
211
  answers = json.loads(answers_json) if isinstance(answers_json, str) else answers_json
212
 
213
  if not isinstance(answers, list) or len(answers) == 0:
214
+ return "Error: Run the benchmark first."
215
 
 
 
 
 
 
 
 
 
 
 
216
  if not space_url.endswith("/tree/main"):
217
  space_url = space_url.rstrip("/") + "/tree/main"
218
 
 
219
  result = submit_answers(username, space_url, answers)
 
 
220
  print(result)
221
+ score = result.get("score", 0)
222
  correct = result.get("correct_count", 0)
223
  total = result.get("total_attempted", 0)
224
 
225
+ cert_msg = "πŸ† **Congratulations!** Score above 30% - Certificate earned!" if score > 0.3 else "❌ Need >30% for certificate."
 
 
 
 
226
 
227
  return f"""
228
+ ## Submission Results
229
 
230
  **Score:** {score:.1%}
231
  **Correct:** {correct}/{total}
232
 
233
+ {cert_msg}
 
234
 
235
+ [View Leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard)
236
  """
 
 
237
  except Exception as e:
238
  logger.error(f"Submission error: {e}")
239
+ return f"Error: {str(e)}"
240
 
241
 
242
+ # ============ GRADIO APP ============
243
+ with gr.Blocks(title="GAIA Agent", theme=gr.themes.Soft()) as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  gr.Markdown("""
245
+ # πŸ€– GAIA Benchmark Agent
246
 
247
+ **Tools:** πŸ” Web Search | πŸ“š Wikipedia | 🐍 Python | πŸ“„ Files | πŸ”’ Calculator | 🌐 Webpages | πŸ‘οΈ Vision (OpenAI)
 
 
 
 
 
 
 
 
 
 
248
  """)
249
 
250
+ env_status = gr.Markdown(get_env_status())
 
 
 
 
 
 
251
 
252
  with gr.Tabs():
253
+ with gr.TabItem("πŸ§ͺ Test Single"):
254
  test_btn = gr.Button("Fetch & Solve Random Question", variant="primary")
255
+ test_q = gr.Textbox(label="Question", lines=4, interactive=False)
256
+ test_a = gr.Textbox(label="Answer", lines=2, interactive=False)
257
+ test_id = gr.Textbox(label="Task ID", interactive=False)
258
+ test_status = gr.Textbox(label="Status", interactive=False)
259
 
260
+ test_btn.click(test_single_question, outputs=[test_q, test_a, test_id, test_status])
 
 
 
 
261
 
262
+ with gr.TabItem("πŸš€ Full Benchmark"):
263
+ run_btn = gr.Button("Run on All Questions", variant="primary")
264
+ results_df = gr.Dataframe(label="Results")
265
  answers_state = gr.State()
266
 
267
+ run_btn.click(run_agent_on_questions, outputs=[results_df, answers_state])
 
 
 
 
268
 
269
+ with gr.TabItem("πŸ“€ Submit"):
270
+ gr.Markdown("### Submit to Leaderboard")
 
 
 
 
 
 
 
 
 
271
 
272
  with gr.Row():
273
+ username_in = gr.Textbox(label="HF Username", placeholder="your-username")
274
+ space_url_in = gr.Textbox(label="Space URL", placeholder="https://huggingface.co/spaces/you/space")
 
 
 
 
 
 
 
 
275
 
276
+ answers_in = gr.Textbox(label="Answers JSON (auto-filled)", lines=8)
277
+ submit_btn = gr.Button("Submit", variant="primary")
 
 
 
 
 
 
278
  submit_result = gr.Markdown()
279
 
280
+ def format_answers(a):
281
+ return json.dumps(a, indent=2) if a else ""
 
 
 
 
 
282
 
283
+ answers_state.change(format_answers, inputs=[answers_state], outputs=[answers_in])
284
+ submit_btn.click(submit_to_leaderboard, inputs=[username_in, space_url_in, answers_in], outputs=[submit_result])
 
 
 
285
 
286
  gr.Markdown("""
287
  ---
288
+ **Setup:**
289
+ - Local: `ollama serve` + `ollama pull qwen2.5:32b`
290
+ - Production: Set `OPENAI_API_KEY` in `.env` or HF Secrets
 
 
291
  """)
292
 
293
  if __name__ == "__main__":
 
 
294
  demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt CHANGED
@@ -1,20 +1,30 @@
1
- # Core dependencies
2
  gradio>=4.0.0,<5.0.0
3
  requests>=2.31.0,<3.0.0
4
  pandas>=2.0.0,<3.0.0
 
5
 
6
  # LangChain & LangGraph
7
  langgraph>=0.2.0,<1.0.0
8
  langchain>=0.2.0,<1.0.0
9
- langchain-core>=0.2.0,<1.0.0
10
  langchain-openai>=0.1.0,<1.0.0
 
11
  langchain-community>=0.2.0,<1.0.0
12
  langchain-experimental>=0.0.60,<1.0.0
13
 
14
- # Tools dependencies
 
 
 
15
  duckduckgo-search>=6.0.0,<7.0.0
16
  pypdf>=4.0.0,<5.0.0
 
17
  openpyxl>=3.1.0,<4.0.0
 
18
 
19
- # Utilities
20
  python-dotenv>=1.0.0,<2.0.0
 
 
 
 
1
+ # Core
2
  gradio>=4.0.0,<5.0.0
3
  requests>=2.31.0,<3.0.0
4
  pandas>=2.0.0,<3.0.0
5
+ numpy>=1.24.0,<3.0.0
6
 
7
  # LangChain & LangGraph
8
  langgraph>=0.2.0,<1.0.0
9
  langchain>=0.2.0,<1.0.0
10
+ langchain-core>=0.2.0,<0.4.0
11
  langchain-openai>=0.1.0,<1.0.0
12
+ langchain-ollama>=0.1.0,<2.0.0
13
  langchain-community>=0.2.0,<1.0.0
14
  langchain-experimental>=0.0.60,<1.0.0
15
 
16
+ # OpenAI (for GPT-4o + Whisper)
17
+ openai>=1.0.0,<2.0.0
18
+
19
+ # Tools
20
  duckduckgo-search>=6.0.0,<7.0.0
21
  pypdf>=4.0.0,<5.0.0
22
+ pdfplumber>=0.10.0,<1.0.0
23
  openpyxl>=3.1.0,<4.0.0
24
+ beautifulsoup4>=4.12.0,<5.0.0
25
 
26
+ # Utils
27
  python-dotenv>=1.0.0,<2.0.0
28
+
29
+ # Audio Transcription (for Ollama)
30
+ faster-whisper>=0.10.0