jebaponselvasingh committed on
Commit
d1dcd56
·
1 Parent(s): e0ff305

changes in the domain structure

Browse files
Files changed (5) hide show
  1. .env +0 -1
  2. __pycache__/agent_enhanced.cpython-312.pyc +0 -0
  3. agent_enhanced.py +549 -520
  4. app.py +160 -277
  5. requirements.txt +14 -4
.env DELETED
@@ -1 +0,0 @@
1
- OPENAI_API_KEY="sk-proj-[REDACTED — a real API key was committed here; deleting the .env file does not remove it from git history. Rotate/revoke this credential immediately and purge it from history (e.g. git-filter-repo), then supply the key via environment secrets instead of a tracked file.]"
 
 
__pycache__/agent_enhanced.cpython-312.pyc ADDED
Binary file (36.9 kB). View file
 
agent_enhanced.py CHANGED
@@ -1,33 +1,68 @@
1
  """
2
- Enhanced GAIA Agent with LangGraph
3
- Separate module for cleaner architecture and easier customization
4
  """
5
 
6
  import os
7
  import re
8
  import json
9
  import requests
10
- import tempfile
11
- from typing import TypedDict, Annotated, Sequence, Literal, Any
 
 
12
  import operator
 
 
 
13
 
14
  from langgraph.graph import StateGraph, END
15
  from langgraph.prebuilt import ToolNode
16
  from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, BaseMessage
17
  from langchain_core.tools import tool
18
- from langchain_openai import ChatOpenAI
19
  from langchain_community.tools import DuckDuckGoSearchResults
20
  from langchain_experimental.utilities import PythonREPL
21
  import pandas as pd
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  # ============ STATE DEFINITION ============
25
  class AgentState(TypedDict):
26
- """State maintained throughout the agent's execution."""
27
  messages: Annotated[Sequence[BaseMessage], operator.add]
28
  task_id: str
29
  file_path: str | None
30
- file_content: str | None
31
  iteration_count: int
32
  final_answer: str | None
33
 
@@ -36,74 +71,46 @@ class AgentState(TypedDict):
36
  @tool
37
  def web_search(query: str) -> str:
38
  """
39
- Search the web using DuckDuckGo for current information.
40
- Use this for questions about recent events, facts, statistics, or any information
41
- that might have changed or that you're uncertain about.
42
 
43
  Args:
44
- query: The search query string
45
-
46
- Returns:
47
- Search results with relevant snippets
48
  """
49
- import logging
50
-
51
- # Suppress non-critical errors from DuckDuckGo's internal engines
52
- # (Some engines like grokipedia may fail due to DNS issues, but others work fine)
53
- ddgs_logger = logging.getLogger("ddgs.ddgs")
54
- primp_logger = logging.getLogger("primp")
55
-
56
- # Store original levels
57
- ddgs_original = ddgs_logger.level if ddgs_logger.level else logging.NOTSET
58
- primp_original = primp_logger.level if primp_logger.level else logging.NOTSET
59
-
60
- # Suppress INFO level logs (which include non-critical engine errors)
61
- ddgs_logger.setLevel(logging.WARNING)
62
- primp_logger.setLevel(logging.WARNING)
63
 
64
  try:
65
- search = DuckDuckGoSearchResults(max_results=5, output_format="list")
66
  results = search.run(query)
67
 
68
- # Restore original logging levels
69
- if ddgs_original != logging.NOTSET:
70
- ddgs_logger.setLevel(ddgs_original)
71
- if primp_original != logging.NOTSET:
72
- primp_logger.setLevel(primp_original)
73
-
74
  if isinstance(results, list):
75
  formatted = []
76
  for r in results:
77
  if isinstance(r, dict):
78
- formatted.append(f"Title: {r.get('title', 'N/A')}\nSnippet: {r.get('snippet', 'N/A')}\nLink: {r.get('link', 'N/A')}")
79
- else:
80
- formatted.append(str(r))
81
- return "\n\n---\n\n".join(formatted)
82
- return str(results)
 
 
83
  except Exception as e:
84
- # Restore original logging levels even on exception
85
- if ddgs_original != logging.NOTSET:
86
- ddgs_logger.setLevel(ddgs_original)
87
- if primp_original != logging.NOTSET:
88
- primp_logger.setLevel(primp_original)
89
- return f"Search failed: {str(e)}. Try a different query or approach."
90
 
91
 
92
  @tool
93
  def python_executor(code: str) -> str:
94
  """
95
- Execute Python code for calculations, data analysis, or any computational task.
96
- You have access to standard libraries: math, statistics, datetime, json, re, collections.
 
97
 
98
  Args:
99
- code: Python code to execute. Print statements will show in output.
100
-
101
- Returns:
102
- The output/result of the code execution
103
  """
104
  try:
105
  repl = PythonREPL()
106
- # Add common imports to the code
107
  augmented_code = """
108
  import math
109
  import statistics
@@ -111,24 +118,28 @@ import datetime
111
  import json
112
  import re
113
  from collections import Counter, defaultdict
 
 
 
 
114
  """ + code
115
  result = repl.run(augmented_code)
116
- return result.strip() if result else "Code executed successfully with no output. Add print() to see results."
 
 
 
117
  except Exception as e:
118
- return f"Execution error: {str(e)}. Please fix the code and try again."
119
 
120
 
121
  @tool
122
  def read_file(file_path: str) -> str:
123
  """
124
- Read and extract content from various file types.
125
- Supports: PDF, TXT, MD, CSV, JSON, XLSX, XLS, PY, and other text files.
126
 
127
  Args:
128
- file_path: Path to the file to read
129
-
130
- Returns:
131
- The content of the file as a string
132
  """
133
  try:
134
  if not os.path.exists(file_path):
@@ -136,598 +147,616 @@ def read_file(file_path: str) -> str:
136
 
137
  file_lower = file_path.lower()
138
 
 
 
 
 
 
 
 
 
 
139
  if file_lower.endswith('.pdf'):
140
- from langchain_community.document_loaders import PyPDFLoader
141
- loader = PyPDFLoader(file_path)
142
- pages = loader.load()
143
- content = "\n\n--- Page Break ---\n\n".join([p.page_content for p in pages])
144
- return f"PDF Content ({len(pages)} pages):\n{content}"
145
-
146
- elif file_lower.endswith(('.xlsx', '.xls')):
147
- df = pd.read_excel(file_path, sheet_name=None) # Read all sheets
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  result = []
149
- for sheet_name, sheet_df in df.items():
150
- result.append(f"=== Sheet: {sheet_name} ===\n{sheet_df.to_string()}")
 
 
151
  return "\n\n".join(result)
152
 
153
- elif file_lower.endswith('.csv'):
 
154
  df = pd.read_csv(file_path)
155
- return f"CSV Data ({len(df)} rows):\n{df.to_string()}"
156
 
157
- elif file_lower.endswith('.json'):
 
158
  with open(file_path, 'r', encoding='utf-8') as f:
159
  data = json.load(f)
160
- return f"JSON Content:\n{json.dumps(data, indent=2)}"
161
-
162
- else: # Default: treat as text
163
- with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
164
- content = f.read()
165
- return f"File Content:\n{content}"
 
 
166
 
167
  except Exception as e:
168
- return f"Error reading file: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
 
171
  @tool
172
  def calculator(expression: str) -> str:
173
  """
174
- Evaluate a mathematical expression safely.
175
- Supports: basic arithmetic, trigonometry, logarithms, exponents, etc.
176
 
177
  Args:
178
- expression: Mathematical expression (e.g., "sqrt(16) + log(100, 10)")
179
-
180
- Returns:
181
- The numerical result as a string
182
  """
183
  try:
184
  import math
185
-
186
- # Define allowed functions and constants
187
  safe_dict = {
188
  'abs': abs, 'round': round, 'min': min, 'max': max,
189
- 'sum': sum, 'pow': pow, 'len': len,
190
  'sqrt': math.sqrt, 'log': math.log, 'log10': math.log10,
191
  'log2': math.log2, 'exp': math.exp,
192
  'sin': math.sin, 'cos': math.cos, 'tan': math.tan,
193
- 'asin': math.asin, 'acos': math.acos, 'atan': math.atan,
194
- 'sinh': math.sinh, 'cosh': math.cosh, 'tanh': math.tanh,
195
  'ceil': math.ceil, 'floor': math.floor,
196
- 'pi': math.pi, 'e': math.e,
197
- 'factorial': math.factorial, 'gcd': math.gcd,
198
- 'degrees': math.degrees, 'radians': math.radians,
199
  }
200
-
201
  result = eval(expression, {"__builtins__": {}}, safe_dict)
202
-
203
- # Format nicely
204
- if isinstance(result, float):
205
- if result.is_integer():
206
- return str(int(result))
207
- return f"{result:.10g}" # Remove trailing zeros
208
- return str(result)
209
-
210
  except Exception as e:
211
- return f"Calculation error: {str(e)}. Check your expression syntax."
212
 
213
 
214
  @tool
215
  def wikipedia_search(query: str) -> str:
216
  """
217
- Search Wikipedia for factual information about a specific topic.
218
- Best for: historical facts, biographies, scientific concepts, definitions.
219
 
220
  Args:
221
- query: The topic to search for on Wikipedia
222
-
223
- Returns:
224
- Summary and key information from relevant Wikipedia articles
225
  """
226
  try:
227
  import urllib.parse
228
-
229
- # Search for articles
230
  search_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={urllib.parse.quote(query)}&format=json&srlimit=3"
231
- response = requests.get(search_url, timeout=10)
232
  data = response.json()
233
 
234
- if 'query' not in data or 'search' not in data['query'] or not data['query']['search']:
235
  return f"No Wikipedia articles found for '{query}'"
236
 
237
- # Get full content of top result
238
- top_title = data['query']['search'][0]['title']
239
- content_url = f"https://en.wikipedia.org/w/api.php?action=query&prop=extracts&exintro=true&explaintext=true&titles={urllib.parse.quote(top_title)}&format=json"
240
-
241
- content_response = requests.get(content_url, timeout=10)
242
- content_data = content_response.json()
243
-
244
- pages = content_data.get('query', {}).get('pages', {})
245
- for page_id, page_data in pages.items():
246
- if page_id != '-1':
247
- title = page_data.get('title', '')
248
- extract = page_data.get('extract', 'No content available')
249
- return f"Wikipedia: {title}\n\n{extract[:2000]}"
250
-
251
- return "Could not retrieve article content."
252
-
253
  except Exception as e:
254
- return f"Wikipedia search failed: {str(e)}"
255
 
256
 
257
  @tool
258
- def analyze_image(image_path: str, question: str) -> str:
259
  """
260
- Analyze an image file and answer questions about it.
261
- Note: This is a placeholder - implement with vision model if needed.
262
 
263
  Args:
264
- image_path: Path to the image file
265
- question: What to analyze or find in the image
266
-
267
- Returns:
268
- Description or analysis of the image
269
  """
270
- # This is a placeholder - you can integrate with GPT-4V or other vision models
271
- return f"Image analysis not implemented. File: {image_path}, Question: {question}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
 
274
- # Collect all tools
275
- TOOLS = [web_search, python_executor, read_file, calculator, wikipedia_search]
276
 
277
 
278
  # ============ SYSTEM PROMPT ============
279
- SYSTEM_PROMPT = """You are an expert AI assistant designed to solve GAIA benchmark questions with maximum accuracy.
280
-
281
- ## Your Mission
282
- Provide PRECISE, EXACT answers. The benchmark uses EXACT STRING MATCHING, so your final answer must match the ground truth character-for-character.
283
-
284
- ## Critical Answer Formatting Rules (MUST FOLLOW)
285
-
286
- **DO NOT include "FINAL ANSWER:" or any prefix - just the answer itself.**
287
-
288
- 1. **Numbers**: Give just the number.
289
- - βœ… CORRECT: "42"
290
- - ❌ WRONG: "The answer is 42", "42 units", "Answer: 42"
291
-
292
- 2. **Names**: Exact spelling as found in sources. Check Wikipedia/official sources for correct spelling, capitalization, and punctuation.
293
- - βœ… CORRECT: "John Smith"
294
- - ❌ WRONG: "john smith", "John smith"
295
-
296
- 3. **Lists**: Comma-separated, NO spaces after commas.
297
- - βœ… CORRECT: "apple,banana,cherry"
298
- - ❌ WRONG: "apple, banana, cherry", "apple,banana, cherry"
299
-
300
- 4. **Dates**: Use the format specified in the question, or YYYY-MM-DD if not specified.
301
- - βœ… CORRECT: "2024-01-15" or "January 15, 2024" (if question asks for that format)
302
- - ❌ WRONG: "1/15/2024" (unless question asks for it)
303
-
304
- 5. **Yes/No**: Just "Yes" or "No" (capitalized, no period).
305
- - βœ… CORRECT: "Yes"
306
- - ❌ WRONG: "yes", "Yes.", "The answer is Yes"
307
-
308
- 6. **Counts**: Just the number.
309
- - βœ… CORRECT: "5"
310
- - ❌ WRONG: "5 items", "five", "There are 5"
311
-
312
- 7. **No explanations**: Your final response must contain ONLY the answer, nothing else.
313
- - βœ… CORRECT: "Paris"
314
- - ❌ WRONG: "The answer is Paris because..."
315
-
316
- ## Detailed Problem-Solving Strategy
317
-
318
- ### Step 1: Analyze the Question
319
- - Read the question word-by-word. What exactly is being asked?
320
- - Identify keywords: "what", "who", "when", "where", "how many", "calculate", "find"
321
- - Note any format requirements or constraints mentioned in the question
322
- - Check if the question references specific data, files, or time periods
323
-
324
- ### Step 2: File Priority (CRITICAL)
325
- - If a file is mentioned or available, you MUST read it FIRST before any other action
326
- - Files often contain the exact answer or the data needed to calculate it
327
- - After reading the file, carefully search through ALL content - don't miss details
328
- - For Excel/CSV files, examine ALL sheets and ALL columns
329
- - For PDFs, read ALL pages - answers can be anywhere in the document
330
-
331
- ### Step 3: Plan Your Approach
332
- - Based on the question type, decide which tools you need:
333
- - **Data extraction from file**: read_file (then possibly python_executor for analysis)
334
- - **Mathematical calculations**: python_executor or calculator
335
- - **Historical/factual information**: wikipedia_search first, then web_search if needed
336
- - **Current/recent information**: web_search
337
- - **Complex data analysis**: python_executor with pandas/numpy
338
- - Create a step-by-step plan before executing
339
-
340
- ### Step 4: Execute Systematically
341
- - Use ONE tool at a time, wait for results
342
- - For file-based questions: read file β†’ extract relevant data β†’ calculate/analyze β†’ verify
343
- - For fact-based questions: search β†’ verify from multiple sources if possible β†’ extract exact answer
344
- - For calculation questions: gather inputs β†’ perform calculation β†’ double-check math
345
- - If initial search doesn't yield results, try different query keywords
346
-
347
- ### Step 5: Verify and Cross-Check
348
- - Verify your answer matches what was asked
349
- - For names: double-check spelling, capitalization, punctuation
350
- - For numbers: verify calculations, check units, ensure precision
351
- - For dates: verify format matches question requirements
352
- - If you found information from one source, try to verify with another if time permits
353
- - For lists: ensure proper comma-separated format with NO spaces
354
-
355
- ### Step 6: Format Correctly
356
- - Remove ALL prefixes ("FINAL ANSWER:", "The answer is:", etc.)
357
- - Remove ALL explanations and context
358
- - Ensure exact formatting (spaces, commas, capitalization)
359
- - Double-check: is this the EXACT format the question expects?
360
-
361
- ## Available Tools
362
- - `read_file`: Read PDFs, spreadsheets, text files - USE THIS FIRST if a file is available
363
- - `web_search`: Current information, recent events, facts (use for recent/current info)
364
- - `wikipedia_search`: Historical facts, biographies, definitions (use for established facts)
365
- - `python_executor`: Calculations, data processing, analysis (use for complex calculations or data analysis)
366
- - `calculator`: Quick mathematical calculations (use for simple arithmetic)
367
-
368
- ## Tool Usage Guidelines
369
-
370
- ### Reading Files (HIGHEST PRIORITY)
371
- - ALWAYS read files FIRST if available
372
- - For Excel files: check ALL sheets, read ALL relevant columns
373
- - For PDFs: read ALL pages, search for keywords from the question
374
- - For CSV files: examine ALL rows, look for patterns
375
- - Extract numbers, names, dates EXACTLY as they appear
376
-
377
- ### Web Search Strategy
378
- - Use specific, targeted queries with key terms from the question
379
- - If first search doesn't help, try rephrasing with different keywords
380
- - Look for official sources, authoritative websites
381
- - Extract exact values (numbers, names) - don't round or approximate
382
-
383
- ### Wikipedia Search Strategy
384
- - Use exact terms or names from the question
385
- - Read the summary/intro carefully - it often contains the answer
386
- - Check spelling, capitalization, dates exactly as shown
387
- - For biographical questions, search for the person's name
388
-
389
- ### Python Execution
390
- - Use for calculations, data analysis, or processing file contents
391
- - Be explicit with calculations - show your work in code
392
- - Use appropriate precision - don't round unnecessarily
393
- - Print the final result clearly
394
-
395
- ### Calculator
396
- - Use for simple arithmetic operations
397
- - Preserve precision - use exact fractions if possible
398
- - Format output correctly (integers as integers, decimals as needed)
399
-
400
- ## Critical Reminders
401
- - NEVER include "FINAL ANSWER:" or any prefix in your response
402
- - NEVER add explanations or context to your final answer
403
- - ALWAYS verify spelling, capitalization, and formatting
404
- - ALWAYS read files first if they are available - don't skip this step
405
- - For file-based questions, the answer is almost always in the file
406
- - Extract exact values - don't approximate or round unless necessary
407
- - If uncertain about format, look for clues in the question itself
408
- - Never guess - use tools to find accurate information
409
- - Use multiple tools if needed - don't stop after the first result if unsure
410
- - Cross-reference important facts when possible
411
-
412
- ## When You're Ready to Answer
413
- - Review your final answer one more time
414
- - Ensure it's formatted correctly (no prefixes, no explanations)
415
- - Ensure spelling, capitalization, and punctuation are exact
416
- - Ensure numbers are precise
417
- - When satisfied, respond with ONLY the answer - nothing else
418
-
419
- Remember: Your final message must contain ONLY the answer, nothing else. The scoring system uses exact string matching."""
420
-
421
-
422
- # ============ LANGGRAPH AGENT ============
423
  class GAIAAgent:
424
- """LangGraph-based agent for GAIA benchmark."""
425
 
426
  def __init__(
427
  self,
428
- model_name: str = "gpt-4o",
429
- api_key: str = None,
430
  temperature: float = 0,
431
- max_iterations: int = 25
432
  ):
433
- """
434
- Initialize the GAIA agent.
435
-
436
- Args:
437
- model_name: OpenAI model to use
438
- api_key: OpenAI API key (or set OPENAI_API_KEY env var)
439
- temperature: Model temperature (0 for deterministic)
440
- max_iterations: Maximum tool-use iterations
441
- """
442
- self.model_name = model_name
443
  self.max_iterations = max_iterations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
 
445
- self.llm = ChatOpenAI(
446
- model=model_name,
447
- temperature=temperature,
448
- api_key=api_key or os.environ.get("OPENAI_API_KEY")
449
- )
450
  self.llm_with_tools = self.llm.bind_tools(TOOLS)
451
  self.graph = self._build_graph()
452
 
453
  def _build_graph(self) -> StateGraph:
454
- """Construct the LangGraph workflow."""
455
  workflow = StateGraph(AgentState)
456
-
457
- # Define nodes
458
  workflow.add_node("agent", self._agent_node)
459
  workflow.add_node("tools", ToolNode(TOOLS))
460
  workflow.add_node("extract_answer", self._extract_answer_node)
461
-
462
- # Set entry point
463
  workflow.set_entry_point("agent")
464
-
465
- # Define edges
466
- workflow.add_conditional_edges(
467
- "agent",
468
- self._route_agent_output,
469
- {
470
- "tools": "tools",
471
- "end": "extract_answer"
472
- }
473
- )
474
  workflow.add_edge("tools", "agent")
475
  workflow.add_edge("extract_answer", END)
476
-
477
  return workflow.compile()
478
 
479
  def _agent_node(self, state: AgentState) -> dict:
480
- """Process messages and decide on next action."""
481
- messages = state["messages"]
482
  iteration = state.get("iteration_count", 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
 
484
- # Add iteration warnings to guide the agent
485
  if iteration >= self.max_iterations - 2:
486
- warning_msg = "⚠️ CRITICAL: You have reached the iteration limit. You MUST provide your final answer NOW in your next response. Format: ONLY the answer itself, no prefixes like 'FINAL ANSWER:' or 'The answer is:' - just the answer."
487
- messages = list(messages) + [SystemMessage(content=warning_msg)]
488
  elif iteration >= self.max_iterations - 5:
489
- warning_msg = "⚠️ WARNING: Approaching iteration limit. Start wrapping up and provide your final answer soon. Remember: just the answer, no prefix."
490
- messages = list(messages) + [SystemMessage(content=warning_msg)]
491
- elif iteration >= self.max_iterations - 8:
492
- reminder_msg = "Reminder: When you're ready to answer, provide ONLY the final answer with no prefix like 'FINAL ANSWER:' or 'The answer is:'. Check your answer format carefully."
493
- messages = list(messages) + [SystemMessage(content=reminder_msg)]
494
 
495
  try:
496
  response = self.llm_with_tools.invoke(messages)
497
  except Exception as e:
498
- # Graceful error handling
499
- error_msg = AIMessage(content=f"Error during reasoning: {str(e)}. Please try a different approach or provide your best answer.")
500
- return {
501
- "messages": [error_msg],
502
- "iteration_count": iteration + 1
503
- }
504
-
505
- return {
506
- "messages": [response],
507
- "iteration_count": iteration + 1
508
- }
509
-
510
- def _route_agent_output(self, state: AgentState) -> Literal["tools", "end"]:
511
- """Determine whether to use tools or finish."""
512
- last_message = state["messages"][-1]
513
- iteration = state.get("iteration_count", 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
514
 
515
- # Force end if max iterations reached
516
- if iteration >= self.max_iterations:
 
 
 
517
  return "end"
518
-
519
- # Check if agent wants to use tools
520
- if hasattr(last_message, "tool_calls") and last_message.tool_calls:
521
  return "tools"
522
-
523
  return "end"
524
 
525
  def _extract_answer_node(self, state: AgentState) -> dict:
526
- """Extract and clean the final answer."""
527
- # Try to find the answer in the last few messages
528
  messages = state["messages"]
529
 
530
- # Look for answer in last message first
531
- last_message = messages[-1]
532
- content = last_message.content if hasattr(last_message, "content") else str(last_message)
533
-
534
- # If last message is empty or doesn't contain clear answer, check previous messages
535
- if not content or len(content.strip()) < 3:
536
- # Look backwards through messages for the last non-empty content
537
- for msg in reversed(messages[:-1]):
538
- msg_content = msg.content if hasattr(msg, "content") else str(msg)
539
- if msg_content and len(msg_content.strip()) >= 3:
540
- content = msg_content
541
  break
542
 
543
- # Also check if we have tool results that might contain the answer
544
- # Look for tool results in recent messages
545
- for msg in reversed(messages[-5:]): # Check last 5 messages
546
- if hasattr(msg, "content") and msg.content:
547
- # Sometimes answers are in tool responses
548
- if "result" in msg.content.lower() or "answer" in msg.content.lower():
549
- # Extract potential answer from tool response
550
- lines = msg.content.split('\n')
551
- for line in lines:
552
- line_lower = line.lower()
553
- if any(word in line_lower for word in ["the answer is", "result is", "found:", "value:", "equals"]):
554
- # Try to extract just the answer part
555
- content = line
556
- break
557
-
558
  answer = self._clean_answer(content)
559
-
560
  return {"final_answer": answer}
561
 
562
- def _clean_answer(self, raw_answer: str) -> str:
563
- """Clean and format the final answer for exact matching."""
564
- if not raw_answer:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
565
  return ""
566
 
567
- answer = raw_answer.strip()
 
 
 
 
 
568
 
569
- # Remove common prefixes (case-insensitive, with variations)
570
  prefixes = [
571
- "the answer is:", "the answer is", "answer is:",
572
- "answer:", "answer", "answer:",
573
- "final answer:", "final answer", "FINAL ANSWER:", "FINAL ANSWER",
574
- "the final answer is:", "the final answer is",
575
- "result:", "result", "result is:",
576
- "solution:", "solution", "solution is:",
577
- "the solution is:", "the solution is",
578
- "it is", "it's", "that is", "that's",
579
- "the value is:", "the value is", "value is:",
580
- "the result is:", "the result is",
581
- "found:", "found", "equals:", "equals", "is:",
582
- "according to the", "based on the", "from the",
583
  ]
 
 
584
 
585
- answer_lower = answer.lower()
586
- for prefix in prefixes:
587
- if answer_lower.startswith(prefix):
588
- answer = answer[len(prefix):].strip()
589
- # Remove any leading colon, dash, or space
590
- answer = answer.lstrip(':').lstrip('-').lstrip().strip()
591
- answer_lower = answer.lower()
592
-
593
- # Remove explanations after the answer (look for common patterns)
594
- # Split by common explanation starters
595
- explanation_markers = [" because", " since", " as", " due to", " which", " that", " - ", " (", " [", "\n\n"]
596
- for marker in explanation_markers:
597
- if marker in answer:
598
- # For some markers, split and take first part
599
- if marker in [" - ", "\n\n"]:
600
- answer = answer.split(marker)[0].strip()
601
- # For parentheses/brackets, be more careful
602
- elif marker in [" (", " ["]:
603
- # Only remove if it looks like an explanation
604
- idx = answer.find(marker)
605
- if idx > 0 and idx < len(answer) - 3: # Not at start/end
606
- # Check if it's likely an explanation (has words, not just numbers/dates)
607
- rest = answer[idx+1:]
608
- if any(char.isalpha() for char in rest[:20]): # Has letters in first 20 chars
609
- answer = answer[:idx].strip()
610
- else:
611
- # For words like "because", split and take first part
612
- parts = answer.split(marker, 1)
613
- if len(parts) > 1:
614
- answer = parts[0].strip()
615
-
616
- # Remove quotes if they wrap the entire answer
617
  if (answer.startswith('"') and answer.endswith('"')) or \
618
  (answer.startswith("'") and answer.endswith("'")):
619
- answer = answer[1:-1].strip()
620
-
621
- # Remove trailing periods, commas, or semicolons for single-word/number answers
622
- # But preserve trailing punctuation for dates or other formatted answers
623
- if answer and ' ' not in answer:
624
- # Don't remove trailing punctuation if it's part of a date format or URL
625
- if not (answer.count('-') == 2 or answer.count('/') == 2 or '://' in answer):
626
- answer = answer.rstrip('.,;:')
627
-
628
- # Remove leading/trailing whitespace and normalize internal whitespace
629
- # But preserve formatting for lists (comma-separated)
630
- if ',' in answer and ' ' not in answer.replace(',', '').replace(' ', ''):
631
- # Comma-separated list without spaces - keep as is
632
- answer = answer.strip()
633
- else:
634
- answer = ' '.join(answer.split())
635
 
636
- # Remove markdown formatting if present
637
- if answer.startswith('**') and answer.endswith('**'):
638
- answer = answer[2:-2].strip()
639
- if answer.startswith('*') and answer.endswith('*') and not answer.startswith('**'):
640
- answer = answer[1:-1].strip()
641
 
642
- # Remove code block markers if present
643
- if answer.startswith('```') and answer.endswith('```'):
644
- lines = answer.split('\n')
645
- if len(lines) > 2:
646
- answer = '\n'.join(lines[1:-1]).strip()
647
-
648
- # Final cleanup: remove any remaining explanation patterns at the end
649
- answer = answer.split('\n')[0].strip() # Take first line only
650
- answer = answer.split('.')[0].strip() if answer.count('.') > 1 else answer # Take first sentence if multiple
651
 
652
  return answer.strip()
653
 
654
  def run(self, question: str, task_id: str = "", file_path: str = None) -> str:
655
- """
656
- Run the agent on a question.
657
-
658
- Args:
659
- question: The GAIA question to answer
660
- task_id: Optional task identifier
661
- file_path: Optional path to associated file
662
-
663
- Returns:
664
- The agent's final answer
665
- """
666
- # Prepare the user message with file priority
667
  user_content = question
 
 
 
668
  if file_path and os.path.exists(file_path):
669
- # Strongly emphasize reading the file first with detailed instructions
670
- file_extension = os.path.splitext(file_path)[1].lower()
671
- file_instructions = ""
672
 
673
- if file_extension in ['.xlsx', '.xls', '.csv']:
674
- file_instructions = "This is a spreadsheet file. Read it completely and examine ALL sheets (if Excel) and ALL columns. The answer is likely a number, date, name, or value extracted from this data. After reading, you may need to perform calculations or analysis using python_executor."
675
- elif file_extension == '.pdf':
676
- file_instructions = "This is a PDF file. Read ALL pages carefully. The answer may be anywhere in the document - in tables, text, or images. Search for keywords from the question."
677
- else:
678
- file_instructions = "This is a text-based file. Read it completely and carefully. The answer is likely somewhere in this file - look for exact values, names, dates, or information that matches the question."
679
 
680
- user_content = f"""CRITICAL: A file is available at {file_path}
681
-
682
- {file_instructions}
683
-
684
- **You MUST read this file FIRST before doing anything else.** Do not search the web or use other tools until you have read the file completely. The answer is very likely in this file.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
685
 
686
  Question: {question}"""
687
 
688
- # Initialize state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
689
  initial_state: AgentState = {
690
- "messages": [
691
- SystemMessage(content=SYSTEM_PROMPT),
692
- HumanMessage(content=user_content)
693
- ],
694
  "task_id": task_id,
695
  "file_path": file_path,
696
- "file_content": None,
697
  "iteration_count": 0,
698
  "final_answer": None
699
  }
700
 
701
- # Execute the graph
702
  try:
703
- final_state = self.graph.invoke(
704
- initial_state,
705
- {"recursion_limit": self.max_iterations * 2 + 5}
706
- )
707
- answer = final_state.get("final_answer", "Unable to determine answer")
708
 
709
- # Final validation - ensure answer is not empty or error message
710
- if not answer or answer.startswith("Agent error:") or answer.startswith("Unable to determine"):
711
- # Try to extract from last message if available
712
- if final_state.get("messages"):
713
- last_msg = final_state["messages"][-1]
714
- if hasattr(last_msg, "content") and last_msg.content:
715
- answer = self._clean_answer(last_msg.content)
 
716
 
717
  return answer if answer else "Unable to determine answer"
718
  except Exception as e:
719
- # Log error for debugging but return a clean error message
720
- import logging
721
- logging.error(f"Agent execution error: {str(e)}")
722
  return f"Agent error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
723
 
724
 
725
- # ============ UTILITY FUNCTIONS ============
726
- def create_agent(api_key: str = None, model: str = "gpt-4o") -> GAIAAgent:
727
- """Factory function to create a configured agent."""
728
- return GAIAAgent(
729
- model_name=model,
730
- api_key=api_key,
731
- temperature=0,
732
- max_iterations=15
733
- )
 
1
  """
2
+ Enhanced GAIA Agent with LangGraph - Fixed Version
3
+ Supports Ollama (local) and OpenAI (production)
4
  """
5
 
6
  import os
7
  import re
8
  import json
9
  import requests
10
+ import time
11
+ import logging
12
+ import base64
13
+ from typing import TypedDict, Annotated, Sequence, Literal
14
  import operator
15
+ from dotenv import load_dotenv
16
+
17
+ load_dotenv()
18
 
19
  from langgraph.graph import StateGraph, END
20
  from langgraph.prebuilt import ToolNode
21
  from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, BaseMessage
22
  from langchain_core.tools import tool
 
23
  from langchain_community.tools import DuckDuckGoSearchResults
24
  from langchain_experimental.utilities import PythonREPL
25
  import pandas as pd
26
 
27
+ logging.basicConfig(level=logging.INFO)
28
+ logger = logging.getLogger(__name__)
29
+
30
+ # ============ CONFIGURATION ============
31
+ OLLAMA_MODEL = "qwen2.5:32b" # Vision-capable model for image support
32
+ OLLAMA_BASE_URL = "http://localhost:11434"
33
+ OPENAI_MODEL = "gpt-4o"
34
+
35
+ # Vision-capable Ollama models
36
+ VISION_MODEL_KEYWORDS = ["vision", "vl", "llava", "bakllava", "gemma3", "qwen2.5-vl", "llama3.2-vision"]
37
+
38
+
39
+ def _is_vision_model(model_name: str) -> bool:
40
+ """Check if the model name suggests vision capability."""
41
+ if not model_name:
42
+ return False
43
+ model_lower = model_name.lower()
44
+ return any(keyword in model_lower for keyword in VISION_MODEL_KEYWORDS)
45
+
46
+
47
+ def is_ollama_available() -> bool:
48
+ """Check if Ollama is running locally."""
49
+ try:
50
+ response = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=2)
51
+ return response.status_code == 200
52
+ except:
53
+ return False
54
+
55
+
56
+ def is_production() -> bool:
57
+ """Check if running on HuggingFace Spaces."""
58
+ return bool(os.environ.get("SPACE_ID"))
59
+
60
 
61
  # ============ STATE DEFINITION ============
62
  class AgentState(TypedDict):
 
63
  messages: Annotated[Sequence[BaseMessage], operator.add]
64
  task_id: str
65
  file_path: str | None
 
66
  iteration_count: int
67
  final_answer: str | None
68
 
 
71
  @tool
72
  def web_search(query: str) -> str:
73
  """
74
+ Search the web for current information using DuckDuckGo.
75
+ Use for recent events, facts, statistics, or information you're uncertain about.
 
76
 
77
  Args:
78
+ query: Search query string
 
 
 
79
  """
80
+ for name in ["ddgs.ddgs", "primp"]:
81
+ logging.getLogger(name).setLevel(logging.ERROR)
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
  try:
84
+ search = DuckDuckGoSearchResults(max_results=8, output_format="list")
85
  results = search.run(query)
86
 
 
 
 
 
 
 
87
  if isinstance(results, list):
88
  formatted = []
89
  for r in results:
90
  if isinstance(r, dict):
91
+ formatted.append(
92
+ f"Title: {r.get('title', 'N/A')}\n"
93
+ f"Snippet: {r.get('snippet', 'N/A')}\n"
94
+ f"Link: {r.get('link', 'N/A')}"
95
+ )
96
+ return "\n\n---\n\n".join(formatted) if formatted else "No results found."
97
+ return str(results) if results else "No results found."
98
  except Exception as e:
99
+ return f"Search failed: {e}"
 
 
 
 
 
100
 
101
 
102
  @tool
103
  def python_executor(code: str) -> str:
104
  """
105
+ Execute Python code for calculations, data analysis, or computational tasks.
106
+ Available libraries: math, statistics, datetime, json, re, collections, pandas, numpy.
107
+ Use print() to see output.
108
 
109
  Args:
110
+ code: Python code to execute
 
 
 
111
  """
112
  try:
113
  repl = PythonREPL()
 
114
  augmented_code = """
115
  import math
116
  import statistics
 
118
  import json
119
  import re
120
  from collections import Counter, defaultdict
121
+ import pandas as pd
122
+ import numpy as np
123
+ from fractions import Fraction
124
+ from decimal import Decimal
125
  """ + code
126
  result = repl.run(augmented_code)
127
+ output = result.strip() if result else "Code executed with no output. Use print()."
128
+ if len(output) > 5000:
129
+ output = output[:5000] + "\n... (truncated)"
130
+ return output
131
  except Exception as e:
132
+ return f"Execution error: {e}"
133
 
134
 
135
  @tool
136
  def read_file(file_path: str) -> str:
137
  """
138
+ Read content from files. Supports: PDF, TXT, CSV, JSON, XLSX, XLS, PY, MP3, WAV, images.
139
+ ALWAYS use this FIRST when a file is provided.
140
 
141
  Args:
142
+ file_path: Path to the file
 
 
 
143
  """
144
  try:
145
  if not os.path.exists(file_path):
 
147
 
148
  file_lower = file_path.lower()
149
 
150
+ # Audio files
151
+ if file_lower.endswith(('.mp3', '.wav', '.m4a', '.ogg', '.flac', '.webm')):
152
+ return _transcribe_audio(file_path)
153
+
154
+ # Image files - return path for vision model
155
+ if file_lower.endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp')):
156
+ return f"IMAGE_FILE:{file_path}"
157
+
158
+ # PDF files
159
  if file_lower.endswith('.pdf'):
160
+ try:
161
+ from langchain_community.document_loaders import PyPDFLoader
162
+ loader = PyPDFLoader(file_path)
163
+ pages = loader.load()
164
+ content = "\n\n--- Page Break ---\n\n".join([p.page_content for p in pages])
165
+ return f"PDF Content ({len(pages)} pages):\n{content}"
166
+ except Exception as e:
167
+ try:
168
+ import pdfplumber
169
+ with pdfplumber.open(file_path) as pdf:
170
+ text = []
171
+ for i, page in enumerate(pdf.pages):
172
+ page_text = page.extract_text() or ""
173
+ tables = page.extract_tables()
174
+ table_text = ""
175
+ for table in tables:
176
+ if table:
177
+ table_text += "\n[TABLE]\n"
178
+ for row in table:
179
+ table_text += " | ".join(str(c) if c else "" for c in row) + "\n"
180
+ text.append(f"Page {i+1}:\n{page_text}\n{table_text}")
181
+ return f"PDF Content:\n" + "\n\n".join(text)
182
+ except:
183
+ return f"Error reading PDF: {e}"
184
+
185
+ # Excel files
186
+ if file_lower.endswith(('.xlsx', '.xls')):
187
+ df_dict = pd.read_excel(file_path, sheet_name=None)
188
  result = []
189
+ for sheet_name, df in df_dict.items():
190
+ result.append(f"=== Sheet: {sheet_name} ({len(df)} rows) ===")
191
+ result.append(f"Columns: {list(df.columns)}")
192
+ result.append(df.to_string(max_rows=200))
193
  return "\n\n".join(result)
194
 
195
+ # CSV files
196
+ if file_lower.endswith('.csv'):
197
  df = pd.read_csv(file_path)
198
+ return f"CSV ({len(df)} rows):\nColumns: {list(df.columns)}\n{df.to_string(max_rows=200)}"
199
 
200
+ # JSON files
201
+ if file_lower.endswith('.json'):
202
  with open(file_path, 'r', encoding='utf-8') as f:
203
  data = json.load(f)
204
+ return f"JSON:\n{json.dumps(data, indent=2)}"
205
+
206
+ # Default: text
207
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
208
+ content = f.read()
209
+ if len(content) > 15000:
210
+ content = content[:15000] + "\n... (truncated)"
211
+ return f"File Content:\n{content}"
212
 
213
  except Exception as e:
214
+ return f"Error reading file: {e}"
215
+
216
+
217
+ def _transcribe_audio(file_path: str) -> str:
218
+ """Transcribe audio using local Whisper (faster-whisper)."""
219
+ try:
220
+ from faster_whisper import WhisperModel
221
+ # Use base model for speed, can be upgraded to "small", "medium", "large" for better accuracy
222
+ model = WhisperModel("base", device="cpu", compute_type="int8")
223
+ segments, info = model.transcribe(file_path, beam_size=5)
224
+ transcript = " ".join([segment.text for segment in segments])
225
+ return f"Audio Transcription:\n{transcript}"
226
+ except ImportError:
227
+ return "Error: faster-whisper not installed. Install with: pip install faster-whisper"
228
+ except Exception as e:
229
+ logger.error(f"Audio transcription error: {e}")
230
+ return f"Audio transcription failed: {e}"
231
 
232
 
233
  @tool
234
  def calculator(expression: str) -> str:
235
  """
236
+ Evaluate mathematical expressions safely.
 
237
 
238
  Args:
239
+ expression: Math expression like "sqrt(16) + log(100, 10)"
 
 
 
240
  """
241
  try:
242
  import math
 
 
243
  safe_dict = {
244
  'abs': abs, 'round': round, 'min': min, 'max': max,
245
+ 'sum': sum, 'pow': pow, 'int': int, 'float': float,
246
  'sqrt': math.sqrt, 'log': math.log, 'log10': math.log10,
247
  'log2': math.log2, 'exp': math.exp,
248
  'sin': math.sin, 'cos': math.cos, 'tan': math.tan,
 
 
249
  'ceil': math.ceil, 'floor': math.floor,
250
+ 'pi': math.pi, 'e': math.e, 'factorial': math.factorial,
 
 
251
  }
 
252
  result = eval(expression, {"__builtins__": {}}, safe_dict)
253
+ if isinstance(result, float) and result.is_integer():
254
+ return str(int(result))
255
+ return f"{result:.10g}" if isinstance(result, float) else str(result)
 
 
 
 
 
256
  except Exception as e:
257
+ return f"Calculation error: {e}"
258
 
259
 
260
  @tool
261
  def wikipedia_search(query: str) -> str:
262
  """
263
+ Search Wikipedia for factual information.
264
+ Best for historical facts, biographies, scientific concepts.
265
 
266
  Args:
267
+ query: Topic to search
 
 
 
268
  """
269
  try:
270
  import urllib.parse
 
 
271
  search_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={urllib.parse.quote(query)}&format=json&srlimit=3"
272
+ response = requests.get(search_url, timeout=15)
273
  data = response.json()
274
 
275
+ if 'query' not in data or not data['query'].get('search'):
276
  return f"No Wikipedia articles found for '{query}'"
277
 
278
+ results = []
279
+ for item in data['query']['search'][:2]:
280
+ title = item['title']
281
+ content_url = f"https://en.wikipedia.org/w/api.php?action=query&prop=extracts&exintro=false&explaintext=true&titles={urllib.parse.quote(title)}&format=json&exchars=4000"
282
+ content_response = requests.get(content_url, timeout=15)
283
+ pages = content_response.json().get('query', {}).get('pages', {})
284
+ for page_id, page_data in pages.items():
285
+ if page_id != '-1':
286
+ results.append(f"## {title}\n{page_data.get('extract', 'No content')}")
287
+
288
+ return "\n\n---\n\n".join(results) if results else "No content found."
 
 
 
 
 
289
  except Exception as e:
290
+ return f"Wikipedia search failed: {e}"
291
 
292
 
293
  @tool
294
+ def fetch_webpage(url: str) -> str:
295
  """
296
+ Fetch and extract text from a webpage URL.
 
297
 
298
  Args:
299
+ url: The webpage URL
 
 
 
 
300
  """
301
+ try:
302
+ headers = {'User-Agent': 'Mozilla/5.0 (compatible; GaiaBot/1.0)'}
303
+ response = requests.get(url, headers=headers, timeout=15)
304
+ response.raise_for_status()
305
+
306
+ try:
307
+ from bs4 import BeautifulSoup
308
+ soup = BeautifulSoup(response.text, 'html.parser')
309
+ for el in soup(['script', 'style', 'nav', 'footer', 'header']):
310
+ el.decompose()
311
+ text = soup.get_text(separator='\n', strip=True)
312
+ lines = [l.strip() for l in text.splitlines() if l.strip()]
313
+ text = '\n'.join(lines)
314
+ if len(text) > 10000:
315
+ text = text[:10000] + "\n... (truncated)"
316
+ return f"Webpage ({url}):\n{text}"
317
+ except ImportError:
318
+ return f"Raw HTML:\n{response.text[:10000]}"
319
+ except Exception as e:
320
+ return f"Failed to fetch: {e}"
321
 
322
 
323
+ TOOLS = [web_search, python_executor, read_file, calculator, wikipedia_search, fetch_webpage]
 
324
 
325
 
326
  # ============ SYSTEM PROMPT ============
327
+ SYSTEM_PROMPT = """You are an expert AI solving GAIA benchmark questions. Your goal is MAXIMUM ACCURACY.
328
+
329
+ ## CRITICAL: Answer Format (EXACT STRING MATCHING)
330
+ Your final answer must be ONLY the answer value - nothing else.
331
+
332
+ **Rules:**
333
+ - Numbers: "42" (not "The answer is 42")
334
+ - Names: Exact spelling "John Smith"
335
+ - Lists: Comma-separated, NO spaces: "apple,banana,cherry"
336
+ - Dates: Requested format or YYYY-MM-DD
337
+ - Yes/No: "Yes" or "No"
338
+ - NEVER use prefixes like "Answer:", "FINAL ANSWER:", etc.
339
+ - NEVER explain - just the answer
340
+
341
+ ## Strategy
342
+
343
+ 1. **If file provided**: Use read_file FIRST - answer is usually there
344
+ 2. **For calculations**: Use python_executor or calculator
345
+ 3. **For facts**: wikipedia_search for historical, web_search for current
346
+ 4. **For URLs in question**: Use fetch_webpage
347
+ 5. **Verify**: Check spelling, formatting, precision
348
+
349
+ ## When Ready
350
+ State ONLY the answer value. Nothing else."""
351
+
352
+
353
+ # ============ AGENT CLASS ============
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  class GAIAAgent:
355
+ """LangGraph agent for GAIA benchmark."""
356
 
357
  def __init__(
358
  self,
359
+ model_name: str = None,
 
360
  temperature: float = 0,
361
+ max_iterations: int = 25,
362
  ):
 
 
 
 
 
 
 
 
 
 
363
  self.max_iterations = max_iterations
364
+ self.use_openai = is_production() or not is_ollama_available()
365
+
366
+ if self.use_openai:
367
+ from langchain_openai import ChatOpenAI
368
+ api_key = os.environ.get("OPENAI_API_KEY")
369
+ if not api_key:
370
+ raise ValueError("OPENAI_API_KEY not found")
371
+ self.model_name = model_name or OPENAI_MODEL
372
+ self.llm = ChatOpenAI(model=self.model_name, temperature=temperature, api_key=api_key)
373
+ self.supports_vision = True # OpenAI models support vision
374
+ logger.info(f"Using OpenAI: {self.model_name}")
375
+ else:
376
+ from langchain_ollama import ChatOllama
377
+ self.model_name = model_name or OLLAMA_MODEL
378
+ self.llm = ChatOllama(model=self.model_name, base_url=OLLAMA_BASE_URL, temperature=temperature)
379
+ self.supports_vision = _is_vision_model(self.model_name)
380
+ logger.info(f"Using Ollama: {self.model_name} (vision: {self.supports_vision})")
381
 
 
 
 
 
 
382
  self.llm_with_tools = self.llm.bind_tools(TOOLS)
383
  self.graph = self._build_graph()
384
 
385
  def _build_graph(self) -> StateGraph:
 
386
  workflow = StateGraph(AgentState)
 
 
387
  workflow.add_node("agent", self._agent_node)
388
  workflow.add_node("tools", ToolNode(TOOLS))
389
  workflow.add_node("extract_answer", self._extract_answer_node)
 
 
390
  workflow.set_entry_point("agent")
391
+ workflow.add_conditional_edges("agent", self._route, {"tools": "tools", "end": "extract_answer"})
 
 
 
 
 
 
 
 
 
392
  workflow.add_edge("tools", "agent")
393
  workflow.add_edge("extract_answer", END)
 
394
  return workflow.compile()
395
 
396
  def _agent_node(self, state: AgentState) -> dict:
397
+ messages = list(state["messages"])
 
398
  iteration = state.get("iteration_count", 0)
399
+ file_path = state.get("file_path")
400
+
401
+ # If using Ollama vision and image exists, ensure image is included in the last user message
402
+ if not self.use_openai and self.supports_vision and file_path and os.path.exists(file_path):
403
+ ext = os.path.splitext(file_path)[1].lower()
404
+ is_image = ext in ['.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp']
405
+
406
+ if is_image:
407
+ # Check if the last message is a HumanMessage without image content
408
+ # If so, we need to add the image to it
409
+ last_msg = messages[-1] if messages else None
410
+ if isinstance(last_msg, HumanMessage):
411
+ # Check if message content is a string (text only) or list (multimodal)
412
+ if isinstance(last_msg.content, str):
413
+ # Convert text-only message to multimodal with image
414
+ try:
415
+ with open(file_path, "rb") as f:
416
+ image_data = base64.b64encode(f.read()).decode('utf-8')
417
+
418
+ media_type = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg",
419
+ "gif": "image/gif", "webp": "image/webp", "bmp": "image/bmp"}.get(ext.lstrip('.'), "image/png")
420
+
421
+ # Replace the last message with multimodal version
422
+ messages[-1] = HumanMessage(
423
+ content=[
424
+ {"type": "text", "text": last_msg.content},
425
+ {"type": "image_url", "image_url": {"url": f"data:{media_type};base64,{image_data}"}}
426
+ ]
427
+ )
428
+ except Exception as e:
429
+ logger.warning(f"Failed to add image to message: {e}")
430
 
 
431
  if iteration >= self.max_iterations - 2:
432
+ messages.append(SystemMessage(content="⚠️ FINAL: Provide answer NOW. Just the value."))
 
433
  elif iteration >= self.max_iterations - 5:
434
+ messages.append(SystemMessage(content="⚠️ Conclude soon. Provide the answer."))
435
+
436
+ if self.use_openai:
437
+ time.sleep(0.5)
 
438
 
439
  try:
440
  response = self.llm_with_tools.invoke(messages)
441
  except Exception as e:
442
+ error_str = str(e)
443
+ logger.error(f"LLM error: {error_str}")
444
+
445
+ # Check if error contains raw Python code (common with Ollama)
446
+ if "error parsing tool call" in error_str.lower() and "raw=" in error_str:
447
+ # Extract the raw code from the error message
448
+ try:
449
+ # Find the raw code between raw=' and '
450
+ match = re.search(r"raw='(.*?)'", error_str, re.DOTALL)
451
+ if match:
452
+ raw_code = match.group(1)
453
+ logger.info(f"Detected raw Python code, wrapping in python_executor tool call")
454
+
455
+ # Create a manual tool call for python_executor (dict format for langchain-core 0.3.x)
456
+ from langchain_core.messages import ToolMessage
457
+
458
+ tool_call_id = f"call_{int(time.time() * 1000)}"
459
+
460
+ # Execute the code directly via the tool
461
+ result = python_executor.invoke({"code": raw_code})
462
+
463
+ # Create a proper response with tool call (dict format)
464
+ tool_call_dict = {
465
+ "name": "python_executor",
466
+ "args": {"code": raw_code},
467
+ "id": tool_call_id
468
+ }
469
+ ai_msg = AIMessage(
470
+ content="",
471
+ tool_calls=[tool_call_dict]
472
+ )
473
+ tool_msg = ToolMessage(
474
+ content=result,
475
+ tool_call_id=tool_call_id
476
+ )
477
+ return {
478
+ "messages": [ai_msg, tool_msg],
479
+ "iteration_count": iteration + 1
480
+ }
481
+ except Exception as parse_error:
482
+ logger.error(f"Failed to extract code from error: {parse_error}")
483
+
484
+ return {"messages": [AIMessage(content="Error occurred.")], "iteration_count": iteration + 1}
485
 
486
+ return {"messages": [response], "iteration_count": iteration + 1}
487
+
488
+ def _route(self, state: AgentState) -> Literal["tools", "end"]:
489
+ last = state["messages"][-1]
490
+ if state.get("iteration_count", 0) >= self.max_iterations:
491
  return "end"
492
+ if hasattr(last, "tool_calls") and last.tool_calls:
 
 
493
  return "tools"
 
494
  return "end"
495
 
496
  def _extract_answer_node(self, state: AgentState) -> dict:
 
 
497
  messages = state["messages"]
498
 
499
+ # Find last substantive AI response
500
+ content = ""
501
+ for msg in reversed(messages):
502
+ if isinstance(msg, AIMessage) and msg.content:
503
+ c = msg.content.strip()
504
+ # Skip if it's clearly garbage/prompt repetition
505
+ if self._is_valid_answer_candidate(c):
506
+ content = c
 
 
 
507
  break
508
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
509
  answer = self._clean_answer(content)
 
510
  return {"final_answer": answer}
511
 
512
+ def _is_valid_answer_candidate(self, text: str) -> bool:
513
+ """Check if text looks like a valid answer, not garbage."""
514
+ if not text or len(text) < 1:
515
+ return False
516
+
517
+ text_lower = text.lower()
518
+
519
+ # Reject if it contains prompt text patterns
520
+ bad_patterns = [
521
+ "numbers: just", "format rules", "must follow",
522
+ "critical: answer format", "when ready", "your final answer",
523
+ "the benchmark uses", "exact string matching",
524
+ "no prefixes", "no explanations"
525
+ ]
526
+ if any(p in text_lower for p in bad_patterns):
527
+ return False
528
+
529
+ # Reject if it looks like the question was repeated
530
+ if "provide the correct next move" in text_lower:
531
+ return False
532
+ if text.startswith("Review the"):
533
+ return False
534
+
535
+ # Reject tool call syntax
536
+ if text.startswith("web_search(") or text.startswith("read_file("):
537
+ return False
538
+
539
+ return True
540
+
541
+ def _clean_answer(self, raw: str) -> str:
542
+ if not raw:
543
  return ""
544
 
545
+ answer = raw.strip()
546
+
547
+ # Remove markdown
548
+ answer = re.sub(r'\*\*(.+?)\*\*', r'\1', answer)
549
+ answer = re.sub(r'\*(.+?)\*', r'\1', answer)
550
+ answer = re.sub(r'`(.+?)`', r'\1', answer)
551
 
552
+ # Remove prefixes
553
  prefixes = [
554
+ r"^(?:the\s+)?(?:final\s+)?answer\s*(?:is)?:?\s*",
555
+ r"^result\s*:?\s*",
556
+ r"^therefore\s*,?\s*",
557
+ r"^thus\s*,?\s*",
558
+ r"^so\s*,?\s*",
 
 
 
 
 
 
 
559
  ]
560
+ for p in prefixes:
561
+ answer = re.sub(p, "", answer, flags=re.IGNORECASE)
562
 
563
+ # Remove quotes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
564
  if (answer.startswith('"') and answer.endswith('"')) or \
565
  (answer.startswith("'") and answer.endswith("'")):
566
+ answer = answer[1:-1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
567
 
568
+ # Take first line
569
+ answer = answer.split('\n')[0].strip()
 
 
 
570
 
571
+ # Remove trailing period for short answers
572
+ if answer.endswith('.') and len(answer.split()) <= 3:
573
+ answer = answer[:-1]
 
 
 
 
 
 
574
 
575
  return answer.strip()
576
 
577
  def run(self, question: str, task_id: str = "", file_path: str = None) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
578
  user_content = question
579
+ audio_transcript = None
580
+
581
+ # Handle files - dynamic image and audio detection
582
  if file_path and os.path.exists(file_path):
583
+ ext = os.path.splitext(file_path)[1].lower()
 
 
584
 
585
+ # Check for image files
586
+ is_image = ext in ['.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp']
587
+ is_audio = ext in ['.mp3', '.wav', '.m4a', '.ogg', '.flac', '.webm']
 
 
 
588
 
589
+ # Handle images with OpenAI vision
590
+ if is_image and self.use_openai:
591
+ return self._run_with_vision(question, task_id, file_path)
592
+
593
+ # Handle images with Ollama vision (if model supports it)
594
+ if is_image and not self.use_openai and self.supports_vision:
595
+ return self._run_with_ollama_vision(question, task_id, file_path)
596
+
597
+ # Handle audio files - transcribe first
598
+ if is_audio:
599
+ audio_transcript = _transcribe_audio(file_path)
600
+ # If transcription failed, continue with error message
601
+ if audio_transcript.startswith("Error:"):
602
+ logger.warning(f"Audio transcription failed: {audio_transcript}")
603
+ else:
604
+ # Combine question with audio transcript
605
+ user_content = f"{question}\n\n{audio_transcript}"
606
+
607
+ # Handle image + audio combination
608
+ if is_image and is_audio:
609
+ # This case is handled above - audio transcribed, image will be passed in messages
610
+ pass
611
+ elif is_image and not self.supports_vision:
612
+ # Image detected but model doesn't support vision
613
+ logger.warning(f"Image file detected but model {self.model_name} doesn't support vision")
614
+ return f"Error: Image file provided but model {self.model_name} doesn't support vision. Please use a vision-capable model like llama3.2-vision or qwen2.5-vl."
615
+
616
+ # Handle other file types
617
+ if not is_image and not is_audio:
618
+ file_hints = {
619
+ '.xlsx': "EXCEL file - use read_file to examine ALL sheets",
620
+ '.xls': "EXCEL file - use read_file to examine ALL sheets",
621
+ '.csv': "CSV file - use read_file, then python_executor for analysis",
622
+ '.pdf': "PDF file - use read_file to extract ALL text",
623
+ '.py': "Python file - use read_file to see the code",
624
+ }
625
+ hint = file_hints.get(ext, "Use read_file to examine contents")
626
+
627
+ user_content = f"""⚠️ FILE PROVIDED: {file_path}
628
+
629
+ {hint}
630
+
631
+ **Use read_file("{file_path}") FIRST.**
632
 
633
  Question: {question}"""
634
 
635
+ # Check for URLs in question
636
+ url_match = re.search(r'https?://[^\s]+', question)
637
+ if url_match:
638
+ user_content += f"\n\nπŸ’‘ URL detected: {url_match.group()} - Consider using fetch_webpage if needed."
639
+
640
+ # Build initial message - include image if using Ollama vision
641
+ initial_messages = [SystemMessage(content=SYSTEM_PROMPT)]
642
+
643
+ # If using Ollama vision and image exists, include image in message
644
+ if file_path and os.path.exists(file_path):
645
+ ext = os.path.splitext(file_path)[1].lower()
646
+ is_image = ext in ['.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp']
647
+
648
+ if is_image and not self.use_openai and self.supports_vision:
649
+ # Include image in HumanMessage for Ollama vision
650
+ try:
651
+ with open(file_path, "rb") as f:
652
+ image_data = base64.b64encode(f.read()).decode('utf-8')
653
+
654
+ media_type = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg",
655
+ "gif": "image/gif", "webp": "image/webp", "bmp": "image/bmp"}.get(ext.lstrip('.'), "image/png")
656
+
657
+ user_msg = HumanMessage(
658
+ content=[
659
+ {"type": "text", "text": user_content},
660
+ {"type": "image_url", "image_url": {"url": f"data:{media_type};base64,{image_data}"}}
661
+ ]
662
+ )
663
+ except Exception as e:
664
+ logger.error(f"Failed to encode image: {e}")
665
+ user_msg = HumanMessage(content=user_content)
666
+ else:
667
+ user_msg = HumanMessage(content=user_content)
668
+ else:
669
+ user_msg = HumanMessage(content=user_content)
670
+
671
+ initial_messages.append(user_msg)
672
+
673
  initial_state: AgentState = {
674
+ "messages": initial_messages,
 
 
 
675
  "task_id": task_id,
676
  "file_path": file_path,
 
677
  "iteration_count": 0,
678
  "final_answer": None
679
  }
680
 
 
681
  try:
682
+ final_state = self.graph.invoke(initial_state, {"recursion_limit": self.max_iterations * 2 + 10})
683
+ answer = final_state.get("final_answer", "")
 
 
 
684
 
685
+ if not answer or not self._is_valid_answer_candidate(answer):
686
+ # Try harder to find an answer
687
+ for msg in reversed(final_state.get("messages", [])):
688
+ if isinstance(msg, AIMessage) and msg.content:
689
+ candidate = self._clean_answer(msg.content)
690
+ if candidate and self._is_valid_answer_candidate(candidate):
691
+ answer = candidate
692
+ break
693
 
694
  return answer if answer else "Unable to determine answer"
695
  except Exception as e:
696
+ logger.error(f"Agent error: {e}")
 
 
697
  return f"Agent error: {str(e)}"
698
+
699
+ def _run_with_vision(self, question: str, task_id: str, image_path: str) -> str:
700
+ """Handle image questions using GPT-4o vision."""
701
+ try:
702
+ from openai import OpenAI
703
+ client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
704
+
705
+ # Read and encode image
706
+ with open(image_path, "rb") as f:
707
+ image_data = base64.b64encode(f.read()).decode('utf-8')
708
+
709
+ ext = os.path.splitext(image_path)[1].lower()
710
+ media_type = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg",
711
+ "gif": "image/gif", "webp": "image/webp"}.get(ext.lstrip('.'), "image/png")
712
+
713
+ response = client.chat.completions.create(
714
+ model="gpt-4o",
715
+ messages=[
716
+ {"role": "system", "content": "You are solving GAIA benchmark questions. Provide ONLY the answer value, no explanations or prefixes."},
717
+ {"role": "user", "content": [
718
+ {"type": "text", "text": question},
719
+ {"type": "image_url", "image_url": {"url": f"data:{media_type};base64,{image_data}"}}
720
+ ]}
721
+ ],
722
+ max_tokens=500,
723
+ temperature=0
724
+ )
725
+
726
+ answer = response.choices[0].message.content.strip()
727
+ return self._clean_answer(answer)
728
+ except Exception as e:
729
+ logger.error(f"Vision error: {e}")
730
+ return f"Vision error: {str(e)}"
731
+
732
+ def _run_with_ollama_vision(self, question: str, task_id: str, image_path: str) -> str:
733
+ """Handle image questions using Ollama vision models."""
734
+ try:
735
+ # Read and encode image
736
+ with open(image_path, "rb") as f:
737
+ image_data = base64.b64encode(f.read()).decode('utf-8')
738
+
739
+ ext = os.path.splitext(image_path)[1].lower()
740
+ media_type = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg",
741
+ "gif": "image/gif", "webp": "image/webp", "bmp": "image/bmp"}.get(ext.lstrip('.'), "image/png")
742
+
743
+ # Create message with image
744
+ message = HumanMessage(
745
+ content=[
746
+ {"type": "text", "text": question},
747
+ {"type": "image_url", "image_url": {"url": f"data:{media_type};base64,{image_data}"}}
748
+ ]
749
+ )
750
+
751
+ # Invoke model with system prompt and image message
752
+ response = self.llm.invoke([SystemMessage(content=SYSTEM_PROMPT), message])
753
+ answer = response.content if hasattr(response, 'content') else str(response)
754
+ return self._clean_answer(answer)
755
+ except Exception as e:
756
+ logger.error(f"Ollama vision error: {e}")
757
+ return f"Vision error: {str(e)}"
758
 
759
 
760
+ def create_agent() -> GAIAAgent:
761
+ """Create a configured agent."""
762
+ return GAIAAgent(temperature=0, max_iterations=25)
 
 
 
 
 
 
app.py CHANGED
@@ -6,166 +6,144 @@ import tempfile
6
  import json
7
  import logging
8
  from typing import Optional
 
9
 
10
- # Import the optimized agent from the separate module
11
- from agent_enhanced import GAIAAgent
 
12
 
13
- # ============ CONFIGURATION ============
14
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
15
 
16
- # Set up logging
17
  logging.basicConfig(level=logging.INFO)
18
  logger = logging.getLogger(__name__)
19
 
20
 
21
- # ============ API INTERACTION ============
22
- def fetch_questions(api_url: str = DEFAULT_API_URL, max_retries: int = 3) -> list:
23
- """Fetch all questions from the GAIA API with retry logic."""
24
- for attempt in range(max_retries):
25
  try:
26
  response = requests.get(f"{api_url}/questions", timeout=30)
27
  response.raise_for_status()
28
- return response.json()
29
- except requests.exceptions.RequestException as e:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  logger.warning(f"Attempt {attempt + 1} failed: {e}")
31
- if attempt == max_retries - 1:
32
- raise
33
  return []
34
 
35
- def fetch_random_question(api_url: str = DEFAULT_API_URL, max_retries: int = 3) -> dict:
36
- """Fetch a random question from the GAIA API with retry logic."""
37
- for attempt in range(max_retries):
 
38
  try:
39
  response = requests.get(f"{api_url}/random-question", timeout=30)
40
  response.raise_for_status()
41
  return response.json()
42
- except requests.exceptions.RequestException as e:
43
  logger.warning(f"Attempt {attempt + 1} failed: {e}")
44
- if attempt == max_retries - 1:
45
- raise
46
  return {}
47
 
48
- def fetch_file(task_id: str, api_url: str = DEFAULT_API_URL, max_retries: int = 3) -> Optional[str]:
49
- """Fetch a file associated with a task with retry logic."""
50
- for attempt in range(max_retries):
51
- try:
52
- response = requests.get(f"{api_url}/files/{task_id}", timeout=30)
53
- if response.status_code == 200:
54
- # Save to temp file
55
- content_disposition = response.headers.get('content-disposition', '')
56
- filename = f"task_{task_id}_file"
57
- if 'filename=' in content_disposition:
58
- filename = content_disposition.split('filename=')[1].strip('"')
59
-
60
- temp_dir = tempfile.mkdtemp()
61
- file_path = os.path.join(temp_dir, filename)
62
-
63
- with open(file_path, 'wb') as f:
64
- f.write(response.content)
65
-
66
- logger.info(f"Downloaded file: {file_path}")
67
- return file_path
68
- elif response.status_code == 404:
69
- logger.info(f"No file found for task {task_id}")
70
- return None
71
- except requests.exceptions.RequestException as e:
72
- logger.warning(f"File fetch attempt {attempt + 1} failed: {e}")
73
- if attempt == max_retries - 1:
74
- logger.error(f"Failed to fetch file for task {task_id}: {e}")
75
  return None
76
 
77
- def submit_answers(username: str, agent_code: str, answers: list, api_url: str = DEFAULT_API_URL, max_retries: int = 3) -> dict:
78
- """Submit answers to the GAIA API with retry logic."""
79
- payload = {
80
- "username": username,
81
- "agent_code": agent_code,
82
- "answers": answers
83
- }
84
-
85
- for attempt in range(max_retries):
86
- try:
87
- response = requests.post(f"{api_url}/submit", json=payload, timeout=60)
88
- response.raise_for_status()
89
- return response.json()
90
- except requests.exceptions.RequestException as e:
91
- logger.warning(f"Submission attempt {attempt + 1} failed: {e}")
92
- if attempt == max_retries - 1:
93
- raise
94
- return {}
95
 
 
 
 
 
 
 
96
 
97
- # ============ ANSWER VALIDATION ============
98
- def validate_answer_format(answer: str) -> tuple[bool, str]:
99
- """Validate answer format and return (is_valid, warning_message)."""
100
- if not answer or answer.strip() == "":
101
- return False, "Warning: Answer is empty"
102
-
103
- # Check for common prefixes that should be removed
104
- prefixes = ["FINAL ANSWER:", "The answer is:", "Answer:", "final answer:"]
105
- answer_lower = answer.lower()
106
- for prefix in prefixes:
107
- if answer_lower.startswith(prefix.lower()):
108
- return False, f"Warning: Answer contains prefix '{prefix}' which will be removed. Consider removing it."
109
-
110
- # Check for explanations (multiple sentences)
111
- if answer.count('.') > 1 or answer.count('because') > 0 or answer.count('since') > 0:
112
- return False, "Warning: Answer may contain explanations. Only the answer should be submitted."
113
-
114
- return True, ""
115
 
116
- # ============ GRADIO INTERFACE ============
117
- def run_agent_on_questions(openai_api_key: str, progress=gr.Progress()):
118
- """Run the agent on all GAIA questions."""
119
- if not openai_api_key:
120
- return "Please provide your OpenAI API key.", None
121
-
 
 
 
 
 
 
 
 
122
  try:
123
- # Initialize agent
124
  progress(0, desc="Initializing agent...")
125
- agent = GAIAAgent(api_key=openai_api_key)
126
 
127
- # Fetch questions
128
- progress(0.05, desc="Fetching questions from API...")
 
129
  questions = fetch_questions()
130
 
131
  if not questions:
132
- return "Error: Failed to fetch questions from API. Please try again.", None
133
 
134
- total_questions = len(questions)
135
  results = []
136
  answers_for_submission = []
137
 
138
  for i, q in enumerate(questions):
139
- progress((i + 1) / total_questions, desc=f"Processing question {i+1}/{total_questions}...")
140
 
141
  task_id = q.get("task_id", "")
142
  question_text = q.get("question", "")
143
 
144
- # Check if there's an associated file
145
  file_path = None
146
  if q.get("file_name"):
147
- progress((i + 0.5) / total_questions, desc=f"Downloading file for question {i+1}...")
148
  file_path = fetch_file(task_id)
149
 
150
- # Run agent
151
  try:
152
- progress((i + 0.7) / total_questions, desc=f"Agent reasoning for question {i+1}...")
153
  answer = agent.run(question_text, task_id, file_path)
154
-
155
- # Validate answer format
156
- is_valid, warning = validate_answer_format(answer)
157
- if not is_valid:
158
- logger.warning(f"Question {i+1} ({task_id}): {warning}")
159
-
160
  except Exception as e:
161
- logger.error(f"Error processing question {i+1} ({task_id}): {e}")
162
  answer = f"Error: {str(e)}"
163
 
164
  results.append({
165
  "Task ID": task_id,
166
- "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
167
  "Answer": answer,
168
- "Status": "βœ“" if answer and not answer.startswith("Error:") else "βœ—"
169
  })
170
 
171
  answers_for_submission.append({
@@ -173,31 +151,59 @@ def run_agent_on_questions(openai_api_key: str, progress=gr.Progress()):
173
  "submitted_answer": answer
174
  })
175
 
176
- # Cleanup temp file
177
  if file_path and os.path.exists(file_path):
178
  try:
179
  os.remove(file_path)
180
- # Also try to remove temp directory if empty
181
- temp_dir = os.path.dirname(file_path)
182
- if os.path.exists(temp_dir):
183
- try:
184
- os.rmdir(temp_dir)
185
- except:
186
- pass
187
- except Exception as e:
188
- logger.warning(f"Failed to cleanup file {file_path}: {e}")
189
 
190
  df = pd.DataFrame(results)
191
  progress(1.0, desc="Complete!")
192
  return df, answers_for_submission
193
 
194
  except Exception as e:
195
- logger.error(f"Error in run_agent_on_questions: {e}")
196
  return f"Error: {str(e)}", None
197
 
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  def submit_to_leaderboard(username: str, space_url: str, answers_json: str):
200
- """Submit answers to the leaderboard."""
201
  if not username or not space_url or not answers_json:
202
  return "Please fill in all fields and run the agent first."
203
 
@@ -205,207 +211,84 @@ def submit_to_leaderboard(username: str, space_url: str, answers_json: str):
205
  answers = json.loads(answers_json) if isinstance(answers_json, str) else answers_json
206
 
207
  if not isinstance(answers, list) or len(answers) == 0:
208
- return "Error: Answers must be a non-empty list. Please run the agent first."
209
 
210
- # Validate answer format before submission
211
- warnings = []
212
- for ans in answers:
213
- if "task_id" not in ans or "submitted_answer" not in ans:
214
- return "Error: Invalid answer format. Each answer must have 'task_id' and 'submitted_answer'."
215
- is_valid, warning = validate_answer_format(ans.get("submitted_answer", ""))
216
- if not is_valid:
217
- warnings.append(f"Task {ans.get('task_id')}: {warning}")
218
-
219
- # Ensure space URL ends with /tree/main
220
  if not space_url.endswith("/tree/main"):
221
  space_url = space_url.rstrip("/") + "/tree/main"
222
 
223
- # Submit to API
224
  result = submit_answers(username, space_url, answers)
225
-
226
- score = result.get("score", 0)
227
  print(result)
 
228
  correct = result.get("correct_count", 0)
229
  total = result.get("total_attempted", 0)
230
 
231
- warning_text = ""
232
- if warnings:
233
- warning_text = f"\n\n⚠️ **Warnings:**\n" + "\n".join(f"- {w}" for w in warnings[:5])
234
- if len(warnings) > 5:
235
- warning_text += f"\n- ... and {len(warnings) - 5} more warnings"
236
 
237
  return f"""
238
- ## Submission Successful! πŸŽ‰
239
 
240
  **Score:** {score:.1%}
241
  **Correct:** {correct}/{total}
242
 
243
- {'πŸ† **Congratulations!** Your agent scored above 30% and has earned the certificate!' if score > 0.3 else '❌ **Certificate Requirement:** Your agent must score above 30% to earn your certificate. Current score is below the threshold.'}
244
- {warning_text}
245
 
246
- Check the [leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard) to see your ranking!
247
  """
248
- except json.JSONDecodeError as e:
249
- return f"Error: Invalid JSON format. Please run the agent first.\nDetails: {str(e)}"
250
  except Exception as e:
251
  logger.error(f"Submission error: {e}")
252
- return f"Submission error: {str(e)}"
253
 
254
 
255
- def test_single_question(openai_api_key: str):
256
- """Test the agent on a single random question."""
257
- if not openai_api_key:
258
- return "Please provide your OpenAI API key.", "", "", ""
259
-
260
- try:
261
- agent = GAIAAgent(api_key=openai_api_key)
262
- question_data = fetch_random_question()
263
-
264
- if not question_data:
265
- return "Error: Failed to fetch question from API.", "", "", ""
266
-
267
- task_id = question_data.get("task_id", "")
268
- question_text = question_data.get("question", "")
269
-
270
- file_path = None
271
- if question_data.get("file_name"):
272
- file_path = fetch_file(task_id)
273
-
274
- answer = agent.run(question_text, task_id, file_path)
275
-
276
- # Validate answer format
277
- is_valid, warning = validate_answer_format(answer)
278
- validation_status = "βœ“ Valid format" if is_valid else f"⚠️ {warning}"
279
-
280
- # Cleanup temp file
281
- if file_path and os.path.exists(file_path):
282
- try:
283
- os.remove(file_path)
284
- temp_dir = os.path.dirname(file_path)
285
- if os.path.exists(temp_dir):
286
- try:
287
- os.rmdir(temp_dir)
288
- except:
289
- pass
290
- except Exception as e:
291
- logger.warning(f"Failed to cleanup file: {e}")
292
-
293
- return question_text, answer, task_id, validation_status
294
-
295
- except Exception as e:
296
- logger.error(f"Error in test_single_question: {e}")
297
- return f"Error: {str(e)}", "", "", ""
298
-
299
-
300
- # ============ BUILD GRADIO APP ============
301
- with gr.Blocks(title="GAIA Agent - LangGraph", theme=gr.themes.Soft()) as demo:
302
  gr.Markdown("""
303
- # πŸ€– GAIA Benchmark Agent (LangGraph)
304
 
305
- This agent uses **LangGraph** to solve GAIA benchmark questions. It has access to:
306
- - πŸ” Web Search (DuckDuckGo)
307
- - πŸ“š Wikipedia Search
308
- - 🐍 Python Code Execution
309
- - πŸ“„ File Reading (PDF, Text, Excel)
310
- - πŸ”’ Calculator
311
-
312
- ## Instructions
313
- 1. Enter your OpenAI API key
314
- 2. Test with a single question or run on all questions
315
- 3. Submit your answers to the leaderboard
316
  """)
317
 
318
- with gr.Row():
319
- openai_key = gr.Textbox(
320
- label="OpenAI API Key",
321
- type="password",
322
- placeholder="sk-...",
323
- info="Required for GPT-4o"
324
- )
325
 
326
  with gr.Tabs():
327
- with gr.TabItem("πŸ§ͺ Test Single Question"):
328
  test_btn = gr.Button("Fetch & Solve Random Question", variant="primary")
329
- test_question = gr.Textbox(label="Question", lines=5, interactive=False)
330
- test_answer = gr.Textbox(label="Agent's Answer", lines=3, interactive=False)
331
- test_task_id = gr.Textbox(label="Task ID", interactive=False)
332
- test_validation = gr.Textbox(label="Answer Validation", interactive=False)
333
 
334
- test_btn.click(
335
- test_single_question,
336
- inputs=[openai_key],
337
- outputs=[test_question, test_answer, test_task_id, test_validation]
338
- )
339
 
340
- with gr.TabItem("πŸš€ Run Full Benchmark"):
341
- run_btn = gr.Button("Run Agent on All Questions", variant="primary")
342
- results_table = gr.Dataframe(label="Results")
343
  answers_state = gr.State()
344
 
345
- run_btn.click(
346
- run_agent_on_questions,
347
- inputs=[openai_key],
348
- outputs=[results_table, answers_state]
349
- )
350
 
351
- with gr.TabItem("πŸ“€ Submit to Leaderboard"):
352
- gr.Markdown("""
353
- ### Submit Your Results
354
-
355
- After running the full benchmark, fill in your details and submit to the leaderboard.
356
-
357
- **Requirements:**
358
- - Your HuggingFace username
359
- - Your Space URL (must end with `/tree/main`)
360
- - Answers will be auto-filled after running the benchmark
361
- """)
362
 
363
  with gr.Row():
364
- username_input = gr.Textbox(
365
- label="HuggingFace Username",
366
- placeholder="your-username",
367
- info="Your HuggingFace account username"
368
- )
369
- space_url_input = gr.Textbox(
370
- label="Your Space URL",
371
- placeholder="https://huggingface.co/spaces/your-username/your-space",
372
- info="Full URL to your Space (will auto-append /tree/main if needed)"
373
- )
374
 
375
- answers_input = gr.Textbox(
376
- label="Answers JSON (auto-filled after running benchmark)",
377
- lines=10,
378
- placeholder="Run the full benchmark first...",
379
- info="This will be automatically populated after running the benchmark"
380
- )
381
-
382
- submit_btn = gr.Button("Submit to Leaderboard", variant="primary")
383
  submit_result = gr.Markdown()
384
 
385
- # Auto-fill answers when benchmark completes
386
- def format_answers(answers):
387
- if answers:
388
- return json.dumps(answers, indent=2)
389
- return ""
390
-
391
- answers_state.change(format_answers, inputs=[answers_state], outputs=[answers_input])
392
 
393
- submit_btn.click(
394
- submit_to_leaderboard,
395
- inputs=[username_input, space_url_input, answers_input],
396
- outputs=[submit_result]
397
- )
398
 
399
  gr.Markdown("""
400
  ---
401
- ### πŸ”— Links
402
- - [GAIA Benchmark](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
403
- - [Student Leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard)
404
- - [Course Unit 4](https://huggingface.co/learn/agents-course/en/unit4/hands-on)
405
- - [API Documentation](https://agents-course-unit4-scoring.hf.space/docs)
406
  """)
407
 
408
  if __name__ == "__main__":
409
- # For HuggingFace Spaces, use share=False
410
- # For local development, you can use share=True to get a public link
411
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
6
  import json
7
  import logging
8
  from typing import Optional
9
+ from dotenv import load_dotenv
10
 
11
+ load_dotenv()
12
+
13
+ from agent_enhanced import GAIAAgent, is_ollama_available, is_production
14
 
 
15
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
16
 
 
17
  logging.basicConfig(level=logging.INFO)
18
  logger = logging.getLogger(__name__)
19
 
20
 
21
+ def fetch_questions(api_url: str = DEFAULT_API_URL) -> list:
22
+ """Fetch all questions from the GAIA API."""
23
+ for attempt in range(3):
 
24
  try:
25
  response = requests.get(f"{api_url}/questions", timeout=30)
26
  response.raise_for_status()
27
+ questions = response.json()
28
+
29
+ # Print all questions with their task IDs
30
+ print("\n" + "="*80)
31
+ print("ALL QUESTIONS WITH TASK IDs:")
32
+ print("="*80)
33
+ for i, q in enumerate(questions, 1):
34
+ task_id = q.get("task_id", "N/A")
35
+ question_text = q.get("question", "N/A")
36
+ file_name = q.get("file_name", "")
37
+ print(f"\n[{i}] Task ID: {task_id}")
38
+ print(f" Question: {question_text[:200]}{'...' if len(question_text) > 200 else ''}")
39
+ if file_name:
40
+ print(f" File: {file_name}")
41
+ print("\n" + "="*80)
42
+ print(f"Total questions: {len(questions)}")
43
+ print("="*80 + "\n")
44
+
45
+ return questions
46
+ except Exception as e:
47
  logger.warning(f"Attempt {attempt + 1} failed: {e}")
 
 
48
  return []
49
 
50
+
51
+ def fetch_random_question(api_url: str = DEFAULT_API_URL) -> dict:
52
+ """Fetch a random question."""
53
+ for attempt in range(3):
54
  try:
55
  response = requests.get(f"{api_url}/random-question", timeout=30)
56
  response.raise_for_status()
57
  return response.json()
58
+ except Exception as e:
59
  logger.warning(f"Attempt {attempt + 1} failed: {e}")
 
 
60
  return {}
61
 
62
+
63
+ def fetch_file(task_id: str, api_url: str = DEFAULT_API_URL) -> Optional[str]:
64
+ """Fetch file for a task."""
65
+ try:
66
+ response = requests.get(f"{api_url}/files/{task_id}", timeout=30)
67
+ if response.status_code == 200:
68
+ content_disposition = response.headers.get('content-disposition', '')
69
+ filename = f"task_{task_id}_file"
70
+ if 'filename=' in content_disposition:
71
+ filename = content_disposition.split('filename=')[1].strip('"')
72
+
73
+ temp_dir = tempfile.mkdtemp()
74
+ file_path = os.path.join(temp_dir, filename)
75
+
76
+ with open(file_path, 'wb') as f:
77
+ f.write(response.content)
78
+
79
+ logger.info(f"Downloaded: {file_path}")
80
+ return file_path
81
+ elif response.status_code == 404:
82
+ return None
83
+ except Exception as e:
84
+ logger.error(f"File fetch failed: {e}")
 
 
 
 
85
  return None
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
+ def submit_answers(username: str, agent_code: str, answers: list, api_url: str = DEFAULT_API_URL) -> dict:
89
+ """Submit answers to API."""
90
+ payload = {"username": username, "agent_code": agent_code, "answers": answers}
91
+ response = requests.post(f"{api_url}/submit", json=payload, timeout=60)
92
+ response.raise_for_status()
93
+ return response.json()
94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
+ def get_env_status() -> str:
97
+ """Get environment status."""
98
+ if is_production():
99
+ return "☁️ **Production Mode** (HuggingFace Spaces) - Using OpenAI GPT-4o"
100
+ elif is_ollama_available():
101
+ return "🏠 **Local Mode** - Using Ollama"
102
+ elif os.environ.get("OPENAI_API_KEY"):
103
+ return "☁️ **Local + OpenAI** - Using OpenAI GPT-4o"
104
+ else:
105
+ return "⚠️ **No Backend** - Set OPENAI_API_KEY or start Ollama"
106
+
107
+
108
+ def run_agent_on_questions(progress=gr.Progress()):
109
+ """Run agent on all questions."""
110
  try:
111
+ env_info = get_env_status()
112
  progress(0, desc="Initializing agent...")
 
113
 
114
+ agent = GAIAAgent()
115
+
116
+ progress(0.05, desc="Fetching questions...")
117
  questions = fetch_questions()
118
 
119
  if not questions:
120
+ return "Error: Failed to fetch questions.", None
121
 
122
+ total = len(questions)
123
  results = []
124
  answers_for_submission = []
125
 
126
  for i, q in enumerate(questions):
127
+ progress((i + 1) / total, desc=f"Question {i+1}/{total}...")
128
 
129
  task_id = q.get("task_id", "")
130
  question_text = q.get("question", "")
131
 
 
132
  file_path = None
133
  if q.get("file_name"):
 
134
  file_path = fetch_file(task_id)
135
 
 
136
  try:
 
137
  answer = agent.run(question_text, task_id, file_path)
 
 
 
 
 
 
138
  except Exception as e:
139
+ logger.error(f"Error on question {i+1}: {e}")
140
  answer = f"Error: {str(e)}"
141
 
142
  results.append({
143
  "Task ID": task_id,
144
+ "Question": question_text,
145
  "Answer": answer,
146
+ "Status": "βœ“" if answer and not answer.startswith("Error:") and answer != "Unable to determine answer" else "βœ—"
147
  })
148
 
149
  answers_for_submission.append({
 
151
  "submitted_answer": answer
152
  })
153
 
154
+ # Cleanup
155
  if file_path and os.path.exists(file_path):
156
  try:
157
  os.remove(file_path)
158
+ os.rmdir(os.path.dirname(file_path))
159
+ except:
160
+ pass
 
 
 
 
 
 
161
 
162
  df = pd.DataFrame(results)
163
  progress(1.0, desc="Complete!")
164
  return df, answers_for_submission
165
 
166
  except Exception as e:
167
+ logger.error(f"Error: {e}")
168
  return f"Error: {str(e)}", None
169
 
170
 
171
+ def test_single_question():
172
+ """Test on a single random question."""
173
+ try:
174
+ agent = GAIAAgent()
175
+ question_data = fetch_random_question()
176
+
177
+ if not question_data:
178
+ return "Error: Failed to fetch question.", "", "", ""
179
+
180
+ task_id = question_data.get("task_id", "")
181
+ question_text = question_data.get("question", "")
182
+
183
+ file_path = None
184
+ if question_data.get("file_name"):
185
+ file_path = fetch_file(task_id)
186
+
187
+ answer = agent.run(question_text, task_id, file_path)
188
+
189
+ # Cleanup
190
+ if file_path and os.path.exists(file_path):
191
+ try:
192
+ os.remove(file_path)
193
+ os.rmdir(os.path.dirname(file_path))
194
+ except:
195
+ pass
196
+
197
+ status = "βœ“ Valid" if answer and not answer.startswith("Error") else "⚠️ Check answer"
198
+ return question_text, answer, task_id, status
199
+
200
+ except Exception as e:
201
+ logger.error(f"Error: {e}")
202
+ return f"Error: {str(e)}", "", "", ""
203
+
204
+
205
  def submit_to_leaderboard(username: str, space_url: str, answers_json: str):
206
+ """Submit to leaderboard."""
207
  if not username or not space_url or not answers_json:
208
  return "Please fill in all fields and run the agent first."
209
 
 
211
  answers = json.loads(answers_json) if isinstance(answers_json, str) else answers_json
212
 
213
  if not isinstance(answers, list) or len(answers) == 0:
214
+ return "Error: Run the benchmark first."
215
 
 
 
 
 
 
 
 
 
 
 
216
  if not space_url.endswith("/tree/main"):
217
  space_url = space_url.rstrip("/") + "/tree/main"
218
 
 
219
  result = submit_answers(username, space_url, answers)
 
 
220
  print(result)
221
+ score = result.get("score", 0)
222
  correct = result.get("correct_count", 0)
223
  total = result.get("total_attempted", 0)
224
 
225
+ cert_msg = "πŸ† **Congratulations!** Score above 30% - Certificate earned!" if score > 0.3 else "❌ Need >30% for certificate."
 
 
 
 
226
 
227
  return f"""
228
+ ## Submission Results
229
 
230
  **Score:** {score:.1%}
231
  **Correct:** {correct}/{total}
232
 
233
+ {cert_msg}
 
234
 
235
+ [View Leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard)
236
  """
 
 
237
  except Exception as e:
238
  logger.error(f"Submission error: {e}")
239
+ return f"Error: {str(e)}"
240
 
241
 
242
+ # ============ GRADIO APP ============
243
+ with gr.Blocks(title="GAIA Agent", theme=gr.themes.Soft()) as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  gr.Markdown("""
245
+ # πŸ€– GAIA Benchmark Agent
246
 
247
+ **Tools:** πŸ” Web Search | πŸ“š Wikipedia | 🐍 Python | πŸ“„ Files | πŸ”’ Calculator | 🌐 Webpages | πŸ‘οΈ Vision (OpenAI)
 
 
 
 
 
 
 
 
 
 
248
  """)
249
 
250
+ env_status = gr.Markdown(get_env_status())
 
 
 
 
 
 
251
 
252
  with gr.Tabs():
253
+ with gr.TabItem("πŸ§ͺ Test Single"):
254
  test_btn = gr.Button("Fetch & Solve Random Question", variant="primary")
255
+ test_q = gr.Textbox(label="Question", lines=4, interactive=False)
256
+ test_a = gr.Textbox(label="Answer", lines=2, interactive=False)
257
+ test_id = gr.Textbox(label="Task ID", interactive=False)
258
+ test_status = gr.Textbox(label="Status", interactive=False)
259
 
260
+ test_btn.click(test_single_question, outputs=[test_q, test_a, test_id, test_status])
 
 
 
 
261
 
262
+ with gr.TabItem("πŸš€ Full Benchmark"):
263
+ run_btn = gr.Button("Run on All Questions", variant="primary")
264
+ results_df = gr.Dataframe(label="Results")
265
  answers_state = gr.State()
266
 
267
+ run_btn.click(run_agent_on_questions, outputs=[results_df, answers_state])
 
 
 
 
268
 
269
+ with gr.TabItem("πŸ“€ Submit"):
270
+ gr.Markdown("### Submit to Leaderboard")
 
 
 
 
 
 
 
 
 
271
 
272
  with gr.Row():
273
+ username_in = gr.Textbox(label="HF Username", placeholder="your-username")
274
+ space_url_in = gr.Textbox(label="Space URL", placeholder="https://huggingface.co/spaces/you/space")
 
 
 
 
 
 
 
 
275
 
276
+ answers_in = gr.Textbox(label="Answers JSON (auto-filled)", lines=8)
277
+ submit_btn = gr.Button("Submit", variant="primary")
 
 
 
 
 
 
278
  submit_result = gr.Markdown()
279
 
280
+ def format_answers(a):
281
+ return json.dumps(a, indent=2) if a else ""
 
 
 
 
 
282
 
283
+ answers_state.change(format_answers, inputs=[answers_state], outputs=[answers_in])
284
+ submit_btn.click(submit_to_leaderboard, inputs=[username_in, space_url_in, answers_in], outputs=[submit_result])
 
 
 
285
 
286
  gr.Markdown("""
287
  ---
288
+ **Setup:**
289
+ - Local: `ollama serve` + `ollama pull qwen2.5:32b`
290
+ - Production: Set `OPENAI_API_KEY` in `.env` or HF Secrets
 
 
291
  """)
292
 
293
  if __name__ == "__main__":
 
 
294
  demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt CHANGED
@@ -1,20 +1,30 @@
1
- # Core dependencies
2
  gradio>=4.0.0,<5.0.0
3
  requests>=2.31.0,<3.0.0
4
  pandas>=2.0.0,<3.0.0
 
5
 
6
  # LangChain & LangGraph
7
  langgraph>=0.2.0,<1.0.0
8
  langchain>=0.2.0,<1.0.0
9
- langchain-core>=0.2.0,<1.0.0
10
  langchain-openai>=0.1.0,<1.0.0
 
11
  langchain-community>=0.2.0,<1.0.0
12
  langchain-experimental>=0.0.60,<1.0.0
13
 
14
- # Tools dependencies
 
 
 
15
  duckduckgo-search>=6.0.0,<7.0.0
16
  pypdf>=4.0.0,<5.0.0
 
17
  openpyxl>=3.1.0,<4.0.0
 
18
 
19
- # Utilities
20
  python-dotenv>=1.0.0,<2.0.0
 
 
 
 
1
+ # Core
2
  gradio>=4.0.0,<5.0.0
3
  requests>=2.31.0,<3.0.0
4
  pandas>=2.0.0,<3.0.0
5
+ numpy>=1.24.0,<3.0.0
6
 
7
  # LangChain & LangGraph
8
  langgraph>=0.2.0,<1.0.0
9
  langchain>=0.2.0,<1.0.0
10
+ langchain-core>=0.2.0,<0.4.0
11
  langchain-openai>=0.1.0,<1.0.0
12
+ langchain-ollama>=0.1.0,<2.0.0
13
  langchain-community>=0.2.0,<1.0.0
14
  langchain-experimental>=0.0.60,<1.0.0
15
 
16
+ # OpenAI (for GPT-4o + Whisper)
17
+ openai>=1.0.0,<2.0.0
18
+
19
+ # Tools
20
  duckduckgo-search>=6.0.0,<7.0.0
21
  pypdf>=4.0.0,<5.0.0
22
+ pdfplumber>=0.10.0,<1.0.0
23
  openpyxl>=3.1.0,<4.0.0
24
+ beautifulsoup4>=4.12.0,<5.0.0
25
 
26
+ # Utils
27
  python-dotenv>=1.0.0,<2.0.0
28
+
29
+ # Audio Transcription (for Ollama)
30
+ faster-whisper>=0.10.0