jebaponselvasingh committed on
Commit
93b72dc
·
1 Parent(s): 6190a63

Add application file

Browse files
Files changed (5) hide show
  1. .env +1 -0
  2. agent_enhanced.py +564 -0
  3. app.py +432 -0
  4. flagged/log.csv +2 -0
  5. requirements.txt +20 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY="<REDACTED — a live secret key was committed here; revoke it immediately and supply the key via environment variables / Space secrets instead of a tracked .env file>"
agent_enhanced.py ADDED
@@ -0,0 +1,564 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enhanced GAIA Agent with LangGraph
3
+ Separate module for cleaner architecture and easier customization
4
+ """
5
+
6
+ import os
7
+ import re
8
+ import json
9
+ import requests
10
+ import tempfile
11
+ from typing import TypedDict, Annotated, Sequence, Literal, Any
12
+ import operator
13
+
14
+ from langgraph.graph import StateGraph, END
15
+ from langgraph.prebuilt import ToolNode
16
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage, BaseMessage
17
+ from langchain_core.tools import tool
18
+ from langchain_openai import ChatOpenAI
19
+ from langchain_community.tools import DuckDuckGoSearchResults
20
+ from langchain_experimental.utilities import PythonREPL
21
+ import pandas as pd
22
+
23
+
24
# ============ STATE DEFINITION ============
class AgentState(TypedDict):
    """State maintained throughout the agent's execution."""
    # Conversation history; operator.add makes LangGraph append (not replace)
    # the messages returned by each node.
    messages: Annotated[Sequence[BaseMessage], operator.add]
    # Identifier of the GAIA task being solved (may be empty).
    task_id: str
    # Local path of the task's downloaded attachment, if any.
    file_path: str | None
    # NOTE(review): never written by any node in this module — appears unused.
    file_content: str | None
    # Count of agent-node invocations; enforces the max_iterations cap.
    iteration_count: int
    # Cleaned answer produced by the extract_answer node.
    final_answer: str | None
33
+
34
+
35
# ============ TOOL DEFINITIONS ============
@tool
def web_search(query: str) -> str:
    """
    Search the web using DuckDuckGo for current information.
    Use this for questions about recent events, facts, statistics, or any information
    that might have changed or that you're uncertain about.

    Args:
        query: The search query string

    Returns:
        Search results with relevant snippets, or an error message on failure.
    """
    import logging

    # Suppress non-critical errors from DuckDuckGo's internal engines
    # (Some engines like grokipedia may fail due to DNS issues, but others work fine)
    ddgs_logger = logging.getLogger("ddgs.ddgs")
    primp_logger = logging.getLogger("primp")

    # Save the exact current levels (0 is logging.NOTSET, so no special-casing
    # is needed when saving).
    ddgs_original = ddgs_logger.level
    primp_original = primp_logger.level

    # Suppress INFO-level logs (which include non-critical engine errors)
    ddgs_logger.setLevel(logging.WARNING)
    primp_logger.setLevel(logging.WARNING)

    try:
        search = DuckDuckGoSearchResults(max_results=5, output_format="list")
        results = search.run(query)

        if isinstance(results, list):
            formatted = []
            for r in results:
                if isinstance(r, dict):
                    formatted.append(
                        f"Title: {r.get('title', 'N/A')}\nSnippet: {r.get('snippet', 'N/A')}\nLink: {r.get('link', 'N/A')}"
                    )
                else:
                    formatted.append(str(r))
            return "\n\n---\n\n".join(formatted)
        return str(results)
    except Exception as e:
        return f"Search failed: {str(e)}. Try a different query or approach."
    finally:
        # Bug fix: the original only restored levels that were not NOTSET, so
        # loggers that started at NOTSET were left permanently at WARNING.
        # Restoring unconditionally in a finally block is correct for every
        # starting level and removes the duplicated restore code that was
        # repeated on both the success and exception paths.
        ddgs_logger.setLevel(ddgs_original)
        primp_logger.setLevel(primp_original)
90
+
91
+
92
@tool
def python_executor(code: str) -> str:
    """
    Execute Python code for calculations, data analysis, or any computational task.
    You have access to standard libraries: math, statistics, datetime, json, re, collections.

    Args:
        code: Python code to execute. Print statements will show in output.

    Returns:
        The output/result of the code execution
    """
    # Prelude injected ahead of the user code so common modules are in scope.
    prelude = (
        "import math",
        "import statistics",
        "import datetime",
        "import json",
        "import re",
        "from collections import Counter, defaultdict",
    )
    try:
        sandbox = PythonREPL()
        script = "\n" + "\n".join(prelude) + "\n" + code
        output = sandbox.run(script)
        if output:
            return output.strip()
        return "Code executed successfully with no output. Add print() to see results."
    except Exception as e:
        return f"Execution error: {str(e)}. Please fix the code and try again."
119
+
120
+
121
@tool
def read_file(file_path: str) -> str:
    """
    Read and extract content from various file types.
    Supports: PDF, TXT, MD, CSV, JSON, XLSX, XLS, PY, and other text files.

    Args:
        file_path: Path to the file to read

    Returns:
        The content of the file as a string
    """
    try:
        if not os.path.exists(file_path):
            return f"Error: File not found at {file_path}"

        lowered = file_path.lower()

        if lowered.endswith('.pdf'):
            # Imported lazily so the dependency is only needed for PDFs.
            from langchain_community.document_loaders import PyPDFLoader
            pages = PyPDFLoader(file_path).load()
            body = "\n\n--- Page Break ---\n\n".join(p.page_content for p in pages)
            return f"PDF Content ({len(pages)} pages):\n{body}"

        if lowered.endswith(('.xlsx', '.xls')):
            # sheet_name=None loads every sheet as a dict of DataFrames.
            sheets = pd.read_excel(file_path, sheet_name=None)
            rendered = [
                f"=== Sheet: {name} ===\n{frame.to_string()}"
                for name, frame in sheets.items()
            ]
            return "\n\n".join(rendered)

        if lowered.endswith('.csv'):
            frame = pd.read_csv(file_path)
            return f"CSV Data ({len(frame)} rows):\n{frame.to_string()}"

        if lowered.endswith('.json'):
            with open(file_path, 'r', encoding='utf-8') as fh:
                parsed = json.load(fh)
            return f"JSON Content:\n{json.dumps(parsed, indent=2)}"

        # Anything else is treated as plain text; undecodable bytes are dropped.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as fh:
            return f"File Content:\n{fh.read()}"

    except Exception as e:
        return f"Error reading file: {str(e)}"
169
+
170
+
171
@tool
def calculator(expression: str) -> str:
    """
    Evaluate a mathematical expression safely.
    Supports: basic arithmetic, trigonometry, logarithms, exponents, etc.

    Args:
        expression: Mathematical expression (e.g., "sqrt(16) + log(100, 10)")

    Returns:
        The numerical result as a string
    """
    try:
        import math

        # Whitelisted callables/constants; eval runs with empty builtins so
        # only these names are reachable from the expression.
        allowed = {
            'abs': abs, 'round': round, 'min': min, 'max': max,
            'sum': sum, 'pow': pow, 'len': len,
            'sqrt': math.sqrt, 'log': math.log, 'log10': math.log10,
            'log2': math.log2, 'exp': math.exp,
            'sin': math.sin, 'cos': math.cos, 'tan': math.tan,
            'asin': math.asin, 'acos': math.acos, 'atan': math.atan,
            'sinh': math.sinh, 'cosh': math.cosh, 'tanh': math.tanh,
            'ceil': math.ceil, 'floor': math.floor,
            'pi': math.pi, 'e': math.e,
            'factorial': math.factorial, 'gcd': math.gcd,
            'degrees': math.degrees, 'radians': math.radians,
        }

        value = eval(expression, {"__builtins__": {}}, allowed)

        # Integral floats render without a decimal point; other floats use
        # %g-style formatting to drop trailing zeros.
        if isinstance(value, float):
            return str(int(value)) if value.is_integer() else f"{value:.10g}"
        return str(value)

    except Exception as e:
        return f"Calculation error: {str(e)}. Check your expression syntax."
212
+
213
+
214
@tool
def wikipedia_search(query: str) -> str:
    """
    Search Wikipedia for factual information about a specific topic.
    Best for: historical facts, biographies, scientific concepts, definitions.

    Args:
        query: The topic to search for on Wikipedia

    Returns:
        Summary and key information from relevant Wikipedia articles
    """
    try:
        import urllib.parse

        # Step 1: find candidate articles via the MediaWiki search API.
        search_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={urllib.parse.quote(query)}&format=json&srlimit=3"
        data = requests.get(search_url, timeout=10).json()

        if 'query' not in data or 'search' not in data['query'] or not data['query']['search']:
            return f"No Wikipedia articles found for '{query}'"

        # Step 2: fetch the intro extract of the top-ranked article.
        top_title = data['query']['search'][0]['title']
        content_url = f"https://en.wikipedia.org/w/api.php?action=query&prop=extracts&exintro=true&explaintext=true&titles={urllib.parse.quote(top_title)}&format=json"
        content_data = requests.get(content_url, timeout=10).json()

        # Page id "-1" marks a missing page in MediaWiki responses.
        for page_id, page in content_data.get('query', {}).get('pages', {}).items():
            if page_id != '-1':
                title = page.get('title', '')
                extract = page.get('extract', 'No content available')
                return f"Wikipedia: {title}\n\n{extract[:2000]}"

        return "Could not retrieve article content."

    except Exception as e:
        return f"Wikipedia search failed: {str(e)}"
255
+
256
+
257
@tool
def analyze_image(image_path: str, question: str) -> str:
    """
    Analyze an image file and answer questions about it.
    Note: This is a placeholder - implement with vision model if needed.

    Args:
        image_path: Path to the image file
        question: What to analyze or find in the image

    Returns:
        Description or analysis of the image
    """
    # Stub: wire in a vision-capable model (e.g. GPT-4V) here if image tasks
    # need real support; this tool is not registered in TOOLS.
    placeholder = f"Image analysis not implemented. File: {image_path}, Question: {question}"
    return placeholder
272
+
273
+
274
# Collect all tools
# NOTE(review): analyze_image is omitted — presumably deliberate since it is
# only a placeholder, but confirm.
TOOLS = [web_search, python_executor, read_file, calculator, wikipedia_search]
276
+
277
+
278
# ============ SYSTEM PROMPT ============
# NOTE(review): the βœ…/❌ glyphs below look mojibake-encoded (likely ✅/❌ in
# the author's original); harmless to the LLM but worth re-saving as UTF-8.
SYSTEM_PROMPT = """You are an expert AI assistant designed to solve GAIA benchmark questions with maximum accuracy.

## Your Mission
Provide PRECISE, EXACT answers. The benchmark uses EXACT STRING MATCHING, so your final answer must match the ground truth character-for-character.

## Critical Answer Formatting Rules (MUST FOLLOW)

**DO NOT include "FINAL ANSWER:" or any prefix - just the answer itself.**

1. **Numbers**: Give just the number.
- βœ… CORRECT: "42"
- ❌ WRONG: "The answer is 42", "42 units", "Answer: 42"

2. **Names**: Exact spelling as found in sources. Check Wikipedia/official sources for correct spelling, capitalization, and punctuation.
- βœ… CORRECT: "John Smith"
- ❌ WRONG: "john smith", "John smith"

3. **Lists**: Comma-separated, NO spaces after commas.
- βœ… CORRECT: "apple,banana,cherry"
- ❌ WRONG: "apple, banana, cherry", "apple,banana, cherry"

4. **Dates**: Use the format specified in the question, or YYYY-MM-DD if not specified.
- βœ… CORRECT: "2024-01-15" or "January 15, 2024" (if question asks for that format)
- ❌ WRONG: "1/15/2024" (unless question asks for it)

5. **Yes/No**: Just "Yes" or "No" (capitalized, no period).
- βœ… CORRECT: "Yes"
- ❌ WRONG: "yes", "Yes.", "The answer is Yes"

6. **Counts**: Just the number.
- βœ… CORRECT: "5"
- ❌ WRONG: "5 items", "five", "There are 5"

7. **No explanations**: Your final response must contain ONLY the answer, nothing else.
- βœ… CORRECT: "Paris"
- ❌ WRONG: "The answer is Paris because..."

## Problem-Solving Strategy
1. **Understand**: Read the question carefully. What exactly is being asked? Note any specific format requirements.
2. **Check for File**: If a file is mentioned or available, ALWAYS read it FIRST - the answer is likely there.
3. **Plan**: What information do I need? Which tools should I use?
4. **Execute**: Use tools systematically. Verify information from multiple sources when possible.
5. **Verify**: Double-check your answer format. Does it match the question's requirements? Is spelling correct?
6. **Respond**: Give ONLY the final answer, no prefixes, no explanations.

## Available Tools
- `read_file`: Read PDFs, spreadsheets, text files - USE THIS FIRST if a file is available
- `web_search`: Current information, recent events, facts
- `wikipedia_search`: Historical facts, biographies, definitions
- `python_executor`: Calculations, data processing, analysis
- `calculator`: Quick mathematical calculations

## Tool Usage Priority
1. **If file available**: Read file FIRST before doing anything else
2. **For calculations**: Use python_executor for complex math, calculator for simple expressions
3. **For facts**: Use wikipedia_search for established facts, web_search for current/recent information
4. **Cross-reference**: When possible, verify important facts from multiple sources

## Critical Reminders
- NEVER include "FINAL ANSWER:" or any prefix in your response
- NEVER add explanations or context to your final answer
- ALWAYS verify spelling, capitalization, and formatting
- ALWAYS read files first if they are available
- If uncertain about format, look for clues in the question itself
- Never guess - use tools to find accurate information

Remember: Your final message must contain ONLY the answer, nothing else. The scoring system uses exact string matching."""
346
+
347
+
348
# ============ LANGGRAPH AGENT ============
class GAIAAgent:
    """LangGraph-based agent for GAIA benchmark.

    Workflow: agent node (LLM with bound tools) -> tools node -> agent ...
    until the model replies without tool calls or the iteration cap is hit,
    then an extract node normalizes the reply for GAIA's exact-string scoring.

    NOTE: annotations naming project/third-party types are written as strings
    so importing this class does not eagerly resolve them; behavior unchanged.
    """

    def __init__(
        self,
        model_name: str = "gpt-4o",
        api_key: str = None,
        temperature: float = 0,
        max_iterations: int = 15
    ):
        """
        Initialize the GAIA agent.

        Args:
            model_name: OpenAI model to use
            api_key: OpenAI API key (or set OPENAI_API_KEY env var)
            temperature: Model temperature (0 for deterministic)
            max_iterations: Maximum tool-use iterations
        """
        self.model_name = model_name
        self.max_iterations = max_iterations

        self.llm = ChatOpenAI(
            model=model_name,
            temperature=temperature,
            api_key=api_key or os.environ.get("OPENAI_API_KEY")
        )
        self.llm_with_tools = self.llm.bind_tools(TOOLS)
        self.graph = self._build_graph()

    def _build_graph(self) -> "StateGraph":
        """Construct the LangGraph workflow (agent <-> tools loop + extract)."""
        workflow = StateGraph(AgentState)

        workflow.add_node("agent", self._agent_node)
        workflow.add_node("tools", ToolNode(TOOLS))
        workflow.add_node("extract_answer", self._extract_answer_node)

        workflow.set_entry_point("agent")

        # After the agent speaks, either run the requested tools or finish.
        workflow.add_conditional_edges(
            "agent",
            self._route_agent_output,
            {
                "tools": "tools",
                "end": "extract_answer"
            }
        )
        workflow.add_edge("tools", "agent")
        workflow.add_edge("extract_answer", END)

        return workflow.compile()

    def _agent_node(self, state: "AgentState") -> dict:
        """Invoke the LLM on the running conversation and count the iteration."""
        messages = state["messages"]
        iteration = state.get("iteration_count", 0)

        # Nudge the model toward a bare final answer as the cap approaches:
        # a hard warning in the last 3 iterations, a softer reminder earlier.
        if iteration >= self.max_iterations - 3:
            warning_msg = "WARNING: Approaching iteration limit. Please provide your final answer now. Remember: just the answer, no prefix."
            messages = list(messages) + [SystemMessage(content=warning_msg)]
        elif iteration >= self.max_iterations - 5:
            reminder_msg = "Reminder: When you're ready to answer, provide ONLY the final answer with no prefix like 'FINAL ANSWER:' or 'The answer is:'"
            messages = list(messages) + [SystemMessage(content=reminder_msg)]

        try:
            response = self.llm_with_tools.invoke(messages)
        except Exception as e:
            # Graceful degradation: feed the failure back into the
            # conversation instead of crashing the graph.
            error_msg = AIMessage(content=f"Error during reasoning: {str(e)}. Please try a different approach or provide your best answer.")
            return {
                "messages": [error_msg],
                "iteration_count": iteration + 1
            }

        return {
            "messages": [response],
            "iteration_count": iteration + 1
        }

    def _route_agent_output(self, state: "AgentState") -> Literal["tools", "end"]:
        """Route to the tools node when tool calls are pending, else finish."""
        last_message = state["messages"][-1]
        iteration = state.get("iteration_count", 0)

        # Force end if max iterations reached
        if iteration >= self.max_iterations:
            return "end"

        # Check if agent wants to use tools
        if hasattr(last_message, "tool_calls") and last_message.tool_calls:
            return "tools"

        return "end"

    def _extract_answer_node(self, state: "AgentState") -> dict:
        """Normalize the last message's content into `final_answer`."""
        last_message = state["messages"][-1]
        content = last_message.content if hasattr(last_message, "content") else str(last_message)

        return {"final_answer": self._clean_answer(content)}

    def _clean_answer(self, raw_answer: str) -> str:
        """Strip prefixes/quotes/markdown so the answer survives exact matching.

        Bug fixes vs. the original implementation:
        - bare words ("answer", "result", "solution") are stripped only when
          followed by a separator, so answers that merely begin with one of
          them (e.g. "Answers to Job") are left intact;
        - "it is" / "it's" / "that is" / "that's" are no longer stripped —
          they can legitimately start an answer ("It's a Wonderful Life");
        - prefixes are matched longest-first and re-applied until stable,
          and the duplicated list entries are gone.
        """
        answer = raw_answer.strip()

        # Longest-first so e.g. "the final answer is" beats "final answer".
        prefixes = [
            "the final answer is",
            "the answer is",
            "the solution is",
            "final answer",
            "answer is",
            "solution is",
            "result is",
            "solution",
            "answer",
            "result",
        ]

        stripped = True
        while stripped and answer:
            stripped = False
            lowered = answer.lower()
            for prefix in prefixes:
                if not lowered.startswith(prefix):
                    continue
                remainder = answer[len(prefix):]
                # Whole-word check: require a separator (or end of string)
                # immediately after the prefix.
                if remainder and remainder[0] not in " \t:-":
                    continue
                answer = remainder.lstrip(" \t:-").strip()
                stripped = True
                break

        # Unwrap quotes that enclose the whole answer.
        if len(answer) >= 2 and answer[0] == answer[-1] and answer[0] in ('"', "'"):
            answer = answer[1:-1].strip()

        # Trailing punctuation is noise on single-token answers ("Yes." -> "Yes").
        if answer and ' ' not in answer:
            answer = answer.rstrip('.,;:')

        # Collapse internal whitespace.
        answer = ' '.join(answer.split())

        # Unwrap markdown emphasis markers.
        if answer.startswith('**') and answer.endswith('**'):
            answer = answer[2:-2]
        if answer.startswith('*') and answer.endswith('*'):
            answer = answer[1:-1]

        return answer.strip()

    def run(self, question: str, task_id: str = "", file_path: str = None) -> str:
        """
        Run the agent on a question.

        Args:
            question: The GAIA question to answer
            task_id: Optional task identifier
            file_path: Optional path to associated file

        Returns:
            The agent's final answer
        """
        # Prepare the user message, front-loading the attached file if any.
        user_content = question
        if file_path and os.path.exists(file_path):
            # Strongly emphasize reading the file first
            user_content = f"[IMPORTANT: A file is available at {file_path}]\n\nYou MUST read this file FIRST using the read_file tool before attempting to answer. The answer is very likely contained in this file.\n\nQuestion: {question}"

        initial_state: "AgentState" = {
            "messages": [
                SystemMessage(content=SYSTEM_PROMPT),
                HumanMessage(content=user_content)
            ],
            "task_id": task_id,
            "file_path": file_path,
            "file_content": None,
            "iteration_count": 0,
            "final_answer": None
        }

        try:
            # Each iteration is an agent step plus a possible tools step,
            # hence the 2x (+ slack) recursion limit.
            final_state = self.graph.invoke(
                initial_state,
                {"recursion_limit": self.max_iterations * 2 + 5}
            )
            answer = final_state.get("final_answer", "Unable to determine answer")

            # Fall back to the raw last message when extraction produced
            # nothing useful.
            if not answer or answer.startswith("Agent error:") or answer.startswith("Unable to determine"):
                if final_state.get("messages"):
                    last_msg = final_state["messages"][-1]
                    if hasattr(last_msg, "content") and last_msg.content:
                        answer = self._clean_answer(last_msg.content)

            return answer if answer else "Unable to determine answer"
        except Exception as e:
            # Log for debugging but return a clean error string to the caller.
            import logging
            logging.error(f"Agent execution error: {str(e)}")
            return f"Agent error: {str(e)}"
554
+
555
+
556
# ============ UTILITY FUNCTIONS ============
def create_agent(api_key: str = None, model: str = "gpt-4o") -> GAIAAgent:
    """Factory function to create a configured agent."""
    # Deterministic settings (temperature 0) with the standard iteration cap.
    settings = {
        "model_name": model,
        "api_key": api_key,
        "temperature": 0,
        "max_iterations": 15,
    }
    return GAIAAgent(**settings)
app.py ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import requests
4
+ import pandas as pd
5
+ import tempfile
6
+ import json
7
+ import logging
8
+ from typing import Optional
9
+
10
+ # Import the optimized agent from the separate module
11
+ from agent_enhanced import GAIAAgent
12
+
13
+ # ============ CONFIGURATION ============
14
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
15
+
16
+ # Set up logging
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
# ============ API INTERACTION ============
def fetch_questions(api_url: str = DEFAULT_API_URL, max_retries: int = 3) -> list:
    """Fetch all questions from the GAIA API with retry logic.

    Retries up to ``max_retries`` times on request failures; the last
    failure is re-raised so the caller can surface it.
    """
    endpoint = f"{api_url}/questions"
    last_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            resp = requests.get(endpoint, timeout=30)
            resp.raise_for_status()
            return resp.json()
        except requests.exceptions.RequestException as e:
            logger.warning(f"Attempt {attempt + 1} failed: {e}")
            if attempt == last_attempt:
                raise
    return []  # unreachable in practice; keeps the annotated return type honest
34
+
35
def fetch_random_question(api_url: str = DEFAULT_API_URL, max_retries: int = 3) -> dict:
    """Fetch a random question from the GAIA API with retry logic.

    Retries up to ``max_retries`` times; the final failure is re-raised.
    """
    endpoint = f"{api_url}/random-question"
    last_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            resp = requests.get(endpoint, timeout=30)
            resp.raise_for_status()
            return resp.json()
        except requests.exceptions.RequestException as e:
            logger.warning(f"Attempt {attempt + 1} failed: {e}")
            if attempt == last_attempt:
                raise
    return {}  # unreachable in practice; satisfies the annotated return type
47
+
48
def fetch_file(task_id: str, api_url: str = DEFAULT_API_URL, max_retries: int = 3) -> Optional[str]:
    """Fetch the file attached to a task, saving it into a fresh temp dir.

    Returns:
        The local file path, None when the task has no file (HTTP 404), or
        None after all retries fail.

    Bug fixes vs. the original:
    - non-200/non-404 statuses (e.g. 500) used to fall through silently and
      loop; they now raise via raise_for_status so the retry/log path runs;
    - the Content-Disposition filename is sanitized (parameters stripped,
      basename only) so a hostile or odd header cannot escape the temp dir.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(f"{api_url}/files/{task_id}", timeout=30)
            if response.status_code == 404:
                logger.info(f"No file found for task {task_id}")
                return None
            # Surface any other error status instead of silently looping.
            response.raise_for_status()

            # Derive a safe filename from the Content-Disposition header.
            content_disposition = response.headers.get('content-disposition', '')
            filename = f"task_{task_id}_file"
            if 'filename=' in content_disposition:
                raw = content_disposition.split('filename=')[1]
                raw = raw.split(';')[0].strip().strip('"')
                candidate = os.path.basename(raw)  # drop any path components
                if candidate:
                    filename = candidate

            temp_dir = tempfile.mkdtemp()
            file_path = os.path.join(temp_dir, filename)

            with open(file_path, 'wb') as f:
                f.write(response.content)

            logger.info(f"Downloaded file: {file_path}")
            return file_path
        except requests.exceptions.RequestException as e:
            logger.warning(f"File fetch attempt {attempt + 1} failed: {e}")
            if attempt == max_retries - 1:
                logger.error(f"Failed to fetch file for task {task_id}: {e}")
    return None
76
+
77
def submit_answers(username: str, agent_code: str, answers: list, api_url: str = DEFAULT_API_URL, max_retries: int = 3) -> dict:
    """Submit answers to the GAIA API with retry logic.

    Retries the POST up to ``max_retries`` times; the last failure is
    re-raised so the caller can report it.
    """
    payload = {
        "username": username,
        "agent_code": agent_code,
        "answers": answers,
    }

    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            resp = requests.post(f"{api_url}/submit", json=payload, timeout=60)
            resp.raise_for_status()
            return resp.json()
        except requests.exceptions.RequestException as e:
            logger.warning(f"Submission attempt {attempt + 1} failed: {e}")
            if attempt == final_attempt:
                raise
    return {}  # unreachable in practice; satisfies the annotated return type
95
+
96
+
97
# ============ ANSWER VALIDATION ============
def validate_answer_format(answer: str) -> tuple[bool, str]:
    """Validate answer format and return (is_valid, warning_message)."""
    if not answer or not answer.strip():
        return False, "Warning: Answer is empty"

    # Flag prefixes the scorer would penalize; match case-insensitively but
    # report the canonical spelling from this list.
    lowered = answer.lower()
    for prefix in ("FINAL ANSWER:", "The answer is:", "Answer:", "final answer:"):
        if lowered.startswith(prefix.lower()):
            return False, f"Warning: Answer contains prefix '{prefix}' which will be removed. Consider removing it."

    # Heuristic: multiple sentences or causal connectives suggest the answer
    # carries an explanation rather than a bare value.
    looks_explained = answer.count('.') > 1 or 'because' in answer or 'since' in answer
    if looks_explained:
        return False, "Warning: Answer may contain explanations. Only the answer should be submitted."

    return True, ""
115
+
116
# ============ GRADIO INTERFACE ============
# NOTE(review): gr.Progress() as a default is the Gradio-documented idiom for
# progress injection, but it IS evaluated once at import time — confirm that
# is intended if this function is ever called outside a Gradio event.
def run_agent_on_questions(openai_api_key: str, progress=gr.Progress()):
    """Run the agent on all GAIA questions.

    Returns a (DataFrame, answers-list) pair on success, or an
    (error-string, None) pair on failure — callers must handle both shapes.
    """
    if not openai_api_key:
        return "Please provide your OpenAI API key.", None

    try:
        # Initialize agent
        progress(0, desc="Initializing agent...")
        agent = GAIAAgent(api_key=openai_api_key)

        # Fetch questions
        progress(0.05, desc="Fetching questions from API...")
        questions = fetch_questions()

        if not questions:
            return "Error: Failed to fetch questions from API. Please try again.", None

        total_questions = len(questions)
        results = []
        answers_for_submission = []

        for i, q in enumerate(questions):
            # NOTE(review): (i + 1)/total is reported BEFORE the work for item
            # i, then (i + 0.5) and (i + 0.7) below move the bar backwards —
            # the fractions look swapped; confirm intended ordering.
            progress((i + 1) / total_questions, desc=f"Processing question {i+1}/{total_questions}...")

            task_id = q.get("task_id", "")
            question_text = q.get("question", "")

            # Check if there's an associated file
            file_path = None
            if q.get("file_name"):
                progress((i + 0.5) / total_questions, desc=f"Downloading file for question {i+1}...")
                file_path = fetch_file(task_id)

            # Run agent
            try:
                progress((i + 0.7) / total_questions, desc=f"Agent reasoning for question {i+1}...")
                answer = agent.run(question_text, task_id, file_path)

                # Validate answer format (warn only; the answer is still kept).
                is_valid, warning = validate_answer_format(answer)
                if not is_valid:
                    logger.warning(f"Question {i+1} ({task_id}): {warning}")

            except Exception as e:
                # A per-question failure becomes an "Error: ..." answer rather
                # than aborting the whole run.
                logger.error(f"Error processing question {i+1} ({task_id}): {e}")
                answer = f"Error: {str(e)}"

            # NOTE(review): the βœ“/βœ— status glyphs below appear mojibake-encoded
            # (likely ✓/✗ originally); they are display-only.
            results.append({
                "Task ID": task_id,
                "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
                "Answer": answer,
                "Status": "βœ“" if answer and not answer.startswith("Error:") else "βœ—"
            })

            answers_for_submission.append({
                "task_id": task_id,
                "submitted_answer": answer
            })

            # Cleanup temp file (best-effort: failures are logged, not raised).
            if file_path and os.path.exists(file_path):
                try:
                    os.remove(file_path)
                    # Also try to remove temp directory if empty
                    temp_dir = os.path.dirname(file_path)
                    if os.path.exists(temp_dir):
                        try:
                            os.rmdir(temp_dir)
                        except:
                            # Deliberate best-effort: a non-empty dir is fine.
                            pass
                except Exception as e:
                    logger.warning(f"Failed to cleanup file {file_path}: {e}")

        df = pd.DataFrame(results)
        progress(1.0, desc="Complete!")
        return df, answers_for_submission

    except Exception as e:
        logger.error(f"Error in run_agent_on_questions: {e}")
        return f"Error: {str(e)}", None
197
+
198
+
199
def submit_to_leaderboard(username: str, space_url: str, answers_json: str):
    """Submit answers to the leaderboard.

    Accepts the answers either as a JSON string or as an already-parsed list
    of {"task_id", "submitted_answer"} dicts; returns a Markdown status
    message for display in the Gradio UI.
    """
    if not username or not space_url or not answers_json:
        return "Please fill in all fields and run the agent first."

    try:
        answers = json.loads(answers_json) if isinstance(answers_json, str) else answers_json

        if not isinstance(answers, list) or len(answers) == 0:
            return "Error: Answers must be a non-empty list. Please run the agent first."

        # Validate answer format before submission (collect warnings; only a
        # missing required key aborts the submission).
        warnings = []
        for ans in answers:
            if "task_id" not in ans or "submitted_answer" not in ans:
                return "Error: Invalid answer format. Each answer must have 'task_id' and 'submitted_answer'."
            is_valid, warning = validate_answer_format(ans.get("submitted_answer", ""))
            if not is_valid:
                warnings.append(f"Task {ans.get('task_id')}: {warning}")

        # Ensure space URL ends with /tree/main
        if not space_url.endswith("/tree/main"):
            space_url = space_url.rstrip("/") + "/tree/main"

        # Submit to API
        result = submit_answers(username, space_url, answers)

        # NOTE(review): the {score:.1%} format and the >= 0.3 threshold below
        # both assume the API returns `score` as a 0–1 fraction; if the API
        # returns a percentage (e.g. 30.0), both are wrong — confirm.
        score = result.get("score", 0)
        correct = result.get("correct_count", 0)
        total = result.get("total_attempted", 0)

        # Show at most five per-task warnings to keep the message compact.
        warning_text = ""
        if warnings:
            warning_text = f"\n\n⚠️ **Warnings:**\n" + "\n".join(f"- {w}" for w in warnings[:5])
            if len(warnings) > 5:
                warning_text += f"\n- ... and {len(warnings) - 5} more warnings"

        # NOTE(review): the emoji below appear mojibake-encoded (πŸŽ‰ etc.,
        # likely 🎉/🏆/📈 originally); display-only.
        return f"""
## Submission Successful! πŸŽ‰

**Score:** {score:.1%}
**Correct:** {correct}/{total}

{'πŸ† Congratulations! You passed the 30% threshold!' if score >= 0.3 else 'πŸ“ˆ Keep improving! You need 30% to earn your certificate.'}
{warning_text}

Check the [leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard) to see your ranking!
"""
    except json.JSONDecodeError as e:
        return f"Error: Invalid JSON format. Please run the agent first.\nDetails: {str(e)}"
    except Exception as e:
        logger.error(f"Submission error: {e}")
        return f"Submission error: {str(e)}"
252
+
253
+
254
+ def test_single_question(openai_api_key: str):
255
+ """Test the agent on a single random question."""
256
+ if not openai_api_key:
257
+ return "Please provide your OpenAI API key.", "", "", ""
258
+
259
+ try:
260
+ agent = GAIAAgent(api_key=openai_api_key)
261
+ question_data = fetch_random_question()
262
+
263
+ if not question_data:
264
+ return "Error: Failed to fetch question from API.", "", "", ""
265
+
266
+ task_id = question_data.get("task_id", "")
267
+ question_text = question_data.get("question", "")
268
+
269
+ file_path = None
270
+ if question_data.get("file_name"):
271
+ file_path = fetch_file(task_id)
272
+
273
+ answer = agent.run(question_text, task_id, file_path)
274
+
275
+ # Validate answer format
276
+ is_valid, warning = validate_answer_format(answer)
277
+ validation_status = "βœ“ Valid format" if is_valid else f"⚠️ {warning}"
278
+
279
+ # Cleanup temp file
280
+ if file_path and os.path.exists(file_path):
281
+ try:
282
+ os.remove(file_path)
283
+ temp_dir = os.path.dirname(file_path)
284
+ if os.path.exists(temp_dir):
285
+ try:
286
+ os.rmdir(temp_dir)
287
+ except:
288
+ pass
289
+ except Exception as e:
290
+ logger.warning(f"Failed to cleanup file: {e}")
291
+
292
+ return question_text, answer, task_id, validation_status
293
+
294
+ except Exception as e:
295
+ logger.error(f"Error in test_single_question: {e}")
296
+ return f"Error: {str(e)}", "", "", ""
297
+
298
+
299
# ============ BUILD GRADIO APP ============
# Top-level UI definition: one shared API-key field plus three tabs
# (single-question smoke test, full benchmark run, leaderboard submission).
# `demo` is launched from the __main__ guard at the bottom of the file.
with gr.Blocks(title="GAIA Agent - LangGraph", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # πŸ€– GAIA Benchmark Agent (LangGraph)

    This agent uses **LangGraph** to solve GAIA benchmark questions. It has access to:
    - πŸ” Web Search (DuckDuckGo)
    - πŸ“š Wikipedia Search
    - 🐍 Python Code Execution
    - πŸ“„ File Reading (PDF, Text, Excel)
    - πŸ”’ Calculator

    ## Instructions
    1. Enter your OpenAI API key
    2. Test with a single question or run on all questions
    3. Submit your answers to the leaderboard
    """)

    # Single shared key input used by both the test and benchmark tabs.
    with gr.Row():
        openai_key = gr.Textbox(
            label="OpenAI API Key",
            type="password",
            placeholder="sk-...",
            info="Required for GPT-4o"
        )

    with gr.Tabs():
        # Tab 1: run the agent on one random question as a quick sanity check.
        with gr.TabItem("πŸ§ͺ Test Single Question"):
            test_btn = gr.Button("Fetch & Solve Random Question", variant="primary")
            test_question = gr.Textbox(label="Question", lines=5, interactive=False)
            test_answer = gr.Textbox(label="Agent's Answer", lines=3, interactive=False)
            test_task_id = gr.Textbox(label="Task ID", interactive=False)
            test_validation = gr.Textbox(label="Answer Validation", interactive=False)

            test_btn.click(
                test_single_question,
                inputs=[openai_key],
                outputs=[test_question, test_answer, test_task_id, test_validation]
            )

        # Tab 2: run the agent over every question; answers land in
        # `answers_state` so the submission tab can pick them up.
        with gr.TabItem("πŸš€ Run Full Benchmark"):
            run_btn = gr.Button("Run Agent on All Questions", variant="primary")
            results_table = gr.Dataframe(label="Results")
            answers_state = gr.State()

            run_btn.click(
                run_agent_on_questions,
                inputs=[openai_key],
                outputs=[results_table, answers_state]
            )

        # Tab 3: submit the collected answers to the course leaderboard.
        with gr.TabItem("πŸ“€ Submit to Leaderboard"):
            gr.Markdown("""
            ### Submit Your Results

            After running the full benchmark, fill in your details and submit to the leaderboard.

            **Requirements:**
            - Your HuggingFace username
            - Your Space URL (must end with `/tree/main`)
            - Answers will be auto-filled after running the benchmark
            """)

            with gr.Row():
                username_input = gr.Textbox(
                    label="HuggingFace Username",
                    placeholder="your-username",
                    info="Your HuggingFace account username"
                )
                space_url_input = gr.Textbox(
                    label="Your Space URL",
                    placeholder="https://huggingface.co/spaces/your-username/your-space",
                    info="Full URL to your Space (will auto-append /tree/main if needed)"
                )

            answers_input = gr.Textbox(
                label="Answers JSON (auto-filled after running benchmark)",
                lines=10,
                placeholder="Run the full benchmark first...",
                info="This will be automatically populated after running the benchmark"
            )

            submit_btn = gr.Button("Submit to Leaderboard", variant="primary")
            submit_result = gr.Markdown()

            # Auto-fill answers when benchmark completes
            def format_answers(answers):
                """Render the benchmark's answer list as pretty-printed JSON."""
                if answers:
                    return json.dumps(answers, indent=2)
                return ""

            # Whenever the benchmark tab updates `answers_state`, mirror it
            # into the (editable) JSON textbox on this tab.
            answers_state.change(format_answers, inputs=[answers_state], outputs=[answers_input])

            submit_btn.click(
                submit_to_leaderboard,
                inputs=[username_input, space_url_input, answers_input],
                outputs=[submit_result]
            )

    gr.Markdown("""
    ---
    ### πŸ“‹ Tips for Better Scores

    **Answer Formatting:**
    - Answers are matched **exactly** (character-for-character), so precision is critical
    - Do NOT include prefixes like "FINAL ANSWER:" or "The answer is:"
    - For lists: use comma-separated format with NO spaces (e.g., "item1,item2,item3")
    - For numbers: just the number, no units unless specified
    - Check the validation status in the test tab

    **Agent Capabilities:**
    - Uses GPT-4o for optimal reasoning
    - Automatically reads files (PDFs, Excel, text) when available
    - Web search for current information
    - Wikipedia for factual lookups
    - Python execution for calculations

    **Best Practices:**
    1. Test with a single question first to verify the agent works
    2. Run the full benchmark (takes ~10-15 minutes)
    3. Review answers before submission
    4. Ensure your Space is public for verification

    ### πŸ”— Links
    - [GAIA Benchmark](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
    - [Student Leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard)
    - [Course Unit 4](https://huggingface.co/learn/agents-course/en/unit4/hands-on)
    - [API Documentation](https://agents-course-unit4-scoring.hf.space/docs)
    """)
if __name__ == "__main__":
    # For HuggingFace Spaces, use share=False
    # For local development, you can use share=True to get a public link
    # Bind to all interfaces on port 7860 so the Space's container proxy
    # can reach the app.
    demo.launch(server_name="0.0.0.0", server_port=7860)
flagged/log.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ name,output,flag,username,timestamp
2
+ asdf,Hello asdf!!,,,2026-01-16 10:40:06.831644
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ gradio>=4.0.0,<5.0.0
3
+ requests>=2.31.0,<3.0.0
4
+ pandas>=2.0.0,<3.0.0
5
+
6
+ # LangChain & LangGraph
7
+ langgraph>=0.2.0,<1.0.0
8
+ langchain>=0.2.0,<1.0.0
9
+ langchain-core>=0.2.0,<1.0.0
10
+ langchain-openai>=0.1.0,<1.0.0
11
+ langchain-community>=0.2.0,<1.0.0
12
+ langchain-experimental>=0.0.60,<1.0.0
13
+
14
+ # Tools dependencies
15
+ duckduckgo-search>=6.0.0,<7.0.0
16
+ pypdf>=4.0.0,<5.0.0
17
+ openpyxl>=3.1.0,<4.0.0
18
+
19
+ # Utilities
20
+ python-dotenv>=1.0.0,<2.0.0