pmeyhoefer committed on
Commit
009368a
·
verified ·
1 Parent(s): 81d72bd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +288 -282
app.py CHANGED
@@ -1,371 +1,377 @@
1
  import os
2
  import logging
3
- import traceback # Import traceback for better error logging
4
 
5
  import gradio as gr
6
  import requests
7
  import pandas as pd
8
  from openai import OpenAI
9
 
 
10
  from smolagents import CodeAgent, DuckDuckGoSearchTool, tool
11
  from smolagents.models import OpenAIServerModel
12
 
13
- # --- Logging ---
14
- logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
15
  logger = logging.getLogger(__name__)
16
 
17
- # --- Constants ---
18
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
19
 
20
- # --- GitHub Models Configuration ---
21
  GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
22
  if not GITHUB_TOKEN:
23
- raise RuntimeError("Please set GITHUB_TOKEN in your Space secrets.")
 
24
 
25
  GITHUB_ENDPOINT = "https://models.github.ai/inference"
26
- MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-4o-mini") # Using mini as per logs
 
 
27
 
28
- # --- Configure OpenAI SDK (Optional) ---
29
- # Less critical if tools don't directly use it
 
30
  try:
31
- client = OpenAI(
32
- base_url=GITHUB_ENDPOINT,
33
- api_key=GITHUB_TOKEN,
34
- )
35
  except Exception as e:
36
- logger.error(f"Ignoring error during optional OpenAI client init for GitHub Models: {e}")
37
- pass
38
-
39
- # --- Tools ---
40
 
41
- # Instantiate the search tool ONCE
42
- search_tool_instance = DuckDuckGoSearchTool()
43
 
44
  @tool
45
- def duckduckgo_search(query: str) -> str:
46
  """
47
- Performs a DuckDuckGo search for the given query and returns the results.
48
- Use this for general web searches.
49
  Args:
50
- query (str): The search query.
51
  Returns:
52
- str: The search results, or an error message.
53
  """
54
- logger.info(f"Executing duckduckgo_search with query: {query}")
 
 
 
55
  try:
56
- # Call the instantiated search tool
57
  result = search_tool_instance(query=query)
58
- logger.info(f"DuckDuckGo search returned {len(result)} characters.")
59
- # Maybe truncate long results if they cause issues downstream?
60
- # max_len = 2000
61
- # if len(result) > max_len:
62
- # logger.warning(f"Truncating DuckDuckGo result from {len(result)} to {max_len} chars.")
63
- # result = result[:max_len] + "... (truncated)"
64
  return result
65
  except Exception as e:
66
- logger.exception(f"DuckDuckGoSearchTool failed for query: {query}")
67
  return f"Search Error: {e}"
68
 
69
  @tool
70
- def summarize_query(query: str) -> str:
71
- """
72
- Reframes an unclear search query to improve relevance. Often useful before calling duckduckgo_search if the initial query is vague.
73
- Args:
74
- query (str): The original search query.
75
- Returns:
76
- str: A concise, improved version prepended with 'Summarize and reframe:'.
77
- """
78
- logger.info(f"Executing summarize_query with query: {query}")
79
- # This still doesn't use an LLM, it's just a placeholder/reframing instruction
80
- return f"Summarize and reframe: {query}"
81
-
82
- @tool
83
- def wikipedia_search(page: str) -> str:
84
  """
85
- Fetches the summary extract of an English Wikipedia page. Use specific page titles.
 
86
  Args:
87
- page (str): The exact Wikipedia page title (e.g., 'Mercedes_Sosa', 'List_of_Mercedes_Sosa_albums'). Spaces will be replaced by underscores.
88
  Returns:
89
- str: The page's extract text or an error message (e.g., 'Wikipedia page '[page]' not found.').
90
  """
91
- page_safe = page.replace(" ", "_")
92
- logger.info(f"Executing wikipedia_search with page: {page} (URL-safe: {page_safe})")
93
  try:
94
  url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{page_safe}"
95
- # Add a more specific user agent if running in HF Spaces
96
- space_id = os.getenv("SPACE_ID", "unknown-space")
97
- headers = {'User-Agent': f'SmolAgentGAIARunner/1.1 ({space_id})'}
98
- r = requests.get(url, headers=headers, timeout=12)
99
- r.raise_for_status() # Raises HTTPError for 4xx/5xx
100
  data = r.json()
101
  extract = data.get("extract", "")
102
- if not extract:
103
- # Handle disambiguation or empty pages
104
- page_title = data.get("title", page)
105
- page_type = data.get("type", "standard")
106
- if page_type == "disambiguation":
107
- logger.warning(f"Wikipedia page '{page_title}' is a disambiguation page.")
108
- # Try to get description which might list options
109
- description = data.get("description", "disambiguation page.")
110
- return f"Wikipedia page '{page_title}' is a {description}. Try a more specific page title."
111
- else: # Standard page but no extract
112
- logger.warning(f"Wikipedia page '{page_title}' found, but has no summary extract.")
113
- return f"Wikipedia page '{page_title}' found, but has no summary extract."
114
- logger.info(f"Wikipedia search for '{page}' returned {len(extract)} characters.")
115
- return extract
116
  except requests.exceptions.HTTPError as e:
117
  if e.response.status_code == 404:
118
- logger.warning(f"Wikipedia page not found: {page_safe}")
119
- return f"Wikipedia page '{page_safe}' not found."
120
  else:
121
- logger.exception(f"Wikipedia lookup failed for page: {page_safe} with status {e.response.status_code}")
122
- return f"Wikipedia HTTP error {e.response.status_code} for page '{page_safe}': {e}"
123
  except requests.exceptions.RequestException as e:
124
  logger.exception(f"Wikipedia network request failed for page: {page_safe}")
125
- return f"Wikipedia network error for page '{page_safe}': {e}"
126
  except Exception as e:
127
- logger.exception(f"Unexpected Wikipedia lookup error for page: {page_safe}")
128
- return f"Unexpected Wikipedia error for page '{page_safe}': {e}"
129
-
130
-
131
- # --- ReACT Prompt ---
132
- # *** THIS IS THE CRITICAL FIX: Ensure the tool name here matches the @tool function ***
133
- instruction_prompt = """
134
- You are a ReACT agent with three tools:
135
- β€’ duckduckgo_search(query: str) # Correct function name
136
- β€’ wikipedia_search(page: str)
137
- β€’ summarize_query(query: str)
138
- Internally, for each question:
139
- 1. Thought: Decide which tool is most appropriate. If searching the web, use duckduckgo_search. If looking for encyclopedic info on a specific topic/entity, try wikipedia_search first with the most likely page title. If a search or lookup fails or returns irrelevant info, think about why and try reformulating the query or using a different tool. Maybe use summarize_query on a complex question before searching.
140
- 2. Action: Call the chosen tool with the correct arguments. For wikipedia_search, use page titles like 'Entity_Name' or 'List_of_Entity_Albums'.
141
- 3. Observation: Record the result returned by the tool. Note error messages like 'page not found' or 'Search Error'.
142
- 4. Thought: Analyze the observation. Was the information found? Is it relevant? If not, what should be the next step? Try duckduckgo_search if Wikipedia failed? Try a different Wikipedia page title (e.g., 'List_of_Mercedes_Sosa_albums' instead of 'Mercedes_Sosa_discography')? If search results are messy, maybe try summarize_query on the topic and search again?
143
- 5. Action: Execute the next action based on the thought.
144
- 6. Repeat steps 3-5 until the answer is found or you determine it cannot be found with the available tools.
145
- 7. Thought: Synthesize all observations into a final answer based *only* on the information gathered.
146
- Finally, output your answer with the following template *exactly*:
147
- FINAL ANSWER: [YOUR FINAL ANSWER].
148
- YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
149
- If you are asked for a number, output only the number (e.g., 42). No commas in numbers (e.g., 1000 not 1,000). No units ($ or %).
150
- If you are asked for a string, use minimal words, no articles (a, an, the), no abbreviations (e.g., New York City not NYC). Write digits as words (e.g., seven not 7) unless the question implies numerical output.
151
- If you are asked for a comma separated list, apply the above rules to each element. Example: red,blue,three.
 
 
 
 
152
  """
153
 
154
- # --- Build the Agent with OpenAIServerModel pointing to GitHub Models ---
 
155
  try:
156
- model = OpenAIServerModel(
 
157
  model_id=MODEL_ID,
158
  api_key=GITHUB_TOKEN,
159
  base_url=GITHUB_ENDPOINT,
160
- # Add timeout if needed, e.g., request_timeout=60
161
- # Add model_kwargs if needed, e.g. model_kwargs={'temperature': 0.5}
162
  )
163
- logger.info(f"Configured OpenAIServerModel(id={MODEL_ID}, endpoint={GITHUB_ENDPOINT})")
 
 
164
  except Exception as e:
165
- logger.exception("Failed to configure OpenAIServerModel")
166
  raise RuntimeError(f"Could not configure SmolAgents model for GitHub endpoint: {e}") from e
167
 
168
- # Pass the list of FUNCTION objects decorated with @tool
169
- smart_agent = CodeAgent(
170
- tools=[duckduckgo_search, wikipedia_search, summarize_query],
171
- model=model
172
- )
173
- logger.info(f"CodeAgent initialized with tools: {[t.__name__ for t in smart_agent.tools]}")
174
-
175
- # --- Gradio Wrapper ---
176
-
177
- class BasicAgent:
178
- def __init__(self):
179
- logger.info(f"BasicAgent initialized, using SmolAgent with model {MODEL_ID}")
180
-
181
- def __call__(self, question: str) -> str:
182
- question = question.strip()
183
- if not question:
184
- logger.error("Agent called with empty question.")
185
- return "AGENT ERROR: empty question"
186
-
187
- # Use the updated instruction_prompt
188
- prompt = instruction_prompt.strip() + "\n\nQUESTION: " + question
189
- # Log the exact prompt being sent (optional, can be verbose)
190
- # logger.debug(f"--- Sending Prompt to Agent ---\n{prompt}\n-----------------------------")
191
-
192
- try:
193
- logger.info(f"Running agent for question: '{question}'")
194
- # The agent uses the 'model' instance and tools configured above
195
- result = smart_agent.run(prompt)
196
- # Log the raw result (optional, can be verbose)
197
- # logger.debug(f"--- Raw Agent Result ---\n{result}\n--------------------------")
198
- logger.info(f"Agent finished run for question: '{question}'")
199
-
200
- # Basic check if the agent failed to produce a final answer format
201
- if "FINAL ANSWER:" not in result:
202
- logger.warning(f"Agent output for question '{question}' did not contain 'FINAL ANSWER:'. Raw output: {result}")
203
- # Decide how to handle this - return error or raw output?
204
- # Returning raw output might be better for debugging but fail submission check.
205
- # Let's return a specific error for submission.
206
- return f"AGENT ERROR: Malformed response - No 'FINAL ANSWER:' block found."
207
- return result # Return the full raw output including thought process and FINAL ANSWER
208
-
209
- except Exception as e:
210
- logger.exception(f"Agent run failed for question '{question}'")
211
- # Get traceback details
212
- tb_str = traceback.format_exc()
213
- return f"AGENT ERROR: Exception during run: {e}\nTraceback:\n{tb_str}"
214
-
215
- # --- Submission Logic ---
216
-
217
- def run_and_submit_all(profile: gr.OAuthProfile | None):
218
- if not profile:
219
  logger.warning("Submission attempt failed: User not logged in.")
220
- return "Please log in to Hugging Face to submit.", None
221
 
222
- username = profile.username
223
- space_id = os.getenv("SPACE_ID", "")
224
- if not space_id:
225
- logger.warning("SPACE_ID environment variable not set. Agent code URL will be incomplete.")
226
- agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Agent code URL unavailable (SPACE_ID not set)"
227
- logger.info(f"Starting evaluation run for user '{username}'")
228
- agent = BasicAgent()
229
 
230
- # Fetch questions
 
231
  try:
232
- logger.info(f"Fetching questions from {DEFAULT_API_URL}/questions")
233
- resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=20)
234
  resp.raise_for_status()
235
  questions_data = resp.json()
236
- if not isinstance(questions_data, list):
237
- logger.error(f"Fetched questions is not a list: {type(questions_data)}")
238
- return f"Error: Fetched questions format is incorrect (expected list, got {type(questions_data)}).", None
239
- questions = questions_data or []
240
- logger.info(f"Fetched {len(questions)} questions successfully.")
 
241
  except Exception as e:
242
  logger.exception("Failed to fetch questions")
243
- return f"Error fetching questions: {e}", None
244
 
245
  if not questions:
246
  logger.warning("No questions fetched or questions list is empty.")
247
- return "No questions were fetched from the server.", None
248
 
249
- logs, payload = [], []
250
- question_count = len(questions)
 
 
251
  for i, item in enumerate(questions):
252
- if not isinstance(item, dict):
253
- logger.warning(f"Skipping invalid question item (not a dict): {item}")
254
- continue
255
- tid = item.get("task_id")
256
- q = item.get("question")
257
- if not tid or not q:
258
- logger.warning(f"Skipping question with missing task_id or question: {item}")
259
  continue
260
 
261
- logger.info(f"Processing question {i+1}/{question_count} - Task ID: {tid}")
262
- ans_raw = agent(q) # Run the agent
263
-
264
- # Extract only the final answer part for submission
265
- final_ans_marker = "FINAL ANSWER:"
266
- submitted_ans = f"ERROR (Agent did not produce output with {final_ans_marker})" # Default if parsing fails
267
- if final_ans_marker in ans_raw:
268
- # Split and take the part *after* the marker
269
- submitted_ans = ans_raw.split(final_ans_marker, 1)[1].strip()
270
- # Optional: Basic validation/cleanup of the extracted answer?
271
- # e.g., remove leading/trailing quotes if not needed
272
- # submitted_ans = submitted_ans.strip(' "')
273
- elif "AGENT ERROR:" in ans_raw:
274
- # If agent returned an error string, submit that
275
- submitted_ans = ans_raw # Keep the AGENT ERROR message
276
- logger.warning(f"Agent returned an error for Task ID {tid}: {submitted_ans}")
277
- else:
278
- logger.warning(f"Could not extract final answer from raw output for Task ID {tid}. Raw: {ans_raw[:500]}...") # Log snippet
279
-
280
- logger.info(f"Task ID: {tid}, Question: '{q}', Submitted Answer: '{submitted_ans}'")
281
- # Store more info for the Gradio table, including the raw output for debugging
282
- logs.append({
283
- "Task ID": tid,
284
- "Question": q,
285
- "Submitted Answer": submitted_ans,
286
- "Agent Raw Output": ans_raw # Show the full thought process in the table
287
  })
288
- payload.append({"task_id": tid, "submitted_answer": submitted_ans})
289
-
290
- if not payload:
291
- logger.warning("Agent did not produce any valid answers to submit.")
292
- # Check if logs have entries to display potential errors
293
- if logs:
294
- return "Agent ran but did not produce any answers in the expected format.", pd.DataFrame(logs)
295
- else:
296
- return "Agent did not produce any answers.", None
297
-
 
 
 
 
 
 
 
298
 
299
- logger.info(f"Submitting {len(payload)} answers for user '{username}'...")
300
- # Submit answers
301
  try:
302
- submit_payload = {"username": username, "agent_code": agent_code, "answers": payload}
303
- # logger.debug(f"Submission Payload: {submit_payload}") # Careful logging PII
304
- post = requests.post(
305
- f"{DEFAULT_API_URL}/submit",
306
- json=submit_payload,
307
- timeout=90 # Increased timeout for submission
 
 
 
 
 
 
 
 
 
 
 
308
  )
309
- post.raise_for_status() # Check for HTTP errors from submission endpoint
310
- result = post.json()
311
- logger.info(f"Submission successful. Result: {result}")
312
-
313
- score_percent = result.get('score', 'N/A')
314
- try: # Format score nicely
315
- score_percent = f"{float(score_percent):.2f}" if isinstance(score_percent, (int, float)) else score_percent
316
- except (ValueError, TypeError): pass
317
-
318
- status = (
319
- f"Submission Successful!\n"
320
- f"User: {result.get('username', 'N/A')}\n"
321
- f"Score: {score_percent}%\n"
322
- f"Correct: {result.get('correct_count','?')} / Attempted: {result.get('total_attempted','?')}\n"
323
- f"Message: {result.get('message','(No message)')}"
324
- )
325
- # Update logs DataFrame with final status if needed, though usually not necessary
326
- return status, pd.DataFrame(logs) # Return status and the detailed logs
327
 
328
  except requests.exceptions.RequestException as e:
329
  logger.exception("Submission request failed")
330
  error_details = str(e)
331
  if e.response is not None:
332
- error_details += f" | Status Code: {e.response.status_code} | Response: {e.response.text[:500]}"
333
- return f"Submission Failed: {error_details}", pd.DataFrame(logs) # Return error and logs
334
  except Exception as e:
335
- logger.exception("Submission failed with unexpected error")
336
- return f"Submission Failed with unexpected error: {e}", pd.DataFrame(logs) # Return error and logs
337
-
338
-
339
- # --- Gradio App ---
340
-
341
- with gr.Blocks() as demo:
342
- gr.Markdown("# SmolAgent GAIA Runner (using GitHub Models) πŸš€")
343
- gr.Markdown("""
344
- **Instructions:**
345
- 1. Ensure `GITHUB_TOKEN` secret is set. Optionally set `MODEL_ID`.
346
- 2. Log in to Hugging Face below.
347
- 3. Click **Run Evaluation & Submit All Answers**.
348
- 4. Check the Status and the Questions & Answers table for results. The raw agent output includes the thinking process.
349
- """)
350
- gr.LoginButton()
351
- btn = gr.Button("Run Evaluation & Submit All Answers")
352
- out_status = gr.Textbox(label="Submission Status", lines=5, interactive=False)
353
- # *** FIX: Remove the 'height' argument ***
354
- out_table = gr.DataFrame(
355
- label="Questions & Answers Log",
 
 
 
 
 
 
 
 
 
356
  wrap=True,
357
- # Add headers if you want to control column names/order explicitly
358
- headers=["Task ID", "Question", "Submitted Answer", "Agent Raw Output"],
359
- column_widths=["10%", "30%", "20%", "40%"] # Adjust widths as needed
 
 
 
 
 
 
 
 
360
  )
361
- btn.click(run_and_submit_all, outputs=[out_status, out_table], api_name="run_submit") # Add api_name
362
 
 
 
 
363
  if __name__ == "__main__":
364
- if not GITHUB_TOKEN:
365
- logger.error("GITHUB_TOKEN environment variable not set. Cannot start effectively.")
366
- # Optionally raise error or exit? For now, just log.
367
- logger.info("Launching Gradio App...")
368
- # share=True is needed for public link if running on HF Spaces
369
- # debug=True provides more verbose Gradio logging
370
- demo.launch(debug=True, share=True)
371
 
 
1
  import os
2
  import logging
3
+ import traceback
4
 
5
  import gradio as gr
6
  import requests
7
  import pandas as pd
8
  from openai import OpenAI
9
 
10
+ # Assuming these imports from smolagents are correct
11
  from smolagents import CodeAgent, DuckDuckGoSearchTool, tool
12
  from smolagents.models import OpenAIServerModel
13
 
14
+ # --- Basic Logging Setup ---
15
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
16
  logger = logging.getLogger(__name__)
17
 
18
+ # --- Configuration ---
19
+ # URL for fetching questions and submitting answers
20
+ SUBMISSION_URL = "https://agents-course-unit4-scoring.hf.space"
21
 
22
+ # GitHub Models Configuration
23
  GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
24
  if not GITHUB_TOKEN:
25
+ # Critical error if token is missing
26
+ raise ValueError("GITHUB_TOKEN environment variable not set. Please set it in Space secrets.")
27
 
28
  GITHUB_ENDPOINT = "https://models.github.ai/inference"
29
+ # Use a known model ID compatible with the endpoint
30
+ # Let's stick to gpt-4o-mini based on previous logs, ensure it's available.
31
+ MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-4o-mini")
32
 
33
+ # --- Tool Definitions ---
34
+
35
+ # Instantiate the search tool ONCE to reuse its state/connection if any
36
  try:
37
+ search_tool_instance = DuckDuckGoSearchTool()
 
 
 
38
  except Exception as e:
39
+ logger.error(f"Failed to instantiate DuckDuckGoSearchTool: {e}")
40
+ # Depending on the app's requirements, you might want to raise an error here
41
+ # or allow the app to start but log the failure.
42
+ search_tool_instance = None # Indicate failure
43
 
44
+ # IMPORTANT: Define wrapper functions that the LLM will be instructed to call.
45
+ # Use the @tool decorator so CodeAgent recognizes them.
46
 
47
@tool
def web_search(query: str) -> str:
    """
    Performs a web search using DuckDuckGo for the given query.
    Use this for general questions, finding current information, or when Wikipedia fails.

    Args:
        query (str): The search query string.

    Returns:
        str: The search results obtained from DuckDuckGo, or an error message.
    """
    logger.info(f"Executing web_search with query: '{query[:100]}...'")  # log only a snippet
    if search_tool_instance is None:
        # Tool failed to initialize at module import; fail gracefully instead of raising.
        logger.error("web_search cannot execute because DuckDuckGoSearchTool failed to initialize.")
        return "Search Error: Tool not initialized."
    try:
        result = search_tool_instance(query=query)
        logger.info(f"web_search returned {len(result)} characters.")
        # Limit result length to prevent excessively large observations
        max_len = 3000
        if len(result) > max_len:
            # FIX: the warning previously reported a bogus "68,114 chars" target;
            # report the actual truncation limit instead.
            logger.warning(f"Truncating web_search result from {len(result)} to {max_len} chars.")
            return result[:max_len] + "... (truncated)"
        return result
    except Exception as e:
        logger.exception(f"web_search failed for query: {query}")
        return f"Search Error: {e}"
73
 
74
@tool
def wikipedia_lookup(page_title: str) -> str:
    """
    Look up the introductory summary of an English Wikipedia article.

    Useful for factual information about specific topics, people, or entities.

    Args:
        page_title (str): The exact title of the Wikipedia page (e.g., 'Albert Einstein', 'List_of_programming_languages'). Spaces will be converted to underscores.

    Returns:
        str: The summary text of the page, or an error message if not found or failed.
    """
    safe_title = page_title.replace(" ", "_")
    logger.info(f"Executing wikipedia_lookup for page: '{page_title}' (URL: {safe_title})")
    try:
        endpoint = f"https://en.wikipedia.org/api/rest_v1/page/summary/{safe_title}"
        # Identify ourselves per Wikipedia's API etiquette; SPACE_ID distinguishes deployments.
        identity = os.getenv("SPACE_ID", "unknown-huggingface-space")
        response = requests.get(
            endpoint,
            headers={'User-Agent': f'GAIAgent/1.0 ({identity})'},
            timeout=15,
        )
        response.raise_for_status()  # surface HTTP 4xx/5xx as HTTPError

        payload = response.json()
        extract = payload.get("extract", "")
        if extract:
            logger.info(f"wikipedia_lookup found summary ({len(extract)} chars) for '{page_title}'.")
            return extract

        # Page exists but carries no extract — typically a disambiguation page.
        title = payload.get("title", page_title)
        if payload.get("type", "standard") == "disambiguation":
            description = payload.get("description", "multiple meanings")
            logger.warning(f"wikipedia_lookup found a disambiguation page for '{title}': {description}")
            return f"Wikipedia Error: '{title}' refers to {description}. Please provide a more specific page title."
        logger.warning(f"wikipedia_lookup found page '{title}' but it has no summary text.")
        return f"Wikipedia Error: Page '{title}' found but has no summary."
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404:
            logger.warning(f"Wikipedia page not found: {safe_title}")
            return f"Wikipedia Error: Page '{safe_title}' not found."
        logger.error(f"Wikipedia HTTP error {e.response.status_code} for page: {safe_title}")
        return f"Wikipedia Error: HTTP {e.response.status_code} for page '{safe_title}'."
    except requests.exceptions.RequestException as e:
        logger.exception(f"Wikipedia network request failed for page: {safe_title}")
        return f"Wikipedia Error: Network error for page '{safe_title}': {e}"
    except Exception as e:
        logger.exception(f"Unexpected error during wikipedia_lookup for page: {safe_title}")
        return f"Wikipedia Error: Unexpected error: {e}"
121
+
122
+ # Removed summarize_query tool for simplicity, as it wasn't adding much value in logs
123
+
124
+ # --- The ReACT Prompt ---
125
+ # Define the *exact* instructions for the LLM, listing the *actual* tool function names.
126
+ # Keep it clear and concise.
127
+ REACT_INSTRUCTION_PROMPT = """You are a helpful assistant that answers questions using the provided tools.
128
+
129
+ Available Tools:
130
+ - web_search(query: str): Use this for searching the web for general information, current events, or when you don't know a specific Wikipedia page title.
131
+ - wikipedia_lookup(page_title: str): Use this to get information from a specific English Wikipedia page. Use exact page titles (e.g., 'Berlin', 'Python_(programming_language)').
132
+
133
+ Follow these steps for each question:
134
+ 1. **Thought:** Briefly explain your plan and which tool you will use and why.
135
+ 2. **Action:** Call ONE tool using the correct function name and arguments. Example: web_search(query="latest news") or wikipedia_lookup(page_title="Artificial_intelligence").
136
+ 3. **Observation:** Record the result provided by the tool.
137
+ 4. **Thought:** Analyze the observation. Does it answer the question? If yes, prepare the final answer. If not, plan the next step (e.g., try a different tool, refine the search query, use a different Wikipedia title).
138
+ 5. Repeat Action/Observation/Thought until you have the answer or determine it cannot be found.
139
+ 6. **Thought:** Summarize the findings and prepare the final answer based ONLY on the observations.
140
+ 7. **Final Answer:** Provide the final answer in the required format (number, short string, or comma-separated list) on a new line starting exactly with "FINAL ANSWER: ".
141
+
142
+ Formatting Rules for FINAL ANSWER:
143
+ - Numbers: Output only the number (e.g., `42`, `1000`). No commas, units ($).
144
+ - Strings: Use minimal words, no articles (a, an, the). Write digits as words (e.g., `seven`) unless numerical output is implied.
145
+ - Lists: Comma-separated, apply number/string rules to each item (e.g., `paris,london,three`).
146
+
147
+ Let's begin!
148
  """
149
 
150
+ # --- SmolAgent Setup ---
151
+ logger.info(f"Initializing LLM connection to {MODEL_ID} via {GITHUB_ENDPOINT}")
152
  try:
153
+ # Configure the model connection to use GitHub's endpoint
154
+ llm_model = OpenAIServerModel(
155
  model_id=MODEL_ID,
156
  api_key=GITHUB_TOKEN,
157
  base_url=GITHUB_ENDPOINT,
158
+ request_timeout=60 # Add a timeout for model requests
 
159
  )
160
+ # Verify connection (optional, depends on OpenAIServerModel implementation)
161
+ # You might add a simple test call here if the library supports it easily
162
+ logger.info("LLM connection configured successfully.")
163
  except Exception as e:
164
+ logger.exception("CRITICAL: Failed to configure OpenAIServerModel")
165
  raise RuntimeError(f"Could not configure SmolAgents model for GitHub endpoint: {e}") from e
166
 
167
+ logger.info("Initializing CodeAgent...")
168
+ try:
169
+ # Create the agent instance, passing the *list of actual functions* decorated with @tool
170
+ agent = CodeAgent(
171
+ tools=[web_search, wikipedia_lookup], # Only include the defined tool functions
172
+ model=llm_model
173
+ )
174
+ # Log the names of the tools the agent actually recognized (if possible/safe)
175
+ # This depends on how CodeAgent stores tools. Avoid the previous error.
176
+ # logger.info(f"CodeAgent initialized. Tools detected by agent (if available): {agent.tools}") # Be cautious with this line
177
+ logger.info("CodeAgent initialized successfully.")
178
+ except Exception as e:
179
+ logger.exception("CRITICAL: Failed to initialize CodeAgent")
180
+ raise RuntimeError(f"Could not initialize CodeAgent: {e}") from e
181
+
182
+
183
+ # --- Gradio Interface ---
184
+
185
def run_agent_on_question(question: str) -> str:
    """
    Run the SmolAgent on a single question and return its raw output.

    Performs basic input validation and catches any exception raised during
    the agent run, returning an ``AGENT_ERROR:`` string instead of raising.
    """
    question = question.strip()
    # Guard clause: refuse empty questions up front.
    if not question:
        logger.error("Agent called with empty question.")
        return "AGENT_ERROR: Question cannot be empty."

    # Prepend the ReACT instructions so the model knows the tool protocol.
    prompt = f"{REACT_INSTRUCTION_PROMPT.strip()}\n\nQUESTION: {question}"
    logger.info(f"--- Running Agent for Question: '{question}' ---")

    try:
        output = agent.run(prompt)
    except Exception as e:
        logger.exception(f"Agent run failed for question '{question}'")
        trace = traceback.format_exc()  # keep the full traceback for debugging in the UI
        return f"AGENT_ERROR: An exception occurred during agent execution: {e}\nTraceback:\n{trace}"

    logger.info(f"Agent run completed for question: '{question}'. Output length: {len(output)}")
    return output
212
+
213
+ def evaluate_and_submit(hf_profile: gr.OAuthProfile | None):
214
+ """
215
+ Gradio action: Fetches questions, runs agent on each, submits results.
216
+ """
217
+ if not hf_profile:
218
  logger.warning("Submission attempt failed: User not logged in.")
219
+ return "⚠️ Please log in to Hugging Face via the button above to submit.", None # Status message, empty DataFrame
220
 
221
+ username = hf_profile.username
222
+ logger.info(f"πŸš€ Starting evaluation run for user '{username}'...")
 
 
 
 
 
223
 
224
+ # 1. Fetch Questions
225
+ questions = []
226
  try:
227
+ logger.info(f"Fetching questions from {SUBMISSION_URL}/questions")
228
+ resp = requests.get(f"{SUBMISSION_URL}/questions", timeout=20)
229
  resp.raise_for_status()
230
  questions_data = resp.json()
231
+ if isinstance(questions_data, list):
232
+ questions = questions_data
233
+ logger.info(f"βœ… Fetched {len(questions)} questions.")
234
+ else:
235
+ logger.error(f"Fetched questions data is not a list: {type(questions_data)}")
236
+ return "❌ Error: Fetched questions format is incorrect.", None
237
  except Exception as e:
238
  logger.exception("Failed to fetch questions")
239
+ return f"❌ Error fetching questions: {e}", None
240
 
241
  if not questions:
242
  logger.warning("No questions fetched or questions list is empty.")
243
+ return "ℹ️ No questions were fetched from the server.", None
244
 
245
+ # 2. Run Agent on Questions
246
+ results_log = []
247
+ answers_payload = []
248
+ total_questions = len(questions)
249
  for i, item in enumerate(questions):
250
+ task_id = item.get("task_id")
251
+ question_text = item.get("question")
252
+
253
+ if not task_id or not question_text:
254
+ logger.warning(f"Skipping invalid question item {i+1}/{total_questions}: Missing task_id or question. Data: {item}")
 
 
255
  continue
256
 
257
+ logger.info(f"Processing question {i+1}/{total_questions} (Task ID: {task_id})...")
258
+ raw_agent_output = run_agent_on_question(question_text) # Run the agent
259
+
260
+ # Extract final answer for submission
261
+ final_answer = "AGENT_ERROR: No 'FINAL ANSWER:' marker found in output." # Default if parsing fails
262
+ marker = "FINAL ANSWER:"
263
+ if marker in raw_agent_output:
264
+ final_answer = raw_agent_output.split(marker, 1)[1].strip()
265
+ elif "AGENT_ERROR:" in raw_agent_output: # If agent returned an error explicitly
266
+ final_answer = raw_agent_output # Submit the error message
267
+
268
+ logger.info(f"Task ID: {task_id} -> Submitted Answer: '{final_answer}'")
269
+
270
+ # Log results for Gradio table
271
+ results_log.append({
272
+ "Task ID": task_id,
273
+ "Question": question_text,
274
+ "Submitted Answer": final_answer,
275
+ "Full Agent Output": raw_agent_output # Show full trace in UI
 
 
 
 
 
 
 
276
  })
277
+ # Prepare payload for submission API
278
+ answers_payload.append({"task_id": task_id, "submitted_answer": final_answer})
279
+
280
+ results_df = pd.DataFrame(results_log)
281
+ if not answers_payload:
282
+ logger.warning("Agent did not produce any answers to submit.")
283
+ return "⚠️ Agent ran but produced no answers in the expected format.", results_df
284
+
285
+ # 3. Submit Answers
286
+ logger.info(f"Submitting {len(answers_payload)} answers for user '{username}'...")
287
+ space_id = os.getenv("SPACE_ID", "SPACE_ID_NOT_SET")
288
+ agent_code_url = f"https://huggingface.co/spaces/{space_id}/tree/main" if "NOT_SET" not in space_id else "Agent code URL unavailable"
289
+ submit_data = {
290
+ "username": username,
291
+ "agent_code": agent_code_url,
292
+ "answers": answers_payload
293
+ }
294
 
 
 
295
  try:
296
+ response = requests.post(f"{SUBMISSION_URL}/submit", json=submit_data, timeout=90)
297
+ response.raise_for_status() # Check for HTTP errors
298
+ submission_result = response.json()
299
+ logger.info(f"βœ… Submission successful! API Response: {submission_result}")
300
+
301
+ score = submission_result.get('score', 'N/A')
302
+ score_str = f"{float(score):.2f}%" if isinstance(score, (int, float)) else str(score)
303
+ correct = submission_result.get('correct_count', '?')
304
+ attempted = submission_result.get('total_attempted', '?')
305
+ message = submission_result.get('message', '(No message from server)')
306
+
307
+ status_message = (
308
+ f"βœ… Submission Successful!\n"
309
+ f"User: {username}\n"
310
+ f"Score: {score_str}\n"
311
+ f"Details: {correct} / {attempted} correct\n"
312
+ f"Server Message: {message}"
313
  )
314
+ return status_message, results_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
 
316
  except requests.exceptions.RequestException as e:
317
  logger.exception("Submission request failed")
318
  error_details = str(e)
319
  if e.response is not None:
320
+ error_details += f" | Status: {e.response.status_code} | Response: {e.response.text[:300]}" # Log snippet
321
+ return f"❌ Submission Failed: {error_details}", results_df
322
  except Exception as e:
323
+ logger.exception("Unexpected error during submission")
324
+ return f"❌ Submission Failed with unexpected error: {e}", results_df
325
# --- Build Gradio App ---
logger.info("Setting up Gradio interface...")

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header and usage instructions.
    gr.Markdown(
        """
        # 🚀 Agent Evaluation Runner 🚀

        Connect your Hugging Face account, then click the button below to fetch tasks, run the agent, and submit the answers.
        Ensure the `GITHUB_TOKEN` secret is correctly set in your Space settings.
        """
    )

    # Hugging Face OAuth sign-in; its profile value feeds the handler below.
    with gr.Row():
        login_btn = gr.LoginButton()

    run_btn = gr.Button("▶️ Run Evaluation & Submit All Answers", variant="primary")

    # Free-text area reporting the outcome of the submission request.
    status_box = gr.Textbox(
        label="📊 Submission Status",
        placeholder="Submission status will appear here...",
        lines=5,
        interactive=False,
    )

    # Per-question log: task id, question, submitted answer, and the agent's
    # full output (kept for debugging).
    log_table = gr.DataFrame(
        label="📋 Detailed Log (Questions & Agent Output)",
        headers=["Task ID", "Question", "Submitted Answer", "Full Agent Output"],
        wrap=True,
        # No fixed height: let Gradio manage it (or control via CSS if needed).
        column_widths=["10%", "25%", "20%", "45%"],
    )

    # Wire the button to the evaluation routine; the login component supplies
    # the signed-in user's profile as the function's input.
    run_btn.click(
        fn=evaluate_and_submit,
        inputs=[login_btn],
        outputs=[status_box, log_table],
        api_name="evaluate_submit",  # exposes the handler for API usage if needed
    )

logger.info("Gradio interface setup complete.")
# --- Launch the App ---
if __name__ == "__main__":
    logger.info("Launching Gradio application...")
    # NOTE: demo.launch() blocks until the server stops when run as a script,
    # so any code after it only executes at shutdown.
    # `share=True` is ignored inside a Hugging Face Space (Spaces are already
    # publicly served); it only creates a tunnel when running locally.
    demo.launch(
        debug=True,  # more detailed logs from Gradio itself
        share=True,  # no-op on Spaces; public tunnel for local runs
    )
    logger.info("Gradio application has shut down.")