Final_Assignment_Template

Sleeping

App Files Files Community

pmeyhoefer committed on May 2, 2025

Commit

81d72bd

verified ·

1 Parent(s): d2d0f74

Update app.py

Browse files

Files changed (1) hide show

app.py +191 -149

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import os
 import logging
 import gradio as gr
 import requests
@@ -7,7 +8,6 @@ import pandas as pd
 from openai import OpenAI
 from smolagents import CodeAgent, DuckDuckGoSearchTool, tool
-# Assuming OpenAIServerModel correctly handles base_url/api_base
 from smolagents.models import OpenAIServerModel
 # --- Logging ---
@@ -15,7 +15,7 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(mess
 logger = logging.getLogger(__name__)
 # --- Constants ---
-DEFAULT_API_URL   = "https://agents-course-unit4-scoring.hf.space" # Keep this for submission
 # --- GitHub Models Configuration ---
 GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
@@ -23,24 +23,19 @@ if not GITHUB_TOKEN:
     raise RuntimeError("Please set GITHUB_TOKEN in your Space secrets.")
 GITHUB_ENDPOINT = "https://models.github.ai/inference"
-# Verify this model ID with GitHub Models documentation. Using mini for potentially faster/cheaper tests.
-MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-4o-mini") # Changed to mini based on logs
-# --- Configure OpenAI SDK (Optional - for tools if needed, points to GitHub) ---
-# If tools don't use this client directly, this might be redundant,
-# but it doesn't hurt to have it configured consistently.
 try:
     client = OpenAI(
         base_url=GITHUB_ENDPOINT,
         api_key=GITHUB_TOKEN,
     )
 except Exception as e:
-    logger.error(f"Failed to initialize OpenAI client for GitHub Models: {e}")
-    # Decide how to handle this - raise error, log warning, etc.
-    # For now, just log and proceed, as the agent itself uses OpenAIServerModel
     pass
 # --- Tools ---
 # Instantiate the search tool ONCE
@@ -50,14 +45,23 @@ search_tool_instance = DuckDuckGoSearchTool()
 def duckduckgo_search(query: str) -> str:
     """
     Performs a DuckDuckGo search for the given query and returns the results.
     Args:
         query (str): The search query.
     Returns:
-        str: The search results.
     """
     try:
         # Call the instantiated search tool
-        return search_tool_instance(query=query)
     except Exception as e:
         logger.exception(f"DuckDuckGoSearchTool failed for query: {query}")
         return f"Search Error: {e}"
@@ -65,172 +69,188 @@ def duckduckgo_search(query: str) -> str:
 @tool
 def summarize_query(query: str) -> str:
     """
-    Reframes an unclear search query to improve relevance.
     Args:
         query (str): The original search query.
     Returns:
-        str: A concise, improved version.
     """
-    # Assuming this doesn't need an LLM call. If it did, it would use 'client'.
     return f"Summarize and reframe: {query}"
 @tool
 def wikipedia_search(page: str) -> str:
     """
-    Fetches the summary extract of an English Wikipedia page.
     Args:
-        page (str): e.g. 'Mercedes_Sosa_discography' or 'Mercedes_Sosa'
     Returns:
-        str: The page’s extract text or an error message.
     """
-    # Make page names URL-safe (replace spaces with underscores)
-    page = page.replace(" ", "_")
     try:
-        url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{page}"
-        headers = {'User-Agent': 'SmolAgentGAIARunner/1.0 (https://huggingface.co/spaces/YOUR_SPACE_ID)'} # Good practice
-        r = requests.get(url, headers=headers, timeout=10)
         r.raise_for_status() # Raises HTTPError for 4xx/5xx
         data = r.json()
         extract = data.get("extract", "")
-        if not extract and data.get("title") and data.get("type") == "disambiguation":
-            # Handle disambiguation pages better if needed, maybe return links?
-             return f"Wikipedia page '{page}' is a disambiguation page. Try a more specific query."
-        elif not extract:
-             return f"Wikipedia page '{page}' found, but has no summary extract."
         return extract
     except requests.exceptions.HTTPError as e:
         if e.response.status_code == 404:
-             logger.warning(f"Wikipedia page not found: {page}")
-             return f"Wikipedia page '{page}' not found."
         else:
-             logger.exception(f"Wikipedia lookup failed for page: {page}")
-             return f"Wikipedia HTTP error {e.response.status_code}: {e}"
     except Exception as e:
-        logger.exception(f"Wikipedia lookup failed for page: {page}")
-        return f"Wikipedia error: {e}"
-# No longer need separate variable names for the functions if they match the @tool name
-# wiki_tool      = wikipedia_search # Redundant if function name is clear
-# summarize_tool = summarize_query # Redundant
 # --- ReACT Prompt ---
-# *** IMPORTANT: Update the prompt to use the NEW function name 'duckduckgo_search' ***
 instruction_prompt = """
 You are a ReACT agent with three tools:
- • duckduckgo_search(query: str)
  • wikipedia_search(page: str)
  • summarize_query(query: str)
 Internally, for each question:
-1. Thought: decide which tool to call.
-2. Action: call the chosen tool.
-3. Observation: record the result.
-4. If empty/irrelevant (e.g., 'page not found', empty search results, or 404 error):
-   Thought: Re-evaluate. Should I try summarizing the query first with summarize_query and then searching with duckduckgo_search? Or try a different Wikipedia page name? Or maybe the information isn't available via these tools.
-   Action: Call the chosen alternative tool (or conclude if necessary).
-   Record new Observation.
-5. Thought: integrate observations. If multiple searches were needed, synthesize the results.
-Finally, output your answer with the following template:
 FINAL ANSWER: [YOUR FINAL ANSWER].
 YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
-If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
-If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
-If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
-Only output the FINAL ANSWER line once all thinking is done.
 """
 # --- Build the Agent with OpenAIServerModel pointing to GitHub Models ---
 try:
-    # Try with base_url first, as it's the modern OpenAI SDK parameter
     model = OpenAIServerModel(
         model_id=MODEL_ID,
         api_key=GITHUB_TOKEN,
-        base_url=GITHUB_ENDPOINT # Use base_url
-        # You might need to pass model_kwargs if specific settings are required
-        # model_kwargs={'temperature': 0.7} # Example
     )
-    logger.info(f"Configured OpenAIServerModel with GitHub endpoint using 'base_url'.")
-except TypeError:
-    logger.warning("Configuring OpenAIServerModel with 'base_url' failed, trying 'api_base'.")
-    # Fallback attempt using api_base if base_url caused a TypeError
-    try:
-        model = OpenAIServerModel(
-            model_id=MODEL_ID,
-            api_key=GITHUB_TOKEN,
-            api_base=GITHUB_ENDPOINT # Use api_base
-        )
-        logger.info(f"Successfully configured OpenAIServerModel with GitHub endpoint using 'api_base'.")
-    except Exception as e:
-        logger.error(f"Failed to configure OpenAIServerModel with both 'base_url' and 'api_base': {e}")
-        raise RuntimeError(f"Could not configure SmolAgents model for GitHub endpoint: {e}") from e
 except Exception as e:
-    logger.error(f"Failed to configure OpenAIServerModel: {e}")
     raise RuntimeError(f"Could not configure SmolAgents model for GitHub endpoint: {e}") from e
-# *** Pass the list of FUNCTION objects to the CodeAgent ***
 smart_agent = CodeAgent(
-    tools=[duckduckgo_search, wikipedia_search, summarize_query], # Use the function names directly
     model=model
-    # Check smolagents docs if there's a way to pass globals/context for execution
-    # e.g., execution_globals={'duckduckgo_search': duckduckgo_search, ...} might be needed
-    # but often passing the functions in the 'tools' list is enough if they are decorated correctly.
 )
 # --- Gradio Wrapper ---
 class BasicAgent:
     def __init__(self):
-        logger.info(f"Initialized SmolAgent with GitHub Model: {MODEL_ID} via {GITHUB_ENDPOINT}")
     def __call__(self, question: str) -> str:
-        if not question.strip():
             return "AGENT ERROR: empty question"
-        # Ensure the prompt ends correctly before adding the question
-        prompt = instruction_prompt.strip() + "\n\nQUESTION: " + question.strip()
-        logger.info(f"Running agent with prompt:\n-------\n{prompt}\n-------")
         try:
-            # The agent uses the 'model' instance we configured above
             result = smart_agent.run(prompt)
-            logger.info(f"Agent returned: {result}")
-            # Basic check if the agent failed to produce a final answer
             if "FINAL ANSWER:" not in result:
-                 logger.warning("Agent did not produce a 'FINAL ANSWER:' block.")
-                 # You might return a generic error or the raw output
-                 return f"AGENT WARNING: No 'FINAL ANSWER:' found. Raw output: {result}"
-            return result # Return the full output including FINAL ANSWER:
         except Exception as e:
-            logger.exception("Agent run error")
-            return f"AGENT ERROR: {e}"
 # --- Submission Logic ---
-# (No changes needed here, it uses the BasicAgent instance)
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     if not profile:
-        return "Please log in to Hugging Face.", None
     username   = profile.username
     space_id   = os.getenv("SPACE_ID", "")
-    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
-    agent      = BasicAgent() # Instantiates the agent with the corrected tool setup
-    # fetch questions (unchanged)
     try:
-        resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=15)
         resp.raise_for_status()
         questions_data = resp.json()
         if not isinstance(questions_data, list):
-             logger.error(f"Fetched questions is not a list: {questions_data}")
-             return "Error: Fetched questions format is incorrect.", None
         questions = questions_data or []
-        logger.info(f"Fetched {len(questions)} questions.")
     except Exception as e:
-        logger.exception("Failed fetch")
         return f"Error fetching questions: {e}", None
     logs, payload = [], []
-    for item in questions:
         if not isinstance(item, dict):
-             logger.warning(f"Skipping invalid question item: {item}")
              continue
         tid = item.get("task_id")
         q   = item.get("question")
@@ -238,92 +258,114 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
             logger.warning(f"Skipping question with missing task_id or question: {item}")
             continue
-        logger.info(f"Processing Task ID: {tid}, Question: {q}")
         ans_raw = agent(q) # Run the agent
         # Extract only the final answer part for submission
         final_ans_marker = "FINAL ANSWER:"
         if final_ans_marker in ans_raw:
             submitted_ans = ans_raw.split(final_ans_marker, 1)[1].strip()
-        elif "AGENT ERROR:" in ans_raw or "AGENT WARNING:" in ans_raw:
-             submitted_ans = f"ERROR ({ans_raw})" # Submit error message
         else:
-             logger.warning(f"Could not extract final answer from raw output for Task ID {tid}. Raw: {ans_raw}")
-             submitted_ans = f"ERROR (Could not parse agent output)" # Fallback
-        logger.info(f"Task ID: {tid}, Submitted Answer: {submitted_ans}")
-        logs.append({"Task ID": tid, "Question": q, "Submitted Answer": submitted_ans, "Raw Output": ans_raw})
         payload.append({"task_id": tid, "submitted_answer": submitted_ans})
     if not payload:
         logger.warning("Agent did not produce any valid answers to submit.")
-        return "Agent did not produce any answers.", pd.DataFrame(logs)
-    logger.info(f"Submitting {len(payload)} answers...")
-    # submit answers (unchanged, uses extracted answer)
     try:
         submit_payload = {"username": username, "agent_code": agent_code, "answers": payload}
-        logger.debug(f"Submission Payload: {submit_payload}") # Log payload for debugging if needed
         post = requests.post(
             f"{DEFAULT_API_URL}/submit",
             json=submit_payload,
-            timeout=60
         )
-        post.raise_for_status()
         result = post.json()
-        logger.info(f"Submission Result: {result}")
         score_percent = result.get('score', 'N/A')
-        # Ensure score is formatted reasonably if it's a number
-        try:
-            score_percent = f"{float(score_percent):.2f}" if score_percent != 'N/A' else 'N/A'
-        except (ValueError, TypeError):
-             pass # Keep as 'N/A' or original string if conversion fails
         status = (
             f"Submission Successful!\n"
-            f"User: {result.get('username')}\n"
             f"Score: {score_percent}%\n"
-            f"({result.get('correct_count','?')}/"
-            f"{result.get('total_attempted','?')})\n"
-            f"Message: {result.get('message','')}"
         )
-        return status, pd.DataFrame(logs)
     except requests.exceptions.RequestException as e:
-        logger.exception("Submit failed")
-        # Try to get more info from the response if possible
         error_details = str(e)
         if e.response is not None:
-             error_details += f" | Status Code: {e.response.status_code} | Response: {e.response.text[:500]}" # Limit response size
-        return f"Submission Failed: {error_details}", pd.DataFrame(logs)
     except Exception as e:
-        logger.exception("Submit failed")
-        return f"Submission Failed with unexpected error: {e}", pd.DataFrame(logs)
 # --- Gradio App ---
-# (No changes needed here)
 with gr.Blocks() as demo:
     gr.Markdown("# SmolAgent GAIA Runner (using GitHub Models) 🚀")
     gr.Markdown("""
 **Instructions:**
-1. Clone this space.
-2. In Settings → Secrets, add `GITHUB_TOKEN` (your GitHub access token with appropriate permissions for GitHub Models).
-3. Optionally, set `MODEL_ID` if you want to use a model other than the default (e.g., `openai/gpt-4o`). Verify the correct model identifier for GitHub Models.
-4. Log in to Hugging Face.
-5. Click **Run Evaluation & Submit All Answers**.
 """)
     gr.LoginButton()
     btn = gr.Button("Run Evaluation & Submit All Answers")
-    out_status = gr.Textbox(label="Status", lines=5, interactive=False)
-    out_table  = gr.DataFrame(label="Questions & Answers", wrap=True, height=400) # Increased height maybe
-    btn.click(run_and_submit_all, outputs=[out_status, out_table])
 if __name__ == "__main__":
     if not GITHUB_TOKEN:
-        logger.error("GITHUB_TOKEN environment variable not set. Cannot start.")
-    else:
-        logger.info("Launching Gradio App...")
-        # share=True needed for public link as mentioned in logs
-        # debug=True provides more verbose Gradio logging if needed
-        demo.launch(debug=True, share=True)

 import os
 import logging
+import traceback # Import traceback for better error logging
 import gradio as gr
 import requests
 from openai import OpenAI
 from smolagents import CodeAgent, DuckDuckGoSearchTool, tool
 from smolagents.models import OpenAIServerModel
 # --- Logging ---
 logger = logging.getLogger(__name__)
 # --- Constants ---
+DEFAULT_API_URL   = "https://agents-course-unit4-scoring.hf.space"
 # --- GitHub Models Configuration ---
 GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
     raise RuntimeError("Please set GITHUB_TOKEN in your Space secrets.")
 GITHUB_ENDPOINT = "https://models.github.ai/inference"
+MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-4o-mini") # Using mini as per logs
+# --- Configure OpenAI SDK (Optional) ---
+# Less critical if tools don't directly use it
 try:
     client = OpenAI(
         base_url=GITHUB_ENDPOINT,
         api_key=GITHUB_TOKEN,
     )
 except Exception as e:
+    logger.error(f"Ignoring error during optional OpenAI client init for GitHub Models: {e}")
     pass
 # --- Tools ---
 # Instantiate the search tool ONCE
 def duckduckgo_search(query: str) -> str:
     """
     Performs a DuckDuckGo search for the given query and returns the results.
+    Use this for general web searches.
     Args:
         query (str): The search query.
     Returns:
+        str: The search results, or an error message.
     """
+    logger.info(f"Executing duckduckgo_search with query: {query}")
     try:
         # Call the instantiated search tool
+        result = search_tool_instance(query=query)
+        logger.info(f"DuckDuckGo search returned {len(result)} characters.")
+        # Maybe truncate long results if they cause issues downstream?
+        # max_len = 2000
+        # if len(result) > max_len:
+        #     logger.warning(f"Truncating DuckDuckGo result from {len(result)} to {max_len} chars.")
+        #     result = result[:max_len] + "... (truncated)"
+        return result
     except Exception as e:
         logger.exception(f"DuckDuckGoSearchTool failed for query: {query}")
         return f"Search Error: {e}"
 @tool
 def summarize_query(query: str) -> str:
     """
+    Reframes an unclear search query to improve relevance. Often useful before calling duckduckgo_search if the initial query is vague.
     Args:
         query (str): The original search query.
     Returns:
+        str: A concise, improved version prepended with 'Summarize and reframe:'.
     """
+    logger.info(f"Executing summarize_query with query: {query}")
+    # This still doesn't use an LLM, it's just a placeholder/reframing instruction
     return f"Summarize and reframe: {query}"
 @tool
 def wikipedia_search(page: str) -> str:
     """
+    Fetches the summary extract of an English Wikipedia page. Use specific page titles.
     Args:
+        page (str): The exact Wikipedia page title (e.g., 'Mercedes_Sosa', 'List_of_Mercedes_Sosa_albums'). Spaces will be replaced by underscores.
     Returns:
+        str: The page’s extract text or an error message (e.g., 'Wikipedia page '[page]' not found.').
     """
+    page_safe = page.replace(" ", "_")
+    logger.info(f"Executing wikipedia_search with page: {page} (URL-safe: {page_safe})")
     try:
+        url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{page_safe}"
+        # Add a more specific user agent if running in HF Spaces
+        space_id = os.getenv("SPACE_ID", "unknown-space")
+        headers = {'User-Agent': f'SmolAgentGAIARunner/1.1 ({space_id})'}
+        r = requests.get(url, headers=headers, timeout=12)
         r.raise_for_status() # Raises HTTPError for 4xx/5xx
         data = r.json()
         extract = data.get("extract", "")
+        if not extract:
+             # Handle disambiguation or empty pages
+             page_title = data.get("title", page)
+             page_type = data.get("type", "standard")
+             if page_type == "disambiguation":
+                 logger.warning(f"Wikipedia page '{page_title}' is a disambiguation page.")
+                 # Try to get description which might list options
+                 description = data.get("description", "disambiguation page.")
+                 return f"Wikipedia page '{page_title}' is a {description}. Try a more specific page title."
+             else: # Standard page but no extract
+                 logger.warning(f"Wikipedia page '{page_title}' found, but has no summary extract.")
+                 return f"Wikipedia page '{page_title}' found, but has no summary extract."
+        logger.info(f"Wikipedia search for '{page}' returned {len(extract)} characters.")
         return extract
     except requests.exceptions.HTTPError as e:
         if e.response.status_code == 404:
+             logger.warning(f"Wikipedia page not found: {page_safe}")
+             return f"Wikipedia page '{page_safe}' not found."
         else:
+             logger.exception(f"Wikipedia lookup failed for page: {page_safe} with status {e.response.status_code}")
+             return f"Wikipedia HTTP error {e.response.status_code} for page '{page_safe}': {e}"
+    except requests.exceptions.RequestException as e:
+        logger.exception(f"Wikipedia network request failed for page: {page_safe}")
+        return f"Wikipedia network error for page '{page_safe}': {e}"
     except Exception as e:
+        logger.exception(f"Unexpected Wikipedia lookup error for page: {page_safe}")
+        return f"Unexpected Wikipedia error for page '{page_safe}': {e}"
 # --- ReACT Prompt ---
+# *** THIS IS THE CRITICAL FIX: Ensure the tool name here matches the @tool function ***
 instruction_prompt = """
 You are a ReACT agent with three tools:
+ • duckduckgo_search(query: str)  # Correct function name
  • wikipedia_search(page: str)
  • summarize_query(query: str)
 Internally, for each question:
+1. Thought: Decide which tool is most appropriate. If searching the web, use duckduckgo_search. If looking for encyclopedic info on a specific topic/entity, try wikipedia_search first with the most likely page title. If a search or lookup fails or returns irrelevant info, think about why and try reformulating the query or using a different tool. Maybe use summarize_query on a complex question before searching.
+2. Action: Call the chosen tool with the correct arguments. For wikipedia_search, use page titles like 'Entity_Name' or 'List_of_Entity_Albums'.
+3. Observation: Record the result returned by the tool. Note error messages like 'page not found' or 'Search Error'.
+4. Thought: Analyze the observation. Was the information found? Is it relevant? If not, what should be the next step? Try duckduckgo_search if Wikipedia failed? Try a different Wikipedia page title (e.g., 'List_of_Mercedes_Sosa_albums' instead of 'Mercedes_Sosa_discography')? If search results are messy, maybe try summarize_query on the topic and search again?
+5. Action: Execute the next action based on the thought.
+6. Repeat steps 3-5 until the answer is found or you determine it cannot be found with the available tools.
+7. Thought: Synthesize all observations into a final answer based *only* on the information gathered.
+Finally, output your answer with the following template *exactly*:
 FINAL ANSWER: [YOUR FINAL ANSWER].
 YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
+If you are asked for a number, output only the number (e.g., 42). No commas in numbers (e.g., 1000 not 1,000). No units ($ or %).
+If you are asked for a string, use minimal words, no articles (a, an, the), no abbreviations (e.g., New York City not NYC). Write digits as words (e.g., seven not 7) unless the question implies numerical output.
+If you are asked for a comma separated list, apply the above rules to each element. Example: red,blue,three.
 """
 # --- Build the Agent with OpenAIServerModel pointing to GitHub Models ---
 try:
     model = OpenAIServerModel(
         model_id=MODEL_ID,
         api_key=GITHUB_TOKEN,
+        base_url=GITHUB_ENDPOINT,
+        # Add timeout if needed, e.g., request_timeout=60
+        # Add model_kwargs if needed, e.g. model_kwargs={'temperature': 0.5}
     )
+    logger.info(f"Configured OpenAIServerModel(id={MODEL_ID}, endpoint={GITHUB_ENDPOINT})")
 except Exception as e:
+    logger.exception("Failed to configure OpenAIServerModel")
     raise RuntimeError(f"Could not configure SmolAgents model for GitHub endpoint: {e}") from e
+# Pass the list of FUNCTION objects decorated with @tool
 smart_agent = CodeAgent(
+    tools=[duckduckgo_search, wikipedia_search, summarize_query],
     model=model
 )
+logger.info(f"CodeAgent initialized with tools: {[t.__name__ for t in smart_agent.tools]}")
 # --- Gradio Wrapper ---
 class BasicAgent:
     def __init__(self):
+        logger.info(f"BasicAgent initialized, using SmolAgent with model {MODEL_ID}")
     def __call__(self, question: str) -> str:
+        question = question.strip()
+        if not question:
+            logger.error("Agent called with empty question.")
             return "AGENT ERROR: empty question"
+        # Use the updated instruction_prompt
+        prompt = instruction_prompt.strip() + "\n\nQUESTION: " + question
+        # Log the exact prompt being sent (optional, can be verbose)
+        # logger.debug(f"--- Sending Prompt to Agent ---\n{prompt}\n-----------------------------")
         try:
+            logger.info(f"Running agent for question: '{question}'")
+            # The agent uses the 'model' instance and tools configured above
             result = smart_agent.run(prompt)
+            # Log the raw result (optional, can be verbose)
+            # logger.debug(f"--- Raw Agent Result ---\n{result}\n--------------------------")
+            logger.info(f"Agent finished run for question: '{question}'")
+            # Basic check if the agent failed to produce a final answer format
             if "FINAL ANSWER:" not in result:
+                 logger.warning(f"Agent output for question '{question}' did not contain 'FINAL ANSWER:'. Raw output: {result}")
+                 # Decide how to handle this - return error or raw output?
+                 # Returning raw output might be better for debugging but fail submission check.
+                 # Let's return a specific error for submission.
+                 return f"AGENT ERROR: Malformed response - No 'FINAL ANSWER:' block found."
+            return result # Return the full raw output including thought process and FINAL ANSWER
         except Exception as e:
+            logger.exception(f"Agent run failed for question '{question}'")
+            # Get traceback details
+            tb_str = traceback.format_exc()
+            return f"AGENT ERROR: Exception during run: {e}\nTraceback:\n{tb_str}"
 # --- Submission Logic ---
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     if not profile:
+        logger.warning("Submission attempt failed: User not logged in.")
+        return "Please log in to Hugging Face to submit.", None
     username   = profile.username
     space_id   = os.getenv("SPACE_ID", "")
+    if not space_id:
+        logger.warning("SPACE_ID environment variable not set. Agent code URL will be incomplete.")
+    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Agent code URL unavailable (SPACE_ID not set)"
+    logger.info(f"Starting evaluation run for user '{username}'")
+    agent = BasicAgent()
+    # Fetch questions
     try:
+        logger.info(f"Fetching questions from {DEFAULT_API_URL}/questions")
+        resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=20)
         resp.raise_for_status()
         questions_data = resp.json()
         if not isinstance(questions_data, list):
+             logger.error(f"Fetched questions is not a list: {type(questions_data)}")
+             return f"Error: Fetched questions format is incorrect (expected list, got {type(questions_data)}).", None
         questions = questions_data or []
+        logger.info(f"Fetched {len(questions)} questions successfully.")
     except Exception as e:
+        logger.exception("Failed to fetch questions")
         return f"Error fetching questions: {e}", None
+    if not questions:
+        logger.warning("No questions fetched or questions list is empty.")
+        return "No questions were fetched from the server.", None
     logs, payload = [], []
+    question_count = len(questions)
+    for i, item in enumerate(questions):
         if not isinstance(item, dict):
+             logger.warning(f"Skipping invalid question item (not a dict): {item}")
              continue
         tid = item.get("task_id")
         q   = item.get("question")
             logger.warning(f"Skipping question with missing task_id or question: {item}")
             continue
+        logger.info(f"Processing question {i+1}/{question_count} - Task ID: {tid}")
         ans_raw = agent(q) # Run the agent
         # Extract only the final answer part for submission
         final_ans_marker = "FINAL ANSWER:"
+        submitted_ans = f"ERROR (Agent did not produce output with {final_ans_marker})" # Default if parsing fails
         if final_ans_marker in ans_raw:
+            # Split and take the part *after* the marker
             submitted_ans = ans_raw.split(final_ans_marker, 1)[1].strip()
+            # Optional: Basic validation/cleanup of the extracted answer?
+            # e.g., remove leading/trailing quotes if not needed
+            # submitted_ans = submitted_ans.strip(' "')
+        elif "AGENT ERROR:" in ans_raw:
+             # If agent returned an error string, submit that
+             submitted_ans = ans_raw # Keep the AGENT ERROR message
+             logger.warning(f"Agent returned an error for Task ID {tid}: {submitted_ans}")
         else:
+             logger.warning(f"Could not extract final answer from raw output for Task ID {tid}. Raw: {ans_raw[:500]}...") # Log snippet
+        logger.info(f"Task ID: {tid}, Question: '{q}', Submitted Answer: '{submitted_ans}'")
+        # Store more info for the Gradio table, including the raw output for debugging
+        logs.append({
+            "Task ID": tid,
+            "Question": q,
+            "Submitted Answer": submitted_ans,
+            "Agent Raw Output": ans_raw # Show the full thought process in the table
+        })
         payload.append({"task_id": tid, "submitted_answer": submitted_ans})
     if not payload:
         logger.warning("Agent did not produce any valid answers to submit.")
+        # Check if logs have entries to display potential errors
+        if logs:
+            return "Agent ran but did not produce any answers in the expected format.", pd.DataFrame(logs)
+        else:
+            return "Agent did not produce any answers.", None
+    logger.info(f"Submitting {len(payload)} answers for user '{username}'...")
+    # Submit answers
     try:
         submit_payload = {"username": username, "agent_code": agent_code, "answers": payload}
+        # logger.debug(f"Submission Payload: {submit_payload}") # Careful logging PII
         post = requests.post(
             f"{DEFAULT_API_URL}/submit",
             json=submit_payload,
+            timeout=90 # Increased timeout for submission
         )
+        post.raise_for_status() # Check for HTTP errors from submission endpoint
         result = post.json()
+        logger.info(f"Submission successful. Result: {result}")
         score_percent = result.get('score', 'N/A')
+        try: # Format score nicely
+            score_percent = f"{float(score_percent):.2f}" if isinstance(score_percent, (int, float)) else score_percent
+        except (ValueError, TypeError): pass
         status = (
             f"Submission Successful!\n"
+            f"User: {result.get('username', 'N/A')}\n"
             f"Score: {score_percent}%\n"
+            f"Correct: {result.get('correct_count','?')} / Attempted: {result.get('total_attempted','?')}\n"
+            f"Message: {result.get('message','(No message)')}"
         )
+        # Update logs DataFrame with final status if needed, though usually not necessary
+        return status, pd.DataFrame(logs) # Return status and the detailed logs
     except requests.exceptions.RequestException as e:
+        logger.exception("Submission request failed")
         error_details = str(e)
         if e.response is not None:
+             error_details += f" | Status Code: {e.response.status_code} | Response: {e.response.text[:500]}"
+        return f"Submission Failed: {error_details}", pd.DataFrame(logs) # Return error and logs
     except Exception as e:
+        logger.exception("Submission failed with unexpected error")
+        return f"Submission Failed with unexpected error: {e}", pd.DataFrame(logs) # Return error and logs
 # --- Gradio App ---
 with gr.Blocks() as demo:
     gr.Markdown("# SmolAgent GAIA Runner (using GitHub Models) 🚀")
     gr.Markdown("""
 **Instructions:**
+1. Ensure `GITHUB_TOKEN` secret is set. Optionally set `MODEL_ID`.
+2. Log in to Hugging Face below.
+3. Click **Run Evaluation & Submit All Answers**.
+4. Check the Status and the Questions & Answers table for results. The raw agent output includes the thinking process.
 """)
     gr.LoginButton()
     btn = gr.Button("Run Evaluation & Submit All Answers")
+    out_status = gr.Textbox(label="Submission Status", lines=5, interactive=False)
+    # *** FIX: Remove the 'height' argument ***
+    out_table  = gr.DataFrame(
+        label="Questions & Answers Log",
+        wrap=True,
+        # Add headers if you want to control column names/order explicitly
+        headers=["Task ID", "Question", "Submitted Answer", "Agent Raw Output"],
+        column_widths=["10%", "30%", "20%", "40%"] # Adjust widths as needed
+    )
+    btn.click(run_and_submit_all, outputs=[out_status, out_table], api_name="run_submit") # Add api_name
 if __name__ == "__main__":
     if not GITHUB_TOKEN:
+        logger.error("GITHUB_TOKEN environment variable not set. Cannot start effectively.")
+        # Optionally raise error or exit? For now, just log.
+    logger.info("Launching Gradio App...")
+    # share=True is needed for public link if running on HF Spaces
+    # debug=True provides more verbose Gradio logging
+    demo.launch(debug=True, share=True)