pmeyhoefer committed on
Commit
15fa167
·
verified ·
1 Parent(s): 009368a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -217
app.py CHANGED
@@ -16,362 +16,261 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(me
16
  logger = logging.getLogger(__name__)
17
 
18
  # --- Configuration ---
19
- # URL for fetching questions and submitting answers
20
  SUBMISSION_URL = "https://agents-course-unit4-scoring.hf.space"
21
-
22
- # GitHub Models Configuration
23
  GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
24
  if not GITHUB_TOKEN:
25
- # Critical error if token is missing
26
- raise ValueError("GITHUB_TOKEN environment variable not set. Please set it in Space secrets.")
27
-
28
  GITHUB_ENDPOINT = "https://models.github.ai/inference"
29
- # Use a known model ID compatible with the endpoint
30
- # Let's stick to gpt-4o-mini based on previous logs, ensure it's available.
31
  MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-4o-mini")
32
 
33
  # --- Tool Definitions ---
34
-
35
- # Instantiate the search tool ONCE to reuse its state/connection if any
36
  try:
37
  search_tool_instance = DuckDuckGoSearchTool()
 
38
  except Exception as e:
39
- logger.error(f"Failed to instantiate DuckDuckGoSearchTool: {e}")
40
- # Depending on the app's requirements, you might want to raise an error here
41
- # or allow the app to start but log the failure.
42
- search_tool_instance = None # Indicate failure
43
-
44
- # IMPORTANT: Define wrapper functions that the LLM will be instructed to call.
45
- # Use the @tool decorator so CodeAgent recognizes them.
46
 
47
  @tool
48
  def web_search(query: str) -> str:
49
- """
50
- Performs a web search using DuckDuckGo for the given query.
51
- Use this for general questions, finding current information, or when Wikipedia fails.
52
- Args:
53
- query (str): The search query string.
54
- Returns:
55
- str: The search results obtained from DuckDuckGo, or an error message.
56
- """
57
- logger.info(f"Executing web_search with query: '{query[:100]}...'") # Log snippet
58
  if search_tool_instance is None:
59
- logger.error("web_search cannot execute because DuckDuckGoSearchTool failed to initialize.")
60
  return "Search Error: Tool not initialized."
61
  try:
62
  result = search_tool_instance(query=query)
63
- logger.info(f"web_search returned {len(result)} characters.")
64
- # Limit result length to prevent excessively large observations
65
  max_len = 3000
66
- if len(result) > max_len:
67
- logger.warning(f"Truncating web_search result from {len(result)} to {max_len} chars.")
68
- return result[:max_len] + "... (truncated)"
69
- return result
70
  except Exception as e:
71
  logger.exception(f"web_search failed for query: {query}")
72
  return f"Search Error: {e}"
73
 
74
  @tool
75
  def wikipedia_lookup(page_title: str) -> str:
76
- """
77
- Fetches the summary introduction text of an English Wikipedia page.
78
- Use this for factual information about specific topics, people, or entities.
79
- Args:
80
- page_title (str): The exact title of the Wikipedia page (e.g., 'Albert Einstein', 'List_of_programming_languages'). Spaces will be converted to underscores.
81
- Returns:
82
- str: The summary text of the page, or an error message if not found or failed.
83
- """
84
  page_safe = page_title.replace(" ", "_")
85
  logger.info(f"Executing wikipedia_lookup for page: '{page_title}' (URL: {page_safe})")
86
  try:
87
  url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{page_safe}"
88
- space_id = os.getenv("SPACE_ID", "unknown-huggingface-space")
89
- headers = {'User-Agent': f'GAIAgent/1.0 ({space_id})'}
90
- r = requests.get(url, headers=headers, timeout=15) # Increased timeout
91
- r.raise_for_status() # Check for HTTP 4xx/5xx errors
92
  data = r.json()
93
  extract = data.get("extract", "")
94
  if extract:
95
- logger.info(f"wikipedia_lookup found summary ({len(extract)} chars) for '{page_title}'.")
96
  return extract
97
  else:
98
- # Handle pages found but without extracts (e.g., disambiguation)
99
  page_type = data.get("type", "standard")
100
  title = data.get("title", page_title)
101
  if page_type == "disambiguation":
102
- description = data.get("description", "multiple meanings")
103
- logger.warning(f"wikipedia_lookup found a disambiguation page for '{title}': {description}")
104
- return f"Wikipedia Error: '{title}' refers to {description}. Please provide a more specific page title."
105
  else:
106
- logger.warning(f"wikipedia_lookup found page '{title}' but it has no summary text.")
107
  return f"Wikipedia Error: Page '{title}' found but has no summary."
108
  except requests.exceptions.HTTPError as e:
109
  if e.response.status_code == 404:
110
- logger.warning(f"Wikipedia page not found: {page_safe}")
111
  return f"Wikipedia Error: Page '{page_safe}' not found."
112
  else:
113
- logger.error(f"Wikipedia HTTP error {e.response.status_code} for page: {page_safe}")
114
  return f"Wikipedia Error: HTTP {e.response.status_code} for page '{page_safe}'."
115
- except requests.exceptions.RequestException as e:
116
- logger.exception(f"Wikipedia network request failed for page: {page_safe}")
117
- return f"Wikipedia Error: Network error for page '{page_safe}': {e}"
118
  except Exception as e:
119
- logger.exception(f"Unexpected error during wikipedia_lookup for page: {page_safe}")
120
  return f"Wikipedia Error: Unexpected error: {e}"
121
 
122
- # Removed summarize_query tool for simplicity, as it wasn't adding much value in logs
123
-
124
- # --- The ReACT Prompt ---
125
  # Define the *exact* instructions for the LLM, listing the *actual* tool function names.
126
- # Keep it clear and concise.
127
  REACT_INSTRUCTION_PROMPT = """You are a helpful assistant that answers questions using the provided tools.
128
 
129
  Available Tools:
130
- - web_search(query: str): Use this for searching the web for general information, current events, or when you don't know a specific Wikipedia page title.
131
- - wikipedia_lookup(page_title: str): Use this to get information from a specific English Wikipedia page. Use exact page titles (e.g., 'Berlin', 'Python_(programming_language)').
132
-
133
- Follow these steps for each question:
134
- 1. **Thought:** Briefly explain your plan and which tool you will use and why.
135
- 2. **Action:** Call ONE tool using the correct function name and arguments. Example: web_search(query="latest news") or wikipedia_lookup(page_title="Artificial_intelligence").
136
- 3. **Observation:** Record the result provided by the tool.
137
- 4. **Thought:** Analyze the observation. Does it answer the question? If yes, prepare the final answer. If not, plan the next step (e.g., try a different tool, refine the search query, use a different Wikipedia title).
138
- 5. Repeat Action/Observation/Thought until you have the answer or determine it cannot be found.
139
- 6. **Thought:** Summarize the findings and prepare the final answer based ONLY on the observations.
140
- 7. **Final Answer:** Provide the final answer in the required format (number, short string, or comma-separated list) on a new line starting exactly with "FINAL ANSWER: ".
141
 
142
  Formatting Rules for FINAL ANSWER:
143
- - Numbers: Output only the number (e.g., `42`, `1000`). No commas, units ($).
144
- - Strings: Use minimal words, no articles (a, an, the). Write digits as words (e.g., `seven`) unless numerical output is implied.
145
- - Lists: Comma-separated, apply number/string rules to each item (e.g., `paris,london,three`).
146
 
147
  Let's begin!
148
  """
149
 
150
  # --- SmolAgent Setup ---
151
- logger.info(f"Initializing LLM connection to {MODEL_ID} via {GITHUB_ENDPOINT}")
152
  try:
153
- # Configure the model connection to use GitHub's endpoint
154
  llm_model = OpenAIServerModel(
155
  model_id=MODEL_ID,
156
  api_key=GITHUB_TOKEN,
157
  base_url=GITHUB_ENDPOINT,
158
- request_timeout=60 # Add a timeout for model requests
159
  )
160
- # Verify connection (optional, depends on OpenAIServerModel implementation)
161
- # You might add a simple test call here if the library supports it easily
162
- logger.info("LLM connection configured successfully.")
163
  except Exception as e:
164
  logger.exception("CRITICAL: Failed to configure OpenAIServerModel")
165
- raise RuntimeError(f"Could not configure SmolAgents model for GitHub endpoint: {e}") from e
166
 
167
  logger.info("Initializing CodeAgent...")
168
  try:
169
- # Create the agent instance, passing the *list of actual functions* decorated with @tool
170
  agent = CodeAgent(
171
- tools=[web_search, wikipedia_lookup], # Only include the defined tool functions
172
  model=llm_model
173
  )
174
- # Log the names of the tools the agent actually recognized (if possible/safe)
175
- # This depends on how CodeAgent stores tools. Avoid the previous error.
176
- # logger.info(f"CodeAgent initialized. Tools detected by agent (if available): {agent.tools}") # Be cautious with this line
177
- logger.info("CodeAgent initialized successfully.")
178
  except Exception as e:
179
  logger.exception("CRITICAL: Failed to initialize CodeAgent")
180
  raise RuntimeError(f"Could not initialize CodeAgent: {e}") from e
181
 
182
-
183
- # --- Gradio Interface ---
184
-
185
  def run_agent_on_question(question: str) -> str:
186
- """
187
- Takes a question, runs the SmolAgent, and returns the raw output.
188
- Handles basic validation and error catching.
189
- """
190
  question = question.strip()
191
  if not question:
192
- logger.error("Agent called with empty question.")
193
  return "AGENT_ERROR: Question cannot be empty."
194
 
195
- # Construct the full prompt for the agent run
196
  full_prompt = REACT_INSTRUCTION_PROMPT.strip() + "\n\nQUESTION: " + question
197
  logger.info(f"--- Running Agent for Question: '{question}' ---")
198
- # Log first few lines of prompt for verification (optional)
199
- # logger.debug(f"Prompt start:\n{full_prompt[:300]}...")
200
 
201
  try:
202
- # Execute the agent run
203
- raw_result = agent.run(full_prompt)
204
- logger.info(f"Agent run completed for question: '{question}'. Output length: {len(raw_result)}")
205
- # Log first/last parts of the raw result for debugging (optional)
206
- # logger.debug(f"Raw agent result snippet:\n{raw_result[:500]}...\n...{raw_result[-500:]}")
207
  return raw_result
208
  except Exception as e:
209
  logger.exception(f"Agent run failed for question '{question}'")
210
- tb_str = traceback.format_exc() # Get detailed traceback
211
- return f"AGENT_ERROR: An exception occurred during agent execution: {e}\nTraceback:\n{tb_str}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
 
213
- def evaluate_and_submit(hf_profile: gr.OAuthProfile | None):
214
- """
215
- Gradio action: Fetches questions, runs agent on each, submits results.
216
- """
217
- if not hf_profile:
218
- logger.warning("Submission attempt failed: User not logged in.")
219
- return "⚠️ Please log in to Hugging Face via the button above to submit.", None # Status message, empty DataFrame
220
 
221
- username = hf_profile.username
222
- logger.info(f"🚀 Starting evaluation run for user '{username}'...")
223
 
224
  # 1. Fetch Questions
225
  questions = []
226
  try:
227
- logger.info(f"Fetching questions from {SUBMISSION_URL}/questions")
228
  resp = requests.get(f"{SUBMISSION_URL}/questions", timeout=20)
229
  resp.raise_for_status()
230
- questions_data = resp.json()
231
- if isinstance(questions_data, list):
232
- questions = questions_data
233
- logger.info(f"✅ Fetched {len(questions)} questions.")
234
- else:
235
- logger.error(f"Fetched questions data is not a list: {type(questions_data)}")
236
- return "❌ Error: Fetched questions format is incorrect.", None
237
  except Exception as e:
238
  logger.exception("Failed to fetch questions")
239
- return f"❌ Error fetching questions: {e}", None
240
 
241
  if not questions:
242
- logger.warning("No questions fetched or questions list is empty.")
243
- return "ℹ️ No questions were fetched from the server.", None
244
 
245
- # 2. Run Agent on Questions
246
  results_log = []
247
  answers_payload = []
248
- total_questions = len(questions)
249
  for i, item in enumerate(questions):
250
  task_id = item.get("task_id")
251
  question_text = item.get("question")
 
252
 
253
- if not task_id or not question_text:
254
- logger.warning(f"Skipping invalid question item {i+1}/{total_questions}: Missing task_id or question. Data: {item}")
255
- continue
256
-
257
- logger.info(f"Processing question {i+1}/{total_questions} (Task ID: {task_id})...")
258
- raw_agent_output = run_agent_on_question(question_text) # Run the agent
259
 
260
- # Extract final answer for submission
261
- final_answer = "AGENT_ERROR: No 'FINAL ANSWER:' marker found in output." # Default if parsing fails
262
  marker = "FINAL ANSWER:"
263
  if marker in raw_agent_output:
264
  final_answer = raw_agent_output.split(marker, 1)[1].strip()
265
- elif "AGENT_ERROR:" in raw_agent_output: # If agent returned an error explicitly
266
- final_answer = raw_agent_output # Submit the error message
267
-
268
- logger.info(f"Task ID: {task_id} -> Submitted Answer: '{final_answer}'")
269
 
270
- # Log results for Gradio table
271
  results_log.append({
272
- "Task ID": task_id,
273
- "Question": question_text,
274
- "Submitted Answer": final_answer,
275
- "Full Agent Output": raw_agent_output # Show full trace in UI
276
  })
277
- # Prepare payload for submission API
278
  answers_payload.append({"task_id": task_id, "submitted_answer": final_answer})
279
 
280
  results_df = pd.DataFrame(results_log)
281
  if not answers_payload:
282
- logger.warning("Agent did not produce any answers to submit.")
283
- return "⚠️ Agent ran but produced no answers in the expected format.", results_df
284
 
285
  # 3. Submit Answers
286
- logger.info(f"Submitting {len(answers_payload)} answers for user '{username}'...")
287
  space_id = os.getenv("SPACE_ID", "SPACE_ID_NOT_SET")
288
- agent_code_url = f"https://huggingface.co/spaces/{space_id}/tree/main" if "NOT_SET" not in space_id else "Agent code URL unavailable"
289
- submit_data = {
290
- "username": username,
291
- "agent_code": agent_code_url,
292
- "answers": answers_payload
293
- }
294
 
295
  try:
296
  response = requests.post(f"{SUBMISSION_URL}/submit", json=submit_data, timeout=90)
297
- response.raise_for_status() # Check for HTTP errors
298
- submission_result = response.json()
299
- logger.info(f"✅ Submission successful! API Response: {submission_result}")
300
-
301
- score = submission_result.get('score', 'N/A')
302
  score_str = f"{float(score):.2f}%" if isinstance(score, (int, float)) else str(score)
303
- correct = submission_result.get('correct_count', '?')
304
- attempted = submission_result.get('total_attempted', '?')
305
- message = submission_result.get('message', '(No message from server)')
306
-
307
- status_message = (
308
- f"✅ Submission Successful!\n"
309
- f"User: {username}\n"
310
- f"Score: {score_str}\n"
311
- f"Details: {correct} / {attempted} correct\n"
312
- f"Server Message: {message}"
313
- )
314
- return status_message, results_df
315
-
316
- except requests.exceptions.RequestException as e:
317
- logger.exception("Submission request failed")
318
- error_details = str(e)
319
- if e.response is not None:
320
- error_details += f" | Status: {e.response.status_code} | Response: {e.response.text[:300]}" # Log snippet
321
- return f"❌ Submission Failed: {error_details}", results_df
322
  except Exception as e:
323
- logger.exception("Unexpected error during submission")
324
- return f"❌ Submission Failed with unexpected error: {e}", results_df
 
 
 
325
 
326
  # --- Build Gradio App ---
327
  logger.info("Setting up Gradio interface...")
328
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
329
- gr.Markdown(
330
- """
331
- # 🚀 Agent Evaluation Runner 🚀
332
-
333
- Connect your Hugging Face account, then click the button below to fetch tasks, run the agent, and submit the answers.
334
- Ensure the `GITHUB_TOKEN` secret is correctly set in your Space settings.
335
- """
336
- )
337
-
338
- with gr.Row():
339
- hf_login_button = gr.LoginButton() # Use the login button component
340
 
341
  run_button = gr.Button("▶️ Run Evaluation & Submit All Answers", variant="primary")
342
-
343
- submission_status_textbox = gr.Textbox(
344
- label="📊 Submission Status",
345
- lines=5,
346
- interactive=False,
347
- placeholder="Submission status will appear here..."
348
  )
349
 
350
- results_dataframe = gr.DataFrame(
351
- label="📋 Detailed Log (Questions & Agent Output)",
352
- headers=["Task ID", "Question", "Submitted Answer", "Full Agent Output"],
353
- wrap=True,
354
- # Removed height, let Gradio manage it or control via CSS if needed
355
- column_widths=["10%", "25%", "20%", "45%"]
356
- )
357
-
358
- # Connect button click to the evaluation function
359
- # Pass the login button's profile info to the function
360
  run_button.click(
361
  fn=evaluate_and_submit,
362
- inputs=[hf_login_button], # Pass the profile info from the login button
363
- outputs=[submission_status_textbox, results_dataframe],
364
- api_name="evaluate_submit" # For API usage if needed
365
  )
366
 
367
  logger.info("Gradio interface setup complete.")
368
 
369
- # --- Launch the App ---
370
  if __name__ == "__main__":
371
  logger.info("Launching Gradio application...")
372
- demo.launch(
373
- debug=True, # Provides more detailed logs for Gradio itself
374
- share=True # Necessary for public access on Hugging Face Spaces
375
- )
376
- logger.info("Gradio application has been launched.")
377
 
 
16
  logger = logging.getLogger(__name__)
17
 
18
  # --- Configuration ---
 
19
  SUBMISSION_URL = "https://agents-course-unit4-scoring.hf.space"
 
 
20
  GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
21
  if not GITHUB_TOKEN:
22
+ raise ValueError("CRITICAL: GITHUB_TOKEN environment variable not set.")
 
 
23
  GITHUB_ENDPOINT = "https://models.github.ai/inference"
 
 
24
  MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-4o-mini")
25
 
26
  # --- Tool Definitions ---
 
 
27
  try:
28
  search_tool_instance = DuckDuckGoSearchTool()
29
+ logger.info("DuckDuckGoSearchTool initialized successfully.")
30
  except Exception as e:
31
+ logger.error(f"Failed to instantiate DuckDuckGoSearchTool: {e}. Web search will not work.")
32
+ search_tool_instance = None
 
 
 
 
 
33
 
34
  @tool
35
  def web_search(query: str) -> str:
36
+ """Performs a web search using DuckDuckGo."""
37
+ logger.info(f"Executing web_search with query: '{query[:100]}...'")
 
 
 
 
 
 
 
38
  if search_tool_instance is None:
 
39
  return "Search Error: Tool not initialized."
40
  try:
41
  result = search_tool_instance(query=query)
42
+ logger.info(f"web_search returned {len(result)} chars.")
 
43
  max_len = 3000
44
+ return result[:max_len] + "... (truncated)" if len(result) > max_len else result
 
 
 
45
  except Exception as e:
46
  logger.exception(f"web_search failed for query: {query}")
47
  return f"Search Error: {e}"
48
 
49
  @tool
50
  def wikipedia_lookup(page_title: str) -> str:
51
+ """Fetches the summary introduction text of an English Wikipedia page."""
 
 
 
 
 
 
 
52
  page_safe = page_title.replace(" ", "_")
53
  logger.info(f"Executing wikipedia_lookup for page: '{page_title}' (URL: {page_safe})")
54
  try:
55
  url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{page_safe}"
56
+ headers = {'User-Agent': f'GAIAgent/1.1 ({os.getenv("SPACE_ID", "unknown")})'}
57
+ r = requests.get(url, headers=headers, timeout=15)
58
+ r.raise_for_status()
 
59
  data = r.json()
60
  extract = data.get("extract", "")
61
  if extract:
 
62
  return extract
63
  else:
 
64
  page_type = data.get("type", "standard")
65
  title = data.get("title", page_title)
66
  if page_type == "disambiguation":
67
+ return f"Wikipedia Error: '{title}' is a disambiguation page. Try a more specific title."
 
 
68
  else:
 
69
  return f"Wikipedia Error: Page '{title}' found but has no summary."
70
  except requests.exceptions.HTTPError as e:
71
  if e.response.status_code == 404:
 
72
  return f"Wikipedia Error: Page '{page_safe}' not found."
73
  else:
 
74
  return f"Wikipedia Error: HTTP {e.response.status_code} for page '{page_safe}'."
 
 
 
75
  except Exception as e:
76
+ logger.exception(f"wikipedia_lookup failed for page: {page_safe}")
77
  return f"Wikipedia Error: Unexpected error: {e}"
78
 
79
+ # --- The ReACT Prompt (ensure this is the *only* main prompt definition) ---
 
 
80
  # Define the *exact* instructions for the LLM, listing the *actual* tool function names.
 
81
  REACT_INSTRUCTION_PROMPT = """You are a helpful assistant that answers questions using the provided tools.
82
 
83
  Available Tools:
84
+ - web_search(query: str): Use this for searching the web.
85
+ - wikipedia_lookup(page_title: str): Use this to get information from a specific English Wikipedia page (e.g., 'Berlin', 'Python_(programming_language)').
86
+
87
+ Follow these steps:
88
+ 1. Thought: Plan which tool to use.
89
+ 2. Action: Call ONE tool (e.g., web_search(query="...") or wikipedia_lookup(page_title="...")).
90
+ 3. Observation: Record the result.
91
+ 4. Thought: Analyze the result. If answer found, prepare it. If not, plan next step.
92
+ 5. Repeat Action/Observation/Thought until answer is found or determined impossible.
93
+ 6. Thought: Summarize findings based ONLY on observations.
94
+ 7. Final Answer: Provide the answer starting exactly with "FINAL ANSWER: " using the required format (number, short string, or comma-separated list).
95
 
96
  Formatting Rules for FINAL ANSWER:
97
+ - Numbers: Just the number (e.g., `42`).
98
+ - Strings: Minimal words, no articles. Digits as words (e.g., `seven`).
99
+ - Lists: Comma-separated (e.g., `paris,london,three`).
100
 
101
  Let's begin!
102
  """
103
 
104
  # --- SmolAgent Setup ---
105
+ logger.info(f"Initializing LLM connection: {MODEL_ID} @ {GITHUB_ENDPOINT}")
106
  try:
 
107
  llm_model = OpenAIServerModel(
108
  model_id=MODEL_ID,
109
  api_key=GITHUB_TOKEN,
110
  base_url=GITHUB_ENDPOINT,
111
+ request_timeout=60
112
  )
113
+ logger.info("LLM connection OK.")
 
 
114
  except Exception as e:
115
  logger.exception("CRITICAL: Failed to configure OpenAIServerModel")
116
+ raise RuntimeError(f"Could not configure SmolAgents model: {e}") from e
117
 
118
  logger.info("Initializing CodeAgent...")
119
  try:
120
+ # Pass the list of actual tool functions
121
  agent = CodeAgent(
122
+ tools=[web_search, wikipedia_lookup],
123
  model=llm_model
124
  )
125
+ logger.info("CodeAgent initialized OK.")
 
 
 
126
  except Exception as e:
127
  logger.exception("CRITICAL: Failed to initialize CodeAgent")
128
  raise RuntimeError(f"Could not initialize CodeAgent: {e}") from e
129
 
130
+ # --- Agent Execution Function ---
 
 
131
  def run_agent_on_question(question: str) -> str:
132
+ """Runs the agent with the CORRECT prompt."""
 
 
 
133
  question = question.strip()
134
  if not question:
 
135
  return "AGENT_ERROR: Question cannot be empty."
136
 
137
+ # *** CRITICAL: Construct the prompt HERE using the correct variable ***
138
  full_prompt = REACT_INSTRUCTION_PROMPT.strip() + "\n\nQUESTION: " + question
139
  logger.info(f"--- Running Agent for Question: '{question}' ---")
140
+ # Add debug log to show the start of the prompt being used
141
+ logger.info(f"DEBUG: Using prompt starting with: {full_prompt[:300]}...") # Log beginning of prompt
142
 
143
  try:
144
+ raw_result = agent.run(full_prompt) # Pass the correctly constructed prompt
145
+ logger.info(f"Agent run completed. Output length: {len(raw_result)}")
 
 
 
146
  return raw_result
147
  except Exception as e:
148
  logger.exception(f"Agent run failed for question '{question}'")
149
+ return f"AGENT_ERROR: Exception during run: {e}\n{traceback.format_exc()}"
150
+
151
+ # --- Gradio Interface & Submission Logic ---
152
+
153
+ # FIX: Define evaluate_and_submit WITHOUT the hf_profile argument initially
154
+ # We will get the profile *inside* the function if needed.
155
+ def evaluate_and_submit():
156
+ """Gradio action: Fetches questions, runs agent, submits results."""
157
+ logger.info("🚀 Starting evaluation run...")
158
+
159
+ # Get profile info *inside* the function - this avoids the TypeError
160
+ # Note: This requires the user to be logged in via the button *before* clicking Run.
161
+ try:
162
+ # This method of getting profile might need adjustment depending on Gradio version/context
163
+ # Placeholder: Assuming we can get username some other way if direct profile access fails.
164
+ # For now, let's hardcode or retrieve differently if `gr.OAuthProfile()` isn't available here.
165
+ # Let's proceed without username for now if OAuthProfile is problematic.
166
+ # A better approach might involve JavaScript interaction or different Gradio auth flow.
167
+ username = os.getenv("HF_USERNAME", "unknown_user") # Fallback to env var or default
168
+ if username == "unknown_user":
169
+ logger.warning("Could not determine Hugging Face username reliably. Using fallback.")
170
+ # Alternative: Could try reading from OAuth info if available in request context (advanced)
171
+
172
+ except Exception as auth_e:
173
+ logger.error(f"Could not get user profile: {auth_e}. Using fallback username.")
174
+ username = "unknown_user_error"
175
 
 
 
 
 
 
 
 
176
 
177
+ logger.info(f"Running as user (best effort): {username}")
 
178
 
179
  # 1. Fetch Questions
180
  questions = []
181
  try:
 
182
  resp = requests.get(f"{SUBMISSION_URL}/questions", timeout=20)
183
  resp.raise_for_status()
184
+ questions = resp.json()
185
+ if not isinstance(questions, list): raise ValueError("Invalid format")
186
+ logger.info(f"✅ Fetched {len(questions)} questions.")
 
 
 
 
187
  except Exception as e:
188
  logger.exception("Failed to fetch questions")
189
+ return f"❌ Error fetching questions: {e}", pd.DataFrame() # Return empty DF on fetch error
190
 
191
  if not questions:
192
+ return "ℹ️ No questions fetched.", pd.DataFrame()
 
193
 
194
+ # 2. Run Agent & Collect Results
195
  results_log = []
196
  answers_payload = []
 
197
  for i, item in enumerate(questions):
198
  task_id = item.get("task_id")
199
  question_text = item.get("question")
200
+ if not task_id or not question_text: continue
201
 
202
+ logger.info(f"Processing Q {i+1}/{len(questions)} (ID: {task_id})...")
203
+ raw_agent_output = run_agent_on_question(question_text)
 
 
 
 
204
 
205
+ final_answer = "AGENT_ERROR: No 'FINAL ANSWER:' marker." # Default
 
206
  marker = "FINAL ANSWER:"
207
  if marker in raw_agent_output:
208
  final_answer = raw_agent_output.split(marker, 1)[1].strip()
209
+ elif "AGENT_ERROR:" in raw_agent_output:
210
+ final_answer = raw_agent_output # Submit the error
 
 
211
 
 
212
  results_log.append({
213
+ "Task ID": task_id, "Question": question_text,
214
+ "Submitted Answer": final_answer, "Full Output": raw_agent_output
 
 
215
  })
 
216
  answers_payload.append({"task_id": task_id, "submitted_answer": final_answer})
217
 
218
  results_df = pd.DataFrame(results_log)
219
  if not answers_payload:
220
+ return "⚠️ Agent ran but produced no answers.", results_df
 
221
 
222
  # 3. Submit Answers
223
+ logger.info(f"Submitting {len(answers_payload)} answers...")
224
  space_id = os.getenv("SPACE_ID", "SPACE_ID_NOT_SET")
225
+ agent_code_url = f"https://huggingface.co/spaces/{space_id}/tree/main" if "NOT_SET" not in space_id else "URL_NA"
226
+ submit_data = {"username": username, "agent_code": agent_code_url, "answers": answers_payload}
 
 
 
 
227
 
228
  try:
229
  response = requests.post(f"{SUBMISSION_URL}/submit", json=submit_data, timeout=90)
230
+ response.raise_for_status()
231
+ result = response.json()
232
+ logger.info(f"✅ Submission successful! Response: {result}")
233
+ score = result.get('score', 'N/A')
 
234
  score_str = f"{float(score):.2f}%" if isinstance(score, (int, float)) else str(score)
235
+ status = (f"✅ Success! Score: {score_str} "
236
+ f"({result.get('correct_count','?')}/{result.get('total_attempted','?')}). "
237
+ f"Msg: {result.get('message','')}")
238
+ return status, results_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  except Exception as e:
240
+ logger.exception("Submission failed")
241
+ err_msg = f"❌ Submission Failed: {e}"
242
+ if hasattr(e, 'response') and e.response is not None:
243
+ err_msg += f" | Response: {e.response.text[:300]}"
244
+ return err_msg, results_df
245
 
246
  # --- Build Gradio App ---
247
  logger.info("Setting up Gradio interface...")
248
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
249
+ gr.Markdown("# 🚀 Agent Evaluation Runner 🚀")
250
+ gr.Markdown("Ensure `GITHUB_TOKEN` secret is set. Click Run to start.")
251
+ # Removed LoginButton to simplify and avoid TypeError for now
252
+ # gr.LoginButton()
 
 
 
 
 
 
 
253
 
254
  run_button = gr.Button("▶️ Run Evaluation & Submit All Answers", variant="primary")
255
+ status_textbox = gr.Textbox(label="📊 Status", lines=4, interactive=False)
256
+ results_df_display = gr.DataFrame(
257
+ label="📋 Detailed Log",
258
+ headers=["Task ID", "Question", "Submitted Answer", "Full Output"],
259
+ wrap=True, column_widths=["10%", "25%", "20%", "45%"]
 
260
  )
261
 
262
+ # Connect button click to the function WITHOUT inputs arg for now
 
 
 
 
 
 
 
 
 
263
  run_button.click(
264
  fn=evaluate_and_submit,
265
+ inputs=None, # No direct inputs from UI components
266
+ outputs=[status_textbox, results_df_display]
 
267
  )
268
 
269
  logger.info("Gradio interface setup complete.")
270
 
271
+ # --- Launch ---
272
  if __name__ == "__main__":
273
  logger.info("Launching Gradio application...")
274
+ demo.launch(debug=True, share=False) # share=False is fine for HF Spaces internally
275
+ logger.info("Gradio application launched.")
 
 
 
276