pmeyhoefer committed on
Commit
15fa167
·
verified ·
1 Parent(s): 009368a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -217
app.py CHANGED
@@ -16,362 +16,261 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(me
16
  logger = logging.getLogger(__name__)
17
 
18
  # --- Configuration ---
19
- # URL for fetching questions and submitting answers
20
  SUBMISSION_URL = "https://agents-course-unit4-scoring.hf.space"
21
-
22
- # GitHub Models Configuration
23
  GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
24
  if not GITHUB_TOKEN:
25
- # Critical error if token is missing
26
- raise ValueError("GITHUB_TOKEN environment variable not set. Please set it in Space secrets.")
27
-
28
  GITHUB_ENDPOINT = "https://models.github.ai/inference"
29
- # Use a known model ID compatible with the endpoint
30
- # Let's stick to gpt-4o-mini based on previous logs, ensure it's available.
31
  MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-4o-mini")
32
 
33
  # --- Tool Definitions ---
34
-
35
- # Instantiate the search tool ONCE to reuse its state/connection if any
36
  try:
37
  search_tool_instance = DuckDuckGoSearchTool()
 
38
  except Exception as e:
39
- logger.error(f"Failed to instantiate DuckDuckGoSearchTool: {e}")
40
- # Depending on the app's requirements, you might want to raise an error here
41
- # or allow the app to start but log the failure.
42
- search_tool_instance = None # Indicate failure
43
-
44
- # IMPORTANT: Define wrapper functions that the LLM will be instructed to call.
45
- # Use the @tool decorator so CodeAgent recognizes them.
46
 
47
  @tool
48
  def web_search(query: str) -> str:
49
- """
50
- Performs a web search using DuckDuckGo for the given query.
51
- Use this for general questions, finding current information, or when Wikipedia fails.
52
- Args:
53
- query (str): The search query string.
54
- Returns:
55
- str: The search results obtained from DuckDuckGo, or an error message.
56
- """
57
- logger.info(f"Executing web_search with query: '{query[:100]}...'") # Log snippet
58
  if search_tool_instance is None:
59
- logger.error("web_search cannot execute because DuckDuckGoSearchTool failed to initialize.")
60
  return "Search Error: Tool not initialized."
61
  try:
62
  result = search_tool_instance(query=query)
63
- logger.info(f"web_search returned {len(result)} characters.")
64
- # Limit result length to prevent excessively large observations
65
  max_len = 3000
66
- if len(result) > max_len:
67
- logger.warning(f"Truncating web_search result from {len(result)} to {max_len} chars.")
68
- return result[:max_len] + "... (truncated)"
69
- return result
70
  except Exception as e:
71
  logger.exception(f"web_search failed for query: {query}")
72
  return f"Search Error: {e}"
73
 
74
  @tool
75
  def wikipedia_lookup(page_title: str) -> str:
76
- """
77
- Fetches the summary introduction text of an English Wikipedia page.
78
- Use this for factual information about specific topics, people, or entities.
79
- Args:
80
- page_title (str): The exact title of the Wikipedia page (e.g., 'Albert Einstein', 'List_of_programming_languages'). Spaces will be converted to underscores.
81
- Returns:
82
- str: The summary text of the page, or an error message if not found or failed.
83
- """
84
  page_safe = page_title.replace(" ", "_")
85
  logger.info(f"Executing wikipedia_lookup for page: '{page_title}' (URL: {page_safe})")
86
  try:
87
  url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{page_safe}"
88
- space_id = os.getenv("SPACE_ID", "unknown-huggingface-space")
89
- headers = {'User-Agent': f'GAIAgent/1.0 ({space_id})'}
90
- r = requests.get(url, headers=headers, timeout=15) # Increased timeout
91
- r.raise_for_status() # Check for HTTP 4xx/5xx errors
92
  data = r.json()
93
  extract = data.get("extract", "")
94
  if extract:
95
- logger.info(f"wikipedia_lookup found summary ({len(extract)} chars) for '{page_title}'.")
96
  return extract
97
  else:
98
- # Handle pages found but without extracts (e.g., disambiguation)
99
  page_type = data.get("type", "standard")
100
  title = data.get("title", page_title)
101
  if page_type == "disambiguation":
102
- description = data.get("description", "multiple meanings")
103
- logger.warning(f"wikipedia_lookup found a disambiguation page for '{title}': {description}")
104
- return f"Wikipedia Error: '{title}' refers to {description}. Please provide a more specific page title."
105
  else:
106
- logger.warning(f"wikipedia_lookup found page '{title}' but it has no summary text.")
107
  return f"Wikipedia Error: Page '{title}' found but has no summary."
108
  except requests.exceptions.HTTPError as e:
109
  if e.response.status_code == 404:
110
- logger.warning(f"Wikipedia page not found: {page_safe}")
111
  return f"Wikipedia Error: Page '{page_safe}' not found."
112
  else:
113
- logger.error(f"Wikipedia HTTP error {e.response.status_code} for page: {page_safe}")
114
  return f"Wikipedia Error: HTTP {e.response.status_code} for page '{page_safe}'."
115
- except requests.exceptions.RequestException as e:
116
- logger.exception(f"Wikipedia network request failed for page: {page_safe}")
117
- return f"Wikipedia Error: Network error for page '{page_safe}': {e}"
118
  except Exception as e:
119
- logger.exception(f"Unexpected error during wikipedia_lookup for page: {page_safe}")
120
  return f"Wikipedia Error: Unexpected error: {e}"
121
 
122
- # Removed summarize_query tool for simplicity, as it wasn't adding much value in logs
123
-
124
- # --- The ReACT Prompt ---
125
  # Define the *exact* instructions for the LLM, listing the *actual* tool function names.
126
- # Keep it clear and concise.
127
  REACT_INSTRUCTION_PROMPT = """You are a helpful assistant that answers questions using the provided tools.
128
 
129
  Available Tools:
130
- - web_search(query: str): Use this for searching the web for general information, current events, or when you don't know a specific Wikipedia page title.
131
- - wikipedia_lookup(page_title: str): Use this to get information from a specific English Wikipedia page. Use exact page titles (e.g., 'Berlin', 'Python_(programming_language)').
132
-
133
- Follow these steps for each question:
134
- 1. **Thought:** Briefly explain your plan and which tool you will use and why.
135
- 2. **Action:** Call ONE tool using the correct function name and arguments. Example: web_search(query="latest news") or wikipedia_lookup(page_title="Artificial_intelligence").
136
- 3. **Observation:** Record the result provided by the tool.
137
- 4. **Thought:** Analyze the observation. Does it answer the question? If yes, prepare the final answer. If not, plan the next step (e.g., try a different tool, refine the search query, use a different Wikipedia title).
138
- 5. Repeat Action/Observation/Thought until you have the answer or determine it cannot be found.
139
- 6. **Thought:** Summarize the findings and prepare the final answer based ONLY on the observations.
140
- 7. **Final Answer:** Provide the final answer in the required format (number, short string, or comma-separated list) on a new line starting exactly with "FINAL ANSWER: ".
141
 
142
  Formatting Rules for FINAL ANSWER:
143
- - Numbers: Output only the number (e.g., `42`, `1000`). No commas, units ($).
144
- - Strings: Use minimal words, no articles (a, an, the). Write digits as words (e.g., `seven`) unless numerical output is implied.
145
- - Lists: Comma-separated, apply number/string rules to each item (e.g., `paris,london,three`).
146
 
147
  Let's begin!
148
  """
149
 
150
  # --- SmolAgent Setup ---
151
- logger.info(f"Initializing LLM connection to {MODEL_ID} via {GITHUB_ENDPOINT}")
152
  try:
153
- # Configure the model connection to use GitHub's endpoint
154
  llm_model = OpenAIServerModel(
155
  model_id=MODEL_ID,
156
  api_key=GITHUB_TOKEN,
157
  base_url=GITHUB_ENDPOINT,
158
- request_timeout=60 # Add a timeout for model requests
159
  )
160
- # Verify connection (optional, depends on OpenAIServerModel implementation)
161
- # You might add a simple test call here if the library supports it easily
162
- logger.info("LLM connection configured successfully.")
163
  except Exception as e:
164
  logger.exception("CRITICAL: Failed to configure OpenAIServerModel")
165
- raise RuntimeError(f"Could not configure SmolAgents model for GitHub endpoint: {e}") from e
166
 
167
  logger.info("Initializing CodeAgent...")
168
  try:
169
- # Create the agent instance, passing the *list of actual functions* decorated with @tool
170
  agent = CodeAgent(
171
- tools=[web_search, wikipedia_lookup], # Only include the defined tool functions
172
  model=llm_model
173
  )
174
- # Log the names of the tools the agent actually recognized (if possible/safe)
175
- # This depends on how CodeAgent stores tools. Avoid the previous error.
176
- # logger.info(f"CodeAgent initialized. Tools detected by agent (if available): {agent.tools}") # Be cautious with this line
177
- logger.info("CodeAgent initialized successfully.")
178
  except Exception as e:
179
  logger.exception("CRITICAL: Failed to initialize CodeAgent")
180
  raise RuntimeError(f"Could not initialize CodeAgent: {e}") from e
181
 
182
-
183
- # --- Gradio Interface ---
184
-
185
  def run_agent_on_question(question: str) -> str:
186
- """
187
- Takes a question, runs the SmolAgent, and returns the raw output.
188
- Handles basic validation and error catching.
189
- """
190
  question = question.strip()
191
  if not question:
192
- logger.error("Agent called with empty question.")
193
  return "AGENT_ERROR: Question cannot be empty."
194
 
195
- # Construct the full prompt for the agent run
196
  full_prompt = REACT_INSTRUCTION_PROMPT.strip() + "\n\nQUESTION: " + question
197
  logger.info(f"--- Running Agent for Question: '{question}' ---")
198
- # Log first few lines of prompt for verification (optional)
199
- # logger.debug(f"Prompt start:\n{full_prompt[:300]}...")
200
 
201
  try:
202
- # Execute the agent run
203
- raw_result = agent.run(full_prompt)
204
- logger.info(f"Agent run completed for question: '{question}'. Output length: {len(raw_result)}")
205
- # Log first/last parts of the raw result for debugging (optional)
206
- # logger.debug(f"Raw agent result snippet:\n{raw_result[:500]}...\n...{raw_result[-500:]}")
207
  return raw_result
208
  except Exception as e:
209
  logger.exception(f"Agent run failed for question '{question}'")
210
- tb_str = traceback.format_exc() # Get detailed traceback
211
- return f"AGENT_ERROR: An exception occurred during agent execution: {e}\nTraceback:\n{tb_str}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
 
213
- def evaluate_and_submit(hf_profile: gr.OAuthProfile | None):
214
- """
215
- Gradio action: Fetches questions, runs agent on each, submits results.
216
- """
217
- if not hf_profile:
218
- logger.warning("Submission attempt failed: User not logged in.")
219
- return "⚠️ Please log in to Hugging Face via the button above to submit.", None # Status message, empty DataFrame
220
 
221
- username = hf_profile.username
222
- logger.info(f"🚀 Starting evaluation run for user '{username}'...")
223
 
224
  # 1. Fetch Questions
225
  questions = []
226
  try:
227
- logger.info(f"Fetching questions from {SUBMISSION_URL}/questions")
228
  resp = requests.get(f"{SUBMISSION_URL}/questions", timeout=20)
229
  resp.raise_for_status()
230
- questions_data = resp.json()
231
- if isinstance(questions_data, list):
232
- questions = questions_data
233
- logger.info(f"✅ Fetched {len(questions)} questions.")
234
- else:
235
- logger.error(f"Fetched questions data is not a list: {type(questions_data)}")
236
- return "❌ Error: Fetched questions format is incorrect.", None
237
  except Exception as e:
238
  logger.exception("Failed to fetch questions")
239
- return f"❌ Error fetching questions: {e}", None
240
 
241
  if not questions:
242
- logger.warning("No questions fetched or questions list is empty.")
243
- return "ℹ️ No questions were fetched from the server.", None
244
 
245
- # 2. Run Agent on Questions
246
  results_log = []
247
  answers_payload = []
248
- total_questions = len(questions)
249
  for i, item in enumerate(questions):
250
  task_id = item.get("task_id")
251
  question_text = item.get("question")
 
252
 
253
- if not task_id or not question_text:
254
- logger.warning(f"Skipping invalid question item {i+1}/{total_questions}: Missing task_id or question. Data: {item}")
255
- continue
256
-
257
- logger.info(f"Processing question {i+1}/{total_questions} (Task ID: {task_id})...")
258
- raw_agent_output = run_agent_on_question(question_text) # Run the agent
259
 
260
- # Extract final answer for submission
261
- final_answer = "AGENT_ERROR: No 'FINAL ANSWER:' marker found in output." # Default if parsing fails
262
  marker = "FINAL ANSWER:"
263
  if marker in raw_agent_output:
264
  final_answer = raw_agent_output.split(marker, 1)[1].strip()
265
- elif "AGENT_ERROR:" in raw_agent_output: # If agent returned an error explicitly
266
- final_answer = raw_agent_output # Submit the error message
267
-
268
- logger.info(f"Task ID: {task_id} -> Submitted Answer: '{final_answer}'")
269
 
270
- # Log results for Gradio table
271
  results_log.append({
272
- "Task ID": task_id,
273
- "Question": question_text,
274
- "Submitted Answer": final_answer,
275
- "Full Agent Output": raw_agent_output # Show full trace in UI
276
  })
277
- # Prepare payload for submission API
278
  answers_payload.append({"task_id": task_id, "submitted_answer": final_answer})
279
 
280
  results_df = pd.DataFrame(results_log)
281
  if not answers_payload:
282
- logger.warning("Agent did not produce any answers to submit.")
283
- return "⚠️ Agent ran but produced no answers in the expected format.", results_df
284
 
285
  # 3. Submit Answers
286
- logger.info(f"Submitting {len(answers_payload)} answers for user '{username}'...")
287
  space_id = os.getenv("SPACE_ID", "SPACE_ID_NOT_SET")
288
- agent_code_url = f"https://huggingface.co/spaces/{space_id}/tree/main" if "NOT_SET" not in space_id else "Agent code URL unavailable"
289
- submit_data = {
290
- "username": username,
291
- "agent_code": agent_code_url,
292
- "answers": answers_payload
293
- }
294
 
295
  try:
296
  response = requests.post(f"{SUBMISSION_URL}/submit", json=submit_data, timeout=90)
297
- response.raise_for_status() # Check for HTTP errors
298
- submission_result = response.json()
299
- logger.info(f"✅ Submission successful! API Response: {submission_result}")
300
-
301
- score = submission_result.get('score', 'N/A')
302
  score_str = f"{float(score):.2f}%" if isinstance(score, (int, float)) else str(score)
303
- correct = submission_result.get('correct_count', '?')
304
- attempted = submission_result.get('total_attempted', '?')
305
- message = submission_result.get('message', '(No message from server)')
306
-
307
- status_message = (
308
- f"✅ Submission Successful!\n"
309
- f"User: {username}\n"
310
- f"Score: {score_str}\n"
311
- f"Details: {correct} / {attempted} correct\n"
312
- f"Server Message: {message}"
313
- )
314
- return status_message, results_df
315
-
316
- except requests.exceptions.RequestException as e:
317
- logger.exception("Submission request failed")
318
- error_details = str(e)
319
- if e.response is not None:
320
- error_details += f" | Status: {e.response.status_code} | Response: {e.response.text[:300]}" # Log snippet
321
- return f"❌ Submission Failed: {error_details}", results_df
322
  except Exception as e:
323
- logger.exception("Unexpected error during submission")
324
- return f"❌ Submission Failed with unexpected error: {e}", results_df
 
 
 
325
 
326
  # --- Build Gradio App ---
327
  logger.info("Setting up Gradio interface...")
328
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
329
- gr.Markdown(
330
- """
331
- # 🚀 Agent Evaluation Runner 🚀
332
-
333
- Connect your Hugging Face account, then click the button below to fetch tasks, run the agent, and submit the answers.
334
- Ensure the `GITHUB_TOKEN` secret is correctly set in your Space settings.
335
- """
336
- )
337
-
338
- with gr.Row():
339
- hf_login_button = gr.LoginButton() # Use the login button component
340
 
341
  run_button = gr.Button("▶️ Run Evaluation & Submit All Answers", variant="primary")
342
-
343
- submission_status_textbox = gr.Textbox(
344
- label="📊 Submission Status",
345
- lines=5,
346
- interactive=False,
347
- placeholder="Submission status will appear here..."
348
  )
349
 
350
- results_dataframe = gr.DataFrame(
351
- label="📋 Detailed Log (Questions & Agent Output)",
352
- headers=["Task ID", "Question", "Submitted Answer", "Full Agent Output"],
353
- wrap=True,
354
- # Removed height, let Gradio manage it or control via CSS if needed
355
- column_widths=["10%", "25%", "20%", "45%"]
356
- )
357
-
358
- # Connect button click to the evaluation function
359
- # Pass the login button's profile info to the function
360
  run_button.click(
361
  fn=evaluate_and_submit,
362
- inputs=[hf_login_button], # Pass the profile info from the login button
363
- outputs=[submission_status_textbox, results_dataframe],
364
- api_name="evaluate_submit" # For API usage if needed
365
  )
366
 
367
  logger.info("Gradio interface setup complete.")
368
 
369
- # --- Launch the App ---
370
  if __name__ == "__main__":
371
  logger.info("Launching Gradio application...")
372
- demo.launch(
373
- debug=True, # Provides more detailed logs for Gradio itself
374
- share=True # Necessary for public access on Hugging Face Spaces
375
- )
376
- logger.info("Gradio application has been launched.")
377
 
 
16
  logger = logging.getLogger(__name__)
17
 
18
  # --- Configuration ---
 
19
  SUBMISSION_URL = "https://agents-course-unit4-scoring.hf.space"
 
 
20
  GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
21
  if not GITHUB_TOKEN:
22
+ raise ValueError("CRITICAL: GITHUB_TOKEN environment variable not set.")
 
 
23
  GITHUB_ENDPOINT = "https://models.github.ai/inference"
 
 
24
  MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-4o-mini")
25
 
26
  # --- Tool Definitions ---
 
 
27
  try:
28
  search_tool_instance = DuckDuckGoSearchTool()
29
+ logger.info("DuckDuckGoSearchTool initialized successfully.")
30
  except Exception as e:
31
+ logger.error(f"Failed to instantiate DuckDuckGoSearchTool: {e}. Web search will not work.")
32
+ search_tool_instance = None
 
 
 
 
 
33
 
34
  @tool
35
  def web_search(query: str) -> str:
36
+ """Performs a web search using DuckDuckGo."""
37
+ logger.info(f"Executing web_search with query: '{query[:100]}...'")
 
 
 
 
 
 
 
38
  if search_tool_instance is None:
 
39
  return "Search Error: Tool not initialized."
40
  try:
41
  result = search_tool_instance(query=query)
42
+ logger.info(f"web_search returned {len(result)} chars.")
 
43
  max_len = 3000
44
+ return result[:max_len] + "... (truncated)" if len(result) > max_len else result
 
 
 
45
  except Exception as e:
46
  logger.exception(f"web_search failed for query: {query}")
47
  return f"Search Error: {e}"
48
 
49
  @tool
50
  def wikipedia_lookup(page_title: str) -> str:
51
+ """Fetches the summary introduction text of an English Wikipedia page."""
 
 
 
 
 
 
 
52
  page_safe = page_title.replace(" ", "_")
53
  logger.info(f"Executing wikipedia_lookup for page: '{page_title}' (URL: {page_safe})")
54
  try:
55
  url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{page_safe}"
56
+ headers = {'User-Agent': f'GAIAgent/1.1 ({os.getenv("SPACE_ID", "unknown")})'}
57
+ r = requests.get(url, headers=headers, timeout=15)
58
+ r.raise_for_status()
 
59
  data = r.json()
60
  extract = data.get("extract", "")
61
  if extract:
 
62
  return extract
63
  else:
 
64
  page_type = data.get("type", "standard")
65
  title = data.get("title", page_title)
66
  if page_type == "disambiguation":
67
+ return f"Wikipedia Error: '{title}' is a disambiguation page. Try a more specific title."
 
 
68
  else:
 
69
  return f"Wikipedia Error: Page '{title}' found but has no summary."
70
  except requests.exceptions.HTTPError as e:
71
  if e.response.status_code == 404:
 
72
  return f"Wikipedia Error: Page '{page_safe}' not found."
73
  else:
 
74
  return f"Wikipedia Error: HTTP {e.response.status_code} for page '{page_safe}'."
 
 
 
75
  except Exception as e:
76
+ logger.exception(f"wikipedia_lookup failed for page: {page_safe}")
77
  return f"Wikipedia Error: Unexpected error: {e}"
78
 
79
+ # --- The ReACT Prompt (ensure this is the *only* main prompt definition) ---
 
 
80
  # Define the *exact* instructions for the LLM, listing the *actual* tool function names.
 
81
  REACT_INSTRUCTION_PROMPT = """You are a helpful assistant that answers questions using the provided tools.
82
 
83
  Available Tools:
84
+ - web_search(query: str): Use this for searching the web.
85
+ - wikipedia_lookup(page_title: str): Use this to get information from a specific English Wikipedia page (e.g., 'Berlin', 'Python_(programming_language)').
86
+
87
+ Follow these steps:
88
+ 1. Thought: Plan which tool to use.
89
+ 2. Action: Call ONE tool (e.g., web_search(query="...") or wikipedia_lookup(page_title="...")).
90
+ 3. Observation: Record the result.
91
+ 4. Thought: Analyze the result. If answer found, prepare it. If not, plan next step.
92
+ 5. Repeat Action/Observation/Thought until answer is found or determined impossible.
93
+ 6. Thought: Summarize findings based ONLY on observations.
94
+ 7. Final Answer: Provide the answer starting exactly with "FINAL ANSWER: " using the required format (number, short string, or comma-separated list).
95
 
96
  Formatting Rules for FINAL ANSWER:
97
+ - Numbers: Just the number (e.g., `42`).
98
+ - Strings: Minimal words, no articles. Digits as words (e.g., `seven`).
99
+ - Lists: Comma-separated (e.g., `paris,london,three`).
100
 
101
  Let's begin!
102
  """
103
 
104
  # --- SmolAgent Setup ---
105
+ logger.info(f"Initializing LLM connection: {MODEL_ID} @ {GITHUB_ENDPOINT}")
106
  try:
 
107
  llm_model = OpenAIServerModel(
108
  model_id=MODEL_ID,
109
  api_key=GITHUB_TOKEN,
110
  base_url=GITHUB_ENDPOINT,
111
+ request_timeout=60
112
  )
113
+ logger.info("LLM connection OK.")
 
 
114
  except Exception as e:
115
  logger.exception("CRITICAL: Failed to configure OpenAIServerModel")
116
+ raise RuntimeError(f"Could not configure SmolAgents model: {e}") from e
117
 
118
  logger.info("Initializing CodeAgent...")
119
  try:
120
+ # Pass the list of actual tool functions
121
  agent = CodeAgent(
122
+ tools=[web_search, wikipedia_lookup],
123
  model=llm_model
124
  )
125
+ logger.info("CodeAgent initialized OK.")
 
 
 
126
  except Exception as e:
127
  logger.exception("CRITICAL: Failed to initialize CodeAgent")
128
  raise RuntimeError(f"Could not initialize CodeAgent: {e}") from e
129
 
130
+ # --- Agent Execution Function ---
 
 
131
  def run_agent_on_question(question: str) -> str:
132
+ """Runs the agent with the CORRECT prompt."""
 
 
 
133
  question = question.strip()
134
  if not question:
 
135
  return "AGENT_ERROR: Question cannot be empty."
136
 
137
+ # *** CRITICAL: Construct the prompt HERE using the correct variable ***
138
  full_prompt = REACT_INSTRUCTION_PROMPT.strip() + "\n\nQUESTION: " + question
139
  logger.info(f"--- Running Agent for Question: '{question}' ---")
140
+ # Add debug log to show the start of the prompt being used
141
+ logger.info(f"DEBUG: Using prompt starting with: {full_prompt[:300]}...") # Log beginning of prompt
142
 
143
  try:
144
+ raw_result = agent.run(full_prompt) # Pass the correctly constructed prompt
145
+ logger.info(f"Agent run completed. Output length: {len(raw_result)}")
 
 
 
146
  return raw_result
147
  except Exception as e:
148
  logger.exception(f"Agent run failed for question '{question}'")
149
+ return f"AGENT_ERROR: Exception during run: {e}\n{traceback.format_exc()}"
150
+
151
+ # --- Gradio Interface & Submission Logic ---
152
+
153
+ # FIX: Define evaluate_and_submit WITHOUT the hf_profile argument initially
154
+ # We will get the profile *inside* the function if needed.
155
+ def evaluate_and_submit():
156
+ """Gradio action: Fetches questions, runs agent, submits results."""
157
+ logger.info("🚀 Starting evaluation run...")
158
+
159
+ # Get profile info *inside* the function - this avoids the TypeError
160
+ # Note: This requires the user to be logged in via the button *before* clicking Run.
161
+ try:
162
+ # This method of getting profile might need adjustment depending on Gradio version/context
163
+ # Placeholder: Assuming we can get username some other way if direct profile access fails.
164
+ # For now, let's hardcode or retrieve differently if `gr.OAuthProfile()` isn't available here.
165
+ # Let's proceed without username for now if OAuthProfile is problematic.
166
+ # A better approach might involve JavaScript interaction or different Gradio auth flow.
167
+ username = os.getenv("HF_USERNAME", "unknown_user") # Fallback to env var or default
168
+ if username == "unknown_user":
169
+ logger.warning("Could not determine Hugging Face username reliably. Using fallback.")
170
+ # Alternative: Could try reading from OAuth info if available in request context (advanced)
171
+
172
+ except Exception as auth_e:
173
+ logger.error(f"Could not get user profile: {auth_e}. Using fallback username.")
174
+ username = "unknown_user_error"
175
 
 
 
 
 
 
 
 
176
 
177
+ logger.info(f"Running as user (best effort): {username}")
 
178
 
179
  # 1. Fetch Questions
180
  questions = []
181
  try:
 
182
  resp = requests.get(f"{SUBMISSION_URL}/questions", timeout=20)
183
  resp.raise_for_status()
184
+ questions = resp.json()
185
+ if not isinstance(questions, list): raise ValueError("Invalid format")
186
+ logger.info(f"✅ Fetched {len(questions)} questions.")
 
 
 
 
187
  except Exception as e:
188
  logger.exception("Failed to fetch questions")
189
+ return f"❌ Error fetching questions: {e}", pd.DataFrame() # Return empty DF on fetch error
190
 
191
  if not questions:
192
+ return "ℹ️ No questions fetched.", pd.DataFrame()
 
193
 
194
+ # 2. Run Agent & Collect Results
195
  results_log = []
196
  answers_payload = []
 
197
  for i, item in enumerate(questions):
198
  task_id = item.get("task_id")
199
  question_text = item.get("question")
200
+ if not task_id or not question_text: continue
201
 
202
+ logger.info(f"Processing Q {i+1}/{len(questions)} (ID: {task_id})...")
203
+ raw_agent_output = run_agent_on_question(question_text)
 
 
 
 
204
 
205
+ final_answer = "AGENT_ERROR: No 'FINAL ANSWER:' marker." # Default
 
206
  marker = "FINAL ANSWER:"
207
  if marker in raw_agent_output:
208
  final_answer = raw_agent_output.split(marker, 1)[1].strip()
209
+ elif "AGENT_ERROR:" in raw_agent_output:
210
+ final_answer = raw_agent_output # Submit the error
 
 
211
 
 
212
  results_log.append({
213
+ "Task ID": task_id, "Question": question_text,
214
+ "Submitted Answer": final_answer, "Full Output": raw_agent_output
 
 
215
  })
 
216
  answers_payload.append({"task_id": task_id, "submitted_answer": final_answer})
217
 
218
  results_df = pd.DataFrame(results_log)
219
  if not answers_payload:
220
+ return "⚠️ Agent ran but produced no answers.", results_df
 
221
 
222
  # 3. Submit Answers
223
+ logger.info(f"Submitting {len(answers_payload)} answers...")
224
  space_id = os.getenv("SPACE_ID", "SPACE_ID_NOT_SET")
225
+ agent_code_url = f"https://huggingface.co/spaces/{space_id}/tree/main" if "NOT_SET" not in space_id else "URL_NA"
226
+ submit_data = {"username": username, "agent_code": agent_code_url, "answers": answers_payload}
 
 
 
 
227
 
228
  try:
229
  response = requests.post(f"{SUBMISSION_URL}/submit", json=submit_data, timeout=90)
230
+ response.raise_for_status()
231
+ result = response.json()
232
+ logger.info(f"✅ Submission successful! Response: {result}")
233
+ score = result.get('score', 'N/A')
 
234
  score_str = f"{float(score):.2f}%" if isinstance(score, (int, float)) else str(score)
235
+ status = (f"✅ Success! Score: {score_str} "
236
+ f"({result.get('correct_count','?')}/{result.get('total_attempted','?')}). "
237
+ f"Msg: {result.get('message','')}")
238
+ return status, results_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  except Exception as e:
240
+ logger.exception("Submission failed")
241
+ err_msg = f"❌ Submission Failed: {e}"
242
+ if hasattr(e, 'response') and e.response is not None:
243
+ err_msg += f" | Response: {e.response.text[:300]}"
244
+ return err_msg, results_df
245
 
246
  # --- Build Gradio App ---
247
  logger.info("Setting up Gradio interface...")
248
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
249
+ gr.Markdown("# 🚀 Agent Evaluation Runner 🚀")
250
+ gr.Markdown("Ensure `GITHUB_TOKEN` secret is set. Click Run to start.")
251
+ # Removed LoginButton to simplify and avoid TypeError for now
252
+ # gr.LoginButton()
 
 
 
 
 
 
 
253
 
254
  run_button = gr.Button("▶️ Run Evaluation & Submit All Answers", variant="primary")
255
+ status_textbox = gr.Textbox(label="📊 Status", lines=4, interactive=False)
256
+ results_df_display = gr.DataFrame(
257
+ label="📋 Detailed Log",
258
+ headers=["Task ID", "Question", "Submitted Answer", "Full Output"],
259
+ wrap=True, column_widths=["10%", "25%", "20%", "45%"]
 
260
  )
261
 
262
+ # Connect button click to the function WITHOUT inputs arg for now
 
 
 
 
 
 
 
 
 
263
  run_button.click(
264
  fn=evaluate_and_submit,
265
+ inputs=None, # No direct inputs from UI components
266
+ outputs=[status_textbox, results_df_display]
 
267
  )
268
 
269
  logger.info("Gradio interface setup complete.")
270
 
271
+ # --- Launch ---
272
  if __name__ == "__main__":
273
  logger.info("Launching Gradio application...")
274
+ demo.launch(debug=True, share=False) # share=False is fine for HF Spaces internally
275
+ logger.info("Gradio application launched.")
 
 
 
276