pmeyhoefer committed on
Commit
009368a
·
verified ·
1 Parent(s): 81d72bd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +288 -282
app.py CHANGED
@@ -1,371 +1,377 @@
1
  import os
2
  import logging
3
- import traceback # Import traceback for better error logging
4
 
5
  import gradio as gr
6
  import requests
7
  import pandas as pd
8
  from openai import OpenAI
9
 
 
10
  from smolagents import CodeAgent, DuckDuckGoSearchTool, tool
11
  from smolagents.models import OpenAIServerModel
12
 
13
- # --- Logging ---
14
- logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
15
  logger = logging.getLogger(__name__)
16
 
17
- # --- Constants ---
18
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
19
 
20
- # --- GitHub Models Configuration ---
21
  GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
22
  if not GITHUB_TOKEN:
23
- raise RuntimeError("Please set GITHUB_TOKEN in your Space secrets.")
 
24
 
25
  GITHUB_ENDPOINT = "https://models.github.ai/inference"
26
- MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-4o-mini") # Using mini as per logs
 
 
27
 
28
- # --- Configure OpenAI SDK (Optional) ---
29
- # Less critical if tools don't directly use it
 
30
  try:
31
- client = OpenAI(
32
- base_url=GITHUB_ENDPOINT,
33
- api_key=GITHUB_TOKEN,
34
- )
35
  except Exception as e:
36
- logger.error(f"Ignoring error during optional OpenAI client init for GitHub Models: {e}")
37
- pass
38
-
39
- # --- Tools ---
40
 
41
- # Instantiate the search tool ONCE
42
- search_tool_instance = DuckDuckGoSearchTool()
43
 
44
  @tool
45
- def duckduckgo_search(query: str) -> str:
46
  """
47
- Performs a DuckDuckGo search for the given query and returns the results.
48
- Use this for general web searches.
49
  Args:
50
- query (str): The search query.
51
  Returns:
52
- str: The search results, or an error message.
53
  """
54
- logger.info(f"Executing duckduckgo_search with query: {query}")
 
 
 
55
  try:
56
- # Call the instantiated search tool
57
  result = search_tool_instance(query=query)
58
- logger.info(f"DuckDuckGo search returned {len(result)} characters.")
59
- # Maybe truncate long results if they cause issues downstream?
60
- # max_len = 2000
61
- # if len(result) > max_len:
62
- # logger.warning(f"Truncating DuckDuckGo result from {len(result)} to {max_len} chars.")
63
- # result = result[:max_len] + "... (truncated)"
64
  return result
65
  except Exception as e:
66
- logger.exception(f"DuckDuckGoSearchTool failed for query: {query}")
67
  return f"Search Error: {e}"
68
 
69
  @tool
70
- def summarize_query(query: str) -> str:
71
- """
72
- Reframes an unclear search query to improve relevance. Often useful before calling duckduckgo_search if the initial query is vague.
73
- Args:
74
- query (str): The original search query.
75
- Returns:
76
- str: A concise, improved version prepended with 'Summarize and reframe:'.
77
- """
78
- logger.info(f"Executing summarize_query with query: {query}")
79
- # This still doesn't use an LLM, it's just a placeholder/reframing instruction
80
- return f"Summarize and reframe: {query}"
81
-
82
- @tool
83
- def wikipedia_search(page: str) -> str:
84
  """
85
- Fetches the summary extract of an English Wikipedia page. Use specific page titles.
 
86
  Args:
87
- page (str): The exact Wikipedia page title (e.g., 'Mercedes_Sosa', 'List_of_Mercedes_Sosa_albums'). Spaces will be replaced by underscores.
88
  Returns:
89
- str: The page's extract text or an error message (e.g., 'Wikipedia page '[page]' not found.').
90
  """
91
- page_safe = page.replace(" ", "_")
92
- logger.info(f"Executing wikipedia_search with page: {page} (URL-safe: {page_safe})")
93
  try:
94
  url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{page_safe}"
95
- # Add a more specific user agent if running in HF Spaces
96
- space_id = os.getenv("SPACE_ID", "unknown-space")
97
- headers = {'User-Agent': f'SmolAgentGAIARunner/1.1 ({space_id})'}
98
- r = requests.get(url, headers=headers, timeout=12)
99
- r.raise_for_status() # Raises HTTPError for 4xx/5xx
100
  data = r.json()
101
  extract = data.get("extract", "")
102
- if not extract:
103
- # Handle disambiguation or empty pages
104
- page_title = data.get("title", page)
105
- page_type = data.get("type", "standard")
106
- if page_type == "disambiguation":
107
- logger.warning(f"Wikipedia page '{page_title}' is a disambiguation page.")
108
- # Try to get description which might list options
109
- description = data.get("description", "disambiguation page.")
110
- return f"Wikipedia page '{page_title}' is a {description}. Try a more specific page title."
111
- else: # Standard page but no extract
112
- logger.warning(f"Wikipedia page '{page_title}' found, but has no summary extract.")
113
- return f"Wikipedia page '{page_title}' found, but has no summary extract."
114
- logger.info(f"Wikipedia search for '{page}' returned {len(extract)} characters.")
115
- return extract
116
  except requests.exceptions.HTTPError as e:
117
  if e.response.status_code == 404:
118
- logger.warning(f"Wikipedia page not found: {page_safe}")
119
- return f"Wikipedia page '{page_safe}' not found."
120
  else:
121
- logger.exception(f"Wikipedia lookup failed for page: {page_safe} with status {e.response.status_code}")
122
- return f"Wikipedia HTTP error {e.response.status_code} for page '{page_safe}': {e}"
123
  except requests.exceptions.RequestException as e:
124
  logger.exception(f"Wikipedia network request failed for page: {page_safe}")
125
- return f"Wikipedia network error for page '{page_safe}': {e}"
126
  except Exception as e:
127
- logger.exception(f"Unexpected Wikipedia lookup error for page: {page_safe}")
128
- return f"Unexpected Wikipedia error for page '{page_safe}': {e}"
129
-
130
-
131
- # --- ReACT Prompt ---
132
- # *** THIS IS THE CRITICAL FIX: Ensure the tool name here matches the @tool function ***
133
- instruction_prompt = """
134
- You are a ReACT agent with three tools:
135
- β€’ duckduckgo_search(query: str) # Correct function name
136
- β€’ wikipedia_search(page: str)
137
- β€’ summarize_query(query: str)
138
- Internally, for each question:
139
- 1. Thought: Decide which tool is most appropriate. If searching the web, use duckduckgo_search. If looking for encyclopedic info on a specific topic/entity, try wikipedia_search first with the most likely page title. If a search or lookup fails or returns irrelevant info, think about why and try reformulating the query or using a different tool. Maybe use summarize_query on a complex question before searching.
140
- 2. Action: Call the chosen tool with the correct arguments. For wikipedia_search, use page titles like 'Entity_Name' or 'List_of_Entity_Albums'.
141
- 3. Observation: Record the result returned by the tool. Note error messages like 'page not found' or 'Search Error'.
142
- 4. Thought: Analyze the observation. Was the information found? Is it relevant? If not, what should be the next step? Try duckduckgo_search if Wikipedia failed? Try a different Wikipedia page title (e.g., 'List_of_Mercedes_Sosa_albums' instead of 'Mercedes_Sosa_discography')? If search results are messy, maybe try summarize_query on the topic and search again?
143
- 5. Action: Execute the next action based on the thought.
144
- 6. Repeat steps 3-5 until the answer is found or you determine it cannot be found with the available tools.
145
- 7. Thought: Synthesize all observations into a final answer based *only* on the information gathered.
146
- Finally, output your answer with the following template *exactly*:
147
- FINAL ANSWER: [YOUR FINAL ANSWER].
148
- YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
149
- If you are asked for a number, output only the number (e.g., 42). No commas in numbers (e.g., 1000 not 1,000). No units ($ or %).
150
- If you are asked for a string, use minimal words, no articles (a, an, the), no abbreviations (e.g., New York City not NYC). Write digits as words (e.g., seven not 7) unless the question implies numerical output.
151
- If you are asked for a comma separated list, apply the above rules to each element. Example: red,blue,three.
 
 
 
 
152
  """
153
 
154
- # --- Build the Agent with OpenAIServerModel pointing to GitHub Models ---
 
155
  try:
156
- model = OpenAIServerModel(
 
157
  model_id=MODEL_ID,
158
  api_key=GITHUB_TOKEN,
159
  base_url=GITHUB_ENDPOINT,
160
- # Add timeout if needed, e.g., request_timeout=60
161
- # Add model_kwargs if needed, e.g. model_kwargs={'temperature': 0.5}
162
  )
163
- logger.info(f"Configured OpenAIServerModel(id={MODEL_ID}, endpoint={GITHUB_ENDPOINT})")
 
 
164
  except Exception as e:
165
- logger.exception("Failed to configure OpenAIServerModel")
166
  raise RuntimeError(f"Could not configure SmolAgents model for GitHub endpoint: {e}") from e
167
 
168
- # Pass the list of FUNCTION objects decorated with @tool
169
- smart_agent = CodeAgent(
170
- tools=[duckduckgo_search, wikipedia_search, summarize_query],
171
- model=model
172
- )
173
- logger.info(f"CodeAgent initialized with tools: {[t.__name__ for t in smart_agent.tools]}")
174
-
175
- # --- Gradio Wrapper ---
176
-
177
- class BasicAgent:
178
- def __init__(self):
179
- logger.info(f"BasicAgent initialized, using SmolAgent with model {MODEL_ID}")
180
-
181
- def __call__(self, question: str) -> str:
182
- question = question.strip()
183
- if not question:
184
- logger.error("Agent called with empty question.")
185
- return "AGENT ERROR: empty question"
186
-
187
- # Use the updated instruction_prompt
188
- prompt = instruction_prompt.strip() + "\n\nQUESTION: " + question
189
- # Log the exact prompt being sent (optional, can be verbose)
190
- # logger.debug(f"--- Sending Prompt to Agent ---\n{prompt}\n-----------------------------")
191
-
192
- try:
193
- logger.info(f"Running agent for question: '{question}'")
194
- # The agent uses the 'model' instance and tools configured above
195
- result = smart_agent.run(prompt)
196
- # Log the raw result (optional, can be verbose)
197
- # logger.debug(f"--- Raw Agent Result ---\n{result}\n--------------------------")
198
- logger.info(f"Agent finished run for question: '{question}'")
199
-
200
- # Basic check if the agent failed to produce a final answer format
201
- if "FINAL ANSWER:" not in result:
202
- logger.warning(f"Agent output for question '{question}' did not contain 'FINAL ANSWER:'. Raw output: {result}")
203
- # Decide how to handle this - return error or raw output?
204
- # Returning raw output might be better for debugging but fail submission check.
205
- # Let's return a specific error for submission.
206
- return f"AGENT ERROR: Malformed response - No 'FINAL ANSWER:' block found."
207
- return result # Return the full raw output including thought process and FINAL ANSWER
208
-
209
- except Exception as e:
210
- logger.exception(f"Agent run failed for question '{question}'")
211
- # Get traceback details
212
- tb_str = traceback.format_exc()
213
- return f"AGENT ERROR: Exception during run: {e}\nTraceback:\n{tb_str}"
214
-
215
- # --- Submission Logic ---
216
-
217
- def run_and_submit_all(profile: gr.OAuthProfile | None):
218
- if not profile:
219
  logger.warning("Submission attempt failed: User not logged in.")
220
- return "Please log in to Hugging Face to submit.", None
221
 
222
- username = profile.username
223
- space_id = os.getenv("SPACE_ID", "")
224
- if not space_id:
225
- logger.warning("SPACE_ID environment variable not set. Agent code URL will be incomplete.")
226
- agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Agent code URL unavailable (SPACE_ID not set)"
227
- logger.info(f"Starting evaluation run for user '{username}'")
228
- agent = BasicAgent()
229
 
230
- # Fetch questions
 
231
  try:
232
- logger.info(f"Fetching questions from {DEFAULT_API_URL}/questions")
233
- resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=20)
234
  resp.raise_for_status()
235
  questions_data = resp.json()
236
- if not isinstance(questions_data, list):
237
- logger.error(f"Fetched questions is not a list: {type(questions_data)}")
238
- return f"Error: Fetched questions format is incorrect (expected list, got {type(questions_data)}).", None
239
- questions = questions_data or []
240
- logger.info(f"Fetched {len(questions)} questions successfully.")
 
241
  except Exception as e:
242
  logger.exception("Failed to fetch questions")
243
- return f"Error fetching questions: {e}", None
244
 
245
  if not questions:
246
  logger.warning("No questions fetched or questions list is empty.")
247
- return "No questions were fetched from the server.", None
248
 
249
- logs, payload = [], []
250
- question_count = len(questions)
 
 
251
  for i, item in enumerate(questions):
252
- if not isinstance(item, dict):
253
- logger.warning(f"Skipping invalid question item (not a dict): {item}")
254
- continue
255
- tid = item.get("task_id")
256
- q = item.get("question")
257
- if not tid or not q:
258
- logger.warning(f"Skipping question with missing task_id or question: {item}")
259
  continue
260
 
261
- logger.info(f"Processing question {i+1}/{question_count} - Task ID: {tid}")
262
- ans_raw = agent(q) # Run the agent
263
-
264
- # Extract only the final answer part for submission
265
- final_ans_marker = "FINAL ANSWER:"
266
- submitted_ans = f"ERROR (Agent did not produce output with {final_ans_marker})" # Default if parsing fails
267
- if final_ans_marker in ans_raw:
268
- # Split and take the part *after* the marker
269
- submitted_ans = ans_raw.split(final_ans_marker, 1)[1].strip()
270
- # Optional: Basic validation/cleanup of the extracted answer?
271
- # e.g., remove leading/trailing quotes if not needed
272
- # submitted_ans = submitted_ans.strip(' "')
273
- elif "AGENT ERROR:" in ans_raw:
274
- # If agent returned an error string, submit that
275
- submitted_ans = ans_raw # Keep the AGENT ERROR message
276
- logger.warning(f"Agent returned an error for Task ID {tid}: {submitted_ans}")
277
- else:
278
- logger.warning(f"Could not extract final answer from raw output for Task ID {tid}. Raw: {ans_raw[:500]}...") # Log snippet
279
-
280
- logger.info(f"Task ID: {tid}, Question: '{q}', Submitted Answer: '{submitted_ans}'")
281
- # Store more info for the Gradio table, including the raw output for debugging
282
- logs.append({
283
- "Task ID": tid,
284
- "Question": q,
285
- "Submitted Answer": submitted_ans,
286
- "Agent Raw Output": ans_raw # Show the full thought process in the table
287
  })
288
- payload.append({"task_id": tid, "submitted_answer": submitted_ans})
289
-
290
- if not payload:
291
- logger.warning("Agent did not produce any valid answers to submit.")
292
- # Check if logs have entries to display potential errors
293
- if logs:
294
- return "Agent ran but did not produce any answers in the expected format.", pd.DataFrame(logs)
295
- else:
296
- return "Agent did not produce any answers.", None
297
-
 
 
 
 
 
 
 
298
 
299
- logger.info(f"Submitting {len(payload)} answers for user '{username}'...")
300
- # Submit answers
301
  try:
302
- submit_payload = {"username": username, "agent_code": agent_code, "answers": payload}
303
- # logger.debug(f"Submission Payload: {submit_payload}") # Careful logging PII
304
- post = requests.post(
305
- f"{DEFAULT_API_URL}/submit",
306
- json=submit_payload,
307
- timeout=90 # Increased timeout for submission
 
 
 
 
 
 
 
 
 
 
 
308
  )
309
- post.raise_for_status() # Check for HTTP errors from submission endpoint
310
- result = post.json()
311
- logger.info(f"Submission successful. Result: {result}")
312
-
313
- score_percent = result.get('score', 'N/A')
314
- try: # Format score nicely
315
- score_percent = f"{float(score_percent):.2f}" if isinstance(score_percent, (int, float)) else score_percent
316
- except (ValueError, TypeError): pass
317
-
318
- status = (
319
- f"Submission Successful!\n"
320
- f"User: {result.get('username', 'N/A')}\n"
321
- f"Score: {score_percent}%\n"
322
- f"Correct: {result.get('correct_count','?')} / Attempted: {result.get('total_attempted','?')}\n"
323
- f"Message: {result.get('message','(No message)')}"
324
- )
325
- # Update logs DataFrame with final status if needed, though usually not necessary
326
- return status, pd.DataFrame(logs) # Return status and the detailed logs
327
 
328
  except requests.exceptions.RequestException as e:
329
  logger.exception("Submission request failed")
330
  error_details = str(e)
331
  if e.response is not None:
332
- error_details += f" | Status Code: {e.response.status_code} | Response: {e.response.text[:500]}"
333
- return f"Submission Failed: {error_details}", pd.DataFrame(logs) # Return error and logs
334
  except Exception as e:
335
- logger.exception("Submission failed with unexpected error")
336
- return f"Submission Failed with unexpected error: {e}", pd.DataFrame(logs) # Return error and logs
337
-
338
-
339
- # --- Gradio App ---
340
-
341
- with gr.Blocks() as demo:
342
- gr.Markdown("# SmolAgent GAIA Runner (using GitHub Models) πŸš€")
343
- gr.Markdown("""
344
- **Instructions:**
345
- 1. Ensure `GITHUB_TOKEN` secret is set. Optionally set `MODEL_ID`.
346
- 2. Log in to Hugging Face below.
347
- 3. Click **Run Evaluation & Submit All Answers**.
348
- 4. Check the Status and the Questions & Answers table for results. The raw agent output includes the thinking process.
349
- """)
350
- gr.LoginButton()
351
- btn = gr.Button("Run Evaluation & Submit All Answers")
352
- out_status = gr.Textbox(label="Submission Status", lines=5, interactive=False)
353
- # *** FIX: Remove the 'height' argument ***
354
- out_table = gr.DataFrame(
355
- label="Questions & Answers Log",
 
 
 
 
 
 
 
 
 
356
  wrap=True,
357
- # Add headers if you want to control column names/order explicitly
358
- headers=["Task ID", "Question", "Submitted Answer", "Agent Raw Output"],
359
- column_widths=["10%", "30%", "20%", "40%"] # Adjust widths as needed
 
 
 
 
 
 
 
 
360
  )
361
- btn.click(run_and_submit_all, outputs=[out_status, out_table], api_name="run_submit") # Add api_name
362
 
 
 
 
363
  if __name__ == "__main__":
364
- if not GITHUB_TOKEN:
365
- logger.error("GITHUB_TOKEN environment variable not set. Cannot start effectively.")
366
- # Optionally raise error or exit? For now, just log.
367
- logger.info("Launching Gradio App...")
368
- # share=True is needed for public link if running on HF Spaces
369
- # debug=True provides more verbose Gradio logging
370
- demo.launch(debug=True, share=True)
371
 
 
1
  import os
2
  import logging
3
+ import traceback
4
 
5
  import gradio as gr
6
  import requests
7
  import pandas as pd
8
  from openai import OpenAI
9
 
10
+ # Assuming these imports from smolagents are correct
11
  from smolagents import CodeAgent, DuckDuckGoSearchTool, tool
12
  from smolagents.models import OpenAIServerModel
13
 
14
+ # --- Basic Logging Setup ---
15
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
16
  logger = logging.getLogger(__name__)
17
 
18
+ # --- Configuration ---
19
+ # URL for fetching questions and submitting answers
20
+ SUBMISSION_URL = "https://agents-course-unit4-scoring.hf.space"
21
 
22
+ # GitHub Models Configuration
23
  GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
24
  if not GITHUB_TOKEN:
25
+ # Critical error if token is missing
26
+ raise ValueError("GITHUB_TOKEN environment variable not set. Please set it in Space secrets.")
27
 
28
  GITHUB_ENDPOINT = "https://models.github.ai/inference"
29
+ # Use a known model ID compatible with the endpoint
30
+ # Let's stick to gpt-4o-mini based on previous logs, ensure it's available.
31
+ MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-4o-mini")
32
 
33
+ # --- Tool Definitions ---
34
+
35
+ # Instantiate the search tool ONCE to reuse its state/connection if any
36
  try:
37
+ search_tool_instance = DuckDuckGoSearchTool()
 
 
 
38
  except Exception as e:
39
+ logger.error(f"Failed to instantiate DuckDuckGoSearchTool: {e}")
40
+ # Depending on the app's requirements, you might want to raise an error here
41
+ # or allow the app to start but log the failure.
42
+ search_tool_instance = None # Indicate failure
43
 
44
+ # IMPORTANT: Define wrapper functions that the LLM will be instructed to call.
45
+ # Use the @tool decorator so CodeAgent recognizes them.
46
 
47
@tool
def web_search(query: str) -> str:
    """
    Performs a web search using DuckDuckGo for the given query.
    Use this for general questions, finding current information, or when Wikipedia fails.

    Args:
        query (str): The search query string.

    Returns:
        str: The search results obtained from DuckDuckGo, or an error message.
    """
    logger.info(f"Executing web_search with query: '{query[:100]}...'")  # log only a snippet
    if search_tool_instance is None:
        # Tool failed to initialize at module import; fail gracefully instead of raising.
        logger.error("web_search cannot execute because DuckDuckGoSearchTool failed to initialize.")
        return "Search Error: Tool not initialized."
    try:
        result = search_tool_instance(query=query)
        logger.info(f"web_search returned {len(result)} characters.")
        # Limit result length to prevent excessively large observations
        max_len = 3000
        if len(result) > max_len:
            # FIX: the warning previously reported a bogus "68,114 chars" target;
            # report the actual truncation limit instead.
            logger.warning(f"Truncating web_search result from {len(result)} to {max_len} chars.")
            return result[:max_len] + "... (truncated)"
        return result
    except Exception as e:
        logger.exception(f"web_search failed for query: {query}")
        return f"Search Error: {e}"
73
 
74
@tool
def wikipedia_lookup(page_title: str) -> str:
    """
    Look up the introductory summary of an English Wikipedia article.

    Useful for factual information about specific topics, people, or entities.

    Args:
        page_title (str): The exact title of the Wikipedia page (e.g., 'Albert Einstein', 'List_of_programming_languages'). Spaces will be converted to underscores.

    Returns:
        str: The summary text of the page, or an error message if not found or failed.
    """
    safe_title = page_title.replace(" ", "_")
    logger.info(f"Executing wikipedia_lookup for page: '{page_title}' (URL: {safe_title})")
    try:
        endpoint = f"https://en.wikipedia.org/api/rest_v1/page/summary/{safe_title}"
        # Identify ourselves per Wikipedia's API etiquette; SPACE_ID distinguishes deployments.
        identity = os.getenv("SPACE_ID", "unknown-huggingface-space")
        response = requests.get(
            endpoint,
            headers={'User-Agent': f'GAIAgent/1.0 ({identity})'},
            timeout=15,
        )
        response.raise_for_status()  # surface HTTP 4xx/5xx as HTTPError

        payload = response.json()
        extract = payload.get("extract", "")
        if extract:
            logger.info(f"wikipedia_lookup found summary ({len(extract)} chars) for '{page_title}'.")
            return extract

        # Page exists but carries no extract — typically a disambiguation page.
        title = payload.get("title", page_title)
        if payload.get("type", "standard") == "disambiguation":
            description = payload.get("description", "multiple meanings")
            logger.warning(f"wikipedia_lookup found a disambiguation page for '{title}': {description}")
            return f"Wikipedia Error: '{title}' refers to {description}. Please provide a more specific page title."
        logger.warning(f"wikipedia_lookup found page '{title}' but it has no summary text.")
        return f"Wikipedia Error: Page '{title}' found but has no summary."
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404:
            logger.warning(f"Wikipedia page not found: {safe_title}")
            return f"Wikipedia Error: Page '{safe_title}' not found."
        logger.error(f"Wikipedia HTTP error {e.response.status_code} for page: {safe_title}")
        return f"Wikipedia Error: HTTP {e.response.status_code} for page '{safe_title}'."
    except requests.exceptions.RequestException as e:
        logger.exception(f"Wikipedia network request failed for page: {safe_title}")
        return f"Wikipedia Error: Network error for page '{safe_title}': {e}"
    except Exception as e:
        logger.exception(f"Unexpected error during wikipedia_lookup for page: {safe_title}")
        return f"Wikipedia Error: Unexpected error: {e}"
121
+
122
+ # Removed summarize_query tool for simplicity, as it wasn't adding much value in logs
123
+
124
+ # --- The ReACT Prompt ---
125
+ # Define the *exact* instructions for the LLM, listing the *actual* tool function names.
126
+ # Keep it clear and concise.
127
+ REACT_INSTRUCTION_PROMPT = """You are a helpful assistant that answers questions using the provided tools.
128
+
129
+ Available Tools:
130
+ - web_search(query: str): Use this for searching the web for general information, current events, or when you don't know a specific Wikipedia page title.
131
+ - wikipedia_lookup(page_title: str): Use this to get information from a specific English Wikipedia page. Use exact page titles (e.g., 'Berlin', 'Python_(programming_language)').
132
+
133
+ Follow these steps for each question:
134
+ 1. **Thought:** Briefly explain your plan and which tool you will use and why.
135
+ 2. **Action:** Call ONE tool using the correct function name and arguments. Example: web_search(query="latest news") or wikipedia_lookup(page_title="Artificial_intelligence").
136
+ 3. **Observation:** Record the result provided by the tool.
137
+ 4. **Thought:** Analyze the observation. Does it answer the question? If yes, prepare the final answer. If not, plan the next step (e.g., try a different tool, refine the search query, use a different Wikipedia title).
138
+ 5. Repeat Action/Observation/Thought until you have the answer or determine it cannot be found.
139
+ 6. **Thought:** Summarize the findings and prepare the final answer based ONLY on the observations.
140
+ 7. **Final Answer:** Provide the final answer in the required format (number, short string, or comma-separated list) on a new line starting exactly with "FINAL ANSWER: ".
141
+
142
+ Formatting Rules for FINAL ANSWER:
143
+ - Numbers: Output only the number (e.g., `42`, `1000`). No commas, units ($).
144
+ - Strings: Use minimal words, no articles (a, an, the). Write digits as words (e.g., `seven`) unless numerical output is implied.
145
+ - Lists: Comma-separated, apply number/string rules to each item (e.g., `paris,london,three`).
146
+
147
+ Let's begin!
148
  """
149
 
150
+ # --- SmolAgent Setup ---
151
+ logger.info(f"Initializing LLM connection to {MODEL_ID} via {GITHUB_ENDPOINT}")
152
  try:
153
+ # Configure the model connection to use GitHub's endpoint
154
+ llm_model = OpenAIServerModel(
155
  model_id=MODEL_ID,
156
  api_key=GITHUB_TOKEN,
157
  base_url=GITHUB_ENDPOINT,
158
+ request_timeout=60 # Add a timeout for model requests
 
159
  )
160
+ # Verify connection (optional, depends on OpenAIServerModel implementation)
161
+ # You might add a simple test call here if the library supports it easily
162
+ logger.info("LLM connection configured successfully.")
163
  except Exception as e:
164
+ logger.exception("CRITICAL: Failed to configure OpenAIServerModel")
165
  raise RuntimeError(f"Could not configure SmolAgents model for GitHub endpoint: {e}") from e
166
 
167
+ logger.info("Initializing CodeAgent...")
168
+ try:
169
+ # Create the agent instance, passing the *list of actual functions* decorated with @tool
170
+ agent = CodeAgent(
171
+ tools=[web_search, wikipedia_lookup], # Only include the defined tool functions
172
+ model=llm_model
173
+ )
174
+ # Log the names of the tools the agent actually recognized (if possible/safe)
175
+ # This depends on how CodeAgent stores tools. Avoid the previous error.
176
+ # logger.info(f"CodeAgent initialized. Tools detected by agent (if available): {agent.tools}") # Be cautious with this line
177
+ logger.info("CodeAgent initialized successfully.")
178
+ except Exception as e:
179
+ logger.exception("CRITICAL: Failed to initialize CodeAgent")
180
+ raise RuntimeError(f"Could not initialize CodeAgent: {e}") from e
181
+
182
+
183
+ # --- Gradio Interface ---
184
+
185
def run_agent_on_question(question: str) -> str:
    """
    Run the SmolAgent on a single question and return its raw output.

    Performs basic input validation and catches any exception raised during
    the agent run, returning an ``AGENT_ERROR:`` string instead of raising.
    """
    question = question.strip()
    # Guard clause: refuse empty questions up front.
    if not question:
        logger.error("Agent called with empty question.")
        return "AGENT_ERROR: Question cannot be empty."

    # Prepend the ReACT instructions so the model knows the tool protocol.
    prompt = f"{REACT_INSTRUCTION_PROMPT.strip()}\n\nQUESTION: {question}"
    logger.info(f"--- Running Agent for Question: '{question}' ---")

    try:
        output = agent.run(prompt)
    except Exception as e:
        logger.exception(f"Agent run failed for question '{question}'")
        trace = traceback.format_exc()  # keep the full traceback for debugging in the UI
        return f"AGENT_ERROR: An exception occurred during agent execution: {e}\nTraceback:\n{trace}"

    logger.info(f"Agent run completed for question: '{question}'. Output length: {len(output)}")
    return output
212
+
213
+ def evaluate_and_submit(hf_profile: gr.OAuthProfile | None):
214
+ """
215
+ Gradio action: Fetches questions, runs agent on each, submits results.
216
+ """
217
+ if not hf_profile:
218
  logger.warning("Submission attempt failed: User not logged in.")
219
+ return "⚠️ Please log in to Hugging Face via the button above to submit.", None # Status message, empty DataFrame
220
 
221
+ username = hf_profile.username
222
+ logger.info(f"πŸš€ Starting evaluation run for user '{username}'...")
 
 
 
 
 
223
 
224
+ # 1. Fetch Questions
225
+ questions = []
226
  try:
227
+ logger.info(f"Fetching questions from {SUBMISSION_URL}/questions")
228
+ resp = requests.get(f"{SUBMISSION_URL}/questions", timeout=20)
229
  resp.raise_for_status()
230
  questions_data = resp.json()
231
+ if isinstance(questions_data, list):
232
+ questions = questions_data
233
+ logger.info(f"βœ… Fetched {len(questions)} questions.")
234
+ else:
235
+ logger.error(f"Fetched questions data is not a list: {type(questions_data)}")
236
+ return "❌ Error: Fetched questions format is incorrect.", None
237
  except Exception as e:
238
  logger.exception("Failed to fetch questions")
239
+ return f"❌ Error fetching questions: {e}", None
240
 
241
  if not questions:
242
  logger.warning("No questions fetched or questions list is empty.")
243
+ return "ℹ️ No questions were fetched from the server.", None
244
 
245
+ # 2. Run Agent on Questions
246
+ results_log = []
247
+ answers_payload = []
248
+ total_questions = len(questions)
249
  for i, item in enumerate(questions):
250
+ task_id = item.get("task_id")
251
+ question_text = item.get("question")
252
+
253
+ if not task_id or not question_text:
254
+ logger.warning(f"Skipping invalid question item {i+1}/{total_questions}: Missing task_id or question. Data: {item}")
 
 
255
  continue
256
 
257
+ logger.info(f"Processing question {i+1}/{total_questions} (Task ID: {task_id})...")
258
+ raw_agent_output = run_agent_on_question(question_text) # Run the agent
259
+
260
+ # Extract final answer for submission
261
+ final_answer = "AGENT_ERROR: No 'FINAL ANSWER:' marker found in output." # Default if parsing fails
262
+ marker = "FINAL ANSWER:"
263
+ if marker in raw_agent_output:
264
+ final_answer = raw_agent_output.split(marker, 1)[1].strip()
265
+ elif "AGENT_ERROR:" in raw_agent_output: # If agent returned an error explicitly
266
+ final_answer = raw_agent_output # Submit the error message
267
+
268
+ logger.info(f"Task ID: {task_id} -> Submitted Answer: '{final_answer}'")
269
+
270
+ # Log results for Gradio table
271
+ results_log.append({
272
+ "Task ID": task_id,
273
+ "Question": question_text,
274
+ "Submitted Answer": final_answer,
275
+ "Full Agent Output": raw_agent_output # Show full trace in UI
 
 
 
 
 
 
 
276
  })
277
+ # Prepare payload for submission API
278
+ answers_payload.append({"task_id": task_id, "submitted_answer": final_answer})
279
+
280
+ results_df = pd.DataFrame(results_log)
281
+ if not answers_payload:
282
+ logger.warning("Agent did not produce any answers to submit.")
283
+ return "⚠️ Agent ran but produced no answers in the expected format.", results_df
284
+
285
+ # 3. Submit Answers
286
+ logger.info(f"Submitting {len(answers_payload)} answers for user '{username}'...")
287
+ space_id = os.getenv("SPACE_ID", "SPACE_ID_NOT_SET")
288
+ agent_code_url = f"https://huggingface.co/spaces/{space_id}/tree/main" if "NOT_SET" not in space_id else "Agent code URL unavailable"
289
+ submit_data = {
290
+ "username": username,
291
+ "agent_code": agent_code_url,
292
+ "answers": answers_payload
293
+ }
294
 
 
 
295
  try:
296
+ response = requests.post(f"{SUBMISSION_URL}/submit", json=submit_data, timeout=90)
297
+ response.raise_for_status() # Check for HTTP errors
298
+ submission_result = response.json()
299
+ logger.info(f"βœ… Submission successful! API Response: {submission_result}")
300
+
301
+ score = submission_result.get('score', 'N/A')
302
+ score_str = f"{float(score):.2f}%" if isinstance(score, (int, float)) else str(score)
303
+ correct = submission_result.get('correct_count', '?')
304
+ attempted = submission_result.get('total_attempted', '?')
305
+ message = submission_result.get('message', '(No message from server)')
306
+
307
+ status_message = (
308
+ f"βœ… Submission Successful!\n"
309
+ f"User: {username}\n"
310
+ f"Score: {score_str}\n"
311
+ f"Details: {correct} / {attempted} correct\n"
312
+ f"Server Message: {message}"
313
  )
314
+ return status_message, results_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
 
316
  except requests.exceptions.RequestException as e:
317
  logger.exception("Submission request failed")
318
  error_details = str(e)
319
  if e.response is not None:
320
+ error_details += f" | Status: {e.response.status_code} | Response: {e.response.text[:300]}" # Log snippet
321
+ return f"❌ Submission Failed: {error_details}", results_df
322
  except Exception as e:
323
+ logger.exception("Unexpected error during submission")
324
+ return f"❌ Submission Failed with unexpected error: {e}", results_df
325
# --- Build Gradio App ---
logger.info("Setting up Gradio interface...")

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header and usage instructions.
    gr.Markdown(
        """
        # 🚀 Agent Evaluation Runner 🚀

        Connect your Hugging Face account, then click the button below to fetch tasks, run the agent, and submit the answers.
        Ensure the `GITHUB_TOKEN` secret is correctly set in your Space settings.
        """
    )

    # Hugging Face OAuth sign-in; its profile value feeds the handler below.
    with gr.Row():
        login_btn = gr.LoginButton()

    run_btn = gr.Button("▶️ Run Evaluation & Submit All Answers", variant="primary")

    # Free-text area reporting the outcome of the submission request.
    status_box = gr.Textbox(
        label="📊 Submission Status",
        placeholder="Submission status will appear here...",
        lines=5,
        interactive=False,
    )

    # Per-question log: task id, question, submitted answer, and the agent's
    # full output (kept for debugging).
    log_table = gr.DataFrame(
        label="📋 Detailed Log (Questions & Agent Output)",
        headers=["Task ID", "Question", "Submitted Answer", "Full Agent Output"],
        wrap=True,
        # No fixed height: let Gradio manage it (or control via CSS if needed).
        column_widths=["10%", "25%", "20%", "45%"],
    )

    # Wire the button to the evaluation routine; the login component supplies
    # the signed-in user's profile as the function's input.
    run_btn.click(
        fn=evaluate_and_submit,
        inputs=[login_btn],
        outputs=[status_box, log_table],
        api_name="evaluate_submit",  # exposes the handler for API usage if needed
    )

logger.info("Gradio interface setup complete.")
# --- Launch the App ---
if __name__ == "__main__":
    logger.info("Launching Gradio application...")
    # NOTE: demo.launch() blocks until the server stops when run as a script,
    # so any code after it only executes at shutdown.
    # `share=True` is ignored inside a Hugging Face Space (Spaces are already
    # publicly served); it only creates a tunnel when running locally.
    demo.launch(
        debug=True,  # more detailed logs from Gradio itself
        share=True,  # no-op on Spaces; public tunnel for local runs
    )
    logger.info("Gradio application has shut down.")