kamorou committed on
Commit
e8f0b12
·
verified ·
1 Parent(s): dbcbf7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -181
app.py CHANGED
@@ -218,6 +218,20 @@
218
  #
219
  # =================================================================================================
220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  import os
222
  import io
223
  import requests
@@ -228,12 +242,12 @@ from typing import TypedDict, Annotated, List
228
  import operator
229
 
230
  # --- LangChain & LangGraph Imports ---
231
- from langchain_core.messages import BaseMessage, HumanMessage, ToolMessage, AIMessage
232
  from langchain_core.tools import tool
233
  from langchain_groq import ChatGroq
234
- # from langchain_openai import ChatOpenAI #<-- Alternative LLM
235
  from langgraph.graph import StateGraph, END
236
- from langgraph.prebuilt import ToolNode # <-- Corrected Import for modern LangGraph
 
237
 
238
  # (Keep Constants as is)
239
  # --- Constants ---
@@ -241,52 +255,70 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
241
  FILES_DIR = "./files"
242
  os.makedirs(FILES_DIR, exist_ok=True)
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  #
245
  # ================================================================================================
246
- # ✅ 1. DEFINE THE AGENT'S TOOLS
247
  # ================================================================================================
248
- # Each tool is a simple Python function decorated with `@tool`.
249
- # The docstring of the function is CRUCIAL. The LLM uses it to decide which tool to use.
250
  #
 
 
251
 
252
  @tool
253
- def web_search(query: str) -> str:
254
  """
255
- Searches the web using DuckDuckGo to find up-to-date information, facts, or answers to general questions.
256
- Use this for any questions that require current event knowledge or broad-spectrum information.
 
257
  """
258
- print(f"--- Calling Web Search Tool with query: {query} ---")
259
- from duckduckgo_search import DDGS
260
  try:
261
- with DDGS() as ddgs:
262
- results = [r for r in ddgs.text(query, max_results=5)]
263
- return str(results) if results else "No results found."
 
264
  except Exception as e:
265
- return f"Error during web search: {e}"
266
 
267
  @tool
268
  def read_file(url: str) -> str:
269
  """
270
- Downloads a file from a given URL, saves it locally, and returns its content.
271
- Use this tool when the user provides a URL to a file that needs to be inspected.
272
- The file is saved in the './files/' directory. The function returns the full text content.
273
  """
274
  print(f"--- Calling Read File Tool with URL: {url} ---")
275
  try:
276
  filename = os.path.join(FILES_DIR, os.path.basename(url))
277
  response = requests.get(url)
278
- response.raise_for_status() # Raise an exception for bad status codes
279
  with open(filename, 'wb') as f:
280
  f.write(response.content)
281
-
282
- # Try to read as text, if it fails, it might be a binary file.
283
  try:
284
  with open(filename, 'r', encoding='utf-8') as f:
285
  content = f.read()
286
  return f"Successfully read file '{filename}'. Content:\n\n{content}"
287
  except UnicodeDecodeError:
288
  return f"Successfully downloaded binary file '{filename}'. Cannot display content."
289
-
290
  except requests.exceptions.RequestException as e:
291
  return f"Error downloading or reading file: {e}"
292
 
@@ -295,7 +327,6 @@ def python_interpreter(code: str) -> str:
295
  """
296
  Executes a given string of Python code and returns the output from stdout.
297
  Use this for complex calculations, data manipulation, or any task that can be solved with code.
298
- The code runs in a restricted environment. You can use libraries like pandas, requests etc.
299
  Make sure to use a print() statement to capture the output.
300
  """
301
  print(f"--- Calling Python Interpreter Tool with code:\n{code} ---")
@@ -309,180 +340,120 @@ def python_interpreter(code: str) -> str:
309
 
310
  #
311
  # ================================================================================================
312
- # ✅ 2. CONFIGURE THE AGENT'S STATE, BRAIN (LLM)
313
  # ================================================================================================
314
  #
 
 
315
 
316
- # The AgentState is the "memory" of our agent. It keeps track of the conversation history.
317
  class AgentState(TypedDict):
318
  messages: Annotated[List[BaseMessage], operator.add]
319
 
320
- # List of all the tools our agent can use
321
- tools = [web_search, read_file, python_interpreter]
322
-
323
- # The "Brain" of our agent. We're using Groq for speed.
324
- # Make sure to set GROQ_API_KEY in your HF Space secrets
325
- llm = ChatGroq(model="llama3-70b-8192", temperature=0)
326
-
327
- # If you want to use OpenAI instead, uncomment the line below and set OPENAI_API_KEY
328
- # llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)
329
-
330
- # We now bind the tools to the LLM. This tells the LLM what functions it can call.
331
- llm_with_tools = llm.bind_tools(tools)
332
-
333
- #
334
- # ================================================================================================
335
- # ✅ 3. DEFINE THE LANGGRAPH NODES AND EDGES
336
- # ================================================================================================
337
- # This is the core logic of our agent, defined as a graph.
338
- #
339
-
340
- # NODE 1: The Agent Node (call_model)
341
- # This node invokes the LLM to decide the next action or to give a final answer.
342
- def call_model(state: AgentState) -> dict:
343
- print("--- Calling LLM ---")
344
- messages = state['messages']
345
- response = llm_with_tools.invoke(messages)
346
- # We return a dict, because this node will always be part of a graph
347
- return {"messages": [response]}
348
-
349
- # EDGE: The Conditional Router (should_continue)
350
- # This function decides which node to go to next.
351
- def should_continue(state: AgentState) -> str:
352
- last_message = state['messages'][-1]
353
- # If the LLM made a tool call, we route to the 'action' node to execute the tool
354
- if last_message.tool_calls:
355
- print("--- Decision: Call a tool ---")
356
- return "action"
357
- # Otherwise, we are done, and we route to the 'end' state
358
- else:
359
- print("--- Decision: End of process ---")
360
- return "end"
361
-
362
- #
363
- # ================================================================================================
364
- # ✅ 4. BUILD AND COMPILE THE GRAPH (Corrected Version)
365
- # ================================================================================================
366
- #
367
-
368
- # The ToolNode is a pre-built node that executes tools for us.
369
- # It's the modern way to handle tool execution in LangGraph.
370
- tool_node = ToolNode(tools)
371
-
372
- # 1. Initialize the graph and add our state object
373
- workflow = StateGraph(AgentState)
374
-
375
- # 2. Add the two nodes we need: the 'agent' and the 'action' (our tool_node)
376
- workflow.add_node("agent", call_model)
377
- workflow.add_node("action", tool_node)
378
-
379
- # 3. Set the entry point of the graph. The first thing to run is the 'agent' node.
380
- workflow.set_entry_point("agent")
381
-
382
- # 4. Add the conditional edge. This controls the flow of the graph.
383
- workflow.add_conditional_edges(
384
- "agent", # Start from the 'agent' node
385
- should_continue, # Use our function to decide the path
386
- {
387
- "action": "action", # If it returns "action", go to the 'action' node
388
- "end": END # If it returns "end", finish the graph
389
- }
390
- )
391
-
392
- # 5. Add a normal edge. After 'action' runs, it should always go back to 'agent' to reflect.
393
- workflow.add_edge('action', 'agent')
394
-
395
- # 6. Compile the graph into a runnable app.
396
- app = workflow.compile()
397
-
398
 
399
  #
400
  # ================================================================================================
401
- # ✅ 5. CREATE THE AGENT CLASS THAT THE TEMPLATE USES
402
  # ================================================================================================
403
- # This class wraps our LangGraph agent in the format expected by the evaluation script.
404
  #
405
  class GaiaAgent:
406
  def __init__(self):
407
- print("GaiaAgent initialized.")
408
- self.agent_app = app
409
 
410
  def __call__(self, question: str) -> str:
411
- print(f"\n{'='*60}\nAgent received question (first 100 chars): {question[:100]}...\n{'='*60}")
412
 
413
- # The initial input for our graph is a list of messages.
414
- initial_input = {"messages": [HumanMessage(content=question)]}
 
 
 
 
415
 
416
  final_state = None
417
- # Let's add a loop limit to prevent infinite cycles
418
  for i, step in enumerate(self.agent_app.stream(initial_input, {"recursion_limit": 15})):
419
  if i == 0:
420
  print("--- Starting Agentic Loop ---")
421
  final_state = step
422
 
423
- # The final answer is in the last AIMessage of the 'messages' list
424
  final_answer_message = final_state['agent']['messages'][-1]
425
- final_answer = final_answer_message.content
426
 
427
  print(f"\n--- Agent finished. Final Answer: {final_answer} ---\n")
428
  return final_answer
429
 
430
  #
431
  # ================================================================================================
432
- # -- DO NOT MODIFY THE CODE BELOW THIS LINE --
433
- # -- This is the Gradio App and Submission Logic from the course --
434
  # ================================================================================================
435
 
436
  def run_and_submit_all( profile: gr.OAuthProfile | None):
437
- """
438
- Fetches all questions, runs the BasicAgent on them, submits all answers,
439
- and displays the results.
440
- """
441
  space_id = os.getenv("SPACE_ID")
442
-
443
- if profile:
444
- username= f"{profile.username}"
445
- print(f"User logged in: {username}")
446
- else:
447
- print("User not logged in.")
448
  return "Please Login to Hugging Face with the button.", None
 
 
449
 
450
  api_url = DEFAULT_API_URL
451
  questions_url = f"{api_url}/questions"
452
  submit_url = f"{api_url}/submit"
453
 
454
- try:
455
- agent = GaiaAgent()
456
- except Exception as e:
457
- print(f"Error instantiating agent: {e}")
458
- return f"Error initializing agent: {e}", None
459
-
460
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
461
- print(agent_code)
462
-
463
  print(f"Fetching questions from: {questions_url}")
464
  try:
465
  response = requests.get(questions_url, timeout=15)
466
  response.raise_for_status()
467
  questions_data = response.json()
468
- if not questions_data:
469
- print("Fetched questions list is empty.")
470
- return "Fetched questions list is empty or invalid format.", None
471
  print(f"Fetched {len(questions_data)} questions.")
472
  except Exception as e:
473
- print(f"An unexpected error occurred fetching questions: {e}")
474
  return f"An unexpected error occurred fetching questions: {e}", None
475
 
476
  results_log = []
477
  answers_payload = []
478
  print(f"Running agent on {len(questions_data)} questions...")
 
 
 
 
479
  for item in questions_data:
480
  task_id = item.get("task_id")
481
  question_text = item.get("question")
482
  if not task_id or question_text is None:
483
- print(f"Skipping item with missing task_id or question: {item}")
484
  continue
485
  try:
 
 
486
  submitted_answer = agent(question_text)
487
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
488
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
@@ -491,13 +462,9 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
491
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
492
 
493
  if not answers_payload:
494
- print("Agent did not produce any answers to submit.")
495
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
496
 
497
  submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
498
- status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
499
- print(status_update)
500
-
501
  print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
502
  try:
503
  response = requests.post(submit_url, json=submission_data, timeout=60)
@@ -513,27 +480,6 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
513
  print("Submission successful.")
514
  results_df = pd.DataFrame(results_log)
515
  return final_status, results_df
516
- except requests.exceptions.HTTPError as e:
517
- error_detail = f"Server responded with status {e.response.status_code}."
518
- try:
519
- error_json = e.response.json()
520
- error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
521
- except requests.exceptions.JSONDecodeError:
522
- error_detail += f" Response: {e.response.text[:500]}"
523
- status_message = f"Submission Failed: {error_detail}"
524
- print(status_message)
525
- results_df = pd.DataFrame(results_log)
526
- return status_message, results_df
527
- except requests.exceptions.Timeout:
528
- status_message = "Submission Failed: The request timed out."
529
- print(status_message)
530
- results_df = pd.DataFrame(results_log)
531
- return status_message, results_df
532
- except requests.exceptions.RequestException as e:
533
- status_message = f"Submission Failed: Network error - {e}"
534
- print(status_message)
535
- results_df = pd.DataFrame(results_log)
536
- return status_message, results_df
537
  except Exception as e:
538
  status_message = f"An unexpected error occurred during submission: {e}"
539
  print(status_message)
@@ -541,25 +487,21 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
541
  return status_message, results_df
542
 
543
 
544
- # --- Build Gradio Interface using Blocks ---
545
  with gr.Blocks() as demo:
546
- gr.Markdown("# GAIA Agent Final Assessment")
547
  gr.Markdown(
548
  """
549
- **Instructor's Note:** This space is now powered by a LangGraph agent.
550
- 1. Ensure your `GROQ_API_KEY` is set in the Space secrets.
551
- 2. Make sure you have a `requirements.txt` file with the specified versions.
552
- 3. Log in below and click 'Run Evaluation'. Good luck!
553
  """
554
  )
555
-
556
  gr.LoginButton()
557
-
558
  run_button = gr.Button("Run Evaluation & Submit All Answers")
559
-
560
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
561
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
562
-
563
  run_button.click(
564
  fn=run_and_submit_all,
565
  outputs=[status_output, results_table]
@@ -567,12 +509,4 @@ with gr.Blocks() as demo:
567
 
568
  if __name__ == "__main__":
569
  print("\n" + "-"*30 + " App Starting " + "-"*30)
570
- space_id_startup = os.getenv("SPACE_ID")
571
- if space_id_startup:
572
- print(f"✅ SPACE_ID found: {space_id_startup}")
573
- else:
574
- print("ℹ️ SPACE_ID environment variable not found (running locally?).")
575
- print("-"*(60 + len(" App Starting ")) + "\n")
576
-
577
- print("Launching Gradio Interface for GAIA Agent Evaluation...")
578
  demo.launch(debug=True, share=False)
 
218
  #
219
  # =================================================================================================
220
 
221
+ #
222
+
223
+ ###########################
224
+ # =================================================================================================
225
+ # ✅ --- ✅ FINAL ASSESSMENT AGENT - V4 (STATE-FIXED & TAVILY) ✅ --- ✅
226
+ # =================================================================================================
227
+ #
228
+ # Instructions:
229
+ # 1. Add TAVILY_API_KEY and GROQ_API_KEY to your HF Space secrets.
230
+ # 2. Update your requirements.txt to include `tavily-python`.
231
+ # 3. This version fixes the critical state-leakage bug and uses a better search tool.
232
+ #
233
+ # =================================================================================================
234
+
235
  import os
236
  import io
237
  import requests
 
242
  import operator
243
 
244
  # --- LangChain & LangGraph Imports ---
245
+ from langchain_core.messages import BaseMessage, HumanMessage, ToolMessage, AIMessage, SystemMessage
246
  from langchain_core.tools import tool
247
  from langchain_groq import ChatGroq
 
248
  from langgraph.graph import StateGraph, END
249
+ from langgraph.prebuilt import ToolNode
250
+ from tavily import TavilyClient # <-- Import Tavily
251
 
252
  # (Keep Constants as is)
253
  # --- Constants ---
 
255
  FILES_DIR = "./files"
256
  os.makedirs(FILES_DIR, exist_ok=True)
257
 
258
+ # --- The new, stricter System Prompt ---
259
+ AGENT_SYSTEM_PROMPT = """You are a world-class AI agent, specialized in solving complex problems from the GAIA benchmark.
260
+
261
+ Your task is to analyze the user's question, think step-by-step, and use the provided tools to find the correct answer.
262
+
263
+ CRITICAL INSTRUCTIONS:
264
+ 1. **Analyze the Goal:** First, understand what the user is asking for.
265
+ 2. **Plan & Execute:** Formulate a plan and use the available tools (`tavily_search`, `read_file`, `python_interpreter`) to gather information.
266
+ 3. **Final Answer Format:** Once you are absolutely certain of the answer, you MUST provide it directly and concisely.
267
+ - DO NOT include your reasoning, thoughts, or any conversational text like 'The answer is...', 'Here is the result:', or 'Based on my search...'.
268
+ - Your final response must ONLY be the answer itself.
269
+
270
+ EXAMPLES OF CORRECT FINAL ANSWERS:
271
+ - If the question asks for a year: `2023`
272
+ - If it asks for a name: `John Doe`
273
+ - If it asks for a number: `42`
274
+ - If it asks for a comma-separated list: `item1, item2, item3`
275
+
276
+ Think, use your tools, and then provide ONLY the final, precise answer.
277
+ """
278
+
279
  #
280
  # ================================================================================================
281
+ # ✅ 1. DEFINE THE AGENT'S TOOLS (NOW WITH TAVILY)
282
  # ================================================================================================
 
 
283
  #
284
+ # Initialize the Tavily client with the key read from the TAVILY_API_KEY environment variable (set it in the Space secrets).
285
+ tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
286
 
287
  @tool
288
+ def tavily_search(query: str) -> str:
289
  """
290
+ Uses the Tavily Search API to find information on the web.
291
+ Tavily is optimized for AI agents and provides clean, summarized results.
292
+ Use this for any questions that require current, factual, or web-based information.
293
  """
294
+ print(f"--- Calling Tavily Search Tool with query: {query} ---")
 
295
  try:
296
+ # Calling the search method with the query
297
+ result = tavily.search(query=query, search_depth="advanced")
298
+ # Returning the content of the search results
299
+ return f"Search results for '{query}':\n" + "\n".join([f"- {r['content']}" for r in result['results']])
300
  except Exception as e:
301
+ return f"Error during Tavily search: {e}"
302
 
303
  @tool
304
  def read_file(url: str) -> str:
305
  """
306
+ Downloads a file from a given URL and returns its content.
307
+ Use this tool when a question provides a URL to a file that needs to be read.
 
308
  """
309
  print(f"--- Calling Read File Tool with URL: {url} ---")
310
  try:
311
  filename = os.path.join(FILES_DIR, os.path.basename(url))
312
  response = requests.get(url)
313
+ response.raise_for_status()
314
  with open(filename, 'wb') as f:
315
  f.write(response.content)
 
 
316
  try:
317
  with open(filename, 'r', encoding='utf-8') as f:
318
  content = f.read()
319
  return f"Successfully read file '{filename}'. Content:\n\n{content}"
320
  except UnicodeDecodeError:
321
  return f"Successfully downloaded binary file '{filename}'. Cannot display content."
 
322
  except requests.exceptions.RequestException as e:
323
  return f"Error downloading or reading file: {e}"
324
 
 
327
  """
328
  Executes a given string of Python code and returns the output from stdout.
329
  Use this for complex calculations, data manipulation, or any task that can be solved with code.
 
330
  Make sure to use a print() statement to capture the output.
331
  """
332
  print(f"--- Calling Python Interpreter Tool with code:\n{code} ---")
 
340
 
341
  #
342
  # ================================================================================================
343
+ # ✅ 2. CONFIGURE AND BUILD THE AGENT GRAPH
344
  # ================================================================================================
345
  #
346
+ # This section is now self-contained to be called for each new agent instance.
347
+ #
348
 
 
349
  class AgentState(TypedDict):
350
  messages: Annotated[List[BaseMessage], operator.add]
351
 
352
def build_agent_graph():
    """Assemble and compile the two-node tool-calling graph.

    The "agent" node asks the tool-bound Groq chat model for the next message;
    the "action" node runs the tools through a ``ToolNode``. After every
    "agent" step the router sends control to "action" when the latest message
    carries tool calls and to ``END`` otherwise; "action" always hands control
    back to "agent".

    Returns:
        The object returned by ``StateGraph.compile()``.
    """
    available_tools = [tavily_search, read_file, python_interpreter]
    chat_model = ChatGroq(model="llama3-70b-8192", temperature=0).bind_tools(available_tools)

    def call_model(state: AgentState) -> dict:
        # Only the new reply is returned; AgentState annotates "messages" with
        # operator.add, so the graph is expected to append it to the history.
        print("--- Calling LLM ---")
        reply = chat_model.invoke(state['messages'])
        return {"messages": [reply]}

    def should_continue(state: AgentState) -> str:
        # Route on whether the most recent message requested any tool calls.
        return "action" if state['messages'][-1].tool_calls else "end"

    graph = StateGraph(AgentState)
    graph.add_node("agent", call_model)
    graph.add_node("action", ToolNode(available_tools))
    graph.set_entry_point("agent")
    graph.add_conditional_edges(
        "agent",
        should_continue,
        {"action": "action", "end": END},
    )
    graph.add_edge('action', 'agent')
    return graph.compile()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
 
384
  #
385
  # ================================================================================================
386
+ # ✅ 3. CREATE THE AGENT CLASS THAT THE TEMPLATE USES
387
  # ================================================================================================
 
388
  #
389
  class GaiaAgent:
390
  def __init__(self):
391
+ print("GaiaAgent initialized. Building fresh graph...")
392
+ self.agent_app = build_agent_graph()
393
 
394
  def __call__(self, question: str) -> str:
395
+ print(f"\n{'='*60}\nAgent received question: {question[:100]}...\n{'='*60}")
396
 
397
+ initial_input = {
398
+ "messages": [
399
+ SystemMessage(content=AGENT_SYSTEM_PROMPT),
400
+ HumanMessage(content=question)
401
+ ]
402
+ }
403
 
404
  final_state = None
 
405
  for i, step in enumerate(self.agent_app.stream(initial_input, {"recursion_limit": 15})):
406
  if i == 0:
407
  print("--- Starting Agentic Loop ---")
408
  final_state = step
409
 
 
410
  final_answer_message = final_state['agent']['messages'][-1]
411
+ final_answer = str(final_answer_message.content).strip()
412
 
413
  print(f"\n--- Agent finished. Final Answer: {final_answer} ---\n")
414
  return final_answer
415
 
416
  #
417
  # ================================================================================================
418
+ # -- EVALUATION LOGIC - CRITICAL FIX APPLIED --
 
419
  # ================================================================================================
420
 
421
  def run_and_submit_all( profile: gr.OAuthProfile | None):
 
 
 
 
422
  space_id = os.getenv("SPACE_ID")
423
+ if not profile:
 
 
 
 
 
424
  return "Please Login to Hugging Face with the button.", None
425
+ username = f"{profile.username}"
426
+ print(f"User logged in: {username}")
427
 
428
  api_url = DEFAULT_API_URL
429
  questions_url = f"{api_url}/questions"
430
  submit_url = f"{api_url}/submit"
431
 
 
 
 
 
 
 
432
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
 
 
433
  print(f"Fetching questions from: {questions_url}")
434
  try:
435
  response = requests.get(questions_url, timeout=15)
436
  response.raise_for_status()
437
  questions_data = response.json()
 
 
 
438
  print(f"Fetched {len(questions_data)} questions.")
439
  except Exception as e:
 
440
  return f"An unexpected error occurred fetching questions: {e}", None
441
 
442
  results_log = []
443
  answers_payload = []
444
  print(f"Running agent on {len(questions_data)} questions...")
445
+
446
+ #
447
+ # --->>> CRITICAL FIX: Instantiate a NEW agent for EACH question <<<---
448
+ #
449
  for item in questions_data:
450
  task_id = item.get("task_id")
451
  question_text = item.get("question")
452
  if not task_id or question_text is None:
 
453
  continue
454
  try:
455
+ # A new, clean agent is created here to prevent state leakage.
456
+ agent = GaiaAgent()
457
  submitted_answer = agent(question_text)
458
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
459
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
 
462
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
463
 
464
  if not answers_payload:
 
465
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
466
 
467
  submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
 
 
 
468
  print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
469
  try:
470
  response = requests.post(submit_url, json=submission_data, timeout=60)
 
480
  print("Submission successful.")
481
  results_df = pd.DataFrame(results_log)
482
  return final_status, results_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  except Exception as e:
484
  status_message = f"An unexpected error occurred during submission: {e}"
485
  print(status_message)
 
487
  return status_message, results_df
488
 
489
 
490
+ # --- Gradio Interface (No Changes Needed) ---
491
  with gr.Blocks() as demo:
492
+ gr.Markdown("# GAIA Agent Final Assessment (V4 - State Fixed)")
493
  gr.Markdown(
494
  """
495
+ **Instructor's Note:** This version fixes the critical state-leakage bug and uses the Tavily Search API for better results.
496
+ 1. Ensure `GROQ_API_KEY` and `TAVILY_API_KEY` are set in secrets.
497
+ 2. Ensure `requirements.txt` includes `tavily-python`.
498
+ 3. Log in and run the evaluation. Let's see that score jump!
499
  """
500
  )
 
501
  gr.LoginButton()
 
502
  run_button = gr.Button("Run Evaluation & Submit All Answers")
 
503
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
504
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
 
505
  run_button.click(
506
  fn=run_and_submit_all,
507
  outputs=[status_output, results_table]
 
509
 
510
  if __name__ == "__main__":
511
  print("\n" + "-"*30 + " App Starting " + "-"*30)
 
 
 
 
 
 
 
 
512
  demo.launch(debug=True, share=False)