kamorou committed
Commit 182b505 · verified · 1 Parent(s): 10a40a9

Update app.py

Files changed (1)
  1. app.py +935 -241
app.py CHANGED
@@ -247,263 +247,957 @@
  # =================================================================================================

  #
  import os
- import io
- import json
  import requests
  import pandas as pd
- import gradio as gr
- from contextlib import redirect_stdout
- from typing import TypedDict, Annotated, List
- import operator
-
- # --- LangChain & LangGraph Imports ---
- from langchain_core.messages import BaseMessage, HumanMessage, ToolMessage, AIMessage, SystemMessage
- from langchain_core.tools import tool
- from langchain_huggingface import HuggingFaceEndpoint
- from langgraph.graph import StateGraph, END
  from tavily import TavilyClient
- import pypdf

  # --- Constants ---
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
- FILES_DIR = "./files"
- os.makedirs(FILES_DIR, exist_ok=True)
-
- # --- System Prompt (Updated for Manual JSON Tool Calling) ---
- # This prompt instructs the model to generate JSON, a robust method for tool calls.
- AGENT_SYSTEM_PROMPT = """You are a world-class AI agent, specialized in solving complex problems from the GAIA benchmark.
- Your task is to analyze the user's question, think step-by-step, and use the provided tools to find the correct answer.
-
- **TOOL USAGE INSTRUCTIONS:**
- When you need to use a tool, you MUST respond with a JSON object containing the tool name and its arguments. The JSON object should have two keys: "tool_name" and "parameters".
-
- Here is an example of how to call the `tavily_search` tool:
- ```json
- {
- "tool_name": "tavily_search",
- "parameters": {
- "query": "Who won the last FIFA World Cup?"
- }
- }
- Use code with caution.
- Python
- CRITICAL FINAL ANSWER INSTRUCTIONS:
- Once you have gathered all the necessary information and are absolutely certain of the answer, you MUST provide it directly and concisely.
- Your final response must ONLY be the answer itself.
- DO NOT wrap the final answer in a JSON object or include any conversational text.
- Think, use your tools, and then provide ONLY the final, precise answer.
- """
- ###===============================================================================================
- tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
- @tool
- def tavily_search(query: str) -> str:
-     """Uses the Tavily Search API to find information on the web."""
-     print(f"--- Calling Tavily Search Tool with query: {query} ---")
-     try:
-         result = tavily.search(query=query, search_depth="advanced")
-         return f"Search results for '{query}':\n" + "\n".join([f"- {r['content']}" for r in result['results']])
-     except Exception as e: return f"Error during Tavily search: {e}"
- @tool
- def read_file(url: str) -> str:
-     """Downloads and reads the content of a file (text or PDF) from a URL."""
-     print(f"--- Calling Read File Tool with URL: {url} ---")
-     try:
-         filename = os.path.join(FILES_DIR, os.path.basename(url))
-         response = requests.get(url)
-         response.raise_for_status()
-         with open(filename, 'wb') as f: f.write(response.content)
-         if url.lower().endswith('.pdf'):
-             try:
-                 pdf_reader = pypdf.PdfReader(filename)
-                 return f"Successfully read PDF file '{filename}'. Content:\n\n{''.join(p.extract_text() for p in pdf_reader.pages)}"
-             except Exception as e: return f"Error reading PDF file: {e}"
-         else:
-             try:
-                 with open(filename, 'r', encoding='utf-8') as f: return f"Successfully read text file '{filename}'. Content:\n\n{f.read()}"
-             except UnicodeDecodeError: return f"Successfully downloaded binary file '{filename}'. Cannot display content as text."
-     except requests.exceptions.RequestException as e: return f"Error downloading or reading file: {e}"
- @tool
- def python_interpreter(code: str) -> str:
-     """Executes Python code and returns its stdout."""
-     print(f"--- Calling Python Interpreter Tool with code:\n{code} ---")
-     output_buffer = io.StringIO()
-     try:
-         with redirect_stdout(output_buffer): exec(code, globals())
-         return f"Code executed successfully. Output:\n{output_buffer.getvalue()}"
-     except Exception as e: return f"Error executing Python code: {e}"
- ##================================================================================================
- #✅ 2. CONFIGURE AND BUILD THE AGENT (with Qwen2 and Manual Tool Calling)
- #================================================================================================
- class AgentState(TypedDict):
-     messages: Annotated[List[BaseMessage], operator.add]
- def build_agent_graph():
-     """Builds the agent using a manual LangGraph loop with the HuggingFaceEndpoint."""
-     tools = [tavily_search, read_file, python_interpreter]
-     tool_map = {tool.name: tool for tool in tools}
- Generated code
-     # Using Qwen2-72B-Instruct model via HuggingFaceEndpoint
-     repo_id = "Qwen/Qwen2-72B-Instruct"
-     llm = HuggingFaceEndpoint(
-         repo_id=repo_id,
-         max_new_tokens=1024,
-         temperature=0.1,
-         huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN")
-     )
-
-     def call_model(state: AgentState):
-         """Invokes the LLM and wraps the response in an AIMessage."""
-         # Qwen2 Instruct uses a specific chat template. We build it manually.
-         prompt_str = ""
-         for msg in state['messages']:
-             role = ""
-             if isinstance(msg, SystemMessage): role = "system"
-             elif isinstance(msg, HumanMessage): role = "user"
-             elif isinstance(msg, AIMessage): role = "assistant"
-             elif isinstance(msg, ToolMessage): continue # We'll handle tool results differently
-
-             if role: prompt_str += f"<|im_start|>{role}\n{msg.content}<|im_end|>\n"
-
-         # Add results from the last tool call, if any
-         if isinstance(state['messages'][-1], ToolMessage):
-             prompt_str += f"<|im_start|>user\nTool output:\n{state['messages'][-1].content}<|im_end|>\n"
-
-         prompt_str += "<|im_start|>assistant\n"
-
-         response_text = llm.invoke(prompt_str)
-         return {"messages": [AIMessage(content=response_text)]}
-
-     def should_continue(state: AgentState) -> str:
-         """Determines whether to call a tool or end the loop."""
-         last_message_content = state['messages'][-1].content.strip()
-         # A simple check for JSON is a reliable way to detect tool calls.
-         if "```json" in last_message_content:
-             return "action"
-         if last_message_content.startswith('{') and last_message_content.endswith('}'):
              try:
-                 json.loads(last_message_content)
-                 return "action"
-             except json.JSONDecodeError:
-                 return "end" # Not valid JSON, must be the final answer
-         else:
-             return "end"

-     def call_tool_node(state: AgentState):
-         """Parses the JSON tool call from the LLM and executes it."""
-         last_message_content = state['messages'][-1].content.strip()

-         # Extract JSON from markdown code block if present
-         if "```json" in last_message_content:
-             json_str = last_message_content.split("```json").split("```")[0].strip()
          else:
-             json_str = last_message_content

          try:
-             tool_call_data = json.loads(json_str)
-             tool_name = tool_call_data.get("tool_name")
-             parameters = tool_call_data.get("parameters", {})
-             if tool_name not in tool_map:
-                 return {"messages": [ToolMessage(content=f"Error: Tool '{tool_name}' not found.", tool_call_id="error")]}
-
-             selected_tool = tool_map[tool_name]
-             tool_output = selected_tool.invoke(parameters)
-             return {"messages": [ToolMessage(content=str(tool_output), tool_call_id=tool_name)]}
          except Exception as e:
-             return {"messages": [ToolMessage(content=f"Error parsing tool call: {e}. Content: '{last_message_content}'", tool_call_id="error")]}
-
-     workflow = StateGraph(AgentState)
-     workflow.add_node("agent", call_model)
-     workflow.add_node("action", call_tool_node)
-     workflow.set_entry_point("agent")
-     workflow.add_conditional_edges("agent", should_continue, {"action": "action", "end": END})
-     workflow.add_edge('action', 'agent')
-     return workflow.compile()
- Use code with caution.
- #================================================================================================
- #✅ 3. AGENT CLASS AND EVALUATION LOGIC
- #================================================================================================
- class GaiaAgent:
-     def init(self):
-         print("GaiaAgent initialized. Building agent with Qwen/Qwen2-72B-Instruct...")
-         self.agent_app = build_agent_graph()
- Generated code
-     def __call__(self, question: str) -> str:
-         print(f"\n{'='*60}\nAgent received question: {question[:100]}...\n{'='*60}")
-         try:
-             initial_input = {"messages": [SystemMessage(content=AGENT_SYSTEM_PROMPT), HumanMessage(content=question)]}
-             final_state = None
-             for step in self.agent_app.stream(initial_input, {"recursion_limit": 15}):
-                 final_state = list(step.values())[0]

-             final_answer = final_state['messages'][-1].content
-             return str(final_answer).strip()
          except Exception as e:
-             print(f"An error occurred during agent execution: {e}")
-             return f"AGENT_EXECUTION_ERROR: {e}"
- Use code with caution.
- --- The rest of the file is unchanged ---
- def run_and_submit_all( profile: gr.OAuthProfile | None):
-     space_id = os.getenv("SPACE_ID")
-     if not profile: return "Please Login to Hugging Face with the button.", None
-     username = f"{profile.username}"
-     print(f"User logged in: {username}")
-     api_url = DEFAULT_API_URL
-     questions_url = f"{api_url}/questions"
-     submit_url = f"{api_url}/submit"
-     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
- Generated code
-     try:
-         response = requests.get(questions_url, timeout=15)
-         response.raise_for_status()
-         questions_data = response.json()
-     except Exception as e: return f"An unexpected error occurred fetching questions: {e}", None
-
-     results_log, answers_payload = [], []
-     agent_instance = GaiaAgent()
-
-     for item in questions_data:
-         task_id, question_text = item.get("task_id"), item.get("question")
-         if not task_id or question_text is None: continue
          try:
-             submitted_answer = agent_instance(question_text)
-             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
-             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
          except Exception as e:
-             print(f"Error running agent on task {task_id}: {e}")
-             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
-
-     if not answers_payload: return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
-     submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
-
-     try:
-         response = requests.post(submit_url, json=submission_data, timeout=90)
-         response.raise_for_status()
-         result_data = response.json()
-         final_status = (
-             f"Submission Successful!\n"
-             f"User: {result_data.get('username')}\n"
-             f"Overall Score: {result_data.get('score', 'N/A')}% "
-             f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
-             f"Message: {result_data.get('message', 'No message received.')}"
-         )
-         return final_status, pd.DataFrame(results_log)
-     except Exception as e: return f"An unexpected error in submission: {e}", pd.DataFrame(results_log)
- Use code with caution.
  with gr.Blocks() as demo:
-     gr.Markdown("# GAIA Agent Final Assessment (Qwen2-72B-Instruct)")
-     gr.Markdown(
-         """
- Instructor's Note: This version uses the powerful Qwen/Qwen2-72B-Instruct model from the Hugging Face Hub.
- It relies on a robust manual LangGraph loop to handle tool calls by instructing the model to generate JSON.
- 1. Ensure you have a HUGGINGFACEHUB_API_TOKEN and TAVILY_API_KEY set in your secrets.
- 2. Ensure your requirements.txt is updated. Good luck!
- """
-     )
-     gr.LoginButton()
-     run_button = gr.Button("Run Evaluation & Submit All Answers")
-     status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
-     results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
-     run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
- if name == "main":
-     print("\n" + "-"*30 + " App Starting " + "-"*30)
-     demo.launch(debug=True, share=False, ssr_mode=False)
  # =================================================================================================

  #
+ # import os
+ # import io
+ # import json
+ # import requests
+ # import pandas as pd
+ # import gradio as gr
+ # from contextlib import redirect_stdout
+ # from typing import TypedDict, Annotated, List
+ # import operator
+
+ # # --- LangChain & LangGraph Imports ---
+ # from langchain_core.messages import BaseMessage, HumanMessage, ToolMessage, AIMessage, SystemMessage
+ # from langchain_core.tools import tool
+ # from langchain_huggingface import HuggingFaceEndpoint
+ # from langgraph.graph import StateGraph, END
+ # from tavily import TavilyClient
+ # import pypdf
+
+ # # --- Constants ---
+ # DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
+ # FILES_DIR = "./files"
+ # os.makedirs(FILES_DIR, exist_ok=True)
+
+ # # --- System Prompt (Updated for Manual JSON Tool Calling) ---
+ # # This prompt instructs the model to generate JSON, a robust method for tool calls.
+ # AGENT_SYSTEM_PROMPT = """You are a world-class AI agent, specialized in solving complex problems from the GAIA benchmark.
+ # Your task is to analyze the user's question, think step-by-step, and use the provided tools to find the correct answer.
+
+ # **TOOL USAGE INSTRUCTIONS:**
+ # When you need to use a tool, you MUST respond with a JSON object containing the tool name and its arguments. The JSON object should have two keys: "tool_name" and "parameters".
+
+ # Here is an example of how to call the `tavily_search` tool:
+ # ```json
+ # {
+ # "tool_name": "tavily_search",
+ # "parameters": {
+ # "query": "Who won the last FIFA World Cup?"
+ # }
+ # }
+ # Use code with caution.
+ # Python
+ # CRITICAL FINAL ANSWER INSTRUCTIONS:
+ # Once you have gathered all the necessary information and are absolutely certain of the answer, you MUST provide it directly and concisely.
+ # Your final response must ONLY be the answer itself.
+ # DO NOT wrap the final answer in a JSON object or include any conversational text.
+ # Think, use your tools, and then provide ONLY the final, precise answer.
+ # """
+ # ###===============================================================================================
+ # tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
+ # @tool
+ # def tavily_search(query: str) -> str:
+ # """Uses the Tavily Search API to find information on the web."""
+ # print(f"--- Calling Tavily Search Tool with query: {query} ---")
+ # try:
+ # result = tavily.search(query=query, search_depth="advanced")
+ # return f"Search results for '{query}':\n" + "\n".join([f"- {r['content']}" for r in result['results']])
+ # except Exception as e: return f"Error during Tavily search: {e}"
+ # @tool
+ # def read_file(url: str) -> str:
+ # """Downloads and reads the content of a file (text or PDF) from a URL."""
+ # print(f"--- Calling Read File Tool with URL: {url} ---")
+ # try:
+ # filename = os.path.join(FILES_DIR, os.path.basename(url))
+ # response = requests.get(url)
+ # response.raise_for_status()
+ # with open(filename, 'wb') as f: f.write(response.content)
+ # if url.lower().endswith('.pdf'):
+ # try:
+ # pdf_reader = pypdf.PdfReader(filename)
+ # return f"Successfully read PDF file '{filename}'. Content:\n\n{''.join(p.extract_text() for p in pdf_reader.pages)}"
+ # except Exception as e: return f"Error reading PDF file: {e}"
+ # else:
+ # try:
+ # with open(filename, 'r', encoding='utf-8') as f: return f"Successfully read text file '{filename}'. Content:\n\n{f.read()}"
+ # except UnicodeDecodeError: return f"Successfully downloaded binary file '{filename}'. Cannot display content as text."
+ # except requests.exceptions.RequestException as e: return f"Error downloading or reading file: {e}"
+ # @tool
+ # def python_interpreter(code: str) -> str:
+ # """Executes Python code and returns its stdout."""
+ # print(f"--- Calling Python Interpreter Tool with code:\n{code} ---")
+ # output_buffer = io.StringIO()
+ # try:
+ # with redirect_stdout(output_buffer): exec(code, globals())
+ # return f"Code executed successfully. Output:\n{output_buffer.getvalue()}"
+ # except Exception as e: return f"Error executing Python code: {e}"
+ # ##================================================================================================
+ # #✅ 2. CONFIGURE AND BUILD THE AGENT (with Qwen2 and Manual Tool Calling)
+ # #================================================================================================
+ # class AgentState(TypedDict):
+ # messages: Annotated[List[BaseMessage], operator.add]
+ # def build_agent_graph():
+ # """Builds the agent using a manual LangGraph loop with the HuggingFaceEndpoint."""
+ # tools = [tavily_search, read_file, python_interpreter]
+ # tool_map = {tool.name: tool for tool in tools}
+ # Generated code
+ # # Using Qwen2-72B-Instruct model via HuggingFaceEndpoint
+ # repo_id = "Qwen/Qwen2-72B-Instruct"
+ # llm = HuggingFaceEndpoint(
+ # repo_id=repo_id,
+ # max_new_tokens=1024,
+ # temperature=0.1,
+ # huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN")
+ # )
+
+ # def call_model(state: AgentState):
+ # """Invokes the LLM and wraps the response in an AIMessage."""
+ # # Qwen2 Instruct uses a specific chat template. We build it manually.
+ # prompt_str = ""
+ # for msg in state['messages']:
+ # role = ""
+ # if isinstance(msg, SystemMessage): role = "system"
+ # elif isinstance(msg, HumanMessage): role = "user"
+ # elif isinstance(msg, AIMessage): role = "assistant"
+ # elif isinstance(msg, ToolMessage): continue # We'll handle tool results differently
+
+ # if role: prompt_str += f"<|im_start|>{role}\n{msg.content}<|im_end|>\n"
+
+ # # Add results from the last tool call, if any
+ # if isinstance(state['messages'][-1], ToolMessage):
+ # prompt_str += f"<|im_start|>user\nTool output:\n{state['messages'][-1].content}<|im_end|>\n"
+
+ # prompt_str += "<|im_start|>assistant\n"
+
+ # response_text = llm.invoke(prompt_str)
+ # return {"messages": [AIMessage(content=response_text)]}
+
+ # def should_continue(state: AgentState) -> str:
+ # """Determines whether to call a tool or end the loop."""
+ # last_message_content = state['messages'][-1].content.strip()
+ # # A simple check for JSON is a reliable way to detect tool calls.
+ # if "```json" in last_message_content:
+ # return "action"
+ # if last_message_content.startswith('{') and last_message_content.endswith('}'):
+ # try:
+ # json.loads(last_message_content)
+ # return "action"
+ # except json.JSONDecodeError:
+ # return "end" # Not valid JSON, must be the final answer
+ # else:
+ # return "end"
+
+ # def call_tool_node(state: AgentState):
+ # """Parses the JSON tool call from the LLM and executes it."""
+ # last_message_content = state['messages'][-1].content.strip()
+
+ # # Extract JSON from markdown code block if present
+ # if "```json" in last_message_content:
+ # json_str = last_message_content.split("```json").split("```")[0].strip()
+ # else:
+ # json_str = last_message_content
+
+ # try:
+ # tool_call_data = json.loads(json_str)
+ # tool_name = tool_call_data.get("tool_name")
+ # parameters = tool_call_data.get("parameters", {})
+ # if tool_name not in tool_map:
+ # return {"messages": [ToolMessage(content=f"Error: Tool '{tool_name}' not found.", tool_call_id="error")]}
+
+ # selected_tool = tool_map[tool_name]
+ # tool_output = selected_tool.invoke(parameters)
+ # return {"messages": [ToolMessage(content=str(tool_output), tool_call_id=tool_name)]}
+ # except Exception as e:
+ # return {"messages": [ToolMessage(content=f"Error parsing tool call: {e}. Content: '{last_message_content}'", tool_call_id="error")]}
+
+ # workflow = StateGraph(AgentState)
+ # workflow.add_node("agent", call_model)
+ # workflow.add_node("action", call_tool_node)
+ # workflow.set_entry_point("agent")
+ # workflow.add_conditional_edges("agent", should_continue, {"action": "action", "end": END})
+ # workflow.add_edge('action', 'agent')
+ # return workflow.compile()
+ # Use code with caution.
+ # #================================================================================================
+ # #✅ 3. AGENT CLASS AND EVALUATION LOGIC
+ # #================================================================================================
+ # class GaiaAgent:
+ # def init(self):
+ # print("GaiaAgent initialized. Building agent with Qwen/Qwen2-72B-Instruct...")
+ # self.agent_app = build_agent_graph()
+ # Generated code
+ # def __call__(self, question: str) -> str:
+ # print(f"\n{'='*60}\nAgent received question: {question[:100]}...\n{'='*60}")
+ # try:
+ # initial_input = {"messages": [SystemMessage(content=AGENT_SYSTEM_PROMPT), HumanMessage(content=question)]}
+ # final_state = None
+ # for step in self.agent_app.stream(initial_input, {"recursion_limit": 15}):
+ # final_state = list(step.values())[0]
+
+ # final_answer = final_state['messages'][-1].content
+ # return str(final_answer).strip()
+ # except Exception as e:
+ # print(f"An error occurred during agent execution: {e}")
+ # return f"AGENT_EXECUTION_ERROR: {e}"
+ # Use code with caution.
+ # --- The rest of the file is unchanged ---
+ # def run_and_submit_all( profile: gr.OAuthProfile | None):
+ # space_id = os.getenv("SPACE_ID")
+ # if not profile: return "Please Login to Hugging Face with the button.", None
+ # username = f"{profile.username}"
+ # print(f"User logged in: {username}")
+ # api_url = DEFAULT_API_URL
+ # questions_url = f"{api_url}/questions"
+ # submit_url = f"{api_url}/submit"
+ # agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
+ # Generated code
+ # try:
+ # response = requests.get(questions_url, timeout=15)
+ # response.raise_for_status()
+ # questions_data = response.json()
+ # except Exception as e: return f"An unexpected error occurred fetching questions: {e}", None
+
+ # results_log, answers_payload = [], []
+ # agent_instance = GaiaAgent()
+
+ # for item in questions_data:
+ # task_id, question_text = item.get("task_id"), item.get("question")
+ # if not task_id or question_text is None: continue
+ # try:
+ # submitted_answer = agent_instance(question_text)
+ # answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
+ # results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
+ # except Exception as e:
+ # print(f"Error running agent on task {task_id}: {e}")
+ # results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
+
+ # if not answers_payload: return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
+ # submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
+
+ # try:
+ # response = requests.post(submit_url, json=submission_data, timeout=90)
+ # response.raise_for_status()
+ # result_data = response.json()
+ # final_status = (
+ # f"Submission Successful!\n"
+ # f"User: {result_data.get('username')}\n"
+ # f"Overall Score: {result_data.get('score', 'N/A')}% "
+ # f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
+ # f"Message: {result_data.get('message', 'No message received.')}"
+ # )
+ # return final_status, pd.DataFrame(results_log)
+ # except Exception as e: return f"An unexpected error in submission: {e}", pd.DataFrame(results_log)
+ # Use code with caution.
+ # with gr.Blocks() as demo:
+ # gr.Markdown("# GAIA Agent Final Assessment (Qwen2-72B-Instruct)")
+ # gr.Markdown(
+ # """
+ # Instructor's Note: This version uses the powerful Qwen/Qwen2-72B-Instruct model from the Hugging Face Hub.
+ # It relies on a robust manual LangGraph loop to handle tool calls by instructing the model to generate JSON.
+ # 1. Ensure you have a HUGGINGFACEHUB_API_TOKEN and TAVILY_API_KEY set in your secrets.
+ # 2. Ensure your requirements.txt is updated. Good luck!
+ # """
+ # )
+ # gr.LoginButton()
+ # run_button = gr.Button("Run Evaluation & Submit All Answers")
+ # status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
+ # results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
+ # run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
+ # if name == "main":
+ # print("\n" + "-"*30 + " App Starting " + "-"*30)
+ # demo.launch(debug=True, share=False, ssr_mode=False)
+
+
+ #########################
  import os
+ import gradio as gr
  import requests
+ import inspect
  import pandas as pd
+ import json
+ import re
+ from typing import Dict, Any, List, Optional
+ from dataclasses import dataclass
+ import logging
+ from datetime import datetime
+ import traceback
+
+ # Third-party imports for the agent
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ import torch
  from tavily import TavilyClient
+ import tempfile
+ import subprocess
+ import sys
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)

  # --- Constants ---
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
+
+ # Agent System Prompt
+ AGENT_SYSTEM_PROMPT = """You are a world-class AI agent, specialized in solving complex problems from the GAIA benchmark. Your task is to analyze the user's question, think step-by-step, and use the provided tools to find the correct answer.
+
+ CRITICAL INSTRUCTIONS:
+ 1. **Analyze the Goal:** First, understand what the user is asking for.
+ 2. **Plan & Execute:** Formulate a plan and use the available tools (`tavily_search`, `read_file`, `python_interpreter`) to gather information.
+ 3. **Final Answer Format:** Once you are absolutely certain of the answer, you MUST provide it directly and concisely.
+    - DO NOT include your reasoning, thoughts, or any conversational text like 'The answer is...', 'Here is the result:', or 'Based on my search...'.
+    - Your final response must ONLY be the answer itself.
+
+ EXAMPLES OF CORRECT FINAL ANSWERS:
+ - If the question asks for a year: `2023`
+ - If it asks for a name: `John Doe`
+ - If it asks for a number: `42`
+ - If it asks for a comma-separated list: `item1, item2, item3`
+
+ Think, use your tools, and then provide ONLY the final, precise answer."""
+
+ @dataclass
+ class ToolResult:
+     """Result from a tool execution"""
+     success: bool
+     result: Any
+     error: Optional[str] = None
+
+ class ToolExecutor:
+     """Handles tool execution for the agent"""
+
+     def __init__(self):
+         self.tavily_client = None
+         self.setup_tavily()
+
+     def setup_tavily(self):
+         """Initialize Tavily search client"""
          try:
+             tavily_api_key = os.getenv("TAVILY_API_KEY")
+             if tavily_api_key:
+                 self.tavily_client = TavilyClient(api_key=tavily_api_key)
+                 logger.info("Tavily client initialized successfully")
+             else:
+                 logger.warning("TAVILY_API_KEY not found in environment variables")
+         except Exception as e:
+             logger.error(f"Failed to initialize Tavily client: {e}")
+
+     def tavily_search(self, query: str, max_results: int = 5) -> ToolResult:
+         """Search the web using Tavily"""
+         try:
+             if not self.tavily_client:
+                 return ToolResult(success=False, error="Tavily client not initialized")
+
+             response = self.tavily_client.search(
+                 query=query,
+                 search_depth="advanced",
+                 max_results=max_results,
+                 include_answer=True,
+                 include_raw_content=True
+             )
+
+             # Extract relevant information
+             results = []
+             if response.get('results'):
+                 for result in response['results']:
+                     results.append({
+                         'title': result.get('title', ''),
+                         'content': result.get('content', ''),
+                         'url': result.get('url', ''),
+                         'score': result.get('score', 0)
+                     })
+
+             search_result = {
+                 'answer': response.get('answer', ''),
+                 'results': results,
+                 'query': query
+             }
+
+             return ToolResult(success=True, result=search_result)
+
+         except Exception as e:
+             logger.error(f"Tavily search error: {e}")
+             return ToolResult(success=False, error=str(e))
+
+     def python_interpreter(self, code: str) -> ToolResult:
+         """Execute Python code safely"""
+         try:
+             # Create a temporary file for the code
+             with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
+                 f.write(code)
+                 temp_file = f.name
+
+             # Execute the code and capture output
+             result = subprocess.run(
+                 [sys.executable, temp_file],
+                 capture_output=True,
+                 text=True,
+                 timeout=30  # 30 seconds timeout
+             )
+
+             # Clean up
+             os.unlink(temp_file)
+
+             if result.returncode == 0:
+                 return ToolResult(success=True, result=result.stdout.strip())
+             else:
+                 return ToolResult(success=False, error=result.stderr.strip())
+
+         except subprocess.TimeoutExpired:
+             return ToolResult(success=False, error="Code execution timed out")
+         except Exception as e:
+             logger.error(f"Python interpreter error: {e}")
+             return ToolResult(success=False, error=str(e))
+
+     def read_file(self, file_path: str) -> ToolResult:
+         """Read a file and return its contents"""
+         try:
+             if not os.path.exists(file_path):
+                 return ToolResult(success=False, error=f"File not found: {file_path}")
+
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 content = f.read()
+
+             return ToolResult(success=True, result=content)
+
+         except Exception as e:
+             logger.error(f"File reading error: {e}")
+             return ToolResult(success=False, error=str(e))

+ class GAIAAgent:
+     """Advanced GAIA benchmark agent using Qwen model with tool integration"""
+
+     def __init__(self, model_name: str = "Qwen/Qwen2.5-7B-Instruct"):
+         self.model_name = model_name
+         self.tool_executor = ToolExecutor()
+         self.tokenizer = None
+         self.model = None
+         self.pipeline = None
+         self.setup_model()
+         logger.info(f"GAIAAgent initialized with model: {model_name}")
+
+     def setup_model(self):
+         """Initialize the Qwen model and tokenizer"""
+         try:
+             # Check if CUDA is available
+             device = "cuda" if torch.cuda.is_available() else "cpu"
+             logger.info(f"Using device: {device}")
+
+             # Load tokenizer and model
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 self.model_name,
+                 trust_remote_code=True
+             )
+
+             # Use pipeline for easier inference
+             self.pipeline = pipeline(
+                 "text-generation",
+                 model=self.model_name,
+                 tokenizer=self.tokenizer,
+                 torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+                 device_map="auto" if device == "cuda" else None,
+                 trust_remote_code=True
+             )
+
+             logger.info("Model loaded successfully")
+
+         except Exception as e:
+             logger.error(f"Failed to load model: {e}")
+             # Fallback to a simpler approach
+             self.setup_fallback_model()

+     def setup_fallback_model(self):
+         """Setup a fallback model if main model fails"""
+         try:
+             # Try a smaller model
+             fallback_model = "microsoft/DialoGPT-medium"
+             self.pipeline = pipeline(
+                 "text-generation",
+                 model=fallback_model,
+                 tokenizer=fallback_model
+             )
+             logger.info(f"Fallback model loaded: {fallback_model}")
+         except Exception as e:
+             logger.error(f"Fallback model also failed: {e}")
+             self.pipeline = None
+
+     def extract_tool_calls(self, text: str) -> List[Dict[str, Any]]:
+         """Extract tool calls from the model's response"""
+         tool_calls = []
+
+         # Pattern to match tool calls like: <tool_call>tavily_search("query")</tool_call>
+         pattern = r'<tool_call>(\w+)\(([^)]+)\)</tool_call>'
+         matches = re.findall(pattern, text)
+
+         for tool_name, args_str in matches:
+             try:
+                 # Simple argument parsing (assumes string arguments)
+                 args = args_str.strip().strip('"\'')
+                 tool_calls.append({
+                     'tool': tool_name,
+                     'args': args
+                 })
+             except Exception as e:
+                 logger.error(f"Failed to parse tool call: {e}")
+
+         return tool_calls
+
+     def execute_tools(self, tool_calls: List[Dict[str, Any]]) -> str:
+         """Execute tool calls and return results"""
+         results = []
+
+         for call in tool_calls:
+             tool_name = call['tool']
+             args = call['args']
+
+             if tool_name == 'tavily_search':
+                 result = self.tool_executor.tavily_search(args)
+             elif tool_name == 'python_interpreter':
+                 result = self.tool_executor.python_interpreter(args)
+             elif tool_name == 'read_file':
+                 result = self.tool_executor.read_file(args)
+             else:
+                 result = ToolResult(success=False, error=f"Unknown tool: {tool_name}")
+
+             if result.success:
+                 results.append(f"Tool {tool_name} result: {result.result}")
+             else:
+                 results.append(f"Tool {tool_name} error: {result.error}")
+
+         return "\n".join(results)
+
+     def generate_response(self, prompt: str, max_length: int = 1000) -> str:
+         """Generate response using the model"""
+         try:
+             if not self.pipeline:
+                 return "Model not available"
+
+             # Generate response
+             outputs = self.pipeline(
+                 prompt,
+                 max_length=max_length,
+                 do_sample=True,
+                 temperature=0.7,
+                 top_p=0.9,
+                 pad_token_id=self.tokenizer.eos_token_id if self.tokenizer else None
+             )
+
+             # Extract the generated text
+             generated_text = outputs[0]['generated_text']
+
+             # Remove the input prompt from the output
+             if generated_text.startswith(prompt):
+                 generated_text = generated_text[len(prompt):].strip()
+
+             return generated_text
+
+         except Exception as e:
+             logger.error(f"Generation error: {e}")
+             return f"Generation failed: {str(e)}"
+
+     def solve_with_reasoning(self, question: str) -> str:
+         """Solve question with step-by-step reasoning and tool usage"""
+         try:
+             # Create initial prompt
+             reasoning_prompt = f"""
+ {AGENT_SYSTEM_PROMPT}
+
+ Question: {question}
+
+ Let me think through this step by step:
+
+ 1. First, I need to understand what this question is asking for.
+ 2. Then I'll determine what tools I need to use.
+ 3. I'll gather information using the appropriate tools.
+ 4. Finally, I'll provide the precise answer.
+
+ Let me start by analyzing the question:
+ """
+
+             # Generate initial reasoning
+             response = self.generate_response(reasoning_prompt)
+
+             # Check if we need to use tools
+             if self.should_use_search(question, response):
+                 search_result = self.tool_executor.tavily_search(question)
+                 if search_result.success:
+                     # Incorporate search results
+                     search_info = search_result.result
+                     enhanced_prompt = f"""
+ {reasoning_prompt}
+
+ Based on my analysis, I need to search for information. Here are the search results:
+
+ Search Query: {question}
+ Answer: {search_info.get('answer', 'No direct answer found')}
+
+ Top Results:
+ """
+                     for i, result in enumerate(search_info.get('results', [])[:3]):
+                         enhanced_prompt += f"Result {i+1}: {result.get('title', '')}\n{result.get('content', '')[:200]}...\n\n"
+
+                     enhanced_prompt += "\nBased on this information, the answer is:"
+
+                     final_response = self.generate_response(enhanced_prompt, max_length=500)
+                     return self.extract_final_answer(final_response)
+
+             # Check if we need Python computation
+             if self.should_use_python(question, response):
+                 # Generate Python code
+                 code_prompt = f"""
+ Question: {question}
+
+ I need to solve this using Python. Let me write the code:
+
+ ```python
+ """
+                 code_response = self.generate_response(code_prompt, max_length=300)
+
+                 # Extract Python code
+                 python_code = self.extract_python_code(code_response)
+                 if python_code:
+                     exec_result = self.tool_executor.python_interpreter(python_code)
+                     if exec_result.success:
+                         return str(exec_result.result).strip()
+
+             # If no tools needed, extract answer from reasoning
+             return self.extract_final_answer(response)
+
+         except Exception as e:
+             logger.error(f"Error in solve_with_reasoning: {e}")
+             return self.fallback_solve(question)
+
+     def should_use_search(self, question: str, response: str) -> bool:
+         """Determine if we should use web search"""
+         search_indicators = [
+             "current", "recent", "latest", "news", "today", "now",
+             "who is", "what is", "when did", "where is",
+             "population", "capital", "president", "CEO",
+             "founded", "established", "released", "launched"
+         ]
+
+         question_lower = question.lower()
+         return any(indicator in question_lower for indicator in search_indicators)
+
+     def should_use_python(self, question: str, response: str) -> bool:
+         """Determine if we should use Python computation"""
+         python_indicators = [
+             "calculate", "compute", "solve", "equation", "formula",
+             "sum", "average", "total", "percentage", "rate",
+             "graph", "plot", "data", "analysis", "statistics"
+         ]
+
+         question_lower = question.lower()
+         return any(indicator in question_lower for indicator in python_indicators)
+
+     def extract_python_code(self, text: str) -> str:
+         """Extract Python code from generated text"""
+         # Look for code blocks
+         code_pattern = r'```python\n(.*?)\n```'
+         matches = re.findall(code_pattern, text, re.DOTALL)
+
+         if matches:
+             return matches[0].strip()
+
+         # Look for simple code after "python" keyword
+         lines = text.split('\n')
+         code_lines = []
+         in_code = False
+
+         for line in lines:
+             if 'python' in line.lower() or in_code:
+                 in_code = True
+                 if line.strip() and not line.strip().startswith('#'):
+                     code_lines.append(line)
+
+         return '\n'.join(code_lines) if code_lines else ""
+
+     def extract_final_answer(self, text: str) -> str:
+         """Extract the final answer from generated text"""
+         # Look for common answer patterns
+         answer_patterns = [
+             r'(?:the answer is|answer:|final answer:)\s*(.+?)(?:\n|$)',
+             r'(?:therefore|thus|so|hence),?\s*(.+?)(?:\n|$)',
+             r'(?:result|conclusion):\s*(.+?)(?:\n|$)',
+         ]
+
+         for pattern in answer_patterns:
+             matches = re.findall(pattern, text, re.IGNORECASE)
+             if matches:
+                 answer = matches[-1].strip()
+                 # Clean up the answer
+                 answer = re.sub(r'^["\']|["\']$', '', answer)  # Remove quotes
+                 answer = answer.strip('.,!?')  # Remove trailing punctuation
+                 return answer
+
+         # If no pattern found, return the last meaningful line
+         lines = [line.strip() for line in text.split('\n') if line.strip()]
+         if lines:
+             return lines[-1]
+
+         return text.strip()
+
+     def fallback_solve(self, question: str) -> str:
+         """Simple fallback solution method"""
+         try:
+             # Try direct search first
+             search_result = self.tool_executor.tavily_search(question)
+             if search_result.success and search_result.result.get('answer'):
+                 return search_result.result['answer']
+
+             # If search fails, try basic pattern matching
+             question_lower = question.lower()
+
+             # Handle year questions
+             if 'year' in question_lower or 'when' in question_lower:
+                 # Look for 4-digit years in search results
+                 if search_result.success:
+                     text = str(search_result.result)
+                     # Non-capturing group so findall returns full years (e.g. "2023"), not just the "19"/"20" prefix
+                     years = re.findall(r'\b(?:19|20)\d{2}\b', text)
+                     if years:
+                         return years[0]
959
+
960
+ # Handle number questions
961
+ if any(word in question_lower for word in ['how many', 'number', 'count']):
962
+ if search_result.success:
963
+ text = str(search_result.result)
964
+ numbers = re.findall(r'\b\d+\b', text)
965
+ if numbers:
966
+ return numbers[0]
967
+
968
+ # Default fallback
969
+ return "Unable to determine answer"
970
+
971
+ except Exception as e:
972
+ logger.error(f"Fallback solve error: {e}")
973
+ return "Error processing question"
974
+
975
+ def __call__(self, question: str) -> str:
976
+ """Main entry point for the agent"""
977
+ logger.info(f"Processing question: {question[:100]}...")
978
+
979
+ try:
980
+ # Solve the question
981
+ answer = self.solve_with_reasoning(question)
982
+
983
+ # Clean and validate answer
984
+ final_answer = answer.strip()
985
+ if not final_answer:
986
+ final_answer = self.fallback_solve(question)
987
+
988
+ logger.info(f"Generated answer: {final_answer}")
989
+ return final_answer
990
+
991
+ except Exception as e:
992
+ logger.error(f"Error in agent call: {e}")
993
+ logger.error(traceback.format_exc())
994
+ return self.fallback_solve(question)
995
+
996
+ def run_and_submit_all(profile: gr.OAuthProfile | None):
997
+ """
998
+ Fetches all questions, runs the GAIAAgent on them, submits all answers,
999
+ and displays the results.
1000
+ """
1001
+ # --- Determine HF Space Runtime URL and Repo URL ---
1002
+ space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
1003
+
1004
+ if profile:
1005
+ username = f"{profile.username}"
1006
+ print(f"User logged in: {username}")
1007
  else:
1008
+ print("User not logged in.")
1009
+ return "Please Login to Hugging Face with the button.", None
1010
+
1011
+ api_url = DEFAULT_API_URL
1012
+ questions_url = f"{api_url}/questions"
1013
+ submit_url = f"{api_url}/submit"
1014
 
1015
+ # 1. Instantiate Agent
1016
  try:
1017
+ agent = GAIAAgent()
 
 
 
 
 
 
 
 
1018
  except Exception as e:
1019
+ print(f"Error instantiating agent: {e}")
1020
+ return f"Error initializing agent: {e}", None
1021
+
1022
+ # In the case of an app running as a Hugging Face space, this link points toward your codebase
1023
+ agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
1024
+ print(agent_code)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1025
 
1026
+ # 2. Fetch Questions
1027
+ print(f"Fetching questions from: {questions_url}")
1028
+ try:
1029
+ response = requests.get(questions_url, timeout=15)
1030
+ response.raise_for_status()
1031
+ questions_data = response.json()
1032
+ if not questions_data:
1033
+ print("Fetched questions list is empty.")
1034
+ return "Fetched questions list is empty or invalid format.", None
1035
+ print(f"Fetched {len(questions_data)} questions.")
1036
+ except requests.exceptions.RequestException as e:
1037
+ print(f"Error fetching questions: {e}")
1038
+ return f"Error fetching questions: {e}", None
1039
+ except requests.exceptions.JSONDecodeError as e:
1040
+ print(f"Error decoding JSON response from questions endpoint: {e}")
1041
+ print(f"Response text: {response.text[:500]}")
1042
+ return f"Error decoding server response for questions: {e}", None
1043
  except Exception as e:
1044
+ print(f"An unexpected error occurred fetching questions: {e}")
1045
+ return f"An unexpected error occurred fetching questions: {e}", None
1046
+
1047
+ # 3. Run your Agent
1048
+ results_log = []
1049
+ answers_payload = []
1050
+ print(f"Running agent on {len(questions_data)} questions...")
1051
+
1052
+ for i, item in enumerate(questions_data):
1053
+ task_id = item.get("task_id")
1054
+ question_text = item.get("question")
1055
+ if not task_id or question_text is None:
1056
+ print(f"Skipping item with missing task_id or question: {item}")
1057
+ continue
1058
+
1059
+ print(f"Processing question {i+1}/{len(questions_data)}: {task_id}")
1060
+ try:
1061
+ submitted_answer = agent(question_text)
1062
+ answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
1063
+ results_log.append({
1064
+ "Task ID": task_id,
1065
+ "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
1066
+ "Submitted Answer": submitted_answer
1067
+ })
1068
+ print(f"Answer for {task_id}: {submitted_answer}")
1069
+ except Exception as e:
1070
+ print(f"Error running agent on task {task_id}: {e}")
1071
+ error_msg = f"AGENT ERROR: {e}"
1072
+ answers_payload.append({"task_id": task_id, "submitted_answer": error_msg})
1073
+ results_log.append({
1074
+ "Task ID": task_id,
1075
+ "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
1076
+ "Submitted Answer": error_msg
1077
+ })
1078
+
1079
+ if not answers_payload:
1080
+ print("Agent did not produce any answers to submit.")
1081
+ return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
1082
+
1083
+ # 4. Prepare Submission
1084
+ submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
1085
+ status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
1086
+ print(status_update)
1087
+
1088
+ # 5. Submit
1089
+ print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
1090
  try:
1091
+ response = requests.post(submit_url, json=submission_data, timeout=60)
1092
+ response.raise_for_status()
1093
+ result_data = response.json()
1094
+ final_status = (
1095
+ f"Submission Successful!\n"
1096
+ f"User: {result_data.get('username')}\n"
1097
+ f"Overall Score: {result_data.get('score', 'N/A')}% "
1098
+ f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
1099
+ f"Message: {result_data.get('message', 'No message received.')}"
1100
+ )
1101
+ print("Submission successful.")
1102
+ results_df = pd.DataFrame(results_log)
1103
+ return final_status, results_df
1104
+ except requests.exceptions.HTTPError as e:
1105
+ error_detail = f"Server responded with status {e.response.status_code}."
1106
+ try:
1107
+ error_json = e.response.json()
1108
+ error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
1109
+ except requests.exceptions.JSONDecodeError:
1110
+ error_detail += f" Response: {e.response.text[:500]}"
1111
+ status_message = f"Submission Failed: {error_detail}"
1112
+ print(status_message)
1113
+ results_df = pd.DataFrame(results_log)
1114
+ return status_message, results_df
1115
+ except requests.exceptions.Timeout:
1116
+ status_message = "Submission Failed: The request timed out."
1117
+ print(status_message)
1118
+ results_df = pd.DataFrame(results_log)
1119
+ return status_message, results_df
1120
+ except requests.exceptions.RequestException as e:
1121
+ status_message = f"Submission Failed: Network error - {e}"
1122
+ print(status_message)
1123
+ results_df = pd.DataFrame(results_log)
1124
+ return status_message, results_df
1125
  except Exception as e:
1126
+ status_message = f"An unexpected error occurred during submission: {e}"
1127
+ print(status_message)
1128
+ results_df = pd.DataFrame(results_log)
1129
+ return status_message, results_df
1130
+
1131
+ # --- Build Gradio Interface using Blocks ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1132
  with gr.Blocks() as demo:
1133
+ gr.Markdown("# GAIA Benchmark AI Agent")
1134
+ gr.Markdown(
1135
+ """
1136
+ **Advanced AI Agent for GAIA Benchmark**
1137
+
1138
+ This agent uses:
1139
+ - **Qwen 2.5-7B-Instruct** for reasoning and planning
1140
+ - **Tavily Search** for real-time information retrieval
1141
+ - **Python Interpreter** for computational tasks
1142
+ - **File Reading** capabilities for document analysis
1143
+
1144
+ **Instructions:**
1145
+ 1. Clone this space and set up your environment variables:
1146
+ - `TAVILY_API_KEY`: Your Tavily API key for web search
1147
+ - `HF_TOKEN`: Your Hugging Face token (if needed)
1148
+ 2. Log in to your Hugging Face account using the button below
1149
+ 3. Click 'Run Evaluation & Submit All Answers' to start the evaluation
1150
+
1151
+ **Expected Performance:** This agent is designed to score >30% on the GAIA benchmark.
1152
+ """
1153
+ )
1154
+
1155
+ gr.LoginButton()
1156
+
1157
+ run_button = gr.Button("Run Evaluation & Submit All Answers", variant="primary")
1158
+
1159
+ status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
1160
+ results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
1161
+
1162
+ run_button.click(
1163
+ fn=run_and_submit_all,
1164
+ outputs=[status_output, results_table]
1165
+ )
1166
+
1167
+ if __name__ == "__main__":
1168
+ print("\n" + "-"*50 + " GAIA Agent Starting " + "-"*50)
1169
+
1170
+ # Check for required environment variables
1171
+ required_vars = ["TAVILY_API_KEY"]
1172
+ missing_vars = []
1173
+
1174
+ for var in required_vars:
1175
+ if not os.getenv(var):
1176
+ missing_vars.append(var)
1177
+
1178
+ if missing_vars:
1179
+ print(f"⚠️ Missing environment variables: {', '.join(missing_vars)}")
1180
+ print(" Please set these variables for optimal performance.")
1181
+ else:
1182
+ print("✅ All required environment variables found.")
1183
+
1184
+ # Check for SPACE_HOST and SPACE_ID at startup for information
1185
+ space_host_startup = os.getenv("SPACE_HOST")
1186
+ space_id_startup = os.getenv("SPACE_ID")
1187
+
1188
+ if space_host_startup:
1189
+ print(f"✅ SPACE_HOST found: {space_host_startup}")
1190
+ print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
1191
+ else:
1192
+ print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
1193
+
1194
+ if space_id_startup:
1195
+ print(f"✅ SPACE_ID found: {space_id_startup}")
1196
+ print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
1197
+ print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
1198
+ else:
1199
+ print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
1200
+
1201
+ print("-"*120 + "\n")
1202
+ print("🚀 Launching GAIA Benchmark AI Agent...")
1203
+ demo.launch(debug=True, share=False)
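
The new agent's tool-call convention is defined entirely by the regex in `extract_tool_calls`. Below is a minimal, self-contained sketch of how that convention round-trips from model text to a tool result; the `reply` string and the stub `tavily_search` here are illustrative assumptions, not part of the commit:

import re

# Same pattern as GAIAAgent.extract_tool_calls:
# <tool_call>tool_name(argument)</tool_call>, where the argument may be quoted.
PATTERN = r'<tool_call>(\w+)\(([^)]+)\)</tool_call>'

def tavily_search(query: str) -> str:
    # Hypothetical stub standing in for ToolExecutor.tavily_search.
    return f"stub results for {query!r}"

TOOLS = {"tavily_search": tavily_search}

reply = 'Let me check. <tool_call>tavily_search("capital of France")</tool_call>'

for name, raw_arg in re.findall(PATTERN, reply):
    arg = raw_arg.strip().strip('"\'')  # the commit's simple single-string argument parsing
    print(TOOLS[name](arg) if name in TOOLS else f"Unknown tool: {name}")
# prints: stub results for 'capital of France'

Note that the pattern only supports a single positional argument containing no ')' character, so multi-argument or nested calls would need a richer parser.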