Spaces: Running

Update app.py
Browse files

app.py (CHANGED)

@@ -247,263 +247,957 @@
| 247 | # =================================================================================================
| 248 |
| 249 | #
| 250 | import os
| 251 | - import io
| 252 | - import json
| 253 | import requests
| 254 | import pandas as pd
| 255 | - import gradio as gr
| 256 | - from contextlib import redirect_stdout
| 257 | - from typing import TypedDict, Annotated, List
| 258 | - import operator
| 259 | -
| 260 | - # --- LangChain & LangGraph Imports ---
| 261 | - from langchain_core.messages import BaseMessage, HumanMessage, ToolMessage, AIMessage, SystemMessage
| 262 | - from langchain_core.tools import tool
| 263 | - from langchain_huggingface import HuggingFaceEndpoint
| 264 | - from langgraph.graph import StateGraph, END
| 265 | from tavily import TavilyClient
| 266 | - import pypdf
| 267 |
| 268 | # --- Constants ---
| 269 | DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
| 270 | - FILES_DIR = "./files"
| 271 | - os.makedirs(FILES_DIR, exist_ok=True)
| 272 | -
| 273 | - # --- System Prompt (Updated for Manual JSON Tool Calling) ---
| 274 | - # This prompt instructs the model to generate JSON, a robust method for tool calls.
| 275 | - AGENT_SYSTEM_PROMPT = """You are a world-class AI agent, specialized in solving complex problems from the GAIA benchmark.
| 276 | - Your task is to analyze the user's question, think step-by-step, and use the provided tools to find the correct answer.
| 277 | -
| 278 | - **TOOL USAGE INSTRUCTIONS:**
| 279 | - When you need to use a tool, you MUST respond with a JSON object containing the tool name and its arguments. The JSON object should have two keys: "tool_name" and "parameters".
| 280 | -
| 281 | - Here is an example of how to call the `tavily_search` tool:
| 282 | - ```json
| 283 | - {
| 284 | -   "tool_name": "tavily_search",
| 285 | -   "parameters": {
| 286 | -     "query": "Who won the last FIFA World Cup?"
| 287 | -   }
| 288 | - }
| 289 | - Use code with caution.
| 290 | - Python
| 291 | - CRITICAL FINAL ANSWER INSTRUCTIONS:
| 292 | - Once you have gathered all the necessary information and are absolutely certain of the answer, you MUST provide it directly and concisely.
| 293 | - Your final response must ONLY be the answer itself.
| 294 | - DO NOT wrap the final answer in a JSON object or include any conversational text.
| 295 | - Think, use your tools, and then provide ONLY the final, precise answer.
| 296 | - """
| 297 | - ###===============================================================================================
| 298 | - tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
| 299 | - @tool
| 300 | - def tavily_search(query: str) -> str:
| 301 | -     """Uses the Tavily Search API to find information on the web."""
| 302 | -     print(f"--- Calling Tavily Search Tool with query: {query} ---")
| 303 | -     try:
| 304 | -         result = tavily.search(query=query, search_depth="advanced")
| 305 | -         return f"Search results for '{query}':\n" + "\n".join([f"- {r['content']}" for r in result['results']])
| 306 | -     except Exception as e: return f"Error during Tavily search: {e}"
| 307 | - @tool
| 308 | - def read_file(url: str) -> str:
| 309 | -     """Downloads and reads the content of a file (text or PDF) from a URL."""
| 310 | -     print(f"--- Calling Read File Tool with URL: {url} ---")
| 311 | -     try:
| 312 | -         filename = os.path.join(FILES_DIR, os.path.basename(url))
| 313 | -         response = requests.get(url)
| 314 | -         response.raise_for_status()
| 315 | -         with open(filename, 'wb') as f: f.write(response.content)
| 316 | -         if url.lower().endswith('.pdf'):
| 317 | -             try:
| 318 | -                 pdf_reader = pypdf.PdfReader(filename)
| 319 | -                 return f"Successfully read PDF file '{filename}'. Content:\n\n{''.join(p.extract_text() for p in pdf_reader.pages)}"
| 320 | -             except Exception as e: return f"Error reading PDF file: {e}"
| 321 | -         else:
| 322 | -             try:
| 323 | -                 with open(filename, 'r', encoding='utf-8') as f: return f"Successfully read text file '{filename}'. Content:\n\n{f.read()}"
| 324 | -             except UnicodeDecodeError: return f"Successfully downloaded binary file '{filename}'. Cannot display content as text."
| 325 | -     except requests.exceptions.RequestException as e: return f"Error downloading or reading file: {e}"
| 326 | - @tool
| 327 | - def python_interpreter(code: str) -> str:
| 328 | -     """Executes Python code and returns its stdout."""
| 329 | -     print(f"--- Calling Python Interpreter Tool with code:\n{code} ---")
| 330 | -     output_buffer = io.StringIO()
| 331 | -     try:
| 332 | -         with redirect_stdout(output_buffer): exec(code, globals())
| 333 | -         return f"Code executed successfully. Output:\n{output_buffer.getvalue()}"
| 334 | -     except Exception as e: return f"Error executing Python code: {e}"
| 335 | - ##================================================================================================
| 336 | - #✅ 2. CONFIGURE AND BUILD THE AGENT (with Qwen2 and Manual Tool Calling)
| 337 | - #================================================================================================
| 338 | - class AgentState(TypedDict):
| 339 | -     messages: Annotated[List[BaseMessage], operator.add]
| 340 | - def build_agent_graph():
| 341 | -     """Builds the agent using a manual LangGraph loop with the HuggingFaceEndpoint."""
| 342 | -     tools = [tavily_search, read_file, python_interpreter]
| 343 | -     tool_map = {tool.name: tool for tool in tools}
| 344 | - Generated code
| 345 | -     # Using Qwen2-72B-Instruct model via HuggingFaceEndpoint
| 346 | -     repo_id = "Qwen/Qwen2-72B-Instruct"
| 347 | -     llm = HuggingFaceEndpoint(
| 348 | -         repo_id=repo_id,
| 349 | -         max_new_tokens=1024,
| 350 | -         temperature=0.1,
| 351 | -         huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN")
| 352 | -     )
| 353 | -
| 354 | -     def call_model(state: AgentState):
| 355 | -         """Invokes the LLM and wraps the response in an AIMessage."""
| 356 | -         # Qwen2 Instruct uses a specific chat template. We build it manually.
| 357 | -         prompt_str = ""
| 358 | -         for msg in state['messages']:
| 359 | -             role = ""
| 360 | -             if isinstance(msg, SystemMessage): role = "system"
| 361 | -             elif isinstance(msg, HumanMessage): role = "user"
| 362 | -             elif isinstance(msg, AIMessage): role = "assistant"
| 363 | -             elif isinstance(msg, ToolMessage): continue  # We'll handle tool results differently
| 364 | -
| 365 | -             if role: prompt_str += f"<|im_start|>{role}\n{msg.content}<|im_end|>\n"
| 366 | -
| 367 | -         # Add results from the last tool call, if any
| 368 | -         if isinstance(state['messages'][-1], ToolMessage):
| 369 | -             prompt_str += f"<|im_start|>user\nTool output:\n{state['messages'][-1].content}<|im_end|>\n"
| 370 | -
| 371 | -         prompt_str += "<|im_start|>assistant\n"
| 372 | -
| 373 | -         response_text = llm.invoke(prompt_str)
| 374 | -         return {"messages": [AIMessage(content=response_text)]}
| 375 | -
| 376 | -     def should_continue(state: AgentState) -> str:
| 377 | -         """Determines whether to call a tool or end the loop."""
| 378 | -         last_message_content = state['messages'][-1].content.strip()
| 379 | -         # A simple check for JSON is a reliable way to detect tool calls.
| 380 | -         if "```json" in last_message_content:
| 381 | -             return "action"
| 382 | -         if last_message_content.startswith('{') and last_message_content.endswith('}'):
| 383 |             try:
| 384 | -                 json.loads(last_message_content)
| 385 | -                 return "action"
| 386 | -             except json.JSONDecodeError:
| 387 | -                 return "end"  # Not valid JSON, must be the final answer
| 388 | -         else:
| 389 | -             return "end"
| 390 |
| 391 | -     def call_tool_node(state: AgentState):
| 392 | -         """Parses the JSON tool call from the LLM and executes it."""
| 393 | -         last_message_content = state['messages'][-1].content.strip()
| 394 |
| 395 | -         # Extract JSON from markdown code block if present
| 396 | -         if "```json" in last_message_content:
| 397 | -             json_str = last_message_content.split("```json").split("```")[0].strip()
| 398 |         else:
| 399 | -             json_str = last_message_content
| 400 |
| 401 |         try:
| 402 | -             tool_call_data = json.loads(json_str)
| 403 | -             tool_name = tool_call_data.get("tool_name")
| 404 | -             parameters = tool_call_data.get("parameters", {})
| 405 | -             if tool_name not in tool_map:
| 406 | -                 return {"messages": [ToolMessage(content=f"Error: Tool '{tool_name}' not found.", tool_call_id="error")]}
| 407 | -
| 408 | -             selected_tool = tool_map[tool_name]
| 409 | -             tool_output = selected_tool.invoke(parameters)
| 410 | -             return {"messages": [ToolMessage(content=str(tool_output), tool_call_id=tool_name)]}
| 411 |         except Exception as e:
| 412 | -             return {"messages": [ToolMessage(content=f"Error parsing tool call: {e}. Content: '{last_message_content}'", tool_call_id="error")]}
| 413 | -
| 414 | -     workflow = StateGraph(AgentState)
| 415 | -     workflow.add_node("agent", call_model)
| 416 | -     workflow.add_node("action", call_tool_node)
| 417 | -     workflow.set_entry_point("agent")
| 418 | -     workflow.add_conditional_edges("agent", should_continue, {"action": "action", "end": END})
| 419 | -     workflow.add_edge('action', 'agent')
| 420 | -     return workflow.compile()
| 421 | - Use code with caution.
| 422 | - #================================================================================================
| 423 | - #✅ 3. AGENT CLASS AND EVALUATION LOGIC
| 424 | - #================================================================================================
| 425 | - class GaiaAgent:
| 426 | -     def init(self):
| 427 | -         print("GaiaAgent initialized. Building agent with Qwen/Qwen2-72B-Instruct...")
| 428 | -         self.agent_app = build_agent_graph()
| 429 | - Generated code
| 430 | -     def __call__(self, question: str) -> str:
| 431 | -         print(f"\n{'='*60}\nAgent received question: {question[:100]}...\n{'='*60}")
| 432 | -         try:
| 433 | -             initial_input = {"messages": [SystemMessage(content=AGENT_SYSTEM_PROMPT), HumanMessage(content=question)]}
| 434 | -             final_state = None
| 435 | -             for step in self.agent_app.stream(initial_input, {"recursion_limit": 15}):
| 436 | -                 final_state = list(step.values())[0]
| 437 |
| 438 | -             final_answer = final_state['messages'][-1].content
| 439 | -             return str(final_answer).strip()
| 440 |         except Exception as e:
| 441 | -             print(f"An error occurred during agent execution: {e}")
| 442 | -             return f"AGENT_EXECUTION_ERROR: {e}"
| 443 | - Use code with caution.
| 444 | - --- The rest of the file is unchanged ---
| 445 | - def run_and_submit_all( profile: gr.OAuthProfile | None):
| 446 | -     space_id = os.getenv("SPACE_ID")
| 447 | -     if not profile: return "Please Login to Hugging Face with the button.", None
| 448 | -     username = f"{profile.username}"
| 449 | -     print(f"User logged in: {username}")
| 450 | -     api_url = DEFAULT_API_URL
| 451 | -     questions_url = f"{api_url}/questions"
| 452 | -     submit_url = f"{api_url}/submit"
| 453 | -     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
| 454 | - Generated code
| 455 | -     try:
| 456 | -         response = requests.get(questions_url, timeout=15)
| 457 | -         response.raise_for_status()
| 458 | -         questions_data = response.json()
| 459 | -     except Exception as e: return f"An unexpected error occurred fetching questions: {e}", None
| 460 | -
| 461 | -     results_log, answers_payload = [], []
| 462 | -     agent_instance = GaiaAgent()
| 463 | -
| 464 | -     for item in questions_data:
| 465 | -         task_id, question_text = item.get("task_id"), item.get("question")
| 466 | -         if not task_id or question_text is None: continue
| 467 |         try:
| 468 | -             submitted_answer = agent_instance(question_text)
| 469 | -             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
| 470 | -             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
| 471 |         except Exception as e:
| 472 | -             print(f"Error running agent on task {task_id}: {e}")
| 473 | -             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
| 474 | -
| 475 | -     if not answers_payload: return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
| 476 | -     submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
| 477 | -
| 478 | -     try:
| 479 | -         response = requests.post(submit_url, json=submission_data, timeout=90)
| 480 | -         response.raise_for_status()
| 481 | -         result_data = response.json()
| 482 | -         final_status = (
| 483 | -             f"Submission Successful!\n"
| 484 | -             f"User: {result_data.get('username')}\n"
| 485 | -             f"Overall Score: {result_data.get('score', 'N/A')}% "
| 486 | -             f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
| 487 | -             f"Message: {result_data.get('message', 'No message received.')}"
| 488 | -         )
| 489 | -         return final_status, pd.DataFrame(results_log)
| 490 | -     except Exception as e: return f"An unexpected error in submission: {e}", pd.DataFrame(results_log)
| 491 | - Use code with caution.
| 492 | with gr.Blocks() as demo:
| 493 | -     gr.Markdown("# GAIA Agent Final Assessment (Qwen2-72B-Instruct)")
| 494 | -     gr.Markdown(
| 495 | -         """
| 496 | -         Instructor's Note: This version uses the powerful Qwen/Qwen2-72B-Instruct model from the Hugging Face Hub.
| 497 | -         It relies on a robust manual LangGraph loop to handle tool calls by instructing the model to generate JSON.
| 498 | -         1. Ensure you have a HUGGINGFACEHUB_API_TOKEN and TAVILY_API_KEY set in your secrets.
| 499 | -         2. Ensure your requirements.txt is updated. Good luck!
| 500 | -         """
| 501 | -     )
| 502 | -     gr.LoginButton()
| 503 | -     run_button = gr.Button("Run Evaluation & Submit All Answers")
| 504 | -     status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
| 505 | -     results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
| 506 | -     run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
| 507 | - if name == "main":
| 508 | -     print("\n" + "-"*30 + " App Starting " + "-"*30)
| 509 | -     demo.launch(debug=True, share=False, ssr_mode=False)
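Aside: the core of the removed implementation above is the manual ChatML prompt that `call_model` builds for Qwen2. A minimal sketch of that assembly, assuming plain `(role, content)` tuples as stand-ins for the LangChain message classes:

```python
# Sketch of the ChatML prompt assembly performed by call_model above.
# (role, content) tuples are hypothetical stand-ins for LangChain's message objects.
def build_chatml_prompt(messages: list[tuple[str, str]]) -> str:
    """Render messages in Qwen2's <|im_start|>role ... <|im_end|> format."""
    prompt = ""
    for role, content in messages:
        prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n"
    # The trailing open assistant turn cues the model to generate its reply.
    prompt += "<|im_start|>assistant\n"
    return prompt

if __name__ == "__main__":
    print(build_chatml_prompt([
        ("system", "You are a helpful agent."),
        ("user", "What is 2 + 2?"),
    ]))
```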
|
| 247 |
# =================================================================================================
|
| 248 |
|
| 249 |
#
|
| 250 |
+
# import os
|
| 251 |
+
# import io
|
| 252 |
+
# import json
|
| 253 |
+
# import requests
|
| 254 |
+
# import pandas as pd
|
| 255 |
+
# import gradio as gr
|
| 256 |
+
# from contextlib import redirect_stdout
|
| 257 |
+
# from typing import TypedDict, Annotated, List
|
| 258 |
+
# import operator
|
| 259 |
+
|
| 260 |
+
# # --- LangChain & LangGraph Imports ---
|
| 261 |
+
# from langchain_core.messages import BaseMessage, HumanMessage, ToolMessage, AIMessage, SystemMessage
|
| 262 |
+
# from langchain_core.tools import tool
|
| 263 |
+
# from langchain_huggingface import HuggingFaceEndpoint
|
| 264 |
+
# from langgraph.graph import StateGraph, END
|
| 265 |
+
# from tavily import TavilyClient
|
| 266 |
+
# import pypdf
|
| 267 |
+
|
| 268 |
+
# # --- Constants ---
|
| 269 |
+
# DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 270 |
+
# FILES_DIR = "./files"
|
| 271 |
+
# os.makedirs(FILES_DIR, exist_ok=True)
|
| 272 |
+
|
| 273 |
+
# # --- System Prompt (Updated for Manual JSON Tool Calling) ---
|
| 274 |
+
# # This prompt instructs the model to generate JSON, a robust method for tool calls.
|
| 275 |
+
# AGENT_SYSTEM_PROMPT = """You are a world-class AI agent, specialized in solving complex problems from the GAIA benchmark.
|
| 276 |
+
# Your task is to analyze the user's question, think step-by-step, and use the provided tools to find the correct answer.
|
| 277 |
+
|
| 278 |
+
# **TOOL USAGE INSTRUCTIONS:**
|
| 279 |
+
# When you need to use a tool, you MUST respond with a JSON object containing the tool name and its arguments. The JSON object should have two keys: "tool_name" and "parameters".
|
| 280 |
+
|
| 281 |
+
# Here is an example of how to call the `tavily_search` tool:
|
| 282 |
+
# ```json
|
| 283 |
+
# {
|
| 284 |
+
# "tool_name": "tavily_search",
|
| 285 |
+
# "parameters": {
|
| 286 |
+
# "query": "Who won the last FIFA World Cup?"
|
| 287 |
+
# }
|
| 288 |
+
# }
|
| 289 |
+
# Use code with caution.
|
| 290 |
+
# Python
|
| 291 |
+
# CRITICAL FINAL ANSWER INSTRUCTIONS:
|
| 292 |
+
# Once you have gathered all the necessary information and are absolutely certain of the answer, you MUST provide it directly and concisely.
|
| 293 |
+
# Your final response must ONLY be the answer itself.
|
| 294 |
+
# DO NOT wrap the final answer in a JSON object or include any conversational text.
|
| 295 |
+
# Think, use your tools, and then provide ONLY the final, precise answer.
|
| 296 |
+
# """
|
| 297 |
+
# ###===============================================================================================
|
| 298 |
+
# tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
|
| 299 |
+
# @tool
|
| 300 |
+
# def tavily_search(query: str) -> str:
|
| 301 |
+
# """Uses the Tavily Search API to find information on the web."""
|
| 302 |
+
# print(f"--- Calling Tavily Search Tool with query: {query} ---")
|
| 303 |
+
# try:
|
| 304 |
+
# result = tavily.search(query=query, search_depth="advanced")
|
| 305 |
+
# return f"Search results for '{query}':\n" + "\n".join([f"- {r['content']}" for r in result['results']])
|
| 306 |
+
# except Exception as e: return f"Error during Tavily search: {e}"
|
| 307 |
+
# @tool
|
| 308 |
+
# def read_file(url: str) -> str:
|
| 309 |
+
# """Downloads and reads the content of a file (text or PDF) from a URL."""
|
| 310 |
+
# print(f"--- Calling Read File Tool with URL: {url} ---")
|
| 311 |
+
# try:
|
| 312 |
+
# filename = os.path.join(FILES_DIR, os.path.basename(url))
|
| 313 |
+
# response = requests.get(url)
|
| 314 |
+
# response.raise_for_status()
|
| 315 |
+
# with open(filename, 'wb') as f: f.write(response.content)
|
| 316 |
+
# if url.lower().endswith('.pdf'):
|
| 317 |
+
# try:
|
| 318 |
+
# pdf_reader = pypdf.PdfReader(filename)
|
| 319 |
+
# return f"Successfully read PDF file '{filename}'. Content:\n\n{''.join(p.extract_text() for p in pdf_reader.pages)}"
|
| 320 |
+
# except Exception as e: return f"Error reading PDF file: {e}"
|
| 321 |
+
# else:
|
| 322 |
+
# try:
|
| 323 |
+
# with open(filename, 'r', encoding='utf-8') as f: return f"Successfully read text file '{filename}'. Content:\n\n{f.read()}"
|
| 324 |
+
# except UnicodeDecodeError: return f"Successfully downloaded binary file '{filename}'. Cannot display content as text."
|
| 325 |
+
# except requests.exceptions.RequestException as e: return f"Error downloading or reading file: {e}"
|
| 326 |
+
# @tool
|
| 327 |
+
# def python_interpreter(code: str) -> str:
|
| 328 |
+
# """Executes Python code and returns its stdout."""
|
| 329 |
+
# print(f"--- Calling Python Interpreter Tool with code:\n{code} ---")
|
| 330 |
+
# output_buffer = io.StringIO()
|
| 331 |
+
# try:
|
| 332 |
+
# with redirect_stdout(output_buffer): exec(code, globals())
|
| 333 |
+
# return f"Code executed successfully. Output:\n{output_buffer.getvalue()}"
|
| 334 |
+
# except Exception as e: return f"Error executing Python code: {e}"
|
| 335 |
+
# ##================================================================================================
|
| 336 |
+
# #✅ 2. CONFIGURE AND BUILD THE AGENT (with Qwen2 and Manual Tool Calling)
|
| 337 |
+
# #================================================================================================
|
| 338 |
+
# class AgentState(TypedDict):
|
| 339 |
+
# messages: Annotated[List[BaseMessage], operator.add]
|
| 340 |
+
# def build_agent_graph():
|
| 341 |
+
# """Builds the agent using a manual LangGraph loop with the HuggingFaceEndpoint."""
|
| 342 |
+
# tools = [tavily_search, read_file, python_interpreter]
|
| 343 |
+
# tool_map = {tool.name: tool for tool in tools}
|
| 344 |
+
# Generated code
|
| 345 |
+
# # Using Qwen2-72B-Instruct model via HuggingFaceEndpoint
|
| 346 |
+
# repo_id = "Qwen/Qwen2-72B-Instruct"
|
| 347 |
+
# llm = HuggingFaceEndpoint(
|
| 348 |
+
# repo_id=repo_id,
|
| 349 |
+
# max_new_tokens=1024,
|
| 350 |
+
# temperature=0.1,
|
| 351 |
+
# huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
| 352 |
+
# )
|
| 353 |
+
|
| 354 |
+
# def call_model(state: AgentState):
|
| 355 |
+
# """Invokes the LLM and wraps the response in an AIMessage."""
|
| 356 |
+
# # Qwen2 Instruct uses a specific chat template. We build it manually.
|
| 357 |
+
# prompt_str = ""
|
| 358 |
+
# for msg in state['messages']:
|
| 359 |
+
# role = ""
|
| 360 |
+
# if isinstance(msg, SystemMessage): role = "system"
|
| 361 |
+
# elif isinstance(msg, HumanMessage): role = "user"
|
| 362 |
+
# elif isinstance(msg, AIMessage): role = "assistant"
|
| 363 |
+
# elif isinstance(msg, ToolMessage): continue # We'll handle tool results differently
|
| 364 |
+
|
| 365 |
+
# if role: prompt_str += f"<|im_start|>{role}\n{msg.content}<|im_end|>\n"
|
| 366 |
+
|
| 367 |
+
# # Add results from the last tool call, if any
|
| 368 |
+
# if isinstance(state['messages'][-1], ToolMessage):
|
| 369 |
+
# prompt_str += f"<|im_start|>user\nTool output:\n{state['messages'][-1].content}<|im_end|>\n"
|
| 370 |
+
|
| 371 |
+
# prompt_str += "<|im_start|>assistant\n"
|
| 372 |
+
|
| 373 |
+
# response_text = llm.invoke(prompt_str)
|
| 374 |
+
# return {"messages": [AIMessage(content=response_text)]}
|
| 375 |
+
|
| 376 |
+
# def should_continue(state: AgentState) -> str:
|
| 377 |
+
# """Determines whether to call a tool or end the loop."""
|
| 378 |
+
# last_message_content = state['messages'][-1].content.strip()
|
| 379 |
+
# # A simple check for JSON is a reliable way to detect tool calls.
|
| 380 |
+
# if "```json" in last_message_content:
|
| 381 |
+
# return "action"
|
| 382 |
+
# if last_message_content.startswith('{') and last_message_content.endswith('}'):
|
| 383 |
+
# try:
|
| 384 |
+
# json.loads(last_message_content)
|
| 385 |
+
# return "action"
|
| 386 |
+
# except json.JSONDecodeError:
|
| 387 |
+
# return "end" # Not valid JSON, must be the final answer
|
| 388 |
+
# else:
|
| 389 |
+
# return "end"
|
| 390 |
+
|
| 391 |
+
# def call_tool_node(state: AgentState):
|
| 392 |
+
# """Parses the JSON tool call from the LLM and executes it."""
|
| 393 |
+
# last_message_content = state['messages'][-1].content.strip()
|
| 394 |
+
|
| 395 |
+
# # Extract JSON from markdown code block if present
|
| 396 |
+
# if "```json" in last_message_content:
|
| 397 |
+
# json_str = last_message_content.split("```json").split("```")[0].strip()
|
| 398 |
+
# else:
|
| 399 |
+
# json_str = last_message_content
|
| 400 |
+
|
| 401 |
+
# try:
|
| 402 |
+
# tool_call_data = json.loads(json_str)
|
| 403 |
+
# tool_name = tool_call_data.get("tool_name")
|
| 404 |
+
# parameters = tool_call_data.get("parameters", {})
|
| 405 |
+
# if tool_name not in tool_map:
|
| 406 |
+
# return {"messages": [ToolMessage(content=f"Error: Tool '{tool_name}' not found.", tool_call_id="error")]}
|
| 407 |
+
|
| 408 |
+
# selected_tool = tool_map[tool_name]
|
| 409 |
+
# tool_output = selected_tool.invoke(parameters)
|
| 410 |
+
# return {"messages": [ToolMessage(content=str(tool_output), tool_call_id=tool_name)]}
|
| 411 |
+
# except Exception as e:
|
| 412 |
+
# return {"messages": [ToolMessage(content=f"Error parsing tool call: {e}. Content: '{last_message_content}'", tool_call_id="error")]}
|
| 413 |
+
|
| 414 |
+
# workflow = StateGraph(AgentState)
|
| 415 |
+
# workflow.add_node("agent", call_model)
|
| 416 |
+
# workflow.add_node("action", call_tool_node)
|
| 417 |
+
# workflow.set_entry_point("agent")
|
| 418 |
+
# workflow.add_conditional_edges("agent", should_continue, {"action": "action", "end": END})
|
| 419 |
+
# workflow.add_edge('action', 'agent')
|
| 420 |
+
# return workflow.compile()
|
| 421 |
+
# Use code with caution.
|
| 422 |
+
# #================================================================================================
|
| 423 |
+
# #✅ 3. AGENT CLASS AND EVALUATION LOGIC
|
| 424 |
+
# #================================================================================================
|
| 425 |
+
# class GaiaAgent:
|
| 426 |
+
# def init(self):
|
| 427 |
+
# print("GaiaAgent initialized. Building agent with Qwen/Qwen2-72B-Instruct...")
|
| 428 |
+
# self.agent_app = build_agent_graph()
|
| 429 |
+
# Generated code
|
| 430 |
+
# def __call__(self, question: str) -> str:
|
| 431 |
+
# print(f"\n{'='*60}\nAgent received question: {question[:100]}...\n{'='*60}")
|
| 432 |
+
# try:
|
| 433 |
+
# initial_input = {"messages": [SystemMessage(content=AGENT_SYSTEM_PROMPT), HumanMessage(content=question)]}
|
| 434 |
+
# final_state = None
|
| 435 |
+
# for step in self.agent_app.stream(initial_input, {"recursion_limit": 15}):
|
| 436 |
+
# final_state = list(step.values())[0]
|
| 437 |
+
|
| 438 |
+
# final_answer = final_state['messages'][-1].content
|
| 439 |
+
# return str(final_answer).strip()
|
| 440 |
+
# except Exception as e:
|
| 441 |
+
# print(f"An error occurred during agent execution: {e}")
|
| 442 |
+
# return f"AGENT_EXECUTION_ERROR: {e}"
|
| 443 |
+
# Use code with caution.
|
| 444 |
+
# --- The rest of the file is unchanged ---
|
| 445 |
+
# def run_and_submit_all( profile: gr.OAuthProfile | None):
|
| 446 |
+
# space_id = os.getenv("SPACE_ID")
|
| 447 |
+
# if not profile: return "Please Login to Hugging Face with the button.", None
|
| 448 |
+
# username = f"{profile.username}"
|
| 449 |
+
# print(f"User logged in: {username}")
|
| 450 |
+
# api_url = DEFAULT_API_URL
|
| 451 |
+
# questions_url = f"{api_url}/questions"
|
| 452 |
+
# submit_url = f"{api_url}/submit"
|
| 453 |
+
# agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
|
| 454 |
+
# Generated code
|
| 455 |
+
# try:
|
| 456 |
+
# response = requests.get(questions_url, timeout=15)
|
| 457 |
+
# response.raise_for_status()
|
| 458 |
+
# questions_data = response.json()
|
| 459 |
+
# except Exception as e: return f"An unexpected error occurred fetching questions: {e}", None
|
| 460 |
+
|
| 461 |
+
# results_log, answers_payload = [], []
|
| 462 |
+
# agent_instance = GaiaAgent()
|
| 463 |
+
|
| 464 |
+
# for item in questions_data:
|
| 465 |
+
# task_id, question_text = item.get("task_id"), item.get("question")
|
| 466 |
+
# if not task_id or question_text is None: continue
|
| 467 |
+
# try:
|
| 468 |
+
# submitted_answer = agent_instance(question_text)
|
| 469 |
+
# answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 470 |
+
# results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
| 471 |
+
# except Exception as e:
|
| 472 |
+
# print(f"Error running agent on task {task_id}: {e}")
|
| 473 |
+
# results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
|
| 474 |
+
|
| 475 |
+
# if not answers_payload: return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 476 |
+
# submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
|
| 477 |
+
|
| 478 |
+
# try:
|
| 479 |
+
# response = requests.post(submit_url, json=submission_data, timeout=90)
|
| 480 |
+
# response.raise_for_status()
|
| 481 |
+
# result_data = response.json()
|
| 482 |
+
# final_status = (
|
| 483 |
+
# f"Submission Successful!\n"
|
| 484 |
+
# f"User: {result_data.get('username')}\n"
|
| 485 |
+
# f"Overall Score: {result_data.get('score', 'N/A')}% "
|
| 486 |
+
# f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
|
| 487 |
+
# f"Message: {result_data.get('message', 'No message received.')}"
|
| 488 |
+
# )
|
| 489 |
+
# return final_status, pd.DataFrame(results_log)
|
| 490 |
+
# except Exception as e: return f"An unexpected error in submission: {e}", pd.DataFrame(results_log)
|
| 491 |
+
# Use code with caution.
|
| 492 |
+
# with gr.Blocks() as demo:
|
| 493 |
+
# gr.Markdown("# GAIA Agent Final Assessment (Qwen2-72B-Instruct)")
|
| 494 |
+
# gr.Markdown(
|
| 495 |
+
# """
|
| 496 |
+
# Instructor's Note: This version uses the powerful Qwen/Qwen2-72B-Instruct model from the Hugging Face Hub.
|
| 497 |
+
# It relies on a robust manual LangGraph loop to handle tool calls by instructing the model to generate JSON.
|
| 498 |
+
# 1. Ensure you have a HUGGINGFACEHUB_API_TOKEN and TAVILY_API_KEY set in your secrets.
|
| 499 |
+
# 2. Ensure your requirements.txt is updated. Good luck!
|
| 500 |
+
# """
|
| 501 |
+
# )
|
| 502 |
+
# gr.LoginButton()
|
| 503 |
+
# run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 504 |
+
# status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
| 505 |
+
# results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
| 506 |
+
# run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
|
| 507 |
+
# if name == "main":
|
| 508 |
+
# print("\n" + "-"*30 + " App Starting " + "-"*30)
|
| 509 |
+
# demo.launch(debug=True, share=False, ssr_mode=False)
|
| 510 |
+
|
| 511 |
+
|
| 512 |
+
#########################
|
| 513 |
import os
|
| 514 |
+
import gradio as gr
|
| 515 |
import requests
|
| 516 |
+
import inspect
|
| 517 |
import pandas as pd
|
| 518 |
+
import json
|
| 519 |
+
import re
|
| 520 |
+
from typing import Dict, Any, List, Optional
|
| 521 |
+
from dataclasses import dataclass
|
| 522 |
+
import logging
|
| 523 |
+
from datetime import datetime
|
| 524 |
+
import traceback
|
| 525 |
+
|
| 526 |
+
# Third-party imports for the agent
|
| 527 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
|
| 528 |
+
import torch
|
| 529 |
from tavily import TavilyClient
|
| 530 |
+
import tempfile
|
| 531 |
+
import subprocess
|
| 532 |
+
import sys
|
| 533 |
+
|
| 534 |
+
# Configure logging
|
| 535 |
+
logging.basicConfig(level=logging.INFO)
|
| 536 |
+
logger = logging.getLogger(__name__)
|
| 537 |
|
| 538 |
# --- Constants ---
|
| 539 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 540 |
+
|
| 541 |
+
# Agent System Prompt
|
| 542 |
+
AGENT_SYSTEM_PROMPT = """You are a world-class AI agent, specialized in solving complex problems from the GAIA benchmark. Your task is to analyze the user's question, think step-by-step, and use the provided tools to find the correct answer.
|
| 543 |
+
|
| 544 |
+
CRITICAL INSTRUCTIONS:
|
| 545 |
+
1. **Analyze the Goal:** First, understand what the user is asking for.
|
| 546 |
+
2. **Plan & Execute:** Formulate a plan and use the available tools (`tavily_search`, `read_file`, `python_interpreter`) to gather information.
|
| 547 |
+
3. **Final Answer Format:** Once you are absolutely certain of the answer, you MUST provide it directly and concisely.
|
| 548 |
+
- DO NOT include your reasoning, thoughts, or any conversational text like 'The answer is...', 'Here is the result:', or 'Based on my search...'.
|
| 549 |
+
- Your final response must ONLY be the answer itself.
|
| 550 |
+
|
| 551 |
+
EXAMPLES OF CORRECT FINAL ANSWERS:
|
| 552 |
+
- If the question asks for a year: `2023`
|
| 553 |
+
- If it asks for a name: `John Doe`
|
| 554 |
+
- If it asks for a number: `42`
|
| 555 |
+
- If it asks for a comma-separated list: `item1, item2, item3`
|
| 556 |
+
|
| 557 |
+
Think, use your tools, and then provide ONLY the final, precise answer."""
|
| 558 |
+
|
| 559 |
+
@dataclass
|
| 560 |
+
class ToolResult:
|
| 561 |
+
"""Result from a tool execution"""
|
| 562 |
+
success: bool
|
| 563 |
+
result: Any
|
| 564 |
+
error: Optional[str] = None
|
| 565 |
+
|
| 566 |
+
class ToolExecutor:
|
| 567 |
+
"""Handles tool execution for the agent"""
|
| 568 |
+
|
| 569 |
+
def __init__(self):
|
| 570 |
+
self.tavily_client = None
|
| 571 |
+
self.setup_tavily()
|
| 572 |
+
|
| 573 |
+
def setup_tavily(self):
|
| 574 |
+
"""Initialize Tavily search client"""
|
| 575 |
try:
|
| 576 |
+
tavily_api_key = os.getenv("TAVILY_API_KEY")
|
| 577 |
+
if tavily_api_key:
|
| 578 |
+
self.tavily_client = TavilyClient(api_key=tavily_api_key)
|
| 579 |
+
logger.info("Tavily client initialized successfully")
|
| 580 |
+
else:
|
| 581 |
+
logger.warning("TAVILY_API_KEY not found in environment variables")
|
| 582 |
+
except Exception as e:
|
| 583 |
+
logger.error(f"Failed to initialize Tavily client: {e}")
|
| 584 |
+
|
| 585 |
+
def tavily_search(self, query: str, max_results: int = 5) -> ToolResult:
|
| 586 |
+
"""Search the web using Tavily"""
|
| 587 |
+
try:
|
| 588 |
+
if not self.tavily_client:
|
| 589 |
+
return ToolResult(success=False, error="Tavily client not initialized")
|
| 590 |
+
|
| 591 |
+
response = self.tavily_client.search(
|
| 592 |
+
query=query,
|
| 593 |
+
search_depth="advanced",
|
| 594 |
+
max_results=max_results,
|
| 595 |
+
include_answer=True,
|
| 596 |
+
include_raw_content=True
|
| 597 |
+
)
|
| 598 |
+
|
| 599 |
+
# Extract relevant information
|
| 600 |
+
results = []
|
| 601 |
+
if response.get('results'):
|
| 602 |
+
for result in response['results']:
|
| 603 |
+
results.append({
|
| 604 |
+
'title': result.get('title', ''),
|
| 605 |
+
'content': result.get('content', ''),
|
| 606 |
+
'url': result.get('url', ''),
|
| 607 |
+
'score': result.get('score', 0)
|
| 608 |
+
})
|
| 609 |
+
|
| 610 |
+
search_result = {
|
| 611 |
+
'answer': response.get('answer', ''),
|
| 612 |
+
'results': results,
|
| 613 |
+
'query': query
|
| 614 |
+
}
|
| 615 |
+
|
| 616 |
+
return ToolResult(success=True, result=search_result)
|
| 617 |
+
|
| 618 |
+
except Exception as e:
|
| 619 |
+
logger.error(f"Tavily search error: {e}")
|
| 620 |
+
return ToolResult(success=False, error=str(e))
|
| 621 |
+
|
| 622 |
+
def python_interpreter(self, code: str) -> ToolResult:
|
| 623 |
+
"""Execute Python code safely"""
|
| 624 |
+
try:
|
| 625 |
+
# Create a temporary file for the code
|
| 626 |
+
with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
|
| 627 |
+
f.write(code)
|
| 628 |
+
temp_file = f.name
|
| 629 |
+
|
| 630 |
+
# Execute the code and capture output
|
| 631 |
+
result = subprocess.run(
|
| 632 |
+
[sys.executable, temp_file],
|
| 633 |
+
capture_output=True,
|
| 634 |
+
text=True,
|
| 635 |
+
timeout=30 # 30 seconds timeout
|
| 636 |
+
)
|
| 637 |
+
|
| 638 |
+
# Clean up
|
| 639 |
+
os.unlink(temp_file)
|
| 640 |
+
|
| 641 |
+
if result.returncode == 0:
|
| 642 |
+
return ToolResult(success=True, result=result.stdout.strip())
|
| 643 |
+
else:
|
| 644 |
+
return ToolResult(success=False, error=result.stderr.strip())
|
| 645 |
+
|
| 646 |
+
except subprocess.TimeoutExpired:
|
| 647 |
+
return ToolResult(success=False, error="Code execution timed out")
|
| 648 |
+
except Exception as e:
|
| 649 |
+
logger.error(f"Python interpreter error: {e}")
|
| 650 |
+
return ToolResult(success=False, error=str(e))
|
| 651 |
+
|
| 652 |
+
def read_file(self, file_path: str) -> ToolResult:
|
| 653 |
+
"""Read a file and return its contents"""
|
| 654 |
+
try:
|
| 655 |
+
if not os.path.exists(file_path):
|
| 656 |
+
return ToolResult(success=False, error=f"File not found: {file_path}")
|
| 657 |
+
|
| 658 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
| 659 |
+
content = f.read()
|
| 660 |
+
|
| 661 |
+
return ToolResult(success=True, result=content)
|
| 662 |
+
|
| 663 |
+
except Exception as e:
|
| 664 |
+
logger.error(f"File reading error: {e}")
|
| 665 |
+
return ToolResult(success=False, error=str(e))
|
| 666 |
|
| 667 |
+
class GAIAAgent:
|
| 668 |
+
"""Advanced GAIA benchmark agent using Qwen model with tool integration"""
|
| 669 |
+
|
| 670 |
+
def __init__(self, model_name: str = "Qwen/Qwen2.5-7B-Instruct"):
|
| 671 |
+
self.model_name = model_name
|
| 672 |
+
self.tool_executor = ToolExecutor()
|
| 673 |
+
self.tokenizer = None
|
| 674 |
+
self.model = None
|
| 675 |
+
self.pipeline = None
|
| 676 |
+
self.setup_model()
|
| 677 |
+
logger.info(f"GAIAAgent initialized with model: {model_name}")
|
| 678 |
+
|
| 679 |
+
def setup_model(self):
|
| 680 |
+
"""Initialize the Qwen model and tokenizer"""
|
| 681 |
+
try:
|
| 682 |
+
# Check if CUDA is available
|
| 683 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 684 |
+
logger.info(f"Using device: {device}")
|
| 685 |
+
|
| 686 |
+
# Load tokenizer and model
|
| 687 |
+
self.tokenizer = AutoTokenizer.from_pretrained(
|
| 688 |
+
self.model_name,
|
| 689 |
+
trust_remote_code=True
|
| 690 |
+
)
|
| 691 |
+
|
| 692 |
+
# Use pipeline for easier inference
|
| 693 |
+
self.pipeline = pipeline(
|
| 694 |
+
"text-generation",
|
| 695 |
+
model=self.model_name,
|
| 696 |
+
tokenizer=self.tokenizer,
|
| 697 |
+
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
|
| 698 |
+
device_map="auto" if device == "cuda" else None,
|
| 699 |
+
trust_remote_code=True
|
| 700 |
+
)
|
| 701 |
+
|
| 702 |
+
logger.info("Model loaded successfully")
|
| 703 |
+
|
| 704 |
+
except Exception as e:
|
| 705 |
+
logger.error(f"Failed to load model: {e}")
|
| 706 |
+
# Fallback to a simpler approach
|
| 707 |
+
self.setup_fallback_model()
|
| 708 |
|
| 709 |
+
def setup_fallback_model(self):
|
| 710 |
+
"""Setup a fallback model if main model fails"""
|
| 711 |
+
try:
|
| 712 |
+
# Try a smaller model
|
| 713 |
+
fallback_model = "microsoft/DialoGPT-medium"
|
| 714 |
+
self.pipeline = pipeline(
|
| 715 |
+
"text-generation",
|
| 716 |
+
model=fallback_model,
|
| 717 |
+
tokenizer=fallback_model
|
| 718 |
+
)
|
| 719 |
+
logger.info(f"Fallback model loaded: {fallback_model}")
|
| 720 |
+
except Exception as e:
|
| 721 |
+
logger.error(f"Fallback model also failed: {e}")
|
| 722 |
+
self.pipeline = None
|
| 723 |
+
|
| 724 |
+
def extract_tool_calls(self, text: str) -> List[Dict[str, Any]]:
|
| 725 |
+
"""Extract tool calls from the model's response"""
|
| 726 |
+
tool_calls = []
|
| 727 |
+
|
| 728 |
+
# Pattern to match tool calls like: <tool_call>tavily_search("query")</tool_call>
|
| 729 |
+
pattern = r'<tool_call>(\w+)\(([^)]+)\)</tool_call>'
|
| 730 |
+
matches = re.findall(pattern, text)
|
| 731 |
+
|
| 732 |
+
for tool_name, args_str in matches:
|
| 733 |
+
try:
|
| 734 |
+
# Simple argument parsing (assumes string arguments)
|
| 735 |
+
args = args_str.strip().strip('"\'')
|
| 736 |
+
tool_calls.append({
|
| 737 |
+
'tool': tool_name,
|
| 738 |
+
'args': args
|
| 739 |
+
})
|
| 740 |
+
except Exception as e:
|
| 741 |
+
logger.error(f"Failed to parse tool call: {e}")
|
| 742 |
+
|
| 743 |
+
return tool_calls
|
| 744 |
+
|
| 745 |
+
def execute_tools(self, tool_calls: List[Dict[str, Any]]) -> str:
|
| 746 |
+
"""Execute tool calls and return results"""
|
| 747 |
+
results = []
|
| 748 |
+
|
| 749 |
+
for call in tool_calls:
|
| 750 |
+
tool_name = call['tool']
|
| 751 |
+
args = call['args']
|
| 752 |
+
|
| 753 |
+
if tool_name == 'tavily_search':
|
| 754 |
+
result = self.tool_executor.tavily_search(args)
|
| 755 |
+
elif tool_name == 'python_interpreter':
|
| 756 |
+
result = self.tool_executor.python_interpreter(args)
|
| 757 |
+
elif tool_name == 'read_file':
|
| 758 |
+
result = self.tool_executor.read_file(args)
|
| 759 |
+
else:
|
| 760 |
+
result = ToolResult(success=False, error=f"Unknown tool: {tool_name}")
|
| 761 |
+
|
| 762 |
+
if result.success:
|
| 763 |
+
results.append(f"Tool {tool_name} result: {result.result}")
|
| 764 |
+
else:
|
| 765 |
+
results.append(f"Tool {tool_name} error: {result.error}")
|
| 766 |
+
|
| 767 |
+
return "\n".join(results)
|
| 768 |
+
|
| 769 |
+
def generate_response(self, prompt: str, max_length: int = 1000) -> str:
|
| 770 |
+
"""Generate response using the model"""
|
| 771 |
+
try:
|
| 772 |
+
if not self.pipeline:
|
| 773 |
+
return "Model not available"
|
| 774 |
+
|
| 775 |
+
# Generate response
|
| 776 |
+
outputs = self.pipeline(
|
| 777 |
+
prompt,
|
| 778 |
+
max_length=max_length,
|
| 779 |
+
do_sample=True,
|
| 780 |
+
temperature=0.7,
|
| 781 |
+
top_p=0.9,
|
| 782 |
+
pad_token_id=self.tokenizer.eos_token_id if self.tokenizer else None
|
| 783 |
+
)
|
| 784 |
+
|
| 785 |
+
# Extract the generated text
|
| 786 |
+
generated_text = outputs[0]['generated_text']
|
| 787 |
+
|
| 788 |
+
# Remove the input prompt from the output
|
| 789 |
+
if generated_text.startswith(prompt):
|
| 790 |
+
generated_text = generated_text[len(prompt):].strip()
|
| 791 |
+
|
| 792 |
+
return generated_text
|
| 793 |
+
|
| 794 |
+
except Exception as e:
|
| 795 |
+
logger.error(f"Generation error: {e}")
|
| 796 |
+
return f"Generation failed: {str(e)}"
|
| 797 |
+
|
| 798 |
+
def solve_with_reasoning(self, question: str) -> str:
|
| 799 |
+
"""Solve question with step-by-step reasoning and tool usage"""
|
| 800 |
+
try:
|
| 801 |
+
# Create initial prompt
|
| 802 |
+
reasoning_prompt = f"""
|
| 803 |
+
{AGENT_SYSTEM_PROMPT}
|
| 804 |
+
|
| 805 |
+
Question: {question}
|
| 806 |
+
|
| 807 |
+
Let me think through this step by step:
|
| 808 |
+
|
| 809 |
+
1. First, I need to understand what this question is asking for.
|
| 810 |
+
2. Then I'll determine what tools I need to use.
|
| 811 |
+
3. I'll gather information using the appropriate tools.
|
| 812 |
+
4. Finally, I'll provide the precise answer.
|
| 813 |
+
|
| 814 |
+
Let me start by analyzing the question:
|
| 815 |
+
"""
|
| 816 |
+
|
| 817 |
+
# Generate initial reasoning
|
| 818 |
+
response = self.generate_response(reasoning_prompt)
|
| 819 |
+
|
| 820 |
+
# Check if we need to use tools
|
| 821 |
+
if self.should_use_search(question, response):
|
| 822 |
+
search_result = self.tool_executor.tavily_search(question)
|
| 823 |
+
if search_result.success:
|
| 824 |
+
# Incorporate search results
|
| 825 |
+
search_info = search_result.result
|
| 826 |
+
enhanced_prompt = f"""
|
| 827 |
+
{reasoning_prompt}
|
| 828 |
+
|
| 829 |
+
Based on my analysis, I need to search for information. Here are the search results:
|
| 830 |
+
|
| 831 |
+
Search Query: {question}
|
| 832 |
+
Answer: {search_info.get('answer', 'No direct answer found')}
|
| 833 |
+
|
| 834 |
+
Top Results:
|
| 835 |
+
"""
|
| 836 |
+
for i, result in enumerate(search_info.get('results', [])[:3]):
|
| 837 |
+
enhanced_prompt += f"Result {i+1}: {result.get('title', '')}\n{result.get('content', '')[:200]}...\n\n"
|
| 838 |
+
|
| 839 |
+
enhanced_prompt += "\nBased on this information, the answer is:"
|
| 840 |
+
|
| 841 |
+
final_response = self.generate_response(enhanced_prompt, max_length=500)
|
| 842 |
+
return self.extract_final_answer(final_response)
|
| 843 |
+
|
| 844 |
+
# Check if we need Python computation
|
| 845 |
+
if self.should_use_python(question, response):
|
| 846 |
+
# Generate Python code
|
| 847 |
+
code_prompt = f"""
|
| 848 |
+
Question: {question}
|
| 849 |
+
|
| 850 |
+
I need to solve this using Python. Let me write the code:
|
| 851 |
+
|
| 852 |
+
```python
|
| 853 |
+
"""
|
| 854 |
+
code_response = self.generate_response(code_prompt, max_length=300)
|
| 855 |
+
|
| 856 |
+
# Extract Python code
|
| 857 |
+
python_code = self.extract_python_code(code_response)
|
| 858 |
+
if python_code:
|
| 859 |
+
exec_result = self.tool_executor.python_interpreter(python_code)
|
| 860 |
+
if exec_result.success:
|
| 861 |
+
return str(exec_result.result).strip()
|
| 862 |
+
|
| 863 |
+
# If no tools needed, extract answer from reasoning
|
| 864 |
+
return self.extract_final_answer(response)
|
| 865 |
+
|
| 866 |
+
except Exception as e:
|
| 867 |
+
logger.error(f"Error in solve_with_reasoning: {e}")
|
| 868 |
+
return self.fallback_solve(question)
|
| 869 |
+
|
| 870 |
+
def should_use_search(self, question: str, response: str) -> bool:
|
| 871 |
+
"""Determine if we should use web search"""
|
| 872 |
+
search_indicators = [
|
| 873 |
+
"current", "recent", "latest", "news", "today", "now",
|
| 874 |
+
"who is", "what is", "when did", "where is",
|
| 875 |
+
"population", "capital", "president", "CEO",
|
| 876 |
+
"founded", "established", "released", "launched"
|
| 877 |
+
]
|
| 878 |
+
|
| 879 |
+
question_lower = question.lower()
|
| 880 |
+
return any(indicator in question_lower for indicator in search_indicators)
|
| 881 |
+
|
| 882 |
+
def should_use_python(self, question: str, response: str) -> bool:
|
| 883 |
+
"""Determine if we should use Python computation"""
|
| 884 |
+
python_indicators = [
|
| 885 |
+
"calculate", "compute", "solve", "equation", "formula",
|
| 886 |
+
"sum", "average", "total", "percentage", "rate",
|
| 887 |
+
"graph", "plot", "data", "analysis", "statistics"
|
| 888 |
+
]
|
| 889 |
+
|
| 890 |
+
question_lower = question.lower()
|
| 891 |
+
return any(indicator in question_lower for indicator in python_indicators)
|
| 892 |
+
|
| 893 |
+
def extract_python_code(self, text: str) -> str:
|
| 894 |
+
"""Extract Python code from generated text"""
|
| 895 |
+
# Look for code blocks
|
| 896 |
+
code_pattern = r'```python\n(.*?)\n```'
|
| 897 |
+
matches = re.findall(code_pattern, text, re.DOTALL)
|
| 898 |
+
|
| 899 |
+
if matches:
|
| 900 |
+
return matches[0].strip()
|
| 901 |
+
|
| 902 |
+
# Look for simple code after "python" keyword
|
| 903 |
+
lines = text.split('\n')
|
| 904 |
+
code_lines = []
|
| 905 |
+
in_code = False
|
| 906 |
+
|
| 907 |
+
for line in lines:
|
| 908 |
+
if 'python' in line.lower() or in_code:
|
| 909 |
+
in_code = True
|
| 910 |
+
if line.strip() and not line.strip().startswith('#'):
|
| 911 |
+
code_lines.append(line)
|
| 912 |
+
|
| 913 |
+
return '\n'.join(code_lines) if code_lines else ""
|
| 914 |
+
|
| 915 |
+
def extract_final_answer(self, text: str) -> str:
|
| 916 |
+
"""Extract the final answer from generated text"""
|
| 917 |
+
# Look for common answer patterns
|
| 918 |
+
answer_patterns = [
|
| 919 |
+
r'(?:the answer is|answer:|final answer:)\s*(.+?)(?:\n|$)',
|
| 920 |
+
r'(?:therefore|thus|so|hence),?\s*(.+?)(?:\n|$)',
|
| 921 |
+
r'(?:result|conclusion):\s*(.+?)(?:\n|$)',
|
| 922 |
+
]
|
| 923 |
+
|
| 924 |
+
for pattern in answer_patterns:
|
| 925 |
+
matches = re.findall(pattern, text, re.IGNORECASE)
|
| 926 |
+
if matches:
|
| 927 |
+
answer = matches[-1].strip()
|
| 928 |
+
# Clean up the answer
|
| 929 |
+
answer = re.sub(r'^["\']|["\']$', '', answer) # Remove quotes
|
| 930 |
+
answer = answer.strip('.,!?') # Remove trailing punctuation
|
| 931 |
+
return answer
|
| 932 |
+
|
| 933 |
+
# If no pattern found, return the last meaningful line
|
| 934 |
+
lines = [line.strip() for line in text.split('\n') if line.strip()]
|
| 935 |
+
if lines:
|
| 936 |
+
return lines[-1]
|
| 937 |
+
|
| 938 |
+
return text.strip()
|
| 939 |
+
|
| 940 |
+
def fallback_solve(self, question: str) -> str:
|
| 941 |
+
"""Simple fallback solution method"""
|
| 942 |
+
try:
|
| 943 |
+
# Try direct search first
|
| 944 |
+
search_result = self.tool_executor.tavily_search(question)
|
| 945 |
+
if search_result.success and search_result.result.get('answer'):
|
| 946 |
+
return search_result.result['answer']
|
| 947 |
+
|
| 948 |
+
# If search fails, try basic pattern matching
|
| 949 |
+
question_lower = question.lower()
|
| 950 |
+
|
| 951 |
+
# Handle year questions
|
| 952 |
+
if 'year' in question_lower or 'when' in question_lower:
|
| 953 |
+
# Look for 4-digit years in search results
|
| 954 |
+
if search_result.success:
|
| 955 |
+
text = str(search_result.result)
|
| 956 |
+
years = re.findall(r'\b(?:19|20)\d{2}\b', text)  # non-capturing group so findall returns full years
|
| 957 |
+
if years:
|
| 958 |
+
return years[0]
|
| 959 |
+
|
| 960 |
+
# Handle number questions
|
| 961 |
+
if any(word in question_lower for word in ['how many', 'number', 'count']):
|
| 962 |
+
if search_result.success:
|
| 963 |
+
text = str(search_result.result)
|
| 964 |
+
numbers = re.findall(r'\b\d+\b', text)
|
| 965 |
+
if numbers:
|
| 966 |
+
return numbers[0]
|
| 967 |
+
|
| 968 |
+
# Default fallback
|
| 969 |
+
return "Unable to determine answer"
|
| 970 |
+
|
| 971 |
+
except Exception as e:
|
| 972 |
+
logger.error(f"Fallback solve error: {e}")
|
| 973 |
+
return "Error processing question"
|
| 974 |
+
|
| 975 |
+
def __call__(self, question: str) -> str:
|
| 976 |
+
"""Main entry point for the agent"""
|
| 977 |
+
logger.info(f"Processing question: {question[:100]}...")
|
| 978 |
+
|
| 979 |
+
try:
|
| 980 |
+
# Solve the question
|
| 981 |
+
answer = self.solve_with_reasoning(question)
|
| 982 |
+
|
| 983 |
+
# Clean and validate answer
|
| 984 |
+
final_answer = answer.strip()
|
| 985 |
+
if not final_answer:
|
| 986 |
+
final_answer = self.fallback_solve(question)
|
| 987 |
+
|
| 988 |
+
logger.info(f"Generated answer: {final_answer}")
|
| 989 |
+
return final_answer
|
| 990 |
+
|
| 991 |
+
except Exception as e:
|
| 992 |
+
logger.error(f"Error in agent call: {e}")
|
| 993 |
+
logger.error(traceback.format_exc())
|
| 994 |
+
return self.fallback_solve(question)
|
| 995 |
+
|
| 996 |
+
def run_and_submit_all(profile: gr.OAuthProfile | None):
|
| 997 |
+
"""
|
| 998 |
+
Fetches all questions, runs the GAIAAgent on them, submits all answers,
|
| 999 |
+
and displays the results.
|
| 1000 |
+
"""
|
| 1001 |
+
# --- Determine HF Space Runtime URL and Repo URL ---
|
| 1002 |
+
space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
|
| 1003 |
+
|
| 1004 |
+
if profile:
|
| 1005 |
+
username = f"{profile.username}"
|
| 1006 |
+
print(f"User logged in: {username}")
|
| 1007 |
else:
|
| 1008 |
+
print("User not logged in.")
|
| 1009 |
+
return "Please Login to Hugging Face with the button.", None
|
| 1010 |
+
|
| 1011 |
+
api_url = DEFAULT_API_URL
|
| 1012 |
+
questions_url = f"{api_url}/questions"
|
| 1013 |
+
submit_url = f"{api_url}/submit"
|
| 1014 |
|
| 1015 |
+
# 1. Instantiate Agent
|
| 1016 |
try:
|
| 1017 |
+
agent = GAIAAgent()
|
| 1018 |
except Exception as e:
|
| 1019 |
+
print(f"Error instantiating agent: {e}")
|
| 1020 |
+
return f"Error initializing agent: {e}", None
|
| 1021 |
+
|
| 1022 |
+
# In the case of an app running as a Hugging Face space, this link points toward your codebase
|
| 1023 |
+
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
|
| 1024 |
+
print(agent_code)
|
| 1025 |
|
| 1026 |
+
# 2. Fetch Questions
|
| 1027 |
+
print(f"Fetching questions from: {questions_url}")
|
| 1028 |
+
try:
|
| 1029 |
+
response = requests.get(questions_url, timeout=15)
|
| 1030 |
+
response.raise_for_status()
|
| 1031 |
+
questions_data = response.json()
|
| 1032 |
+
if not questions_data:
|
| 1033 |
+
print("Fetched questions list is empty.")
|
| 1034 |
+
return "Fetched questions list is empty or invalid format.", None
|
| 1035 |
+
print(f"Fetched {len(questions_data)} questions.")
|
| 1036 |
+
except requests.exceptions.RequestException as e:
|
| 1037 |
+
print(f"Error fetching questions: {e}")
|
| 1038 |
+
return f"Error fetching questions: {e}", None
|
| 1039 |
+
except requests.exceptions.JSONDecodeError as e:
|
| 1040 |
+
print(f"Error decoding JSON response from questions endpoint: {e}")
|
| 1041 |
+
print(f"Response text: {response.text[:500]}")
|
| 1042 |
+
return f"Error decoding server response for questions: {e}", None
|
| 1043 |
except Exception as e:
|
| 1044 |
+
print(f"An unexpected error occurred fetching questions: {e}")
|
| 1045 |
+
return f"An unexpected error occurred fetching questions: {e}", None
|
| 1046 |
+
|
| 1047 |
+
# 3. Run your Agent
|
| 1048 |
+
results_log = []
|
| 1049 |
+
answers_payload = []
|
| 1050 |
+
print(f"Running agent on {len(questions_data)} questions...")
|
| 1051 |
+
|
| 1052 |
+
for i, item in enumerate(questions_data):
|
| 1053 |
+
task_id = item.get("task_id")
|
| 1054 |
+
question_text = item.get("question")
|
| 1055 |
+
if not task_id or question_text is None:
|
| 1056 |
+
print(f"Skipping item with missing task_id or question: {item}")
|
| 1057 |
+
continue
|
| 1058 |
+
|
| 1059 |
+
print(f"Processing question {i+1}/{len(questions_data)}: {task_id}")
|
| 1060 |
+
try:
|
| 1061 |
+
submitted_answer = agent(question_text)
|
| 1062 |
+
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 1063 |
+
results_log.append({
|
| 1064 |
+
"Task ID": task_id,
|
| 1065 |
+
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
| 1066 |
+
"Submitted Answer": submitted_answer
|
| 1067 |
+
})
|
| 1068 |
+
print(f"Answer for {task_id}: {submitted_answer}")
|
| 1069 |
+
except Exception as e:
|
| 1070 |
+
print(f"Error running agent on task {task_id}: {e}")
|
| 1071 |
+
error_msg = f"AGENT ERROR: {e}"
|
| 1072 |
+
answers_payload.append({"task_id": task_id, "submitted_answer": error_msg})
|
| 1073 |
+
results_log.append({
|
| 1074 |
+
"Task ID": task_id,
|
| 1075 |
+
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
| 1076 |
+
"Submitted Answer": error_msg
|
| 1077 |
+
})
|
| 1078 |
+
|
| 1079 |
+
if not answers_payload:
|
| 1080 |
+
print("Agent did not produce any answers to submit.")
|
| 1081 |
+
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 1082 |
+
|
| 1083 |
+
# 4. Prepare Submission
|
| 1084 |
+
submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
|
| 1085 |
+
status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
|
| 1086 |
+
print(status_update)
|
| 1087 |
+
|
| 1088 |
+
# 5. Submit
|
| 1089 |
+
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
| 1090 |
try:
|
| 1091 |
+
response = requests.post(submit_url, json=submission_data, timeout=60)
|
| 1092 |
+
response.raise_for_status()
|
| 1093 |
+
result_data = response.json()
|
| 1094 |
+
final_status = (
|
| 1095 |
+
f"Submission Successful!\n"
|
| 1096 |
+
f"User: {result_data.get('username')}\n"
|
| 1097 |
+
f"Overall Score: {result_data.get('score', 'N/A')}% "
|
| 1098 |
+
f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
|
| 1099 |
+
f"Message: {result_data.get('message', 'No message received.')}"
|
| 1100 |
+
)
|
| 1101 |
+
print("Submission successful.")
|
| 1102 |
+
results_df = pd.DataFrame(results_log)
|
| 1103 |
+
return final_status, results_df
|
| 1104 |
+
except requests.exceptions.HTTPError as e:
|
| 1105 |
+
error_detail = f"Server responded with status {e.response.status_code}."
|
| 1106 |
+
try:
|
| 1107 |
+
error_json = e.response.json()
|
| 1108 |
+
error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
|
| 1109 |
+
except requests.exceptions.JSONDecodeError:
|
| 1110 |
+
error_detail += f" Response: {e.response.text[:500]}"
|
| 1111 |
+
status_message = f"Submission Failed: {error_detail}"
|
| 1112 |
+
print(status_message)
|
| 1113 |
+
results_df = pd.DataFrame(results_log)
|
| 1114 |
+
return status_message, results_df
|
| 1115 |
+
except requests.exceptions.Timeout:
|
| 1116 |
+
status_message = "Submission Failed: The request timed out."
|
| 1117 |
+
print(status_message)
|
| 1118 |
+
results_df = pd.DataFrame(results_log)
|
| 1119 |
+
return status_message, results_df
|
| 1120 |
+
except requests.exceptions.RequestException as e:
|
| 1121 |
+
status_message = f"Submission Failed: Network error - {e}"
|
| 1122 |
+
print(status_message)
|
| 1123 |
+
results_df = pd.DataFrame(results_log)
|
| 1124 |
+
return status_message, results_df
|
| 1125 |
except Exception as e:
|
| 1126 |
+
status_message = f"An unexpected error occurred during submission: {e}"
|
| 1127 |
+
print(status_message)
|
| 1128 |
+
results_df = pd.DataFrame(results_log)
|
| 1129 |
+
return status_message, results_df
|
| 1130 |
+
|
| 1131 |
+
# --- Build Gradio Interface using Blocks ---
|
| 1132 |
with gr.Blocks() as demo:
|
| 1133 |
+
gr.Markdown("# GAIA Benchmark AI Agent")
|
| 1134 |
+
gr.Markdown(
|
| 1135 |
+
"""
|
| 1136 |
+
**Advanced AI Agent for GAIA Benchmark**
|
| 1137 |
+
|
| 1138 |
+
This agent uses:
|
| 1139 |
+
- **Qwen 2.5-7B-Instruct** for reasoning and planning
|
| 1140 |
+
- **Tavily Search** for real-time information retrieval
|
| 1141 |
+
- **Python Interpreter** for computational tasks
|
| 1142 |
+
- **File Reading** capabilities for document analysis
|
| 1143 |
+
|
| 1144 |
+
**Instructions:**
|
| 1145 |
+
1. Clone this space and set up your environment variables:
|
| 1146 |
+
- `TAVILY_API_KEY`: Your Tavily API key for web search
|
| 1147 |
+
- `HF_TOKEN`: Your Hugging Face token (if needed)
|
| 1148 |
+
2. Log in to your Hugging Face account using the button below
|
| 1149 |
+
3. Click 'Run Evaluation & Submit All Answers' to start the evaluation
|
| 1150 |
+
|
| 1151 |
+
**Expected Performance:** This agent is designed to score >30% on the GAIA benchmark.
|
| 1152 |
+
"""
|
| 1153 |
+
)
|
| 1154 |
+
|
| 1155 |
+
gr.LoginButton()
|
| 1156 |
+
|
| 1157 |
+
run_button = gr.Button("Run Evaluation & Submit All Answers", variant="primary")
|
| 1158 |
+
|
| 1159 |
+
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
| 1160 |
+
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
| 1161 |
+
|
| 1162 |
+
run_button.click(
|
| 1163 |
+
fn=run_and_submit_all,
|
| 1164 |
+
outputs=[status_output, results_table]
|
| 1165 |
+
)
|
| 1166 |
+
|
| 1167 |
+
if __name__ == "__main__":
|
| 1168 |
+
print("\n" + "-"*50 + " GAIA Agent Starting " + "-"*50)
|
| 1169 |
+
|
| 1170 |
+
# Check for required environment variables
|
| 1171 |
+
required_vars = ["TAVILY_API_KEY"]
|
| 1172 |
+
missing_vars = []
|
| 1173 |
+
|
| 1174 |
+
for var in required_vars:
|
| 1175 |
+
if not os.getenv(var):
|
| 1176 |
+
missing_vars.append(var)
|
| 1177 |
+
|
| 1178 |
+
if missing_vars:
|
| 1179 |
+
print(f"⚠️ Missing environment variables: {', '.join(missing_vars)}")
|
| 1180 |
+
print(" Please set these variables for optimal performance.")
|
| 1181 |
+
else:
|
| 1182 |
+
print("✅ All required environment variables found.")
|
| 1183 |
+
|
| 1184 |
+
# Check for SPACE_HOST and SPACE_ID at startup for information
|
| 1185 |
+
space_host_startup = os.getenv("SPACE_HOST")
|
| 1186 |
+
space_id_startup = os.getenv("SPACE_ID")
|
| 1187 |
+
|
| 1188 |
+
if space_host_startup:
|
| 1189 |
+
print(f"✅ SPACE_HOST found: {space_host_startup}")
|
| 1190 |
+
print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
|
| 1191 |
+
else:
|
| 1192 |
+
print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
|
| 1193 |
+
|
| 1194 |
+
if space_id_startup:
|
| 1195 |
+
print(f"✅ SPACE_ID found: {space_id_startup}")
|
| 1196 |
+
print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
|
| 1197 |
+
print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
|
| 1198 |
+
else:
|
| 1199 |
+
print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
|
| 1200 |
+
|
| 1201 |
+
print("-"*120 + "\n")
|
| 1202 |
+
print("🚀 Launching GAIA Benchmark AI Agent...")
|
| 1203 |
+
demo.launch(debug=True, share=False)
|
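The `<tool_call>` round trip implemented by `extract_tool_calls` and `execute_tools` in the new code can be exercised in isolation. A minimal sketch, where `fake_search` and `DISPATCH` are hypothetical stand-ins for `ToolExecutor.tavily_search` and the tool dispatch in `execute_tools`:

```python
import re
from typing import Any, Dict, List

# Same single-string-argument convention as GAIAAgent.extract_tool_calls above.
TOOL_CALL_PATTERN = re.compile(r'<tool_call>(\w+)\(([^)]+)\)</tool_call>')

def extract_tool_calls(text: str) -> List[Dict[str, Any]]:
    """Pull (tool, args) pairs out of model output, stripping surrounding quotes."""
    return [
        {"tool": name, "args": args.strip().strip('"\'')}
        for name, args in TOOL_CALL_PATTERN.findall(text)
    ]

def fake_search(query: str) -> str:
    # Hypothetical stand-in for ToolExecutor.tavily_search.
    return f"results for: {query}"

DISPATCH = {"tavily_search": fake_search}

if __name__ == "__main__":
    reply = 'Let me check. <tool_call>tavily_search("capital of France")</tool_call>'
    for call in extract_tool_calls(reply):
        handler = DISPATCH.get(call["tool"])
        print(handler(call["args"]) if handler else f"Unknown tool: {call['tool']}")
        # -> results for: capital of France
```

Because the regex stops at the first `)`, arguments containing parentheses or multiple parameters would need a richer grammar; the JSON convention used by the commented-out LangGraph version is one such alternative.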