Update app.py
Browse files
app.py
CHANGED
|
@@ -204,35 +204,126 @@ def extract_numbers(text: str) -> str:
|
|
| 204 |
return f"Error extracting numbers: {str(e)}"
|
| 205 |
|
| 206 |
@tool
|
| 207 |
-
def
|
| 208 |
-
"""
|
| 209 |
|
| 210 |
Args:
|
| 211 |
-
|
| 212 |
-
item_type: What to count ("words", "characters", "lines", "sentences")
|
| 213 |
|
| 214 |
Returns:
|
| 215 |
-
The
|
| 216 |
"""
|
| 217 |
try:
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
else:
|
| 232 |
-
return f"
|
| 233 |
|
| 234 |
except Exception as e:
|
| 235 |
-
return f"Error
|
| 236 |
|
| 237 |
def setup_authentication():
|
| 238 |
"""Setup HuggingFace authentication for the app."""
|
|
@@ -300,7 +391,9 @@ class GAIAAgent:
|
|
| 300 |
calculate_math,
|
| 301 |
analyze_data,
|
| 302 |
extract_numbers,
|
| 303 |
-
|
|
|
|
|
|
|
| 304 |
]
|
| 305 |
|
| 306 |
# Create the CodeAgent with enhanced capabilities
|
|
@@ -311,12 +404,13 @@ class GAIAAgent:
|
|
| 311 |
add_base_tools=True, # Adds DuckDuckGoSearchTool and other base tools
|
| 312 |
additional_authorized_imports=[
|
| 313 |
'requests', 'bs4', 'json', 'csv', 'math', 'statistics',
|
| 314 |
-
're', 'urllib.parse', 'base64', 'datetime', 'calendar'
|
|
|
|
| 315 |
],
|
| 316 |
-
max_steps=
|
| 317 |
verbosity_level=1 # Reduce verbosity for cleaner output
|
| 318 |
)
|
| 319 |
-
print("โ
GAIA Agent initialized successfully with
|
| 320 |
except Exception as e:
|
| 321 |
print(f"โ Error initializing agent: {e}")
|
| 322 |
raise e
|
|
@@ -326,49 +420,323 @@ class GAIAAgent:
|
|
| 326 |
try:
|
| 327 |
print(f"๐ค Processing question: {question[:100]}...")
|
| 328 |
|
| 329 |
-
# Enhanced prompt
|
| 330 |
-
enhanced_prompt = f"""You are
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
1.
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
|
| 343 |
Question: {question}
|
| 344 |
|
| 345 |
-
|
| 346 |
|
| 347 |
-
# Run the agent with error handling
|
| 348 |
try:
|
| 349 |
result = self.agent.run(enhanced_prompt)
|
| 350 |
except Exception as api_error:
|
| 351 |
-
if "402" in str(api_error) or "Payment Required" in str(api_error)
|
| 352 |
-
print(f"โ ๏ธ API quota
|
| 353 |
-
|
| 354 |
-
result = f"Unable to process due to API limits: {str(api_error)}"
|
| 355 |
else:
|
| 356 |
raise api_error
|
| 357 |
|
| 358 |
-
#
|
| 359 |
if isinstance(result, str):
|
| 360 |
-
# Remove common prefixes and suffixes
|
| 361 |
result = result.strip()
|
| 362 |
|
| 363 |
-
# Remove
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
result = re.sub(r'^(FINAL\s*ANSWER\s*:?\s*)', '', result, flags=re.IGNORECASE)
|
| 365 |
result = re.sub(r'^(ANSWER\s*:?\s*)', '', result, flags=re.IGNORECASE)
|
| 366 |
result = re.sub(r'^(RESULT\s*:?\s*)', '', result, flags=re.IGNORECASE)
|
|
|
|
| 367 |
|
| 368 |
-
# Remove quotes if the entire answer is wrapped
|
| 369 |
if (result.startswith('"') and result.endswith('"')) or (result.startswith("'") and result.endswith("'")):
|
| 370 |
result = result[1:-1]
|
| 371 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
result = result.strip()
|
| 373 |
|
| 374 |
print(f"โ
Agent response: {result}")
|
|
|
|
| 204 |
return f"Error extracting numbers: {str(e)}"
|
| 205 |
|
| 206 |
@tool
|
| 207 |
+
def process_file_content(file_url: str) -> str:
    """Downloads and processes content from a file URL, supporting various formats.

    Args:
        file_url: URL to a file (PDF, CSV, TXT, etc.). Must be an absolute http(s) URL.

    Returns:
        The processed content of the file as text: the body for text/CSV responses, pretty-printed JSON for JSON responses, a short size/type summary for anything else, or an "Error processing file: ..." message on failure.
    """
    try:
        # Function-scope imports keep the tool self-contained; `json` in
        # particular must not depend on a module-level import being present.
        import json
        import mimetypes
        from urllib.parse import urlparse

        # Reject empty / relative / non-http(s) URLs before touching the network.
        url = (file_url or "").strip()
        parsed_url = urlparse(url)
        if parsed_url.scheme not in ("http", "https") or not parsed_url.netloc:
            raise ValueError(f"expected an absolute http(s) URL, got {file_url!r}")

        import requests

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Content type as reported by the server (used verbatim in the
        # binary-file summary below).
        content_type = response.headers.get('content-type', '').lower()

        # When the server gives no type, or only the generic octet-stream,
        # fall back to a guess based on the URL's file extension.
        detected_type = content_type
        if not detected_type or 'octet-stream' in detected_type:
            guessed_type, _ = mimetypes.guess_type(parsed_url.path)
            if guessed_type:
                detected_type = guessed_type.lower()

        # Process based on content type
        if 'text/' in detected_type or 'csv' in detected_type:
            return response.text
        if 'json' in detected_type:
            return json.dumps(response.json(), indent=2)

        # For binary files, return info about the file
        return f"Binary file detected. Size: {len(response.content)} bytes. Content-Type: {content_type}"

    except Exception as e:
        # Tools report failures as text so the calling agent can react to them.
        return f"Error processing file: {str(e)}"
|
| 242 |
+
|
| 243 |
+
@tool
|
| 244 |
+
def solve_equation(equation: str) -> str:
    """Solves mathematical equations and expressions symbolically.

    Args:
        equation: Mathematical equation to solve (e.g., "x^2 + 2*x - 3 = 0"). Input without an "=" sign is simplified instead of solved.

    Returns:
        The solution to the equation as text (a list of roots), the simplified expression, or an "Error solving equation: ..." message on failure.
    """
    try:
        import re

        import sympy as sp
        from sympy.parsing.sympy_parser import (
            convert_xor,
            implicit_multiplication,
            parse_expr,
            standard_transformations,
        )

        # convert_xor makes "^" mean exponentiation; implicit_multiplication
        # accepts forms such as "2x". Function names (sqrt, sin, log, ...)
        # resolve through parse_expr's default SymPy namespace.
        transformations = standard_transformations + (
            implicit_multiplication,
            convert_xor,
        )

        def _parse(text):
            # NOTE(security): parse_expr still evaluates generated code
            # internally, so this must not be exposed to untrusted input
            # beyond the agent's own tool calls.
            return parse_expr(text.strip(), transformations=transformations)

        # Treat "==" like "=", then split on an equality sign only; the
        # look-behind leaves "<=", ">=" and "!=" intact.
        normalized = equation.strip().replace('==', '=')
        sides = re.split(r'(?<![<>!])=', normalized)

        if len(sides) == 1:
            # It's an expression to simplify
            return str(sp.simplify(_parse(sides[0])))

        if len(sides) != 2 or not sides[0].strip() or not sides[1].strip():
            raise ValueError("expected exactly one '=' with an expression on each side")

        # It's an equation to solve. Solving lhs - rhs = 0 avoids building a
        # relation that could collapse to a plain True/False.
        expr = _parse(sides[0]) - _parse(sides[1])
        unknowns = sorted(expr.free_symbols, key=lambda symbol: symbol.name)
        x = sp.Symbol('x')

        if x in unknowns or not unknowns:
            # Keep the historical convention of solving for x when possible.
            return str(sp.solve(expr, x))
        if len(unknowns) == 1:
            return str(sp.solve(expr, unknowns[0]))
        return str(sp.solve(expr, unknowns, dict=True))

    except Exception as e:
        return f"Error solving equation: {str(e)}"
|
| 286 |
+
|
| 287 |
+
@tool
|
| 288 |
+
def parse_structured_data(data: str, format_type: str = "auto") -> str:
    """Parses and analyzes structured data (CSV, JSON, etc.).

    Args:
        data: The structured data as a string
        format_type: Format type ("csv", "json", "auto"); matched case-insensitively

    Returns:
        Analysis of the structured data: pretty-printed JSON, or for CSV the shape, column names, first rows and (when numeric columns exist) a numerical summary. Returns an "Unsupported format: ..." or "Error parsing data: ..." message otherwise.
    """
    try:
        import json
        from io import StringIO

        # Keep the caller's spelling for the "unsupported" message while
        # dispatching on a normalized value.
        requested_format = format_type
        format_type = format_type.strip().lower()

        if format_type == "auto":
            # Auto-detect format: JSON documents start with a brace/bracket;
            # anything multi-line containing commas is treated as CSV.
            data_clean = data.strip()
            if data_clean.startswith(('{', '[')):
                format_type = "json"
            elif ',' in data_clean and '\n' in data_clean:
                format_type = "csv"

        if format_type == "json":
            parsed = json.loads(data)
            return json.dumps(parsed, indent=2)

        if format_type == "csv":
            # Imported here so JSON handling does not require pandas.
            import pandas as pd

            df = pd.read_csv(StringIO(data))
            result = f"DataFrame shape: {df.shape}\n"
            result += f"Columns: {list(df.columns)}\n"
            result += f"First 5 rows:\n{df.head().to_string()}\n"
            # Check for the presence of numeric columns explicitly rather than
            # relying on the truthiness of the column labels.
            numeric_columns = df.select_dtypes(include=['number']).columns
            if len(numeric_columns) > 0:
                result += f"Numerical summary:\n{df.describe().to_string()}"
            return result

        return f"Unsupported format: {requested_format}"

    except Exception as e:
        return f"Error parsing data: {str(e)}"
|
| 327 |
|
| 328 |
def setup_authentication():
|
| 329 |
"""Setup HuggingFace authentication for the app."""
|
|
|
|
| 391 |
calculate_math,
|
| 392 |
analyze_data,
|
| 393 |
extract_numbers,
|
| 394 |
+
process_file_content,
|
| 395 |
+
solve_equation,
|
| 396 |
+
parse_structured_data
|
| 397 |
]
|
| 398 |
|
| 399 |
# Create the CodeAgent with enhanced capabilities
|
|
|
|
| 404 |
add_base_tools=True, # Adds DuckDuckGoSearchTool and other base tools
|
| 405 |
additional_authorized_imports=[
|
| 406 |
'requests', 'bs4', 'json', 'csv', 'math', 'statistics',
|
| 407 |
+
're', 'urllib.parse', 'base64', 'datetime', 'calendar',
|
| 408 |
+
'pandas', 'numpy', 'sympy', 'scipy'
|
| 409 |
],
|
| 410 |
+
max_steps=15, # Increased for complex multi-step reasoning
|
| 411 |
verbosity_level=1 # Reduce verbosity for cleaner output
|
| 412 |
)
|
| 413 |
+
print("โ
GAIA Agent initialized successfully with PRO model and enhanced tools")
|
| 414 |
except Exception as e:
|
| 415 |
print(f"โ Error initializing agent: {e}")
|
| 416 |
raise e
|
|
|
|
| 420 |
try:
|
| 421 |
print(f"๐ค Processing question: {question[:100]}...")
|
| 422 |
|
| 423 |
+
# Enhanced GAIA-optimized prompt
|
| 424 |
+
enhanced_prompt = f"""You are an expert AI assistant designed to excel at the GAIA benchmark. You must answer questions with perfect accuracy using a systematic approach.
|
| 425 |
+
|
| 426 |
+
CRITICAL INSTRUCTIONS FOR GAIA SUCCESS:
|
| 427 |
+
1. ANALYZE THE QUESTION: Read carefully and identify what type of question this is:
|
| 428 |
+
- Mathematical calculation or equation
|
| 429 |
+
- Information retrieval from web/files
|
| 430 |
+
- Data analysis or statistics
|
| 431 |
+
- Multi-step reasoning problem
|
| 432 |
+
- Factual lookup
|
| 433 |
+
|
| 434 |
+
2. CHOOSE YOUR APPROACH:
|
| 435 |
+
- For math: Use calculate_math tool or solve_equation for complex equations
|
| 436 |
+
- For web info: Use DuckDuckGoSearchTool then visit_webpage for details
|
| 437 |
+
- For files: Use process_file_content to download and analyze
|
| 438 |
+
- For data: Use analyze_data or parse_structured_data
|
| 439 |
+
- For numbers in text: Use extract_numbers first
|
| 440 |
+
|
| 441 |
+
3. BE SYSTEMATIC:
|
| 442 |
+
- Break complex questions into steps
|
| 443 |
+
- Use multiple tools if needed
|
| 444 |
+
- Verify your reasoning
|
| 445 |
+
- Double-check calculations
|
| 446 |
+
|
| 447 |
+
4. ANSWER FORMAT:
|
| 448 |
+
- Give ONLY the final answer
|
| 449 |
+
- No explanations, no "FINAL ANSWER:" prefix
|
| 450 |
+
- For numbers: just the number (e.g., "42", not "42.0")
|
| 451 |
+
- For text: just the text without quotes
|
| 452 |
+
- Be precise with units, dates, and formatting
|
| 453 |
+
|
| 454 |
+
5. ACCURACY IS PARAMOUNT:
|
| 455 |
+
- GAIA requires exact matches
|
| 456 |
+
- Round numbers appropriately
|
| 457 |
+
- Use proper case and spelling
|
| 458 |
+
- Include units when relevant
|
| 459 |
|
| 460 |
Question: {question}
|
| 461 |
|
| 462 |
+
Think step by step, use the appropriate tools, and provide only the final answer:"""
|
| 463 |
|
| 464 |
+
# Run the agent with enhanced error handling
|
| 465 |
try:
|
| 466 |
result = self.agent.run(enhanced_prompt)
|
| 467 |
except Exception as api_error:
|
| 468 |
+
if "402" in str(api_error) or "Payment Required" in str(api_error):
|
| 469 |
+
print(f"โ ๏ธ API quota issue (you have Pro, this shouldn't happen): {api_error}")
|
| 470 |
+
result = f"API Error: {str(api_error)}"
|
|
|
|
| 471 |
else:
|
| 472 |
raise api_error
|
| 473 |
|
| 474 |
+
# Enhanced answer cleaning for GAIA precision
|
| 475 |
if isinstance(result, str):
|
|
|
|
| 476 |
result = result.strip()
|
| 477 |
|
| 478 |
+
# Remove any explanatory text before the answer
|
| 479 |
+
lines = result.split('\n')
|
| 480 |
+
for i, line in enumerate(lines):
|
| 481 |
+
line = line.strip()
|
| 482 |
+
if line and not line.startswith(('Step', 'First', 'Next', 'Then', 'Finally', 'Therefore', 'So,', 'Thus')):
|
| 483 |
+
result = line
|
| 484 |
+
break
|
| 485 |
+
|
| 486 |
+
# Remove common prefixes
|
| 487 |
result = re.sub(r'^(FINAL\s*ANSWER\s*:?\s*)', '', result, flags=re.IGNORECASE)
|
| 488 |
result = re.sub(r'^(ANSWER\s*:?\s*)', '', result, flags=re.IGNORECASE)
|
| 489 |
result = re.sub(r'^(RESULT\s*:?\s*)', '', result, flags=re.IGNORECASE)
|
| 490 |
+
result = re.sub(r'^(THE\s*ANSWER\s*IS\s*:?\s*)', '', result, flags=re.IGNORECASE)
|
| 491 |
|
| 492 |
+
# Remove quotes if the entire answer is wrapped
|
| 493 |
if (result.startswith('"') and result.endswith('"')) or (result.startswith("'") and result.endswith("'")):
|
| 494 |
result = result[1:-1]
|
| 495 |
|
| 496 |
+
# Clean up decimal numbers (e.g., "42.0" -> "42")
|
| 497 |
+
if re.match(r'^\d+\.0+
|
| 498 |
+
|
| 499 |
+
def run_and_submit_all(profile: gr.OAuthProfile | None):
|
| 500 |
+
"""
|
| 501 |
+
Fetches all questions, runs the GAIAAgent on them, submits all answers,
|
| 502 |
+
and displays the results.
|
| 503 |
+
"""
|
| 504 |
+
# --- Determine HF Space Runtime URL and Repo URL ---
|
| 505 |
+
space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
|
| 506 |
+
|
| 507 |
+
if profile:
|
| 508 |
+
username = f"{profile.username}"
|
| 509 |
+
print(f"User logged in: {username}")
|
| 510 |
+
else:
|
| 511 |
+
print("User not logged in.")
|
| 512 |
+
return "Please Login to Hugging Face with the button.", None
|
| 513 |
+
|
| 514 |
+
api_url = DEFAULT_API_URL
|
| 515 |
+
questions_url = f"{api_url}/questions"
|
| 516 |
+
submit_url = f"{api_url}/submit"
|
| 517 |
+
|
| 518 |
+
# 1. Instantiate Enhanced Agent
|
| 519 |
+
try:
|
| 520 |
+
print("๐ Initializing GAIA Agent with smolagents...")
|
| 521 |
+
agent = GAIAAgent()
|
| 522 |
+
print("โ
Enhanced agent ready for GAIA benchmark!")
|
| 523 |
+
except Exception as e:
|
| 524 |
+
error_msg = f"Error initializing agent: {e}"
|
| 525 |
+
print(f"โ {error_msg}")
|
| 526 |
+
return error_msg, None
|
| 527 |
+
|
| 528 |
+
# In the case of an app running as a hugging Face space, this link points toward your codebase
|
| 529 |
+
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
|
| 530 |
+
print(f"Agent code link: {agent_code}")
|
| 531 |
+
|
| 532 |
+
# 2. Fetch Questions
|
| 533 |
+
print(f"๐ฅ Fetching questions from: {questions_url}")
|
| 534 |
+
try:
|
| 535 |
+
response = requests.get(questions_url, timeout=15)
|
| 536 |
+
response.raise_for_status()
|
| 537 |
+
questions_data = response.json()
|
| 538 |
+
if not questions_data:
|
| 539 |
+
print("Fetched questions list is empty.")
|
| 540 |
+
return "Fetched questions list is empty or invalid format.", None
|
| 541 |
+
print(f"โ
Fetched {len(questions_data)} questions from GAIA benchmark.")
|
| 542 |
+
except requests.exceptions.RequestException as e:
|
| 543 |
+
print(f"โ Error fetching questions: {e}")
|
| 544 |
+
return f"Error fetching questions: {e}", None
|
| 545 |
+
except requests.exceptions.JSONDecodeError as e:
|
| 546 |
+
print(f"โ Error decoding JSON response from questions endpoint: {e}")
|
| 547 |
+
print(f"Response text: {response.text[:500]}")
|
| 548 |
+
return f"Error decoding server response for questions: {e}", None
|
| 549 |
+
except Exception as e:
|
| 550 |
+
print(f"โ An unexpected error occurred fetching questions: {e}")
|
| 551 |
+
return f"An unexpected error occurred fetching questions: {e}", None
|
| 552 |
+
|
| 553 |
+
# 3. Run Enhanced Agent
|
| 554 |
+
results_log = []
|
| 555 |
+
answers_payload = []
|
| 556 |
+
print(f"๐ค Running enhanced GAIA agent on {len(questions_data)} questions...")
|
| 557 |
+
|
| 558 |
+
for i, item in enumerate(questions_data, 1):
|
| 559 |
+
task_id = item.get("task_id")
|
| 560 |
+
question_text = item.get("question")
|
| 561 |
+
if not task_id or question_text is None:
|
| 562 |
+
print(f"โ ๏ธ Skipping item with missing task_id or question: {item}")
|
| 563 |
+
continue
|
| 564 |
+
|
| 565 |
+
print(f"\n๐ Processing question {i}/{len(questions_data)} (ID: {task_id})")
|
| 566 |
+
try:
|
| 567 |
+
submitted_answer = agent(question_text)
|
| 568 |
+
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 569 |
+
results_log.append({
|
| 570 |
+
"Task ID": task_id,
|
| 571 |
+
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
| 572 |
+
"Submitted Answer": submitted_answer
|
| 573 |
+
})
|
| 574 |
+
print(f"โ
Answer for {task_id}: {submitted_answer}")
|
| 575 |
+
except Exception as e:
|
| 576 |
+
error_msg = f"AGENT ERROR: {e}"
|
| 577 |
+
print(f"โ Error running agent on task {task_id}: {e}")
|
| 578 |
+
answers_payload.append({"task_id": task_id, "submitted_answer": error_msg})
|
| 579 |
+
results_log.append({
|
| 580 |
+
"Task ID": task_id,
|
| 581 |
+
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
|
| 582 |
+
"Submitted Answer": error_msg
|
| 583 |
+
})
|
| 584 |
+
|
| 585 |
+
if not answers_payload:
|
| 586 |
+
print("โ Agent did not produce any answers to submit.")
|
| 587 |
+
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 588 |
+
|
| 589 |
+
# 4. Prepare Submission
|
| 590 |
+
submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
|
| 591 |
+
status_update = f"๐ Agent finished processing. Submitting {len(answers_payload)} answers for user '{username}'..."
|
| 592 |
+
print(status_update)
|
| 593 |
+
|
| 594 |
+
# 5. Submit
|
| 595 |
+
print(f"๐ค Submitting {len(answers_payload)} answers to: {submit_url}")
|
| 596 |
+
try:
|
| 597 |
+
response = requests.post(submit_url, json=submission_data, timeout=60)
|
| 598 |
+
response.raise_for_status()
|
| 599 |
+
result_data = response.json()
|
| 600 |
+
|
| 601 |
+
score = result_data.get('score', 'N/A')
|
| 602 |
+
correct_count = result_data.get('correct_count', '?')
|
| 603 |
+
total_attempted = result_data.get('total_attempted', '?')
|
| 604 |
+
|
| 605 |
+
final_status = (
|
| 606 |
+
f"๐ Submission Successful!\n"
|
| 607 |
+
f"๐ค User: {result_data.get('username')}\n"
|
| 608 |
+
f"๐ Overall Score: {score}% ({correct_count}/{total_attempted} correct)\n"
|
| 609 |
+
f"๐ฏ Target: >30% for certification\n"
|
| 610 |
+
f"๐ฌ Message: {result_data.get('message', 'No message received.')}"
|
| 611 |
+
)
|
| 612 |
+
|
| 613 |
+
if isinstance(score, (int, float)) and score >= 30:
|
| 614 |
+
final_status += f"\n๐ CONGRATULATIONS! You've achieved the target score of 30%!"
|
| 615 |
+
elif isinstance(score, (int, float)):
|
| 616 |
+
final_status += f"\n๐ Keep improving! You need {30-score:.1f}% more to reach the target."
|
| 617 |
+
|
| 618 |
+
print("โ
Submission successful!")
|
| 619 |
+
results_df = pd.DataFrame(results_log)
|
| 620 |
+
return final_status, results_df
|
| 621 |
+
|
| 622 |
+
except requests.exceptions.HTTPError as e:
|
| 623 |
+
error_detail = f"Server responded with status {e.response.status_code}."
|
| 624 |
+
try:
|
| 625 |
+
error_json = e.response.json()
|
| 626 |
+
error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
|
| 627 |
+
except requests.exceptions.JSONDecodeError:
|
| 628 |
+
error_detail += f" Response: {e.response.text[:500]}"
|
| 629 |
+
status_message = f"โ Submission Failed: {error_detail}"
|
| 630 |
+
print(status_message)
|
| 631 |
+
results_df = pd.DataFrame(results_log)
|
| 632 |
+
return status_message, results_df
|
| 633 |
+
except requests.exceptions.Timeout:
|
| 634 |
+
status_message = "โ Submission Failed: The request timed out."
|
| 635 |
+
print(status_message)
|
| 636 |
+
results_df = pd.DataFrame(results_log)
|
| 637 |
+
return status_message, results_df
|
| 638 |
+
except requests.exceptions.RequestException as e:
|
| 639 |
+
status_message = f"โ Submission Failed: Network error - {e}"
|
| 640 |
+
print(status_message)
|
| 641 |
+
results_df = pd.DataFrame(results_log)
|
| 642 |
+
return status_message, results_df
|
| 643 |
+
except Exception as e:
|
| 644 |
+
status_message = f"โ An unexpected error occurred during submission: {e}"
|
| 645 |
+
print(status_message)
|
| 646 |
+
results_df = pd.DataFrame(results_log)
|
| 647 |
+
return status_message, results_df
|
| 648 |
+
|
| 649 |
+
|
| 650 |
+
# --- Build Gradio Interface using Blocks ---
|
| 651 |
+
with gr.Blocks(title="GAIA Agent Evaluation") as demo:
|
| 652 |
+
gr.Markdown("# ๐ค Enhanced GAIA Agent Evaluation Runner")
|
| 653 |
+
gr.Markdown(
|
| 654 |
+
"""
|
| 655 |
+
**Enhanced Agent for GAIA Benchmark Certification**
|
| 656 |
+
|
| 657 |
+
This enhanced agent uses Hugging Face's **smolagents** framework with multiple specialized tools:
|
| 658 |
+
- ๐ **Web Search**: DuckDuckGoSearchTool (from base toolkit) for finding information
|
| 659 |
+
- ๐ **Python Interpreter**: Code execution capabilities (from base toolkit)
|
| 660 |
+
- ๐ **Web Scraping**: Custom webpage visitor for content extraction
|
| 661 |
+
- ๐งฎ **Mathematics**: Advanced calculation capabilities
|
| 662 |
+
- ๐ **Data Analysis**: Statistical analysis of numerical data
|
| 663 |
+
- ๐ข **Number Extraction**: Intelligent number parsing from text
|
| 664 |
+
- ๐ **Text Analysis**: Counting and text processing utilities
|
| 665 |
+
- ๐ค **LLM Model**: Llama-3.1-8B-Instruct for advanced reasoning
|
| 666 |
+
|
| 667 |
+
**Instructions:**
|
| 668 |
+
1. ๐ **Clone this space** and customize the agent as needed
|
| 669 |
+
2. ๐ **Log in** to your Hugging Face account using the button below
|
| 670 |
+
3. ๐ **Click 'Run Evaluation'** to test your agent on GAIA benchmark questions
|
| 671 |
+
4. ๐ฏ **Target**: Score >30% for course certification
|
| 672 |
+
|
| 673 |
+
**Goal**: Answer GAIA level 1 validation questions with exact match precision.
|
| 674 |
+
|
| 675 |
+
---
|
| 676 |
+
โ ๏ธ **Note**: Processing all questions may take several minutes due to the complexity of reasoning required.
|
| 677 |
+
"""
|
| 678 |
+
)
|
| 679 |
+
|
| 680 |
+
gr.LoginButton()
|
| 681 |
+
|
| 682 |
+
run_button = gr.Button("๐ Run Evaluation & Submit All Answers", variant="primary", size="lg")
|
| 683 |
+
|
| 684 |
+
status_output = gr.Textbox(
|
| 685 |
+
label="๐ Evaluation Status & Results",
|
| 686 |
+
lines=8,
|
| 687 |
+
interactive=False,
|
| 688 |
+
placeholder="Click the button above to start the evaluation..."
|
| 689 |
+
)
|
| 690 |
+
|
| 691 |
+
results_table = gr.DataFrame(
|
| 692 |
+
label="๐ Questions and Agent Responses",
|
| 693 |
+
wrap=True,
|
| 694 |
+
headers=["Task ID", "Question", "Submitted Answer"]
|
| 695 |
+
)
|
| 696 |
+
|
| 697 |
+
run_button.click(
|
| 698 |
+
fn=run_and_submit_all,
|
| 699 |
+
outputs=[status_output, results_table]
|
| 700 |
+
)
|
| 701 |
+
|
| 702 |
+
if __name__ == "__main__":
|
| 703 |
+
print("\n" + "="*60)
|
| 704 |
+
print("๐ค ENHANCED GAIA AGENT STARTING UP")
|
| 705 |
+
print("="*60)
|
| 706 |
+
|
| 707 |
+
# Setup authentication
|
| 708 |
+
print("๐ Setting up HuggingFace authentication...")
|
| 709 |
+
auth_success = setup_authentication()
|
| 710 |
+
|
| 711 |
+
# Check for SPACE_HOST and SPACE_ID at startup for information
|
| 712 |
+
space_host_startup = os.getenv("SPACE_HOST")
|
| 713 |
+
space_id_startup = os.getenv("SPACE_ID")
|
| 714 |
+
|
| 715 |
+
if space_host_startup:
|
| 716 |
+
print(f"โ
SPACE_HOST found: {space_host_startup}")
|
| 717 |
+
print(f" ๐ Runtime URL: https://{space_host_startup}.hf.space")
|
| 718 |
+
else:
|
| 719 |
+
print("โน๏ธ SPACE_HOST environment variable not found (running locally?).")
|
| 720 |
+
if not auth_success:
|
| 721 |
+
print("๐ก For local testing, you may need to run:")
|
| 722 |
+
print(" from huggingface_hub import notebook_login")
|
| 723 |
+
print(" notebook_login()")
|
| 724 |
+
|
| 725 |
+
if space_id_startup:
|
| 726 |
+
print(f"โ
SPACE_ID found: {space_id_startup}")
|
| 727 |
+
print(f" ๐ Repo URL: https://huggingface.co/spaces/{space_id_startup}")
|
| 728 |
+
print(f" ๐ Code URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
|
| 729 |
+
else:
|
| 730 |
+
print("โน๏ธ SPACE_ID environment variable not found (running locally?).")
|
| 731 |
+
|
| 732 |
+
print("="*60)
|
| 733 |
+
print("๐ Launching Enhanced GAIA Agent Interface...")
|
| 734 |
+
print("๐ฏ Target: >30% score on GAIA benchmark")
|
| 735 |
+
print("="*60 + "\n")
|
| 736 |
+
|
| 737 |
+
demo.launch(debug=True, share=False), result):
|
| 738 |
+
result = str(int(float(result)))
|
| 739 |
+
|
| 740 |
result = result.strip()
|
| 741 |
|
| 742 |
print(f"โ
Agent response: {result}")
|