Spaces:
Sleeping
Sleeping
derkaal
committed on
Commit
·
c84963f
1
Parent(s):
f6cf872
Add GAIA agent files for certification
Browse files- .gitignore +30 -0
- README.md +41 -8
- app.py +264 -0
- config.json +25 -0
- gaiaX/README.md +119 -0
- gaiaX/__init__.py +9 -0
- gaiaX/agent.py +275 -0
- gaiaX/api.py +225 -0
- gaiaX/config.py +102 -0
- gaiaX/question_handlers.py +320 -0
- gaiaX/tools.py +125 -0
- gaiaX/utils.py +239 -0
- requirements.txt +18 -0
.gitignore
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment variables
|
| 2 |
+
.env
|
| 3 |
+
|
| 4 |
+
# Python cache files
|
| 5 |
+
__pycache__/
|
| 6 |
+
*.py[cod]
|
| 7 |
+
*$py.class
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
dist/
|
| 11 |
+
build/
|
| 12 |
+
*.egg-info/
|
| 13 |
+
|
| 14 |
+
# Virtual environments
|
| 15 |
+
venv/
|
| 16 |
+
env/
|
| 17 |
+
ENV/
|
| 18 |
+
|
| 19 |
+
# Logs
|
| 20 |
+
logs/
|
| 21 |
+
*.log
|
| 22 |
+
|
| 23 |
+
# Progress files
|
| 24 |
+
gaia_progress.json
|
| 25 |
+
|
| 26 |
+
# Temporary files
|
| 27 |
+
.DS_Store
|
| 28 |
+
.vscode/
|
| 29 |
+
*.swp
|
| 30 |
+
*.swo
|
README.md
CHANGED
|
@@ -1,14 +1,47 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 5.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
|
| 11 |
-
|
| 12 |
---
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: GAIA Benchmark Agent
|
| 3 |
+
emoji: 🧠
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.25.2
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
hf_oauth: true
|
| 11 |
+
hf_oauth_expiration_minutes: 480
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# GAIA Benchmark Agent
|
| 15 |
+
|
| 16 |
+
This Hugging Face Space hosts a GAIA (General AI Assistant) benchmark agent designed to solve certification challenges across various domains of AI and machine learning.
|
| 17 |
+
|
| 18 |
+
## Features
|
| 19 |
+
|
| 20 |
+
- Processes questions from the GAIA benchmark
|
| 21 |
+
- Uses LangChain and OpenAI's language models
|
| 22 |
+
- Analyzes questions and identifies their types
|
| 23 |
+
- Retrieves relevant context when needed
|
| 24 |
+
- Generates accurate, well-reasoned answers
|
| 25 |
+
|
| 26 |
+
## Usage
|
| 27 |
+
|
| 28 |
+
1. Log in to your Hugging Face account using the button
|
| 29 |
+
2. Click 'Run Evaluation & Submit All Answers' to:
|
| 30 |
+
- Fetch questions from the GAIA benchmark
|
| 31 |
+
- Run the agent on all questions
|
| 32 |
+
- Submit answers and see your score
|
| 33 |
+
|
| 34 |
+
## Implementation Details
|
| 35 |
+
|
| 36 |
+
The agent uses a modular architecture with specialized handlers for different question types:
|
| 37 |
+
- Factual knowledge questions
|
| 38 |
+
- Technical implementation questions
|
| 39 |
+
- Mathematical questions
|
| 40 |
+
- Context-based analysis questions
|
| 41 |
+
- Ethical/societal impact questions
|
| 42 |
+
|
| 43 |
+
## Repository
|
| 44 |
+
|
| 45 |
+
The code for this agent is available at: https://huggingface.co/derkaal/GAIA-agent
|
| 46 |
+
|
| 47 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
GAIA Benchmark Agent Interface
|
| 4 |
+
|
| 5 |
+
This script integrates the modular GAIA agent with the provided interface template.
|
| 6 |
+
It replaces the BasicAgent class with our GAIA agent implementation.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import gradio as gr
|
| 11 |
+
import requests
|
| 12 |
+
import inspect
|
| 13 |
+
import pandas as pd
|
| 14 |
+
from typing import Dict, List, Any, Optional
|
| 15 |
+
|
| 16 |
+
# Import the GAIA agent modules
|
| 17 |
+
from gaiaX.config import (
|
| 18 |
+
logger, CONFIG, HF_USERNAME, OPENAI_API_KEY,
|
| 19 |
+
TAVILY_API_KEY, API_BASE_URL, validate_env_vars
|
| 20 |
+
)
|
| 21 |
+
from gaiaX.agent import initialize_agent, get_agent_response
|
| 22 |
+
from gaiaX.question_handlers import process_question, detect_question_type
|
| 23 |
+
|
| 24 |
+
# --- Constants ---
|
| 25 |
+
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 26 |
+
|
| 27 |
+
# --- GAIA Agent Implementation ---
|
| 28 |
+
class GAIAAgent:
    """
    GAIA Benchmark Agent implementation that integrates with the provided interface.

    Wraps the modular gaiaX agent behind the simple callable contract the
    evaluation runner expects: construct once, then invoke with a question string.
    """

    def __init__(self):
        """Initialize the GAIA agent.

        Raises:
            ValueError: if required environment variables are missing.
        """
        logger.info("Initializing GAIA agent...")

        # Fail fast when the environment is not configured correctly.
        try:
            validate_env_vars()
        except ValueError as e:
            logger.error(f"Environment validation failed: {e}")
            raise

        # Build the underlying LangChain agent executor once and reuse it
        # for every question.
        self.agent = initialize_agent(OPENAI_API_KEY, "openai_functions")
        logger.info("GAIA agent initialized successfully.")

    def __call__(self, question: str) -> str:
        """
        Process a question and return the answer.

        Args:
            question: The question text

        Returns:
            The agent's answer as a string
        """
        logger.info(f"Agent received question (first 50 chars): {question[:50]}...")

        # The gaiaX pipeline expects a question record rather than a bare string.
        payload = {"task_id": "custom_question", "question": question, "has_file": False}

        try:
            # Classify the question; the detected type is logged for diagnostics.
            question_type = detect_question_type(question)
            logger.info(f"Detected question type: {question_type}")

            outcome = process_question(self.agent, payload, API_BASE_URL)
            answer = outcome.get("answer", "")

            # Never hand an empty string back to the runner; substitute a
            # human-readable fallback instead.
            if not answer:
                logger.warning("Agent returned an empty answer.")
                answer = "I couldn't generate an answer for this question."

            logger.info(f"Agent returning answer (first 50 chars): {answer[:50]}...")
            return answer
        except Exception as e:
            # Report failures inline so one bad question cannot abort the
            # whole evaluation run.
            logger.error(f"Error processing question: {e}")
            return f"Error: {str(e)}"
|
| 88 |
+
|
| 89 |
+
# --- Run and Submit All Function ---
|
| 90 |
+
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """
    Fetches all questions, runs the GAIA Agent on them, submits all answers,
    and displays the results.

    Args:
        profile: The Gradio OAuth profile of the logged-in user, or None when
            no one is logged in.

    Returns:
        A (status_message, results_dataframe) tuple; the dataframe is None
        when the run aborts before any question is processed.
    """
    # --- Determine HF Space Runtime URL and Repo URL ---
    space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code

    # Require a logged-in user: the scoring API needs a username.
    if profile:
        username = f"{profile.username}"
        print(f"User logged in: {username}")
    else:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # 1. Instantiate Agent (may raise if env vars / API keys are missing)
    try:
        agent = GAIAAgent()
    except Exception as e:
        print(f"Error instantiating agent: {e}")
        return f"Error initializing agent: {e}", None

    # When running as a Hugging Face Space, this link points toward your codebase.
    # NOTE(review): if SPACE_ID is unset this produces ".../None/tree/main" — confirm acceptable.
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    print(agent_code)

    # 2. Fetch Questions from the scoring service
    print(f"Fetching questions from: {questions_url}")
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            print("Fetched questions list is empty.")
            return "Fetched questions list is empty or invalid format.", None
        print(f"Fetched {len(questions_data)} questions.")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching questions: {e}")
        return f"Error fetching questions: {e}", None
    except requests.exceptions.JSONDecodeError as e:
        # The server answered but the body was not valid JSON; show a preview.
        print(f"Error decoding JSON response from questions endpoint: {e}")
        print(f"Response text: {response.text[:500]}")
        return f"Error decoding server response for questions: {e}", None
    except Exception as e:
        print(f"An unexpected error occurred fetching questions: {e}")
        return f"An unexpected error occurred fetching questions: {e}", None

    # 3. Run your Agent on every fetched question, logging each outcome
    results_log = []
    answers_payload = []
    print(f"Running agent on {len(questions_data)} questions...")
    for item in questions_data:
        task_id = item.get("task_id")
        question_text = item.get("question")
        # Skip malformed records rather than aborting the whole run.
        if not task_id or question_text is None:
            print(f"Skipping item with missing task_id or question: {item}")
            continue
        try:
            submitted_answer = agent(question_text)
            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
        except Exception as e:
            # Record the failure in the results table but keep going.
            print(f"Error running agent on task {task_id}: {e}")
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})

    if not answers_payload:
        print("Agent did not produce any answers to submit.")
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    # 4. Prepare Submission payload in the format the scoring endpoint expects
    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
    print(status_update)

    # 5. Submit all answers in a single POST; longer timeout than the fetch
    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
    try:
        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        result_data = response.json()
        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        print("Submission successful.")
        results_df = pd.DataFrame(results_log)
        return final_status, results_df
    except requests.exceptions.HTTPError as e:
        # Try to surface the server's error detail; fall back to raw text.
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_json = e.response.json()
            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
        except requests.exceptions.JSONDecodeError:
            error_detail += f" Response: {e.response.text[:500]}"
        status_message = f"Submission Failed: {error_detail}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except requests.exceptions.Timeout:
        status_message = "Submission Failed: The request timed out."
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except requests.exceptions.RequestException as e:
        status_message = f"Submission Failed: Network error - {e}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except Exception as e:
        status_message = f"An unexpected error occurred during submission: {e}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
# --- Build Gradio Interface using Blocks ---
|
| 213 |
+
# Build the Gradio UI: instructions, OAuth login, a run button, and two
# outputs (status text + a table of per-question answers).
with gr.Blocks() as demo:
    gr.Markdown("# GAIA Benchmark Agent Evaluation Runner")
    gr.Markdown(
        """
        **Instructions:**

        1. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
        2. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.

        ---
        **Note:**
        This interface uses the modular GAIA Benchmark Agent to process questions from the GAIA benchmark.
        The agent uses LangChain and OpenAI's language models to analyze questions, retrieve relevant context,
        and generate accurate answers across various domains of AI and machine learning.
        """
    )

    # OAuth login; Gradio injects the resulting profile into run_and_submit_all.
    gr.LoginButton()

    run_button = gr.Button("Run Evaluation & Submit All Answers")

    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

    # No explicit inputs: the OAuth profile argument is supplied by Gradio.
    run_button.click(
        fn=run_and_submit_all,
        outputs=[status_output, results_table]
    )

if __name__ == "__main__":
    print("\n" + "-"*30 + " App Starting " + "-"*30)
    # Check for SPACE_HOST and SPACE_ID at startup for information
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup

    if space_host_startup:
        print(f"✅ SPACE_HOST found: {space_host_startup}")
        print(f"   Runtime URL should be: https://{space_host_startup}.hf.space")
    else:
        print("ℹ️  SPACE_HOST environment variable not found (running locally?).")

    if space_id_startup: # Print repo URLs if SPACE_ID is found
        print(f"✅ SPACE_ID found: {space_id_startup}")
        print(f"   Repo URL: https://huggingface.co/spaces/{space_id_startup}")
        print(f"   Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
    else:
        print("ℹ️  SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")

    print("-"*(60 + len(" App Starting ")) + "\n")

    print("Launching Gradio Interface for GAIA Benchmark Agent Evaluation...")
    # debug=True surfaces tracebacks in the UI; share=False keeps the app
    # local to the Space (no public gradio.live tunnel).
    demo.launch(debug=True, share=False)
|
config.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_parameters": {
|
| 3 |
+
"model_name": "gpt-4-turbo",
|
| 4 |
+
"temperature": 0.2,
|
| 5 |
+
"max_tokens": 1024,
|
| 6 |
+
"top_p": 1.0,
|
| 7 |
+
"frequency_penalty": 0.0,
|
| 8 |
+
"presence_penalty": 0.0
|
| 9 |
+
},
|
| 10 |
+
"paths": {
|
| 11 |
+
"progress_file": "gaia_progress.json"
|
| 12 |
+
},
|
| 13 |
+
"api": {
|
| 14 |
+
"base_url": "https://agents-course-unit4-scoring.hf.space"
|
| 15 |
+
},
|
| 16 |
+
"logging": {
|
| 17 |
+
"level": "INFO",
|
| 18 |
+
"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
| 19 |
+
"file": "logs/gaia_agent.log",
|
| 20 |
+
"console": true
|
| 21 |
+
},
|
| 22 |
+
"debugging": {
|
| 23 |
+
"enable_langchain_debug": false
|
| 24 |
+
}
|
| 25 |
+
}
|
gaiaX/README.md
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GAIA Benchmark Agent
|
| 2 |
+
|
| 3 |
+
A LangChain-based agent for solving Hugging Face certification challenges in the GAIA benchmark.
|
| 4 |
+
|
| 5 |
+
## Overview
|
| 6 |
+
|
| 7 |
+
The GAIA Benchmark Agent is designed to process and answer questions from the Hugging Face GAIA benchmark. It uses LangChain and OpenAI's language models to analyze questions, retrieve relevant context, and generate accurate answers across various domains of AI and machine learning.
|
| 8 |
+
|
| 9 |
+
## Features
|
| 10 |
+
|
| 11 |
+
- Question type detection and specialized handling
|
| 12 |
+
- Context-aware processing for questions with associated files
|
| 13 |
+
- Batch processing with progress tracking
|
| 14 |
+
- Performance analysis and reporting
|
| 15 |
+
- Support for different agent types (OpenAI Functions, ReAct)
|
| 16 |
+
|
| 17 |
+
## Project Structure
|
| 18 |
+
|
| 19 |
+
The project has been modularized for better maintainability and to address token limit issues:
|
| 20 |
+
|
| 21 |
+
```
|
| 22 |
+
gaiaX/
|
| 23 |
+
├── __init__.py # Package initialization
|
| 24 |
+
├── config.py # Configuration handling
|
| 25 |
+
├── api.py # API interaction functions
|
| 26 |
+
├── tools.py # LangChain tools
|
| 27 |
+
├── agent.py # Agent initialization and response handling
|
| 28 |
+
├── question_handlers.py # Question type detection and handling
|
| 29 |
+
├── utils.py # Utility functions
|
| 30 |
+
└── README.md # This file
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
## Setup
|
| 34 |
+
|
| 35 |
+
1. Clone the repository
|
| 36 |
+
2. Install dependencies:
|
| 37 |
+
```
|
| 38 |
+
pip install -r requirements.txt
|
| 39 |
+
```
|
| 40 |
+
3. Create a `.env` file with the following variables:
|
| 41 |
+
```
|
| 42 |
+
HF_USERNAME=your_huggingface_username
|
| 43 |
+
OPENAI_API_KEY=your_openai_api_key
|
| 44 |
+
TAVILY_API_KEY=your_tavily_api_key # Optional, for search functionality
|
| 45 |
+
```
|
| 46 |
+
4. Create a `config.json` file with your configuration:
|
| 47 |
+
```json
|
| 48 |
+
{
|
| 49 |
+
"model_parameters": {
|
| 50 |
+
"model_name": "gpt-4-turbo",
|
| 51 |
+
"temperature": 0.2
|
| 52 |
+
},
|
| 53 |
+
"paths": {
|
| 54 |
+
"progress_file": "gaia_progress.json"
|
| 55 |
+
},
|
| 56 |
+
"api": {
|
| 57 |
+
"base_url": "https://api.example.com/gaia"
|
| 58 |
+
},
|
| 59 |
+
"logging": {
|
| 60 |
+
"level": "INFO",
|
| 61 |
+
"file": "logs/gaia_agent.log",
|
| 62 |
+
"console": true
|
| 63 |
+
}
|
| 64 |
+
}
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
## Usage
|
| 68 |
+
|
| 69 |
+
The GAIA Benchmark Agent can be used in several modes:
|
| 70 |
+
|
| 71 |
+
### Test Mode
|
| 72 |
+
|
| 73 |
+
Test the agent with a sample question or a custom question:
|
| 74 |
+
|
| 75 |
+
```bash
|
| 76 |
+
python gaia_agent_new.py test --agent-type openai_functions --question "What is deep learning?"
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
With a context file:
|
| 80 |
+
|
| 81 |
+
```bash
|
| 82 |
+
python gaia_agent_new.py test --agent-type openai_functions --question "Explain the concepts in this paper." --file path/to/paper.txt
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
### Random Question Mode
|
| 86 |
+
|
| 87 |
+
Process a random question from the GAIA benchmark:
|
| 88 |
+
|
| 89 |
+
```bash
|
| 90 |
+
python gaia_agent_new.py random --agent-type openai_functions
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
### Batch Processing Mode
|
| 94 |
+
|
| 95 |
+
Process a batch of questions from the GAIA benchmark:
|
| 96 |
+
|
| 97 |
+
```bash
|
| 98 |
+
python gaia_agent_new.py batch --agent-type openai_functions --batch-size 10 --progress-file progress.json --limit 50
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
### Submit Answers
|
| 102 |
+
|
| 103 |
+
Submit processed answers to the GAIA benchmark:
|
| 104 |
+
|
| 105 |
+
```bash
|
| 106 |
+
python gaia_agent_new.py submit --progress-file progress.json --agent-code-link https://github.com/yourusername/gaia-agent
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
## Testing
|
| 110 |
+
|
| 111 |
+
Run the test suite:
|
| 112 |
+
|
| 113 |
+
```bash
|
| 114 |
+
python test_gaia_agent_new.py
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
## License
|
| 118 |
+
|
| 119 |
+
[MIT License](LICENSE)
|
gaiaX/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GAIA Benchmark Agent - Hugging Face Certification Challenge Solver
|
| 3 |
+
|
| 4 |
+
This package provides a LangChain agent to solve Hugging Face certification
|
| 5 |
+
challenges for the GAIA benchmark. It includes batch processing capabilities,
|
| 6 |
+
progress tracking, and performance analysis.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
__version__ = "1.0.0"
|
gaiaX/agent.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Agent module for GAIA Benchmark Agent.
|
| 4 |
+
|
| 5 |
+
This module handles the initialization of LangChain agents and
|
| 6 |
+
processing of responses for different question types.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import tempfile
|
| 10 |
+
import json
|
| 11 |
+
from typing import Dict, List, Any, Optional, Union, Tuple
|
| 12 |
+
|
| 13 |
+
from langchain.agents import AgentExecutor, create_openai_functions_agent, create_react_agent
|
| 14 |
+
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate
|
| 15 |
+
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
|
| 16 |
+
from langchain_openai import ChatOpenAI
|
| 17 |
+
from langchain.memory import ConversationBufferMemory
|
| 18 |
+
from langchain.globals import set_debug
|
| 19 |
+
|
| 20 |
+
from gaiaX.config import logger, CONFIG, OPENAI_API_KEY, TAVILY_API_KEY
|
| 21 |
+
from gaiaX.tools import get_tools
|
| 22 |
+
from gaiaX.api import download_file_for_task
|
| 23 |
+
|
| 24 |
+
def initialize_agent(api_key: str = OPENAI_API_KEY, agent_type: str = "openai_functions") -> Any:
    """
    Initialize a LangChain agent with appropriate tools and configuration.

    Model parameters (model name, temperature, token limits, penalties) are
    read from the module-level CONFIG, with sensible defaults when absent.

    Args:
        api_key: OpenAI API key or other LLM provider key
        agent_type: Type of agent to initialize ("openai_functions" or "react");
            any value other than "react" falls through to the OpenAI
            Functions agent.

    Returns:
        Initialized LangChain agent (an AgentExecutor)
    """
    # Enable LangChain debugging if configured
    debug_enabled = CONFIG.get("debugging", {}).get("enable_langchain_debug", False)
    if debug_enabled:
        set_debug(True)
        logger.info("LangChain debugging enabled")

    # Get model parameters from config (defaults mirror config.json)
    model_params = CONFIG.get("model_parameters", {})
    model_name = model_params.get("model_name", "gpt-4-turbo")
    temperature = model_params.get("temperature", 0.2)
    max_tokens = model_params.get("max_tokens", None)
    top_p = model_params.get("top_p", 1.0)
    frequency_penalty = model_params.get("frequency_penalty", 0.0)
    presence_penalty = model_params.get("presence_penalty", 0.0)

    logger.info(f"Initializing agent with model: {model_name}, temperature: {temperature}, type: {agent_type}")

    # Initialize the language model
    llm = ChatOpenAI(
        model=model_name,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
        api_key=api_key
    )

    # Get tools for the agent (search enabled when a Tavily key is present)
    tools = get_tools(include_search=True, tavily_api_key=TAVILY_API_KEY)

    if agent_type == "react":
        # Create a ReAct agent with a specialized prompt for GAIA benchmark.
        # The Final Answer formatting rules below are required by the GAIA
        # scorer (exact-match answers) — do not reword them casually.
        react_template = """
You are a general AI assistant. I will ask you a question.
You have access to the following tools:
{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer.
Final Answer: [The final answer to the original input question. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. Output *only* the final answer value here, without any other surrounding text or prefixes.]

Begin!

Question: {input}
Thought: {agent_scratchpad}
"""

        # Create the prompt template
        react_prompt = PromptTemplate.from_template(react_template)

        # Create the ReAct agent
        agent = create_react_agent(llm, tools, react_prompt)

        # Create the agent executor.
        # NOTE(review): unlike the OpenAI Functions branch below, this
        # executor is built without conversation memory — confirm intentional.
        agent_executor = AgentExecutor(
            agent=agent,
            tools=tools,
            verbose=True,
            handle_parsing_errors=True
        )

        logger.info("ReAct agent initialized successfully")

    else: # Default to OpenAI Functions agent
        # Create a detailed system prompt with instructions for different question types
        system_prompt = """
You are an expert AI assistant specialized in solving Hugging Face certification challenges for the GAIA benchmark.
Your goal is to provide accurate, well-reasoned answers to questions across various domains of AI and machine learning.

When given a question:
1. Carefully analyze what is being asked and identify the question type
2. Determine if you need additional context from any provided files
3. If context files are available, request them using the fetch_context_file tool
4. Formulate a comprehensive, accurate answer based on your knowledge and the provided context
5. Ensure your answer is clear, concise, and directly addresses the question

QUESTION TYPES AND STRATEGIES:

1. FACTUAL KNOWLEDGE QUESTIONS:
- These test your knowledge of AI/ML concepts, techniques, or history
- Provide precise definitions and explanations
- Include relevant examples to illustrate concepts
- Cite important research papers or developments when applicable

2. TECHNICAL IMPLEMENTATION QUESTIONS:
- These ask about code, algorithms, or implementation details
- Provide step-by-step explanations of algorithms or processes
- Include pseudocode or code snippets when helpful
- Explain trade-offs between different approaches

3. MATHEMATICAL QUESTIONS:
- These involve equations, proofs, or statistical concepts
- Show your work step-by-step
- Explain the intuition behind mathematical concepts
- Use clear notation and define all variables

4. CONTEXT-BASED ANALYSIS QUESTIONS:
- These require analyzing provided context files
- Thoroughly read and understand the context before answering
- Reference specific parts of the context in your answer
- Connect the context to broader AI/ML concepts when relevant

5. ETHICAL/SOCIETAL IMPACT QUESTIONS:
- These address ethical considerations or societal impacts of AI
- Present balanced perspectives on controversial topics
- Consider multiple stakeholders and viewpoints
- Discuss both benefits and potential risks

6. PROBLEM-SOLVING QUESTIONS:
- These present novel problems requiring creative solutions
- Break down the problem into manageable components
- Consider multiple approaches before selecting the best one
- Explain why your solution is optimal given constraints

7. CODING QUESTIONS:
- These require implementing or debugging code
- Provide clean, efficient, and well-commented code
- Explain your implementation choices
- Consider edge cases and potential optimizations

IMPORTANT FORMATTING GUIDELINES:

1. For numerical answers:
- Provide only the number without units unless specifically requested
- Use standard notation (avoid scientific notation unless appropriate)
- Round to the specified number of decimal places if indicated

2. For multiple-choice questions:
- Clearly indicate your selected option (A, B, C, D, etc.)
- Briefly explain your reasoning for the selection

3. For short answer questions:
- Be concise and direct
- Focus on the key points without unnecessary elaboration

4. For coding questions:
- Provide complete, runnable code unless a snippet is requested
- Include comments explaining complex logic
- Follow standard coding conventions for the language

Remember, your goal is to provide accurate, helpful answers that demonstrate deep understanding of AI and machine learning concepts.
"""

        # Create the prompt template; placeholders supply conversation
        # history and the agent's intermediate tool-call scratchpad.
        prompt = ChatPromptTemplate.from_messages([
            ("system", system_prompt),
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{input}"),
            MessagesPlaceholder(variable_name="agent_scratchpad"),
        ])

        # Create memory for conversation history (keyed to the placeholder above)
        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

        # Create the OpenAI Functions agent
        agent = create_openai_functions_agent(llm, tools, prompt)

        # Create the agent executor
        agent_executor = AgentExecutor(
            agent=agent,
            tools=tools,
            verbose=True,
            memory=memory,
            handle_parsing_errors=True
        )

        logger.info("OpenAI Functions agent initialized successfully")

    return agent_executor
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def get_agent_response(agent_executor: AgentExecutor, question_data: dict) -> str:
    """
    Ask the LangChain agent to answer a single benchmark question.

    Args:
        agent_executor: Initialized LangChain agent executor
        question_data: Dictionary containing question data

    Returns:
        Agent's response as a string, or an "Error: ..." string on failure.
    """
    try:
        text = question_data.get("question", "")
        tid = question_data.get("task_id", "")
        file_attached = question_data.get("has_file", False)

        payload = {"input": text}

        # When the task ships a file, fetch it and fold it into the prompt.
        if file_attached and tid:
            logger.info(f"Question has an associated file. Attempting to download for task {tid}")
            try:
                with tempfile.TemporaryDirectory() as workdir:
                    downloaded = download_file_for_task(
                        CONFIG.get("api", {}).get("base_url"), tid, workdir
                    )
                    try:
                        with open(downloaded, 'r', encoding='utf-8') as handle:
                            file_text = handle.read()
                        payload["context"] = file_text
                        payload["input"] = f"Question: {text}\n\nContext: {file_text}"
                    except UnicodeDecodeError:
                        # Non-text payload: describe it instead of embedding it.
                        info = Path(downloaded)
                        binary_note = (
                            f"Binary file detected ({info.suffix}, {info.stat().st_size} bytes). "
                            "This file cannot be displayed as text."
                        )
                        payload["input"] = f"Question: {text}\n\nContext: {binary_note}"
            except Exception as download_err:
                logger.error(f"Error handling context file: {str(download_err)}")
                payload["input"] = (
                    f"Question: {text}\n\nNote: There was an error retrieving the context file: "
                    f"{str(download_err)}"
                )

        logger.info(f"Sending question to agent: {text[:100]}...")
        reply = agent_executor.invoke(payload)

        # The executor returns a dict; the answer lives under "output".
        return reply.get("output", "")

    except Exception as err:
        logger.error(f"Error getting agent response: {str(err)}")
        return f"Error: {str(err)}"
|
gaiaX/api.py
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
API interaction module for GAIA Benchmark Agent.
|
| 4 |
+
|
| 5 |
+
This module handles all interactions with the GAIA benchmark API,
|
| 6 |
+
including fetching questions, downloading files, and submitting answers.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import requests
|
| 11 |
+
from typing import Dict, List, Any, Optional
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
from gaiaX.config import logger, API_BASE_URL
|
| 15 |
+
|
| 16 |
+
def get_all_questions(api_base_url: str = API_BASE_URL) -> List[Dict[str, Any]]:
    """
    Retrieve all available questions from the GAIA benchmark.

    Args:
        api_base_url: Base URL for the GAIA API

    Returns:
        List of question dictionaries

    Raises:
        requests.RequestException: If the API request fails (including timeouts)
        ValueError: If the response is not valid JSON or doesn't contain expected data
    """
    try:
        # Bounded timeout so a stalled server cannot hang the agent forever.
        response = requests.get(f"{api_base_url}/questions", timeout=30)
        response.raise_for_status()  # Raise exception for 4XX/5XX responses

        questions = response.json()

        if not isinstance(questions, list):
            raise ValueError("Expected a list of questions but received a different format")

        return questions

    except requests.RequestException as e:
        logger.error(f"Error fetching questions: {e}")
        raise

    except json.JSONDecodeError:
        logger.error("Error decoding response as JSON")
        raise ValueError("Invalid JSON response from the API")
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def get_random_question(api_base_url: str = API_BASE_URL) -> Dict[str, Any]:
    """
    Retrieve a random question from the GAIA benchmark.

    Args:
        api_base_url: Base URL for the GAIA API

    Returns:
        A single question dictionary

    Raises:
        requests.RequestException: If the API request fails (including timeouts)
        ValueError: If the response is not valid JSON or doesn't contain expected data
    """
    try:
        # Bounded timeout so a stalled server cannot hang the agent forever.
        response = requests.get(f"{api_base_url}/questions/random", timeout=30)
        response.raise_for_status()

        question = response.json()

        if not isinstance(question, dict):
            raise ValueError("Expected a question dictionary but received a different format")

        return question

    except requests.RequestException as e:
        logger.error(f"Error fetching random question: {e}")
        raise

    except json.JSONDecodeError:
        logger.error("Error decoding response as JSON")
        raise ValueError("Invalid JSON response from the API")
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def download_file_for_task(api_base_url: str, task_id: str, download_path: str) -> str:
    """
    Download a file associated with a specific task.

    Args:
        api_base_url: Base URL for the GAIA API
        task_id: ID of the task to download files for
        download_path: Directory path where the file should be saved

    Returns:
        Path to the downloaded file

    Raises:
        requests.RequestException: If the API request fails
        IOError: If there's an error writing the file
        ValueError: If the task_id is invalid or the response is unexpected
    """
    if not task_id:
        raise ValueError("Task ID cannot be empty")

    # Ensure download directory exists
    download_dir = Path(download_path)
    download_dir.mkdir(parents=True, exist_ok=True)

    try:
        response = requests.get(
            f"{api_base_url}/tasks/{task_id}/file",
            stream=True,   # Stream the response for large files
            timeout=60,    # Bounded timeout so a stalled download cannot hang forever
        )
        response.raise_for_status()

        # Get filename from Content-Disposition header or use task_id as fallback.
        # The header may carry extra parameters (e.g. 'attachment; filename="a.txt"; size=5'),
        # so cut at the first ';' and strip surrounding whitespace/quotes.
        content_disposition = response.headers.get('Content-Disposition', '')
        filename = None

        if 'filename=' in content_disposition:
            raw = content_disposition.split('filename=')[1]
            filename = raw.split(';')[0].strip().strip('"\'')

        if not filename:
            filename = f"{task_id}_file.txt"

        # Keep only the basename: a hostile or buggy server must not be able
        # to escape download_dir via '../' in the suggested filename.
        filename = Path(filename).name

        file_path = download_dir / filename

        # Write the file in chunks to keep memory bounded.
        with open(file_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        return str(file_path)

    except requests.RequestException as e:
        logger.error(f"Error downloading file for task {task_id}: {e}")
        raise

    except IOError as e:
        logger.error(f"Error writing file to {download_path}: {e}")
        raise
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def submit_answers(
    api_base_url: str,
    username: str,
    agent_code_link: str,
    answers: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Submit answers to the GAIA benchmark.

    Args:
        api_base_url: Base URL for the GAIA API
        username: Hugging Face username
        agent_code_link: Link to the agent code (e.g., GitHub repository)
        answers: Dictionary of answers to submit

    Returns:
        Response from the API containing submission results

    Raises:
        requests.RequestException: If the API request fails (including timeouts)
        ValueError: If the inputs are invalid, the response is not valid JSON,
            or the API reports an error
    """
    if not username:
        raise ValueError("Username cannot be empty")

    if not agent_code_link:
        raise ValueError("Agent code link cannot be empty")

    if not answers or not isinstance(answers, dict):
        raise ValueError("Answers must be a non-empty dictionary")

    payload = {
        "username": username,
        "agent_code_link": agent_code_link,
        "answers": answers
    }

    try:
        response = requests.post(
            f"{api_base_url}/submit",
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=60,  # Bounded timeout so a stalled submission cannot hang forever
        )
        response.raise_for_status()

        result = response.json()

        # Surface server-side errors that arrive with a 2XX status.
        if isinstance(result, dict) and result.get("error"):
            raise ValueError(f"API returned an error: {result['error']}")

        return result

    except requests.RequestException as e:
        logger.error(f"Error submitting answers: {e}")
        raise

    except json.JSONDecodeError:
        logger.error("Error decoding response as JSON")
        raise ValueError("Invalid JSON response from the API")
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def get_question_details(task_id: str, api_base_url: str = API_BASE_URL) -> Dict[str, Any]:
    """
    Get detailed information about a specific question/task.

    Unlike the other API helpers, this function never raises: failures are
    reported via an {"error": ...} dictionary so tool callers can inspect them.

    Args:
        task_id: The ID of the task to get details for
        api_base_url: Base URL for the GAIA API

    Returns:
        Dictionary containing question details, or {"error": ...} on failure
    """
    try:
        # Bounded timeout so a stalled server cannot hang the agent forever.
        response = requests.get(f"{api_base_url}/questions/{task_id}", timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.RequestException as e:
        logger.error(f"Failed to get question details: {str(e)}")
        return {"error": f"Failed to get question details: {str(e)}"}
    except json.JSONDecodeError:
        logger.error("Invalid JSON response from the API")
        return {"error": "Invalid JSON response from the API"}
|
gaiaX/config.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Configuration module for GAIA Benchmark Agent.
|
| 4 |
+
|
| 5 |
+
This module handles loading and managing configuration settings from JSON files
|
| 6 |
+
and environment variables.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import json
|
| 11 |
+
import logging
|
| 12 |
+
from typing import Dict, Any
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from dotenv import load_dotenv
|
| 15 |
+
|
| 16 |
+
# Load environment variables
|
| 17 |
+
load_dotenv()
|
| 18 |
+
|
| 19 |
+
def load_config(config_path: str = "config.json") -> Dict[str, Any]:
    """
    Load configuration from a JSON file.

    Falls back to a built-in default configuration when the file is missing
    or unreadable, or when it does not contain valid JSON.

    Args:
        config_path: Path to the configuration file

    Returns:
        Dictionary containing configuration settings
    """
    try:
        with open(config_path, 'r') as f:
            return json.load(f)
    except (OSError, json.JSONDecodeError) as e:
        # Narrowed catch: only file-access and JSON-parsing failures trigger
        # the fallback; programming errors still surface normally.
        # (logger is defined after this function runs, hence print.)
        print(f"Error loading configuration from {config_path}: {e}")
        print("Using default configuration.")
        return {
            "model_parameters": {
                "model_name": "gpt-4-turbo",
                "temperature": 0.2
            },
            "paths": {
                "progress_file": "gaia_progress.json"
            },
            "api": {
                "base_url": "https://api.example.com/gaia"
            }
        }
|
| 48 |
+
|
| 49 |
+
# Load configuration
|
| 50 |
+
CONFIG = load_config()
|
| 51 |
+
|
| 52 |
+
# Setup logging
|
| 53 |
+
def setup_logging():
    """Configure logging based on settings in CONFIG."""
    options = CONFIG.get("logging", {})
    level = getattr(logging, options.get("level", "INFO"))
    fmt = options.get("format", "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    target_file = options.get("file", "logs/gaia_agent.log")

    # Make sure the directory holding the log file exists before attaching a handler.
    if target_file:
        parent = os.path.dirname(target_file)
        if parent and not os.path.exists(parent):
            os.makedirs(parent, exist_ok=True)

    file_handler = logging.FileHandler(target_file) if target_file else logging.NullHandler()
    console_handler = (
        logging.StreamHandler() if options.get("console", True) else logging.NullHandler()
    )

    logging.basicConfig(level=level, format=fmt, handlers=[file_handler, console_handler])

    return logging.getLogger("gaia_agent")
|
| 77 |
+
|
| 78 |
+
# Initialize logger
|
| 79 |
+
logger = setup_logging()
|
| 80 |
+
|
| 81 |
+
# Environment variables
|
| 82 |
+
HF_USERNAME = os.getenv("HF_USERNAME")
|
| 83 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 84 |
+
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
|
| 85 |
+
|
| 86 |
+
# API configuration
|
| 87 |
+
API_BASE_URL = CONFIG.get("api", {}).get("base_url", "https://api.example.com/gaia")
|
| 88 |
+
|
| 89 |
+
# Validate required environment variables
|
| 90 |
+
def validate_env_vars():
    """Validate that required environment variables are set."""
    # Hard requirements: the agent cannot run without these two credentials.
    for name, value in (("HF_USERNAME", HF_USERNAME), ("OPENAI_API_KEY", OPENAI_API_KEY)):
        if not value:
            message = f"{name} environment variable is not set. Please check your .env file."
            logger.error(message)
            raise ValueError(message)

    # Tavily API key is optional but recommended for search functionality
    if not TAVILY_API_KEY:
        logger.warning(
            "TAVILY_API_KEY environment variable is not set. Search functionality will be limited."
        )
|
gaiaX/question_handlers.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Question handlers module for GAIA Benchmark Agent.
|
| 4 |
+
|
| 5 |
+
This module provides specialized handlers for different types of questions
|
| 6 |
+
in the GAIA benchmark, including question type detection and processing.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import re
|
| 10 |
+
import tempfile
|
| 11 |
+
from typing import Dict, Any, Optional
|
| 12 |
+
|
| 13 |
+
from gaiaX.config import logger, CONFIG, API_BASE_URL
|
| 14 |
+
from gaiaX.api import download_file_for_task
|
| 15 |
+
from gaiaX.agent import get_agent_response
|
| 16 |
+
|
| 17 |
+
def detect_question_type(question_text: str) -> str:
    """
    Detect the type of question based on its content.

    Args:
        question_text: The text of the question

    Returns:
        String indicating the question type
    """
    # Case-insensitive substring matching; the FIRST matching category wins,
    # so the order of entries below is significant.
    text = question_text.lower()

    categories = (
        ("mathematical", ("calculate", "compute", "equation", "formula", "derivative",
                          "integral", "probability", "statistics", "math")),
        ("technical", ("implement", "code", "algorithm", "function", "class",
                       "method", "programming", "pseudocode", "complexity")),
        ("context_based", ("context", "file", "document", "text", "analyze",
                           "based on", "according to", "refer to")),
        ("ethical", ("ethics", "ethical", "society", "impact", "bias",
                     "fairness", "responsible", "governance")),
        ("factual", ("define", "explain", "describe", "what is", "who is",
                     "when was", "history", "concept")),
    )

    for label, keywords in categories:
        if any(keyword in text for keyword in keywords):
            return label

    # Default to general if no specific type is detected
    return "general"
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def handle_factual_question(agent: Any, question: dict, context: str = None) -> str:
    """
    Handle factual knowledge questions.

    Args:
        agent: Initialized LangChain agent
        question: Dictionary containing question data
        context: Optional context text

    Returns:
        Agent's response as a string
    """
    logger.info("Handling factual knowledge question")

    # Enhance the question with specific instructions for factual questions
    enhanced_question = question.copy()

    question_text = question.get("question", "")
    enhanced_text = f"""
[FACTUAL KNOWLEDGE QUESTION]

{question_text}

Please provide a precise, accurate answer based on established facts and knowledge.
Include relevant examples and cite important research or developments when applicable.
"""

    # Fix: the context parameter was previously accepted but silently dropped
    # for this question type; append it the same way the context_based and
    # general handlers do so the agent actually sees downloaded file content.
    if context:
        enhanced_text += f"\n\nContext:\n{context}"

    enhanced_question["question"] = enhanced_text

    # Get response from the agent
    return get_agent_response(agent, enhanced_question)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def handle_technical_question(agent: Any, question: dict, context: str = None) -> str:
    """
    Handle technical implementation questions.

    Args:
        agent: Initialized LangChain agent
        question: Dictionary containing question data
        context: Optional context text

    Returns:
        Agent's response as a string
    """
    logger.info("Handling technical implementation question")

    # Enhance the question with specific instructions for technical questions
    enhanced_question = question.copy()

    question_text = question.get("question", "")
    enhanced_text = f"""
[TECHNICAL IMPLEMENTATION QUESTION]

{question_text}

Please provide a detailed technical explanation, including:
- Step-by-step explanation of algorithms or processes
- Pseudocode or code snippets when helpful
- Analysis of trade-offs between different approaches
- Complexity analysis (time and space) if relevant
"""

    # Fix: the context parameter was previously accepted but silently dropped
    # for this question type; append it the same way the context_based and
    # general handlers do so the agent actually sees downloaded file content.
    if context:
        enhanced_text += f"\n\nContext:\n{context}"

    enhanced_question["question"] = enhanced_text

    # Get response from the agent
    return get_agent_response(agent, enhanced_question)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def handle_mathematical_question(agent: Any, question: dict, context: str = None) -> str:
    """
    Handle mathematical questions.

    Args:
        agent: Initialized LangChain agent
        question: Dictionary containing question data
        context: Optional context text

    Returns:
        Agent's response as a string
    """
    logger.info("Handling mathematical question")

    # Enhance the question with specific instructions for mathematical questions
    enhanced_question = question.copy()

    question_text = question.get("question", "")
    enhanced_text = f"""
[MATHEMATICAL QUESTION]

{question_text}

Please provide a clear mathematical solution, including:
- Step-by-step working of the solution
- Clear explanation of the mathematical concepts involved
- Proper notation with defined variables
- Final answer in the simplest form

If the question asks for a specific numerical value, provide only that value as your final answer.
"""

    # Fix: the context parameter was previously accepted but silently dropped
    # for this question type; append it the same way the context_based and
    # general handlers do so the agent actually sees downloaded file content.
    if context:
        enhanced_text += f"\n\nContext:\n{context}"

    enhanced_question["question"] = enhanced_text

    # Get response from the agent
    return get_agent_response(agent, enhanced_question)
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def handle_context_based_question(agent: Any, question: dict, context: str = None) -> str:
    """
    Handle context-based analysis questions.

    Args:
        agent: Initialized LangChain agent
        question: Dictionary containing question data
        context: Optional context text

    Returns:
        Agent's response as a string
    """
    logger.info("Handling context-based question")

    # No context supplied by the caller: if the task ships a file, fetch it now.
    if not context and question.get("has_file", False):
        tid = question.get("task_id", "")
        if tid:
            try:
                with tempfile.TemporaryDirectory() as scratch:
                    local_path = download_file_for_task(API_BASE_URL, tid, scratch)
                    with open(local_path, 'r', encoding='utf-8') as handle:
                        context = handle.read()
            except Exception as err:
                logger.error(f"Error downloading context file: {str(err)}")

    # Wrap the raw question with analysis-specific instructions.
    wrapped = question.copy()
    base_text = question.get("question", "")
    prompt = f"""
[CONTEXT-BASED ANALYSIS QUESTION]

{base_text}

Please analyze the provided context carefully and provide an answer that:
- Directly references relevant parts of the context
- Connects the context to broader AI/ML concepts when relevant
- Provides a comprehensive analysis based on the context
"""

    if context:
        prompt += f"\n\nContext:\n{context}"

    wrapped["question"] = prompt

    return get_agent_response(agent, wrapped)
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def handle_general_question(agent: Any, question: dict, context: str = None) -> str:
    """
    Handle general questions that don't fit into specific categories.

    Args:
        agent: Initialized LangChain agent
        question: Dictionary containing question data
        context: Optional context text

    Returns:
        Agent's response as a string
    """
    logger.info("Handling general question")

    # Wrap the raw question with generic answer-quality instructions.
    wrapped = question.copy()
    base_text = question.get("question", "")
    prompt = f"""
[GENERAL QUESTION]

{base_text}

Please provide a comprehensive, accurate answer that:
- Directly addresses all aspects of the question
- Is well-structured and easy to understand
- Includes relevant examples or illustrations when helpful
- Cites sources or references when appropriate
"""

    if context:
        prompt += f"\n\nContext:\n{context}"

    wrapped["question"] = prompt

    return get_agent_response(agent, wrapped)
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
def process_question(agent: Any, question: dict, api_base_url: str = API_BASE_URL) -> dict:
    """
    Process a single question using the appropriate handler.

    Args:
        agent: Initialized LangChain agent
        question: Dictionary containing question data
        api_base_url: Base URL for the GAIA API

    Returns:
        Dictionary containing the question, answer, and metadata
    """
    try:
        question_text = question.get("question", "")
        task_id = question.get("task_id", "")

        logger.info(f"Processing question: {task_id} - {question_text[:50]}...")

        question_type = detect_question_type(question_text)
        logger.info(f"Detected question type: {question_type}")

        # Fetch the attached context file, if any.
        context = None
        if question.get("has_file", False) and task_id:
            try:
                with tempfile.TemporaryDirectory() as scratch:
                    local_path = download_file_for_task(api_base_url, task_id, scratch)
                    with open(local_path, 'r', encoding='utf-8') as handle:
                        context = handle.read()
            except Exception as err:
                logger.error(f"Error downloading context file: {str(err)}")

        # Route to the specialized handler; unknown types fall back to general.
        handlers = {
            "factual": handle_factual_question,
            "technical": handle_technical_question,
            "mathematical": handle_mathematical_question,
            "context_based": handle_context_based_question,
        }
        handler = handlers.get(question_type, handle_general_question)
        answer = handler(agent, question, context)

        return {
            "task_id": task_id,
            "question": question_text,
            "answer": answer,
            "question_type": question_type,
            "has_context": context is not None,
        }

    except Exception as err:
        logger.error(f"Error processing question: {str(err)}")
        return {
            "task_id": question.get("task_id", ""),
            "question": question.get("question", ""),
            "answer": f"Error: {str(err)}",
            "error": str(err),
        }
|
gaiaX/tools.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
LangChain tools module for GAIA Benchmark Agent.
|
| 4 |
+
|
| 5 |
+
This module defines the custom tools used by the LangChain agent
|
| 6 |
+
to interact with the GAIA benchmark API and process questions.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import json
import tempfile
from pathlib import Path
from typing import Any, Dict, Optional

from langchain.tools import BaseTool, tool

from gaiaX.config import logger, API_BASE_URL
from gaiaX.api import download_file_for_task, get_question_details
|
| 18 |
+
|
| 19 |
+
@tool
def fetch_question_details(task_id: str, api_base_url: str = API_BASE_URL) -> Dict[str, Any]:
    """Look up the full metadata for a single GAIA task.

    Thin LangChain-tool wrapper around ``get_question_details`` so the
    agent can request task information on demand.

    Args:
        task_id: Identifier of the task to look up.
        api_base_url: Base URL of the GAIA API endpoint.

    Returns:
        A dictionary with the question's details as returned by the API.
    """
    details = get_question_details(task_id, api_base_url)
    return details
|
| 32 |
+
|
| 33 |
+
@tool
def fetch_context_file(task_id: str, api_base_url: str = API_BASE_URL) -> str:
    """Retrieve the supporting file attached to a GAIA task.

    The file is downloaded into a throwaway directory and returned as
    UTF-8 text. Binary payloads cannot be rendered as text, so for those
    a short descriptive message (extension and size) is returned instead.

    Args:
        task_id: Identifier of the task whose file should be fetched.
        api_base_url: Base URL of the GAIA API endpoint.

    Returns:
        The file contents as text, or a human-readable error/info message.
    """
    try:
        with tempfile.TemporaryDirectory() as scratch_dir:
            downloaded = Path(download_file_for_task(api_base_url, task_id, scratch_dir))
            try:
                return downloaded.read_text(encoding='utf-8')
            except UnicodeDecodeError:
                # Non-text payload: report its type and size instead of raw bytes.
                file_size = downloaded.stat().st_size
                file_ext = downloaded.suffix
                return f"Binary file detected ({file_ext}, {file_size} bytes). This file cannot be displayed as text. Please use specialized tools to analyze this type of file."
    except Exception as e:
        logger.error(f"Error fetching context file: {str(e)}")
        return f"Error fetching context file: {str(e)}"
|
| 62 |
+
|
| 63 |
+
# Define a class for each tool to make them more configurable
|
| 64 |
+
class QuestionDetailsTool(BaseTool):
    """Tool for fetching question details from the GAIA API.

    Class-based counterpart of the ``fetch_question_details`` @tool, kept
    for callers that prefer configurable ``BaseTool`` instances.
    """

    # Annotated as ``str`` so pydantic-based BaseTool subclasses (used by
    # modern LangChain) treat these as field overrides; bare un-annotated
    # assignments are rejected under pydantic v2.
    name: str = "get_question_details"
    description: str = "Get detailed information about a specific question/task"

    def _run(self, task_id: str, api_base_url: str = API_BASE_URL) -> Dict[str, Any]:
        """Fetch and return the details dictionary for ``task_id``."""
        return get_question_details(task_id, api_base_url)

    def _arun(self, task_id: str, api_base_url: str = API_BASE_URL):
        """Async execution is not supported for this tool."""
        raise NotImplementedError("Async version not implemented")
|
| 77 |
+
|
| 78 |
+
class ContextFileTool(BaseTool):
    """Tool for fetching and reading context files for tasks."""

    # ``str`` annotations are required for pydantic-v2-based BaseTool
    # subclasses (modern LangChain); bare class attributes are rejected.
    name: str = "fetch_context_file"
    description: str = "Download and read the context file for a specific task"

    def _run(self, task_id: str, api_base_url: str = API_BASE_URL) -> str:
        """Download the task's context file and return it as text."""
        # fetch_context_file is wrapped by @tool, so the module-level name is
        # a StructuredTool rather than a plain callable; calling it with
        # positional arguments fails. Invoke the underlying function via
        # ``.func`` instead.
        return fetch_context_file.func(task_id, api_base_url)

    def _arun(self, task_id: str, api_base_url: str = API_BASE_URL):
        """Async execution is not supported for this tool."""
        raise NotImplementedError("Async version not implemented")
|
| 91 |
+
|
| 92 |
+
# Function to get all available tools
|
| 93 |
+
def get_tools(include_search: bool = True, tavily_api_key: Optional[str] = None):
    """
    Build the list of tools exposed to the agent.

    Args:
        include_search: Whether to attempt to add the Tavily search tool
        tavily_api_key: Tavily API key; search is skipped when missing
            (annotated Optional[str] per PEP 484 — implicit Optional is
            deprecated)

    Returns:
        List of tool objects ready to be passed to a LangChain agent
    """
    tools = [
        fetch_question_details,
        fetch_context_file
    ]

    # Search is strictly optional: it needs both an API key and the
    # langchain_community package, so degrade gracefully if either is absent.
    if include_search and tavily_api_key:
        try:
            from langchain_community.tools.tavily_search import TavilySearchResults

            # NOTE(review): TavilySearchResults typically reads TAVILY_API_KEY
            # from the environment; confirm the installed version accepts an
            # ``api_key`` keyword, otherwise the except-Exception branch below
            # will log a warning and silently disable search.
            search_tool = TavilySearchResults(
                max_results=3,
                api_key=tavily_api_key
            )
            tools.append(search_tool)
            logger.info("Search tool added to agent tools")
        except ImportError:
            logger.warning("Could not import TavilySearchResults. Search functionality will be disabled.")
        except Exception as e:
            logger.warning(f"Error initializing search tool: {e}")

    return tools
|
gaiaX/utils.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Utility functions for GAIA Benchmark Agent.
|
| 4 |
+
|
| 5 |
+
This module provides utility functions for progress tracking,
|
| 6 |
+
performance analysis, and other helper functions.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import json
|
| 11 |
+
import datetime
|
| 12 |
+
from typing import Dict, List, Any, Optional
|
| 13 |
+
|
| 14 |
+
from gaiaX.config import logger, CONFIG
|
| 15 |
+
|
| 16 |
+
def load_progress(progress_file: str = None) -> dict:
    """Read previously saved progress from disk.

    Falls back to an empty progress structure when the file is missing or
    unreadable, so callers can always iterate over the result safely.

    Args:
        progress_file: Path to the progress JSON file; when omitted, the
            path configured under ``paths.progress_file`` is used.

    Returns:
        Dict with ``processed_questions`` (list of task ids) and
        ``answers`` (task id -> answer mapping).
    """
    progress_file = progress_file or CONFIG.get("paths", {}).get(
        "progress_file", "gaia_progress.json"
    )

    empty = {"processed_questions": [], "answers": {}}
    if not os.path.exists(progress_file):
        return empty
    try:
        with open(progress_file, 'r') as handle:
            return json.load(handle)
    except Exception as e:
        logger.error(f"Error loading progress from {progress_file}: {e}")
        return empty
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def save_progress(progress_data: dict, progress_file: str = None) -> bool:
    """Persist progress data to disk as pretty-printed JSON.

    Args:
        progress_data: Progress structure to write.
        progress_file: Destination path; defaults to the configured
            ``paths.progress_file`` value.

    Returns:
        True on success, False when the write failed (error is logged).
    """
    progress_file = progress_file or CONFIG.get("paths", {}).get(
        "progress_file", "gaia_progress.json"
    )

    try:
        with open(progress_file, 'w') as handle:
            json.dump(progress_data, handle, indent=2)
    except Exception as e:
        logger.error(f"Error saving progress to {progress_file}: {e}")
        return False
    return True
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def analyze_performance(answers: list, expected_answers: list = None) -> dict:
    """Compute summary metrics for a set of agent answers.

    Args:
        answers: Answer dictionaries produced by the agent; an entry
            containing an ``"error"`` key counts as a failure.
        expected_answers: Optional gold answers (dicts with ``task_id``
            and ``answer``) used to compute exact-match accuracy.

    Returns:
        Metrics dict with counts, success rate, average response time,
        question-type distribution and (when gold answers overlap the
        produced ones) accuracy over the shared task ids.
    """
    total = len(answers)
    errors = sum(1 for entry in answers if "error" in entry)
    ok = total - errors

    # Mean latency over entries that recorded a response_time.
    times = [entry["response_time"] for entry in answers if "response_time" in entry]
    mean_time = (sum(times) / len(times)) if times else 0

    # Histogram of question types; missing type falls into "unknown".
    type_counts = {}
    for entry in answers:
        kind = entry.get("question_type", "unknown")
        type_counts[kind] = type_counts.get(kind, 0) + 1

    metrics = {
        "total_questions": total,
        "successful_answers": ok,
        "error_count": errors,
        "success_rate": ok / total if total > 0 else 0,
        "average_response_time": mean_time,
        "question_types": type_counts
    }

    # Exact-match accuracy, computed only over task ids present in both sets.
    if expected_answers:
        produced = {entry.get("task_id"): entry.get("answer") for entry in answers}
        expected = {entry.get("task_id"): entry.get("answer") for entry in expected_answers}
        shared = produced.keys() & expected.keys()
        if shared:
            hits = sum(1 for tid in shared if produced[tid] == expected[tid])
            metrics["accuracy"] = hits / len(shared)
            metrics["correct_answers"] = hits

    return metrics
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def format_performance_report(metrics: dict) -> str:
    """Render performance metrics as a human-readable multi-line report.

    Args:
        metrics: Metrics dict as produced by ``analyze_performance``.

    Returns:
        The formatted report text (lines joined with newlines).
    """
    total = metrics["total_questions"]
    lines = [
        "=== GAIA Benchmark Agent Performance Report ===",
        f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        f"Total Questions Processed: {total}",
        f"Successful Answers: {metrics['successful_answers']} ({metrics['success_rate']:.2%})",
        f"Errors: {metrics['error_count']}",
        f"Average Response Time: {metrics['average_response_time']:.2f} seconds",
        "",
        "Question Type Distribution:"
    ]

    # One bullet per question type with its share of all questions.
    for kind, count in metrics.get("question_types", {}).items():
        share = count / total if total > 0 else 0
        lines.append(f"  - {kind}: {count} ({share:.2%})")

    # Accuracy section only exists when gold answers were supplied.
    if "accuracy" in metrics:
        lines.append("")
        lines.append(f"Accuracy: {metrics['accuracy']:.2%}")
        lines.append(f"Correct Answers: {metrics['correct_answers']} out of {total}")

    return "\n".join(lines)
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def process_questions_batch(agent: Any, questions: list, api_base_url: str,
                            progress_file: str = None, batch_size: int = 10) -> dict:
    """
    Process a batch of questions and track progress.

    Already-processed task ids (recorded in the progress file) are skipped,
    and progress is flushed to disk every ``batch_size`` questions as well
    as once at the end, so an interrupted run can resume where it left off.

    Args:
        agent: Initialized LangChain agent
        questions: List of question dictionaries
        api_base_url: Base URL for the GAIA API
        progress_file: Path to the progress file
        batch_size: Number of questions to process in each batch

    Returns:
        Dictionary with ``results`` (per-question result dicts) and
        ``progress`` (the updated progress structure)
    """
    # Imported here to avoid a circular import at module load time.
    from gaiaX.question_handlers import process_question

    if not progress_file:
        progress_file = CONFIG.get("paths", {}).get("progress_file", "gaia_progress.json")

    # Reuse the module-level loader instead of duplicating its logic inline;
    # it already falls back to an empty structure on any read error.
    progress = load_progress(progress_file)

    # Skip questions that were completed in a previous run.
    processed_ids = set(progress.get("processed_questions", []))
    remaining_questions = [q for q in questions if q.get("task_id") not in processed_ids]
    logger.info(f"Found {len(remaining_questions)} questions to process out of {len(questions)} total")

    results = []
    for i, question in enumerate(remaining_questions):
        # Periodically checkpoint so a crash loses at most one batch of work.
        if i > 0 and i % batch_size == 0:
            logger.info(f"Processed {i}/{len(remaining_questions)} questions. Saving progress...")
            save_progress(progress, progress_file)

        try:
            task_id = question.get("task_id")
            logger.info(f"Processing question {i+1}/{len(remaining_questions)}: {task_id}")

            # Process the question, timing the call for later analysis.
            start_time = datetime.datetime.now()
            result = process_question(agent, question, api_base_url)
            end_time = datetime.datetime.now()

            response_time = (end_time - start_time).total_seconds()
            result["response_time"] = response_time

            # Record the outcome and mark the task as done.
            results.append(result)
            progress["processed_questions"].append(task_id)
            progress["answers"][task_id] = result.get("answer")

            logger.info(f"Completed question {task_id} in {response_time:.2f} seconds")

        except Exception as e:
            logger.error(f"Error processing question: {str(e)}")
            results.append({
                "task_id": question.get("task_id", ""),
                "question": question.get("question", ""),
                "answer": f"Error: {str(e)}",
                "error": str(e)
            })

    # Save final progress
    save_progress(progress, progress_file)

    return {
        "results": results,
        "progress": progress
    }
|
requirements.txt
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GAIA Benchmark Agent Dependencies
|
| 2 |
+
|
| 3 |
+
# Core dependencies
|
| 4 |
+
langchain>=0.1.0
|
| 5 |
+
langchain-openai>=0.0.2
|
| 6 |
+
langchain-community>=0.0.1
|
| 7 |
+
openai>=1.3.0
|
| 8 |
+
python-dotenv>=1.0.0
|
| 9 |
+
requests>=2.31.0
|
| 10 |
+
|
| 11 |
+
# Interface dependencies
|
| 12 |
+
gradio>=3.50.0
|
| 13 |
+
pandas>=2.0.0
|
| 14 |
+
|
| 15 |
+
# Utility dependencies
|
| 16 |
+
tqdm>=4.66.1
|
| 17 |
+
pydantic>=2.4.0
|
| 18 |
+
tenacity>=8.2.3
|