derkaal commited on
Commit
c84963f
·
1 Parent(s): f6cf872

Add GAIA agent files for certification

Browse files
Files changed (13) hide show
  1. .gitignore +30 -0
  2. README.md +41 -8
  3. app.py +264 -0
  4. config.json +25 -0
  5. gaiaX/README.md +119 -0
  6. gaiaX/__init__.py +9 -0
  7. gaiaX/agent.py +275 -0
  8. gaiaX/api.py +225 -0
  9. gaiaX/config.py +102 -0
  10. gaiaX/question_handlers.py +320 -0
  11. gaiaX/tools.py +125 -0
  12. gaiaX/utils.py +239 -0
  13. requirements.txt +18 -0
.gitignore ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment variables
2
+ .env
3
+
4
+ # Python cache files
5
+ __pycache__/
6
+ *.py[cod]
7
+ *$py.class
8
+
9
+ # Distribution / packaging
10
+ dist/
11
+ build/
12
+ *.egg-info/
13
+
14
+ # Virtual environments
15
+ venv/
16
+ env/
17
+ ENV/
18
+
19
+ # Logs
20
+ logs/
21
+ *.log
22
+
23
+ # Progress files
24
+ gaia_progress.json
25
+
26
+ # Temporary files
27
+ .DS_Store
28
+ .vscode/
29
+ *.swp
30
+ *.swo
README.md CHANGED
@@ -1,14 +1,47 @@
1
  ---
2
- title: FinalSubmission
3
- emoji: 🌖
4
- colorFrom: indigo
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 5.29.0
8
  app_file: app.py
9
  pinned: false
10
- license: mit
11
- short_description: Final Submission For The CertificationAgent
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: GAIA Benchmark Agent
3
+ emoji: 🧠
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 5.25.2
8
  app_file: app.py
9
  pinned: false
10
+ hf_oauth: true
11
+ hf_oauth_expiration_minutes: 480
12
  ---
13
 
14
+ # GAIA Benchmark Agent
15
+
16
+ This Hugging Face Space hosts a GAIA (General AI Assistant) benchmark agent designed to solve certification challenges across various domains of AI and machine learning.
17
+
18
+ ## Features
19
+
20
+ - Processes questions from the GAIA benchmark
21
+ - Uses LangChain and OpenAI's language models
22
+ - Analyzes questions and identifies their types
23
+ - Retrieves relevant context when needed
24
+ - Generates accurate, well-reasoned answers
25
+
26
+ ## Usage
27
+
28
+ 1. Log in to your Hugging Face account using the button
29
+ 2. Click 'Run Evaluation & Submit All Answers' to:
30
+ - Fetch questions from the GAIA benchmark
31
+ - Run the agent on all questions
32
+ - Submit answers and see your score
33
+
34
+ ## Implementation Details
35
+
36
+ The agent uses a modular architecture with specialized handlers for different question types:
37
+ - Factual knowledge questions
38
+ - Technical implementation questions
39
+ - Mathematical questions
40
+ - Context-based analysis questions
41
+ - Ethical/societal impact questions
42
+
43
+ ## Repository
44
+
45
+ The code for this agent is available at: https://huggingface.co/derkaal/GAIA-agent
46
+
47
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ GAIA Benchmark Agent Interface
4
+
5
+ This script integrates the modular GAIA agent with the provided interface template.
6
+ It replaces the BasicAgent class with our GAIA agent implementation.
7
+ """
8
+
9
+ import os
10
+ import gradio as gr
11
+ import requests
12
+ import inspect
13
+ import pandas as pd
14
+ from typing import Dict, List, Any, Optional
15
+
16
+ # Import the GAIA agent modules
17
+ from gaiaX.config import (
18
+ logger, CONFIG, HF_USERNAME, OPENAI_API_KEY,
19
+ TAVILY_API_KEY, API_BASE_URL, validate_env_vars
20
+ )
21
+ from gaiaX.agent import initialize_agent, get_agent_response
22
+ from gaiaX.question_handlers import process_question, detect_question_type
23
+
24
+ # --- Constants ---
25
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
26
+
27
+ # --- GAIA Agent Implementation ---
28
class GAIAAgent:
    """GAIA benchmark agent adapter.

    Wraps the modular gaiaX agent behind the simple callable interface
    expected by the Gradio evaluation runner: construct once, then call
    the instance with a question string to obtain an answer string.
    """

    def __init__(self):
        """Validate the environment and build the underlying LangChain agent.

        Raises:
            ValueError: If a required environment variable is missing.
        """
        logger.info("Initializing GAIA agent...")

        # Fail fast when required credentials/settings are absent.
        try:
            validate_env_vars()
        except ValueError as e:
            logger.error(f"Environment validation failed: {e}")
            raise

        # The OpenAI-functions executor is the default for this interface.
        self.agent = initialize_agent(OPENAI_API_KEY, "openai_functions")
        logger.info("GAIA agent initialized successfully.")

    def __call__(self, question: str) -> str:
        """Answer a single benchmark question.

        Args:
            question: The raw question text.

        Returns:
            The agent's answer, or an "Error: ..." string on failure.
        """
        logger.info(f"Agent received question (first 50 chars): {question[:50]}...")

        # process_question expects the benchmark's question-record shape.
        payload = {
            "task_id": "custom_question",
            "question": question,
            "has_file": False
        }

        try:
            # The detected type is informational only; actual routing
            # happens inside process_question.
            detected = detect_question_type(question)
            logger.info(f"Detected question type: {detected}")

            outcome = process_question(self.agent, payload, API_BASE_URL)

            answer = outcome.get("answer", "")
            if not answer:
                logger.warning("Agent returned an empty answer.")
                answer = "I couldn't generate an answer for this question."

            logger.info(f"Agent returning answer (first 50 chars): {answer[:50]}...")
            return answer
        except Exception as e:
            logger.error(f"Error processing question: {e}")
            return f"Error: {str(e)}"
88
+
89
+ # --- Run and Submit All Function ---
90
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """
    Fetches all questions, runs the GAIA Agent on them, submits all answers,
    and displays the results.

    Args:
        profile: OAuth profile injected automatically by Gradio's login
            button, or None when the user is not logged in.

    Returns:
        A (status message, results DataFrame or None) tuple for the UI.
    """
    # --- Determine HF Space Runtime URL and Repo URL ---
    space_id = os.getenv("SPACE_ID")  # Used to link the submission to this codebase

    if profile:
        username = f"{profile.username}"
        print(f"User logged in: {username}")
    else:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # 1. Instantiate Agent
    try:
        agent = GAIAAgent()
    except Exception as e:
        print(f"Error instantiating agent: {e}")
        return f"Error initializing agent: {e}", None

    # When running as a Hugging Face Space, this link points at the codebase.
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    print(agent_code)

    # 2. Fetch Questions
    questions_data, fetch_error = _fetch_questions(questions_url)
    if fetch_error is not None:
        return fetch_error, None

    # 3. Run the agent on every question
    answers_payload, results_log = _answer_questions(agent, questions_data)

    if not answers_payload:
        print("Agent did not produce any answers to submit.")
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    # 4. Prepare Submission
    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
    print(status_update)

    # 5. Submit
    return _submit_answers(submit_url, submission_data, results_log)


def _fetch_questions(questions_url: str):
    """Fetch the benchmark question list.

    Returns:
        (questions, None) on success, or (None, user-facing error string)
        on any failure (network error, bad JSON, empty list).
    """
    print(f"Fetching questions from: {questions_url}")
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            print("Fetched questions list is empty.")
            return None, "Fetched questions list is empty or invalid format."
        print(f"Fetched {len(questions_data)} questions.")
        return questions_data, None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching questions: {e}")
        return None, f"Error fetching questions: {e}"
    except requests.exceptions.JSONDecodeError as e:
        # response exists here: RequestException (raised by .get) is caught above.
        print(f"Error decoding JSON response from questions endpoint: {e}")
        print(f"Response text: {response.text[:500]}")
        return None, f"Error decoding server response for questions: {e}"
    except Exception as e:
        print(f"An unexpected error occurred fetching questions: {e}")
        return None, f"An unexpected error occurred fetching questions: {e}"


def _answer_questions(agent, questions_data):
    """Run *agent* over each question record.

    Malformed records are skipped; per-question agent failures are recorded
    in the display log only (not submitted).

    Returns:
        (answers_payload, results_log): submission entries and display rows.
    """
    results_log = []
    answers_payload = []
    print(f"Running agent on {len(questions_data)} questions...")
    for item in questions_data:
        task_id = item.get("task_id")
        question_text = item.get("question")
        if not task_id or question_text is None:
            print(f"Skipping item with missing task_id or question: {item}")
            continue
        try:
            submitted_answer = agent(question_text)
            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
        except Exception as e:
            print(f"Error running agent on task {task_id}: {e}")
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
    return answers_payload, results_log


def _submit_answers(submit_url: str, submission_data: dict, results_log: list):
    """POST the answers and translate the outcome into a status string.

    Always returns (status message, DataFrame built from results_log).
    """
    print(f"Submitting {len(submission_data['answers'])} answers to: {submit_url}")
    results_df = pd.DataFrame(results_log)
    try:
        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        result_data = response.json()
        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        print("Submission successful.")
        return final_status, results_df
    except requests.exceptions.HTTPError as e:
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_json = e.response.json()
            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
        except requests.exceptions.JSONDecodeError:
            error_detail += f" Response: {e.response.text[:500]}"
        status_message = f"Submission Failed: {error_detail}"
        print(status_message)
        return status_message, results_df
    except requests.exceptions.Timeout:
        status_message = "Submission Failed: The request timed out."
        print(status_message)
        return status_message, results_df
    except requests.exceptions.RequestException as e:
        status_message = f"Submission Failed: Network error - {e}"
        print(status_message)
        return status_message, results_df
    except Exception as e:
        status_message = f"An unexpected error occurred during submission: {e}"
        print(status_message)
        return status_message, results_df
210
+
211
+
212
# --- Build Gradio Interface using Blocks ---
# Top-level script wiring: builds the UI once at import time; the server is
# only started from the __main__ guard below.
with gr.Blocks() as demo:
    gr.Markdown("# GAIA Benchmark Agent Evaluation Runner")
    gr.Markdown(
        """
        **Instructions:**

        1. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
        2. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.

        ---
        **Note:**
        This interface uses the modular GAIA Benchmark Agent to process questions from the GAIA benchmark.
        The agent uses LangChain and OpenAI's language models to analyze questions, retrieve relevant context,
        and generate accurate answers across various domains of AI and machine learning.
        """
    )

    gr.LoginButton()

    run_button = gr.Button("Run Evaluation & Submit All Answers")

    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

    # NOTE(review): no `inputs=` is passed here — Gradio injects the
    # gr.OAuthProfile argument automatically based on run_and_submit_all's
    # type annotation; verify this matches the pinned gradio version.
    run_button.click(
        fn=run_and_submit_all,
        outputs=[status_output, results_table]
    )

if __name__ == "__main__":
    print("\n" + "-"*30 + " App Starting " + "-"*30)
    # Check for SPACE_HOST and SPACE_ID at startup for information
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID")  # Get SPACE_ID at startup

    if space_host_startup:
        print(f"✅ SPACE_HOST found: {space_host_startup}")
        print(f"   Runtime URL should be: https://{space_host_startup}.hf.space")
    else:
        print("ℹ️  SPACE_HOST environment variable not found (running locally?).")

    if space_id_startup:  # Print repo URLs if SPACE_ID is found
        print(f"✅ SPACE_ID found: {space_id_startup}")
        print(f"   Repo URL: https://huggingface.co/spaces/{space_id_startup}")
        print(f"   Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
    else:
        print("ℹ️  SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")

    print("-"*(60 + len(" App Starting ")) + "\n")

    print("Launching Gradio Interface for GAIA Benchmark Agent Evaluation...")
    demo.launch(debug=True, share=False)
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_parameters": {
3
+ "model_name": "gpt-4-turbo",
4
+ "temperature": 0.2,
5
+ "max_tokens": 1024,
6
+ "top_p": 1.0,
7
+ "frequency_penalty": 0.0,
8
+ "presence_penalty": 0.0
9
+ },
10
+ "paths": {
11
+ "progress_file": "gaia_progress.json"
12
+ },
13
+ "api": {
14
+ "base_url": "https://agents-course-unit4-scoring.hf.space"
15
+ },
16
+ "logging": {
17
+ "level": "INFO",
18
+ "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
19
+ "file": "logs/gaia_agent.log",
20
+ "console": true
21
+ },
22
+ "debugging": {
23
+ "enable_langchain_debug": false
24
+ }
25
+ }
gaiaX/README.md ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GAIA Benchmark Agent
2
+
3
+ A LangChain-based agent for solving Hugging Face certification challenges in the GAIA benchmark.
4
+
5
+ ## Overview
6
+
7
+ The GAIA Benchmark Agent is designed to process and answer questions from the Hugging Face GAIA benchmark. It uses LangChain and OpenAI's language models to analyze questions, retrieve relevant context, and generate accurate answers across various domains of AI and machine learning.
8
+
9
+ ## Features
10
+
11
+ - Question type detection and specialized handling
12
+ - Context-aware processing for questions with associated files
13
+ - Batch processing with progress tracking
14
+ - Performance analysis and reporting
15
+ - Support for different agent types (OpenAI Functions, ReAct)
16
+
17
+ ## Project Structure
18
+
19
+ The project has been modularized for better maintainability and to address token limit issues:
20
+
21
+ ```
22
+ gaiaX/
23
+ ├── __init__.py # Package initialization
24
+ ├── config.py # Configuration handling
25
+ ├── api.py # API interaction functions
26
+ ├── tools.py # LangChain tools
27
+ ├── agent.py # Agent initialization and response handling
28
+ ├── question_handlers.py # Question type detection and handling
29
+ ├── utils.py # Utility functions
30
+ └── README.md # This file
31
+ ```
32
+
33
+ ## Setup
34
+
35
+ 1. Clone the repository
36
+ 2. Install dependencies:
37
+ ```
38
+ pip install -r requirements.txt
39
+ ```
40
+ 3. Create a `.env` file with the following variables:
41
+ ```
42
+ HF_USERNAME=your_huggingface_username
43
+ OPENAI_API_KEY=your_openai_api_key
44
+ TAVILY_API_KEY=your_tavily_api_key # Optional, for search functionality
45
+ ```
46
+ 4. Create a `config.json` file with your configuration:
47
+ ```json
48
+ {
49
+ "model_parameters": {
50
+ "model_name": "gpt-4-turbo",
51
+ "temperature": 0.2
52
+ },
53
+ "paths": {
54
+ "progress_file": "gaia_progress.json"
55
+ },
56
+ "api": {
57
+ "base_url": "https://api.example.com/gaia"
58
+ },
59
+ "logging": {
60
+ "level": "INFO",
61
+ "file": "logs/gaia_agent.log",
62
+ "console": true
63
+ }
64
+ }
65
+ ```
66
+
67
+ ## Usage
68
+
69
+ The GAIA Benchmark Agent can be used in several modes:
70
+
71
+ ### Test Mode
72
+
73
+ Test the agent with a sample question or a custom question:
74
+
75
+ ```bash
76
+ python gaia_agent_new.py test --agent-type openai_functions --question "What is deep learning?"
77
+ ```
78
+
79
+ With a context file:
80
+
81
+ ```bash
82
+ python gaia_agent_new.py test --agent-type openai_functions --question "Explain the concepts in this paper." --file path/to/paper.txt
83
+ ```
84
+
85
+ ### Random Question Mode
86
+
87
+ Process a random question from the GAIA benchmark:
88
+
89
+ ```bash
90
+ python gaia_agent_new.py random --agent-type openai_functions
91
+ ```
92
+
93
+ ### Batch Processing Mode
94
+
95
+ Process a batch of questions from the GAIA benchmark:
96
+
97
+ ```bash
98
+ python gaia_agent_new.py batch --agent-type openai_functions --batch-size 10 --progress-file progress.json --limit 50
99
+ ```
100
+
101
+ ### Submit Answers
102
+
103
+ Submit processed answers to the GAIA benchmark:
104
+
105
+ ```bash
106
+ python gaia_agent_new.py submit --progress-file progress.json --agent-code-link https://github.com/yourusername/gaia-agent
107
+ ```
108
+
109
+ ## Testing
110
+
111
+ Run the test suite:
112
+
113
+ ```bash
114
+ python test_gaia_agent_new.py
115
+ ```
116
+
117
+ ## License
118
+
119
+ [MIT License](LICENSE)
gaiaX/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GAIA Benchmark Agent - Hugging Face Certification Challenge Solver
3
+
4
+ This package provides a LangChain agent to solve Hugging Face certification
5
+ challenges for the GAIA benchmark. It includes batch processing capabilities,
6
+ progress tracking, and performance analysis.
7
+ """
8
+
9
+ __version__ = "1.0.0"
gaiaX/agent.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Agent module for GAIA Benchmark Agent.
4
+
5
+ This module handles the initialization of LangChain agents and
6
+ processing of responses for different question types.
7
+ """
8
+
9
+ import tempfile
10
+ import json
11
+ from typing import Dict, List, Any, Optional, Union, Tuple
12
+
13
+ from langchain.agents import AgentExecutor, create_openai_functions_agent, create_react_agent
14
+ from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate
15
+ from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
16
+ from langchain_openai import ChatOpenAI
17
+ from langchain.memory import ConversationBufferMemory
18
+ from langchain.globals import set_debug
19
+
20
+ from gaiaX.config import logger, CONFIG, OPENAI_API_KEY, TAVILY_API_KEY
21
+ from gaiaX.tools import get_tools
22
+ from gaiaX.api import download_file_for_task
23
+
24
def initialize_agent(api_key: str = OPENAI_API_KEY, agent_type: str = "openai_functions") -> Any:
    """
    Initialize a LangChain agent with appropriate tools and configuration.

    Reads model parameters and the debug flag from the module-level CONFIG,
    builds a ChatOpenAI LLM, attaches the shared tool set, and wires up
    either a ReAct agent or (by default) an OpenAI-functions agent.

    Args:
        api_key: OpenAI API key or other LLM provider key
        agent_type: Type of agent to initialize ("openai_functions" or "react");
            any value other than "react" falls through to the default branch.

    Returns:
        Initialized LangChain AgentExecutor
    """
    # Enable LangChain debugging if configured
    debug_enabled = CONFIG.get("debugging", {}).get("enable_langchain_debug", False)
    if debug_enabled:
        set_debug(True)
        logger.info("LangChain debugging enabled")

    # Get model parameters from config; defaults mirror config.json.
    model_params = CONFIG.get("model_parameters", {})
    model_name = model_params.get("model_name", "gpt-4-turbo")
    temperature = model_params.get("temperature", 0.2)
    max_tokens = model_params.get("max_tokens", None)
    top_p = model_params.get("top_p", 1.0)
    frequency_penalty = model_params.get("frequency_penalty", 0.0)
    presence_penalty = model_params.get("presence_penalty", 0.0)

    logger.info(f"Initializing agent with model: {model_name}, temperature: {temperature}, type: {agent_type}")

    # Initialize the language model
    llm = ChatOpenAI(
        model=model_name,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
        api_key=api_key
    )

    # Get tools for the agent (search tool is included only when a Tavily
    # key is available — see gaiaX.tools.get_tools).
    tools = get_tools(include_search=True, tavily_api_key=TAVILY_API_KEY)

    if agent_type == "react":
        # Create a ReAct agent with a specialized prompt for GAIA benchmark.
        # The Final Answer formatting rules come from the GAIA scoring spec.
        react_template = """
You are a general AI assistant. I will ask you a question.
You have access to the following tools:
{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer.
Final Answer: [The final answer to the original input question. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. Output *only* the final answer value here, without any other surrounding text or prefixes.]

Begin!

Question: {input}
Thought: {agent_scratchpad}
"""

        # Create the prompt template
        react_prompt = PromptTemplate.from_template(react_template)

        # Create the ReAct agent
        agent = create_react_agent(llm, tools, react_prompt)

        # Create the agent executor. No memory on this branch: each
        # invocation is independent.
        agent_executor = AgentExecutor(
            agent=agent,
            tools=tools,
            verbose=True,
            handle_parsing_errors=True
        )

        logger.info("ReAct agent initialized successfully")

    else:  # Default to OpenAI Functions agent
        # Create a detailed system prompt with instructions for different question types
        system_prompt = """
You are an expert AI assistant specialized in solving Hugging Face certification challenges for the GAIA benchmark.
Your goal is to provide accurate, well-reasoned answers to questions across various domains of AI and machine learning.

When given a question:
1. Carefully analyze what is being asked and identify the question type
2. Determine if you need additional context from any provided files
3. If context files are available, request them using the fetch_context_file tool
4. Formulate a comprehensive, accurate answer based on your knowledge and the provided context
5. Ensure your answer is clear, concise, and directly addresses the question

QUESTION TYPES AND STRATEGIES:

1. FACTUAL KNOWLEDGE QUESTIONS:
   - These test your knowledge of AI/ML concepts, techniques, or history
   - Provide precise definitions and explanations
   - Include relevant examples to illustrate concepts
   - Cite important research papers or developments when applicable

2. TECHNICAL IMPLEMENTATION QUESTIONS:
   - These ask about code, algorithms, or implementation details
   - Provide step-by-step explanations of algorithms or processes
   - Include pseudocode or code snippets when helpful
   - Explain trade-offs between different approaches

3. MATHEMATICAL QUESTIONS:
   - These involve equations, proofs, or statistical concepts
   - Show your work step-by-step
   - Explain the intuition behind mathematical concepts
   - Use clear notation and define all variables

4. CONTEXT-BASED ANALYSIS QUESTIONS:
   - These require analyzing provided context files
   - Thoroughly read and understand the context before answering
   - Reference specific parts of the context in your answer
   - Connect the context to broader AI/ML concepts when relevant

5. ETHICAL/SOCIETAL IMPACT QUESTIONS:
   - These address ethical considerations or societal impacts of AI
   - Present balanced perspectives on controversial topics
   - Consider multiple stakeholders and viewpoints
   - Discuss both benefits and potential risks

6. PROBLEM-SOLVING QUESTIONS:
   - These present novel problems requiring creative solutions
   - Break down the problem into manageable components
   - Consider multiple approaches before selecting the best one
   - Explain why your solution is optimal given constraints

7. CODING QUESTIONS:
   - These require implementing or debugging code
   - Provide clean, efficient, and well-commented code
   - Explain your implementation choices
   - Consider edge cases and potential optimizations

IMPORTANT FORMATTING GUIDELINES:

1. For numerical answers:
   - Provide only the number without units unless specifically requested
   - Use standard notation (avoid scientific notation unless appropriate)
   - Round to the specified number of decimal places if indicated

2. For multiple-choice questions:
   - Clearly indicate your selected option (A, B, C, D, etc.)
   - Briefly explain your reasoning for the selection

3. For short answer questions:
   - Be concise and direct
   - Focus on the key points without unnecessary elaboration

4. For coding questions:
   - Provide complete, runnable code unless a snippet is requested
   - Include comments explaining complex logic
   - Follow standard coding conventions for the language

Remember, your goal is to provide accurate, helpful answers that demonstrate deep understanding of AI and machine learning concepts.
"""

        # Create the prompt template
        prompt = ChatPromptTemplate.from_messages([
            ("system", system_prompt),
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{input}"),
            MessagesPlaceholder(variable_name="agent_scratchpad"),
        ])

        # Create memory for conversation history.
        # NOTE(review): this memory lives on the returned executor, so chat
        # history presumably accumulates across successive invoke() calls on
        # the same executor — confirm this is intended for batch runs.
        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

        # Create the OpenAI Functions agent
        agent = create_openai_functions_agent(llm, tools, prompt)

        # Create the agent executor
        agent_executor = AgentExecutor(
            agent=agent,
            tools=tools,
            verbose=True,
            memory=memory,
            handle_parsing_errors=True
        )

        logger.info("OpenAI Functions agent initialized successfully")

    return agent_executor
212
+
213
+
214
def get_agent_response(agent_executor: AgentExecutor, question_data: dict) -> str:
    """
    Get a response from the agent for a specific question.

    Args:
        agent_executor: Initialized LangChain agent executor
        question_data: Dictionary containing question data; reads the keys
            "question", "task_id" and "has_file".

    Returns:
        Agent's response as a string, or an "Error: ..." message if the
        agent invocation or context-file handling failed.
    """
    # Bug fix: `Path` is used in the binary-file branch below, but this
    # module never imports pathlib at the top level, so that branch raised
    # NameError. Import locally to keep the fix self-contained.
    from pathlib import Path

    try:
        # Extract question details
        question_text = question_data.get("question", "")
        task_id = question_data.get("task_id", "")
        has_file = question_data.get("has_file", False)

        # Prepare the input for the agent
        agent_input = {
            "input": question_text
        }

        # If the question has an associated file, try to download it and
        # fold its content into the prompt.
        context_content = None
        if has_file and task_id:
            logger.info(f"Question has an associated file. Attempting to download for task {task_id}")
            try:
                # Temporary directory is removed (with the downloaded file)
                # as soon as the content has been read.
                with tempfile.TemporaryDirectory() as temp_dir:
                    # Download the file
                    file_path = download_file_for_task(CONFIG.get("api", {}).get("base_url"), task_id, temp_dir)

                    # Try to read the file as text
                    try:
                        with open(file_path, 'r', encoding='utf-8') as f:
                            context_content = f.read()

                        # Add context to the agent input
                        agent_input["context"] = context_content
                        agent_input["input"] = f"Question: {question_text}\n\nContext: {context_content}"
                    except UnicodeDecodeError:
                        # Not a text file: describe the binary file instead
                        # of inlining its content.
                        file_size = Path(file_path).stat().st_size
                        file_ext = Path(file_path).suffix
                        binary_info = f"Binary file detected ({file_ext}, {file_size} bytes). This file cannot be displayed as text."
                        agent_input["input"] = f"Question: {question_text}\n\nContext: {binary_info}"
            except Exception as e:
                # Best-effort: the agent still gets the question, plus a
                # note that the context could not be retrieved.
                logger.error(f"Error handling context file: {str(e)}")
                agent_input["input"] = f"Question: {question_text}\n\nNote: There was an error retrieving the context file: {str(e)}"

        # Get response from the agent
        logger.info(f"Sending question to agent: {question_text[:100]}...")
        response = agent_executor.invoke(agent_input)

        # Extract the output from the response
        output = response.get("output", "")

        return output

    except Exception as e:
        logger.error(f"Error getting agent response: {str(e)}")
        return f"Error: {str(e)}"
gaiaX/api.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ API interaction module for GAIA Benchmark Agent.
4
+
5
+ This module handles all interactions with the GAIA benchmark API,
6
+ including fetching questions, downloading files, and submitting answers.
7
+ """
8
+
9
+ import json
10
+ import requests
11
+ from typing import Dict, List, Any, Optional
12
+ from pathlib import Path
13
+
14
+ from gaiaX.config import logger, API_BASE_URL
15
+
16
def get_all_questions(api_base_url: str = API_BASE_URL) -> List[Dict[str, Any]]:
    """
    Retrieve all available questions from the GAIA benchmark.

    Args:
        api_base_url: Base URL for the GAIA API

    Returns:
        List of question dictionaries

    Raises:
        requests.RequestException: If the API request fails
        ValueError: If the response is not valid JSON or doesn't contain expected data
    """
    try:
        # A timeout keeps the call from hanging forever on an unresponsive API.
        response = requests.get(f"{api_base_url}/questions", timeout=30)
        response.raise_for_status()  # Raise exception for 4XX/5XX responses

        questions = response.json()

        # The endpoint must return a JSON array; anything else is a contract violation.
        if not isinstance(questions, list):
            raise ValueError("Expected a list of questions but received a different format")

        return questions

    except requests.RequestException as e:
        logger.error(f"Error fetching questions: {e}")
        raise

    except json.JSONDecodeError:
        logger.error("Error decoding response as JSON")
        raise ValueError("Invalid JSON response from the API")
48
+
49
+
50
def get_random_question(api_base_url: str = API_BASE_URL) -> Dict[str, Any]:
    """
    Retrieve a random question from the GAIA benchmark.

    Args:
        api_base_url: Base URL for the GAIA API

    Returns:
        A single question dictionary

    Raises:
        requests.RequestException: If the API request fails
        ValueError: If the response is not valid JSON or doesn't contain expected data
    """
    try:
        # A timeout keeps the call from hanging forever on an unresponsive API.
        response = requests.get(f"{api_base_url}/questions/random", timeout=30)
        response.raise_for_status()

        question = response.json()

        # A single question must be a JSON object, not a list or scalar.
        if not isinstance(question, dict):
            raise ValueError("Expected a question dictionary but received a different format")

        return question

    except requests.RequestException as e:
        logger.error(f"Error fetching random question: {e}")
        raise

    except json.JSONDecodeError:
        logger.error("Error decoding response as JSON")
        raise ValueError("Invalid JSON response from the API")
82
+
83
+
84
def download_file_for_task(api_base_url: str, task_id: str, download_path: str) -> str:
    """
    Download a file associated with a specific task.

    Args:
        api_base_url: Base URL for the GAIA API
        task_id: ID of the task to download files for
        download_path: Directory path where the file should be saved

    Returns:
        Path to the downloaded file

    Raises:
        requests.RequestException: If the API request fails
        IOError: If there's an error writing the file
        ValueError: If the task_id is invalid or the response is unexpected
    """
    if not task_id:
        raise ValueError("Task ID cannot be empty")

    # Ensure download directory exists
    download_dir = Path(download_path)
    download_dir.mkdir(parents=True, exist_ok=True)

    try:
        response = requests.get(
            f"{api_base_url}/tasks/{task_id}/file",
            stream=True,  # Stream the response for large files
            timeout=60,   # Never block indefinitely on a dead server
        )
        response.raise_for_status()

        # Get filename from Content-Disposition header, falling back to task_id.
        # Only the token up to the next ';' is the filename (the header may
        # carry further parameters after it).
        content_disposition = response.headers.get('Content-Disposition', '')
        filename = None

        if 'filename=' in content_disposition:
            raw_name = content_disposition.split('filename=')[1]
            raw_name = raw_name.split(';')[0].strip().strip('"\'')
            # Keep only the basename so a malicious header cannot escape
            # download_dir via '../' path components.
            filename = Path(raw_name).name

        if not filename:
            filename = f"{task_id}_file.txt"

        file_path = download_dir / filename

        # Write the file in chunks so large downloads stay memory-bounded
        with open(file_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        return str(file_path)

    except requests.RequestException as e:
        logger.error(f"Error downloading file for task {task_id}: {e}")
        raise

    except IOError as e:
        logger.error(f"Error writing file to {download_path}: {e}")
        raise
141
+
142
+
143
def submit_answers(
    api_base_url: str,
    username: str,
    agent_code_link: str,
    answers: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Submit answers to the GAIA benchmark.

    Args:
        api_base_url: Base URL for the GAIA API
        username: Hugging Face username
        agent_code_link: Link to the agent code (e.g., GitHub repository)
        answers: Dictionary of answers to submit

    Returns:
        Response from the API containing submission results

    Raises:
        requests.RequestException: If the API request fails
        ValueError: If inputs are invalid, the response is not valid JSON,
            or the API reports an error in its payload
    """
    # Validate inputs up front so an obviously broken payload is never sent.
    if not username:
        raise ValueError("Username cannot be empty")

    if not agent_code_link:
        raise ValueError("Agent code link cannot be empty")

    if not answers or not isinstance(answers, dict):
        raise ValueError("Answers must be a non-empty dictionary")

    payload = {
        "username": username,
        "agent_code_link": agent_code_link,
        "answers": answers
    }

    try:
        response = requests.post(
            f"{api_base_url}/submit",
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=60,  # Grading may be slow, but never hang indefinitely
        )
        response.raise_for_status()

        result = response.json()

        # The API may report failure inside a 200 response body.
        if isinstance(result, dict) and result.get("error"):
            raise ValueError(f"API returned an error: {result['error']}")

        return result

    except requests.RequestException as e:
        logger.error(f"Error submitting answers: {e}")
        raise

    except json.JSONDecodeError:
        logger.error("Error decoding response as JSON")
        raise ValueError("Invalid JSON response from the API")
203
+
204
+
205
def get_question_details(task_id: str, api_base_url: str = API_BASE_URL) -> Dict[str, Any]:
    """
    Get detailed information about a specific question/task.

    Unlike the other fetchers in this module, this function never raises:
    failures are reported as an {"error": ...} dictionary so tool callers
    always receive a JSON-serializable result.

    Args:
        task_id: The ID of the task to get details for
        api_base_url: Base URL for the GAIA API

    Returns:
        Dictionary containing question details, or {"error": ...} on failure
    """
    try:
        # A timeout keeps the agent from blocking forever on a dead endpoint.
        response = requests.get(f"{api_base_url}/questions/{task_id}", timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.RequestException as e:
        logger.error(f"Failed to get question details: {str(e)}")
        return {"error": f"Failed to get question details: {str(e)}"}
    except json.JSONDecodeError:
        logger.error("Invalid JSON response from the API")
        return {"error": "Invalid JSON response from the API"}
gaiaX/config.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Configuration module for GAIA Benchmark Agent.
4
+
5
+ This module handles loading and managing configuration settings from JSON files
6
+ and environment variables.
7
+ """
8
+
9
+ import os
10
+ import json
11
+ import logging
12
+ from typing import Dict, Any
13
+ from pathlib import Path
14
+ from dotenv import load_dotenv
15
+
16
+ # Load environment variables
17
+ load_dotenv()
18
+
19
def load_config(config_path: str = "config.json") -> Dict[str, Any]:
    """
    Load configuration from a JSON file.

    Args:
        config_path: Path to the configuration file

    Returns:
        Dictionary containing configuration settings; built-in defaults
        are returned when the file is missing or unreadable.
    """
    try:
        with open(config_path, 'r') as f:
            return json.load(f)
    # Catch only the failures a config read can actually produce
    # (missing/unreadable file, malformed JSON) so unrelated bugs
    # are not silently swallowed.
    except (OSError, json.JSONDecodeError) as e:
        # print, not logging: the logger is configured from this very
        # configuration and does not exist yet at this point.
        print(f"Error loading configuration from {config_path}: {e}")
        print("Using default configuration.")
        return {
            "model_parameters": {
                "model_name": "gpt-4-turbo",
                "temperature": 0.2
            },
            "paths": {
                "progress_file": "gaia_progress.json"
            },
            "api": {
                "base_url": "https://api.example.com/gaia"
            }
        }
48
+
49
# Load configuration once at import time; other modules import CONFIG from here.
CONFIG = load_config()
51
+
52
+ # Setup logging
53
def setup_logging():
    """Configure logging based on settings in CONFIG.

    Returns:
        The shared "gaia_agent" logger used throughout the package.
    """
    logging_config = CONFIG.get("logging", {})
    # Fall back to INFO when the configured level name is unknown or
    # lowercase, instead of crashing with AttributeError on the getattr.
    level_name = str(logging_config.get("level", "INFO")).upper()
    log_level = getattr(logging, level_name, logging.INFO)
    log_format = logging_config.get("format", "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    log_file = logging_config.get("file", "logs/gaia_agent.log")

    # Create logs directory if it doesn't exist
    if log_file:
        log_dir = os.path.dirname(log_file)
        if log_dir and not os.path.exists(log_dir):
            os.makedirs(log_dir, exist_ok=True)

    # Configure logging; NullHandler stands in for any handler that is
    # disabled by configuration.
    logging.basicConfig(
        level=log_level,
        format=log_format,
        handlers=[
            logging.FileHandler(log_file) if log_file else logging.NullHandler(),
            logging.StreamHandler() if logging_config.get("console", True) else logging.NullHandler()
        ]
    )

    return logging.getLogger("gaia_agent")
77
+
78
# Initialize the package-wide logger; other gaiaX modules import `logger` from here.
logger = setup_logging()
80
+
81
# Environment variables (loaded from .env by load_dotenv above)
HF_USERNAME = os.getenv("HF_USERNAME")        # Hugging Face account used for submissions
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # required by the LLM backend
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")  # optional; enables the web-search tool

# API configuration: falls back to the placeholder default used by load_config
API_BASE_URL = CONFIG.get("api", {}).get("base_url", "https://api.example.com/gaia")
88
+
89
+ # Validate required environment variables
90
def validate_env_vars():
    """Fail fast when required environment variables are missing.

    Raises:
        ValueError: if HF_USERNAME or OPENAI_API_KEY is unset.
    """
    # Required settings: log and abort immediately with the same message.
    required = (
        ("HF_USERNAME", HF_USERNAME),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    for var_name, var_value in required:
        if not var_value:
            message = f"{var_name} environment variable is not set. Please check your .env file."
            logger.error(message)
            raise ValueError(message)

    # Tavily API key is optional but recommended for search functionality
    if not TAVILY_API_KEY:
        logger.warning("TAVILY_API_KEY environment variable is not set. Search functionality will be limited.")
gaiaX/question_handlers.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Question handlers module for GAIA Benchmark Agent.
4
+
5
+ This module provides specialized handlers for different types of questions
6
+ in the GAIA benchmark, including question type detection and processing.
7
+ """
8
+
9
+ import re
10
+ import tempfile
11
+ from typing import Dict, Any, Optional
12
+
13
+ from gaiaX.config import logger, CONFIG, API_BASE_URL
14
+ from gaiaX.api import download_file_for_task
15
+ from gaiaX.agent import get_agent_response
16
+
17
def detect_question_type(question_text: str) -> str:
    """
    Detect the type of question based on its content.

    Args:
        question_text: The text of the question

    Returns:
        String indicating the question type
    """
    # Case-insensitive keyword matching.
    text = question_text.lower()

    # Ordered table: the first category with a matching keyword wins, giving
    # precedence mathematical > technical > context_based > ethical > factual.
    categories = (
        ("mathematical", ("calculate", "compute", "equation", "formula", "derivative",
                          "integral", "probability", "statistics", "math")),
        ("technical", ("implement", "code", "algorithm", "function", "class",
                       "method", "programming", "pseudocode", "complexity")),
        ("context_based", ("context", "file", "document", "text", "analyze",
                           "based on", "according to", "refer to")),
        ("ethical", ("ethics", "ethical", "society", "impact", "bias",
                     "fairness", "responsible", "governance")),
        ("factual", ("define", "explain", "describe", "what is", "who is",
                     "when was", "history", "concept")),
    )

    for question_type, keywords in categories:
        if any(keyword in text for keyword in keywords):
            return question_type

    # Default when no category keyword matches.
    return "general"
57
+
58
+
59
def handle_factual_question(agent: Any, question: dict, context: Optional[str] = None) -> str:
    """
    Handle factual knowledge questions.

    Args:
        agent: Initialized LangChain agent
        question: Dictionary containing question data
        context: Optional context text (not used for factual questions)

    Returns:
        Agent's response as a string
    """
    logger.info("Handling factual knowledge question")

    # Enhance the question with specific instructions for factual questions.
    # Work on a copy so the caller's dict is not mutated.
    enhanced_question = question.copy()

    question_text = question.get("question", "")
    # The tag and trailing instructions steer the agent toward precise,
    # citation-backed answers; the template whitespace is sent verbatim.
    enhanced_text = f"""
    [FACTUAL KNOWLEDGE QUESTION]

    {question_text}

    Please provide a precise, accurate answer based on established facts and knowledge.
    Include relevant examples and cite important research or developments when applicable.
    """

    enhanced_question["question"] = enhanced_text

    # Get response from the agent
    return get_agent_response(agent, enhanced_question)
90
+
91
+
92
def handle_technical_question(agent: Any, question: dict, context: Optional[str] = None) -> str:
    """
    Handle technical implementation questions.

    Args:
        agent: Initialized LangChain agent
        question: Dictionary containing question data
        context: Optional context text (not used for technical questions)

    Returns:
        Agent's response as a string
    """
    logger.info("Handling technical implementation question")

    # Enhance the question with specific instructions for technical questions.
    # Work on a copy so the caller's dict is not mutated.
    enhanced_question = question.copy()

    question_text = question.get("question", "")
    enhanced_text = f"""
    [TECHNICAL IMPLEMENTATION QUESTION]

    {question_text}

    Please provide a detailed technical explanation, including:
    - Step-by-step explanation of algorithms or processes
    - Pseudocode or code snippets when helpful
    - Analysis of trade-offs between different approaches
    - Complexity analysis (time and space) if relevant
    """

    enhanced_question["question"] = enhanced_text

    # Get response from the agent
    return get_agent_response(agent, enhanced_question)
126
+
127
+
128
def handle_mathematical_question(agent: Any, question: dict, context: Optional[str] = None) -> str:
    """
    Handle mathematical questions.

    Args:
        agent: Initialized LangChain agent
        question: Dictionary containing question data
        context: Optional context text (not used for mathematical questions)

    Returns:
        Agent's response as a string
    """
    logger.info("Handling mathematical question")

    # Enhance the question with specific instructions for mathematical questions.
    # Work on a copy so the caller's dict is not mutated.
    enhanced_question = question.copy()

    question_text = question.get("question", "")
    # The final instruction matters for benchmark scoring: GAIA expects a
    # bare value when a numerical answer is requested.
    enhanced_text = f"""
    [MATHEMATICAL QUESTION]

    {question_text}

    Please provide a clear mathematical solution, including:
    - Step-by-step working of the solution
    - Clear explanation of the mathematical concepts involved
    - Proper notation with defined variables
    - Final answer in the simplest form

    If the question asks for a specific numerical value, provide only that value as your final answer.
    """

    enhanced_question["question"] = enhanced_text

    # Get response from the agent
    return get_agent_response(agent, enhanced_question)
164
+
165
+
166
def handle_context_based_question(agent: Any, question: dict, context: Optional[str] = None) -> str:
    """
    Handle context-based analysis questions.

    Args:
        agent: Initialized LangChain agent
        question: Dictionary containing question data
        context: Optional context text; downloaded on demand when absent

    Returns:
        Agent's response as a string
    """
    logger.info("Handling context-based question")

    # If context is not provided but the question has a file, try to download it.
    # NOTE(review): process_question usually downloads the file already; this
    # branch only fires when the handler is invoked directly without context.
    if not context and question.get("has_file", False):
        task_id = question.get("task_id", "")
        if task_id:
            try:
                with tempfile.TemporaryDirectory() as temp_dir:
                    file_path = download_file_for_task(API_BASE_URL, task_id, temp_dir)
                    # Assumes the context file is UTF-8 text; a binary file
                    # raises here and leaves context as None (error logged below).
                    with open(file_path, 'r', encoding='utf-8') as f:
                        context = f.read()
            except Exception as e:
                logger.error(f"Error downloading context file: {str(e)}")

    # Enhance the question with specific instructions for context-based questions.
    # Work on a copy so the caller's dict is not mutated.
    enhanced_question = question.copy()

    question_text = question.get("question", "")
    enhanced_text = f"""
    [CONTEXT-BASED ANALYSIS QUESTION]

    {question_text}

    Please analyze the provided context carefully and provide an answer that:
    - Directly references relevant parts of the context
    - Connects the context to broader AI/ML concepts when relevant
    - Provides a comprehensive analysis based on the context
    """

    if context:
        enhanced_text += f"\n\nContext:\n{context}"

    enhanced_question["question"] = enhanced_text

    # Get response from the agent
    return get_agent_response(agent, enhanced_question)
214
+
215
+
216
def handle_general_question(agent: Any, question: dict, context: Optional[str] = None) -> str:
    """
    Handle general questions that don't fit into specific categories.

    This is the fallback handler used by process_question when no
    specialized question type is detected.

    Args:
        agent: Initialized LangChain agent
        question: Dictionary containing question data
        context: Optional context text, appended to the prompt when present

    Returns:
        Agent's response as a string
    """
    logger.info("Handling general question")

    # Enhance the question with general instructions.
    # Work on a copy so the caller's dict is not mutated.
    enhanced_question = question.copy()

    question_text = question.get("question", "")
    enhanced_text = f"""
    [GENERAL QUESTION]

    {question_text}

    Please provide a comprehensive, accurate answer that:
    - Directly addresses all aspects of the question
    - Is well-structured and easy to understand
    - Includes relevant examples or illustrations when helpful
    - Cites sources or references when appropriate
    """

    if context:
        enhanced_text += f"\n\nContext:\n{context}"

    enhanced_question["question"] = enhanced_text

    # Get response from the agent
    return get_agent_response(agent, enhanced_question)
253
+
254
+
255
def process_question(agent: Any, question: dict, api_base_url: str = API_BASE_URL) -> dict:
    """
    Process a single question using the appropriate handler.

    Args:
        agent: Initialized LangChain agent
        question: Dictionary containing question data
        api_base_url: Base URL for the GAIA API

    Returns:
        Dictionary containing the question, answer, and metadata
    """
    try:
        # Pull out the fields every branch below needs.
        question_text = question.get("question", "")
        task_id = question.get("task_id", "")
        has_file = question.get("has_file", False)

        logger.info(f"Processing question: {task_id} - {question_text[:50]}...")

        question_type = detect_question_type(question_text)
        logger.info(f"Detected question type: {question_type}")

        # Fetch the associated context file up front when the task ships one;
        # a failed download is logged and the question proceeds without context.
        context = None
        if has_file and task_id:
            try:
                with tempfile.TemporaryDirectory() as temp_dir:
                    file_path = download_file_for_task(api_base_url, task_id, temp_dir)
                    with open(file_path, 'r', encoding='utf-8') as f:
                        context = f.read()
            except Exception as e:
                logger.error(f"Error downloading context file: {str(e)}")

        # Dispatch table: unknown types (including "ethical") fall back to
        # the general handler, matching the original if/elif chain.
        dispatch = {
            "factual": handle_factual_question,
            "technical": handle_technical_question,
            "mathematical": handle_mathematical_question,
            "context_based": handle_context_based_question,
        }
        handler = dispatch.get(question_type, handle_general_question)
        answer = handler(agent, question, context)

        return {
            "task_id": task_id,
            "question": question_text,
            "answer": answer,
            "question_type": question_type,
            "has_context": context is not None,
        }

    except Exception as e:
        logger.error(f"Error processing question: {str(e)}")
        return {
            "task_id": question.get("task_id", ""),
            "question": question.get("question", ""),
            "answer": f"Error: {str(e)}",
            "error": str(e)
        }
gaiaX/tools.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ LangChain tools module for GAIA Benchmark Agent.
4
+
5
+ This module defines the custom tools used by the LangChain agent
6
+ to interact with the GAIA benchmark API and process questions.
7
+ """
8
+
9
+ import json
10
+ import tempfile
11
+ from typing import Dict, Any
12
+ from pathlib import Path
13
+
14
+ from langchain.tools import BaseTool, tool
15
+
16
+ from gaiaX.config import logger, API_BASE_URL
17
+ from gaiaX.api import download_file_for_task, get_question_details
18
+
19
@tool
def fetch_question_details(task_id: str, api_base_url: str = API_BASE_URL) -> Dict[str, Any]:
    """
    Get detailed information about a specific question/task.

    Args:
        task_id: The ID of the task to get details for
        api_base_url: Base URL for the GAIA API

    Returns:
        Dictionary containing question details
    """
    # NOTE(review): the @tool decorator typically exposes this docstring as
    # the tool description shown to the LLM, so keep it short and accurate.
    # Thin wrapper: all error handling lives in gaiaX.api.get_question_details,
    # which returns {"error": ...} instead of raising.
    return get_question_details(task_id, api_base_url)
32
+
33
@tool
def fetch_context_file(task_id: str, api_base_url: str = API_BASE_URL) -> str:
    """
    Download and read the context file for a specific task.

    Args:
        task_id: The ID of the task to download the file for
        api_base_url: Base URL for the GAIA API

    Returns:
        String containing the file contents or error message
    """
    try:
        # Create a temporary directory so the downloaded file is cleaned up
        # automatically once it has been read.
        with tempfile.TemporaryDirectory() as temp_dir:
            file_path = download_file_for_task(api_base_url, task_id, temp_dir)

            # Try to read the file as UTF-8 text
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    return f.read()
            except UnicodeDecodeError:
                # Not decodable text: report metadata instead of raw bytes so
                # the agent receives something useful rather than a crash.
                file_size = Path(file_path).stat().st_size
                file_ext = Path(file_path).suffix
                return f"Binary file detected ({file_ext}, {file_size} bytes). This file cannot be displayed as text. Please use specialized tools to analyze this type of file."
    except Exception as e:
        # Broad catch is deliberate: a tool must return a string, never raise,
        # or the agent loop would abort mid-run.
        logger.error(f"Error fetching context file: {str(e)}")
        return f"Error fetching context file: {str(e)}"
62
+
63
+ # Define a class for each tool to make them more configurable
64
class QuestionDetailsTool(BaseTool):
    """Tool for fetching question details from the GAIA API.

    Class-based alternative to the @tool-decorated functions above, for
    callers that want to configure or subclass the tool.
    """

    # Tool metadata surfaced to the agent/LLM.
    # NOTE(review): annotations added because recent langchain versions back
    # BaseTool with pydantic, which requires annotated fields — confirm
    # against the pinned langchain version.
    name: str = "get_question_details"
    description: str = "Get detailed information about a specific question/task"

    def _run(self, task_id: str, api_base_url: str = API_BASE_URL) -> Dict[str, Any]:
        """Execute the tool synchronously; delegates to gaiaX.api."""
        return get_question_details(task_id, api_base_url)

    def _arun(self, task_id: str, api_base_url: str = API_BASE_URL):
        """Execute the tool asynchronously (not supported)."""
        raise NotImplementedError("Async version not implemented")
77
+
78
class ContextFileTool(BaseTool):
    """Tool for fetching and reading context files for tasks.

    Class-based alternative to the @tool-decorated fetch_context_file above.
    """

    # Tool metadata surfaced to the agent/LLM.
    # NOTE(review): annotations added because recent langchain versions back
    # BaseTool with pydantic, which requires annotated fields — confirm
    # against the pinned langchain version.
    name: str = "fetch_context_file"
    description: str = "Download and read the context file for a specific task"

    def _run(self, task_id: str, api_base_url: str = API_BASE_URL) -> str:
        """Execute the tool synchronously; delegates to the decorated function."""
        return fetch_context_file(task_id, api_base_url)

    def _arun(self, task_id: str, api_base_url: str = API_BASE_URL):
        """Execute the tool asynchronously (not supported)."""
        raise NotImplementedError("Async version not implemented")
91
+
92
+ # Function to get all available tools
93
def get_tools(include_search: bool = True, tavily_api_key: str = None):
    """
    Get all available tools for the agent.

    Args:
        include_search: Whether to include the search tool
        tavily_api_key: Tavily API key for search functionality

    Returns:
        List of tools: the two GAIA API tools, plus Tavily web search
        when available and enabled.
    """
    tools = [
        fetch_question_details,
        fetch_context_file
    ]

    # Add search tool if Tavily API key is available and search is enabled.
    # The import is local so the package works without langchain_community.
    if include_search and tavily_api_key:
        try:
            from langchain_community.tools.tavily_search import TavilySearchResults

            # NOTE(review): depending on the langchain_community version the
            # keyword may be `tavily_api_key` rather than `api_key`; a wrong
            # kwarg raises here and is logged by the handler below — confirm
            # against the pinned version.
            search_tool = TavilySearchResults(
                max_results=3,
                api_key=tavily_api_key
            )
            tools.append(search_tool)
            logger.info("Search tool added to agent tools")
        except ImportError:
            logger.warning("Could not import TavilySearchResults. Search functionality will be disabled.")
        except Exception as e:
            logger.warning(f"Error initializing search tool: {e}")

    return tools
gaiaX/utils.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Utility functions for GAIA Benchmark Agent.
4
+
5
+ This module provides utility functions for progress tracking,
6
+ performance analysis, and other helper functions.
7
+ """
8
+
9
+ import os
10
+ import json
11
+ import datetime
12
+ from typing import Dict, List, Any, Optional
13
+
14
+ from gaiaX.config import logger, CONFIG
15
+
16
def load_progress(progress_file: str = None) -> dict:
    """
    Load progress from a JSON file.

    Args:
        progress_file: Path to the progress file; defaults to the path
            configured under paths.progress_file in CONFIG.

    Returns:
        Dictionary containing progress data, or an empty progress structure
        when the file is missing or unreadable.
    """
    if not progress_file:
        progress_file = CONFIG.get("paths", {}).get("progress_file", "gaia_progress.json")

    empty_progress = {"processed_questions": [], "answers": {}}

    try:
        # EAFP: open directly instead of an exists() check, avoiding the
        # race between the check and the open.
        with open(progress_file, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        # First run: no progress recorded yet; not an error.
        return empty_progress
    # Narrow handling: unreadable file or corrupt JSON is logged; anything
    # else (a genuine bug) propagates instead of being swallowed.
    except (OSError, json.JSONDecodeError) as e:
        logger.error(f"Error loading progress from {progress_file}: {e}")
        return empty_progress
39
+
40
+
41
def save_progress(progress_data: dict, progress_file: str = None) -> bool:
    """
    Save progress to a JSON file.

    Args:
        progress_data: Dictionary containing progress data
        progress_file: Path to the progress file; defaults to the path
            configured under paths.progress_file in CONFIG.

    Returns:
        True if successful, False otherwise
    """
    if not progress_file:
        progress_file = CONFIG.get("paths", {}).get("progress_file", "gaia_progress.json")

    try:
        with open(progress_file, 'w') as f:
            json.dump(progress_data, f, indent=2)
        return True
    # Narrow handling: OSError covers filesystem failures; TypeError and
    # ValueError cover data json.dump cannot serialize. Other exceptions
    # propagate instead of being silently swallowed.
    except (OSError, TypeError, ValueError) as e:
        logger.error(f"Error saving progress to {progress_file}: {e}")
        return False
62
+
63
+
64
def analyze_performance(answers: list, expected_answers: list = None) -> dict:
    """
    Analyze the performance of the agent based on answers.

    Args:
        answers: List of answer dictionaries
        expected_answers: Optional list of expected answers for evaluation

    Returns:
        Dictionary containing performance metrics
    """
    total = len(answers)
    failed = sum(1 for entry in answers if "error" in entry)
    succeeded = total - failed

    # Mean response time over the entries that recorded one (0 when none did).
    timings = [entry["response_time"] for entry in answers if "response_time" in entry]
    mean_time = sum(timings) / len(timings) if timings else 0

    # Frequency of each question type; entries without one count as "unknown".
    type_counts = {}
    for entry in answers:
        label = entry.get("question_type", "unknown")
        type_counts[label] = type_counts.get(label, 0) + 1

    metrics = {
        "total_questions": total,
        "successful_answers": succeeded,
        "error_count": failed,
        "success_rate": succeeded / total if total > 0 else 0,
        "average_response_time": mean_time,
        "question_types": type_counts,
    }

    # Accuracy is only computable over task IDs present in both lists.
    if expected_answers:
        produced = {entry.get("task_id"): entry.get("answer") for entry in answers}
        expected = {entry.get("task_id"): entry.get("answer") for entry in expected_answers}
        shared_ids = produced.keys() & expected.keys()
        if shared_ids:
            hits = sum(1 for tid in shared_ids if produced[tid] == expected[tid])
            metrics["accuracy"] = hits / len(shared_ids)
            metrics["correct_answers"] = hits

    return metrics
118
+
119
+
120
def format_performance_report(metrics: dict) -> str:
    """
    Format performance metrics into a readable report.

    Args:
        metrics: Dictionary containing performance metrics
            (as produced by analyze_performance)

    Returns:
        Formatted performance report as a string
    """
    total = metrics['total_questions']
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    lines = []
    lines.append("=== GAIA Benchmark Agent Performance Report ===")
    lines.append(f"Generated: {timestamp}")
    lines.append("")
    lines.append(f"Total Questions Processed: {total}")
    lines.append(f"Successful Answers: {metrics['successful_answers']} ({metrics['success_rate']:.2%})")
    lines.append(f"Errors: {metrics['error_count']}")
    lines.append(f"Average Response Time: {metrics['average_response_time']:.2f} seconds")
    lines.append("")
    lines.append("Question Type Distribution:")

    # One line per question type with its share of the total.
    for label, count in metrics.get("question_types", {}).items():
        share = count / total if total > 0 else 0
        lines.append(f"  - {label}: {count} ({share:.2%})")

    # The accuracy section only appears when evaluation data was available.
    if "accuracy" in metrics:
        lines.append("")
        lines.append(f"Accuracy: {metrics['accuracy']:.2%}")
        lines.append(f"Correct Answers: {metrics['correct_answers']} out of {total}")

    return "\n".join(lines)
156
+
157
+
158
+ def process_questions_batch(agent: Any, questions: list, api_base_url: str,
159
+ progress_file: str = None, batch_size: int = 10) -> dict:
160
+ """
161
+ Process a batch of questions and track progress.
162
+
163
+ Args:
164
+ agent: Initialized LangChain agent
165
+ questions: List of question dictionaries
166
+ api_base_url: Base URL for the GAIA API
167
+ progress_file: Path to the progress file
168
+ batch_size: Number of questions to process in each batch
169
+
170
+ Returns:
171
+ Dictionary containing processed questions and answers
172
+ """
173
+ from gaiaX.question_handlers import process_question
174
+
175
+ # Load existing progress if available
176
+ if not progress_file:
177
+ progress_file = CONFIG.get("paths", {}).get("progress_file", "gaia_progress.json")
178
+
179
+ progress = {}
180
+ try:
181
+ if os.path.exists(progress_file):
182
+ with open(progress_file, 'r') as f:
183
+ progress = json.load(f)
184
+ else:
185
+ progress = {"processed_questions": [], "answers": {}}
186
+ except Exception as e:
187
+ logger.error(f"Error loading progress from {progress_file}: {e}")
188
+ progress = {"processed_questions": [], "answers": {}}
189
+
190
+ # Get list of already processed questions
191
+ processed_ids = set(progress.get("processed_questions", []))
192
+
193
+ # Filter out already processed questions
194
+ remaining_questions = [q for q in questions if q.get("task_id") not in processed_ids]
195
+ logger.info(f"Found {len(remaining_questions)} questions to process out of {len(questions)} total")
196
+
197
+ # Process questions in batches
198
+ results = []
199
+ for i, question in enumerate(remaining_questions):
200
+ if i > 0 and i % batch_size == 0:
201
+ logger.info(f"Processed {i}/{len(remaining_questions)} questions. Saving progress...")
202
+ save_progress(progress, progress_file)
203
+
204
+ try:
205
+ task_id = question.get("task_id")
206
+ logger.info(f"Processing question {i+1}/{len(remaining_questions)}: {task_id}")
207
+
208
+ # Process the question
209
+ start_time = datetime.datetime.now()
210
+ result = process_question(agent, question, api_base_url)
211
+ end_time = datetime.datetime.now()
212
+
213
+ # Calculate response time
214
+ response_time = (end_time - start_time).total_seconds()
215
+ result["response_time"] = response_time
216
+
217
+ # Add to results and update progress
218
+ results.append(result)
219
+ progress["processed_questions"].append(task_id)
220
+ progress["answers"][task_id] = result.get("answer")
221
+
222
+ logger.info(f"Completed question {task_id} in {response_time:.2f} seconds")
223
+
224
+ except Exception as e:
225
+ logger.error(f"Error processing question: {str(e)}")
226
+ results.append({
227
+ "task_id": question.get("task_id", ""),
228
+ "question": question.get("question", ""),
229
+ "answer": f"Error: {str(e)}",
230
+ "error": str(e)
231
+ })
232
+
233
+ # Save final progress
234
+ save_progress(progress, progress_file)
235
+
236
+ return {
237
+ "results": results,
238
+ "progress": progress
239
+ }
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GAIA Benchmark Agent Dependencies
2
+
3
+ # Core dependencies
4
+ langchain>=0.1.0
5
+ langchain-openai>=0.0.2
6
+ langchain-community>=0.0.1
7
+ openai>=1.3.0
8
+ python-dotenv>=1.0.0
9
+ requests>=2.31.0
10
+
11
+ # Interface dependencies
12
+ gradio>=3.50.0
13
+ pandas>=2.0.0
14
+
15
+ # Utility dependencies
16
+ tqdm>=4.66.1
17
+ pydantic>=2.4.0
18
+ tenacity>=8.2.3