# Agent216 / test_agent.py
# ahmet1338's picture
# ADD: project files added
# 35f54b3
"""
Test script for GAIA Agent
This script helps you test your agent implementation before submitting to the leaderboard.
Run this to verify your agent works correctly.
"""
import requests
import json
from typing import Dict, List
from agent_implementation import create_agent
from config import AGENT_TYPE
def test_api_connection(timeout: float = 10.0) -> bool:
    """Check that the GAIA questions endpoint is reachable.

    Sends a GET request to the questions endpoint and reports the outcome
    on stdout.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host, which would hang the whole test run.

    Returns:
        True when the endpoint answers with HTTP 200 and its JSON body could
        be decoded and measured; False for any other status code or on any
        error (including a timeout).
    """
    print("🔍 Testing API connection...")
    try:
        # Test questions endpoint
        response = requests.get(
            "https://gaia-benchmark.vercel.app/api/questions",
            timeout=timeout,
        )
        if response.status_code != 200:
            print(f"❌ API connection failed with status code: {response.status_code}")
            return False
        questions = response.json()
        print(f"✅ API connection successful! Found {len(questions)} questions")
        return True
    except Exception as e:
        # Broad on purpose: this is a diagnostic boundary that must report
        # the failure and return False rather than abort the caller.
        print(f"❌ API connection error: {e}")
        return False
def test_random_question(timeout: float = 10.0):
    """Fetch one random question from the GAIA API and print a short summary.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        The decoded JSON question on success; None when the server answers
        with a status other than 200 or when any error occurs (including a
        timeout or a failure while printing the summary).
    """
    print("\n🎲 Testing random question fetch...")
    try:
        response = requests.get(
            "https://gaia-benchmark.vercel.app/api/random-question",
            timeout=timeout,
        )
        if response.status_code != 200:
            print(f"❌ Failed to fetch random question: {response.status_code}")
            return None
        question = response.json()
        print("✅ Random question fetched successfully!")
        print(f" Task ID: {question.get('task_id', 'N/A')}")
        # Only the first 100 characters are shown to keep the output short.
        print(f" Question: {question.get('question', 'N/A')[:100]}...")
        return question
    except Exception as e:
        # Broad on purpose: report the problem and let the caller decide
        # what to do with a missing question.
        print(f"❌ Error fetching random question: {e}")
        return None
def test_agent_on_question(agent, question: Dict):
"""Test the agent on a specific question"""
print(f"\n🤖 Testing agent on question...")
print(f" Task ID: {question.get('task_id', 'N/A')}")
print(f" Question: {question.get('question', 'N/A')}")
try:
answer = agent.generate_answer(question)
print(f"✅ Agent generated answer:")
print(f" Answer: {answer}")
return answer
except Exception as e:
print(f"❌ Agent error: {e}")
return None
def test_file_download(task_id: str, timeout: float = 10.0) -> str:
    """Try to download the file attached to a task and print a preview.

    Args:
        task_id: Identifier appended to the files endpoint URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        The response body decoded as text when the server answers with HTTP
        200; an empty string for any other status code or on any error
        (including a timeout).
    """
    print(f"\n📁 Testing file download for task {task_id}...")
    try:
        response = requests.get(
            f"https://gaia-benchmark.vercel.app/api/files/{task_id}",
            timeout=timeout,
        )
        if response.status_code != 200:
            print(f"⚠️ No file found for task {task_id} (status: {response.status_code})")
            return ""
        content = response.text
        print("✅ File downloaded successfully!")
        print(f" Content length: {len(content)} characters")
        # Only the first 100 characters are shown to keep the output short.
        print(f" Preview: {content[:100]}...")
        return content
    except Exception as e:
        # Broad on purpose: a failed download is reported, never fatal.
        print(f"❌ File download error: {e}")
        return ""
def run_comprehensive_test() -> bool:
    """Run the end-to-end smoke test of the agent.

    Steps, each reported on stdout:
      1. check the API connection,
      2. create the agent configured by ``AGENT_TYPE``,
      3. fetch a random question,
      4. try the file download for that question's task id (if it has one),
      5. ask the agent for an answer,
      6. print basic facts about the answer and warn if it contains the
         text 'FINAL ANSWER'.

    Returns:
        True when every required step succeeded; False as soon as the API
        check, agent creation, question fetch, or answer generation fails.
        The file download is informational and never fails the run.
    """
    print("🚀 Starting comprehensive GAIA agent test...")
    print("=" * 60)

    # Test 1: API Connection
    if not test_api_connection():
        print("❌ Cannot proceed without API connection")
        return False

    # Test 2: Create agent
    print(f"\n🤖 Creating {AGENT_TYPE} agent...")
    try:
        agent = create_agent(AGENT_TYPE)
        print(f"✅ {AGENT_TYPE} agent created successfully!")
    except Exception as e:
        print(f"❌ Failed to create agent: {e}")
        return False

    # Test 3: Fetch random question
    question = test_random_question()
    if not question:
        print("❌ Cannot proceed without a test question")
        return False

    # Test 4: Test file download
    task_id = question.get('task_id', '')
    if task_id:
        # The downloaded content is not needed here; the call is made only
        # for the diagnostics it prints.
        test_file_download(task_id)

    # Test 5: Test agent on question
    answer = test_agent_on_question(agent, question)
    if not answer:
        print("❌ Agent failed to generate answer")
        return False

    # Test 6: Validate answer format
    print("\n✅ Answer validation:")
    print(f" Length: {len(answer)} characters")
    has_final_marker = 'FINAL ANSWER' in answer
    print(f" Contains 'FINAL ANSWER': {has_final_marker}")
    if has_final_marker:
        print(" ⚠️ Warning: Answer contains 'FINAL ANSWER' - remove this for submission!")

    print("\n🎉 Comprehensive test completed successfully!")
    return True
def test_multiple_questions(num_questions: int = 3, timeout: float = 10.0) -> List[Dict]:
    """Run the agent on several random questions and print a summary.

    Args:
        num_questions: How many random questions to request.
        timeout: Seconds to wait for each question request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        One dict per question that was fetched. Each has 'task_id',
        'question' and 'status' ('success' or 'error'), plus 'answer' on
        success or 'error' (the exception text) on failure. Questions that
        could not be fetched add no entry but still count as failed in the
        printed summary.
    """
    print(f"\n🔄 Testing agent on {num_questions} random questions...")
    agent = create_agent(AGENT_TYPE)
    results = []
    for i in range(num_questions):
        print(f"\n--- Test {i+1}/{num_questions} ---")

        # Fetch random question. A network failure or an undecodable body
        # skips this question instead of aborting the run and discarding
        # the results collected so far.
        try:
            response = requests.get(
                "https://gaia-benchmark.vercel.app/api/random-question",
                timeout=timeout,
            )
            if response.status_code != 200:
                print(f"❌ Failed to fetch question {i+1}")
                continue
            question = response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"❌ Failed to fetch question {i+1}: {e}")
            continue
        print(f"Question: {question.get('question', 'N/A')[:80]}...")

        # Generate answer
        try:
            answer = agent.generate_answer(question)
            print(f"Answer: {answer[:100]}...")
            results.append({
                'task_id': question.get('task_id'),
                'question': question.get('question'),
                'answer': answer,
                'status': 'success'
            })
        except Exception as e:
            print(f"❌ Error: {e}")
            results.append({
                'task_id': question.get('task_id'),
                'question': question.get('question'),
                'error': str(e),
                'status': 'error'
            })

    # Summary
    successful = sum(1 for r in results if r['status'] == 'success')
    # Guard the percentage: a request for zero (or fewer) questions would
    # otherwise raise ZeroDivisionError after the loop.
    success_rate = (successful / num_questions) * 100 if num_questions > 0 else 0.0
    print("\n📊 Test Summary:")
    print(f" Total questions: {num_questions}")
    print(f" Successful: {successful}")
    print(f" Failed: {num_questions - successful}")
    print(f" Success rate: {success_rate:.1f}%")
    return results
def main():
    """Run the interactive test menu until the user chooses to exit.

    Options:
      1. run the comprehensive test,
      2. test several random questions (count read from stdin, default 3),
      3. test the agent on a single random question,
      4. exit.
    Any other input prints a hint and shows the menu again.
    """
    print("🧪 GAIA Agent Test Suite")
    print("=" * 40)
    while True:
        print("\nChoose a test option:")
        print("1. Run comprehensive test")
        print("2. Test multiple questions")
        print("3. Test single random question")
        print("4. Exit")
        choice = input("\nEnter your choice (1-4): ").strip()
        if choice == "1":
            run_comprehensive_test()
        elif choice == "2":
            raw = input("How many questions to test? (default: 3): ").strip()
            # Only the parsing is guarded. Keeping the test run outside the
            # try prevents a ValueError raised during the run from being
            # reported as bad input and triggering a second run.
            try:
                num = int(raw) if raw else 3
                if num < 1:
                    # Zero or negative counts make no sense for a test run.
                    raise ValueError(raw)
            except ValueError:
                print("Invalid number, using default: 3")
                num = 3
            test_multiple_questions(num)
        elif choice == "3":
            if test_api_connection():
                agent = create_agent(AGENT_TYPE)
                question = test_random_question()
                if question:
                    test_agent_on_question(agent, question)
        elif choice == "4":
            print("👋 Goodbye!")
            break
        else:
            print("Invalid choice. Please enter 1-4.")
# Start the interactive menu only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()