| | """ |
| | Test script for GAIA Agent |
| | |
| | This script helps you test your agent implementation before submitting to the leaderboard. |
| | Run this to verify your agent works correctly. |
| | """ |
| |
|
| | import requests |
| | import json |
| | from typing import Dict, List |
| | from agent_implementation import create_agent |
| | from config import AGENT_TYPE |
| |
|
def test_api_connection(timeout: float = 30.0):
    """Check that the GAIA questions endpoint is reachable.

    Sends a GET request to the questions endpoint and reports the outcome
    on stdout.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host, which would hang the whole test script.

    Returns:
        True if the endpoint answered with HTTP 200 and a JSON body whose
        length could be reported, False otherwise (including on any
        exception, which is printed rather than raised).
    """
    print("🔍 Testing API connection...")

    try:
        response = requests.get(
            "https://gaia-benchmark.vercel.app/api/questions",
            timeout=timeout,
        )
        if response.status_code != 200:
            print(f"❌ API connection failed with status code: {response.status_code}")
            return False

        questions = response.json()
        print(f"✅ API connection successful! Found {len(questions)} questions")
        return True
    except Exception as e:
        # Deliberately broad: this is a diagnostic entry point, so every
        # failure (network, timeout, bad JSON) is reported, not raised.
        print(f"❌ API connection error: {e}")
        return False
| |
|
def test_random_question(timeout: float = 30.0):
    """Fetch one random question from the GAIA API and print a short summary.

    Args:
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive endpoint cannot hang the script.

    Returns:
        The decoded JSON question on success, or None if the request did not
        return HTTP 200 or any exception occurred (the error is printed).
    """
    print("\n🎲 Testing random question fetch...")

    try:
        response = requests.get(
            "https://gaia-benchmark.vercel.app/api/random-question",
            timeout=timeout,
        )
        if response.status_code != 200:
            print(f"❌ Failed to fetch random question: {response.status_code}")
            return None

        question = response.json()
        print("✅ Random question fetched successfully!")
        print(f" Task ID: {question.get('task_id', 'N/A')}")
        # Only the first 100 characters are shown to keep the output compact.
        print(f" Question: {question.get('question', 'N/A')[:100]}...")
        return question
    except Exception as e:
        # Deliberately broad: report any failure instead of crashing the menu.
        print(f"❌ Error fetching random question: {e}")
        return None
| |
|
| | def test_agent_on_question(agent, question: Dict): |
| | """Test the agent on a specific question""" |
| | print(f"\n🤖 Testing agent on question...") |
| | print(f" Task ID: {question.get('task_id', 'N/A')}") |
| | print(f" Question: {question.get('question', 'N/A')}") |
| | |
| | try: |
| | answer = agent.generate_answer(question) |
| | print(f"✅ Agent generated answer:") |
| | print(f" Answer: {answer}") |
| | return answer |
| | except Exception as e: |
| | print(f"❌ Agent error: {e}") |
| | return None |
| |
|
def test_file_download(task_id: str, timeout: float = 30.0):
    """Try to download the file attached to a task and print a preview.

    Args:
        task_id: Identifier interpolated into the files endpoint URL.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive endpoint cannot hang the script.

    Returns:
        The response body as text on HTTP 200, otherwise an empty string
        (also returned when any exception occurs; the error is printed).
    """
    print(f"\n📁 Testing file download for task {task_id}...")

    try:
        response = requests.get(
            f"https://gaia-benchmark.vercel.app/api/files/{task_id}",
            timeout=timeout,
        )
        if response.status_code != 200:
            print(f"⚠️ No file found for task {task_id} (status: {response.status_code})")
            return ""

        # NOTE(review): the body is read as text; binary attachments would
        # presumably need response.content instead - confirm with the API.
        content = response.text
        print("✅ File downloaded successfully!")
        print(f" Content length: {len(content)} characters")
        print(f" Preview: {content[:100]}...")
        return content
    except Exception as e:
        # Deliberately broad: a failed download is reported, not raised.
        print(f"❌ File download error: {e}")
        return ""
| |
|
def run_comprehensive_test():
    """Run the end-to-end check: API, agent creation, question, file, answer.

    The steps run in order and the function stops at the first one that
    fails, printing the reason.

    Returns:
        True if every step succeeded and the agent produced a truthy answer,
        False otherwise.
    """
    print("🚀 Starting comprehensive GAIA agent test...")
    print("=" * 60)

    # Step 1: the API must be reachable before anything else makes sense.
    if not test_api_connection():
        print("❌ Cannot proceed without API connection")
        return False

    # Step 2: build the agent configured in config.AGENT_TYPE.
    print(f"\n🤖 Creating {AGENT_TYPE} agent...")
    try:
        agent = create_agent(AGENT_TYPE)
        print(f"✅ {AGENT_TYPE} agent created successfully!")
    except Exception as e:
        print(f"❌ Failed to create agent: {e}")
        return False

    # Step 3: fetch a question to test with.
    question = test_random_question()
    if not question:
        print("❌ Cannot proceed without a test question")
        return False

    # Step 4: exercise the file endpoint. The downloaded content is not
    # needed here, so the return value is intentionally discarded.
    task_id = question.get('task_id', '')
    if task_id:
        test_file_download(task_id)

    # Step 5: let the agent answer the question.
    answer = test_agent_on_question(agent, question)
    if not answer:
        print("❌ Agent failed to generate answer")
        return False

    # Step 6: basic validation. Coerce to text so a truthy non-string answer
    # (e.g. a number) is still validated instead of raising TypeError.
    answer_text = answer if isinstance(answer, str) else str(answer)
    has_marker = 'FINAL ANSWER' in answer_text
    print("\n✅ Answer validation:")
    print(f" Length: {len(answer_text)} characters")
    print(f" Contains 'FINAL ANSWER': {has_marker}")
    if has_marker:
        print(" ⚠️ Warning: Answer contains 'FINAL ANSWER' - remove this for submission!")

    print("\n🎉 Comprehensive test completed successfully!")
    return True
| |
|
def test_multiple_questions(num_questions: int = 3, timeout: float = 30.0):
    """Run the agent on several random questions and print a summary.

    Args:
        num_questions: How many random questions to fetch and attempt.
        timeout: Seconds to wait for each question request before giving up.

    Returns:
        A list with one dict per question that was fetched successfully.
        Each dict has ``task_id``, ``question`` and ``status`` keys plus
        either ``answer`` (status ``'success'``) or ``error`` (status
        ``'error'``). Questions that could not be fetched are not listed but
        still count as failed in the printed summary.
    """
    print(f"\n🔄 Testing agent on {num_questions} random questions...")

    agent = create_agent(AGENT_TYPE)
    results = []

    for i in range(num_questions):
        print(f"\n--- Test {i+1}/{num_questions} ---")

        # Fetch a question. A network or decoding failure only skips this
        # iteration instead of aborting the whole run.
        try:
            response = requests.get(
                "https://gaia-benchmark.vercel.app/api/random-question",
                timeout=timeout,
            )
            if response.status_code != 200:
                print(f"❌ Failed to fetch question {i+1}")
                continue
            question = response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"❌ Failed to fetch question {i+1}: {e}")
            continue

        # str() guards against a missing/None question text in the payload.
        print(f"Question: {str(question.get('question', 'N/A'))[:80]}...")

        # Ask the agent; any failure is recorded rather than raised.
        try:
            answer = agent.generate_answer(question)
            print(f"Answer: {answer[:100]}...")
            results.append({
                'task_id': question.get('task_id'),
                'question': question.get('question'),
                'answer': answer,
                'status': 'success',
            })
        except Exception as e:
            print(f"❌ Error: {e}")
            results.append({
                'task_id': question.get('task_id'),
                'question': question.get('question'),
                'error': str(e),
                'status': 'error',
            })

    successful = sum(1 for r in results if r['status'] == 'success')
    # Avoid ZeroDivisionError when no questions were requested.
    success_rate = (successful / num_questions) * 100 if num_questions > 0 else 0.0
    print("\n📊 Test Summary:")
    print(f" Total questions: {num_questions}")
    print(f" Successful: {successful}")
    print(f" Failed: {num_questions - successful}")
    print(f" Success rate: {success_rate:.1f}%")

    return results
| |
|
def main():
    """Interactive menu that dispatches to the individual test routines.

    Loops until the user chooses option 4. Input is read from stdin and all
    results are printed to stdout.
    """
    print("🧪 GAIA Agent Test Suite")
    print("=" * 40)

    while True:
        print("\nChoose a test option:")
        print("1. Run comprehensive test")
        print("2. Test multiple questions")
        print("3. Test single random question")
        print("4. Exit")

        choice = input("\nEnter your choice (1-4): ").strip()

        if choice == "1":
            run_comprehensive_test()
        elif choice == "2":
            raw_count = input("How many questions to test? (default: 3): ").strip()
            # Only the conversion is guarded: a ValueError raised while the
            # tests run must not be mistaken for bad input (which would
            # print a misleading message and rerun the whole batch).
            try:
                num = int(raw_count) if raw_count else 3
            except ValueError:
                print("Invalid number, using default: 3")
                num = 3
            test_multiple_questions(num)
        elif choice == "3":
            if test_api_connection():
                agent = create_agent(AGENT_TYPE)
                question = test_random_question()
                if question:
                    test_agent_on_question(agent, question)
        elif choice == "4":
            print("👋 Goodbye!")
            break
        else:
            print("Invalid choice. Please enter 1-4.")
| |
|
# Start the interactive menu only when run as a script, not when imported.
if __name__ == "__main__":
    main()