"""
Test script for GAIA Agent

This script helps you test your agent implementation before submitting to
the leaderboard. Run this to verify your agent works correctly.
"""

import json
from typing import Dict, List, Optional

import requests

from agent_implementation import create_agent
from config import AGENT_TYPE

# Root of the GAIA benchmark HTTP API; every endpoint used below hangs off it.
API_BASE_URL = "https://gaia-benchmark.vercel.app/api"

# Seconds to wait for the API before giving up. requests has no default
# timeout, so without this a stalled server would block the script forever.
REQUEST_TIMEOUT = 30


def _api_get(path: str) -> requests.Response:
    """Issue a GET against ``API_BASE_URL/path`` with the shared timeout."""
    return requests.get(f"{API_BASE_URL}/{path}", timeout=REQUEST_TIMEOUT)


def test_api_connection() -> bool:
    """Test connection to the GAIA API.

    Returns:
        True when the questions endpoint answers with HTTP 200 and a JSON
        body, False on any other status code or on any error.
    """
    print("๐Ÿ” Testing API connection...")
    try:
        # Test questions endpoint
        response = _api_get("questions")
        if response.status_code == 200:
            questions = response.json()
            print(f"โœ… API connection successful! Found {len(questions)} questions")
            return True
        print(f"โŒ API connection failed with status code: {response.status_code}")
        return False
    except Exception as e:
        # Diagnostic tool: report whatever went wrong instead of crashing.
        print(f"โŒ API connection error: {e}")
        return False


def test_random_question() -> Optional[Dict]:
    """Test fetching a random question.

    Returns:
        The decoded JSON question on HTTP 200, otherwise None (non-200
        status or any error while fetching/decoding).
    """
    print("\n๐ŸŽฒ Testing random question fetch...")
    try:
        response = _api_get("random-question")
        if response.status_code == 200:
            question = response.json()
            print("โœ… Random question fetched successfully!")
            print(f" Task ID: {question.get('task_id', 'N/A')}")
            print(f" Question: {question.get('question', 'N/A')[:100]}...")
            return question
        print(f"โŒ Failed to fetch random question: {response.status_code}")
        return None
    except Exception as e:
        print(f"โŒ Error fetching random question: {e}")
        return None


def test_agent_on_question(agent, question: Dict):
    """Test the agent on a specific question.

    Args:
        agent: Object exposing ``generate_answer(question)``.
        question: Question mapping; ``task_id`` and ``question`` keys are
            printed when present.

    Returns:
        Whatever ``agent.generate_answer`` returns, or None if it raised.
    """
    print("\n๐Ÿค– Testing agent on question...")
    print(f" Task ID: {question.get('task_id', 'N/A')}")
    print(f" Question: {question.get('question', 'N/A')}")
    try:
        answer = agent.generate_answer(question)
        print("โœ… Agent generated answer:")
        print(f" Answer: {answer}")
        return answer
    except Exception as e:
        # The agent is user code under test; any failure is a test result.
        print(f"โŒ Agent error: {e}")
        return None


def test_file_download(task_id: str) -> str:
    """Test file download functionality.

    Args:
        task_id: Identifier appended to the ``files/`` endpoint.

    Returns:
        The response body as text on HTTP 200, otherwise an empty string
        (non-200 status or any error).
    """
    print(f"\n๐Ÿ“ Testing file download for task {task_id}...")
    try:
        response = _api_get(f"files/{task_id}")
        if response.status_code == 200:
            content = response.text
            print("โœ… File downloaded successfully!")
            print(f" Content length: {len(content)} characters")
            print(f" Preview: {content[:100]}...")
            return content
        print(f"โš ๏ธ No file found for task {task_id} (status: {response.status_code})")
        return ""
    except Exception as e:
        print(f"โŒ File download error: {e}")
        return ""


def run_comprehensive_test() -> bool:
    """Run a comprehensive test of the agent.

    Runs, in order: API connectivity, agent creation, random question
    fetch, file download (only when the question carries a task id), agent
    answer generation, and a check of the answer's format.

    Returns:
        True when every mandatory step succeeded, False as soon as one
        fails. The file download step is informational and never fails the
        run.
    """
    print("๐Ÿš€ Starting comprehensive GAIA agent test...")
    print("=" * 60)

    # Test 1: API Connection
    if not test_api_connection():
        print("โŒ Cannot proceed without API connection")
        return False

    # Test 2: Create agent
    print(f"\n๐Ÿค– Creating {AGENT_TYPE} agent...")
    try:
        agent = create_agent(AGENT_TYPE)
        print(f"โœ… {AGENT_TYPE} agent created successfully!")
    except Exception as e:
        print(f"โŒ Failed to create agent: {e}")
        return False

    # Test 3: Fetch random question
    question = test_random_question()
    if not question:
        print("โŒ Cannot proceed without a test question")
        return False

    # Test 4: Test file download (result is only printed, not used further)
    task_id = question.get('task_id', '')
    if task_id:
        test_file_download(task_id)

    # Test 5: Test agent on question
    answer = test_agent_on_question(agent, question)
    if not answer:
        print("โŒ Agent failed to generate answer")
        return False

    # Test 6: Validate answer format
    has_final_answer = 'FINAL ANSWER' in answer
    print("\nโœ… Answer validation:")
    print(f" Length: {len(answer)} characters")
    print(f" Contains 'FINAL ANSWER': {has_final_answer}")
    if has_final_answer:
        print(" โš ๏ธ Warning: Answer contains 'FINAL ANSWER' - remove this for submission!")

    print("\n๐ŸŽ‰ Comprehensive test completed successfully!")
    return True


def test_multiple_questions(num_questions: int = 3) -> List[Dict]:
    """Test the agent on multiple random questions.

    Args:
        num_questions: How many random questions to fetch and answer.
            Values below 1 run nothing.

    Returns:
        One dict per question that was fetched, with ``task_id``,
        ``question``, ``status`` (``'success'`` or ``'error'``) and either
        ``answer`` or ``error``. Questions that could not be fetched are
        not listed but still count as failed in the printed summary. An
        empty list is returned when the agent cannot be created or
        ``num_questions`` is below 1.
    """
    print(f"\n๐Ÿ”„ Testing agent on {num_questions} random questions...")

    if num_questions < 1:
        # Nothing to run; also avoids dividing by zero in the summary.
        print("Invalid number of questions, nothing to test.")
        return []

    try:
        agent = create_agent(AGENT_TYPE)
    except Exception as e:
        print(f"โŒ Failed to create agent: {e}")
        return []

    results = []
    for i in range(num_questions):
        print(f"\n--- Test {i+1}/{num_questions} ---")

        # Fetch random question; a network or decoding failure only skips
        # this iteration instead of aborting the whole run.
        try:
            response = _api_get("random-question")
            if response.status_code != 200:
                print(f"โŒ Failed to fetch question {i+1}")
                continue
            question = response.json()
        except (requests.RequestException, ValueError):
            print(f"โŒ Failed to fetch question {i+1}")
            continue

        print(f"Question: {question.get('question', 'N/A')[:80]}...")

        # Generate answer
        try:
            answer = agent.generate_answer(question)
            print(f"Answer: {answer[:100]}...")
            results.append({
                'task_id': question.get('task_id'),
                'question': question.get('question'),
                'answer': answer,
                'status': 'success',
            })
        except Exception as e:
            print(f"โŒ Error: {e}")
            results.append({
                'task_id': question.get('task_id'),
                'question': question.get('question'),
                'error': str(e),
                'status': 'error',
            })

    # Summary
    successful = sum(1 for r in results if r['status'] == 'success')
    print("\n๐Ÿ“Š Test Summary:")
    print(f" Total questions: {num_questions}")
    print(f" Successful: {successful}")
    print(f" Failed: {num_questions - successful}")
    print(f" Success rate: {(successful/num_questions)*100:.1f}%")

    return results


def main():
    """Main test function: interactive menu driving the tests above."""
    print("๐Ÿงช GAIA Agent Test Suite")
    print("=" * 40)

    while True:
        print("\nChoose a test option:")
        print("1. Run comprehensive test")
        print("2. Test multiple questions")
        print("3. Test single random question")
        print("4. Exit")

        try:
            choice = input("\nEnter your choice (1-4): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C at the prompt: leave like option 4.
            print()
            print("๐Ÿ‘‹ Goodbye!")
            break

        if choice == "1":
            run_comprehensive_test()
        elif choice == "2":
            raw = input("How many questions to test? (default: 3): ").strip()
            # Only the parsing is guarded, so a ValueError raised while the
            # tests run is not mistaken for bad user input.
            try:
                num = int(raw) if raw else 3
            except ValueError:
                print("Invalid number, using default: 3")
                num = 3
            test_multiple_questions(num)
        elif choice == "3":
            if test_api_connection():
                try:
                    agent = create_agent(AGENT_TYPE)
                except Exception as e:
                    print(f"โŒ Failed to create agent: {e}")
                    continue
                question = test_random_question()
                if question:
                    test_agent_on_question(agent, question)
        elif choice == "4":
            print("๐Ÿ‘‹ Goodbye!")
            break
        else:
            print("Invalid choice. Please enter 1-4.")


if __name__ == "__main__":
    main()