Spaces:

ahmet1338
/

Agent216

Sleeping

App Files Files Community

Agent216 / test_agent.py

ahmet1338

ADD: project files added

35f54b3 8 months ago

raw

history blame contribute delete

7.52 kB

	"""
	Test script for GAIA Agent

	This script helps you test your agent implementation before submitting to the leaderboard.
	Run this to verify your agent works correctly.
	"""

	import requests
	import json
	from typing import Dict, List
	from agent_implementation import create_agent
	from config import AGENT_TYPE

	def test_api_connection(timeout: float = 10.0):
	    """Test connection to the GAIA API.

	    Sends a GET request to the questions endpoint and reports the outcome
	    on stdout.

	    Args:
	        timeout: Seconds to wait for the server before giving up. Without
	            an explicit timeout ``requests`` waits indefinitely on an
	            unresponsive host, which would hang the whole test script.

	    Returns:
	        True if the endpoint answered with HTTP 200 and its JSON body
	        could be decoded and counted; False on any other status code or
	        on any exception raised along the way.
	    """
	    print("🔍 Testing API connection...")

	    try:
	        # Test questions endpoint
	        response = requests.get(
	            "https://gaia-benchmark.vercel.app/api/questions",
	            timeout=timeout,
	        )
	        if response.status_code == 200:
	            questions = response.json()
	            print(f"✅ API connection successful! Found {len(questions)} questions")
	            return True
	        else:
	            print(f"❌ API connection failed with status code: {response.status_code}")
	            return False
	    except Exception as e:
	        # Diagnostic boundary: any failure (network, timeout, bad JSON) is
	        # reported and turned into a False result rather than a traceback.
	        print(f"❌ API connection error: {e}")
	        return False

	def test_random_question(timeout: float = 10.0):
	    """Test fetching a random question.

	    Args:
	        timeout: Seconds to wait for the server before giving up, so an
	            unresponsive host cannot hang the script.

	    Returns:
	        The decoded JSON payload of the random-question endpoint when it
	        answers with HTTP 200, otherwise None (also on any exception).
	    """
	    print("\n🎲 Testing random question fetch...")

	    try:
	        response = requests.get(
	            "https://gaia-benchmark.vercel.app/api/random-question",
	            timeout=timeout,
	        )
	        if response.status_code == 200:
	            question = response.json()
	            print("✅ Random question fetched successfully!")
	            print(f" Task ID: {question.get('task_id', 'N/A')}")
	            # The 'question' field may be present but null; fall back to
	            # 'N/A' and coerce to str so the preview slice cannot raise and
	            # turn a successful fetch into a reported failure.
	            preview = str(question.get('question') or 'N/A')[:100]
	            print(f" Question: {preview}...")
	            return question
	        else:
	            print(f"❌ Failed to fetch random question: {response.status_code}")
	            return None
	    except Exception as e:
	        # Diagnostic boundary: report the problem and signal failure with None.
	        print(f"❌ Error fetching random question: {e}")
	        return None

	def test_agent_on_question(agent, question: Dict):
	"""Test the agent on a specific question"""
	print(f"\n🤖 Testing agent on question...")
	print(f" Task ID: {question.get('task_id', 'N/A')}")
	print(f" Question: {question.get('question', 'N/A')}")

	try:
	answer = agent.generate_answer(question)
	print(f"✅ Agent generated answer:")
	print(f" Answer: {answer}")
	return answer
	except Exception as e:
	print(f"❌ Agent error: {e}")
	return None

	def test_file_download(task_id: str, timeout: float = 30.0):
	    """Test file download functionality.

	    Args:
	        task_id: Identifier interpolated into the files endpoint URL.
	        timeout: Seconds to wait for the server before giving up, so a
	            stalled download cannot hang the script.

	    Returns:
	        The response body as text when the endpoint answers with HTTP 200;
	        an empty string for any other status code or on any exception.
	    """
	    print(f"\n📁 Testing file download for task {task_id}...")

	    try:
	        response = requests.get(
	            f"https://gaia-benchmark.vercel.app/api/files/{task_id}",
	            timeout=timeout,
	        )
	        if response.status_code == 200:
	            content = response.text
	            print("✅ File downloaded successfully!")
	            print(f" Content length: {len(content)} characters")
	            print(f" Preview: {content[:100]}...")
	            return content
	        else:
	            # A non-200 status is reported as a warning, not an error.
	            print(f"⚠️ No file found for task {task_id} (status: {response.status_code})")
	            return ""
	    except Exception as e:
	        # Diagnostic boundary: report and fall back to "no content".
	        print(f"❌ File download error: {e}")
	        return ""

	def run_comprehensive_test():
	    """Walk through every check in sequence, stopping at the first failure.

	    Steps: API connectivity, agent creation, random question fetch,
	    file download (only when the question has a task id), answer
	    generation, and a printed sanity report on the answer.

	    Returns:
	        True when all mandatory steps succeeded, False as soon as one fails.
	        The file download result is not checked.
	    """
	    print("🚀 Starting comprehensive GAIA agent test...")
	    print("=" * 60)

	    # Step 1: the API must be reachable.
	    api_ok = test_api_connection()
	    if not api_ok:
	        print("❌ Cannot proceed without API connection")
	        return False

	    # Step 2: build the configured agent.
	    print(f"\n🤖 Creating {AGENT_TYPE} agent...")
	    try:
	        agent = create_agent(AGENT_TYPE)
	        print(f"✅ {AGENT_TYPE} agent created successfully!")
	    except Exception as exc:
	        print(f"❌ Failed to create agent: {exc}")
	        return False

	    # Step 3: obtain a question to work on.
	    question = test_random_question()
	    if not question:
	        print("❌ Cannot proceed without a test question")
	        return False

	    # Step 4: exercise the file endpoint; its result is informational only.
	    task_id = question.get('task_id', '')
	    if task_id:
	        test_file_download(task_id)

	    # Step 5: let the agent answer.
	    answer = test_agent_on_question(agent, question)
	    if not answer:
	        print("❌ Agent failed to generate answer")
	        return False

	    # Step 6: report on the answer's format.
	    print("\n✅ Answer validation:")
	    print(f" Length: {len(answer)} characters")
	    has_marker = 'FINAL ANSWER' in answer
	    print(f" Contains 'FINAL ANSWER': {has_marker}")
	    if has_marker:
	        print(" ⚠️ Warning: Answer contains 'FINAL ANSWER' - remove this for submission!")

	    print("\n🎉 Comprehensive test completed successfully!")
	    return True

	def test_multiple_questions(num_questions: int = 3, timeout: float = 30.0):
	    """Test the agent on multiple random questions.

	    Creates one agent via ``create_agent(AGENT_TYPE)``, then for each round
	    fetches a random question and asks the agent to answer it. A summary
	    is printed at the end.

	    Args:
	        num_questions: Number of fetch-and-answer rounds to attempt.
	        timeout: Seconds to wait for each question fetch, so a stalled
	            request cannot hang the run.

	    Returns:
	        A list with one dict per question that was fetched. Successful
	        entries carry ``task_id``, ``question``, ``answer`` and
	        ``status='success'``; failed ones carry ``error`` in place of
	        ``answer`` and ``status='error'``. Rounds whose fetch failed add
	        no entry but still count as failed in the printed summary.
	    """
	    print(f"\n🔄 Testing agent on {num_questions} random questions...")

	    agent = create_agent(AGENT_TYPE)
	    results = []

	    for i in range(num_questions):
	        print(f"\n--- Test {i+1}/{num_questions} ---")

	        # Fetch random question. Network errors and undecodable bodies skip
	        # this round instead of aborting the whole run.
	        try:
	            response = requests.get(
	                "https://gaia-benchmark.vercel.app/api/random-question",
	                timeout=timeout,
	            )
	            if response.status_code != 200:
	                print(f"❌ Failed to fetch question {i+1}")
	                continue
	            question = response.json()
	        except (requests.RequestException, ValueError) as e:
	            print(f"❌ Failed to fetch question {i+1}: {e}")
	            continue

	        # The 'question' field may be null; coerce so the preview cannot raise.
	        preview = str(question.get('question') or 'N/A')[:80]
	        print(f"Question: {preview}...")

	        # Generate answer
	        try:
	            answer = agent.generate_answer(question)
	            print(f"Answer: {answer[:100]}...")
	            results.append({
	                'task_id': question.get('task_id'),
	                'question': question.get('question'),
	                'answer': answer,
	                'status': 'success',
	            })
	        except Exception as e:
	            print(f"❌ Error: {e}")
	            results.append({
	                'task_id': question.get('task_id'),
	                'question': question.get('question'),
	                'error': str(e),
	                'status': 'error',
	            })

	    # Summary
	    successful = sum(1 for r in results if r['status'] == 'success')
	    # Guard the ratio: a request for zero (or fewer) questions would
	    # otherwise raise ZeroDivisionError or print a negative rate.
	    success_rate = (successful / num_questions) * 100 if num_questions > 0 else 0.0
	    print("\n📊 Test Summary:")
	    print(f" Total questions: {num_questions}")
	    print(f" Successful: {successful}")
	    print(f" Failed: {num_questions - successful}")
	    print(f" Success rate: {success_rate:.1f}%")

	    return results

	def main():
	    """Main test function.

	    Runs an interactive menu loop on stdin/stdout that dispatches to the
	    individual test routines until the user chooses to exit.
	    """
	    print("🧪 GAIA Agent Test Suite")
	    print("=" * 40)

	    while True:
	        print("\nChoose a test option:")
	        print("1. Run comprehensive test")
	        print("2. Test multiple questions")
	        print("3. Test single random question")
	        print("4. Exit")

	        choice = input("\nEnter your choice (1-4): ").strip()

	        if choice == "1":
	            run_comprehensive_test()
	        elif choice == "2":
	            raw = input("How many questions to test? (default: 3): ").strip()
	            # Guard only the int() conversion. Wrapping the test run as well
	            # would misreport any ValueError raised inside it (for example a
	            # JSON decoding error) as an invalid number and rerun the suite.
	            try:
	                num = int(raw) if raw else 3
	            except ValueError:
	                print("Invalid number, using default: 3")
	                num = 3
	            test_multiple_questions(num)
	        elif choice == "3":
	            if test_api_connection():
	                # Report agent construction failures the same way the
	                # comprehensive test does, then return to the menu.
	                try:
	                    agent = create_agent(AGENT_TYPE)
	                except Exception as e:
	                    print(f"❌ Failed to create agent: {e}")
	                    continue
	                question = test_random_question()
	                if question:
	                    test_agent_on_question(agent, question)
	        elif choice == "4":
	            print("👋 Goodbye!")
	            break
	        else:
	            print("Invalid choice. Please enter 1-4.")

	# Script entry point: start the interactive menu only when this file is
	# executed directly, not when it is imported.
	if __name__ == "__main__":
	    main()