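"""Evaluation harness for a GAIA-style agent.

Loads a GAIA dataset (JSON or JSONL), runs each task through GAIAAgent,
scores normalized exact-match answers, and writes a submission.jsonl file.

Assumes the local ``agent`` module provides a GAIAAgent class exposing the
``process_task(task)`` and ``test_grok()`` methods used below.
"""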
import argparse
import json
import os
from typing import Dict, List, Optional

from agent import GAIAAgent


def normalize_answer(answer: str) -> str:
    """Normalize an answer string for comparison."""
    if not answer:
        return ""
    # Trim surrounding whitespace
    answer = answer.strip()
    # Remove quotes if they wrap the entire answer
    if (answer.startswith('"') and answer.endswith('"')) or (answer.startswith("'") and answer.endswith("'")):
        answer = answer[1:-1]
    # Lowercase so the comparison is case-insensitive
    return answer.lower().strip()
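# For example, normalize_answer('"Paris"') and normalize_answer("PARIS ")
# both yield "paris", so quoting, casing, and stray whitespace differences
# are not counted as errors during scoring.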
def extract_final_answer(response: str) -> str:
    """Extract the final answer from the model response."""
    if "FINAL ANSWER:" in response:
        answer = response.split("FINAL ANSWER:")[1].strip()
        # Clean up the answer - keep only the first line, dropping any trailing explanation
        answer = answer.split('\n')[0].strip()
        return answer
    # If there is no FINAL ANSWER marker, fall back to the last line of the response
    lines = response.strip().split('\n')
    return lines[-1].strip()
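# For example, extract_final_answer("15 + 27 = 42\nFINAL ANSWER: 42") returns
# "42", while a response without the marker falls back to its last line.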
def load_gaia_dataset(dataset_path: str) -> List[Dict]:
    """Load GAIA dataset from a JSON or JSONL file."""
    tasks = []
    if not os.path.exists(dataset_path):
        print(f"Dataset file not found: {dataset_path}")
        return tasks
    try:
        with open(dataset_path, "r", encoding="utf-8") as f:
            if dataset_path.endswith('.jsonl'):
                # JSONL format - one JSON object per line
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if line:
                        try:
                            task = json.loads(line)
                            tasks.append(task)
                        except json.JSONDecodeError as e:
                            print(f"Error parsing line {line_num}: {e}")
            else:
                # Regular JSON format
                data = json.load(f)
                if isinstance(data, list):
                    tasks = data
                elif isinstance(data, dict) and 'tasks' in data:
                    tasks = data['tasks']
                else:
                    print("Unexpected JSON format")
    except Exception as e:
        print(f"Error loading dataset: {e}")
    print(f"Loaded {len(tasks)} tasks from {dataset_path}")
    return tasks
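# Each loaded task record is expected to carry at least "task_id", "question",
# and "answer" (the only keys evaluate_agent() reads), e.g.:
#   {"task_id": "t1", "question": "What is 15 + 27?", "answer": "42"}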
def create_sample_dataset() -> List[Dict]:
    """Create a sample dataset for testing if no GAIA dataset is available."""
    sample_tasks = [
        {
            "task_id": "sample_1",
            "question": "What is 15 + 27?",
            "answer": "42",
            "level": 1,
            "file_name": None
        },
        {
            "task_id": "sample_2",
            "question": "What is the capital of France?",
            "answer": "Paris",
            "level": 1,
            "file_name": None
        },
        {
            "task_id": "sample_3",
            "question": "How many days are in a leap year?",
            "answer": "366",
            "level": 1,
            "file_name": None
        },
        {
            "task_id": "sample_4",
            "question": "What is 2 * 6 * 7?",
            "answer": "84",
            "level": 1,
            "file_name": None
        },
        {
            "task_id": "sample_5",
            "question": "What year did World War II end?",
            "answer": "1945",
            "level": 1,
            "file_name": None
        }
    ]
    print("Using sample dataset for testing")
    return sample_tasks
def evaluate_agent(dataset_path: Optional[str] = None, max_tasks: Optional[int] = None) -> float:
    """Evaluate the GAIA agent on the dataset."""
    # Load dataset
    if dataset_path and os.path.exists(dataset_path):
        tasks = load_gaia_dataset(dataset_path)
    else:
        print("No dataset file found, using sample tasks for testing")
        tasks = create_sample_dataset()
    if not tasks:
        print("No tasks to evaluate")
        return 0.0
    # Limit number of tasks if specified
    if max_tasks:
        tasks = tasks[:max_tasks]
        print(f"Evaluating on first {len(tasks)} tasks")
    # Initialize agent
    print("Initializing GAIA agent...")
    agent = GAIAAgent()
    # Test API connection first
    print("Testing API connection...")
    test_response = agent.test_grok()
    if "error" in test_response.lower():
        print(f"API test failed: {test_response}")
        return 0.0
    else:
        print("API connection successful!")
    # Process tasks
    correct = 0
    total = len(tasks)
    submission_entries = []
    print(f"\nStarting evaluation on {total} tasks...")
    print("=" * 50)
    for i, task in enumerate(tasks, 1):
        task_id = task.get("task_id", f"task_{i}")
        question = task.get("question", "")
        expected_answer = task.get("answer", "")
        print(f"\nTask {i}/{total}: {task_id}")
        print(f"Question: {question[:100]}{'...' if len(question) > 100 else ''}")
        try:
            # Process task with agent
            response = agent.process_task(task)
            predicted_answer = extract_final_answer(response)
            print(f"Expected: {expected_answer}")
            print(f"Predicted: {predicted_answer}")
            # Compare answers (normalized)
            is_correct = normalize_answer(predicted_answer) == normalize_answer(expected_answer)
            if is_correct:
                correct += 1
                print("✅ CORRECT")
            else:
                print("❌ INCORRECT")
            # Store submission entry
            submission_entries.append({
                "task_id": task_id,
                "model_answer": predicted_answer,
                "reasoning_trace": response
            })
        except Exception as e:
            print(f"Error processing task {task_id}: {e}")
            submission_entries.append({
                "task_id": task_id,
                "model_answer": "ERROR",
                "reasoning_trace": f"Error: {str(e)}"
            })
        # Progress update
        current_score = (correct / i) * 100
        print(f"Current score: {correct}/{i} = {current_score:.1f}%")
        print("-" * 30)
    # Final score
    final_score = (correct / total) * 100
    # Save submission file
    try:
        with open("submission.jsonl", "w", encoding="utf-8") as f:
            for entry in submission_entries:
                f.write(json.dumps(entry) + "\n")
        print("\nSubmission saved to submission.jsonl")
    except Exception as e:
        print(f"Error saving submission: {e}")
    # Print final results
    print("=" * 50)
    print("FINAL RESULTS")
    print("=" * 50)
    print(f"Total tasks: {total}")
    print(f"Correct answers: {correct}")
    print(f"Final score: {final_score:.2f}%")
    if final_score >= 30:
        print("🎉 CONGRATULATIONS! Score ≥ 30% - Certificate achieved!")
    else:
        print(f"📉 Score below 30%. Need {30 - final_score:.2f}% more for certificate.")
    return final_score
def main():
    """Main evaluation entry point."""
    parser = argparse.ArgumentParser(description="Evaluate GAIA agent")
    parser.add_argument("--dataset", type=str, default="gaia_test.json",
                        help="Path to GAIA dataset file")
    parser.add_argument("--max-tasks", type=int, default=None,
                        help="Maximum number of tasks to evaluate")
    args = parser.parse_args()
    score = evaluate_agent(args.dataset, args.max_tasks)
    print(f"\nFinal evaluation score: {score:.2f}%")
    if score >= 30:
        print("Certificate requirements met! 🎉")
    else:
        print("Keep working to reach 30% for the certificate! 💪")


if __name__ == "__main__":
    main()
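# Typical invocation, assuming this file is saved as evaluate.py (the filename
# and the gaia_test.jsonl path are placeholders - adjust to your setup):
#
#   python evaluate.py --dataset gaia_test.jsonl --max-tasks 5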