agentbee / test /test_quick_fixes.py
mangubee's picture
feat: system error field, calculator fix, target task IDs, course vs GAIA docs
41ac444
#!/usr/bin/env python3
"""
Quick test script for specific GAIA questions.
Use this to verify fixes without running full evaluation.
Usage:
uv run python test/test_quick_fixes.py
"""
import os
import sys
# Add project root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from src.agent.graph import GAIAAgent
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# ============================================================================
# CONFIG - Questions to test
# ============================================================================
# Questions replayed by this script. Each entry carries the task id, a short
# human-readable label, the prompt text handed to the agent, and the answer the
# run is compared against (case-insensitively, ignoring surrounding whitespace).
TEST_QUESTIONS = [
    # Prompt is written backwards on purpose; the text itself is the puzzle.
    {
        "task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
        "name": "Reverse sentence (calculator threading fix)",
        "question": '.rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI',
        "expected": "Right",
    },
    # Prompt embeds a markdown operation table directly in the question text.
    # NOTE(review): the separator row has five cells while the header row has
    # six - confirm against the upstream question before "fixing" it, since
    # the agent is meant to see the prompt verbatim.
    {
        "task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
        "name": "Table commutativity (LLM issue - table in question)",
        "question": (
            "Given this table defining * on the set S = {a, b, c, d, e}\n"
            "|*|a|b|c|d|e|\n"
            "|---|---|---|---|---|\n"
            "|a|a|b|c|b|d|\n"
            "|b|b|c|a|e|c|\n"
            "|c|c|a|b|b|a|\n"
            "|d|b|e|b|e|d|\n"
            "|e|d|b|a|d|c|\n"
            "provide the subset of S involved in any possible counter-examples"
            " that prove * is not commutative. Provide your answer as a comma"
            " separated list of the elements in the set in alphabetical order."
        ),
        "expected": "b, e",
    },
]
# ============================================================================
def test_question(agent: "GAIAAgent", test_case: dict) -> dict:
    """Run one test case through the agent and return a result record.

    Args:
        agent: Callable invoked as ``agent(question, file_path=None)``. The
            annotation is quoted so it is not evaluated at definition time.
        test_case: Mapping with ``task_id``, ``name`` and ``question`` keys
            and an optional ``expected`` answer (defaults to ``"N/A"``).

    Returns:
        A dict with ``task_id``, ``name``, ``question`` (cut to 100 characters
        plus ``...`` when longer), ``expected``, ``answer``, ``correct``,
        ``status`` (``"success"`` or ``"error"``) and ``system_error``
        (``"yes"`` or ``"no"``). ``error_log`` is added when the agent raised
        or when its answer looks like an error message.
    """
    task_id = test_case["task_id"]
    question = test_case["question"]
    expected = test_case.get("expected", "N/A")

    print(f"\n{'='*60}")
    print(f"Testing: {test_case['name']}")
    print(f"Task ID: {task_id}")
    print(f"Expected: {expected}")
    print(f"{'='*60}")

    # Fields shared by the success and failure records are built once.
    result = {
        "task_id": task_id,
        "name": test_case["name"],
        "question": question[:100] + "..." if len(question) > 100 else question,
        "expected": expected,
    }

    try:
        # Only the agent call is guarded: it is the one step expected to fail.
        raw_answer = agent(question, file_path=None)
    except Exception as e:
        # Broad catch is deliberate: one failing question must not abort the
        # remaining ones, and the failure is recorded rather than swallowed.
        result.update(
            {
                "answer": f"ERROR: {str(e)}",
                "correct": False,
                "status": "error",
                "system_error": "yes",
                "error_log": str(e),
            }
        )
    else:
        # A missing answer is normalised to "" so it reaches the empty-answer
        # branch below instead of crashing on ``None.strip()``.
        answer = "" if raw_answer is None else str(raw_answer)
        lowered = answer.lower()
        result.update(
            {
                "answer": answer,
                "correct": lowered.strip() == str(expected).lower().strip(),
                "status": "success",
            }
        )

        # Determine system error
        if not answer:
            result["system_error"] = "yes"
        elif lowered.startswith("error:") or "no evidence collected" in lowered:
            result["system_error"] = "yes"
            result["error_log"] = answer
        else:
            result["system_error"] = "no"

    # Print result
    if result["correct"]:
        status_icon = "✅"
    elif result["system_error"] == "no":
        status_icon = "❌"
    else:
        status_icon = "⚠️"
    print(f"\n{status_icon} Result: {result['answer'][:100]}")
    if result["system_error"] == "yes":
        print(" System Error: Yes")
        if result.get("error_log"):
            print(f" Error: {result['error_log'][:100]}")
    return result


# The name matches pytest's ``test_*`` pattern, but this is a helper that needs
# explicit arguments; opt out of collection so pytest does not try to resolve
# ``agent`` and ``test_case`` as fixtures.
test_question.__test__ = False
def main():
    """Replay every entry of TEST_QUESTIONS through one agent and print a report.

    Prints a banner and the configured LLM provider, builds the agent (giving
    up early with a message if construction raises), runs each question in
    order, then prints totals followed by a one-line digest per question.
    """
    rule = "=" * 60

    print("\n" + rule)
    print("GAIA Quick Test - Verify Fixes")
    print(rule)

    # Provider name is read from the environment; "gemini" is shown when unset.
    print(f"\nLLM Provider: {os.getenv('LLM_PROVIDER', 'gemini')}")

    # Build the agent once and reuse it for every question.
    print("\nInitializing agent...")
    try:
        agent = GAIAAgent()
        print("✅ Agent initialized")
    except Exception as e:
        print(f"❌ Agent initialization failed: {e}")
        return

    # Run the questions in their configured order.
    results = [test_question(agent, case) for case in TEST_QUESTIONS]

    # Tally the three outcome buckets in a single pass.
    correct_total = 0
    system_error_total = 0
    wrong_total = 0
    for record in results:
        if record["correct"]:
            correct_total += 1
        if record["system_error"] == "yes":
            system_error_total += 1
        elif record["system_error"] == "no" and not record["correct"]:
            wrong_total += 1

    print(f"\n{rule}")
    print("SUMMARY")
    print(f"{rule}")
    print(f"\nTotal: {len(results)}")
    print(f"✅ Correct: {correct_total}")
    print(f"⚠️ System Errors: {system_error_total}")
    print(f"❌ AI Wrong: {wrong_total}")

    # One line per question, with the answer clipped to 50 characters.
    print("\nDetailed Results:")
    for record in results:
        if record["correct"]:
            icon = "✅"
        elif record["system_error"] == "yes":
            icon = "⚠️"
        else:
            icon = "❌"
        answer_text = record["answer"]
        ellipsis = "..." if len(answer_text) > 50 else ""
        print(f" {icon} {record['name']}: {answer_text[:50]}{ellipsis}")
# Run the quick tests only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()