#!/usr/bin/env python3
"""
Quick test script for specific GAIA questions.

Use this to verify fixes without running full evaluation.

Usage:
    uv run python test/test_quick_fixes.py

The process exits with status 0 only when the agent initializes and every
configured question is answered correctly; otherwise it exits with status 1.
"""

import os
import sys

# Add project root to path (must happen before the project import below).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from src.agent.graph import GAIAAgent
from dotenv import load_dotenv

# Load environment variables
# NOTE(review): this runs after src.agent.graph has been imported; if that
# module reads environment variables at import time, values from .env would
# not be visible to it yet -- confirm against the agent module.
load_dotenv()

# ============================================================================
# CONFIG - Questions to test
# ============================================================================

TEST_QUESTIONS = [
    {
        "task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
        "name": "Reverse sentence (calculator threading fix)",
        "question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
        "expected": "Right",
    },
    {
        "task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
        "name": "Table commutativity (LLM issue - table in question)",
        "question": '''Given this table defining * on the set S = {a, b, c, d, e}

|*|a|b|c|d|e|
|---|---|---|---|---|
|a|a|b|c|b|d|
|b|b|c|a|e|c|
|c|c|a|b|b|a|
|d|b|e|b|e|d|
|e|d|b|a|d|c|

provide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.''',
        "expected": "b, e",
    },
]

# ============================================================================


def _truncate(text: str, limit: int) -> str:
    """Return *text* unchanged if it fits in *limit* chars, else cut it and append "..."."""
    return text[:limit] + "..." if len(text) > limit else text


def _status_icon(result: dict) -> str:
    """Map a result dict to its icon: correct, system error, or wrong answer."""
    if result["correct"]:
        return "✅"
    return "⚠️" if result["system_error"] == "yes" else "❌"


def test_question(agent: GAIAAgent, test_case: dict) -> dict:
    """Run the agent on one test case, print the outcome, and return a result dict.

    Args:
        agent: Callable invoked as ``agent(question, file_path=None)``.
        test_case: Dict with ``task_id``, ``name`` and ``question`` keys and an
            optional ``expected`` key (defaults to ``"N/A"``).

    Returns:
        A dict with the keys ``task_id``, ``name``, ``question`` (truncated to
        100 characters), ``expected``, ``answer``, ``correct``, ``status``
        (``"success"`` or ``"error"``) and ``system_error`` (``"yes"`` or
        ``"no"``), plus ``error_log`` when an error message is available.
    """
    task_id = test_case["task_id"]
    question = test_case["question"]
    expected = test_case.get("expected", "N/A")

    print(f"\n{'='*60}")
    print(f"Testing: {test_case['name']}")
    print(f"Task ID: {task_id}")
    print(f"Expected: {expected}")
    print(f"{'='*60}")

    result = {
        "task_id": task_id,
        "name": test_case["name"],
        "question": _truncate(question, 100),
        "expected": expected,
    }

    try:
        raw_answer = agent(question, file_path=None)
    except Exception as e:
        # Any failure inside the agent is reported as a system error rather
        # than aborting the remaining test cases.
        result.update(
            answer=f"ERROR: {str(e)}",
            correct=False,
            status="error",
            system_error="yes",
            error_log=str(e),
        )
    else:
        # Normalize before comparing: a None (or non-string) answer would
        # otherwise raise on .strip() and never reach the empty-answer check.
        answer = "" if raw_answer is None else str(raw_answer)
        lowered = answer.lower()

        # Case-insensitive match, ignoring surrounding whitespace. ``expected``
        # is coerced so a numeric expected value does not raise.
        result["answer"] = answer
        result["correct"] = lowered.strip() == str(expected).lower().strip()
        result["status"] = "success"

        # Determine system error
        if not answer:
            result["system_error"] = "yes"
        elif lowered.startswith("error:") or "no evidence collected" in lowered:
            result["system_error"] = "yes"
            result["error_log"] = answer
        else:
            result["system_error"] = "no"

    # Print result
    print(f"\n{_status_icon(result)} Result: {result['answer'][:100]}")
    if result["system_error"] == "yes":
        print(" System Error: Yes")
        if result.get("error_log"):
            print(f" Error: {result['error_log'][:100]}")

    return result


def main() -> int:
    """Run quick tests on specific questions.

    Returns:
        0 when the agent initialized and every question was answered
        correctly, 1 otherwise (suitable as a process exit status).
    """
    print("\n" + "="*60)
    print("GAIA Quick Test - Verify Fixes")
    print("="*60)

    # Check LLM provider
    llm_provider = os.getenv("LLM_PROVIDER", "gemini")
    print(f"\nLLM Provider: {llm_provider}")

    # Initialize agent
    print("\nInitializing agent...")
    try:
        agent = GAIAAgent()
    except Exception as e:
        print(f"❌ Agent initialization failed: {e}")
        return 1
    print("✅ Agent initialized")

    # Run tests
    results = [test_question(agent, test_case) for test_case in TEST_QUESTIONS]

    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")

    success_count = sum(1 for r in results if r["correct"])
    error_count = sum(1 for r in results if r["system_error"] == "yes")
    ai_fail_count = sum(
        1 for r in results if r["system_error"] == "no" and not r["correct"]
    )

    print(f"\nTotal: {len(results)}")
    print(f"✅ Correct: {success_count}")
    print(f"⚠️ System Errors: {error_count}")
    print(f"❌ AI Wrong: {ai_fail_count}")

    # Detailed results
    print("\nDetailed Results:")
    for r in results:
        print(f" {_status_icon(r)} {r['name']}: {_truncate(r['answer'], 50)}")

    return 0 if success_count == len(results) else 1


if __name__ == "__main__":
    sys.exit(main())