"""
Test Production Fixes for GAIA Agent System
Quick validation that error handling improvements are working
"""

import logging
import sys
import time
from typing import Any, Dict

from models.qwen_client import QwenClient
from workflow.gaia_workflow import SimpleGAIAWorkflow

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class ProductionFixTester:
    """Test the production fixes for error handling and robustness"""

    def __init__(self):
        try:
            self.llm_client = QwenClient()
            self.workflow = SimpleGAIAWorkflow(self.llm_client)
            logger.info("✅ Test environment initialized")
        except Exception as e:
            logger.error(f"❌ Failed to initialize test environment: {e}")
            raise

    def test_error_handling_scenarios(self) -> Dict[str, Any]:
        """Test various error scenarios that were causing production failures"""

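        # Each scenario targets a failure mode that previously caused production
        # failures; "expected_behavior" documents the graceful outcome we look for.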
        test_scenarios = [
            {
                "name": "Wikipedia Research Failure Simulation",
                "question": "What is the most obscure fictional character from the imaginary book 'Zzzzz12345NonExistent'?",
                "expected_behavior": "Should fail gracefully and provide fallback response"
            },
            {
                "name": "Mathematical Reasoning with Complex Data",
                "question": "Calculate the square root of negative infinity divided by zero plus the factorial of pi",
                "expected_behavior": "Should handle impossible math gracefully"
            },
            {
                "name": "Conversion with Invalid Units",
                "question": "Convert 50 zorkples to flibbers using the international zorkple standard",
                "expected_behavior": "Should recognize invalid units and respond appropriately"
            },
            {
                "name": "Web Research with Rate Limiting Simulation",
                "question": "What are the current stock prices for all Fortune 500 companies as of this exact moment?",
                "expected_behavior": "Should handle external API limitations gracefully"
            },
            {
                "name": "Complex Multi-Agent Question",
                "question": "Analyze the correlation between quantum entanglement and the price of tea in 17th century Mongolia while also calculating the fibonacci sequence backwards from infinity",
                "expected_behavior": "Should route to multiple agents and synthesize results"
            }
        ]

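        # Aggregate pass/fail counts plus per-scenario details for the final report.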
        results = {
            "test_summary": {
                "total_tests": len(test_scenarios),
                "passed": 0,
                "failed": 0,
                "errors": []
            },
            "detailed_results": []
        }

        for i, scenario in enumerate(test_scenarios, 1):
            logger.info(f"\n🧪 Test {i}/{len(test_scenarios)}: {scenario['name']}")
            logger.info(f"Question: {scenario['question']}")

            start_time = time.time()

            try:
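                # Run the full workflow; task_id tags the run for traceability.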
                result_state = self.workflow.process_question(
                    question=scenario['question'],
                    task_id=f"fix_test_{i}"
                )

                processing_time = time.time() - start_time

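                # Score the outcome against the scenario's expected behavior.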
                test_result = self._analyze_test_result(scenario, result_state, processing_time)
                results["detailed_results"].append(test_result)

                if test_result["passed"]:
                    results["test_summary"]["passed"] += 1
                    logger.info(f"✅ PASSED: {test_result['reason']}")
                else:
                    results["test_summary"]["failed"] += 1
                    logger.warning(f"❌ FAILED: {test_result['reason']}")

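                # Surface the key metrics for every run, pass or fail.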
                logger.info(f"   📊 Confidence: {result_state.final_confidence:.2f}")
                logger.info(f"   ⏱️ Time: {processing_time:.2f}s")
                logger.info(f"   💰 Cost: ${result_state.total_cost:.4f}")
                logger.info(f"   🎯 Answer: {(result_state.final_answer or '')[:100]}...")

            except Exception as e:
                error_msg = f"Exception in test {i}: {str(e)}"
                logger.error(f"❌ ERROR: {error_msg}")
                results["test_summary"]["errors"].append(error_msg)
                results["test_summary"]["failed"] += 1

                results["detailed_results"].append({
                    "test_name": scenario['name'],
                    "passed": False,
                    "reason": f"Test exception: {str(e)}",
                    "processing_time": time.time() - start_time,
                    "confidence": 0.0,
                    "answer": "Test failed with exception"
                })

        return results

    def _analyze_test_result(self, scenario: Dict[str, Any], result_state, processing_time: float) -> Dict[str, Any]:
        """Analyze if a test result meets expectations for error handling"""

        test_result = {
            "test_name": scenario['name'],
            "passed": False,
            "reason": "",
            "processing_time": processing_time,
            "confidence": result_state.final_confidence,
            "answer": result_state.final_answer,
            "agents_used": [role.value for role in result_state.agent_results.keys()],
            "error_count": len(result_state.error_messages)
        }

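        # Hard failure: the workflow produced no answer at all.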
        if not result_state.final_answer:
            test_result["reason"] = "Critical failure: No answer generated"
            return test_result

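        # Phrases that signal an outright crash rather than graceful degradation.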
        crash_indicators = [
            "system not initialized",
            "workflow execution failed",
            "unable to process question - no agent results available"
        ]

        answer_lower = result_state.final_answer.lower()
        if any(indicator in answer_lower for indicator in crash_indicators):
            test_result["reason"] = "System crash detected in response"
            return test_result

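        # Phrases that show the system acknowledged the problem and degraded gracefully.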
        graceful_indicators = [
            "processing encountered difficulties",
            "research sources failed",
            "reasoning failed",
            "conversion failed",
            "mathematical complexity",
            "limited information available"
        ]

        has_graceful_handling = any(indicator in answer_lower for indicator in graceful_indicators)

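        # Tiered acceptance: graceful degradation, a confident direct answer,
        # or at least a stable processing attempt all count as a pass.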
        if has_graceful_handling and result_state.final_confidence >= 0.1:
            test_result["passed"] = True
            test_result["reason"] = "Graceful error handling with reasonable confidence"
        elif not has_graceful_handling and result_state.final_confidence >= 0.3:
            test_result["passed"] = True
            test_result["reason"] = "Provided meaningful answer with acceptable confidence"
        elif result_state.final_confidence > 0.0 and len(result_state.agent_results) > 0:
            test_result["passed"] = True
            test_result["reason"] = "System remained stable and attempted processing"
        else:
            test_result["reason"] = f"Insufficient error handling or system instability (confidence: {result_state.final_confidence:.2f})"

        return test_result

    def run_comprehensive_test(self) -> None:
        """Run comprehensive test and report results"""

        logger.info("🚀 Starting Production Fix Validation Tests")
        logger.info("=" * 60)

        start_time = time.time()

        try:
            results = self.test_error_handling_scenarios()
            total_time = time.time() - start_time

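            # Log the aggregate summary.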
            summary = results["test_summary"]
            logger.info("\n" + "=" * 60)
            logger.info("📋 TEST SUMMARY")
            logger.info("=" * 60)
            logger.info(f"Total Tests: {summary['total_tests']}")
            logger.info(f"✅ Passed: {summary['passed']}")
            logger.info(f"❌ Failed: {summary['failed']}")
            logger.info(f"⚠️ Errors: {len(summary['errors'])}")
            logger.info(f"📊 Success Rate: {summary['passed'] / summary['total_tests'] * 100:.1f}%")
            logger.info(f"⏱️ Total Time: {total_time:.2f}s")

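            # Overall verdict: at least 80% of scenarios must pass.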
            success_rate = summary['passed'] / summary['total_tests']
            if success_rate >= 0.8:
                logger.info("🎉 PRODUCTION FIXES VALIDATION: PASSED")
                logger.info("System demonstrates robust error handling and graceful degradation")
            else:
                logger.warning("⚠️ PRODUCTION FIXES VALIDATION: NEEDS IMPROVEMENT")
                logger.warning(f"Success rate {success_rate * 100:.1f}% is below the 80% threshold")

            if summary['errors']:
                logger.error("\n🔥 ERRORS ENCOUNTERED:")
                for error in summary['errors']:
                    logger.error(f"  - {error}")

        except Exception as e:
            logger.error(f"❌ Comprehensive test failed: {str(e)}")
            raise


def main():
    """Main test execution"""
    try:
        tester = ProductionFixTester()
        tester.run_comprehensive_test()
    except Exception as e:
        logger.error(f"Test execution failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()