"""
Test Production Fixes for GAIA Agent System.

Quick validation that the error-handling improvements are working.
"""

import logging
import sys
import time
from typing import List, Dict, Any

from models.qwen_client import QwenClient
from workflow.gaia_workflow import SimpleGAIAWorkflow

# Configure logging at import time so every scenario's output carries a
# timestamp, logger name and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)


class ProductionFixTester:
    """Test the production fixes for error handling and robustness.

    Each scenario question is sent through ``self.workflow.process_question``
    and the returned state is classified as pass/fail by
    ``_analyze_test_result``.
    """

    # Lower-case substrings of the final answer that mean the system crashed
    # rather than degrading gracefully.
    _CRASH_INDICATORS = (
        "system not initialized",
        "workflow execution failed",
        "unable to process question - no agent results available",
    )

    # Lower-case substrings of the final answer that mean a failure was
    # detected and reported to the user in a controlled way.
    _GRACEFUL_INDICATORS = (
        "processing encountered difficulties",
        "research sources failed",
        "reasoning failed",
        "conversion failed",
        "mathematical complexity",
        "limited information available",
    )

    def __init__(self):
        """Create the LLM client and the workflow under test.

        Raises:
            Exception: Any error raised while constructing ``QwenClient`` or
                ``SimpleGAIAWorkflow``; it is logged and re-raised unchanged.
        """
        try:
            self.llm_client = QwenClient()
            self.workflow = SimpleGAIAWorkflow(self.llm_client)
            logger.info("✅ Test environment initialized")
        except Exception as e:
            logger.error(f"❌ Failed to initialize test environment: {e}")
            raise

    def test_error_handling_scenarios(self) -> Dict[str, Any]:
        """Test various error scenarios that were causing production failures.

        Returns:
            A dict with two keys: ``"test_summary"`` (``total_tests``,
            ``passed``, ``failed`` counters and an ``errors`` list of
            messages) and ``"detailed_results"`` (one record per scenario).
            Every scenario contributes exactly one record and is counted
            exactly once as passed or failed.
        """
        test_scenarios = [
            {
                "name": "Wikipedia Research Failure Simulation",
                "question": "What is the most obscure fictional character from the imaginary book 'Zzzzz12345NonExistent'?",
                "expected_behavior": "Should fail gracefully and provide fallback response",
            },
            {
                "name": "Mathematical Reasoning with Complex Data",
                "question": "Calculate the square root of negative infinity divided by zero plus the factorial of pi",
                "expected_behavior": "Should handle impossible math gracefully",
            },
            {
                "name": "Conversion with Invalid Units",
                "question": "Convert 50 zorkples to flibbers using the international zorkple standard",
                "expected_behavior": "Should recognize invalid units and respond appropriately",
            },
            {
                "name": "Web Research with Rate Limiting Simulation",
                "question": "What are the current stock prices for all Fortune 500 companies as of this exact moment?",
                "expected_behavior": "Should handle external API limitations gracefully",
            },
            {
                "name": "Complex Multi-Agent Question",
                "question": "Analyze the correlation between quantum entanglement and the price of tea in 17th century Mongolia while also calculating the fibonacci sequence backwards from infinity",
                "expected_behavior": "Should route to multiple agents and synthesize results",
            },
        ]

        results = {
            "test_summary": {
                "total_tests": len(test_scenarios),
                "passed": 0,
                "failed": 0,
                "errors": [],
            },
            "detailed_results": [],
        }
        summary = results["test_summary"]

        for i, scenario in enumerate(test_scenarios, 1):
            logger.info(f"\n🧪 Test {i}/{len(test_scenarios)}: {scenario['name']}")
            logger.info(f"Question: {scenario['question']}")

            start_time = time.time()

            # Only the workflow call and the analysis are guarded here.  The
            # bookkeeping below must not run inside this try: a failure while
            # logging would otherwise count the scenario twice.
            try:
                result_state = self.workflow.process_question(
                    question=scenario['question'],
                    task_id=f"fix_test_{i}",
                )
                processing_time = time.time() - start_time
                test_result = self._analyze_test_result(scenario, result_state, processing_time)
            except Exception as e:
                error_msg = f"Exception in test {i}: {str(e)}"
                logger.error(f"❌ ERROR: {error_msg}")
                summary["errors"].append(error_msg)
                summary["failed"] += 1
                results["detailed_results"].append({
                    "test_name": scenario['name'],
                    "passed": False,
                    "reason": f"Test exception: {str(e)}",
                    "processing_time": time.time() - start_time,
                    "confidence": 0.0,
                    "answer": "Test failed with exception",
                })
                continue

            results["detailed_results"].append(test_result)
            if test_result["passed"]:
                summary["passed"] += 1
                logger.info(f"✅ PASSED: {test_result['reason']}")
            else:
                summary["failed"] += 1
                logger.warning(f"❌ FAILED: {test_result['reason']}")

            # Detail logging is informational only; a malformed state (e.g. a
            # missing or None field) must not change the recorded outcome.
            try:
                self._log_result_details(result_state, processing_time)
            except (AttributeError, TypeError, ValueError) as e:
                logger.warning(f"Could not log result details for test {i}: {e}")

        return results

    @staticmethod
    def _log_result_details(result_state, processing_time: float) -> None:
        """Log confidence, elapsed time, cost and an answer preview."""
        # final_answer may be None (the "no answer generated" case), so fall
        # back to an empty string before slicing.
        answer_preview = (result_state.final_answer or "")[:100]
        logger.info(f" 📊 Confidence: {result_state.final_confidence:.2f}")
        logger.info(f" ⏱️ Time: {processing_time:.2f}s")
        logger.info(f" 💰 Cost: ${result_state.total_cost:.4f}")
        logger.info(f" 🎯 Answer: {answer_preview}...")

    def _analyze_test_result(self, scenario: Dict[str, Any], result_state, processing_time: float) -> Dict[str, Any]:
        """Analyze if a test result meets expectations for error handling.

        Args:
            scenario: Scenario definition; only ``scenario['name']`` is read.
            result_state: State returned by the workflow.  This method reads
                ``final_answer``, ``final_confidence``, ``agent_results``
                (keys expose ``.value``) and ``error_messages``.
            processing_time: Elapsed seconds for the scenario.

        Returns:
            A record with ``test_name``, ``passed``, ``reason``,
            ``processing_time``, ``confidence``, ``answer``, ``agents_used``
            and ``error_count``.
        """
        test_result = {
            "test_name": scenario['name'],
            "passed": False,
            "reason": "",
            "processing_time": processing_time,
            "confidence": result_state.final_confidence,
            "answer": result_state.final_answer,
            "agents_used": [role.value for role in result_state.agent_results.keys()],
            "error_count": len(result_state.error_messages),
        }

        # No answer at all is a hard failure regardless of confidence.
        if result_state.final_answer is None or result_state.final_answer == "":
            test_result["reason"] = "Critical failure: No answer generated"
            return test_result

        answer_lower = result_state.final_answer.lower()

        # A crash message surfaced as the answer means error handling failed.
        if any(indicator in answer_lower for indicator in self._CRASH_INDICATORS):
            test_result["reason"] = "System crash detected in response"
            return test_result

        has_graceful_handling = any(
            indicator in answer_lower for indicator in self._GRACEFUL_INDICATORS
        )
        confidence = result_state.final_confidence

        # A gracefully reported failure needs only minimal confidence (0.1);
        # a plain answer needs more (0.3); otherwise any positive confidence
        # with at least one agent result counts as "stayed stable".
        if has_graceful_handling and confidence >= 0.1:
            test_result["passed"] = True
            test_result["reason"] = "Graceful error handling with reasonable confidence"
        elif not has_graceful_handling and confidence >= 0.3:
            test_result["passed"] = True
            test_result["reason"] = "Provided meaningful answer with acceptable confidence"
        elif confidence > 0.0 and len(result_state.agent_results) > 0:
            test_result["passed"] = True
            test_result["reason"] = "System remained stable and attempted processing"
        else:
            test_result["reason"] = f"Insufficient error handling or system instability (confidence: {confidence:.2f})"

        return test_result

    def run_comprehensive_test(self) -> None:
        """Run comprehensive test and report results.

        Logs a summary and a PASSED / NEEDS IMPROVEMENT verdict based on an
        80% success-rate threshold.

        Raises:
            Exception: Any error from running or summarising the scenarios is
                logged and re-raised.
        """
        logger.info("🚀 Starting Production Fix Validation Tests")
        logger.info("=" * 60)

        start_time = time.time()

        try:
            results = self.test_error_handling_scenarios()
            total_time = time.time() - start_time

            summary = results["test_summary"]
            total_tests = summary['total_tests']
            # Computed once and reused; guarded so an empty scenario list
            # cannot raise ZeroDivisionError.
            success_rate = summary['passed'] / total_tests if total_tests else 0.0

            logger.info("\n" + "=" * 60)
            logger.info("📊 TEST SUMMARY")
            logger.info("=" * 60)
            logger.info(f"Total Tests: {total_tests}")
            logger.info(f"✅ Passed: {summary['passed']}")
            logger.info(f"❌ Failed: {summary['failed']}")
            logger.info(f"⚠️ Errors: {len(summary['errors'])}")
            logger.info(f"📈 Success Rate: {success_rate*100:.1f}%")
            logger.info(f"⏱️ Total Time: {total_time:.2f}s")

            if success_rate >= 0.8:
                logger.info("🎉 PRODUCTION FIXES VALIDATION: PASSED")
                logger.info("System demonstrates robust error handling and graceful degradation")
            else:
                logger.warning("⚠️ PRODUCTION FIXES VALIDATION: NEEDS IMPROVEMENT")
                logger.warning(f"Success rate {success_rate*100:.1f}% below 80% threshold")

            if summary['errors']:
                logger.error("\n💥 ERRORS ENCOUNTERED:")
                for error in summary['errors']:
                    logger.error(f" - {error}")

        except Exception as e:
            logger.error(f"❌ Comprehensive test failed: {str(e)}")
            raise


def main():
    """Main test execution.

    Builds a ``ProductionFixTester`` and runs the comprehensive test.  If
    either step raises, the error is logged and the process exits with
    status 1.
    """
    try:
        tester = ProductionFixTester()
        tester.run_comprehensive_test()
    except Exception as e:
        logger.error(f"Test execution failed: {e}")
        # sys.exit is always available; the bare ``exit`` helper is injected
        # by the ``site`` module and is absent under ``python -S``.
        sys.exit(1)


# Run the validation only when executed as a script, not on import.
if __name__ == "__main__":
    main()