|
|
|
|
|
""" |
|
|
Test Production Fixes for GAIA Agent System |
|
|
Quick validation that error handling improvements are working |
|
|
""" |
|
|
|
|
|
import logging |
|
|
import time |
|
|
from typing import List, Dict, Any |
|
|
|
|
|
from models.qwen_client import QwenClient |
|
|
from workflow.gaia_workflow import SimpleGAIAWorkflow |
|
|
|
|
|
|
|
|
# Configure root logging once at import time so every module logger in the
# test run shares the same timestamped, level-tagged format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
|
|
|
|
|
class ProductionFixTester:
    """Test the production fixes for error handling and robustness.

    Runs a set of deliberately unanswerable/malformed questions through the
    real GAIA workflow and checks that each one degrades gracefully (fallback
    answer, non-zero confidence) instead of crashing.
    """

    def __init__(self):
        # The real client and workflow are exercised end-to-end; if either
        # cannot be constructed there is nothing to test, so log and re-raise.
        try:
            self.llm_client = QwenClient()
            self.workflow = SimpleGAIAWorkflow(self.llm_client)
            logger.info("✅ Test environment initialized")
        except Exception as e:
            logger.error(f"❌ Failed to initialize test environment: {e}")
            raise

    def test_error_handling_scenarios(self) -> Dict[str, Any]:
        """Test various error scenarios that were causing production failures.

        Returns:
            A dict with "test_summary" (total/passed/failed counts plus a list
            of error strings) and "detailed_results" (one analysis dict per
            scenario, as produced by :meth:`_analyze_test_result`).
        """
        # Each scenario is intentionally impossible or nonsensical; the
        # expected behavior is graceful degradation, never a crash.
        test_scenarios = [
            {
                "name": "Wikipedia Research Failure Simulation",
                "question": "What is the most obscure fictional character from the imaginary book 'Zzzzz12345NonExistent'?",
                "expected_behavior": "Should fail gracefully and provide fallback response"
            },
            {
                "name": "Mathematical Reasoning with Complex Data",
                "question": "Calculate the square root of negative infinity divided by zero plus the factorial of pi",
                "expected_behavior": "Should handle impossible math gracefully"
            },
            {
                "name": "Conversion with Invalid Units",
                "question": "Convert 50 zorkples to flibbers using the international zorkple standard",
                "expected_behavior": "Should recognize invalid units and respond appropriately"
            },
            {
                "name": "Web Research with Rate Limiting Simulation",
                "question": "What are the current stock prices for all Fortune 500 companies as of this exact moment?",
                "expected_behavior": "Should handle external API limitations gracefully"
            },
            {
                "name": "Complex Multi-Agent Question",
                "question": "Analyze the correlation between quantum entanglement and the price of tea in 17th century Mongolia while also calculating the fibonacci sequence backwards from infinity",
                "expected_behavior": "Should route to multiple agents and synthesize results"
            }
        ]

        results = {
            "test_summary": {
                "total_tests": len(test_scenarios),
                "passed": 0,
                "failed": 0,
                "errors": []
            },
            "detailed_results": []
        }

        for i, scenario in enumerate(test_scenarios, 1):
            logger.info(f"\n🧪 Test {i}/{len(test_scenarios)}: {scenario['name']}")
            logger.info(f"Question: {scenario['question']}")

            start_time = time.time()

            try:
                result_state = self.workflow.process_question(
                    question=scenario['question'],
                    task_id=f"fix_test_{i}"
                )

                processing_time = time.time() - start_time

                # Grade the workflow output against the graceful-degradation
                # criteria and tally pass/fail.
                test_result = self._analyze_test_result(scenario, result_state, processing_time)
                results["detailed_results"].append(test_result)

                if test_result["passed"]:
                    results["test_summary"]["passed"] += 1
                    logger.info(f"✅ PASSED: {test_result['reason']}")
                else:
                    results["test_summary"]["failed"] += 1
                    logger.warning(f"❌ FAILED: {test_result['reason']}")

                logger.info(f"   📊 Confidence: {result_state.final_confidence:.2f}")
                logger.info(f"   ⏱️ Time: {processing_time:.2f}s")
                logger.info(f"   💰 Cost: ${result_state.total_cost:.4f}")
                # FIX: final_answer may be None here (the None case is only
                # detected by the analyzer, which does not raise); slicing None
                # raised TypeError and turned a graceful failure into a
                # spurious test exception below.
                logger.info(f"   🎯 Answer: {(result_state.final_answer or '')[:100]}...")

            except Exception as e:
                # An exception escaping the workflow means error handling
                # failed outright; record it as a failed test rather than
                # aborting the whole suite.
                error_msg = f"Exception in test {i}: {str(e)}"
                logger.error(f"❌ ERROR: {error_msg}")
                results["test_summary"]["errors"].append(error_msg)
                results["test_summary"]["failed"] += 1

                results["detailed_results"].append({
                    "test_name": scenario['name'],
                    "passed": False,
                    "reason": f"Test exception: {str(e)}",
                    "processing_time": time.time() - start_time,
                    "confidence": 0.0,
                    "answer": "Test failed with exception"
                })

        return results

    def _analyze_test_result(self, scenario: Dict[str, Any], result_state, processing_time: float) -> Dict[str, Any]:
        """Analyze if a test result meets expectations for error handling.

        Args:
            scenario: Scenario dict (only 'name' is read here).
            result_state: Workflow output object; must expose final_answer,
                final_confidence, agent_results and error_messages.
            processing_time: Wall-clock seconds the scenario took.

        Returns:
            A dict describing the outcome, with "passed" and a
            human-readable "reason".
        """
        test_result = {
            "test_name": scenario['name'],
            "passed": False,
            "reason": "",
            "processing_time": processing_time,
            "confidence": result_state.final_confidence,
            "answer": result_state.final_answer,
            "agents_used": [role.value for role in result_state.agent_results.keys()],
            "error_count": len(result_state.error_messages)
        }

        # No answer at all is an unconditional failure.
        if result_state.final_answer is None or result_state.final_answer == "":
            test_result["reason"] = "Critical failure: No answer generated"
            return test_result

        # Phrases that indicate the system crashed rather than degraded.
        crash_indicators = [
            "system not initialized",
            "workflow execution failed",
            "unable to process question - no agent results available"
        ]

        answer_lower = result_state.final_answer.lower()
        if any(indicator in answer_lower for indicator in crash_indicators):
            test_result["reason"] = "System crash detected in response"
            return test_result

        # Phrases that indicate the system acknowledged the problem and
        # degraded gracefully.
        graceful_indicators = [
            "processing encountered difficulties",
            "research sources failed",
            "reasoning failed",
            "conversion failed",
            "mathematical complexity",
            "limited information available"
        ]

        has_graceful_handling = any(indicator in answer_lower for indicator in graceful_indicators)

        # Pass criteria, from most to least desirable: graceful degradation
        # with some confidence, a confident direct answer, or at minimum a
        # stable attempt that produced agent results.
        if has_graceful_handling and result_state.final_confidence >= 0.1:
            test_result["passed"] = True
            test_result["reason"] = "Graceful error handling with reasonable confidence"
        elif not has_graceful_handling and result_state.final_confidence >= 0.3:
            test_result["passed"] = True
            test_result["reason"] = "Provided meaningful answer with acceptable confidence"
        elif result_state.final_confidence > 0.0 and len(result_state.agent_results) > 0:
            test_result["passed"] = True
            test_result["reason"] = "System remained stable and attempted processing"
        else:
            test_result["reason"] = f"Insufficient error handling or system instability (confidence: {result_state.final_confidence:.2f})"

        return test_result

    def run_comprehensive_test(self) -> None:
        """Run comprehensive test and report results.

        Logs a summary and an overall PASSED / NEEDS IMPROVEMENT verdict;
        re-raises any exception that escapes the scenario runner.
        """
        logger.info("🚀 Starting Production Fix Validation Tests")
        logger.info("=" * 60)

        start_time = time.time()

        try:
            results = self.test_error_handling_scenarios()
            total_time = time.time() - start_time

            summary = results["test_summary"]
            # Hoisted so the percentage is computed once for both the summary
            # line and the verdict below (output is unchanged).
            success_rate = summary['passed'] / summary['total_tests']

            logger.info("\n" + "=" * 60)
            logger.info("📋 TEST SUMMARY")
            logger.info("=" * 60)
            logger.info(f"Total Tests: {summary['total_tests']}")
            logger.info(f"✅ Passed: {summary['passed']}")
            logger.info(f"❌ Failed: {summary['failed']}")
            logger.info(f"⚠️ Errors: {len(summary['errors'])}")
            logger.info(f"📊 Success Rate: {success_rate*100:.1f}%")
            logger.info(f"⏱️ Total Time: {total_time:.2f}s")

            # 80% is the acceptance threshold for the production fixes.
            if success_rate >= 0.8:
                logger.info("🎉 PRODUCTION FIXES VALIDATION: PASSED")
                logger.info("System demonstrates robust error handling and graceful degradation")
            else:
                logger.warning("⚠️ PRODUCTION FIXES VALIDATION: NEEDS IMPROVEMENT")
                logger.warning(f"Success rate {success_rate*100:.1f}% below 80% threshold")

            if summary['errors']:
                logger.error("\n🔥 ERRORS ENCOUNTERED:")
                for error in summary['errors']:
                    logger.error(f"   - {error}")

        except Exception as e:
            logger.error(f"❌ Comprehensive test failed: {str(e)}")
            raise
|
|
|
|
|
def main():
    """Main test execution.

    Builds the tester, runs the full validation suite, and terminates the
    process with exit code 1 on any failure.
    """
    try:
        tester = ProductionFixTester()
        tester.run_comprehensive_test()
    except Exception as e:
        logger.error(f"Test execution failed: {e}")
        # FIX: exit() is the site-module interactive helper and is not
        # guaranteed to exist (e.g. under `python -S`); raise SystemExit
        # directly for the same exit code in script context.
        raise SystemExit(1)


if __name__ == "__main__":
    main()