Spaces:
Paused
Paused
| #!/usr/bin/env python3 | |
| """ | |
| Benchmark Enhanced Felix Framework Systems | |
| Comprehensive benchmarking of all five priority enhancement systems: | |
| 1. Intelligent Output Chunking & Streaming | |
| 2. Dynamic Agent Spawning | |
| 3. Prompt Optimization Pipeline | |
| 4. Memory and Persistence Layer | |
| 5. Benchmarking & Quality Metrics | |
| This script provides performance metrics and validation results for our enhanced systems. | |
| """ | |
| import sys | |
| import time | |
| import json | |
| import tempfile | |
| import statistics | |
| import os | |
| from pathlib import Path | |
| from typing import Dict, List, Any, Optional | |
| from dataclasses import dataclass, asdict | |
| # Add src to Python path | |
| sys.path.insert(0, str(Path(__file__).parent / "src")) | |
| class BenchmarkResult: | |
| """Results from a single benchmark test.""" | |
| system_name: str | |
| test_name: str | |
| execution_time: float | |
| success: bool | |
| metrics: Dict[str, Any] | |
| error_message: Optional[str] = None | |
| class SystemBenchmark: | |
| """Benchmark results for an entire system.""" | |
| system_name: str | |
| total_tests: int | |
| successful_tests: int | |
| total_time: float | |
| average_time: float | |
| results: List[BenchmarkResult] | |
| def benchmark_chunking_system() -> SystemBenchmark: | |
| """Benchmark the intelligent output chunking system.""" | |
| print("🧩 Benchmarking Chunking System...") | |
| from src.pipeline.chunking import ProgressiveProcessor, ChunkedResult | |
| results = [] | |
| start_time = time.time() | |
| # Test 1: Basic chunking functionality | |
| test_start = time.time() | |
| try: | |
| processor = ProgressiveProcessor( | |
| task_id="benchmark_task", | |
| agent_id="benchmark_agent", | |
| full_content="This is test content for chunking. " * 100, # 500 words | |
| chunk_size=200 | |
| ) | |
| chunks_count = processor.total_chunks | |
| first_chunk = processor.get_chunk_by_index(0) | |
| success = chunks_count > 1 and first_chunk is not None | |
| metrics = { | |
| "total_chunks": chunks_count, | |
| "content_length": len(processor.full_content), | |
| "chunk_size": processor.chunk_size, | |
| "first_chunk_length": len(first_chunk.content_chunk) if first_chunk else 0 | |
| } | |
| results.append(BenchmarkResult( | |
| system_name="Chunking", | |
| test_name="basic_chunking", | |
| execution_time=time.time() - test_start, | |
| success=success, | |
| metrics=metrics | |
| )) | |
| except Exception as e: | |
| results.append(BenchmarkResult( | |
| system_name="Chunking", | |
| test_name="basic_chunking", | |
| execution_time=time.time() - test_start, | |
| success=False, | |
| metrics={}, | |
| error_message=str(e) | |
| )) | |
| # Test 2: Performance with large content | |
| test_start = time.time() | |
| try: | |
| large_content = "Large content for performance testing. " * 1000 # ~5000 words | |
| large_processor = ProgressiveProcessor( | |
| task_id="large_task", | |
| agent_id="benchmark_agent", | |
| full_content=large_content, | |
| chunk_size=500 | |
| ) | |
| # Process all chunks | |
| all_chunks = [] | |
| for i in range(large_processor.total_chunks): | |
| chunk = large_processor.get_chunk_by_index(i) | |
| if chunk: | |
| all_chunks.append(chunk) | |
| success = len(all_chunks) == large_processor.total_chunks | |
| metrics = { | |
| "content_length": len(large_content), | |
| "total_chunks": large_processor.total_chunks, | |
| "processed_chunks": len(all_chunks), | |
| "avg_chunk_size": statistics.mean([len(chunk.content_chunk) for chunk in all_chunks]) if all_chunks else 0 | |
| } | |
| results.append(BenchmarkResult( | |
| system_name="Chunking", | |
| test_name="large_content_performance", | |
| execution_time=time.time() - test_start, | |
| success=success, | |
| metrics=metrics | |
| )) | |
| except Exception as e: | |
| results.append(BenchmarkResult( | |
| system_name="Chunking", | |
| test_name="large_content_performance", | |
| execution_time=time.time() - test_start, | |
| success=False, | |
| metrics={}, | |
| error_message=str(e) | |
| )) | |
| total_time = time.time() - start_time | |
| successful_tests = sum(1 for r in results if r.success) | |
| return SystemBenchmark( | |
| system_name="Chunking System", | |
| total_tests=len(results), | |
| successful_tests=successful_tests, | |
| total_time=total_time, | |
| average_time=total_time / len(results) if results else 0, | |
| results=results | |
| ) | |
| def benchmark_knowledge_store() -> SystemBenchmark: | |
| """Benchmark the memory and persistence layer.""" | |
| print("🧠 Benchmarking Knowledge Store...") | |
| from src.memory.knowledge_store import KnowledgeStore, KnowledgeType, ConfidenceLevel, KnowledgeQuery | |
| results = [] | |
| start_time = time.time() | |
| # Create temporary database | |
| with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as temp_file: | |
| temp_db_path = temp_file.name | |
| try: | |
| # Test 1: Basic storage and retrieval | |
| test_start = time.time() | |
| try: | |
| store = KnowledgeStore(storage_path=temp_db_path) | |
| # Store test knowledge | |
| knowledge_id = store.store_knowledge( | |
| knowledge_type=KnowledgeType.TASK_RESULT, | |
| content={"task": "benchmark_test", "result": "success", "score": 0.95}, | |
| confidence_level=ConfidenceLevel.HIGH, | |
| source_agent="benchmark_agent", | |
| domain="testing", | |
| tags=["benchmark", "test", "performance"] | |
| ) | |
| # Retrieve knowledge | |
| query = KnowledgeQuery( | |
| knowledge_types=[KnowledgeType.TASK_RESULT], | |
| domains=["testing"] | |
| ) | |
| retrieved = store.retrieve_knowledge(query) | |
| success = len(retrieved) == 1 and retrieved[0].knowledge_id == knowledge_id | |
| metrics = { | |
| "stored_entries": 1, | |
| "retrieved_entries": len(retrieved), | |
| "knowledge_id_match": retrieved[0].knowledge_id == knowledge_id if retrieved else False | |
| } | |
| results.append(BenchmarkResult( | |
| system_name="KnowledgeStore", | |
| test_name="basic_storage_retrieval", | |
| execution_time=time.time() - test_start, | |
| success=success, | |
| metrics=metrics | |
| )) | |
| except Exception as e: | |
| results.append(BenchmarkResult( | |
| system_name="KnowledgeStore", | |
| test_name="basic_storage_retrieval", | |
| execution_time=time.time() - test_start, | |
| success=False, | |
| metrics={}, | |
| error_message=str(e) | |
| )) | |
| # Test 2: Performance with multiple entries | |
| test_start = time.time() | |
| try: | |
| store = KnowledgeStore(storage_path=temp_db_path) | |
| # Store multiple knowledge entries | |
| knowledge_ids = [] | |
| for i in range(100): | |
| kid = store.store_knowledge( | |
| knowledge_type=KnowledgeType.AGENT_INSIGHT, | |
| content={"insight": f"test_insight_{i}", "value": i * 0.01}, | |
| confidence_level=ConfidenceLevel.MEDIUM, | |
| source_agent=f"agent_{i % 5}", | |
| domain="performance_testing", | |
| tags=["bulk_test", f"batch_{i // 20}"] | |
| ) | |
| knowledge_ids.append(kid) | |
| # Query all entries | |
| query = KnowledgeQuery(domains=["performance_testing"], limit=200) | |
| all_entries = store.retrieve_knowledge(query) | |
| # Query with filters - batch_2 should have entries 40-59 (20 entries) | |
| filtered_query = KnowledgeQuery( | |
| domains=["performance_testing"], | |
| tags=["batch_2"], | |
| min_confidence=ConfidenceLevel.MEDIUM | |
| ) | |
| filtered_entries = store.retrieve_knowledge(filtered_query) | |
| success = len(all_entries) >= 100 and len(filtered_entries) >= 5 | |
| metrics = { | |
| "stored_entries": len(knowledge_ids), | |
| "retrieved_all": len(all_entries), | |
| "retrieved_filtered": len(filtered_entries), | |
| "storage_success_rate": len([kid for kid in knowledge_ids if kid]) / len(knowledge_ids) | |
| } | |
| results.append(BenchmarkResult( | |
| system_name="KnowledgeStore", | |
| test_name="bulk_storage_performance", | |
| execution_time=time.time() - test_start, | |
| success=success, | |
| metrics=metrics | |
| )) | |
| except Exception as e: | |
| results.append(BenchmarkResult( | |
| system_name="KnowledgeStore", | |
| test_name="bulk_storage_performance", | |
| execution_time=time.time() - test_start, | |
| success=False, | |
| metrics={}, | |
| error_message=str(e) | |
| )) | |
| finally: | |
| # Cleanup temporary database | |
| if os.path.exists(temp_db_path): | |
| os.unlink(temp_db_path) | |
| total_time = time.time() - start_time | |
| successful_tests = sum(1 for r in results if r.success) | |
| return SystemBenchmark( | |
| system_name="Knowledge Store", | |
| total_tests=len(results), | |
| successful_tests=successful_tests, | |
| total_time=total_time, | |
| average_time=total_time / len(results) if results else 0, | |
| results=results | |
| ) | |
| def benchmark_quality_metrics() -> SystemBenchmark: | |
| """Benchmark the quality metrics calculation system.""" | |
| print("📊 Benchmarking Quality Metrics...") | |
| from src.comparison.quality_metrics import QualityMetricsCalculator, DomainType | |
| results = [] | |
| start_time = time.time() | |
| # Test 1: Basic quality assessment | |
| test_start = time.time() | |
| try: | |
| calculator = QualityMetricsCalculator() | |
| test_text = """ | |
| This comprehensive analysis examines artificial intelligence development trends. | |
| First, we establish the foundational concepts. Furthermore, advanced machine learning | |
| techniques demonstrate significant improvements in accuracy and efficiency. Research | |
| indicates that 95% of systems show enhanced performance. Therefore, these methodologies | |
| provide substantial value for implementation. | |
| """ | |
| quality_score = calculator.calculate_quality_score(test_text, DomainType.TECHNICAL) | |
| success = ( | |
| 0 <= quality_score.overall_score <= 1 and | |
| quality_score.coherence_score > 0 and | |
| quality_score.accuracy_score > 0 and | |
| quality_score.word_count > 0 | |
| ) | |
| metrics = { | |
| "overall_score": quality_score.overall_score, | |
| "coherence_score": quality_score.coherence_score, | |
| "accuracy_score": quality_score.accuracy_score, | |
| "completeness_score": quality_score.completeness_score, | |
| "clarity_score": quality_score.clarity_score, | |
| "word_count": quality_score.word_count, | |
| "sentence_count": quality_score.sentence_count, | |
| "has_bleu_score": quality_score.bleu_score is not None | |
| } | |
| results.append(BenchmarkResult( | |
| system_name="QualityMetrics", | |
| test_name="basic_quality_assessment", | |
| execution_time=time.time() - test_start, | |
| success=success, | |
| metrics=metrics | |
| )) | |
| except Exception as e: | |
| results.append(BenchmarkResult( | |
| system_name="QualityMetrics", | |
| test_name="basic_quality_assessment", | |
| execution_time=time.time() - test_start, | |
| success=False, | |
| metrics={}, | |
| error_message=str(e) | |
| )) | |
| # Test 2: Performance with multiple texts | |
| test_start = time.time() | |
| try: | |
| calculator = QualityMetricsCalculator() | |
| test_texts = [ | |
| "High quality technical documentation with research backing.", | |
| "Some random text without much structure or clarity here.", | |
| "This demonstrates excellent coherence. Furthermore, the analysis provides substantial evidence. Research indicates clear patterns.", | |
| "Poor quality text with no structure clarity issues many problems", | |
| "Comprehensive examination reveals innovative approaches. Studies show 92% effectiveness rates." | |
| ] | |
| batch_scores = calculator.batch_calculate_scores(test_texts, DomainType.GENERAL) | |
| success = len(batch_scores) == len(test_texts) and all(0 <= score.overall_score <= 1 for score in batch_scores) | |
| metrics = { | |
| "texts_processed": len(batch_scores), | |
| "average_overall_score": statistics.mean([score.overall_score for score in batch_scores]), | |
| "score_range": max([score.overall_score for score in batch_scores]) - min([score.overall_score for score in batch_scores]), | |
| "all_valid_scores": all(0 <= score.overall_score <= 1 for score in batch_scores) | |
| } | |
| results.append(BenchmarkResult( | |
| system_name="QualityMetrics", | |
| test_name="batch_processing_performance", | |
| execution_time=time.time() - test_start, | |
| success=success, | |
| metrics=metrics | |
| )) | |
| except Exception as e: | |
| results.append(BenchmarkResult( | |
| system_name="QualityMetrics", | |
| test_name="batch_processing_performance", | |
| execution_time=time.time() - test_start, | |
| success=False, | |
| metrics={}, | |
| error_message=str(e) | |
| )) | |
| total_time = time.time() - start_time | |
| successful_tests = sum(1 for r in results if r.success) | |
| return SystemBenchmark( | |
| system_name="Quality Metrics", | |
| total_tests=len(results), | |
| successful_tests=successful_tests, | |
| total_time=total_time, | |
| average_time=total_time / len(results) if results else 0, | |
| results=results | |
| ) | |
| def benchmark_prompt_optimization() -> SystemBenchmark: | |
| """Benchmark the prompt optimization system.""" | |
| print("🎯 Benchmarking Prompt Optimization...") | |
| from src.agents.prompt_optimization import PromptOptimizer | |
| results = [] | |
| start_time = time.time() | |
| # Create temporary database | |
| with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as temp_file: | |
| temp_db_path = temp_file.name | |
| try: | |
| # Test 1: Basic prompt optimization | |
| test_start = time.time() | |
| try: | |
| optimizer = PromptOptimizer() | |
| base_prompt = "Analyze the following content and provide insights" | |
| context = {"domain": "technical", "task_type": "analysis"} | |
| optimized_prompt = optimizer.optimize_prompt(base_prompt, context) | |
| success = ( | |
| optimized_prompt is not None and | |
| len(optimized_prompt.prompt_text) > len(base_prompt) and | |
| optimized_prompt.prompt_id is not None | |
| ) | |
| metrics = { | |
| "base_prompt_length": len(base_prompt), | |
| "optimized_prompt_length": len(optimized_prompt.prompt_text), | |
| "improvement_ratio": len(optimized_prompt.prompt_text) / len(base_prompt), | |
| "has_prompt_id": optimized_prompt.prompt_id is not None, | |
| "has_context": len(optimized_prompt.context) > 0 | |
| } | |
| results.append(BenchmarkResult( | |
| system_name="PromptOptimizer", | |
| test_name="basic_optimization", | |
| execution_time=time.time() - test_start, | |
| success=success, | |
| metrics=metrics | |
| )) | |
| except Exception as e: | |
| results.append(BenchmarkResult( | |
| system_name="PromptOptimizer", | |
| test_name="basic_optimization", | |
| execution_time=time.time() - test_start, | |
| success=False, | |
| metrics={}, | |
| error_message=str(e) | |
| )) | |
| # Test 2: Performance tracking | |
| test_start = time.time() | |
| try: | |
| optimizer = PromptOptimizer() | |
| # Create and track multiple prompts | |
| prompt_performance_data = [] | |
| for i in range(10): | |
| prompt_text = f"Test prompt {i} for analysis task" | |
| optimized = optimizer.optimize_prompt(prompt_text, {"iteration": i}) | |
| # Record performance | |
| optimizer.record_prompt_performance( | |
| prompt_id=optimized.prompt_id, | |
| success_rate=0.7 + (i * 0.02), # Simulated improvement | |
| quality_metrics={"coherence": 0.8 + (i * 0.01)}, | |
| context={"test_iteration": i} | |
| ) | |
| performance = optimizer.get_prompt_performance(optimized.prompt_id) | |
| prompt_performance_data.append(len(performance)) | |
| success = ( | |
| len(prompt_performance_data) == 10 and | |
| all(count > 0 for count in prompt_performance_data) | |
| ) | |
| metrics = { | |
| "prompts_created": len(prompt_performance_data), | |
| "average_performance_records": statistics.mean(prompt_performance_data), | |
| "all_recorded_performance": all(count > 0 for count in prompt_performance_data) | |
| } | |
| results.append(BenchmarkResult( | |
| system_name="PromptOptimizer", | |
| test_name="performance_tracking", | |
| execution_time=time.time() - test_start, | |
| success=success, | |
| metrics=metrics | |
| )) | |
| except Exception as e: | |
| results.append(BenchmarkResult( | |
| system_name="PromptOptimizer", | |
| test_name="performance_tracking", | |
| execution_time=time.time() - test_start, | |
| success=False, | |
| metrics={}, | |
| error_message=str(e) | |
| )) | |
| finally: | |
| # Cleanup temporary database | |
| if os.path.exists(temp_db_path): | |
| os.unlink(temp_db_path) | |
| total_time = time.time() - start_time | |
| successful_tests = sum(1 for r in results if r.success) | |
| return SystemBenchmark( | |
| system_name="Prompt Optimization", | |
| total_tests=len(results), | |
| successful_tests=successful_tests, | |
| total_time=total_time, | |
| average_time=total_time / len(results) if results else 0, | |
| results=results | |
| ) | |
| def benchmark_integration_scenario() -> SystemBenchmark: | |
| """Benchmark integration of multiple enhanced systems.""" | |
| print("🔗 Benchmarking Integration Scenarios...") | |
| results = [] | |
| start_time = time.time() | |
| # Test 1: Chunking + Quality Metrics integration | |
| test_start = time.time() | |
| try: | |
| from src.pipeline.chunking import ProgressiveProcessor | |
| from src.comparison.quality_metrics import QualityMetricsCalculator, DomainType | |
| # Create content for chunking | |
| blog_content = """ | |
| # AI Ethics in Healthcare: A Comprehensive Analysis | |
| The integration of artificial intelligence into healthcare systems presents unprecedented | |
| opportunities and challenges. This analysis examines key ethical considerations, regulatory | |
| frameworks, and implementation strategies for responsible AI deployment in medical contexts. | |
| ## Current State and Challenges | |
| Healthcare AI systems currently face several critical ethical challenges. First, ensuring | |
| patient privacy while enabling valuable medical research requires sophisticated data handling. | |
| Furthermore, algorithmic bias in diagnostic tools can perpetuate healthcare disparities. | |
| ## Regulatory and Implementation Frameworks | |
| Effective governance structures must balance innovation with patient safety. Research indicates | |
| that 87% of healthcare institutions require comprehensive ethical review processes for AI systems. | |
| ## Future Directions | |
| Therefore, successful AI integration demands collaborative approaches between technologists, | |
| clinicians, ethicists, and policymakers to ensure beneficial outcomes for all patients. | |
| """ | |
| # Chunk the content | |
| processor = ProgressiveProcessor( | |
| task_id="integration_test", | |
| agent_id="integration_agent", | |
| full_content=blog_content, | |
| chunk_size=400 | |
| ) | |
| # Calculate quality metrics for each chunk | |
| quality_calculator = QualityMetricsCalculator() | |
| chunk_quality_scores = [] | |
| for i in range(processor.total_chunks): | |
| chunk = processor.get_chunk_by_index(i) | |
| if chunk and chunk.content_chunk: | |
| quality_score = quality_calculator.calculate_quality_score( | |
| chunk.content_chunk, | |
| DomainType.TECHNICAL | |
| ) | |
| chunk_quality_scores.append(quality_score.overall_score) | |
| # Calculate overall quality metrics | |
| overall_quality = quality_calculator.calculate_quality_score(blog_content, DomainType.TECHNICAL) | |
| success = ( | |
| processor.total_chunks > 1 and | |
| len(chunk_quality_scores) == processor.total_chunks and | |
| overall_quality.overall_score > 0.5 and | |
| all(0 <= score <= 1 for score in chunk_quality_scores) | |
| ) | |
| metrics = { | |
| "total_chunks": processor.total_chunks, | |
| "chunks_with_quality": len(chunk_quality_scores), | |
| "average_chunk_quality": statistics.mean(chunk_quality_scores) if chunk_quality_scores else 0, | |
| "overall_quality_score": overall_quality.overall_score, | |
| "quality_consistency": statistics.stdev(chunk_quality_scores) if len(chunk_quality_scores) > 1 else 0 | |
| } | |
| results.append(BenchmarkResult( | |
| system_name="Integration", | |
| test_name="chunking_quality_metrics", | |
| execution_time=time.time() - test_start, | |
| success=success, | |
| metrics=metrics | |
| )) | |
| except Exception as e: | |
| results.append(BenchmarkResult( | |
| system_name="Integration", | |
| test_name="chunking_quality_metrics", | |
| execution_time=time.time() - test_start, | |
| success=False, | |
| metrics={}, | |
| error_message=str(e) | |
| )) | |
| # Test 2: Knowledge Store + Quality Metrics integration | |
| test_start = time.time() | |
| with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as temp_file: | |
| temp_db_path = temp_file.name | |
| try: | |
| from src.memory.knowledge_store import KnowledgeStore, KnowledgeType, ConfidenceLevel | |
| from src.comparison.quality_metrics import QualityMetricsCalculator, DomainType | |
| store = KnowledgeStore(storage_path=temp_db_path) | |
| calculator = QualityMetricsCalculator() | |
| # Store knowledge with quality assessments | |
| test_content = [ | |
| "High quality technical analysis with comprehensive research backing and statistical evidence. This demonstrates sophisticated understanding of complex systems with detailed methodological approach and rigorous validation procedures.", | |
| "Poor quality text with unclear structure and limited substance or insight.", | |
| "Excellent coherence demonstrated through logical flow. Furthermore, evidence supports conclusions with comprehensive analysis and detailed reasoning throughout the investigation." | |
| ] | |
| quality_assessments = [] | |
| stored_ids = [] | |
| for i, content in enumerate(test_content): | |
| # Calculate quality | |
| quality_score = calculator.calculate_quality_score(content, DomainType.TECHNICAL) | |
| quality_assessments.append(quality_score.overall_score) | |
| # Store knowledge with quality metadata | |
| knowledge_id = store.store_knowledge( | |
| knowledge_type=KnowledgeType.TASK_RESULT, | |
| content={ | |
| "text": content, | |
| "quality_assessment": { | |
| "overall_score": quality_score.overall_score, | |
| "coherence": quality_score.coherence_score, | |
| "accuracy": quality_score.accuracy_score | |
| } | |
| }, | |
| confidence_level=ConfidenceLevel.HIGH if quality_score.overall_score > 0.7 else ConfidenceLevel.MEDIUM, | |
| source_agent="integration_test", | |
| domain="quality_testing", | |
| tags=["integration", "quality", f"batch_{i}"] | |
| ) | |
| stored_ids.append(knowledge_id) | |
| # Query high-quality knowledge | |
| from src.memory.knowledge_store import KnowledgeQuery | |
| high_quality_query = KnowledgeQuery( | |
| domains=["quality_testing"], | |
| min_confidence=ConfidenceLevel.MEDIUM, | |
| content_keywords=["quality"] | |
| ) | |
| high_quality_entries = store.retrieve_knowledge(high_quality_query) | |
| success = ( | |
| len(stored_ids) == len(test_content) and | |
| len(quality_assessments) == len(test_content) and | |
| len(high_quality_entries) > 0 and | |
| all(entry.content.get("quality_assessment") for entry in high_quality_entries) | |
| ) | |
| metrics = { | |
| "content_pieces": len(test_content), | |
| "stored_entries": len(stored_ids), | |
| "quality_assessments": len(quality_assessments), | |
| "high_quality_retrieved": len(high_quality_entries), | |
| "average_quality": statistics.mean(quality_assessments), | |
| "quality_range": max(quality_assessments) - min(quality_assessments) | |
| } | |
| results.append(BenchmarkResult( | |
| system_name="Integration", | |
| test_name="knowledge_quality_integration", | |
| execution_time=time.time() - test_start, | |
| success=success, | |
| metrics=metrics | |
| )) | |
| except Exception as e: | |
| results.append(BenchmarkResult( | |
| system_name="Integration", | |
| test_name="knowledge_quality_integration", | |
| execution_time=time.time() - test_start, | |
| success=False, | |
| metrics={}, | |
| error_message=str(e) | |
| )) | |
| finally: | |
| if os.path.exists(temp_db_path): | |
| os.unlink(temp_db_path) | |
| total_time = time.time() - start_time | |
| successful_tests = sum(1 for r in results if r.success) | |
| return SystemBenchmark( | |
| system_name="Integration Scenarios", | |
| total_tests=len(results), | |
| successful_tests=successful_tests, | |
| total_time=total_time, | |
| average_time=total_time / len(results) if results else 0, | |
| results=results | |
| ) | |
| def generate_benchmark_report(benchmarks: List[SystemBenchmark]) -> Dict[str, Any]: | |
| """Generate comprehensive benchmark report.""" | |
| total_tests = sum(b.total_tests for b in benchmarks) | |
| total_successful = sum(b.successful_tests for b in benchmarks) | |
| total_time = sum(b.total_time for b in benchmarks) | |
| report = { | |
| "benchmark_timestamp": time.time(), | |
| "summary": { | |
| "total_systems": len(benchmarks), | |
| "total_tests": total_tests, | |
| "successful_tests": total_successful, | |
| "success_rate": total_successful / total_tests if total_tests > 0 else 0, | |
| "total_execution_time": total_time, | |
| "average_time_per_test": total_time / total_tests if total_tests > 0 else 0 | |
| }, | |
| "system_results": [] | |
| } | |
| for benchmark in benchmarks: | |
| system_result = { | |
| "system_name": benchmark.system_name, | |
| "tests": benchmark.total_tests, | |
| "successful": benchmark.successful_tests, | |
| "success_rate": benchmark.successful_tests / benchmark.total_tests if benchmark.total_tests > 0 else 0, | |
| "total_time": benchmark.total_time, | |
| "average_time": benchmark.average_time, | |
| "detailed_results": [] | |
| } | |
| for result in benchmark.results: | |
| system_result["detailed_results"].append({ | |
| "test_name": result.test_name, | |
| "success": result.success, | |
| "execution_time": result.execution_time, | |
| "metrics": result.metrics, | |
| "error": result.error_message | |
| }) | |
| report["system_results"].append(system_result) | |
| return report | |
| def main(): | |
| """Run comprehensive benchmarks of all enhanced systems.""" | |
| print("🚀 Felix Framework Enhanced Systems Benchmark") | |
| print("=" * 60) | |
| benchmarks = [] | |
| # Run individual system benchmarks | |
| benchmarks.append(benchmark_chunking_system()) | |
| benchmarks.append(benchmark_knowledge_store()) | |
| benchmarks.append(benchmark_quality_metrics()) | |
| benchmarks.append(benchmark_prompt_optimization()) | |
| benchmarks.append(benchmark_integration_scenario()) | |
| print("\n" + "=" * 60) | |
| print("📊 BENCHMARK RESULTS SUMMARY") | |
| print("=" * 60) | |
| # Generate and display report | |
| report = generate_benchmark_report(benchmarks) | |
| print(f"Total Systems Tested: {report['summary']['total_systems']}") | |
| print(f"Total Tests: {report['summary']['total_tests']}") | |
| print(f"Successful Tests: {report['summary']['successful_tests']}") | |
| print(f"Overall Success Rate: {report['summary']['success_rate']:.1%}") | |
| print(f"Total Execution Time: {report['summary']['total_execution_time']:.3f}s") | |
| print(f"Average Time per Test: {report['summary']['average_time_per_test']:.3f}s") | |
| print("\n" + "-" * 60) | |
| print("DETAILED SYSTEM RESULTS") | |
| print("-" * 60) | |
| for system_result in report["system_results"]: | |
| print(f"\n🔧 {system_result['system_name']}") | |
| print(f" Tests: {system_result['successful']}/{system_result['tests']} " | |
| f"({system_result['success_rate']:.1%} success)") | |
| print(f" Time: {system_result['total_time']:.3f}s " | |
| f"(avg: {system_result['average_time']:.3f}s)") | |
| for test_result in system_result["detailed_results"]: | |
| status = "✅" if test_result["success"] else "❌" | |
| print(f" {status} {test_result['test_name']}: {test_result['execution_time']:.3f}s") | |
| if not test_result["success"] and test_result["error"]: | |
| print(f" Error: {test_result['error']}") | |
| # Save detailed report | |
| report_path = Path("benchmark_results.json") | |
| with open(report_path, 'w') as f: | |
| json.dump(report, f, indent=2) | |
| print(f"\n💾 Detailed results saved to: {report_path}") | |
| print("\n" + "=" * 60) | |
| print("✨ BENCHMARK COMPLETE") | |
| print("=" * 60) | |
| if report['summary']['success_rate'] >= 0.8: | |
| print("🎉 EXCELLENT: All enhanced systems performing well!") | |
| elif report['summary']['success_rate'] >= 0.6: | |
| print("✅ GOOD: Most enhanced systems working correctly") | |
| else: | |
| print("⚠️ WARNING: Some systems need attention") | |
| return report | |
| if __name__ == "__main__": | |
| main() |