| """ | |
| Chain of Thought Performance Benchmarks and Testing Suite | |
| Comprehensive benchmarking for the Optimized CoT System | |
| """ | |
| import asyncio | |
| import time | |
| import numpy as np | |
| import pandas as pd | |
| from typing import Dict, List, Tuple, Any | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from dataclasses import dataclass | |
| import json | |
| import statistics | |
| from concurrent.futures import ProcessPoolExecutor | |
| import psutil | |
| import tracemalloc | |
| import sys | |
| import os | |
| import logging | |
| # Add the src directory to the path | |
| sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src')) | |
| # Import the CoT system | |
| try: | |
| from core.optimized_chain_of_thought import ( | |
| OptimizedChainOfThought, | |
| ReasoningType, | |
| ReasoningPath, | |
| ReasoningStep | |
| ) | |
| except ImportError: | |
| # Fallback for direct execution | |
| sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) | |
| from src.core.optimized_chain_of_thought import ( | |
| OptimizedChainOfThought, | |
| ReasoningType, | |
| ReasoningPath, | |
| ReasoningStep | |
| ) | |
| # Configure logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| # ============================= | |
| # Benchmark Data Structures | |
| # ============================= | |
@dataclass
class BenchmarkResult:
    """Result of a single benchmark run"""
    query: str
    complexity: float
    execution_time: float
    confidence: float
    steps_count: int
    cache_hit: bool
    memory_usage: float
    cpu_usage: float
    template_used: str
    paths_explored: int
    reasoning_types: List[str]
    final_answer: str


@dataclass
class BenchmarkSuite:
    """Collection of benchmark results"""
    name: str
    results: List[BenchmarkResult]
    timestamp: float
    config: Dict[str, Any]
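
# Hedged addition (not in the original module): a small serialization helper. Since both
# records above are dataclasses, dataclasses.asdict flattens them into JSON-friendly
# dicts, which the __main__ block at the bottom can reuse when persisting whole suites.
def suite_to_dict(suite: BenchmarkSuite) -> Dict[str, Any]:
    """Convert a BenchmarkSuite into a plain, json.dump-able dictionary."""
    from dataclasses import asdict  # stdlib; local import keeps the sketch self-contained
    return {
        'name': suite.name,
        'timestamp': suite.timestamp,
        'config': suite.config,
        'results': [asdict(r) for r in suite.results],
    }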
# =============================
# Test Query Sets
# =============================
class QueryDataset:
    """Dataset of queries for benchmarking"""

    @staticmethod
    def get_simple_queries() -> List[str]:
        """Simple queries (complexity < 0.3)"""
        return [
            "What is 2 + 2?",
            "What color is the sky?",
            "Define democracy.",
            "What is the capital of France?",
            "How many days in a week?",
            "What is water made of?",
            "When did World War II end?",
            "What is the speed of light?",
            "Who wrote Romeo and Juliet?",
            "What is the largest planet?"
        ]

    @staticmethod
    def get_medium_queries() -> List[str]:
        """Medium complexity queries (0.3 <= complexity < 0.7)"""
        return [
            "Explain the process of photosynthesis.",
            "Compare renewable and non-renewable energy sources.",
            "What are the main causes of climate change?",
            "Describe the water cycle and its importance.",
            "How does the stock market work?",
            "Explain the difference between a virus and a bacterium.",
            "What factors contributed to the Industrial Revolution?",
            "How do vaccines work to prevent diseases?",
            "Describe the structure of DNA.",
            "What are the pros and cons of social media?"
        ]

    @staticmethod
    def get_complex_queries() -> List[str]:
        """Complex queries (complexity >= 0.7)"""
        return [
            "Analyze the potential long-term socioeconomic impacts of artificial intelligence on global employment patterns, considering both displacement effects and new job creation.",
            "Compare and contrast the philosophical foundations of utilitarianism and deontological ethics, providing examples of how each would approach modern ethical dilemmas.",
            "Evaluate the effectiveness of different monetary policy tools in combating inflation while maintaining economic growth, considering recent global economic trends.",
            "Discuss the role of epigenetics in evolution and heredity, explaining how environmental factors can influence gene expression across generations.",
            "Analyze the geopolitical implications of renewable energy transition on international relations and global power dynamics.",
            "Examine the intersection of quantum mechanics and consciousness, discussing various interpretations and their philosophical implications.",
            "Evaluate the challenges and opportunities of establishing a sustainable human colony on Mars, considering technological, biological, and social factors.",
            "Analyze the impact of social media algorithms on democratic processes and public discourse, proposing potential regulatory frameworks.",
            "Discuss the ethical implications of gene editing technologies like CRISPR, considering medical benefits, risks, and societal concerns.",
            "Examine the role of cognitive biases in financial decision-making and their impact on market efficiency."
        ]

    @staticmethod
    def get_mathematical_queries() -> List[str]:
        """Mathematical reasoning queries"""
        return [
            "Solve for x: 2x + 5 = 13",
            "Calculate the derivative of f(x) = 3x^2 + 2x - 1",
            "Find the area of a circle with radius 7",
            "Solve the quadratic equation: x^2 - 5x + 6 = 0",
            "Calculate the compound interest on $1000 at 5% for 3 years",
            "Find the integral of sin(x) from 0 to π",
            "Determine if the series Σ(1/n^2) converges",
            "Calculate the probability of getting exactly 3 heads in 5 coin flips",
            "Find the eigenvalues of the matrix [[2, 1], [1, 2]]",
            "Solve the differential equation: dy/dx = 2y"
        ]

    @staticmethod
    def get_ai_agent_queries() -> List[str]:
        """AI Agent specific queries"""
        return [
            "How does the FSM agent handle recursive reasoning?",
            "Compare the performance of Chain of Thought vs FSM reasoning approaches",
            "What are the advantages of hybrid architecture in AI agents?",
            "How does the metacognitive layer improve reasoning quality?",
            "Analyze the trade-offs between cache size and memory usage in CoT systems",
            "What makes the OptimizedChainOfThought system different from basic CoT?",
            "How does the complexity analyzer determine reasoning depth?",
            "Explain the multi-path exploration strategy in reasoning systems",
            "What role does template selection play in reasoning quality?",
            "How can we optimize reasoning performance for real-time applications?"
        ]
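
# Illustrative helper (an addition, not part of the original dataset class): several
# tests below slice and concatenate the three tiers by hand; this wraps that pattern
# for ad-hoc runs that should exercise the full complexity range.
def get_mixed_queries(per_tier: int = 3) -> List[str]:
    """Return per_tier queries from each of the simple/medium/complex tiers."""
    return (
        QueryDataset.get_simple_queries()[:per_tier] +
        QueryDataset.get_medium_queries()[:per_tier] +
        QueryDataset.get_complex_queries()[:per_tier]
    )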
# =============================
# Benchmarking Engine
# =============================
class CoTBenchmark:
    """Benchmarking engine for Chain of Thought system"""

    def __init__(self):
        self.results: List[BenchmarkResult] = []
        self.cot_configs = [
            {'max_paths': 1, 'cache_size': 100},
            {'max_paths': 3, 'cache_size': 500},
            {'max_paths': 5, 'cache_size': 1000},
            {'max_paths': 7, 'cache_size': 2000}
        ]

    async def run_benchmark(self, cot_system, queries: List[str],
                            name: str = "default") -> BenchmarkSuite:
        """Run benchmark on a set of queries"""
        results = []
        print(f"Running benchmark: {name}")
        print(f"Number of queries: {len(queries)}")
        print("-" * 50)

        for i, query in enumerate(queries):
            print(f"Processing query {i+1}/{len(queries)}: {query[:50]}...")
            # Measure performance
            result = await self._benchmark_single_query(cot_system, query)
            results.append(result)

        suite = BenchmarkSuite(
            name=name,
            results=results,
            timestamp=time.time(),
            config=cot_system.config
        )
        return suite
    async def _benchmark_single_query(self, cot_system, query: str) -> BenchmarkResult:
        """Benchmark a single query"""
        # Start monitoring
        tracemalloc.start()
        process = psutil.Process()
        # Prime the CPU counter: psutil's first cpu_percent() call always returns 0.0,
        # and each subsequent call reports utilization since the previous call.
        process.cpu_percent()

        # Check if the query is already cached
        cache_result = cot_system.reasoning_cache.get(query)
        cache_hit = cache_result is not None

        # Execute reasoning
        start_time = time.time()
        result = await cot_system.reason(query)
        execution_time = time.time() - start_time

        # Get peak memory usage during the run
        current, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()
        memory_mb = peak / 1024 / 1024

        # CPU utilization since the priming call above
        cpu_usage = process.cpu_percent()

        # Get complexity
        complexity, _ = cot_system.complexity_analyzer.analyze(query)

        # Extract reasoning types
        reasoning_types = [step.reasoning_type.name for step in result.steps]

        return BenchmarkResult(
            query=query,
            complexity=complexity,
            execution_time=execution_time,
            confidence=result.total_confidence,
            steps_count=len(result.steps),
            cache_hit=cache_hit,
            memory_usage=memory_mb,
            cpu_usage=cpu_usage,
            template_used=result.template_used,
            paths_explored=len(result.steps),  # Simplified proxy; path count is not exposed
            reasoning_types=reasoning_types,
            final_answer=result.final_answer or "No answer generated"
        )
    async def run_comparative_benchmark(self) -> Dict[str, BenchmarkSuite]:
        """Run benchmarks with different configurations"""
        all_queries = (
            QueryDataset.get_simple_queries()[:5] +
            QueryDataset.get_medium_queries()[:5] +
            QueryDataset.get_complex_queries()[:5]
        )

        results = {}
        for config in self.cot_configs:
            config_name = f"paths_{config['max_paths']}_cache_{config['cache_size']}"
            # Create a CoT system with this config
            cot = OptimizedChainOfThought("benchmark_cot", config)
            # Run benchmark
            suite = await self.run_benchmark(cot, all_queries, config_name)
            results[config_name] = suite
        return results

    def analyze_results(self, suite: BenchmarkSuite) -> Dict[str, Any]:
        """Analyze benchmark results"""
        results = suite.results

        # Basic statistics
        execution_times = [r.execution_time for r in results]
        confidences = [r.confidence for r in results]
        steps_counts = [r.steps_count for r in results]

        # Group by complexity
        simple_results = [r for r in results if r.complexity < 0.3]
        medium_results = [r for r in results if 0.3 <= r.complexity < 0.7]
        complex_results = [r for r in results if r.complexity >= 0.7]

        analysis = {
            'overall': {
                'total_queries': len(results),
                'avg_execution_time': statistics.mean(execution_times),
                'std_execution_time': statistics.stdev(execution_times) if len(execution_times) > 1 else 0,
                'avg_confidence': statistics.mean(confidences),
                'avg_steps': statistics.mean(steps_counts),
                'cache_hit_rate': sum(1 for r in results if r.cache_hit) / len(results),
                'avg_memory_mb': statistics.mean([r.memory_usage for r in results]),
                'avg_cpu_percent': statistics.mean([r.cpu_usage for r in results])
            },
            'by_complexity': {
                'simple': self._analyze_group(simple_results),
                'medium': self._analyze_group(medium_results),
                'complex': self._analyze_group(complex_results)
            },
            'by_template': self._analyze_by_template(results),
            'by_reasoning_type': self._analyze_by_reasoning_type(results)
        }
        return analysis

    def _analyze_group(self, results: List[BenchmarkResult]) -> Dict[str, float]:
        """Analyze a group of results"""
        if not results:
            return {}
        return {
            'count': len(results),
            'avg_execution_time': statistics.mean([r.execution_time for r in results]),
            'avg_confidence': statistics.mean([r.confidence for r in results]),
            'avg_steps': statistics.mean([r.steps_count for r in results])
        }

    def _analyze_by_template(self, results: List[BenchmarkResult]) -> Dict[str, Dict[str, float]]:
        """Analyze results grouped by the template used"""
        template_results = {}
        for result in results:
            template_results.setdefault(result.template_used, []).append(result)
        return {template: self._analyze_group(group)
                for template, group in template_results.items()}

    def _analyze_by_reasoning_type(self, results: List[BenchmarkResult]) -> Dict[str, Dict[str, float]]:
        """Analyze results grouped by reasoning type"""
        type_results = {}
        for result in results:
            for reasoning_type in result.reasoning_types:
                type_results.setdefault(reasoning_type, []).append(result)
        return {reasoning_type: self._analyze_group(group)
                for reasoning_type, group in type_results.items()}
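
# Minimal usage sketch (an addition for illustration; it assumes OptimizedChainOfThought
# accepts the same name + config-dict constructor arguments used in cot_configs above):
async def smoke_test(per_tier: int = 2) -> Dict[str, Any]:
    """Quick end-to-end check of the benchmarking engine on a tiny mixed query set."""
    benchmark = CoTBenchmark()
    cot = OptimizedChainOfThought("smoke_cot", {'max_paths': 3, 'cache_size': 500})
    suite = await benchmark.run_benchmark(cot, get_mixed_queries(per_tier), "smoke")
    return benchmark.analyze_results(suite)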
# =============================
# Performance Visualization
# =============================
class BenchmarkVisualizer:
    """Visualize benchmark results"""

    @staticmethod
    def plot_execution_time_by_complexity(suite: BenchmarkSuite):
        """Plot execution time vs complexity"""
        complexities = [r.complexity for r in suite.results]
        execution_times = [r.execution_time for r in suite.results]

        plt.figure(figsize=(10, 6))
        plt.scatter(complexities, execution_times, alpha=0.6)
        plt.xlabel('Query Complexity')
        plt.ylabel('Execution Time (seconds)')
        plt.title('Execution Time vs Query Complexity')

        # Add a quadratic trend line
        z = np.polyfit(complexities, execution_times, 2)
        p = np.poly1d(z)
        x_trend = np.linspace(0, 1, 100)
        plt.plot(x_trend, p(x_trend), 'r--', alpha=0.8, label='Trend')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

    @staticmethod
    def plot_confidence_distribution(suite: BenchmarkSuite):
        """Plot confidence score distribution"""
        confidences = [r.confidence for r in suite.results]

        plt.figure(figsize=(10, 6))
        plt.hist(confidences, bins=20, alpha=0.7, edgecolor='black')
        plt.xlabel('Confidence Score')
        plt.ylabel('Frequency')
        plt.title('Distribution of Confidence Scores')
        plt.axvline(np.mean(confidences), color='red', linestyle='--',
                    label=f'Mean: {np.mean(confidences):.2f}')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

    @staticmethod
    def plot_template_performance(analysis: Dict[str, Any]):
        """Plot performance by template"""
        template_data = analysis['by_template']
        if not template_data:
            print("No template data available for visualization")
            return

        templates = list(template_data.keys())
        avg_times = [template_data[t]['avg_execution_time'] for t in templates]
        avg_confidence = [template_data[t]['avg_confidence'] for t in templates]

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

        # Execution time
        ax1.bar(templates, avg_times, alpha=0.7)
        ax1.set_xlabel('Template')
        ax1.set_ylabel('Average Execution Time (s)')
        ax1.set_title('Average Execution Time by Template')
        ax1.tick_params(axis='x', rotation=45)

        # Confidence
        ax2.bar(templates, avg_confidence, alpha=0.7, color='green')
        ax2.set_xlabel('Template')
        ax2.set_ylabel('Average Confidence')
        ax2.set_title('Average Confidence by Template')
        ax2.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

    @staticmethod
    def plot_comparative_results(comparative_results: Dict[str, BenchmarkSuite]):
        """Plot comparative benchmark results"""
        configs = []
        avg_times = []
        avg_confidences = []
        for config_name, suite in comparative_results.items():
            analysis = CoTBenchmark().analyze_results(suite)
            configs.append(config_name)
            avg_times.append(analysis['overall']['avg_execution_time'])
            avg_confidences.append(analysis['overall']['avg_confidence'])

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

        # Execution time comparison
        ax1.plot(configs, avg_times, 'o-', markersize=8)
        ax1.set_xlabel('Configuration')
        ax1.set_ylabel('Average Execution Time (s)')
        ax1.set_title('Execution Time by Configuration')
        ax1.tick_params(axis='x', rotation=45)
        ax1.grid(True, alpha=0.3)

        # Confidence comparison
        ax2.plot(configs, avg_confidences, 'o-', markersize=8, color='green')
        ax2.set_xlabel('Configuration')
        ax2.set_ylabel('Average Confidence')
        ax2.set_title('Confidence by Configuration')
        ax2.tick_params(axis='x', rotation=45)
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    @staticmethod
    def plot_memory_usage_analysis(suite: BenchmarkSuite):
        """Plot memory usage analysis"""
        memory_usage = [r.memory_usage for r in suite.results]
        complexities = [r.complexity for r in suite.results]

        plt.figure(figsize=(12, 8))

        # Memory vs complexity
        plt.subplot(2, 2, 1)
        plt.scatter(complexities, memory_usage, alpha=0.6)
        plt.xlabel('Query Complexity')
        plt.ylabel('Memory Usage (MB)')
        plt.title('Memory Usage vs Complexity')
        plt.grid(True, alpha=0.3)

        # Memory distribution
        plt.subplot(2, 2, 2)
        plt.hist(memory_usage, bins=15, alpha=0.7, edgecolor='black')
        plt.xlabel('Memory Usage (MB)')
        plt.ylabel('Frequency')
        plt.title('Memory Usage Distribution')
        plt.axvline(np.mean(memory_usage), color='red', linestyle='--',
                    label=f'Mean: {np.mean(memory_usage):.2f} MB')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # Memory vs execution time
        plt.subplot(2, 2, 3)
        execution_times = [r.execution_time for r in suite.results]
        plt.scatter(execution_times, memory_usage, alpha=0.6)
        plt.xlabel('Execution Time (s)')
        plt.ylabel('Memory Usage (MB)')
        plt.title('Memory Usage vs Execution Time')
        plt.grid(True, alpha=0.3)

        # Memory vs steps
        plt.subplot(2, 2, 4)
        steps_counts = [r.steps_count for r in suite.results]
        plt.scatter(steps_counts, memory_usage, alpha=0.6)
        plt.xlabel('Number of Steps')
        plt.ylabel('Memory Usage (MB)')
        plt.title('Memory Usage vs Steps Count')
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()
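
# Headless-run sketch (an assumption about the runtime, not original behavior): every
# plot method above ends in plt.show(), which needs an interactive display. On CI or a
# server, one low-touch workaround is to redirect show() to savefig():
def save_plots_instead_of_showing(prefix: str = "cot_benchmark") -> None:
    """Monkey-patch plt.show so each figure is written to disk and closed."""
    counter = {'n': 0}

    def _save(*args, **kwargs):
        counter['n'] += 1
        plt.savefig(f"{prefix}_{counter['n']:02d}.png", dpi=150, bbox_inches='tight')
        plt.close()

    plt.show = _save  # patch applies process-wide; call before any plotting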
# =============================
# Performance Testing Suite
# =============================
async def run_performance_tests():
    """Run comprehensive performance tests"""
    print("=== Chain of Thought Performance Testing Suite ===\n")

    # Initialize benchmark engine
    benchmark = CoTBenchmark()

    # Test 1: Basic Performance
    print("Test 1: Basic Performance")
    print("-" * 50)
    cot = OptimizedChainOfThought("test_cot", {'max_paths': 3})

    # Get test queries
    test_queries = (
        QueryDataset.get_simple_queries()[:3] +
        QueryDataset.get_medium_queries()[:3] +
        QueryDataset.get_complex_queries()[:2]
    )

    suite = await benchmark.run_benchmark(cot, test_queries, "basic_performance")
    analysis = benchmark.analyze_results(suite)

    print("\nBasic Performance Results:")
    print(f"Average execution time: {analysis['overall']['avg_execution_time']:.3f}s")
    print(f"Average confidence: {analysis['overall']['avg_confidence']:.3f}")
    print(f"Cache hit rate: {analysis['overall']['cache_hit_rate']:.2%}")
    print(f"Average memory usage: {analysis['overall']['avg_memory_mb']:.2f} MB")

    # Test 2: Stress Test
    print("\n\nTest 2: Stress Test (50 queries)")
    print("-" * 50)

    # Generate many queries of varying complexity
    stress_queries = []
    for i in range(50):
        complexity = np.random.random()
        if complexity < 0.3:
            stress_queries.append(f"Simple query {i}: What is {i} + {i+1}?")
        elif complexity < 0.7:
            stress_queries.append(f"Medium query {i}: Explain concept {i} in detail.")
        else:
            stress_queries.append(
                f"Complex query {i}: Analyze the multifaceted implications of topic {i} "
                f"considering various perspectives and long-term consequences."
            )

    stress_suite = await benchmark.run_benchmark(cot, stress_queries, "stress_test")
    stress_analysis = benchmark.analyze_results(stress_suite)

    print("\nStress Test Results:")
    print(f"Total queries processed: {stress_analysis['overall']['total_queries']}")
    print(f"Average execution time: {stress_analysis['overall']['avg_execution_time']:.3f}s")
    print(f"Peak memory usage: {max(r.memory_usage for r in stress_suite.results):.2f} MB")
    # Test 3: Cache Performance
    print("\n\nTest 3: Cache Performance")
    print("-" * 50)

    # Test cache effectiveness. Note: Test 1 already ran some of these medium queries
    # on the same system, so the "cold" run may start partially warm.
    cache_test_queries = QueryDataset.get_medium_queries()[:5]

    # First run - cold cache
    print("First run (cold cache)...")
    first_run = await benchmark.run_benchmark(cot, cache_test_queries, "cache_cold")

    # Second run - warm cache
    print("Second run (warm cache)...")
    second_run = await benchmark.run_benchmark(cot, cache_test_queries, "cache_warm")

    first_times = [r.execution_time for r in first_run.results]
    second_times = [r.execution_time for r in second_run.results]
    # Guard against a near-zero denominator when cached runs are effectively instant
    speedup = statistics.mean(first_times) / max(statistics.mean(second_times), 1e-9)

    print(f"\nCache speedup: {speedup:.2f}x")
    print(f"First run avg: {statistics.mean(first_times):.3f}s")
    print(f"Second run avg: {statistics.mean(second_times):.3f}s")
    # Test 4: Comparative Configuration Test
    print("\n\nTest 4: Configuration Comparison")
    print("-" * 50)

    comparative_results = await benchmark.run_comparative_benchmark()

    print("\nConfiguration Comparison Results:")
    # Use distinct loop variable names so the Test 1 'suite' and 'analysis' (plotted
    # below) are not shadowed.
    for config_name, config_suite in comparative_results.items():
        config_analysis = benchmark.analyze_results(config_suite)
        print(f"\n{config_name}:")
        print(f"  Avg execution time: {config_analysis['overall']['avg_execution_time']:.3f}s")
        print(f"  Avg confidence: {config_analysis['overall']['avg_confidence']:.3f}")
        print(f"  Avg memory: {config_analysis['overall']['avg_memory_mb']:.2f} MB")

    # Test 5: Domain-Specific Performance
    print("\n\nTest 5: Domain-Specific Performance")
    print("-" * 50)

    # Test different query domains
    domain_queries = {
        'mathematical': QueryDataset.get_mathematical_queries()[:5],
        'ai_agent': QueryDataset.get_ai_agent_queries()[:5]
    }

    domain_results = {}
    for domain, queries in domain_queries.items():
        print(f"Testing {domain} queries...")
        domain_suite = await benchmark.run_benchmark(cot, queries, f"{domain}_domain")
        domain_analysis = benchmark.analyze_results(domain_suite)
        domain_results[domain] = domain_analysis
        print(f"  {domain.capitalize()} domain:")
        print(f"    Avg execution time: {domain_analysis['overall']['avg_execution_time']:.3f}s")
        print(f"    Avg confidence: {domain_analysis['overall']['avg_confidence']:.3f}")

    # Visualize results (based on the Test 1 suite and analysis)
    print("\n\nGenerating visualizations...")
    visualizer = BenchmarkVisualizer()

    # Plot execution time vs complexity
    visualizer.plot_execution_time_by_complexity(suite)
    # Plot confidence distribution
    visualizer.plot_confidence_distribution(suite)
    # Plot template performance
    visualizer.plot_template_performance(analysis)
    # Plot comparative results
    visualizer.plot_comparative_results(comparative_results)
    # Plot memory usage analysis
    visualizer.plot_memory_usage_analysis(suite)

    return {
        'basic_performance': analysis,
        'stress_test': stress_analysis,
        'cache_performance': {
            'speedup': speedup,
            'first_run_avg': statistics.mean(first_times),
            'second_run_avg': statistics.mean(second_times)
        },
        'comparative': comparative_results,
        'domain_specific': domain_results
    }
# =============================
# Main Execution
# =============================
if __name__ == "__main__":
    # Run performance tests
    results = asyncio.run(run_performance_tests())

    # Save results. The 'comparative' entry holds BenchmarkSuite objects, which are not
    # directly JSON-serializable, so it is omitted here (see suite_to_dict above for a
    # conversion sketch).
    with open('cot_benchmark_results.json', 'w') as f:
        serializable_results = {
            'timestamp': time.time(),
            'basic_performance': results['basic_performance'],
            'stress_test': results['stress_test'],
            'cache_performance': results['cache_performance'],
            'domain_specific': results['domain_specific']
        }
        json.dump(serializable_results, f, indent=2)

    print("\n\nBenchmark results saved to 'cot_benchmark_results.json'")
    print("Performance testing completed successfully!")