RobotPai

Build error

App Files Files Community

RobotPai / tests /performance /cot_benchmark_suite.py

atr0p05

Upload 291 files

8a682b5 verified 10 months ago

raw

history blame contribute delete

26.5 kB

	"""
	Chain of Thought Performance Benchmarks and Testing Suite
	Comprehensive benchmarking for the Optimized CoT System
	"""

	import asyncio
	import time
	import numpy as np
	import pandas as pd
	from typing import Dict, List, Tuple, Any
	import matplotlib.pyplot as plt
	import seaborn as sns
	from dataclasses import dataclass
	import json
	import statistics
	from concurrent.futures import ProcessPoolExecutor
	import psutil
	import tracemalloc
	import sys
	import os
	import logging

	# Make the project's ``src`` directory (two levels above this file) importable.
	sys.path.insert(0, os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, 'src'))

	# Bring in the CoT system under test.
	try:
	    from core.optimized_chain_of_thought import (
	        OptimizedChainOfThought,
	        ReasoningType,
	        ReasoningPath,
	        ReasoningStep
	    )
	except ImportError:
	    # Direct-execution fallback: expose the directory two levels up instead
	    # and import through the ``src`` package.
	    sys.path.insert(0, os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
	    from src.core.optimized_chain_of_thought import (
	        OptimizedChainOfThought,
	        ReasoningType,
	        ReasoningPath,
	        ReasoningStep
	    )

	# Module-wide logging: INFO level on the root logger, plus a named logger.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# =============================
	# Benchmark Data Structures
	# =============================

	@dataclass
	class BenchmarkResult:
	    """Measurements captured for one benchmarked query."""

	    # The query text that was submitted.
	    query: str
	    # Complexity score reported for the query.
	    complexity: float
	    # Wall-clock duration of the reasoning call, in seconds.
	    execution_time: float
	    # Confidence reported for the reasoning result.
	    confidence: float
	    # Number of reasoning steps in the result.
	    steps_count: int
	    # Whether the query was found in the cache before reasoning started.
	    cache_hit: bool
	    # Memory figure for the run, in megabytes.
	    memory_usage: float
	    # CPU utilisation figure for the run, in percent.
	    cpu_usage: float
	    # Name of the template the reasoning result reports having used.
	    template_used: str
	    # Number of reasoning paths explored.
	    paths_explored: int
	    # Reasoning-type name of each step, in step order.
	    reasoning_types: List[str]
	    # Final answer text.
	    final_answer: str

	@dataclass
	class BenchmarkSuite:
	    """A named batch of benchmark results plus the context they were run in."""

	    # Label identifying this benchmark run.
	    name: str
	    # Per-query measurements, in the order the queries were processed.
	    results: List[BenchmarkResult]
	    # Creation time of the suite as a ``time.time()`` value.
	    timestamp: float
	    # Configuration of the CoT system that produced the results.
	    config: Dict[str, Any]

	# =============================
	# Test Query Sets
	# =============================

	class QueryDataset:
	    """Fixed query sets used as benchmark input.

	    The query text lives in private class-level tuples; every accessor
	    returns a fresh list so callers may slice or mutate it freely.
	    """

	    _SIMPLE = (
	        "What is 2 + 2?",
	        "What color is the sky?",
	        "Define democracy.",
	        "What is the capital of France?",
	        "How many days in a week?",
	        "What is water made of?",
	        "When did World War II end?",
	        "What is the speed of light?",
	        "Who wrote Romeo and Juliet?",
	        "What is the largest planet?",
	    )

	    _MEDIUM = (
	        "Explain the process of photosynthesis.",
	        "Compare renewable and non-renewable energy sources.",
	        "What are the main causes of climate change?",
	        "Describe the water cycle and its importance.",
	        "How does the stock market work?",
	        "Explain the difference between virus and bacteria.",
	        "What factors contributed to the Industrial Revolution?",
	        "How do vaccines work to prevent diseases?",
	        "Describe the structure of DNA.",
	        "What are the pros and cons of social media?",
	    )

	    _COMPLEX = (
	        "Analyze the potential long-term socioeconomic impacts of artificial intelligence on global employment patterns, considering both displacement effects and new job creation.",
	        "Compare and contrast the philosophical foundations of utilitarianism and deontological ethics, providing examples of how each would approach modern ethical dilemmas.",
	        "Evaluate the effectiveness of different monetary policy tools in combating inflation while maintaining economic growth, considering recent global economic trends.",
	        "Discuss the role of epigenetics in evolution and heredity, explaining how environmental factors can influence gene expression across generations.",
	        "Analyze the geopolitical implications of renewable energy transition on international relations and global power dynamics.",
	        "Examine the intersection of quantum mechanics and consciousness, discussing various interpretations and their philosophical implications.",
	        "Evaluate the challenges and opportunities of establishing a sustainable human colony on Mars, considering technological, biological, and social factors.",
	        "Analyze the impact of social media algorithms on democratic processes and public discourse, proposing potential regulatory frameworks.",
	        "Discuss the ethical implications of gene editing technologies like CRISPR, considering medical benefits, risks, and societal concerns.",
	        "Examine the role of cognitive biases in financial decision-making and their impact on market efficiency.",
	    )

	    _MATHEMATICAL = (
	        "Solve for x: 2x + 5 = 13",
	        "Calculate the derivative of f(x) = 3x^2 + 2x - 1",
	        "Find the area of a circle with radius 7",
	        "Solve the quadratic equation: x^2 - 5x + 6 = 0",
	        "Calculate the compound interest on $1000 at 5% for 3 years",
	        "Find the integral of sin(x) from 0 to π",
	        "Determine if the series Σ(1/n^2) converges",
	        "Calculate the probability of getting exactly 3 heads in 5 coin flips",
	        "Find the eigenvalues of the matrix [[2, 1], [1, 2]]",
	        "Solve the differential equation: dy/dx = 2y",
	    )

	    _AI_AGENT = (
	        "How does the FSM agent handle recursive reasoning?",
	        "Compare the performance of Chain of Thought vs FSM reasoning approaches",
	        "What are the advantages of hybrid architecture in AI agents?",
	        "How does the metacognitive layer improve reasoning quality?",
	        "Analyze the trade-offs between cache size and memory usage in CoT systems",
	        "What makes the OptimizedChainOfThought system different from basic CoT?",
	        "How does the complexity analyzer determine reasoning depth?",
	        "Explain the multi-path exploration strategy in reasoning systems",
	        "What role does template selection play in reasoning quality?",
	        "How can we optimize reasoning performance for real-time applications?",
	    )

	    @staticmethod
	    def get_simple_queries() -> List[str]:
	        """Return the simple query set (intended complexity < 0.3)."""
	        return list(QueryDataset._SIMPLE)

	    @staticmethod
	    def get_medium_queries() -> List[str]:
	        """Return the medium query set (intended complexity between 0.3 and 0.7)."""
	        return list(QueryDataset._MEDIUM)

	    @staticmethod
	    def get_complex_queries() -> List[str]:
	        """Return the complex query set (intended complexity > 0.7)."""
	        return list(QueryDataset._COMPLEX)

	    @staticmethod
	    def get_mathematical_queries() -> List[str]:
	        """Return the mathematical-reasoning query set."""
	        return list(QueryDataset._MATHEMATICAL)

	    @staticmethod
	    def get_ai_agent_queries() -> List[str]:
	        """Return the AI-agent-specific query set."""
	        return list(QueryDataset._AI_AGENT)

	# =============================
	# Benchmarking Engine
	# =============================

	class CoTBenchmark:
	    """Benchmarking engine for Chain of Thought system.

	    Feeds queries to a CoT system one at a time, records per-query timing,
	    confidence, memory and CPU figures, and aggregates them into summary
	    statistics.
	    """

	    def __init__(self):
	        # run_benchmark() hands its results back inside a BenchmarkSuite and
	        # does not append to this list.
	        self.results: List["BenchmarkResult"] = []
	        # Configurations swept by run_comparative_benchmark().
	        self.cot_configs = [
	            {'max_paths': 1, 'cache_size': 100},
	            {'max_paths': 3, 'cache_size': 500},
	            {'max_paths': 5, 'cache_size': 1000},
	            {'max_paths': 7, 'cache_size': 2000}
	        ]

	    async def run_benchmark(self, cot_system, queries: List[str],
	                            name: str = "default") -> "BenchmarkSuite":
	        """Run every query through ``cot_system`` sequentially.

	        Args:
	            cot_system: System under test; this method reads its ``config``
	                attribute and passes it on to ``_benchmark_single_query``.
	            queries: Query strings, processed in order.
	            name: Label stored on the returned suite.

	        Returns:
	            A ``BenchmarkSuite`` with one result per query, stamped with the
	            current time and the system's config.
	        """
	        results = []

	        print(f"Running benchmark: {name}")
	        print(f"Number of queries: {len(queries)}")
	        print("-" * 50)

	        for i, query in enumerate(queries):
	            print(f"Processing query {i+1}/{len(queries)}: {query[:50]}...")

	            # Measure performance
	            result = await self._benchmark_single_query(cot_system, query)
	            results.append(result)

	        return BenchmarkSuite(
	            name=name,
	            results=results,
	            timestamp=time.time(),
	            config=cot_system.config
	        )

	    async def _benchmark_single_query(self, cot_system, query: str) -> "BenchmarkResult":
	        """Run one query and capture timing, memory and CPU measurements.

	        Args:
	            cot_system: System under test; must expose ``reasoning_cache.get``,
	                an awaitable ``reason`` and ``complexity_analyzer.analyze``.
	            query: The query string to run.

	        Returns:
	            A ``BenchmarkResult`` describing the run.
	        """
	        process = psutil.Process()
	        # psutil reports utilisation *since the previous call*, and the first
	        # call on a Process returns a meaningless 0.0. Prime the counter here
	        # so the reading taken after reasoning covers exactly this query.
	        process.cpu_percent()

	        # Look the query up before reasoning so the flag reflects the cache
	        # state the reasoning call starts from.
	        cache_hit = cot_system.reasoning_cache.get(query) is not None

	        # Trace allocations only around the reasoning call, and always stop
	        # tracing, even if reasoning raises, so tracemalloc is not left on.
	        tracemalloc.start()
	        try:
	            # perf_counter() is monotonic and high-resolution, unlike time().
	            start_time = time.perf_counter()
	            result = await cot_system.reason(query)
	            execution_time = time.perf_counter() - start_time
	            _, peak = tracemalloc.get_traced_memory()
	        finally:
	            tracemalloc.stop()
	        memory_mb = peak / 1024 / 1024

	        # Utilisation since the priming call above; no subtraction needed.
	        cpu_usage = process.cpu_percent()

	        # Get complexity
	        complexity, _ = cot_system.complexity_analyzer.analyze(query)

	        # Extract reasoning types
	        reasoning_types = [step.reasoning_type.name for step in result.steps]

	        return BenchmarkResult(
	            query=query,
	            complexity=complexity,
	            execution_time=execution_time,
	            confidence=result.total_confidence,
	            steps_count=len(result.steps),
	            cache_hit=cache_hit,
	            memory_usage=memory_mb,
	            cpu_usage=cpu_usage,
	            template_used=result.template_used,
	            paths_explored=len(result.steps),  # Simplified
	            reasoning_types=reasoning_types,
	            final_answer=result.final_answer or "No answer generated"
	        )

	    async def run_comparative_benchmark(self) -> Dict[str, "BenchmarkSuite"]:
	        """Benchmark a fresh CoT system for every entry in ``self.cot_configs``.

	        Uses the first five simple, medium and complex queries for each run.

	        Returns:
	            Mapping of ``paths_<max_paths>_cache_<cache_size>`` to its suite.
	        """
	        all_queries = (
	            QueryDataset.get_simple_queries()[:5] +
	            QueryDataset.get_medium_queries()[:5] +
	            QueryDataset.get_complex_queries()[:5]
	        )

	        results = {}

	        for config in self.cot_configs:
	            config_name = f"paths_{config['max_paths']}_cache_{config['cache_size']}"

	            # Create CoT system with config
	            cot = OptimizedChainOfThought("benchmark_cot", config)

	            # Run benchmark
	            results[config_name] = await self.run_benchmark(cot, all_queries, config_name)

	        return results

	    @staticmethod
	    def _mean_or_zero(values) -> float:
	        """Return the arithmetic mean of ``values``, or 0.0 when there are none."""
	        values = list(values)
	        return statistics.mean(values) if values else 0.0

	    def analyze_results(self, suite: "BenchmarkSuite") -> Dict[str, Any]:
	        """Summarise a suite overall, by complexity band, template and reasoning type.

	        An empty suite yields zeroed ``overall`` figures and empty groupings
	        instead of raising.

	        Args:
	            suite: Object whose ``results`` attribute holds the measurements.

	        Returns:
	            Dict with the keys ``overall``, ``by_complexity``, ``by_template``
	            and ``by_reasoning_type``.
	        """
	        results = suite.results
	        total = len(results)

	        # Basic statistics
	        execution_times = [r.execution_time for r in results]

	        # Group by complexity
	        simple_results = [r for r in results if r.complexity < 0.3]
	        medium_results = [r for r in results if 0.3 <= r.complexity < 0.7]
	        complex_results = [r for r in results if r.complexity >= 0.7]

	        mean = self._mean_or_zero
	        return {
	            'overall': {
	                'total_queries': total,
	                'avg_execution_time': mean(execution_times),
	                # Sample standard deviation needs at least two data points.
	                'std_execution_time': statistics.stdev(execution_times) if total > 1 else 0,
	                'avg_confidence': mean(r.confidence for r in results),
	                'avg_steps': mean(r.steps_count for r in results),
	                'cache_hit_rate': (sum(1 for r in results if r.cache_hit) / total) if total else 0.0,
	                'avg_memory_mb': mean(r.memory_usage for r in results),
	                'avg_cpu_percent': mean(r.cpu_usage for r in results)
	            },
	            'by_complexity': {
	                'simple': self._analyze_group(simple_results),
	                'medium': self._analyze_group(medium_results),
	                'complex': self._analyze_group(complex_results)
	            },
	            'by_template': self._analyze_by_template(results),
	            'by_reasoning_type': self._analyze_by_reasoning_type(results)
	        }

	    def _analyze_group(self, results: List["BenchmarkResult"]) -> Dict[str, float]:
	        """Return count and mean time/confidence/steps for a group; ``{}`` if empty."""
	        if not results:
	            return {}

	        return {
	            'count': len(results),
	            'avg_execution_time': statistics.mean([r.execution_time for r in results]),
	            'avg_confidence': statistics.mean([r.confidence for r in results]),
	            'avg_steps': statistics.mean([r.steps_count for r in results])
	        }

	    def _analyze_by_template(self, results: List["BenchmarkResult"]) -> Dict[str, Dict[str, float]]:
	        """Group results by ``template_used`` and summarise each group."""
	        template_results = {}
	        for result in results:
	            template_results.setdefault(result.template_used, []).append(result)

	        return {
	            template: self._analyze_group(template_group)
	            for template, template_group in template_results.items()
	        }

	    def _analyze_by_reasoning_type(self, results: List["BenchmarkResult"]) -> Dict[str, Dict[str, float]]:
	        """Group results by reasoning type and summarise each group.

	        A result contributes at most once to each type it used, so a query
	        whose steps repeat a reasoning type is not double-counted.
	        """
	        type_results = {}
	        for result in results:
	            # dict.fromkeys de-duplicates while keeping first-seen order.
	            for reasoning_type in dict.fromkeys(result.reasoning_types):
	                type_results.setdefault(reasoning_type, []).append(result)

	        return {
	            reasoning_type: self._analyze_group(type_group)
	            for reasoning_type, type_group in type_results.items()
	        }

	# =============================
	# Performance Visualization
	# =============================

	class BenchmarkVisualizer:
	    """Visualize benchmark results with matplotlib.

	    Every method builds a figure and displays it with ``plt.show()``.
	    """

	    @staticmethod
	    def plot_execution_time_by_complexity(suite: BenchmarkSuite):
	        """Scatter execution time against complexity, with a quadratic trend.

	        The trend line (and its legend) is drawn only when the suite holds at
	        least three distinct complexity values, the minimum that determines
	        a degree-2 polynomial fit.
	        """
	        complexities = [r.complexity for r in suite.results]
	        execution_times = [r.execution_time for r in suite.results]

	        plt.figure(figsize=(10, 6))
	        plt.scatter(complexities, execution_times, alpha=0.6)
	        plt.xlabel('Query Complexity')
	        plt.ylabel('Execution Time (seconds)')
	        plt.title('Execution Time vs Query Complexity')

	        # Add trend line. np.polyfit rejects empty input and is
	        # ill-conditioned with fewer points than coefficients, so skip it
	        # for very small suites rather than crash or draw a bogus curve.
	        if len(set(complexities)) >= 3:
	            z = np.polyfit(complexities, execution_times, 2)
	            p = np.poly1d(z)
	            x_trend = np.linspace(0, 1, 100)
	            plt.plot(x_trend, p(x_trend), 'r--', alpha=0.8, label='Trend')
	            # Only meaningful when a labelled artist exists.
	            plt.legend()

	        plt.grid(True, alpha=0.3)
	        plt.tight_layout()
	        plt.show()

	    @staticmethod
	    def plot_confidence_distribution(suite: BenchmarkSuite):
	        """Histogram of confidence scores, with the mean marked when data exists."""
	        confidences = [r.confidence for r in suite.results]

	        plt.figure(figsize=(10, 6))
	        plt.hist(confidences, bins=20, alpha=0.7, edgecolor='black')
	        plt.xlabel('Confidence Score')
	        plt.ylabel('Frequency')
	        plt.title('Distribution of Confidence Scores')
	        # The mean of an empty list is NaN (with a warning); skip the marker.
	        if confidences:
	            mean_confidence = np.mean(confidences)
	            plt.axvline(mean_confidence, color='red', linestyle='--',
	                        label=f'Mean: {mean_confidence:.2f}')
	            plt.legend()
	        plt.grid(True, alpha=0.3)
	        plt.tight_layout()
	        plt.show()

	    @staticmethod
	    def plot_template_performance(analysis: Dict[str, Any]):
	        """Bar charts of average execution time and confidence per template.

	        Args:
	            analysis: Dict with a ``'by_template'`` entry mapping template
	                name to a dict holding ``'avg_execution_time'`` and
	                ``'avg_confidence'``.
	        """
	        template_data = analysis['by_template']

	        if not template_data:
	            print("No template data available for visualization")
	            return

	        templates = list(template_data.keys())
	        avg_times = [template_data[t]['avg_execution_time'] for t in templates]
	        avg_confidence = [template_data[t]['avg_confidence'] for t in templates]

	        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

	        # Execution time
	        ax1.bar(templates, avg_times, alpha=0.7)
	        ax1.set_xlabel('Template')
	        ax1.set_ylabel('Average Execution Time (s)')
	        ax1.set_title('Average Execution Time by Template')
	        ax1.tick_params(axis='x', rotation=45)

	        # Confidence
	        ax2.bar(templates, avg_confidence, alpha=0.7, color='green')
	        ax2.set_xlabel('Template')
	        ax2.set_ylabel('Average Confidence')
	        ax2.set_title('Average Confidence by Template')
	        ax2.tick_params(axis='x', rotation=45)

	        plt.tight_layout()
	        plt.show()

	    @staticmethod
	    def plot_comparative_results(comparative_results: Dict[str, BenchmarkSuite]):
	        """Line plots of average time and confidence for each configuration.

	        Args:
	            comparative_results: Mapping of configuration name to its suite;
	                each suite is summarised with ``CoTBenchmark.analyze_results``.
	        """
	        configs = []
	        avg_times = []
	        avg_confidences = []

	        # One analyzer is enough; there is no need to rebuild it per suite.
	        analyzer = CoTBenchmark()
	        for config_name, suite in comparative_results.items():
	            overall = analyzer.analyze_results(suite)['overall']
	            configs.append(config_name)
	            avg_times.append(overall['avg_execution_time'])
	            avg_confidences.append(overall['avg_confidence'])

	        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

	        # Execution time comparison
	        ax1.plot(configs, avg_times, 'o-', markersize=8)
	        ax1.set_xlabel('Configuration')
	        ax1.set_ylabel('Average Execution Time (s)')
	        ax1.set_title('Execution Time by Configuration')
	        ax1.tick_params(axis='x', rotation=45)
	        ax1.grid(True, alpha=0.3)

	        # Confidence comparison
	        ax2.plot(configs, avg_confidences, 'o-', markersize=8, color='green')
	        ax2.set_xlabel('Configuration')
	        ax2.set_ylabel('Average Confidence')
	        ax2.set_title('Confidence by Configuration')
	        ax2.tick_params(axis='x', rotation=45)
	        ax2.grid(True, alpha=0.3)

	        plt.tight_layout()
	        plt.show()

	    @staticmethod
	    def plot_memory_usage_analysis(suite: BenchmarkSuite):
	        """2x2 grid relating memory usage to complexity, time and step count."""
	        memory_usage = [r.memory_usage for r in suite.results]
	        complexities = [r.complexity for r in suite.results]

	        plt.figure(figsize=(12, 8))

	        # Memory vs complexity
	        plt.subplot(2, 2, 1)
	        plt.scatter(complexities, memory_usage, alpha=0.6)
	        plt.xlabel('Query Complexity')
	        plt.ylabel('Memory Usage (MB)')
	        plt.title('Memory Usage vs Complexity')
	        plt.grid(True, alpha=0.3)

	        # Memory distribution
	        plt.subplot(2, 2, 2)
	        plt.hist(memory_usage, bins=15, alpha=0.7, edgecolor='black')
	        plt.xlabel('Memory Usage (MB)')
	        plt.ylabel('Frequency')
	        plt.title('Memory Usage Distribution')
	        # The mean of an empty list is NaN (with a warning); skip the marker.
	        if memory_usage:
	            mean_memory = np.mean(memory_usage)
	            plt.axvline(mean_memory, color='red', linestyle='--',
	                        label=f'Mean: {mean_memory:.2f} MB')
	            plt.legend()
	        plt.grid(True, alpha=0.3)

	        # Memory vs execution time
	        plt.subplot(2, 2, 3)
	        execution_times = [r.execution_time for r in suite.results]
	        plt.scatter(execution_times, memory_usage, alpha=0.6)
	        plt.xlabel('Execution Time (s)')
	        plt.ylabel('Memory Usage (MB)')
	        plt.title('Memory Usage vs Execution Time')
	        plt.grid(True, alpha=0.3)

	        # Memory vs steps
	        plt.subplot(2, 2, 4)
	        steps_counts = [r.steps_count for r in suite.results]
	        plt.scatter(steps_counts, memory_usage, alpha=0.6)
	        plt.xlabel('Number of Steps')
	        plt.ylabel('Memory Usage (MB)')
	        plt.title('Memory Usage vs Steps Count')
	        plt.grid(True, alpha=0.3)

	        plt.tight_layout()
	        plt.show()

	# =============================
	# Performance Testing Suite
	# =============================

	async def run_performance_tests():
	    """Run comprehensive performance tests.

	    Executes five scenarios (basic performance, a 50-query stress test,
	    cold/warm cache comparison, configuration comparison and
	    domain-specific queries), prints a summary of each, then shows the
	    visualizations for the basic-performance run.

	    Returns:
	        Dict with the keys ``basic_performance``, ``stress_test``,
	        ``cache_performance``, ``comparative`` and ``domain_specific``.
	    """
	    print("=== Chain of Thought Performance Testing Suite ===\n")

	    # Initialize benchmark engine
	    benchmark = CoTBenchmark()

	    # Test 1: Basic Performance
	    print("Test 1: Basic Performance")
	    print("-" * 50)

	    cot = OptimizedChainOfThought("test_cot", {'max_paths': 3})

	    # Get test queries
	    test_queries = (
	        QueryDataset.get_simple_queries()[:3] +
	        QueryDataset.get_medium_queries()[:3] +
	        QueryDataset.get_complex_queries()[:2]
	    )

	    suite = await benchmark.run_benchmark(cot, test_queries, "basic_performance")
	    analysis = benchmark.analyze_results(suite)

	    print("\nBasic Performance Results:")
	    print(f"Average execution time: {analysis['overall']['avg_execution_time']:.3f}s")
	    print(f"Average confidence: {analysis['overall']['avg_confidence']:.3f}")
	    print(f"Cache hit rate: {analysis['overall']['cache_hit_rate']:.2%}")
	    print(f"Average memory usage: {analysis['overall']['avg_memory_mb']:.2f} MB")

	    # Test 2: Stress Test
	    print("\n\nTest 2: Stress Test (50 queries)")
	    print("-" * 50)

	    # Generate many queries; a random draw picks the shape of each one.
	    stress_queries = []
	    for i in range(50):
	        complexity = np.random.random()
	        if complexity < 0.3:
	            stress_queries.append(f"Simple query {i}: What is {i} + {i+1}?")
	        elif complexity < 0.7:
	            stress_queries.append(f"Medium query {i}: Explain concept {i} in detail.")
	        else:
	            stress_queries.append(
	                f"Complex query {i}: Analyze the multifaceted implications of topic {i} "
	                f"considering various perspectives and long-term consequences."
	            )

	    stress_suite = await benchmark.run_benchmark(cot, stress_queries, "stress_test")
	    stress_analysis = benchmark.analyze_results(stress_suite)

	    print("\nStress Test Results:")
	    print(f"Total queries processed: {stress_analysis['overall']['total_queries']}")
	    print(f"Average execution time: {stress_analysis['overall']['avg_execution_time']:.3f}s")
	    print(f"Peak memory usage: {max(r.memory_usage for r in stress_suite.results):.2f} MB")

	    # Test 3: Cache Performance
	    print("\n\nTest 3: Cache Performance")
	    print("-" * 50)

	    # Test cache effectiveness
	    cache_test_queries = QueryDataset.get_medium_queries()[:5]

	    # Use a fresh system: ``cot`` already processed three of these queries
	    # in Test 1, so its "cold" run would not start from an empty cache.
	    cache_cot = OptimizedChainOfThought("cache_test_cot", {'max_paths': 3})

	    # First run - no cache
	    print("First run (cold cache)...")
	    first_run = await benchmark.run_benchmark(cache_cot, cache_test_queries, "cache_cold")

	    # Second run - with cache
	    print("Second run (warm cache)...")
	    second_run = await benchmark.run_benchmark(cache_cot, cache_test_queries, "cache_warm")

	    first_times = [r.execution_time for r in first_run.results]
	    second_times = [r.execution_time for r in second_run.results]
	    first_avg = statistics.mean(first_times)
	    second_avg = statistics.mean(second_times)

	    # Warm runs can measure as 0 s on coarse clocks; avoid dividing by zero.
	    speedup = first_avg / second_avg if second_avg > 0 else float('inf')
	    print(f"\nCache speedup: {speedup:.2f}x")
	    print(f"First run avg: {first_avg:.3f}s")
	    print(f"Second run avg: {second_avg:.3f}s")

	    # Test 4: Comparative Configuration Test
	    print("\n\nTest 4: Configuration Comparison")
	    print("-" * 50)

	    comparative_results = await benchmark.run_comparative_benchmark()

	    print("\nConfiguration Comparison Results:")
	    # Dedicated loop names: reusing ``suite``/``analysis`` here would
	    # overwrite the Test 1 results that are plotted and returned below.
	    for config_name, config_suite in comparative_results.items():
	        config_analysis = benchmark.analyze_results(config_suite)
	        print(f"\n{config_name}:")
	        print(f" Avg execution time: {config_analysis['overall']['avg_execution_time']:.3f}s")
	        print(f" Avg confidence: {config_analysis['overall']['avg_confidence']:.3f}")
	        print(f" Avg memory: {config_analysis['overall']['avg_memory_mb']:.2f} MB")

	    # Test 5: Domain-Specific Performance
	    print("\n\nTest 5: Domain-Specific Performance")
	    print("-" * 50)

	    # Test different query domains
	    domain_queries = {
	        'mathematical': QueryDataset.get_mathematical_queries()[:5],
	        'ai_agent': QueryDataset.get_ai_agent_queries()[:5]
	    }

	    domain_results = {}
	    for domain, queries in domain_queries.items():
	        print(f"Testing {domain} queries...")
	        domain_suite = await benchmark.run_benchmark(cot, queries, f"{domain}_domain")
	        domain_analysis = benchmark.analyze_results(domain_suite)
	        domain_results[domain] = domain_analysis

	        print(f" {domain.capitalize()} domain:")
	        print(f" Avg execution time: {domain_analysis['overall']['avg_execution_time']:.3f}s")
	        print(f" Avg confidence: {domain_analysis['overall']['avg_confidence']:.3f}")

	    # Visualize results (all single-suite plots use the Test 1 run)
	    print("\n\nGenerating visualizations...")
	    visualizer = BenchmarkVisualizer()

	    # Plot execution time vs complexity
	    visualizer.plot_execution_time_by_complexity(suite)

	    # Plot confidence distribution
	    visualizer.plot_confidence_distribution(suite)

	    # Plot template performance
	    visualizer.plot_template_performance(analysis)

	    # Plot comparative results
	    visualizer.plot_comparative_results(comparative_results)

	    # Plot memory usage analysis
	    visualizer.plot_memory_usage_analysis(suite)

	    return {
	        'basic_performance': analysis,
	        'stress_test': stress_analysis,
	        'cache_performance': {
	            'speedup': speedup,
	            'first_run_avg': first_avg,
	            'second_run_avg': second_avg
	        },
	        'comparative': comparative_results,
	        'domain_specific': domain_results
	    }

	# =============================
	# Main Execution
	# =============================

	if __name__ == "__main__":
	    # Execute the whole suite on a fresh event loop.
	    results = asyncio.run(run_performance_tests())

	    # Persist the summaries. The 'comparative' entry is left out: it holds
	    # BenchmarkSuite objects, which json cannot serialise directly.
	    exported_sections = (
	        'basic_performance',
	        'stress_test',
	        'cache_performance',
	        'domain_specific',
	    )
	    with open('cot_benchmark_results.json', 'w') as f:
	        serializable_results = {
	            'timestamp': time.time(),
	            **{section: results[section] for section in exported_sections}
	        }
	        json.dump(serializable_results, f, indent=2)

	    print("\n\nBenchmark results saved to 'cot_benchmark_results.json'")
	    print("Performance testing completed successfully!")