Spaces:

jkbennitt
/

felix-framework

Paused

File size: 15,241 Bytes

fb867c3

#!/usr/bin/env python3
"""
Multi-Server Performance Comparison Test for Felix Framework.

This script compares performance between single-server and multi-server
configurations to demonstrate true parallel processing benefits.

Requirements:
- Multiple LM Studio servers running on different ports
- Configuration files in config/ directory

Usage:
    python examples/test_multi_server_performance.py
"""

import sys
import time
import asyncio
import statistics
from pathlib import Path
from typing import List, Dict, Any, Tuple

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from blog_writer import FelixBlogWriter


class PerformanceTest:
    """Test framework for comparing single vs multi-server performance.

    The workflow is: ``run_comparison_tests`` collects raw per-run result
    dictionaries, ``analyze_results`` reduces them to summary statistics,
    ``display_analysis`` prints the comparison and ``save_results`` writes
    everything to a JSON file.
    """

    # Configuration files exercised by run_comparison_tests.
    SINGLE_SERVER_CONFIG = "config/single_server_config.json"
    MULTI_SERVER_CONFIG = "config/server_config.json"

    # Seconds to wait after every individual test run.
    PAUSE_BETWEEN_TESTS = 2

    def __init__(self, debug_mode: bool = False):
        """
        Create a test runner.

        Args:
            debug_mode: Forwarded to every FelixBlogWriter that is created
        """
        self.debug_mode = debug_mode
        self.test_topics = [
            "Quantum computing applications",
            "Sustainable energy solutions",
            "Artificial intelligence ethics",
            "Space exploration technology",
            "Blockchain and cryptocurrency"
        ]

    async def run_single_test(self, topic: str, config_path: str, test_name: str) -> Dict[str, Any]:
        """
        Run a single performance test.

        Exceptions raised while creating the writer or running the session
        are not caught here; the caller decides how to record them.

        Args:
            topic: Blog post topic
            config_path: Server configuration path
            test_name: Name for this test

        Returns:
            Test results dictionary. When the connection check fails the
            dictionary holds an "error" key instead of metrics.
        """
        print(f"\n{'='*60}")
        print(f"🧪 RUNNING TEST: {test_name}")
        print(f"Topic: {topic}")
        print(f"Config: {config_path}")
        print(f"{'='*60}")

        # Create writer with specified configuration
        writer = FelixBlogWriter(
            server_config_path=config_path,
            strict_mode=True,  # Use strict mode for consistent testing
            debug_mode=self.debug_mode
        )

        # Test connection
        if not writer.test_lm_studio_connection():
            return {
                "error": "Connection failed",
                "test_name": test_name,
                "topic": topic,
                "config": config_path
            }

        # Create team
        writer.create_blog_writing_team(complexity="medium")

        # Run test; only the session itself is timed
        start_time = time.perf_counter()
        results = await writer.run_blog_writing_session_async(
            topic=topic,
            simulation_time=1.0
        )
        total_duration = time.perf_counter() - start_time

        # Extract metrics
        stats = results["session_stats"]
        final_output = results["final_output"]

        test_result = {
            "test_name": test_name,
            "topic": topic,
            "config": config_path,
            "success": final_output is not None,
            "total_duration": total_duration,
            "simulation_duration": stats["total_duration"],
            "total_tokens": stats["total_tokens_used"],
            "agents_participated": stats["agents_participated"],
            "agents_created": stats["agents_created"],
            "messages_processed": stats["total_messages_processed"],
            "final_confidence": final_output["confidence"] if final_output else 0.0,
            "llm_stats": stats.get("llm_client_stats", {}),
            "content_length": len(final_output["content"]) if final_output else 0
        }

        # Add server-specific metrics if available
        if hasattr(writer.llm_client, 'get_pool_stats'):
            test_result["pool_stats"] = writer.llm_client.get_pool_stats()

        print(f"✅ Test completed: {total_duration:.2f}s, {test_result['total_tokens']} tokens")

        return test_result

    async def run_comparison_tests(self, num_iterations: int = 3) -> Dict[str, Any]:
        """
        Run comprehensive comparison tests.

        Every topic is run ``num_iterations`` times; each iteration runs the
        single-server configuration first and the multi-server one second,
        pausing after each run. A run that raises is recorded as an error
        entry instead of aborting the whole comparison.

        Args:
            num_iterations: Number of test iterations per configuration

        Returns:
            Comparison results with "single_server", "multi_server" and
            "test_config" keys
        """
        # (result key, label used in names/messages, configuration path)
        configurations = (
            ("single_server", "Single", self.SINGLE_SERVER_CONFIG),
            ("multi_server", "Multi", self.MULTI_SERVER_CONFIG),
        )

        print(f"🚀 STARTING MULTI-SERVER PERFORMANCE COMPARISON")
        print(f"Iterations per config: {num_iterations}")
        print(f"Topics: {len(self.test_topics)}")
        print(f"Total tests: {len(self.test_topics) * num_iterations * len(configurations)}")

        all_results = {
            "single_server": [],
            "multi_server": [],
            "test_config": {
                "num_iterations": num_iterations,
                "topics": self.test_topics,
                "debug_mode": self.debug_mode,
                "timestamp": time.time()
            }
        }

        # Test each topic with both configurations
        for topic in self.test_topics:
            print(f"\n🎯 Testing topic: {topic}")

            for iteration in range(num_iterations):
                print(f"\n--- Iteration {iteration + 1}/{num_iterations} ---")

                for result_key, label, config_path in configurations:
                    test_name = f"{label}-Server-{iteration + 1}"
                    try:
                        outcome = await self.run_single_test(topic, config_path, test_name)
                    except Exception as e:
                        # Harness boundary: keep going and record the failure
                        print(f"❌ {label} server test failed: {e}")
                        outcome = {
                            "error": str(e),
                            "test_name": test_name,
                            "topic": topic,
                            "config": config_path
                        }
                    all_results[result_key].append(outcome)

                    # Brief pause between tests
                    await asyncio.sleep(self.PAUSE_BETWEEN_TESTS)

        return all_results

    @staticmethod
    def _calc_stats(test_list: List[Dict], metric: str) -> Dict[str, float]:
        """Summarize one metric over the result dictionaries that contain it."""
        values = [t[metric] for t in test_list if metric in t]
        if not values:
            return {"mean": 0, "median": 0, "min": 0, "max": 0, "std": 0, "count": 0}

        return {
            "mean": statistics.mean(values),
            "median": statistics.median(values),
            "min": min(values),
            "max": max(values),
            # stdev needs at least two data points
            "std": statistics.stdev(values) if len(values) > 1 else 0,
            "count": len(values)
        }

    def analyze_results(self, results: Dict[str, Any]) -> Dict[str, Any]:
        """
        Analyze and compare test results.

        Only runs without an "error" key and with a truthy "success" flag
        take part in the statistics.

        Args:
            results: Results from run_comparison_tests

        Returns:
            Analysis summary, or a dictionary with a single "error" key when
            either configuration has no successful run
        """
        print(f"\n{'='*60}")
        print(f"📊 ANALYZING RESULTS")
        print(f"{'='*60}")

        # Filter successful tests
        single_success = [r for r in results["single_server"] if "error" not in r and r.get("success", False)]
        multi_success = [r for r in results["multi_server"] if "error" not in r and r.get("success", False)]

        print(f"Successful tests: Single={len(single_success)}, Multi={len(multi_success)}")

        if not single_success or not multi_success:
            return {"error": "Insufficient successful tests for comparison"}

        # Calculate metrics: analysis key -> key in the per-run result
        metric_sources = {
            "duration": "total_duration",
            "tokens": "total_tokens",
            "confidence": "final_confidence",
            "content_length": "content_length",
            "agents_participated": "agents_participated"
        }
        analysis = {
            "single_server": {
                name: self._calc_stats(single_success, source)
                for name, source in metric_sources.items()
            },
            "multi_server": {
                name: self._calc_stats(multi_success, source)
                for name, source in metric_sources.items()
            }
        }

        # Calculate performance improvements (percent relative to single-server)
        improvements = {}
        for metric in ["duration", "tokens", "confidence", "content_length"]:
            single_mean = analysis["single_server"][metric]["mean"]
            multi_mean = analysis["multi_server"][metric]["mean"]

            if single_mean > 0:
                if metric == "duration":  # Lower is better
                    improvements[metric] = ((single_mean - multi_mean) / single_mean) * 100
                else:  # Higher is better
                    improvements[metric] = ((multi_mean - single_mean) / single_mean) * 100
            else:
                # No meaningful baseline to compare against
                improvements[metric] = 0

        analysis["improvements"] = improvements
        analysis["summary"] = {
            "single_tests": len(single_success),
            "multi_tests": len(multi_success),
            "speed_improvement": improvements.get("duration", 0),
            "token_difference": improvements.get("tokens", 0),
            "confidence_improvement": improvements.get("confidence", 0),
            "quality_improvement": improvements.get("content_length", 0),
            # Lets display_analysis report pool metrics without the raw results
            "multi_runs_with_pool_stats": sum(1 for r in multi_success if r.get("pool_stats"))
        }

        return analysis

    def display_analysis(self, analysis: Dict[str, Any]):
        """
        Display analysis results in readable format.

        Args:
            analysis: Value returned by analyze_results; an analysis holding
                an "error" key is reported and nothing else is printed
        """
        if "error" in analysis:
            print(f"❌ Analysis Error: {analysis['error']}")
            return

        print(f"\n{'='*60}")
        print(f"📈 PERFORMANCE COMPARISON RESULTS")
        print(f"{'='*60}")

        summary = analysis["summary"]
        single = analysis["single_server"]
        multi = analysis["multi_server"]
        print(f"Tests: {summary['single_tests']} single-server, {summary['multi_tests']} multi-server")

        print(f"\n🏃 SPEED COMPARISON:")
        single_duration = single["duration"]["mean"]
        multi_duration = multi["duration"]["mean"]
        speed_improvement = summary["speed_improvement"]

        print(f"  Single-server: {single_duration:.2f}s average")
        print(f"  Multi-server:  {multi_duration:.2f}s average")
        if speed_improvement > 0:
            print(f"  ✅ Multi-server is {speed_improvement:.1f}% FASTER")
        else:
            print(f"  ❌ Multi-server is {abs(speed_improvement):.1f}% slower")

        print(f"\n🪙 TOKEN USAGE:")
        single_tokens = single["tokens"]["mean"]
        multi_tokens = multi["tokens"]["mean"]
        token_difference = summary["token_difference"]

        print(f"  Single-server: {single_tokens:.0f} tokens average")
        print(f"  Multi-server:  {multi_tokens:.0f} tokens average")
        print(f"  Difference: {token_difference:.1f}%")

        print(f"\n🎯 QUALITY METRICS:")
        single_conf = single["confidence"]["mean"]
        multi_conf = multi["confidence"]["mean"]
        single_length = single["content_length"]["mean"]
        multi_length = multi["content_length"]["mean"]

        print(f"  Confidence: {single_conf:.2f} vs {multi_conf:.2f} ({summary['confidence_improvement']:+.1f}%)")
        print(f"  Content length: {single_length:.0f} vs {multi_length:.0f} chars ({summary['quality_improvement']:+.1f}%)")

        print(f"\n🔍 DETAILED STATISTICS:")
        print(f"  Single-server duration: {single_duration:.2f}±{single['duration']['std']:.2f}s")
        print(f"  Multi-server duration:  {multi_duration:.2f}±{multi['duration']['std']:.2f}s")

        print(f"\n🏆 VERDICT:")
        if speed_improvement > 10:
            print("✅ MULTI-SERVER PROVIDES SIGNIFICANT PERFORMANCE IMPROVEMENT")
        elif speed_improvement > 0:
            print("✅ Multi-server provides modest performance improvement")
        else:
            print("❌ Multi-server shows no performance benefit (check server setup)")

        # Server utilization analysis. The analysis dict carries no per-run
        # records, so rely on the count stored in the summary; default to 0
        # for analyses that do not have it.
        if summary.get("multi_runs_with_pool_stats", 0):
            print(f"\n🌐 SERVER UTILIZATION:")
            # This would show which servers were used most
            print("  (Server utilization metrics available in detailed results)")

    def save_results(self, results: Dict[str, Any], analysis: Dict[str, Any], filename: str = None):
        """
        Save test results and analysis to file.

        Args:
            results: Raw results from run_comparison_tests
            analysis: Value returned by analyze_results
            filename: Output path; when None a name of the form
                performance_comparison_<unix timestamp>.json is used
        """
        import json

        if filename is None:
            timestamp = int(time.time())
            filename = f"performance_comparison_{timestamp}.json"

        output_data = {
            "test_results": results,
            "analysis": analysis,
            "timestamp": time.time()
        }

        # Serialize before opening the file so that a serialization error
        # cannot leave a truncated JSON file behind.
        payload = json.dumps(output_data, indent=2)

        with open(filename, 'w', encoding='utf-8') as f:
            f.write(payload)

        print(f"\n💾 Results saved to: {filename}")


async def main():
    """Run the full comparison: check configs, test, analyze, show, save."""
    print("🚀 Felix Framework Multi-Server Performance Test")
    print("=" * 60)

    # Both configuration files must be present; stop at the first missing one
    required_configs = (
        ("Single", Path("config/single_server_config.json")),
        ("Multi", Path("config/server_config.json")),
    )
    for label, config_file in required_configs:
        if config_file.exists():
            continue
        print(f"❌ {label} server config not found: {config_file}")
        return

    # Collect the raw measurements
    harness = PerformanceTest(debug_mode=False)

    print("⚡ Running performance comparison tests...")
    raw_results = await harness.run_comparison_tests(num_iterations=2)

    # Reduce them to summary statistics
    print("\n📊 Analyzing results...")
    report = harness.analyze_results(raw_results)

    # Print the comparison, then persist everything
    harness.display_analysis(report)
    harness.save_results(raw_results, report)

    print("\n🎉 Performance comparison complete!")


# Script entry point: run the async main() coroutine only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())