# File size: 15,241 Bytes
# fb867c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
#!/usr/bin/env python3
"""
Multi-Server Performance Comparison Test for Felix Framework.

This script compares performance between single-server and multi-server
configurations to demonstrate true parallel processing benefits.

Requirements:
- Multiple LM Studio servers running on different ports
- Configuration files in config/ directory

Usage:
    python examples/test_multi_server_performance.py
"""

import sys
import time
import asyncio
import statistics
from pathlib import Path
from typing import List, Dict, Any, Tuple

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from blog_writer import FelixBlogWriter


class PerformanceTest:
    """Test framework for comparing single vs multi-server performance."""
    
    def __init__(self, debug_mode: bool = False):
        self.debug_mode = debug_mode
        self.test_topics = [
            "Quantum computing applications",
            "Sustainable energy solutions", 
            "Artificial intelligence ethics",
            "Space exploration technology",
            "Blockchain and cryptocurrency"
        ]
    
    async def run_single_test(self, topic: str, config_path: str, test_name: str) -> Dict[str, Any]:
        """
        Execute one timed blog-writing session against a server configuration.

        Args:
            topic: Blog post topic handed to the writing session
            config_path: Server configuration path given to the writer
            test_name: Label identifying this run in the output and results

        Returns:
            Dictionary of metrics for the run, or a short dictionary carrying
            an "error" key when the connection check fails
        """
        separator = "=" * 60
        print(f"\n{separator}")
        print(f"πŸ§ͺ RUNNING TEST: {test_name}")
        print(f"Topic: {topic}")
        print(f"Config: {config_path}")
        print(separator)

        # Strict mode is used so that runs stay consistent across tests
        writer = FelixBlogWriter(
            server_config_path=config_path,
            strict_mode=True,
            debug_mode=self.debug_mode,
        )

        # Give up early when the connection check does not pass
        connected = writer.test_lm_studio_connection()
        if not connected:
            return {
                "error": "Connection failed",
                "test_name": test_name,
                "topic": topic,
                "config": config_path,
            }

        writer.create_blog_writing_team(complexity="medium")

        # Wall-clock time around the whole asynchronous session
        started = time.perf_counter()
        results = await writer.run_blog_writing_session_async(
            topic=topic,
            simulation_time=1.0,
        )
        total_duration = time.perf_counter() - started

        stats = results["session_stats"]
        final_output = results["final_output"]

        test_result = {
            "test_name": test_name,
            "topic": topic,
            "config": config_path,
            "success": final_output is not None,
            "total_duration": total_duration,
            "simulation_duration": stats["total_duration"],
            "total_tokens": stats["total_tokens_used"],
            "agents_participated": stats["agents_participated"],
            "agents_created": stats["agents_created"],
            "messages_processed": stats["total_messages_processed"],
            "final_confidence": final_output["confidence"] if final_output else 0.0,
            "llm_stats": stats.get("llm_client_stats", {}),
            "content_length": len(final_output["content"]) if final_output else 0,
        }

        # Pool statistics are attached only when the client exposes them
        if hasattr(writer.llm_client, 'get_pool_stats'):
            test_result["pool_stats"] = writer.llm_client.get_pool_stats()

        print(f"βœ… Test completed: {total_duration:.2f}s, {test_result['total_tokens']} tokens")

        return test_result
    
    async def run_comparison_tests(self, num_iterations: int = 3) -> Dict[str, Any]:
        """
        Run comprehensive comparison tests.
        
        Args:
            num_iterations: Number of test iterations per configuration
            
        Returns:
            Comparison results
        """
        print(f"πŸš€ STARTING MULTI-SERVER PERFORMANCE COMPARISON")
        print(f"Iterations per config: {num_iterations}")
        print(f"Topics: {len(self.test_topics)}")
        print(f"Total tests: {len(self.test_topics) * num_iterations * 2}")
        
        all_results = {
            "single_server": [],
            "multi_server": [],
            "test_config": {
                "num_iterations": num_iterations,
                "topics": self.test_topics,
                "debug_mode": self.debug_mode,
                "timestamp": time.time()
            }
        }
        
        # Test each topic with both configurations
        for topic in self.test_topics:
            print(f"\n🎯 Testing topic: {topic}")
            
            for iteration in range(num_iterations):
                print(f"\n--- Iteration {iteration + 1}/{num_iterations} ---")
                
                # Test single server
                try:
                    single_result = await self.run_single_test(
                        topic, 
                        "config/single_server_config.json",
                        f"Single-Server-{iteration+1}"
                    )
                    all_results["single_server"].append(single_result)
                except Exception as e:
                    print(f"❌ Single server test failed: {e}")
                    all_results["single_server"].append({
                        "error": str(e),
                        "test_name": f"Single-Server-{iteration+1}",
                        "topic": topic
                    })
                
                # Brief pause between tests
                await asyncio.sleep(2)
                
                # Test multi server
                try:
                    multi_result = await self.run_single_test(
                        topic,
                        "config/server_config.json", 
                        f"Multi-Server-{iteration+1}"
                    )
                    all_results["multi_server"].append(multi_result)
                except Exception as e:
                    print(f"❌ Multi server test failed: {e}")
                    all_results["multi_server"].append({
                        "error": str(e),
                        "test_name": f"Multi-Server-{iteration+1}",
                        "topic": topic
                    })
                
                # Brief pause between tests
                await asyncio.sleep(2)
        
        return all_results
    
    def analyze_results(self, results: Dict[str, Any]) -> Dict[str, Any]:
        """
        Analyze and compare test results.
        
        Args:
            results: Results from run_comparison_tests
            
        Returns:
            Analysis summary
        """
        print(f"\n{'='*60}")
        print(f"πŸ“Š ANALYZING RESULTS")
        print(f"{'='*60}")
        
        # Filter successful tests
        single_success = [r for r in results["single_server"] if "error" not in r and r.get("success", False)]
        multi_success = [r for r in results["multi_server"] if "error" not in r and r.get("success", False)]
        
        print(f"Successful tests: Single={len(single_success)}, Multi={len(multi_success)}")
        
        if not single_success or not multi_success:
            return {"error": "Insufficient successful tests for comparison"}
        
        # Calculate metrics
        def calc_stats(test_list: List[Dict], metric: str) -> Dict[str, float]:
            values = [t[metric] for t in test_list if metric in t]
            if not values:
                return {"mean": 0, "median": 0, "min": 0, "max": 0, "std": 0}
            
            return {
                "mean": statistics.mean(values),
                "median": statistics.median(values),
                "min": min(values),
                "max": max(values),
                "std": statistics.stdev(values) if len(values) > 1 else 0,
                "count": len(values)
            }
        
        analysis = {
            "single_server": {
                "duration": calc_stats(single_success, "total_duration"),
                "tokens": calc_stats(single_success, "total_tokens"),
                "confidence": calc_stats(single_success, "final_confidence"),
                "content_length": calc_stats(single_success, "content_length"),
                "agents_participated": calc_stats(single_success, "agents_participated")
            },
            "multi_server": {
                "duration": calc_stats(multi_success, "total_duration"),
                "tokens": calc_stats(multi_success, "total_tokens"),
                "confidence": calc_stats(multi_success, "final_confidence"),
                "content_length": calc_stats(multi_success, "content_length"),
                "agents_participated": calc_stats(multi_success, "agents_participated")
            }
        }
        
        # Calculate performance improvements
        improvements = {}
        for metric in ["duration", "tokens", "confidence", "content_length"]:
            single_mean = analysis["single_server"][metric]["mean"]
            multi_mean = analysis["multi_server"][metric]["mean"]
            
            if single_mean > 0:
                if metric == "duration":  # Lower is better
                    improvement = ((single_mean - multi_mean) / single_mean) * 100
                    improvements[metric] = improvement
                else:  # Higher is better
                    improvement = ((multi_mean - single_mean) / single_mean) * 100
                    improvements[metric] = improvement
            else:
                improvements[metric] = 0
        
        analysis["improvements"] = improvements
        analysis["summary"] = {
            "single_tests": len(single_success),
            "multi_tests": len(multi_success),
            "speed_improvement": improvements.get("duration", 0),
            "token_difference": improvements.get("tokens", 0),
            "confidence_improvement": improvements.get("confidence", 0),
            "quality_improvement": improvements.get("content_length", 0)
        }
        
        return analysis
    
    def display_analysis(self, analysis: Dict[str, Any]):
        """Display analysis results in readable format."""
        if "error" in analysis:
            print(f"❌ Analysis Error: {analysis['error']}")
            return
        
        print(f"\n{'='*60}")
        print(f"πŸ“ˆ PERFORMANCE COMPARISON RESULTS")
        print(f"{'='*60}")
        
        summary = analysis["summary"]
        print(f"Tests: {summary['single_tests']} single-server, {summary['multi_tests']} multi-server")
        
        print(f"\nπŸƒ SPEED COMPARISON:")
        single_duration = analysis["single_server"]["duration"]["mean"]
        multi_duration = analysis["multi_server"]["duration"]["mean"]
        speed_improvement = summary["speed_improvement"]
        
        print(f"  Single-server: {single_duration:.2f}s average")
        print(f"  Multi-server:  {multi_duration:.2f}s average")
        if speed_improvement > 0:
            print(f"  βœ… Multi-server is {speed_improvement:.1f}% FASTER")
        else:
            print(f"  ❌ Multi-server is {abs(speed_improvement):.1f}% slower")
        
        print(f"\nπŸͺ™ TOKEN USAGE:")
        single_tokens = analysis["single_server"]["tokens"]["mean"]
        multi_tokens = analysis["multi_server"]["tokens"]["mean"]
        token_difference = summary["token_difference"]
        
        print(f"  Single-server: {single_tokens:.0f} tokens average")
        print(f"  Multi-server:  {multi_tokens:.0f} tokens average")
        print(f"  Difference: {token_difference:.1f}%")
        
        print(f"\n🎯 QUALITY METRICS:")
        single_conf = analysis["single_server"]["confidence"]["mean"]
        multi_conf = analysis["multi_server"]["confidence"]["mean"]
        single_length = analysis["single_server"]["content_length"]["mean"]
        multi_length = analysis["multi_server"]["content_length"]["mean"]
        
        print(f"  Confidence: {single_conf:.2f} vs {multi_conf:.2f} ({summary['confidence_improvement']:+.1f}%)")
        print(f"  Content length: {single_length:.0f} vs {multi_length:.0f} chars ({summary['quality_improvement']:+.1f}%)")
        
        print(f"\nπŸ” DETAILED STATISTICS:")
        print(f"  Single-server duration: {single_duration:.2f}Β±{analysis['single_server']['duration']['std']:.2f}s")
        print(f"  Multi-server duration:  {multi_duration:.2f}Β±{analysis['multi_server']['duration']['std']:.2f}s")
        
        print(f"\nπŸ† VERDICT:")
        if speed_improvement > 10:
            print("βœ… MULTI-SERVER PROVIDES SIGNIFICANT PERFORMANCE IMPROVEMENT")
        elif speed_improvement > 0:
            print("βœ… Multi-server provides modest performance improvement")
        else:
            print("❌ Multi-server shows no performance benefit (check server setup)")
        
        # Server utilization analysis
        multi_results = [r for r in analysis if r.get("pool_stats")]
        if multi_results:
            print(f"\n🌐 SERVER UTILIZATION:")
            # This would show which servers were used most
            print("  (Server utilization metrics available in detailed results)")
    
    def save_results(self, results: Dict[str, Any], analysis: Dict[str, Any], filename: str = None):
        """Save test results and analysis to file."""
        if filename is None:
            timestamp = int(time.time())
            filename = f"performance_comparison_{timestamp}.json"
        
        import json
        output_data = {
            "test_results": results,
            "analysis": analysis,
            "timestamp": time.time()
        }
        
        with open(filename, 'w') as f:
            json.dump(output_data, f, indent=2)
        
        print(f"\nπŸ’Ύ Results saved to: {filename}")


async def main():
    """Check that both config files exist, then run, analyze, show and save the comparison."""
    print("πŸš€ Felix Framework Multi-Server Performance Test")
    print("=" * 60)

    # Both configuration files must be present before any test is started;
    # the single-server file is checked first
    required_configs = (
        ("Single", Path("config/single_server_config.json")),
        ("Multi", Path("config/server_config.json")),
    )
    for label, config_file in required_configs:
        if not config_file.exists():
            print(f"❌ {label} server config not found: {config_file}")
            return

    runner = PerformanceTest(debug_mode=False)

    print("⚑ Running performance comparison tests...")
    results = await runner.run_comparison_tests(num_iterations=2)

    print("\nπŸ“Š Analyzing results...")
    analysis = runner.analyze_results(results)

    # Report to the console, then persist results and analysis
    runner.display_analysis(analysis)
    runner.save_results(results, analysis)

    print("\nπŸŽ‰ Performance comparison complete!")


# Run the async entry point only when executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())