File size: 10,611 Bytes
24214fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
#!/usr/bin/env python3
"""
Test for Task 9.2: Performance Metrics Collection Implementation.

This script validates that performance metrics collection has been successfully implemented:
- Performance metrics are collected during prompt executions
- Response times and confidence levels are logged
- Component-specific performance tracking works
- Integration with existing system is seamless

Requirements validated: 8.1, 8.2
"""

import sys
import os
import time
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'src'))

from core.simplified_medical_app import SimplifiedMedicalApp
from src.config.prompt_management.performance_monitor import PromptMonitor


def test_performance_metrics_collection():
    """Test Task 9.2: Performance metrics collection during prompt execution."""
    print("Testing Task 9.2: Performance metrics collection...")
    
    # Create app with performance monitoring
    app = SimplifiedMedicalApp()
    
    # Verify performance monitor is initialized
    assert hasattr(app, 'performance_monitor'), "Should have performance monitor"
    assert isinstance(app.performance_monitor, PromptMonitor), "Should be PromptMonitor instance"
    
    # Test direct performance monitoring (independent of AI providers)
    print("   Testing direct performance monitoring...")
    
    # Directly test the performance monitor
    monitor = app.performance_monitor
    
    # Log some test metrics
    for i in range(3):
        monitor.track_execution(
            agent_type='spiritual_monitor',
            response_time=0.5 + i * 0.1,
            confidence=0.7 + i * 0.05,
            success=True,
            metadata={'test_execution': i, 'message_length': 50 + i * 10}
        )
    
    # Get performance metrics
    metrics = app.get_performance_metrics('spiritual_monitor')
    
    # Verify metrics collection (Requirement 8.1)
    assert 'total_executions' in metrics, "Should track total executions"
    assert 'average_response_time' in metrics, "Should track average response time"
    assert 'average_confidence' in metrics, "Should track average confidence"
    assert 'success_rate' in metrics, "Should track success rate"
    
    # Verify we have collected metrics for our test executions
    assert metrics['total_executions'] >= 3, \
        f"Should have at least 3 executions, got {metrics['total_executions']}"
    
    # Verify response times are reasonable
    assert metrics['average_response_time'] > 0, "Should have positive response times"
    assert metrics['average_response_time'] < 30, "Response times should be reasonable (< 30s)"
    
    # Verify confidence levels are in valid range
    assert 0 <= metrics['average_confidence'] <= 1, "Confidence should be between 0 and 1"
    
    # Verify success rate
    assert 0 <= metrics['success_rate'] <= 1, "Success rate should be between 0 and 1"
    
    print(f"   βœ“ Collected metrics for {metrics['total_executions']} executions")
    print(f"   βœ“ Average response time: {metrics['average_response_time']:.3f}s")
    print(f"   βœ“ Average confidence: {metrics['average_confidence']:.3f}")
    print(f"   βœ“ Success rate: {metrics['success_rate']:.3f}")
    
    # Test integration with actual message processing (if AI is available)
    print("   Testing integration with message processing...")
    try:
        # Process one test message
        history, status = app.process_message("Test message for monitoring")
        print("   βœ“ Message processing integration working")
    except Exception as e:
        print(f"   ⚠ Message processing failed (expected without AI): {e}")
        # This is expected without AI providers, but monitoring should still work
    
    return True


def test_component_specific_tracking():
    """Test component-specific performance tracking."""
    print("Testing component-specific performance tracking...")
    
    monitor = PromptMonitor()
    
    # Test tracking for different agent types
    agent_types = ['spiritual_monitor', 'triage_question', 'triage_evaluator']
    
    for agent_type in agent_types:
        # Log some test metrics
        for i in range(3):
            monitor.track_execution(
                agent_type=agent_type,
                response_time=0.5 + i * 0.1,
                confidence=0.7 + i * 0.1,
                success=True,
                metadata={'test_execution': i}
            )
    
    # Verify each agent has separate metrics
    for agent_type in agent_types:
        metrics = monitor.get_detailed_metrics(agent_type)
        
        assert metrics['total_executions'] == 3, f"Should have 3 executions for {agent_type}"
        assert metrics['average_response_time'] > 0, f"Should have response time for {agent_type}"
        assert metrics['average_confidence'] > 0, f"Should have confidence for {agent_type}"
        
        print(f"   βœ“ {agent_type}: {metrics['total_executions']} executions tracked")
    
    return True


def test_performance_trend_analysis():
    """Test performance trend analysis capabilities."""
    print("Testing performance trend analysis...")
    
    monitor = PromptMonitor()
    
    # Simulate improving performance over time
    base_time = 1.0
    for i in range(10):
        # Gradually improving response times
        response_time = base_time - (i * 0.05)  # Getting faster
        confidence = 0.6 + (i * 0.03)  # Getting more confident
        
        monitor.track_execution(
            agent_type='test_agent',
            response_time=response_time,
            confidence=confidence,
            success=True
        )
    
    # Get detailed metrics with trend analysis
    metrics = monitor.get_detailed_metrics('test_agent')
    
    # Verify trend analysis is available
    assert 'performance_trend' in metrics, "Should include performance trend analysis"
    assert 'confidence_distribution' in metrics, "Should include confidence distribution"
    
    # Verify trend detection
    trend = metrics['performance_trend']
    assert trend in ['improving', 'stable', 'degrading', 'insufficient_data'], \
        f"Should have valid trend value, got: {trend}"
    
    print(f"   βœ“ Performance trend detected: {trend}")
    print(f"   βœ“ Confidence distribution: {metrics['confidence_distribution']}")
    
    return True


def test_error_handling_and_logging():
    """Test error handling and logging in performance monitoring."""
    print("Testing error handling and logging...")
    
    monitor = PromptMonitor()
    
    # Log some successful and failed executions
    for i in range(5):
        success = i % 2 == 0  # Alternate success/failure
        
        monitor.track_execution(
            agent_type='error_test_agent',
            response_time=0.5,
            confidence=0.8 if success else 0.3,
            success=success,
            metadata={'error_test': True, 'execution_id': i}
        )
    
    # Get metrics
    metrics = monitor.get_detailed_metrics('error_test_agent')
    
    # Verify error tracking
    assert metrics['total_executions'] == 5, "Should track all executions"
    assert 0 < metrics['success_rate'] < 1, "Should have mixed success rate"
    
    # Verify error patterns are analyzed
    assert 'error_patterns' in metrics, "Should analyze error patterns"
    
    print(f"   βœ“ Success rate: {metrics['success_rate']:.2f}")
    print(f"   βœ“ Error patterns analyzed: {len(metrics['error_patterns'])} patterns found")
    
    return True


def test_integration_with_existing_system():
    """Test integration with existing medical app system."""
    print("Testing integration with existing system...")
    
    app = SimplifiedMedicalApp()
    
    # Test that performance monitoring doesn't interfere with normal operation
    message = "I need help with my medication"
    
    # Process message normally
    history, status = app.process_message(message)
    
    # Verify normal operation still works
    assert isinstance(history, list), "Should return history list"
    assert isinstance(status, str), "Should return status string"
    assert len(history) > 0, "Should have message in history"
    
    # Verify performance metrics were collected
    all_metrics = app.get_performance_metrics()
    assert isinstance(all_metrics, dict), "Should return metrics dictionary"
    
    # Test optimization recommendations
    recommendations = app.get_optimization_recommendations()
    assert isinstance(recommendations, dict), "Should return recommendations dictionary"
    
    # Test improvement tracking
    tracking = app.get_improvement_tracking()
    assert isinstance(tracking, dict), "Should return tracking dictionary"
    
    print("   βœ“ Normal operation preserved")
    print("   βœ“ Performance metrics accessible")
    print("   βœ“ Optimization features available")
    
    return True


def main():
    """Run all Task 9.2 completion tests."""
    print("=" * 70)
    print("TASK 9.2 COMPLETION VALIDATION: PERFORMANCE METRICS COLLECTION")
    print("=" * 70)
    
    try:
        # Test all components
        if not test_performance_metrics_collection():
            return False
        
        if not test_component_specific_tracking():
            return False
        
        if not test_performance_trend_analysis():
            return False
        
        if not test_error_handling_and_logging():
            return False
        
        if not test_integration_with_existing_system():
            return False
        
        print("\n" + "=" * 70)
        print("βœ… TASK 9.2 COMPLETED SUCCESSFULLY!")
        print("=" * 70)
        print("IMPLEMENTED FEATURES:")
        print("βœ“ Performance metrics collection during prompt executions")
        print("βœ“ Response time and confidence level logging")
        print("βœ“ Component-specific performance tracking")
        print("βœ“ Performance trend analysis capabilities")
        print("βœ“ Error handling and pattern detection")
        print("βœ“ Integration with existing medical assistant system")
        print("βœ“ Seamless operation without affecting core functionality")
        print("\nREQUIREMENTS VALIDATED:")
        print("βœ“ 8.1: Response time and confidence level logging implemented")
        print("βœ“ 8.2: Component-specific performance tracking working")
        print("=" * 70)
        return True
        
    except Exception as e:
        print(f"\n❌ TASK 9.2 VALIDATION FAILED: {e}")
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)