#!/usr/bin/env python3
"""
Test for Task 9.2: Performance Metrics Collection Implementation.
This script validates that performance metrics collection has been successfully implemented:
- Performance metrics are collected during prompt executions
- Response times and confidence levels are logged
- Component-specific performance tracking works
- Integration with existing system is seamless
Requirements validated: 8.1, 8.2
"""
import sys
import os
import time

# Put both the project root and its src/ directory on sys.path so the
# "core..." and "src.config..." import styles below can both resolve.
PROJECT_ROOT = os.path.join(os.path.dirname(__file__), '..', '..')
sys.path.append(PROJECT_ROOT)
sys.path.append(os.path.join(PROJECT_ROOT, 'src'))

from core.simplified_medical_app import SimplifiedMedicalApp
from src.config.prompt_management.performance_monitor import PromptMonitor
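
# NOTE: The assertions in this file assume roughly the following monitoring
# surface. This is a descriptive sketch of the contract under test, inferred
# from the calls below; it is not a definition of the actual implementation:
#
#   monitor.track_execution(agent_type, response_time, confidence, success,
#                           metadata=None)        -> records one execution
#   monitor.get_detailed_metrics(agent_type)      -> dict with keys such as
#       'total_executions', 'average_response_time', 'average_confidence',
#       'success_rate', 'performance_trend', 'confidence_distribution',
#       'error_patterns'
#   app.get_performance_metrics(agent_type=None)  -> aggregated metrics dict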
def test_performance_metrics_collection():
    """Test Task 9.2: Performance metrics collection during prompt execution."""
    print("Testing Task 9.2: Performance metrics collection...")

    # Create app with performance monitoring
    app = SimplifiedMedicalApp()

    # Verify performance monitor is initialized
    assert hasattr(app, 'performance_monitor'), "Should have performance monitor"
    assert isinstance(app.performance_monitor, PromptMonitor), "Should be PromptMonitor instance"

    # Test direct performance monitoring (independent of AI providers)
    print(" Testing direct performance monitoring...")

    # Directly test the performance monitor
    monitor = app.performance_monitor

    # Log some test metrics
    for i in range(3):
        monitor.track_execution(
            agent_type='spiritual_monitor',
            response_time=0.5 + i * 0.1,
            confidence=0.7 + i * 0.05,
            success=True,
            metadata={'test_execution': i, 'message_length': 50 + i * 10}
        )

    # Get performance metrics
    metrics = app.get_performance_metrics('spiritual_monitor')

    # Verify metrics collection (Requirement 8.1)
    assert 'total_executions' in metrics, "Should track total executions"
    assert 'average_response_time' in metrics, "Should track average response time"
    assert 'average_confidence' in metrics, "Should track average confidence"
    assert 'success_rate' in metrics, "Should track success rate"

    # Verify we have collected metrics for our test executions
    assert metrics['total_executions'] >= 3, \
        f"Should have at least 3 executions, got {metrics['total_executions']}"

    # Verify response times are reasonable
    assert metrics['average_response_time'] > 0, "Should have positive response times"
    assert metrics['average_response_time'] < 30, "Response times should be reasonable (< 30s)"

    # Verify confidence levels are in valid range
    assert 0 <= metrics['average_confidence'] <= 1, "Confidence should be between 0 and 1"

    # Verify success rate
    assert 0 <= metrics['success_rate'] <= 1, "Success rate should be between 0 and 1"

    print(f" ✓ Collected metrics for {metrics['total_executions']} executions")
    print(f" ✓ Average response time: {metrics['average_response_time']:.3f}s")
    print(f" ✓ Average confidence: {metrics['average_confidence']:.3f}")
    print(f" ✓ Success rate: {metrics['success_rate']:.3f}")

    # Test integration with actual message processing (if AI is available)
    print(" Testing integration with message processing...")
    try:
        # Process one test message
        history, status = app.process_message("Test message for monitoring")
        print(" ✓ Message processing integration working")
    except Exception as e:
        print(f" ⚠ Message processing failed (expected without AI): {e}")
        # This is expected without AI providers, but monitoring should still work

    return True

def test_component_specific_tracking():
    """Test component-specific performance tracking."""
    print("Testing component-specific performance tracking...")

    monitor = PromptMonitor()

    # Test tracking for different agent types
    agent_types = ['spiritual_monitor', 'triage_question', 'triage_evaluator']

    for agent_type in agent_types:
        # Log some test metrics
        for i in range(3):
            monitor.track_execution(
                agent_type=agent_type,
                response_time=0.5 + i * 0.1,
                confidence=0.7 + i * 0.1,
                success=True,
                metadata={'test_execution': i}
            )

    # Verify each agent has separate metrics
    for agent_type in agent_types:
        metrics = monitor.get_detailed_metrics(agent_type)
        assert metrics['total_executions'] == 3, f"Should have 3 executions for {agent_type}"
        assert metrics['average_response_time'] > 0, f"Should have response time for {agent_type}"
        assert metrics['average_confidence'] > 0, f"Should have confidence for {agent_type}"
        print(f" ✓ {agent_type}: {metrics['total_executions']} executions tracked")

    return True

def test_performance_trend_analysis():
    """Test performance trend analysis capabilities."""
    print("Testing performance trend analysis...")

    monitor = PromptMonitor()

    # Simulate improving performance over time
    base_time = 1.0
    for i in range(10):
        # Gradually improving response times
        response_time = base_time - (i * 0.05)  # Getting faster
        confidence = 0.6 + (i * 0.03)  # Getting more confident

        monitor.track_execution(
            agent_type='test_agent',
            response_time=response_time,
            confidence=confidence,
            success=True
        )

    # Get detailed metrics with trend analysis
    metrics = monitor.get_detailed_metrics('test_agent')

    # Verify trend analysis is available
    assert 'performance_trend' in metrics, "Should include performance trend analysis"
    assert 'confidence_distribution' in metrics, "Should include confidence distribution"

    # Verify trend detection
    trend = metrics['performance_trend']
    assert trend in ['improving', 'stable', 'degrading', 'insufficient_data'], \
        f"Should have valid trend value, got: {trend}"

    print(f" ✓ Performance trend detected: {trend}")
    print(f" ✓ Confidence distribution: {metrics['confidence_distribution']}")

    return True

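
# Illustrative sketch only -- not used by these tests and not necessarily how
# PromptMonitor classifies trends. One plausible rule for a 'performance_trend'
# label is to compare recent response times against earlier ones:
def _example_trend(response_times, min_samples=6, tolerance=0.05):
    """Return 'improving', 'degrading', 'stable', or 'insufficient_data'."""
    if len(response_times) < min_samples:
        return 'insufficient_data'
    half = len(response_times) // 2
    earlier = sum(response_times[:half]) / half
    recent = sum(response_times[half:]) / (len(response_times) - half)
    if recent < earlier * (1 - tolerance):
        return 'improving'  # recent executions are meaningfully faster
    if recent > earlier * (1 + tolerance):
        return 'degrading'  # recent executions are meaningfully slower
    return 'stable'
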
def test_error_handling_and_logging():
    """Test error handling and logging in performance monitoring."""
    print("Testing error handling and logging...")

    monitor = PromptMonitor()

    # Log some successful and failed executions
    for i in range(5):
        success = i % 2 == 0  # Alternate success/failure
        monitor.track_execution(
            agent_type='error_test_agent',
            response_time=0.5,
            confidence=0.8 if success else 0.3,
            success=success,
            metadata={'error_test': True, 'execution_id': i}
        )

    # Get metrics
    metrics = monitor.get_detailed_metrics('error_test_agent')

    # Verify error tracking
    assert metrics['total_executions'] == 5, "Should track all executions"
    assert 0 < metrics['success_rate'] < 1, "Should have mixed success rate"

    # Verify error patterns are analyzed
    assert 'error_patterns' in metrics, "Should analyze error patterns"

    print(f" ✓ Success rate: {metrics['success_rate']:.2f}")
    print(f" ✓ Error patterns analyzed: {len(metrics['error_patterns'])} patterns found")

    return True

def test_integration_with_existing_system():
    """Test integration with the existing medical app system."""
    print("Testing integration with existing system...")

    app = SimplifiedMedicalApp()

    # Test that performance monitoring doesn't interfere with normal operation
    message = "I need help with my medication"

    # Process message normally
    history, status = app.process_message(message)

    # Verify normal operation still works
    assert isinstance(history, list), "Should return history list"
    assert isinstance(status, str), "Should return status string"
    assert len(history) > 0, "Should have message in history"

    # Verify performance metrics were collected
    all_metrics = app.get_performance_metrics()
    assert isinstance(all_metrics, dict), "Should return metrics dictionary"

    # Test optimization recommendations
    recommendations = app.get_optimization_recommendations()
    assert isinstance(recommendations, dict), "Should return recommendations dictionary"

    # Test improvement tracking
    tracking = app.get_improvement_tracking()
    assert isinstance(tracking, dict), "Should return tracking dictionary"

    print(" ✓ Normal operation preserved")
    print(" ✓ Performance metrics accessible")
    print(" ✓ Optimization features available")

    return True

def main():
    """Run all Task 9.2 completion tests."""
    print("=" * 70)
    print("TASK 9.2 COMPLETION VALIDATION: PERFORMANCE METRICS COLLECTION")
    print("=" * 70)

    try:
        # Test all components
        if not test_performance_metrics_collection():
            return False
        if not test_component_specific_tracking():
            return False
        if not test_performance_trend_analysis():
            return False
        if not test_error_handling_and_logging():
            return False
        if not test_integration_with_existing_system():
            return False

        print("\n" + "=" * 70)
        print("✅ TASK 9.2 COMPLETED SUCCESSFULLY!")
        print("=" * 70)
        print("IMPLEMENTED FEATURES:")
        print("✓ Performance metrics collection during prompt executions")
        print("✓ Response time and confidence level logging")
        print("✓ Component-specific performance tracking")
        print("✓ Performance trend analysis capabilities")
        print("✓ Error handling and pattern detection")
        print("✓ Integration with existing medical assistant system")
        print("✓ Seamless operation without affecting core functionality")
        print("\nREQUIREMENTS VALIDATED:")
        print("✓ 8.1: Response time and confidence level logging implemented")
        print("✓ 8.2: Component-specific performance tracking working")
        print("=" * 70)
        return True

    except Exception as e:
        print(f"\n❌ TASK 9.2 VALIDATION FAILED: {e}")
        import traceback
        traceback.print_exc()
        return False

if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)