#!/usr/bin/env python3
"""
Comprehensive test script for Task 4: Build Structured Feedback System.
Tests all subtasks: 4.1, 4.2, 4.3, and 4.4.
"""
import sys
import os
from datetime import datetime
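# Make the project's src/ directory importable when this file is run directly.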
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'src'))
from config.prompt_management.feedback_system import FeedbackSystem
from config.prompt_management.pattern_recognizer import PatternRecognizer
from interface.feedback_ui_integration import FeedbackUIIntegration
from config.prompt_management.data_models import (
    ErrorType, ErrorSubcategory, QuestionIssueType, ReferralProblemType, ScenarioType,
    ClassificationError, QuestionIssue, ReferralProblem
)


def test_task_4_1_property_based_feedback_capture():
    """Test Task 4.1: Property-based test for structured feedback capture."""
    print("Testing Task 4.1: Property-based structured feedback capture...")
    feedback_system = FeedbackSystem(storage_path=".verification_data/task4_test")
    # Test structured data capture for all feedback types
    test_cases = [
        # Classification errors
        {
            'type': 'classification_error',
            'data': {
                'error_type': ErrorType.WRONG_CLASSIFICATION,
                'subcategory': ErrorSubcategory.GREEN_TO_YELLOW,
                'expected_category': 'YELLOW',
                'actual_category': 'GREEN',
                'message_content': 'I feel overwhelmed and stressed about everything',
                'reviewer_comments': 'Clear distress indicators were missed by the system',
                'confidence_level': 0.9
            }
        },
        # Question issues
        {
            'type': 'question_issue',
            'data': {
                'issue_type': QuestionIssueType.INAPPROPRIATE_QUESTION,
                'question_content': 'What is wrong with you?',
                'scenario_type': ScenarioType.VAGUE_STRESS,
                'reviewer_comments': 'Question is too direct and potentially offensive',
                'severity': 'high'
            }
        },
        # Referral problems
        {
            'type': 'referral_problem',
            'data': {
                'problem_type': ReferralProblemType.INCOMPLETE_SUMMARY,
                'referral_content': 'Patient needs spiritual care.',
                'reviewer_comments': 'Summary lacks specific distress indicators and context',
                'severity': 'medium',
                'missing_fields': ['distress_indicators', 'conversation_context']
            }
        }
    ]
    recorded_ids = []
    # Record all test feedback
    for case in test_cases:
        if case['type'] == 'classification_error':
            error_id = feedback_system.record_classification_error(**case['data'])
            recorded_ids.append(error_id)
        elif case['type'] == 'question_issue':
            issue_id = feedback_system.record_question_issue(**case['data'])
            recorded_ids.append(issue_id)
        elif case['type'] == 'referral_problem':
            problem_id = feedback_system.record_referral_problem(**case['data'])
            recorded_ids.append(problem_id)
    # Verify all feedback was captured
    summary = feedback_system.get_feedback_summary()
    assert summary['total_errors'] >= 1, "Classification error should be recorded"
    assert summary['total_question_issues'] >= 1, "Question issue should be recorded"
    assert summary['total_referral_problems'] >= 1, "Referral problem should be recorded"
    assert len(recorded_ids) == 3, "All feedback should return IDs"
    # Verify structured data fields are present
    errors = feedback_system._load_errors()
    if errors:
        latest_error = errors[-1]
        required_fields = ['error_id', 'error_type', 'subcategory', 'expected_category',
                           'actual_category', 'message_content', 'reviewer_comments',
                           'confidence_level', 'timestamp']
        for field in required_fields:
            assert field in latest_error, f"Required field {field} missing"
    print("✓ Task 4.1: Property-based structured feedback capture works correctly")
    return True
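

# Aside (illustrative only): the if/elif chain in test 4.1 could equally be
# driven by a dispatch table. The record_* method names are exactly the ones
# exercised above; nothing else about FeedbackSystem is assumed.
_RECORDERS = {
    'classification_error': 'record_classification_error',
    'question_issue': 'record_question_issue',
    'referral_problem': 'record_referral_problem',
}


def _record_any(feedback_system, case):
    """Dispatch-table variant of the recording loop in test 4.1 (sketch)."""
    recorder = getattr(feedback_system, _RECORDERS[case['type']])
    return recorder(**case['data'])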


def test_task_4_2_classification_error_data_model():
    """Test Task 4.2: ClassificationError data model implementation."""
    print("Testing Task 4.2: ClassificationError data model...")
    # Test ClassificationError creation
    error = ClassificationError(
        error_id="test_error_123",
        error_type=ErrorType.SEVERITY_MISJUDGMENT,
        subcategory=ErrorSubcategory.UNDERESTIMATED_DISTRESS,
        expected_category="RED",
        actual_category="YELLOW",
        message_content="I don't think I can go on like this anymore",
        reviewer_comments="Clear indication of severe distress, should be RED not YELLOW",
        confidence_level=0.95,
        timestamp=datetime.now(),
        session_id="test_session_456",
        additional_context={"reviewer_id": "reviewer_789"}
    )
    # Test serialization
    error_dict = error.to_dict()
    assert error_dict['error_id'] == "test_error_123"
    assert error_dict['error_type'] == 'severity_misjudgment'
    assert error_dict['subcategory'] == 'underestimated_distress'
    assert error_dict['confidence_level'] == 0.95
    # Test deserialization
    reconstructed_error = ClassificationError.from_dict(error_dict)
    assert reconstructed_error.error_id == error.error_id
    assert reconstructed_error.error_type == error.error_type
    assert reconstructed_error.subcategory == error.subcategory
    assert reconstructed_error.confidence_level == error.confidence_level
    # Test all error types and subcategories
    for error_type in ErrorType:
        assert isinstance(error_type.value, str), f"Error type {error_type} should have string value"
    for subcategory in ErrorSubcategory:
        assert isinstance(subcategory.value, str), f"Subcategory {subcategory} should have string value"
    print("✓ Task 4.2: ClassificationError data model works correctly")
    return True
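

# Illustrative sketch only: persisting a record via the to_dict() hook that
# test 4.2 exercises. json.dumps(..., default=str) is an assumption that
# covers the datetime timestamp in case to_dict() leaves it unserialized
# (the enum fields are shown above to serialize to plain strings).
def _serialize_error_example(error: ClassificationError) -> str:
    import json
    return json.dumps(error.to_dict(), default=str)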


def test_task_4_3_feedback_ui_integration():
    """Test Task 4.3: Feedback UI integration."""
    print("Testing Task 4.3: Feedback UI integration...")
    # Test UI integration initialization
    ui_integration = FeedbackUIIntegration()
    # Verify error type options are complete
    expected_error_types = [
        'wrong_classification', 'severity_misjudgment', 'missed_indicators',
        'false_positive', 'context_misunderstanding', 'language_interpretation'
    ]
    actual_error_types = [value for _, value in ui_integration.error_type_options]
    for expected_type in expected_error_types:
        assert expected_type in actual_error_types, f"Missing error type: {expected_type}"
    # Verify subcategory mappings are complete
    for _, error_type_value in ui_integration.error_type_options:
        assert error_type_value in ui_integration.subcategory_mapping, \
            f"Missing subcategory mapping for: {error_type_value}"
        subcategories = ui_integration.subcategory_mapping[error_type_value]
        assert len(subcategories) > 0, f"No subcategories for: {error_type_value}"
    # Verify question issue options
    expected_question_types = [
        'inappropriate_question', 'insensitive_language', 'wrong_scenario_targeting',
        'unclear_question', 'leading_question'
    ]
    actual_question_types = [value for _, value in ui_integration.question_issue_options]
    for expected_type in expected_question_types:
        assert expected_type in actual_question_types, f"Missing question issue type: {expected_type}"
    # Verify scenario options
    expected_scenarios = [
        'loss_of_interest', 'loss_of_loved_one', 'no_support',
        'vague_stress', 'sleep_issues', 'spiritual_practice_change'
    ]
    actual_scenarios = [value for _, value in ui_integration.scenario_options]
    for expected_scenario in expected_scenarios:
        assert expected_scenario in actual_scenarios, f"Missing scenario: {expected_scenario}"
    # Test UI component methods exist
    assert hasattr(ui_integration, 'create_classification_error_interface')
    assert hasattr(ui_integration, 'create_question_issue_interface')
    assert hasattr(ui_integration, 'create_pattern_analysis_display')
    assert hasattr(ui_integration, 'create_complete_feedback_interface')
    print("✓ Task 4.3: Feedback UI integration works correctly")
    return True
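

# Helper sketch: the option lists verified in test 4.3 are (label, value)
# pairs, so a label -> value lookup (e.g. to back a dropdown widget) falls
# out directly; no other structure is assumed.
def _options_as_dict(options):
    """Map display labels to their stored values (illustrative only)."""
    return dict(options)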


def test_task_4_4_error_pattern_analysis():
    """Test Task 4.4: Error pattern analysis implementation."""
    print("Testing Task 4.4: Error pattern analysis...")
    # Test PatternRecognizer initialization
    recognizer = PatternRecognizer(min_pattern_frequency=2, confidence_threshold=0.7)
    assert recognizer.min_pattern_frequency == 2
    assert recognizer.confidence_threshold == 0.7
    # Create test feedback system with pattern recognizer
    feedback_system = FeedbackSystem(storage_path=".verification_data/task4_pattern_test")
    # Record multiple similar errors to create patterns
    for i in range(4):
        feedback_system.record_classification_error(
            error_type=ErrorType.WRONG_CLASSIFICATION,
            subcategory=ErrorSubcategory.GREEN_TO_YELLOW,
            expected_category="YELLOW",
            actual_category="GREEN",
            message_content=f"I feel stressed and overwhelmed about work situation {i}",
            reviewer_comments=f"Clear distress indicators were missed {i}",
            confidence_level=0.85 + (i * 0.02),
            session_id=f"pattern_test_session_{i}",
            additional_context={"scenario_type": "vague_stress"}
        )
    # Record question issues
    for i in range(3):
        feedback_system.record_question_issue(
            issue_type=QuestionIssueType.INAPPROPRIATE_QUESTION,
            question_content=f"What's wrong with you? {i}",
            scenario_type=ScenarioType.VAGUE_STRESS,
            reviewer_comments=f"Too direct and potentially offensive {i}",
            severity="high",
            session_id=f"pattern_test_session_{i}"
        )
    # Analyze patterns
    patterns = feedback_system.analyze_error_patterns(min_frequency=2)
    # Verify patterns were identified
    assert len(patterns) > 0, "Should identify error patterns"
    # Check for wrong classification pattern
    wrong_classification_patterns = [p for p in patterns if 'wrong_classification' in p.pattern_type]
    assert len(wrong_classification_patterns) > 0, "Should identify wrong classification pattern"
    # Verify pattern structure
    for pattern in patterns:
        assert hasattr(pattern, 'pattern_id'), "Pattern should have ID"
        assert hasattr(pattern, 'frequency'), "Pattern should have frequency"
        assert hasattr(pattern, 'suggested_improvements'), "Pattern should have suggestions"
        assert pattern.frequency >= 2, "Pattern frequency should meet minimum"
        assert len(pattern.suggested_improvements) > 0, "Pattern should have improvement suggestions"
    # Test optimization report generation
    report = feedback_system.generate_optimization_report()
    # Verify report structure
    required_fields = [
        'summary', 'total_patterns', 'recommendations', 'priority_actions',
        'confidence_score', 'most_frequent_pattern', 'affected_scenarios',
        'report_generated'
    ]
    for field in required_fields:
        assert field in report, f"Report missing required field: {field}"
    assert report['total_patterns'] > 0, "Report should show patterns"
    assert len(report['recommendations']) > 0, "Report should have recommendations"
    assert 0.0 <= report['confidence_score'] <= 1.0, "Confidence score should be valid"
    print("✓ Task 4.4: Error pattern analysis works correctly")
    return True
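

# Sketch of how a caller might rank the patterns returned by
# analyze_error_patterns(); only attributes asserted present in test 4.4
# (frequency, suggested_improvements) are relied on here.
def _top_patterns(patterns, n=3):
    """Return the n most frequent patterns (illustrative only)."""
    return sorted(patterns, key=lambda p: p.frequency, reverse=True)[:n]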


def test_end_to_end_feedback_workflow():
    """Test complete end-to-end feedback workflow."""
    print("Testing end-to-end feedback workflow...")
    # Create fresh feedback system
    feedback_system = FeedbackSystem(storage_path=".verification_data/task4_e2e_test")
    # Create UI integration
    ui_integration = FeedbackUIIntegration(feedback_system=feedback_system)
    # Simulate complete feedback workflow
    # 1. Record various types of feedback
    error_id = feedback_system.record_classification_error(
        error_type=ErrorType.CONTEXT_MISUNDERSTANDING,
        subcategory=ErrorSubcategory.IGNORED_HISTORY,
        expected_category="RED",
        actual_category="GREEN",
        message_content="I mentioned earlier that I've been having thoughts of ending it all",
        reviewer_comments="System ignored previous context about suicidal ideation",
        confidence_level=0.95,
        session_id="e2e_session_1",
        additional_context={"conversation_turn": 3, "previous_classification": "YELLOW"}
    )
    issue_id = feedback_system.record_question_issue(
        issue_type=QuestionIssueType.INSENSITIVE_LANGUAGE,
        question_content="Why don't you just try to be more positive?",
        scenario_type=ScenarioType.LOSS_OF_LOVED_ONE,
        reviewer_comments="Dismissive language inappropriate for grief scenario",
        severity="high",
        session_id="e2e_session_2",
        suggested_improvement="Ask: 'How are you processing this loss?'"
    )
    problem_id = feedback_system.record_referral_problem(
        problem_type=ReferralProblemType.MISSING_CONTACT_INFO,
        referral_content="Patient experiencing severe spiritual distress and needs immediate support.",
        reviewer_comments="No contact information or urgency level specified",
        severity="high",
        session_id="e2e_session_3",
        missing_fields=["contact_phone", "urgency_level", "preferred_contact_time"]
    )
    # 2. Verify all feedback was recorded
    summary = feedback_system.get_feedback_summary()
    assert summary['total_errors'] >= 1
    assert summary['total_question_issues'] >= 1
    assert summary['total_referral_problems'] >= 1
    # 3. Analyze patterns (may not find patterns with single instances)
    patterns = feedback_system.analyze_error_patterns(min_frequency=1)
    # 4. Generate optimization report
    report = feedback_system.generate_optimization_report()
    assert 'summary' in report
    assert 'recommendations' in report
    # 5. Verify the UI integration shares the same feedback system instance
    assert ui_integration.feedback_system is feedback_system
    print("✓ End-to-end feedback workflow works correctly")
    return True
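

# Convenience sketch mirroring the wiring exercised above: one shared
# FeedbackSystem injected into the UI layer. The default storage_path here is
# an arbitrary example value, not a path used by the tests.
def _build_feedback_stack(storage_path=".verification_data/example"):
    fs = FeedbackSystem(storage_path=storage_path)
    return fs, FeedbackUIIntegration(feedback_system=fs)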


def main():
    """Run all Task 4 tests."""
    print("=" * 70)
    print("TASK 4: BUILD STRUCTURED FEEDBACK SYSTEM - COMPREHENSIVE TESTS")
    print("=" * 70)
    tests = [
        test_task_4_1_property_based_feedback_capture,
        test_task_4_2_classification_error_data_model,
        test_task_4_3_feedback_ui_integration,
        test_task_4_4_error_pattern_analysis,
        test_end_to_end_feedback_workflow
    ]
    passed = 0
    failed = 0
    for test in tests:
        try:
            print(f"\n{test.__name__.replace('_', ' ').title()}:")
            print("-" * 50)
            result = test()
            if result:
                passed += 1
                print("✓ PASSED")
            else:
                failed += 1
                print("✗ FAILED")
        except Exception as e:
            failed += 1
            print(f"✗ FAILED: {e}")
    print("\n" + "=" * 70)
    print(f"RESULTS: {passed} passed, {failed} failed")
    print("=" * 70)
    if failed == 0:
        print("🎉 ALL TASK 4 TESTS PASSED!")
        print("\n**TASK 4: BUILD STRUCTURED FEEDBACK SYSTEM**")
        print("✓ COMPLETED: Task 4.1 - Property test for structured feedback capture")
        print("✓ COMPLETED: Task 4.2 - ClassificationError data model")
        print("✓ COMPLETED: Task 4.3 - Feedback UI integration")
        print("✓ COMPLETED: Task 4.4 - Error pattern analysis")
        print("\n**Requirements Validated:**")
        print("✓ 3.1 - Predefined error categories from documentation")
        print("✓ 3.2 - Specific subcategories of wrong classification types")
        print("✓ 3.3 - Structured feedback about question quality")
        print("✓ 3.4 - Pattern analysis and improvement suggestions")
        print("✓ 3.5 - Feedback aggregation and reporting")
        return True
    else:
        print("❌ Some tests failed. Please check the implementation.")
        return False


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)