#!/usr/bin/env python3 """ Comprehensive test script for Task 4: Build Structured Feedback System. Tests all subtasks: 4.1, 4.2, 4.3, and 4.4. """ import sys import os sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'src')) from config.prompt_management.feedback_system import FeedbackSystem from config.prompt_management.pattern_recognizer import PatternRecognizer from interface.feedback_ui_integration import FeedbackUIIntegration from config.prompt_management.data_models import ( ErrorType, ErrorSubcategory, QuestionIssueType, ReferralProblemType, ScenarioType, ClassificationError, QuestionIssue, ReferralProblem ) def test_task_4_1_property_based_feedback_capture(): """Test Task 4.1: Property-based test for structured feedback capture.""" print("Testing Task 4.1: Property-based structured feedback capture...") feedback_system = FeedbackSystem(storage_path=".verification_data/task4_test") # Test structured data capture for all feedback types test_cases = [ # Classification errors { 'type': 'classification_error', 'data': { 'error_type': ErrorType.WRONG_CLASSIFICATION, 'subcategory': ErrorSubcategory.GREEN_TO_YELLOW, 'expected_category': 'YELLOW', 'actual_category': 'GREEN', 'message_content': 'I feel overwhelmed and stressed about everything', 'reviewer_comments': 'Clear distress indicators were missed by the system', 'confidence_level': 0.9 } }, # Question issues { 'type': 'question_issue', 'data': { 'issue_type': QuestionIssueType.INAPPROPRIATE_QUESTION, 'question_content': 'What is wrong with you?', 'scenario_type': ScenarioType.VAGUE_STRESS, 'reviewer_comments': 'Question is too direct and potentially offensive', 'severity': 'high' } }, # Referral problems { 'type': 'referral_problem', 'data': { 'problem_type': ReferralProblemType.INCOMPLETE_SUMMARY, 'referral_content': 'Patient needs spiritual care.', 'reviewer_comments': 'Summary lacks specific distress indicators and context', 'severity': 'medium', 'missing_fields': ['distress_indicators', 'conversation_context'] } } ] recorded_ids = [] # Record all test feedback for case in test_cases: if case['type'] == 'classification_error': error_id = feedback_system.record_classification_error(**case['data']) recorded_ids.append(error_id) elif case['type'] == 'question_issue': issue_id = feedback_system.record_question_issue(**case['data']) recorded_ids.append(issue_id) elif case['type'] == 'referral_problem': problem_id = feedback_system.record_referral_problem(**case['data']) recorded_ids.append(problem_id) # Verify all feedback was captured summary = feedback_system.get_feedback_summary() assert summary['total_errors'] >= 1, "Classification error should be recorded" assert summary['total_question_issues'] >= 1, "Question issue should be recorded" assert summary['total_referral_problems'] >= 1, "Referral problem should be recorded" assert len(recorded_ids) == 3, "All feedback should return IDs" # Verify structured data fields are present errors = feedback_system._load_errors() if errors: latest_error = errors[-1] required_fields = ['error_id', 'error_type', 'subcategory', 'expected_category', 'actual_category', 'message_content', 'reviewer_comments', 'confidence_level', 'timestamp'] for field in required_fields: assert field in latest_error, f"Required field {field} missing" print("✓ Task 4.1: Property-based structured feedback capture works correctly") return True def test_task_4_2_classification_error_data_model(): """Test Task 4.2: ClassificationError data model implementation.""" print("Testing Task 4.2: ClassificationError data model...") from datetime import datetime # Test ClassificationError creation error = ClassificationError( error_id="test_error_123", error_type=ErrorType.SEVERITY_MISJUDGMENT, subcategory=ErrorSubcategory.UNDERESTIMATED_DISTRESS, expected_category="RED", actual_category="YELLOW", message_content="I don't think I can go on like this anymore", reviewer_comments="Clear indication of severe distress, should be RED not YELLOW", confidence_level=0.95, timestamp=datetime.now(), session_id="test_session_456", additional_context={"reviewer_id": "reviewer_789"} ) # Test serialization error_dict = error.to_dict() assert error_dict['error_id'] == "test_error_123" assert error_dict['error_type'] == 'severity_misjudgment' assert error_dict['subcategory'] == 'underestimated_distress' assert error_dict['confidence_level'] == 0.95 # Test deserialization reconstructed_error = ClassificationError.from_dict(error_dict) assert reconstructed_error.error_id == error.error_id assert reconstructed_error.error_type == error.error_type assert reconstructed_error.subcategory == error.subcategory assert reconstructed_error.confidence_level == error.confidence_level # Test all error types and subcategories for error_type in ErrorType: assert isinstance(error_type.value, str), f"Error type {error_type} should have string value" for subcategory in ErrorSubcategory: assert isinstance(subcategory.value, str), f"Subcategory {subcategory} should have string value" print("✓ Task 4.2: ClassificationError data model works correctly") return True def test_task_4_3_feedback_ui_integration(): """Test Task 4.3: Feedback UI integration.""" print("Testing Task 4.3: Feedback UI integration...") # Test UI integration initialization ui_integration = FeedbackUIIntegration() # Verify error type options are complete expected_error_types = [ 'wrong_classification', 'severity_misjudgment', 'missed_indicators', 'false_positive', 'context_misunderstanding', 'language_interpretation' ] actual_error_types = [value for _, value in ui_integration.error_type_options] for expected_type in expected_error_types: assert expected_type in actual_error_types, f"Missing error type: {expected_type}" # Verify subcategory mappings are complete for error_type_label, error_type_value in ui_integration.error_type_options: assert error_type_value in ui_integration.subcategory_mapping, \ f"Missing subcategory mapping for: {error_type_value}" subcategories = ui_integration.subcategory_mapping[error_type_value] assert len(subcategories) > 0, f"No subcategories for: {error_type_value}" # Verify question issue options expected_question_types = [ 'inappropriate_question', 'insensitive_language', 'wrong_scenario_targeting', 'unclear_question', 'leading_question' ] actual_question_types = [value for _, value in ui_integration.question_issue_options] for expected_type in expected_question_types: assert expected_type in actual_question_types, f"Missing question issue type: {expected_type}" # Verify scenario options expected_scenarios = [ 'loss_of_interest', 'loss_of_loved_one', 'no_support', 'vague_stress', 'sleep_issues', 'spiritual_practice_change' ] actual_scenarios = [value for _, value in ui_integration.scenario_options] for expected_scenario in expected_scenarios: assert expected_scenario in actual_scenarios, f"Missing scenario: {expected_scenario}" # Test UI component methods exist assert hasattr(ui_integration, 'create_classification_error_interface') assert hasattr(ui_integration, 'create_question_issue_interface') assert hasattr(ui_integration, 'create_pattern_analysis_display') assert hasattr(ui_integration, 'create_complete_feedback_interface') print("✓ Task 4.3: Feedback UI integration works correctly") return True def test_task_4_4_error_pattern_analysis(): """Test Task 4.4: Error pattern analysis implementation.""" print("Testing Task 4.4: Error pattern analysis...") # Test PatternRecognizer initialization recognizer = PatternRecognizer(min_pattern_frequency=2, confidence_threshold=0.7) assert recognizer.min_pattern_frequency == 2 assert recognizer.confidence_threshold == 0.7 # Create test feedback system with pattern recognizer feedback_system = FeedbackSystem(storage_path=".verification_data/task4_pattern_test") # Record multiple similar errors to create patterns for i in range(4): feedback_system.record_classification_error( error_type=ErrorType.WRONG_CLASSIFICATION, subcategory=ErrorSubcategory.GREEN_TO_YELLOW, expected_category="YELLOW", actual_category="GREEN", message_content=f"I feel stressed and overwhelmed about work situation {i}", reviewer_comments=f"Clear distress indicators were missed {i}", confidence_level=0.85 + (i * 0.02), session_id=f"pattern_test_session_{i}", additional_context={"scenario_type": "vague_stress"} ) # Record question issues for i in range(3): feedback_system.record_question_issue( issue_type=QuestionIssueType.INAPPROPRIATE_QUESTION, question_content=f"What's wrong with you? {i}", scenario_type=ScenarioType.VAGUE_STRESS, reviewer_comments=f"Too direct and potentially offensive {i}", severity="high", session_id=f"pattern_test_session_{i}" ) # Analyze patterns patterns = feedback_system.analyze_error_patterns(min_frequency=2) # Verify patterns were identified assert len(patterns) > 0, "Should identify error patterns" # Check for wrong classification pattern wrong_classification_patterns = [p for p in patterns if 'wrong_classification' in p.pattern_type] assert len(wrong_classification_patterns) > 0, "Should identify wrong classification pattern" # Verify pattern structure for pattern in patterns: assert hasattr(pattern, 'pattern_id'), "Pattern should have ID" assert hasattr(pattern, 'frequency'), "Pattern should have frequency" assert hasattr(pattern, 'suggested_improvements'), "Pattern should have suggestions" assert pattern.frequency >= 2, "Pattern frequency should meet minimum" assert len(pattern.suggested_improvements) > 0, "Pattern should have improvement suggestions" # Test optimization report generation report = feedback_system.generate_optimization_report() # Verify report structure required_fields = [ 'summary', 'total_patterns', 'recommendations', 'priority_actions', 'confidence_score', 'most_frequent_pattern', 'affected_scenarios', 'report_generated' ] for field in required_fields: assert field in report, f"Report missing required field: {field}" assert report['total_patterns'] > 0, "Report should show patterns" assert len(report['recommendations']) > 0, "Report should have recommendations" assert 0.0 <= report['confidence_score'] <= 1.0, "Confidence score should be valid" print("✓ Task 4.4: Error pattern analysis works correctly") return True def test_end_to_end_feedback_workflow(): """Test complete end-to-end feedback workflow.""" print("Testing end-to-end feedback workflow...") # Create fresh feedback system feedback_system = FeedbackSystem(storage_path=".verification_data/task4_e2e_test") # Create UI integration ui_integration = FeedbackUIIntegration(feedback_system=feedback_system) # Simulate complete feedback workflow # 1. Record various types of feedback error_id = feedback_system.record_classification_error( error_type=ErrorType.CONTEXT_MISUNDERSTANDING, subcategory=ErrorSubcategory.IGNORED_HISTORY, expected_category="RED", actual_category="GREEN", message_content="I mentioned earlier that I've been having thoughts of ending it all", reviewer_comments="System ignored previous context about suicidal ideation", confidence_level=0.95, session_id="e2e_session_1", additional_context={"conversation_turn": 3, "previous_classification": "YELLOW"} ) issue_id = feedback_system.record_question_issue( issue_type=QuestionIssueType.INSENSITIVE_LANGUAGE, question_content="Why don't you just try to be more positive?", scenario_type=ScenarioType.LOSS_OF_LOVED_ONE, reviewer_comments="Dismissive language inappropriate for grief scenario", severity="high", session_id="e2e_session_2", suggested_improvement="Ask: 'How are you processing this loss?'" ) problem_id = feedback_system.record_referral_problem( problem_type=ReferralProblemType.MISSING_CONTACT_INFO, referral_content="Patient experiencing severe spiritual distress and needs immediate support.", reviewer_comments="No contact information or urgency level specified", severity="high", session_id="e2e_session_3", missing_fields=["contact_phone", "urgency_level", "preferred_contact_time"] ) # 2. Verify all feedback was recorded summary = feedback_system.get_feedback_summary() assert summary['total_errors'] >= 1 assert summary['total_question_issues'] >= 1 assert summary['total_referral_problems'] >= 1 # 3. Analyze patterns (may not find patterns with single instances) patterns = feedback_system.analyze_error_patterns(min_frequency=1) # 4. Generate optimization report report = feedback_system.generate_optimization_report() assert 'summary' in report assert 'recommendations' in report # 5. Verify UI integration can access the data assert ui_integration.feedback_system == feedback_system print("✓ End-to-end feedback workflow works correctly") return True def main(): """Run all Task 4 tests.""" print("=" * 70) print("TASK 4: BUILD STRUCTURED FEEDBACK SYSTEM - COMPREHENSIVE TESTS") print("=" * 70) tests = [ test_task_4_1_property_based_feedback_capture, test_task_4_2_classification_error_data_model, test_task_4_3_feedback_ui_integration, test_task_4_4_error_pattern_analysis, test_end_to_end_feedback_workflow ] passed = 0 failed = 0 for test in tests: try: print(f"\n{test.__name__.replace('_', ' ').title()}:") print("-" * 50) result = test() if result: passed += 1 print("✓ PASSED") else: failed += 1 print("✗ FAILED") except Exception as e: failed += 1 print(f"✗ FAILED: {str(e)}") print("\n" + "=" * 70) print(f"RESULTS: {passed} passed, {failed} failed") print("=" * 70) if failed == 0: print("🎉 ALL TASK 4 TESTS PASSED!") print("\n**TASK 4: BUILD STRUCTURED FEEDBACK SYSTEM**") print("✓ COMPLETED: Task 4.1 - Property test for structured feedback capture") print("✓ COMPLETED: Task 4.2 - ClassificationError data model") print("✓ COMPLETED: Task 4.3 - Feedback UI integration") print("✓ COMPLETED: Task 4.4 - Error pattern analysis") print("\n**Requirements Validated:**") print("✓ 3.1 - Predefined error categories from documentation") print("✓ 3.2 - Specific subcategories of wrong classification types") print("✓ 3.3 - Structured feedback about question quality") print("✓ 3.4 - Pattern analysis and improvement suggestions") print("✓ 3.5 - Feedback aggregation and reporting") return True else: print("❌ Some tests failed. Please check the implementation.") return False if __name__ == "__main__": success = main() sys.exit(0 if success else 1)