Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Comprehensive test script for Task 4: Build Structured Feedback System. | |
| Tests all subtasks: 4.1, 4.2, 4.3, and 4.4. | |
| """ | |
| import sys | |
| import os | |
| sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'src')) | |
| from config.prompt_management.feedback_system import FeedbackSystem | |
| from config.prompt_management.pattern_recognizer import PatternRecognizer | |
| from interface.feedback_ui_integration import FeedbackUIIntegration | |
| from config.prompt_management.data_models import ( | |
| ErrorType, ErrorSubcategory, QuestionIssueType, ReferralProblemType, ScenarioType, | |
| ClassificationError, QuestionIssue, ReferralProblem | |
| ) | |
def test_task_4_1_property_based_feedback_capture():
    """Test Task 4.1: structured feedback capture for every feedback type.

    Records one classification error, one question issue and one referral
    problem through ``FeedbackSystem``, then checks that the feedback summary
    counts each of them and that the most recently loaded error record
    contains every structured field.

    Returns:
        True when all assertions pass; a failed check raises AssertionError.
    """
    print("Testing Task 4.1: Property-based structured feedback capture...")
    feedback_system = FeedbackSystem(storage_path=".verification_data/task4_test")

    # Test structured data capture for all feedback types
    test_cases = [
        # Classification errors
        {
            'type': 'classification_error',
            'data': {
                'error_type': ErrorType.WRONG_CLASSIFICATION,
                'subcategory': ErrorSubcategory.GREEN_TO_YELLOW,
                'expected_category': 'YELLOW',
                'actual_category': 'GREEN',
                'message_content': 'I feel overwhelmed and stressed about everything',
                'reviewer_comments': 'Clear distress indicators were missed by the system',
                'confidence_level': 0.9,
            },
        },
        # Question issues
        {
            'type': 'question_issue',
            'data': {
                'issue_type': QuestionIssueType.INAPPROPRIATE_QUESTION,
                'question_content': 'What is wrong with you?',
                'scenario_type': ScenarioType.VAGUE_STRESS,
                'reviewer_comments': 'Question is too direct and potentially offensive',
                'severity': 'high',
            },
        },
        # Referral problems
        {
            'type': 'referral_problem',
            'data': {
                'problem_type': ReferralProblemType.INCOMPLETE_SUMMARY,
                'referral_content': 'Patient needs spiritual care.',
                'reviewer_comments': 'Summary lacks specific distress indicators and context',
                'severity': 'medium',
                'missing_fields': ['distress_indicators', 'conversation_context'],
            },
        },
    ]

    # Map each feedback type to the recorder that stores it.
    recorders = {
        'classification_error': feedback_system.record_classification_error,
        'question_issue': feedback_system.record_question_issue,
        'referral_problem': feedback_system.record_referral_problem,
    }

    # Record all test feedback
    recorded_ids = [recorders[case['type']](**case['data']) for case in test_cases]

    # Verify all feedback was captured.
    # NOTE(review): the counts are checked as lower bounds; presumably the
    # fixed storage path keeps entries from earlier runs - confirm.
    summary = feedback_system.get_feedback_summary()
    assert summary['total_errors'] >= 1, "Classification error should be recorded"
    assert summary['total_question_issues'] >= 1, "Question issue should be recorded"
    assert summary['total_referral_problems'] >= 1, "Referral problem should be recorded"
    assert len(recorded_ids) == 3, "All feedback should return IDs"
    # The length check alone cannot fail, so also require real return values.
    assert all(rid is not None for rid in recorded_ids), "All feedback should return IDs"

    # Verify structured data fields are present. An empty result must fail
    # the test instead of silently skipping the field check.
    errors = feedback_system._load_errors()
    assert errors, "Recorded classification error should be loadable"
    latest_error = errors[-1]
    required_fields = ['error_id', 'error_type', 'subcategory', 'expected_category',
                       'actual_category', 'message_content', 'reviewer_comments',
                       'confidence_level', 'timestamp']
    for field in required_fields:
        assert field in latest_error, f"Required field {field} missing"

    print("✓ Task 4.1: Property-based structured feedback capture works correctly")
    return True
def test_task_4_2_classification_error_data_model():
    """Test Task 4.2: ClassificationError data model implementation.

    Builds a ``ClassificationError``, checks selected keys of its ``to_dict()``
    output, rebuilds it with ``from_dict()`` and compares selected attributes,
    then checks that every ``ErrorType`` and ``ErrorSubcategory`` member has a
    string value.

    Returns:
        True when all assertions pass; a failed check raises AssertionError.
    """
    print("Testing Task 4.2: ClassificationError data model...")
    from datetime import datetime

    # Test ClassificationError creation
    error = ClassificationError(
        error_id="test_error_123",
        error_type=ErrorType.SEVERITY_MISJUDGMENT,
        subcategory=ErrorSubcategory.UNDERESTIMATED_DISTRESS,
        expected_category="RED",
        actual_category="YELLOW",
        message_content="I don't think I can go on like this anymore",
        reviewer_comments="Clear indication of severe distress, should be RED not YELLOW",
        confidence_level=0.95,
        timestamp=datetime.now(),
        session_id="test_session_456",
        additional_context={"reviewer_id": "reviewer_789"},
    )

    # Test serialization
    error_dict = error.to_dict()
    expected_serialized = {
        'error_id': "test_error_123",
        'error_type': 'severity_misjudgment',
        'subcategory': 'underestimated_distress',
        'confidence_level': 0.95,
    }
    for key, expected in expected_serialized.items():
        assert error_dict[key] == expected, \
            f"Serialized {key} should be {expected!r}, got {error_dict[key]!r}"

    # Test deserialization
    reconstructed_error = ClassificationError.from_dict(error_dict)
    for attr in ('error_id', 'error_type', 'subcategory', 'confidence_level'):
        assert getattr(reconstructed_error, attr) == getattr(error, attr), \
            f"Round-trip changed attribute: {attr}"

    # Test all error types and subcategories
    for error_type in ErrorType:
        assert isinstance(error_type.value, str), f"Error type {error_type} should have string value"
    for subcategory in ErrorSubcategory:
        assert isinstance(subcategory.value, str), f"Subcategory {subcategory} should have string value"

    print("✓ Task 4.2: ClassificationError data model works correctly")
    return True
def test_task_4_3_feedback_ui_integration():
    """Test Task 4.3: Feedback UI integration.

    Checks that a default ``FeedbackUIIntegration`` exposes the expected
    error-type, question-issue and scenario option values, that every error
    type has a non-empty subcategory mapping, and that the interface-building
    methods exist.

    Returns:
        True when all assertions pass; a failed check raises AssertionError.
    """
    print("Testing Task 4.3: Feedback UI integration...")

    # Test UI integration initialization
    ui_integration = FeedbackUIIntegration()

    # Verify error type options are complete
    expected_error_types = [
        'wrong_classification', 'severity_misjudgment', 'missed_indicators',
        'false_positive', 'context_misunderstanding', 'language_interpretation',
    ]
    # Options are unpacked as (label, value) pairs; only the value is compared.
    actual_error_types = [value for _, value in ui_integration.error_type_options]
    for expected_type in expected_error_types:
        assert expected_type in actual_error_types, f"Missing error type: {expected_type}"

    # Verify subcategory mappings are complete
    for _, error_type_value in ui_integration.error_type_options:
        assert error_type_value in ui_integration.subcategory_mapping, \
            f"Missing subcategory mapping for: {error_type_value}"
        subcategories = ui_integration.subcategory_mapping[error_type_value]
        assert len(subcategories) > 0, f"No subcategories for: {error_type_value}"

    # Verify question issue options
    expected_question_types = [
        'inappropriate_question', 'insensitive_language', 'wrong_scenario_targeting',
        'unclear_question', 'leading_question',
    ]
    actual_question_types = [value for _, value in ui_integration.question_issue_options]
    for expected_type in expected_question_types:
        assert expected_type in actual_question_types, f"Missing question issue type: {expected_type}"

    # Verify scenario options
    expected_scenarios = [
        'loss_of_interest', 'loss_of_loved_one', 'no_support',
        'vague_stress', 'sleep_issues', 'spiritual_practice_change',
    ]
    actual_scenarios = [value for _, value in ui_integration.scenario_options]
    for expected_scenario in expected_scenarios:
        assert expected_scenario in actual_scenarios, f"Missing scenario: {expected_scenario}"

    # Test UI component methods exist
    required_methods = (
        'create_classification_error_interface',
        'create_question_issue_interface',
        'create_pattern_analysis_display',
        'create_complete_feedback_interface',
    )
    for method_name in required_methods:
        assert hasattr(ui_integration, method_name), f"Missing UI method: {method_name}"

    print("✓ Task 4.3: Feedback UI integration works correctly")
    return True
def test_task_4_4_error_pattern_analysis():
    """Test Task 4.4: Error pattern analysis implementation.

    Checks that ``PatternRecognizer`` keeps its constructor settings, records
    four similar classification errors and three similar question issues,
    then verifies the patterns returned by ``analyze_error_patterns`` and the
    structure of the report from ``generate_optimization_report``.

    Returns:
        True when all assertions pass; a failed check raises AssertionError.
    """
    print("Testing Task 4.4: Error pattern analysis...")

    # Test PatternRecognizer initialization
    recognizer = PatternRecognizer(min_pattern_frequency=2, confidence_threshold=0.7)
    assert recognizer.min_pattern_frequency == 2
    assert recognizer.confidence_threshold == 0.7

    # Create test feedback system with pattern recognizer
    feedback_system = FeedbackSystem(storage_path=".verification_data/task4_pattern_test")

    # Record multiple similar errors to create patterns
    for i in range(4):
        feedback_system.record_classification_error(
            error_type=ErrorType.WRONG_CLASSIFICATION,
            subcategory=ErrorSubcategory.GREEN_TO_YELLOW,
            expected_category="YELLOW",
            actual_category="GREEN",
            message_content=f"I feel stressed and overwhelmed about work situation {i}",
            reviewer_comments=f"Clear distress indicators were missed {i}",
            confidence_level=0.85 + (i * 0.02),
            session_id=f"pattern_test_session_{i}",
            additional_context={"scenario_type": "vague_stress"},
        )

    # Record question issues
    for i in range(3):
        feedback_system.record_question_issue(
            issue_type=QuestionIssueType.INAPPROPRIATE_QUESTION,
            question_content=f"What's wrong with you? {i}",
            scenario_type=ScenarioType.VAGUE_STRESS,
            reviewer_comments=f"Too direct and potentially offensive {i}",
            severity="high",
            session_id=f"pattern_test_session_{i}",
        )

    # Analyze patterns
    patterns = feedback_system.analyze_error_patterns(min_frequency=2)

    # Verify patterns were identified
    assert len(patterns) > 0, "Should identify error patterns"

    # Check for wrong classification pattern
    wrong_classification_patterns = [p for p in patterns if 'wrong_classification' in p.pattern_type]
    assert len(wrong_classification_patterns) > 0, "Should identify wrong classification pattern"

    # Verify pattern structure
    for pattern in patterns:
        assert hasattr(pattern, 'pattern_id'), "Pattern should have ID"
        assert hasattr(pattern, 'frequency'), "Pattern should have frequency"
        assert hasattr(pattern, 'suggested_improvements'), "Pattern should have suggestions"
        assert pattern.frequency >= 2, "Pattern frequency should meet minimum"
        assert len(pattern.suggested_improvements) > 0, "Pattern should have improvement suggestions"

    # Test optimization report generation
    report = feedback_system.generate_optimization_report()

    # Verify report structure
    required_fields = [
        'summary', 'total_patterns', 'recommendations', 'priority_actions',
        'confidence_score', 'most_frequent_pattern', 'affected_scenarios',
        'report_generated',
    ]
    for field in required_fields:
        assert field in report, f"Report missing required field: {field}"
    assert report['total_patterns'] > 0, "Report should show patterns"
    assert len(report['recommendations']) > 0, "Report should have recommendations"
    assert 0.0 <= report['confidence_score'] <= 1.0, "Confidence score should be valid"

    print("✓ Task 4.4: Error pattern analysis works correctly")
    return True
def test_end_to_end_feedback_workflow():
    """Test the complete end-to-end feedback workflow.

    Records one classification error, one question issue and one referral
    problem, checks the feedback summary, runs pattern analysis, generates an
    optimization report, and confirms the UI integration holds the feedback
    system it was constructed with.

    Returns:
        True when all assertions pass; a failed check raises AssertionError.
    """
    print("Testing end-to-end feedback workflow...")

    # Create fresh feedback system
    feedback_system = FeedbackSystem(storage_path=".verification_data/task4_e2e_test")

    # Create UI integration
    ui_integration = FeedbackUIIntegration(feedback_system=feedback_system)

    # Simulate complete feedback workflow
    # 1. Record various types of feedback (the returned IDs are not needed here)
    feedback_system.record_classification_error(
        error_type=ErrorType.CONTEXT_MISUNDERSTANDING,
        subcategory=ErrorSubcategory.IGNORED_HISTORY,
        expected_category="RED",
        actual_category="GREEN",
        message_content="I mentioned earlier that I've been having thoughts of ending it all",
        reviewer_comments="System ignored previous context about suicidal ideation",
        confidence_level=0.95,
        session_id="e2e_session_1",
        additional_context={"conversation_turn": 3, "previous_classification": "YELLOW"},
    )
    feedback_system.record_question_issue(
        issue_type=QuestionIssueType.INSENSITIVE_LANGUAGE,
        question_content="Why don't you just try to be more positive?",
        scenario_type=ScenarioType.LOSS_OF_LOVED_ONE,
        reviewer_comments="Dismissive language inappropriate for grief scenario",
        severity="high",
        session_id="e2e_session_2",
        suggested_improvement="Ask: 'How are you processing this loss?'",
    )
    feedback_system.record_referral_problem(
        problem_type=ReferralProblemType.MISSING_CONTACT_INFO,
        referral_content="Patient experiencing severe spiritual distress and needs immediate support.",
        reviewer_comments="No contact information or urgency level specified",
        severity="high",
        session_id="e2e_session_3",
        missing_fields=["contact_phone", "urgency_level", "preferred_contact_time"],
    )

    # 2. Verify all feedback was recorded
    summary = feedback_system.get_feedback_summary()
    assert summary['total_errors'] >= 1, "Classification error should be recorded"
    assert summary['total_question_issues'] >= 1, "Question issue should be recorded"
    assert summary['total_referral_problems'] >= 1, "Referral problem should be recorded"

    # 3. Analyze patterns (may not find patterns with single instances), so
    # the call is only exercised and its result is deliberately not checked.
    feedback_system.analyze_error_patterns(min_frequency=1)

    # 4. Generate optimization report
    report = feedback_system.generate_optimization_report()
    assert 'summary' in report, "Report missing required field: summary"
    assert 'recommendations' in report, "Report missing required field: recommendations"

    # 5. Verify UI integration can access the data
    assert ui_integration.feedback_system == feedback_system, \
        "UI integration should use the supplied feedback system"

    print("✓ End-to-end feedback workflow works correctly")
    return True
def main():
    """Run all Task 4 tests and print a pass/fail report.

    Each test function is called in order. A test counts as passed when it
    returns a truthy value, and as failed when it returns a falsy value or
    raises an exception; a failure never stops the remaining tests.

    Returns:
        True if no test failed, False otherwise.
    """
    banner = "=" * 70
    print(banner)
    print("TASK 4: BUILD STRUCTURED FEEDBACK SYSTEM - COMPREHENSIVE TESTS")
    print(banner)

    tests = [
        test_task_4_1_property_based_feedback_capture,
        test_task_4_2_classification_error_data_model,
        test_task_4_3_feedback_ui_integration,
        test_task_4_4_error_pattern_analysis,
        test_end_to_end_feedback_workflow,
    ]

    passed = 0
    failed = 0
    for test in tests:
        print(f"\n{test.__name__.replace('_', ' ').title()}:")
        print("-" * 50)
        try:
            result = test()
        except Exception as e:  # runner boundary: report the failure and continue
            failed += 1
            # Include the exception type: a bare ``assert`` has an empty message.
            print(f"✗ FAILED: {type(e).__name__}: {e}")
        else:
            if result:
                passed += 1
                print("✓ PASSED")
            else:
                failed += 1
                print("✗ FAILED")

    print("\n" + banner)
    print(f"RESULTS: {passed} passed, {failed} failed")
    print(banner)

    if failed:
        print("✗ Some tests failed. Please check the implementation.")
        return False

    print("🎉 ALL TASK 4 TESTS PASSED!")
    print("\n**TASK 4: BUILD STRUCTURED FEEDBACK SYSTEM**")
    completed_tasks = (
        "Task 4.1 - Property test for structured feedback capture",
        "Task 4.2 - ClassificationError data model",
        "Task 4.3 - Feedback UI integration",
        "Task 4.4 - Error pattern analysis",
    )
    for task in completed_tasks:
        print(f"✓ COMPLETED: {task}")

    print("\n**Requirements Validated:**")
    validated_requirements = (
        "3.1 - Predefined error categories from documentation",
        "3.2 - Specific subcategories of wrong classification types",
        "3.3 - Structured feedback about question quality",
        "3.4 - Pattern analysis and improvement suggestions",
        "3.5 - Feedback aggregation and reporting",
    )
    for requirement in validated_requirements:
        print(f"✓ {requirement}")
    return True
if __name__ == "__main__":
    # Script entry point: exit status 0 when main() reports success, 1 otherwise.
    sys.exit(0 if main() else 1)