#!/usr/bin/env python3
"""
Test script for the structured feedback system.
Tests the Task 4.1 and 4.2 implementation.
"""

import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'src'))

from config.prompt_management.feedback_system import FeedbackSystem
from config.prompt_management.data_models import (
    ErrorType, ErrorSubcategory, QuestionIssueType, ReferralProblemType, ScenarioType
)


def test_classification_error_recording():
    """Test recording classification errors with all required fields."""
    print("Testing classification error recording...")

    feedback_system = FeedbackSystem(storage_path=".verification_data/test_feedback")

    # Record a classification error
    error_id = feedback_system.record_classification_error(
        error_type=ErrorType.WRONG_CLASSIFICATION,
        subcategory=ErrorSubcategory.GREEN_TO_YELLOW,
        expected_category="YELLOW",
        actual_category="GREEN",
        message_content="I feel a bit stressed about work lately",
        reviewer_comments="Patient expressed stress but system classified as GREEN. Should be YELLOW for follow-up.",
        confidence_level=0.85,
        session_id="test_session_001",
        additional_context={"reviewer_id": "reviewer_123", "review_date": "2024-12-18"}
    )

    print(f"✓ Recorded classification error with ID: {error_id}")

    # Verify the error was stored correctly
    errors = feedback_system._load_errors()
    assert len(errors) >= 1, "Error should be stored"

    latest_error = errors[-1]
    assert latest_error['error_id'] == error_id
    assert latest_error['error_type'] == 'wrong_classification'
    assert latest_error['subcategory'] == 'green_to_yellow'
    assert latest_error['expected_category'] == 'YELLOW'
    assert latest_error['actual_category'] == 'GREEN'
    assert latest_error['confidence_level'] == 0.85

    print("✓ Classification error stored with all required fields")
    return True
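
# The tests in this file all share the ".verification_data/test_feedback"
# directory, which is why they assert with ">=" rather than exact counts.
# The sketch below shows how a run could be given a clean slate. It assumes
# FeedbackSystem keeps all of its state as files under storage_path (an
# assumption about the implementation, not a documented guarantee), so it is
# offered as an optional helper rather than wired into the tests themselves.
def _reset_test_storage(storage_path=".verification_data/test_feedback"):
    """Delete any feedback data left over from earlier runs (optional)."""
    import shutil
    shutil.rmtree(storage_path, ignore_errors=True)
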

def test_question_issue_recording():
    """Test recording question issues."""
    print("Testing question issue recording...")

    feedback_system = FeedbackSystem(storage_path=".verification_data/test_feedback")

    # Record a question issue
    issue_id = feedback_system.record_question_issue(
        issue_type=QuestionIssueType.INAPPROPRIATE_QUESTION,
        question_content="Why are you feeling sad?",
        scenario_type=ScenarioType.LOSS_OF_INTEREST,
        reviewer_comments="Question is too direct and assumes emotional state. Should ask about impact instead.",
        severity="medium",
        session_id="test_session_002",
        suggested_improvement="Ask: 'Is that something that's been weighing on you emotionally?'"
    )

    print(f"✓ Recorded question issue with ID: {issue_id}")

    # Verify the issue was stored correctly
    issues = feedback_system._load_question_issues()
    assert len(issues) >= 1, "Issue should be stored"

    latest_issue = issues[-1]
    assert latest_issue['issue_id'] == issue_id
    assert latest_issue['issue_type'] == 'inappropriate_question'
    assert latest_issue['scenario_type'] == 'loss_of_interest'
    assert latest_issue['severity'] == 'medium'

    print("✓ Question issue stored with all required fields")
    return True


def test_referral_problem_recording():
    """Test recording referral problems."""
    print("Testing referral problem recording...")

    feedback_system = FeedbackSystem(storage_path=".verification_data/test_feedback")

    # Record a referral problem
    problem_id = feedback_system.record_referral_problem(
        problem_type=ReferralProblemType.INCOMPLETE_SUMMARY,
        referral_content="Patient needs spiritual care support.",
        reviewer_comments="Summary lacks specific distress indicators and conversation context.",
        severity="high",
        session_id="test_session_003",
        missing_fields=["distress_indicators", "conversation_context", "urgency_level"]
    )

    print(f"✓ Recorded referral problem with ID: {problem_id}")

    # Verify the problem was stored correctly
    problems = feedback_system._load_referral_problems()
    assert len(problems) >= 1, "Problem should be stored"

    latest_problem = problems[-1]
    assert latest_problem['problem_id'] == problem_id
    assert latest_problem['problem_type'] == 'incomplete_summary'
    assert latest_problem['severity'] == 'high'
    assert len(latest_problem['missing_fields']) == 3

    print("✓ Referral problem stored with all required fields")
    return True


def test_error_pattern_analysis():
    """Test error pattern analysis functionality."""
    print("Testing error pattern analysis...")

    feedback_system = FeedbackSystem(storage_path=".verification_data/test_feedback")

    # Record multiple similar errors to create a pattern
    for i in range(4):
        feedback_system.record_classification_error(
            error_type=ErrorType.WRONG_CLASSIFICATION,
            subcategory=ErrorSubcategory.GREEN_TO_YELLOW,
            expected_category="YELLOW",
            actual_category="GREEN",
            message_content=f"Test message {i} about stress",
            reviewer_comments=f"Test comment {i}",
            confidence_level=0.8 + (i * 0.05),
            session_id=f"pattern_test_{i}"
        )

    # Analyze patterns
    patterns = feedback_system.analyze_error_patterns(min_frequency=3)
    print(f"✓ Identified {len(patterns)} error patterns")

    # Verify pattern structure
    for pattern in patterns:
        assert hasattr(pattern, 'pattern_id')
        assert hasattr(pattern, 'frequency')
        assert hasattr(pattern, 'suggested_improvements')
        assert pattern.frequency >= 3
        assert len(pattern.suggested_improvements) > 0

        print(f"  - Pattern: {pattern.pattern_type} (frequency: {pattern.frequency})")
        for suggestion in pattern.suggested_improvements[:2]:  # Show first 2 suggestions
            print(f"    Suggestion: {suggestion}")

    return True
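
# For context, a downstream consumer would typically rank detected patterns
# before surfacing them to reviewers. The illustrative sketch below relies only
# on the attributes the test above already asserts on (frequency,
# suggested_improvements); it introduces no new API and is not itself tested.
def _top_patterns(patterns, limit=3):
    """Return the most frequent error patterns, highest frequency first."""
    return sorted(patterns, key=lambda p: p.frequency, reverse=True)[:limit]
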

def test_feedback_summary():
    """Test comprehensive feedback summary generation."""
    print("Testing feedback summary generation...")

    feedback_system = FeedbackSystem(storage_path=".verification_data/test_feedback")

    # Get comprehensive summary
    summary = feedback_system.get_feedback_summary()

    # Verify summary structure
    required_fields = [
        'total_errors', 'total_question_issues', 'total_referral_problems',
        'error_types', 'error_subcategories', 'question_issue_types',
        'referral_problem_types', 'average_confidence', 'recent_errors',
        'improvement_suggestions'
    ]
    for field in required_fields:
        assert field in summary, f"Summary missing required field: {field}"

    print("✓ Summary contains all required fields")
    print(f"  - Total errors: {summary['total_errors']}")
    print(f"  - Total question issues: {summary['total_question_issues']}")
    print(f"  - Total referral problems: {summary['total_referral_problems']}")
    print(f"  - Average confidence: {summary['average_confidence']:.2f}")
    print(f"  - Recent errors: {summary['recent_errors']}")

    # Show improvement suggestions
    print("  - Top improvement suggestions:")
    for i, suggestion in enumerate(summary['improvement_suggestions'][:3], 1):
        print(f"    {i}. {suggestion}")

    return True


def test_data_model_serialization():
    """Test that data models serialize and deserialize correctly."""
    print("Testing data model serialization...")

    from config.prompt_management.data_models import ClassificationError
    from datetime import datetime

    # Create a classification error
    error = ClassificationError(
        error_id="test_error_123",
        error_type=ErrorType.SEVERITY_MISJUDGMENT,
        subcategory=ErrorSubcategory.UNDERESTIMATED_DISTRESS,
        expected_category="RED",
        actual_category="YELLOW",
        message_content="I don't think I can go on like this anymore",
        reviewer_comments="Clear indication of severe distress, should be RED not YELLOW",
        confidence_level=0.95,
        timestamp=datetime.now(),
        session_id="serialization_test",
        additional_context={"test": True}
    )

    # Test serialization
    error_dict = error.to_dict()
    assert isinstance(error_dict, dict)
    assert error_dict['error_id'] == "test_error_123"
    assert error_dict['error_type'] == 'severity_misjudgment'

    # Test deserialization
    reconstructed_error = ClassificationError.from_dict(error_dict)
    assert reconstructed_error.error_id == error.error_id
    assert reconstructed_error.error_type == error.error_type
    assert reconstructed_error.confidence_level == error.confidence_level

    print("✓ Data model serialization works correctly")
    return True


def main():
    """Run all feedback system tests."""
    print("=" * 60)
    print("STRUCTURED FEEDBACK SYSTEM TESTS")
    print("=" * 60)

    tests = [
        test_classification_error_recording,
        test_question_issue_recording,
        test_referral_problem_recording,
        test_error_pattern_analysis,
        test_feedback_summary,
        test_data_model_serialization
    ]

    passed = 0
    failed = 0

    for test in tests:
        try:
            print(f"\n{test.__name__.replace('_', ' ').title()}:")
            print("-" * 40)
            result = test()
            if result:
                passed += 1
                print("✓ PASSED")
            else:
                failed += 1
                print("✗ FAILED")
        except Exception as e:
            failed += 1
            print(f"✗ FAILED: {str(e)}")

    print("\n" + "=" * 60)
    print(f"RESULTS: {passed} passed, {failed} failed")
    print("=" * 60)

    if failed == 0:
        print("🎉 All feedback system tests passed!")
        print("\n**Feature: prompt-optimization, Property 3: Structured Feedback Data Capture**")
        print("✓ VALIDATED: Requirements 3.1, 3.2, 3.3, 3.4, 3.5")
        return True
    else:
        print("❌ Some tests failed. Please check the implementation.")
        return False


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)
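
# Usage note: run this file directly with Python 3. The process exits 0 when
# every test passes and 1 otherwise, so the script can gate a CI step. Counts
# are asserted with ">=" because the shared storage directory may still hold
# data from earlier runs; clear it first (see _reset_test_storage above) if an
# exact-count run is ever needed.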