#!/usr/bin/env python3
"""
Test script for the structured feedback system.
Tests the Task 4.1 and 4.2 implementations.
"""
import sys
import os

sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'src'))

from config.prompt_management.feedback_system import FeedbackSystem
from config.prompt_management.data_models import (
    ErrorType, ErrorSubcategory, QuestionIssueType, ReferralProblemType, ScenarioType
)
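
# NOTE: All tests share one on-disk store (.verification_data/test_feedback),
# so the count assertions below use ">= 1"; records accumulate across runs.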


def test_classification_error_recording():
    """Test recording classification errors with all required fields."""
    print("Testing classification error recording...")
    feedback_system = FeedbackSystem(storage_path=".verification_data/test_feedback")

    # Record a classification error
    error_id = feedback_system.record_classification_error(
        error_type=ErrorType.WRONG_CLASSIFICATION,
        subcategory=ErrorSubcategory.GREEN_TO_YELLOW,
        expected_category="YELLOW",
        actual_category="GREEN",
        message_content="I feel a bit stressed about work lately",
        reviewer_comments="Patient expressed stress but system classified as GREEN. Should be YELLOW for follow-up.",
        confidence_level=0.85,
        session_id="test_session_001",
        additional_context={"reviewer_id": "reviewer_123", "review_date": "2024-12-18"}
    )
    print(f"✓ Recorded classification error with ID: {error_id}")

    # Verify the error was stored correctly
    errors = feedback_system._load_errors()
    assert len(errors) >= 1, "Error should be stored"
    latest_error = errors[-1]
    assert latest_error['error_id'] == error_id
    assert latest_error['error_type'] == 'wrong_classification'
    assert latest_error['subcategory'] == 'green_to_yellow'
    assert latest_error['expected_category'] == 'YELLOW'
    assert latest_error['actual_category'] == 'GREEN'
    assert latest_error['confidence_level'] == 0.85
    print("✓ Classification error stored with all required fields")
    return True


def test_question_issue_recording():
    """Test recording question issues."""
    print("Testing question issue recording...")
    feedback_system = FeedbackSystem(storage_path=".verification_data/test_feedback")

    # Record a question issue
    issue_id = feedback_system.record_question_issue(
        issue_type=QuestionIssueType.INAPPROPRIATE_QUESTION,
        question_content="Why are you feeling sad?",
        scenario_type=ScenarioType.LOSS_OF_INTEREST,
        reviewer_comments="Question is too direct and assumes emotional state. Should ask about impact instead.",
        severity="medium",
        session_id="test_session_002",
        suggested_improvement="Ask: 'Is that something that's been weighing on you emotionally?'"
    )
    print(f"✓ Recorded question issue with ID: {issue_id}")

    # Verify the issue was stored correctly
    issues = feedback_system._load_question_issues()
    assert len(issues) >= 1, "Issue should be stored"
    latest_issue = issues[-1]
    assert latest_issue['issue_id'] == issue_id
    assert latest_issue['issue_type'] == 'inappropriate_question'
    assert latest_issue['scenario_type'] == 'loss_of_interest'
    assert latest_issue['severity'] == 'medium'
    print("✓ Question issue stored with all required fields")
    return True


def test_referral_problem_recording():
    """Test recording referral problems."""
    print("Testing referral problem recording...")
    feedback_system = FeedbackSystem(storage_path=".verification_data/test_feedback")

    # Record a referral problem
    problem_id = feedback_system.record_referral_problem(
        problem_type=ReferralProblemType.INCOMPLETE_SUMMARY,
        referral_content="Patient needs spiritual care support.",
        reviewer_comments="Summary lacks specific distress indicators and conversation context.",
        severity="high",
        session_id="test_session_003",
        missing_fields=["distress_indicators", "conversation_context", "urgency_level"]
    )
    print(f"✓ Recorded referral problem with ID: {problem_id}")

    # Verify the problem was stored correctly
    problems = feedback_system._load_referral_problems()
    assert len(problems) >= 1, "Problem should be stored"
    latest_problem = problems[-1]
    assert latest_problem['problem_id'] == problem_id
    assert latest_problem['problem_type'] == 'incomplete_summary'
    assert latest_problem['severity'] == 'high'
    assert len(latest_problem['missing_fields']) == 3
    print("✓ Referral problem stored with all required fields")
    return True


def test_error_pattern_analysis():
    """Test error pattern analysis functionality."""
    print("Testing error pattern analysis...")
    feedback_system = FeedbackSystem(storage_path=".verification_data/test_feedback")

    # Record multiple similar errors to create a pattern
    for i in range(4):
        feedback_system.record_classification_error(
            error_type=ErrorType.WRONG_CLASSIFICATION,
            subcategory=ErrorSubcategory.GREEN_TO_YELLOW,
            expected_category="YELLOW",
            actual_category="GREEN",
            message_content=f"Test message {i} about stress",
            reviewer_comments=f"Test comment {i}",
            confidence_level=0.8 + (i * 0.05),
            session_id=f"pattern_test_{i}"
        )
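    # Four matching errors guarantee the pattern clears the min_frequency=3
    # threshold passed to analyze_error_patterns() below.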

    # Analyze patterns
    patterns = feedback_system.analyze_error_patterns(min_frequency=3)
    print(f"✓ Identified {len(patterns)} error patterns")

    # Verify pattern structure
    for pattern in patterns:
        assert hasattr(pattern, 'pattern_id')
        assert hasattr(pattern, 'frequency')
        assert hasattr(pattern, 'suggested_improvements')
        assert pattern.frequency >= 3
        assert len(pattern.suggested_improvements) > 0
        print(f"  - Pattern: {pattern.pattern_type} (frequency: {pattern.frequency})")
        for suggestion in pattern.suggested_improvements[:2]:  # Show first 2 suggestions
            print(f"    Suggestion: {suggestion}")
    return True


def test_feedback_summary():
    """Test comprehensive feedback summary generation."""
    print("Testing feedback summary generation...")
    feedback_system = FeedbackSystem(storage_path=".verification_data/test_feedback")

    # Get comprehensive summary
    summary = feedback_system.get_feedback_summary()

    # Verify summary structure
    required_fields = [
        'total_errors', 'total_question_issues', 'total_referral_problems',
        'error_types', 'error_subcategories', 'question_issue_types',
        'referral_problem_types', 'average_confidence', 'recent_errors',
        'improvement_suggestions'
    ]
    for field in required_fields:
        assert field in summary, f"Summary missing required field: {field}"
    print("✓ Summary contains all required fields")
    print(f"  - Total errors: {summary['total_errors']}")
    print(f"  - Total question issues: {summary['total_question_issues']}")
    print(f"  - Total referral problems: {summary['total_referral_problems']}")
    print(f"  - Average confidence: {summary['average_confidence']:.2f}")
    print(f"  - Recent errors: {summary['recent_errors']}")

    # Show improvement suggestions
    print("  - Top improvement suggestions:")
    for i, suggestion in enumerate(summary['improvement_suggestions'][:3], 1):
        print(f"    {i}. {suggestion}")
    return True


def test_data_model_serialization():
    """Test that data models serialize and deserialize correctly."""
    print("Testing data model serialization...")
    from config.prompt_management.data_models import ClassificationError
    from datetime import datetime

    # Create a classification error
    error = ClassificationError(
        error_id="test_error_123",
        error_type=ErrorType.SEVERITY_MISJUDGMENT,
        subcategory=ErrorSubcategory.UNDERESTIMATED_DISTRESS,
        expected_category="RED",
        actual_category="YELLOW",
        message_content="I don't think I can go on like this anymore",
        reviewer_comments="Clear indication of severe distress, should be RED not YELLOW",
        confidence_level=0.95,
        timestamp=datetime.now(),
        session_id="serialization_test",
        additional_context={"test": True}
    )
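    # to_dict() is expected to store enums by their string values and
    # from_dict() to restore them as enum members; the checks below depend
    # on that round-trip behavior.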

    # Test serialization
    error_dict = error.to_dict()
    assert isinstance(error_dict, dict)
    assert error_dict['error_id'] == "test_error_123"
    assert error_dict['error_type'] == 'severity_misjudgment'

    # Test deserialization
    reconstructed_error = ClassificationError.from_dict(error_dict)
    assert reconstructed_error.error_id == error.error_id
    assert reconstructed_error.error_type == error.error_type
    assert reconstructed_error.confidence_level == error.confidence_level
    print("✓ Data model serialization works correctly")
    return True


def main():
    """Run all feedback system tests."""
    print("=" * 60)
    print("STRUCTURED FEEDBACK SYSTEM TESTS")
    print("=" * 60)

    tests = [
        test_classification_error_recording,
        test_question_issue_recording,
        test_referral_problem_recording,
        test_error_pattern_analysis,
        test_feedback_summary,
        test_data_model_serialization
    ]

    passed = 0
    failed = 0
    for test in tests:
        try:
            print(f"\n{test.__name__.replace('_', ' ').title()}:")
            print("-" * 40)
            result = test()
            if result:
                passed += 1
                print("✓ PASSED")
            else:
                failed += 1
                print("✗ FAILED")
        except Exception as e:
            failed += 1
            print(f"✗ FAILED: {str(e)}")

    print("\n" + "=" * 60)
    print(f"RESULTS: {passed} passed, {failed} failed")
    print("=" * 60)

    if failed == 0:
        print("🎉 All feedback system tests passed!")
        print("\n**Feature: prompt-optimization, Property 3: Structured Feedback Data Capture**")
        print("✓ VALIDATED: Requirements 3.1, 3.2, 3.3, 3.4, 3.5")
        return True
    else:
        print("✗ Some tests failed. Please check the implementation.")
        return False


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)