File size: 10,421 Bytes
24214fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
#!/usr/bin/env python3
"""
Test script for the structured feedback system.
Tests Task 4.1 and 4.2 implementation.
"""

import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'src'))

from config.prompt_management.feedback_system import FeedbackSystem
from config.prompt_management.data_models import (
    ErrorType, ErrorSubcategory, QuestionIssueType, ReferralProblemType, ScenarioType
)


def test_classification_error_recording():
    """Record one classification error and confirm it was persisted.

    Submits a single misclassification through
    ``FeedbackSystem.record_classification_error`` and then checks that the
    last entry returned by ``_load_errors`` carries the returned ID together
    with the submitted values (enum members are compared against their
    lowercase string form).

    Returns:
        True when every assertion holds.
    """
    print("Testing classification error recording...")

    system = FeedbackSystem(storage_path=".verification_data/test_feedback")

    # Everything the reviewer submits for this error, passed as keywords.
    submission = {
        "error_type": ErrorType.WRONG_CLASSIFICATION,
        "subcategory": ErrorSubcategory.GREEN_TO_YELLOW,
        "expected_category": "YELLOW",
        "actual_category": "GREEN",
        "message_content": "I feel a bit stressed about work lately",
        "reviewer_comments": "Patient expressed stress but system classified as GREEN. Should be YELLOW for follow-up.",
        "confidence_level": 0.85,
        "session_id": "test_session_001",
        "additional_context": {"reviewer_id": "reviewer_123", "review_date": "2024-12-18"},
    }
    recorded_id = system.record_classification_error(**submission)

    print(f"βœ“ Recorded classification error with ID: {recorded_id}")

    # The store may already hold entries from earlier runs, so only the
    # most recent record is inspected.
    stored_errors = system._load_errors()
    assert len(stored_errors) >= 1, "Error should be stored"

    newest = stored_errors[-1]
    expected_fields = {
        'error_id': recorded_id,
        'error_type': 'wrong_classification',
        'subcategory': 'green_to_yellow',
        'expected_category': 'YELLOW',
        'actual_category': 'GREEN',
        'confidence_level': 0.85,
    }
    for field_name, expected_value in expected_fields.items():
        assert newest[field_name] == expected_value

    print("βœ“ Classification error stored with all required fields")
    return True


def test_question_issue_recording():
    """Record one question issue and confirm it was persisted.

    Submits an issue through ``FeedbackSystem.record_question_issue`` and
    checks that the last entry returned by ``_load_question_issues`` carries
    the returned ID, the issue type, the scenario type and the severity.

    Returns:
        True when every assertion holds.
    """
    print("Testing question issue recording...")

    system = FeedbackSystem(storage_path=".verification_data/test_feedback")

    # The reviewer's report about a problematic follow-up question.
    submission = {
        "issue_type": QuestionIssueType.INAPPROPRIATE_QUESTION,
        "question_content": "Why are you feeling sad?",
        "scenario_type": ScenarioType.LOSS_OF_INTEREST,
        "reviewer_comments": "Question is too direct and assumes emotional state. Should ask about impact instead.",
        "severity": "medium",
        "session_id": "test_session_002",
        "suggested_improvement": "Ask: 'Is that something that's been weighing on you emotionally?'",
    }
    recorded_id = system.record_question_issue(**submission)

    print(f"βœ“ Recorded question issue with ID: {recorded_id}")

    # Only the newest stored issue is inspected; older ones may exist.
    stored_issues = system._load_question_issues()
    assert len(stored_issues) >= 1, "Issue should be stored"

    newest = stored_issues[-1]
    expected_fields = {
        'issue_id': recorded_id,
        'issue_type': 'inappropriate_question',
        'scenario_type': 'loss_of_interest',
        'severity': 'medium',
    }
    for field_name, expected_value in expected_fields.items():
        assert newest[field_name] == expected_value

    print("βœ“ Question issue stored with all required fields")
    return True


def test_referral_problem_recording():
    """Record one referral problem and confirm it was persisted.

    Submits a problem through ``FeedbackSystem.record_referral_problem`` and
    checks that the last entry returned by ``_load_referral_problems``
    carries the returned ID, the problem type, the severity and all three
    missing-field names.

    Returns:
        True when every assertion holds.
    """
    print("Testing referral problem recording...")

    system = FeedbackSystem(storage_path=".verification_data/test_feedback")

    # The reviewer's report about an incomplete referral summary.
    submission = {
        "problem_type": ReferralProblemType.INCOMPLETE_SUMMARY,
        "referral_content": "Patient needs spiritual care support.",
        "reviewer_comments": "Summary lacks specific distress indicators and conversation context.",
        "severity": "high",
        "session_id": "test_session_003",
        "missing_fields": ["distress_indicators", "conversation_context", "urgency_level"],
    }
    recorded_id = system.record_referral_problem(**submission)

    print(f"βœ“ Recorded referral problem with ID: {recorded_id}")

    # Only the newest stored problem is inspected; older ones may exist.
    stored_problems = system._load_referral_problems()
    assert len(stored_problems) >= 1, "Problem should be stored"

    newest = stored_problems[-1]
    expected_fields = {
        'problem_id': recorded_id,
        'problem_type': 'incomplete_summary',
        'severity': 'high',
    }
    for field_name, expected_value in expected_fields.items():
        assert newest[field_name] == expected_value
    assert len(newest['missing_fields']) == 3

    print("βœ“ Referral problem stored with all required fields")
    return True


def test_error_pattern_analysis():
    """Seed several similar errors and check the shape of detected patterns.

    Records four classification errors of the same type and subcategory,
    asks ``FeedbackSystem.analyze_error_patterns`` for patterns with a
    minimum frequency of 3, and validates each returned pattern.

    Note: the per-pattern checks run inside a loop, so an empty result
    passes without any assertion being evaluated.

    Returns:
        True when every assertion holds.
    """
    print("Testing error pattern analysis...")

    system = FeedbackSystem(storage_path=".verification_data/test_feedback")

    # Four near-identical errors: enough to clear the frequency threshold.
    for index in range(4):
        system.record_classification_error(
            error_type=ErrorType.WRONG_CLASSIFICATION,
            subcategory=ErrorSubcategory.GREEN_TO_YELLOW,
            expected_category="YELLOW",
            actual_category="GREEN",
            message_content=f"Test message {index} about stress",
            reviewer_comments=f"Test comment {index}",
            confidence_level=0.8 + (index * 0.05),
            session_id=f"pattern_test_{index}",
        )

    detected = system.analyze_error_patterns(min_frequency=3)

    print(f"βœ“ Identified {len(detected)} error patterns")

    required_attributes = ('pattern_id', 'frequency', 'suggested_improvements')
    for entry in detected:
        # Structural checks first, then the value constraints.
        for attribute in required_attributes:
            assert hasattr(entry, attribute)
        assert entry.frequency >= 3
        assert len(entry.suggested_improvements) > 0

        print(f"  - Pattern: {entry.pattern_type} (frequency: {entry.frequency})")
        # Only the first two suggestions are shown to keep the output short.
        leading_suggestions = entry.suggested_improvements[:2]
        for suggestion in leading_suggestions:
            print(f"    Suggestion: {suggestion}")

    return True


def test_feedback_summary():
    """Check that the feedback summary exposes every expected key.

    Calls ``FeedbackSystem.get_feedback_summary``, asserts that each
    required key is present, then prints the headline figures and up to
    three improvement suggestions.

    Returns:
        True when every assertion holds.
    """
    print("Testing feedback summary generation...")

    system = FeedbackSystem(storage_path=".verification_data/test_feedback")
    report = system.get_feedback_summary()

    # Keys the summary must contain, in the order they are checked.
    expected_keys = (
        'total_errors', 'total_question_issues', 'total_referral_problems',
        'error_types', 'error_subcategories', 'question_issue_types',
        'referral_problem_types', 'average_confidence', 'recent_errors',
        'improvement_suggestions',
    )
    for field in expected_keys:
        assert field in report, f"Summary missing required field: {field}"

    print("βœ“ Summary contains all required fields")

    # Plain counters share one output format; the average needs rounding.
    counter_lines = (
        ("Total errors", 'total_errors'),
        ("Total question issues", 'total_question_issues'),
        ("Total referral problems", 'total_referral_problems'),
    )
    for label, key in counter_lines:
        print(f"  - {label}: {report[key]}")
    print(f"  - Average confidence: {report['average_confidence']:.2f}")
    print(f"  - Recent errors: {report['recent_errors']}")

    print("  - Top improvement suggestions:")
    top_suggestions = report['improvement_suggestions'][:3]
    for rank, suggestion in enumerate(top_suggestions, start=1):
        print(f"    {rank}. {suggestion}")

    return True


def test_data_model_serialization():
    """Round-trip a ``ClassificationError`` through ``to_dict``/``from_dict``.

    Builds one error instance, converts it to a dict, checks selected
    entries of that dict, rebuilds an instance from it and compares the ID,
    error type and confidence level with the original.

    Returns:
        True when every assertion holds.
    """
    print("Testing data model serialization...")

    from config.prompt_management.data_models import ClassificationError
    from datetime import datetime

    original = ClassificationError(
        error_id="test_error_123",
        error_type=ErrorType.SEVERITY_MISJUDGMENT,
        subcategory=ErrorSubcategory.UNDERESTIMATED_DISTRESS,
        expected_category="RED",
        actual_category="YELLOW",
        message_content="I don't think I can go on like this anymore",
        reviewer_comments="Clear indication of severe distress, should be RED not YELLOW",
        confidence_level=0.95,
        timestamp=datetime.now(),
        session_id="serialization_test",
        additional_context={"test": True},
    )

    # Forward direction: object -> plain dict.
    as_dict = original.to_dict()
    assert isinstance(as_dict, dict)
    assert as_dict['error_id'] == "test_error_123"
    assert as_dict['error_type'] == 'severity_misjudgment'

    # Reverse direction: dict -> object, compared attribute by attribute.
    rebuilt = ClassificationError.from_dict(as_dict)
    for attribute in ('error_id', 'error_type', 'confidence_level'):
        assert getattr(rebuilt, attribute) == getattr(original, attribute)

    print("βœ“ Data model serialization works correctly")
    return True


def main():
    """Run all feedback system tests and print a pass/fail report.

    Each test function is called in turn. A truthy return value counts as a
    pass; a falsy return value or any raised exception counts as a failure.
    For a raised exception the exception type, its message and the
    traceback are reported, because most assertions in this file carry no
    message and would otherwise produce an empty failure line.

    Returns:
        True if no test failed, False otherwise.
    """
    # Local import: only needed here, for reporting failures.
    import traceback

    print("=" * 60)
    print("STRUCTURED FEEDBACK SYSTEM TESTS")
    print("=" * 60)

    tests = [
        test_classification_error_recording,
        test_question_issue_recording,
        test_referral_problem_recording,
        test_error_pattern_analysis,
        test_feedback_summary,
        test_data_model_serialization,
    ]

    passed = 0
    failed = 0

    for test in tests:
        print(f"\n{test.__name__.replace('_', ' ').title()}:")
        print("-" * 40)

        # Keep the try body minimal: only the test call itself may raise.
        try:
            result = test()
        except Exception as e:
            # Top-level boundary: one failing test must not stop the run.
            failed += 1
            print(f"βœ— FAILED: {type(e).__name__}: {e}")
            # The traceback (written to stderr) pinpoints the failing line.
            traceback.print_exc()
            continue

        if result:
            passed += 1
            print("βœ“ PASSED")
        else:
            failed += 1
            print("βœ— FAILED")

    print("\n" + "=" * 60)
    print(f"RESULTS: {passed} passed, {failed} failed")
    print("=" * 60)

    if failed == 0:
        print("πŸŽ‰ All feedback system tests passed!")
        print("\n**Feature: prompt-optimization, Property 3: Structured Feedback Data Capture**")
        print("βœ“ VALIDATED: Requirements 3.1, 3.2, 3.3, 3.4, 3.5")
        return True
    else:
        print("❌ Some tests failed. Please check the implementation.")
        return False


if __name__ == "__main__":
    # Map the overall result onto a process exit status: 0 on success, 1 otherwise.
    exit_status = 0 if main() else 1
    sys.exit(exit_status)