#!/usr/bin/env python3
"""
Comprehensive test script for Task 4: Build Structured Feedback System.
Tests all subtasks: 4.1, 4.2, 4.3, and 4.4.
"""

import sys
import os
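# Make the project's src/ package importable; this assumes the repository root
# (containing src/) sits two directories above this script's directory.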
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'src'))

from config.prompt_management.feedback_system import FeedbackSystem
from config.prompt_management.pattern_recognizer import PatternRecognizer
from interface.feedback_ui_integration import FeedbackUIIntegration
from config.prompt_management.data_models import (
    ErrorType, ErrorSubcategory, QuestionIssueType, ReferralProblemType, ScenarioType,
    ClassificationError, QuestionIssue, ReferralProblem
)


def test_task_4_1_property_based_feedback_capture():
    """Test Task 4.1: Property-based test for structured feedback capture."""
    print("Testing Task 4.1: Property-based structured feedback capture...")
    
    feedback_system = FeedbackSystem(storage_path=".verification_data/task4_test")
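    # Feedback is assumed to be persisted under the given scratch directory, so
    # repeated test runs do not touch real feedback data.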
    
    # Test structured data capture for all feedback types
    test_cases = [
        # Classification errors
        {
            'type': 'classification_error',
            'data': {
                'error_type': ErrorType.WRONG_CLASSIFICATION,
                'subcategory': ErrorSubcategory.GREEN_TO_YELLOW,
                'expected_category': 'YELLOW',
                'actual_category': 'GREEN',
                'message_content': 'I feel overwhelmed and stressed about everything',
                'reviewer_comments': 'Clear distress indicators were missed by the system',
                'confidence_level': 0.9
            }
        },
        # Question issues
        {
            'type': 'question_issue',
            'data': {
                'issue_type': QuestionIssueType.INAPPROPRIATE_QUESTION,
                'question_content': 'What is wrong with you?',
                'scenario_type': ScenarioType.VAGUE_STRESS,
                'reviewer_comments': 'Question is too direct and potentially offensive',
                'severity': 'high'
            }
        },
        # Referral problems
        {
            'type': 'referral_problem',
            'data': {
                'problem_type': ReferralProblemType.INCOMPLETE_SUMMARY,
                'referral_content': 'Patient needs spiritual care.',
                'reviewer_comments': 'Summary lacks specific distress indicators and context',
                'severity': 'medium',
                'missing_fields': ['distress_indicators', 'conversation_context']
            }
        }
    ]
    
    recorded_ids = []
    
    # Record all test feedback
    for case in test_cases:
        if case['type'] == 'classification_error':
            error_id = feedback_system.record_classification_error(**case['data'])
            recorded_ids.append(error_id)
        elif case['type'] == 'question_issue':
            issue_id = feedback_system.record_question_issue(**case['data'])
            recorded_ids.append(issue_id)
        elif case['type'] == 'referral_problem':
            problem_id = feedback_system.record_referral_problem(**case['data'])
            recorded_ids.append(problem_id)
    
    # Verify all feedback was captured
    summary = feedback_system.get_feedback_summary()
    
    assert summary['total_errors'] >= 1, "Classification error should be recorded"
    assert summary['total_question_issues'] >= 1, "Question issue should be recorded"
    assert summary['total_referral_problems'] >= 1, "Referral problem should be recorded"
    assert len(recorded_ids) == 3, "All feedback should return IDs"
    
    # Verify structured data fields are present
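    # _load_errors() is an internal accessor; it is used here only to inspect
    # the raw persisted records.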
    errors = feedback_system._load_errors()
    if errors:
        latest_error = errors[-1]
        required_fields = ['error_id', 'error_type', 'subcategory', 'expected_category', 
                          'actual_category', 'message_content', 'reviewer_comments', 
                          'confidence_level', 'timestamp']
        for field in required_fields:
            assert field in latest_error, f"Required field {field} missing"
    
    print("βœ“ Task 4.1: Property-based structured feedback capture works correctly")
    return True


def test_task_4_2_classification_error_data_model():
    """Test Task 4.2: ClassificationError data model implementation."""
    print("Testing Task 4.2: ClassificationError data model...")
    
    from datetime import datetime
    
    # Test ClassificationError creation
    error = ClassificationError(
        error_id="test_error_123",
        error_type=ErrorType.SEVERITY_MISJUDGMENT,
        subcategory=ErrorSubcategory.UNDERESTIMATED_DISTRESS,
        expected_category="RED",
        actual_category="YELLOW",
        message_content="I don't think I can go on like this anymore",
        reviewer_comments="Clear indication of severe distress, should be RED not YELLOW",
        confidence_level=0.95,
        timestamp=datetime.now(),
        session_id="test_session_456",
        additional_context={"reviewer_id": "reviewer_789"}
    )
    
    # Test serialization
    error_dict = error.to_dict()
    assert error_dict['error_id'] == "test_error_123"
    assert error_dict['error_type'] == 'severity_misjudgment'
    assert error_dict['subcategory'] == 'underestimated_distress'
    assert error_dict['confidence_level'] == 0.95
    
    # Test deserialization
    reconstructed_error = ClassificationError.from_dict(error_dict)
    assert reconstructed_error.error_id == error.error_id
    assert reconstructed_error.error_type == error.error_type
    assert reconstructed_error.subcategory == error.subcategory
    assert reconstructed_error.confidence_level == error.confidence_level
    
    # Verify every error type and subcategory enum member carries a string value
    # (so they serialize cleanly)
    for error_type in ErrorType:
        assert isinstance(error_type.value, str), f"Error type {error_type} should have string value"
    
    for subcategory in ErrorSubcategory:
        assert isinstance(subcategory.value, str), f"Subcategory {subcategory} should have string value"
    
    print("βœ“ Task 4.2: ClassificationError data model works correctly")
    return True


def test_task_4_3_feedback_ui_integration():
    """Test Task 4.3: Feedback UI integration."""
    print("Testing Task 4.3: Feedback UI integration...")
    
    # Test UI integration initialization
    ui_integration = FeedbackUIIntegration()
    
    # Verify error type options are complete
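    # Options are assumed to be (label, value) pairs; only the machine-readable
    # values are compared below.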
    expected_error_types = [
        'wrong_classification', 'severity_misjudgment', 'missed_indicators',
        'false_positive', 'context_misunderstanding', 'language_interpretation'
    ]
    
    actual_error_types = [value for _, value in ui_integration.error_type_options]
    for expected_type in expected_error_types:
        assert expected_type in actual_error_types, f"Missing error type: {expected_type}"
    
    # Verify subcategory mappings are complete
    for error_type_label, error_type_value in ui_integration.error_type_options:
        assert error_type_value in ui_integration.subcategory_mapping, \
            f"Missing subcategory mapping for: {error_type_value}"
        
        subcategories = ui_integration.subcategory_mapping[error_type_value]
        assert len(subcategories) > 0, f"No subcategories for: {error_type_value}"
    
    # Verify question issue options
    expected_question_types = [
        'inappropriate_question', 'insensitive_language', 'wrong_scenario_targeting',
        'unclear_question', 'leading_question'
    ]
    
    actual_question_types = [value for _, value in ui_integration.question_issue_options]
    for expected_type in expected_question_types:
        assert expected_type in actual_question_types, f"Missing question issue type: {expected_type}"
    
    # Verify scenario options
    expected_scenarios = [
        'loss_of_interest', 'loss_of_loved_one', 'no_support',
        'vague_stress', 'sleep_issues', 'spiritual_practice_change'
    ]
    
    actual_scenarios = [value for _, value in ui_integration.scenario_options]
    for expected_scenario in expected_scenarios:
        assert expected_scenario in actual_scenarios, f"Missing scenario: {expected_scenario}"
    
    # Test UI component methods exist
    assert hasattr(ui_integration, 'create_classification_error_interface')
    assert hasattr(ui_integration, 'create_question_issue_interface')
    assert hasattr(ui_integration, 'create_pattern_analysis_display')
    assert hasattr(ui_integration, 'create_complete_feedback_interface')
    
    print("βœ“ Task 4.3: Feedback UI integration works correctly")
    return True


def test_task_4_4_error_pattern_analysis():
    """Test Task 4.4: Error pattern analysis implementation."""
    print("Testing Task 4.4: Error pattern analysis...")
    
    # Test PatternRecognizer initialization
    recognizer = PatternRecognizer(min_pattern_frequency=2, confidence_threshold=0.7)
    assert recognizer.min_pattern_frequency == 2
    assert recognizer.confidence_threshold == 0.7
    
    # Create test feedback system with pattern recognizer
    feedback_system = FeedbackSystem(storage_path=".verification_data/task4_pattern_test")
    
    # Record multiple similar errors to create patterns
    for i in range(4):
        feedback_system.record_classification_error(
            error_type=ErrorType.WRONG_CLASSIFICATION,
            subcategory=ErrorSubcategory.GREEN_TO_YELLOW,
            expected_category="YELLOW",
            actual_category="GREEN",
            message_content=f"I feel stressed and overwhelmed about work situation {i}",
            reviewer_comments=f"Clear distress indicators were missed {i}",
            confidence_level=0.85 + (i * 0.02),
            session_id=f"pattern_test_session_{i}",
            additional_context={"scenario_type": "vague_stress"}
        )
    
    # Record question issues
    for i in range(3):
        feedback_system.record_question_issue(
            issue_type=QuestionIssueType.INAPPROPRIATE_QUESTION,
            question_content=f"What's wrong with you? {i}",
            scenario_type=ScenarioType.VAGUE_STRESS,
            reviewer_comments=f"Too direct and potentially offensive {i}",
            severity="high",
            session_id=f"pattern_test_session_{i}"
        )
    
    # Analyze patterns
    patterns = feedback_system.analyze_error_patterns(min_frequency=2)
    
    # Verify patterns were identified
    assert len(patterns) > 0, "Should identify error patterns"
    
    # Check for wrong classification pattern
    wrong_classification_patterns = [p for p in patterns if 'wrong_classification' in p.pattern_type]
    assert len(wrong_classification_patterns) > 0, "Should identify wrong classification pattern"
    
    # Verify pattern structure
    for pattern in patterns:
        assert hasattr(pattern, 'pattern_id'), "Pattern should have ID"
        assert hasattr(pattern, 'frequency'), "Pattern should have frequency"
        assert hasattr(pattern, 'suggested_improvements'), "Pattern should have suggestions"
        assert pattern.frequency >= 2, "Pattern frequency should meet minimum"
        assert len(pattern.suggested_improvements) > 0, "Pattern should have improvement suggestions"
    
    # Test optimization report generation
    report = feedback_system.generate_optimization_report()
    
    # Verify report structure
    required_fields = [
        'summary', 'total_patterns', 'recommendations', 'priority_actions',
        'confidence_score', 'most_frequent_pattern', 'affected_scenarios',
        'report_generated'
    ]
    
    for field in required_fields:
        assert field in report, f"Report missing required field: {field}"
    
    assert report['total_patterns'] > 0, "Report should show patterns"
    assert len(report['recommendations']) > 0, "Report should have recommendations"
    assert 0.0 <= report['confidence_score'] <= 1.0, "Confidence score should be valid"
    
    print("βœ“ Task 4.4: Error pattern analysis works correctly")
    return True


def test_end_to_end_feedback_workflow():
    """Test complete end-to-end feedback workflow."""
    print("Testing end-to-end feedback workflow...")
    
    # Create fresh feedback system
    feedback_system = FeedbackSystem(storage_path=".verification_data/task4_e2e_test")
    
    # Create UI integration
    ui_integration = FeedbackUIIntegration(feedback_system=feedback_system)
    
    # Simulate complete feedback workflow
    
    # 1. Record various types of feedback
    error_id = feedback_system.record_classification_error(
        error_type=ErrorType.CONTEXT_MISUNDERSTANDING,
        subcategory=ErrorSubcategory.IGNORED_HISTORY,
        expected_category="RED",
        actual_category="GREEN",
        message_content="I mentioned earlier that I've been having thoughts of ending it all",
        reviewer_comments="System ignored previous context about suicidal ideation",
        confidence_level=0.95,
        session_id="e2e_session_1",
        additional_context={"conversation_turn": 3, "previous_classification": "YELLOW"}
    )
    
    issue_id = feedback_system.record_question_issue(
        issue_type=QuestionIssueType.INSENSITIVE_LANGUAGE,
        question_content="Why don't you just try to be more positive?",
        scenario_type=ScenarioType.LOSS_OF_LOVED_ONE,
        reviewer_comments="Dismissive language inappropriate for grief scenario",
        severity="high",
        session_id="e2e_session_2",
        suggested_improvement="Ask: 'How are you processing this loss?'"
    )
    
    problem_id = feedback_system.record_referral_problem(
        problem_type=ReferralProblemType.MISSING_CONTACT_INFO,
        referral_content="Patient experiencing severe spiritual distress and needs immediate support.",
        reviewer_comments="No contact information or urgency level specified",
        severity="high",
        session_id="e2e_session_3",
        missing_fields=["contact_phone", "urgency_level", "preferred_contact_time"]
    )
    
    # 2. Verify all feedback was recorded
    summary = feedback_system.get_feedback_summary()
    assert summary['total_errors'] >= 1
    assert summary['total_question_issues'] >= 1
    assert summary['total_referral_problems'] >= 1
    
    # 3. Analyze patterns; with only single instances no pattern may reach the
    #    frequency threshold, so the result is not asserted
    patterns = feedback_system.analyze_error_patterns(min_frequency=1)
    
    # 4. Generate optimization report
    report = feedback_system.generate_optimization_report()
    assert 'summary' in report
    assert 'recommendations' in report
    
    # 5. Verify UI integration can access the data
    assert ui_integration.feedback_system == feedback_system
    
    print("βœ“ End-to-end feedback workflow works correctly")
    return True


def main():
    """Run all Task 4 tests."""
    print("=" * 70)
    print("TASK 4: BUILD STRUCTURED FEEDBACK SYSTEM - COMPREHENSIVE TESTS")
    print("=" * 70)
    
    tests = [
        test_task_4_1_property_based_feedback_capture,
        test_task_4_2_classification_error_data_model,
        test_task_4_3_feedback_ui_integration,
        test_task_4_4_error_pattern_analysis,
        test_end_to_end_feedback_workflow
    ]
    
    passed = 0
    failed = 0
    
    for test in tests:
        try:
            print(f"\n{test.__name__.replace('_', ' ').title()}:")
            print("-" * 50)
            
            result = test()
            if result:
                passed += 1
                print("βœ“ PASSED")
            else:
                failed += 1
                print("βœ— FAILED")
                
        except Exception as e:
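            # Any assertion failure or unexpected exception marks this test as
            # failed; the remaining tests still run.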
            failed += 1
            print(f"βœ— FAILED: {str(e)}")
    
    print("\n" + "=" * 70)
    print(f"RESULTS: {passed} passed, {failed} failed")
    print("=" * 70)
    
    if failed == 0:
        print("πŸŽ‰ ALL TASK 4 TESTS PASSED!")
        print("\n**TASK 4: BUILD STRUCTURED FEEDBACK SYSTEM**")
        print("βœ“ COMPLETED: Task 4.1 - Property test for structured feedback capture")
        print("βœ“ COMPLETED: Task 4.2 - ClassificationError data model")
        print("βœ“ COMPLETED: Task 4.3 - Feedback UI integration")
        print("βœ“ COMPLETED: Task 4.4 - Error pattern analysis")
        print("\n**Requirements Validated:**")
        print("βœ“ 3.1 - Predefined error categories from documentation")
        print("βœ“ 3.2 - Specific subcategories of wrong classification types")
        print("βœ“ 3.3 - Structured feedback about question quality")
        print("βœ“ 3.4 - Pattern analysis and improvement suggestions")
        print("βœ“ 3.5 - Feedback aggregation and reporting")
        return True
    else:
        print("❌ Some tests failed. Please check the implementation.")
        return False


if __name__ == "__main__":
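    # When run directly, the exit code mirrors the overall result (0 = all tests passed).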
    success = main()
    sys.exit(0 if success else 1)