Spaces:
Sleeping
Sleeping
# test_properties_interaction_logging.py
"""
Property-based tests for Chaplain Feedback interaction logging.

Tests that interaction logging correctly records all steps with input/output
and supports approval status updates.
"""
| import pytest | |
| from hypothesis import given, settings | |
| from datetime import datetime | |
| from src.core.interaction_logger import InteractionLogger | |
| from src.core.chaplain_models import ( | |
| InteractionStepLog, | |
| TaggingRecord, | |
| INTERACTION_STEP_TYPES, | |
| ) | |
| from tests.chaplain_feedback.conftest import ( | |
| valid_id_strategy, | |
| tagging_record_strategy, | |
| ) | |
class TestInteractionLoggingCompleteness:
    """
    **Feature: chaplain-feedback-system, Property 14: Interaction Step Logging Complete**

    Tests that interaction logging records all required fields for each step.

    Each test constructs its own ``InteractionLogger()`` and exercises it only
    through ``log_step``, ``get_step``, ``get_session_logs``,
    ``get_session_logs_by_type``, ``get_message_logs``,
    ``get_session_statistics`` and ``export_session_logs``.
    """

    @staticmethod
    def _assert_text_round_trip(text):
        """
        Log one classification step using *text* as both input and output.

        Asserts that ``get_step`` returns a step carrying exactly that text,
        then returns the stored step so the caller can add further checks.
        """
        logger = InteractionLogger()
        step_id = logger.log_step(
            session_id="test_session",
            message_id="test_msg",
            step_type="classification",
            input_text=text,
            model_output=text,
        )
        logged_step = logger.get_step(step_id)
        # Fail with a readable assertion instead of an AttributeError below.
        assert logged_step is not None
        assert logged_step.input_text == text
        assert logged_step.model_output == text
        return logged_step

    def test_interaction_step_logging_complete_all_types(self):
        """
        **Feature: chaplain-feedback-system, Property 14: Interaction Step Logging Complete**
        **Validates: Requirements 7.1, 7.2**

        For any interaction step, the log should contain: input text, model output, and timestamp.
        """
        # Guard against a vacuous pass: with no step types the loop below
        # would not execute a single assertion.
        assert INTERACTION_STEP_TYPES, "INTERACTION_STEP_TYPES must not be empty"

        logger = InteractionLogger()
        for step_type in INTERACTION_STEP_TYPES:
            # Keyword arguments passed to log_step; the same names are then
            # read back as attributes of the stored step.
            expected = {
                "session_id": f"session_{step_type}",
                "message_id": f"msg_{step_type}",
                "step_type": step_type,
                "input_text": f"input for {step_type}",
                "model_output": f"output for {step_type}",
            }
            step_id = logger.log_step(**expected)

            logged_step = logger.get_step(step_id)
            assert logged_step is not None
            assert logged_step.step_id == step_id
            for field_name, value in expected.items():
                assert getattr(logged_step, field_name) == value
            assert logged_step.timestamp is not None
            assert isinstance(logged_step.timestamp, datetime)
            # A freshly logged step carries no feedback yet.
            assert logged_step.approval_status is None
            assert logged_step.tagging_data is None

    def test_interaction_step_logging_multiple_steps(self):
        """
        Test that multiple steps are logged correctly for a session.
        """
        logger = InteractionLogger()
        session_id = "test_session_1"
        step_ids = [
            logger.log_step(
                session_id=session_id,
                message_id="test_message_1",
                step_type="classification",
                input_text=f"input {i}",
                model_output=f"output {i}",
            )
            for i in range(3)
        ]

        session_logs = logger.get_session_logs(session_id)

        assert len(session_logs) == 3
        # The ids returned by log_step must be the ids of the stored steps.
        assert [log.step_id for log in session_logs] == step_ids
        for i, log in enumerate(session_logs):
            assert log.input_text == f"input {i}"
            assert log.model_output == f"output {i}"

    def test_interaction_step_logging_preserves_order(self):
        """
        Test that logged steps are retrieved in the order they were logged.
        """
        logger = InteractionLogger()
        session_id = "test_session_order"
        step_ids = [
            logger.log_step(
                session_id=session_id,
                message_id=f"msg_{i}",
                step_type="classification",
                input_text=f"input_{i}",
                model_output=f"output_{i}",
            )
            for i in range(5)
        ]

        session_logs = logger.get_session_logs(session_id)

        assert len(session_logs) == 5
        # Retrieval order must match logging order, id for id.
        assert [log.step_id for log in session_logs] == step_ids
        for i, log in enumerate(session_logs):
            assert log.message_id == f"msg_{i}"
            assert log.input_text == f"input_{i}"

    def test_interaction_step_logging_by_type(self):
        """
        Test filtering logs by step type.
        """
        logger = InteractionLogger()
        session_id = "test_session_types"
        steps = [
            ("msg1", "classification", "input1", "output1"),
            ("msg2", "explanation", "input2", "output2"),
            ("msg3", "classification", "input3", "output3"),
            ("msg4", "referral", "input4", "output4"),
        ]
        for message_id, step_type, input_text, model_output in steps:
            logger.log_step(session_id, message_id, step_type, input_text, model_output)

        classification_logs = logger.get_session_logs_by_type(session_id, "classification")
        explanation_logs = logger.get_session_logs_by_type(session_id, "explanation")
        referral_logs = logger.get_session_logs_by_type(session_id, "referral")

        assert len(classification_logs) == 2
        assert len(explanation_logs) == 1
        assert len(referral_logs) == 1

    def test_interaction_step_logging_message_logs(self):
        """
        Test retrieving logs for a specific message across sessions.
        """
        logger = InteractionLogger()
        message_id = "shared_message"
        # Same message id in two sessions, plus one unrelated message.
        logger.log_step("session1", message_id, "classification", "input1", "output1")
        logger.log_step("session2", message_id, "explanation", "input2", "output2")
        logger.log_step("session1", "other_msg", "referral", "input3", "output3")

        message_logs = logger.get_message_logs(message_id)

        assert len(message_logs) == 2
        assert all(log.message_id == message_id for log in message_logs)

    def test_interaction_step_logging_empty_strings(self):
        """
        Test that empty input/output strings are logged correctly.
        """
        self._assert_text_round_trip("")

    def test_interaction_step_logging_long_text(self):
        """
        Test that long input/output text is logged correctly.
        """
        logged_step = self._assert_text_round_trip("x" * 10000)
        assert len(logged_step.input_text) == 10000

    def test_interaction_step_logging_special_characters(self):
        """
        Test that special characters in input/output are preserved.
        """
        self._assert_text_round_trip(
            "Test with special chars: !@#$%^&*()_+-=[]{}|;:',.<>?/~`"
        )

    def test_interaction_step_logging_unicode(self):
        """
        Test that Unicode characters in input/output are preserved.
        """
        self._assert_text_round_trip("Test with Unicode: 你好世界 🌍 Привет мир")

    def test_interaction_step_logging_statistics(self):
        """
        Test that session statistics are calculated correctly.
        """
        logger = InteractionLogger()
        session_id = "test_session_stats"
        logger.log_step(session_id, "msg1", "classification", "input1", "output1")
        logger.log_step(session_id, "msg2", "explanation", "input2", "output2")
        logger.log_step(session_id, "msg3", "referral", "input3", "output3")

        stats = logger.get_session_statistics(session_id)

        assert stats["session_id"] == session_id
        assert stats["total_steps"] == 3
        # No feedback was recorded, so every step counts as unapproved.
        assert stats["approved_steps"] == 0
        assert stats["disapproved_steps"] == 0
        assert stats["unapproved_steps"] == 3
        for step_type in ("classification", "explanation", "referral"):
            assert stats["steps_by_type"][step_type] == 1

    def test_interaction_step_logging_invalid_step_type(self):
        """
        Test that invalid step types raise an error.
        """
        logger = InteractionLogger()
        with pytest.raises(ValueError):
            logger.log_step(
                session_id="test_session",
                message_id="test_msg",
                step_type="invalid_type",
                input_text="input",
                model_output="output",
            )

    def test_interaction_step_logging_nonexistent_step(self):
        """
        Test that retrieving a nonexistent step returns None.
        """
        logger = InteractionLogger()
        assert logger.get_step("nonexistent_step_id") is None

    def test_interaction_step_logging_empty_session(self):
        """
        Test that retrieving logs for an empty session returns empty list.
        """
        logger = InteractionLogger()
        assert logger.get_session_logs("nonexistent_session") == []

    def test_interaction_step_logging_export(self):
        """
        Test that session logs can be exported as dictionaries.
        """
        logger = InteractionLogger()
        session_id = "test_session_export"
        logger.log_step(session_id, "msg1", "classification", "input1", "output1")
        logger.log_step(session_id, "msg2", "explanation", "input2", "output2")

        exported = logger.export_session_logs(session_id)

        assert len(exported) == 2
        assert all(isinstance(log, dict) for log in exported)
        for key in ("step_id", "input_text", "model_output", "timestamp"):
            assert all(key in log for log in exported)
class TestFeedbackLogging:
    """
    **Feature: chaplain-feedback-system, Property 15: Feedback Logging Complete**

    Tests that feedback logging correctly records approval/disapproval status
    with tagging categories and comments.

    Each test constructs its own ``InteractionLogger()``, logs one or more
    steps, records feedback through ``update_approval`` and reads the result
    back through the logger's query methods.
    """

    @staticmethod
    def _log_step(logger, session_id, step_type="classification", message_id="msg1"):
        """Log a single step with fixed "input"/"output" text and return its step id."""
        return logger.log_step(
            session_id=session_id,
            message_id=message_id,
            step_type=step_type,
            input_text="input",
            model_output="output",
        )

    @staticmethod
    def _log_three_steps(logger, session_id):
        """Log a classification, an explanation and a referral step; return their ids in that order."""
        return [
            logger.log_step(session_id, "msg1", "classification", "input1", "output1"),
            logger.log_step(session_id, "msg2", "explanation", "input2", "output2"),
            logger.log_step(session_id, "msg3", "referral", "input3", "output3"),
        ]

    def _tagging_after_feedback(self, session_id, step_type, status, tagging):
        """
        Log one step, attach *tagging* with the given approval *status*, and
        return the ``tagging_data`` stored on the step as read via ``get_step``.
        """
        logger = InteractionLogger()
        step_id = self._log_step(logger, session_id, step_type=step_type)
        logger.update_approval(step_id, status, tagging)
        return logger.get_step(step_id).tagging_data

    def test_feedback_logging_approved_status(self):
        """
        **Feature: chaplain-feedback-system, Property 15: Feedback Logging Complete**
        **Validates: Requirements 7.3, 7.4**

        For any feedback, the log should record approval status.
        """
        logger = InteractionLogger()
        step_id = self._log_step(logger, "test_session_feedback")

        logger.update_approval(step_id, "approved")

        logged_step = logger.get_step(step_id)
        assert logged_step.approval_status == "approved"
        # No tagging record was supplied, so none should be stored.
        assert logged_step.tagging_data is None

    def test_feedback_logging_disapproved_status(self):
        """
        Test that disapproved status is recorded correctly.
        """
        logger = InteractionLogger()
        step_id = self._log_step(logger, "test_session_feedback")

        logger.update_approval(step_id, "disapproved")

        assert logger.get_step(step_id).approval_status == "disapproved"

    # NOTE(review): `tagging_record` has to be injected by either a pytest
    # fixture of that name or a Hypothesis `@given(...)` decorator. Neither is
    # visible in this file, and the imported `given`, `settings` and
    # `tagging_record_strategy` are not used in the visible code — confirm that
    # conftest provides the argument, otherwise this test errors at setup.
    def test_feedback_logging_with_tagging_data(self, tagging_record):
        """
        **Feature: chaplain-feedback-system, Property 15: Feedback Logging Complete**
        **Validates: Requirements 7.3, 7.4**

        For any chaplain feedback, the log should contain: approval/disapproval status,
        and if disapproved, the tagging categories and comments.
        """
        logger = InteractionLogger()
        step_id = self._log_step(
            logger,
            "test_session_tagging",
            message_id=tagging_record.message_id,
        )

        logger.update_approval(step_id, "disapproved", tagging_record)

        logged_step = logger.get_step(step_id)
        assert logged_step.approval_status == "disapproved"
        stored = logged_step.tagging_data
        assert stored is not None
        compared_fields = (
            "record_id",
            "message_id",
            "is_classification_correct",
            "question_issues",
            "referral_issues",
        )
        for field_name in compared_fields:
            assert getattr(stored, field_name) == getattr(tagging_record, field_name)

    def test_feedback_logging_classification_subcategory(self):
        """
        Test that classification subcategory is recorded in tagging data.
        """
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=False,
            classification_subcategory="missed_indicators",
            correct_classification="red",
        )

        stored = self._tagging_after_feedback(
            "test_session_classification", "classification", "disapproved", tagging
        )

        assert stored.classification_subcategory == "missed_indicators"
        assert stored.correct_classification == "red"

    def test_feedback_logging_question_issues(self):
        """
        Test that question issues are recorded in tagging data.
        """
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=True,
            question_issues=["inappropriate", "too_leading"],
            question_comments="Questions were too intrusive",
        )

        stored = self._tagging_after_feedback(
            "test_session_questions", "follow_up", "disapproved", tagging
        )

        assert stored.question_issues == ["inappropriate", "too_leading"]
        assert stored.question_comments == "Questions were too intrusive"

    def test_feedback_logging_referral_issues(self):
        """
        Test that referral issues are recorded in tagging data.
        """
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=True,
            referral_issues=["incomplete_summary", "inappropriate_tone"],
            referral_comments="Message was incomplete",
        )

        stored = self._tagging_after_feedback(
            "test_session_referral", "referral", "disapproved", tagging
        )

        assert stored.referral_issues == ["incomplete_summary", "inappropriate_tone"]
        assert stored.referral_comments == "Message was incomplete"

    def test_feedback_logging_indicator_issues(self):
        """
        Test that indicator issues are recorded in tagging data.
        """
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=True,
            indicator_issues=["indicator_1", "indicator_2"],
            indicator_comments="These indicators were incorrectly identified",
        )

        stored = self._tagging_after_feedback(
            "test_session_indicators", "classification", "disapproved", tagging
        )

        assert stored.indicator_issues == ["indicator_1", "indicator_2"]
        assert stored.indicator_comments == "These indicators were incorrectly identified"

    def test_feedback_logging_general_notes(self):
        """
        Test that general notes are recorded in tagging data.
        """
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=True,
            general_notes="Overall good classification but needs improvement in tone",
        )

        # Tagging is attached here together with an "approved" status.
        stored = self._tagging_after_feedback(
            "test_session_notes", "classification", "approved", tagging
        )

        assert stored.general_notes == "Overall good classification but needs improvement in tone"

    def test_feedback_logging_disapproved_steps_retrieval(self):
        """
        Test that disapproved steps can be retrieved from a session.
        """
        logger = InteractionLogger()
        session_id = "test_session_disapproved"
        first_id, second_id, third_id = self._log_three_steps(logger, session_id)

        logger.update_approval(first_id, "approved")
        logger.update_approval(second_id, "disapproved")
        logger.update_approval(third_id, "disapproved")

        disapproved = logger.get_disapproved_steps(session_id)

        assert len(disapproved) == 2
        assert all(log.approval_status == "disapproved" for log in disapproved)
        # Exactly the two steps that were disapproved must come back.
        assert {log.step_id for log in disapproved} == {second_id, third_id}

    def test_feedback_logging_unapproved_steps_retrieval(self):
        """
        Test that unapproved steps can be retrieved from a session.
        """
        logger = InteractionLogger()
        session_id = "test_session_unapproved"
        first_id, second_id, third_id = self._log_three_steps(logger, session_id)

        # Only the first step receives feedback; the other two stay pending.
        logger.update_approval(first_id, "approved")

        unapproved = logger.get_unapproved_steps(session_id)

        assert len(unapproved) == 2
        assert all(log.approval_status is None for log in unapproved)
        # Exactly the two steps without feedback must come back.
        assert {log.step_id for log in unapproved} == {second_id, third_id}

    def test_feedback_logging_invalid_approval_status(self):
        """
        Test that invalid approval status raises an error.
        """
        logger = InteractionLogger()
        step_id = self._log_step(logger, "test_session_invalid")

        with pytest.raises(ValueError):
            logger.update_approval(step_id, "invalid_status")

    def test_feedback_logging_nonexistent_step(self):
        """
        Test that updating a nonexistent step raises an error.
        """
        logger = InteractionLogger()
        with pytest.raises(ValueError):
            logger.update_approval("nonexistent_step", "approved")

    def test_feedback_logging_export_with_tagging(self):
        """
        Test that exported logs include tagging data.
        """
        logger = InteractionLogger()
        session_id = "test_session_export_tagging"
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=False,
            classification_subcategory="missed_indicators",
            correct_classification="red",
            general_notes="Missed key indicators",
        )
        step_id = self._log_step(logger, session_id)
        logger.update_approval(step_id, "disapproved", tagging)

        exported = logger.export_session_logs(session_id)

        assert len(exported) == 1
        entry = exported[0]
        assert entry["approval_status"] == "disapproved"
        exported_tagging = entry["tagging_data"]
        assert exported_tagging is not None
        assert exported_tagging["classification_subcategory"] == "missed_indicators"
        assert exported_tagging["correct_classification"] == "red"
        assert exported_tagging["general_notes"] == "Missed key indicators"