# test_properties_interaction_logging.py
"""
Property-based tests for Chaplain Feedback interaction logging.

Tests that interaction logging correctly records all steps with input/output
and supports approval status updates.
"""

from datetime import datetime

import pytest
from hypothesis import given, settings

from src.core.interaction_logger import InteractionLogger
from src.core.chaplain_models import (
    InteractionStepLog,
    TaggingRecord,
    INTERACTION_STEP_TYPES,
)
from tests.chaplain_feedback.conftest import (
    valid_id_strategy,
    tagging_record_strategy,
)


def _require_step(logger, step_id):
    """Return the step logged under *step_id*, failing the test if it is missing.

    Turning a missing record into an assertion failure (instead of an
    ``AttributeError`` on ``None`` further down) keeps test output readable.
    """
    logged_step = logger.get_step(step_id)
    assert logged_step is not None, f"no step logged under id {step_id!r}"
    return logged_step


class TestInteractionLoggingCompleteness:
    """
    **Feature: chaplain-feedback-system, Property 14: Interaction Step Logging Complete**

    Tests that interaction logging records all required fields for each step.
    """

    def test_interaction_step_logging_complete_all_types(self):
        """
        **Feature: chaplain-feedback-system, Property 14: Interaction Step Logging Complete**
        **Validates: Requirements 7.1, 7.2**

        For any interaction step, the log should contain: input text,
        model output, and timestamp.
        """
        logger = InteractionLogger()

        # Test all step types
        for step_type in INTERACTION_STEP_TYPES:
            session_id = f"session_{step_type}"
            message_id = f"msg_{step_type}"
            input_text = f"input for {step_type}"
            model_output = f"output for {step_type}"

            # Log a step
            step_id = logger.log_step(
                session_id=session_id,
                message_id=message_id,
                step_type=step_type,
                input_text=input_text,
                model_output=model_output,
            )

            # Retrieve the logged step
            logged_step = logger.get_step(step_id)

            # Verify all required fields are present and correct
            assert logged_step is not None
            assert logged_step.step_id == step_id
            assert logged_step.session_id == session_id
            assert logged_step.message_id == message_id
            assert logged_step.step_type == step_type
            assert logged_step.input_text == input_text
            assert logged_step.model_output == model_output
            assert logged_step.timestamp is not None
            assert isinstance(logged_step.timestamp, datetime)
            assert logged_step.approval_status is None  # Initially no approval
            assert logged_step.tagging_data is None  # Initially no tagging

    def test_interaction_step_logging_multiple_steps(self):
        """
        Test that multiple steps are logged correctly for a session.
        """
        logger = InteractionLogger()
        session_id = "test_session_1"
        message_id = "test_message_1"

        # Log multiple steps, remembering the ids handed back by the logger
        step_ids = [
            logger.log_step(
                session_id=session_id,
                message_id=message_id,
                step_type="classification",
                input_text=f"input {i}",
                model_output=f"output {i}",
            )
            for i in range(3)
        ]

        # Retrieve all session logs
        session_logs = logger.get_session_logs(session_id)

        # Verify all steps are logged
        assert len(session_logs) == 3
        for i, log in enumerate(session_logs):
            assert log.input_text == f"input {i}"
            assert log.model_output == f"output {i}"

        # The ids returned at log time must be the ids stored on the records
        assert [log.step_id for log in session_logs] == step_ids

    def test_interaction_step_logging_preserves_order(self):
        """
        Test that logged steps are retrieved in the order they were logged.
        """
        logger = InteractionLogger()
        session_id = "test_session_order"

        # Log steps in order
        step_ids = [
            logger.log_step(
                session_id=session_id,
                message_id=f"msg_{i}",
                step_type="classification",
                input_text=f"input_{i}",
                model_output=f"output_{i}",
            )
            for i in range(5)
        ]

        # Retrieve logs
        session_logs = logger.get_session_logs(session_id)

        # Verify order is preserved
        assert len(session_logs) == 5
        for i, log in enumerate(session_logs):
            assert log.message_id == f"msg_{i}"
            assert log.input_text == f"input_{i}"

        # Order must also hold for the step ids themselves
        assert [log.step_id for log in session_logs] == step_ids

    def test_interaction_step_logging_by_type(self):
        """
        Test filtering logs by step type.
        """
        logger = InteractionLogger()
        session_id = "test_session_types"

        # Log different types of steps
        logger.log_step(session_id, "msg1", "classification", "input1", "output1")
        logger.log_step(session_id, "msg2", "explanation", "input2", "output2")
        logger.log_step(session_id, "msg3", "classification", "input3", "output3")
        logger.log_step(session_id, "msg4", "referral", "input4", "output4")

        # Filter by type
        classification_logs = logger.get_session_logs_by_type(session_id, "classification")
        explanation_logs = logger.get_session_logs_by_type(session_id, "explanation")
        referral_logs = logger.get_session_logs_by_type(session_id, "referral")

        # Verify filtering
        assert len(classification_logs) == 2
        assert len(explanation_logs) == 1
        assert len(referral_logs) == 1

    def test_interaction_step_logging_message_logs(self):
        """
        Test retrieving logs for a specific message across sessions.
        """
        logger = InteractionLogger()
        message_id = "shared_message"

        # Log same message in different sessions
        logger.log_step("session1", message_id, "classification", "input1", "output1")
        logger.log_step("session2", message_id, "explanation", "input2", "output2")
        logger.log_step("session1", "other_msg", "referral", "input3", "output3")

        # Get logs for the message
        message_logs = logger.get_message_logs(message_id)

        # Verify we get logs from both sessions
        assert len(message_logs) == 2
        assert all(log.message_id == message_id for log in message_logs)

    def test_interaction_step_logging_empty_strings(self):
        """
        Test that empty input/output strings are logged correctly.
        """
        logger = InteractionLogger()

        step_id = logger.log_step(
            session_id="test_session",
            message_id="test_msg",
            step_type="classification",
            input_text="",
            model_output="",
        )

        logged_step = _require_step(logger, step_id)
        assert logged_step.input_text == ""
        assert logged_step.model_output == ""

    def test_interaction_step_logging_long_text(self):
        """
        Test that long input/output text is logged correctly.
        """
        logger = InteractionLogger()
        long_text = "x" * 10000

        step_id = logger.log_step(
            session_id="test_session",
            message_id="test_msg",
            step_type="classification",
            input_text=long_text,
            model_output=long_text,
        )

        logged_step = _require_step(logger, step_id)
        assert logged_step.input_text == long_text
        assert logged_step.model_output == long_text
        assert len(logged_step.input_text) == 10000

    def test_interaction_step_logging_special_characters(self):
        """
        Test that special characters in input/output are preserved.
        """
        logger = InteractionLogger()
        special_text = "Test with special chars: !@#$%^&*()_+-=[]{}|;:',.<>?/~`"

        step_id = logger.log_step(
            session_id="test_session",
            message_id="test_msg",
            step_type="classification",
            input_text=special_text,
            model_output=special_text,
        )

        logged_step = _require_step(logger, step_id)
        assert logged_step.input_text == special_text
        assert logged_step.model_output == special_text

    def test_interaction_step_logging_unicode(self):
        """
        Test that Unicode characters in input/output are preserved.
        """
        logger = InteractionLogger()
        unicode_text = "Test with Unicode: 你好世界 🌍 Привет мир"

        step_id = logger.log_step(
            session_id="test_session",
            message_id="test_msg",
            step_type="classification",
            input_text=unicode_text,
            model_output=unicode_text,
        )

        logged_step = _require_step(logger, step_id)
        assert logged_step.input_text == unicode_text
        assert logged_step.model_output == unicode_text

    def test_interaction_step_logging_statistics(self):
        """
        Test that session statistics are calculated correctly.
        """
        logger = InteractionLogger()
        session_id = "test_session_stats"

        # Log some steps
        logger.log_step(session_id, "msg1", "classification", "input1", "output1")
        logger.log_step(session_id, "msg2", "explanation", "input2", "output2")
        logger.log_step(session_id, "msg3", "referral", "input3", "output3")

        # Get statistics
        stats = logger.get_session_statistics(session_id)

        # Verify statistics
        assert stats["session_id"] == session_id
        assert stats["total_steps"] == 3
        assert stats["approved_steps"] == 0
        assert stats["disapproved_steps"] == 0
        assert stats["unapproved_steps"] == 3
        assert stats["steps_by_type"]["classification"] == 1
        assert stats["steps_by_type"]["explanation"] == 1
        assert stats["steps_by_type"]["referral"] == 1

    def test_interaction_step_logging_invalid_step_type(self):
        """
        Test that invalid step types raise an error.
        """
        logger = InteractionLogger()

        with pytest.raises(ValueError):
            logger.log_step(
                session_id="test_session",
                message_id="test_msg",
                step_type="invalid_type",
                input_text="input",
                model_output="output",
            )

    def test_interaction_step_logging_nonexistent_step(self):
        """
        Test that retrieving a nonexistent step returns None.
        """
        logger = InteractionLogger()

        result = logger.get_step("nonexistent_step_id")
        assert result is None

    def test_interaction_step_logging_empty_session(self):
        """
        Test that retrieving logs for an empty session returns empty list.
        """
        logger = InteractionLogger()

        session_logs = logger.get_session_logs("nonexistent_session")
        assert session_logs == []

    def test_interaction_step_logging_export(self):
        """
        Test that session logs can be exported as dictionaries.
        """
        logger = InteractionLogger()
        session_id = "test_session_export"

        # Log some steps
        logger.log_step(session_id, "msg1", "classification", "input1", "output1")
        logger.log_step(session_id, "msg2", "explanation", "input2", "output2")

        # Export logs
        exported = logger.export_session_logs(session_id)

        # Verify export
        assert len(exported) == 2
        assert all(isinstance(log, dict) for log in exported)
        assert all("step_id" in log for log in exported)
        assert all("input_text" in log for log in exported)
        assert all("model_output" in log for log in exported)
        assert all("timestamp" in log for log in exported)


class TestFeedbackLogging:
    """
    **Feature: chaplain-feedback-system, Property 15: Feedback Logging Complete**

    Tests that feedback logging correctly records approval/disapproval status
    with tagging categories and comments.
    """

    def test_feedback_logging_approved_status(self):
        """
        **Feature: chaplain-feedback-system, Property 15: Feedback Logging Complete**
        **Validates: Requirements 7.3, 7.4**

        For any feedback, the log should record approval status.
        """
        logger = InteractionLogger()
        session_id = "test_session_feedback"

        # Log a step
        step_id = logger.log_step(
            session_id=session_id,
            message_id="msg1",
            step_type="classification",
            input_text="input",
            model_output="output",
        )

        # Update with approved status
        logger.update_approval(step_id, "approved")

        # Retrieve and verify
        logged_step = _require_step(logger, step_id)
        assert logged_step.approval_status == "approved"
        assert logged_step.tagging_data is None

    def test_feedback_logging_disapproved_status(self):
        """
        Test that disapproved status is recorded correctly.
        """
        logger = InteractionLogger()
        session_id = "test_session_feedback"

        # Log a step
        step_id = logger.log_step(
            session_id=session_id,
            message_id="msg1",
            step_type="classification",
            input_text="input",
            model_output="output",
        )

        # Update with disapproved status
        logger.update_approval(step_id, "disapproved")

        # Retrieve and verify
        logged_step = _require_step(logger, step_id)
        assert logged_step.approval_status == "disapproved"

    @given(tagging_record_strategy())
    @settings(max_examples=100)
    def test_feedback_logging_with_tagging_data(self, tagging_record):
        """
        **Feature: chaplain-feedback-system, Property 15: Feedback Logging Complete**
        **Validates: Requirements 7.3, 7.4**

        For any chaplain feedback, the log should contain: approval/disapproval
        status, and if disapproved, the tagging categories and comments.
        """
        # A fresh logger per generated example keeps examples independent
        logger = InteractionLogger()
        session_id = "test_session_tagging"

        # Log a step
        step_id = logger.log_step(
            session_id=session_id,
            message_id=tagging_record.message_id,
            step_type="classification",
            input_text="input",
            model_output="output",
        )

        # Update with disapproved status and tagging data
        logger.update_approval(step_id, "disapproved", tagging_record)

        # Retrieve and verify
        logged_step = _require_step(logger, step_id)
        assert logged_step.approval_status == "disapproved"
        assert logged_step.tagging_data is not None
        assert logged_step.tagging_data.record_id == tagging_record.record_id
        assert logged_step.tagging_data.message_id == tagging_record.message_id
        assert (
            logged_step.tagging_data.is_classification_correct
            == tagging_record.is_classification_correct
        )
        assert logged_step.tagging_data.question_issues == tagging_record.question_issues
        assert logged_step.tagging_data.referral_issues == tagging_record.referral_issues

    def test_feedback_logging_classification_subcategory(self):
        """
        Test that classification subcategory is recorded in tagging data.
        """
        logger = InteractionLogger()
        session_id = "test_session_classification"

        # Create tagging record with classification subcategory
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=False,
            classification_subcategory="missed_indicators",
            correct_classification="red",
        )

        # Log a step
        step_id = logger.log_step(
            session_id=session_id,
            message_id="msg1",
            step_type="classification",
            input_text="input",
            model_output="output",
        )

        # Update with tagging
        logger.update_approval(step_id, "disapproved", tagging)

        # Retrieve and verify
        logged_step = _require_step(logger, step_id)
        assert logged_step.tagging_data is not None
        assert logged_step.tagging_data.classification_subcategory == "missed_indicators"
        assert logged_step.tagging_data.correct_classification == "red"

    def test_feedback_logging_question_issues(self):
        """
        Test that question issues are recorded in tagging data.
        """
        logger = InteractionLogger()
        session_id = "test_session_questions"

        # Create tagging record with question issues
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=True,
            question_issues=["inappropriate", "too_leading"],
            question_comments="Questions were too intrusive",
        )

        # Log a step
        step_id = logger.log_step(
            session_id=session_id,
            message_id="msg1",
            step_type="follow_up",
            input_text="input",
            model_output="output",
        )

        # Update with tagging
        logger.update_approval(step_id, "disapproved", tagging)

        # Retrieve and verify
        logged_step = _require_step(logger, step_id)
        assert logged_step.tagging_data is not None
        assert logged_step.tagging_data.question_issues == ["inappropriate", "too_leading"]
        assert logged_step.tagging_data.question_comments == "Questions were too intrusive"

    def test_feedback_logging_referral_issues(self):
        """
        Test that referral issues are recorded in tagging data.
        """
        logger = InteractionLogger()
        session_id = "test_session_referral"

        # Create tagging record with referral issues
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=True,
            referral_issues=["incomplete_summary", "inappropriate_tone"],
            referral_comments="Message was incomplete",
        )

        # Log a step
        step_id = logger.log_step(
            session_id=session_id,
            message_id="msg1",
            step_type="referral",
            input_text="input",
            model_output="output",
        )

        # Update with tagging
        logger.update_approval(step_id, "disapproved", tagging)

        # Retrieve and verify
        logged_step = _require_step(logger, step_id)
        assert logged_step.tagging_data is not None
        assert logged_step.tagging_data.referral_issues == [
            "incomplete_summary",
            "inappropriate_tone",
        ]
        assert logged_step.tagging_data.referral_comments == "Message was incomplete"

    def test_feedback_logging_indicator_issues(self):
        """
        Test that indicator issues are recorded in tagging data.
        """
        logger = InteractionLogger()
        session_id = "test_session_indicators"

        # Create tagging record with indicator issues
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=True,
            indicator_issues=["indicator_1", "indicator_2"],
            indicator_comments="These indicators were incorrectly identified",
        )

        # Log a step
        step_id = logger.log_step(
            session_id=session_id,
            message_id="msg1",
            step_type="classification",
            input_text="input",
            model_output="output",
        )

        # Update with tagging
        logger.update_approval(step_id, "disapproved", tagging)

        # Retrieve and verify
        logged_step = _require_step(logger, step_id)
        assert logged_step.tagging_data is not None
        assert logged_step.tagging_data.indicator_issues == ["indicator_1", "indicator_2"]
        assert (
            logged_step.tagging_data.indicator_comments
            == "These indicators were incorrectly identified"
        )

    def test_feedback_logging_general_notes(self):
        """
        Test that general notes are recorded in tagging data.
        """
        logger = InteractionLogger()
        session_id = "test_session_notes"

        # Create tagging record with general notes
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=True,
            general_notes="Overall good classification but needs improvement in tone",
        )

        # Log a step
        step_id = logger.log_step(
            session_id=session_id,
            message_id="msg1",
            step_type="classification",
            input_text="input",
            model_output="output",
        )

        # Update with tagging (notes may accompany an approval as well)
        logger.update_approval(step_id, "approved", tagging)

        # Retrieve and verify
        logged_step = _require_step(logger, step_id)
        assert logged_step.tagging_data is not None
        assert (
            logged_step.tagging_data.general_notes
            == "Overall good classification but needs improvement in tone"
        )

    def test_feedback_logging_disapproved_steps_retrieval(self):
        """
        Test that disapproved steps can be retrieved from a session.
        """
        logger = InteractionLogger()
        session_id = "test_session_disapproved"

        # Log multiple steps
        step_id_1 = logger.log_step(session_id, "msg1", "classification", "input1", "output1")
        step_id_2 = logger.log_step(session_id, "msg2", "explanation", "input2", "output2")
        step_id_3 = logger.log_step(session_id, "msg3", "referral", "input3", "output3")

        # Approve first, disapprove second and third
        logger.update_approval(step_id_1, "approved")
        logger.update_approval(step_id_2, "disapproved")
        logger.update_approval(step_id_3, "disapproved")

        # Get disapproved steps
        disapproved = logger.get_disapproved_steps(session_id)

        # Verify
        assert len(disapproved) == 2
        assert all(log.approval_status == "disapproved" for log in disapproved)
        # Check identity, not just count: exactly the two disapproved steps
        assert {log.step_id for log in disapproved} == {step_id_2, step_id_3}

    def test_feedback_logging_unapproved_steps_retrieval(self):
        """
        Test that unapproved steps can be retrieved from a session.
        """
        logger = InteractionLogger()
        session_id = "test_session_unapproved"

        # Log multiple steps
        step_id_1 = logger.log_step(session_id, "msg1", "classification", "input1", "output1")
        step_id_2 = logger.log_step(session_id, "msg2", "explanation", "input2", "output2")
        step_id_3 = logger.log_step(session_id, "msg3", "referral", "input3", "output3")

        # Approve first, leave others unapproved
        logger.update_approval(step_id_1, "approved")

        # Get unapproved steps
        unapproved = logger.get_unapproved_steps(session_id)

        # Verify
        assert len(unapproved) == 2
        assert all(log.approval_status is None for log in unapproved)
        # Check identity, not just count: exactly the two untouched steps
        assert {log.step_id for log in unapproved} == {step_id_2, step_id_3}

    def test_feedback_logging_invalid_approval_status(self):
        """
        Test that invalid approval status raises an error.
        """
        logger = InteractionLogger()
        session_id = "test_session_invalid"

        # Log a step
        step_id = logger.log_step(
            session_id=session_id,
            message_id="msg1",
            step_type="classification",
            input_text="input",
            model_output="output",
        )

        # Try to update with invalid status
        with pytest.raises(ValueError):
            logger.update_approval(step_id, "invalid_status")

    def test_feedback_logging_nonexistent_step(self):
        """
        Test that updating a nonexistent step raises an error.
        """
        logger = InteractionLogger()

        with pytest.raises(ValueError):
            logger.update_approval("nonexistent_step", "approved")

    def test_feedback_logging_export_with_tagging(self):
        """
        Test that exported logs include tagging data.
        """
        logger = InteractionLogger()
        session_id = "test_session_export_tagging"

        # Create tagging record
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=False,
            classification_subcategory="missed_indicators",
            correct_classification="red",
            general_notes="Missed key indicators",
        )

        # Log a step
        step_id = logger.log_step(
            session_id=session_id,
            message_id="msg1",
            step_type="classification",
            input_text="input",
            model_output="output",
        )

        # Update with tagging
        logger.update_approval(step_id, "disapproved", tagging)

        # Export logs
        exported = logger.export_session_logs(session_id)

        # Verify export includes tagging data
        assert len(exported) == 1
        assert exported[0]["approval_status"] == "disapproved"
        assert exported[0]["tagging_data"] is not None
        assert exported[0]["tagging_data"]["classification_subcategory"] == "missed_indicators"
        assert exported[0]["tagging_data"]["correct_classification"] == "red"
        assert exported[0]["tagging_data"]["general_notes"] == "Missed key indicators"