# Spaces: DocUA / Spiritual_Health_Project (Sleeping)
# File size: 25,567 Bytes
# ab93d81

# test_properties_interaction_logging.py
"""
Property-based tests for Chaplain Feedback interaction logging.

Tests that interaction logging correctly records all steps with input/output
and supports approval status updates.
"""

import pytest
from hypothesis import given, settings
from datetime import datetime

from src.core.interaction_logger import InteractionLogger
from src.core.chaplain_models import (
    InteractionStepLog,
    TaggingRecord,
    INTERACTION_STEP_TYPES,
)

from tests.chaplain_feedback.conftest import (
    valid_id_strategy,
    tagging_record_strategy,
)


class TestInteractionLoggingCompleteness:
    """
    **Feature: chaplain-feedback-system, Property 14: Interaction Step Logging Complete**

    Tests that interaction logging records all required fields for each step.

    Every test creates its own ``InteractionLogger`` instance.
    """

    @staticmethod
    def _assert_text_round_trip(text):
        """
        Log ``text`` as both input and model output and check it is stored unchanged.

        The leading underscore keeps pytest from collecting this helper as a test.

        Returns:
            The step log retrieved from the logger, for extra assertions.
        """
        logger = InteractionLogger()
        step_id = logger.log_step(
            session_id="test_session",
            message_id="test_msg",
            step_type="classification",
            input_text=text,
            model_output=text,
        )

        logged_step = logger.get_step(step_id)

        # Fail with a clear assertion instead of an AttributeError on None.
        assert logged_step is not None
        assert logged_step.input_text == text
        assert logged_step.model_output == text
        return logged_step

    def test_interaction_step_logging_complete_all_types(self):
        """
        **Feature: chaplain-feedback-system, Property 14: Interaction Step Logging Complete**
        **Validates: Requirements 7.1, 7.2**

        For any interaction step, the log should contain: input text, model output, and timestamp.
        """
        logger = InteractionLogger()

        # Guard against a vacuous pass: with no step types the loop below
        # would execute zero assertions.
        assert INTERACTION_STEP_TYPES, "INTERACTION_STEP_TYPES must not be empty"

        # Test all step types
        for step_type in INTERACTION_STEP_TYPES:
            session_id = f"session_{step_type}"
            message_id = f"msg_{step_type}"
            input_text = f"input for {step_type}"
            model_output = f"output for {step_type}"

            # Log a step
            step_id = logger.log_step(
                session_id=session_id,
                message_id=message_id,
                step_type=step_type,
                input_text=input_text,
                model_output=model_output,
            )

            # Retrieve the logged step
            logged_step = logger.get_step(step_id)

            # Verify all required fields are present and correct
            assert logged_step is not None
            assert logged_step.step_id == step_id
            assert logged_step.session_id == session_id
            assert logged_step.message_id == message_id
            assert logged_step.step_type == step_type
            assert logged_step.input_text == input_text
            assert logged_step.model_output == model_output
            assert logged_step.timestamp is not None
            assert isinstance(logged_step.timestamp, datetime)
            assert logged_step.approval_status is None  # Initially no approval
            assert logged_step.tagging_data is None  # Initially no tagging

    def test_interaction_step_logging_multiple_steps(self):
        """
        Test that multiple steps are logged correctly for a session.
        """
        logger = InteractionLogger()
        session_id = "test_session_1"
        message_id = "test_message_1"

        # Log multiple steps, remembering the id returned for each one
        step_ids = [
            logger.log_step(
                session_id=session_id,
                message_id=message_id,
                step_type="classification",
                input_text=f"input {i}",
                model_output=f"output {i}",
            )
            for i in range(3)
        ]

        # Retrieve all session logs
        session_logs = logger.get_session_logs(session_id)

        # Verify all steps are logged, with the ids that log_step returned
        assert len(session_logs) == 3
        assert [log.step_id for log in session_logs] == step_ids
        for i, log in enumerate(session_logs):
            assert log.input_text == f"input {i}"
            assert log.model_output == f"output {i}"

    def test_interaction_step_logging_preserves_order(self):
        """
        Test that logged steps are retrieved in the order they were logged.
        """
        logger = InteractionLogger()
        session_id = "test_session_order"

        # Log steps in order
        step_ids = [
            logger.log_step(
                session_id=session_id,
                message_id=f"msg_{i}",
                step_type="classification",
                input_text=f"input_{i}",
                model_output=f"output_{i}",
            )
            for i in range(5)
        ]

        # Retrieve logs
        session_logs = logger.get_session_logs(session_id)

        # Verify order is preserved (by id as well as by content)
        assert len(session_logs) == 5
        assert [log.step_id for log in session_logs] == step_ids
        for i, log in enumerate(session_logs):
            assert log.message_id == f"msg_{i}"
            assert log.input_text == f"input_{i}"

    def test_interaction_step_logging_by_type(self):
        """
        Test filtering logs by step type.
        """
        logger = InteractionLogger()
        session_id = "test_session_types"

        # Log different types of steps
        logger.log_step(session_id, "msg1", "classification", "input1", "output1")
        logger.log_step(session_id, "msg2", "explanation", "input2", "output2")
        logger.log_step(session_id, "msg3", "classification", "input3", "output3")
        logger.log_step(session_id, "msg4", "referral", "input4", "output4")

        expected_counts = {"classification": 2, "explanation": 1, "referral": 1}

        for step_type, expected in expected_counts.items():
            filtered = logger.get_session_logs_by_type(session_id, step_type)

            # Check the count AND that every returned log really has the
            # requested type; a count alone would accept the wrong steps.
            assert len(filtered) == expected
            assert all(log.step_type == step_type for log in filtered)

    def test_interaction_step_logging_message_logs(self):
        """
        Test retrieving logs for a specific message across sessions.
        """
        logger = InteractionLogger()
        message_id = "shared_message"

        # Log same message in different sessions
        logger.log_step("session1", message_id, "classification", "input1", "output1")
        logger.log_step("session2", message_id, "explanation", "input2", "output2")
        logger.log_step("session1", "other_msg", "referral", "input3", "output3")

        # Get logs for the message
        message_logs = logger.get_message_logs(message_id)

        # Verify we get logs from both sessions
        assert len(message_logs) == 2
        assert all(log.message_id == message_id for log in message_logs)
        assert {log.session_id for log in message_logs} == {"session1", "session2"}

    def test_interaction_step_logging_empty_strings(self):
        """
        Test that empty input/output strings are logged correctly.
        """
        self._assert_text_round_trip("")

    def test_interaction_step_logging_long_text(self):
        """
        Test that long input/output text is logged correctly.
        """
        logged_step = self._assert_text_round_trip("x" * 10000)

        assert len(logged_step.input_text) == 10000

    def test_interaction_step_logging_special_characters(self):
        """
        Test that special characters in input/output are preserved.
        """
        self._assert_text_round_trip(
            "Test with special chars: !@#$%^&*()_+-=[]{}|;:',.<>?/~`"
        )

    def test_interaction_step_logging_unicode(self):
        """
        Test that Unicode characters in input/output are preserved.
        """
        self._assert_text_round_trip("Test with Unicode: 你好世界 🌍 Привет мир")

    def test_interaction_step_logging_statistics(self):
        """
        Test that session statistics are calculated correctly.
        """
        logger = InteractionLogger()
        session_id = "test_session_stats"

        # Log some steps
        logger.log_step(session_id, "msg1", "classification", "input1", "output1")
        logger.log_step(session_id, "msg2", "explanation", "input2", "output2")
        logger.log_step(session_id, "msg3", "referral", "input3", "output3")

        # Get statistics
        stats = logger.get_session_statistics(session_id)

        # Verify statistics: nothing has been reviewed yet, so every step
        # counts as unapproved.
        assert stats["session_id"] == session_id
        assert stats["total_steps"] == 3
        assert stats["approved_steps"] == 0
        assert stats["disapproved_steps"] == 0
        assert stats["unapproved_steps"] == 3
        assert stats["steps_by_type"]["classification"] == 1
        assert stats["steps_by_type"]["explanation"] == 1
        assert stats["steps_by_type"]["referral"] == 1

    def test_interaction_step_logging_invalid_step_type(self):
        """
        Test that invalid step types raise an error.
        """
        logger = InteractionLogger()

        with pytest.raises(ValueError):
            logger.log_step(
                session_id="test_session",
                message_id="test_msg",
                step_type="invalid_type",
                input_text="input",
                model_output="output",
            )

    def test_interaction_step_logging_nonexistent_step(self):
        """
        Test that retrieving a nonexistent step returns None.
        """
        logger = InteractionLogger()

        result = logger.get_step("nonexistent_step_id")

        assert result is None

    def test_interaction_step_logging_empty_session(self):
        """
        Test that retrieving logs for an empty session returns empty list.
        """
        logger = InteractionLogger()

        session_logs = logger.get_session_logs("nonexistent_session")

        assert session_logs == []

    def test_interaction_step_logging_export(self):
        """
        Test that session logs can be exported as dictionaries.
        """
        logger = InteractionLogger()
        session_id = "test_session_export"

        # Log some steps
        logger.log_step(session_id, "msg1", "classification", "input1", "output1")
        logger.log_step(session_id, "msg2", "explanation", "input2", "output2")

        # Export logs
        exported = logger.export_session_logs(session_id)

        # Verify export: one dict per step, each carrying the core keys
        assert len(exported) == 2
        assert all(isinstance(log, dict) for log in exported)
        for key in ("step_id", "input_text", "model_output", "timestamp"):
            assert all(key in log for log in exported)


class TestFeedbackLogging:
    """
    **Feature: chaplain-feedback-system, Property 15: Feedback Logging Complete**

    Tests that feedback logging correctly records approval/disapproval status
    with tagging categories and comments.
    """

    # TaggingRecord attributes compared by the property test. Each name is one
    # that the tests in this class pass to or read from TaggingRecord.
    _TAGGING_FIELDS = (
        "record_id",
        "message_id",
        "is_classification_correct",
        "classification_subcategory",
        "correct_classification",
        "question_issues",
        "question_comments",
        "referral_issues",
        "referral_comments",
        "indicator_issues",
        "indicator_comments",
        "general_notes",
    )

    @staticmethod
    def _log_step(logger, session_id, step_type="classification", message_id="msg1"):
        """
        Log one placeholder step and return the step id from ``log_step``.

        The leading underscore keeps pytest from collecting this helper as a test.

        Args:
            logger: The ``InteractionLogger`` under test.
            session_id: Session the step is logged under.
            step_type: Step type passed through to ``log_step``.
            message_id: Message id passed through to ``log_step``.
        """
        return logger.log_step(
            session_id=session_id,
            message_id=message_id,
            step_type=step_type,
            input_text="input",
            model_output="output",
        )

    def test_feedback_logging_approved_status(self):
        """
        **Feature: chaplain-feedback-system, Property 15: Feedback Logging Complete**
        **Validates: Requirements 7.3, 7.4**

        For any feedback, the log should record approval status.
        """
        logger = InteractionLogger()
        step_id = self._log_step(logger, "test_session_feedback")

        # Update with approved status
        logger.update_approval(step_id, "approved")

        # Retrieve and verify
        logged_step = logger.get_step(step_id)
        assert logged_step.approval_status == "approved"
        assert logged_step.tagging_data is None

    def test_feedback_logging_disapproved_status(self):
        """
        Test that disapproved status is recorded correctly.
        """
        logger = InteractionLogger()
        step_id = self._log_step(logger, "test_session_feedback")

        # Update with disapproved status
        logger.update_approval(step_id, "disapproved")

        # Retrieve and verify
        logged_step = logger.get_step(step_id)
        assert logged_step.approval_status == "disapproved"

    @given(tagging_record_strategy())
    @settings(max_examples=100)
    def test_feedback_logging_with_tagging_data(self, tagging_record):
        """
        **Feature: chaplain-feedback-system, Property 15: Feedback Logging Complete**
        **Validates: Requirements 7.3, 7.4**

        For any chaplain feedback, the log should contain: approval/disapproval status,
        and if disapproved, the tagging categories and comments.
        """
        # A fresh logger per generated example, so examples do not share state.
        logger = InteractionLogger()
        step_id = self._log_step(
            logger,
            "test_session_tagging",
            message_id=tagging_record.message_id,
        )

        # Update with disapproved status and tagging data
        logger.update_approval(step_id, "disapproved", tagging_record)

        # Retrieve and verify
        logged_step = logger.get_step(step_id)
        assert logged_step.approval_status == "disapproved"
        assert logged_step.tagging_data is not None

        # Compare categories AND comments, as the property statement requires.
        stored = logged_step.tagging_data
        for field_name in self._TAGGING_FIELDS:
            assert getattr(stored, field_name) == getattr(tagging_record, field_name), (
                f"tagging field {field_name!r} was not preserved"
            )

    def test_feedback_logging_classification_subcategory(self):
        """
        Test that classification subcategory is recorded in tagging data.
        """
        logger = InteractionLogger()

        # Create tagging record with classification subcategory
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=False,
            classification_subcategory="missed_indicators",
            correct_classification="red",
        )

        step_id = self._log_step(logger, "test_session_classification")

        # Update with tagging
        logger.update_approval(step_id, "disapproved", tagging)

        # Retrieve and verify
        logged_step = logger.get_step(step_id)
        assert logged_step.tagging_data.classification_subcategory == "missed_indicators"
        assert logged_step.tagging_data.correct_classification == "red"

    def test_feedback_logging_question_issues(self):
        """
        Test that question issues are recorded in tagging data.
        """
        logger = InteractionLogger()

        # Create tagging record with question issues
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=True,
            question_issues=["inappropriate", "too_leading"],
            question_comments="Questions were too intrusive",
        )

        step_id = self._log_step(logger, "test_session_questions", step_type="follow_up")

        # Update with tagging
        logger.update_approval(step_id, "disapproved", tagging)

        # Retrieve and verify
        logged_step = logger.get_step(step_id)
        assert logged_step.tagging_data.question_issues == ["inappropriate", "too_leading"]
        assert logged_step.tagging_data.question_comments == "Questions were too intrusive"

    def test_feedback_logging_referral_issues(self):
        """
        Test that referral issues are recorded in tagging data.
        """
        logger = InteractionLogger()

        # Create tagging record with referral issues
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=True,
            referral_issues=["incomplete_summary", "inappropriate_tone"],
            referral_comments="Message was incomplete",
        )

        step_id = self._log_step(logger, "test_session_referral", step_type="referral")

        # Update with tagging
        logger.update_approval(step_id, "disapproved", tagging)

        # Retrieve and verify
        logged_step = logger.get_step(step_id)
        assert logged_step.tagging_data.referral_issues == ["incomplete_summary", "inappropriate_tone"]
        assert logged_step.tagging_data.referral_comments == "Message was incomplete"

    def test_feedback_logging_indicator_issues(self):
        """
        Test that indicator issues are recorded in tagging data.
        """
        logger = InteractionLogger()

        # Create tagging record with indicator issues
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=True,
            indicator_issues=["indicator_1", "indicator_2"],
            indicator_comments="These indicators were incorrectly identified",
        )

        step_id = self._log_step(logger, "test_session_indicators")

        # Update with tagging
        logger.update_approval(step_id, "disapproved", tagging)

        # Retrieve and verify
        logged_step = logger.get_step(step_id)
        assert logged_step.tagging_data.indicator_issues == ["indicator_1", "indicator_2"]
        assert logged_step.tagging_data.indicator_comments == "These indicators were incorrectly identified"

    def test_feedback_logging_general_notes(self):
        """
        Test that general notes are recorded in tagging data.
        """
        logger = InteractionLogger()

        # Create tagging record with general notes
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=True,
            general_notes="Overall good classification but needs improvement in tone",
        )

        step_id = self._log_step(logger, "test_session_notes")

        # Tagging is attached to an *approved* step here: notes may accompany
        # an approval as well as a disapproval.
        logger.update_approval(step_id, "approved", tagging)

        # Retrieve and verify
        logged_step = logger.get_step(step_id)
        assert logged_step.tagging_data.general_notes == "Overall good classification but needs improvement in tone"

    def test_feedback_logging_disapproved_steps_retrieval(self):
        """
        Test that disapproved steps can be retrieved from a session.
        """
        logger = InteractionLogger()
        session_id = "test_session_disapproved"

        # Log multiple steps
        step_id_1 = logger.log_step(session_id, "msg1", "classification", "input1", "output1")
        step_id_2 = logger.log_step(session_id, "msg2", "explanation", "input2", "output2")
        step_id_3 = logger.log_step(session_id, "msg3", "referral", "input3", "output3")

        # Approve first, disapprove second and third
        logger.update_approval(step_id_1, "approved")
        logger.update_approval(step_id_2, "disapproved")
        logger.update_approval(step_id_3, "disapproved")

        # Get disapproved steps
        disapproved = logger.get_disapproved_steps(session_id)

        # Verify count, status, and that exactly the disapproved steps came back
        assert len(disapproved) == 2
        assert all(log.approval_status == "disapproved" for log in disapproved)
        assert {log.step_id for log in disapproved} == {step_id_2, step_id_3}

    def test_feedback_logging_unapproved_steps_retrieval(self):
        """
        Test that unapproved steps can be retrieved from a session.
        """
        logger = InteractionLogger()
        session_id = "test_session_unapproved"

        # Log multiple steps
        step_id_1 = logger.log_step(session_id, "msg1", "classification", "input1", "output1")
        step_id_2 = logger.log_step(session_id, "msg2", "explanation", "input2", "output2")
        step_id_3 = logger.log_step(session_id, "msg3", "referral", "input3", "output3")

        # Approve first, leave others unapproved
        logger.update_approval(step_id_1, "approved")

        # Get unapproved steps
        unapproved = logger.get_unapproved_steps(session_id)

        # Verify count, status, and that exactly the untouched steps came back
        assert len(unapproved) == 2
        assert all(log.approval_status is None for log in unapproved)
        assert {log.step_id for log in unapproved} == {step_id_2, step_id_3}

    def test_feedback_logging_invalid_approval_status(self):
        """
        Test that invalid approval status raises an error.
        """
        logger = InteractionLogger()
        step_id = self._log_step(logger, "test_session_invalid")

        # Try to update with invalid status
        with pytest.raises(ValueError):
            logger.update_approval(step_id, "invalid_status")

    def test_feedback_logging_nonexistent_step(self):
        """
        Test that updating a nonexistent step raises an error.
        """
        logger = InteractionLogger()

        with pytest.raises(ValueError):
            logger.update_approval("nonexistent_step", "approved")

    def test_feedback_logging_export_with_tagging(self):
        """
        Test that exported logs include tagging data.
        """
        logger = InteractionLogger()
        session_id = "test_session_export_tagging"

        # Create tagging record
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=False,
            classification_subcategory="missed_indicators",
            correct_classification="red",
            general_notes="Missed key indicators",
        )

        step_id = self._log_step(logger, session_id)

        # Update with tagging
        logger.update_approval(step_id, "disapproved", tagging)

        # Export logs
        exported = logger.export_session_logs(session_id)

        # Verify export includes tagging data as a nested mapping
        assert len(exported) == 1
        assert exported[0]["approval_status"] == "disapproved"
        assert exported[0]["tagging_data"] is not None
        assert exported[0]["tagging_data"]["classification_subcategory"] == "missed_indicators"
        assert exported[0]["tagging_data"]["correct_classification"] == "red"
        assert exported[0]["tagging_data"]["general_notes"] == "Missed key indicators"