# Spiritual_Health_Project/tests/chaplain_feedback/test_properties_interaction_logging.py
# DocUA's picture
# Fix CSV download button for Hugging Face Spaces - use DownloadButton for direct file download
# ab93d81
# test_properties_interaction_logging.py
"""
Property-based tests for Chaplain Feedback interaction logging.
Tests that interaction logging correctly records all steps with input/output
and supports approval status updates.
"""
import pytest
from hypothesis import given, settings
from datetime import datetime
from src.core.interaction_logger import InteractionLogger
from src.core.chaplain_models import (
InteractionStepLog,
TaggingRecord,
INTERACTION_STEP_TYPES,
)
from tests.chaplain_feedback.conftest import (
valid_id_strategy,
tagging_record_strategy,
)
class TestInteractionLoggingCompleteness:
    """
    **Feature: chaplain-feedback-system, Property 14: Interaction Step Logging Complete**

    Checks that ``InteractionLogger`` stores every required field for each
    logged step and that logged steps can be retrieved, filtered by type or
    message, summarised and exported.
    """

    @staticmethod
    def _log_and_fetch(input_text, model_output):
        """Log one "classification" step on a fresh logger and return the stored entry.

        The entry is asserted to exist here so that a missing step shows up as
        a plain assertion failure instead of an ``AttributeError`` in the caller.
        """
        logger = InteractionLogger()
        step_id = logger.log_step(
            session_id="test_session",
            message_id="test_msg",
            step_type="classification",
            input_text=input_text,
            model_output=model_output,
        )
        logged_step = logger.get_step(step_id)
        assert logged_step is not None
        return logged_step

    def test_interaction_step_logging_complete_all_types(self):
        """
        **Feature: chaplain-feedback-system, Property 14: Interaction Step Logging Complete**
        **Validates: Requirements 7.1, 7.2**

        For any interaction step, the log should contain: input text, model
        output, and timestamp. Exercised once per entry of
        ``INTERACTION_STEP_TYPES``.
        """
        logger = InteractionLogger()

        for step_type in INTERACTION_STEP_TYPES:
            session_id = f"session_{step_type}"
            message_id = f"msg_{step_type}"
            input_text = f"input for {step_type}"
            model_output = f"output for {step_type}"

            step_id = logger.log_step(
                session_id=session_id,
                message_id=message_id,
                step_type=step_type,
                input_text=input_text,
                model_output=model_output,
            )
            logged_step = logger.get_step(step_id)

            # Every field passed to log_step must come back unchanged.
            assert logged_step is not None
            assert logged_step.step_id == step_id
            assert logged_step.session_id == session_id
            assert logged_step.message_id == message_id
            assert logged_step.step_type == step_type
            assert logged_step.input_text == input_text
            assert logged_step.model_output == model_output
            assert logged_step.timestamp is not None
            assert isinstance(logged_step.timestamp, datetime)
            # A freshly logged step carries no feedback yet.
            assert logged_step.approval_status is None
            assert logged_step.tagging_data is None

    def test_interaction_step_logging_multiple_steps(self):
        """Several steps logged under one session are all returned for that session."""
        logger = InteractionLogger()
        session_id = "test_session_1"
        message_id = "test_message_1"

        for i in range(3):
            logger.log_step(
                session_id=session_id,
                message_id=message_id,
                step_type="classification",
                input_text=f"input {i}",
                model_output=f"output {i}",
            )

        session_logs = logger.get_session_logs(session_id)

        assert len(session_logs) == 3
        for i, log in enumerate(session_logs):
            assert log.input_text == f"input {i}"
            assert log.model_output == f"output {i}"

    def test_interaction_step_logging_preserves_order(self):
        """Session logs come back in the order in which the steps were logged."""
        logger = InteractionLogger()
        session_id = "test_session_order"

        for i in range(5):
            logger.log_step(
                session_id=session_id,
                message_id=f"msg_{i}",
                step_type="classification",
                input_text=f"input_{i}",
                model_output=f"output_{i}",
            )

        session_logs = logger.get_session_logs(session_id)

        assert len(session_logs) == 5
        for i, log in enumerate(session_logs):
            assert log.message_id == f"msg_{i}"
            assert log.input_text == f"input_{i}"

    def test_interaction_step_logging_by_type(self):
        """Session logs can be filtered by step type."""
        logger = InteractionLogger()
        session_id = "test_session_types"

        logger.log_step(session_id, "msg1", "classification", "input1", "output1")
        logger.log_step(session_id, "msg2", "explanation", "input2", "output2")
        logger.log_step(session_id, "msg3", "classification", "input3", "output3")
        logger.log_step(session_id, "msg4", "referral", "input4", "output4")

        classification_logs = logger.get_session_logs_by_type(session_id, "classification")
        explanation_logs = logger.get_session_logs_by_type(session_id, "explanation")
        referral_logs = logger.get_session_logs_by_type(session_id, "referral")

        assert len(classification_logs) == 2
        assert len(explanation_logs) == 1
        assert len(referral_logs) == 1

    def test_interaction_step_logging_message_logs(self):
        """Logs for one message id are collected across different sessions."""
        logger = InteractionLogger()
        message_id = "shared_message"

        # Same message id in two sessions, plus one unrelated message.
        logger.log_step("session1", message_id, "classification", "input1", "output1")
        logger.log_step("session2", message_id, "explanation", "input2", "output2")
        logger.log_step("session1", "other_msg", "referral", "input3", "output3")

        message_logs = logger.get_message_logs(message_id)

        assert len(message_logs) == 2
        assert all(log.message_id == message_id for log in message_logs)

    def test_interaction_step_logging_empty_strings(self):
        """Empty input/output strings are stored as empty strings."""
        logged_step = self._log_and_fetch("", "")

        assert logged_step.input_text == ""
        assert logged_step.model_output == ""

    def test_interaction_step_logging_long_text(self):
        """A 10000-character input/output is stored without truncation."""
        long_text = "x" * 10000

        logged_step = self._log_and_fetch(long_text, long_text)

        assert logged_step.input_text == long_text
        assert logged_step.model_output == long_text
        assert len(logged_step.input_text) == 10000

    def test_interaction_step_logging_special_characters(self):
        """Punctuation and symbol characters in input/output are preserved."""
        special_text = "Test with special chars: !@#$%^&*()_+-=[]{}|;:',.<>?/~`"

        logged_step = self._log_and_fetch(special_text, special_text)

        assert logged_step.input_text == special_text
        assert logged_step.model_output == special_text

    def test_interaction_step_logging_unicode(self):
        """Non-ASCII (CJK, emoji, Cyrillic) characters in input/output are preserved."""
        unicode_text = "Test with Unicode: 你好世界 🌍 Привет мир"

        logged_step = self._log_and_fetch(unicode_text, unicode_text)

        assert logged_step.input_text == unicode_text
        assert logged_step.model_output == unicode_text

    def test_interaction_step_logging_statistics(self):
        """Session statistics count steps in total, by approval state and by type."""
        logger = InteractionLogger()
        session_id = "test_session_stats"

        logger.log_step(session_id, "msg1", "classification", "input1", "output1")
        logger.log_step(session_id, "msg2", "explanation", "input2", "output2")
        logger.log_step(session_id, "msg3", "referral", "input3", "output3")

        stats = logger.get_session_statistics(session_id)

        assert stats["session_id"] == session_id
        assert stats["total_steps"] == 3
        # No feedback was given, so every step counts as unapproved.
        assert stats["approved_steps"] == 0
        assert stats["disapproved_steps"] == 0
        assert stats["unapproved_steps"] == 3
        assert stats["steps_by_type"]["classification"] == 1
        assert stats["steps_by_type"]["explanation"] == 1
        assert stats["steps_by_type"]["referral"] == 1

    def test_interaction_step_logging_invalid_step_type(self):
        """Logging a step with an unknown step type is expected to raise ``ValueError``."""
        logger = InteractionLogger()

        with pytest.raises(ValueError):
            logger.log_step(
                session_id="test_session",
                message_id="test_msg",
                step_type="invalid_type",
                input_text="input",
                model_output="output",
            )

    def test_interaction_step_logging_nonexistent_step(self):
        """Looking up an unknown step id is expected to return ``None``."""
        logger = InteractionLogger()

        result = logger.get_step("nonexistent_step_id")

        assert result is None

    def test_interaction_step_logging_empty_session(self):
        """Requesting logs for a session with no steps is expected to return an empty list."""
        logger = InteractionLogger()

        session_logs = logger.get_session_logs("nonexistent_session")

        assert session_logs == []

    def test_interaction_step_logging_export(self):
        """Exported session logs are dictionaries that carry the core step fields."""
        logger = InteractionLogger()
        session_id = "test_session_export"

        logger.log_step(session_id, "msg1", "classification", "input1", "output1")
        logger.log_step(session_id, "msg2", "explanation", "input2", "output2")

        exported = logger.export_session_logs(session_id)

        assert len(exported) == 2
        assert all(isinstance(log, dict) for log in exported)
        for key in ("step_id", "input_text", "model_output", "timestamp"):
            assert all(key in log for log in exported)
class TestFeedbackLogging:
    """
    **Feature: chaplain-feedback-system, Property 15: Feedback Logging Complete**

    Checks that feedback logging records the approval/disapproval status of a
    step together with any tagging categories and comments attached to it.
    """

    @staticmethod
    def _log_step(logger, session_id, message_id="msg1", step_type="classification"):
        """Log a step with placeholder input/output text and return its step id."""
        return logger.log_step(
            session_id=session_id,
            message_id=message_id,
            step_type=step_type,
            input_text="input",
            model_output="output",
        )

    def test_feedback_logging_approved_status(self):
        """
        **Feature: chaplain-feedback-system, Property 15: Feedback Logging Complete**
        **Validates: Requirements 7.3, 7.4**

        For any feedback, the log should record approval status. Approving
        without a tagging record leaves ``tagging_data`` unset.
        """
        logger = InteractionLogger()
        step_id = self._log_step(logger, "test_session_feedback")

        logger.update_approval(step_id, "approved")

        logged_step = logger.get_step(step_id)
        assert logged_step.approval_status == "approved"
        assert logged_step.tagging_data is None

    def test_feedback_logging_disapproved_status(self):
        """A "disapproved" status is stored on the step."""
        logger = InteractionLogger()
        step_id = self._log_step(logger, "test_session_feedback")

        logger.update_approval(step_id, "disapproved")

        logged_step = logger.get_step(step_id)
        assert logged_step.approval_status == "disapproved"

    @given(tagging_record_strategy())
    @settings(max_examples=100)
    def test_feedback_logging_with_tagging_data(self, tagging_record):
        """
        **Feature: chaplain-feedback-system, Property 15: Feedback Logging Complete**
        **Validates: Requirements 7.3, 7.4**

        For any chaplain feedback, the log should contain: approval/disapproval
        status, and if disapproved, the tagging categories and comments.
        """
        logger = InteractionLogger()
        step_id = self._log_step(
            logger,
            "test_session_tagging",
            message_id=tagging_record.message_id,
        )

        logger.update_approval(step_id, "disapproved", tagging_record)

        logged_step = logger.get_step(step_id)
        assert logged_step.approval_status == "disapproved"
        assert logged_step.tagging_data is not None

        stored = logged_step.tagging_data
        assert stored.record_id == tagging_record.record_id
        assert stored.message_id == tagging_record.message_id
        assert stored.is_classification_correct == tagging_record.is_classification_correct
        assert stored.question_issues == tagging_record.question_issues
        assert stored.referral_issues == tagging_record.referral_issues

    def test_feedback_logging_classification_subcategory(self):
        """The classification subcategory and corrected class are kept in the tagging data."""
        logger = InteractionLogger()
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=False,
            classification_subcategory="missed_indicators",
            correct_classification="red",
        )
        step_id = self._log_step(logger, "test_session_classification")

        logger.update_approval(step_id, "disapproved", tagging)

        logged_step = logger.get_step(step_id)
        assert logged_step.tagging_data.classification_subcategory == "missed_indicators"
        assert logged_step.tagging_data.correct_classification == "red"

    def test_feedback_logging_question_issues(self):
        """Question issues and their comments are kept in the tagging data."""
        logger = InteractionLogger()
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=True,
            question_issues=["inappropriate", "too_leading"],
            question_comments="Questions were too intrusive",
        )
        step_id = self._log_step(logger, "test_session_questions", step_type="follow_up")

        logger.update_approval(step_id, "disapproved", tagging)

        logged_step = logger.get_step(step_id)
        assert logged_step.tagging_data.question_issues == ["inappropriate", "too_leading"]
        assert logged_step.tagging_data.question_comments == "Questions were too intrusive"

    def test_feedback_logging_referral_issues(self):
        """Referral issues and their comments are kept in the tagging data."""
        logger = InteractionLogger()
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=True,
            referral_issues=["incomplete_summary", "inappropriate_tone"],
            referral_comments="Message was incomplete",
        )
        step_id = self._log_step(logger, "test_session_referral", step_type="referral")

        logger.update_approval(step_id, "disapproved", tagging)

        logged_step = logger.get_step(step_id)
        assert logged_step.tagging_data.referral_issues == ["incomplete_summary", "inappropriate_tone"]
        assert logged_step.tagging_data.referral_comments == "Message was incomplete"

    def test_feedback_logging_indicator_issues(self):
        """Indicator issues and their comments are kept in the tagging data."""
        logger = InteractionLogger()
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=True,
            indicator_issues=["indicator_1", "indicator_2"],
            indicator_comments="These indicators were incorrectly identified",
        )
        step_id = self._log_step(logger, "test_session_indicators")

        logger.update_approval(step_id, "disapproved", tagging)

        logged_step = logger.get_step(step_id)
        assert logged_step.tagging_data.indicator_issues == ["indicator_1", "indicator_2"]
        assert logged_step.tagging_data.indicator_comments == "These indicators were incorrectly identified"

    def test_feedback_logging_general_notes(self):
        """General notes are kept in the tagging data, here attached to an approved step."""
        logger = InteractionLogger()
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=True,
            general_notes="Overall good classification but needs improvement in tone",
        )
        step_id = self._log_step(logger, "test_session_notes")

        logger.update_approval(step_id, "approved", tagging)

        logged_step = logger.get_step(step_id)
        assert logged_step.tagging_data.general_notes == "Overall good classification but needs improvement in tone"

    def test_feedback_logging_disapproved_steps_retrieval(self):
        """Only the disapproved steps of a session are returned by ``get_disapproved_steps``."""
        logger = InteractionLogger()
        session_id = "test_session_disapproved"

        step_id_1 = logger.log_step(session_id, "msg1", "classification", "input1", "output1")
        step_id_2 = logger.log_step(session_id, "msg2", "explanation", "input2", "output2")
        step_id_3 = logger.log_step(session_id, "msg3", "referral", "input3", "output3")

        # Approve the first step, disapprove the second and third.
        logger.update_approval(step_id_1, "approved")
        logger.update_approval(step_id_2, "disapproved")
        logger.update_approval(step_id_3, "disapproved")

        disapproved = logger.get_disapproved_steps(session_id)

        assert len(disapproved) == 2
        assert all(log.approval_status == "disapproved" for log in disapproved)

    def test_feedback_logging_unapproved_steps_retrieval(self):
        """Steps that never received feedback are returned by ``get_unapproved_steps``."""
        logger = InteractionLogger()
        session_id = "test_session_unapproved"

        # Only the first step id is needed; the other two steps stay untouched.
        step_id_1 = logger.log_step(session_id, "msg1", "classification", "input1", "output1")
        logger.log_step(session_id, "msg2", "explanation", "input2", "output2")
        logger.log_step(session_id, "msg3", "referral", "input3", "output3")

        logger.update_approval(step_id_1, "approved")

        unapproved = logger.get_unapproved_steps(session_id)

        assert len(unapproved) == 2
        assert all(log.approval_status is None for log in unapproved)

    def test_feedback_logging_invalid_approval_status(self):
        """An unknown approval status is expected to raise ``ValueError``."""
        logger = InteractionLogger()
        step_id = self._log_step(logger, "test_session_invalid")

        with pytest.raises(ValueError):
            logger.update_approval(step_id, "invalid_status")

    def test_feedback_logging_nonexistent_step(self):
        """Updating the approval of an unknown step id is expected to raise ``ValueError``."""
        logger = InteractionLogger()

        with pytest.raises(ValueError):
            logger.update_approval("nonexistent_step", "approved")

    def test_feedback_logging_export_with_tagging(self):
        """Exported logs include the approval status and the tagging data as a mapping."""
        logger = InteractionLogger()
        session_id = "test_session_export_tagging"
        tagging = TaggingRecord(
            record_id="tag1",
            message_id="msg1",
            is_classification_correct=False,
            classification_subcategory="missed_indicators",
            correct_classification="red",
            general_notes="Missed key indicators",
        )
        step_id = self._log_step(logger, session_id)

        logger.update_approval(step_id, "disapproved", tagging)

        exported = logger.export_session_logs(session_id)

        assert len(exported) == 1
        assert exported[0]["approval_status"] == "disapproved"
        assert exported[0]["tagging_data"] is not None
        assert exported[0]["tagging_data"]["classification_subcategory"] == "missed_indicators"
        assert exported[0]["tagging_data"]["correct_classification"] == "red"
        assert exported[0]["tagging_data"]["general_notes"] == "Missed key indicators"