Spaces:

DocUA
/

Spiritual_Health_Project

Sleeping

App Files Files Community

Spiritual_Health_Project / tests /chaplain_feedback /test_properties_interaction_logging.py

DocUA

Fix CSV download button for Hugging Face Spaces - use DownloadButton for direct file download

ab93d81 about 1 month ago

raw

history blame contribute delete

25.6 kB

	# test_properties_interaction_logging.py
	"""
	Property-based tests for Chaplain Feedback interaction logging.

	Tests that interaction logging correctly records all steps with input/output
	and supports approval status updates.
	"""

	import pytest
	from hypothesis import given, settings
	from datetime import datetime

	from src.core.interaction_logger import InteractionLogger
	from src.core.chaplain_models import (
	InteractionStepLog,
	TaggingRecord,
	INTERACTION_STEP_TYPES,
	)

	from tests.chaplain_feedback.conftest import (
	valid_id_strategy,
	tagging_record_strategy,
	)


	class TestInteractionLoggingCompleteness:
	    """
	    Feature: chaplain-feedback-system, Property 14: Interaction Step Logging Complete

	    Tests that interaction logging records all required fields for each step.
	    Every test constructs its own InteractionLogger instance.
	    """

	    def test_interaction_step_logging_complete_all_types(self):
	        """
	        Feature: chaplain-feedback-system, Property 14: Interaction Step Logging Complete
	        Validates: Requirements 7.1, 7.2

	        For any interaction step, the log should contain: input text, model output, and timestamp.
	        """
	        logger = InteractionLogger()

	        # Test all step types; each one is logged and verified individually.
	        for step_type in INTERACTION_STEP_TYPES:
	            session_id = f"session_{step_type}"
	            message_id = f"msg_{step_type}"
	            input_text = f"input for {step_type}"
	            model_output = f"output for {step_type}"

	            # Log a step
	            step_id = logger.log_step(
	                session_id=session_id,
	                message_id=message_id,
	                step_type=step_type,
	                input_text=input_text,
	                model_output=model_output,
	            )

	            # Retrieve the logged step
	            logged_step = logger.get_step(step_id)

	            # Verify all required fields are present and correct
	            assert logged_step is not None
	            assert logged_step.step_id == step_id
	            assert logged_step.session_id == session_id
	            assert logged_step.message_id == message_id
	            assert logged_step.step_type == step_type
	            assert logged_step.input_text == input_text
	            assert logged_step.model_output == model_output
	            assert logged_step.timestamp is not None
	            assert isinstance(logged_step.timestamp, datetime)
	            assert logged_step.approval_status is None  # Initially no approval
	            assert logged_step.tagging_data is None  # Initially no tagging

	    def test_interaction_step_logging_multiple_steps(self):
	        """
	        Test that multiple steps are logged correctly for a session.
	        """
	        logger = InteractionLogger()
	        session_id = "test_session_1"
	        message_id = "test_message_1"

	        # Log multiple steps (the returned step ids are not needed here)
	        for i in range(3):
	            logger.log_step(
	                session_id=session_id,
	                message_id=message_id,
	                step_type="classification",
	                input_text=f"input {i}",
	                model_output=f"output {i}",
	            )

	        # Retrieve all session logs
	        session_logs = logger.get_session_logs(session_id)

	        # Verify all steps are logged
	        assert len(session_logs) == 3
	        for i, log in enumerate(session_logs):
	            assert log.input_text == f"input {i}"
	            assert log.model_output == f"output {i}"

	    def test_interaction_step_logging_preserves_order(self):
	        """
	        Test that logged steps are retrieved in the order they were logged.
	        """
	        logger = InteractionLogger()
	        session_id = "test_session_order"

	        # Log steps in order (the returned step ids are not needed here)
	        for i in range(5):
	            logger.log_step(
	                session_id=session_id,
	                message_id=f"msg_{i}",
	                step_type="classification",
	                input_text=f"input_{i}",
	                model_output=f"output_{i}",
	            )

	        # Retrieve logs
	        session_logs = logger.get_session_logs(session_id)

	        # Verify order is preserved
	        assert len(session_logs) == 5
	        for i, log in enumerate(session_logs):
	            assert log.message_id == f"msg_{i}"
	            assert log.input_text == f"input_{i}"

	    def test_interaction_step_logging_by_type(self):
	        """
	        Test filtering logs by step type.
	        """
	        logger = InteractionLogger()
	        session_id = "test_session_types"

	        # Log different types of steps
	        logger.log_step(session_id, "msg1", "classification", "input1", "output1")
	        logger.log_step(session_id, "msg2", "explanation", "input2", "output2")
	        logger.log_step(session_id, "msg3", "classification", "input3", "output3")
	        logger.log_step(session_id, "msg4", "referral", "input4", "output4")

	        # Filter by type
	        classification_logs = logger.get_session_logs_by_type(session_id, "classification")
	        explanation_logs = logger.get_session_logs_by_type(session_id, "explanation")
	        referral_logs = logger.get_session_logs_by_type(session_id, "referral")

	        # Verify filtering
	        assert len(classification_logs) == 2
	        assert len(explanation_logs) == 1
	        assert len(referral_logs) == 1

	    def test_interaction_step_logging_message_logs(self):
	        """
	        Test retrieving logs for a specific message across sessions.
	        """
	        logger = InteractionLogger()
	        message_id = "shared_message"

	        # Log same message in different sessions
	        logger.log_step("session1", message_id, "classification", "input1", "output1")
	        logger.log_step("session2", message_id, "explanation", "input2", "output2")
	        logger.log_step("session1", "other_msg", "referral", "input3", "output3")

	        # Get logs for the message
	        message_logs = logger.get_message_logs(message_id)

	        # Verify we get logs from both sessions
	        assert len(message_logs) == 2
	        assert all(log.message_id == message_id for log in message_logs)

	    def test_interaction_step_logging_empty_strings(self):
	        """
	        Test that empty input/output strings are logged correctly.
	        """
	        logger = InteractionLogger()

	        step_id = logger.log_step(
	            session_id="test_session",
	            message_id="test_msg",
	            step_type="classification",
	            input_text="",
	            model_output="",
	        )

	        logged_step = logger.get_step(step_id)

	        assert logged_step.input_text == ""
	        assert logged_step.model_output == ""

	    def test_interaction_step_logging_long_text(self):
	        """
	        Test that long input/output text is logged correctly.
	        """
	        logger = InteractionLogger()
	        long_text = "x" * 10000

	        step_id = logger.log_step(
	            session_id="test_session",
	            message_id="test_msg",
	            step_type="classification",
	            input_text=long_text,
	            model_output=long_text,
	        )

	        logged_step = logger.get_step(step_id)

	        assert logged_step.input_text == long_text
	        assert logged_step.model_output == long_text
	        assert len(logged_step.input_text) == 10000

	    def test_interaction_step_logging_special_characters(self):
	        """
	        Test that special characters in input/output are preserved.
	        """
	        logger = InteractionLogger()
	        # The backslash is escaped explicitly: a bare "\|" is an invalid escape
	        # sequence (warning today, error in a future Python). The resulting
	        # string still contains a single backslash followed by "|".
	        special_text = "Test with special chars: !@#$%^&*()_+-=[]{}\\|;:',.<>?/~`"

	        step_id = logger.log_step(
	            session_id="test_session",
	            message_id="test_msg",
	            step_type="classification",
	            input_text=special_text,
	            model_output=special_text,
	        )

	        logged_step = logger.get_step(step_id)

	        assert logged_step.input_text == special_text
	        assert logged_step.model_output == special_text

	    def test_interaction_step_logging_unicode(self):
	        """
	        Test that Unicode characters in input/output are preserved.
	        """
	        logger = InteractionLogger()
	        unicode_text = "Test with Unicode: 你好世界 🌍 Привет мир"

	        step_id = logger.log_step(
	            session_id="test_session",
	            message_id="test_msg",
	            step_type="classification",
	            input_text=unicode_text,
	            model_output=unicode_text,
	        )

	        logged_step = logger.get_step(step_id)

	        assert logged_step.input_text == unicode_text
	        assert logged_step.model_output == unicode_text

	    def test_interaction_step_logging_statistics(self):
	        """
	        Test that session statistics are calculated correctly.
	        """
	        logger = InteractionLogger()
	        session_id = "test_session_stats"

	        # Log some steps
	        logger.log_step(session_id, "msg1", "classification", "input1", "output1")
	        logger.log_step(session_id, "msg2", "explanation", "input2", "output2")
	        logger.log_step(session_id, "msg3", "referral", "input3", "output3")

	        # Get statistics
	        stats = logger.get_session_statistics(session_id)

	        # Verify statistics: nothing has been approved or disapproved yet
	        assert stats["session_id"] == session_id
	        assert stats["total_steps"] == 3
	        assert stats["approved_steps"] == 0
	        assert stats["disapproved_steps"] == 0
	        assert stats["unapproved_steps"] == 3
	        assert stats["steps_by_type"]["classification"] == 1
	        assert stats["steps_by_type"]["explanation"] == 1
	        assert stats["steps_by_type"]["referral"] == 1

	    def test_interaction_step_logging_invalid_step_type(self):
	        """
	        Test that invalid step types raise an error.
	        """
	        logger = InteractionLogger()

	        with pytest.raises(ValueError):
	            logger.log_step(
	                session_id="test_session",
	                message_id="test_msg",
	                step_type="invalid_type",
	                input_text="input",
	                model_output="output",
	            )

	    def test_interaction_step_logging_nonexistent_step(self):
	        """
	        Test that retrieving a nonexistent step returns None.
	        """
	        logger = InteractionLogger()

	        result = logger.get_step("nonexistent_step_id")

	        assert result is None

	    def test_interaction_step_logging_empty_session(self):
	        """
	        Test that retrieving logs for an empty session returns empty list.
	        """
	        logger = InteractionLogger()

	        session_logs = logger.get_session_logs("nonexistent_session")

	        assert session_logs == []

	    def test_interaction_step_logging_export(self):
	        """
	        Test that session logs can be exported as dictionaries.
	        """
	        logger = InteractionLogger()
	        session_id = "test_session_export"

	        # Log some steps
	        logger.log_step(session_id, "msg1", "classification", "input1", "output1")
	        logger.log_step(session_id, "msg2", "explanation", "input2", "output2")

	        # Export logs
	        exported = logger.export_session_logs(session_id)

	        # Verify export
	        assert len(exported) == 2
	        assert all(isinstance(log, dict) for log in exported)
	        assert all("step_id" in log for log in exported)
	        assert all("input_text" in log for log in exported)
	        assert all("model_output" in log for log in exported)
	        assert all("timestamp" in log for log in exported)


	class TestFeedbackLogging:
	    """
	    Feature: chaplain-feedback-system, Property 15: Feedback Logging Complete

	    Tests that feedback logging correctly records approval/disapproval status
	    with tagging categories and comments.
	    """

	    @staticmethod
	    def _log_step(logger, session_id, message_id="msg1", step_type="classification"):
	        """
	        Log a single step with the fixed payloads "input"/"output".

	        Shared by the tests below, which only care about what happens to the
	        step after it has been logged. The name does not start with ``test``,
	        so pytest does not collect it as a test.

	        Args:
	            logger: The InteractionLogger instance to log into.
	            session_id: Session identifier passed to ``log_step``.
	            message_id: Message identifier passed to ``log_step``.
	            step_type: Step type passed to ``log_step``.

	        Returns:
	            The step id returned by ``logger.log_step``.
	        """
	        return logger.log_step(
	            session_id=session_id,
	            message_id=message_id,
	            step_type=step_type,
	            input_text="input",
	            model_output="output",
	        )

	    def test_feedback_logging_approved_status(self):
	        """
	        Feature: chaplain-feedback-system, Property 15: Feedback Logging Complete
	        Validates: Requirements 7.3, 7.4

	        For any feedback, the log should record approval status.
	        """
	        logger = InteractionLogger()

	        # Log a step
	        step_id = self._log_step(logger, "test_session_feedback")

	        # Update with approved status
	        logger.update_approval(step_id, "approved")

	        # Retrieve and verify
	        logged_step = logger.get_step(step_id)
	        assert logged_step.approval_status == "approved"
	        assert logged_step.tagging_data is None

	    def test_feedback_logging_disapproved_status(self):
	        """
	        Test that disapproved status is recorded correctly.
	        """
	        logger = InteractionLogger()

	        # Log a step
	        step_id = self._log_step(logger, "test_session_feedback")

	        # Update with disapproved status
	        logger.update_approval(step_id, "disapproved")

	        # Retrieve and verify
	        logged_step = logger.get_step(step_id)
	        assert logged_step.approval_status == "disapproved"

	    @given(tagging_record_strategy())
	    @settings(max_examples=100)
	    def test_feedback_logging_with_tagging_data(self, tagging_record):
	        """
	        Feature: chaplain-feedback-system, Property 15: Feedback Logging Complete
	        Validates: Requirements 7.3, 7.4

	        For any chaplain feedback, the log should contain: approval/disapproval status,
	        and if disapproved, the tagging categories and comments.
	        """
	        # A new logger is created for every generated example.
	        logger = InteractionLogger()

	        # Log a step for the generated record's message
	        step_id = self._log_step(
	            logger,
	            "test_session_tagging",
	            message_id=tagging_record.message_id,
	        )

	        # Update with disapproved status and tagging data
	        logger.update_approval(step_id, "disapproved", tagging_record)

	        # Retrieve and verify
	        logged_step = logger.get_step(step_id)
	        assert logged_step.approval_status == "disapproved"
	        assert logged_step.tagging_data is not None
	        assert logged_step.tagging_data.record_id == tagging_record.record_id
	        assert logged_step.tagging_data.message_id == tagging_record.message_id
	        assert logged_step.tagging_data.is_classification_correct == tagging_record.is_classification_correct
	        assert logged_step.tagging_data.question_issues == tagging_record.question_issues
	        assert logged_step.tagging_data.referral_issues == tagging_record.referral_issues

	    def test_feedback_logging_classification_subcategory(self):
	        """
	        Test that classification subcategory is recorded in tagging data.
	        """
	        logger = InteractionLogger()

	        # Create tagging record with classification subcategory
	        tagging = TaggingRecord(
	            record_id="tag1",
	            message_id="msg1",
	            is_classification_correct=False,
	            classification_subcategory="missed_indicators",
	            correct_classification="red",
	        )

	        # Log a step
	        step_id = self._log_step(logger, "test_session_classification")

	        # Update with tagging
	        logger.update_approval(step_id, "disapproved", tagging)

	        # Retrieve and verify
	        logged_step = logger.get_step(step_id)
	        assert logged_step.tagging_data.classification_subcategory == "missed_indicators"
	        assert logged_step.tagging_data.correct_classification == "red"

	    def test_feedback_logging_question_issues(self):
	        """
	        Test that question issues are recorded in tagging data.
	        """
	        logger = InteractionLogger()

	        # Create tagging record with question issues
	        tagging = TaggingRecord(
	            record_id="tag1",
	            message_id="msg1",
	            is_classification_correct=True,
	            question_issues=["inappropriate", "too_leading"],
	            question_comments="Questions were too intrusive",
	        )

	        # Log a follow-up step
	        step_id = self._log_step(logger, "test_session_questions", step_type="follow_up")

	        # Update with tagging
	        logger.update_approval(step_id, "disapproved", tagging)

	        # Retrieve and verify
	        logged_step = logger.get_step(step_id)
	        assert logged_step.tagging_data.question_issues == ["inappropriate", "too_leading"]
	        assert logged_step.tagging_data.question_comments == "Questions were too intrusive"

	    def test_feedback_logging_referral_issues(self):
	        """
	        Test that referral issues are recorded in tagging data.
	        """
	        logger = InteractionLogger()

	        # Create tagging record with referral issues
	        tagging = TaggingRecord(
	            record_id="tag1",
	            message_id="msg1",
	            is_classification_correct=True,
	            referral_issues=["incomplete_summary", "inappropriate_tone"],
	            referral_comments="Message was incomplete",
	        )

	        # Log a referral step
	        step_id = self._log_step(logger, "test_session_referral", step_type="referral")

	        # Update with tagging
	        logger.update_approval(step_id, "disapproved", tagging)

	        # Retrieve and verify
	        logged_step = logger.get_step(step_id)
	        assert logged_step.tagging_data.referral_issues == ["incomplete_summary", "inappropriate_tone"]
	        assert logged_step.tagging_data.referral_comments == "Message was incomplete"

	    def test_feedback_logging_indicator_issues(self):
	        """
	        Test that indicator issues are recorded in tagging data.
	        """
	        logger = InteractionLogger()

	        # Create tagging record with indicator issues
	        tagging = TaggingRecord(
	            record_id="tag1",
	            message_id="msg1",
	            is_classification_correct=True,
	            indicator_issues=["indicator_1", "indicator_2"],
	            indicator_comments="These indicators were incorrectly identified",
	        )

	        # Log a step
	        step_id = self._log_step(logger, "test_session_indicators")

	        # Update with tagging
	        logger.update_approval(step_id, "disapproved", tagging)

	        # Retrieve and verify
	        logged_step = logger.get_step(step_id)
	        assert logged_step.tagging_data.indicator_issues == ["indicator_1", "indicator_2"]
	        assert logged_step.tagging_data.indicator_comments == "These indicators were incorrectly identified"

	    def test_feedback_logging_general_notes(self):
	        """
	        Test that general notes are recorded in tagging data.
	        """
	        logger = InteractionLogger()

	        # Create tagging record with general notes
	        tagging = TaggingRecord(
	            record_id="tag1",
	            message_id="msg1",
	            is_classification_correct=True,
	            general_notes="Overall good classification but needs improvement in tone",
	        )

	        # Log a step
	        step_id = self._log_step(logger, "test_session_notes")

	        # Update with tagging; this test attaches tagging to an *approved* step
	        logger.update_approval(step_id, "approved", tagging)

	        # Retrieve and verify
	        logged_step = logger.get_step(step_id)
	        assert logged_step.tagging_data.general_notes == "Overall good classification but needs improvement in tone"

	    def test_feedback_logging_disapproved_steps_retrieval(self):
	        """
	        Test that disapproved steps can be retrieved from a session.
	        """
	        logger = InteractionLogger()
	        session_id = "test_session_disapproved"

	        # Log multiple steps
	        step_id_1 = logger.log_step(session_id, "msg1", "classification", "input1", "output1")
	        step_id_2 = logger.log_step(session_id, "msg2", "explanation", "input2", "output2")
	        step_id_3 = logger.log_step(session_id, "msg3", "referral", "input3", "output3")

	        # Approve first, disapprove second and third
	        logger.update_approval(step_id_1, "approved")
	        logger.update_approval(step_id_2, "disapproved")
	        logger.update_approval(step_id_3, "disapproved")

	        # Get disapproved steps
	        disapproved = logger.get_disapproved_steps(session_id)

	        # Verify
	        assert len(disapproved) == 2
	        assert all(log.approval_status == "disapproved" for log in disapproved)

	    def test_feedback_logging_unapproved_steps_retrieval(self):
	        """
	        Test that unapproved steps can be retrieved from a session.
	        """
	        logger = InteractionLogger()
	        session_id = "test_session_unapproved"

	        # Log multiple steps; only the first step's id is needed afterwards
	        step_id_1 = logger.log_step(session_id, "msg1", "classification", "input1", "output1")
	        logger.log_step(session_id, "msg2", "explanation", "input2", "output2")
	        logger.log_step(session_id, "msg3", "referral", "input3", "output3")

	        # Approve first, leave others unapproved
	        logger.update_approval(step_id_1, "approved")

	        # Get unapproved steps
	        unapproved = logger.get_unapproved_steps(session_id)

	        # Verify
	        assert len(unapproved) == 2
	        assert all(log.approval_status is None for log in unapproved)

	    def test_feedback_logging_invalid_approval_status(self):
	        """
	        Test that invalid approval status raises an error.
	        """
	        logger = InteractionLogger()

	        # Log a step (outside the raises-block so only update_approval is checked)
	        step_id = self._log_step(logger, "test_session_invalid")

	        # Try to update with invalid status
	        with pytest.raises(ValueError):
	            logger.update_approval(step_id, "invalid_status")

	    def test_feedback_logging_nonexistent_step(self):
	        """
	        Test that updating a nonexistent step raises an error.
	        """
	        logger = InteractionLogger()

	        with pytest.raises(ValueError):
	            logger.update_approval("nonexistent_step", "approved")

	    def test_feedback_logging_export_with_tagging(self):
	        """
	        Test that exported logs include tagging data.
	        """
	        logger = InteractionLogger()
	        session_id = "test_session_export_tagging"

	        # Create tagging record
	        tagging = TaggingRecord(
	            record_id="tag1",
	            message_id="msg1",
	            is_classification_correct=False,
	            classification_subcategory="missed_indicators",
	            correct_classification="red",
	            general_notes="Missed key indicators",
	        )

	        # Log a step
	        step_id = self._log_step(logger, session_id)

	        # Update with tagging
	        logger.update_approval(step_id, "disapproved", tagging)

	        # Export logs
	        exported = logger.export_session_logs(session_id)

	        # Verify export includes tagging data
	        assert len(exported) == 1
	        assert exported[0]["approval_status"] == "disapproved"
	        assert exported[0]["tagging_data"] is not None
	        assert exported[0]["tagging_data"]["classification_subcategory"] == "missed_indicators"
	        assert exported[0]["tagging_data"]["correct_classification"] == "red"
	        assert exported[0]["tagging_data"]["general_notes"] == "Missed key indicators"