""" Test script for Empathy Evaluators (ER, IP, EX) Usage: python tests/test_evaluators/test_empathy_evaluators.py """ import sys import os # Add parent directory to path for imports sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) from evaluators import create_evaluator from custom_types import Utterance # Test conversation with seeker-response pairs test_conversation: list[Utterance] = [ { "speaker": "Patient", "text": "I've been feeling really anxious lately and can't sleep." }, { "speaker": "Therapist", "text": "I understand how difficult that must be. Have you tried any relaxation techniques?" }, { "speaker": "Patient", "text": "I'm struggling with depression." }, { "speaker": "Therapist", "text": "Just think positive thoughts!" }, { "speaker": "Patient", "text": "I feel like nobody understands me." }, { "speaker": "Therapist", "text": "It sounds like you're feeling very isolated. Can you tell me more about what makes you feel that way?" } ] def test_single_evaluator(metric_name: str, label: str): """Test a single empathy evaluator.""" print(f"\n{'='*80}") print(f"Testing {label}") print(f"{'='*80}") try: # Create evaluator print(f"Creating {metric_name} evaluator...") evaluator = create_evaluator(metric_name) if not evaluator: print(f"❌ Failed to create evaluator for {metric_name}") return False print(f"✓ Evaluator created successfully") print(f" Model: {evaluator.MODEL_NAME}") # Execute evaluation print(f"\nEvaluating conversation ({len(test_conversation)} utterances)...") result = evaluator.execute(test_conversation) # Check result structure assert result["granularity"] == "utterance", "Expected utterance-level granularity" assert result["per_utterance"] is not None, "Expected per_utterance results" assert len(result["per_utterance"]) == len(test_conversation), "Mismatch in result count" print(f"✓ Evaluation complete") # Display results print(f"\nResults:") for i, (utt, utt_result) in enumerate(zip(test_conversation, result["per_utterance"])): print(f"\n Utterance {i+1}:") print(f" Speaker: {utt['speaker']}") print(f" Text: {utt['text'][:60]}{'...' if len(utt['text']) > 60 else ''}") if metric_name in utt_result["metrics"]: score = utt_result["metrics"][metric_name] print(f" {metric_name}: {score['label']} (confidence: {score['confidence']:.3f})") else: print(f" {metric_name}: (not evaluated)") print(f"\n✅ {label} test passed!") return True except Exception as e: print(f"\n❌ Error testing {label}: {e}") import traceback traceback.print_exc() return False def test_all_empathy_evaluators(): """Test all three empathy evaluators.""" print("\n" + "="*80) print("Testing All Empathy Evaluators") print("="*80) evaluators = [ ("empathy_er", "Empathy ER (Emotional Reaction)"), ("empathy_ip", "Empathy IP (Interpretation)"), ("empathy_ex", "Empathy EX (Exploration)") ] results = {} for metric_name, label in evaluators: results[metric_name] = test_single_evaluator(metric_name, label) # Summary print(f"\n{'='*80}") print("Test Summary") print(f"{'='*80}") for metric_name, label in evaluators: status = "✅ PASSED" if results[metric_name] else "❌ FAILED" print(f" {label}: {status}") total = len(evaluators) passed = sum(results.values()) print(f"\nTotal: {passed}/{total} tests passed") print(f"{'='*80}\n") return all(results.values()) if __name__ == "__main__": success = test_all_empathy_evaluators() sys.exit(0 if success else 1)