github-actions[bot]
Deploy from GitHub Actions (commit: 8b247ffacd77c0672965b8378f1d52a7dcd187ae)
9366995
"""
Test script for Empathy Evaluators (ER, IP, EX)

Usage:
    python tests/test_evaluators/test_empathy_evaluators.py
"""
import sys
import os

# Make the repository root importable so the project-local packages below
# resolve when this file is run directly as a script (two levels up from
# tests/test_evaluators/).
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

# Project-local imports (require the sys.path insertion above).
from evaluators import create_evaluator
from custom_types import Utterance
# Sample therapy dialogue used by every evaluator test below: alternating
# seeker ("Patient") and responder ("Therapist") turns, covering one
# empathetic, one dismissive, and one exploratory therapist reply.
test_conversation: list[Utterance] = [
    {"speaker": speaker, "text": text}
    for speaker, text in [
        ("Patient", "I've been feeling really anxious lately and can't sleep."),
        ("Therapist", "I understand how difficult that must be. Have you tried any relaxation techniques?"),
        ("Patient", "I'm struggling with depression."),
        ("Therapist", "Just think positive thoughts!"),
        ("Patient", "I feel like nobody understands me."),
        ("Therapist", "It sounds like you're feeling very isolated. Can you tell me more about what makes you feel that way?"),
    ]
]
def test_single_evaluator(metric_name: str, label: str):
    """Run one empathy evaluator over the sample conversation.

    Creates the evaluator named by ``metric_name``, executes it against the
    module-level ``test_conversation``, sanity-checks the result structure,
    and prints per-utterance scores.

    Returns True on success, False on any failure (creation failure,
    assertion failure, or unexpected exception).
    """
    banner = "=" * 80
    print(f"\n{banner}")
    print(f"Testing {label}")
    print(banner)
    try:
        # Build the evaluator under test.
        print(f"Creating {metric_name} evaluator...")
        evaluator = create_evaluator(metric_name)
        if not evaluator:
            print(f"❌ Failed to create evaluator for {metric_name}")
            return False
        print("✓ Evaluator created successfully")
        print(f" Model: {evaluator.MODEL_NAME}")

        # Run the evaluation over the whole conversation.
        print(f"\nEvaluating conversation ({len(test_conversation)} utterances)...")
        result = evaluator.execute(test_conversation)

        # Structural sanity checks on the result payload.
        assert result["granularity"] == "utterance", "Expected utterance-level granularity"
        assert result["per_utterance"] is not None, "Expected per_utterance results"
        assert len(result["per_utterance"]) == len(test_conversation), "Mismatch in result count"
        print("✓ Evaluation complete")

        # Pretty-print one entry per utterance, truncating long texts.
        print("\nResults:")
        for idx, (utterance, utt_result) in enumerate(
            zip(test_conversation, result["per_utterance"]), start=1
        ):
            print(f"\n Utterance {idx}:")
            print(f" Speaker: {utterance['speaker']}")
            snippet = utterance['text'][:60]
            suffix = '...' if len(utterance['text']) > 60 else ''
            print(f" Text: {snippet}{suffix}")
            metrics = utt_result["metrics"]
            if metric_name in metrics:
                score = metrics[metric_name]
                print(f" {metric_name}: {score['label']} (confidence: {score['confidence']:.3f})")
            else:
                print(f" {metric_name}: (not evaluated)")

        print(f"\n✅ {label} test passed!")
        return True
    except Exception as e:
        # Report and swallow: callers only need the boolean outcome.
        print(f"\n❌ Error testing {label}: {e}")
        import traceback
        traceback.print_exc()
        return False
def test_all_empathy_evaluators():
    """Run every empathy evaluator (ER, IP, EX) and print a summary.

    Returns True only if all three individual evaluator tests pass.
    """
    divider = "=" * 80
    print("\n" + divider)
    print("Testing All Empathy Evaluators")
    print(divider)

    # (metric_name, human-readable label) for each evaluator under test.
    cases = [
        ("empathy_er", "Empathy ER (Emotional Reaction)"),
        ("empathy_ip", "Empathy IP (Interpretation)"),
        ("empathy_ex", "Empathy EX (Exploration)"),
    ]

    # Run each evaluator and record its pass/fail outcome.
    outcomes = {name: test_single_evaluator(name, label) for name, label in cases}

    # Per-evaluator verdicts followed by an aggregate count.
    print(f"\n{divider}")
    print("Test Summary")
    print(divider)
    for name, label in cases:
        verdict = "✅ PASSED" if outcomes[name] else "❌ FAILED"
        print(f" {label}: {verdict}")

    passed = sum(outcomes.values())
    print(f"\nTotal: {passed}/{len(cases)} tests passed")
    print(f"{divider}\n")
    return all(outcomes.values())
if __name__ == "__main__":
    # Process exit code: 0 when every evaluator test passed, 1 otherwise.
    raise SystemExit(0 if test_all_empathy_evaluators() else 1)