"""
Test script for Empathy Evaluators (ER, IP, EX)
Usage:
python tests/test_evaluators/test_empathy_evaluators.py
"""
import sys
import os
import traceback
# Add parent directory to path for imports
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from evaluators import create_evaluator
from custom_types import Utterance
# Test conversation with seeker-response pairs
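# The therapist replies deliberately vary in quality: one reflective and
# supportive, one dismissive ("Just think positive thoughts!"), and one
# open exploratory question, so the evaluators should produce visibly
# different labels across the pairs.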
test_conversation: list[Utterance] = [
    {
        "speaker": "Patient",
        "text": "I've been feeling really anxious lately and can't sleep."
    },
    {
        "speaker": "Therapist",
        "text": "I understand how difficult that must be. Have you tried any relaxation techniques?"
    },
    {
        "speaker": "Patient",
        "text": "I'm struggling with depression."
    },
    {
        "speaker": "Therapist",
        "text": "Just think positive thoughts!"
    },
    {
        "speaker": "Patient",
        "text": "I feel like nobody understands me."
    },
    {
        "speaker": "Therapist",
        "text": "It sounds like you're feeling very isolated. Can you tell me more about what makes you feel that way?"
    }
]

def test_single_evaluator(metric_name: str, label: str):
    """Test a single empathy evaluator."""
    print(f"\n{'='*80}")
    print(f"Testing {label}")
    print(f"{'='*80}")
    try:
        # Create evaluator
        print(f"Creating {metric_name} evaluator...")
        evaluator = create_evaluator(metric_name)
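        # create_evaluator is assumed to return a falsy value (e.g., None)
        # for unknown metric names; the guard below treats that as failure.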
        if not evaluator:
            print(f"❌ Failed to create evaluator for {metric_name}")
            return False
        print("✓ Evaluator created successfully")
        print(f"  Model: {evaluator.MODEL_NAME}")
        # Execute evaluation
        print(f"\nEvaluating conversation ({len(test_conversation)} utterances)...")
        result = evaluator.execute(test_conversation)
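        # Expected result shape, inferred from the assertions and access
        # patterns below (not a documented schema):
        #   {"granularity": "utterance",
        #    "per_utterance": [{"metrics": {metric_name: {"label": ..., "confidence": ...}}}, ...]}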
        # Check result structure
        assert result["granularity"] == "utterance", "Expected utterance-level granularity"
        assert result["per_utterance"] is not None, "Expected per_utterance results"
        assert len(result["per_utterance"]) == len(test_conversation), "Mismatch in result count"
        print("✓ Evaluation complete")
        # Display results
        print("\nResults:")
        for i, (utt, utt_result) in enumerate(zip(test_conversation, result["per_utterance"])):
            print(f"\n  Utterance {i+1}:")
            print(f"    Speaker: {utt['speaker']}")
            print(f"    Text: {utt['text'][:60]}{'...' if len(utt['text']) > 60 else ''}")
            if metric_name in utt_result["metrics"]:
                score = utt_result["metrics"][metric_name]
                print(f"    {metric_name}: {score['label']} (confidence: {score['confidence']:.3f})")
            else:
                print(f"    {metric_name}: (not evaluated)")
        print(f"\n✅ {label} test passed!")
        return True
    except Exception as e:
        print(f"\n❌ Error testing {label}: {e}")
        traceback.print_exc()
        return False

def test_all_empathy_evaluators():
    """Test all three empathy evaluators."""
    print("\n" + "="*80)
    print("Testing All Empathy Evaluators")
    print("="*80)
    evaluators = [
        ("empathy_er", "Empathy ER (Emotional Reaction)"),
        ("empathy_ip", "Empathy IP (Interpretation)"),
        ("empathy_ex", "Empathy EX (Exploration)")
    ]
    results = {}
    for metric_name, label in evaluators:
        results[metric_name] = test_single_evaluator(metric_name, label)
    # Summary
    print(f"\n{'='*80}")
    print("Test Summary")
    print(f"{'='*80}")
    for metric_name, label in evaluators:
        status = "✅ PASSED" if results[metric_name] else "❌ FAILED"
        print(f"  {label}: {status}")
    total = len(evaluators)
    passed = sum(results.values())
    print(f"\nTotal: {passed}/{total} tests passed")
    print(f"{'='*80}\n")
    return all(results.values())

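# A non-zero exit status lets a CI job or shell caller detect failure.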
if __name__ == "__main__":
    success = test_all_empathy_evaluators()
    sys.exit(0 if success else 1)