"""
Test script for Empathy Evaluators (ER, IP, EX)
Usage:
python tests/test_evaluators/test_empathy_evaluators.py
"""
import sys
import os

# Add parent directory to path for imports
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

from evaluators import create_evaluator
from custom_types import Utterance
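
# Note: Utterance is assumed to be a dict-like type (e.g. a TypedDict defined in
# custom_types) with at least "speaker" and "text" keys, which is how it is used below.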
# Test conversation with seeker-response pairs
test_conversation: list[Utterance] = [
    {
        "speaker": "Patient",
        "text": "I've been feeling really anxious lately and can't sleep."
    },
    {
        "speaker": "Therapist",
        "text": "I understand how difficult that must be. Have you tried any relaxation techniques?"
    },
    {
        "speaker": "Patient",
        "text": "I'm struggling with depression."
    },
    {
        "speaker": "Therapist",
        "text": "Just think positive thoughts!"
    },
    {
        "speaker": "Patient",
        "text": "I feel like nobody understands me."
    },
    {
        "speaker": "Therapist",
        "text": "It sounds like you're feeling very isolated. Can you tell me more about what makes you feel that way?"
    }
]
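
# Expected evaluator result shape, inferred from the assertions in
# test_single_evaluator below; the authoritative schema is whatever the
# evaluator implementations return:
#   {
#       "granularity": "utterance",
#       "per_utterance": [
#           {"metrics": {<metric_name>: {"label": str, "confidence": float}}},
#           ...  # one entry per input utterance
#       ]
#   }
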
def test_single_evaluator(metric_name: str, label: str):
    """Test a single empathy evaluator."""
    print(f"\n{'='*80}")
    print(f"Testing {label}")
    print(f"{'='*80}")

    try:
        # Create evaluator
        print(f"Creating {metric_name} evaluator...")
        evaluator = create_evaluator(metric_name)
        if not evaluator:
            print(f"❌ Failed to create evaluator for {metric_name}")
            return False
        print(f"✓ Evaluator created successfully")
        print(f"  Model: {evaluator.MODEL_NAME}")

        # Execute evaluation
        print(f"\nEvaluating conversation ({len(test_conversation)} utterances)...")
        result = evaluator.execute(test_conversation)

        # Check result structure
        assert result["granularity"] == "utterance", "Expected utterance-level granularity"
        assert result["per_utterance"] is not None, "Expected per_utterance results"
        assert len(result["per_utterance"]) == len(test_conversation), "Mismatch in result count"
        print(f"✓ Evaluation complete")

        # Display results
        print(f"\nResults:")
        for i, (utt, utt_result) in enumerate(zip(test_conversation, result["per_utterance"])):
            print(f"\n  Utterance {i+1}:")
            print(f"    Speaker: {utt['speaker']}")
            print(f"    Text: {utt['text'][:60]}{'...' if len(utt['text']) > 60 else ''}")
            if metric_name in utt_result["metrics"]:
                score = utt_result["metrics"][metric_name]
                print(f"    {metric_name}: {score['label']} (confidence: {score['confidence']:.3f})")
            else:
                print(f"    {metric_name}: (not evaluated)")

        print(f"\n✅ {label} test passed!")
        return True

    except Exception as e:
        print(f"\n❌ Error testing {label}: {e}")
        import traceback
        traceback.print_exc()
        return False


def test_all_empathy_evaluators():
    """Test all three empathy evaluators."""
    print("\n" + "="*80)
    print("Testing All Empathy Evaluators")
    print("="*80)
    evaluators = [
        ("empathy_er", "Empathy ER (Emotional Reaction)"),
        ("empathy_ip", "Empathy IP (Interpretation)"),
        ("empathy_ex", "Empathy EX (Exploration)")
    ]

    results = {}
    for metric_name, label in evaluators:
        results[metric_name] = test_single_evaluator(metric_name, label)

    # Summary
    print(f"\n{'='*80}")
    print("Test Summary")
    print(f"{'='*80}")
    for metric_name, label in evaluators:
        status = "✅ PASSED" if results[metric_name] else "❌ FAILED"
        print(f"  {label}: {status}")

    total = len(evaluators)
    passed = sum(results.values())
    print(f"\nTotal: {passed}/{total} tests passed")
    print(f"{'='*80}\n")

    return all(results.values())


if __name__ == "__main__":
    success = test_all_empathy_evaluators()
    sys.exit(0 if success else 1)