"""
Test script for Empathy Evaluators (ER, IP, EX)

Usage:
    python tests/test_evaluators/test_empathy_evaluators.py
"""
import sys
import os

# Add parent directory to path for imports
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

from evaluators import create_evaluator
from custom_types import Utterance


# Test conversation with seeker-response pairs
# Fixture of three Patient/Therapist exchanges chosen to vary in empathic
# quality: a supportive acknowledgement, a dismissive platitude, and an
# exploratory open question — so each evaluator has material to score.
test_conversation: list[Utterance] = [
    {
        "speaker": "Patient",
        "text": "I've been feeling really anxious lately and can't sleep."
    },
    {
        # Supportive: validates feelings and offers a gentle suggestion.
        "speaker": "Therapist",
        "text": "I understand how difficult that must be. Have you tried any relaxation techniques?"
    },
    {
        "speaker": "Patient",
        "text": "I'm struggling with depression."
    },
    {
        # Dismissive: a low-empathy response on purpose.
        "speaker": "Therapist",
        "text": "Just think positive thoughts!"
    },
    {
        "speaker": "Patient",
        "text": "I feel like nobody understands me."
    },
    {
        # Exploratory: reflects the feeling and invites elaboration.
        "speaker": "Therapist",
        "text": "It sounds like you're feeling very isolated. Can you tell me more about what makes you feel that way?"
    }
]


def test_single_evaluator(metric_name: str, label: str):
    """Run one empathy evaluator over the shared test conversation.

    Creates the evaluator named by *metric_name*, executes it on
    ``test_conversation``, sanity-checks the result structure, and prints
    per-utterance scores. Returns True on success, False when creation
    fails or anything raises (the traceback is printed, not re-raised).
    """
    banner = "=" * 80
    print("\n" + banner)
    print(f"Testing {label}")
    print(banner)
    
    try:
        # Build the evaluator under test.
        print(f"Creating {metric_name} evaluator...")
        evaluator = create_evaluator(metric_name)
        if not evaluator:
            print(f"❌ Failed to create evaluator for {metric_name}")
            return False

        print("✓ Evaluator created successfully")
        print(f"  Model: {evaluator.MODEL_NAME}")

        # Run the evaluation over the whole fixture conversation.
        print(f"\nEvaluating conversation ({len(test_conversation)} utterances)...")
        result = evaluator.execute(test_conversation)

        # Structural invariants: utterance-level results, one per utterance.
        assert result["granularity"] == "utterance", "Expected utterance-level granularity"
        assert result["per_utterance"] is not None, "Expected per_utterance results"
        assert len(result["per_utterance"]) == len(test_conversation), "Mismatch in result count"

        print("✓ Evaluation complete")

        # Pretty-print each utterance alongside its score (if any).
        print("\nResults:")
        paired = zip(test_conversation, result["per_utterance"])
        for idx, (utterance, utt_scores) in enumerate(paired, start=1):
            print(f"\n  Utterance {idx}:")
            print(f"    Speaker: {utterance['speaker']}")
            text = utterance["text"]
            suffix = "..." if len(text) > 60 else ""
            print(f"    Text: {text[:60]}{suffix}")

            if metric_name in utt_scores["metrics"]:
                score = utt_scores["metrics"][metric_name]
                print(f"    {metric_name}: {score['label']} (confidence: {score['confidence']:.3f})")
            else:
                print(f"    {metric_name}: (not evaluated)")

        print(f"\n✅ {label} test passed!")
        return True
        
    except Exception as exc:
        # Best-effort harness: report and keep going so other metrics still run.
        print(f"\n❌ Error testing {label}: {exc}")
        import traceback
        traceback.print_exc()
        return False


def test_all_empathy_evaluators():
    """Run every empathy evaluator (ER, IP, EX) and print a summary.

    Returns True only when all three individual tests pass.
    """
    header = "=" * 80
    print("\n" + header)
    print("Testing All Empathy Evaluators")
    print(header)

    cases = (
        ("empathy_er", "Empathy ER (Emotional Reaction)"),
        ("empathy_ip", "Empathy IP (Interpretation)"),
        ("empathy_ex", "Empathy EX (Exploration)"),
    )

    # Run each evaluator in declaration order, recording pass/fail per metric.
    outcomes = {name: test_single_evaluator(name, label) for name, label in cases}

    # Summary
    print(f"\n{header}")
    print("Test Summary")
    print(header)

    for name, label in cases:
        status = "✅ PASSED" if outcomes[name] else "❌ FAILED"
        print(f"  {label}: {status}")

    # True sums as 1, so this counts the passes.
    print(f"\nTotal: {sum(outcomes.values())}/{len(cases)} tests passed")
    print(header + "\n")

    return all(outcomes.values())


if __name__ == "__main__":
    # CI-friendly exit status: 0 when every evaluator passed, 1 otherwise.
    sys.exit(0 if test_all_empathy_evaluators() else 1)