File size: 10,001 Bytes
81aa69d
 
 
 
 
 
 
6c591d0
81aa69d
 
 
 
 
 
 
 
 
 
 
 
 
6c591d0
81aa69d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3e4c834
81aa69d
 
 
 
6c591d0
 
 
81aa69d
 
 
 
 
 
 
 
 
 
 
 
 
6c591d0
81aa69d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c591d0
 
 
 
 
81aa69d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c591d0
81aa69d
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
"""
Validation / smoke-test script for the Customer Support Environment.

Runs through all 3 tasks with deterministic responses and verifies:
  ✓ reset() returns valid SupportObservation
  ✓ step() returns (observation, reward, done, info) with correct types
  ✓ state() returns valid SupportState
  ✓ Rewards are non-constant and in (0.0, 1.0) strict open interval
  ✓ Episodes terminate correctly
  ✓ Grader produces varying scores for different responses

Usage:
    python validate.py
"""

import sys
import os

# Ensure project root is on path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from models import SupportAction, SupportObservation, SupportState, RewardBreakdown, safe_score
from server.environment import CustomerSupportEnvironment
from tasks import TASK_IDS


def validate_task(env: CustomerSupportEnvironment, task_id: str, responses: list[str]) -> dict:
    """Run one task with scripted replies and check the environment's API contract.

    The environment is reset for ``task_id`` and each entry of ``responses``
    is sent as one step. Every reply uses ``action_type="respond"`` except the
    last one in the list, which uses ``"resolve"``. The loop stops early as
    soon as ``step()`` reports ``done``.

    Args:
        env: Environment instance to exercise; it is reset by this function.
        task_id: Identifier of the task to load.
        responses: Agent replies to send, in order.

    Returns:
        A dict with the keys ``task_id``, ``rewards`` (list of per-step
        rewards), ``avg_reward`` (``safe_score`` applied to the mean reward,
        or 0.5 if ``rewards`` is empty) and ``steps`` (number of steps taken).

    Raises:
        AssertionError: If a type, reward-range or state check fails.
    """
    banner = "=" * 50
    print(f"\n{banner}")
    print(f"  Validating: {task_id}")
    print(banner)

    # reset() must hand back a non-terminal observation for the requested task.
    obs = env.reset(task_id=task_id)
    assert isinstance(obs, SupportObservation), f"reset() must return SupportObservation, got {type(obs)}"
    assert obs.task_id == task_id, f"task_id mismatch: {obs.task_id} != {task_id}"
    assert not obs.done, "Episode should not be done after reset"
    assert obs.current_message, "Initial customer message should not be empty"
    print("  ✓ reset() returned valid SupportObservation")
    print(f"    Customer: {obs.ticket.customer_name}")
    print(f"    Subject:  {obs.ticket.subject}")
    print(f"    Message:  {obs.current_message[:60]}...")

    # state() straight after reset must describe an untouched episode.
    state = env.state()
    assert isinstance(state, SupportState), f"state() must return SupportState, got {type(state)}"
    assert state.step_count == 0, "Step count should be 0 after reset"
    assert not state.done, "State should not be done after reset"
    print("  ✓ state() returned valid SupportState")

    # Send the scripted replies; only the last one in the list is a "resolve".
    rewards = []
    final_step = len(responses)
    for step_no, response_text in enumerate(responses, start=1):
        action = SupportAction(
            response_text=response_text,
            action_type="resolve" if step_no == final_step else "respond",
        )
        obs, reward, done, info = env.step(action)

        assert isinstance(obs, SupportObservation), "step() obs must be SupportObservation"
        assert isinstance(reward, float), f"step() reward must be float, got {type(reward)}"
        assert isinstance(done, bool), f"step() done must be bool, got {type(done)}"
        assert isinstance(info, dict), f"step() info must be dict, got {type(info)}"
        assert 0.0 < reward < 1.0, f"Reward {reward} out of strict (0.0, 1.0) range"

        rewards.append(reward)
        # Components missing from the breakdown are shown using a 0.5 default.
        breakdown = info.get("reward_breakdown", {})
        print(
            f"  ✓ step({step_no}) → reward={reward:.4f} | "
            f"correctness={safe_score(breakdown.get('correctness', 0.5)):.2f} "
            f"tone={safe_score(breakdown.get('tone', 0.5)):.2f} "
            f"completeness={safe_score(breakdown.get('completeness', 0.5)):.2f} "
            f"done={done}"
        )

        if done:
            break

    # At least one step must have been recorded by the environment.
    state = env.state()
    assert state.step_count > 0, "Step count should be > 0 after steps"
    print(f"  ✓ Final state: steps={state.step_count}, reward={state.cumulative_reward:.4f}")

    return {
        "task_id": task_id,
        "rewards": rewards,
        "avg_reward": safe_score(sum(rewards) / len(rewards)) if rewards else 0.5,
        "steps": len(rewards),
    }


def _first_step_reward(env, response_text: str) -> float:
    """Reset ``env`` to the ``easy_faq`` task and return the reward of one ``respond`` step."""
    env.reset(task_id="easy_faq")
    action = SupportAction(response_text=response_text, action_type="respond")
    _, reward, _, _ = env.step(action)
    return reward


def validate_grader_variance():
    """Verify the grader doesn't return constant values.

    Three single-step episodes of the ``easy_faq`` task are scored on a fresh
    environment: a detailed on-topic reply ("good"), a terse non-answer
    ("bad") and an off-topic reply ("irrelevant"). The good reply must score
    strictly higher than each of the other two, and all three rewards must lie
    strictly inside (0, 1). The bad and irrelevant replies are not compared
    with each other.

    Raises:
        AssertionError: If any of the ordering or range checks fails.
    """
    banner = "=" * 50
    print(f"\n{banner}")
    print("  Validating: Grader Variance")
    print(banner)

    env = CustomerSupportEnvironment()

    # GOOD: polite, specific, on-topic answer.
    good_reward = _first_step_reward(
        env,
        "Hi Sarah! Thank you for reaching out about your order ORD-55821. "
        "I completely understand your concern about the shipping update. "
        "Standard shipping typically takes 5-7 business days, and since your "
        "order was placed on March 28th, it should be arriving soon. "
        "You should receive a tracking number via email. Let me look into "
        "the specific status of your order right away and I'll update you. "
        "Is there anything else I can help you with?",
    )

    # BAD: terse non-answer.
    bad_reward = _first_step_reward(env, "I don't know.")

    # IRRELEVANT: unrelated to the customer's question.
    irr_reward = _first_step_reward(
        env,
        "The weather is nice today. Have you tried checking the stock market?",
    )

    print(f"  Good response reward:       {good_reward:.4f}")
    print(f"  Bad response reward:        {bad_reward:.4f}")
    print(f"  Irrelevant response reward: {irr_reward:.4f}")

    assert good_reward != bad_reward, "Grader returns same reward for good and bad responses!"
    assert good_reward > bad_reward, "Good response should score higher than bad response!"
    assert good_reward > irr_reward, "Good response should score higher than irrelevant response!"
    print("  ✓ Grader produces varying scores (NOT constant)")
    # Only the two comparisons asserted above are claimed here.
    print("  ✓ Good > Bad and Good > Irrelevant ordering confirmed")

    # Verify ALL rewards are strictly in (0, 1)
    for label, r in [("good", good_reward), ("bad", bad_reward), ("irr", irr_reward)]:
        assert 0.0 < r < 1.0, f"{label} reward {r} violates strict (0, 1)!"
    print("  ✓ All rewards strictly in (0, 1) open interval")


def main() -> int:
    """Run every validation and print a summary.

    Each task id in ``TASK_IDS`` is exercised through ``validate_task`` with a
    scripted conversation; task ids without a script get a single generic
    reply. ``validate_grader_variance`` runs afterwards, and finally the
    per-task average rewards and their overall mean are printed.

    Returns:
        0 once all validations have run. Failed checks surface as
        ``AssertionError`` from the validation helpers and are not caught here.
    """
    banner = "=" * 50
    print(banner)
    print("  Customer Support Environment — Validation")
    print(banner)

    env = CustomerSupportEnvironment()

    # Scripted agent replies per task, one list entry per step.
    test_responses = {
        "easy_faq": [
            "Hi Sarah! Thank you for reaching out about your order ORD-55821. "
            "Standard shipping takes 5-7 business days. You'll receive a tracking "
            "number via email within 24 hours of shipment. Let me check on the "
            "status of your Wireless Bluetooth Headphones order right away.",
        ],
        "medium_refund": [
            "Hi Michael, I'm sorry to hear about the stitching issue with your "
            "Premium Leather Laptop Bag. That sounds like a manufacturing defect, "
            "and I completely understand your frustration. According to our policy, "
            "defective items qualify for a full refund or replacement at any time. "
            "Could you please send photos of the defect so we can process this quickly?",
            "Thank you for the photos, Michael. I can confirm this is a defect. "
            "You have two options: a full refund of $149.99 or a replacement bag. "
            "Either way, we'll provide a prepaid return shipping label. "
            "Which would you prefer?",
            "We'll process your full refund within 5-7 business days after we "
            "receive the returned bag. I'll email you the return label right away. "
            "I sincerely apologize for the inconvenience.",
        ],
        "hard_escalation": [
            "Mr. Martinez, I sincerely apologize for this terrible experience. "
            "What happened — receiving the wrong item after a late delivery, "
            "and then being treated rudely by our support staff — is completely "
            "unacceptable. You deserve much better. I'm escalating this to our "
            "senior support team immediately as a top priority case.",
            "I understand your frustration completely, Mr. Martinez. Here's exactly "
            "what I'm going to do: First, I'm processing a full refund of $349.99 "
            "for the wrong item. Second, I'm adding a $50 store credit as compensation "
            "for the inconvenience. Third, I'm personally ensuring the correct "
            "Smart Home Security Camera System ships via expedited delivery today. "
            "The staff member's behavior will be addressed by management.",
            "Absolutely, Mr. Martinez. Here are the specifics: Your refund will be "
            "processed within 24 hours. The replacement ships via priority express "
            "and will arrive within 2-3 business days. The $50 credit is already "
            "applied to your account. I will personally follow up with you via "
            "email tomorrow to confirm everything is on track.",
            "I completely understand, Mr. Martinez. I'll send you a confirmation "
            "email within the hour with all the details in writing: the refund, "
            "the replacement tracking, and the store credit. You have my word "
            "this will be resolved. Thank you for your patience.",
        ],
    }

    # One shared environment is reused; validate_task resets it for each task.
    all_results = [
        validate_task(
            env,
            task_id,
            test_responses.get(task_id, ["Thank you for reaching out."]),
        )
        for task_id in TASK_IDS
    ]

    # Validate grader variance
    validate_grader_variance()

    # Summary
    print(f"\n{banner}")
    print("  VALIDATION SUMMARY")
    print(banner)
    for r in all_results:
        print(f"  ✓ {r['task_id']:20s} → avg_reward={r['avg_reward']:.4f} steps={r['steps']}")
    total_avg = sum((r["avg_reward"] for r in all_results), 0.0)
    overall = safe_score(total_avg / len(all_results)) if all_results else 0.01
    print(f"\n  Overall Score: {overall:.4f}")
    print("\n  ✅ ALL VALIDATIONS PASSED!")
    return 0


# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())