"""
Validation / smoke-test script for the Customer Support Environment.
Runs through all 3 tasks with deterministic responses and verifies:
✓ reset() returns valid SupportObservation
✓ step() returns (observation, reward, done, info) with correct types
✓ state() returns valid SupportState
✓ Rewards are non-constant and in (0.0, 1.0) strict open interval
✓ Episodes terminate correctly
✓ Grader produces varying scores for different responses
Usage:
python validate.py
"""
import sys
import os
# Ensure project root is on path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from models import SupportAction, SupportObservation, SupportState, RewardBreakdown, safe_score
from server.environment import CustomerSupportEnvironment
from tasks import TASK_IDS
def validate_task(env: CustomerSupportEnvironment, task_id: str, responses: list[str]) -> dict:
    """Run one task with scripted responses and check the environment API.

    The episode is reset, the post-reset state is inspected, and then each
    entry of ``responses`` is sent as a ``SupportAction``. Every entry except
    the last is sent with ``action_type="respond"``; the last one is sent as
    ``"resolve"``. Stepping stops early as soon as ``env.step`` reports
    ``done``.

    Args:
        env: Environment under test.
        task_id: Task to load via ``env.reset(task_id=...)``.
        responses: Agent replies to send, in order.

    Returns:
        A dict with keys ``task_id``, ``rewards`` (the per-step rewards),
        ``avg_reward`` (``safe_score`` of the mean reward, or 0.5 when no
        reward was collected) and ``steps`` (number of steps actually taken).

    Raises:
        AssertionError: If any type, value or state check fails.
    """
    banner = "=" * 50
    print(f"\n{banner}")
    print(f" Validating: {task_id}")
    print(banner)

    # --- reset() contract ---
    obs = env.reset(task_id=task_id)
    assert isinstance(obs, SupportObservation), f"reset() must return SupportObservation, got {type(obs)}"
    assert obs.task_id == task_id, f"task_id mismatch: {obs.task_id} != {task_id}"
    assert not obs.done, "Episode should not be done after reset"
    assert obs.current_message, "Initial customer message should not be empty"
    print(" ✓ reset() returned valid SupportObservation")
    print(f" Customer: {obs.ticket.customer_name}")
    print(f" Subject: {obs.ticket.subject}")
    print(f" Message: {obs.current_message[:60]}...")

    # --- state() contract right after reset ---
    state = env.state()
    assert isinstance(state, SupportState), f"state() must return SupportState, got {type(state)}"
    assert state.step_count == 0, "Step count should be 0 after reset"
    assert not state.done, "State should not be done after reset"
    print(" ✓ state() returned valid SupportState")

    # --- step() contract, one call per scripted response ---
    rewards = []
    last_index = len(responses) - 1
    for i, response_text in enumerate(responses):
        action = SupportAction(
            response_text=response_text,
            # Only the final scripted reply closes the ticket.
            action_type="respond" if i < last_index else "resolve",
        )
        obs, reward, done, info = env.step(action)
        assert isinstance(obs, SupportObservation), "step() obs must be SupportObservation"
        assert isinstance(reward, float), f"step() reward must be float, got {type(reward)}"
        assert isinstance(done, bool), f"step() done must be bool, got {type(done)}"
        assert isinstance(info, dict), f"step() info must be dict, got {type(info)}"
        assert 0.0 < reward < 1.0, f"Reward {reward} out of strict (0.0, 1.0) range"
        rewards.append(reward)

        # Missing breakdown components are displayed as the neutral 0.5.
        breakdown = info.get("reward_breakdown", {})
        print(f" ✓ step({i+1}) → reward={reward:.4f} | "
              f"correctness={safe_score(breakdown.get('correctness', 0.5)):.2f} "
              f"tone={safe_score(breakdown.get('tone', 0.5)):.2f} "
              f"completeness={safe_score(breakdown.get('completeness', 0.5)):.2f} "
              f"done={done}")
        if done:
            break

    # --- final state ---
    state = env.state()
    assert state.step_count > 0, "Step count should be > 0 after steps"
    print(f" ✓ Final state: steps={state.step_count}, reward={state.cumulative_reward:.4f}")

    return {
        "task_id": task_id,
        "rewards": rewards,
        "avg_reward": safe_score(sum(rewards) / len(rewards)) if rewards else 0.5,
        "steps": len(rewards),
    }
def validate_grader_variance():
    """Verify the grader doesn't return constant values.

    Three first-turn replies (a thorough one, a dismissive one and an
    off-topic one) are each sent to the ``easy_faq`` task after a fresh
    ``reset``. The check passes when the thorough reply scores strictly higher
    than each of the other two and all three rewards lie strictly inside
    (0, 1). The dismissive and off-topic replies are not compared with each
    other.

    Raises:
        AssertionError: If the ordering or the range check fails.
    """
    banner = "=" * 50
    print(f"\n{banner}")
    print(" Validating: Grader Variance")
    print(banner)

    env = CustomerSupportEnvironment()
    samples = {
        "good": (
            "Hi Sarah! Thank you for reaching out about your order ORD-55821. "
            "I completely understand your concern about the shipping update. "
            "Standard shipping typically takes 5-7 business days, and since your "
            "order was placed on March 28th, it should be arriving soon. "
            "You should receive a tracking number via email. Let me look into "
            "the specific status of your order right away and I'll update you. "
            "Is there anything else I can help you with?"
        ),
        "bad": "I don't know.",
        "irr": "The weather is nice today. Have you tried checking the stock market?",
    }

    rewards = {}
    for label, text in samples.items():
        # Reset before every sample so each reward is a first-step reward.
        env.reset(task_id="easy_faq")
        action = SupportAction(response_text=text, action_type="respond")
        _, rewards[label], _, _ = env.step(action)

    good_reward = rewards["good"]
    bad_reward = rewards["bad"]
    irr_reward = rewards["irr"]

    print(f" Good response reward: {good_reward:.4f}")
    print(f" Bad response reward: {bad_reward:.4f}")
    print(f" Irrelevant response reward: {irr_reward:.4f}")

    # The equality check comes first so a constant grader gets the more
    # specific failure message.
    assert good_reward != bad_reward, "Grader returns same reward for good and bad responses!"
    assert good_reward > bad_reward, "Good response should score higher than bad response!"
    assert good_reward > irr_reward, "Good response should score higher than irrelevant response!"
    print(" ✓ Grader produces varying scores (NOT constant)")
    # Report only what was asserted: bad vs. irrelevant is never compared.
    print(" ✓ Good > Bad and Good > Irrelevant ordering confirmed")

    # Verify ALL rewards are strictly in (0, 1)
    for label, r in rewards.items():
        assert 0.0 < r < 1.0, f"{label} reward {r} violates strict (0, 1)!"
    print(" ✓ All rewards strictly in (0, 1) open interval")
def main():
    """Run every validation and print a summary.

    Each task id in ``TASK_IDS`` is run through ``validate_task`` with the
    scripted replies defined below (a generic one-line reply is used for any
    task id without a script). ``validate_grader_variance`` runs afterwards,
    and the per-task average rewards are printed together with their overall
    ``safe_score``-d mean.

    Returns:
        0 when every validation passed. A failed check is not caught here, so
        its ``AssertionError`` propagates to the caller.
    """
    banner = "=" * 50
    print(banner)
    print(" Customer Support Environment — Validation")
    print(banner)

    env = CustomerSupportEnvironment()

    # Scripted agent replies per task, one list entry per step.
    test_responses = {
        "easy_faq": [
            (
                "Hi Sarah! Thank you for reaching out about your order ORD-55821. "
                "Standard shipping takes 5-7 business days. You'll receive a tracking "
                "number via email within 24 hours of shipment. Let me check on the "
                "status of your Wireless Bluetooth Headphones order right away."
            ),
        ],
        "medium_refund": [
            (
                "Hi Michael, I'm sorry to hear about the stitching issue with your "
                "Premium Leather Laptop Bag. That sounds like a manufacturing defect, "
                "and I completely understand your frustration. According to our policy, "
                "defective items qualify for a full refund or replacement at any time. "
                "Could you please send photos of the defect so we can process this quickly?"
            ),
            (
                "Thank you for the photos, Michael. I can confirm this is a defect. "
                "You have two options: a full refund of $149.99 or a replacement bag. "
                "Either way, we'll provide a prepaid return shipping label. "
                "Which would you prefer?"
            ),
            (
                "We'll process your full refund within 5-7 business days after we "
                "receive the returned bag. I'll email you the return label right away. "
                "I sincerely apologize for the inconvenience."
            ),
        ],
        "hard_escalation": [
            (
                "Mr. Martinez, I sincerely apologize for this terrible experience. "
                "What happened — receiving the wrong item after a late delivery, "
                "and then being treated rudely by our support staff — is completely "
                "unacceptable. You deserve much better. I'm escalating this to our "
                "senior support team immediately as a top priority case."
            ),
            (
                "I understand your frustration completely, Mr. Martinez. Here's exactly "
                "what I'm going to do: First, I'm processing a full refund of $349.99 "
                "for the wrong item. Second, I'm adding a $50 store credit as compensation "
                "for the inconvenience. Third, I'm personally ensuring the correct "
                "Smart Home Security Camera System ships via expedited delivery today. "
                "The staff member's behavior will be addressed by management."
            ),
            (
                "Absolutely, Mr. Martinez. Here are the specifics: Your refund will be "
                "processed within 24 hours. The replacement ships via priority express "
                "and will arrive within 2-3 business days. The $50 credit is already "
                "applied to your account. I will personally follow up with you via "
                "email tomorrow to confirm everything is on track."
            ),
            (
                "I completely understand, Mr. Martinez. I'll send you a confirmation "
                "email within the hour with all the details in writing: the refund, "
                "the replacement tracking, and the store credit. You have my word "
                "this will be resolved. Thank you for your patience."
            ),
        ],
    }

    all_results = []
    for task_id in TASK_IDS:
        responses = test_responses.get(task_id, ["Thank you for reaching out."])
        all_results.append(validate_task(env, task_id, responses))

    # Validate grader variance
    validate_grader_variance()

    # Summary
    print(f"\n{banner}")
    print(" VALIDATION SUMMARY")
    print(banner)
    for r in all_results:
        print(f" ✓ {r['task_id']:20s} → avg_reward={r['avg_reward']:.4f} steps={r['steps']}")

    total_avg = sum(r["avg_reward"] for r in all_results)
    overall = safe_score(total_avg / len(all_results)) if all_results else 0.01
    print(f"\n Overall Score: {overall:.4f}")
    print("\n ✅ ALL VALIDATIONS PASSED!")
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())