"""
Validation / smoke-test script for the Customer Support Environment.

Runs through all 3 tasks with deterministic responses and verifies:
  ✓ reset() returns valid SupportObservation
  ✓ step() returns (observation, reward, done, info) with correct types
  ✓ state() returns valid SupportState
  ✓ Rewards are non-constant and in (0.0, 1.0) strict open interval
  ✓ Episodes terminate correctly
  ✓ Grader produces varying scores for different responses

Usage:
    python validate.py
"""
import os
import sys

# Ensure the project root is on sys.path BEFORE the project imports below,
# so the script works when run directly (python validate.py).
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from models import SupportAction, SupportObservation, SupportState, RewardBreakdown, safe_score
from server.environment import CustomerSupportEnvironment
from tasks import TASK_IDS
def validate_task(env: CustomerSupportEnvironment, task_id: str, responses: list[str]) -> dict:
    """Run a task with given responses and collect results."""
    banner = "=" * 50
    print(f"\n{banner}")
    print(f" Validating: {task_id}")
    print(banner)

    # --- reset() contract ---
    obs = env.reset(task_id=task_id)
    assert isinstance(obs, SupportObservation), f"reset() must return SupportObservation, got {type(obs)}"
    assert obs.task_id == task_id, f"task_id mismatch: {obs.task_id} != {task_id}"
    assert not obs.done, "Episode should not be done after reset"
    assert obs.current_message, "Initial customer message should not be empty"
    print(" β reset() returned valid SupportObservation")
    print(f" Customer: {obs.ticket.customer_name}")
    print(f" Subject: {obs.ticket.subject}")
    print(f" Message: {obs.current_message[:60]}...")

    # --- state() contract immediately after reset ---
    state = env.state()
    assert isinstance(state, SupportState), f"state() must return SupportState, got {type(state)}"
    assert state.step_count == 0, "Step count should be 0 after reset"
    assert not state.done, "State should not be done after reset"
    print(" β state() returned valid SupportState")

    # --- step() contract: respond on all turns, resolve on the last one ---
    rewards: list[float] = []
    last = len(responses) - 1
    for i, text in enumerate(responses):
        action = SupportAction(
            response_text=text,
            action_type="resolve" if i == last else "respond",
        )
        obs, reward, done, info = env.step(action)
        assert isinstance(obs, SupportObservation), "step() obs must be SupportObservation"
        assert isinstance(reward, float), f"step() reward must be float, got {type(reward)}"
        assert isinstance(done, bool), f"step() done must be bool, got {type(done)}"
        assert isinstance(info, dict), f"step() info must be dict, got {type(info)}"
        assert 0.0 < reward < 1.0, f"Reward {reward} out of strict (0.0, 1.0) range"
        rewards.append(reward)

        breakdown = info.get("reward_breakdown", {})
        # Missing components default to a neutral 0.5 before safe_score clamping.
        scores = " ".join(
            f"{key}={safe_score(breakdown.get(key, 0.5)):.2f}"
            for key in ("correctness", "tone", "completeness")
        )
        print(f" β step({i+1}) β reward={reward:.4f} | {scores} done={done}")
        if done:
            break

    # --- final state sanity ---
    state = env.state()
    assert state.step_count > 0, "Step count should be > 0 after steps"
    print(f" β Final state: steps={state.step_count}, reward={state.cumulative_reward:.4f}")

    avg = safe_score(sum(rewards) / len(rewards)) if rewards else 0.5
    return {
        "task_id": task_id,
        "rewards": rewards,
        "avg_reward": avg,
        "steps": len(rewards),
    }
def validate_grader_variance():
    """Verify the grader doesn't return constant values."""
    banner = "=" * 50
    print(f"\n{banner}")
    print(" Validating: Grader Variance")
    print(banner)

    env = CustomerSupportEnvironment()

    def score(text: str) -> float:
        # Fresh episode per probe so each response is graded from the same start.
        env.reset(task_id="easy_faq")
        _, reward, _, _ = env.step(SupportAction(response_text=text, action_type="respond"))
        return reward

    # A GOOD, on-topic, complete response.
    good_reward = score(
        "Hi Sarah! Thank you for reaching out about your order ORD-55821. "
        "I completely understand your concern about the shipping update. "
        "Standard shipping typically takes 5-7 business days, and since your "
        "order was placed on March 28th, it should be arriving soon. "
        "You should receive a tracking number via email. Let me look into "
        "the specific status of your order right away and I'll update you. "
        "Is there anything else I can help you with?"
    )
    # A BAD, unhelpful response.
    bad_reward = score("I don't know.")
    # An IRRELEVANT, off-topic response.
    irr_reward = score("The weather is nice today. Have you tried checking the stock market?")

    print(f" Good response reward: {good_reward:.4f}")
    print(f" Bad response reward: {bad_reward:.4f}")
    print(f" Irrelevant response reward: {irr_reward:.4f}")

    assert good_reward != bad_reward, "Grader returns same reward for good and bad responses!"
    assert good_reward > bad_reward, "Good response should score higher than bad response!"
    assert good_reward > irr_reward, "Good response should score higher than irrelevant response!"
    print(" β Grader produces varying scores (NOT constant)")
    print(" β Good > Bad > Irrelevant ordering confirmed")

    # Every probe must land strictly inside the open interval (0, 1).
    for label, r in (("good", good_reward), ("bad", bad_reward), ("irr", irr_reward)):
        assert 0.0 < r < 1.0, f"{label} reward {r} violates strict (0, 1)!"
    print(" β All rewards strictly in (0, 1) open interval")
def main():
    """Run all task validations plus the grader-variance check; return exit code 0."""
    banner = "=" * 50
    print(banner)
    print(" Customer Support Environment β Validation")
    print(banner)

    env = CustomerSupportEnvironment()

    # Deterministic canned responses, keyed by task id.
    test_responses = {
        "easy_faq": [
            "Hi Sarah! Thank you for reaching out about your order ORD-55821. "
            "Standard shipping takes 5-7 business days. You'll receive a tracking "
            "number via email within 24 hours of shipment. Let me check on the "
            "status of your Wireless Bluetooth Headphones order right away.",
        ],
        "medium_refund": [
            "Hi Michael, I'm sorry to hear about the stitching issue with your "
            "Premium Leather Laptop Bag. That sounds like a manufacturing defect, "
            "and I completely understand your frustration. According to our policy, "
            "defective items qualify for a full refund or replacement at any time. "
            "Could you please send photos of the defect so we can process this quickly?",
            "Thank you for the photos, Michael. I can confirm this is a defect. "
            "You have two options: a full refund of $149.99 or a replacement bag. "
            "Either way, we'll provide a prepaid return shipping label. "
            "Which would you prefer?",
            "We'll process your full refund within 5-7 business days after we "
            "receive the returned bag. I'll email you the return label right away. "
            "I sincerely apologize for the inconvenience.",
        ],
        "hard_escalation": [
            "Mr. Martinez, I sincerely apologize for this terrible experience. "
            "What happened β receiving the wrong item after a late delivery, "
            "and then being treated rudely by our support staff β is completely "
            "unacceptable. You deserve much better. I'm escalating this to our "
            "senior support team immediately as a top priority case.",
            "I understand your frustration completely, Mr. Martinez. Here's exactly "
            "what I'm going to do: First, I'm processing a full refund of $349.99 "
            "for the wrong item. Second, I'm adding a $50 store credit as compensation "
            "for the inconvenience. Third, I'm personally ensuring the correct "
            "Smart Home Security Camera System ships via expedited delivery today. "
            "The staff member's behavior will be addressed by management.",
            "Absolutely, Mr. Martinez. Here are the specifics: Your refund will be "
            "processed within 24 hours. The replacement ships via priority express "
            "and will arrive within 2-3 business days. The $50 credit is already "
            "applied to your account. I will personally follow up with you via "
            "email tomorrow to confirm everything is on track.",
            "I completely understand, Mr. Martinez. I'll send you a confirmation "
            "email within the hour with all the details in writing: the refund, "
            "the replacement tracking, and the store credit. You have my word "
            "this will be resolved. Thank you for your patience.",
        ],
    }

    # Tasks without canned responses fall back to a single generic reply.
    all_results = [
        validate_task(env, task_id, test_responses.get(task_id, ["Thank you for reaching out."]))
        for task_id in TASK_IDS
    ]

    validate_grader_variance()

    # --- summary ---
    print(f"\n{banner}")
    print(" VALIDATION SUMMARY")
    print(banner)
    total_avg = 0.0
    for result in all_results:
        print(f" β {result['task_id']:20s} β avg_reward={result['avg_reward']:.4f} steps={result['steps']}")
        total_avg += result['avg_reward']
    overall = safe_score(total_avg / len(all_results)) if all_results else 0.01
    print(f"\n Overall Score: {overall:.4f}")
    print("\n β ALL VALIDATIONS PASSED!")
    return 0
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())