""" Validation / smoke-test script for the Customer Support Environment. Runs through all 3 tasks with deterministic responses and verifies: ✓ reset() returns valid SupportObservation ✓ step() returns (observation, reward, done, info) with correct types ✓ state() returns valid SupportState ✓ Rewards are non-constant and in (0.0, 1.0) strict open interval ✓ Episodes terminate correctly ✓ Grader produces varying scores for different responses Usage: python validate.py """ import sys import os # Ensure project root is on path sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from models import SupportAction, SupportObservation, SupportState, RewardBreakdown, safe_score from server.environment import CustomerSupportEnvironment from tasks import TASK_IDS def validate_task(env: CustomerSupportEnvironment, task_id: str, responses: list[str]) -> dict: """Run a task with given responses and collect results.""" print(f"\n{'='*50}") print(f" Validating: {task_id}") print(f"{'='*50}") # Test reset obs = env.reset(task_id=task_id) assert isinstance(obs, SupportObservation), f"reset() must return SupportObservation, got {type(obs)}" assert obs.task_id == task_id, f"task_id mismatch: {obs.task_id} != {task_id}" assert not obs.done, "Episode should not be done after reset" assert obs.current_message, "Initial customer message should not be empty" print(f" ✓ reset() returned valid SupportObservation") print(f" Customer: {obs.ticket.customer_name}") print(f" Subject: {obs.ticket.subject}") print(f" Message: {obs.current_message[:60]}...") # Test state after reset state = env.state() assert isinstance(state, SupportState), f"state() must return SupportState, got {type(state)}" assert state.step_count == 0, "Step count should be 0 after reset" assert not state.done, "State should not be done after reset" print(f" ✓ state() returned valid SupportState") # Test steps rewards = [] for i, response_text in enumerate(responses): action = SupportAction( response_text=response_text, action_type="respond" if i < len(responses) - 1 else "resolve", ) obs, reward, done, info = env.step(action) assert isinstance(obs, SupportObservation), f"step() obs must be SupportObservation" assert isinstance(reward, float), f"step() reward must be float, got {type(reward)}" assert isinstance(done, bool), f"step() done must be bool, got {type(done)}" assert isinstance(info, dict), f"step() info must be dict, got {type(info)}" assert 0.0 < reward < 1.0, f"Reward {reward} out of strict (0.0, 1.0) range" rewards.append(reward) breakdown = info.get("reward_breakdown", {}) print(f" ✓ step({i+1}) → reward={reward:.4f} | " f"correctness={safe_score(breakdown.get('correctness', 0.5)):.2f} " f"tone={safe_score(breakdown.get('tone', 0.5)):.2f} " f"completeness={safe_score(breakdown.get('completeness', 0.5)):.2f} " f"done={done}") if done: break # Verify final state state = env.state() assert state.step_count > 0, "Step count should be > 0 after steps" print(f" ✓ Final state: steps={state.step_count}, reward={state.cumulative_reward:.4f}") return { "task_id": task_id, "rewards": rewards, "avg_reward": safe_score(sum(rewards) / len(rewards)) if rewards else 0.5, "steps": len(rewards), } def validate_grader_variance(): """Verify the grader doesn't return constant values.""" print(f"\n{'='*50}") print(f" Validating: Grader Variance") print(f"{'='*50}") env = CustomerSupportEnvironment() env.reset(task_id="easy_faq") # Test with a GOOD response good_action = SupportAction( response_text=( "Hi Sarah! Thank you for reaching out about your order ORD-55821. " "I completely understand your concern about the shipping update. " "Standard shipping typically takes 5-7 business days, and since your " "order was placed on March 28th, it should be arriving soon. " "You should receive a tracking number via email. Let me look into " "the specific status of your order right away and I'll update you. " "Is there anything else I can help you with?" ), action_type="respond", ) _, good_reward, _, good_info = env.step(good_action) # Reset and test with a BAD response env.reset(task_id="easy_faq") bad_action = SupportAction( response_text="I don't know.", action_type="respond", ) _, bad_reward, _, bad_info = env.step(bad_action) # Reset and test with an IRRELEVANT response env.reset(task_id="easy_faq") irr_action = SupportAction( response_text="The weather is nice today. Have you tried checking the stock market?", action_type="respond", ) _, irr_reward, _, irr_info = env.step(irr_action) print(f" Good response reward: {good_reward:.4f}") print(f" Bad response reward: {bad_reward:.4f}") print(f" Irrelevant response reward: {irr_reward:.4f}") assert good_reward != bad_reward, "Grader returns same reward for good and bad responses!" assert good_reward > bad_reward, "Good response should score higher than bad response!" assert good_reward > irr_reward, "Good response should score higher than irrelevant response!" print(f" ✓ Grader produces varying scores (NOT constant)") print(f" ✓ Good > Bad > Irrelevant ordering confirmed") # Verify ALL rewards are strictly in (0, 1) for label, r in [("good", good_reward), ("bad", bad_reward), ("irr", irr_reward)]: assert 0.0 < r < 1.0, f"{label} reward {r} violates strict (0, 1)!" print(f" ✓ All rewards strictly in (0, 1) open interval") def main(): print("=" * 50) print(" Customer Support Environment — Validation") print("=" * 50) env = CustomerSupportEnvironment() # Test responses per task test_responses = { "easy_faq": [ "Hi Sarah! Thank you for reaching out about your order ORD-55821. " "Standard shipping takes 5-7 business days. You'll receive a tracking " "number via email within 24 hours of shipment. Let me check on the " "status of your Wireless Bluetooth Headphones order right away.", ], "medium_refund": [ "Hi Michael, I'm sorry to hear about the stitching issue with your " "Premium Leather Laptop Bag. That sounds like a manufacturing defect, " "and I completely understand your frustration. According to our policy, " "defective items qualify for a full refund or replacement at any time. " "Could you please send photos of the defect so we can process this quickly?", "Thank you for the photos, Michael. I can confirm this is a defect. " "You have two options: a full refund of $149.99 or a replacement bag. " "Either way, we'll provide a prepaid return shipping label. " "Which would you prefer?", "We'll process your full refund within 5-7 business days after we " "receive the returned bag. I'll email you the return label right away. " "I sincerely apologize for the inconvenience.", ], "hard_escalation": [ "Mr. Martinez, I sincerely apologize for this terrible experience. " "What happened — receiving the wrong item after a late delivery, " "and then being treated rudely by our support staff — is completely " "unacceptable. You deserve much better. I'm escalating this to our " "senior support team immediately as a top priority case.", "I understand your frustration completely, Mr. Martinez. Here's exactly " "what I'm going to do: First, I'm processing a full refund of $349.99 " "for the wrong item. Second, I'm adding a $50 store credit as compensation " "for the inconvenience. Third, I'm personally ensuring the correct " "Smart Home Security Camera System ships via expedited delivery today. " "The staff member's behavior will be addressed by management.", "Absolutely, Mr. Martinez. Here are the specifics: Your refund will be " "processed within 24 hours. The replacement ships via priority express " "and will arrive within 2-3 business days. The $50 credit is already " "applied to your account. I will personally follow up with you via " "email tomorrow to confirm everything is on track.", "I completely understand, Mr. Martinez. I'll send you a confirmation " "email within the hour with all the details in writing: the refund, " "the replacement tracking, and the store credit. You have my word " "this will be resolved. Thank you for your patience." ], } all_results = [] for task_id in TASK_IDS: responses = test_responses.get(task_id, ["Thank you for reaching out."]) result = validate_task(env, task_id, responses) all_results.append(result) # Validate grader variance validate_grader_variance() # Summary print(f"\n{'='*50}") print(f" VALIDATION SUMMARY") print(f"{'='*50}") total_avg = 0.0 for r in all_results: print(f" ✓ {r['task_id']:20s} → avg_reward={r['avg_reward']:.4f} steps={r['steps']}") total_avg += r['avg_reward'] overall = safe_score(total_avg / len(all_results)) if all_results else 0.01 print(f"\n Overall Score: {overall:.4f}") print(f"\n ✅ ALL VALIDATIONS PASSED!") return 0 if __name__ == "__main__": sys.exit(main())