""" PolicyEvolverEnv — Multi-Episode Terminal Verification ====================================================== Runs 3 episodes per task, each with progressive steps, to verify reward improvement over time via the API. """ import sys, json, time, copy sys.path.insert(0, ".") from server.environment import PolicyEvolverEnvironment from server.grader import grade DIVIDER = "=" * 65 # ─── Define progressive action sequences (simulating ICL adaptation) ─── EASY_ACTIONS = [ # Episode 1: Naive → Mediocre → Good [ {"action_type": "propose_clarification", "ambiguous_term": "offensive", "suggested_definition": "Bad stuff.", "justification": "unclear", "think": "hmm"}, {"action_type": "propose_clarification", "ambiguous_term": "appropriate", "suggested_definition": "Right behavior that is ethical and defined as including verified reports.", "justification": "It's subjective and varies between teams.", "think": "I think we need specific criteria."}, {"action_type": "propose_clarification", "ambiguous_term": "appropriate", "suggested_definition": ( "Behavior is defined as a violation when it specifically includes 3 or more " "verified reports within 24 hours, exceeding the 5% threshold for category " "violations. Must meet measurable community standards." ), "justification": "The term varies between interpreters and leads to inconsistent moderation.", "think": ( "Because the threshold is too low, the tradeoff between precision and recall " "creates a false positive risk that will impact community trust. Therefore I " "balance the evidence requirement based on corpus data." )}, ], # Episode 2: Different starting point [ {"action_type": "propose_clarification", "ambiguous_term": "reasonable", "suggested_definition": "Acceptable.", "justification": "vague", "think": "ok"}, {"action_type": "propose_clarification", "ambiguous_term": "substantial", "suggested_definition": ( "Substantial AI use is defined as any deliverable where 30% or more of the " "content was generated by AI. Must be verified within 48 hours by the " "compliance team. Threshold of 30% specifically applies to all text-based outputs." ), "justification": "Different managers interpret 'substantial' differently, causing inconsistent enforcement.", "think": ( "Because the precision of enforcement depends on a measurable threshold, " "I balance the tradeoff between innovation freedom and evidence-based accountability. " "The impact on trust requires a clear recall mechanism. Therefore corpus data supports 30%." )}, ], # Episode 3: Perfect from step 1 [ {"action_type": "propose_clarification", "ambiguous_term": "ethical", "suggested_definition": ( "Ethical AI use is specifically defined as usage where: (1) the employee " "discloses AI involvement within 24 hours, (2) output does not exceed 50% " "AI-generated content without manager approval, and (3) no client-facing " "materials include unverified AI claims. Must meet measurable audit standards." ), "justification": "The term 'ethical' is subjective and leads to inconsistent enforcement across departments.", "think": ( "Because ethical boundaries require precision, the tradeoff between allowing " "AI innovation and maintaining accountability creates a false positive risk. " "I balance the threshold at 50% based on evidence from corpus incidents. " "The impact on organizational trust is measurable through audit compliance rates." )}, ], ] HARD_ACTIONS = [ # Episode 1: Hallucinated → Realistic [ {"action_type": "evolve_policy", "policy_modifications": [{"policy_id": "p1", "change_type": "enhance", "new_text": "Be better at fraud.", "reason": "improve"}], "expected_outcomes": {"fraud_rate": 0.95, "revenue_velocity": 0.95, "seller_trust": 0.95}, "justification": "Everything improves.", "think": "simple"}, {"action_type": "evolve_policy", "policy_modifications": [ {"policy_id": "ts_pol_001", "change_type": "enhance", "new_text": "New seller accounts exceeding 50 transactions in week 1 are flagged for expedited 24h review. Seasonal category sellers exempt if volume matches historical patterns.", "reason": "Reduces false positives on legitimate seasonal sellers"}, {"policy_id": "ts_pol_002", "change_type": "enhance", "new_text": "Return rate thresholds tiered by category: Electronics >10%, Fashion >20%, Home >12%. Exceeding triggers review, not suspension.", "reason": "Category-aware thresholds reduce false positive rate"} ], "expected_outcomes": {"fraud_rate": 0.75, "revenue_velocity": 0.40, "seller_trust": 0.60}, "justification": "Balancing fraud detection against marketplace revenue velocity for seller trust.", "think": ( "Because improving fraud detection creates a tradeoff with revenue velocity, " "I balance the threshold to optimise precision and recall without false positive " "spikes. The impact on seller trust is measurable through the trust score metric. " "Evidence from corpus shows legitimate sellers being incorrectly flagged." )}, ], # Episode 2: Good from start [ {"action_type": "evolve_policy", "policy_modifications": [ {"policy_id": "ts_pol_003", "change_type": "enhance", "new_text": "Foreign bank sellers get automated KYC fast-track (48h SLA) instead of manual review bottleneck.", "reason": "Eliminates 14-day manual approval delays for legitimate foreign sellers"}, {"policy_id": "ts_pol_004", "change_type": "enhance", "new_text": "Fraud reports weighted by reporter credibility score. Bot-net reports auto-discounted by 80%.", "reason": "Prevents competitor sabotage via fake fraud reports"} ], "expected_outcomes": {"fraud_rate": 0.70, "revenue_velocity": 0.45, "seller_trust": 0.65}, "justification": "Targeting the manual approval bottleneck and fake report vulnerability to improve marketplace velocity.", "think": ( "Because the manual approval process creates a 14-day delay tradeoff with seller " "onboarding speed, I balance the threshold by automating KYC. The precision of " "fraud detection improves when bot-net reports are discounted. Evidence from corpus " "shows 10 legitimate sellers flagged by a single competitor bot-net. The impact " "on recall is measurable through the fraud detection rate metric." )}, ], ] def run_multi_episode_test(): env = PolicyEvolverEnvironment() all_results = {} # ═══ EASY TASK ═══ print(f"\n{DIVIDER}") print(" EASY TASK — Multi-Episode Progression Test") print(DIVIDER) easy_episodes = [] for ep_idx, actions in enumerate(EASY_ACTIONS): env.reset(task_id="task_easy") ep_rewards = [] for step_idx, action in enumerate(actions): obs = env.step(copy.deepcopy(action)) ep_rewards.append(obs.reward) rating = obs.info.get("staff_feedback", {}).get("strategic_rating", "N/A") print(f" Ep{ep_idx+1} Step{step_idx+1}: reward={obs.reward:.4f} rating={rating}") if obs.done: break easy_episodes.append(ep_rewards) print(f" Ep{ep_idx+1} Final: {ep_rewards[-1]:.4f} (trajectory: {[f'{r:.2f}' for r in ep_rewards]})") print() all_results["task_easy"] = easy_episodes # ═══ HARD TASK ═══ print(f"{DIVIDER}") print(" HARD TASK — Multi-Episode Progression Test") print(DIVIDER) hard_episodes = [] for ep_idx, actions in enumerate(HARD_ACTIONS): env.reset(task_id="task_hard") ep_rewards = [] for step_idx, action in enumerate(actions): obs = env.step(copy.deepcopy(action)) ep_rewards.append(obs.reward) rating = obs.info.get("staff_feedback", {}).get("strategic_rating", "N/A") print(f" Ep{ep_idx+1} Step{step_idx+1}: reward={obs.reward:.4f} rating={rating}") if obs.done: break hard_episodes.append(ep_rewards) print(f" Ep{ep_idx+1} Final: {ep_rewards[-1]:.4f} (trajectory: {[f'{r:.2f}' for r in ep_rewards]})") print() all_results["task_hard"] = hard_episodes # ═══ SUMMARY ═══ print(f"{DIVIDER}") print(" MULTI-EPISODE SUMMARY") print(DIVIDER) for task, episodes in all_results.items(): print(f"\n {task}:") for i, ep in enumerate(episodes): trajectory = " → ".join(f"{r:.2f}" for r in ep) improving = all(ep[j+1] >= ep[j] for j in range(len(ep)-1)) if len(ep) > 1 else True status = "✓ IMPROVING" if improving else "⚠ NON-MONOTONIC" print(f" Episode {i+1}: [{trajectory}] {status}") # Check cross-episode consistency print(f"\n{DIVIDER}") print(" CROSS-EPISODE DETERMINISM CHECK") print(DIVIDER) env.reset(task_id="task_easy") det_action = copy.deepcopy(EASY_ACTIONS[2][0]) # Perfect action scores = [] for run in range(3): env.reset(task_id="task_easy") obs = env.step(copy.deepcopy(det_action)) scores.append(obs.reward) all_same = len(set(scores)) == 1 print(f" 3 identical runs: {scores}") print(f" Deterministic: {'✓ YES' if all_same else '✗ NO'}") print(f"\n{DIVIDER}") print(" ALL MULTI-EPISODE TESTS COMPLETE") print(DIVIDER) if __name__ == "__main__": run_multi_episode_test()