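"""
PolicyEvolverEnv — Multi-Episode Terminal Verification
======================================================
Runs the scripted episodes for each task (three for the easy task, two for
the hard task), each with progressive steps, by driving the environment
through its reset/step API, and prints the per-step rewards so that reward
improvement over time can be verified from the terminal.
"""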
import copy
import json
import sys
import time

# Put the current working directory first on the import path so the
# ``server`` package imported below resolves when the script is launched
# from the directory that contains it.
sys.path.insert(0, ".")

from server.environment import PolicyEvolverEnvironment
from server.grader import grade

# Horizontal rule (65 "=" characters) printed above and below section titles.
DIVIDER = "=" * 65

# Scripted actions for the easy task: one inner list per episode, one action
# dict per step. Every action is a "propose_clarification" for one ambiguous
# policy term. Within an episode the definitions go from terse to detailed
# (presumably so later steps earn a higher reward; confirm against the grader).
EASY_ACTIONS = [
    # Episode 1: three steps.
    [
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "offensive",
            "suggested_definition": "Bad stuff.",
            "justification": "unclear",
            "think": "hmm",
        },
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "appropriate",
            "suggested_definition": "Right behavior that is ethical and defined as including verified reports.",
            "justification": "It's subjective and varies between teams.",
            "think": "I think we need specific criteria.",
        },
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "appropriate",
            "suggested_definition": (
                "Behavior is defined as a violation when it specifically includes 3 or more "
                "verified reports within 24 hours, exceeding the 5% threshold for category "
                "violations. Must meet measurable community standards."
            ),
            "justification": "The term varies between interpreters and leads to inconsistent moderation.",
            "think": (
                "Because the threshold is too low, the tradeoff between precision and recall "
                "creates a false positive risk that will impact community trust. Therefore I "
                "balance the evidence requirement based on corpus data."
            ),
        },
    ],
    # Episode 2: two steps.
    [
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "reasonable",
            "suggested_definition": "Acceptable.",
            "justification": "vague",
            "think": "ok",
        },
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "substantial",
            "suggested_definition": (
                "Substantial AI use is defined as any deliverable where 30% or more of the "
                "content was generated by AI. Must be verified within 48 hours by the "
                "compliance team. Threshold of 30% specifically applies to all text-based outputs."
            ),
            "justification": "Different managers interpret 'substantial' differently, causing inconsistent enforcement.",
            "think": (
                "Because the precision of enforcement depends on a measurable threshold, "
                "I balance the tradeoff between innovation freedom and evidence-based accountability. "
                "The impact on trust requires a clear recall mechanism. Therefore corpus data supports 30%."
            ),
        },
    ],
    # Episode 3: a single detailed step.
    [
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "ethical",
            "suggested_definition": (
                "Ethical AI use is specifically defined as usage where: (1) the employee "
                "discloses AI involvement within 24 hours, (2) output does not exceed 50% "
                "AI-generated content without manager approval, and (3) no client-facing "
                "materials include unverified AI claims. Must meet measurable audit standards."
            ),
            "justification": "The term 'ethical' is subjective and leads to inconsistent enforcement across departments.",
            "think": (
                "Because ethical boundaries require precision, the tradeoff between allowing "
                "AI innovation and maintaining accountability creates a false positive risk. "
                "I balance the threshold at 50% based on evidence from corpus incidents. "
                "The impact on organizational trust is measurable through audit compliance rates."
            ),
        },
    ],
]

# Scripted actions for the hard task: one inner list per episode, one action
# dict per step. Every action is an "evolve_policy" carrying policy
# modifications plus the expected fraud_rate / revenue_velocity / seller_trust
# outcomes.
HARD_ACTIONS = [
    # Episode 1: a vague single-policy change, then a detailed two-policy change.
    [
        {
            "action_type": "evolve_policy",
            "policy_modifications": [
                {
                    "policy_id": "p1",
                    "change_type": "enhance",
                    "new_text": "Be better at fraud.",
                    "reason": "improve",
                },
            ],
            "expected_outcomes": {"fraud_rate": 0.95, "revenue_velocity": 0.95, "seller_trust": 0.95},
            "justification": "Everything improves.",
            "think": "simple",
        },
        {
            "action_type": "evolve_policy",
            "policy_modifications": [
                {
                    "policy_id": "ts_pol_001",
                    "change_type": "enhance",
                    "new_text": "New seller accounts exceeding 50 transactions in week 1 are flagged for expedited 24h review. Seasonal category sellers exempt if volume matches historical patterns.",
                    "reason": "Reduces false positives on legitimate seasonal sellers",
                },
                {
                    "policy_id": "ts_pol_002",
                    "change_type": "enhance",
                    "new_text": "Return rate thresholds tiered by category: Electronics >10%, Fashion >20%, Home >12%. Exceeding triggers review, not suspension.",
                    "reason": "Category-aware thresholds reduce false positive rate",
                },
            ],
            "expected_outcomes": {"fraud_rate": 0.75, "revenue_velocity": 0.40, "seller_trust": 0.60},
            "justification": "Balancing fraud detection against marketplace revenue velocity for seller trust.",
            "think": (
                "Because improving fraud detection creates a tradeoff with revenue velocity, "
                "I balance the threshold to optimise precision and recall without false positive "
                "spikes. The impact on seller trust is measurable through the trust score metric. "
                "Evidence from corpus shows legitimate sellers being incorrectly flagged."
            ),
        },
    ],
    # Episode 2: a single detailed two-policy change.
    [
        {
            "action_type": "evolve_policy",
            "policy_modifications": [
                {
                    "policy_id": "ts_pol_003",
                    "change_type": "enhance",
                    "new_text": "Foreign bank sellers get automated KYC fast-track (48h SLA) instead of manual review bottleneck.",
                    "reason": "Eliminates 14-day manual approval delays for legitimate foreign sellers",
                },
                {
                    "policy_id": "ts_pol_004",
                    "change_type": "enhance",
                    "new_text": "Fraud reports weighted by reporter credibility score. Bot-net reports auto-discounted by 80%.",
                    "reason": "Prevents competitor sabotage via fake fraud reports",
                },
            ],
            "expected_outcomes": {"fraud_rate": 0.70, "revenue_velocity": 0.45, "seller_trust": 0.65},
            "justification": "Targeting the manual approval bottleneck and fake report vulnerability to improve marketplace velocity.",
            "think": (
                "Because the manual approval process creates a 14-day delay tradeoff with seller "
                "onboarding speed, I balance the threshold by automating KYC. The precision of "
                "fraud detection improves when bot-net reports are discounted. Evidence from corpus "
                "shows 10 legitimate sellers flagged by a single competitor bot-net. The impact "
                "on recall is measurable through the fraud detection rate metric."
            ),
        },
    ],
]

def _print_banner(title, *, leading_blank=True):
    """Print *title* between two DIVIDER rules, optionally after a blank line."""
    print(f"\n{DIVIDER}" if leading_blank else DIVIDER)
    print(title)
    print(DIVIDER)


def _run_task_episodes(env, task_id, episode_actions):
    """Play every scripted episode of one task and collect its rewards.

    Args:
        env: Environment object exposing ``reset(task_id=...)`` and
            ``step(action)``.
        task_id: Task identifier forwarded to ``env.reset``.
        episode_actions: One list of action dicts per episode.

    Returns:
        A list holding, for each episode in order, the list of ``obs.reward``
        values observed at each executed step.
    """
    trajectories = []
    for ep_num, actions in enumerate(episode_actions, start=1):
        env.reset(task_id=task_id)
        rewards = []
        for step_num, action in enumerate(actions, start=1):
            # Step with a deep copy, presumably so the environment cannot
            # mutate the shared module-level script between runs.
            obs = env.step(copy.deepcopy(action))
            rewards.append(obs.reward)
            rating = obs.info.get("staff_feedback", {}).get("strategic_rating", "N/A")
            print(f" Ep{ep_num} Step{step_num}: reward={obs.reward:.4f} rating={rating}")
            if obs.done:
                # The environment ended the episode; skip the remaining steps.
                break
        trajectories.append(rewards)

        # An episode with no scripted steps has no final reward to report.
        final = f"{rewards[-1]:.4f}" if rewards else "n/a"
        formatted = [f"{r:.2f}" for r in rewards]
        print(f" Ep{ep_num} Final: {final} (trajectory: {formatted})")
        print()
    return trajectories


def run_multi_episode_test():
    """Run the scripted easy and hard episodes and print a reward report.

    The report has four parts: the per-step rewards of every easy-task
    episode, the same for the hard task, a summary that labels each episode's
    reward trajectory as non-decreasing ("IMPROVING") or not, and a check
    that stepping the same action after three fresh resets yields the same
    reward each time.

    Returns:
        A dict mapping ``"task_easy"`` and ``"task_hard"`` to their list of
        per-episode reward trajectories.
    """
    env = PolicyEvolverEnvironment()
    all_results = {}

    _print_banner(" EASY TASK — Multi-Episode Progression Test")
    all_results["task_easy"] = _run_task_episodes(env, "task_easy", EASY_ACTIONS)

    # The previous section ends with a blank line, so no extra one is needed.
    _print_banner(" HARD TASK — Multi-Episode Progression Test", leading_blank=False)
    all_results["task_hard"] = _run_task_episodes(env, "task_hard", HARD_ACTIONS)

    _print_banner(" MULTI-EPISODE SUMMARY", leading_blank=False)
    for task, episodes in all_results.items():
        print(f"\n {task}:")
        for ep_num, rewards in enumerate(episodes, start=1):
            trajectory = " → ".join(f"{r:.2f}" for r in rewards)
            # Non-decreasing counts as improving; trajectories with fewer
            # than two rewards are trivially improving.
            improving = all(later >= earlier for earlier, later in zip(rewards, rewards[1:]))
            status = "✓ IMPROVING" if improving else "✗ NON-MONOTONIC"
            print(f" Episode {ep_num}: [{trajectory}] {status}")

    _print_banner(" CROSS-EPISODE DETERMINISM CHECK")
    env.reset(task_id="task_easy")
    # Reuse the single step of the third easy episode as the probe action.
    det_action = EASY_ACTIONS[2][0]
    scores = []
    for _ in range(3):
        env.reset(task_id="task_easy")
        obs = env.step(copy.deepcopy(det_action))
        scores.append(obs.reward)

    # Exact equality is intended: identical runs must give identical rewards.
    all_same = len(set(scores)) == 1
    print(f" 3 identical runs: {scores}")
    print(f" Deterministic: {'✓ YES' if all_same else '✗ NO'}")

    _print_banner(" ALL MULTI-EPISODE TESTS COMPLETE")
    return all_results

# Run the verification only when executed as a script, not when imported.
if __name__ == "__main__":
    run_multi_episode_test()