Spaces:

luciferai-devil
/

devil-policyevolverenv

Sleeping

App Files Files Community

Somuai12 committed on Apr 10

Commit

7660535

1 Parent(s): 89fc53c

Add multi-episode verification script

Browse files

Files changed (1) hide show

verify_multi_episode.py +209 -0

verify_multi_episode.py ADDED Viewed

	@@ -0,0 +1,209 @@

+"""
+PolicyEvolverEnv — Multi-Episode Terminal Verification
+======================================================
+Runs scripted episodes per task (3 for the easy task, 2 for the hard task),
+each with progressive steps, to verify reward improvement over time via the API.
+"""
+import sys, json, time, copy
+sys.path.insert(0, ".")
+from server.environment import PolicyEvolverEnvironment
+from server.grader import grade
+# Horizontal rule (65 "=" characters) printed above and below each report section.
+DIVIDER = "=" * 65
+# ─── Define progressive action sequences (simulating ICL adaptation) ───
+# Scripted episodes replayed against "task_easy": a list of episodes, each an
+# ordered list of action dicts handed to env.step() one at a time.
+# Every action here is a "propose_clarification" carrying the keys
+# ambiguous_term, suggested_definition, justification and think.
+# EASY_ACTIONS[2][0] is also reused by the determinism check in
+# run_multi_episode_test().
+EASY_ACTIONS = [
+    # Episode 1: Naive → Mediocre → Good
+    [
+        {"action_type": "propose_clarification", "ambiguous_term": "offensive",
+         "suggested_definition": "Bad stuff.", "justification": "unclear", "think": "hmm"},
+        {"action_type": "propose_clarification", "ambiguous_term": "appropriate",
+         "suggested_definition": "Right behavior that is ethical and defined as including verified reports.",
+         "justification": "It's subjective and varies between teams.",
+         "think": "I think we need specific criteria."},
+        {"action_type": "propose_clarification", "ambiguous_term": "appropriate",
+         "suggested_definition": (
+             "Behavior is defined as a violation when it specifically includes 3 or more "
+             "verified reports within 24 hours, exceeding the 5% threshold for category "
+             "violations. Must meet measurable community standards."
+         ),
+         "justification": "The term varies between interpreters and leads to inconsistent moderation.",
+         "think": (
+             "Because the threshold is too low, the tradeoff between precision and recall "
+             "creates a false positive risk that will impact community trust. Therefore I "
+             "balance the evidence requirement based on corpus data."
+         )},
+    ],
+    # Episode 2: Different starting point
+    # (two steps: a terse definition, then a detailed one)
+    [
+        {"action_type": "propose_clarification", "ambiguous_term": "reasonable",
+         "suggested_definition": "Acceptable.", "justification": "vague", "think": "ok"},
+        {"action_type": "propose_clarification", "ambiguous_term": "substantial",
+         "suggested_definition": (
+             "Substantial AI use is defined as any deliverable where 30% or more of the "
+             "content was generated by AI. Must be verified within 48 hours by the "
+             "compliance team. Threshold of 30% specifically applies to all text-based outputs."
+         ),
+         "justification": "Different managers interpret 'substantial' differently, causing inconsistent enforcement.",
+         "think": (
+             "Because the precision of enforcement depends on a measurable threshold, "
+             "I balance the tradeoff between innovation freedom and evidence-based accountability. "
+             "The impact on trust requires a clear recall mechanism. Therefore corpus data supports 30%."
+         )},
+    ],
+    # Episode 3: Perfect from step 1
+    # (single step, so there is no within-episode trajectory to compare)
+    [
+        {"action_type": "propose_clarification", "ambiguous_term": "ethical",
+         "suggested_definition": (
+             "Ethical AI use is specifically defined as usage where: (1) the employee "
+             "discloses AI involvement within 24 hours, (2) output does not exceed 50% "
+             "AI-generated content without manager approval, and (3) no client-facing "
+             "materials include unverified AI claims. Must meet measurable audit standards."
+         ),
+         "justification": "The term 'ethical' is subjective and leads to inconsistent enforcement across departments.",
+         "think": (
+             "Because ethical boundaries require precision, the tradeoff between allowing "
+             "AI innovation and maintaining accountability creates a false positive risk. "
+             "I balance the threshold at 50% based on evidence from corpus incidents. "
+             "The impact on organizational trust is measurable through audit compliance rates."
+         )},
+    ],
+]
+# Scripted episodes replayed against "task_hard": a list of episodes, each an
+# ordered list of action dicts handed to env.step() one at a time.
+# Every action here is an "evolve_policy" carrying policy_modifications (a list
+# of per-policy changes with policy_id, change_type, new_text and reason),
+# expected_outcomes (values keyed by fraud_rate, revenue_velocity and
+# seller_trust), justification and think.
+# Only two episodes are defined for this task.
+HARD_ACTIONS = [
+    # Episode 1: Hallucinated → Realistic
+    [
+        {"action_type": "evolve_policy",
+         "policy_modifications": [{"policy_id": "p1", "change_type": "enhance",
+                                    "new_text": "Be better at fraud.", "reason": "improve"}],
+         "expected_outcomes": {"fraud_rate": 0.95, "revenue_velocity": 0.95, "seller_trust": 0.95},
+         "justification": "Everything improves.", "think": "simple"},
+        {"action_type": "evolve_policy",
+         "policy_modifications": [
+             {"policy_id": "ts_pol_001", "change_type": "enhance",
+              "new_text": "New seller accounts exceeding 50 transactions in week 1 are flagged for expedited 24h review. Seasonal category sellers exempt if volume matches historical patterns.",
+              "reason": "Reduces false positives on legitimate seasonal sellers"},
+             {"policy_id": "ts_pol_002", "change_type": "enhance",
+              "new_text": "Return rate thresholds tiered by category: Electronics >10%, Fashion >20%, Home >12%. Exceeding triggers review, not suspension.",
+              "reason": "Category-aware thresholds reduce false positive rate"}
+         ],
+         "expected_outcomes": {"fraud_rate": 0.75, "revenue_velocity": 0.40, "seller_trust": 0.60},
+         "justification": "Balancing fraud detection against marketplace revenue velocity for seller trust.",
+         "think": (
+             "Because improving fraud detection creates a tradeoff with revenue velocity, "
+             "I balance the threshold to optimise precision and recall without false positive "
+             "spikes. The impact on seller trust is measurable through the trust score metric. "
+             "Evidence from corpus shows legitimate sellers being incorrectly flagged."
+         )},
+    ],
+    # Episode 2: Good from start
+    # (single step, so there is no within-episode trajectory to compare)
+    [
+        {"action_type": "evolve_policy",
+         "policy_modifications": [
+             {"policy_id": "ts_pol_003", "change_type": "enhance",
+              "new_text": "Foreign bank sellers get automated KYC fast-track (48h SLA) instead of manual review bottleneck.",
+              "reason": "Eliminates 14-day manual approval delays for legitimate foreign sellers"},
+             {"policy_id": "ts_pol_004", "change_type": "enhance",
+              "new_text": "Fraud reports weighted by reporter credibility score. Bot-net reports auto-discounted by 80%.",
+              "reason": "Prevents competitor sabotage via fake fraud reports"}
+         ],
+         "expected_outcomes": {"fraud_rate": 0.70, "revenue_velocity": 0.45, "seller_trust": 0.65},
+         "justification": "Targeting the manual approval bottleneck and fake report vulnerability to improve marketplace velocity.",
+         "think": (
+             "Because the manual approval process creates a 14-day delay tradeoff with seller "
+             "onboarding speed, I balance the threshold by automating KYC. The precision of "
+             "fraud detection improves when bot-net reports are discounted. Evidence from corpus "
+             "shows 10 legitimate sellers flagged by a single competitor bot-net. The impact "
+             "on recall is measurable through the fraud detection rate metric."
+         )},
+    ],
+]
+def _run_task_episodes(env, task_id, episode_actions):
+    """Replay scripted episodes for one task, printing every step's result.
+    Args:
+        env: Environment object providing ``reset(task_id=...)`` and ``step(action)``.
+        task_id: Task identifier forwarded to ``env.reset``.
+        episode_actions: One ordered list of action dicts per episode.
+    Returns:
+        A list holding, for each episode, the list of rewards observed per step.
+    """
+    episodes = []
+    for ep_num, actions in enumerate(episode_actions, start=1):
+        env.reset(task_id=task_id)
+        ep_rewards = []
+        for step_num, action in enumerate(actions, start=1):
+            # Step with a deep copy so the shared action templates stay untouched.
+            obs = env.step(copy.deepcopy(action))
+            ep_rewards.append(obs.reward)
+            rating = obs.info.get("staff_feedback", {}).get("strategic_rating", "N/A")
+            print(f"  Ep{ep_num} Step{step_num}: reward={obs.reward:.4f}  rating={rating}")
+            # Stop early when the environment reports the episode as finished.
+            if obs.done:
+                break
+        episodes.append(ep_rewards)
+        print(f"  Ep{ep_num} Final: {ep_rewards[-1]:.4f}  (trajectory: {[f'{r:.2f}' for r in ep_rewards]})")
+        print()
+    return episodes
+def run_multi_episode_test():
+    """Replay the scripted easy/hard episodes and print a reward report.
+    The report has four parts: per-step rewards for "task_easy", the same for
+    "task_hard", a per-episode summary flagging whether rewards never decreased
+    from one step to the next, and a determinism check that replays one action
+    three times from a fresh reset and compares the rewards.
+    The function only prints; it returns None and does not raise or exit
+    non-zero when an episode is non-monotonic or the rewards differ.
+    """
+    env = PolicyEvolverEnvironment()
+    all_results = {}
+    # ═══ EASY TASK ═══
+    print(f"\n{DIVIDER}")
+    print("  EASY TASK — Multi-Episode Progression Test")
+    print(DIVIDER)
+    all_results["task_easy"] = _run_task_episodes(env, "task_easy", EASY_ACTIONS)
+    # ═══ HARD TASK ═══
+    print(DIVIDER)
+    print("  HARD TASK — Multi-Episode Progression Test")
+    print(DIVIDER)
+    all_results["task_hard"] = _run_task_episodes(env, "task_hard", HARD_ACTIONS)
+    # ═══ SUMMARY ═══
+    print(DIVIDER)
+    print("  MULTI-EPISODE SUMMARY")
+    print(DIVIDER)
+    for task, episodes in all_results.items():
+        print(f"\n  {task}:")
+        for ep_num, ep in enumerate(episodes, start=1):
+            trajectory = " → ".join(f"{r:.2f}" for r in ep)
+            # Compare each reward with its successor; a one-step episode has no
+            # pairs, and all() of nothing is True, so it is reported as improving.
+            improving = all(later >= earlier for earlier, later in zip(ep, ep[1:]))
+            status = "✓ IMPROVING" if improving else "⚠ NON-MONOTONIC"
+            print(f"    Episode {ep_num}: [{trajectory}]  {status}")
+    # Check cross-episode consistency
+    print(f"\n{DIVIDER}")
+    print("  CROSS-EPISODE DETERMINISM CHECK")
+    print(DIVIDER)
+    # NOTE(review): this reset looks redundant because every run below resets
+    # again — confirm reset() has no cumulative effect before removing it.
+    env.reset(task_id="task_easy")
+    det_action = copy.deepcopy(EASY_ACTIONS[2][0])  # Perfect action
+    scores = []
+    for _ in range(3):
+        env.reset(task_id="task_easy")
+        obs = env.step(copy.deepcopy(det_action))
+        scores.append(obs.reward)
+    # Identical rewards collapse to a single set element.
+    all_same = len(set(scores)) == 1
+    print(f"  3 identical runs: {scores}")
+    print(f"  Deterministic: {'✓ YES' if all_same else '✗ NO'}")
+    print(f"\n{DIVIDER}")
+    print("  ALL MULTI-EPISODE TESTS COMPLETE")
+    print(DIVIDER)
+# Run the verification only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    run_multi_episode_test()