File size: 10,253 Bytes
"""
PolicyEvolverEnv — Multi-Episode Terminal Verification
======================================================
Runs several scripted episodes per task (three for the easy task, two for
the hard task), each with progressive steps, to verify reward improvement
over time via the API.
"""
import sys, json, time, copy
sys.path.insert(0, ".")
from server.environment import PolicyEvolverEnvironment
from server.grader import grade
# Horizontal rule (65 "=" characters) printed above and below each section banner.
DIVIDER = "=" * 65
# --- Scripted action sequences (simulating ICL adaptation) ---
# EASY_ACTIONS holds one inner list per episode; each inner list is the ordered
# sequence of "propose_clarification" payloads submitted during that episode.
EASY_ACTIONS = [
    # Episode 1: naive -> mediocre -> good
    [
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "offensive",
            "suggested_definition": "Bad stuff.",
            "justification": "unclear",
            "think": "hmm",
        },
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "appropriate",
            "suggested_definition": (
                "Right behavior that is ethical and defined as "
                "including verified reports."
            ),
            "justification": "It's subjective and varies between teams.",
            "think": "I think we need specific criteria.",
        },
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "appropriate",
            "suggested_definition": (
                "Behavior is defined as a violation when it specifically includes "
                "3 or more verified reports within 24 hours, "
                "exceeding the 5% threshold for category violations. "
                "Must meet measurable community standards."
            ),
            "justification": (
                "The term varies between interpreters "
                "and leads to inconsistent moderation."
            ),
            "think": (
                "Because the threshold is too low, "
                "the tradeoff between precision and recall creates a false positive risk "
                "that will impact community trust. "
                "Therefore I balance the evidence requirement based on corpus data."
            ),
        },
    ],
    # Episode 2: different starting point
    [
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "reasonable",
            "suggested_definition": "Acceptable.",
            "justification": "vague",
            "think": "ok",
        },
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "substantial",
            "suggested_definition": (
                "Substantial AI use is defined as any deliverable where "
                "30% or more of the content was generated by AI. "
                "Must be verified within 48 hours by the compliance team. "
                "Threshold of 30% specifically applies to all text-based outputs."
            ),
            "justification": (
                "Different managers interpret 'substantial' differently, "
                "causing inconsistent enforcement."
            ),
            "think": (
                "Because the precision of enforcement depends on a measurable threshold, "
                "I balance the tradeoff between innovation freedom "
                "and evidence-based accountability. "
                "The impact on trust requires a clear recall mechanism. "
                "Therefore corpus data supports 30%."
            ),
        },
    ],
    # Episode 3: perfect from step 1
    [
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "ethical",
            "suggested_definition": (
                "Ethical AI use is specifically defined as usage where: "
                "(1) the employee discloses AI involvement within 24 hours, "
                "(2) output does not exceed 50% AI-generated content without manager approval, "
                "and (3) no client-facing materials include unverified AI claims. "
                "Must meet measurable audit standards."
            ),
            "justification": (
                "The term 'ethical' is subjective and leads to "
                "inconsistent enforcement across departments."
            ),
            "think": (
                "Because ethical boundaries require precision, "
                "the tradeoff between allowing AI innovation and maintaining accountability "
                "creates a false positive risk. "
                "I balance the threshold at 50% based on evidence from corpus incidents. "
                "The impact on organizational trust is measurable through audit compliance rates."
            ),
        },
    ],
]
# HARD_ACTIONS holds one inner list per episode; each inner list is the ordered
# sequence of "evolve_policy" payloads submitted during that episode.
HARD_ACTIONS = [
    # Episode 1: hallucinated -> realistic
    [
        {
            "action_type": "evolve_policy",
            "policy_modifications": [
                {
                    "policy_id": "p1",
                    "change_type": "enhance",
                    "new_text": "Be better at fraud.",
                    "reason": "improve",
                },
            ],
            "expected_outcomes": {
                "fraud_rate": 0.95,
                "revenue_velocity": 0.95,
                "seller_trust": 0.95,
            },
            "justification": "Everything improves.",
            "think": "simple",
        },
        {
            "action_type": "evolve_policy",
            "policy_modifications": [
                {
                    "policy_id": "ts_pol_001",
                    "change_type": "enhance",
                    "new_text": (
                        "New seller accounts exceeding 50 transactions in week 1 "
                        "are flagged for expedited 24h review. "
                        "Seasonal category sellers exempt if volume matches historical patterns."
                    ),
                    "reason": "Reduces false positives on legitimate seasonal sellers",
                },
                {
                    "policy_id": "ts_pol_002",
                    "change_type": "enhance",
                    "new_text": (
                        "Return rate thresholds tiered by category: "
                        "Electronics >10%, Fashion >20%, Home >12%. "
                        "Exceeding triggers review, not suspension."
                    ),
                    "reason": "Category-aware thresholds reduce false positive rate",
                },
            ],
            "expected_outcomes": {
                "fraud_rate": 0.75,
                "revenue_velocity": 0.40,
                "seller_trust": 0.60,
            },
            "justification": (
                "Balancing fraud detection against marketplace revenue velocity "
                "for seller trust."
            ),
            "think": (
                "Because improving fraud detection creates a tradeoff with revenue velocity, "
                "I balance the threshold to optimise precision and recall "
                "without false positive spikes. "
                "The impact on seller trust is measurable through the trust score metric. "
                "Evidence from corpus shows legitimate sellers being incorrectly flagged."
            ),
        },
    ],
    # Episode 2: good from the start
    [
        {
            "action_type": "evolve_policy",
            "policy_modifications": [
                {
                    "policy_id": "ts_pol_003",
                    "change_type": "enhance",
                    "new_text": (
                        "Foreign bank sellers get automated KYC fast-track (48h SLA) "
                        "instead of manual review bottleneck."
                    ),
                    "reason": (
                        "Eliminates 14-day manual approval delays "
                        "for legitimate foreign sellers"
                    ),
                },
                {
                    "policy_id": "ts_pol_004",
                    "change_type": "enhance",
                    "new_text": (
                        "Fraud reports weighted by reporter credibility score. "
                        "Bot-net reports auto-discounted by 80%."
                    ),
                    "reason": "Prevents competitor sabotage via fake fraud reports",
                },
            ],
            "expected_outcomes": {
                "fraud_rate": 0.70,
                "revenue_velocity": 0.45,
                "seller_trust": 0.65,
            },
            "justification": (
                "Targeting the manual approval bottleneck and fake report vulnerability "
                "to improve marketplace velocity."
            ),
            "think": (
                "Because the manual approval process creates a 14-day delay tradeoff "
                "with seller onboarding speed, I balance the threshold by automating KYC. "
                "The precision of fraud detection improves when bot-net reports are discounted. "
                "Evidence from corpus shows 10 legitimate sellers flagged by a single competitor bot-net. "
                "The impact on recall is measurable through the fraud detection rate metric."
            ),
        },
    ],
]
def _is_non_decreasing(rewards):
    """Return True when no reward is lower than the one before it.

    Empty and single-element sequences have no adjacent pair to compare and
    therefore count as non-decreasing.
    """
    return all(later >= earlier for earlier, later in zip(rewards, rewards[1:]))


def _run_task_episodes(env, task_id, title, episode_actions):
    """Replay every scripted episode of one task and collect the step rewards.

    Args:
        env: Environment object exposing ``reset(task_id=...)`` and
            ``step(action)``; the value returned by ``step`` must provide
            ``reward``, ``info`` and ``done``.
        task_id: Task identifier passed to ``env.reset``.
        title: Label shown in the section banner (e.g. ``"EASY TASK"``).
        episode_actions: One list of action dicts per episode.

    Returns:
        A list with one entry per episode, each entry being the list of
        rewards observed in that episode, in step order.
    """
    print(DIVIDER)
    print(f" {title} — Multi-Episode Progression Test")
    print(DIVIDER)

    episodes = []
    for ep_num, actions in enumerate(episode_actions, start=1):
        env.reset(task_id=task_id)
        ep_rewards = []
        for step_num, action in enumerate(actions, start=1):
            # Hand the environment its own copy of the payload; presumably this
            # keeps the module-level action templates safe from mutation.
            obs = env.step(copy.deepcopy(action))
            ep_rewards.append(obs.reward)
            rating = obs.info.get("staff_feedback", {}).get("strategic_rating", "N/A")
            print(f" Ep{ep_num} Step{step_num}: reward={obs.reward:.4f} rating={rating}")
            if obs.done:
                # The environment ended the episode early; skip remaining scripted steps.
                break
        episodes.append(ep_rewards)
        if ep_rewards:
            trajectory = [f"{r:.2f}" for r in ep_rewards]
            print(f" Ep{ep_num} Final: {ep_rewards[-1]:.4f} (trajectory: {trajectory})")
        else:
            # An empty scripted episode has no final reward to report.
            print(f" Ep{ep_num} Final: n/a (no steps executed)")
        print()
    return episodes


def run_multi_episode_test():
    """Replay the scripted easy/hard episodes and print a reward report.

    The run has three phases, all printed to stdout:

    1. Every episode in ``EASY_ACTIONS`` and ``HARD_ACTIONS`` is replayed
       against a single ``PolicyEvolverEnvironment`` instance, logging the
       reward and strategic rating of each step.
    2. A summary marks each episode as improving (rewards never decrease
       from one step to the next) or non-monotonic.
    3. A determinism check submits the first action of the third easy
       episode three times, each after a fresh reset, and reports whether
       the three rewards are identical.

    Returns:
        A dict mapping ``"task_easy"`` and ``"task_hard"`` to their list of
        per-episode reward lists.
    """
    env = PolicyEvolverEnvironment()
    all_results = {}

    # Blank line ahead of the first banner; later banners follow the blank
    # line that closes the previous episode report.
    print()
    all_results["task_easy"] = _run_task_episodes(env, "task_easy", "EASY TASK", EASY_ACTIONS)
    all_results["task_hard"] = _run_task_episodes(env, "task_hard", "HARD TASK", HARD_ACTIONS)

    print(DIVIDER)
    print(" MULTI-EPISODE SUMMARY")
    print(DIVIDER)
    for task, episodes in all_results.items():
        print(f"\n {task}:")
        for ep_num, ep in enumerate(episodes, start=1):
            trajectory = " → ".join(f"{r:.2f}" for r in ep)
            status = "✓ IMPROVING" if _is_non_decreasing(ep) else "✗ NON-MONOTONIC"
            print(f" Episode {ep_num}: [{trajectory}] {status}")

    # Check cross-episode consistency: same action, fresh reset, same reward?
    print(f"\n{DIVIDER}")
    print(" CROSS-EPISODE DETERMINISM CHECK")
    print(DIVIDER)
    det_action = EASY_ACTIONS[2][0]  # the "perfect from step 1" action
    scores = []
    for _ in range(3):
        env.reset(task_id="task_easy")
        scores.append(env.step(copy.deepcopy(det_action)).reward)
    all_same = len(set(scores)) == 1
    print(f" 3 identical runs: {scores}")
    print(f" Deterministic: {'✓ YES' if all_same else '✗ NO'}")

    print(f"\n{DIVIDER}")
    print(" ALL MULTI-EPISODE TESTS COMPLETE")
    print(DIVIDER)
    return all_results


if __name__ == "__main__":
    run_multi_episode_test()
|