Add multi-episode verification script
Browse files- verify_multi_episode.py +209 -0
verify_multi_episode.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PolicyEvolverEnv — Multi-Episode Terminal Verification
|
| 3 |
+
======================================================
|
| 4 |
+
Runs multiple episodes per task (3 easy, 2 hard), each with progressive steps,
|
| 5 |
+
to verify reward improvement over time via the API.
|
| 6 |
+
"""
|
| 7 |
+
import sys, json, time, copy
|
| 8 |
+
sys.path.insert(0, ".")
|
| 9 |
+
|
| 10 |
+
from server.environment import PolicyEvolverEnvironment
|
| 11 |
+
from server.grader import grade
|
| 12 |
+
|
| 13 |
+
# Horizontal rule (65 "=" characters) printed above and below each section heading.
DIVIDER = "=" * 65
|
| 14 |
+
|
| 15 |
+
# ─── Define progressive action sequences (simulating ICL adaptation) ───
|
| 16 |
+
|
| 17 |
+
def _clarification(term, definition, justification, think):
    """Build one ``propose_clarification`` action payload.

    Keys are inserted in the same order the hand-written literals used, so
    serialising the payload yields the same field order.
    """
    return {
        "action_type": "propose_clarification",
        "ambiguous_term": term,
        "suggested_definition": definition,
        "justification": justification,
        "think": think,
    }


# Scripted easy-task episodes: one inner list per episode, one payload per step.
EASY_ACTIONS = [
    # Episode 1: naive -> mediocre -> good
    [
        _clarification(
            term="offensive",
            definition="Bad stuff.",
            justification="unclear",
            think="hmm",
        ),
        _clarification(
            term="appropriate",
            definition="Right behavior that is ethical and defined as including verified reports.",
            justification="It's subjective and varies between teams.",
            think="I think we need specific criteria.",
        ),
        _clarification(
            term="appropriate",
            definition=(
                "Behavior is defined as a violation when it specifically includes 3 or more "
                "verified reports within 24 hours, exceeding the 5% threshold for category "
                "violations. Must meet measurable community standards."
            ),
            justification="The term varies between interpreters and leads to inconsistent moderation.",
            think=(
                "Because the threshold is too low, the tradeoff between precision and recall "
                "creates a false positive risk that will impact community trust. Therefore I "
                "balance the evidence requirement based on corpus data."
            ),
        ),
    ],
    # Episode 2: different starting point
    [
        _clarification(
            term="reasonable",
            definition="Acceptable.",
            justification="vague",
            think="ok",
        ),
        _clarification(
            term="substantial",
            definition=(
                "Substantial AI use is defined as any deliverable where 30% or more of the "
                "content was generated by AI. Must be verified within 48 hours by the "
                "compliance team. Threshold of 30% specifically applies to all text-based outputs."
            ),
            justification="Different managers interpret 'substantial' differently, causing inconsistent enforcement.",
            think=(
                "Because the precision of enforcement depends on a measurable threshold, "
                "I balance the tradeoff between innovation freedom and evidence-based accountability. "
                "The impact on trust requires a clear recall mechanism. Therefore corpus data supports 30%."
            ),
        ),
    ],
    # Episode 3: perfect from step 1
    [
        _clarification(
            term="ethical",
            definition=(
                "Ethical AI use is specifically defined as usage where: (1) the employee "
                "discloses AI involvement within 24 hours, (2) output does not exceed 50% "
                "AI-generated content without manager approval, and (3) no client-facing "
                "materials include unverified AI claims. Must meet measurable audit standards."
            ),
            justification="The term 'ethical' is subjective and leads to inconsistent enforcement across departments.",
            think=(
                "Because ethical boundaries require precision, the tradeoff between allowing "
                "AI innovation and maintaining accountability creates a false positive risk. "
                "I balance the threshold at 50% based on evidence from corpus incidents. "
                "The impact on organizational trust is measurable through audit compliance rates."
            ),
        ),
    ],
]
|
| 74 |
+
|
| 75 |
+
def _enhance(policy_id, new_text, reason):
    """Build one ``enhance`` entry for an action's ``policy_modifications`` list."""
    return {
        "policy_id": policy_id,
        "change_type": "enhance",
        "new_text": new_text,
        "reason": reason,
    }


def _evolve_policy(modifications, outcomes, justification, think):
    """Build one ``evolve_policy`` action payload.

    ``outcomes`` is a ``(fraud_rate, revenue_velocity, seller_trust)`` triple
    that is expanded into the ``expected_outcomes`` mapping.
    """
    fraud_rate, revenue_velocity, seller_trust = outcomes
    return {
        "action_type": "evolve_policy",
        "policy_modifications": list(modifications),
        "expected_outcomes": {
            "fraud_rate": fraud_rate,
            "revenue_velocity": revenue_velocity,
            "seller_trust": seller_trust,
        },
        "justification": justification,
        "think": think,
    }


# Scripted hard-task episodes: one inner list per episode, one payload per step.
HARD_ACTIONS = [
    # Episode 1: hallucinated -> realistic
    [
        _evolve_policy(
            modifications=[
                _enhance("p1", "Be better at fraud.", "improve"),
            ],
            outcomes=(0.95, 0.95, 0.95),
            justification="Everything improves.",
            think="simple",
        ),
        _evolve_policy(
            modifications=[
                _enhance(
                    "ts_pol_001",
                    "New seller accounts exceeding 50 transactions in week 1 are flagged for expedited 24h review. Seasonal category sellers exempt if volume matches historical patterns.",
                    "Reduces false positives on legitimate seasonal sellers",
                ),
                _enhance(
                    "ts_pol_002",
                    "Return rate thresholds tiered by category: Electronics >10%, Fashion >20%, Home >12%. Exceeding triggers review, not suspension.",
                    "Category-aware thresholds reduce false positive rate",
                ),
            ],
            outcomes=(0.75, 0.40, 0.60),
            justification="Balancing fraud detection against marketplace revenue velocity for seller trust.",
            think=(
                "Because improving fraud detection creates a tradeoff with revenue velocity, "
                "I balance the threshold to optimise precision and recall without false positive "
                "spikes. The impact on seller trust is measurable through the trust score metric. "
                "Evidence from corpus shows legitimate sellers being incorrectly flagged."
            ),
        ),
    ],
    # Episode 2: good from the start
    [
        _evolve_policy(
            modifications=[
                _enhance(
                    "ts_pol_003",
                    "Foreign bank sellers get automated KYC fast-track (48h SLA) instead of manual review bottleneck.",
                    "Eliminates 14-day manual approval delays for legitimate foreign sellers",
                ),
                _enhance(
                    "ts_pol_004",
                    "Fraud reports weighted by reporter credibility score. Bot-net reports auto-discounted by 80%.",
                    "Prevents competitor sabotage via fake fraud reports",
                ),
            ],
            outcomes=(0.70, 0.45, 0.65),
            justification="Targeting the manual approval bottleneck and fake report vulnerability to improve marketplace velocity.",
            think=(
                "Because the manual approval process creates a 14-day delay tradeoff with seller "
                "onboarding speed, I balance the threshold by automating KYC. The precision of "
                "fraud detection improves when bot-net reports are discounted. Evidence from corpus "
                "shows 10 legitimate sellers flagged by a single competitor bot-net. The impact "
                "on recall is measurable through the fraud detection rate metric."
            ),
        ),
    ],
]
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def _run_episodes(env, task_id, episode_actions):
    """Play every scripted episode for one task and collect the rewards.

    Args:
        env: Environment exposing ``reset(task_id=...)`` and ``step(action)``;
            each ``step`` result is read for ``reward``, ``done`` and ``info``.
        task_id: Task identifier passed to ``env.reset`` before each episode.
        episode_actions: Sequence of episodes, each a sequence of action dicts.

    Returns:
        A list with one entry per episode, each the list of rewards observed
        for the steps that were actually executed.
    """
    episodes = []
    for ep_num, actions in enumerate(episode_actions, start=1):
        env.reset(task_id=task_id)
        rewards = []
        for step_num, action in enumerate(actions, start=1):
            # Step on a deep copy so the shared script stays untouched even
            # if the environment mutates the payload it receives.
            obs = env.step(copy.deepcopy(action))
            rewards.append(obs.reward)
            rating = obs.info.get("staff_feedback", {}).get("strategic_rating", "N/A")
            print(f" Ep{ep_num} Step{step_num}: reward={obs.reward:.4f} rating={rating}")
            if obs.done:
                break
        episodes.append(rewards)
        if rewards:
            print(f" Ep{ep_num} Final: {rewards[-1]:.4f} (trajectory: {[f'{r:.2f}' for r in rewards]})")
        else:
            # An episode with no scripted steps has no final reward to report.
            print(f" Ep{ep_num} Final: n/a (no steps executed)")
        print()
    return episodes


def run_multi_episode_test():
    """Run the scripted easy and hard episodes and print a progression report.

    Creates one ``PolicyEvolverEnvironment``, replays ``EASY_ACTIONS`` under
    ``task_easy`` and ``HARD_ACTIONS`` under ``task_hard``, prints each step's
    reward and rating, then prints a per-episode summary that flags whether
    the rewards never decreased. Finally replays the first action of the
    third easy episode three times from a fresh reset and reports whether all
    three rewards are equal. Everything is written to stdout; nothing is
    returned.
    """
    env = PolicyEvolverEnvironment()
    all_results = {}

    # ─── EASY TASK ───
    print(f"\n{DIVIDER}")
    print(" EASY TASK — Multi-Episode Progression Test")
    print(DIVIDER)
    all_results["task_easy"] = _run_episodes(env, "task_easy", EASY_ACTIONS)

    # ─── HARD TASK ───
    print(f"{DIVIDER}")
    print(" HARD TASK — Multi-Episode Progression Test")
    print(DIVIDER)
    all_results["task_hard"] = _run_episodes(env, "task_hard", HARD_ACTIONS)

    # ─── SUMMARY ───
    print(f"{DIVIDER}")
    print(" MULTI-EPISODE SUMMARY")
    print(DIVIDER)

    for task, episodes in all_results.items():
        print(f"\n {task}:")
        for ep_num, ep in enumerate(episodes, start=1):
            trajectory = " → ".join(f"{r:.2f}" for r in ep)
            # Non-decreasing check; vacuously true for 0- or 1-step episodes.
            improving = all(later >= earlier for earlier, later in zip(ep, ep[1:]))
            status = "✓ IMPROVING" if improving else "✗ NON-MONOTONIC"
            print(f" Episode {ep_num}: [{trajectory}] {status}")

    # Replay one fixed action from a fresh reset and compare the rewards.
    print(f"\n{DIVIDER}")
    print(" CROSS-EPISODE DETERMINISM CHECK")
    print(DIVIDER)

    det_action = EASY_ACTIONS[2][0]  # First (only) step of the third easy episode
    scores = []
    for _ in range(3):
        env.reset(task_id="task_easy")
        scores.append(env.step(copy.deepcopy(det_action)).reward)

    all_same = len(set(scores)) == 1
    print(f" 3 identical runs: {scores}")
    print(f" Deterministic: {'✓ YES' if all_same else '✗ NO'}")

    print(f"\n{DIVIDER}")
    print(" ALL MULTI-EPISODE TESTS COMPLETE")
    print(DIVIDER)


if __name__ == "__main__":
    run_multi_episode_test()
|