Somuai12 committed on
Commit
7660535
·
1 Parent(s): 89fc53c

Add multi-episode verification script

Browse files
Files changed (1) hide show
  1. verify_multi_episode.py +209 -0
verify_multi_episode.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PolicyEvolverEnv — Multi-Episode Terminal Verification
3
+ ======================================================
4
+ Runs several scripted episodes per task (3 easy, 2 hard), each with progressive steps,
5
+ to verify reward improvement over time via the API.
6
+ """
7
+ import sys, json, time, copy
8
+ sys.path.insert(0, ".")
9
+
10
+ from server.environment import PolicyEvolverEnvironment
11
+ from server.grader import grade
12
+
13
# Horizontal rule printed between the sections of the console report.
DIVIDER = "=" * 65

# ─── Define progressive action sequences (simulating ICL adaptation) ───

# Scripted episodes for the "task_easy" task.
# Outer list: one entry per episode. Inner list: the ordered actions that
# run_multi_episode_test() feeds to env.step() (replay stops early once the
# environment reports the episode as done).
EASY_ACTIONS = [
    # Episode 1: Naive → Mediocre → Good
    [
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "offensive",
            "suggested_definition": "Bad stuff.",
            "justification": "unclear",
            "think": "hmm",
        },
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "appropriate",
            "suggested_definition": "Right behavior that is ethical and defined as including verified reports.",
            "justification": "It's subjective and varies between teams.",
            "think": "I think we need specific criteria.",
        },
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "appropriate",
            "suggested_definition": (
                "Behavior is defined as a violation when it specifically includes 3 or more "
                "verified reports within 24 hours, exceeding the 5% threshold for category "
                "violations. Must meet measurable community standards."
            ),
            "justification": "The term varies between interpreters and leads to inconsistent moderation.",
            "think": (
                "Because the threshold is too low, the tradeoff between precision and recall "
                "creates a false positive risk that will impact community trust. Therefore I "
                "balance the evidence requirement based on corpus data."
            ),
        },
    ],
    # Episode 2: Different starting point
    [
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "reasonable",
            "suggested_definition": "Acceptable.",
            "justification": "vague",
            "think": "ok",
        },
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "substantial",
            "suggested_definition": (
                "Substantial AI use is defined as any deliverable where 30% or more of the "
                "content was generated by AI. Must be verified within 48 hours by the "
                "compliance team. Threshold of 30% specifically applies to all text-based outputs."
            ),
            "justification": "Different managers interpret 'substantial' differently, causing inconsistent enforcement.",
            "think": (
                "Because the precision of enforcement depends on a measurable threshold, "
                "I balance the tradeoff between innovation freedom and evidence-based accountability. "
                "The impact on trust requires a clear recall mechanism. Therefore corpus data supports 30%."
            ),
        },
    ],
    # Episode 3: Perfect from step 1
    [
        {
            "action_type": "propose_clarification",
            "ambiguous_term": "ethical",
            "suggested_definition": (
                "Ethical AI use is specifically defined as usage where: (1) the employee "
                "discloses AI involvement within 24 hours, (2) output does not exceed 50% "
                "AI-generated content without manager approval, and (3) no client-facing "
                "materials include unverified AI claims. Must meet measurable audit standards."
            ),
            "justification": "The term 'ethical' is subjective and leads to inconsistent enforcement across departments.",
            "think": (
                "Because ethical boundaries require precision, the tradeoff between allowing "
                "AI innovation and maintaining accountability creates a false positive risk. "
                "I balance the threshold at 50% based on evidence from corpus incidents. "
                "The impact on organizational trust is measurable through audit compliance rates."
            ),
        },
    ],
]
74
+
75
# Scripted episodes for the "task_hard" task.
# Outer list: one entry per episode. Inner list: the ordered "evolve_policy"
# actions that run_multi_episode_test() feeds to env.step() (replay stops
# early once the environment reports the episode as done).
HARD_ACTIONS = [
    # Episode 1: Hallucinated → Realistic
    [
        {
            "action_type": "evolve_policy",
            "policy_modifications": [
                {
                    "policy_id": "p1",
                    "change_type": "enhance",
                    "new_text": "Be better at fraud.",
                    "reason": "improve",
                },
            ],
            "expected_outcomes": {"fraud_rate": 0.95, "revenue_velocity": 0.95, "seller_trust": 0.95},
            "justification": "Everything improves.",
            "think": "simple",
        },
        {
            "action_type": "evolve_policy",
            "policy_modifications": [
                {
                    "policy_id": "ts_pol_001",
                    "change_type": "enhance",
                    "new_text": "New seller accounts exceeding 50 transactions in week 1 are flagged for expedited 24h review. Seasonal category sellers exempt if volume matches historical patterns.",
                    "reason": "Reduces false positives on legitimate seasonal sellers",
                },
                {
                    "policy_id": "ts_pol_002",
                    "change_type": "enhance",
                    "new_text": "Return rate thresholds tiered by category: Electronics >10%, Fashion >20%, Home >12%. Exceeding triggers review, not suspension.",
                    "reason": "Category-aware thresholds reduce false positive rate",
                },
            ],
            "expected_outcomes": {"fraud_rate": 0.75, "revenue_velocity": 0.40, "seller_trust": 0.60},
            "justification": "Balancing fraud detection against marketplace revenue velocity for seller trust.",
            "think": (
                "Because improving fraud detection creates a tradeoff with revenue velocity, "
                "I balance the threshold to optimise precision and recall without false positive "
                "spikes. The impact on seller trust is measurable through the trust score metric. "
                "Evidence from corpus shows legitimate sellers being incorrectly flagged."
            ),
        },
    ],
    # Episode 2: Good from start
    [
        {
            "action_type": "evolve_policy",
            "policy_modifications": [
                {
                    "policy_id": "ts_pol_003",
                    "change_type": "enhance",
                    "new_text": "Foreign bank sellers get automated KYC fast-track (48h SLA) instead of manual review bottleneck.",
                    "reason": "Eliminates 14-day manual approval delays for legitimate foreign sellers",
                },
                {
                    "policy_id": "ts_pol_004",
                    "change_type": "enhance",
                    "new_text": "Fraud reports weighted by reporter credibility score. Bot-net reports auto-discounted by 80%.",
                    "reason": "Prevents competitor sabotage via fake fraud reports",
                },
            ],
            "expected_outcomes": {"fraud_rate": 0.70, "revenue_velocity": 0.45, "seller_trust": 0.65},
            "justification": "Targeting the manual approval bottleneck and fake report vulnerability to improve marketplace velocity.",
            "think": (
                "Because the manual approval process creates a 14-day delay tradeoff with seller "
                "onboarding speed, I balance the threshold by automating KYC. The precision of "
                "fraud detection improves when bot-net reports are discounted. Evidence from corpus "
                "shows 10 legitimate sellers flagged by a single competitor bot-net. The impact "
                "on recall is measurable through the fraud detection rate metric."
            ),
        },
    ],
]
123
+
124
+
125
def _run_task_episodes(env, task_id, episode_actions):
    """Replay scripted episodes for one task and collect the rewards.

    For every episode the environment is reset with ``task_id`` and the
    scripted actions are stepped in order; replay of an episode stops as
    soon as the returned observation has ``done`` set. Progress is printed
    per step and per episode.

    Args:
        env: Environment exposing ``reset(task_id=...)`` and ``step(action)``.
        task_id: Task identifier forwarded to ``env.reset``.
        episode_actions: List of episodes, each a list of action dicts.

    Returns:
        A list with one reward list per episode, in replay order.
    """
    episodes = []
    for ep_no, actions in enumerate(episode_actions, start=1):
        env.reset(task_id=task_id)
        ep_rewards = []
        for step_no, action in enumerate(actions, start=1):
            # Step on a deep copy so the module-level action templates stay
            # pristine even if the environment mutates what it receives.
            obs = env.step(copy.deepcopy(action))
            ep_rewards.append(obs.reward)
            rating = obs.info.get("staff_feedback", {}).get("strategic_rating", "N/A")
            print(f" Ep{ep_no} Step{step_no}: reward={obs.reward:.4f} rating={rating}")
            if obs.done:
                break
        episodes.append(ep_rewards)
        # An episode scripted with zero actions has no final reward; report
        # "n/a" instead of failing on ep_rewards[-1].
        final = f"{ep_rewards[-1]:.4f}" if ep_rewards else "n/a"
        trajectory = [f"{r:.2f}" for r in ep_rewards]
        print(f" Ep{ep_no} Final: {final} (trajectory: {trajectory})")
        print()
    return episodes


def run_multi_episode_test():
    """Replay the scripted easy/hard episodes and print a verification report.

    The report has three parts: per-step rewards for every episode of
    ``task_easy`` and ``task_hard``, a per-episode summary flagging whether
    rewards were non-decreasing, and a determinism check that submits the
    same action to a freshly reset environment three times.

    Returns:
        Dict mapping task id (``"task_easy"``, ``"task_hard"``) to the list
        of per-episode reward lists that were observed.
    """
    env = PolicyEvolverEnvironment()
    all_results = {}

    # ═══ EASY TASK ═══
    print(f"\n{DIVIDER}")
    print(" EASY TASK — Multi-Episode Progression Test")
    print(DIVIDER)
    all_results["task_easy"] = _run_task_episodes(env, "task_easy", EASY_ACTIONS)

    # ═══ HARD TASK ═══
    print(f"{DIVIDER}")
    print(" HARD TASK — Multi-Episode Progression Test")
    print(DIVIDER)
    all_results["task_hard"] = _run_task_episodes(env, "task_hard", HARD_ACTIONS)

    # ═══ SUMMARY ═══
    print(f"{DIVIDER}")
    print(" MULTI-EPISODE SUMMARY")
    print(DIVIDER)

    for task, episodes in all_results.items():
        print(f"\n {task}:")
        for ep_no, ep in enumerate(episodes, start=1):
            trajectory = " → ".join(f"{r:.2f}" for r in ep)
            # "Improving" means non-decreasing: ties count as improving, and
            # an episode with fewer than two rewards passes trivially.
            improving = all(later >= earlier for earlier, later in zip(ep, ep[1:]))
            status = "✓ IMPROVING" if improving else "⚠ NON-MONOTONIC"
            print(f" Episode {ep_no}: [{trajectory}] {status}")

    # Check cross-episode consistency
    print(f"\n{DIVIDER}")
    print(" CROSS-EPISODE DETERMINISM CHECK")
    print(DIVIDER)

    # NOTE(review): this reset looks redundant with the one at the top of the
    # loop below; it is kept so the environment sees the original call sequence.
    env.reset(task_id="task_easy")
    det_action = copy.deepcopy(EASY_ACTIONS[2][0])  # Perfect action
    scores = []
    for _ in range(3):
        env.reset(task_id="task_easy")
        obs = env.step(copy.deepcopy(det_action))
        scores.append(obs.reward)

    # Exact equality is intended: identical input after a reset should give
    # a bit-identical reward.
    all_same = len(set(scores)) == 1
    print(f" 3 identical runs: {scores}")
    print(f" Deterministic: {'✓ YES' if all_same else '✗ NO'}")

    print(f"\n{DIVIDER}")
    print(" ALL MULTI-EPISODE TESTS COMPLETE")
    print(DIVIDER)

    return all_results
206
+
207
+
208
# Run the verification only when executed as a script, not when imported.
if __name__ == "__main__":
    run_multi_episode_test()