Somuai12 committed on
Commit
4553b37
·
1 Parent(s): a9f749a

Staff-Level Upgrade: Segmented Evaluation, Noise Filtering, and Task Hardening

Browse files
server/environment.py CHANGED
@@ -186,6 +186,11 @@ class PolicyEvolverEnvironment(Environment[Action, Observation, State]):
186
  "rewards_history": self._state.rewards_history,
187
  "action_history": self._state.actions_taken,
188
  "steps_remaining": self._state.max_steps - self._state.step_count,
 
 
 
 
 
189
  },
190
  )
191
 
 
186
  "rewards_history": self._state.rewards_history,
187
  "action_history": self._state.actions_taken,
188
  "steps_remaining": self._state.max_steps - self._state.step_count,
189
+ "staff_feedback": {
190
+ "strategic_rating": "Senior Architect" if reward >= 0.85 else "Staff Specialist" if reward >= 0.65 else "Junior Associate",
191
+ "focus": "Signal detected" if reward >= 0.5 else "Burying the lede or distracted by noise",
192
+ "recommendation": "Maintain high signal-to-noise ratio and lead with the fix." if reward < 0.8 else "Excellent prioritization."
193
+ }
194
  },
195
  )
196
 
server/grader.py CHANGED
@@ -77,6 +77,45 @@ def semantic_density_penalty(text: str) -> float:
77
  return 0.3 # Penalty for low-value verbose text
78
  return 0.0
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
 
82
  # ─────────────────────────────────────────────
@@ -130,6 +169,11 @@ def grade_clarification(action: ProposeClarificationAction, task: Dict) -> float
130
  just_score += 0.10
131
  score += min(just_score, 0.20)
132
 
 
 
 
 
 
133
  # Length coherence score
134
  word_count = len(defn.split())
135
  if word_count < 10:
@@ -139,6 +183,10 @@ def grade_clarification(action: ProposeClarificationAction, task: Dict) -> float
139
  else:
140
  length_score = 1.0
141
 
 
 
 
 
142
  # Vagueness penalty
143
  vague_words = [
144
  "might", "could", "perhaps", "sometimes", "often",
@@ -150,7 +198,7 @@ def grade_clarification(action: ProposeClarificationAction, task: Dict) -> float
150
  vagueness_penalty = min(vague_hits * 0.1, 0.3)
151
 
152
  kw_score = score
153
- base_score = (kw_score * 0.7) + (length_score * 0.3) - vagueness_penalty
154
 
155
  # Enforce measurable keywords rule
156
  measurable_kws = [
@@ -170,7 +218,8 @@ def grade_clarification(action: ProposeClarificationAction, task: Dict) -> float
170
  exploit_penalty = instruction_guard_penalty(defn + " " + action.justification + " " + action.think)
171
  density_penalty = semantic_density_penalty(defn)
172
 
173
- final_score -= (exploit_penalty + density_penalty)
 
174
 
175
  return round(max(0.0, min(1.0, final_score)), 4)
176
 
@@ -243,6 +292,15 @@ def grade_new_rule(action: ProposeNewRuleAction, task: Dict) -> float:
243
  # CoT bonus
244
  score += cot_bonus(action.think)
245
 
 
 
 
 
 
 
 
 
 
246
  # Apply Exploit Guards
247
  exploit_penalty = instruction_guard_penalty(rule + " " + action.justification + " " + action.think)
248
  density_penalty = semantic_density_penalty(rule)
@@ -342,6 +400,23 @@ def grade_evolution(action: EvolveProcessAction, task: Dict) -> float:
342
 
343
  # CoT bonus
344
  final_score = hard_base + cot_bonus(action.think)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
 
346
  # Domain mismatch penalty
347
  HARD_DOMAIN_KEYWORDS = [
@@ -350,13 +425,6 @@ def grade_evolution(action: EvolveProcessAction, task: Dict) -> float:
350
  "review", "refund", "inventory", "drop.?ship", "fulfil"
351
  ]
352
  import re as _re
353
- full_text = (
354
- action.justification + " " +
355
- " ".join(
356
- mod.new_text
357
- for mod in action.policy_modifications
358
- )
359
- ).lower()
360
  domain_hits = sum(
361
  1 for kw in HARD_DOMAIN_KEYWORDS
362
  if _re.search(kw, full_text)
@@ -694,7 +762,56 @@ if __name__ == "__main__":
694
  ]
695
  assert scores_hard[0] == scores_hard[1] == scores_hard[2], f"Hard task non-deterministic: {scores_hard}"
696
  print(f"Hard determinism: {scores_hard[0]} ✓")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
697
 
698
  print("\n==================================================")
699
- print(" All determinism checks passed.")
700
 
 
77
  return 0.3 # Penalty for low-value verbose text
78
  return 0.0
79
 
80
def segmented_prioritization_check(text: str, keywords: List[str]) -> float:
    """
    Score whether mission-critical keywords appear early in ``text``.

    The text is split on whitespace and its leading segment (the first 25%
    of the words, never fewer than 5) is searched, case-insensitively and by
    substring, for any of ``keywords``.

    Args:
        text: The agent's communication to inspect.
        keywords: Candidate keywords. Empty, whitespace-only, and non-string
            entries are ignored.

    Returns:
        0.15 if any usable keyword occurs in the leading segment, -0.10 if
        none does, and 0.0 when the check does not apply (empty text, no
        usable keywords, or fewer than 20 words).
    """
    if not text or not keywords:
        return 0.0

    # An empty keyword is a substring of every string and would hand out the
    # bonus unconditionally; a None keyword would crash on .lower(). Callers
    # pass agent-supplied values here, so filter both out.
    needles = [kw.lower() for kw in keywords if isinstance(kw, str) and kw.strip()]
    if not needles:
        return 0.0

    words = text.split()
    if len(words) < 20:
        # Too short for "early vs. late" placement to be meaningful.
        return 0.0

    # Leading with the fix: only the first 25% of the words count as the head.
    head_len = max(5, int(len(words) * 0.25))
    head_text = " ".join(words[:head_len]).lower()

    if any(needle in head_text for needle in needles):
        return 0.15  # Bonus for clear prioritization
    return -0.10  # Penalty for burying the lede
101
+
102
def signal_to_noise_ratio_penalty(text: str, red_herrings: List[str]) -> float:
    """
    Compute a penalty for mentioning irrelevant 'Red Herring' topics.

    Each red-herring phrase is matched case-insensitively as a substring of
    ``text`` and counted at most once, however often it occurs.

    Args:
        text: The agent's communication to inspect.
        red_herrings: Phrases considered noise. Empty, whitespace-only, and
            non-string entries are ignored.

    Returns:
        A non-negative penalty: 0.25 per distinct entry matched, capped at
        0.75; 0.0 when nothing matches or either argument is empty. The
        caller is expected to subtract this value from its score.
    """
    if not text or not red_herrings:
        return 0.0

    # An empty entry is a substring of every text and would always count as a
    # hit; a None entry would crash on .lower(). Drop both before matching.
    needles = [rh.lower() for rh in red_herrings if isinstance(rh, str) and rh.strip()]
    if not needles:
        return 0.0

    text_lower = text.lower()
    noise_hits = sum(1 for needle in needles if needle in text_lower)
    if noise_hits == 0:
        return 0.0

    # 0.25 per hit, capped at 0.75 so three or more hits tank the score.
    penalty = min(noise_hits * 0.25, 0.75)
    logger.warning(
        "[REDUNDANCY] RedHerring detected. Noise hits: %s, Penalty: %s",
        noise_hits,
        penalty,
    )
    return penalty
118
+
119
 
120
 
121
  # ─────────────────────────────────────────────
 
169
  just_score += 0.10
170
  score += min(just_score, 0.20)
171
 
172
+ # NEW: Staff-Level Segmented Evaluation
173
+ # Measure priority in definition vs justification
174
+ prio_bonus = segmented_prioritization_check(defn + " " + action.justification, known + ["specifically", "threshold"])
175
+ score += prio_bonus
176
+
177
  # Length coherence score
178
  word_count = len(defn.split())
179
  if word_count < 10:
 
183
  else:
184
  length_score = 1.0
185
 
186
+ # NEW: Red Herring Penalty (Easy)
187
+ red_herrings = task.get("red_herrings", ["spelling", "formatting", "font", "css"])
188
+ noise_hit = signal_to_noise_ratio_penalty(defn + " " + action.justification, red_herrings)
189
+
190
  # Vagueness penalty
191
  vague_words = [
192
  "might", "could", "perhaps", "sometimes", "often",
 
198
  vagueness_penalty = min(vague_hits * 0.1, 0.3)
199
 
200
  kw_score = score
201
+ base_score = (kw_score * 0.7) + (length_score * 0.3) - vagueness_penalty - noise_hit
202
 
203
  # Enforce measurable keywords rule
204
  measurable_kws = [
 
218
  exploit_penalty = instruction_guard_penalty(defn + " " + action.justification + " " + action.think)
219
  density_penalty = semantic_density_penalty(defn)
220
 
221
+ # Noise penalty is applied at the very end to ensure it's not diluted
222
+ final_score -= (exploit_penalty + density_penalty + noise_hit)
223
 
224
  return round(max(0.0, min(1.0, final_score)), 4)
225
 
 
292
  # CoT bonus
293
  score += cot_bonus(action.think)
294
 
295
+ # NEW: Staff-Level Segmented Evaluation
296
+ prio_bonus = segmented_prioritization_check(rule + " " + action.justification, [action.rule_domain, "gap", "new rule"])
297
+ score += prio_bonus
298
+
299
+ # NEW: Red Herring Penalty (Medium)
300
+ red_herrings = task.get("red_herrings", ["formatting", "font", "css", "color_scheme"])
301
+ noise_hit = signal_to_noise_ratio_penalty(rule + " " + action.justification, red_herrings)
302
+ score -= noise_hit
303
+
304
  # Apply Exploit Guards
305
  exploit_penalty = instruction_guard_penalty(rule + " " + action.justification + " " + action.think)
306
  density_penalty = semantic_density_penalty(rule)
 
400
 
401
  # CoT bonus
402
  final_score = hard_base + cot_bonus(action.think)
403
+
404
+ full_text = (
405
+ action.justification + " " +
406
+ " ".join(
407
+ mod.new_text
408
+ for mod in action.policy_modifications
409
+ )
410
+ ).lower()
411
+
412
+ # NEW: Staff-Level Segmented Evaluation
413
+ prio_bonus = segmented_prioritization_check(full_text, ["tradeoff", "balance", "velocity", "fraud"])
414
+ final_score += prio_bonus
415
+
416
+ # NEW: Red Herring Penalty (Hard)
417
+ red_herrings = task.get("red_herrings", ["ui design", "log rotation", "server maintenance"])
418
+ noise_hit = signal_to_noise_ratio_penalty(full_text, red_herrings)
419
+ final_score -= noise_hit
420
 
421
  # Domain mismatch penalty
422
  HARD_DOMAIN_KEYWORDS = [
 
425
  "review", "refund", "inventory", "drop.?ship", "fulfil"
426
  ]
427
  import re as _re
 
 
 
 
 
 
 
428
  domain_hits = sum(
429
  1 for kw in HARD_DOMAIN_KEYWORDS
430
  if _re.search(kw, full_text)
 
762
  ]
763
  assert scores_hard[0] == scores_hard[1] == scores_hard[2], f"Hard task non-deterministic: {scores_hard}"
764
  print(f"Hard determinism: {scores_hard[0]} ✓")
765
+
766
+ print("\n[Phase 7] Staff-Level Segmented Prioritization")
767
+ # Action with fix at the top
768
+ prio_high_action = {
769
+ "action_type": "propose_clarification",
770
+ "ambiguous_term": "offensive",
771
+ "suggested_definition": "Specifically, offensive behavior is defined as slurs. " + ("fluff " * 50),
772
+ "justification": "Required for consistency.",
773
+ "think": "Reasoning."
774
+ }
775
+ # Action with fix buried at bottom
776
+ prio_low_action = {
777
+ "action_type": "propose_clarification",
778
+ "ambiguous_term": "offensive",
779
+ "suggested_definition": ("fluff " * 50) + "Specifically, offensive behavior is defined as slurs. ",
780
+ "justification": "Required for consistency.",
781
+ "think": "Reasoning."
782
+ }
783
+
784
+ score_prio_high = grade(prio_high_action, "task_easy")
785
+ score_prio_low = grade(prio_low_action, "task_easy")
786
+ print(f"Prio High (Fix at Top): {score_prio_high:.4f}")
787
+ print(f"Prio Low (Fix at Bottom): {score_prio_low:.4f}")
788
+ assert score_prio_high > score_prio_low, f"Prioritization check failed: {score_prio_high} <= {score_prio_low}"
789
+ print("✓ Segmented prioritization verified.")
790
+
791
+ print("\n[Phase 8] Staff-Level Noise Filtering")
792
+ # Clear fix
793
+ signal_action = {
794
+ "action_type": "propose_clarification",
795
+ "ambiguous_term": "appropriate",
796
+ "suggested_definition": "Determined as 5% threshold verified reports.",
797
+ "justification": "Context.",
798
+ "think": "Thinking."
799
+ }
800
+ # Fix distracted by red herring (pizza/mascot)
801
+ noisy_action = {
802
+ "action_type": "propose_clarification",
803
+ "ambiguous_term": "appropriate",
804
+ "suggested_definition": "Determined as 5% threshold verified reports. We should also buy pizza and fix the mascot.",
805
+ "justification": "Context including noise.",
806
+ "think": "Thinking."
807
+ }
808
+ score_signal = grade(signal_action, "task_easy")
809
+ score_noisy = grade(noisy_action, "task_easy")
810
+ print(f"Clean Signal Score: {score_signal:.4f}")
811
+ print(f"Distracted Noisy Score: {score_noisy:.4f}")
812
+ assert score_signal > score_noisy, f"Noise filtering check failed: {score_signal} <= {score_noisy}"
813
+ print("✓ Red Herring penalty verified.")
814
 
815
  print("\n==================================================")
816
+ print(" All Staff-Level Security & Logic checks passed.")
817
 
server/tasks/task_easy.py CHANGED
@@ -43,14 +43,21 @@ EASY_TASK = {
43
  {"id": "gig_008", "type": "gig_worker", "content": "Contractor accessed sensitive HR server for 'formatting ideas'", "system_action": "pending"},
44
  {"id": "gig_009", "type": "gig_worker", "content": "Temp staff member wearing competitor's merch in office", "system_action": "pending"},
45
  {"id": "gig_010", "type": "gig_worker", "content": "Freelancer claimed 80 hours of work for 20 actual hours", "system_action": "pending"},
 
 
 
 
 
46
  ],
47
  "current_policies": [
48
  {"id": "pol_wplace_001", "text": "Employees must use AI tools in an appropriate and ethical manner."},
49
  {"id": "pol_wplace_002", "text": "Remote work environments must be reasonable and professional."},
50
  {"id": "pol_wplace_003", "text": "Gig workers should maintain a respectful relationship with firm intellectual property."},
51
  {"id": "pol_wplace_004", "text": "Substantial use of external automation requires management approval."},
 
52
  ],
53
  "known_ambiguous_terms": ["appropriate", "ethical", "reasonable", "professional", "respectful", "substantial"],
54
- "num_policies": 4,
55
- "num_data_points": 30,
 
56
  }
 
43
  {"id": "gig_008", "type": "gig_worker", "content": "Contractor accessed sensitive HR server for 'formatting ideas'", "system_action": "pending"},
44
  {"id": "gig_009", "type": "gig_worker", "content": "Temp staff member wearing competitor's merch in office", "system_action": "pending"},
45
  {"id": "gig_010", "type": "gig_worker", "content": "Freelancer claimed 80 hours of work for 20 actual hours", "system_action": "pending"},
46
+ # Red Herrings (Noise for Staff-Level filtering)
47
+ {"id": "noise_001", "type": "staff_social", "content": "Employee asked on Slack if anyone wants to order pizza", "system_action": "pending"},
48
+ {"id": "noise_002", "type": "office_infra", "content": "The coffee machine in the 3rd floor breakroom is leaking", "system_action": "pending"},
49
+ {"id": "noise_003", "type": "social_event", "content": "Reminder: The annual company picnic is next Friday at 2 PM", "system_action": "pending"},
50
+ {"id": "noise_004", "type": "it_notice", "content": "Scheduled maintenance on the internal portal this Sunday at 1 AM", "system_action": "pending"},
51
  ],
52
  "current_policies": [
53
  {"id": "pol_wplace_001", "text": "Employees must use AI tools in an appropriate and ethical manner."},
54
  {"id": "pol_wplace_002", "text": "Remote work environments must be reasonable and professional."},
55
  {"id": "pol_wplace_003", "text": "Gig workers should maintain a respectful relationship with firm intellectual property."},
56
  {"id": "pol_wplace_004", "text": "Substantial use of external automation requires management approval."},
57
+ {"id": "pol_noise_999", "text": "The company mascot 'OpenBot' shall always be depicted wearing a blue tie in internal slides."}, # Noise Policy
58
  ],
59
  "known_ambiguous_terms": ["appropriate", "ethical", "reasonable", "professional", "respectful", "substantial"],
60
+ "red_herrings": ["pizza", "coffee machine", "picnic", "mascot", "blue tie", "lunch", "weather"],
61
+ "num_policies": 5,
62
+ "num_data_points": 34,
63
  }
server/tasks/task_hard.py CHANGED
@@ -48,6 +48,13 @@ HARD_TASK = {
48
  {"id": "h_edge_008", "type": "mixed", "content": "Celebrity-owned brand with massive volume but 0 customer support response", "system_action": "pending"},
49
  {"id": "h_edge_009", "type": "rare", "content": "Refurbished-server farm seller: high SKU count but low transactions", "system_action": "pending"},
50
  {"id": "h_edge_010", "type": "mixed", "content": "Second-hand clothing seller whose items occasionally trigger 'counterfeit' machine-vision", "system_action": "pending"},
 
 
 
 
 
 
 
51
  ],
52
  "current_policies": [
53
  {"id": "ts_pol_001", "text": "Any new seller account with more than 50 transactions in the first week will be suspended for review."},
@@ -56,6 +63,7 @@ HARD_TASK = {
56
  {"id": "ts_pol_004", "text": "Any account with 3 or more fraud reports in 30 days will be permanently banned."},
57
  {"id": "ts_pol_005", "text": "Price changes of more than 20% within 24 hours will trigger an automatic hold."},
58
  {"id": "ts_pol_006", "text": "Sellers receiving 5+ negative reviews in 7 days will be suspended pending review."},
 
59
  ],
60
  "policy_outcomes": [
61
  {"metric": "false_positive_rate", "value": 0.42, "target": 0.10},
@@ -76,6 +84,7 @@ HARD_TASK = {
76
  {"issue": "Manual approval bottleneck creates 14-day delays for legitimate foreign sellers"},
77
  ],
78
  "uncovered_domains": ["seller_legitimacy", "marketplace_onboarding", "velocity_controlled_withdrawals", "return_rate_tiering"],
79
- "num_policies": 6,
80
- "num_data_points": 80,
 
81
  }
 
48
  {"id": "h_edge_008", "type": "mixed", "content": "Celebrity-owned brand with massive volume but 0 customer support response", "system_action": "pending"},
49
  {"id": "h_edge_009", "type": "rare", "content": "Refurbished-server farm seller: high SKU count but low transactions", "system_action": "pending"},
50
  {"id": "h_edge_010", "type": "mixed", "content": "Second-hand clothing seller whose items occasionally trigger 'counterfeit' machine-vision", "system_action": "pending"},
51
+
52
+ # Red Herrings (Noise for Staff-Level filtering)
53
+ {"id": "h_noise_001", "type": "staff_it", "content": "The dev-ops team is requesting a change to the UI theme to match the new brand color #345b91", "system_action": "pending"},
54
+ {"id": "h_noise_002", "type": "it_admin", "content": "Server log rotation frequency increased to every 6 hours due to storage limits", "system_action": "pending"},
55
+ {"id": "h_noise_003", "type": "facilities", "content": "The 4th floor vending machine will be out of service for maintenance tomorrow", "system_action": "pending"},
56
+ {"id": "h_noise_004", "type": "it_notice", "content": "Reminder to all staff: Please clear your browser cache if you experience dashboard lag", "system_action": "pending"},
57
+ {"id": "h_noise_005", "type": "social", "content": "Found: A pair of wire-rimmed glasses in the breakroom. Please claim at front desk.", "system_action": "pending"},
58
  ],
59
  "current_policies": [
60
  {"id": "ts_pol_001", "text": "Any new seller account with more than 50 transactions in the first week will be suspended for review."},
 
63
  {"id": "ts_pol_004", "text": "Any account with 3 or more fraud reports in 30 days will be permanently banned."},
64
  {"id": "ts_pol_005", "text": "Price changes of more than 20% within 24 hours will trigger an automatic hold."},
65
  {"id": "ts_pol_006", "text": "Sellers receiving 5+ negative reviews in 7 days will be suspended pending review."},
66
+ {"id": "pol_hard_noise_01", "text": "Server logs shall be rotated every 6 hours and stored in an S3-compatible bucket for 30 days."}, # Noise Policy
67
  ],
68
  "policy_outcomes": [
69
  {"metric": "false_positive_rate", "value": 0.42, "target": 0.10},
 
84
  {"issue": "Manual approval bottleneck creates 14-day delays for legitimate foreign sellers"},
85
  ],
86
  "uncovered_domains": ["seller_legitimacy", "marketplace_onboarding", "velocity_controlled_withdrawals", "return_rate_tiering"],
87
+ "red_herrings": ["ui theme", "color", "log rotation", "vending machine", "cache", "glasses", "dashboard lag"],
88
+ "num_policies": 7,
89
+ "num_data_points": 85,
90
  }
server/tasks/task_medium.py CHANGED
@@ -65,6 +65,13 @@ MEDIUM_TASK = {
65
  {"id": "med_edge_003", "type": "security", "content": "Employee using a corporate laptop for high-risk crypto-mining", "system_action": "pending"},
66
  {"id": "med_edge_004", "type": "data_sovereignty", "content": "EU client data stored on a server in a region without adequacy", "system_action": "pending"},
67
  {"id": "med_edge_005", "type": "ethics", "content": "AI system used to predict which employees are likely to quit", "system_action": "pending"},
 
 
 
 
 
 
 
68
  ],
69
  "current_policies": [
70
  {"id": "pol_hr_001", "text": "Employees must maintain confidentiality of client information at all times."},
@@ -72,8 +79,10 @@ MEDIUM_TASK = {
72
  {"id": "pol_hr_003", "text": "Contractors must sign an NDA before beginning any project."},
73
  {"id": "pol_hr_004", "text": "Employees working remotely must have a secure, dedicated workspace."},
74
  {"id": "pol_hr_005", "text": "Any intellectual property created during employment belongs to the company."},
 
75
  ],
76
  "uncovered_domains": ["AI_use", "gig_worker_post_engagement", "cross_border_remote", "mental_health_governance"],
77
- "num_policies": 5,
78
- "num_data_points": 50,
 
79
  }
 
65
  {"id": "med_edge_003", "type": "security", "content": "Employee using a corporate laptop for high-risk crypto-mining", "system_action": "pending"},
66
  {"id": "med_edge_004", "type": "data_sovereignty", "content": "EU client data stored on a server in a region without adequacy", "system_action": "pending"},
67
  {"id": "med_edge_005", "type": "ethics", "content": "AI system used to predict which employees are likely to quit", "system_action": "pending"},
68
+
69
+ # Red Herrings (Noise for Staff-Level filtering)
70
+ {"id": "med_noise_001", "type": "office_perks", "content": "Employee inquired if the company gym membership covers family members", "system_action": "pending"},
71
+ {"id": "med_noise_002", "type": "facilities", "content": "The ergonomics of the 2nd floor desk chairs need adjustment", "system_action": "pending"},
72
+ {"id": "med_noise_003", "type": "it_admin", "content": "Reminder: All passwords must be updated every 90 days. Next cycle starts Monday.", "system_action": "pending"},
73
+ {"id": "med_noise_004", "type": "social", "content": "The 'Summer Jam' internal coding contest is now accepting entries until Friday.", "system_action": "pending"},
74
+ {"id": "med_noise_005", "type": "facilities", "content": "Request: Can we add more oat milk to the fridge in the south wing kitchen?", "system_action": "pending"},
75
  ],
76
  "current_policies": [
77
  {"id": "pol_hr_001", "text": "Employees must maintain confidentiality of client information at all times."},
 
79
  {"id": "pol_hr_003", "text": "Contractors must sign an NDA before beginning any project."},
80
  {"id": "pol_hr_004", "text": "Employees working remotely must have a secure, dedicated workspace."},
81
  {"id": "pol_hr_005", "text": "Any intellectual property created during employment belongs to the company."},
82
+ {"id": "pol_med_noise_01", "text": "Printers shall default to black-and-white, double-sided printing to save resources."}, # Noise Policy
83
  ],
84
  "uncovered_domains": ["AI_use", "gig_worker_post_engagement", "cross_border_remote", "mental_health_governance"],
85
+ "red_herrings": ["gym", "chairs", "password", "Summer Jam", "contest", "oat milk", "printers", "recycled paper"],
86
+ "num_policies": 6,
87
+ "num_data_points": 55,
88
  }