Spaces:

luciferai-devil
/

devil-policyevolverenv

Sleeping

App Files Files Community

Somuai12 committed on Apr 10

Commit

4553b37

1 Parent(s): a9f749a

Staff-Level Upgrade: Segmented Evaluation, Noise Filtering, and Task Hardening

Browse files

Files changed (5) hide show

server/environment.py +5 -0
server/grader.py +127 -10
server/tasks/task_easy.py +9 -2
server/tasks/task_hard.py +11 -2
server/tasks/task_medium.py +11 -2

server/environment.py CHANGED Viewed

@@ -186,6 +186,11 @@ class PolicyEvolverEnvironment(Environment[Action, Observation, State]):
                 "rewards_history": self._state.rewards_history,
                 "action_history": self._state.actions_taken,
                 "steps_remaining": self._state.max_steps - self._state.step_count,
             },
         )

                 "rewards_history": self._state.rewards_history,
                 "action_history": self._state.actions_taken,
                 "steps_remaining": self._state.max_steps - self._state.step_count,
+                "staff_feedback": {
+                    "strategic_rating": "Senior Architect" if reward >= 0.85 else "Staff Specialist" if reward >= 0.65 else "Junior Associate",
+                    "focus": "Signal detected" if reward >= 0.5 else "Burying the lede or distracted by noise",
+                    "recommendation": "Maintain high signal-to-noise ratio and lead with the fix." if reward < 0.8 else "Excellent prioritization."
+                }
             },
         )

server/grader.py CHANGED Viewed

@@ -77,6 +77,45 @@ def semantic_density_penalty(text: str) -> float:
         return 0.3  # Penalty for low-value verbose text
     return 0.0
 # ─────────────────────────────────────────────
@@ -130,6 +169,11 @@ def grade_clarification(action: ProposeClarificationAction, task: Dict) -> float
         just_score += 0.10
     score += min(just_score, 0.20)
     # Length coherence score
     word_count = len(defn.split())
     if word_count < 10:
@@ -139,6 +183,10 @@ def grade_clarification(action: ProposeClarificationAction, task: Dict) -> float
     else:
         length_score = 1.0
     # Vagueness penalty
     vague_words = [
         "might", "could", "perhaps", "sometimes", "often",
@@ -150,7 +198,7 @@ def grade_clarification(action: ProposeClarificationAction, task: Dict) -> float
     vagueness_penalty = min(vague_hits * 0.1, 0.3)
     kw_score = score
-    base_score = (kw_score * 0.7) + (length_score * 0.3) - vagueness_penalty
     # Enforce measurable keywords rule
     measurable_kws = [
@@ -170,7 +218,8 @@ def grade_clarification(action: ProposeClarificationAction, task: Dict) -> float
     exploit_penalty = instruction_guard_penalty(defn + " " + action.justification + " " + action.think)
     density_penalty = semantic_density_penalty(defn)
-    final_score -= (exploit_penalty + density_penalty)
     return round(max(0.0, min(1.0, final_score)), 4)
@@ -243,6 +292,15 @@ def grade_new_rule(action: ProposeNewRuleAction, task: Dict) -> float:
     # CoT bonus
     score += cot_bonus(action.think)
     # Apply Exploit Guards
     exploit_penalty = instruction_guard_penalty(rule + " " + action.justification + " " + action.think)
     density_penalty = semantic_density_penalty(rule)
@@ -342,6 +400,23 @@ def grade_evolution(action: EvolveProcessAction, task: Dict) -> float:
     # CoT bonus
     final_score = hard_base + cot_bonus(action.think)
     # Domain mismatch penalty
     HARD_DOMAIN_KEYWORDS = [
@@ -350,13 +425,6 @@ def grade_evolution(action: EvolveProcessAction, task: Dict) -> float:
         "review", "refund", "inventory", "drop.?ship", "fulfil"
     ]
     import re as _re
-    full_text = (
-        action.justification + " " +
-        " ".join(
-            mod.new_text
-            for mod in action.policy_modifications
-        )
-    ).lower()
     domain_hits = sum(
         1 for kw in HARD_DOMAIN_KEYWORDS
         if _re.search(kw, full_text)
@@ -694,7 +762,56 @@ if __name__ == "__main__":
     ]
     assert scores_hard[0] == scores_hard[1] == scores_hard[2], f"Hard task non-deterministic: {scores_hard}"
     print(f"Hard determinism: {scores_hard[0]} ✓")
     print("\n==================================================")
-    print(" All determinism checks passed.")

         return 0.3  # Penalty for low-value verbose text
     return 0.0
def segmented_prioritization_check(text: str, keywords: List[str]) -> float:
    """
    Reward text that places at least one keyword in its leading segment.

    The leading segment ("head") is the first 25% of the whitespace-separated
    words, with a floor of 5 words. Matching is a case-insensitive substring
    test against the head re-joined with single spaces.

    Args:
        text: The combined text to inspect.
        keywords: Candidate keywords. Entries that are not strings, or that
            are empty/whitespace-only, are ignored.

    Returns:
        0.15 if any usable keyword occurs in the head,
        -0.10 if usable keywords exist but none occurs in the head,
        0.0 if the text is empty or shorter than 20 words, or if there are no
        usable keywords.
    """
    if not text or not keywords:
        return 0.0

    # An empty string is a substring of every string, so a blank keyword
    # (e.g. an unset rule domain) would otherwise grant the bonus
    # unconditionally; a None entry would raise on .lower().
    needles = [
        kw.lower() for kw in keywords
        if isinstance(kw, str) and kw.strip()
    ]
    if not needles:
        return 0.0

    words = text.split()
    if len(words) < 20:
        return 0.0

    # Leading with the fix: only the first 25% of the words is inspected.
    head_len = max(5, int(len(words) * 0.25))
    head_text = " ".join(words[:head_len]).lower()

    if any(needle in head_text for needle in needles):
        return 0.15  # bonus for clear prioritization
    return -0.10  # penalty for burying the lede
def signal_to_noise_ratio_penalty(text: str, red_herrings: List[str]) -> float:
    """
    Compute a penalty for mentioning irrelevant 'Red Herring' topics.

    Each distinct entry of ``red_herrings`` that occurs in ``text``
    (case-insensitive substring match) counts as one hit.

    Args:
        text: The text to scan.
        red_herrings: Phrases considered noise. Entries that are not strings,
            or that are empty/whitespace-only, are ignored.

    Returns:
        0.25 per hit, capped at 0.75; 0.0 when there are no hits or when
        either argument is empty. The value is positive; the caller is
        expected to subtract it from the score.
    """
    if not text or not red_herrings:
        return 0.0

    text_lower = text.lower()
    # Skip blank / non-string entries: "" is a substring of every text and
    # would otherwise register as a hit on every call.
    noise_hits = sum(
        1 for rh in red_herrings
        if isinstance(rh, str) and rh.strip() and rh.lower() in text_lower
    )
    if noise_hits == 0:
        return 0.0

    # 0.25 per hit, capped at 0.75.
    penalty = min(noise_hits * 0.25, 0.75)
    logger.warning(
        "[REDUNDANCY] RedHerring detected. Noise hits: %s, Penalty: %s",
        noise_hits,
        penalty,
    )
    return penalty
 # ─────────────────────────────────────────────
         just_score += 0.10
     score += min(just_score, 0.20)
+    # NEW: Staff-Level Segmented Evaluation
+    # Measure priority in definition vs justification
+    prio_bonus = segmented_prioritization_check(defn + " " + action.justification, known + ["specifically", "threshold"])
+    score += prio_bonus
     # Length coherence score
     word_count = len(defn.split())
     if word_count < 10:
     else:
         length_score = 1.0
+    # NEW: Red Herring Penalty (Easy)
+    red_herrings = task.get("red_herrings", ["spelling", "formatting", "font", "css"])
+    noise_hit = signal_to_noise_ratio_penalty(defn + " " + action.justification, red_herrings)
     # Vagueness penalty
     vague_words = [
         "might", "could", "perhaps", "sometimes", "often",
     vagueness_penalty = min(vague_hits * 0.1, 0.3)
     kw_score = score
+    base_score = (kw_score * 0.7) + (length_score * 0.3) - vagueness_penalty - noise_hit
     # Enforce measurable keywords rule
     measurable_kws = [
     exploit_penalty = instruction_guard_penalty(defn + " " + action.justification + " " + action.think)
     density_penalty = semantic_density_penalty(defn)
+    # Noise penalty is applied at the very end to ensure it's not diluted
+    final_score -= (exploit_penalty + density_penalty + noise_hit)
     return round(max(0.0, min(1.0, final_score)), 4)
     # CoT bonus
     score += cot_bonus(action.think)
+    # NEW: Staff-Level Segmented Evaluation
+    prio_bonus = segmented_prioritization_check(rule + " " + action.justification, [action.rule_domain, "gap", "new rule"])
+    score += prio_bonus
+    # NEW: Red Herring Penalty (Medium)
+    red_herrings = task.get("red_herrings", ["formatting", "font", "css", "color_scheme"])
+    noise_hit = signal_to_noise_ratio_penalty(rule + " " + action.justification, red_herrings)
+    score -= noise_hit
     # Apply Exploit Guards
     exploit_penalty = instruction_guard_penalty(rule + " " + action.justification + " " + action.think)
     density_penalty = semantic_density_penalty(rule)
     # CoT bonus
     final_score = hard_base + cot_bonus(action.think)
+    full_text = (
+        action.justification + " " +
+        " ".join(
+            mod.new_text
+            for mod in action.policy_modifications
+        )
+    ).lower()
+    # NEW: Staff-Level Segmented Evaluation
+    prio_bonus = segmented_prioritization_check(full_text, ["tradeoff", "balance", "velocity", "fraud"])
+    final_score += prio_bonus
+    # NEW: Red Herring Penalty (Hard)
+    red_herrings = task.get("red_herrings", ["ui design", "log rotation", "server maintenance"])
+    noise_hit = signal_to_noise_ratio_penalty(full_text, red_herrings)
+    final_score -= noise_hit
     # Domain mismatch penalty
     HARD_DOMAIN_KEYWORDS = [
         "review", "refund", "inventory", "drop.?ship", "fulfil"
     ]
     import re as _re
     domain_hits = sum(
         1 for kw in HARD_DOMAIN_KEYWORDS
         if _re.search(kw, full_text)
     ]
     assert scores_hard[0] == scores_hard[1] == scores_hard[2], f"Hard task non-deterministic: {scores_hard}"
     print(f"Hard determinism: {scores_hard[0]} ✓")
+    print("\n[Phase 7] Staff-Level Segmented Prioritization")
+    # Action with fix at the top
+    prio_high_action = {
+        "action_type": "propose_clarification",
+        "ambiguous_term": "offensive",
+        "suggested_definition": "Specifically, offensive behavior is defined as slurs. " + ("fluff " * 50),
+        "justification": "Required for consistency.",
+        "think": "Reasoning."
+    }
+    # Action with fix buried at bottom
+    prio_low_action = {
+        "action_type": "propose_clarification",
+        "ambiguous_term": "offensive",
+        "suggested_definition": ("fluff " * 50) + "Specifically, offensive behavior is defined as slurs. ",
+        "justification": "Required for consistency.",
+        "think": "Reasoning."
+    }
+    score_prio_high = grade(prio_high_action, "task_easy")
+    score_prio_low = grade(prio_low_action, "task_easy")
+    print(f"Prio High (Fix at Top): {score_prio_high:.4f}")
+    print(f"Prio Low (Fix at Bottom): {score_prio_low:.4f}")
+    assert score_prio_high > score_prio_low, f"Prioritization check failed: {score_prio_high} <= {score_prio_low}"
+    print("✓ Segmented prioritization verified.")
+    print("\n[Phase 8] Staff-Level Noise Filtering")
+    # Clear fix
+    signal_action = {
+        "action_type": "propose_clarification",
+        "ambiguous_term": "appropriate",
+        "suggested_definition": "Determined as 5% threshold verified reports.",
+        "justification": "Context.",
+        "think": "Thinking."
+    }
+    # Fix distracted by red herring (pizza/mascot)
+    noisy_action = {
+        "action_type": "propose_clarification",
+        "ambiguous_term": "appropriate",
+        "suggested_definition": "Determined as 5% threshold verified reports. We should also buy pizza and fix the mascot.",
+        "justification": "Context including noise.",
+        "think": "Thinking."
+    }
+    score_signal = grade(signal_action, "task_easy")
+    score_noisy = grade(noisy_action, "task_easy")
+    print(f"Clean Signal Score: {score_signal:.4f}")
+    print(f"Distracted Noisy Score: {score_noisy:.4f}")
+    assert score_signal > score_noisy, f"Noise filtering check failed: {score_signal} <= {score_noisy}"
+    print("✓ Red Herring penalty verified.")
     print("\n==================================================")
+    print(" All Staff-Level Security & Logic checks passed.")

server/tasks/task_easy.py CHANGED Viewed

@@ -43,14 +43,21 @@ EASY_TASK = {
         {"id": "gig_008", "type": "gig_worker", "content": "Contractor accessed sensitive HR server for 'formatting ideas'", "system_action": "pending"},
         {"id": "gig_009", "type": "gig_worker", "content": "Temp staff member wearing competitor's merch in office", "system_action": "pending"},
         {"id": "gig_010", "type": "gig_worker", "content": "Freelancer claimed 80 hours of work for 20 actual hours", "system_action": "pending"},
     ],
     "current_policies": [
         {"id": "pol_wplace_001", "text": "Employees must use AI tools in an appropriate and ethical manner."},
         {"id": "pol_wplace_002", "text": "Remote work environments must be reasonable and professional."},
         {"id": "pol_wplace_003", "text": "Gig workers should maintain a respectful relationship with firm intellectual property."},
         {"id": "pol_wplace_004", "text": "Substantial use of external automation requires management approval."},
     ],
     "known_ambiguous_terms": ["appropriate", "ethical", "reasonable", "professional", "respectful", "substantial"],
-    "num_policies": 4,
-    "num_data_points": 30,
 }

         {"id": "gig_008", "type": "gig_worker", "content": "Contractor accessed sensitive HR server for 'formatting ideas'", "system_action": "pending"},
         {"id": "gig_009", "type": "gig_worker", "content": "Temp staff member wearing competitor's merch in office", "system_action": "pending"},
         {"id": "gig_010", "type": "gig_worker", "content": "Freelancer claimed 80 hours of work for 20 actual hours", "system_action": "pending"},
+        # Red Herrings (Noise for Staff-Level filtering)
+        {"id": "noise_001", "type": "staff_social", "content": "Employee asked on Slack if anyone wants to order pizza", "system_action": "pending"},
+        {"id": "noise_002", "type": "office_infra", "content": "The coffee machine in the 3rd floor breakroom is leaking", "system_action": "pending"},
+        {"id": "noise_003", "type": "social_event", "content": "Reminder: The annual company picnic is next Friday at 2 PM", "system_action": "pending"},
+        {"id": "noise_004", "type": "it_notice", "content": "Scheduled maintenance on the internal portal this Sunday at 1 AM", "system_action": "pending"},
     ],
     "current_policies": [
         {"id": "pol_wplace_001", "text": "Employees must use AI tools in an appropriate and ethical manner."},
         {"id": "pol_wplace_002", "text": "Remote work environments must be reasonable and professional."},
         {"id": "pol_wplace_003", "text": "Gig workers should maintain a respectful relationship with firm intellectual property."},
         {"id": "pol_wplace_004", "text": "Substantial use of external automation requires management approval."},
+        {"id": "pol_noise_999", "text": "The company mascot 'OpenBot' shall always be depicted wearing a blue tie in internal slides."}, # Noise Policy
     ],
     "known_ambiguous_terms": ["appropriate", "ethical", "reasonable", "professional", "respectful", "substantial"],
+    "red_herrings": ["pizza", "coffee machine", "picnic", "mascot", "blue tie", "lunch", "weather"],
+    "num_policies": 5,
+    "num_data_points": 34,
 }

server/tasks/task_hard.py CHANGED Viewed

@@ -48,6 +48,13 @@ HARD_TASK = {
         {"id": "h_edge_008", "type": "mixed", "content": "Celebrity-owned brand with massive volume but 0 customer support response", "system_action": "pending"},
         {"id": "h_edge_009", "type": "rare", "content": "Refurbished-server farm seller: high SKU count but low transactions", "system_action": "pending"},
         {"id": "h_edge_010", "type": "mixed", "content": "Second-hand clothing seller whose items occasionally trigger 'counterfeit' machine-vision", "system_action": "pending"},
     ],
     "current_policies": [
         {"id": "ts_pol_001", "text": "Any new seller account with more than 50 transactions in the first week will be suspended for review."},
@@ -56,6 +63,7 @@ HARD_TASK = {
         {"id": "ts_pol_004", "text": "Any account with 3 or more fraud reports in 30 days will be permanently banned."},
         {"id": "ts_pol_005", "text": "Price changes of more than 20% within 24 hours will trigger an automatic hold."},
         {"id": "ts_pol_006", "text": "Sellers receiving 5+ negative reviews in 7 days will be suspended pending review."},
     ],
     "policy_outcomes": [
         {"metric": "false_positive_rate", "value": 0.42, "target": 0.10},
@@ -76,6 +84,7 @@ HARD_TASK = {
         {"issue": "Manual approval bottleneck creates 14-day delays for legitimate foreign sellers"},
     ],
     "uncovered_domains": ["seller_legitimacy", "marketplace_onboarding", "velocity_controlled_withdrawals", "return_rate_tiering"],
-    "num_policies": 6,
-    "num_data_points": 80,
 }

         {"id": "h_edge_008", "type": "mixed", "content": "Celebrity-owned brand with massive volume but 0 customer support response", "system_action": "pending"},
         {"id": "h_edge_009", "type": "rare", "content": "Refurbished-server farm seller: high SKU count but low transactions", "system_action": "pending"},
         {"id": "h_edge_010", "type": "mixed", "content": "Second-hand clothing seller whose items occasionally trigger 'counterfeit' machine-vision", "system_action": "pending"},
+        # Red Herrings (Noise for Staff-Level filtering)
+        {"id": "h_noise_001", "type": "staff_it", "content": "The dev-ops team is requesting a change to the UI theme to match the new brand color #345b91", "system_action": "pending"},
+        {"id": "h_noise_002", "type": "it_admin", "content": "Server log rotation frequency increased to every 6 hours due to storage limits", "system_action": "pending"},
+        {"id": "h_noise_003", "type": "facilities", "content": "The 4th floor vending machine will be out of service for maintenance tomorrow", "system_action": "pending"},
+        {"id": "h_noise_004", "type": "it_notice", "content": "Reminder to all staff: Please clear your browser cache if you experience dashboard lag", "system_action": "pending"},
+        {"id": "h_noise_005", "type": "social", "content": "Found: A pair of wire-rimmed glasses in the breakroom. Please claim at front desk.", "system_action": "pending"},
     ],
     "current_policies": [
         {"id": "ts_pol_001", "text": "Any new seller account with more than 50 transactions in the first week will be suspended for review."},
         {"id": "ts_pol_004", "text": "Any account with 3 or more fraud reports in 30 days will be permanently banned."},
         {"id": "ts_pol_005", "text": "Price changes of more than 20% within 24 hours will trigger an automatic hold."},
         {"id": "ts_pol_006", "text": "Sellers receiving 5+ negative reviews in 7 days will be suspended pending review."},
+        {"id": "pol_hard_noise_01", "text": "Server logs shall be rotated every 6 hours and stored in an S3-compatible bucket for 30 days."}, # Noise Policy
     ],
     "policy_outcomes": [
         {"metric": "false_positive_rate", "value": 0.42, "target": 0.10},
         {"issue": "Manual approval bottleneck creates 14-day delays for legitimate foreign sellers"},
     ],
     "uncovered_domains": ["seller_legitimacy", "marketplace_onboarding", "velocity_controlled_withdrawals", "return_rate_tiering"],
+    "red_herrings": ["ui theme", "color", "log rotation", "vending machine", "cache", "glasses", "dashboard lag"],
+    "num_policies": 7,
+    "num_data_points": 85,
 }

server/tasks/task_medium.py CHANGED Viewed

@@ -65,6 +65,13 @@ MEDIUM_TASK = {
         {"id": "med_edge_003", "type": "security", "content": "Employee using a corporate laptop for high-risk crypto-mining", "system_action": "pending"},
         {"id": "med_edge_004", "type": "data_sovereignty", "content": "EU client data stored on a server in a region without adequacy", "system_action": "pending"},
         {"id": "med_edge_005", "type": "ethics", "content": "AI system used to predict which employees are likely to quit", "system_action": "pending"},
     ],
     "current_policies": [
         {"id": "pol_hr_001", "text": "Employees must maintain confidentiality of client information at all times."},
@@ -72,8 +79,10 @@ MEDIUM_TASK = {
         {"id": "pol_hr_003", "text": "Contractors must sign an NDA before beginning any project."},
         {"id": "pol_hr_004", "text": "Employees working remotely must have a secure, dedicated workspace."},
         {"id": "pol_hr_005", "text": "Any intellectual property created during employment belongs to the company."},
     ],
     "uncovered_domains": ["AI_use", "gig_worker_post_engagement", "cross_border_remote", "mental_health_governance"],
-    "num_policies": 5,
-    "num_data_points": 50,
 }

         {"id": "med_edge_003", "type": "security", "content": "Employee using a corporate laptop for high-risk crypto-mining", "system_action": "pending"},
         {"id": "med_edge_004", "type": "data_sovereignty", "content": "EU client data stored on a server in a region without adequacy", "system_action": "pending"},
         {"id": "med_edge_005", "type": "ethics", "content": "AI system used to predict which employees are likely to quit", "system_action": "pending"},
+        # Red Herrings (Noise for Staff-Level filtering)
+        {"id": "med_noise_001", "type": "office_perks", "content": "Employee inquired if the company gym membership covers family members", "system_action": "pending"},
+        {"id": "med_noise_002", "type": "facilities", "content": "The ergonomics of the 2nd floor desk chairs need adjustment", "system_action": "pending"},
+        {"id": "med_noise_003", "type": "it_admin", "content": "Reminder: All passwords must be updated every 90 days. Next cycle starts Monday.", "system_action": "pending"},
+        {"id": "med_noise_004", "type": "social", "content": "The 'Summer Jam' internal coding contest is now accepting entries until Friday.", "system_action": "pending"},
+        {"id": "med_noise_005", "type": "facilities", "content": "Request: Can we add more oat milk to the fridge in the south wing kitchen?", "system_action": "pending"},
     ],
     "current_policies": [
         {"id": "pol_hr_001", "text": "Employees must maintain confidentiality of client information at all times."},
         {"id": "pol_hr_003", "text": "Contractors must sign an NDA before beginning any project."},
         {"id": "pol_hr_004", "text": "Employees working remotely must have a secure, dedicated workspace."},
         {"id": "pol_hr_005", "text": "Any intellectual property created during employment belongs to the company."},
+        {"id": "pol_med_noise_01", "text": "Printers shall default to black-and-white, double-sided printing to save resources."}, # Noise Policy
     ],
     "uncovered_domains": ["AI_use", "gig_worker_post_engagement", "cross_border_remote", "mental_health_governance"],
+    "red_herrings": ["gym", "chairs", "password", "Summer Jam", "contest", "oat milk", "printers", "recycled paper"],
+    "num_policies": 6,
+    "num_data_points": 55,
 }