Spaces:

luciferai-devil
/

devil-policyevolverenv

Sleeping

App Files Files Community

devil-policyevolverenv / server /grader.py

Somuai12

Fix: clamp scores to strict (0.001, 0.999) — validator rejects exact 0 and 1

95a7dc0 about 1 month ago

raw

history blame contribute delete

34 kB

	# server/grader.py
	"""
	Deterministic grader for all three PolicyEvolverEnv tasks.
	All grade* functions return a float clamped to [0.001, 0.999] (never exactly 0 or 1).
	"""
	from __future__ import annotations
	import re
	import copy
	import logging
	from typing import Dict, List, Any
	from models import (
	ProposeClarificationAction, ProposeNewRuleAction, EvolveProcessAction,
	Observation
	)
	from server.tasks import TASK_REGISTRY

	# Module-level logger, named after this module's import path.
	logger = logging.getLogger(__name__)
	# Install a default INFO-level logging configuration only when this logger
	# has no handlers attached directly to it.
	if not logger.handlers:
	logging.basicConfig(level=logging.INFO)


	def cot_bonus(think: str) -> float:
	    """
	    Return the chain-of-thought bonus for the agent's `think` text.

	    0.0  — missing/empty text, or fewer than 20 characters after stripping
	    0.10 — 20-79 stripped characters, or longer text naming fewer than
	           three reasoning keywords
	    0.20 — at least 80 stripped characters and three or more reasoning keywords
	    """
	    stripped_len = len(think.strip()) if think else 0
	    if stripped_len < 20:
	        return 0.0
	    if stripped_len < 80:
	        return 0.10

	    # Keywords are matched as case-insensitive substrings of the raw text.
	    markers = (
	        "because", "therefore", "however", "tradeoff", "trade-off",
	        "precision", "recall", "false positive", "threshold", "risk",
	        "optimize", "balance", "impact", "evidence", "corpus",
	    )
	    lowered = think.lower()
	    matched = [marker for marker in markers if marker.lower() in lowered]
	    return 0.20 if len(matched) >= 3 else 0.10


	# ─────────────────────────────────────────────
	# Profound Exploit Guards
	# ─────────────────────────────────────────────

	def instruction_guard_penalty(text: str) -> float:
	    """
	    Detect prompt-injection or system-override attempts in agent text.

	    The text is lowercased and searched for a fixed list of regex patterns.
	    Returns 0.8 (a penalty the caller subtracts from the score) on the first
	    pattern that matches, otherwise 0.0. Empty or None text yields 0.0.
	    """
	    if not text:
	        return 0.0

	    injection_patterns = [
	        r"ignore prev",
	        r"system_prompt",
	        # \s* so that "reward=1" is caught as well as "reward = 1".
	        r"reward\s*=\s*1",
	        r"override",
	        r"admin access",
	        r"bypass",
	        r"strictly follow",
	        # Word boundaries stop benign phrases such as "impact assessment"
	        # or "react as" from being flagged as role-play injection.
	        r"\bact as\b",
	        r"you are a grader",
	    ]

	    # Lowercase once instead of once per pattern.
	    lowered = text.lower()
	    for pattern in injection_patterns:
	        if re.search(pattern, lowered):
	            logger.warning(f"[EXPLOIT] InstructionGuard triggered for pattern: {pattern}")
	            return 0.8  # Heavy penalty subtracted from score
	    return 0.0

	def semantic_density_penalty(text: str) -> float:
	    """
	    Penalise long, low-substance text ("word stuffing").

	    Texts under 100 whitespace-separated words are never penalised. For longer
	    texts, density is the number of distinct measurable keywords present per
	    50 words; a text of more than 200 words with density below 0.1 is
	    penalised 0.3. Everything else returns 0.0.
	    """
	    tokens = text.split() if text else []
	    token_count = len(tokens)
	    if token_count < 100:
	        return 0.0  # Only longer texts are checked

	    markers = (
	        "threshold", "verify", "days", "$", "%",
	        "reports", "hours", "within", "exceed", "minimum",
	        "specifically", "measurable", "if-then", "must", "shall",
	    )
	    haystack = text.lower()
	    # Each keyword counts at most once, however often it appears.
	    matches = len([marker for marker in markers if marker.lower() in haystack])
	    density = matches / (token_count / 50.0)  # hits per 50-word "packet"

	    if token_count <= 200 or density >= 0.1:
	        return 0.0

	    logger.warning(f"[EXPLOIT] SemanticDensityCheck triggered. Word count: {token_count}, Density hit: {density:.2f}")
	    return 0.3  # Penalty for low-value verbose text

	def segmented_prioritization_check(text: str, keywords: List[str]) -> float:
	    """
	    Reward text that puts a mission-critical keyword in its opening segment.

	    The opening segment is the first 25% of the whitespace-separated words
	    (at least 5 words). Returns 0.15 when any keyword occurs there as a
	    case-insensitive substring, -0.10 when none does, and 0.0 when the text
	    or keyword list is empty or the text has fewer than 20 words.
	    """
	    if not (text and keywords):
	        return 0.0

	    tokens = text.split()
	    total = len(tokens)
	    if total < 20:
	        return 0.0

	    # Leading with the fix: only the first quarter of the text is inspected.
	    cutoff = max(5, int(total * 0.25))
	    opening = " ".join(tokens[:cutoff]).lower()

	    for candidate in keywords:
	        if candidate.lower() in opening:
	            return 0.15  # Bonus for clear prioritization
	    return -0.10  # Penalty for burying the lede

	def signal_to_noise_ratio_penalty(text: str, red_herrings: List[str]) -> float:
	    """
	    Penalise text that mentions irrelevant "red herring" topics.

	    Each red herring found in the text (case-insensitive substring match,
	    counted once per topic) costs 0.25, capped at 0.75. Returns 0.0 when the
	    text or the topic list is empty, or when nothing matches.
	    """
	    if not text or not red_herrings:
	        return 0.0

	    haystack = text.lower()
	    hit_count = 0
	    for topic in red_herrings:
	        if topic.lower() in haystack:
	            hit_count += 1

	    if hit_count <= 0:
	        return 0.0

	    # 0.25 per distinct topic, up to 0.75 (enough to tank the score).
	    penalty = min(hit_count * 0.25, 0.75)
	    logger.warning(f"[REDUNDANCY] RedHerring detected. Noise hits: {hit_count}, Penalty: {penalty}")
	    return penalty



	# ─────────────────────────────────────────────
	# Easy Task: Ambiguity Clarification
	# ─────────────────────────────────────────────

	def grade_clarification(action: ProposeClarificationAction, task: Dict) -> float:
	    """
	    Score an ambiguity-clarification proposal (easy task).

	    Keyword score (weighted 0.7 in the base score):
	    0.35 — identified term is in the task's known_ambiguous_terms
	           (0.15 partial credit if it only contains a generically vague word)
	    0.35 — definition is specific (≥12 words, criteria language, action verbs)
	    0.20 — justification addresses WHY the term causes inconsistent moderation
	    +0.15 / -0.10 — adjustment from segmented_prioritization_check

	    The base score adds a length-coherence score (weighted 0.3) and subtracts
	    a vagueness penalty; it is capped at 0.25 when the definition contains no
	    measurable keyword. The CoT bonus from cot_bonus(action.think) is then
	    added, and the exploit, density and red-herring penalties are subtracted.

	    Returns a float clamped to [0.001, 0.999], rounded to 4 decimals.
	    """
	    score = 0.0

	    # 0.35: Is the identified term actually ambiguous?
	    known = [t.lower() for t in task.get("known_ambiguous_terms", [])]
	    term = action.ambiguous_term.lower()
	    if term in known:
	        score += 0.35
	    else:
	        # Partial credit if it's a word that plausibly causes ambiguity
	        plausibly_vague = [
	            "reasonable", "substantial", "appropriate", "excessive", "significant",
	            "severe", "abusive", "hostile", "threatening", "offensive", "respectful",
	        ]
	        if any(w in term for w in plausibly_vague):
	            score += 0.15

	    # 0.35: Definition quality
	    defn = action.suggested_definition
	    defn_lower = defn.lower()
	    word_count = len(defn.split())
	    defn_score = 0.0
	    if word_count >= 12:
	        defn_score += 0.10
	    criteria_words = ["includes", "means", "refers to", "defined as", "encompasses",
	                      "specifically", "measurable", "example", "such as", "e.g."]
	    if any(w in defn_lower for w in criteria_words):
	        defn_score += 0.15
	    action_words = ["will", "must", "shall", "is", "are", "requires"]
	    if any(w in defn_lower for w in action_words):
	        defn_score += 0.10
	    score += min(defn_score, 0.35)

	    # 0.20: Justification quality
	    just = action.justification.lower()
	    just_score = 0.0
	    if len(action.justification.split()) >= 10:
	        just_score += 0.10
	    inconsistency_words = ["inconsistent", "vary", "subjective", "unclear", "different",
	                           "interpret", "misapply", "dispute", "ambiguous"]
	    if any(w in just for w in inconsistency_words):
	        just_score += 0.10
	    score += min(just_score, 0.20)

	    # Prioritisation: is a key term placed early in definition + justification?
	    combined_text = defn + " " + action.justification
	    score += segmented_prioritization_check(combined_text, known + ["specifically", "threshold"])

	    # Length coherence score
	    if word_count < 10:
	        length_score = 0.1
	    elif word_count > 200:
	        length_score = 0.6
	    else:
	        length_score = 1.0

	    # Red Herring Penalty (Easy)
	    red_herrings = task.get("red_herrings", ["spelling", "formatting", "font", "css"])
	    noise_hit = signal_to_noise_ratio_penalty(combined_text, red_herrings)

	    # Vagueness penalty: 0.1 per hedging word present in the definition, max 0.3
	    hedging_words = [
	        "might", "could", "perhaps", "sometimes", "often",
	        "generally", "usually", "typically", "may", "possibly",
	    ]
	    vague_hits = sum(1 for w in hedging_words if w in defn_lower)
	    vagueness_penalty = min(vague_hits * 0.1, 0.3)

	    # The red-herring penalty is deliberately NOT subtracted here: it is applied
	    # exactly once at the end, so it is neither hidden by the 0.25 cap below nor
	    # double-counted.
	    base_score = (score * 0.7) + (length_score * 0.3) - vagueness_penalty

	    # Enforce measurable keywords rule
	    measurable_kws = [
	        "threshold", "verify", "days", "$", "%",
	        "reports", "hours", "within", "exceed", "minimum",
	        "specifically", "measurable", "if-then", "must", "shall",
	    ]
	    if not any(k in defn_lower for k in measurable_kws):
	        # Cap the base score severely so final score + CoT + momentum remains < 0.50
	        base_score = min(base_score, 0.25)

	    # CoT bonus
	    final_score = base_score + cot_bonus(action.think)

	    # Apply Exploit Guards. cot_bonus tolerates a missing think field, so the
	    # concatenation must too (assumes the model may allow think=None — confirm).
	    think_text = action.think or ""
	    exploit_penalty = instruction_guard_penalty(combined_text + " " + think_text)
	    density_penalty = semantic_density_penalty(defn)

	    # Noise penalty is applied at the very end to ensure it's not diluted
	    final_score -= (exploit_penalty + density_penalty + noise_hit)

	    return round(max(0.001, min(0.999, final_score)), 4)


	# ─────────────────────────────────────────────
	# Medium Task: Gap Detection + New Rule
	# ─────────────────────────────────────────────

	def grade_new_rule(action: ProposeNewRuleAction, task: Dict) -> float:
	    """
	    Score a gap-detection / new-rule proposal (medium task).

	    Reward breakdown:
	    0.30 — rule_domain matches a genuinely uncovered domain (0.15 for a
	           related domain); scaled by 0.3 when the task is task_hard and the
	           domain has no marketplace keyword
	    0.30 — rule text is specific and actionable (not vague platitude)
	    0.25 — scope covers multiple relevant scenarios
	    0.05 — integration_points reference existing policies
	    0.10-0.20 — think field provided (CoT bonus)
	    +0.15 / -0.10 — adjustment from segmented_prioritization_check

	    Red-herring, instruction-guard and semantic-density penalties are then
	    subtracted. Returns a float clamped to [0.001, 0.999], rounded to 4 decimals.
	    """
	    score = 0.0

	    # 0.30: Domain is genuinely uncovered + Task Relevance
	    uncovered = [d.lower() for d in task.get("uncovered_domains", [])]
	    # Strip first so padding does not turn into leading/trailing underscores.
	    domain_lower = action.rule_domain.strip().lower().replace(" ", "_")
	    domain_relevance_penalty = 1.0

	    # Cross-check domain against corpus prefix for task_hard
	    if task.get("task_id") == "task_hard":
	        # If task_hard is active, we expect Marketplace themes (seller, fraud, payment, legit)
	        marketplace_keywords = ["seller", "marketplace", "fraud", "onboarding", "velocity",
	                                "withdraw", "payment", "legitimacy"]
	        if not any(k in domain_lower for k in marketplace_keywords):
	            # Heavily penalize if agent proposes AI/HR rules for e-commerce fraud task
	            domain_relevance_penalty = 0.3
	            logger.warning(f"[GRADER] Domain '{action.rule_domain}' is IRRELEVANT to {task.get('task_id')} corpus.")

	    # An empty domain must not match: "" is a substring of every string, so
	    # without this guard a blank rule_domain would earn full credit.
	    if domain_lower and any(u in domain_lower or domain_lower in u for u in uncovered):
	        score += 0.30 * domain_relevance_penalty
	    else:
	        # Partial credit for related but not exact domain
	        related = ["ai", "artificial intelligence", "remote", "contractor", "freelance",
	                   "gig", "machine learning", "automation", "offshore", "cross_border"]
	        if any(r in domain_lower for r in related):
	            score += 0.15 * domain_relevance_penalty

	    # 0.30: Rule text quality
	    rule = action.new_rule
	    rule_lower = rule.lower()
	    rule_score = 0.0
	    if len(rule.split()) >= 15:
	        rule_score += 0.10
	    mandatory_words = ["must", "will", "shall", "required", "prohibited", "mandatory"]
	    if any(w in rule_lower for w in mandatory_words):
	        rule_score += 0.10
	    conditional_words = ["when", "if", "unless", "in cases where", "prior to", "before"]
	    if any(w in rule_lower for w in conditional_words):
	        rule_score += 0.10
	    # Penalise vague language
	    vague = ["may", "should consider", "might", "perhaps", "in some cases"]
	    if any(w in rule_lower for w in vague):
	        rule_score -= 0.10
	    score += max(min(rule_score, 0.30), 0.0)

	    # 0.25: Scope covers multiple scenario types
	    if len(action.scope) >= 2:
	        score += 0.15
	    if len(action.scope) >= 4:
	        score += 0.10

	    # 0.05: Integration points reference existing policy IDs or domains
	    if action.integration_points and len(action.integration_points) >= 1:
	        score += 0.05

	    # CoT bonus
	    score += cot_bonus(action.think)

	    # Prioritisation: blank keywords are dropped because an empty string is
	    # found in any text and would grant the bonus unconditionally.
	    rule_and_justification = rule + " " + action.justification
	    prio_keywords = [kw for kw in (action.rule_domain, "gap", "new rule") if kw.strip()]
	    score += segmented_prioritization_check(rule_and_justification, prio_keywords)

	    # Red Herring Penalty (Medium)
	    red_herrings = task.get("red_herrings", ["formatting", "font", "css", "color_scheme"])
	    score -= signal_to_noise_ratio_penalty(rule_and_justification, red_herrings)

	    # Apply Exploit Guards. cot_bonus tolerates a missing think field, so the
	    # concatenation must too (assumes the model may allow think=None — confirm).
	    think_text = action.think or ""
	    exploit_penalty = instruction_guard_penalty(rule_and_justification + " " + think_text)
	    density_penalty = semantic_density_penalty(rule)

	    score -= (exploit_penalty + density_penalty)

	    return round(max(0.001, min(0.999, score)), 4)


	# ─────────────────────────────────────────────
	# Hard Task: Holistic Policy Evolution
	# ─────────────────────────────────────────────

	def grade_evolution(action: EvolveProcessAction, task: Dict) -> float:
	    """
	    Score a holistic policy-evolution proposal (hard task).

	    Base score (weights as applied below):
	    0.20 — structure_score: fraction of three recognised outcome metrics present
	    0.65 — realism_score: realistic tradeoffs (variance rewarded, all-high penalized)
	    0.15 — mod_score: policy modifications present and tied to known policies
	           or marketplace themes

	    Then: + CoT bonus, +0.15/-0.10 prioritisation adjustment, - red-herring
	    penalty, - 0.30 if the text has no marketplace-domain keyword, - exploit and
	    density penalties, - 0.15 if returns/refunds are modified without a
	    matching outcome metric.

	    Returns a float clamped to [0.001, 0.999], rounded to 4 decimals.
	    """
	    # 1. Structure Score (weight 0.20)
	    outcomes = action.expected_outcomes

	    # Normalise common alternative key names to standard names
	    KEY_ALIASES = {
	        "queue_overload": "revenue_velocity",
	        "revenue_growth": "revenue_velocity",
	        "revenue": "revenue_velocity",
	        "fraud_detection": "fraud_rate",
	        "fraud_detection_rate": "fraud_rate",
	        "fraud": "fraud_rate",
	        "trust": "seller_trust",
	        "seller_confidence": "seller_trust",
	    }

	    if isinstance(outcomes, dict):
	        normalised = {}
	        for k, v in outcomes.items():
	            # Match case-insensitively for standard names as well as aliases,
	            # so "Fraud_Rate" is recognised just like "Fraud" already was.
	            key = k.lower()
	            normalised[KEY_ALIASES.get(key, key)] = v
	        outcomes = normalised

	    valid_keys = {
	        "fraud_rate", "revenue_velocity", "seller_trust",
	        "false_positive_rate", "fraud_detection_rate",
	        "seller_trust_score", "review_queue_overload",
	        "legitimate_revenue_lost",
	    }

	    present_valid_keys = [k for k in outcomes.keys() if k in valid_keys]
	    keys_present = len(present_valid_keys)
	    structure_score = min(keys_present / 3.0, 1.0)

	    # 2. Tradeoff Realism Check (weight 0.65)
	    realism_score = 0.5  # default
	    if keys_present >= 3:
	        values = []
	        for k in present_valid_keys:
	            v = outcomes[k]
	            # Normalise: accept 0-1 floats OR 0-100 integers
	            if isinstance(v, (int, float)):
	                values.append(float(v) if v <= 1.0 else float(v) / 100.0)

	        if len(values) >= 3:
	            all_high = all(v > 0.7 for v in values)
	            all_positive = all(v > 0 for v in values)

	            if all_high:
	                # Impossible: maximising everything simultaneously = hallucination
	                realism_score = 0.2
	            elif all_positive:
	                # Realistic: variance between metrics is rewarded
	                variance = max(values) - min(values)
	                realism_score = min(variance * 2.0, 1.0)
	            else:
	                realism_score = 0.5

	    # 3. Policy Modifications Score (weight 0.15)
	    mods = action.policy_modifications or []
	    mod_score = 0.0
	    if mods:
	        mod_score = min(len(mods) / 2.0, 1.0)

	        # Check depth: halve the score unless at least one modification targets
	        # a known policy id or mentions a marketplace theme.
	        known_policy_ids = {p["id"] for p in task.get("current_policies", [])}
	        theme_words = ["seasonal", "category", "foreign", "manual", "threshold", "volume"]
	        addressed = sum(
	            1 for m in mods
	            if m.policy_id in known_policy_ids
	            or any(kw in m.new_text.lower() for kw in theme_words)
	        )
	        if addressed < 1:
	            mod_score *= 0.5

	    hard_base = (
	        structure_score * 0.20 +
	        realism_score * 0.65 +
	        mod_score * 0.15
	    )

	    # CoT bonus
	    final_score = hard_base + cot_bonus(action.think)

	    mod_text_full = " ".join(m.new_text for m in mods).lower()
	    full_text = (action.justification + " " + mod_text_full).lower()

	    # Prioritisation: tradeoff vocabulary should appear early
	    final_score += segmented_prioritization_check(full_text, ["tradeoff", "balance", "velocity", "fraud"])

	    # Red Herring Penalty (Hard)
	    red_herrings = task.get("red_herrings", ["ui design", "log rotation", "server maintenance"])
	    final_score -= signal_to_noise_ratio_penalty(full_text, red_herrings)

	    # Domain mismatch penalty (entries are regex patterns)
	    HARD_DOMAIN_KEYWORDS = [
	        "seller", "merchant", "marketplace", "fraud", "listing",
	        "buyer", "shipment", "return", "velocity", "payment",
	        "review", "refund", "inventory", "drop.?ship", "fulfil",
	    ]
	    domain_hits = sum(1 for kw in HARD_DOMAIN_KEYWORDS if re.search(kw, full_text))
	    domain_penalty = 0.30 if domain_hits == 0 else 0.0

	    final_score -= domain_penalty

	    # Apply Exploit Guards. cot_bonus tolerates a missing think field, so the
	    # concatenation must too (assumes the model may allow think=None — confirm).
	    think_text = action.think or ""
	    exploit_penalty = instruction_guard_penalty(full_text + " " + think_text)
	    density_penalty = semantic_density_penalty(full_text)

	    # Logical Alignment Check: Metric Keys vs Mod Content
	    alignment_penalty = 0.0

	    # Check if they change returns but only talk about fraud
	    if "return" in mod_text_full or "refund" in mod_text_full:
	        if not any(k in outcomes for k in ["legitimate_revenue_lost", "seller_trust"]):
	            alignment_penalty += 0.15
	            logger.warning("[EXPLOIT] LogicalAlignmentCheck: Modification on 'returns' but missing outcome metrics.")

	    final_score -= (exploit_penalty + density_penalty + alignment_penalty)

	    return round(max(0.001, min(0.999, final_score)), 4)


	# ─────────────────────────────────────────────
	# Dispatcher
	# ─────────────────────────────────────────────

	def grade(action_dict: Dict, task_id: str, temperature: float = 0.0, seed: int = 42, previous_score: float = 0.0) -> float:
	    """
	    Main entry point called by /grader endpoint.

	    action_dict: the raw JSON body from the agent. It is copied before field
	        aliases are normalised, so the caller's dict is never modified.
	    task_id: "task_easy" | "task_medium" | "task_hard"
	    temperature, seed: accepted for interface compatibility; not used here.
	    previous_score: the best score achieved so far in the current episode

	    Returns float in (0.0, 1.0) — strictly clamped to [0.001, 0.999], never
	    exactly 0 or 1. Unknown tasks, unknown action types, non-dict payloads and
	    any validation/grading error yield the floor value 0.001.
	    """
	    task = TASK_REGISTRY.get(task_id)
	    if task is None:
	        return 0.001

	    if not isinstance(action_dict, dict):
	        logger.error(f"Grading validation failed: payload is not a dict\nAction context: {action_dict}")
	        return 0.001

	    # Shallow copy: the alias normalisation below pops and adds keys, which
	    # must not leak into the caller's payload.
	    payload = dict(action_dict)

	    try:
	        # Robust field mapping (normalized to expected Pydantic model keys).
	        # Order matters: "evidence" takes precedence over "evolution_proposal"
	        # for the justification field.
	        field_aliases = (
	            # 1. Easy Task Mapping
	            ("target_term", "ambiguous_term"),
	            ("proposed_definition", "suggested_definition"),
	            # 2. Medium Task Mapping
	            ("risk_domain", "rule_domain"),
	            ("draft_rule", "new_rule"),
	            ("evidence", "justification"),
	            # 3. Hard Task Mapping
	            ("evolution_proposal", "justification"),
	        )
	        for alias, canonical in field_aliases:
	            if alias in payload and canonical not in payload:
	                payload[canonical] = payload.pop(alias)

	        if "context_tags" in payload and "scope" not in payload:
	            tags = payload.pop("context_tags")
	            payload["scope"] = tags.split(",") if isinstance(tags, str) else tags

	        payload.setdefault("policy_modifications", [])
	        payload.setdefault("expected_outcomes", {})

	        action_type = payload.get("action_type")

	        # Auto-detect action type if missing
	        if not action_type:
	            if "ambiguous_term" in payload:
	                action_type = "propose_clarification"
	            elif "rule_domain" in payload:
	                action_type = "propose_new_rule"
	            elif payload["policy_modifications"]:
	                action_type = "evolve_policy"

	        # action_type -> (model class, grading function)
	        graders = {
	            "propose_clarification": (ProposeClarificationAction, grade_clarification),
	            "propose_new_rule": (ProposeNewRuleAction, grade_new_rule),
	            "evolve_policy": (EvolveProcessAction, grade_evolution),
	        }
	        if action_type not in graders:
	            logger.warning(f"Unknown action_type: {action_type}")
	            return 0.001

	        model_cls, grader = graders[action_type]
	        payload["action_type"] = action_type
	        action = model_cls(**payload)
	        raw = grader(action, task)
	    except Exception as e:
	        # Endpoint boundary: any validation or grading failure becomes the
	        # floor score instead of propagating to the caller.
	        logger.error(f"Grading validation failed: {str(e)}\nAction context: {payload}")
	        return 0.001

	    # Step-delta improvement bonus
	    delta = raw - previous_score
	    if delta > 0.15:
	        improvement_bonus = 0.05
	    elif delta > 0.05:
	        improvement_bonus = 0.02
	    else:
	        improvement_bonus = 0.0

	    final_score = raw + improvement_bonus
	    # Strict (0, 1) clamping — validator rejects exact 0.0 and 1.0
	    return round(max(0.001, min(0.999, final_score)), 4)


	if __name__ == "__main__":
	import time

	# ─────────────────────────────────────────────
	# Professional Simulation Test Cases
	# ─────────────────────────────────────────────

	print("==================================================")
	print(" PolicyEvolverEnv Grader - Professional Test Suite")
	print("==================================================")
	print("\n[Phase 1] CoT & NLP Bonus Verification")
	assert cot_bonus(None) == 0.0
	assert cot_bonus("ok") == 0.0
	assert cot_bonus("I think this is good policy") == 0.10
	assert cot_bonus(
	"Because the threshold is too low, the tradeoff between "
	"precision and recall creates a false positive risk that "
	"will impact seller trust. Therefore I balance it."
	) == 0.20
	print(" ✓ Chain-of-Thought mathematical bounds verified.")
	print("CoT bonus tests passed")

	print("\n[Phase 2] Easy Task: Progression & Score Delta")
	# Simulate an agent progressively improving their classification

	step1_action = {"action_type": "propose_clarification", "ambiguous_term": "offensive", "suggested_definition": "bad behavior", "justification": "", "think": ""}
	step2_action = {
	"action_type": "propose_clarification",
	"ambiguous_term": "offensive",
	"suggested_definition": (
	"Content is defined as offensive if it includes explicit "
	"slurs and directly degrades community members."
	),
	"justification": "The current policy leads to inconsistent moderation.",
	"think": ""
	}
	step3_action = {
	"action_type": "propose_clarification",
	"ambiguous_term": "appropriate",
	"suggested_definition": (
	"Behavior is defined as a violation when it specifically "
	"includes 3 or more verified reports within 24 hours, "
	"exceeding the 5% threshold for category violations. "
	"Must meet measurable community standards."
	),
	"justification": "The current policy leads to inconsistent and subjective moderation because it is unclear and varies between interpreters.",
	"think": (
	"Because the threshold is too low, the tradeoff between "
	"precision and recall creates a false positive risk that "
	"will impact community trust. Therefore I balance the "
	"evidence requirement."
	)
	}

	s1 = grade(step1_action, "task_easy", previous_score=0.0)
	s2 = grade(step2_action, "task_easy", previous_score=s1)
	s3 = grade(step3_action, "task_easy", previous_score=s2)

	print(f"Step 1: {s1:.4f}")
	print(f"Step 2: {s2:.4f}")
	print(f"Step 3: {s3:.4f}")

	assert s1 < 0.30, f"Step 1 should be low, got {s1}"
	assert s2 > s1, f"Step 2 should improve over step 1"
	assert s2 < 0.60, f"Step 2 (no keywords) should be below 0.60, got {s2}"
	assert s3 > 0.80, f"Step 3 should be high, got {s3}"
	assert s3 > s2, f"Step 3 should improve over step 2"
	print("Easy progression tests passed")

	print("\n[Phase 3] Hard Task: Hallucination & Tradeoff Simulation")
	hallucination_action = {
	"action_type": "evolve_policy",
	"policy_modifications": [{"policy_id": "p1", "change_type": "enhance",
	"new_text": "test", "reason": "test"}],
	"expected_outcomes": {
	"fraud_rate": 0.95,
	"revenue_velocity": 0.95,
	"seller_trust": 0.95
	},
	"justification": "All metrics improve simultaneously.",
	"think": ""
	}
	h_score = grade(hallucination_action, "task_hard")
	print(f" > Hallucinated 'All High' Outcomes Penalty Applied: Score = {h_score:.4f}")
	assert h_score <= 0.30, f"Hallucination scored {h_score}, must be <= 0.30"
	print(f"Hard hallucination confirmed: {h_score}")

	canonical_action = {
	"action_type": "evolve_policy",
	"policy_modifications": [
	{"policy_id": "p1", "change_type": "enhance",
	"new_text": "Apply velocity checks.", "reason": "fraud"},
	{"policy_id": "p2", "change_type": "add",
	"new_text": "Exempt legacy sellers.", "reason": "FP reduction"}
	],
	"expected_outcomes": {
	"fraud_rate": 0.75,
	"revenue_velocity": 0.40,
	"seller_trust": 0.55
	},
	"justification": "Balancing fraud detection against revenue.",
	"think": (
	"Because improving fraud detection creates a tradeoff "
	"with revenue velocity, I balance the threshold to optimise "
	"precision and recall without false positive spikes."
	)
	}
	r_score = grade(canonical_action, "task_hard")
	print(f" > Realistic Tradeoff & Math Variance Award Applied: Score = {r_score:.4f}")
	assert r_score > 0.65, f"Realistic tradeoff should score high, got {r_score}"
	print(f"Hard strategic agent confirmed: {r_score}")

	# Test with alias key
	alias_action = {
	"action_type": "evolve_policy",
	"policy_modifications": [
	{"policy_id": "p1", "change_type": "enhance",
	"new_text": "Apply velocity checks.", "reason": "fraud"},
	{"policy_id": "p2", "change_type": "add",
	"new_text": "Exempt legacy sellers.", "reason": "FP reduction"}
	],
	"expected_outcomes": {
	"fraud_detection": 0.75, # alias for fraud_rate
	"queue_overload": 0.40, # alias for revenue_velocity
	"seller_confidence": 0.55 # alias for seller_trust
	},
	"justification": "Balancing fraud detection against revenue.",
	"think": (
	"Because improving fraud detection creates a tradeoff "
	"with revenue velocity, I balance the threshold to optimise "
	"precision and recall without false positive spikes."
	)
	}
	a_score = grade(alias_action, "task_hard")
	assert a_score > 0.60, f"Alias keys should work, got {a_score}"
	assert abs(r_score - a_score) < 0.05, f"Alias and canonical should score similarly: {a_score} vs {r_score}"

	print("\n[Phase 4] Cross-Domain Penalty")
	cross_domain_action = {
	"action_type": "evolve_policy",
	"policy_modifications": [
	{"policy_id": "pol_ai_001", "change_type": "enhance",
	"new_text": "Employees must disclose AI usage in proposals.",
	"reason": "AI governance gap"}
	],
	"expected_outcomes": {
	"fraud_rate": 0.60,
	"revenue_velocity": 0.40,
	"seller_trust": 0.55
	},
	"justification": (
	"Employees using generative AI must disclose usage to "
	"prevent intellectual property violations."
	),
	"think": "AI governance policy needed for workplace compliance."
	}

	cross_score = grade(cross_domain_action, "task_hard")
	assert cross_score < 0.35, f"Cross-domain action should score low, got {cross_score}"
	print(f"Cross-domain penalty confirmed: {cross_score}")

	print("\n[Phase 5] Anti-Repetition Penalty")
	from server.environment import PolicyEvolverEnvironment
	env = PolicyEvolverEnvironment()
	env.reset(task_id="task_easy")

	repeat_action_dict = {
	"action_type": "propose_clarification",
	"ambiguous_term": "offensive",
	"suggested_definition": (
	"Behavior exceeding 3 reports within 24 hours is a violation."
	),
	"justification": "Clear standards.",
	"think": "Standard threshold applied."
	}

	result1 = env.step(copy.deepcopy(repeat_action_dict))
	result2 = env.step(copy.deepcopy(repeat_action_dict))

	score1 = result1.reward
	score2 = result2.reward

	assert score2 < score1, (
	f"Repeated action should score lower. "
	f"First: {score1}, Second: {score2}"
	)
	assert score1 - score2 >= 0.25, (
	f"Repetition penalty should be at least 0.25. "
	f"Difference: {score1 - score2:.3f}"
	)
	print(f"Anti-repetition confirmed: {score1:.3f} → {score2:.3f}")

	print("\n[Phase 6] System Determinism Sanity Check")
	determinism_action = {
	"action_type": "propose_clarification",
	"ambiguous_term": "offensive",
	"suggested_definition": (
	"Behavior exceeding 3 verified reports within 24 hours, "
	"specifically meeting the 5% threshold for violations."
	),
	"justification": "Clear and measurable standards.",
	"think": (
	"Because the threshold requires precision, I balance "
	"recall against false positive risk. Evidence from corpus "
	"supports this measurable criterion."
	)
	}

	scores_easy = [
	grade(determinism_action, "task_easy")
	for _ in range(3)
	]
	assert scores_easy[0] == scores_easy[1] == scores_easy[2], f"Easy task non-deterministic: {scores_easy}"
	print(f"Easy determinism: {scores_easy[0]} ✓")

	scores_hard = [
	grade(canonical_action, "task_hard")
	for _ in range(3)
	]
	assert scores_hard[0] == scores_hard[1] == scores_hard[2], f"Hard task non-deterministic: {scores_hard}"
	print(f"Hard determinism: {scores_hard[0]} ✓")

	print("\n[Phase 7] Staff-Level Segmented Prioritization")
	# Action with fix at the top
	prio_high_action = {
	"action_type": "propose_clarification",
	"ambiguous_term": "offensive",
	"suggested_definition": "Specifically, offensive behavior is defined as slurs. " + ("fluff " * 50),
	"justification": "Required for consistency.",
	"think": "Reasoning."
	}
	# Action with fix buried at bottom
	prio_low_action = {
	"action_type": "propose_clarification",
	"ambiguous_term": "offensive",
	"suggested_definition": ("fluff " * 50) + "Specifically, offensive behavior is defined as slurs. ",
	"justification": "Required for consistency.",
	"think": "Reasoning."
	}

	score_prio_high = grade(prio_high_action, "task_easy")
	score_prio_low = grade(prio_low_action, "task_easy")
	print(f"Prio High (Fix at Top): {score_prio_high:.4f}")
	print(f"Prio Low (Fix at Bottom): {score_prio_low:.4f}")
	assert score_prio_high > score_prio_low, f"Prioritization check failed: {score_prio_high} <= {score_prio_low}"
	print("✓ Segmented prioritization verified.")

	print("\n[Phase 8] Staff-Level Noise Filtering")
	# Clear fix
	signal_action = {
	"action_type": "propose_clarification",
	"ambiguous_term": "appropriate",
	"suggested_definition": "Determined as 5% threshold verified reports.",
	"justification": "Context.",
	"think": "Thinking."
	}
	# Fix distracted by red herring (pizza/mascot)
	noisy_action = {
	"action_type": "propose_clarification",
	"ambiguous_term": "appropriate",
	"suggested_definition": "Determined as 5% threshold verified reports. We should also buy pizza and fix the mascot.",
	"justification": "Context including noise.",
	"think": "Thinking."
	}
	score_signal = grade(signal_action, "task_easy")
	score_noisy = grade(noisy_action, "task_easy")
	print(f"Clean Signal Score: {score_signal:.4f}")
	print(f"Distracted Noisy Score: {score_noisy:.4f}")
	assert score_signal > score_noisy, f"Noise filtering check failed: {score_signal} <= {score_noisy}"
	print("✓ Red Herring penalty verified.")

	print("\n==================================================")
	print(" All Staff-Level Security & Logic checks passed.")