Apply bug fixes to Grader logic per evaluation guidelines
Browse files- server/grader.py +240 -50
server/grader.py
CHANGED
|
@@ -109,6 +109,17 @@ def grade_clarification(action: ProposeClarificationAction, task: Dict) -> float
|
|
| 109 |
kw_score = score
|
| 110 |
base_score = (kw_score * 0.7) + (length_score * 0.3) - vagueness_penalty
|
| 111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
# CoT bonus
|
| 113 |
final_score = base_score + cot_bonus(action.think)
|
| 114 |
|
|
@@ -199,6 +210,26 @@ def grade_evolution(action: EvolveProcessAction, task: Dict) -> float:
|
|
| 199 |
"""
|
| 200 |
# 1. Structure Score (30%)
|
| 201 |
outcomes = action.expected_outcomes
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
valid_keys = {
|
| 203 |
"fraud_rate", "revenue_velocity", "seller_trust",
|
| 204 |
"false_positive_rate", "fraud_detection_rate",
|
|
@@ -249,13 +280,35 @@ def grade_evolution(action: EvolveProcessAction, task: Dict) -> float:
|
|
| 249 |
mod_score *= 0.5
|
| 250 |
|
| 251 |
hard_base = (
|
| 252 |
-
structure_score * 0.
|
| 253 |
-
realism_score * 0.
|
| 254 |
-
mod_score * 0.
|
| 255 |
)
|
| 256 |
|
| 257 |
# CoT bonus
|
| 258 |
final_score = hard_base + cot_bonus(action.think)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
|
| 260 |
return round(max(0.0, min(1.0, final_score)), 4)
|
| 261 |
|
|
@@ -368,72 +421,209 @@ if __name__ == "__main__":
|
|
| 368 |
"will impact seller trust. Therefore I balance it."
|
| 369 |
) == 0.20
|
| 370 |
print(" ✓ Chain-of-Thought mathematical bounds verified.")
|
|
|
|
| 371 |
|
| 372 |
print("\n[Phase 2] Easy Task: Progression & Score Delta")
|
| 373 |
# Simulate an agent progressively improving their classification
|
| 374 |
-
easy_step_1 = grade({
|
| 375 |
-
"action_type": "propose_clarification",
|
| 376 |
-
"ambiguous_term": "offensive",
|
| 377 |
-
"suggested_definition": "bad behavior",
|
| 378 |
-
"justification": "",
|
| 379 |
-
"think": ""
|
| 380 |
-
}, "task_easy", previous_score=0.0)
|
| 381 |
|
| 382 |
-
|
|
|
|
| 383 |
"action_type": "propose_clarification",
|
| 384 |
"ambiguous_term": "offensive",
|
| 385 |
-
"suggested_definition":
|
| 386 |
-
|
|
|
|
|
|
|
|
|
|
| 387 |
"think": ""
|
| 388 |
-
}
|
| 389 |
-
|
| 390 |
-
easy_step_3 = grade({
|
| 391 |
"action_type": "propose_clarification",
|
| 392 |
"ambiguous_term": "appropriate",
|
| 393 |
-
"suggested_definition":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 394 |
"justification": "The current policy leads to inconsistent and subjective moderation because it is unclear and varies between interpreters.",
|
| 395 |
-
"think":
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
|
| 406 |
print("\n[Phase 3] Hard Task: Hallucination & Tradeoff Simulation")
|
| 407 |
-
|
| 408 |
"action_type": "evolve_policy",
|
| 409 |
-
"policy_modifications": [{"policy_id": "p1", "change_type": "enhance",
|
| 410 |
-
|
| 411 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
"think": ""
|
| 413 |
}
|
| 414 |
-
h_score = grade(
|
| 415 |
print(f" > Hallucinated 'All High' Outcomes Penalty Applied: Score = {h_score:.4f}")
|
| 416 |
-
assert h_score <= 0.
|
| 417 |
-
|
| 418 |
-
|
|
|
|
| 419 |
"action_type": "evolve_policy",
|
| 420 |
"policy_modifications": [
|
| 421 |
-
{"policy_id": "
|
| 422 |
-
|
|
|
|
|
|
|
| 423 |
],
|
| 424 |
-
"expected_outcomes": {
|
| 425 |
-
|
| 426 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
}
|
| 428 |
-
r_score = grade(
|
| 429 |
print(f" > Realistic Tradeoff & Math Variance Award Applied: Score = {r_score:.4f}")
|
| 430 |
-
assert r_score > 0.65, "Realistic tradeoff
|
| 431 |
-
print("
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
print("\n==================================================")
|
| 439 |
-
print(" All
|
|
|
|
|
|
| 109 |
kw_score = score
|
| 110 |
base_score = (kw_score * 0.7) + (length_score * 0.3) - vagueness_penalty
|
| 111 |
|
| 112 |
+
# Enforce measurable keywords rule
|
| 113 |
+
measurable_kws = [
|
| 114 |
+
"threshold", "verify", "days", "$", "%",
|
| 115 |
+
"reports", "hours", "within", "exceed", "minimum",
|
| 116 |
+
"specifically", "measurable", "if-then", "must", "shall"
|
| 117 |
+
]
|
| 118 |
+
has_measurable = any(k.lower() in defn.lower() for k in measurable_kws)
|
| 119 |
+
if not has_measurable:
|
| 120 |
+
# Cap the base score severely so final score + CoT + momentum remains < 0.50
|
| 121 |
+
base_score = min(base_score, 0.25)
|
| 122 |
+
|
| 123 |
# CoT bonus
|
| 124 |
final_score = base_score + cot_bonus(action.think)
|
| 125 |
|
|
|
|
| 210 |
"""
|
| 211 |
# 1. Structure Score (30%)
|
| 212 |
outcomes = action.expected_outcomes
|
| 213 |
+
|
| 214 |
+
# Normalise common alternative key names to standard names
|
| 215 |
+
KEY_ALIASES = {
|
| 216 |
+
"queue_overload": "revenue_velocity",
|
| 217 |
+
"revenue_growth": "revenue_velocity",
|
| 218 |
+
"revenue": "revenue_velocity",
|
| 219 |
+
"fraud_detection": "fraud_rate",
|
| 220 |
+
"fraud_detection_rate":"fraud_rate",
|
| 221 |
+
"fraud": "fraud_rate",
|
| 222 |
+
"trust": "seller_trust",
|
| 223 |
+
"seller_confidence": "seller_trust",
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
if isinstance(outcomes, dict):
|
| 227 |
+
normalised = {}
|
| 228 |
+
for k, v in outcomes.items():
|
| 229 |
+
standard_key = KEY_ALIASES.get(k.lower(), k)
|
| 230 |
+
normalised[standard_key] = v
|
| 231 |
+
outcomes = normalised
|
| 232 |
+
|
| 233 |
valid_keys = {
|
| 234 |
"fraud_rate", "revenue_velocity", "seller_trust",
|
| 235 |
"false_positive_rate", "fraud_detection_rate",
|
|
|
|
| 280 |
mod_score *= 0.5
|
| 281 |
|
| 282 |
hard_base = (
|
| 283 |
+
structure_score * 0.20 +
|
| 284 |
+
realism_score * 0.65 +
|
| 285 |
+
mod_score * 0.15
|
| 286 |
)
|
| 287 |
|
| 288 |
# CoT bonus
|
| 289 |
final_score = hard_base + cot_bonus(action.think)
|
| 290 |
+
|
| 291 |
+
# Domain mismatch penalty
|
| 292 |
+
HARD_DOMAIN_KEYWORDS = [
|
| 293 |
+
"seller", "merchant", "marketplace", "fraud", "listing",
|
| 294 |
+
"buyer", "shipment", "return", "velocity", "payment",
|
| 295 |
+
"review", "refund", "inventory", "drop.?ship", "fulfil"
|
| 296 |
+
]
|
| 297 |
+
import re as _re
|
| 298 |
+
full_text = (
|
| 299 |
+
action.justification + " " +
|
| 300 |
+
" ".join(
|
| 301 |
+
mod.new_text
|
| 302 |
+
for mod in action.policy_modifications
|
| 303 |
+
)
|
| 304 |
+
).lower()
|
| 305 |
+
domain_hits = sum(
|
| 306 |
+
1 for kw in HARD_DOMAIN_KEYWORDS
|
| 307 |
+
if _re.search(kw, full_text)
|
| 308 |
+
)
|
| 309 |
+
domain_penalty = 0.30 if domain_hits == 0 else 0.0
|
| 310 |
+
|
| 311 |
+
final_score -= domain_penalty
|
| 312 |
|
| 313 |
return round(max(0.0, min(1.0, final_score)), 4)
|
| 314 |
|
|
|
|
| 421 |
"will impact seller trust. Therefore I balance it."
|
| 422 |
) == 0.20
|
| 423 |
print(" ✓ Chain-of-Thought mathematical bounds verified.")
|
| 424 |
+
print("CoT bonus tests passed")
|
| 425 |
|
| 426 |
print("\n[Phase 2] Easy Task: Progression & Score Delta")
|
| 427 |
# Simulate an agent progressively improving their classification
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
|
| 429 |
+
step1_action = {"action_type": "propose_clarification", "ambiguous_term": "offensive", "suggested_definition": "bad behavior", "justification": "", "think": ""}
|
| 430 |
+
step2_action = {
|
| 431 |
"action_type": "propose_clarification",
|
| 432 |
"ambiguous_term": "offensive",
|
| 433 |
+
"suggested_definition": (
|
| 434 |
+
"Content is defined as offensive if it includes explicit "
|
| 435 |
+
"slurs and directly degrades community members."
|
| 436 |
+
),
|
| 437 |
+
"justification": "The current policy leads to inconsistent moderation.",
|
| 438 |
"think": ""
|
| 439 |
+
}
|
| 440 |
+
step3_action = {
|
|
|
|
| 441 |
"action_type": "propose_clarification",
|
| 442 |
"ambiguous_term": "appropriate",
|
| 443 |
+
"suggested_definition": (
|
| 444 |
+
"Behavior is defined as a violation when it specifically "
|
| 445 |
+
"includes 3 or more verified reports within 24 hours, "
|
| 446 |
+
"exceeding the 5% threshold for category violations. "
|
| 447 |
+
"Must meet measurable community standards."
|
| 448 |
+
),
|
| 449 |
"justification": "The current policy leads to inconsistent and subjective moderation because it is unclear and varies between interpreters.",
|
| 450 |
+
"think": (
|
| 451 |
+
"Because the threshold is too low, the tradeoff between "
|
| 452 |
+
"precision and recall creates a false positive risk that "
|
| 453 |
+
"will impact community trust. Therefore I balance the "
|
| 454 |
+
"evidence requirement."
|
| 455 |
+
)
|
| 456 |
+
}
|
| 457 |
+
|
| 458 |
+
s1 = grade(step1_action, "task_easy", previous_score=0.0)
|
| 459 |
+
s2 = grade(step2_action, "task_easy", previous_score=s1)
|
| 460 |
+
s3 = grade(step3_action, "task_easy", previous_score=s2)
|
| 461 |
+
|
| 462 |
+
print(f"Step 1: {s1:.4f}")
|
| 463 |
+
print(f"Step 2: {s2:.4f}")
|
| 464 |
+
print(f"Step 3: {s3:.4f}")
|
| 465 |
+
|
| 466 |
+
assert s1 < 0.30, f"Step 1 should be low, got {s1}"
|
| 467 |
+
assert s2 > s1, f"Step 2 should improve over step 1"
|
| 468 |
+
assert s2 < 0.60, f"Step 2 (no keywords) should be below 0.60, got {s2}"
|
| 469 |
+
assert s3 > 0.80, f"Step 3 should be high, got {s3}"
|
| 470 |
+
assert s3 > s2, f"Step 3 should improve over step 2"
|
| 471 |
+
print("Easy progression tests passed")
|
| 472 |
|
| 473 |
print("\n[Phase 3] Hard Task: Hallucination & Tradeoff Simulation")
|
| 474 |
+
hallucination_action = {
|
| 475 |
"action_type": "evolve_policy",
|
| 476 |
+
"policy_modifications": [{"policy_id": "p1", "change_type": "enhance",
|
| 477 |
+
"new_text": "test", "reason": "test"}],
|
| 478 |
+
"expected_outcomes": {
|
| 479 |
+
"fraud_rate": 0.95,
|
| 480 |
+
"revenue_velocity": 0.95,
|
| 481 |
+
"seller_trust": 0.95
|
| 482 |
+
},
|
| 483 |
+
"justification": "All metrics improve simultaneously.",
|
| 484 |
"think": ""
|
| 485 |
}
|
| 486 |
+
h_score = grade(hallucination_action, "task_hard")
|
| 487 |
print(f" > Hallucinated 'All High' Outcomes Penalty Applied: Score = {h_score:.4f}")
|
| 488 |
+
assert h_score <= 0.30, f"Hallucination scored {h_score}, must be <= 0.30"
|
| 489 |
+
print(f"Hard hallucination confirmed: {h_score}")
|
| 490 |
+
|
| 491 |
+
canonical_action = {
|
| 492 |
"action_type": "evolve_policy",
|
| 493 |
"policy_modifications": [
|
| 494 |
+
{"policy_id": "p1", "change_type": "enhance",
|
| 495 |
+
"new_text": "Apply velocity checks.", "reason": "fraud"},
|
| 496 |
+
{"policy_id": "p2", "change_type": "add",
|
| 497 |
+
"new_text": "Exempt legacy sellers.", "reason": "FP reduction"}
|
| 498 |
],
|
| 499 |
+
"expected_outcomes": {
|
| 500 |
+
"fraud_rate": 0.75,
|
| 501 |
+
"revenue_velocity": 0.40,
|
| 502 |
+
"seller_trust": 0.55
|
| 503 |
+
},
|
| 504 |
+
"justification": "Balancing fraud detection against revenue.",
|
| 505 |
+
"think": (
|
| 506 |
+
"Because improving fraud detection creates a tradeoff "
|
| 507 |
+
"with revenue velocity, I balance the threshold to optimise "
|
| 508 |
+
"precision and recall without false positive spikes."
|
| 509 |
+
)
|
| 510 |
}
|
| 511 |
+
r_score = grade(canonical_action, "task_hard")
|
| 512 |
print(f" > Realistic Tradeoff & Math Variance Award Applied: Score = {r_score:.4f}")
|
| 513 |
+
assert r_score > 0.65, f"Realistic tradeoff should score high, got {r_score}"
|
| 514 |
+
print(f"Hard strategic agent confirmed: {r_score}")
|
| 515 |
+
|
| 516 |
+
# Test with alias key
|
| 517 |
+
alias_action = {
|
| 518 |
+
"action_type": "evolve_policy",
|
| 519 |
+
"policy_modifications": [
|
| 520 |
+
{"policy_id": "p1", "change_type": "enhance",
|
| 521 |
+
"new_text": "Apply velocity checks.", "reason": "fraud"},
|
| 522 |
+
{"policy_id": "p2", "change_type": "add",
|
| 523 |
+
"new_text": "Exempt legacy sellers.", "reason": "FP reduction"}
|
| 524 |
+
],
|
| 525 |
+
"expected_outcomes": {
|
| 526 |
+
"fraud_detection": 0.75, # alias for fraud_rate
|
| 527 |
+
"queue_overload": 0.40, # alias for revenue_velocity
|
| 528 |
+
"seller_confidence": 0.55 # alias for seller_trust
|
| 529 |
+
},
|
| 530 |
+
"justification": "Balancing fraud detection against revenue.",
|
| 531 |
+
"think": (
|
| 532 |
+
"Because improving fraud detection creates a tradeoff "
|
| 533 |
+
"with revenue velocity, I balance the threshold to optimise "
|
| 534 |
+
"precision and recall without false positive spikes."
|
| 535 |
+
)
|
| 536 |
+
}
|
| 537 |
+
a_score = grade(alias_action, "task_hard")
|
| 538 |
+
assert a_score > 0.60, f"Alias keys should work, got {a_score}"
|
| 539 |
+
assert abs(r_score - a_score) < 0.05, f"Alias and canonical should score similarly: {a_score} vs {r_score}"
|
| 540 |
+
|
| 541 |
+
print("\n[Phase 4] Cross-Domain Penalty")
|
| 542 |
+
cross_domain_action = {
|
| 543 |
+
"action_type": "evolve_policy",
|
| 544 |
+
"policy_modifications": [
|
| 545 |
+
{"policy_id": "pol_ai_001", "change_type": "enhance",
|
| 546 |
+
"new_text": "Employees must disclose AI usage in proposals.",
|
| 547 |
+
"reason": "AI governance gap"}
|
| 548 |
+
],
|
| 549 |
+
"expected_outcomes": {
|
| 550 |
+
"fraud_rate": 0.60,
|
| 551 |
+
"revenue_velocity": 0.40,
|
| 552 |
+
"seller_trust": 0.55
|
| 553 |
+
},
|
| 554 |
+
"justification": (
|
| 555 |
+
"Employees using generative AI must disclose usage to "
|
| 556 |
+
"prevent intellectual property violations."
|
| 557 |
+
),
|
| 558 |
+
"think": "AI governance policy needed for workplace compliance."
|
| 559 |
+
}
|
| 560 |
+
|
| 561 |
+
cross_score = grade(cross_domain_action, "task_hard")
|
| 562 |
+
assert cross_score < 0.35, f"Cross-domain action should score low, got {cross_score}"
|
| 563 |
+
print(f"Cross-domain penalty confirmed: {cross_score}")
|
| 564 |
+
|
| 565 |
+
print("\n[Phase 5] Anti-Repetition Penalty")
|
| 566 |
+
from server.environment import PolicyEvolverEnvironment
|
| 567 |
+
env = PolicyEvolverEnvironment()
|
| 568 |
+
env.reset(task_id="task_easy")
|
| 569 |
+
|
| 570 |
+
repeat_action_dict = {
|
| 571 |
+
"action_type": "propose_clarification",
|
| 572 |
+
"ambiguous_term": "offensive",
|
| 573 |
+
"suggested_definition": (
|
| 574 |
+
"Behavior exceeding 3 reports within 24 hours is a violation."
|
| 575 |
+
),
|
| 576 |
+
"justification": "Clear standards.",
|
| 577 |
+
"think": "Standard threshold applied."
|
| 578 |
+
}
|
| 579 |
+
|
| 580 |
+
import copy
|
| 581 |
+
result1 = env.step(copy.deepcopy(repeat_action_dict))
|
| 582 |
+
result2 = env.step(copy.deepcopy(repeat_action_dict))
|
| 583 |
+
|
| 584 |
+
score1 = result1.reward
|
| 585 |
+
score2 = result2.reward
|
| 586 |
+
|
| 587 |
+
assert score2 < score1, (
|
| 588 |
+
f"Repeated action should score lower. "
|
| 589 |
+
f"First: {score1}, Second: {score2}"
|
| 590 |
+
)
|
| 591 |
+
assert score1 - score2 >= 0.25, (
|
| 592 |
+
f"Repetition penalty should be at least 0.25. "
|
| 593 |
+
f"Difference: {score1 - score2:.3f}"
|
| 594 |
+
)
|
| 595 |
+
print(f"Anti-repetition confirmed: {score1:.3f} → {score2:.3f}")
|
| 596 |
+
|
| 597 |
+
print("\n[Phase 6] System Determinism Sanity Check")
|
| 598 |
+
determinism_action = {
|
| 599 |
+
"action_type": "propose_clarification",
|
| 600 |
+
"ambiguous_term": "offensive",
|
| 601 |
+
"suggested_definition": (
|
| 602 |
+
"Behavior exceeding 3 verified reports within 24 hours, "
|
| 603 |
+
"specifically meeting the 5% threshold for violations."
|
| 604 |
+
),
|
| 605 |
+
"justification": "Clear and measurable standards.",
|
| 606 |
+
"think": (
|
| 607 |
+
"Because the threshold requires precision, I balance "
|
| 608 |
+
"recall against false positive risk. Evidence from corpus "
|
| 609 |
+
"supports this measurable criterion."
|
| 610 |
+
)
|
| 611 |
+
}
|
| 612 |
+
|
| 613 |
+
scores_easy = [
|
| 614 |
+
grade(determinism_action, "task_easy")
|
| 615 |
+
for _ in range(3)
|
| 616 |
+
]
|
| 617 |
+
assert scores_easy[0] == scores_easy[1] == scores_easy[2], f"Easy task non-deterministic: {scores_easy}"
|
| 618 |
+
print(f"Easy determinism: {scores_easy[0]} ✓")
|
| 619 |
+
|
| 620 |
+
scores_hard = [
|
| 621 |
+
grade(canonical_action, "task_hard")
|
| 622 |
+
for _ in range(3)
|
| 623 |
+
]
|
| 624 |
+
assert scores_hard[0] == scores_hard[1] == scores_hard[2], f"Hard task non-deterministic: {scores_hard}"
|
| 625 |
+
print(f"Hard determinism: {scores_hard[0]} ✓")
|
| 626 |
+
|
| 627 |
print("\n==================================================")
|
| 628 |
+
print(" All determinism checks passed.")
|
| 629 |
+
|