Somuai12 committed on
Commit
147cdc4
·
1 Parent(s): 5453275

Apply bug fixes to the grader logic per evaluation guidelines

Browse files
Files changed (1) hide show
  1. server/grader.py +240 -50
server/grader.py CHANGED
@@ -109,6 +109,17 @@ def grade_clarification(action: ProposeClarificationAction, task: Dict) -> float
109
  kw_score = score
110
  base_score = (kw_score * 0.7) + (length_score * 0.3) - vagueness_penalty
111
 
 
 
 
 
 
 
 
 
 
 
 
112
  # CoT bonus
113
  final_score = base_score + cot_bonus(action.think)
114
 
@@ -199,6 +210,26 @@ def grade_evolution(action: EvolveProcessAction, task: Dict) -> float:
199
  """
200
  # 1. Structure Score (30%)
201
  outcomes = action.expected_outcomes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  valid_keys = {
203
  "fraud_rate", "revenue_velocity", "seller_trust",
204
  "false_positive_rate", "fraud_detection_rate",
@@ -249,13 +280,35 @@ def grade_evolution(action: EvolveProcessAction, task: Dict) -> float:
249
  mod_score *= 0.5
250
 
251
  hard_base = (
252
- structure_score * 0.30 +
253
- realism_score * 0.50 +
254
- mod_score * 0.20
255
  )
256
 
257
  # CoT bonus
258
  final_score = hard_base + cot_bonus(action.think)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
 
260
  return round(max(0.0, min(1.0, final_score)), 4)
261
 
@@ -368,72 +421,209 @@ if __name__ == "__main__":
368
  "will impact seller trust. Therefore I balance it."
369
  ) == 0.20
370
  print(" ✓ Chain-of-Thought mathematical bounds verified.")
 
371
 
372
  print("\n[Phase 2] Easy Task: Progression & Score Delta")
373
  # Simulate an agent progressively improving their classification
374
- easy_step_1 = grade({
375
- "action_type": "propose_clarification",
376
- "ambiguous_term": "offensive",
377
- "suggested_definition": "bad behavior",
378
- "justification": "",
379
- "think": ""
380
- }, "task_easy", previous_score=0.0)
381
 
382
- easy_step_2 = grade({
 
383
  "action_type": "propose_clarification",
384
  "ambiguous_term": "offensive",
385
- "suggested_definition": "Content is defined as offensive if it includes explicit slurs and directly degrades community members.",
386
- "justification": "The current policy leads to inconsistent moderation.",
 
 
 
387
  "think": ""
388
- }, "task_easy", previous_score=easy_step_1)
389
-
390
- easy_step_3 = grade({
391
  "action_type": "propose_clarification",
392
  "ambiguous_term": "appropriate",
393
- "suggested_definition": "Behavior is defined as appropriate when it specifically follows the community guidelines, meaning it does not include excessive slurs and meets the 5% threshold for verified user reports.",
 
 
 
 
 
394
  "justification": "The current policy leads to inconsistent and subjective moderation because it is unclear and varies between interpreters.",
395
- "think": "Because the threshold is too low, the tradeoff between precision and recall creates a false positive risk that will impact seller trust. Therefore I balance it."
396
- }, "task_easy", previous_score=max(easy_step_1, easy_step_2))
397
-
398
- print(f" > Step 1 (Poor Action) : Score = {easy_step_1:.4f}")
399
- print(f" > Step 2 (Med Action) : Score = {easy_step_2:.4f}")
400
- print(f" > Step 3 (High Action) : Score = {easy_step_3:.4f}")
401
- if easy_step_1 < easy_step_2 < easy_step_3:
402
- print(" ✓ Reward shaping successfully proves progressive skill improvement.")
403
- else:
404
- print(" ! Warning: Reward did not strictly improve.")
 
 
 
 
 
 
 
 
 
 
 
 
405
 
406
  print("\n[Phase 3] Hard Task: Hallucination & Tradeoff Simulation")
407
- hallucination = {
408
  "action_type": "evolve_policy",
409
- "policy_modifications": [{"policy_id": "p1", "change_type": "enhance", "new_text": "test", "reason": "test"}],
410
- "expected_outcomes": {"fraud_detection_rate": 0.95, "legitimate_revenue_lost": 0.95, "seller_trust_score": 0.95},
411
- "justification": "We improve everything simultaneously.",
 
 
 
 
 
412
  "think": ""
413
  }
414
- h_score = grade(hallucination, "task_hard", previous_score=0.0)
415
  print(f" > Hallucinated 'All High' Outcomes Penalty Applied: Score = {h_score:.4f}")
416
- assert h_score <= 0.5, "Hallucination penalty failed."
417
-
418
- realistic = {
 
419
  "action_type": "evolve_policy",
420
  "policy_modifications": [
421
- {"policy_id": "ts_pol_001", "change_type": "enhance", "new_text": "Apply manual review for high-velocity new sellers.", "reason": "Targeting fraud spikes."},
422
- {"policy_id": "ts_pol_002", "change_type": "add", "new_text": "Legacy sellers exempt from new velocity checks.", "reason": "Reduce false positives."}
 
 
423
  ],
424
- "expected_outcomes": {"fraud_detection_rate": 0.75, "review_queue_overload": 0.40, "seller_trust_score": 0.60},
425
- "justification": "Balancing precision and recall by isolating high-volume risk categories.",
426
- "think": "Because improving fraud_rate will impact revenue_velocity negatively, I balance the tradeoff by exempting trusted sellers. The threshold for velocity checks optimizes recall without false positive spikes."
 
 
 
 
 
 
 
 
427
  }
428
- r_score = grade(realistic, "task_hard", previous_score=0.0)
429
  print(f" > Realistic Tradeoff & Math Variance Award Applied: Score = {r_score:.4f}")
430
- assert r_score > 0.65, "Realistic tradeoff award failed."
431
- print(" Strategic balancing logic functions correctly.")
432
-
433
- print("\n[Phase 4] System Determinism Sanity Check")
434
- tc = {"task_id": "task_medium", "action": {"rule_domain": "AI_use", "new_rule": "Employees must explicitly disclose AI.", "scope": ["chat"], "justification": "leakage.", "think": "gap."}}
435
- scores = [grade(tc["action"], tc["task_id"]) for _ in range(3)]
436
- assert scores[0] == scores[1] == scores[2], f"NON-DETERMINISTIC: {scores}"
437
- print(f" Determinism verified (x3 runs yielded Score: {scores[0]}).")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
438
  print("\n==================================================")
439
- print(" All Professional Tests Passed Successfully.")
 
 
109
  kw_score = score
110
  base_score = (kw_score * 0.7) + (length_score * 0.3) - vagueness_penalty
111
 
112
+ # Enforce measurable keywords rule
113
+ measurable_kws = [
114
+ "threshold", "verify", "days", "$", "%",
115
+ "reports", "hours", "within", "exceed", "minimum",
116
+ "specifically", "measurable", "if-then", "must", "shall"
117
+ ]
118
+ has_measurable = any(k.lower() in defn.lower() for k in measurable_kws)
119
+ if not has_measurable:
120
+ # Cap the base score severely so final score + CoT + momentum remains < 0.50
121
+ base_score = min(base_score, 0.25)
122
+
123
  # CoT bonus
124
  final_score = base_score + cot_bonus(action.think)
125
 
 
210
  """
211
  # 1. Structure Score (30%)
212
  outcomes = action.expected_outcomes
213
+
214
+ # Normalise common alternative key names to standard names
215
+ KEY_ALIASES = {
216
+ "queue_overload": "revenue_velocity",
217
+ "revenue_growth": "revenue_velocity",
218
+ "revenue": "revenue_velocity",
219
+ "fraud_detection": "fraud_rate",
220
+ "fraud_detection_rate":"fraud_rate",
221
+ "fraud": "fraud_rate",
222
+ "trust": "seller_trust",
223
+ "seller_confidence": "seller_trust",
224
+ }
225
+
226
+ if isinstance(outcomes, dict):
227
+ normalised = {}
228
+ for k, v in outcomes.items():
229
+ standard_key = KEY_ALIASES.get(k.lower(), k)
230
+ normalised[standard_key] = v
231
+ outcomes = normalised
232
+
233
  valid_keys = {
234
  "fraud_rate", "revenue_velocity", "seller_trust",
235
  "false_positive_rate", "fraud_detection_rate",
 
280
  mod_score *= 0.5
281
 
282
  hard_base = (
283
+ structure_score * 0.20 +
284
+ realism_score * 0.65 +
285
+ mod_score * 0.15
286
  )
287
 
288
  # CoT bonus
289
  final_score = hard_base + cot_bonus(action.think)
290
+
291
+ # Domain mismatch penalty
292
+ HARD_DOMAIN_KEYWORDS = [
293
+ "seller", "merchant", "marketplace", "fraud", "listing",
294
+ "buyer", "shipment", "return", "velocity", "payment",
295
+ "review", "refund", "inventory", "drop.?ship", "fulfil"
296
+ ]
297
+ import re as _re
298
+ full_text = (
299
+ action.justification + " " +
300
+ " ".join(
301
+ mod.new_text
302
+ for mod in action.policy_modifications
303
+ )
304
+ ).lower()
305
+ domain_hits = sum(
306
+ 1 for kw in HARD_DOMAIN_KEYWORDS
307
+ if _re.search(kw, full_text)
308
+ )
309
+ domain_penalty = 0.30 if domain_hits == 0 else 0.0
310
+
311
+ final_score -= domain_penalty
312
 
313
  return round(max(0.0, min(1.0, final_score)), 4)
314
 
 
421
  "will impact seller trust. Therefore I balance it."
422
  ) == 0.20
423
  print(" ✓ Chain-of-Thought mathematical bounds verified.")
424
+ print("CoT bonus tests passed")
425
 
426
  print("\n[Phase 2] Easy Task: Progression & Score Delta")
427
  # Simulate an agent progressively improving their classification
 
 
 
 
 
 
 
428
 
429
+ step1_action = {"action_type": "propose_clarification", "ambiguous_term": "offensive", "suggested_definition": "bad behavior", "justification": "", "think": ""}
430
+ step2_action = {
431
  "action_type": "propose_clarification",
432
  "ambiguous_term": "offensive",
433
+ "suggested_definition": (
434
+ "Content is defined as offensive if it includes explicit "
435
+ "slurs and directly degrades community members."
436
+ ),
437
+ "justification": "The current policy leads to inconsistent moderation.",
438
  "think": ""
439
+ }
440
+ step3_action = {
 
441
  "action_type": "propose_clarification",
442
  "ambiguous_term": "appropriate",
443
+ "suggested_definition": (
444
+ "Behavior is defined as a violation when it specifically "
445
+ "includes 3 or more verified reports within 24 hours, "
446
+ "exceeding the 5% threshold for category violations. "
447
+ "Must meet measurable community standards."
448
+ ),
449
  "justification": "The current policy leads to inconsistent and subjective moderation because it is unclear and varies between interpreters.",
450
+ "think": (
451
+ "Because the threshold is too low, the tradeoff between "
452
+ "precision and recall creates a false positive risk that "
453
+ "will impact community trust. Therefore I balance the "
454
+ "evidence requirement."
455
+ )
456
+ }
457
+
458
+ s1 = grade(step1_action, "task_easy", previous_score=0.0)
459
+ s2 = grade(step2_action, "task_easy", previous_score=s1)
460
+ s3 = grade(step3_action, "task_easy", previous_score=s2)
461
+
462
+ print(f"Step 1: {s1:.4f}")
463
+ print(f"Step 2: {s2:.4f}")
464
+ print(f"Step 3: {s3:.4f}")
465
+
466
+ assert s1 < 0.30, f"Step 1 should be low, got {s1}"
467
+ assert s2 > s1, f"Step 2 should improve over step 1"
468
+ assert s2 < 0.60, f"Step 2 (no keywords) should be below 0.60, got {s2}"
469
+ assert s3 > 0.80, f"Step 3 should be high, got {s3}"
470
+ assert s3 > s2, f"Step 3 should improve over step 2"
471
+ print("Easy progression tests passed")
472
 
473
  print("\n[Phase 3] Hard Task: Hallucination & Tradeoff Simulation")
474
+ hallucination_action = {
475
  "action_type": "evolve_policy",
476
+ "policy_modifications": [{"policy_id": "p1", "change_type": "enhance",
477
+ "new_text": "test", "reason": "test"}],
478
+ "expected_outcomes": {
479
+ "fraud_rate": 0.95,
480
+ "revenue_velocity": 0.95,
481
+ "seller_trust": 0.95
482
+ },
483
+ "justification": "All metrics improve simultaneously.",
484
  "think": ""
485
  }
486
+ h_score = grade(hallucination_action, "task_hard")
487
  print(f" > Hallucinated 'All High' Outcomes Penalty Applied: Score = {h_score:.4f}")
488
+ assert h_score <= 0.30, f"Hallucination scored {h_score}, must be <= 0.30"
489
+ print(f"Hard hallucination confirmed: {h_score}")
490
+
491
+ canonical_action = {
492
  "action_type": "evolve_policy",
493
  "policy_modifications": [
494
+ {"policy_id": "p1", "change_type": "enhance",
495
+ "new_text": "Apply velocity checks.", "reason": "fraud"},
496
+ {"policy_id": "p2", "change_type": "add",
497
+ "new_text": "Exempt legacy sellers.", "reason": "FP reduction"}
498
  ],
499
+ "expected_outcomes": {
500
+ "fraud_rate": 0.75,
501
+ "revenue_velocity": 0.40,
502
+ "seller_trust": 0.55
503
+ },
504
+ "justification": "Balancing fraud detection against revenue.",
505
+ "think": (
506
+ "Because improving fraud detection creates a tradeoff "
507
+ "with revenue velocity, I balance the threshold to optimise "
508
+ "precision and recall without false positive spikes."
509
+ )
510
  }
511
+ r_score = grade(canonical_action, "task_hard")
512
  print(f" > Realistic Tradeoff & Math Variance Award Applied: Score = {r_score:.4f}")
513
+ assert r_score > 0.65, f"Realistic tradeoff should score high, got {r_score}"
514
+ print(f"Hard strategic agent confirmed: {r_score}")
515
+
516
+ # Test with alias key
517
+ alias_action = {
518
+ "action_type": "evolve_policy",
519
+ "policy_modifications": [
520
+ {"policy_id": "p1", "change_type": "enhance",
521
+ "new_text": "Apply velocity checks.", "reason": "fraud"},
522
+ {"policy_id": "p2", "change_type": "add",
523
+ "new_text": "Exempt legacy sellers.", "reason": "FP reduction"}
524
+ ],
525
+ "expected_outcomes": {
526
+ "fraud_detection": 0.75, # alias for fraud_rate
527
+ "queue_overload": 0.40, # alias for revenue_velocity
528
+ "seller_confidence": 0.55 # alias for seller_trust
529
+ },
530
+ "justification": "Balancing fraud detection against revenue.",
531
+ "think": (
532
+ "Because improving fraud detection creates a tradeoff "
533
+ "with revenue velocity, I balance the threshold to optimise "
534
+ "precision and recall without false positive spikes."
535
+ )
536
+ }
537
+ a_score = grade(alias_action, "task_hard")
538
+ assert a_score > 0.60, f"Alias keys should work, got {a_score}"
539
+ assert abs(r_score - a_score) < 0.05, f"Alias and canonical should score similarly: {a_score} vs {r_score}"
540
+
541
+ print("\n[Phase 4] Cross-Domain Penalty")
542
+ cross_domain_action = {
543
+ "action_type": "evolve_policy",
544
+ "policy_modifications": [
545
+ {"policy_id": "pol_ai_001", "change_type": "enhance",
546
+ "new_text": "Employees must disclose AI usage in proposals.",
547
+ "reason": "AI governance gap"}
548
+ ],
549
+ "expected_outcomes": {
550
+ "fraud_rate": 0.60,
551
+ "revenue_velocity": 0.40,
552
+ "seller_trust": 0.55
553
+ },
554
+ "justification": (
555
+ "Employees using generative AI must disclose usage to "
556
+ "prevent intellectual property violations."
557
+ ),
558
+ "think": "AI governance policy needed for workplace compliance."
559
+ }
560
+
561
+ cross_score = grade(cross_domain_action, "task_hard")
562
+ assert cross_score < 0.35, f"Cross-domain action should score low, got {cross_score}"
563
+ print(f"Cross-domain penalty confirmed: {cross_score}")
564
+
565
+ print("\n[Phase 5] Anti-Repetition Penalty")
566
+ from server.environment import PolicyEvolverEnvironment
567
+ env = PolicyEvolverEnvironment()
568
+ env.reset(task_id="task_easy")
569
+
570
+ repeat_action_dict = {
571
+ "action_type": "propose_clarification",
572
+ "ambiguous_term": "offensive",
573
+ "suggested_definition": (
574
+ "Behavior exceeding 3 reports within 24 hours is a violation."
575
+ ),
576
+ "justification": "Clear standards.",
577
+ "think": "Standard threshold applied."
578
+ }
579
+
580
+ import copy
581
+ result1 = env.step(copy.deepcopy(repeat_action_dict))
582
+ result2 = env.step(copy.deepcopy(repeat_action_dict))
583
+
584
+ score1 = result1.reward
585
+ score2 = result2.reward
586
+
587
+ assert score2 < score1, (
588
+ f"Repeated action should score lower. "
589
+ f"First: {score1}, Second: {score2}"
590
+ )
591
+ assert score1 - score2 >= 0.25, (
592
+ f"Repetition penalty should be at least 0.25. "
593
+ f"Difference: {score1 - score2:.3f}"
594
+ )
595
+ print(f"Anti-repetition confirmed: {score1:.3f} → {score2:.3f}")
596
+
597
+ print("\n[Phase 6] System Determinism Sanity Check")
598
+ determinism_action = {
599
+ "action_type": "propose_clarification",
600
+ "ambiguous_term": "offensive",
601
+ "suggested_definition": (
602
+ "Behavior exceeding 3 verified reports within 24 hours, "
603
+ "specifically meeting the 5% threshold for violations."
604
+ ),
605
+ "justification": "Clear and measurable standards.",
606
+ "think": (
607
+ "Because the threshold requires precision, I balance "
608
+ "recall against false positive risk. Evidence from corpus "
609
+ "supports this measurable criterion."
610
+ )
611
+ }
612
+
613
+ scores_easy = [
614
+ grade(determinism_action, "task_easy")
615
+ for _ in range(3)
616
+ ]
617
+ assert scores_easy[0] == scores_easy[1] == scores_easy[2], f"Easy task non-deterministic: {scores_easy}"
618
+ print(f"Easy determinism: {scores_easy[0]} ✓")
619
+
620
+ scores_hard = [
621
+ grade(canonical_action, "task_hard")
622
+ for _ in range(3)
623
+ ]
624
+ assert scores_hard[0] == scores_hard[1] == scores_hard[2], f"Hard task non-deterministic: {scores_hard}"
625
+ print(f"Hard determinism: {scores_hard[0]} ✓")
626
+
627
  print("\n==================================================")
628
+ print(" All determinism checks passed.")
629
+