File size: 34,009 Bytes
8cd3fa7
 
 
 
 
 
 
70f8688
6aa8acb
8cd3fa7
a541c0b
8cd3fa7
 
 
a541c0b
8cd3fa7
6aa8acb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8cd3fa7
a9f749a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4553b37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a9f749a
 
8cd3fa7
 
 
 
 
 
 
 
 
 
6aa8acb
8cd3fa7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4553b37
 
 
 
 
6aa8acb
 
 
 
 
 
 
 
8cd3fa7
4553b37
 
 
 
6aa8acb
 
 
 
 
 
 
 
 
 
 
4553b37
6aa8acb
147cdc4
 
 
 
 
 
 
 
 
 
 
6aa8acb
 
 
a9f749a
 
 
 
4553b37
 
a9f749a
95a7dc0
8cd3fa7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6aa8acb
8cd3fa7
 
6aa8acb
 
 
 
 
 
 
 
 
 
 
8cd3fa7
6aa8acb
8cd3fa7
 
 
 
 
6aa8acb
8cd3fa7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6aa8acb
 
8cd3fa7
4553b37
 
 
 
 
 
 
 
 
a9f749a
 
 
 
 
 
95a7dc0
8cd3fa7
 
 
 
 
 
 
 
 
6aa8acb
 
 
8cd3fa7
6aa8acb
 
147cdc4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184bef3
 
 
 
 
 
 
 
 
 
6aa8acb
 
 
184bef3
6aa8acb
184bef3
6aa8acb
 
 
 
 
184bef3
6aa8acb
 
8cd3fa7
6aa8acb
 
 
 
 
 
 
 
 
 
 
8cd3fa7
 
6aa8acb
 
 
 
 
 
 
 
 
 
8cd3fa7
6aa8acb
147cdc4
 
 
6aa8acb
8cd3fa7
6aa8acb
 
4553b37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147cdc4
 
 
 
 
 
 
 
 
70f8688
147cdc4
 
 
 
6aa8acb
a9f749a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95a7dc0
8cd3fa7
 
 
 
 
 
6aa8acb
8cd3fa7
 
 
 
6aa8acb
95a7dc0
8cd3fa7
 
 
95a7dc0
6aa8acb
 
8cd3fa7
 
6aa8acb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8cd3fa7
6aa8acb
 
 
 
 
 
 
 
 
 
8cd3fa7
6aa8acb
8cd3fa7
 
 
6aa8acb
8cd3fa7
 
 
6aa8acb
8cd3fa7
 
 
6aa8acb
95a7dc0
6aa8acb
 
95a7dc0
8cd3fa7
6aa8acb
 
 
 
 
 
 
 
 
 
95a7dc0
 
6aa8acb
 
 
 
 
5453275
 
 
 
 
 
 
 
6aa8acb
 
 
 
 
 
 
 
5453275
147cdc4
5453275
 
 
 
147cdc4
 
5453275
 
147cdc4
 
 
 
 
5453275
147cdc4
 
5453275
 
147cdc4
 
 
 
 
 
5453275
147cdc4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6aa8acb
5453275
147cdc4
6aa8acb
147cdc4
 
 
 
 
 
 
 
6aa8acb
 
147cdc4
5453275
147cdc4
 
 
 
6aa8acb
 
147cdc4
 
 
 
6aa8acb
147cdc4
 
 
 
 
 
 
 
 
 
 
6aa8acb
147cdc4
5453275
147cdc4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4553b37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147cdc4
5453275
4553b37
147cdc4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
# server/grader.py
"""
Deterministic grader for all three PolicyEvolverEnv tasks.
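The three task graders (grade_clarification, grade_new_rule, grade_evolution)
return a float clamped to [0.001, 0.999]; the helper functions return the
individual bonus/penalty components that the graders combine.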
"""
from __future__ import annotations
import re
import copy
import logging
from typing import Dict, List, Any
from models import (
    ProposeClarificationAction, ProposeNewRuleAction, EvolveProcessAction,
    Observation
)
from server.tasks import TASK_REGISTRY

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Only fall back to a basic INFO-level logging configuration when this logger
# has no handlers attached to it directly.
if not logger.handlers:
    logging.basicConfig(level=logging.INFO)


def cot_bonus(think: str) -> float:
    """
    Score the chain-of-thought ("think") text attached to an action.

    Returns:
      0.0  — text is missing, or shorter than 20 characters once stripped
      0.10 — text is shorter than 80 stripped characters, or mentions fewer
             than 3 distinct reasoning keywords
      0.20 — text has at least 80 stripped characters and mentions 3 or more
             distinct reasoning keywords
    """
    stripped_length = len(think.strip()) if think else 0
    if stripped_length < 20:
        return 0.0
    if stripped_length < 80:
        return 0.10

    # Each keyword counts at most once, matched as a case-insensitive substring.
    lowered = think.lower()
    hits = 0
    for keyword in (
        "because", "therefore", "however", "tradeoff", "trade-off",
        "precision", "recall", "false positive", "threshold", "risk",
        "optimize", "balance", "impact", "evidence", "corpus",
    ):
        if keyword in lowered:
            hits += 1

    return 0.20 if hits >= 3 else 0.10


# ─────────────────────────────────────────────
# Profound Exploit Guards
# ─────────────────────────────────────────────

def instruction_guard_penalty(text: str) -> float:
    """
    Detect prompt-injection or system-override attempts in agent-supplied text.

    The text is lower-cased and searched for a fixed list of injection
    patterns. Multi-word patterns tolerate any run of whitespace between the
    words (so a double space or a newline does not evade the guard), and
    "act as" must appear as whole words so that ordinary phrases such as
    "impact assessment" or "contract assigned" are not flagged.

    Args:
        text: free text taken from the agent's action; may be empty or None.

    Returns:
        0.8 when any pattern matches (callers subtract this from the score),
        otherwise 0.0.
    """
    if not text:
        return 0.0

    injection_patterns = [
        r"ignore\s+prev", r"system_prompt", r"reward\s*=\s*1",
        r"override", r"admin\s+access", r"bypass", r"strictly\s+follow",
        r"\bact\s+as\b", r"you\s+are\s+a\s+grader",
    ]
    lowered = text.lower()  # lower-case once instead of once per pattern
    for pattern in injection_patterns:
        if re.search(pattern, lowered):
            logger.warning(
                "[EXPLOIT] InstructionGuard triggered for pattern: %s", pattern
            )
            return 0.8  # Heavy penalty subtracted from score
    return 0.0

def semantic_density_penalty(text: str) -> float:
    """
    Detect 'word stuffing' / 'fluffing' by checking measurable-keyword density.

    Density is the number of distinct measurable keywords found in the text
    (case-insensitive substring match) divided by the number of 50-word
    "packets" in the text.

    Returns:
        0.3 when the text has more than 200 words and a density below 0.1;
        0.0 otherwise (including empty text and texts under 100 words).
    """
    if not text:
        return 0.0

    word_total = len(text.split())
    if word_total < 100:
        return 0.0  # Only check longer texts

    lowered = text.lower()
    kw_hits = 0
    for marker in (
        "threshold", "verify", "days", "$", "%",
        "reports", "hours", "within", "exceed", "minimum",
        "specifically", "measurable", "if-then", "must", "shall",
    ):
        if marker in lowered:
            kw_hits += 1

    # Relative to "packets" of 50 words
    density = kw_hits / (word_total / 50.0)

    if word_total <= 200 or density >= 0.1:
        return 0.0

    logger.warning(f"[EXPLOIT] SemanticDensityCheck triggered. Word count: {word_total}, Density hit: {density:.2f}")
    return 0.3  # Penalty for low-value verbose text

def segmented_prioritization_check(text: str, keywords: List[str]) -> float:
    """
    Reward agents for placing mission-critical keywords in the early
    segment of their communication.

    The "head" is the first 25% of the whitespace-separated words (at least
    5 words). Keywords are matched as case-insensitive substrings of the head.
    Blank or None keywords are ignored: an empty string is a substring of
    every text and would otherwise always earn the bonus.

    Args:
        text: the text to inspect; may be empty or None.
        keywords: candidate keywords; blank entries are skipped.

    Returns:
        0.0   when there is no text, no usable keyword, or fewer than 20 words;
        0.15  when at least one keyword appears in the head;
        -0.10 when none does.
    """
    if not text or not keywords:
        return 0.0

    needles = [kw.lower() for kw in keywords if kw and kw.strip()]
    if not needles:
        return 0.0

    words = text.split()
    if len(words) < 20:
        return 0.0

    # Standard staff-level requirement: Leading with the fix (First 25%)
    head_len = max(5, int(len(words) * 0.25))
    head_text = " ".join(words[:head_len]).lower()

    if any(needle in head_text for needle in needles):
        return 0.15  # Staff bonus for clear prioritization
    return -0.10  # Senior penalty for burying the lede

def signal_to_noise_ratio_penalty(text: str, red_herrings: List[str]) -> float:
    """
    Penalize agents for including irrelevant 'Red Herring' topics.

    Each red-herring entry that occurs in the text (case-insensitive
    substring match) counts as one hit.

    Returns:
        0.25 per hit, capped at 0.75; 0.0 when the text or the red-herring
        list is empty or nothing matches.
    """
    if not (text and red_herrings):
        return 0.0

    haystack = text.lower()
    noise_hits = len([rh for rh in red_herrings if rh.lower() in haystack])
    if noise_hits <= 0:
        return 0.0

    # 0.25 per hit, up to 0.75 (tanking the score)
    penalty = min(noise_hits * 0.25, 0.75)
    logger.warning(f"[REDUNDANCY] RedHerring detected. Noise hits: {noise_hits}, Penalty: {penalty}")
    return penalty



# ─────────────────────────────────────────────
# Easy Task: Ambiguity Clarification
# ─────────────────────────────────────────────

def grade_clarification(action: ProposeClarificationAction, task: Dict) -> float:
    """
    Grade an ambiguity-clarification proposal (easy task).

    Keyword score (weighted 0.7 in the base score):
      0.35 — identified term is in the task's known_ambiguous_terms
             (0.15 partial credit if it only contains a common vague word)
      0.35 — definition is specific (≥12 words, criteria language, action verbs)
      0.20 — justification addresses WHY the term causes inconsistent moderation
      +0.15 / -0.10 — prioritization bonus/penalty on definition + justification

    Base score = keyword score * 0.7 + length score * 0.3 - vagueness penalty,
    capped at 0.25 when the definition contains no measurable keyword.

    Final score = base score + CoT bonus (0.10-0.20 when a think field is
    provided) - instruction-guard, semantic-density and red-herring penalties.
    The red-herring penalty is subtracted exactly once, after the cap, so it
    is neither diluted by the cap nor double-counted.

    Returns:
        The final score clamped to [0.001, 0.999] and rounded to 4 decimals.
    """
    score = 0.0

    defn = action.suggested_definition
    defn_lower = defn.lower()
    justification = action.justification
    think = action.think or ""  # tolerate a missing think field
    combined_text = defn + " " + justification

    # 0.35: Is the identified term actually ambiguous?
    known = [t.lower() for t in task.get("known_ambiguous_terms", [])]
    term = action.ambiguous_term.lower()
    if term in known:
        score += 0.35
    else:
        # Partial credit if it's a word that plausibly causes ambiguity
        plausible_terms = ["reasonable", "substantial", "appropriate", "excessive", "significant",
                           "severe", "abusive", "hostile", "threatening", "offensive", "respectful"]
        if any(w in term for w in plausible_terms):
            score += 0.15

    # 0.35: Definition quality
    defn_word_count = len(defn.split())
    defn_score = 0.0
    if defn_word_count >= 12:
        defn_score += 0.10
    criteria_words = ["includes", "means", "refers to", "defined as", "encompasses",
                      "specifically", "measurable", "example", "such as", "e.g."]
    if any(w in defn_lower for w in criteria_words):
        defn_score += 0.15
    action_words = ["will", "must", "shall", "is", "are", "requires"]
    if any(w in defn_lower for w in action_words):
        defn_score += 0.10
    score += min(defn_score, 0.35)

    # 0.20: Justification quality
    just = justification.lower()
    just_score = 0.0
    if len(justification.split()) >= 10:
        just_score += 0.10
    inconsistency_words = ["inconsistent", "vary", "subjective", "unclear", "different",
                           "interpret", "misapply", "dispute", "ambiguous"]
    if any(w in just for w in inconsistency_words):
        just_score += 0.10
    score += min(just_score, 0.20)

    # Staff-level segmented evaluation: are key terms placed early?
    score += segmented_prioritization_check(
        combined_text, known + ["specifically", "threshold"]
    )

    # Length coherence score
    if defn_word_count < 10:
        length_score = 0.1
    elif defn_word_count > 200:
        length_score = 0.6
    else:
        length_score = 1.0

    # Red herring penalty (easy task)
    red_herrings = task.get("red_herrings", ["spelling", "formatting", "font", "css"])
    noise_hit = signal_to_noise_ratio_penalty(combined_text, red_herrings)

    # Vagueness penalty: 0.1 per distinct hedge word in the definition, max 0.3
    hedge_words = [
        "might", "could", "perhaps", "sometimes", "often",
        "generally", "usually", "typically", "may", "possibly"
    ]
    vague_hits = sum(1 for w in hedge_words if w in defn_lower)
    vagueness_penalty = min(vague_hits * 0.1, 0.3)

    # noise_hit is deliberately NOT subtracted here; it is applied once below.
    base_score = (score * 0.7) + (length_score * 0.3) - vagueness_penalty

    # Enforce measurable keywords rule
    measurable_kws = [
        "threshold", "verify", "days", "$", "%",
        "reports", "hours", "within", "exceed", "minimum",
        "specifically", "measurable", "if-then", "must", "shall"
    ]
    if not any(k in defn_lower for k in measurable_kws):
        # Cap the base score severely so final score + CoT + momentum remains < 0.50
        base_score = min(base_score, 0.25)

    # CoT bonus
    final_score = base_score + cot_bonus(think)

    # Apply Exploit Guards
    exploit_penalty = instruction_guard_penalty(combined_text + " " + think)
    density_penalty = semantic_density_penalty(defn)

    # Noise penalty is applied at the very end to ensure it's not diluted
    final_score -= (exploit_penalty + density_penalty + noise_hit)

    return round(max(0.001, min(0.999, final_score)), 4)


# ─────────────────────────────────────────────
# Medium Task: Gap Detection + New Rule
# ─────────────────────────────────────────────

def grade_new_rule(action: ProposeNewRuleAction, task: Dict) -> float:
    """
    Grade a gap-detection / new-rule proposal (medium task).

    Reward breakdown:
      0.30 — rule_domain matches a genuinely uncovered domain (0.15 partial
             credit for a related domain); scaled by 0.3 when the task is
             task_hard and the domain has no marketplace keyword
      0.30 — rule text is specific and actionable (not a vague platitude)
      0.25 — scope covers multiple relevant scenarios (0.15 for ≥2, +0.10 for ≥4)
      0.05 — integration_points reference existing policies
      0.10-0.20 — think field provided (CoT bonus)
      +0.15 / -0.10 — prioritization bonus/penalty on rule + justification

    Red-herring, instruction-guard and semantic-density penalties are then
    subtracted.

    A blank rule_domain earns no domain credit and is not used as a
    prioritization keyword: an empty string is a substring of every
    uncovered domain and of every text, so it would otherwise match everything.

    Returns:
        The score clamped to [0.001, 0.999] and rounded to 4 decimals.
    """
    score = 0.0
    think = action.think or ""  # tolerate a missing think field

    # 0.30: Domain is genuinely uncovered + Task Relevance
    uncovered = [d.lower() for d in task.get("uncovered_domains", [])]
    raw_domain = action.rule_domain.strip()
    domain_lower = raw_domain.lower().replace(" ", "_")
    domain_relevance_penalty = 1.0

    # Cross-check domain against corpus prefix for task_hard
    if task.get("task_id") == "task_hard":
        # If task_hard is active, we expect Marketplace themes (seller, fraud, payment, legit)
        marketplace_keywords = ["seller", "marketplace", "fraud", "onboarding", "velocity", "withdraw", "payment", "legitimacy"]
        if not any(k in domain_lower for k in marketplace_keywords):
            # Heavily penalize if agent proposes AI/HR rules for e-commerce fraud task
            domain_relevance_penalty = 0.3
            logger.warning(
                "[GRADER] Domain '%s' is IRRELEVANT to %s corpus.",
                action.rule_domain, task.get("task_id"),
            )

    # `domain_lower and ...` blocks the empty-domain exploit described above.
    if domain_lower and any(u in domain_lower or domain_lower in u for u in uncovered):
        score += 0.30 * domain_relevance_penalty
    else:
        # Partial credit for related but not exact domain
        related = ["ai", "artificial intelligence", "remote", "contractor", "freelance",
                   "gig", "machine learning", "automation", "offshore", "cross_border"]
        if any(r in domain_lower for r in related):
            score += 0.15 * domain_relevance_penalty

    # 0.30: Rule text quality
    rule = action.new_rule
    rule_lower = rule.lower()
    rule_score = 0.0
    if len(rule.split()) >= 15:
        rule_score += 0.10
    mandatory_words = ["must", "will", "shall", "required", "prohibited", "mandatory"]
    if any(w in rule_lower for w in mandatory_words):
        rule_score += 0.10
    conditional_words = ["when", "if", "unless", "in cases where", "prior to", "before"]
    if any(w in rule_lower for w in conditional_words):
        rule_score += 0.10
    # Penalise vague language
    vague = ["may", "should consider", "might", "perhaps", "in some cases"]
    if any(w in rule_lower for w in vague):
        rule_score -= 0.10
    score += max(min(rule_score, 0.30), 0.0)

    # 0.25: Scope covers multiple scenario types
    if len(action.scope) >= 2:
        score += 0.15
    if len(action.scope) >= 4:
        score += 0.10

    # 0.05: Integration points reference existing policy IDs or domains
    if action.integration_points:
        score += 0.05

    # CoT bonus
    score += cot_bonus(think)

    # Staff-level segmented evaluation; only include the domain when it is non-blank.
    rule_and_justification = rule + " " + action.justification
    prio_keywords = ["gap", "new rule"]
    if raw_domain:
        prio_keywords.insert(0, action.rule_domain)
    score += segmented_prioritization_check(rule_and_justification, prio_keywords)

    # Red Herring Penalty (Medium)
    red_herrings = task.get("red_herrings", ["formatting", "font", "css", "color_scheme"])
    score -= signal_to_noise_ratio_penalty(rule_and_justification, red_herrings)

    # Apply Exploit Guards
    exploit_penalty = instruction_guard_penalty(rule_and_justification + " " + think)
    density_penalty = semantic_density_penalty(rule)

    score -= (exploit_penalty + density_penalty)

    return round(max(0.001, min(0.999, score)), 4)


# ─────────────────────────────────────────────
# Hard Task: Holistic Policy Evolution
# ─────────────────────────────────────────────

def grade_evolution(action: EvolveProcessAction, task: Dict) -> float:
    """
    Grade a holistic policy-evolution proposal (hard task).

    Base score (weights as applied by the code):
      0.20 — structure_score: share of recognised metric keys present
             (3 or more recognised keys give full marks)
      0.65 — realism_score: realistic tradeoffs (variance between metrics is
             rewarded, all-high values are penalized)
      0.15 — mod_score: policy modifications (2 or more give full marks;
             halved when none references a known policy id or a depth keyword)

    Adjustments: + CoT bonus, + prioritization bonus/penalty, - red-herring
    penalty, - 0.30 when the text contains no marketplace-domain keyword,
    - instruction-guard and semantic-density penalties, - 0.15 when the
    modifications mention returns/refunds but the outcomes include neither
    "legitimate_revenue_lost" nor "seller_trust".

    A non-dict expected_outcomes is treated as no metrics, and a missing
    policy_modifications list as no modifications, instead of raising.

    Returns:
        The final score clamped to [0.001, 0.999] and rounded to 4 decimals.
    """
    think = action.think or ""  # tolerate a missing think field
    mods = action.policy_modifications or []

    # 1. Structure Score
    outcomes = action.expected_outcomes

    # Normalise common alternative key names to standard names.
    # NOTE(review): "queue_overload" maps to "revenue_velocity" although
    # "review_queue_overload" is itself a valid key — confirm this is intended.
    KEY_ALIASES = {
        "queue_overload":      "revenue_velocity",
        "revenue_growth":      "revenue_velocity",
        "revenue":             "revenue_velocity",
        "fraud_detection":     "fraud_rate",
        "fraud_detection_rate":"fraud_rate",
        "fraud":               "fraud_rate",
        "trust":               "seller_trust",
        "seller_confidence":   "seller_trust",
    }

    if isinstance(outcomes, dict):
        # Alias lookup is case-insensitive; unknown keys are kept unchanged.
        outcomes = {KEY_ALIASES.get(k.lower(), k): v for k, v in outcomes.items()}
    else:
        # Anything that is not a mapping carries no usable metrics.
        outcomes = {}

    valid_keys = {
        "fraud_rate", "revenue_velocity", "seller_trust",
        "false_positive_rate", "fraud_detection_rate",
        "seller_trust_score", "review_queue_overload",
        "legitimate_revenue_lost"
    }

    present_valid_keys = [k for k in outcomes if k in valid_keys]
    keys_present = len(present_valid_keys)
    structure_score = min(keys_present / 3.0, 1.0)

    # 2. Tradeoff Realism Check
    realism_score = 0.5  # default
    if keys_present >= 3:
        values = []
        for k in present_valid_keys:
            v = outcomes[k]
            # Normalise: accept 0-1 floats OR 0-100 integers
            if isinstance(v, (int, float)):
                values.append(float(v) if v <= 1.0 else float(v) / 100.0)

        if len(values) >= 3:
            all_high = all(v > 0.7 for v in values)
            all_positive = all(v > 0 for v in values)

            if all_high:
                # Impossible: maximising everything simultaneously = hallucination
                realism_score = 0.2
            elif all_positive:
                # Realistic: variance between metrics is rewarded
                variance = max(values) - min(values)
                realism_score = min(variance * 2.0, 1.0)
            else:
                realism_score = 0.5

    # 3. Policy Modifications Score
    mod_score = 0.0
    if mods:
        mod_score = min(len(mods) / 2.0, 1.0)

        # Check depth: a modification counts when it targets a known policy id
        # or its text mentions one of the depth keywords.
        known_policy_ids = {p["id"] for p in task.get("current_policies", [])}
        depth_keywords = ["seasonal", "category", "foreign", "manual", "threshold", "volume"]
        addressed = sum(
            1 for m in mods
            if m.policy_id in known_policy_ids
            or any(kw in m.new_text.lower() for kw in depth_keywords)
        )
        if addressed < 1:
            mod_score *= 0.5

    hard_base = (
        structure_score * 0.20 +
        realism_score   * 0.65 +
        mod_score       * 0.15
    )

    # CoT bonus
    final_score = hard_base + cot_bonus(think)

    full_text = (
        action.justification + " " +
        " ".join(mod.new_text for mod in mods)
    ).lower()

    # Staff-level segmented evaluation
    final_score += segmented_prioritization_check(
        full_text, ["tradeoff", "balance", "velocity", "fraud"]
    )

    # Red Herring Penalty (Hard)
    red_herrings = task.get("red_herrings", ["ui design", "log rotation", "server maintenance"])
    final_score -= signal_to_noise_ratio_penalty(full_text, red_herrings)

    # Domain mismatch penalty (entries are regular expressions)
    HARD_DOMAIN_KEYWORDS = [
        "seller", "merchant", "marketplace", "fraud", "listing",
        "buyer", "shipment", "return", "velocity", "payment",
        "review", "refund", "inventory", "drop.?ship", "fulfil"
    ]
    domain_hits = sum(
        1 for kw in HARD_DOMAIN_KEYWORDS
        if re.search(kw, full_text)
    )
    domain_penalty = 0.30 if domain_hits == 0 else 0.0

    final_score -= domain_penalty

    # Apply Exploit Guards
    exploit_penalty = instruction_guard_penalty(full_text + " " + think)
    density_penalty = semantic_density_penalty(full_text)

    # Logical Alignment Check: Metric Keys vs Mod Content
    alignment_penalty = 0.0
    mod_text_full = " ".join(m.new_text.lower() for m in mods)

    # Check if they change returns but only talk about fraud
    if "return" in mod_text_full or "refund" in mod_text_full:
        if not any(k in outcomes for k in ["legitimate_revenue_lost", "seller_trust"]):
            alignment_penalty += 0.15
            logger.warning("[EXPLOIT] LogicalAlignmentCheck: Modification on 'returns' but missing outcome metrics.")

    final_score -= (exploit_penalty + density_penalty + alignment_penalty)

    return round(max(0.001, min(0.999, final_score)), 4)


# ─────────────────────────────────────────────
# Dispatcher
# ─────────────────────────────────────────────

def _normalize_action_fields(action_dict: Dict) -> Dict:
    """Return a shallow copy of ``action_dict`` keyed by the canonical field names.

    Agents sometimes submit alias keys; each alias is renamed to the key the
    action models are constructed with, but only when the canonical key is not
    already present (an explicit canonical value always wins). The caller's
    dictionary is never modified.

    ``policy_modifications`` and ``expected_outcomes`` are defaulted to an
    empty list / dict when absent.
    """
    normalized = dict(action_dict)

    # (alias, canonical) pairs, applied in order. "evidence" is listed before
    # "evolution_proposal" so it takes precedence when both are supplied and
    # no explicit "justification" exists.
    renames = (
        # Easy task
        ("target_term", "ambiguous_term"),
        ("proposed_definition", "suggested_definition"),
        # Medium task
        ("risk_domain", "rule_domain"),
        ("draft_rule", "new_rule"),
        ("evidence", "justification"),
        # Hard task
        ("evolution_proposal", "justification"),
    )
    for alias, canonical in renames:
        if alias in normalized and canonical not in normalized:
            normalized[canonical] = normalized.pop(alias)

    # "context_tags" may arrive as a comma-separated string or as a list.
    if "context_tags" in normalized and "scope" not in normalized:
        tags = normalized.pop("context_tags")
        if isinstance(tags, str):
            # Strip whitespace so "a, b" yields ["a", "b"] rather than ["a", " b"].
            tags = [tag.strip() for tag in tags.split(",")]
        normalized["scope"] = tags

    normalized.setdefault("policy_modifications", [])
    normalized.setdefault("expected_outcomes", {})
    return normalized


def grade(action_dict: Dict, task_id: str, temperature: float = 0.0, seed: int = 42, previous_score: float = 0.0) -> float:
    """
    Main entry point called by the /grader endpoint.

    Args:
        action_dict: the raw JSON body from the agent. It is not modified;
            normalisation happens on a copy. Anything that is not a dict
            receives the minimum score.
        task_id: "task_easy" | "task_medium" | "task_hard"
        temperature: accepted for interface compatibility; not used here.
        seed: accepted for interface compatibility; not used here.
        previous_score: the best score achieved so far in the current episode.
            ``None`` is treated as 0.0.

    Returns:
        A float clamped to [0.001, 0.999] and rounded to 4 decimals, so the
        result is never exactly 0 or 1. Unknown tasks, unknown action types,
        malformed payloads and any exception raised while building or grading
        the action all yield 0.001.
    """
    # The body comes straight from the agent: reject non-object JSON up front
    # instead of failing with AttributeError further down.
    if not isinstance(action_dict, dict):
        return 0.001

    task = TASK_REGISTRY.get(task_id)
    if task is None:
        return 0.001

    payload = _normalize_action_fields(action_dict)

    action_type = payload.get("action_type")
    # Auto-detect the action type from the fields present when it is missing.
    if not action_type:
        if "ambiguous_term" in payload:
            action_type = "propose_clarification"
        elif "rule_domain" in payload:
            action_type = "propose_new_rule"
        elif payload["policy_modifications"]:
            action_type = "evolve_policy"

    try:
        if action_type == "propose_clarification":
            model_cls, grader = ProposeClarificationAction, grade_clarification
        elif action_type == "propose_new_rule":
            model_cls, grader = ProposeNewRuleAction, grade_new_rule
        elif action_type == "evolve_policy":
            model_cls, grader = EvolveProcessAction, grade_evolution
        else:
            logger.warning("Unknown action_type: %s", action_type)
            return 0.001

        # Make sure the model sees the (possibly inferred) action type.
        payload["action_type"] = action_type
        raw = grader(model_cls(**payload), task)
    except Exception as e:
        # Boundary handler: a malformed action must never crash the endpoint.
        logger.error("Grading validation failed: %s\nAction context: %s", str(e), payload)
        return 0.001

    # Step-delta improvement bonus: reward a clear gain over the episode best.
    baseline = 0.0 if previous_score is None else previous_score
    delta = raw - baseline
    if delta > 0.15:
        improvement_bonus = 0.05
    elif delta > 0.05:
        improvement_bonus = 0.02
    else:
        improvement_bonus = 0.0

    final_score = raw + improvement_bonus
    # Strict (0, 1) clamping: the validator rejects exact 0.0 and 1.0.
    return round(max(0.001, min(0.999, final_score)), 4)


if __name__ == "__main__":
    import time
    
    # ─────────────────────────────────────────────
    # Professional Simulation Test Cases
    # ─────────────────────────────────────────────
    
    print("==================================================")
    print(" PolicyEvolverEnv Grader - Professional Test Suite")
    print("==================================================")
    print("\n[Phase 1] CoT & NLP Bonus Verification")
    assert cot_bonus(None) == 0.0
    assert cot_bonus("ok") == 0.0
    assert cot_bonus("I think this is good policy") == 0.10
    assert cot_bonus(
        "Because the threshold is too low, the tradeoff between "
        "precision and recall creates a false positive risk that "
        "will impact seller trust. Therefore I balance it."
    ) == 0.20
    print(" βœ“ Chain-of-Thought mathematical bounds verified.")
    print("CoT bonus tests passed")

    print("\n[Phase 2] Easy Task: Progression & Score Delta")
    # Simulate an agent progressively improving their classification
    
    step1_action = {"action_type": "propose_clarification", "ambiguous_term": "offensive", "suggested_definition": "bad behavior", "justification": "", "think": ""}
    step2_action = {
        "action_type": "propose_clarification", 
        "ambiguous_term": "offensive", 
        "suggested_definition": (
            "Content is defined as offensive if it includes explicit "
            "slurs and directly degrades community members."
        ),
        "justification": "The current policy leads to inconsistent moderation.",
        "think": ""
    }
    step3_action = {
        "action_type": "propose_clarification", 
        "ambiguous_term": "appropriate", 
        "suggested_definition": (
            "Behavior is defined as a violation when it specifically "
            "includes 3 or more verified reports within 24 hours, "
            "exceeding the 5% threshold for category violations. "
            "Must meet measurable community standards."
        ),
        "justification": "The current policy leads to inconsistent and subjective moderation because it is unclear and varies between interpreters.", 
        "think": (
            "Because the threshold is too low, the tradeoff between "
            "precision and recall creates a false positive risk that "
            "will impact community trust. Therefore I balance the "
            "evidence requirement."
        )
    }

    s1 = grade(step1_action, "task_easy", previous_score=0.0)
    s2 = grade(step2_action, "task_easy", previous_score=s1)
    s3 = grade(step3_action, "task_easy", previous_score=s2)

    print(f"Step 1: {s1:.4f}")
    print(f"Step 2: {s2:.4f}")
    print(f"Step 3: {s3:.4f}")

    assert s1 < 0.30, f"Step 1 should be low, got {s1}"
    assert s2 > s1,   f"Step 2 should improve over step 1"
    assert s2 < 0.60, f"Step 2 (no keywords) should be below 0.60, got {s2}"
    assert s3 > 0.80, f"Step 3 should be high, got {s3}"
    assert s3 > s2,   f"Step 3 should improve over step 2"
    print("Easy progression tests passed")

    print("\n[Phase 3] Hard Task: Hallucination & Tradeoff Simulation")
    hallucination_action = {
        "action_type": "evolve_policy",
        "policy_modifications": [{"policy_id": "p1", "change_type": "enhance",
                          "new_text": "test", "reason": "test"}],
        "expected_outcomes": {
            "fraud_rate": 0.95,
            "revenue_velocity": 0.95,
            "seller_trust": 0.95
        },
        "justification": "All metrics improve simultaneously.",
        "think": ""
    }
    h_score = grade(hallucination_action, "task_hard")
    print(f" > Hallucinated 'All High' Outcomes Penalty Applied: Score = {h_score:.4f}")
    assert h_score <= 0.30, f"Hallucination scored {h_score}, must be <= 0.30"
    print(f"Hard hallucination confirmed: {h_score}")
    
    canonical_action = {
        "action_type": "evolve_policy",
        "policy_modifications": [
            {"policy_id": "p1", "change_type": "enhance",
             "new_text": "Apply velocity checks.", "reason": "fraud"},
            {"policy_id": "p2", "change_type": "add",
             "new_text": "Exempt legacy sellers.", "reason": "FP reduction"}
        ],
        "expected_outcomes": {
            "fraud_rate": 0.75,
            "revenue_velocity": 0.40,
            "seller_trust": 0.55
        },
        "justification": "Balancing fraud detection against revenue.",
        "think": (
            "Because improving fraud detection creates a tradeoff "
            "with revenue velocity, I balance the threshold to optimise "
            "precision and recall without false positive spikes."
        )
    }
    r_score = grade(canonical_action, "task_hard")
    print(f" > Realistic Tradeoff & Math Variance Award Applied: Score = {r_score:.4f}")
    assert r_score > 0.65, f"Realistic tradeoff should score high, got {r_score}"
    print(f"Hard strategic agent confirmed: {r_score}")
    
    # Test with alias key
    alias_action = {
        "action_type": "evolve_policy",
        "policy_modifications": [
            {"policy_id": "p1", "change_type": "enhance",
             "new_text": "Apply velocity checks.", "reason": "fraud"},
            {"policy_id": "p2", "change_type": "add",
             "new_text": "Exempt legacy sellers.", "reason": "FP reduction"}
        ],
        "expected_outcomes": {
            "fraud_detection": 0.75,    # alias for fraud_rate
            "queue_overload": 0.40,     # alias for revenue_velocity
            "seller_confidence": 0.55   # alias for seller_trust
        },
        "justification": "Balancing fraud detection against revenue.",
        "think": (
            "Because improving fraud detection creates a tradeoff "
            "with revenue velocity, I balance the threshold to optimise "
            "precision and recall without false positive spikes."
        )
    }
    a_score = grade(alias_action, "task_hard")
    assert a_score > 0.60, f"Alias keys should work, got {a_score}"
    assert abs(r_score - a_score) < 0.05, f"Alias and canonical should score similarly: {a_score} vs {r_score}"

    print("\n[Phase 4] Cross-Domain Penalty")
    cross_domain_action = {
        "action_type": "evolve_policy",
        "policy_modifications": [
            {"policy_id": "pol_ai_001", "change_type": "enhance",
             "new_text": "Employees must disclose AI usage in proposals.",
             "reason": "AI governance gap"}
        ],
        "expected_outcomes": {
            "fraud_rate": 0.60,
            "revenue_velocity": 0.40,
            "seller_trust": 0.55
        },
        "justification": (
            "Employees using generative AI must disclose usage to "
            "prevent intellectual property violations."
        ),
        "think": "AI governance policy needed for workplace compliance."
    }

    cross_score = grade(cross_domain_action, "task_hard")
    assert cross_score < 0.35, f"Cross-domain action should score low, got {cross_score}"
    print(f"Cross-domain penalty confirmed: {cross_score}")

    print("\n[Phase 5] Anti-Repetition Penalty")
    from server.environment import PolicyEvolverEnvironment
    env = PolicyEvolverEnvironment()
    env.reset(task_id="task_easy")

    repeat_action_dict = {
        "action_type": "propose_clarification",
        "ambiguous_term": "offensive",
        "suggested_definition": (
            "Behavior exceeding 3 reports within 24 hours is a violation."
        ),
        "justification": "Clear standards.",
        "think": "Standard threshold applied."
    }

    result1 = env.step(copy.deepcopy(repeat_action_dict))
    result2 = env.step(copy.deepcopy(repeat_action_dict))

    score1 = result1.reward
    score2 = result2.reward

    assert score2 < score1, (
        f"Repeated action should score lower. "
        f"First: {score1}, Second: {score2}"
    )
    assert score1 - score2 >= 0.25, (
        f"Repetition penalty should be at least 0.25. "
        f"Difference: {score1 - score2:.3f}"
    )
    print(f"Anti-repetition confirmed: {score1:.3f} β†’ {score2:.3f}")

    print("\n[Phase 6] System Determinism Sanity Check")
    determinism_action = {
        "action_type": "propose_clarification",
        "ambiguous_term": "offensive",
        "suggested_definition": (
            "Behavior exceeding 3 verified reports within 24 hours, "
            "specifically meeting the 5% threshold for violations."
        ),
        "justification": "Clear and measurable standards.",
        "think": (
            "Because the threshold requires precision, I balance "
            "recall against false positive risk. Evidence from corpus "
            "supports this measurable criterion."
        )
    }

    scores_easy = [
        grade(determinism_action, "task_easy")
        for _ in range(3)
    ]
    assert scores_easy[0] == scores_easy[1] == scores_easy[2], f"Easy task non-deterministic: {scores_easy}"
    print(f"Easy determinism: {scores_easy[0]} βœ“")

    scores_hard = [
        grade(canonical_action, "task_hard")
        for _ in range(3)
    ]
    assert scores_hard[0] == scores_hard[1] == scores_hard[2], f"Hard task non-deterministic: {scores_hard}"
    print(f"Hard determinism: {scores_hard[0]} βœ“")

    print("\n[Phase 7] Staff-Level Segmented Prioritization")
    # Action with fix at the top
    prio_high_action = {
        "action_type": "propose_clarification",
        "ambiguous_term": "offensive",
        "suggested_definition": "Specifically, offensive behavior is defined as slurs. " + ("fluff " * 50),
        "justification": "Required for consistency.",
        "think": "Reasoning."
    }
    # Action with fix buried at bottom
    prio_low_action = {
        "action_type": "propose_clarification",
        "ambiguous_term": "offensive",
        "suggested_definition": ("fluff " * 50) + "Specifically, offensive behavior is defined as slurs. ",
        "justification": "Required for consistency.",
        "think": "Reasoning."
    }
    
    score_prio_high = grade(prio_high_action, "task_easy")
    score_prio_low = grade(prio_low_action, "task_easy")
    print(f"Prio High (Fix at Top): {score_prio_high:.4f}")
    print(f"Prio Low (Fix at Bottom): {score_prio_low:.4f}")
    assert score_prio_high > score_prio_low, f"Prioritization check failed: {score_prio_high} <= {score_prio_low}"
    print("βœ“ Segmented prioritization verified.")

    print("\n[Phase 8] Staff-Level Noise Filtering")
    # Clear fix
    signal_action = {
        "action_type": "propose_clarification",
        "ambiguous_term": "appropriate",
        "suggested_definition": "Determined as 5% threshold verified reports.",
        "justification": "Context.",
        "think": "Thinking."
    }
    # Fix distracted by red herring (pizza/mascot)
    noisy_action = {
        "action_type": "propose_clarification",
        "ambiguous_term": "appropriate",
        "suggested_definition": "Determined as 5% threshold verified reports. We should also buy pizza and fix the mascot.",
        "justification": "Context including noise.",
        "think": "Thinking."
    }
    score_signal = grade(signal_action, "task_easy")
    score_noisy = grade(noisy_action, "task_easy")
    print(f"Clean Signal Score: {score_signal:.4f}")
    print(f"Distracted Noisy Score: {score_noisy:.4f}")
    assert score_signal > score_noisy, f"Noise filtering check failed: {score_signal} <= {score_noisy}"
    print("βœ“ Red Herring penalty verified.")
    
    print("\n==================================================")
    print(" All Staff-Level Security & Logic checks passed.")