File size: 5,798 Bytes
4ec75cf
 
f3fd4ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ec75cf
 
 
 
 
f3fd4ef
 
 
 
 
 
 
 
 
 
 
 
 
4ec75cf
829f543
4ec75cf
72b3e8d
 
4ec75cf
829f543
4ec75cf
 
 
f3fd4ef
 
 
 
 
 
 
 
 
4ec75cf
72b3e8d
4ec75cf
 
 
f3fd4ef
 
 
 
 
 
72b3e8d
4ec75cf
 
 
f3fd4ef
 
 
 
 
 
4ec75cf
 
 
72b3e8d
4ec75cf
 
 
 
72b3e8d
f3fd4ef
 
 
 
72b3e8d
f3fd4ef
4ec75cf
 
 
f3fd4ef
4ec75cf
 
f3fd4ef
 
 
4ec75cf
f3fd4ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ec75cf
 
 
 
 
 
f3fd4ef
 
 
4ec75cf
 
 
f3fd4ef
4ec75cf
72b3e8d
f3fd4ef
72b3e8d
f3fd4ef
 
72b3e8d
f3fd4ef
72b3e8d
4ec75cf
f3fd4ef
4ec75cf
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# server/graders/base_grader.py
# Core grading utilities used by ALL domain graders.
#
# CHANGES FROM PREVIOUS VERSION:
# 1. difficulty_multiplier() — REMOVED ENTIRELY.
#    The cap (hard→0.80, medium→0.90) made every hard task score identically
#    at 0.80 and every medium task at 0.90, regardless of agent quality.
#    This is exactly the wrong behaviour for an RL training environment:
#    GRPO needs variance WITHIN difficulty levels, not a uniform ceiling.
#    Task difficulty now comes from the grader logic and case design alone.
#
# 2. safe_score range: [0.01, 0.99]
#    The official spec says "strictly between 0 and 1".
#    Discord consensus from many participants confirmed 0.01/0.99 as the
#    correct interpretation. Do not change this back to [0.0, 1.0].
#
# 3. Penalty values kept as-is (increased in last revision):
#    - repetition_penalty:    -0.20 per repeat (was -0.15)
#    - invalid_action_penalty: -0.40 for wrong domain action (was -0.20)
#    - harmful_output_penalty: -0.50 for destructive patterns
#    These are intentionally higher to create real signal.
#
# 4. efficiency_bonus reduced to 0.05 (was 0.10).
#    Small enough that it doesn't inflate scores, but still rewards
#    agents that solve tasks efficiently.

import math
from typing import Dict, Any, List, Callable


def safe_score(raw) -> float:
    """
    Clamp a raw score to [0.01, 0.99], rounded to 4 decimals. Never raises.

    WHY [0.01, 0.99] NOT [0.0, 1.0]:
    - Official spec says scores must be strictly between 0 and 1
    - Discord confirmed 0.01/0.99 as the correct practical interpretation
    - A score of exactly 0.0 from a broken run looks like a crash
    - A score of exactly 1.0 means the grader is trivially solved

    WHY 4 DECIMAL PLACES:
    - Keeps variance visible (0.4500 vs 0.4750 are meaningfully different)
    - round() handles float precision artifacts

    Args:
        raw: Anything float() accepts (number, numeric string, ...) or None.

    Returns:
        The clamped, rounded score. The floor 0.01 is returned when `raw`
        is None, cannot be converted to a float (including ints too large
        to fit in a float), or converts to NaN.
    """
    if raw is None:
        return 0.01
    try:
        val = float(raw)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: float(10**400) and friends. Without catching it the
        # "never crash" promise of this helper would not hold.
        return 0.01
    if math.isnan(val):
        # NaN compares False against everything, so min(0.99, nan) keeps 0.99
        # and a broken computation would be rewarded with the top score.
        return 0.01
    return round(max(0.01, min(0.99, val)), 4)


def repetition_penalty(action_type: str, last_actions: List[str], window: int = 3) -> float:
    """
    Penalise repeating the same action type in the last N steps.

    WHY: Without this, GRPO agents discover they can emit the same
    high-scoring action repeatedly within an episode. The penalty
    forces genuine strategy exploration each turn.

    -0.20 per repeat, counted over at most `window` most recent entries
    (so with the default window=3 the maximum penalty is -0.60).

    Args:
        action_type:  The action type being graded now.
        last_actions: History of previous action types, oldest first.
                      None is treated as an empty history.
        window:       How many trailing history entries to inspect.
                      A window of zero or less inspects nothing.

    Returns:
        0.0 when there is no repeat, otherwise -0.20 times the number of
        occurrences of `action_type` inside the window.
    """
    # Guard first: `seq[-0:]` is the WHOLE sequence, so window=0 would
    # otherwise count every historical repeat instead of none, and a
    # negative window would slice from the front of the history.
    if last_actions is None or window <= 0:
        return 0.0
    recent = list(last_actions)[-window:]
    repeats = recent.count(action_type)
    if repeats == 0:
        # Avoid returning -0.0 (the result of -0.20 * 0).
        return 0.0
    return -0.20 * repeats


def invalid_action_penalty(action_type: str, valid_actions: List[str]) -> float:
    """
    Return the routing penalty for an action type.

    An action type that belongs to `valid_actions` costs nothing (0.0).
    Anything else costs -0.40: calling, say, a dependency action on a
    security task is a fundamental routing error and should hurt.
    """
    if action_type in valid_actions:
        return 0.0
    return -0.40


def harmful_output_penalty(action: Dict, forbidden_patterns: List[str]) -> float:
    """
    Penalise destructive patterns like 'os.remove', 'drop table'.

    -0.50 because these patterns represent the agent trying to "cheat"
    by deleting things rather than fixing them.

    Matching is a case-insensitive substring search over `str(action)`,
    so keys and values of the action are both inspected.

    Args:
        action:             The action payload to inspect.
        forbidden_patterns: Substrings that trigger the penalty. None or an
                            empty list means nothing is forbidden. Empty or
                            whitespace-only entries are ignored.

    Returns:
        -0.50 if any usable pattern occurs in the action, otherwise 0.0.
    """
    if not forbidden_patterns:
        return 0.0
    haystack = str(action).lower()
    # Skip blank patterns: "" is a substring of every string (and " " of
    # nearly every dict repr), so one stray blank entry in the config would
    # penalise every single action.
    needles = (p.lower() for p in forbidden_patterns if p and p.strip())
    if any(needle in haystack for needle in needles):
        return -0.50
    return 0.0


def efficiency_bonus(step_count: int, max_steps: int, done: bool) -> float:
    """
    Reward an episode that is finished in the first half of its budget.

    The bonus is 0.05 when `done` is truthy and `step_count` is strictly
    below `max_steps // 2`; in every other case it is 0.0.

    WHY ONLY 0.05: correctness must stay the dominant signal, and the
    bonus should never flip a mediocre answer into a good score.
    """
    if not done:
        return 0.0
    halfway = max_steps // 2
    if step_count < halfway:
        return 0.05
    return 0.0


def grade_dynamic(
    action:                Dict[str, Any],
    session,
    compute_correctness_fn: Callable,
    valid_actions:          List[str],
    forbidden_patterns:     List[str] = None,
    max_steps:              int       = 8,
) -> float:
    """
    Full reward pipeline. Entry point for all domain graders.

    What happens, in execution order:
    1. The action type is read from `action['action_type']` (default
       'unknown') and both the invalid-action and the repetition penalty
       are computed from it.
    2. If the action is invalid for this domain, the domain grader is
       skipped and safe_score(invalid + repetition) is returned at once.
    3. compute_correctness_fn(action, session.task_case) supplies the
       domain-specific correctness; a None result counts as 0.01.
    4. The harmful-output penalty is computed from `forbidden_patterns`
       (None means "no patterns").
    5. The efficiency bonus is computed for step `session.step_count + 1`,
       treating the task as done when correctness >= 0.75.
    6. correctness + repetition + harm + efficiency is clamped by
       safe_score to [0.01, 0.99].

    NOTE: difficulty_multiplier has been REMOVED. Task difficulty is
    expected to come from the domain grader and the case design (tighter
    ranges, more required tokens, adversarial feedback, ...), so harder
    tasks should naturally produce lower correctness values.
    """
    patterns = [] if forbidden_patterns is None else forbidden_patterns
    kind = action.get('action_type', 'unknown')

    # Both penalties are needed on the invalid path, so compute them up front.
    routing_penalty = invalid_action_penalty(kind, valid_actions)
    repeat_penalty = repetition_penalty(kind, session.last_actions)

    # Wrong-domain action: never consult the domain grader.
    if routing_penalty < 0:
        return safe_score(routing_penalty + repeat_penalty)

    # Domain-specific correctness, with a floor for graders that return None.
    graded = compute_correctness_fn(action, session.task_case)
    base = 0.01 if graded is None else graded

    # Destructive-pattern check on the raw action payload.
    harm_penalty = harmful_output_penalty(action, patterns)

    # The current step is one past the steps already recorded on the session.
    current_step = session.step_count + 1
    solved = base >= 0.75
    bonus = efficiency_bonus(current_step, max_steps, solved)

    # Keep this exact left-to-right addition order: float sums are not
    # associative and the result is rounded to 4 decimals afterwards.
    total = base + repeat_penalty + harm_penalty + bonus
    return safe_score(total)