File size: 5,232 Bytes
94b1baf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# rubrics.py – Self-contained Rubrics (no external OpenEnv dependency)

class Rubric:
    """Base class for reward rubrics.

    A rubric is a callable that receives the step context and returns a
    numeric score contribution. This base implementation contributes nothing;
    subclasses override ``__call__`` to provide real scoring.
    """

    def __call__(self, env, action, obs, reward, done, info):
        """Return the neutral contribution ``0.0`` regardless of the inputs."""
        neutral_score = 0.0
        return neutral_score


# --------------------------------------------------------------------------------
# 1. TOOL‑USAGE BONUS
# --------------------------------------------------------------------------------
class ToolUsageRubric(Rubric):
    """Score contribution for tool-style actions, favouring first-time use.

    The action kind is read from ``info["action_type"]``:

    * ``"run_tests"`` / ``"run_linter"``: ``bonus`` on first use, plus a flat
      ``0.015`` on every call.
    * ``"query_docs"``: half of ``bonus`` on first use, ``+0.01`` when the
      step count is at most 4 and ``info["docs_query_len"]`` is at least 8,
      and ``-0.01`` when docs had already been queried before.
    * ``"question"``: ``0.02`` while the step count is at most 3.

    Any other action kind contributes ``0.0``.
    """

    def __init__(self, bonus: float = 0.05):
        self.bonus = bonus

    def __call__(self, env, action, obs, reward, done, info):
        """Return the tool-usage score for the current step."""
        kind = info.get("action_type", "")

        # Prefer the pre-action flags supplied in `info`; the env's own flags
        # are only a fallback because they may already reflect this step.
        tests_seen = info.get("prev_tests_run", env._tests_run)
        linter_seen = info.get("prev_linter_run", env._linter_run)
        docs_seen = info.get("prev_docs_queried", env._docs_queried)

        if kind == "run_tests" or kind == "run_linter":
            already_used = tests_seen if kind == "run_tests" else linter_seen
            total = 0.0
            if not already_used:
                total += self.bonus
            # Flat per-call reward on top of any first-use bonus.
            return total + 0.015

        if kind == "query_docs":
            total = 0.0
            if not docs_seen:
                total += self.bonus * 0.5
            # Extra nudge for an early query with non-trivial query text.
            is_early = env._step_count <= 4
            if is_early and info.get("docs_query_len", 0) >= 8:
                total += 0.01
            # Repeated docs calls are mildly discouraged.
            if docs_seen:
                total -= 0.01
            return total

        if kind == "question" and env._step_count <= 3:
            return 0.02

        return 0.0


# --------------------------------------------------------------------------------
# 2. DELTA‑BASED REWARDS
# --------------------------------------------------------------------------------
class TestDeltaRubric(Rubric):
    """Weighted reward for the change in the environment's test score.

    The contribution is ``weight * (current - previous)`` test score, with the
    weight scaled by ``0.4`` when ``info["action_type"]`` is ``"fix"``.
    """

    def __init__(self, weight: float = 0.3):
        self.weight = weight

    def __call__(self, env, action, obs, reward, done, info):
        """Return the weighted test-score delta for the current step."""
        improvement = env._current_test_score - env._previous_test_score
        is_fix = info.get("action_type") == "fix"
        # A "fix" action earns a reduced share of the delta.
        scale = self.weight * 0.4 if is_fix else self.weight
        return scale * improvement


class LintDeltaRubric(Rubric):
    """Weighted reward for the change in the environment's lint score.

    The configured ``weight`` is always halved, and additionally scaled by
    ``0.4`` when ``info["action_type"]`` is ``"fix"``; the result multiplies
    ``current - previous`` lint score.
    """

    def __init__(self, weight: float = 0.3):
        self.weight = weight

    def __call__(self, env, action, obs, reward, done, info):
        """Return the weighted lint-score delta for the current step."""
        improvement = env._current_lint_score - env._previous_lint_score
        halved = self.weight * 0.5
        is_fix = info.get("action_type") == "fix"
        # A "fix" action earns a reduced share of the delta.
        scale = halved * 0.4 if is_fix else halved
        return scale * improvement


# --------------------------------------------------------------------------------
# 3. TERMINAL SUCCESS BONUS
# --------------------------------------------------------------------------------
class TerminalSuccessRubric(Rubric):
    """Tiered bonus for a high test score on a ``"fix"`` action.

    Non-"fix" actions contribute ``0.0``. On a "fix", the bonus is ``0.4``
    when the current test score exceeds ``0.95``, ``0.2`` when it exceeds
    ``0.85``, and ``0.0`` otherwise.
    """

    def __call__(self, env, action, obs, reward, done, info):
        """Return the success bonus for the current step."""
        if info.get("action_type") != "fix":
            return 0.0

        test_score = env._current_test_score
        # Tiers are checked from highest to lowest; the first match wins.
        if test_score > 0.95:
            return 0.4
        if test_score > 0.85:
            return 0.2
        return 0.0


# --------------------------------------------------------------------------------
# 4. EXPLORATION & DIVERSITY
# --------------------------------------------------------------------------------
class ExplorationRubric(Rubric):
    """Reward varied recent actions and penalise repeating the same one.

    Only the last three entries of ``env._action_history`` are considered:
    three identical entries yield ``penalty``, three distinct entries yield
    ``bonus``, and anything else (including a history shorter than three)
    yields ``0.0``.
    """

    def __init__(self, penalty: float = -0.05, bonus: float = 0.021):
        self.penalty = penalty
        self.bonus = bonus

    def __call__(self, env, action, obs, reward, done, info):
        """Return the diversity score for the three most recent actions."""
        history = env._action_history
        if len(history) < 3:
            return 0.0

        distinct_count = len(set(history[-3:]))
        # 1 distinct -> pure repetition; 3 distinct -> full variety.
        outcome_by_count = {1: self.penalty, 3: self.bonus}
        return outcome_by_count.get(distinct_count, 0.0)


# --------------------------------------------------------------------------------
# 5. ANTI‑HACKING & CONSISTENCY
# --------------------------------------------------------------------------------
class AntiHackingRubric(Rubric):
    """Adjust the score of ``"fix"`` actions based on prior tool usage.

    Non-"fix" actions contribute ``0.0``. On a "fix":

    * ``-0.25`` if ``env._tests_run`` is falsy,
    * ``-0.1`` if ``env._step_count`` is below 2,
    * ``+0.02`` if both ``env._tests_run`` and ``env._linter_run`` are truthy.
    """

    def __call__(self, env, action, obs, reward, done, info):
        """Return the consistency adjustment for the current step."""
        if info.get("action_type") != "fix":
            return 0.0

        ran_tests = env._tests_run
        # Submitting a fix without having run tests is the largest penalty.
        adjustment = 0.0 if ran_tests else -0.25

        submitted_too_early = env._step_count < 2
        if submitted_too_early:
            adjustment -= 0.1

        # Small reward for having used both verification tools.
        if ran_tests and env._linter_run:
            adjustment += 0.02
        return adjustment


# --------------------------------------------------------------------------------
# 6. STEP PENALTY
# --------------------------------------------------------------------------------
class StepPenaltyRubric(Rubric):
    """Constant per-step contribution, independent of the step context."""

    def __init__(self, penalty: float = -0.01):
        self.penalty = penalty

    def __call__(self, env, action, obs, reward, done, info):
        """Return the configured ``penalty`` unchanged."""
        per_step = self.penalty
        return per_step