100XZX001 committed on
Commit
5087fec
·
verified ·
1 Parent(s): e9bbdc9

Create rubrics.py

Browse files
Files changed (1) hide show
  1. rubrics.py +148 -0
rubrics.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # rubrics.py – OpenEnv Rubrics for Code Review Environment
2
+ from openenv import Rubric
3
+
4
+ # --------------------------------------------------------------------------------
5
+ # 1. TOOL‑USAGE BONUS (encourages first‑time use of diagnostic tools)
6
+ # --------------------------------------------------------------------------------
7
class ToolUsageRubric(Rubric):
    """Reward the agent for using the diagnostic tools.

    The score for a step depends on ``info["action_type"]``:

    * ``"run_tests"`` / ``"run_linter"``: ``bonus`` when the matching
      ``env._tests_run`` / ``env._linter_run`` flag is still falsy, plus a
      flat 0.015 on every invocation.
    * ``"query_docs"``: half of ``bonus`` when ``env._docs_queried`` is
      falsy; no per-invocation reward.
    * ``"ask_question"``: 0.02 while ``env._step_count <= 3``.
    * any other (or missing) action type: 0.0.

    NOTE(review): the first-use bonus can only fire if the environment
    sets its ``_tests_run`` / ``_linter_run`` / ``_docs_queried`` flags
    *after* this rubric is evaluated for the step -- confirm against the
    environment's ``step`` implementation.

    Args:
        bonus: Reward granted the first time a diagnostic tool is used.
    """

    def __init__(self, bonus: float = 0.05):
        # Let the base class set up whatever state it maintains.
        super().__init__()
        self.bonus = bonus

    def __call__(self, env, action, obs, reward, done, info) -> float:
        """Return the tool-usage reward for the current step."""
        score = 0.0
        action_type = info.get("action_type", "")

        if action_type == "run_tests":
            if not env._tests_run:
                score += self.bonus
            # Small reward on every call so the tool is never ignored.
            score += 0.015
        elif action_type == "run_linter":
            if not env._linter_run:
                score += self.bonus
            score += 0.015
        elif action_type == "query_docs":
            if not env._docs_queried:
                score += self.bonus * 0.5
        elif action_type == "ask_question" and env._step_count <= 3:
            # Questions are only rewarded early in the episode.
            score += 0.02
        return score
33
+
34
+
35
+ # --------------------------------------------------------------------------------
36
+ # 2. DELTA‑BASED REWARDS (primary learning signal)
37
+ # --------------------------------------------------------------------------------
38
class TestDeltaRubric(Rubric):
    """Reward the change in the environment's test score.

    The reward is ``weight * (env._current_test_score -
    env._previous_test_score)``, so a drop in the score yields a negative
    reward. When the action type is ``"propose_fix"`` the weight is
    additionally scaled by 0.4.

    Args:
        weight: Multiplier applied to the test-score delta.
    """

    def __init__(self, weight: float = 0.3):
        # Let the base class set up whatever state it maintains.
        super().__init__()
        self.weight = weight

    def __call__(self, env, action, obs, reward, done, info) -> float:
        """Return the weighted test-score delta for the current step."""
        delta = env._current_test_score - env._previous_test_score
        effective = self.weight
        if info.get("action_type") == "propose_fix":
            # Damp the delta signal on the fix-proposal step.
            effective *= 0.4
        return effective * delta
51
+
52
+
53
class LintDeltaRubric(Rubric):
    """Reward the change in the environment's lint score.

    The reward is ``weight * 0.5 * (env._current_lint_score -
    env._previous_lint_score)``, so a drop in the score yields a negative
    reward. When the action type is ``"propose_fix"`` the effective weight
    is additionally scaled by 0.4.

    Args:
        weight: Base multiplier; it is halved before being applied to the
            lint-score delta.
    """

    def __init__(self, weight: float = 0.3):
        # Let the base class set up whatever state it maintains.
        super().__init__()
        self.weight = weight

    def __call__(self, env, action, obs, reward, done, info) -> float:
        """Return the weighted lint-score delta for the current step."""
        delta = env._current_lint_score - env._previous_lint_score
        # Lint improvements count half as much as the configured weight.
        effective = self.weight * 0.5
        if info.get("action_type") == "propose_fix":
            # Damp the delta signal on the fix-proposal step.
            effective *= 0.4
        return effective * delta
66
+
67
+
68
+ # --------------------------------------------------------------------------------
69
+ # 3. TERMINAL SUCCESS BONUS (propose_fix only)
70
+ # --------------------------------------------------------------------------------
71
class TerminalSuccessRubric(Rubric):
    """Graded bonus for a ``propose_fix`` action with a high test score.

    Only ``env._current_test_score`` is consulted:

    * above 0.95 -> 0.4
    * above 0.85 -> 0.2
    * otherwise, or for any other action type -> 0.0
    """

    def __call__(self, env, action, obs, reward, done, info):
        """Return the terminal bonus for the current step."""
        if info.get("action_type") == "propose_fix":
            test_score = env._current_test_score
            # Check the stricter threshold first so the tiers are exclusive.
            if test_score > 0.95:
                return 0.4
            if test_score > 0.85:
                return 0.2
        return 0.0
85
+
86
+
87
+ # --------------------------------------------------------------------------------
88
+ # 4. EXPLORATION & DIVERSITY (discourages repetition, encourages varied actions)
89
+ # --------------------------------------------------------------------------------
90
class ExplorationRubric(Rubric):
    """Encourage varied action sequences.

    Looks at the last three entries of ``env._action_history``:

    * all three identical -> ``penalty``
    * all three distinct -> ``bonus``
    * otherwise, or fewer than three entries recorded -> 0.0

    History entries are placed in a ``set``, so they must be hashable.

    Args:
        penalty: Value returned when the last three actions are the same
            (negative by default).
        bonus: Value returned when the last three actions all differ.
    """

    def __init__(self, penalty: float = -0.05, bonus: float = 0.021):
        # Let the base class set up whatever state it maintains.
        super().__init__()
        self.penalty = penalty
        self.bonus = bonus

    def __call__(self, env, action, obs, reward, done, info) -> float:
        """Return the diversity reward for the current step."""
        history = env._action_history
        if len(history) < 3:
            # Not enough history to judge repetition yet.
            return 0.0
        unique = len(set(history[-3:]))
        if unique == 1:
            return self.penalty
        if unique == 3:
            return self.bonus
        return 0.0
110
+
111
+
112
+ # --------------------------------------------------------------------------------
113
+ # 5. ANTI‑HACKING & CONSISTENCY (prevents reward without real work)
114
+ # --------------------------------------------------------------------------------
115
class AntiHackingRubric(Rubric):
    """Adjust the reward of a ``propose_fix`` action based on preparation.

    For ``propose_fix`` actions:

    * -0.25 if ``env._tests_run`` is falsy (no tests were run).
    * -0.1 if ``env._step_count < 2`` (fix proposed too early).
    * +0.02 if both ``env._tests_run`` and ``env._linter_run`` are truthy.

    Every other action type scores 0.0. According to the original author,
    further cross-signal penalties live in the environment itself because
    they modify the base reward instead of adding to it.
    """

    def __call__(self, env, action, obs, reward, done, info):
        """Return the anti-hacking adjustment for the current step."""
        if info.get("action_type") != "propose_fix":
            return 0.0

        tests_ran = bool(env._tests_run)
        # Start from the "never ran tests" penalty, if it applies.
        adjustment = 0.0 if tests_ran else 0.0 - 0.25
        if env._step_count < 2:
            adjustment -= 0.1
        # Tiny boost when the agent did the "right" preparation.
        if tests_ran and env._linter_run:
            adjustment += 0.02
        return adjustment
135
+
136
+
137
+ # --------------------------------------------------------------------------------
138
+ # 6. STEP PENALTY (time pressure)
139
+ # --------------------------------------------------------------------------------
140
class StepPenaltyRubric(Rubric):
    """Constant per-step penalty that encourages efficient resolution.

    Args:
        penalty: Value returned on every step (negative by default).
    """

    def __init__(self, penalty: float = -0.01):
        # Let the base class set up whatever state it maintains.
        super().__init__()
        self.penalty = penalty

    def __call__(self, env, action, obs, reward, done, info) -> float:
        """Return the configured penalty, regardless of the step's inputs."""
        return self.penalty