100XZX001 committed on
Commit
0903f4c
·
verified ·
1 Parent(s): cad4869

Update rubrics.py

Browse files
Files changed (1) hide show
  1. rubrics.py +13 -38
rubrics.py CHANGED
@@ -1,14 +1,15 @@
1
- # rubrics.py – OpenEnv Rubrics for Code Review Environment
2
- from openenv.core import Rubric
 
 
 
 
 
3
 
4
  # --------------------------------------------------------------------------------
5
- # 1. TOOL‑USAGE BONUS (encourages first‑time use of diagnostic tools)
6
  # --------------------------------------------------------------------------------
7
  class ToolUsageRubric(Rubric):
8
- """
9
- Small fixed reward the first time each of the major diagnostic tools is used.
10
- Also gives a tiny reward for every invocation to prevent the agent from ignoring them.
11
- """
12
  def __init__(self, bonus: float = 0.05):
13
  self.bonus = bonus
14
 
@@ -33,12 +34,9 @@ class ToolUsageRubric(Rubric):
33
 
34
 
35
  # --------------------------------------------------------------------------------
36
- # 2. DELTA‑BASED REWARDS (primary learning signal)
37
  # --------------------------------------------------------------------------------
38
  class TestDeltaRubric(Rubric):
39
- """
40
- Rewards improvement in the pass ratio of the test suite.
41
- """
42
  def __init__(self, weight: float = 0.3):
43
  self.weight = weight
44
 
@@ -51,9 +49,6 @@ class TestDeltaRubric(Rubric):
51
 
52
 
53
  class LintDeltaRubric(Rubric):
54
- """
55
- Rewards improvement in lint score (normalised 0‑1).
56
- """
57
  def __init__(self, weight: float = 0.3):
58
  self.weight = weight
59
 
@@ -66,13 +61,9 @@ class LintDeltaRubric(Rubric):
66
 
67
 
68
  # --------------------------------------------------------------------------------
69
- # 3. TERMINAL SUCCESS BONUS (propose_fix only)
70
  # --------------------------------------------------------------------------------
71
  class TerminalSuccessRubric(Rubric):
72
- """
73
- Bonus awarded when a proposed fix achieves high test and lint scores.
74
- Graded: >0.85 → 0.2, >0.95 → 0.4.
75
- """
76
  def __call__(self, env, action, obs, reward, done, info):
77
  if info.get("action_type") != "propose_fix":
78
  return 0.0
@@ -85,14 +76,9 @@ class TerminalSuccessRubric(Rubric):
85
 
86
 
87
  # --------------------------------------------------------------------------------
88
- # 4. EXPLORATION & DIVERSITY (discourages repetition, encourages varied actions)
89
  # --------------------------------------------------------------------------------
90
  class ExplorationRubric(Rubric):
91
- """
92
- Encourages diverse action sequences.
93
- - Penalty if last 3 actions are all the same.
94
- - Bonus if they are all different.
95
- """
96
  def __init__(self, penalty: float = -0.05, bonus: float = 0.021):
97
  self.penalty = penalty
98
  self.bonus = bonus
@@ -110,16 +96,9 @@ class ExplorationRubric(Rubric):
110
 
111
 
112
  # --------------------------------------------------------------------------------
113
- # 5. ANTI‑HACKING & CONSISTENCY (prevents reward without real work)
114
  # --------------------------------------------------------------------------------
115
  class AntiHackingRubric(Rubric):
116
- """
117
- Penalises suspicious behaviour:
118
- - proposing a fix without ever running tests.
119
- - proposing a fix too early (step < 2).
120
- Additional cross‑signal penalties are applied in the environment (not as a rubric)
121
- because they require modifying the base reward, not adding to it.
122
- """
123
  def __call__(self, env, action, obs, reward, done, info):
124
  if info.get("action_type") != "propose_fix":
125
  return 0.0
@@ -128,19 +107,15 @@ class AntiHackingRubric(Rubric):
128
  score -= 0.25
129
  if env._step_count < 2:
130
  score -= 0.1
131
- # tiny boost if the agent did the “right” preparation
132
  if env._tests_run and env._linter_run:
133
  score += 0.02
134
  return score
135
 
136
 
137
  # --------------------------------------------------------------------------------
138
- # 6. STEP PENALTY (time pressure)
139
  # --------------------------------------------------------------------------------
140
  class StepPenaltyRubric(Rubric):
141
- """
142
- Simple per‑step penalty to encourage efficient resolution.
143
- """
144
  def __init__(self, penalty: float = -0.01):
145
  self.penalty = penalty
146
 
 
1
+ # rubrics.py – Self-contained Rubrics (no external OpenEnv dependency)
2
+
3
+ class Rubric:
4
+ """Minimal Rubric base – compatible with OpenEnv but self‑contained."""
5
+ def __call__(self, env, action, obs, reward, done, info):
6
+ return 0.0
7
+
8
 
9
  # --------------------------------------------------------------------------------
10
+ # 1. TOOL‑USAGE BONUS
11
  # --------------------------------------------------------------------------------
12
  class ToolUsageRubric(Rubric):
 
 
 
 
13
  def __init__(self, bonus: float = 0.05):
14
  self.bonus = bonus
15
 
 
34
 
35
 
36
  # --------------------------------------------------------------------------------
37
+ # 2. DELTA‑BASED REWARDS
38
  # --------------------------------------------------------------------------------
39
  class TestDeltaRubric(Rubric):
 
 
 
40
  def __init__(self, weight: float = 0.3):
41
  self.weight = weight
42
 
 
49
 
50
 
51
  class LintDeltaRubric(Rubric):
 
 
 
52
  def __init__(self, weight: float = 0.3):
53
  self.weight = weight
54
 
 
61
 
62
 
63
  # --------------------------------------------------------------------------------
64
+ # 3. TERMINAL SUCCESS BONUS
65
  # --------------------------------------------------------------------------------
66
  class TerminalSuccessRubric(Rubric):
 
 
 
 
67
  def __call__(self, env, action, obs, reward, done, info):
68
  if info.get("action_type") != "propose_fix":
69
  return 0.0
 
76
 
77
 
78
  # --------------------------------------------------------------------------------
79
+ # 4. EXPLORATION & DIVERSITY
80
  # --------------------------------------------------------------------------------
81
  class ExplorationRubric(Rubric):
 
 
 
 
 
82
  def __init__(self, penalty: float = -0.05, bonus: float = 0.021):
83
  self.penalty = penalty
84
  self.bonus = bonus
 
96
 
97
 
98
  # --------------------------------------------------------------------------------
99
+ # 5. ANTI‑HACKING & CONSISTENCY
100
  # --------------------------------------------------------------------------------
101
  class AntiHackingRubric(Rubric):
 
 
 
 
 
 
 
102
  def __call__(self, env, action, obs, reward, done, info):
103
  if info.get("action_type") != "propose_fix":
104
  return 0.0
 
107
  score -= 0.25
108
  if env._step_count < 2:
109
  score -= 0.1
 
110
  if env._tests_run and env._linter_run:
111
  score += 0.02
112
  return score
113
 
114
 
115
  # --------------------------------------------------------------------------------
116
+ # 6. STEP PENALTY
117
  # --------------------------------------------------------------------------------
118
  class StepPenaltyRubric(Rubric):
 
 
 
119
  def __init__(self, penalty: float = -0.01):
120
  self.penalty = penalty
121