File size: 5,232 Bytes
# rubrics.py – Self-contained Rubrics (no external OpenEnv dependency)
class Rubric:
    """Base rubric: a callable that scores one environment step.

    Kept self-contained (no OpenEnv import) while staying compatible with
    the OpenEnv rubric call shape. Subclasses override ``__call__``; the
    base implementation contributes nothing to the reward.
    """

    def __call__(self, env, action, obs, reward, done, info):
        """Return the neutral score ``0.0``; all arguments are ignored."""
        neutral_score = 0.0
        return neutral_score
# --------------------------------------------------------------------------------
# 1. TOOL‑USAGE BONUS
# --------------------------------------------------------------------------------
class ToolUsageRubric(Rubric):
    """Shaping reward for tool actions, with an extra bonus on first use.

    The action kind is read from ``info["action_type"]``. The "already
    used" flags are read from ``info`` (``prev_tests_run``,
    ``prev_linter_run``, ``prev_docs_queried``) and fall back to the
    matching ``env`` attribute when a key is absent.
    """

    def __init__(self, bonus: float = 0.05):
        # First-use bonus; a first docs query earns half of it.
        self.bonus = bonus

    def __call__(self, env, action, obs, reward, done, info):
        """Return the tool-usage score for the step described by ``info``."""
        kind = info.get("action_type", "")

        # Prefer the pre-action flags carried in `info`: the env flags are
        # mutated in-step, so they cannot tell whether this is a first use.
        tests_seen = info.get("prev_tests_run", env._tests_run)
        lint_seen = info.get("prev_linter_run", env._linter_run)
        docs_seen = info.get("prev_docs_queried", env._docs_queried)

        if kind == "run_tests":
            # First-use bonus (if any) plus a flat per-call reward.
            first_use = self.bonus if not tests_seen else 0.0
            return 0.0 + first_use + 0.015

        if kind == "run_linter":
            first_use = self.bonus if not lint_seen else 0.0
            return 0.0 + first_use + 0.015

        if kind == "query_docs":
            total = 0.0
            if not docs_seen:
                total += self.bonus * 0.5
            # Docs are most likely useful early on and with a non-trivial
            # query text, so that combination earns a little extra.
            if env._step_count <= 4 and info.get("docs_query_len", 0) >= 8:
                total += 0.01
            # Repeated docs calls after the first one are discouraged.
            return (total - 0.01) if docs_seen else total

        if kind == "question" and env._step_count <= 3:
            return 0.02

        return 0.0
# --------------------------------------------------------------------------------
# 2. DELTA‑BASED REWARDS
# --------------------------------------------------------------------------------
class TestDeltaRubric(Rubric):
    """Score the step by the change in the environment's test score."""

    def __init__(self, weight: float = 0.3):
        # Multiplier applied to the test-score difference.
        self.weight = weight

    def __call__(self, env, action, obs, reward, done, info):
        """Return ``weight * delta``, scaled by 0.4 when the action is a fix.

        ``delta`` is ``env._current_test_score - env._previous_test_score``.
        """
        improvement = env._current_test_score - env._previous_test_score
        is_fix = info.get("action_type") == "fix"
        scale = self.weight * 0.4 if is_fix else self.weight
        return scale * improvement
class LintDeltaRubric(Rubric):
    """Score the step by the change in the environment's lint score."""

    def __init__(self, weight: float = 0.3):
        # Base multiplier; it is halved before being applied to the delta.
        self.weight = weight

    def __call__(self, env, action, obs, reward, done, info):
        """Return ``weight * 0.5 * delta``, scaled by 0.4 on a fix action.

        ``delta`` is ``env._current_lint_score - env._previous_lint_score``.
        """
        change = env._current_lint_score - env._previous_lint_score
        halved = self.weight * 0.5
        fix_step = info.get("action_type") == "fix"
        return (halved * 0.4 if fix_step else halved) * change
# --------------------------------------------------------------------------------
# 3. TERMINAL SUCCESS BONUS
# --------------------------------------------------------------------------------
class TerminalSuccessRubric(Rubric):
    """Bonus granted on a ``fix`` action when the test score is high."""

    def __call__(self, env, action, obs, reward, done, info):
        """Return 0.4 above 0.95, 0.2 above 0.85, otherwise 0.0.

        Any action other than ``"fix"`` scores 0.0.
        """
        if info.get("action_type") == "fix":
            final_score = env._current_test_score
            # Thresholds are checked from highest to lowest; first hit wins.
            for threshold, payout in ((0.95, 0.4), (0.85, 0.2)):
                if final_score > threshold:
                    return payout
        return 0.0
# --------------------------------------------------------------------------------
# 4. EXPLORATION & DIVERSITY
# --------------------------------------------------------------------------------
class ExplorationRubric(Rubric):
    """Score the variety of the three most recent actions."""

    def __init__(self, penalty: float = -0.05, bonus: float = 0.021):
        # Returned when the last three actions are all the same.
        self.penalty = penalty
        # Returned when the last three actions are all different.
        self.bonus = bonus

    def __call__(self, env, action, obs, reward, done, info):
        """Return penalty, bonus, or 0.0 based on ``env._action_history``.

        With fewer than three recorded actions the score is 0.0.
        """
        history = env._action_history
        if len(history) < 3:
            return 0.0
        distinct = len(set(history[-3:]))
        # Exactly two distinct actions is neither rewarded nor punished.
        outcome = {1: self.penalty, 3: self.bonus}
        return outcome.get(distinct, 0.0)
# --------------------------------------------------------------------------------
# 5. ANTI‑HACKING & CONSISTENCY
# --------------------------------------------------------------------------------
class AntiHackingRubric(Rubric):
    """Adjust the reward of a ``fix`` action based on prior tool usage."""

    def __call__(self, env, action, obs, reward, done, info):
        """Return the summed adjustments for a fix; 0.0 for other actions.

        Adjustments: -0.25 when tests were not run, -0.1 when the step
        count is below 2, +0.02 when both tests and linter were run.
        """
        if info.get("action_type") == "fix":
            adjustments = (
                (not env._tests_run, -0.25),
                (env._step_count < 2, -0.1),
                (env._tests_run and env._linter_run, 0.02),
            )
            total = 0.0
            # Plain in-order accumulation keeps the float result stable.
            for applies, amount in adjustments:
                if applies:
                    total += amount
            return total
        return 0.0
# --------------------------------------------------------------------------------
# 6. STEP PENALTY
# --------------------------------------------------------------------------------
class StepPenaltyRubric(Rubric):
    """Constant score returned on every step, regardless of the action."""

    def __init__(self, penalty: float = -0.01):
        # Value returned by every call; negative by default.
        self.penalty = penalty

    def __call__(self, env, action, obs, reward, done, info):
        """Return the configured ``penalty``; all arguments are ignored."""
        per_step = self.penalty
        return per_step
|