# rubrics.py - Self-contained Rubrics (no external OpenEnv dependency)
"""Reward-shaping rubrics.

Every rubric is a callable with the signature
``rubric(env, action, obs, reward, done, info)`` that returns a float
score contribution for one environment step.  The rubrics only read
state from ``env`` and ``info``; none of them mutates either object.

NOTE(review): this module was reconstructed from a copy whose line
breaks and indentation had been flattened.  Block nesting was recovered
from statement order and the inline comments - confirm against the
canonical file where marked below.
"""


class Rubric:
    """Minimal Rubric base - compatible with OpenEnv but self-contained.

    Subclasses override ``__call__``; the base implementation contributes
    nothing to the reward.
    """

    def __call__(self, env, action, obs, reward, done, info) -> float:
        """Return the neutral score ``0.0`` regardless of the inputs."""
        return 0.0


# --------------------------------------------------------------------------------
# 1. TOOL-USAGE BONUS
# --------------------------------------------------------------------------------
class ToolUsageRubric(Rubric):
    """Reward use of the test runner, linter, docs lookup and questions.

    Args:
        bonus: First-use bonus for ``run_tests`` / ``run_linter``; half of
            it is granted for the first ``query_docs``.
    """

    def __init__(self, bonus: float = 0.05):
        self.bonus = bonus

    def __call__(self, env, action, obs, reward, done, info) -> float:
        """Score the tool action named by ``info["action_type"]``.

        Reads ``env._tests_run``, ``env._linter_run``, ``env._docs_queried``
        and ``env._step_count``; optional ``info`` keys are
        ``prev_tests_run``, ``prev_linter_run``, ``prev_docs_queried`` and
        ``docs_query_len``.

        Returns:
            The bonus (possibly negative for repeated docs queries), or
            ``0.0`` for any other action type.
        """
        score = 0.0
        action_type = info.get("action_type", "")

        # Use pre-action flags from `info` so first-use bonuses are
        # computed correctly even though env flags are mutated in-step.
        prev_tests_run = info.get("prev_tests_run", env._tests_run)
        prev_linter_run = info.get("prev_linter_run", env._linter_run)
        prev_docs_queried = info.get("prev_docs_queried", env._docs_queried)

        # NOTE(review): the flat +0.015 is treated as a per-use reward
        # (outside the first-use check) - confirm against the original.
        if action_type == "run_tests":
            if not prev_tests_run:
                score += self.bonus
            score += 0.015
        elif action_type == "run_linter":
            if not prev_linter_run:
                score += self.bonus
            score += 0.015
        elif action_type == "query_docs":
            if not prev_docs_queried:
                score += self.bonus * 0.5
            # Encourage docs usage when it is likely useful:
            # - early exploration phase
            # - non-trivial query text
            if env._step_count <= 4 and info.get("docs_query_len", 0) >= 8:
                score += 0.01
            # Discourage repeated docs calls after the first-use signal.
            if prev_docs_queried:
                score -= 0.01
        elif action_type == "question" and env._step_count <= 3:
            score += 0.02

        return score


# --------------------------------------------------------------------------------
# 2. DELTA-BASED REWARDS
# --------------------------------------------------------------------------------
class TestDeltaRubric(Rubric):
    """Reward the change in test score between the previous and current step.

    Args:
        weight: Multiplier applied to the test-score delta.
    """

    def __init__(self, weight: float = 0.3):
        self.weight = weight

    def __call__(self, env, action, obs, reward, done, info) -> float:
        """Return ``weight * (current - previous)`` test score.

        The weight is scaled by 0.4 when ``info["action_type"]`` is
        ``"fix"``.
        """
        delta = env._current_test_score - env._previous_test_score
        effective = self.weight
        if info.get("action_type") == "fix":
            effective *= 0.4
        return effective * delta


class LintDeltaRubric(Rubric):
    """Reward the change in lint score between the previous and current step.

    Args:
        weight: Base multiplier; it is halved before being applied to the
            lint-score delta.
    """

    def __init__(self, weight: float = 0.3):
        self.weight = weight

    def __call__(self, env, action, obs, reward, done, info) -> float:
        """Return ``0.5 * weight * (current - previous)`` lint score.

        The effective weight is additionally scaled by 0.4 when
        ``info["action_type"]`` is ``"fix"``.
        """
        delta = env._current_lint_score - env._previous_lint_score
        effective = self.weight * 0.5
        if info.get("action_type") == "fix":
            effective *= 0.4
        return effective * delta


# --------------------------------------------------------------------------------
# 3. TERMINAL SUCCESS BONUS
# --------------------------------------------------------------------------------
class TerminalSuccessRubric(Rubric):
    """Grant a bonus to ``fix`` actions that reach a high test score."""

    def __call__(self, env, action, obs, reward, done, info) -> float:
        """Return 0.4 above a 0.95 test score, 0.2 above 0.85, else 0.0.

        Only ``fix`` actions are scored; both thresholds are strict.
        """
        if info.get("action_type") != "fix":
            return 0.0

        score = 0.0
        if env._current_test_score > 0.95:
            score += 0.4
        elif env._current_test_score > 0.85:
            score += 0.2
        return score


# --------------------------------------------------------------------------------
# 4. EXPLORATION & DIVERSITY
# --------------------------------------------------------------------------------
class ExplorationRubric(Rubric):
    """Penalise repeating one action and reward varied recent actions.

    Args:
        penalty: Score returned when the last three actions are identical.
        bonus: Score returned when the last three actions are all distinct.
    """

    def __init__(self, penalty: float = -0.05, bonus: float = 0.021):
        self.penalty = penalty
        self.bonus = bonus

    def __call__(self, env, action, obs, reward, done, info) -> float:
        """Score the diversity of the last three ``env._action_history`` entries.

        Returns ``0.0`` while fewer than three actions are recorded or when
        exactly two of the last three match.  History entries must be
        hashable because they are de-duplicated through a ``set``.
        """
        if len(env._action_history) < 3:
            return 0.0

        recent = env._action_history[-3:]
        unique = len(set(recent))
        if unique == 1:
            return self.penalty
        elif unique == 3:
            return self.bonus
        return 0.0


# --------------------------------------------------------------------------------
# 5. ANTI-HACKING & CONSISTENCY
# --------------------------------------------------------------------------------
class AntiHackingRubric(Rubric):
    """Penalise ``fix`` actions submitted without prior verification."""

    def __call__(self, env, action, obs, reward, done, info) -> float:
        """Return the consistency adjustment for a ``fix`` action.

        Non-``fix`` actions score ``0.0``.  For a ``fix``: -0.25 when
        ``env._tests_run`` is falsy, -0.1 when ``env._step_count`` is below
        2, and +0.02 when both tests and linter have been run.  The
        adjustments accumulate.
        """
        if info.get("action_type") != "fix":
            return 0.0

        score = 0.0
        if not env._tests_run:
            score -= 0.25
        if env._step_count < 2:
            score -= 0.1
        if env._tests_run and env._linter_run:
            score += 0.02
        return score


# --------------------------------------------------------------------------------
# 6. STEP PENALTY
# --------------------------------------------------------------------------------
class StepPenaltyRubric(Rubric):
    """Apply a constant per-step score.

    Args:
        penalty: Value returned on every call (negative by default).
    """

    def __init__(self, penalty: float = -0.01):
        self.penalty = penalty

    def __call__(self, env, action, obs, reward, done, info) -> float:
        """Return the configured penalty unchanged."""
        return self.penalty