teamforge / reward.py
Your Name
fix(OpenEnv): implement system-wide [0.1, 0.9] boundary scrub for Phase 2 compliance
efa2d2a
"""
TeamForge Reward Function
Dense, shaped reward signal returned at every step.
Formula:
r = base_reward
+ test_progress_bonus
+ lint_bonus
- error_penalty
- inefficiency_penalty
- test_modification_penalty
"""
from __future__ import annotations
import re
from typing import Optional
# ─────────────────────────────────────────────
# CONSTANTS
# ─────────────────────────────────────────────
# Positive signals: added on top of STEP_BASE_REWARD when the action succeeds.
PLAN_STEP_REWARD = 0.05  # action_type "plan_step"
EDIT_FILE_REWARD = 0.05  # action_type "edit_file"
COMMIT_REWARD = 0.10  # action_type "commit"
REVIEW_REWARD = 0.15  # action_type "generate_review"
REFLECT_REWARD = 0.10  # action_type "self_reflect"
# Paid once per additional passing test compared with the previous report.
TEST_PASS_BONUS_PER_TEST = 0.05
# Paid whenever a step reports zero lint violations.
LINT_CLEAN_BONUS = 0.05
# Neutral/small signals (replacing negative penalties to stay strictly in the 0.1-0.9 range).
# We use 0.1 to satisfy the "strictly between 0 and 1" requirement with high rounding safety.
ACTION_ERROR_REWARD = 0.1  # returned for the first failure of an action type
REPEATED_FAILURE_REWARD = 0.1  # returned from the second consecutive failure onwards
STEP_BASE_REWARD = 0.1  # starting value of every successful step's reward
TEST_MODIFICATION_REWARD = 0.1  # returned when the edited file is a test file
# Despite the name, compute() uses this as the bonus per lint violation
# removed since the previous report.
LINT_VIOLATION_REWARD = 0.1
# ─────────────────────────────────────────────
# REWARD CALCULATOR
# ─────────────────────────────────────────────
class RewardCalculator:
    """
    Stateful reward calculator.

    Remembers the previously reported test/lint results so that progress
    bonuses are paid on improvement (deltas), and counts consecutive
    failures per action type. Every value returned by ``compute`` is
    clamped to [0.1, 0.9].
    """

    # Bounds required by the OpenEnv validator (rewards strictly inside 0..1).
    _MIN_REWARD = 0.1
    _MAX_REWARD = 0.9

    # Splits a file name into words on separators, digits and camelCase humps,
    # e.g. "TestFoo_bar.py" -> ["Test", "Foo", "bar", "py"].
    _NAME_TOKEN_RE = re.compile(r"[A-Z]+(?![a-z])|[A-Z]?[a-z]+")
    # A file name containing one of these as a whole word is treated as a test.
    _TEST_NAME_TOKENS = frozenset({"test", "tests", "conftest"})

    def __init__(self):
        self._prev_tests_passed: int = 0
        self._prev_lint_violations: int = 0
        # action_type -> number of consecutive failures
        self._action_failure_counts: dict[str, int] = {}
        # Normalised (lower-case, forward-slash) registered test paths
        self._test_files: list[str] = []

    def set_test_files(self, test_files: list[str]) -> None:
        """Register which files are tests (so we can penalise modification).

        Paths are stored lower-cased with forward slashes. Empty entries are
        dropped because an empty pattern would match every path.
        """
        normalised = (self._normalise(f) for f in test_files)
        self._test_files = [f for f in normalised if f]

    def compute(
        self,
        action_type: str,
        action_success: bool,
        action_output: str,
        tests_passed: Optional[int] = None,
        lint_violations: Optional[int] = None,
        edited_file: Optional[str] = None,
    ) -> float:
        """Return the reward for one environment step.

        Args:
            action_type: Name of the action that was executed.
            action_success: Whether the action succeeded.
            action_output: Raw action output (currently unused).
            tests_passed: Number of passing tests reported by this step, if any.
            lint_violations: Number of lint violations reported by this step,
                if any.
            edited_file: Path of the file edited by this step, if any.

        Returns:
            A reward in [0.1, 0.9], rounded to 4 decimal places.
        """
        # ── Test-file modification penalty (as small positive reward) ──
        # Short-circuits before any state is updated.
        if edited_file and self._is_test_file(edited_file):
            return self._clamp(TEST_MODIFICATION_REWARD)

        # ── Action failure ──
        if not action_success:
            failures = self._action_failure_counts.get(action_type, 0) + 1
            self._action_failure_counts[action_type] = failures
            if failures >= 2:
                return self._clamp(REPEATED_FAILURE_REWARD)
            return self._clamp(ACTION_ERROR_REWARD)

        # Reset failure count on success
        self._action_failure_counts.pop(action_type, None)

        # ── Per-action rewards ──
        # Unrecognised action types earn no bonus on top of the base reward;
        # they must not outrank recognised actions such as "plan_step".
        reward = STEP_BASE_REWARD
        reward += {
            "plan_step": PLAN_STEP_REWARD,
            "edit_file": EDIT_FILE_REWARD,
            "commit": COMMIT_REWARD,
            "generate_review": REVIEW_REWARD,
            "self_reflect": REFLECT_REWARD,
            "run_tests": 0.02,
            "run_lint": 0.02,
            "request_iteration": 0.02,
        }.get(action_type, 0.0)

        # ── Test progress bonus ──
        if tests_passed is not None:
            gained = tests_passed - self._prev_tests_passed
            if gained > 0:
                reward += gained * TEST_PASS_BONUS_PER_TEST
            # NOTE(review): the baseline follows the latest count, not the
            # best count, so breaking and re-fixing a test re-earns the
            # bonus. Confirm whether a high-water mark is wanted.
            self._prev_tests_passed = tests_passed

        # ── Lint improvement bonus ──
        if lint_violations is not None:
            if lint_violations == 0:
                reward += LINT_CLEAN_BONUS
            else:
                removed = self._prev_lint_violations - lint_violations
                if removed > 0:  # fewer violations than last time
                    reward += removed * LINT_VIOLATION_REWARD
            self._prev_lint_violations = lint_violations

        # Final clamp to within [0.1, 0.9] per OpenEnv validator requirement
        return self._clamp(reward)

    @classmethod
    def _clamp(cls, value: float) -> float:
        """Clamp *value* to the allowed reward range and round to 4 places."""
        return round(max(cls._MIN_REWARD, min(cls._MAX_REWARD, value)), 4)

    @staticmethod
    def _normalise(path: str) -> str:
        """Lower-case *path*, use forward slashes and drop leading ``./``."""
        norm = path.replace("\\", "/").lower()
        while norm.startswith("./"):
            norm = norm[2:]
        return norm

    def _is_test_file(self, path: str) -> bool:
        """Return True if *path* is a registered test file or looks like one.

        A registered entry matches when it equals the path or is a suffix of
        it starting at a directory boundary. Otherwise the file name is split
        into words and flagged when one of them is "test", "tests" or
        "conftest"; a mere substring such as "latest.py" does not count.
        Files with other naming schemes must be registered through
        ``set_test_files``.
        """
        norm = self._normalise(path)
        for registered in self._test_files:
            if not registered:
                continue
            # Anchor the suffix at a "/" so "x_tests/a.py" != "tests/a.py".
            suffix = registered if registered.startswith("/") else "/" + registered
            if norm == registered or norm.endswith(suffix):
                return True

        # Tokenise the original-case name so camelCase humps are visible.
        basename = path.replace("\\", "/").rsplit("/", 1)[-1]
        words = {w.lower() for w in self._NAME_TOKEN_RE.findall(basename)}
        return not words.isdisjoint(self._TEST_NAME_TOKENS)