"""
TeamForge Reward Function

Dense, shaped reward signal returned at every step.

Formula (successful action that does not edit a test file):
    r = clamp(step_base + action_reward + test_progress_bonus + lint_bonus,
              0.1, 0.9)

Failed actions and test-file modifications are not expressed as negative
penalties: they short-circuit to a flat floor reward of 0.1, so every value
returned by the calculator lies in the closed interval [0.1, 0.9].
"""

from __future__ import annotations

import re
from typing import Optional

# ─────────────────────────────────────────────
# CONSTANTS
# ─────────────────────────────────────────────

# Positive signals
PLAN_STEP_REWARD = 0.05
EDIT_FILE_REWARD = 0.05
COMMIT_REWARD = 0.10
REVIEW_REWARD = 0.15
REFLECT_REWARD = 0.10
TEST_PASS_BONUS_PER_TEST = 0.05
LINT_CLEAN_BONUS = 0.05

# Neutral/small signals (replacing negative penalties so rewards stay in the
# 0.1-0.9 range). 0.1 is used to satisfy the "strictly between 0 and 1"
# requirement with a comfortable rounding margin.
ACTION_ERROR_REWARD = 0.1
REPEATED_FAILURE_REWARD = 0.1
STEP_BASE_REWARD = 0.1
TEST_MODIFICATION_REWARD = 0.1
LINT_VIOLATION_REWARD = 0.1

# Bonus added on top of STEP_BASE_REWARD for each successful action type.
# Built once at import time instead of on every compute() call.
_ACTION_REWARDS = {
    "plan_step": PLAN_STEP_REWARD,
    "edit_file": EDIT_FILE_REWARD,
    "commit": COMMIT_REWARD,
    "generate_review": REVIEW_REWARD,
    "self_reflect": REFLECT_REWARD,
    "run_tests": 0.02,
    "run_lint": 0.02,
    "request_iteration": 0.02,
}

# Bonus for a successful action whose type is not listed in _ACTION_REWARDS.
_UNKNOWN_ACTION_REWARD = 0.1

# Inclusive bounds applied to the final reward of a successful step.
_MIN_REWARD = 0.1
_MAX_REWARD = 0.9


# ─────────────────────────────────────────────
# REWARD CALCULATOR
# ─────────────────────────────────────────────


class RewardCalculator:
    """
    Stateful reward calculator.

    Tracks the previously observed test/lint state to compute delta-based
    rewards, and counts consecutive failures per action type.
    """

    def __init__(self):
        self._prev_tests_passed: int = 0
        self._prev_lint_violations: int = 0
        self._action_failure_counts: dict[str, int] = {}
        self._test_files: list[str] = []

    def set_test_files(self, test_files: list[str]) -> None:
        """Register which files are tests (so modification can be detected).

        Paths are stored lower-cased. Empty entries are dropped: an empty
        string is a suffix of every path and would otherwise mark every
        edited file as a test file.
        """
        self._test_files = [f.lower() for f in test_files if f]

    def compute(
        self,
        action_type: str,
        action_success: bool,
        action_output: str,
        tests_passed: Optional[int] = None,
        lint_violations: Optional[int] = None,
        edited_file: Optional[str] = None,
    ) -> float:
        """Return the reward for one environment step.

        Args:
            action_type: Name of the action taken; selects the per-action
                bonus. Unlisted names receive a default bonus.
            action_success: Whether the action succeeded.
            action_output: Raw action output. Currently unused; kept for
                interface compatibility.
            tests_passed: Number of passing tests observed this step, or
                None if tests were not run.
            lint_violations: Number of lint violations observed this step,
                or None if lint was not run.
            edited_file: Path of the file edited by this action, if any.

        Returns:
            A reward in the closed interval [0.1, 0.9].
        """
        reward = STEP_BASE_REWARD

        # ── Test-file modification (floor reward instead of a penalty) ──
        # Checked first: touching a test file short-circuits everything
        # else, including failure counting and test/lint state updates.
        if edited_file and self._is_test_file(edited_file):
            return TEST_MODIFICATION_REWARD

        # ── Action failure ──
        if not action_success:
            count = self._action_failure_counts.get(action_type, 0) + 1
            self._action_failure_counts[action_type] = count
            if count >= 2:
                return REPEATED_FAILURE_REWARD
            return ACTION_ERROR_REWARD

        # Reset failure count on success
        self._action_failure_counts.pop(action_type, None)

        # ── Per-action rewards ──
        reward += _ACTION_REWARDS.get(action_type, _UNKNOWN_ACTION_REWARD)

        # ── Test progress bonus ──
        if tests_passed is not None:
            delta = tests_passed - self._prev_tests_passed
            if delta > 0:
                reward += delta * TEST_PASS_BONUS_PER_TEST
            # NOTE(review): the baseline follows the latest observation, so
            # a regression followed by a recovery is rewarded again —
            # confirm this is intended.
            self._prev_tests_passed = tests_passed

        # ── Lint improvement bonus ──
        if lint_violations is not None:
            if lint_violations == 0:
                reward += LINT_CLEAN_BONUS
            else:
                delta = lint_violations - self._prev_lint_violations
                if delta < 0:  # fewer violations
                    reward += abs(delta) * LINT_VIOLATION_REWARD
            self._prev_lint_violations = lint_violations

        # Final clamp to the closed interval [0.1, 0.9], which lies strictly
        # inside (0, 1) as the OpenEnv validator requires.
        return round(max(_MIN_REWARD, min(_MAX_REWARD, reward)), 4)

    def _is_test_file(self, path: str) -> bool:
        """Return True if *path* looks like a test file.

        A path matches when it equals a registered test file, when it ends
        with a registered test file on a "/" boundary (so "spec.py" matches
        "src/spec.py" but not "src/myspec.py"), or when its final "/"
        component contains the substring "test".
        """
        low = path.lower()
        if any(
            tf and (low == tf or low.endswith("/" + tf))
            for tf in self._test_files
        ):
            return True
        # Deliberately broad fallback: any basename containing "test".
        return "test" in low.rsplit("/", 1)[-1]