Spaces:
Sleeping
Sleeping
Your Name
fix(OpenEnv): implement system-wide [0.1, 0.9] boundary scrub for Phase 2 compliance
efa2d2a | """ | |
| TeamForge Reward Function | |
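| Dense, shaped reward signal returned at every step. | |
| Formula (successful action that does not edit a test file): | |
| r = clamp(base_reward | |
| + per_action_reward | |
| + test_progress_bonus | |
| + lint_bonus, 0.1, 0.9) | |
| The former error, inefficiency and test-modification penalties are no longer | |
| subtracted: a failed action or an edit to a test file returns the 0.1 floor | |
| directly, so every reward stays inside [0.1, 0.9]. | |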
| """ | |
| from __future__ import annotations | |
| import re | |
| from typing import Optional | |
| # ───────────────────────────────────────────── | |
| # CONSTANTS | |
| # ───────────────────────────────────────────── | |
# Floor-valued signals.
# These used to be negative penalties. Each is pinned to 0.1 so that a step
# reward never leaves the 0.1-0.9 band ("strictly between 0 and 1", with
# headroom for rounding).
STEP_BASE_REWARD = 0.1          # starting value of every step's reward
ACTION_ERROR_REWARD = 0.1       # returned for a failed action
REPEATED_FAILURE_REWARD = 0.1   # returned when the same action type fails again
TEST_MODIFICATION_REWARD = 0.1  # returned when the edited file is a test file
LINT_VIOLATION_REWARD = 0.1     # paid per lint violation removed

# Increments added on top of the base reward for a successful action.
PLAN_STEP_REWARD = 0.05
EDIT_FILE_REWARD = 0.05
REFLECT_REWARD = 0.10
COMMIT_REWARD = 0.10
REVIEW_REWARD = 0.15

# Progress bonuses.
TEST_PASS_BONUS_PER_TEST = 0.05  # per additional passing test
LINT_CLEAN_BONUS = 0.05          # when a lint run reports zero violations
| # ───────────────────────────────────────────── | |
| # REWARD CALCULATOR | |
| # ───────────────────────────────────────────── | |
class RewardCalculator:
    """Stateful reward calculator for a sequence of agent steps.

    The instance remembers:

    * the test-pass count and lint-violation count reported by the previous
      successful step, so bonuses are paid for improvement (deltas);
    * how many times each action type has failed since it last succeeded;
    * the registered test-file paths, so edits to them can be detected.
    """

    # Bounds applied to the final reward of a successful step.
    _MIN_REWARD = 0.1
    _MAX_REWARD = 0.9

    def __init__(self):
        self._prev_tests_passed: int = 0
        self._prev_lint_violations: int = 0
        self._action_failure_counts: dict[str, int] = {}
        self._test_files: list[str] = []

    @staticmethod
    def _normalise_path(path: str) -> str:
        """Return *path* trimmed, lower-cased and using ``/`` separators."""
        return path.strip().lower().replace("\\", "/")

    def set_test_files(self, test_files: list[str]) -> None:
        """Register which files are tests, replacing any earlier registration.

        Paths are stored normalised (see ``_normalise_path``). Empty or
        whitespace-only entries are dropped: an empty entry would otherwise
        match every path, because ``str.endswith("")`` is always true.
        """
        normalised = (self._normalise_path(entry) for entry in test_files)
        self._test_files = [entry for entry in normalised if entry]

    def compute(
        self,
        action_type: str,
        action_success: bool,
        action_output: str,
        tests_passed: Optional[int] = None,
        lint_violations: Optional[int] = None,
        edited_file: Optional[str] = None,
    ) -> float:
        """Return the reward for one step and update the tracked state.

        Args:
            action_type: Name of the action taken (e.g. ``"edit_file"``).
            action_success: Whether the action succeeded.
            action_output: Not used by the calculation; kept so the call
                signature stays stable for callers.
            tests_passed: Number of passing tests after this step, if known.
            lint_violations: Number of lint violations after this step, if
                known.
            edited_file: Path of the file edited by this step, if any.

        Returns:
            ``TEST_MODIFICATION_REWARD`` when ``edited_file`` is recognised as
            a test file; ``ACTION_ERROR_REWARD`` / ``REPEATED_FAILURE_REWARD``
            for a failed action; otherwise the base reward plus per-action,
            test-progress and lint bonuses, clamped to ``[0.1, 0.9]`` and
            rounded to 4 decimals.
        """
        # -- Test-file modification: short-circuits before any state changes --
        if edited_file and self._is_test_file(edited_file):
            return TEST_MODIFICATION_REWARD

        # -- Action failure: count it; test/lint state is left untouched --
        if not action_success:
            failures = self._action_failure_counts.get(action_type, 0) + 1
            self._action_failure_counts[action_type] = failures
            if failures >= 2:
                return REPEATED_FAILURE_REWARD
            return ACTION_ERROR_REWARD

        # A success clears the failure streak for this action type.
        self._action_failure_counts.pop(action_type, None)

        # -- Per-action reward; action types not listed here receive 0.1 --
        reward = STEP_BASE_REWARD
        reward += {
            "plan_step": PLAN_STEP_REWARD,
            "edit_file": EDIT_FILE_REWARD,
            "commit": COMMIT_REWARD,
            "generate_review": REVIEW_REWARD,
            "self_reflect": REFLECT_REWARD,
            "run_tests": 0.02,
            "run_lint": 0.02,
            "request_iteration": 0.02,
        }.get(action_type, 0.1)

        reward += self._test_progress_bonus(tests_passed)
        reward += self._lint_bonus(lint_violations)

        # Final clamp to [0.1, 0.9] per the OpenEnv validator requirement.
        bounded = max(self._MIN_REWARD, min(self._MAX_REWARD, reward))
        return round(bounded, 4)

    def _test_progress_bonus(self, tests_passed: Optional[int]) -> float:
        """Return the bonus for newly passing tests and record the new count."""
        if tests_passed is None:
            return 0.0
        gained = tests_passed - self._prev_tests_passed
        self._prev_tests_passed = tests_passed
        if gained > 0:
            return gained * TEST_PASS_BONUS_PER_TEST
        return 0.0

    def _lint_bonus(self, lint_violations: Optional[int]) -> float:
        """Return the lint bonus and record the new violation count."""
        if lint_violations is None:
            return 0.0
        removed = self._prev_lint_violations - lint_violations
        self._prev_lint_violations = lint_violations
        if lint_violations == 0:
            return LINT_CLEAN_BONUS
        if removed > 0:  # fewer violations than on the previous report
            return removed * LINT_VIOLATION_REWARD
        return 0.0

    def _is_test_file(self, path: str) -> bool:
        """Return True when *path* refers to a test file.

        A path matches when it equals a registered test file, when it ends
        with one on a ``/`` boundary (so ``checks.py`` matches
        ``pkg/checks.py`` but not ``pkg/mychecks.py``), or when its final
        path component contains the substring ``"test"``.
        """
        low = self._normalise_path(path)
        for registered in self._test_files:
            tail = registered.lstrip("/")
            if not tail:
                # Skip blank entries; they would match every path.
                continue
            if low == registered or low.endswith("/" + tail):
                return True
        # Deliberately broad filename heuristic, unchanged from the original:
        # any basename containing "test" counts (this includes e.g. "latest.py").
        return "test" in low.rsplit("/", 1)[-1]