Spaces:

PrakashCider
/

teamforge

Sleeping

Your Name

fix(OpenEnv): implement system-wide [0.1, 0.9] boundary scrub for Phase 2 compliance

efa2d2a about 2 months ago

4.37 kB

	"""
	TeamForge Reward Function
	Dense, shaped reward signal returned at every step.

	Formula:
	r = base_reward
	+ test_progress_bonus
	+ lint_bonus
	- error_penalty
	- inefficiency_penalty
	- test_modification_penalty
	"""

	from __future__ import annotations

	import re
	from typing import Optional


	# ─────────────────────────────────────────────
	# CONSTANTS
	# ─────────────────────────────────────────────

	# Per-action bonuses: added to the base reward when the action succeeds.
	REVIEW_REWARD = 0.15     # "generate_review"
	COMMIT_REWARD = 0.10     # "commit"
	REFLECT_REWARD = 0.10    # "self_reflect"
	PLAN_STEP_REWARD = 0.05  # "plan_step"
	EDIT_FILE_REWARD = 0.05  # "edit_file"

	# Progress bonuses: added when the test / lint state is reported.
	TEST_PASS_BONUS_PER_TEST = 0.05  # per newly passing test since the last report
	LINT_CLEAN_BONUS = 0.05          # lint report shows zero violations

	# Floor-level signals. These stand in for what would otherwise be negative
	# penalties: every reward must stay strictly between 0 and 1, and 0.1 leaves
	# a safe margin against rounding.
	STEP_BASE_REWARD = 0.1          # starting value of every successful step
	ACTION_ERROR_REWARD = 0.1       # returned for a first failed action
	REPEATED_FAILURE_REWARD = 0.1   # returned when the same action type fails again
	TEST_MODIFICATION_REWARD = 0.1  # returned when the edited file is a test file
	LINT_VIOLATION_REWARD = 0.1     # per lint violation removed since the last report


	# ─────────────────────────────────────────────
	# REWARD CALCULATOR
	# ─────────────────────────────────────────────

	class RewardCalculator:
	    """
	    Stateful calculator for the dense per-step reward.

	    Remembers the last reported number of passing tests and lint
	    violations so that progress bonuses are paid on the change since the
	    previous report, and counts consecutive failures per action type.
	    Every value returned by ``compute`` lies in [0.1, 0.9].
	    """

	    def __init__(self):
	        # Last reported test / lint state, used to compute deltas.
	        self._prev_tests_passed: int = 0
	        self._prev_lint_violations: int = 0
	        # action_type -> failures since that action type last succeeded.
	        self._action_failure_counts: dict[str, int] = {}
	        # Normalised (lower-case, "/"-separated) paths of known test files.
	        self._test_files: list[str] = []

	    @staticmethod
	    def _normalise_path(path: str) -> str:
	        """Lower-case *path*, use "/" as separator and drop leading "./"."""
	        norm = path.lower().replace("\\", "/")
	        while norm.startswith("./"):
	            norm = norm[2:]
	        return norm

	    def set_test_files(self, test_files: list[str]) -> None:
	        """
	        Register which files are tests, so edits to them earn only the floor.

	        Paths are stored normalised. Empty entries are dropped: an empty
	        string is a suffix of every path and would otherwise mark every
	        edited file as a test file.
	        """
	        normalised = (self._normalise_path(f) for f in test_files)
	        self._test_files = [f for f in normalised if f]

	    def compute(
	        self,
	        action_type: str,
	        action_success: bool,
	        action_output: str,
	        tests_passed: Optional[int] = None,
	        lint_violations: Optional[int] = None,
	        edited_file: Optional[str] = None,
	    ) -> float:
	        """
	        Return the reward for one environment step.

	        Args:
	            action_type: Name of the action taken, e.g. "edit_file".
	            action_success: Whether the action completed without error.
	            action_output: Raw output of the action.
	                NOTE(review): currently not read by this method.
	            tests_passed: Number of passing tests after the step, if known.
	            lint_violations: Number of lint violations after the step, if
	                known.
	            edited_file: Path of the file the action modified, if any.

	        Returns:
	            The reward, rounded to 4 decimals and clamped to [0.1, 0.9].
	        """
	        # ── Test-file modification penalty (as small positive reward) ──
	        # Checked first: touching a test file overrides every other signal.
	        if edited_file and self._is_test_file(edited_file):
	            return TEST_MODIFICATION_REWARD

	        # ── Action failure ──
	        if not action_success:
	            count = self._action_failure_counts.get(action_type, 0) + 1
	            self._action_failure_counts[action_type] = count
	            if count >= 2:
	                return REPEATED_FAILURE_REWARD
	            return ACTION_ERROR_REWARD

	        # Reset failure count on success
	        self._action_failure_counts.pop(action_type, None)

	        reward = STEP_BASE_REWARD

	        # ── Per-action rewards ──
	        # NOTE(review): the 0.1 fallback gives an unrecognised action type a
	        # larger bonus than most named actions; confirm this is intended.
	        reward += {
	            "plan_step": PLAN_STEP_REWARD,
	            "edit_file": EDIT_FILE_REWARD,
	            "commit": COMMIT_REWARD,
	            "generate_review": REVIEW_REWARD,
	            "self_reflect": REFLECT_REWARD,
	            "run_tests": 0.02,
	            "run_lint": 0.02,
	            "request_iteration": 0.02,
	        }.get(action_type, 0.1)

	        # ── Test progress bonus ──
	        if tests_passed is not None:
	            delta = tests_passed - self._prev_tests_passed
	            if delta > 0:
	                reward += delta * TEST_PASS_BONUS_PER_TEST
	            # Recorded even on a regression, so later deltas are measured
	            # against the most recent report.
	            self._prev_tests_passed = tests_passed

	        # ── Lint improvement bonus ──
	        if lint_violations is not None:
	            if lint_violations == 0:
	                reward += LINT_CLEAN_BONUS
	            else:
	                delta = lint_violations - self._prev_lint_violations
	                if delta < 0:  # fewer violations
	                    reward += abs(delta) * LINT_VIOLATION_REWARD
	            self._prev_lint_violations = lint_violations

	        # Final clamp to [0.1, 0.9] per OpenEnv validator requirement
	        return round(max(0.1, min(0.9, reward)), 4)

	    def _is_test_file(self, path: str) -> bool:
	        """
	        Return True if *path* refers to a test file.

	        A path is a test file when it equals a registered test file, when
	        it ends with a registered test file on a path-component boundary,
	        or when its file name contains "test".
	        """
	        low = self._normalise_path(path)
	        for tf in self._test_files:
	            if not tf:
	                # An empty entry would match every path.
	                continue
	            # Anchor the suffix on "/" so "otherchecks/a.py" does not match
	            # a registered "checks/a.py".
	            suffix = tf if tf.startswith("/") else "/" + tf
	            if low == tf or low.endswith(suffix):
	                return True
	        return "test" in low.rsplit("/", 1)[-1]