Spaces:

100XZX001
/

code-review-training

Sleeping

App Files Files Community

code-review-training / rubrics.py

100XZX001

Upload 18 files

5454b21 verified about 1 month ago

raw

history blame contribute delete

5.1 kB

	# rubrics.py – Self-contained Rubrics (no external OpenEnv dependency)

	class Rubric:
	    """Self-contained base rubric whose call signature is OpenEnv-compatible.

	    A rubric is a callable that maps one environment step to a float reward
	    component. This base implementation contributes nothing.
	    """

	    def __call__(self, env, action, obs, reward, done, info):
	        """Return the neutral score ``0.0`` regardless of the arguments."""
	        neutral_score = 0.0
	        return neutral_score


	# --------------------------------------------------------------------------------
	# 1. TOOL‑USAGE BONUS
	# --------------------------------------------------------------------------------
	# Reward component for tool-style actions. The action is identified by
	# info["action_type"]; the handled values are "run_tests", "run_linter",
	# "query_docs" and "question". Any other action type scores 0.0.
	class ToolUsageRubric(Rubric):
	# bonus: base first-use bonus; "query_docs" uses half of it (bonus * 0.5).
	def __init__(self, bonus: float = 0.05):
	self.bonus = bonus

	# Returns the float score for this step. Reads env._tests_run,
	# env._linter_run, env._docs_queried and env._step_count, plus the optional
	# info keys "action_type", "prev_tests_run", "prev_linter_run",
	# "prev_docs_queried" and "docs_query_len".
	def __call__(self, env, action, obs, reward, done, info):
	score = 0.0
	action_type = info.get("action_type", "")
	# Use pre-action flags from `info` so first-use bonuses are
	# computed correctly even though env flags are mutated in-step.
	# The env._* fallbacks are evaluated eagerly as the .get() default, so
	# env must expose these attributes even when info supplies the keys.
	prev_tests_run = info.get("prev_tests_run", env._tests_run)
	prev_linter_run = info.get("prev_linter_run", env._linter_run)
	prev_docs_queried = info.get("prev_docs_queried", env._docs_queried)

	# NOTE(review): the nesting of the `score += 0.015` lines below cannot be
	# recovered from this flattened listing. Presumably they apply on every
	# run_tests / run_linter call, with self.bonus added only on first use --
	# confirm against the original file.
	if action_type == "run_tests":
	if not prev_tests_run:
	score += self.bonus
	score += 0.015
	elif action_type == "run_linter":
	if not prev_linter_run:
	score += self.bonus
	score += 0.015
	elif action_type == "query_docs":
	# First docs query earns half of the base bonus.
	if not prev_docs_queried:
	score += self.bonus * 0.5
	# Encourage docs usage when it is likely useful:
	# - early exploration phase
	# - non-trivial query text
	# NOTE(review): presumably independent of the first-use check above;
	# "docs_query_len" defaults to 0 when info does not carry it.
	if env._step_count <= 4 and info.get("docs_query_len", 0) >= 8:
	score += 0.01
	# Discourage repeated docs calls after the first-use signal.
	if prev_docs_queried:
	score -= 0.01
	# A "question" action is rewarded only while env._step_count <= 3.
	elif action_type == "question" and env._step_count <= 3:
	score += 0.02
	return score


	# --------------------------------------------------------------------------------
	# 2. DELTA‑BASED REWARDS
	# --------------------------------------------------------------------------------
	class TestDeltaRubric(Rubric):
	    """Reward proportional to the change in the environment's test score."""

	    def __init__(self, weight: float = 0.3):
	        self.weight = weight

	    def __call__(self, env, action, obs, reward, done, info):
	        """Return the weighted test-score change for this step.

	        The change is ``env._current_test_score - env._previous_test_score``.
	        When ``info["action_type"]`` is ``"fix"`` the weight is scaled by 0.4.
	        """
	        improvement = env._current_test_score - env._previous_test_score
	        is_fix = info.get("action_type") == "fix"
	        scale = self.weight * 0.4 if is_fix else self.weight
	        return scale * improvement


	class LintDeltaRubric(Rubric):
	    """Reward proportional to the change in the environment's lint score."""

	    def __init__(self, weight: float = 0.3):
	        self.weight = weight

	    def __call__(self, env, action, obs, reward, done, info):
	        """Return the weighted lint-score change for this step.

	        The change is ``env._current_lint_score - env._previous_lint_score``.
	        Only half of ``weight`` is applied, and a ``"fix"`` action scales
	        that by a further 0.4.
	        """
	        improvement = env._current_lint_score - env._previous_lint_score
	        half_weight = self.weight * 0.5
	        if info.get("action_type") != "fix":
	            return half_weight * improvement
	        return half_weight * 0.4 * improvement


	# --------------------------------------------------------------------------------
	# 3. TERMINAL SUCCESS BONUS
	# --------------------------------------------------------------------------------
	class TerminalSuccessRubric(Rubric):
	    """Bonus for a "fix" action that leaves the test score above a threshold.

	    The thresholds and bonuses are constructor parameters; the defaults give
	    +0.4 above 0.95 and +0.2 above 0.85.

	    Args:
	        full_threshold: Test score that must be strictly exceeded to earn
	            ``full_bonus``.
	        full_bonus: Score awarded above ``full_threshold``.
	        partial_threshold: Test score that must be strictly exceeded to earn
	            ``partial_bonus`` when ``full_threshold`` is not exceeded.
	        partial_bonus: Score awarded above ``partial_threshold`` only.
	    """

	    def __init__(
	        self,
	        full_threshold: float = 0.95,
	        full_bonus: float = 0.4,
	        partial_threshold: float = 0.85,
	        partial_bonus: float = 0.2,
	    ):
	        self.full_threshold = full_threshold
	        self.full_bonus = full_bonus
	        self.partial_threshold = partial_threshold
	        self.partial_bonus = partial_bonus

	    def __call__(self, env, action, obs, reward, done, info):
	        """Return the success bonus for this step.

	        Returns 0.0 unless ``info["action_type"]`` is ``"fix"``. Otherwise
	        ``env._current_test_score`` is compared against the two thresholds
	        and at most one bonus is awarded.
	        """
	        if info.get("action_type") != "fix":
	            return 0.0
	        score = 0.0
	        test_score = env._current_test_score
	        # The tiers are mutually exclusive: the higher one wins.
	        if test_score > self.full_threshold:
	            score += self.full_bonus
	        elif test_score > self.partial_threshold:
	            score += self.partial_bonus
	        return score


	# --------------------------------------------------------------------------------
	# 4. EXPLORATION & DIVERSITY
	# --------------------------------------------------------------------------------
	class ExplorationRubric(Rubric):
	    """Score the variety of the three most recent entries in the action history.

	    Three identical entries earn ``penalty``; three distinct entries earn
	    ``bonus``; anything else, or fewer than three recorded entries, earns 0.0.
	    """

	    def __init__(self, penalty: float = -0.05, bonus: float = 0.021):
	        self.penalty = penalty
	        self.bonus = bonus

	    def __call__(self, env, action, obs, reward, done, info):
	        """Return the diversity score for the last three ``env._action_history`` items."""
	        if len(env._action_history) < 3:
	            return 0.0
	        distinct_count = len(set(env._action_history[-3:]))
	        # Exactly two distinct actions falls through to the neutral 0.0.
	        outcomes = {1: self.penalty, 3: self.bonus}
	        return outcomes.get(distinct_count, 0.0)


	# --------------------------------------------------------------------------------
	# 5. ANTI‑HACKING & CONSISTENCY
	# --------------------------------------------------------------------------------
	class AntiHackingRubric(Rubric):
	    """Penalise "fix" actions submitted without tests or too early.

	    Three independent checks run on a "fix" action and their results are
	    summed. The amounts are constructor parameters; the defaults are -0.25,
	    -0.1 (below step 2) and +0.02.

	    Args:
	        untested_penalty: Added (negative by default) when ``env._tests_run``
	            is falsy.
	        early_penalty: Added (negative by default) when ``env._step_count``
	            is below ``min_steps``.
	        min_steps: Step count below which a fix is treated as premature.
	        thorough_bonus: Added when both ``env._tests_run`` and
	            ``env._linter_run`` are truthy.
	    """

	    def __init__(
	        self,
	        untested_penalty: float = -0.25,
	        early_penalty: float = -0.1,
	        min_steps: int = 2,
	        thorough_bonus: float = 0.02,
	    ):
	        self.untested_penalty = untested_penalty
	        self.early_penalty = early_penalty
	        self.min_steps = min_steps
	        self.thorough_bonus = thorough_bonus

	    def __call__(self, env, action, obs, reward, done, info):
	        """Return the summed adjustments for this step.

	        Returns 0.0 unless ``info["action_type"]`` is ``"fix"``.
	        """
	        if info.get("action_type") != "fix":
	            return 0.0
	        score = 0.0
	        # NOTE(review): the checks are treated as independent (not nested);
	        # confirm against the original file's indentation.
	        if not env._tests_run:
	            score += self.untested_penalty
	        if env._step_count < self.min_steps:
	            score += self.early_penalty
	        if env._tests_run and env._linter_run:
	            score += self.thorough_bonus
	        return score


	# --------------------------------------------------------------------------------
	# 6. STEP PENALTY
	# --------------------------------------------------------------------------------
	class StepPenaltyRubric(Rubric):
	    """Constant per-step cost, independent of what the step did."""

	    def __init__(self, penalty: float = -0.01):
	        self.penalty = penalty

	    def __call__(self, env, action, obs, reward, done, info):
	        """Return the configured ``penalty`` unchanged; all arguments are ignored."""
	        per_step_cost = self.penalty
	        return per_step_cost