Spaces:

kgdrathan
/

explainer-env

Sleeping

App Files Files Community

explainer-env / rewards /generation.py

kgdrathan

Upload folder using huggingface_hub

5869d56 verified 25 days ago

raw

history blame contribute delete

11.9 kB

	"""Reward components for the generation phase.

	After exploration, the agent generates marimo/manim code. Rewards measure
	validity, task alignment, artifact structure, and research usage.

	Scoring model:
	quality = weighted sum of (validity, task alignment, structure, research usage)
	total = quality × gate

	Gates (multiplicative):
	- code doesn't parse → total = 0
	- static check fails → total = quality × small static-fail multiplier
	- code doesn't run → total = quality × execution-fail multiplier
	- code runs → total = quality × 1.0
	"""

	from __future__ import annotations

	import re
	from typing import TYPE_CHECKING

	from .sandbox import ast_parses, check_marimo, extract_scene_class

	try:
	from ..constants import MAX_REPAIR_REWARD, clamp_action_reward
	except ImportError: # pragma: no cover - supports direct test execution
	from constants import MAX_REPAIR_REWARD, clamp_action_reward

	if TYPE_CHECKING:
	from ..task_bank import Task


	# ---------------------------------------------------------------------------
	# Component weights
	# ---------------------------------------------------------------------------

	_WEIGHTS = {
	"validity": 0.15,
	"task_alignment": 0.30,
	"structure": 0.30,
	"research_usage": 0.25,
	}

	GATE_STATIC_FAIL = 0.12
	GATE_RUNS_FAIL = 0.30 # quality multiplier when static checks pass but execution fails


	_STOPWORDS = {
	"about", "after", "again", "against", "also", "because", "before", "being",
	"between", "class", "code", "construct", "could", "from", "have", "into",
	"like", "make", "more", "most", "only", "self", "show", "step", "than",
	"that", "their", "then", "there", "these", "this", "through", "using",
	"value", "where", "with", "would",
	}


	# ---------------------------------------------------------------------------
	# Individual scorers
	# ---------------------------------------------------------------------------


	def keyword_coverage(code: str, keywords_csv: str) -> float:
	    """Return the share of comma-separated keywords found in ``code``.

	    Keywords are stripped and lower-cased, then matched as plain substrings
	    of the lower-cased code. An empty keyword list yields 0.0.
	    """
	    if not keywords_csv:
	        return 0.0

	    wanted = []
	    for raw in keywords_csv.split(","):
	        cleaned = raw.strip()
	        if cleaned:
	            wanted.append(cleaned.lower())
	    if not wanted:
	        return 0.0

	    haystack = code.lower()
	    hits = 0
	    for keyword in wanted:
	        if keyword in haystack:
	            hits += 1
	    return hits / len(wanted)


	def format_match(chosen_format: str, task: Task) -> float:
	    """Score how well the chosen format fits the task.

	    Returns 1.0 when the task has no preferred format (``None``) or when
	    ``chosen_format`` equals it; otherwise returns 0.3.
	    """
	    preferred = task.preferred_format
	    if preferred is None or chosen_format == preferred:
	        return 1.0
	    return 0.3


	def marimo_structure(
	    code: str,
	    task: Task,
	    static_check_passed: bool | None = None,
	    error_codes: list[str] | None = None,
	) -> float:
	    """Score structural quality of a marimo notebook (0-1).

	    Additive scoring for good patterns, penalties from ``marimo check``
	    for breaking violations (duplicate defs, cycles, etc.).

	    Args:
	        code: Notebook source; all signals are plain substring checks.
	        task: Object whose ``data_available`` and ``tier`` attributes are read.
	        static_check_passed: Result of a prior static check. When ``None``
	            the check is run here via ``check_marimo(code)``.
	        error_codes: Violation codes from that prior check; only used when
	            ``static_check_passed`` is supplied. ``None`` means no codes.

	    Returns:
	        The accumulated score clamped to the range [0.0, 1.0].
	    """
	    score = 0.0

	    # Positive signals
	    if "import marimo" in code or "from marimo" in code:
	        score += 0.2
	    if "marimo.App" in code or "mo.App" in code:
	        score += 0.1
	    cell_count = code.count("@app.cell")
	    if cell_count >= 3:
	        score += 0.2
	    elif cell_count >= 1:
	        score += 0.1

	    ui_patterns = [
	        "mo.md(",
	        "mo.Html",
	        "mo.accordion",
	        "mo.callout",
	        "mo.hstack(",
	        "mo.vstack(",
	        "mo.ui.slider",
	        "mo.ui.dropdown",
	        "mo.ui.table",
	        "mo.ui.dataframe",
	    ]
	    # Each distinct UI pattern is worth 0.06, capped at 0.22 overall.
	    score += min(0.22, sum(0.06 for p in ui_patterns if p in code))

	    reactive_plot_patterns = [
	        "mo.ui.matplotlib(",
	        "mo.ui.plotly(",
	        "mo.ui.altair_chart(",
	    ]
	    raw_plot_patterns = [
	        "plt.",
	        "matplotlib.pyplot",
	        "px.",
	        "plotly.",
	        "alt.Chart",
	    ]
	    # Plot handling is tiered: the exact ``mo.ui.matplotlib(plt.gca())`` call
	    # scores highest, any reactive wrapper next, raw plotting calls last.
	    if "mo.ui.matplotlib(plt.gca())" in code:
	        score += 0.24 if task.data_available else 0.16
	    elif any(p in code for p in reactive_plot_patterns):
	        score += 0.18 if task.data_available else 0.10
	    elif any(p in code for p in raw_plot_patterns):
	        score += 0.08 if task.data_available else 0.03
	        # NOTE(review): this deduction is assumed to apply only to the
	        # raw-plot branch (net 0.0 or -0.05) - confirm the intended nesting.
	        score -= 0.08

	    # Patterns that are penalised outright.
	    if "plt.tight_layout(" in code:
	        score -= 0.12

	    if "np.math." in code:
	        score -= 0.15

	    # Bonus when the notebook has at least as many cells as the tier expects.
	    tier_thresholds = {"advanced": 6, "intermediate": 4, "beginner": 2}
	    if cell_count >= tier_thresholds.get(task.tier, 2):
	        score += 0.1

	    # Marimo check: penalize breaking violations, bonus for clean code
	    if static_check_passed is None:
	        passed, _, violations = check_marimo(code)
	    else:
	        passed = static_check_passed
	        violations = error_codes or []

	    if passed:
	        score += 0.1
	    else:
	        penalty = {
	            "MB002": 0.35,
	            "MB003": 0.4,
	            "MB005": 0.25,
	            "MB001": 0.3,
	            "MB004": 0.2,
	        }
	        # Codes missing from the table cost a flat 0.15 each.
	        for v in violations:
	            score -= penalty.get(v, 0.15)

	    return max(0.0, min(1.0, score))


	def manim_structure(code: str, task: Task) -> float:
	    """Score structural quality of a manim scene (0-1).

	    Args:
	        code: Scene source; apart from the scene-class lookup, every signal
	            is a plain substring check.
	        task: Object whose ``tier`` attribute selects the animation-count
	            threshold for the final bonus.

	    Returns:
	        The accumulated score, capped at 1.0.
	    """
	    # ``extract_scene_class`` comes from the module-level ``.sandbox`` import,
	    # so no function-scope import is needed here.
	    score = 0.0
	    if "from manim" in code or "import manim" in code:
	        score += 0.2
	    if extract_scene_class(code) is not None:
	        score += 0.2
	    if "def construct" in code:
	        score += 0.1

	    anim_patterns = [
	        "self.play(", "self.wait(", "Create(", "FadeIn(", "FadeOut(",
	        "Transform(", "Write(", "MoveToTarget", "Indicate(",
	        "ReplacementTransform(",
	    ]
	    # Counts distinct patterns present, not the number of occurrences.
	    anim_hits = sum(1 for p in anim_patterns if p in code)
	    score += min(0.3, anim_hits * 0.05)

	    math_patterns = ["MathTex(", "Tex(", "Axes(", "NumberPlane(", "Graph("]
	    if any(p in code for p in math_patterns):
	        score += 0.1

	    # Bonus when enough distinct animation patterns appear for the tier.
	    tier_thresholds = {"advanced": 6, "intermediate": 4, "beginner": 2}
	    if anim_hits >= tier_thresholds.get(task.tier, 2):
	        score += 0.1

	    return min(1.0, score)


	def narration_score(narration: str, fmt: str) -> float:
	    """Rate the narration that accompanies a manim scene (0-1).

	    Any format other than ``"manim"`` scores 1.0. Empty or whitespace-only
	    narration scores 0.0. Otherwise the score adds two word-count tiers and
	    a bonus for sequencing markers found as substrings.
	    """
	    if fmt != "manim":
	        return 1.0
	    if not (narration and narration.strip()):
	        return 0.0

	    word_count = len(narration.split())
	    lowered = narration.lower()
	    total = 0.0

	    # First length tier.
	    if word_count >= 30:
	        total += 0.4
	    elif word_count >= 10:
	        total += 0.2

	    # 0.1 per distinct marker present, capped at 0.3.
	    markers = ("scene", "step", "first", "next", "then", "finally", "now")
	    marker_hits = sum(1 for marker in markers if marker in lowered)
	    total += min(0.3, 0.1 * marker_hits)

	    # Second length tier, stacked on top of the first.
	    if word_count >= 50:
	        total += 0.3
	    elif word_count >= 20:
	        total += 0.15

	    return min(1.0, total)


	def context_usage(code: str, accumulated_context: list[str]) -> float:
	    """Measure how much research vocabulary shows up in the code (0-1).

	    Both sides are tokenized with ``_tokens``. The score is the number of
	    shared tokens divided by the context vocabulary size (capped at 24),
	    scaled by 2.5 and limited to 1.0.
	    """
	    if not accumulated_context:
	        return 0.0

	    research_vocab: set[str] = {
	        token for snippet in accumulated_context for token in _tokens(snippet)
	    }
	    if not research_vocab:
	        return 0.0

	    shared = set(_tokens(code)).intersection(research_vocab)
	    if not shared:
	        return 0.0

	    # Do not reward broad generic overlap too heavily; a few meaningful terms
	    # should help, but strong usage needs a substantial slice of the context.
	    denominator = min(max(len(research_vocab), 1), 24)
	    return min(1.0, len(shared) / denominator * 2.5)


	# ---------------------------------------------------------------------------
	# Main reward function
	# ---------------------------------------------------------------------------


	def compute_generate_reward(
	    code: str,
	    fmt: str,
	    narration: str,
	    task: Task,
	    exec_success: bool,
	    accumulated_context: list[str],
	    static_check_passed: bool | None = None,
	    error_codes: list[str] | None = None,
	) -> tuple[float, dict]:
	    """Compute the generation-phase reward. Returns (total, components).

	    ``quality`` is the ``_WEIGHTS``-weighted sum of validity, task alignment,
	    structure and research usage. Three gates then scale it:

	    - code does not parse: total is 0.0
	    - static check fails: quality times ``_static_fail_multiplier(...)``
	    - execution fails: quality times ``GATE_RUNS_FAIL``
	    - otherwise: quality unchanged

	    Args:
	        code: Generated source code.
	        fmt: Artifact format; ``"marimo"`` uses ``marimo_structure``, any
	            other value uses ``manim_structure`` blended with narration.
	        narration: Narration text passed to ``narration_score``.
	        task: Task object; ``keywords`` is read here and the object is
	            passed on to the format and structure scorers.
	        exec_success: Whether the code ran successfully.
	        accumulated_context: Research snippets used for the usage score.
	        static_check_passed: Result of a prior static check, or ``None`` to
	            infer it here.
	        error_codes: Violation codes from that prior check, if any.

	    Returns:
	        ``(total, components)`` where ``total`` is unrounded and
	        ``components`` holds the rounded ``validity``, ``task_alignment``,
	        ``structure``, ``research_usage`` and ``generate_total`` values.
	    """
	    parse_valid = ast_parses(code)
	    c_parse = 1.0 if parse_valid else 0.0
	    if static_check_passed is None:
	        if fmt == "marimo" and parse_valid:
	            # Keep the violation codes from the inferred check so structure
	            # penalties and the static-fail gate see them, exactly as when
	            # the caller supplies the check result.
	            static_check_passed, _, inferred_codes = check_marimo(code)
	            if error_codes is None:
	                error_codes = list(inferred_codes or [])
	        else:
	            static_check_passed = _infer_static_check(code, fmt, parse_valid)

	    c_static = 1.0 if parse_valid and static_check_passed else 0.0
	    c_runs = 1.0 if exec_success else 0.0
	    c_coverage = keyword_coverage(code, task.keywords)
	    c_format = format_match(fmt, task)
	    if fmt == "marimo":
	        c_struct = marimo_structure(code, task, static_check_passed, error_codes)
	    else:
	        scene_structure = manim_structure(code, task)
	        c_struct = 0.75 * scene_structure + 0.25 * narration_score(narration, fmt)
	    c_ctx = context_usage(code, accumulated_context)
	    c_validity = _validity_score(c_parse, c_static, c_runs)
	    c_alignment = 0.75 * c_coverage + 0.25 * c_format

	    quality = (
	        _WEIGHTS["validity"] * c_validity
	        + _WEIGHTS["task_alignment"] * c_alignment
	        + _WEIGHTS["structure"] * c_struct
	        + _WEIGHTS["research_usage"] * c_ctx
	    )

	    # Apply gates
	    if c_parse == 0.0:
	        total = 0.0
	    elif c_static == 0.0:
	        total = quality * _static_fail_multiplier(error_codes or [])
	    elif c_runs == 0.0:
	        total = quality * GATE_RUNS_FAIL
	    else:
	        total = quality

	    components = {
	        "validity": round(c_validity, 3),
	        "task_alignment": round(c_alignment, 3),
	        "structure": round(c_struct, 3),
	        "research_usage": round(c_ctx, 3),
	        "generate_total": round(total, 4),
	    }
	    return total, components


	def _infer_static_check(code: str, fmt: str, parse_valid: bool) -> bool:
	if not parse_valid:
	return False
	if fmt == "marimo":
	passed, _, _ = check_marimo(code)
	return passed
	if fmt == "manim":
	return extract_scene_class(code) is not None
	return False


	def _static_fail_multiplier(error_codes: list[str]) -> float:
	    """Pick the quality multiplier for code that parses but fails static checks.

	    Any code starting with ``"MB"`` selects ``GATE_STATIC_FAIL``; otherwise
	    the multiplier is 1.5 times that gate, never above ``GATE_RUNS_FAIL``.
	    """
	    for err in error_codes:
	        if err.startswith("MB"):
	            return GATE_STATIC_FAIL
	    relaxed = GATE_STATIC_FAIL * 1.5
	    return min(GATE_RUNS_FAIL, relaxed)


	def _validity_score(
	parse_valid: float,
	static_check_passed: float,
	code_runs: float,
	) -> float:
	if parse_valid == 0.0:
	return 0.0
	if static_check_passed == 0.0:
	return 0.35
	if code_runs == 0.0:
	return 0.70
	return 1.0


	def adjust_repair_reward(
	    base_reward: float,
	    *,
	    repair_success: bool,
	    previous_error_codes: list[str],
	    new_error_codes: list[str],
	    previous_code: str,
	    repaired_code: str,
	) -> tuple[float, dict]:
	    """Scale a repair attempt's reward and credit clearing the earlier errors.

	    A successful repair keeps 60% of ``base_reward``, a failed one 25%.
	    Bonuses are added for removing every previous error code and, on
	    success, for actually changing the code. Whitespace-only or absent
	    changes are penalised. The result is clamped and capped at
	    ``MAX_REPAIR_REWARD``.

	    Returns:
	        ``(reward, details)`` where ``details`` records the three flags as
	        0.0/1.0 plus the rounded ``repair_total``.
	    """
	    # "Fixed" means there were prior errors and none of them reappear.
	    fixed_prior = bool(previous_error_codes) and set(
	        previous_error_codes
	    ).isdisjoint(new_error_codes)
	    # Whitespace differences alone do not count as a change.
	    changed = _fingerprint(previous_code) != _fingerprint(repaired_code)

	    if repair_success:
	        reward = base_reward * 0.60 + (0.08 if fixed_prior else 0.0)
	        reward += 0.04 if changed else 0.0
	    else:
	        reward = base_reward * 0.25 + (0.04 if fixed_prior else 0.0)

	    # Applied on both paths: resubmitting unchanged code costs 0.15.
	    if not changed:
	        reward -= 0.15

	    reward = min(MAX_REPAIR_REWARD, clamp_action_reward(reward))
	    details = {
	        "repair_success": 1.0 if repair_success else 0.0,
	        "fixed_prior_errors": 1.0 if fixed_prior else 0.0,
	        "changed_code": 1.0 if changed else 0.0,
	        "repair_total": round(reward, 4),
	    }
	    return reward, details


	def _tokens(text: str) -> list[str]:
	    """Split lower-cased text into words, keeping the meaningful ones.

	    Words of three characters or fewer and words in ``_STOPWORDS`` are
	    dropped; order and duplicates are preserved.
	    """
	    kept = []
	    for word in re.findall(r"\w+", text.lower()):
	        if len(word) <= 3 or word in _STOPWORDS:
	            continue
	        kept.append(word)
	    return kept


	def _fingerprint(code: str) -> str:
	return re.sub(r"\s+", "", code)