Spaces:

dmaheshwar22
/

verifiable-rl-coder

Sleeping

deploy: replace template with real demo

0dd7c80 verified 23 days ago

978 Bytes

	"""Secondary reward — faster code scores higher, bounded `[0, 1]`.

	Scales the sandbox's measured `runtime_ms` against a budget:

	R = max(0, (budget_ms - runtime_ms) / budget_ms)

	`runtime_ms = 0` → 1.0 (ideal)
	`runtime_ms = budget_ms` → 0.0
	`runtime_ms > budget_ms` → 0.0 (clamped)

	Program failures (timeout, OOM, runner error) always return 0.0 — a program
	that didn't complete has no meaningful runtime to reward.

	Like `lint_reward`, this is a TIEBREAKER. Weight it small in the composite.
	"""

	from __future__ import annotations

	from ..sandbox.runner import RunResult

	# Default runtime budget, in milliseconds, that a run's `runtime_ms` is scaled against.
	DEFAULT_BUDGET_MS: int = 5000


	def runtime_reward(result: RunResult, *, budget_ms: int = DEFAULT_BUDGET_MS) -> float:
	    """Return a reward in `[0, 1]` — faster runtime → higher score.

	    Args:
	        result: Sandbox run outcome. Only `timed_out`, `oom`, `error` and
	            `runtime_ms` are read.
	        budget_ms: Runtime budget the measured `runtime_ms` is scaled against.
	            A non-positive budget yields 0.0.

	    Returns:
	        `(budget_ms - runtime_ms) / budget_ms` clamped to `[0.0, 1.0]`, or 0.0
	        when the run timed out, hit OOM, or reported an error.
	    """
	    # A run that did not complete has no meaningful runtime to reward.
	    if result.timed_out or result.oom or result.error is not None:
	        return 0.0
	    # A non-positive budget gives no usable scale (and zero would divide by zero).
	    if budget_ms <= 0:
	        return 0.0
	    score = (budget_ms - result.runtime_ms) / budget_ms
	    # Clamp both ends: over-budget runs floor at 0.0, and a negative
	    # `runtime_ms` must not push the reward above the documented 1.0 cap.
	    return min(1.0, max(0.0, score))