Spaces:

vishaldhakad
/

SecureCodeEnv

Sleeping

App Files Files Community

SecureCodeEnv / graders /performance.py

vishaldhakad

Change the reward system to keep scores strictly between 0 and 1

791664b 3 months ago

Raw

History Blame Contribute Delete

4.9 kB

	"""
	SecureCodeEnv - Performance Grader v4

	FIXES:
	- Inverted baseline (naive faster than optimal) → return neutral 0.5
	- Unmeasurable (-1.0) → return neutral 0.5
	- Both timings identical → return neutral 0.5
	- Agent faster than optimal → clamp to max 0.999 (not >1.0)
	- All scores clamped to (0.001, 0.999)
	"""
	import sys, tempfile, os, json, subprocess
	from graders.clamp import clamp

	NEUTRAL = 0.5 # returned when measurement is unreliable


	def grade_performance(code: str, task: dict) -> dict:
	    """Score the submitted code's speed relative to the task's two baselines.

	    The first test case that names a plain function call (has ``fn`` and
	    ``input`` but neither ``fn_class`` nor ``expected_exception``) is timed
	    for the submission, the naive baseline and the optimal baseline. The
	    score is the submission's position between the two baselines, passed
	    through ``clamp``.

	    Args:
	        code: Source of the submission under evaluation.
	        task: Task description; reads ``test_cases``, ``naive_code`` and
	            ``optimal_code``.

	    Returns:
	        A dict with ``score``, ``time_score``, ``memory_score`` and
	        ``feedback``; timing fields (``agent_ms``, ``naive_ms``,
	        ``optimal_ms``) are included whenever measurements were attempted
	        without raising. ``memory_score`` simply repeats the time score.
	        Whenever the comparison is not trustworthy the neutral score is
	        returned instead.
	    """

	    def _fallback(message: str) -> dict:
	        # Neutral result without timing fields.
	        return {
	            "score": clamp(NEUTRAL),
	            "time_score": clamp(NEUTRAL),
	            "memory_score": clamp(NEUTRAL),
	            "feedback": message,
	        }

	    cases = task.get("test_cases", [])
	    slow_src = task.get("naive_code", "")
	    fast_src = task.get("optimal_code", "")

	    if not (cases and slow_src and fast_src):
	        return _fallback("No baselines — neutral score")

	    # Pick the first case that is a direct function call with inputs.
	    chosen = None
	    for candidate in cases:
	        if "fn" not in candidate or "input" not in candidate:
	            continue
	        if "fn_class" in candidate or "expected_exception" in candidate:
	            continue
	        chosen = candidate
	        break
	    if not chosen:
	        return _fallback("No usable test case — neutral score")

	    target = chosen["fn"]
	    call_args = chosen["input"]

	    try:
	        agent_ms = _measure_ms(code, target, call_args)
	        naive_ms = _measure_ms(slow_src, target, call_args)
	        optimal_ms = _measure_ms(fast_src, target, call_args)

	        # A negative reading is the "could not measure" sentinel.
	        if agent_ms < 0 or naive_ms < 0 or optimal_ms < 0:
	            return _neutral(agent_ms, naive_ms, optimal_ms, "Unmeasurable timing")

	        # Baselines closer than 0.05 ms leave no usable scale.
	        if abs(naive_ms - optimal_ms) < 0.05:
	            return _neutral(agent_ms, naive_ms, optimal_ms, "Timings indistinguishable")

	        # When the naive baseline beats the optimal one (e.g. the optimal
	        # solution is the safer but slower variant) the scale is upside
	        # down, so speed is not scored.
	        if naive_ms < optimal_ms:
	            return _neutral(
	                agent_ms, naive_ms, optimal_ms,
	                "Baseline inverted (naive faster than optimal) — neutral",
	            )

	        span = naive_ms - optimal_ms
	        overshoot = (agent_ms - optimal_ms) / span
	        # Beating the optimal baseline pushes this above 1.0; clamp caps it.
	        time_score = clamp(1.0 - overshoot)

	        result = {
	            "score": time_score,
	            "time_score": time_score,
	            "memory_score": time_score,
	            "agent_ms": round(agent_ms, 3),
	            "naive_ms": round(naive_ms, 3),
	            "optimal_ms": round(optimal_ms, 3),
	            "feedback": _feedback(time_score),
	        }
	        return result
	    except Exception as e:
	        return _fallback(f"Measurement error: {str(e)[:60]}")


	def _neutral(agent_ms, naive_ms, optimal_ms, reason: str) -> dict:
	    """Build the neutral-score result, keeping whatever timings are valid.

	    Each timing is rounded to three decimals; a negative timing (the
	    "unmeasurable" sentinel) is reported as ``None``. ``reason`` becomes the
	    ``feedback`` text.
	    """

	    def _shown(ms):
	        return round(ms, 3) if ms >= 0 else None

	    payload = {
	        key: clamp(NEUTRAL)
	        for key in ("score", "time_score", "memory_score")
	    }
	    payload["agent_ms"] = _shown(agent_ms)
	    payload["naive_ms"] = _shown(naive_ms)
	    payload["optimal_ms"] = _shown(optimal_ms)
	    payload["feedback"] = reason
	    return payload


	def _measure_ms(code: str, fn_name: str, inputs: list, runs: int = 50) -> float:
	"""Returns ms or -1.0 if unmeasurable."""
	script = f"""
	import timeit, json, sys
	{code}
	def _run():
	{fn_name}(*{json.dumps(inputs)})
	times = timeit.repeat(_run, number={runs}, repeat=5)
	best = min(times) / {runs} * 1000
	sys.stdout.write(json.dumps({{"ms": best}}) + "\\n")
	sys.stdout.flush()
	"""
	tmp = None
	try:
	with tempfile.NamedTemporaryFile(mode="w", suffix=".py",
	delete=False, prefix="sce_perf_") as f:
	f.write(script); tmp = f.name
	proc = subprocess.run([sys.executable, tmp],
	capture_output=True, text=True, timeout=30)
	for line in reversed(proc.stdout.strip().splitlines()):
	line = line.strip()
	if line.startswith("{"):
	return json.loads(line)["ms"]
	return -1.0
	except Exception:
	return -1.0
	finally:
	if tmp and os.path.exists(tmp):
	try: os.unlink(tmp)
	except OSError: pass


	def _feedback(score: float) -> str:
	if score >= 0.85: return "Excellent — near-optimal efficiency"
	elif score >= 0.65: return "Good — minor optimisation possible"
	elif score >= 0.45: return "Acceptable — room for improvement"
	else: return "Poor — significant performance gap"