SecureCodeEnv / graders /performance.py
vishaldhakad's picture
change in reward system to strict it between the 0-1
791664b
Raw
History Blame Contribute Delete
4.9 kB
"""
SecureCodeEnv - Performance Grader v4
FIXES:
- Inverted baseline (naive faster than optimal) → return neutral 0.5
- Unmeasurable (-1.0) → return neutral 0.5
- Baseline timings indistinguishable (differ by less than 0.05 ms) → return neutral 0.5
- Agent faster than optimal → clamp to max 0.999 (not >1.0)
- All scores clamped to (0.001, 0.999)
"""
import sys, tempfile, os, json, subprocess
from graders.clamp import clamp
# Score reported in place of a computed one whenever the grader cannot produce
# a trustworthy timing comparison (missing baselines, failed or ambiguous runs).
NEUTRAL = 0.5 # returned when measurement is unreliable
def grade_performance(code: str, task: dict) -> dict:
    """Score the submission's speed relative to the task's two baselines.

    The first test case that has both ``fn`` and ``input`` keys, and neither
    ``fn_class`` nor ``expected_exception``, is used as the benchmark. The
    submission, the naive baseline and the optimal baseline are each timed on
    it with ``_measure_ms``. The raw score is the submission's position between
    the two baselines, ``1 - (agent - optimal) / (naive - optimal)``, passed
    through ``clamp``.

    A neutral score (``clamp(NEUTRAL)``) is returned instead when:
      * the task lacks test cases or either baseline,
      * no test case is usable as a benchmark,
      * any of the three timings is negative (unmeasurable),
      * the baselines differ by less than 0.05 ms,
      * the naive baseline is faster than the optimal one,
      * an exception is raised while measuring.

    Args:
        code: Source of the submission being graded.
        task: Task description; the keys ``test_cases``, ``naive_code`` and
            ``optimal_code`` are read.

    Returns:
        A dict with ``score``, ``time_score``, ``memory_score`` and
        ``feedback``. When timings were taken it also carries ``agent_ms``,
        ``naive_ms`` and ``optimal_ms``. ``memory_score`` mirrors
        ``time_score``; no separate memory measurement is made here.
    """
    test_cases = task.get("test_cases", [])
    naive_code = task.get("naive_code", "")
    optimal_code = task.get("optimal_code", "")
    if not test_cases or not naive_code or not optimal_code:
        return _neutral_without_timings("No baselines — neutral score")

    # Only plain function-call cases can be benchmarked; entries that are not
    # dicts are skipped rather than crashing the membership tests.
    tc = next(
        (t for t in test_cases
         if isinstance(t, dict)
         and "fn" in t and "input" in t
         and "fn_class" not in t
         and "expected_exception" not in t),
        None,
    )
    if not tc:
        return _neutral_without_timings("No usable test case — neutral score")

    fn_name = tc["fn"]
    inputs = tc["input"]
    try:
        agent_ms = _measure_ms(code, fn_name, inputs)
        naive_ms = _measure_ms(naive_code, fn_name, inputs)
        optimal_ms = _measure_ms(optimal_code, fn_name, inputs)

        # Any unmeasurable result -> neutral.
        if any(x < 0 for x in [agent_ms, naive_ms, optimal_ms]):
            return _neutral(agent_ms, naive_ms, optimal_ms, "Unmeasurable timing")

        # Baselines too close together to define a meaningful scale -> neutral.
        if abs(naive_ms - optimal_ms) < 0.05:
            return _neutral(agent_ms, naive_ms, optimal_ms, "Timings indistinguishable")

        # Inverted baseline (naive < optimal means naive is actually "better").
        # This happens when optimal uses safer-but-slower code
        # (e.g. Path.resolve vs os.path.join). In that case performance cannot
        # be meaningfully scored -> neutral.
        if naive_ms < optimal_ms:
            return _neutral(agent_ms, naive_ms, optimal_ms,
                            "Baseline inverted (naive faster than optimal) — neutral")

        time_range = naive_ms - optimal_ms
        raw = 1.0 - ((agent_ms - optimal_ms) / time_range)
        # raw exceeds 1.0 when the agent beats optimal and is negative when it
        # is slower than naive; clamp bounds both cases.
        time_score = clamp(raw)
        return {
            "score": time_score,
            "time_score": time_score,
            "memory_score": time_score,
            "agent_ms": round(agent_ms, 3),
            "naive_ms": round(naive_ms, 3),
            "optimal_ms": round(optimal_ms, 3),
            "feedback": _feedback(time_score),
        }
    except Exception as e:
        # Grading must never fail because of a measurement problem: report it
        # (truncated) in the feedback and fall back to the neutral score.
        return _neutral_without_timings(f"Measurement error: {str(e)[:60]}")


def _neutral_without_timings(feedback: str) -> dict:
    """Build the neutral result used when there are no timings to report."""
    return {"score": clamp(NEUTRAL), "time_score": clamp(NEUTRAL),
            "memory_score": clamp(NEUTRAL), "feedback": feedback}
def _neutral(agent_ms, naive_ms, optimal_ms, reason: str) -> dict:
    """Assemble a neutral-score result that still reports the timings taken.

    All three score fields are set to ``clamp(NEUTRAL)``. Each timing is
    rounded to three decimals when it is non-negative and reported as ``None``
    otherwise. ``reason`` becomes the ``feedback`` text.
    """
    result = {
        field: clamp(NEUTRAL)
        for field in ("score", "time_score", "memory_score")
    }
    timings = (
        ("agent_ms", agent_ms),
        ("naive_ms", naive_ms),
        ("optimal_ms", optimal_ms),
    )
    for label, elapsed in timings:
        # A negative value marks a run that could not be measured.
        result[label] = round(elapsed, 3) if elapsed >= 0 else None
    result["feedback"] = reason
    return result
def _measure_ms(code: str, fn_name: str, inputs: list, runs: int = 50) -> float:
"""Returns ms or -1.0 if unmeasurable."""
script = f"""
import timeit, json, sys
{code}
def _run():
{fn_name}(*{json.dumps(inputs)})
times = timeit.repeat(_run, number={runs}, repeat=5)
best = min(times) / {runs} * 1000
sys.stdout.write(json.dumps({{"ms": best}}) + "\\n")
sys.stdout.flush()
"""
tmp = None
try:
with tempfile.NamedTemporaryFile(mode="w", suffix=".py",
delete=False, prefix="sce_perf_") as f:
f.write(script); tmp = f.name
proc = subprocess.run([sys.executable, tmp],
capture_output=True, text=True, timeout=30)
for line in reversed(proc.stdout.strip().splitlines()):
line = line.strip()
if line.startswith("{"):
return json.loads(line)["ms"]
return -1.0
except Exception:
return -1.0
finally:
if tmp and os.path.exists(tmp):
try: os.unlink(tmp)
except OSError: pass
def _feedback(score: float) -> str:
if score >= 0.85: return "Excellent β€” near-optimal efficiency"
elif score >= 0.65: return "Good β€” minor optimisation possible"
elif score >= 0.45: return "Acceptable β€” room for improvement"
else: return "Poor β€” significant performance gap"