"""
SecureCodeEnv - Performance Grader v4

FIXES:
- Inverted baseline (naive faster than optimal) → return neutral 0.5
- Unmeasurable (-1.0) → return neutral 0.5
- Both timings identical → return neutral 0.5
- Agent faster than optimal → clamp to max 0.999 (not >1.0)
- All scores clamped to (0.001, 0.999)
"""
import json
import os
import subprocess
import sys
import tempfile

from graders.clamp import clamp

NEUTRAL = 0.5  # returned when measurement is unreliable


def _neutral_scores(feedback: str) -> dict:
    """Build the minimal neutral result (no timing fields) with *feedback*."""
    return {
        "score": clamp(NEUTRAL),
        "time_score": clamp(NEUTRAL),
        "memory_score": clamp(NEUTRAL),
        "feedback": feedback,
    }


def grade_performance(code: str, task: dict) -> dict:
    """Score the runtime of *code* relative to the task's two baselines.

    The first test case that has both ``"fn"`` and ``"input"`` and neither
    ``"fn_class"`` nor ``"expected_exception"`` is used as the benchmark
    call. The agent code, ``task["naive_code"]`` and ``task["optimal_code"]``
    are each timed with :func:`_measure_ms`, and the agent is placed on a
    linear scale where the optimal timing maps to 1.0 and the naive timing
    maps to 0.0 before being passed through ``clamp``.

    Args:
        code: Source of the submission; must define the benchmarked function.
        task: Task description. Reads ``test_cases``, ``naive_code`` and
            ``optimal_code``.

    Returns:
        A dict with ``score``, ``time_score``, ``memory_score`` and
        ``feedback``. When all three timings were attempted it also carries
        ``agent_ms``, ``naive_ms`` and ``optimal_ms``. A neutral score is
        returned whenever the measurement cannot be trusted. This function
        does not raise: any exception is converted into a neutral result.
    """
    test_cases = task.get("test_cases", [])
    naive_code = task.get("naive_code", "")
    optimal_code = task.get("optimal_code", "")

    if not test_cases or not naive_code or not optimal_code:
        return _neutral_scores("No baselines — neutral score")

    tc = next(
        (
            t
            for t in test_cases
            if "fn" in t
            and "input" in t
            and "fn_class" not in t
            and "expected_exception" not in t
        ),
        None,
    )
    if not tc:
        return _neutral_scores("No usable test case — neutral score")

    fn_name = tc["fn"]
    inputs = tc["input"]

    try:
        agent_ms = _measure_ms(code, fn_name, inputs)
        naive_ms = _measure_ms(naive_code, fn_name, inputs)
        optimal_ms = _measure_ms(optimal_code, fn_name, inputs)

        # Any unmeasurable result → neutral
        if any(x < 0 for x in [agent_ms, naive_ms, optimal_ms]):
            return _neutral(agent_ms, naive_ms, optimal_ms, "Unmeasurable timing")

        # Indistinguishable → neutral
        if abs(naive_ms - optimal_ms) < 0.05:
            return _neutral(agent_ms, naive_ms, optimal_ms, "Timings indistinguishable")

        # Inverted baseline (naive < optimal means naive is actually "better")
        # This happens when optimal uses safer-but-slower code (e.g.
        # Path.resolve vs os.path.join)
        # In that case performance cannot be meaningfully scored → neutral
        if naive_ms < optimal_ms:
            return _neutral(
                agent_ms,
                naive_ms,
                optimal_ms,
                "Baseline inverted (naive faster than optimal) — neutral",
            )

        # The two guards above ensure time_range is at least 0.05 here,
        # so the division below cannot be by zero.
        time_range = naive_ms - optimal_ms
        raw = 1.0 - ((agent_ms - optimal_ms) / time_range)
        # raw > 1.0 when agent faster than optimal → clamp handles it
        time_score = clamp(raw)

        return {
            "score": time_score,
            "time_score": time_score,
            "memory_score": time_score,
            "agent_ms": round(agent_ms, 3),
            "naive_ms": round(naive_ms, 3),
            "optimal_ms": round(optimal_ms, 3),
            "feedback": _feedback(time_score),
        }

    except Exception as e:
        # Top-level boundary: a grader failure must not abort the episode.
        return _neutral_scores(f"Measurement error: {str(e)[:60]}")


def _neutral(agent_ms, naive_ms, optimal_ms, reason: str) -> dict:
    """Build a neutral result that still reports the timings obtained.

    Negative timings (the "unmeasurable" sentinel) are reported as ``None``.
    """
    return {
        "score": clamp(NEUTRAL),
        "time_score": clamp(NEUTRAL),
        "memory_score": clamp(NEUTRAL),
        "agent_ms": round(agent_ms, 3) if agent_ms >= 0 else None,
        "naive_ms": round(naive_ms, 3) if naive_ms >= 0 else None,
        "optimal_ms": round(optimal_ms, 3) if optimal_ms >= 0 else None,
        "feedback": reason,
    }


def _measure_ms(code: str, fn_name: str, inputs: list, runs: int = 50) -> float:
    """Returns ms or -1.0 if unmeasurable.

    Writes *code* plus a small ``timeit`` harness to a temporary script, runs
    it with the current interpreter (30 s timeout) and reads the JSON line
    the harness prints. The reported figure is the best of 5 repeats, each
    averaging ``runs`` calls of ``fn_name(*inputs)``, in milliseconds.

    Any failure (timeout, crash, no JSON line in the output) yields -1.0.
    """
    # Embed the arguments as a Python literal, not as JSON: JSON spells
    # None/True/False as null/true/false, which are undefined names in
    # Python, and it silently turns tuples into lists and non-string dict
    # keys into strings. ascii() keeps the literal pure ASCII, and because it
    # is inlined in the call the arguments are rebuilt on every invocation,
    # so a function that mutates its input does not affect later runs.
    args_literal = ascii(inputs)
    script = f"""
import timeit, json, sys
{code}

def _run():
    {fn_name}(*{args_literal})

times = timeit.repeat(_run, number={runs}, repeat=5)
best = min(times) / {runs} * 1000
sys.stdout.write(json.dumps({{"ms": best}}) + "\\n")
sys.stdout.flush()
"""
    tmp = None
    try:
        # Python reads source files as UTF-8 by default, so write it that way
        # regardless of the locale's preferred encoding.
        with tempfile.NamedTemporaryFile(
            mode="w",
            suffix=".py",
            delete=False,
            prefix="sce_perf_",
            encoding="utf-8",
        ) as f:
            f.write(script)
            tmp = f.name
        proc = subprocess.run(
            [sys.executable, tmp],
            capture_output=True,
            text=True,
            timeout=30,
        )
        # The harness prints its result last; scan from the end so output
        # produced by the benchmarked code itself is skipped.
        for line in reversed(proc.stdout.strip().splitlines()):
            line = line.strip()
            if line.startswith("{"):
                return float(json.loads(line)["ms"])
        return -1.0
    except Exception:
        # Deliberate best-effort: every failure mode maps to "unmeasurable".
        return -1.0
    finally:
        if tmp and os.path.exists(tmp):
            try:
                os.unlink(tmp)
            except OSError:
                pass


def _feedback(score: float) -> str:
    """Map a score to a short human-readable verdict."""
    if score >= 0.85:
        return "Excellent — near-optimal efficiency"
    elif score >= 0.65:
        return "Good — minor optimisation possible"
    elif score >= 0.45:
        return "Acceptable — room for improvement"
    else:
        return "Poor — significant performance gap"