Spaces:
Sleeping
Sleeping
| """ | |
| SecureCodeEnv - Performance Grader v4 | |
| FIXES: | |
| - Inverted baseline (naive faster than optimal) → return neutral 0.5 | |
| - Unmeasurable (-1.0) → return neutral 0.5 | |
| - Both timings identical → return neutral 0.5 | |
| - Agent faster than optimal → clamp to max 0.999 (not >1.0) | |
| - All scores clamped to (0.001, 0.999) | |
| """ | |
| import sys, tempfile, os, json, subprocess | |
| from graders.clamp import clamp | |
| NEUTRAL = 0.5 # returned when measurement is unreliable | |
def grade_performance(code: str, task: dict) -> dict:
    """Score the agent's runtime relative to the task's naive and optimal baselines.

    The first test case that names a plain function (has ``fn`` and ``input``,
    and has neither ``fn_class`` nor ``expected_exception``) is timed with
    ``_measure_ms`` for the agent code, the naive baseline and the optimal
    baseline. The score is the agent's position on the naive→optimal interval:
    1.0 at the optimal timing, 0.0 at the naive timing, passed through
    ``clamp``.

    Args:
        code: Source code submitted by the agent.
        task: Task description. Reads ``test_cases``, ``naive_code`` and
            ``optimal_code``.

    Returns:
        A dict with ``score``, ``time_score``, ``memory_score`` and
        ``feedback``. When all three timings were obtained it also carries
        ``agent_ms``, ``naive_ms`` and ``optimal_ms``. No memory measurement
        is performed here; ``memory_score`` mirrors ``time_score``.
        A neutral score (``NEUTRAL``) is returned whenever the measurement
        cannot be trusted: missing baselines, no usable test case,
        unmeasurable timing, indistinguishable baselines, or an inverted
        baseline.
    """
    test_cases = task.get("test_cases", [])
    naive_code = task.get("naive_code", "")
    optimal_code = task.get("optimal_code", "")

    if not test_cases or not naive_code or not optimal_code:
        return {"score": clamp(NEUTRAL), "time_score": clamp(NEUTRAL),
                "memory_score": clamp(NEUTRAL),
                "feedback": "No baselines — neutral score"}

    # Only plain function calls can be timed by _measure_ms: skip class-based
    # cases and cases that are expected to raise.
    tc = next((t for t in test_cases
               if "fn" in t and "input" in t
               and "fn_class" not in t
               and "expected_exception" not in t), None)
    if not tc:
        return {"score": clamp(NEUTRAL), "time_score": clamp(NEUTRAL),
                "memory_score": clamp(NEUTRAL),
                "feedback": "No usable test case — neutral score"}

    fn_name = tc["fn"]
    inputs = tc["input"]

    try:
        agent_ms = _measure_ms(code, fn_name, inputs)
        naive_ms = _measure_ms(naive_code, fn_name, inputs)
        optimal_ms = _measure_ms(optimal_code, fn_name, inputs)

        # Any unmeasurable result → neutral
        if any(x < 0 for x in [agent_ms, naive_ms, optimal_ms]):
            return _neutral(agent_ms, naive_ms, optimal_ms, "Unmeasurable timing")

        # Indistinguishable → neutral (also guards the division below)
        if abs(naive_ms - optimal_ms) < 0.05:
            return _neutral(agent_ms, naive_ms, optimal_ms, "Timings indistinguishable")

        # Inverted baseline (naive < optimal means naive is actually "better").
        # This happens when optimal uses safer-but-slower code
        # (e.g. Path.resolve vs os.path.join). In that case performance cannot
        # be meaningfully scored → neutral.
        if naive_ms < optimal_ms:
            return _neutral(agent_ms, naive_ms, optimal_ms,
                            "Baseline inverted (naive faster than optimal) — neutral")

        time_range = naive_ms - optimal_ms
        raw = 1.0 - ((agent_ms - optimal_ms) / time_range)
        # raw > 1.0 when the agent is faster than optimal, and < 0.0 when it
        # is slower than naive → clamp handles both.
        time_score = clamp(raw)

        return {
            "score": time_score,
            "time_score": time_score,
            "memory_score": time_score,
            "agent_ms": round(agent_ms, 3),
            "naive_ms": round(naive_ms, 3),
            "optimal_ms": round(optimal_ms, 3),
            "feedback": _feedback(time_score),
        }
    except Exception as e:
        # Grader boundary: a measurement failure must never crash the caller,
        # so report it in the feedback and fall back to the neutral score.
        return {"score": clamp(NEUTRAL), "time_score": clamp(NEUTRAL),
                "memory_score": clamp(NEUTRAL),
                "feedback": f"Measurement error: {str(e)[:60]}"}
def _neutral(agent_ms, naive_ms, optimal_ms, reason: str) -> dict:
    """Build the neutral-score result used when timings cannot be trusted.

    All three score fields carry ``clamp(NEUTRAL)``. Each timing is rounded to
    three decimals, or reported as ``None`` when it is negative (the
    "unmeasurable" marker). ``reason`` becomes the feedback text.
    """
    def _shown(ms):
        # Negative timings are sentinels, not real measurements.
        if ms >= 0:
            return round(ms, 3)
        return None

    result = {
        field: clamp(NEUTRAL)
        for field in ("score", "time_score", "memory_score")
    }
    result["agent_ms"] = _shown(agent_ms)
    result["naive_ms"] = _shown(naive_ms)
    result["optimal_ms"] = _shown(optimal_ms)
    result["feedback"] = reason
    return result
| def _measure_ms(code: str, fn_name: str, inputs: list, runs: int = 50) -> float: | |
| """Returns ms or -1.0 if unmeasurable.""" | |
| script = f""" | |
| import timeit, json, sys | |
| {code} | |
| def _run(): | |
| {fn_name}(*{json.dumps(inputs)}) | |
| times = timeit.repeat(_run, number={runs}, repeat=5) | |
| best = min(times) / {runs} * 1000 | |
| sys.stdout.write(json.dumps({{"ms": best}}) + "\\n") | |
| sys.stdout.flush() | |
| """ | |
| tmp = None | |
| try: | |
| with tempfile.NamedTemporaryFile(mode="w", suffix=".py", | |
| delete=False, prefix="sce_perf_") as f: | |
| f.write(script); tmp = f.name | |
| proc = subprocess.run([sys.executable, tmp], | |
| capture_output=True, text=True, timeout=30) | |
| for line in reversed(proc.stdout.strip().splitlines()): | |
| line = line.strip() | |
| if line.startswith("{"): | |
| return json.loads(line)["ms"] | |
| return -1.0 | |
| except Exception: | |
| return -1.0 | |
| finally: | |
| if tmp and os.path.exists(tmp): | |
| try: os.unlink(tmp) | |
| except OSError: pass | |
| def _feedback(score: float) -> str: | |
| if score >= 0.85: return "Excellent β near-optimal efficiency" | |
| elif score >= 0.65: return "Good β minor optimisation possible" | |
| elif score >= 0.45: return "Acceptable β room for improvement" | |
| else: return "Poor β significant performance gap" | |