# AlgoSensei / agent / sandbox.py
# uncertainrods's picture
# init_code
# e266561
"""
Sandboxed code execution for evaluating user-submitted Python solutions.
Security model:
- Runs in a subprocess with a hard timeout (default 5 seconds)
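- Intended: restrict dangerous builtins via a RestrictedPython-style allowlist
- Intended: no network or file system access from the subprocess
NOTE(review): only the timeout is visibly enforced by the code in this module;
confirm that the builtin allowlist and the network/file-system isolation are
provided elsewhere (e.g. by the deployment environment).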
"""
import subprocess
import sys
import json
import textwrap
from typing import Any
# Canned test cases for well-known problems, keyed by normalized topic.
# Each case names the function to call ("fn"), its positional arguments
# ("args") and the value it should produce ("expected").
BUILT_IN_TEST_CASES: dict[str, list[dict]] = {
    "two sum": [
        {"fn": "two_sum", "args": [nums, target], "expected": expected}
        for nums, target, expected in (
            ([2, 7, 11, 15], 9, [0, 1]),
            ([3, 2, 4], 6, [1, 2]),
            ([3, 3], 6, [0, 1]),
        )
    ],
    # Skipped: requires linked list setup, so this topic is LLM-evaluated only.
    "reverse linked list": [],
}
# Script executed in the child interpreter. It is filled in with str.format:
#   user_code  -> pasted verbatim at column 0
#   test_cases -> a JSON string, embedded as a Python string literal via !r
# Every name the runner itself binds carries a `_runner_` prefix so it cannot
# overwrite (or be overwritten by) helpers defined in the user's code.
_RUNNER_TEMPLATE = textwrap.dedent(
    """
    import json, sys

    # --- User code ---
    {user_code}

    # --- Test runner ---
    # Re-import under a private alias so user code that rebinds `json`
    # cannot break result reporting.
    import json as _runner_json


    def _runner_matches(out, expected):
        # JSON cannot express tuples, so a returned tuple is compared as a list.
        if isinstance(out, tuple):
            out = list(out)
        if isinstance(out, list) and isinstance(expected, list):
            # Normalize list order for Two Sum-style answers
            try:
                return sorted(out) == sorted(expected)
            except TypeError:
                # Elements cannot be ordered: fall back to a plain comparison.
                return out == expected
        return out == expected


    _runner_results = []
    # Decode with json.loads instead of pasting JSON as Python source:
    # JSON true/false/null are not valid Python names.
    for _runner_tc in _runner_json.loads({test_cases!r}):
        _runner_fn = globals().get(_runner_tc["fn"])
        if _runner_fn is None:
            _runner_results.append(
                {{"passed": False, "error": "Function not found: " + _runner_tc["fn"]}}
            )
            continue
        try:
            _runner_out = _runner_fn(*_runner_tc["args"])
            _runner_ok = bool(_runner_matches(_runner_out, _runner_tc["expected"]))
            _runner_results.append({{"passed": _runner_ok, "output": str(_runner_out)}})
        except Exception as e:
            _runner_results.append({{"passed": False, "error": str(e)}})

    # Start a fresh line first so the JSON report is always the last line of
    # stdout, even if user code printed without a trailing newline.
    print()
    print(_runner_json.dumps(_runner_results))
    """
)

# Default wall-clock limit, in seconds, for one sandboxed run.
_TIMEOUT_SECONDS = 5


def _failure_report(total: int, message: str, *, timed_out: bool = False) -> dict[str, Any]:
    """Build the report for a run in which no test case is counted as passed."""
    return {
        "passed": 0,
        "total": total,
        "pass_rate": 0.0,
        "errors": [message],
        "timed_out": timed_out,
    }


def run_code_safely(
    user_code: str,
    test_cases: list[dict],
    *,
    timeout: float = _TIMEOUT_SECONDS,
) -> dict[str, Any]:
    """
    Execute `user_code` against `test_cases` in a child Python process.

    Args:
        user_code: Source text defining the functions under test.
        test_cases: JSON-serializable dicts with the keys "fn" (name of the
            function to call), "args" (positional arguments) and "expected"
            (value the call should produce).
        timeout: Wall-clock limit in seconds for the whole run.

    Returns:
        {
            "passed": int,
            "total": int,
            "pass_rate": float,     # 0.0–1.0
            "errors": list[str],
            "timed_out": bool
        }
        If either input is empty, nothing is executed and all counters are zero.
    """
    if not test_cases or not user_code.strip():
        return {"passed": 0, "total": 0, "pass_rate": 0.0, "errors": [], "timed_out": False}

    script = _RUNNER_TEMPLATE.format(
        user_code=user_code,
        test_cases=json.dumps(test_cases),
    )
    total = len(test_cases)

    try:
        proc = subprocess.run(
            [sys.executable, "-c", script],
            capture_output=True,
            text=True,
            # Give the child an empty stdin so user code calling input() gets
            # EOF instead of reading from the parent's stdin.
            stdin=subprocess.DEVNULL,
            timeout=timeout,
            # NOTE: the child inherits the parent's environment and working
            # directory; the timeout is the only limit enforced here.
        )
    except subprocess.TimeoutExpired:
        return _failure_report(
            total, f"Execution timed out (> {timeout} seconds)", timed_out=True
        )
    except Exception as e:  # boundary: always hand the caller a report dict
        return _failure_report(total, str(e))

    if proc.returncode != 0:
        # Keep the tail of stderr: a traceback ends with the exception line.
        return _failure_report(
            total,
            proc.stderr[-500:] or f"Process exited with code {proc.returncode}",
        )

    # User code may print too; the runner's JSON report is the last line.
    stdout_lines = proc.stdout.strip().splitlines()
    if not stdout_lines:
        return _failure_report(total, "Runner produced no results")
    try:
        results = json.loads(stdout_lines[-1])
    except ValueError as e:
        return _failure_report(total, str(e))
    if not isinstance(results, list):
        return _failure_report(total, "Runner produced malformed results")

    passed = sum(1 for r in results if r.get("passed"))
    errors = [r["error"] for r in results if not r.get("passed") and "error" in r]
    return {
        "passed": passed,
        "total": len(results),
        "pass_rate": passed / len(results) if results else 0.0,
        "errors": errors,
        "timed_out": False,
    }
def get_test_cases_for_topic(topic: str) -> list[dict]:
    """Return built-in test cases for a topic if available, else empty list.

    The topic is stripped and lower-cased. An exact key match wins; otherwise
    the first library key that contains the topic, or is contained in it, is
    used. A blank topic never matches.
    """
    key = topic.strip().lower()
    if not key:
        # "" is a substring of every library key, so without this guard a
        # blank topic would return the first registered problem's cases.
        return []

    exact = BUILT_IN_TEST_CASES.get(key)
    if exact is not None:
        return exact

    for lib_key, cases in BUILT_IN_TEST_CASES.items():
        if lib_key in key or key in lib_key:
            return cases
    return []