Spaces:
Sleeping
Sleeping
| """ | |
| sandbox/executor.py — Safe code execution via subprocess isolation. | |
| Agent code is untrusted. Running it in-process risks: | |
| - Infinite loops blocking the server | |
| - File system access | |
| - Network exfiltration | |
| - Process termination | |
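| Solution: write the code to a temp file and run it in a child subprocess with a hard | |
| timeout, so a hang or crash in agent code cannot take down the main process. | |
| External network access is expected to be blocked by the Docker network policy; | |
| it is not enforced by this module. | |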
| """ | |
| import subprocess | |
| import tempfile | |
| import os | |
| import json | |
| from typing import Any, Dict | |
def safe_exec(
    code: str,
    test_input: str,
    timeout: int = 5,
    entry_fn: str = None,
) -> Dict[str, Any]:
    """
    Run agent code in a child ``python3`` process with a hard timeout.

    The code is written to a temporary ``.py`` file, executed, and the file
    is removed again no matter how the run ends.

    Args:
        code: Python source code (may include harness wrapper)
        test_input: Input string. Only used when ``entry_fn`` is given, in
            which case it is passed as the single argument of that function.
        timeout: Hard kill timeout in seconds (default 5)
        entry_fn: If provided, a call ``entry_fn(test_input)`` is appended to
            the code and its return value is printed as ``{"result": ...}``.

    Returns:
        {"ok": True, "output": <parsed JSON or raw stdout>}  exit code 0
        {"ok": True, "output": {}}                           exit code 0, empty stdout
        {"ok": False, "error": <first 500 chars of stderr, else stdout>}
        {"ok": False, "error": "TIMEOUT ..."}                timeout expired
        {"ok": False, "error": "executor_error:<Type>:<message>"}
            for any other failure, including failing to write the temp file.
    """
    path = None
    try:
        # The temp file is created inside the try block so that a failed write
        # (e.g. source text that cannot be encoded as UTF-8) is reported as an
        # executor_error and the file is still cleaned up, instead of the
        # exception escaping to the caller and leaking the file.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".py", delete=False, encoding="utf-8"
        ) as f:
            path = f.name
            f.write(code)
            if entry_fn:
                f.write("\nimport json, sys\n")
                f.write(f"result = {entry_fn}({test_input!r})\n")
                f.write('print(json.dumps({"result": result}))\n')

        proc = subprocess.run(
            ["python3", path],
            # Untrusted code must not read from (or block on) the parent's stdin.
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            timeout=timeout,
        )

        if proc.returncode != 0:
            return {"ok": False, "error": (proc.stderr or proc.stdout)[:500]}

        stdout = proc.stdout.strip()
        if not stdout:
            return {"ok": True, "output": {}}
        try:
            return {"ok": True, "output": json.loads(stdout)}
        except json.JSONDecodeError:
            # Not JSON: hand the raw text back to the caller.
            return {"ok": True, "output": stdout}
    except subprocess.TimeoutExpired:
        return {"ok": False, "error": "TIMEOUT — code took too long to execute"}
    except Exception as e:
        # Boundary handler: the caller always gets a result dict, never a raise.
        return {"ok": False, "error": f"executor_error:{type(e).__name__}:{e}"}
    finally:
        if path is not None:
            try:
                os.unlink(path)
            except OSError:
                # Best-effort cleanup; the file may already be gone.
                pass
def _last_json_line(output: Any) -> Any:
    """Return the JSON object on the last non-empty line of *output*, if any.

    When the executed code prints anything besides the harness report, the
    combined stdout is not valid JSON and arrives here as a plain string.
    The harness report is always printed last, so it is recovered from the
    final non-empty line. Anything else is returned unchanged.
    """
    if not isinstance(output, str):
        return output
    lines = [line for line in output.splitlines() if line.strip()]
    if not lines:
        return output
    try:
        parsed = json.loads(lines[-1])
    except ValueError:
        return output
    return parsed if isinstance(parsed, dict) else output


def safe_run_tests(code: str, test_cases: list, timeout: int = 5) -> Dict[str, Any]:
    """
    Run structured test cases against agent code.

    For every test case a small harness is appended to ``code``; it calls
    ``run_task(<input>)`` (which ``code`` must define), compares the return
    value with the expected value and prints a one-line JSON report. Input and
    expected values are embedded into the harness via ``repr``.

    Each test case: {"input": ..., "expected": ...}

    Returns:
        {"passed": int, "total": int, "details": [...]}
        Each detail entry has "test" (index), "status" ("pass", "fail" or
        "error") and "input"; "fail" entries add "got" and "expected" (plus
        "error" when run_task raised), "error" entries add "error".
    """
    # NOTE(review): the report is printed by the same process that runs the
    # untrusted code, so it can be forged by that code.
    passed = 0
    details = []
    for i, tc in enumerate(test_cases):
        inp = tc.get("input")
        expected = tc.get("expected")
        expected_literal = repr(expected)
        wrapper = code + f"""
import json, sys
_inp = {repr(inp)}
try:
    _result = run_task(_inp)
    _ok = _result == {expected_literal}
    print(json.dumps({{"result": str(_result)[:200], "ok": _ok, "expected": {expected_literal}}}))
except Exception as e:
    print(json.dumps({{"result": None, "ok": False, "error": str(e)[:200], "expected": {expected_literal}}}))
"""
        result = safe_exec(wrapper, str(inp), timeout=timeout)
        shown_input = str(inp)[:60]

        if not result["ok"]:
            details.append({
                "test": i, "status": "error",
                "input": shown_input,
                "error": result.get("error", "")[:100],
            })
            continue

        out = _last_json_line(result["output"])
        report = out if isinstance(out, dict) else None

        if report is not None and report.get("ok"):
            passed += 1
            details.append({"test": i, "status": "pass", "input": shown_input})
            continue

        # The harness reports "result": None when run_task raised, so the value
        # must be stringified before slicing.
        got = str(report.get("result", "?")) if report is not None else str(out)
        failure = {
            "test": i, "status": "fail",
            "input": shown_input,
            "got": got[:60],
            "expected": str(expected)[:60],
        }
        if report is not None and report.get("error"):
            failure["error"] = str(report["error"])[:100]
        details.append(failure)

    return {"passed": passed, "total": len(test_cases), "details": details}