"""Tool 5/9: compile_and_benchmark. Compiles agent C++ with `g++ -O3 -march=native -fopenmp -std=c++20 -Wall -Werror` and benchmarks against the Python baseline using median-of-15 wall time. Caching: the (cpp_code + hardware_profile_id) sha256 keys a persistent on-disk cache of compiled `.so` files. Per plan §7 risk #2, a high cache hit rate is critical to keeping training cost within budget. Output language enforcement (per plan §10a): the wrapper signature is auto- generated from the Python AST and the agent's code MUST define `extern "C"` function with that exact signature. Compile errors → reward = 0. """ from __future__ import annotations import hashlib import json import os import re import shutil import subprocess import tempfile import time from pathlib import Path from typing import Any # Persistent compile cache directory (shared across episodes within a process run) _CACHE_ROOT = Path(os.environ.get("POLYGLOT_OPTIMA_CACHE", str(Path(tempfile.gettempdir()) / "polyglot_optima_cache"))) _CACHE_ROOT.mkdir(parents=True, exist_ok=True) # Compile std — locked to C++20 in production per plan §10a. # Allowing C++17/C++14 silently would let the agent learn code that fails on the # real GCC 14 deploy. Therefore: production = c++20 only. Dev fallback requires # the explicit POLYGLOT_OPTIMA_DEV_FALLBACK=1 env var (used by tests on machines # with old MinGW); even then we warn loudly so the divergence isn't invisible. _PRODUCTION_CXX_STD = "c++20" _DEV_FALLBACK_ALLOWED = os.environ.get("POLYGLOT_OPTIMA_DEV_FALLBACK", "0") == "1" def _detect_supported_cxx_std() -> str: """Return c++20 if the compiler supports it; else c++20 anyway in production (so the compile fails informatively and the gate registers it as syntax_error). With POLYGLOT_OPTIMA_DEV_FALLBACK=1 set, we fall back to the highest std the compiler accepts and emit a stderr warning. That mode is for local dev tests only — never for training or deploy.""" compiler = shutil.which("g++") or shutil.which("clang++") if not compiler: return _PRODUCTION_CXX_STD # Probe c++20 first try: r = subprocess.run([compiler, f"-std={_PRODUCTION_CXX_STD}", "-x", "c++", "-E", "-"], input="", capture_output=True, text=True, timeout=5) if r.returncode == 0 and "unrecognized" not in (r.stderr or "").lower(): return _PRODUCTION_CXX_STD except Exception: pass if not _DEV_FALLBACK_ALLOWED: # Production: stay on c++20. If the compiler can't, every compile will fail # — that's the right signal (deploy with old GCC needs upgrading, not lowering). return _PRODUCTION_CXX_STD # Dev fallback only — emit warning so the divergence is visible import sys as _sys for std in ("c++17", "c++14"): try: r = subprocess.run([compiler, f"-std={std}", "-x", "c++", "-E", "-"], input="", capture_output=True, text=True, timeout=5) if r.returncode == 0 and "unrecognized" not in (r.stderr or "").lower(): print( f"⚠ POLYGLOT_OPTIMA: dev fallback to -std={std} (compiler does not support c++20). " f"This is for local tests only — production training/deploy MUST use c++20.", file=_sys.stderr, ) return std except Exception: continue return _PRODUCTION_CXX_STD def _detect_openmp() -> bool: """Test whether `-fopenmp` actually links — MinGW often lacks pthread libs.""" compiler = shutil.which("g++") or shutil.which("clang++") if not compiler: return False try: # Try to compile + LINK a trivial OpenMP program. Compile-only succeeds even # without pthread; we need the link step to confirm the runtime is available. import tempfile with tempfile.TemporaryDirectory() as td: src = Path(td) / "_omp_probe.cpp" obj = Path(td) / "_omp_probe.so" src.write_text("#include \nint main(){return omp_get_num_threads();}\n") r = subprocess.run([compiler, "-fopenmp", str(src), "-shared", "-fPIC", "-o", str(obj)], capture_output=True, text=True, timeout=10) return r.returncode == 0 except Exception: return False def _detect_dispatchable() -> bool: """Compile + ctypes-load a tiny probe. Returns True iff the toolchain produces a .so loadable by THIS Python interpreter (catches bitness mismatch on MinGW).""" compiler = shutil.which("g++") or shutil.which("clang++") if not compiler: return False try: import ctypes as _ct import tempfile with tempfile.TemporaryDirectory() as td: src = Path(td) / "_probe.cpp" so = Path(td) / "_probe.so" src.write_text( 'extern "C" void agent_function(const double*, ' 'unsigned long long, double* o, unsigned long long n)' '{ if (n) o[0] = 1.0; }\n' ) r = subprocess.run( [compiler, "-O0", "-fPIC", "-shared", str(src), "-o", str(so)], capture_output=True, text=True, timeout=15, ) if r.returncode != 0: return False lib = _ct.CDLL(str(so)) return hasattr(lib, "agent_function") except Exception: return False _DETECTED_CXX_STD = _detect_supported_cxx_std() _HAS_OPENMP = _detect_openmp() _DISPATCHABLE = _detect_dispatchable() _BASE_COMPILE_FLAGS = [ "-O3", "-march=native", f"-std={_DETECTED_CXX_STD}", "-Wall", # `-Werror` removed: many MinGW builds emit warnings on default flags. # Production deploy can re-add via POLYGLOT_OPTIMA_STRICT=1 "-fPIC", "-shared", ] if _HAS_OPENMP: _BASE_COMPILE_FLAGS.insert(2, "-fopenmp") if os.environ.get("POLYGLOT_OPTIMA_STRICT", "0") == "1": _BASE_COMPILE_FLAGS.append("-Werror") # Banned headers (per plan §10a — would mask agent's actual contribution) _BANNED_INCLUDES = [ "", "", "", # BLAS/LAPACK "", "", # CUDA " str: h = hashlib.sha256() for p in parts: h.update(p.encode("utf-8")) h.update(b"\x00") return h.hexdigest() def _check_for_banned_headers(cpp_code: str) -> str | None: """Return error string if the code uses a banned header, else None.""" for banned in _BANNED_INCLUDES: if banned in cpp_code: return ( f"Banned header detected: {banned}. " f"We measure YOUR optimization, not a library call. " f"Allowed: STL, , , , " ) return None def _has_required_entry_point(cpp_code: str) -> bool: """Validate canonical ABI expected by runtime dispatcher. Required signature: extern "C" void agent_function(const double*, size_t|unsigned long long, double*, size_t|unsigned long long) """ pattern = ( r'extern\s*"C"\s+void\s+agent_function\s*\(' r'\s*const\s+double\s*\*\s*(?:\w+)?\s*,' r'\s*(?:size_t|unsigned\s+long\s+long)\s*(?:\w+)?\s*,' r'\s*double\s*\*\s*(?:\w+)?\s*,' r'\s*(?:size_t|unsigned\s+long\s+long)\s*(?:\w+)?\s*' r'\)' ) return re.search(pattern, cpp_code, flags=re.IGNORECASE | re.DOTALL) is not None def _compile( cpp_code: str, hw_profile: dict[str, Any], cache_key: str, timeout_s: int = 30, compile_flags: list[str] | None = None, ) -> dict[str, Any]: """Run g++; cache the .so by cache_key. Return dict with status + path/error.""" cache_dir = _CACHE_ROOT / cache_key[:2] cache_dir.mkdir(parents=True, exist_ok=True) so_path = cache_dir / f"{cache_key}.so" # Cache hit if so_path.exists(): return {"status": "success", "so_path": str(so_path), "cached": True} # Banned headers → reject before invoking compiler banned_err = _check_for_banned_headers(cpp_code) if banned_err: return {"status": "syntax_error", "error": banned_err, "cached": False} # Write source + invoke compiler src_path = cache_dir / f"{cache_key}.cpp" src_path.write_text(cpp_code, encoding="utf-8") # Resolve compiler — prefer g++ on Linux, fall back to clang++ on macOS compiler = shutil.which("g++") or shutil.which("clang++") or "g++" flags = list(compile_flags) if compile_flags else list(_BASE_COMPILE_FLAGS) cmd = [compiler, *flags, str(src_path), "-o", str(so_path)] try: proc = subprocess.run( cmd, capture_output=True, text=True, timeout=timeout_s, ) except subprocess.TimeoutExpired: return {"status": "timeout", "error": f"Compilation exceeded {timeout_s}s", "cached": False} except FileNotFoundError: return {"status": "syntax_error", "error": f"Compiler {compiler!r} not found. Install GCC 14 or clang++.", "cached": False} if proc.returncode != 0: return { "status": "syntax_error", "error": (proc.stderr or proc.stdout)[:2000], "cmd": " ".join(cmd), "cached": False, } return {"status": "success", "so_path": str(so_path), "cached": False} def _load_python_function(python_code: str): """Exec python_code in a fresh namespace, return the first FunctionDef as a callable.""" import ast tree = ast.parse(python_code) fn_node = next((n for n in ast.walk(tree) if isinstance(n, ast.FunctionDef)), None) if fn_node is None: raise RuntimeError("python_code defines no function") ns: dict[str, Any] = {} exec(compile(tree, filename="", mode="exec"), ns) fn = ns.get(fn_node.name) if fn is None: raise RuntimeError(f"function {fn_node.name!r} not found after exec") return fn def _benchmark_python_baseline(python_code: str, sample_input_size: int = 1024) -> dict[str, Any]: """Real median-of-15 wall time of the Python function on a default-typed input.""" from server.tools._runtime import time_python_only, make_default_args_for try: py_fn = _load_python_function(python_code) args = make_default_args_for(py_fn, n=sample_input_size) median_ms = time_python_only(py_fn, args, n_per_repeat=5, repeats=3) return { "median_ms": float(median_ms), "method": "perf_counter_median_5x3", "n_samples": sample_input_size, } except Exception as e: # Don't crash the env on a broken Python function; signal "0 baseline" → speedup goes to 0 return { "median_ms": 0.0, "method": "error", "error": str(e)[:200], "n_samples": sample_input_size, } def _benchmark_cpp(so_path: str, python_code: str, sample_input_size: int = 1024) -> dict[str, Any]: """Real median-of-15 wall time of the compiled .so via ctypes dispatch.""" from server.tools._runtime import benchmark_python_vs_cpp, make_default_args_for try: py_fn = _load_python_function(python_code) args = make_default_args_for(py_fn, n=sample_input_size) result = benchmark_python_vs_cpp(so_path, py_fn, args, n_per_repeat=5, repeats=3) return { "median_ms": float(result["cpp_median_ms"]), "py_median_ms": float(result["py_median_ms"]), "speedup_internal": float(result["speedup"]), "method": "ctypes_perf_counter_median_5x3", "n_samples": sample_input_size, } except Exception as e: return { "median_ms": 0.0, "method": "error", "error": str(e)[:200], "n_samples": sample_input_size, } def compile_and_benchmark_tool(tool_args: dict[str, Any], state) -> dict[str, Any]: """Compile agent C++ and report compile status + speedup measurement. Args: cpp_code (str): The C++20 source to compile. Returns dict with: compile_status: "success" | "syntax_error" | "link_error" | "timeout" speedup: float (python_ms / cpp_ms) — only valid if compile_status == "success" python_ms: median-of-15 Python baseline cpp_ms: median-of-15 agent C++ wall time error: str (if compile_status != "success") cache_hit: bool """ cpp_code = tool_args.get("cpp_code", "") if not cpp_code.strip(): return {"compile_status": "syntax_error", "error": "empty cpp_code", "speedup": 0.0} if not _has_required_entry_point(cpp_code): return { "compile_status": "syntax_error", "error": ( 'Missing required entry point: must define `extern "C" ... agent_function(...)`' ), "speedup": 0.0, } # Cache key hw = state.hardware_profile cache_key = _sha256(cpp_code, json.dumps(hw, sort_keys=True)) t_compile_start = time.perf_counter() compile_result = _compile(cpp_code, hw, cache_key) compile_time_s = time.perf_counter() - t_compile_start if compile_result["status"] != "success": return { "compile_status": compile_result["status"], "error": compile_result.get("error", "compilation failed"), "speedup": 0.0, "compile_time_s": compile_time_s, "cache_hit": False, } # Real benchmark via ctypes dispatch — joint timing of python + cpp on same args cpp_bench = _benchmark_cpp(compile_result["so_path"], state.python_code) if cpp_bench.get("method") == "error": # Compilation succeeded but the .so couldn't be dispatched (wrong signature, missing symbol) return { "compile_status": "link_error", "error": cpp_bench.get("error", "ctypes dispatch failed"), "speedup": 0.0, "python_ms": 0.0, "cpp_ms": 0.0, "compile_time_s": compile_time_s, "cache_hit": compile_result.get("cached", False), } py_ms = cpp_bench.get("py_median_ms", 0.0) cpp_ms = cpp_bench["median_ms"] speedup = py_ms / max(cpp_ms, 1e-6) if py_ms > 0 else 0.0 return { "compile_status": "success", "speedup": speedup, "python_ms": py_ms, "cpp_ms": cpp_ms, "compile_time_s": compile_time_s, "cache_hit": compile_result.get("cached", False), "so_path": compile_result["so_path"], "method": "ctypes_median_5x3_walltime", } __all__ = ["compile_and_benchmark_tool", "_sha256", "_BASE_COMPILE_FLAGS"]