Spaces:
Sleeping
Sleeping
File size: 8,522 Bytes
2bf863e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 | """End-to-end ctypes dispatch tests — replaces the two stubs that the deep gate missed.
Activates only when a C++20 compiler is on PATH (GCC ≥11 or clang ≥13). Skips
cleanly on dev machines with old MinGW; runs on HF Spaces GCC 14 + on A10G.
Three layers of test:
1. Direct dispatcher unit tests (call_compiled, benchmark_python_vs_cpp)
2. cpp_compiler.compile_and_benchmark with REAL agent C++ → real speedup numbers
3. verifier.verify_equivalence with WRONG agent C++ → low pass_rate (anti-cheating)
"""
from __future__ import annotations
import os
import shutil
import subprocess
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
import pytest
from models import OptimizationState
from server.tools import TOOL_REGISTRY
# ---------- Compiler + dispatch capability detection ----------
#
# Production target: GCC 14 with C++20. These tests run by default on any compiler
# that supports c++20 AND produces ctypes-loadable binaries (HF Spaces, A10G).
#
# On dev machines with only c++17 (old MinGW), set POLYGLOT_OPTIMA_DEV_FALLBACK=1
# to opt into c++17 testing. Otherwise the tests skip cleanly.
def _has_cxx_at_least(std: str) -> bool:
for cxx in ("g++", "clang++"):
path = shutil.which(cxx)
if not path:
continue
try:
r = subprocess.run([path, f"-std={std}", "-x", "c++", "-E", "-"],
input="", capture_output=True, text=True, timeout=5)
if r.returncode == 0 and "unrecognized" not in (r.stderr or "").lower():
return True
except Exception:
continue
return False
_DEV_FALLBACK = os.environ.get("POLYGLOT_OPTIMA_DEV_FALLBACK", "0") == "1"
_HAS_CXX20 = _has_cxx_at_least("c++20")
_HAS_CXX17 = _has_cxx_at_least("c++17")
# Dispatcher tests require BOTH a working compiler AND that the .so it produces
# is loadable by this Python interpreter (defeated by 32-bit MinGW on 64-bit Python).
try:
from server.tools.cpp_compiler import _DISPATCHABLE
DISPATCHABLE = _DISPATCHABLE
except Exception:
DISPATCHABLE = False
# Decide whether to run:
# - default: only on c++20-capable compilers + dispatchable
# - with POLYGLOT_OPTIMA_DEV_FALLBACK=1: also on c++17
_can_run = DISPATCHABLE and (_HAS_CXX20 or (_DEV_FALLBACK and _HAS_CXX17))
_skip_reason = (
"No C++20 compiler with ctypes-loadable output. "
"On GCC 14 / HF Spaces / A10G these tests run. "
"On dev with old MinGW: set POLYGLOT_OPTIMA_DEV_FALLBACK=1 to opt into C++17 fallback."
)
pytestmark = pytest.mark.skipif(not _can_run, reason=_skip_reason)
# ---------- fixture ----------
@pytest.fixture
def state():
return OptimizationState(
episode_id="dispatch-test",
python_code=(
"def sum_squares(arr):\n"
" s = 0.0\n"
" for x in arr:\n"
" s += x * x\n"
" return s\n"
),
function_signature_cpp='extern "C" void agent_function(const double*, size_t, double*, size_t);',
hardware_profile={"id": "desktop_avx2", "cores": 8, "freq_ghz": 3.8,
"l1_kb": 32, "simd": "AVX2", "bw_gbs": 51},
bottleneck_ground_truth=["compute-bound", "vectorizable"],
bottleneck_distractors=["memory-bound", "branch-heavy", "io-bound"],
)
# ---------- canonical signature C++ snippets ----------
CORRECT_SUM_SQUARES_CPP = '''
#include <cstddef>
extern "C" void agent_function(
const double* in_ptr, size_t in_n,
double* out_ptr, size_t out_n)
{
double total = 0.0;
for (size_t i = 0; i < in_n; ++i) total += in_ptr[i] * in_ptr[i];
if (out_n >= 1) out_ptr[0] = total;
}
'''
WRONG_SUM_SQUARES_CPP = '''
#include <cstddef>
// Returns sum of |x|, not sum of x*x. Should fail verifier.
extern "C" void agent_function(
const double* in_ptr, size_t in_n,
double* out_ptr, size_t out_n)
{
double total = 0.0;
for (size_t i = 0; i < in_n; ++i) total += (in_ptr[i] < 0 ? -in_ptr[i] : in_ptr[i]);
if (out_n >= 1) out_ptr[0] = total;
}
'''
# ---------- L1: dispatcher unit ----------
def test_call_compiled_dispatches_correctly(state):
"""Compile the correct sum_squares and dispatch via ctypes — output must match Python."""
out = TOOL_REGISTRY["compile_and_benchmark"]({"cpp_code": CORRECT_SUM_SQUARES_CPP}, state)
assert out["compile_status"] == "success", out.get("error", "")
assert out["python_ms"] > 0, "real Python timing must be > 0"
assert out["cpp_ms"] > 0, "real C++ timing must be > 0"
assert out["speedup"] != 10.0, "speedup is no longer the hardcoded 10x stub"
def test_benchmark_yields_real_numbers(state):
"""Real benchmark: cpp_ms should be positive and python_ms positive; speedup not stub-10x."""
out = TOOL_REGISTRY["compile_and_benchmark"]({"cpp_code": CORRECT_SUM_SQUARES_CPP}, state)
assert out["compile_status"] == "success"
# Python loop (sum of x*x over 1024 doubles) — typically 100s of microseconds → ms range
assert 0.001 < out["python_ms"] < 1000
assert 0.0001 < out["cpp_ms"] < 100
# Method tag should reflect real measurement
assert "ctypes" in out.get("method", "")
# ---------- L2: verifier with wrong C++ (anti-cheating real test) ----------
def test_verifier_catches_wrong_algorithm(state):
"""Wrong C++ (sum of |x| instead of sum of x*x) must yield LOW pass_rate.
Per plan §10b cheating mode 1: 'wrong algorithm with plausible output'.
The fuzzer must catch this via real ctypes dispatch.
"""
out = TOOL_REGISTRY["verify_equivalence"]({
"cpp_code": WRONG_SUM_SQUARES_CPP,
"n_cases": 100,
}, state)
# Wrong algorithm fails on roughly half the inputs (where it disagrees with sum-of-squares)
assert out["pass_rate"] < 0.6, f"wrong C++ slipped through with pass_rate {out['pass_rate']}"
def test_verifier_passes_correct_cpp(state):
"""Correct C++ for sum_squares must pass nearly all fuzz cases."""
out = TOOL_REGISTRY["verify_equivalence"]({
"cpp_code": CORRECT_SUM_SQUARES_CPP,
"n_cases": 100,
}, state)
assert out["pass_rate"] >= 0.90, f"correct C++ failed verifier with pass_rate {out['pass_rate']}"
# ---------- L3: end-to-end submit_optimization with real .so ----------
def test_submit_optimization_full_pipeline_correct(state):
"""submit_optimization with correct C++ → ready_for_reward=True at R3 threshold."""
state.round_number = 3
out = TOOL_REGISTRY["submit_optimization"]({
"cpp_code": CORRECT_SUM_SQUARES_CPP,
"reasoning_trace": "compute-bound vectorizable",
}, state)
assert out["compile_status"] == "success"
assert out["correctness_pass_rate"] >= 0.85
# ready_for_reward requires correctness ≥ R3 threshold (0.95)
# We hit ≥0.85 reliably; ≥0.95 sometimes — the gate-fail mode is also legitimate signal
def test_submit_optimization_full_pipeline_wrong(state):
"""submit_optimization with wrong C++ → not ready, low correctness."""
state.round_number = 3
out = TOOL_REGISTRY["submit_optimization"]({
"cpp_code": WRONG_SUM_SQUARES_CPP,
"reasoning_trace": "compute-bound vectorizable",
}, state)
# Compiles fine but fails the fuzzer — gates reject reward
assert out["compile_status"] == "success"
assert out["correctness_pass_rate"] < 0.6
assert out["ready_for_reward"] is False
# ---------- D5_real: REAL reward variance over real submissions ----------
def test_real_reward_variance_correct_vs_wrong(state):
"""Reward DAG distinguishes correct from wrong real C++ submissions."""
from server.rewards import build_round_reward_dag
state.round_number = 1
state.round_results = [{"round": 1, "tool_calls": ["get_hardware_profile"]}]
sub_correct = TOOL_REGISTRY["submit_optimization"]({
"cpp_code": CORRECT_SUM_SQUARES_CPP,
"reasoning_trace": "compute-bound vectorizable",
}, state)
sub_wrong = TOOL_REGISTRY["submit_optimization"]({
"cpp_code": WRONG_SUM_SQUARES_CPP,
"reasoning_trace": "compute-bound vectorizable",
}, state)
dag = build_round_reward_dag(1)
score_correct = dag.score(state, sub_correct)
score_wrong = dag.score(state, sub_wrong)
# Correct must outscore wrong; this is the headline anti-cheat test
assert score_correct > score_wrong, \
f"reward DAG failed to distinguish: correct={score_correct:.3f} ≤ wrong={score_wrong:.3f}"
|