# codecourt/scripts/boundary_eval.py
# ayussssssiiii's picture
# Initial HF Space snapshot
# fcb838d
"""
Boundary-condition capability probe for CodeCourt.
This script compares a weak baseline solver against a stronger solver on a
curated suite of small, adversarial, and boundary-heavy cases. The goal is to
produce one crisp artifact that answers: did the solver actually get better at
handling tricky edge conditions?
"""
from __future__ import annotations
import argparse
import json
import os
import sys
from pathlib import Path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from agents.prompts import BRUTE_FORCE_SOLUTIONS, REFERENCE_SOLUTIONS
from oracle.executor import OracleExecutor
# Curated boundary-condition probe suite.
#
# Each compact tuple below is expanded into a dict with the keys
# case_id, capability, archetype, task_id, description, input, expected.
# Every case carries the same capability tag; ``input`` is the text handed to
# the solver on stdin and ``expected`` is the output it is checked against.
CASES = [
    {
        "case_id": case_id,
        "capability": "boundary_conditions",
        "archetype": archetype,
        "task_id": task_id,
        "description": description,
        "input": stdin_text,
        "expected": expected,
    }
    for case_id, archetype, task_id, description, stdin_text, expected in (
        # Graph: single node, no edges.
        (
            "graph_shortest_path_single_node",
            "graph",
            0,
            "Shortest-path solver must handle the smallest graph where source equals destination.",
            "1 0\n",
            "0",
        ),
        # Graph: the answer requires chaining two weighted edges (4 + 5).
        (
            "graph_shortest_path_two_hop",
            "graph",
            0,
            "Shortest-path solver must reason beyond direct neighbors.",
            "3 2\n1 2 4\n2 3 5\n",
            "9",
        ),
        # Graph: a triangle, i.e. the smallest odd cycle.
        (
            "graph_bipartite_min_odd_cycle",
            "graph",
            1,
            "Bipartite check must reject the smallest odd cycle.",
            "3 3\n1 2\n2 3\n1 3\n",
            "NO",
        ),
        # Array: the best increasing run (2, 3, 4) skips the early 5.
        (
            "array_lis_hidden_valley",
            "array",
            2,
            "LIS must recover after an early overshoot instead of greedily locking in.",
            "4\n2 5 3 4\n",
            "3",
        ),
        # DP: both characters of the second string occur in the first, but in
        # the opposite order.
        (
            "dp_lcs_order_sensitive",
            "dp",
            2,
            "LCS must respect order, not just character overlap.",
            "abc\nca\n",
            "1",
        ),
        # DP: the second string is a genuine subsequence of the first.
        (
            "dp_lcs_repeated_chars",
            "dp",
            2,
            "LCS must count a subsequence rather than raw character membership.",
            "abc\nac\n",
            "2",
        ),
    )
]
def solver_code(mode: str, archetype: str, task_id: int) -> str:
    """Look up the solver source for one task under the given mode.

    Args:
        mode: ``"brute_force"`` selects ``BRUTE_FORCE_SOLUTIONS``;
            ``"reference"`` selects ``REFERENCE_SOLUTIONS``.
        archetype: First component of the lookup key.
        task_id: Second component of the lookup key.

    Returns:
        The entry stored under ``(archetype, task_id)`` in the selected table,
        or the literal program ``"print(0)"`` when the table has no such key.

    Raises:
        ValueError: If ``mode`` is neither ``"brute_force"`` nor ``"reference"``.
    """
    # Reject unknown modes up front so the lookup below only has two outcomes.
    if mode not in ("brute_force", "reference"):
        raise ValueError(f"Unsupported mode: {mode}")

    solutions = BRUTE_FORCE_SOLUTIONS if mode == "brute_force" else REFERENCE_SOLUTIONS
    return solutions.get((archetype, task_id), "print(0)")
def run_suite(mode: str, time_limit: float, memory_limit_mb: int) -> dict:
    """Execute every entry of ``CASES`` with the solver selected by ``mode``.

    Args:
        mode: Solver family, forwarded to ``solver_code``.
        time_limit: Forwarded to ``OracleExecutor`` as ``time_limit``.
        memory_limit_mb: Forwarded to ``OracleExecutor`` as ``memory_limit_mb``.

    Returns:
        A dict with ``mode``, ``total_cases``, ``passed_cases``, ``pass_rate``
        and ``cases``; ``cases`` holds one record per probe combining the case
        metadata with the fields read from the executor's result object.
    """
    oracle = OracleExecutor(time_limit=time_limit, memory_limit_mb=memory_limit_mb)
    # Metadata copied verbatim from the case definition into each record.
    copied_fields = ("case_id", "capability", "archetype", "task_id", "description")

    case_records = []
    for spec in CASES:
        verdict = oracle.run(
            code=solver_code(mode, spec["archetype"], spec["task_id"]),
            stdin_input=spec["input"],
            expected_output=spec["expected"],
        )
        record = {field: spec[field] for field in copied_fields}
        record.update(
            {
                "passed": verdict.passed,
                "status": verdict.status,
                "outcome": verdict.outcome,
                "stdout": verdict.stdout,
                "stderr": verdict.stderr,
                "expected_output": spec["expected"],
                "execution_time": verdict.execution_time,
            }
        )
        case_records.append(record)

    total = len(case_records)
    passed_count = len([record for record in case_records if record["passed"]])
    return {
        "mode": mode,
        "total_cases": total,
        "passed_cases": passed_count,
        # Divide by 1 instead of 0 when the suite is empty.
        "pass_rate": passed_count / (total or 1),
        "cases": case_records,
    }
def build_summary(baseline: dict, trained: dict) -> dict:
    """Condense two suite runs into a before/after comparison.

    The comparison is driven by the per-case records inside ``baseline``
    (each of which already carries ``case_id``, ``description``,
    ``archetype`` and ``task_id``), so the summary describes exactly the
    cases that were run and does not depend on any module-level state.

    Args:
        baseline: Suite result with ``mode``, ``pass_rate``, ``passed_cases``
            and ``cases`` keys, as produced by ``run_suite``.
        trained: Suite result of the same shape for the solver being compared
            against the baseline.

    Returns:
        A dict with the suite name, the claim being tested, both modes, both
        pass rates and their difference, both passed-case counts, and the
        list (plus count) of cases that failed in ``baseline`` but passed in
        ``trained``, in baseline order.

    Raises:
        KeyError: If a case present in ``baseline`` has no counterpart in
            ``trained``.
    """
    trained_by_id = {case["case_id"]: case for case in trained["cases"]}

    improved_cases = []
    for before in baseline["cases"]:
        case_id = before["case_id"]
        try:
            after = trained_by_id[case_id]
        except KeyError:
            # Comparing runs over different case sets would make the delta
            # meaningless, so fail loudly instead of skipping the case.
            raise KeyError(f"case {case_id!r} is missing from the trained results") from None
        if not before["passed"] and after["passed"]:
            improved_cases.append({
                "case_id": case_id,
                "description": before["description"],
                "archetype": before["archetype"],
                "task_id": before["task_id"],
            })

    return {
        "suite_name": "boundary_conditions",
        "claim": "The solver improves on small, adversarial boundary cases that break shortcut reasoning.",
        "baseline_mode": baseline["mode"],
        "trained_mode": trained["mode"],
        "baseline_pass_rate": baseline["pass_rate"],
        "trained_pass_rate": trained["pass_rate"],
        "pass_rate_delta": trained["pass_rate"] - baseline["pass_rate"],
        "baseline_passed_cases": baseline["passed_cases"],
        "trained_passed_cases": trained["passed_cases"],
        "improved_case_count": len(improved_cases),
        "improved_cases": improved_cases,
    }
def parse_args():
    """Parse the probe's command-line options from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with ``baseline_mode``, ``trained_mode``,
        ``time_limit``, ``memory_limit_mb`` and ``output`` attributes.
    """
    solver_modes = ["brute_force", "reference"]
    # (flag, keyword arguments) pairs, registered in declaration order.
    option_table = (
        ("--baseline-mode", {"default": "brute_force", "choices": solver_modes}),
        ("--trained-mode", {"default": "reference", "choices": solver_modes}),
        ("--time-limit", {"type": float, "default": 2.0}),
        ("--memory-limit-mb", {"type": int, "default": 256}),
        ("--output", {"default": "./outputs/capability_boundary_eval.json"}),
    )

    cli = argparse.ArgumentParser(description="Run the CodeCourt boundary-condition capability probe")
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
def main():
    """Run both solver modes, write the full report to disk, and print the summary.

    The JSON file written to the ``--output`` path contains the summary plus
    both complete suite results; only the summary is echoed to stdout.
    """
    options = parse_args()
    limits = (options.time_limit, options.memory_limit_mb)

    baseline_report = run_suite(options.baseline_mode, *limits)
    trained_report = run_suite(options.trained_mode, *limits)
    headline = build_summary(baseline_report, trained_report)

    destination = Path(options.output)
    # Create missing parent directories so a fresh checkout can write the report.
    destination.parent.mkdir(parents=True, exist_ok=True)
    report = {
        "summary": headline,
        "baseline": baseline_report,
        "trained": trained_report,
    }
    destination.write_text(json.dumps(report, indent=2))

    print(json.dumps(headline, indent=2))
    print(f"Saved boundary probe to {destination}")


if __name__ == "__main__":
    main()