""" Boundary-condition capability probe for CodeCourt. This script compares a weak baseline solver against a stronger solver on a curated suite of small, adversarial, and boundary-heavy cases. The goal is to produce one crisp artifact that answers: did the solver actually get better at handling tricky edge conditions? """ from __future__ import annotations import argparse import json import os import sys from pathlib import Path sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from agents.prompts import BRUTE_FORCE_SOLUTIONS, REFERENCE_SOLUTIONS from oracle.executor import OracleExecutor CASES = [ { "case_id": "graph_shortest_path_single_node", "capability": "boundary_conditions", "archetype": "graph", "task_id": 0, "description": "Shortest-path solver must handle the smallest graph where source equals destination.", "input": "1 0\n", "expected": "0", }, { "case_id": "graph_shortest_path_two_hop", "capability": "boundary_conditions", "archetype": "graph", "task_id": 0, "description": "Shortest-path solver must reason beyond direct neighbors.", "input": "3 2\n1 2 4\n2 3 5\n", "expected": "9", }, { "case_id": "graph_bipartite_min_odd_cycle", "capability": "boundary_conditions", "archetype": "graph", "task_id": 1, "description": "Bipartite check must reject the smallest odd cycle.", "input": "3 3\n1 2\n2 3\n1 3\n", "expected": "NO", }, { "case_id": "array_lis_hidden_valley", "capability": "boundary_conditions", "archetype": "array", "task_id": 2, "description": "LIS must recover after an early overshoot instead of greedily locking in.", "input": "4\n2 5 3 4\n", "expected": "3", }, { "case_id": "dp_lcs_order_sensitive", "capability": "boundary_conditions", "archetype": "dp", "task_id": 2, "description": "LCS must respect order, not just character overlap.", "input": "abc\nca\n", "expected": "1", }, { "case_id": "dp_lcs_repeated_chars", "capability": "boundary_conditions", "archetype": "dp", "task_id": 2, "description": "LCS must count a subsequence rather than raw character membership.", "input": "abc\nac\n", "expected": "2", }, ] def solver_code(mode: str, archetype: str, task_id: int) -> str: key = (archetype, task_id) if mode == "brute_force": return BRUTE_FORCE_SOLUTIONS.get(key, "print(0)") if mode == "reference": return REFERENCE_SOLUTIONS.get(key, "print(0)") raise ValueError(f"Unsupported mode: {mode}") def run_suite(mode: str, time_limit: float, memory_limit_mb: int) -> dict: executor = OracleExecutor(time_limit=time_limit, memory_limit_mb=memory_limit_mb) results = [] for case in CASES: code = solver_code(mode, case["archetype"], case["task_id"]) result = executor.run( code=code, stdin_input=case["input"], expected_output=case["expected"], ) results.append({ "case_id": case["case_id"], "capability": case["capability"], "archetype": case["archetype"], "task_id": case["task_id"], "description": case["description"], "passed": result.passed, "status": result.status, "outcome": result.outcome, "stdout": result.stdout, "stderr": result.stderr, "expected_output": case["expected"], "execution_time": result.execution_time, }) passed = sum(1 for item in results if item["passed"]) return { "mode": mode, "total_cases": len(results), "passed_cases": passed, "pass_rate": passed / max(len(results), 1), "cases": results, } def build_summary(baseline: dict, trained: dict) -> dict: baseline_map = {case["case_id"]: case for case in baseline["cases"]} trained_map = {case["case_id"]: case for case in trained["cases"]} improved_cases = [] for case in CASES: case_id = case["case_id"] before = baseline_map[case_id] after = trained_map[case_id] if not before["passed"] and after["passed"]: improved_cases.append({ "case_id": case_id, "description": case["description"], "archetype": case["archetype"], "task_id": case["task_id"], }) return { "suite_name": "boundary_conditions", "claim": "The solver improves on small, adversarial boundary cases that break shortcut reasoning.", "baseline_mode": baseline["mode"], "trained_mode": trained["mode"], "baseline_pass_rate": baseline["pass_rate"], "trained_pass_rate": trained["pass_rate"], "pass_rate_delta": trained["pass_rate"] - baseline["pass_rate"], "baseline_passed_cases": baseline["passed_cases"], "trained_passed_cases": trained["passed_cases"], "improved_case_count": len(improved_cases), "improved_cases": improved_cases, } def parse_args(): parser = argparse.ArgumentParser(description="Run the CodeCourt boundary-condition capability probe") parser.add_argument("--baseline-mode", default="brute_force", choices=["brute_force", "reference"]) parser.add_argument("--trained-mode", default="reference", choices=["brute_force", "reference"]) parser.add_argument("--time-limit", type=float, default=2.0) parser.add_argument("--memory-limit-mb", type=int, default=256) parser.add_argument("--output", default="./outputs/capability_boundary_eval.json") return parser.parse_args() def main(): args = parse_args() baseline = run_suite(args.baseline_mode, args.time_limit, args.memory_limit_mb) trained = run_suite(args.trained_mode, args.time_limit, args.memory_limit_mb) summary = build_summary(baseline, trained) payload = { "summary": summary, "baseline": baseline, "trained": trained, } output_path = Path(args.output) output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(json.dumps(payload, indent=2)) print(json.dumps(summary, indent=2)) print(f"Saved boundary probe to {output_path}") if __name__ == "__main__": main()