Spaces:

ayussssssiiii
/

codecourt

Sleeping

File size: 6,536 Bytes

fcb838d

"""
Boundary-condition capability probe for CodeCourt.

This script compares a weak baseline solver against a stronger solver on a
curated suite of small, adversarial, and boundary-heavy cases. The goal is to
produce one crisp artifact that answers: did the solver actually get better at
handling tricky edge conditions?
"""

from __future__ import annotations

import argparse
import json
import os
import sys
from pathlib import Path

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from agents.prompts import BRUTE_FORCE_SOLUTIONS, REFERENCE_SOLUTIONS
from oracle.executor import OracleExecutor


# Capability label shared by every probe in this suite.
_CAPABILITY = "boundary_conditions"

# One row per probe, in suite order:
# (case_id, archetype, task_id, description, stdin text, expected stdout).
# The (archetype, task_id) pair is what selects the solver source to run.
_CASE_ROWS = (
    (
        "graph_shortest_path_single_node",
        "graph",
        0,
        "Shortest-path solver must handle the smallest graph where source equals destination.",
        "1 0\n",
        "0",
    ),
    (
        "graph_shortest_path_two_hop",
        "graph",
        0,
        "Shortest-path solver must reason beyond direct neighbors.",
        "3 2\n1 2 4\n2 3 5\n",
        "9",
    ),
    (
        "graph_bipartite_min_odd_cycle",
        "graph",
        1,
        "Bipartite check must reject the smallest odd cycle.",
        "3 3\n1 2\n2 3\n1 3\n",
        "NO",
    ),
    (
        "array_lis_hidden_valley",
        "array",
        2,
        "LIS must recover after an early overshoot instead of greedily locking in.",
        "4\n2 5 3 4\n",
        "3",
    ),
    (
        "dp_lcs_order_sensitive",
        "dp",
        2,
        "LCS must respect order, not just character overlap.",
        "abc\nca\n",
        "1",
    ),
    # NOTE(review): despite the case_id, this input contains no repeated
    # characters -- confirm the input is the one intended.
    (
        "dp_lcs_repeated_chars",
        "dp",
        2,
        "LCS must count a subsequence rather than raw character membership.",
        "abc\nac\n",
        "2",
    ),
)

# Expand the compact rows into the dict records the rest of the script reads.
CASES = [
    {
        "case_id": case_id,
        "capability": _CAPABILITY,
        "archetype": archetype,
        "task_id": task_id,
        "description": description,
        "input": stdin_text,
        "expected": expected,
    }
    for case_id, archetype, task_id, description, stdin_text, expected in _CASE_ROWS
]


def solver_code(mode: str, archetype: str, task_id: int) -> str:
    """Return the solver source registered for ``(archetype, task_id)``.

    Args:
        mode: ``"brute_force"`` selects from ``BRUTE_FORCE_SOLUTIONS``;
            ``"reference"`` selects from ``REFERENCE_SOLUTIONS``.
        archetype: First component of the lookup key.
        task_id: Second component of the lookup key.

    Returns:
        The stored source string, or the placeholder program ``"print(0)"``
        when the selected table has no entry for the key.

    Raises:
        ValueError: If ``mode`` is neither of the two supported values.
    """
    if mode not in ("brute_force", "reference"):
        raise ValueError(f"Unsupported mode: {mode}")

    # Pick the table only after validation so just the one needed is touched.
    solutions = BRUTE_FORCE_SOLUTIONS if mode == "brute_force" else REFERENCE_SOLUTIONS
    return solutions.get((archetype, task_id), "print(0)")


def run_suite(mode: str, time_limit: float, memory_limit_mb: int) -> dict:
    """Run every entry of ``CASES`` with the solver chosen by ``mode``.

    Args:
        mode: Solver family, forwarded to ``solver_code``.
        time_limit: Forwarded to ``OracleExecutor`` as ``time_limit``.
        memory_limit_mb: Forwarded to ``OracleExecutor`` as ``memory_limit_mb``.

    Returns:
        A dict with ``mode``, ``total_cases``, ``passed_cases``, ``pass_rate``
        and ``cases`` (one record per case, in ``CASES`` order).
    """
    oracle = OracleExecutor(time_limit=time_limit, memory_limit_mb=memory_limit_mb)

    def evaluate(case: dict) -> dict:
        # Execute one case and flatten case metadata plus verdict into a record.
        source = solver_code(mode, case["archetype"], case["task_id"])
        verdict = oracle.run(
            code=source,
            stdin_input=case["input"],
            expected_output=case["expected"],
        )
        return {
            "case_id": case["case_id"],
            "capability": case["capability"],
            "archetype": case["archetype"],
            "task_id": case["task_id"],
            "description": case["description"],
            "passed": verdict.passed,
            "status": verdict.status,
            "outcome": verdict.outcome,
            "stdout": verdict.stdout,
            "stderr": verdict.stderr,
            "expected_output": case["expected"],
            "execution_time": verdict.execution_time,
        }

    records = [evaluate(case) for case in CASES]
    pass_count = len([record for record in records if record["passed"]])
    total = len(records)

    return {
        "mode": mode,
        "total_cases": total,
        "passed_cases": pass_count,
        # Divide by 1 for an empty suite so the rate is 0.0 rather than an error.
        "pass_rate": pass_count / (total or 1),
        "cases": records,
    }


def build_summary(baseline: dict, trained: dict) -> dict:
    """Condense two suite reports into a before/after comparison.

    Args:
        baseline: Suite report for the weaker solver; must provide ``mode``,
            ``pass_rate``, ``passed_cases`` and ``cases``.
        trained: Suite report for the stronger solver, same layout.

    Returns:
        A dict with the suite name, the claim text, both modes, both pass
        rates and their difference, both pass counts, and the list of cases
        that failed in ``baseline`` but passed in ``trained``.

    Raises:
        KeyError: If either report lacks a record for a case in ``CASES``.
    """
    before_by_id = {record["case_id"]: record for record in baseline["cases"]}
    after_by_id = {record["case_id"]: record for record in trained["cases"]}

    newly_passing = []
    for spec in CASES:
        identifier = spec["case_id"]
        earlier = before_by_id[identifier]
        later = after_by_id[identifier]
        # Only a fail -> pass transition counts as an improvement.
        if earlier["passed"] or not later["passed"]:
            continue
        newly_passing.append(
            {
                "case_id": identifier,
                "description": spec["description"],
                "archetype": spec["archetype"],
                "task_id": spec["task_id"],
            }
        )

    rate_before = baseline["pass_rate"]
    rate_after = trained["pass_rate"]

    return {
        "suite_name": "boundary_conditions",
        "claim": "The solver improves on small, adversarial boundary cases that break shortcut reasoning.",
        "baseline_mode": baseline["mode"],
        "trained_mode": trained["mode"],
        "baseline_pass_rate": rate_before,
        "trained_pass_rate": rate_after,
        "pass_rate_delta": rate_after - rate_before,
        "baseline_passed_cases": baseline["passed_cases"],
        "trained_passed_cases": trained["passed_cases"],
        "improved_case_count": len(newly_passing),
        "improved_cases": newly_passing,
    }


def parse_args():
    """Parse the probe's command-line options.

    Returns:
        The ``argparse.Namespace`` with ``baseline_mode``, ``trained_mode``,
        ``time_limit``, ``memory_limit_mb`` and ``output``.
    """
    modes = ["brute_force", "reference"]
    cli = argparse.ArgumentParser(description="Run the CodeCourt boundary-condition capability probe")

    # Flag name -> keyword arguments for add_argument, in registration order.
    option_specs = (
        ("--baseline-mode", {"default": "brute_force", "choices": modes}),
        ("--trained-mode", {"default": "reference", "choices": modes}),
        ("--time-limit", {"type": float, "default": 2.0}),
        ("--memory-limit-mb", {"type": int, "default": 256}),
        ("--output", {"default": "./outputs/capability_boundary_eval.json"}),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)

    return cli.parse_args()


def main():
    """Run both suites, write the full report to disk, and print the summary.

    The JSON file at ``--output`` holds the summary plus both per-mode
    reports; only the summary is echoed to stdout, followed by the path the
    report was saved to.
    """
    options = parse_args()
    limits = (options.time_limit, options.memory_limit_mb)

    baseline_report = run_suite(options.baseline_mode, *limits)
    trained_report = run_suite(options.trained_mode, *limits)
    headline = build_summary(baseline_report, trained_report)

    destination = Path(options.output)
    # Create missing parent directories so a fresh checkout can write the file.
    destination.parent.mkdir(parents=True, exist_ok=True)
    report = {
        "summary": headline,
        "baseline": baseline_report,
        "trained": trained_report,
    }
    destination.write_text(json.dumps(report, indent=2))

    print(json.dumps(headline, indent=2))
    print(f"Saved boundary probe to {destination}")


# Run the probe only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()