Spaces:

ayussssssiiii
/

codecourt

Sleeping

App Files Files Community

codecourt / scripts /boundary_eval.py

ayussssssiiii

Initial HF Space snapshot

fcb838d about 1 month ago

raw

history blame contribute delete

6.54 kB

	"""
	Boundary-condition capability probe for CodeCourt.

	This script compares a weak baseline solver against a stronger solver on a
	curated suite of small, adversarial, and boundary-heavy cases. The goal is to
	produce one crisp artifact that answers: did the solver actually get better at
	handling tricky edge conditions?
	"""

	from __future__ import annotations

	import argparse
	import json
	import os
	import sys
	from pathlib import Path

	sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

	from agents.prompts import BRUTE_FORCE_SOLUTIONS, REFERENCE_SOLUTIONS
	from oracle.executor import OracleExecutor


	# Curated probe suite. Each entry selects a solver through
	# (archetype, task_id), feeds "input" to it on stdin, and states the
	# "expected" output. All entries share the same "capability" label, so the
	# rows below list only the fields that vary:
	# (case_id, archetype, task_id, description, stdin text, expected output).
	CASES = [
	    {
	        "case_id": case_id,
	        "capability": "boundary_conditions",
	        "archetype": archetype,
	        "task_id": task_id,
	        "description": description,
	        "input": stdin_text,
	        "expected": expected,
	    }
	    for case_id, archetype, task_id, description, stdin_text, expected in (
	        (
	            "graph_shortest_path_single_node",
	            "graph",
	            0,
	            "Shortest-path solver must handle the smallest graph where source equals destination.",
	            "1 0\n",
	            "0",
	        ),
	        (
	            "graph_shortest_path_two_hop",
	            "graph",
	            0,
	            "Shortest-path solver must reason beyond direct neighbors.",
	            "3 2\n1 2 4\n2 3 5\n",
	            "9",
	        ),
	        (
	            "graph_bipartite_min_odd_cycle",
	            "graph",
	            1,
	            "Bipartite check must reject the smallest odd cycle.",
	            "3 3\n1 2\n2 3\n1 3\n",
	            "NO",
	        ),
	        (
	            "array_lis_hidden_valley",
	            "array",
	            2,
	            "LIS must recover after an early overshoot instead of greedily locking in.",
	            "4\n2 5 3 4\n",
	            "3",
	        ),
	        (
	            "dp_lcs_order_sensitive",
	            "dp",
	            2,
	            "LCS must respect order, not just character overlap.",
	            "abc\nca\n",
	            "1",
	        ),
	        (
	            "dp_lcs_repeated_chars",
	            "dp",
	            2,
	            "LCS must count a subsequence rather than raw character membership.",
	            "abc\nac\n",
	            "2",
	        ),
	    )
	]


	def solver_code(mode: str, archetype: str, task_id: int) -> str:
	    """Return the solver source registered for one task under ``mode``.

	    ``"brute_force"`` looks the task up in ``BRUTE_FORCE_SOLUTIONS`` and
	    ``"reference"`` in ``REFERENCE_SOLUTIONS``; both are keyed by
	    ``(archetype, task_id)``. When the key is absent the placeholder program
	    ``print(0)`` is returned instead.

	    Raises:
	        ValueError: if ``mode`` is neither ``"brute_force"`` nor ``"reference"``.
	    """
	    # Reject unknown modes up front so only one lookup path remains below.
	    if mode not in ("brute_force", "reference"):
	        raise ValueError(f"Unsupported mode: {mode}")

	    catalogue = BRUTE_FORCE_SOLUTIONS if mode == "brute_force" else REFERENCE_SOLUTIONS
	    return catalogue.get((archetype, task_id), "print(0)")


	def run_suite(mode: str, time_limit: float, memory_limit_mb: int) -> dict:
	    """Execute every entry of ``CASES`` with the solver chosen by ``mode``.

	    Args:
	        mode: solver family, forwarded to ``solver_code``.
	        time_limit: passed unchanged to ``OracleExecutor``.
	        memory_limit_mb: passed unchanged to ``OracleExecutor``.

	    Returns:
	        A dict with "mode", "total_cases", "passed_cases", "pass_rate" and
	        "cases"; "cases" holds one record per entry of ``CASES``, in order.
	    """
	    oracle = OracleExecutor(time_limit=time_limit, memory_limit_mb=memory_limit_mb)

	    def evaluate(spec: dict) -> dict:
	        # Run one case and flatten the case metadata plus the executor's
	        # verdict into a single plain-dict record.
	        verdict = oracle.run(
	            code=solver_code(mode, spec["archetype"], spec["task_id"]),
	            stdin_input=spec["input"],
	            expected_output=spec["expected"],
	        )
	        record = {
	            field: spec[field]
	            for field in ("case_id", "capability", "archetype", "task_id", "description")
	        }
	        for attribute in ("passed", "status", "outcome", "stdout", "stderr"):
	            record[attribute] = getattr(verdict, attribute)
	        record["expected_output"] = spec["expected"]
	        record["execution_time"] = verdict.execution_time
	        return record

	    records = [evaluate(spec) for spec in CASES]
	    total = len(records)
	    passed_total = len([entry for entry in records if entry["passed"]])

	    return {
	        "mode": mode,
	        "total_cases": total,
	        "passed_cases": passed_total,
	        # max(..., 1) keeps the rate defined (0.0) for an empty suite.
	        "pass_rate": passed_total / max(total, 1),
	        "cases": records,
	    }


	def build_summary(baseline: dict, trained: dict) -> dict:
	    """Summarize how the trained run compares with the baseline run.

	    The comparison is driven by the per-case records inside ``baseline``
	    rather than by a module-level case list, so the function works on any
	    pair of suite results (for example ones loaded from an earlier output
	    file) and does not raise ``KeyError`` when the two runs cover different
	    cases.

	    Args:
	        baseline: suite result with the keys "mode", "pass_rate",
	            "passed_cases" and "cases"; each entry of "cases" carries
	            "case_id", "description", "archetype", "task_id" and "passed".
	        trained: suite result of the same shape for the run being compared.

	    Returns:
	        A dict with the suite name and claim, both modes, both pass rates and
	        their difference, both passed-case counts, and the list (plus count)
	        of cases that failed in ``baseline`` but passed in ``trained``, in
	        baseline order. A baseline case with no counterpart in ``trained`` is
	        not counted as improved.
	    """
	    trained_by_id = {entry["case_id"]: entry for entry in trained["cases"]}

	    improved_cases = []
	    for before in baseline["cases"]:
	        after = trained_by_id.get(before["case_id"])
	        # Improved means: failed before, present and passing after.
	        if before["passed"] or after is None or not after["passed"]:
	            continue
	        improved_cases.append({
	            "case_id": before["case_id"],
	            "description": before["description"],
	            "archetype": before["archetype"],
	            "task_id": before["task_id"],
	        })

	    return {
	        "suite_name": "boundary_conditions",
	        "claim": "The solver improves on small, adversarial boundary cases that break shortcut reasoning.",
	        "baseline_mode": baseline["mode"],
	        "trained_mode": trained["mode"],
	        "baseline_pass_rate": baseline["pass_rate"],
	        "trained_pass_rate": trained["pass_rate"],
	        "pass_rate_delta": trained["pass_rate"] - baseline["pass_rate"],
	        "baseline_passed_cases": baseline["passed_cases"],
	        "trained_passed_cases": trained["passed_cases"],
	        "improved_case_count": len(improved_cases),
	        "improved_cases": improved_cases,
	    }


	def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
	    """Parse the command-line options of the boundary-condition probe.

	    Args:
	        argv: argument strings to parse. ``None`` (the default) makes
	            argparse read ``sys.argv[1:]``, which is the original behavior;
	            passing an explicit list lets the parser be driven from code.

	    Returns:
	        Namespace with ``baseline_mode``, ``trained_mode``, ``time_limit``,
	        ``memory_limit_mb`` and ``output``.
	    """
	    modes = ["brute_force", "reference"]
	    parser = argparse.ArgumentParser(description="Run the CodeCourt boundary-condition capability probe")
	    parser.add_argument("--baseline-mode", default="brute_force", choices=modes)
	    parser.add_argument("--trained-mode", default="reference", choices=modes)
	    parser.add_argument("--time-limit", type=float, default=2.0)
	    parser.add_argument("--memory-limit-mb", type=int, default=256)
	    parser.add_argument("--output", default="./outputs/capability_boundary_eval.json")
	    return parser.parse_args(argv)


	def main():
	    """Run both suites, write the full comparison as JSON, and print the summary.

	    The output file holds three top-level keys ("summary", "baseline",
	    "trained"); its parent directory is created if it does not exist. Only
	    the summary and the destination path are printed to stdout.
	    """
	    options = parse_args()
	    limits = (options.time_limit, options.memory_limit_mb)

	    baseline_report = run_suite(options.baseline_mode, *limits)
	    trained_report = run_suite(options.trained_mode, *limits)
	    comparison = build_summary(baseline_report, trained_report)

	    destination = Path(options.output)
	    destination.parent.mkdir(parents=True, exist_ok=True)
	    document = json.dumps(
	        {
	            "summary": comparison,
	            "baseline": baseline_report,
	            "trained": trained_report,
	        },
	        indent=2,
	    )
	    destination.write_text(document)

	    print(json.dumps(comparison, indent=2))
	    print(f"Saved boundary probe to {destination}")


	# Script entry point: run the probe only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	main()