sky2 / benchmarks /arc_benchmark /post_discovery_eval.py

Add files using upload-large-folder tool

b0e88cf verified about 1 month ago

7.04 kB

	import importlib.util
	import os
	import json
	import numpy as np
	from evaluator import pass_at_2_accuracy_multi_test, extract_failure_artifacts

	# Runtime configuration, read from the environment once at import time.
	# ARC split name; it is interpolated into the data file names
	# arc-agi_<split>_challenges.json and arc-agi_<split>_solutions.json.
	TASK_FILE = os.environ.get("ARC_TASK_FILE", "training")
	# Position of the task among the challenge file's keys. This is a str when
	# the variable is set and the int 0 otherwise; evaluate() passes it to int().
	TASK_NUM = os.environ.get("TASK_NUM", 0)
	# Run output directory; best_program.py is looked up under OUTS_DIR/best/.
	OUTS_DIR = os.environ.get("OUTS_DIR", "")
	# Optional checkpoint directory (e.g. outputs/evaluation_task_0/checkpoints/checkpoint_10).
	# When non-empty, the best_program.py inside it is evaluated on the test set instead.
	PROGRAM_DIR = os.environ.get("PROGRAM_DIR", "")


	def _program_path():
	    """Return the location of the best_program.py to evaluate.

	    A non-empty PROGRAM_DIR takes precedence; otherwise the file is
	    expected under OUTS_DIR/best/.
	    """
	    program_dir = PROGRAM_DIR if PROGRAM_DIR else os.path.join(OUTS_DIR, "best")
	    return os.path.join(program_dir, "best_program.py")


	def _result_path():
	    """Return the file that receives the evaluation result as JSON.

	    The file is named post_evolution_evaluation_result.json and lives in
	    PROGRAM_DIR when that is non-empty, otherwise in OUTS_DIR/best/.
	    """
	    result_name = "post_evolution_evaluation_result.json"
	    if not PROGRAM_DIR:
	        return os.path.join(OUTS_DIR, "best", result_name)
	    return os.path.join(PROGRAM_DIR, result_name)


	def load_program_module():
	    """Import best_program.py from the path given by _program_path().

	    Returns:
	        The executed module object, registered under the spec name
	        "program_module".

	    Raises:
	        FileNotFoundError: if no file exists at the resolved path.
	    """
	    program_file = _program_path()
	    if os.path.isfile(program_file):
	        module_spec = importlib.util.spec_from_file_location("program_module", program_file)
	        loaded = importlib.util.module_from_spec(module_spec)
	        # Run the file's top-level code so its definitions become module attributes.
	        module_spec.loader.exec_module(loaded)
	        return loaded

	    raise FileNotFoundError(
	        f"Program not found: {program_file}. "
	        "Set PROGRAM_DIR to a checkpoint dir (e.g. .../checkpoints/checkpoint_10) "
	        "or ensure OUTS_DIR/best/best_program.py exists."
	    )

	# Names of the two transform functions every evaluated program must define,
	# in the order in which they are called for each test input.
	_REQUIRED_TRANSFORMS = ("transform_grid_attempt_1", "transform_grid_attempt_2")


	def _invalid_return_result(function_name):
	    """Report and build the failure result for a transform that did not return a numpy array."""
	    print(f"{function_name} did not return a numpy array")
	    return dict(
	        metrics={
	            "runs_successfully": 0.0,
	            "combined_score": 0.0,
	            "error": f"{function_name} did not return a numpy array",
	        },
	        artifacts={
	            "error_type": "InvalidReturnType",
	            "error_message": f"Stage 1: {function_name} did not return a numpy array.",
	            "suggestion": f"Make sure your {function_name} function returns a 2D numpy array.",
	        },
	    )


	def _load_selected_task():
	    """Read the ARC challenge/solution files and return (task_id, task, solution) for TASK_NUM."""
	    data_root = os.getenv("DATA_ROOT")
	    if not data_root:
	        # Fall back to the data/ directory that sits next to this script.
	        data_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
	    challenge_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_challenges.json")
	    solution_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_solutions.json")

	    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
	    with open(challenge_path, "r", encoding="utf-8") as f:
	        tasks = json.load(f)
	    with open(solution_path, "r", encoding="utf-8") as f:
	        solutions = json.load(f)

	    # TASK_NUM is an index into the challenge file's key order.
	    task_id = list(tasks.keys())[int(TASK_NUM)]
	    solution = solutions[task_id]
	    task = tasks[task_id]
	    return task_id, task, solution


	def evaluate():
	    """Evaluate the loaded program on the test examples of the selected ARC task.

	    The program is loaded with load_program_module(). Both of its transform
	    functions are called on every test input, and the two outputs per input
	    are scored with pass_at_2_accuracy_multi_test against the solutions.

	    Returns:
	        A dict with two keys:
	        "metrics": "runs_successfully", "combined_score" (mean of the
	            per-example pass values, 0.0 when there are no test examples),
	            plus per-example "test_example_<i>_pass_at_2" and
	            "test_example_<i>_attempt_<k>" entries. On a validation failure
	            it holds "runs_successfully": 0.0, "combined_score": 0.0 and an
	            "error" string instead.
	        "artifacts": failure details; per failing example on a scored run,
	            or a single error description on a validation failure.

	    Raises:
	        FileNotFoundError: if the program file or a data file does not exist.
	        ValueError: if the number of test inputs and solution outputs differ.

	    Exceptions raised inside the program's transform functions are not
	    caught here and propagate to the caller.
	    """
	    program_module = load_program_module()
	    if not all(hasattr(program_module, name) for name in _REQUIRED_TRANSFORMS):
	        print("Stage 1 validation failed: Program must define 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.")

	        error_artifacts = {
	            "error_type": "MissingFunction",
	            "error_message": "Stage 1: Program is missing required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.",
	            "suggestion": "Make sure your program includes a functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that take as an argument a 2D numpy array and return a 2D numpy array."
	        }

	        return dict(
	            metrics={
	                "runs_successfully": 0.0,
	                "combined_score": 0.0,
	                "error": "Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions"
	            },
	            artifacts=error_artifacts
	        )

	    # Load ARC tasks
	    task_id, task, solution = _load_selected_task()

	    # Sanity check: test inputs and solutions must align (same task, same order)
	    if len(task["test"]) != len(solution):
	        raise ValueError(
	            f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs "
	            f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json "
	            f"and arc-agi_{TASK_FILE}_solutions.json were generated together (convert_arc_agi2_data.py)."
	        )

	    test_inputs = [np.array(example["input"]) for example in task["test"]]
	    test_gts = [np.array(gt) for gt in solution]

	    test_attempts = []
	    for grid in test_inputs:
	        attempts = []
	        for function_name in _REQUIRED_TRANSFORMS:
	            output = getattr(program_module, function_name)(grid)
	            if not isinstance(output, np.ndarray):
	                # Stop at the first invalid output; later transforms and inputs are not run.
	                return _invalid_return_result(function_name)
	            attempts.append(output)
	        test_attempts.append(attempts)

	    pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(test_attempts, test_gts)
	    num_examples = len(pass_at_2_test)
	    metrics = {
	        "runs_successfully": 1.0,
	        # A task without test examples scores 0.0 instead of dividing by zero.
	        "combined_score": sum(pass_at_2_test) / num_examples if num_examples else 0.0,
	    }
	    error_artifacts = {}
	    for i, (test_pass, test_diagnostics) in enumerate(zip(pass_at_2_test, test_diagnostics_list)):
	        example_name = f"test_example_{i}"
	        metrics[f"{example_name}_pass_at_2"] = test_pass
	        for attempt in test_diagnostics:
	            metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"]
	        if test_pass == 0:
	            # test_diagnostics is {0: {...}, 1: {...}}; extract_failure_artifacts expects one attempt's dict
	            first_failing = next(
	                (test_diagnostics[a] for a in test_diagnostics if not test_diagnostics[a]["perfect_match"]),
	                test_diagnostics[0],
	            )
	            error_artifacts[f"{example_name}"] = extract_failure_artifacts(first_failing)

	    return dict(
	        metrics=metrics,
	        artifacts=error_artifacts
	    )

	if __name__ == "__main__":
	    # Script entry point: run the evaluation, then store the outcome as
	    # indented JSON at the path chosen by _result_path().
	    outcome = evaluate()
	    destination = _result_path()
	    destination_dir = os.path.dirname(destination)
	    # Create the target directory if needed; an existing one is not an error.
	    os.makedirs(destination_dir, exist_ok=True)
	    with open(destination, 'w') as handle:
	        json.dump(outcome, handle, indent=4)
	    print(f"Test-set evaluation written to {destination}")