"""Evaluate an evolved ``best_program.py`` on the test split of one ARC task.

Configuration is read from environment variables:

* ``ARC_TASK_FILE`` -- dataset split used in the data file names
  (``arc-agi_<split>_challenges.json`` / ``..._solutions.json``).
* ``TASK_NUM`` -- positional index of the task inside the challenges file.
* ``OUTS_DIR`` -- run output directory; the program is read from
  ``OUTS_DIR/best/best_program.py`` unless ``PROGRAM_DIR`` is set.
* ``PROGRAM_DIR`` -- optional directory that directly contains
  ``best_program.py`` (e.g. a checkpoint directory).
* ``DATA_ROOT`` -- directory holding the ARC JSON files; defaults to the
  ``data`` directory next to this file.

When run as a script, the result is written next to the evaluated program as
``post_evolution_evaluation_result.json``.
"""

import importlib.util
import json
import os

import numpy as np

from evaluator import pass_at_2_accuracy_multi_test, extract_failure_artifacts

TASK_FILE = os.getenv("ARC_TASK_FILE", "training")
# NOTE: this is a str when the variable is set and the int 0 otherwise;
# evaluate() normalizes it with int().
TASK_NUM = os.getenv("TASK_NUM", 0)
OUTS_DIR = os.getenv("OUTS_DIR", "")
# Optional: path to a checkpoint dir
# (e.g. outputs/evaluation_task_0/checkpoints/checkpoint_10) to eval that
# best_program.py on test set
PROGRAM_DIR = os.getenv("PROGRAM_DIR", "")

# Functions every candidate program must expose, in the order they are called.
_ATTEMPT_FUNCS = ("transform_grid_attempt_1", "transform_grid_attempt_2")


def _program_path():
    """Path to best_program.py: PROGRAM_DIR if set, else OUTS_DIR/best/."""
    if PROGRAM_DIR:
        return os.path.join(PROGRAM_DIR, "best_program.py")
    return os.path.join(OUTS_DIR, "best", "best_program.py")


def _result_path():
    """Where to write post_evolution_evaluation_result.json."""
    if PROGRAM_DIR:
        return os.path.join(PROGRAM_DIR, "post_evolution_evaluation_result.json")
    return os.path.join(OUTS_DIR, "best", "post_evolution_evaluation_result.json")


def load_program_module():
    """Dynamically load the best_program.py module from the specified directory.

    Returns:
        The executed module object.

    Raises:
        FileNotFoundError: If the program file does not exist.
        ImportError: If no import spec/loader can be built for the file.
    """
    path = _program_path()
    if not os.path.isfile(path):
        raise FileNotFoundError(
            f"Program not found: {path}. Set PROGRAM_DIR to a checkpoint dir "
            f"(e.g. .../checkpoints/checkpoint_10) or ensure "
            f"OUTS_DIR/best/best_program.py exists."
        )

    spec = importlib.util.spec_from_file_location("program_module", path)
    # spec_from_file_location may return None; fail with a clear error instead
    # of an AttributeError on the lines below.
    if spec is None or spec.loader is None:
        raise ImportError(f"Could not create an import spec for {path}")
    program_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(program_module)
    return program_module


def _invalid_return_result(func_name):
    """Build the failure result for an attempt function that did not return an ndarray."""
    print(f"{func_name} did not return a numpy array")
    error_artifacts = {
        "error_type": "InvalidReturnType",
        "error_message": f"Stage 1: {func_name} did not return a numpy array.",
        "suggestion": f"Make sure your {func_name} function returns a 2D numpy array.",
    }
    return dict(
        metrics={
            "runs_successfully": 0.0,
            "combined_score": 0.0,
            "error": f"{func_name} did not return a numpy array",
        },
        artifacts=error_artifacts,
    )


def _json_default(obj):
    """Convert NumPy scalars/arrays to plain Python for ``json.dump``.

    Defensive: whether the evaluator helpers return NumPy values cannot be
    verified from this file. The hook is only consulted for objects ``json``
    cannot already serialize, so existing output is unaffected.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


def evaluate():
    """Evaluate the program module located in the specified directory.

    Both attempt functions are run on every test input of the selected task
    and scored with ``pass_at_2_accuracy_multi_test``.

    Returns:
        A dict with two keys: ``metrics`` (``runs_successfully``,
        ``combined_score`` and per-example entries, or an ``error`` entry on
        validation failure) and ``artifacts`` (failure details).

    Raises:
        FileNotFoundError: If the program file is missing.
        ValueError: If the number of test inputs and solutions differ.
    """
    program_module = load_program_module()

    if not all(hasattr(program_module, name) for name in _ATTEMPT_FUNCS):
        print(
            "Stage 1 validation failed: Program must define "
            "'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions."
        )
        error_artifacts = {
            "error_type": "MissingFunction",
            "error_message": "Stage 1: Program is missing required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.",
            "suggestion": "Make sure your program includes a functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that take as an argument a 2D numpy array and return a 2D numpy array.",
        }
        return dict(
            metrics={
                "runs_successfully": 0.0,
                "combined_score": 0.0,
                "error": "Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions",
            },
            artifacts=error_artifacts,
        )

    # Load ARC tasks
    data_root = os.getenv("DATA_ROOT")
    if not data_root:
        data_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    challenge_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_challenges.json")
    solution_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_solutions.json")

    with open(challenge_path, "r", encoding="utf-8") as f:
        tasks = json.load(f)
    with open(solution_path, "r", encoding="utf-8") as f:
        solutions = json.load(f)

    # Tasks are addressed by their position in the challenges file.
    task_id = list(tasks.keys())[int(TASK_NUM)]
    solution = solutions[task_id]
    task = tasks[task_id]

    # Sanity check: test inputs and solutions must align (same task, same order)
    if len(task["test"]) != len(solution):
        raise ValueError(
            f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs "
            f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json "
            f"and arc-agi_{TASK_FILE}_solutions.json were generated together (convert_arc_agi2_data.py)."
        )

    test_inputs = [np.array(inp["input"]) for inp in task["test"]]
    test_gts = [np.array(gt) for gt in solution]

    test_attempts = []
    for inp in test_inputs:
        attempts = []
        # Attempt 1 is run and validated before attempt 2 is called.
        for func_name in _ATTEMPT_FUNCS:
            output = getattr(program_module, func_name)(inp)
            if not isinstance(output, np.ndarray):
                return _invalid_return_result(func_name)
            attempts.append(output)
        test_attempts.append(attempts)

    pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(
        test_attempts, test_gts
    )

    # Guard against a task with no test examples (would divide by zero).
    num_examples = len(pass_at_2_test)
    metrics = {
        "runs_successfully": 1.0,
        "combined_score": sum(pass_at_2_test) / num_examples if num_examples else 0.0,
    }
    error_artifacts = {}
    for i, (test_pass, test_diagnostics) in enumerate(
        zip(pass_at_2_test, test_diagnostics_list)
    ):
        example_name = f"test_example_{i}"
        metrics[f"{example_name}_pass_at_2"] = test_pass
        for attempt in test_diagnostics:
            metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"]
        if test_pass == 0:
            # test_diagnostics is {0: {...}, 1: {...}}; extract_failure_artifacts
            # expects one attempt's dict
            first_failing = next(
                (
                    test_diagnostics[a]
                    for a in test_diagnostics
                    if not test_diagnostics[a]["perfect_match"]
                ),
                test_diagnostics[0],
            )
            error_artifacts[f"{example_name}"] = extract_failure_artifacts(first_failing)

    return dict(
        metrics=metrics,
        artifacts=error_artifacts,
    )


def main():
    """Run the evaluation and write the result JSON next to the program."""
    evaluation_result = evaluate()
    result_path = _result_path()
    os.makedirs(os.path.dirname(result_path), exist_ok=True)
    with open(result_path, "w", encoding="utf-8") as f:
        json.dump(evaluation_result, f, indent=4, default=_json_default)
    print(f"Test-set evaluation written to {result_path}")


if __name__ == "__main__":
    main()