| import importlib.util |
| import os |
| import json |
| import numpy as np |
| from evaluator import pass_at_2_accuracy_multi_test, extract_failure_artifacts |
|
|
| TASK_FILE = os.getenv("ARC_TASK_FILE", "training") |
| TASK_NUM = os.getenv("TASK_NUM", 0) |
| OUTS_DIR = os.getenv("OUTS_DIR", "") |
| |
| PROGRAM_DIR = os.getenv("PROGRAM_DIR", "") |
|
|
|
|
| def _program_path(): |
| """Path to best_program.py: PROGRAM_DIR if set, else OUTS_DIR/best/.""" |
| if PROGRAM_DIR: |
| return os.path.join(PROGRAM_DIR, "best_program.py") |
| return os.path.join(OUTS_DIR, "best", "best_program.py") |
|
|
|
|
| def _result_path(): |
| """Where to write post_evolution_evaluation_result.json.""" |
| if PROGRAM_DIR: |
| return os.path.join(PROGRAM_DIR, "post_evolution_evaluation_result.json") |
| return os.path.join(OUTS_DIR, "best", "post_evolution_evaluation_result.json") |
|
|
|
|
| def load_program_module(): |
| """Dynamically load the best_program.py module from the specified directory.""" |
| path = _program_path() |
| if not os.path.isfile(path): |
| raise FileNotFoundError(f"Program not found: {path}. Set PROGRAM_DIR to a checkpoint dir (e.g. .../checkpoints/checkpoint_10) or ensure OUTS_DIR/best/best_program.py exists.") |
| spec = importlib.util.spec_from_file_location("program_module", path) |
| program_module = importlib.util.module_from_spec(spec) |
| spec.loader.exec_module(program_module) |
| |
| return program_module |
|
|
| def evaluate(): |
| """Evaluate the program module located in the specified directory.""" |
| program_module = load_program_module() |
| if not hasattr(program_module, 'transform_grid_attempt_1') or not hasattr(program_module, 'transform_grid_attempt_2'): |
| print(f"Stage 1 validation failed: Program must define 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.") |
| |
| error_artifacts = { |
| "error_type": "MissingFunction", |
| "error_message": "Stage 1: Program is missing required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.", |
| "suggestion": "Make sure your program includes a functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that take as an argument a 2D numpy array and return a 2D numpy array." |
| } |
| |
| return dict( |
| metrics={ |
| "runs_successfully": 0.0, |
| "combined_score": 0.0, |
| "error": "Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions" |
| }, |
| artifacts=error_artifacts |
| ) |
| |
| data_root = os.getenv("DATA_ROOT") |
| if not data_root: |
| data_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") |
| challenge_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_challenges.json") |
| solution_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_solutions.json") |
|
|
| with open(challenge_path, 'r') as f: |
| tasks = json.load(f) |
| with open(solution_path, 'r') as f: |
| solutions = json.load(f) |
| |
| task_id = list(tasks.keys())[int(TASK_NUM)] |
| solution = solutions[task_id] |
| task = tasks[task_id] |
|
|
| |
| if len(task["test"]) != len(solution): |
| raise ValueError( |
| f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs " |
| f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json " |
| f"and arc-agi_{TASK_FILE}_solutions.json were generated together (convert_arc_agi2_data.py)." |
| ) |
|
|
| test_inputs = [np.array(inp["input"]) for inp in task['test']] |
| test_gts = [np.array(gt) for gt in solution] |
| |
| test_attempts = [] |
| for inp in test_inputs: |
| attempt_1 = program_module.transform_grid_attempt_1(inp) |
| if not isinstance(attempt_1, np.ndarray): |
| print(f"transform_grid_attempt_1 did not return a numpy array") |
| |
| error_artifacts = { |
| "error_type": "InvalidReturnType", |
| "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array.", |
| "suggestion": "Make sure your transform_grid_attempt_1 function returns a 2D numpy array." |
| } |
| |
| return dict( |
| metrics={ |
| "runs_successfully": 0.0, |
| "combined_score": 0.0, |
| "error": "transform_grid_attempt_1 did not return a numpy array" |
| }, |
| artifacts=error_artifacts |
| ) |
| |
| attempt_2 = program_module.transform_grid_attempt_2(inp) |
| if not isinstance(attempt_2, np.ndarray): |
| print(f"transform_grid_attempt_2 did not return a numpy array") |
| |
| error_artifacts = { |
| "error_type": "InvalidReturnType", |
| "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array.", |
| "suggestion": "Make sure your transform_grid_attempt_2 function returns a 2D numpy array." |
| } |
| |
| return dict( |
| metrics={ |
| "runs_successfully": 0.0, |
| "combined_score": 0.0, |
| "error": "transform_grid_attempt_2 did not return a numpy array" |
| }, |
| artifacts=error_artifacts |
| ) |
| test_attempts.append([attempt_1, attempt_2]) |
| |
| pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(test_attempts, test_gts) |
| metrics = { |
| "runs_successfully": 1.0, |
| "combined_score": sum(pass_at_2_test) / len(pass_at_2_test), |
| } |
| error_artifacts = {} |
| for i, (test_pass, test_diagnostics) in enumerate(zip(pass_at_2_test, test_diagnostics_list)): |
| example_name = f"test_example_{i}" |
| metrics[f"{example_name}_pass_at_2"] = test_pass |
| for attempt in test_diagnostics: |
| metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"] |
| if test_pass == 0: |
| |
| first_failing = next( |
| (test_diagnostics[a] for a in test_diagnostics if not test_diagnostics[a]["perfect_match"]), |
| test_diagnostics[0], |
| ) |
| error_artifacts[f"{example_name}"] = extract_failure_artifacts(first_failing) |
| |
| return dict( |
| metrics=metrics, |
| artifacts=error_artifacts |
| ) |
| |
| if __name__ == "__main__": |
| evaluation_result = evaluate() |
| result_path = _result_path() |
| os.makedirs(os.path.dirname(result_path), exist_ok=True) |
| with open(result_path, 'w') as f: |
| json.dump(evaluation_result, f, indent=4) |
| print(f"Test-set evaluation written to {result_path}") |