# File size: 7,041 Bytes | b0e88cf
import importlib.util
import os
import json
import numpy as np
from evaluator import pass_at_2_accuracy_multi_test, extract_failure_artifacts
# Run-time configuration, read once from the environment at import time.

# Dataset split name; interpolated into the arc-agi_<split>_*.json file names.
TASK_FILE = os.environ.get("ARC_TASK_FILE", "training")
# Position of the task within the challenges file (a str when taken from the
# environment, the int 0 otherwise; it is converted with int() where used).
TASK_NUM = os.environ.get("TASK_NUM", 0)
# Output directory of the run; the program is looked up under OUTS_DIR/best/.
OUTS_DIR = os.environ.get("OUTS_DIR", "")
# Optional checkpoint directory (e.g. outputs/evaluation_task_0/checkpoints/checkpoint_10).
# When set, its best_program.py is evaluated on the test set instead.
PROGRAM_DIR = os.environ.get("PROGRAM_DIR", "")
def _program_path():
    """Return the location of best_program.py.

    PROGRAM_DIR takes precedence when it is non-empty; otherwise the file is
    expected under OUTS_DIR/best/.
    """
    base_dir = PROGRAM_DIR if PROGRAM_DIR else os.path.join(OUTS_DIR, "best")
    return os.path.join(base_dir, "best_program.py")
def _result_path():
    """Return the file that post_evolution_evaluation_result.json is written to.

    The result goes into PROGRAM_DIR when it is non-empty, otherwise into
    OUTS_DIR/best/.
    """
    base_dir = PROGRAM_DIR if PROGRAM_DIR else os.path.join(OUTS_DIR, "best")
    return os.path.join(base_dir, "post_evolution_evaluation_result.json")
def load_program_module():
    """Import best_program.py from its resolved location and return the module.

    Raises:
        FileNotFoundError: if no file exists at the resolved program path.
    """
    program_file = _program_path()
    if os.path.isfile(program_file):
        module_spec = importlib.util.spec_from_file_location("program_module", program_file)
        loaded_module = importlib.util.module_from_spec(module_spec)
        # Executing the spec runs the program file's top-level code.
        module_spec.loader.exec_module(loaded_module)
        return loaded_module
    raise FileNotFoundError(f"Program not found: {program_file}. Set PROGRAM_DIR to a checkpoint dir (e.g. .../checkpoints/checkpoint_10) or ensure OUTS_DIR/best/best_program.py exists.")
def _stage1_failure(error_type, error, error_message, suggestion):
    """Build the zero-score result returned when the program fails validation."""
    return dict(
        metrics={
            "runs_successfully": 0.0,
            "combined_score": 0.0,
            "error": error,
        },
        artifacts={
            "error_type": error_type,
            "error_message": error_message,
            "suggestion": suggestion,
        },
    )


def evaluate():
    """Evaluate the loaded program on the test examples of the selected task.

    The program must expose ``transform_grid_attempt_1`` and
    ``transform_grid_attempt_2``. Both are called on every test input of the
    task selected by TASK_FILE / TASK_NUM, and the attempt pairs are scored
    with ``pass_at_2_accuracy_multi_test``.

    Returns:
        A dict with two keys: ``metrics`` (``runs_successfully``,
        ``combined_score`` and per-example entries, or an ``error`` entry on
        validation failure) and ``artifacts`` (failure details per example,
        or a description of the validation error).

    Raises:
        FileNotFoundError: if the program file does not exist.
        ValueError: if the number of test inputs and solutions differ.
    """
    program_module = load_program_module()

    attempt_names = ("transform_grid_attempt_1", "transform_grid_attempt_2")
    if not all(hasattr(program_module, name) for name in attempt_names):
        print("Stage 1 validation failed: Program must define 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.")
        return _stage1_failure(
            error_type="MissingFunction",
            error="Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions",
            error_message="Stage 1: Program is missing required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.",
            suggestion="Make sure your program includes a functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that take as an argument a 2D numpy array and return a 2D numpy array.",
        )

    # Load ARC tasks; DATA_ROOT falls back to the "data" directory next to this file.
    data_root = os.getenv("DATA_ROOT")
    if not data_root:
        data_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    challenge_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_challenges.json")
    solution_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_solutions.json")

    # Explicit UTF-8 so parsing does not depend on the locale's default encoding.
    with open(challenge_path, 'r', encoding="utf-8") as f:
        tasks = json.load(f)
    with open(solution_path, 'r', encoding="utf-8") as f:
        solutions = json.load(f)

    # TASK_NUM is a positional index into the challenge file's key order.
    task_id = list(tasks.keys())[int(TASK_NUM)]
    solution = solutions[task_id]
    task = tasks[task_id]

    # Sanity check: test inputs and solutions must align (same task, same order)
    if len(task["test"]) != len(solution):
        raise ValueError(
            f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs "
            f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json "
            f"and arc-agi_{TASK_FILE}_solutions.json were generated together (convert_arc_agi2_data.py)."
        )

    test_inputs = [np.array(inp["input"]) for inp in task['test']]
    test_gts = [np.array(gt) for gt in solution]

    test_attempts = []
    for inp in test_inputs:
        attempts = []
        # Attempts run in order; the first non-array result aborts the evaluation.
        for name in attempt_names:
            attempt = getattr(program_module, name)(inp)
            if not isinstance(attempt, np.ndarray):
                print(f"{name} did not return a numpy array")
                return _stage1_failure(
                    error_type="InvalidReturnType",
                    error=f"{name} did not return a numpy array",
                    error_message=f"Stage 1: {name} did not return a numpy array.",
                    suggestion=f"Make sure your {name} function returns a 2D numpy array.",
                )
            attempts.append(attempt)
        test_attempts.append(attempts)

    pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(test_attempts, test_gts)

    metrics = {
        "runs_successfully": 1.0,
        # A task without test examples scores 0.0 instead of dividing by zero.
        "combined_score": sum(pass_at_2_test) / len(pass_at_2_test) if len(pass_at_2_test) else 0.0,
    }
    error_artifacts = {}
    for i, (test_pass, test_diagnostics) in enumerate(zip(pass_at_2_test, test_diagnostics_list)):
        example_name = f"test_example_{i}"
        metrics[f"{example_name}_pass_at_2"] = test_pass
        for attempt in test_diagnostics:
            metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"]
        if test_pass == 0:
            # test_diagnostics is {0: {...}, 1: {...}}; extract_failure_artifacts expects one attempt's dict
            first_failing = next(
                (test_diagnostics[a] for a in test_diagnostics if not test_diagnostics[a]["perfect_match"]),
                test_diagnostics[0],
            )
            error_artifacts[f"{example_name}"] = extract_failure_artifacts(first_failing)

    return dict(
        metrics=metrics,
        artifacts=error_artifacts
    )
if __name__ == "__main__":
    # Script entry point: evaluate, then persist the result as JSON.
    evaluation_result = evaluate()

    output_file = _result_path()
    output_dir = os.path.dirname(output_file)
    # The target directory may not exist yet, so create it on demand.
    os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w') as out:
        json.dump(evaluation_result, out, indent=4)

    print(f"Test-set evaluation written to {output_file}")