sky2 / benchmarks /arc_benchmark /post_discovery_eval.py
JustinTX's picture
Add files using upload-large-folder tool
b0e88cf verified
import importlib.util
import os
import json
import numpy as np
from evaluator import pass_at_2_accuracy_multi_test, extract_failure_artifacts
# --- Runtime configuration, read once from the environment at import time ---

# Dataset split name; selects arc-agi_<TASK_FILE>_challenges.json / _solutions.json.
TASK_FILE = os.environ.get("ARC_TASK_FILE", "training")
# Position of the task inside the challenges file. This is a str when it comes
# from the environment and the int 0 otherwise, so consumers convert with int().
TASK_NUM = os.environ.get("TASK_NUM", 0)
# Output root; the default program location is OUTS_DIR/best/best_program.py.
OUTS_DIR = os.environ.get("OUTS_DIR", "")
# Optional: path to a checkpoint dir (e.g. outputs/evaluation_task_0/checkpoints/checkpoint_10)
# whose best_program.py should be evaluated on the test set instead of OUTS_DIR/best/.
PROGRAM_DIR = os.environ.get("PROGRAM_DIR", "")
def _program_path():
    """Return the path of the best_program.py to evaluate.

    The file is looked up directly inside PROGRAM_DIR when that variable is
    non-empty; otherwise it is looked up inside OUTS_DIR/best/.
    """
    base_dir = PROGRAM_DIR if PROGRAM_DIR else os.path.join(OUTS_DIR, "best")
    return os.path.join(base_dir, "best_program.py")
def _result_path():
    """Return the path where post_evolution_evaluation_result.json is written.

    The result is placed next to the evaluated program: in PROGRAM_DIR when
    that variable is non-empty, otherwise in OUTS_DIR/best/.
    """
    target_dir = PROGRAM_DIR if PROGRAM_DIR else os.path.join(OUTS_DIR, "best")
    return os.path.join(target_dir, "post_evolution_evaluation_result.json")
def load_program_module():
    """Import best_program.py from the resolved program path and return it.

    Returns:
        The module object created from the file, registered under the spec
        name "program_module".

    Raises:
        FileNotFoundError: if the path from _program_path() is not a file.
    """
    program_file = _program_path()
    if os.path.isfile(program_file):
        module_spec = importlib.util.spec_from_file_location("program_module", program_file)
        loaded_module = importlib.util.module_from_spec(module_spec)
        # exec_module runs the file's top-level code inside the new module object.
        module_spec.loader.exec_module(loaded_module)
        return loaded_module
    raise FileNotFoundError(f"Program not found: {program_file}. Set PROGRAM_DIR to a checkpoint dir (e.g. .../checkpoints/checkpoint_10) or ensure OUTS_DIR/best/best_program.py exists.")
# Entry points every evaluated program must expose, in the order they are called.
_ATTEMPT_FUNCTION_NAMES = ("transform_grid_attempt_1", "transform_grid_attempt_2")


def _failure_result(error_type, error_message, suggestion, error):
    """Build the result dict returned when the program fails validation."""
    return dict(
        metrics={
            "runs_successfully": 0.0,
            "combined_score": 0.0,
            "error": error,
        },
        artifacts={
            "error_type": error_type,
            "error_message": error_message,
            "suggestion": suggestion,
        },
    )


def _load_task_and_solution():
    """Read the challenge/solution JSON files and select the task at index TASK_NUM.

    Returns:
        A (task_id, task, solution) tuple.

    Raises:
        IndexError: if TASK_NUM does not index a task in the challenges file.
    """
    data_root = os.getenv("DATA_ROOT")
    if not data_root:
        # Fall back to the "data" directory that sits next to this script.
        data_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    challenge_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_challenges.json")
    solution_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_solutions.json")

    with open(challenge_path, "r", encoding="utf-8") as f:
        tasks = json.load(f)
    with open(solution_path, "r", encoding="utf-8") as f:
        solutions = json.load(f)

    task_ids = list(tasks)
    # TASK_NUM is a str when it comes from the environment, hence the int().
    task_index = int(TASK_NUM)
    try:
        task_id = task_ids[task_index]
    except IndexError as err:
        raise IndexError(
            f"TASK_NUM={task_index} is out of range: {challenge_path} contains {len(task_ids)} tasks."
        ) from err
    return task_id, tasks[task_id], solutions[task_id]


def evaluate():
    """Run the loaded program on the selected task's test inputs and score it.

    Both attempt functions are called on every test input, and the collected
    attempts are scored with pass_at_2_accuracy_multi_test.

    Returns:
        A dict with two keys:
        - "metrics": "runs_successfully", "combined_score" (mean of the
          per-example pass values, 0.0 when there are no examples) and, on
          success, per-example "test_example_<i>_pass_at_2" and
          "test_example_<i>_attempt_<k>" entries. On a validation failure it
          holds an "error" string instead.
        - "artifacts": failure details, keyed by example name on a normal
          run, or the error description on a validation failure.

    Raises:
        FileNotFoundError: if the program file does not exist.
        IndexError: if TASK_NUM is out of range for the challenges file.
        ValueError: if the numbers of test inputs and solutions differ.
    """
    program_module = load_program_module()

    if not all(hasattr(program_module, name) for name in _ATTEMPT_FUNCTION_NAMES):
        print("Stage 1 validation failed: Program must define 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.")
        return _failure_result(
            error_type="MissingFunction",
            error_message="Stage 1: Program is missing required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.",
            suggestion="Make sure your program includes a functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that take as an argument a 2D numpy array and return a 2D numpy array.",
            error="Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions",
        )

    # Load ARC tasks
    task_id, task, solution = _load_task_and_solution()

    # Sanity check: test inputs and solutions must align (same task, same order)
    if len(task["test"]) != len(solution):
        raise ValueError(
            f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs "
            f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json "
            f"and arc-agi_{TASK_FILE}_solutions.json were generated together (convert_arc_agi2_data.py)."
        )

    test_inputs = [np.array(example["input"]) for example in task["test"]]
    test_gts = [np.array(gt) for gt in solution]

    test_attempts = []
    for grid in test_inputs:
        attempts = []
        for name in _ATTEMPT_FUNCTION_NAMES:
            attempt = getattr(program_module, name)(grid)
            if not isinstance(attempt, np.ndarray):
                # Abort the whole evaluation on the first malformed return value.
                print(f"{name} did not return a numpy array")
                return _failure_result(
                    error_type="InvalidReturnType",
                    error_message=f"Stage 1: {name} did not return a numpy array.",
                    suggestion=f"Make sure your {name} function returns a 2D numpy array.",
                    error=f"{name} did not return a numpy array",
                )
            attempts.append(attempt)
        test_attempts.append(attempts)

    pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(test_attempts, test_gts)

    metrics = {
        "runs_successfully": 1.0,
        # Guard against a task with no test examples (would divide by zero).
        "combined_score": sum(pass_at_2_test) / len(pass_at_2_test) if len(pass_at_2_test) else 0.0,
    }
    error_artifacts = {}
    for i, (test_pass, test_diagnostics) in enumerate(zip(pass_at_2_test, test_diagnostics_list)):
        example_name = f"test_example_{i}"
        metrics[f"{example_name}_pass_at_2"] = test_pass
        for attempt in test_diagnostics:
            metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"]
        if test_pass == 0:
            # test_diagnostics is {0: {...}, 1: {...}}; extract_failure_artifacts expects one attempt's dict
            first_failing = next(
                (test_diagnostics[a] for a in test_diagnostics if not test_diagnostics[a]["perfect_match"]),
                test_diagnostics[0],
            )
            error_artifacts[f"{example_name}"] = extract_failure_artifacts(first_failing)

    return dict(
        metrics=metrics,
        artifacts=error_artifacts,
    )
def _json_default(value):
    """Fallback for json.dumps: turn numpy scalars/arrays into plain Python values.

    NOTE(review): the metrics come from evaluator helpers that work on numpy
    arrays, so they may contain numpy scalars (e.g. np.bool_) that the json
    module rejects -- confirm against evaluator.py. Any other unsupported
    type still raises TypeError, as json would on its own.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


if __name__ == "__main__":
    evaluation_result = evaluate()
    result_path = _result_path()
    # Serialize before opening the file so that a serialization error cannot
    # leave a truncated or partially written result file behind.
    serialized_result = json.dumps(evaluation_result, indent=4, default=_json_default)
    os.makedirs(os.path.dirname(result_path), exist_ok=True)
    with open(result_path, "w", encoding="utf-8") as f:
        f.write(serialized_result)
    print(f"Test-set evaluation written to {result_path}")