File size: 7,041 Bytes
b0e88cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import importlib.util
import os
import json
import numpy as np
from evaluator import pass_at_2_accuracy_multi_test, extract_failure_artifacts

# Run configuration, read once from the environment at import time.
# ARC_TASK_FILE selects which arc-agi_<name>_{challenges,solutions}.json pair is used.
TASK_FILE = os.environ.get("ARC_TASK_FILE", "training")
# Index of the task inside the challenges file. NOTE: a value coming from the
# environment is a string, while the fallback is the int 0; consumers convert with int().
TASK_NUM = os.environ.get("TASK_NUM", 0)
# Output directory of a run; best_program.py is looked up under OUTS_DIR/best/.
OUTS_DIR = os.environ.get("OUTS_DIR", "")
# Optional override: a checkpoint directory (e.g. outputs/evaluation_task_0/checkpoints/checkpoint_10)
# whose best_program.py should be evaluated on the test set instead of OUTS_DIR/best/.
PROGRAM_DIR = os.environ.get("PROGRAM_DIR", "")


def _program_path():
    """Return the location of best_program.py.

    A non-empty PROGRAM_DIR takes precedence; otherwise the file is
    expected under OUTS_DIR/best/.
    """
    base_dir = PROGRAM_DIR if PROGRAM_DIR else os.path.join(OUTS_DIR, "best")
    return os.path.join(base_dir, "best_program.py")


def _result_path():
    """Return the file path for post_evolution_evaluation_result.json.

    The result is placed next to the evaluated program: inside PROGRAM_DIR
    when that is set, else inside OUTS_DIR/best/.
    """
    base_dir = PROGRAM_DIR if PROGRAM_DIR else os.path.join(OUTS_DIR, "best")
    return os.path.join(base_dir, "post_evolution_evaluation_result.json")


def load_program_module():
    """Import best_program.py from its resolved path and return the module object.

    The file is executed as a module named "program_module".

    Raises:
        FileNotFoundError: if the resolved path is not an existing file.
    """
    program_file = _program_path()
    if os.path.isfile(program_file):
        module_spec = importlib.util.spec_from_file_location("program_module", program_file)
        loaded_module = importlib.util.module_from_spec(module_spec)
        # Running the loader executes the program's top-level code.
        module_spec.loader.exec_module(loaded_module)
        return loaded_module

    raise FileNotFoundError(f"Program not found: {program_file}. Set PROGRAM_DIR to a checkpoint dir (e.g. .../checkpoints/checkpoint_10) or ensure OUTS_DIR/best/best_program.py exists.")

# Entry points every candidate program must expose, in the order they are called.
_ATTEMPT_FUNCTIONS = ("transform_grid_attempt_1", "transform_grid_attempt_2")


def _failure_result(error_type, error_message, suggestion, metric_error):
    """Build the zero-score result dict returned when a validation step fails."""
    return dict(
        metrics={
            "runs_successfully": 0.0,
            "combined_score": 0.0,
            "error": metric_error,
        },
        artifacts={
            "error_type": error_type,
            "error_message": error_message,
            "suggestion": suggestion,
        },
    )


def _load_task_and_solution():
    """Read the challenge and solution JSON files and return (task_id, task, solution) for TASK_NUM."""
    data_root = os.getenv("DATA_ROOT")
    if not data_root:
        # Default to a "data" directory next to this script.
        data_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    challenge_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_challenges.json")
    solution_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_solutions.json")

    with open(challenge_path, 'r', encoding="utf-8") as f:
        tasks = json.load(f)
    with open(solution_path, 'r', encoding="utf-8") as f:
        solutions = json.load(f)

    # TASK_NUM is a positional index into the challenges file's key order.
    task_id = list(tasks.keys())[int(TASK_NUM)]
    return task_id, tasks[task_id], solutions[task_id]


def evaluate():
    """Evaluate the loaded program on the test examples of the selected ARC task.

    The program must define 'transform_grid_attempt_1' and
    'transform_grid_attempt_2'; each is called on every test input and must
    return a numpy array. The two attempts per example are scored with
    pass_at_2_accuracy_multi_test.

    Returns:
        dict with two keys:
          - "metrics": "runs_successfully", "combined_score" (mean of the
            per-example pass@2 values, 0.0 when there are no test examples),
            plus per-example "test_example_<i>_pass_at_2" and
            "test_example_<i>_attempt_<k>" entries. On a validation failure it
            holds zero scores and an "error" message instead.
          - "artifacts": failure details, keyed by example name for failed
            examples, or a single error description on validation failure.

    Raises:
        ValueError: if the number of test inputs and solution outputs differ.
        Exceptions from loading the program or reading the data files propagate.
    """
    program_module = load_program_module()
    if not all(hasattr(program_module, name) for name in _ATTEMPT_FUNCTIONS):
        print("Stage 1 validation failed: Program must define 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.")
        return _failure_result(
            "MissingFunction",
            "Stage 1: Program is missing required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.",
            "Make sure your program includes a functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that take as an argument a 2D numpy array and return a 2D numpy array.",
            "Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions",
        )

    # Load ARC tasks
    task_id, task, solution = _load_task_and_solution()

    # Sanity check: test inputs and solutions must align (same task, same order)
    if len(task["test"]) != len(solution):
        raise ValueError(
            f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs "
            f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json "
            f"and arc-agi_{TASK_FILE}_solutions.json were generated together (convert_arc_agi2_data.py)."
        )

    test_inputs = [np.array(inp["input"]) for inp in task['test']]
    test_gts = [np.array(gt) for gt in solution]

    test_attempts = []
    for inp in test_inputs:
        attempts = []
        for fn_name in _ATTEMPT_FUNCTIONS:
            attempt = getattr(program_module, fn_name)(inp)
            if not isinstance(attempt, np.ndarray):
                # Abort on the first invalid return value; no partial scoring.
                print(f"{fn_name} did not return a numpy array")
                return _failure_result(
                    "InvalidReturnType",
                    f"Stage 1: {fn_name} did not return a numpy array.",
                    f"Make sure your {fn_name} function returns a 2D numpy array.",
                    f"{fn_name} did not return a numpy array",
                )
            attempts.append(attempt)
        test_attempts.append(attempts)

    pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(test_attempts, test_gts)
    # Guard against a task with no test examples, which would otherwise divide by zero.
    combined_score = sum(pass_at_2_test) / len(pass_at_2_test) if len(pass_at_2_test) else 0.0
    metrics = {
        "runs_successfully": 1.0,
        "combined_score": combined_score,
    }
    error_artifacts = {}
    for i, (test_pass, test_diagnostics) in enumerate(zip(pass_at_2_test, test_diagnostics_list)):
        example_name = f"test_example_{i}"
        metrics[f"{example_name}_pass_at_2"] = test_pass
        for attempt in test_diagnostics:
            metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"]
        if test_pass == 0:
            # test_diagnostics is {0: {...}, 1: {...}}; extract_failure_artifacts expects one attempt's dict
            first_failing = next(
                (test_diagnostics[a] for a in test_diagnostics if not test_diagnostics[a]["perfect_match"]),
                test_diagnostics[0],
            )
            error_artifacts[example_name] = extract_failure_artifacts(first_failing)

    return dict(
        metrics=metrics,
        artifacts=error_artifacts
    )
    
if __name__ == "__main__":
    # evaluate() runs before any output directory or file is created.
    outcome = evaluate()
    destination = _result_path()
    target_dir = os.path.dirname(destination)
    os.makedirs(target_dir, exist_ok=True)
    with open(destination, 'w') as out_file:
        json.dump(outcome, out_file, indent=4)
    print(f"Test-set evaluation written to {destination}")