File size: 5,625 Bytes
14c9c2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import argparse
from typing import Dict, Any, List, Tuple
import numpy as np
from shinka.core import run_shinka_eval


def construct_text_feedback(all_df) -> str:
    """Build a feedback snippet from a problem that every run answered wrongly.

    Args:
        all_df: Sequence of pandas DataFrames, one per experiment run. Each
            frame is read through the columns ``id``, ``correct``,
            ``problem``, ``response``, ``llm_answer`` and ``true_answer``.

    Returns:
        A text block showing one randomly chosen problem that is marked
        incorrect in *all* runs, with the response, submitted answer and
        ground truth taken from the first run. Returns an empty string when
        no dataframes are given or when no problem was missed by every run.
    """
    # set.intersection(*[]) raises TypeError, so bail out on empty input.
    if not all_df:
        return ""

    extra_dfs = [df.sort_values("id").reset_index(drop=True) for df in all_df]

    # Ids whose "correct" flag is False in every run's dataframe.
    ids_all_incorrect = set.intersection(
        *[set(df.loc[df["correct"].eq(False), "id"]) for df in extra_dfs]
    )
    # DataFrame.sample(1) raises ValueError on an empty frame, so there is
    # nothing to report when at least one run solved each problem.
    if not ids_all_incorrect:
        return ""

    # Report the example as it appeared in the first run.
    first_df = extra_dfs[0]
    candidates = first_df[first_df["id"].isin(sorted(ids_all_incorrect))]
    false_answer = candidates.sample(1).iloc[0]

    text_feedback = f"# Example of an AIME problem that could not be answered correctly:\n\n {false_answer['problem']}"
    text_feedback += (
        f"\n\n# The Agent's wrong full response:\n\n{false_answer['response']}"
    )
    text_feedback += (
        f"\n\n# The Agent's submit answer:\n\n{false_answer['llm_answer']}"
    )
    text_feedback += (
        f"\n\n#The ground truth problem answer:\n\n{false_answer['true_answer']}"
    )
    return text_feedback


def default_aggregate_metrics(
    results: List[Tuple[float, float, int, int, Any]],
) -> Dict[str, Any]:
    """Aggregate per-run experiment results into a single metrics dict.

    Args:
        results: One tuple per experiment run, unpacked as
            ``(performance, cost, processed, num_llm_calls, df)``. The
            ``df`` entries are forwarded to ``construct_text_feedback``.

    Returns:
        A dict with the keys ``public``, ``private``, ``combined_score``
        and ``text_feedback``; for non-empty ``results`` it additionally
        carries ``extra_data`` holding the per-run dataframes. With no
        results, zeroed default metrics are returned.
    """
    if not results:
        return {
            "public": {
                "performance": 0.0,
                "cost": 0.0,
            },
            "private": {"processed": 0},
            "combined_score": 0.0,
            "text_feedback": "",
        }

    (
        all_performance,
        all_cost,
        all_processed,
        all_num_llm_calls,
        all_df,
    ) = zip(*results)
    total_processed = sum(all_processed)
    total_num_llm_calls = sum(all_num_llm_calls)
    # Guard against runs that processed nothing; dividing by zero here would
    # discard the metrics of the whole evaluation.
    if total_processed:
        avg_num_llm_calls = float(total_num_llm_calls / total_processed)
    else:
        avg_num_llm_calls = 0.0

    public_metrics = {
        "cost": float(np.mean(all_cost)),
        "avg_num_llm_calls": avg_num_llm_calls,
    }
    private_metrics = {
        "all_performance": all_performance,
        "all_cost": all_cost,
        # Summed across runs (not the per-run tuple), as callers have
        # always received.
        "all_processed": total_processed,
        "all_num_llm_calls": all_num_llm_calls,
    }
    # Per-run dataframes; presumably persisted (pickled) by the eval
    # framework -- confirm against run_shinka_eval.
    extra_data = {
        "df": all_df,
    }
    text_feedback = construct_text_feedback(all_df)
    return {
        "public": public_metrics,
        "private": private_metrics,
        "combined_score": float(np.mean(all_performance)),
        "extra_data": extra_data,
        "text_feedback": text_feedback,
    }


def get_experiment_kwargs(
    run_idx: int, model_name: str, year: int, max_calls: int
) -> Dict[str, Any]:
    """Return the keyword arguments passed to a single experiment run.

    ``run_idx`` is accepted but not used: the returned settings are the same
    for every run index.
    """
    return dict(model_name=model_name, year=year, max_calls=max_calls)


def main(
    program_path: str,
    results_dir: str,
    model_name: str,
    year: int,
    num_experiment_runs: int = 5,
    max_calls: int = 10,
) -> None:
    """Evaluate a program with ``run_shinka_eval`` and print the outcome.

    Prints the run configuration, invokes ``run_shinka_eval`` on the
    ``run_experiment`` function of ``program_path`` for
    ``num_experiment_runs`` runs, and then prints either the resulting
    metrics or the error together with the metrics that were returned.
    """
    from functools import partial

    # Echo the configuration before starting.
    for banner in (
        f"Evaluating program: {program_path}",
        f"Saving results to: {results_dir}",
        f"Using model: {model_name}",
        f"Using year: {year}",
        f"Using max calls: {max_calls}",
        f"Using num experiment runs: {num_experiment_runs}",
    ):
        print(banner)

    # Pre-bind the settings shared by all runs; the evaluator supplies the
    # remaining run index argument.
    shared_settings = {
        "model_name": model_name,
        "year": year,
        "max_calls": max_calls,
    }
    get_kwargs_for_run = partial(get_experiment_kwargs, **shared_settings)

    metrics, correct, error = run_shinka_eval(
        program_path=program_path,
        results_dir=results_dir,
        experiment_fn_name="run_experiment",
        num_runs=num_experiment_runs,
        get_experiment_kwargs=get_kwargs_for_run,
        aggregate_metrics_fn=default_aggregate_metrics,
    )

    # Only the two header lines differ between success and failure; the
    # metrics listing is printed in both cases.
    if correct:
        print("Evaluation completed successfully.")
        print("Metrics:")
    else:
        print(f"Evaluation failed: {error}")
        print("Default metrics stored due to error:")
    for key, value in metrics.items():
        print(f"  {key}: {value}")


if __name__ == "__main__":
    # Command-line options as (flag, type, default, help), registered in
    # this order.
    cli_options = (
        (
            "--program_path",
            str,
            "initial.py",
            "Path to the program to evaluate (must contain 'run_experiment')",
        ),
        (
            "--results_dir",
            str,
            "results",
            "Directory to save results and logs (metrics.json, correct.json)",
        ),
        (
            "--model_name",
            str,
            "gpt-4.1-nano",
            "Name of the model to use for evaluation",
        ),
        (
            "--year",
            int,
            2024,
            "Year of the AIME dataset to use for evaluation",
        ),
        (
            "--num_experiment_runs",
            int,
            3,
            "Number of experiment runs to perform",
        ),
        (
            "--max_calls",
            int,
            10,
            "Maximum number of calls to the LLM",
        ),
    )

    parser = argparse.ArgumentParser(
        description="Agent evaluation script using shinka.eval"
    )
    for flag, value_type, default_value, help_text in cli_options:
        parser.add_argument(
            flag, type=value_type, default=default_value, help=help_text
        )

    args = parser.parse_args()
    main(
        program_path=args.program_path,
        results_dir=args.results_dir,
        model_name=args.model_name,
        year=args.year,
        num_experiment_runs=args.num_experiment_runs,
        max_calls=args.max_calls,
    )