import argparse
from functools import partial
from typing import Any, Dict, List, Tuple

import numpy as np

from shinka.core import run_shinka_eval


def construct_text_feedback(all_df) -> str:
    """Build a feedback message from one problem that every run got wrong.

    Args:
        all_df: Sequence of per-run result dataframes. Each frame is read for
            the columns "id", "correct", "problem", "response", "llm_answer"
            and "true_answer".

    Returns:
        A markdown-like string showing a randomly chosen problem that was
        answered incorrectly in every run, together with the response, the
        submitted answer and the ground truth taken from the first run.
        Returns an empty string when there are no dataframes or when no
        problem was answered incorrectly in all runs.
    """
    if not all_df:
        # set.intersection() needs at least one set; nothing to report anyway.
        return ""

    extra_dfs = [df.sort_values("id").reset_index(drop=True) for df in all_df]

    # Ids for which every dataframe has "correct" == False. The elementwise
    # comparison is intentional (a pandas mask, not an identity check).
    ids_all_incorrect = sorted(
        set.intersection(
            *(
                set(df.loc[df["correct"] == False, "id"])  # noqa: E712
                for df in extra_dfs
            )
        )
    )
    if not ids_all_incorrect:
        # Sampling from an empty frame would raise ValueError.
        return ""

    # Pick the example from the first run's dataframe.
    df0_selected = extra_dfs[0][extra_dfs[0]["id"].isin(ids_all_incorrect)]
    random_id = df0_selected.sample(1)["id"].values[0]
    false_answer = df0_selected[df0_selected["id"] == random_id]
    row = false_answer.iloc[0]

    text_feedback = f"# Example of an AIME problem that could not be answered correctly:\n\n {row['problem']}"
    text_feedback += f"\n\n# The Agent's wrong full response:\n\n{row['response']}"
    text_feedback += f"\n\n# The Agent's submit answer:\n\n{row['llm_answer']}"
    text_feedback += f"\n\n#The ground truth problem answer:\n\n{row['true_answer']}"
    return text_feedback


def default_aggregate_metrics(
    results: List[Tuple[float, float, int, int, Any]],
) -> Dict[str, Any]:
    """Aggregate per-run results into a single metrics dictionary.

    Args:
        results: One tuple per experiment run, unpacked as
            (performance, cost, processed, num_llm_calls, df).

    Returns:
        A dict with "public", "private", "combined_score" and
        "text_feedback" entries. When ``results`` is non-empty it also holds
        "extra_data" containing the per-run dataframes. For empty ``results``
        zeroed default metrics are returned.
    """
    if not results:
        return {
            "public": {"performance": 0.0, "cost": 0.0},
            "private": {"processed": 0},
            "combined_score": 0.0,
            "text_feedback": "",
        }

    (
        all_performance,
        all_cost,
        all_processed,
        all_num_llm_calls,
        all_df,
    ) = zip(*results)

    all_processed = sum(all_processed)
    total_num_llm_calls = sum(all_num_llm_calls)

    # Guard against runs that processed nothing to avoid a division by zero.
    avg_num_llm_calls = (
        float(total_num_llm_calls / all_processed) if all_processed else 0.0
    )

    public_metrics = {
        "cost": float(np.mean(all_cost)),
        "avg_num_llm_calls": avg_num_llm_calls,
    }
    private_metrics = {
        "all_performance": all_performance,
        "all_cost": all_cost,
        "all_processed": all_processed,
        "all_num_llm_calls": all_num_llm_calls,
    }

    # Store extra data as pickle file
    extra_data = {
        "df": all_df,
    }
    text_feedback = construct_text_feedback(all_df)

    return {
        "public": public_metrics,
        "private": private_metrics,
        "combined_score": float(np.mean(all_performance)),
        "extra_data": extra_data,
        "text_feedback": text_feedback,
    }


def get_experiment_kwargs(
    run_idx: int, model_name: str, year: int, max_calls: int
) -> Dict[str, Any]:
    """Provide keyword arguments for each experiment run.

    ``run_idx`` is accepted but not used: every run receives the same
    ``model_name``, ``year`` and ``max_calls``.
    """
    return {"model_name": model_name, "year": year, "max_calls": max_calls}


def main(
    program_path: str,
    results_dir: str,
    model_name: str,
    year: int,
    num_experiment_runs: int = 5,
    max_calls: int = 10,
) -> None:
    """Run the evaluation via ``run_shinka_eval`` and print the outcome.

    Args:
        program_path: Path to the program to evaluate.
        results_dir: Directory handed to ``run_shinka_eval`` for results.
        model_name: Model name forwarded to each experiment run.
        year: Dataset year forwarded to each experiment run.
        num_experiment_runs: Number of experiment runs to perform.
        max_calls: Maximum number of LLM calls forwarded to each run.
    """
    print(f"Evaluating program: {program_path}")
    print(f"Saving results to: {results_dir}")
    print(f"Using model: {model_name}")
    print(f"Using year: {year}")
    print(f"Using max calls: {max_calls}")
    print(f"Using num experiment runs: {num_experiment_runs}")

    # Pre-bind the run-independent settings; the remaining ``run_idx``
    # argument is presumably supplied by run_shinka_eval per run.
    get_kwargs_for_run = partial(
        get_experiment_kwargs,
        model_name=model_name,
        year=year,
        max_calls=max_calls,
    )

    metrics, correct, error = run_shinka_eval(
        program_path=program_path,
        results_dir=results_dir,
        experiment_fn_name="run_experiment",
        num_runs=num_experiment_runs,
        get_experiment_kwargs=get_kwargs_for_run,
        aggregate_metrics_fn=default_aggregate_metrics,
    )

    if correct:
        print("Evaluation completed successfully.")
        print("Metrics:")
    else:
        print(f"Evaluation failed: {error}")
        print("Default metrics stored due to error:")
    # The metrics are listed the same way for success and failure.
    for key, value in metrics.items():
        print(f" {key}: {value}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Agent evaluation script using shinka.eval"
    )
    parser.add_argument(
        "--program_path",
        type=str,
        default="initial.py",
        help="Path to the program to evaluate (must contain 'run_experiment')",
    )
    parser.add_argument(
        "--results_dir",
        type=str,
        default="results",
        help="Directory to save results and logs (metrics.json, correct.json)",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        default="gpt-4.1-nano",
        help="Name of the model to use for evaluation",
    )
    parser.add_argument(
        "--year",
        type=int,
        default=2024,
        help="Year of the AIME dataset to use for evaluation",
    )
    parser.add_argument(
        "--num_experiment_runs",
        type=int,
        default=3,
        help="Number of experiment runs to perform",
    )
    parser.add_argument(
        "--max_calls",
        type=int,
        default=10,
        help="Maximum number of calls to the LLM",
    )
    args = parser.parse_args()
    main(
        args.program_path,
        args.results_dir,
        args.model_name,
        args.year,
        args.num_experiment_runs,
        args.max_calls,
    )