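"""Evaluate an AIME agent program with shinka's `run_shinka_eval` harness.

The evaluated program must expose a `run_experiment` function. Per-run
(performance, cost, processed, llm_calls, dataframe) results are aggregated,
and one problem that every run answered incorrectly is surfaced as text
feedback.
"""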
import argparse
from functools import partial
from typing import Any, Dict, List, Tuple

import numpy as np
from shinka.core import run_shinka_eval


def construct_text_feedback(all_df) -> str:
    """Build text feedback from one problem that every run answered incorrectly."""
    extra_dfs = [df.sort_values("id").reset_index(drop=True) for df in all_df]
    # Find ids where every run's dataframe has "correct" == False
    ids_all_incorrect = set.intersection(
        *[set(df.loc[df["correct"] == False, "id"]) for df in extra_dfs]
    )
    ids_all_incorrect = sorted(ids_all_incorrect)
    # Sample one of those problems from the first run's dataframe
    df0_selected = extra_dfs[0][extra_dfs[0]["id"].isin(ids_all_incorrect)]
    random_id = df0_selected.sample(1)["id"].values[0]
    false_answer = df0_selected[df0_selected["id"] == random_id]
    text_feedback = (
        "# Example of an AIME problem that could not be answered correctly:"
        f"\n\n {false_answer.iloc[0]['problem']}"
    )
    text_feedback += (
        f"\n\n# The Agent's wrong full response:\n\n{false_answer.iloc[0]['response']}"
    )
    text_feedback += (
        f"\n\n# The Agent's submitted answer:\n\n{false_answer.iloc[0]['llm_answer']}"
    )
    text_feedback += (
        f"\n\n# The ground truth problem answer:"
        f"\n\n{false_answer.iloc[0]['true_answer']}"
    )
    return text_feedback
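
# `construct_text_feedback` assumes each per-run dataframe (as produced by the
# evaluated program's `run_experiment`) carries at least the columns "id",
# "correct", "problem", "response", "llm_answer", and "true_answer"; the
# schema is inferred from the usage above.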


def default_aggregate_metrics(
    results: List[Tuple[float, float, int, int, Any]],
) -> Dict[str, Any]:
    """Default aggregator for per-run results."""
    if not results:
        public_metrics = {
            "performance": 0.0,
            "cost": 0.0,
        }
        private_metrics = {"processed": 0}
        metrics = {
            "public": public_metrics,
            "private": private_metrics,
            "combined_score": 0.0,
            "text_feedback": "",
        }
        return metrics
    (
        all_performance,
        all_cost,
        all_processed,
        all_num_llm_calls,
        all_df,
    ) = zip(*results)
    all_processed = sum(all_processed)
    total_num_llm_calls = sum(all_num_llm_calls)
    public_metrics = {
        "cost": float(np.mean(all_cost)),
        "avg_num_llm_calls": float(total_num_llm_calls / all_processed),
    }
    private_metrics = {
        "all_performance": all_performance,
        "all_cost": all_cost,
        "all_processed": all_processed,
        "all_num_llm_calls": all_num_llm_calls,
    }
    # Keep the per-run dataframes so the harness can persist them (e.g. as a pickle)
    extra_data = {
        "df": all_df,
    }
    text_feedback = construct_text_feedback(all_df)
    metrics = {
        "public": public_metrics,
        "private": private_metrics,
        "combined_score": float(np.mean(all_performance)),
        "extra_data": extra_data,
        "text_feedback": text_feedback,
    }
    return metrics
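
# Each per-run result tuple aggregated above is expected to be
# (performance, cost, num_processed, num_llm_calls, results_dataframe);
# the exact contract is set by the evaluated program's `run_experiment`.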


def get_experiment_kwargs(
    run_idx: int, model_name: str, year: int, max_calls: int
) -> Dict[str, Any]:
    """Provides keyword arguments for each experiment run."""
    return {"model_name": model_name, "year": year, "max_calls": max_calls}


def main(
    program_path: str,
    results_dir: str,
    model_name: str,
    year: int,
    num_experiment_runs: int = 5,
    max_calls: int = 10,
) -> None:
    """Runs the evaluation using the shinka.eval utility."""
    print(f"Evaluating program: {program_path}")
    print(f"Saving results to: {results_dir}")
    print(f"Using model: {model_name}")
    print(f"Using year: {year}")
    print(f"Using max calls: {max_calls}")
    print(f"Using num experiment runs: {num_experiment_runs}")
    # Bind the fixed experiment settings; run_idx is supplied per run
    get_kwargs_for_run = partial(
        get_experiment_kwargs,
        model_name=model_name,
        year=year,
        max_calls=max_calls,
    )
    metrics, correct, error = run_shinka_eval(
        program_path=program_path,
        results_dir=results_dir,
        experiment_fn_name="run_experiment",
        num_runs=num_experiment_runs,
        get_experiment_kwargs=get_kwargs_for_run,
        aggregate_metrics_fn=default_aggregate_metrics,
    )
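    # run_shinka_eval is expected to return (metrics, success_flag, error);
    # on failure, `metrics` presumably holds the aggregator's defaults
    # (see the "Default metrics stored" branch below).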
    if correct:
        print("Evaluation completed successfully.")
        print("Metrics:")
        for key, value in metrics.items():
            print(f" {key}: {value}")
    else:
        print(f"Evaluation failed: {error}")
        print("Default metrics stored due to error:")
        for key, value in metrics.items():
            print(f" {key}: {value}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Agent evaluation script using shinka.eval"
    )
    parser.add_argument(
        "--program_path",
        type=str,
        default="initial.py",
        help="Path to the program to evaluate (must contain 'run_experiment')",
    )
    parser.add_argument(
        "--results_dir",
        type=str,
        default="results",
        help="Directory to save results and logs (metrics.json, correct.json)",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        default="gpt-4.1-nano",
        help="Name of the model to use for evaluation",
    )
    parser.add_argument(
        "--year",
        type=int,
        default=2024,
        help="Year of the AIME dataset to use for evaluation",
    )
    parser.add_argument(
        "--num_experiment_runs",
        type=int,
        default=3,
        help="Number of experiment runs to perform",
    )
    parser.add_argument(
        "--max_calls",
        type=int,
        default=10,
        help="Maximum number of calls to the LLM",
    )
    args = parser.parse_args()
    main(
        args.program_path,
        args.results_dir,
        args.model_name,
        args.year,
        args.num_experiment_runs,
        args.max_calls,
    )
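
# Example invocation (assuming this script is saved as evaluate.py):
#   python evaluate.py --program_path initial.py --results_dir results \
#       --model_name gpt-4.1-nano --year 2024 --num_experiment_runs 3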