# File size: 5,720 Bytes | 517cbd2
import argparse
import json
import os
import traceback
from pathlib import Path
import ale_bench
from ale_bench.result import CaseResult, JudgeResult, Result
def result_feedback(result: Result) -> CaseResult:
    """Pick the single case result that represents ``result``.

    If the overall verdict is ``JudgeResult.ACCEPTED`` the first case is
    returned. Otherwise the first case whose own verdict equals the overall
    verdict is returned; when no case matches, the first case is used as the
    fallback.
    """
    overall = result.overall_judge_result
    if overall != JudgeResult.ACCEPTED:
        # Surface the earliest case that explains the overall verdict.
        for candidate in result.case_results:
            if candidate.judge_result == overall:
                return candidate
    # Accepted overall, or nothing matched: fall back to the first case.
    return result.case_results[0]
def main(program_path: str, problem_id: str) -> dict:
    """Run one private evaluation of a program through ``ale_bench``.

    Starts an ``ale_bench`` session for ``problem_id``, reads the program
    source, strips the ``# EVOLVE-BLOCK-START`` / ``# EVOLVE-BLOCK-END``
    markers, submits the code as ``cpp20`` to ``session.private_eval`` and
    prints a summary of the outcome.

    Args:
        program_path: Path of the source file to evaluate.
        problem_id: Problem identifier forwarded to ``ale_bench.start``.

    Returns:
        On success, ``{"private": {...}}`` holding ``private_rank``,
        ``private_performance``, ``private_score``,
        ``num_private_passed_cases`` and ``num_private_failed_cases``.
        If any exception is raised during evaluation, the error and its
        traceback are printed and a fallback dict is returned instead: the
        same ``"private"`` keys set to zero, plus ``"combined_score": 0.0``
        and ``"public": {"judge_result": "REJECTED"}``.
    """
    print(f"Problem ID: {problem_id}")
    print(f"Evaluating program: {program_path}")
    # Bound before the try so the finally clause can tell whether a session
    # was actually created.
    session = None
    try:
        session = ale_bench.start(
            problem_id=problem_id,
            lite_version=False,
            num_workers=13,
        )
        source = Path(program_path).read_text()
        code = (
            source.replace("# EVOLVE-BLOCK-START", "")
            .replace("# EVOLVE-BLOCK-END", "")
            .strip()
        )
        private_result, final_rank, final_performance = session.private_eval(
            code, code_language="cpp20",
        )
        # Round-trip through JSON so verdicts can be compared as plain
        # strings; nothing is written to disk here.
        private_json = json.loads(private_result.model_dump_json())
        case_results = private_json["case_results"]
        num_private_cases = len(case_results)
        private_passed_cases = sum(
            1 for case in case_results if case["judge_result"] == "ACCEPTED"
        )
        private_failed_cases = num_private_cases - private_passed_cases
        print(
            f"Passed {private_passed_cases} cases, failed {private_failed_cases} cases out of {num_private_cases}"
        )
        total_score = private_result.overall_absolute_score
        # Guard the purely informational mean so an empty case list does not
        # turn a completed evaluation into the failure fallback.
        mean_score = total_score / num_private_cases if num_private_cases else 0.0
        print(
            f"Final Private Score: {total_score} - Mean Score: {mean_score}"
        )
        print(f"Rank: {final_rank}, Performance: {final_performance}")
        metrics = {
            "private": {
                "private_rank": final_rank,
                "private_performance": final_performance,
                "private_score": total_score,
                "num_private_passed_cases": private_passed_cases,
                "num_private_failed_cases": private_failed_cases,
            },
        }
        # Monitor resource consumption
        print(f"Current Resource Usage: {session.current_resource_usage}")
        print(f"Remaining Resources: {session.remaining_resource_usage}")
        return metrics
    except Exception as e:
        print(f"Evaluation failed completely: {str(e)}")
        print(traceback.format_exc())
        return {
            "combined_score": 0.0,
            "public": {"judge_result": "REJECTED"},
            "private": {
                "private_rank": 0,
                "private_performance": 0,
                "private_score": 0,
                "num_private_passed_cases": 0,
                "num_private_failed_cases": 0,
            },
        }
    finally:
        # Release the session on every path; this function may be called
        # several times per process. Closing is best-effort so a cleanup
        # problem never discards metrics that were already computed.
        if session is not None:
            try:
                session.close()
            except Exception as close_error:
                print(f"Failed to close session: {close_error}")
# Script entry point: evaluate the program several times with ``main`` and
# print the per-run values and their averages.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Agent evaluation script using shinka.eval"
    )
    parser.add_argument(
        "--program-path",
        type=str,
        default="program.cpp",
        help="Path to the program to evaluate",
    )
    parser.add_argument(
        "--problem-id",
        type=str,
        default="ahc025",
        help="Problem ID",
    )
    parser.add_argument(
        "--num-runs",
        type=int,
        default=3,
        help="Number of evaluation runs to average over",
    )
    parsed_args = parser.parse_args()
    num_runs = parsed_args.num_runs
    if num_runs < 1:
        # The averages below divide by the number of runs.
        parser.error("--num-runs must be at least 1")

    separator = "=" * 60

    # Collect results from every run
    all_results = []
    for i in range(num_runs):
        print(f"\n{separator}")
        print(f"Running evaluation {i+1} of {num_runs}")
        print(separator)
        result = main(
            parsed_args.program_path,
            parsed_args.problem_id,
        )
        all_results.append(result)
        print(separator)

    # Compute averages.
    # NOTE: a run that failed inside main() reports zeros for every private
    # metric (including rank 0), and those zeros are averaged in as-is.
    print(f"\n{separator}")
    print(f"FINAL AVERAGED RESULTS ACROSS {num_runs} RUNS")
    print(separator)
    private_runs = [run["private"] for run in all_results]
    private_scores = [p["private_score"] for p in private_runs]
    private_performances = [p["private_performance"] for p in private_runs]
    private_ranks = [p["private_rank"] for p in private_runs]
    passed_cases = [p["num_private_passed_cases"] for p in private_runs]
    failed_cases = [p["num_private_failed_cases"] for p in private_runs]

    avg_private_score = sum(private_scores) / len(private_scores)
    avg_private_performance = sum(private_performances) / len(private_performances)
    avg_private_rank = sum(private_ranks) / len(private_ranks)
    avg_passed_cases = sum(passed_cases) / len(passed_cases)
    avg_failed_cases = sum(failed_cases) / len(failed_cases)

    print(f"\nAverage Private Score: {avg_private_score:.2f}")
    print(f" Individual scores: {private_scores}")
    print(f"\nAverage Private Performance: {avg_private_performance:.4f}")
    print(f" Individual performances: {private_performances}")
    print(f"\nAverage Private Rank: {avg_private_rank:.2f}")
    print(f" Individual ranks: {private_ranks}")
    print(f"\nAverage Passed Cases: {avg_passed_cases:.2f}")
    print(f"Average Failed Cases: {avg_failed_cases:.2f}")
    print(separator)

    # Print the summary (averages plus every per-run result) as JSON
    summary = {
        "avg_private_score": avg_private_score,
        "avg_private_performance": avg_private_performance,
        "avg_private_rank": avg_private_rank,
        "all_results": all_results,
    }
    print("\nFinal Summary:")
    print(json.dumps(summary, indent=2))