"""Primary evaluator for the AlphaEvolve AC task.""" import argparse import sys from pathlib import Path from typing import Any, Dict, List, Optional, Tuple import numpy as np # Add project root to import path for direct file execution. sys.path.insert(0, str(Path(__file__).resolve().parents[2])) from shinka.core import run_shinka_eval def evaluate_sequence(sequence: list[float]) -> float: """ Evaluate coefficient sequence with security/validity checks. Returns np.inf for invalid inputs. Lower is better. """ if not isinstance(sequence, list): return float(np.inf) if not sequence: return float(np.inf) for x in sequence: if isinstance(x, bool) or not isinstance(x, (int, float)): return float(np.inf) if np.isnan(x) or np.isinf(x): return float(np.inf) sequence = [float(x) for x in sequence] sequence = [max(0.0, x) for x in sequence] sequence = [min(1000.0, x) for x in sequence] n = len(sequence) b_sequence = np.convolve(sequence, sequence) max_b = float(np.max(b_sequence)) sum_a = float(np.sum(sequence)) if sum_a < 0.01: return float(np.inf) return float(2.0 * n * max_b / (sum_a**2)) def validate_run_output(run_output: Any) -> Tuple[bool, Optional[str]]: """Validate output of run().""" try: if not isinstance(run_output, list): return False, "run() must return list[float]" if len(run_output) == 0: return False, "run() returned empty list" value = evaluate_sequence(run_output) if not np.isfinite(value): return False, "evaluate_sequence returned inf/nan" return True, None except Exception as exc: return False, str(exc) def aggregate_alphaevolve_ac_metrics(results: List[list[float]]) -> Dict[str, Any]: """Aggregate metrics with best-only ranking.""" if not results: return { "combined_score": 0.0, "public": {"best_value": None, "num_runs": 0}, "private": {"all_values": []}, "text_feedback": "No successful runs.", } values: List[float] = [] lengths: List[int] = [] best_sequence: Optional[list[float]] = None best_value = float(np.inf) for seq in results: val = evaluate_sequence(seq) values.append(float(val)) lengths.append(len(seq)) if val < best_value: best_value = float(val) best_sequence = seq combined_score = -best_value public = { "best_value": best_value, "best_length": len(best_sequence) if best_sequence is not None else None, "num_runs": len(results), } private = { "all_values": values, "all_lengths": lengths, } return { "combined_score": combined_score, "public": public, "private": private, "text_feedback": ( "Lower evaluate_sequence value is better. " "combined_score = -best_value." ), } def main(program_path: str, results_dir: str, num_experiment_runs: int = 1): """Run evaluation and persist metrics.json/correct.json.""" print(f"Evaluating program: {program_path}") print(f"Saving results to: {results_dir}") print(f"Number of runs: {num_experiment_runs}") metrics, correct, error = run_shinka_eval( program_path=program_path, results_dir=results_dir, experiment_fn_name="run", num_runs=num_experiment_runs, validate_fn=validate_run_output, aggregate_metrics_fn=aggregate_alphaevolve_ac_metrics, ) if correct: print("Evaluation completed successfully.") else: print(f"Evaluation failed: {error}") print(f"combined_score={metrics.get('combined_score')}") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Evaluate AlphaEvolve AC task") parser.add_argument( "--program_path", type=str, default="tasks/alphaevolve_ac/initial.py", ) parser.add_argument( "--results_dir", type=str, default="tasks/alphaevolve_ac/results/debug_eval", ) parser.add_argument( "--num_experiment_runs", type=int, default=1, ) args = parser.parse_args() main( program_path=args.program_path, results_dir=args.results_dir, num_experiment_runs=args.num_experiment_runs, )